From 95243efe17e66e769f8e829dc25f3f71909257ba Mon Sep 17 00:00:00 2001
From: Geoffrey Martin-Noble
Date: Tue, 28 Jan 2025 16:58:34 -0800
Subject: [PATCH] Hipify source in place (#2)

This is just the output of the hipify-inplace.sh and hipify-tensoradapter.py
scripts with no further modifications, so you'll probably want to just
spot-check it. I think it will be easier to review changes to the actual HIP
source directly rather than trying to think about what the hipify script will
do, and since there are a decent number of changes needed to get the build and
tests working, that seems worth it.

In the end we should have a bunch of HIP source files and .prehip CUDA files
that they can be generated from. Then we can handle organization however we
want: restore the originals and have hipification be part of a build process,
keep the HIP versions on a separate branch, etc. I put the .prehip files in a
separate commit to keep things a bit cleaner.

Note that this PR is based off of #1. I didn't make that the base branch,
though, because then it would all be in my fork.

---
 include/dgl/aten/macro.h | 20 +- include/dgl/aten/macro.h.prehip | 434 +++++ include/dgl/runtime/device_api.h | 6 +- include/dgl/runtime/device_api.h.prehip | 268 +++ include/dgl/runtime/ndarray.h | 24 +- include/dgl/runtime/ndarray.h.prehip | 890 +++++++++ include/dgl/runtime/tensordispatch.h | 38 +- include/dgl/runtime/tensordispatch.h.prehip | 281 +++ src/array/arith.h | 4 +- src/array/arith.h.prehip | 109 ++ src/array/cuda/array_cumsum.cu | 8 +- src/array/cuda/array_cumsum.cu.prehip | 57 + src/array/cuda/array_index_select.cu | 8 +- src/array/cuda/array_index_select.cu.prehip | 98 + src/array/cuda/array_index_select.cuh | 1 + src/array/cuda/array_index_select.cuh.prehip | 87 + src/array/cuda/array_nonzero.cu | 10 +- src/array/cuda/array_nonzero.cu.prehip | 71 + src/array/cuda/array_op_impl.cu | 23 +- src/array/cuda/array_op_impl.cu.prehip | 441 +++++ src/array/cuda/array_scatter.cu | 7 +- src/array/cuda/array_scatter.cu.prehip | 61 + src/array/cuda/array_sort.cu | 8 +- src/array/cuda/array_sort.cu.prehip | 61 + src/array/cuda/atomic.cuh | 32 +- src/array/cuda/atomic.cuh.prehip | 336 ++++ src/array/cuda/bf16.cuh | 112 +- src/array/cuda/bf16.cuh.prehip | 149 ++ src/array/cuda/coo2csr.cu | 13 +- src/array/cuda/coo2csr.cu.prehip | 137 ++ src/array/cuda/coo_sort.cu | 5 +- src/array/cuda/coo_sort.cu.prehip | 168 ++ src/array/cuda/csr2coo.cu | 28 +- src/array/cuda/csr2coo.cu.prehip | 183 ++ src/array/cuda/csr_get_data.cu | 10 +- src/array/cuda/csr_get_data.cu.prehip | 100 + src/array/cuda/csr_mm.cu | 126 +- src/array/cuda/csr_mm.cu.prehip | 332 ++++ src/array/cuda/csr_sort.cu | 31 +- src/array/cuda/csr_sort.cu.prehip | 151 ++ src/array/cuda/csr_sum.cu | 28 +- src/array/cuda/csr_sum.cu.prehip | 177 ++ src/array/cuda/csr_transpose.cc | 22 +- src/array/cuda/csr_transpose.cc.prehip | 95 + src/array/cuda/cuda_filter.cu | 11 +- src/array/cuda/cuda_filter.cu.prehip | 140 ++ src/array/cuda/cusparse_dispatcher.cuh | 126 +- src/array/cuda/cusparse_dispatcher.cuh.prehip | 238 +++ src/array/cuda/disjoint_union.cu | 5 +- src/array/cuda/disjoint_union.cu.prehip | 185 ++ src/array/cuda/fp16.cuh | 8 +- src/array/cuda/fp16.cuh.prehip | 134 ++ src/array/cuda/functor.cuh | 54 +- src/array/cuda/functor.cuh.prehip | 456 +++++ src/array/cuda/gather_mm.cu | 83 +- src/array/cuda/gather_mm.cu.prehip | 464 +++++ src/array/cuda/ge_spmm.cuh | 3 +- src/array/cuda/ge_spmm.cuh.prehip | 144 ++ src/array/cuda/labor_sampling.cu | 31 +- src/array/cuda/labor_sampling.cu.prehip | 
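For context on how the .prehip files come about: they match what HIPIFY's in-place mode produces, where hipify-perl -inplace rewrites a source file and saves the original next to it as <file>.prehip. The actual hipify-inplace.sh and hipify-tensoradapter.py are not part of this patch, so the snippet below is only a rough sketch of that flow; the source roots and file extensions it walks are assumptions, not the real scripts' configuration.

    #!/usr/bin/env python3
    # Illustrative sketch only -- not the actual hipify-inplace.sh or
    # hipify-tensoradapter.py from this PR. Assumes HIPIFY's hipify-perl is on
    # PATH; its -inplace flag rewrites each file in place and backs up the
    # original as <file>.prehip.
    import pathlib
    import subprocess

    ROOTS = ["include", "src", "tests"]   # assumed source roots
    EXTS = {".cu", ".cuh", ".h", ".cc"}   # assumed extensions to convert

    for root in ROOTS:
        for path in pathlib.Path(root).rglob("*"):
            if path.is_file() and path.suffix in EXTS:
                subprocess.run(["hipify-perl", "-inplace", str(path)], check=True)

Restoring the original CUDA sources later is then just a matter of moving each <file>.prehip back over <file> before re-running the pass.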
833 +++++++++ src/array/cuda/macro.cuh | 8 +- src/array/cuda/macro.cuh.prehip | 53 + src/array/cuda/negative_sampling.cu | 29 +- src/array/cuda/negative_sampling.cu.prehip | 220 +++ src/array/cuda/rowwise_sampling.cu | 37 +- src/array/cuda/rowwise_sampling.cu.prehip | 366 ++++ src/array/cuda/rowwise_sampling_prob.cu | 53 +- .../cuda/rowwise_sampling_prob.cu.prehip | 696 +++++++ src/array/cuda/sddmm.cu | 8 +- src/array/cuda/sddmm.cu.prehip | 99 + src/array/cuda/sddmm.cuh | 5 +- src/array/cuda/sddmm.cuh.prehip | 368 ++++ src/array/cuda/sddmm_hetero_coo.cu | 4 +- src/array/cuda/sddmm_hetero_coo.cu.prehip | 91 + src/array/cuda/sddmm_hetero_csr.cu | 4 +- src/array/cuda/sddmm_hetero_csr.cu.prehip | 90 + src/array/cuda/segment_reduce.cu | 16 +- src/array/cuda/segment_reduce.cu.prehip | 157 ++ src/array/cuda/segment_reduce.cuh | 9 +- src/array/cuda/segment_reduce.cuh.prehip | 262 +++ src/array/cuda/spmat_op_impl_coo.cu | 5 +- src/array/cuda/spmat_op_impl_coo.cu.prehip | 139 ++ src/array/cuda/spmat_op_impl_csr.cu | 23 +- src/array/cuda/spmat_op_impl_csr.cu.prehip | 654 +++++++ src/array/cuda/spmm.cu | 8 +- src/array/cuda/spmm.cu.prehip | 179 ++ src/array/cuda/spmm.cuh | 201 +- src/array/cuda/spmm.cuh.prehip | 802 ++++++++ src/array/cuda/spmm_hetero.cu | 8 +- src/array/cuda/spmm_hetero.cu.prehip | 262 +++ src/array/cuda/utils.cu | 8 +- src/array/cuda/utils.cu.prehip | 33 + src/array/cuda/utils.h | 19 +- src/array/cuda/utils.h.prehip | 301 +++ src/array/cuda/uvm/array_index_select_uvm.cu | 4 +- .../cuda/uvm/array_index_select_uvm.cu.prehip | 131 ++ src/array/cuda/uvm/array_index_select_uvm.cuh | 1 + .../uvm/array_index_select_uvm.cuh.prehip | 52 + src/array/filter.cc | 2 +- src/array/filter.cc.prehip | 54 + src/array/selector.h | 4 +- src/array/selector.h.prehip | 59 + src/array/uvm_array.cc | 4 +- src/array/uvm_array.cc.prehip | 74 + src/geometry/cuda/edge_coarsening_impl.cu | 19 +- .../cuda/edge_coarsening_impl.cu.prehip | 239 +++ src/geometry/cuda/geometry_op_impl.cu | 5 +- src/geometry/cuda/geometry_op_impl.cu.prehip | 135 ++ src/graph/heterograph_capi.cc | 2 +- src/graph/heterograph_capi.cc.prehip | 841 +++++++++ .../sampling/randomwalks/frequency_hashmap.cu | 37 +- .../randomwalks/frequency_hashmap.cu.prehip | 471 +++++ .../randomwalks/frequency_hashmap.cuh | 4 +- .../randomwalks/frequency_hashmap.cuh.prehip | 79 + .../randomwalks/get_node_types_gpu.cu | 2 +- .../randomwalks/get_node_types_gpu.cu.prehip | 72 + .../sampling/randomwalks/randomwalk_gpu.cu | 43 +- .../randomwalks/randomwalk_gpu.cu.prehip | 496 +++++ .../transform/cuda/cuda_compact_graph.cu | 8 +- .../cuda/cuda_compact_graph.cu.prehip | 247 +++ src/graph/transform/cuda/cuda_map_edges.cuh | 7 +- .../transform/cuda/cuda_map_edges.cuh.prehip | 240 +++ src/graph/transform/cuda/cuda_to_block.cu | 24 +- .../transform/cuda/cuda_to_block.cu.prehip | 258 +++ src/graph/transform/cuda/knn.cu | 45 +- src/graph/transform/cuda/knn.cu.prehip | 997 ++++++++++ src/graph/transform/to_block.cc | 4 +- src/graph/transform/to_block.cc.prehip | 383 ++++ src/partition/cuda/partition_op.cu | 35 +- src/partition/cuda/partition_op.cu.prehip | 613 ++++++ src/partition/ndarray_partition.cc | 12 +- src/partition/ndarray_partition.cc.prehip | 266 +++ src/random/continuous_seed.h | 14 +- src/random/continuous_seed.h.prehip | 100 + src/runtime/c_runtime_api.cc | 6 +- src/runtime/c_runtime_api.cc.prehip | 418 +++++ src/runtime/cuda/cuda_common.h | 126 +- src/runtime/cuda/cuda_common.h.prehip | 259 +++ src/runtime/cuda/cuda_device_api.cc | 158 +- 
src/runtime/cuda/cuda_device_api.cc.prehip | 377 ++++ src/runtime/cuda/cuda_hashtable.cu | 19 +- src/runtime/cuda/cuda_hashtable.cu.prehip | 443 +++++ src/runtime/cuda/cuda_hashtable.cuh | 9 +- src/runtime/cuda/cuda_hashtable.cuh.prehip | 284 +++ src/runtime/cuda/gpu_cache.cu | 8 +- src/runtime/cuda/gpu_cache.cu.prehip | 189 ++ src/runtime/ndarray.cc | 8 +- src/runtime/ndarray.cc.prehip | 505 +++++ tensoradapter/include/tensoradapter.h | 19 +- tensoradapter/include/tensoradapter.h.prehip | 113 ++ tensoradapter/pytorch/torch.cpp | 47 +- tensoradapter/pytorch/torch.cpp.prehip | 106 ++ tests/cpp/common.h | 2 +- tests/cpp/common.h.prehip | 56 + tests/cpp/test_aten.cc | 28 +- tests/cpp/test_aten.cc.prehip | 1437 ++++++++++++++ tests/cpp/test_csrmm.cc | 6 +- tests/cpp/test_csrmm.cc.prehip | 215 +++ tests/cpp/test_partition.cc | 4 +- tests/cpp/test_partition.cc.prehip | 196 ++ tests/cpp/test_spmat_coo.cc | 6 +- tests/cpp/test_spmat_coo.cc.prehip | 576 ++++++ tests/cpp/test_spmat_csr.cc | 24 +- tests/cpp/test_spmat_csr.cc.prehip | 760 ++++++++ tests/cpp/test_unit_graph.cc | 14 +- tests/cpp/test_unit_graph.cc.prehip | 434 +++++ .../gpu_cache/include/gpu_cache_api.hpp | 10 +- .../include/gpu_cache_api.hpp.prehip | 55 + .../gpu_cache/include/nv_gpu_cache.hpp | 10 +- .../gpu_cache/include/nv_gpu_cache.hpp.prehip | 122 ++ .../HugeCTR/gpu_cache/include/nv_util.h | 30 +- .../gpu_cache/include/nv_util.h.prehip | 90 + .../HugeCTR/gpu_cache/src/nv_gpu_cache.cu | 97 +- .../gpu_cache/src/nv_gpu_cache.cu.prehip | 1645 +++++++++++++++++ 174 files changed, 27274 insertions(+), 1102 deletions(-) create mode 100644 include/dgl/aten/macro.h.prehip create mode 100644 include/dgl/runtime/device_api.h.prehip create mode 100644 include/dgl/runtime/ndarray.h.prehip create mode 100644 include/dgl/runtime/tensordispatch.h.prehip create mode 100644 src/array/arith.h.prehip create mode 100644 src/array/cuda/array_cumsum.cu.prehip create mode 100644 src/array/cuda/array_index_select.cu.prehip create mode 100644 src/array/cuda/array_index_select.cuh.prehip create mode 100644 src/array/cuda/array_nonzero.cu.prehip create mode 100644 src/array/cuda/array_op_impl.cu.prehip create mode 100644 src/array/cuda/array_scatter.cu.prehip create mode 100644 src/array/cuda/array_sort.cu.prehip create mode 100644 src/array/cuda/atomic.cuh.prehip create mode 100644 src/array/cuda/bf16.cuh.prehip create mode 100644 src/array/cuda/coo2csr.cu.prehip create mode 100644 src/array/cuda/coo_sort.cu.prehip create mode 100644 src/array/cuda/csr2coo.cu.prehip create mode 100644 src/array/cuda/csr_get_data.cu.prehip create mode 100644 src/array/cuda/csr_mm.cu.prehip create mode 100644 src/array/cuda/csr_sort.cu.prehip create mode 100644 src/array/cuda/csr_sum.cu.prehip create mode 100644 src/array/cuda/csr_transpose.cc.prehip create mode 100644 src/array/cuda/cuda_filter.cu.prehip create mode 100644 src/array/cuda/cusparse_dispatcher.cuh.prehip create mode 100644 src/array/cuda/disjoint_union.cu.prehip create mode 100644 src/array/cuda/fp16.cuh.prehip create mode 100644 src/array/cuda/functor.cuh.prehip create mode 100644 src/array/cuda/gather_mm.cu.prehip create mode 100644 src/array/cuda/ge_spmm.cuh.prehip create mode 100644 src/array/cuda/labor_sampling.cu.prehip create mode 100644 src/array/cuda/macro.cuh.prehip create mode 100644 src/array/cuda/negative_sampling.cu.prehip create mode 100644 src/array/cuda/rowwise_sampling.cu.prehip create mode 100644 src/array/cuda/rowwise_sampling_prob.cu.prehip create mode 100644 src/array/cuda/sddmm.cu.prehip 
create mode 100644 src/array/cuda/sddmm.cuh.prehip create mode 100644 src/array/cuda/sddmm_hetero_coo.cu.prehip create mode 100644 src/array/cuda/sddmm_hetero_csr.cu.prehip create mode 100644 src/array/cuda/segment_reduce.cu.prehip create mode 100644 src/array/cuda/segment_reduce.cuh.prehip create mode 100644 src/array/cuda/spmat_op_impl_coo.cu.prehip create mode 100644 src/array/cuda/spmat_op_impl_csr.cu.prehip create mode 100644 src/array/cuda/spmm.cu.prehip create mode 100644 src/array/cuda/spmm.cuh.prehip create mode 100644 src/array/cuda/spmm_hetero.cu.prehip create mode 100644 src/array/cuda/utils.cu.prehip create mode 100644 src/array/cuda/utils.h.prehip create mode 100644 src/array/cuda/uvm/array_index_select_uvm.cu.prehip create mode 100644 src/array/cuda/uvm/array_index_select_uvm.cuh.prehip create mode 100644 src/array/filter.cc.prehip create mode 100644 src/array/selector.h.prehip create mode 100644 src/array/uvm_array.cc.prehip create mode 100644 src/geometry/cuda/edge_coarsening_impl.cu.prehip create mode 100644 src/geometry/cuda/geometry_op_impl.cu.prehip create mode 100644 src/graph/heterograph_capi.cc.prehip create mode 100644 src/graph/sampling/randomwalks/frequency_hashmap.cu.prehip create mode 100644 src/graph/sampling/randomwalks/frequency_hashmap.cuh.prehip create mode 100644 src/graph/sampling/randomwalks/get_node_types_gpu.cu.prehip create mode 100644 src/graph/sampling/randomwalks/randomwalk_gpu.cu.prehip create mode 100644 src/graph/transform/cuda/cuda_compact_graph.cu.prehip create mode 100644 src/graph/transform/cuda/cuda_map_edges.cuh.prehip create mode 100644 src/graph/transform/cuda/cuda_to_block.cu.prehip create mode 100644 src/graph/transform/cuda/knn.cu.prehip create mode 100644 src/graph/transform/to_block.cc.prehip create mode 100644 src/partition/cuda/partition_op.cu.prehip create mode 100644 src/partition/ndarray_partition.cc.prehip create mode 100644 src/random/continuous_seed.h.prehip create mode 100644 src/runtime/c_runtime_api.cc.prehip create mode 100644 src/runtime/cuda/cuda_common.h.prehip create mode 100644 src/runtime/cuda/cuda_device_api.cc.prehip create mode 100644 src/runtime/cuda/cuda_hashtable.cu.prehip create mode 100644 src/runtime/cuda/cuda_hashtable.cuh.prehip create mode 100644 src/runtime/cuda/gpu_cache.cu.prehip create mode 100644 src/runtime/ndarray.cc.prehip create mode 100644 tensoradapter/include/tensoradapter.h.prehip create mode 100644 tensoradapter/pytorch/torch.cpp.prehip create mode 100644 tests/cpp/common.h.prehip create mode 100644 tests/cpp/test_aten.cc.prehip create mode 100644 tests/cpp/test_csrmm.cc.prehip create mode 100644 tests/cpp/test_partition.cc.prehip create mode 100644 tests/cpp/test_spmat_coo.cc.prehip create mode 100644 tests/cpp/test_spmat_csr.cc.prehip create mode 100644 tests/cpp/test_unit_graph.cc.prehip create mode 100644 third_party/HugeCTR/gpu_cache/include/gpu_cache_api.hpp.prehip create mode 100644 third_party/HugeCTR/gpu_cache/include/nv_gpu_cache.hpp.prehip create mode 100644 third_party/HugeCTR/gpu_cache/include/nv_util.h.prehip create mode 100644 third_party/HugeCTR/gpu_cache/src/nv_gpu_cache.cu.prehip diff --git a/include/dgl/aten/macro.h b/include/dgl/aten/macro.h index b760f3e88b41..ef97762d94c2 100644 --- a/include/dgl/aten/macro.h +++ b/include/dgl/aten/macro.h @@ -41,7 +41,7 @@ * We treat pinned memory as normal host memory if we don't want * to enable CUDA UVA access for this operator */ -#ifdef DGL_USE_CUDA +#ifdef DGL_USE_ROCM #define ATEN_XPU_SWITCH_CUDA(val, XPU, op, ...) 
\ do { \ if ((val) == kDGLCPU) { \ @@ -55,9 +55,9 @@ << dgl::runtime::DeviceTypeCode2Str(val) << " device."; \ } \ } while (0) -#else // DGL_USE_CUDA +#else // DGL_USE_ROCM #define ATEN_XPU_SWITCH_CUDA ATEN_XPU_SWITCH -#endif // DGL_USE_CUDA +#endif // DGL_USE_ROCM /** * Dispatch according to integral type (either int32 or int64): @@ -132,7 +132,7 @@ * Dispatch according to float type, including 16bits * (float16/bfloat16/float32/float64). */ -#ifdef DGL_USE_CUDA +#ifdef DGL_USE_ROCM #if BF16_ENABLED #define ATEN_FLOAT_TYPE_SWITCH_16BITS(val, FloatType, XPU, val_name, ...) \ do { \ @@ -150,7 +150,7 @@ { __VA_ARGS__ } \ } else if ( \ XPU == kDGLCUDA && (val).bits == 16 && (val).code == kDGLBfloat) { \ - typedef __nv_bfloat16 FloatType; \ + typedef __hip_bfloat16 FloatType; \ { __VA_ARGS__ } \ } else if ( \ XPU == kDGLCPU && (val).bits == 16 && (val).code == kDGLFloat) { \ @@ -195,7 +195,7 @@ } \ } while (0) #endif // BF16_ENABLED -#else // DGL_USE_CUDA +#else // DGL_USE_ROCM #define ATEN_FLOAT_TYPE_SWITCH_16BITS(val, FloatType, XPU, val_name, ...) \ do { \ CHECK((val).code == kDGLFloat || (val.code == kDGLBfloat)) \ @@ -215,7 +215,7 @@ << " can only be bfloat16/float32/float64 on CPU"; \ } \ } while (0) -#endif // DGL_USE_CUDA +#endif // DGL_USE_ROCM /** * Dispatch according to data type (int32, int64, float32 or float64): @@ -361,7 +361,7 @@ } while (0) // Macro to dispatch according to device context (allowing cuda) -#ifdef DGL_USE_CUDA +#ifdef DGL_USE_ROCM #define ATEN_CSR_SWITCH_CUDA(csr, XPU, IdType, op, ...) \ ATEN_XPU_SWITCH_CUDA((csr).indptr->ctx.device_type, XPU, op, { \ ATEN_ID_TYPE_SWITCH((csr).indptr->dtype, IdType, {{__VA_ARGS__}}); \ @@ -372,10 +372,10 @@ ATEN_XPU_SWITCH_CUDA((coo).row->ctx.device_type, XPU, op, { \ ATEN_ID_TYPE_SWITCH((coo).row->dtype, IdType, {{__VA_ARGS__}}); \ }); -#else // DGL_USE_CUDA +#else // DGL_USE_ROCM #define ATEN_CSR_SWITCH_CUDA ATEN_CSR_SWITCH #define ATEN_COO_SWITCH_CUDA ATEN_COO_SWITCH -#endif // DGL_USE_CUDA +#endif // DGL_USE_ROCM ///////////////////////// Array checks ////////////////////////// diff --git a/include/dgl/aten/macro.h.prehip b/include/dgl/aten/macro.h.prehip new file mode 100644 index 000000000000..b760f3e88b41 --- /dev/null +++ b/include/dgl/aten/macro.h.prehip @@ -0,0 +1,434 @@ +/** + * Copyright (c) 2020 by Contributors + * @file dgl/aten/macro.h + * @brief Common macros for aten package. + */ + +#ifndef DGL_ATEN_MACRO_H_ +#define DGL_ATEN_MACRO_H_ + +///////////////////////// Dispatchers ////////////////////////// + +/** + * Dispatch according to device: + * + * ATEN_XPU_SWITCH(array->ctx.device_type, XPU, { + * // Now XPU is a placeholder for array->ctx.device_type + * DeviceSpecificImplementation(...); + * }); + */ +#define ATEN_XPU_SWITCH(val, XPU, op, ...) \ + do { \ + if ((val) == kDGLCPU) { \ + constexpr auto XPU = kDGLCPU; \ + { __VA_ARGS__ } \ + } else { \ + LOG(FATAL) << "Operator " << (op) << " does not support " \ + << dgl::runtime::DeviceTypeCode2Str(val) << " device."; \ + } \ + } while (0) + +/** + * Dispatch according to device: + * + * XXX(minjie): temporary macro that allows CUDA operator + * + * ATEN_XPU_SWITCH(array->ctx.device_type, XPU, { + * // Now XPU is a placeholder for array->ctx.device_type + * DeviceSpecificImplementation(...); + * }); + * + * We treat pinned memory as normal host memory if we don't want + * to enable CUDA UVA access for this operator + */ +#ifdef DGL_USE_CUDA +#define ATEN_XPU_SWITCH_CUDA(val, XPU, op, ...) 
\ + do { \ + if ((val) == kDGLCPU) { \ + constexpr auto XPU = kDGLCPU; \ + { __VA_ARGS__ } \ + } else if ((val) == kDGLCUDA) { \ + constexpr auto XPU = kDGLCUDA; \ + { __VA_ARGS__ } \ + } else { \ + LOG(FATAL) << "Operator " << (op) << " does not support " \ + << dgl::runtime::DeviceTypeCode2Str(val) << " device."; \ + } \ + } while (0) +#else // DGL_USE_CUDA +#define ATEN_XPU_SWITCH_CUDA ATEN_XPU_SWITCH +#endif // DGL_USE_CUDA + +/** + * Dispatch according to integral type (either int32 or int64): + * + * ATEN_ID_TYPE_SWITCH(array->dtype, IdType, { + * // Now IdType is the type corresponding to data type in array. + * // For instance, one can do this for a CPU array: + * DType *data = static_cast(array->data); + * }); + */ +#define ATEN_ID_TYPE_SWITCH(val, IdType, ...) \ + do { \ + CHECK_EQ((val).code, kDGLInt) << "ID must be integer type"; \ + if ((val).bits == 32) { \ + typedef int32_t IdType; \ + { __VA_ARGS__ } \ + } else if ((val).bits == 64) { \ + typedef int64_t IdType; \ + { __VA_ARGS__ } \ + } else { \ + LOG(FATAL) << "ID can only be int32 or int64"; \ + } \ + } while (0) + +/** + * Dispatch according to bits (either int32 or int64): + * + * ATEN_ID_BITS_SWITCH(bits, IdType, { + * // Now IdType is the type corresponding to data type in array. + * // For instance, one can do this for a CPU array: + * DType *data = static_cast(array->data); + * }); + */ +#define ATEN_ID_BITS_SWITCH(bits, IdType, ...) \ + do { \ + CHECK((bits) == 32 || (bits) == 64) << "bits must be 32 or 64"; \ + if ((bits) == 32) { \ + typedef int32_t IdType; \ + { __VA_ARGS__ } \ + } else if ((bits) == 64) { \ + typedef int64_t IdType; \ + { __VA_ARGS__ } \ + } else { \ + LOG(FATAL) << "ID can only be int32 or int64"; \ + } \ + } while (0) + +/** + * Dispatch according to float type (either float32 or float64): + * + * ATEN_FLOAT_TYPE_SWITCH(array->dtype, FloatType, { + * // Now FloatType is the type corresponding to data type in array. + * // For instance, one can do this for a CPU array: + * FloatType *data = static_cast(array->data); + * }); + */ +#define ATEN_FLOAT_TYPE_SWITCH(val, FloatType, val_name, ...) \ + do { \ + CHECK_EQ((val).code, kDGLFloat) << (val_name) << " must be float type"; \ + if ((val).bits == 32) { \ + typedef float FloatType; \ + { __VA_ARGS__ } \ + } else if ((val).bits == 64) { \ + typedef double FloatType; \ + { __VA_ARGS__ } \ + } else { \ + LOG(FATAL) << (val_name) << " can only be float32 or float64"; \ + } \ + } while (0) + +/** + * Dispatch according to float type, including 16bits + * (float16/bfloat16/float32/float64). + */ +#ifdef DGL_USE_CUDA +#if BF16_ENABLED +#define ATEN_FLOAT_TYPE_SWITCH_16BITS(val, FloatType, XPU, val_name, ...) 
\ + do { \ + CHECK((val).code == kDGLFloat || (val.code == kDGLBfloat)) \ + << (val_name) << " must be float type"; \ + if ((val).bits == 32) { \ + typedef float FloatType; \ + { __VA_ARGS__ } \ + } else if ((val).bits == 64) { \ + typedef double FloatType; \ + { __VA_ARGS__ } \ + } else if ( \ + XPU == kDGLCUDA && (val).bits == 16 && (val).code == kDGLFloat) { \ + typedef __half FloatType; \ + { __VA_ARGS__ } \ + } else if ( \ + XPU == kDGLCUDA && (val).bits == 16 && (val).code == kDGLBfloat) { \ + typedef __nv_bfloat16 FloatType; \ + { __VA_ARGS__ } \ + } else if ( \ + XPU == kDGLCPU && (val).bits == 16 && (val).code == kDGLFloat) { \ + LOG(FATAL) << (val_name) << " can't be float16 on CPU"; \ + } else if ( \ + XPU == kDGLCPU && (val).bits == 16 && (val).code == kDGLBfloat) { \ + typedef BFloat16 FloatType; \ + { __VA_ARGS__ } \ + } else { \ + LOG(FATAL) << (val_name) \ + << " can only be float16/bfloat16/float32/float64 on GPU"; \ + } \ + } while (0) +#else // BF16_ENABLED +#define ATEN_FLOAT_TYPE_SWITCH_16BITS(val, FloatType, XPU, val_name, ...) \ + do { \ + CHECK((val).code == kDGLFloat || (val.code == kDGLBfloat)) \ + << (val_name) << " must be float type"; \ + if ((val).bits == 32) { \ + typedef float FloatType; \ + { __VA_ARGS__ } \ + } else if ((val).bits == 64) { \ + typedef double FloatType; \ + { __VA_ARGS__ } \ + } else if ( \ + XPU == kDGLCUDA && (val).bits == 16 && (val).code == kDGLFloat) { \ + typedef __half FloatType; \ + { __VA_ARGS__ } \ + } else if ( \ + XPU == kDGLCUDA && (val).bits == 16 && (val).code == kDGLBfloat) { \ + LOG(FATAL) << "bfloat16 requires CUDA >= 11.0"; \ + } else if ( \ + XPU == kDGLCPU && (val).bits == 16 && (val).code == kDGLFloat) { \ + LOG(FATAL) << (val_name) << " can't be float16 on CPU"; \ + } else if ( \ + XPU == kDGLCPU && (val).bits == 16 && (val).code == kDGLBfloat) { \ + typedef BFloat16 FloatType; \ + { __VA_ARGS__ } \ + } else { \ + LOG(FATAL) << (val_name) \ + << " can only be float16/float32/float64 on GPU"; \ + } \ + } while (0) +#endif // BF16_ENABLED +#else // DGL_USE_CUDA +#define ATEN_FLOAT_TYPE_SWITCH_16BITS(val, FloatType, XPU, val_name, ...) \ + do { \ + CHECK((val).code == kDGLFloat || (val.code == kDGLBfloat)) \ + << (val_name) << " must be float type"; \ + if ((val).bits == 32) { \ + typedef float FloatType; \ + { __VA_ARGS__ } \ + } else if ((val).bits == 64) { \ + typedef double FloatType; \ + { __VA_ARGS__ } \ + } else if ( \ + XPU == kDGLCPU && (val).bits == 16 && (val).code == kDGLBfloat) { \ + typedef BFloat16 FloatType; \ + { __VA_ARGS__ } \ + } else { \ + LOG(FATAL) << (val_name) \ + << " can only be bfloat16/float32/float64 on CPU"; \ + } \ + } while (0) +#endif // DGL_USE_CUDA + +/** + * Dispatch according to data type (int32, int64, float32 or float64): + * + * ATEN_DTYPE_SWITCH(array->dtype, DType, { + * // Now DType is the type corresponding to data type in array. + * // For instance, one can do this for a CPU array: + * DType *data = static_cast(array->data); + * }); + */ +#define ATEN_DTYPE_SWITCH(val, DType, val_name, ...) 
\ + do { \ + if ((val).code == kDGLInt && (val).bits == 32) { \ + typedef int32_t DType; \ + { __VA_ARGS__ } \ + } else if ((val).code == kDGLInt && (val).bits == 64) { \ + typedef int64_t DType; \ + { __VA_ARGS__ } \ + } else if ((val).code == kDGLFloat && (val).bits == 32) { \ + typedef float DType; \ + { __VA_ARGS__ } \ + } else if ((val).code == kDGLFloat && (val).bits == 64) { \ + typedef double DType; \ + { __VA_ARGS__ } \ + } else { \ + LOG(FATAL) << (val_name) \ + << " can only be int32, int64, float32 or float64"; \ + } \ + } while (0) + +/** + * Dispatch according to data type (int8, uint8, float32 or float64): + * + * ATEN_FLOAT_INT8_UINT8_TYPE_SWITCH(array->dtype, DType, { + * // Now DType is the type corresponding to data type in array. + * // For instance, one can do this for a CPU array: + * DType *data = static_cast(array->data); + * }); + */ +#define ATEN_FLOAT_INT8_UINT8_TYPE_SWITCH(val, DType, val_name, ...) \ + do { \ + if ((val).code == kDGLInt && (val).bits == 8) { \ + typedef int8_t DType; \ + { __VA_ARGS__ } \ + } else if ((val).code == kDGLUInt && (val).bits == 8) { \ + typedef uint8_t DType; \ + { __VA_ARGS__ } \ + } else if ((val).code == kDGLFloat && (val).bits == 32) { \ + typedef float DType; \ + { __VA_ARGS__ } \ + } else if ((val).code == kDGLFloat && (val).bits == 64) { \ + typedef double DType; \ + { __VA_ARGS__ } \ + } else { \ + LOG(FATAL) << (val_name) \ + << " can only be int8, uint8, float32 or float64"; \ + } \ + } while (0) + +/** + * Dispatch data type only based on bit-width (8-bit, 16-bit, 32-bit, 64-bit): + * + * ATEN_DTYPE_BITS_ONLY_SWITCH(array->dtype, DType, { + * // Now DType is the type which has the same bit-width with the + * // data type in array. + * // Do not use for computation, but only for read and write. + * // For instance, one can do this for a CPU array: + * DType *data = static_cast(array->data); + * }); + */ +#define ATEN_DTYPE_BITS_ONLY_SWITCH(val, DType, val_name, ...) \ + do { \ + if ((val).bits == 8) { \ + typedef int8_t DType; \ + { __VA_ARGS__ } \ + } else if ((val).bits == 16) { \ + typedef int16_t DType; \ + { __VA_ARGS__ } \ + } else if ((val).bits == 32) { \ + typedef int32_t DType; \ + { __VA_ARGS__ } \ + } else if ((val).bits == 64) { \ + typedef int64_t DType; \ + { __VA_ARGS__ } \ + } else { \ + LOG(FATAL) << (val_name) \ + << " can only be 8-bit, 16-bit, 32-bit, or 64-bit"; \ + } \ + } while (0) + +/** + * Dispatch according to integral type of CSR graphs. + * Identical to ATEN_ID_TYPE_SWITCH except for a different error message. + */ +#define ATEN_CSR_DTYPE_SWITCH(val, DType, ...) \ + do { \ + if ((val).code == kDGLInt && (val).bits == 32) { \ + typedef int32_t DType; \ + { __VA_ARGS__ } \ + } else if ((val).code == kDGLInt && (val).bits == 64) { \ + typedef int64_t DType; \ + { __VA_ARGS__ } \ + } else { \ + LOG(FATAL) << "CSR matrix data can only be int32 or int64"; \ + } \ + } while (0) + +// Macro to dispatch according to device context and index type. +#define ATEN_CSR_SWITCH(csr, XPU, IdType, op, ...) \ + ATEN_XPU_SWITCH((csr).indptr->ctx.device_type, XPU, op, { \ + ATEN_ID_TYPE_SWITCH((csr).indptr->dtype, IdType, {{__VA_ARGS__}}); \ + }); + +// Macro to dispatch according to device context and index type. +#define ATEN_COO_SWITCH(coo, XPU, IdType, op, ...) 
\ + ATEN_XPU_SWITCH((coo).row->ctx.device_type, XPU, op, { \ + ATEN_ID_TYPE_SWITCH((coo).row->dtype, IdType, {{__VA_ARGS__}}); \ + }); + +#define CHECK_VALID_CONTEXT(VAR1, VAR2) \ + CHECK( \ + ((VAR1)->ctx == (VAR2)->ctx) || (VAR1).IsPinned() || \ + ((VAR1).NumElements() == 0)) /* Let empty arrays pass */ \ + << "Expected " << (#VAR2) << "(" << (VAR2)->ctx << ")" \ + << " to have the same device " \ + << "context as " << (#VAR1) << "(" << (VAR1)->ctx << "). " \ + << "Or " << (#VAR1) << "(" << (VAR1)->ctx << ")" \ + << " is pinned"; + +/** + * Macro to dispatch according to the context of array and dtype of csr + * to enable CUDA UVA ops. + * Context check is covered here to avoid confusion with CHECK_SAME_CONTEXT. + * If csr has the same context with array, same behivor as ATEN_CSR_SWITCH_CUDA. + * If csr is pinned, array's context will conduct the actual operation. + */ +#define ATEN_CSR_SWITCH_CUDA_UVA(csr, array, XPU, IdType, op, ...) \ + do { \ + CHECK_VALID_CONTEXT(csr.indices, array); \ + ATEN_XPU_SWITCH_CUDA(array->ctx.device_type, XPU, op, { \ + ATEN_ID_TYPE_SWITCH((csr).indptr->dtype, IdType, {{__VA_ARGS__}}); \ + }); \ + } while (0) + +// Macro to dispatch according to device context (allowing cuda) +#ifdef DGL_USE_CUDA +#define ATEN_CSR_SWITCH_CUDA(csr, XPU, IdType, op, ...) \ + ATEN_XPU_SWITCH_CUDA((csr).indptr->ctx.device_type, XPU, op, { \ + ATEN_ID_TYPE_SWITCH((csr).indptr->dtype, IdType, {{__VA_ARGS__}}); \ + }); + +// Macro to dispatch according to device context and index type. +#define ATEN_COO_SWITCH_CUDA(coo, XPU, IdType, op, ...) \ + ATEN_XPU_SWITCH_CUDA((coo).row->ctx.device_type, XPU, op, { \ + ATEN_ID_TYPE_SWITCH((coo).row->dtype, IdType, {{__VA_ARGS__}}); \ + }); +#else // DGL_USE_CUDA +#define ATEN_CSR_SWITCH_CUDA ATEN_CSR_SWITCH +#define ATEN_COO_SWITCH_CUDA ATEN_COO_SWITCH +#endif // DGL_USE_CUDA + +///////////////////////// Array checks ////////////////////////// + +#define IS_INT32(a) ((a)->dtype.code == kDGLInt && (a)->dtype.bits == 32) +#define IS_INT64(a) ((a)->dtype.code == kDGLInt && (a)->dtype.bits == 64) +#define IS_FLOAT32(a) ((a)->dtype.code == kDGLFloat && (a)->dtype.bits == 32) +#define IS_FLOAT64(a) ((a)->dtype.code == kDGLFloat && (a)->dtype.bits == 64) + +#define CHECK_IF(cond, prop, value_name, dtype_name) \ + CHECK(cond) << "Expecting " << (prop) << " of " << (value_name) << " to be " \ + << (dtype_name) + +#define CHECK_INT32(value, value_name) \ + CHECK_IF(IS_INT32(value), "dtype", value_name, "int32") +#define CHECK_INT64(value, value_name) \ + CHECK_IF(IS_INT64(value), "dtype", value_name, "int64") +#define CHECK_INT(value, value_name) \ + CHECK_IF( \ + IS_INT32(value) || IS_INT64(value), "dtype", value_name, \ + "int32 or int64") +#define CHECK_FLOAT32(value, value_name) \ + CHECK_IF(IS_FLOAT32(value), "dtype", value_name, "float32") +#define CHECK_FLOAT64(value, value_name) \ + CHECK_IF(IS_FLOAT64(value), "dtype", value_name, "float64") +#define CHECK_FLOAT(value, value_name) \ + CHECK_IF( \ + IS_FLOAT32(value) || IS_FLOAT64(value), "dtype", value_name, \ + "float32 or float64") + +#define CHECK_NDIM(value, _ndim, value_name) \ + CHECK_IF((value)->ndim == (_ndim), "ndim", value_name, _ndim) + +#define CHECK_SAME_DTYPE(VAR1, VAR2) \ + CHECK((VAR1)->dtype == (VAR2)->dtype) \ + << "Expected " << (#VAR2) << " to be the same type as " << (#VAR1) \ + << "(" << (VAR1)->dtype << ")" \ + << ". 
But got " << (VAR2)->dtype << "."; + +#define CHECK_SAME_CONTEXT(VAR1, VAR2) \ + CHECK((VAR1)->ctx == (VAR2)->ctx) \ + << "Expected " << (#VAR2) << " to have the same device context as " \ + << (#VAR1) << "(" << (VAR1)->ctx << ")" \ + << ". But got " << (VAR2)->ctx << "."; + +#define CHECK_NO_OVERFLOW(dtype, val) \ + do { \ + if (sizeof(val) == 8 && (dtype).bits == 32) \ + CHECK_LE((val), 0x7FFFFFFFL) \ + << "int32 overflow for argument " << (#val) << "."; \ + } while (0); + +#define CHECK_IS_ID_ARRAY(VAR) \ + CHECK((VAR)->ndim == 1 && (IS_INT32(VAR) || IS_INT64(VAR))) \ + << "Expected argument " << (#VAR) << " to be an 1D integer array."; + +#endif // DGL_ATEN_MACRO_H_ diff --git a/include/dgl/runtime/device_api.h b/include/dgl/runtime/device_api.h index d085d26c0fe1..f758c35ee744 100644 --- a/include/dgl/runtime/device_api.h +++ b/include/dgl/runtime/device_api.h @@ -174,7 +174,7 @@ class DeviceAPI { DGLContext ctx, DGLStreamHandle event_src, DGLStreamHandle event_dst); /** - * @brief Pin host memory using cudaHostRegister(). + * @brief Pin host memory using hipHostRegister(). * * @param ptr The host memory pointer to be pinned. * @param nbytes The size to be pinned. @@ -183,7 +183,7 @@ class DeviceAPI { DGL_DLL virtual bool PinData(void* ptr, size_t nbytes); /** - * @brief Unpin host memory using cudaHostUnregister(). + * @brief Unpin host memory using hipHostUnregister(). * * @param ptr The host memory pointer to be unpinned. */ @@ -203,7 +203,7 @@ class DeviceAPI { /** * @brief 'Deallocate' the pinned memory from PyTorch CachingHostAllocator. - * @note It avoids unnecessary cudaFreeHost calls and puts the memory + * @note It avoids unnecessary hipHostFree calls and puts the memory * block into CachingHostAllocator's free list. * @param deleter Pointer to the deleter function from PyTorch's * CachingHostAllocator. diff --git a/include/dgl/runtime/device_api.h.prehip b/include/dgl/runtime/device_api.h.prehip new file mode 100644 index 000000000000..d085d26c0fe1 --- /dev/null +++ b/include/dgl/runtime/device_api.h.prehip @@ -0,0 +1,268 @@ +/** + * Copyright (c) 2016 by Contributors + * @file dgl/runtime/device_api.h + * @brief Abstract device memory management API + */ +#ifndef DGL_RUNTIME_DEVICE_API_H_ +#define DGL_RUNTIME_DEVICE_API_H_ + +#include + +#include "c_runtime_api.h" +#include "packed_func.h" + +namespace dgl { +namespace runtime { +/** + * @brief the query type into GetAttr + */ +enum DeviceAttrKind : int { + kExist = 0, + kMaxThreadsPerBlock = 1, + kWarpSize = 2, + kMaxSharedMemoryPerBlock = 3, + kComputeVersion = 4, + kDeviceName = 5, + kMaxClockRate = 6, + kMultiProcessorCount = 7, + kMaxThreadDimensions = 8 +}; + +/** @brief Number of bytes each allocation must align to */ +constexpr int kAllocAlignment = 64; + +/** @brief Number of bytes each allocation must align to in temporary allocation + */ +constexpr int kTempAllocaAlignment = 64; + +/** @brief Maximum size that can be allocated on stack */ +constexpr int kMaxStackAlloca = 1024; + +/** + * @brief DGL Runtime Device API, abstracts the device + * specific interface for memory management. + */ +class DeviceAPI { + public: + /** @brief virtual destructor */ + virtual ~DeviceAPI() {} + /** + * @brief Check whether the device is available. + */ + virtual bool IsAvailable() { return true; } + + /** + * @brief Set the environment device id to ctx + * @param ctx The context to be set. + */ + virtual void SetDevice(DGLContext ctx) = 0; + + /** + * @brief Get attribute of specified device. 
+ * @param ctx The device context + * @param kind The result kind + * @param rv The return value. + * @sa DeviceAttrKind + */ + virtual void GetAttr( + DGLContext ctx, DeviceAttrKind kind, DGLRetValue* rv) = 0; + + /** + * @brief Allocate a data space on device. + * @param ctx The device context to perform operation. + * @param nbytes The number of bytes in memory. + * @param alignment The alignment of the memory. + * @param type_hint The type of elements. Only needed by certain backends such + * as OpenGL, as nbytes & alignment are sufficient for most backends. + * @return The allocated device pointer. + */ + virtual void* AllocDataSpace( + DGLContext ctx, size_t nbytes, size_t alignment, + DGLDataType type_hint) = 0; + + /** + * @brief Free a data space on device. + * @param ctx The device context to perform operation. + * @param ptr The data space. + */ + virtual void FreeDataSpace(DGLContext ctx, void* ptr) = 0; + + /** + * @brief copy data from one place to another + * @param from The source array. + * @param from_offset The byte offeset in the from. + * @param to The target array. + * @param to_offset The byte offset in the to. + * @param num_bytes The size of the memory in bytes. + * @param ctx_from The source context. + * @param ctx_to The target context. + * @param type_hint The type of elements, only needed by certain backends, + * can be useful for cross device endian converison. + */ + virtual void CopyDataFromTo( + const void* from, size_t from_offset, void* to, size_t to_offset, + size_t num_bytes, DGLContext ctx_from, DGLContext ctx_to, + DGLDataType type_hint) = 0; + + /** + * @brief copy data between device and CPU while recording the event. + * @param from The source array. + * @param from_offset The byte offeset in the from. + * @param to The target array. + * @param to_offset The byte offset in the to. + * @param num_bytes The size of the memory in bytes. + * @param ctx_from The source context. + * @param ctx_to The target context. + * @param type_hint The type of elements, only needed by certain backends, + * can be useful for cross device endian converison. + * @param pytorch_ctx The context pointer from PyTorch's CachingHostAllocator. + * @note This function only works when PyTorch CachingHostAllocator is + * available. + */ + virtual void RecordedCopyDataFromTo( + void* from, size_t from_offset, void* to, size_t to_offset, + size_t num_bytes, DGLContext ctx_from, DGLContext ctx_to, + DGLDataType type_hint, void* pytorch_ctx) = 0; + + /** + * @brief Create a new stream of execution. + * + * @param ctx The context of allocation. + */ + DGL_DLL virtual DGLStreamHandle CreateStream(DGLContext ctx); + + /** + * @brief Free a stream of execution + * + * @param ctx The context of the stream + * @param stream The pointer to be freed. + */ + DGL_DLL virtual void FreeStream(DGLContext ctx, DGLStreamHandle stream); + + /** + * @brief Synchronize the stream + * @param ctx The context to perform operation. + * @param stream The stream to be sync. + */ + virtual void StreamSync(DGLContext ctx, DGLStreamHandle stream) = 0; + + /** + * @brief Set the stream + * @param ctx The context to set stream. + * @param stream The stream to be set. + */ + virtual void SetStream(DGLContext ctx, DGLStreamHandle stream) {} + + /** + * @brief Get the stream + */ + virtual DGLStreamHandle GetStream() const { return nullptr; } + + /** + * @brief Synchronize 2 streams of execution. + * + * An event is created in event_src stream that the second then + * stream waits on. 
Neither event_src or event_dst need to be of + * the same device ID as the context, but they must be of the same + * device type. + * + * @param ctx The context of the streams. + * @param event_src The source stream to synchronize. + * @param event_dst The destination stream to synchronize. + */ + DGL_DLL virtual void SyncStreamFromTo( + DGLContext ctx, DGLStreamHandle event_src, DGLStreamHandle event_dst); + + /** + * @brief Pin host memory using cudaHostRegister(). + * + * @param ptr The host memory pointer to be pinned. + * @param nbytes The size to be pinned. + * @return false when pinning an empty tensor. true otherwise. + */ + DGL_DLL virtual bool PinData(void* ptr, size_t nbytes); + + /** + * @brief Unpin host memory using cudaHostUnregister(). + * + * @param ptr The host memory pointer to be unpinned. + */ + DGL_DLL virtual void UnpinData(void* ptr); + + /** + * @brief Allocate the pinned memory using PyTorch CachingHostAllocator. + * + * @param nbytes The size to be pinned. + * @param ctx Pointer to the context pointer from PyTorch's + * CachingHostAllocator. + * @param deleter Pointer to the deleter function from PyTorch's + * CachingHostAllocator. + */ + DGL_DLL virtual void* AllocPinnedDataSpace( + size_t nbytes, void** ctx, void** deleter); + + /** + * @brief 'Deallocate' the pinned memory from PyTorch CachingHostAllocator. + * @note It avoids unnecessary cudaFreeHost calls and puts the memory + * block into CachingHostAllocator's free list. + * @param deleter Pointer to the deleter function from PyTorch's + * CachingHostAllocator. + */ + DGL_DLL virtual void FreePinnedDataSpace(void** deleter); + + /** + * @brief Check whether the memory is in pinned memory. + */ + DGL_DLL virtual bool IsPinned(const void* ptr) { return false; } + + /** + * @brief Allocate temporal workspace for backend execution. + * + * \note We have the following assumption about backend temporal + * workspace allocation, and backend will optimize for such assumption: + * + * - Only a few allocation will happen, and space will be released after use. + * - The release order is usually in reverse order of allocate (stack style). + * - Repeative pattern of same allocations over different runs. + * - Workspace should not overlap between different threads(i.e. be + * threadlocal) + * + * @param ctx The context of allocation. + * @param nbytes The size to be allocated. + * @param type_hint The type of elements. Only needed by certain backends such + * as OpenGL, as nbytes is sufficient for most backends. + */ + DGL_DLL virtual void* AllocWorkspace( + DGLContext ctx, size_t nbytes, DGLDataType type_hint = {}); + + /** + * @brief Free temporal workspace in backend execution. + * + * @param ctx The context of allocation. + * @param ptr The pointer to be freed. + */ + DGL_DLL virtual void FreeWorkspace(DGLContext ctx, void* ptr); + + /** + * @brief Get device API based on context. + * @param ctx The context + * @param allow_missing Whether allow missing + * @return The corresponding device API. + */ + DGL_DLL static DeviceAPI* Get(DGLContext ctx, bool allow_missing = false); + + /** + * @brief Get device API based on device type. + * @param dev_type The device type + * @param allow_missing Whether allow missing + * @return The corresponding device API. 
+ */ + DGL_DLL static DeviceAPI* Get( + DGLDeviceType dev_type, bool allow_missing = false); +}; + +/** @brief The device type bigger than this is RPC device */ +constexpr int kRPCSessMask = 128; +} // namespace runtime +} // namespace dgl +#endif // DGL_RUNTIME_DEVICE_API_H_ diff --git a/include/dgl/runtime/ndarray.h b/include/dgl/runtime/ndarray.h index 40bbbed3631f..cde14d1cbc84 100644 --- a/include/dgl/runtime/ndarray.h +++ b/include/dgl/runtime/ndarray.h @@ -17,16 +17,16 @@ #include "serializer.h" #include "shared_mem.h" -#ifdef DGL_USE_CUDA -#include +#ifdef DGL_USE_ROCM +#include #define BF16_ENABLED (defined(CUDART_VERSION) && CUDART_VERSION >= 11000) -#include +#include #if BF16_ENABLED -#include +#include #endif // BF16_ENABLED -#endif // DGL_USE_CUDA +#endif // DGL_USE_ROCM // forward declaration inline std::ostream& operator<<(std::ostream& os, DGLDataType t); @@ -57,12 +57,12 @@ GEN_DGLDATATYPETRAITS_FOR(int64_t, kDGLInt, 64); // arrays, so I'm just converting uints to signed DTypes. GEN_DGLDATATYPETRAITS_FOR(uint32_t, kDGLInt, 32); GEN_DGLDATATYPETRAITS_FOR(uint64_t, kDGLInt, 64); -#ifdef DGL_USE_CUDA +#ifdef DGL_USE_ROCM GEN_DGLDATATYPETRAITS_FOR(__half, kDGLFloat, 16); #if BF16_ENABLED -GEN_DGLDATATYPETRAITS_FOR(__nv_bfloat16, kDGLBfloat, 16); +GEN_DGLDATATYPETRAITS_FOR(__hip_bfloat16, kDGLBfloat, 16); #endif // BF16_ENABLED -#endif // DGL_USE_CUDA +#endif // DGL_USE_ROCM GEN_DGLDATATYPETRAITS_FOR(float, kDGLFloat, 32); GEN_DGLDATATYPETRAITS_FOR(double, kDGLFloat, 64); #undef GEN_DGLDATATYPETRAITS_FOR @@ -185,7 +185,7 @@ class NDArray { * CachingHostAllocator for allocating pinned memory and copying data * from the current NDAarray. As a result, PyTorch is responsible for * managing the lifecycle of the returned NDArray, including deciding - * when to flush the data for reuse or call cudaFreeHost. The current + * when to flush the data for reuse or call hipHostFree. The current * context must be kDGLCPU, otherwise, an error will be thrown. */ inline NDArray PinMemory(); @@ -194,7 +194,7 @@ class NDArray { * @brief In-place method to pin the current array by calling PinContainer * on the underlying NDArray:Container. * @note This is an in-place method that flags the memory as page-locked by - * utilizing cudaHostRegister at the underlying level to pin the current + * utilizing hipHostRegister at the underlying level to pin the current * instance of NDArray. The current context must be kDGLCPU, otherwise, * an error will be thrown. */ @@ -523,7 +523,7 @@ inline void NDArray::CopyFrom(const NDArray& other) { // Pinned by PyTorch if (cpu_data->pinned_by_pytorch_) { // To ensure correct behavior, the event must be recorded after - // cudaMemcpyAsync as long as the memory is pinned by PyTorch. + // hipMemcpyAsync as long as the memory is pinned by PyTorch. void* pytorch_ctx = cpu_data->pytorch_ctx_; RecordedCopyFromTo( &(other.data_->dl_tensor), &(data_->dl_tensor), pytorch_ctx); @@ -549,7 +549,7 @@ inline void NDArray::CopyTo(const NDArray& other) const { // pinned by PyTorch if (cpu_data->pinned_by_pytorch_) { // To ensure correct behavior, the event must be recorded after - // cudaMemcpyAsync as long as the memory is pinned by PyTorch. + // hipMemcpyAsync as long as the memory is pinned by PyTorch. 
void* pytorch_ctx = cpu_data->pytorch_ctx_; RecordedCopyFromTo( &(data_->dl_tensor), &(other.data_->dl_tensor), pytorch_ctx); diff --git a/include/dgl/runtime/ndarray.h.prehip b/include/dgl/runtime/ndarray.h.prehip new file mode 100644 index 000000000000..40bbbed3631f --- /dev/null +++ b/include/dgl/runtime/ndarray.h.prehip @@ -0,0 +1,890 @@ +/** + * Copyright (c) 2017-2022 by Contributors + * @file dgl/runtime/ndarray.h + * @brief Abstract device memory management API + */ +#ifndef DGL_RUNTIME_NDARRAY_H_ +#define DGL_RUNTIME_NDARRAY_H_ + +#include +#include +#include +#include +#include + +#include "bfloat16.h" +#include "c_runtime_api.h" +#include "serializer.h" +#include "shared_mem.h" + +#ifdef DGL_USE_CUDA +#include + +#define BF16_ENABLED (defined(CUDART_VERSION) && CUDART_VERSION >= 11000) + +#include +#if BF16_ENABLED +#include +#endif // BF16_ENABLED +#endif // DGL_USE_CUDA + +// forward declaration +inline std::ostream& operator<<(std::ostream& os, DGLDataType t); + +namespace dgl { + +/** + * @brief Type traits that converts a C type to a DGLDataType. + * + * Usage: + * DGLDataTypeTraits::dtype == dtype + */ +template +struct DGLDataTypeTraits { + static constexpr DGLDataType dtype{0, 0, 0}; // dummy +}; +#define GEN_DGLDATATYPETRAITS_FOR(T, code, bits) \ + template <> \ + struct DGLDataTypeTraits { \ + static constexpr DGLDataType dtype{code, bits, 1}; \ + } +GEN_DGLDATATYPETRAITS_FOR(int8_t, kDGLInt, 8); +GEN_DGLDATATYPETRAITS_FOR(uint8_t, kDGLUInt, 8); +GEN_DGLDATATYPETRAITS_FOR(int16_t, kDGLInt, 16); +GEN_DGLDATATYPETRAITS_FOR(int32_t, kDGLInt, 32); +GEN_DGLDATATYPETRAITS_FOR(int64_t, kDGLInt, 64); +// XXX(BarclayII) most DL frameworks do not support unsigned int and long +// arrays, so I'm just converting uints to signed DTypes. +GEN_DGLDATATYPETRAITS_FOR(uint32_t, kDGLInt, 32); +GEN_DGLDATATYPETRAITS_FOR(uint64_t, kDGLInt, 64); +#ifdef DGL_USE_CUDA +GEN_DGLDATATYPETRAITS_FOR(__half, kDGLFloat, 16); +#if BF16_ENABLED +GEN_DGLDATATYPETRAITS_FOR(__nv_bfloat16, kDGLBfloat, 16); +#endif // BF16_ENABLED +#endif // DGL_USE_CUDA +GEN_DGLDATATYPETRAITS_FOR(float, kDGLFloat, 32); +GEN_DGLDATATYPETRAITS_FOR(double, kDGLFloat, 64); +#undef GEN_DGLDATATYPETRAITS_FOR + +namespace runtime { + +/** + * @brief DLPack converter. + */ +struct DLPackConvert; + +/** + * @brief Managed NDArray. + * The array is backed by reference counted blocks. + */ +class NDArray { + public: + // internal container type + struct Container; + /** @brief default constructor */ + NDArray() {} + /** + * @brief cosntruct a NDArray that refers to data + * @param data The data this NDArray refers to + */ + explicit inline NDArray(Container* data); + /** + * @brief copy constructor + * @param other The value to be copied + */ + inline NDArray(const NDArray& other); // NOLINT(*) + /** + * @brief move constructor + * @param other The value to be moved + */ + NDArray(NDArray&& other) // NOLINT(*) + : data_(other.data_) { + other.data_ = nullptr; + } + /** @brief destructor */ + ~NDArray() { this->reset(); } + /** + * @brief Swap this array with another NDArray + * @param other The other NDArray + */ + void swap(NDArray& other) { // NOLINT(*) + std::swap(data_, other.data_); + } + /** + * @brief copy assignmemt + * @param other The value to be assigned. + * @return reference to self. + */ + NDArray& operator=(const NDArray& other) { // NOLINT(*) + // copy-and-swap idiom + NDArray(other).swap(*this); // NOLINT(*) + return *this; + } + /** + * @brief move assignmemt + * @param other The value to be assigned. 
+ * @return reference to self. + */ + NDArray& operator=(NDArray&& other) { // NOLINT(*) + // copy-and-swap idiom + NDArray(std::move(other)).swap(*this); // NOLINT(*) + return *this; + } + /** @return If NDArray is defined */ + bool defined() const { return data_ != nullptr; } + /** @return If both NDArray reference the same container */ + bool same_as(const NDArray& other) const { return data_ == other.data_; } + /** @brief reset the content of NDArray to be nullptr */ + inline void reset(); + /** + * @return the reference counter + * @note this number is approximate in multi-threaded setting. + */ + inline int use_count() const; + /** @return Pointer to content of DGLArray */ + inline const DGLArray* operator->() const; + /** @return True if the ndarray is contiguous. */ + bool IsContiguous() const; + /** @return the data pointer with type. */ + template + inline T* Ptr() const { + if (!defined()) + return nullptr; + else + return static_cast(operator->()->data); + } + + /** + * @brief Copy data content from/into another array. + * @param other The source array to be copied from. + * @note The copy runs on the dgl internal stream if it involves a GPU + * context. + */ + inline void CopyFrom(DGLArray* other); + inline void CopyFrom(const NDArray& other); + inline void CopyTo(DGLArray* other) const; + inline void CopyTo(const NDArray& other) const; + + /** + * @brief Copy the data to another context. + * @param ctx The target context. + * @return The array under another context. + */ + inline NDArray CopyTo(const DGLContext& ctx) const; + + /** + * @brief Return a new array with a copy of the content. + */ + inline NDArray Clone() const; + + /** + * @brief Return a copy of the current instance of NDArray in pinned + * (page-locked) memory. + * @note This is an out-of-place method, which utilizes PyTorch's + * CachingHostAllocator for allocating pinned memory and copying data + * from the current NDAarray. As a result, PyTorch is responsible for + * managing the lifecycle of the returned NDArray, including deciding + * when to flush the data for reuse or call cudaFreeHost. The current + * context must be kDGLCPU, otherwise, an error will be thrown. + */ + inline NDArray PinMemory(); + + /** + * @brief In-place method to pin the current array by calling PinContainer + * on the underlying NDArray:Container. + * @note This is an in-place method that flags the memory as page-locked by + * utilizing cudaHostRegister at the underlying level to pin the current + * instance of NDArray. The current context must be kDGLCPU, otherwise, + * an error will be thrown. + */ + inline void PinMemory_(); + + /** + * @brief In-place method to unpin the current array by calling UnpinContainer + * on the underlying NDArray:Container. + * @note This is an in-place method. Behavior depends on the current context, + * IsPinned: will be unpinned; + * others: directly return. + */ + inline void UnpinMemory_(); + + /** + * @brief Check if the array is pinned. + */ + inline bool IsPinned() const; + + /** + * @brief Record streams that are using the underlying tensor. + * @param stream The stream that is using the underlying tensor. 
+ */ + inline void RecordStream(DGLStreamHandle stream) const; + + /** + * @brief Load NDArray from stream + * @param stream The input data stream + * @return Whether load is successful + */ + bool Load(dmlc::Stream* stream); + + /** + * @brief Save NDArray to stream + * @param stream The output data stream + */ + void Save(dmlc::Stream* stream) const; + + /** + * @brief Create a NDArray that shares the data memory with the current one. + * @param shape The shape of the new array. + * @param dtype The data type of the new array. + * @param offset The offset (in bytes) of the starting pointer. + * @note The memory size of new array must be smaller than the current one. + */ + DGL_DLL NDArray + CreateView(std::vector shape, DGLDataType dtype, int64_t offset = 0); + + /** + * @brief Create an empty NDArray. + * @param shape The shape of the new array. + * @param dtype The data type of the new array. + * @param ctx The context of the array. + * @return The created Array + */ + DGL_DLL static NDArray Empty( + std::vector shape, DGLDataType dtype, DGLContext ctx); + + /** + * @brief Create an empty NDArray in pinned memory. + * @param shape The shape of the new array. + * @param dtype The data type of the new array. + * @param ctx The context of the array. + * @return The created array. + */ + DGL_DLL static NDArray PinnedEmpty( + std::vector shape, DGLDataType dtype, DGLContext ctx); + + /** + * @brief Create an empty NDArray with shared memory. + * @param name The name of shared memory. + * @param shape The shape of the new array. + * @param dtype The data type of the new array. + * @param ctx The context of the array. + * @param is_create whether to create shared memory. + * @return The created Array + */ + DGL_DLL static NDArray EmptyShared( + const std::string& name, std::vector shape, DGLDataType dtype, + DGLContext ctx, bool is_create); + + /** + * @brief Get the size of the array in the number of bytes. + */ + size_t GetSize() const; + + /** + * @brief Get the number of elements in this array. + */ + int64_t NumElements() const; + + /** + * @brief Create a NDArray by copying from std::vector. + * @tparam T Type of vector data. Determines the dtype of returned array. + */ + template + DGL_DLL static NDArray FromVector( + const std::vector& vec, DGLContext ctx = DGLContext{kDGLCPU, 0}); + + /** + * @brief Create a NDArray from a raw pointer. + */ + DGL_DLL static NDArray CreateFromRaw( + const std::vector& shape, DGLDataType dtype, DGLContext ctx, + void* raw, bool auto_free); + + /** + * @brief Create a std::vector from a 1D NDArray. + * @tparam T Type of vector data. + * @note Type casting is NOT performed. The caller has to make sure that the + * vector type matches the dtype of NDArray. + */ + template + std::vector ToVector() const; + + std::shared_ptr GetSharedMem() const; + + /** + * @brief Function to copy data from one array to another. + * @param from The source array. + * @param to The target array. + * @param (optional) stream The stream used in copy. + */ + DGL_DLL static void CopyFromTo(DGLArray* from, DGLArray* to); + DGL_DLL static void CopyFromTo( + DGLArray* from, DGLArray* to, DGLStreamHandle stream); + + /** + * @brief Function to copy data between device and CPU while recording the + * event. + * @param from The source array. + * @param to The target array. + * @param pytorch_ctx The context pointer from PyTorch's CachingHostAllocator. + * @note This function fuses data-copy and event recording to ensure + * CachingHostAllocator works properly. 
+ */ + DGL_DLL static void RecordedCopyFromTo( + DGLArray* from, DGLArray* to, void* pytorch_ctx); + + /** + * @brief Function to pin the DGLArray of a Container. + * @param ptr The container to be pinned. + * @note Data of the given array will be pinned inplace. + * Behavior depends on the current context, + * kDGLCPU: will be pinned; + * IsPinned: directly return; + * kDGLCUDA: invalid, will throw an error. + */ + DGL_DLL static void PinContainer(Container* ptr); + + /** + * @brief Function to unpin the DGLArray of a Container. + * @param ptr The container to be unpinned. + * @note Data of the given array will be unpinned inplace. + * Behavior depends on the current context, + * IsPinned: will be unpinned; + * others: directly return. + */ + DGL_DLL static void UnpinContainer(Container* ptr); + + /** + * @brief Function check if the DGLArray of a Container is pinned. + * @param ptr The container to be checked. + * @return true if pinned. + */ + DGL_DLL static bool IsContainerPinned(Container* ptr); + + /** + * @brief Record streams that are using this tensor. + * @param ptr Pointer of the tensor to be recorded. + * @param stream The stream that is using this tensor. + */ + DGL_DLL static void RecordStream(DGLArray* tensor, DGLStreamHandle stream); + + // internal namespace + struct Internal { + // Default deleter for the container + static void DefaultDeleter(NDArray::Container* ptr); + // Local create function which allocates tensor metadata + // but does not allocate space for the data. + static NDArray Create( + std::vector shape, DGLDataType dtype, DGLContext ctx); + // Implementation of API function + static DGLArray* MoveAsDGLArray(NDArray arr); + }; + + private: + /** @brief Internal Data content */ + Container* data_{nullptr}; + // enable internal functions + friend struct Internal; + friend struct DLPackConvert; + friend class DGLRetValue; + friend class DGLArgsSetter; +}; + +/** + * @brief Save a DGLArray to stream + * @param strm The outpu stream + * @param tensor The tensor to be saved. + */ +inline bool SaveDGLArray(dmlc::Stream* strm, const DGLArray* tensor); + +/** + * @brief Reference counted Container object used to back NDArray. + * + * This object is DGLArray compatible: + * the pointer to the NDArrayContainer can be directly + * interpreted as a DGLArray* + * + * @note: do not use this function directly, use NDArray. + */ +struct NDArray::Container { + public: + /** NOTE: the first part of this structure is the same as + * DLManagedTensor, note that, however, the deleter + * is only called when the reference counter goes to 0 + */ + /** + * @brief Tensor structure. + * @note it is important that the first field is DGLArray + * So that this data structure is DGLArray compatible. + * The head ptr of this struct can be viewed as DGLArray*. + */ + DGLArray dl_tensor; + /** + * @brief addtional context, reserved for recycling + * @note We can attach additional content here + * which the current container depend on + * (e.g. reference to original memory when creating views). + */ + void* manager_ctx{nullptr}; + /** + * @brief Customized deleter + * + * @note The customized deleter is helpful to enable + * different ways of memory allocator that are not + * currently defined by the system. 
+ */ + void (*deleter)(Container* self) = nullptr; + /** @brief default constructor */ + Container() { + dl_tensor.data = nullptr; + dl_tensor.ndim = 0; + dl_tensor.shape = nullptr; + dl_tensor.strides = nullptr; + dl_tensor.byte_offset = 0; + } + /** @brief pointer to shared memory */ + std::shared_ptr mem; + /** @brief developer function, increases reference counter */ + void IncRef() { ref_counter_.fetch_add(1, std::memory_order_relaxed); } + /** @brief developer function, decrease reference counter */ + void DecRef() { + if (ref_counter_.fetch_sub(1, std::memory_order_release) == 1) { + std::atomic_thread_fence(std::memory_order_acquire); + if (this->deleter != nullptr) { + (*this->deleter)(this); + } + } + } + + private: + friend struct DLPackConvert; + friend class NDArray; + friend class RPCWrappedFunc; + /** + * @brief The shape container, + * can be used for shape data. + */ + std::vector shape_; + /** + * @brief The stride container, + * can be used for stride data. + */ + std::vector stride_; + /** @brief The internal array object */ + std::atomic ref_counter_{0}; + + /** @brief Whether underlying dl_tensor is pinned by DGL. */ + bool pinned_by_dgl_{false}; + + /** @brief Whether underlying dl_tensor is pinned by PyTorch + * (CachingHostAllocator). */ + bool pinned_by_pytorch_{false}; + + /** @brief The PyTorch storage ctx ptr if pinned_by_pytorch_ = True. */ + void* pytorch_ctx_{nullptr}; + + /** @brief Pointer to the corresp. PyTorch deleter if pinned_by_pytorch_ = + * True. + */ + void* pytorch_raw_deleter_{nullptr}; +}; + +// implementations of inline functions +// the usages of functions are documented in place. +inline NDArray::NDArray(Container* data) : data_(data) { + if (data_) data_->IncRef(); +} + +inline NDArray::NDArray(const NDArray& other) : data_(other.data_) { + if (data_) data_->IncRef(); +} + +inline void NDArray::reset() { + if (data_) { + data_->DecRef(); + data_ = nullptr; + } +} + +inline void NDArray::CopyFrom(DGLArray* other) { + CHECK(data_ != nullptr); + CopyFromTo(other, &(data_->dl_tensor)); +} + +inline void NDArray::CopyFrom(const NDArray& other) { + CHECK(other.data_ != nullptr); + // Copy between two devices + if (data_->dl_tensor.ctx.device_type != + other.data_->dl_tensor.ctx.device_type) { + CHECK(data_ != nullptr); + auto to_ctx_type = data_->dl_tensor.ctx.device_type; + auto cpu_data = (to_ctx_type == kDGLCPU ? data_ : other.data_); + // Pinned by PyTorch + if (cpu_data->pinned_by_pytorch_) { + // To ensure correct behavior, the event must be recorded after + // cudaMemcpyAsync as long as the memory is pinned by PyTorch. + void* pytorch_ctx = cpu_data->pytorch_ctx_; + RecordedCopyFromTo( + &(other.data_->dl_tensor), &(data_->dl_tensor), pytorch_ctx); + return; + } + } + CopyFrom(&(other.data_->dl_tensor)); +} + +inline void NDArray::CopyTo(DGLArray* other) const { + CHECK(data_ != nullptr); + CopyFromTo(&(data_->dl_tensor), other); +} + +inline void NDArray::CopyTo(const NDArray& other) const { + CHECK(other.data_ != nullptr); + // copy between two devices + if (data_->dl_tensor.ctx.device_type != + other.data_->dl_tensor.ctx.device_type) { + CHECK(data_ != nullptr); + auto from_ctx_type = data_->dl_tensor.ctx.device_type; + auto cpu_data = (from_ctx_type == kDGLCPU ? data_ : other.data_); + // pinned by PyTorch + if (cpu_data->pinned_by_pytorch_) { + // To ensure correct behavior, the event must be recorded after + // cudaMemcpyAsync as long as the memory is pinned by PyTorch. 
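+      // `cpu_data` is the host-side container (the device types differ, so
+      // exactly one side is a kDGLCPU array), and only host memory carries the
+      // CachingHostAllocator bookkeeping checked above.  Its storage context is
+      // handed to RecordedCopyFromTo so the adapter can record the event
+      // against that allocation.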
+ void* pytorch_ctx = cpu_data->pytorch_ctx_; + RecordedCopyFromTo( + &(data_->dl_tensor), &(other.data_->dl_tensor), pytorch_ctx); + return; + } + } + CopyTo(&(other.data_->dl_tensor)); +} + +inline NDArray NDArray::CopyTo(const DGLContext& ctx) const { + CHECK(data_ != nullptr); + const DGLArray* array = operator->(); + NDArray ret = Empty( + std::vector(array->shape, array->shape + array->ndim), + array->dtype, ctx); + this->CopyTo(ret); + return ret; +} + +inline NDArray NDArray::Clone() const { + CHECK(data_ != nullptr); + const DGLArray* array = operator->(); + return this->CopyTo(array->ctx); +} + +inline NDArray NDArray::PinMemory() { + CHECK(data_ != nullptr); + const DGLArray* array = operator->(); + auto ctx = array->ctx; + NDArray ret = PinnedEmpty( + std::vector(array->shape, array->shape + array->ndim), + array->dtype, ctx); + this->CopyTo(ret); + return ret; +} + +inline void NDArray::PinMemory_() { + CHECK(data_ != nullptr); + PinContainer(data_); +} + +inline void NDArray::UnpinMemory_() { + CHECK(data_ != nullptr); + UnpinContainer(data_); +} + +inline bool NDArray::IsPinned() const { + CHECK(data_ != nullptr); + return IsContainerPinned(data_); +} + +inline void NDArray::RecordStream(DGLStreamHandle stream) const { + CHECK(data_ != nullptr); + RecordStream(&(data_->dl_tensor), stream); +} + +inline int NDArray::use_count() const { + if (data_ == nullptr) return 0; + return data_->ref_counter_.load(std::memory_order_relaxed); +} + +inline const DGLArray* NDArray::operator->() const { + return &(data_->dl_tensor); +} + +/** @brief Magic number for NDArray file */ +constexpr uint64_t kDGLNDArrayMagic = 0xDD5E40F096B4A13F; + +inline bool SaveDGLArray(dmlc::Stream* strm, DGLArray* tensor) { + uint64_t header = kDGLNDArrayMagic, reserved = 0; + strm->Write(header); + strm->Write(reserved); + // Always save data as CPU context + // + // Parameters that get serialized should be in CPU by default. + // So even the array's context is GPU, it will be stored as CPU array. + // This is used to prevent case when another user loads the parameters + // back on machine that do not have GPU or related context. + // + // We can always do array.CopyTo(target_ctx) to get a corresponding + // array in the target context. + DGLContext cpu_ctx; + cpu_ctx.device_type = kDGLCPU; + cpu_ctx.device_id = 0; + strm->Write(cpu_ctx); + strm->Write(tensor->ndim); + strm->Write(tensor->dtype); + int ndim = tensor->ndim; + strm->WriteArray(tensor->shape, ndim); + int type_bytes = tensor->dtype.bits / 8; + int64_t num_elems = 1; + for (int i = 0; i < ndim; ++i) { + num_elems *= tensor->shape[i]; + } + int64_t data_byte_size = type_bytes * num_elems; + strm->Write(data_byte_size); + + if (DMLC_IO_NO_ENDIAN_SWAP && tensor->ctx.device_type == kDGLCPU && + tensor->strides == nullptr && tensor->byte_offset == 0) { + // quick path + strm->Write(tensor->data, data_byte_size); + } else { + std::vector bytes(data_byte_size); + CHECK_EQ( + DGLArrayCopyToBytes(tensor, dmlc::BeginPtr(bytes), data_byte_size), 0) + << DGLGetLastError(); + if (!DMLC_IO_NO_ENDIAN_SWAP) { + dmlc::ByteSwap(dmlc::BeginPtr(bytes), type_bytes, num_elems); + } + strm->Write(dmlc::BeginPtr(bytes), data_byte_size); + } + return true; +} + +/** + * @brief Convert type code to its name + * @param type_code The type code . + * @return The name of type code. 
+ */ +inline const char* TypeCode2Str(int type_code) { + switch (type_code) { + case kDGLInt: + return "int"; + case kDGLUInt: + return "uint"; + case kDGLFloat: + return "float"; + case kStr: + return "str"; + case kBytes: + return "bytes"; + case kHandle: + return "handle"; + case kNull: + return "NULL"; + case kObjectHandle: + return "ObjectHandle"; + case kArrayHandle: + return "ArrayHandle"; + case kDGLDataType: + return "DGLDataType"; + case kDGLContext: + return "DGLContext"; + case kFuncHandle: + return "FunctionHandle"; + case kModuleHandle: + return "ModuleHandle"; + case kNDArrayContainer: + return "NDArrayContainer"; + default: + LOG(FATAL) << "unknown type_code=" << static_cast(type_code); + return ""; + } +} + +/** + * @brief Convert device type code to its name + * @param device_type The device type code. + * @return The name of the device. + */ +inline const char* DeviceTypeCode2Str(DGLDeviceType device_type) { + switch (device_type) { + case kDGLCPU: + return "cpu"; + case kDGLCUDA: + return "cuda"; + default: + LOG(FATAL) << "Unsupported device type code=" + << static_cast(device_type); + return ""; + } +} + +/** + * @brief convert a string to DGL type. + * @param s The string to be converted. + * @return The corresponding dgl type. + */ +inline DGLDataType String2DGLDataType(std::string s) { + DGLDataType t; + t.bits = 32; + t.lanes = 1; + const char* scan; + if (s.substr(0, 3) == "int") { + t.code = kDGLInt; + scan = s.c_str() + 3; + } else if (s.substr(0, 4) == "uint") { + t.code = kDGLUInt; + scan = s.c_str() + 4; + } else if (s.substr(0, 5) == "float") { + t.code = kDGLFloat; + scan = s.c_str() + 5; + } else if (s.substr(0, 6) == "handle") { + t.code = kHandle; + t.bits = 64; // handle uses 64 bit by default. + scan = s.c_str() + 6; + } else { + scan = s.c_str(); + LOG(FATAL) << "unknown type " << s; + } + char* xdelim; // emulate sscanf("%ux%u", bits, lanes) + uint8_t bits = static_cast(strtoul(scan, &xdelim, 10)); + if (bits != 0) t.bits = bits; + if (*xdelim == 'x') { + t.lanes = static_cast(strtoul(xdelim + 1, nullptr, 10)); + } + return t; +} + +/** + * @brief convert a DGL type to string. + * @param t The type to be converted. + * @return The corresponding dgl type in string. + */ +inline std::string DGLDataType2String(DGLDataType t) { +#ifndef _LIBCPP_SGX_NO_IOSTREAMS + std::ostringstream os; + os << t; + return os.str(); +#else + std::string repr = ""; + repr += TypeCode2Str(t.code); + if (t.code == kHandle) return repr; + repr += std::to_string(static_cast(t.bits)); + if (t.lanes != 1) { + repr += "x" + std::to_string(static_cast(t.lanes)); + } + return repr; +#endif +} + +// macro to check type code. 
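+// Editorial example (not part of the original header): the macro below appends
+// a readable message to the CHECK_EQ failure via TypeCode2Str(), e.g.
+//
+//   DGL_CHECK_TYPE_CODE(kDGLFloat, kDGLInt);  // fails: " expected int but get float"
+//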
+#define DGL_CHECK_TYPE_CODE(CODE, T) \ + CHECK_EQ(CODE, T) << " expected " << TypeCode2Str(T) << " but get " \ + << TypeCode2Str(CODE) + +} // namespace runtime +} // namespace dgl + +namespace dmlc { +DMLC_DECLARE_TRAITS(has_saveload, dgl::runtime::NDArray, true); +} // namespace dmlc + +///////////////// Operator overloading for NDArray ///////////////// +dgl::runtime::NDArray operator+( + const dgl::runtime::NDArray& a1, const dgl::runtime::NDArray& a2); +dgl::runtime::NDArray operator-( + const dgl::runtime::NDArray& a1, const dgl::runtime::NDArray& a2); +dgl::runtime::NDArray operator*( + const dgl::runtime::NDArray& a1, const dgl::runtime::NDArray& a2); +dgl::runtime::NDArray operator/( + const dgl::runtime::NDArray& a1, const dgl::runtime::NDArray& a2); +dgl::runtime::NDArray operator%( + const dgl::runtime::NDArray& a1, const dgl::runtime::NDArray& a2); +dgl::runtime::NDArray operator+(const dgl::runtime::NDArray& a1, int64_t rhs); +dgl::runtime::NDArray operator-(const dgl::runtime::NDArray& a1, int64_t rhs); +dgl::runtime::NDArray operator*(const dgl::runtime::NDArray& a1, int64_t rhs); +dgl::runtime::NDArray operator/(const dgl::runtime::NDArray& a1, int64_t rhs); +dgl::runtime::NDArray operator%(const dgl::runtime::NDArray& a1, int64_t rhs); +dgl::runtime::NDArray operator+(int64_t lhs, const dgl::runtime::NDArray& a2); +dgl::runtime::NDArray operator-(int64_t lhs, const dgl::runtime::NDArray& a2); +dgl::runtime::NDArray operator*(int64_t lhs, const dgl::runtime::NDArray& a2); +dgl::runtime::NDArray operator/(int64_t lhs, const dgl::runtime::NDArray& a2); +dgl::runtime::NDArray operator%(int64_t lhs, const dgl::runtime::NDArray& a2); +dgl::runtime::NDArray operator-(const dgl::runtime::NDArray& array); + +dgl::runtime::NDArray operator>( + const dgl::runtime::NDArray& a1, const dgl::runtime::NDArray& a2); +dgl::runtime::NDArray operator<( + const dgl::runtime::NDArray& a1, const dgl::runtime::NDArray& a2); +dgl::runtime::NDArray operator>=( + const dgl::runtime::NDArray& a1, const dgl::runtime::NDArray& a2); +dgl::runtime::NDArray operator<=( + const dgl::runtime::NDArray& a1, const dgl::runtime::NDArray& a2); +dgl::runtime::NDArray operator==( + const dgl::runtime::NDArray& a1, const dgl::runtime::NDArray& a2); +dgl::runtime::NDArray operator!=( + const dgl::runtime::NDArray& a1, const dgl::runtime::NDArray& a2); +dgl::runtime::NDArray operator>(const dgl::runtime::NDArray& a1, int64_t rhs); +dgl::runtime::NDArray operator<(const dgl::runtime::NDArray& a1, int64_t rhs); +dgl::runtime::NDArray operator>=(const dgl::runtime::NDArray& a1, int64_t rhs); +dgl::runtime::NDArray operator<=(const dgl::runtime::NDArray& a1, int64_t rhs); +dgl::runtime::NDArray operator==(const dgl::runtime::NDArray& a1, int64_t rhs); +dgl::runtime::NDArray operator!=(const dgl::runtime::NDArray& a1, int64_t rhs); +dgl::runtime::NDArray operator>(int64_t lhs, const dgl::runtime::NDArray& a2); +dgl::runtime::NDArray operator<(int64_t lhs, const dgl::runtime::NDArray& a2); +dgl::runtime::NDArray operator>=(int64_t lhs, const dgl::runtime::NDArray& a2); +dgl::runtime::NDArray operator<=(int64_t lhs, const dgl::runtime::NDArray& a2); +dgl::runtime::NDArray operator==(int64_t lhs, const dgl::runtime::NDArray& a2); +dgl::runtime::NDArray operator!=(int64_t lhs, const dgl::runtime::NDArray& a2); + +std::ostream& operator<<(std::ostream& os, dgl::runtime::NDArray array); + +///////////////// Operator overloading for DGLDataType ///////////////// + +/** @brief Check whether two data types are the same.*/ 
+inline bool operator==(const DGLDataType& ty1, const DGLDataType& ty2) { + return ty1.code == ty2.code && ty1.bits == ty2.bits && ty1.lanes == ty2.lanes; +} + +/** @brief Check whether two data types are different.*/ +inline bool operator!=(const DGLDataType& ty1, const DGLDataType& ty2) { + return !(ty1 == ty2); +} + +#ifndef _LIBCPP_SGX_NO_IOSTREAMS +inline std::ostream& operator<<(std::ostream& os, DGLDataType t) { + os << dgl::runtime::TypeCode2Str(t.code); + if (t.code == kHandle) return os; + os << static_cast(t.bits); + if (t.lanes != 1) { + os << 'x' << static_cast(t.lanes); + } + return os; +} +#endif + +///////////////// Operator overloading for DGLContext ///////////////// + +/** @brief Check whether two device contexts are the same.*/ +inline bool operator==(const DGLContext& ctx1, const DGLContext& ctx2) { + return ctx1.device_type == ctx2.device_type && + ctx1.device_id == ctx2.device_id; +} + +/** @brief Check whether two device contexts are different.*/ +inline bool operator!=(const DGLContext& ctx1, const DGLContext& ctx2) { + return !(ctx1 == ctx2); +} + +#ifndef _LIBCPP_SGX_NO_IOSTREAMS +inline std::ostream& operator<<(std::ostream& os, const DGLContext& ctx) { + return os << dgl::runtime::DeviceTypeCode2Str(ctx.device_type) << ":" + << ctx.device_id; +} +#endif + +#endif // DGL_RUNTIME_NDARRAY_H_ diff --git a/include/dgl/runtime/tensordispatch.h b/include/dgl/runtime/tensordispatch.h index 872c1d68e582..6dd5748bb01e 100644 --- a/include/dgl/runtime/tensordispatch.h +++ b/include/dgl/runtime/tensordispatch.h @@ -33,9 +33,9 @@ #if defined(WIN32) || defined(_WIN32) #include #endif // WIN32 -#ifdef DGL_USE_CUDA -#include -#endif // DGL_USE_CUDA +#ifdef DGL_USE_ROCM +#include +#endif // DGL_USE_ROCM #include "ndarray.h" /** @@ -90,21 +90,21 @@ class TensorDispatcher { FUNCCAST(tensoradapter::CPURawDelete, entry)(ptr); } -#ifdef DGL_USE_CUDA +#ifdef DGL_USE_ROCM /** * @brief Allocate a piece of GPU memory via * PyTorch's THCCachingAllocator. * Used in CUDADeviceAPI::AllocWorkspace(). * * @note THCCachingAllocator specify the device to allocate on - * via cudaGetDevice(). Make sure to call cudaSetDevice() + * via hipGetDevice(). Make sure to call hipSetDevice() * before invoking this function. * * @param nbytes The size to be allocated. * @param stream The stream to be allocated on. * @return Pointer to the allocated memory. */ - inline void* CUDAAllocWorkspace(size_t nbytes, cudaStream_t stream) { + inline void* CUDAAllocWorkspace(size_t nbytes, hipStream_t stream) { auto entry = entrypoints_[Op::kCUDARawAlloc]; return FUNCCAST(tensoradapter::CUDARawAlloc, entry)(nbytes, stream); } @@ -125,12 +125,12 @@ class TensorDispatcher { * Used in runtime::getCurrentCUDAStream(). * * @note PyTorch pre-allocates/sets the current CUDA stream - * on current device via cudaGetDevice(). Make sure to call cudaSetDevice() + * on current device via hipGetDevice(). Make sure to call hipSetDevice() * before invoking this function. * - * @return cudaStream_t stream handle + * @return hipStream_t stream handle */ - inline cudaStream_t CUDAGetCurrentStream() { + inline hipStream_t CUDAGetCurrentStream() { auto entry = entrypoints_[Op::kCUDACurrentStream]; return FUNCCAST(tensoradapter::CUDACurrentStream, entry)(); } @@ -183,7 +183,7 @@ class TensorDispatcher { * @param device_id Device of the tensor. 
*/ inline void CUDARecordHostAlloc( - void* data, void* ctx, cudaStream_t stream, int device_id) { + void* data, void* ctx, hipStream_t stream, int device_id) { auto entry = entrypoints_[Op::kCUDARecordHostAlloc]; auto recorded_alloc = FUNCCAST(tensoradapter::CUDARecordHostAlloc, entry); recorded_alloc(data, ctx, stream, device_id); @@ -198,7 +198,7 @@ class TensorDispatcher { auto entry = entrypoints_[Op::kCUDAHostAllocatorEmptyCache]; FUNCCAST(tensoradapter::CUDAHostAllocatorEmptyCache, entry)(); } -#endif // DGL_USE_CUDA +#endif // DGL_USE_ROCM /** * @brief Record streams that are using this tensor. @@ -209,10 +209,10 @@ class TensorDispatcher { * @param device_id Device of the tensor. */ inline void RecordStream(void* ptr, DGLStreamHandle stream, int device_id) { -#ifdef DGL_USE_CUDA +#ifdef DGL_USE_ROCM auto entry = entrypoints_[Op::kRecordStream]; FUNCCAST(tensoradapter::RecordStream, entry) - (ptr, static_cast(stream), device_id); + (ptr, static_cast(stream), device_id); #endif } @@ -229,12 +229,12 @@ class TensorDispatcher { */ static constexpr const char* names_[] = { "CPURawAlloc", "CPURawDelete", -#ifdef DGL_USE_CUDA +#ifdef DGL_USE_ROCM "CUDARawAlloc", "CUDARawDelete", "CUDACurrentStream", "RecordStream", "CUDARawHostAlloc", "CUDARawHostDelete", "CUDARecordHostAlloc", "CUDAHostAllocatorEmptyCache", -#endif // DGL_USE_CUDA +#endif // DGL_USE_ROCM }; /** @brief Index of each function to the symbol list */ @@ -242,7 +242,7 @@ class TensorDispatcher { public: static constexpr int kCPURawAlloc = 0; static constexpr int kCPURawDelete = 1; -#ifdef DGL_USE_CUDA +#ifdef DGL_USE_ROCM static constexpr int kCUDARawAlloc = 2; static constexpr int kCUDARawDelete = 3; static constexpr int kCUDACurrentStream = 4; @@ -251,7 +251,7 @@ class TensorDispatcher { static constexpr int kCUDARawHostDelete = 7; static constexpr int kCUDARecordHostAlloc = 8; static constexpr int kCUDAHostAllocatorEmptyCache = 9; -#endif // DGL_USE_CUDA +#endif // DGL_USE_ROCM }; /** @brief Number of functions */ @@ -260,9 +260,9 @@ class TensorDispatcher { /** @brief Entrypoints of each function */ void* entrypoints_[num_entries_] = { nullptr, nullptr, -#ifdef DGL_USE_CUDA +#ifdef DGL_USE_ROCM nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, -#endif // DGL_USE_CUDA +#endif // DGL_USE_ROCM }; bool available_ = false; diff --git a/include/dgl/runtime/tensordispatch.h.prehip b/include/dgl/runtime/tensordispatch.h.prehip new file mode 100644 index 000000000000..872c1d68e582 --- /dev/null +++ b/include/dgl/runtime/tensordispatch.h.prehip @@ -0,0 +1,281 @@ +/** + * Copyright (c) 2020-2022 by Contributors + * @file array/tensordispatch.h + * @brief This file defines the dispatcher of tensor operators to + * framework-specific implementations. + * + * The dispatcher consists of a TensorDispatcher singleton in DGL C library and + * one separately-built shared library per supported backend. + * + * Those shared libraries contain wrappers of the framework-specific operators. + * The wrappers are defined with extern "C", meaning that the C++ compiler will + * not do name mangling for those functions so that DGL can conveniently locate + * them using dlsym(3) (or GetProcAddress in Windows). + * + * The TensorDispatcher singleton maintains a mapping from an array operator to + * the address of the corresponding symbol in the shared library. During + * initialization, the TensorDispatcher checks which backend DGL is using. 
+ * It then locates and opens the corresponding shared library using dlopen(3) + * (or LoadLibrary in Windows), and populates the said mapping above with + * dlsym(3) (or GetProcAddress in Windows). + * + * A tensor operator in TensorDispatcher first checks whether the corresponding + * symbol address is found in the mapping. If so, it calls the function located + * at the symbol address instead, allocate/free pieces of memory on CPU/GPU. If + * not, it falls back to DeviceAPI::AllocWorkspace/FreeWorkspace. + */ + +#ifndef DGL_RUNTIME_TENSORDISPATCH_H_ +#define DGL_RUNTIME_TENSORDISPATCH_H_ + +#include +#include +#if defined(WIN32) || defined(_WIN32) +#include +#endif // WIN32 +#ifdef DGL_USE_CUDA +#include +#endif // DGL_USE_CUDA +#include "ndarray.h" + +/** + * @brief Casts a pointer \c entry to a function pointer with signature of \c + * func. + */ +#define FUNCCAST(func, entry) (*reinterpret_cast(entry)) + +namespace dgl { +namespace runtime { + +/** + * @brief Dispatcher that delegates the function calls to framework-specific C++ + * APIs. + * + * This class is not thread-safe. + */ +class TensorDispatcher { + public: + /** @brief Get the singleton instance. */ + static TensorDispatcher* Global() { + static TensorDispatcher inst; + return &inst; + } + + /** @brief Whether an adapter library is available. */ + inline bool IsAvailable() { return available_; } + + /** @brief Load symbols from the given tensor adapter library path. */ + bool Load(const char* path_cstr); + + /** + * @brief Allocate a piece of CPU memory via PyTorch's CPUAllocator. + * Used in CPUDeviceAPI::AllocWorkspace(). + * + * @param nbytes The size to be allocated. + * @return Pointer to the allocated memory. + */ + inline void* CPUAllocWorkspace(size_t nbytes) { + auto entry = entrypoints_[Op::kCPURawAlloc]; + return FUNCCAST(tensoradapter::CPURawAlloc, entry)(nbytes); + } + + /** + * @brief Free the CPU memory. + * Used in CPUDeviceAPI::FreeWorkspace(). + * + * @param ptr Pointer to the memory to be freed. + */ + inline void CPUFreeWorkspace(void* ptr) { + auto entry = entrypoints_[Op::kCPURawDelete]; + FUNCCAST(tensoradapter::CPURawDelete, entry)(ptr); + } + +#ifdef DGL_USE_CUDA + /** + * @brief Allocate a piece of GPU memory via + * PyTorch's THCCachingAllocator. + * Used in CUDADeviceAPI::AllocWorkspace(). + * + * @note THCCachingAllocator specify the device to allocate on + * via cudaGetDevice(). Make sure to call cudaSetDevice() + * before invoking this function. + * + * @param nbytes The size to be allocated. + * @param stream The stream to be allocated on. + * @return Pointer to the allocated memory. + */ + inline void* CUDAAllocWorkspace(size_t nbytes, cudaStream_t stream) { + auto entry = entrypoints_[Op::kCUDARawAlloc]; + return FUNCCAST(tensoradapter::CUDARawAlloc, entry)(nbytes, stream); + } + + /** + * @brief Free the GPU memory. + * Used in CUDADeviceAPI::FreeWorkspace(). + * + * @param ptr Pointer to the memory to be freed. + */ + inline void CUDAFreeWorkspace(void* ptr) { + auto entry = entrypoints_[Op::kCUDARawDelete]; + FUNCCAST(tensoradapter::CUDARawDelete, entry)(ptr); + } + + /** + * @brief Find the current PyTorch CUDA stream + * Used in runtime::getCurrentCUDAStream(). + * + * @note PyTorch pre-allocates/sets the current CUDA stream + * on current device via cudaGetDevice(). Make sure to call cudaSetDevice() + * before invoking this function. 
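+ *
+ * A minimal calling sketch (editorial illustration; `dev_id` is a placeholder
+ * for the device you intend to use):
+ *
+ *   cudaSetDevice(dev_id);
+ *   cudaStream_t s = TensorDispatcher::Global()->CUDAGetCurrentStream();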
+ * + * @return cudaStream_t stream handle + */ + inline cudaStream_t CUDAGetCurrentStream() { + auto entry = entrypoints_[Op::kCUDACurrentStream]; + return FUNCCAST(tensoradapter::CUDACurrentStream, entry)(); + } + + /** + * @brief Allocate a piece of pinned CPU memory via PyTorch + * CachingHostAllocator. + * @note Used in CUDADeviceAPI::AllocPinnedDataSpace(). + * @param nbytes The size to be allocated. + * @param ctx Pointer to the PyTorch storage ctx ptr returned from the + * allocator. + * @param deleter Pointer to the delete function ptr returned from the + * allocator. + * @return Raw pointer to the allocated memory. + */ + inline void* CUDAAllocHostWorkspace( + size_t nbytes, void** ctx, void** deleter) { + auto entry = entrypoints_[Op::kCUDARawHostAlloc]; + + auto alloc_func = FUNCCAST(tensoradapter::CUDARawHostAlloc, entry); + return alloc_func(nbytes, ctx, deleter); + } + + /** + * @brief Insert the pinned memory block (allocated via PyTorch + * CachingHostAllocator) back to the free list for future usage.(ref: + * pytorch/pytorch/blob/master/aten/src/ATen/cuda/CachingHostAllocator.cpp). + * @note Used in CUDADeviceAPI::FreePinnedDataSpace(). + * @param deleter Pointer to the delete function ptr returned from the + * allocator. + */ + inline void CUDAFreeHostWorkspace(void** deleter) { + auto entry = entrypoints_[Op::kCUDARawHostDelete]; + FUNCCAST(tensoradapter::CUDARawHostDelete, entry)(deleter); + } + + /** + * @brief Invoke the record_event function call from PyTorch + * CachingHostAllocator. + * @note This function assoicates a CUDA stream (used by a copy kernel) to the + * pinned data. In the free path of this data, which is achieved by + * calling CUDAFreeHostWorkspace, the set of associated streams is then + * consumed to ensure proper functionlity. (ref: + * pytorch/pytorch/blob/master/aten/src/ATen/cuda/CachingHostAllocator.cpp). + * Used in CUDADeviceAPI::RecordedCopyDataFromTo(). + * + * @param data Pointer of the tensor to be recorded. + * @param ctx PyTorch storage ctx ptr returned from the allocator. + * @param stream The stream that currently consumes this tensor. + * @param device_id Device of the tensor. + */ + inline void CUDARecordHostAlloc( + void* data, void* ctx, cudaStream_t stream, int device_id) { + auto entry = entrypoints_[Op::kCUDARecordHostAlloc]; + auto recorded_alloc = FUNCCAST(tensoradapter::CUDARecordHostAlloc, entry); + recorded_alloc(data, ctx, stream, device_id); + } + + /** + * @brief Release cached pinned memory allocations via cudaHostFree. + * @note Used in CUDADeviceAPI::PinData() before pinning any host memory by + * DGL. + */ + inline void CUDAHostAllocatorEmptyCache() { + auto entry = entrypoints_[Op::kCUDAHostAllocatorEmptyCache]; + FUNCCAST(tensoradapter::CUDAHostAllocatorEmptyCache, entry)(); + } +#endif // DGL_USE_CUDA + + /** + * @brief Record streams that are using this tensor. + * Used in NDArray::RecordStream(). + * + * @param ptr Pointer of the tensor to be recorded. + * @param stream The stream that is using this tensor. + * @param device_id Device of the tensor. + */ + inline void RecordStream(void* ptr, DGLStreamHandle stream, int device_id) { +#ifdef DGL_USE_CUDA + auto entry = entrypoints_[Op::kRecordStream]; + FUNCCAST(tensoradapter::RecordStream, entry) + (ptr, static_cast(stream), device_id); +#endif + } + + private: + /** @brief ctor */ + TensorDispatcher() = default; + /** @brief dtor */ + ~TensorDispatcher(); + + /** + * @brief List of symbols in the adapter library. 
+ * + * Must match the functions in tensoradapter/include/tensoradapter.h. + */ + static constexpr const char* names_[] = { + "CPURawAlloc", "CPURawDelete", +#ifdef DGL_USE_CUDA + "CUDARawAlloc", "CUDARawDelete", + "CUDACurrentStream", "RecordStream", + "CUDARawHostAlloc", "CUDARawHostDelete", + "CUDARecordHostAlloc", "CUDAHostAllocatorEmptyCache", +#endif // DGL_USE_CUDA + }; + + /** @brief Index of each function to the symbol list */ + class Op { + public: + static constexpr int kCPURawAlloc = 0; + static constexpr int kCPURawDelete = 1; +#ifdef DGL_USE_CUDA + static constexpr int kCUDARawAlloc = 2; + static constexpr int kCUDARawDelete = 3; + static constexpr int kCUDACurrentStream = 4; + static constexpr int kRecordStream = 5; + static constexpr int kCUDARawHostAlloc = 6; + static constexpr int kCUDARawHostDelete = 7; + static constexpr int kCUDARecordHostAlloc = 8; + static constexpr int kCUDAHostAllocatorEmptyCache = 9; +#endif // DGL_USE_CUDA + }; + + /** @brief Number of functions */ + static constexpr int num_entries_ = sizeof(names_) / sizeof(names_[0]); + + /** @brief Entrypoints of each function */ + void* entrypoints_[num_entries_] = { + nullptr, nullptr, +#ifdef DGL_USE_CUDA + nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, +#endif // DGL_USE_CUDA + }; + + bool available_ = false; +#if defined(WIN32) || defined(_WIN32) + HINSTANCE handle_; +#else // !WIN32 + void* handle_; +#endif // WIN32 +}; + +}; // namespace runtime +}; // namespace dgl + +#undef FUNCCAST + +#endif // DGL_RUNTIME_TENSORDISPATCH_H_ diff --git a/src/array/arith.h b/src/array/arith.h index 9526c694eb74..808f5f458168 100644 --- a/src/array/arith.h +++ b/src/array/arith.h @@ -6,13 +6,13 @@ #ifndef DGL_ARRAY_ARITH_H_ #define DGL_ARRAY_ARITH_H_ -#ifdef __CUDACC__ +#ifdef __HIPCC__ #define DGLDEVICE __device__ #define DGLINLINE __forceinline__ #else #define DGLDEVICE #define DGLINLINE inline -#endif // __CUDACC__ +#endif // __HIPCC__ namespace dgl { namespace aten { diff --git a/src/array/arith.h.prehip b/src/array/arith.h.prehip new file mode 100644 index 000000000000..9526c694eb74 --- /dev/null +++ b/src/array/arith.h.prehip @@ -0,0 +1,109 @@ +/** + * Copyright (c) 2019 by Contributors + * @file array/arith.h + * @brief Arithmetic functors + */ +#ifndef DGL_ARRAY_ARITH_H_ +#define DGL_ARRAY_ARITH_H_ + +#ifdef __CUDACC__ +#define DGLDEVICE __device__ +#define DGLINLINE __forceinline__ +#else +#define DGLDEVICE +#define DGLINLINE inline +#endif // __CUDACC__ + +namespace dgl { +namespace aten { +namespace arith { + +struct Add { + template + static DGLINLINE DGLDEVICE T Call(const T& t1, const T& t2) { + return t1 + t2; + } +}; + +struct Sub { + template + static DGLINLINE DGLDEVICE T Call(const T& t1, const T& t2) { + return t1 - t2; + } +}; + +struct Mul { + template + static DGLINLINE DGLDEVICE T Call(const T& t1, const T& t2) { + return t1 * t2; + } +}; + +struct Div { + template + static DGLINLINE DGLDEVICE T Call(const T& t1, const T& t2) { + return t1 / t2; + } +}; + +struct Mod { + template + static DGLINLINE DGLDEVICE T Call(const T& t1, const T& t2) { + return t1 % t2; + } +}; + +struct GT { + template + static DGLINLINE DGLDEVICE bool Call(const T& t1, const T& t2) { + return t1 > t2; + } +}; + +struct LT { + template + static DGLINLINE DGLDEVICE bool Call(const T& t1, const T& t2) { + return t1 < t2; + } +}; + +struct GE { + template + static DGLINLINE DGLDEVICE bool Call(const T& t1, const T& t2) { + return t1 >= t2; + } +}; + +struct LE { + template + static DGLINLINE 
DGLDEVICE bool Call(const T& t1, const T& t2) { + return t1 <= t2; + } +}; + +struct EQ { + template + static DGLINLINE DGLDEVICE bool Call(const T& t1, const T& t2) { + return t1 == t2; + } +}; + +struct NE { + template + static DGLINLINE DGLDEVICE bool Call(const T& t1, const T& t2) { + return t1 != t2; + } +}; + +struct Neg { + template + static DGLINLINE DGLDEVICE T Call(const T& t1) { + return -t1; + } +}; + +} // namespace arith +} // namespace aten +} // namespace dgl + +#endif // DGL_ARRAY_ARITH_H_ diff --git a/src/array/cuda/array_cumsum.cu b/src/array/cuda/array_cumsum.cu index 5d5ef1603c46..c372c75abf4a 100644 --- a/src/array/cuda/array_cumsum.cu +++ b/src/array/cuda/array_cumsum.cu @@ -5,7 +5,7 @@ */ #include -#include +#include #include "../../runtime/cuda/cuda_common.h" #include "./utils.h" @@ -23,7 +23,7 @@ IdArray CumSum(IdArray array, bool prepend_zero) { : aten::Full(0, 1, array->dtype.bits, array->ctx); auto device = runtime::DeviceAPI::Get(array->ctx); - cudaStream_t stream = runtime::getCurrentCUDAStream(); + hipStream_t stream = runtime::getCurrentCUDAStream(); const IdType* in_d = array.Ptr(); IdArray ret; IdType* out_d = nullptr; @@ -36,12 +36,12 @@ IdArray CumSum(IdArray array, bool prepend_zero) { } // Allocate workspace size_t workspace_size = 0; - CUDA_CALL(cub::DeviceScan::InclusiveSum( + CUDA_CALL(hipcub::DeviceScan::InclusiveSum( nullptr, workspace_size, in_d, out_d, len, stream)); void* workspace = device->AllocWorkspace(array->ctx, workspace_size); // Compute cumsum - CUDA_CALL(cub::DeviceScan::InclusiveSum( + CUDA_CALL(hipcub::DeviceScan::InclusiveSum( workspace, workspace_size, in_d, out_d, len, stream)); device->FreeWorkspace(array->ctx, workspace); diff --git a/src/array/cuda/array_cumsum.cu.prehip b/src/array/cuda/array_cumsum.cu.prehip new file mode 100644 index 000000000000..5d5ef1603c46 --- /dev/null +++ b/src/array/cuda/array_cumsum.cu.prehip @@ -0,0 +1,57 @@ +/** + * Copyright (c) 2020 by Contributors + * @file array/cpu/array_cumsum.cu + * @brief Array cumsum GPU implementation + */ +#include + +#include + +#include "../../runtime/cuda/cuda_common.h" +#include "./utils.h" + +namespace dgl { +using runtime::NDArray; +namespace aten { +namespace impl { + +template +IdArray CumSum(IdArray array, bool prepend_zero) { + const int64_t len = array.NumElements(); + if (len == 0) + return !prepend_zero ? 
array + : aten::Full(0, 1, array->dtype.bits, array->ctx); + + auto device = runtime::DeviceAPI::Get(array->ctx); + cudaStream_t stream = runtime::getCurrentCUDAStream(); + const IdType* in_d = array.Ptr(); + IdArray ret; + IdType* out_d = nullptr; + if (prepend_zero) { + ret = aten::Full(0, len + 1, array->dtype.bits, array->ctx); + out_d = ret.Ptr() + 1; + } else { + ret = aten::NewIdArray(len, array->ctx, array->dtype.bits); + out_d = ret.Ptr(); + } + // Allocate workspace + size_t workspace_size = 0; + CUDA_CALL(cub::DeviceScan::InclusiveSum( + nullptr, workspace_size, in_d, out_d, len, stream)); + void* workspace = device->AllocWorkspace(array->ctx, workspace_size); + + // Compute cumsum + CUDA_CALL(cub::DeviceScan::InclusiveSum( + workspace, workspace_size, in_d, out_d, len, stream)); + + device->FreeWorkspace(array->ctx, workspace); + + return ret; +} + +template IdArray CumSum(IdArray, bool); +template IdArray CumSum(IdArray, bool); + +} // namespace impl +} // namespace aten +} // namespace dgl diff --git a/src/array/cuda/array_index_select.cu b/src/array/cuda/array_index_select.cu index 6e29e996a1c3..e5370a4c27c7 100644 --- a/src/array/cuda/array_index_select.cu +++ b/src/array/cuda/array_index_select.cu @@ -33,7 +33,7 @@ NDArray IndexSelect(NDArray array, IdArray index) { const DType* array_data = static_cast(cuda::GetDevicePointer(array)); const IdType* idx_data = static_cast(index->data); - cudaStream_t stream = runtime::getCurrentCUDAStream(); + hipStream_t stream = runtime::getCurrentCUDAStream(); if (num_feat == 1) { const int nt = cuda::FindNumThreads(len); const int nb = (len + nt - 1) / nt; @@ -61,9 +61,9 @@ template NDArray IndexSelect(NDArray, IdArray); template NDArray IndexSelect(NDArray, IdArray); template NDArray IndexSelect(NDArray, IdArray); #if BF16_ENABLED -template NDArray IndexSelect( +template NDArray IndexSelect( NDArray, IdArray); -template NDArray IndexSelect( +template NDArray IndexSelect( NDArray, IdArray); #endif // BF16_ENABLED template NDArray IndexSelect(NDArray, IdArray); @@ -87,7 +87,7 @@ template uint32_t IndexSelect(NDArray array, int64_t index); template uint64_t IndexSelect(NDArray array, int64_t index); template __half IndexSelect(NDArray array, int64_t index); #if BF16_ENABLED -template __nv_bfloat16 IndexSelect( +template __hip_bfloat16 IndexSelect( NDArray array, int64_t index); #endif // BF16_ENABLED template float IndexSelect(NDArray array, int64_t index); diff --git a/src/array/cuda/array_index_select.cu.prehip b/src/array/cuda/array_index_select.cu.prehip new file mode 100644 index 000000000000..6e29e996a1c3 --- /dev/null +++ b/src/array/cuda/array_index_select.cu.prehip @@ -0,0 +1,98 @@ +/** + * Copyright (c) 2019 by Contributors + * @file array/cpu/array_index_select.cu + * @brief Array index select GPU implementation + */ +#include + +#include "../../runtime/cuda/cuda_common.h" +#include "./array_index_select.cuh" +#include "./utils.h" + +namespace dgl { +using runtime::NDArray; +namespace aten { +namespace impl { + +template +NDArray IndexSelect(NDArray array, IdArray index) { + const int64_t arr_len = array->shape[0]; + const int64_t len = index->shape[0]; + int64_t num_feat = 1; + std::vector shape{len}; + for (int d = 1; d < array->ndim; ++d) { + num_feat *= array->shape[d]; + shape.emplace_back(array->shape[d]); + } + + // use index->ctx for pinned array + NDArray ret = NDArray::Empty(shape, array->dtype, index->ctx); + if (len == 0 || arr_len * num_feat == 0) return ret; + DType* ret_data = static_cast(ret->data); + + const 
DType* array_data = static_cast(cuda::GetDevicePointer(array)); + const IdType* idx_data = static_cast(index->data); + + cudaStream_t stream = runtime::getCurrentCUDAStream(); + if (num_feat == 1) { + const int nt = cuda::FindNumThreads(len); + const int nb = (len + nt - 1) / nt; + CUDA_KERNEL_CALL( + IndexSelectSingleKernel, nb, nt, 0, stream, array_data, idx_data, len, + arr_len, ret_data); + } else { + dim3 block(256, 1); + while (static_cast(block.x) >= 2 * num_feat) { + block.x /= 2; + block.y *= 2; + } + const dim3 grid((len + block.y - 1) / block.y); + CUDA_KERNEL_CALL( + IndexSelectMultiKernel, grid, block, 0, stream, array_data, num_feat, + idx_data, len, arr_len, ret_data); + } + return ret; +} + +template NDArray IndexSelect(NDArray, IdArray); +template NDArray IndexSelect(NDArray, IdArray); +template NDArray IndexSelect(NDArray, IdArray); +template NDArray IndexSelect(NDArray, IdArray); +template NDArray IndexSelect(NDArray, IdArray); +template NDArray IndexSelect(NDArray, IdArray); +#if BF16_ENABLED +template NDArray IndexSelect( + NDArray, IdArray); +template NDArray IndexSelect( + NDArray, IdArray); +#endif // BF16_ENABLED +template NDArray IndexSelect(NDArray, IdArray); +template NDArray IndexSelect(NDArray, IdArray); +template NDArray IndexSelect(NDArray, IdArray); +template NDArray IndexSelect(NDArray, IdArray); + +template +DType IndexSelect(NDArray array, int64_t index) { + auto device = runtime::DeviceAPI::Get(array->ctx); + DType ret = static_cast(0.0f); + device->CopyDataFromTo( + static_cast(array->data) + index, 0, &ret, 0, sizeof(DType), + array->ctx, DGLContext{kDGLCPU, 0}, array->dtype); + return ret; +} + +template int32_t IndexSelect(NDArray array, int64_t index); +template int64_t IndexSelect(NDArray array, int64_t index); +template uint32_t IndexSelect(NDArray array, int64_t index); +template uint64_t IndexSelect(NDArray array, int64_t index); +template __half IndexSelect(NDArray array, int64_t index); +#if BF16_ENABLED +template __nv_bfloat16 IndexSelect( + NDArray array, int64_t index); +#endif // BF16_ENABLED +template float IndexSelect(NDArray array, int64_t index); +template double IndexSelect(NDArray array, int64_t index); + +} // namespace impl +} // namespace aten +} // namespace dgl diff --git a/src/array/cuda/array_index_select.cuh b/src/array/cuda/array_index_select.cuh index e4d8673ca026..441bd8e505a4 100644 --- a/src/array/cuda/array_index_select.cuh +++ b/src/array/cuda/array_index_select.cuh @@ -1,3 +1,4 @@ +#include "hip/hip_runtime.h" /** * Copyright (c) 2021-2022 by Contributors * @file array/cuda/array_index_select.cuh diff --git a/src/array/cuda/array_index_select.cuh.prehip b/src/array/cuda/array_index_select.cuh.prehip new file mode 100644 index 000000000000..e4d8673ca026 --- /dev/null +++ b/src/array/cuda/array_index_select.cuh.prehip @@ -0,0 +1,87 @@ +/** + * Copyright (c) 2021-2022 by Contributors + * @file array/cuda/array_index_select.cuh + * @brief Array index select GPU kernel implementation + */ + +#ifndef DGL_ARRAY_CUDA_ARRAY_INDEX_SELECT_CUH_ +#define DGL_ARRAY_CUDA_ARRAY_INDEX_SELECT_CUH_ + +namespace dgl { +namespace aten { +namespace impl { + +template +__global__ void IndexSelectSingleKernel( + const DType* array, const IdType* index, const int64_t length, + const int64_t arr_len, DType* out, const int64_t* perm = nullptr) { + int64_t tx = blockIdx.x * blockDim.x + threadIdx.x; + int stride_x = gridDim.x * blockDim.x; + while (tx < length) { + assert(index[tx] >= 0 && index[tx] < arr_len); + const auto out_row = perm ? 
perm[tx] : tx; + out[out_row] = array[index[tx]]; + tx += stride_x; + } +} + +template +__global__ void IndexSelectMultiKernel( + const DType* const array, const int64_t num_feat, const IdType* const index, + const int64_t length, const int64_t arr_len, DType* const out, + const int64_t* perm = nullptr) { + int64_t out_row_index = blockIdx.x * blockDim.y + threadIdx.y; + + const int64_t stride = blockDim.y * gridDim.x; + + while (out_row_index < length) { + int64_t col = threadIdx.x; + const int64_t in_row = index[out_row_index]; + assert(in_row >= 0 && in_row < arr_len); + const auto out_row = perm ? perm[out_row_index] : out_row_index; + while (col < num_feat) { + out[out_row * num_feat + col] = array[in_row * num_feat + col]; + col += blockDim.x; + } + out_row_index += stride; + } +} + +template +__global__ void IndexScatterSingleKernel( + const DType* array, const IdType* index, const int64_t length, + const int64_t arr_len, DType* out) { + int tx = blockIdx.x * blockDim.x + threadIdx.x; + int stride_x = gridDim.x * blockDim.x; + while (tx < length) { + assert(index[tx] >= 0 && index[tx] < arr_len); + out[index[tx]] = array[tx]; + tx += stride_x; + } +} + +template +__global__ void IndexScatterMultiKernel( + const DType* const array, const int64_t num_feat, const IdType* const index, + const int64_t length, const int64_t arr_len, DType* const out) { + int64_t in_row = blockIdx.x * blockDim.y + threadIdx.y; + + const int64_t stride = blockDim.y * gridDim.x; + + while (in_row < length) { + int64_t col = threadIdx.x; + const int64_t out_row = index[in_row]; + assert(out_row >= 0 && out_row < arr_len); + while (col < num_feat) { + out[out_row * num_feat + col] = array[in_row * num_feat + col]; + col += blockDim.x; + } + in_row += stride; + } +} + +} // namespace impl +} // namespace aten +} // namespace dgl + +#endif // DGL_ARRAY_CUDA_ARRAY_INDEX_SELECT_CUH_ diff --git a/src/array/cuda/array_nonzero.cu b/src/array/cuda/array_nonzero.cu index 3ffaad2d657a..299afd4cc235 100644 --- a/src/array/cuda/array_nonzero.cu +++ b/src/array/cuda/array_nonzero.cu @@ -6,7 +6,7 @@ #include -#include +#include #include "../../runtime/cuda/cuda_common.h" #include "./utils.h" @@ -33,24 +33,24 @@ IdArray NonZero(IdArray array) { const int64_t len = array->shape[0]; IdArray ret = NewIdArray(len, ctx, 64); - cudaStream_t stream = runtime::getCurrentCUDAStream(); + hipStream_t stream = runtime::getCurrentCUDAStream(); const IdType* const in_data = static_cast(array->data); int64_t* const out_data = static_cast(ret->data); IsNonZeroIndex comp(in_data); - cub::CountingInputIterator counter(0); + hipcub::CountingInputIterator counter(0); // room for cub to output on GPU int64_t* d_num_nonzeros = static_cast(device->AllocWorkspace(ctx, sizeof(int64_t))); size_t temp_size = 0; - CUDA_CALL(cub::DeviceSelect::If( + CUDA_CALL(hipcub::DeviceSelect::If( nullptr, temp_size, counter, out_data, d_num_nonzeros, len, comp, stream)); void* temp = device->AllocWorkspace(ctx, temp_size); - CUDA_CALL(cub::DeviceSelect::If( + CUDA_CALL(hipcub::DeviceSelect::If( temp, temp_size, counter, out_data, d_num_nonzeros, len, comp, stream)); device->FreeWorkspace(ctx, temp); diff --git a/src/array/cuda/array_nonzero.cu.prehip b/src/array/cuda/array_nonzero.cu.prehip new file mode 100644 index 000000000000..3ffaad2d657a --- /dev/null +++ b/src/array/cuda/array_nonzero.cu.prehip @@ -0,0 +1,71 @@ +/** + * Copyright (c) 2020 by Contributors + * @file array/cpu/array_nonzero.cc + * @brief Array nonzero CPU implementation + */ + +#include + 
+#include + +#include "../../runtime/cuda/cuda_common.h" +#include "./utils.h" + +namespace dgl { +using runtime::NDArray; +namespace aten { +namespace impl { + +template +struct IsNonZeroIndex { + explicit IsNonZeroIndex(const IdType* array) : array_(array) {} + + __device__ bool operator()(const int64_t index) { return array_[index] != 0; } + + const IdType* array_; +}; + +template +IdArray NonZero(IdArray array) { + const auto& ctx = array->ctx; + auto device = runtime::DeviceAPI::Get(ctx); + + const int64_t len = array->shape[0]; + IdArray ret = NewIdArray(len, ctx, 64); + + cudaStream_t stream = runtime::getCurrentCUDAStream(); + + const IdType* const in_data = static_cast(array->data); + int64_t* const out_data = static_cast(ret->data); + + IsNonZeroIndex comp(in_data); + cub::CountingInputIterator counter(0); + + // room for cub to output on GPU + int64_t* d_num_nonzeros = + static_cast(device->AllocWorkspace(ctx, sizeof(int64_t))); + + size_t temp_size = 0; + CUDA_CALL(cub::DeviceSelect::If( + nullptr, temp_size, counter, out_data, d_num_nonzeros, len, comp, + stream)); + void* temp = device->AllocWorkspace(ctx, temp_size); + CUDA_CALL(cub::DeviceSelect::If( + temp, temp_size, counter, out_data, d_num_nonzeros, len, comp, stream)); + device->FreeWorkspace(ctx, temp); + + // copy number of selected elements from GPU to CPU + int64_t num_nonzeros = cuda::GetCUDAScalar(device, ctx, d_num_nonzeros); + device->FreeWorkspace(ctx, d_num_nonzeros); + device->StreamSync(ctx, stream); + + // truncate array to size + return ret.CreateView({num_nonzeros}, ret->dtype, 0); +} + +template IdArray NonZero(IdArray); +template IdArray NonZero(IdArray); + +} // namespace impl +} // namespace aten +} // namespace dgl diff --git a/src/array/cuda/array_op_impl.cu b/src/array/cuda/array_op_impl.cu index f5f19be81ba3..f517e1057468 100644 --- a/src/array/cuda/array_op_impl.cu +++ b/src/array/cuda/array_op_impl.cu @@ -1,3 +1,4 @@ +#include "hip/hip_runtime.h" /** * Copyright (c) 2020-2021 by Contributors * @file array/cuda/array_op_impl.cu @@ -36,7 +37,7 @@ IdArray BinaryElewise(IdArray lhs, IdArray rhs) { const IdType* lhs_data = static_cast(lhs->data); const IdType* rhs_data = static_cast(rhs->data); IdType* ret_data = static_cast(ret->data); - cudaStream_t stream = runtime::getCurrentCUDAStream(); + hipStream_t stream = runtime::getCurrentCUDAStream(); int nt = cuda::FindNumThreads(len); int nb = (len + nt - 1) / nt; CUDA_KERNEL_CALL( @@ -107,7 +108,7 @@ IdArray BinaryElewise(IdArray lhs, IdType rhs) { IdArray ret = NewIdArray(lhs->shape[0], lhs->ctx, lhs->dtype.bits); const IdType* lhs_data = static_cast(lhs->data); IdType* ret_data = static_cast(ret->data); - cudaStream_t stream = runtime::getCurrentCUDAStream(); + hipStream_t stream = runtime::getCurrentCUDAStream(); int nt = cuda::FindNumThreads(len); int nb = (len + nt - 1) / nt; CUDA_KERNEL_CALL( @@ -178,7 +179,7 @@ IdArray BinaryElewise(IdType lhs, IdArray rhs) { IdArray ret = NewIdArray(rhs->shape[0], rhs->ctx, rhs->dtype.bits); const IdType* rhs_data = static_cast(rhs->data); IdType* ret_data = static_cast(ret->data); - cudaStream_t stream = runtime::getCurrentCUDAStream(); + hipStream_t stream = runtime::getCurrentCUDAStream(); int nt = cuda::FindNumThreads(len); int nb = (len + nt - 1) / nt; CUDA_KERNEL_CALL( @@ -249,7 +250,7 @@ IdArray UnaryElewise(IdArray lhs) { IdArray ret = NewIdArray(lhs->shape[0], lhs->ctx, lhs->dtype.bits); const IdType* lhs_data = static_cast(lhs->data); IdType* ret_data = static_cast(ret->data); - cudaStream_t stream 
= runtime::getCurrentCUDAStream(); + hipStream_t stream = runtime::getCurrentCUDAStream(); int nt = cuda::FindNumThreads(len); int nb = (len + nt - 1) / nt; CUDA_KERNEL_CALL( @@ -277,7 +278,7 @@ template NDArray Full(DType val, int64_t length, DGLContext ctx) { NDArray ret = NDArray::Empty({length}, DGLDataTypeTraits::dtype, ctx); DType* ret_data = static_cast(ret->data); - cudaStream_t stream = runtime::getCurrentCUDAStream(); + hipStream_t stream = runtime::getCurrentCUDAStream(); int nt = cuda::FindNumThreads(length); int nb = (length + nt - 1) / nt; CUDA_KERNEL_CALL( @@ -292,8 +293,8 @@ template IdArray Full( template IdArray Full( __half val, int64_t length, DGLContext ctx); #if BF16_ENABLED -template IdArray Full( - __nv_bfloat16 val, int64_t length, DGLContext ctx); +template IdArray Full( + __hip_bfloat16 val, int64_t length, DGLContext ctx); #endif // BF16_ENABLED template IdArray Full( float val, int64_t length, DGLContext ctx); @@ -319,7 +320,7 @@ IdArray Range(IdType low, IdType high, DGLContext ctx) { IdArray ret = NewIdArray(length, ctx, sizeof(IdType) * 8); if (length == 0) return ret; IdType* ret_data = static_cast(ret->data); - cudaStream_t stream = runtime::getCurrentCUDAStream(); + hipStream_t stream = runtime::getCurrentCUDAStream(); int nt = cuda::FindNumThreads(length); int nb = (length + nt - 1) / nt; CUDA_KERNEL_CALL( @@ -355,7 +356,7 @@ IdArray Relabel_(const std::vector& arrays) { const auto& ctx = arrays[0]->ctx; auto device = runtime::DeviceAPI::Get(ctx); - cudaStream_t stream = runtime::getCurrentCUDAStream(); + hipStream_t stream = runtime::getCurrentCUDAStream(); // build node maps and get the induced nodes OrderedHashTable node_map(total_length, ctx, stream); @@ -364,7 +365,7 @@ IdArray Relabel_(const std::vector& arrays) { static_cast(device->AllocWorkspace(ctx, sizeof(int64_t))); IdArray induced_nodes = NewIdArray(total_length, ctx, sizeof(IdType) * 8); - CUDA_CALL(cudaMemsetAsync( + CUDA_CALL(hipMemsetAsync( num_induced_device, 0, sizeof(*num_induced_device), stream)); node_map.FillWithDuplicates( @@ -416,7 +417,7 @@ IdArray AsNumBits(IdArray arr, uint8_t bits) { const std::vector shape(arr->shape, arr->shape + arr->ndim); IdArray ret = IdArray::Empty(shape, DGLDataType{kDGLInt, bits, 1}, arr->ctx); const int64_t length = ret.NumElements(); - cudaStream_t stream = runtime::getCurrentCUDAStream(); + hipStream_t stream = runtime::getCurrentCUDAStream(); int nt = cuda::FindNumThreads(length); int nb = (length + nt - 1) / nt; if (bits == 32) { diff --git a/src/array/cuda/array_op_impl.cu.prehip b/src/array/cuda/array_op_impl.cu.prehip new file mode 100644 index 000000000000..f5f19be81ba3 --- /dev/null +++ b/src/array/cuda/array_op_impl.cu.prehip @@ -0,0 +1,441 @@ +/** + * Copyright (c) 2020-2021 by Contributors + * @file array/cuda/array_op_impl.cu + * @brief Array operator GPU implementation + */ +#include + +#include "../../runtime/cuda/cuda_common.h" +#include "../../runtime/cuda/cuda_hashtable.cuh" +#include "../arith.h" +#include "./utils.h" + +namespace dgl { +using runtime::NDArray; +using namespace runtime::cuda; +namespace aten { +namespace impl { + +///////////////////////////// BinaryElewise ///////////////////////////// + +template +__global__ void _BinaryElewiseKernel( + const IdType* lhs, const IdType* rhs, IdType* out, int64_t length) { + int tx = blockIdx.x * blockDim.x + threadIdx.x; + int stride_x = gridDim.x * blockDim.x; + while (tx < length) { + out[tx] = Op::Call(lhs[tx], rhs[tx]); + tx += stride_x; + } +} + +template +IdArray 
BinaryElewise(IdArray lhs, IdArray rhs) { + const int64_t len = lhs->shape[0]; + IdArray ret = NewIdArray(lhs->shape[0], lhs->ctx, lhs->dtype.bits); + const IdType* lhs_data = static_cast(lhs->data); + const IdType* rhs_data = static_cast(rhs->data); + IdType* ret_data = static_cast(ret->data); + cudaStream_t stream = runtime::getCurrentCUDAStream(); + int nt = cuda::FindNumThreads(len); + int nb = (len + nt - 1) / nt; + CUDA_KERNEL_CALL( + (_BinaryElewiseKernel), nb, nt, 0, stream, lhs_data, rhs_data, + ret_data, len); + return ret; +} + +template IdArray BinaryElewise( + IdArray lhs, IdArray rhs); +template IdArray BinaryElewise( + IdArray lhs, IdArray rhs); +template IdArray BinaryElewise( + IdArray lhs, IdArray rhs); +template IdArray BinaryElewise( + IdArray lhs, IdArray rhs); +template IdArray BinaryElewise( + IdArray lhs, IdArray rhs); +template IdArray BinaryElewise( + IdArray lhs, IdArray rhs); +template IdArray BinaryElewise( + IdArray lhs, IdArray rhs); +template IdArray BinaryElewise( + IdArray lhs, IdArray rhs); +template IdArray BinaryElewise( + IdArray lhs, IdArray rhs); +template IdArray BinaryElewise( + IdArray lhs, IdArray rhs); +template IdArray BinaryElewise( + IdArray lhs, IdArray rhs); +template IdArray BinaryElewise( + IdArray lhs, IdArray rhs); +template IdArray BinaryElewise( + IdArray lhs, IdArray rhs); +template IdArray BinaryElewise( + IdArray lhs, IdArray rhs); +template IdArray BinaryElewise( + IdArray lhs, IdArray rhs); +template IdArray BinaryElewise( + IdArray lhs, IdArray rhs); +template IdArray BinaryElewise( + IdArray lhs, IdArray rhs); +template IdArray BinaryElewise( + IdArray lhs, IdArray rhs); +template IdArray BinaryElewise( + IdArray lhs, IdArray rhs); +template IdArray BinaryElewise( + IdArray lhs, IdArray rhs); +template IdArray BinaryElewise( + IdArray lhs, IdArray rhs); +template IdArray BinaryElewise( + IdArray lhs, IdArray rhs); + +template +__global__ void _BinaryElewiseKernel( + const IdType* lhs, IdType rhs, IdType* out, int64_t length) { + int tx = blockIdx.x * blockDim.x + threadIdx.x; + int stride_x = gridDim.x * blockDim.x; + while (tx < length) { + out[tx] = Op::Call(lhs[tx], rhs); + tx += stride_x; + } +} + +template +IdArray BinaryElewise(IdArray lhs, IdType rhs) { + const int64_t len = lhs->shape[0]; + IdArray ret = NewIdArray(lhs->shape[0], lhs->ctx, lhs->dtype.bits); + const IdType* lhs_data = static_cast(lhs->data); + IdType* ret_data = static_cast(ret->data); + cudaStream_t stream = runtime::getCurrentCUDAStream(); + int nt = cuda::FindNumThreads(len); + int nb = (len + nt - 1) / nt; + CUDA_KERNEL_CALL( + (_BinaryElewiseKernel), nb, nt, 0, stream, lhs_data, rhs, + ret_data, len); + return ret; +} + +template IdArray BinaryElewise( + IdArray lhs, int32_t rhs); +template IdArray BinaryElewise( + IdArray lhs, int32_t rhs); +template IdArray BinaryElewise( + IdArray lhs, int32_t rhs); +template IdArray BinaryElewise( + IdArray lhs, int32_t rhs); +template IdArray BinaryElewise( + IdArray lhs, int32_t rhs); +template IdArray BinaryElewise( + IdArray lhs, int32_t rhs); +template IdArray BinaryElewise( + IdArray lhs, int32_t rhs); +template IdArray BinaryElewise( + IdArray lhs, int32_t rhs); +template IdArray BinaryElewise( + IdArray lhs, int32_t rhs); +template IdArray BinaryElewise( + IdArray lhs, int32_t rhs); +template IdArray BinaryElewise( + IdArray lhs, int32_t rhs); +template IdArray BinaryElewise( + IdArray lhs, int64_t rhs); +template IdArray BinaryElewise( + IdArray lhs, int64_t rhs); +template IdArray BinaryElewise( + 
IdArray lhs, int64_t rhs); +template IdArray BinaryElewise( + IdArray lhs, int64_t rhs); +template IdArray BinaryElewise( + IdArray lhs, int64_t rhs); +template IdArray BinaryElewise( + IdArray lhs, int64_t rhs); +template IdArray BinaryElewise( + IdArray lhs, int64_t rhs); +template IdArray BinaryElewise( + IdArray lhs, int64_t rhs); +template IdArray BinaryElewise( + IdArray lhs, int64_t rhs); +template IdArray BinaryElewise( + IdArray lhs, int64_t rhs); +template IdArray BinaryElewise( + IdArray lhs, int64_t rhs); + +template +__global__ void _BinaryElewiseKernel( + IdType lhs, const IdType* rhs, IdType* out, int64_t length) { + int tx = blockIdx.x * blockDim.x + threadIdx.x; + int stride_x = gridDim.x * blockDim.x; + while (tx < length) { + out[tx] = Op::Call(lhs, rhs[tx]); + tx += stride_x; + } +} + +template +IdArray BinaryElewise(IdType lhs, IdArray rhs) { + const int64_t len = rhs->shape[0]; + IdArray ret = NewIdArray(rhs->shape[0], rhs->ctx, rhs->dtype.bits); + const IdType* rhs_data = static_cast(rhs->data); + IdType* ret_data = static_cast(ret->data); + cudaStream_t stream = runtime::getCurrentCUDAStream(); + int nt = cuda::FindNumThreads(len); + int nb = (len + nt - 1) / nt; + CUDA_KERNEL_CALL( + (_BinaryElewiseKernel), nb, nt, 0, stream, lhs, rhs_data, + ret_data, len); + return ret; +} + +template IdArray BinaryElewise( + int32_t lhs, IdArray rhs); +template IdArray BinaryElewise( + int32_t lhs, IdArray rhs); +template IdArray BinaryElewise( + int32_t lhs, IdArray rhs); +template IdArray BinaryElewise( + int32_t lhs, IdArray rhs); +template IdArray BinaryElewise( + int32_t lhs, IdArray rhs); +template IdArray BinaryElewise( + int32_t lhs, IdArray rhs); +template IdArray BinaryElewise( + int32_t lhs, IdArray rhs); +template IdArray BinaryElewise( + int32_t lhs, IdArray rhs); +template IdArray BinaryElewise( + int32_t lhs, IdArray rhs); +template IdArray BinaryElewise( + int32_t lhs, IdArray rhs); +template IdArray BinaryElewise( + int32_t lhs, IdArray rhs); +template IdArray BinaryElewise( + int64_t lhs, IdArray rhs); +template IdArray BinaryElewise( + int64_t lhs, IdArray rhs); +template IdArray BinaryElewise( + int64_t lhs, IdArray rhs); +template IdArray BinaryElewise( + int64_t lhs, IdArray rhs); +template IdArray BinaryElewise( + int64_t lhs, IdArray rhs); +template IdArray BinaryElewise( + int64_t lhs, IdArray rhs); +template IdArray BinaryElewise( + int64_t lhs, IdArray rhs); +template IdArray BinaryElewise( + int64_t lhs, IdArray rhs); +template IdArray BinaryElewise( + int64_t lhs, IdArray rhs); +template IdArray BinaryElewise( + int64_t lhs, IdArray rhs); +template IdArray BinaryElewise( + int64_t lhs, IdArray rhs); + +template +__global__ void _UnaryElewiseKernel( + const IdType* lhs, IdType* out, int64_t length) { + int tx = blockIdx.x * blockDim.x + threadIdx.x; + int stride_x = gridDim.x * blockDim.x; + while (tx < length) { + out[tx] = Op::Call(lhs[tx]); + tx += stride_x; + } +} + +template +IdArray UnaryElewise(IdArray lhs) { + const int64_t len = lhs->shape[0]; + IdArray ret = NewIdArray(lhs->shape[0], lhs->ctx, lhs->dtype.bits); + const IdType* lhs_data = static_cast(lhs->data); + IdType* ret_data = static_cast(ret->data); + cudaStream_t stream = runtime::getCurrentCUDAStream(); + int nt = cuda::FindNumThreads(len); + int nb = (len + nt - 1) / nt; + CUDA_KERNEL_CALL( + (_UnaryElewiseKernel), nb, nt, 0, stream, lhs_data, ret_data, + len); + return ret; +} + +template IdArray UnaryElewise(IdArray lhs); +template IdArray UnaryElewise(IdArray lhs); + 
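+// Editorial sketch (not part of the original file): every helper above follows
+// the same recipe: allocate the output IdArray, pick a launch shape with
+// cuda::FindNumThreads(), and apply one functor from ../arith.h per element in
+// a grid-stride loop.  Assuming the usual <DGLDeviceType XPU, typename IdType,
+// typename Op> template parameters, a call would look roughly like:
+//
+//   IdArray c = BinaryElewise<kDGLCUDA, int64_t, arith::Add>(a, b);  // c[i] = a[i] + b[i]
+//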
+///////////////////////////// Full ///////////////////////////// + +template +__global__ void _FullKernel(DType* out, int64_t length, DType val) { + int tx = blockIdx.x * blockDim.x + threadIdx.x; + int stride_x = gridDim.x * blockDim.x; + while (tx < length) { + out[tx] = val; + tx += stride_x; + } +} + +template +NDArray Full(DType val, int64_t length, DGLContext ctx) { + NDArray ret = NDArray::Empty({length}, DGLDataTypeTraits::dtype, ctx); + DType* ret_data = static_cast(ret->data); + cudaStream_t stream = runtime::getCurrentCUDAStream(); + int nt = cuda::FindNumThreads(length); + int nb = (length + nt - 1) / nt; + CUDA_KERNEL_CALL( + (_FullKernel), nb, nt, 0, stream, ret_data, length, val); + return ret; +} + +template IdArray Full( + int32_t val, int64_t length, DGLContext ctx); +template IdArray Full( + int64_t val, int64_t length, DGLContext ctx); +template IdArray Full( + __half val, int64_t length, DGLContext ctx); +#if BF16_ENABLED +template IdArray Full( + __nv_bfloat16 val, int64_t length, DGLContext ctx); +#endif // BF16_ENABLED +template IdArray Full( + float val, int64_t length, DGLContext ctx); +template IdArray Full( + double val, int64_t length, DGLContext ctx); + +///////////////////////////// Range ///////////////////////////// + +template +__global__ void _RangeKernel(IdType* out, IdType low, IdType length) { + int tx = blockIdx.x * blockDim.x + threadIdx.x; + int stride_x = gridDim.x * blockDim.x; + while (tx < length) { + out[tx] = low + tx; + tx += stride_x; + } +} + +template +IdArray Range(IdType low, IdType high, DGLContext ctx) { + CHECK(high >= low) << "high must be bigger than low"; + const IdType length = high - low; + IdArray ret = NewIdArray(length, ctx, sizeof(IdType) * 8); + if (length == 0) return ret; + IdType* ret_data = static_cast(ret->data); + cudaStream_t stream = runtime::getCurrentCUDAStream(); + int nt = cuda::FindNumThreads(length); + int nb = (length + nt - 1) / nt; + CUDA_KERNEL_CALL( + (_RangeKernel), nb, nt, 0, stream, ret_data, low, length); + return ret; +} + +template IdArray Range(int32_t, int32_t, DGLContext); +template IdArray Range(int64_t, int64_t, DGLContext); + +///////////////////////////// Relabel_ ////////////////////////////// + +template +__global__ void _RelabelKernel( + IdType* out, int64_t length, DeviceOrderedHashTable table) { + int tx = blockIdx.x * blockDim.x + threadIdx.x; + int stride_x = gridDim.x * blockDim.x; + + while (tx < length) { + out[tx] = table.Search(out[tx])->local; + tx += stride_x; + } +} + +template +IdArray Relabel_(const std::vector& arrays) { + IdArray all_nodes = Concat(arrays); + const int64_t total_length = all_nodes->shape[0]; + + if (total_length == 0) { + return all_nodes; + } + + const auto& ctx = arrays[0]->ctx; + auto device = runtime::DeviceAPI::Get(ctx); + cudaStream_t stream = runtime::getCurrentCUDAStream(); + + // build node maps and get the induced nodes + OrderedHashTable node_map(total_length, ctx, stream); + int64_t num_induced = 0; + int64_t* num_induced_device = + static_cast(device->AllocWorkspace(ctx, sizeof(int64_t))); + IdArray induced_nodes = NewIdArray(total_length, ctx, sizeof(IdType) * 8); + + CUDA_CALL(cudaMemsetAsync( + num_induced_device, 0, sizeof(*num_induced_device), stream)); + + node_map.FillWithDuplicates( + all_nodes.Ptr(), all_nodes->shape[0], induced_nodes.Ptr(), + num_induced_device, stream); + // copy using the internal current stream + device->CopyDataFromTo( + num_induced_device, 0, &num_induced, 0, sizeof(num_induced), ctx, + DGLContext{kDGLCPU, 0}, 
DGLDataType{kDGLInt, 64, 1}); + + device->StreamSync(ctx, stream); + device->FreeWorkspace(ctx, num_induced_device); + + // resize the induced nodes + induced_nodes->shape[0] = num_induced; + + // relabel + const int nt = 128; + for (IdArray arr : arrays) { + const int64_t length = arr->shape[0]; + int nb = (length + nt - 1) / nt; + CUDA_KERNEL_CALL( + (_RelabelKernel), nb, nt, 0, stream, arr.Ptr(), length, + node_map.DeviceHandle()); + } + + return induced_nodes; +} + +template IdArray Relabel_( + const std::vector& arrays); +template IdArray Relabel_( + const std::vector& arrays); + +///////////////////////////// AsNumBits ///////////////////////////// + +template +__global__ void _CastKernel(const InType* in, OutType* out, size_t length) { + int tx = blockIdx.x * blockDim.x + threadIdx.x; + int stride_x = gridDim.x * blockDim.x; + while (tx < length) { + out[tx] = in[tx]; + tx += stride_x; + } +} + +template +IdArray AsNumBits(IdArray arr, uint8_t bits) { + const std::vector shape(arr->shape, arr->shape + arr->ndim); + IdArray ret = IdArray::Empty(shape, DGLDataType{kDGLInt, bits, 1}, arr->ctx); + const int64_t length = ret.NumElements(); + cudaStream_t stream = runtime::getCurrentCUDAStream(); + int nt = cuda::FindNumThreads(length); + int nb = (length + nt - 1) / nt; + if (bits == 32) { + CUDA_KERNEL_CALL( + (_CastKernel), nb, nt, 0, stream, + static_cast(arr->data), static_cast(ret->data), + length); + } else { + CUDA_KERNEL_CALL( + (_CastKernel), nb, nt, 0, stream, + static_cast(arr->data), static_cast(ret->data), + length); + } + return ret; +} + +template IdArray AsNumBits(IdArray arr, uint8_t bits); +template IdArray AsNumBits(IdArray arr, uint8_t bits); + +} // namespace impl +} // namespace aten +} // namespace dgl diff --git a/src/array/cuda/array_scatter.cu b/src/array/cuda/array_scatter.cu index 41acbba92581..83480f7b6be8 100644 --- a/src/array/cuda/array_scatter.cu +++ b/src/array/cuda/array_scatter.cu @@ -1,3 +1,4 @@ +#include "hip/hip_runtime.h" /** * Copyright (c) 2019 by Contributors * @file array/cuda/array_scatter.cu @@ -31,7 +32,7 @@ void Scatter_(IdArray index, NDArray value, NDArray out) { const DType* val = value.Ptr(); DType* outd = out.Ptr(); - cudaStream_t stream = runtime::getCurrentCUDAStream(); + hipStream_t stream = runtime::getCurrentCUDAStream(); const int nt = cuda::FindNumThreads(len); const int nb = (len + nt - 1) / nt; CUDA_KERNEL_CALL(_ScatterKernel, nb, nt, 0, stream, idx, val, len, outd); @@ -41,7 +42,7 @@ template void Scatter_(IdArray, NDArray, NDArray); template void Scatter_(IdArray, NDArray, NDArray); template void Scatter_(IdArray, NDArray, NDArray); #if BF16_ENABLED -template void Scatter_( +template void Scatter_( IdArray, NDArray, NDArray); #endif // BF16_ENABLED template void Scatter_(IdArray, NDArray, NDArray); @@ -50,7 +51,7 @@ template void Scatter_(IdArray, NDArray, NDArray); template void Scatter_(IdArray, NDArray, NDArray); template void Scatter_(IdArray, NDArray, NDArray); #if BF16_ENABLED -template void Scatter_( +template void Scatter_( IdArray, NDArray, NDArray); #endif // BF16_ENABLED template void Scatter_(IdArray, NDArray, NDArray); diff --git a/src/array/cuda/array_scatter.cu.prehip b/src/array/cuda/array_scatter.cu.prehip new file mode 100644 index 000000000000..41acbba92581 --- /dev/null +++ b/src/array/cuda/array_scatter.cu.prehip @@ -0,0 +1,61 @@ +/** + * Copyright (c) 2019 by Contributors + * @file array/cuda/array_scatter.cu + * @brief Array scatter GPU implementation + */ +#include + +#include 
"../../runtime/cuda/cuda_common.h" +#include "./utils.h" + +namespace dgl { +using runtime::NDArray; +namespace aten { +namespace impl { + +template +__global__ void _ScatterKernel( + const IdType* index, const DType* value, int64_t length, DType* out) { + int tx = blockIdx.x * blockDim.x + threadIdx.x; + int stride_x = gridDim.x * blockDim.x; + while (tx < length) { + out[index[tx]] = value[tx]; + tx += stride_x; + } +} + +template +void Scatter_(IdArray index, NDArray value, NDArray out) { + const int64_t len = index->shape[0]; + const IdType* idx = index.Ptr(); + const DType* val = value.Ptr(); + DType* outd = out.Ptr(); + + cudaStream_t stream = runtime::getCurrentCUDAStream(); + const int nt = cuda::FindNumThreads(len); + const int nb = (len + nt - 1) / nt; + CUDA_KERNEL_CALL(_ScatterKernel, nb, nt, 0, stream, idx, val, len, outd); +} + +template void Scatter_(IdArray, NDArray, NDArray); +template void Scatter_(IdArray, NDArray, NDArray); +template void Scatter_(IdArray, NDArray, NDArray); +#if BF16_ENABLED +template void Scatter_( + IdArray, NDArray, NDArray); +#endif // BF16_ENABLED +template void Scatter_(IdArray, NDArray, NDArray); +template void Scatter_(IdArray, NDArray, NDArray); +template void Scatter_(IdArray, NDArray, NDArray); +template void Scatter_(IdArray, NDArray, NDArray); +template void Scatter_(IdArray, NDArray, NDArray); +#if BF16_ENABLED +template void Scatter_( + IdArray, NDArray, NDArray); +#endif // BF16_ENABLED +template void Scatter_(IdArray, NDArray, NDArray); +template void Scatter_(IdArray, NDArray, NDArray); + +}; // namespace impl +}; // namespace aten +}; // namespace dgl diff --git a/src/array/cuda/array_sort.cu b/src/array/cuda/array_sort.cu index 390483e4a85b..5f697abcf76a 100644 --- a/src/array/cuda/array_sort.cu +++ b/src/array/cuda/array_sort.cu @@ -5,7 +5,7 @@ */ #include -#include +#include #include "../../runtime/cuda/cuda_common.h" #include "./utils.h" @@ -29,20 +29,20 @@ std::pair Sort(IdArray array, int num_bits) { IdType* keys_out = sorted_array.Ptr(); int64_t* values_out = sorted_idx.Ptr(); - cudaStream_t stream = runtime::getCurrentCUDAStream(); + hipStream_t stream = runtime::getCurrentCUDAStream(); if (num_bits == 0) { num_bits = sizeof(IdType) * 8; } // Allocate workspace size_t workspace_size = 0; - CUDA_CALL(cub::DeviceRadixSort::SortPairs( + CUDA_CALL(hipcub::DeviceRadixSort::SortPairs( nullptr, workspace_size, keys_in, keys_out, values_in, values_out, nitems, 0, num_bits, stream)); void* workspace = device->AllocWorkspace(ctx, workspace_size); // Compute - CUDA_CALL(cub::DeviceRadixSort::SortPairs( + CUDA_CALL(hipcub::DeviceRadixSort::SortPairs( workspace, workspace_size, keys_in, keys_out, values_in, values_out, nitems, 0, num_bits, stream)); diff --git a/src/array/cuda/array_sort.cu.prehip b/src/array/cuda/array_sort.cu.prehip new file mode 100644 index 000000000000..390483e4a85b --- /dev/null +++ b/src/array/cuda/array_sort.cu.prehip @@ -0,0 +1,61 @@ +/** + * Copyright (c) 2020 by Contributors + * @file array/cpu/array_sort.cu + * @brief Array sort GPU implementation + */ +#include + +#include + +#include "../../runtime/cuda/cuda_common.h" +#include "./utils.h" + +namespace dgl { +using runtime::NDArray; +namespace aten { +namespace impl { + +template +std::pair Sort(IdArray array, int num_bits) { + const auto& ctx = array->ctx; + auto device = runtime::DeviceAPI::Get(ctx); + const int64_t nitems = array->shape[0]; + IdArray orig_idx = Range(0, nitems, 64, ctx); + IdArray sorted_array = NewIdArray(nitems, ctx, array->dtype.bits); 
+ IdArray sorted_idx = NewIdArray(nitems, ctx, 64); + + const IdType* keys_in = array.Ptr(); + const int64_t* values_in = orig_idx.Ptr(); + IdType* keys_out = sorted_array.Ptr(); + int64_t* values_out = sorted_idx.Ptr(); + + cudaStream_t stream = runtime::getCurrentCUDAStream(); + if (num_bits == 0) { + num_bits = sizeof(IdType) * 8; + } + + // Allocate workspace + size_t workspace_size = 0; + CUDA_CALL(cub::DeviceRadixSort::SortPairs( + nullptr, workspace_size, keys_in, keys_out, values_in, values_out, nitems, + 0, num_bits, stream)); + void* workspace = device->AllocWorkspace(ctx, workspace_size); + + // Compute + CUDA_CALL(cub::DeviceRadixSort::SortPairs( + workspace, workspace_size, keys_in, keys_out, values_in, values_out, + nitems, 0, num_bits, stream)); + + device->FreeWorkspace(ctx, workspace); + + return std::make_pair(sorted_array, sorted_idx); +} + +template std::pair Sort( + IdArray, int num_bits); +template std::pair Sort( + IdArray, int num_bits); + +} // namespace impl +} // namespace aten +} // namespace dgl diff --git a/src/array/cuda/atomic.cuh b/src/array/cuda/atomic.cuh index 0b88594a1dbb..85391945ae9f 100644 --- a/src/array/cuda/atomic.cuh +++ b/src/array/cuda/atomic.cuh @@ -6,7 +6,7 @@ #ifndef DGL_ARRAY_CUDA_ATOMIC_CUH_ #define DGL_ARRAY_CUDA_ATOMIC_CUH_ -#include +#include #include #include @@ -16,7 +16,7 @@ #include "fp16.cuh" #if __CUDA_ARCH__ >= 600 -#include +#include #endif namespace dgl { @@ -67,28 +67,28 @@ struct Cast { #if BF16_ENABLED template <> -struct Cast<__nv_bfloat16> { - typedef Code::Type Type; - static __device__ __forceinline__ Type Encode(__nv_bfloat16 val) { +struct Cast<__hip_bfloat16> { + typedef Code::Type Type; + static __device__ __forceinline__ Type Encode(__hip_bfloat16 val) { #if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 800 return __bfloat16_as_ushort(val); #else printf( "Atomic operations are not supported for bfloat16 (BF16) " "on GPUs with compute capability less than 8.0.\n"); - __trap(); + abort(); return static_cast(0); #endif } - static __device__ __forceinline__ __nv_bfloat16 Decode(Type code) { + static __device__ __forceinline__ __hip_bfloat16 Decode(Type code) { #if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 800 return __ushort_as_bfloat16(code); #else printf( "Atomic operations are not supported for bfloat16 (BF16) " "on GPUs with compute capability less than 8.0.\n"); - __trap(); - return static_cast<__nv_bfloat16>(0.0f); + abort(); + return static_cast<__hip_bfloat16>(0.0f); #endif } }; @@ -130,7 +130,7 @@ static __device__ __forceinline__ unsigned short int atomicCASshort( // NOLINT printf( "Atomic operations are not supported for half precision (FP16) " "on this GPU.\n"); - __trap(); + abort(); return val; #endif // (defined(__CUDA_ARCH__) && (__CUDA_ARCH__) >= 700) } @@ -172,7 +172,7 @@ static __device__ __forceinline__ unsigned short int atomicCASshort( // NOLINT DEFINE_ATOMIC(Max) DEFINE_ATOMIC_16BIT(Max, half) #if BF16_ENABLED -DEFINE_ATOMIC_16BIT(Max, __nv_bfloat16) +DEFINE_ATOMIC_16BIT(Max, __hip_bfloat16) #endif // BF16_ENABLED #undef OP @@ -180,7 +180,7 @@ DEFINE_ATOMIC_16BIT(Max, __nv_bfloat16) DEFINE_ATOMIC(Min) DEFINE_ATOMIC_16BIT(Min, half) #if BF16_ENABLED -DEFINE_ATOMIC_16BIT(Min, __nv_bfloat16) +DEFINE_ATOMIC_16BIT(Min, __hip_bfloat16) #endif // BF16_ENABLED #undef OP @@ -304,7 +304,7 @@ __device__ __forceinline__ half AtomicAdd(half* addr, half val) { printf( "Atomic operations are not supported for half precision (FP16) " "on this GPU.\n"); - __trap(); + abort(); return val; #endif // __CUDA_ARCH__ >= 
700 } @@ -312,8 +312,8 @@ __device__ __forceinline__ half AtomicAdd(half* addr, half val) { #if BF16_ENABLED template <> -__device__ __forceinline__ __nv_bfloat16 -AtomicAdd<__nv_bfloat16>(__nv_bfloat16* addr, __nv_bfloat16 val) { +__device__ __forceinline__ __hip_bfloat16 +AtomicAdd<__hip_bfloat16>(__hip_bfloat16* addr, __hip_bfloat16 val) { // make sure we have bfloat16 support #if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 800 return atomicAdd(addr, val); @@ -323,7 +323,7 @@ AtomicAdd<__nv_bfloat16>(__nv_bfloat16* addr, __nv_bfloat16 val) { printf( "Atomic operations are not supported for bfloat16 (BF16) " "on GPUs with compute capability less than 8.0.\n"); - __trap(); + abort(); return val; #endif // defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 800 } diff --git a/src/array/cuda/atomic.cuh.prehip b/src/array/cuda/atomic.cuh.prehip new file mode 100644 index 000000000000..0b88594a1dbb --- /dev/null +++ b/src/array/cuda/atomic.cuh.prehip @@ -0,0 +1,336 @@ +/** + * Copyright (c) 2019 by Contributors + * @file array/cuda/atomic.cuh + * @brief Atomic functions + */ +#ifndef DGL_ARRAY_CUDA_ATOMIC_CUH_ +#define DGL_ARRAY_CUDA_ATOMIC_CUH_ + +#include + +#include +#include +#include + +#include "bf16.cuh" +#include "fp16.cuh" + +#if __CUDA_ARCH__ >= 600 +#include +#endif + +namespace dgl { +namespace aten { +namespace cuda { + +// Type trait for selecting code type +template +struct Code {}; + +template <> +struct Code<2> { + typedef unsigned short int Type; // NOLINT +}; + +template <> +struct Code<4> { + typedef unsigned int Type; // NOLINT +}; + +template <> +struct Code<8> { + typedef unsigned long long int Type; // NOLINT +}; + +// Helper class for converting to/from atomicCAS compatible types. +template +struct Cast { + typedef typename Code::Type Type; + static __device__ __forceinline__ Type Encode(T val) { + return static_cast(val); + } + static __device__ __forceinline__ T Decode(Type code) { + return static_cast(code); + } +}; + +template <> +struct Cast { + typedef Code::Type Type; + static __device__ __forceinline__ Type Encode(half val) { + return __half_as_ushort(val); + } + static __device__ __forceinline__ half Decode(Type code) { + return __ushort_as_half(code); + } +}; + +#if BF16_ENABLED +template <> +struct Cast<__nv_bfloat16> { + typedef Code::Type Type; + static __device__ __forceinline__ Type Encode(__nv_bfloat16 val) { +#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 800 + return __bfloat16_as_ushort(val); +#else + printf( + "Atomic operations are not supported for bfloat16 (BF16) " + "on GPUs with compute capability less than 8.0.\n"); + __trap(); + return static_cast(0); +#endif + } + static __device__ __forceinline__ __nv_bfloat16 Decode(Type code) { +#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 800 + return __ushort_as_bfloat16(code); +#else + printf( + "Atomic operations are not supported for bfloat16 (BF16) " + "on GPUs with compute capability less than 8.0.\n"); + __trap(); + return static_cast<__nv_bfloat16>(0.0f); +#endif + } +}; +#endif // BF16_ENABLED + +template <> +struct Cast { + typedef Code::Type Type; + static __device__ __forceinline__ Type Encode(float val) { + return __float_as_uint(val); + } + static __device__ __forceinline__ float Decode(Type code) { + return __uint_as_float(code); + } +}; + +template <> +struct Cast { + typedef Code::Type Type; + static __device__ __forceinline__ Type Encode(double val) { + return __double_as_longlong(val); + } + static __device__ __forceinline__ double Decode(Type code) { + return __longlong_as_double(code); + } 
+}; + +static __device__ __forceinline__ unsigned short int atomicCASshort( // NOLINT + unsigned short int* address, // NOLINT + unsigned short int compare, // NOLINT + unsigned short int val) { // NOLINT + static_assert(CUDART_VERSION >= 10000, "Requires at least CUDA 10"); +#if (defined(__CUDA_ARCH__) && (__CUDA_ARCH__) >= 700) + return atomicCAS(address, compare, val); +#else + (void)address; + (void)compare; + (void)val; + printf( + "Atomic operations are not supported for half precision (FP16) " + "on this GPU.\n"); + __trap(); + return val; +#endif // (defined(__CUDA_ARCH__) && (__CUDA_ARCH__) >= 700) +} + +#define DEFINE_ATOMIC(NAME) \ + template \ + __device__ __forceinline__ T Atomic##NAME(T* addr, T val) { \ + typedef typename Cast::Type CT; \ + CT* addr_as_ui = reinterpret_cast(addr); \ + CT old = *addr_as_ui; \ + CT assumed = old; \ + do { \ + assumed = old; \ + old = atomicCAS( \ + addr_as_ui, assumed, \ + Cast::Encode(OP(val, Cast::Decode(old)))); \ + } while (assumed != old); \ + return Cast::Decode(old); \ + } + +#define DEFINE_ATOMIC_16BIT(NAME, dtype) \ + template <> \ + __device__ __forceinline__ dtype Atomic##NAME( \ + dtype * addr, dtype val) { \ + typedef uint16_t CT; \ + CT* addr_as_ui = reinterpret_cast(addr); \ + CT old = *addr_as_ui; \ + CT assumed = old; \ + do { \ + assumed = old; \ + old = atomicCASshort( \ + addr_as_ui, assumed, \ + Cast::Encode(OP(val, Cast::Decode(old)))); \ + } while (assumed != old); \ + return Cast::Decode(old); \ + } + +#define OP(a, b) max(a, b) +DEFINE_ATOMIC(Max) +DEFINE_ATOMIC_16BIT(Max, half) +#if BF16_ENABLED +DEFINE_ATOMIC_16BIT(Max, __nv_bfloat16) +#endif // BF16_ENABLED +#undef OP + +#define OP(a, b) min(a, b) +DEFINE_ATOMIC(Min) +DEFINE_ATOMIC_16BIT(Min, half) +#if BF16_ENABLED +DEFINE_ATOMIC_16BIT(Min, __nv_bfloat16) +#endif // BF16_ENABLED +#undef OP + +#define OP(a, b) a + b +DEFINE_ATOMIC(Add) +#undef OP + +/** + * @brief Performs an atomic compare-and-swap on 64 bit integers. That is, + * it the word `old` at the memory location `address`, computes + * `(old == compare ? val : old)` , and stores the result back to memory at + * the same address. + * + * @param address The address to perform the atomic operation on. + * @param compare The value to compare to. + * @param val The new value to conditionally store. + * + * @return The old value at the address. + */ +inline __device__ int64_t +AtomicCAS(int64_t* const address, const int64_t compare, const int64_t val) { + // match the type of "::atomicCAS", so ignore lint warning + using Type = unsigned long long int; // NOLINT + + static_assert(sizeof(Type) == sizeof(*address), "Type width must match"); + + return atomicCAS( + reinterpret_cast(address), static_cast(compare), + static_cast(val)); +} + +/** + * @brief Performs an atomic compare-and-swap on 32 bit integers. That is, + * it the word `old` at the memory location `address`, computes + * `(old == compare ? val : old)` , and stores the result back to memory at + * the same address. + * + * @param address The address to perform the atomic operation on. + * @param compare The value to compare to. + * @param val The new value to conditionally store. + * + * @return The old value at the address. 
+ */ +inline __device__ int32_t +AtomicCAS(int32_t* const address, const int32_t compare, const int32_t val) { + // match the type of "::atomicCAS", so ignore lint warning + using Type = int; // NOLINT + + static_assert(sizeof(Type) == sizeof(*address), "Type width must match"); + + return atomicCAS( + reinterpret_cast(address), static_cast(compare), + static_cast(val)); +} + +inline __device__ int64_t AtomicMax(int64_t* const address, const int64_t val) { + // match the type of "::atomicCAS", so ignore lint warning + using Type = unsigned long long int; // NOLINT + + static_assert(sizeof(Type) == sizeof(*address), "Type width must match"); + + return atomicMax(reinterpret_cast(address), static_cast(val)); +} + +inline __device__ int32_t AtomicMax(int32_t* const address, const int32_t val) { + // match the type of "::atomicCAS", so ignore lint warning + using Type = int; // NOLINT + + static_assert(sizeof(Type) == sizeof(*address), "Type width must match"); + + return atomicMax(reinterpret_cast(address), static_cast(val)); +} + +template <> +__device__ __forceinline__ float AtomicAdd(float* addr, float val) { +#if __CUDA_ARCH__ >= 200 + return atomicAdd(addr, val); +#else + typedef float T; + typedef typename Cast::Type CT; + CT* addr_as_ui = reinterpret_cast(addr); + CT old = *addr_as_ui; + CT assumed = old; + do { + assumed = old; + old = atomicCAS( + addr_as_ui, assumed, Cast::Encode(Cast::Decode(old) + val)); + } while (assumed != old); + return Cast::Decode(old); +#endif // __CUDA_ARCH__ +} + +template <> +__device__ __forceinline__ double AtomicAdd(double* addr, double val) { +#if __CUDA_ARCH__ >= 600 + return atomicAdd(addr, val); +#else + typedef double T; + typedef typename Cast::Type CT; + CT* addr_as_ui = reinterpret_cast(addr); + CT old = *addr_as_ui; + CT assumed = old; + do { + assumed = old; + old = atomicCAS( + addr_as_ui, assumed, Cast::Encode(Cast::Decode(old) + val)); + } while (assumed != old); + return Cast::Decode(old); +#endif +} + +#if defined(CUDART_VERSION) && CUDART_VERSION >= 10000 +template <> +__device__ __forceinline__ half AtomicAdd(half* addr, half val) { +// make sure we have half support +#if __CUDA_ARCH__ >= 700 + return atomicAdd(addr, val); +#else + (void)addr; + (void)val; + printf( + "Atomic operations are not supported for half precision (FP16) " + "on this GPU.\n"); + __trap(); + return val; +#endif // __CUDA_ARCH__ >= 700 +} +#endif // defined(CUDART_VERSION) && CUDART_VERSION >= 10000 + +#if BF16_ENABLED +template <> +__device__ __forceinline__ __nv_bfloat16 +AtomicAdd<__nv_bfloat16>(__nv_bfloat16* addr, __nv_bfloat16 val) { +// make sure we have bfloat16 support +#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 800 + return atomicAdd(addr, val); +#else + (void)addr; + (void)val; + printf( + "Atomic operations are not supported for bfloat16 (BF16) " + "on GPUs with compute capability less than 8.0.\n"); + __trap(); + return val; +#endif // defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 800 +} +#endif // BF16_ENABLED + +} // namespace cuda +} // namespace aten +} // namespace dgl + +#endif // DGL_ARRAY_CUDA_ATOMIC_CUH_ diff --git a/src/array/cuda/bf16.cuh b/src/array/cuda/bf16.cuh index 82fd0f332297..425df014a87c 100644 --- a/src/array/cuda/bf16.cuh +++ b/src/array/cuda/bf16.cuh @@ -20,129 +20,129 @@ #define DGL_ARRAY_CUDA_BF16_CUH_ #if BF16_ENABLED -#include +#include #include -static __device__ __forceinline__ __nv_bfloat16 -max(__nv_bfloat16 a, __nv_bfloat16 b) { +static __device__ __forceinline__ __hip_bfloat16 +max(__hip_bfloat16 a, __hip_bfloat16 
b) { #if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 800 return __hmax(a, b); #else - return __nv_bfloat16(max(float(a), float(b))); // NOLINT + return __hip_bfloat16(max(float(a), float(b))); // NOLINT #endif } -static __device__ __forceinline__ __nv_bfloat16 -min(__nv_bfloat16 a, __nv_bfloat16 b) { +static __device__ __forceinline__ __hip_bfloat16 +min(__hip_bfloat16 a, __hip_bfloat16 b) { #if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 800 return __hmin(a, b); #else - return __nv_bfloat16(min(float(a), float(b))); // NOLINT + return __hip_bfloat16(min(float(a), float(b))); // NOLINT #endif } -#ifdef __CUDACC__ +#ifdef __HIPCC__ // Arithmetic BF16 operations for architecture >= 8.0 are already defined in // cuda_bf16.h #if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ < 800) // CUDA 12.2 adds "emulated" support for older architectures. #if defined(CUDART_VERSION) && (CUDART_VERSION < 12020) -__device__ __forceinline__ __nv_bfloat16 -operator+(const __nv_bfloat16& lh, const __nv_bfloat16& rh) { - return __nv_bfloat16(float(lh) + float(rh)); // NOLINT +__device__ __forceinline__ __hip_bfloat16 +operator+(const __hip_bfloat16& lh, const __hip_bfloat16& rh) { + return __hip_bfloat16(float(lh) + float(rh)); // NOLINT } -__device__ __forceinline__ __nv_bfloat16 -operator-(const __nv_bfloat16& lh, const __nv_bfloat16& rh) { - return __nv_bfloat16(float(lh) - float(rh)); // NOLINT +__device__ __forceinline__ __hip_bfloat16 +operator-(const __hip_bfloat16& lh, const __hip_bfloat16& rh) { + return __hip_bfloat16(float(lh) - float(rh)); // NOLINT } -__device__ __forceinline__ __nv_bfloat16 -operator*(const __nv_bfloat16& lh, const __nv_bfloat16& rh) { - return __nv_bfloat16(float(lh) * float(rh)); // NOLINT +__device__ __forceinline__ __hip_bfloat16 +operator*(const __hip_bfloat16& lh, const __hip_bfloat16& rh) { + return __hip_bfloat16(float(lh) * float(rh)); // NOLINT } -__device__ __forceinline__ __nv_bfloat16 -operator/(const __nv_bfloat16& lh, const __nv_bfloat16& rh) { - return __nv_bfloat16(float(lh) / float(rh)); // NOLINT +__device__ __forceinline__ __hip_bfloat16 +operator/(const __hip_bfloat16& lh, const __hip_bfloat16& rh) { + return __hip_bfloat16(float(lh) / float(rh)); // NOLINT } -__device__ __forceinline__ __nv_bfloat16& operator+=( - __nv_bfloat16& lh, const __nv_bfloat16& rh) { // NOLINT - lh = __nv_bfloat16(float(lh) + float(rh)); // NOLINT +__device__ __forceinline__ __hip_bfloat16& operator+=( + __hip_bfloat16& lh, const __hip_bfloat16& rh) { // NOLINT + lh = __hip_bfloat16(float(lh) + float(rh)); // NOLINT return lh; } -__device__ __forceinline__ __nv_bfloat16& operator-=( - __nv_bfloat16& lh, const __nv_bfloat16& rh) { // NOLINT - lh = __nv_bfloat16(float(lh) - float(rh)); // NOLINT +__device__ __forceinline__ __hip_bfloat16& operator-=( + __hip_bfloat16& lh, const __hip_bfloat16& rh) { // NOLINT + lh = __hip_bfloat16(float(lh) - float(rh)); // NOLINT return lh; } -__device__ __forceinline__ __nv_bfloat16& operator*=( - __nv_bfloat16& lh, const __nv_bfloat16& rh) { // NOLINT - lh = __nv_bfloat16(float(lh) * float(rh)); // NOLINT +__device__ __forceinline__ __hip_bfloat16& operator*=( + __hip_bfloat16& lh, const __hip_bfloat16& rh) { // NOLINT + lh = __hip_bfloat16(float(lh) * float(rh)); // NOLINT return lh; } -__device__ __forceinline__ __nv_bfloat16& operator/=( - __nv_bfloat16& lh, const __nv_bfloat16& rh) { // NOLINT - lh = __nv_bfloat16(float(lh) / float(rh)); // NOLINT +__device__ __forceinline__ __hip_bfloat16& operator/=( + __hip_bfloat16& lh, const __hip_bfloat16& rh) { // 
NOLINT + lh = __hip_bfloat16(float(lh) / float(rh)); // NOLINT return lh; } -__device__ __forceinline__ __nv_bfloat16& operator++( - __nv_bfloat16& h) { // NOLINT - h = __nv_bfloat16(float(h) + 1.0f); // NOLINT +__device__ __forceinline__ __hip_bfloat16& operator++( + __hip_bfloat16& h) { // NOLINT + h = __hip_bfloat16(float(h) + 1.0f); // NOLINT return h; } -__device__ __forceinline__ __nv_bfloat16& operator--( - __nv_bfloat16& h) { // NOLINT - h = __nv_bfloat16(float(h) - 1.0f); // NOLINT +__device__ __forceinline__ __hip_bfloat16& operator--( + __hip_bfloat16& h) { // NOLINT + h = __hip_bfloat16(float(h) - 1.0f); // NOLINT return h; } -__device__ __forceinline__ __nv_bfloat16 -operator++(__nv_bfloat16& h, int) { // NOLINT - __nv_bfloat16 ret = h; - h = __nv_bfloat16(float(h) + 1.0f); // NOLINT +__device__ __forceinline__ __hip_bfloat16 +operator++(__hip_bfloat16& h, int) { // NOLINT + __hip_bfloat16 ret = h; + h = __hip_bfloat16(float(h) + 1.0f); // NOLINT return ret; } -__device__ __forceinline__ __nv_bfloat16 -operator--(__nv_bfloat16& h, int) { // NOLINT - __nv_bfloat16 ret = h; - h = __nv_bfloat16(float(h) - 1.0f); // NOLINT +__device__ __forceinline__ __hip_bfloat16 +operator--(__hip_bfloat16& h, int) { // NOLINT + __hip_bfloat16 ret = h; + h = __hip_bfloat16(float(h) - 1.0f); // NOLINT return ret; } -__device__ __forceinline__ __nv_bfloat16 operator+(const __nv_bfloat16& h) { +__device__ __forceinline__ __hip_bfloat16 operator+(const __hip_bfloat16& h) { return h; } -__device__ __forceinline__ __nv_bfloat16 operator-(const __nv_bfloat16& h) { - return __nv_bfloat16(-float(h)); // NOLINT +__device__ __forceinline__ __hip_bfloat16 operator-(const __hip_bfloat16& h) { + return __hip_bfloat16(-float(h)); // NOLINT } __device__ __forceinline__ bool operator==( - const __nv_bfloat16& lh, const __nv_bfloat16& rh) { + const __hip_bfloat16& lh, const __hip_bfloat16& rh) { return float(lh) == float(rh); // NOLINT } __device__ __forceinline__ bool operator!=( - const __nv_bfloat16& lh, const __nv_bfloat16& rh) { + const __hip_bfloat16& lh, const __hip_bfloat16& rh) { return float(lh) != float(rh); // NOLINT } __device__ __forceinline__ bool operator>( - const __nv_bfloat16& lh, const __nv_bfloat16& rh) { + const __hip_bfloat16& lh, const __hip_bfloat16& rh) { return float(lh) > float(rh); // NOLINT } __device__ __forceinline__ bool operator<( - const __nv_bfloat16& lh, const __nv_bfloat16& rh) { + const __hip_bfloat16& lh, const __hip_bfloat16& rh) { return float(lh) < float(rh); // NOLINT } __device__ __forceinline__ bool operator>=( - const __nv_bfloat16& lh, const __nv_bfloat16& rh) { + const __hip_bfloat16& lh, const __hip_bfloat16& rh) { return float(lh) >= float(rh); // NOLINT } __device__ __forceinline__ bool operator<=( - const __nv_bfloat16& lh, const __nv_bfloat16& rh) { + const __hip_bfloat16& lh, const __hip_bfloat16& rh) { return float(lh) <= float(rh); // NOLINT } #endif // defined(CUDART_VERSION) && (CUDART_VERSION < 12020) #endif // defined(__CUDA_ARCH__) && (__CUDA_ARCH__ < 800) -#endif // __CUDACC__ +#endif // __HIPCC__ #endif // BF16_ENABLED diff --git a/src/array/cuda/bf16.cuh.prehip b/src/array/cuda/bf16.cuh.prehip new file mode 100644 index 000000000000..82fd0f332297 --- /dev/null +++ b/src/array/cuda/bf16.cuh.prehip @@ -0,0 +1,149 @@ +/** + * Copyright (c) 2022 by Contributors + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + * + * @file array/cuda/bf16.cuh + * @brief bfloat16 related functions. + */ +#ifndef DGL_ARRAY_CUDA_BF16_CUH_ +#define DGL_ARRAY_CUDA_BF16_CUH_ + +#if BF16_ENABLED +#include + +#include + +static __device__ __forceinline__ __nv_bfloat16 +max(__nv_bfloat16 a, __nv_bfloat16 b) { +#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 800 + return __hmax(a, b); +#else + return __nv_bfloat16(max(float(a), float(b))); // NOLINT +#endif +} + +static __device__ __forceinline__ __nv_bfloat16 +min(__nv_bfloat16 a, __nv_bfloat16 b) { +#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 800 + return __hmin(a, b); +#else + return __nv_bfloat16(min(float(a), float(b))); // NOLINT +#endif +} + +#ifdef __CUDACC__ +// Arithmetic BF16 operations for architecture >= 8.0 are already defined in +// cuda_bf16.h +#if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ < 800) +// CUDA 12.2 adds "emulated" support for older architectures. +#if defined(CUDART_VERSION) && (CUDART_VERSION < 12020) +__device__ __forceinline__ __nv_bfloat16 +operator+(const __nv_bfloat16& lh, const __nv_bfloat16& rh) { + return __nv_bfloat16(float(lh) + float(rh)); // NOLINT +} +__device__ __forceinline__ __nv_bfloat16 +operator-(const __nv_bfloat16& lh, const __nv_bfloat16& rh) { + return __nv_bfloat16(float(lh) - float(rh)); // NOLINT +} +__device__ __forceinline__ __nv_bfloat16 +operator*(const __nv_bfloat16& lh, const __nv_bfloat16& rh) { + return __nv_bfloat16(float(lh) * float(rh)); // NOLINT +} +__device__ __forceinline__ __nv_bfloat16 +operator/(const __nv_bfloat16& lh, const __nv_bfloat16& rh) { + return __nv_bfloat16(float(lh) / float(rh)); // NOLINT +} + +__device__ __forceinline__ __nv_bfloat16& operator+=( + __nv_bfloat16& lh, const __nv_bfloat16& rh) { // NOLINT + lh = __nv_bfloat16(float(lh) + float(rh)); // NOLINT + return lh; +} +__device__ __forceinline__ __nv_bfloat16& operator-=( + __nv_bfloat16& lh, const __nv_bfloat16& rh) { // NOLINT + lh = __nv_bfloat16(float(lh) - float(rh)); // NOLINT + return lh; +} +__device__ __forceinline__ __nv_bfloat16& operator*=( + __nv_bfloat16& lh, const __nv_bfloat16& rh) { // NOLINT + lh = __nv_bfloat16(float(lh) * float(rh)); // NOLINT + return lh; +} +__device__ __forceinline__ __nv_bfloat16& operator/=( + __nv_bfloat16& lh, const __nv_bfloat16& rh) { // NOLINT + lh = __nv_bfloat16(float(lh) / float(rh)); // NOLINT + return lh; +} + +__device__ __forceinline__ __nv_bfloat16& operator++( + __nv_bfloat16& h) { // NOLINT + h = __nv_bfloat16(float(h) + 1.0f); // NOLINT + return h; +} +__device__ __forceinline__ __nv_bfloat16& operator--( + __nv_bfloat16& h) { // NOLINT + h = __nv_bfloat16(float(h) - 1.0f); // NOLINT + return h; +} +__device__ __forceinline__ __nv_bfloat16 +operator++(__nv_bfloat16& h, int) { // NOLINT + __nv_bfloat16 ret = h; + h = __nv_bfloat16(float(h) + 1.0f); // NOLINT + return ret; +} +__device__ __forceinline__ __nv_bfloat16 +operator--(__nv_bfloat16& h, int) { // NOLINT + __nv_bfloat16 ret = h; + h = __nv_bfloat16(float(h) - 1.0f); // NOLINT + return ret; +} + +__device__ __forceinline__ __nv_bfloat16 operator+(const __nv_bfloat16& h) { + return h; +} +__device__ 
__forceinline__ __nv_bfloat16 operator-(const __nv_bfloat16& h) { + return __nv_bfloat16(-float(h)); // NOLINT +} + +__device__ __forceinline__ bool operator==( + const __nv_bfloat16& lh, const __nv_bfloat16& rh) { + return float(lh) == float(rh); // NOLINT +} +__device__ __forceinline__ bool operator!=( + const __nv_bfloat16& lh, const __nv_bfloat16& rh) { + return float(lh) != float(rh); // NOLINT +} +__device__ __forceinline__ bool operator>( + const __nv_bfloat16& lh, const __nv_bfloat16& rh) { + return float(lh) > float(rh); // NOLINT +} +__device__ __forceinline__ bool operator<( + const __nv_bfloat16& lh, const __nv_bfloat16& rh) { + return float(lh) < float(rh); // NOLINT +} +__device__ __forceinline__ bool operator>=( + const __nv_bfloat16& lh, const __nv_bfloat16& rh) { + return float(lh) >= float(rh); // NOLINT +} +__device__ __forceinline__ bool operator<=( + const __nv_bfloat16& lh, const __nv_bfloat16& rh) { + return float(lh) <= float(rh); // NOLINT +} +#endif // defined(CUDART_VERSION) && (CUDART_VERSION < 12020) +#endif // defined(__CUDA_ARCH__) && (__CUDA_ARCH__ < 800) +#endif // __CUDACC__ + +#endif // BF16_ENABLED + +#endif // DGL_ARRAY_CUDA_BF16_CUH_ diff --git a/src/array/cuda/coo2csr.cu b/src/array/cuda/coo2csr.cu index 237a35a26b3e..b55b0a3812dd 100644 --- a/src/array/cuda/coo2csr.cu +++ b/src/array/cuda/coo2csr.cu @@ -1,3 +1,4 @@ +#include "hip/hip_runtime.h" /** * Copyright (c) 2020 by Contributors * @file array/cuda/coo2csr.cc @@ -24,12 +25,12 @@ CSRMatrix COOToCSR(COOMatrix coo) { template <> CSRMatrix COOToCSR(COOMatrix coo) { auto* thr_entry = runtime::CUDAThreadEntry::ThreadLocal(); - cudaStream_t stream = runtime::getCurrentCUDAStream(); + hipStream_t stream = runtime::getCurrentCUDAStream(); // allocate cusparse handle if needed if (!thr_entry->cusparse_handle) { - CUSPARSE_CALL(cusparseCreate(&(thr_entry->cusparse_handle))); + CUSPARSE_CALL(hipsparseCreate(&(thr_entry->cusparse_handle))); } - CUSPARSE_CALL(cusparseSetStream(thr_entry->cusparse_handle, stream)); + CUSPARSE_CALL(hipsparseSetStream(thr_entry->cusparse_handle, stream)); bool row_sorted = coo.row_sorted; bool col_sorted = coo.col_sorted; @@ -51,9 +52,9 @@ CSRMatrix COOToCSR(COOMatrix coo) { NDArray indptr = aten::NewIdArray(coo.num_rows + 1, coo.row->ctx, coo.row->dtype.bits); int32_t* indptr_ptr = static_cast(indptr->data); - CUSPARSE_CALL(cusparseXcoo2csr( + CUSPARSE_CALL(hipsparseXcoo2csr( thr_entry->cusparse_handle, coo.row.Ptr(), nnz, coo.num_rows, - indptr_ptr, CUSPARSE_INDEX_BASE_ZERO)); + indptr_ptr, HIPSPARSE_INDEX_BASE_ZERO)); return CSRMatrix( coo.num_rows, coo.num_cols, indptr, coo.col, coo.data, col_sorted); @@ -101,7 +102,7 @@ template <> CSRMatrix COOToCSR(COOMatrix coo) { const auto& ctx = coo.row->ctx; const auto nbits = coo.row->dtype.bits; - cudaStream_t stream = runtime::getCurrentCUDAStream(); + hipStream_t stream = runtime::getCurrentCUDAStream(); bool row_sorted = coo.row_sorted; bool col_sorted = coo.col_sorted; if (!row_sorted) { diff --git a/src/array/cuda/coo2csr.cu.prehip b/src/array/cuda/coo2csr.cu.prehip new file mode 100644 index 000000000000..237a35a26b3e --- /dev/null +++ b/src/array/cuda/coo2csr.cu.prehip @@ -0,0 +1,137 @@ +/** + * Copyright (c) 2020 by Contributors + * @file array/cuda/coo2csr.cc + * @brief COO2CSR + */ +#include + +#include "../../runtime/cuda/cuda_common.h" +#include "./utils.h" + +namespace dgl { + +using runtime::NDArray; + +namespace aten { +namespace impl { + +template +CSRMatrix COOToCSR(COOMatrix coo) { + LOG(FATAL) << "Unreachable 
code."; + return {}; +} + +template <> +CSRMatrix COOToCSR(COOMatrix coo) { + auto* thr_entry = runtime::CUDAThreadEntry::ThreadLocal(); + cudaStream_t stream = runtime::getCurrentCUDAStream(); + // allocate cusparse handle if needed + if (!thr_entry->cusparse_handle) { + CUSPARSE_CALL(cusparseCreate(&(thr_entry->cusparse_handle))); + } + CUSPARSE_CALL(cusparseSetStream(thr_entry->cusparse_handle, stream)); + + bool row_sorted = coo.row_sorted; + bool col_sorted = coo.col_sorted; + if (!row_sorted) { + // we only need to sort the rows to perform conversion + coo = COOSort(coo, false); + col_sorted = coo.col_sorted; + } + + const int64_t nnz = coo.row->shape[0]; + CHECK_NO_OVERFLOW(coo.row->dtype, nnz); + // TODO(minjie): Many of our current implementation assumes that CSR must have + // a data array. This is a temporary workaround. Remove this after: + // - The old immutable graph implementation is deprecated. + // - The old binary reduce kernel is deprecated. + if (!COOHasData(coo)) + coo.data = aten::Range(0, nnz, coo.row->dtype.bits, coo.row->ctx); + + NDArray indptr = + aten::NewIdArray(coo.num_rows + 1, coo.row->ctx, coo.row->dtype.bits); + int32_t* indptr_ptr = static_cast(indptr->data); + CUSPARSE_CALL(cusparseXcoo2csr( + thr_entry->cusparse_handle, coo.row.Ptr(), nnz, coo.num_rows, + indptr_ptr, CUSPARSE_INDEX_BASE_ZERO)); + + return CSRMatrix( + coo.num_rows, coo.num_cols, indptr, coo.col, coo.data, col_sorted); +} + +/** + * @brief Search for the insertion positions for needle in the hay. + * + * The hay is a list of sorted elements and the result is the insertion position + * of each needle so that the insertion still gives sorted order. + * + * It essentially perform binary search to find upper bound for each needle + * elements. + * + * For example: + * hay = [0, 0, 1, 2, 2] + * needle = [0, 1, 2, 3] + * then, + * out = [2, 3, 5, 5] + */ +template +__global__ void _SortedSearchKernelUpperBound( + const IdType* hay, int64_t hay_size, const IdType* needles, + int64_t num_needles, IdType* pos) { + int tx = blockIdx.x * blockDim.x + threadIdx.x; + const int stride_x = gridDim.x * blockDim.x; + while (tx < num_needles) { + const IdType ele = needles[tx]; + // binary search + IdType lo = 0, hi = hay_size; + while (lo < hi) { + IdType mid = (lo + hi) >> 1; + if (hay[mid] <= ele) { + lo = mid + 1; + } else { + hi = mid; + } + } + pos[tx] = lo; + tx += stride_x; + } +} + +template <> +CSRMatrix COOToCSR(COOMatrix coo) { + const auto& ctx = coo.row->ctx; + const auto nbits = coo.row->dtype.bits; + cudaStream_t stream = runtime::getCurrentCUDAStream(); + bool row_sorted = coo.row_sorted; + bool col_sorted = coo.col_sorted; + if (!row_sorted) { + coo = COOSort(coo, false); + col_sorted = coo.col_sorted; + } + + const int64_t nnz = coo.row->shape[0]; + // TODO(minjie): Many of our current implementation assumes that CSR must have + // a data array. This is a temporary workaround. Remove this after: + // - The old immutable graph implementation is deprecated. + // - The old binary reduce kernel is deprecated. 
+ if (!COOHasData(coo)) + coo.data = aten::Range(0, nnz, coo.row->dtype.bits, coo.row->ctx); + + IdArray rowids = Range(0, coo.num_rows, nbits, ctx); + const int nt = cuda::FindNumThreads(coo.num_rows); + const int nb = (coo.num_rows + nt - 1) / nt; + IdArray indptr = Full(0, coo.num_rows + 1, nbits, ctx); + CUDA_KERNEL_CALL( + _SortedSearchKernelUpperBound, nb, nt, 0, stream, coo.row.Ptr(), + nnz, rowids.Ptr(), coo.num_rows, indptr.Ptr() + 1); + + return CSRMatrix( + coo.num_rows, coo.num_cols, indptr, coo.col, coo.data, col_sorted); +} + +template CSRMatrix COOToCSR(COOMatrix coo); +template CSRMatrix COOToCSR(COOMatrix coo); + +} // namespace impl +} // namespace aten +} // namespace dgl diff --git a/src/array/cuda/coo_sort.cu b/src/array/cuda/coo_sort.cu index fc0bc67925de..585eeb8f644c 100644 --- a/src/array/cuda/coo_sort.cu +++ b/src/array/cuda/coo_sort.cu @@ -1,3 +1,4 @@ +#include "hip/hip_runtime.h" /** * Copyright (c) 2020 by Contributors * @file array/cuda/coo_sort.cc @@ -65,7 +66,7 @@ __global__ void _COODecodeEdgesKernel( template void COOSort_(COOMatrix* coo, bool sort_column) { - cudaStream_t stream = runtime::getCurrentCUDAStream(); + hipStream_t stream = runtime::getCurrentCUDAStream(); const int row_bits = cuda::_NumberOfBits(coo->num_rows); const int64_t nnz = coo->row->shape[0]; @@ -138,7 +139,7 @@ template std::pair COOIsSorted(COOMatrix coo) { const int64_t nnz = coo.row->shape[0]; const auto& ctx = coo.row->ctx; - cudaStream_t stream = runtime::getCurrentCUDAStream(); + hipStream_t stream = runtime::getCurrentCUDAStream(); auto device = runtime::DeviceAPI::Get(ctx); // We allocate a workspace of 2*nnz bytes. It wastes a little bit memory but // should be fine. diff --git a/src/array/cuda/coo_sort.cu.prehip b/src/array/cuda/coo_sort.cu.prehip new file mode 100644 index 000000000000..fc0bc67925de --- /dev/null +++ b/src/array/cuda/coo_sort.cu.prehip @@ -0,0 +1,168 @@ +/** + * Copyright (c) 2020 by Contributors + * @file array/cuda/coo_sort.cc + * @brief Sort COO index + */ +#include + +#include "../../c_api_common.h" +#include "../../runtime/cuda/cuda_common.h" +#include "./utils.h" + +namespace dgl { + +using runtime::NDArray; + +namespace aten { +namespace impl { + +///////////////////////////// COOSort_ ///////////////////////////// + +/** + * @brief Encode row and column IDs into a single scalar per edge. + * + * @tparam IdType The type to encode as. + * @param row The row (src) IDs per edge. + * @param col The column (dst) IDs per edge. + * @param nnz The number of edges. + * @param col_bits The number of bits used to encode the destination. The row + * information is packed into the remaining bits. + * @param key The encoded edges (output). + */ +template +__global__ void _COOEncodeEdgesKernel( + const IdType* const row, const IdType* const col, const int64_t nnz, + const int col_bits, IdType* const key) { + int64_t tx = static_cast(blockIdx.x) * blockDim.x + threadIdx.x; + + if (tx < nnz) { + key[tx] = row[tx] << col_bits | col[tx]; + } +} + +/** + * @brief Decode row and column IDs from the encoded edges. + * + * @tparam IdType The type the edges are encoded as. + * @param key The encoded edges. + * @param nnz The number of edges. + * @param col_bits The number of bits used to store the column/dst ID. + * @param row The row (src) IDs per edge (output). + * @param col The col (dst) IDs per edge (output). 
+ */ +template +__global__ void _COODecodeEdgesKernel( + const IdType* const key, const int64_t nnz, const int col_bits, + IdType* const row, IdType* const col) { + int64_t tx = static_cast(blockIdx.x) * blockDim.x + threadIdx.x; + + if (tx < nnz) { + const IdType k = key[tx]; + row[tx] = k >> col_bits; + col[tx] = k & ((1 << col_bits) - 1); + } +} + +template +void COOSort_(COOMatrix* coo, bool sort_column) { + cudaStream_t stream = runtime::getCurrentCUDAStream(); + const int row_bits = cuda::_NumberOfBits(coo->num_rows); + + const int64_t nnz = coo->row->shape[0]; + if (sort_column) { + const int col_bits = cuda::_NumberOfBits(coo->num_cols); + const int num_bits = row_bits + col_bits; + + const int nt = 256; + const int nb = (nnz + nt - 1) / nt; + CHECK(static_cast(nb) * nt >= nnz); + + IdArray pos = aten::NewIdArray(nnz, coo->row->ctx, coo->row->dtype.bits); + + CUDA_KERNEL_CALL( + _COOEncodeEdgesKernel, nb, nt, 0, stream, coo->row.Ptr(), + coo->col.Ptr(), nnz, col_bits, pos.Ptr()); + + auto sorted = Sort(pos, num_bits); + + CUDA_KERNEL_CALL( + _COODecodeEdgesKernel, nb, nt, 0, stream, sorted.first.Ptr(), + nnz, col_bits, coo->row.Ptr(), coo->col.Ptr()); + + if (aten::COOHasData(*coo)) + coo->data = IndexSelect(coo->data, sorted.second); + else + coo->data = AsNumBits(sorted.second, coo->row->dtype.bits); + coo->row_sorted = coo->col_sorted = true; + } else { + const int num_bits = row_bits; + + auto sorted = Sort(coo->row, num_bits); + + coo->row = sorted.first; + coo->col = IndexSelect(coo->col, sorted.second); + + if (aten::COOHasData(*coo)) + coo->data = IndexSelect(coo->data, sorted.second); + else + coo->data = AsNumBits(sorted.second, coo->row->dtype.bits); + coo->row_sorted = true; + } +} + +template void COOSort_(COOMatrix* coo, bool sort_column); +template void COOSort_(COOMatrix* coo, bool sort_column); + +///////////////////////////// COOIsSorted ///////////////////////////// + +template +__global__ void _COOIsSortedKernel( + const IdType* row, const IdType* col, int64_t nnz, int8_t* row_sorted, + int8_t* col_sorted) { + int tx = blockIdx.x * blockDim.x + threadIdx.x; + const int stride_x = gridDim.x * blockDim.x; + while (tx < nnz) { + if (tx == 0) { + row_sorted[0] = 1; + col_sorted[0] = 1; + } else { + row_sorted[tx] = static_cast(row[tx - 1] <= row[tx]); + col_sorted[tx] = + static_cast(row[tx - 1] < row[tx] || col[tx - 1] <= col[tx]); + } + tx += stride_x; + } +} + +template +std::pair COOIsSorted(COOMatrix coo) { + const int64_t nnz = coo.row->shape[0]; + const auto& ctx = coo.row->ctx; + cudaStream_t stream = runtime::getCurrentCUDAStream(); + auto device = runtime::DeviceAPI::Get(ctx); + // We allocate a workspace of 2*nnz bytes. It wastes a little bit memory but + // should be fine. + int8_t* row_flags = static_cast(device->AllocWorkspace(ctx, nnz)); + int8_t* col_flags = static_cast(device->AllocWorkspace(ctx, nnz)); + const int nt = cuda::FindNumThreads(nnz); + const int nb = (nnz + nt - 1) / nt; + CUDA_KERNEL_CALL( + _COOIsSortedKernel, nb, nt, 0, stream, coo.row.Ptr(), + coo.col.Ptr(), nnz, row_flags, col_flags); + + const bool row_sorted = cuda::AllTrue(row_flags, nnz, ctx); + const bool col_sorted = + row_sorted ? 
cuda::AllTrue(col_flags, nnz, ctx) : false; + + device->FreeWorkspace(ctx, row_flags); + device->FreeWorkspace(ctx, col_flags); + + return {row_sorted, col_sorted}; +} + +template std::pair COOIsSorted(COOMatrix coo); +template std::pair COOIsSorted(COOMatrix coo); + +} // namespace impl +} // namespace aten +} // namespace dgl diff --git a/src/array/cuda/csr2coo.cu b/src/array/cuda/csr2coo.cu index e7eecdcda125..c116a599eac5 100644 --- a/src/array/cuda/csr2coo.cu +++ b/src/array/cuda/csr2coo.cu @@ -8,7 +8,7 @@ #include #include -#include +#include #include "../../runtime/cuda/cuda_common.h" #include "./utils.h" @@ -29,12 +29,12 @@ COOMatrix CSRToCOO(CSRMatrix csr) { template <> COOMatrix CSRToCOO(CSRMatrix csr) { auto* thr_entry = runtime::CUDAThreadEntry::ThreadLocal(); - cudaStream_t stream = runtime::getCurrentCUDAStream(); + hipStream_t stream = runtime::getCurrentCUDAStream(); // allocate cusparse handle if needed if (!thr_entry->cusparse_handle) { - CUSPARSE_CALL(cusparseCreate(&(thr_entry->cusparse_handle))); + CUSPARSE_CALL(hipsparseCreate(&(thr_entry->cusparse_handle))); } - CUSPARSE_CALL(cusparseSetStream(thr_entry->cusparse_handle, stream)); + CUSPARSE_CALL(hipsparseSetStream(thr_entry->cusparse_handle, stream)); NDArray indptr = csr.indptr, indices = csr.indices, data = csr.data; const int32_t* indptr_ptr = static_cast(indptr->data); @@ -42,9 +42,9 @@ COOMatrix CSRToCOO(CSRMatrix csr) { aten::NewIdArray(indices->shape[0], indptr->ctx, indptr->dtype.bits); int32_t* row_ptr = static_cast(row->data); - CUSPARSE_CALL(cusparseXcsr2coo( + CUSPARSE_CALL(hipsparseXcsr2coo( thr_entry->cusparse_handle, indptr_ptr, indices->shape[0], csr.num_rows, - row_ptr, CUSPARSE_INDEX_BASE_ZERO)); + row_ptr, HIPSPARSE_INDEX_BASE_ZERO)); return COOMatrix( csr.num_rows, csr.num_cols, row, indices, data, true, csr.sorted); @@ -75,7 +75,7 @@ struct AdjacentDifference { template <> COOMatrix CSRToCOO(CSRMatrix csr) { const auto& ctx = csr.indptr->ctx; - cudaStream_t stream = runtime::getCurrentCUDAStream(); + hipStream_t stream = runtime::getCurrentCUDAStream(); const int64_t nnz = csr.indices->shape[0]; const auto nbits = csr.indptr->dtype.bits; @@ -94,14 +94,14 @@ COOMatrix CSRToCOO(CSRMatrix csr) { constexpr int64_t max_copy_at_once = std::numeric_limits::max(); for (int64_t i = 0; i < csr.num_rows; i += max_copy_at_once) { std::size_t temp_storage_bytes = 0; - CUDA_CALL(cub::DeviceCopy::Batched( + CUDA_CALL(hipcub::DeviceCopy::Batched( nullptr, temp_storage_bytes, input_buffer + i, output_buffer + i, buffer_sizes + i, std::min(csr.num_rows - i, max_copy_at_once), stream)); auto temp = allocator.alloc_unique(temp_storage_bytes); - CUDA_CALL(cub::DeviceCopy::Batched( + CUDA_CALL(hipcub::DeviceCopy::Batched( temp.get(), temp_storage_bytes, input_buffer + i, output_buffer + i, buffer_sizes + i, std::min(csr.num_rows - i, max_copy_at_once), stream)); @@ -128,12 +128,12 @@ COOMatrix CSRToCOODataAsOrder(CSRMatrix csr) { auto* thr_entry = runtime::CUDAThreadEntry::ThreadLocal(); auto device = runtime::DeviceAPI::Get(coo.row->ctx); - cudaStream_t stream = runtime::getCurrentCUDAStream(); + hipStream_t stream = runtime::getCurrentCUDAStream(); // allocate cusparse handle if needed if (!thr_entry->cusparse_handle) { - CUSPARSE_CALL(cusparseCreate(&(thr_entry->cusparse_handle))); + CUSPARSE_CALL(hipsparseCreate(&(thr_entry->cusparse_handle))); } - CUSPARSE_CALL(cusparseSetStream(thr_entry->cusparse_handle, stream)); + CUSPARSE_CALL(hipsparseSetStream(thr_entry->cusparse_handle, stream)); NDArray row = coo.row, 
col = coo.col, data = coo.data; int32_t* row_ptr = static_cast(row->data); @@ -141,11 +141,11 @@ COOMatrix CSRToCOODataAsOrder(CSRMatrix csr) { int32_t* data_ptr = static_cast(data->data); size_t workspace_size = 0; - CUSPARSE_CALL(cusparseXcoosort_bufferSizeExt( + CUSPARSE_CALL(hipsparseXcoosort_bufferSizeExt( thr_entry->cusparse_handle, coo.num_rows, coo.num_cols, row->shape[0], data_ptr, row_ptr, &workspace_size)); void* workspace = device->AllocWorkspace(row->ctx, workspace_size); - CUSPARSE_CALL(cusparseXcoosortByRow( + CUSPARSE_CALL(hipsparseXcoosortByRow( thr_entry->cusparse_handle, coo.num_rows, coo.num_cols, row->shape[0], data_ptr, row_ptr, col_ptr, workspace)); device->FreeWorkspace(row->ctx, workspace); diff --git a/src/array/cuda/csr2coo.cu.prehip b/src/array/cuda/csr2coo.cu.prehip new file mode 100644 index 000000000000..e7eecdcda125 --- /dev/null +++ b/src/array/cuda/csr2coo.cu.prehip @@ -0,0 +1,183 @@ +/** + * Copyright (c) 2020 by Contributors + * @file array/cuda/csr2coo.cc + * @brief CSR2COO + */ +#include +#include +#include +#include + +#include + +#include "../../runtime/cuda/cuda_common.h" +#include "./utils.h" + +namespace dgl { + +using runtime::NDArray; + +namespace aten { +namespace impl { + +template +COOMatrix CSRToCOO(CSRMatrix csr) { + LOG(FATAL) << "Unreachable codes"; + return {}; +} + +template <> +COOMatrix CSRToCOO(CSRMatrix csr) { + auto* thr_entry = runtime::CUDAThreadEntry::ThreadLocal(); + cudaStream_t stream = runtime::getCurrentCUDAStream(); + // allocate cusparse handle if needed + if (!thr_entry->cusparse_handle) { + CUSPARSE_CALL(cusparseCreate(&(thr_entry->cusparse_handle))); + } + CUSPARSE_CALL(cusparseSetStream(thr_entry->cusparse_handle, stream)); + + NDArray indptr = csr.indptr, indices = csr.indices, data = csr.data; + const int32_t* indptr_ptr = static_cast(indptr->data); + NDArray row = + aten::NewIdArray(indices->shape[0], indptr->ctx, indptr->dtype.bits); + int32_t* row_ptr = static_cast(row->data); + + CUSPARSE_CALL(cusparseXcsr2coo( + thr_entry->cusparse_handle, indptr_ptr, indices->shape[0], csr.num_rows, + row_ptr, CUSPARSE_INDEX_BASE_ZERO)); + + return COOMatrix( + csr.num_rows, csr.num_cols, row, indices, data, true, csr.sorted); +} + +struct RepeatIndex { + template + __host__ __device__ auto operator()(IdType i) { + return thrust::make_constant_iterator(i); + } +}; + +template +struct OutputBufferIndexer { + const IdType* indptr; + IdType* buffer; + __host__ __device__ auto operator()(IdType i) { return buffer + indptr[i]; } +}; + +template +struct AdjacentDifference { + const IdType* indptr; + __host__ __device__ auto operator()(IdType i) { + return indptr[i + 1] - indptr[i]; + } +}; + +template <> +COOMatrix CSRToCOO(CSRMatrix csr) { + const auto& ctx = csr.indptr->ctx; + cudaStream_t stream = runtime::getCurrentCUDAStream(); + + const int64_t nnz = csr.indices->shape[0]; + const auto nbits = csr.indptr->dtype.bits; + IdArray ret_row = NewIdArray(nnz, ctx, nbits); + + runtime::CUDAWorkspaceAllocator allocator(csr.indptr->ctx); + thrust::counting_iterator iota(0); + + auto input_buffer = thrust::make_transform_iterator(iota, RepeatIndex{}); + auto output_buffer = thrust::make_transform_iterator( + iota, OutputBufferIndexer{ + csr.indptr.Ptr(), ret_row.Ptr()}); + auto buffer_sizes = thrust::make_transform_iterator( + iota, AdjacentDifference{csr.indptr.Ptr()}); + + constexpr int64_t max_copy_at_once = std::numeric_limits::max(); + for (int64_t i = 0; i < csr.num_rows; i += max_copy_at_once) { + std::size_t temp_storage_bytes = 
0; + CUDA_CALL(cub::DeviceCopy::Batched( + nullptr, temp_storage_bytes, input_buffer + i, output_buffer + i, + buffer_sizes + i, std::min(csr.num_rows - i, max_copy_at_once), + stream)); + + auto temp = allocator.alloc_unique(temp_storage_bytes); + + CUDA_CALL(cub::DeviceCopy::Batched( + temp.get(), temp_storage_bytes, input_buffer + i, output_buffer + i, + buffer_sizes + i, std::min(csr.num_rows - i, max_copy_at_once), + stream)); + } + + return COOMatrix( + csr.num_rows, csr.num_cols, ret_row, csr.indices, csr.data, true, + csr.sorted); +} + +template COOMatrix CSRToCOO(CSRMatrix csr); +template COOMatrix CSRToCOO(CSRMatrix csr); + +template +COOMatrix CSRToCOODataAsOrder(CSRMatrix csr) { + LOG(FATAL) << "Unreachable codes"; + return {}; +} + +template <> +COOMatrix CSRToCOODataAsOrder(CSRMatrix csr) { + COOMatrix coo = CSRToCOO(csr); + if (aten::IsNullArray(coo.data)) return coo; + + auto* thr_entry = runtime::CUDAThreadEntry::ThreadLocal(); + auto device = runtime::DeviceAPI::Get(coo.row->ctx); + cudaStream_t stream = runtime::getCurrentCUDAStream(); + // allocate cusparse handle if needed + if (!thr_entry->cusparse_handle) { + CUSPARSE_CALL(cusparseCreate(&(thr_entry->cusparse_handle))); + } + CUSPARSE_CALL(cusparseSetStream(thr_entry->cusparse_handle, stream)); + + NDArray row = coo.row, col = coo.col, data = coo.data; + int32_t* row_ptr = static_cast(row->data); + int32_t* col_ptr = static_cast(col->data); + int32_t* data_ptr = static_cast(data->data); + + size_t workspace_size = 0; + CUSPARSE_CALL(cusparseXcoosort_bufferSizeExt( + thr_entry->cusparse_handle, coo.num_rows, coo.num_cols, row->shape[0], + data_ptr, row_ptr, &workspace_size)); + void* workspace = device->AllocWorkspace(row->ctx, workspace_size); + CUSPARSE_CALL(cusparseXcoosortByRow( + thr_entry->cusparse_handle, coo.num_rows, coo.num_cols, row->shape[0], + data_ptr, row_ptr, col_ptr, workspace)); + device->FreeWorkspace(row->ctx, workspace); + + // The row and column field have already been reordered according + // to data, thus the data field will be deprecated. + coo.data = aten::NullArray(); + coo.row_sorted = false; + coo.col_sorted = false; + return coo; +} + +template <> +COOMatrix CSRToCOODataAsOrder(CSRMatrix csr) { + COOMatrix coo = CSRToCOO(csr); + if (aten::IsNullArray(coo.data)) return coo; + const auto& sorted = Sort(coo.data); + + coo.row = IndexSelect(coo.row, sorted.second); + coo.col = IndexSelect(coo.col, sorted.second); + + // The row and column field have already been reordered according + // to data, thus the data field will be deprecated. 
+ coo.data = aten::NullArray(); + coo.row_sorted = false; + coo.col_sorted = false; + return coo; +} + +template COOMatrix CSRToCOODataAsOrder(CSRMatrix csr); +template COOMatrix CSRToCOODataAsOrder(CSRMatrix csr); + +} // namespace impl +} // namespace aten +} // namespace dgl diff --git a/src/array/cuda/csr_get_data.cu b/src/array/cuda/csr_get_data.cu index 9574b2a5e849..ecb3e6796992 100644 --- a/src/array/cuda/csr_get_data.cu +++ b/src/array/cuda/csr_get_data.cu @@ -36,7 +36,7 @@ NDArray CSRGetData( IdArray rst = NDArray::Empty({rstlen}, weights->dtype, rows->ctx); if (rstlen == 0) return rst; - cudaStream_t stream = runtime::getCurrentCUDAStream(); + hipStream_t stream = runtime::getCurrentCUDAStream(); const int nt = cuda::FindNumThreads(rstlen); const int nb = (rstlen + nt - 1) / nt; if (return_eids) @@ -67,12 +67,12 @@ template NDArray CSRGetData( CSRMatrix csr, NDArray rows, NDArray cols, bool return_eids, NDArray weights, __half filler); #if BF16_ENABLED -template NDArray CSRGetData( +template NDArray CSRGetData( CSRMatrix csr, NDArray rows, NDArray cols, bool return_eids, - NDArray weights, __nv_bfloat16 filler); -template NDArray CSRGetData( + NDArray weights, __hip_bfloat16 filler); +template NDArray CSRGetData( CSRMatrix csr, NDArray rows, NDArray cols, bool return_eids, - NDArray weights, __nv_bfloat16 filler); + NDArray weights, __hip_bfloat16 filler); #endif // BF16_ENABLED template NDArray CSRGetData( CSRMatrix csr, NDArray rows, NDArray cols, bool return_eids, diff --git a/src/array/cuda/csr_get_data.cu.prehip b/src/array/cuda/csr_get_data.cu.prehip new file mode 100644 index 000000000000..9574b2a5e849 --- /dev/null +++ b/src/array/cuda/csr_get_data.cu.prehip @@ -0,0 +1,100 @@ +/** + * Copyright (c) 2021 by Contributors + * @file array/cuda/csr_get_data.cu + * @brief Retrieve entries of a CSR matrix + */ +#include + +#include +#include +#include + +#include "../../runtime/cuda/cuda_common.h" +#include "./utils.h" + +namespace dgl { + +using runtime::NDArray; + +namespace aten { +namespace impl { + +template +NDArray CSRGetData( + CSRMatrix csr, NDArray rows, NDArray cols, bool return_eids, + NDArray weights, DType filler) { + const int64_t rowlen = rows->shape[0]; + const int64_t collen = cols->shape[0]; + + CHECK((rowlen == collen) || (rowlen == 1) || (collen == 1)) + << "Invalid row and col id array."; + + const int64_t row_stride = (rowlen == 1 && collen != 1) ? 0 : 1; + const int64_t col_stride = (collen == 1 && rowlen != 1) ? 0 : 1; + + const int64_t rstlen = std::max(rowlen, collen); + IdArray rst = NDArray::Empty({rstlen}, weights->dtype, rows->ctx); + if (rstlen == 0) return rst; + + cudaStream_t stream = runtime::getCurrentCUDAStream(); + const int nt = cuda::FindNumThreads(rstlen); + const int nb = (rstlen + nt - 1) / nt; + if (return_eids) + BUG_IF_FAIL(DGLDataTypeTraits::dtype == rows->dtype) + << "DType does not match row's dtype."; + + const IdType* indptr_data = + static_cast(cuda::GetDevicePointer(csr.indptr)); + const IdType* indices_data = + static_cast(cuda::GetDevicePointer(csr.indices)); + const IdType* data_data = + CSRHasData(csr) ? static_cast(cuda::GetDevicePointer(csr.data)) + : nullptr; + + // TODO(minjie): use binary search for sorted csr + CUDA_KERNEL_CALL( + cuda::_LinearSearchKernel, nb, nt, 0, stream, indptr_data, indices_data, + data_data, rows.Ptr(), cols.Ptr(), row_stride, col_stride, + rstlen, return_eids ? 
nullptr : weights.Ptr(), filler, + rst.Ptr()); + return rst; +} + +template NDArray CSRGetData( + CSRMatrix csr, NDArray rows, NDArray cols, bool return_eids, + NDArray weights, __half filler); +template NDArray CSRGetData( + CSRMatrix csr, NDArray rows, NDArray cols, bool return_eids, + NDArray weights, __half filler); +#if BF16_ENABLED +template NDArray CSRGetData( + CSRMatrix csr, NDArray rows, NDArray cols, bool return_eids, + NDArray weights, __nv_bfloat16 filler); +template NDArray CSRGetData( + CSRMatrix csr, NDArray rows, NDArray cols, bool return_eids, + NDArray weights, __nv_bfloat16 filler); +#endif // BF16_ENABLED +template NDArray CSRGetData( + CSRMatrix csr, NDArray rows, NDArray cols, bool return_eids, + NDArray weights, float filler); +template NDArray CSRGetData( + CSRMatrix csr, NDArray rows, NDArray cols, bool return_eids, + NDArray weights, float filler); +template NDArray CSRGetData( + CSRMatrix csr, NDArray rows, NDArray cols, bool return_eids, + NDArray weights, double filler); +template NDArray CSRGetData( + CSRMatrix csr, NDArray rows, NDArray cols, bool return_eids, + NDArray weights, double filler); + +// For CSRGetData(CSRMatrix, NDArray, NDArray) +template NDArray CSRGetData( + CSRMatrix csr, NDArray rows, NDArray cols, bool return_eids, + NDArray weights, int32_t filler); +template NDArray CSRGetData( + CSRMatrix csr, NDArray rows, NDArray cols, bool return_eids, + NDArray weights, int64_t filler); + +} // namespace impl +} // namespace aten +} // namespace dgl diff --git a/src/array/cuda/csr_mm.cu b/src/array/cuda/csr_mm.cu index ad05c5f4155c..35b8a80d7c2e 100644 --- a/src/array/cuda/csr_mm.cu +++ b/src/array/cuda/csr_mm.cu @@ -31,74 +31,74 @@ std::pair CusparseSpgemm( const int nnzB = B.indices->shape[0]; const DType alpha = 1.0; const DType beta = 0.0; - auto transA = CUSPARSE_OPERATION_NON_TRANSPOSE; - auto transB = CUSPARSE_OPERATION_NON_TRANSPOSE; + auto transA = HIPSPARSE_OPERATION_NON_TRANSPOSE; + auto transB = HIPSPARSE_OPERATION_NON_TRANSPOSE; // device auto ctx = A.indptr->ctx; auto device = runtime::DeviceAPI::Get(ctx); auto* thr_entry = runtime::CUDAThreadEntry::ThreadLocal(); - cudaStream_t stream = runtime::getCurrentCUDAStream(); + hipStream_t stream = runtime::getCurrentCUDAStream(); const DType* A_weights = A_weights_array.Ptr(); const DType* B_weights = B_weights_array.Ptr(); // allocate cusparse handle if needed if (!thr_entry->cusparse_handle) { - CUSPARSE_CALL(cusparseCreate(&(thr_entry->cusparse_handle))); + CUSPARSE_CALL(hipsparseCreate(&(thr_entry->cusparse_handle))); } - CUSPARSE_CALL(cusparseSetStream(thr_entry->cusparse_handle, stream)); + CUSPARSE_CALL(hipsparseSetStream(thr_entry->cusparse_handle, stream)); // all one data array - cusparseSpMatDescr_t matA, matB, matC; + hipsparseSpMatDescr_t matA, matB, matC; IdArray dC_csrOffsets = IdArray::Empty({A.num_rows + 1}, A.indptr->dtype, A.indptr->ctx); IdType* dC_csrOffsets_data = dC_csrOffsets.Ptr(); constexpr auto idtype = cusparse_idtype::value; constexpr auto dtype = cuda_dtype::value; // Create sparse matrix A, B and C in CSR format - CUSPARSE_CALL(cusparseCreateCsr( + CUSPARSE_CALL(hipsparseCreateCsr( &matA, A.num_rows, A.num_cols, nnzA, A.indptr.Ptr(), A.indices.Ptr(), - // cusparseCreateCsr only accepts non-const pointers. - const_cast(A_weights), idtype, idtype, CUSPARSE_INDEX_BASE_ZERO, + // hipsparseCreateCsr only accepts non-const pointers. 
+ const_cast(A_weights), idtype, idtype, HIPSPARSE_INDEX_BASE_ZERO, dtype)); - CUSPARSE_CALL(cusparseCreateCsr( + CUSPARSE_CALL(hipsparseCreateCsr( &matB, B.num_rows, B.num_cols, nnzB, B.indptr.Ptr(), B.indices.Ptr(), - // cusparseCreateCsr only accepts non-const pointers. - const_cast(B_weights), idtype, idtype, CUSPARSE_INDEX_BASE_ZERO, + // hipsparseCreateCsr only accepts non-const pointers. + const_cast(B_weights), idtype, idtype, HIPSPARSE_INDEX_BASE_ZERO, dtype)); - CUSPARSE_CALL(cusparseCreateCsr( + CUSPARSE_CALL(hipsparseCreateCsr( &matC, A.num_rows, B.num_cols, 0, dC_csrOffsets_data, nullptr, nullptr, - idtype, idtype, CUSPARSE_INDEX_BASE_ZERO, dtype)); + idtype, idtype, HIPSPARSE_INDEX_BASE_ZERO, dtype)); // SpGEMM Computation - cusparseSpGEMMDescr_t spgemmDesc; - cusparseSpGEMMAlg_t alg = CUSPARSE_SPGEMM_DEFAULT; + hipsparseSpGEMMDescr_t spgemmDesc; + hipsparseSpGEMMAlg_t alg = HIPSPARSE_SPGEMM_DEFAULT; - CUSPARSE_CALL(cusparseSpGEMM_createDescr(&spgemmDesc)); + CUSPARSE_CALL(hipsparseSpGEMM_createDescr(&spgemmDesc)); size_t workspace_size1 = 0, workspace_size2 = 0, workspace_size3 = 0; // ask bufferSize1 bytes for external memory - CUSPARSE_CALL(cusparseSpGEMM_workEstimation( + CUSPARSE_CALL(hipsparseSpGEMM_workEstimation( thr_entry->cusparse_handle, transA, transB, &alpha, matA, matB, &beta, matC, dtype, alg, spgemmDesc, &workspace_size1, NULL)); void* workspace1 = (device->AllocWorkspace(ctx, workspace_size1)); // inspect the matrices A and B to understand the memory requiremnent - cusparseStatus_t e = cusparseSpGEMM_workEstimation( + hipsparseStatus_t e = hipsparseSpGEMM_workEstimation( thr_entry->cusparse_handle, transA, transB, &alpha, matA, matB, &beta, matC, dtype, alg, spgemmDesc, &workspace_size1, workspace1); - // CUSPARSE_SPGEMM_DEFAULT not support getting num_prods > 2^31 -1 + // HIPSPARSE_SPGEMM_DEFAULT not support getting num_prods > 2^31 -1 // and throws insufficient memory error within workEstimation call - if (e == CUSPARSE_STATUS_INSUFFICIENT_RESOURCES) { + if (e == HIPSPARSE_STATUS_INSUFFICIENT_RESOURCES) { // fall back to ALG2 to estimate num_prods - alg = CUSPARSE_SPGEMM_ALG2; + alg = HIPSPARSE_SPGEMM_ALG2; device->FreeWorkspace(ctx, workspace1); - // rerun cusparseSpGEMM_workEstimation - CUSPARSE_CALL(cusparseSpGEMM_workEstimation( + // rerun hipsparseSpGEMM_workEstimation + CUSPARSE_CALL(hipsparseSpGEMM_workEstimation( thr_entry->cusparse_handle, transA, transB, &alpha, matA, matB, &beta, matC, dtype, alg, spgemmDesc, &workspace_size1, NULL)); workspace1 = (device->AllocWorkspace(ctx, workspace_size1)); - CUSPARSE_CALL(cusparseSpGEMM_workEstimation( + CUSPARSE_CALL(hipsparseSpGEMM_workEstimation( thr_entry->cusparse_handle, transA, transB, &alpha, matA, matB, &beta, matC, dtype, alg, spgemmDesc, &workspace_size1, workspace1)); } else { - CHECK(e == CUSPARSE_STATUS_SUCCESS) << "CUSPARSE ERROR in SpGEMM: " << e; + CHECK(e == HIPSPARSE_STATUS_SUCCESS) << "CUSPARSE ERROR in SpGEMM: " << e; } // get the number of intermediate products required for SpGEMM compute @@ -113,26 +113,26 @@ std::pair CusparseSpgemm( int64_t LARGE_NUM_PRODUCTS = 800000000; // 800*1000*1000; // switch to ALG2/ALG3 for medium & large problem size - if (alg == CUSPARSE_SPGEMM_DEFAULT && num_prods > MEDIUM_NUM_PRODUCTS) { + if (alg == HIPSPARSE_SPGEMM_DEFAULT && num_prods > MEDIUM_NUM_PRODUCTS) { // use ALG3 for very large problem - alg = num_prods > LARGE_NUM_PRODUCTS ? CUSPARSE_SPGEMM_ALG3 - : CUSPARSE_SPGEMM_ALG2; + alg = num_prods > LARGE_NUM_PRODUCTS ? 
HIPSPARSE_SPGEMM_ALG3 + : HIPSPARSE_SPGEMM_ALG2; device->FreeWorkspace(ctx, workspace1); - // rerun cusparseSpGEMM_workEstimation - CUSPARSE_CALL(cusparseSpGEMM_workEstimation( + // rerun hipsparseSpGEMM_workEstimation + CUSPARSE_CALL(hipsparseSpGEMM_workEstimation( thr_entry->cusparse_handle, transA, transB, &alpha, matA, matB, &beta, matC, dtype, alg, spgemmDesc, &workspace_size1, NULL)); workspace1 = (device->AllocWorkspace(ctx, workspace_size1)); - CUSPARSE_CALL(cusparseSpGEMM_workEstimation( + CUSPARSE_CALL(hipsparseSpGEMM_workEstimation( thr_entry->cusparse_handle, transA, transB, &alpha, matA, matB, &beta, matC, dtype, alg, spgemmDesc, &workspace_size1, workspace1)); - } else if (alg == CUSPARSE_SPGEMM_ALG2 && num_prods > LARGE_NUM_PRODUCTS) { - // no need to rerun cusparseSpGEMM_workEstimation between ALG2 and ALG3 - alg = CUSPARSE_SPGEMM_ALG3; + } else if (alg == HIPSPARSE_SPGEMM_ALG2 && num_prods > LARGE_NUM_PRODUCTS) { + // no need to rerun hipsparseSpGEMM_workEstimation between ALG2 and ALG3 + alg = HIPSPARSE_SPGEMM_ALG3; } - if (alg == CUSPARSE_SPGEMM_ALG2 || alg == CUSPARSE_SPGEMM_ALG3) { + if (alg == HIPSPARSE_SPGEMM_ALG2 || alg == HIPSPARSE_SPGEMM_ALG3) { // estimate memory for ALG2/ALG3; note chunk_fraction is only used by ALG3 // reduce chunk_fraction if crash due to mem., but it trades off speed float chunk_fraction = num_prods < 4 * LARGE_NUM_PRODUCTS ? 0.15 : 0.05; @@ -147,40 +147,40 @@ std::pair CusparseSpgemm( workspace3, &workspace_size2)); device->FreeWorkspace(ctx, workspace3); } else { - CUSPARSE_CALL(cusparseSpGEMM_compute( + CUSPARSE_CALL(hipsparseSpGEMM_compute( thr_entry->cusparse_handle, transA, transB, &alpha, matA, matB, &beta, matC, dtype, alg, spgemmDesc, &workspace_size2, NULL)); } // ask bufferSize2 bytes for external memory void* workspace2 = device->AllocWorkspace(ctx, workspace_size2); // compute the intermediate product of A * B - CUSPARSE_CALL(cusparseSpGEMM_compute( + CUSPARSE_CALL(hipsparseSpGEMM_compute( thr_entry->cusparse_handle, transA, transB, &alpha, matA, matB, &beta, matC, dtype, alg, spgemmDesc, &workspace_size2, workspace2)); // get matrix C non-zero entries C_nnz1 int64_t C_num_rows1, C_num_cols1, C_nnz1; CUSPARSE_CALL( - cusparseSpMatGetSize(matC, &C_num_rows1, &C_num_cols1, &C_nnz1)); + hipsparseSpMatGetSize(matC, &C_num_rows1, &C_num_cols1, &C_nnz1)); IdArray dC_columns = IdArray::Empty({C_nnz1}, A.indptr->dtype, A.indptr->ctx); NDArray dC_weights = NDArray::Empty({C_nnz1}, A_weights_array->dtype, A.indptr->ctx); IdType* dC_columns_data = dC_columns.Ptr(); DType* dC_weights_data = dC_weights.Ptr(); // update matC with the new pointers - CUSPARSE_CALL(cusparseCsrSetPointers( + CUSPARSE_CALL(hipsparseCsrSetPointers( matC, dC_csrOffsets_data, dC_columns_data, dC_weights_data)); // copy the final products to the matrix C - CUSPARSE_CALL(cusparseSpGEMM_copy( + CUSPARSE_CALL(hipsparseSpGEMM_copy( thr_entry->cusparse_handle, transA, transB, &alpha, matA, matB, &beta, matC, dtype, alg, spgemmDesc)); device->FreeWorkspace(ctx, workspace1); device->FreeWorkspace(ctx, workspace2); // destroy matrix/vector descriptors - CUSPARSE_CALL(cusparseSpGEMM_destroyDescr(spgemmDesc)); - CUSPARSE_CALL(cusparseDestroySpMat(matA)); - CUSPARSE_CALL(cusparseDestroySpMat(matB)); - CUSPARSE_CALL(cusparseDestroySpMat(matC)); + CUSPARSE_CALL(hipsparseSpGEMM_destroyDescr(spgemmDesc)); + CUSPARSE_CALL(hipsparseDestroySpMat(matA)); + CUSPARSE_CALL(hipsparseDestroySpMat(matB)); + CUSPARSE_CALL(hipsparseDestroySpMat(matC)); return { CSRMatrix( A.num_rows, B.num_cols, 
dC_csrOffsets, dC_columns, @@ -208,25 +208,25 @@ std::pair CusparseSpgemm( auto ctx = A.indptr->ctx; auto device = runtime::DeviceAPI::Get(ctx); auto* thr_entry = runtime::CUDAThreadEntry::ThreadLocal(); - cudaStream_t stream = runtime::getCurrentCUDAStream(); + hipStream_t stream = runtime::getCurrentCUDAStream(); auto idtype = A.indptr->dtype; auto dtype = A_weights_array->dtype; const DType* A_weights = A_weights_array.Ptr(); const DType* B_weights = B_weights_array.Ptr(); if (!thr_entry->cusparse_handle) { - CUSPARSE_CALL(cusparseCreate(&(thr_entry->cusparse_handle))); + CUSPARSE_CALL(hipsparseCreate(&(thr_entry->cusparse_handle))); } - CUSPARSE_CALL(cusparseSetStream(thr_entry->cusparse_handle, stream)); - CUSPARSE_CALL(cusparseSetPointerMode( - thr_entry->cusparse_handle, CUSPARSE_POINTER_MODE_HOST)); + CUSPARSE_CALL(hipsparseSetStream(thr_entry->cusparse_handle, stream)); + CUSPARSE_CALL(hipsparseSetPointerMode( + thr_entry->cusparse_handle, HIPSPARSE_POINTER_MODE_HOST)); - CUSPARSE_CALL(cusparseCreateCsrgemm2Info(&info)); + CUSPARSE_CALL(hipsparseCreateCsrgemm2Info(&info)); - cusparseMatDescr_t matA, matB, matC, matD; - CUSPARSE_CALL(cusparseCreateMatDescr(&matA)); - CUSPARSE_CALL(cusparseCreateMatDescr(&matB)); - CUSPARSE_CALL(cusparseCreateMatDescr(&matC)); - CUSPARSE_CALL(cusparseCreateMatDescr(&matD)); // needed even if D is null + hipsparseMatDescr_t matA, matB, matC, matD; + CUSPARSE_CALL(hipsparseCreateMatDescr(&matA)); + CUSPARSE_CALL(hipsparseCreateMatDescr(&matB)); + CUSPARSE_CALL(hipsparseCreateMatDescr(&matC)); + CUSPARSE_CALL(hipsparseCreateMatDescr(&matD)); // needed even if D is null CUSPARSE_CALL(CSRGEMM::bufferSizeExt( thr_entry->cusparse_handle, m, n, k, &alpha, matA, nnzA, @@ -252,11 +252,11 @@ std::pair CusparseSpgemm( C_indptr.Ptr(), C_indices.Ptr(), info, workspace)); device->FreeWorkspace(ctx, workspace); - CUSPARSE_CALL(cusparseDestroyCsrgemm2Info(info)); - CUSPARSE_CALL(cusparseDestroyMatDescr(matA)); - CUSPARSE_CALL(cusparseDestroyMatDescr(matB)); - CUSPARSE_CALL(cusparseDestroyMatDescr(matC)); - CUSPARSE_CALL(cusparseDestroyMatDescr(matD)); + CUSPARSE_CALL(hipsparseDestroyCsrgemm2Info(info)); + CUSPARSE_CALL(hipsparseDestroyMatDescr(matA)); + CUSPARSE_CALL(hipsparseDestroyMatDescr(matB)); + CUSPARSE_CALL(hipsparseDestroyMatDescr(matC)); + CUSPARSE_CALL(hipsparseDestroyMatDescr(matD)); return { CSRMatrix( @@ -314,9 +314,9 @@ template std::pair CSRMM( template std::pair CSRMM( const CSRMatrix&, NDArray, const CSRMatrix&, NDArray); #if BF16_ENABLED -template std::pair CSRMM( +template std::pair CSRMM( const CSRMatrix&, NDArray, const CSRMatrix&, NDArray); -template std::pair CSRMM( +template std::pair CSRMM( const CSRMatrix&, NDArray, const CSRMatrix&, NDArray); #endif // BF16_ENABLED template std::pair CSRMM( diff --git a/src/array/cuda/csr_mm.cu.prehip b/src/array/cuda/csr_mm.cu.prehip new file mode 100644 index 000000000000..ad05c5f4155c --- /dev/null +++ b/src/array/cuda/csr_mm.cu.prehip @@ -0,0 +1,332 @@ +/** + * Copyright (c) 2020 by Contributors + * @file array/cuda/csr_mm.cu + * @brief SpSpMM/SpGEMM C APIs and definitions. 
+ */ +#include +#include + +#include + +#include "../../runtime/cuda/cuda_common.h" +#include "./cusparse_dispatcher.cuh" +#include "./functor.cuh" +namespace dgl { + +using namespace dgl::runtime; + +namespace aten { +namespace cusparse { + +#if CUDART_VERSION >= 12000 + +/** @brief Cusparse implementation of SpGEMM on Csr format for CUDA 12.0+ */ +template +std::pair CusparseSpgemm( + const CSRMatrix& A, const NDArray A_weights_array, const CSRMatrix& B, + const NDArray B_weights_array) { + // We use Spgemm (SpSpMM) to perform following operation: + // C = A x B, where A, B and C are sparse matrices in csr format. + const int nnzA = A.indices->shape[0]; + const int nnzB = B.indices->shape[0]; + const DType alpha = 1.0; + const DType beta = 0.0; + auto transA = CUSPARSE_OPERATION_NON_TRANSPOSE; + auto transB = CUSPARSE_OPERATION_NON_TRANSPOSE; + // device + auto ctx = A.indptr->ctx; + auto device = runtime::DeviceAPI::Get(ctx); + auto* thr_entry = runtime::CUDAThreadEntry::ThreadLocal(); + cudaStream_t stream = runtime::getCurrentCUDAStream(); + const DType* A_weights = A_weights_array.Ptr(); + const DType* B_weights = B_weights_array.Ptr(); + // allocate cusparse handle if needed + if (!thr_entry->cusparse_handle) { + CUSPARSE_CALL(cusparseCreate(&(thr_entry->cusparse_handle))); + } + CUSPARSE_CALL(cusparseSetStream(thr_entry->cusparse_handle, stream)); + // all one data array + cusparseSpMatDescr_t matA, matB, matC; + IdArray dC_csrOffsets = + IdArray::Empty({A.num_rows + 1}, A.indptr->dtype, A.indptr->ctx); + IdType* dC_csrOffsets_data = dC_csrOffsets.Ptr(); + constexpr auto idtype = cusparse_idtype::value; + constexpr auto dtype = cuda_dtype::value; + // Create sparse matrix A, B and C in CSR format + CUSPARSE_CALL(cusparseCreateCsr( + &matA, A.num_rows, A.num_cols, nnzA, A.indptr.Ptr(), + A.indices.Ptr(), + // cusparseCreateCsr only accepts non-const pointers. + const_cast(A_weights), idtype, idtype, CUSPARSE_INDEX_BASE_ZERO, + dtype)); + CUSPARSE_CALL(cusparseCreateCsr( + &matB, B.num_rows, B.num_cols, nnzB, B.indptr.Ptr(), + B.indices.Ptr(), + // cusparseCreateCsr only accepts non-const pointers. 
+ const_cast(B_weights), idtype, idtype, CUSPARSE_INDEX_BASE_ZERO, + dtype)); + CUSPARSE_CALL(cusparseCreateCsr( + &matC, A.num_rows, B.num_cols, 0, dC_csrOffsets_data, nullptr, nullptr, + idtype, idtype, CUSPARSE_INDEX_BASE_ZERO, dtype)); + // SpGEMM Computation + cusparseSpGEMMDescr_t spgemmDesc; + cusparseSpGEMMAlg_t alg = CUSPARSE_SPGEMM_DEFAULT; + + CUSPARSE_CALL(cusparseSpGEMM_createDescr(&spgemmDesc)); + size_t workspace_size1 = 0, workspace_size2 = 0, workspace_size3 = 0; + // ask bufferSize1 bytes for external memory + CUSPARSE_CALL(cusparseSpGEMM_workEstimation( + thr_entry->cusparse_handle, transA, transB, &alpha, matA, matB, &beta, + matC, dtype, alg, spgemmDesc, &workspace_size1, NULL)); + void* workspace1 = (device->AllocWorkspace(ctx, workspace_size1)); + // inspect the matrices A and B to understand the memory requiremnent + cusparseStatus_t e = cusparseSpGEMM_workEstimation( + thr_entry->cusparse_handle, transA, transB, &alpha, matA, matB, &beta, + matC, dtype, alg, spgemmDesc, &workspace_size1, workspace1); + // CUSPARSE_SPGEMM_DEFAULT not support getting num_prods > 2^31 -1 + // and throws insufficient memory error within workEstimation call + if (e == CUSPARSE_STATUS_INSUFFICIENT_RESOURCES) { + // fall back to ALG2 to estimate num_prods + alg = CUSPARSE_SPGEMM_ALG2; + device->FreeWorkspace(ctx, workspace1); + // rerun cusparseSpGEMM_workEstimation + CUSPARSE_CALL(cusparseSpGEMM_workEstimation( + thr_entry->cusparse_handle, transA, transB, &alpha, matA, matB, &beta, + matC, dtype, alg, spgemmDesc, &workspace_size1, NULL)); + workspace1 = (device->AllocWorkspace(ctx, workspace_size1)); + CUSPARSE_CALL(cusparseSpGEMM_workEstimation( + thr_entry->cusparse_handle, transA, transB, &alpha, matA, matB, &beta, + matC, dtype, alg, spgemmDesc, &workspace_size1, workspace1)); + } else { + CHECK(e == CUSPARSE_STATUS_SUCCESS) << "CUSPARSE ERROR in SpGEMM: " << e; + } + + // get the number of intermediate products required for SpGEMM compute + // num_prods indicates device memory consumption for SpGEMM if using ALG2/3 + int64_t num_prods; + CUSPARSE_CALL(cusparseSpGEMM_getNumProducts(spgemmDesc, &num_prods)); + + // assume free GPU mem at least ~15G for below heuristics to work + // user-defined medium problem size (below will use DEFAULT) + int64_t MEDIUM_NUM_PRODUCTS = 400000000; // 400*1000*1000; + // user-defined large problem size (above will use ALG3) + int64_t LARGE_NUM_PRODUCTS = 800000000; // 800*1000*1000; + + // switch to ALG2/ALG3 for medium & large problem size + if (alg == CUSPARSE_SPGEMM_DEFAULT && num_prods > MEDIUM_NUM_PRODUCTS) { + // use ALG3 for very large problem + alg = num_prods > LARGE_NUM_PRODUCTS ? 
CUSPARSE_SPGEMM_ALG3 + : CUSPARSE_SPGEMM_ALG2; + + device->FreeWorkspace(ctx, workspace1); + // rerun cusparseSpGEMM_workEstimation + CUSPARSE_CALL(cusparseSpGEMM_workEstimation( + thr_entry->cusparse_handle, transA, transB, &alpha, matA, matB, &beta, + matC, dtype, alg, spgemmDesc, &workspace_size1, NULL)); + workspace1 = (device->AllocWorkspace(ctx, workspace_size1)); + CUSPARSE_CALL(cusparseSpGEMM_workEstimation( + thr_entry->cusparse_handle, transA, transB, &alpha, matA, matB, &beta, + matC, dtype, alg, spgemmDesc, &workspace_size1, workspace1)); + } else if (alg == CUSPARSE_SPGEMM_ALG2 && num_prods > LARGE_NUM_PRODUCTS) { + // no need to rerun cusparseSpGEMM_workEstimation between ALG2 and ALG3 + alg = CUSPARSE_SPGEMM_ALG3; + } + + if (alg == CUSPARSE_SPGEMM_ALG2 || alg == CUSPARSE_SPGEMM_ALG3) { + // estimate memory for ALG2/ALG3; note chunk_fraction is only used by ALG3 + // reduce chunk_fraction if crash due to mem., but it trades off speed + float chunk_fraction = num_prods < 4 * LARGE_NUM_PRODUCTS ? 0.15 : 0.05; + CUSPARSE_CALL(cusparseSpGEMM_estimateMemory( + thr_entry->cusparse_handle, transA, transB, &alpha, matA, matB, &beta, + matC, dtype, alg, spgemmDesc, chunk_fraction, &workspace_size3, NULL, + NULL)); + void* workspace3 = (device->AllocWorkspace(ctx, workspace_size3)); + CUSPARSE_CALL(cusparseSpGEMM_estimateMemory( + thr_entry->cusparse_handle, transA, transB, &alpha, matA, matB, &beta, + matC, dtype, alg, spgemmDesc, chunk_fraction, &workspace_size3, + workspace3, &workspace_size2)); + device->FreeWorkspace(ctx, workspace3); + } else { + CUSPARSE_CALL(cusparseSpGEMM_compute( + thr_entry->cusparse_handle, transA, transB, &alpha, matA, matB, &beta, + matC, dtype, alg, spgemmDesc, &workspace_size2, NULL)); + } + // ask bufferSize2 bytes for external memory + void* workspace2 = device->AllocWorkspace(ctx, workspace_size2); + // compute the intermediate product of A * B + CUSPARSE_CALL(cusparseSpGEMM_compute( + thr_entry->cusparse_handle, transA, transB, &alpha, matA, matB, &beta, + matC, dtype, alg, spgemmDesc, &workspace_size2, workspace2)); + // get matrix C non-zero entries C_nnz1 + int64_t C_num_rows1, C_num_cols1, C_nnz1; + CUSPARSE_CALL( + cusparseSpMatGetSize(matC, &C_num_rows1, &C_num_cols1, &C_nnz1)); + IdArray dC_columns = IdArray::Empty({C_nnz1}, A.indptr->dtype, A.indptr->ctx); + NDArray dC_weights = + NDArray::Empty({C_nnz1}, A_weights_array->dtype, A.indptr->ctx); + IdType* dC_columns_data = dC_columns.Ptr(); + DType* dC_weights_data = dC_weights.Ptr(); + // update matC with the new pointers + CUSPARSE_CALL(cusparseCsrSetPointers( + matC, dC_csrOffsets_data, dC_columns_data, dC_weights_data)); + // copy the final products to the matrix C + CUSPARSE_CALL(cusparseSpGEMM_copy( + thr_entry->cusparse_handle, transA, transB, &alpha, matA, matB, &beta, + matC, dtype, alg, spgemmDesc)); + + device->FreeWorkspace(ctx, workspace1); + device->FreeWorkspace(ctx, workspace2); + // destroy matrix/vector descriptors + CUSPARSE_CALL(cusparseSpGEMM_destroyDescr(spgemmDesc)); + CUSPARSE_CALL(cusparseDestroySpMat(matA)); + CUSPARSE_CALL(cusparseDestroySpMat(matB)); + CUSPARSE_CALL(cusparseDestroySpMat(matC)); + return { + CSRMatrix( + A.num_rows, B.num_cols, dC_csrOffsets, dC_columns, + NullArray(dC_csrOffsets->dtype, dC_csrOffsets->ctx)), + dC_weights}; +} + +#else // CUDART_VERSION < 12000 + +/** @brief Cusparse implementation of SpGEMM on Csr format for older CUDA + * versions */ +template +std::pair CusparseSpgemm( + const CSRMatrix& A, const NDArray A_weights_array, const 
CSRMatrix& B, + const NDArray B_weights_array) { + int nnzC; + csrgemm2Info_t info = nullptr; + size_t workspace_size; + const DType alpha = 1.; + const int nnzA = A.indices->shape[0]; + const int nnzB = B.indices->shape[0]; + const int m = A.num_rows; + const int n = A.num_cols; + const int k = B.num_cols; + auto ctx = A.indptr->ctx; + auto device = runtime::DeviceAPI::Get(ctx); + auto* thr_entry = runtime::CUDAThreadEntry::ThreadLocal(); + cudaStream_t stream = runtime::getCurrentCUDAStream(); + auto idtype = A.indptr->dtype; + auto dtype = A_weights_array->dtype; + const DType* A_weights = A_weights_array.Ptr(); + const DType* B_weights = B_weights_array.Ptr(); + if (!thr_entry->cusparse_handle) { + CUSPARSE_CALL(cusparseCreate(&(thr_entry->cusparse_handle))); + } + CUSPARSE_CALL(cusparseSetStream(thr_entry->cusparse_handle, stream)); + CUSPARSE_CALL(cusparseSetPointerMode( + thr_entry->cusparse_handle, CUSPARSE_POINTER_MODE_HOST)); + + CUSPARSE_CALL(cusparseCreateCsrgemm2Info(&info)); + + cusparseMatDescr_t matA, matB, matC, matD; + CUSPARSE_CALL(cusparseCreateMatDescr(&matA)); + CUSPARSE_CALL(cusparseCreateMatDescr(&matB)); + CUSPARSE_CALL(cusparseCreateMatDescr(&matC)); + CUSPARSE_CALL(cusparseCreateMatDescr(&matD)); // needed even if D is null + + CUSPARSE_CALL(CSRGEMM::bufferSizeExt( + thr_entry->cusparse_handle, m, n, k, &alpha, matA, nnzA, + A.indptr.Ptr(), A.indices.Ptr(), matB, nnzB, + B.indptr.Ptr(), B.indices.Ptr(), nullptr, matD, 0, + nullptr, nullptr, info, &workspace_size)); + + void* workspace = device->AllocWorkspace(ctx, workspace_size); + IdArray C_indptr = IdArray::Empty({m + 1}, idtype, ctx); + CUSPARSE_CALL(CSRGEMM::nnz( + thr_entry->cusparse_handle, m, n, k, matA, nnzA, A.indptr.Ptr(), + A.indices.Ptr(), matB, nnzB, B.indptr.Ptr(), + B.indices.Ptr(), matD, 0, nullptr, nullptr, matC, + C_indptr.Ptr(), &nnzC, info, workspace)); + + IdArray C_indices = IdArray::Empty({nnzC}, idtype, ctx); + NDArray C_weights = NDArray::Empty({nnzC}, dtype, ctx); + CUSPARSE_CALL(CSRGEMM::compute( + thr_entry->cusparse_handle, m, n, k, &alpha, matA, nnzA, A_weights, + A.indptr.Ptr(), A.indices.Ptr(), matB, nnzB, B_weights, + B.indptr.Ptr(), B.indices.Ptr(), nullptr, matD, 0, + nullptr, nullptr, nullptr, matC, C_weights.Ptr(), + C_indptr.Ptr(), C_indices.Ptr(), info, workspace)); + + device->FreeWorkspace(ctx, workspace); + CUSPARSE_CALL(cusparseDestroyCsrgemm2Info(info)); + CUSPARSE_CALL(cusparseDestroyMatDescr(matA)); + CUSPARSE_CALL(cusparseDestroyMatDescr(matB)); + CUSPARSE_CALL(cusparseDestroyMatDescr(matC)); + CUSPARSE_CALL(cusparseDestroyMatDescr(matD)); + + return { + CSRMatrix( + m, k, C_indptr, C_indices, NullArray(C_indptr->dtype, C_indptr->ctx)), + C_weights}; +} + +#endif // CUDART_VERSION >= 12000 +} // namespace cusparse + +template +std::pair CSRMM( + const CSRMatrix& A, NDArray A_weights, const CSRMatrix& B, + NDArray B_weights) { + auto ctx = A.indptr->ctx; + auto device = runtime::DeviceAPI::Get(ctx); + CSRMatrix newA, newB; + bool cast = false; + + // Cast 64 bit indices to 32 bit. 
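For illustration only — not part of this patch: the branch just below narrows 64-bit CSR indices to 32-bit via AsNumBits(..., 32) because the csrgemm2/SpGEMM path operates on int indices. A hypothetical guard (not DGL code) that could be run before narrowing to confirm the values actually fit:

#include <cstdint>
#include <limits>
#include <stdexcept>

// Throws if a 64-bit CSR dimension or index cannot be represented in int32_t.
inline int32_t NarrowIndexOrThrow(int64_t v) {
  if (v < 0 || v > std::numeric_limits<int32_t>::max())
    throw std::overflow_error("CSR index does not fit in int32_t");
  return static_cast<int32_t>(v);
}

// Example: NarrowIndexOrThrow(A.num_rows); NarrowIndexOrThrow(nnzA);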
+ if (A.indptr->dtype.bits == 64) { + newA = CSRMatrix( + A.num_rows, A.num_cols, AsNumBits(A.indptr, 32), + AsNumBits(A.indices, 32), AsNumBits(A.data, 32)); + newB = CSRMatrix( + B.num_rows, B.num_cols, AsNumBits(B.indptr, 32), + AsNumBits(B.indices, 32), AsNumBits(B.data, 32)); + cast = true; + } + + // Reorder weights if A or B has edge IDs + NDArray newA_weights, newB_weights; + if (CSRHasData(A)) newA_weights = IndexSelect(A_weights, A.data); + if (CSRHasData(B)) newB_weights = IndexSelect(B_weights, B.data); + + auto result = cusparse::CusparseSpgemm( + cast ? newA : A, CSRHasData(A) ? newA_weights : A_weights, + cast ? newB : B, CSRHasData(B) ? newB_weights : B_weights); + + // Cast 32 bit indices back to 64 bit if necessary + if (cast) { + CSRMatrix C = result.first; + return { + CSRMatrix( + C.num_rows, C.num_cols, AsNumBits(C.indptr, 64), + AsNumBits(C.indices, 64), AsNumBits(C.data, 64)), + result.second}; + } else { + return result; + } +} + +template std::pair CSRMM( + const CSRMatrix&, NDArray, const CSRMatrix&, NDArray); +template std::pair CSRMM( + const CSRMatrix&, NDArray, const CSRMatrix&, NDArray); +#if BF16_ENABLED +template std::pair CSRMM( + const CSRMatrix&, NDArray, const CSRMatrix&, NDArray); +template std::pair CSRMM( + const CSRMatrix&, NDArray, const CSRMatrix&, NDArray); +#endif // BF16_ENABLED +template std::pair CSRMM( + const CSRMatrix&, NDArray, const CSRMatrix&, NDArray); +template std::pair CSRMM( + const CSRMatrix&, NDArray, const CSRMatrix&, NDArray); +template std::pair CSRMM( + const CSRMatrix&, NDArray, const CSRMatrix&, NDArray); +template std::pair CSRMM( + const CSRMatrix&, NDArray, const CSRMatrix&, NDArray); + +} // namespace aten +} // namespace dgl diff --git a/src/array/cuda/csr_sort.cu b/src/array/cuda/csr_sort.cu index 448d36e44529..e6f8d65c7d13 100644 --- a/src/array/cuda/csr_sort.cu +++ b/src/array/cuda/csr_sort.cu @@ -1,3 +1,4 @@ +#include "hip/hip_runtime.h" /** * Copyright (c) 2020 by Contributors * @file array/cuda/csr_sort.cc @@ -5,7 +6,7 @@ */ #include -#include +#include #include "../../runtime/cuda/cuda_common.h" #include "./utils.h" @@ -39,7 +40,7 @@ __global__ void _SegmentIsSorted( template bool CSRIsSorted(CSRMatrix csr) { const auto& ctx = csr.indptr->ctx; - cudaStream_t stream = runtime::getCurrentCUDAStream(); + hipStream_t stream = runtime::getCurrentCUDAStream(); auto device = runtime::DeviceAPI::Get(ctx); // We allocate a workspace of num_rows bytes. It wastes a little bit memory // but should be fine. 
@@ -67,12 +68,12 @@ template <> void CSRSort_(CSRMatrix* csr) { auto* thr_entry = runtime::CUDAThreadEntry::ThreadLocal(); auto device = runtime::DeviceAPI::Get(csr->indptr->ctx); - cudaStream_t stream = runtime::getCurrentCUDAStream(); + hipStream_t stream = runtime::getCurrentCUDAStream(); // allocate cusparse handle if needed if (!thr_entry->cusparse_handle) { - CUSPARSE_CALL(cusparseCreate(&(thr_entry->cusparse_handle))); + CUSPARSE_CALL(hipsparseCreate(&(thr_entry->cusparse_handle))); } - CUSPARSE_CALL(cusparseSetStream(thr_entry->cusparse_handle, stream)); + CUSPARSE_CALL(hipsparseSetStream(thr_entry->cusparse_handle, stream)); NDArray indptr = csr->indptr; NDArray indices = csr->indices; @@ -83,16 +84,16 @@ void CSRSort_(CSRMatrix* csr) { NDArray data = csr->data; size_t workspace_size = 0; - CUSPARSE_CALL(cusparseXcsrsort_bufferSizeExt( + CUSPARSE_CALL(hipsparseXcsrsort_bufferSizeExt( thr_entry->cusparse_handle, csr->num_rows, csr->num_cols, nnz, indptr.Ptr(), indices.Ptr(), &workspace_size)); void* workspace = device->AllocWorkspace(ctx, workspace_size); - cusparseMatDescr_t descr; - CUSPARSE_CALL(cusparseCreateMatDescr(&descr)); - CUSPARSE_CALL(cusparseSetMatType(descr, CUSPARSE_MATRIX_TYPE_GENERAL)); - CUSPARSE_CALL(cusparseSetMatIndexBase(descr, CUSPARSE_INDEX_BASE_ZERO)); - CUSPARSE_CALL(cusparseXcsrsort( + hipsparseMatDescr_t descr; + CUSPARSE_CALL(hipsparseCreateMatDescr(&descr)); + CUSPARSE_CALL(hipsparseSetMatType(descr, HIPSPARSE_MATRIX_TYPE_GENERAL)); + CUSPARSE_CALL(hipsparseSetMatIndexBase(descr, HIPSPARSE_INDEX_BASE_ZERO)); + CUSPARSE_CALL(hipsparseXcsrsort( thr_entry->cusparse_handle, csr->num_rows, csr->num_cols, nnz, descr, indptr.Ptr(), indices.Ptr(), data.Ptr(), workspace)); @@ -100,13 +101,13 @@ void CSRSort_(CSRMatrix* csr) { csr->sorted = true; // free resources - CUSPARSE_CALL(cusparseDestroyMatDescr(descr)); + CUSPARSE_CALL(hipsparseDestroyMatDescr(descr)); device->FreeWorkspace(ctx, workspace); } template <> void CSRSort_(CSRMatrix* csr) { - cudaStream_t stream = runtime::getCurrentCUDAStream(); + hipStream_t stream = runtime::getCurrentCUDAStream(); auto device = runtime::DeviceAPI::Get(csr->indptr->ctx); const auto& ctx = csr->indptr->ctx; @@ -125,13 +126,13 @@ void CSRSort_(CSRMatrix* csr) { // Allocate workspace size_t workspace_size = 0; - CUDA_CALL(cub::DeviceSegmentedRadixSort::SortPairs( + CUDA_CALL(hipcub::DeviceSegmentedRadixSort::SortPairs( nullptr, workspace_size, key_in, key_out, value_in, value_out, nnz, csr->num_rows, offsets, offsets + 1, 0, sizeof(int64_t) * 8, stream)); void* workspace = device->AllocWorkspace(ctx, workspace_size); // Compute - CUDA_CALL(cub::DeviceSegmentedRadixSort::SortPairs( + CUDA_CALL(hipcub::DeviceSegmentedRadixSort::SortPairs( workspace, workspace_size, key_in, key_out, value_in, value_out, nnz, csr->num_rows, offsets, offsets + 1, 0, sizeof(int64_t) * 8, stream)); diff --git a/src/array/cuda/csr_sort.cu.prehip b/src/array/cuda/csr_sort.cu.prehip new file mode 100644 index 000000000000..448d36e44529 --- /dev/null +++ b/src/array/cuda/csr_sort.cu.prehip @@ -0,0 +1,151 @@ +/** + * Copyright (c) 2020 by Contributors + * @file array/cuda/csr_sort.cc + * @brief Sort CSR index + */ +#include + +#include + +#include "../../runtime/cuda/cuda_common.h" +#include "./utils.h" + +namespace dgl { + +using runtime::NDArray; + +namespace aten { +namespace impl { + +/** + * @brief Check whether each row is sorted. 
+ */ +template +__global__ void _SegmentIsSorted( + const IdType* indptr, const IdType* indices, int64_t num_rows, + int8_t* flags) { + int tx = blockIdx.x * blockDim.x + threadIdx.x; + const int stride_x = gridDim.x * blockDim.x; + while (tx < num_rows) { + bool f = true; + for (IdType i = indptr[tx] + 1; f && i < indptr[tx + 1]; ++i) { + f = (indices[i - 1] <= indices[i]); + } + flags[tx] = static_cast(f); + tx += stride_x; + } +} + +template +bool CSRIsSorted(CSRMatrix csr) { + const auto& ctx = csr.indptr->ctx; + cudaStream_t stream = runtime::getCurrentCUDAStream(); + auto device = runtime::DeviceAPI::Get(ctx); + // We allocate a workspace of num_rows bytes. It wastes a little bit memory + // but should be fine. + int8_t* flags = + static_cast(device->AllocWorkspace(ctx, csr.num_rows)); + const int nt = cuda::FindNumThreads(csr.num_rows); + const int nb = (csr.num_rows + nt - 1) / nt; + CUDA_KERNEL_CALL( + _SegmentIsSorted, nb, nt, 0, stream, csr.indptr.Ptr(), + csr.indices.Ptr(), csr.num_rows, flags); + bool ret = cuda::AllTrue(flags, csr.num_rows, ctx); + device->FreeWorkspace(ctx, flags); + return ret; +} + +template bool CSRIsSorted(CSRMatrix csr); +template bool CSRIsSorted(CSRMatrix csr); + +template +void CSRSort_(CSRMatrix* csr) { + LOG(FATAL) << "Unreachable codes"; +} + +template <> +void CSRSort_(CSRMatrix* csr) { + auto* thr_entry = runtime::CUDAThreadEntry::ThreadLocal(); + auto device = runtime::DeviceAPI::Get(csr->indptr->ctx); + cudaStream_t stream = runtime::getCurrentCUDAStream(); + // allocate cusparse handle if needed + if (!thr_entry->cusparse_handle) { + CUSPARSE_CALL(cusparseCreate(&(thr_entry->cusparse_handle))); + } + CUSPARSE_CALL(cusparseSetStream(thr_entry->cusparse_handle, stream)); + + NDArray indptr = csr->indptr; + NDArray indices = csr->indices; + const auto& ctx = indptr->ctx; + const int64_t nnz = indices->shape[0]; + if (!aten::CSRHasData(*csr)) + csr->data = aten::Range(0, nnz, indices->dtype.bits, ctx); + NDArray data = csr->data; + + size_t workspace_size = 0; + CUSPARSE_CALL(cusparseXcsrsort_bufferSizeExt( + thr_entry->cusparse_handle, csr->num_rows, csr->num_cols, nnz, + indptr.Ptr(), indices.Ptr(), &workspace_size)); + void* workspace = device->AllocWorkspace(ctx, workspace_size); + + cusparseMatDescr_t descr; + CUSPARSE_CALL(cusparseCreateMatDescr(&descr)); + CUSPARSE_CALL(cusparseSetMatType(descr, CUSPARSE_MATRIX_TYPE_GENERAL)); + CUSPARSE_CALL(cusparseSetMatIndexBase(descr, CUSPARSE_INDEX_BASE_ZERO)); + CUSPARSE_CALL(cusparseXcsrsort( + thr_entry->cusparse_handle, csr->num_rows, csr->num_cols, nnz, descr, + indptr.Ptr(), indices.Ptr(), data.Ptr(), + workspace)); + + csr->sorted = true; + + // free resources + CUSPARSE_CALL(cusparseDestroyMatDescr(descr)); + device->FreeWorkspace(ctx, workspace); +} + +template <> +void CSRSort_(CSRMatrix* csr) { + cudaStream_t stream = runtime::getCurrentCUDAStream(); + auto device = runtime::DeviceAPI::Get(csr->indptr->ctx); + + const auto& ctx = csr->indptr->ctx; + const int64_t nnz = csr->indices->shape[0]; + const auto nbits = csr->indptr->dtype.bits; + if (!aten::CSRHasData(*csr)) csr->data = aten::Range(0, nnz, nbits, ctx); + + IdArray new_indices = csr->indices.Clone(); + IdArray new_data = csr->data.Clone(); + + const int64_t* offsets = csr->indptr.Ptr(); + const int64_t* key_in = csr->indices.Ptr(); + int64_t* key_out = new_indices.Ptr(); + const int64_t* value_in = csr->data.Ptr(); + int64_t* value_out = new_data.Ptr(); + + // Allocate workspace + size_t workspace_size = 0; + 
CUDA_CALL(cub::DeviceSegmentedRadixSort::SortPairs( + nullptr, workspace_size, key_in, key_out, value_in, value_out, nnz, + csr->num_rows, offsets, offsets + 1, 0, sizeof(int64_t) * 8, stream)); + void* workspace = device->AllocWorkspace(ctx, workspace_size); + + // Compute + CUDA_CALL(cub::DeviceSegmentedRadixSort::SortPairs( + workspace, workspace_size, key_in, key_out, value_in, value_out, nnz, + csr->num_rows, offsets, offsets + 1, 0, sizeof(int64_t) * 8, stream)); + + csr->sorted = true; + csr->indices = new_indices; + csr->data = new_data; + + // free resources + device->FreeWorkspace(ctx, workspace); +} + +template void CSRSort_(CSRMatrix* csr); +template void CSRSort_(CSRMatrix* csr); + +} // namespace impl +} // namespace aten +} // namespace dgl diff --git a/src/array/cuda/csr_sum.cu b/src/array/cuda/csr_sum.cu index b7564309c5cf..5ca340ef568f 100644 --- a/src/array/cuda/csr_sum.cu +++ b/src/array/cuda/csr_sum.cu @@ -32,21 +32,21 @@ std::pair CusparseCsrgeam2( auto ctx = A.indptr->ctx; auto device = runtime::DeviceAPI::Get(ctx); auto* thr_entry = runtime::CUDAThreadEntry::ThreadLocal(); - cudaStream_t stream = runtime::getCurrentCUDAStream(); + hipStream_t stream = runtime::getCurrentCUDAStream(); const DType* A_weights = A_weights_array.Ptr(); const DType* B_weights = B_weights_array.Ptr(); // allocate cusparse handle if needed if (!thr_entry->cusparse_handle) - CUSPARSE_CALL(cusparseCreate(&(thr_entry->cusparse_handle))); - CUSPARSE_CALL(cusparseSetStream(thr_entry->cusparse_handle, stream)); + CUSPARSE_CALL(hipsparseCreate(&(thr_entry->cusparse_handle))); + CUSPARSE_CALL(hipsparseSetStream(thr_entry->cusparse_handle, stream)); - cusparseMatDescr_t matA, matB, matC; - CUSPARSE_CALL(cusparseCreateMatDescr(&matA)); - CUSPARSE_CALL(cusparseCreateMatDescr(&matB)); - CUSPARSE_CALL(cusparseCreateMatDescr(&matC)); + hipsparseMatDescr_t matA, matB, matC; + CUSPARSE_CALL(hipsparseCreateMatDescr(&matA)); + CUSPARSE_CALL(hipsparseCreateMatDescr(&matB)); + CUSPARSE_CALL(hipsparseCreateMatDescr(&matC)); - cusparseSetPointerMode( - thr_entry->cusparse_handle, CUSPARSE_POINTER_MODE_HOST); + hipsparseSetPointerMode( + thr_entry->cusparse_handle, HIPSPARSE_POINTER_MODE_HOST); size_t workspace_size = 0; /* prepare output C */ IdArray dC_csrOffsets = IdArray::Empty({m + 1}, A.indptr->dtype, ctx); @@ -81,9 +81,9 @@ std::pair CusparseCsrgeam2( device->FreeWorkspace(ctx, workspace); // destroy matrix/vector descriptors - CUSPARSE_CALL(cusparseDestroyMatDescr(matA)); - CUSPARSE_CALL(cusparseDestroyMatDescr(matB)); - CUSPARSE_CALL(cusparseDestroyMatDescr(matC)); + CUSPARSE_CALL(hipsparseDestroyMatDescr(matA)); + CUSPARSE_CALL(hipsparseDestroyMatDescr(matB)); + CUSPARSE_CALL(hipsparseDestroyMatDescr(matC)); return { CSRMatrix( A.num_rows, A.num_cols, dC_csrOffsets, dC_columns, @@ -159,9 +159,9 @@ template std::pair CSRSum( template std::pair CSRSum( const std::vector&, const std::vector&); #if BF16_ENABLED -template std::pair CSRSum( +template std::pair CSRSum( const std::vector&, const std::vector&); -template std::pair CSRSum( +template std::pair CSRSum( const std::vector&, const std::vector&); #endif // BF16_ENABLED template std::pair CSRSum( diff --git a/src/array/cuda/csr_sum.cu.prehip b/src/array/cuda/csr_sum.cu.prehip new file mode 100644 index 000000000000..b7564309c5cf --- /dev/null +++ b/src/array/cuda/csr_sum.cu.prehip @@ -0,0 +1,177 @@ +/** + * Copyright (c) 2020 by Contributors + * @file array/cuda/spmm.cu + * @brief SpGEAM C APIs and definitions. 
+ */ +#include +#include + +#include "../../runtime/cuda/cuda_common.h" +#include "./cusparse_dispatcher.cuh" +#include "./functor.cuh" + +namespace dgl { + +using namespace dgl::runtime; + +namespace aten { +namespace cusparse { + +/** Cusparse implementation of SpSum on Csr format. */ +template +std::pair CusparseCsrgeam2( + const CSRMatrix& A, const NDArray A_weights_array, const CSRMatrix& B, + const NDArray B_weights_array) { + const int m = A.num_rows; + const int n = A.num_cols; + const int nnzA = A.indices->shape[0]; + const int nnzB = B.indices->shape[0]; + int nnzC; + const DType alpha = 1.0; + const DType beta = 1.0; + auto ctx = A.indptr->ctx; + auto device = runtime::DeviceAPI::Get(ctx); + auto* thr_entry = runtime::CUDAThreadEntry::ThreadLocal(); + cudaStream_t stream = runtime::getCurrentCUDAStream(); + const DType* A_weights = A_weights_array.Ptr(); + const DType* B_weights = B_weights_array.Ptr(); + // allocate cusparse handle if needed + if (!thr_entry->cusparse_handle) + CUSPARSE_CALL(cusparseCreate(&(thr_entry->cusparse_handle))); + CUSPARSE_CALL(cusparseSetStream(thr_entry->cusparse_handle, stream)); + + cusparseMatDescr_t matA, matB, matC; + CUSPARSE_CALL(cusparseCreateMatDescr(&matA)); + CUSPARSE_CALL(cusparseCreateMatDescr(&matB)); + CUSPARSE_CALL(cusparseCreateMatDescr(&matC)); + + cusparseSetPointerMode( + thr_entry->cusparse_handle, CUSPARSE_POINTER_MODE_HOST); + size_t workspace_size = 0; + /* prepare output C */ + IdArray dC_csrOffsets = IdArray::Empty({m + 1}, A.indptr->dtype, ctx); + IdType* dC_csrOffsets_data = dC_csrOffsets.Ptr(); + IdArray dC_columns; + NDArray dC_weights; + IdType* dC_columns_data = dC_columns.Ptr(); + DType* dC_weights_data = dC_weights.Ptr(); + /* prepare buffer */ + CUSPARSE_CALL(CSRGEAM::bufferSizeExt( + thr_entry->cusparse_handle, m, n, &alpha, matA, nnzA, A_weights, + A.indptr.Ptr(), A.indices.Ptr(), &beta, matB, nnzB, + B_weights, B.indptr.Ptr(), B.indices.Ptr(), matC, + dC_weights_data, dC_csrOffsets_data, dC_columns_data, &workspace_size)); + + void* workspace = device->AllocWorkspace(ctx, workspace_size); + CUSPARSE_CALL(CSRGEAM::nnz( + thr_entry->cusparse_handle, m, n, matA, nnzA, A.indptr.Ptr(), + A.indices.Ptr(), matB, nnzB, B.indptr.Ptr(), + B.indices.Ptr(), matC, dC_csrOffsets_data, &nnzC, workspace)); + + dC_columns = IdArray::Empty({nnzC}, A.indptr->dtype, ctx); + dC_weights = NDArray::Empty({nnzC}, A_weights_array->dtype, ctx); + dC_columns_data = dC_columns.Ptr(); + dC_weights_data = dC_weights.Ptr(); + + CUSPARSE_CALL(CSRGEAM::compute( + thr_entry->cusparse_handle, m, n, &alpha, matA, nnzA, A_weights, + A.indptr.Ptr(), A.indices.Ptr(), &beta, matB, nnzB, + B_weights, B.indptr.Ptr(), B.indices.Ptr(), matC, + dC_weights_data, dC_csrOffsets_data, dC_columns_data, workspace)); + + device->FreeWorkspace(ctx, workspace); + // destroy matrix/vector descriptors + CUSPARSE_CALL(cusparseDestroyMatDescr(matA)); + CUSPARSE_CALL(cusparseDestroyMatDescr(matB)); + CUSPARSE_CALL(cusparseDestroyMatDescr(matC)); + return { + CSRMatrix( + A.num_rows, A.num_cols, dC_csrOffsets, dC_columns, + NullArray(dC_csrOffsets->dtype, dC_csrOffsets->ctx), true), + dC_weights}; +} +} // namespace cusparse + +template +std::pair CSRSum( + const std::vector& As, const std::vector& A_weights) { + const int64_t M = As[0].num_rows; + const int64_t N = As[0].num_cols; + const int64_t n = As.size(); + + // Cast 64 bit indices to 32 bit + std::vector newAs; + newAs.reserve(n); + bool cast = false; + if (As[0].indptr->dtype.bits == 64) { + for (int i = 0; i < 
n; ++i) + newAs.emplace_back( + As[i].num_rows, As[i].num_cols, AsNumBits(As[i].indptr, 32), + AsNumBits(As[i].indices, 32), AsNumBits(As[i].data, 32)); + cast = true; + } else { + for (int i = 0; i < n; ++i) newAs.push_back(As[i]); + } + + // cuSPARSE csrgeam2 requires the CSR to be sorted. + // TODO(BarclayII): ideally the sorted CSR should be cached but I'm not sure + // how to do it. + for (int i = 0; i < n; ++i) { + if (!newAs[i].sorted) newAs[i] = CSRSort(newAs[i]); + } + + // Reorder weights if A[i] has edge IDs + std::vector A_weights_reordered(n); + for (int i = 0; i < n; ++i) { + if (CSRHasData(newAs[i])) + A_weights_reordered[i] = IndexSelect(A_weights[i], newAs[i].data); + else + A_weights_reordered[i] = A_weights[i]; + } + + // Loop and sum + auto result = std::make_pair( + CSRMatrix( + newAs[0].num_rows, newAs[0].num_cols, newAs[0].indptr, + newAs[0].indices, + NullArray(newAs[0].indptr->dtype, newAs[0].indptr->ctx)), + A_weights_reordered[0]); // Weights already reordered so we don't need + // As[0].data + for (int64_t i = 1; i < n; ++i) + result = cusparse::CusparseCsrgeam2( + result.first, result.second, newAs[i], A_weights_reordered[i]); + + // Cast 32 bit indices back to 64 bit if necessary + if (cast) { + CSRMatrix C = result.first; + return { + CSRMatrix( + C.num_rows, C.num_cols, AsNumBits(C.indptr, 64), + AsNumBits(C.indices, 64), AsNumBits(C.data, 64), true), + result.second}; + } else { + return result; + } +} + +template std::pair CSRSum( + const std::vector&, const std::vector&); +template std::pair CSRSum( + const std::vector&, const std::vector&); +#if BF16_ENABLED +template std::pair CSRSum( + const std::vector&, const std::vector&); +template std::pair CSRSum( + const std::vector&, const std::vector&); +#endif // BF16_ENABLED +template std::pair CSRSum( + const std::vector&, const std::vector&); +template std::pair CSRSum( + const std::vector&, const std::vector&); +template std::pair CSRSum( + const std::vector&, const std::vector&); +template std::pair CSRSum( + const std::vector&, const std::vector&); + +} // namespace aten +} // namespace dgl diff --git a/src/array/cuda/csr_transpose.cc b/src/array/cuda/csr_transpose.cc index 60dee39048cb..c5321f431fe4 100644 --- a/src/array/cuda/csr_transpose.cc +++ b/src/array/cuda/csr_transpose.cc @@ -24,12 +24,12 @@ template <> CSRMatrix CSRTranspose(CSRMatrix csr) { #if CUDART_VERSION < 12000 auto* thr_entry = runtime::CUDAThreadEntry::ThreadLocal(); - cudaStream_t stream = runtime::getCurrentCUDAStream(); + hipStream_t stream = runtime::getCurrentCUDAStream(); // allocate cusparse handle if needed if (!thr_entry->cusparse_handle) { - CUSPARSE_CALL(cusparseCreate(&(thr_entry->cusparse_handle))); + CUSPARSE_CALL(hipsparseCreate(&(thr_entry->cusparse_handle))); } - CUSPARSE_CALL(cusparseSetStream(thr_entry->cusparse_handle, stream)); + CUSPARSE_CALL(hipsparseSetStream(thr_entry->cusparse_handle, stream)); NDArray indptr = csr.indptr, indices = csr.indices, data = csr.data; const int64_t nnz = indices->shape[0]; @@ -53,26 +53,26 @@ CSRMatrix CSRTranspose(CSRMatrix csr) { auto device = runtime::DeviceAPI::Get(csr.indptr->ctx); // workspace size_t workspace_size; - CUSPARSE_CALL(cusparseCsr2cscEx2_bufferSize( + CUSPARSE_CALL(hipsparseCsr2cscEx2_bufferSize( thr_entry->cusparse_handle, csr.num_rows, csr.num_cols, nnz, data_ptr, indptr_ptr, indices_ptr, t_data_ptr, t_indptr_ptr, t_indices_ptr, - CUDA_R_32F, CUSPARSE_ACTION_NUMERIC, CUSPARSE_INDEX_BASE_ZERO, - CUSPARSE_CSR2CSC_ALG1, // see cusparse doc for reference + 
HIP_R_32F, HIPSPARSE_ACTION_NUMERIC, HIPSPARSE_INDEX_BASE_ZERO, + HIPSPARSE_CSR2CSC_ALG1, // see cusparse doc for reference &workspace_size)); void* workspace = device->AllocWorkspace(ctx, workspace_size); - CUSPARSE_CALL(cusparseCsr2cscEx2( + CUSPARSE_CALL(hipsparseCsr2cscEx2( thr_entry->cusparse_handle, csr.num_rows, csr.num_cols, nnz, data_ptr, indptr_ptr, indices_ptr, t_data_ptr, t_indptr_ptr, t_indices_ptr, - CUDA_R_32F, CUSPARSE_ACTION_NUMERIC, CUSPARSE_INDEX_BASE_ZERO, - CUSPARSE_CSR2CSC_ALG1, // see cusparse doc for reference + HIP_R_32F, HIPSPARSE_ACTION_NUMERIC, HIPSPARSE_INDEX_BASE_ZERO, + HIPSPARSE_CSR2CSC_ALG1, // see cusparse doc for reference workspace)); device->FreeWorkspace(ctx, workspace); #else - CUSPARSE_CALL(cusparseScsr2csc( + CUSPARSE_CALL(hipsparseScsr2csc( thr_entry->cusparse_handle, csr.num_rows, csr.num_cols, nnz, static_cast(data_ptr), indptr_ptr, indices_ptr, static_cast(t_data_ptr), t_indices_ptr, t_indptr_ptr, - CUSPARSE_ACTION_NUMERIC, CUSPARSE_INDEX_BASE_ZERO)); + HIPSPARSE_ACTION_NUMERIC, HIPSPARSE_INDEX_BASE_ZERO)); #endif return CSRMatrix( diff --git a/src/array/cuda/csr_transpose.cc.prehip b/src/array/cuda/csr_transpose.cc.prehip new file mode 100644 index 000000000000..60dee39048cb --- /dev/null +++ b/src/array/cuda/csr_transpose.cc.prehip @@ -0,0 +1,95 @@ +/** + * Copyright (c) 2020 by Contributors + * @file array/cuda/csr_transpose.cc + * @brief CSR transpose (convert to CSC) + */ +#include + +#include "../../runtime/cuda/cuda_common.h" + +namespace dgl { + +using runtime::NDArray; + +namespace aten { +namespace impl { + +template +CSRMatrix CSRTranspose(CSRMatrix csr) { + LOG(FATAL) << "Unreachable codes"; + return {}; +} + +template <> +CSRMatrix CSRTranspose(CSRMatrix csr) { +#if CUDART_VERSION < 12000 + auto* thr_entry = runtime::CUDAThreadEntry::ThreadLocal(); + cudaStream_t stream = runtime::getCurrentCUDAStream(); + // allocate cusparse handle if needed + if (!thr_entry->cusparse_handle) { + CUSPARSE_CALL(cusparseCreate(&(thr_entry->cusparse_handle))); + } + CUSPARSE_CALL(cusparseSetStream(thr_entry->cusparse_handle, stream)); + + NDArray indptr = csr.indptr, indices = csr.indices, data = csr.data; + const int64_t nnz = indices->shape[0]; + const auto& ctx = indptr->ctx; + const auto bits = indptr->dtype.bits; + if (aten::IsNullArray(data)) data = aten::Range(0, nnz, bits, ctx); + const int32_t* indptr_ptr = static_cast(indptr->data); + const int32_t* indices_ptr = static_cast(indices->data); + const void* data_ptr = data->data; + + // (BarclayII) csr2csc doesn't seem to clear the content of cscColPtr if nnz + // == 0. We need to do it ourselves. 
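For reference only — not part of this patch: this is what the csr2csc conversion below computes, written as plain CPU C++. Count nonzeros per column, prefix-sum the counts into column pointers, then scatter each entry; zero-initializing the column pointers also covers the nnz == 0 case noted in the comment above.

#include <cstdint>
#include <vector>

void CsrToCscCpu(
    int64_t num_rows, int64_t num_cols, const std::vector<int32_t>& indptr,
    const std::vector<int32_t>& indices, const std::vector<int32_t>& data,
    std::vector<int32_t>* t_indptr, std::vector<int32_t>* t_indices,
    std::vector<int32_t>* t_data) {
  const int64_t nnz = static_cast<int64_t>(indices.size());
  t_indptr->assign(num_cols + 1, 0);  // all zeros: also correct when nnz == 0
  t_indices->assign(nnz, 0);
  t_data->assign(nnz, 0);
  // Count entries per column, then turn counts into column offsets.
  for (int64_t e = 0; e < nnz; ++e) (*t_indptr)[indices[e] + 1] += 1;
  for (int64_t c = 0; c < num_cols; ++c) (*t_indptr)[c + 1] += (*t_indptr)[c];
  // Scatter each nonzero into its column, tracking a per-column cursor.
  std::vector<int32_t> cursor(t_indptr->begin(), t_indptr->end() - 1);
  for (int64_t r = 0; r < num_rows; ++r) {
    for (int32_t e = indptr[r]; e < indptr[r + 1]; ++e) {
      const int32_t c = indices[e];
      const int32_t pos = cursor[c]++;
      (*t_indices)[pos] = static_cast<int32_t>(r);
      (*t_data)[pos] = data[e];
    }
  }
}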
+ NDArray t_indptr = aten::Full(0, csr.num_cols + 1, bits, ctx); + NDArray t_indices = aten::NewIdArray(nnz, ctx, bits); + NDArray t_data = aten::NewIdArray(nnz, ctx, bits); + int32_t* t_indptr_ptr = static_cast(t_indptr->data); + int32_t* t_indices_ptr = static_cast(t_indices->data); + void* t_data_ptr = t_data->data; + +#if CUDART_VERSION >= 10010 + auto device = runtime::DeviceAPI::Get(csr.indptr->ctx); + // workspace + size_t workspace_size; + CUSPARSE_CALL(cusparseCsr2cscEx2_bufferSize( + thr_entry->cusparse_handle, csr.num_rows, csr.num_cols, nnz, data_ptr, + indptr_ptr, indices_ptr, t_data_ptr, t_indptr_ptr, t_indices_ptr, + CUDA_R_32F, CUSPARSE_ACTION_NUMERIC, CUSPARSE_INDEX_BASE_ZERO, + CUSPARSE_CSR2CSC_ALG1, // see cusparse doc for reference + &workspace_size)); + void* workspace = device->AllocWorkspace(ctx, workspace_size); + CUSPARSE_CALL(cusparseCsr2cscEx2( + thr_entry->cusparse_handle, csr.num_rows, csr.num_cols, nnz, data_ptr, + indptr_ptr, indices_ptr, t_data_ptr, t_indptr_ptr, t_indices_ptr, + CUDA_R_32F, CUSPARSE_ACTION_NUMERIC, CUSPARSE_INDEX_BASE_ZERO, + CUSPARSE_CSR2CSC_ALG1, // see cusparse doc for reference + workspace)); + device->FreeWorkspace(ctx, workspace); +#else + CUSPARSE_CALL(cusparseScsr2csc( + thr_entry->cusparse_handle, csr.num_rows, csr.num_cols, nnz, + static_cast(data_ptr), indptr_ptr, indices_ptr, + static_cast(t_data_ptr), t_indices_ptr, t_indptr_ptr, + CUSPARSE_ACTION_NUMERIC, CUSPARSE_INDEX_BASE_ZERO)); +#endif + + return CSRMatrix( + csr.num_cols, csr.num_rows, t_indptr, t_indices, t_data, false); +#else + return COOToCSR(COOTranspose(CSRToCOO(csr, false))); +#endif +} + +template <> +CSRMatrix CSRTranspose(CSRMatrix csr) { + return COOToCSR(COOTranspose(CSRToCOO(csr, false))); +} + +template CSRMatrix CSRTranspose(CSRMatrix csr); +template CSRMatrix CSRTranspose(CSRMatrix csr); + +} // namespace impl +} // namespace aten +} // namespace dgl diff --git a/src/array/cuda/cuda_filter.cu b/src/array/cuda/cuda_filter.cu index 9bbd18f42307..72f727d40554 100644 --- a/src/array/cuda/cuda_filter.cu +++ b/src/array/cuda/cuda_filter.cu @@ -1,3 +1,4 @@ +#include "hip/hip_runtime.h" /** * Copyright (c) 2021 by Contributors * @file array/cuda/cuda_filter.cc @@ -6,7 +7,7 @@ #include -#include +#include #include "../../runtime/cuda/cuda_common.h" #include "../../runtime/cuda/cuda_hashtable.cuh" @@ -45,7 +46,7 @@ IdArray _PerformFilter(const OrderedHashTable& table, IdArray test) { const auto& ctx = test->ctx; auto device = runtime::DeviceAPI::Get(ctx); const int64_t size = test->shape[0]; - cudaStream_t cudaStream = runtime::getCurrentCUDAStream(); + hipStream_t cudaStream = runtime::getCurrentCUDAStream(); if (size == 0) { return test; @@ -74,12 +75,12 @@ IdArray _PerformFilter(const OrderedHashTable& table, IdArray test) { // generate prefix-sum { size_t workspace_bytes; - CUDA_CALL(cub::DeviceScan::ExclusiveSum( + CUDA_CALL(hipcub::DeviceScan::ExclusiveSum( nullptr, workspace_bytes, static_cast(nullptr), static_cast(nullptr), size + 1, cudaStream)); void* workspace = device->AllocWorkspace(ctx, workspace_bytes); - CUDA_CALL(cub::DeviceScan::ExclusiveSum( + CUDA_CALL(hipcub::DeviceScan::ExclusiveSum( workspace, workspace_bytes, prefix, prefix, size + 1, cudaStream)); device->FreeWorkspace(ctx, workspace); } @@ -109,7 +110,7 @@ class CudaFilterSet : public Filter { public: explicit CudaFilterSet(IdArray array) : table_(array->shape[0], array->ctx, runtime::getCurrentCUDAStream()) { - cudaStream_t cudaStream = runtime::getCurrentCUDAStream(); + hipStream_t 
cudaStream = runtime::getCurrentCUDAStream(); table_.FillWithUnique( static_cast(array->data), array->shape[0], cudaStream); } diff --git a/src/array/cuda/cuda_filter.cu.prehip b/src/array/cuda/cuda_filter.cu.prehip new file mode 100644 index 000000000000..9bbd18f42307 --- /dev/null +++ b/src/array/cuda/cuda_filter.cu.prehip @@ -0,0 +1,140 @@ +/** + * Copyright (c) 2021 by Contributors + * @file array/cuda/cuda_filter.cc + * @brief Object for selecting items in a set, or selecting items not in a set. + */ + +#include + +#include + +#include "../../runtime/cuda/cuda_common.h" +#include "../../runtime/cuda/cuda_hashtable.cuh" +#include "../filter.h" + +using namespace dgl::runtime::cuda; + +namespace dgl { +namespace array { + +namespace { + +template +__global__ void _IsInKernel( + DeviceOrderedHashTable table, const IdType* const array, + const int64_t size, IdType* const mark) { + const int64_t idx = threadIdx.x + blockDim.x * blockIdx.x; + if (idx < size) { + mark[idx] = table.Contains(array[idx]) ^ (!include); + } +} + +template +__global__ void _InsertKernel( + const IdType* const prefix, const int64_t size, IdType* const result) { + const int64_t idx = threadIdx.x + blockDim.x * blockIdx.x; + if (idx < size) { + if (prefix[idx] != prefix[idx + 1]) { + result[prefix[idx]] = idx; + } + } +} + +template +IdArray _PerformFilter(const OrderedHashTable& table, IdArray test) { + const auto& ctx = test->ctx; + auto device = runtime::DeviceAPI::Get(ctx); + const int64_t size = test->shape[0]; + cudaStream_t cudaStream = runtime::getCurrentCUDAStream(); + + if (size == 0) { + return test; + } + + // we need two arrays: 1) to act as a prefixsum + // for the number of entries that will be inserted, and + // 2) to collect the included items. + IdType* prefix = static_cast( + device->AllocWorkspace(ctx, sizeof(IdType) * (size + 1))); + + // will resize down later + IdArray result = aten::NewIdArray(size, ctx, sizeof(IdType) * 8); + + // mark each index based on it's existence in the hashtable + { + const dim3 block(256); + const dim3 grid((size + block.x - 1) / block.x); + + CUDA_KERNEL_CALL( + (_IsInKernel), grid, block, 0, cudaStream, + table.DeviceHandle(), static_cast(test->data), size, + prefix); + } + + // generate prefix-sum + { + size_t workspace_bytes; + CUDA_CALL(cub::DeviceScan::ExclusiveSum( + nullptr, workspace_bytes, static_cast(nullptr), + static_cast(nullptr), size + 1, cudaStream)); + void* workspace = device->AllocWorkspace(ctx, workspace_bytes); + + CUDA_CALL(cub::DeviceScan::ExclusiveSum( + workspace, workspace_bytes, prefix, prefix, size + 1, cudaStream)); + device->FreeWorkspace(ctx, workspace); + } + + // copy number using the internal current stream; + IdType num_unique; + device->CopyDataFromTo( + prefix + size, 0, &num_unique, 0, sizeof(num_unique), ctx, + DGLContext{kDGLCPU, 0}, test->dtype); + + // insert items into set + { + const dim3 block(256); + const dim3 grid((size + block.x - 1) / block.x); + + CUDA_KERNEL_CALL( + _InsertKernel, grid, block, 0, cudaStream, prefix, size, + static_cast(result->data)); + } + device->FreeWorkspace(ctx, prefix); + + return result.CreateView({num_unique}, result->dtype); +} + +template +class CudaFilterSet : public Filter { + public: + explicit CudaFilterSet(IdArray array) + : table_(array->shape[0], array->ctx, runtime::getCurrentCUDAStream()) { + cudaStream_t cudaStream = runtime::getCurrentCUDAStream(); + table_.FillWithUnique( + static_cast(array->data), array->shape[0], cudaStream); + } + + IdArray find_included_indices(IdArray 
test) override { + return _PerformFilter(table_, test); + } + + IdArray find_excluded_indices(IdArray test) override { + return _PerformFilter(table_, test); + } + + private: + OrderedHashTable table_; +}; + +} // namespace + +template +FilterRef CreateSetFilter(IdArray set) { + return FilterRef(std::make_shared>(set)); +} + +template FilterRef CreateSetFilter(IdArray set); +template FilterRef CreateSetFilter(IdArray set); + +} // namespace array +} // namespace dgl diff --git a/src/array/cuda/cusparse_dispatcher.cuh b/src/array/cuda/cusparse_dispatcher.cuh index 0d9152b59145..5781654985f6 100644 --- a/src/array/cuda/cusparse_dispatcher.cuh +++ b/src/array/cuda/cusparse_dispatcher.cuh @@ -7,7 +7,7 @@ #ifndef DGL_ARRAY_CUDA_CUSPARSE_DISPATCHER_CUH_ #define DGL_ARRAY_CUDA_CUSPARSE_DISPATCHER_CUH_ -#include +#include #include #include "bf16.cuh" @@ -20,70 +20,70 @@ namespace aten { template struct CSRGEMM { template - static inline cusparseStatus_t bufferSizeExt(Args... args) { + static inline hipsparseStatus_t bufferSizeExt(Args... args) { BUG_IF_FAIL(false) << "This piece of code should not be reached."; - return static_cast(0); + return static_cast(0); } template - static inline cusparseStatus_t nnz(Args... args) { - return cusparseXcsrgemm2Nnz(args...); + static inline hipsparseStatus_t nnz(Args... args) { + return hipsparseXcsrgemm2Nnz(args...); } template - static inline cusparseStatus_t compute(Args... args) { + static inline hipsparseStatus_t compute(Args... args) { BUG_IF_FAIL(false) << "This piece of code should not be reached."; - return static_cast(0); + return static_cast(0); } }; template <> struct CSRGEMM<__half> { template - static inline cusparseStatus_t bufferSizeExt(Args... args) { + static inline hipsparseStatus_t bufferSizeExt(Args... args) { // TODO(ndickson): There is no cusparseHcsrgemm2_bufferSizeExt, so a // different implementation would be required. LOG(FATAL) << "CSRGEMM::bufferSizeExt does not support dtype half (FP16)."; - return static_cast(0); + return static_cast(0); } template - static inline cusparseStatus_t nnz(Args... args) { - return cusparseXcsrgemm2Nnz(args...); + static inline hipsparseStatus_t nnz(Args... args) { + return hipsparseXcsrgemm2Nnz(args...); } template - static inline cusparseStatus_t compute(Args... args) { + static inline hipsparseStatus_t compute(Args... args) { // TODO(ndickson): There is no cusparseHcsrgemm2, so a different // implementation would be required. LOG(FATAL) << "CSRGEMM::compute does not support dtype half (FP16)."; - return static_cast(0); + return static_cast(0); } }; #if BF16_ENABLED template <> -struct CSRGEMM<__nv_bfloat16> { +struct CSRGEMM<__hip_bfloat16> { template - static inline cusparseStatus_t bufferSizeExt(Args... args) { + static inline hipsparseStatus_t bufferSizeExt(Args... args) { // TODO(ndickson): There is no cusparseHcsrgemm2_bufferSizeExt, so a // different implementation would be required. LOG(FATAL) << "CSRGEMM::bufferSizeExt does not support dtype bfloat16 (BF16)."; - return static_cast(0); + return static_cast(0); } template - static inline cusparseStatus_t nnz(Args... args) { - return cusparseXcsrgemm2Nnz(args...); + static inline hipsparseStatus_t nnz(Args... args) { + return hipsparseXcsrgemm2Nnz(args...); } template - static inline cusparseStatus_t compute(Args... args) { + static inline hipsparseStatus_t compute(Args... args) { // TODO(ndickson): There is no cusparseHcsrgemm2, so a different // implementation would be required. 
LOG(FATAL) << "CSRGEMM::compute does not support dtype bfloat16 (BF16)."; - return static_cast(0); + return static_cast(0); } }; #endif // BF16_ENABLED @@ -91,36 +91,36 @@ struct CSRGEMM<__nv_bfloat16> { template <> struct CSRGEMM { template - static inline cusparseStatus_t bufferSizeExt(Args... args) { - return cusparseScsrgemm2_bufferSizeExt(args...); + static inline hipsparseStatus_t bufferSizeExt(Args... args) { + return hipsparseScsrgemm2_bufferSizeExt(args...); } template - static inline cusparseStatus_t nnz(Args... args) { - return cusparseXcsrgemm2Nnz(args...); + static inline hipsparseStatus_t nnz(Args... args) { + return hipsparseXcsrgemm2Nnz(args...); } template - static inline cusparseStatus_t compute(Args... args) { - return cusparseScsrgemm2(args...); + static inline hipsparseStatus_t compute(Args... args) { + return hipsparseScsrgemm2(args...); } }; template <> struct CSRGEMM { template - static inline cusparseStatus_t bufferSizeExt(Args... args) { - return cusparseDcsrgemm2_bufferSizeExt(args...); + static inline hipsparseStatus_t bufferSizeExt(Args... args) { + return hipsparseDcsrgemm2_bufferSizeExt(args...); } template - static inline cusparseStatus_t nnz(Args... args) { - return cusparseXcsrgemm2Nnz(args...); + static inline hipsparseStatus_t nnz(Args... args) { + return hipsparseXcsrgemm2Nnz(args...); } template - static inline cusparseStatus_t compute(Args... args) { - return cusparseDcsrgemm2(args...); + static inline hipsparseStatus_t compute(Args... args) { + return hipsparseDcsrgemm2(args...); } }; @@ -128,70 +128,70 @@ struct CSRGEMM { template struct CSRGEAM { template - static inline cusparseStatus_t bufferSizeExt(Args... args) { + static inline hipsparseStatus_t bufferSizeExt(Args... args) { BUG_IF_FAIL(false) << "This piece of code should not be reached."; - return static_cast(0); + return static_cast(0); } template - static inline cusparseStatus_t nnz(Args... args) { - return cusparseXcsrgeam2Nnz(args...); + static inline hipsparseStatus_t nnz(Args... args) { + return hipsparseXcsrgeam2Nnz(args...); } template - static inline cusparseStatus_t compute(Args... args) { + static inline hipsparseStatus_t compute(Args... args) { BUG_IF_FAIL(false) << "This piece of code should not be reached."; - return static_cast(0); + return static_cast(0); } }; template <> struct CSRGEAM<__half> { template - static inline cusparseStatus_t bufferSizeExt(Args... args) { + static inline hipsparseStatus_t bufferSizeExt(Args... args) { // TODO(ndickson): There is no cusparseHcsrgeam2_bufferSizeExt, so a // different implementation would be required. LOG(FATAL) << "CSRGEAM::bufferSizeExt does not support dtype half (FP16)."; - return static_cast(0); + return static_cast(0); } template - static inline cusparseStatus_t nnz(Args... args) { - return cusparseXcsrgeam2Nnz(args...); + static inline hipsparseStatus_t nnz(Args... args) { + return hipsparseXcsrgeam2Nnz(args...); } template - static inline cusparseStatus_t compute(Args... args) { + static inline hipsparseStatus_t compute(Args... args) { // TODO(ndickson): There is no cusparseHcsrgeam2, so a different // implementation would be required. LOG(FATAL) << "CSRGEAM::compute does not support dtype half (FP16)."; - return static_cast(0); + return static_cast(0); } }; #if BF16_ENABLED template <> -struct CSRGEAM<__nv_bfloat16> { +struct CSRGEAM<__hip_bfloat16> { template - static inline cusparseStatus_t bufferSizeExt(Args... args) { + static inline hipsparseStatus_t bufferSizeExt(Args... 
args) { // TODO(ndickson): There is no cusparseHcsrgeam2_bufferSizeExt, so a // different implementation would be required. LOG(FATAL) << "CSRGEAM::bufferSizeExt does not support dtype bfloat16 (BF16)."; - return static_cast(0); + return static_cast(0); } template - static inline cusparseStatus_t nnz(Args... args) { - return cusparseXcsrgeam2Nnz(args...); + static inline hipsparseStatus_t nnz(Args... args) { + return hipsparseXcsrgeam2Nnz(args...); } template - static inline cusparseStatus_t compute(Args... args) { + static inline hipsparseStatus_t compute(Args... args) { // TODO(ndickson): There is no cusparseHcsrgeam2, so a different // implementation would be required. LOG(FATAL) << "CSRGEAM::compute does not support dtype bfloat16 (BF16)."; - return static_cast(0); + return static_cast(0); } }; #endif // BF16_ENABLED @@ -199,36 +199,36 @@ struct CSRGEAM<__nv_bfloat16> { template <> struct CSRGEAM { template - static inline cusparseStatus_t bufferSizeExt(Args... args) { - return cusparseScsrgeam2_bufferSizeExt(args...); + static inline hipsparseStatus_t bufferSizeExt(Args... args) { + return hipsparseScsrgeam2_bufferSizeExt(args...); } template - static inline cusparseStatus_t nnz(Args... args) { - return cusparseXcsrgeam2Nnz(args...); + static inline hipsparseStatus_t nnz(Args... args) { + return hipsparseXcsrgeam2Nnz(args...); } template - static inline cusparseStatus_t compute(Args... args) { - return cusparseScsrgeam2(args...); + static inline hipsparseStatus_t compute(Args... args) { + return hipsparseScsrgeam2(args...); } }; template <> struct CSRGEAM { template - static inline cusparseStatus_t bufferSizeExt(Args... args) { - return cusparseDcsrgeam2_bufferSizeExt(args...); + static inline hipsparseStatus_t bufferSizeExt(Args... args) { + return hipsparseDcsrgeam2_bufferSizeExt(args...); } template - static inline cusparseStatus_t nnz(Args... args) { - return cusparseXcsrgeam2Nnz(args...); + static inline hipsparseStatus_t nnz(Args... args) { + return hipsparseXcsrgeam2Nnz(args...); } template - static inline cusparseStatus_t compute(Args... args) { - return cusparseDcsrgeam2(args...); + static inline hipsparseStatus_t compute(Args... args) { + return hipsparseDcsrgeam2(args...); } }; diff --git a/src/array/cuda/cusparse_dispatcher.cuh.prehip b/src/array/cuda/cusparse_dispatcher.cuh.prehip new file mode 100644 index 000000000000..0d9152b59145 --- /dev/null +++ b/src/array/cuda/cusparse_dispatcher.cuh.prehip @@ -0,0 +1,238 @@ +/** + * Copyright (c) 2020 by Contributors + * @file array/cuda/dispatcher.cuh + * @brief Templates to dispatch into different cuSPARSE routines based on the + * type argument. + */ +#ifndef DGL_ARRAY_CUDA_CUSPARSE_DISPATCHER_CUH_ +#define DGL_ARRAY_CUDA_CUSPARSE_DISPATCHER_CUH_ + +#include +#include + +#include "bf16.cuh" +#include "fp16.cuh" + +namespace dgl { +namespace aten { + +/** @brief cusparseXcsrgemm dispatcher */ +template +struct CSRGEMM { + template + static inline cusparseStatus_t bufferSizeExt(Args... args) { + BUG_IF_FAIL(false) << "This piece of code should not be reached."; + return static_cast(0); + } + + template + static inline cusparseStatus_t nnz(Args... args) { + return cusparseXcsrgemm2Nnz(args...); + } + + template + static inline cusparseStatus_t compute(Args... args) { + BUG_IF_FAIL(false) << "This piece of code should not be reached."; + return static_cast(0); + } +}; + +template <> +struct CSRGEMM<__half> { + template + static inline cusparseStatus_t bufferSizeExt(Args... 
args) { + // TODO(ndickson): There is no cusparseHcsrgemm2_bufferSizeExt, so a + // different implementation would be required. + LOG(FATAL) << "CSRGEMM::bufferSizeExt does not support dtype half (FP16)."; + return static_cast(0); + } + + template + static inline cusparseStatus_t nnz(Args... args) { + return cusparseXcsrgemm2Nnz(args...); + } + + template + static inline cusparseStatus_t compute(Args... args) { + // TODO(ndickson): There is no cusparseHcsrgemm2, so a different + // implementation would be required. + LOG(FATAL) << "CSRGEMM::compute does not support dtype half (FP16)."; + return static_cast(0); + } +}; + +#if BF16_ENABLED +template <> +struct CSRGEMM<__nv_bfloat16> { + template + static inline cusparseStatus_t bufferSizeExt(Args... args) { + // TODO(ndickson): There is no cusparseHcsrgemm2_bufferSizeExt, so a + // different implementation would be required. + LOG(FATAL) + << "CSRGEMM::bufferSizeExt does not support dtype bfloat16 (BF16)."; + return static_cast(0); + } + + template + static inline cusparseStatus_t nnz(Args... args) { + return cusparseXcsrgemm2Nnz(args...); + } + + template + static inline cusparseStatus_t compute(Args... args) { + // TODO(ndickson): There is no cusparseHcsrgemm2, so a different + // implementation would be required. + LOG(FATAL) << "CSRGEMM::compute does not support dtype bfloat16 (BF16)."; + return static_cast(0); + } +}; +#endif // BF16_ENABLED + +template <> +struct CSRGEMM { + template + static inline cusparseStatus_t bufferSizeExt(Args... args) { + return cusparseScsrgemm2_bufferSizeExt(args...); + } + + template + static inline cusparseStatus_t nnz(Args... args) { + return cusparseXcsrgemm2Nnz(args...); + } + + template + static inline cusparseStatus_t compute(Args... args) { + return cusparseScsrgemm2(args...); + } +}; + +template <> +struct CSRGEMM { + template + static inline cusparseStatus_t bufferSizeExt(Args... args) { + return cusparseDcsrgemm2_bufferSizeExt(args...); + } + + template + static inline cusparseStatus_t nnz(Args... args) { + return cusparseXcsrgemm2Nnz(args...); + } + + template + static inline cusparseStatus_t compute(Args... args) { + return cusparseDcsrgemm2(args...); + } +}; + +/** @brief cusparseXcsrgeam dispatcher */ +template +struct CSRGEAM { + template + static inline cusparseStatus_t bufferSizeExt(Args... args) { + BUG_IF_FAIL(false) << "This piece of code should not be reached."; + return static_cast(0); + } + + template + static inline cusparseStatus_t nnz(Args... args) { + return cusparseXcsrgeam2Nnz(args...); + } + + template + static inline cusparseStatus_t compute(Args... args) { + BUG_IF_FAIL(false) << "This piece of code should not be reached."; + return static_cast(0); + } +}; + +template <> +struct CSRGEAM<__half> { + template + static inline cusparseStatus_t bufferSizeExt(Args... args) { + // TODO(ndickson): There is no cusparseHcsrgeam2_bufferSizeExt, so a + // different implementation would be required. + LOG(FATAL) << "CSRGEAM::bufferSizeExt does not support dtype half (FP16)."; + return static_cast(0); + } + + template + static inline cusparseStatus_t nnz(Args... args) { + return cusparseXcsrgeam2Nnz(args...); + } + + template + static inline cusparseStatus_t compute(Args... args) { + // TODO(ndickson): There is no cusparseHcsrgeam2, so a different + // implementation would be required. 
+ LOG(FATAL) << "CSRGEAM::compute does not support dtype half (FP16)."; + return static_cast(0); + } +}; + +#if BF16_ENABLED +template <> +struct CSRGEAM<__nv_bfloat16> { + template + static inline cusparseStatus_t bufferSizeExt(Args... args) { + // TODO(ndickson): There is no cusparseHcsrgeam2_bufferSizeExt, so a + // different implementation would be required. + LOG(FATAL) + << "CSRGEAM::bufferSizeExt does not support dtype bfloat16 (BF16)."; + return static_cast(0); + } + + template + static inline cusparseStatus_t nnz(Args... args) { + return cusparseXcsrgeam2Nnz(args...); + } + + template + static inline cusparseStatus_t compute(Args... args) { + // TODO(ndickson): There is no cusparseHcsrgeam2, so a different + // implementation would be required. + LOG(FATAL) << "CSRGEAM::compute does not support dtype bfloat16 (BF16)."; + return static_cast(0); + } +}; +#endif // BF16_ENABLED + +template <> +struct CSRGEAM { + template + static inline cusparseStatus_t bufferSizeExt(Args... args) { + return cusparseScsrgeam2_bufferSizeExt(args...); + } + + template + static inline cusparseStatus_t nnz(Args... args) { + return cusparseXcsrgeam2Nnz(args...); + } + + template + static inline cusparseStatus_t compute(Args... args) { + return cusparseScsrgeam2(args...); + } +}; + +template <> +struct CSRGEAM { + template + static inline cusparseStatus_t bufferSizeExt(Args... args) { + return cusparseDcsrgeam2_bufferSizeExt(args...); + } + + template + static inline cusparseStatus_t nnz(Args... args) { + return cusparseXcsrgeam2Nnz(args...); + } + + template + static inline cusparseStatus_t compute(Args... args) { + return cusparseDcsrgeam2(args...); + } +}; + +}; // namespace aten +}; // namespace dgl + +#endif // DGL_ARRAY_CUDA_CUSPARSE_DISPATCHER_CUH_ diff --git a/src/array/cuda/disjoint_union.cu b/src/array/cuda/disjoint_union.cu index 90cc07bd3f4c..d5b602900850 100644 --- a/src/array/cuda/disjoint_union.cu +++ b/src/array/cuda/disjoint_union.cu @@ -1,3 +1,4 @@ +#include "hip/hip_runtime.h" /** * Copyright (c) 2022, NVIDIA CORPORATION. * @@ -78,7 +79,7 @@ std::tuple _ComputePrefixSums( template void _Merge( IdType** arrs, IdType* prefix, IdType* offset, IdType* out, int64_t n_arrs, - int n_elms, DGLContext ctx, DGLDataType dtype, cudaStream_t stream) { + int n_elms, DGLContext ctx, DGLDataType dtype, hipStream_t stream) { auto device = runtime::DeviceAPI::Get(ctx); int nt = 256; int nb = (n_elms + nt - 1) / nt; @@ -99,7 +100,7 @@ void _Merge( template COOMatrix DisjointUnionCoo(const std::vector& coos) { - cudaStream_t stream = runtime::getCurrentCUDAStream(); + hipStream_t stream = runtime::getCurrentCUDAStream(); auto device = runtime::DeviceAPI::Get(coos[0].row->ctx); uint64_t src_offset = 0, dst_offset = 0; bool has_data = false; diff --git a/src/array/cuda/disjoint_union.cu.prehip b/src/array/cuda/disjoint_union.cu.prehip new file mode 100644 index 000000000000..90cc07bd3f4c --- /dev/null +++ b/src/array/cuda/disjoint_union.cu.prehip @@ -0,0 +1,185 @@ +/** + * Copyright (c) 2022, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ * See the License for the specific language governing permissions and + * limitations under the License. + * + * @file array/gpu/disjoint_union.cu + * @brief Disjoint union GPU implementation. + */ + +#include +#include + +#include +#include + +#include "../../runtime/cuda/cuda_common.h" +#include "./utils.h" + +namespace dgl { +using runtime::NDArray; +namespace aten { +namespace impl { + +template +__global__ void _DisjointUnionKernel( + IdType** arrs, IdType* prefix, IdType* offset, IdType* out, int64_t n_arrs, + int n_elms) { + IdType tx = static_cast(blockIdx.x) * blockDim.x + threadIdx.x; + const int stride_x = gridDim.x * blockDim.x; + while (tx < n_elms) { + IdType i = dgl::cuda::_UpperBound(offset, n_arrs, tx) - 1; + if (arrs[i] == NULL) { + out[tx] = tx; + } else { + IdType j = tx - offset[i]; + out[tx] = arrs[i][j] + prefix[i]; + } + tx += stride_x; + } +} + +template +std::tuple _ComputePrefixSums( + const std::vector& coos) { + IdType n = coos.size(), nbits = coos[0].row->dtype.bits; + IdArray n_rows = NewIdArray(n, CPU, nbits); + IdArray n_cols = NewIdArray(n, CPU, nbits); + IdArray n_elms = NewIdArray(n, CPU, nbits); + + IdType* n_rows_data = n_rows.Ptr(); + IdType* n_cols_data = n_cols.Ptr(); + IdType* n_elms_data = n_elms.Ptr(); + + dgl::runtime::parallel_for(0, coos.size(), [&](IdType b, IdType e) { + for (IdType i = b; i < e; ++i) { + n_rows_data[i] = coos[i].num_rows; + n_cols_data[i] = coos[i].num_cols; + n_elms_data[i] = coos[i].row->shape[0]; + } + }); + + return std::make_tuple( + CumSum(n_rows.CopyTo(coos[0].row->ctx), true), + CumSum(n_cols.CopyTo(coos[0].row->ctx), true), + CumSum(n_elms.CopyTo(coos[0].row->ctx), true)); +} + +template +void _Merge( + IdType** arrs, IdType* prefix, IdType* offset, IdType* out, int64_t n_arrs, + int n_elms, DGLContext ctx, DGLDataType dtype, cudaStream_t stream) { + auto device = runtime::DeviceAPI::Get(ctx); + int nt = 256; + int nb = (n_elms + nt - 1) / nt; + + IdType** arrs_dev = static_cast( + device->AllocWorkspace(ctx, n_arrs * sizeof(IdType*))); + + device->CopyDataFromTo( + arrs, 0, arrs_dev, 0, sizeof(IdType*) * n_arrs, DGLContext{kDGLCPU, 0}, + ctx, dtype); + + CUDA_KERNEL_CALL( + _DisjointUnionKernel, nb, nt, 0, stream, arrs_dev, prefix, offset, out, + n_arrs, n_elms); + + device->FreeWorkspace(ctx, arrs_dev); +} + +template +COOMatrix DisjointUnionCoo(const std::vector& coos) { + cudaStream_t stream = runtime::getCurrentCUDAStream(); + auto device = runtime::DeviceAPI::Get(coos[0].row->ctx); + uint64_t src_offset = 0, dst_offset = 0; + bool has_data = false; + bool row_sorted = true; + bool col_sorted = true; + + // check if data index array + for (size_t i = 0; i < coos.size(); ++i) { + CHECK_SAME_DTYPE(coos[0].row, coos[i].row); + CHECK_SAME_CONTEXT(coos[0].row, coos[i].row); + has_data |= COOHasData(coos[i]); + } + + auto prefixes = _ComputePrefixSums(coos); + auto prefix_src = static_cast(std::get<0>(prefixes)->data); + auto prefix_dst = static_cast(std::get<1>(prefixes)->data); + auto prefix_elm = static_cast(std::get<2>(prefixes)->data); + + std::unique_ptr rows(new IdType*[coos.size()]); + std::unique_ptr cols(new IdType*[coos.size()]); + std::unique_ptr data(new IdType*[coos.size()]); + + for (size_t i = 0; i < coos.size(); i++) { + row_sorted &= coos[i].row_sorted; + col_sorted &= coos[i].col_sorted; + rows[i] = coos[i].row.Ptr(); + cols[i] = coos[i].col.Ptr(); + data[i] = coos[i].data.Ptr(); + } + + auto ctx = coos[0].row->ctx; + auto dtype = coos[0].row->dtype; + + IdType n_elements = 0; + 
device->CopyDataFromTo( + &prefix_elm[coos.size()], 0, &n_elements, 0, sizeof(IdType), + coos[0].row->ctx, DGLContext{kDGLCPU, 0}, coos[0].row->dtype); + + device->CopyDataFromTo( + &prefix_src[coos.size()], 0, &src_offset, 0, sizeof(IdType), + coos[0].row->ctx, DGLContext{kDGLCPU, 0}, coos[0].row->dtype); + + device->CopyDataFromTo( + &prefix_dst[coos.size()], 0, &dst_offset, 0, sizeof(IdType), + coos[0].row->ctx, DGLContext{kDGLCPU, 0}, coos[0].row->dtype); + + // Union src array + IdArray result_src = + NewIdArray(n_elements, coos[0].row->ctx, coos[0].row->dtype.bits); + _Merge( + rows.get(), prefix_src, prefix_elm, result_src.Ptr(), coos.size(), + n_elements, ctx, dtype, stream); + + // Union dst array + IdArray result_dst = + NewIdArray(n_elements, coos[0].col->ctx, coos[0].col->dtype.bits); + _Merge( + cols.get(), prefix_dst, prefix_elm, result_dst.Ptr(), coos.size(), + n_elements, ctx, dtype, stream); + + // Union data array if exists and fetch number of elements + IdArray result_dat = NullArray(); + if (has_data) { + result_dat = + NewIdArray(n_elements, coos[0].row->ctx, coos[0].row->dtype.bits); + _Merge( + data.get(), prefix_elm, prefix_elm, result_dat.Ptr(), + coos.size(), n_elements, ctx, dtype, stream); + } + + return COOMatrix( + src_offset, dst_offset, result_src, result_dst, result_dat, row_sorted, + col_sorted); +} + +template COOMatrix DisjointUnionCoo( + const std::vector& coos); +template COOMatrix DisjointUnionCoo( + const std::vector& coos); + +} // namespace impl +} // namespace aten +} // namespace dgl diff --git a/src/array/cuda/fp16.cuh b/src/array/cuda/fp16.cuh index e23837e019a1..8e7ca60b51c3 100644 --- a/src/array/cuda/fp16.cuh +++ b/src/array/cuda/fp16.cuh @@ -21,7 +21,7 @@ #ifndef DGL_ARRAY_CUDA_FP16_CUH_ #define DGL_ARRAY_CUDA_FP16_CUH_ -#include +#include #include @@ -41,9 +41,9 @@ static __device__ __forceinline__ half min(half a, half b) { #endif } -#ifdef __CUDACC__ +#ifdef __HIPCC__ // Arithmetic FP16 operations for architecture >= 5.3 are already defined in -// cuda_fp16.h +// hip/hip_fp16.h #if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ < 530) // CUDA 12.2 adds "emulated" support for older architectures. #if defined(CUDART_VERSION) && (CUDART_VERSION < 12020) @@ -129,6 +129,6 @@ __device__ __forceinline__ bool operator<=(const __half& lh, const __half& rh) { } #endif // defined(CUDART_VERSION) && (CUDART_VERSION < 12020) #endif // defined(__CUDA_ARCH__) && (__CUDA_ARCH__ < 530) -#endif // __CUDACC__ +#endif // __HIPCC__ #endif // DGL_ARRAY_CUDA_FP16_CUH_ diff --git a/src/array/cuda/fp16.cuh.prehip b/src/array/cuda/fp16.cuh.prehip new file mode 100644 index 000000000000..e23837e019a1 --- /dev/null +++ b/src/array/cuda/fp16.cuh.prehip @@ -0,0 +1,134 @@ +/** + * Copyright (c) 2020-2022 by Contributors + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + * + * @file array/cuda/fp16.cuh + * @brief float16 related functions. 
+ * @note this file is modified from TVM project: + * https://github.com/apache/tvm/blob/e561007f0c330e3d14c2bc8a3ef40fb741db9004/src/target/source/literal/cuda_half_t.h. + */ +#ifndef DGL_ARRAY_CUDA_FP16_CUH_ +#define DGL_ARRAY_CUDA_FP16_CUH_ + +#include + +#include + +static __device__ __forceinline__ half max(half a, half b) { +#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 530 + return __hgt(__half(a), __half(b)) ? a : b; +#else + return __half(max(float(a), float(b))); // NOLINT +#endif +} + +static __device__ __forceinline__ half min(half a, half b) { +#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 530 + return __hlt(__half(a), __half(b)) ? a : b; +#else + return __half(min(float(a), float(b))); // NOLINT +#endif +} + +#ifdef __CUDACC__ +// Arithmetic FP16 operations for architecture >= 5.3 are already defined in +// cuda_fp16.h +#if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ < 530) +// CUDA 12.2 adds "emulated" support for older architectures. +#if defined(CUDART_VERSION) && (CUDART_VERSION < 12020) +__device__ __forceinline__ __half +operator+(const __half& lh, const __half& rh) { + return __half(float(lh) + float(rh)); // NOLINT +} +__device__ __forceinline__ __half +operator-(const __half& lh, const __half& rh) { + return __half(float(lh) - float(rh)); // NOLINT +} +__device__ __forceinline__ __half +operator*(const __half& lh, const __half& rh) { + return __half(float(lh) * float(rh)); // NOLINT +} +__device__ __forceinline__ __half +operator/(const __half& lh, const __half& rh) { + return __half(float(lh) / float(rh)); // NOLINT +} + +__device__ __forceinline__ __half& operator+=( + __half& lh, const __half& rh) { // NOLINT + lh = __half(float(lh) + float(rh)); // NOLINT + return lh; +} +__device__ __forceinline__ __half& operator-=( + __half& lh, const __half& rh) { // NOLINT + lh = __half(float(lh) - float(rh)); // NOLINT + return lh; +} +__device__ __forceinline__ __half& operator*=( + __half& lh, const __half& rh) { // NOLINT + lh = __half(float(lh) * float(rh)); // NOLINT + return lh; +} +__device__ __forceinline__ __half& operator/=( + __half& lh, const __half& rh) { // NOLINT + lh = __half(float(lh) / float(rh)); // NOLINT + return lh; +} + +__device__ __forceinline__ __half& operator++(__half& h) { // NOLINT + h = __half(float(h) + 1.0f); // NOLINT + return h; +} +__device__ __forceinline__ __half& operator--(__half& h) { // NOLINT + h = __half(float(h) - 1.0f); // NOLINT + return h; +} +__device__ __forceinline__ __half operator++(__half& h, int) { // NOLINT + __half ret = h; + h = __half(float(h) + 1.0f); // NOLINT + return ret; +} +__device__ __forceinline__ __half operator--(__half& h, int) { // NOLINT + __half ret = h; + h = __half(float(h) - 1.0f); // NOLINT + return ret; +} + +__device__ __forceinline__ __half operator+(const __half& h) { return h; } +__device__ __forceinline__ __half operator-(const __half& h) { + return __half(-float(h)); // NOLINT +} + +__device__ __forceinline__ bool operator==(const __half& lh, const __half& rh) { + return float(lh) == float(rh); // NOLINT +} +__device__ __forceinline__ bool operator!=(const __half& lh, const __half& rh) { + return float(lh) != float(rh); // NOLINT +} +__device__ __forceinline__ bool operator>(const __half& lh, const __half& rh) { + return float(lh) > float(rh); // NOLINT +} +__device__ __forceinline__ bool operator<(const __half& lh, const __half& rh) { + return float(lh) < float(rh); // NOLINT +} +__device__ __forceinline__ bool operator>=(const __half& lh, const __half& rh) { + return float(lh) >= 
float(rh); // NOLINT +} +__device__ __forceinline__ bool operator<=(const __half& lh, const __half& rh) { + return float(lh) <= float(rh); // NOLINT +} +#endif // defined(CUDART_VERSION) && (CUDART_VERSION < 12020) +#endif // defined(__CUDA_ARCH__) && (__CUDA_ARCH__ < 530) +#endif // __CUDACC__ + +#endif // DGL_ARRAY_CUDA_FP16_CUH_ diff --git a/src/array/cuda/functor.cuh b/src/array/cuda/functor.cuh index 226600cb9a84..1dc05e34e008 100644 --- a/src/array/cuda/functor.cuh +++ b/src/array/cuda/functor.cuh @@ -208,29 +208,29 @@ struct Sum : _Sum { #if BF16_ENABLED template -struct Sum : _Sum { - static constexpr __host__ __device__ __forceinline__ __nv_bfloat16 zero() { +struct Sum : _Sum { + static constexpr __host__ __device__ __forceinline__ __hip_bfloat16 zero() { return __float2bfloat16_rn(0.); } static __device__ __forceinline__ void Call( - __nv_bfloat16 *out_buf, Idx *arg_u_buf, Idx *arg_e_buf, - __nv_bfloat16 val, Idx uid, Idx eid) { - _Sum::Call( + __hip_bfloat16 *out_buf, Idx *arg_u_buf, Idx *arg_e_buf, + __hip_bfloat16 val, Idx uid, Idx eid) { + _Sum::Call( out_buf, arg_u_buf, arg_e_buf, val, uid, eid); } static __device__ __forceinline__ void Call( - __nv_bfloat16 *out_buf, Idx *arg_buf, __nv_bfloat16 val, Idx id) { - _Sum::Call(out_buf, arg_buf, val, id); + __hip_bfloat16 *out_buf, Idx *arg_buf, __hip_bfloat16 val, Idx id) { + _Sum::Call(out_buf, arg_buf, val, id); } // sometimes we have to use float in reduction for better precision static __device__ __forceinline__ void Call( float *out_buf, Idx *arg_u_buf, Idx *arg_e_buf, - __nv_bfloat16 val, Idx uid, Idx eid) { + __hip_bfloat16 val, Idx uid, Idx eid) { _Sum::Call(out_buf, arg_u_buf, arg_e_buf, static_cast(val), uid, eid); } static __device__ __forceinline__ void Call( - float *out_buf, Idx *arg_buf, __nv_bfloat16 val, Idx id) { + float *out_buf, Idx *arg_buf, __hip_bfloat16 val, Idx id) { _Sum::Call(out_buf, arg_buf, static_cast(val), id); } @@ -313,29 +313,29 @@ struct Max : _Max { #if BF16_ENABLED template -struct Max : _Max { - static constexpr __host__ __device__ __forceinline__ __nv_bfloat16 zero() { +struct Max : _Max { + static constexpr __host__ __device__ __forceinline__ __hip_bfloat16 zero() { return __float2bfloat16_rn(-std::numeric_limits::infinity()); } static __device__ __forceinline__ void Call( - __nv_bfloat16 *out_buf, Idx *arg_u_buf, Idx *arg_e_buf, - __nv_bfloat16 val, Idx uid, Idx eid) { - _Max::Call( + __hip_bfloat16 *out_buf, Idx *arg_u_buf, Idx *arg_e_buf, + __hip_bfloat16 val, Idx uid, Idx eid) { + _Max::Call( out_buf, arg_u_buf, arg_e_buf, val, uid, eid); } static __device__ __forceinline__ void Call( - __nv_bfloat16 *out_buf, Idx *arg_buf, __nv_bfloat16 val, Idx id) { - _Max::Call(out_buf, arg_buf, val, id); + __hip_bfloat16 *out_buf, Idx *arg_buf, __hip_bfloat16 val, Idx id) { + _Max::Call(out_buf, arg_buf, val, id); } // sometimes we have to use float in reduction for better precision static __device__ __forceinline__ void Call( float *out_buf, Idx *arg_u_buf, Idx *arg_e_buf, - __nv_bfloat16 val, Idx uid, Idx eid) { + __hip_bfloat16 val, Idx uid, Idx eid) { _Max::Call(out_buf, arg_u_buf, arg_e_buf, static_cast(val), uid, eid); } static __device__ __forceinline__ void Call( - float *out_buf, Idx *arg_buf, __nv_bfloat16 val, Idx id) { + float *out_buf, Idx *arg_buf, __hip_bfloat16 val, Idx id) { _Max::Call(out_buf, arg_buf, static_cast(val), id); } @@ -418,29 +418,29 @@ struct Min : _Min { #if BF16_ENABLED template -struct Min : _Min { - static constexpr __host__ __device__ __forceinline__ 
__nv_bfloat16 zero() { +struct Min : _Min { + static constexpr __host__ __device__ __forceinline__ __hip_bfloat16 zero() { return __float2bfloat16_rn(std::numeric_limits::infinity()); } static __device__ __forceinline__ void Call( - __nv_bfloat16 *out_buf, Idx *arg_u_buf, Idx *arg_e_buf, - __nv_bfloat16 val, Idx uid, Idx eid) { - _Min::Call( + __hip_bfloat16 *out_buf, Idx *arg_u_buf, Idx *arg_e_buf, + __hip_bfloat16 val, Idx uid, Idx eid) { + _Min::Call( out_buf, arg_u_buf, arg_e_buf, val, uid, eid); } static __device__ __forceinline__ void Call( - __nv_bfloat16 *out_buf, Idx *arg_buf, __nv_bfloat16 val, Idx id) { - _Min::Call(out_buf, arg_buf, val, id); + __hip_bfloat16 *out_buf, Idx *arg_buf, __hip_bfloat16 val, Idx id) { + _Min::Call(out_buf, arg_buf, val, id); } // sometimes we have to use float in reduction for better precision static __device__ __forceinline__ void Call( float *out_buf, Idx *arg_u_buf, Idx *arg_e_buf, - __nv_bfloat16 val, Idx uid, Idx eid) { + __hip_bfloat16 val, Idx uid, Idx eid) { _Min::Call(out_buf, arg_u_buf, arg_e_buf, static_cast(val), uid, eid); } static __device__ __forceinline__ void Call( - float *out_buf, Idx *arg_buf, __nv_bfloat16 val, Idx id) { + float *out_buf, Idx *arg_buf, __hip_bfloat16 val, Idx id) { _Min::Call(out_buf, arg_buf, static_cast(val), id); } diff --git a/src/array/cuda/functor.cuh.prehip b/src/array/cuda/functor.cuh.prehip new file mode 100644 index 000000000000..226600cb9a84 --- /dev/null +++ b/src/array/cuda/functor.cuh.prehip @@ -0,0 +1,456 @@ +/** + * Copyright (c) 2020 by Contributors + * @file array/cuda/functor.cuh + * @brief Functors for template on CUDA + */ +#ifndef DGL_ARRAY_CUDA_FUNCTOR_CUH_ +#define DGL_ARRAY_CUDA_FUNCTOR_CUH_ + +#include +#include + +#include "./atomic.cuh" +#include "./fp16.cuh" +#include "bf16.cuh" + +namespace dgl { +namespace aten { +namespace cuda { + +/////////////////////////// CUDA binary operators ////////////////////////////// +namespace binary { +template +struct Add { + static constexpr bool use_lhs = true; + static constexpr bool use_rhs = true; + static constexpr bool reduce_last_dim = false; + static __device__ __forceinline__ DType + Call(const DType *lhs, const DType *rhs, int64_t len = 1) { + return lhs[0] + rhs[0]; + } +}; +template +constexpr bool Add::use_lhs; +template +constexpr bool Add::use_rhs; +template +constexpr bool Add::reduce_last_dim; + +template +struct Sub { + static constexpr bool use_lhs = true; + static constexpr bool use_rhs = true; + static constexpr bool reduce_last_dim = false; + static __device__ __forceinline__ DType + Call(const DType *lhs, const DType *rhs, int64_t len = 1) { + return lhs[0] - rhs[0]; + } +}; +template +constexpr bool Sub::use_lhs; +template +constexpr bool Sub::use_rhs; +template +constexpr bool Sub::reduce_last_dim; + +template +struct Mul { + static constexpr bool use_lhs = true; + static constexpr bool use_rhs = true; + static constexpr bool reduce_last_dim = false; + static __device__ __forceinline__ DType + Call(const DType *lhs, const DType *rhs, int64_t len = 1) { + return lhs[0] * rhs[0]; + } +}; +template +constexpr bool Mul::use_lhs; +template +constexpr bool Mul::use_rhs; +template +constexpr bool Mul::reduce_last_dim; + +template +struct Div { + static constexpr bool use_lhs = true; + static constexpr bool use_rhs = true; + static constexpr bool reduce_last_dim = false; + static __device__ __forceinline__ DType + Call(const DType *lhs, const DType *rhs, int64_t len = 1) { + return lhs[0] / rhs[0]; + } +}; +template +constexpr bool 
Div::use_lhs; +template +constexpr bool Div::use_rhs; +template +constexpr bool Div::reduce_last_dim; + +template +struct CopyLhs { + static constexpr bool use_lhs = true; + static constexpr bool use_rhs = false; + static constexpr bool reduce_last_dim = false; + static __device__ __forceinline__ DType + Call(const DType *lhs, const DType *rhs, int64_t len = 1) { + return lhs[0]; + } +}; +template +constexpr bool CopyLhs::use_lhs; +template +constexpr bool CopyLhs::use_rhs; +template +constexpr bool CopyLhs::reduce_last_dim; + +template +struct CopyRhs { + static constexpr bool use_lhs = false; + static constexpr bool use_rhs = true; + static constexpr bool reduce_last_dim = false; + static __device__ __forceinline__ DType + Call(const DType *lhs, const DType *rhs, int64_t len = 1) { + return rhs[0]; + } +}; +template +constexpr bool CopyRhs::use_lhs; +template +constexpr bool CopyRhs::use_rhs; +template +constexpr bool CopyRhs::reduce_last_dim; + +template +struct Dot { + static constexpr bool use_lhs = true; + static constexpr bool use_rhs = true; + static constexpr bool reduce_last_dim = true; + static __device__ __forceinline__ DType + Call(const DType *lhs, const DType *rhs, int64_t len = 1) { + DType rst = static_cast(0.0f); + for (int64_t i = 0; i < len; ++i) { + rst += lhs[i] * rhs[i]; + } + return rst; + } +}; +template +constexpr bool Dot::use_lhs; +template +constexpr bool Dot::use_rhs; +template +constexpr bool Dot::reduce_last_dim; + +} // end of namespace binary + +/////////////////////////// CUDA reduce operators ////////////////////////////// +namespace reduce { +template +struct _Sum { + static constexpr __host__ __device__ __forceinline__ DType zero() { + return 0.; + } + static constexpr bool require_arg = false; + static __device__ __forceinline__ void Call( + DType *out_buf, Idx *arg_u_buf, Idx *arg_e_buf, DType val, Idx uid, + Idx eid) { + if (!atomic) { + *out_buf += val; + } else { + cuda::AtomicAdd(out_buf, val); + } + } + static __device__ __forceinline__ void Call( + DType *out_buf, Idx *arg_buf, DType val, Idx id) { + if (!atomic) { + *out_buf += val; + } else { + cuda::AtomicAdd(out_buf, val); + } + } + static __device__ __forceinline__ void CallArg( + Idx fid, Idx *arg_u_buf, Idx *arg_e_buf, DType val, DType val_ref, + Idx uid, Idx eid) {} +}; + +template +struct Sum : _Sum {}; + +template +struct Sum : _Sum { + static constexpr __host__ __device__ __forceinline__ __half zero() { + return __float2half_rn(0.); + } + static __device__ __forceinline__ void Call( + __half *out_buf, Idx *arg_u_buf, Idx *arg_e_buf, + __half val, Idx uid, Idx eid) { + _Sum::Call( + out_buf, arg_u_buf, arg_e_buf, val, uid, eid); + } + static __device__ __forceinline__ void Call( + __half *out_buf, Idx *arg_buf, __half val, Idx id) { + _Sum::Call(out_buf, arg_buf, val, id); + } + // sometimes we have to use float in reduction for better precision + static __device__ __forceinline__ void Call( + float *out_buf, Idx *arg_u_buf, Idx *arg_e_buf, + __half val, Idx uid, Idx eid) { + _Sum::Call(out_buf, arg_u_buf, arg_e_buf, + static_cast(val), uid, eid); + } + static __device__ __forceinline__ void Call( + float *out_buf, Idx *arg_buf, __half val, Idx id) { + _Sum::Call(out_buf, arg_buf, + static_cast(val), id); + } +}; + +#if BF16_ENABLED +template +struct Sum : _Sum { + static constexpr __host__ __device__ __forceinline__ __nv_bfloat16 zero() { + return __float2bfloat16_rn(0.); + } + static __device__ __forceinline__ void Call( + __nv_bfloat16 *out_buf, Idx *arg_u_buf, Idx *arg_e_buf, + 
__nv_bfloat16 val, Idx uid, Idx eid) { + _Sum::Call( + out_buf, arg_u_buf, arg_e_buf, val, uid, eid); + } + static __device__ __forceinline__ void Call( + __nv_bfloat16 *out_buf, Idx *arg_buf, __nv_bfloat16 val, Idx id) { + _Sum::Call(out_buf, arg_buf, val, id); + } + // sometimes we have to use float in reduction for better precision + static __device__ __forceinline__ void Call( + float *out_buf, Idx *arg_u_buf, Idx *arg_e_buf, + __nv_bfloat16 val, Idx uid, Idx eid) { + _Sum::Call(out_buf, arg_u_buf, arg_e_buf, + static_cast(val), uid, eid); + } + static __device__ __forceinline__ void Call( + float *out_buf, Idx *arg_buf, __nv_bfloat16 val, Idx id) { + _Sum::Call(out_buf, arg_buf, + static_cast(val), id); + } +}; +#endif // BF16_ENABLED + +template +struct _Max { + static constexpr __host__ __device__ __forceinline__ DType zero() { + return -std::numeric_limits::infinity(); + } + static constexpr bool require_arg = true; + static __device__ __forceinline__ void Call( + DType *out_buf, Idx *arg_u_buf, Idx *arg_e_buf, DType val, Idx uid, + Idx eid) { + if (!atomic) { + if (*out_buf < val) { + *out_buf = val; + *arg_u_buf = uid; + *arg_e_buf = eid; + } + } else { + cuda::AtomicMax(out_buf, val); + } + } + static __device__ __forceinline__ void Call( + DType *out_buf, Idx *arg_buf, DType val, Idx id) { + if (!atomic) { + if (*out_buf < val) { + *out_buf = val; + *arg_buf = id; + } + } else { + cuda::AtomicMax(out_buf, val); + } + } + static __device__ __forceinline__ void CallArg( + Idx fid, Idx *arg_u_buf, Idx *arg_e_buf, DType val, DType val_ref, + Idx uid, Idx eid) { + if (atomic) { + if (val == val_ref) { + if (arg_u_buf) arg_u_buf[fid] = uid; + if (arg_e_buf) arg_e_buf[fid] = eid; + } + } + } +}; + +template +struct Max : _Max {}; + +template +struct Max : _Max { + static constexpr __host__ __device__ __forceinline__ __half zero() { + return __float2half_rn(-6.550400e+04f); + } + static __device__ __forceinline__ void Call( + __half *out_buf, Idx *arg_u_buf, Idx *arg_e_buf, + __half val, Idx uid, Idx eid) { + _Max::Call( + out_buf, arg_u_buf, arg_e_buf, val, uid, eid); + } + static __device__ __forceinline__ void Call( + __half *out_buf, Idx *arg_buf, __half val, Idx id) { + _Max::Call(out_buf, arg_buf, val, id); + } + // sometimes we have to use float in reduction for better precision + static __device__ __forceinline__ void Call( + float *out_buf, Idx *arg_u_buf, Idx *arg_e_buf, + __half val, Idx uid, Idx eid) { + _Max::Call(out_buf, arg_u_buf, arg_e_buf, + static_cast(val), uid, eid); + } + static __device__ __forceinline__ void Call( + float *out_buf, Idx *arg_buf, __half val, Idx id) { + _Max::Call(out_buf, arg_buf, + static_cast(val), id); + } +}; + +#if BF16_ENABLED +template +struct Max : _Max { + static constexpr __host__ __device__ __forceinline__ __nv_bfloat16 zero() { + return __float2bfloat16_rn(-std::numeric_limits::infinity()); + } + static __device__ __forceinline__ void Call( + __nv_bfloat16 *out_buf, Idx *arg_u_buf, Idx *arg_e_buf, + __nv_bfloat16 val, Idx uid, Idx eid) { + _Max::Call( + out_buf, arg_u_buf, arg_e_buf, val, uid, eid); + } + static __device__ __forceinline__ void Call( + __nv_bfloat16 *out_buf, Idx *arg_buf, __nv_bfloat16 val, Idx id) { + _Max::Call(out_buf, arg_buf, val, id); + } + // sometimes we have to use float in reduction for better precision + static __device__ __forceinline__ void Call( + float *out_buf, Idx *arg_u_buf, Idx *arg_e_buf, + __nv_bfloat16 val, Idx uid, Idx eid) { + _Max::Call(out_buf, arg_u_buf, arg_e_buf, + static_cast(val), uid, 
eid); + } + static __device__ __forceinline__ void Call( + float *out_buf, Idx *arg_buf, __nv_bfloat16 val, Idx id) { + _Max::Call(out_buf, arg_buf, + static_cast(val), id); + } +}; +#endif // BF16_ENABLED + +template +struct _Min { + static constexpr __host__ __device__ __forceinline__ DType zero() { + return std::numeric_limits::infinity(); + } + static constexpr bool require_arg = true; + static __device__ __forceinline__ void Call( + DType *out_buf, Idx *arg_u_buf, Idx *arg_e_buf, DType val, Idx uid, + Idx eid) { + if (!atomic) { + if (*out_buf > val) { + *out_buf = val; + *arg_u_buf = uid; + *arg_e_buf = eid; + } + } else { + cuda::AtomicMin(out_buf, val); + } + } + static __device__ __forceinline__ void Call( + DType *out_buf, Idx *arg_buf, DType val, Idx id) { + if (!atomic) { + if (*out_buf > val) { + *out_buf = val; + *arg_buf = id; + } + } else { + cuda::AtomicMin(out_buf, val); + } + } + static __device__ __forceinline__ void CallArg( + Idx fid, Idx *arg_u_buf, Idx *arg_e_buf, DType val, DType val_ref, + Idx uid, Idx eid) { + if (atomic) { + if (val == val_ref) { + if (arg_u_buf) arg_u_buf[fid] = uid; + if (arg_e_buf) arg_e_buf[fid] = eid; + } + } + } +}; + +template +struct Min : _Min {}; + +template +struct Min : _Min { + static constexpr __host__ __device__ __forceinline__ __half zero() { + return __float2half_rn(6.550400e+04f); + } + static __device__ __forceinline__ void Call( + __half *out_buf, Idx *arg_u_buf, Idx *arg_e_buf, + __half val, Idx uid, Idx eid) { + _Min::Call( + out_buf, arg_u_buf, arg_e_buf, val, uid, eid); + } + static __device__ __forceinline__ void Call( + __half *out_buf, Idx *arg_buf, __half val, Idx id) { + _Min::Call(out_buf, arg_buf, val, id); + } + // sometimes we have to use float in reduction for better precision + static __device__ __forceinline__ void Call( + float *out_buf, Idx *arg_u_buf, Idx *arg_e_buf, + __half val, Idx uid, Idx eid) { + _Min::Call(out_buf, arg_u_buf, arg_e_buf, + static_cast(val), uid, eid); + } + static __device__ __forceinline__ void Call( + float *out_buf, Idx *arg_buf, __half val, Idx id) { + _Min::Call(out_buf, arg_buf, + static_cast(val), id); + } +}; + +#if BF16_ENABLED +template +struct Min : _Min { + static constexpr __host__ __device__ __forceinline__ __nv_bfloat16 zero() { + return __float2bfloat16_rn(std::numeric_limits::infinity()); + } + static __device__ __forceinline__ void Call( + __nv_bfloat16 *out_buf, Idx *arg_u_buf, Idx *arg_e_buf, + __nv_bfloat16 val, Idx uid, Idx eid) { + _Min::Call( + out_buf, arg_u_buf, arg_e_buf, val, uid, eid); + } + static __device__ __forceinline__ void Call( + __nv_bfloat16 *out_buf, Idx *arg_buf, __nv_bfloat16 val, Idx id) { + _Min::Call(out_buf, arg_buf, val, id); + } + // sometimes we have to use float in reduction for better precision + static __device__ __forceinline__ void Call( + float *out_buf, Idx *arg_u_buf, Idx *arg_e_buf, + __nv_bfloat16 val, Idx uid, Idx eid) { + _Min::Call(out_buf, arg_u_buf, arg_e_buf, + static_cast(val), uid, eid); + } + static __device__ __forceinline__ void Call( + float *out_buf, Idx *arg_buf, __nv_bfloat16 val, Idx id) { + _Min::Call(out_buf, arg_buf, + static_cast(val), id); + } +}; +#endif // BF16_ENABLED + +} // namespace reduce + +} // namespace cuda +} // namespace aten +} // namespace dgl + +#endif // DGL_ARRAY_CUDA_FUNCTOR_CUH_ diff --git a/src/array/cuda/gather_mm.cu b/src/array/cuda/gather_mm.cu index c40d53bb05ec..cf7a87d9d844 100644 --- a/src/array/cuda/gather_mm.cu +++ b/src/array/cuda/gather_mm.cu @@ -1,3 +1,4 @@ +#include 
"hip/hip_runtime.h" /** * Copyright (c) 2020 by Contributors * @file array/cuda/gather_mm.cu @@ -20,54 +21,54 @@ namespace { /** @brief Call cuBLAS GEMM API for dense matmul operation for float and double. */ template -cublasStatus_t cublasGemm( - cublasHandle_t handle, cublasOperation_t transa, cublasOperation_t transb, +hipblasStatus_t cublasGemm( + hipblasHandle_t handle, hipblasOperation_t transa, hipblasOperation_t transb, int m, int n, int k, const DType* alpha, const DType* A, int lda, const DType* B, int ldb, const DType* beta, DType* C, int ldc) { LOG(INFO) << "Not supported dtype"; - return CUBLAS_STATUS_EXECUTION_FAILED; + return HIPBLAS_STATUS_EXECUTION_FAILED; } template <> -cublasStatus_t cublasGemm<__half>( - cublasHandle_t handle, cublasOperation_t transa, cublasOperation_t transb, +hipblasStatus_t cublasGemm<__half>( + hipblasHandle_t handle, hipblasOperation_t transa, hipblasOperation_t transb, int m, int n, int k, const __half* alpha, const __half* A, int lda, const __half* B, int ldb, const __half* beta, __half* C, int ldc) { - return cublasHgemm( + return hipblasHgemm( handle, transa, transb, m, n, k, alpha, A, lda, B, ldb, beta, C, ldc); } #if BF16_ENABLED template <> -cublasStatus_t cublasGemm<__nv_bfloat16>( - cublasHandle_t handle, cublasOperation_t transa, cublasOperation_t transb, - int m, int n, int k, const __nv_bfloat16* alpha, const __nv_bfloat16* A, - int lda, const __nv_bfloat16* B, int ldb, const __nv_bfloat16* beta, - __nv_bfloat16* C, int ldc) { +hipblasStatus_t cublasGemm<__hip_bfloat16>( + hipblasHandle_t handle, hipblasOperation_t transa, hipblasOperation_t transb, + int m, int n, int k, const __hip_bfloat16* alpha, const __hip_bfloat16* A, + int lda, const __hip_bfloat16* B, int ldb, const __hip_bfloat16* beta, + __hip_bfloat16* C, int ldc) { float alpha_float = __bfloat162float(*alpha); float beta_float = __bfloat162float(*beta); - return cublasGemmEx( - handle, transa, transb, m, n, k, &alpha_float, A, CUDA_R_16BF, lda, B, - CUDA_R_16BF, ldb, &beta_float, C, CUDA_R_16BF, ldc, CUBLAS_COMPUTE_32F, + return hipblasGemmEx_v2( + handle, transa, transb, m, n, k, &alpha_float, A, HIP_R_16BF, lda, B, + HIP_R_16BF, ldb, &beta_float, C, HIP_R_16BF, ldc, HIPBLAS_COMPUTE_32F, CUBLAS_GEMM_DEFAULT_TENSOR_OP); } #endif // BF16_ENABLED template <> -cublasStatus_t cublasGemm( - cublasHandle_t handle, cublasOperation_t transa, cublasOperation_t transb, +hipblasStatus_t cublasGemm( + hipblasHandle_t handle, hipblasOperation_t transa, hipblasOperation_t transb, int m, int n, int k, const float* alpha, const float* A, int lda, const float* B, int ldb, const float* beta, float* C, int ldc) { - return cublasSgemm( + return hipblasSgemm( handle, transa, transb, m, n, k, alpha, A, lda, B, ldb, beta, C, ldc); } template <> -cublasStatus_t cublasGemm( - cublasHandle_t handle, cublasOperation_t transa, cublasOperation_t transb, +hipblasStatus_t cublasGemm( + hipblasHandle_t handle, hipblasOperation_t transa, hipblasOperation_t transb, int m, int n, int k, const double* alpha, const double* A, int lda, const double* B, int ldb, const double* beta, double* C, int ldc) { - return cublasDgemm( + return hipblasDgemm( handle, transa, transb, m, n, k, alpha, A, lda, B, ldb, beta, C, ldc); } @@ -203,7 +204,7 @@ void SegmentMM( const NDArray A, const NDArray B, NDArray C, const NDArray seglen_A, bool a_trans, bool b_trans) { auto device = runtime::DeviceAPI::Get(A->ctx); - cudaStream_t stream = runtime::getCurrentCUDAStream(); + hipStream_t stream = runtime::getCurrentCUDAStream(); 
const DType* A_data = A.Ptr(); const DType* B_data = B.Ptr(); const IdType* seglen_A_data = seglen_A.Ptr(); @@ -215,8 +216,8 @@ void SegmentMM( auto* thr_entry = runtime::CUDAThreadEntry::ThreadLocal(); if (!thr_entry->cublas_handle) - CUBLAS_CALL(cublasCreate(&(thr_entry->cublas_handle))); - CUBLAS_CALL(cublasSetStream(thr_entry->cublas_handle, stream)); + CUBLAS_CALL(hipblasCreate(&(thr_entry->cublas_handle))); + CUBLAS_CALL(hipblasSetStream(thr_entry->cublas_handle, stream)); IdType m_offset = 0; for (IdType etype = 0; etype < num_rel; ++etype) { @@ -226,10 +227,10 @@ void SegmentMM( n = B->shape[2]; // cols of B k = B->shape[1]; // cols of A == rows of B int ldb = n, lda = k, ldc = n; - cublasOperation_t transB = CUBLAS_OP_N; - cublasOperation_t transA = CUBLAS_OP_N; + hipblasOperation_t transB = HIPBLAS_OP_N; + hipblasOperation_t transA = HIPBLAS_OP_N; if (b_trans) { - transB = CUBLAS_OP_T; + transB = HIPBLAS_OP_T; ldb = n, lda = n, ldc = k; std::swap(n, k); } @@ -248,7 +249,7 @@ template void SegmentMMBackwardB( const NDArray A, const NDArray dC, NDArray dB, const NDArray seglen) { auto device = runtime::DeviceAPI::Get(A->ctx); - cudaStream_t stream = runtime::getCurrentCUDAStream(); + hipStream_t stream = runtime::getCurrentCUDAStream(); const DType* A_data = A.Ptr(); const DType* dC_data = dC.Ptr(); const IdType* seglen_data = seglen.Ptr(); @@ -260,8 +261,8 @@ void SegmentMMBackwardB( auto* thr_entry = runtime::CUDAThreadEntry::ThreadLocal(); if (!thr_entry->cublas_handle) - CUBLAS_CALL(cublasCreate(&(thr_entry->cublas_handle))); - CUBLAS_CALL(cublasSetStream(thr_entry->cublas_handle, stream)); + CUBLAS_CALL(hipblasCreate(&(thr_entry->cublas_handle))); + CUBLAS_CALL(hipblasSetStream(thr_entry->cublas_handle, stream)); IdType k_offset = 0; for (IdType etype = 0; etype < num_rel; ++etype) { @@ -271,8 +272,8 @@ void SegmentMMBackwardB( CHECK_LE(k_offset + k, A->shape[0]) << "Segement index out of bound of A->shape[0]."; int lddC = m, ldA = n, lddB = m; - cublasOperation_t trans_dC = CUBLAS_OP_N; - cublasOperation_t trans_A = CUBLAS_OP_T; + hipblasOperation_t trans_dC = HIPBLAS_OP_N; + hipblasOperation_t trans_A = HIPBLAS_OP_T; CUBLAS_CALL(cublasGemm( thr_entry->cublas_handle, trans_dC, trans_A, m, n, k, &alpha, dC_data + dC_offset, lddC, A_data + A_offset, ldA, &beta, @@ -299,7 +300,7 @@ void GatherMM( const NDArray A, const NDArray B, NDArray C, const NDArray idx_a, const NDArray idx_b) { auto device = runtime::DeviceAPI::Get(A->ctx); - cudaStream_t stream = runtime::getCurrentCUDAStream(); + hipStream_t stream = runtime::getCurrentCUDAStream(); int64_t out_len = B->shape[2]; // cols of B int64_t in_len = A->shape[1]; // cols of A const int64_t tot_num_rows = A->shape[0]; @@ -332,7 +333,7 @@ void GatherMMScatter( const NDArray A, const NDArray B, NDArray C, const NDArray idx_a, const NDArray idx_b, const NDArray idx_c) { auto device = runtime::DeviceAPI::Get(A->ctx); - cudaStream_t stream = runtime::getCurrentCUDAStream(); + hipStream_t stream = runtime::getCurrentCUDAStream(); const IdType* idx_c_data = idx_c.Ptr(); int64_t out_len = (B->ndim == 2) ? 
B->shape[1] : B->shape[2]; // cols of B int64_t in_len = A->shape[1]; // cols of A @@ -367,10 +368,10 @@ template void GatherMM( const NDArray A, const NDArray B, NDArray C, const NDArray idx_a, const NDArray idx_b); #if BF16_ENABLED -template void GatherMM( +template void GatherMM( const NDArray A, const NDArray B, NDArray C, const NDArray idx_a, const NDArray idx_b); -template void GatherMM( +template void GatherMM( const NDArray A, const NDArray B, NDArray C, const NDArray idx_a, const NDArray idx_b); #endif // BF16_ENABLED @@ -394,10 +395,10 @@ template void GatherMMScatter( const NDArray A, const NDArray B, NDArray C, const NDArray idx_a, const NDArray idx_b, const NDArray idx_c); #if BF16_ENABLED -template void GatherMMScatter( +template void GatherMMScatter( const NDArray A, const NDArray B, NDArray C, const NDArray idx_a, const NDArray idx_b, const NDArray idx_c); -template void GatherMMScatter( +template void GatherMMScatter( const NDArray A, const NDArray B, NDArray C, const NDArray idx_a, const NDArray idx_b, const NDArray idx_c); #endif // BF16_ENABLED @@ -421,10 +422,10 @@ template void SegmentMM( const NDArray A, const NDArray B, NDArray C, const NDArray seglen_A, bool a_trans, bool b_trans); #if BF16_ENABLED -template void SegmentMM( +template void SegmentMM( const NDArray A, const NDArray B, NDArray C, const NDArray seglen_A, bool a_trans, bool b_trans); -template void SegmentMM( +template void SegmentMM( const NDArray A, const NDArray B, NDArray C, const NDArray seglen_A, bool a_trans, bool b_trans); #endif // BF16_ENABLED @@ -446,9 +447,9 @@ template void SegmentMMBackwardB( template void SegmentMMBackwardB( const NDArray A, const NDArray dC, NDArray dB, const NDArray seglen); #if BF16_ENABLED -template void SegmentMMBackwardB( +template void SegmentMMBackwardB( const NDArray A, const NDArray dC, NDArray dB, const NDArray seglen); -template void SegmentMMBackwardB( +template void SegmentMMBackwardB( const NDArray A, const NDArray dC, NDArray dB, const NDArray seglen); #endif // BF16_ENABLED template void SegmentMMBackwardB( diff --git a/src/array/cuda/gather_mm.cu.prehip b/src/array/cuda/gather_mm.cu.prehip new file mode 100644 index 000000000000..c40d53bb05ec --- /dev/null +++ b/src/array/cuda/gather_mm.cu.prehip @@ -0,0 +1,464 @@ +/** + * Copyright (c) 2020 by Contributors + * @file array/cuda/gather_mm.cu + * @brief GatherMM C APIs and definitions. + */ +#include + +#include // std::swap + +#include "./atomic.cuh" +#include "./functor.cuh" +#include "./utils.h" + +namespace dgl { +using namespace cuda; +namespace aten { + +namespace { + +/** @brief Call cuBLAS GEMM API for dense matmul operation for float and double. 
+ */ +template +cublasStatus_t cublasGemm( + cublasHandle_t handle, cublasOperation_t transa, cublasOperation_t transb, + int m, int n, int k, const DType* alpha, const DType* A, int lda, + const DType* B, int ldb, const DType* beta, DType* C, int ldc) { + LOG(INFO) << "Not supported dtype"; + return CUBLAS_STATUS_EXECUTION_FAILED; +} + +template <> +cublasStatus_t cublasGemm<__half>( + cublasHandle_t handle, cublasOperation_t transa, cublasOperation_t transb, + int m, int n, int k, const __half* alpha, const __half* A, int lda, + const __half* B, int ldb, const __half* beta, __half* C, int ldc) { + return cublasHgemm( + handle, transa, transb, m, n, k, alpha, A, lda, B, ldb, beta, C, ldc); +} + +#if BF16_ENABLED +template <> +cublasStatus_t cublasGemm<__nv_bfloat16>( + cublasHandle_t handle, cublasOperation_t transa, cublasOperation_t transb, + int m, int n, int k, const __nv_bfloat16* alpha, const __nv_bfloat16* A, + int lda, const __nv_bfloat16* B, int ldb, const __nv_bfloat16* beta, + __nv_bfloat16* C, int ldc) { + float alpha_float = __bfloat162float(*alpha); + float beta_float = __bfloat162float(*beta); + return cublasGemmEx( + handle, transa, transb, m, n, k, &alpha_float, A, CUDA_R_16BF, lda, B, + CUDA_R_16BF, ldb, &beta_float, C, CUDA_R_16BF, ldc, CUBLAS_COMPUTE_32F, + CUBLAS_GEMM_DEFAULT_TENSOR_OP); +} +#endif // BF16_ENABLED + +template <> +cublasStatus_t cublasGemm( + cublasHandle_t handle, cublasOperation_t transa, cublasOperation_t transb, + int m, int n, int k, const float* alpha, const float* A, int lda, + const float* B, int ldb, const float* beta, float* C, int ldc) { + return cublasSgemm( + handle, transa, transb, m, n, k, alpha, A, lda, B, ldb, beta, C, ldc); +} + +template <> +cublasStatus_t cublasGemm( + cublasHandle_t handle, cublasOperation_t transa, cublasOperation_t transb, + int m, int n, int k, const double* alpha, const double* A, int lda, + const double* B, int ldb, const double* beta, double* C, int ldc) { + return cublasDgemm( + handle, transa, transb, m, n, k, alpha, A, lda, B, ldb, beta, C, ldc); +} + +} // namespace + +namespace cuda { + +/** + * @note Each row of A multiplies a segment of matrix of B of dimension in_len * + * outlen. One warp is assigned to process one row of A. Each WARP sequentially + * multiplies one element of A and a row of B to compute partial result of the + * output. A is loaded in shared memory in a coalesced way. Output matrix is + * loaded in registers. B should get benefit from L2 cache. + */ +template +__global__ void GatherMMScatterKernel( + const DType* __restrict__ A, const DType* __restrict__ B, + DType* __restrict__ C, const Idx* __restrict__ idx_a, + const Idx* __restrict__ idx_b, const Idx* __restrict__ idx_c, + const int64_t num_rows, const int64_t in_len, const int64_t out_len) { + unsigned int tId = threadIdx.x; + unsigned int laneId = tId & 31; + unsigned int gId = (blockIdx.x * blockDim.x + threadIdx.x); + unsigned int warpId = gId >> 5; + unsigned int row = warpId; + if (row < num_rows) { + const unsigned int local_row = + row & 3; // hardcoded for TB size 128 (4 warps) + const Idx cur_rowA = (idx_a) ? idx_a[row] : row; + const Idx cur_rowB = (idx_b) ? idx_b[row] : row; + const Idx cur_rowC = (idx_c) ? 
idx_c[row] : row; + const Idx B_offset = cur_rowB * in_len * out_len; + const int sh_a_tile = 64; + __shared__ DType sh_A[4 * sh_a_tile]; + int a_tile = sh_a_tile; + for (unsigned int k_start = 0; k_start < in_len; k_start += 64) { + if ((in_len - k_start) < a_tile) a_tile = in_len - k_start; + // Load A in shared mem in a coalesced way + for (unsigned int l = laneId; l < a_tile; l += 32) + sh_A[local_row * sh_a_tile + l] = A[cur_rowA * in_len + (k_start + l)]; + __syncwarp(); + + for (unsigned int outloop = 0; outloop < out_len; outloop += 32) { + DType out_reg = static_cast(0.0f); // thread private + const unsigned int l = laneId; + if (l < out_len) { + // iterate over elements of a row of A + for (unsigned int i = 0; i < a_tile; i++) { + const DType a_val = sh_A[local_row * sh_a_tile + i]; + // iterate over elements of a row of B in parallel + out_reg += + a_val * B[B_offset + ((i + k_start) * out_len + (outloop + l))]; + } + if (idx_c) { + AtomicAdd(C + cur_rowC * out_len + (outloop + l), out_reg); + } else { + C[cur_rowC * out_len + (outloop + l)] += out_reg; + } + } + } + } + } +} + +/** + * @note Output matrix is accumulated via atomic operations. Rest of the + * strategies are similar to GatherMMKernel. One warp is assigned to process one + * row of A. Each WARP sequentially multiplies one element of A and a row of B + * to compute partial result of the output. A is loaded in shared memory in a + * coalesced way. B should get benefit from L2 cache. + */ +template +__global__ void GatherMMScatterKernel2( + const DType* __restrict__ A, const DType* __restrict__ B, + DType* __restrict__ C, const Idx* __restrict__ idx_a, + const Idx* __restrict__ idx_b, const Idx* __restrict__ idx_c, + const int64_t num_rows, const int64_t in_len, const int64_t out_len) { + unsigned int tId = threadIdx.x; + unsigned int laneId = tId & 31; + unsigned int gId = (blockIdx.x * blockDim.x + threadIdx.x); + unsigned int warpId = gId >> 5; + unsigned int row = warpId; + if (row < num_rows) { + const unsigned int local_row = + row & 3; // hardcoded for TB size 128 (4 warps) + const Idx row_a = (idx_a) ? idx_a[row] : row; + const Idx row_b = (idx_b) ? idx_b[row] : row; + const Idx row_c = (idx_c) ? idx_c[row] : row; + const Idx C_offset = row_c * in_len * out_len; + const int sh_a_tile = 64; + __shared__ DType sh_A[4 * sh_a_tile]; + int a_tile = sh_a_tile; + for (unsigned int k_start = 0; k_start < in_len; k_start += 64) { + if ((in_len - k_start) < a_tile) a_tile = in_len - k_start; + /* Load A in shared mem in a coalesced way */ + for (unsigned int l = laneId; l < a_tile; l += 32) + sh_A[local_row * sh_a_tile + l] = A[row_a * in_len + (k_start + l)]; + __syncwarp(); + + for (unsigned int outloop = 0; outloop < out_len; outloop += 32) { + DType out_reg = static_cast(0.0f); // thread private + const unsigned int l = laneId; + if (l < out_len) { + const DType b_val = B[row_b * out_len + (outloop + l)]; + /* iterate over elements of a row of A */ + for (unsigned int i = 0; i < a_tile; i++) { + const DType a_val = sh_A[local_row * sh_a_tile + i]; + const Idx C_idx = + C_offset + ((i + k_start) * out_len + (outloop + l)); + AtomicAdd(C + C_idx, a_val * b_val); + } + } + } + } + } +} + +} // namespace cuda + +/** + * @brief Implementation of Gather_mm operator. The input matrix A is + * expected to be sorted according to relation type. 
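+ * Rows of A are consumed segment by segment: for each relation type the next + * seglen_A[etype] rows of A are multiplied by the corresponding k x n block of B + * with a single cuBLAS GEMM call, and the A/B/C offsets are advanced by m*k, k*n + * and m*n before moving to the next segment.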
+ * @param A The input dense matrix of dimension m x k + * @param B The input dense matrix of dimension k x n + * @param C The output dense matrix of dimension m x n + * @param seglen_A The input vector of size R. Each element + * is the length of segments of input ``A`` + * @param a_trans Matrix A to be transposed + * @param b_trans Matrix B to be transposed + */ +template +void SegmentMM( + const NDArray A, const NDArray B, NDArray C, const NDArray seglen_A, + bool a_trans, bool b_trans) { + auto device = runtime::DeviceAPI::Get(A->ctx); + cudaStream_t stream = runtime::getCurrentCUDAStream(); + const DType* A_data = A.Ptr(); + const DType* B_data = B.Ptr(); + const IdType* seglen_A_data = seglen_A.Ptr(); + DType* C_data = C.Ptr(); + int64_t A_offset = 0, B_offset = 0, C_offset = 0; + int64_t m, n, k; + int64_t num_rel = seglen_A.NumElements(); + DType alpha = 1., beta = 0.; + + auto* thr_entry = runtime::CUDAThreadEntry::ThreadLocal(); + if (!thr_entry->cublas_handle) + CUBLAS_CALL(cublasCreate(&(thr_entry->cublas_handle))); + CUBLAS_CALL(cublasSetStream(thr_entry->cublas_handle, stream)); + + IdType m_offset = 0; + for (IdType etype = 0; etype < num_rel; ++etype) { + m = seglen_A_data[etype]; // rows of A + CHECK_LE(m_offset + m, A->shape[0]) + << "Segment index out of bound of A->shape[0]."; + n = B->shape[2]; // cols of B + k = B->shape[1]; // cols of A == rows of B + int ldb = n, lda = k, ldc = n; + cublasOperation_t transB = CUBLAS_OP_N; + cublasOperation_t transA = CUBLAS_OP_N; + if (b_trans) { + transB = CUBLAS_OP_T; + ldb = n, lda = n, ldc = k; + std::swap(n, k); + } + CUBLAS_CALL(cublasGemm( + thr_entry->cublas_handle, transB, transA, n, m, k, &alpha, + B_data + B_offset, ldb, A_data + A_offset, lda, &beta, + C_data + C_offset, ldc)); + A_offset += m * k; + B_offset += k * n; + C_offset += m * n; + m_offset += m; + } +} + +template +void SegmentMMBackwardB( + const NDArray A, const NDArray dC, NDArray dB, const NDArray seglen) { + auto device = runtime::DeviceAPI::Get(A->ctx); + cudaStream_t stream = runtime::getCurrentCUDAStream(); + const DType* A_data = A.Ptr(); + const DType* dC_data = dC.Ptr(); + const IdType* seglen_data = seglen.Ptr(); + DType* dB_data = dB.Ptr(); + int64_t A_offset = 0, dC_offset = 0, dB_offset = 0; + int64_t m, n, k; + int64_t num_rel = seglen.NumElements(); + DType alpha = 1., beta = 0.; + + auto* thr_entry = runtime::CUDAThreadEntry::ThreadLocal(); + if (!thr_entry->cublas_handle) + CUBLAS_CALL(cublasCreate(&(thr_entry->cublas_handle))); + CUBLAS_CALL(cublasSetStream(thr_entry->cublas_handle, stream)); + + IdType k_offset = 0; + for (IdType etype = 0; etype < num_rel; ++etype) { + m = dC->shape[1]; + n = A->shape[1]; + k = seglen_data[etype]; + CHECK_LE(k_offset + k, A->shape[0]) + << "Segement index out of bound of A->shape[0]."; + int lddC = m, ldA = n, lddB = m; + cublasOperation_t trans_dC = CUBLAS_OP_N; + cublasOperation_t trans_A = CUBLAS_OP_T; + CUBLAS_CALL(cublasGemm( + thr_entry->cublas_handle, trans_dC, trans_A, m, n, k, &alpha, + dC_data + dC_offset, lddC, A_data + A_offset, ldA, &beta, + dB_data + dB_offset, lddB)); + dC_offset += m * k; + A_offset += n * k; + dB_offset += m * n; + k_offset += k; + } +} + +/** + * @brief Implementation of Gather_mm operator. The input matrix A is + * expected to be sorted according to relation type. 
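+ * This entry point launches GatherMMScatterKernel with one warp per gathered row + * and a null idx_c, so each warp accumulates its partial products straight into + * the matching row of C instead of scattering them with atomic adds.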
+ * @param A The input dense matrix of dimension m x k + * @param B The input dense matrix of dimension k x n + * @param C The output dense matrix of dimension m x n + * @param idx_a The input vector to gather left hand operand on + * @param idx_b The input vector to gather right hand operand on + */ + +template +void GatherMM( + const NDArray A, const NDArray B, NDArray C, const NDArray idx_a, + const NDArray idx_b) { + auto device = runtime::DeviceAPI::Get(A->ctx); + cudaStream_t stream = runtime::getCurrentCUDAStream(); + int64_t out_len = B->shape[2]; // cols of B + int64_t in_len = A->shape[1]; // cols of A + const int64_t tot_num_rows = A->shape[0]; + const int ntx = 128; + const int warp_size = 32; + const int nbx = ((tot_num_rows * warp_size + ntx - 1) / ntx); + const dim3 nblks(nbx); + const dim3 nthrs(ntx); + CUDA_KERNEL_CALL( + (cuda::GatherMMScatterKernel), nblks, nthrs, 0, stream, + A.Ptr(), B.Ptr(), C.Ptr(), idx_a.Ptr(), + idx_b.Ptr(), nullptr, tot_num_rows, in_len, out_len); +} + +/** + * @brief Implementation of Gather_mm operator. The input matrix A is + * expected to be sorted according to relation type. + * @param A The input dense matrix of dimension m x k + * @param B The input dense matrix of dimension k x n + * @param C The output dense matrix of dimension m x n + * @param idx_a The input vector to gather left hand operand on + * @param idx_b The input vector to gather right hand operand on + * @param idx_c The input vector to gather output operand on + * @param num_rel The number of idx types in idx_b + * @param a_trans Matrix A to be transposed + * @param b_trans Matrix B to be transposed + */ +template +void GatherMMScatter( + const NDArray A, const NDArray B, NDArray C, const NDArray idx_a, + const NDArray idx_b, const NDArray idx_c) { + auto device = runtime::DeviceAPI::Get(A->ctx); + cudaStream_t stream = runtime::getCurrentCUDAStream(); + const IdType* idx_c_data = idx_c.Ptr(); + int64_t out_len = (B->ndim == 2) ? 
B->shape[1] : B->shape[2]; // cols of B + int64_t in_len = A->shape[1]; // cols of A + int64_t tot_num_rows = A->shape[0]; + const int ntx = 128; + const int warp_size = 32; + const int nbx = ((tot_num_rows * warp_size + ntx - 1) / ntx); + const dim3 nblks(nbx); + const dim3 nthrs(ntx); + if (B->ndim == 3) { + CUDA_KERNEL_CALL( + (cuda::GatherMMScatterKernel), nblks, nthrs, 0, stream, + A.Ptr(), B.Ptr(), C.Ptr(), idx_a.Ptr(), + idx_b.Ptr(), idx_c.Ptr(), tot_num_rows, in_len, + out_len); + } else { + // Custom kernel for W_grad[idx_c[i]] = H^T[i] * C.grad[i] + // This kernel accesses rows of A in a transposed way w/o explicitly + // converting A + CUDA_KERNEL_CALL( + (cuda::GatherMMScatterKernel2), nblks, nthrs, 0, stream, + A.Ptr(), B.Ptr(), C.Ptr(), idx_a.Ptr(), + idx_b.Ptr(), idx_c.Ptr(), tot_num_rows, in_len, + out_len); + } +} + +template void GatherMM( + const NDArray A, const NDArray B, NDArray C, const NDArray idx_a, + const NDArray idx_b); +template void GatherMM( + const NDArray A, const NDArray B, NDArray C, const NDArray idx_a, + const NDArray idx_b); +#if BF16_ENABLED +template void GatherMM( + const NDArray A, const NDArray B, NDArray C, const NDArray idx_a, + const NDArray idx_b); +template void GatherMM( + const NDArray A, const NDArray B, NDArray C, const NDArray idx_a, + const NDArray idx_b); +#endif // BF16_ENABLED +template void GatherMM( + const NDArray A, const NDArray B, NDArray C, const NDArray idx_a, + const NDArray idx_b); +template void GatherMM( + const NDArray A, const NDArray B, NDArray C, const NDArray idx_a, + const NDArray idx_b); +template void GatherMM( + const NDArray A, const NDArray B, NDArray C, const NDArray idx_a, + const NDArray idx_b); +template void GatherMM( + const NDArray A, const NDArray B, NDArray C, const NDArray idx_a, + const NDArray idx_b); + +template void GatherMMScatter( + const NDArray A, const NDArray B, NDArray C, const NDArray idx_a, + const NDArray idx_b, const NDArray idx_c); +template void GatherMMScatter( + const NDArray A, const NDArray B, NDArray C, const NDArray idx_a, + const NDArray idx_b, const NDArray idx_c); +#if BF16_ENABLED +template void GatherMMScatter( + const NDArray A, const NDArray B, NDArray C, const NDArray idx_a, + const NDArray idx_b, const NDArray idx_c); +template void GatherMMScatter( + const NDArray A, const NDArray B, NDArray C, const NDArray idx_a, + const NDArray idx_b, const NDArray idx_c); +#endif // BF16_ENABLED +template void GatherMMScatter( + const NDArray A, const NDArray B, NDArray C, const NDArray idx_a, + const NDArray idx_b, const NDArray idx_c); +template void GatherMMScatter( + const NDArray A, const NDArray B, NDArray C, const NDArray idx_a, + const NDArray idx_b, const NDArray idx_c); +template void GatherMMScatter( + const NDArray A, const NDArray B, NDArray C, const NDArray idx_a, + const NDArray idx_b, const NDArray idx_c); +template void GatherMMScatter( + const NDArray A, const NDArray B, NDArray C, const NDArray idx_a, + const NDArray idx_b, const NDArray idx_c); + +template void SegmentMM( + const NDArray A, const NDArray B, NDArray C, const NDArray seglen_A, + bool a_trans, bool b_trans); +template void SegmentMM( + const NDArray A, const NDArray B, NDArray C, const NDArray seglen_A, + bool a_trans, bool b_trans); +#if BF16_ENABLED +template void SegmentMM( + const NDArray A, const NDArray B, NDArray C, const NDArray seglen_A, + bool a_trans, bool b_trans); +template void SegmentMM( + const NDArray A, const NDArray B, NDArray C, const NDArray seglen_A, + bool a_trans, bool 
b_trans); +#endif // BF16_ENABLED +template void SegmentMM( + const NDArray A, const NDArray B, NDArray C, const NDArray seglen_A, + bool a_trans, bool b_trans); +template void SegmentMM( + const NDArray A, const NDArray B, NDArray C, const NDArray seglen_A, + bool a_trans, bool b_trans); +template void SegmentMM( + const NDArray A, const NDArray B, NDArray C, const NDArray seglen_A, + bool a_trans, bool b_trans); +template void SegmentMM( + const NDArray A, const NDArray B, NDArray C, const NDArray seglen_A, + bool a_trans, bool b_trans); + +template void SegmentMMBackwardB( + const NDArray A, const NDArray dC, NDArray dB, const NDArray seglen); +template void SegmentMMBackwardB( + const NDArray A, const NDArray dC, NDArray dB, const NDArray seglen); +#if BF16_ENABLED +template void SegmentMMBackwardB( + const NDArray A, const NDArray dC, NDArray dB, const NDArray seglen); +template void SegmentMMBackwardB( + const NDArray A, const NDArray dC, NDArray dB, const NDArray seglen); +#endif // BF16_ENABLED +template void SegmentMMBackwardB( + const NDArray A, const NDArray dC, NDArray dB, const NDArray seglen); +template void SegmentMMBackwardB( + const NDArray A, const NDArray dC, NDArray dB, const NDArray seglen); +template void SegmentMMBackwardB( + const NDArray A, const NDArray dC, NDArray dB, const NDArray seglen); +template void SegmentMMBackwardB( + const NDArray A, const NDArray dC, NDArray dB, const NDArray seglen); + +} // namespace aten +} // namespace dgl diff --git a/src/array/cuda/ge_spmm.cuh b/src/array/cuda/ge_spmm.cuh index 863f90e97876..2f8939fbb86f 100644 --- a/src/array/cuda/ge_spmm.cuh +++ b/src/array/cuda/ge_spmm.cuh @@ -1,3 +1,4 @@ +#include "hip/hip_runtime.h" /** * Copyright (c) 2020 by Contributors * @file array/cuda/ge_spmm.cuh @@ -121,7 +122,7 @@ void GESpMMCsr( const DType* efeat_data = efeat.Ptr(); DType* out_data = out.Ptr(); - cudaStream_t stream = runtime::getCurrentCUDAStream(); + hipStream_t stream = runtime::getCurrentCUDAStream(); const int ntx = 32; const int nty = 32; diff --git a/src/array/cuda/ge_spmm.cuh.prehip b/src/array/cuda/ge_spmm.cuh.prehip new file mode 100644 index 000000000000..863f90e97876 --- /dev/null +++ b/src/array/cuda/ge_spmm.cuh.prehip @@ -0,0 +1,144 @@ +/** + * Copyright (c) 2020 by Contributors + * @file array/cuda/ge_spmm.cuh + * @brief GE-SpMM CUDA kernel function header. + */ +#ifndef DGL_ARRAY_CUDA_GE_SPMM_CUH_ +#define DGL_ARRAY_CUDA_GE_SPMM_CUH_ + +#include "../../runtime/cuda/cuda_common.h" +#include "./utils.h" +#include "atomic.cuh" +#include "macro.cuh" + +namespace dgl { + +using namespace cuda; + +namespace aten { +namespace cuda { + +/** + * @brief CUDA kernel of GE-SpMM on Csr. + * @note GE-SpMM: https://arxiv.org/pdf/2007.03179.pdf + * The grid dimension x and y are reordered for better performance. 
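+ * Each thread accumulates two feature columns 32 apart (64 columns per block + * along y), the nonzeros of a row are consumed in chunks of 32 via __ldg loads, + * and the last feature block takes a guarded path so columns beyond feat_len are + * never written.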
+ */ +template +__global__ void GESpMMKernel( + const DType* __restrict__ ufeat, const DType* __restrict__ efeat, + DType* __restrict__ out, const Idx* __restrict__ indptr, + const Idx* __restrict__ indices, const int64_t num_rows, + const int64_t num_cols, const int64_t feat_len) { + const Idx rid = + blockIdx.x * blockDim.y + threadIdx.y; // over vertices dimension + const Idx fid = (blockIdx.y * 64) + threadIdx.x; // over feature dimension + + if (rid < num_rows && fid < feat_len) { + const Idx low = __ldg(indptr + rid), high = __ldg(indptr + rid + 1); + DType accum_0 = 0., accum_1 = 0.; + + if (blockIdx.y != gridDim.y - 1) { // fid + 32 < feat_len + for (Idx left = low; left < high; left += 32) { + if (left + 32 <= high) { +#pragma unroll + for (Idx i = 0; i < 32; ++i) { + const Idx eid = left + i; + const Idx cid = __ldg(indices + eid); + const Idx offset = feat_len * cid + fid; + if (BinaryOp::use_rhs) { + accum_0 += BinaryOp::Call(ufeat + offset, efeat + eid); + accum_1 += BinaryOp::Call(ufeat + offset + 32, efeat + eid); + } else { + accum_0 += ufeat[offset]; + accum_1 += ufeat[offset + 32]; + } + } + } else { + for (Idx i = 0; left + i < high; ++i) { + const Idx eid = left + i; + const Idx cid = __ldg(indices + eid); + const Idx offset = feat_len * cid + fid; + if (BinaryOp::use_rhs) { + accum_0 += BinaryOp::Call(ufeat + offset, efeat + eid); + accum_1 += BinaryOp::Call(ufeat + offset + 32, efeat + eid); + } else { + accum_0 += ufeat[offset]; + accum_1 += ufeat[offset + 32]; + } + } + } + + out[feat_len * rid + fid] = accum_0; + out[feat_len * rid + fid + 32] = accum_1; + } + } else { + const Idx fid_0 = fid < feat_len ? fid : 0, + fid_1 = fid + 32 < feat_len ? fid + 32 : 0; + for (int left = low; left < high; left += 32) { + if (left + 32 <= high) { +#pragma unroll + for (int i = 0; i < 32; ++i) { + const Idx eid = left + i; + const Idx cid = __ldg(indices + eid); + const Idx offset = feat_len * cid; + if (BinaryOp::use_rhs) { + accum_0 += BinaryOp::Call(ufeat + offset + fid_0, efeat + eid); + accum_1 += BinaryOp::Call(ufeat + offset + fid_1, efeat + eid); + } else { + accum_0 += ufeat[offset + fid_0]; + accum_1 += ufeat[offset + fid_1]; + } + } + } else { + for (int i = 0; i + left < high; ++i) { + const Idx eid = left + i; + const Idx cid = __ldg(indices + eid); + const Idx offset = feat_len * cid; + if (BinaryOp::use_rhs) { + accum_0 += BinaryOp::Call(ufeat + offset + fid_0, efeat + eid); + accum_1 += BinaryOp::Call(ufeat + offset + fid_1, efeat + eid); + } else { + accum_0 += ufeat[offset + fid_0]; + accum_1 += ufeat[offset + fid_1]; + } + } + } + + out[feat_len * rid + fid] = accum_0; + if (fid + 32 < feat_len) out[feat_len * rid + fid + 32] = accum_1; + } + } + } +} + +template +void GESpMMCsr( + const CSRMatrix& csr, NDArray ufeat, NDArray efeat, NDArray out, + int64_t feat_len) { + const Idx* indptr = csr.indptr.Ptr(); + const Idx* indices = csr.indices.Ptr(); + const DType* ufeat_data = ufeat.Ptr(); + const DType* efeat_data = efeat.Ptr(); + DType* out_data = out.Ptr(); + + cudaStream_t stream = runtime::getCurrentCUDAStream(); + + const int ntx = 32; + const int nty = 32; + const int nby = (feat_len + (ntx * 2) - 1) / (ntx * 2); + const int nbx = (csr.num_rows + nty - 1) / nty; + const dim3 nblks(nbx, nby); + const dim3 nthrs(ntx, nty); + const int sh_mem_size = 0; + + CUDA_KERNEL_CALL( + (GESpMMKernel), nblks, nthrs, sh_mem_size, stream, + ufeat_data, efeat_data, out_data, indptr, indices, csr.num_rows, + csr.num_cols, feat_len); +} + +} // namespace cuda +} // 
namespace aten +} // namespace dgl + +#endif // DGL_ARRAY_CUDA_GE_SPMM_CUH_ diff --git a/src/array/cuda/labor_sampling.cu b/src/array/cuda/labor_sampling.cu index c5076e62981d..3125a1bc3390 100644 --- a/src/array/cuda/labor_sampling.cu +++ b/src/array/cuda/labor_sampling.cu @@ -1,3 +1,4 @@ +#include "hip/hip_runtime.h" /*! * Copyright (c) 2022, NVIDIA Corporation * Copyright (c) 2022, GT-TDAlab (Muhammed Fatih Balin & Umit V. Catalyurek) @@ -35,7 +36,7 @@ #include #include -#include // NOLINT +#include // NOLINT #include #include #include @@ -276,7 +277,7 @@ __global__ void _CSRRowWiseLayerSampleDegreeKernel( const FloatType* const ds, const FloatType* const d2s, const IdType* const indptr, const FloatType* const probs, const FloatType* const A, const IdType* const subindptr) { - typedef cub::BlockReduce BlockReduce; + typedef hipcub::BlockReduce BlockReduce; __shared__ typename BlockReduce::TempStorage temp_storage; __shared__ FloatType var_1_bcast[BLOCK_CTAS]; @@ -350,7 +351,7 @@ int log_size(const IdType size) { template void compute_importance_sampling_probabilities( - CSRMatrix mat, const IdType hop_size, cudaStream_t stream, + CSRMatrix mat, const IdType hop_size, hipStream_t stream, const continuous_seed seed, const IdType num_rows, const IdType* indptr, const IdType* subindptr, const IdType* indices, IdArray idx_coo_arr, const IdType* nids, @@ -397,17 +398,17 @@ void compute_importance_sampling_probabilities( hop_1, 0, hop_2.get(), 0, sizeof(IdType) * hop_size, ctx, ctx, mat.indptr->dtype); - cub::DoubleBuffer hop_b(hop_2.get(), hop_3.get()); + hipcub::DoubleBuffer hop_b(hop_2.get(), hop_3.get()); { std::size_t temp_storage_bytes = 0; - CUDA_CALL(cub::DeviceRadixSort::SortKeys( + CUDA_CALL(hipcub::DeviceRadixSort::SortKeys( nullptr, temp_storage_bytes, hop_b, hop_size, 0, max_log_num_vertices, stream)); auto temp = allocator.alloc_unique(temp_storage_bytes); - CUDA_CALL(cub::DeviceRadixSort::SortKeys( + CUDA_CALL(hipcub::DeviceRadixSort::SortKeys( temp.get(), temp_storage_bytes, hop_b, hop_size, 0, max_log_num_vertices, stream)); } @@ -417,13 +418,13 @@ void compute_importance_sampling_probabilities( { std::size_t temp_storage_bytes = 0; - CUDA_CALL(cub::DeviceRunLengthEncode::Encode( + CUDA_CALL(hipcub::DeviceRunLengthEncode::Encode( nullptr, temp_storage_bytes, hop_b.Current(), hop_unique.get(), hop_counts.get(), hop_unique_size.get(), hop_size, stream)); auto temp = allocator.alloc_unique(temp_storage_bytes); - CUDA_CALL(cub::DeviceRunLengthEncode::Encode( + CUDA_CALL(hipcub::DeviceRunLengthEncode::Encode( temp.get(), temp_storage_bytes, hop_b.Current(), hop_unique.get(), hop_counts.get(), hop_unique_size.get(), hop_size, stream)); @@ -521,7 +522,7 @@ std::pair CSRLaborSampling( runtime::CUDAWorkspaceAllocator allocator(ctx); const auto stream = runtime::getCurrentCUDAStream(); - const auto exec_policy = thrust::cuda::par_nosync(allocator).on(stream); + const auto exec_policy = thrust::hip::par_nosync(allocator).on(stream); auto device = runtime::DeviceAPI::Get(ctx); @@ -568,11 +569,11 @@ std::pair CSRLaborSampling( auto ds_d2s = thrust::make_zip_iterator(ds, d2s); size_t prefix_temp_size = 0; - CUDA_CALL(cub::DeviceSegmentedReduce::Reduce( + CUDA_CALL(hipcub::DeviceSegmentedReduce::Reduce( nullptr, prefix_temp_size, A_A2, ds_d2s, num_rows, b_offsets, e_offsets, TupleSum{}, thrust::make_tuple((FloatType)0, (FloatType)0), stream)); auto temp = allocator.alloc_unique(prefix_temp_size); - CUDA_CALL(cub::DeviceSegmentedReduce::Reduce( + 
CUDA_CALL(hipcub::DeviceSegmentedReduce::Reduce( temp.get(), prefix_temp_size, A_A2, ds_d2s, num_rows, b_offsets, e_offsets, TupleSum{}, thrust::make_tuple((FloatType)0, (FloatType)0), stream)); @@ -585,11 +586,11 @@ std::pair CSRLaborSampling( IdType hop_size; { size_t prefix_temp_size = 0; - CUDA_CALL(cub::DeviceScan::ExclusiveSum( + CUDA_CALL(hipcub::DeviceScan::ExclusiveSum( nullptr, prefix_temp_size, in_deg.get(), subindptr, num_rows + 1, stream)); auto temp = allocator.alloc_unique(prefix_temp_size); - CUDA_CALL(cub::DeviceScan::ExclusiveSum( + CUDA_CALL(hipcub::DeviceScan::ExclusiveSum( temp.get(), prefix_temp_size, in_deg.get(), subindptr, num_rows + 1, stream)); @@ -618,11 +619,11 @@ std::pair CSRLaborSampling( auto modified_in_deg = thrust::make_transform_iterator( iota, AlignmentFunc{in_deg.get(), perm, num_rows}); size_t prefix_temp_size = 0; - CUDA_CALL(cub::DeviceScan::ExclusiveSum( + CUDA_CALL(hipcub::DeviceScan::ExclusiveSum( nullptr, prefix_temp_size, modified_in_deg, subindptr_aligned.get(), num_rows + 1, stream)); auto temp = allocator.alloc_unique(prefix_temp_size); - CUDA_CALL(cub::DeviceScan::ExclusiveSum( + CUDA_CALL(hipcub::DeviceScan::ExclusiveSum( temp.get(), prefix_temp_size, modified_in_deg, subindptr_aligned.get(), num_rows + 1, stream)); diff --git a/src/array/cuda/labor_sampling.cu.prehip b/src/array/cuda/labor_sampling.cu.prehip new file mode 100644 index 000000000000..c5076e62981d --- /dev/null +++ b/src/array/cuda/labor_sampling.cu.prehip @@ -0,0 +1,833 @@ +/*! + * Copyright (c) 2022, NVIDIA Corporation + * Copyright (c) 2022, GT-TDAlab (Muhammed Fatih Balin & Umit V. Catalyurek) + * All rights reserved. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + * + * @file array/cuda/labor_sampling.cu + * @brief labor sampling + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include // NOLINT +#include +#include +#include +#include + +#include "../../array/cuda/utils.h" +#include "../../random/continuous_seed.h" +#include "../../runtime/cuda/cuda_common.h" +#include "./functor.cuh" +#include "./spmm.cuh" + +namespace dgl { +namespace aten { +namespace impl { + +using dgl::random::continuous_seed; + +constexpr int BLOCK_SIZE = 128; +constexpr int CTA_SIZE = 128; +constexpr double eps = 0.0001; + +namespace { + +template +struct TransformOp { + const IdType* idx_coo; + const IdType* rows; + const IdType* indptr; + const IdType* subindptr; + const IdType* indices; + const IdType* data_arr; + bool is_pinned; + __host__ __device__ auto operator()(IdType idx) { + const auto in_row = idx_coo[idx]; + const auto row = rows[in_row]; + const auto in_idx = indptr[in_row] + idx - subindptr[in_row]; + const auto u = indices[is_pinned ? idx : in_idx]; + const auto data = data_arr ? 
data_arr[in_idx] : in_idx; + return thrust::make_tuple(row, u, data); + } +}; + +template < + typename IdType, typename FloatType, typename probs_t, typename A_t, + typename B_t> +struct TransformOpImp { + probs_t probs; + A_t A; + B_t B; + const IdType* idx_coo; + const IdType* rows; + const FloatType* cs; + const IdType* indptr; + const IdType* subindptr; + const IdType* indices; + const IdType* data_arr; + bool is_pinned; + __host__ __device__ auto operator()(IdType idx) { + const auto ps = probs[idx]; + const auto in_row = idx_coo[idx]; + const auto c = cs[in_row]; + const auto row = rows[in_row]; + const auto in_idx = indptr[in_row] + idx - subindptr[in_row]; + const auto u = indices[is_pinned ? idx : in_idx]; + const auto w = A[in_idx]; + const auto w2 = B[in_idx]; + const auto data = data_arr ? data_arr[in_idx] : in_idx; + return thrust::make_tuple( + in_row, row, u, data, w / min((FloatType)1, c * w2 * ps)); + } +}; + +template +struct StencilOp { + const FloatType* cs; + template + __host__ __device__ auto operator()( + IdType in_row, FloatType ps, FloatType rnd) { + return rnd <= cs[in_row] * ps; + } +}; + +template +struct StencilOpFused { + const continuous_seed seed; + const IdType* idx_coo; + const FloatType* cs; + const ps_t probs; + const A_t A; + const IdType* subindptr; + const IdType* indptr; + const IdType* indices; + const IdType* nids; + bool is_pinned; + __device__ auto operator()(IdType idx) { + const auto in_row = idx_coo[idx]; + const auto ps = probs[idx]; + IdType rofs = idx - subindptr[in_row]; + const auto in_idx = indptr[in_row] + rofs; + const auto u = indices[is_pinned ? idx : in_idx]; + const auto t = nids ? nids[u] : u; // t in the paper + // rolled random number r_t is a function of the random_seed and t + const float rnd = seed.uniform(t); + return rnd <= cs[in_row] * A[in_idx] * ps; + } +}; + +template +struct TransformOpMean { + const IdType* ds; + const FloatType* ws; + __host__ __device__ auto operator()(IdType idx, FloatType ps) { + return ps * ds[idx] / ws[idx]; + } +}; + +struct TransformOpMinWith1 { + template + __host__ __device__ auto operator()(FloatType x) { + return min((FloatType)1, x); + } +}; + +template +struct IndptrFunc { + const IdType* indptr; + const IdType* in_deg; + __host__ __device__ auto operator()(IdType row) { + return indptr[row] + (in_deg ? 
in_deg[row] : 0); + } +}; + +template +struct SquareFunc { + __host__ __device__ auto operator()(FloatType x) { + return thrust::make_tuple(x, x * x); + } +}; + +struct TupleSum { + template + __host__ __device__ T operator()(const T& a, const T& b) const { + return thrust::make_tuple( + thrust::get<0>(a) + thrust::get<0>(b), + thrust::get<1>(a) + thrust::get<1>(b)); + } +}; + +template +struct DegreeFunc { + const IdType num_picks; + const IdType* rows; + const IdType* indptr; + IdType* in_deg; + IdType* inrow_indptr; + FloatType* cs; + __host__ __device__ auto operator()(IdType tIdx) { + const auto out_row = rows[tIdx]; + const auto indptr_val = indptr[out_row]; + const auto d = indptr[out_row + 1] - indptr_val; + in_deg[tIdx] = d; + inrow_indptr[tIdx] = indptr_val; + cs[tIdx] = num_picks / (FloatType)d; + } +}; + +template +__global__ void _CSRRowWiseOneHopExtractorKernel( + const continuous_seed seed, const IdType hop_size, + const IdType* const indptr, const IdType* const subindptr, + const IdType* const indices, const IdType* const idx_coo, + const IdType* const nids, const FloatType* const A, FloatType* const rands, + IdType* const hop, FloatType* const A_l) { + IdType tx = static_cast(blockIdx.x) * blockDim.x + threadIdx.x; + const int stride_x = gridDim.x * blockDim.x; + + while (tx < hop_size) { + IdType rpos = idx_coo[tx]; + IdType rofs = tx - subindptr[rpos]; + const auto in_idx = indptr[rpos] + rofs; + const auto not_pinned = indices != hop; + const auto u = indices[not_pinned ? in_idx : tx]; + if (not_pinned) hop[tx] = u; + const auto t = nids ? nids[u] : u; + if (A) A_l[tx] = A[in_idx]; + // rolled random number r_t is a function of the random_seed and t + rands[tx] = (FloatType)seed.uniform(t); + tx += stride_x; + } +} + +constexpr int CACHE_LINE_SIZE = 128; + +template +struct AlignmentFunc { + static_assert(CACHE_LINE_SIZE % sizeof(IdType) == 0); + const IdType* in_deg; + const int64_t* perm; + IdType num_rows; + __host__ __device__ auto operator()(IdType row) { + constexpr int num_elements = CACHE_LINE_SIZE / sizeof(IdType); + return in_deg[perm ? perm[row % num_rows] : row] + num_elements - 1; + } +}; + +template +__global__ void _CSRRowWiseOneHopExtractorAlignedKernel( + const IdType hop_size, const IdType num_rows, const IdType* const indptr, + const IdType* const subindptr, const IdType* const subindptr_aligned, + const IdType* const indices, IdType* const hop, const int64_t* const perm) { + IdType tx = static_cast(blockIdx.x) * blockDim.x + threadIdx.x; + const int stride_x = gridDim.x * blockDim.x; + + while (tx < hop_size) { + const IdType rpos_ = + dgl::cuda::_UpperBound(subindptr_aligned, num_rows, tx) - 1; + const IdType rpos = perm ? 
perm[rpos_] : rpos_; + const auto out_row = subindptr[rpos]; + const auto d = subindptr[rpos + 1] - out_row; + const int offset = + ((uint64_t)(indices + indptr[rpos] - subindptr_aligned[rpos_]) % + CACHE_LINE_SIZE) / + sizeof(IdType); + const IdType rofs = tx - subindptr_aligned[rpos_] - offset; + if (rofs >= 0 && rofs < d) { + const auto in_idx = indptr[rpos] + rofs; + assert((uint64_t)(indices + in_idx - tx) % CACHE_LINE_SIZE == 0); + const auto u = indices[in_idx]; + hop[out_row + rofs] = u; + } + tx += stride_x; + } +} + +template +__global__ void _CSRRowWiseLayerSampleDegreeKernel( + const IdType num_picks, const IdType num_rows, FloatType* const cs, + const FloatType* const ds, const FloatType* const d2s, + const IdType* const indptr, const FloatType* const probs, + const FloatType* const A, const IdType* const subindptr) { + typedef cub::BlockReduce BlockReduce; + __shared__ typename BlockReduce::TempStorage temp_storage; + __shared__ FloatType var_1_bcast[BLOCK_CTAS]; + + // we assign one warp per row + assert(blockDim.x == CTA_SIZE); + assert(blockDim.y == BLOCK_CTAS); + + IdType out_row = blockIdx.x * TILE_SIZE + threadIdx.y; + const auto last_row = + min(static_cast(blockIdx.x + 1) * TILE_SIZE, num_rows); + + constexpr FloatType ONE = 1; + + while (out_row < last_row) { + const auto in_row_start = indptr[out_row]; + const auto out_row_start = subindptr[out_row]; + + const IdType degree = subindptr[out_row + 1] - out_row_start; + + if (degree > 0) { + // stands for k in in arXiv:2210.13339, i.e. fanout + const auto k = min(num_picks, degree); + // slightly better than NS + const FloatType d_ = ds ? ds[out_row] : degree; + // stands for right handside of Equation (22) in arXiv:2210.13339 + FloatType var_target = + d_ * d_ / k + (ds ? d2s[out_row] - d_ * d_ / degree : 0); + + auto c = cs[out_row]; + const int num_valid = min(degree, (IdType)CTA_SIZE); + // stands for left handside of Equation (22) in arXiv:2210.13339 + FloatType var_1; + do { + var_1 = 0; + if (A) { + for (int idx = threadIdx.x; idx < degree; idx += CTA_SIZE) { + const auto w = A[in_row_start + idx]; + const auto ps = probs ? probs[out_row_start + idx] : w; + var_1 += w > 0 ? 
w * w / min(ONE, c * ps) : 0; + } + } else { + for (int idx = threadIdx.x; idx < degree; idx += CTA_SIZE) { + const auto ps = probs[out_row_start + idx]; + var_1 += 1 / min(ONE, c * ps); + } + } + var_1 = BlockReduce(temp_storage).Sum(var_1, num_valid); + if (threadIdx.x == 0) var_1_bcast[threadIdx.y] = var_1; + __syncthreads(); + var_1 = var_1_bcast[threadIdx.y]; + + c *= var_1 / var_target; + } while (min(var_1, var_target) / max(var_1, var_target) < 1 - eps); + + if (threadIdx.x == 0) cs[out_row] = c; + } + + out_row += BLOCK_CTAS; + } +} + +} // namespace + +template +int log_size(const IdType size) { + if (size <= 0) return 0; + for (int i = 0; i < static_cast(sizeof(IdType)) * 8; i++) + if (((size - 1) >> i) == 0) return i; + return sizeof(IdType) * 8; +} + +template +void compute_importance_sampling_probabilities( + CSRMatrix mat, const IdType hop_size, cudaStream_t stream, + const continuous_seed seed, const IdType num_rows, const IdType* indptr, + const IdType* subindptr, const IdType* indices, IdArray idx_coo_arr, + const IdType* nids, + FloatArray cs_arr, // holds the computed cs values, has size num_rows + const bool weighted, const FloatType* A, const FloatType* ds, + const FloatType* d2s, const IdType num_picks, DGLContext ctx, + const runtime::CUDAWorkspaceAllocator& allocator, + const exec_policy_t& exec_policy, const int importance_sampling, + IdType* hop_1, // holds the contiguous one-hop neighborhood, has size |E| + FloatType* rands, // holds the rolled random numbers r_t for each edge, has + // size |E| + FloatType* probs_found) { // holds the computed pi_t values for each edge, + // has size |E| + auto device = runtime::DeviceAPI::Get(ctx); + auto idx_coo = idx_coo_arr.Ptr(); + auto cs = cs_arr.Ptr(); + FloatArray A_l_arr = weighted + ? NewFloatArray(hop_size, ctx, sizeof(FloatType) * 8) + : NullArray(); + auto A_l = A_l_arr.Ptr(); + + const int max_log_num_vertices = log_size(mat.num_cols); + + { // extracts the onehop neighborhood cols to a contiguous range into hop_1 + const dim3 block(BLOCK_SIZE); + const dim3 grid((hop_size + BLOCK_SIZE - 1) / BLOCK_SIZE); + CUDA_KERNEL_CALL( + (_CSRRowWiseOneHopExtractorKernel), grid, block, 0, + stream, seed, hop_size, indptr, subindptr, indices, idx_coo, nids, + weighted ? A : nullptr, rands, hop_1, A_l); + } + int64_t hop_uniq_size = 0; + IdArray hop_new_arr = NewIdArray(hop_size, ctx, sizeof(IdType) * 8); + auto hop_new = hop_new_arr.Ptr(); + auto hop_unique = allocator.alloc_unique(hop_size); + // After this block, hop_unique holds the unique set of one-hop neighborhood + // and hop_new holds the relabeled hop_1, idx_coo already holds relabeled + // destination. 
hop_unique[hop_new] == hop_1 holds + { + auto hop_2 = allocator.alloc_unique(hop_size); + auto hop_3 = allocator.alloc_unique(hop_size); + + device->CopyDataFromTo( + hop_1, 0, hop_2.get(), 0, sizeof(IdType) * hop_size, ctx, ctx, + mat.indptr->dtype); + + cub::DoubleBuffer hop_b(hop_2.get(), hop_3.get()); + + { + std::size_t temp_storage_bytes = 0; + CUDA_CALL(cub::DeviceRadixSort::SortKeys( + nullptr, temp_storage_bytes, hop_b, hop_size, 0, max_log_num_vertices, + stream)); + + auto temp = allocator.alloc_unique(temp_storage_bytes); + + CUDA_CALL(cub::DeviceRadixSort::SortKeys( + temp.get(), temp_storage_bytes, hop_b, hop_size, 0, + max_log_num_vertices, stream)); + } + + auto hop_counts = allocator.alloc_unique(hop_size + 1); + auto hop_unique_size = allocator.alloc_unique(1); + + { + std::size_t temp_storage_bytes = 0; + CUDA_CALL(cub::DeviceRunLengthEncode::Encode( + nullptr, temp_storage_bytes, hop_b.Current(), hop_unique.get(), + hop_counts.get(), hop_unique_size.get(), hop_size, stream)); + + auto temp = allocator.alloc_unique(temp_storage_bytes); + + CUDA_CALL(cub::DeviceRunLengthEncode::Encode( + temp.get(), temp_storage_bytes, hop_b.Current(), hop_unique.get(), + hop_counts.get(), hop_unique_size.get(), hop_size, stream)); + + device->CopyDataFromTo( + hop_unique_size.get(), 0, &hop_uniq_size, 0, sizeof(hop_uniq_size), + ctx, DGLContext{kDGLCPU, 0}, mat.indptr->dtype); + } + + thrust::lower_bound( + exec_policy, hop_unique.get(), hop_unique.get() + hop_uniq_size, hop_1, + hop_1 + hop_size, hop_new); + } + + // @todo Consider creating a CSC because the SpMV will be done multiple times. + COOMatrix rmat( + num_rows, hop_uniq_size, idx_coo_arr, hop_new_arr, NullArray(), true, + mat.sorted); + + BcastOff bcast_off; + bcast_off.use_bcast = false; + bcast_off.out_len = 1; + bcast_off.lhs_len = 1; + bcast_off.rhs_len = 1; + + FloatArray probs_arr = + NewFloatArray(hop_uniq_size, ctx, sizeof(FloatType) * 8); + auto probs_1 = probs_arr.Ptr(); + FloatArray probs_arr_2 = + NewFloatArray(hop_uniq_size, ctx, sizeof(FloatType) * 8); + auto probs = probs_arr_2.Ptr(); + auto arg_u = NewIdArray(hop_uniq_size, ctx, sizeof(IdType) * 8); + auto arg_e = NewIdArray(hop_size, ctx, sizeof(IdType) * 8); + + double prev_ex_nodes = hop_uniq_size; + + for (int iters = 0; iters < importance_sampling || importance_sampling < 0; + iters++) { + if (weighted && iters == 0) { + cuda::SpMMCoo< + IdType, FloatType, cuda::binary::Mul, + cuda::reduce::Max>( + bcast_off, rmat, cs_arr, A_l_arr, probs_arr_2, arg_u, arg_e); + } else { + cuda::SpMMCoo< + IdType, FloatType, cuda::binary::CopyLhs, + cuda::reduce::Max>( + bcast_off, rmat, cs_arr, NullArray(), iters ? probs_arr : probs_arr_2, + arg_u, arg_e); + } + + if (iters) + thrust::transform( + exec_policy, probs_1, probs_1 + hop_uniq_size, probs, probs, + thrust::multiplies{}); + + thrust::gather( + exec_policy, hop_new, hop_new + hop_size, probs, probs_found); + + { + constexpr int BLOCK_CTAS = BLOCK_SIZE / CTA_SIZE; + // the number of rows each thread block will cover + constexpr int TILE_SIZE = BLOCK_CTAS; + const dim3 block(CTA_SIZE, BLOCK_CTAS); + const dim3 grid((num_rows + TILE_SIZE - 1) / TILE_SIZE); + CUDA_KERNEL_CALL( + (_CSRRowWiseLayerSampleDegreeKernel< + IdType, FloatType, BLOCK_CTAS, TILE_SIZE>), + grid, block, 0, stream, (IdType)num_picks, num_rows, cs, + weighted ? ds : nullptr, weighted ? 
d2s : nullptr, indptr, + probs_found, A, subindptr); + } + + { + auto probs_min_1 = + thrust::make_transform_iterator(probs, TransformOpMinWith1{}); + const double cur_ex_nodes = thrust::reduce( + exec_policy, probs_min_1, probs_min_1 + hop_uniq_size, 0.0); + if (cur_ex_nodes / prev_ex_nodes >= 1 - eps) break; + prev_ex_nodes = cur_ex_nodes; + } + } +} + +/////////////////////////////// CSR /////////////////////////////// + +template +std::pair CSRLaborSampling( + CSRMatrix mat, IdArray rows_arr, const int64_t num_picks, + FloatArray prob_arr, const int importance_sampling, IdArray random_seed_arr, + float seed2_contribution, IdArray NIDs) { + const bool weighted = !IsNullArray(prob_arr); + + const auto& ctx = rows_arr->ctx; + + runtime::CUDAWorkspaceAllocator allocator(ctx); + + const auto stream = runtime::getCurrentCUDAStream(); + const auto exec_policy = thrust::cuda::par_nosync(allocator).on(stream); + + auto device = runtime::DeviceAPI::Get(ctx); + + const IdType num_rows = rows_arr->shape[0]; + IdType* const rows = rows_arr.Ptr(); + IdType* const nids = IsNullArray(NIDs) ? nullptr : NIDs.Ptr(); + FloatType* const A = prob_arr.Ptr(); + + IdType* const indptr_ = mat.indptr.Ptr(); + IdType* const indices_ = mat.indices.Ptr(); + IdType* const data = CSRHasData(mat) ? mat.data.Ptr() : nullptr; + + // Read indptr only once in case it is pinned and access is slow. + auto indptr = allocator.alloc_unique(num_rows); + // compute in-degrees + auto in_deg = allocator.alloc_unique(num_rows + 1); + // cs stands for c_s in arXiv:2210.13339 + FloatArray cs_arr = NewFloatArray(num_rows, ctx, sizeof(FloatType) * 8); + auto cs = cs_arr.Ptr(); + // ds stands for A_{*s} in arXiv:2210.13339 + FloatArray ds_arr = weighted + ? NewFloatArray(num_rows, ctx, sizeof(FloatType) * 8) + : NullArray(); + auto ds = ds_arr.Ptr(); + // d2s stands for (A^2)_{*s} in arXiv:2210.13339, ^2 is elementwise. + FloatArray d2s_arr = weighted + ? 
NewFloatArray(num_rows, ctx, sizeof(FloatType) * 8) + : NullArray(); + auto d2s = d2s_arr.Ptr(); + + thrust::counting_iterator iota(0); + thrust::for_each( + exec_policy, iota, iota + num_rows, + DegreeFunc{ + (IdType)num_picks, rows, indptr_, in_deg.get(), indptr.get(), cs}); + + if (weighted) { + auto b_offsets = thrust::make_transform_iterator( + iota, IndptrFunc{indptr.get(), nullptr}); + auto e_offsets = thrust::make_transform_iterator( + iota, IndptrFunc{indptr.get(), in_deg.get()}); + + auto A_A2 = thrust::make_transform_iterator(A, SquareFunc{}); + auto ds_d2s = thrust::make_zip_iterator(ds, d2s); + + size_t prefix_temp_size = 0; + CUDA_CALL(cub::DeviceSegmentedReduce::Reduce( + nullptr, prefix_temp_size, A_A2, ds_d2s, num_rows, b_offsets, e_offsets, + TupleSum{}, thrust::make_tuple((FloatType)0, (FloatType)0), stream)); + auto temp = allocator.alloc_unique(prefix_temp_size); + CUDA_CALL(cub::DeviceSegmentedReduce::Reduce( + temp.get(), prefix_temp_size, A_A2, ds_d2s, num_rows, b_offsets, + e_offsets, TupleSum{}, thrust::make_tuple((FloatType)0, (FloatType)0), + stream)); + } + + // fill subindptr + IdArray subindptr_arr = NewIdArray(num_rows + 1, ctx, sizeof(IdType) * 8); + auto subindptr = subindptr_arr.Ptr(); + + IdType hop_size; + { + size_t prefix_temp_size = 0; + CUDA_CALL(cub::DeviceScan::ExclusiveSum( + nullptr, prefix_temp_size, in_deg.get(), subindptr, num_rows + 1, + stream)); + auto temp = allocator.alloc_unique(prefix_temp_size); + CUDA_CALL(cub::DeviceScan::ExclusiveSum( + temp.get(), prefix_temp_size, in_deg.get(), subindptr, num_rows + 1, + stream)); + + device->CopyDataFromTo( + subindptr, num_rows * sizeof(hop_size), &hop_size, 0, sizeof(hop_size), + ctx, DGLContext{kDGLCPU, 0}, mat.indptr->dtype); + } + IdArray hop_arr = NewIdArray(hop_size, ctx, sizeof(IdType) * 8); + CSRMatrix smat( + num_rows, mat.num_cols, subindptr_arr, hop_arr, NullArray(), mat.sorted); + // @todo Consider fusing CSRToCOO into StencilOpFused kernel + auto smatcoo = CSRToCOO(smat, false); + + auto idx_coo_arr = smatcoo.row; + auto idx_coo = idx_coo_arr.Ptr(); + + auto hop_1 = hop_arr.Ptr(); + const bool is_pinned = mat.indices.IsPinned(); + if (is_pinned) { + const auto res = Sort(rows_arr, log_size(mat.num_rows)); + const int64_t* perm = static_cast(res.second->data); + + IdType hop_size; // Shadows the original one as this is temporary + auto subindptr_aligned = allocator.alloc_unique(num_rows + 1); + { + auto modified_in_deg = thrust::make_transform_iterator( + iota, AlignmentFunc{in_deg.get(), perm, num_rows}); + size_t prefix_temp_size = 0; + CUDA_CALL(cub::DeviceScan::ExclusiveSum( + nullptr, prefix_temp_size, modified_in_deg, subindptr_aligned.get(), + num_rows + 1, stream)); + auto temp = allocator.alloc_unique(prefix_temp_size); + CUDA_CALL(cub::DeviceScan::ExclusiveSum( + temp.get(), prefix_temp_size, modified_in_deg, + subindptr_aligned.get(), num_rows + 1, stream)); + + device->CopyDataFromTo( + subindptr_aligned.get(), num_rows * sizeof(hop_size), &hop_size, 0, + sizeof(hop_size), ctx, DGLContext{kDGLCPU, 0}, mat.indptr->dtype); + } + const dim3 block(BLOCK_SIZE); + const dim3 grid((hop_size + BLOCK_SIZE - 1) / BLOCK_SIZE); + CUDA_KERNEL_CALL( + (_CSRRowWiseOneHopExtractorAlignedKernel), grid, block, 0, + stream, hop_size, num_rows, indptr.get(), subindptr, + subindptr_aligned.get(), indices_, hop_1, perm); + } + const auto indices = is_pinned ? hop_1 : indices_; + + auto rands = + allocator.alloc_unique(importance_sampling ? 
hop_size : 1); + auto probs_found = + allocator.alloc_unique(importance_sampling ? hop_size : 1); + + if (weighted) { + // Recompute c for weighted graphs. + constexpr int BLOCK_CTAS = BLOCK_SIZE / CTA_SIZE; + // the number of rows each thread block will cover + constexpr int TILE_SIZE = BLOCK_CTAS; + const dim3 block(CTA_SIZE, BLOCK_CTAS); + const dim3 grid((num_rows + TILE_SIZE - 1) / TILE_SIZE); + CUDA_KERNEL_CALL( + (_CSRRowWiseLayerSampleDegreeKernel< + IdType, FloatType, BLOCK_CTAS, TILE_SIZE>), + grid, block, 0, stream, (IdType)num_picks, num_rows, cs, ds, d2s, + indptr.get(), nullptr, A, subindptr); + } + + const continuous_seed random_seed = + IsNullArray(random_seed_arr) + ? continuous_seed(RandomEngine::ThreadLocal()->RandInt(1000000000)) + : continuous_seed(random_seed_arr, seed2_contribution); + + if (importance_sampling) + compute_importance_sampling_probabilities< + IdType, FloatType, decltype(exec_policy)>( + mat, hop_size, stream, random_seed, num_rows, indptr.get(), subindptr, + indices, idx_coo_arr, nids, cs_arr, weighted, A, ds, d2s, + (IdType)num_picks, ctx, allocator, exec_policy, importance_sampling, + hop_1, rands.get(), probs_found.get()); + + IdArray picked_row = NewIdArray(hop_size, ctx, sizeof(IdType) * 8); + IdArray picked_col = NewIdArray(hop_size, ctx, sizeof(IdType) * 8); + IdArray picked_idx = NewIdArray(hop_size, ctx, sizeof(IdType) * 8); + FloatArray picked_imp = + importance_sampling || weighted + ? NewFloatArray(hop_size, ctx, sizeof(FloatType) * 8) + : NullArray(); + + IdType* const picked_row_data = picked_row.Ptr(); + IdType* const picked_col_data = picked_col.Ptr(); + IdType* const picked_idx_data = picked_idx.Ptr(); + FloatType* const picked_imp_data = picked_imp.Ptr(); + + auto picked_inrow = allocator.alloc_unique( + importance_sampling || weighted ? 
hop_size : 1); + + // Sample edges here + IdType num_edges; + { + thrust::constant_iterator one(1); + if (importance_sampling) { + auto output = thrust::make_zip_iterator( + picked_inrow.get(), picked_row_data, picked_col_data, picked_idx_data, + picked_imp_data); + if (weighted) { + auto transformed_output = thrust::make_transform_output_iterator( + output, + TransformOpImp< + IdType, FloatType, FloatType*, FloatType*, decltype(one)>{ + probs_found.get(), A, one, idx_coo, rows, cs, indptr.get(), + subindptr, indices, data, is_pinned}); + auto stencil = + thrust::make_zip_iterator(idx_coo, probs_found.get(), rands.get()); + num_edges = + thrust::copy_if( + exec_policy, iota, iota + hop_size, stencil, transformed_output, + thrust::make_zip_function(StencilOp{cs})) - + transformed_output; + } else { + auto transformed_output = thrust::make_transform_output_iterator( + output, + TransformOpImp< + IdType, FloatType, FloatType*, decltype(one), decltype(one)>{ + probs_found.get(), one, one, idx_coo, rows, cs, indptr.get(), + subindptr, indices, data, is_pinned}); + auto stencil = + thrust::make_zip_iterator(idx_coo, probs_found.get(), rands.get()); + num_edges = + thrust::copy_if( + exec_policy, iota, iota + hop_size, stencil, transformed_output, + thrust::make_zip_function(StencilOp{cs})) - + transformed_output; + } + } else { + if (weighted) { + auto output = thrust::make_zip_iterator( + picked_inrow.get(), picked_row_data, picked_col_data, + picked_idx_data, picked_imp_data); + auto transformed_output = thrust::make_transform_output_iterator( + output, + TransformOpImp< + IdType, FloatType, decltype(one), FloatType*, FloatType*>{ + one, A, A, idx_coo, rows, cs, indptr.get(), subindptr, indices, + data, is_pinned}); + const auto pred = + StencilOpFused{ + random_seed, idx_coo, cs, one, A, + subindptr, indptr.get(), indices, nids, is_pinned}; + num_edges = thrust::copy_if( + exec_policy, iota, iota + hop_size, iota, + transformed_output, pred) - + transformed_output; + } else { + auto output = thrust::make_zip_iterator( + picked_row_data, picked_col_data, picked_idx_data); + auto transformed_output = thrust::make_transform_output_iterator( + output, TransformOp{ + idx_coo, rows, indptr.get(), subindptr, indices, data, + is_pinned}); + const auto pred = + StencilOpFused{ + random_seed, idx_coo, cs, one, one, + subindptr, indptr.get(), indices, nids, is_pinned}; + num_edges = thrust::copy_if( + exec_policy, iota, iota + hop_size, iota, + transformed_output, pred) - + transformed_output; + } + } + } + + // Normalize edge weights here + if (importance_sampling || weighted) { + thrust::constant_iterator one(1); + // contains degree information + auto ds = allocator.alloc_unique(num_rows); + // contains sum of edge weights + auto ws = allocator.alloc_unique(num_rows); + // contains degree information only for vertices with nonzero degree + auto ds_2 = allocator.alloc_unique(num_rows); + // contains sum of edge weights only for vertices with nonzero degree + auto ws_2 = allocator.alloc_unique(num_rows); + auto output_ = thrust::make_zip_iterator(ds.get(), ws.get()); + // contains row ids only for vertices with nonzero degree + auto keys = allocator.alloc_unique(num_rows); + auto input = thrust::make_zip_iterator(one, picked_imp_data); + auto new_end = thrust::reduce_by_key( + exec_policy, picked_inrow.get(), picked_inrow.get() + num_edges, input, + keys.get(), output_, thrust::equal_to{}, TupleSum{}); + { + thrust::constant_iterator zero_int(0); + thrust::constant_iterator zero_float(0); + auto input 
= thrust::make_zip_iterator(zero_int, zero_float); + auto output = thrust::make_zip_iterator(ds_2.get(), ws_2.get()); + thrust::copy(exec_policy, input, input + num_rows, output); + { + const auto num_rows_2 = new_end.first - keys.get(); + thrust::scatter( + exec_policy, output_, output_ + num_rows_2, keys.get(), output); + } + } + { + auto input = + thrust::make_zip_iterator(picked_inrow.get(), picked_imp_data); + auto transformed_input = thrust::make_transform_iterator( + input, thrust::make_zip_function(TransformOpMean{ + ds_2.get(), ws_2.get()})); + thrust::copy( + exec_policy, transformed_input, transformed_input + num_edges, + picked_imp_data); + } + } + + picked_row = picked_row.CreateView({num_edges}, picked_row->dtype); + picked_col = picked_col.CreateView({num_edges}, picked_col->dtype); + picked_idx = picked_idx.CreateView({num_edges}, picked_idx->dtype); + if (importance_sampling || weighted) + picked_imp = picked_imp.CreateView({num_edges}, picked_imp->dtype); + + return std::make_pair( + COOMatrix(mat.num_rows, mat.num_cols, picked_row, picked_col, picked_idx), + picked_imp); +} + +template std::pair +CSRLaborSampling( + CSRMatrix, IdArray, int64_t, FloatArray, int, IdArray, float, IdArray); +template std::pair +CSRLaborSampling( + CSRMatrix, IdArray, int64_t, FloatArray, int, IdArray, float, IdArray); +template std::pair +CSRLaborSampling( + CSRMatrix, IdArray, int64_t, FloatArray, int, IdArray, float, IdArray); +template std::pair +CSRLaborSampling( + CSRMatrix, IdArray, int64_t, FloatArray, int, IdArray, float, IdArray); + +} // namespace impl +} // namespace aten +} // namespace dgl diff --git a/src/array/cuda/macro.cuh b/src/array/cuda/macro.cuh index ad24a9445273..ea7fb90ac1a0 100644 --- a/src/array/cuda/macro.cuh +++ b/src/array/cuda/macro.cuh @@ -30,14 +30,14 @@ const auto device = runtime::DeviceAPI::Get(ctx); \ (LHS_OFF) = static_cast(device->AllocWorkspace( \ ctx, sizeof(int64_t) * info.lhs_offset.size())); \ - CUDA_CALL(cudaMemcpy( \ + CUDA_CALL(hipMemcpy( \ (LHS_OFF), &info.lhs_offset[0], \ - sizeof(int64_t) * info.lhs_offset.size(), cudaMemcpyHostToDevice)); \ + sizeof(int64_t) * info.lhs_offset.size(), hipMemcpyHostToDevice)); \ (RHS_OFF) = static_cast(device->AllocWorkspace( \ ctx, sizeof(int64_t) * info.rhs_offset.size())); \ - CUDA_CALL(cudaMemcpy( \ + CUDA_CALL(hipMemcpy( \ (RHS_OFF), &info.rhs_offset[0], \ - sizeof(int64_t) * info.rhs_offset.size(), cudaMemcpyHostToDevice)); \ + sizeof(int64_t) * info.rhs_offset.size(), hipMemcpyHostToDevice)); \ if ((EDGE_MAP)) { \ constexpr bool UseIdx = true; \ { __VA_ARGS__ } \ diff --git a/src/array/cuda/macro.cuh.prehip b/src/array/cuda/macro.cuh.prehip new file mode 100644 index 000000000000..ad24a9445273 --- /dev/null +++ b/src/array/cuda/macro.cuh.prehip @@ -0,0 +1,53 @@ +/** + * Copyright (c) 2020 by Contributors + * @file array/cuda/macro.cuh + * @brief Macro to call SPMM/SDDMM cuda kernels. + */ +#ifndef DGL_ARRAY_CUDA_MACRO_CUH_ +#define DGL_ARRAY_CUDA_MACRO_CUH_ + +///////////////////////// Dispatchers ////////////////////////// + +/* Macro used for switching between broadcasting and non-broadcasting kernels. + * It also copies the auxiliary information for calculating broadcasting offsets + * to GPU. + */ +#define BCAST_IDX_CTX_SWITCH(BCAST, EDGE_MAP, CTX, LHS_OFF, RHS_OFF, ...) 
\ + do { \ + const BcastOff &info = (BCAST); \ + if (!info.use_bcast) { \ + constexpr bool UseBcast = false; \ + if ((EDGE_MAP)) { \ + constexpr bool UseIdx = true; \ + { __VA_ARGS__ } \ + } else { \ + constexpr bool UseIdx = false; \ + { __VA_ARGS__ } \ + } \ + } else { \ + constexpr bool UseBcast = true; \ + const DGLContext ctx = (CTX); \ + const auto device = runtime::DeviceAPI::Get(ctx); \ + (LHS_OFF) = static_cast(device->AllocWorkspace( \ + ctx, sizeof(int64_t) * info.lhs_offset.size())); \ + CUDA_CALL(cudaMemcpy( \ + (LHS_OFF), &info.lhs_offset[0], \ + sizeof(int64_t) * info.lhs_offset.size(), cudaMemcpyHostToDevice)); \ + (RHS_OFF) = static_cast(device->AllocWorkspace( \ + ctx, sizeof(int64_t) * info.rhs_offset.size())); \ + CUDA_CALL(cudaMemcpy( \ + (RHS_OFF), &info.rhs_offset[0], \ + sizeof(int64_t) * info.rhs_offset.size(), cudaMemcpyHostToDevice)); \ + if ((EDGE_MAP)) { \ + constexpr bool UseIdx = true; \ + { __VA_ARGS__ } \ + } else { \ + constexpr bool UseIdx = false; \ + { __VA_ARGS__ } \ + } \ + device->FreeWorkspace(ctx, (LHS_OFF)); \ + device->FreeWorkspace(ctx, (RHS_OFF)); \ + } \ + } while (0) + +#endif // DGL_ARRAY_CUDA_MACRO_CUH_ diff --git a/src/array/cuda/negative_sampling.cu b/src/array/cuda/negative_sampling.cu index 2cf2623306b1..041982e040f2 100644 --- a/src/array/cuda/negative_sampling.cu +++ b/src/array/cuda/negative_sampling.cu @@ -1,15 +1,16 @@ +#include "hip/hip_runtime.h" /** * Copyright (c) 2021 by Contributors * @file array/cuda/negative_sampling.cu * @brief rowwise sampling */ -#include +#include #include #include #include -#include +#include #include "../../runtime/cuda/cuda_common.h" #include "./utils.h" @@ -31,9 +32,9 @@ __global__ void _GlobalUniformNegativeSamplingKernel( int64_t tx = blockIdx.x * blockDim.x + threadIdx.x; const int stride_x = gridDim.x * blockDim.x; - curandStatePhilox4_32_10_t + hiprandStatePhilox4_32_10_t rng; // this allows generating 4 32-bit ints at a time - curand_init(random_seed * gridDim.x + blockIdx.x, threadIdx.x, 0, &rng); + hiprand_init(random_seed * gridDim.x + blockIdx.x, threadIdx.x, 0, &rng); while (tx < num_samples) { for (int i = 0; i < num_trials; ++i) { @@ -88,7 +89,7 @@ struct IsNotMinusOne { template void SortOrderedPairs( runtime::DeviceAPI* device, DGLContext ctx, IdType* major, IdType* minor, - IdType* tmp_major, IdType* tmp_minor, int64_t n, cudaStream_t stream) { + IdType* tmp_major, IdType* tmp_minor, int64_t n, hipStream_t stream) { // Sort ordered pairs in lexicographical order by two radix sorts since // cub's radix sorts are stable. // We need a 2*n auxiliary storage to store the results form the first radix @@ -98,21 +99,21 @@ void SortOrderedPairs( void* tmp2 = nullptr; // Radix sort by minor key first, reorder the major key in the progress. - CUDA_CALL(cub::DeviceRadixSort::SortPairs( + CUDA_CALL(hipcub::DeviceRadixSort::SortPairs( tmp1, s1, minor, tmp_minor, major, tmp_major, n, 0, sizeof(IdType) * 8, stream)); tmp1 = device->AllocWorkspace(ctx, s1); - CUDA_CALL(cub::DeviceRadixSort::SortPairs( + CUDA_CALL(hipcub::DeviceRadixSort::SortPairs( tmp1, s1, minor, tmp_minor, major, tmp_major, n, 0, sizeof(IdType) * 8, stream)); // Radix sort by major key next. - CUDA_CALL(cub::DeviceRadixSort::SortPairs( + CUDA_CALL(hipcub::DeviceRadixSort::SortPairs( tmp2, s2, tmp_major, major, tmp_minor, minor, n, 0, sizeof(IdType) * 8, stream)); tmp2 = (s2 > s1) ? 
device->AllocWorkspace(ctx, s2) : tmp1; // reuse buffer if s2 <= s1 - CUDA_CALL(cub::DeviceRadixSort::SortPairs( + CUDA_CALL(hipcub::DeviceRadixSort::SortPairs( tmp2, s2, tmp_major, major, tmp_minor, minor, n, 0, sizeof(IdType) * 8, stream)); @@ -141,7 +142,7 @@ std::pair CSRGlobalUniformNegativeSampling( IdType* out_row_data = out_row.Ptr(); IdType* out_col_data = out_col.Ptr(); auto device = runtime::DeviceAPI::Get(ctx); - cudaStream_t stream = runtime::getCurrentCUDAStream(); + hipStream_t stream = runtime::getCurrentCUDAStream(); const int nt = cuda::FindNumThreads(num_actual_samples); const int nb = (num_actual_samples + nt - 1) / nt; std::pair result; @@ -159,11 +160,11 @@ std::pair CSRGlobalUniformNegativeSampling( IsNotMinusOne op; PairIterator begin(row_data, col_data); PairIterator out_begin(out_row_data, out_col_data); - CUDA_CALL(cub::DeviceSelect::If( + CUDA_CALL(hipcub::DeviceSelect::If( nullptr, tmp_size, begin, out_begin, num_out_cuda, num_actual_samples, op, stream)); void* tmp = device->AllocWorkspace(ctx, tmp_size); - CUDA_CALL(cub::DeviceSelect::If( + CUDA_CALL(hipcub::DeviceSelect::If( tmp, tmp_size, begin, out_begin, num_out_cuda, num_actual_samples, op, stream)); num_out = cuda::GetCUDAScalar(device, ctx, num_out_cuda); @@ -181,13 +182,13 @@ std::pair CSRGlobalUniformNegativeSampling( size_t tmp_size_unique = 0; void* tmp_unique = nullptr; - CUDA_CALL(cub::DeviceSelect::Unique( + CUDA_CALL(hipcub::DeviceSelect::Unique( nullptr, tmp_size_unique, out_begin, unique_begin, num_out_cuda, num_out, stream)); tmp_unique = (tmp_size_unique > tmp_size) ? device->AllocWorkspace(ctx, tmp_size_unique) : tmp; // reuse buffer - CUDA_CALL(cub::DeviceSelect::Unique( + CUDA_CALL(hipcub::DeviceSelect::Unique( tmp_unique, tmp_size_unique, out_begin, unique_begin, num_out_cuda, num_out, stream)); num_out = cuda::GetCUDAScalar(device, ctx, num_out_cuda); diff --git a/src/array/cuda/negative_sampling.cu.prehip b/src/array/cuda/negative_sampling.cu.prehip new file mode 100644 index 000000000000..2cf2623306b1 --- /dev/null +++ b/src/array/cuda/negative_sampling.cu.prehip @@ -0,0 +1,220 @@ +/** + * Copyright (c) 2021 by Contributors + * @file array/cuda/negative_sampling.cu + * @brief rowwise sampling + */ + +#include +#include +#include +#include + +#include + +#include "../../runtime/cuda/cuda_common.h" +#include "./utils.h" + +using namespace dgl::runtime; + +namespace dgl { +namespace aten { +namespace impl { + +namespace { + +template +__global__ void _GlobalUniformNegativeSamplingKernel( + const IdType* __restrict__ indptr, const IdType* __restrict__ indices, + IdType* __restrict__ row, IdType* __restrict__ col, int64_t num_row, + int64_t num_col, int64_t num_samples, int num_trials, + bool exclude_self_loops, int32_t random_seed) { + int64_t tx = blockIdx.x * blockDim.x + threadIdx.x; + const int stride_x = gridDim.x * blockDim.x; + + curandStatePhilox4_32_10_t + rng; // this allows generating 4 32-bit ints at a time + curand_init(random_seed * gridDim.x + blockIdx.x, threadIdx.x, 0, &rng); + + while (tx < num_samples) { + for (int i = 0; i < num_trials; ++i) { + uint4 result = curand4(&rng); + // Turns out that result.x is always 0 with the above RNG. 
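+      // The remaining 96 random bits are combined into two 48-bit values: u in + // [0, num_row) from the low 16 bits of y together with z, and v in + // [0, num_col) from the high 16 bits of y together with w. The candidate + // pair (u, v) is emitted only if it passes the optional self-loop check and + // the binary search below does not find v among indices[indptr[u]..indptr[u+1]).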
+ uint64_t y_hi = result.y >> 16; + uint64_t y_lo = result.y & 0xFFFF; + uint64_t z = static_cast(result.z); + uint64_t w = static_cast(result.w); + int64_t u = static_cast(((y_lo << 32L) | z) % num_row); + int64_t v = static_cast(((y_hi << 32L) | w) % num_col); + + if (exclude_self_loops && (u == v)) continue; + + // binary search of v among indptr[u:u+1] + int64_t b = indptr[u], e = indptr[u + 1] - 1; + bool found = false; + while (b <= e) { + int64_t m = (b + e) / 2; + if (indices[m] == v) { + found = true; + break; + } else if (indices[m] < v) { + b = m + 1; + } else { + e = m - 1; + } + } + + if (!found) { + row[tx] = u; + col[tx] = v; + break; + } + } + + tx += stride_x; + } +} + +template +struct IsNotMinusOne { + __device__ __forceinline__ bool operator()(const std::pair& a) { + return a.first != -1; + } +}; + +/** + * @brief Sort ordered pairs in ascending order, using \a tmp_major and \a + * tmp_minor as temporary buffers, each with \a n elements. + */ +template +void SortOrderedPairs( + runtime::DeviceAPI* device, DGLContext ctx, IdType* major, IdType* minor, + IdType* tmp_major, IdType* tmp_minor, int64_t n, cudaStream_t stream) { + // Sort ordered pairs in lexicographical order by two radix sorts since + // cub's radix sorts are stable. + // We need a 2*n auxiliary storage to store the results form the first radix + // sort. + size_t s1 = 0, s2 = 0; + void* tmp1 = nullptr; + void* tmp2 = nullptr; + + // Radix sort by minor key first, reorder the major key in the progress. + CUDA_CALL(cub::DeviceRadixSort::SortPairs( + tmp1, s1, minor, tmp_minor, major, tmp_major, n, 0, sizeof(IdType) * 8, + stream)); + tmp1 = device->AllocWorkspace(ctx, s1); + CUDA_CALL(cub::DeviceRadixSort::SortPairs( + tmp1, s1, minor, tmp_minor, major, tmp_major, n, 0, sizeof(IdType) * 8, + stream)); + + // Radix sort by major key next. + CUDA_CALL(cub::DeviceRadixSort::SortPairs( + tmp2, s2, tmp_major, major, tmp_minor, minor, n, 0, sizeof(IdType) * 8, + stream)); + tmp2 = (s2 > s1) ? 
device->AllocWorkspace(ctx, s2) + : tmp1; // reuse buffer if s2 <= s1 + CUDA_CALL(cub::DeviceRadixSort::SortPairs( + tmp2, s2, tmp_major, major, tmp_minor, minor, n, 0, sizeof(IdType) * 8, + stream)); + + if (tmp1 != tmp2) device->FreeWorkspace(ctx, tmp2); + device->FreeWorkspace(ctx, tmp1); +} + +}; // namespace + +template +std::pair CSRGlobalUniformNegativeSampling( + const CSRMatrix& csr, int64_t num_samples, int num_trials, + bool exclude_self_loops, bool replace, double redundancy) { + auto ctx = csr.indptr->ctx; + auto dtype = csr.indptr->dtype; + const int64_t num_row = csr.num_rows; + const int64_t num_col = csr.num_cols; + const int64_t num_actual_samples = + static_cast(num_samples * (1 + redundancy)); + IdArray row = Full(-1, num_actual_samples, ctx); + IdArray col = Full(-1, num_actual_samples, ctx); + IdArray out_row = IdArray::Empty({num_actual_samples}, dtype, ctx); + IdArray out_col = IdArray::Empty({num_actual_samples}, dtype, ctx); + IdType* row_data = row.Ptr(); + IdType* col_data = col.Ptr(); + IdType* out_row_data = out_row.Ptr(); + IdType* out_col_data = out_col.Ptr(); + auto device = runtime::DeviceAPI::Get(ctx); + cudaStream_t stream = runtime::getCurrentCUDAStream(); + const int nt = cuda::FindNumThreads(num_actual_samples); + const int nb = (num_actual_samples + nt - 1) / nt; + std::pair result; + int64_t num_out; + + CUDA_KERNEL_CALL( + _GlobalUniformNegativeSamplingKernel, nb, nt, 0, stream, + csr.indptr.Ptr(), csr.indices.Ptr(), row_data, col_data, + num_row, num_col, num_actual_samples, num_trials, exclude_self_loops, + RandomEngine::ThreadLocal()->RandInt32()); + + size_t tmp_size = 0; + int64_t* num_out_cuda = + static_cast(device->AllocWorkspace(ctx, sizeof(int64_t))); + IsNotMinusOne op; + PairIterator begin(row_data, col_data); + PairIterator out_begin(out_row_data, out_col_data); + CUDA_CALL(cub::DeviceSelect::If( + nullptr, tmp_size, begin, out_begin, num_out_cuda, num_actual_samples, op, + stream)); + void* tmp = device->AllocWorkspace(ctx, tmp_size); + CUDA_CALL(cub::DeviceSelect::If( + tmp, tmp_size, begin, out_begin, num_out_cuda, num_actual_samples, op, + stream)); + num_out = cuda::GetCUDAScalar(device, ctx, num_out_cuda); + + if (!replace) { + IdArray unique_row = IdArray::Empty({num_out}, dtype, ctx); + IdArray unique_col = IdArray::Empty({num_out}, dtype, ctx); + IdType* unique_row_data = unique_row.Ptr(); + IdType* unique_col_data = unique_col.Ptr(); + PairIterator unique_begin(unique_row_data, unique_col_data); + + SortOrderedPairs( + device, ctx, out_row_data, out_col_data, unique_row_data, + unique_col_data, num_out, stream); + + size_t tmp_size_unique = 0; + void* tmp_unique = nullptr; + CUDA_CALL(cub::DeviceSelect::Unique( + nullptr, tmp_size_unique, out_begin, unique_begin, num_out_cuda, + num_out, stream)); + tmp_unique = (tmp_size_unique > tmp_size) + ? 
device->AllocWorkspace(ctx, tmp_size_unique) + : tmp; // reuse buffer + CUDA_CALL(cub::DeviceSelect::Unique( + tmp_unique, tmp_size_unique, out_begin, unique_begin, num_out_cuda, + num_out, stream)); + num_out = cuda::GetCUDAScalar(device, ctx, num_out_cuda); + + num_out = std::min(num_samples, num_out); + result = { + unique_row.CreateView({num_out}, dtype), + unique_col.CreateView({num_out}, dtype)}; + + if (tmp_unique != tmp) device->FreeWorkspace(ctx, tmp_unique); + } else { + num_out = std::min(num_samples, num_out); + result = { + out_row.CreateView({num_out}, dtype), + out_col.CreateView({num_out}, dtype)}; + } + + device->FreeWorkspace(ctx, tmp); + device->FreeWorkspace(ctx, num_out_cuda); + return result; +} + +template std::pair CSRGlobalUniformNegativeSampling< + kDGLCUDA, int32_t>(const CSRMatrix&, int64_t, int, bool, bool, double); +template std::pair CSRGlobalUniformNegativeSampling< + kDGLCUDA, int64_t>(const CSRMatrix&, int64_t, int, bool, bool, double); + +}; // namespace impl +}; // namespace aten +}; // namespace dgl diff --git a/src/array/cuda/rowwise_sampling.cu b/src/array/cuda/rowwise_sampling.cu index 38f08e7b277d..aaa777a179cf 100644 --- a/src/array/cuda/rowwise_sampling.cu +++ b/src/array/cuda/rowwise_sampling.cu @@ -1,15 +1,16 @@ +#include "hip/hip_runtime.h" /** * Copyright (c) 2021 by Contributors * @file array/cuda/rowwise_sampling.cu * @brief uniform rowwise sampling */ -#include +#include #include #include #include -#include +#include #include #include "../../array/cuda/atomic.cuh" @@ -126,8 +127,8 @@ __global__ void _CSRRowWiseSampleUniformKernel( const int64_t last_row = min(static_cast(blockIdx.x + 1) * TILE_SIZE, num_rows); - curandStatePhilox4_32_10_t rng; - curand_init(rand_seed * gridDim.x + blockIdx.x, threadIdx.x, 0, &rng); + hiprandStatePhilox4_32_10_t rng; + hiprand_init(rand_seed * gridDim.x + blockIdx.x, threadIdx.x, 0, &rng); while (out_row < last_row) { const int64_t row = in_rows[out_row]; @@ -151,7 +152,7 @@ __global__ void _CSRRowWiseSampleUniformKernel( __syncthreads(); for (int idx = num_picks + threadIdx.x; idx < deg; idx += BLOCK_SIZE) { - const int num = curand(&rng) % (idx + 1); + const int num = hiprand(&rng) % (idx + 1); if (num < num_picks) { // use max so as to achieve the replacement order the serial // algorithm would have @@ -204,8 +205,8 @@ __global__ void _CSRRowWiseSampleUniformReplaceKernel( const int64_t last_row = min(static_cast(blockIdx.x + 1) * TILE_SIZE, num_rows); - curandStatePhilox4_32_10_t rng; - curand_init(rand_seed * gridDim.x + blockIdx.x, threadIdx.x, 0, &rng); + hiprandStatePhilox4_32_10_t rng; + hiprand_init(rand_seed * gridDim.x + blockIdx.x, threadIdx.x, 0, &rng); while (out_row < last_row) { const int64_t row = in_rows[out_row]; @@ -216,7 +217,7 @@ __global__ void _CSRRowWiseSampleUniformReplaceKernel( if (deg > 0) { // each thread then blindly copies in rows only if deg > 0. 
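// The without-replacement kernel above parallelizes classic reservoir sampling
// (Algorithm R): seed the reservoir with the first num_picks neighbors, then
// let the idx-th neighbor replace a random slot with probability
// num_picks / (idx + 1); AtomicMax reproduces the replacement order the serial
// algorithm would give.  A sequential host sketch of the same idea, assuming
// std::mt19937 in place of Philox (illustration only):
//
#include <algorithm>
#include <cstdint>
#include <random>
#include <vector>

// Uniformly pick num_picks positions out of 0..deg-1 without replacement.
static std::vector<int64_t> ReservoirSampleHost(
    int64_t deg, int64_t num_picks, uint64_t seed) {
  std::mt19937 rng(seed);
  std::vector<int64_t> picks(static_cast<size_t>(std::min(deg, num_picks)));
  for (int64_t i = 0; i < static_cast<int64_t>(picks.size()); ++i)
    picks[i] = i;  // fill the reservoir with the first positions
  for (int64_t i = num_picks; i < deg; ++i) {
    std::uniform_int_distribution<int64_t> dist(0, i);
    const int64_t j = dist(rng);
    if (j < num_picks) picks[j] = i;  // replace a random reservoir slot
  }
  return picks;
}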
for (int idx = threadIdx.x; idx < num_picks; idx += BLOCK_SIZE) { - const int64_t edge = curand(&rng) % deg; + const int64_t edge = hiprand(&rng) % deg; const int64_t out_idx = out_row_start + idx; out_rows[out_idx] = row; out_cols[out_idx] = in_index[in_row_start + edge]; @@ -237,7 +238,7 @@ COOMatrix _CSRRowWiseSamplingUniform( CSRMatrix mat, IdArray rows, const int64_t num_picks, const bool replace) { const auto& ctx = rows->ctx; auto device = runtime::DeviceAPI::Get(ctx); - cudaStream_t stream = runtime::getCurrentCUDAStream(); + hipStream_t stream = runtime::getCurrentCUDAStream(); const int64_t num_rows = rows->shape[0]; const IdType* const slice_rows = static_cast(rows->data); @@ -279,16 +280,16 @@ COOMatrix _CSRRowWiseSamplingUniform( IdType* out_ptr = static_cast( device->AllocWorkspace(ctx, (num_rows + 1) * sizeof(IdType))); size_t prefix_temp_size = 0; - CUDA_CALL(cub::DeviceScan::ExclusiveSum( + CUDA_CALL(hipcub::DeviceScan::ExclusiveSum( nullptr, prefix_temp_size, out_deg, out_ptr, num_rows + 1, stream)); void* prefix_temp = device->AllocWorkspace(ctx, prefix_temp_size); - CUDA_CALL(cub::DeviceScan::ExclusiveSum( + CUDA_CALL(hipcub::DeviceScan::ExclusiveSum( prefix_temp, prefix_temp_size, out_deg, out_ptr, num_rows + 1, stream)); device->FreeWorkspace(ctx, prefix_temp); device->FreeWorkspace(ctx, out_deg); - cudaEvent_t copyEvent; - CUDA_CALL(cudaEventCreate(©Event)); + hipEvent_t copyEvent; + CUDA_CALL(hipEventCreate(©Event)); NDArray new_len_tensor; if (TensorDispatcher::Global()->IsAvailable()) { @@ -301,10 +302,10 @@ COOMatrix _CSRRowWiseSamplingUniform( } // copy using the internal current stream - CUDA_CALL(cudaMemcpyAsync( + CUDA_CALL(hipMemcpyAsync( new_len_tensor->data, out_ptr + num_rows, sizeof(IdType), - cudaMemcpyDeviceToHost, stream)); - CUDA_CALL(cudaEventRecord(copyEvent, stream)); + hipMemcpyDeviceToHost, stream)); + CUDA_CALL(hipEventRecord(copyEvent, stream)); const uint64_t random_seed = RandomEngine::ThreadLocal()->RandInt(1000000000); @@ -329,8 +330,8 @@ COOMatrix _CSRRowWiseSamplingUniform( device->FreeWorkspace(ctx, out_ptr); // wait for copying `new_len` to finish - CUDA_CALL(cudaEventSynchronize(copyEvent)); - CUDA_CALL(cudaEventDestroy(copyEvent)); + CUDA_CALL(hipEventSynchronize(copyEvent)); + CUDA_CALL(hipEventDestroy(copyEvent)); const IdType new_len = static_cast(new_len_tensor->data)[0]; picked_row = picked_row.CreateView({new_len}, picked_row->dtype); diff --git a/src/array/cuda/rowwise_sampling.cu.prehip b/src/array/cuda/rowwise_sampling.cu.prehip new file mode 100644 index 000000000000..38f08e7b277d --- /dev/null +++ b/src/array/cuda/rowwise_sampling.cu.prehip @@ -0,0 +1,366 @@ +/** + * Copyright (c) 2021 by Contributors + * @file array/cuda/rowwise_sampling.cu + * @brief uniform rowwise sampling + */ + +#include +#include +#include +#include + +#include +#include + +#include "../../array/cuda/atomic.cuh" +#include "../../runtime/cuda/cuda_common.h" +#include "./utils.h" + +using namespace dgl::cuda; +using namespace dgl::aten::cuda; +using TensorDispatcher = dgl::runtime::TensorDispatcher; + +namespace dgl { +namespace aten { +namespace impl { + +namespace { + +constexpr int BLOCK_SIZE = 128; + +/** + * @brief Compute the size of each row in the sampled CSR, without replacement. + * + * @tparam IdType The type of node and edge indexes. + * @param num_picks The number of non-zero entries to pick per row. + * @param num_rows The number of rows to pick. + * @param in_rows The set of rows to pick. 
+ * @param in_ptr The index where each row's edges start. + * @param out_deg The size of each row in the sampled matrix, as indexed by + * `in_rows` (output). + */ +template +__global__ void _CSRRowWiseSampleDegreeKernel( + const int64_t num_picks, const int64_t num_rows, + const IdType* const in_rows, const IdType* const in_ptr, + IdType* const out_deg) { + const int tIdx = threadIdx.x + blockIdx.x * blockDim.x; + + if (tIdx < num_rows) { + const int in_row = in_rows[tIdx]; + const int out_row = tIdx; + out_deg[out_row] = min( + static_cast(num_picks), in_ptr[in_row + 1] - in_ptr[in_row]); + + if (out_row == num_rows - 1) { + // make the prefixsum work + out_deg[num_rows] = 0; + } + } +} + +/** + * @brief Compute the size of each row in the sampled CSR, with replacement. + * + * @tparam IdType The type of node and edge indexes. + * @param num_picks The number of non-zero entries to pick per row. + * @param num_rows The number of rows to pick. + * @param in_rows The set of rows to pick. + * @param in_ptr The index where each row's edges start. + * @param out_deg The size of each row in the sampled matrix, as indexed by + * `in_rows` (output). + */ +template +__global__ void _CSRRowWiseSampleDegreeReplaceKernel( + const int64_t num_picks, const int64_t num_rows, + const IdType* const in_rows, const IdType* const in_ptr, + IdType* const out_deg) { + const int tIdx = threadIdx.x + blockIdx.x * blockDim.x; + + if (tIdx < num_rows) { + const int64_t in_row = in_rows[tIdx]; + const int64_t out_row = tIdx; + + if (in_ptr[in_row + 1] - in_ptr[in_row] == 0) { + out_deg[out_row] = 0; + } else { + out_deg[out_row] = static_cast(num_picks); + } + + if (out_row == num_rows - 1) { + // make the prefixsum work + out_deg[num_rows] = 0; + } + } +} + +/** + * @brief Perform row-wise uniform sampling on a CSR matrix, + * and generate a COO matrix, without replacement. + * + * @tparam IdType The ID type used for matrices. + * @tparam TILE_SIZE The number of rows covered by each threadblock. + * @param rand_seed The random seed to use. + * @param num_picks The number of non-zeros to pick per row. + * @param num_rows The number of rows to pick. + * @param in_rows The set of rows to pick. + * @param in_ptr The indptr array of the input CSR. + * @param in_index The indices array of the input CSR. + * @param data The data array of the input CSR. + * @param out_ptr The offset to write each row to in the output COO. + * @param out_rows The rows of the output COO (output). + * @param out_cols The columns of the output COO (output). + * @param out_idxs The data array of the output COO (output). 
+ */ +template +__global__ void _CSRRowWiseSampleUniformKernel( + const uint64_t rand_seed, const int64_t num_picks, const int64_t num_rows, + const IdType* const in_rows, const IdType* const in_ptr, + const IdType* const in_index, const IdType* const data, + const IdType* const out_ptr, IdType* const out_rows, IdType* const out_cols, + IdType* const out_idxs) { + // we assign one warp per row + assert(blockDim.x == BLOCK_SIZE); + + int64_t out_row = blockIdx.x * TILE_SIZE; + const int64_t last_row = + min(static_cast(blockIdx.x + 1) * TILE_SIZE, num_rows); + + curandStatePhilox4_32_10_t rng; + curand_init(rand_seed * gridDim.x + blockIdx.x, threadIdx.x, 0, &rng); + + while (out_row < last_row) { + const int64_t row = in_rows[out_row]; + const int64_t in_row_start = in_ptr[row]; + const int64_t deg = in_ptr[row + 1] - in_row_start; + const int64_t out_row_start = out_ptr[out_row]; + + if (deg <= num_picks) { + // just copy row when there is not enough nodes to sample. + for (int idx = threadIdx.x; idx < deg; idx += BLOCK_SIZE) { + const IdType in_idx = in_row_start + idx; + out_rows[out_row_start + idx] = row; + out_cols[out_row_start + idx] = in_index[in_idx]; + out_idxs[out_row_start + idx] = data ? data[in_idx] : in_idx; + } + } else { + // generate permutation list via reservoir algorithm + for (int idx = threadIdx.x; idx < num_picks; idx += BLOCK_SIZE) { + out_idxs[out_row_start + idx] = idx; + } + __syncthreads(); + + for (int idx = num_picks + threadIdx.x; idx < deg; idx += BLOCK_SIZE) { + const int num = curand(&rng) % (idx + 1); + if (num < num_picks) { + // use max so as to achieve the replacement order the serial + // algorithm would have + AtomicMax(out_idxs + out_row_start + num, idx); + } + } + __syncthreads(); + + // copy permutation over + for (int idx = threadIdx.x; idx < num_picks; idx += BLOCK_SIZE) { + const IdType perm_idx = out_idxs[out_row_start + idx] + in_row_start; + out_rows[out_row_start + idx] = row; + out_cols[out_row_start + idx] = in_index[perm_idx]; + out_idxs[out_row_start + idx] = data ? data[perm_idx] : perm_idx; + } + } + out_row += 1; + } +} + +/** + * @brief Perform row-wise uniform sampling on a CSR matrix, + * and generate a COO matrix, with replacement. + * + * @tparam IdType The ID type used for matrices. + * @tparam TILE_SIZE The number of rows covered by each threadblock. + * @param rand_seed The random seed to use. + * @param num_picks The number of non-zeros to pick per row. + * @param num_rows The number of rows to pick. + * @param in_rows The set of rows to pick. + * @param in_ptr The indptr array of the input CSR. + * @param in_index The indices array of the input CSR. + * @param data The data array of the input CSR. + * @param out_ptr The offset to write each row to in the output COO. + * @param out_rows The rows of the output COO (output). + * @param out_cols The columns of the output COO (output). + * @param out_idxs The data array of the output COO (output). 
+ */ +template +__global__ void _CSRRowWiseSampleUniformReplaceKernel( + const uint64_t rand_seed, const int64_t num_picks, const int64_t num_rows, + const IdType* const in_rows, const IdType* const in_ptr, + const IdType* const in_index, const IdType* const data, + const IdType* const out_ptr, IdType* const out_rows, IdType* const out_cols, + IdType* const out_idxs) { + // we assign one warp per row + assert(blockDim.x == BLOCK_SIZE); + + int64_t out_row = blockIdx.x * TILE_SIZE; + const int64_t last_row = + min(static_cast(blockIdx.x + 1) * TILE_SIZE, num_rows); + + curandStatePhilox4_32_10_t rng; + curand_init(rand_seed * gridDim.x + blockIdx.x, threadIdx.x, 0, &rng); + + while (out_row < last_row) { + const int64_t row = in_rows[out_row]; + const int64_t in_row_start = in_ptr[row]; + const int64_t out_row_start = out_ptr[out_row]; + const int64_t deg = in_ptr[row + 1] - in_row_start; + + if (deg > 0) { + // each thread then blindly copies in rows only if deg > 0. + for (int idx = threadIdx.x; idx < num_picks; idx += BLOCK_SIZE) { + const int64_t edge = curand(&rng) % deg; + const int64_t out_idx = out_row_start + idx; + out_rows[out_idx] = row; + out_cols[out_idx] = in_index[in_row_start + edge]; + out_idxs[out_idx] = + data ? data[in_row_start + edge] : in_row_start + edge; + } + } + out_row += 1; + } +} + +} // namespace + +///////////////////////////// CSR sampling ////////////////////////// + +template +COOMatrix _CSRRowWiseSamplingUniform( + CSRMatrix mat, IdArray rows, const int64_t num_picks, const bool replace) { + const auto& ctx = rows->ctx; + auto device = runtime::DeviceAPI::Get(ctx); + cudaStream_t stream = runtime::getCurrentCUDAStream(); + + const int64_t num_rows = rows->shape[0]; + const IdType* const slice_rows = static_cast(rows->data); + + IdArray picked_row = + NewIdArray(num_rows * num_picks, ctx, sizeof(IdType) * 8); + IdArray picked_col = + NewIdArray(num_rows * num_picks, ctx, sizeof(IdType) * 8); + IdArray picked_idx = + NewIdArray(num_rows * num_picks, ctx, sizeof(IdType) * 8); + IdType* const out_rows = static_cast(picked_row->data); + IdType* const out_cols = static_cast(picked_col->data); + IdType* const out_idxs = static_cast(picked_idx->data); + + const IdType* in_ptr = static_cast(GetDevicePointer(mat.indptr)); + const IdType* in_cols = static_cast(GetDevicePointer(mat.indices)); + const IdType* data = CSRHasData(mat) + ? 
static_cast(GetDevicePointer(mat.data)) + : nullptr; + + // compute degree + IdType* out_deg = static_cast( + device->AllocWorkspace(ctx, (num_rows + 1) * sizeof(IdType))); + if (replace) { + const dim3 block(512); + const dim3 grid((num_rows + block.x - 1) / block.x); + CUDA_KERNEL_CALL( + _CSRRowWiseSampleDegreeReplaceKernel, grid, block, 0, stream, num_picks, + num_rows, slice_rows, in_ptr, out_deg); + } else { + const dim3 block(512); + const dim3 grid((num_rows + block.x - 1) / block.x); + CUDA_KERNEL_CALL( + _CSRRowWiseSampleDegreeKernel, grid, block, 0, stream, num_picks, + num_rows, slice_rows, in_ptr, out_deg); + } + + // fill out_ptr + IdType* out_ptr = static_cast( + device->AllocWorkspace(ctx, (num_rows + 1) * sizeof(IdType))); + size_t prefix_temp_size = 0; + CUDA_CALL(cub::DeviceScan::ExclusiveSum( + nullptr, prefix_temp_size, out_deg, out_ptr, num_rows + 1, stream)); + void* prefix_temp = device->AllocWorkspace(ctx, prefix_temp_size); + CUDA_CALL(cub::DeviceScan::ExclusiveSum( + prefix_temp, prefix_temp_size, out_deg, out_ptr, num_rows + 1, stream)); + device->FreeWorkspace(ctx, prefix_temp); + device->FreeWorkspace(ctx, out_deg); + + cudaEvent_t copyEvent; + CUDA_CALL(cudaEventCreate(©Event)); + + NDArray new_len_tensor; + if (TensorDispatcher::Global()->IsAvailable()) { + new_len_tensor = NDArray::PinnedEmpty( + {1}, DGLDataTypeTraits::dtype, DGLContext{kDGLCPU, 0}); + } else { + // use pageable memory, it will unecessarily block but be functional + new_len_tensor = NDArray::Empty( + {1}, DGLDataTypeTraits::dtype, DGLContext{kDGLCPU, 0}); + } + + // copy using the internal current stream + CUDA_CALL(cudaMemcpyAsync( + new_len_tensor->data, out_ptr + num_rows, sizeof(IdType), + cudaMemcpyDeviceToHost, stream)); + CUDA_CALL(cudaEventRecord(copyEvent, stream)); + + const uint64_t random_seed = RandomEngine::ThreadLocal()->RandInt(1000000000); + + // select edges + // the number of rows each thread block will cover + constexpr int TILE_SIZE = 128 / BLOCK_SIZE; + if (replace) { // with replacement + const dim3 block(BLOCK_SIZE); + const dim3 grid((num_rows + TILE_SIZE - 1) / TILE_SIZE); + CUDA_KERNEL_CALL( + (_CSRRowWiseSampleUniformReplaceKernel), grid, block, + 0, stream, random_seed, num_picks, num_rows, slice_rows, in_ptr, + in_cols, data, out_ptr, out_rows, out_cols, out_idxs); + } else { // without replacement + const dim3 block(BLOCK_SIZE); + const dim3 grid((num_rows + TILE_SIZE - 1) / TILE_SIZE); + CUDA_KERNEL_CALL( + (_CSRRowWiseSampleUniformKernel), grid, block, 0, + stream, random_seed, num_picks, num_rows, slice_rows, in_ptr, in_cols, + data, out_ptr, out_rows, out_cols, out_idxs); + } + device->FreeWorkspace(ctx, out_ptr); + + // wait for copying `new_len` to finish + CUDA_CALL(cudaEventSynchronize(copyEvent)); + CUDA_CALL(cudaEventDestroy(copyEvent)); + + const IdType new_len = static_cast(new_len_tensor->data)[0]; + picked_row = picked_row.CreateView({new_len}, picked_row->dtype); + picked_col = picked_col.CreateView({new_len}, picked_col->dtype); + picked_idx = picked_idx.CreateView({new_len}, picked_idx->dtype); + + return COOMatrix( + mat.num_rows, mat.num_cols, picked_row, picked_col, picked_idx); +} + +template +COOMatrix CSRRowWiseSamplingUniform( + CSRMatrix mat, IdArray rows, const int64_t num_picks, const bool replace) { + if (num_picks == -1) { + // Basically this is UnitGraph::InEdges(). 
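// When num_picks is -1 the branch below skips sampling entirely and returns
// every neighbor of the requested rows (a row slice of the CSR flattened to
// COO).  A host sketch of that degenerate case, with a hypothetical CooEdge
// struct standing in for the COOMatrix arrays (illustration only):
//
#include <cstdint>
#include <vector>

struct CooEdge { int64_t row, col, eid; };

static std::vector<CooEdge> AllInEdgesHost(
    const std::vector<int64_t>& indptr, const std::vector<int64_t>& indices,
    const std::vector<int64_t>& rows) {
  std::vector<CooEdge> out;
  for (int64_t r : rows) {
    for (int64_t e = indptr[r]; e < indptr[r + 1]; ++e)
      out.push_back({r, indices[e], e});  // no data array: eid is the position
  }
  return out;
}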
+ COOMatrix coo = CSRToCOO(CSRSliceRows(mat, rows), false); + IdArray sliced_rows = IndexSelect(rows, coo.row); + return COOMatrix( + mat.num_rows, mat.num_cols, sliced_rows, coo.col, coo.data); + } else { + return _CSRRowWiseSamplingUniform( + mat, rows, num_picks, replace); + } +} + +template COOMatrix CSRRowWiseSamplingUniform( + CSRMatrix, IdArray, int64_t, bool); +template COOMatrix CSRRowWiseSamplingUniform( + CSRMatrix, IdArray, int64_t, bool); + +} // namespace impl +} // namespace aten +} // namespace dgl diff --git a/src/array/cuda/rowwise_sampling_prob.cu b/src/array/cuda/rowwise_sampling_prob.cu index bab553ca8b56..229923cb720d 100644 --- a/src/array/cuda/rowwise_sampling_prob.cu +++ b/src/array/cuda/rowwise_sampling_prob.cu @@ -1,3 +1,4 @@ +#include "hip/hip_runtime.h" /** * Copyright (c) 2022 by Contributors * @file array/cuda/rowwise_sampling_prob.cu @@ -6,11 +7,11 @@ * sampling code rowwise_sampling.cu. * @author pengqirong (OPPO), dlasalle and Xin from Nvidia. */ -#include +#include #include #include -#include +#include #include #include "../../array/cuda/atomic.cuh" @@ -19,7 +20,7 @@ // require CUB 1.17 to use DeviceSegmentedSort static_assert( - CUB_VERSION >= 101700, "Require CUB >= 1.17 to use DeviceSegmentedSort"); + HIPCUB_VERSION >= 101700, "Require CUB >= 1.17 to use DeviceSegmentedSort"); namespace dgl { using namespace cuda; @@ -159,8 +160,8 @@ __global__ void _CSRAResValueKernel( const int64_t last_row = min(static_cast(blockIdx.x + 1) * TILE_SIZE, num_rows); - curandStatePhilox4_32_10_t rng; - curand_init(rand_seed * gridDim.x + blockIdx.x, threadIdx.x, 0, &rng); + hiprandStatePhilox4_32_10_t rng; + hiprand_init(rand_seed * gridDim.x + blockIdx.x, threadIdx.x, 0, &rng); while (out_row < last_row) { const int64_t row = in_rows[out_row]; @@ -179,7 +180,7 @@ __global__ void _CSRAResValueKernel( prob, data, idx, in_row_start, &item_prob); // compute A-Res value ares[ares_idx] = static_cast( - __powf(curand_uniform(&rng), 1.0f / item_prob)); + __powf(hiprand_uniform(&rng), 1.0f / item_prob)); ares_idxs[ares_idx] = static_cast(in_idx); } } @@ -317,8 +318,8 @@ __global__ void _CSRRowWiseSampleReplaceKernel( const int64_t last_row = min(static_cast(blockIdx.x + 1) * TILE_SIZE, num_rows); - curandStatePhilox4_32_10_t rng; - curand_init(rand_seed * gridDim.x + blockIdx.x, threadIdx.x, 0, &rng); + hiprandStatePhilox4_32_10_t rng; + hiprand_init(rand_seed * gridDim.x + blockIdx.x, threadIdx.x, 0, &rng); while (out_row < last_row) { const int64_t row = in_rows[out_row]; @@ -330,7 +331,7 @@ __global__ void _CSRRowWiseSampleReplaceKernel( if (deg > 0) { // Specialize BlockScan for a 1D block of BLOCK_SIZE threads - typedef cub::BlockScan BlockScan; + typedef hipcub::BlockScan BlockScan; // Allocate shared memory for BlockScan __shared__ typename BlockScan::TempStorage temp_storage; // Initialize running total @@ -362,10 +363,10 @@ __global__ void _CSRRowWiseSampleReplaceKernel( for (int64_t idx = threadIdx.x; idx < num_picks; idx += BLOCK_SIZE) { // get random value FloatType sum = cdf[cdf_row_start + deg - 1]; - FloatType rand = static_cast(curand_uniform(&rng) * sum); + FloatType rand = static_cast(hiprand_uniform(&rng) * sum); // get the offset of the first value within cdf array which is greater // than random value. 
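// The with-replacement path draws each pick by inverse-transform sampling:
// build the (unnormalized) inclusive prefix sum of the weights once, then
// binary-search a uniform draw scaled by the total weight; UpperBound below is
// the device-side counterpart of that search.  A host sketch assuming
// std::mt19937 in place of Philox (illustration only):
//
#include <algorithm>
#include <cstdint>
#include <numeric>
#include <random>
#include <vector>

// Draw num_picks indices in [0, weights.size()) with replacement, with
// probability proportional to the non-negative weights.
static std::vector<int64_t> WeightedPicksWithReplacementHost(
    const std::vector<float>& weights, int64_t num_picks, uint64_t seed) {
  std::vector<float> cdf(weights.size());
  std::partial_sum(weights.begin(), weights.end(), cdf.begin());
  std::mt19937 rng(seed);
  std::uniform_real_distribution<float> uni(0.0f, 1.0f);
  std::vector<int64_t> picks(static_cast<size_t>(num_picks));
  for (int64_t i = 0; i < num_picks; ++i) {
    const float r = uni(rng) * cdf.back();  // scale into [0, total weight)
    auto it = std::upper_bound(cdf.begin(), cdf.end(), r);
    picks[i] = std::min<int64_t>(
        it - cdf.begin(), static_cast<int64_t>(cdf.size()) - 1);
  }
  return picks;
}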
- int64_t item = cub::UpperBound( + int64_t item = hipcub::UpperBound( &cdf[cdf_row_start], deg, rand); item = min(item, deg - 1); // get in and out index @@ -441,7 +442,7 @@ COOMatrix _COORemoveIf( const COOMatrix& coo, const NDArray& values, DType criteria) { const DType* val = values.Ptr(); auto maskgen = [val, criteria]( - int nb, int nt, cudaStream_t stream, int64_t nnz, + int nb, int nt, hipStream_t stream, int64_t nnz, const IdType* data, int8_t* flags) { CUDA_KERNEL_CALL( (_GenerateFlagsKernel), nb, nt, 0, stream, nnz, @@ -481,7 +482,7 @@ COOMatrix _CSRRowWiseSampling( const FloatArray& prob, bool replace) { const auto& ctx = rows->ctx; auto device = runtime::DeviceAPI::Get(ctx); - cudaStream_t stream = runtime::getCurrentCUDAStream(); + hipStream_t stream = runtime::getCurrentCUDAStream(); const int64_t num_rows = rows->shape[0]; const IdType* const slice_rows = static_cast(rows->data); @@ -530,10 +531,10 @@ COOMatrix _CSRRowWiseSampling( IdType* temp_ptr = static_cast( device->AllocWorkspace(ctx, (num_rows + 1) * sizeof(IdType))); size_t prefix_temp_size = 0; - CUDA_CALL(cub::DeviceScan::ExclusiveSum( + CUDA_CALL(hipcub::DeviceScan::ExclusiveSum( nullptr, prefix_temp_size, temp_deg, temp_ptr, num_rows + 1, stream)); void* prefix_temp = device->AllocWorkspace(ctx, prefix_temp_size); - CUDA_CALL(cub::DeviceScan::ExclusiveSum( + CUDA_CALL(hipcub::DeviceScan::ExclusiveSum( prefix_temp, prefix_temp_size, temp_deg, temp_ptr, num_rows + 1, stream)); device->FreeWorkspace(ctx, prefix_temp); device->FreeWorkspace(ctx, temp_deg); @@ -551,16 +552,16 @@ COOMatrix _CSRRowWiseSampling( IdType* out_ptr = static_cast( device->AllocWorkspace(ctx, (num_rows + 1) * sizeof(IdType))); prefix_temp_size = 0; - CUDA_CALL(cub::DeviceScan::ExclusiveSum( + CUDA_CALL(hipcub::DeviceScan::ExclusiveSum( nullptr, prefix_temp_size, out_deg, out_ptr, num_rows + 1, stream)); prefix_temp = device->AllocWorkspace(ctx, prefix_temp_size); - CUDA_CALL(cub::DeviceScan::ExclusiveSum( + CUDA_CALL(hipcub::DeviceScan::ExclusiveSum( prefix_temp, prefix_temp_size, out_deg, out_ptr, num_rows + 1, stream)); device->FreeWorkspace(ctx, prefix_temp); device->FreeWorkspace(ctx, out_deg); - cudaEvent_t copyEvent; - CUDA_CALL(cudaEventCreate(©Event)); + hipEvent_t copyEvent; + CUDA_CALL(hipEventCreate(©Event)); // TODO(dlasalle): use pinned memory to overlap with the actual sampling, and // wait on a cudaevent IdType new_len; @@ -568,7 +569,7 @@ COOMatrix _CSRRowWiseSampling( device->CopyDataFromTo( out_ptr, num_rows * sizeof(new_len), &new_len, 0, sizeof(new_len), ctx, DGLContext{kDGLCPU, 0}, mat.indptr->dtype); - CUDA_CALL(cudaEventRecord(copyEvent, stream)); + CUDA_CALL(hipEventRecord(copyEvent, stream)); // allocate workspace // 1) for w/ replacement, it's a global buffer to store cdf segments (one @@ -612,16 +613,16 @@ COOMatrix _CSRRowWiseSampling( IdType* sort_temp_idxs = static_cast( device->AllocWorkspace(ctx, temp_len * sizeof(IdType))); - cub::DoubleBuffer sort_keys(temp, sort_temp); - cub::DoubleBuffer sort_values(temp_idxs, sort_temp_idxs); + hipcub::DoubleBuffer sort_keys(temp, sort_temp); + hipcub::DoubleBuffer sort_values(temp_idxs, sort_temp_idxs); void* d_temp_storage = nullptr; size_t temp_storage_bytes = 0; - CUDA_CALL(cub::DeviceSegmentedSort::SortPairsDescending( + CUDA_CALL(hipcub::DeviceSegmentedSort::SortPairsDescending( d_temp_storage, temp_storage_bytes, sort_keys, sort_values, temp_len, num_rows, temp_ptr, temp_ptr + 1, stream)); d_temp_storage = device->AllocWorkspace(ctx, temp_storage_bytes); - 
CUDA_CALL(cub::DeviceSegmentedSort::SortPairsDescending( + CUDA_CALL(hipcub::DeviceSegmentedSort::SortPairsDescending( d_temp_storage, temp_storage_bytes, sort_keys, sort_values, temp_len, num_rows, temp_ptr, temp_ptr + 1, stream)); device->FreeWorkspace(ctx, d_temp_storage); @@ -641,8 +642,8 @@ COOMatrix _CSRRowWiseSampling( device->FreeWorkspace(ctx, out_ptr); // wait for copying `new_len` to finish - CUDA_CALL(cudaEventSynchronize(copyEvent)); - CUDA_CALL(cudaEventDestroy(copyEvent)); + CUDA_CALL(hipEventSynchronize(copyEvent)); + CUDA_CALL(hipEventDestroy(copyEvent)); picked_row = picked_row.CreateView({new_len}, picked_row->dtype); picked_col = picked_col.CreateView({new_len}, picked_col->dtype); diff --git a/src/array/cuda/rowwise_sampling_prob.cu.prehip b/src/array/cuda/rowwise_sampling_prob.cu.prehip new file mode 100644 index 000000000000..bab553ca8b56 --- /dev/null +++ b/src/array/cuda/rowwise_sampling_prob.cu.prehip @@ -0,0 +1,696 @@ +/** + * Copyright (c) 2022 by Contributors + * @file array/cuda/rowwise_sampling_prob.cu + * @brief weighted rowwise sampling. The degree computing kernels and + * host-side functions are partially borrowed from the uniform rowwise + * sampling code rowwise_sampling.cu. + * @author pengqirong (OPPO), dlasalle and Xin from Nvidia. + */ +#include +#include +#include + +#include +#include + +#include "../../array/cuda/atomic.cuh" +#include "../../runtime/cuda/cuda_common.h" +#include "./utils.h" + +// require CUB 1.17 to use DeviceSegmentedSort +static_assert( + CUB_VERSION >= 101700, "Require CUB >= 1.17 to use DeviceSegmentedSort"); + +namespace dgl { +using namespace cuda; +using namespace aten::cuda; +namespace aten { +namespace impl { + +namespace { + +constexpr int BLOCK_SIZE = 128; + +/** + * @brief Compute the size of each row in the sampled CSR, without replacement. + * temp_deg is calculated for rows with deg > num_picks. + * For these rows, we will calculate their A-Res values and sort them to get + * top-num_picks. + * + * @tparam IdType The type of node and edge indexes. + * @param num_picks The number of non-zero entries to pick per row. + * @param num_rows The number of rows to pick. + * @param in_rows The set of rows to pick. + * @param in_ptr The index where each row's edges start. + * @param out_deg The size of each row in the sampled matrix, as indexed by + * `in_rows` (output). + * @param temp_deg The size of each row in the input matrix, as indexed by + * `in_rows` (output). + */ +template +__global__ void _CSRRowWiseSampleDegreeKernel( + const int64_t num_picks, const int64_t num_rows, + const IdType* const in_rows, const IdType* const in_ptr, + IdType* const out_deg, IdType* const temp_deg) { + const int64_t tIdx = threadIdx.x + blockIdx.x * blockDim.x; + + if (tIdx < num_rows) { + const int64_t in_row = in_rows[tIdx]; + const int64_t out_row = tIdx; + const IdType deg = in_ptr[in_row + 1] - in_ptr[in_row]; + // temp_deg is used to generate ares_ptr + temp_deg[out_row] = deg > static_cast(num_picks) ? deg : 0; + out_deg[out_row] = min(static_cast(num_picks), deg); + + if (out_row == num_rows - 1) { + // make the prefixsum work + out_deg[num_rows] = 0; + temp_deg[num_rows] = 0; + } + } +} + +/** + * @brief Compute the size of each row in the sampled CSR, with replacement. + * We need the actual in degree of each row to store CDF values. + * + * @tparam IdType The type of node and edge indexes. + * @param num_picks The number of non-zero entries to pick per row. + * @param num_rows The number of rows to pick. 
+ * @param in_rows The set of rows to pick. + * @param in_ptr The index where each row's edges start. + * @param out_deg The size of each row in the sampled matrix, as indexed by + * `in_rows` (output). + * @param temp_deg The size of each row in the input matrix, as indexed by + * `in_rows` (output). + */ +template +__global__ void _CSRRowWiseSampleDegreeReplaceKernel( + const int64_t num_picks, const int64_t num_rows, + const IdType* const in_rows, const IdType* const in_ptr, + IdType* const out_deg, IdType* const temp_deg) { + const int64_t tIdx = threadIdx.x + blockIdx.x * blockDim.x; + + if (tIdx < num_rows) { + const int64_t in_row = in_rows[tIdx]; + const int64_t out_row = tIdx; + const IdType deg = in_ptr[in_row + 1] - in_ptr[in_row]; + temp_deg[out_row] = deg; + out_deg[out_row] = deg == 0 ? 0 : static_cast(num_picks); + + if (out_row == num_rows - 1) { + // make the prefixsum work + out_deg[num_rows] = 0; + temp_deg[num_rows] = 0; + } + } +} + +/** + * @brief Equivalent to numpy expression: array[idx[off:off + len]] + * + * @tparam IdType The ID type used for indices. + * @tparam FloatType The float type used for array values. + * @param array The array to be selected. + * @param idx_data The index mapping array. + * @param index The index of value to be selected. + * @param offset The offset to start. + * @param out The selected value (output). + */ +template +__device__ void _DoubleSlice( + const FloatType* const array, const IdType* const idx_data, + const IdType idx, const IdType offset, FloatType* const out) { + if (idx_data) { + *out = array[idx_data[offset + idx]]; + } else { + *out = array[offset + idx]; + } +} + +/** + * @brief Compute A-Res value. A-Res value needs to be calculated only if deg + * is greater than num_picks in weighted rowwise sampling without replacement. + * + * @tparam IdType The ID type used for matrices. + * @tparam FloatType The Float type used for matrices. + * @tparam TILE_SIZE The number of rows covered by each threadblock. + * @param rand_seed The random seed to use. + * @param num_picks The number of non-zeros to pick per row. + * @param num_rows The number of rows to pick. + * @param in_rows The set of rows to pick. + * @param in_ptr The indptr array of the input CSR. + * @param data The data array of the input CSR. + * @param prob The probability array of the input CSR. + * @param ares_ptr The offset to write each row to in the A-res array. + * @param ares_idxs The A-Res value corresponding index array, the index of + * input CSR (output). + * @param ares The A-Res value array (output). 
+ * @author pengqirong (OPPO) + */ +template +__global__ void _CSRAResValueKernel( + const uint64_t rand_seed, const int64_t num_picks, const int64_t num_rows, + const IdType* const in_rows, const IdType* const in_ptr, + const IdType* const data, const FloatType* const prob, + const IdType* const ares_ptr, IdType* const ares_idxs, + FloatType* const ares) { + int64_t out_row = blockIdx.x * TILE_SIZE; + const int64_t last_row = + min(static_cast(blockIdx.x + 1) * TILE_SIZE, num_rows); + + curandStatePhilox4_32_10_t rng; + curand_init(rand_seed * gridDim.x + blockIdx.x, threadIdx.x, 0, &rng); + + while (out_row < last_row) { + const int64_t row = in_rows[out_row]; + const int64_t in_row_start = in_ptr[row]; + const int64_t deg = in_ptr[row + 1] - in_row_start; + // A-Res value needs to be calculated only if deg is greater than num_picks + // in weighted rowwise sampling without replacement + if (deg > num_picks) { + const int64_t ares_row_start = ares_ptr[out_row]; + + for (int64_t idx = threadIdx.x; idx < deg; idx += BLOCK_SIZE) { + const int64_t in_idx = in_row_start + idx; + const int64_t ares_idx = ares_row_start + idx; + FloatType item_prob; + _DoubleSlice( + prob, data, idx, in_row_start, &item_prob); + // compute A-Res value + ares[ares_idx] = static_cast( + __powf(curand_uniform(&rng), 1.0f / item_prob)); + ares_idxs[ares_idx] = static_cast(in_idx); + } + } + out_row += 1; + } +} + +/** + * @brief Perform weighted row-wise sampling on a CSR matrix, and generate a COO + * matrix, without replacement. After sorting, we select top-num_picks items. + * + * @tparam IdType The ID type used for matrices. + * @tparam FloatType The Float type used for matrices. + * @tparam TILE_SIZE The number of rows covered by each threadblock. + * @param num_picks The number of non-zeros to pick per row. + * @param num_rows The number of rows to pick. + * @param in_rows The set of rows to pick. + * @param in_ptr The indptr array of the input CSR. + * @param in_cols The columns array of the input CSR. + * @param data The data array of the input CSR. + * @param out_ptr The offset to write each row to in the output COO. + * @param ares_ptr The offset to write each row to in the ares array. + * @param sort_ares_idxs The sorted A-Res value corresponding index array, the + * index of input CSR. + * @param out_rows The rows of the output COO (output). + * @param out_cols The columns of the output COO (output). + * @param out_idxs The data array of the output COO (output). 
+ * @author pengqirong (OPPO) + */ +template +__global__ void _CSRRowWiseSampleKernel( + const int64_t num_picks, const int64_t num_rows, + const IdType* const in_rows, const IdType* const in_ptr, + const IdType* const in_cols, const IdType* const data, + const IdType* const out_ptr, const IdType* const ares_ptr, + const IdType* const sort_ares_idxs, IdType* const out_rows, + IdType* const out_cols, IdType* const out_idxs) { + // we assign one warp per row + assert(blockDim.x == BLOCK_SIZE); + + int64_t out_row = blockIdx.x * TILE_SIZE; + const int64_t last_row = + min(static_cast(blockIdx.x + 1) * TILE_SIZE, num_rows); + + while (out_row < last_row) { + const int64_t row = in_rows[out_row]; + const int64_t in_row_start = in_ptr[row]; + const int64_t out_row_start = out_ptr[out_row]; + const int64_t deg = in_ptr[row + 1] - in_row_start; + + if (deg > num_picks) { + const int64_t ares_row_start = ares_ptr[out_row]; + for (int64_t idx = threadIdx.x; idx < num_picks; idx += BLOCK_SIZE) { + // get in and out index, the in_idx is one of top num_picks A-Res value + // corresponding index in input CSR. + const int64_t out_idx = out_row_start + idx; + const int64_t ares_idx = ares_row_start + idx; + const int64_t in_idx = sort_ares_idxs[ares_idx]; + // copy permutation over + out_rows[out_idx] = static_cast(row); + out_cols[out_idx] = in_cols[in_idx]; + out_idxs[out_idx] = static_cast(data ? data[in_idx] : in_idx); + } + } else { + for (int64_t idx = threadIdx.x; idx < deg; idx += BLOCK_SIZE) { + // get in and out index + const int64_t out_idx = out_row_start + idx; + const int64_t in_idx = in_row_start + idx; + // copy permutation over + out_rows[out_idx] = static_cast(row); + out_cols[out_idx] = in_cols[in_idx]; + out_idxs[out_idx] = static_cast(data ? data[in_idx] : in_idx); + } + } + out_row += 1; + } +} + +// A stateful callback functor that maintains a running prefix to be applied +// during consecutive scan operations. +template +struct BlockPrefixCallbackOp { + // Running prefix + FloatType running_total; + // Constructor + __device__ BlockPrefixCallbackOp(FloatType running_total) + : running_total(running_total) {} + // Callback operator to be entered by the first warp of threads in the block. + // Thread-0 is responsible for returning a value for seeding the block-wide + // scan. + __device__ FloatType operator()(FloatType block_aggregate) { + FloatType old_prefix = running_total; + running_total += block_aggregate; + return old_prefix; + } +}; + +/** + * @brief Perform weighted row-wise sampling on a CSR matrix, and generate a COO + * matrix, with replacement. We store the CDF (unnormalized) of all neighbors of + * a row in global memory and use binary search to find inverse indices as + * selected items. + * + * @tparam IdType The ID type used for matrices. + * @tparam FloatType The Float type used for matrices. + * @tparam TILE_SIZE The number of rows covered by each threadblock. + * @param rand_seed The random seed to use. + * @param num_picks The number of non-zeros to pick per row. + * @param num_rows The number of rows to pick. + * @param in_rows The set of rows to pick. + * @param in_ptr The indptr array of the input CSR. + * @param in_cols The columns array of the input CSR. + * @param data The data array of the input CSR. + * @param prob The probability array of the input CSR. + * @param out_ptr The offset to write each row to in the output COO. + * @param cdf_ptr The offset of each cdf segment. + * @param cdf The global buffer to store cdf segments. 
+ * @param out_rows The rows of the output COO (output). + * @param out_cols The columns of the output COO (output). + * @param out_idxs The data array of the output COO (output). + * @author pengqirong (OPPO) + */ +template +__global__ void _CSRRowWiseSampleReplaceKernel( + const uint64_t rand_seed, const int64_t num_picks, const int64_t num_rows, + const IdType* const in_rows, const IdType* const in_ptr, + const IdType* const in_cols, const IdType* const data, + const FloatType* const prob, const IdType* const out_ptr, + const IdType* const cdf_ptr, FloatType* const cdf, IdType* const out_rows, + IdType* const out_cols, IdType* const out_idxs) { + // we assign one warp per row + assert(blockDim.x == BLOCK_SIZE); + + int64_t out_row = blockIdx.x * TILE_SIZE; + const int64_t last_row = + min(static_cast(blockIdx.x + 1) * TILE_SIZE, num_rows); + + curandStatePhilox4_32_10_t rng; + curand_init(rand_seed * gridDim.x + blockIdx.x, threadIdx.x, 0, &rng); + + while (out_row < last_row) { + const int64_t row = in_rows[out_row]; + const int64_t in_row_start = in_ptr[row]; + const int64_t out_row_start = out_ptr[out_row]; + const int64_t cdf_row_start = cdf_ptr[out_row]; + const int64_t deg = in_ptr[row + 1] - in_row_start; + const FloatType MIN_THREAD_DATA = static_cast(0.0f); + + if (deg > 0) { + // Specialize BlockScan for a 1D block of BLOCK_SIZE threads + typedef cub::BlockScan BlockScan; + // Allocate shared memory for BlockScan + __shared__ typename BlockScan::TempStorage temp_storage; + // Initialize running total + BlockPrefixCallbackOp prefix_op(MIN_THREAD_DATA); + + int64_t max_iter = (1 + (deg - 1) / BLOCK_SIZE) * BLOCK_SIZE; + // Have the block iterate over segments of items + for (int64_t idx = threadIdx.x; idx < max_iter; idx += BLOCK_SIZE) { + // Load a segment of consecutive items that are blocked across threads + FloatType thread_data; + if (idx < deg) + _DoubleSlice( + prob, data, idx, in_row_start, &thread_data); + else + thread_data = MIN_THREAD_DATA; + thread_data = max(thread_data, MIN_THREAD_DATA); + // Collectively compute the block-wide inclusive prefix sum + BlockScan(temp_storage) + .InclusiveSum(thread_data, thread_data, prefix_op); + __syncthreads(); + + // Store scanned items to cdf array + if (idx < deg) { + cdf[cdf_row_start + idx] = thread_data; + } + } + __syncthreads(); + + for (int64_t idx = threadIdx.x; idx < num_picks; idx += BLOCK_SIZE) { + // get random value + FloatType sum = cdf[cdf_row_start + deg - 1]; + FloatType rand = static_cast(curand_uniform(&rng) * sum); + // get the offset of the first value within cdf array which is greater + // than random value. + int64_t item = cub::UpperBound( + &cdf[cdf_row_start], deg, rand); + item = min(item, deg - 1); + // get in and out index + const int64_t in_idx = in_row_start + item; + const int64_t out_idx = out_row_start + idx; + // copy permutation over + out_rows[out_idx] = static_cast(row); + out_cols[out_idx] = in_cols[in_idx]; + out_idxs[out_idx] = static_cast(data ? data[in_idx] : in_idx); + } + } + out_row += 1; + } +} + +template +__global__ void _GenerateFlagsKernel( + int64_t n, const IdType* idx, const DType* values, DType criteria, + BoolType* output) { + int tx = blockIdx.x * blockDim.x + threadIdx.x; + const int stride_x = gridDim.x * blockDim.x; + while (tx < n) { + output[tx] = (values[idx ? 
idx[tx] : tx] != criteria); + tx += stride_x; + } +} + +template +COOMatrix COOGeneralRemoveIf(const COOMatrix& coo, MaskGen maskgen) { + using namespace dgl::cuda; + + const auto idtype = coo.row->dtype; + const auto ctx = coo.row->ctx; + const int64_t nnz = coo.row->shape[0]; + const IdType* row = coo.row.Ptr(); + const IdType* col = coo.col.Ptr(); + const IdArray& eid = + COOHasData(coo) ? coo.data : Range(0, nnz, sizeof(IdType) * 8, ctx); + const IdType* data = coo.data.Ptr(); + IdArray new_row = IdArray::Empty({nnz}, idtype, ctx); + IdArray new_col = IdArray::Empty({nnz}, idtype, ctx); + IdArray new_eid = IdArray::Empty({nnz}, idtype, ctx); + IdType* new_row_data = new_row.Ptr(); + IdType* new_col_data = new_col.Ptr(); + IdType* new_eid_data = new_eid.Ptr(); + auto stream = runtime::getCurrentCUDAStream(); + auto device = runtime::DeviceAPI::Get(ctx); + + int8_t* flags = static_cast(device->AllocWorkspace(ctx, nnz)); + int nt = dgl::cuda::FindNumThreads(nnz); + int64_t nb = (nnz + nt - 1) / nt; + + maskgen(nb, nt, stream, nnz, data, flags); + + int64_t* rst = + static_cast(device->AllocWorkspace(ctx, sizeof(int64_t))); + MaskSelect(device, ctx, row, flags, new_row_data, nnz, rst, stream); + MaskSelect(device, ctx, col, flags, new_col_data, nnz, rst, stream); + MaskSelect(device, ctx, data, flags, new_eid_data, nnz, rst, stream); + + int64_t new_len = GetCUDAScalar(device, ctx, rst); + + device->FreeWorkspace(ctx, flags); + device->FreeWorkspace(ctx, rst); + return COOMatrix( + coo.num_rows, coo.num_cols, new_row.CreateView({new_len}, idtype, 0), + new_col.CreateView({new_len}, idtype, 0), + new_eid.CreateView({new_len}, idtype, 0)); +} + +template +COOMatrix _COORemoveIf( + const COOMatrix& coo, const NDArray& values, DType criteria) { + const DType* val = values.Ptr(); + auto maskgen = [val, criteria]( + int nb, int nt, cudaStream_t stream, int64_t nnz, + const IdType* data, int8_t* flags) { + CUDA_KERNEL_CALL( + (_GenerateFlagsKernel), nb, nt, 0, stream, nnz, + data, val, criteria, flags); + }; + return COOGeneralRemoveIf( + coo, maskgen); +} + +} // namespace + +/////////////////////////////// CSR /////////////////////////////// + +/** + * @brief Perform weighted row-wise sampling on a CSR matrix, and generate a COO + * matrix. Use CDF sampling algorithm for with replacement: + * 1) Calculate the CDF of all neighbor's prob. + * 2) For each [0, num_picks), generate a rand ~ U(0, 1). Use binary search to + * find its index in the CDF array as a chosen item. + * Use A-Res sampling algorithm for without replacement: + * 1) For rows with deg > num_picks, calculate A-Res values for all neighbors. + * 2) Sort the A-Res array and select top-num_picks as chosen items. + * + * @tparam XPU The device type used for matrices. + * @tparam IdType The ID type used for matrices. + * @tparam FloatType The Float type used for matrices. + * @param mat The CSR matrix. + * @param rows The set of rows to pick. + * @param num_picks The number of non-zeros to pick per row. + * @param prob The probability array of the input CSR. + * @param replace Is replacement sampling? + * @author pengqirong (OPPO), dlasalle and Xin from Nvidia. 
+ */ +template +COOMatrix _CSRRowWiseSampling( + const CSRMatrix& mat, const IdArray& rows, int64_t num_picks, + const FloatArray& prob, bool replace) { + const auto& ctx = rows->ctx; + auto device = runtime::DeviceAPI::Get(ctx); + cudaStream_t stream = runtime::getCurrentCUDAStream(); + + const int64_t num_rows = rows->shape[0]; + const IdType* const slice_rows = static_cast(rows->data); + + IdArray picked_row = + NewIdArray(num_rows * num_picks, ctx, sizeof(IdType) * 8); + IdArray picked_col = + NewIdArray(num_rows * num_picks, ctx, sizeof(IdType) * 8); + IdArray picked_idx = + NewIdArray(num_rows * num_picks, ctx, sizeof(IdType) * 8); + IdType* const out_rows = static_cast(picked_row->data); + IdType* const out_cols = static_cast(picked_col->data); + IdType* const out_idxs = static_cast(picked_idx->data); + + const IdType* in_ptr = static_cast(GetDevicePointer(mat.indptr)); + const IdType* in_cols = static_cast(GetDevicePointer(mat.indices)); + const IdType* data = CSRHasData(mat) + ? static_cast(GetDevicePointer(mat.data)) + : nullptr; + const FloatType* prob_data = static_cast(GetDevicePointer(prob)); + + // compute degree + // out_deg: the size of each row in the sampled matrix + // temp_deg: the size of each row we will manipulate in sampling + // 1) for w/o replacement: in degree if it's greater than num_picks else 0 + // 2) for w/ replacement: in degree + IdType* out_deg = static_cast( + device->AllocWorkspace(ctx, (num_rows + 1) * sizeof(IdType))); + IdType* temp_deg = static_cast( + device->AllocWorkspace(ctx, (num_rows + 1) * sizeof(IdType))); + if (replace) { + const dim3 block(512); + const dim3 grid((num_rows + block.x - 1) / block.x); + CUDA_KERNEL_CALL( + _CSRRowWiseSampleDegreeReplaceKernel, grid, block, 0, stream, num_picks, + num_rows, slice_rows, in_ptr, out_deg, temp_deg); + } else { + const dim3 block(512); + const dim3 grid((num_rows + block.x - 1) / block.x); + CUDA_KERNEL_CALL( + _CSRRowWiseSampleDegreeKernel, grid, block, 0, stream, num_picks, + num_rows, slice_rows, in_ptr, out_deg, temp_deg); + } + + // fill temp_ptr + IdType* temp_ptr = static_cast( + device->AllocWorkspace(ctx, (num_rows + 1) * sizeof(IdType))); + size_t prefix_temp_size = 0; + CUDA_CALL(cub::DeviceScan::ExclusiveSum( + nullptr, prefix_temp_size, temp_deg, temp_ptr, num_rows + 1, stream)); + void* prefix_temp = device->AllocWorkspace(ctx, prefix_temp_size); + CUDA_CALL(cub::DeviceScan::ExclusiveSum( + prefix_temp, prefix_temp_size, temp_deg, temp_ptr, num_rows + 1, stream)); + device->FreeWorkspace(ctx, prefix_temp); + device->FreeWorkspace(ctx, temp_deg); + + // TODO(Xin): The copy here is too small, and the overhead of creating + // cuda events cannot be ignored. Just use synchronized copy. + IdType temp_len; + // copy using the internal current stream. 
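// Reading a single length value back from the GPU follows the same pattern in
// both samplers: issue the copy on the current stream, then block only on that
// copy (StreamSync here, a recorded event in the uniform sampler).  A
// standalone sketch of the event-based variant with the plain CUDA runtime API
// (hipify rewrites each cuda* call to its hip* equivalent one-for-one);
// illustration only, with error checking reduced to void casts:
//
#include <cuda_runtime.h>
#include <cstdint>

static int64_t ReadDeviceScalar(const int64_t* d_value, cudaStream_t stream) {
  int64_t h_value = 0;  // pageable memory: the copy may block; pin it to overlap
  cudaEvent_t done;
  (void)cudaEventCreate(&done);
  (void)cudaMemcpyAsync(
      &h_value, d_value, sizeof(h_value), cudaMemcpyDeviceToHost, stream);
  (void)cudaEventRecord(done, stream);
  (void)cudaEventSynchronize(done);  // wait for the copy, not the whole device
  (void)cudaEventDestroy(done);
  return h_value;
}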
+ device->CopyDataFromTo( + temp_ptr, num_rows * sizeof(temp_len), &temp_len, 0, sizeof(temp_len), + ctx, DGLContext{kDGLCPU, 0}, mat.indptr->dtype); + device->StreamSync(ctx, stream); + + // fill out_ptr + IdType* out_ptr = static_cast( + device->AllocWorkspace(ctx, (num_rows + 1) * sizeof(IdType))); + prefix_temp_size = 0; + CUDA_CALL(cub::DeviceScan::ExclusiveSum( + nullptr, prefix_temp_size, out_deg, out_ptr, num_rows + 1, stream)); + prefix_temp = device->AllocWorkspace(ctx, prefix_temp_size); + CUDA_CALL(cub::DeviceScan::ExclusiveSum( + prefix_temp, prefix_temp_size, out_deg, out_ptr, num_rows + 1, stream)); + device->FreeWorkspace(ctx, prefix_temp); + device->FreeWorkspace(ctx, out_deg); + + cudaEvent_t copyEvent; + CUDA_CALL(cudaEventCreate(©Event)); + // TODO(dlasalle): use pinned memory to overlap with the actual sampling, and + // wait on a cudaevent + IdType new_len; + // copy using the internal current stream. + device->CopyDataFromTo( + out_ptr, num_rows * sizeof(new_len), &new_len, 0, sizeof(new_len), ctx, + DGLContext{kDGLCPU, 0}, mat.indptr->dtype); + CUDA_CALL(cudaEventRecord(copyEvent, stream)); + + // allocate workspace + // 1) for w/ replacement, it's a global buffer to store cdf segments (one + // segment for each row). + // 2) for w/o replacement, it's used to store a-res segments (one segment for + // each row with degree > num_picks) + FloatType* temp = static_cast( + device->AllocWorkspace(ctx, temp_len * sizeof(FloatType))); + + const uint64_t rand_seed = RandomEngine::ThreadLocal()->RandInt(1000000000); + + // select edges + // the number of rows each thread block will cover + constexpr int TILE_SIZE = 128 / BLOCK_SIZE; + if (replace) { // with replacement. + const dim3 block(BLOCK_SIZE); + const dim3 grid((num_rows + TILE_SIZE - 1) / TILE_SIZE); + CUDA_KERNEL_CALL( + (_CSRRowWiseSampleReplaceKernel), grid, + block, 0, stream, rand_seed, num_picks, num_rows, slice_rows, in_ptr, + in_cols, data, prob_data, out_ptr, temp_ptr, temp, out_rows, out_cols, + out_idxs); + device->FreeWorkspace(ctx, temp); + } else { // without replacement + IdType* temp_idxs = static_cast( + device->AllocWorkspace(ctx, (temp_len) * sizeof(IdType))); + + // Compute A-Res value. A-Res value needs to be calculated only if deg + // is greater than num_picks in weighted rowwise sampling without + // replacement. + const dim3 block(BLOCK_SIZE); + const dim3 grid((num_rows + TILE_SIZE - 1) / TILE_SIZE); + CUDA_KERNEL_CALL( + (_CSRAResValueKernel), grid, block, 0, + stream, rand_seed, num_picks, num_rows, slice_rows, in_ptr, data, + prob_data, temp_ptr, temp_idxs, temp); + + // sort A-Res value array. 
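// The A-Res key for an item with weight w is u^(1/w) with u ~ U(0, 1); keeping
// the num_picks largest keys gives a weighted sample without replacement
// (Efraimidis-Spirakis), which is what the segmented descending sort below
// performs per row.  A per-row host sketch that uses a full sort instead,
// assuming std::mt19937 (illustration only):
//
#include <algorithm>
#include <cmath>
#include <cstdint>
#include <numeric>
#include <random>
#include <vector>

// Return the positions of the num_picks items with the largest A-Res keys.
static std::vector<int64_t> AResSampleHost(
    const std::vector<float>& weights, int64_t num_picks, uint64_t seed) {
  std::mt19937 rng(seed);
  std::uniform_real_distribution<float> uni(0.0f, 1.0f);
  std::vector<float> key(weights.size());
  std::vector<int64_t> order(weights.size());
  std::iota(order.begin(), order.end(), 0);
  for (size_t i = 0; i < weights.size(); ++i)
    key[i] = std::pow(uni(rng), 1.0f / weights[i]);  // A-Res key u^(1/w)
  std::sort(order.begin(), order.end(),
            [&](int64_t a, int64_t b) { return key[a] > key[b]; });
  if (static_cast<int64_t>(order.size()) > num_picks) order.resize(num_picks);
  return order;  // selected positions, highest key first
}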
+ FloatType* sort_temp = static_cast( + device->AllocWorkspace(ctx, temp_len * sizeof(FloatType))); + IdType* sort_temp_idxs = static_cast( + device->AllocWorkspace(ctx, temp_len * sizeof(IdType))); + + cub::DoubleBuffer sort_keys(temp, sort_temp); + cub::DoubleBuffer sort_values(temp_idxs, sort_temp_idxs); + + void* d_temp_storage = nullptr; + size_t temp_storage_bytes = 0; + CUDA_CALL(cub::DeviceSegmentedSort::SortPairsDescending( + d_temp_storage, temp_storage_bytes, sort_keys, sort_values, temp_len, + num_rows, temp_ptr, temp_ptr + 1, stream)); + d_temp_storage = device->AllocWorkspace(ctx, temp_storage_bytes); + CUDA_CALL(cub::DeviceSegmentedSort::SortPairsDescending( + d_temp_storage, temp_storage_bytes, sort_keys, sort_values, temp_len, + num_rows, temp_ptr, temp_ptr + 1, stream)); + device->FreeWorkspace(ctx, d_temp_storage); + device->FreeWorkspace(ctx, temp); + device->FreeWorkspace(ctx, temp_idxs); + device->FreeWorkspace(ctx, sort_temp); + device->FreeWorkspace(ctx, sort_temp_idxs); + + // select tok-num_picks as results + CUDA_KERNEL_CALL( + (_CSRRowWiseSampleKernel), grid, block, 0, + stream, num_picks, num_rows, slice_rows, in_ptr, in_cols, data, out_ptr, + temp_ptr, sort_values.Current(), out_rows, out_cols, out_idxs); + } + + device->FreeWorkspace(ctx, temp_ptr); + device->FreeWorkspace(ctx, out_ptr); + + // wait for copying `new_len` to finish + CUDA_CALL(cudaEventSynchronize(copyEvent)); + CUDA_CALL(cudaEventDestroy(copyEvent)); + + picked_row = picked_row.CreateView({new_len}, picked_row->dtype); + picked_col = picked_col.CreateView({new_len}, picked_col->dtype); + picked_idx = picked_idx.CreateView({new_len}, picked_idx->dtype); + + return COOMatrix( + mat.num_rows, mat.num_cols, picked_row, picked_col, picked_idx); +} + +template +COOMatrix CSRRowWiseSampling( + CSRMatrix mat, IdArray rows, int64_t num_picks, FloatArray prob, + bool replace) { + COOMatrix result; + if (num_picks == -1) { + // Basically this is UnitGraph::InEdges(). + COOMatrix coo = CSRToCOO(CSRSliceRows(mat, rows), false); + IdArray sliced_rows = IndexSelect(rows, coo.row); + result = + COOMatrix(mat.num_rows, mat.num_cols, sliced_rows, coo.col, coo.data); + } else { + result = _CSRRowWiseSampling( + mat, rows, num_picks, prob, replace); + } + // NOTE(BarclayII): I'm removing the entries with zero probability after + // sampling. Is there a better way? 
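// _COORemoveIf below keeps an edge only if its probability differs from the
// criteria (0 here): a kernel writes one flag per edge and MaskSelect then
// compacts row, col and eid with the same mask.  A host sketch of that filter
// over COO triples, with a hypothetical Coo struct (illustration only):
//
#include <cstdint>
#include <vector>

struct Coo { std::vector<int64_t> row, col, eid; };

static Coo RemoveZeroProbHost(const Coo& in, const std::vector<float>& prob) {
  Coo out;
  for (size_t i = 0; i < in.row.size(); ++i) {
    if (prob[in.eid[i]] != 0.0f) {  // flag = (values[data[i]] != criteria)
      out.row.push_back(in.row[i]);
      out.col.push_back(in.col[i]);
      out.eid.push_back(in.eid[i]);
    }
  }
  return out;
}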
+ return _COORemoveIf(result, prob, static_cast(0)); +} + +template COOMatrix CSRRowWiseSampling( + CSRMatrix, IdArray, int64_t, FloatArray, bool); +template COOMatrix CSRRowWiseSampling( + CSRMatrix, IdArray, int64_t, FloatArray, bool); +template COOMatrix CSRRowWiseSampling( + CSRMatrix, IdArray, int64_t, FloatArray, bool); +template COOMatrix CSRRowWiseSampling( + CSRMatrix, IdArray, int64_t, FloatArray, bool); +// These are not being called, but we instantiate them anyway to prevent missing +// symbols in Debug build +template COOMatrix CSRRowWiseSampling( + CSRMatrix, IdArray, int64_t, FloatArray, bool); +template COOMatrix CSRRowWiseSampling( + CSRMatrix, IdArray, int64_t, FloatArray, bool); +template COOMatrix CSRRowWiseSampling( + CSRMatrix, IdArray, int64_t, FloatArray, bool); +template COOMatrix CSRRowWiseSampling( + CSRMatrix, IdArray, int64_t, FloatArray, bool); + +} // namespace impl +} // namespace aten +} // namespace dgl diff --git a/src/array/cuda/sddmm.cu b/src/array/cuda/sddmm.cu index 957d1a02a377..3f00596f6af2 100644 --- a/src/array/cuda/sddmm.cu +++ b/src/array/cuda/sddmm.cu @@ -48,10 +48,10 @@ template void SDDMMCsr( const std::string& op, const BcastOff& bcast, const CSRMatrix& csr, NDArray lhs, NDArray rhs, NDArray out, int lhs_target, int rhs_target); #if BF16_ENABLED -template void SDDMMCsr( +template void SDDMMCsr( const std::string& op, const BcastOff& bcast, const CSRMatrix& csr, NDArray lhs, NDArray rhs, NDArray out, int lhs_target, int rhs_target); -template void SDDMMCsr( +template void SDDMMCsr( const std::string& op, const BcastOff& bcast, const CSRMatrix& csr, NDArray lhs, NDArray rhs, NDArray out, int lhs_target, int rhs_target); #endif // BF16_ENABLED @@ -75,10 +75,10 @@ template void SDDMMCoo( const std::string& op, const BcastOff& bcast, const COOMatrix& coo, NDArray lhs, NDArray rhs, NDArray out, int lhs_target, int rhs_target); #if BF16_ENABLED -template void SDDMMCoo( +template void SDDMMCoo( const std::string& op, const BcastOff& bcast, const COOMatrix& coo, NDArray lhs, NDArray rhs, NDArray out, int lhs_target, int rhs_target); -template void SDDMMCoo( +template void SDDMMCoo( const std::string& op, const BcastOff& bcast, const COOMatrix& coo, NDArray lhs, NDArray rhs, NDArray out, int lhs_target, int rhs_target); #endif // BF16_ENABLED diff --git a/src/array/cuda/sddmm.cu.prehip b/src/array/cuda/sddmm.cu.prehip new file mode 100644 index 000000000000..957d1a02a377 --- /dev/null +++ b/src/array/cuda/sddmm.cu.prehip @@ -0,0 +1,99 @@ +/** + * Copyright (c) 2020 by Contributors + * @file array/cuda/sddmm.cu + * @brief SDDMM C APIs and definitions. + */ +#include + +#include "./functor.cuh" +#include "./sddmm.cuh" + +namespace dgl { +namespace aten { + +/** + * @brief CUDA implementation of g-SDDMM on Csr format. + */ +template +void SDDMMCsr( + const std::string& op, const BcastOff& bcast, const CSRMatrix& csr, + NDArray lhs, NDArray rhs, NDArray out, int lhs_target, int rhs_target) { + SWITCH_OP(op, Op, { + SWITCH_TARGET(lhs_target, rhs_target, LhsTarget, RhsTarget, { + cuda::SDDMMCsr( + bcast, csr, lhs, rhs, out); + }); + }); +} + +/** + * @brief CUDA implementation of g-SDDMM on Coo format. 
+ */ +template +void SDDMMCoo( + const std::string& op, const BcastOff& bcast, const COOMatrix& coo, + NDArray lhs, NDArray rhs, NDArray out, int lhs_target, int rhs_target) { + SWITCH_OP(op, Op, { + SWITCH_TARGET(lhs_target, rhs_target, LhsTarget, RhsTarget, { + cuda::SDDMMCoo( + bcast, coo, lhs, rhs, out); + }); + }); +} + +template void SDDMMCsr( + const std::string& op, const BcastOff& bcast, const CSRMatrix& csr, + NDArray lhs, NDArray rhs, NDArray out, int lhs_target, int rhs_target); +template void SDDMMCsr( + const std::string& op, const BcastOff& bcast, const CSRMatrix& csr, + NDArray lhs, NDArray rhs, NDArray out, int lhs_target, int rhs_target); +#if BF16_ENABLED +template void SDDMMCsr( + const std::string& op, const BcastOff& bcast, const CSRMatrix& csr, + NDArray lhs, NDArray rhs, NDArray out, int lhs_target, int rhs_target); +template void SDDMMCsr( + const std::string& op, const BcastOff& bcast, const CSRMatrix& csr, + NDArray lhs, NDArray rhs, NDArray out, int lhs_target, int rhs_target); +#endif // BF16_ENABLED +template void SDDMMCsr( + const std::string& op, const BcastOff& bcast, const CSRMatrix& csr, + NDArray lhs, NDArray rhs, NDArray out, int lhs_target, int rhs_target); +template void SDDMMCsr( + const std::string& op, const BcastOff& bcast, const CSRMatrix& csr, + NDArray lhs, NDArray rhs, NDArray out, int lhs_target, int rhs_target); +template void SDDMMCsr( + const std::string& op, const BcastOff& bcast, const CSRMatrix& csr, + NDArray lhs, NDArray rhs, NDArray out, int lhs_target, int rhs_target); +template void SDDMMCsr( + const std::string& op, const BcastOff& bcast, const CSRMatrix& csr, + NDArray lhs, NDArray rhs, NDArray out, int lhs_target, int rhs_target); + +template void SDDMMCoo( + const std::string& op, const BcastOff& bcast, const COOMatrix& coo, + NDArray lhs, NDArray rhs, NDArray out, int lhs_target, int rhs_target); +template void SDDMMCoo( + const std::string& op, const BcastOff& bcast, const COOMatrix& coo, + NDArray lhs, NDArray rhs, NDArray out, int lhs_target, int rhs_target); +#if BF16_ENABLED +template void SDDMMCoo( + const std::string& op, const BcastOff& bcast, const COOMatrix& coo, + NDArray lhs, NDArray rhs, NDArray out, int lhs_target, int rhs_target); +template void SDDMMCoo( + const std::string& op, const BcastOff& bcast, const COOMatrix& coo, + NDArray lhs, NDArray rhs, NDArray out, int lhs_target, int rhs_target); +#endif // BF16_ENABLED +template void SDDMMCoo( + const std::string& op, const BcastOff& bcast, const COOMatrix& coo, + NDArray lhs, NDArray rhs, NDArray out, int lhs_target, int rhs_target); +template void SDDMMCoo( + const std::string& op, const BcastOff& bcast, const COOMatrix& coo, + NDArray lhs, NDArray rhs, NDArray out, int lhs_target, int rhs_target); +template void SDDMMCoo( + const std::string& op, const BcastOff& bcast, const COOMatrix& coo, + NDArray lhs, NDArray rhs, NDArray out, int lhs_target, int rhs_target); +template void SDDMMCoo( + const std::string& op, const BcastOff& bcast, const COOMatrix& coo, + NDArray lhs, NDArray rhs, NDArray out, int lhs_target, int rhs_target); + +} // namespace aten +} // namespace dgl diff --git a/src/array/cuda/sddmm.cuh b/src/array/cuda/sddmm.cuh index bc1cadfa01c3..33d4b5999892 100644 --- a/src/array/cuda/sddmm.cuh +++ b/src/array/cuda/sddmm.cuh @@ -1,3 +1,4 @@ +#include "hip/hip_runtime.h" /** * Copyright (c) 2020 by Contributors * @file array/cuda/sddmm.cuh @@ -275,7 +276,7 @@ void SDDMMCoo( const DType* lhs_data = lhs.Ptr(); const DType* rhs_data = rhs.Ptr(); DType* 
out_data = out.Ptr(); - cudaStream_t stream = runtime::getCurrentCUDAStream(); + hipStream_t stream = runtime::getCurrentCUDAStream(); int64_t *lhs_off = nullptr, *rhs_off = nullptr; int64_t len = bcast.out_len, lhs_len = bcast.lhs_len, rhs_len = bcast.rhs_len; @@ -337,7 +338,7 @@ void SDDMMCsr( const DType* lhs_data = lhs.Ptr(); const DType* rhs_data = rhs.Ptr(); DType* out_data = out.Ptr(); - cudaStream_t stream = runtime::getCurrentCUDAStream(); + hipStream_t stream = runtime::getCurrentCUDAStream(); int64_t N = csr.num_rows, M = csr.num_cols, E = csr.indices->shape[0]; int64_t *lhs_off = nullptr, *rhs_off = nullptr; diff --git a/src/array/cuda/sddmm.cuh.prehip b/src/array/cuda/sddmm.cuh.prehip new file mode 100644 index 000000000000..bc1cadfa01c3 --- /dev/null +++ b/src/array/cuda/sddmm.cuh.prehip @@ -0,0 +1,368 @@ +/** + * Copyright (c) 2020 by Contributors + * @file array/cuda/sddmm.cuh + * @brief SDDMM CUDA kernel function header. + */ +#ifndef DGL_ARRAY_CUDA_SDDMM_CUH_ +#define DGL_ARRAY_CUDA_SDDMM_CUH_ + +#include + +#include "../../runtime/cuda/cuda_common.h" +#include "../selector.h" +#include "./functor.cuh" +#include "./utils.h" +#include "atomic.cuh" +#include "bf16.cuh" +#include "fp16.cuh" +#include "functor.cuh" +#include "macro.cuh" + +namespace dgl { + +using namespace cuda; + +namespace aten { +namespace cuda { + +#define SWITCH_OP(op, Op, ...) \ + do { \ + if ((op) == "add") { \ + typedef cuda::binary::Add Op; \ + { __VA_ARGS__ } \ + } else if ((op) == "sub") { \ + typedef cuda::binary::Sub Op; \ + { __VA_ARGS__ } \ + } else if ((op) == "mul") { \ + typedef cuda::binary::Mul Op; \ + { __VA_ARGS__ } \ + } else if ((op) == "div") { \ + typedef cuda::binary::Div Op; \ + { __VA_ARGS__ } \ + } else if ((op) == "copy_lhs") { \ + typedef cuda::binary::CopyLhs Op; \ + { __VA_ARGS__ } \ + } else if ((op) == "copy_rhs") { \ + typedef cuda::binary::CopyRhs Op; \ + { __VA_ARGS__ } \ + } else if ((op) == "dot") { \ + typedef cuda::binary::Dot Op; \ + { __VA_ARGS__ } \ + } else { \ + LOG(FATAL) << "Unsupported SpMM/SDDMM binary operator: " << op; \ + } \ + } while (0) + +#define SWITCH_RHS(rhs_target, RhsTarget, ...) \ + do { \ + if ((rhs_target) == 0) { \ + constexpr int RhsTarget = 0; \ + { __VA_ARGS__ } \ + } else if ((rhs_target) == 1) { \ + constexpr int RhsTarget = 1; \ + { __VA_ARGS__ } \ + } else if ((rhs_target) == 2) { \ + constexpr int RhsTarget = 2; \ + { __VA_ARGS__ } \ + } else { \ + LOG(INFO) << "Invalid rhs target: " << (rhs_target); \ + } \ + } while (0) + +#define SWITCH_TARGET(lhs_target, rhs_target, LhsTarget, RhsTarget, ...) \ + do { \ + if ((lhs_target) == 0) { \ + constexpr int LhsTarget = 0; \ + SWITCH_RHS(rhs_target, RhsTarget, __VA_ARGS__); \ + } else if ((lhs_target) == 1) { \ + constexpr int LhsTarget = 1; \ + SWITCH_RHS(rhs_target, RhsTarget, __VA_ARGS__); \ + } else if ((lhs_target) == 2) { \ + constexpr int LhsTarget = 2; \ + SWITCH_RHS(rhs_target, RhsTarget, __VA_ARGS__); \ + } else { \ + LOG(INFO) << "Invalid lhs target: " << (lhs_target); \ + } \ + } while (0) + +constexpr unsigned int full_mask = 0xffffffff; + +/** + * @brief CUDA kernel of g-SDDMM on Coo format. + * @note it uses edge parallel strategy, different threadblocks (on y-axis) + * is responsible for the computation on different edges. Threadblocks + * on the x-axis are responsible for the computation on different + * positions in feature dimension. 
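[Editor's aside, not part of the patch: SWITCH_OP / SWITCH_TARGET above turn a runtime string or integer into a compile-time type or constant by expanding the body once per branch. A trimmed-down, self-contained version of the same idiom (names are illustrative):]

#include <stdexcept>
#include <string>

struct Add { template <typename T> static T Call(T a, T b) { return a + b; } };
struct Mul { template <typename T> static T Call(T a, T b) { return a * b; } };

// Reduced SWITCH_OP: the op string selects a functor type, and the body is
// compiled once per branch with that type bound to `Op`.
#define MY_SWITCH_OP(op, Op, ...)                                       \
  do {                                                                  \
    if ((op) == "add") { typedef Add Op; { __VA_ARGS__ } }              \
    else if ((op) == "mul") { typedef Mul Op; { __VA_ARGS__ } }         \
    else { throw std::invalid_argument("unknown op: " + (op)); }        \
  } while (0)

float Combine(const std::string& op, float a, float b) {
  float out = 0.f;
  MY_SWITCH_OP(op, Op, { out = Op::Call(a, b); });
  return out;  // Combine("add", 2, 3) == 5, Combine("mul", 2, 3) == 6
}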
+ */ +template < + typename Idx, typename DType, typename BinaryOp, bool UseBcast = false, + bool UseIdx = false, int LhsTarget = 0, int RhsTarget = 2> +__global__ void SDDMMCooKernel( + const DType* __restrict__ lhs, const DType* __restrict__ rhs, + DType* __restrict__ out, const Idx* __restrict__ row, + const Idx* __restrict__ col, const Idx* __restrict__ edge_map, int64_t N, + int64_t M, int64_t E, int64_t reduce_size, + const int64_t* __restrict__ lhs_off, const int64_t* __restrict__ rhs_off, + int64_t lhs_len, int64_t rhs_len, int64_t out_len) { + // SDDMM with COO. + Idx ty = blockIdx.y * blockDim.y + threadIdx.y; + const Idx stride_y = blockDim.y * gridDim.y; + while (ty < E) { + const Idx src = _ldg(row + ty); + const Idx dst = _ldg(col + ty); + const Idx eid = UseIdx ? _ldg(edge_map + ty) : ty; + const DType* lhsoff = + BinaryOp::use_lhs + ? (lhs + Selector::Call(src, eid, dst) * lhs_len) + : nullptr; + const DType* rhsoff = + BinaryOp::use_rhs + ? (rhs + Selector::Call(src, eid, dst) * rhs_len) + : nullptr; + DType* outoff = out + eid * out_len; + int tx = blockIdx.x * blockDim.x + threadIdx.x; + const int stride_x = blockDim.x * gridDim.x; + while (tx < out_len) { + const Idx lhs_add = UseBcast ? lhs_off[tx] : tx; + const Idx rhs_add = UseBcast ? rhs_off[tx] : tx; + DType val = BinaryOp::Call( + lhsoff + lhs_add * reduce_size, rhsoff + rhs_add * reduce_size, + reduce_size); + outoff[tx] = val; + tx += stride_x; + } + ty += stride_y; + } +} + +/** + * @brief CUDA kernel of SDDMM-dot on Coo format, accelerated with tree + * reduction. + * @note it uses edge parallel strategy, different threadblocks (on y-axis) + * is responsible for the computation on different edges. Threadblocks + * on the x-axis are responsible for the computation on different + * positions in feature dimension. + */ +template < + typename Idx, typename DType, bool UseBcast = false, bool UseIdx = false, + int LhsTarget = 0, int RhsTarget = 2> +__global__ void SDDMMCooTreeReduceKernel( + const DType* __restrict__ lhs, const DType* __restrict__ rhs, + DType* __restrict__ out, const Idx* __restrict__ row, + const Idx* __restrict__ col, const Idx* __restrict__ edge_map, int64_t N, + int64_t M, int64_t E, int64_t reduce_size, + const int64_t* __restrict__ lhs_off, const int64_t* __restrict__ rhs_off, + int64_t lhs_len, int64_t rhs_len, int64_t out_len) { + Idx ty = blockIdx.x * blockDim.y + threadIdx.y; + if (ty < E) { + const Idx src = _ldg(row + ty); + const Idx dst = _ldg(col + ty); + const Idx eid = UseIdx ? _ldg(edge_map + ty) : ty; + const DType* lhsoff = + lhs + Selector::Call(src, eid, dst) * lhs_len; + const DType* rhsoff = + rhs + Selector::Call(src, eid, dst) * rhs_len; + DType* outoff = out + eid * out_len; + int tx = threadIdx.x; // tx < 32 + for (int i = blockIdx.y; i < out_len; + i += gridDim.y) { // over output feature dimension + const Idx lhs_add = UseBcast ? __ldg(lhs_off + i) : i; + const Idx rhs_add = UseBcast ? __ldg(rhs_off + i) : i; + DType val = reduce::Sum::zero(); + for (int j = tx; j < reduce_size; j += 64) { + val += lhsoff[lhs_add * reduce_size + j] * + rhsoff[rhs_add * reduce_size + j]; + if (j + 32 < reduce_size) + val += lhsoff[lhs_add * reduce_size + j + 32] * + rhsoff[rhs_add * reduce_size + j + 32]; + } +#pragma unroll + for (int offset = 16; offset > 0; offset /= 2) + val += __shfl_down_sync(full_mask, val, offset); + if (tx == 0) outoff[i] = val; + } + } +} + +// Binary search the row_offsets to find the source node of the edge id. 
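[Editor's aside, not part of the patch: the BinarySearchSrc helper defined next maps an edge id back to its source row by searching the CSR indptr. As a host-side reference of the same lookup (illustrative names only), the owning row is the last position whose offset is <= eid:]

#include <algorithm>
#include <cassert>
#include <cstdint>
#include <vector>

// Host reference: row r owns edge eid iff indptr[r] <= eid < indptr[r + 1],
// i.e. std::upper_bound minus one.
int64_t RowOfEdge(const std::vector<int64_t>& indptr, int64_t eid) {
  auto it = std::upper_bound(indptr.begin(), indptr.end(), eid);
  return static_cast<int64_t>(it - indptr.begin()) - 1;
}

void RowOfEdgeExample() {
  // CSR with rows of 2, 0 and 3 edges.
  std::vector<int64_t> indptr = {0, 2, 2, 5};
  assert(RowOfEdge(indptr, 0) == 0);
  assert(RowOfEdge(indptr, 1) == 0);
  assert(RowOfEdge(indptr, 2) == 2);  // row 1 is empty, so edge 2 is in row 2
  assert(RowOfEdge(indptr, 4) == 2);
}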
+template +__device__ __forceinline__ Idx +BinarySearchSrc(const Idx* array, Idx length, Idx eid) { + Idx lo = 0, hi = length - 1; + while (lo < hi) { + Idx mid = (lo + hi) >> 1; + if (_ldg(array + mid) <= eid) { + lo = mid + 1; + } else { + hi = mid; + } + } + // INVARIANT: lo == hi + if (_ldg(array + hi) == eid) { + return hi; + } else { + return hi - 1; + } +} + +/** + * @brief CUDA kernel of g-SDDMM on Csr format. + * @note it uses edge parallel strategy, different threadblocks (on y-axis) + * is responsible for the computation on different edges. Threadblocks + * on the x-axis are responsible for the computation on different + * positions in feature dimension. To efficiently find the source node idx and + * destination node index of an given edge on Csr format, it uses binary search + * (time complexity O(log N)). + */ +template < + typename Idx, typename DType, typename BinaryOp, bool UseBcast = false, + bool UseIdx = false, int LhsTarget = 0, int RhsTarget = 2> +__global__ void SDDMMCsrKernel( + const DType* __restrict__ lhs, const DType* __restrict__ rhs, + DType* __restrict__ out, const Idx* __restrict__ indptr, + const Idx* __restrict__ indices, const Idx* __restrict__ edge_map, + int64_t N, int64_t M, int64_t E, int64_t reduce_size, + const int64_t* __restrict__ lhs_off, const int64_t* __restrict__ rhs_off, + int64_t lhs_len, int64_t rhs_len, int64_t out_len) { + // SDDMM with Csr. + Idx ty = blockIdx.y * blockDim.y + threadIdx.y; + const Idx stride_y = blockDim.y * gridDim.y; + while (ty < E) { + const Idx src = BinarySearchSrc(indptr, N + 1, ty); + const Idx dst = _ldg(indices + ty); + const Idx eid = UseIdx ? _ldg(edge_map + ty) : ty; + int64_t tx = blockIdx.x * blockDim.x + threadIdx.x; + const int64_t stride_x = blockDim.x * gridDim.x; + const DType* lhsoff = + BinaryOp::use_lhs + ? (lhs + Selector::Call(src, eid, dst) * lhs_len) + : nullptr; + const DType* rhsoff = + BinaryOp::use_rhs + ? (rhs + Selector::Call(src, eid, dst) * rhs_len) + : nullptr; + DType* outoff = out + eid * out_len; + while (tx < out_len) { + const Idx lhs_add = UseBcast ? lhs_off[tx] : tx; + const Idx rhs_add = UseBcast ? rhs_off[tx] : tx; + DType val = BinaryOp::Call( + lhsoff + lhs_add * reduce_size, rhsoff + rhs_add * reduce_size, + reduce_size); + outoff[tx] = val; + tx += stride_x; + } + ty += stride_y; + } +} + +/** + * @brief CUDA implementation of g-SDDMM on Coo format. + * @param bcast Broadcast information. + * @param coo The Coo matrix. + * @param lhs The left hand side operand feature. + * @param rhs The right hand size operand feature. + * @param out The result feature on edges. 
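[Editor's aside, not part of the patch: as a plain host-side reference of what the SDDMM kernels above compute for the common "dot" operator on a COO graph (lhs taken from the source node, rhs from the destination node), with names and layout chosen for illustration:]

#include <cstdint>
#include <vector>

// out[e] = dot(lhs[row[e], :], rhs[col[e], :]); features are row-major
// with k entries per node, matching the reduce_size loop in the kernels.
std::vector<float> SddmmDotReference(
    const std::vector<int64_t>& row, const std::vector<int64_t>& col,
    const std::vector<float>& lhs, const std::vector<float>& rhs, int64_t k) {
  const int64_t num_edges = static_cast<int64_t>(row.size());
  std::vector<float> out(num_edges, 0.f);
  for (int64_t e = 0; e < num_edges; ++e) {
    const float* u = &lhs[row[e] * k];  // source-node feature
    const float* v = &rhs[col[e] * k];  // destination-node feature
    for (int64_t j = 0; j < k; ++j) out[e] += u[j] * v[j];
  }
  return out;
}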
+ */ +template < + typename Idx, typename DType, typename Op, int LhsTarget = 0, + int RhsTarget = 2> +void SDDMMCoo( + const BcastOff& bcast, const COOMatrix& coo, NDArray lhs, NDArray rhs, + NDArray out) { + const Idx* row = coo.row.Ptr(); + const Idx* col = coo.col.Ptr(); + const Idx* edge_map = coo.data.Ptr(); + const DType* lhs_data = lhs.Ptr(); + const DType* rhs_data = rhs.Ptr(); + DType* out_data = out.Ptr(); + cudaStream_t stream = runtime::getCurrentCUDAStream(); + + int64_t *lhs_off = nullptr, *rhs_off = nullptr; + int64_t len = bcast.out_len, lhs_len = bcast.lhs_len, rhs_len = bcast.rhs_len; + int64_t reduce_dim = bcast.reduce_size; + + const int64_t nnz = coo.row->shape[0]; + const bool use_idx = !IsNullArray(coo.data); + + if (std::is_same >::value && reduce_dim >= 32) { + const int ntx = 32; // on feature dimension + const int nty = 8; // on out dimension + const int nbx = (nnz + nty - 1) / nty; + const int nby = FindNumBlocks<'y'>(len); + const dim3 nblks(nbx, nby); + const dim3 nthrs(ntx, nty); + BCAST_IDX_CTX_SWITCH(bcast, use_idx, out->ctx, lhs_off, rhs_off, { + CUDA_KERNEL_CALL( + (SDDMMCooTreeReduceKernel< + Idx, DType, UseBcast, UseIdx, LhsTarget, RhsTarget>), + nblks, nthrs, 0, stream, lhs_data, rhs_data, out_data, row, col, + edge_map, coo.num_rows, coo.num_cols, nnz, reduce_dim, lhs_off, + rhs_off, lhs_len, rhs_len, len); + }); + } else { + const int ntx = FindNumThreads(len); + const int nty = CUDA_MAX_NUM_THREADS / ntx; + const int nbx = (len + ntx - 1) / ntx; + const int nby = FindNumBlocks<'y'>((nnz + nty - 1) / nty); + const dim3 nblks(nbx, nby); + const dim3 nthrs(ntx, nty); + BCAST_IDX_CTX_SWITCH(bcast, use_idx, out->ctx, lhs_off, rhs_off, { + CUDA_KERNEL_CALL( + (SDDMMCooKernel< + Idx, DType, Op, UseBcast, UseIdx, LhsTarget, RhsTarget>), + nblks, nthrs, 0, stream, lhs_data, rhs_data, out_data, row, col, + edge_map, coo.num_rows, coo.num_cols, nnz, reduce_dim, lhs_off, + rhs_off, lhs_len, rhs_len, len); + }); + } +} + +/** + * @brief CUDA implementation of g-SDDMM on Csr format. + * @param bcast Broadcast information. + * @param csr The Csr matrix. + * @param lhs The left hand side operand feature. + * @param rhs The right hand size operand feature. + * @param out The result feature on edges. 
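[Editor's aside, not part of the patch: the tree-reduce path selected above (dot operator with reduce_dim >= 32) relies on warp shuffles to fold 32 partial sums into lane 0. A minimal standalone kernel showing the same reduction, launched with a single warp, e.g. <<<1, 32>>>; names are illustrative:]

#include <cuda_runtime.h>

__global__ void WarpDotKernel(const float* a, const float* b, int k,
                              float* out) {
  const unsigned full_mask = 0xffffffffu;
  // Each of the 32 lanes accumulates a strided partial dot product.
  float val = 0.f;
  for (int j = threadIdx.x; j < k; j += 32) val += a[j] * b[j];
  // Tree reduction across the warp: after log2(32) steps lane 0 has the sum.
#pragma unroll
  for (int offset = 16; offset > 0; offset /= 2)
    val += __shfl_down_sync(full_mask, val, offset);
  if (threadIdx.x == 0) *out = val;
}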
+ */ +template < + typename Idx, typename DType, typename Op, int LhsTarget = 0, + int RhsTarget = 2> +void SDDMMCsr( + const BcastOff& bcast, const CSRMatrix& csr, NDArray lhs, NDArray rhs, + NDArray out) { + const Idx* indptr = csr.indptr.Ptr(); + const Idx* indices = csr.indices.Ptr(); + const Idx* edge_map = csr.data.Ptr(); + const DType* lhs_data = lhs.Ptr(); + const DType* rhs_data = rhs.Ptr(); + DType* out_data = out.Ptr(); + cudaStream_t stream = runtime::getCurrentCUDAStream(); + int64_t N = csr.num_rows, M = csr.num_cols, E = csr.indices->shape[0]; + + int64_t *lhs_off = nullptr, *rhs_off = nullptr; + int64_t len = bcast.out_len, lhs_len = bcast.lhs_len, rhs_len = bcast.rhs_len; + int64_t reduce_dim = bcast.reduce_size; + + const int ntx = FindNumThreads(len); + const int nty = CUDA_MAX_NUM_THREADS / ntx; + const int nbx = (len + ntx - 1) / ntx; + const int nby = FindNumBlocks<'y'>((E + nty - 1) / nty); + const dim3 nblks(nbx, nby); + const dim3 nthrs(ntx, nty); + const bool use_idx = !IsNullArray(csr.data); + + BCAST_IDX_CTX_SWITCH(bcast, use_idx, out->ctx, lhs_off, rhs_off, { + CUDA_KERNEL_CALL( + (SDDMMCsrKernel< + Idx, DType, Op, UseBcast, UseIdx, LhsTarget, RhsTarget>), + nblks, nthrs, 0, stream, lhs_data, rhs_data, out_data, indptr, indices, + edge_map, N, M, E, reduce_dim, lhs_off, rhs_off, lhs_len, rhs_len, len); + }); +} + +} // namespace cuda +} // namespace aten +} // namespace dgl + +#endif // DGL_ARRAY_CUDA_SDDMM_CUH_ diff --git a/src/array/cuda/sddmm_hetero_coo.cu b/src/array/cuda/sddmm_hetero_coo.cu index 180e189e7152..278c9d60cafe 100644 --- a/src/array/cuda/sddmm_hetero_coo.cu +++ b/src/array/cuda/sddmm_hetero_coo.cu @@ -49,13 +49,13 @@ template void SDDMMCooHetero( int rhs_target, const std::vector& in_eid, const std::vector& out_eid); #if BF16_ENABLED -template void SDDMMCooHetero( +template void SDDMMCooHetero( const std::string& op, const BcastOff& bcast, const std::vector& vec_coo, const std::vector& lhs, const std::vector& rhs, std::vector out, int lhs_target, int rhs_target, const std::vector& in_eid, const std::vector& out_eid); -template void SDDMMCooHetero( +template void SDDMMCooHetero( const std::string& op, const BcastOff& bcast, const std::vector& vec_coo, const std::vector& lhs, const std::vector& rhs, std::vector out, int lhs_target, diff --git a/src/array/cuda/sddmm_hetero_coo.cu.prehip b/src/array/cuda/sddmm_hetero_coo.cu.prehip new file mode 100644 index 000000000000..180e189e7152 --- /dev/null +++ b/src/array/cuda/sddmm_hetero_coo.cu.prehip @@ -0,0 +1,91 @@ +/** + * Copyright (c) 2020 by Contributors + * @file array/cuda/sddmm.cu + * @brief SDDMM C APIs and definitions. + */ +#include + +#include "./sddmm.cuh" + +namespace dgl { +namespace aten { + +/** + * @brief CUDA implementation of g-SDDMM on heterograph using + Csr format. 
+ */ +template +void SDDMMCooHetero( + const std::string& op, const BcastOff& bcast, + const std::vector& vec_coo, const std::vector& vec_lhs, + const std::vector& vec_rhs, std::vector vec_out, + int lhs_target, int rhs_target, const std::vector& lhs_eid, + const std::vector& rhs_eid) { + SWITCH_OP(op, Op, { + SWITCH_TARGET(lhs_target, rhs_target, LhsTarget, RhsTarget, { + /* Call SDDMM CUDA kernel for each relation type sequentially */ + for (dgl_type_t etype = 0; etype < lhs_eid.size(); ++etype) { + COOMatrix coo = vec_coo[etype]; + NDArray lhs = vec_lhs[lhs_eid[etype]]; + NDArray rhs = vec_rhs[rhs_eid[etype]]; + NDArray out = vec_out[etype]; + cuda::SDDMMCoo( + bcast, coo, lhs, rhs, out); + } + }); + }); +} + +template void SDDMMCooHetero( + const std::string& op, const BcastOff& bcast, + const std::vector& vec_coo, const std::vector& lhs, + const std::vector& rhs, std::vector out, int lhs_target, + int rhs_target, const std::vector& in_eid, + const std::vector& out_eid); +template void SDDMMCooHetero( + const std::string& op, const BcastOff& bcast, + const std::vector& vec_coo, const std::vector& lhs, + const std::vector& rhs, std::vector out, int lhs_target, + int rhs_target, const std::vector& in_eid, + const std::vector& out_eid); +#if BF16_ENABLED +template void SDDMMCooHetero( + const std::string& op, const BcastOff& bcast, + const std::vector& vec_coo, const std::vector& lhs, + const std::vector& rhs, std::vector out, int lhs_target, + int rhs_target, const std::vector& in_eid, + const std::vector& out_eid); +template void SDDMMCooHetero( + const std::string& op, const BcastOff& bcast, + const std::vector& vec_coo, const std::vector& lhs, + const std::vector& rhs, std::vector out, int lhs_target, + int rhs_target, const std::vector& in_eid, + const std::vector& out_eid); +#endif // BF16_ENABLED +template void SDDMMCooHetero( + const std::string& op, const BcastOff& bcast, + const std::vector& vec_coo, const std::vector& lhs, + const std::vector& rhs, std::vector out, int lhs_target, + int rhs_target, const std::vector& in_eid, + const std::vector& out_eid); +template void SDDMMCooHetero( + const std::string& op, const BcastOff& bcast, + const std::vector& vec_coo, const std::vector& lhs, + const std::vector& rhs, std::vector out, int lhs_target, + int rhs_target, const std::vector& in_eid, + const std::vector& out_eid); +template void SDDMMCooHetero( + const std::string& op, const BcastOff& bcast, + const std::vector& vec_coo, const std::vector& lhs, + const std::vector& rhs, std::vector out, int lhs_target, + int rhs_target, const std::vector& in_eid, + const std::vector& out_eid); +template void SDDMMCooHetero( + const std::string& op, const BcastOff& bcast, + const std::vector& vec_coo, const std::vector& lhs, + const std::vector& rhs, std::vector out, int lhs_target, + int rhs_target, const std::vector& in_eid, + const std::vector& out_eid); + +} // namespace aten +} // namespace dgl diff --git a/src/array/cuda/sddmm_hetero_csr.cu b/src/array/cuda/sddmm_hetero_csr.cu index 7a0331e700b9..cfeaf3cfc348 100644 --- a/src/array/cuda/sddmm_hetero_csr.cu +++ b/src/array/cuda/sddmm_hetero_csr.cu @@ -48,13 +48,13 @@ template void SDDMMCsrHetero( int rhs_target, const std::vector& in_eid, const std::vector& out_eid); #if BF16_ENABLED -template void SDDMMCsrHetero( +template void SDDMMCsrHetero( const std::string& op, const BcastOff& bcast, const std::vector& vec_csr, const std::vector& lhs, const std::vector& rhs, std::vector out, int lhs_target, int rhs_target, const std::vector& 
in_eid, const std::vector& out_eid); -template void SDDMMCsrHetero( +template void SDDMMCsrHetero( const std::string& op, const BcastOff& bcast, const std::vector& vec_csr, const std::vector& lhs, const std::vector& rhs, std::vector out, int lhs_target, diff --git a/src/array/cuda/sddmm_hetero_csr.cu.prehip b/src/array/cuda/sddmm_hetero_csr.cu.prehip new file mode 100644 index 000000000000..7a0331e700b9 --- /dev/null +++ b/src/array/cuda/sddmm_hetero_csr.cu.prehip @@ -0,0 +1,90 @@ +/** + * Copyright (c) 2020 by Contributors + * @file array/cuda/sddmm.cu + * @brief SDDMM C APIs and definitions. + */ +#include + +#include "./sddmm.cuh" + +namespace dgl { +namespace aten { + +/** + * @brief CUDA implementation of g-SDDMM on heterograph using Csr format. + */ +template +void SDDMMCsrHetero( + const std::string& op, const BcastOff& bcast, + const std::vector& vec_csr, const std::vector& vec_lhs, + const std::vector& vec_rhs, std::vector vec_out, + int lhs_target, int rhs_target, const std::vector& lhs_eid, + const std::vector& rhs_eid) { + SWITCH_OP(op, Op, { + SWITCH_TARGET(lhs_target, rhs_target, LhsTarget, RhsTarget, { + /* Call SDDMM CUDA kernel for each relation type sequentially */ + for (dgl_type_t etype = 0; etype < lhs_eid.size(); ++etype) { + CSRMatrix csr = vec_csr[etype]; + NDArray lhs = vec_lhs[lhs_eid[etype]]; + NDArray rhs = vec_rhs[rhs_eid[etype]]; + NDArray out = vec_out[etype]; + cuda::SDDMMCsr( + bcast, csr, lhs, rhs, out); + } + }); + }); +} + +template void SDDMMCsrHetero( + const std::string& op, const BcastOff& bcast, + const std::vector& vec_csr, const std::vector& lhs, + const std::vector& rhs, std::vector out, int lhs_target, + int rhs_target, const std::vector& in_eid, + const std::vector& out_eid); +template void SDDMMCsrHetero( + const std::string& op, const BcastOff& bcast, + const std::vector& vec_csr, const std::vector& lhs, + const std::vector& rhs, std::vector out, int lhs_target, + int rhs_target, const std::vector& in_eid, + const std::vector& out_eid); +#if BF16_ENABLED +template void SDDMMCsrHetero( + const std::string& op, const BcastOff& bcast, + const std::vector& vec_csr, const std::vector& lhs, + const std::vector& rhs, std::vector out, int lhs_target, + int rhs_target, const std::vector& in_eid, + const std::vector& out_eid); +template void SDDMMCsrHetero( + const std::string& op, const BcastOff& bcast, + const std::vector& vec_csr, const std::vector& lhs, + const std::vector& rhs, std::vector out, int lhs_target, + int rhs_target, const std::vector& in_eid, + const std::vector& out_eid); +#endif // BF16_ENABLED +template void SDDMMCsrHetero( + const std::string& op, const BcastOff& bcast, + const std::vector& vec_csr, const std::vector& lhs, + const std::vector& rhs, std::vector out, int lhs_target, + int rhs_target, const std::vector& in_eid, + const std::vector& out_eid); +template void SDDMMCsrHetero( + const std::string& op, const BcastOff& bcast, + const std::vector& vec_csr, const std::vector& lhs, + const std::vector& rhs, std::vector out, int lhs_target, + int rhs_target, const std::vector& in_eid, + const std::vector& out_eid); +template void SDDMMCsrHetero( + const std::string& op, const BcastOff& bcast, + const std::vector& vec_csr, const std::vector& lhs, + const std::vector& rhs, std::vector out, int lhs_target, + int rhs_target, const std::vector& in_eid, + const std::vector& out_eid); +template void SDDMMCsrHetero( + const std::string& op, const BcastOff& bcast, + const std::vector& vec_csr, const std::vector& lhs, + const std::vector& 
rhs, std::vector out, int lhs_target, + int rhs_target, const std::vector& in_eid, + const std::vector& out_eid); + +} // namespace aten +} // namespace dgl diff --git a/src/array/cuda/segment_reduce.cu b/src/array/cuda/segment_reduce.cu index d83c1e68eeb1..7417ca178e24 100644 --- a/src/array/cuda/segment_reduce.cu +++ b/src/array/cuda/segment_reduce.cu @@ -60,10 +60,10 @@ template void SegmentReduce( const std::string& op, NDArray feat, NDArray offsets, NDArray out, NDArray arg); #if BF16_ENABLED -template void SegmentReduce( +template void SegmentReduce( const std::string& op, NDArray feat, NDArray offsets, NDArray out, NDArray arg); -template void SegmentReduce( +template void SegmentReduce( const std::string& op, NDArray feat, NDArray offsets, NDArray out, NDArray arg); #endif // BF16_ENABLED @@ -85,9 +85,9 @@ template void ScatterAdd( template void ScatterAdd( NDArray feat, NDArray idx, NDArray out); #if BF16_ENABLED -template void ScatterAdd( +template void ScatterAdd( NDArray feat, NDArray idx, NDArray out); -template void ScatterAdd( +template void ScatterAdd( NDArray feat, NDArray idx, NDArray out); #endif // BF16_ENABLED template void ScatterAdd( @@ -108,11 +108,11 @@ template void UpdateGradMinMax_hetero( const std::vector& feat, const std::vector& idx, const std::vector& idx_etype, std::vector* out); #if BF16_ENABLED -template void UpdateGradMinMax_hetero( +template void UpdateGradMinMax_hetero( const HeteroGraphPtr& g, const std::string& op, const std::vector& feat, const std::vector& idx, const std::vector& idx_etype, std::vector* out); -template void UpdateGradMinMax_hetero( +template void UpdateGradMinMax_hetero( const HeteroGraphPtr& g, const std::string& op, const std::vector& feat, const std::vector& idx, const std::vector& idx_etype, std::vector* out); @@ -139,9 +139,9 @@ template void BackwardSegmentCmp( template void BackwardSegmentCmp( NDArray feat, NDArray arg, NDArray out); #if BF16_ENABLED -template void BackwardSegmentCmp( +template void BackwardSegmentCmp( NDArray feat, NDArray arg, NDArray out); -template void BackwardSegmentCmp( +template void BackwardSegmentCmp( NDArray feat, NDArray arg, NDArray out); #endif // BF16_ENABLED template void BackwardSegmentCmp( diff --git a/src/array/cuda/segment_reduce.cu.prehip b/src/array/cuda/segment_reduce.cu.prehip new file mode 100644 index 000000000000..d83c1e68eeb1 --- /dev/null +++ b/src/array/cuda/segment_reduce.cu.prehip @@ -0,0 +1,157 @@ +/** + * Copyright (c) 2020 by Contributors + * @file array/cuda/segment_reduce.cu + * @brief Segment reduce C APIs and definitions. 
+ */ +#include +#include + +#include "./functor.cuh" +#include "./segment_reduce.cuh" +#include "./utils.h" + +namespace dgl { + +using namespace cuda; + +namespace aten { + +template +void SegmentReduce( + const std::string& op, NDArray feat, NDArray offsets, NDArray out, + NDArray arg) { + if (op == "sum") { + cuda::SegmentReduce>( + feat, offsets, out, arg); + } else if (op == "max") { + cuda::SegmentReduce>( + feat, offsets, out, arg); + } else if (op == "min") { + cuda::SegmentReduce>( + feat, offsets, out, arg); + } else { + LOG(FATAL) << "Not implemented"; + } +} + +template +void ScatterAdd(NDArray feat, NDArray idx, NDArray out) { + cuda::ScatterAdd(feat, idx, out); +} + +template +void UpdateGradMinMax_hetero( + const HeteroGraphPtr& g, const std::string& op, + const std::vector& feat, const std::vector& idx, + const std::vector& idx_etype, std::vector* out) { + cuda::UpdateGradMinMax_hetero( + g, op, feat, idx, idx_etype, out); +} + +template +void BackwardSegmentCmp(NDArray feat, NDArray arg, NDArray out) { + cuda::BackwardSegmentCmp(feat, arg, out); +} + +template void SegmentReduce( + const std::string& op, NDArray feat, NDArray offsets, NDArray out, + NDArray arg); +template void SegmentReduce( + const std::string& op, NDArray feat, NDArray offsets, NDArray out, + NDArray arg); +#if BF16_ENABLED +template void SegmentReduce( + const std::string& op, NDArray feat, NDArray offsets, NDArray out, + NDArray arg); +template void SegmentReduce( + const std::string& op, NDArray feat, NDArray offsets, NDArray out, + NDArray arg); +#endif // BF16_ENABLED +template void SegmentReduce( + const std::string& op, NDArray feat, NDArray offsets, NDArray out, + NDArray arg); +template void SegmentReduce( + const std::string& op, NDArray feat, NDArray offsets, NDArray out, + NDArray arg); +template void SegmentReduce( + const std::string& op, NDArray feat, NDArray offsets, NDArray out, + NDArray arg); +template void SegmentReduce( + const std::string& op, NDArray feat, NDArray offsets, NDArray out, + NDArray arg); + +template void ScatterAdd( + NDArray feat, NDArray idx, NDArray out); +template void ScatterAdd( + NDArray feat, NDArray idx, NDArray out); +#if BF16_ENABLED +template void ScatterAdd( + NDArray feat, NDArray idx, NDArray out); +template void ScatterAdd( + NDArray feat, NDArray idx, NDArray out); +#endif // BF16_ENABLED +template void ScatterAdd( + NDArray feat, NDArray idx, NDArray out); +template void ScatterAdd( + NDArray feat, NDArray idx, NDArray out); +template void ScatterAdd( + NDArray feat, NDArray idx, NDArray out); +template void ScatterAdd( + NDArray feat, NDArray idx, NDArray out); + +template void UpdateGradMinMax_hetero( + const HeteroGraphPtr& g, const std::string& op, + const std::vector& feat, const std::vector& idx, + const std::vector& idx_etype, std::vector* out); +template void UpdateGradMinMax_hetero( + const HeteroGraphPtr& g, const std::string& op, + const std::vector& feat, const std::vector& idx, + const std::vector& idx_etype, std::vector* out); +#if BF16_ENABLED +template void UpdateGradMinMax_hetero( + const HeteroGraphPtr& g, const std::string& op, + const std::vector& feat, const std::vector& idx, + const std::vector& idx_etype, std::vector* out); +template void UpdateGradMinMax_hetero( + const HeteroGraphPtr& g, const std::string& op, + const std::vector& feat, const std::vector& idx, + const std::vector& idx_etype, std::vector* out); +#endif // BF16_ENABLED +template void UpdateGradMinMax_hetero( + const HeteroGraphPtr& g, const std::string& op, + 
const std::vector& feat, const std::vector& idx, + const std::vector& idx_etype, std::vector* out); +template void UpdateGradMinMax_hetero( + const HeteroGraphPtr& g, const std::string& op, + const std::vector& feat, const std::vector& idx, + const std::vector& idx_etype, std::vector* out); +template void UpdateGradMinMax_hetero( + const HeteroGraphPtr& g, const std::string& op, + const std::vector& feat, const std::vector& idx, + const std::vector& idx_etype, std::vector* out); +template void UpdateGradMinMax_hetero( + const HeteroGraphPtr& g, const std::string& op, + const std::vector& feat, const std::vector& idx, + const std::vector& idx_etype, std::vector* out); + +template void BackwardSegmentCmp( + NDArray feat, NDArray arg, NDArray out); +template void BackwardSegmentCmp( + NDArray feat, NDArray arg, NDArray out); +#if BF16_ENABLED +template void BackwardSegmentCmp( + NDArray feat, NDArray arg, NDArray out); +template void BackwardSegmentCmp( + NDArray feat, NDArray arg, NDArray out); +#endif // BF16_ENABLED +template void BackwardSegmentCmp( + NDArray feat, NDArray arg, NDArray out); +template void BackwardSegmentCmp( + NDArray feat, NDArray arg, NDArray out); +template void BackwardSegmentCmp( + NDArray feat, NDArray arg, NDArray out); +template void BackwardSegmentCmp( + NDArray feat, NDArray arg, NDArray out); + +} // namespace aten +} // namespace dgl diff --git a/src/array/cuda/segment_reduce.cuh b/src/array/cuda/segment_reduce.cuh index b1be03fadd71..90f0616993cc 100644 --- a/src/array/cuda/segment_reduce.cuh +++ b/src/array/cuda/segment_reduce.cuh @@ -1,3 +1,4 @@ +#include "hip/hip_runtime.h" /** * Copyright (c) 2020 by Contributors * @file array/cuda/segment_reduce.cuh @@ -125,7 +126,7 @@ void SegmentReduce(NDArray feat, NDArray offsets, NDArray out, NDArray arg) { DType* out_data = out.Ptr(); IdType* arg_data = arg.Ptr(); - cudaStream_t stream = runtime::getCurrentCUDAStream(); + hipStream_t stream = runtime::getCurrentCUDAStream(); int64_t n = out->shape[0]; int64_t dim = 1; for (int i = 1; i < out->ndim; ++i) dim *= out->shape[i]; @@ -155,7 +156,7 @@ void ScatterAdd(NDArray feat, NDArray idx, NDArray out) { const IdType* idx_data = idx.Ptr(); DType* out_data = out.Ptr(); - cudaStream_t stream = runtime::getCurrentCUDAStream(); + hipStream_t stream = runtime::getCurrentCUDAStream(); int64_t n = feat->shape[0]; int64_t dim = 1; for (int i = 1; i < out->ndim; ++i) dim *= out->shape[i]; @@ -186,7 +187,7 @@ void UpdateGradMinMax_hetero( const std::vector& list_feat, const std::vector& list_idx, const std::vector& list_idx_types, std::vector* list_out) { - cudaStream_t stream = runtime::getCurrentCUDAStream(); + hipStream_t stream = runtime::getCurrentCUDAStream(); if (op == "copy_lhs" || op == "copy_rhs") { std::vector> src_dst_ntypes( graph->NumVertexTypes(), std::vector()); @@ -239,7 +240,7 @@ void BackwardSegmentCmp(NDArray feat, NDArray arg, NDArray out) { const IdType* arg_data = arg.Ptr(); DType* out_data = out.Ptr(); - cudaStream_t stream = runtime::getCurrentCUDAStream(); + hipStream_t stream = runtime::getCurrentCUDAStream(); int64_t n = feat->shape[0]; int64_t dim = 1; for (int i = 1; i < out->ndim; ++i) dim *= out->shape[i]; diff --git a/src/array/cuda/segment_reduce.cuh.prehip b/src/array/cuda/segment_reduce.cuh.prehip new file mode 100644 index 000000000000..b1be03fadd71 --- /dev/null +++ b/src/array/cuda/segment_reduce.cuh.prehip @@ -0,0 +1,262 @@ +/** + * Copyright (c) 2020 by Contributors + * @file array/cuda/segment_reduce.cuh + * @brief Segment reduce 
kernel function header. + */ +#ifndef DGL_ARRAY_CUDA_SEGMENT_REDUCE_CUH_ +#define DGL_ARRAY_CUDA_SEGMENT_REDUCE_CUH_ + +#include +#include + +#include "../../runtime/cuda/cuda_common.h" +#include "./atomic.cuh" +#include "./utils.h" + +namespace dgl { + +using namespace cuda; +using namespace runtime; + +namespace aten { +namespace cuda { + +/** + * @brief CUDA kernel of segment reduce. + * @note each blockthread is responsible for aggregation on a row + * in the result tensor. + */ +template +__global__ void SegmentReduceKernel( + const DType* feat, const IdType* offsets, DType* out, IdType* arg, + int64_t n, int64_t dim) { + for (int row = blockIdx.x; row < n; row += gridDim.x) { + int col = blockIdx.y * blockDim.x + threadIdx.x; + while (col < dim) { + typename accum_dtype::type local_accum = ReduceOp::zero(); + IdType local_arg = -1; + for (IdType i = offsets[row]; i < offsets[row + 1]; ++i) { + ReduceOp::Call(&local_accum, &local_arg, feat[i * dim + col], i); + } + out[row * dim + col] = static_cast(local_accum); + if (ReduceOp::require_arg) arg[row * dim + col] = local_arg; + col += gridDim.y * blockDim.x; + } + } +} + +/** + * @brief CUDA kernel of scatter add. + * @note each blockthread is responsible for adding a row in feature tensor + * to a target row in output tensor. + */ +template +__global__ void ScatterAddKernel( + const DType* feat, const IdType* idx, DType* out, int64_t n, int64_t dim) { + for (int row = blockIdx.x; row < n; row += gridDim.x) { + const int write_row = idx[row]; + int col = blockIdx.y * blockDim.x + threadIdx.x; + while (col < dim) { + cuda::AtomicAdd(out + write_row * dim + col, feat[row * dim + col]); + col += gridDim.y * blockDim.x; + } + } +} + +/** + * @brief CUDA kernel to update gradients for reduce op max/min + * @note each WARP (group of 32 threads) is responsible for adding a row in + * feature tensor to a target row in output tensor. + */ + +template +__global__ void UpdateGradMinMaxHeteroKernel( + const DType* feat, const IdType* idx, const IdType* idx_type, DType* out, + int64_t n, int64_t dim, int type) { + unsigned int tId = threadIdx.x; + unsigned int laneId = tId & 31; + unsigned int gId = blockIdx.x * blockDim.x + threadIdx.x; + unsigned int warpId = gId >> 5; + unsigned int warp_size = 32; + unsigned int row = warpId; + + while (row < n) { + for (unsigned int col = laneId; col < dim; col += warp_size) { + if (type == idx_type[row * dim + col]) { + const int write_row = idx[row * dim + col]; + cuda::AtomicAdd(out + write_row * dim + col, feat[row * dim + col]); + } + } + row += blockDim.x * gridDim.x; + } +} + +/** + * @brief CUDA kernel of backward phase in segment min/max. + * @note each blockthread is responsible for writing a row in the + * result gradient tensor by lookup the ArgMin/Max for index information. + */ +template +__global__ void BackwardSegmentCmpKernel( + const DType* feat, const IdType* arg, DType* out, int64_t n, int64_t dim) { + for (int row = blockIdx.x; row < n; row += gridDim.x) { + int col = blockIdx.y * blockDim.x + threadIdx.x; + while (col < dim) { + int write_row = arg[row * dim + col]; + if (write_row >= 0) { + out[write_row * dim + col] = feat[row * dim + col]; + } + col += gridDim.y * blockDim.x; + } + } +} + +/** + * @brief CUDA implementation of forward phase of Segment Reduce. + * @param feat The input tensor. + * @param offsets The offsets tensor. + * @param out The output tensor. 
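[Editor's aside, not part of the patch: a host-side reference of the segment-reduce forward pass with a "max" reducer, matching the semantics described in the kernel comments above (out[r, c] is reduced over feat rows in [offsets[r], offsets[r+1]), and arg records the winning row for the backward pass). Names and the float/int64 choice are illustrative:]

#include <cstdint>
#include <limits>
#include <vector>

void SegmentMaxReference(const std::vector<float>& feat,       // [n, dim]
                         const std::vector<int64_t>& offsets,  // [num_segments + 1]
                         int64_t dim,
                         std::vector<float>* out,              // [num_segments, dim]
                         std::vector<int64_t>* arg) {
  const int64_t num_segments = static_cast<int64_t>(offsets.size()) - 1;
  out->assign(num_segments * dim, -std::numeric_limits<float>::infinity());
  arg->assign(num_segments * dim, -1);  // -1 marks empty segments
  for (int64_t r = 0; r < num_segments; ++r)
    for (int64_t i = offsets[r]; i < offsets[r + 1]; ++i)
      for (int64_t c = 0; c < dim; ++c)
        if (feat[i * dim + c] > (*out)[r * dim + c]) {
          (*out)[r * dim + c] = feat[i * dim + c];
          (*arg)[r * dim + c] = i;
        }
}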
+ * @param arg An auxiliary tensor storing ArgMax/Min information, + */ +template +void SegmentReduce(NDArray feat, NDArray offsets, NDArray out, NDArray arg) { + const DType* feat_data = feat.Ptr(); + const IdType* offsets_data = offsets.Ptr(); + DType* out_data = out.Ptr(); + IdType* arg_data = arg.Ptr(); + + cudaStream_t stream = runtime::getCurrentCUDAStream(); + int64_t n = out->shape[0]; + int64_t dim = 1; + for (int i = 1; i < out->ndim; ++i) dim *= out->shape[i]; + + const int nbx = FindNumBlocks<'x'>(n); + const int ntx = FindNumThreads(dim); + const int nby = FindNumBlocks<'y'>((dim + ntx - 1) / ntx); + const int nty = 1; + const dim3 nblks(nbx, nby); + const dim3 nthrs(ntx, nty); + // TODO(zihao): try cub's DeviceSegmentedReduce and compare the performance. + CUDA_KERNEL_CALL( + (SegmentReduceKernel), nblks, nthrs, 0, stream, + feat_data, offsets_data, out_data, arg_data, n, dim); +} + +/** + * @brief CUDA implementation of Scatter Add (on first dimension). + * @note math equation: out[idx[i], *] += feat[i, *] + * @param feat The input tensor. + * @param idx The indices tensor. + * @param out The output tensor. + */ +template +void ScatterAdd(NDArray feat, NDArray idx, NDArray out) { + const DType* feat_data = feat.Ptr(); + const IdType* idx_data = idx.Ptr(); + DType* out_data = out.Ptr(); + + cudaStream_t stream = runtime::getCurrentCUDAStream(); + int64_t n = feat->shape[0]; + int64_t dim = 1; + for (int i = 1; i < out->ndim; ++i) dim *= out->shape[i]; + + const int nbx = FindNumBlocks<'x'>(n); + const int ntx = FindNumThreads(dim); + const int nby = FindNumBlocks<'y'>((dim + ntx - 1) / ntx); + const int nty = 1; + const dim3 nblks(nbx, nby); + const dim3 nthrs(ntx, nty); + CUDA_KERNEL_CALL( + (ScatterAddKernel), nblks, nthrs, 0, stream, feat_data, + idx_data, out_data, n, dim); +} + +/** + * @brief CUDA implementation to update gradients for reduce op max/min + * @param graph The input heterogeneous graph. + * @param op The binary operator, could be `copy_u`, `copy_e'. + * @param list_feat List of the input tensors. + * @param list_idx List of the indices tensors. + * @param list_idx_etype List of the node- or edge-type tensors. + * @param list_out List of the output tensors. + */ +template +void UpdateGradMinMax_hetero( + const HeteroGraphPtr& graph, const std::string& op, + const std::vector& list_feat, const std::vector& list_idx, + const std::vector& list_idx_types, + std::vector* list_out) { + cudaStream_t stream = runtime::getCurrentCUDAStream(); + if (op == "copy_lhs" || op == "copy_rhs") { + std::vector> src_dst_ntypes( + graph->NumVertexTypes(), std::vector()); + for (dgl_type_t etype = 0; etype < graph->NumEdgeTypes(); ++etype) { + auto pair = graph->meta_graph()->FindEdge(etype); + const dgl_id_t dst_ntype = pair.first; // graph is reversed + const dgl_id_t src_ntype = pair.second; + auto same_src_dst_ntype = std::find( + std::begin(src_dst_ntypes[dst_ntype]), + std::end(src_dst_ntypes[dst_ntype]), src_ntype); + // if op is "copy_lhs", relation type with same src and dst node type will + // be updated once + if (op == "copy_lhs" && + same_src_dst_ntype != std::end(src_dst_ntypes[dst_ntype])) + continue; + src_dst_ntypes[dst_ntype].push_back(src_ntype); + const DType* feat_data = list_feat[dst_ntype].Ptr(); + const IdType* idx_data = list_idx[dst_ntype].Ptr(); + const IdType* idx_type_data = list_idx_types[dst_ntype].Ptr(); + int type = (op == "copy_lhs") ? 
src_ntype : etype; + DType* out_data = (*list_out)[type].Ptr(); + int dim = 1; + for (int i = 1; i < (*list_out)[type]->ndim; ++i) + dim *= (*list_out)[type]->shape[i]; + int n = list_feat[dst_ntype]->shape[0]; + const int th_per_row = 32; + const int ntx = 128; + const int nbx = FindNumBlocks<'x'>((n * th_per_row + ntx - 1) / ntx); + const dim3 nblks(nbx); + const dim3 nthrs(ntx); + CUDA_KERNEL_CALL( + (UpdateGradMinMaxHeteroKernel), nblks, nthrs, 0, + stream, feat_data, idx_data, idx_type_data, out_data, n, dim, type); + } + } +} + +/** + * @brief CUDA implementation of backward phase of Segment Reduce with Min/Max + * reducer. + * @note math equation: out[arg[i, k], k] = feat[i, k] + * @param feat The input + * tensor. + * @param arg The ArgMin/Max information, used for indexing. + * @param out The output tensor. + */ +template +void BackwardSegmentCmp(NDArray feat, NDArray arg, NDArray out) { + const DType* feat_data = feat.Ptr(); + const IdType* arg_data = arg.Ptr(); + DType* out_data = out.Ptr(); + + cudaStream_t stream = runtime::getCurrentCUDAStream(); + int64_t n = feat->shape[0]; + int64_t dim = 1; + for (int i = 1; i < out->ndim; ++i) dim *= out->shape[i]; + + const int nbx = FindNumBlocks<'x'>(n); + const int ntx = FindNumThreads(dim); + const int nby = FindNumBlocks<'y'>((dim + ntx - 1) / ntx); + const int nty = 1; + const dim3 nblks(nbx, nby); + const dim3 nthrs(ntx, nty); + CUDA_KERNEL_CALL( + (BackwardSegmentCmpKernel), nblks, nthrs, 0, stream, + feat_data, arg_data, out_data, n, dim); +} + +} // namespace cuda +} // namespace aten +} // namespace dgl + +#endif // DGL_ARRAY_CUDA_SEGMENT_REDUCE_CUH_ diff --git a/src/array/cuda/spmat_op_impl_coo.cu b/src/array/cuda/spmat_op_impl_coo.cu index dddcb6c01413..24c53d4ea9ce 100644 --- a/src/array/cuda/spmat_op_impl_coo.cu +++ b/src/array/cuda/spmat_op_impl_coo.cu @@ -1,3 +1,4 @@ +#include "hip/hip_runtime.h" /** * Copyright (c) 2021 by contributors. * @file array/cuda/spmat_op_impl_coo.cu @@ -72,7 +73,7 @@ __global__ void _COOGetRowNNZKernel( template int64_t COOGetRowNNZ(COOMatrix coo, int64_t row) { - cudaStream_t stream = runtime::getCurrentCUDAStream(); + hipStream_t stream = runtime::getCurrentCUDAStream(); const auto& ctx = coo.row->ctx; IdType nnz = coo.row->shape[0]; IdType nt = 1024; @@ -103,7 +104,7 @@ __global__ void _COOGetAllRowNNZKernel( template NDArray COOGetRowNNZ(COOMatrix coo, NDArray rows) { - cudaStream_t stream = runtime::getCurrentCUDAStream(); + hipStream_t stream = runtime::getCurrentCUDAStream(); const auto& ctx = coo.row->ctx; IdType nnz = coo.row->shape[0]; IdType num_rows = coo.num_rows; diff --git a/src/array/cuda/spmat_op_impl_coo.cu.prehip b/src/array/cuda/spmat_op_impl_coo.cu.prehip new file mode 100644 index 000000000000..dddcb6c01413 --- /dev/null +++ b/src/array/cuda/spmat_op_impl_coo.cu.prehip @@ -0,0 +1,139 @@ +/** + * Copyright (c) 2021 by contributors. 
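[Editor's aside, not part of the patch: host-side references for the ScatterAdd and BackwardSegmentCmp routines documented above, following the stated math (out[idx[i], *] += feat[i, *] and out[arg[i, k], k] = feat[i, k], skipping arg entries of -1). Names are illustrative:]

#include <cstdint>
#include <vector>

void ScatterAddReference(const std::vector<float>& feat,
                         const std::vector<int64_t>& idx, int64_t dim,
                         std::vector<float>* out) {
  const int64_t n = static_cast<int64_t>(idx.size());
  for (int64_t i = 0; i < n; ++i)
    for (int64_t c = 0; c < dim; ++c)
      (*out)[idx[i] * dim + c] += feat[i * dim + c];
}

void BackwardSegmentCmpReference(const std::vector<float>& feat,
                                 const std::vector<int64_t>& arg, int64_t dim,
                                 std::vector<float>* out) {
  const int64_t n = static_cast<int64_t>(arg.size()) / dim;
  for (int64_t i = 0; i < n; ++i)
    for (int64_t c = 0; c < dim; ++c)
      if (arg[i * dim + c] >= 0)  // -1 means the segment was empty
        (*out)[arg[i * dim + c] * dim + c] = feat[i * dim + c];
}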
+ * @file array/cuda/spmat_op_impl_coo.cu + * @brief COO operator GPU implementation + */ +#include + +#include +#include +#include + +#include "../../runtime/cuda/cuda_common.h" +#include "./atomic.cuh" +#include "./utils.h" + +namespace dgl { + +using runtime::NDArray; +using namespace cuda; + +namespace aten { +namespace impl { + +template +__device__ void _warpReduce(volatile IdType* sdata, IdType tid) { + sdata[tid] += sdata[tid + 32]; + sdata[tid] += sdata[tid + 16]; + sdata[tid] += sdata[tid + 8]; + sdata[tid] += sdata[tid + 4]; + sdata[tid] += sdata[tid + 2]; + sdata[tid] += sdata[tid + 1]; +} + +template +__global__ void _COOGetRowNNZKernel( + const IdType* __restrict__ row_indices, IdType* __restrict__ glb_cnt, + const int64_t row_query, IdType nnz) { + __shared__ IdType local_cnt[1024]; + IdType tx = threadIdx.x; + IdType bx = blockIdx.x; + local_cnt[tx] = 0; + IdType start = bx * blockDim.x; + while (start < nnz) { + if (start + tx < nnz) + local_cnt[tx] = (row_indices[start + tx] == row_query); + __syncthreads(); + if (tx < 512) { + local_cnt[tx] += local_cnt[tx + 512]; + __syncthreads(); + } + if (tx < 256) { + local_cnt[tx] += local_cnt[tx + 256]; + __syncthreads(); + } + if (tx < 128) { + local_cnt[tx] += local_cnt[tx + 128]; + __syncthreads(); + } + if (tx < 64) { + local_cnt[tx] += local_cnt[tx + 64]; + __syncthreads(); + } + if (tx < 32) { + _warpReduce(local_cnt, tx); + } + if (tx == 0) { + cuda::AtomicAdd(glb_cnt, local_cnt[tx]); + } + start += blockDim.x * gridDim.x; + } +} + +template +int64_t COOGetRowNNZ(COOMatrix coo, int64_t row) { + cudaStream_t stream = runtime::getCurrentCUDAStream(); + const auto& ctx = coo.row->ctx; + IdType nnz = coo.row->shape[0]; + IdType nt = 1024; + IdType nb = dgl::cuda::FindNumBlocks<'x'>((nnz + nt - 1) / nt); + NDArray rst = NDArray::Empty({1}, coo.row->dtype, coo.row->ctx); + _Fill(rst.Ptr(), 1, IdType(0)); + CUDA_KERNEL_CALL( + _COOGetRowNNZKernel, nb, nt, 0, stream, coo.row.Ptr(), + rst.Ptr(), row, nnz); + rst = rst.CopyTo(DGLContext{kDGLCPU, 0}); + return *rst.Ptr(); +} + +template int64_t COOGetRowNNZ(COOMatrix, int64_t); +template int64_t COOGetRowNNZ(COOMatrix, int64_t); + +template +__global__ void _COOGetAllRowNNZKernel( + const IdType* __restrict__ row_indices, IdType* __restrict__ glb_cnts, + IdType nnz) { + IdType eid = blockIdx.x * blockDim.x + threadIdx.x; + while (eid < nnz) { + IdType row = row_indices[eid]; + cuda::AtomicAdd(glb_cnts + row, IdType(1)); + eid += blockDim.x * gridDim.x; + } +} + +template +NDArray COOGetRowNNZ(COOMatrix coo, NDArray rows) { + cudaStream_t stream = runtime::getCurrentCUDAStream(); + const auto& ctx = coo.row->ctx; + IdType nnz = coo.row->shape[0]; + IdType num_rows = coo.num_rows; + IdType num_queries = rows->shape[0]; + if (num_queries == 1) { + auto rows_cpu = rows.CopyTo(DGLContext{kDGLCPU, 0}); + int64_t row = *rows_cpu.Ptr(); + IdType nt = 1024; + IdType nb = dgl::cuda::FindNumBlocks<'x'>((nnz + nt - 1) / nt); + NDArray rst = NDArray::Empty({1}, coo.row->dtype, coo.row->ctx); + _Fill(rst.Ptr(), 1, IdType(0)); + CUDA_KERNEL_CALL( + _COOGetRowNNZKernel, nb, nt, 0, stream, coo.row.Ptr(), + rst.Ptr(), row, nnz); + return rst; + } else { + IdType nt = 1024; + IdType nb = dgl::cuda::FindNumBlocks<'x'>((nnz + nt - 1) / nt); + NDArray in_degrees = NDArray::Empty({num_rows}, rows->dtype, rows->ctx); + _Fill(in_degrees.Ptr(), num_rows, IdType(0)); + CUDA_KERNEL_CALL( + _COOGetAllRowNNZKernel, nb, nt, 0, stream, coo.row.Ptr(), + in_degrees.Ptr(), nnz); + return IndexSelect(in_degrees, rows); 
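[Editor's aside, not part of the patch: the batched COOGetRowNNZ above first builds per-row counts (the atomic-add kernel) and then gathers the counts for the queried rows via IndexSelect. A host-side reference of the same two steps, with illustrative names:]

#include <cstdint>
#include <vector>

std::vector<int64_t> CooRowNnzReference(const std::vector<int64_t>& coo_row,
                                        int64_t num_rows,
                                        const std::vector<int64_t>& queries) {
  // Step 1: count how many COO entries land in each row (in-degrees).
  std::vector<int64_t> in_degrees(num_rows, 0);
  for (int64_t r : coo_row) ++in_degrees[r];
  // Step 2: gather the counts for the queried rows.
  std::vector<int64_t> out;
  out.reserve(queries.size());
  for (int64_t q : queries) out.push_back(in_degrees[q]);
  return out;
}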
+ } +} + +template NDArray COOGetRowNNZ(COOMatrix, NDArray); +template NDArray COOGetRowNNZ(COOMatrix, NDArray); + +} // namespace impl +} // namespace aten +} // namespace dgl diff --git a/src/array/cuda/spmat_op_impl_csr.cu b/src/array/cuda/spmat_op_impl_csr.cu index 7ee1f2ddcecc..4e07fcb0416c 100644 --- a/src/array/cuda/spmat_op_impl_csr.cu +++ b/src/array/cuda/spmat_op_impl_csr.cu @@ -1,3 +1,4 @@ +#include "hip/hip_runtime.h" /** * Copyright (c) 2020 by Contributors * @file array/cuda/spmat_op_impl_csr.cu @@ -7,7 +8,7 @@ #include #include -#include +#include #include #include #include @@ -28,7 +29,7 @@ namespace impl { template bool CSRIsNonZero(CSRMatrix csr, int64_t row, int64_t col) { - cudaStream_t stream = runtime::getCurrentCUDAStream(); + hipStream_t stream = runtime::getCurrentCUDAStream(); const auto& ctx = csr.indptr->ctx; IdArray rows = aten::VecToIdArray({row}, sizeof(IdType) * 8, ctx); IdArray cols = aten::VecToIdArray({col}, sizeof(IdType) * 8, ctx); @@ -58,7 +59,7 @@ NDArray CSRIsNonZero(CSRMatrix csr, NDArray row, NDArray col) { if (rstlen == 0) return rst; const int64_t row_stride = (rowlen == 1 && collen != 1) ? 0 : 1; const int64_t col_stride = (collen == 1 && rowlen != 1) ? 0 : 1; - cudaStream_t stream = runtime::getCurrentCUDAStream(); + hipStream_t stream = runtime::getCurrentCUDAStream(); const int nt = dgl::cuda::FindNumThreads(rstlen); const int nb = (rstlen + nt - 1) / nt; const IdType* data = nullptr; @@ -104,7 +105,7 @@ template bool CSRHasDuplicate(CSRMatrix csr) { if (!csr.sorted) csr = CSRSort(csr); const auto& ctx = csr.indptr->ctx; - cudaStream_t stream = runtime::getCurrentCUDAStream(); + hipStream_t stream = runtime::getCurrentCUDAStream(); auto device = runtime::DeviceAPI::Get(ctx); // We allocate a workspace of num_rows bytes. It wastes a little bit memory // but should be fine. @@ -149,7 +150,7 @@ __global__ void _CSRGetRowNNZKernel( template NDArray CSRGetRowNNZ(CSRMatrix csr, NDArray rows) { - cudaStream_t stream = runtime::getCurrentCUDAStream(); + hipStream_t stream = runtime::getCurrentCUDAStream(); const auto len = rows->shape[0]; const IdType* vid_data = rows.Ptr(); const IdType* indptr_data = @@ -250,7 +251,7 @@ __global__ void _SegmentCopyKernel( template CSRMatrix CSRSliceRows(CSRMatrix csr, NDArray rows) { - cudaStream_t stream = runtime::getCurrentCUDAStream(); + hipStream_t stream = runtime::getCurrentCUDAStream(); const int64_t len = rows->shape[0]; IdArray ret_indptr = aten::CumSum(aten::CSRGetRowNNZ(csr, rows), true); const int64_t nnz = aten::IndexSelect(ret_indptr, len); @@ -367,7 +368,7 @@ std::vector CSRGetDataAndIndices( const int64_t nnz = csr.indices->shape[0]; const int64_t row_stride = (rowlen == 1 && collen != 1) ? 0 : 1; const int64_t col_stride = (collen == 1 && rowlen != 1) ? 
0 : 1; - cudaStream_t stream = runtime::getCurrentCUDAStream(); + hipStream_t stream = runtime::getCurrentCUDAStream(); const IdType* indptr_data = static_cast(GetDevicePointer(csr.indptr)); @@ -532,7 +533,7 @@ __global__ void _SegmentMaskColKernel( static_cast(num_rows)); NodeQueryHashmap hashmap(hashmap_buffer, buffer_size); - typedef cub::WarpReduce WarpReduce; + typedef hipcub::WarpReduce WarpReduce; __shared__ typename WarpReduce::TempStorage temp_storage[BLOCK_WARPS]; while (out_row < last_row) { @@ -557,7 +558,7 @@ __global__ void _SegmentMaskColKernel( template CSRMatrix CSRSliceMatrix( CSRMatrix csr, runtime::NDArray rows, runtime::NDArray cols) { - cudaStream_t stream = runtime::getCurrentCUDAStream(); + hipStream_t stream = runtime::getCurrentCUDAStream(); const auto& ctx = rows->ctx; const auto& dtype = rows->dtype; const auto nbits = dtype.bits; @@ -582,7 +583,7 @@ CSRMatrix CSRSliceMatrix( // A count for how many masked values per row. IdArray count = NewIdArray(csr.num_rows, ctx, nbits); CUDA_CALL( - cudaMemset(count.Ptr(), 0, sizeof(IdType) * (csr.num_rows))); + hipMemset(count.Ptr(), 0, sizeof(IdType) * (csr.num_rows))); // Generate a NodeQueryHashmap buffer. The key of the hashmap is col. // For performance, the load factor of the hashmap is in (0.25, 0.5); @@ -593,7 +594,7 @@ CSRMatrix CSRSliceMatrix( using it = thrust::counting_iterator; runtime::CUDAWorkspaceAllocator allocator(ctx); - const auto exec_policy = thrust::cuda::par_nosync(allocator).on(stream); + const auto exec_policy = thrust::hip::par_nosync(allocator).on(stream); thrust::for_each( exec_policy, it(0), it(new_ncols), [key = cols.Ptr(), buffer = hashmap_buffer.Ptr(), diff --git a/src/array/cuda/spmat_op_impl_csr.cu.prehip b/src/array/cuda/spmat_op_impl_csr.cu.prehip new file mode 100644 index 000000000000..7ee1f2ddcecc --- /dev/null +++ b/src/array/cuda/spmat_op_impl_csr.cu.prehip @@ -0,0 +1,654 @@ +/** + * Copyright (c) 2020 by Contributors + * @file array/cuda/spmat_op_impl_csr.cu + * @brief CSR operator CPU implementation + */ +#include +#include +#include + +#include +#include +#include +#include + +#include "../../runtime/cuda/cuda_common.h" +#include "./atomic.cuh" +#include "./utils.h" + +namespace dgl { + +using runtime::NDArray; +using namespace cuda; + +namespace aten { +namespace impl { + +///////////////////////////// CSRIsNonZero ///////////////////////////// + +template +bool CSRIsNonZero(CSRMatrix csr, int64_t row, int64_t col) { + cudaStream_t stream = runtime::getCurrentCUDAStream(); + const auto& ctx = csr.indptr->ctx; + IdArray rows = aten::VecToIdArray({row}, sizeof(IdType) * 8, ctx); + IdArray cols = aten::VecToIdArray({col}, sizeof(IdType) * 8, ctx); + rows = rows.CopyTo(ctx); + cols = cols.CopyTo(ctx); + IdArray out = aten::NewIdArray(1, ctx, sizeof(IdType) * 8); + const IdType* data = nullptr; + // TODO(minjie): use binary search for sorted csr + CUDA_KERNEL_CALL( + dgl::cuda::_LinearSearchKernel, 1, 1, 0, stream, csr.indptr.Ptr(), + csr.indices.Ptr(), data, rows.Ptr(), cols.Ptr(), + 1, 1, 1, static_cast(nullptr), static_cast(-1), + out.Ptr()); + out = out.CopyTo(DGLContext{kDGLCPU, 0}); + return *out.Ptr() != -1; +} + +template bool CSRIsNonZero(CSRMatrix, int64_t, int64_t); +template bool CSRIsNonZero(CSRMatrix, int64_t, int64_t); + +template +NDArray CSRIsNonZero(CSRMatrix csr, NDArray row, NDArray col) { + const auto rowlen = row->shape[0]; + const auto collen = col->shape[0]; + const auto rstlen = std::max(rowlen, collen); + NDArray rst = NDArray::Empty({rstlen}, row->dtype, 
row->ctx); + if (rstlen == 0) return rst; + const int64_t row_stride = (rowlen == 1 && collen != 1) ? 0 : 1; + const int64_t col_stride = (collen == 1 && rowlen != 1) ? 0 : 1; + cudaStream_t stream = runtime::getCurrentCUDAStream(); + const int nt = dgl::cuda::FindNumThreads(rstlen); + const int nb = (rstlen + nt - 1) / nt; + const IdType* data = nullptr; + const IdType* indptr_data = + static_cast(GetDevicePointer(csr.indptr)); + const IdType* indices_data = + static_cast(GetDevicePointer(csr.indices)); + // TODO(minjie): use binary search for sorted csr + CUDA_KERNEL_CALL( + dgl::cuda::_LinearSearchKernel, nb, nt, 0, stream, indptr_data, + indices_data, data, row.Ptr(), col.Ptr(), row_stride, + col_stride, rstlen, static_cast(nullptr), + static_cast(-1), rst.Ptr()); + return rst != -1; +} + +template NDArray CSRIsNonZero(CSRMatrix, NDArray, NDArray); +template NDArray CSRIsNonZero(CSRMatrix, NDArray, NDArray); + +///////////////////////////// CSRHasDuplicate ///////////////////////////// + +/** + * @brief Check whether each row does not have any duplicate entries. + * Assume the CSR is sorted. + */ +template +__global__ void _SegmentHasNoDuplicate( + const IdType* indptr, const IdType* indices, int64_t num_rows, + int8_t* flags) { + int tx = blockIdx.x * blockDim.x + threadIdx.x; + const int stride_x = gridDim.x * blockDim.x; + while (tx < num_rows) { + bool f = true; + for (IdType i = indptr[tx] + 1; f && i < indptr[tx + 1]; ++i) { + f = (indices[i - 1] != indices[i]); + } + flags[tx] = static_cast(f); + tx += stride_x; + } +} + +template +bool CSRHasDuplicate(CSRMatrix csr) { + if (!csr.sorted) csr = CSRSort(csr); + const auto& ctx = csr.indptr->ctx; + cudaStream_t stream = runtime::getCurrentCUDAStream(); + auto device = runtime::DeviceAPI::Get(ctx); + // We allocate a workspace of num_rows bytes. It wastes a little bit memory + // but should be fine. 
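[Editor's aside, not part of the patch: the duplicate check above assumes the CSR is sorted, so a duplicate appears as two equal adjacent column ids within a row. A host-side reference with illustrative names:]

#include <cstdint>
#include <vector>

bool CsrHasDuplicateReference(const std::vector<int64_t>& indptr,
                              const std::vector<int64_t>& indices) {
  const int64_t num_rows = static_cast<int64_t>(indptr.size()) - 1;
  for (int64_t r = 0; r < num_rows; ++r)
    for (int64_t i = indptr[r] + 1; i < indptr[r + 1]; ++i)
      if (indices[i - 1] == indices[i]) return true;  // adjacent equal columns
  return false;
}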
+ int8_t* flags = + static_cast(device->AllocWorkspace(ctx, csr.num_rows)); + const int nt = dgl::cuda::FindNumThreads(csr.num_rows); + const int nb = (csr.num_rows + nt - 1) / nt; + CUDA_KERNEL_CALL( + _SegmentHasNoDuplicate, nb, nt, 0, stream, csr.indptr.Ptr(), + csr.indices.Ptr(), csr.num_rows, flags); + bool ret = dgl::cuda::AllTrue(flags, csr.num_rows, ctx); + device->FreeWorkspace(ctx, flags); + return !ret; +} + +template bool CSRHasDuplicate(CSRMatrix csr); +template bool CSRHasDuplicate(CSRMatrix csr); + +///////////////////////////// CSRGetRowNNZ ///////////////////////////// + +template +int64_t CSRGetRowNNZ(CSRMatrix csr, int64_t row) { + const IdType cur = aten::IndexSelect(csr.indptr, row); + const IdType next = aten::IndexSelect(csr.indptr, row + 1); + return next - cur; +} + +template int64_t CSRGetRowNNZ(CSRMatrix, int64_t); +template int64_t CSRGetRowNNZ(CSRMatrix, int64_t); + +template +__global__ void _CSRGetRowNNZKernel( + const IdType* vid, const IdType* indptr, IdType* out, int64_t length) { + int tx = blockIdx.x * blockDim.x + threadIdx.x; + int stride_x = gridDim.x * blockDim.x; + while (tx < length) { + const IdType vv = vid[tx]; + out[tx] = indptr[vv + 1] - indptr[vv]; + tx += stride_x; + } +} + +template +NDArray CSRGetRowNNZ(CSRMatrix csr, NDArray rows) { + cudaStream_t stream = runtime::getCurrentCUDAStream(); + const auto len = rows->shape[0]; + const IdType* vid_data = rows.Ptr(); + const IdType* indptr_data = + static_cast(GetDevicePointer(csr.indptr)); + NDArray rst = NDArray::Empty({len}, rows->dtype, rows->ctx); + IdType* rst_data = static_cast(rst->data); + const int nt = dgl::cuda::FindNumThreads(len); + const int nb = (len + nt - 1) / nt; + CUDA_KERNEL_CALL( + _CSRGetRowNNZKernel, nb, nt, 0, stream, vid_data, indptr_data, rst_data, + len); + return rst; +} + +template NDArray CSRGetRowNNZ(CSRMatrix, NDArray); +template NDArray CSRGetRowNNZ(CSRMatrix, NDArray); + +////////////////////////// CSRGetRowColumnIndices ////////////////////////////// + +template +NDArray CSRGetRowColumnIndices(CSRMatrix csr, int64_t row) { + const int64_t len = impl::CSRGetRowNNZ(csr, row); + const int64_t offset = + aten::IndexSelect(csr.indptr, row) * sizeof(IdType); + return csr.indices.CreateView({len}, csr.indices->dtype, offset); +} + +template NDArray CSRGetRowColumnIndices(CSRMatrix, int64_t); +template NDArray CSRGetRowColumnIndices(CSRMatrix, int64_t); + +///////////////////////////// CSRGetRowData ///////////////////////////// + +template +NDArray CSRGetRowData(CSRMatrix csr, int64_t row) { + const int64_t len = impl::CSRGetRowNNZ(csr, row); + const int64_t offset = + aten::IndexSelect(csr.indptr, row) * sizeof(IdType); + if (aten::CSRHasData(csr)) + return csr.data.CreateView({len}, csr.data->dtype, offset); + else + return aten::Range( + offset, offset + len, csr.indptr->dtype.bits, csr.indptr->ctx); +} + +template NDArray CSRGetRowData(CSRMatrix, int64_t); +template NDArray CSRGetRowData(CSRMatrix, int64_t); + +///////////////////////////// CSRSliceRows ///////////////////////////// + +template +CSRMatrix CSRSliceRows(CSRMatrix csr, int64_t start, int64_t end) { + const int64_t num_rows = end - start; + const IdType st_pos = aten::IndexSelect(csr.indptr, start); + const IdType ed_pos = aten::IndexSelect(csr.indptr, end); + const IdType nnz = ed_pos - st_pos; + IdArray ret_indptr = aten::IndexSelect(csr.indptr, start, end + 1) - st_pos; + // indices and data can be view arrays + IdArray ret_indices = csr.indices.CreateView( + {nnz}, csr.indices->dtype, st_pos * 
sizeof(IdType)); + IdArray ret_data; + if (CSRHasData(csr)) + ret_data = + csr.data.CreateView({nnz}, csr.data->dtype, st_pos * sizeof(IdType)); + else + ret_data = + aten::Range(st_pos, ed_pos, csr.indptr->dtype.bits, csr.indptr->ctx); + return CSRMatrix( + num_rows, csr.num_cols, ret_indptr, ret_indices, ret_data, csr.sorted); +} + +template CSRMatrix CSRSliceRows(CSRMatrix, int64_t, int64_t); +template CSRMatrix CSRSliceRows(CSRMatrix, int64_t, int64_t); + +/** + * @brief Copy data segment to output buffers + * + * For the i^th row r = row[i], copy the data from indptr[r] ~ indptr[r+1] + * to the out_data from out_indptr[i] ~ out_indptr[i+1] + * + * If the provided `data` array is nullptr, write the read index to the + * out_data. + * + */ +template +__global__ void _SegmentCopyKernel( + const IdType* indptr, const DType* data, const IdType* row, int64_t length, + int64_t n_row, const IdType* out_indptr, DType* out_data) { + IdType tx = static_cast(blockIdx.x) * blockDim.x + threadIdx.x; + const int stride_x = gridDim.x * blockDim.x; + while (tx < length) { + IdType rpos = dgl::cuda::_UpperBound(out_indptr, n_row, tx) - 1; + IdType rofs = tx - out_indptr[rpos]; + const IdType u = row[rpos]; + out_data[tx] = data ? data[indptr[u] + rofs] : indptr[u] + rofs; + tx += stride_x; + } +} + +template +CSRMatrix CSRSliceRows(CSRMatrix csr, NDArray rows) { + cudaStream_t stream = runtime::getCurrentCUDAStream(); + const int64_t len = rows->shape[0]; + IdArray ret_indptr = aten::CumSum(aten::CSRGetRowNNZ(csr, rows), true); + const int64_t nnz = aten::IndexSelect(ret_indptr, len); + + const int nt = 256; // for better GPU usage of small invocations + const int nb = (nnz + nt - 1) / nt; + + // Copy indices. + IdArray ret_indices = NDArray::Empty({nnz}, csr.indptr->dtype, rows->ctx); + + const IdType* indptr_data = + static_cast(GetDevicePointer(csr.indptr)); + const IdType* indices_data = + static_cast(GetDevicePointer(csr.indices)); + const IdType* data_data = + CSRHasData(csr) ? static_cast(GetDevicePointer(csr.data)) + : nullptr; + + CUDA_KERNEL_CALL( + _SegmentCopyKernel, nb, nt, 0, stream, indptr_data, indices_data, + rows.Ptr(), nnz, len, ret_indptr.Ptr(), + ret_indices.Ptr()); + // Copy data. + IdArray ret_data = NDArray::Empty({nnz}, csr.indptr->dtype, rows->ctx); + CUDA_KERNEL_CALL( + _SegmentCopyKernel, nb, nt, 0, stream, indptr_data, data_data, + rows.Ptr(), nnz, len, ret_indptr.Ptr(), + ret_data.Ptr()); + return CSRMatrix( + len, csr.num_cols, ret_indptr, ret_indices, ret_data, csr.sorted); +} + +template CSRMatrix CSRSliceRows(CSRMatrix, NDArray); +template CSRMatrix CSRSliceRows(CSRMatrix, NDArray); + +///////////////////////////// CSRGetDataAndIndices ///////////////////////////// + +/** + * @brief Generate a 0-1 mask for each index that hits the provided (row, col) + * index. 
+ * + * Examples: + * Given a CSR matrix (with duplicate entries) as follows: + * [[0, 1, 2, 0, 0], + * [1, 0, 0, 0, 0], + * [0, 0, 1, 1, 0], + * [0, 0, 0, 0, 0]] + * Given rows: [0, 1], cols: [0, 2, 3] + * The result mask is: [0, 1, 1, 1, 0, 0] + */ +template <typename IdType> +__global__ void _SegmentMaskKernel( + const IdType* indptr, const IdType* indices, const IdType* row, + const IdType* col, int64_t row_stride, int64_t col_stride, int64_t length, + IdType* mask) { + int tx = blockIdx.x * blockDim.x + threadIdx.x; + const int stride_x = gridDim.x * blockDim.x; + while (tx < length) { + int rpos = tx * row_stride, cpos = tx * col_stride; + const IdType r = row[rpos], c = col[cpos]; + for (IdType i = indptr[r]; i < indptr[r + 1]; ++i) { + if (indices[i] == c) { + mask[i] = 1; + } + } + tx += stride_x; + } +} + +/** + * @brief Search for the insertion positions of the needles in the hay. + * + * The hay is a sorted list of elements and the result is the insertion position + * of each needle such that the insertion still gives sorted order. + * + * It essentially performs a binary search to find the lower bound for each + * needle element. It requires that the largest element in the hay is larger + * than the given needle elements. Commonly used in searching for the row IDs + * of a given set of coordinates. + */ +template <typename IdType> +__global__ void _SortedSearchKernel( + const IdType* hay, int64_t hay_size, const IdType* needles, + int64_t num_needles, IdType* pos) { + int tx = blockIdx.x * blockDim.x + threadIdx.x; + const int stride_x = gridDim.x * blockDim.x; + while (tx < num_needles) { + const IdType ele = needles[tx]; + // binary search + IdType lo = 0, hi = hay_size - 1; + while (lo < hi) { + IdType mid = (lo + hi) >> 1; + if (hay[mid] <= ele) { + lo = mid + 1; + } else { + hi = mid; + } + } + pos[tx] = (hay[hi] == ele) ? hi : hi - 1; + tx += stride_x; + } +} + +template <DGLDeviceType XPU, typename IdType> +std::vector<NDArray> CSRGetDataAndIndices( + CSRMatrix csr, NDArray row, NDArray col) { + const auto rowlen = row->shape[0]; + const auto collen = col->shape[0]; + const auto len = std::max(rowlen, collen); + if (len == 0) return {NullArray(), NullArray(), NullArray()}; + + const auto& ctx = row->ctx; + const auto nbits = row->dtype.bits; + const int64_t nnz = csr.indices->shape[0]; + const int64_t row_stride = (rowlen == 1 && collen != 1) ? 0 : 1; + const int64_t col_stride = (collen == 1 && rowlen != 1) ? 0 : 1; + cudaStream_t stream = runtime::getCurrentCUDAStream(); + + const IdType* indptr_data = + static_cast<IdType*>(GetDevicePointer(csr.indptr)); + const IdType* indices_data = + static_cast<IdType*>(GetDevicePointer(csr.indices)); + + // Generate a 0-1 mask for matched (row, col) positions. + IdArray mask = Full(0, nnz, nbits, ctx); + const int nt = dgl::cuda::FindNumThreads(len); + const int nb = (len + nt - 1) / nt; + CUDA_KERNEL_CALL( + _SegmentMaskKernel, nb, nt, 0, stream, indptr_data, indices_data, + row.Ptr<IdType>(), col.Ptr<IdType>(), row_stride, col_stride, len, + mask.Ptr<IdType>()); + + IdArray idx = AsNumBits(NonZero(mask), nbits); + if (idx->shape[0] == 0) + // No data. Return three empty arrays. + return {idx, idx, idx}; + + // Search for row index + IdArray ret_row = NewIdArray(idx->shape[0], ctx, nbits); + const int nt2 = dgl::cuda::FindNumThreads(idx->shape[0]); + const int nb2 = (idx->shape[0] + nt - 1) / nt; + CUDA_KERNEL_CALL( + _SortedSearchKernel, nb2, nt2, 0, stream, indptr_data, csr.num_rows, + idx.Ptr<IdType>(), idx->shape[0], ret_row.Ptr<IdType>()); + + // Column & data can be obtained by index select. + IdArray ret_col = IndexSelect(csr.indices, idx); + IdArray ret_data = CSRHasData(csr) ?
IndexSelect(csr.data, idx) : idx; + return {ret_row, ret_col, ret_data}; +} + +template std::vector CSRGetDataAndIndices( + CSRMatrix csr, NDArray rows, NDArray cols); +template std::vector CSRGetDataAndIndices( + CSRMatrix csr, NDArray rows, NDArray cols); + +///////////////////////////// CSRSliceMatrix ///////////////////////////// + +int64_t _UpPower(int64_t numel) { + uint64_t ret = 1 << static_cast(std::log2(numel) + 1); + return ret; +} + +/** + * @brief Thomas Wang's 32 bit Mix Function. + * Source link: https://gist.github.com/badboy/6267743 + */ +__device__ inline uint32_t _Hash32Shift(uint32_t key) { + key = ~key + (key << 15); + key = key ^ (key >> 12); + key = key + (key << 2); + key = key ^ (key >> 4); + key = key * 2057; + key = key ^ (key >> 16); + return key; +} + +/** + * @brief Thomas Wang's 64 bit Mix Function. + * Source link: https://gist.github.com/badboy/6267743 + */ +__device__ inline uint64_t _Hash64Shift(uint64_t key) { + key = (~key) + (key << 21); + key = key ^ (key >> 24); + key = (key + (key << 3)) + (key << 8); + key = key ^ (key >> 14); + key = (key + (key << 2)) + (key << 4); + key = key ^ (key >> 28); + key = key + (key << 31); + return key; +} + +/** + * @brief A hashmap designed for CSRSliceMatrix, similar in function to set. For + * performance, it can only be created and called in the cuda kernel. + */ +template +struct NodeQueryHashmap { + __device__ inline NodeQueryHashmap(IdType* Kptr, size_t numel) + : kptr_(Kptr), capacity_(numel) {} + + /** + * @brief Insert a key. It must be called by cuda threads. + * + * @param key The key to be inserted. + */ + __device__ inline void Insert(IdType key) { + uint32_t delta = 1; + uint32_t pos = Hash(key); + IdType prev = dgl::aten::cuda::AtomicCAS(&kptr_[pos], kEmptyKey_, key); + while (prev != key && prev != kEmptyKey_) { + pos = Hash(pos + delta); + delta += 1; + prev = dgl::aten::cuda::AtomicCAS(&kptr_[pos], kEmptyKey_, key); + } + } + + /** + * @brief Check whether a key exists within the hashtable. It must be called + * by cuda threads. + * + * @param key The key to check for. + * @return True if the key exists in the hashtable. + */ + __device__ inline bool Query(IdType key) { + uint32_t delta = 1; + uint32_t pos = Hash(key); + while (true) { + if (kptr_[pos] == key) return true; + if (kptr_[pos] == kEmptyKey_) return false; + pos = Hash(pos + delta); + delta += 1; + } + return false; + } + + __device__ inline uint32_t Hash(int32_t key) { + return _Hash32Shift(key) & (capacity_ - 1); + } + + __device__ inline uint32_t Hash(uint32_t key) { + return _Hash32Shift(key) & (capacity_ - 1); + } + + __device__ inline uint32_t Hash(int64_t key) { + return static_cast(_Hash64Shift(key)) & (capacity_ - 1); + } + + __device__ inline uint32_t Hash(uint64_t key) { + return static_cast(_Hash64Shift(key)) & (capacity_ - 1); + } + + IdType kEmptyKey_{-1}; + IdType* kptr_; + uint32_t capacity_{0}; +}; + +/** + * @brief Generate a 0-1 mask for each index whose column is in the provided + * hashmap. It also counts the number of masked values per row. + * + * @tparam IdType The ID type used for matrices. + * @tparam WARP_SIZE The number of cuda threads in a cuda warp. + * @tparam BLOCK_WARPS The number of warps in a cuda block. + * @tparam TILE_SIZE The number of rows covered by each threadblock. 
+ */ +template +__global__ void _SegmentMaskColKernel( + const IdType* indptr, const IdType* indices, int64_t num_rows, + IdType* hashmap_buffer, int64_t buffer_size, IdType* mask, IdType* count) { + assert(blockDim.x == WARP_SIZE); + assert(blockDim.y == BLOCK_WARPS); + + int warp_id = threadIdx.y; + int laneid = threadIdx.x; + IdType out_row = blockIdx.x * TILE_SIZE + threadIdx.y; + IdType last_row = + min(static_cast((blockIdx.x + 1) * TILE_SIZE), + static_cast(num_rows)); + + NodeQueryHashmap hashmap(hashmap_buffer, buffer_size); + typedef cub::WarpReduce WarpReduce; + __shared__ typename WarpReduce::TempStorage temp_storage[BLOCK_WARPS]; + + while (out_row < last_row) { + IdType local_count = 0; + IdType in_row_start = indptr[out_row]; + IdType in_row_end = indptr[out_row + 1]; + for (int idx = in_row_start + laneid; idx < in_row_end; idx += WARP_SIZE) { + bool is_in = hashmap.Query(indices[idx]); + if (is_in) { + local_count += 1; + mask[idx] = 1; + } + } + IdType reduce_count = WarpReduce(temp_storage[warp_id]).Sum(local_count); + if (laneid == 0) { + count[out_row] = reduce_count; + } + out_row += BLOCK_WARPS; + } +} + +template +CSRMatrix CSRSliceMatrix( + CSRMatrix csr, runtime::NDArray rows, runtime::NDArray cols) { + cudaStream_t stream = runtime::getCurrentCUDAStream(); + const auto& ctx = rows->ctx; + const auto& dtype = rows->dtype; + const auto nbits = dtype.bits; + const int64_t new_nrows = rows->shape[0]; + const int64_t new_ncols = cols->shape[0]; + + if (new_nrows == 0 || new_ncols == 0) + return CSRMatrix( + new_nrows, new_ncols, Full(0, new_nrows + 1, nbits, ctx), + NullArray(dtype, ctx), NullArray(dtype, ctx)); + + // First slice rows + csr = CSRSliceRows(csr, rows); + + if (csr.indices->shape[0] == 0) + return CSRMatrix( + new_nrows, new_ncols, Full(0, new_nrows + 1, nbits, ctx), + NullArray(dtype, ctx), NullArray(dtype, ctx)); + + // Generate a 0-1 mask for matched (row, col) positions. + IdArray mask = Full(0, csr.indices->shape[0], nbits, ctx); + // A count for how many masked values per row. + IdArray count = NewIdArray(csr.num_rows, ctx, nbits); + CUDA_CALL( + cudaMemset(count.Ptr(), 0, sizeof(IdType) * (csr.num_rows))); + + // Generate a NodeQueryHashmap buffer. The key of the hashmap is col. + // For performance, the load factor of the hashmap is in (0.25, 0.5); + // Because num_cols is usually less than 1 Million (on GPU), the + // memory overhead is not significant (less than 31MB) at a low load factor. + int64_t buffer_size = _UpPower(new_ncols) * 2; + IdArray hashmap_buffer = Full(-1, buffer_size, nbits, ctx); + + using it = thrust::counting_iterator; + runtime::CUDAWorkspaceAllocator allocator(ctx); + const auto exec_policy = thrust::cuda::par_nosync(allocator).on(stream); + thrust::for_each( + exec_policy, it(0), it(new_ncols), + [key = cols.Ptr(), buffer = hashmap_buffer.Ptr(), + buffer_size] __device__(int64_t i) { + NodeQueryHashmap hashmap(buffer, buffer_size); + hashmap.Insert(key[i]); + }); + + const IdType* indptr_data = + static_cast(GetDevicePointer(csr.indptr)); + const IdType* indices_data = + static_cast(GetDevicePointer(csr.indices)); + + // Execute SegmentMaskColKernel + const int64_t num_rows = csr.num_rows; + constexpr int WARP_SIZE = 32; + // With a simple fine-tuning, TILE_SIZE=16 gives a good performance. 
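+ // Launch geometry (illustrative summary): each block is WARP_SIZE x
+ // BLOCK_WARPS threads, one warp scans one row of the sliced CSR at a time,
+ // and each block covers TILE_SIZE rows, so the grid needs roughly
+ // (num_rows + TILE_SIZE - 1) / TILE_SIZE blocks along x.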
+ constexpr int TILE_SIZE = 16; + constexpr int BLOCK_WARPS = CUDA_MAX_NUM_THREADS / WARP_SIZE; + IdType nb = + dgl::cuda::FindNumBlocks<'x'>((num_rows + TILE_SIZE - 1) / TILE_SIZE); + const dim3 nthrs(WARP_SIZE, BLOCK_WARPS); + const dim3 nblks(nb); + CUDA_KERNEL_CALL( + (_SegmentMaskColKernel), nblks, + nthrs, 0, stream, indptr_data, indices_data, num_rows, + hashmap_buffer.Ptr(), buffer_size, mask.Ptr(), + count.Ptr()); + + IdArray idx = AsNumBits(NonZero(mask), nbits); + if (idx->shape[0] == 0) + return CSRMatrix( + new_nrows, new_ncols, Full(0, new_nrows + 1, nbits, ctx), + NullArray(dtype, ctx), NullArray(dtype, ctx)); + + // Indptr needs to be adjusted according to the new nnz per row. + IdArray ret_indptr = CumSum(count, true); + + // Column & data can be obtained by index select. + IdArray ret_col = IndexSelect(csr.indices, idx); + IdArray ret_data = CSRHasData(csr) ? IndexSelect(csr.data, idx) : idx; + + // Relabel column + IdArray col_hash = NewIdArray(csr.num_cols, ctx, nbits); + Scatter_(cols, Range(0, cols->shape[0], nbits, ctx), col_hash); + ret_col = IndexSelect(col_hash, ret_col); + + return CSRMatrix(new_nrows, new_ncols, ret_indptr, ret_col, ret_data); +} + +template CSRMatrix CSRSliceMatrix( + CSRMatrix csr, runtime::NDArray rows, runtime::NDArray cols); +template CSRMatrix CSRSliceMatrix( + CSRMatrix csr, runtime::NDArray rows, runtime::NDArray cols); + +} // namespace impl +} // namespace aten +} // namespace dgl diff --git a/src/array/cuda/spmm.cu b/src/array/cuda/spmm.cu index a91927896277..e7d4ca952f89 100644 --- a/src/array/cuda/spmm.cu +++ b/src/array/cuda/spmm.cu @@ -114,11 +114,11 @@ template void SpMMCsr( const CSRMatrix& csr, NDArray ufeat, NDArray efeat, NDArray out, std::vector out_aux); #if BF16_ENABLED -template void SpMMCsr( +template void SpMMCsr( const std::string& op, const std::string& reduce, const BcastOff& bcast, const CSRMatrix& csr, NDArray ufeat, NDArray efeat, NDArray out, std::vector out_aux); -template void SpMMCsr( +template void SpMMCsr( const std::string& op, const std::string& reduce, const BcastOff& bcast, const CSRMatrix& csr, NDArray ufeat, NDArray efeat, NDArray out, std::vector out_aux); @@ -149,11 +149,11 @@ template void SpMMCoo( const COOMatrix& coo, NDArray ufeat, NDArray efeat, NDArray out, std::vector out_aux); #if BF16_ENABLED -template void SpMMCoo( +template void SpMMCoo( const std::string& op, const std::string& reduce, const BcastOff& bcast, const COOMatrix& coo, NDArray ufeat, NDArray efeat, NDArray out, std::vector out_aux); -template void SpMMCoo( +template void SpMMCoo( const std::string& op, const std::string& reduce, const BcastOff& bcast, const COOMatrix& coo, NDArray ufeat, NDArray efeat, NDArray out, std::vector out_aux); diff --git a/src/array/cuda/spmm.cu.prehip b/src/array/cuda/spmm.cu.prehip new file mode 100644 index 000000000000..a91927896277 --- /dev/null +++ b/src/array/cuda/spmm.cu.prehip @@ -0,0 +1,179 @@ +/** + * Copyright (c) 2020 by Contributors + * @file array/cuda/spmm.cu + * @brief SPMM C APIs and definitions. + */ +#include + +#include + +#include "../../runtime/cuda/cuda_common.h" +#include "./functor.cuh" +#include "./ge_spmm.cuh" +#include "./spmm.cuh" + +namespace dgl { + +using namespace cuda; + +namespace aten { + +/** + * @brief CUDA implementation of g-SpMM on Csr format. + * @note use cusparse if the reduce operator is `sum` and there is + * no broadcast, use dgl's kernel in other cases. 
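+ * (Concretely: `sum` with `copy_lhs`, or `sum` with `mul` on a scalar edge
+ * feature, is routed to cuSPARSE's SpMM whenever cusparse_available()
+ * permits it; every other op/reduce combination, including `max` and `min`,
+ * uses DGL's own SpMMCsr kernel.)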
+ */ +template +void SpMMCsr( + const std::string& op, const std::string& reduce, const BcastOff& bcast, + const CSRMatrix& csr, NDArray ufeat, NDArray efeat, NDArray out, + std::vector out_aux) { + bool is_scalar_efeat = efeat.NumElements() == csr.indices->shape[0]; + bool use_efeat = op != "copy_lhs"; + bool use_deterministic_alg_only = false; + if (NULL != std::getenv("USE_DETERMINISTIC_ALG")) + use_deterministic_alg_only = true; + + if (reduce == "sum") { + bool more_nnz = (csr.indices->shape[0] > csr.num_rows * csr.num_cols); + if (op == "copy_lhs" && cusparse_available(more_nnz)) { + // cusparse + int64_t x_length = 1; + for (int i = 1; i < ufeat->ndim; ++i) x_length *= ufeat->shape[i]; + CusparseCsrmm2( + ufeat->ctx, csr, static_cast(ufeat->data), nullptr, + static_cast(out->data), x_length, use_deterministic_alg_only); + } else if ( + op == "mul" && is_scalar_efeat && + cusparse_available(more_nnz)) { + // cusparse + int64_t x_length = 1; + for (int i = 1; i < ufeat->ndim; ++i) x_length *= ufeat->shape[i]; + if (!IsNullArray(csr.data)) { + efeat = IndexSelect(efeat, csr.data); + } + CusparseCsrmm2( + ufeat->ctx, csr, static_cast(ufeat->data), + static_cast(efeat->data), static_cast(out->data), + x_length, use_deterministic_alg_only); + } else { // general kernel + SWITCH_OP(op, Op, { + cuda::SpMMCsr >( + bcast, csr, ufeat, efeat, out, NullArray(), NullArray()); + }); + } + } else if (reduce == "max") { + SWITCH_OP(op, Op, { + cuda::SpMMCsr >( + bcast, csr, ufeat, efeat, out, out_aux[0], out_aux[1]); + }); + } else if (reduce == "min") { + SWITCH_OP(op, Op, { + cuda::SpMMCsr >( + bcast, csr, ufeat, efeat, out, out_aux[0], out_aux[1]); + }); + } else { + LOG(FATAL) << "Not implemented"; + } +} + +/** + * @brief CUDA implementation of g-SpMM on Coo format. 
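+ * @note Unlike the CSR path above there is no cuSPARSE branch here: every
+ * op/reduce combination dispatches to DGL's own COO kernels (see spmm.cuh).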
+ */ +template +void SpMMCoo( + const std::string& op, const std::string& reduce, const BcastOff& bcast, + const COOMatrix& coo, NDArray ufeat, NDArray efeat, NDArray out, + std::vector out_aux) { + if (reduce == "sum") { + SWITCH_OP(op, Op, { + cuda::SpMMCoo >( + bcast, coo, ufeat, efeat, out, NullArray(), NullArray()); + }); + } else if (reduce == "max") { + SWITCH_OP(op, Op, { + cuda::SpMMCoo >( + bcast, coo, ufeat, efeat, out, out_aux[0], out_aux[1]); + }); + } else if (reduce == "min") { + SWITCH_OP(op, Op, { + cuda::SpMMCoo >( + bcast, coo, ufeat, efeat, out, out_aux[0], out_aux[1]); + }); + } else { + LOG(FATAL) << "Not implemented"; + } +} + +template void SpMMCsr( + const std::string& op, const std::string& reduce, const BcastOff& bcast, + const CSRMatrix& csr, NDArray ufeat, NDArray efeat, NDArray out, + std::vector out_aux); +template void SpMMCsr( + const std::string& op, const std::string& reduce, const BcastOff& bcast, + const CSRMatrix& csr, NDArray ufeat, NDArray efeat, NDArray out, + std::vector out_aux); +#if BF16_ENABLED +template void SpMMCsr( + const std::string& op, const std::string& reduce, const BcastOff& bcast, + const CSRMatrix& csr, NDArray ufeat, NDArray efeat, NDArray out, + std::vector out_aux); +template void SpMMCsr( + const std::string& op, const std::string& reduce, const BcastOff& bcast, + const CSRMatrix& csr, NDArray ufeat, NDArray efeat, NDArray out, + std::vector out_aux); +#endif // BF16_ENABLED +template void SpMMCsr( + const std::string& op, const std::string& reduce, const BcastOff& bcast, + const CSRMatrix& csr, NDArray ufeat, NDArray efeat, NDArray out, + std::vector out_aux); +template void SpMMCsr( + const std::string& op, const std::string& reduce, const BcastOff& bcast, + const CSRMatrix& csr, NDArray ufeat, NDArray efeat, NDArray out, + std::vector out_aux); +template void SpMMCsr( + const std::string& op, const std::string& reduce, const BcastOff& bcast, + const CSRMatrix& csr, NDArray ufeat, NDArray efeat, NDArray out, + std::vector out_aux); +template void SpMMCsr( + const std::string& op, const std::string& reduce, const BcastOff& bcast, + const CSRMatrix& csr, NDArray ufeat, NDArray efeat, NDArray out, + std::vector out_aux); + +template void SpMMCoo( + const std::string& op, const std::string& reduce, const BcastOff& bcast, + const COOMatrix& coo, NDArray ufeat, NDArray efeat, NDArray out, + std::vector out_aux); +template void SpMMCoo( + const std::string& op, const std::string& reduce, const BcastOff& bcast, + const COOMatrix& coo, NDArray ufeat, NDArray efeat, NDArray out, + std::vector out_aux); +#if BF16_ENABLED +template void SpMMCoo( + const std::string& op, const std::string& reduce, const BcastOff& bcast, + const COOMatrix& coo, NDArray ufeat, NDArray efeat, NDArray out, + std::vector out_aux); +template void SpMMCoo( + const std::string& op, const std::string& reduce, const BcastOff& bcast, + const COOMatrix& coo, NDArray ufeat, NDArray efeat, NDArray out, + std::vector out_aux); +#endif // BF16_ENABLED +template void SpMMCoo( + const std::string& op, const std::string& reduce, const BcastOff& bcast, + const COOMatrix& coo, NDArray ufeat, NDArray efeat, NDArray out, + std::vector out_aux); +template void SpMMCoo( + const std::string& op, const std::string& reduce, const BcastOff& bcast, + const COOMatrix& coo, NDArray ufeat, NDArray efeat, NDArray out, + std::vector out_aux); +template void SpMMCoo( + const std::string& op, const std::string& reduce, const BcastOff& bcast, + const COOMatrix& coo, NDArray ufeat, NDArray 
efeat, NDArray out, + std::vector out_aux); +template void SpMMCoo( + const std::string& op, const std::string& reduce, const BcastOff& bcast, + const COOMatrix& coo, NDArray ufeat, NDArray efeat, NDArray out, + std::vector out_aux); + +} // namespace aten +} // namespace dgl diff --git a/src/array/cuda/spmm.cuh b/src/array/cuda/spmm.cuh index 9ebed71cc134..3677648682cf 100644 --- a/src/array/cuda/spmm.cuh +++ b/src/array/cuda/spmm.cuh @@ -1,3 +1,4 @@ +#include "hip/hip_runtime.h" /** * Copyright (c) 2020 by Contributors * @file array/cuda/spmm.cuh @@ -35,7 +36,7 @@ inline bool cusparse_available(bool more_nnz_than_matrix_size) { return false; #else if (std::is_same::value || - std::is_same::value) + std::is_same::value) return false; // cusparse's SpMM on fp16 is slow, temporally disabled. // If the CSR matrix has more NNZ than matrix size, we should not use // cuSPARSE 11.1. @@ -47,54 +48,54 @@ namespace { /** @brief Call cuBLAS geam API for transpose operation for float and double. */ template -cublasStatus_t Xgeam( - cublasHandle_t handle, cublasOperation_t transa, cublasOperation_t transb, +hipblasStatus_t Xgeam( + hipblasHandle_t handle, hipblasOperation_t transa, hipblasOperation_t transb, int m, int n, const DType* alpha, const DType* A, int lda, const DType* beta, const DType* B, int ldb, DType* C, int ldc) { LOG(FATAL) << "Not supported dtype"; - return CUBLAS_STATUS_EXECUTION_FAILED; + return HIPBLAS_STATUS_EXECUTION_FAILED; } template <> -cublasStatus_t Xgeam<__half>( - cublasHandle_t handle, cublasOperation_t transa, cublasOperation_t transb, +hipblasStatus_t Xgeam<__half>( + hipblasHandle_t handle, hipblasOperation_t transa, hipblasOperation_t transb, int m, int n, const __half* alpha, const __half* A, int lda, const __half* beta, const __half* B, int ldb, __half* C, int ldc) { // TODO(ndickson): There is no cublasHgeam, so a different // implementation would be required. LOG(FATAL) << "Xgeam does not support dtype half (FP16)"; - return CUBLAS_STATUS_EXECUTION_FAILED; + return HIPBLAS_STATUS_EXECUTION_FAILED; } #if BF16_ENABLED template <> -cublasStatus_t Xgeam<__nv_bfloat16>( - cublasHandle_t handle, cublasOperation_t transa, cublasOperation_t transb, - int m, int n, const __nv_bfloat16* alpha, const __nv_bfloat16* A, int lda, - const __nv_bfloat16* beta, const __nv_bfloat16* B, int ldb, - __nv_bfloat16* C, int ldc) { +hipblasStatus_t Xgeam<__hip_bfloat16>( + hipblasHandle_t handle, hipblasOperation_t transa, hipblasOperation_t transb, + int m, int n, const __hip_bfloat16* alpha, const __hip_bfloat16* A, int lda, + const __hip_bfloat16* beta, const __hip_bfloat16* B, int ldb, + __hip_bfloat16* C, int ldc) { // TODO(ndickson): There is no cublasHgeam, so a different // implementation would be required. 
LOG(FATAL) << "Xgeam does not support dtype bfloat16 (BF16)"; - return CUBLAS_STATUS_EXECUTION_FAILED; + return HIPBLAS_STATUS_EXECUTION_FAILED; } #endif // BF16_ENABLED template <> -cublasStatus_t Xgeam( - cublasHandle_t handle, cublasOperation_t transa, cublasOperation_t transb, +hipblasStatus_t Xgeam( + hipblasHandle_t handle, hipblasOperation_t transa, hipblasOperation_t transb, int m, int n, const float* alpha, const float* A, int lda, const float* beta, const float* B, int ldb, float* C, int ldc) { - return cublasSgeam( + return hipblasSgeam( handle, transa, transb, m, n, alpha, A, lda, beta, B, ldb, C, ldc); } template <> -cublasStatus_t Xgeam( - cublasHandle_t handle, cublasOperation_t transa, cublasOperation_t transb, +hipblasStatus_t Xgeam( + hipblasHandle_t handle, hipblasOperation_t transa, hipblasOperation_t transb, int m, int n, const double* alpha, const double* A, int lda, const double* beta, const double* B, int ldb, double* C, int ldc) { - return cublasDgeam( + return hipblasDgeam( handle, transa, transb, m, n, alpha, A, lda, beta, B, ldb, C, ldc); } @@ -119,12 +120,12 @@ template void _Transpose(const DType* in, DType* out, int row, int col) { DType alpha = 1., beta = 0.; auto* thr_entry = runtime::CUDAThreadEntry::ThreadLocal(); - cudaStream_t stream = runtime::getCurrentCUDAStream(); + hipStream_t stream = runtime::getCurrentCUDAStream(); if (!thr_entry->cublas_handle) - CUBLAS_CALL(cublasCreate(&(thr_entry->cublas_handle))); - CUBLAS_CALL(cublasSetStream(thr_entry->cublas_handle, stream)); + CUBLAS_CALL(hipblasCreate(&(thr_entry->cublas_handle))); + CUBLAS_CALL(hipblasSetStream(thr_entry->cublas_handle, stream)); CUBLAS_CALL(Xgeam( - thr_entry->cublas_handle, CUBLAS_OP_T, CUBLAS_OP_N, row, col, &alpha, in, + thr_entry->cublas_handle, HIPBLAS_OP_T, HIPBLAS_OP_N, row, col, &alpha, in, col, &beta, nullptr, row, out, row)); } @@ -134,7 +135,7 @@ void _Transpose(const DType* in, DType* out, int row, int col) { */ template <> void _Transpose<__half>(const __half* in, __half* out, int row, int col) { - cudaStream_t stream = runtime::getCurrentCUDAStream(); + hipStream_t stream = runtime::getCurrentCUDAStream(); int nt = FindNumThreads(row); int nb = col; CUDA_KERNEL_CALL(_TransposeKernel, nb, nt, 0, stream, in, out, col, row); @@ -146,9 +147,9 @@ void _Transpose<__half>(const __half* in, __half* out, int row, int col) { * @note cuBLAS has no geam API for bf16 data type, fallback to our kernel. 
*/ template <> -void _Transpose<__nv_bfloat16>( - const __nv_bfloat16* in, __nv_bfloat16* out, int row, int col) { - cudaStream_t stream = runtime::getCurrentCUDAStream(); +void _Transpose<__hip_bfloat16>( + const __hip_bfloat16* in, __hip_bfloat16* out, int row, int col) { + hipStream_t stream = runtime::getCurrentCUDAStream(); int nt = FindNumThreads(row); int nb = col; CUDA_KERNEL_CALL(_TransposeKernel, nb, nt, 0, stream, in, out, col, row); @@ -157,36 +158,36 @@ void _Transpose<__nv_bfloat16>( #if CUDART_VERSION < 11000 template -cusparseStatus_t Xcsrmm2( - cusparseHandle_t handle, cusparseOperation_t transA, - cusparseOperation_t transB, int m, int n, int k, int nnz, - const DType* alpha, const cusparseMatDescr_t descrA, const DType* csrValA, +hipsparseStatus_t Xcsrmm2( + hipsparseHandle_t handle, hipsparseOperation_t transA, + hipsparseOperation_t transB, int m, int n, int k, int nnz, + const DType* alpha, const hipsparseMatDescr_t descrA, const DType* csrValA, const int* csrRowPtrA, const int* csrColIndA, const DType* B, int ldb, const DType* beta, DType* C, int ldc) { LOG(INFO) << "Not supported dtype"; - return CUSPARSE_STATUS_EXECUTION_FAILED; + return HIPSPARSE_STATUS_EXECUTION_FAILED; } template <> -cusparseStatus_t Xcsrmm2( - cusparseHandle_t handle, cusparseOperation_t transA, - cusparseOperation_t transB, int m, int n, int k, int nnz, - const float* alpha, const cusparseMatDescr_t descrA, const float* csrValA, +hipsparseStatus_t Xcsrmm2( + hipsparseHandle_t handle, hipsparseOperation_t transA, + hipsparseOperation_t transB, int m, int n, int k, int nnz, + const float* alpha, const hipsparseMatDescr_t descrA, const float* csrValA, const int* csrRowPtrA, const int* csrColIndA, const float* B, int ldb, const float* beta, float* C, int ldc) { - return cusparseScsrmm2( + return hipsparseScsrmm2( handle, transA, transB, m, n, k, nnz, alpha, descrA, csrValA, csrRowPtrA, csrColIndA, B, ldb, beta, C, ldc); } template <> -cusparseStatus_t Xcsrmm2( - cusparseHandle_t handle, cusparseOperation_t transA, - cusparseOperation_t transB, int m, int n, int k, int nnz, - const double* alpha, const cusparseMatDescr_t descrA, const double* csrValA, +hipsparseStatus_t Xcsrmm2( + hipsparseHandle_t handle, hipsparseOperation_t transA, + hipsparseOperation_t transB, int m, int n, int k, int nnz, + const double* alpha, const hipsparseMatDescr_t descrA, const double* csrValA, const int* csrRowPtrA, const int* csrColIndA, const double* B, int ldb, const double* beta, double* C, int ldc) { - return cusparseDcsrmm2( + return hipsparseDcsrmm2( handle, transA, transB, m, n, k, nnz, alpha, descrA, csrValA, csrRowPtrA, csrColIndA, B, ldb, beta, C, ldc); } @@ -214,12 +215,12 @@ void CusparseCsrmm2( // device auto device = runtime::DeviceAPI::Get(ctx); auto* thr_entry = runtime::CUDAThreadEntry::ThreadLocal(); - cudaStream_t stream = runtime::getCurrentCUDAStream(); + hipStream_t stream = runtime::getCurrentCUDAStream(); // allocate cusparse handle if needed if (!thr_entry->cusparse_handle) { - CUSPARSE_CALL(cusparseCreate(&(thr_entry->cusparse_handle))); + CUSPARSE_CALL(hipsparseCreate(&(thr_entry->cusparse_handle))); } - CUSPARSE_CALL(cusparseSetStream(thr_entry->cusparse_handle, stream)); + CUSPARSE_CALL(hipsparseSetStream(thr_entry->cusparse_handle, stream)); // all one data array DType* valptr = nullptr; if (!A_data) { @@ -228,54 +229,54 @@ void CusparseCsrmm2( _Fill(valptr, nnz, static_cast(1.)); } #if CUDART_VERSION >= 11000 - cusparseSpMatDescr_t matA; - cusparseDnMatDescr_t matB, matC; + 
hipsparseSpMatDescr_t matA; + hipsparseDnMatDescr_t matB, matC; constexpr auto dtype = cuda_dtype::value; constexpr auto idtype = cusparse_idtype::value; - CUSPARSE_CALL(cusparseCreateCsr( + CUSPARSE_CALL(hipsparseCreateCsr( &matA, m, k, nnz, static_cast(csr.indptr->data), static_cast(csr.indices->data), const_cast(valptr ? valptr : A_data), idtype, idtype, - CUSPARSE_INDEX_BASE_ZERO, dtype)); - CUSPARSE_CALL(cusparseCreateDnMat( - &matB, k, n, n, const_cast(B_data), dtype, CUSPARSE_ORDER_ROW)); + HIPSPARSE_INDEX_BASE_ZERO, dtype)); + CUSPARSE_CALL(hipsparseCreateDnMat( + &matB, k, n, n, const_cast(B_data), dtype, HIPSPARSE_ORDER_ROW)); CUSPARSE_CALL( - cusparseCreateDnMat(&matC, m, n, n, C_data, dtype, CUSPARSE_ORDER_ROW)); + hipsparseCreateDnMat(&matC, m, n, n, C_data, dtype, HIPSPARSE_ORDER_ROW)); - auto transA = CUSPARSE_OPERATION_NON_TRANSPOSE; - auto transB = CUSPARSE_OPERATION_NON_TRANSPOSE; + auto transA = HIPSPARSE_OPERATION_NON_TRANSPOSE; + auto transB = HIPSPARSE_OPERATION_NON_TRANSPOSE; size_t workspace_size; - cusparseSpMMAlg_t spmm_alg = use_deterministic_alg_only - ? CUSPARSE_SPMM_CSR_ALG3 - : CUSPARSE_SPMM_CSR_ALG2; - CUSPARSE_CALL(cusparseSpMM_bufferSize( + hipsparseSpMMAlg_t spmm_alg = use_deterministic_alg_only + ? HIPSPARSE_SPMM_CSR_ALG3 + : HIPSPARSE_SPMM_CSR_ALG2; + CUSPARSE_CALL(hipsparseSpMM_bufferSize( thr_entry->cusparse_handle, transA, transB, &alpha, matA, matB, &beta, matC, dtype, spmm_alg, &workspace_size)); void* workspace = device->AllocWorkspace(ctx, workspace_size); - CUSPARSE_CALL(cusparseSpMM( + CUSPARSE_CALL(hipsparseSpMM( thr_entry->cusparse_handle, transA, transB, &alpha, matA, matB, &beta, matC, dtype, spmm_alg, workspace)); device->FreeWorkspace(ctx, workspace); - CUSPARSE_CALL(cusparseDestroySpMat(matA)); - CUSPARSE_CALL(cusparseDestroyDnMat(matB)); - CUSPARSE_CALL(cusparseDestroyDnMat(matC)); + CUSPARSE_CALL(hipsparseDestroySpMat(matA)); + CUSPARSE_CALL(hipsparseDestroyDnMat(matB)); + CUSPARSE_CALL(hipsparseDestroyDnMat(matC)); #else // allocate matrix for temporary transposed output DType* trans_out = static_cast(device->AllocWorkspace(ctx, m * n * sizeof(DType))); - cusparseMatDescr_t descr; - CUSPARSE_CALL(cusparseCreateMatDescr(&descr)); - CUSPARSE_CALL(cusparseSetMatType(descr, CUSPARSE_MATRIX_TYPE_GENERAL)); - CUSPARSE_CALL(cusparseSetMatIndexBase(descr, CUSPARSE_INDEX_BASE_ZERO)); + hipsparseMatDescr_t descr; + CUSPARSE_CALL(hipsparseCreateMatDescr(&descr)); + CUSPARSE_CALL(hipsparseSetMatType(descr, HIPSPARSE_MATRIX_TYPE_GENERAL)); + CUSPARSE_CALL(hipsparseSetMatIndexBase(descr, HIPSPARSE_INDEX_BASE_ZERO)); CUSPARSE_CALL(Xcsrmm2( - thr_entry->cusparse_handle, CUSPARSE_OPERATION_NON_TRANSPOSE, - CUSPARSE_OPERATION_TRANSPOSE, m, n, k, nnz, &alpha, descr, + thr_entry->cusparse_handle, HIPSPARSE_OPERATION_NON_TRANSPOSE, + HIPSPARSE_OPERATION_TRANSPOSE, m, n, k, nnz, &alpha, descr, (valptr) ? 
valptr : A_data, static_cast(csr.indptr->data), static_cast(csr.indices->data), B_data, n, &beta, trans_out, m)); - CUSPARSE_CALL(cusparseDestroyMatDescr(descr)); + CUSPARSE_CALL(hipsparseDestroyMatDescr(descr)); // transpose the output matrix _Transpose(trans_out, C_data, n, m); device->FreeWorkspace(ctx, trans_out); @@ -287,7 +288,7 @@ void CusparseCsrmm2( template void CusparseCsrmm2Hetero( const DGLContext& ctx, const CSRMatrix& csr, const DType* B_data, - const DType* A_data, DType* C_data, int64_t x_length, cudaStream_t strm_id, + const DType* A_data, DType* C_data, int64_t x_length, hipStream_t strm_id, bool use_deterministic_alg_only = false) { // We use csrmm2 to perform following operation: // C = A x B, where A is a sparse matrix in csr format, B is the dense matrix @@ -311,9 +312,9 @@ void CusparseCsrmm2Hetero( auto* thr_entry = runtime::CUDAThreadEntry::ThreadLocal(); // allocate cusparse handle if needed if (!thr_entry->cusparse_handle) { - CUSPARSE_CALL(cusparseCreate(&(thr_entry->cusparse_handle))); + CUSPARSE_CALL(hipsparseCreate(&(thr_entry->cusparse_handle))); } - CUSPARSE_CALL(cusparseSetStream(thr_entry->cusparse_handle, strm_id)); + CUSPARSE_CALL(hipsparseSetStream(thr_entry->cusparse_handle, strm_id)); // all one data array DType* valptr = nullptr; if (!A_data) { @@ -322,50 +323,50 @@ void CusparseCsrmm2Hetero( _Fill(valptr, nnz, static_cast(1.)); } #if CUDART_VERSION >= 11000 - cusparseSpMatDescr_t matA; - cusparseDnMatDescr_t matB, matC; + hipsparseSpMatDescr_t matA; + hipsparseDnMatDescr_t matB, matC; constexpr auto dtype = cuda_dtype::value; constexpr auto idtype = cusparse_idtype::value; - CUSPARSE_CALL(cusparseCreateCsr( + CUSPARSE_CALL(hipsparseCreateCsr( &matA, m, k, nnz, static_cast(csr.indptr->data), static_cast(csr.indices->data), const_cast(valptr ? valptr : A_data), idtype, idtype, - CUSPARSE_INDEX_BASE_ZERO, dtype)); - CUSPARSE_CALL(cusparseCreateDnMat( - &matB, k, n, n, const_cast(B_data), dtype, CUSPARSE_ORDER_ROW)); + HIPSPARSE_INDEX_BASE_ZERO, dtype)); + CUSPARSE_CALL(hipsparseCreateDnMat( + &matB, k, n, n, const_cast(B_data), dtype, HIPSPARSE_ORDER_ROW)); CUSPARSE_CALL( - cusparseCreateDnMat(&matC, m, n, n, C_data, dtype, CUSPARSE_ORDER_ROW)); + hipsparseCreateDnMat(&matC, m, n, n, C_data, dtype, HIPSPARSE_ORDER_ROW)); - auto transA = CUSPARSE_OPERATION_NON_TRANSPOSE; - auto transB = CUSPARSE_OPERATION_NON_TRANSPOSE; + auto transA = HIPSPARSE_OPERATION_NON_TRANSPOSE; + auto transB = HIPSPARSE_OPERATION_NON_TRANSPOSE; size_t workspace_size; - cusparseSpMMAlg_t spmm_alg = use_deterministic_alg_only - ? CUSPARSE_SPMM_CSR_ALG3 - : CUSPARSE_SPMM_CSR_ALG2; - CUSPARSE_CALL(cusparseSpMM_bufferSize( + hipsparseSpMMAlg_t spmm_alg = use_deterministic_alg_only + ? 
HIPSPARSE_SPMM_CSR_ALG3 + : HIPSPARSE_SPMM_CSR_ALG2; + CUSPARSE_CALL(hipsparseSpMM_bufferSize( thr_entry->cusparse_handle, transA, transB, &alpha, matA, matB, &beta, matC, dtype, spmm_alg, &workspace_size)); void* workspace = device->AllocWorkspace(ctx, workspace_size); - CUSPARSE_CALL(cusparseSpMM( + CUSPARSE_CALL(hipsparseSpMM( thr_entry->cusparse_handle, transA, transB, &alpha, matA, matB, &beta, matC, dtype, spmm_alg, workspace)); device->FreeWorkspace(ctx, workspace); - CUSPARSE_CALL(cusparseDestroySpMat(matA)); - CUSPARSE_CALL(cusparseDestroyDnMat(matB)); - CUSPARSE_CALL(cusparseDestroyDnMat(matC)); + CUSPARSE_CALL(hipsparseDestroySpMat(matA)); + CUSPARSE_CALL(hipsparseDestroyDnMat(matB)); + CUSPARSE_CALL(hipsparseDestroyDnMat(matC)); #else - cusparseMatDescr_t descr; - CUSPARSE_CALL(cusparseCreateMatDescr(&descr)); - CUSPARSE_CALL(cusparseSetMatType(descr, CUSPARSE_MATRIX_TYPE_GENERAL)); - CUSPARSE_CALL(cusparseSetMatIndexBase(descr, CUSPARSE_INDEX_BASE_ZERO)); + hipsparseMatDescr_t descr; + CUSPARSE_CALL(hipsparseCreateMatDescr(&descr)); + CUSPARSE_CALL(hipsparseSetMatType(descr, HIPSPARSE_MATRIX_TYPE_GENERAL)); + CUSPARSE_CALL(hipsparseSetMatIndexBase(descr, HIPSPARSE_INDEX_BASE_ZERO)); CHECK_EQ(sizeof(IdType), sizeof(int32_t)); CUSPARSE_CALL(Xcsrmm2( - thr_entry->cusparse_handle, CUSPARSE_OPERATION_NON_TRANSPOSE, - CUSPARSE_OPERATION_TRANSPOSE, m, n, k, nnz, &alpha, descr, + thr_entry->cusparse_handle, HIPSPARSE_OPERATION_NON_TRANSPOSE, + HIPSPARSE_OPERATION_TRANSPOSE, m, n, k, nnz, &alpha, descr, (valptr) ? valptr : A_data, static_cast(csr.indptr->data), static_cast(csr.indices->data), B_data, n, &beta, C_data, m)); - CUSPARSE_CALL(cusparseDestroyMatDescr(descr)); + CUSPARSE_CALL(hipsparseDestroyMatDescr(descr)); #endif if (valptr) device->FreeWorkspace(ctx, valptr); } @@ -632,7 +633,7 @@ void SpMMCoo( */ #if BF16_ENABLED if (std::is_same::value || - std::is_same::value) + std::is_same::value) #else if (std::is_same::value) #endif // BF16_ENABLED @@ -645,7 +646,7 @@ void SpMMCoo( *efeat_data = efeat.Ptr(); DType* out_data = out.Ptr(); Idx *argu_data = argu.Ptr(), *arge_data = arge.Ptr(); - cudaStream_t stream = runtime::getCurrentCUDAStream(); + hipStream_t stream = runtime::getCurrentCUDAStream(); const int64_t N = coo.num_rows, M = coo.num_cols, E = coo.row->shape[0]; int64_t *ubcast_off = nullptr, *ebcast_off = nullptr; @@ -710,7 +711,7 @@ void SpMMCsr( Idx* argu_data = argu.Ptr(); Idx* arge_data = arge.Ptr(); - cudaStream_t stream = runtime::getCurrentCUDAStream(); + hipStream_t stream = runtime::getCurrentCUDAStream(); int64_t *ubcast_off = nullptr, *ebcast_off = nullptr; int64_t len = bcast.out_len, lhs_len = bcast.lhs_len, rhs_len = bcast.rhs_len; @@ -771,7 +772,7 @@ void SpMMCmpCsrHetero( Idx* argu_data = argu.Ptr(); Idx* arge_data = arge.Ptr(); - cudaStream_t stream = runtime::getCurrentCUDAStream(); + hipStream_t stream = runtime::getCurrentCUDAStream(); int64_t *ubcast_off = nullptr, *ebcast_off = nullptr; int64_t len = bcast.out_len, lhs_len = bcast.lhs_len, rhs_len = bcast.rhs_len; diff --git a/src/array/cuda/spmm.cuh.prehip b/src/array/cuda/spmm.cuh.prehip new file mode 100644 index 000000000000..9ebed71cc134 --- /dev/null +++ b/src/array/cuda/spmm.cuh.prehip @@ -0,0 +1,802 @@ +/** + * Copyright (c) 2020 by Contributors + * @file array/cuda/spmm.cuh + * @brief SPMM CUDA kernel function header. 
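+ *
+ * Declares the cuSPARSE csrmm2 wrappers (CusparseCsrmm2 and
+ * CusparseCsrmm2Hetero) together with DGL's own COO/CSR g-SpMM kernels that
+ * are used whenever cuSPARSE does not apply.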
+ */ +#ifndef DGL_ARRAY_CUDA_SPMM_CUH_ +#define DGL_ARRAY_CUDA_SPMM_CUH_ + +#include + +#include + +#include "../../runtime/cuda/cuda_common.h" +#include "./utils.h" +#include "atomic.cuh" +#include "bf16.cuh" +#include "fp16.cuh" +#include "macro.cuh" + +namespace dgl { + +using namespace cuda; + +namespace aten { + +/** + * @brief Determine whether cusparse SpMM function is applicable. + */ +template +inline bool cusparse_available(bool more_nnz_than_matrix_size) { +#if CUDART_VERSION < 11000 + if (std::is_same::value && + (std::is_same::value || std::is_same::value)) + return true; + return false; +#else + if (std::is_same::value || + std::is_same::value) + return false; // cusparse's SpMM on fp16 is slow, temporally disabled. + // If the CSR matrix has more NNZ than matrix size, we should not use + // cuSPARSE 11.1. + return !more_nnz_than_matrix_size; +#endif +} + +namespace { + +/** @brief Call cuBLAS geam API for transpose operation for float and double. */ +template +cublasStatus_t Xgeam( + cublasHandle_t handle, cublasOperation_t transa, cublasOperation_t transb, + int m, int n, const DType* alpha, const DType* A, int lda, + const DType* beta, const DType* B, int ldb, DType* C, int ldc) { + LOG(FATAL) << "Not supported dtype"; + return CUBLAS_STATUS_EXECUTION_FAILED; +} + +template <> +cublasStatus_t Xgeam<__half>( + cublasHandle_t handle, cublasOperation_t transa, cublasOperation_t transb, + int m, int n, const __half* alpha, const __half* A, int lda, + const __half* beta, const __half* B, int ldb, __half* C, int ldc) { + // TODO(ndickson): There is no cublasHgeam, so a different + // implementation would be required. + LOG(FATAL) << "Xgeam does not support dtype half (FP16)"; + return CUBLAS_STATUS_EXECUTION_FAILED; +} + +#if BF16_ENABLED +template <> +cublasStatus_t Xgeam<__nv_bfloat16>( + cublasHandle_t handle, cublasOperation_t transa, cublasOperation_t transb, + int m, int n, const __nv_bfloat16* alpha, const __nv_bfloat16* A, int lda, + const __nv_bfloat16* beta, const __nv_bfloat16* B, int ldb, + __nv_bfloat16* C, int ldc) { + // TODO(ndickson): There is no cublasHgeam, so a different + // implementation would be required. + LOG(FATAL) << "Xgeam does not support dtype bfloat16 (BF16)"; + return CUBLAS_STATUS_EXECUTION_FAILED; +} +#endif // BF16_ENABLED + +template <> +cublasStatus_t Xgeam( + cublasHandle_t handle, cublasOperation_t transa, cublasOperation_t transb, + int m, int n, const float* alpha, const float* A, int lda, + const float* beta, const float* B, int ldb, float* C, int ldc) { + return cublasSgeam( + handle, transa, transb, m, n, alpha, A, lda, beta, B, ldb, C, ldc); +} + +template <> +cublasStatus_t Xgeam( + cublasHandle_t handle, cublasOperation_t transa, cublasOperation_t transb, + int m, int n, const double* alpha, const double* A, int lda, + const double* beta, const double* B, int ldb, double* C, int ldc) { + return cublasDgeam( + handle, transa, transb, m, n, alpha, A, lda, beta, B, ldb, C, ldc); +} + +/** + * @brief Transpose operator kernel implementation. + * @note not efficient but it's not a bottleneck, used for float16 dtype. + */ +template +__global__ void _TransposeKernel( + const DType* __restrict__ in, DType* __restrict__ out, int n, int m) { + int i = blockIdx.x; + for (int j = threadIdx.x; j < m; j += blockDim.x) + out[i * m + j] = in[j * n + i]; +} + +/** + * @brief Tranpose the input matrix. + * @param row number of rows of input matrix. + * @param col number of columns of input matrix. 
+ */ +template +void _Transpose(const DType* in, DType* out, int row, int col) { + DType alpha = 1., beta = 0.; + auto* thr_entry = runtime::CUDAThreadEntry::ThreadLocal(); + cudaStream_t stream = runtime::getCurrentCUDAStream(); + if (!thr_entry->cublas_handle) + CUBLAS_CALL(cublasCreate(&(thr_entry->cublas_handle))); + CUBLAS_CALL(cublasSetStream(thr_entry->cublas_handle, stream)); + CUBLAS_CALL(Xgeam( + thr_entry->cublas_handle, CUBLAS_OP_T, CUBLAS_OP_N, row, col, &alpha, in, + col, &beta, nullptr, row, out, row)); +} + +/** + * @brief Tranpose the input matrix for data type half. + * @note cuBLAS has no geam API for half data type, fallback to our kernel. + */ +template <> +void _Transpose<__half>(const __half* in, __half* out, int row, int col) { + cudaStream_t stream = runtime::getCurrentCUDAStream(); + int nt = FindNumThreads(row); + int nb = col; + CUDA_KERNEL_CALL(_TransposeKernel, nb, nt, 0, stream, in, out, col, row); +} + +#if BF16_ENABLED +/** + * @brief Tranpose the input matrix for data type half. + * @note cuBLAS has no geam API for bf16 data type, fallback to our kernel. + */ +template <> +void _Transpose<__nv_bfloat16>( + const __nv_bfloat16* in, __nv_bfloat16* out, int row, int col) { + cudaStream_t stream = runtime::getCurrentCUDAStream(); + int nt = FindNumThreads(row); + int nb = col; + CUDA_KERNEL_CALL(_TransposeKernel, nb, nt, 0, stream, in, out, col, row); +} +#endif // BF16_ENABLED + +#if CUDART_VERSION < 11000 +template +cusparseStatus_t Xcsrmm2( + cusparseHandle_t handle, cusparseOperation_t transA, + cusparseOperation_t transB, int m, int n, int k, int nnz, + const DType* alpha, const cusparseMatDescr_t descrA, const DType* csrValA, + const int* csrRowPtrA, const int* csrColIndA, const DType* B, int ldb, + const DType* beta, DType* C, int ldc) { + LOG(INFO) << "Not supported dtype"; + return CUSPARSE_STATUS_EXECUTION_FAILED; +} + +template <> +cusparseStatus_t Xcsrmm2( + cusparseHandle_t handle, cusparseOperation_t transA, + cusparseOperation_t transB, int m, int n, int k, int nnz, + const float* alpha, const cusparseMatDescr_t descrA, const float* csrValA, + const int* csrRowPtrA, const int* csrColIndA, const float* B, int ldb, + const float* beta, float* C, int ldc) { + return cusparseScsrmm2( + handle, transA, transB, m, n, k, nnz, alpha, descrA, csrValA, csrRowPtrA, + csrColIndA, B, ldb, beta, C, ldc); +} + +template <> +cusparseStatus_t Xcsrmm2( + cusparseHandle_t handle, cusparseOperation_t transA, + cusparseOperation_t transB, int m, int n, int k, int nnz, + const double* alpha, const cusparseMatDescr_t descrA, const double* csrValA, + const int* csrRowPtrA, const int* csrColIndA, const double* B, int ldb, + const double* beta, double* C, int ldc) { + return cusparseDcsrmm2( + handle, transA, transB, m, n, k, nnz, alpha, descrA, csrValA, csrRowPtrA, + csrColIndA, B, ldb, beta, C, ldc); +} +#endif + +/** Cusparse implementation of SpMM on Csr format. */ +template +void CusparseCsrmm2( + const DGLContext& ctx, const CSRMatrix& csr, const DType* B_data, + const DType* A_data, DType* C_data, int x_length, + bool use_deterministic_alg_only = false) { + // We use csrmm2 to perform following operation: + // C = A x B, where A is a sparse matrix in csr format, B is the dense matrix + // for node feature tensor. However, since cusparse only supports + // column-major, while our tensor is stored in row-major, the actual + // computation is: C = trans(A x trans(B)). 
Currently, we use cublasXgeam to + // implement transposition and allocate intermediate workspace memory for + // this. + const int m = csr.num_rows; + const int n = x_length; + const int k = csr.num_cols; + const int nnz = csr.indices->shape[0]; + const DType alpha = 1.0; + const DType beta = 0.0; + // device + auto device = runtime::DeviceAPI::Get(ctx); + auto* thr_entry = runtime::CUDAThreadEntry::ThreadLocal(); + cudaStream_t stream = runtime::getCurrentCUDAStream(); + // allocate cusparse handle if needed + if (!thr_entry->cusparse_handle) { + CUSPARSE_CALL(cusparseCreate(&(thr_entry->cusparse_handle))); + } + CUSPARSE_CALL(cusparseSetStream(thr_entry->cusparse_handle, stream)); + // all one data array + DType* valptr = nullptr; + if (!A_data) { + valptr = + static_cast(device->AllocWorkspace(ctx, nnz * sizeof(DType))); + _Fill(valptr, nnz, static_cast(1.)); + } +#if CUDART_VERSION >= 11000 + cusparseSpMatDescr_t matA; + cusparseDnMatDescr_t matB, matC; + constexpr auto dtype = cuda_dtype::value; + constexpr auto idtype = cusparse_idtype::value; + CUSPARSE_CALL(cusparseCreateCsr( + &matA, m, k, nnz, static_cast(csr.indptr->data), + static_cast(csr.indices->data), + const_cast(valptr ? valptr : A_data), idtype, idtype, + CUSPARSE_INDEX_BASE_ZERO, dtype)); + CUSPARSE_CALL(cusparseCreateDnMat( + &matB, k, n, n, const_cast(B_data), dtype, CUSPARSE_ORDER_ROW)); + CUSPARSE_CALL( + cusparseCreateDnMat(&matC, m, n, n, C_data, dtype, CUSPARSE_ORDER_ROW)); + + auto transA = CUSPARSE_OPERATION_NON_TRANSPOSE; + auto transB = CUSPARSE_OPERATION_NON_TRANSPOSE; + size_t workspace_size; + cusparseSpMMAlg_t spmm_alg = use_deterministic_alg_only + ? CUSPARSE_SPMM_CSR_ALG3 + : CUSPARSE_SPMM_CSR_ALG2; + CUSPARSE_CALL(cusparseSpMM_bufferSize( + thr_entry->cusparse_handle, transA, transB, &alpha, matA, matB, &beta, + matC, dtype, spmm_alg, &workspace_size)); + void* workspace = device->AllocWorkspace(ctx, workspace_size); + CUSPARSE_CALL(cusparseSpMM( + thr_entry->cusparse_handle, transA, transB, &alpha, matA, matB, &beta, + matC, dtype, spmm_alg, workspace)); + device->FreeWorkspace(ctx, workspace); + + CUSPARSE_CALL(cusparseDestroySpMat(matA)); + CUSPARSE_CALL(cusparseDestroyDnMat(matB)); + CUSPARSE_CALL(cusparseDestroyDnMat(matC)); +#else + // allocate matrix for temporary transposed output + DType* trans_out = + static_cast(device->AllocWorkspace(ctx, m * n * sizeof(DType))); + + cusparseMatDescr_t descr; + CUSPARSE_CALL(cusparseCreateMatDescr(&descr)); + CUSPARSE_CALL(cusparseSetMatType(descr, CUSPARSE_MATRIX_TYPE_GENERAL)); + CUSPARSE_CALL(cusparseSetMatIndexBase(descr, CUSPARSE_INDEX_BASE_ZERO)); + CUSPARSE_CALL(Xcsrmm2( + thr_entry->cusparse_handle, CUSPARSE_OPERATION_NON_TRANSPOSE, + CUSPARSE_OPERATION_TRANSPOSE, m, n, k, nnz, &alpha, descr, + (valptr) ? valptr : A_data, static_cast(csr.indptr->data), + static_cast(csr.indices->data), B_data, n, &beta, trans_out, + m)); + CUSPARSE_CALL(cusparseDestroyMatDescr(descr)); + // transpose the output matrix + _Transpose(trans_out, C_data, n, m); + device->FreeWorkspace(ctx, trans_out); +#endif + if (valptr) device->FreeWorkspace(ctx, valptr); +} + +/** Cusparse implementation of SpMM on Csr format. 
*/ +template +void CusparseCsrmm2Hetero( + const DGLContext& ctx, const CSRMatrix& csr, const DType* B_data, + const DType* A_data, DType* C_data, int64_t x_length, cudaStream_t strm_id, + bool use_deterministic_alg_only = false) { + // We use csrmm2 to perform following operation: + // C = A x B, where A is a sparse matrix in csr format, B is the dense matrix + // for node feature tensor. However, since cusparse only supports + // column-major, while our tensor is stored in row-major, the actual + // computation is: C = trans(A x trans(B)). Currently, we use cublasXgeam to + // implement transposition and allocate intermediate workspace memory for + // this. + int int_maxlimit = std::numeric_limits::max(); + CHECK_GE(int_maxlimit, (csr.num_rows)); + CHECK_GE(int_maxlimit, csr.num_cols); + CHECK_GE(int_maxlimit, csr.indices->shape[0]); + const int m = csr.num_rows; + const int n = x_length; + const int k = csr.num_cols; + const int nnz = csr.indices->shape[0]; + const DType alpha = 1.0; + const DType beta = 1.0; + // device + auto device = runtime::DeviceAPI::Get(ctx); + auto* thr_entry = runtime::CUDAThreadEntry::ThreadLocal(); + // allocate cusparse handle if needed + if (!thr_entry->cusparse_handle) { + CUSPARSE_CALL(cusparseCreate(&(thr_entry->cusparse_handle))); + } + CUSPARSE_CALL(cusparseSetStream(thr_entry->cusparse_handle, strm_id)); + // all one data array + DType* valptr = nullptr; + if (!A_data) { + valptr = + static_cast(device->AllocWorkspace(ctx, nnz * sizeof(DType))); + _Fill(valptr, nnz, static_cast(1.)); + } +#if CUDART_VERSION >= 11000 + cusparseSpMatDescr_t matA; + cusparseDnMatDescr_t matB, matC; + constexpr auto dtype = cuda_dtype::value; + constexpr auto idtype = cusparse_idtype::value; + CUSPARSE_CALL(cusparseCreateCsr( + &matA, m, k, nnz, static_cast(csr.indptr->data), + static_cast(csr.indices->data), + const_cast(valptr ? valptr : A_data), idtype, idtype, + CUSPARSE_INDEX_BASE_ZERO, dtype)); + CUSPARSE_CALL(cusparseCreateDnMat( + &matB, k, n, n, const_cast(B_data), dtype, CUSPARSE_ORDER_ROW)); + CUSPARSE_CALL( + cusparseCreateDnMat(&matC, m, n, n, C_data, dtype, CUSPARSE_ORDER_ROW)); + + auto transA = CUSPARSE_OPERATION_NON_TRANSPOSE; + auto transB = CUSPARSE_OPERATION_NON_TRANSPOSE; + size_t workspace_size; + cusparseSpMMAlg_t spmm_alg = use_deterministic_alg_only + ? CUSPARSE_SPMM_CSR_ALG3 + : CUSPARSE_SPMM_CSR_ALG2; + CUSPARSE_CALL(cusparseSpMM_bufferSize( + thr_entry->cusparse_handle, transA, transB, &alpha, matA, matB, &beta, + matC, dtype, spmm_alg, &workspace_size)); + void* workspace = device->AllocWorkspace(ctx, workspace_size); + CUSPARSE_CALL(cusparseSpMM( + thr_entry->cusparse_handle, transA, transB, &alpha, matA, matB, &beta, + matC, dtype, spmm_alg, workspace)); + device->FreeWorkspace(ctx, workspace); + + CUSPARSE_CALL(cusparseDestroySpMat(matA)); + CUSPARSE_CALL(cusparseDestroyDnMat(matB)); + CUSPARSE_CALL(cusparseDestroyDnMat(matC)); +#else + cusparseMatDescr_t descr; + CUSPARSE_CALL(cusparseCreateMatDescr(&descr)); + CUSPARSE_CALL(cusparseSetMatType(descr, CUSPARSE_MATRIX_TYPE_GENERAL)); + CUSPARSE_CALL(cusparseSetMatIndexBase(descr, CUSPARSE_INDEX_BASE_ZERO)); + CHECK_EQ(sizeof(IdType), sizeof(int32_t)); + CUSPARSE_CALL(Xcsrmm2( + thr_entry->cusparse_handle, CUSPARSE_OPERATION_NON_TRANSPOSE, + CUSPARSE_OPERATION_TRANSPOSE, m, n, k, nnz, &alpha, descr, + (valptr) ? 
valptr : A_data, static_cast(csr.indptr->data), + static_cast(csr.indices->data), B_data, n, &beta, C_data, m)); + CUSPARSE_CALL(cusparseDestroyMatDescr(descr)); +#endif + if (valptr) device->FreeWorkspace(ctx, valptr); +} + +} // namespace + +#define SWITCH_OP(op, Op, ...) \ + do { \ + if ((op) == "add") { \ + typedef cuda::binary::Add Op; \ + { __VA_ARGS__ } \ + } else if ((op) == "sub") { \ + typedef cuda::binary::Sub Op; \ + { __VA_ARGS__ } \ + } else if ((op) == "mul") { \ + typedef cuda::binary::Mul Op; \ + { __VA_ARGS__ } \ + } else if ((op) == "div") { \ + typedef cuda::binary::Div Op; \ + { __VA_ARGS__ } \ + } else if ((op) == "copy_lhs") { \ + typedef cuda::binary::CopyLhs Op; \ + { __VA_ARGS__ } \ + } else if ((op) == "copy_rhs") { \ + typedef cuda::binary::CopyRhs Op; \ + { __VA_ARGS__ } \ + } else { \ + LOG(FATAL) << "Unsupported SpMM binary operator: " << op; \ + } \ + } while (0) + +namespace cuda { + +/** + * @brief CUDA kernel of g-SpMM on Coo format. + * @note it uses edge parallel strategy, different threadblocks (on y-axis) + * is responsible for the computation on different edges. Threadblocks + * on the x-axis are responsible for the computation on different + * positions in feature dimension. To avoid possible data hazards, it uses + * atomic operators for reduction. + */ +template < + typename Idx, typename DType, typename BinaryOp, typename ReduceOp, + bool UseBcast = false, bool UseIdx = false> +__global__ void SpMMCooKernel( + const DType* __restrict__ ufeat, const DType* __restrict__ efeat, + DType* __restrict__ out, Idx* __restrict__ arg_u, Idx* __restrict__ arg_e, + const Idx* __restrict__ row, const Idx* __restrict__ col, + const Idx* __restrict__ edge_map, int64_t N, int64_t M, int64_t E, + const int64_t* __restrict__ ubcast_off, + const int64_t* __restrict__ ebcast_off, int64_t ufeat_len, + int64_t efeat_len, int64_t out_len) { + // SPMM with COO. + Idx ty = blockIdx.y * blockDim.y + threadIdx.y; + const Idx stride_y = blockDim.y * gridDim.y; + while (ty < E) { + const Idx src = _ldg(row + ty); + const Idx dst = _ldg(col + ty); + const Idx eid = UseIdx ? _ldg(edge_map + ty) : ty; + int64_t tx = blockIdx.x * blockDim.x + threadIdx.x; + const int64_t stride_x = blockDim.x * gridDim.x; + const DType* uoff = BinaryOp::use_lhs ? (ufeat + src * ufeat_len) : nullptr; + const DType* eoff = BinaryOp::use_rhs ? (efeat + eid * efeat_len) : nullptr; + DType* outoff = out + dst * out_len; + while (tx < out_len) { + const int64_t lhs_add = UseBcast ? ubcast_off[tx] : tx; + const int64_t rhs_add = UseBcast ? ebcast_off[tx] : tx; + DType val = BinaryOp::Call(uoff + lhs_add, eoff + rhs_add); + Idx* arguoff = nullptr; // arguoff is not used in SpMMCoo. + Idx* argeoff = nullptr; // argeoff is not used in SpMMCoo. + ReduceOp::Call(outoff + tx, arguoff, argeoff, val, src, eid); + tx += stride_x; + } + ty += stride_y; + } +} + +/** + * @brief CUDA kernel to compute argu and arge in g-SpMM on Coo format. + * @note it uses edge parallel strategy, different threadblocks (on y-axis) + * is responsible for the computation on different edges. Threadblocks + * on the x-axis are responsible for the computation on different + * positions in feature dimension. 
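+ * It runs after SpMMCooKernel has produced the reduced output: for each
+ * output position it records which source node (arg_u) and which edge
+ * (arg_e) attains the stored max/min value.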
+ */ +template < + typename Idx, typename DType, typename BinaryOp, typename ReduceOp, + bool UseBcast = false, bool UseIdx = false> +__global__ void ArgSpMMCooKernel( + const DType* __restrict__ ufeat, const DType* __restrict__ efeat, + DType* __restrict__ out, Idx* __restrict__ arg_u, Idx* __restrict__ arg_e, + const Idx* __restrict__ row, const Idx* __restrict__ col, + const Idx* __restrict__ edge_map, int64_t N, int64_t M, int64_t E, + const int64_t* __restrict__ ubcast_off, + const int64_t* __restrict__ ebcast_off, int64_t ufeat_len, + int64_t efeat_len, int64_t out_len) { + // SPMM with COO arg max/min. + Idx ty = blockIdx.y * blockDim.y + threadIdx.y; + const Idx stride_y = blockDim.y * gridDim.y; + while (ty < E) { + const Idx src = _ldg(row + ty); + const Idx dst = _ldg(col + ty); + const Idx eid = UseIdx ? _ldg(edge_map + ty) : ty; + int64_t tx = blockIdx.x * blockDim.x + threadIdx.x; + const int64_t stride_x = blockDim.x * gridDim.x; + const DType* uoff = BinaryOp::use_lhs ? (ufeat + src * ufeat_len) : nullptr; + const DType* eoff = BinaryOp::use_rhs ? (efeat + eid * efeat_len) : nullptr; + const DType* outoff = out + dst * out_len; + Idx* arguoff = BinaryOp::use_lhs ? (arg_u + dst * out_len) : nullptr; + Idx* argeoff = BinaryOp::use_rhs ? (arg_e + dst * out_len) : nullptr; + while (tx < out_len) { + int64_t lhs_add = UseBcast ? ubcast_off[tx] : tx; + int64_t rhs_add = UseBcast ? ebcast_off[tx] : tx; + DType val = BinaryOp::Call(uoff + lhs_add, eoff + rhs_add); + ReduceOp::CallArg(tx, arguoff, argeoff, val, outoff[tx], src, eid); + tx += stride_x; + } + ty += stride_y; + } +} + +/** + * @brief CUDA kernel of g-SpMM on Csr format. + * @note it uses node parallel strategy, different threadblocks (on y-axis) + * is responsible for the computation on different destination nodes. + * Threadblocks on the x-axis are responsible for the computation on + * different positions in feature dimension. + */ +template < + typename Idx, typename DType, typename BinaryOp, typename ReduceOp, + bool UseBcast = false, bool UseIdx = false> +__global__ void SpMMCsrKernel( + const DType* __restrict__ ufeat, const DType* __restrict__ efeat, + DType* __restrict__ out, Idx* __restrict__ arg_u, Idx* __restrict__ arg_e, + const Idx* __restrict__ indptr, const Idx* __restrict__ indices, + const Idx* __restrict__ edge_map, int64_t num_rows, int64_t num_cols, + const int64_t* __restrict__ ubcast_off, + const int64_t* __restrict__ ebcast_off, int64_t ufeat_len, + int64_t efeat_len, int64_t out_len) { + // SPMM with CSR. + int ty = blockIdx.x * blockDim.y + threadIdx.y; + const Idx stride_y = blockDim.y * gridDim.x; + const int stride_x = blockDim.x * gridDim.y; + while (ty < num_rows) { + int tx = blockIdx.y * blockDim.x + threadIdx.x; + while (tx < out_len) { + typename accum_dtype::type local_accum = ReduceOp::zero(); + Idx local_argu = 0, local_arge = 0; + const int lhs_add = UseBcast ? ubcast_off[tx] : tx; + const int rhs_add = UseBcast ? ebcast_off[tx] : tx; + for (Idx i = indptr[ty]; i < indptr[ty + 1]; ++i) { + const Idx eid = UseIdx ? _ldg(edge_map + i) : i; + const Idx cid = _ldg(indices + i); + const DType* uoff = + BinaryOp::use_lhs ? (ufeat + cid * ufeat_len) : nullptr; + const DType* eoff = + BinaryOp::use_rhs ? (efeat + eid * efeat_len) : nullptr; + DType out = BinaryOp::Call(uoff + lhs_add, eoff + rhs_add); + ReduceOp::Call(&local_accum, &local_argu, &local_arge, out, cid, eid); + } + // The use of += is to compute cross-type reducing on heterogeneous graph + // when reduce op is `sum`. 
+ // C = SpMM(SpA, B) + C + // Separate kernel `SpMMCmpCsrHeteroKernel` is used for max- and + // min-reducer. It does not affect the output on homogeneous graph as + // `out` is initialized to zero. + out[ty * out_len + tx] += static_cast(local_accum); + if (ReduceOp::require_arg && BinaryOp::use_lhs) + arg_u[ty * out_len + tx] = local_argu; + if (ReduceOp::require_arg && BinaryOp::use_rhs) + arg_e[ty * out_len + tx] = local_arge; + tx += stride_x; + } + ty += stride_y; + } +} + +/** + * @brief CUDA kernel of SpMM-Min/Max on Csr format. + * @note it uses node parallel strategy, different threadblocks (on y-axis) + * is responsible for the computation on different destination nodes. + * Threadblocks on the x-axis are responsible for the computation on + * different positions in feature dimension. + */ +template < + typename Idx, typename DType, typename BinaryOp, typename ReduceOp, + bool UseBcast = false, bool UseIdx = false> +__global__ void SpMMCmpCsrHeteroKernel( + const DType* __restrict__ ufeat, const DType* __restrict__ efeat, + DType* __restrict__ out, Idx* __restrict__ arg_u, Idx* __restrict__ arg_e, + Idx* __restrict__ arg_u_ntype, Idx* __restrict__ arg_e_etype, + const Idx* __restrict__ indptr, const Idx* __restrict__ indices, + const Idx* __restrict__ edge_map, int64_t num_rows, int64_t num_cols, + const int64_t* __restrict__ ubcast_off, + const int64_t* __restrict__ ebcast_off, int64_t ufeat_len, + int64_t efeat_len, int64_t out_len, const int src_type, const int etype) { + // SPMM with CSR. + int ty = blockIdx.y * blockDim.y + threadIdx.y; + const Idx stride_y = blockDim.y * gridDim.y; + const int stride_x = blockDim.x * gridDim.x; + while (ty < num_rows) { + int tx = blockIdx.x * blockDim.x + threadIdx.x; + while (tx < out_len) { + using accum_type = typename accum_dtype::type; + accum_type local_accum = + static_cast(out[ty * out_len + tx]); // ReduceOp::zero(); + Idx local_argu = 0, local_arge = 0; + const int lhs_add = UseBcast ? ubcast_off[tx] : tx; + const int rhs_add = UseBcast ? ebcast_off[tx] : tx; + for (Idx i = indptr[ty]; i < indptr[ty + 1]; ++i) { + const Idx eid = UseIdx ? _ldg(edge_map + i) : i; + const Idx cid = _ldg(indices + i); + const DType* uoff = + BinaryOp::use_lhs ? (ufeat + cid * ufeat_len) : nullptr; + const DType* eoff = + BinaryOp::use_rhs ? (efeat + eid * efeat_len) : nullptr; + DType tmp_out = BinaryOp::Call(uoff + lhs_add, eoff + rhs_add); + ReduceOp::Call( + &local_accum, &local_argu, &local_arge, tmp_out, cid, eid); + } + // Update output only when max/min values are different that original + // output + DType new_out = static_cast(local_accum); + if (out[ty * out_len + tx] != new_out) { + out[ty * out_len + tx] = new_out; + if (ReduceOp::require_arg && BinaryOp::use_lhs) { + arg_u[ty * out_len + tx] = local_argu; + arg_u_ntype[ty * out_len + tx] = src_type; + } + if (ReduceOp::require_arg && BinaryOp::use_rhs) { + arg_e[ty * out_len + tx] = local_arge; + arg_e_etype[ty * out_len + tx] = etype; + } + } + tx += stride_x; + } + ty += stride_y; + } +} + +/** + * @brief CUDA implementation of g-SpMM on Coo format. + * @param bcast Broadcast information. + * @param coo The Coo matrix. + * @param ufeat The feature on source nodes. + * @param efeat The feature on edges. + * @param out The result feature on destination nodes. + * @param argu Arg-Min/Max on source nodes, which refers the source node indices + * correspond to the minimum/maximum values of reduction result on + * destination nodes. 
It's useful in computing gradients of Min/Max + * reducer. + * @param arge Arg-Min/Max on edges. which refers the source node indices + * correspond to the minimum/maximum values of reduction result on + * destination nodes. It's useful in computing gradients of Min/Max + * reducer. + */ +template +void SpMMCoo( + const BcastOff& bcast, const COOMatrix& coo, NDArray ufeat, NDArray efeat, + NDArray out, NDArray argu, NDArray arge) { + /** + * TODO(Xin): Disable half precision for SpMMCoo due to the round-off error. + * We should use fp32 for the accumulation but it's hard to modify the + * current implementation. + */ +#if BF16_ENABLED + if (std::is_same::value || + std::is_same::value) +#else + if (std::is_same::value) +#endif // BF16_ENABLED + LOG(FATAL) << "SpMMCoo doesn't support half precision fow now. " + << "Please use SpMMCsr instead by allowing the graph " + << "materialize CSR/CSC formats."; + const Idx *row = coo.row.Ptr(), *col = coo.col.Ptr(), + *edge_map = coo.data.Ptr(); + const DType *ufeat_data = ufeat.Ptr(), + *efeat_data = efeat.Ptr(); + DType* out_data = out.Ptr(); + Idx *argu_data = argu.Ptr(), *arge_data = arge.Ptr(); + cudaStream_t stream = runtime::getCurrentCUDAStream(); + const int64_t N = coo.num_rows, M = coo.num_cols, E = coo.row->shape[0]; + + int64_t *ubcast_off = nullptr, *ebcast_off = nullptr; + int64_t len = bcast.out_len, lhs_len = bcast.lhs_len, rhs_len = bcast.rhs_len; + + int64_t out_size = out.NumElements(); + const int nt = FindNumThreads(out_size); + const int nb = (out_size + nt - 1) / nt; + CUDA_KERNEL_CALL( + _FillKernel, nb, nt, 0, stream, out_data, out_size, ReduceOp::zero()); + + const int ntx = FindNumThreads(len); + const int nty = CUDA_MAX_NUM_THREADS / ntx; + const int nbx = (len + ntx - 1) / ntx; + const int nby = FindNumBlocks<'y'>((E + nty - 1) / nty); + const dim3 nblks(nbx, nby); + const dim3 nthrs(ntx, nty); + const bool use_idx = !IsNullArray(coo.data); + + BCAST_IDX_CTX_SWITCH(bcast, use_idx, ufeat->ctx, ubcast_off, ebcast_off, { + CUDA_KERNEL_CALL( + (SpMMCooKernel), + nblks, nthrs, 0, stream, ufeat_data, efeat_data, out_data, argu_data, + arge_data, row, col, edge_map, N, M, E, ubcast_off, ebcast_off, lhs_len, + rhs_len, len); + if (ReduceOp::require_arg) { + CUDA_KERNEL_CALL( + (ArgSpMMCooKernel), + nblks, nthrs, 0, stream, ufeat_data, efeat_data, out_data, argu_data, + arge_data, row, col, edge_map, N, M, E, ubcast_off, ebcast_off, + lhs_len, rhs_len, len); + } + }); +} + +/** + * @brief CUDA implementation of g-SpMM on Csr format. + * @param bcast Broadcast information. + * @param csr The Csr matrix. + * @param ufeat The feature on source nodes. + * @param efeat The feature on edges. + * @param out The result feature on destination nodes. + * @param argu Arg-Min/Max on source nodes, which refers the source node indices + * correspond to the minimum/maximum values of reduction result on + * destination nodes. It's useful in computing gradients of Min/Max + * reducer. + * @param arge Arg-Min/Max on edges. which refers the source node indices + * correspond to the minimum/maximum values of reduction result on + * destination nodes. It's useful in computing gradients of Min/Max + * reducer. 
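SpMMCoo and SpMMCsr size their launches by packing feature positions and edges/rows into a 2D grid, capping thread and block counts at device limits. A hedged host-side sketch of that arithmetic, with kMaxThreads/kMaxBlocksY as illustrative caps standing in for CUDA_MAX_NUM_THREADS and the FindNumBlocks limits:

#include <algorithm>
#include <cstdint>
#include <cstdio>
#include <utility>

// Illustrative caps; the real values come from src/array/cuda/utils.h.
constexpr int kMaxThreads = 256;
constexpr int kMaxBlocksY = 65535;

// Largest power of two <= min(dim, kMaxThreads), mirroring FindNumThreads.
int FindNumThreadsSketch(int dim) {
  if (dim <= 0) return 1;
  int ret = kMaxThreads;
  while (ret > dim) ret >>= 1;
  return ret;
}

// Returns {nbx, nby}: feature positions on x, edges/rows on y.
std::pair<int, int> LaunchGrid(int64_t len, int64_t rows) {
  const int ntx = FindNumThreadsSketch(static_cast<int>(len));
  const int nty = kMaxThreads / ntx;
  const int nbx = static_cast<int>((len + ntx - 1) / ntx);
  const int nby = static_cast<int>(
      std::min<int64_t>((rows + nty - 1) / nty, kMaxBlocksY));
  return {nbx, nby};
}

int main() {
  auto grid = LaunchGrid(/*len=*/100, /*rows=*/1 << 20);
  std::printf("nbx=%d nby=%d\n", grid.first, grid.second);
  return 0;
}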
+ */ +template +void SpMMCsr( + const BcastOff& bcast, const CSRMatrix& csr, NDArray ufeat, NDArray efeat, + NDArray out, NDArray argu, NDArray arge) { + const Idx* indptr = csr.indptr.Ptr(); + const Idx* indices = csr.indices.Ptr(); + const Idx* edge_map = csr.data.Ptr(); + const DType* ufeat_data = ufeat.Ptr(); + const DType* efeat_data = efeat.Ptr(); + DType* out_data = out.Ptr(); + Idx* argu_data = argu.Ptr(); + Idx* arge_data = arge.Ptr(); + + cudaStream_t stream = runtime::getCurrentCUDAStream(); + + int64_t *ubcast_off = nullptr, *ebcast_off = nullptr; + int64_t len = bcast.out_len, lhs_len = bcast.lhs_len, rhs_len = bcast.rhs_len; + const int ntx = FindNumThreads(len); + const int nty = CUDA_MAX_NUM_THREADS / ntx; + const int nby = (len + ntx - 1) / ntx; + const int nbx = FindNumBlocks<'x'>((csr.num_rows + nty - 1) / nty); + const dim3 nblks(nbx, nby); + const dim3 nthrs(ntx, nty); + const bool use_idx = !IsNullArray(csr.data); + + BCAST_IDX_CTX_SWITCH( + bcast, use_idx, ufeat->ctx, ubcast_off, ebcast_off, + {CUDA_KERNEL_CALL( + (SpMMCsrKernel), + nblks, nthrs, 0, stream, ufeat_data, efeat_data, out_data, argu_data, + arge_data, indptr, indices, edge_map, csr.num_rows, csr.num_cols, + ubcast_off, ebcast_off, lhs_len, rhs_len, len)}); +} + +/** + * @brief CUDA kernel of SpMM-Min/Max on Csr format on heterogeneous graph. + * @param bcast Broadcast information. + * @param csr The Csr matrix. + * @param ufeat The feature on source nodes. + * @param efeat The feature on edges. + * @param out The result feature on destination nodes. + * @param argu Arg-Min/Max on source nodes, which refers the source node indices + * correspond to the minimum/maximum values of reduction result on + * destination nodes. It's useful in computing gradients of Min/Max + * reducer. + * @param arge Arg-Min/Max on edges. which refers the source node indices + * correspond to the minimum/maximum values of reduction result on + * destination nodes. It's useful in computing gradients of Min/Max + * reducer. + * @param argu_ntype Node type of the arg-Min/Max on source nodes, which refers + * the source node types correspond to the minimum/maximum values of reduction + * result on destination nodes. It's useful in computing gradients of Min/Max + * reducer. + * @param arge_etype Edge-type of the arg-Min/Max on edges. which refers the + * source node indices correspond to the minimum/maximum values of reduction + * result on destination nodes. It's useful in computing gradients of Min/Max + * reducer. 
+ * @param src_type Node type of the source nodes of an etype + * @param etype Edge type + */ +template +void SpMMCmpCsrHetero( + const BcastOff& bcast, const CSRMatrix& csr, NDArray ufeat, NDArray efeat, + NDArray out, NDArray argu, NDArray arge, NDArray argu_ntype, + NDArray arge_etype, const int src_type, const int etype) { + const Idx* indptr = csr.indptr.Ptr(); + const Idx* indices = csr.indices.Ptr(); + const Idx* edge_map = csr.data.Ptr(); + const DType* ufeat_data = ufeat.Ptr(); + const DType* efeat_data = efeat.Ptr(); + DType* out_data = out.Ptr(); + Idx* argu_data = argu.Ptr(); + Idx* arge_data = arge.Ptr(); + + cudaStream_t stream = runtime::getCurrentCUDAStream(); + + int64_t *ubcast_off = nullptr, *ebcast_off = nullptr; + int64_t len = bcast.out_len, lhs_len = bcast.lhs_len, rhs_len = bcast.rhs_len; + const int ntx = FindNumThreads(len); + const int nty = CUDA_MAX_NUM_THREADS / ntx; + const int nbx = (len + ntx - 1) / ntx; + const int nby = FindNumBlocks<'y'>((csr.num_rows + nty - 1) / nty); + const dim3 nblks(nbx, nby); + const dim3 nthrs(ntx, nty); + const bool use_idx = !IsNullArray(csr.data); + + BCAST_IDX_CTX_SWITCH( + bcast, use_idx, ufeat->ctx, ubcast_off, ebcast_off, + {CUDA_KERNEL_CALL( + (SpMMCmpCsrHeteroKernel< + Idx, DType, BinaryOp, ReduceOp, UseBcast, UseIdx>), + nblks, nthrs, 0, stream, ufeat_data, efeat_data, out_data, argu_data, + arge_data, static_cast(argu_ntype->data), + static_cast(arge_etype->data), indptr, indices, edge_map, + csr.num_rows, csr.num_cols, ubcast_off, ebcast_off, lhs_len, rhs_len, + len, src_type, etype)}); +} + +} // namespace cuda +} // namespace aten +} // namespace dgl + +#endif // DGL_ARRAY_CUDA_SPMM_CUH_ diff --git a/src/array/cuda/spmm_hetero.cu b/src/array/cuda/spmm_hetero.cu index b5b258df2ef6..6bcd98a898be 100644 --- a/src/array/cuda/spmm_hetero.cu +++ b/src/array/cuda/spmm_hetero.cu @@ -55,7 +55,7 @@ void SpMMCsrHetero( if (m == 0) continue; DType* out = static_cast(device->AllocWorkspace( vec_csr[0].indptr->ctx, m * n * sizeof(DType))); - CUDA_CALL(cudaMemset(out, 0, m * n * sizeof(DType))); + CUDA_CALL(hipMemset(out, 0, m * n * sizeof(DType))); trans_out[ntype] = out; } } @@ -116,7 +116,7 @@ void SpMMCsrHetero( } } - cudaStream_t stream = runtime::getCurrentCUDAStream(); + hipStream_t stream = runtime::getCurrentCUDAStream(); for (dgl_type_t etype = 0; etype < ufeat_ntids.size(); ++etype) { const dgl_type_t src_id = ufeat_ntids[etype]; const dgl_type_t dst_id = out_ntids[etype]; @@ -214,14 +214,14 @@ template void SpMMCsrHetero( const std::vector& ufeat_ntids, const std::vector& out_ntids); #if BF16_ENABLED -template void SpMMCsrHetero( +template void SpMMCsrHetero( const std::string& op, const std::string& reduce, const BcastOff& bcast, const std::vector& csr, const std::vector& ufeat, const std::vector& efeat, std::vector* out, std::vector>* out_aux, const std::vector& ufeat_ntids, const std::vector& out_ntids); -template void SpMMCsrHetero( +template void SpMMCsrHetero( const std::string& op, const std::string& reduce, const BcastOff& bcast, const std::vector& csr, const std::vector& ufeat, const std::vector& efeat, std::vector* out, diff --git a/src/array/cuda/spmm_hetero.cu.prehip b/src/array/cuda/spmm_hetero.cu.prehip new file mode 100644 index 000000000000..b5b258df2ef6 --- /dev/null +++ b/src/array/cuda/spmm_hetero.cu.prehip @@ -0,0 +1,262 @@ +/** + * Copyright (c) 2020 by Contributors + * @file array/cuda/spmm.cu + * @brief SPMM C APIs and definitions. 
+ */ +#include + +#include + +#include "../../runtime/cuda/cuda_common.h" +#include "./functor.cuh" +#include "./ge_spmm.cuh" +#include "./spmm.cuh" + +namespace dgl { + +using namespace cuda; + +namespace aten { + +/** + * @brief CUDA implementation of g-SpMM on Csr format. + * @note use cusparse if the reduce operator is `sum` and there is + * no broadcast, use dgl's kernel in other cases. + */ +template +void SpMMCsrHetero( + const std::string& op, const std::string& reduce, const BcastOff& bcast, + const std::vector& vec_csr, + const std::vector& vec_ufeat, + const std::vector& vec_efeat, std::vector* vec_out, + std::vector>* out_aux, + const std::vector& ufeat_ntids, // ufeat node type id + const std::vector& out_ntids) { // output node type id + bool is_scalar_efeat = + vec_efeat[0].NumElements() == vec_csr[0].indices->shape[0]; + bool use_efeat = op != "copy_lhs"; + auto device = runtime::DeviceAPI::Get(vec_csr[0].indptr->ctx); + std::vector trans_out((*vec_out).size(), NULL); + bool use_deterministic_alg_only = false; + if (NULL != std::getenv("USE_DETERMINISTIC_ALG")) + use_deterministic_alg_only = true; + + bool use_legacy_cusparsemm = + (CUDART_VERSION < 11000) && (reduce == "sum") && + // legacy cuSPARSE does not care about NNZ, hence the argument "false". + ((op == "copy_lhs" && cusparse_available(false)) || + (op == "mul" && is_scalar_efeat && + cusparse_available(false))); + // Create temporary output buffer to store non-transposed output + if (use_legacy_cusparsemm) { + for (dgl_type_t ntype = 0; ntype < (*vec_out).size(); ++ntype) { + const int m = (*vec_out)[ntype]->shape[0]; + const int n = (*vec_out)[ntype]->shape[1]; + if (m == 0) continue; + DType* out = static_cast(device->AllocWorkspace( + vec_csr[0].indptr->ctx, m * n * sizeof(DType))); + CUDA_CALL(cudaMemset(out, 0, m * n * sizeof(DType))); + trans_out[ntype] = out; + } + } + // Check shape of ufeat for all relation type and compute feature size + int64_t x_length = 1; + for (dgl_type_t etype = 0; etype < (ufeat_ntids.size() - 1); ++etype) { + NDArray ufeat = vec_ufeat[ufeat_ntids[etype]]; + NDArray next_ufeat = vec_ufeat[ufeat_ntids[etype + 1]]; + CHECK_EQ(ufeat->ndim, next_ufeat->ndim) + << "Input features have different shapes"; + for (int i = 1; i < ufeat->ndim; ++i) { + if (ufeat->shape[i] != next_ufeat->shape[i]) { + if (ufeat->shape[i] == 1 || next_ufeat->shape[i] == 1) + LOG(FATAL) << "Homogenized message passing on heterogeneous graphs " + "does not support " + << "automatic broadcasting. Please manually broadcast it " + "before calling " + << "message passing functions."; + else + LOG(FATAL) << "Input features have different shapes."; + return; + } + + if (etype == 0) x_length *= ufeat->shape[i]; + } + } + // TODO(Israt): Can python do the following initializations while creating the + // tensors? 
+ if (reduce == "max" || reduce == "min") { + const int64_t dim = bcast.out_len; + std::vector updated((*vec_out).size(), false); + for (dgl_type_t etype = 0; etype < ufeat_ntids.size(); ++etype) { + DType* out_off = (*vec_out)[out_ntids[etype]].Ptr(); + if (reduce == "max") + _Fill( + out_off, vec_csr[etype].num_rows * dim, + cuda::reduce::Max::zero()); + else // min + _Fill( + out_off, vec_csr[etype].num_rows * dim, + cuda::reduce::Min::zero()); + const dgl_type_t dst_id = out_ntids[etype]; + if (!updated[dst_id]) { + updated[dst_id] = true; + if (op == "copy_lhs") { + IdType* argu_ntype = (*out_aux)[2][dst_id].Ptr(); + _Fill( + argu_ntype, vec_csr[etype].num_rows * dim, + static_cast(-1)); + } + if (op == "copy_rhs") { + IdType* arge_etype = (*out_aux)[3][dst_id].Ptr(); + _Fill( + arge_etype, vec_csr[etype].num_rows * dim, + static_cast(-1)); + } + } + } + } + + cudaStream_t stream = runtime::getCurrentCUDAStream(); + for (dgl_type_t etype = 0; etype < ufeat_ntids.size(); ++etype) { + const dgl_type_t src_id = ufeat_ntids[etype]; + const dgl_type_t dst_id = out_ntids[etype]; + CSRMatrix csr = vec_csr[etype]; + if (reduce == "sum") { + bool more_nnz = (csr.indices->shape[0] > csr.num_rows * csr.num_cols); + /* Call SpMM for each relation type */ + if (op == "copy_lhs" && + cusparse_available(more_nnz)) { // cusparse + /* If CUDA is less than 11.0, put the output in trans_out for later + * transposition */ + DType* out = (CUDART_VERSION < 11000) + ? trans_out[dst_id] + : static_cast((*vec_out)[dst_id]->data); + CusparseCsrmm2Hetero( + csr.indptr->ctx, csr, static_cast(vec_ufeat[src_id]->data), + nullptr, out, x_length, stream, use_deterministic_alg_only); + } else if ( + op == "mul" && is_scalar_efeat && + cusparse_available(more_nnz)) { // cusparse + NDArray efeat = vec_efeat[etype]; + if (!IsNullArray(csr.data)) efeat = IndexSelect(efeat, csr.data); + CusparseCsrmm2Hetero( + csr.indptr->ctx, csr, static_cast(vec_ufeat[src_id]->data), + static_cast(efeat->data), + // TODO(Israt): Change (*vec_out) to trans_out to support CUDA + // version < 11 + static_cast((*vec_out)[dst_id]->data), x_length, stream, + use_deterministic_alg_only); + } else { // general kernel + NDArray ufeat = + (vec_ufeat.size() == 0) ? NullArray() : vec_ufeat[src_id]; + NDArray efeat = + (vec_efeat.size() == 0) ? NullArray() : vec_efeat[etype]; + SWITCH_OP(op, Op, { + cuda::SpMMCsr>( + bcast, csr, ufeat, efeat, (*vec_out)[dst_id], NullArray(), + NullArray()); + }); + } + } else if (reduce == "max") { + SWITCH_OP(op, Op, { + NDArray ufeat = + (vec_ufeat.size() == 0) ? NullArray() : vec_ufeat[src_id]; + NDArray efeat = + (vec_efeat.size() == 0) ? NullArray() : vec_efeat[etype]; + cuda::SpMMCmpCsrHetero< + IdType, DType, Op, cuda::reduce::Max>( + bcast, csr, ufeat, efeat, (*vec_out)[dst_id], (*out_aux)[0][dst_id], + (*out_aux)[1][dst_id], (*out_aux)[2][dst_id], (*out_aux)[3][dst_id], + src_id, etype); + }); + } else if (reduce == "min") { + SWITCH_OP(op, Op, { + NDArray ufeat = + (vec_ufeat.size() == 0) ? NullArray() : vec_ufeat[src_id]; + NDArray efeat = + (vec_efeat.size() == 0) ? 
NullArray() : vec_efeat[etype]; + cuda::SpMMCmpCsrHetero< + IdType, DType, Op, cuda::reduce::Min>( + bcast, csr, ufeat, efeat, (*vec_out)[dst_id], (*out_aux)[0][dst_id], + (*out_aux)[1][dst_id], (*out_aux)[2][dst_id], (*out_aux)[3][dst_id], + src_id, etype); + }); + } else { + LOG(FATAL) << "Not implemented"; + } + } + + if (use_legacy_cusparsemm) { + // transpose output + for (dgl_type_t ntype = 0; ntype < (*vec_out).size(); ++ntype) { + const int m = (*vec_out)[ntype]->shape[0]; + const int n = (*vec_out)[ntype]->shape[1]; + if (m == 0) continue; + DType* C_data = static_cast((*vec_out)[ntype]->data); + _Transpose(trans_out[ntype], C_data, n, m); + device->FreeWorkspace(vec_csr[0].indptr->ctx, trans_out[ntype]); + } + } +} + +template void SpMMCsrHetero( + const std::string& op, const std::string& reduce, const BcastOff& bcast, + const std::vector& csr, const std::vector& ufeat, + const std::vector& efeat, std::vector* out, + std::vector>* out_aux, + const std::vector& ufeat_ntids, + const std::vector& out_ntids); +template void SpMMCsrHetero( + const std::string& op, const std::string& reduce, const BcastOff& bcast, + const std::vector& csr, const std::vector& ufeat, + const std::vector& efeat, std::vector* out, + std::vector>* out_aux, + const std::vector& ufeat_ntids, + const std::vector& out_ntids); +#if BF16_ENABLED +template void SpMMCsrHetero( + const std::string& op, const std::string& reduce, const BcastOff& bcast, + const std::vector& csr, const std::vector& ufeat, + const std::vector& efeat, std::vector* out, + std::vector>* out_aux, + const std::vector& ufeat_ntids, + const std::vector& out_ntids); +template void SpMMCsrHetero( + const std::string& op, const std::string& reduce, const BcastOff& bcast, + const std::vector& csr, const std::vector& ufeat, + const std::vector& efeat, std::vector* out, + std::vector>* out_aux, + const std::vector& ufeat_ntids, + const std::vector& out_ntids); +#endif // BF16_ENABLED +template void SpMMCsrHetero( + const std::string& op, const std::string& reduce, const BcastOff& bcast, + const std::vector& csr, const std::vector& ufeat, + const std::vector& efeat, std::vector* out, + std::vector>* out_aux, + const std::vector& ufeat_ntids, + const std::vector& out_ntids); +template void SpMMCsrHetero( + const std::string& op, const std::string& reduce, const BcastOff& bcast, + const std::vector& csr, const std::vector& ufeat, + const std::vector& efeat, std::vector* out, + std::vector>* out_aux, + const std::vector& ufeat_ntids, + const std::vector& out_ntids); +template void SpMMCsrHetero( + const std::string& op, const std::string& reduce, const BcastOff& bcast, + const std::vector& csr, const std::vector& ufeat, + const std::vector& efeat, std::vector* out, + std::vector>* out_aux, + const std::vector& ufeat_ntids, + const std::vector& out_ntids); +template void SpMMCsrHetero( + const std::string& op, const std::string& reduce, const BcastOff& bcast, + const std::vector& csr, const std::vector& ufeat, + const std::vector& efeat, std::vector* out, + std::vector>* out_aux, + const std::vector& ufeat_ntids, + const std::vector& out_ntids); + +} // namespace aten +} // namespace dgl diff --git a/src/array/cuda/utils.cu b/src/array/cuda/utils.cu index 1006e7a70732..48f42a9e081f 100644 --- a/src/array/cuda/utils.cu +++ b/src/array/cuda/utils.cu @@ -4,7 +4,7 @@ * @brief Utilities for CUDA kernels. 
*/ -#include +#include #include "../../runtime/cuda/cuda_common.h" #include "./utils.h" @@ -17,11 +17,11 @@ bool AllTrue(int8_t* flags, int64_t length, const DGLContext& ctx) { int8_t* rst = static_cast(device->AllocWorkspace(ctx, 1)); // Call CUB's reduction size_t workspace_size = 0; - cudaStream_t stream = runtime::getCurrentCUDAStream(); - CUDA_CALL(cub::DeviceReduce::Min( + hipStream_t stream = runtime::getCurrentCUDAStream(); + CUDA_CALL(hipcub::DeviceReduce::Min( nullptr, workspace_size, flags, rst, length, stream)); void* workspace = device->AllocWorkspace(ctx, workspace_size); - CUDA_CALL(cub::DeviceReduce::Min( + CUDA_CALL(hipcub::DeviceReduce::Min( workspace, workspace_size, flags, rst, length, stream)); int8_t cpu_rst = GetCUDAScalar(device, ctx, rst); device->FreeWorkspace(ctx, workspace); diff --git a/src/array/cuda/utils.cu.prehip b/src/array/cuda/utils.cu.prehip new file mode 100644 index 000000000000..1006e7a70732 --- /dev/null +++ b/src/array/cuda/utils.cu.prehip @@ -0,0 +1,33 @@ +/** + * Copyright (c) 2020 by Contributors + * @file array/cuda/utils.cu + * @brief Utilities for CUDA kernels. + */ + +#include + +#include "../../runtime/cuda/cuda_common.h" +#include "./utils.h" + +namespace dgl { +namespace cuda { + +bool AllTrue(int8_t* flags, int64_t length, const DGLContext& ctx) { + auto device = runtime::DeviceAPI::Get(ctx); + int8_t* rst = static_cast(device->AllocWorkspace(ctx, 1)); + // Call CUB's reduction + size_t workspace_size = 0; + cudaStream_t stream = runtime::getCurrentCUDAStream(); + CUDA_CALL(cub::DeviceReduce::Min( + nullptr, workspace_size, flags, rst, length, stream)); + void* workspace = device->AllocWorkspace(ctx, workspace_size); + CUDA_CALL(cub::DeviceReduce::Min( + workspace, workspace_size, flags, rst, length, stream)); + int8_t cpu_rst = GetCUDAScalar(device, ctx, rst); + device->FreeWorkspace(ctx, workspace); + device->FreeWorkspace(ctx, rst); + return cpu_rst == 1; +} + +} // namespace cuda +} // namespace dgl diff --git a/src/array/cuda/utils.h b/src/array/cuda/utils.h index 157bdb295150..502ccaddefc3 100644 --- a/src/array/cuda/utils.h +++ b/src/array/cuda/utils.h @@ -1,3 +1,4 @@ +#include "hip/hip_runtime.h" /** * Copyright (c) 2020 by Contributors * @file array/cuda/utils.h @@ -11,7 +12,7 @@ #include #include -#include +#include #include #include "../../runtime/cuda/cuda_common.h" @@ -126,7 +127,7 @@ __global__ void _FillKernel(DType* ptr, size_t length, DType val) { /** @brief Fill the vector started from ptr of size length with val */ template void _Fill(DType* ptr, size_t length, DType val) { - cudaStream_t stream = runtime::getCurrentCUDAStream(); + hipStream_t stream = runtime::getCurrentCUDAStream(); int nt = FindNumThreads(length); int nb = (length + nt - 1) / nt; // on x-axis, no need to worry about upperbound. @@ -185,8 +186,8 @@ template __global__ void _LinearSearchKernel( const IdType* indptr, const IdType* indices, const IdType* data, const IdType* row, const IdType* col, int64_t row_stride, - int64_t col_stride, int64_t length, const __nv_bfloat16* weights, - __nv_bfloat16 filler, __nv_bfloat16* out) { + int64_t col_stride, int64_t length, const __hip_bfloat16* weights, + __hip_bfloat16 filler, __hip_bfloat16* out) { int tx = blockIdx.x * blockDim.x + threadIdx.x; const int stride_x = gridDim.x * blockDim.x; while (tx < length) { @@ -204,7 +205,7 @@ __global__ void _LinearSearchKernel( } else { // If the result is saved in bf16, it should be fine to convert it to // float first - out[tx] = weights ? 
weights[v] : __nv_bfloat16(static_cast(v)); + out[tx] = weights ? weights[v] : __hip_bfloat16(static_cast(v)); } tx += stride_x; } @@ -277,12 +278,12 @@ template void MaskSelect( runtime::DeviceAPI* device, const DGLContext& ctx, const DType* input, const BoolType* mask, DType* output, int64_t n, int64_t* rst, - cudaStream_t stream) { + hipStream_t stream) { size_t workspace_size = 0; - CUDA_CALL(cub::DeviceSelect::Flagged( + CUDA_CALL(hipcub::DeviceSelect::Flagged( nullptr, workspace_size, input, mask, output, rst, n, stream)); void* workspace = device->AllocWorkspace(ctx, workspace_size); - CUDA_CALL(cub::DeviceSelect::Flagged( + CUDA_CALL(hipcub::DeviceSelect::Flagged( workspace, workspace_size, input, mask, output, rst, n, stream)); device->FreeWorkspace(ctx, workspace); } @@ -290,7 +291,7 @@ void MaskSelect( inline void* GetDevicePointer(runtime::NDArray array) { void* ptr = array->data; if (array.IsPinned()) { - CUDA_CALL(cudaHostGetDevicePointer(&ptr, ptr, 0)); + CUDA_CALL(hipHostGetDevicePointer(&ptr, ptr, 0)); } return ptr; } diff --git a/src/array/cuda/utils.h.prehip b/src/array/cuda/utils.h.prehip new file mode 100644 index 000000000000..157bdb295150 --- /dev/null +++ b/src/array/cuda/utils.h.prehip @@ -0,0 +1,301 @@ +/** + * Copyright (c) 2020 by Contributors + * @file array/cuda/utils.h + * @brief Utilities for CUDA kernels. + */ +#ifndef DGL_ARRAY_CUDA_UTILS_H_ +#define DGL_ARRAY_CUDA_UTILS_H_ + +#include +#include +#include +#include + +#include +#include + +#include "../../runtime/cuda/cuda_common.h" + +namespace dgl { +namespace cuda { + +#define CUDA_MAX_NUM_BLOCKS_X 0x7FFFFFFF +#define CUDA_MAX_NUM_BLOCKS_Y 0xFFFF +#define CUDA_MAX_NUM_BLOCKS_Z 0xFFFF +// The max number of threads per block +#define CUDA_MAX_NUM_THREADS 256 + +/** @brief Calculate the number of threads needed given the dimension length. + * + * It finds the biggest number that is smaller than min(dim, max_nthrs) + * and is also power of two. + */ +inline int FindNumThreads(int dim, int max_nthrs = CUDA_MAX_NUM_THREADS) { + CHECK_GE(dim, 0); + if (dim == 0) return 1; + int ret = max_nthrs; + while (ret > dim) { + ret = ret >> 1; + } + return ret; +} + +template +int _NumberOfBits(const T& range) { + if (range <= 1) { + // ranges of 0 or 1 require no bits to store + return 0; + } + + int bits = 1; + const auto urange = static_cast>(range); + while (bits < static_cast(sizeof(T) * 8) && (1ull << bits) < urange) { + ++bits; + } + + if (bits < static_cast(sizeof(T) * 8)) { + CHECK_EQ((range - 1) >> bits, 0); + } + CHECK_NE((range - 1) >> (bits - 1), 0); + + return bits; +} + +/** + * @brief Find number of blocks is smaller than nblks and max_nblks + * on the given axis ('x', 'y' or 'z'). + */ +template +inline int FindNumBlocks(int nblks, int max_nblks = -1) { + int default_max_nblks = -1; + switch (axis) { + case 'x': + default_max_nblks = CUDA_MAX_NUM_BLOCKS_X; + break; + case 'y': + default_max_nblks = CUDA_MAX_NUM_BLOCKS_Y; + break; + case 'z': + default_max_nblks = CUDA_MAX_NUM_BLOCKS_Z; + break; + default: + LOG(FATAL) << "Axis " << axis << " not recognized"; + break; + } + if (max_nblks == -1) max_nblks = default_max_nblks; + CHECK_NE(nblks, 0); + if (nblks < max_nblks) return nblks; + return max_nblks; +} + +template +__device__ __forceinline__ T _ldg(T* addr) { +#if __CUDA_ARCH__ >= 350 + return __ldg(addr); +#else + return *addr; +#endif +} + +/** + * @brief Return true if the given bool flag array is all true. + * The input bool array is in int8_t type so it is aligned with byte address. 
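AllTrue above follows the usual two-phase CUB/hipCUB pattern: a first call with a null workspace pointer only reports the required temporary-storage size, and a second call performs the reduction. A hedged sketch of that pattern with plain hipMalloc in place of DGL's AllocWorkspace (MinFlag is an illustrative name, and error checking is omitted):

#include <hip/hip_runtime.h>
#include <hipcub/hipcub.hpp>
#include <cstdint>

int8_t MinFlag(const int8_t* d_flags, int64_t length, hipStream_t stream) {
  int8_t* d_result = nullptr;
  hipMalloc(&d_result, sizeof(int8_t));

  // Pass 1: query workspace size (temporary storage pointer is null).
  size_t workspace_bytes = 0;
  hipcub::DeviceReduce::Min(nullptr, workspace_bytes, d_flags, d_result,
                            static_cast<int>(length), stream);

  // Pass 2: run the reduction with the allocated workspace.
  void* d_workspace = nullptr;
  hipMalloc(&d_workspace, workspace_bytes);
  hipcub::DeviceReduce::Min(d_workspace, workspace_bytes, d_flags, d_result,
                            static_cast<int>(length), stream);

  int8_t host_result = 0;
  hipMemcpyAsync(&host_result, d_result, sizeof(int8_t),
                 hipMemcpyDeviceToHost, stream);
  hipStreamSynchronize(stream);
  hipFree(d_workspace);
  hipFree(d_result);
  return host_result;  // 1 means every flag was set
}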
+ * + * @param flags The bool array. + * @param length The length. + * @param ctx Device context. + * @return True if all the flags are true. + */ +bool AllTrue(int8_t* flags, int64_t length, const DGLContext& ctx); + +/** + * @brief CUDA Kernel of filling the vector started from ptr of size length + * with val. + * @note internal use only. + */ +template +__global__ void _FillKernel(DType* ptr, size_t length, DType val) { + int tx = blockIdx.x * blockDim.x + threadIdx.x; + int stride_x = gridDim.x * blockDim.x; + while (tx < length) { + ptr[tx] = val; + tx += stride_x; + } +} + +/** @brief Fill the vector started from ptr of size length with val */ +template +void _Fill(DType* ptr, size_t length, DType val) { + cudaStream_t stream = runtime::getCurrentCUDAStream(); + int nt = FindNumThreads(length); + int nb = + (length + nt - 1) / nt; // on x-axis, no need to worry about upperbound. + CUDA_KERNEL_CALL(cuda::_FillKernel, nb, nt, 0, stream, ptr, length, val); +} + +/** + * @brief Search adjacency list linearly for each (row, col) pair and + * write the data under the matched position in the indices array to the output. + * + * If there is no match, the value in \c filler is written. + * If there are multiple matches, only the first match is written. + * If the given data array is null, write the matched position to the output. + */ +template +__global__ void _LinearSearchKernel( + const IdType* indptr, const IdType* indices, const IdType* data, + const IdType* row, const IdType* col, int64_t row_stride, + int64_t col_stride, int64_t length, const DType* weights, DType filler, + DType* out) { + int tx = blockIdx.x * blockDim.x + threadIdx.x; + const int stride_x = gridDim.x * blockDim.x; + while (tx < length) { + int rpos = tx * row_stride, cpos = tx * col_stride; + IdType v = -1; + const IdType r = row[rpos], c = col[cpos]; + for (IdType i = indptr[r]; i < indptr[r + 1]; ++i) { + if (indices[i] == c) { + v = data ? data[i] : i; + break; + } + } + if (v == -1) { + out[tx] = filler; + } else { + // The casts here are to be able to handle DType being __half. + // GCC treats int64_t as a distinct type from long long, so + // without the explcit cast to long long, it errors out saying + // that the implicit cast results in an ambiguous choice of + // constructor for __half. + // The using statement is to avoid a linter error about using + // long or long long. + using LongLong = long long; // NOLINT + out[tx] = weights ? weights[v] : DType(LongLong(v)); + } + tx += stride_x; + } +} + +#if BF16_ENABLED +/** + * @brief Specialization for bf16 because conversion from long long to bfloat16 + * doesn't exist before SM80. + */ +template +__global__ void _LinearSearchKernel( + const IdType* indptr, const IdType* indices, const IdType* data, + const IdType* row, const IdType* col, int64_t row_stride, + int64_t col_stride, int64_t length, const __nv_bfloat16* weights, + __nv_bfloat16 filler, __nv_bfloat16* out) { + int tx = blockIdx.x * blockDim.x + threadIdx.x; + const int stride_x = gridDim.x * blockDim.x; + while (tx < length) { + int rpos = tx * row_stride, cpos = tx * col_stride; + IdType v = -1; + const IdType r = row[rpos], c = col[cpos]; + for (IdType i = indptr[r]; i < indptr[r + 1]; ++i) { + if (indices[i] == c) { + v = data ? data[i] : i; + break; + } + } + if (v == -1) { + out[tx] = filler; + } else { + // If the result is saved in bf16, it should be fine to convert it to + // float first + out[tx] = weights ? 
weights[v] : __nv_bfloat16(static_cast(v)); + } + tx += stride_x; + } +} +#endif // BF16_ENABLED + +template +inline DType GetCUDAScalar( + runtime::DeviceAPI* device_api, DGLContext ctx, const DType* cuda_ptr) { + DType result; + device_api->CopyDataFromTo( + cuda_ptr, 0, &result, 0, sizeof(result), ctx, DGLContext{kDGLCPU, 0}, + DGLDataTypeTraits::dtype); + return result; +} + +/** + * @brief Given a sorted array and a value this function returns the index + * of the first element which compares greater than value. + * + * This function assumes 0-based index + * @param A: ascending sorted array + * @param n: size of the A + * @param x: value to search in A + * @return index, i, of the first element st. A[i]>x. If x>=A[n-1] returns n. + * if x +__device__ IdType _UpperBound(const IdType* A, int64_t n, IdType x) { + IdType l = 0, r = n, m = 0; + while (l < r) { + m = l + (r - l) / 2; + if (x >= A[m]) { + l = m + 1; + } else { + r = m; + } + } + return l; +} + +/** + * @brief Given a sorted array and a value this function returns the index + * of the element who is equal to val. If not exist returns n+1 + * + * This function assumes 0-based index + * @param A: ascending sorted array + * @param n: size of the A + * @param x: value to search in A + * @return index, i, st. A[i]==x. If such an index not exists returns 'n'. + */ +template +__device__ IdType _BinarySearch(const IdType* A, int64_t n, IdType x) { + IdType l = 0, r = n - 1, m = 0; + while (l <= r) { + m = l + (r - l) / 2; + if (A[m] == x) { + return m; + } + if (A[m] < x) { + l = m + 1; + } else { + r = m - 1; + } + } + return n; // not found +} + +template +void MaskSelect( + runtime::DeviceAPI* device, const DGLContext& ctx, const DType* input, + const BoolType* mask, DType* output, int64_t n, int64_t* rst, + cudaStream_t stream) { + size_t workspace_size = 0; + CUDA_CALL(cub::DeviceSelect::Flagged( + nullptr, workspace_size, input, mask, output, rst, n, stream)); + void* workspace = device->AllocWorkspace(ctx, workspace_size); + CUDA_CALL(cub::DeviceSelect::Flagged( + workspace, workspace_size, input, mask, output, rst, n, stream)); + device->FreeWorkspace(ctx, workspace); +} + +inline void* GetDevicePointer(runtime::NDArray array) { + void* ptr = array->data; + if (array.IsPinned()) { + CUDA_CALL(cudaHostGetDevicePointer(&ptr, ptr, 0)); + } + return ptr; +} + +} // namespace cuda +} // namespace dgl + +#endif // DGL_ARRAY_CUDA_UTILS_H_ diff --git a/src/array/cuda/uvm/array_index_select_uvm.cu b/src/array/cuda/uvm/array_index_select_uvm.cu index df067f259f24..7e8bb13fb370 100644 --- a/src/array/cuda/uvm/array_index_select_uvm.cu +++ b/src/array/cuda/uvm/array_index_select_uvm.cu @@ -17,7 +17,7 @@ namespace impl { template NDArray IndexSelectCPUFromGPU(NDArray array, IdArray index) { - cudaStream_t stream = runtime::getCurrentCUDAStream(); + hipStream_t stream = runtime::getCurrentCUDAStream(); const int64_t arr_len = array->shape[0]; const int64_t len = index->shape[0]; int64_t num_feat = 1; @@ -78,7 +78,7 @@ template NDArray IndexSelectCPUFromGPU(NDArray, IdArray); template void IndexScatterGPUToCPU(NDArray dest, IdArray index, NDArray source) { - cudaStream_t stream = runtime::getCurrentCUDAStream(); + hipStream_t stream = runtime::getCurrentCUDAStream(); const DType* source_data = static_cast(source->data); const IdType* idx_data = static_cast(index->data); const int64_t arr_len = dest->shape[0]; diff --git a/src/array/cuda/uvm/array_index_select_uvm.cu.prehip b/src/array/cuda/uvm/array_index_select_uvm.cu.prehip new file mode 
100644 index 000000000000..df067f259f24 --- /dev/null +++ b/src/array/cuda/uvm/array_index_select_uvm.cu.prehip @@ -0,0 +1,131 @@ +/** + * Copyright (c) 2019-2022 by Contributors + * @file array/cuda/uvm/array_index_select_uvm.cu + * @brief Array index select GPU implementation + */ +#include + +#include "../../../runtime/cuda/cuda_common.h" +#include "../array_index_select.cuh" +#include "../utils.h" +#include "./array_index_select_uvm.cuh" + +namespace dgl { +using runtime::NDArray; +namespace aten { +namespace impl { + +template +NDArray IndexSelectCPUFromGPU(NDArray array, IdArray index) { + cudaStream_t stream = runtime::getCurrentCUDAStream(); + const int64_t arr_len = array->shape[0]; + const int64_t len = index->shape[0]; + int64_t num_feat = 1; + std::vector shape{len}; + + CHECK(array.IsPinned()); + const DType* array_data = static_cast(cuda::GetDevicePointer(array)); + CHECK_EQ(index->ctx.device_type, kDGLCUDA); + + for (int d = 1; d < array->ndim; ++d) { + num_feat *= array->shape[d]; + shape.emplace_back(array->shape[d]); + } + + NDArray ret = NDArray::Empty(shape, array->dtype, index->ctx); + if (len == 0 || arr_len * num_feat == 0) return ret; + DType* ret_data = static_cast(ret->data); + + auto res = Sort(index, cuda::_NumberOfBits(arr_len)); + const IdType* idx_data = static_cast(res.first->data); + const int64_t* perm_data = static_cast(res.second->data); + + if (num_feat == 1) { + const int nt = cuda::FindNumThreads(len); + const int nb = (len + nt - 1) / nt; + CUDA_KERNEL_CALL( + IndexSelectSingleKernel, nb, nt, 0, stream, array_data, idx_data, len, + arr_len, ret_data, perm_data); + } else { + dim3 block(256, 1); + while (static_cast(block.x) >= 2 * num_feat) { + block.x /= 2; + block.y *= 2; + } + const dim3 grid((len + block.y - 1) / block.y); + if (num_feat * sizeof(DType) < 2 * CACHE_LINE_SIZE) { + CUDA_KERNEL_CALL( + IndexSelectMultiKernel, grid, block, 0, stream, array_data, num_feat, + idx_data, len, arr_len, ret_data, perm_data); + } else { + CUDA_KERNEL_CALL( + IndexSelectMultiKernelAligned, grid, block, 0, stream, array_data, + num_feat, idx_data, len, arr_len, ret_data, perm_data); + } + } + return ret; +} + +// floating point types are treated as their equal width integer types +template NDArray IndexSelectCPUFromGPU(NDArray, IdArray); +template NDArray IndexSelectCPUFromGPU(NDArray, IdArray); +template NDArray IndexSelectCPUFromGPU(NDArray, IdArray); +template NDArray IndexSelectCPUFromGPU(NDArray, IdArray); +template NDArray IndexSelectCPUFromGPU(NDArray, IdArray); +template NDArray IndexSelectCPUFromGPU(NDArray, IdArray); +template NDArray IndexSelectCPUFromGPU(NDArray, IdArray); +template NDArray IndexSelectCPUFromGPU(NDArray, IdArray); + +template +void IndexScatterGPUToCPU(NDArray dest, IdArray index, NDArray source) { + cudaStream_t stream = runtime::getCurrentCUDAStream(); + const DType* source_data = static_cast(source->data); + const IdType* idx_data = static_cast(index->data); + const int64_t arr_len = dest->shape[0]; + const int64_t len = index->shape[0]; + int64_t num_feat = 1; + std::vector shape{len}; + + CHECK(dest.IsPinned()); + DType* dest_data = static_cast(cuda::GetDevicePointer(dest)); + CHECK_EQ(index->ctx.device_type, kDGLCUDA); + CHECK_EQ(source->ctx.device_type, kDGLCUDA); + + for (int d = 1; d < source->ndim; ++d) { + num_feat *= source->shape[d]; + } + + if (len == 0) return; + + if (num_feat == 1) { + const int nt = cuda::FindNumThreads(len); + const int nb = (len + nt - 1) / nt; + CUDA_KERNEL_CALL( + IndexScatterSingleKernel, 
nb, nt, 0, stream, source_data, idx_data, len, + arr_len, dest_data); + } else { + dim3 block(256, 1); + while (static_cast(block.x) >= 2 * num_feat) { + block.x /= 2; + block.y *= 2; + } + const dim3 grid((len + block.y - 1) / block.y); + CUDA_KERNEL_CALL( + IndexScatterMultiKernel, grid, block, 0, stream, source_data, num_feat, + idx_data, len, arr_len, dest_data); + } +} + +// floating point types are treated as their equal width integer types +template void IndexScatterGPUToCPU(NDArray, IdArray, NDArray); +template void IndexScatterGPUToCPU(NDArray, IdArray, NDArray); +template void IndexScatterGPUToCPU(NDArray, IdArray, NDArray); +template void IndexScatterGPUToCPU(NDArray, IdArray, NDArray); +template void IndexScatterGPUToCPU(NDArray, IdArray, NDArray); +template void IndexScatterGPUToCPU(NDArray, IdArray, NDArray); +template void IndexScatterGPUToCPU(NDArray, IdArray, NDArray); +template void IndexScatterGPUToCPU(NDArray, IdArray, NDArray); + +} // namespace impl +} // namespace aten +} // namespace dgl diff --git a/src/array/cuda/uvm/array_index_select_uvm.cuh b/src/array/cuda/uvm/array_index_select_uvm.cuh index 5a7d222e55ce..1ab7f05ba1db 100644 --- a/src/array/cuda/uvm/array_index_select_uvm.cuh +++ b/src/array/cuda/uvm/array_index_select_uvm.cuh @@ -1,3 +1,4 @@ +#include "hip/hip_runtime.h" /** * Copyright (c) 2021 by Contributors * @file array/cpu/array_index_select_uvm.cuh diff --git a/src/array/cuda/uvm/array_index_select_uvm.cuh.prehip b/src/array/cuda/uvm/array_index_select_uvm.cuh.prehip new file mode 100644 index 000000000000..5a7d222e55ce --- /dev/null +++ b/src/array/cuda/uvm/array_index_select_uvm.cuh.prehip @@ -0,0 +1,52 @@ +/** + * Copyright (c) 2021 by Contributors + * @file array/cpu/array_index_select_uvm.cuh + * @brief Array index select GPU kernel implementation + */ + +#ifndef DGL_ARRAY_CUDA_UVM_ARRAY_INDEX_SELECT_UVM_CUH_ +#define DGL_ARRAY_CUDA_UVM_ARRAY_INDEX_SELECT_UVM_CUH_ + +#define CACHE_LINE_SIZE 128 + +namespace dgl { +namespace aten { +namespace impl { + +/** + * This is a cross-device access version of IndexSelectMultiKernel. + * Since the memory access over PCIe is more sensitive to the + * data access aligment (cacheline), we need a separate version here. + */ +template +__global__ void IndexSelectMultiKernelAligned( + const DType* const array, const int64_t num_feat, const IdType* const index, + const int64_t length, const int64_t arr_len, DType* const out, + const int64_t* perm = nullptr) { + int64_t out_row_index = blockIdx.x * blockDim.y + threadIdx.y; + + const int64_t stride = blockDim.y * gridDim.x; + + while (out_row_index < length) { + int64_t col = threadIdx.x; + const int64_t in_row = index[out_row_index]; + assert(in_row >= 0 && in_row < arr_len); + const int64_t idx_offset = + ((uint64_t)(&array[in_row * num_feat]) % CACHE_LINE_SIZE) / + sizeof(DType); + col = col - idx_offset; + const auto out_row = perm ? perm[out_row_index] : out_row_index; + while (col < num_feat) { + if (col >= 0) + out[out_row * num_feat + col] = array[in_row * num_feat + col]; + col += blockDim.x; + } + out_row_index += stride; + } +} + +} // namespace impl +} // namespace aten +} // namespace dgl + +#endif // DGL_ARRAY_CUDA_UVM_ARRAY_INDEX_SELECT_UVM_CUH_ diff --git a/src/array/filter.cc b/src/array/filter.cc index 658816d60199..01306e0d2458 100644 --- a/src/array/filter.cc +++ b/src/array/filter.cc @@ -24,7 +24,7 @@ DGL_REGISTER_GLOBAL("utils.filter._CAPI_DGLFilterCreateFromSet") auto ctx = array->ctx; // TODO(nv-dlasalle): Implement CPU version. 
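IndexSelectCPUFromGPU and IndexScatterGPUToCPU work because the host-side NDArray is pinned, so kernels can read or write it through a device alias of the host pointer. A hedged sketch of that zero-copy setup with raw HIP calls (PinAndMapHostBuffer is an illustrative helper, not DGL API; DGL goes through NDArray pinning and cuda::GetDevicePointer, and error checking is omitted here):

#include <hip/hip_runtime.h>
#include <cstddef>

// Register (pin) an existing host buffer and obtain a device-visible alias
// that kernels launched on the GPU may dereference directly over PCIe.
float* PinAndMapHostBuffer(float* host_ptr, size_t num_elems) {
  hipHostRegister(host_ptr, num_elems * sizeof(float), hipHostRegisterMapped);
  void* device_alias = nullptr;
  hipHostGetDevicePointer(&device_alias, host_ptr, /*flags=*/0);
  return static_cast<float*>(device_alias);
}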
if (ctx.device_type == kDGLCUDA) { -#ifdef DGL_USE_CUDA +#ifdef DGL_USE_ROCM ATEN_ID_TYPE_SWITCH(array->dtype, IdType, { *rv = CreateSetFilter(array); }); diff --git a/src/array/filter.cc.prehip b/src/array/filter.cc.prehip new file mode 100644 index 000000000000..658816d60199 --- /dev/null +++ b/src/array/filter.cc.prehip @@ -0,0 +1,54 @@ +/** + * Copyright (c) 2021 by Contributors + * @file array/filter.cc + * @brief Object for selecting items in a set, or selecting items not in a set. + */ + +#include "./filter.h" + +#include +#include +#include + +namespace dgl { +namespace array { + +using namespace dgl::runtime; + +template +FilterRef CreateSetFilter(IdArray set); + +DGL_REGISTER_GLOBAL("utils.filter._CAPI_DGLFilterCreateFromSet") + .set_body([](DGLArgs args, DGLRetValue* rv) { + IdArray array = args[0]; + auto ctx = array->ctx; + // TODO(nv-dlasalle): Implement CPU version. + if (ctx.device_type == kDGLCUDA) { +#ifdef DGL_USE_CUDA + ATEN_ID_TYPE_SWITCH(array->dtype, IdType, { + *rv = CreateSetFilter(array); + }); +#else + LOG(FATAL) << "GPU support not compiled."; +#endif + } else { + LOG(FATAL) << "CPU support not yet implemented."; + } + }); + +DGL_REGISTER_GLOBAL("utils.filter._CAPI_DGLFilterFindIncludedIndices") + .set_body([](DGLArgs args, DGLRetValue* rv) { + FilterRef filter = args[0]; + IdArray array = args[1]; + *rv = filter->find_included_indices(array); + }); + +DGL_REGISTER_GLOBAL("utils.filter._CAPI_DGLFilterFindExcludedIndices") + .set_body([](DGLArgs args, DGLRetValue* rv) { + FilterRef filter = args[0]; + IdArray array = args[1]; + *rv = filter->find_excluded_indices(array); + }); + +} // namespace array +} // namespace dgl diff --git a/src/array/selector.h b/src/array/selector.h index 1257cf18be31..b72c66a0f697 100644 --- a/src/array/selector.h +++ b/src/array/selector.h @@ -12,13 +12,13 @@ namespace dgl { namespace { -#ifdef __CUDACC__ +#ifdef __HIPCC__ #define DGLDEVICE __device__ #define DGLINLINE __forceinline__ #else #define DGLDEVICE #define DGLINLINE inline -#endif // __CUDACC__ +#endif // __HIPCC__ } // namespace diff --git a/src/array/selector.h.prehip b/src/array/selector.h.prehip new file mode 100644 index 000000000000..1257cf18be31 --- /dev/null +++ b/src/array/selector.h.prehip @@ -0,0 +1,59 @@ +/** + * Copyright (c) 2020 by Contributors + * @file array/selector.h + * @brief Selector functions to select among src/edge/dst attributes. + */ +#ifndef DGL_ARRAY_SELECTOR_H_ +#define DGL_ARRAY_SELECTOR_H_ + +#include + +namespace dgl { + +namespace { + +#ifdef __CUDACC__ +#define DGLDEVICE __device__ +#define DGLINLINE __forceinline__ +#else +#define DGLDEVICE +#define DGLINLINE inline +#endif // __CUDACC__ + +} // namespace + +/** + * @brief Select among src/edge/dst feature/idx. + * @note the integer argument target specifies which target + * to choose, 0: src, 1: edge, 2: dst. 
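selector.h now keys its device qualifiers on __HIPCC__ instead of __CUDACC__ so the same header works for both host-only and hipcc compilation. A small sketch of that guard pattern with shortened, illustrative macro and function names:

// Minimal host/device qualifier guard, mirroring DGLDEVICE / DGLINLINE.
#if defined(__HIPCC__) || defined(__CUDACC__)
#define DEMO_DEVICE __device__
#define DEMO_INLINE __forceinline__
#else
#define DEMO_DEVICE
#define DEMO_INLINE inline
#endif

template <int target>
DEMO_DEVICE DEMO_INLINE int PickDemo(int src, int edge, int dst) {
  // 0: src, 1: edge, 2: dst (same convention as Selector<target>).
  return target == 0 ? src : (target == 1 ? edge : dst);
}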
+ */ +template +struct Selector { + template + static DGLDEVICE DGLINLINE T Call(T src, T edge, T dst) { + LOG(INFO) << "Target " << target << " not recognized."; + return src; + } +}; + +template <> +template +DGLDEVICE DGLINLINE T Selector<0>::Call(T src, T edge, T dst) { + return src; +} + +template <> +template +DGLDEVICE DGLINLINE T Selector<1>::Call(T src, T edge, T dst) { + return edge; +} + +template <> +template +DGLDEVICE DGLINLINE T Selector<2>::Call(T src, T edge, T dst) { + return dst; +} + +} // namespace dgl + +#endif // DGL_ARRAY_SELECTOR_H_ diff --git a/src/array/uvm_array.cc b/src/array/uvm_array.cc index 671c4f262520..e29d9f192b56 100644 --- a/src/array/uvm_array.cc +++ b/src/array/uvm_array.cc @@ -16,7 +16,7 @@ namespace dgl { namespace aten { NDArray IndexSelectCPUFromGPU(NDArray array, IdArray index) { -#ifdef DGL_USE_CUDA +#ifdef DGL_USE_ROCM CHECK(array.IsPinned()) << "Input array must be in pinned memory."; CHECK_EQ(index->ctx.device_type, kDGLCUDA) << "Index must be on the GPU."; CHECK_GE(array->ndim, 1) << "Input array must have at least 1 dimension."; @@ -34,7 +34,7 @@ NDArray IndexSelectCPUFromGPU(NDArray array, IdArray index) { } void IndexScatterGPUToCPU(NDArray dest, IdArray index, NDArray source) { -#ifdef DGL_USE_CUDA +#ifdef DGL_USE_ROCM CHECK(dest.IsPinned()) << "Destination array must be in pinned memory."; CHECK_EQ(index->ctx.device_type, kDGLCUDA) << "Index must be on the GPU."; CHECK_EQ(source->ctx.device_type, kDGLCUDA) diff --git a/src/array/uvm_array.cc.prehip b/src/array/uvm_array.cc.prehip new file mode 100644 index 000000000000..671c4f262520 --- /dev/null +++ b/src/array/uvm_array.cc.prehip @@ -0,0 +1,74 @@ +/** + * Copyright (c) 2019-2022 by Contributors + * @file array/uvm_array.cc + * @brief DGL array utilities implementation + */ +#include + +#include + +#include "../c_api_common.h" +#include "./uvm_array_op.h" + +using namespace dgl::runtime; + +namespace dgl { +namespace aten { + +NDArray IndexSelectCPUFromGPU(NDArray array, IdArray index) { +#ifdef DGL_USE_CUDA + CHECK(array.IsPinned()) << "Input array must be in pinned memory."; + CHECK_EQ(index->ctx.device_type, kDGLCUDA) << "Index must be on the GPU."; + CHECK_GE(array->ndim, 1) << "Input array must have at least 1 dimension."; + CHECK_EQ(index->ndim, 1) << "Index must be a 1D array."; + + ATEN_DTYPE_BITS_ONLY_SWITCH(array->dtype, DType, "values", { + ATEN_ID_TYPE_SWITCH(index->dtype, IdType, { + return impl::IndexSelectCPUFromGPU(array, index); + }); + }); +#endif + LOG(FATAL) << "IndexSelectCPUFromGPU requires CUDA."; + // Should be unreachable + return NDArray{}; +} + +void IndexScatterGPUToCPU(NDArray dest, IdArray index, NDArray source) { +#ifdef DGL_USE_CUDA + CHECK(dest.IsPinned()) << "Destination array must be in pinned memory."; + CHECK_EQ(index->ctx.device_type, kDGLCUDA) << "Index must be on the GPU."; + CHECK_EQ(source->ctx.device_type, kDGLCUDA) + << "Source array must be on the GPU."; + CHECK_EQ(dest->dtype, source->dtype) << "Destination array and source " + "array must have the same dtype."; + CHECK_GE(dest->ndim, 1) + << "Destination array must have at least 1 dimension."; + CHECK_EQ(index->ndim, 1) << "Index must be a 1D array."; + + ATEN_DTYPE_BITS_ONLY_SWITCH(source->dtype, DType, "values", { + ATEN_ID_TYPE_SWITCH(index->dtype, IdType, { + impl::IndexScatterGPUToCPU(dest, index, source); + }); + }); +#else + LOG(FATAL) << "IndexScatterGPUToCPU requires CUDA."; +#endif +} + +DGL_REGISTER_GLOBAL("ndarray.uvm._CAPI_DGLIndexSelectCPUFromGPU") + .set_body([](DGLArgs 
args, DGLRetValue* rv) { + NDArray array = args[0]; + IdArray index = args[1]; + *rv = IndexSelectCPUFromGPU(array, index); + }); + +DGL_REGISTER_GLOBAL("ndarray.uvm._CAPI_DGLIndexScatterGPUToCPU") + .set_body([](DGLArgs args, DGLRetValue* rv) { + NDArray dest = args[0]; + IdArray index = args[1]; + NDArray source = args[2]; + IndexScatterGPUToCPU(dest, index, source); + }); + +} // namespace aten +} // namespace dgl diff --git a/src/geometry/cuda/edge_coarsening_impl.cu b/src/geometry/cuda/edge_coarsening_impl.cu index 6907410c6477..1d1630444d30 100644 --- a/src/geometry/cuda/edge_coarsening_impl.cu +++ b/src/geometry/cuda/edge_coarsening_impl.cu @@ -1,9 +1,10 @@ +#include "hip/hip_runtime.h" /** * Copyright (c) 2019 by Contributors * @file geometry/cuda/edge_coarsening_impl.cu * @brief Edge coarsening CUDA implementation */ -#include +#include #include #include #include @@ -32,9 +33,9 @@ __global__ void generate_uniform_kernel( float *ret_values, size_t num, uint64_t seed) { size_t id = blockIdx.x * blockDim.x + threadIdx.x; if (id < num) { - curandState state; - curand_init(seed, id, 0, &state); - ret_values[id] = curand_uniform(&state); + hiprandState state; + hiprand_init(seed, id, 0, &state); + ret_values[id] = hiprand_uniform(&state); } } @@ -116,7 +117,7 @@ __global__ void weighted_respond_kernel( template bool Colorize(IdType *result_data, int64_t num_nodes, float *const prop) { // initial done signal - cudaStream_t stream = runtime::getCurrentCUDAStream(); + hipStream_t stream = runtime::getCurrentCUDAStream(); CUDA_KERNEL_CALL(init_done_kernel, 1, 1, 0, stream); // generate color prop for each node @@ -132,8 +133,8 @@ bool Colorize(IdType *result_data, int64_t num_nodes, float *const prop) { colorize_kernel, num_blocks, num_threads, 0, stream, prop, num_nodes, result_data); bool done_h = false; - CUDA_CALL(cudaMemcpyFromSymbol( - &done_h, done_d, sizeof(done_h), 0, cudaMemcpyDeviceToHost)); + CUDA_CALL(hipMemcpyFromSymbol( + &done_h, HIP_SYMBOL(done_d), sizeof(done_h), 0, hipMemcpyDeviceToHost)); return done_h; } @@ -155,7 +156,7 @@ bool Colorize(IdType *result_data, int64_t num_nodes, float *const prop) { template void WeightedNeighborMatching( const aten::CSRMatrix &csr, const NDArray weight, IdArray result) { - cudaStream_t stream = runtime::getCurrentCUDAStream(); + hipStream_t stream = runtime::getCurrentCUDAStream(); const auto &ctx = result->ctx; auto device = runtime::DeviceAPI::Get(ctx); device->SetDevice(ctx); @@ -216,7 +217,7 @@ void NeighborMatching(const aten::CSRMatrix &csr, IdArray result) { device->SetDevice(ctx); // generate random weights - cudaStream_t stream = runtime::getCurrentCUDAStream(); + hipStream_t stream = runtime::getCurrentCUDAStream(); NDArray weight = NDArray::Empty( {num_edges}, DGLDataType{kDGLFloat, sizeof(float) * 8, 1}, ctx); float *weight_data = static_cast(weight->data); diff --git a/src/geometry/cuda/edge_coarsening_impl.cu.prehip b/src/geometry/cuda/edge_coarsening_impl.cu.prehip new file mode 100644 index 000000000000..6907410c6477 --- /dev/null +++ b/src/geometry/cuda/edge_coarsening_impl.cu.prehip @@ -0,0 +1,239 @@ +/** + * Copyright (c) 2019 by Contributors + * @file geometry/cuda/edge_coarsening_impl.cu + * @brief Edge coarsening CUDA implementation + */ +#include +#include +#include +#include + +#include + +#include "../../array/cuda/utils.h" +#include "../../runtime/cuda/cuda_common.h" +#include "../geometry_op.h" + +#define BLOCKS(N, T) (N + T - 1) / T + +namespace dgl { +namespace geometry { +namespace impl { + +constexpr float 
BLUE_P = 0.53406; +constexpr int BLUE = -1; +constexpr int RED = -2; +constexpr int EMPTY_IDX = -1; + +__device__ bool done_d; +__global__ void init_done_kernel() { done_d = true; } + +__global__ void generate_uniform_kernel( + float *ret_values, size_t num, uint64_t seed) { + size_t id = blockIdx.x * blockDim.x + threadIdx.x; + if (id < num) { + curandState state; + curand_init(seed, id, 0, &state); + ret_values[id] = curand_uniform(&state); + } +} + +template +__global__ void colorize_kernel( + const float *prop, int64_t num_elem, IdType *result) { + const IdType idx = blockIdx.x * blockDim.x + threadIdx.x; + if (idx < num_elem) { + if (result[idx] < 0) { // if unmatched + result[idx] = (prop[idx] > BLUE_P) ? RED : BLUE; + done_d = false; + } + } +} + +template +__global__ void weighted_propose_kernel( + const IdType *indptr, const IdType *indices, const FloatType *weights, + int64_t num_elem, IdType *proposal, IdType *result) { + const IdType idx = blockIdx.x * blockDim.x + threadIdx.x; + if (idx < num_elem) { + if (result[idx] != BLUE) return; + + bool has_unmatched_neighbor = false; + FloatType weight_max = 0.; + IdType v_max = EMPTY_IDX; + + for (IdType i = indptr[idx]; i < indptr[idx + 1]; ++i) { + auto v = indices[i]; + + if (result[v] < 0) has_unmatched_neighbor = true; + if (result[v] == RED && weights[i] >= weight_max) { + v_max = v; + weight_max = weights[i]; + } + } + + proposal[idx] = v_max; + if (!has_unmatched_neighbor) result[idx] = idx; + } +} + +template +__global__ void weighted_respond_kernel( + const IdType *indptr, const IdType *indices, const FloatType *weights, + int64_t num_elem, IdType *proposal, IdType *result) { + const IdType idx = blockIdx.x * blockDim.x + threadIdx.x; + if (idx < num_elem) { + if (result[idx] != RED) return; + + bool has_unmatched_neighbors = false; + IdType v_max = -1; + FloatType weight_max = 0.; + + for (IdType i = indptr[idx]; i < indptr[idx + 1]; ++i) { + auto v = indices[i]; + + if (result[v] < 0) { + has_unmatched_neighbors = true; + } + if (result[v] == BLUE && proposal[v] == idx && weights[i] >= weight_max) { + v_max = v; + weight_max = weights[i]; + } + } + if (v_max >= 0) { + result[v_max] = min(idx, v_max); + result[idx] = min(idx, v_max); + } + + if (!has_unmatched_neighbors) result[idx] = idx; + } +} + +/** @brief The colorize procedure. This procedure randomly marks unmarked + * nodes with BLUE(-1) and RED(-2) and checks whether the node matching + * process has finished. + */ +template +bool Colorize(IdType *result_data, int64_t num_nodes, float *const prop) { + // initial done signal + cudaStream_t stream = runtime::getCurrentCUDAStream(); + CUDA_KERNEL_CALL(init_done_kernel, 1, 1, 0, stream); + + // generate color prop for each node + uint64_t seed = dgl::RandomEngine::ThreadLocal()->RandInt(UINT64_MAX); + auto num_threads = cuda::FindNumThreads(num_nodes); + auto num_blocks = cuda::FindNumBlocks<'x'>(BLOCKS(num_nodes, num_threads)); + CUDA_KERNEL_CALL( + generate_uniform_kernel, num_blocks, num_threads, 0, stream, prop, + num_nodes, seed); + + // call kernel + CUDA_KERNEL_CALL( + colorize_kernel, num_blocks, num_threads, 0, stream, prop, num_nodes, + result_data); + bool done_h = false; + CUDA_CALL(cudaMemcpyFromSymbol( + &done_h, done_d, sizeof(done_h), 0, cudaMemcpyDeviceToHost)); + return done_h; +} + +/** @brief Weighted neighbor matching procedure (GPU version). + * This implementation is from `A GPU Algorithm for Greedy Graph Matching + * `__ + * + * This algorithm has three parts: colorize, propose and respond. 
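One HIP-specific detail in the hipified Colorize above is that hipMemcpyFromSymbol takes the device variable wrapped in HIP_SYMBOL(...) rather than the bare symbol. A hedged sketch of that device-side convergence-flag pattern with illustrative kernel and symbol names:

#include <hip/hip_runtime.h>

__device__ bool converged_d;  // illustrative device-side flag

__global__ void reset_flag_kernel() { converged_d = true; }

__global__ void work_kernel(int n) {
  int i = blockIdx.x * blockDim.x + threadIdx.x;
  // Benign race: many threads may write the same value, as in colorize_kernel.
  if (i < n) converged_d = false;
}

bool RunOneRound(int n, hipStream_t stream) {
  hipLaunchKernelGGL(reset_flag_kernel, dim3(1), dim3(1), 0, stream);
  hipLaunchKernelGGL(work_kernel, dim3((n + 255) / 256), dim3(256), 0, stream,
                     n);
  hipStreamSynchronize(stream);
  bool converged_h = false;
  // HIP requires wrapping the device symbol with HIP_SYMBOL(...).
  hipMemcpyFromSymbol(&converged_h, HIP_SYMBOL(converged_d),
                      sizeof(converged_h), 0, hipMemcpyDeviceToHost);
  return converged_h;
}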
+ * In colorize procedure, each unmarked node will be marked as BLUE or + * RED randomly. If all nodes are marked, finish and return. + * In propose procedure, each BLUE node will propose to the RED + * neighbor with the largest weight (or randomly choose one if without weight). + * If all its neighbors are marked, mark this node with its id. + * In respond procedure, each RED node will respond to the BLUE neighbor + * that has proposed to it and has the largest weight. If all neighbors + * are marked, mark this node with its id. Else match this (BLUE, RED) node + * pair and mark them with the smaller id between them. + */ +template +void WeightedNeighborMatching( + const aten::CSRMatrix &csr, const NDArray weight, IdArray result) { + cudaStream_t stream = runtime::getCurrentCUDAStream(); + const auto &ctx = result->ctx; + auto device = runtime::DeviceAPI::Get(ctx); + device->SetDevice(ctx); + + // create proposal tensor + const int64_t num_nodes = result->shape[0]; + IdArray proposal = aten::Full(-1, num_nodes, sizeof(IdType) * 8, ctx); + + // get data ptrs + IdType *indptr_data = static_cast(csr.indptr->data); + IdType *indices_data = static_cast(csr.indices->data); + IdType *result_data = static_cast(result->data); + IdType *proposal_data = static_cast(proposal->data); + FloatType *weight_data = static_cast(weight->data); + + // allocate workspace for prop used in Colorize() + float *prop = static_cast( + device->AllocWorkspace(ctx, num_nodes * sizeof(float))); + + auto num_threads = cuda::FindNumThreads(num_nodes); + auto num_blocks = cuda::FindNumBlocks<'x'>(BLOCKS(num_nodes, num_threads)); + while (!Colorize(result_data, num_nodes, prop)) { + CUDA_KERNEL_CALL( + weighted_propose_kernel, num_blocks, num_threads, 0, stream, + indptr_data, indices_data, weight_data, num_nodes, proposal_data, + result_data); + CUDA_KERNEL_CALL( + weighted_respond_kernel, num_blocks, num_threads, 0, stream, + indptr_data, indices_data, weight_data, num_nodes, proposal_data, + result_data); + } + device->FreeWorkspace(ctx, prop); +} +template void WeightedNeighborMatching( + const aten::CSRMatrix &csr, const NDArray weight, IdArray result); +template void WeightedNeighborMatching( + const aten::CSRMatrix &csr, const NDArray weight, IdArray result); +template void WeightedNeighborMatching( + const aten::CSRMatrix &csr, const NDArray weight, IdArray result); +template void WeightedNeighborMatching( + const aten::CSRMatrix &csr, const NDArray weight, IdArray result); + +/** @brief Unweighted neighbor matching procedure (GPU version). + * Instead of directly sample neighbors, we assign each neighbor + * with a random weight. We use random weight for 2 reasons: + * 1. Random sample for each node in GPU is expensive. Although + * we can perform a global group-wise (neighborhood of each + * node as a group) random permutation as in CPU version, + * it still cost too much compared to directly using random weights. + * 2. Graph is sparse, thus neighborhood of each node is small, + * which is suitable for GPU implementation. 
+ */ +template +void NeighborMatching(const aten::CSRMatrix &csr, IdArray result) { + const int64_t num_edges = csr.indices->shape[0]; + const auto &ctx = result->ctx; + auto device = runtime::DeviceAPI::Get(ctx); + device->SetDevice(ctx); + + // generate random weights + cudaStream_t stream = runtime::getCurrentCUDAStream(); + NDArray weight = NDArray::Empty( + {num_edges}, DGLDataType{kDGLFloat, sizeof(float) * 8, 1}, ctx); + float *weight_data = static_cast(weight->data); + uint64_t seed = dgl::RandomEngine::ThreadLocal()->RandInt(UINT64_MAX); + auto num_threads = cuda::FindNumThreads(num_edges); + auto num_blocks = cuda::FindNumBlocks<'x'>(BLOCKS(num_edges, num_threads)); + CUDA_KERNEL_CALL( + generate_uniform_kernel, num_blocks, num_threads, 0, stream, weight_data, + num_edges, seed); + + WeightedNeighborMatching(csr, weight, result); +} +template void NeighborMatching( + const aten::CSRMatrix &csr, IdArray result); +template void NeighborMatching( + const aten::CSRMatrix &csr, IdArray result); + +} // namespace impl +} // namespace geometry +} // namespace dgl diff --git a/src/geometry/cuda/geometry_op_impl.cu b/src/geometry/cuda/geometry_op_impl.cu index ac3b05966322..eb8a38de3070 100644 --- a/src/geometry/cuda/geometry_op_impl.cu +++ b/src/geometry/cuda/geometry_op_impl.cu @@ -1,3 +1,4 @@ +#include "hip/hip_runtime.h" /** * Copyright (c) 2019 by Contributors * @file geometry/cuda/geometry_op_impl.cc @@ -95,7 +96,7 @@ template void FarthestPointSampler( NDArray array, int64_t batch_size, int64_t sample_points, NDArray dist, IdArray start_idx, IdArray result) { - cudaStream_t stream = runtime::getCurrentCUDAStream(); + hipStream_t stream = runtime::getCurrentCUDAStream(); const FloatType* array_data = static_cast(array->data); @@ -110,7 +111,7 @@ void FarthestPointSampler( // sample for each cloud in the batch IdType* start_idx_data = static_cast(start_idx->data); - CUDA_CALL(cudaSetDevice(array->ctx.device_id)); + CUDA_CALL(hipSetDevice(array->ctx.device_id)); CUDA_KERNEL_CALL( fps_kernel, batch_size, THREADS, 0, stream, array_data, batch_size, diff --git a/src/geometry/cuda/geometry_op_impl.cu.prehip b/src/geometry/cuda/geometry_op_impl.cu.prehip new file mode 100644 index 000000000000..ac3b05966322 --- /dev/null +++ b/src/geometry/cuda/geometry_op_impl.cu.prehip @@ -0,0 +1,135 @@ +/** + * Copyright (c) 2019 by Contributors + * @file geometry/cuda/geometry_op_impl.cc + * @brief Geometry operator CUDA implementation + */ +#include + +#include "../../c_api_common.h" +#include "../../runtime/cuda/cuda_common.h" +#include "../geometry_op.h" + +#define THREADS 1024 + +namespace dgl { +namespace geometry { +namespace impl { + +/** + * @brief Farthest Point Sampler without the need to compute all pairs of + * distance. + * + * The input array has shape (N, d), where N is the number of points, and d is + * the dimension. It consists of a (flatten) batch of point clouds. + * + * In each batch, the algorithm starts with the sample index specified by + * ``start_idx``. Then for each point, we maintain the minimum to-sample + * distance. Finally, we pick the point with the maximum such distance. This + * process will be repeated for ``sample_points`` - 1 times. 
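(Aside: a hedged single-cloud CPU reference of the procedure just described, i.e. keep each point's minimum squared distance to the picked set and repeatedly pick the argmax. The function name and signature are illustrative, not DGL API.)

#include <algorithm>
#include <cstdint>
#include <limits>
#include <vector>

std::vector<int64_t> FarthestPointSampleCPU(
    const std::vector<float> &xyz,  // flattened (n, d) point cloud
    int64_t n, int64_t d, int64_t sample_points, int64_t start_idx) {
  std::vector<int64_t> picked = {start_idx};
  std::vector<float> min_dist(n, std::numeric_limits<float>::max());
  for (int64_t s = 1; s < sample_points; ++s) {
    const int64_t last = picked.back();
    int64_t argmax = 0;
    float best = -1.f;
    for (int64_t j = 0; j < n; ++j) {
      float dist = 0.f;
      for (int64_t k = 0; k < d; ++k) {
        const float diff = xyz[j * d + k] - xyz[last * d + k];
        dist += diff * diff;
      }
      // Track the minimum squared distance to any already-picked point ...
      min_dist[j] = std::min(min_dist[j], dist);
      // ... and sample the point that maximizes that minimum.
      if (min_dist[j] > best) {
        best = min_dist[j];
        argmax = j;
      }
    }
    picked.push_back(argmax);
  }
  return picked;
}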
+ */ +template +__global__ void fps_kernel( + const FloatType* array_data, const int64_t batch_size, + const int64_t sample_points, const int64_t point_in_batch, + const int64_t dim, const IdType* start_idx, FloatType* dist_data, + IdType* ret_data) { + const int64_t thread_idx = threadIdx.x; + const int64_t batch_idx = blockIdx.x; + + const int64_t array_start = point_in_batch * batch_idx; + const int64_t ret_start = sample_points * batch_idx; + + __shared__ FloatType dist_max_ht[THREADS]; + __shared__ int64_t dist_argmax_ht[THREADS]; + + // start with random initialization + if (thread_idx == 0) { + ret_data[ret_start] = (IdType)(start_idx[batch_idx]); + } + + // sample the rest `sample_points - 1` points + for (auto i = 0; i < sample_points - 1; i++) { + __syncthreads(); + + // the last sampled point + int64_t sample_idx = (int64_t)(ret_data[ret_start + i]); + dist_argmax_ht[thread_idx] = 0; + dist_max_ht[thread_idx] = (FloatType)(-1.); + + // multi-thread distance calculation + for (auto j = thread_idx; j < point_in_batch; j += THREADS) { + FloatType one_dist = (FloatType)(0.); + for (auto d = 0; d < dim; d++) { + FloatType tmp = array_data[(array_start + j) * dim + d] - + array_data[(array_start + sample_idx) * dim + d]; + one_dist += tmp * tmp; + } + + if (i == 0 || dist_data[array_start + j] > one_dist) { + dist_data[array_start + j] = one_dist; + } + + if (dist_data[array_start + j] > dist_max_ht[thread_idx]) { + dist_argmax_ht[thread_idx] = j; + dist_max_ht[thread_idx] = dist_data[array_start + j]; + } + } + + __syncthreads(); + + if (thread_idx == 0) { + FloatType best = dist_max_ht[0]; + int64_t best_idx = dist_argmax_ht[0]; + for (auto j = 1; j < THREADS; j++) { + if (dist_max_ht[j] > best) { + best = dist_max_ht[j]; + best_idx = dist_argmax_ht[j]; + } + } + ret_data[ret_start + i + 1] = (IdType)(best_idx); + } + } +} + +template +void FarthestPointSampler( + NDArray array, int64_t batch_size, int64_t sample_points, NDArray dist, + IdArray start_idx, IdArray result) { + cudaStream_t stream = runtime::getCurrentCUDAStream(); + + const FloatType* array_data = static_cast(array->data); + + const int64_t point_in_batch = array->shape[0] / batch_size; + const int64_t dim = array->shape[1]; + + // return value + IdType* ret_data = static_cast(result->data); + + // distance + FloatType* dist_data = static_cast(dist->data); + + // sample for each cloud in the batch + IdType* start_idx_data = static_cast(start_idx->data); + CUDA_CALL(cudaSetDevice(array->ctx.device_id)); + + CUDA_KERNEL_CALL( + fps_kernel, batch_size, THREADS, 0, stream, array_data, batch_size, + sample_points, point_in_batch, dim, start_idx_data, dist_data, ret_data); +} + +template void FarthestPointSampler( + NDArray array, int64_t batch_size, int64_t sample_points, NDArray dist, + IdArray start_idx, IdArray result); +template void FarthestPointSampler( + NDArray array, int64_t batch_size, int64_t sample_points, NDArray dist, + IdArray start_idx, IdArray result); +template void FarthestPointSampler( + NDArray array, int64_t batch_size, int64_t sample_points, NDArray dist, + IdArray start_idx, IdArray result); +template void FarthestPointSampler( + NDArray array, int64_t batch_size, int64_t sample_points, NDArray dist, + IdArray start_idx, IdArray result); + +} // namespace impl +} // namespace geometry +} // namespace dgl diff --git a/src/graph/heterograph_capi.cc b/src/graph/heterograph_capi.cc index 9e88d01c1ba5..f8dcd95162ac 100644 --- a/src/graph/heterograph_capi.cc +++ b/src/graph/heterograph_capi.cc @@ -687,7 
+687,7 @@ DGL_REGISTER_GLOBAL("heterograph_index._CAPI_DGLHeteroCreateFormat") } }; -#if !(defined(DGL_USE_CUDA)) +#if !(defined(DGL_USE_ROCM)) runtime::parallel_for(0, hg->NumEdgeTypes(), get_format_f); #else get_format_f(0, hg->NumEdgeTypes()); diff --git a/src/graph/heterograph_capi.cc.prehip b/src/graph/heterograph_capi.cc.prehip new file mode 100644 index 000000000000..9e88d01c1ba5 --- /dev/null +++ b/src/graph/heterograph_capi.cc.prehip @@ -0,0 +1,841 @@ +/** + * Copyright (c) 2020 by Contributors + * @file graph/heterograph_capi.cc + * @brief Heterograph CAPI bindings. + */ +#include +#include +#include +#include +#include +#include +#include + +#include + +#include "../c_api_common.h" +#include "./heterograph.h" +#include "unit_graph.h" + +using namespace dgl::runtime; + +namespace dgl { + +///////////////////////// Unitgraph functions ///////////////////////// + +// XXX(minjie): Ideally, Unitgraph should be invisible to python side + +DGL_REGISTER_GLOBAL("heterograph_index._CAPI_DGLHeteroCreateUnitGraphFromCOO") + .set_body([](DGLArgs args, DGLRetValue* rv) { + int64_t nvtypes = args[0]; + int64_t num_src = args[1]; + int64_t num_dst = args[2]; + IdArray row = args[3]; + IdArray col = args[4]; + List formats = args[5]; + bool row_sorted = args[6]; + bool col_sorted = args[7]; + std::vector formats_vec; + for (Value val : formats) { + std::string fmt = val->data; + formats_vec.push_back(ParseSparseFormat(fmt)); + } + const auto code = SparseFormatsToCode(formats_vec); + auto hgptr = CreateFromCOO( + nvtypes, num_src, num_dst, row, col, row_sorted, col_sorted, code); + *rv = HeteroGraphRef(hgptr); + }); + +DGL_REGISTER_GLOBAL("heterograph_index._CAPI_DGLHeteroCreateUnitGraphFromCSR") + .set_body([](DGLArgs args, DGLRetValue* rv) { + int64_t nvtypes = args[0]; + int64_t num_src = args[1]; + int64_t num_dst = args[2]; + IdArray indptr = args[3]; + IdArray indices = args[4]; + IdArray edge_ids = args[5]; + List formats = args[6]; + bool transpose = args[7]; + std::vector formats_vec; + for (Value val : formats) { + std::string fmt = val->data; + formats_vec.push_back(ParseSparseFormat(fmt)); + } + const auto code = SparseFormatsToCode(formats_vec); + if (!transpose) { + auto hgptr = CreateFromCSR( + nvtypes, num_src, num_dst, indptr, indices, edge_ids, code); + *rv = HeteroGraphRef(hgptr); + } else { + auto hgptr = CreateFromCSC( + nvtypes, num_src, num_dst, indptr, indices, edge_ids, code); + *rv = HeteroGraphRef(hgptr); + } + }); + +DGL_REGISTER_GLOBAL("heterograph_index._CAPI_DGLHeteroCreateHeteroGraph") + .set_body([](DGLArgs args, DGLRetValue* rv) { + GraphRef meta_graph = args[0]; + List rel_graphs = args[1]; + std::vector rel_ptrs; + rel_ptrs.reserve(rel_graphs.size()); + for (const auto& ref : rel_graphs) { + rel_ptrs.push_back(ref.sptr()); + } + auto hgptr = CreateHeteroGraph(meta_graph.sptr(), rel_ptrs); + *rv = HeteroGraphRef(hgptr); + }); + +DGL_REGISTER_GLOBAL( + "heterograph_index._CAPI_DGLHeteroCreateHeteroGraphWithNumNodes") + .set_body([](DGLArgs args, DGLRetValue* rv) { + GraphRef meta_graph = args[0]; + List rel_graphs = args[1]; + IdArray num_nodes_per_type = args[2]; + std::vector rel_ptrs; + rel_ptrs.reserve(rel_graphs.size()); + for (const auto& ref : rel_graphs) { + rel_ptrs.push_back(ref.sptr()); + } + auto hgptr = CreateHeteroGraph( + meta_graph.sptr(), rel_ptrs, num_nodes_per_type.ToVector()); + *rv = HeteroGraphRef(hgptr); + }); + +///////////////////////// HeteroGraph member functions ///////////////////////// + 
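(Aside on the one functional hunk in this file, shown above: the parallel_for over edge types is disabled when building for a GPU backend, and hipify only renames the guard from DGL_USE_CUDA to DGL_USE_ROCM. A minimal sketch of that conditional-compilation pattern follows; DGL_USE_GPU is a purely hypothetical alias used here for illustration, not an existing flag.)

// Hypothetical alias covering both GPU backends; the real sources test the
// two macros DGL_USE_CUDA and DGL_USE_ROCM directly.
#if defined(DGL_USE_CUDA) || defined(DGL_USE_ROCM)
#define DGL_USE_GPU 1
#endif

#if !defined(DGL_USE_GPU)
  // CPU-only build: materialize formats for all edge types in parallel.
  runtime::parallel_for(0, hg->NumEdgeTypes(), get_format_f);
#else
  // GPU build: run sequentially on the current thread.
  get_format_f(0, hg->NumEdgeTypes());
#endif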
+DGL_REGISTER_GLOBAL("heterograph_index._CAPI_DGLHeteroGetMetaGraph") + .set_body([](DGLArgs args, DGLRetValue* rv) { + HeteroGraphRef hg = args[0]; + *rv = hg->meta_graph(); + }); + +DGL_REGISTER_GLOBAL("heterograph_index._CAPI_DGLHeteroIsMetaGraphUniBipartite") + .set_body([](DGLArgs args, DGLRetValue* rv) { + HeteroGraphRef hg = args[0]; + GraphPtr mg = hg->meta_graph(); + *rv = mg->IsUniBipartite(); + }); + +DGL_REGISTER_GLOBAL("heterograph_index._CAPI_DGLHeteroGetRelationGraph") + .set_body([](DGLArgs args, DGLRetValue* rv) { + HeteroGraphRef hg = args[0]; + dgl_type_t etype = args[1]; + CHECK_LE(etype, hg->NumEdgeTypes()) << "invalid edge type " << etype; + auto unit_graph = hg->GetRelationGraph(etype); + auto meta_graph = unit_graph->meta_graph(); + auto hgptr = CreateHeteroGraph( + meta_graph, {unit_graph}, unit_graph->NumVerticesPerType()); + *rv = HeteroGraphRef(hgptr); + }); + +DGL_REGISTER_GLOBAL("heterograph_index._CAPI_DGLHeteroGetFlattenedGraph") + .set_body([](DGLArgs args, DGLRetValue* rv) { + HeteroGraphRef hg = args[0]; + List etypes = args[1]; + std::vector etypes_vec; + for (Value val : etypes) { + // (gq) have to decompose it into two statements because of a weird MSVC + // internal error + dgl_id_t id = val->data; + etypes_vec.push_back(id); + } + + *rv = FlattenedHeteroGraphRef(hg->Flatten(etypes_vec)); + }); + +DGL_REGISTER_GLOBAL("heterograph_index._CAPI_DGLHeteroAddVertices") + .set_body([](DGLArgs args, DGLRetValue* rv) { + HeteroGraphRef hg = args[0]; + dgl_type_t vtype = args[1]; + int64_t num = args[2]; + hg->AddVertices(vtype, num); + }); + +DGL_REGISTER_GLOBAL("heterograph_index._CAPI_DGLHeteroAddEdge") + .set_body([](DGLArgs args, DGLRetValue* rv) { + HeteroGraphRef hg = args[0]; + dgl_type_t etype = args[1]; + dgl_id_t src = args[2]; + dgl_id_t dst = args[3]; + hg->AddEdge(etype, src, dst); + }); + +DGL_REGISTER_GLOBAL("heterograph_index._CAPI_DGLHeteroAddEdges") + .set_body([](DGLArgs args, DGLRetValue* rv) { + HeteroGraphRef hg = args[0]; + dgl_type_t etype = args[1]; + IdArray src = args[2]; + IdArray dst = args[3]; + hg->AddEdges(etype, src, dst); + }); + +DGL_REGISTER_GLOBAL("heterograph_index._CAPI_DGLHeteroClear") + .set_body([](DGLArgs args, DGLRetValue* rv) { + HeteroGraphRef hg = args[0]; + hg->Clear(); + }); + +DGL_REGISTER_GLOBAL("heterograph_index._CAPI_DGLHeteroDataType") + .set_body([](DGLArgs args, DGLRetValue* rv) { + HeteroGraphRef hg = args[0]; + *rv = hg->DataType(); + }); + +DGL_REGISTER_GLOBAL("heterograph_index._CAPI_DGLHeteroContext") + .set_body([](DGLArgs args, DGLRetValue* rv) { + HeteroGraphRef hg = args[0]; + *rv = hg->Context(); + }); + +DGL_REGISTER_GLOBAL("heterograph_index._CAPI_DGLHeteroIsPinned") + .set_body([](DGLArgs args, DGLRetValue* rv) { + HeteroGraphRef hg = args[0]; + *rv = hg->IsPinned(); + }); + +DGL_REGISTER_GLOBAL("heterograph_index._CAPI_DGLHeteroNumBits") + .set_body([](DGLArgs args, DGLRetValue* rv) { + HeteroGraphRef hg = args[0]; + *rv = hg->NumBits(); + }); + +DGL_REGISTER_GLOBAL("heterograph_index._CAPI_DGLHeteroIsMultigraph") + .set_body([](DGLArgs args, DGLRetValue* rv) { + HeteroGraphRef hg = args[0]; + *rv = hg->IsMultigraph(); + }); + +DGL_REGISTER_GLOBAL("heterograph_index._CAPI_DGLHeteroIsReadonly") + .set_body([](DGLArgs args, DGLRetValue* rv) { + HeteroGraphRef hg = args[0]; + *rv = hg->IsReadonly(); + }); + +DGL_REGISTER_GLOBAL("heterograph_index._CAPI_DGLHeteroNumVertices") + .set_body([](DGLArgs args, DGLRetValue* rv) { + HeteroGraphRef hg = args[0]; + dgl_type_t vtype = args[1]; + *rv = 
static_cast(hg->NumVertices(vtype)); + }); + +DGL_REGISTER_GLOBAL("heterograph_index._CAPI_DGLHeteroNumEdges") + .set_body([](DGLArgs args, DGLRetValue* rv) { + HeteroGraphRef hg = args[0]; + dgl_type_t etype = args[1]; + *rv = static_cast(hg->NumEdges(etype)); + }); + +DGL_REGISTER_GLOBAL("heterograph_index._CAPI_DGLHeteroHasVertex") + .set_body([](DGLArgs args, DGLRetValue* rv) { + HeteroGraphRef hg = args[0]; + dgl_type_t vtype = args[1]; + dgl_id_t vid = args[2]; + *rv = hg->HasVertex(vtype, vid); + }); + +DGL_REGISTER_GLOBAL("heterograph_index._CAPI_DGLHeteroHasVertices") + .set_body([](DGLArgs args, DGLRetValue* rv) { + HeteroGraphRef hg = args[0]; + dgl_type_t vtype = args[1]; + IdArray vids = args[2]; + *rv = hg->HasVertices(vtype, vids); + }); + +DGL_REGISTER_GLOBAL("heterograph_index._CAPI_DGLHeteroHasEdgeBetween") + .set_body([](DGLArgs args, DGLRetValue* rv) { + HeteroGraphRef hg = args[0]; + dgl_type_t etype = args[1]; + dgl_id_t src = args[2]; + dgl_id_t dst = args[3]; + *rv = hg->HasEdgeBetween(etype, src, dst); + }); + +DGL_REGISTER_GLOBAL("heterograph_index._CAPI_DGLHeteroHasEdgesBetween") + .set_body([](DGLArgs args, DGLRetValue* rv) { + HeteroGraphRef hg = args[0]; + dgl_type_t etype = args[1]; + IdArray src = args[2]; + IdArray dst = args[3]; + *rv = hg->HasEdgesBetween(etype, src, dst); + }); + +DGL_REGISTER_GLOBAL("heterograph_index._CAPI_DGLHeteroPredecessors") + .set_body([](DGLArgs args, DGLRetValue* rv) { + HeteroGraphRef hg = args[0]; + dgl_type_t etype = args[1]; + dgl_id_t dst = args[2]; + *rv = hg->Predecessors(etype, dst); + }); + +DGL_REGISTER_GLOBAL("heterograph_index._CAPI_DGLHeteroSuccessors") + .set_body([](DGLArgs args, DGLRetValue* rv) { + HeteroGraphRef hg = args[0]; + dgl_type_t etype = args[1]; + dgl_id_t src = args[2]; + *rv = hg->Successors(etype, src); + }); + +DGL_REGISTER_GLOBAL("heterograph_index._CAPI_DGLHeteroEdgeId") + .set_body([](DGLArgs args, DGLRetValue* rv) { + HeteroGraphRef hg = args[0]; + dgl_type_t etype = args[1]; + dgl_id_t src = args[2]; + dgl_id_t dst = args[3]; + *rv = hg->EdgeId(etype, src, dst); + }); + +DGL_REGISTER_GLOBAL("heterograph_index._CAPI_DGLHeteroEdgeIdsAll") + .set_body([](DGLArgs args, DGLRetValue* rv) { + HeteroGraphRef hg = args[0]; + dgl_type_t etype = args[1]; + IdArray src = args[2]; + IdArray dst = args[3]; + const auto& ret = hg->EdgeIdsAll(etype, src, dst); + *rv = ConvertEdgeArrayToPackedFunc(ret); + }); + +DGL_REGISTER_GLOBAL("heterograph_index._CAPI_DGLHeteroEdgeIdsOne") + .set_body([](DGLArgs args, DGLRetValue* rv) { + HeteroGraphRef hg = args[0]; + dgl_type_t etype = args[1]; + IdArray src = args[2]; + IdArray dst = args[3]; + *rv = hg->EdgeIdsOne(etype, src, dst); + }); + +DGL_REGISTER_GLOBAL("heterograph_index._CAPI_DGLHeteroFindEdges") + .set_body([](DGLArgs args, DGLRetValue* rv) { + HeteroGraphRef hg = args[0]; + dgl_type_t etype = args[1]; + IdArray eids = args[2]; + const auto& ret = hg->FindEdges(etype, eids); + *rv = ConvertEdgeArrayToPackedFunc(ret); + }); + +DGL_REGISTER_GLOBAL("heterograph_index._CAPI_DGLHeteroInEdges_1") + .set_body([](DGLArgs args, DGLRetValue* rv) { + HeteroGraphRef hg = args[0]; + dgl_type_t etype = args[1]; + dgl_id_t vid = args[2]; + const auto& ret = hg->InEdges(etype, vid); + *rv = ConvertEdgeArrayToPackedFunc(ret); + }); + +DGL_REGISTER_GLOBAL("heterograph_index._CAPI_DGLHeteroInEdges_2") + .set_body([](DGLArgs args, DGLRetValue* rv) { + HeteroGraphRef hg = args[0]; + dgl_type_t etype = args[1]; + IdArray vids = args[2]; + const auto& ret = hg->InEdges(etype, 
vids); + *rv = ConvertEdgeArrayToPackedFunc(ret); + }); + +DGL_REGISTER_GLOBAL("heterograph_index._CAPI_DGLHeteroOutEdges_1") + .set_body([](DGLArgs args, DGLRetValue* rv) { + HeteroGraphRef hg = args[0]; + dgl_type_t etype = args[1]; + dgl_id_t vid = args[2]; + const auto& ret = hg->OutEdges(etype, vid); + *rv = ConvertEdgeArrayToPackedFunc(ret); + }); + +DGL_REGISTER_GLOBAL("heterograph_index._CAPI_DGLHeteroOutEdges_2") + .set_body([](DGLArgs args, DGLRetValue* rv) { + HeteroGraphRef hg = args[0]; + dgl_type_t etype = args[1]; + IdArray vids = args[2]; + const auto& ret = hg->OutEdges(etype, vids); + *rv = ConvertEdgeArrayToPackedFunc(ret); + }); + +DGL_REGISTER_GLOBAL("heterograph_index._CAPI_DGLHeteroEdges") + .set_body([](DGLArgs args, DGLRetValue* rv) { + HeteroGraphRef hg = args[0]; + dgl_type_t etype = args[1]; + std::string order = args[2]; + const auto& ret = hg->Edges(etype, order); + *rv = ConvertEdgeArrayToPackedFunc(ret); + }); + +DGL_REGISTER_GLOBAL("heterograph_index._CAPI_DGLHeteroInDegree") + .set_body([](DGLArgs args, DGLRetValue* rv) { + HeteroGraphRef hg = args[0]; + dgl_type_t etype = args[1]; + dgl_id_t vid = args[2]; + *rv = static_cast(hg->InDegree(etype, vid)); + }); + +DGL_REGISTER_GLOBAL("heterograph_index._CAPI_DGLHeteroInDegrees") + .set_body([](DGLArgs args, DGLRetValue* rv) { + HeteroGraphRef hg = args[0]; + dgl_type_t etype = args[1]; + IdArray vids = args[2]; + *rv = hg->InDegrees(etype, vids); + }); + +DGL_REGISTER_GLOBAL("heterograph_index._CAPI_DGLHeteroOutDegree") + .set_body([](DGLArgs args, DGLRetValue* rv) { + HeteroGraphRef hg = args[0]; + dgl_type_t etype = args[1]; + dgl_id_t vid = args[2]; + *rv = static_cast(hg->OutDegree(etype, vid)); + }); + +DGL_REGISTER_GLOBAL("heterograph_index._CAPI_DGLHeteroOutDegrees") + .set_body([](DGLArgs args, DGLRetValue* rv) { + HeteroGraphRef hg = args[0]; + dgl_type_t etype = args[1]; + IdArray vids = args[2]; + *rv = hg->OutDegrees(etype, vids); + }); + +DGL_REGISTER_GLOBAL("heterograph_index._CAPI_DGLHeteroGetAdj") + .set_body([](DGLArgs args, DGLRetValue* rv) { + HeteroGraphRef hg = args[0]; + dgl_type_t etype = args[1]; + bool transpose = args[2]; + std::string fmt = args[3]; + *rv = ConvertNDArrayVectorToPackedFunc(hg->GetAdj(etype, transpose, fmt)); + }); + +DGL_REGISTER_GLOBAL("heterograph_index._CAPI_DGLHeteroVertexSubgraph") + .set_body([](DGLArgs args, DGLRetValue* rv) { + HeteroGraphRef hg = args[0]; + List vids = args[1]; + std::vector vid_vec; + vid_vec.reserve(vids.size()); + for (Value val : vids) { + vid_vec.push_back(val->data); + } + std::shared_ptr subg( + new HeteroSubgraph(hg->VertexSubgraph(vid_vec))); + *rv = HeteroSubgraphRef(subg); + }); + +DGL_REGISTER_GLOBAL("heterograph_index._CAPI_DGLHeteroEdgeSubgraph") + .set_body([](DGLArgs args, DGLRetValue* rv) { + HeteroGraphRef hg = args[0]; + List eids = args[1]; + bool preserve_nodes = args[2]; + std::vector eid_vec; + eid_vec.reserve(eids.size()); + for (Value val : eids) { + eid_vec.push_back(val->data); + } + std::shared_ptr subg( + new HeteroSubgraph(hg->EdgeSubgraph(eid_vec, preserve_nodes))); + *rv = HeteroSubgraphRef(subg); + }); + +///////////////////////// HeteroSubgraph members ///////////////////////// + +DGL_REGISTER_GLOBAL("heterograph_index._CAPI_DGLHeteroSubgraphGetGraph") + .set_body([](DGLArgs args, DGLRetValue* rv) { + HeteroSubgraphRef subg = args[0]; + *rv = HeteroGraphRef(subg->graph); + }); + +DGL_REGISTER_GLOBAL( + "heterograph_index._CAPI_DGLHeteroSubgraphGetInducedVertices") + .set_body([](DGLArgs args, DGLRetValue* 
rv) { + HeteroSubgraphRef subg = args[0]; + List induced_verts; + for (IdArray arr : subg->induced_vertices) { + induced_verts.push_back(Value(MakeValue(arr))); + } + *rv = induced_verts; + }); + +DGL_REGISTER_GLOBAL("heterograph_index._CAPI_DGLHeteroSubgraphGetInducedEdges") + .set_body([](DGLArgs args, DGLRetValue* rv) { + HeteroSubgraphRef subg = args[0]; + List induced_edges; + for (IdArray arr : subg->induced_edges) { + induced_edges.push_back(Value(MakeValue(arr))); + } + *rv = induced_edges; + }); + +///////////////////////// Global functions and algorithms +//////////////////////////// + +DGL_REGISTER_GLOBAL("heterograph_index._CAPI_DGLHeteroAsNumBits") + .set_body([](DGLArgs args, DGLRetValue* rv) { + HeteroGraphRef hg = args[0]; + int bits = args[1]; + HeteroGraphPtr bhg_ptr = hg.sptr(); + auto hg_ptr = std::dynamic_pointer_cast(bhg_ptr); + HeteroGraphPtr hg_new; + if (hg_ptr) { + hg_new = HeteroGraph::AsNumBits(hg_ptr, bits); + } else { + hg_new = UnitGraph::AsNumBits(bhg_ptr, bits); + } + *rv = HeteroGraphRef(hg_new); + }); + +DGL_REGISTER_GLOBAL("heterograph_index._CAPI_DGLHeteroCopyTo") + .set_body([](DGLArgs args, DGLRetValue* rv) { + HeteroGraphRef hg = args[0]; + int device_type = args[1]; + int device_id = args[2]; + DGLContext ctx; + ctx.device_type = static_cast(device_type); + ctx.device_id = device_id; + HeteroGraphPtr hg_new = HeteroGraph::CopyTo(hg.sptr(), ctx); + *rv = HeteroGraphRef(hg_new); + }); + +DGL_REGISTER_GLOBAL("heterograph_index._CAPI_DGLHeteroPinMemory") + .set_body([](DGLArgs args, DGLRetValue* rv) { + HeteroGraphRef hg = args[0]; + HeteroGraphPtr hg_new = HeteroGraph::PinMemory(hg.sptr()); + *rv = HeteroGraphRef(hg_new); + }); + +DGL_REGISTER_GLOBAL("heterograph_index._CAPI_DGLHeteroPinMemory_") + .set_body([](DGLArgs args, DGLRetValue* rv) { + HeteroGraphRef hg = args[0]; + auto hgindex = std::dynamic_pointer_cast(hg.sptr()); + hgindex->PinMemory_(); + *rv = hg; + }); + +DGL_REGISTER_GLOBAL("heterograph_index._CAPI_DGLHeteroUnpinMemory_") + .set_body([](DGLArgs args, DGLRetValue* rv) { + HeteroGraphRef hg = args[0]; + auto hgindex = std::dynamic_pointer_cast(hg.sptr()); + hgindex->UnpinMemory_(); + *rv = hg; + }); + +DGL_REGISTER_GLOBAL("heterograph_index._CAPI_DGLHeteroRecordStream") + .set_body([](DGLArgs args, DGLRetValue* rv) { + HeteroGraphRef hg = args[0]; + DGLStreamHandle stream = args[1]; + auto hgindex = std::dynamic_pointer_cast(hg.sptr()); + hgindex->RecordStream(stream); + *rv = hg; + }); + +DGL_REGISTER_GLOBAL("heterograph_index._CAPI_DGLHeteroCopyToSharedMem") + .set_body([](DGLArgs args, DGLRetValue* rv) { + HeteroGraphRef hg = args[0]; + std::string name = args[1]; + List ntypes = args[2]; + List etypes = args[3]; + List fmts = args[4]; + auto ntypes_vec = ListValueToVector(ntypes); + auto etypes_vec = ListValueToVector(etypes); + std::set fmts_set; + for (const auto& fmt : fmts) { + std::string fmt_data = fmt->data; + fmts_set.insert(fmt_data); + } + auto hg_share = HeteroGraph::CopyToSharedMem( + hg.sptr(), name, ntypes_vec, etypes_vec, fmts_set); + *rv = HeteroGraphRef(hg_share); + }); + +DGL_REGISTER_GLOBAL("heterograph_index._CAPI_DGLHeteroCreateFromSharedMem") + .set_body([](DGLArgs args, DGLRetValue* rv) { + std::string name = args[0]; + HeteroGraphPtr hg; + std::vector ntypes; + std::vector etypes; + std::tie(hg, ntypes, etypes) = HeteroGraph::CreateFromSharedMem(name); + List ntypes_list; + List etypes_list; + for (const auto& ntype : ntypes) + ntypes_list.push_back(Value(MakeValue(ntype))); + for (const auto& etype : 
etypes) + etypes_list.push_back(Value(MakeValue(etype))); + List ret; + ret.push_back(HeteroGraphRef(hg)); + ret.push_back(ntypes_list); + ret.push_back(etypes_list); + *rv = ret; + }); + +DGL_REGISTER_GLOBAL("heterograph_index._CAPI_DGLHeteroJointUnion") + .set_body([](DGLArgs args, DGLRetValue* rv) { + GraphRef meta_graph = args[0]; + List component_graphs = args[1]; + CHECK(component_graphs.size() > 1) + << "Expect graph list to have at least two graphs"; + std::vector component_ptrs; + component_ptrs.reserve(component_graphs.size()); + const int64_t bits = component_graphs[0]->NumBits(); + const DGLContext ctx = component_graphs[0]->Context(); + for (const auto& component : component_graphs) { + component_ptrs.push_back(component.sptr()); + CHECK_EQ(component->NumBits(), bits) + << "Expect graphs to joint union have the same index dtype(int" + << bits << "), but got int" << component->NumBits(); + CHECK_EQ(component->Context(), ctx) + << "Expect graphs to joint union have the same context" << ctx + << "), but got " << component->Context(); + } + + auto hgptr = JointUnionHeteroGraph(meta_graph.sptr(), component_ptrs); + *rv = HeteroGraphRef(hgptr); + }); + +DGL_REGISTER_GLOBAL("heterograph_index._CAPI_DGLHeteroDisjointUnion_v2") + .set_body([](DGLArgs args, DGLRetValue* rv) { + GraphRef meta_graph = args[0]; + List component_graphs = args[1]; + CHECK(component_graphs.size() > 0) + << "Expect graph list has at least one graph"; + std::vector component_ptrs; + component_ptrs.reserve(component_graphs.size()); + const int64_t bits = component_graphs[0]->NumBits(); + const DGLContext ctx = component_graphs[0]->Context(); + for (const auto& component : component_graphs) { + component_ptrs.push_back(component.sptr()); + CHECK_EQ(component->NumBits(), bits) + << "Expect graphs to batch have the same index dtype(int" << bits + << "), but got int" << component->NumBits(); + CHECK_EQ(component->Context(), ctx) + << "Expect graphs to batch have the same context" << ctx + << "), but got " << component->Context(); + } + + auto hgptr = DisjointUnionHeteroGraph2(meta_graph.sptr(), component_ptrs); + *rv = HeteroGraphRef(hgptr); + }); + +DGL_REGISTER_GLOBAL( + "heterograph_index._CAPI_DGLHeteroDisjointPartitionBySizes_v2") + .set_body([](DGLArgs args, DGLRetValue* rv) { + HeteroGraphRef hg = args[0]; + const IdArray vertex_sizes = args[1]; + const IdArray edge_sizes = args[2]; + std::vector ret; + ret = DisjointPartitionHeteroBySizes2( + hg->meta_graph(), hg.sptr(), vertex_sizes, edge_sizes); + List ret_list; + for (HeteroGraphPtr hgptr : ret) { + ret_list.push_back(HeteroGraphRef(hgptr)); + } + *rv = ret_list; + }); + +DGL_REGISTER_GLOBAL("heterograph_index._CAPI_DGLHeteroDisjointPartitionBySizes") + .set_body([](DGLArgs args, DGLRetValue* rv) { + HeteroGraphRef hg = args[0]; + const IdArray vertex_sizes = args[1]; + const IdArray edge_sizes = args[2]; + const int64_t bits = hg->NumBits(); + std::vector ret; + ATEN_ID_BITS_SWITCH(bits, IdType, { + ret = DisjointPartitionHeteroBySizes( + hg->meta_graph(), hg.sptr(), vertex_sizes, edge_sizes); + }); + List ret_list; + for (HeteroGraphPtr hgptr : ret) { + ret_list.push_back(HeteroGraphRef(hgptr)); + } + *rv = ret_list; + }); + +DGL_REGISTER_GLOBAL("heterograph_index._CAPI_DGLHeteroSlice") + .set_body([](DGLArgs args, DGLRetValue* rv) { + HeteroGraphRef hg = args[0]; + const IdArray num_nodes_per_type = args[1]; + const IdArray start_nid_per_type = args[2]; + const IdArray num_edges_per_type = args[3]; + const IdArray start_eid_per_type = args[4]; + auto 
hgptr = SliceHeteroGraph( + hg->meta_graph(), hg.sptr(), num_nodes_per_type, start_nid_per_type, + num_edges_per_type, start_eid_per_type); + *rv = HeteroGraphRef(hgptr); + }); + +DGL_REGISTER_GLOBAL("heterograph_index._CAPI_DGLHeteroGetCreatedFormats") + .set_body([](DGLArgs args, DGLRetValue* rv) { + HeteroGraphRef hg = args[0]; + List format_list; + dgl_format_code_t code = hg->GetRelationGraph(0)->GetCreatedFormats(); + for (auto format : CodeToSparseFormats(code)) { + format_list.push_back(Value(MakeValue(ToStringSparseFormat(format)))); + } + *rv = format_list; + }); + +DGL_REGISTER_GLOBAL("heterograph_index._CAPI_DGLHeteroGetAllowedFormats") + .set_body([](DGLArgs args, DGLRetValue* rv) { + HeteroGraphRef hg = args[0]; + List format_list; + dgl_format_code_t code = hg->GetRelationGraph(0)->GetAllowedFormats(); + for (auto format : CodeToSparseFormats(code)) { + format_list.push_back(Value(MakeValue(ToStringSparseFormat(format)))); + } + *rv = format_list; + }); + +DGL_REGISTER_GLOBAL("heterograph_index._CAPI_DGLHeteroCreateFormat") + .set_body([](DGLArgs args, DGLRetValue* rv) { + HeteroGraphRef hg = args[0]; + dgl_format_code_t code = hg->GetRelationGraph(0)->GetAllowedFormats(); + auto get_format_f = [&](size_t etype_b, size_t etype_e) { + for (auto etype = etype_b; etype < etype_e; ++etype) { + auto bg = + std::dynamic_pointer_cast(hg->GetRelationGraph(etype)); + for (auto format : CodeToSparseFormats(code)) bg->GetFormat(format); + } + }; + +#if !(defined(DGL_USE_CUDA)) + runtime::parallel_for(0, hg->NumEdgeTypes(), get_format_f); +#else + get_format_f(0, hg->NumEdgeTypes()); +#endif + }); + +DGL_REGISTER_GLOBAL("heterograph_index._CAPI_DGLHeteroGetFormatGraph") + .set_body([](DGLArgs args, DGLRetValue* rv) { + HeteroGraphRef hg = args[0]; + List formats = args[1]; + std::vector formats_vec; + for (Value val : formats) { + std::string fmt = val->data; + formats_vec.push_back(ParseSparseFormat(fmt)); + } + auto hgptr = hg->GetGraphInFormat(SparseFormatsToCode(formats_vec)); + *rv = HeteroGraphRef(hgptr); + }); + +DGL_REGISTER_GLOBAL("subgraph._CAPI_DGLInSubgraph") + .set_body([](DGLArgs args, DGLRetValue* rv) { + HeteroGraphRef hg = args[0]; + const auto& nodes = ListValueToVector(args[1]); + bool relabel_nodes = args[2]; + std::shared_ptr ret(new HeteroSubgraph); + *ret = InEdgeGraph(hg.sptr(), nodes, relabel_nodes); + *rv = HeteroGraphRef(ret); + }); + +DGL_REGISTER_GLOBAL("subgraph._CAPI_DGLOutSubgraph") + .set_body([](DGLArgs args, DGLRetValue* rv) { + HeteroGraphRef hg = args[0]; + const auto& nodes = ListValueToVector(args[1]); + bool relabel_nodes = args[2]; + std::shared_ptr ret(new HeteroSubgraph); + *ret = OutEdgeGraph(hg.sptr(), nodes, relabel_nodes); + *rv = HeteroGraphRef(ret); + }); + +DGL_REGISTER_GLOBAL("transform._CAPI_DGLAsImmutableGraph") + .set_body([](DGLArgs args, DGLRetValue* rv) { + HeteroGraphRef hg = args[0]; + *rv = GraphRef(hg->AsImmutableGraph()); + }); + +DGL_REGISTER_GLOBAL("transform._CAPI_DGLHeteroSortOutEdges") + .set_body([](DGLArgs args, DGLRetValue* rv) { + HeteroGraphRef hg = args[0]; + NDArray tag = args[1]; + int64_t num_tag = args[2]; + + CHECK_EQ(hg->Context().device_type, kDGLCPU) + << "Only support sorting by tag on cpu"; + CHECK(aten::IsValidIdArray(tag)); + CHECK_EQ(tag->ctx.device_type, kDGLCPU) + << "Only support sorting by tag on cpu"; + + const auto csr = hg->GetCSRMatrix(0); + + NDArray tag_pos = aten::NullArray(); + aten::CSRMatrix output; + std::tie(output, tag_pos) = aten::CSRSortByTag(csr, tag, num_tag); + HeteroGraphPtr 
output_hg = + CreateFromCSR(hg->NumVertexTypes(), output, ALL_CODE); + List ret; + ret.push_back(HeteroGraphRef(output_hg)); + ret.push_back(Value(MakeValue(tag_pos))); + *rv = ret; + }); + +DGL_REGISTER_GLOBAL("transform._CAPI_DGLHeteroSortInEdges") + .set_body([](DGLArgs args, DGLRetValue* rv) { + HeteroGraphRef hg = args[0]; + NDArray tag = args[1]; + int64_t num_tag = args[2]; + + CHECK_EQ(hg->Context().device_type, kDGLCPU) + << "Only support sorting by tag on cpu"; + CHECK(aten::IsValidIdArray(tag)); + CHECK_EQ(tag->ctx.device_type, kDGLCPU) + << "Only support sorting by tag on cpu"; + + const auto csc = hg->GetCSCMatrix(0); + + NDArray tag_pos = aten::NullArray(); + aten::CSRMatrix output; + std::tie(output, tag_pos) = aten::CSRSortByTag(csc, tag, num_tag); + + HeteroGraphPtr output_hg = + CreateFromCSC(hg->NumVertexTypes(), output, ALL_CODE); + List ret; + ret.push_back(HeteroGraphRef(output_hg)); + ret.push_back(Value(MakeValue(tag_pos))); + *rv = ret; + }); + +DGL_REGISTER_GLOBAL("heterograph._CAPI_DGLFindSrcDstNtypes") + .set_body([](DGLArgs args, DGLRetValue* rv) { + GraphRef metagraph = args[0]; + std::unordered_set dst_set; + std::unordered_set src_set; + + for (uint64_t eid = 0; eid < metagraph->NumEdges(); ++eid) { + auto edge = metagraph->FindEdge(eid); + auto src = edge.first; + auto dst = edge.second; + dst_set.insert(dst); + src_set.insert(src); + } + + List srclist, dstlist; + List> ret_list; + for (uint64_t nid = 0; nid < metagraph->NumVertices(); ++nid) { + auto is_dst = dst_set.count(nid); + auto is_src = src_set.count(nid); + if (is_dst && is_src) + return; + else if (is_dst) + dstlist.push_back(Value(MakeValue(static_cast(nid)))); + else + // If a node type is isolated, put it in srctype as defined in the + // Python docstring. 
+ srclist.push_back(Value(MakeValue(static_cast(nid)))); + } + ret_list.push_back(srclist); + ret_list.push_back(dstlist); + *rv = ret_list; + }); + +DGL_REGISTER_GLOBAL("heterograph_index._CAPI_DGLHeteroReverse") + .set_body([](DGLArgs args, DGLRetValue* rv) { + HeteroGraphRef hg = args[0]; + CHECK_GT(hg->NumEdgeTypes(), 0); + auto g = std::dynamic_pointer_cast(hg.sptr()); + std::vector rev_ugs; + const auto& ugs = g->relation_graphs(); + rev_ugs.resize(ugs.size()); + + for (size_t i = 0; i < ugs.size(); ++i) { + const auto& rev_ug = ugs[i]->Reverse(); + rev_ugs[i] = rev_ug; + } + // node types are not changed + const auto& num_nodes = g->NumVerticesPerType(); + const auto& meta_edges = hg->meta_graph()->Edges("eid"); + // reverse the metagraph + const auto& rev_meta = ImmutableGraph::CreateFromCOO( + hg->meta_graph()->NumVertices(), meta_edges.dst, meta_edges.src); + *rv = CreateHeteroGraph(rev_meta, rev_ugs, num_nodes); + }); +} // namespace dgl diff --git a/src/graph/sampling/randomwalks/frequency_hashmap.cu b/src/graph/sampling/randomwalks/frequency_hashmap.cu index feb88b4c86e8..d805be0d8f97 100644 --- a/src/graph/sampling/randomwalks/frequency_hashmap.cu +++ b/src/graph/sampling/randomwalks/frequency_hashmap.cu @@ -1,3 +1,4 @@ +#include "hip/hip_runtime.h" /** * Copyright (c) 2021 by Contributors * @file graph/sampling/frequency_hashmap.cu @@ -5,7 +6,7 @@ */ #include -#include // NOLINT +#include // NOLINT #include #include @@ -71,7 +72,7 @@ __global__ void _count_frequency( } } - using BlockReduce = typename cub::BlockReduce; + using BlockReduce = typename hipcub::BlockReduce; __shared__ typename BlockReduce::TempStorage temp_space; count = BlockReduce(temp_space).Sum(count); @@ -112,7 +113,7 @@ __global__ void _compact_frequency( int64_t last_idx = start_idx + TILE_SIZE; const IdxType block_offset = edge_blocks_prefix[blockIdx.x]; - using BlockScan = typename cub::BlockScan; + using BlockScan = typename hipcub::BlockScan; __shared__ typename BlockScan::TempStorage temp_space; BlockPrefixCallbackOp prefix_op(0); @@ -246,7 +247,7 @@ inline __device__ IdxType DeviceEdgeHashmap::GetEdgeCount( template FrequencyHashmap::FrequencyHashmap( int64_t num_dst, int64_t num_items_each_dst, DGLContext ctx, - cudaStream_t stream, int64_t edge_table_scale) { + hipStream_t stream, int64_t edge_table_scale) { _ctx = ctx; _stream = stream; num_items_each_dst = _table_size(num_items_each_dst, edge_table_scale); @@ -259,7 +260,7 @@ FrequencyHashmap::FrequencyHashmap( constexpr int TILE_SIZE = BLOCK_SIZE * 8; dim3 block(BLOCK_SIZE); dim3 grid((num_dst * num_items_each_dst + TILE_SIZE - 1) / TILE_SIZE); - CUDA_CALL(cudaMemset(dst_unique_edges, 0, (num_dst) * sizeof(IdxType))); + CUDA_CALL(hipMemset(dst_unique_edges, 0, (num_dst) * sizeof(IdxType))); CUDA_KERNEL_CALL( (_init_edge_table), grid, block, 0, _stream, edge_hashmap, (num_dst * num_items_each_dst)); @@ -300,7 +301,7 @@ std::tuple FrequencyHashmap::Topk( // _edge_hashmap bool *is_first_position = static_cast( device->AllocWorkspace(_ctx, sizeof(bool) * (num_edges))); - CUDA_CALL(cudaMemset(is_first_position, 0, sizeof(bool) * (num_edges))); + CUDA_CALL(hipMemset(is_first_position, 0, sizeof(bool) * (num_edges))); // double space to use ExclusiveSum auto edge_blocks_prefix_data = static_cast(device->AllocWorkspace( _ctx, 2 * sizeof(IdxType) * (num_edge_blocks + 1))); @@ -327,11 +328,11 @@ std::tuple FrequencyHashmap::Topk( // 2.1 ExclusiveSum the edge_blocks_prefix void *d_temp_storage = nullptr; size_t temp_storage_bytes = 0; - 
CUDA_CALL(cub::DeviceScan::ExclusiveSum( + CUDA_CALL(hipcub::DeviceScan::ExclusiveSum( d_temp_storage, temp_storage_bytes, edge_blocks_prefix, edge_blocks_prefix_alternate, num_edge_blocks + 1, _stream)); d_temp_storage = device->AllocWorkspace(_ctx, temp_storage_bytes); - CUDA_CALL(cub::DeviceScan::ExclusiveSum( + CUDA_CALL(hipcub::DeviceScan::ExclusiveSum( d_temp_storage, temp_storage_bytes, edge_blocks_prefix, edge_blocks_prefix_alternate, num_edge_blocks + 1, _stream)); device->FreeWorkspace(_ctx, d_temp_storage); @@ -365,19 +366,19 @@ std::tuple FrequencyHashmap::Topk( // 3.1 ExclusiveSum the num_unique_each_node d_temp_storage = nullptr; temp_storage_bytes = 0; - CUDA_CALL(cub::DeviceScan::ExclusiveSum( + CUDA_CALL(hipcub::DeviceScan::ExclusiveSum( d_temp_storage, temp_storage_bytes, num_unique_each_node, num_unique_each_node_alternate, num_dst_nodes + 1, _stream)); d_temp_storage = device->AllocWorkspace(_ctx, temp_storage_bytes); - CUDA_CALL(cub::DeviceScan::ExclusiveSum( + CUDA_CALL(hipcub::DeviceScan::ExclusiveSum( d_temp_storage, temp_storage_bytes, num_unique_each_node, num_unique_each_node_alternate, num_dst_nodes + 1, _stream)); device->FreeWorkspace(_ctx, d_temp_storage); // 3.2 SegmentedRadixSort the unique_src_edges and unique_frequency // Create a set of DoubleBuffers to wrap pairs of device pointers - cub::DoubleBuffer d_unique_frequency( + hipcub::DoubleBuffer d_unique_frequency( unique_frequency, unique_frequency_alternate); - cub::DoubleBuffer d_unique_src_edges( + hipcub::DoubleBuffer d_unique_src_edges( unique_src_edges, unique_src_edges_alternate); // Determine temporary device storage requirements d_temp_storage = nullptr; @@ -385,12 +386,12 @@ std::tuple FrequencyHashmap::Topk( // the DeviceRadixSort is faster than DeviceSegmentedRadixSort, // especially when num_dst_nodes is large (about ~10000) if (dtype.bits == 32) { - CUDA_CALL(cub::DeviceRadixSort::SortPairsDescending( + CUDA_CALL(hipcub::DeviceRadixSort::SortPairsDescending( d_temp_storage, temp_storage_bytes, d_unique_frequency, d_unique_src_edges, num_unique_edges, 0, sizeof(Idx64Type) * 8, _stream)); } else { - CUDA_CALL(cub::DeviceSegmentedRadixSort::SortPairsDescending( + CUDA_CALL(hipcub::DeviceSegmentedRadixSort::SortPairsDescending( d_temp_storage, temp_storage_bytes, d_unique_frequency, d_unique_src_edges, num_unique_edges, num_dst_nodes, num_unique_each_node_alternate, num_unique_each_node_alternate + 1, 0, @@ -398,12 +399,12 @@ std::tuple FrequencyHashmap::Topk( } d_temp_storage = device->AllocWorkspace(_ctx, temp_storage_bytes); if (dtype.bits == 32) { - CUDA_CALL(cub::DeviceRadixSort::SortPairsDescending( + CUDA_CALL(hipcub::DeviceRadixSort::SortPairsDescending( d_temp_storage, temp_storage_bytes, d_unique_frequency, d_unique_src_edges, num_unique_edges, 0, sizeof(Idx64Type) * 8, _stream)); } else { - CUDA_CALL(cub::DeviceSegmentedRadixSort::SortPairsDescending( + CUDA_CALL(hipcub::DeviceSegmentedRadixSort::SortPairsDescending( d_temp_storage, temp_storage_bytes, d_unique_frequency, d_unique_src_edges, num_unique_edges, num_dst_nodes, num_unique_each_node_alternate, num_unique_each_node_alternate + 1, 0, @@ -422,11 +423,11 @@ std::tuple FrequencyHashmap::Topk( // use unique_output_offsets; d_temp_storage = nullptr; temp_storage_bytes = 0; - CUDA_CALL(cub::DeviceScan::ExclusiveSum( + CUDA_CALL(hipcub::DeviceScan::ExclusiveSum( d_temp_storage, temp_storage_bytes, num_unique_each_node, unique_output_offsets, num_dst_nodes + 1, _stream)); d_temp_storage = device->AllocWorkspace(_ctx, 
temp_storage_bytes); - CUDA_CALL(cub::DeviceScan::ExclusiveSum( + CUDA_CALL(hipcub::DeviceScan::ExclusiveSum( d_temp_storage, temp_storage_bytes, num_unique_each_node, unique_output_offsets, num_dst_nodes + 1, _stream)); device->FreeWorkspace(_ctx, d_temp_storage); diff --git a/src/graph/sampling/randomwalks/frequency_hashmap.cu.prehip b/src/graph/sampling/randomwalks/frequency_hashmap.cu.prehip new file mode 100644 index 000000000000..feb88b4c86e8 --- /dev/null +++ b/src/graph/sampling/randomwalks/frequency_hashmap.cu.prehip @@ -0,0 +1,471 @@ +/** + * Copyright (c) 2021 by Contributors + * @file graph/sampling/frequency_hashmap.cu + * @brief frequency hashmap - used to select top-k frequency edges of each node + */ + +#include +#include // NOLINT +#include +#include + +#include "../../../array/cuda/atomic.cuh" +#include "../../../runtime/cuda/cuda_common.h" +#include "frequency_hashmap.cuh" + +namespace dgl { + +namespace sampling { + +namespace impl { + +namespace { + +int64_t _table_size(const int64_t num, const int64_t scale) { + /** + * Calculate the number of buckets in the hashtable. To guarantee we can + * fill the hashtable in the worst case, we must use a number of buckets which + * is a power of two. + * https://en.wikipedia.org/wiki/Quadratic_probing#Limitations + */ + const int64_t next_pow2 = 1 << static_cast(1 + std::log2(num >> 1)); + return next_pow2 << scale; +} + +template +__global__ void _init_edge_table(void *edge_hashmap, int64_t edges_len) { + using EdgeItem = typename DeviceEdgeHashmap::EdgeItem; + auto edge_hashmap_t = static_cast(edge_hashmap); + int64_t start_idx = (blockIdx.x * TILE_SIZE) + threadIdx.x; + int64_t last_idx = start_idx + TILE_SIZE; +#pragma unroll(4) + for (int64_t idx = start_idx; idx < last_idx; idx += BLOCK_SIZE) { + if (idx < edges_len) { + EdgeItem *edge = (edge_hashmap_t + idx); + edge->src = static_cast(-1); + edge->cnt = static_cast(0); + } + } +} + +template +__global__ void _count_frequency( + const IdxType *src_data, const int64_t num_edges, + const int64_t num_edges_per_node, IdxType *edge_blocks_prefix, + bool *is_first_position, DeviceEdgeHashmap device_edge_hashmap) { + int64_t start_idx = (blockIdx.x * TILE_SIZE) + threadIdx.x; + int64_t last_idx = start_idx + TILE_SIZE; + + IdxType count = 0; + for (int64_t idx = start_idx; idx < last_idx; idx += BLOCK_SIZE) { + if (idx < num_edges) { + IdxType src = src_data[idx]; + if (src == static_cast(-1)) { + continue; + } + IdxType dst_idx = (idx / num_edges_per_node); + if (device_edge_hashmap.InsertEdge(src, dst_idx) == 0) { + is_first_position[idx] = true; + ++count; + } + } + } + + using BlockReduce = typename cub::BlockReduce; + __shared__ typename BlockReduce::TempStorage temp_space; + + count = BlockReduce(temp_space).Sum(count); + if (threadIdx.x == 0) { + edge_blocks_prefix[blockIdx.x] = count; + if (blockIdx.x == 0) { + edge_blocks_prefix[gridDim.x] = 0; + } + } +} + +/** + * This structure is used with cub's block-level prefixscan in order to + * keep a running sum as items are iteratively processed. 
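(Aside: a hedged, minimal CUDA kernel showing how this kind of prefix callback is used. Each tile's aggregate is fed back through the functor so consecutive tiles scanned by one block form a single running exclusive prefix sum; the cub spelling is shown, and the hipified copy earlier in this patch uses hipcub with the same interface. The kernel and its names are illustrative, not DGL code.)

#include <cub/cub.cuh>

// Running-total functor: thread 0 of the block invokes it once per tile with
// the tile aggregate and receives the offset accumulated so far.
struct RunningTotal {
  int total;
  __device__ explicit RunningTotal(int t) : total(t) {}
  __device__ int operator()(int block_aggregate) {
    const int old = total;
    total += block_aggregate;
    return old;
  }
};

template <int BLOCK_SIZE, int TILE_SIZE>
__global__ void compact_positions(const bool *flags, int n, int *positions) {
  using BlockScan = cub::BlockScan<int, BLOCK_SIZE>;
  __shared__ typename BlockScan::TempStorage temp;
  RunningTotal prefix_op(0);

  const int start = blockIdx.x * TILE_SIZE + threadIdx.x;
  for (int idx = start; idx < start + TILE_SIZE; idx += BLOCK_SIZE) {
    int flag = (idx < n && flags[idx]) ? 1 : 0;
    // Exclusive sum within the tile, offset by the running total of all
    // earlier tiles handled by this block; temp storage is reused, hence
    // the barrier below.
    BlockScan(temp).ExclusiveSum(flag, flag, prefix_op);
    __syncthreads();
    // flag now holds this element's rank among flagged items within the
    // block's tile range (a per-block offset would make it global).
    if (idx < n && flags[idx]) positions[idx] = flag;
  }
}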
+ */ +template +struct BlockPrefixCallbackOp { + T _running_total; + + __device__ BlockPrefixCallbackOp(const T running_total) + : _running_total(running_total) {} + + __device__ T operator()(const T block_aggregate) { + const T old_prefix = _running_total; + _running_total += block_aggregate; + return old_prefix; + } +}; + +template +__global__ void _compact_frequency( + const IdxType *src_data, const IdxType *dst_data, const int64_t num_edges, + const int64_t num_edges_per_node, const IdxType *edge_blocks_prefix, + const bool *is_first_position, IdxType *num_unique_each_node, + IdxType *unique_src_edges, Idx64Type *unique_frequency, + DeviceEdgeHashmap device_edge_hashmap) { + int64_t start_idx = (blockIdx.x * TILE_SIZE) + threadIdx.x; + int64_t last_idx = start_idx + TILE_SIZE; + const IdxType block_offset = edge_blocks_prefix[blockIdx.x]; + + using BlockScan = typename cub::BlockScan; + __shared__ typename BlockScan::TempStorage temp_space; + BlockPrefixCallbackOp prefix_op(0); + + for (int64_t idx = start_idx; idx < last_idx; idx += BLOCK_SIZE) { + IdxType flag = 0; + if (idx < num_edges) { + IdxType src = src_data[idx]; + IdxType dst_idx = (idx / num_edges_per_node); + if (idx % num_edges_per_node == 0) { + num_unique_each_node[dst_idx] = + device_edge_hashmap.GetDstCount(dst_idx); + } + if (is_first_position[idx] == true) { + flag = 1; + } + BlockScan(temp_space).ExclusiveSum(flag, flag, prefix_op); + __syncthreads(); + if (is_first_position[idx] == true) { + const IdxType pos = (block_offset + flag); + unique_src_edges[pos] = src; + if (sizeof(IdxType) != sizeof(Idx64Type) && + sizeof(IdxType) == 4) { // if IdxType is a 32-bit data + unique_frequency[pos] = + ((static_cast(num_edges / num_edges_per_node - dst_idx) + << 32) | + device_edge_hashmap.GetEdgeCount(src, dst_idx)); + } else { + unique_frequency[pos] = + device_edge_hashmap.GetEdgeCount(src, dst_idx); + } + } + } + } +} + +template +__global__ void _get_pick_num( + IdxType *num_unique_each_node, const int64_t num_pick, + const int64_t num_dst_nodes) { + int64_t start_idx = (blockIdx.x * TILE_SIZE) + threadIdx.x; + int64_t last_idx = start_idx + TILE_SIZE; +#pragma unroll(4) + for (int64_t idx = start_idx; idx < last_idx; idx += BLOCK_SIZE) { + if (idx < num_dst_nodes) { + IdxType &num_unique = num_unique_each_node[idx]; + num_unique = min(num_unique, static_cast(num_pick)); + } + } +} + +template +__global__ void _pick_data( + const Idx64Type *unique_frequency, const IdxType *unique_src_edges, + const IdxType *unique_input_offsets, const IdxType *dst_data, + const int64_t num_edges_per_node, const int64_t num_dst_nodes, + const int64_t num_edges, const IdxType *unique_output_offsets, + IdxType *output_src, IdxType *output_dst, IdxType *output_frequency) { + int64_t start_idx = (blockIdx.x * TILE_SIZE) + threadIdx.x; + int64_t last_idx = start_idx + TILE_SIZE; + + for (int64_t idx = start_idx; idx < last_idx; idx += BLOCK_SIZE) { + if (idx < num_dst_nodes) { + const int64_t dst_pos = (idx * num_edges_per_node); + assert(dst_pos < num_edges); + const IdxType dst = dst_data[dst_pos]; + const IdxType last_output_offset = unique_output_offsets[idx + 1]; + assert( + (last_output_offset - unique_output_offsets[idx]) <= + (unique_input_offsets[idx + 1] - unique_input_offsets[idx])); + for (IdxType output_idx = unique_output_offsets[idx], + input_idx = unique_input_offsets[idx]; + output_idx < last_output_offset; ++output_idx, ++input_idx) { + output_src[output_idx] = unique_src_edges[input_idx]; + output_dst[output_idx] = dst; + 
output_frequency[output_idx] = + static_cast(unique_frequency[input_idx]); + } + } + } +} + +} // namespace + +// return the old cnt of this edge +template +inline __device__ IdxType DeviceEdgeHashmap::InsertEdge( + const IdxType &src, const IdxType &dst_idx) { + IdxType start_off = dst_idx * _num_items_each_dst; + IdxType pos = EdgeHash(src); + IdxType delta = 1; + IdxType old_cnt = static_cast(-1); + while (true) { + IdxType old_src = dgl::aten::cuda::AtomicCAS( + &_edge_hashmap[start_off + pos].src, static_cast(-1), src); + if (old_src == static_cast(-1) || old_src == src) { + // first insert + old_cnt = dgl::aten::cuda::AtomicAdd( + &_edge_hashmap[start_off + pos].cnt, static_cast(1)); + if (old_src == static_cast(-1)) { + assert(dst_idx < _num_dst); + dgl::aten::cuda::AtomicAdd( + &_dst_unique_edges[dst_idx], static_cast(1)); + } + break; + } + pos = EdgeHash(pos + delta); + delta += 1; + } + return old_cnt; +} + +template +inline __device__ IdxType +DeviceEdgeHashmap::GetDstCount(const IdxType &dst_idx) { + return _dst_unique_edges[dst_idx]; +} + +template +inline __device__ IdxType DeviceEdgeHashmap::GetEdgeCount( + const IdxType &src, const IdxType &dst_idx) { + IdxType start_off = dst_idx * _num_items_each_dst; + IdxType pos = EdgeHash(src); + IdxType delta = 1; + while (_edge_hashmap[start_off + pos].src != src) { + pos = EdgeHash(pos + delta); + delta += 1; + } + return _edge_hashmap[start_off + pos].cnt; +} + +template +FrequencyHashmap::FrequencyHashmap( + int64_t num_dst, int64_t num_items_each_dst, DGLContext ctx, + cudaStream_t stream, int64_t edge_table_scale) { + _ctx = ctx; + _stream = stream; + num_items_each_dst = _table_size(num_items_each_dst, edge_table_scale); + auto device = dgl::runtime::DeviceAPI::Get(_ctx); + auto dst_unique_edges = static_cast( + device->AllocWorkspace(_ctx, (num_dst) * sizeof(IdxType))); + auto edge_hashmap = static_cast(device->AllocWorkspace( + _ctx, (num_dst * num_items_each_dst) * sizeof(EdgeItem))); + constexpr int BLOCK_SIZE = 256; + constexpr int TILE_SIZE = BLOCK_SIZE * 8; + dim3 block(BLOCK_SIZE); + dim3 grid((num_dst * num_items_each_dst + TILE_SIZE - 1) / TILE_SIZE); + CUDA_CALL(cudaMemset(dst_unique_edges, 0, (num_dst) * sizeof(IdxType))); + CUDA_KERNEL_CALL( + (_init_edge_table), grid, block, 0, + _stream, edge_hashmap, (num_dst * num_items_each_dst)); + _device_edge_hashmap = new DeviceEdgeHashmap( + num_dst, num_items_each_dst, dst_unique_edges, edge_hashmap); + _dst_unique_edges = dst_unique_edges; + _edge_hashmap = edge_hashmap; +} + +template +FrequencyHashmap::~FrequencyHashmap() { + auto device = dgl::runtime::DeviceAPI::Get(_ctx); + delete _device_edge_hashmap; + _device_edge_hashmap = nullptr; + device->FreeWorkspace(_ctx, _dst_unique_edges); + _dst_unique_edges = nullptr; + device->FreeWorkspace(_ctx, _edge_hashmap); + _edge_hashmap = nullptr; +} + +template +std::tuple FrequencyHashmap::Topk( + const IdxType *src_data, const IdxType *dst_data, DGLDataType dtype, + const int64_t num_edges, const int64_t num_edges_per_node, + const int64_t num_pick) { + using Idx64Type = int64_t; + const int64_t num_dst_nodes = (num_edges / num_edges_per_node); + constexpr int BLOCK_SIZE = 256; + // XXX: a experienced value, best performance in GV100 + constexpr int TILE_SIZE = BLOCK_SIZE * 32; + const dim3 block(BLOCK_SIZE); + const dim3 edges_grid((num_edges + TILE_SIZE - 1) / TILE_SIZE); + auto device = dgl::runtime::DeviceAPI::Get(_ctx); + const IdxType num_edge_blocks = static_cast(edges_grid.x); + IdxType num_unique_edges = 0; + + 
// to mark if this position of edges is the first inserting position for + // _edge_hashmap + bool *is_first_position = static_cast( + device->AllocWorkspace(_ctx, sizeof(bool) * (num_edges))); + CUDA_CALL(cudaMemset(is_first_position, 0, sizeof(bool) * (num_edges))); + // double space to use ExclusiveSum + auto edge_blocks_prefix_data = static_cast(device->AllocWorkspace( + _ctx, 2 * sizeof(IdxType) * (num_edge_blocks + 1))); + IdxType *edge_blocks_prefix = edge_blocks_prefix_data; + IdxType *edge_blocks_prefix_alternate = + (edge_blocks_prefix_data + (num_edge_blocks + 1)); + // triple space to use ExclusiveSum and unique_output_offsets + auto num_unique_each_node_data = static_cast( + device->AllocWorkspace(_ctx, 3 * sizeof(IdxType) * (num_dst_nodes + 1))); + IdxType *num_unique_each_node = num_unique_each_node_data; + IdxType *num_unique_each_node_alternate = + (num_unique_each_node_data + (num_dst_nodes + 1)); + IdxType *unique_output_offsets = + (num_unique_each_node_data + 2 * (num_dst_nodes + 1)); + + // 1. Scan the all edges and count the unique edges and unique edges for each + // dst node + CUDA_KERNEL_CALL( + (_count_frequency), edges_grid, block, 0, + _stream, src_data, num_edges, num_edges_per_node, edge_blocks_prefix, + is_first_position, *_device_edge_hashmap); + + // 2. Compact the unique edges frequency + // 2.1 ExclusiveSum the edge_blocks_prefix + void *d_temp_storage = nullptr; + size_t temp_storage_bytes = 0; + CUDA_CALL(cub::DeviceScan::ExclusiveSum( + d_temp_storage, temp_storage_bytes, edge_blocks_prefix, + edge_blocks_prefix_alternate, num_edge_blocks + 1, _stream)); + d_temp_storage = device->AllocWorkspace(_ctx, temp_storage_bytes); + CUDA_CALL(cub::DeviceScan::ExclusiveSum( + d_temp_storage, temp_storage_bytes, edge_blocks_prefix, + edge_blocks_prefix_alternate, num_edge_blocks + 1, _stream)); + device->FreeWorkspace(_ctx, d_temp_storage); + std::swap(edge_blocks_prefix, edge_blocks_prefix_alternate); + device->CopyDataFromTo( + &edge_blocks_prefix[num_edge_blocks], 0, &num_unique_edges, 0, + sizeof(num_unique_edges), _ctx, DGLContext{kDGLCPU, 0}, dtype); + device->StreamSync(_ctx, _stream); + // 2.2 Allocate the data of unique edges and frequency + // double space to use SegmentedRadixSort + auto unique_src_edges_data = static_cast( + device->AllocWorkspace(_ctx, 2 * sizeof(IdxType) * (num_unique_edges))); + IdxType *unique_src_edges = unique_src_edges_data; + IdxType *unique_src_edges_alternate = + unique_src_edges_data + num_unique_edges; + // double space to use SegmentedRadixSort + auto unique_frequency_data = static_cast( + device->AllocWorkspace(_ctx, 2 * sizeof(Idx64Type) * (num_unique_edges))); + Idx64Type *unique_frequency = unique_frequency_data; + Idx64Type *unique_frequency_alternate = + unique_frequency_data + num_unique_edges; + // 2.3 Compact the unique edges and their frequency + CUDA_KERNEL_CALL( + (_compact_frequency), + edges_grid, block, 0, _stream, src_data, dst_data, num_edges, + num_edges_per_node, edge_blocks_prefix, is_first_position, + num_unique_each_node, unique_src_edges, unique_frequency, + *_device_edge_hashmap); + + // 3. 
SegmentedRadixSort the unique edges and unique_frequency + // 3.1 ExclusiveSum the num_unique_each_node + d_temp_storage = nullptr; + temp_storage_bytes = 0; + CUDA_CALL(cub::DeviceScan::ExclusiveSum( + d_temp_storage, temp_storage_bytes, num_unique_each_node, + num_unique_each_node_alternate, num_dst_nodes + 1, _stream)); + d_temp_storage = device->AllocWorkspace(_ctx, temp_storage_bytes); + CUDA_CALL(cub::DeviceScan::ExclusiveSum( + d_temp_storage, temp_storage_bytes, num_unique_each_node, + num_unique_each_node_alternate, num_dst_nodes + 1, _stream)); + device->FreeWorkspace(_ctx, d_temp_storage); + // 3.2 SegmentedRadixSort the unique_src_edges and unique_frequency + // Create a set of DoubleBuffers to wrap pairs of device pointers + cub::DoubleBuffer d_unique_frequency( + unique_frequency, unique_frequency_alternate); + cub::DoubleBuffer d_unique_src_edges( + unique_src_edges, unique_src_edges_alternate); + // Determine temporary device storage requirements + d_temp_storage = nullptr; + temp_storage_bytes = 0; + // the DeviceRadixSort is faster than DeviceSegmentedRadixSort, + // especially when num_dst_nodes is large (about ~10000) + if (dtype.bits == 32) { + CUDA_CALL(cub::DeviceRadixSort::SortPairsDescending( + d_temp_storage, temp_storage_bytes, d_unique_frequency, + d_unique_src_edges, num_unique_edges, 0, sizeof(Idx64Type) * 8, + _stream)); + } else { + CUDA_CALL(cub::DeviceSegmentedRadixSort::SortPairsDescending( + d_temp_storage, temp_storage_bytes, d_unique_frequency, + d_unique_src_edges, num_unique_edges, num_dst_nodes, + num_unique_each_node_alternate, num_unique_each_node_alternate + 1, 0, + sizeof(Idx64Type) * 8, _stream)); + } + d_temp_storage = device->AllocWorkspace(_ctx, temp_storage_bytes); + if (dtype.bits == 32) { + CUDA_CALL(cub::DeviceRadixSort::SortPairsDescending( + d_temp_storage, temp_storage_bytes, d_unique_frequency, + d_unique_src_edges, num_unique_edges, 0, sizeof(Idx64Type) * 8, + _stream)); + } else { + CUDA_CALL(cub::DeviceSegmentedRadixSort::SortPairsDescending( + d_temp_storage, temp_storage_bytes, d_unique_frequency, + d_unique_src_edges, num_unique_edges, num_dst_nodes, + num_unique_each_node_alternate, num_unique_each_node_alternate + 1, 0, + sizeof(Idx64Type) * 8, _stream)); + } + device->FreeWorkspace(_ctx, d_temp_storage); + + // 4. Get the final pick number for each dst node + // 4.1 Reset the min(num_pick, num_unique_each_node) to num_unique_each_node + constexpr int NODE_TILE_SIZE = BLOCK_SIZE * 2; + const dim3 nodes_grid((num_dst_nodes + NODE_TILE_SIZE - 1) / NODE_TILE_SIZE); + CUDA_KERNEL_CALL( + (_get_pick_num), nodes_grid, block, + 0, _stream, num_unique_each_node, num_pick, num_dst_nodes); + // 4.2 ExclusiveSum the new num_unique_each_node as unique_output_offsets + // use unique_output_offsets; + d_temp_storage = nullptr; + temp_storage_bytes = 0; + CUDA_CALL(cub::DeviceScan::ExclusiveSum( + d_temp_storage, temp_storage_bytes, num_unique_each_node, + unique_output_offsets, num_dst_nodes + 1, _stream)); + d_temp_storage = device->AllocWorkspace(_ctx, temp_storage_bytes); + CUDA_CALL(cub::DeviceScan::ExclusiveSum( + d_temp_storage, temp_storage_bytes, num_unique_each_node, + unique_output_offsets, num_dst_nodes + 1, _stream)); + device->FreeWorkspace(_ctx, d_temp_storage); + + // 5. 
Pick the data to result + IdxType num_output = 0; + device->CopyDataFromTo( + &unique_output_offsets[num_dst_nodes], 0, &num_output, 0, + sizeof(num_output), _ctx, DGLContext{kDGLCPU, 0}, dtype); + device->StreamSync(_ctx, _stream); + + IdArray res_src = + IdArray::Empty({static_cast(num_output)}, dtype, _ctx); + IdArray res_dst = + IdArray::Empty({static_cast(num_output)}, dtype, _ctx); + IdArray res_cnt = + IdArray::Empty({static_cast(num_output)}, dtype, _ctx); + CUDA_KERNEL_CALL( + (_pick_data), nodes_grid, + block, 0, _stream, d_unique_frequency.Current(), + d_unique_src_edges.Current(), num_unique_each_node_alternate, dst_data, + num_edges_per_node, num_dst_nodes, num_edges, unique_output_offsets, + res_src.Ptr(), res_dst.Ptr(), res_cnt.Ptr()); + + device->FreeWorkspace(_ctx, is_first_position); + device->FreeWorkspace(_ctx, edge_blocks_prefix_data); + device->FreeWorkspace(_ctx, num_unique_each_node_data); + device->FreeWorkspace(_ctx, unique_src_edges_data); + device->FreeWorkspace(_ctx, unique_frequency_data); + + return std::make_tuple(res_src, res_dst, res_cnt); +} + +template class FrequencyHashmap; + +template class FrequencyHashmap; + +}; // namespace impl + +}; // namespace sampling + +}; // namespace dgl diff --git a/src/graph/sampling/randomwalks/frequency_hashmap.cuh b/src/graph/sampling/randomwalks/frequency_hashmap.cuh index 3c1a1c0649b6..622bcc59671e 100644 --- a/src/graph/sampling/randomwalks/frequency_hashmap.cuh +++ b/src/graph/sampling/randomwalks/frequency_hashmap.cuh @@ -56,7 +56,7 @@ class FrequencyHashmap { FrequencyHashmap() = delete; FrequencyHashmap( int64_t num_dst, int64_t num_items_each_dst, DGLContext ctx, - cudaStream_t stream, int64_t edge_table_scale = kDefaultEdgeTableScale); + hipStream_t stream, int64_t edge_table_scale = kDefaultEdgeTableScale); ~FrequencyHashmap(); using EdgeItem = typename DeviceEdgeHashmap::EdgeItem; std::tuple Topk( @@ -66,7 +66,7 @@ class FrequencyHashmap { private: DGLContext _ctx; - cudaStream_t _stream; + hipStream_t _stream; DeviceEdgeHashmap *_device_edge_hashmap; IdxType *_dst_unique_edges; EdgeItem *_edge_hashmap; diff --git a/src/graph/sampling/randomwalks/frequency_hashmap.cuh.prehip b/src/graph/sampling/randomwalks/frequency_hashmap.cuh.prehip new file mode 100644 index 000000000000..3c1a1c0649b6 --- /dev/null +++ b/src/graph/sampling/randomwalks/frequency_hashmap.cuh.prehip @@ -0,0 +1,79 @@ +/** + * Copyright (c) 2021 by Contributors + * @file graph/sampling/frequency_hashmap.cuh + * @brief frequency hashmap - used to select top-k frequency edges of each node + */ + +#ifndef DGL_GRAPH_SAMPLING_RANDOMWALKS_FREQUENCY_HASHMAP_CUH_ +#define DGL_GRAPH_SAMPLING_RANDOMWALKS_FREQUENCY_HASHMAP_CUH_ + +#include +#include + +#include + +namespace dgl { +namespace sampling { +namespace impl { + +template +class DeviceEdgeHashmap { + public: + struct EdgeItem { + IdxType src; + IdxType cnt; + }; + DeviceEdgeHashmap() = delete; + DeviceEdgeHashmap( + int64_t num_dst, int64_t num_items_each_dst, IdxType *dst_unique_edges, + EdgeItem *edge_hashmap) + : _num_dst(num_dst), + _num_items_each_dst(num_items_each_dst), + _dst_unique_edges(dst_unique_edges), + _edge_hashmap(edge_hashmap) {} + // return the old cnt of this edge + inline __device__ IdxType + InsertEdge(const IdxType &src, const IdxType &dst_idx); + inline __device__ IdxType GetDstCount(const IdxType &dst_idx); + inline __device__ IdxType + GetEdgeCount(const IdxType &src, const IdxType &dst_idx); + + private: + int64_t _num_dst; + int64_t _num_items_each_dst; + IdxType 
*_dst_unique_edges; + EdgeItem *_edge_hashmap; + + inline __device__ IdxType EdgeHash(const IdxType &id) const { + return id % _num_items_each_dst; + } +}; + +template +class FrequencyHashmap { + public: + static constexpr int64_t kDefaultEdgeTableScale = 3; + FrequencyHashmap() = delete; + FrequencyHashmap( + int64_t num_dst, int64_t num_items_each_dst, DGLContext ctx, + cudaStream_t stream, int64_t edge_table_scale = kDefaultEdgeTableScale); + ~FrequencyHashmap(); + using EdgeItem = typename DeviceEdgeHashmap::EdgeItem; + std::tuple Topk( + const IdxType *src_data, const IdxType *dst_data, DGLDataType dtype, + const int64_t num_edges, const int64_t num_edges_per_node, + const int64_t num_pick); + + private: + DGLContext _ctx; + cudaStream_t _stream; + DeviceEdgeHashmap *_device_edge_hashmap; + IdxType *_dst_unique_edges; + EdgeItem *_edge_hashmap; +}; + +}; // namespace impl +}; // namespace sampling +}; // namespace dgl + +#endif // DGL_GRAPH_SAMPLING_RANDOMWALKS_FREQUENCY_HASHMAP_CUH_ diff --git a/src/graph/sampling/randomwalks/get_node_types_gpu.cu b/src/graph/sampling/randomwalks/get_node_types_gpu.cu index 79e8d2596e9a..72d2addb6e26 100644 --- a/src/graph/sampling/randomwalks/get_node_types_gpu.cu +++ b/src/graph/sampling/randomwalks/get_node_types_gpu.cu @@ -4,7 +4,7 @@ * @brief DGL sampler */ -#include +#include #include #include #include diff --git a/src/graph/sampling/randomwalks/get_node_types_gpu.cu.prehip b/src/graph/sampling/randomwalks/get_node_types_gpu.cu.prehip new file mode 100644 index 000000000000..79e8d2596e9a --- /dev/null +++ b/src/graph/sampling/randomwalks/get_node_types_gpu.cu.prehip @@ -0,0 +1,72 @@ +/** + * Copyright (c) 2021 by Contributors + * @file graph/sampling/get_node_types_gpu.cu + * @brief DGL sampler + */ + +#include +#include +#include +#include + +#include + +#include "randomwalks_impl.h" + +namespace dgl { + +using namespace dgl::runtime; +using namespace dgl::aten; + +namespace sampling { + +namespace impl { + +template +TypeArray GetNodeTypesFromMetapath( + const HeteroGraphPtr hg, const TypeArray metapath) { + uint64_t num_etypes = metapath->shape[0]; + + auto cpu_ctx = DGLContext{kDGLCPU, 0}; + auto metapath_ctx = metapath->ctx; + auto stream = DeviceAPI::Get(metapath_ctx)->GetStream(); + + TypeArray h_result = + TypeArray::Empty({metapath->shape[0] + 1}, metapath->dtype, cpu_ctx); + auto h_result_data = h_result.Ptr(); + + auto h_metapath = metapath.CopyTo(cpu_ctx); + DeviceAPI::Get(metapath_ctx)->StreamSync(metapath_ctx, stream); + const IdxType *h_metapath_data = h_metapath.Ptr(); + + dgl_type_t curr_type = hg->GetEndpointTypes(h_metapath_data[0]).first; + h_result_data[0] = curr_type; + + for (uint64_t i = 0; i < num_etypes; ++i) { + auto src_dst_type = hg->GetEndpointTypes(h_metapath_data[i]); + dgl_type_t srctype = src_dst_type.first; + dgl_type_t dsttype = src_dst_type.second; + + if (srctype != curr_type) { + LOG(FATAL) << "source of edge type #" << i + << " does not match destination of edge type #" << i - 1; + } + curr_type = dsttype; + h_result_data[i + 1] = dsttype; + } + + auto result = h_result.CopyTo(metapath->ctx); + DeviceAPI::Get(metapath_ctx)->StreamSync(metapath_ctx, stream); + return result; +} + +template TypeArray GetNodeTypesFromMetapath( + const HeteroGraphPtr hg, const TypeArray metapath); +template TypeArray GetNodeTypesFromMetapath( + const HeteroGraphPtr hg, const TypeArray metapath); + +}; // namespace impl + +}; // namespace sampling + +}; // namespace dgl diff --git 
a/src/graph/sampling/randomwalks/randomwalk_gpu.cu b/src/graph/sampling/randomwalks/randomwalk_gpu.cu index bb7a48906acf..7e822b961476 100644 --- a/src/graph/sampling/randomwalks/randomwalk_gpu.cu +++ b/src/graph/sampling/randomwalks/randomwalk_gpu.cu @@ -1,16 +1,17 @@ +#include "hip/hip_runtime.h" /** * Copyright (c) 2021-2022 by Contributors * @file graph/sampling/randomwalk_gpu.cu * @brief CUDA random walk sampleing */ -#include +#include #include #include #include #include -#include +#include #include #include #include @@ -48,10 +49,10 @@ __global__ void _RandomWalkKernel( int64_t last_idx = min(static_cast(blockIdx.x + 1) * TILE_SIZE, num_seeds); int64_t trace_length = (max_num_steps + 1); - curandState rng; + hiprandState rng; // reference: - // https://docs.nvidia.com/cuda/curand/device-api-overview.html#performance-notes - curand_init(rand_seed + idx, 0, 0, &rng); + // https://docs.nvidia.com/cuda/hiprand/device-api-overview.html#performance-notes + hiprand_init(rand_seed + idx, 0, 0, &rng); while (idx < last_idx) { IdType curr = seed_data[idx]; @@ -68,18 +69,18 @@ __global__ void _RandomWalkKernel( if (deg == 0) { // the degree is zero break; } - const int64_t num = curand(&rng) % deg; + const int64_t num = hiprand(&rng) % deg; IdType pick = graph.in_cols[in_row_start + num]; IdType eid = (graph.data ? graph.data[in_row_start + num] : in_row_start + num); *traces_data_ptr = pick; *eids_data_ptr = eid; if ((restart_prob_size > 1) && - (curand_uniform(&rng) < restart_prob_data[step_idx])) { + (hiprand_uniform(&rng) < restart_prob_data[step_idx])) { break; } else if ( (restart_prob_size == 1) && - (curand_uniform(&rng) < restart_prob_data[0])) { + (hiprand_uniform(&rng) < restart_prob_data[0])) { break; } ++traces_data_ptr; @@ -107,10 +108,10 @@ __global__ void _RandomWalkBiasedKernel( int64_t last_idx = min(static_cast(blockIdx.x + 1) * TILE_SIZE, num_seeds); int64_t trace_length = (max_num_steps + 1); - curandState rng; + hiprandState rng; // reference: - // https://docs.nvidia.com/cuda/curand/device-api-overview.html#performance-notes - curand_init(rand_seed + idx, 0, 0, &rng); + // https://docs.nvidia.com/cuda/hiprand/device-api-overview.html#performance-notes + hiprand_init(rand_seed + idx, 0, 0, &rng); while (idx < last_idx) { IdType curr = seed_data[idx]; @@ -133,9 +134,9 @@ __global__ void _RandomWalkBiasedKernel( const FloatType *prob = probs[metapath_id]; int64_t num; if (prob == nullptr) { - num = curand(&rng) % deg; + num = hiprand(&rng) % deg; } else { - auto rnd_sum_w = prob_sum[curr] * curand_uniform(&rng); + auto rnd_sum_w = prob_sum[curr] * hiprand_uniform(&rng); FloatType sum_w{0.}; for (num = 0; num < deg; ++num) { sum_w += prob[in_row_start + num]; @@ -149,11 +150,11 @@ __global__ void _RandomWalkBiasedKernel( *traces_data_ptr = pick; *eids_data_ptr = eid; if ((restart_prob_size > 1) && - (curand_uniform(&rng) < restart_prob_data[step_idx])) { + (hiprand_uniform(&rng) < restart_prob_data[step_idx])) { break; } else if ( (restart_prob_size == 1) && - (curand_uniform(&rng) < restart_prob_data[0])) { + (hiprand_uniform(&rng) < restart_prob_data[0])) { break; } ++traces_data_ptr; @@ -202,7 +203,7 @@ std::pair RandomWalkUniform( : nullptr); } // use cuda stream from local thread - cudaStream_t stream = runtime::getCurrentCUDAStream(); + hipStream_t stream = runtime::getCurrentCUDAStream(); auto device = DeviceAPI::Get(ctx); auto d_graphs = static_cast *>(device->AllocWorkspace( ctx, (num_etypes) * sizeof(GraphKernelData))); @@ -263,7 +264,7 @@ std::pair 
RandomWalkBiased( IdType *traces_data = traces.Ptr(); IdType *eids_data = eids.Ptr(); - cudaStream_t stream = runtime::getCurrentCUDAStream(); + hipStream_t stream = runtime::getCurrentCUDAStream(); auto device = DeviceAPI::Get(ctx); // new probs and prob sums pointers assert(num_etypes == static_cast(prob.size())); @@ -297,11 +298,11 @@ std::pair RandomWalkBiased( // calculate the sum of the neighbor weights const IdType *d_offsets = static_cast(csr.indptr->data); size_t temp_storage_size = 0; - CUDA_CALL(cub::DeviceSegmentedReduce::Sum( + CUDA_CALL(hipcub::DeviceSegmentedReduce::Sum( nullptr, temp_storage_size, probs[etype], prob_sums[etype], num_segments, d_offsets, d_offsets + 1, stream)); void *temp_storage = device->AllocWorkspace(ctx, temp_storage_size); - CUDA_CALL(cub::DeviceSegmentedReduce::Sum( + CUDA_CALL(hipcub::DeviceSegmentedReduce::Sum( temp_storage, temp_storage_size, probs[etype], prob_sums[etype], num_segments, d_offsets, d_offsets + 1, stream)); device->FreeWorkspace(ctx, temp_storage); @@ -396,7 +397,7 @@ std::pair RandomWalkWithRestart( auto device = dgl::runtime::DeviceAPI::Get(device_ctx); // use cuda stream from local thread - cudaStream_t stream = runtime::getCurrentCUDAStream(); + hipStream_t stream = runtime::getCurrentCUDAStream(); device->CopyDataFromTo( &restart_prob, 0, restart_prob_array.Ptr(), 0, sizeof(double), DGLContext{kDGLCPU, 0}, device_ctx, restart_prob_array->dtype); @@ -449,7 +450,7 @@ std::tuple SelectPinSageNeighbors( const int64_t num_dst_nodes = (dst->shape[0] / num_samples_per_node); auto ctx = src->ctx; // use cuda stream from local thread - cudaStream_t stream = runtime::getCurrentCUDAStream(); + hipStream_t stream = runtime::getCurrentCUDAStream(); auto frequency_hashmap = FrequencyHashmap( num_dst_nodes, num_samples_per_node, ctx, stream); auto ret = frequency_hashmap.Topk( diff --git a/src/graph/sampling/randomwalks/randomwalk_gpu.cu.prehip b/src/graph/sampling/randomwalks/randomwalk_gpu.cu.prehip new file mode 100644 index 000000000000..bb7a48906acf --- /dev/null +++ b/src/graph/sampling/randomwalks/randomwalk_gpu.cu.prehip @@ -0,0 +1,496 @@ +/** + * Copyright (c) 2021-2022 by Contributors + * @file graph/sampling/randomwalk_gpu.cu + * @brief CUDA random walk sampleing + */ + +#include +#include +#include +#include +#include + +#include +#include +#include +#include + +#include "../../../runtime/cuda/cuda_common.h" +#include "frequency_hashmap.cuh" + +namespace dgl { + +using namespace dgl::runtime; +using namespace dgl::aten; + +namespace sampling { + +namespace impl { + +namespace { + +template +struct GraphKernelData { + const IdType *in_ptr; + const IdType *in_cols; + const IdType *data; +}; + +template +__global__ void _RandomWalkKernel( + const uint64_t rand_seed, const IdType *seed_data, const int64_t num_seeds, + const IdType *metapath_data, const uint64_t max_num_steps, + const GraphKernelData *graphs, const FloatType *restart_prob_data, + const int64_t restart_prob_size, const int64_t max_nodes, + IdType *out_traces_data, IdType *out_eids_data) { + assert(BLOCK_SIZE == blockDim.x); + int64_t idx = blockIdx.x * TILE_SIZE + threadIdx.x; + int64_t last_idx = + min(static_cast(blockIdx.x + 1) * TILE_SIZE, num_seeds); + int64_t trace_length = (max_num_steps + 1); + curandState rng; + // reference: + // https://docs.nvidia.com/cuda/curand/device-api-overview.html#performance-notes + curand_init(rand_seed + idx, 0, 0, &rng); + + while (idx < last_idx) { + IdType curr = seed_data[idx]; + assert(curr < max_nodes); + IdType 
*traces_data_ptr = &out_traces_data[idx * trace_length]; + IdType *eids_data_ptr = &out_eids_data[idx * max_num_steps]; + *(traces_data_ptr++) = curr; + int64_t step_idx; + for (step_idx = 0; step_idx < max_num_steps; ++step_idx) { + IdType metapath_id = metapath_data[step_idx]; + const GraphKernelData &graph = graphs[metapath_id]; + const int64_t in_row_start = graph.in_ptr[curr]; + const int64_t deg = graph.in_ptr[curr + 1] - graph.in_ptr[curr]; + if (deg == 0) { // the degree is zero + break; + } + const int64_t num = curand(&rng) % deg; + IdType pick = graph.in_cols[in_row_start + num]; + IdType eid = + (graph.data ? graph.data[in_row_start + num] : in_row_start + num); + *traces_data_ptr = pick; + *eids_data_ptr = eid; + if ((restart_prob_size > 1) && + (curand_uniform(&rng) < restart_prob_data[step_idx])) { + break; + } else if ( + (restart_prob_size == 1) && + (curand_uniform(&rng) < restart_prob_data[0])) { + break; + } + ++traces_data_ptr; + ++eids_data_ptr; + curr = pick; + } + for (; step_idx < max_num_steps; ++step_idx) { + *(traces_data_ptr++) = -1; + *(eids_data_ptr++) = -1; + } + idx += BLOCK_SIZE; + } +} + +template +__global__ void _RandomWalkBiasedKernel( + const uint64_t rand_seed, const IdType *seed_data, const int64_t num_seeds, + const IdType *metapath_data, const uint64_t max_num_steps, + const GraphKernelData *graphs, const FloatType **probs, + const FloatType **prob_sums, const FloatType *restart_prob_data, + const int64_t restart_prob_size, const int64_t max_nodes, + IdType *out_traces_data, IdType *out_eids_data) { + assert(BLOCK_SIZE == blockDim.x); + int64_t idx = blockIdx.x * TILE_SIZE + threadIdx.x; + int64_t last_idx = + min(static_cast(blockIdx.x + 1) * TILE_SIZE, num_seeds); + int64_t trace_length = (max_num_steps + 1); + curandState rng; + // reference: + // https://docs.nvidia.com/cuda/curand/device-api-overview.html#performance-notes + curand_init(rand_seed + idx, 0, 0, &rng); + + while (idx < last_idx) { + IdType curr = seed_data[idx]; + assert(curr < max_nodes); + IdType *traces_data_ptr = &out_traces_data[idx * trace_length]; + IdType *eids_data_ptr = &out_eids_data[idx * max_num_steps]; + *(traces_data_ptr++) = curr; + int64_t step_idx; + for (step_idx = 0; step_idx < max_num_steps; ++step_idx) { + IdType metapath_id = metapath_data[step_idx]; + const GraphKernelData &graph = graphs[metapath_id]; + const int64_t in_row_start = graph.in_ptr[curr]; + const int64_t deg = graph.in_ptr[curr + 1] - graph.in_ptr[curr]; + if (deg == 0) { // the degree is zero + break; + } + + // randomly select by weight + const FloatType *prob_sum = prob_sums[metapath_id]; + const FloatType *prob = probs[metapath_id]; + int64_t num; + if (prob == nullptr) { + num = curand(&rng) % deg; + } else { + auto rnd_sum_w = prob_sum[curr] * curand_uniform(&rng); + FloatType sum_w{0.}; + for (num = 0; num < deg; ++num) { + sum_w += prob[in_row_start + num]; + if (sum_w >= rnd_sum_w) break; + } + } + + IdType pick = graph.in_cols[in_row_start + num]; + IdType eid = + (graph.data ? 
graph.data[in_row_start + num] : in_row_start + num); + *traces_data_ptr = pick; + *eids_data_ptr = eid; + if ((restart_prob_size > 1) && + (curand_uniform(&rng) < restart_prob_data[step_idx])) { + break; + } else if ( + (restart_prob_size == 1) && + (curand_uniform(&rng) < restart_prob_data[0])) { + break; + } + ++traces_data_ptr; + ++eids_data_ptr; + curr = pick; + } + for (; step_idx < max_num_steps; ++step_idx) { + *(traces_data_ptr++) = -1; + *(eids_data_ptr++) = -1; + } + idx += BLOCK_SIZE; + } +} + +} // namespace + +// random walk for uniform choice +template +std::pair RandomWalkUniform( + const HeteroGraphPtr hg, const IdArray seeds, const TypeArray metapath, + FloatArray restart_prob) { + const int64_t max_num_steps = metapath->shape[0]; + const IdType *metapath_data = static_cast(metapath->data); + const int64_t begin_ntype = + hg->meta_graph()->FindEdge(metapath_data[0]).first; + const int64_t max_nodes = hg->NumVertices(begin_ntype); + int64_t num_etypes = hg->NumEdgeTypes(); + auto ctx = seeds->ctx; + + const IdType *seed_data = static_cast(seeds->data); + CHECK(seeds->ndim == 1) << "seeds shape is not one dimension."; + const int64_t num_seeds = seeds->shape[0]; + int64_t trace_length = max_num_steps + 1; + IdArray traces = IdArray::Empty({num_seeds, trace_length}, seeds->dtype, ctx); + IdArray eids = IdArray::Empty({num_seeds, max_num_steps}, seeds->dtype, ctx); + IdType *traces_data = traces.Ptr(); + IdType *eids_data = eids.Ptr(); + + std::vector> h_graphs(num_etypes); + for (int64_t etype = 0; etype < num_etypes; ++etype) { + const CSRMatrix &csr = hg->GetCSRMatrix(etype); + h_graphs[etype].in_ptr = static_cast(csr.indptr->data); + h_graphs[etype].in_cols = static_cast(csr.indices->data); + h_graphs[etype].data = + (CSRHasData(csr) ? static_cast(csr.data->data) + : nullptr); + } + // use cuda stream from local thread + cudaStream_t stream = runtime::getCurrentCUDAStream(); + auto device = DeviceAPI::Get(ctx); + auto d_graphs = static_cast *>(device->AllocWorkspace( + ctx, (num_etypes) * sizeof(GraphKernelData))); + // copy graph metadata pointers to GPU + device->CopyDataFromTo( + h_graphs.data(), 0, d_graphs, 0, + (num_etypes) * sizeof(GraphKernelData), DGLContext{kDGLCPU, 0}, + ctx, hg->GetCSRMatrix(0).indptr->dtype); + // copy metapath to GPU + auto d_metapath = metapath.CopyTo(ctx); + const IdType *d_metapath_data = static_cast(d_metapath->data); + + constexpr int BLOCK_SIZE = 256; + constexpr int TILE_SIZE = BLOCK_SIZE * 4; + dim3 block(256); + dim3 grid((num_seeds + TILE_SIZE - 1) / TILE_SIZE); + const uint64_t random_seed = RandomEngine::ThreadLocal()->RandInt(1000000000); + ATEN_FLOAT_TYPE_SWITCH( + restart_prob->dtype, FloatType, "random walk GPU kernel", { + CHECK(restart_prob->ctx.device_type == kDGLCUDA) + << "restart prob should be in GPU."; + CHECK(restart_prob->ndim == 1) << "restart prob dimension should be 1."; + const FloatType *restart_prob_data = restart_prob.Ptr(); + const int64_t restart_prob_size = restart_prob->shape[0]; + CUDA_KERNEL_CALL( + (_RandomWalkKernel), grid, + block, 0, stream, random_seed, seed_data, num_seeds, + d_metapath_data, max_num_steps, d_graphs, restart_prob_data, + restart_prob_size, max_nodes, traces_data, eids_data); + }); + + device->FreeWorkspace(ctx, d_graphs); + return std::make_pair(traces, eids); +} + +/** + * @brief Random walk for biased choice. We use inverse transform sampling to + * choose the next step. 
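The biased kernel above draws the next neighbor by inverse transform sampling: a uniform variate is scaled by the precomputed weight sum of the current node, and the walk steps to the first neighbor whose running prefix of weights covers it. Below is a compilable sketch of just that step using the hipRAND device API; the helper name, the toy driver, and the single-thread launch are inventions of this sketch, and the hipRAND header path can differ between ROCm releases:

#include <hip/hip_runtime.h>
#include <hiprand/hiprand_kernel.h>
#include <cstdint>
#include <cstdio>

// Pick index i in [0, deg) with probability weights[i] / sum(weights).
__device__ int64_t PickByWeight(
    const float *weights, int64_t deg, float total, hiprandState *rng) {
  // u is uniform in (0, total]; walk the prefix sums until they cover u.
  float u = total * hiprand_uniform(rng);
  float acc = 0.f;
  int64_t i = 0;
  for (; i < deg; ++i) {
    acc += weights[i];
    if (acc >= u) break;
  }
  return i < deg ? i : deg - 1;  // guard against round-off at the tail
}

__global__ void DemoKernel(const float *weights, int64_t deg, float total,
                           uint64_t seed, int64_t *out) {
  hiprandState rng;
  hiprand_init(seed, /*subsequence=*/0, /*offset=*/0, &rng);
  *out = PickByWeight(weights, deg, total, &rng);
}

int main() {
  const float h_w[4] = {0.1f, 0.2f, 0.3f, 0.4f};
  float *d_w;
  int64_t *d_out;
  hipMalloc(&d_w, sizeof(h_w));
  hipMalloc(&d_out, sizeof(int64_t));
  hipMemcpy(d_w, h_w, sizeof(h_w), hipMemcpyHostToDevice);
  DemoKernel<<<1, 1>>>(d_w, 4, 1.0f, 42ULL, d_out);
  int64_t h_out;
  hipMemcpy(&h_out, d_out, sizeof(h_out), hipMemcpyDeviceToHost);
  std::printf("picked neighbour %lld\n", static_cast<long long>(h_out));
  hipFree(d_w);
  hipFree(d_out);
  return 0;
}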
+ */ +template +std::pair RandomWalkBiased( + const HeteroGraphPtr hg, const IdArray seeds, const TypeArray metapath, + const std::vector &prob, FloatArray restart_prob) { + const int64_t max_num_steps = metapath->shape[0]; + const IdType *metapath_data = static_cast(metapath->data); + const int64_t begin_ntype = + hg->meta_graph()->FindEdge(metapath_data[0]).first; + const int64_t max_nodes = hg->NumVertices(begin_ntype); + int64_t num_etypes = hg->NumEdgeTypes(); + auto ctx = seeds->ctx; + + const IdType *seed_data = static_cast(seeds->data); + CHECK(seeds->ndim == 1) << "seeds shape is not one dimension."; + const int64_t num_seeds = seeds->shape[0]; + int64_t trace_length = max_num_steps + 1; + IdArray traces = IdArray::Empty({num_seeds, trace_length}, seeds->dtype, ctx); + IdArray eids = IdArray::Empty({num_seeds, max_num_steps}, seeds->dtype, ctx); + IdType *traces_data = traces.Ptr(); + IdType *eids_data = eids.Ptr(); + + cudaStream_t stream = runtime::getCurrentCUDAStream(); + auto device = DeviceAPI::Get(ctx); + // new probs and prob sums pointers + assert(num_etypes == static_cast(prob.size())); + std::unique_ptr probs(new FloatType *[prob.size()]); + std::unique_ptr prob_sums(new FloatType *[prob.size()]); + std::vector prob_sums_arr; + prob_sums_arr.reserve(prob.size()); + + // graphs + std::vector> h_graphs(num_etypes); + for (int64_t etype = 0; etype < num_etypes; ++etype) { + const CSRMatrix &csr = hg->GetCSRMatrix(etype); + h_graphs[etype].in_ptr = static_cast(csr.indptr->data); + h_graphs[etype].in_cols = static_cast(csr.indices->data); + h_graphs[etype].data = + (CSRHasData(csr) ? static_cast(csr.data->data) + : nullptr); + + int64_t num_segments = csr.indptr->shape[0] - 1; + // will handle empty probs in the kernel + if (IsNullArray(prob[etype])) { + probs[etype] = nullptr; + prob_sums[etype] = nullptr; + continue; + } + probs[etype] = prob[etype].Ptr(); + prob_sums_arr.push_back( + FloatArray::Empty({num_segments}, prob[etype]->dtype, ctx)); + prob_sums[etype] = prob_sums_arr[etype].Ptr(); + + // calculate the sum of the neighbor weights + const IdType *d_offsets = static_cast(csr.indptr->data); + size_t temp_storage_size = 0; + CUDA_CALL(cub::DeviceSegmentedReduce::Sum( + nullptr, temp_storage_size, probs[etype], prob_sums[etype], + num_segments, d_offsets, d_offsets + 1, stream)); + void *temp_storage = device->AllocWorkspace(ctx, temp_storage_size); + CUDA_CALL(cub::DeviceSegmentedReduce::Sum( + temp_storage, temp_storage_size, probs[etype], prob_sums[etype], + num_segments, d_offsets, d_offsets + 1, stream)); + device->FreeWorkspace(ctx, temp_storage); + } + + // copy graph metadata pointers to GPU + auto d_graphs = static_cast *>(device->AllocWorkspace( + ctx, (num_etypes) * sizeof(GraphKernelData))); + device->CopyDataFromTo( + h_graphs.data(), 0, d_graphs, 0, + (num_etypes) * sizeof(GraphKernelData), DGLContext{kDGLCPU, 0}, + ctx, hg->GetCSRMatrix(0).indptr->dtype); + // copy probs pointers to GPU + const FloatType **probs_dev = static_cast( + device->AllocWorkspace(ctx, num_etypes * sizeof(FloatType *))); + device->CopyDataFromTo( + probs.get(), 0, probs_dev, 0, (num_etypes) * sizeof(FloatType *), + DGLContext{kDGLCPU, 0}, ctx, prob[0]->dtype); + // copy probs_sum pointers to GPU + const FloatType **prob_sums_dev = static_cast( + device->AllocWorkspace(ctx, num_etypes * sizeof(FloatType *))); + device->CopyDataFromTo( + prob_sums.get(), 0, prob_sums_dev, 0, (num_etypes) * sizeof(FloatType *), + DGLContext{kDGLCPU, 0}, ctx, prob[0]->dtype); + // copy metapath to 
GPU + auto d_metapath = metapath.CopyTo(ctx); + const IdType *d_metapath_data = static_cast(d_metapath->data); + + constexpr int BLOCK_SIZE = 256; + constexpr int TILE_SIZE = BLOCK_SIZE * 4; + dim3 block(256); + dim3 grid((num_seeds + TILE_SIZE - 1) / TILE_SIZE); + const uint64_t random_seed = RandomEngine::ThreadLocal()->RandInt(1000000000); + CHECK(restart_prob->ctx.device_type == kDGLCUDA) + << "restart prob should be in GPU."; + CHECK(restart_prob->ndim == 1) << "restart prob dimension should be 1."; + const FloatType *restart_prob_data = restart_prob.Ptr(); + const int64_t restart_prob_size = restart_prob->shape[0]; + CUDA_KERNEL_CALL( + (_RandomWalkBiasedKernel), grid, + block, 0, stream, random_seed, seed_data, num_seeds, d_metapath_data, + max_num_steps, d_graphs, probs_dev, prob_sums_dev, restart_prob_data, + restart_prob_size, max_nodes, traces_data, eids_data); + + device->FreeWorkspace(ctx, d_graphs); + device->FreeWorkspace(ctx, probs_dev); + device->FreeWorkspace(ctx, prob_sums_dev); + return std::make_pair(traces, eids); +} + +template +std::pair RandomWalk( + const HeteroGraphPtr hg, const IdArray seeds, const TypeArray metapath, + const std::vector &prob) { + bool isUniform = true; + for (const auto &etype_prob : prob) { + if (!IsNullArray(etype_prob)) { + isUniform = false; + break; + } + } + + auto restart_prob = + NDArray::Empty({0}, DGLDataType{kDGLFloat, 32, 1}, DGLContext{XPU, 0}); + if (!isUniform) { + std::pair ret; + ATEN_FLOAT_TYPE_SWITCH(prob[0]->dtype, FloatType, "probability", { + ret = RandomWalkBiased( + hg, seeds, metapath, prob, restart_prob); + }); + return ret; + } else { + return RandomWalkUniform(hg, seeds, metapath, restart_prob); + } +} + +template +std::pair RandomWalkWithRestart( + const HeteroGraphPtr hg, const IdArray seeds, const TypeArray metapath, + const std::vector &prob, double restart_prob) { + bool isUniform = true; + for (const auto &etype_prob : prob) { + if (!IsNullArray(etype_prob)) { + isUniform = false; + break; + } + } + + auto device_ctx = seeds->ctx; + auto restart_prob_array = + NDArray::Empty({1}, DGLDataType{kDGLFloat, 64, 1}, device_ctx); + auto device = dgl::runtime::DeviceAPI::Get(device_ctx); + + // use cuda stream from local thread + cudaStream_t stream = runtime::getCurrentCUDAStream(); + device->CopyDataFromTo( + &restart_prob, 0, restart_prob_array.Ptr(), 0, sizeof(double), + DGLContext{kDGLCPU, 0}, device_ctx, restart_prob_array->dtype); + device->StreamSync(device_ctx, stream); + + if (!isUniform) { + std::pair ret; + ATEN_FLOAT_TYPE_SWITCH(prob[0]->dtype, FloatType, "probability", { + ret = RandomWalkBiased( + hg, seeds, metapath, prob, restart_prob_array); + }); + return ret; + } else { + return RandomWalkUniform( + hg, seeds, metapath, restart_prob_array); + } +} + +template +std::pair RandomWalkWithStepwiseRestart( + const HeteroGraphPtr hg, const IdArray seeds, const TypeArray metapath, + const std::vector &prob, FloatArray restart_prob) { + bool isUniform = true; + for (const auto &etype_prob : prob) { + if (!IsNullArray(etype_prob)) { + isUniform = false; + break; + } + } + + if (!isUniform) { + std::pair ret; + ATEN_FLOAT_TYPE_SWITCH(prob[0]->dtype, FloatType, "probability", { + ret = RandomWalkBiased( + hg, seeds, metapath, prob, restart_prob); + }); + return ret; + } else { + return RandomWalkUniform(hg, seeds, metapath, restart_prob); + } +} + +template +std::tuple SelectPinSageNeighbors( + const IdArray src, const IdArray dst, const int64_t num_samples_per_node, + const int64_t k) { + 
CHECK(src->ctx.device_type == kDGLCUDA) << "IdArray needs be on GPU!"; + const IdxType *src_data = src.Ptr(); + const IdxType *dst_data = dst.Ptr(); + const int64_t num_dst_nodes = (dst->shape[0] / num_samples_per_node); + auto ctx = src->ctx; + // use cuda stream from local thread + cudaStream_t stream = runtime::getCurrentCUDAStream(); + auto frequency_hashmap = FrequencyHashmap( + num_dst_nodes, num_samples_per_node, ctx, stream); + auto ret = frequency_hashmap.Topk( + src_data, dst_data, src->dtype, src->shape[0], num_samples_per_node, k); + return ret; +} + +template std::pair RandomWalk( + const HeteroGraphPtr hg, const IdArray seeds, const TypeArray metapath, + const std::vector &prob); +template std::pair RandomWalk( + const HeteroGraphPtr hg, const IdArray seeds, const TypeArray metapath, + const std::vector &prob); + +template std::pair RandomWalkWithRestart( + const HeteroGraphPtr hg, const IdArray seeds, const TypeArray metapath, + const std::vector &prob, double restart_prob); +template std::pair RandomWalkWithRestart( + const HeteroGraphPtr hg, const IdArray seeds, const TypeArray metapath, + const std::vector &prob, double restart_prob); + +template std::pair +RandomWalkWithStepwiseRestart( + const HeteroGraphPtr hg, const IdArray seeds, const TypeArray metapath, + const std::vector &prob, FloatArray restart_prob); +template std::pair +RandomWalkWithStepwiseRestart( + const HeteroGraphPtr hg, const IdArray seeds, const TypeArray metapath, + const std::vector &prob, FloatArray restart_prob); + +template std::tuple +SelectPinSageNeighbors( + const IdArray src, const IdArray dst, const int64_t num_samples_per_node, + const int64_t k); +template std::tuple +SelectPinSageNeighbors( + const IdArray src, const IdArray dst, const int64_t num_samples_per_node, + const int64_t k); + +}; // namespace impl + +}; // namespace sampling + +}; // namespace dgl diff --git a/src/graph/transform/cuda/cuda_compact_graph.cu b/src/graph/transform/cuda/cuda_compact_graph.cu index 359da3f0d41e..576fecba8e01 100644 --- a/src/graph/transform/cuda/cuda_compact_graph.cu +++ b/src/graph/transform/cuda/cuda_compact_graph.cu @@ -18,7 +18,7 @@ * all given graphs with the same set of nodes. */ -#include +#include #include #include @@ -55,10 +55,10 @@ template void BuildNodeMaps( const std::vector &input_nodes, DeviceNodeMap *const node_maps, int64_t *const count_unique_device, - std::vector *const unique_nodes_device, cudaStream_t stream) { + std::vector *const unique_nodes_device, hipStream_t stream) { const int64_t num_ntypes = static_cast(input_nodes.size()); - CUDA_CALL(cudaMemsetAsync( + CUDA_CALL(hipMemsetAsync( count_unique_device, 0, num_ntypes * sizeof(*count_unique_device), stream)); @@ -81,7 +81,7 @@ std::pair, std::vector> CompactGraphsGPU( const std::vector &always_preserve) { const auto &ctx = graphs[0]->Context(); auto device = runtime::DeviceAPI::Get(ctx); - cudaStream_t stream = runtime::getCurrentCUDAStream(); + hipStream_t stream = runtime::getCurrentCUDAStream(); CHECK_EQ(ctx.device_type, kDGLCUDA); diff --git a/src/graph/transform/cuda/cuda_compact_graph.cu.prehip b/src/graph/transform/cuda/cuda_compact_graph.cu.prehip new file mode 100644 index 000000000000..359da3f0d41e --- /dev/null +++ b/src/graph/transform/cuda/cuda_compact_graph.cu.prehip @@ -0,0 +1,247 @@ +/** + * Copyright 2021 Contributors + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + * + * @file graph/transform/cuda/cuda_compact_graph.cu + * @brief Functions to find and eliminate the common isolated nodes across + * all given graphs with the same set of nodes. + */ + +#include +#include +#include + +#include +#include +#include + +#include "../../../runtime/cuda/cuda_common.h" +#include "../../heterograph.h" +#include "../compact.h" +#include "cuda_map_edges.cuh" + +using namespace dgl::aten; +using namespace dgl::runtime::cuda; +using namespace dgl::transform::cuda; + +namespace dgl { +namespace transform { + +namespace { + +/** + * @brief This function builds node maps for each node type, preserving the + * order of the input nodes. Here it is assumed the nodes are not unique, + * and thus a unique list is generated. + * + * @param input_nodes The set of input nodes. + * @param node_maps The node maps to be constructed. + * @param count_unique_device The number of unique nodes (on the GPU). + * @param unique_nodes_device The unique nodes (on the GPU). + * @param stream The stream to operate on. + */ +template +void BuildNodeMaps( + const std::vector &input_nodes, + DeviceNodeMap *const node_maps, int64_t *const count_unique_device, + std::vector *const unique_nodes_device, cudaStream_t stream) { + const int64_t num_ntypes = static_cast(input_nodes.size()); + + CUDA_CALL(cudaMemsetAsync( + count_unique_device, 0, num_ntypes * sizeof(*count_unique_device), + stream)); + + // possibly duplicated nodes + for (int64_t ntype = 0; ntype < num_ntypes; ++ntype) { + const IdArray &nodes = input_nodes[ntype]; + if (nodes->shape[0] > 0) { + CHECK_EQ(nodes->ctx.device_type, kDGLCUDA); + node_maps->LhsHashTable(ntype).FillWithDuplicates( + nodes.Ptr(), nodes->shape[0], + (*unique_nodes_device)[ntype].Ptr(), + count_unique_device + ntype, stream); + } + } +} + +template +std::pair, std::vector> CompactGraphsGPU( + const std::vector &graphs, + const std::vector &always_preserve) { + const auto &ctx = graphs[0]->Context(); + auto device = runtime::DeviceAPI::Get(ctx); + cudaStream_t stream = runtime::getCurrentCUDAStream(); + + CHECK_EQ(ctx.device_type, kDGLCUDA); + + // Step 1: Collect the nodes that has connections for each type. 
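BuildNodeMaps above first clears one counter per node type with cudaMemsetAsync (hipMemsetAsync in the hipified version) on the work stream, then lets each per-type fill bump its own counter; because everything is ordered on that one stream, no synchronization is needed until the counts are copied back later in CompactGraphsGPU. A reduced, self-contained sketch of that idiom follows; the kernel and buffer names are made up, and a plain atomicAdd stands in for DGL's OrderedHashTable fill:

#include <hip/hip_runtime.h>
#include <cstdio>

// Stand-in for the per-type "fill" work: every thread bumps its type's counter.
__global__ void CountPerType(const int *types, int n, int *counters) {
  int i = blockIdx.x * blockDim.x + threadIdx.x;
  if (i < n) atomicAdd(&counters[types[i]], 1);
}

int main() {
  const int kTypes = 3, n = 6;
  const int h_types[n] = {0, 1, 1, 2, 2, 2};
  int *d_types, *d_counters;
  hipMalloc(&d_types, n * sizeof(int));
  hipMalloc(&d_counters, kTypes * sizeof(int));
  hipStream_t stream;
  hipStreamCreate(&stream);
  hipMemcpyAsync(d_types, h_types, n * sizeof(int), hipMemcpyHostToDevice, stream);

  // 1. Zero the counters asynchronously on the work stream.
  hipMemsetAsync(d_counters, 0, kTypes * sizeof(int), stream);
  // 2. Launch the counting kernel on the same stream; stream order guarantees
  //    it sees the zeroed counters.
  CountPerType<<<1, 64, 0, stream>>>(d_types, n, d_counters);
  // 3. Copy the counts back and only now synchronize the stream.
  int h_counters[kTypes];
  hipMemcpyAsync(h_counters, d_counters, kTypes * sizeof(int),
                 hipMemcpyDeviceToHost, stream);
  hipStreamSynchronize(stream);

  for (int t = 0; t < kTypes; ++t) std::printf("type %d: %d\n", t, h_counters[t]);
  hipStreamDestroy(stream);
  hipFree(d_types);
  hipFree(d_counters);
  return 0;
}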
+ const uint64_t num_ntypes = graphs[0]->NumVertexTypes(); + std::vector> all_edges( + graphs.size()); // all_edges[i][etype] + + // count the number of nodes per type + std::vector max_vertex_cnt(num_ntypes, 0); + for (size_t i = 0; i < graphs.size(); ++i) { + const HeteroGraphPtr curr_graph = graphs[i]; + const int64_t num_etypes = curr_graph->NumEdgeTypes(); + + for (IdType etype = 0; etype < num_etypes; ++etype) { + IdType srctype, dsttype; + std::tie(srctype, dsttype) = curr_graph->GetEndpointTypes(etype); + + const int64_t n_edges = curr_graph->NumEdges(etype); + max_vertex_cnt[srctype] += n_edges; + max_vertex_cnt[dsttype] += n_edges; + } + } + + for (size_t i = 0; i < always_preserve.size(); ++i) { + max_vertex_cnt[i] += always_preserve[i]->shape[0]; + } + + // gather all nodes + std::vector all_nodes(num_ntypes); + std::vector node_offsets(num_ntypes, 0); + + for (uint64_t ntype = 0; ntype < num_ntypes; ++ntype) { + all_nodes[ntype] = + NewIdArray(max_vertex_cnt[ntype], ctx, sizeof(IdType) * 8); + // copy the nodes in always_preserve + if (ntype < always_preserve.size() && + always_preserve[ntype]->shape[0] > 0) { + device->CopyDataFromTo( + always_preserve[ntype].Ptr(), 0, + all_nodes[ntype].Ptr(), node_offsets[ntype], + sizeof(IdType) * always_preserve[ntype]->shape[0], + always_preserve[ntype]->ctx, all_nodes[ntype]->ctx, + always_preserve[ntype]->dtype); + node_offsets[ntype] += sizeof(IdType) * always_preserve[ntype]->shape[0]; + } + } + + for (size_t i = 0; i < graphs.size(); ++i) { + const HeteroGraphPtr curr_graph = graphs[i]; + const int64_t num_etypes = curr_graph->NumEdgeTypes(); + + all_edges[i].reserve(num_etypes); + for (int64_t etype = 0; etype < num_etypes; ++etype) { + dgl_type_t srctype, dsttype; + std::tie(srctype, dsttype) = curr_graph->GetEndpointTypes(etype); + + const EdgeArray edges = curr_graph->Edges(etype, "eid"); + + if (edges.src.defined()) { + device->CopyDataFromTo( + edges.src.Ptr(), 0, all_nodes[srctype].Ptr(), + node_offsets[srctype], sizeof(IdType) * edges.src->shape[0], + edges.src->ctx, all_nodes[srctype]->ctx, edges.src->dtype); + node_offsets[srctype] += sizeof(IdType) * edges.src->shape[0]; + } + if (edges.dst.defined()) { + device->CopyDataFromTo( + edges.dst.Ptr(), 0, all_nodes[dsttype].Ptr(), + node_offsets[dsttype], sizeof(IdType) * edges.dst->shape[0], + edges.dst->ctx, all_nodes[dsttype]->ctx, edges.dst->dtype); + node_offsets[dsttype] += sizeof(IdType) * edges.dst->shape[0]; + } + all_edges[i].push_back(edges); + } + } + + // Step 2: Relabel the nodes for each type to a smaller ID space + // using BuildNodeMaps + + // allocate space for map creation + // the hashmap on GPU + DeviceNodeMap node_maps(max_vertex_cnt, 0, ctx, stream); + // number of unique nodes per type on CPU + std::vector num_induced_nodes(num_ntypes); + // number of unique nodes per type on GPU + int64_t *count_unique_device = static_cast( + device->AllocWorkspace(ctx, sizeof(int64_t) * num_ntypes)); + // the set of unique nodes per type + std::vector induced_nodes(num_ntypes); + for (uint64_t ntype = 0; ntype < num_ntypes; ++ntype) { + induced_nodes[ntype] = + NewIdArray(max_vertex_cnt[ntype], ctx, sizeof(IdType) * 8); + } + + BuildNodeMaps( + all_nodes, &node_maps, count_unique_device, &induced_nodes, stream); + + device->CopyDataFromTo( + count_unique_device, 0, num_induced_nodes.data(), 0, + sizeof(*num_induced_nodes.data()) * num_ntypes, ctx, + DGLContext{kDGLCPU, 0}, DGLDataType{kDGLInt, 64, 1}); + device->StreamSync(ctx, stream); + + // wait for the node counts 
to finish transferring + device->FreeWorkspace(ctx, count_unique_device); + + // resize induced nodes + for (uint64_t ntype = 0; ntype < num_ntypes; ++ntype) { + induced_nodes[ntype]->shape[0] = num_induced_nodes[ntype]; + } + + // Step 3: Remap the edges of each graph using MapEdges + std::vector new_graphs; + for (size_t i = 0; i < graphs.size(); ++i) { + const HeteroGraphPtr curr_graph = graphs[i]; + const auto meta_graph = curr_graph->meta_graph(); + const int64_t num_etypes = curr_graph->NumEdgeTypes(); + + std::vector rel_graphs; + rel_graphs.reserve(num_etypes); + + std::vector new_src; + std::vector new_dst; + std::tie(new_src, new_dst) = + MapEdges(curr_graph, all_edges[i], node_maps, stream); + + for (IdType etype = 0; etype < num_etypes; ++etype) { + IdType srctype, dsttype; + std::tie(srctype, dsttype) = curr_graph->GetEndpointTypes(etype); + + rel_graphs.push_back(UnitGraph::CreateFromCOO( + srctype == dsttype ? 1 : 2, induced_nodes[srctype]->shape[0], + induced_nodes[dsttype]->shape[0], new_src[etype], new_dst[etype])); + } + + new_graphs.push_back( + CreateHeteroGraph(meta_graph, rel_graphs, num_induced_nodes)); + } + + return std::make_pair(new_graphs, induced_nodes); +} + +} // namespace + +template <> +std::pair, std::vector> +CompactGraphs( + const std::vector &graphs, + const std::vector &always_preserve) { + return CompactGraphsGPU(graphs, always_preserve); +} + +template <> +std::pair, std::vector> +CompactGraphs( + const std::vector &graphs, + const std::vector &always_preserve) { + return CompactGraphsGPU(graphs, always_preserve); +} + +} // namespace transform +} // namespace dgl diff --git a/src/graph/transform/cuda/cuda_map_edges.cuh b/src/graph/transform/cuda/cuda_map_edges.cuh index 93f1f3e2d927..9ae512f62926 100644 --- a/src/graph/transform/cuda/cuda_map_edges.cuh +++ b/src/graph/transform/cuda/cuda_map_edges.cuh @@ -1,3 +1,4 @@ +#include "hip/hip_runtime.h" /** * Copyright 2020-2022 Contributors * @@ -22,7 +23,7 @@ #include #include -#include +#include #include #include @@ -113,7 +114,7 @@ class DeviceNodeMap { DeviceNodeMap( const std::vector& num_nodes, const int64_t offset, - DGLContext ctx, cudaStream_t stream) + DGLContext ctx, hipStream_t stream) : num_types_(num_nodes.size()), rhs_offset_(offset), hash_tables_(), @@ -185,7 +186,7 @@ inline IdType RoundUp(const IdType num, const size_t unit) { template std::tuple, std::vector> MapEdges( HeteroGraphPtr graph, const std::vector& edge_sets, - const DeviceNodeMap& node_map, cudaStream_t stream) { + const DeviceNodeMap& node_map, hipStream_t stream) { constexpr const int BLOCK_SIZE = 128; constexpr const size_t TILE_SIZE = 1024; diff --git a/src/graph/transform/cuda/cuda_map_edges.cuh.prehip b/src/graph/transform/cuda/cuda_map_edges.cuh.prehip new file mode 100644 index 000000000000..93f1f3e2d927 --- /dev/null +++ b/src/graph/transform/cuda/cuda_map_edges.cuh.prehip @@ -0,0 +1,240 @@ +/** + * Copyright 2020-2022 Contributors + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ * + * @file graph/transform/cuda/cuda_map_edges.cuh + * @brief Device level functions for mapping edges. + */ + +#ifndef DGL_GRAPH_TRANSFORM_CUDA_CUDA_MAP_EDGES_CUH_ +#define DGL_GRAPH_TRANSFORM_CUDA_CUDA_MAP_EDGES_CUH_ + +#include +#include +#include +#include + +#include +#include +#include +#include +#include + +#include "../../../runtime/cuda/cuda_common.h" +#include "../../../runtime/cuda/cuda_hashtable.cuh" + +using namespace dgl::aten; +using namespace dgl::runtime::cuda; + +namespace dgl { +namespace transform { + +namespace cuda { + +template +__device__ void map_vertex_ids( + const IdType* const global, IdType* const new_global, + const IdType num_vertices, const DeviceOrderedHashTable& table) { + assert(BLOCK_SIZE == blockDim.x); + + using Mapping = typename OrderedHashTable::Mapping; + + const IdType tile_start = TILE_SIZE * blockIdx.x; + const IdType tile_end = min(TILE_SIZE * (blockIdx.x + 1), num_vertices); + + for (IdType idx = threadIdx.x + tile_start; idx < tile_end; + idx += BLOCK_SIZE) { + const Mapping& mapping = *table.Search(global[idx]); + new_global[idx] = mapping.local; + } +} + +/** + * @brief Generate mapped edge endpoint ids. + * + * @tparam IdType The type of id. + * @tparam BLOCK_SIZE The size of each thread block. + * @tparam TILE_SIZE The number of edges to process per thread block. + * @param global_srcs_device The source ids to map. + * @param new_global_srcs_device The mapped source ids (output). + * @param global_dsts_device The destination ids to map. + * @param new_global_dsts_device The mapped destination ids (output). + * @param num_edges The number of edges to map. + * @param src_mapping The mapping of sources ids. + * @param src_hash_size The the size of source id hash table/mapping. + * @param dst_mapping The mapping of destination ids. + * @param dst_hash_size The the size of destination id hash table/mapping. + */ +template +__global__ void map_edge_ids( + const IdType* const global_srcs_device, + IdType* const new_global_srcs_device, + const IdType* const global_dsts_device, + IdType* const new_global_dsts_device, const IdType num_edges, + DeviceOrderedHashTable src_mapping, + DeviceOrderedHashTable dst_mapping) { + assert(BLOCK_SIZE == blockDim.x); + assert(2 == gridDim.y); + + if (blockIdx.y == 0) { + map_vertex_ids( + global_srcs_device, new_global_srcs_device, num_edges, src_mapping); + } else { + map_vertex_ids( + global_dsts_device, new_global_dsts_device, num_edges, dst_mapping); + } +} + +/** + * @brief Device level node maps for each node type. + * + * @param num_nodes Number of nodes per type. + * @param offset When offset is set to 0, LhsHashTable is identical to + * RhsHashTable. Or set to num_nodes.size()/2 to use seperated + * LhsHashTable and RhsHashTable. + * @param ctx The DGL context. + * @param stream The stream to operate on. 
+ */ +template +class DeviceNodeMap { + public: + using Mapping = typename OrderedHashTable::Mapping; + + DeviceNodeMap( + const std::vector& num_nodes, const int64_t offset, + DGLContext ctx, cudaStream_t stream) + : num_types_(num_nodes.size()), + rhs_offset_(offset), + hash_tables_(), + ctx_(ctx) { + auto device = runtime::DeviceAPI::Get(ctx); + + hash_tables_.reserve(num_types_); + for (int64_t i = 0; i < num_types_; ++i) { + hash_tables_.emplace_back( + new OrderedHashTable(num_nodes[i], ctx_, stream)); + } + } + + OrderedHashTable& LhsHashTable(const size_t index) { + return HashData(index); + } + + OrderedHashTable& RhsHashTable(const size_t index) { + return HashData(index + rhs_offset_); + } + + const OrderedHashTable& LhsHashTable(const size_t index) const { + return HashData(index); + } + + const OrderedHashTable& RhsHashTable(const size_t index) const { + return HashData(index + rhs_offset_); + } + + IdType LhsHashSize(const size_t index) const { return HashSize(index); } + + IdType RhsHashSize(const size_t index) const { + return HashSize(rhs_offset_ + index); + } + + size_t Size() const { return hash_tables_.size(); } + + private: + int64_t num_types_; + size_t rhs_offset_; + std::vector>> hash_tables_; + DGLContext ctx_; + + inline OrderedHashTable& HashData(const size_t index) { + CHECK_LT(index, hash_tables_.size()); + return *hash_tables_[index]; + } + + inline const OrderedHashTable& HashData(const size_t index) const { + CHECK_LT(index, hash_tables_.size()); + return *hash_tables_[index]; + } + + inline IdType HashSize(const size_t index) const { + return HashData(index).size(); + } +}; + +template +inline size_t RoundUpDiv(const IdType num, const size_t divisor) { + return static_cast(num / divisor) + (num % divisor == 0 ? 0 : 1); +} + +template +inline IdType RoundUp(const IdType num, const size_t unit) { + return RoundUpDiv(num, unit) * unit; +} + +template +std::tuple, std::vector> MapEdges( + HeteroGraphPtr graph, const std::vector& edge_sets, + const DeviceNodeMap& node_map, cudaStream_t stream) { + constexpr const int BLOCK_SIZE = 128; + constexpr const size_t TILE_SIZE = 1024; + + const auto& ctx = graph->Context(); + + std::vector new_lhs; + new_lhs.reserve(edge_sets.size()); + std::vector new_rhs; + new_rhs.reserve(edge_sets.size()); + + // The next peformance optimization here, is to perform mapping of all edge + // types in a single kernel launch. 
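The map_vertex_ids and map_edge_ids kernels above expect a two-row grid (the assert on gridDim.y), with blockIdx.y selecting whether sources or destinations are remapped, and MapEdges, continuing below, builds exactly that grid with RoundUpDiv(num_edges, TILE_SIZE) tiles in x, BLOCK_SIZE = 128 and TILE_SIZE = 1024; each block then strides through its tile in steps of BLOCK_SIZE. A stripped-down, compilable version of that tiling convention; a plain remap array replaces the DeviceOrderedHashTable lookup, so this is only the launch and indexing skeleton, not DGL's mapping logic:

#include <hip/hip_runtime.h>
#include <cstdio>

constexpr int BLOCK_SIZE = 128;
constexpr int TILE_SIZE = 1024;

// Each block owns one TILE_SIZE slice; threads stride through it by BLOCK_SIZE.
// blockIdx.y == 0 remaps the "src" array, blockIdx.y == 1 the "dst" array.
__global__ void RemapIdsKernel(const int *remap, const int *src, const int *dst,
                               int *new_src, int *new_dst, int num_edges) {
  const int *in = (blockIdx.y == 0) ? src : dst;
  int *out = (blockIdx.y == 0) ? new_src : new_dst;
  const int tile_start = blockIdx.x * TILE_SIZE;
  const int tile_end = min(tile_start + TILE_SIZE, num_edges);
  for (int i = tile_start + threadIdx.x; i < tile_end; i += BLOCK_SIZE) {
    out[i] = remap[in[i]];
  }
}

int main() {
  const int num_edges = 3, num_nodes = 4;
  const int h_remap[num_nodes] = {10, 11, 12, 13};
  const int h_src[num_edges] = {0, 2, 3}, h_dst[num_edges] = {1, 1, 0};
  int *d_remap, *d_src, *d_dst, *d_new_src, *d_new_dst;
  hipMalloc(&d_remap, sizeof(h_remap));
  hipMalloc(&d_src, sizeof(h_src));
  hipMalloc(&d_dst, sizeof(h_dst));
  hipMalloc(&d_new_src, sizeof(h_src));
  hipMalloc(&d_new_dst, sizeof(h_dst));
  hipMemcpy(d_remap, h_remap, sizeof(h_remap), hipMemcpyHostToDevice);
  hipMemcpy(d_src, h_src, sizeof(h_src), hipMemcpyHostToDevice);
  hipMemcpy(d_dst, h_dst, sizeof(h_dst), hipMemcpyHostToDevice);

  // Same grid shape as MapEdges: x covers the tiles, y == 2 covers src and dst.
  dim3 grid((num_edges + TILE_SIZE - 1) / TILE_SIZE, 2);
  dim3 block(BLOCK_SIZE);
  RemapIdsKernel<<<grid, block>>>(d_remap, d_src, d_dst, d_new_src, d_new_dst,
                                  num_edges);

  int h_new_src[num_edges], h_new_dst[num_edges];
  hipMemcpy(h_new_src, d_new_src, sizeof(h_new_src), hipMemcpyDeviceToHost);
  hipMemcpy(h_new_dst, d_new_dst, sizeof(h_new_dst), hipMemcpyDeviceToHost);
  for (int i = 0; i < num_edges; ++i)
    std::printf("%d->%d\n", h_new_src[i], h_new_dst[i]);  // 10->11, 12->11, 13->10
  hipFree(d_remap); hipFree(d_src); hipFree(d_dst);
  hipFree(d_new_src); hipFree(d_new_dst);
  return 0;
}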
+ const int64_t num_edge_sets = static_cast(edge_sets.size()); + for (int64_t etype = 0; etype < num_edge_sets; ++etype) { + const EdgeArray& edges = edge_sets[etype]; + if (edges.id.defined() && edges.src->shape[0] > 0) { + const int64_t num_edges = edges.src->shape[0]; + + new_lhs.emplace_back(NewIdArray(num_edges, ctx, sizeof(IdType) * 8)); + new_rhs.emplace_back(NewIdArray(num_edges, ctx, sizeof(IdType) * 8)); + + const auto src_dst_types = graph->GetEndpointTypes(etype); + const int src_type = src_dst_types.first; + const int dst_type = src_dst_types.second; + + const dim3 grid(RoundUpDiv(num_edges, TILE_SIZE), 2); + const dim3 block(BLOCK_SIZE); + + // map the srcs + CUDA_KERNEL_CALL( + (map_edge_ids), grid, block, 0, stream, + edges.src.Ptr(), new_lhs.back().Ptr(), + edges.dst.Ptr(), new_rhs.back().Ptr(), num_edges, + node_map.LhsHashTable(src_type).DeviceHandle(), + node_map.RhsHashTable(dst_type).DeviceHandle()); + } else { + new_lhs.emplace_back( + aten::NullArray(DGLDataType{kDGLInt, sizeof(IdType) * 8, 1}, ctx)); + new_rhs.emplace_back( + aten::NullArray(DGLDataType{kDGLInt, sizeof(IdType) * 8, 1}, ctx)); + } + } + + return std::tuple, std::vector>( + std::move(new_lhs), std::move(new_rhs)); +} + +} // namespace cuda +} // namespace transform +} // namespace dgl + +#endif // DGL_GRAPH_TRANSFORM_CUDA_CUDA_MAP_EDGES_CUH_ diff --git a/src/graph/transform/cuda/cuda_to_block.cu b/src/graph/transform/cuda/cuda_to_block.cu index a8bffc8cc6e8..f09bd694aaf4 100644 --- a/src/graph/transform/cuda/cuda_to_block.cu +++ b/src/graph/transform/cuda/cuda_to_block.cu @@ -20,7 +20,7 @@ * Tested via python wrapper: python/dgl/path/to/to_block.py */ -#include +#include #include #include #include @@ -69,10 +69,10 @@ class DeviceNodeMapMaker { const std::vector& lhs_nodes, const std::vector& rhs_nodes, DeviceNodeMap* const node_maps, int64_t* const count_lhs_device, - std::vector* const lhs_device, cudaStream_t stream) { + std::vector* const lhs_device, hipStream_t stream) { const int64_t num_ntypes = lhs_nodes.size() + rhs_nodes.size(); - CUDA_CALL(cudaMemsetAsync( + CUDA_CALL(hipMemsetAsync( count_lhs_device, 0, num_ntypes * sizeof(*count_lhs_device), stream)); // possibly dublicate lhs nodes @@ -112,7 +112,7 @@ class DeviceNodeMapMaker { void Make( const std::vector& lhs_nodes, const std::vector& rhs_nodes, - DeviceNodeMap* const node_maps, cudaStream_t stream) { + DeviceNodeMap* const node_maps, hipStream_t stream) { const int64_t num_ntypes = lhs_nodes.size() + rhs_nodes.size(); // unique lhs nodes @@ -155,7 +155,7 @@ struct CUDAIdsMapper { std::vector& num_nodes_per_type = *num_nodes_per_type_ptr; const bool generate_lhs_nodes = lhs_nodes.empty(); auto device = runtime::DeviceAPI::Get(ctx); - cudaStream_t stream = runtime::getCurrentCUDAStream(); + hipStream_t stream = runtime::getCurrentCUDAStream(); // Allocate space for map creation process. DeviceNodeMapMaker maker(maxNodesPerType); @@ -168,7 +168,7 @@ struct CUDAIdsMapper { } } - cudaEvent_t copyEvent; + hipEvent_t copyEvent; NDArray new_len_tensor; // Populate the mappings. 
if (generate_lhs_nodes) { @@ -179,7 +179,7 @@ struct CUDAIdsMapper { src_nodes, rhs_nodes, &node_maps, count_lhs_device, &lhs_nodes, stream); - CUDA_CALL(cudaEventCreate(©Event)); + CUDA_CALL(hipEventCreate(©Event)); if (TensorDispatcher::Global()->IsAvailable()) { new_len_tensor = NDArray::PinnedEmpty( {num_ntypes}, DGLDataTypeTraits::dtype, @@ -190,11 +190,11 @@ struct CUDAIdsMapper { {num_ntypes}, DGLDataTypeTraits::dtype, DGLContext{kDGLCPU, 0}); } - CUDA_CALL(cudaMemcpyAsync( + CUDA_CALL(hipMemcpyAsync( new_len_tensor->data, count_lhs_device, sizeof(*num_nodes_per_type.data()) * num_ntypes, - cudaMemcpyDeviceToHost, stream)); - CUDA_CALL(cudaEventRecord(copyEvent, stream)); + hipMemcpyDeviceToHost, stream)); + CUDA_CALL(hipEventRecord(copyEvent, stream)); device->FreeWorkspace(ctx, count_lhs_device); } else { @@ -209,8 +209,8 @@ struct CUDAIdsMapper { if (generate_lhs_nodes) { // wait for the previous copy - CUDA_CALL(cudaEventSynchronize(copyEvent)); - CUDA_CALL(cudaEventDestroy(copyEvent)); + CUDA_CALL(hipEventSynchronize(copyEvent)); + CUDA_CALL(hipEventDestroy(copyEvent)); // Resize lhs nodes. for (int64_t ntype = 0; ntype < num_ntypes; ++ntype) { diff --git a/src/graph/transform/cuda/cuda_to_block.cu.prehip b/src/graph/transform/cuda/cuda_to_block.cu.prehip new file mode 100644 index 000000000000..a8bffc8cc6e8 --- /dev/null +++ b/src/graph/transform/cuda/cuda_to_block.cu.prehip @@ -0,0 +1,258 @@ +/** + * Copyright 2020-2021 Contributors + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + * + * @file graph/transform/cuda/cuda_to_block.cu + * @brief Functions to convert a set of edges into a graph block with local + * ids. + * + * Tested via python wrapper: python/dgl/path/to/to_block.py + */ + +#include +#include +#include +#include + +#include +#include +#include + +#include "../../../runtime/cuda/cuda_common.h" +#include "../../heterograph.h" +#include "../to_block.h" +#include "cuda_map_edges.cuh" + +using namespace dgl::aten; +using namespace dgl::runtime::cuda; +using namespace dgl::transform::cuda; +using TensorDispatcher = dgl::runtime::TensorDispatcher; + +namespace dgl { +namespace transform { + +namespace { + +template +class DeviceNodeMapMaker { + public: + explicit DeviceNodeMapMaker(const std::vector& maxNodesPerType) + : max_num_nodes_(0) { + max_num_nodes_ = + *std::max_element(maxNodesPerType.begin(), maxNodesPerType.end()); + } + + /** + * @brief This function builds node maps for each node type, preserving the + * order of the input nodes. Here it is assumed the lhs_nodes are not unique, + * and thus a unique list is generated. + * + * @param lhs_nodes The set of source input nodes. + * @param rhs_nodes The set of destination input nodes. + * @param node_maps The node maps to be constructed. + * @param count_lhs_device The number of unique source nodes (on the GPU). + * @param lhs_device The unique source nodes (on the GPU). + * @param stream The stream to operate on. 
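The CUDAIdsMapper changes above keep the count transfer asynchronous: the per-type node counts are staged with hipMemcpyAsync into a host tensor (pinned when the tensor adapter is available, pageable otherwise), an event is recorded right behind the copy, and the event is only waited on after the edge mapping has been queued. A minimal sketch of that event pattern with plain HIP allocations; hipHostMalloc stands in for NDArray::PinnedEmpty here and the kernel is invented for the example:

#include <hip/hip_runtime.h>
#include <cstdio>

__global__ void FillKernel(int *data, int n) {
  int i = blockIdx.x * blockDim.x + threadIdx.x;
  if (i < n) data[i] = i;
}

int main() {
  const int n = 4;
  int *d_data = nullptr, *h_data = nullptr;
  hipMalloc(&d_data, n * sizeof(int));
  // Pinned host memory so the device-to-host copy can actually run async.
  hipHostMalloc(reinterpret_cast<void **>(&h_data), n * sizeof(int),
                hipHostMallocDefault);
  hipStream_t stream;
  hipStreamCreate(&stream);
  hipEvent_t copy_done;
  hipEventCreate(&copy_done);

  FillKernel<<<1, 64, 0, stream>>>(d_data, n);
  // Queue the device-to-host copy and record an event right behind it.
  hipMemcpyAsync(h_data, d_data, n * sizeof(int), hipMemcpyDeviceToHost, stream);
  hipEventRecord(copy_done, stream);

  // ... more work could be enqueued on `stream` here, as MapEdges is above ...

  // Block only when the host actually needs the values.
  hipEventSynchronize(copy_done);
  for (int i = 0; i < n; ++i) std::printf("%d ", h_data[i]);  // 0 1 2 3
  std::printf("\n");

  hipEventDestroy(copy_done);
  hipStreamDestroy(stream);
  hipHostFree(h_data);
  hipFree(d_data);
  return 0;
}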
+ */ + void Make( + const std::vector& lhs_nodes, + const std::vector& rhs_nodes, + DeviceNodeMap* const node_maps, int64_t* const count_lhs_device, + std::vector* const lhs_device, cudaStream_t stream) { + const int64_t num_ntypes = lhs_nodes.size() + rhs_nodes.size(); + + CUDA_CALL(cudaMemsetAsync( + count_lhs_device, 0, num_ntypes * sizeof(*count_lhs_device), stream)); + + // possibly dublicate lhs nodes + const int64_t lhs_num_ntypes = static_cast(lhs_nodes.size()); + for (int64_t ntype = 0; ntype < lhs_num_ntypes; ++ntype) { + const IdArray& nodes = lhs_nodes[ntype]; + if (nodes->shape[0] > 0) { + CHECK_EQ(nodes->ctx.device_type, kDGLCUDA); + node_maps->LhsHashTable(ntype).FillWithDuplicates( + nodes.Ptr(), nodes->shape[0], + (*lhs_device)[ntype].Ptr(), count_lhs_device + ntype, + stream); + } + } + + // unique rhs nodes + const int64_t rhs_num_ntypes = static_cast(rhs_nodes.size()); + for (int64_t ntype = 0; ntype < rhs_num_ntypes; ++ntype) { + const IdArray& nodes = rhs_nodes[ntype]; + if (nodes->shape[0] > 0) { + node_maps->RhsHashTable(ntype).FillWithUnique( + nodes.Ptr(), nodes->shape[0], stream); + } + } + } + + /** + * @brief This function builds node maps for each node type, preserving the + * order of the input nodes. Here it is assumed both lhs_nodes and rhs_nodes + * are unique. + * + * @param lhs_nodes The set of source input nodes. + * @param rhs_nodes The set of destination input nodes. + * @param node_maps The node maps to be constructed. + * @param stream The stream to operate on. + */ + void Make( + const std::vector& lhs_nodes, + const std::vector& rhs_nodes, + DeviceNodeMap* const node_maps, cudaStream_t stream) { + const int64_t num_ntypes = lhs_nodes.size() + rhs_nodes.size(); + + // unique lhs nodes + const int64_t lhs_num_ntypes = static_cast(lhs_nodes.size()); + for (int64_t ntype = 0; ntype < lhs_num_ntypes; ++ntype) { + const IdArray& nodes = lhs_nodes[ntype]; + if (nodes->shape[0] > 0) { + CHECK_EQ(nodes->ctx.device_type, kDGLCUDA); + node_maps->LhsHashTable(ntype).FillWithUnique( + nodes.Ptr(), nodes->shape[0], stream); + } + } + + // unique rhs nodes + const int64_t rhs_num_ntypes = static_cast(rhs_nodes.size()); + for (int64_t ntype = 0; ntype < rhs_num_ntypes; ++ntype) { + const IdArray& nodes = rhs_nodes[ntype]; + if (nodes->shape[0] > 0) { + node_maps->RhsHashTable(ntype).FillWithUnique( + nodes.Ptr(), nodes->shape[0], stream); + } + } + } + + private: + IdType max_num_nodes_; +}; + +template +struct CUDAIdsMapper { + std::tuple, std::vector> operator()( + const HeteroGraphPtr& graph, bool include_rhs_in_lhs, int64_t num_ntypes, + const DGLContext& ctx, const std::vector& maxNodesPerType, + const std::vector& edge_arrays, + const std::vector& src_nodes, + const std::vector& rhs_nodes, + std::vector* const lhs_nodes_ptr, + std::vector* const num_nodes_per_type_ptr) { + std::vector& lhs_nodes = *lhs_nodes_ptr; + std::vector& num_nodes_per_type = *num_nodes_per_type_ptr; + const bool generate_lhs_nodes = lhs_nodes.empty(); + auto device = runtime::DeviceAPI::Get(ctx); + cudaStream_t stream = runtime::getCurrentCUDAStream(); + + // Allocate space for map creation process. 
+ DeviceNodeMapMaker maker(maxNodesPerType); + DeviceNodeMap node_maps(maxNodesPerType, num_ntypes, ctx, stream); + if (generate_lhs_nodes) { + lhs_nodes.reserve(num_ntypes); + for (int64_t ntype = 0; ntype < num_ntypes; ++ntype) { + lhs_nodes.emplace_back( + NewIdArray(maxNodesPerType[ntype], ctx, sizeof(IdType) * 8)); + } + } + + cudaEvent_t copyEvent; + NDArray new_len_tensor; + // Populate the mappings. + if (generate_lhs_nodes) { + int64_t* count_lhs_device = static_cast( + device->AllocWorkspace(ctx, sizeof(int64_t) * num_ntypes * 2)); + + maker.Make( + src_nodes, rhs_nodes, &node_maps, count_lhs_device, &lhs_nodes, + stream); + + CUDA_CALL(cudaEventCreate(©Event)); + if (TensorDispatcher::Global()->IsAvailable()) { + new_len_tensor = NDArray::PinnedEmpty( + {num_ntypes}, DGLDataTypeTraits::dtype, + DGLContext{kDGLCPU, 0}); + } else { + // use pageable memory, it will unecessarily block but be functional + new_len_tensor = NDArray::Empty( + {num_ntypes}, DGLDataTypeTraits::dtype, + DGLContext{kDGLCPU, 0}); + } + CUDA_CALL(cudaMemcpyAsync( + new_len_tensor->data, count_lhs_device, + sizeof(*num_nodes_per_type.data()) * num_ntypes, + cudaMemcpyDeviceToHost, stream)); + CUDA_CALL(cudaEventRecord(copyEvent, stream)); + + device->FreeWorkspace(ctx, count_lhs_device); + } else { + maker.Make(lhs_nodes, rhs_nodes, &node_maps, stream); + + for (int64_t ntype = 0; ntype < num_ntypes; ++ntype) { + num_nodes_per_type[ntype] = lhs_nodes[ntype]->shape[0]; + } + } + // Map node numberings from global to local, and build pointer for CSR. + auto ret = MapEdges(graph, edge_arrays, node_maps, stream); + + if (generate_lhs_nodes) { + // wait for the previous copy + CUDA_CALL(cudaEventSynchronize(copyEvent)); + CUDA_CALL(cudaEventDestroy(copyEvent)); + + // Resize lhs nodes. + for (int64_t ntype = 0; ntype < num_ntypes; ++ntype) { + num_nodes_per_type[ntype] = + static_cast(new_len_tensor->data)[ntype]; + lhs_nodes[ntype]->shape[0] = num_nodes_per_type[ntype]; + } + } + + return ret; + } +}; + +template +std::tuple> ToBlockGPU( + HeteroGraphPtr graph, const std::vector& rhs_nodes, + bool include_rhs_in_lhs, std::vector* const lhs_nodes_ptr) { + return dgl::transform::ProcessToBlock( + graph, rhs_nodes, include_rhs_in_lhs, lhs_nodes_ptr, + CUDAIdsMapper()); +} + +} // namespace + +// Use explicit names to get around MSVC's broken mangling that thinks the +// following two functions are the same. Using template<> fails to export the +// symbols. 
+std::tuple> +// ToBlock +ToBlockGPU32( + HeteroGraphPtr graph, const std::vector& rhs_nodes, + bool include_rhs_in_lhs, std::vector* const lhs_nodes) { + return ToBlockGPU(graph, rhs_nodes, include_rhs_in_lhs, lhs_nodes); +} + +std::tuple> +// ToBlock +ToBlockGPU64( + HeteroGraphPtr graph, const std::vector& rhs_nodes, + bool include_rhs_in_lhs, std::vector* const lhs_nodes) { + return ToBlockGPU(graph, rhs_nodes, include_rhs_in_lhs, lhs_nodes); +} + +} // namespace transform +} // namespace dgl diff --git a/src/graph/transform/cuda/knn.cu b/src/graph/transform/cuda/knn.cu index 988ff4f3e9e7..606352be5a03 100644 --- a/src/graph/transform/cuda/knn.cu +++ b/src/graph/transform/cuda/knn.cu @@ -1,16 +1,17 @@ +#include "hip/hip_runtime.h" /** * Copyright (c) 2020 by Contributors * @file graph/transform/cuda/knn.cu * @brief k-nearest-neighbor (KNN) implementation (cuda) */ -#include +#include #include #include #include #include -#include // NOLINT +#include // NOLINT #include #include #include @@ -467,7 +468,7 @@ void BruteForceKNNCuda( const NDArray& data_points, const IdArray& data_offsets, const NDArray& query_points, const IdArray& query_offsets, const int k, IdArray result) { - cudaStream_t stream = runtime::getCurrentCUDAStream(); + hipStream_t stream = runtime::getCurrentCUDAStream(); const auto& ctx = data_points->ctx; auto device = runtime::DeviceAPI::Get(ctx); const int64_t batch_size = data_offsets->shape[0] - 1; @@ -512,7 +513,7 @@ void BruteForceKNNSharedCuda( const NDArray& data_points, const IdArray& data_offsets, const NDArray& query_points, const IdArray& query_offsets, const int k, IdArray result) { - cudaStream_t stream = runtime::getCurrentCUDAStream(); + hipStream_t stream = runtime::getCurrentCUDAStream(); const auto& ctx = data_points->ctx; auto device = runtime::DeviceAPI::Get(ctx); const int64_t batch_size = data_offsets->shape[0] - 1; @@ -528,8 +529,8 @@ void BruteForceKNNSharedCuda( // get max shared memory per block in bytes // determine block size according to this value int max_sharedmem_per_block = 0; - CUDA_CALL(cudaDeviceGetAttribute( - &max_sharedmem_per_block, cudaDevAttrMaxSharedMemoryPerBlock, + CUDA_CALL(hipDeviceGetAttribute( + &max_sharedmem_per_block, hipDeviceAttributeMaxSharedMemoryPerBlock, ctx.device_id)); const int64_t single_shared_mem = static_cast(Pow2Align( (k + 2 * feature_size) * sizeof(FloatType) + k * sizeof(IdType), @@ -552,17 +553,17 @@ void BruteForceKNNSharedCuda( GetNumBlockPerSegment, temp_num_blocks, temp_block_size, 0, stream, query_offsets_data, num_block_per_segment, batch_size, block_size); size_t prefix_temp_size = 0; - CUDA_CALL(cub::DeviceScan::ExclusiveSum( + CUDA_CALL(hipcub::DeviceScan::ExclusiveSum( nullptr, prefix_temp_size, num_block_per_segment, num_block_prefixsum, batch_size, stream)); void* prefix_temp = device->AllocWorkspace(ctx, prefix_temp_size); - CUDA_CALL(cub::DeviceScan::ExclusiveSum( + CUDA_CALL(hipcub::DeviceScan::ExclusiveSum( prefix_temp, prefix_temp_size, num_block_per_segment, num_block_prefixsum, batch_size, stream)); device->FreeWorkspace(ctx, prefix_temp); // wait for results - CUDA_CALL(cudaStreamSynchronize(stream)); + CUDA_CALL(hipStreamSynchronize(stream)); int64_t num_blocks = 0, final_elem = 0, copyoffset = (batch_size - 1) * sizeof(IdType); @@ -603,10 +604,10 @@ void BruteForceKNNSharedCuda( /** @brief Setup rng state for nn-descent */ __global__ void SetupRngKernel( - curandState* states, const uint64_t seed, const size_t n) { + hiprandState* states, const uint64_t seed, const size_t n) { 
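  // hipify maps the cuRAND device API used below onto hipRAND one-for-one:
  //   curandState -> hiprandState, curand_init -> hiprand_init,
  //   curand -> hiprand
  // (the seed / subsequence / offset arguments keep the same meaning).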
size_t id = blockIdx.x * blockDim.x + threadIdx.x; if (id < n) { - curand_init(seed, id, 0, states + id); + hiprand_init(seed, id, 0, states + id); } } @@ -622,8 +623,8 @@ __global__ void RandomInitNeighborsKernel( const IdType point_idx = blockIdx.x * blockDim.x + threadIdx.x; IdType batch_idx = 0; if (point_idx >= offsets[batch_size]) return; - curandState state; - curand_init(seed, point_idx, 0, &state); + hiprandState state; + hiprand_init(seed, point_idx, 0, &state); // find the segment location in the input batch for (IdType b = 0; b < batch_size + 1; ++b) { @@ -646,7 +647,7 @@ __global__ void RandomInitNeighborsKernel( current_central_nodes[i] = point_idx; } for (IdType i = k; i < segment_size; ++i) { - const IdType j = static_cast(curand(&state) % (i + 1)); + const IdType j = static_cast(hiprand(&state) % (i + 1)); if (j < k) current_neighbors[j] = i + segment_start; } @@ -674,8 +675,8 @@ __global__ void FindCandidatesKernel( const IdType point_idx = blockIdx.x * blockDim.x + threadIdx.x; IdType batch_idx = 0; if (point_idx >= offsets[batch_size]) return; - curandState state; - curand_init(seed, point_idx, 0, &state); + hiprandState state; + hiprand_init(seed, point_idx, 0, &state); // find the segment location in the input batch for (IdType b = 0; b < batch_size + 1; ++b) { @@ -711,7 +712,7 @@ __global__ void FindCandidatesKernel( if (curr_num < num_candidates) { candidate_data[curr_num] = candidate; } else { - IdType pos = static_cast(curand(&state) % (curr_num + 1)); + IdType pos = static_cast(hiprand(&state) % (curr_num + 1)); if (pos < num_candidates) candidate_data[pos] = candidate; } ++candidate_array[0]; @@ -732,7 +733,7 @@ __global__ void FindCandidatesKernel( if (curr_num < num_candidates) { candidate_data[curr_num] = reverse_candidate; } else { - IdType pos = static_cast(curand(&state) % (curr_num + 1)); + IdType pos = static_cast(hiprand(&state) % (curr_num + 1)); if (pos < num_candidates) candidate_data[pos] = reverse_candidate; } ++candidate_array[0]; @@ -873,7 +874,7 @@ template void NNDescent( const NDArray& points, const IdArray& offsets, IdArray result, const int k, const int num_iters, const int num_candidates, const double delta) { - cudaStream_t stream = runtime::getCurrentCUDAStream(); + hipStream_t stream = runtime::getCurrentCUDAStream(); const auto& ctx = points->ctx; auto device = runtime::DeviceAPI::Get(ctx); const int64_t num_nodes = points->shape[0]; @@ -887,7 +888,7 @@ void NNDescent( uint64_t seed; int warp_size = 0; CUDA_CALL( - cudaDeviceGetAttribute(&warp_size, cudaDevAttrWarpSize, ctx.device_id)); + hipDeviceGetAttribute(&warp_size, hipDeviceAttributeWarpSize, ctx.device_id)); // We don't need large block sizes, since there's not much inter-thread // communication int64_t block_size = warp_size; @@ -911,7 +912,7 @@ void NNDescent( IdType* total_num_updates_d = static_cast(device->AllocWorkspace(ctx, sizeof(IdType))); - CUDA_CALL(cub::DeviceReduce::Sum( + CUDA_CALL(hipcub::DeviceReduce::Sum( nullptr, sum_temp_size, num_updates, total_num_updates_d, num_nodes, stream)); IdType* sum_temp_storage = @@ -942,7 +943,7 @@ void NNDescent( feature_size); total_num_updates = 0; - CUDA_CALL(cub::DeviceReduce::Sum( + CUDA_CALL(hipcub::DeviceReduce::Sum( sum_temp_storage, sum_temp_size, num_updates, total_num_updates_d, num_nodes, stream)); device->CopyDataFromTo( diff --git a/src/graph/transform/cuda/knn.cu.prehip b/src/graph/transform/cuda/knn.cu.prehip new file mode 100644 index 000000000000..988ff4f3e9e7 --- /dev/null +++ 
b/src/graph/transform/cuda/knn.cu.prehip @@ -0,0 +1,997 @@ +/** + * Copyright (c) 2020 by Contributors + * @file graph/transform/cuda/knn.cu + * @brief k-nearest-neighbor (KNN) implementation (cuda) + */ + +#include +#include +#include +#include + +#include +#include // NOLINT +#include +#include +#include +#include + +#include "../../../array/cuda/utils.h" +#include "../../../runtime/cuda/cuda_common.h" +#include "../knn.h" + +namespace dgl { +namespace transform { +namespace impl { + +/** + * @brief Given input `size`, find the smallest value + * greater or equal to `size` that is a multiple of `align`. + * + * e.g. Pow2Align(17, 4) = 20, Pow2Align(17, 8) = 24 + */ +template +static __host__ __device__ std::enable_if_t::value, Type> +Pow2Align(Type size, Type align) { + if (align <= 1 || size <= 0) return size; + return ((size - 1) | (align - 1)) + 1; +} + +/** + * @brief Utility class used to avoid linker errors with extern + * unsized shared memory arrays with templated type + */ +template +struct SharedMemory { + __device__ inline operator Type*() { + extern __shared__ int __smem[]; + return reinterpret_cast(__smem); + } + + __device__ inline operator const Type*() const { + extern __shared__ int __smem[]; + return reinterpret_cast(__smem); + } +}; + +// specialize for double to avoid unaligned memory +// access compile errors +template <> +struct SharedMemory { + __device__ inline operator double*() { + extern __shared__ double __smem_d[]; + return reinterpret_cast(__smem_d); + } + + __device__ inline operator const double*() const { + extern __shared__ double __smem_d[]; + return reinterpret_cast(__smem_d); + } +}; + +/** @brief Compute Euclidean distance between two vectors in a cuda kernel */ +template +__device__ FloatType +EuclideanDist(const FloatType* vec1, const FloatType* vec2, const int64_t dim) { + FloatType dist = 0; + IdType idx = 0; + for (; idx < dim - 3; idx += 4) { + FloatType diff0 = vec1[idx] - vec2[idx]; + FloatType diff1 = vec1[idx + 1] - vec2[idx + 1]; + FloatType diff2 = vec1[idx + 2] - vec2[idx + 2]; + FloatType diff3 = vec1[idx + 3] - vec2[idx + 3]; + + dist += diff0 * diff0 + diff1 * diff1 + diff2 * diff2 + diff3 * diff3; + } + + for (; idx < dim; ++idx) { + FloatType diff = vec1[idx] - vec2[idx]; + dist += diff * diff; + } + + return dist; +} + +/** + * @brief Compute Euclidean distance between two vectors in a cuda kernel, + * return positive infinite value if the intermediate distance is greater + * than the worst distance. 
+ */ +template +__device__ FloatType EuclideanDistWithCheck( + const FloatType* vec1, const FloatType* vec2, const int64_t dim, + const FloatType worst_dist) { + FloatType dist = 0; + IdType idx = 0; + bool early_stop = false; + + for (; idx < dim - 3; idx += 4) { + FloatType diff0 = vec1[idx] - vec2[idx]; + FloatType diff1 = vec1[idx + 1] - vec2[idx + 1]; + FloatType diff2 = vec1[idx + 2] - vec2[idx + 2]; + FloatType diff3 = vec1[idx + 3] - vec2[idx + 3]; + + dist += diff0 * diff0 + diff1 * diff1 + diff2 * diff2 + diff3 * diff3; + if (dist > worst_dist) { + early_stop = true; + idx = dim; + break; + } + } + + for (; idx < dim; ++idx) { + FloatType diff = vec1[idx] - vec2[idx]; + dist += diff * diff; + if (dist > worst_dist) { + early_stop = true; + break; + } + } + + if (early_stop) { + return std::numeric_limits::max(); + } else { + return dist; + } +} + +template +__device__ void BuildHeap(IdType* indices, FloatType* dists, int size) { + for (int i = size / 2 - 1; i >= 0; --i) { + IdType idx = i; + while (true) { + IdType largest = idx; + IdType left = idx * 2 + 1; + IdType right = left + 1; + if (left < size && dists[left] > dists[largest]) { + largest = left; + } + if (right < size && dists[right] > dists[largest]) { + largest = right; + } + if (largest != idx) { + IdType tmp_idx = indices[largest]; + indices[largest] = indices[idx]; + indices[idx] = tmp_idx; + + FloatType tmp_dist = dists[largest]; + dists[largest] = dists[idx]; + dists[idx] = tmp_dist; + idx = largest; + } else { + break; + } + } + } +} + +template +__device__ void HeapInsert( + IdType* indices, FloatType* dist, IdType new_idx, FloatType new_dist, + int size, bool check_repeat = false) { + if (new_dist > dist[0]) return; + + // check if we have it + if (check_repeat) { + for (IdType i = 0; i < size; ++i) { + if (indices[i] == new_idx) return; + } + } + + IdType left = 0, right = 0, idx = 0, largest = 0; + dist[0] = new_dist; + indices[0] = new_idx; + while (true) { + left = idx * 2 + 1; + right = left + 1; + if (left < size && dist[left] > dist[largest]) { + largest = left; + } + if (right < size && dist[right] > dist[largest]) { + largest = right; + } + if (largest != idx) { + IdType tmp_idx = indices[idx]; + indices[idx] = indices[largest]; + indices[largest] = tmp_idx; + + FloatType tmp_dist = dist[idx]; + dist[idx] = dist[largest]; + dist[largest] = tmp_dist; + + idx = largest; + } else { + break; + } + } +} + +template +__device__ bool FlaggedHeapInsert( + IdType* indices, FloatType* dist, bool* flags, IdType new_idx, + FloatType new_dist, bool new_flag, int size, bool check_repeat = false) { + if (new_dist > dist[0]) return false; + + // check if we have it + if (check_repeat) { + for (IdType i = 0; i < size; ++i) { + if (indices[i] == new_idx) return false; + } + } + + IdType left = 0, right = 0, idx = 0, largest = 0; + dist[0] = new_dist; + indices[0] = new_idx; + flags[0] = new_flag; + while (true) { + left = idx * 2 + 1; + right = left + 1; + if (left < size && dist[left] > dist[largest]) { + largest = left; + } + if (right < size && dist[right] > dist[largest]) { + largest = right; + } + if (largest != idx) { + IdType tmp_idx = indices[idx]; + indices[idx] = indices[largest]; + indices[largest] = tmp_idx; + + FloatType tmp_dist = dist[idx]; + dist[idx] = dist[largest]; + dist[largest] = tmp_dist; + + bool tmp_flag = flags[idx]; + flags[idx] = flags[largest]; + flags[largest] = tmp_flag; + + idx = largest; + } else { + break; + } + } + return true; +} + +/** + * @brief Brute force kNN kernel. 
Compute distance for each pair of input points + * and get the result directly (without a distance matrix). + */ +template +__global__ void BruteforceKnnKernel( + const FloatType* data_points, const IdType* data_offsets, + const FloatType* query_points, const IdType* query_offsets, const int k, + FloatType* dists, IdType* query_out, IdType* data_out, + const int64_t num_batches, const int64_t feature_size) { + const IdType q_idx = blockIdx.x * blockDim.x + threadIdx.x; + if (q_idx >= query_offsets[num_batches]) return; + IdType batch_idx = 0; + for (IdType b = 0; b < num_batches + 1; ++b) { + if (query_offsets[b] > q_idx) { + batch_idx = b - 1; + break; + } + } + const IdType data_start = data_offsets[batch_idx], + data_end = data_offsets[batch_idx + 1]; + + for (IdType k_idx = 0; k_idx < k; ++k_idx) { + query_out[q_idx * k + k_idx] = q_idx; + dists[q_idx * k + k_idx] = std::numeric_limits::max(); + } + FloatType worst_dist = std::numeric_limits::max(); + + for (IdType d_idx = data_start; d_idx < data_end; ++d_idx) { + FloatType tmp_dist = EuclideanDistWithCheck( + query_points + q_idx * feature_size, data_points + d_idx * feature_size, + feature_size, worst_dist); + + IdType out_offset = q_idx * k; + HeapInsert( + data_out + out_offset, dists + out_offset, d_idx, tmp_dist, k); + worst_dist = dists[q_idx * k]; + } +} + +/** + * @brief Same as BruteforceKnnKernel, but use shared memory as buffer. + * This kernel divides query points and data points into blocks. For each + * query block, it will make a loop over all data blocks and compute distances. + * This kernel is faster when the dimension of input points is not large. + */ +template +__global__ void BruteforceKnnShareKernel( + const FloatType* data_points, const IdType* data_offsets, + const FloatType* query_points, const IdType* query_offsets, + const IdType* block_batch_id, const IdType* local_block_id, const int k, + FloatType* dists, IdType* query_out, IdType* data_out, + const int64_t num_batches, const int64_t feature_size) { + const IdType block_idx = static_cast(blockIdx.x); + const IdType block_size = static_cast(blockDim.x); + const IdType batch_idx = block_batch_id[block_idx]; + const IdType local_bid = local_block_id[block_idx]; + const IdType query_start = query_offsets[batch_idx] + block_size * local_bid; + const IdType query_end = + min(query_start + block_size, query_offsets[batch_idx + 1]); + if (query_start >= query_end) return; + const IdType query_idx = query_start + threadIdx.x; + const IdType data_start = data_offsets[batch_idx]; + const IdType data_end = data_offsets[batch_idx + 1]; + + // shared memory: points in block + distance buffer + result buffer + FloatType* data_buff = SharedMemory(); + FloatType* query_buff = data_buff + block_size * feature_size; + FloatType* dist_buff = query_buff + block_size * feature_size; + IdType* res_buff = reinterpret_cast(Pow2Align( + reinterpret_cast(dist_buff + block_size * k), sizeof(IdType))); + FloatType worst_dist = std::numeric_limits::max(); + + // initialize dist buff with inf value + for (auto i = 0; i < k; ++i) { + dist_buff[threadIdx.x + i * block_size] = + std::numeric_limits::max(); + } + + // load query data to shared memory + // TODO(tianqi): could be better here to exploit coalesce global memory + // access. 
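+  // Layout note: the query buffer is stored transposed, so element
+  // (thread t, dimension i) lives at query_buff[t + i * block_size]; threads
+  // of a warp then hit consecutive addresses (distinct banks) when they all
+  // read the same dimension, while data_buff stays row-major per point.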
+ if (query_idx < query_end) { + for (auto i = 0; i < feature_size; ++i) { + // to avoid bank conflict, we use transpose here + query_buff[threadIdx.x + i * block_size] = + query_points[query_idx * feature_size + i]; + } + } + + // perform computation on each tile + for (auto tile_start = data_start; tile_start < data_end; + tile_start += block_size) { + // each thread load one data point into the shared memory + IdType load_idx = tile_start + threadIdx.x; + if (load_idx < data_end) { + for (auto i = 0; i < feature_size; ++i) { + data_buff[threadIdx.x * feature_size + i] = + data_points[load_idx * feature_size + i]; + } + } + __syncthreads(); + + // compute distance for one tile + IdType true_block_size = min(data_end - tile_start, block_size); + if (query_idx < query_end) { + for (IdType d_idx = 0; d_idx < true_block_size; ++d_idx) { + FloatType tmp_dist = 0; + bool early_stop = false; + IdType dim_idx = 0; + + for (; dim_idx < feature_size - 3; dim_idx += 4) { + FloatType diff0 = query_buff[threadIdx.x + block_size * (dim_idx)] - + data_buff[d_idx * feature_size + dim_idx]; + FloatType diff1 = + query_buff[threadIdx.x + block_size * (dim_idx + 1)] - + data_buff[d_idx * feature_size + dim_idx + 1]; + FloatType diff2 = + query_buff[threadIdx.x + block_size * (dim_idx + 2)] - + data_buff[d_idx * feature_size + dim_idx + 2]; + FloatType diff3 = + query_buff[threadIdx.x + block_size * (dim_idx + 3)] - + data_buff[d_idx * feature_size + dim_idx + 3]; + + tmp_dist += + diff0 * diff0 + diff1 * diff1 + diff2 * diff2 + diff3 * diff3; + + if (tmp_dist > worst_dist) { + early_stop = true; + dim_idx = feature_size; + break; + } + } + + for (; dim_idx < feature_size; ++dim_idx) { + const FloatType diff = + query_buff[threadIdx.x + dim_idx * block_size] - + data_buff[d_idx * feature_size + dim_idx]; + tmp_dist += diff * diff; + + if (tmp_dist > worst_dist) { + early_stop = true; + break; + } + } + + if (early_stop) continue; + + HeapInsert( + res_buff + threadIdx.x * k, dist_buff + threadIdx.x * k, + d_idx + tile_start, tmp_dist, k); + worst_dist = dist_buff[threadIdx.x * k]; + } + } + __syncthreads(); + } + + // copy result to global memory + if (query_idx < query_end) { + for (auto i = 0; i < k; ++i) { + dists[query_idx * k + i] = dist_buff[threadIdx.x * k + i]; + data_out[query_idx * k + i] = res_buff[threadIdx.x * k + i]; + query_out[query_idx * k + i] = query_idx; + } + } +} + +/** @brief determine the number of blocks for each segment */ +template +__global__ void GetNumBlockPerSegment( + const IdType* offsets, IdType* out, const int64_t batch_size, + const int64_t block_size) { + const IdType idx = blockIdx.x * blockDim.x + threadIdx.x; + if (idx < batch_size) { + out[idx] = (offsets[idx + 1] - offsets[idx] - 1) / block_size + 1; + } +} + +/** @brief Get the batch index and local index in segment for each block */ +template +__global__ void GetBlockInfo( + const IdType* num_block_prefixsum, IdType* block_batch_id, + IdType* local_block_id, size_t batch_size, size_t num_blocks) { + const IdType idx = blockIdx.x * blockDim.x + threadIdx.x; + IdType i = 0; + + if (idx < num_blocks) { + for (; i < batch_size; ++i) { + if (num_block_prefixsum[i] > idx) break; + } + i--; + block_batch_id[idx] = i; + local_block_id[idx] = idx - num_block_prefixsum[i]; + } +} + +/** + * @brief Brute force kNN. Compute distance for each pair of input points and + * get the result directly (without a distance matrix). + * + * @tparam FloatType The type of input points. + * @tparam IdType The type of id. 
+ * @param data_points NDArray of dataset points. + * @param data_offsets offsets of point index in data points. + * @param query_points NDArray of query points + * @param query_offsets offsets of point index in query points. + * @param k the number of nearest points + * @param result output array + */ +template +void BruteForceKNNCuda( + const NDArray& data_points, const IdArray& data_offsets, + const NDArray& query_points, const IdArray& query_offsets, const int k, + IdArray result) { + cudaStream_t stream = runtime::getCurrentCUDAStream(); + const auto& ctx = data_points->ctx; + auto device = runtime::DeviceAPI::Get(ctx); + const int64_t batch_size = data_offsets->shape[0] - 1; + const int64_t feature_size = data_points->shape[1]; + const IdType* data_offsets_data = data_offsets.Ptr(); + const IdType* query_offsets_data = query_offsets.Ptr(); + const FloatType* data_points_data = data_points.Ptr(); + const FloatType* query_points_data = query_points.Ptr(); + IdType* query_out = result.Ptr(); + IdType* data_out = query_out + k * query_points->shape[0]; + + FloatType* dists = static_cast(device->AllocWorkspace( + ctx, k * query_points->shape[0] * sizeof(FloatType))); + + const int64_t block_size = cuda::FindNumThreads(query_points->shape[0]); + const int64_t num_blocks = (query_points->shape[0] - 1) / block_size + 1; + CUDA_KERNEL_CALL( + BruteforceKnnKernel, num_blocks, block_size, 0, stream, data_points_data, + data_offsets_data, query_points_data, query_offsets_data, k, dists, + query_out, data_out, batch_size, feature_size); + + device->FreeWorkspace(ctx, dists); +} + +/** + * @brief Brute force kNN with shared memory. + * This function divides query points and data points into blocks. For each + * query block, it will make a loop over all data blocks and compute distances. + * It will be faster when the dimension of input points is not large. + * + * @tparam FloatType The type of input points. + * @tparam IdType The type of id. + * @param data_points NDArray of dataset points. + * @param data_offsets offsets of point index in data points. + * @param query_points NDArray of query points + * @param query_offsets offsets of point index in query points. 
+ * @param k the number of nearest points + * @param result output array + */ +template +void BruteForceKNNSharedCuda( + const NDArray& data_points, const IdArray& data_offsets, + const NDArray& query_points, const IdArray& query_offsets, const int k, + IdArray result) { + cudaStream_t stream = runtime::getCurrentCUDAStream(); + const auto& ctx = data_points->ctx; + auto device = runtime::DeviceAPI::Get(ctx); + const int64_t batch_size = data_offsets->shape[0] - 1; + const int64_t feature_size = data_points->shape[1]; + const IdType* data_offsets_data = data_offsets.Ptr(); + const IdType* query_offsets_data = query_offsets.Ptr(); + const FloatType* data_points_data = data_points.Ptr(); + const FloatType* query_points_data = query_points.Ptr(); + IdType* query_out = result.Ptr(); + IdType* data_out = query_out + k * query_points->shape[0]; + constexpr size_t smem_align = std::max(sizeof(IdType), sizeof(FloatType)); + + // get max shared memory per block in bytes + // determine block size according to this value + int max_sharedmem_per_block = 0; + CUDA_CALL(cudaDeviceGetAttribute( + &max_sharedmem_per_block, cudaDevAttrMaxSharedMemoryPerBlock, + ctx.device_id)); + const int64_t single_shared_mem = static_cast(Pow2Align( + (k + 2 * feature_size) * sizeof(FloatType) + k * sizeof(IdType), + smem_align)); + + const int64_t block_size = + cuda::FindNumThreads(max_sharedmem_per_block / single_shared_mem); + + // Determine the number of blocks. We first get the number of blocks for each + // segment. Then we get the block id offset via prefix sum. + IdType* num_block_per_segment = static_cast( + device->AllocWorkspace(ctx, batch_size * sizeof(IdType))); + IdType* num_block_prefixsum = static_cast( + device->AllocWorkspace(ctx, batch_size * sizeof(IdType))); + + // block size for GetNumBlockPerSegment computation + int64_t temp_block_size = cuda::FindNumThreads(batch_size); + int64_t temp_num_blocks = (batch_size - 1) / temp_block_size + 1; + CUDA_KERNEL_CALL( + GetNumBlockPerSegment, temp_num_blocks, temp_block_size, 0, stream, + query_offsets_data, num_block_per_segment, batch_size, block_size); + size_t prefix_temp_size = 0; + CUDA_CALL(cub::DeviceScan::ExclusiveSum( + nullptr, prefix_temp_size, num_block_per_segment, num_block_prefixsum, + batch_size, stream)); + void* prefix_temp = device->AllocWorkspace(ctx, prefix_temp_size); + CUDA_CALL(cub::DeviceScan::ExclusiveSum( + prefix_temp, prefix_temp_size, num_block_per_segment, num_block_prefixsum, + batch_size, stream)); + device->FreeWorkspace(ctx, prefix_temp); + + // wait for results + CUDA_CALL(cudaStreamSynchronize(stream)); + + int64_t num_blocks = 0, final_elem = 0, + copyoffset = (batch_size - 1) * sizeof(IdType); + device->CopyDataFromTo( + num_block_prefixsum, copyoffset, &num_blocks, 0, sizeof(IdType), ctx, + DGLContext{kDGLCPU, 0}, query_offsets->dtype); + device->CopyDataFromTo( + num_block_per_segment, copyoffset, &final_elem, 0, sizeof(IdType), ctx, + DGLContext{kDGLCPU, 0}, query_offsets->dtype); + num_blocks += final_elem; + device->FreeWorkspace(ctx, num_block_per_segment); + + // get batch id and local id in segment + temp_block_size = cuda::FindNumThreads(num_blocks); + temp_num_blocks = (num_blocks - 1) / temp_block_size + 1; + IdType* block_batch_id = static_cast( + device->AllocWorkspace(ctx, num_blocks * sizeof(IdType))); + IdType* local_block_id = static_cast( + device->AllocWorkspace(ctx, num_blocks * sizeof(IdType))); + CUDA_KERNEL_CALL( + GetBlockInfo, temp_num_blocks, temp_block_size, 0, stream, + 
num_block_prefixsum, block_batch_id, local_block_id, batch_size, + num_blocks); + + FloatType* dists = static_cast(device->AllocWorkspace( + ctx, k * query_points->shape[0] * sizeof(FloatType))); + CUDA_KERNEL_CALL( + BruteforceKnnShareKernel, num_blocks, block_size, + single_shared_mem * block_size, stream, data_points_data, + data_offsets_data, query_points_data, query_offsets_data, block_batch_id, + local_block_id, k, dists, query_out, data_out, batch_size, feature_size); + + device->FreeWorkspace(ctx, num_block_prefixsum); + device->FreeWorkspace(ctx, dists); + device->FreeWorkspace(ctx, local_block_id); + device->FreeWorkspace(ctx, block_batch_id); +} + +/** @brief Setup rng state for nn-descent */ +__global__ void SetupRngKernel( + curandState* states, const uint64_t seed, const size_t n) { + size_t id = blockIdx.x * blockDim.x + threadIdx.x; + if (id < n) { + curand_init(seed, id, 0, states + id); + } +} + +/** + * @brief Randomly initialize neighbors (sampling without replacement) + * for each nodes + */ +template +__global__ void RandomInitNeighborsKernel( + const FloatType* points, const IdType* offsets, IdType* central_nodes, + IdType* neighbors, FloatType* dists, bool* flags, const int k, + const int64_t feature_size, const int64_t batch_size, const uint64_t seed) { + const IdType point_idx = blockIdx.x * blockDim.x + threadIdx.x; + IdType batch_idx = 0; + if (point_idx >= offsets[batch_size]) return; + curandState state; + curand_init(seed, point_idx, 0, &state); + + // find the segment location in the input batch + for (IdType b = 0; b < batch_size + 1; ++b) { + if (offsets[b] > point_idx) { + batch_idx = b - 1; + break; + } + } + + const IdType segment_size = offsets[batch_idx + 1] - offsets[batch_idx]; + IdType* current_neighbors = neighbors + point_idx * k; + IdType* current_central_nodes = central_nodes + point_idx * k; + bool* current_flags = flags + point_idx * k; + FloatType* current_dists = dists + point_idx * k; + IdType segment_start = offsets[batch_idx]; + + // reservoir sampling + for (IdType i = 0; i < k; ++i) { + current_neighbors[i] = i + segment_start; + current_central_nodes[i] = point_idx; + } + for (IdType i = k; i < segment_size; ++i) { + const IdType j = static_cast(curand(&state) % (i + 1)); + if (j < k) current_neighbors[j] = i + segment_start; + } + + // compute distances and set flags + for (IdType i = 0; i < k; ++i) { + current_flags[i] = true; + current_dists[i] = EuclideanDist( + points + point_idx * feature_size, + points + current_neighbors[i] * feature_size, feature_size); + } + + // build heap + BuildHeap(neighbors + point_idx * k, current_dists, k); +} + +/** + * @brief Randomly select candidates from current knn and reverse-knn graph for + * nn-descent. 
+ */ +template +__global__ void FindCandidatesKernel( + const IdType* offsets, IdType* new_candidates, IdType* old_candidates, + IdType* neighbors, bool* flags, const uint64_t seed, + const int64_t batch_size, const int num_candidates, const int k) { + const IdType point_idx = blockIdx.x * blockDim.x + threadIdx.x; + IdType batch_idx = 0; + if (point_idx >= offsets[batch_size]) return; + curandState state; + curand_init(seed, point_idx, 0, &state); + + // find the segment location in the input batch + for (IdType b = 0; b < batch_size + 1; ++b) { + if (offsets[b] > point_idx) { + batch_idx = b - 1; + break; + } + } + + IdType segment_start = offsets[batch_idx], + segment_end = offsets[batch_idx + 1]; + IdType* current_neighbors = neighbors + point_idx * k; + bool* current_flags = flags + point_idx * k; + + // reset candidates + IdType* new_candidates_ptr = + new_candidates + point_idx * (num_candidates + 1); + IdType* old_candidates_ptr = + old_candidates + point_idx * (num_candidates + 1); + new_candidates_ptr[0] = 0; + old_candidates_ptr[0] = 0; + + // select candidates from current knn graph + // here we use candidate[0] for reservoir sampling temporarily + for (IdType i = 0; i < k; ++i) { + IdType candidate = current_neighbors[i]; + IdType* candidate_array = + current_flags[i] ? new_candidates_ptr : old_candidates_ptr; + IdType curr_num = candidate_array[0]; + IdType* candidate_data = candidate_array + 1; + + // reservoir sampling + if (curr_num < num_candidates) { + candidate_data[curr_num] = candidate; + } else { + IdType pos = static_cast(curand(&state) % (curr_num + 1)); + if (pos < num_candidates) candidate_data[pos] = candidate; + } + ++candidate_array[0]; + } + + // select candidates from current reverse knn graph + // here we use candidate[0] for reservoir sampling temporarily + IdType index_start = segment_start * k, index_end = segment_end * k; + for (IdType i = index_start; i < index_end; ++i) { + if (neighbors[i] == point_idx) { + IdType reverse_candidate = (i - index_start) / k + segment_start; + IdType* candidate_array = + flags[i] ? 
new_candidates_ptr : old_candidates_ptr; + IdType curr_num = candidate_array[0]; + IdType* candidate_data = candidate_array + 1; + + // reservoir sampling + if (curr_num < num_candidates) { + candidate_data[curr_num] = reverse_candidate; + } else { + IdType pos = static_cast(curand(&state) % (curr_num + 1)); + if (pos < num_candidates) candidate_data[pos] = reverse_candidate; + } + ++candidate_array[0]; + } + } + + // set candidate[0] back to length + if (new_candidates_ptr[0] > num_candidates) + new_candidates_ptr[0] = num_candidates; + if (old_candidates_ptr[0] > num_candidates) + old_candidates_ptr[0] = num_candidates; + + // mark new_candidates as old + IdType num_new_candidates = new_candidates_ptr[0]; + for (IdType i = 0; i < k; ++i) { + IdType neighbor_idx = current_neighbors[i]; + + if (current_flags[i]) { + for (IdType j = 1; j < num_new_candidates + 1; ++j) { + if (new_candidates_ptr[j] == neighbor_idx) { + current_flags[i] = false; + break; + } + } + } + } +} + +/** @brief Update knn graph according to selected candidates for nn-descent */ +template +__global__ void UpdateNeighborsKernel( + const FloatType* points, const IdType* offsets, IdType* neighbors, + IdType* new_candidates, IdType* old_candidates, FloatType* distances, + bool* flags, IdType* num_updates, const int64_t batch_size, + const int num_candidates, const int k, const int64_t feature_size) { + const IdType point_idx = blockIdx.x * blockDim.x + threadIdx.x; + if (point_idx >= offsets[batch_size]) return; + IdType* current_neighbors = neighbors + point_idx * k; + bool* current_flags = flags + point_idx * k; + FloatType* current_dists = distances + point_idx * k; + IdType* new_candidates_ptr = + new_candidates + point_idx * (num_candidates + 1); + IdType* old_candidates_ptr = + old_candidates + point_idx * (num_candidates + 1); + IdType num_new_candidates = new_candidates_ptr[0]; + IdType num_old_candidates = old_candidates_ptr[0]; + IdType current_num_updates = 0; + + // process new candidates + for (IdType i = 1; i <= num_new_candidates; ++i) { + IdType new_c = new_candidates_ptr[i]; + + // new/old candidates of the current new candidate + IdType* twohop_new_ptr = new_candidates + new_c * (num_candidates + 1); + IdType* twohop_old_ptr = old_candidates + new_c * (num_candidates + 1); + IdType num_twohop_new = twohop_new_ptr[0]; + IdType num_twohop_old = twohop_old_ptr[0]; + FloatType worst_dist = current_dists[0]; + + // new - new + for (IdType j = 1; j <= num_twohop_new; ++j) { + IdType twohop_new_c = twohop_new_ptr[j]; + FloatType new_dist = EuclideanDistWithCheck( + points + point_idx * feature_size, + points + twohop_new_c * feature_size, feature_size, worst_dist); + + if (FlaggedHeapInsert( + current_neighbors, current_dists, current_flags, twohop_new_c, + new_dist, true, k, true)) { + ++current_num_updates; + worst_dist = current_dists[0]; + } + } + + // new - old + for (IdType j = 1; j <= num_twohop_old; ++j) { + IdType twohop_old_c = twohop_old_ptr[j]; + FloatType new_dist = EuclideanDistWithCheck( + points + point_idx * feature_size, + points + twohop_old_c * feature_size, feature_size, worst_dist); + + if (FlaggedHeapInsert( + current_neighbors, current_dists, current_flags, twohop_old_c, + new_dist, true, k, true)) { + ++current_num_updates; + worst_dist = current_dists[0]; + } + } + } + + // process old candidates + for (IdType i = 1; i <= num_old_candidates; ++i) { + IdType old_c = old_candidates_ptr[i]; + + // new candidates of the current old candidate + IdType* twohop_new_ptr = new_candidates + 
old_c * (num_candidates + 1); + IdType num_twohop_new = twohop_new_ptr[0]; + FloatType worst_dist = current_dists[0]; + + // old - new + for (IdType j = 1; j <= num_twohop_new; ++j) { + IdType twohop_new_c = twohop_new_ptr[j]; + FloatType new_dist = EuclideanDistWithCheck( + points + point_idx * feature_size, + points + twohop_new_c * feature_size, feature_size, worst_dist); + + if (FlaggedHeapInsert( + current_neighbors, current_dists, current_flags, twohop_new_c, + new_dist, true, k, true)) { + ++current_num_updates; + worst_dist = current_dists[0]; + } + } + } + + num_updates[point_idx] = current_num_updates; +} + +} // namespace impl + +template +void KNN( + const NDArray& data_points, const IdArray& data_offsets, + const NDArray& query_points, const IdArray& query_offsets, const int k, + IdArray result, const std::string& algorithm) { + if (algorithm == std::string("bruteforce")) { + impl::BruteForceKNNCuda( + data_points, data_offsets, query_points, query_offsets, k, result); + } else if (algorithm == std::string("bruteforce-sharemem")) { + impl::BruteForceKNNSharedCuda( + data_points, data_offsets, query_points, query_offsets, k, result); + } else { + LOG(FATAL) << "Algorithm " << algorithm << " is not supported on CUDA."; + } +} + +template +void NNDescent( + const NDArray& points, const IdArray& offsets, IdArray result, const int k, + const int num_iters, const int num_candidates, const double delta) { + cudaStream_t stream = runtime::getCurrentCUDAStream(); + const auto& ctx = points->ctx; + auto device = runtime::DeviceAPI::Get(ctx); + const int64_t num_nodes = points->shape[0]; + const int64_t feature_size = points->shape[1]; + const int64_t batch_size = offsets->shape[0] - 1; + const IdType* offsets_data = offsets.Ptr(); + const FloatType* points_data = points.Ptr(); + + IdType* central_nodes = result.Ptr(); + IdType* neighbors = central_nodes + k * num_nodes; + uint64_t seed; + int warp_size = 0; + CUDA_CALL( + cudaDeviceGetAttribute(&warp_size, cudaDevAttrWarpSize, ctx.device_id)); + // We don't need large block sizes, since there's not much inter-thread + // communication + int64_t block_size = warp_size; + int64_t num_blocks = (num_nodes - 1) / block_size + 1; + + // allocate space for candidates, distances and flags + // we use the first element in candidate array to represent length + IdType* new_candidates = static_cast(device->AllocWorkspace( + ctx, num_nodes * (num_candidates + 1) * sizeof(IdType))); + IdType* old_candidates = static_cast(device->AllocWorkspace( + ctx, num_nodes * (num_candidates + 1) * sizeof(IdType))); + IdType* num_updates = static_cast( + device->AllocWorkspace(ctx, num_nodes * sizeof(IdType))); + FloatType* distances = static_cast( + device->AllocWorkspace(ctx, num_nodes * k * sizeof(IdType))); + bool* flags = static_cast( + device->AllocWorkspace(ctx, num_nodes * k * sizeof(IdType))); + + size_t sum_temp_size = 0; + IdType total_num_updates = 0; + IdType* total_num_updates_d = + static_cast(device->AllocWorkspace(ctx, sizeof(IdType))); + + CUDA_CALL(cub::DeviceReduce::Sum( + nullptr, sum_temp_size, num_updates, total_num_updates_d, num_nodes, + stream)); + IdType* sum_temp_storage = + static_cast(device->AllocWorkspace(ctx, sum_temp_size)); + + // random initialize neighbors + seed = RandomEngine::ThreadLocal()->RandInt( + std::numeric_limits::max()); + CUDA_KERNEL_CALL( + impl::RandomInitNeighborsKernel, num_blocks, block_size, 0, stream, + points_data, offsets_data, central_nodes, neighbors, distances, flags, k, + feature_size, batch_size, 
seed); + + for (int i = 0; i < num_iters; ++i) { + // select candidates + seed = RandomEngine::ThreadLocal()->RandInt( + std::numeric_limits::max()); + CUDA_KERNEL_CALL( + impl::FindCandidatesKernel, num_blocks, block_size, 0, stream, + offsets_data, new_candidates, old_candidates, neighbors, flags, seed, + batch_size, num_candidates, k); + + // update + CUDA_KERNEL_CALL( + impl::UpdateNeighborsKernel, num_blocks, block_size, 0, stream, + points_data, offsets_data, neighbors, new_candidates, old_candidates, + distances, flags, num_updates, batch_size, num_candidates, k, + feature_size); + + total_num_updates = 0; + CUDA_CALL(cub::DeviceReduce::Sum( + sum_temp_storage, sum_temp_size, num_updates, total_num_updates_d, + num_nodes, stream)); + device->CopyDataFromTo( + total_num_updates_d, 0, &total_num_updates, 0, sizeof(IdType), ctx, + DGLContext{kDGLCPU, 0}, offsets->dtype); + + if (total_num_updates <= static_cast(delta * k * num_nodes)) { + break; + } + } + + device->FreeWorkspace(ctx, new_candidates); + device->FreeWorkspace(ctx, old_candidates); + device->FreeWorkspace(ctx, num_updates); + device->FreeWorkspace(ctx, distances); + device->FreeWorkspace(ctx, flags); + device->FreeWorkspace(ctx, total_num_updates_d); + device->FreeWorkspace(ctx, sum_temp_storage); +} + +template void KNN( + const NDArray& data_points, const IdArray& data_offsets, + const NDArray& query_points, const IdArray& query_offsets, const int k, + IdArray result, const std::string& algorithm); +template void KNN( + const NDArray& data_points, const IdArray& data_offsets, + const NDArray& query_points, const IdArray& query_offsets, const int k, + IdArray result, const std::string& algorithm); +template void KNN( + const NDArray& data_points, const IdArray& data_offsets, + const NDArray& query_points, const IdArray& query_offsets, const int k, + IdArray result, const std::string& algorithm); +template void KNN( + const NDArray& data_points, const IdArray& data_offsets, + const NDArray& query_points, const IdArray& query_offsets, const int k, + IdArray result, const std::string& algorithm); + +template void NNDescent( + const NDArray& points, const IdArray& offsets, IdArray result, const int k, + const int num_iters, const int num_candidates, const double delta); +template void NNDescent( + const NDArray& points, const IdArray& offsets, IdArray result, const int k, + const int num_iters, const int num_candidates, const double delta); +template void NNDescent( + const NDArray& points, const IdArray& offsets, IdArray result, const int k, + const int num_iters, const int num_candidates, const double delta); +template void NNDescent( + const NDArray& points, const IdArray& offsets, IdArray result, const int k, + const int num_iters, const int num_candidates, const double delta); + +} // namespace transform +} // namespace dgl diff --git a/src/graph/transform/to_block.cc b/src/graph/transform/to_block.cc index 00f4769a9a25..963a6dfe0402 100644 --- a/src/graph/transform/to_block.cc +++ b/src/graph/transform/to_block.cc @@ -314,7 +314,7 @@ std::tuple> ToBlock( return ToBlockCPU(graph, rhs_nodes, include_rhs_in_lhs, lhs_nodes); } -#ifdef DGL_USE_CUDA +#ifdef DGL_USE_ROCM // Forward declaration of GPU ToBlock implementations - actual implementation is // in @@ -343,7 +343,7 @@ std::tuple> ToBlock( return ToBlockGPU64(graph, rhs_nodes, include_rhs_in_lhs, lhs_nodes); } -#endif // DGL_USE_CUDA +#endif // DGL_USE_ROCM DGL_REGISTER_GLOBAL("capi._CAPI_DGLToBlock") .set_body([](DGLArgs args, DGLRetValue *rv) { diff --git 
a/src/graph/transform/to_block.cc.prehip b/src/graph/transform/to_block.cc.prehip new file mode 100644 index 000000000000..00f4769a9a25 --- /dev/null +++ b/src/graph/transform/to_block.cc.prehip @@ -0,0 +1,383 @@ +/** + * Copyright 2019-2021 Contributors + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + * + * @file graph/transform/to_block.cc + * @brief Convert a graph to a bipartite-structured graph. + * + * Tested via python wrapper: python/dgl/path/to/to_block.py + */ + +#include "to_block.h" + +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include + +#include "../../array/cpu/concurrent_id_hash_map.h" + +namespace dgl { + +using namespace dgl::runtime; +using namespace dgl::aten; + +namespace transform { + +namespace { + +template +struct CPUIdsMapper { + std::tuple, std::vector> operator()( + const HeteroGraphPtr &graph, bool include_rhs_in_lhs, int64_t num_ntypes, + const DGLContext &ctx, const std::vector &max_nodes_per_type, + const std::vector &edge_arrays, + const std::vector &src_nodes, + const std::vector &rhs_nodes, + std::vector *const lhs_nodes_ptr, + std::vector *const num_nodes_per_type_ptr) { + std::vector &lhs_nodes = *lhs_nodes_ptr; + std::vector &num_nodes_per_type = *num_nodes_per_type_ptr; + + const bool generate_lhs_nodes = lhs_nodes.empty(); + if (generate_lhs_nodes) { + lhs_nodes.reserve(num_ntypes); + } + + std::vector> lhs_nodes_map(num_ntypes); + std::vector> rhs_nodes_map(num_ntypes); + for (int64_t ntype = 0; ntype < num_ntypes; ++ntype) { + IdArray unique_ids = + aten::NullArray(DGLDataTypeTraits::dtype, ctx); + if (!aten::IsNullArray(src_nodes[ntype])) { + auto num_seeds = include_rhs_in_lhs ? rhs_nodes[ntype]->shape[0] : 0; + unique_ids = lhs_nodes_map[ntype].Init(src_nodes[ntype], num_seeds); + } + if (generate_lhs_nodes) { + num_nodes_per_type[ntype] = unique_ids->shape[0]; + lhs_nodes.emplace_back(unique_ids); + } + } + + // Skip rhs mapping construction to save efforts when rhs is already + // contained in lhs. + if (!include_rhs_in_lhs) { + for (int64_t ntype = 0; ntype < num_ntypes; ++ntype) { + if (!aten::IsNullArray(rhs_nodes[ntype])) { + rhs_nodes_map[ntype].Init( + rhs_nodes[ntype], rhs_nodes[ntype]->shape[0]); + } + } + } + + // Map node numberings from global to local, and build pointer for CSR. 
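+    // Each edge type remaps its global endpoint IDs through the per-type hash
+    // maps built above; essentially:
+    //   new_lhs[etype] = lhs_nodes_map[src_type].MapIds(edges.src);
+    //   new_rhs[etype] = include_rhs_in_lhs
+    //       ? lhs_nodes_map[dst_type].MapIds(edges.dst)
+    //       : rhs_nodes_map[dst_type].MapIds(edges.dst);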
+ std::vector new_lhs; + std::vector new_rhs; + new_lhs.reserve(edge_arrays.size()); + new_rhs.reserve(edge_arrays.size()); + const int64_t num_etypes = static_cast(edge_arrays.size()); + for (int64_t etype = 0; etype < num_etypes; ++etype) { + const EdgeArray &edges = edge_arrays[etype]; + if (edges.id.defined() && !aten::IsNullArray(edges.src)) { + const auto src_dst_types = graph->GetEndpointTypes(etype); + const int src_type = src_dst_types.first; + const int dst_type = src_dst_types.second; + new_lhs.emplace_back(lhs_nodes_map[src_type].MapIds(edges.src)); + if (include_rhs_in_lhs) { + new_rhs.emplace_back(lhs_nodes_map[dst_type].MapIds(edges.dst)); + } else { + new_rhs.emplace_back(rhs_nodes_map[dst_type].MapIds(edges.dst)); + } + } else { + new_lhs.emplace_back( + aten::NullArray(DGLDataTypeTraits::dtype, ctx)); + new_rhs.emplace_back( + aten::NullArray(DGLDataTypeTraits::dtype, ctx)); + } + } + return std::tuple, std::vector>( + std::move(new_lhs), std::move(new_rhs)); + } +}; + +// Since partial specialization is not allowed for functions, use this as an +// intermediate for ToBlock where XPU = kDGLCPU. +template +std::tuple> ToBlockCPU( + HeteroGraphPtr graph, const std::vector &rhs_nodes, + bool include_rhs_in_lhs, std::vector *const lhs_nodes_ptr) { + return dgl::transform::ProcessToBlock( + graph, rhs_nodes, include_rhs_in_lhs, lhs_nodes_ptr, + CPUIdsMapper()); +} + +} // namespace + +template +std::tuple> ProcessToBlock( + HeteroGraphPtr graph, const std::vector &rhs_nodes, + bool include_rhs_in_lhs, std::vector *const lhs_nodes_ptr, + IdsMapper &&ids_mapper) { + std::vector &lhs_nodes = *lhs_nodes_ptr; + const bool generate_lhs_nodes = lhs_nodes.empty(); + + const auto &ctx = graph->Context(); + auto device = runtime::DeviceAPI::Get(ctx); + + // Since DST nodes are included in SRC nodes, a common requirement is to fetch + // the DST node features from the SRC nodes features. To avoid expensive + // sparse lookup, the function assures that the DST nodes in both SRC and DST + // sets have the same ids. As a result, given the node feature tensor ``X`` of + // type ``utype``, the following code finds the corresponding DST node + // features of type ``vtype``: + + const int64_t num_etypes = graph->NumEdgeTypes(); + const int64_t num_ntypes = graph->NumVertexTypes(); + + CHECK(rhs_nodes.size() == static_cast(num_ntypes)) + << "rhs_nodes not given for every node type"; + + std::vector edge_arrays(num_etypes); + for (int64_t etype = 0; etype < num_etypes; ++etype) { + const auto src_dst_types = graph->GetEndpointTypes(etype); + const dgl_type_t dsttype = src_dst_types.second; + if (!aten::IsNullArray(rhs_nodes[dsttype])) { + edge_arrays[etype] = graph->Edges(etype); + } + } + + // Count lhs and rhs nodes. + std::vector maxNodesPerType(num_ntypes * 2, 0); + for (int64_t ntype = 0; ntype < num_ntypes; ++ntype) { + maxNodesPerType[ntype + num_ntypes] += rhs_nodes[ntype]->shape[0]; + + if (generate_lhs_nodes) { + if (include_rhs_in_lhs) { + maxNodesPerType[ntype] += rhs_nodes[ntype]->shape[0]; + } + } else { + maxNodesPerType[ntype] += lhs_nodes[ntype]->shape[0]; + } + } + if (generate_lhs_nodes) { + // We don't have lhs_nodes, see we need to count inbound edges to get an + // upper bound. 
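+    // E.g. (hypothetical node type "user"): with include_rhs_in_lhs and one
+    // edge type carrying E edges whose sources are "user" nodes,
+    // maxNodesPerType["user"] becomes |rhs_nodes["user"]| + E. This is only an
+    // upper bound; the mapper later shrinks it to the number of unique IDs.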
+ for (int64_t etype = 0; etype < num_etypes; ++etype) { + const auto src_dst_types = graph->GetEndpointTypes(etype); + const dgl_type_t srctype = src_dst_types.first; + if (edge_arrays[etype].src.defined()) { + maxNodesPerType[srctype] += edge_arrays[etype].src->shape[0]; + } + } + } + + // Gather lhs_nodes. + std::vector src_nodes(num_ntypes); + if (generate_lhs_nodes) { + std::vector src_node_offsets(num_ntypes, 0); + for (int64_t ntype = 0; ntype < num_ntypes; ++ntype) { + src_nodes[ntype] = + NewIdArray(maxNodesPerType[ntype], ctx, sizeof(IdType) * 8); + if (include_rhs_in_lhs) { + // Place rhs nodes first. + device->CopyDataFromTo( + rhs_nodes[ntype].Ptr(), 0, src_nodes[ntype].Ptr(), + src_node_offsets[ntype], + sizeof(IdType) * rhs_nodes[ntype]->shape[0], rhs_nodes[ntype]->ctx, + src_nodes[ntype]->ctx, rhs_nodes[ntype]->dtype); + src_node_offsets[ntype] += sizeof(IdType) * rhs_nodes[ntype]->shape[0]; + } + } + for (int64_t etype = 0; etype < num_etypes; ++etype) { + const auto src_dst_types = graph->GetEndpointTypes(etype); + const dgl_type_t srctype = src_dst_types.first; + if (edge_arrays[etype].src.defined()) { + device->CopyDataFromTo( + edge_arrays[etype].src.Ptr(), 0, + src_nodes[srctype].Ptr(), src_node_offsets[srctype], + sizeof(IdType) * edge_arrays[etype].src->shape[0], + rhs_nodes[srctype]->ctx, src_nodes[srctype]->ctx, + rhs_nodes[srctype]->dtype); + + src_node_offsets[srctype] += + sizeof(IdType) * edge_arrays[etype].src->shape[0]; + } + } + } else { + for (int64_t ntype = 0; ntype < num_ntypes; ++ntype) { + src_nodes[ntype] = lhs_nodes[ntype]; + } + } + + std::vector num_nodes_per_type(num_ntypes * 2); + // Populate RHS nodes from what we already know. + for (int64_t ntype = 0; ntype < num_ntypes; ++ntype) { + num_nodes_per_type[num_ntypes + ntype] = rhs_nodes[ntype]->shape[0]; + } + + std::vector new_lhs; + std::vector new_rhs; + std::tie(new_lhs, new_rhs) = ids_mapper( + graph, include_rhs_in_lhs, num_ntypes, ctx, maxNodesPerType, edge_arrays, + src_nodes, rhs_nodes, lhs_nodes_ptr, &num_nodes_per_type); + + std::vector induced_edges; + induced_edges.reserve(num_etypes); + for (int64_t etype = 0; etype < num_etypes; ++etype) { + if (edge_arrays[etype].id.defined()) { + induced_edges.push_back(edge_arrays[etype].id); + } else { + induced_edges.push_back( + aten::NullArray(DGLDataType{kDGLInt, sizeof(IdType) * 8, 1}, ctx)); + } + } + + // Build metagraph. + const auto meta_graph = graph->meta_graph(); + const EdgeArray etypes = meta_graph->Edges("eid"); + const IdArray new_dst = Add(etypes.dst, num_ntypes); + const auto new_meta_graph = + ImmutableGraph::CreateFromCOO(num_ntypes * 2, etypes.src, new_dst); + + // Allocate vector for graph relations while GPU is busy. + std::vector rel_graphs; + rel_graphs.reserve(num_etypes); + + // Build the heterograph. + for (int64_t etype = 0; etype < num_etypes; ++etype) { + const auto src_dst_types = graph->GetEndpointTypes(etype); + const dgl_type_t srctype = src_dst_types.first; + const dgl_type_t dsttype = src_dst_types.second; + + if (rhs_nodes[dsttype]->shape[0] == 0) { + // No rhs nodes are given for this edge type. Create an empty graph. 
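+      // A placeholder unit graph is still pushed for the empty relation so
+      // that rel_graphs keeps exactly one entry per edge type, staying aligned
+      // with the metagraph constructed above.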
+ rel_graphs.push_back(CreateFromCOO( + 2, lhs_nodes[srctype]->shape[0], rhs_nodes[dsttype]->shape[0], + aten::NullArray(DGLDataType{kDGLInt, sizeof(IdType) * 8, 1}, ctx), + aten::NullArray(DGLDataType{kDGLInt, sizeof(IdType) * 8, 1}, ctx))); + } else { + rel_graphs.push_back(CreateFromCOO( + 2, lhs_nodes[srctype]->shape[0], rhs_nodes[dsttype]->shape[0], + new_lhs[etype], new_rhs[etype])); + } + } + + HeteroGraphPtr new_graph = + CreateHeteroGraph(new_meta_graph, rel_graphs, num_nodes_per_type); + + // Return the new graph, the new src nodes, and new edges. + return std::make_tuple(new_graph, induced_edges); +} + +template std::tuple> +ProcessToBlock( + HeteroGraphPtr graph, const std::vector &rhs_nodes, + bool include_rhs_in_lhs, std::vector *const lhs_nodes_ptr, + IdsMapper &&get_maping_ids); + +template std::tuple> +ProcessToBlock( + HeteroGraphPtr graph, const std::vector &rhs_nodes, + bool include_rhs_in_lhs, std::vector *const lhs_nodes_ptr, + IdsMapper &&get_maping_ids); + +template <> +std::tuple> ToBlock( + HeteroGraphPtr graph, const std::vector &rhs_nodes, + bool include_rhs_in_lhs, std::vector *const lhs_nodes) { + return ToBlockCPU(graph, rhs_nodes, include_rhs_in_lhs, lhs_nodes); +} + +template <> +std::tuple> ToBlock( + HeteroGraphPtr graph, const std::vector &rhs_nodes, + bool include_rhs_in_lhs, std::vector *const lhs_nodes) { + return ToBlockCPU(graph, rhs_nodes, include_rhs_in_lhs, lhs_nodes); +} + +#ifdef DGL_USE_CUDA + +// Forward declaration of GPU ToBlock implementations - actual implementation is +// in +// ./cuda/cuda_to_block.cu +// This is to get around the broken name mangling in VS2019 CL 16.5.5 + +// CUDA 11.3 which complains that the two template specializations have the same +// signature. +std::tuple> ToBlockGPU32( + HeteroGraphPtr, const std::vector &, bool, + std::vector *const); +std::tuple> ToBlockGPU64( + HeteroGraphPtr, const std::vector &, bool, + std::vector *const); + +template <> +std::tuple> ToBlock( + HeteroGraphPtr graph, const std::vector &rhs_nodes, + bool include_rhs_in_lhs, std::vector *const lhs_nodes) { + return ToBlockGPU32(graph, rhs_nodes, include_rhs_in_lhs, lhs_nodes); +} + +template <> +std::tuple> ToBlock( + HeteroGraphPtr graph, const std::vector &rhs_nodes, + bool include_rhs_in_lhs, std::vector *const lhs_nodes) { + return ToBlockGPU64(graph, rhs_nodes, include_rhs_in_lhs, lhs_nodes); +} + +#endif // DGL_USE_CUDA + +DGL_REGISTER_GLOBAL("capi._CAPI_DGLToBlock") + .set_body([](DGLArgs args, DGLRetValue *rv) { + const HeteroGraphRef graph_ref = args[0]; + const std::vector &rhs_nodes = + ListValueToVector(args[1]); + const bool include_rhs_in_lhs = args[2]; + std::vector lhs_nodes = ListValueToVector(args[3]); + + HeteroGraphPtr new_graph; + std::vector induced_edges; + + ATEN_XPU_SWITCH_CUDA(graph_ref->Context().device_type, XPU, "ToBlock", { + ATEN_ID_TYPE_SWITCH(graph_ref->DataType(), IdType, { + std::tie(new_graph, induced_edges) = ToBlock( + graph_ref.sptr(), rhs_nodes, include_rhs_in_lhs, &lhs_nodes); + }); + }); + + List lhs_nodes_ref; + for (IdArray &array : lhs_nodes) + lhs_nodes_ref.push_back(Value(MakeValue(array))); + List induced_edges_ref; + for (IdArray &array : induced_edges) + induced_edges_ref.push_back(Value(MakeValue(array))); + + List ret; + ret.push_back(HeteroGraphRef(new_graph)); + ret.push_back(lhs_nodes_ref); + ret.push_back(induced_edges_ref); + + *rv = ret; + }); + +}; // namespace transform + +}; // namespace dgl diff --git a/src/partition/cuda/partition_op.cu b/src/partition/cuda/partition_op.cu index 
04d10acd1e09..4db276b873b1 100644 --- a/src/partition/cuda/partition_op.cu +++ b/src/partition/cuda/partition_op.cu @@ -1,3 +1,4 @@ +#include "hip/hip_runtime.h" /** * Copyright (c) 2021 by Contributors * @file ndarray_partition.h @@ -6,7 +7,7 @@ #include -#include +#include #include "../../runtime/cuda/cuda_common.h" #include "../../runtime/workspace.h" @@ -239,7 +240,7 @@ std::pair GeneratePermutationFromRemainder( const auto& ctx = in_idx->ctx; auto device = DeviceAPI::Get(ctx); - cudaStream_t stream = runtime::getCurrentCUDAStream(); + hipStream_t stream = runtime::getCurrentCUDAStream(); const int64_t num_in = in_idx->shape[0]; @@ -295,13 +296,13 @@ std::pair GeneratePermutationFromRemainder( IdArray perm_in = aten::Range(0, num_in, sizeof(IdType) * 8, ctx); size_t sort_workspace_size; - CUDA_CALL(cub::DeviceRadixSort::SortPairs( + CUDA_CALL(hipcub::DeviceRadixSort::SortPairs( nullptr, sort_workspace_size, proc_id_in.get(), proc_id_out.get(), static_cast(perm_in->data), perm_out, num_in, 0, part_bits, stream)); Workspace sort_workspace(device, ctx, sort_workspace_size); - CUDA_CALL(cub::DeviceRadixSort::SortPairs( + CUDA_CALL(hipcub::DeviceRadixSort::SortPairs( sort_workspace.get(), sort_workspace_size, proc_id_in.get(), proc_id_out.get(), static_cast(perm_in->data), perm_out, num_in, 0, part_bits, stream)); @@ -317,7 +318,7 @@ std::pair GeneratePermutationFromRemainder( static_assert( sizeof(AtomicCount) == sizeof(*out_counts), "AtomicCount must be the same width as int64_t for atomicAdd " - "in cub::DeviceHistogram::HistogramEven() to work"); + "in hipcub::DeviceHistogram::HistogramEven() to work"); // TODO(dlasalle): Once https://github.com/NVIDIA/cub/pull/287 is merged, // add a compile time check against the cub version to allow @@ -327,14 +328,14 @@ std::pair GeneratePermutationFromRemainder( "value of int."; size_t hist_workspace_size; - CUDA_CALL(cub::DeviceHistogram::HistogramEven( + CUDA_CALL(hipcub::DeviceHistogram::HistogramEven( nullptr, hist_workspace_size, proc_id_out.get(), reinterpret_cast(out_counts), num_parts + 1, static_cast(0), static_cast(num_parts), static_cast(num_in), stream)); Workspace hist_workspace(device, ctx, hist_workspace_size); - CUDA_CALL(cub::DeviceHistogram::HistogramEven( + CUDA_CALL(hipcub::DeviceHistogram::HistogramEven( hist_workspace.get(), hist_workspace_size, proc_id_out.get(), reinterpret_cast(out_counts), num_parts + 1, static_cast(0), static_cast(num_parts), @@ -352,7 +353,7 @@ template std::pair GeneratePermutationFromRemainder< template IdArray MapToLocalFromRemainder(const int num_parts, IdArray global_idx) { const auto& ctx = global_idx->ctx; - cudaStream_t stream = runtime::getCurrentCUDAStream(); + hipStream_t stream = runtime::getCurrentCUDAStream(); if (num_parts > 1) { IdArray local_idx = @@ -387,7 +388,7 @@ IdArray MapToGlobalFromRemainder( << num_parts; const auto& ctx = local_idx->ctx; - cudaStream_t stream = runtime::getCurrentCUDAStream(); + hipStream_t stream = runtime::getCurrentCUDAStream(); if (num_parts > 1) { IdArray global_idx = @@ -423,7 +424,7 @@ std::pair GeneratePermutationFromRange( const auto& ctx = in_idx->ctx; auto device = DeviceAPI::Get(ctx); - cudaStream_t stream = runtime::getCurrentCUDAStream(); + hipStream_t stream = runtime::getCurrentCUDAStream(); const int64_t num_in = in_idx->shape[0]; @@ -470,13 +471,13 @@ std::pair GeneratePermutationFromRange( IdArray perm_in = aten::Range(0, num_in, sizeof(IdType) * 8, ctx); size_t sort_workspace_size; - CUDA_CALL(cub::DeviceRadixSort::SortPairs( + 
CUDA_CALL(hipcub::DeviceRadixSort::SortPairs( nullptr, sort_workspace_size, proc_id_in.get(), proc_id_out.get(), static_cast(perm_in->data), perm_out, num_in, 0, part_bits, stream)); Workspace sort_workspace(device, ctx, sort_workspace_size); - CUDA_CALL(cub::DeviceRadixSort::SortPairs( + CUDA_CALL(hipcub::DeviceRadixSort::SortPairs( sort_workspace.get(), sort_workspace_size, proc_id_in.get(), proc_id_out.get(), static_cast(perm_in->data), perm_out, num_in, 0, part_bits, stream)); @@ -492,7 +493,7 @@ std::pair GeneratePermutationFromRange( static_assert( sizeof(AtomicCount) == sizeof(*out_counts), "AtomicCount must be the same width as int64_t for atomicAdd " - "in cub::DeviceHistogram::HistogramEven() to work"); + "in hipcub::DeviceHistogram::HistogramEven() to work"); // TODO(dlasalle): Once https://github.com/NVIDIA/cub/pull/287 is merged, // add a compile time check against the cub version to allow @@ -502,14 +503,14 @@ std::pair GeneratePermutationFromRange( "value of int."; size_t hist_workspace_size; - CUDA_CALL(cub::DeviceHistogram::HistogramEven( + CUDA_CALL(hipcub::DeviceHistogram::HistogramEven( nullptr, hist_workspace_size, proc_id_out.get(), reinterpret_cast(out_counts), num_parts + 1, static_cast(0), static_cast(num_parts), static_cast(num_in), stream)); Workspace hist_workspace(device, ctx, hist_workspace_size); - CUDA_CALL(cub::DeviceHistogram::HistogramEven( + CUDA_CALL(hipcub::DeviceHistogram::HistogramEven( hist_workspace.get(), hist_workspace_size, proc_id_out.get(), reinterpret_cast(out_counts), num_parts + 1, static_cast(0), static_cast(num_parts), @@ -536,7 +537,7 @@ template IdArray MapToLocalFromRange( const int num_parts, IdArray range, IdArray global_idx) { const auto& ctx = global_idx->ctx; - cudaStream_t stream = runtime::getCurrentCUDAStream(); + hipStream_t stream = runtime::getCurrentCUDAStream(); if (num_parts > 1 && global_idx->shape[0] > 0) { IdArray local_idx = @@ -576,7 +577,7 @@ IdArray MapToGlobalFromRange( << num_parts; const auto& ctx = local_idx->ctx; - cudaStream_t stream = runtime::getCurrentCUDAStream(); + hipStream_t stream = runtime::getCurrentCUDAStream(); if (num_parts > 1 && local_idx->shape[0] > 0) { IdArray global_idx = diff --git a/src/partition/cuda/partition_op.cu.prehip b/src/partition/cuda/partition_op.cu.prehip new file mode 100644 index 000000000000..04d10acd1e09 --- /dev/null +++ b/src/partition/cuda/partition_op.cu.prehip @@ -0,0 +1,613 @@ +/** + * Copyright (c) 2021 by Contributors + * @file ndarray_partition.h + * @brief Operations on partition implemented in CUDA. + */ + +#include + +#include + +#include "../../runtime/cuda/cuda_common.h" +#include "../../runtime/workspace.h" +#include "../partition_op.h" + +using namespace dgl::runtime; + +namespace dgl { +namespace partition { +namespace impl { + +namespace { + +/** + * @brief Kernel to map global element IDs to partition IDs by remainder. + * + * @tparam IdType The type of ID. + * @param global The global element IDs. + * @param num_elements The number of element IDs. + * @param num_parts The number of partitions. + * @param part_id The mapped partition ID (outupt). 
+ */ +template +__global__ void _MapProcByRemainderKernel( + const IdType* const global, const int64_t num_elements, + const int64_t num_parts, IdType* const part_id) { + assert(num_elements <= gridDim.x * blockDim.x); + const int64_t idx = + blockDim.x * static_cast(blockIdx.x) + threadIdx.x; + + if (idx < num_elements) { + part_id[idx] = global[idx] % num_parts; + } +} + +/** + * @brief Kernel to map global element IDs to partition IDs, using a bit-mask. + * The number of partitions must be a power a two. + * + * @tparam IdType The type of ID. + * @param global The global element IDs. + * @param num_elements The number of element IDs. + * @param mask The bit-mask with 1's for each bit to keep from the element ID to + * extract the partition ID (e.g., an 8 partition mask would be 0x07). + * @param part_id The mapped partition ID (outupt). + */ +template +__global__ void _MapProcByMaskRemainderKernel( + const IdType* const global, const int64_t num_elements, const IdType mask, + IdType* const part_id) { + assert(num_elements <= gridDim.x * blockDim.x); + const int64_t idx = + blockDim.x * static_cast(blockIdx.x) + threadIdx.x; + + if (idx < num_elements) { + part_id[idx] = global[idx] & mask; + } +} + +/** + * @brief Kernel to map global element IDs to local element IDs. + * + * @tparam IdType The type of ID. + * @param global The global element IDs. + * @param num_elements The number of IDs. + * @param num_parts The number of partitions. + * @param local The local element IDs (output). + */ +template +__global__ void _MapLocalIndexByRemainderKernel( + const IdType* const global, const int64_t num_elements, const int num_parts, + IdType* const local) { + assert(num_elements <= gridDim.x * blockDim.x); + const int64_t idx = threadIdx.x + blockDim.x * blockIdx.x; + + if (idx < num_elements) { + local[idx] = global[idx] / num_parts; + } +} + +/** + * @brief Kernel to map local element IDs within a partition to their global + * IDs, using the remainder over the number of partitions. + * + * @tparam IdType The type of ID. + * @param local The local element IDs. + * @param part_id The partition to map local elements from. + * @param num_elements The number of elements to map. + * @param num_parts The number of partitions. + * @param global The global element IDs (output). + */ +template +__global__ void _MapGlobalIndexByRemainderKernel( + const IdType* const local, const int part_id, const int64_t num_elements, + const int num_parts, IdType* const global) { + assert(num_elements <= gridDim.x * blockDim.x); + const int64_t idx = threadIdx.x + blockDim.x * blockIdx.x; + + assert(part_id < num_parts); + + if (idx < num_elements) { + global[idx] = (local[idx] * num_parts) + part_id; + } +} + +/** + * @brief Device function to perform a binary search to find to which partition + * a given ID belongs. + * + * @tparam RangeType The type of range. + * @param range The prefix-sum of IDs assigned to partitions. + * @param num_parts The number of partitions. + * @param target The element ID to find the partition of. + * + * @return The partition. 
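 *
 * Illustrative example (added for clarity; not in the original source): with
 * num_parts = 2 and range = {0, 4, 9}, IDs 0-3 belong to partition 0 and IDs
 * 4-8 to partition 1, so a target of 5 resolves to partition 1.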
+ */ +template +__device__ RangeType _SearchRange( + const RangeType* const range, const int num_parts, const RangeType target) { + int start = 0; + int end = num_parts; + int cur = (end + start) / 2; + + assert(range[0] == 0); + assert(target < range[num_parts]); + + while (start + 1 < end) { + if (target < range[cur]) { + end = cur; + } else { + start = cur; + } + cur = (start + end) / 2; + } + + return cur; +} + +/** + * @brief Kernel to map element IDs to partition IDs. + * + * @tparam IdType The type of element ID. + * @tparam RangeType The type of of the range. + * @param range The prefix-sum of IDs assigned to partitions. + * @param global The global element IDs. + * @param num_elements The number of element IDs. + * @param num_parts The number of partitions. + * @param part_id The partition ID assigned to each element (output). + */ +template +__global__ void _MapProcByRangeKernel( + const RangeType* const range, const IdType* const global, + const int64_t num_elements, const int64_t num_parts, + IdType* const part_id) { + assert(num_elements <= gridDim.x * blockDim.x); + const int64_t idx = + blockDim.x * static_cast(blockIdx.x) + threadIdx.x; + + // rely on caching to load the range into L1 cache + if (idx < num_elements) { + part_id[idx] = static_cast(_SearchRange( + range, static_cast(num_parts), + static_cast(global[idx]))); + } +} + +/** + * @brief Kernel to map global element IDs to their ID within their respective + * partition. + * + * @tparam IdType The type of element ID. + * @tparam RangeType The type of the range. + * @param range The prefix-sum of IDs assigned to partitions. + * @param global The global element IDs. + * @param num_elements The number of elements. + * @param num_parts The number of partitions. + * @param local The local element IDs (output). + */ +template +__global__ void _MapLocalIndexByRangeKernel( + const RangeType* const range, const IdType* const global, + const int64_t num_elements, const int num_parts, IdType* const local) { + assert(num_elements <= gridDim.x * blockDim.x); + const int64_t idx = threadIdx.x + blockDim.x * blockIdx.x; + + // rely on caching to load the range into L1 cache + if (idx < num_elements) { + const int proc = _SearchRange( + range, static_cast(num_parts), + static_cast(global[idx])); + local[idx] = global[idx] - range[proc]; + } +} + +/** + * @brief Kernel to map local element IDs within a partition to their global + * IDs. + * + * @tparam IdType The type of ID. + * @tparam RangeType The type of the range. + * @param range The prefix-sum of IDs assigend to partitions. + * @param local The local element IDs. + * @param part_id The partition to map local elements from. + * @param num_elements The number of elements to map. + * @param num_parts The number of partitions. + * @param global The global element IDs (output). 
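 *
 * Illustrative example (added for clarity; not in the original source):
 * continuing range = {0, 4, 9}, local ID 2 within partition 1 maps back to
 * global ID 2 + range[1] = 6.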
+ */ +template +__global__ void _MapGlobalIndexByRangeKernel( + const RangeType* const range, const IdType* const local, const int part_id, + const int64_t num_elements, const int num_parts, IdType* const global) { + assert(num_elements <= gridDim.x * blockDim.x); + const int64_t idx = threadIdx.x + blockDim.x * blockIdx.x; + + assert(part_id < num_parts); + + // rely on caching to load the range into L1 cache + if (idx < num_elements) { + global[idx] = local[idx] + range[part_id]; + } +} +} // namespace + +// Remainder Based Partition Operations + +template +std::pair GeneratePermutationFromRemainder( + int64_t array_size, int num_parts, IdArray in_idx) { + std::pair result; + + const auto& ctx = in_idx->ctx; + auto device = DeviceAPI::Get(ctx); + cudaStream_t stream = runtime::getCurrentCUDAStream(); + + const int64_t num_in = in_idx->shape[0]; + + CHECK_GE(num_parts, 1) << "The number of partitions (" << num_parts + << ") must be at least 1."; + if (num_parts == 1) { + // no permutation + result.first = aten::Range(0, num_in, sizeof(IdType) * 8, ctx); + result.second = aten::Full(num_in, num_parts, sizeof(int64_t) * 8, ctx); + + return result; + } + + result.first = aten::NewIdArray(num_in, ctx, sizeof(IdType) * 8); + result.second = aten::Full(0, num_parts, sizeof(int64_t) * 8, ctx); + int64_t* out_counts = static_cast(result.second->data); + if (num_in == 0) { + // now that we've zero'd out_counts, nothing left to do for an empty + // mapping + return result; + } + + const int64_t part_bits = + static_cast(std::ceil(std::log2(num_parts))); + + // First, generate a mapping of indexes to processors + Workspace proc_id_in(device, ctx, num_in); + { + const dim3 block(256); + const dim3 grid((num_in + block.x - 1) / block.x); + + if (num_parts < (1 << part_bits)) { + // num_parts is not a power of 2 + CUDA_KERNEL_CALL( + _MapProcByRemainderKernel, grid, block, 0, stream, + static_cast(in_idx->data), num_in, num_parts, + proc_id_in.get()); + } else { + // num_parts is a power of 2 + CUDA_KERNEL_CALL( + _MapProcByMaskRemainderKernel, grid, block, 0, stream, + static_cast(in_idx->data), num_in, + static_cast(num_parts - 1), // bit mask + proc_id_in.get()); + } + } + + // then create a permutation array that groups processors together by + // performing a radix sort + Workspace proc_id_out(device, ctx, num_in); + IdType* perm_out = static_cast(result.first->data); + { + IdArray perm_in = aten::Range(0, num_in, sizeof(IdType) * 8, ctx); + + size_t sort_workspace_size; + CUDA_CALL(cub::DeviceRadixSort::SortPairs( + nullptr, sort_workspace_size, proc_id_in.get(), proc_id_out.get(), + static_cast(perm_in->data), perm_out, num_in, 0, part_bits, + stream)); + + Workspace sort_workspace(device, ctx, sort_workspace_size); + CUDA_CALL(cub::DeviceRadixSort::SortPairs( + sort_workspace.get(), sort_workspace_size, proc_id_in.get(), + proc_id_out.get(), static_cast(perm_in->data), perm_out, + num_in, 0, part_bits, stream)); + } + // explicitly free so workspace can be re-used + proc_id_in.free(); + + // perform a histogram and then prefixsum on the sorted proc_id vector + + // Count the number of values to be sent to each processor + { + using AtomicCount = unsigned long long; // NOLINT + static_assert( + sizeof(AtomicCount) == sizeof(*out_counts), + "AtomicCount must be the same width as int64_t for atomicAdd " + "in cub::DeviceHistogram::HistogramEven() to work"); + + // TODO(dlasalle): Once https://github.com/NVIDIA/cub/pull/287 is merged, + // add a compile time check against the cub version to allow + 
// num_in > (2 << 31). + CHECK(num_in < static_cast(std::numeric_limits::max())) + << "number of values to insert into histogram must be less than max " + "value of int."; + + size_t hist_workspace_size; + CUDA_CALL(cub::DeviceHistogram::HistogramEven( + nullptr, hist_workspace_size, proc_id_out.get(), + reinterpret_cast(out_counts), num_parts + 1, + static_cast(0), static_cast(num_parts), + static_cast(num_in), stream)); + + Workspace hist_workspace(device, ctx, hist_workspace_size); + CUDA_CALL(cub::DeviceHistogram::HistogramEven( + hist_workspace.get(), hist_workspace_size, proc_id_out.get(), + reinterpret_cast(out_counts), num_parts + 1, + static_cast(0), static_cast(num_parts), + static_cast(num_in), stream)); + } + + return result; +} + +template std::pair GeneratePermutationFromRemainder< + kDGLCUDA, int32_t>(int64_t array_size, int num_parts, IdArray in_idx); +template std::pair GeneratePermutationFromRemainder< + kDGLCUDA, int64_t>(int64_t array_size, int num_parts, IdArray in_idx); + +template +IdArray MapToLocalFromRemainder(const int num_parts, IdArray global_idx) { + const auto& ctx = global_idx->ctx; + cudaStream_t stream = runtime::getCurrentCUDAStream(); + + if (num_parts > 1) { + IdArray local_idx = + aten::NewIdArray(global_idx->shape[0], ctx, sizeof(IdType) * 8); + + const dim3 block(128); + const dim3 grid((global_idx->shape[0] + block.x - 1) / block.x); + + CUDA_KERNEL_CALL( + _MapLocalIndexByRemainderKernel, grid, block, 0, stream, + static_cast(global_idx->data), global_idx->shape[0], + num_parts, static_cast(local_idx->data)); + + return local_idx; + } else { + // no mapping to be done + return global_idx; + } +} + +template IdArray MapToLocalFromRemainder( + int num_parts, IdArray in_idx); +template IdArray MapToLocalFromRemainder( + int num_parts, IdArray in_idx); + +template +IdArray MapToGlobalFromRemainder( + const int num_parts, IdArray local_idx, const int part_id) { + CHECK_LT(part_id, num_parts) + << "Invalid partition id " << part_id << "/" << num_parts; + CHECK_GE(part_id, 0) << "Invalid partition id " << part_id << "/" + << num_parts; + + const auto& ctx = local_idx->ctx; + cudaStream_t stream = runtime::getCurrentCUDAStream(); + + if (num_parts > 1) { + IdArray global_idx = + aten::NewIdArray(local_idx->shape[0], ctx, sizeof(IdType) * 8); + + const dim3 block(128); + const dim3 grid((local_idx->shape[0] + block.x - 1) / block.x); + + CUDA_KERNEL_CALL( + _MapGlobalIndexByRemainderKernel, grid, block, 0, stream, + static_cast(local_idx->data), part_id, + global_idx->shape[0], num_parts, + static_cast(global_idx->data)); + + return global_idx; + } else { + // no mapping to be done + return local_idx; + } +} + +template IdArray MapToGlobalFromRemainder( + int num_parts, IdArray in_idx, int part_id); +template IdArray MapToGlobalFromRemainder( + int num_parts, IdArray in_idx, int part_id); + +// Range Based Partition Operations + +template +std::pair GeneratePermutationFromRange( + int64_t array_size, int num_parts, IdArray range, IdArray in_idx) { + std::pair result; + + const auto& ctx = in_idx->ctx; + auto device = DeviceAPI::Get(ctx); + cudaStream_t stream = runtime::getCurrentCUDAStream(); + + const int64_t num_in = in_idx->shape[0]; + + CHECK_GE(num_parts, 1) << "The number of partitions (" << num_parts + << ") must be at least 1."; + if (num_parts == 1) { + // no permutation + result.first = aten::Range(0, num_in, sizeof(IdType) * 8, ctx); + result.second = aten::Full(num_in, num_parts, sizeof(int64_t) * 8, ctx); + + return result; + } + + 
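  // Added note (not in the original source): the permutation below is built
  // in three steps: (1) map each input ID to its owning partition with a
  // binary search over the range prefix-sum, (2) radix-sort an identity
  // permutation by partition ID so indices bound for the same partition
  // become contiguous, and (3) histogram the sorted partition IDs to obtain
  // the per-partition counts.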
result.first = aten::NewIdArray(num_in, ctx, sizeof(IdType) * 8); + result.second = aten::Full(0, num_parts, sizeof(int64_t) * 8, ctx); + int64_t* out_counts = static_cast(result.second->data); + if (num_in == 0) { + // now that we've zero'd out_counts, nothing left to do for an empty + // mapping + return result; + } + + const int64_t part_bits = + static_cast(std::ceil(std::log2(num_parts))); + + // First, generate a mapping of indexes to processors + Workspace proc_id_in(device, ctx, num_in); + { + const dim3 block(256); + const dim3 grid((num_in + block.x - 1) / block.x); + + CUDA_KERNEL_CALL( + _MapProcByRangeKernel, grid, block, 0, stream, + static_cast(range->data), + static_cast(in_idx->data), num_in, num_parts, + proc_id_in.get()); + } + + // then create a permutation array that groups processors together by + // performing a radix sort + Workspace proc_id_out(device, ctx, num_in); + IdType* perm_out = static_cast(result.first->data); + { + IdArray perm_in = aten::Range(0, num_in, sizeof(IdType) * 8, ctx); + + size_t sort_workspace_size; + CUDA_CALL(cub::DeviceRadixSort::SortPairs( + nullptr, sort_workspace_size, proc_id_in.get(), proc_id_out.get(), + static_cast(perm_in->data), perm_out, num_in, 0, part_bits, + stream)); + + Workspace sort_workspace(device, ctx, sort_workspace_size); + CUDA_CALL(cub::DeviceRadixSort::SortPairs( + sort_workspace.get(), sort_workspace_size, proc_id_in.get(), + proc_id_out.get(), static_cast(perm_in->data), perm_out, + num_in, 0, part_bits, stream)); + } + // explicitly free so workspace can be re-used + proc_id_in.free(); + + // perform a histogram and then prefixsum on the sorted proc_id vector + + // Count the number of values to be sent to each processor + { + using AtomicCount = unsigned long long; // NOLINT + static_assert( + sizeof(AtomicCount) == sizeof(*out_counts), + "AtomicCount must be the same width as int64_t for atomicAdd " + "in cub::DeviceHistogram::HistogramEven() to work"); + + // TODO(dlasalle): Once https://github.com/NVIDIA/cub/pull/287 is merged, + // add a compile time check against the cub version to allow + // num_in > (2 << 31). 
+ CHECK(num_in < static_cast(std::numeric_limits::max())) + << "number of values to insert into histogram must be less than max " + "value of int."; + + size_t hist_workspace_size; + CUDA_CALL(cub::DeviceHistogram::HistogramEven( + nullptr, hist_workspace_size, proc_id_out.get(), + reinterpret_cast(out_counts), num_parts + 1, + static_cast(0), static_cast(num_parts), + static_cast(num_in), stream)); + + Workspace hist_workspace(device, ctx, hist_workspace_size); + CUDA_CALL(cub::DeviceHistogram::HistogramEven( + hist_workspace.get(), hist_workspace_size, proc_id_out.get(), + reinterpret_cast(out_counts), num_parts + 1, + static_cast(0), static_cast(num_parts), + static_cast(num_in), stream)); + } + + return result; +} + +template std::pair +GeneratePermutationFromRange( + int64_t array_size, int num_parts, IdArray range, IdArray in_idx); +template std::pair +GeneratePermutationFromRange( + int64_t array_size, int num_parts, IdArray range, IdArray in_idx); +template std::pair +GeneratePermutationFromRange( + int64_t array_size, int num_parts, IdArray range, IdArray in_idx); +template std::pair +GeneratePermutationFromRange( + int64_t array_size, int num_parts, IdArray range, IdArray in_idx); + +template +IdArray MapToLocalFromRange( + const int num_parts, IdArray range, IdArray global_idx) { + const auto& ctx = global_idx->ctx; + cudaStream_t stream = runtime::getCurrentCUDAStream(); + + if (num_parts > 1 && global_idx->shape[0] > 0) { + IdArray local_idx = + aten::NewIdArray(global_idx->shape[0], ctx, sizeof(IdType) * 8); + + const dim3 block(128); + const dim3 grid((global_idx->shape[0] + block.x - 1) / block.x); + + CUDA_KERNEL_CALL( + _MapLocalIndexByRangeKernel, grid, block, 0, stream, + static_cast(range->data), + static_cast(global_idx->data), global_idx->shape[0], + num_parts, static_cast(local_idx->data)); + + return local_idx; + } else { + // no mapping to be done + return global_idx; + } +} + +template IdArray MapToLocalFromRange( + int num_parts, IdArray range, IdArray in_idx); +template IdArray MapToLocalFromRange( + int num_parts, IdArray range, IdArray in_idx); +template IdArray MapToLocalFromRange( + int num_parts, IdArray range, IdArray in_idx); +template IdArray MapToLocalFromRange( + int num_parts, IdArray range, IdArray in_idx); + +template +IdArray MapToGlobalFromRange( + const int num_parts, IdArray range, IdArray local_idx, const int part_id) { + CHECK_LT(part_id, num_parts) + << "Invalid partition id " << part_id << "/" << num_parts; + CHECK_GE(part_id, 0) << "Invalid partition id " << part_id << "/" + << num_parts; + + const auto& ctx = local_idx->ctx; + cudaStream_t stream = runtime::getCurrentCUDAStream(); + + if (num_parts > 1 && local_idx->shape[0] > 0) { + IdArray global_idx = + aten::NewIdArray(local_idx->shape[0], ctx, sizeof(IdType) * 8); + + const dim3 block(128); + const dim3 grid((local_idx->shape[0] + block.x - 1) / block.x); + + CUDA_KERNEL_CALL( + _MapGlobalIndexByRangeKernel, grid, block, 0, stream, + static_cast(range->data), + static_cast(local_idx->data), part_id, + global_idx->shape[0], num_parts, + static_cast(global_idx->data)); + + return global_idx; + } else { + // no mapping to be done + return local_idx; + } +} + +template IdArray MapToGlobalFromRange( + int num_parts, IdArray range, IdArray in_idx, int part_id); +template IdArray MapToGlobalFromRange( + int num_parts, IdArray range, IdArray in_idx, int part_id); +template IdArray MapToGlobalFromRange( + int num_parts, IdArray range, IdArray in_idx, int part_id); +template IdArray 
MapToGlobalFromRange( + int num_parts, IdArray range, IdArray in_idx, int part_id); + +} // namespace impl +} // namespace partition +} // namespace dgl diff --git a/src/partition/ndarray_partition.cc b/src/partition/ndarray_partition.cc index 196f9c7535fb..f83326ed9c72 100644 --- a/src/partition/ndarray_partition.cc +++ b/src/partition/ndarray_partition.cc @@ -37,7 +37,7 @@ class RemainderPartition : public NDArrayPartition { std::pair GeneratePermutation( IdArray in_idx) const override { -#ifdef DGL_USE_CUDA +#ifdef DGL_USE_ROCM auto ctx = in_idx->ctx; if (ctx.device_type == kDGLCUDA) { ATEN_ID_TYPE_SWITCH(in_idx->dtype, IdType, { @@ -54,7 +54,7 @@ class RemainderPartition : public NDArrayPartition { } IdArray MapToLocal(IdArray in_idx) const override { -#ifdef DGL_USE_CUDA +#ifdef DGL_USE_ROCM auto ctx = in_idx->ctx; if (ctx.device_type == kDGLCUDA) { ATEN_ID_TYPE_SWITCH(in_idx->dtype, IdType, { @@ -71,7 +71,7 @@ class RemainderPartition : public NDArrayPartition { } IdArray MapToGlobal(IdArray in_idx, const int part_id) const override { -#ifdef DGL_USE_CUDA +#ifdef DGL_USE_ROCM auto ctx = in_idx->ctx; if (ctx.device_type == kDGLCUDA) { ATEN_ID_TYPE_SWITCH(in_idx->dtype, IdType, { @@ -116,7 +116,7 @@ class RangePartition : public NDArrayPartition { std::pair GeneratePermutation( IdArray in_idx) const override { -#ifdef DGL_USE_CUDA +#ifdef DGL_USE_ROCM auto ctx = in_idx->ctx; if (ctx.device_type == kDGLCUDA) { if (ctx.device_type != range_->ctx.device_type || @@ -142,7 +142,7 @@ class RangePartition : public NDArrayPartition { } IdArray MapToLocal(IdArray in_idx) const override { -#ifdef DGL_USE_CUDA +#ifdef DGL_USE_ROCM auto ctx = in_idx->ctx; if (ctx.device_type == kDGLCUDA) { ATEN_ID_TYPE_SWITCH(in_idx->dtype, IdType, { @@ -161,7 +161,7 @@ class RangePartition : public NDArrayPartition { } IdArray MapToGlobal(IdArray in_idx, const int part_id) const override { -#ifdef DGL_USE_CUDA +#ifdef DGL_USE_ROCM auto ctx = in_idx->ctx; if (ctx.device_type == kDGLCUDA) { ATEN_ID_TYPE_SWITCH(in_idx->dtype, IdType, { diff --git a/src/partition/ndarray_partition.cc.prehip b/src/partition/ndarray_partition.cc.prehip new file mode 100644 index 000000000000..196f9c7535fb --- /dev/null +++ b/src/partition/ndarray_partition.cc.prehip @@ -0,0 +1,266 @@ +/** + * Copyright (c) 2021 by Contributors + * @file ndarray_partition.cc + * @brief DGL utilities for working with the partitioned NDArrays + */ + +#include "ndarray_partition.h" + +#include +#include + +#include +#include + +#include "../c_api_common.h" +#include "partition_op.h" + +using namespace dgl::runtime; + +namespace dgl { +namespace partition { + +NDArrayPartition::NDArrayPartition( + const int64_t array_size, const int num_parts) + : array_size_(array_size), num_parts_(num_parts) {} + +int64_t NDArrayPartition::ArraySize() const { return array_size_; } + +int NDArrayPartition::NumParts() const { return num_parts_; } + +class RemainderPartition : public NDArrayPartition { + public: + RemainderPartition(const int64_t array_size, const int num_parts) + : NDArrayPartition(array_size, num_parts) { + // do nothing + } + + std::pair GeneratePermutation( + IdArray in_idx) const override { +#ifdef DGL_USE_CUDA + auto ctx = in_idx->ctx; + if (ctx.device_type == kDGLCUDA) { + ATEN_ID_TYPE_SWITCH(in_idx->dtype, IdType, { + return impl::GeneratePermutationFromRemainder( + ArraySize(), NumParts(), in_idx); + }); + } +#endif + + LOG(FATAL) << "Remainder based partitioning for the CPU is not yet " + "implemented."; + // should be unreachable + return 
std::pair{}; + } + + IdArray MapToLocal(IdArray in_idx) const override { +#ifdef DGL_USE_CUDA + auto ctx = in_idx->ctx; + if (ctx.device_type == kDGLCUDA) { + ATEN_ID_TYPE_SWITCH(in_idx->dtype, IdType, { + return impl::MapToLocalFromRemainder( + NumParts(), in_idx); + }); + } +#endif + + LOG(FATAL) << "Remainder based partitioning for the CPU is not yet " + "implemented."; + // should be unreachable + return IdArray{}; + } + + IdArray MapToGlobal(IdArray in_idx, const int part_id) const override { +#ifdef DGL_USE_CUDA + auto ctx = in_idx->ctx; + if (ctx.device_type == kDGLCUDA) { + ATEN_ID_TYPE_SWITCH(in_idx->dtype, IdType, { + return impl::MapToGlobalFromRemainder( + NumParts(), in_idx, part_id); + }); + } +#endif + + LOG(FATAL) << "Remainder based partitioning for the CPU is not yet " + "implemented."; + // should be unreachable + return IdArray{}; + } + + int64_t PartSize(const int part_id) const override { + CHECK_LT(part_id, NumParts()) << "Invalid part ID (" << part_id + << ") for " + "partition of size " + << NumParts() << "."; + return ArraySize() / NumParts() + (part_id < ArraySize() % NumParts()); + } +}; + +class RangePartition : public NDArrayPartition { + public: + RangePartition(const int64_t array_size, const int num_parts, IdArray range) + : NDArrayPartition(array_size, num_parts), + range_(range), + // We also need a copy of the range on the CPU, to compute partition + // sizes. We require the input range on the GPU, as if we have multiple + // GPUs, we can't know which is the proper one to copy the array to, but + // we have only one CPU context, and can safely copy the array to that. + range_cpu_(range.CopyTo(DGLContext{kDGLCPU, 0})) { + auto ctx = range->ctx; + if (ctx.device_type != kDGLCUDA) { + LOG(FATAL) << "The range for an NDArrayPartition is only supported " + " on GPUs. Transfer the range to the target device before " + "creating the partition."; + } + } + + std::pair GeneratePermutation( + IdArray in_idx) const override { +#ifdef DGL_USE_CUDA + auto ctx = in_idx->ctx; + if (ctx.device_type == kDGLCUDA) { + if (ctx.device_type != range_->ctx.device_type || + ctx.device_id != range_->ctx.device_id) { + LOG(FATAL) << "The range for the NDArrayPartition and the input " + "array must be on the same device: " + << ctx << " vs. 
" << range_->ctx; + } + ATEN_ID_TYPE_SWITCH(in_idx->dtype, IdType, { + ATEN_ID_TYPE_SWITCH(range_->dtype, RangeType, { + return impl::GeneratePermutationFromRange< + kDGLCUDA, IdType, RangeType>( + ArraySize(), NumParts(), range_, in_idx); + }); + }); + } +#endif + + LOG(FATAL) << "Remainder based partitioning for the CPU is not yet " + "implemented."; + // should be unreachable + return std::pair{}; + } + + IdArray MapToLocal(IdArray in_idx) const override { +#ifdef DGL_USE_CUDA + auto ctx = in_idx->ctx; + if (ctx.device_type == kDGLCUDA) { + ATEN_ID_TYPE_SWITCH(in_idx->dtype, IdType, { + ATEN_ID_TYPE_SWITCH(range_->dtype, RangeType, { + return impl::MapToLocalFromRange( + NumParts(), range_, in_idx); + }); + }); + } +#endif + + LOG(FATAL) << "Remainder based partitioning for the CPU is not yet " + "implemented."; + // should be unreachable + return IdArray{}; + } + + IdArray MapToGlobal(IdArray in_idx, const int part_id) const override { +#ifdef DGL_USE_CUDA + auto ctx = in_idx->ctx; + if (ctx.device_type == kDGLCUDA) { + ATEN_ID_TYPE_SWITCH(in_idx->dtype, IdType, { + ATEN_ID_TYPE_SWITCH(range_->dtype, RangeType, { + return impl::MapToGlobalFromRange( + NumParts(), range_, in_idx, part_id); + }); + }); + } +#endif + + LOG(FATAL) << "Remainder based partitioning for the CPU is not yet " + "implemented."; + // should be unreachable + return IdArray{}; + } + + int64_t PartSize(const int part_id) const override { + CHECK_LT(part_id, NumParts()) << "Invalid part ID (" << part_id + << ") for " + "partition of size " + << NumParts() << "."; + int64_t part_size = -1; + ATEN_ID_TYPE_SWITCH(range_cpu_->dtype, RangeType, { + const RangeType* const ptr = + static_cast(range_cpu_->data); + part_size = ptr[part_id + 1] - ptr[part_id]; + }); + return part_size; + } + + private: + IdArray range_; + IdArray range_cpu_; +}; + +NDArrayPartitionRef CreatePartitionRemainderBased( + const int64_t array_size, const int num_parts) { + return NDArrayPartitionRef( + std::make_shared(array_size, num_parts)); +} + +NDArrayPartitionRef CreatePartitionRangeBased( + const int64_t array_size, const int num_parts, IdArray range) { + return NDArrayPartitionRef( + std::make_shared(array_size, num_parts, range)); +} + +DGL_REGISTER_GLOBAL("partition._CAPI_DGLNDArrayPartitionCreateRemainderBased") + .set_body([](DGLArgs args, DGLRetValue* rv) { + int64_t array_size = args[0]; + int num_parts = args[1]; + + *rv = CreatePartitionRemainderBased(array_size, num_parts); + }); + +DGL_REGISTER_GLOBAL("partition._CAPI_DGLNDArrayPartitionCreateRangeBased") + .set_body([](DGLArgs args, DGLRetValue* rv) { + const int64_t array_size = args[0]; + const int num_parts = args[1]; + IdArray range = args[2]; + + *rv = CreatePartitionRangeBased(array_size, num_parts, range); + }); + +DGL_REGISTER_GLOBAL("partition._CAPI_DGLNDArrayPartitionGetPartSize") + .set_body([](DGLArgs args, DGLRetValue* rv) { + NDArrayPartitionRef part = args[0]; + int part_id = args[1]; + + *rv = part->PartSize(part_id); + }); + +DGL_REGISTER_GLOBAL("partition._CAPI_DGLNDArrayPartitionMapToLocal") + .set_body([](DGLArgs args, DGLRetValue* rv) { + NDArrayPartitionRef part = args[0]; + IdArray idxs = args[1]; + + *rv = part->MapToLocal(idxs); + }); + +DGL_REGISTER_GLOBAL("partition._CAPI_DGLNDArrayPartitionMapToGlobal") + .set_body([](DGLArgs args, DGLRetValue* rv) { + NDArrayPartitionRef part = args[0]; + IdArray idxs = args[1]; + const int part_id = args[2]; + + *rv = part->MapToGlobal(idxs, part_id); + }); + 
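// Illustrative sketch (added for review; not part of the patch): a plain CPU
// reference for the contract of GeneratePermutation as used by the
// remainder-based partition above -- a permutation that groups input indices
// by owning partition, plus the per-partition counts. The function name and
// the std::vector types are invented for illustration; the real code operates
// on IdArray and runs on the GPU.
#include <algorithm>
#include <cstdint>
#include <numeric>
#include <utility>
#include <vector>

std::pair<std::vector<int64_t>, std::vector<int64_t>> GeneratePermutationCPU(
    int num_parts, const std::vector<int64_t>& ids) {
  std::vector<int64_t> perm(ids.size());
  std::iota(perm.begin(), perm.end(), int64_t{0});
  // Group indices by owning partition (id % num_parts); the stable sort
  // mirrors the stable radix sort used in the CUDA implementation.
  std::stable_sort(perm.begin(), perm.end(), [&](int64_t a, int64_t b) {
    return ids[a] % num_parts < ids[b] % num_parts;
  });
  // Per-partition counts, mirroring the DeviceHistogram::HistogramEven call.
  std::vector<int64_t> counts(num_parts, 0);
  for (int64_t id : ids) ++counts[id % num_parts];
  return {perm, counts};
}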
+DGL_REGISTER_GLOBAL("partition._CAPI_DGLNDArrayPartitionGeneratePermutation") + .set_body([](DGLArgs args, DGLRetValue* rv) { + NDArrayPartitionRef part = args[0]; + IdArray idxs = args[1]; + + std::pair part_perm = part->GeneratePermutation(idxs); + *rv = + ConvertNDArrayVectorToPackedFunc({part_perm.first, part_perm.second}); + }); + +} // namespace partition +} // namespace dgl diff --git a/src/random/continuous_seed.h b/src/random/continuous_seed.h index 60ce6762806b..a4478f4e8084 100644 --- a/src/random/continuous_seed.h +++ b/src/random/continuous_seed.h @@ -25,7 +25,7 @@ #include #ifdef __NVCC__ -#include +#include #else #include @@ -61,16 +61,16 @@ class continuous_seed { #ifdef __CUDA_ARCH__ __device__ inline float uniform(const uint64_t t) const { const uint64_t kCurandSeed = 999961; // Could be any random number. - curandStatePhilox4_32_10_t rng; - curand_init(kCurandSeed, s[0], t, &rng); + hiprandStatePhilox4_32_10_t rng; + hiprand_init(kCurandSeed, s[0], t, &rng); float rnd; if (s[0] != s[1]) { - rnd = c[0] * curand_normal(&rng); - curand_init(kCurandSeed, s[1], t, &rng); - rnd += c[1] * curand_normal(&rng); + rnd = c[0] * hiprand_normal(&rng); + hiprand_init(kCurandSeed, s[1], t, &rng); + rnd += c[1] * hiprand_normal(&rng); rnd = normcdff(rnd); } else { - rnd = curand_uniform(&rng); + rnd = hiprand_uniform(&rng); } return rnd; } diff --git a/src/random/continuous_seed.h.prehip b/src/random/continuous_seed.h.prehip new file mode 100644 index 000000000000..60ce6762806b --- /dev/null +++ b/src/random/continuous_seed.h.prehip @@ -0,0 +1,100 @@ +/*! + * Copyright (c) 2023, GT-TDAlab (Muhammed Fatih Balin & Umit V. Catalyurek) + * All rights reserved. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + * + * @file dgl/continuous_seed.h + * @brief CPU and CUDA implementation for continuous random seeds + */ +#ifndef DGL_RANDOM_CONTINUOUS_SEED_H_ +#define DGL_RANDOM_CONTINUOUS_SEED_H_ + +#include + +#include + +#ifdef __NVCC__ +#include +#else +#include + +#include "pcg_random.hpp" +#endif // __CUDA_ARCH__ + +#ifndef M_SQRT1_2 +#define M_SQRT1_2 0.707106781186547524401 +#endif // M_SQRT1_2 + +namespace dgl { +namespace random { + +class continuous_seed { + uint64_t s[2]; + float c[2]; + + public: + /* implicit */ continuous_seed(const int64_t seed) { // NOLINT + s[0] = s[1] = seed; + c[0] = c[1] = 0; + } + + continuous_seed(IdArray seed_arr, float r) { + auto seed = seed_arr.Ptr(); + s[0] = seed[0]; + s[1] = seed[seed_arr->shape[0] - 1]; + const auto pi = std::acos(-1.0); + c[0] = std::cos(pi * r / 2); + c[1] = std::sin(pi * r / 2); + } + +#ifdef __CUDA_ARCH__ + __device__ inline float uniform(const uint64_t t) const { + const uint64_t kCurandSeed = 999961; // Could be any random number. 
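    // Added note (not in the original source): when the two seeds differ, the
    // code below blends two Philox-generated normal samples with weights
    // c[0] = cos(pi*r/2) and c[1] = sin(pi*r/2); since c[0]^2 + c[1]^2 = 1 the
    // blend is again standard normal, and normcdff() maps it back to a
    // uniform value in (0, 1).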
+ curandStatePhilox4_32_10_t rng; + curand_init(kCurandSeed, s[0], t, &rng); + float rnd; + if (s[0] != s[1]) { + rnd = c[0] * curand_normal(&rng); + curand_init(kCurandSeed, s[1], t, &rng); + rnd += c[1] * curand_normal(&rng); + rnd = normcdff(rnd); + } else { + rnd = curand_uniform(&rng); + } + return rnd; + } +#else + inline float uniform(const uint64_t t) const { + pcg32 ng0(s[0], t); + float rnd; + if (s[0] != s[1]) { + std::normal_distribution norm; + rnd = c[0] * norm(ng0); + pcg32 ng1(s[1], t); + norm.reset(); + rnd += c[1] * norm(ng1); + rnd = std::erfc(-rnd * static_cast(M_SQRT1_2)) / 2.0f; + } else { + std::uniform_real_distribution uni; + rnd = uni(ng0); + } + return rnd; + } +#endif // __CUDA_ARCH__ +}; + +} // namespace random +} // namespace dgl + +#endif // DGL_RANDOM_CONTINUOUS_SEED_H_ diff --git a/src/runtime/c_runtime_api.cc b/src/runtime/c_runtime_api.cc index a679e3576395..583a1816a6d1 100644 --- a/src/runtime/c_runtime_api.cc +++ b/src/runtime/c_runtime_api.cc @@ -122,13 +122,13 @@ void DeviceAPI::SyncStreamFromTo( } bool DeviceAPI::PinData(void* ptr, size_t nbytes) { - LOG(FATAL) << "Device does not support cudaHostRegister api."; + LOG(FATAL) << "Device does not support hipHostRegister api."; return false; } void* DeviceAPI::AllocPinnedDataSpace( size_t nbytes, void** ctx, void** deleter) { - LOG(FATAL) << "Device does not support cudaHostAlloc api."; + LOG(FATAL) << "Device does not support hipHostAlloc api."; return nullptr; } @@ -137,7 +137,7 @@ void DeviceAPI::FreePinnedDataSpace(void** deleter) { } void DeviceAPI::UnpinData(void* ptr) { - LOG(FATAL) << "Device does not support cudaHostUnregister api."; + LOG(FATAL) << "Device does not support hipHostUnregister api."; } } // namespace runtime } // namespace dgl diff --git a/src/runtime/c_runtime_api.cc.prehip b/src/runtime/c_runtime_api.cc.prehip new file mode 100644 index 000000000000..a679e3576395 --- /dev/null +++ b/src/runtime/c_runtime_api.cc.prehip @@ -0,0 +1,418 @@ +/** + * Copyright (c) 2016-2022 by Contributors + * @file c_runtime_api.cc + * @brief Runtime API implementation + */ +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include + +#include "runtime_base.h" + +namespace dgl { +namespace runtime { + +/** + * @brief The name of Device API factory. + * @param type The device type. + */ +inline std::string DeviceName(int type) { + switch (type) { + case kDGLCPU: + return "cpu"; + case kDGLCUDA: + return "cuda"; + // add more device here once supported + default: + LOG(FATAL) << "unknown type =" << type; + return "Unknown"; + } +} + +class DeviceAPIManager { + public: + static const int kMaxDeviceAPI = 32; + // Get API + static DeviceAPI* Get(const DGLContext& ctx) { return Get(ctx.device_type); } + static DeviceAPI* Get(int dev_type, bool allow_missing = false) { + return Global()->GetAPI(dev_type, allow_missing); + } + + private: + std::array api_; + DeviceAPI* rpc_api_{nullptr}; + std::mutex mutex_; + // constructor + DeviceAPIManager() { std::fill(api_.begin(), api_.end(), nullptr); } + // Global static variable. + static DeviceAPIManager* Global() { + static DeviceAPIManager inst; + return &inst; + } + // Get or initialize API. 
+ DeviceAPI* GetAPI(int type, bool allow_missing) { + if (type < kRPCSessMask) { + if (api_[type] != nullptr) return api_[type]; + std::lock_guard lock(mutex_); + if (api_[type] != nullptr) return api_[type]; + api_[type] = GetAPI(DeviceName(type), allow_missing); + return api_[type]; + } else { + if (rpc_api_ != nullptr) return rpc_api_; + std::lock_guard lock(mutex_); + if (rpc_api_ != nullptr) return rpc_api_; + rpc_api_ = GetAPI("rpc", allow_missing); + return rpc_api_; + } + } + DeviceAPI* GetAPI(const std::string name, bool allow_missing) { + std::string factory = "device_api." + name; + auto* f = Registry::Get(factory); + if (f == nullptr) { + CHECK(allow_missing) + << "Device API " << name + << " is not enabled. Please install the cuda version of dgl."; + return nullptr; + } + void* ptr = (*f)(); + return static_cast(ptr); + } +}; + +DeviceAPI* DeviceAPI::Get(DGLContext ctx, bool allow_missing) { + return DeviceAPIManager::Get( + static_cast(ctx.device_type), allow_missing); +} + +DeviceAPI* DeviceAPI::Get(DGLDeviceType dev_type, bool allow_missing) { + return DeviceAPIManager::Get(static_cast(dev_type), allow_missing); +} + +void* DeviceAPI::AllocWorkspace( + DGLContext ctx, size_t size, DGLDataType type_hint) { + return AllocDataSpace(ctx, size, kTempAllocaAlignment, type_hint); +} + +void DeviceAPI::FreeWorkspace(DGLContext ctx, void* ptr) { + FreeDataSpace(ctx, ptr); +} + +DGLStreamHandle DeviceAPI::CreateStream(DGLContext ctx) { + LOG(FATAL) << "Device does not support stream api."; + return 0; +} + +void DeviceAPI::FreeStream(DGLContext ctx, DGLStreamHandle stream) { + LOG(FATAL) << "Device does not support stream api."; +} + +void DeviceAPI::SyncStreamFromTo( + DGLContext ctx, DGLStreamHandle event_src, DGLStreamHandle event_dst) { + LOG(FATAL) << "Device does not support stream api."; +} + +bool DeviceAPI::PinData(void* ptr, size_t nbytes) { + LOG(FATAL) << "Device does not support cudaHostRegister api."; + return false; +} + +void* DeviceAPI::AllocPinnedDataSpace( + size_t nbytes, void** ctx, void** deleter) { + LOG(FATAL) << "Device does not support cudaHostAlloc api."; + return nullptr; +} + +void DeviceAPI::FreePinnedDataSpace(void** deleter) { + LOG(FATAL) << "Device does not support cudaHostFree api."; +} + +void DeviceAPI::UnpinData(void* ptr) { + LOG(FATAL) << "Device does not support cudaHostUnregister api."; +} +} // namespace runtime +} // namespace dgl + +using namespace dgl::runtime; + +struct DGLRuntimeEntry { + std::string ret_str; + std::string last_error; + DGLByteArray ret_bytes; +}; + +typedef dmlc::ThreadLocalStore DGLAPIRuntimeStore; + +const char* DGLGetLastError() { + return DGLAPIRuntimeStore::Get()->last_error.c_str(); +} + +void DGLAPISetLastError(const char* msg) { +#ifndef _LIBCPP_SGX_CONFIG + DGLAPIRuntimeStore::Get()->last_error = msg; +#else + sgx::OCallPackedFunc("__sgx_set_last_error__", msg); +#endif +} + +int DGLModLoadFromFile( + const char* file_name, const char* format, DGLModuleHandle* out) { + API_BEGIN(); + Module m = Module::LoadFromFile(file_name, format); + *out = new Module(m); + API_END(); +} + +int DGLModImport(DGLModuleHandle mod, DGLModuleHandle dep) { + API_BEGIN(); + static_cast(mod)->Import(*static_cast(dep)); + API_END(); +} + +int DGLModGetFunction( + DGLModuleHandle mod, const char* func_name, int query_imports, + DGLFunctionHandle* func) { + API_BEGIN(); + PackedFunc pf = + static_cast(mod)->GetFunction(func_name, query_imports != 0); + if (pf != nullptr) { + *func = new PackedFunc(pf); + } else { + *func = nullptr; + 
} + API_END(); +} + +int DGLModFree(DGLModuleHandle mod) { + API_BEGIN(); + delete static_cast(mod); + API_END(); +} + +int DGLBackendGetFuncFromEnv( + void* mod_node, const char* func_name, DGLFunctionHandle* func) { + API_BEGIN(); + *func = + (DGLFunctionHandle)(static_cast(mod_node)->GetFuncFromEnv( + func_name)); + API_END(); +} + +void* DGLBackendAllocWorkspace( + int device_type, int device_id, uint64_t size, int dtype_code_hint, + int dtype_bits_hint) { + DGLContext ctx; + ctx.device_type = static_cast(device_type); + ctx.device_id = device_id; + + DGLDataType type_hint; + type_hint.code = static_cast(dtype_code_hint); + type_hint.bits = static_cast(dtype_bits_hint); + type_hint.lanes = 1; + + return DeviceAPIManager::Get(ctx)->AllocWorkspace( + ctx, static_cast(size), type_hint); +} + +int DGLBackendFreeWorkspace(int device_type, int device_id, void* ptr) { + DGLContext ctx; + ctx.device_type = static_cast(device_type); + ctx.device_id = device_id; + DeviceAPIManager::Get(ctx)->FreeWorkspace(ctx, ptr); + return 0; +} + +int DGLBackendRunOnce(void** handle, int (*f)(void*), void* cdata, int nbytes) { + if (*handle == nullptr) { + *handle = reinterpret_cast(1); + return (*f)(cdata); + } + return 0; +} + +int DGLFuncFree(DGLFunctionHandle func) { + API_BEGIN(); + delete static_cast(func); + API_END(); +} + +int DGLFuncCall( + DGLFunctionHandle func, DGLValue* args, int* arg_type_codes, int num_args, + DGLValue* ret_val, int* ret_type_code) { + API_BEGIN(); + DGLRetValue rv; + (*static_cast(func)) + .CallPacked(DGLArgs(args, arg_type_codes, num_args), &rv); + // handle return string. + if (rv.type_code() == kStr || rv.type_code() == kDGLDataType || + rv.type_code() == kBytes) { + DGLRuntimeEntry* e = DGLAPIRuntimeStore::Get(); + if (rv.type_code() != kDGLDataType) { + e->ret_str = *rv.ptr(); + } else { + e->ret_str = rv.operator std::string(); + } + if (rv.type_code() == kBytes) { + e->ret_bytes.data = e->ret_str.c_str(); + e->ret_bytes.size = e->ret_str.length(); + *ret_type_code = kBytes; + ret_val->v_handle = &(e->ret_bytes); + } else { + *ret_type_code = kStr; + ret_val->v_str = e->ret_str.c_str(); + } + } else { + rv.MoveToCHost(ret_val, ret_type_code); + } + API_END(); +} + +int DGLCFuncSetReturn( + DGLRetValueHandle ret, DGLValue* value, int* type_code, int num_ret) { + API_BEGIN(); + CHECK_EQ(num_ret, 1); + DGLRetValue* rv = static_cast(ret); + *rv = DGLArgValue(value[0], type_code[0]); + API_END(); +} + +int DGLFuncCreateFromCFunc( + DGLPackedCFunc func, void* resource_handle, DGLPackedCFuncFinalizer fin, + DGLFunctionHandle* out) { + API_BEGIN(); + if (fin == nullptr) { + *out = + new PackedFunc([func, resource_handle](DGLArgs args, DGLRetValue* rv) { + int ret = func( + (DGLValue*)args.values, (int*)args.type_codes, // NOLINT(*) + args.num_args, rv, resource_handle); + if (ret != 0) { + std::string err = "DGLCall CFunc Error:\n"; + err += DGLGetLastError(); + throw dmlc::Error(err); + } + }); + } else { + // wrap it in a shared_ptr, with fin as deleter. + // so fin will be called when the lambda went out of scope. 
+ std::shared_ptr rpack(resource_handle, fin); + *out = new PackedFunc([func, rpack](DGLArgs args, DGLRetValue* rv) { + int ret = func( + (DGLValue*)args.values, (int*)args.type_codes, // NOLINT(*) + args.num_args, rv, rpack.get()); + if (ret != 0) { + std::string err = "DGLCall CFunc Error:\n"; + err += DGLGetLastError(); + throw dmlc::Error(err); + } + }); + } + API_END(); +} + +int DGLStreamCreate(int device_type, int device_id, DGLStreamHandle* out) { + API_BEGIN(); + DGLContext ctx; + ctx.device_type = static_cast(device_type); + ctx.device_id = device_id; + *out = DeviceAPIManager::Get(ctx)->CreateStream(ctx); + API_END(); +} + +int DGLStreamFree(int device_type, int device_id, DGLStreamHandle stream) { + API_BEGIN(); + DGLContext ctx; + ctx.device_type = static_cast(device_type); + ctx.device_id = device_id; + DeviceAPIManager::Get(ctx)->FreeStream(ctx, stream); + API_END(); +} + +int DGLSetStream(int device_type, int device_id, DGLStreamHandle stream) { + API_BEGIN(); + DGLContext ctx; + ctx.device_type = static_cast(device_type); + ctx.device_id = device_id; + DeviceAPIManager::Get(ctx)->SetStream(ctx, stream); + API_END(); +} + +int DGLGetStream(int device_type, int device_id, DGLStreamHandle* stream) { + API_BEGIN(); + DGLContext ctx; + ctx.device_type = static_cast(device_type); + ctx.device_id = device_id; + *stream = DeviceAPIManager::Get(ctx)->GetStream(); + API_END(); +} + +int DGLSynchronize(int device_type, int device_id, DGLStreamHandle stream) { + API_BEGIN(); + DGLContext ctx; + ctx.device_type = static_cast(device_type); + ctx.device_id = device_id; + DeviceAPIManager::Get(ctx)->StreamSync(ctx, stream); + API_END(); +} + +int DGLStreamStreamSynchronize( + int device_type, int device_id, DGLStreamHandle src, DGLStreamHandle dst) { + API_BEGIN(); + DGLContext ctx; + ctx.device_type = static_cast(device_type); + ctx.device_id = device_id; + DeviceAPIManager::Get(ctx)->SyncStreamFromTo(ctx, src, dst); + API_END(); +} + +int DGLCbArgToReturn(DGLValue* value, int code) { + API_BEGIN(); + dgl::runtime::DGLRetValue rv; + rv = dgl::runtime::DGLArgValue(*value, code); + int tcode; + rv.MoveToCHost(value, &tcode); + CHECK_EQ(tcode, code); + API_END(); +} + +int DGLLoadTensorAdapter(const char* path) { + return TensorDispatcher::Global()->Load(path) ? 
0 : -1; +} + +// set device api +DGL_REGISTER_GLOBAL(dgl::runtime::symbol::dgl_set_device) + .set_body([](DGLArgs args, DGLRetValue* ret) { + DGLContext ctx; + ctx.device_type = static_cast(args[0].operator int()); + ctx.device_id = args[1]; + DeviceAPIManager::Get(ctx)->SetDevice(ctx); + }); + +// set device api +DGL_REGISTER_GLOBAL("_GetDeviceAttr") + .set_body([](DGLArgs args, DGLRetValue* ret) { + DGLContext ctx; + ctx.device_type = static_cast(args[0].operator int()); + ctx.device_id = args[1]; + + DeviceAttrKind kind = static_cast(args[2].operator int()); + if (kind == kExist) { + DeviceAPI* api = DeviceAPIManager::Get(ctx.device_type, true); + if (api != nullptr) { + api->GetAttr(ctx, kind, ret); + } else { + *ret = 0; + } + } else { + DeviceAPIManager::Get(ctx)->GetAttr(ctx, kind, ret); + } + }); diff --git a/src/runtime/cuda/cuda_common.h b/src/runtime/cuda/cuda_common.h index 2fa26e101342..1534b5ce0c38 100644 --- a/src/runtime/cuda/cuda_common.h +++ b/src/runtime/cuda/cuda_common.h @@ -6,10 +6,10 @@ #ifndef DGL_RUNTIME_CUDA_CUDA_COMMON_H_ #define DGL_RUNTIME_CUDA_CUDA_COMMON_H_ -#include -#include -#include -#include +#include +#include +#include +#include #include #include @@ -26,7 +26,7 @@ namespace runtime { runtime::CUDAWorkspaceAllocator allocator(ctx); const auto stream = runtime::getCurrentCUDAStream(); - const auto exec_policy = thrust::cuda::par_nosync(allocator).on(stream); + const auto exec_policy = thrust::hip::par_nosync(allocator).on(stream); now, one can pass exec_policy to thrust functions @@ -79,112 +79,112 @@ inline bool is_zero(dim3 size) { #define CUDA_DRIVER_CALL(x) \ { \ - CUresult result = x; \ - if (result != CUDA_SUCCESS && result != CUDA_ERROR_DEINITIALIZED) { \ + hipError_t result = x; \ + if (result != hipSuccess && result != hipErrorDeinitialized) { \ const char* msg; \ - cuGetErrorName(result, &msg); \ + hipDrvGetErrorName(result, &msg); \ LOG(FATAL) << "CUDAError: " #x " failed with error: " << msg; \ } \ } #define CUDA_CALL(func) \ { \ - cudaError_t e = (func); \ - CHECK(e == cudaSuccess || e == cudaErrorCudartUnloading) \ - << "CUDA: " << cudaGetErrorString(e); \ + hipError_t e = (func); \ + CHECK(e == hipSuccess || e == hipErrorDeinitialized) \ + << "CUDA: " << hipGetErrorString(e); \ } #define CUDA_KERNEL_CALL(kernel, nblks, nthrs, shmem, stream, ...) 
\ { \ if (!dgl::runtime::is_zero((nblks)) && !dgl::runtime::is_zero((nthrs))) { \ (kernel)<<<(nblks), (nthrs), (shmem), (stream)>>>(__VA_ARGS__); \ - cudaError_t e = cudaGetLastError(); \ - CHECK(e == cudaSuccess || e == cudaErrorCudartUnloading) \ - << "CUDA kernel launch error: " << cudaGetErrorString(e); \ + hipError_t e = hipGetLastError(); \ + CHECK(e == hipSuccess || e == hipErrorDeinitialized) \ + << "CUDA kernel launch error: " << hipGetErrorString(e); \ } \ } #define CUSPARSE_CALL(func) \ { \ - cusparseStatus_t e = (func); \ - CHECK(e == CUSPARSE_STATUS_SUCCESS) << "CUSPARSE ERROR: " << e; \ + hipsparseStatus_t e = (func); \ + CHECK(e == HIPSPARSE_STATUS_SUCCESS) << "CUSPARSE ERROR: " << e; \ } #define CUBLAS_CALL(func) \ { \ - cublasStatus_t e = (func); \ - CHECK(e == CUBLAS_STATUS_SUCCESS) << "CUBLAS ERROR: " << e; \ + hipblasStatus_t e = (func); \ + CHECK(e == HIPBLAS_STATUS_SUCCESS) << "CUBLAS ERROR: " << e; \ } #define CURAND_CALL(func) \ { \ - curandStatus_t e = (func); \ - CHECK(e == CURAND_STATUS_SUCCESS) \ + hiprandStatus_t e = (func); \ + CHECK(e == HIPRAND_STATUS_SUCCESS) \ << "CURAND Error: " << dgl::runtime::curandGetErrorString(e) << " at " \ << __FILE__ << ":" << __LINE__; \ } -inline const char* curandGetErrorString(curandStatus_t error) { +inline const char* curandGetErrorString(hiprandStatus_t error) { switch (error) { - case CURAND_STATUS_SUCCESS: - return "CURAND_STATUS_SUCCESS"; - case CURAND_STATUS_VERSION_MISMATCH: - return "CURAND_STATUS_VERSION_MISMATCH"; - case CURAND_STATUS_NOT_INITIALIZED: - return "CURAND_STATUS_NOT_INITIALIZED"; - case CURAND_STATUS_ALLOCATION_FAILED: - return "CURAND_STATUS_ALLOCATION_FAILED"; - case CURAND_STATUS_TYPE_ERROR: - return "CURAND_STATUS_TYPE_ERROR"; - case CURAND_STATUS_OUT_OF_RANGE: - return "CURAND_STATUS_OUT_OF_RANGE"; - case CURAND_STATUS_LENGTH_NOT_MULTIPLE: - return "CURAND_STATUS_LENGTH_NOT_MULTIPLE"; - case CURAND_STATUS_DOUBLE_PRECISION_REQUIRED: - return "CURAND_STATUS_DOUBLE_PRECISION_REQUIRED"; - case CURAND_STATUS_LAUNCH_FAILURE: - return "CURAND_STATUS_LAUNCH_FAILURE"; - case CURAND_STATUS_PREEXISTING_FAILURE: - return "CURAND_STATUS_PREEXISTING_FAILURE"; - case CURAND_STATUS_INITIALIZATION_FAILED: - return "CURAND_STATUS_INITIALIZATION_FAILED"; - case CURAND_STATUS_ARCH_MISMATCH: - return "CURAND_STATUS_ARCH_MISMATCH"; - case CURAND_STATUS_INTERNAL_ERROR: - return "CURAND_STATUS_INTERNAL_ERROR"; + case HIPRAND_STATUS_SUCCESS: + return "HIPRAND_STATUS_SUCCESS"; + case HIPRAND_STATUS_VERSION_MISMATCH: + return "HIPRAND_STATUS_VERSION_MISMATCH"; + case HIPRAND_STATUS_NOT_INITIALIZED: + return "HIPRAND_STATUS_NOT_INITIALIZED"; + case HIPRAND_STATUS_ALLOCATION_FAILED: + return "HIPRAND_STATUS_ALLOCATION_FAILED"; + case HIPRAND_STATUS_TYPE_ERROR: + return "HIPRAND_STATUS_TYPE_ERROR"; + case HIPRAND_STATUS_OUT_OF_RANGE: + return "HIPRAND_STATUS_OUT_OF_RANGE"; + case HIPRAND_STATUS_LENGTH_NOT_MULTIPLE: + return "HIPRAND_STATUS_LENGTH_NOT_MULTIPLE"; + case HIPRAND_STATUS_DOUBLE_PRECISION_REQUIRED: + return "HIPRAND_STATUS_DOUBLE_PRECISION_REQUIRED"; + case HIPRAND_STATUS_LAUNCH_FAILURE: + return "HIPRAND_STATUS_LAUNCH_FAILURE"; + case HIPRAND_STATUS_PREEXISTING_FAILURE: + return "HIPRAND_STATUS_PREEXISTING_FAILURE"; + case HIPRAND_STATUS_INITIALIZATION_FAILED: + return "HIPRAND_STATUS_INITIALIZATION_FAILED"; + case HIPRAND_STATUS_ARCH_MISMATCH: + return "HIPRAND_STATUS_ARCH_MISMATCH"; + case HIPRAND_STATUS_INTERNAL_ERROR: + return "HIPRAND_STATUS_INTERNAL_ERROR"; } // To suppress compiler warning. 
- return "Unrecognized curand error string"; + return "Unrecognized hiprand error string"; } /** - * @brief Cast data type to cudaDataType_t. + * @brief Cast data type to hipDataType. */ template struct cuda_dtype { - static constexpr cudaDataType_t value = CUDA_R_32F; + static constexpr hipDataType value = HIP_R_32F; }; template <> struct cuda_dtype<__half> { - static constexpr cudaDataType_t value = CUDA_R_16F; + static constexpr hipDataType value = HIP_R_16F; }; #if BF16_ENABLED template <> -struct cuda_dtype<__nv_bfloat16> { - static constexpr cudaDataType_t value = CUDA_R_16BF; +struct cuda_dtype<__hip_bfloat16> { + static constexpr hipDataType value = HIP_R_16BF; }; #endif // BF16_ENABLED template <> struct cuda_dtype { - static constexpr cudaDataType_t value = CUDA_R_32F; + static constexpr hipDataType value = HIP_R_32F; }; template <> struct cuda_dtype { - static constexpr cudaDataType_t value = CUDA_R_64F; + static constexpr hipDataType value = HIP_R_64F; }; /* @@ -202,7 +202,7 @@ struct accum_dtype<__half> { #if BF16_ENABLED template <> -struct accum_dtype<__nv_bfloat16> { +struct accum_dtype<__hip_bfloat16> { typedef float type; }; #endif // BF16_ENABLED @@ -219,21 +219,21 @@ struct accum_dtype { #if CUDART_VERSION >= 11000 /** - * @brief Cast index data type to cusparseIndexType_t. + * @brief Cast index data type to hipsparseIndexType_t. */ template struct cusparse_idtype { - static constexpr cusparseIndexType_t value = CUSPARSE_INDEX_32I; + static constexpr hipsparseIndexType_t value = HIPSPARSE_INDEX_32I; }; template <> struct cusparse_idtype { - static constexpr cusparseIndexType_t value = CUSPARSE_INDEX_32I; + static constexpr hipsparseIndexType_t value = HIPSPARSE_INDEX_32I; }; template <> struct cusparse_idtype { - static constexpr cusparseIndexType_t value = CUSPARSE_INDEX_64I; + static constexpr hipsparseIndexType_t value = HIPSPARSE_INDEX_64I; }; #endif @@ -241,9 +241,9 @@ struct cusparse_idtype { class CUDAThreadEntry { public: /** @brief The cusparse handler */ - cusparseHandle_t cusparse_handle{nullptr}; + hipsparseHandle_t cusparse_handle{nullptr}; /** @brief The cublas handler */ - cublasHandle_t cublas_handle{nullptr}; + hipblasHandle_t cublas_handle{nullptr}; /** @brief thread local pool*/ WorkspacePool pool; /** @brief constructor */ @@ -253,7 +253,7 @@ class CUDAThreadEntry { }; /** @brief Get the current CUDA stream */ -cudaStream_t getCurrentCUDAStream(); +hipStream_t getCurrentCUDAStream(); } // namespace runtime } // namespace dgl #endif // DGL_RUNTIME_CUDA_CUDA_COMMON_H_ diff --git a/src/runtime/cuda/cuda_common.h.prehip b/src/runtime/cuda/cuda_common.h.prehip new file mode 100644 index 000000000000..2fa26e101342 --- /dev/null +++ b/src/runtime/cuda/cuda_common.h.prehip @@ -0,0 +1,259 @@ +/** + * Copyright (c) 2017 by Contributors + * @file cuda_common.h + * @brief Common utilities for CUDA + */ +#ifndef DGL_RUNTIME_CUDA_CUDA_COMMON_H_ +#define DGL_RUNTIME_CUDA_CUDA_COMMON_H_ + +#include +#include +#include +#include +#include + +#include +#include + +#include "../workspace_pool.h" + +namespace dgl { +namespace runtime { + +/* + How to use this class to get a nonblocking thrust execution policy that uses + DGL's memory pool and the current cuda stream + + runtime::CUDAWorkspaceAllocator allocator(ctx); + const auto stream = runtime::getCurrentCUDAStream(); + const auto exec_policy = thrust::cuda::par_nosync(allocator).on(stream); + + now, one can pass exec_policy to thrust functions + + to get an integer array of size 1000 whose lifetime is managed by 
unique_ptr, + use: auto int_array = allocator.alloc_unique(1000); int_array.get() gives + the raw pointer. +*/ +class CUDAWorkspaceAllocator { + DGLContext ctx; + + public: + typedef char value_type; + + void operator()(void* ptr) const { + runtime::DeviceAPI::Get(ctx)->FreeWorkspace(ctx, ptr); + } + + explicit CUDAWorkspaceAllocator(DGLContext ctx) : ctx(ctx) {} + + CUDAWorkspaceAllocator& operator=(const CUDAWorkspaceAllocator&) = default; + + template + std::unique_ptr alloc_unique( + std::size_t size) const { + return std::unique_ptr( + reinterpret_cast(runtime::DeviceAPI::Get(ctx)->AllocWorkspace( + ctx, sizeof(T) * size)), + *this); + } + + char* allocate(std::ptrdiff_t size) const { + return reinterpret_cast( + runtime::DeviceAPI::Get(ctx)->AllocWorkspace(ctx, size)); + } + + void deallocate(char* ptr, std::size_t) const { + runtime::DeviceAPI::Get(ctx)->FreeWorkspace(ctx, ptr); + } +}; + +template +inline bool is_zero(T size) { + return size == 0; +} + +template <> +inline bool is_zero(dim3 size) { + return size.x == 0 || size.y == 0 || size.z == 0; +} + +#define CUDA_DRIVER_CALL(x) \ + { \ + CUresult result = x; \ + if (result != CUDA_SUCCESS && result != CUDA_ERROR_DEINITIALIZED) { \ + const char* msg; \ + cuGetErrorName(result, &msg); \ + LOG(FATAL) << "CUDAError: " #x " failed with error: " << msg; \ + } \ + } + +#define CUDA_CALL(func) \ + { \ + cudaError_t e = (func); \ + CHECK(e == cudaSuccess || e == cudaErrorCudartUnloading) \ + << "CUDA: " << cudaGetErrorString(e); \ + } + +#define CUDA_KERNEL_CALL(kernel, nblks, nthrs, shmem, stream, ...) \ + { \ + if (!dgl::runtime::is_zero((nblks)) && !dgl::runtime::is_zero((nthrs))) { \ + (kernel)<<<(nblks), (nthrs), (shmem), (stream)>>>(__VA_ARGS__); \ + cudaError_t e = cudaGetLastError(); \ + CHECK(e == cudaSuccess || e == cudaErrorCudartUnloading) \ + << "CUDA kernel launch error: " << cudaGetErrorString(e); \ + } \ + } + +#define CUSPARSE_CALL(func) \ + { \ + cusparseStatus_t e = (func); \ + CHECK(e == CUSPARSE_STATUS_SUCCESS) << "CUSPARSE ERROR: " << e; \ + } + +#define CUBLAS_CALL(func) \ + { \ + cublasStatus_t e = (func); \ + CHECK(e == CUBLAS_STATUS_SUCCESS) << "CUBLAS ERROR: " << e; \ + } + +#define CURAND_CALL(func) \ + { \ + curandStatus_t e = (func); \ + CHECK(e == CURAND_STATUS_SUCCESS) \ + << "CURAND Error: " << dgl::runtime::curandGetErrorString(e) << " at " \ + << __FILE__ << ":" << __LINE__; \ + } + +inline const char* curandGetErrorString(curandStatus_t error) { + switch (error) { + case CURAND_STATUS_SUCCESS: + return "CURAND_STATUS_SUCCESS"; + case CURAND_STATUS_VERSION_MISMATCH: + return "CURAND_STATUS_VERSION_MISMATCH"; + case CURAND_STATUS_NOT_INITIALIZED: + return "CURAND_STATUS_NOT_INITIALIZED"; + case CURAND_STATUS_ALLOCATION_FAILED: + return "CURAND_STATUS_ALLOCATION_FAILED"; + case CURAND_STATUS_TYPE_ERROR: + return "CURAND_STATUS_TYPE_ERROR"; + case CURAND_STATUS_OUT_OF_RANGE: + return "CURAND_STATUS_OUT_OF_RANGE"; + case CURAND_STATUS_LENGTH_NOT_MULTIPLE: + return "CURAND_STATUS_LENGTH_NOT_MULTIPLE"; + case CURAND_STATUS_DOUBLE_PRECISION_REQUIRED: + return "CURAND_STATUS_DOUBLE_PRECISION_REQUIRED"; + case CURAND_STATUS_LAUNCH_FAILURE: + return "CURAND_STATUS_LAUNCH_FAILURE"; + case CURAND_STATUS_PREEXISTING_FAILURE: + return "CURAND_STATUS_PREEXISTING_FAILURE"; + case CURAND_STATUS_INITIALIZATION_FAILED: + return "CURAND_STATUS_INITIALIZATION_FAILED"; + case CURAND_STATUS_ARCH_MISMATCH: + return "CURAND_STATUS_ARCH_MISMATCH"; + case CURAND_STATUS_INTERNAL_ERROR: + return "CURAND_STATUS_INTERNAL_ERROR"; + 
} + // To suppress compiler warning. + return "Unrecognized curand error string"; +} + +/** + * @brief Cast data type to cudaDataType_t. + */ +template +struct cuda_dtype { + static constexpr cudaDataType_t value = CUDA_R_32F; +}; + +template <> +struct cuda_dtype<__half> { + static constexpr cudaDataType_t value = CUDA_R_16F; +}; + +#if BF16_ENABLED +template <> +struct cuda_dtype<__nv_bfloat16> { + static constexpr cudaDataType_t value = CUDA_R_16BF; +}; +#endif // BF16_ENABLED + +template <> +struct cuda_dtype { + static constexpr cudaDataType_t value = CUDA_R_32F; +}; + +template <> +struct cuda_dtype { + static constexpr cudaDataType_t value = CUDA_R_64F; +}; + +/* + * \brief Accumulator type for SpMM. + */ +template +struct accum_dtype { + typedef float type; +}; + +template <> +struct accum_dtype<__half> { + typedef float type; +}; + +#if BF16_ENABLED +template <> +struct accum_dtype<__nv_bfloat16> { + typedef float type; +}; +#endif // BF16_ENABLED + +template <> +struct accum_dtype { + typedef float type; +}; + +template <> +struct accum_dtype { + typedef double type; +}; + +#if CUDART_VERSION >= 11000 +/** + * @brief Cast index data type to cusparseIndexType_t. + */ +template +struct cusparse_idtype { + static constexpr cusparseIndexType_t value = CUSPARSE_INDEX_32I; +}; + +template <> +struct cusparse_idtype { + static constexpr cusparseIndexType_t value = CUSPARSE_INDEX_32I; +}; + +template <> +struct cusparse_idtype { + static constexpr cusparseIndexType_t value = CUSPARSE_INDEX_64I; +}; +#endif + +/** @brief Thread local workspace */ +class CUDAThreadEntry { + public: + /** @brief The cusparse handler */ + cusparseHandle_t cusparse_handle{nullptr}; + /** @brief The cublas handler */ + cublasHandle_t cublas_handle{nullptr}; + /** @brief thread local pool*/ + WorkspacePool pool; + /** @brief constructor */ + CUDAThreadEntry(); + // get the threadlocal workspace + static CUDAThreadEntry* ThreadLocal(); +}; + +/** @brief Get the current CUDA stream */ +cudaStream_t getCurrentCUDAStream(); +} // namespace runtime +} // namespace dgl +#endif // DGL_RUNTIME_CUDA_CUDA_COMMON_H_ diff --git a/src/runtime/cuda/cuda_device_api.cc b/src/runtime/cuda/cuda_device_api.cc index aa97d7a79d08..069938df9aab 100644 --- a/src/runtime/cuda/cuda_device_api.cc +++ b/src/runtime/cuda/cuda_device_api.cc @@ -3,7 +3,7 @@ * @file cuda_device_api.cc * @brief GPU specific API */ -#include +#include #include #include #include @@ -18,13 +18,13 @@ class CUDADeviceAPI final : public DeviceAPI { public: CUDADeviceAPI() { int count; - auto err = cudaGetDeviceCount(&count); + auto err = hipGetDeviceCount(&count); switch (err) { - case cudaSuccess: + case hipSuccess: break; default: count = 0; - cudaGetLastError(); + hipGetLastError(); } is_available_ = count > 0; } @@ -32,67 +32,67 @@ class CUDADeviceAPI final : public DeviceAPI { bool IsAvailable() final { return is_available_; } void SetDevice(DGLContext ctx) final { - CUDA_CALL(cudaSetDevice(ctx.device_id)); + CUDA_CALL(hipSetDevice(ctx.device_id)); } void GetAttr(DGLContext ctx, DeviceAttrKind kind, DGLRetValue* rv) final { int value = 0; switch (kind) { case kExist: value = - (cudaDeviceGetAttribute( - &value, cudaDevAttrMaxThreadsPerBlock, ctx.device_id) == - cudaSuccess); + (hipDeviceGetAttribute( + &value, hipDeviceAttributeMaxThreadsPerBlock, ctx.device_id) == + hipSuccess); break; case kMaxThreadsPerBlock: { - CUDA_CALL(cudaDeviceGetAttribute( - &value, cudaDevAttrMaxThreadsPerBlock, ctx.device_id)); + CUDA_CALL(hipDeviceGetAttribute( + &value, 
hipDeviceAttributeMaxThreadsPerBlock, ctx.device_id)); break; } case kWarpSize: { CUDA_CALL( - cudaDeviceGetAttribute(&value, cudaDevAttrWarpSize, ctx.device_id)); + hipDeviceGetAttribute(&value, hipDeviceAttributeWarpSize, ctx.device_id)); break; } case kMaxSharedMemoryPerBlock: { - CUDA_CALL(cudaDeviceGetAttribute( - &value, cudaDevAttrMaxSharedMemoryPerBlock, ctx.device_id)); + CUDA_CALL(hipDeviceGetAttribute( + &value, hipDeviceAttributeMaxSharedMemoryPerBlock, ctx.device_id)); break; } case kComputeVersion: { std::ostringstream os; - CUDA_CALL(cudaDeviceGetAttribute( - &value, cudaDevAttrComputeCapabilityMajor, ctx.device_id)); + CUDA_CALL(hipDeviceGetAttribute( + &value, hipDeviceAttributeComputeCapabilityMajor, ctx.device_id)); os << value << "."; - CUDA_CALL(cudaDeviceGetAttribute( - &value, cudaDevAttrComputeCapabilityMinor, ctx.device_id)); + CUDA_CALL(hipDeviceGetAttribute( + &value, hipDeviceAttributeComputeCapabilityMinor, ctx.device_id)); os << value; *rv = os.str(); return; } case kDeviceName: { - cudaDeviceProp props; - CUDA_CALL(cudaGetDeviceProperties(&props, ctx.device_id)); + hipDeviceProp_t props; + CUDA_CALL(hipGetDeviceProperties(&props, ctx.device_id)); *rv = std::string(props.name); return; } case kMaxClockRate: { - CUDA_CALL(cudaDeviceGetAttribute( - &value, cudaDevAttrClockRate, ctx.device_id)); + CUDA_CALL(hipDeviceGetAttribute( + &value, hipDeviceAttributeClockRate, ctx.device_id)); break; } case kMultiProcessorCount: { - CUDA_CALL(cudaDeviceGetAttribute( - &value, cudaDevAttrMultiProcessorCount, ctx.device_id)); + CUDA_CALL(hipDeviceGetAttribute( + &value, hipDeviceAttributeMultiprocessorCount, ctx.device_id)); break; } case kMaxThreadDimensions: { int dims[3]; - CUDA_CALL(cudaDeviceGetAttribute( - &dims[0], cudaDevAttrMaxBlockDimX, ctx.device_id)); - CUDA_CALL(cudaDeviceGetAttribute( - &dims[1], cudaDevAttrMaxBlockDimY, ctx.device_id)); - CUDA_CALL(cudaDeviceGetAttribute( - &dims[2], cudaDevAttrMaxBlockDimZ, ctx.device_id)); + CUDA_CALL(hipDeviceGetAttribute( + &dims[0], hipDeviceAttributeMaxBlockDimX, ctx.device_id)); + CUDA_CALL(hipDeviceGetAttribute( + &dims[1], hipDeviceAttributeMaxBlockDimY, ctx.device_id)); + CUDA_CALL(hipDeviceGetAttribute( + &dims[2], hipDeviceAttributeMaxBlockDimZ, ctx.device_id)); std::stringstream ss; // use json string to return multiple int values; ss << "[" << dims[0] << ", " << dims[1] << ", " << dims[2] << "]"; @@ -114,7 +114,7 @@ class CUDADeviceAPI final : public DeviceAPI { } CHECK_EQ(256 % alignment, 0U) << "CUDA space is aligned at 256 bytes"; void* ret; - CUDA_CALL(cudaMalloc(&ret, nbytes)); + CUDA_CALL(hipMalloc(&ret, nbytes)); return ret; } @@ -124,32 +124,32 @@ class CUDADeviceAPI final : public DeviceAPI { if (tensor_dispatcher->IsAvailable()) { return tensor_dispatcher->CUDAFreeWorkspace(ptr); } - CUDA_CALL(cudaFree(ptr)); + CUDA_CALL(hipFree(ptr)); } void CopyDataFromTo( const void* from, size_t from_offset, void* to, size_t to_offset, size_t size, DGLContext ctx_from, DGLContext ctx_to, DGLDataType type_hint, DGLStreamHandle stream) { - cudaStream_t cu_stream = static_cast(stream); + hipStream_t cu_stream = static_cast(stream); from = static_cast(from) + from_offset; to = static_cast(to) + to_offset; if (ctx_from.device_type == kDGLCUDA && ctx_to.device_type == kDGLCUDA) { - CUDA_CALL(cudaSetDevice(ctx_from.device_id)); + CUDA_CALL(hipSetDevice(ctx_from.device_id)); if (ctx_from.device_id == ctx_to.device_id) { - GPUCopy(from, to, size, cudaMemcpyDeviceToDevice, cu_stream); + GPUCopy(from, to, size, 
hipMemcpyDeviceToDevice, cu_stream); } else { - CUDA_CALL(cudaMemcpyPeerAsync( + CUDA_CALL(hipMemcpyPeerAsync( to, ctx_to.device_id, from, ctx_from.device_id, size, cu_stream)); } } else if ( ctx_from.device_type == kDGLCUDA && ctx_to.device_type == kDGLCPU) { - CUDA_CALL(cudaSetDevice(ctx_from.device_id)); - GPUCopy(from, to, size, cudaMemcpyDeviceToHost, cu_stream); + CUDA_CALL(hipSetDevice(ctx_from.device_id)); + GPUCopy(from, to, size, hipMemcpyDeviceToHost, cu_stream); } else if ( ctx_from.device_type == kDGLCPU && ctx_to.device_type == kDGLCUDA) { - CUDA_CALL(cudaSetDevice(ctx_to.device_id)); - GPUCopy(from, to, size, cudaMemcpyHostToDevice, cu_stream); + CUDA_CALL(hipSetDevice(ctx_to.device_id)); + GPUCopy(from, to, size, hipMemcpyHostToDevice, cu_stream); } else { LOG(FATAL) << "expect copy from/to GPU or between GPU"; } @@ -166,9 +166,9 @@ class CUDADeviceAPI final : public DeviceAPI { } // To ensure correct behavior, `record_event` must be invoked anytime a - // pointer from PyTorch CachingHostAllocator is used in a cudaMemcpyAsync + // pointer from PyTorch CachingHostAllocator is used in a hipMemcpyAsync // call. It provides a way to re-use freed pinned (page-locked) memory - // allocations and avoid device sync due to cudaFreeHost calls. + // allocations and avoid device sync due to hipHostFree calls. void RecordedCopyDataFromTo( void* from, size_t from_offset, void* to, size_t to_offset, size_t size, DGLContext ctx_from, DGLContext ctx_to, DGLDataType type_hint, @@ -179,7 +179,7 @@ class CUDADeviceAPI final : public DeviceAPI { stream); auto tensor_dispatcher = TensorDispatcher::Global(); if (tensor_dispatcher->IsAvailable()) { - auto custream = static_cast(stream); + auto custream = static_cast(stream); void* ptr = ctx_to.device_type == kDGLCPU ? to : from; int id = ctx_to.device_type == kDGLCPU ? 
ctx_from.device_id : ctx_to.device_id; @@ -188,34 +188,34 @@ class CUDADeviceAPI final : public DeviceAPI { } DGLStreamHandle CreateStream(DGLContext ctx) { - CUDA_CALL(cudaSetDevice(ctx.device_id)); - cudaStream_t retval; + CUDA_CALL(hipSetDevice(ctx.device_id)); + hipStream_t retval; // make sure the legacy default stream won't block on this stream - CUDA_CALL(cudaStreamCreateWithFlags(&retval, cudaStreamNonBlocking)); + CUDA_CALL(hipStreamCreateWithFlags(&retval, hipStreamNonBlocking)); return static_cast(retval); } void FreeStream(DGLContext ctx, DGLStreamHandle stream) { - CUDA_CALL(cudaSetDevice(ctx.device_id)); - cudaStream_t cu_stream = static_cast(stream); - CUDA_CALL(cudaStreamDestroy(cu_stream)); + CUDA_CALL(hipSetDevice(ctx.device_id)); + hipStream_t cu_stream = static_cast(stream); + CUDA_CALL(hipStreamDestroy(cu_stream)); } void SyncStreamFromTo( DGLContext ctx, DGLStreamHandle event_src, DGLStreamHandle event_dst) { - CUDA_CALL(cudaSetDevice(ctx.device_id)); - cudaStream_t src_stream = static_cast(event_src); - cudaStream_t dst_stream = static_cast(event_dst); - cudaEvent_t evt; - CUDA_CALL(cudaEventCreate(&evt)); - CUDA_CALL(cudaEventRecord(evt, src_stream)); - CUDA_CALL(cudaStreamWaitEvent(dst_stream, evt, 0)); - CUDA_CALL(cudaEventDestroy(evt)); + CUDA_CALL(hipSetDevice(ctx.device_id)); + hipStream_t src_stream = static_cast(event_src); + hipStream_t dst_stream = static_cast(event_dst); + hipEvent_t evt; + CUDA_CALL(hipEventCreate(&evt)); + CUDA_CALL(hipEventRecord(evt, src_stream)); + CUDA_CALL(hipStreamWaitEvent(dst_stream, evt, 0)); + CUDA_CALL(hipEventDestroy(evt)); } void StreamSync(DGLContext ctx, DGLStreamHandle stream) final { - CUDA_CALL(cudaSetDevice(ctx.device_id)); - CUDA_CALL(cudaStreamSynchronize(static_cast(stream))); + CUDA_CALL(hipSetDevice(ctx.device_id)); + CUDA_CALL(hipStreamSynchronize(static_cast(stream))); } /** NOTE: If the backend is PyTorch, we will use PyTorch's stream management, @@ -230,7 +230,7 @@ class CUDADeviceAPI final : public DeviceAPI { return static_cast(getCurrentCUDAStream()); } - /** NOTE: cudaHostRegister can be called from an arbitrary GPU device, + /** NOTE: hipHostRegister can be called from an arbitrary GPU device, * so we don't need to specify a ctx. * The pinned memory can be seen by all CUDA contexts, * not just the one that performed the allocation @@ -244,13 +244,13 @@ class CUDADeviceAPI final : public DeviceAPI { if (tensor_dispatcher->IsAvailable()) { tensor_dispatcher->CUDAHostAllocatorEmptyCache(); } - CUDA_CALL(cudaHostRegister(ptr, nbytes, cudaHostRegisterDefault)); + CUDA_CALL(hipHostRegister(ptr, nbytes, hipHostRegisterDefault)); return true; } void UnpinData(void* ptr) { if (ptr == nullptr) return; - CUDA_CALL(cudaHostUnregister(ptr)); + CUDA_CALL(hipHostUnregister(ptr)); } void* AllocPinnedDataSpace( @@ -276,33 +276,33 @@ class CUDADeviceAPI final : public DeviceAPI { // can't be a pinned tensor if CUDA context is unavailable. 
if (!is_available_) return false; - cudaPointerAttributes attr; - cudaError_t status = cudaPointerGetAttributes(&attr, ptr); + hipPointerAttribute_t attr; + hipError_t status = hipPointerGetAttributes(&attr, ptr); bool result = false; switch (status) { - case cudaErrorInvalidValue: + case hipErrorInvalidValue: // might be a normal CPU tensor in CUDA 10.2- - cudaGetLastError(); // clear error + hipGetLastError(); // clear error break; - case cudaSuccess: - result = (attr.type == cudaMemoryTypeHost); + case hipSuccess: + result = (attr.type == hipMemoryTypeHost); break; - case cudaErrorInitializationError: - case cudaErrorNoDevice: - case cudaErrorInsufficientDriver: - case cudaErrorInvalidDevice: + case hipErrorNotInitialized: + case hipErrorNoDevice: + case hipErrorInsufficientDriver: + case hipErrorInvalidDevice: // We don't want to fail in these particular cases since this function // can be called when users only want to run on CPU even if CUDA API is // enabled, or in a forked subprocess where CUDA context cannot be // initialized. So we just mark the CUDA context to unavailable and // return. is_available_ = false; - cudaGetLastError(); // clear error + hipGetLastError(); // clear error break; default: LOG(FATAL) << "error while determining memory status: " - << cudaGetErrorString(status); + << hipGetErrorString(status); break; } @@ -338,13 +338,13 @@ class CUDADeviceAPI final : public DeviceAPI { private: static void GPUCopy( - const void* from, void* to, size_t size, cudaMemcpyKind kind, - cudaStream_t stream) { - CUDA_CALL(cudaMemcpyAsync(to, from, size, kind, stream)); - if (stream == 0 && kind == cudaMemcpyDeviceToHost) { + const void* from, void* to, size_t size, hipMemcpyKind kind, + hipStream_t stream) { + CUDA_CALL(hipMemcpyAsync(to, from, size, kind, stream)); + if (stream == 0 && kind == hipMemcpyDeviceToHost) { // only wait for the copy, when it's on the default stream, and it's to // host memory - CUDA_CALL(cudaStreamSynchronize(stream)); + CUDA_CALL(hipStreamSynchronize(stream)); } } @@ -359,7 +359,7 @@ CUDAThreadEntry* CUDAThreadEntry::ThreadLocal() { return CUDAThreadStore::Get(); } -cudaStream_t getCurrentCUDAStream() { +hipStream_t getCurrentCUDAStream() { TensorDispatcher* tensor_dispatcher = TensorDispatcher::Global(); if (tensor_dispatcher->IsAvailable()) return tensor_dispatcher->CUDAGetCurrentStream(); diff --git a/src/runtime/cuda/cuda_device_api.cc.prehip b/src/runtime/cuda/cuda_device_api.cc.prehip new file mode 100644 index 000000000000..aa97d7a79d08 --- /dev/null +++ b/src/runtime/cuda/cuda_device_api.cc.prehip @@ -0,0 +1,377 @@ +/** + * Copyright (c) 2017-2022 by Contributors + * @file cuda_device_api.cc + * @brief GPU specific API + */ +#include +#include +#include +#include +#include + +#include "cuda_common.h" + +namespace dgl { +namespace runtime { + +class CUDADeviceAPI final : public DeviceAPI { + public: + CUDADeviceAPI() { + int count; + auto err = cudaGetDeviceCount(&count); + switch (err) { + case cudaSuccess: + break; + default: + count = 0; + cudaGetLastError(); + } + is_available_ = count > 0; + } + + bool IsAvailable() final { return is_available_; } + + void SetDevice(DGLContext ctx) final { + CUDA_CALL(cudaSetDevice(ctx.device_id)); + } + void GetAttr(DGLContext ctx, DeviceAttrKind kind, DGLRetValue* rv) final { + int value = 0; + switch (kind) { + case kExist: + value = + (cudaDeviceGetAttribute( + &value, cudaDevAttrMaxThreadsPerBlock, ctx.device_id) == + cudaSuccess); + break; + case kMaxThreadsPerBlock: { + 
CUDA_CALL(cudaDeviceGetAttribute( + &value, cudaDevAttrMaxThreadsPerBlock, ctx.device_id)); + break; + } + case kWarpSize: { + CUDA_CALL( + cudaDeviceGetAttribute(&value, cudaDevAttrWarpSize, ctx.device_id)); + break; + } + case kMaxSharedMemoryPerBlock: { + CUDA_CALL(cudaDeviceGetAttribute( + &value, cudaDevAttrMaxSharedMemoryPerBlock, ctx.device_id)); + break; + } + case kComputeVersion: { + std::ostringstream os; + CUDA_CALL(cudaDeviceGetAttribute( + &value, cudaDevAttrComputeCapabilityMajor, ctx.device_id)); + os << value << "."; + CUDA_CALL(cudaDeviceGetAttribute( + &value, cudaDevAttrComputeCapabilityMinor, ctx.device_id)); + os << value; + *rv = os.str(); + return; + } + case kDeviceName: { + cudaDeviceProp props; + CUDA_CALL(cudaGetDeviceProperties(&props, ctx.device_id)); + *rv = std::string(props.name); + return; + } + case kMaxClockRate: { + CUDA_CALL(cudaDeviceGetAttribute( + &value, cudaDevAttrClockRate, ctx.device_id)); + break; + } + case kMultiProcessorCount: { + CUDA_CALL(cudaDeviceGetAttribute( + &value, cudaDevAttrMultiProcessorCount, ctx.device_id)); + break; + } + case kMaxThreadDimensions: { + int dims[3]; + CUDA_CALL(cudaDeviceGetAttribute( + &dims[0], cudaDevAttrMaxBlockDimX, ctx.device_id)); + CUDA_CALL(cudaDeviceGetAttribute( + &dims[1], cudaDevAttrMaxBlockDimY, ctx.device_id)); + CUDA_CALL(cudaDeviceGetAttribute( + &dims[2], cudaDevAttrMaxBlockDimZ, ctx.device_id)); + + std::stringstream ss; // use json string to return multiple int values; + ss << "[" << dims[0] << ", " << dims[1] << ", " << dims[2] << "]"; + *rv = ss.str(); + return; + } + } + *rv = value; + } + void* AllocDataSpace( + DGLContext ctx, size_t nbytes, size_t alignment, + DGLDataType type_hint) final { + SetDevice(ctx); + // Redirect to PyTorch's allocator when available. 
+ TensorDispatcher* tensor_dispatcher = TensorDispatcher::Global(); + if (tensor_dispatcher->IsAvailable()) { + return tensor_dispatcher->CUDAAllocWorkspace( + nbytes, getCurrentCUDAStream()); + } + CHECK_EQ(256 % alignment, 0U) << "CUDA space is aligned at 256 bytes"; + void* ret; + CUDA_CALL(cudaMalloc(&ret, nbytes)); + return ret; + } + + void FreeDataSpace(DGLContext ctx, void* ptr) final { + SetDevice(ctx); + TensorDispatcher* tensor_dispatcher = TensorDispatcher::Global(); + if (tensor_dispatcher->IsAvailable()) { + return tensor_dispatcher->CUDAFreeWorkspace(ptr); + } + CUDA_CALL(cudaFree(ptr)); + } + + void CopyDataFromTo( + const void* from, size_t from_offset, void* to, size_t to_offset, + size_t size, DGLContext ctx_from, DGLContext ctx_to, + DGLDataType type_hint, DGLStreamHandle stream) { + cudaStream_t cu_stream = static_cast(stream); + from = static_cast(from) + from_offset; + to = static_cast(to) + to_offset; + if (ctx_from.device_type == kDGLCUDA && ctx_to.device_type == kDGLCUDA) { + CUDA_CALL(cudaSetDevice(ctx_from.device_id)); + if (ctx_from.device_id == ctx_to.device_id) { + GPUCopy(from, to, size, cudaMemcpyDeviceToDevice, cu_stream); + } else { + CUDA_CALL(cudaMemcpyPeerAsync( + to, ctx_to.device_id, from, ctx_from.device_id, size, cu_stream)); + } + } else if ( + ctx_from.device_type == kDGLCUDA && ctx_to.device_type == kDGLCPU) { + CUDA_CALL(cudaSetDevice(ctx_from.device_id)); + GPUCopy(from, to, size, cudaMemcpyDeviceToHost, cu_stream); + } else if ( + ctx_from.device_type == kDGLCPU && ctx_to.device_type == kDGLCUDA) { + CUDA_CALL(cudaSetDevice(ctx_to.device_id)); + GPUCopy(from, to, size, cudaMemcpyHostToDevice, cu_stream); + } else { + LOG(FATAL) << "expect copy from/to GPU or between GPU"; + } + } + + void CopyDataFromTo( + const void* from, size_t from_offset, void* to, size_t to_offset, + size_t size, DGLContext ctx_from, DGLContext ctx_to, + DGLDataType type_hint) final { + auto stream = GetStream(); + CopyDataFromTo( + from, from_offset, to, to_offset, size, ctx_from, ctx_to, type_hint, + stream); + } + + // To ensure correct behavior, `record_event` must be invoked anytime a + // pointer from PyTorch CachingHostAllocator is used in a cudaMemcpyAsync + // call. It provides a way to re-use freed pinned (page-locked) memory + // allocations and avoid device sync due to cudaFreeHost calls. + void RecordedCopyDataFromTo( + void* from, size_t from_offset, void* to, size_t to_offset, size_t size, + DGLContext ctx_from, DGLContext ctx_to, DGLDataType type_hint, + void* pytorch_ctx) final { + auto stream = GetStream(); + CopyDataFromTo( + from, from_offset, to, to_offset, size, ctx_from, ctx_to, type_hint, + stream); + auto tensor_dispatcher = TensorDispatcher::Global(); + if (tensor_dispatcher->IsAvailable()) { + auto custream = static_cast(stream); + void* ptr = ctx_to.device_type == kDGLCPU ? to : from; + int id = + ctx_to.device_type == kDGLCPU ? 
ctx_from.device_id : ctx_to.device_id; + tensor_dispatcher->CUDARecordHostAlloc(ptr, pytorch_ctx, custream, id); + } + } + + DGLStreamHandle CreateStream(DGLContext ctx) { + CUDA_CALL(cudaSetDevice(ctx.device_id)); + cudaStream_t retval; + // make sure the legacy default stream won't block on this stream + CUDA_CALL(cudaStreamCreateWithFlags(&retval, cudaStreamNonBlocking)); + return static_cast(retval); + } + + void FreeStream(DGLContext ctx, DGLStreamHandle stream) { + CUDA_CALL(cudaSetDevice(ctx.device_id)); + cudaStream_t cu_stream = static_cast(stream); + CUDA_CALL(cudaStreamDestroy(cu_stream)); + } + + void SyncStreamFromTo( + DGLContext ctx, DGLStreamHandle event_src, DGLStreamHandle event_dst) { + CUDA_CALL(cudaSetDevice(ctx.device_id)); + cudaStream_t src_stream = static_cast(event_src); + cudaStream_t dst_stream = static_cast(event_dst); + cudaEvent_t evt; + CUDA_CALL(cudaEventCreate(&evt)); + CUDA_CALL(cudaEventRecord(evt, src_stream)); + CUDA_CALL(cudaStreamWaitEvent(dst_stream, evt, 0)); + CUDA_CALL(cudaEventDestroy(evt)); + } + + void StreamSync(DGLContext ctx, DGLStreamHandle stream) final { + CUDA_CALL(cudaSetDevice(ctx.device_id)); + CUDA_CALL(cudaStreamSynchronize(static_cast(stream))); + } + + /** NOTE: If the backend is PyTorch, we will use PyTorch's stream management, + * so just avoid calling our SetStream/CreateStream unless + * you really need advanced stream control. + * TODO(Xin): Redirect this to PyTorch or remove it. + * PyTorch allows external CUDA streams to be set as current since v1.11. + */ + void SetStream(DGLContext ctx, DGLStreamHandle stream) final {} + + DGLStreamHandle GetStream() const final { + return static_cast(getCurrentCUDAStream()); + } + + /** NOTE: cudaHostRegister can be called from an arbitrary GPU device, + * so we don't need to specify a ctx. + * The pinned memory can be seen by all CUDA contexts, + * not just the one that performed the allocation + */ + bool PinData(void* ptr, size_t nbytes) override { + // prevent users from pinning empty tensors or graphs + if (ptr == nullptr || nbytes == 0) return false; + TensorDispatcher* tensor_dispatcher = TensorDispatcher::Global(); + // Minimize the pinned memory pool allocated by backend (via tensoradapter) + // to preserve enough memory for DGL inherited in-place pin-memory operation + if (tensor_dispatcher->IsAvailable()) { + tensor_dispatcher->CUDAHostAllocatorEmptyCache(); + } + CUDA_CALL(cudaHostRegister(ptr, nbytes, cudaHostRegisterDefault)); + return true; + } + + void UnpinData(void* ptr) { + if (ptr == nullptr) return; + CUDA_CALL(cudaHostUnregister(ptr)); + } + + void* AllocPinnedDataSpace( + size_t nbytes, void** ctx, void** deleter) override { + // prevent pinning empty tensors or graphs + if (nbytes == 0) return nullptr; + TensorDispatcher* tensor_dispatcher = TensorDispatcher::Global(); + CHECK(tensor_dispatcher->IsAvailable()) + << "CachingHostAllocator is not available in the current backend " + "PyTorch. Please update the PyTorch version to 1.11+"; + return tensor_dispatcher->CUDAAllocHostWorkspace(nbytes, ctx, deleter); + } + + void FreePinnedDataSpace(void** deleter) override { + TensorDispatcher* tensor_dispatcher = TensorDispatcher::Global(); + CHECK(tensor_dispatcher->IsAvailable()) + << "CachingHostAllocator is not available in the current backend " + "PyTorch. Please update the PyTorch version to 1.11+"; + tensor_dispatcher->CUDAFreeHostWorkspace(deleter); + } + + bool IsPinned(const void* ptr) override { + // can't be a pinned tensor if CUDA context is unavailable. 
+ if (!is_available_) return false; + + cudaPointerAttributes attr; + cudaError_t status = cudaPointerGetAttributes(&attr, ptr); + bool result = false; + + switch (status) { + case cudaErrorInvalidValue: + // might be a normal CPU tensor in CUDA 10.2- + cudaGetLastError(); // clear error + break; + case cudaSuccess: + result = (attr.type == cudaMemoryTypeHost); + break; + case cudaErrorInitializationError: + case cudaErrorNoDevice: + case cudaErrorInsufficientDriver: + case cudaErrorInvalidDevice: + // We don't want to fail in these particular cases since this function + // can be called when users only want to run on CPU even if CUDA API is + // enabled, or in a forked subprocess where CUDA context cannot be + // initialized. So we just mark the CUDA context to unavailable and + // return. + is_available_ = false; + cudaGetLastError(); // clear error + break; + default: + LOG(FATAL) << "error while determining memory status: " + << cudaGetErrorString(status); + break; + } + + return result; + } + + void* AllocWorkspace( + DGLContext ctx, size_t size, DGLDataType type_hint) final { + SetDevice(ctx); + // Redirect to PyTorch's allocator when available. + TensorDispatcher* tensor_dispatcher = TensorDispatcher::Global(); + if (tensor_dispatcher->IsAvailable()) + return tensor_dispatcher->CUDAAllocWorkspace( + size, getCurrentCUDAStream()); + + return CUDAThreadEntry::ThreadLocal()->pool.AllocWorkspace(ctx, size); + } + + void FreeWorkspace(DGLContext ctx, void* data) final { + SetDevice(ctx); + TensorDispatcher* tensor_dispatcher = TensorDispatcher::Global(); + if (tensor_dispatcher->IsAvailable()) + return tensor_dispatcher->CUDAFreeWorkspace(data); + + CUDAThreadEntry::ThreadLocal()->pool.FreeWorkspace(ctx, data); + } + + static const std::shared_ptr& Global() { + static std::shared_ptr inst = + std::make_shared(); + return inst; + } + + private: + static void GPUCopy( + const void* from, void* to, size_t size, cudaMemcpyKind kind, + cudaStream_t stream) { + CUDA_CALL(cudaMemcpyAsync(to, from, size, kind, stream)); + if (stream == 0 && kind == cudaMemcpyDeviceToHost) { + // only wait for the copy, when it's on the default stream, and it's to + // host memory + CUDA_CALL(cudaStreamSynchronize(stream)); + } + } + + bool is_available_ = true; +}; + +typedef dmlc::ThreadLocalStore CUDAThreadStore; + +CUDAThreadEntry::CUDAThreadEntry() : pool(kDGLCUDA, CUDADeviceAPI::Global()) {} + +CUDAThreadEntry* CUDAThreadEntry::ThreadLocal() { + return CUDAThreadStore::Get(); +} + +cudaStream_t getCurrentCUDAStream() { + TensorDispatcher* tensor_dispatcher = TensorDispatcher::Global(); + if (tensor_dispatcher->IsAvailable()) + return tensor_dispatcher->CUDAGetCurrentStream(); + else // return the default stream when TA is not available + return nullptr; +} + +DGL_REGISTER_GLOBAL("device_api.cuda") + .set_body([](DGLArgs args, DGLRetValue* rv) { + DeviceAPI* ptr = CUDADeviceAPI::Global().get(); + *rv = static_cast(ptr); + }); + +} // namespace runtime +} // namespace dgl diff --git a/src/runtime/cuda/cuda_hashtable.cu b/src/runtime/cuda/cuda_hashtable.cu index e24ead9ee336..cb8ccbb3a47a 100644 --- a/src/runtime/cuda/cuda_hashtable.cu +++ b/src/runtime/cuda/cuda_hashtable.cu @@ -1,3 +1,4 @@ +#include "hip/hip_runtime.h" /** * Copyright (c) 2021 by Contributors * @file runtime/cuda/cuda_device_common.cuh @@ -5,7 +6,7 @@ */ #include -#include // NOLINT +#include // NOLINT #include "../../array/cuda/atomic.cuh" #include "cuda_common.h" @@ -246,7 +247,7 @@ __global__ void count_hashmap( DeviceOrderedHashTable 
table, IdType* const num_unique) { assert(BLOCK_SIZE == blockDim.x); - using BlockReduce = typename cub::BlockReduce; + using BlockReduce = typename hipcub::BlockReduce; using Mapping = typename DeviceOrderedHashTable::Mapping; const size_t block_start = TILE_SIZE * blockIdx.x; @@ -300,7 +301,7 @@ __global__ void compact_hashmap( assert(BLOCK_SIZE == blockDim.x); using FlagType = uint16_t; - using BlockScan = typename cub::BlockScan; + using BlockScan = typename hipcub::BlockScan; using Mapping = typename DeviceOrderedHashTable::Mapping; constexpr const int32_t VALS_PER_THREAD = TILE_SIZE / BLOCK_SIZE; @@ -359,7 +360,7 @@ DeviceOrderedHashTable OrderedHashTable::DeviceHandle() const { template OrderedHashTable::OrderedHashTable( - const size_t size, DGLContext ctx, cudaStream_t stream, const int scale) + const size_t size, DGLContext ctx, hipStream_t stream, const int scale) : table_(nullptr), size_(TableSize(size, scale)), ctx_(ctx) { // make sure we will at least as many buckets as items. CHECK_GT(scale, 0); @@ -368,7 +369,7 @@ OrderedHashTable::OrderedHashTable( table_ = static_cast( device->AllocWorkspace(ctx_, sizeof(Mapping) * size_)); - CUDA_CALL(cudaMemsetAsync( + CUDA_CALL(hipMemsetAsync( table_, DeviceOrderedHashTable::kEmptyKey, sizeof(Mapping) * size_, stream)); } @@ -382,7 +383,7 @@ OrderedHashTable::~OrderedHashTable() { template void OrderedHashTable::FillWithDuplicates( const IdType* const input, const size_t num_input, IdType* const unique, - int64_t* const num_unique, cudaStream_t stream) { + int64_t* const num_unique, hipStream_t stream) { auto device = runtime::DeviceAPI::Get(ctx_); const int64_t num_tiles = (num_input + TILE_SIZE - 1) / TILE_SIZE; @@ -404,12 +405,12 @@ void OrderedHashTable::FillWithDuplicates( input, num_input, device_table, item_prefix); size_t workspace_bytes; - CUDA_CALL(cub::DeviceScan::ExclusiveSum( + CUDA_CALL(hipcub::DeviceScan::ExclusiveSum( nullptr, workspace_bytes, static_cast(nullptr), static_cast(nullptr), grid.x + 1, stream)); void* workspace = device->AllocWorkspace(ctx_, workspace_bytes); - CUDA_CALL(cub::DeviceScan::ExclusiveSum( + CUDA_CALL(hipcub::DeviceScan::ExclusiveSum( workspace, workspace_bytes, item_prefix, item_prefix, grid.x + 1, stream)); device->FreeWorkspace(ctx_, workspace); @@ -422,7 +423,7 @@ void OrderedHashTable::FillWithDuplicates( template void OrderedHashTable::FillWithUnique( - const IdType* const input, const size_t num_input, cudaStream_t stream) { + const IdType* const input, const size_t num_input, hipStream_t stream) { const int64_t num_tiles = (num_input + TILE_SIZE - 1) / TILE_SIZE; const dim3 grid(num_tiles); diff --git a/src/runtime/cuda/cuda_hashtable.cu.prehip b/src/runtime/cuda/cuda_hashtable.cu.prehip new file mode 100644 index 000000000000..e24ead9ee336 --- /dev/null +++ b/src/runtime/cuda/cuda_hashtable.cu.prehip @@ -0,0 +1,443 @@ +/** + * Copyright (c) 2021 by Contributors + * @file runtime/cuda/cuda_device_common.cuh + * @brief Device level functions for within cuda kernels. + */ + +#include +#include // NOLINT + +#include "../../array/cuda/atomic.cuh" +#include "cuda_common.h" +#include "cuda_hashtable.cuh" + +using namespace dgl::aten::cuda; + +namespace dgl { +namespace runtime { +namespace cuda { + +namespace { + +constexpr static const int BLOCK_SIZE = 256; +constexpr static const size_t TILE_SIZE = 1024; + +/** + * @brief This is the mutable version of the DeviceOrderedHashTable, for use in + * inserting elements into the hashtable. 
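The hunks above swap cub for hipcub while leaving the kernel structure untouched, which is exactly what a reviewer needs to spot-check. As a minimal, self-contained illustration that hipcub's block-level API is call-compatible (the kernel name block_sum and the int payload are placeholders of mine, not code from this patch):

#include <hipcub/hipcub.hpp>

// Launch with exactly BLOCK_SIZE threads per block; mirrors the
// TempStorage / Sum(...) shape that count_hashmap keeps after the rename.
template <int BLOCK_SIZE>
__global__ void block_sum(const int* in, int* out) {
  using BlockReduce = hipcub::BlockReduce<int, BLOCK_SIZE>;
  __shared__ typename BlockReduce::TempStorage temp_space;
  const int val = in[blockIdx.x * BLOCK_SIZE + threadIdx.x];
  const int sum = BlockReduce(temp_space).Sum(val);  // valid in thread 0 only
  if (threadIdx.x == 0) out[blockIdx.x] = sum;
}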
+ * + * @tparam IdType The type of ID to store in the hashtable. + */ +template +class MutableDeviceOrderedHashTable : public DeviceOrderedHashTable { + public: + typedef typename DeviceOrderedHashTable::Mapping* Iterator; + static constexpr IdType kEmptyKey = DeviceOrderedHashTable::kEmptyKey; + + /** + * @brief Create a new mutable hashtable for use on the device. + * + * @param hostTable The original hash table on the host. + */ + explicit MutableDeviceOrderedHashTable( + OrderedHashTable* const hostTable) + : DeviceOrderedHashTable(hostTable->DeviceHandle()) {} + + /** + * @brief Find the mutable mapping of a given key within the hash table. + * + * WARNING: The key must exist within the hashtable. Searching for a key not + * in the hashtable is undefined behavior. + * + * @param id The key to search for. + * + * @return The mapping. + */ + inline __device__ Iterator Search(const IdType id) { + const IdType pos = SearchForPosition(id); + + return GetMutable(pos); + } + + /** + * @brief Attempt to insert into the hash table at a specific location. + * + * @param pos The position to insert at. + * @param id The ID to insert into the hash table. + * @param index The original index of the item being inserted. + * + * @return True, if the insertion was successful. + */ + inline __device__ bool AttemptInsertAt( + const size_t pos, const IdType id, const size_t index) { + const IdType key = AtomicCAS(&GetMutable(pos)->key, kEmptyKey, id); + if (key == kEmptyKey || key == id) { + // we either set a match key, or found a matching key, so then place the + // minimum index in position. Match the type of atomicMin, so ignore + // linting + atomicMin( + reinterpret_cast( // NOLINT + &GetMutable(pos)->index), + static_cast(index)); // NOLINT + return true; + } else { + // we need to search elsewhere + return false; + } + } + + /** + * @brief Insert key-index pair into the hashtable. + * + * @param id The ID to insert. + * @param index The index at which the ID occured. + * + * @return An iterator to inserted mapping. + */ + inline __device__ Iterator Insert(const IdType id, const size_t index) { + size_t pos = Hash(id); + + // linearly scan for an empty slot or matching entry + IdType delta = 1; + while (!AttemptInsertAt(pos, id, index)) { + pos = Hash(pos + delta); + delta += 1; + } + + return GetMutable(pos); + } + + private: + /** + * @brief Get a mutable iterator to the given bucket in the hashtable. + * + * @param pos The given bucket. + * + * @return The iterator. + */ + inline __device__ Iterator GetMutable(const size_t pos) { + assert(pos < this->size_); + // The parent class Device is read-only, but we ensure this can only be + // constructed from a mutable version of OrderedHashTable, making this + // a safe cast to perform. + return const_cast(this->table_ + pos); + } +}; + +/** + * @brief Calculate the number of buckets in the hashtable. To guarantee we can + * fill the hashtable in the worst case, we must use a number of buckets which + * is a power of two. + * https://en.wikipedia.org/wiki/Quadratic_probing#Limitations + * + * @param num The number of items to insert (should be an upper bound on the + * number of unique keys). + * @param scale The power of two larger the number of buckets should be than the + * unique keys. + * + * @return The number of buckets the table should contain. 
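To make the sizing rule above concrete: TableSize(), defined right after this comment, rounds the item count to a nearby power of two and then shifts it left by scale. A standalone sketch of the same arithmetic (TableSizeSketch is my name for it, and like the original it assumes num >= 2); the default scale of 3 comes from kDefaultScale in cuda_hashtable.cuh later in this patch:

#include <cmath>
#include <cstddef>

// Same arithmetic as TableSize() below, spelled out so it can be checked by hand.
std::size_t TableSizeSketch(const std::size_t num, const int scale) {
  // Nearby power of two for num, then 2^scale times more buckets so that
  // quadratic probing always has free slots to land on.
  const std::size_t next_pow2 =
      std::size_t(1) << static_cast<std::size_t>(1 + std::log2(num >> 1));
  return next_pow2 << scale;
}
// Example: num = 1000, scale = 3
//   num >> 1 = 500, 1 + log2(500) ~= 9.97, truncated to 9
//   next_pow2 = 512, so the table gets 512 << 3 = 4096 buckets,
//   roughly 4x the number of inserted items.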
+ */ +size_t TableSize(const size_t num, const int scale) { + const size_t next_pow2 = 1 << static_cast(1 + std::log2(num >> 1)); + return next_pow2 << scale; +} + +/** + * @brief This structure is used with cub's block-level prefixscan in order to + * keep a running sum as items are iteratively processed. + * + * @tparam IdType The type to perform the prefixsum on. + */ +template +struct BlockPrefixCallbackOp { + IdType running_total_; + + __device__ BlockPrefixCallbackOp(const IdType running_total) + : running_total_(running_total) {} + + __device__ IdType operator()(const IdType block_aggregate) { + const IdType old_prefix = running_total_; + running_total_ += block_aggregate; + return old_prefix; + } +}; + +} // namespace + +/** + * @brief This generates a hash map where the keys are the global item numbers, + * and the values are indexes, and inputs may have duplciates. + * + * @tparam IdType The type of of id. + * @tparam BLOCK_SIZE The size of the thread block. + * @tparam TILE_SIZE The number of entries each thread block will process. + * @param items The items to insert. + * @param num_items The number of items to insert. + * @param table The hash table. + */ +template +__global__ void generate_hashmap_duplicates( + const IdType* const items, const int64_t num_items, + MutableDeviceOrderedHashTable table) { + assert(BLOCK_SIZE == blockDim.x); + + const size_t block_start = TILE_SIZE * blockIdx.x; + const size_t block_end = TILE_SIZE * (blockIdx.x + 1); + +#pragma unroll + for (size_t index = threadIdx.x + block_start; index < block_end; + index += BLOCK_SIZE) { + if (index < num_items) { + table.Insert(items[index], index); + } + } +} + +/** + * @brief This generates a hash map where the keys are the global item numbers, + * and the values are indexes, and all inputs are unique. + * + * @tparam IdType The type of of id. + * @tparam BLOCK_SIZE The size of the thread block. + * @tparam TILE_SIZE The number of entries each thread block will process. + * @param items The unique items to insert. + * @param num_items The number of items to insert. + * @param table The hash table. + */ +template +__global__ void generate_hashmap_unique( + const IdType* const items, const int64_t num_items, + MutableDeviceOrderedHashTable table) { + assert(BLOCK_SIZE == blockDim.x); + + using Iterator = typename MutableDeviceOrderedHashTable::Iterator; + + const size_t block_start = TILE_SIZE * blockIdx.x; + const size_t block_end = TILE_SIZE * (blockIdx.x + 1); + +#pragma unroll + for (size_t index = threadIdx.x + block_start; index < block_end; + index += BLOCK_SIZE) { + if (index < num_items) { + const Iterator pos = table.Insert(items[index], index); + + // since we are only inserting unique items, we know their local id + // will be equal to their index + pos->local = static_cast(index); + } + } +} + +/** + * @brief This counts the number of nodes inserted per thread block. + * + * @tparam IdType The type of of id. + * @tparam BLOCK_SIZE The size of the thread block. + * @tparam TILE_SIZE The number of entries each thread block will process. + * @param input The nodes to insert. + * @param num_input The number of nodes to insert. + * @param table The hash table. + * @param num_unique The number of nodes inserted into the hash table per thread + * block. 
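count_hashmap, documented just above and defined just below, only produces a per-block count of newly inserted keys (plus a trailing zero written by block 0). FillWithDuplicates then runs an exclusive prefix sum over those grid.x + 1 entries to turn counts into the write offsets that compact_hashmap consumes. A tiny host-side illustration of that step, with made-up counts:

#include <cstdio>
#include <vector>

int main() {
  // Per-block counts from count_hashmap, plus the trailing 0 at index gridDim.x.
  std::vector<int> item_prefix = {3, 1, 2, 0};
  // In-place exclusive prefix sum, as DeviceScan::ExclusiveSum does on the GPU.
  int running = 0;
  for (int& v : item_prefix) {
    const int count = v;
    v = running;
    running += count;
  }
  // item_prefix is now {0, 3, 4, 6}: entry i is the output offset for block i,
  // and the final entry (6) is the total number of unique keys, which
  // compact_hashmap copies into *num_unique_items.
  for (int v : item_prefix) printf("%d ", v);
  printf("\n");
  return 0;
}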
+ */ +template +__global__ void count_hashmap( + const IdType* items, const size_t num_items, + DeviceOrderedHashTable table, IdType* const num_unique) { + assert(BLOCK_SIZE == blockDim.x); + + using BlockReduce = typename cub::BlockReduce; + using Mapping = typename DeviceOrderedHashTable::Mapping; + + const size_t block_start = TILE_SIZE * blockIdx.x; + const size_t block_end = TILE_SIZE * (blockIdx.x + 1); + + IdType count = 0; + +#pragma unroll + for (size_t index = threadIdx.x + block_start; index < block_end; + index += BLOCK_SIZE) { + if (index < num_items) { + const Mapping& mapping = *table.Search(items[index]); + if (mapping.index == index) { + ++count; + } + } + } + + __shared__ typename BlockReduce::TempStorage temp_space; + + count = BlockReduce(temp_space).Sum(count); + + if (threadIdx.x == 0) { + num_unique[blockIdx.x] = count; + if (blockIdx.x == 0) { + num_unique[gridDim.x] = 0; + } + } +} + +/** + * @brief Update the local numbering of elements in the hashmap. + * + * @tparam IdType The type of id. + * @tparam BLOCK_SIZE The size of the thread blocks. + * @tparam TILE_SIZE The number of elements each thread block works on. + * @param items The set of non-unique items to update from. + * @param num_items The number of non-unique items. + * @param table The hash table. + * @param num_items_prefix The number of unique items preceding each thread + * block. + * @param unique_items The set of unique items (output). + * @param num_unique_items The number of unique items (output). + */ +template +__global__ void compact_hashmap( + const IdType* const items, const size_t num_items, + MutableDeviceOrderedHashTable table, + const IdType* const num_items_prefix, IdType* const unique_items, + int64_t* const num_unique_items) { + assert(BLOCK_SIZE == blockDim.x); + + using FlagType = uint16_t; + using BlockScan = typename cub::BlockScan; + using Mapping = typename DeviceOrderedHashTable::Mapping; + + constexpr const int32_t VALS_PER_THREAD = TILE_SIZE / BLOCK_SIZE; + + __shared__ typename BlockScan::TempStorage temp_space; + + const IdType offset = num_items_prefix[blockIdx.x]; + + BlockPrefixCallbackOp prefix_op(0); + + // count successful placements + for (int32_t i = 0; i < VALS_PER_THREAD; ++i) { + const IdType index = threadIdx.x + i * BLOCK_SIZE + blockIdx.x * TILE_SIZE; + + FlagType flag; + Mapping* kv; + if (index < num_items) { + kv = table.Search(items[index]); + flag = kv->index == index; + } else { + flag = 0; + } + + if (!flag) { + kv = nullptr; + } + + BlockScan(temp_space).ExclusiveSum(flag, flag, prefix_op); + __syncthreads(); + + if (kv) { + const IdType pos = offset + flag; + kv->local = pos; + unique_items[pos] = items[index]; + } + } + + if (threadIdx.x == 0 && blockIdx.x == 0) { + *num_unique_items = num_items_prefix[gridDim.x]; + } +} + +// DeviceOrderedHashTable implementation + +template +DeviceOrderedHashTable::DeviceOrderedHashTable( + const Mapping* const table, const size_t size) + : table_(table), size_(size) {} + +template +DeviceOrderedHashTable OrderedHashTable::DeviceHandle() const { + return DeviceOrderedHashTable(table_, size_); +} + +// OrderedHashTable implementation + +template +OrderedHashTable::OrderedHashTable( + const size_t size, DGLContext ctx, cudaStream_t stream, const int scale) + : table_(nullptr), size_(TableSize(size, scale)), ctx_(ctx) { + // make sure we will at least as many buckets as items. 
+ CHECK_GT(scale, 0); + + auto device = runtime::DeviceAPI::Get(ctx_); + table_ = static_cast( + device->AllocWorkspace(ctx_, sizeof(Mapping) * size_)); + + CUDA_CALL(cudaMemsetAsync( + table_, DeviceOrderedHashTable::kEmptyKey, + sizeof(Mapping) * size_, stream)); +} + +template +OrderedHashTable::~OrderedHashTable() { + auto device = runtime::DeviceAPI::Get(ctx_); + device->FreeWorkspace(ctx_, table_); +} + +template +void OrderedHashTable::FillWithDuplicates( + const IdType* const input, const size_t num_input, IdType* const unique, + int64_t* const num_unique, cudaStream_t stream) { + auto device = runtime::DeviceAPI::Get(ctx_); + + const int64_t num_tiles = (num_input + TILE_SIZE - 1) / TILE_SIZE; + + const dim3 grid(num_tiles); + const dim3 block(BLOCK_SIZE); + + auto device_table = MutableDeviceOrderedHashTable(this); + + CUDA_KERNEL_CALL( + (generate_hashmap_duplicates), grid, block, + 0, stream, input, num_input, device_table); + + IdType* item_prefix = static_cast( + device->AllocWorkspace(ctx_, sizeof(IdType) * (num_input + 1))); + + CUDA_KERNEL_CALL( + (count_hashmap), grid, block, 0, stream, + input, num_input, device_table, item_prefix); + + size_t workspace_bytes; + CUDA_CALL(cub::DeviceScan::ExclusiveSum( + nullptr, workspace_bytes, static_cast(nullptr), + static_cast(nullptr), grid.x + 1, stream)); + void* workspace = device->AllocWorkspace(ctx_, workspace_bytes); + + CUDA_CALL(cub::DeviceScan::ExclusiveSum( + workspace, workspace_bytes, item_prefix, item_prefix, grid.x + 1, + stream)); + device->FreeWorkspace(ctx_, workspace); + + CUDA_KERNEL_CALL( + (compact_hashmap), grid, block, 0, stream, + input, num_input, device_table, item_prefix, unique, num_unique); + device->FreeWorkspace(ctx_, item_prefix); +} + +template +void OrderedHashTable::FillWithUnique( + const IdType* const input, const size_t num_input, cudaStream_t stream) { + const int64_t num_tiles = (num_input + TILE_SIZE - 1) / TILE_SIZE; + + const dim3 grid(num_tiles); + const dim3 block(BLOCK_SIZE); + + auto device_table = MutableDeviceOrderedHashTable(this); + + CUDA_KERNEL_CALL( + (generate_hashmap_unique), grid, block, 0, + stream, input, num_input, device_table); +} + +template class OrderedHashTable; +template class OrderedHashTable; + +} // namespace cuda +} // namespace runtime +} // namespace dgl diff --git a/src/runtime/cuda/cuda_hashtable.cuh b/src/runtime/cuda/cuda_hashtable.cuh index 8a3c3a2c990f..cae772674fef 100644 --- a/src/runtime/cuda/cuda_hashtable.cuh +++ b/src/runtime/cuda/cuda_hashtable.cuh @@ -1,3 +1,4 @@ +#include "hip/hip_runtime.h" /** * Copyright (c) 2021 by Contributors * @file runtime/cuda/cuda_device_common.cuh @@ -10,7 +11,7 @@ #include #include "cuda_common.h" -#include "cuda_runtime.h" +#include "hip/hip_runtime.h" namespace dgl { namespace runtime { @@ -228,7 +229,7 @@ class OrderedHashTable { * @param stream The stream to use for initializing the hashtable. */ OrderedHashTable( - const size_t size, DGLContext ctx, cudaStream_t stream, + const size_t size, DGLContext ctx, hipStream_t stream, const int scale = kDefaultScale); /** @@ -252,7 +253,7 @@ class OrderedHashTable { */ void FillWithDuplicates( const IdType* const input, const size_t num_input, IdType* const unique, - int64_t* const num_unique, cudaStream_t stream); + int64_t* const num_unique, hipStream_t stream); /** * @brief Fill the hashtable with an array of unique keys. @@ -262,7 +263,7 @@ class OrderedHashTable { * @param stream The stream to perform operations on. 
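FillWithDuplicates above follows the usual two-call DeviceScan idiom: the first call passes a null temporary buffer purely to query workspace_bytes, the second performs the scan, and the hipified hunk keeps the identical shape with hipcub. A minimal standalone sketch of that idiom; it uses plain hipMalloc instead of DGL's workspace pool just to stay self-contained, and error checking is omitted:

#include <cstddef>
#include <hip/hip_runtime.h>
#include <hipcub/hipcub.hpp>

// Exclusive-sum n ints already resident on the device, in place.
void ExclusiveSumInPlace(int* d_data, int n, hipStream_t stream) {
  std::size_t workspace_bytes = 0;
  // Pass 1: null workspace only computes the required size.
  hipcub::DeviceScan::ExclusiveSum(
      nullptr, workspace_bytes, d_data, d_data, n, stream);
  void* d_workspace = nullptr;
  hipMalloc(&d_workspace, workspace_bytes);
  // Pass 2: the actual scan.
  hipcub::DeviceScan::ExclusiveSum(
      d_workspace, workspace_bytes, d_data, d_data, n, stream);
  hipFree(d_workspace);
}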
*/ void FillWithUnique( - const IdType* const input, const size_t num_input, cudaStream_t stream); + const IdType* const input, const size_t num_input, hipStream_t stream); /** * @brief Get a verison of the hashtable usable from device functions. diff --git a/src/runtime/cuda/cuda_hashtable.cuh.prehip b/src/runtime/cuda/cuda_hashtable.cuh.prehip new file mode 100644 index 000000000000..8a3c3a2c990f --- /dev/null +++ b/src/runtime/cuda/cuda_hashtable.cuh.prehip @@ -0,0 +1,284 @@ +/** + * Copyright (c) 2021 by Contributors + * @file runtime/cuda/cuda_device_common.cuh + * @brief Device level functions for within cuda kernels. + */ + +#ifndef DGL_RUNTIME_CUDA_CUDA_HASHTABLE_CUH_ +#define DGL_RUNTIME_CUDA_CUDA_HASHTABLE_CUH_ + +#include + +#include "cuda_common.h" +#include "cuda_runtime.h" + +namespace dgl { +namespace runtime { +namespace cuda { + +template +class OrderedHashTable; + +/** + * @brief A device-side handle for a GPU hashtable for mapping items to the + * first index at which they appear in the provided data array. + * + * For any ID array A, one can view it as a mapping from the index `i` + * (continuous integer range from zero) to its element `A[i]`. This hashtable + * serves as a reverse mapping, i.e., from element `A[i]` to its index `i`. + * Quadratic probing is used for collision resolution. See + * DeviceOrderedHashTable's documentation for how the Mapping structure is + * used. + * + * The hash table should be used in two phases, with the first being populating + * the hash table with the OrderedHashTable object, and then generating this + * handle from it. This object can then be used to search the hash table, + * to find mappings, from with CUDA code. + * + * If a device-side handle is created from a hash table with the following + * entries: + * [ + * {key: 0, local: 0, index: 0}, + * {key: 3, local: 1, index: 1}, + * {key: 2, local: 2, index: 2}, + * {key: 8, local: 3, index: 4}, + * {key: 4, local: 4, index: 5}, + * {key: 1, local: 5, index: 8} + * ] + * The array [0, 3, 2, 0, 8, 4, 3, 2, 1, 8] could have `Search()` called on + * each id, to be mapped via: + * ``` + * __global__ void map(int32_t * array, + * size_t size, + * DeviceOrderedHashTable table) { + * int idx = threadIdx.x + blockIdx.x*blockDim.x; + * if (idx < size) { + * array[idx] = table.Search(array[idx])->local; + * } + * } + * ``` + * to get the remaped array: + * [0, 1, 2, 0, 3, 4, 1, 2, 5, 3] + * + * @tparam IdType The type of the IDs. + */ +template +class DeviceOrderedHashTable { + public: + /** + * @brief An entry in the hashtable. + */ + struct Mapping { + /** + * @brief The ID of the item inserted. + */ + IdType key; + /** + * @brief The index of the item in the unique list. + */ + IdType local; + /** + * @brief The index of the item when inserted into the hashtable (e.g., + * the index within the array passed into FillWithDuplicates()). + */ + int64_t index; + }; + + typedef const Mapping* ConstIterator; + + DeviceOrderedHashTable(const DeviceOrderedHashTable& other) = default; + DeviceOrderedHashTable& operator=(const DeviceOrderedHashTable& other) = + default; + + /** + * @brief Find the non-mutable mapping of a given key within the hash table. + * + * WARNING: The key must exist within the hashtable. Searching for a key not + * in the hashtable is undefined behavior. + * + * @param id The key to search for. + * + * @return An iterator to the mapping. 
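Search() below and Insert() in the .cu file walk the buckets with an increasing step, pos = Hash(pos + delta) with delta growing by one each try, which is the quadratic probing scheme referenced earlier. A worked probe sequence for intuition, using a deliberately tiny table (the size of 8 and the key 5 are arbitrary choices of mine):

#include <cstdio>

int main() {
  const int size = 8;   // stands in for table_ size
  int pos = 5 % size;   // Hash(id) for id = 5
  int delta = 1;
  for (int step = 0; step < 5; ++step) {
    printf("%d ", pos);            // prints: 5 6 0 3 7
    pos = (pos + delta) % size;    // Hash(pos + delta)
    ++delta;
  }
  printf("\n");
  // The offsets from the home bucket are 0, 1, 3, 6, 10, ... (triangular
  // numbers), which is what makes the probing quadratic rather than linear.
  return 0;
}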
+ */ + inline __device__ ConstIterator Search(const IdType id) const { + const IdType pos = SearchForPosition(id); + + return &table_[pos]; + } + + /** + * @brief Check whether a key exists within the hashtable. + * + * @param id The key to check for. + * + * @return True if the key exists in the hashtable. + */ + inline __device__ bool Contains(const IdType id) const { + IdType pos = Hash(id); + + IdType delta = 1; + while (table_[pos].key != kEmptyKey) { + if (table_[pos].key == id) { + return true; + } + pos = Hash(pos + delta); + delta += 1; + } + return false; + } + + protected: + // Must be uniform bytes for memset to work + static constexpr IdType kEmptyKey = static_cast(-1); + + const Mapping* table_; + size_t size_; + + /** + * @brief Create a new device-side handle to the hash table. + * + * @param table The table stored in GPU memory. + * @param size The size of the table. + */ + explicit DeviceOrderedHashTable(const Mapping* table, size_t size); + + /** + * @brief Search for an item in the hash table which is known to exist. + * + * WARNING: If the ID searched for does not exist within the hashtable, this + * function will never return. + * + * @param id The ID of the item to search for. + * + * @return The the position of the item in the hashtable. + */ + inline __device__ IdType SearchForPosition(const IdType id) const { + IdType pos = Hash(id); + + // linearly scan for matching entry + IdType delta = 1; + while (table_[pos].key != id) { + assert(table_[pos].key != kEmptyKey); + pos = Hash(pos + delta); + delta += 1; + } + assert(pos < size_); + + return pos; + } + + /** + * @brief Hash an ID to a to a position in the hash table. + * + * @param id The ID to hash. + * + * @return The hash. + */ + inline __device__ size_t Hash(const IdType id) const { return id % size_; } + + friend class OrderedHashTable; +}; + +/** + * @brief A host-side handle for a GPU hashtable for mapping items to the + * first index at which they appear in the provided data array. This host-side + * handle is responsible for allocating and free the GPU memory of the + * hashtable. + * + * For any ID array A, one can view it as a mapping from the index `i` + * (continuous integer range from zero) to its element `A[i]`. This hashtable + * serves as a reverse mapping, i.e., from element `A[i]` to its index `i`. + * Quadratic probing is used for collision resolution. + * + * The hash table should be used in two phases, the first is filling the hash + * table via 'FillWithDuplicates()' or 'FillWithUnique()'. Then, the + * 'DeviceHandle()' method can be called, to get a version suitable for + * searching from device and kernel functions. + * + * If 'FillWithDuplicates()' was called with an array of: + * [0, 3, 2, 0, 8, 4, 3, 2, 1, 8] + * + * The resulting entries in the hash-table would be: + * [ + * {key: 0, local: 0, index: 0}, + * {key: 3, local: 1, index: 1}, + * {key: 2, local: 2, index: 2}, + * {key: 8, local: 3, index: 4}, + * {key: 4, local: 4, index: 5}, + * {key: 1, local: 5, index: 8} + * ] + * + * @tparam IdType The type of the IDs. + */ +template +class OrderedHashTable { + public: + static constexpr int kDefaultScale = 3; + + using Mapping = typename DeviceOrderedHashTable::Mapping; + + /** + * @brief Create a new ordered hash table. The amoutn of GPU memory + * consumed by the resulting hashtable is O(`size` * 2^`scale`). + * + * @param size The number of items to insert into the hashtable. + * @param ctx The device context to store the hashtable on. 
+ * @param scale The power of two times larger the number of buckets should + * be than the number of items. + * @param stream The stream to use for initializing the hashtable. + */ + OrderedHashTable( + const size_t size, DGLContext ctx, cudaStream_t stream, + const int scale = kDefaultScale); + + /** + * @brief Cleanup after the hashtable. + */ + ~OrderedHashTable(); + + // Disable copying + OrderedHashTable(const OrderedHashTable& other) = delete; + OrderedHashTable& operator=(const OrderedHashTable& other) = delete; + + /** + * @brief Fill the hashtable with the array containing possibly duplicate + * IDs. + * + * @param input The array of IDs to insert. + * @param num_input The number of IDs to insert. + * @param unique The list of unique IDs inserted. + * @param num_unique The number of unique IDs inserted. + * @param stream The stream to perform operations on. + */ + void FillWithDuplicates( + const IdType* const input, const size_t num_input, IdType* const unique, + int64_t* const num_unique, cudaStream_t stream); + + /** + * @brief Fill the hashtable with an array of unique keys. + * + * @param input The array of unique IDs. + * @param num_input The number of keys. + * @param stream The stream to perform operations on. + */ + void FillWithUnique( + const IdType* const input, const size_t num_input, cudaStream_t stream); + + /** + * @brief Get a verison of the hashtable usable from device functions. + * + * @return This hashtable. + */ + DeviceOrderedHashTable DeviceHandle() const; + + private: + Mapping* table_; + size_t size_; + DGLContext ctx_; +}; + +} // namespace cuda +} // namespace runtime +} // namespace dgl + +#endif // DGL_RUNTIME_CUDA_CUDA_HASHTABLE_CUH_ diff --git a/src/runtime/cuda/gpu_cache.cu b/src/runtime/cuda/gpu_cache.cu index e2b6767e2792..f0f4eb73a43b 100644 --- a/src/runtime/cuda/gpu_cache.cu +++ b/src/runtime/cuda/gpu_cache.cu @@ -20,7 +20,7 @@ #ifndef DGL_RUNTIME_CUDA_GPU_CACHE_H_ #define DGL_RUNTIME_CUDA_GPU_CACHE_H_ -#include +#include #include #include #include @@ -55,12 +55,12 @@ class GpuCache : public runtime::Object { : num_feats(num_feats), cache(std::make_unique( (num_items + bucket_size - 1) / bucket_size, num_feats)) { - CUDA_CALL(cudaGetDevice(&cuda_device)); + CUDA_CALL(hipGetDevice(&cuda_device)); } std::tuple Query(IdArray keys) { const auto &ctx = keys->ctx; - cudaStream_t stream = dgl::runtime::getCurrentCUDAStream(); + hipStream_t stream = dgl::runtime::getCurrentCUDAStream(); auto device = dgl::runtime::DeviceAPI::Get(ctx); CHECK_EQ(ctx.device_type, kDGLCUDA) << "The keys should be on a CUDA device"; @@ -94,7 +94,7 @@ class GpuCache : public runtime::Object { } void Replace(IdArray keys, NDArray values) { - cudaStream_t stream = dgl::runtime::getCurrentCUDAStream(); + hipStream_t stream = dgl::runtime::getCurrentCUDAStream(); CHECK_EQ(keys->ctx.device_type, kDGLCUDA) << "The keys should be on a CUDA device"; CHECK_EQ(keys->ctx.device_id, cuda_device) diff --git a/src/runtime/cuda/gpu_cache.cu.prehip b/src/runtime/cuda/gpu_cache.cu.prehip new file mode 100644 index 000000000000..e2b6767e2792 --- /dev/null +++ b/src/runtime/cuda/gpu_cache.cu.prehip @@ -0,0 +1,189 @@ +/*! + * Copyright (c) 2022 by Contributors + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + * + * \file gpu_cache.cu + * \brief Implementation of wrapper HugeCTR gpu_cache routines. + */ + +#ifndef DGL_RUNTIME_CUDA_GPU_CACHE_H_ +#define DGL_RUNTIME_CUDA_GPU_CACHE_H_ + +#include +#include +#include +#include +#include +#include +#include +#include + +#include + +#include "../../runtime/cuda/cuda_common.h" + +namespace dgl { +namespace runtime { +namespace cuda { + +template +class GpuCache : public runtime::Object { + constexpr static int set_associativity = 2; + constexpr static int WARP_SIZE = 32; + constexpr static int bucket_size = WARP_SIZE * set_associativity; + using gpu_cache_t = gpu_cache::gpu_cache< + key_t, uint64_t, std::numeric_limits::max(), set_associativity, + WARP_SIZE>; + + public: + static constexpr const char *_type_key = + sizeof(key_t) == 4 ? "cuda.GpuCache32" : "cuda.GpuCache64"; + DGL_DECLARE_OBJECT_TYPE_INFO(GpuCache, Object); + + GpuCache(size_t num_items, size_t num_feats) + : num_feats(num_feats), + cache(std::make_unique( + (num_items + bucket_size - 1) / bucket_size, num_feats)) { + CUDA_CALL(cudaGetDevice(&cuda_device)); + } + + std::tuple Query(IdArray keys) { + const auto &ctx = keys->ctx; + cudaStream_t stream = dgl::runtime::getCurrentCUDAStream(); + auto device = dgl::runtime::DeviceAPI::Get(ctx); + CHECK_EQ(ctx.device_type, kDGLCUDA) + << "The keys should be on a CUDA device"; + CHECK_EQ(ctx.device_id, cuda_device) + << "The keys should be on the correct CUDA device"; + CHECK_EQ(keys->ndim, 1) + << "The tensor of requested indices must be of dimension one."; + NDArray values = NDArray::Empty( + {keys->shape[0], (int64_t)num_feats}, DGLDataType{kDGLFloat, 32, 1}, + ctx); + IdArray missing_index = aten::NewIdArray(keys->shape[0], ctx, 64); + IdArray missing_keys = + aten::NewIdArray(keys->shape[0], ctx, sizeof(key_t) * 8); + size_t *missing_len = + static_cast(device->AllocWorkspace(ctx, sizeof(size_t))); + cache->Query( + static_cast(keys->data), keys->shape[0], + static_cast(values->data), + static_cast(missing_index->data), + static_cast(missing_keys->data), missing_len, stream); + size_t missing_len_host; + device->CopyDataFromTo( + missing_len, 0, &missing_len_host, 0, sizeof(missing_len_host), ctx, + DGLContext{kDGLCPU, 0}, keys->dtype); + device->FreeWorkspace(ctx, missing_len); + missing_index = missing_index.CreateView( + {(int64_t)missing_len_host}, missing_index->dtype); + missing_keys = + missing_keys.CreateView({(int64_t)missing_len_host}, keys->dtype); + return std::make_tuple(values, missing_index, missing_keys); + } + + void Replace(IdArray keys, NDArray values) { + cudaStream_t stream = dgl::runtime::getCurrentCUDAStream(); + CHECK_EQ(keys->ctx.device_type, kDGLCUDA) + << "The keys should be on a CUDA device"; + CHECK_EQ(keys->ctx.device_id, cuda_device) + << "The keys should be on the correct CUDA device"; + CHECK_EQ(values->ctx.device_type, kDGLCUDA) + << "The values should be on a CUDA device"; + CHECK_EQ(values->ctx.device_id, cuda_device) + << "The values should be on the correct CUDA device"; + CHECK_EQ(keys->shape[0], values->shape[0]) + << "First dimensions of keys and values must match"; + 
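Query() above hands back the cached rows together with the positions and keys of the misses, and Replace() below is how a caller backfills them. The round trip is roughly the following; this is a caller-side sketch reconstructed from the two signatures, not code in this patch, FetchFromHost and ScatterRows are placeholder helpers, and the DGL types plus <tuple> are assumed to be available:

// Placeholder helpers for storage that lives outside the cache (assumed).
NDArray FetchFromHost(IdArray missing_keys);
void ScatterRows(NDArray dst, IdArray row_index, NDArray src);

void LookupThroughCache(GpuCache<unsigned int>* cache, IdArray keys) {
  NDArray values;
  IdArray missing_index, missing_keys;
  std::tie(values, missing_index, missing_keys) = cache->Query(keys);
  // Rows listed in missing_index were not cached; fetch them by key ...
  NDArray fetched = FetchFromHost(missing_keys);
  // ... fill the corresponding rows of the Query output ...
  ScatterRows(values, missing_index, fetched);
  // ... and insert them so the next Query for these keys is a hit.
  cache->Replace(missing_keys, fetched);
}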
CHECK_EQ(values->shape[1], num_feats) << "Embedding dimension must match"; + cache->Replace( + static_cast(keys->data), keys->shape[0], + static_cast(values->data), stream); + } + + private: + size_t num_feats; + std::unique_ptr cache; + int cuda_device; +}; + +static_assert(sizeof(unsigned int) == 4); +DGL_DEFINE_OBJECT_REF(GpuCacheRef32, GpuCache); +// The cu file in HugeCTR gpu cache uses unsigned int and long long. +// Changing to int64_t results in a mismatch of template arguments. +static_assert(sizeof(long long) == 8); // NOLINT +DGL_DEFINE_OBJECT_REF(GpuCacheRef64, GpuCache); // NOLINT + +/* CAPI **********************************************************************/ + +using namespace dgl::runtime; + +DGL_REGISTER_GLOBAL("cuda._CAPI_DGLGpuCacheCreate") + .set_body([](DGLArgs args, DGLRetValue *rv) { + const size_t num_items = args[0]; + const size_t num_feats = args[1]; + const int num_bits = args[2]; + + if (num_bits == 32) + *rv = GpuCacheRef32( + std::make_shared>(num_items, num_feats)); + else + *rv = GpuCacheRef64(std::make_shared>( // NOLINT + num_items, num_feats)); + }); + +DGL_REGISTER_GLOBAL("cuda._CAPI_DGLGpuCacheQuery") + .set_body([](DGLArgs args, DGLRetValue *rv) { + IdArray keys = args[1]; + + List ret; + if (keys->dtype.bits == 32) { + GpuCacheRef32 cache = args[0]; + auto result = cache->Query(keys); + + ret.push_back(Value(MakeValue(std::get<0>(result)))); + ret.push_back(Value(MakeValue(std::get<1>(result)))); + ret.push_back(Value(MakeValue(std::get<2>(result)))); + } else { + GpuCacheRef64 cache = args[0]; + auto result = cache->Query(keys); + + ret.push_back(Value(MakeValue(std::get<0>(result)))); + ret.push_back(Value(MakeValue(std::get<1>(result)))); + ret.push_back(Value(MakeValue(std::get<2>(result)))); + } + + *rv = ret; + }); + +DGL_REGISTER_GLOBAL("cuda._CAPI_DGLGpuCacheReplace") + .set_body([](DGLArgs args, DGLRetValue *rv) { + IdArray keys = args[1]; + NDArray values = args[2]; + + if (keys->dtype.bits == 32) { + GpuCacheRef32 cache = args[0]; + cache->Replace(keys, values); + } else { + GpuCacheRef64 cache = args[0]; + cache->Replace(keys, values); + } + + *rv = List{}; + }); + +} // namespace cuda +} // namespace runtime +} // namespace dgl + +#endif diff --git a/src/runtime/ndarray.cc b/src/runtime/ndarray.cc index abbe3b1310f4..797dfeff9012 100644 --- a/src/runtime/ndarray.cc +++ b/src/runtime/ndarray.cc @@ -23,12 +23,12 @@ constexpr DGLDataType DGLDataTypeTraits::dtype; constexpr DGLDataType DGLDataTypeTraits::dtype; constexpr DGLDataType DGLDataTypeTraits::dtype; constexpr DGLDataType DGLDataTypeTraits::dtype; -#ifdef DGL_USE_CUDA +#ifdef DGL_USE_ROCM constexpr DGLDataType DGLDataTypeTraits<__half>::dtype; #if BF16_ENABLED -constexpr DGLDataType DGLDataTypeTraits<__nv_bfloat16>::dtype; +constexpr DGLDataType DGLDataTypeTraits<__hip_bfloat16>::dtype; #endif // BF16_ENABLED -#endif // DGL_USE_CUDA +#endif // DGL_USE_ROCM constexpr DGLDataType DGLDataTypeTraits::dtype; constexpr DGLDataType DGLDataTypeTraits::dtype; @@ -262,7 +262,7 @@ void NDArray::PinContainer(NDArray::Container* ptr) { void NDArray::UnpinContainer(NDArray::Container* ptr) { auto container_is_pinned = IsContainerPinned(ptr); // The tensor may be pinned outside of DGL via a different CUDA API, - // so we cannot unpin it with cudaHostUnregister. + // so we cannot unpin it with hipHostUnregister. CHECK(ptr->pinned_by_dgl_ || !container_is_pinned) << "Cannot unpin a tensor that is pinned outside of DGL."; // 1. 
not pinned, do nothing diff --git a/src/runtime/ndarray.cc.prehip b/src/runtime/ndarray.cc.prehip new file mode 100644 index 000000000000..abbe3b1310f4 --- /dev/null +++ b/src/runtime/ndarray.cc.prehip @@ -0,0 +1,505 @@ +/** + * Copyright (c) 2017-2022 by Contributors + * @file ndarray.cc + * @brief NDArray container infratructure. + */ +#include +#include +#include +#include +#include +#include +#include +#include + +#include "runtime_base.h" + +namespace dgl { + +constexpr DGLDataType DGLDataTypeTraits::dtype; +constexpr DGLDataType DGLDataTypeTraits::dtype; +constexpr DGLDataType DGLDataTypeTraits::dtype; +constexpr DGLDataType DGLDataTypeTraits::dtype; +constexpr DGLDataType DGLDataTypeTraits::dtype; +constexpr DGLDataType DGLDataTypeTraits::dtype; +constexpr DGLDataType DGLDataTypeTraits::dtype; +#ifdef DGL_USE_CUDA +constexpr DGLDataType DGLDataTypeTraits<__half>::dtype; +#if BF16_ENABLED +constexpr DGLDataType DGLDataTypeTraits<__nv_bfloat16>::dtype; +#endif // BF16_ENABLED +#endif // DGL_USE_CUDA +constexpr DGLDataType DGLDataTypeTraits::dtype; +constexpr DGLDataType DGLDataTypeTraits::dtype; + +namespace runtime { + +inline void VerifyDataType(DGLDataType dtype) { + CHECK_GE(dtype.lanes, 1); + if (dtype.code == kDGLFloat) { + CHECK_EQ(dtype.bits % 8, 0); + } else { + CHECK_EQ(dtype.bits % 8, 0); + } + CHECK_EQ(dtype.bits & (dtype.bits - 1), 0); +} + +inline size_t GetDataSize(const DGLArray& arr) { + size_t size = 1; + for (dgl_index_t i = 0; i < arr.ndim; ++i) { + size *= arr.shape[i]; + } + size *= (arr.dtype.bits * arr.dtype.lanes + 7) / 8; + return size; +} + +inline size_t GetDataAlignment(const DGLArray& arr) { + size_t align = (arr.dtype.bits / 8) * arr.dtype.lanes; + if (align < kAllocAlignment) return kAllocAlignment; + return align; +} + +void NDArray::Internal::DefaultDeleter(NDArray::Container* ptr) { + using dgl::runtime::NDArray; + if (ptr->manager_ctx != nullptr) { + static_cast(ptr->manager_ctx)->DecRef(); + } else if (ptr->mem) { + ptr->mem = nullptr; + } else if (ptr->dl_tensor.data != nullptr) { + // if the array is still pinned before freeing, unpin it. + if (ptr->pinned_by_dgl_) UnpinContainer(ptr); + if (ptr->pinned_by_pytorch_) { + DeviceAPI::Get(kDGLCUDA)->FreePinnedDataSpace( + &(ptr->pytorch_raw_deleter_)); + CHECK(ptr->pytorch_raw_deleter_ == nullptr); + ptr->pinned_by_pytorch_ = false; + ptr->pytorch_ctx_ = nullptr; + } else { + dgl::runtime::DeviceAPI::Get(ptr->dl_tensor.ctx) + ->FreeDataSpace(ptr->dl_tensor.ctx, ptr->dl_tensor.data); + } + } + delete ptr; +} + +NDArray NDArray::Internal::Create( + std::vector shape, DGLDataType dtype, DGLContext ctx) { + VerifyDataType(dtype); + // critical zone + NDArray::Container* data = new NDArray::Container(); + data->deleter = DefaultDeleter; + NDArray ret(data); + ret.data_ = data; + // RAII now in effect + // setup shape + data->shape_ = std::move(shape); + data->dl_tensor.shape = dmlc::BeginPtr(data->shape_); + data->dl_tensor.ndim = static_cast(data->shape_.size()); + // setup stride (this should be optional, but some framework + // does not support NULL stride and thus will crash the program). 
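+  // For example, a row-major tensor of shape {2, 3, 4} ends up with strides
+  // {12, 4, 1}: stride[ndim - 1] = 1 and stride[i] = shape[i + 1] * stride[i + 1],
+  // which is exactly what the resize-to-1 plus the loop below compute.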
+ data->stride_.resize(data->dl_tensor.ndim, 1); + for (int i = data->dl_tensor.ndim - 2; i >= 0; --i) { + data->stride_[i] = data->shape_[i + 1] * data->stride_[i + 1]; + } + data->dl_tensor.strides = dmlc::BeginPtr(data->stride_); + // setup dtype + data->dl_tensor.dtype = dtype; + // setup ctx + data->dl_tensor.ctx = ctx; + return ret; +} + +DGLArray* NDArray::Internal::MoveAsDGLArray(NDArray arr) { + DGLArray* tensor = reinterpret_cast(arr.data_); + CHECK(tensor == const_cast(arr.operator->())); + arr.data_ = nullptr; + return tensor; +} + +size_t NDArray::GetSize() const { return GetDataSize(data_->dl_tensor); } + +int64_t NDArray::NumElements() const { + if (data_->dl_tensor.ndim == 0) return 0; + int64_t size = 1; + for (int i = 0; i < data_->dl_tensor.ndim; ++i) { + size *= data_->dl_tensor.shape[i]; + } + return size; +} + +bool NDArray::IsContiguous() const { + CHECK(data_ != nullptr); + if (data_->dl_tensor.strides == nullptr) return true; + + // See https://github.com/dmlc/dgl/issues/2118 and PyTorch's + // compute_contiguous() implementation + int64_t z = 1; + for (int64_t i = data_->dl_tensor.ndim - 1; i >= 0; --i) { + if (data_->dl_tensor.shape[i] != 1) { + if (data_->dl_tensor.strides[i] == z) + z *= data_->dl_tensor.shape[i]; + else + return false; + } + } + return true; +} + +NDArray NDArray::CreateView( + std::vector shape, DGLDataType dtype, int64_t offset) { + CHECK(data_ != nullptr); + CHECK(IsContiguous()) << "Can only create view for compact tensor"; + NDArray ret = Internal::Create(shape, dtype, data_->dl_tensor.ctx); + ret.data_->dl_tensor.byte_offset = this->data_->dl_tensor.byte_offset; + size_t curr_size = GetDataSize(this->data_->dl_tensor); + size_t view_size = GetDataSize(ret.data_->dl_tensor); + CHECK_LE(view_size, curr_size) + << "Tries to create a view that has bigger memory than current one"; + // increase ref count + this->data_->IncRef(); + ret.data_->manager_ctx = this->data_; + ret.data_->dl_tensor.data = + static_cast(this->data_->dl_tensor.data) + offset; + return ret; +} + +NDArray NDArray::EmptyShared( + const std::string& name, std::vector shape, DGLDataType dtype, + DGLContext ctx, bool is_create) { + NDArray ret = Internal::Create(shape, dtype, ctx); + size_t size = GetDataSize(ret.data_->dl_tensor); + auto mem = std::make_shared(name); + if (is_create) { + ret.data_->dl_tensor.data = mem->CreateNew(size); + } else { + ret.data_->dl_tensor.data = mem->Open(size); + } + + ret.data_->mem = mem; + return ret; +} + +NDArray NDArray::Empty( + std::vector shape, DGLDataType dtype, DGLContext ctx) { + NDArray ret = Internal::Create(shape, dtype, ctx); + size_t size = GetDataSize(ret.data_->dl_tensor); + size_t alignment = GetDataAlignment(ret.data_->dl_tensor); + if (size > 0) + ret.data_->dl_tensor.data = DeviceAPI::Get(ret->ctx)->AllocDataSpace( + ret->ctx, size, alignment, ret->dtype); + return ret; +} + +void NDArray::CopyFromTo(DGLArray* from, DGLArray* to) { + size_t from_size = GetDataSize(*from); + size_t to_size = GetDataSize(*to); + CHECK_EQ(from_size, to_size) + << "DGLArrayCopyFromTo: The size must exactly match"; + + CHECK( + from->ctx.device_type == to->ctx.device_type || + from->ctx.device_type == kDGLCPU || to->ctx.device_type == kDGLCPU) + << "Can not copy across different ctx types directly"; + + // Use the context that is *not* a cpu context to get the correct device + // api manager. + DGLContext ctx = from->ctx.device_type != kDGLCPU ? 
from->ctx : to->ctx; + + // default: local current cuda stream + DeviceAPI::Get(ctx)->CopyDataFromTo( + from->data, static_cast(from->byte_offset), to->data, + static_cast(to->byte_offset), from_size, from->ctx, to->ctx, + from->dtype); +} + +void NDArray::RecordedCopyFromTo( + DGLArray* from, DGLArray* to, void* pytorch_ctx) { + size_t from_size = GetDataSize(*from); + size_t to_size = GetDataSize(*to); + CHECK_EQ(from_size, to_size) + << "DGLArrayCopyFromTo: The size must exactly match."; + + CHECK(from->ctx.device_type != to->ctx.device_type) + << "Recoding event is only called for the copy between CPU and GPU."; + + CHECK(from->ctx.device_type == kDGLCUDA || to->ctx.device_type == kDGLCUDA) + << "At least one CUDA ctx needs to be involved."; + + DeviceAPI::Get(kDGLCUDA)->RecordedCopyDataFromTo( + from->data, static_cast(from->byte_offset), to->data, + static_cast(to->byte_offset), from_size, from->ctx, to->ctx, + from->dtype, pytorch_ctx); +} + +NDArray NDArray::PinnedEmpty( + std::vector shape, DGLDataType dtype, DGLContext ctx) { + CHECK_EQ(ctx.device_type, kDGLCPU) << "Only NDArray on CPU can be pinned"; + NDArray ret = Internal::Create(shape, dtype, ctx); + size_t size = GetDataSize(ret.data_->dl_tensor); + if (size > 0) { + ret.data_->dl_tensor.data = DeviceAPI::Get(kDGLCUDA)->AllocPinnedDataSpace( + size, &(ret.data_->pytorch_ctx_), &(ret.data_->pytorch_raw_deleter_)); + CHECK( + ret.data_->pytorch_ctx_ != nullptr && + ret.data_->pytorch_raw_deleter_ != nullptr) + << "The allocation failed in PyTorch's CachingHostAllocator. " + << "The returned context pointer is " << ret.data_->pytorch_ctx_ + << " and the function deleter is " << ret.data_->pytorch_raw_deleter_; + ret.data_->pinned_by_pytorch_ = true; + } + return ret; +} + +void NDArray::PinContainer(NDArray::Container* ptr) { + if (IsContainerPinned(ptr)) return; + auto* tensor = &(ptr->dl_tensor); + CHECK_EQ(tensor->ctx.device_type, kDGLCPU) + << "Only NDArray on CPU can be pinned"; + ptr->pinned_by_dgl_ = + DeviceAPI::Get(kDGLCUDA)->PinData(tensor->data, GetDataSize(*tensor)); +} + +void NDArray::UnpinContainer(NDArray::Container* ptr) { + auto container_is_pinned = IsContainerPinned(ptr); + // The tensor may be pinned outside of DGL via a different CUDA API, + // so we cannot unpin it with cudaHostUnregister. + CHECK(ptr->pinned_by_dgl_ || !container_is_pinned) + << "Cannot unpin a tensor that is pinned outside of DGL."; + // 1. not pinned, do nothing + if (!container_is_pinned) return; + // 2. 
pinned by DGL, unpin it + DeviceAPI::Get(kDGLCUDA)->UnpinData(ptr->dl_tensor.data); + ptr->pinned_by_dgl_ = false; +} + +void NDArray::RecordStream(DGLArray* tensor, DGLStreamHandle stream) { + TensorDispatcher* tensor_dispatcher = TensorDispatcher::Global(); + CHECK(tensor_dispatcher->IsAvailable()) + << "RecordStream only works when TensorAdapter is available."; + CHECK_EQ(tensor->ctx.device_type, kDGLCUDA) + << "RecordStream only works with GPU tensors."; + + tensor_dispatcher->RecordStream(tensor->data, stream, tensor->ctx.device_id); +} + +template +NDArray NDArray::FromVector(const std::vector& vec, DGLContext ctx) { + const DGLDataType dtype = DGLDataTypeTraits::dtype; + int64_t size = static_cast(vec.size()); + NDArray ret = NDArray::Empty({size}, dtype, ctx); + DeviceAPI::Get(ctx)->CopyDataFromTo( + vec.data(), 0, static_cast(ret->data), 0, size * sizeof(T), + DGLContext{kDGLCPU, 0}, ctx, dtype); + return ret; +} + +NDArray NDArray::CreateFromRaw( + const std::vector& shape, DGLDataType dtype, DGLContext ctx, + void* raw, bool auto_free) { + NDArray ret = Internal::Create(shape, dtype, ctx); + ret.data_->dl_tensor.data = raw; + if (!auto_free) ret.data_->deleter = nullptr; + return ret; +} + +// export specializations +template NDArray NDArray::FromVector( + const std::vector&, DGLContext); +template NDArray NDArray::FromVector( + const std::vector&, DGLContext); +template NDArray NDArray::FromVector( + const std::vector&, DGLContext); +template NDArray NDArray::FromVector( + const std::vector&, DGLContext); +template NDArray NDArray::FromVector( + const std::vector&, DGLContext); +template NDArray NDArray::FromVector( + const std::vector&, DGLContext); + +template +std::vector NDArray::ToVector() const { + const DGLDataType dtype = DGLDataTypeTraits::dtype; + CHECK(data_->dl_tensor.ndim == 1) + << "ToVector() only supported for 1D arrays"; + CHECK(data_->dl_tensor.dtype == dtype) << "dtype mismatch"; + + int64_t size = data_->dl_tensor.shape[0]; + std::vector vec(size); + const DGLContext& ctx = data_->dl_tensor.ctx; + DeviceAPI::Get(ctx)->CopyDataFromTo( + static_cast(data_->dl_tensor.data), 0, vec.data(), 0, + size * sizeof(T), ctx, DGLContext{kDGLCPU, 0}, dtype); + return vec; +} + +template std::vector NDArray::ToVector() const; +template std::vector NDArray::ToVector() const; +template std::vector NDArray::ToVector() const; +template std::vector NDArray::ToVector() const; +template std::vector NDArray::ToVector() const; +template std::vector NDArray::ToVector() const; + +std::shared_ptr NDArray::GetSharedMem() const { + return this->data_->mem; +} + +bool NDArray::IsContainerPinned(NDArray::Container* ptr) { + if (ptr->pinned_by_dgl_ || ptr->pinned_by_pytorch_) return true; + auto* tensor = &(ptr->dl_tensor); + // Can only be pinned if on CPU... + if (tensor->ctx.device_type != kDGLCPU) return false; + // ... and CUDA device API is enabled, and the tensor is indeed in pinned + // memory. 
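+  // The `true` argument asks DeviceAPI::Get() not to fail when no GPU runtime
+  // is available; in that case it returns nullptr and the check below simply
+  // reports the tensor as not pinned.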
+ auto device = DeviceAPI::Get(kDGLCUDA, true); + return device && device->IsPinned(tensor->data); +} + +void NDArray::Save(dmlc::Stream* strm) const { + auto zc_strm = dynamic_cast(strm); + if (zc_strm) { + zc_strm->PushNDArray(*this); + return; + } + SaveDGLArray(strm, const_cast(operator->())); +} + +bool NDArray::Load(dmlc::Stream* strm) { + auto zc_strm = dynamic_cast(strm); + if (zc_strm) { + *this = zc_strm->PopNDArray(); + return true; + } + uint64_t header, reserved; + CHECK(strm->Read(&header)) << "Invalid DGLArray file format"; + CHECK(strm->Read(&reserved)) << "Invalid DGLArray file format"; + CHECK(header == kDGLNDArrayMagic) << "Invalid DGLArray file format"; + DGLContext ctx; + int ndim; + DGLDataType dtype; + CHECK(strm->Read(&ctx)) << "Invalid DGLArray file format"; + CHECK(strm->Read(&ndim)) << "Invalid DGLArray file format"; + CHECK(strm->Read(&dtype)) << "Invalid DGLArray file format"; + CHECK_EQ(ctx.device_type, kDGLCPU) + << "Invalid DGLArray context: can only save as CPU tensor"; + std::vector shape(ndim); + if (ndim != 0) { + CHECK(strm->ReadArray(&shape[0], ndim)) << "Invalid DGLArray file format"; + } + NDArray ret = NDArray::Empty(shape, dtype, ctx); + int64_t num_elems = 1; + int elem_bytes = (ret->dtype.bits + 7) / 8; + for (int i = 0; i < ret->ndim; ++i) { + num_elems *= ret->shape[i]; + } + int64_t data_byte_size; + CHECK(strm->Read(&data_byte_size)) << "Invalid DGLArray file format"; + CHECK(data_byte_size == num_elems * elem_bytes) + << "Invalid DGLArray file format"; + if (data_byte_size != 0) { + // strm->Read will return the total number of elements successfully read. + // Therefore if data_byte_size is zero, the CHECK below would fail. + CHECK(strm->Read(ret->data, data_byte_size)) + << "Invalid DGLArray file format"; + } + if (!DMLC_IO_NO_ENDIAN_SWAP) { + dmlc::ByteSwap(ret->data, elem_bytes, num_elems); + } + *this = ret; + return true; +} + +} // namespace runtime +} // namespace dgl + +using namespace dgl::runtime; + +int DGLArrayAlloc( + const dgl_index_t* shape, int ndim, int dtype_code, int dtype_bits, + int dtype_lanes, int device_type, int device_id, DGLArrayHandle* out) { + API_BEGIN(); + DGLDataType dtype; + dtype.code = static_cast(dtype_code); + dtype.bits = static_cast(dtype_bits); + dtype.lanes = static_cast(dtype_lanes); + DGLContext ctx; + ctx.device_type = static_cast(device_type); + ctx.device_id = device_id; + *out = NDArray::Internal::MoveAsDGLArray( + NDArray::Empty(std::vector(shape, shape + ndim), dtype, ctx)); + API_END(); +} + +int DGLArrayAllocSharedMem( + const char* mem_name, const dgl_index_t* shape, int ndim, int dtype_code, + int dtype_bits, int dtype_lanes, bool is_create, DGLArrayHandle* out) { + API_BEGIN(); + DGLDataType dtype; + dtype.code = static_cast(dtype_code); + dtype.bits = static_cast(dtype_bits); + dtype.lanes = static_cast(dtype_lanes); + std::vector shape_vec(shape, shape + ndim); + NDArray arr = NDArray::EmptyShared( + mem_name, shape_vec, dtype, DGLContext{kDGLCPU, 0}, is_create); + *out = NDArray::Internal::MoveAsDGLArray(arr); + API_END(); +} + +int DGLArrayFree(DGLArrayHandle handle) { + API_BEGIN(); + reinterpret_cast(handle)->DecRef(); + API_END(); +} + +int DGLArrayCopyFromTo(DGLArrayHandle from, DGLArrayHandle to) { + API_BEGIN(); + NDArray::CopyFromTo(from, to); + API_END(); +} + +int DGLArrayCopyFromBytes(DGLArrayHandle handle, void* data, size_t nbytes) { + API_BEGIN(); + DGLContext cpu_ctx; + cpu_ctx.device_type = kDGLCPU; + cpu_ctx.device_id = 0; + size_t arr_size = GetDataSize(*handle); + 
CHECK_EQ(arr_size, nbytes) << "DGLArrayCopyFromBytes: size mismatch"; + DeviceAPI::Get(handle->ctx) + ->CopyDataFromTo( + data, 0, handle->data, static_cast(handle->byte_offset), + nbytes, cpu_ctx, handle->ctx, handle->dtype); + API_END(); +} + +int DGLArrayCopyToBytes(DGLArrayHandle handle, void* data, size_t nbytes) { + API_BEGIN(); + DGLContext cpu_ctx; + cpu_ctx.device_type = kDGLCPU; + cpu_ctx.device_id = 0; + size_t arr_size = GetDataSize(*handle); + CHECK_EQ(arr_size, nbytes) << "DGLArrayCopyToBytes: size mismatch"; + DeviceAPI::Get(handle->ctx) + ->CopyDataFromTo( + handle->data, static_cast(handle->byte_offset), data, 0, + nbytes, handle->ctx, cpu_ctx, handle->dtype); + API_END(); +} + +int DGLArrayPinData(DGLArrayHandle handle, DGLContext ctx) { + API_BEGIN(); + auto* nd_container = reinterpret_cast(handle); + NDArray::PinContainer(nd_container); + API_END(); +} + +int DGLArrayUnpinData(DGLArrayHandle handle, DGLContext ctx) { + API_BEGIN(); + auto* nd_container = reinterpret_cast(handle); + NDArray::UnpinContainer(nd_container); + API_END(); +} + +int DGLArrayRecordStream(DGLArrayHandle handle, DGLStreamHandle stream) { + API_BEGIN(); + NDArray::RecordStream(handle, stream); + API_END(); +} diff --git a/tensoradapter/include/tensoradapter.h b/tensoradapter/include/tensoradapter.h index cf7341cac105..990043920ab8 100644 --- a/tensoradapter/include/tensoradapter.h +++ b/tensoradapter/include/tensoradapter.h @@ -1,3 +1,4 @@ +// !!! This is a file automatically generated by hipify!!! /** * Copyright (c) 2020-2022 by Contributors * @file tensoradapter.h @@ -10,9 +11,9 @@ #ifndef TENSORADAPTER_H_ #define TENSORADAPTER_H_ -#ifdef DGL_USE_CUDA -#include -#endif // DGL_USE_CUDA +#ifdef DGL_USE_ROCM +#include +#endif // DGL_USE_ROCM namespace tensoradapter { @@ -34,7 +35,7 @@ void* CPURawAlloc(size_t nbytes); */ void CPURawDelete(void* ptr); -#ifdef DGL_USE_CUDA +#ifdef DGL_USE_ROCM /** * @brief Allocate a piece of GPU memory via * PyTorch's THCCachingAllocator. @@ -43,7 +44,7 @@ void CPURawDelete(void* ptr); * @param stream The stream to be allocated on. * @return Pointer to the allocated memory. */ -void* CUDARawAlloc(size_t nbytes, cudaStream_t stream); +void* CUDARawAlloc(size_t nbytes, hipStream_t stream); /** * @brief Free the GPU memory. @@ -55,7 +56,7 @@ void CUDARawDelete(void* ptr); /** * @brief Get the current CUDA stream. */ -cudaStream_t CUDACurrentStream(); +hipStream_t CUDACurrentStream(); /** * @brief Let the caching allocator know which streams are using this tensor. @@ -64,7 +65,7 @@ cudaStream_t CUDACurrentStream(); * @param stream The stream that is using this tensor. * @param device_id Device of the tensor. */ -void RecordStream(void* ptr, cudaStream_t stream, int device_id); +void RecordStream(void* ptr, hipStream_t stream, int device_id); /** * @brief Allocate a piece of pinned CPU memory via @@ -98,14 +99,14 @@ void CUDARawHostDelete(void** raw_deleter); * @param device_id Device of the tensor. */ void CUDARecordHostAlloc( - void* data, void* ctx, cudaStream_t stream, int device_id); + void* data, void* ctx, hipStream_t stream, int device_id); /** * @brief Release cached pinned memory allocations via cudaHostFree. 
*/ void CUDAHostAllocatorEmptyCache(); -#endif // DGL_USE_CUDA +#endif // DGL_USE_ROCM } }; // namespace tensoradapter diff --git a/tensoradapter/include/tensoradapter.h.prehip b/tensoradapter/include/tensoradapter.h.prehip new file mode 100644 index 000000000000..cf7341cac105 --- /dev/null +++ b/tensoradapter/include/tensoradapter.h.prehip @@ -0,0 +1,113 @@ +/** + * Copyright (c) 2020-2022 by Contributors + * @file tensoradapter.h + * @brief Header file for functions exposed by the adapter library. + * + * Functions in this library must be exported with extern "C" so that DGL can + * locate them with dlsym(3) (or GetProcAddress on Windows). + */ + +#ifndef TENSORADAPTER_H_ +#define TENSORADAPTER_H_ + +#ifdef DGL_USE_CUDA +#include +#endif // DGL_USE_CUDA + +namespace tensoradapter { + +extern "C" { + +/** + * @brief Allocate a piece of CPU memory via + * PyTorch's CPUAllocator + * + * @param nbytes The size to be allocated. + * @return Pointer to the allocated memory. + */ +void* CPURawAlloc(size_t nbytes); + +/** + * @brief Free the CPU memory. + * + * @param ptr Pointer to the memory to be freed. + */ +void CPURawDelete(void* ptr); + +#ifdef DGL_USE_CUDA +/** + * @brief Allocate a piece of GPU memory via + * PyTorch's THCCachingAllocator. + * + * @param nbytes The size to be allocated. + * @param stream The stream to be allocated on. + * @return Pointer to the allocated memory. + */ +void* CUDARawAlloc(size_t nbytes, cudaStream_t stream); + +/** + * @brief Free the GPU memory. + * + * @param ptr Pointer to the memory to be freed. + */ +void CUDARawDelete(void* ptr); + +/** + * @brief Get the current CUDA stream. + */ +cudaStream_t CUDACurrentStream(); + +/** + * @brief Let the caching allocator know which streams are using this tensor. + * + * @param ptr Pointer of the tensor to be recorded. + * @param stream The stream that is using this tensor. + * @param device_id Device of the tensor. + */ +void RecordStream(void* ptr, cudaStream_t stream, int device_id); + +/** + * @brief Allocate a piece of pinned CPU memory via + * PyTorch's CachingHostAllocator. + * + * @param nbytes The size to be allocated. + * @param ctx Pointer to the PyTorch storage ctx ptr returned from the + * allocator. + * @param deleter Pointer to the delete function ptr returned from the + * allocator. + * @return Raw pointer to the allocated memory. + */ +void* CUDARawHostAlloc(size_t nbytes, void** ctx, void** raw_deleter); + +/** + * @brief 'Free' the pinned CPU memory via + * inserting the memory block back to the free list. + * + * @param deleter Pointer to the delete function ptr returned from the + * allocator. + */ +void CUDARawHostDelete(void** raw_deleter); + +/** + * @brief 'Record' a CUDA stream (usually from a copy kernel) for the pinned + * memory via PyTorch's CachingHostAllocator. + * + * @param data Pointer of the tensor to be recorded. + * @param ctx PyTorch storage ctx ptr returned from the allocator. + * @param stream The stream that currently consumes this tensor. + * @param device_id Device of the tensor. + */ +void CUDARecordHostAlloc( + void* data, void* ctx, cudaStream_t stream, int device_id); + +/** + * @brief Release cached pinned memory allocations via cudaHostFree. 
+ */ +void CUDAHostAllocatorEmptyCache(); + +#endif // DGL_USE_CUDA +} + +}; // namespace tensoradapter + +#endif // TENSORADAPTER_H_ diff --git a/tensoradapter/pytorch/torch.cpp b/tensoradapter/pytorch/torch.cpp index e02b02e46dda..55183595b1a7 100644 --- a/tensoradapter/pytorch/torch.cpp +++ b/tensoradapter/pytorch/torch.cpp @@ -1,3 +1,4 @@ +// !!! This is a file automatically generated by hipify!!! /** * Copyright (c) 2020-2022 by Contributors * @file torch/torch.cpp @@ -6,13 +7,13 @@ #include #include -#ifdef DGL_USE_CUDA -#include -#include -#include -#include -#include -#endif // DGL_USE_CUDA +#ifdef DGL_USE_ROCM +#include +#include +#include +#include +#include +#endif // DGL_USE_ROCM namespace tensoradapter { @@ -26,30 +27,30 @@ TA_EXPORTS void CPURawDelete(void* ptr) { c10::GetCPUAllocator()->raw_deallocate(ptr); } -#ifdef DGL_USE_CUDA -TA_EXPORTS void* CUDARawAlloc(size_t nbytes, cudaStream_t stream) { +#ifdef DGL_USE_ROCM +TA_EXPORTS void* CUDARawAlloc(size_t nbytes, hipStream_t stream) { at::globalContext().lazyInitCUDA(); - return c10::cuda::CUDACachingAllocator::raw_alloc_with_stream(nbytes, stream); + return c10::hip::HIPCachingAllocator::raw_alloc_with_stream(nbytes, stream); } TA_EXPORTS void CUDARawDelete(void* ptr) { - c10::cuda::CUDACachingAllocator::raw_delete(ptr); + c10::hip::HIPCachingAllocator::raw_delete(ptr); } -TA_EXPORTS cudaStream_t CUDACurrentStream() { - return at::cuda::getCurrentCUDAStream(); +TA_EXPORTS hipStream_t CUDACurrentStream() { + return at::hip::getCurrentHIPStreamMasqueradingAsCUDA(); } -TA_EXPORTS void RecordStream(void* ptr, cudaStream_t stream, int device_id) { +TA_EXPORTS void RecordStream(void* ptr, hipStream_t stream, int device_id) { c10::DataPtr data_ptr{ - ptr, ptr, c10::cuda::CUDACachingAllocator::get()->raw_deleter(), + ptr, ptr, c10::hip::HIPCachingAllocatorMasqueradingAsCUDA::get()->raw_deleter(), c10::Device(c10::DeviceType::CUDA, device_id)}; - c10::cuda::CUDACachingAllocator::recordStream( + c10::hip::HIPCachingAllocatorMasqueradingAsCUDA::recordStreamMasqueradingAsCUDA( data_ptr, - // getStreamFromExternal doesn't exist before PyTorch 1.10, just copy it + // getStreamFromExternalMasqueradingAsCUDA doesn't exist before PyTorch 1.10, just copy it // here - c10::cuda::CUDAStream( - c10::cuda::CUDAStream::UNCHECKED, + c10::hip::HIPStreamMasqueradingAsCUDA( + c10::hip::HIPStreamMasqueradingAsCUDA::UNCHECKED, c10::Stream( c10::Stream::UNSAFE, c10::Device(c10::DeviceType::CUDA, device_id), @@ -86,11 +87,11 @@ TA_EXPORTS void CUDARawHostDelete(void** raw_deleter) { } TA_EXPORTS void CUDARecordHostAlloc( - void* ptr, void* ctx, cudaStream_t stream, int device_id) { + void* ptr, void* ctx, hipStream_t stream, int device_id) { at::cuda::CachingHostAllocator_recordEvent( ptr, ctx, - c10::cuda::CUDAStream( - c10::cuda::CUDAStream::UNCHECKED, + c10::hip::HIPStreamMasqueradingAsCUDA( + c10::hip::HIPStreamMasqueradingAsCUDA::UNCHECKED, c10::Stream( c10::Stream::UNSAFE, c10::Device(c10::DeviceType::CUDA, device_id), @@ -100,7 +101,7 @@ TA_EXPORTS void CUDARecordHostAlloc( TA_EXPORTS void CUDAHostAllocatorEmptyCache() { at::cuda::CachingHostAllocator_emptyCache(); } -#endif // DGL_USE_CUDA +#endif // DGL_USE_ROCM }; }; // namespace tensoradapter diff --git a/tensoradapter/pytorch/torch.cpp.prehip b/tensoradapter/pytorch/torch.cpp.prehip new file mode 100644 index 000000000000..e02b02e46dda --- /dev/null +++ b/tensoradapter/pytorch/torch.cpp.prehip @@ -0,0 +1,106 @@ +/** + * Copyright (c) 2020-2022 by Contributors + * @file torch/torch.cpp + * 
@brief Implementation of PyTorch adapter library. + */ + +#include +#include +#ifdef DGL_USE_CUDA +#include +#include +#include +#include +#include +#endif // DGL_USE_CUDA + +namespace tensoradapter { + +extern "C" { + +TA_EXPORTS void* CPURawAlloc(size_t nbytes) { + return c10::GetCPUAllocator()->raw_allocate(nbytes); +} + +TA_EXPORTS void CPURawDelete(void* ptr) { + c10::GetCPUAllocator()->raw_deallocate(ptr); +} + +#ifdef DGL_USE_CUDA +TA_EXPORTS void* CUDARawAlloc(size_t nbytes, cudaStream_t stream) { + at::globalContext().lazyInitCUDA(); + return c10::cuda::CUDACachingAllocator::raw_alloc_with_stream(nbytes, stream); +} + +TA_EXPORTS void CUDARawDelete(void* ptr) { + c10::cuda::CUDACachingAllocator::raw_delete(ptr); +} + +TA_EXPORTS cudaStream_t CUDACurrentStream() { + return at::cuda::getCurrentCUDAStream(); +} + +TA_EXPORTS void RecordStream(void* ptr, cudaStream_t stream, int device_id) { + c10::DataPtr data_ptr{ + ptr, ptr, c10::cuda::CUDACachingAllocator::get()->raw_deleter(), + c10::Device(c10::DeviceType::CUDA, device_id)}; + c10::cuda::CUDACachingAllocator::recordStream( + data_ptr, + // getStreamFromExternal doesn't exist before PyTorch 1.10, just copy it + // here + c10::cuda::CUDAStream( + c10::cuda::CUDAStream::UNCHECKED, + c10::Stream( + c10::Stream::UNSAFE, + c10::Device(c10::DeviceType::CUDA, device_id), + reinterpret_cast(stream)))); + data_ptr.release_context(); +} + +class CUDAHostDeleter { + public: + explicit CUDAHostDeleter(std::unique_ptr ptr) + : ptr_(std::move(ptr)) {} + + private: + std::unique_ptr ptr_; +}; + +TA_EXPORTS void* CUDARawHostAlloc( + size_t nbytes, void** ctx, void** raw_deleter) { + auto data_ptr = at::cuda::getCachingHostAllocator()->allocate(nbytes); + auto raw = data_ptr.get(); + // Return the raw ctx ptr for recording event. + *ctx = data_ptr.get_context(); + + // Transfer ownership to raw_deleter. + auto* data_deleter = new CUDAHostDeleter(data_ptr.move_context()); + *raw_deleter = static_cast(data_deleter); + return raw; +} + +// Designated CUDAHostDeleter for CUDARawHostAlloc. 
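+// Illustrative call sequence from the DGL runtime side (see the declarations
+// in tensoradapter.h above; names are as documented there):
+//   void *ctx, *deleter;
+//   void *p = CUDARawHostAlloc(nbytes, &ctx, &deleter);  // pinned host buffer
+//   CUDARecordHostAlloc(p, ctx, stream, device_id);      // record consuming stream
+//   CUDARawHostDelete(&deleter);                         // return block to the pool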
+TA_EXPORTS void CUDARawHostDelete(void** raw_deleter) { + delete static_cast(*raw_deleter); + *raw_deleter = nullptr; +} + +TA_EXPORTS void CUDARecordHostAlloc( + void* ptr, void* ctx, cudaStream_t stream, int device_id) { + at::cuda::CachingHostAllocator_recordEvent( + ptr, ctx, + c10::cuda::CUDAStream( + c10::cuda::CUDAStream::UNCHECKED, + c10::Stream( + c10::Stream::UNSAFE, + c10::Device(c10::DeviceType::CUDA, device_id), + reinterpret_cast(stream)))); +} + +TA_EXPORTS void CUDAHostAllocatorEmptyCache() { + at::cuda::CachingHostAllocator_emptyCache(); +} +#endif // DGL_USE_CUDA +}; + +}; // namespace tensoradapter diff --git a/tests/cpp/common.h b/tests/cpp/common.h index b1e871f565b4..d0fdede622bf 100644 --- a/tests/cpp/common.h +++ b/tests/cpp/common.h @@ -5,7 +5,7 @@ static constexpr DGLContext CTX = DGLContext{kDGLCPU, 0}; static constexpr DGLContext CPU = DGLContext{kDGLCPU, 0}; -#ifdef DGL_USE_CUDA +#ifdef DGL_USE_ROCM static constexpr DGLContext GPU = DGLContext{kDGLCUDA, 0}; #endif diff --git a/tests/cpp/common.h.prehip b/tests/cpp/common.h.prehip new file mode 100644 index 000000000000..b1e871f565b4 --- /dev/null +++ b/tests/cpp/common.h.prehip @@ -0,0 +1,56 @@ +#ifndef TEST_COMMON_H_ +#define TEST_COMMON_H_ + +#include + +static constexpr DGLContext CTX = DGLContext{kDGLCPU, 0}; +static constexpr DGLContext CPU = DGLContext{kDGLCPU, 0}; +#ifdef DGL_USE_CUDA +static constexpr DGLContext GPU = DGLContext{kDGLCUDA, 0}; +#endif + +template +inline T* Ptr(dgl::runtime::NDArray nd) { + return static_cast(nd->data); +} + +inline int64_t* PI64(dgl::runtime::NDArray nd) { + return static_cast(nd->data); +} + +inline int32_t* PI32(dgl::runtime::NDArray nd) { + return static_cast(nd->data); +} + +inline int64_t Len(dgl::runtime::NDArray nd) { return nd->shape[0]; } + +template +inline bool ArrayEQ(dgl::runtime::NDArray a1, dgl::runtime::NDArray a2) { + if (a1->ndim != a2->ndim) return false; + if (a1->dtype != a2->dtype) return false; + if (a1->ctx != a2->ctx) return false; + if (a1.NumElements() != a2.NumElements()) return false; + if (a1.NumElements() == 0) return true; + int64_t num = 1; + for (int i = 0; i < a1->ndim; ++i) { + if (a1->shape[i] != a2->shape[i]) return false; + num *= a1->shape[i]; + } + a1 = a1.CopyTo(CPU); + a2 = a2.CopyTo(CPU); + for (int64_t i = 0; i < num; ++i) + if (static_cast(a1->data)[i] != static_cast(a2->data)[i]) + return false; + return true; +} + +template +inline bool IsInArray(dgl::runtime::NDArray a, T x) { + if (!a.defined() || a->shape[0] == 0) return false; + for (int64_t i = 0; i < a->shape[0]; ++i) { + if (x == static_cast(a->data)[i]) return true; + } + return false; +} + +#endif // TEST_COMMON_H_ diff --git a/tests/cpp/test_aten.cc b/tests/cpp/test_aten.cc index 7edca973a90d..57b93c52942b 100644 --- a/tests/cpp/test_aten.cc +++ b/tests/cpp/test_aten.cc @@ -38,7 +38,7 @@ void _TestRange(DGLContext ctx) { TEST(ArrayTest, TestRange) { _TestRange(CPU); -#ifdef DGL_USE_CUDA +#ifdef DGL_USE_ROCM _TestRange(GPU); #endif }; @@ -78,7 +78,7 @@ void _TestNumBits(DGLContext ctx) { TEST(ArrayTest, TestAsNumBits) { _TestNumBits(CPU); -#ifdef DGL_USE_CUDA +#ifdef DGL_USE_ROCM _TestNumBits(GPU); #endif }; @@ -170,7 +170,7 @@ void _TestArith(DGLContext ctx) { TEST(ArrayTest, Arith) { _TestArith(CPU); _TestArith(CPU); -#ifdef DGL_USE_CUDA +#ifdef DGL_USE_ROCM _TestArith(GPU); _TestArith(GPU); #endif @@ -189,7 +189,7 @@ void _TestHStack(DGLContext ctx) { TEST(ArrayTest, HStack) { _TestHStack(CPU); _TestHStack(CPU); -#ifdef DGL_USE_CUDA +#ifdef DGL_USE_ROCM 
_TestHStack(GPU); _TestHStack(GPU); #endif @@ -210,7 +210,7 @@ void _TestIndexSelect(DGLContext ctx) { TEST(ArrayTest, TestIndexSelect) { _TestIndexSelect(CPU); _TestIndexSelect(CPU); -#ifdef DGL_USE_CUDA +#ifdef DGL_USE_ROCM _TestIndexSelect(GPU); _TestIndexSelect(GPU); #endif @@ -239,7 +239,7 @@ void _TestRelabel_(DGLContext ctx) { TEST(ArrayTest, TestRelabel_) { _TestRelabel_(CPU); _TestRelabel_(CPU); -#ifdef DGL_USE_CUDA +#ifdef DGL_USE_ROCM _TestRelabel_(GPU); _TestRelabel_(GPU); #endif @@ -556,7 +556,7 @@ void _TestDisjointUnionPartitionCoo(DGLContext ctx) { TEST(DisjointUnionTest, TestDisjointUnionPartitionCoo) { _TestDisjointUnionPartitionCoo(CPU); _TestDisjointUnionPartitionCoo(CPU); -#ifdef DGL_USE_CUDA +#ifdef DGL_USE_ROCM _TestDisjointUnionPartitionCoo(GPU); _TestDisjointUnionPartitionCoo(GPU); #endif @@ -682,7 +682,7 @@ void _TestDisjointUnionPartitionCsr(DGLContext ctx) { TEST(DisjointUnionTest, TestDisjointUnionPartitionCsr) { _TestDisjointUnionPartitionCsr(CPU); _TestDisjointUnionPartitionCsr(CPU); -#ifdef DGL_USE_CUDA +#ifdef DGL_USE_ROCM _TestDisjointUnionPartitionCsr(GPU); _TestDisjointUnionPartitionCsr(GPU); #endif @@ -750,7 +750,7 @@ void _TestSliceContiguousChunkCoo(DGLContext ctx) { TEST(SliceContiguousChunk, TestSliceContiguousChunkCoo) { _TestSliceContiguousChunkCoo(CPU); _TestSliceContiguousChunkCoo(CPU); -#ifdef DGL_USE_CUDA +#ifdef DGL_USE_ROCM _TestSliceContiguousChunkCoo(GPU); _TestSliceContiguousChunkCoo(GPU); #endif @@ -817,7 +817,7 @@ void _TestSliceContiguousChunkCsr(DGLContext ctx) { TEST(SliceContiguousChunk, TestSliceContiguousChunkCsr) { _TestSliceContiguousChunkCsr(CPU); _TestSliceContiguousChunkCsr(CPU); -#ifdef DGL_USE_CUDA +#ifdef DGL_USE_ROCM _TestSliceContiguousChunkCsr(GPU); _TestSliceContiguousChunkCsr(GPU); #endif @@ -1224,7 +1224,7 @@ void _TestCumSum(DGLContext ctx) { TEST(ArrayTest, CumSum) { _TestCumSum(CPU); _TestCumSum(CPU); -#ifdef DGL_USE_CUDA +#ifdef DGL_USE_ROCM _TestCumSum(GPU); _TestCumSum(GPU); #endif @@ -1249,7 +1249,7 @@ TEST(ArrayTest, Scatter_) { _TestScatter_(CPU); _TestScatter_(CPU); _TestScatter_(CPU); -#ifdef DGL_USE_CUDA +#ifdef DGL_USE_ROCM _TestScatter_(GPU); _TestScatter_(GPU); _TestScatter_(GPU); @@ -1285,7 +1285,7 @@ void _TestNonZero(DGLContext ctx) { TEST(ArrayTest, NonZero) { _TestNonZero(CPU); _TestNonZero(CPU); -#ifdef DGL_USE_CUDA +#ifdef DGL_USE_ROCM _TestNonZero(GPU); _TestNonZero(GPU); #endif @@ -1421,7 +1421,7 @@ void _TestSort(DGLContext ctx) { TEST(ArrayTest, Sort) { _TestSort(CPU); _TestSort(CPU); -#ifdef DGL_USE_CUDA +#ifdef DGL_USE_ROCM _TestSort(GPU); _TestSort(GPU); #endif diff --git a/tests/cpp/test_aten.cc.prehip b/tests/cpp/test_aten.cc.prehip new file mode 100644 index 000000000000..7edca973a90d --- /dev/null +++ b/tests/cpp/test_aten.cc.prehip @@ -0,0 +1,1437 @@ +#include +#include + +#include "./common.h" + +using namespace dgl; +using namespace dgl::runtime; + +TEST(ArrayTest, TestCreate) { + IdArray a = aten::NewIdArray(100, CTX, 32); + ASSERT_EQ(a->dtype.bits, 32); + ASSERT_EQ(a->shape[0], 100); + + a = aten::NewIdArray(0); + ASSERT_EQ(a->shape[0], 0); + + std::vector vec = {2, 94, 232, 30}; + a = aten::VecToIdArray(vec, 32); + ASSERT_EQ(Len(a), vec.size()); + ASSERT_EQ(a->dtype.bits, 32); + for (int i = 0; i < Len(a); ++i) { + ASSERT_EQ(Ptr(a)[i], vec[i]); + } + + a = aten::VecToIdArray(std::vector()); + ASSERT_EQ(Len(a), 0); +}; + +void _TestRange(DGLContext ctx) { + IdArray a = aten::Range(10, 10, 64, ctx); + ASSERT_EQ(Len(a), 0); + a = aten::Range(10, 20, 32, ctx); + ASSERT_EQ(Len(a), 
10); + ASSERT_EQ(a->dtype.bits, 32); + a = a.CopyTo(CPU); + for (int i = 0; i < 10; ++i) ASSERT_EQ(Ptr(a)[i], i + 10); +} + +TEST(ArrayTest, TestRange) { + _TestRange(CPU); +#ifdef DGL_USE_CUDA + _TestRange(GPU); +#endif +}; + +TEST(ArrayTest, TestFull) { + IdArray a = aten::Full(-100, 0, 32, CTX); + ASSERT_EQ(Len(a), 0); + a = aten::Full(-100, 13, 64, CTX); + ASSERT_EQ(Len(a), 13); + ASSERT_EQ(a->dtype.bits, 64); + for (int i = 0; i < 13; ++i) ASSERT_EQ(Ptr(a)[i], -100); +}; + +TEST(ArrayTest, TestClone) { + IdArray a = aten::NewIdArray(0); + IdArray b = aten::Clone(a); + ASSERT_EQ(Len(b), 0); + + a = aten::Range(0, 10, 32, CTX); + b = aten::Clone(a); + for (int i = 0; i < 10; ++i) { + ASSERT_EQ(PI32(b)[i], i); + } + PI32(b)[0] = -1; + for (int i = 0; i < 10; ++i) { + ASSERT_EQ(PI32(a)[i], i); + } +}; + +void _TestNumBits(DGLContext ctx) { + IdArray a = aten::Range(0, 10, 32, ctx); + a = aten::AsNumBits(a, 64); + ASSERT_EQ(a->dtype.bits, 64); + a = a.CopyTo(CPU); + for (int i = 0; i < 10; ++i) ASSERT_EQ(PI64(a)[i], i); +} + +TEST(ArrayTest, TestAsNumBits) { + _TestNumBits(CPU); +#ifdef DGL_USE_CUDA + _TestNumBits(GPU); +#endif +}; + +template +void _TestArith(DGLContext ctx) { + const int N = 100; + IdArray a = aten::Full(-10, N, sizeof(IDX) * 8, ctx); + IdArray b = aten::Full(7, N, sizeof(IDX) * 8, ctx); + + IdArray c = a + b; + c = c.CopyTo(CPU); + for (int i = 0; i < N; ++i) ASSERT_EQ(Ptr(c)[i], -3); + c = a - b; + c = c.CopyTo(CPU); + for (int i = 0; i < N; ++i) ASSERT_EQ(Ptr(c)[i], -17); + c = a * b; + c = c.CopyTo(CPU); + for (int i = 0; i < N; ++i) ASSERT_EQ(Ptr(c)[i], -70); + c = a / b; + c = c.CopyTo(CPU); + for (int i = 0; i < N; ++i) ASSERT_EQ(Ptr(c)[i], -1); + c = -a; + c = c.CopyTo(CPU); + for (int i = 0; i < N; ++i) ASSERT_EQ(Ptr(c)[i], 10); + c = (-a) % b; + c = c.CopyTo(CPU); + for (int i = 0; i < N; ++i) ASSERT_EQ(Ptr(c)[i], 3); + + const int val = -3; + c = aten::Add(a, val); + c = c.CopyTo(CPU); + for (int i = 0; i < N; ++i) ASSERT_EQ(Ptr(c)[i], -13); + c = aten::Sub(a, val); + c = c.CopyTo(CPU); + for (int i = 0; i < N; ++i) ASSERT_EQ(Ptr(c)[i], -7); + c = aten::Mul(a, val); + c = c.CopyTo(CPU); + for (int i = 0; i < N; ++i) ASSERT_EQ(Ptr(c)[i], 30); + c = aten::Div(a, val); + c = c.CopyTo(CPU); + for (int i = 0; i < N; ++i) ASSERT_EQ(Ptr(c)[i], 3); + c = b % 3; + c = c.CopyTo(CPU); + for (int i = 0; i < N; ++i) ASSERT_EQ(Ptr(c)[i], 1); + + c = aten::Add(val, b); + c = c.CopyTo(CPU); + for (int i = 0; i < N; ++i) ASSERT_EQ(Ptr(c)[i], 4); + c = aten::Sub(val, b); + c = c.CopyTo(CPU); + for (int i = 0; i < N; ++i) ASSERT_EQ(Ptr(c)[i], -10); + c = aten::Mul(val, b); + c = c.CopyTo(CPU); + for (int i = 0; i < N; ++i) ASSERT_EQ(Ptr(c)[i], -21); + c = aten::Div(val, b); + c = c.CopyTo(CPU); + for (int i = 0; i < N; ++i) ASSERT_EQ(Ptr(c)[i], 0); + c = 3 % b; + c = c.CopyTo(CPU); + for (int i = 0; i < N; ++i) ASSERT_EQ(Ptr(c)[i], 3); + + a = aten::Range(0, N, sizeof(IDX) * 8, ctx); + c = a < 50; + c = c.CopyTo(CPU); + for (int i = 0; i < N; ++i) ASSERT_EQ(Ptr(c)[i], (int)(i < 50)); + + c = a > 50; + c = c.CopyTo(CPU); + for (int i = 0; i < N; ++i) ASSERT_EQ(Ptr(c)[i], (int)(i > 50)); + + c = a >= 50; + c = c.CopyTo(CPU); + for (int i = 0; i < N; ++i) ASSERT_EQ(Ptr(c)[i], (int)(i >= 50)); + + c = a <= 50; + c = c.CopyTo(CPU); + for (int i = 0; i < N; ++i) ASSERT_EQ(Ptr(c)[i], (int)(i <= 50)); + + c = a == 50; + c = c.CopyTo(CPU); + for (int i = 0; i < N; ++i) ASSERT_EQ(Ptr(c)[i], (int)(i == 50)); + + c = a != 50; + c = c.CopyTo(CPU); + for (int i = 0; i < N; ++i) 
ASSERT_EQ(Ptr(c)[i], (int)(i != 50)); +} + +TEST(ArrayTest, Arith) { + _TestArith(CPU); + _TestArith(CPU); +#ifdef DGL_USE_CUDA + _TestArith(GPU); + _TestArith(GPU); +#endif +}; + +template +void _TestHStack(DGLContext ctx) { + IdArray a = aten::Range(0, 100, sizeof(IDX) * 8, ctx); + IdArray b = aten::Range(100, 200, sizeof(IDX) * 8, ctx); + IdArray c = aten::HStack(a, b).CopyTo(aten::CPU); + ASSERT_EQ(c->ndim, 1); + ASSERT_EQ(c->shape[0], 200); + for (int i = 0; i < 200; ++i) ASSERT_EQ(Ptr(c)[i], i); +} + +TEST(ArrayTest, HStack) { + _TestHStack(CPU); + _TestHStack(CPU); +#ifdef DGL_USE_CUDA + _TestHStack(GPU); + _TestHStack(GPU); +#endif +} + +template +void _TestIndexSelect(DGLContext ctx) { + IdArray a = aten::Range(0, 100, sizeof(IDX) * 8, ctx); + ASSERT_EQ(aten::IndexSelect(a, 50), 50); + ASSERT_TRUE(ArrayEQ( + aten::IndexSelect(a, 10, 20), aten::Range(10, 20, sizeof(IDX) * 8, ctx))); + IdArray b = + aten::VecToIdArray(std::vector({0, 20, 10}), sizeof(IDX) * 8, ctx); + IdArray c = aten::IndexSelect(a, b); + ASSERT_TRUE(ArrayEQ(b, c)); +} + +TEST(ArrayTest, TestIndexSelect) { + _TestIndexSelect(CPU); + _TestIndexSelect(CPU); +#ifdef DGL_USE_CUDA + _TestIndexSelect(GPU); + _TestIndexSelect(GPU); +#endif +} + +template +void _TestRelabel_(DGLContext ctx) { + IdArray a = + aten::VecToIdArray(std::vector({0, 20, 10}), sizeof(IDX) * 8, ctx); + IdArray b = + aten::VecToIdArray(std::vector({20, 5, 6}), sizeof(IDX) * 8, ctx); + IdArray c = aten::Relabel_({a, b}); + + IdArray ta = + aten::VecToIdArray(std::vector({0, 1, 2}), sizeof(IDX) * 8, ctx); + IdArray tb = + aten::VecToIdArray(std::vector({1, 3, 4}), sizeof(IDX) * 8, ctx); + IdArray tc = aten::VecToIdArray( + std::vector({0, 20, 10, 5, 6}), sizeof(IDX) * 8, ctx); + + ASSERT_TRUE(ArrayEQ(a, ta)); + ASSERT_TRUE(ArrayEQ(b, tb)); + ASSERT_TRUE(ArrayEQ(c, tc)); +} + +TEST(ArrayTest, TestRelabel_) { + _TestRelabel_(CPU); + _TestRelabel_(CPU); +#ifdef DGL_USE_CUDA + _TestRelabel_(GPU); + _TestRelabel_(GPU); +#endif +} + +template +void _TestConcat(DGLContext ctx) { + IdArray a = + aten::VecToIdArray(std::vector({1, 2, 3}), sizeof(IDX) * 8, CTX); + IdArray b = + aten::VecToIdArray(std::vector({4, 5, 6}), sizeof(IDX) * 8, CTX); + IdArray tc = aten::VecToIdArray( + std::vector({1, 2, 3, 4, 5, 6}), sizeof(IDX) * 8, CTX); + IdArray c = aten::Concat(std::vector{a, b}); + ASSERT_TRUE(ArrayEQ(c, tc)); + IdArray d = aten::Concat(std::vector{a, b, c}); + IdArray td = aten::VecToIdArray( + std::vector({1, 2, 3, 4, 5, 6, 1, 2, 3, 4, 5, 6}), sizeof(IDX) * 8, + CTX); + ASSERT_TRUE(ArrayEQ(d, td)); +} + +template +void _TestToSimpleCsr(DGLContext ctx) { + /** + * A = [[0, 0, 0, 0], + * [1, 0, 0, 1], + * [1, 1, 1, 1], + * [3, 2, 2, 3], + * [2, 0, 0, 2]] + * + * B = CSRToSimple(A) + * B = [[0, 0, 0, 0], + * [1, 0, 0, 1], + * [1, 1, 1, 1], + * [1, 1, 1, 1], + * [1, 0, 0, 1]] + */ + IdArray a_indptr = aten::VecToIdArray( + std::vector({0, 0, 2, 6, 16, 20}), sizeof(IdType) * 8, CTX); + IdArray a_indices = aten::VecToIdArray( + std::vector( + {0, 3, 0, 1, 2, 3, 0, 0, 0, 1, 1, 2, 2, 3, 3, 3, 0, 0, 3, 3}), + sizeof(IdType) * 8, CTX); + IdArray b_indptr = aten::VecToIdArray( + std::vector({0, 0, 2, 6, 10, 12}), sizeof(IdType) * 8, CTX); + IdArray b_indices = aten::VecToIdArray( + std::vector({0, 3, 0, 1, 2, 3, 0, 1, 2, 3, 0, 3}), + sizeof(IdType) * 8, CTX); + IdArray cnt = aten::VecToIdArray( + std::vector({1, 1, 1, 1, 1, 1, 3, 2, 2, 3, 2, 2}), + sizeof(IdType) * 8, CTX); + IdArray map = aten::VecToIdArray( + std::vector( + {0, 1, 2, 3, 4, 5, 6, 6, 6, 7, 7, 8, 8, 9, 
9, 9, 10, 10, 11, 11}), + sizeof(IdType) * 8, CTX); + const aten::CSRMatrix &csr_a = + aten::CSRMatrix(5, 4, a_indptr, a_indices, aten::NullArray(), true); + auto ret = CSRToSimple(csr_a); + aten::CSRMatrix csr_b = std::get<0>(ret); + IdArray ecnt = std::get<1>(ret); + IdArray emap = std::get<2>(ret); + ASSERT_EQ(csr_b.num_rows, 5); + ASSERT_EQ(csr_b.num_cols, 4); + ASSERT_TRUE(ArrayEQ(csr_b.indptr, b_indptr)); + ASSERT_TRUE(ArrayEQ(csr_b.indices, b_indices)); + ASSERT_TRUE(ArrayEQ(ecnt, cnt)); + ASSERT_TRUE(ArrayEQ(emap, map)); + ASSERT_TRUE(csr_b.sorted); + + // a not sorted + a_indices = aten::VecToIdArray( + std::vector( + {0, 3, 0, 1, 2, 3, 3, 0, 0, 1, 1, 2, 2, 3, 3, 0, 0, 3, 0, 3}), + sizeof(IdType) * 8, CTX); + map = aten::VecToIdArray( + std::vector( + {0, 1, 2, 3, 4, 5, 9, 6, 6, 7, 7, 8, 8, 9, 9, 6, 10, 11, 10, 11}), + sizeof(IdType) * 8, CTX); + const aten::CSRMatrix &csr_a2 = + aten::CSRMatrix(5, 4, a_indptr, a_indices, aten::NullArray(), false); + ret = CSRToSimple(csr_a2); + csr_b = std::get<0>(ret); + ecnt = std::get<1>(ret); + emap = std::get<2>(ret); + ASSERT_EQ(csr_b.num_rows, 5); + ASSERT_EQ(csr_b.num_cols, 4); + ASSERT_TRUE(ArrayEQ(csr_b.indptr, b_indptr)); + ASSERT_TRUE(ArrayEQ(csr_b.indices, b_indices)); + ASSERT_TRUE(ArrayEQ(ecnt, cnt)); + ASSERT_TRUE(ArrayEQ(emap, map)); + ASSERT_TRUE(csr_b.sorted); +} + +TEST(MatrixTest, TestToSimpleCsr) { + _TestToSimpleCsr(CPU); + _TestToSimpleCsr(CPU); +} + +template +void _TestToSimpleCoo(DGLContext ctx) { + /** + * A = [[0, 0, 0, 0], + * [1, 0, 0, 1], + * [1, 1, 1, 1], + * [3, 2, 2, 3], + * [2, 0, 0, 2]] + * + * B = CSRToSimple(A) + * B = [[0, 0, 0, 0], + * [1, 0, 0, 1], + * [1, 1, 1, 1], + * [1, 1, 1, 1], + * [1, 0, 0, 1]] + */ + IdArray a_row = aten::VecToIdArray( + std::vector( + {1, 1, 2, 2, 2, 2, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 4, 4, 4, 4}), + sizeof(IdType) * 8, CTX); + IdArray a_col = aten::VecToIdArray( + std::vector( + {0, 3, 0, 1, 2, 3, 0, 0, 0, 1, 1, 2, 2, 3, 3, 3, 0, 0, 3, 3}), + sizeof(IdType) * 8, CTX); + IdArray b_row = aten::VecToIdArray( + std::vector({1, 1, 2, 2, 2, 2, 3, 3, 3, 3, 4, 4}), + sizeof(IdType) * 8, CTX); + IdArray b_col = aten::VecToIdArray( + std::vector({0, 3, 0, 1, 2, 3, 0, 1, 2, 3, 0, 3}), + sizeof(IdType) * 8, CTX); + IdArray cnt = aten::VecToIdArray( + std::vector({1, 1, 1, 1, 1, 1, 3, 2, 2, 3, 2, 2}), + sizeof(IdType) * 8, CTX); + IdArray map = aten::VecToIdArray( + std::vector( + {0, 1, 2, 3, 4, 5, 6, 6, 6, 7, 7, 8, 8, 9, 9, 9, 10, 10, 11, 11}), + sizeof(IdType) * 8, CTX); + const aten::COOMatrix &coo_a = + aten::COOMatrix(5, 4, a_row, a_col, aten::NullArray(), true, true); + auto ret = COOToSimple(coo_a); + aten::COOMatrix coo_b = std::get<0>(ret); + IdArray ecnt = std::get<1>(ret); + IdArray emap = std::get<2>(ret); + ASSERT_EQ(coo_b.num_rows, 5); + ASSERT_EQ(coo_b.num_cols, 4); + ASSERT_TRUE(ArrayEQ(coo_b.row, b_row)); + ASSERT_TRUE(ArrayEQ(coo_b.col, b_col)); + ASSERT_TRUE(ArrayEQ(ecnt, cnt)); + ASSERT_TRUE(ArrayEQ(emap, map)); + ASSERT_FALSE(COOHasData(coo_b)); + ASSERT_TRUE(coo_b.row_sorted); + ASSERT_TRUE(coo_b.col_sorted); + + // a not sorted + a_row = aten::VecToIdArray( + std::vector( + {1, 2, 1, 2, 2, 2, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 4, 4, 4, 4}), + sizeof(IdType) * 8, CTX); + a_col = aten::VecToIdArray( + std::vector( + {0, 0, 3, 1, 2, 3, 0, 0, 0, 1, 1, 2, 2, 3, 3, 3, 0, 3, 0, 3}), + sizeof(IdType) * 8, CTX); + map = aten::VecToIdArray( + std::vector( + {0, 2, 1, 3, 4, 5, 6, 6, 6, 7, 7, 8, 8, 9, 9, 9, 10, 11, 10, 11}), + sizeof(IdType) * 8, CTX); + const aten::COOMatrix &coo_a2 = 
+ aten::COOMatrix(5, 4, a_row, a_col, aten::NullArray(), false, false); + ret = COOToSimple(coo_a2); + coo_b = std::get<0>(ret); + ecnt = std::get<1>(ret); + emap = std::get<2>(ret); + ASSERT_EQ(coo_b.num_rows, 5); + ASSERT_EQ(coo_b.num_cols, 4); + ASSERT_TRUE(ArrayEQ(coo_b.row, b_row)); + ASSERT_TRUE(ArrayEQ(coo_b.col, b_col)); + ASSERT_TRUE(ArrayEQ(ecnt, cnt)); + ASSERT_TRUE(ArrayEQ(emap, map)); + ASSERT_FALSE(COOHasData(coo_b)); + ASSERT_TRUE(coo_b.row_sorted); + ASSERT_TRUE(coo_b.col_sorted); +} + +TEST(MatrixTest, TestToSimpleCoo) { + _TestToSimpleCoo(CPU); + _TestToSimpleCoo(CPU); +} + +template +void _TestDisjointUnionPartitionCoo(DGLContext ctx) { + /** + * A = [[0, 0, 1], + * [1, 0, 1], + * [0, 1, 0]] + * + * B = [[1, 1, 0], + * [0, 1, 0]] + * + * C = [[1]] + * + * AB = [[0, 0, 1, 0, 0, 0], + * [1, 0, 1, 0, 0, 0], + * [0, 1, 0, 0, 0, 0], + * [0, 0, 0, 1, 1, 0], + * [0, 0, 0, 0, 1, 0]] + * + * ABC = [[0, 0, 1, 0, 0, 0, 0], + * [1, 0, 1, 0, 0, 0, 0], + * [0, 1, 0, 0, 0, 0, 0], + * [0, 0, 0, 1, 1, 0, 0], + * [0, 0, 0, 0, 1, 0, 0], + * [0, 0, 0, 0, 0, 0, 1]] + */ + IdArray a_row = aten::VecToIdArray( + std::vector({0, 1, 1, 2}), sizeof(IdType) * 8, CTX); + IdArray a_col = aten::VecToIdArray( + std::vector({2, 0, 2, 1}), sizeof(IdType) * 8, CTX); + IdArray b_row = aten::VecToIdArray( + std::vector({0, 0, 1}), sizeof(IdType) * 8, CTX); + IdArray b_col = aten::VecToIdArray( + std::vector({0, 1, 1}), sizeof(IdType) * 8, CTX); + IdArray b_data = aten::VecToIdArray( + std::vector({2, 0, 1}), sizeof(IdType) * 8, CTX); + IdArray c_row = + aten::VecToIdArray(std::vector({0}), sizeof(IdType) * 8, CTX); + IdArray c_col = + aten::VecToIdArray(std::vector({0}), sizeof(IdType) * 8, CTX); + IdArray ab_row = aten::VecToIdArray( + std::vector({0, 1, 1, 2, 3, 3, 4}), sizeof(IdType) * 8, CTX); + IdArray ab_col = aten::VecToIdArray( + std::vector({2, 0, 2, 1, 3, 4, 4}), sizeof(IdType) * 8, CTX); + IdArray ab_data = aten::VecToIdArray( + std::vector({0, 1, 2, 3, 6, 4, 5}), sizeof(IdType) * 8, CTX); + IdArray abc_row = aten::VecToIdArray( + std::vector({0, 1, 1, 2, 3, 3, 4, 5}), sizeof(IdType) * 8, CTX); + IdArray abc_col = aten::VecToIdArray( + std::vector({2, 0, 2, 1, 3, 4, 4, 6}), sizeof(IdType) * 8, CTX); + IdArray abc_data = aten::VecToIdArray( + std::vector({0, 1, 2, 3, 6, 4, 5, 7}), sizeof(IdType) * 8, CTX); + const aten::COOMatrix &coo_a = + aten::COOMatrix(3, 3, a_row, a_col, aten::NullArray(), true, false); + const aten::COOMatrix &coo_b = + aten::COOMatrix(2, 3, b_row, b_col, b_data, true, true); + const aten::COOMatrix &coo_c = + aten::COOMatrix(1, 1, c_row, c_col, aten::NullArray(), true, true); + + const std::vector coos_ab({coo_a, coo_b}); + const aten::COOMatrix &coo_ab = aten::DisjointUnionCoo(coos_ab); + ASSERT_EQ(coo_ab.num_rows, 5); + ASSERT_EQ(coo_ab.num_cols, 6); + ASSERT_TRUE(ArrayEQ(coo_ab.row, ab_row)); + ASSERT_TRUE(ArrayEQ(coo_ab.col, ab_col)); + ASSERT_TRUE(ArrayEQ(coo_ab.data, ab_data)); + ASSERT_TRUE(coo_ab.row_sorted); + ASSERT_FALSE(coo_ab.col_sorted); + + const std::vector edge_cumsum({0, 4, 7}); + const std::vector src_vertex_cumsum({0, 3, 5}); + const std::vector dst_vertex_cumsum({0, 3, 6}); + const std::vector &p_coos = + aten::DisjointPartitionCooBySizes( + coo_ab, 2, edge_cumsum, src_vertex_cumsum, dst_vertex_cumsum); + ASSERT_EQ(p_coos[0].num_rows, coo_a.num_rows); + ASSERT_EQ(p_coos[0].num_cols, coo_a.num_cols); + ASSERT_EQ(p_coos[1].num_rows, coo_b.num_rows); + ASSERT_EQ(p_coos[1].num_cols, coo_b.num_cols); + ASSERT_TRUE(ArrayEQ(p_coos[0].row, coo_a.row)); + 
ASSERT_TRUE(ArrayEQ(p_coos[0].col, coo_a.col)); + ASSERT_TRUE(ArrayEQ(p_coos[1].row, coo_b.row)); + ASSERT_TRUE(ArrayEQ(p_coos[1].col, coo_b.col)); + ASSERT_TRUE(ArrayEQ(p_coos[1].data, coo_b.data)); + ASSERT_TRUE(p_coos[0].row_sorted); + ASSERT_FALSE(p_coos[0].col_sorted); + ASSERT_TRUE(p_coos[1].row_sorted); + ASSERT_FALSE(p_coos[1].col_sorted); + + const std::vector coos_abc({coo_a, coo_b, coo_c}); + const aten::COOMatrix &coo_abc = aten::DisjointUnionCoo(coos_abc); + ASSERT_EQ(coo_abc.num_rows, 6); + ASSERT_EQ(coo_abc.num_cols, 7); + ASSERT_TRUE(ArrayEQ(coo_abc.row, abc_row)); + ASSERT_TRUE(ArrayEQ(coo_abc.col, abc_col)); + ASSERT_TRUE(ArrayEQ(coo_abc.data, abc_data)); + ASSERT_TRUE(coo_abc.row_sorted); + ASSERT_FALSE(coo_abc.col_sorted); + + const std::vector edge_cumsum_abc({0, 4, 7, 8}); + const std::vector src_vertex_cumsum_abc({0, 3, 5, 6}); + const std::vector dst_vertex_cumsum_abc({0, 3, 6, 7}); + const std::vector &p_coos_abc = + aten::DisjointPartitionCooBySizes( + coo_abc, 3, edge_cumsum_abc, src_vertex_cumsum_abc, + dst_vertex_cumsum_abc); + ASSERT_EQ(p_coos_abc[0].num_rows, coo_a.num_rows); + ASSERT_EQ(p_coos_abc[0].num_cols, coo_a.num_cols); + ASSERT_EQ(p_coos_abc[1].num_rows, coo_b.num_rows); + ASSERT_EQ(p_coos_abc[1].num_cols, coo_b.num_cols); + ASSERT_EQ(p_coos_abc[2].num_rows, coo_c.num_rows); + ASSERT_EQ(p_coos_abc[2].num_cols, coo_c.num_cols); + ASSERT_TRUE(ArrayEQ(p_coos_abc[0].row, coo_a.row)); + ASSERT_TRUE(ArrayEQ(p_coos_abc[0].col, coo_a.col)); + ASSERT_TRUE(ArrayEQ(p_coos_abc[1].row, coo_b.row)); + ASSERT_TRUE(ArrayEQ(p_coos_abc[1].col, coo_b.col)); + ASSERT_TRUE(ArrayEQ(p_coos_abc[1].data, coo_b.data)); + ASSERT_TRUE(ArrayEQ(p_coos_abc[2].row, coo_c.row)); + ASSERT_TRUE(ArrayEQ(p_coos_abc[2].col, coo_c.col)); + ASSERT_TRUE(p_coos_abc[0].row_sorted); + ASSERT_FALSE(p_coos_abc[0].col_sorted); + ASSERT_TRUE(p_coos_abc[1].row_sorted); + ASSERT_FALSE(p_coos_abc[1].col_sorted); + ASSERT_TRUE(p_coos_abc[2].row_sorted); + ASSERT_FALSE(p_coos_abc[2].col_sorted); +} + +TEST(DisjointUnionTest, TestDisjointUnionPartitionCoo) { + _TestDisjointUnionPartitionCoo(CPU); + _TestDisjointUnionPartitionCoo(CPU); +#ifdef DGL_USE_CUDA + _TestDisjointUnionPartitionCoo(GPU); + _TestDisjointUnionPartitionCoo(GPU); +#endif +} + +template +void _TestDisjointUnionPartitionCsr(DGLContext ctx) { + /** + * A = [[0, 0, 1], + * [1, 0, 1], + * [0, 1, 0]] + * + * B = [[1, 1, 0], + * [0, 1, 0]] + * + * C = [[1]] + * + * BC = [[1, 1, 0, 0], + * [0, 1, 0, 0], + * [0, 0, 0, 1]], + * + * ABC = [[0, 0, 1, 0, 0, 0, 0], + * [1, 0, 1, 0, 0, 0, 0], + * [0, 1, 0, 0, 0, 0, 0], + * [0, 0, 0, 1, 1, 0, 0], + * [0, 0, 0, 0, 1, 0, 0], + * [0, 0, 0, 0, 0, 0, 1]] + */ + IdArray a_indptr = aten::VecToIdArray( + std::vector({0, 1, 3, 4}), sizeof(IdType) * 8, CTX); + IdArray a_indices = aten::VecToIdArray( + std::vector({2, 0, 2, 1}), sizeof(IdType) * 8, CTX); + IdArray b_indptr = aten::VecToIdArray( + std::vector({0, 2, 3}), sizeof(IdType) * 8, CTX); + IdArray b_indices = aten::VecToIdArray( + std::vector({0, 1, 1}), sizeof(IdType) * 8, CTX); + IdArray b_data = aten::VecToIdArray( + std::vector({2, 0, 1}), sizeof(IdType) * 8, CTX); + IdArray c_indptr = + aten::VecToIdArray(std::vector({0, 1}), sizeof(IdType) * 8, CTX); + IdArray c_indices = + aten::VecToIdArray(std::vector({0}), sizeof(IdType) * 8, CTX); + IdArray bc_indptr = aten::VecToIdArray( + std::vector({0, 2, 3, 4}), sizeof(IdType) * 8, CTX); + IdArray bc_indices = aten::VecToIdArray( + std::vector({0, 1, 1, 3}), sizeof(IdType) * 8, CTX); + IdArray bc_data = 
aten::VecToIdArray( + std::vector({2, 0, 1, 3}), sizeof(IdType) * 8, CTX); + IdArray abc_indptr = aten::VecToIdArray( + std::vector({0, 1, 3, 4, 6, 7, 8}), sizeof(IdType) * 8, CTX); + IdArray abc_indices = aten::VecToIdArray( + std::vector({2, 0, 2, 1, 3, 4, 4, 6}), sizeof(IdType) * 8, CTX); + IdArray abc_data = aten::VecToIdArray( + std::vector({0, 1, 2, 3, 6, 4, 5, 7}), sizeof(IdType) * 8, CTX); + const aten::CSRMatrix &csr_a = + aten::CSRMatrix(3, 3, a_indptr, a_indices, aten::NullArray(), false); + const aten::CSRMatrix &csr_b = + aten::CSRMatrix(2, 3, b_indptr, b_indices, b_data, true); + const aten::CSRMatrix &csr_c = + aten::CSRMatrix(1, 1, c_indptr, c_indices, aten::NullArray(), true); + + const std::vector csrs_bc({csr_b, csr_c}); + const aten::CSRMatrix &csr_bc = aten::DisjointUnionCsr(csrs_bc); + ASSERT_EQ(csr_bc.num_rows, 3); + ASSERT_EQ(csr_bc.num_cols, 4); + ASSERT_TRUE(ArrayEQ(csr_bc.indptr, bc_indptr)); + ASSERT_TRUE(ArrayEQ(csr_bc.indices, bc_indices)); + ASSERT_TRUE(ArrayEQ(csr_bc.data, bc_data)); + ASSERT_TRUE(csr_bc.sorted); + + const std::vector edge_cumsum({0, 3, 4}); + const std::vector src_vertex_cumsum({0, 2, 3}); + const std::vector dst_vertex_cumsum({0, 3, 4}); + const std::vector &p_csrs = + aten::DisjointPartitionCsrBySizes( + csr_bc, 2, edge_cumsum, src_vertex_cumsum, dst_vertex_cumsum); + ASSERT_EQ(p_csrs[0].num_rows, csr_b.num_rows); + ASSERT_EQ(p_csrs[0].num_cols, csr_b.num_cols); + ASSERT_EQ(p_csrs[1].num_rows, csr_c.num_rows); + ASSERT_EQ(p_csrs[1].num_cols, csr_c.num_cols); + ASSERT_TRUE(ArrayEQ(p_csrs[0].indptr, csr_b.indptr)); + ASSERT_TRUE(ArrayEQ(p_csrs[0].indices, csr_b.indices)); + ASSERT_TRUE(ArrayEQ(p_csrs[0].data, csr_b.data)); + ASSERT_TRUE(ArrayEQ(p_csrs[1].indptr, csr_c.indptr)); + ASSERT_TRUE(ArrayEQ(p_csrs[1].indices, csr_c.indices)); + ASSERT_TRUE(p_csrs[0].sorted); + ASSERT_TRUE(p_csrs[1].sorted); + + const std::vector csrs_abc({csr_a, csr_b, csr_c}); + const aten::CSRMatrix &csr_abc = aten::DisjointUnionCsr(csrs_abc); + ASSERT_EQ(csr_abc.num_rows, 6); + ASSERT_EQ(csr_abc.num_cols, 7); + ASSERT_TRUE(ArrayEQ(csr_abc.indptr, abc_indptr)); + ASSERT_TRUE(ArrayEQ(csr_abc.indices, abc_indices)); + ASSERT_TRUE(ArrayEQ(csr_abc.data, abc_data)); + ASSERT_FALSE(csr_abc.sorted); + + const std::vector edge_cumsum_abc({0, 4, 7, 8}); + const std::vector src_vertex_cumsum_abc({0, 3, 5, 6}); + const std::vector dst_vertex_cumsum_abc({0, 3, 6, 7}); + const std::vector &p_csrs_abc = + aten::DisjointPartitionCsrBySizes( + csr_abc, 3, edge_cumsum_abc, src_vertex_cumsum_abc, + dst_vertex_cumsum_abc); + ASSERT_EQ(p_csrs_abc[0].num_rows, csr_a.num_rows); + ASSERT_EQ(p_csrs_abc[0].num_cols, csr_a.num_cols); + ASSERT_EQ(p_csrs_abc[1].num_rows, csr_b.num_rows); + ASSERT_EQ(p_csrs_abc[1].num_cols, csr_b.num_cols); + ASSERT_EQ(p_csrs_abc[2].num_rows, csr_c.num_rows); + ASSERT_EQ(p_csrs_abc[2].num_cols, csr_c.num_cols); + ASSERT_TRUE(ArrayEQ(p_csrs_abc[0].indptr, csr_a.indptr)); + ASSERT_TRUE(ArrayEQ(p_csrs_abc[0].indices, csr_a.indices)); + ASSERT_TRUE(ArrayEQ(p_csrs_abc[1].indptr, csr_b.indptr)); + ASSERT_TRUE(ArrayEQ(p_csrs_abc[1].indices, csr_b.indices)); + ASSERT_TRUE(ArrayEQ(p_csrs_abc[1].data, csr_b.data)); + ASSERT_TRUE(ArrayEQ(p_csrs_abc[2].indptr, csr_c.indptr)); + ASSERT_TRUE(ArrayEQ(p_csrs_abc[2].indices, csr_c.indices)); + ASSERT_FALSE(p_csrs_abc[0].sorted); + ASSERT_FALSE(p_csrs_abc[1].sorted); + ASSERT_FALSE(p_csrs_abc[2].sorted); +} + +TEST(DisjointUnionTest, TestDisjointUnionPartitionCsr) { + _TestDisjointUnionPartitionCsr(CPU); + 
_TestDisjointUnionPartitionCsr(CPU); +#ifdef DGL_USE_CUDA + _TestDisjointUnionPartitionCsr(GPU); + _TestDisjointUnionPartitionCsr(GPU); +#endif +} + +template +void _TestSliceContiguousChunkCoo(DGLContext ctx) { + /** + * A = [[1, 0, 0, 0], + * [0, 0, 1, 0], + * [0, 0, 0, 0]] + * + * B = [[1, 0, 0], + * [0, 0, 1]] + * + * C = [[0]] + * + */ + IdArray a_row = + aten::VecToIdArray(std::vector({0, 1}), sizeof(IdType) * 8, CTX); + IdArray a_col = + aten::VecToIdArray(std::vector({0, 2}), sizeof(IdType) * 8, CTX); + const aten::COOMatrix &coo_a = + aten::COOMatrix(3, 4, a_row, a_col, aten::NullArray(), true, false); + + IdArray b_row = + aten::VecToIdArray(std::vector({0, 1}), sizeof(IdType) * 8, CTX); + IdArray b_col = + aten::VecToIdArray(std::vector({0, 2}), sizeof(IdType) * 8, CTX); + const aten::COOMatrix &coo_b_raw = + aten::COOMatrix(2, 3, b_row, b_col, aten::NullArray(), true, false); + + const std::vector edge_range_b({0, 2}); + const std::vector src_vertex_range_b({0, 2}); + const std::vector dst_vertex_range_b({0, 3}); + const aten::COOMatrix &coo_b = aten::COOSliceContiguousChunk( + coo_a, edge_range_b, src_vertex_range_b, dst_vertex_range_b); + ASSERT_EQ(coo_b_raw.num_rows, coo_b.num_rows); + ASSERT_EQ(coo_b_raw.num_cols, coo_b.num_cols); + ASSERT_TRUE(ArrayEQ(coo_b_raw.row, coo_b.row)); + ASSERT_TRUE(ArrayEQ(coo_b_raw.col, coo_b.col)); + ASSERT_TRUE(coo_b.row_sorted); + ASSERT_FALSE(coo_b.col_sorted); + + IdArray c_row = + aten::VecToIdArray(std::vector({}), sizeof(IdType) * 8, CTX); + IdArray c_col = + aten::VecToIdArray(std::vector({}), sizeof(IdType) * 8, CTX); + const aten::COOMatrix &coo_c_raw = + aten::COOMatrix(1, 1, c_row, c_col, aten::NullArray(), true, false); + + const std::vector edge_range_c({2, 2}); + const std::vector src_vertex_range_c({2, 3}); + const std::vector dst_vertex_range_c({3, 4}); + const aten::COOMatrix &coo_c = aten::COOSliceContiguousChunk( + coo_a, edge_range_c, src_vertex_range_c, dst_vertex_range_c); + ASSERT_EQ(coo_c_raw.num_rows, coo_c.num_rows); + ASSERT_EQ(coo_c_raw.num_cols, coo_c.num_cols); + ASSERT_TRUE(ArrayEQ(coo_c.row, c_row)); + ASSERT_TRUE(ArrayEQ(coo_c.col, c_col)); + ASSERT_TRUE(coo_c.row_sorted); + ASSERT_FALSE(coo_c.col_sorted); +} + +TEST(SliceContiguousChunk, TestSliceContiguousChunkCoo) { + _TestSliceContiguousChunkCoo(CPU); + _TestSliceContiguousChunkCoo(CPU); +#ifdef DGL_USE_CUDA + _TestSliceContiguousChunkCoo(GPU); + _TestSliceContiguousChunkCoo(GPU); +#endif +} + +template +void _TestSliceContiguousChunkCsr(DGLContext ctx) { + /** + * A = [[1, 0, 0, 0], + * [0, 0, 1, 0], + * [0, 0, 0, 0]] + * + * B = [[1, 0, 0], + * [0, 0, 1]] + * + * C = [[0]] + * + */ + IdArray a_indptr = aten::VecToIdArray( + std::vector({0, 1, 2, 2}), sizeof(IdType) * 8, CTX); + IdArray a_indices = + aten::VecToIdArray(std::vector({0, 2}), sizeof(IdType) * 8, CTX); + const aten::CSRMatrix &csr_a = + aten::CSRMatrix(3, 4, a_indptr, a_indices, aten::NullArray(), false); + + IdArray b_indptr = aten::VecToIdArray( + std::vector({0, 1, 2}), sizeof(IdType) * 8, CTX); + IdArray b_indices = + aten::VecToIdArray(std::vector({0, 2}), sizeof(IdType) * 8, CTX); + const aten::CSRMatrix &csr_b_raw = + aten::CSRMatrix(2, 3, b_indptr, b_indices, aten::NullArray(), false); + + const std::vector edge_range_b({0, 2}); + const std::vector src_vertex_range_b({0, 2}); + const std::vector dst_vertex_range_b({0, 3}); + const aten::CSRMatrix &csr_b = aten::CSRSliceContiguousChunk( + csr_a, edge_range_b, src_vertex_range_b, dst_vertex_range_b); + ASSERT_EQ(csr_b.num_rows, 
csr_b_raw.num_rows); + ASSERT_EQ(csr_b.num_cols, csr_b_raw.num_cols); + ASSERT_TRUE(ArrayEQ(csr_b.indptr, csr_b_raw.indptr)); + ASSERT_TRUE(ArrayEQ(csr_b.indices, csr_b_raw.indices)); + ASSERT_FALSE(csr_b.sorted); + + const std::vector edge_range_c({2, 2}); + const std::vector src_vertex_range_c({2, 3}); + const std::vector dst_vertex_range_c({3, 4}); + const aten::CSRMatrix &csr_c = aten::CSRSliceContiguousChunk( + csr_a, edge_range_c, src_vertex_range_c, dst_vertex_range_c); + + int64_t indptr_len = src_vertex_range_c[1] - src_vertex_range_c[0] + 1; + IdArray c_indptr = aten::Full(0, indptr_len, sizeof(IdType) * 8, CTX); + IdArray c_indices = + aten::VecToIdArray(std::vector({}), sizeof(IdType) * 8, CTX); + const aten::CSRMatrix &csr_c_raw = + aten::CSRMatrix(1, 1, c_indptr, c_indices, aten::NullArray(), false); + + ASSERT_EQ(csr_c.num_rows, csr_c_raw.num_rows); + ASSERT_EQ(csr_c.num_cols, csr_c_raw.num_cols); + ASSERT_TRUE(ArrayEQ(csr_c.indptr, c_indptr)); + ASSERT_TRUE(ArrayEQ(csr_c.indices, c_indices)); + ASSERT_FALSE(csr_c.sorted); +} + +TEST(SliceContiguousChunk, TestSliceContiguousChunkCsr) { + _TestSliceContiguousChunkCsr(CPU); + _TestSliceContiguousChunkCsr(CPU); +#ifdef DGL_USE_CUDA + _TestSliceContiguousChunkCsr(GPU); + _TestSliceContiguousChunkCsr(GPU); +#endif +} + +template +void _TestMatrixUnionCsr(DGLContext ctx) { + /** + * A = [[0, 0, 0, 0], + * [0, 0, 0, 0], + * [0, 1, 0, 0], + * [1, 1, 1, 1], + * [0, 1, 1, 0], + * [1, 0, 0, 1]] + * + * B = [[0, 0, 0, 0], + * [1, 0, 0, 1], + * [0, 0, 1, 0], + * [1, 0, 0, 1], + * [1, 0, 0, 1]] + * [1, 0, 0, 1]] + * + * C = UnionCsr({A, B}) + * + * C = [[0, 0, 0, 0], + * [1, 0, 0, 1], + * [0, 1, 1, 0], + * [2, 1, 1, 2], + * [1, 1, 1, 1]] + * [2, 0, 0, 2]] + * + * D = [[1, 0, 0, 0], + * [0, 0, 0, 0], + * [0, 0, 0, 0], + * [0, 0, 0, 0], + * [0, 0, 0, 0], + * [1, 0, 0, 1]] + * + * C = UnionCsr({A, B, D}) + * + * C = [[1, 0, 0, 0], + * [1, 0, 0, 1], + * [0, 1, 1, 0], + * [2, 1, 1, 2], + * [1, 1, 1, 1]] + * [3, 0, 0, 3]] + */ + IdArray a_indptr = aten::VecToIdArray( + std::vector({0, 0, 0, 1, 5, 7, 9}), sizeof(IdType) * 8, CTX); + IdArray a_indices = aten::VecToIdArray( + std::vector({1, 0, 1, 2, 3, 1, 2, 0, 3}), sizeof(IdType) * 8, + CTX); + IdArray b_indptr = aten::VecToIdArray( + std::vector({0, 0, 2, 3, 5, 7, 9}), sizeof(IdType) * 8, CTX); + IdArray b_indices = aten::VecToIdArray( + std::vector({0, 3, 2, 0, 3, 0, 3, 0, 3}), sizeof(IdType) * 8, + CTX); + IdArray c_indptr = aten::VecToIdArray( + std::vector({0, 0, 2, 4, 10, 14, 18}), sizeof(IdType) * 8, CTX); + IdArray c_indices = aten::VecToIdArray( + std::vector( + {0, 3, 1, 2, 0, 0, 1, 2, 3, 3, 0, 1, 2, 3, 0, 0, 3, 3}), + sizeof(IdType) * 8, CTX); + IdArray c_data = aten::VecToIdArray( + std::vector( + {9, 10, 0, 11, 1, 12, 2, 3, 4, 13, 14, 5, 6, 15, 7, 16, 8, 17}), + sizeof(IdType) * 8, CTX); + + const aten::CSRMatrix &csr_a = + aten::CSRMatrix(6, 4, a_indptr, a_indices, aten::NullArray(), true); + const aten::CSRMatrix &csr_b = + aten::CSRMatrix(6, 4, b_indptr, b_indices, aten::NullArray(), true); + + const aten::CSRMatrix &csr_aUb = aten::UnionCsr({csr_a, csr_b}); + ASSERT_EQ(csr_aUb.num_rows, 6); + ASSERT_EQ(csr_aUb.num_cols, 4); + ASSERT_TRUE(ArrayEQ(csr_aUb.indptr, c_indptr)); + ASSERT_TRUE(ArrayEQ(csr_aUb.indices, c_indices)); + ASSERT_TRUE(ArrayEQ(csr_aUb.data, c_data)); + ASSERT_TRUE(csr_aUb.sorted); + + IdArray a_data = aten::VecToIdArray( + std::vector({8, 7, 6, 5, 4, 3, 2, 1, 0}), sizeof(IdType) * 8, + CTX); + + c_data = aten::VecToIdArray( + std::vector( + {9, 10, 8, 11, 7, 
12, 6, 5, 4, 13, 14, 3, 2, 15, 1, 16, 0, 17}), + sizeof(IdType) * 8, CTX); + const aten::CSRMatrix &csr_ad = + aten::CSRMatrix(6, 4, a_indptr, a_indices, a_data, true); + const aten::CSRMatrix &csr_adUb = aten::UnionCsr({csr_ad, csr_b}); + ASSERT_EQ(csr_adUb.num_rows, 6); + ASSERT_EQ(csr_adUb.num_cols, 4); + ASSERT_TRUE(ArrayEQ(csr_adUb.indptr, c_indptr)); + ASSERT_TRUE(ArrayEQ(csr_adUb.indices, c_indices)); + ASSERT_TRUE(ArrayEQ(csr_adUb.data, c_data)); + ASSERT_TRUE(csr_adUb.sorted); + + IdArray b_indices2 = aten::VecToIdArray( + std::vector({0, 3, 2, 0, 3, 3, 0, 0, 3}), sizeof(IdType) * 8, + CTX); + c_indices = aten::VecToIdArray( + std::vector( + {0, 3, 1, 2, 0, 1, 2, 3, 0, 3, 1, 2, 3, 0, 0, 3, 0, 3}), + sizeof(IdType) * 8, CTX); + c_data = aten::VecToIdArray( + std::vector( + {9, 10, 0, 11, 1, 2, 3, 4, 12, 13, 5, 6, 14, 15, 7, 8, 16, 17}), + sizeof(IdType) * 8, CTX); + const aten::CSRMatrix &csr_b2 = + aten::CSRMatrix(6, 4, b_indptr, b_indices2, aten::NullArray(), false); + const aten::CSRMatrix &csr_aUb2 = aten::UnionCsr({csr_a, csr_b2}); + ASSERT_EQ(csr_aUb2.num_rows, 6); + ASSERT_EQ(csr_aUb2.num_cols, 4); + ASSERT_TRUE(ArrayEQ(csr_aUb2.indptr, c_indptr)); + ASSERT_TRUE(ArrayEQ(csr_aUb2.indices, c_indices)); + ASSERT_TRUE(ArrayEQ(csr_aUb2.data, c_data)); + ASSERT_FALSE(csr_aUb2.sorted); + + IdArray a_indices2 = aten::VecToIdArray( + std::vector({1, 3, 2, 1, 0, 1, 2, 0, 3}), sizeof(IdType) * 8, + CTX); + c_indices = aten::VecToIdArray( + std::vector( + {0, 3, 1, 2, 3, 2, 1, 0, 0, 3, 1, 2, 0, 3, 0, 3, 0, 3}), + sizeof(IdType) * 8, CTX); + const aten::CSRMatrix &csr_a2 = + aten::CSRMatrix(6, 4, a_indptr, a_indices2, aten::NullArray(), false); + const aten::CSRMatrix &csr_aUb3 = aten::UnionCsr({csr_a2, csr_b}); + ASSERT_EQ(csr_aUb3.num_rows, 6); + ASSERT_EQ(csr_aUb3.num_cols, 4); + ASSERT_TRUE(ArrayEQ(csr_aUb3.indptr, c_indptr)); + ASSERT_TRUE(ArrayEQ(csr_aUb3.indices, c_indices)); + ASSERT_TRUE(ArrayEQ(csr_aUb3.data, c_data)); + ASSERT_FALSE(csr_aUb3.sorted); + + c_indices = aten::VecToIdArray( + std::vector( + {0, 3, 1, 2, 3, 2, 1, 0, 0, 3, 1, 2, 3, 0, 0, 3, 0, 3}), + sizeof(IdType) * 8, CTX); + const aten::CSRMatrix &csr_aUb4 = aten::UnionCsr({csr_a2, csr_b2}); + ASSERT_EQ(csr_aUb4.num_rows, 6); + ASSERT_EQ(csr_aUb4.num_cols, 4); + ASSERT_TRUE(ArrayEQ(csr_aUb4.indptr, c_indptr)); + ASSERT_TRUE(ArrayEQ(csr_aUb4.indices, c_indices)); + ASSERT_TRUE(ArrayEQ(csr_aUb4.data, c_data)); + ASSERT_FALSE(csr_aUb4.sorted); + + IdArray d_indptr = aten::VecToIdArray( + std::vector({0, 1, 1, 1, 1, 1, 3}), sizeof(IdType) * 8, CTX); + IdArray d_indices = aten::VecToIdArray( + std::vector({0, 0, 3}), sizeof(IdType) * 8, CTX); + c_indptr = aten::VecToIdArray( + std::vector({0, 1, 3, 5, 11, 15, 21}), sizeof(IdType) * 8, CTX); + c_indices = aten::VecToIdArray( + std::vector( + {0, 0, 3, 1, 2, 0, 0, 1, 2, 3, 3, 0, 1, 2, 3, 0, 0, 0, 3, 3, 3}), + sizeof(IdType) * 8, CTX); + c_data = aten::VecToIdArray( + std::vector({18, 9, 10, 8, 11, 7, 12, 6, 5, 4, 13, + 14, 3, 2, 15, 1, 16, 19, 0, 17, 20}), + sizeof(IdType) * 8, CTX); + const aten::CSRMatrix &csr_d = + aten::CSRMatrix(6, 4, d_indptr, d_indices, aten::NullArray(), true); + const aten::CSRMatrix &csr_aUbUd = aten::UnionCsr({csr_ad, csr_b, csr_d}); + ASSERT_EQ(csr_aUbUd.num_rows, 6); + ASSERT_EQ(csr_aUbUd.num_cols, 4); + ASSERT_TRUE(ArrayEQ(csr_aUbUd.indptr, c_indptr)); + ASSERT_TRUE(ArrayEQ(csr_aUbUd.indices, c_indices)); + ASSERT_TRUE(ArrayEQ(csr_aUbUd.data, c_data)); + ASSERT_TRUE(csr_aUbUd.sorted); + + c_indices = aten::VecToIdArray( + std::vector( + {0, 
0, 3, 1, 2, 3, 2, 1, 0, 0, 3, 1, 2, 3, 0, 0, 3, 0, 3, 0, 3}), + sizeof(IdType) * 8, CTX); + c_data = aten::VecToIdArray( + std::vector({18, 9, 10, 0, 11, 1, 2, 3, 4, 12, 13, + 5, 6, 14, 15, 7, 8, 16, 17, 19, 20}), + sizeof(IdType) * 8, CTX); + + const aten::CSRMatrix &csr_aUbUd2 = aten::UnionCsr({csr_a2, csr_b2, csr_d}); + ASSERT_EQ(csr_aUbUd2.num_rows, 6); + ASSERT_EQ(csr_aUbUd2.num_cols, 4); + ASSERT_TRUE(ArrayEQ(csr_aUbUd2.indptr, c_indptr)); + ASSERT_TRUE(ArrayEQ(csr_aUbUd2.indices, c_indices)); + ASSERT_TRUE(ArrayEQ(csr_aUbUd2.data, c_data)); + ASSERT_FALSE(csr_aUbUd2.sorted); +} + +TEST(MatrixUnionTest, TestMatrixUnionCsr) { + _TestMatrixUnionCsr(CPU); + _TestMatrixUnionCsr(CPU); +} + +template +void _TestMatrixUnionCoo(DGLContext ctx) { + /** + * A = [[0, 0, 0, 0], + * [0, 0, 0, 0], + * [0, 1, 0, 0], + * [1, 1, 1, 1], + * [0, 1, 1, 0], + * [1, 0, 0, 1]] + * + * B = [[0, 0, 0, 0], + * [1, 0, 0, 1], + * [0, 0, 1, 0], + * [1, 0, 0, 1], + * [1, 0, 0, 1]] + * [1, 0, 0, 1]] + * + * C = UnionCsr({A, B}) + * + * C = [[0, 0, 0, 0], + * [1, 0, 0, 1], + * [0, 1, 1, 0], + * [2, 1, 1, 2], + * [1, 1, 1, 1]] + * [2, 0, 0, 2]] + * + * D = [[1, 0, 0, 0], + * [0, 0, 0, 0], + * [0, 0, 0, 0], + * [0, 0, 0, 0], + * [0, 0, 0, 0], + * [1, 0, 0, 1]] + * + * C = UnionCsr({A, B, D}) + * + * C = [[1, 0, 0, 0], + * [1, 0, 0, 1], + * [0, 1, 1, 0], + * [2, 1, 1, 2], + * [1, 1, 1, 1]] + * [3, 0, 0, 3]] + */ + IdArray a_row = aten::VecToIdArray( + std::vector({2, 3, 3, 3, 3, 4, 4, 5, 5}), sizeof(IdType) * 8, + CTX); + IdArray a_col = aten::VecToIdArray( + std::vector({1, 0, 1, 2, 3, 1, 2, 0, 3}), sizeof(IdType) * 8, + CTX); + IdArray b_row = aten::VecToIdArray( + std::vector({1, 1, 2, 3, 3, 4, 4, 5, 5}), sizeof(IdType) * 8, + CTX); + IdArray b_col = aten::VecToIdArray( + std::vector({0, 3, 2, 0, 3, 0, 3, 0, 3}), sizeof(IdType) * 8, + CTX); + IdArray c_row = aten::VecToIdArray( + std::vector( + {2, 3, 3, 3, 3, 4, 4, 5, 5, 1, 1, 2, 3, 3, 4, 4, 5, 5}), + sizeof(IdType) * 8, CTX); + IdArray c_col = aten::VecToIdArray( + std::vector( + {1, 0, 1, 2, 3, 1, 2, 0, 3, 0, 3, 2, 0, 3, 0, 3, 0, 3}), + sizeof(IdType) * 8, CTX); + const aten::COOMatrix &coo_a = + aten::COOMatrix(6, 4, a_row, a_col, aten::NullArray(), true, true); + const aten::COOMatrix &coo_b = + aten::COOMatrix(6, 4, b_row, b_col, aten::NullArray(), true, true); + const std::vector coos_ab({coo_a, coo_b}); + const aten::COOMatrix &coo_ab = aten::UnionCoo(coos_ab); + ASSERT_EQ(coo_ab.num_rows, 6); + ASSERT_EQ(coo_ab.num_cols, 4); + ASSERT_TRUE(ArrayEQ(coo_ab.row, c_row)); + ASSERT_TRUE(ArrayEQ(coo_ab.col, c_col)); + ASSERT_FALSE(COOHasData(coo_ab)); + ASSERT_FALSE(coo_ab.row_sorted); + ASSERT_FALSE(coo_ab.col_sorted); + + IdArray a_data = aten::VecToIdArray( + std::vector({2, 1, 0, 3, 4, 5, 6, 7, 8}), sizeof(IdType) * 8, + CTX); + + IdArray c_data = aten::VecToIdArray( + std::vector( + {2, 1, 0, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17}), + sizeof(IdType) * 8, CTX); + const aten::COOMatrix &coo_a2 = + aten::COOMatrix(6, 4, a_row, a_col, a_data, true, true); + const std::vector coos_ab2({coo_a2, coo_b}); + const aten::COOMatrix &coo_ab2 = aten::UnionCoo(coos_ab2); + ASSERT_EQ(coo_ab2.num_rows, 6); + ASSERT_EQ(coo_ab2.num_cols, 4); + ASSERT_TRUE(ArrayEQ(coo_ab2.row, c_row)); + ASSERT_TRUE(ArrayEQ(coo_ab2.col, c_col)); + ASSERT_TRUE(COOHasData(coo_ab2)); + ASSERT_TRUE(ArrayEQ(coo_ab2.data, c_data)); + ASSERT_FALSE(coo_ab2.row_sorted); + ASSERT_FALSE(coo_ab2.col_sorted); + + IdArray b_data = aten::VecToIdArray( + std::vector({0, 1, 2, 3, 4, 5, 6, 8, 7}), 
sizeof(IdType) * 8, + CTX); + c_data = aten::VecToIdArray( + std::vector( + {2, 1, 0, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 17, 16}), + sizeof(IdType) * 8, CTX); + const aten::COOMatrix &coo_b2 = + aten::COOMatrix(6, 4, b_row, b_col, b_data, true, true); + const std::vector coos_ab3({coo_a2, coo_b2}); + const aten::COOMatrix &coo_ab3 = aten::UnionCoo(coos_ab3); + ASSERT_EQ(coo_ab3.num_rows, 6); + ASSERT_EQ(coo_ab3.num_cols, 4); + ASSERT_TRUE(ArrayEQ(coo_ab3.row, c_row)); + ASSERT_TRUE(ArrayEQ(coo_ab3.col, c_col)); + ASSERT_TRUE(COOHasData(coo_ab3)); + ASSERT_TRUE(ArrayEQ(coo_ab3.data, c_data)); + ASSERT_FALSE(coo_ab3.row_sorted); + ASSERT_FALSE(coo_ab3.col_sorted); + + c_data = aten::VecToIdArray( + std::vector( + {2, 1, 0, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 17, 16}), + sizeof(IdType) * 8, CTX); + + const std::vector coos_ab4({coo_a2, coo_b2}); + const aten::COOMatrix &coo_ab4 = aten::UnionCoo(coos_ab4); + ASSERT_EQ(coo_ab4.num_rows, 6); + ASSERT_EQ(coo_ab4.num_cols, 4); + ASSERT_TRUE(ArrayEQ(coo_ab4.row, c_row)); + ASSERT_TRUE(ArrayEQ(coo_ab4.col, c_col)); + ASSERT_TRUE(COOHasData(coo_ab4)); + ASSERT_TRUE(ArrayEQ(coo_ab4.data, c_data)); + ASSERT_FALSE(coo_ab4.row_sorted); + ASSERT_FALSE(coo_ab4.col_sorted); + + IdArray d_row = aten::VecToIdArray( + std::vector({0, 5, 5}), sizeof(IdType) * 8, CTX); + IdArray d_col = aten::VecToIdArray( + std::vector({0, 0, 3}), sizeof(IdType) * 8, CTX); + c_row = aten::VecToIdArray( + std::vector( + {2, 3, 3, 3, 3, 4, 4, 5, 5, 1, 1, 2, 3, 3, 4, 4, 5, 5, 0, 5, 5}), + sizeof(IdType) * 8, CTX); + c_col = aten::VecToIdArray( + std::vector( + {1, 0, 1, 2, 3, 1, 2, 0, 3, 0, 3, 2, 0, 3, 0, 3, 0, 3, 0, 0, 3}), + sizeof(IdType) * 8, CTX); + + const aten::COOMatrix &coo_d = + aten::COOMatrix(6, 4, d_row, d_col, aten::NullArray(), true, true); + const aten::COOMatrix &csr_aUbUd = aten::UnionCoo({coo_a, coo_b, coo_d}); + ASSERT_EQ(csr_aUbUd.num_rows, 6); + ASSERT_EQ(csr_aUbUd.num_cols, 4); + ASSERT_TRUE(ArrayEQ(csr_aUbUd.row, c_row)); + ASSERT_TRUE(ArrayEQ(csr_aUbUd.col, c_col)); + ASSERT_FALSE(COOHasData(csr_aUbUd)); + ASSERT_FALSE(csr_aUbUd.row_sorted); + ASSERT_FALSE(csr_aUbUd.col_sorted); + + c_data = aten::VecToIdArray( + std::vector({2, 1, 0, 3, 4, 5, 6, 7, 8, 9, 10, + 11, 12, 13, 14, 15, 17, 16, 18, 19, 20}), + sizeof(IdType) * 8, CTX); + + const aten::COOMatrix &csr_aUbUd2 = aten::UnionCoo({coo_a2, coo_b2, coo_d}); + ASSERT_EQ(csr_aUbUd2.num_rows, 6); + ASSERT_EQ(csr_aUbUd2.num_cols, 4); + ASSERT_TRUE(ArrayEQ(csr_aUbUd2.row, c_row)); + ASSERT_TRUE(ArrayEQ(csr_aUbUd2.col, c_col)); + ASSERT_TRUE(COOHasData(csr_aUbUd2)); + ASSERT_TRUE(ArrayEQ(csr_aUbUd2.data, c_data)); + ASSERT_FALSE(csr_aUbUd2.row_sorted); + ASSERT_FALSE(csr_aUbUd2.col_sorted); +} + +TEST(MatrixUnionTest, TestMatrixUnionCoo) { + _TestMatrixUnionCoo(CPU); + _TestMatrixUnionCoo(CPU); +} + +template +void _TestCumSum(DGLContext ctx) { + IdArray a = aten::VecToIdArray( + std::vector({8, 6, 7, 5, 3, 0, 9}), sizeof(IDX) * 8, ctx); + { + IdArray tb = aten::VecToIdArray( + std::vector({8, 14, 21, 26, 29, 29, 38}), sizeof(IDX) * 8, ctx); + IdArray b = aten::CumSum(a); + ASSERT_TRUE(ArrayEQ(b, tb)); + } + { + IdArray tb = aten::VecToIdArray( + std::vector({0, 8, 14, 21, 26, 29, 29, 38}), sizeof(IDX) * 8, ctx); + IdArray b = aten::CumSum(a, true); + ASSERT_TRUE(ArrayEQ(b, tb)); + } + a = aten::VecToIdArray(std::vector({}), sizeof(IDX) * 8, ctx); + { + IdArray tb = aten::VecToIdArray(std::vector({}), sizeof(IDX) * 8, ctx); + IdArray b = aten::CumSum(a); + ASSERT_TRUE(ArrayEQ(b, tb)); + } + 
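+  // CumSum of an empty array should itself be empty.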
{ + IdArray tb = aten::VecToIdArray(std::vector({}), sizeof(IDX) * 8, ctx); + IdArray b = aten::CumSum(a); + ASSERT_TRUE(ArrayEQ(b, tb)); + } +} + +TEST(ArrayTest, CumSum) { + _TestCumSum(CPU); + _TestCumSum(CPU); +#ifdef DGL_USE_CUDA + _TestCumSum(GPU); + _TestCumSum(GPU); +#endif +} + +template +void _TestScatter_(DGLContext ctx) { + IdArray out = aten::Full(1, 10, 8 * sizeof(IDX), ctx); + IdArray idx = + aten::VecToIdArray(std::vector({2, 3, 9}), sizeof(IDX) * 8, ctx); + IdArray val = + aten::VecToIdArray(std::vector({-20, 30, 90}), sizeof(IDX) * 8, ctx); + aten::Scatter_(idx, val, out); + IdArray tout = aten::VecToIdArray( + std::vector({1, 1, -20, 30, 1, 1, 1, 1, 1, 90}), sizeof(IDX) * 8, + ctx); + ASSERT_TRUE(ArrayEQ(out, tout)); +} + +TEST(ArrayTest, Scatter_) { + _TestScatter_(CPU); + _TestScatter_(CPU); + _TestScatter_(CPU); + _TestScatter_(CPU); +#ifdef DGL_USE_CUDA + _TestScatter_(GPU); + _TestScatter_(GPU); + _TestScatter_(GPU); + _TestScatter_(GPU); +#endif +} + +template +void _TestNonZero(DGLContext ctx) { + auto val = aten::VecToIdArray( + std::vector({0, 1, 2, 0, -10, 0, 0, 23}), sizeof(IDX) * 8, ctx); + auto idx = aten::NonZero(val); + auto tidx = aten::VecToIdArray(std::vector({1, 2, 4, 7}), 64, ctx); + ASSERT_TRUE(ArrayEQ(idx, tidx)); + + val = aten::VecToIdArray(std::vector({}), sizeof(IDX) * 8, ctx); + idx = aten::NonZero(val); + tidx = aten::VecToIdArray(std::vector({}), 64, ctx); + ASSERT_TRUE(ArrayEQ(idx, tidx)); + + val = + aten::VecToIdArray(std::vector({0, 0, 0, 0}), sizeof(IDX) * 8, ctx); + idx = aten::NonZero(val); + tidx = aten::VecToIdArray(std::vector({}), 64, ctx); + ASSERT_TRUE(ArrayEQ(idx, tidx)); + + val = aten::Full(1, 3, sizeof(IDX) * 8, ctx); + idx = aten::NonZero(val); + tidx = aten::VecToIdArray(std::vector({0, 1, 2}), 64, ctx); + ASSERT_TRUE(ArrayEQ(idx, tidx)); +} + +TEST(ArrayTest, NonZero) { + _TestNonZero(CPU); + _TestNonZero(CPU); +#ifdef DGL_USE_CUDA + _TestNonZero(GPU); + _TestNonZero(GPU); +#endif +} + +template +void _TestLineGraphCOO(DGLContext ctx) { + /** + * A = [[0, 0, 1, 0], + * [1, 0, 1, 0], + * [1, 1, 0, 0], + * [0, 0, 0, 1]] + * row: 0 1 1 2 2 3 + * col: 2 0 2 0 1 3 + * ID: 0 1 2 3 4 5 + * + * B = COOLineGraph(A, backtracking=False) + * + * B = [[0, 0, 0, 0, 1, 0], + * [1, 0, 0, 0, 0, 0], + * [0, 0, 0, 1, 0, 0], + * [0, 0, 0, 0, 0, 0], + * [0, 1, 0, 0, 0, 0], + * [0, 0, 0, 0, 0, 0]] + * + * C = COOLineGraph(A, backtracking=True) + * + * C = [[0, 0, 0, 1, 1, 0], + * [1, 0, 0, 0, 0, 0], + * [0, 0, 0, 1, 1, 0], + * [1, 0, 0, 0, 0, 0], + * [0, 1, 1, 0, 0, 0], + * [0, 0, 0, 0, 0, 0]] + */ + IdArray a_row = aten::VecToIdArray( + std::vector({0, 1, 1, 2, 2, 3}), sizeof(IdType) * 8, ctx); + IdArray a_col = aten::VecToIdArray( + std::vector({2, 0, 2, 0, 1, 3}), sizeof(IdType) * 8, ctx); + IdArray b_row = aten::VecToIdArray( + std::vector({0, 1, 2, 4}), sizeof(IdType) * 8, ctx); + IdArray b_col = aten::VecToIdArray( + std::vector({4, 0, 3, 1}), sizeof(IdType) * 8, ctx); + IdArray c_row = aten::VecToIdArray( + std::vector({0, 0, 1, 2, 2, 3, 4, 4}), sizeof(IdType) * 8, ctx); + IdArray c_col = aten::VecToIdArray( + std::vector({3, 4, 0, 3, 4, 0, 1, 2}), sizeof(IdType) * 8, ctx); + + const aten::COOMatrix &coo_a = + aten::COOMatrix(4, 4, a_row, a_col, aten::NullArray(), true, false); + + const aten::COOMatrix &l_coo = COOLineGraph(coo_a, false); + ASSERT_EQ(l_coo.num_rows, 6); + ASSERT_EQ(l_coo.num_cols, 6); + ASSERT_TRUE(ArrayEQ(l_coo.row, b_row)); + ASSERT_TRUE(ArrayEQ(l_coo.col, b_col)); + ASSERT_FALSE(l_coo.row_sorted); + 
ASSERT_FALSE(l_coo.col_sorted);
+
+  const aten::COOMatrix &l_coo2 = COOLineGraph(coo_a, true);
+  ASSERT_EQ(l_coo2.num_rows, 6);
+  ASSERT_EQ(l_coo2.num_cols, 6);
+  ASSERT_TRUE(ArrayEQ(l_coo2.row, c_row));
+  ASSERT_TRUE(ArrayEQ(l_coo2.col, c_col));
+  ASSERT_FALSE(l_coo2.row_sorted);
+  ASSERT_FALSE(l_coo2.col_sorted);
+
+  IdArray a_data = aten::VecToIdArray(
+      std::vector({4, 5, 0, 1, 2, 3}), sizeof(IdType) * 8, ctx);
+  b_row = aten::VecToIdArray(
+      std::vector({4, 5, 0, 2}), sizeof(IdType) * 8, ctx);
+  b_col = aten::VecToIdArray(
+      std::vector({2, 4, 1, 5}), sizeof(IdType) * 8, ctx);
+  c_row = aten::VecToIdArray(
+      std::vector({4, 4, 5, 0, 0, 1, 2, 2}), sizeof(IdType) * 8, ctx);
+  c_col = aten::VecToIdArray(
+      std::vector({1, 2, 4, 1, 2, 4, 5, 0}), sizeof(IdType) * 8, ctx);
+  const aten::COOMatrix &coo_ad =
+      aten::COOMatrix(4, 4, a_row, a_col, a_data, true, false);
+  const aten::COOMatrix &ld_coo = COOLineGraph(coo_ad, false);
+  ASSERT_EQ(ld_coo.num_rows, 6);
+  ASSERT_EQ(ld_coo.num_cols, 6);
+  ASSERT_TRUE(ArrayEQ(ld_coo.row, b_row));
+  ASSERT_TRUE(ArrayEQ(ld_coo.col, b_col));
+  ASSERT_FALSE(ld_coo.row_sorted);
+  ASSERT_FALSE(ld_coo.col_sorted);
+
+  const aten::COOMatrix &ld_coo2 = COOLineGraph(coo_ad, true);
+  ASSERT_EQ(ld_coo2.num_rows, 6);
+  ASSERT_EQ(ld_coo2.num_cols, 6);
+  ASSERT_TRUE(ArrayEQ(ld_coo2.row, c_row));
+  ASSERT_TRUE(ArrayEQ(ld_coo2.col, c_col));
+  ASSERT_FALSE(ld_coo2.row_sorted);
+  ASSERT_FALSE(ld_coo2.col_sorted);
+}
+
+TEST(LineGraphTest, LineGraphCOO) {
+  _TestLineGraphCOO(CPU);
+  _TestLineGraphCOO(CPU);
+}
+
+template
+void _TestSort(DGLContext ctx) {
+  // case 1
+  IdArray a = aten::VecToIdArray(
+      std::vector({8, 6, 7, 5, 3, 0, 9}), sizeof(IDX) * 8, ctx);
+  IdArray sorted_a = aten::VecToIdArray(
+      std::vector({0, 3, 5, 6, 7, 8, 9}), sizeof(IDX) * 8, ctx);
+  IdArray sorted_idx =
+      aten::VecToIdArray(std::vector({5, 4, 3, 1, 2, 0, 6}), 64, ctx);
+
+  IdArray sorted, idx;
+  std::tie(sorted, idx) = aten::Sort(a);
+  ASSERT_TRUE(ArrayEQ(sorted, sorted_a));
+  ASSERT_TRUE(ArrayEQ(idx, sorted_idx));
+
+  // case 2: empty array
+  a = aten::VecToIdArray(std::vector({}), sizeof(IDX) * 8, ctx);
+  sorted_a = aten::VecToIdArray(std::vector({}), sizeof(IDX) * 8, ctx);
+  sorted_idx = aten::VecToIdArray(std::vector({}), 64, ctx);
+  std::tie(sorted, idx) = aten::Sort(a);
+  ASSERT_TRUE(ArrayEQ(sorted, sorted_a));
+  ASSERT_TRUE(ArrayEQ(idx, sorted_idx));
+
+  // case 3: array with one element
+  a = aten::VecToIdArray(std::vector({2}), sizeof(IDX) * 8, ctx);
+  sorted_a = aten::VecToIdArray(std::vector({2}), sizeof(IDX) * 8, ctx);
+  sorted_idx = aten::VecToIdArray(std::vector({0}), 64, ctx);
+  std::tie(sorted, idx) = aten::Sort(a);
+  ASSERT_TRUE(ArrayEQ(sorted, sorted_a));
+  ASSERT_TRUE(ArrayEQ(idx, sorted_idx));
+}
+
+TEST(ArrayTest, Sort) {
+  _TestSort(CPU);
+  _TestSort(CPU);
+#ifdef DGL_USE_CUDA
+  _TestSort(GPU);
+  _TestSort(GPU);
+#endif
+}
+
+TEST(ArrayTest, BFloatCast) {
+  for (int i = -100; i < 100; ++i) {
+    float a = i;
+    BFloat16 b = a;
+    float a_casted = b;
+    ASSERT_FLOAT_EQ(a, a_casted);
+  }
+}
diff --git a/tests/cpp/test_csrmm.cc b/tests/cpp/test_csrmm.cc
index df8d60775dc6..880b138037a3 100644
--- a/tests/cpp/test_csrmm.cc
+++ b/tests/cpp/test_csrmm.cc
@@ -178,7 +178,7 @@ TEST(CsrmmTest, TestCsrmm) {
   _TestCsrmm(CPU);
   _TestCsrmm(CPU);
   _TestCsrmm(CPU);
-#ifdef DGL_USE_CUDA
+#ifdef DGL_USE_ROCM
   _TestCsrmm(GPU);
   _TestCsrmm(GPU);
   _TestCsrmm(GPU);
@@ -191,7 +191,7 @@ TEST(CsrmmTest, TestCsrsum) {
   _TestCsrsum(CPU);
   _TestCsrsum(CPU);
   _TestCsrsum(CPU);
-#ifdef DGL_USE_CUDA
+#ifdef DGL_USE_ROCM
   _TestCsrsum(GPU);
   _TestCsrsum(GPU);
   _TestCsrsum(GPU);
@@ -204,7 +204,7 @@ TEST(CsrmmTest, TestCsrmask) {
   _TestCsrmask(CPU);
   _TestCsrmask(CPU);
   _TestCsrmask(CPU);
-#ifdef DGL_USE_CUDA
+#ifdef DGL_USE_ROCM
   _TestCsrmask(GPU);
   _TestCsrmask(GPU);
   _TestCsrmask(GPU);
diff --git a/tests/cpp/test_csrmm.cc.prehip b/tests/cpp/test_csrmm.cc.prehip
new file mode 100644
index 000000000000..df8d60775dc6
--- /dev/null
+++ b/tests/cpp/test_csrmm.cc.prehip
@@ -0,0 +1,215 @@
+#include
+#include
+#include
+
+#include "../../src/array/cpu/array_utils.h"  // PairHash
+#include "./common.h"
+
+using namespace dgl;
+using namespace dgl::runtime;
+
+namespace {
+
+// Unit tests:
+// CSRMM(A, B) == A_mm_B
+// CSRSum({A, C}) == A_plus_C
+// CSRMask(A, C) = A_mask_C
+
+template
+std::unordered_map, DType, aten::PairHash> COOToMap(
+    aten::COOMatrix coo, NDArray weights) {
+  std::unordered_map, DType, aten::PairHash> map;
+
+  for (int64_t i = 0; i < coo.row->shape[0]; ++i) {
+    IdType irow = aten::IndexSelect(coo.row, i);
+    IdType icol = aten::IndexSelect(coo.col, i);
+    IdType ieid =
+        aten::COOHasData(coo) ? aten::IndexSelect(coo.data, i) : i;
+    DType idata = aten::IndexSelect(weights, ieid);
+    map.insert({{irow, icol}, idata});
+  }
+  return map;
+}
+
+template
+bool CSRIsClose(
+    aten::CSRMatrix A, aten::CSRMatrix B, NDArray A_weights, NDArray B_weights,
+    DType rtol, DType atol) {
+  auto Amap = COOToMap(CSRToCOO(A, false), A_weights);
+  auto Bmap = COOToMap(CSRToCOO(B, false), B_weights);
+
+  if (Amap.size() != Bmap.size()) return false;
+
+  for (auto itA : Amap) {
+    auto itB = Bmap.find(itA.first);
+    if (itB == Bmap.end()) return false;
+    if (fabs(itA.second - itB->second) >= rtol * fabs(itA.second) + atol)
+      return false;
+  }
+
+  return true;
+}
+
+template
+std::pair CSR_A(DGLContext ctx = CTX) {
+  // matrix([[0. , 0. , 1. , 0.7, 0. ],
+  //         [0. , 0. , 0.5, 0.+, 0. ],
+  //         [0.4, 0.7, 0. , 0.2, 0. ],
+  //         [0. , 0. , 0. , 0. , 0.2]])
+  // (0.+ indicates that the entry exists but the value is 0.)
+  auto csr = aten::CSRMatrix(
+      4, 5, NDArray::FromVector(std::vector({0, 2, 4, 7, 8}), ctx),
+      NDArray::FromVector(std::vector({2, 3, 2, 3, 0, 1, 3, 4}), ctx),
+      NDArray::FromVector(std::vector({1, 0, 2, 3, 4, 5, 6, 7}), ctx));
+  auto weights = NDArray::FromVector(
+      std::vector({0.7, 1.0, 0.5, 0.0, 0.4, 0.7, 0.2, 0.2}), ctx);
+  return {csr, weights};
+}
+
+template
+std::pair CSR_B(DGLContext ctx = CTX) {
+  // matrix([[0. , 0.9, 0. , 0.6, 0. , 0.3],
+  //         [0. , 0. , 0. , 0. , 0. , 0.4],
+  //         [0.+, 0. , 0. , 0. , 0. , 0.9],
+  //         [0.8, 0.2, 0.3, 0.2, 0. , 0. ],
+  //         [0.2, 0.4, 0. , 0. , 0. , 0. ]])
+  // (0.+ indicates that the entry exists but the value is 0.)
+  auto csr = aten::CSRMatrix(
+      5, 6, NDArray::FromVector(std::vector({0, 3, 4, 6, 10, 12}), ctx),
+      NDArray::FromVector(
+          std::vector({1, 3, 5, 5, 0, 5, 0, 1, 2, 3, 0, 1}), ctx));
+  auto weights = NDArray::FromVector(
+      std::vector(
+          {0.9, 0.6, 0.3, 0.4, 0.0, 0.9, 0.8, 0.2, 0.3, 0.2, 0.2, 0.4}),
+      ctx);
+  return {csr, weights};
+}
+
+template
+std::pair CSR_C(DGLContext ctx = CTX) {
+  // matrix([[0. , 0. , 0. , 0.2, 0. ],
+  //         [0. , 0. , 0. , 0.5, 0.4],
+  //         [0. , 0.2, 0. , 0.9, 0.2],
+  //         [0. , 1. , 0. , 0.7, 0.
]]) + auto csr = aten::CSRMatrix( + 4, 5, NDArray::FromVector(std::vector({0, 1, 3, 6, 8}), ctx), + NDArray::FromVector(std::vector({3, 3, 4, 1, 3, 4, 1, 3}), ctx)); + auto weights = NDArray::FromVector( + std::vector({0.2, 0.5, 0.4, 0.2, 0.9, 0.2, 1., 0.7}), ctx); + return {csr, weights}; +} + +template +std::pair CSR_A_mm_B(DGLContext ctx = CTX) { + // matrix([[0.56, 0.14, 0.21, 0.14, 0. , 0.9 ], + // [0.+ , 0.+ , 0.+ , 0.+ , 0. , 0.45], + // [0.16, 0.4 , 0.06, 0.28, 0. , 0.4 ], + // [0.04, 0.08, 0. , 0. , 0. , 0. ]]) + // (0.+ indicates that the entry exists but the value is 0.) + auto csr = aten::CSRMatrix( + 4, 6, NDArray::FromVector(std::vector({0, 5, 10, 15, 17}), ctx), + NDArray::FromVector( + std::vector( + {0, 1, 2, 3, 5, 0, 1, 2, 3, 5, 0, 1, 2, 3, 5, 0, 1}), + ctx)); + auto weights = NDArray::FromVector( + std::vector( + {0.56, 0.14, 0.21, 0.14, 0.9, 0., 0., 0., 0., 0.45, 0.16, 0.4, 0.06, + 0.28, 0.4, 0.04, 0.08}), + ctx); + return {csr, weights}; +} + +template +std::pair CSR_A_plus_C(DGLContext ctx = CTX) { + auto csr = aten::CSRMatrix( + 4, 5, NDArray::FromVector(std::vector({0, 2, 5, 9, 12}), ctx), + NDArray::FromVector( + std::vector({2, 3, 2, 3, 4, 0, 1, 3, 4, 1, 3, 4}), ctx)); + auto weights = NDArray::FromVector( + std::vector( + {1., 0.9, 0.5, 0.5, 0.4, 0.4, 0.9, 1.1, 0.2, 1., 0.7, 0.2}), + ctx); + return {csr, weights}; +} + +template +NDArray CSR_A_mask_C(DGLContext ctx = CTX) { + return NDArray::FromVector( + std::vector({0.7, 0.0, 0.0, 0.7, 0.2, 0.0, 0.0, 0.0}), ctx); +} + +template +void _TestCsrmm(DGLContext ctx = CTX) { + auto A = CSR_A(ctx); + auto B = CSR_B(ctx); + auto A_mm_B = aten::CSRMM(A.first, A.second, B.first, B.second); + auto A_mm_B2 = CSR_A_mm_B(ctx); + bool result = CSRIsClose( + A_mm_B.first, A_mm_B2.first, A_mm_B.second, A_mm_B2.second, 1e-4, 1e-4); + ASSERT_TRUE(result); +} + +template +void _TestCsrsum(DGLContext ctx = CTX) { + auto A = CSR_A(ctx); + auto C = CSR_C(ctx); + auto A_plus_C = aten::CSRSum({A.first, C.first}, {A.second, C.second}); + auto A_plus_C2 = CSR_A_plus_C(ctx); + bool result = CSRIsClose( + A_plus_C.first, A_plus_C2.first, A_plus_C.second, A_plus_C2.second, 1e-4, + 1e-4); + ASSERT_TRUE(result); +} + +template +void _TestCsrmask(DGLContext ctx = CTX) { + auto A = CSR_A(ctx); + auto C = CSR_C(ctx); + auto C_coo = CSRToCOO(C.first, false); + auto A_mask_C = + aten::CSRGetData(A.first, C_coo.row, C_coo.col, A.second, 0); + auto A_mask_C2 = CSR_A_mask_C(ctx); + ASSERT_TRUE(ArrayEQ(A_mask_C, A_mask_C2)); +} + +TEST(CsrmmTest, TestCsrmm) { + _TestCsrmm(CPU); + _TestCsrmm(CPU); + _TestCsrmm(CPU); + _TestCsrmm(CPU); +#ifdef DGL_USE_CUDA + _TestCsrmm(GPU); + _TestCsrmm(GPU); + _TestCsrmm(GPU); + _TestCsrmm(GPU); +#endif +} + +TEST(CsrmmTest, TestCsrsum) { + _TestCsrsum(CPU); + _TestCsrsum(CPU); + _TestCsrsum(CPU); + _TestCsrsum(CPU); +#ifdef DGL_USE_CUDA + _TestCsrsum(GPU); + _TestCsrsum(GPU); + _TestCsrsum(GPU); + _TestCsrsum(GPU); +#endif +} + +TEST(CsrmmTest, TestCsrmask) { + _TestCsrmask(CPU); + _TestCsrmask(CPU); + _TestCsrmask(CPU); + _TestCsrmask(CPU); +#ifdef DGL_USE_CUDA + _TestCsrmask(GPU); + _TestCsrmask(GPU); + _TestCsrmask(GPU); + _TestCsrmask(GPU); +#endif +} + +}; // namespace diff --git a/tests/cpp/test_partition.cc b/tests/cpp/test_partition.cc index 4281e2646ce6..beac56204f59 100644 --- a/tests/cpp/test_partition.cc +++ b/tests/cpp/test_partition.cc @@ -77,7 +77,7 @@ void _TestRemainder_MapToX() { } TEST(PartitionTest, TestRemainderPartition) { -#ifdef DGL_USE_CUDA +#ifdef DGL_USE_ROCM 
_TestRemainder_GeneratePermutation(); _TestRemainder_GeneratePermutation(); @@ -185,7 +185,7 @@ void _TestRange_MapToX() { } TEST(PartitionTest, TestRangePartition) { -#ifdef DGL_USE_CUDA +#ifdef DGL_USE_ROCM _TestRange_GeneratePermutation(); _TestRange_GeneratePermutation(); diff --git a/tests/cpp/test_partition.cc.prehip b/tests/cpp/test_partition.cc.prehip new file mode 100644 index 000000000000..4281e2646ce6 --- /dev/null +++ b/tests/cpp/test_partition.cc.prehip @@ -0,0 +1,196 @@ +#include + +#include "../../src/partition/ndarray_partition.h" +#include "./common.h" + +using namespace dgl; +using namespace dgl::partition; + +template +void _TestRemainder_GeneratePermutation() { + const int64_t size = 160000; + const int num_parts = 7; + NDArrayPartitionRef part = CreatePartitionRemainderBased(size, num_parts); + + IdArray idxs = + aten::Range(0, size / 10, sizeof(IdType) * 8, DGLContext{XPU, 0}); + + std::pair result = part->GeneratePermutation(idxs); + + // first part of result should be the permutation + IdArray perm = result.first.CopyTo(DGLContext{kDGLCPU, 0}); + ASSERT_TRUE(perm.Ptr() != nullptr); + ASSERT_EQ(perm->shape[0], idxs->shape[0]); + const IdType* const perm_cpu = static_cast(perm->data); + + // second part of result should be the counts + IdArray counts = result.second.CopyTo(DGLContext{kDGLCPU, 0}); + ASSERT_TRUE(counts.Ptr() != nullptr); + ASSERT_EQ(counts->shape[0], num_parts); + const int64_t* const counts_cpu = static_cast(counts->data); + + std::vector prefix(num_parts + 1, 0); + for (int p = 0; p < num_parts; ++p) { + prefix[p + 1] = prefix[p] + counts_cpu[p]; + } + ASSERT_EQ(prefix.back(), idxs->shape[0]); + + // copy original indexes to cpu + idxs = idxs.CopyTo(DGLContext{kDGLCPU, 0}); + const IdType* const idxs_cpu = static_cast(idxs->data); + + for (int p = 0; p < num_parts; ++p) { + for (int64_t i = prefix[p]; i < prefix[p + 1]; ++i) { + EXPECT_EQ(idxs_cpu[perm_cpu[i]] % num_parts, p); + } + } +} + +template +void _TestRemainder_MapToX() { + const int64_t size = 160000; + const int num_parts = 7; + NDArrayPartitionRef part = CreatePartitionRemainderBased(size, num_parts); + + for (int part_id = 0; part_id < num_parts; ++part_id) { + IdArray local = aten::Range( + 0, part->PartSize(part_id), sizeof(IdType) * 8, DGLContext{XPU, 0}); + IdArray global = part->MapToGlobal(local, part_id); + IdArray act_local = part->MapToLocal(global).CopyTo(CPU); + + // every global index should have the same remainder as the part id + ASSERT_EQ(global->shape[0], local->shape[0]); + global = global.CopyTo(CPU); + for (int64_t i = 0; i < global->shape[0]; ++i) { + EXPECT_EQ(Ptr(global)[i] % num_parts, part_id) + << "i=" << i << ", num_parts=" << num_parts + << ", part_id=" << part_id; + } + + // the remapped local indices to should match the original + local = local.CopyTo(CPU); + ASSERT_EQ(local->shape[0], act_local->shape[0]); + for (int64_t i = 0; i < act_local->shape[0]; ++i) { + EXPECT_EQ(Ptr(local)[i], Ptr(act_local)[i]); + } + } +} + +TEST(PartitionTest, TestRemainderPartition) { +#ifdef DGL_USE_CUDA + _TestRemainder_GeneratePermutation(); + _TestRemainder_GeneratePermutation(); + + _TestRemainder_MapToX(); + _TestRemainder_MapToX(); +#endif + // CPU is not implemented +} + +template +int _FindPart(const INDEX idx, const RANGE* const range, const int num_parts) { + for (int i = 0; i < num_parts; ++i) { + if (range[i + 1] > idx) { + return i; + } + } + + return -1; +} + +template +void _TestRange_GeneratePermutation() { + const int64_t size = 160000; + const int num_parts = 
7; + IdArray range = aten::NewIdArray( + num_parts + 1, DGLContext{kDGLCPU, 0}, sizeof(IdType) * 8); + for (int i = 0; i < num_parts; ++i) { + range.Ptr()[i] = (size / num_parts) * i; + } + range.Ptr()[num_parts] = size; + NDArrayPartitionRef part = CreatePartitionRangeBased( + size, num_parts, range.CopyTo(DGLContext{XPU, 0})); + + IdArray idxs = + aten::Range(0, size / 10, sizeof(IdType) * 8, DGLContext{XPU, 0}); + + std::pair result = part->GeneratePermutation(idxs); + + // first part of result should be the permutation + IdArray perm = result.first.CopyTo(DGLContext{kDGLCPU, 0}); + ASSERT_TRUE(perm.Ptr() != nullptr); + ASSERT_EQ(perm->shape[0], idxs->shape[0]); + const IdType* const perm_cpu = static_cast(perm->data); + + // second part of result should be the counts + IdArray counts = result.second.CopyTo(DGLContext{kDGLCPU, 0}); + ASSERT_TRUE(counts.Ptr() != nullptr); + ASSERT_EQ(counts->shape[0], num_parts); + const int64_t* const counts_cpu = static_cast(counts->data); + + std::vector prefix(num_parts + 1, 0); + for (int p = 0; p < num_parts; ++p) { + prefix[p + 1] = prefix[p] + counts_cpu[p]; + } + ASSERT_EQ(prefix.back(), idxs->shape[0]); + + // copy original indexes to cpu + idxs = idxs.CopyTo(DGLContext{kDGLCPU, 0}); + const IdType* const idxs_cpu = static_cast(idxs->data); + + for (int p = 0; p < num_parts; ++p) { + for (int64_t i = prefix[p]; i < prefix[p + 1]; ++i) { + EXPECT_EQ( + _FindPart(idxs_cpu[perm_cpu[i]], range.Ptr(), num_parts), p); + } + } +} + +template +void _TestRange_MapToX() { + const int64_t size = 160000; + const int num_parts = 7; + IdArray range = aten::NewIdArray( + num_parts + 1, DGLContext{kDGLCPU, 0}, sizeof(IdType) * 8); + for (int i = 0; i < num_parts; ++i) { + Ptr(range)[i] = (size / num_parts) * i; + } + range.Ptr()[num_parts] = size; + NDArrayPartitionRef part = CreatePartitionRangeBased( + size, num_parts, range.CopyTo(DGLContext{XPU, 0})); + + for (int part_id = 0; part_id < num_parts; ++part_id) { + IdArray local = aten::Range( + 0, part->PartSize(part_id), sizeof(IdType) * 8, DGLContext{XPU, 0}); + IdArray global = part->MapToGlobal(local, part_id); + IdArray act_local = part->MapToLocal(global).CopyTo(CPU); + + ASSERT_EQ(global->shape[0], local->shape[0]); + global = global.CopyTo(CPU); + for (int64_t i = 0; i < global->shape[0]; ++i) { + EXPECT_EQ( + _FindPart(Ptr(global)[i], Ptr(range), num_parts), + part_id) + << "i=" << i << ", num_parts=" << num_parts << ", part_id=" << part_id + << ", shape=" << global->shape[0]; + } + + // the remapped local indices to should match the original + local = local.CopyTo(CPU); + ASSERT_EQ(local->shape[0], act_local->shape[0]); + for (int64_t i = 0; i < act_local->shape[0]; ++i) { + EXPECT_EQ(Ptr(local)[i], Ptr(act_local)[i]); + } + } +} + +TEST(PartitionTest, TestRangePartition) { +#ifdef DGL_USE_CUDA + _TestRange_GeneratePermutation(); + _TestRange_GeneratePermutation(); + + _TestRange_MapToX(); + _TestRange_MapToX(); +#endif + // CPU is not implemented +} diff --git a/tests/cpp/test_spmat_coo.cc b/tests/cpp/test_spmat_coo.cc index 4b663a5895b6..05acf8eb624a 100644 --- a/tests/cpp/test_spmat_coo.cc +++ b/tests/cpp/test_spmat_coo.cc @@ -331,7 +331,7 @@ void _TestCOOToCSR(DGLContext ctx) { TEST(SpmatTest, COOToCSR) { _TestCOOToCSR(CPU); _TestCOOToCSR(CPU); -#ifdef DGL_USE_CUDA +#ifdef DGL_USE_ROCM _TestCOOToCSR(GPU); _TestCOOToCSR(GPU); #endif @@ -417,7 +417,7 @@ void _TestCOOSort(DGLContext ctx) { TEST(SpmatTest, COOSort) { _TestCOOSort(CPU); _TestCOOSort(CPU); -#ifdef DGL_USE_CUDA +#ifdef DGL_USE_ROCM 
_TestCOOSort(GPU); _TestCOOSort(GPU); #endif @@ -479,7 +479,7 @@ void _TestCOOGetData(DGLContext ctx) { TEST(SpmatTest, COOGetData) { _TestCOOGetData(CPU); _TestCOOGetData(CPU); - // #ifdef DGL_USE_CUDA + // #ifdef DGL_USE_ROCM //_TestCOOGetData(GPU); //_TestCOOGetData(GPU); // #endif diff --git a/tests/cpp/test_spmat_coo.cc.prehip b/tests/cpp/test_spmat_coo.cc.prehip new file mode 100644 index 000000000000..4b663a5895b6 --- /dev/null +++ b/tests/cpp/test_spmat_coo.cc.prehip @@ -0,0 +1,576 @@ +#include +#include +#include +#include + +#include + +#include "./common.h" + +using namespace dgl; +using namespace dgl::runtime; + +namespace { + +template +aten::CSRMatrix CSR1(DGLContext ctx = CTX) { + // [[0, 1, 1, 0, 0], + // [1, 0, 0, 0, 0], + // [0, 0, 1, 1, 0], + // [0, 0, 0, 0, 0]] + // data: [0, 2, 3, 1, 4] + return aten::CSRMatrix( + 4, 5, + aten::VecToIdArray( + std::vector({0, 2, 3, 5, 5}), sizeof(IDX) * 8, ctx), + aten::VecToIdArray( + std::vector({1, 2, 0, 2, 3}), sizeof(IDX) * 8, ctx), + aten::VecToIdArray( + std::vector({0, 2, 3, 4, 1}), sizeof(IDX) * 8, ctx), + false); +} + +template +aten::CSRMatrix CSR2(DGLContext ctx = CTX) { + // has duplicate entries + // [[0, 1, 2, 0, 0], + // [1, 0, 0, 0, 0], + // [0, 0, 1, 1, 0], + // [0, 0, 0, 0, 0]] + // data: [0, 2, 5, 3, 1, 4] + return aten::CSRMatrix( + 4, 5, + aten::VecToIdArray( + std::vector({0, 3, 4, 6, 6}), sizeof(IDX) * 8, ctx), + aten::VecToIdArray( + std::vector({1, 2, 2, 0, 2, 3}), sizeof(IDX) * 8, ctx), + aten::VecToIdArray( + std::vector({0, 2, 5, 3, 1, 4}), sizeof(IDX) * 8, ctx), + false); +} + +template +aten::COOMatrix COO1(DGLContext ctx = CTX) { + // [[0, 1, 1, 0, 0], + // [1, 0, 0, 0, 0], + // [0, 0, 1, 1, 0], + // [0, 0, 0, 0, 0]] + // data: [0, 2, 3, 1, 4] + // row : [0, 2, 0, 1, 2] + // col : [1, 2, 2, 0, 3] + return aten::COOMatrix( + 4, 5, + aten::VecToIdArray( + std::vector({0, 2, 0, 1, 2}), sizeof(IDX) * 8, ctx), + aten::VecToIdArray( + std::vector({1, 2, 2, 0, 3}), sizeof(IDX) * 8, ctx), + aten::VecToIdArray( + std::vector({0, 3, 1, 2, 4}), sizeof(IDX) * 8, ctx)); +} + +template +aten::COOMatrix COO2(DGLContext ctx = CTX) { + // has duplicate entries + // [[0, 1, 2, 0, 0], + // [1, 0, 0, 0, 0], + // [0, 0, 1, 1, 0], + // [0, 0, 0, 0, 0]] + // data: [0, 2, 5, 3, 1, 4] + // row : [0, 2, 0, 1, 2, 0] + // col : [1, 2, 2, 0, 3, 2] + return aten::COOMatrix( + 4, 5, + aten::VecToIdArray( + std::vector({0, 2, 0, 1, 2, 0}), sizeof(IDX) * 8, ctx), + aten::VecToIdArray( + std::vector({1, 2, 2, 0, 3, 2}), sizeof(IDX) * 8, ctx), + aten::VecToIdArray( + std::vector({0, 1, 2, 3, 4, 5}), sizeof(IDX) * 8, ctx)); +} + +template +aten::CSRMatrix SR_CSR3(DGLContext ctx) { + // [[0, 1, 2, 0, 0], + // [1, 0, 0, 0, 0], + // [0, 0, 1, 1, 0], + // [0, 0, 0, 0, 0]] + return aten::CSRMatrix( + 4, 5, + aten::VecToIdArray( + std::vector({0, 3, 4, 6, 6}), sizeof(IDX) * 8, ctx), + aten::VecToIdArray( + std::vector({2, 1, 2, 0, 2, 3}), sizeof(IDX) * 8, ctx), + aten::VecToIdArray( + std::vector({0, 2, 5, 3, 1, 4}), sizeof(IDX) * 8, ctx), + false); +} + +template +aten::CSRMatrix SRC_CSR3(DGLContext ctx) { + // [[0, 1, 2, 0, 0], + // [1, 0, 0, 0, 0], + // [0, 0, 1, 1, 0], + // [0, 0, 0, 0, 0]] + return aten::CSRMatrix( + 4, 5, + aten::VecToIdArray( + std::vector({0, 3, 4, 6, 6}), sizeof(IDX) * 8, ctx), + aten::VecToIdArray( + std::vector({1, 2, 2, 0, 2, 3}), sizeof(IDX) * 8, ctx), + aten::VecToIdArray( + std::vector({2, 0, 5, 3, 1, 4}), sizeof(IDX) * 8, ctx), + false); +} + +template +aten::COOMatrix COO3(DGLContext ctx) { + // has duplicate 
entries + // [[0, 1, 2, 0, 0], + // [1, 0, 0, 0, 0], + // [0, 0, 1, 1, 0], + // [0, 0, 0, 0, 0]] + // row : [0, 2, 0, 1, 2, 0] + // col : [2, 2, 1, 0, 3, 2] + return aten::COOMatrix( + 4, 5, + aten::VecToIdArray( + std::vector({0, 2, 0, 1, 2, 0}), sizeof(IDX) * 8, ctx), + aten::VecToIdArray( + std::vector({2, 2, 1, 0, 3, 2}), sizeof(IDX) * 8, ctx)); +} + +template +aten::COOMatrix COORandomized(IDX rows_and_cols, int64_t nnz, int seed) { + std::vector vec_rows(nnz); + std::vector vec_cols(nnz); + std::vector vec_data(nnz); + +#pragma omp parallel + { + const int64_t num_threads = omp_get_num_threads(); + const int64_t thread_id = omp_get_thread_num(); + const int64_t chunk = nnz / num_threads; + const int64_t size = (thread_id == num_threads - 1) + ? nnz - chunk * (num_threads - 1) + : chunk; + auto rows = vec_rows.data() + thread_id * chunk; + auto cols = vec_cols.data() + thread_id * chunk; + auto data = vec_data.data() + thread_id * chunk; + + std::mt19937_64 gen64(seed + thread_id); + std::mt19937 gen32(seed + thread_id); + + for (int64_t i = 0; i < size; ++i) { + rows[i] = gen64() % rows_and_cols; + cols[i] = gen64() % rows_and_cols; + data[i] = gen32() % 90 + 1; + } + } + + return aten::COOMatrix( + rows_and_cols, rows_and_cols, + aten::VecToIdArray(vec_rows, sizeof(IDX) * 8, CTX), + aten::VecToIdArray(vec_cols, sizeof(IDX) * 8, CTX), + aten::VecToIdArray(vec_data, sizeof(IDX) * 8, CTX), false, false); +} + +struct SparseCOOCSR { + static constexpr uint64_t NUM_ROWS = 100; + static constexpr uint64_t NUM_COLS = 150; + static constexpr uint64_t NUM_NZ = 5; + template + static aten::COOMatrix COOSparse(const DGLContext &ctx = CTX) { + return aten::COOMatrix( + NUM_ROWS, NUM_COLS, + aten::VecToIdArray( + std::vector({0, 1, 2, 3, 4}), sizeof(IDX) * 8, ctx), + aten::VecToIdArray( + std::vector({1, 2, 3, 4, 5}), sizeof(IDX) * 8, ctx)); + } + + template + static aten::CSRMatrix CSRSparse(const DGLContext &ctx = CTX) { + auto &&indptr = std::vector(NUM_ROWS + 1, NUM_NZ); + for (size_t i = 0; i < NUM_NZ; ++i) { + indptr[i + 1] = static_cast(i + 1); + } + indptr[0] = 0; + return aten::CSRMatrix( + NUM_ROWS, NUM_COLS, aten::VecToIdArray(indptr, sizeof(IDX) * 8, ctx), + aten::VecToIdArray( + std::vector({1, 2, 3, 4, 5}), sizeof(IDX) * 8, ctx), + aten::VecToIdArray( + std::vector({1, 1, 1, 1, 1}), sizeof(IDX) * 8, ctx), + false); + } +}; + +template +aten::COOMatrix RowSorted_NullData_COO(DGLContext ctx = CTX) { + // [[0, 1, 1, 0, 0], + // [1, 0, 0, 0, 0], + // [0, 0, 1, 1, 0], + // [0, 0, 0, 0, 0]] + // row : [0, 0, 1, 2, 2] + // col : [1, 2, 0, 2, 3] + return aten::COOMatrix( + 4, 5, + aten::VecToIdArray( + std::vector({0, 0, 1, 2, 2}), sizeof(IDX) * 8, ctx), + aten::VecToIdArray( + std::vector({1, 2, 0, 2, 3}), sizeof(IDX) * 8, ctx), + aten::NullArray(), true, false); +} + +template +aten::CSRMatrix RowSorted_NullData_CSR(DGLContext ctx = CTX) { + // [[0, 1, 1, 0, 0], + // [1, 0, 0, 0, 0], + // [0, 0, 1, 1, 0], + // [0, 0, 0, 0, 0]] + // data: [0, 1, 2, 3, 4] + return aten::CSRMatrix( + 4, 5, + aten::VecToIdArray( + std::vector({0, 2, 3, 5, 5}), sizeof(IDX) * 8, ctx), + aten::VecToIdArray( + std::vector({1, 2, 0, 2, 3}), sizeof(IDX) * 8, ctx), + aten::VecToIdArray( + std::vector({0, 1, 2, 3, 4}), sizeof(IDX) * 8, ctx), + false); +} +} // namespace + +template +void _TestCOOToCSR(DGLContext ctx) { + auto coo = COO1(ctx); + auto csr = CSR1(ctx); + auto tcsr = aten::COOToCSR(coo); + ASSERT_FALSE(coo.row_sorted); + ASSERT_EQ(csr.num_rows, tcsr.num_rows); + ASSERT_EQ(csr.num_cols, tcsr.num_cols); + 
ASSERT_TRUE(ArrayEQ(csr.indptr, tcsr.indptr)); + ASSERT_TRUE(ArrayEQ(csr.indices, tcsr.indices)); + + coo = COO2(ctx); + csr = CSR2(ctx); + tcsr = aten::COOToCSR(coo); + ASSERT_EQ(coo.num_rows, csr.num_rows); + ASSERT_EQ(coo.num_cols, csr.num_cols); + ASSERT_TRUE(ArrayEQ(csr.indptr, tcsr.indptr)); + + // Convert from row sorted coo + coo = COO1(ctx); + auto rs_coo = aten::COOSort(coo, false); + auto rs_csr = CSR1(ctx); + auto rs_tcsr = aten::COOToCSR(rs_coo); + ASSERT_TRUE(rs_coo.row_sorted); + ASSERT_EQ(coo.num_rows, rs_tcsr.num_rows); + ASSERT_EQ(coo.num_cols, rs_tcsr.num_cols); + ASSERT_TRUE(ArrayEQ(rs_csr.indptr, rs_tcsr.indptr)); + ASSERT_TRUE(ArrayEQ(rs_tcsr.indices, rs_coo.col)); + ASSERT_TRUE(ArrayEQ(rs_tcsr.data, rs_coo.data)); + + coo = COO3(ctx); + rs_coo = aten::COOSort(coo, false); + rs_csr = SR_CSR3(ctx); + rs_tcsr = aten::COOToCSR(rs_coo); + ASSERT_EQ(coo.num_rows, rs_tcsr.num_rows); + ASSERT_EQ(coo.num_cols, rs_tcsr.num_cols); + ASSERT_TRUE(ArrayEQ(rs_csr.indptr, rs_tcsr.indptr)); + ASSERT_TRUE(ArrayEQ(rs_tcsr.indices, rs_coo.col)); + ASSERT_TRUE(ArrayEQ(rs_tcsr.data, rs_coo.data)); + + rs_coo = RowSorted_NullData_COO(ctx); + ASSERT_TRUE(rs_coo.row_sorted); + rs_csr = RowSorted_NullData_CSR(ctx); + rs_tcsr = aten::COOToCSR(rs_coo); + ASSERT_EQ(coo.num_rows, rs_tcsr.num_rows); + ASSERT_EQ(rs_csr.num_rows, rs_tcsr.num_rows); + ASSERT_EQ(coo.num_cols, rs_tcsr.num_cols); + ASSERT_EQ(rs_csr.num_cols, rs_tcsr.num_cols); + ASSERT_TRUE(ArrayEQ(rs_csr.indptr, rs_tcsr.indptr)); + ASSERT_TRUE(ArrayEQ(rs_csr.indices, rs_tcsr.indices)); + ASSERT_TRUE(ArrayEQ(rs_csr.data, rs_tcsr.data)); + ASSERT_TRUE(ArrayEQ(rs_coo.col, rs_tcsr.indices)); + ASSERT_FALSE(ArrayEQ(rs_coo.data, rs_tcsr.data)); + + // Convert from col sorted coo + coo = COO1(ctx); + auto src_coo = aten::COOSort(coo, true); + auto src_csr = CSR1(ctx); + auto src_tcsr = aten::COOToCSR(src_coo); + ASSERT_EQ(coo.num_rows, src_tcsr.num_rows); + ASSERT_EQ(coo.num_cols, src_tcsr.num_cols); + ASSERT_TRUE(src_tcsr.sorted); + ASSERT_TRUE(ArrayEQ(src_tcsr.indptr, src_csr.indptr)); + ASSERT_TRUE(ArrayEQ(src_tcsr.indices, src_coo.col)); + ASSERT_TRUE(ArrayEQ(src_tcsr.data, src_coo.data)); + + coo = COO3(ctx); + src_coo = aten::COOSort(coo, true); + src_csr = SRC_CSR3(ctx); + src_tcsr = aten::COOToCSR(src_coo); + ASSERT_EQ(coo.num_rows, src_tcsr.num_rows); + ASSERT_EQ(coo.num_cols, src_tcsr.num_cols); + ASSERT_TRUE(src_tcsr.sorted); + ASSERT_TRUE(ArrayEQ(src_tcsr.indptr, src_csr.indptr)); + ASSERT_TRUE(ArrayEQ(src_tcsr.indices, src_coo.col)); + ASSERT_TRUE(ArrayEQ(src_tcsr.data, src_coo.data)); + + coo = SparseCOOCSR::COOSparse(ctx); + csr = SparseCOOCSR::CSRSparse(ctx); + tcsr = aten::COOToCSR(coo); + ASSERT_FALSE(coo.row_sorted); + ASSERT_EQ(csr.num_rows, tcsr.num_rows); + ASSERT_EQ(csr.num_cols, tcsr.num_cols); + ASSERT_TRUE(ArrayEQ(csr.indptr, tcsr.indptr)); + ASSERT_TRUE(ArrayEQ(csr.indices, tcsr.indices)); +} + +TEST(SpmatTest, COOToCSR) { + _TestCOOToCSR(CPU); + _TestCOOToCSR(CPU); +#ifdef DGL_USE_CUDA + _TestCOOToCSR(GPU); + _TestCOOToCSR(GPU); +#endif +} + +template +void _TestCOOHasDuplicate() { + auto coo = COO1(); + ASSERT_FALSE(aten::COOHasDuplicate(coo)); + coo = COO2(); + ASSERT_TRUE(aten::COOHasDuplicate(coo)); +} + +TEST(SpmatTest, TestCOOHasDuplicate) { + _TestCOOHasDuplicate(); + _TestCOOHasDuplicate(); +} + +template +void _TestCOOSort(DGLContext ctx) { + auto coo = COO3(ctx); + + auto sr_coo = COOSort(coo, false); + ASSERT_EQ(coo.num_rows, sr_coo.num_rows); + ASSERT_EQ(coo.num_cols, sr_coo.num_cols); + 
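+  // Row-only sort: the result is flagged row-sorted, while the input COO is
+  // left untouched (checked further below).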
ASSERT_TRUE(sr_coo.row_sorted); + auto flags = COOIsSorted(sr_coo); + ASSERT_TRUE(flags.first); + flags = COOIsSorted(coo); // original coo should stay the same + ASSERT_FALSE(flags.first); + ASSERT_FALSE(flags.second); + + auto src_coo = COOSort(coo, true); + ASSERT_EQ(coo.num_rows, src_coo.num_rows); + ASSERT_EQ(coo.num_cols, src_coo.num_cols); + ASSERT_TRUE(src_coo.row_sorted); + ASSERT_TRUE(src_coo.col_sorted); + flags = COOIsSorted(src_coo); + ASSERT_TRUE(flags.first); + ASSERT_TRUE(flags.second); + + // sort inplace + COOSort_(&coo); + ASSERT_TRUE(coo.row_sorted); + flags = COOIsSorted(coo); + ASSERT_TRUE(flags.first); + COOSort_(&coo, true); + ASSERT_TRUE(coo.row_sorted); + ASSERT_TRUE(coo.col_sorted); + flags = COOIsSorted(coo); + ASSERT_TRUE(flags.first); + ASSERT_TRUE(flags.second); + + // COO3 + // [[0, 1, 2, 0, 0], + // [1, 0, 0, 0, 0], + // [0, 0, 1, 1, 0], + // [0, 0, 0, 0, 0]] + // data: [0, 1, 2, 3, 4, 5] + // row : [0, 2, 0, 1, 2, 0] + // col : [2, 2, 1, 0, 3, 2] + // Row Sorted + // data: [0, 2, 5, 3, 1, 4] + // row : [0, 0, 0, 1, 2, 2] + // col : [2, 1, 2, 0, 2, 3] + // Row Col Sorted + // data: [2, 0, 5, 3, 1, 4] + // row : [0, 0, 0, 1, 2, 2] + // col : [1, 2, 2, 0, 2, 3] + auto sort_row = aten::VecToIdArray( + std::vector({0, 0, 0, 1, 2, 2}), sizeof(IDX) * 8, ctx); + auto sort_col = aten::VecToIdArray( + std::vector({1, 2, 2, 0, 2, 3}), sizeof(IDX) * 8, ctx); + auto sort_col_data = aten::VecToIdArray( + std::vector({2, 0, 5, 3, 1, 4}), sizeof(IDX) * 8, ctx); + + ASSERT_TRUE(ArrayEQ(sr_coo.row, sort_row)); + ASSERT_TRUE(ArrayEQ(src_coo.row, sort_row)); + ASSERT_TRUE(ArrayEQ(src_coo.col, sort_col)); + ASSERT_TRUE(ArrayEQ(src_coo.data, sort_col_data)); +} + +TEST(SpmatTest, COOSort) { + _TestCOOSort(CPU); + _TestCOOSort(CPU); +#ifdef DGL_USE_CUDA + _TestCOOSort(GPU); + _TestCOOSort(GPU); +#endif +} + +template +void _TestCOOReorder() { + auto coo = COO2(); + auto new_row = + aten::VecToIdArray(std::vector({2, 0, 3, 1}), sizeof(IDX) * 8, CTX); + auto new_col = aten::VecToIdArray( + std::vector({2, 0, 4, 3, 1}), sizeof(IDX) * 8, CTX); + auto new_coo = COOReorder(coo, new_row, new_col); + ASSERT_EQ(new_coo.num_rows, coo.num_rows); + ASSERT_EQ(new_coo.num_cols, coo.num_cols); +} + +TEST(SpmatTest, TestCOOReorder) { + _TestCOOReorder(); + _TestCOOReorder(); +} + +template +void _TestCOOGetData(DGLContext ctx) { + auto coo = COO2(ctx); + // test get all data + auto x = aten::COOGetAllData(coo, 0, 0); + auto tx = aten::VecToIdArray(std::vector({}), sizeof(IDX) * 8, ctx); + ASSERT_TRUE(ArrayEQ(x, tx)); + x = aten::COOGetAllData(coo, 0, 2); + tx = aten::VecToIdArray(std::vector({2, 5}), sizeof(IDX) * 8, ctx); + ASSERT_TRUE(ArrayEQ(x, tx)); + + // test get data + auto r = + aten::VecToIdArray(std::vector({0, 0, 0}), sizeof(IDX) * 8, ctx); + auto c = + aten::VecToIdArray(std::vector({0, 1, 2}), sizeof(IDX) * 8, ctx); + x = aten::COOGetData(coo, r, c); + tx = aten::VecToIdArray(std::vector({-1, 0, 2}), sizeof(IDX) * 8, ctx); + ASSERT_TRUE(ArrayEQ(x, tx)); + + // test get data on sorted + coo = aten::COOSort(coo); + r = aten::VecToIdArray(std::vector({0, 0, 0}), sizeof(IDX) * 8, ctx); + c = aten::VecToIdArray(std::vector({0, 1, 2}), sizeof(IDX) * 8, ctx); + x = aten::COOGetData(coo, r, c); + tx = aten::VecToIdArray(std::vector({-1, 0, 2}), sizeof(IDX) * 8, ctx); + ASSERT_TRUE(ArrayEQ(x, tx)); + + // test get data w/ broadcasting + r = aten::VecToIdArray(std::vector({0}), sizeof(IDX) * 8, ctx); + c = aten::VecToIdArray(std::vector({0, 1, 2}), sizeof(IDX) * 8, ctx); + x = 
aten::COOGetData(coo, r, c); + tx = aten::VecToIdArray(std::vector({-1, 0, 2}), sizeof(IDX) * 8, ctx); + ASSERT_TRUE(ArrayEQ(x, tx)); +} + +TEST(SpmatTest, COOGetData) { + _TestCOOGetData(CPU); + _TestCOOGetData(CPU); + // #ifdef DGL_USE_CUDA + //_TestCOOGetData(GPU); + //_TestCOOGetData(GPU); + // #endif +} + +template +void _TestCOOGetDataAndIndices() { + auto coo = COO2(); + auto r = + aten::VecToIdArray(std::vector({0, 0, 0}), sizeof(IDX) * 8, CTX); + auto c = + aten::VecToIdArray(std::vector({0, 1, 2}), sizeof(IDX) * 8, CTX); + auto x = aten::COOGetDataAndIndices(coo, r, c); + auto tr = + aten::VecToIdArray(std::vector({0, 0, 0}), sizeof(IDX) * 8, CTX); + auto tc = + aten::VecToIdArray(std::vector({1, 2, 2}), sizeof(IDX) * 8, CTX); + auto td = + aten::VecToIdArray(std::vector({0, 2, 5}), sizeof(IDX) * 8, CTX); + ASSERT_TRUE(ArrayEQ(x[0], tr)); + ASSERT_TRUE(ArrayEQ(x[1], tc)); + ASSERT_TRUE(ArrayEQ(x[2], td)); +} + +TEST(SpmatTest, COOGetDataAndIndices) { + _TestCOOGetDataAndIndices(); + _TestCOOGetDataAndIndices(); +} + +template +void _TestCOOToCSRAlgs() { + // Compare results between different CPU COOToCSR implementations. + // NNZ is chosen to be bigger than the limit for the "small" matrix algorithm. + // N is set to lay on border between "sparse" and "dense" algorithm choice. + + const int64_t num_threads = std::min(256, omp_get_max_threads()); + const int64_t min_num_threads = 3; + + if (num_threads < min_num_threads) { + std::cerr << "[ ] [ INFO ]" + << "This test requires at least 3 OMP threads to work properly" + << std::endl; + GTEST_SKIP(); + return; + } + + // Select N and NNZ for COO matrix in a way than depending on number of + // threads different algorithm will be used. + // See WhichCOOToCSR in src/array/cpu/spmat_op_impl_coo.cc for details + const int64_t type_scale = sizeof(IDX) >> 1; + const int64_t small = 50 * num_threads * type_scale * type_scale; + // NNZ should be bigger than limit for small matrix algorithm + const int64_t nnz = small + 1234; + // N is chosen to lay on sparse/dense border + const int64_t n = type_scale * nnz / num_threads; + const IDX rows_nad_cols = n + 1; // should be bigger than sparse/dense border + + // Note that it will be better to set the seed to a random value when gtest + // allows to use --gtest_random_seed without --gtest_shuffle and report this + // value for reproduction. This way we can find unforeseen situations and + // potential bugs. 
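+  // The same randomized matrix is converted three times below under different
+  // OMP thread counts so that the small, dense, and sparse COOToCSR code
+  // paths are each exercised and their results compared against one another.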
+ const auto seed = 123321; + auto coo = COORandomized(rows_nad_cols, nnz, seed); + + omp_set_num_threads(1); + // UnSortedSmallCOOToCSR will be used + auto tcsr_small = aten::COOToCSR(coo); + ASSERT_EQ(coo.num_rows, tcsr_small.num_rows); + ASSERT_EQ(coo.num_cols, tcsr_small.num_cols); + + omp_set_num_threads(num_threads - 1); + // UnSortedDenseCOOToCSR will be used + auto tcsr_dense = aten::COOToCSR(coo); + ASSERT_EQ(tcsr_small.num_rows, tcsr_dense.num_rows); + ASSERT_EQ(tcsr_small.num_cols, tcsr_dense.num_cols); + ASSERT_TRUE(ArrayEQ(tcsr_small.indptr, tcsr_dense.indptr)); + ASSERT_TRUE(ArrayEQ(tcsr_small.indices, tcsr_dense.indices)); + ASSERT_TRUE(ArrayEQ(tcsr_small.data, tcsr_dense.data)); + + omp_set_num_threads(num_threads); + // UnSortedSparseCOOToCSR will be used + auto tcsr_sparse = aten::COOToCSR(coo); + ASSERT_EQ(tcsr_small.num_rows, tcsr_sparse.num_rows); + ASSERT_EQ(tcsr_small.num_cols, tcsr_sparse.num_cols); + ASSERT_TRUE(ArrayEQ(tcsr_small.indptr, tcsr_sparse.indptr)); + ASSERT_TRUE(ArrayEQ(tcsr_small.indices, tcsr_sparse.indices)); + ASSERT_TRUE(ArrayEQ(tcsr_small.data, tcsr_sparse.data)); + return; +} + +TEST(SpmatTest, COOToCSRAlgs) { + _TestCOOToCSRAlgs(); + _TestCOOToCSRAlgs(); +} diff --git a/tests/cpp/test_spmat_csr.cc b/tests/cpp/test_spmat_csr.cc index 4604df64cdb6..2477bee84eef 100644 --- a/tests/cpp/test_spmat_csr.cc +++ b/tests/cpp/test_spmat_csr.cc @@ -214,7 +214,7 @@ TEST(SpmatTest, TestCSRIsNonZero) { _TestCSRIsNonZero1(CPU); _TestCSRIsNonZero2(CPU); _TestCSRIsNonZero2(CPU); -#ifdef DGL_USE_CUDA +#ifdef DGL_USE_ROCM _TestCSRIsNonZero1(GPU); _TestCSRIsNonZero1(GPU); _TestCSRIsNonZero2(GPU); @@ -238,7 +238,7 @@ void _TestCSRGetRowNNZ(DGLContext ctx) { TEST(SpmatTest, TestCSRGetRowNNZ) { _TestCSRGetRowNNZ(CPU); _TestCSRGetRowNNZ(CPU); -#ifdef DGL_USE_CUDA +#ifdef DGL_USE_ROCM _TestCSRGetRowNNZ(GPU); _TestCSRGetRowNNZ(GPU); #endif @@ -262,7 +262,7 @@ void _TestCSRGetRowColumnIndices(DGLContext ctx) { TEST(SpmatTest, TestCSRGetRowColumnIndices) { _TestCSRGetRowColumnIndices(CPU); _TestCSRGetRowColumnIndices(CPU); -#ifdef DGL_USE_CUDA +#ifdef DGL_USE_ROCM _TestCSRGetRowColumnIndices(GPU); _TestCSRGetRowColumnIndices(GPU); #endif @@ -286,7 +286,7 @@ void _TestCSRGetRowData(DGLContext ctx) { TEST(SpmatTest, TestCSRGetRowData) { _TestCSRGetRowData(CPU); _TestCSRGetRowData(CPU); -#ifdef DGL_USE_CUDA +#ifdef DGL_USE_ROCM _TestCSRGetRowData(GPU); _TestCSRGetRowData(GPU); #endif @@ -331,7 +331,7 @@ void _TestCSRGetData(DGLContext ctx) { TEST(SpmatTest, CSRGetData) { _TestCSRGetData(CPU); _TestCSRGetData(CPU); -#ifdef DGL_USE_CUDA +#ifdef DGL_USE_ROCM _TestCSRGetData(GPU); _TestCSRGetData(GPU); #endif @@ -359,7 +359,7 @@ void _TestCSRGetDataAndIndices(DGLContext ctx) { TEST(SpmatTest, CSRGetDataAndIndices) { _TestCSRGetDataAndIndices(CPU); _TestCSRGetDataAndIndices(CPU); -#ifdef DGL_USE_CUDA +#ifdef DGL_USE_ROCM _TestCSRGetDataAndIndices(GPU); _TestCSRGetDataAndIndices(GPU); #endif @@ -391,7 +391,7 @@ void _TestCSRTranspose(DGLContext ctx) { TEST(SpmatTest, CSRTranspose) { _TestCSRTranspose(CPU); _TestCSRTranspose(CPU); -#ifdef DGL_USE_CUDA +#ifdef DGL_USE_ROCM _TestCSRTranspose(GPU); _TestCSRTranspose(GPU); #endif @@ -437,7 +437,7 @@ void _TestCSRToCOO(DGLContext ctx) { TEST(SpmatTest, CSRToCOO) { _TestCSRToCOO(CPU); _TestCSRToCOO(CPU); -#if DGL_USE_CUDA +#if DGL_USE_ROCM _TestCSRToCOO(GPU); _TestCSRToCOO(GPU); #endif @@ -545,7 +545,7 @@ void _TestCSRSliceRows(DGLContext ctx) { TEST(SpmatTest, TestCSRSliceRows) { _TestCSRSliceRows(CPU); _TestCSRSliceRows(CPU); -#ifdef 
DGL_USE_CUDA +#ifdef DGL_USE_ROCM _TestCSRSliceRows(GPU); _TestCSRSliceRows(GPU); #endif @@ -693,7 +693,7 @@ TEST(SpmatTest, CSRSliceMatrix) { _TestCSRSliceMatrix1(CPU); _TestCSRSliceMatrix2(CPU); _TestCSRSliceMatrix2(CPU); -#ifdef DGL_USE_CUDA +#ifdef DGL_USE_ROCM _TestCSRSliceMatrix1(GPU); _TestCSRSliceMatrix1(GPU); _TestCSRSliceMatrix2(GPU); @@ -712,7 +712,7 @@ void _TestCSRHasDuplicate(DGLContext ctx) { TEST(SpmatTest, CSRHasDuplicate) { _TestCSRHasDuplicate(CPU); _TestCSRHasDuplicate(CPU); -#ifdef DGL_USE_CUDA +#ifdef DGL_USE_ROCM _TestCSRHasDuplicate(GPU); _TestCSRHasDuplicate(GPU); #endif @@ -736,7 +736,7 @@ void _TestCSRSort(DGLContext ctx) { TEST(SpmatTest, CSRSort) { _TestCSRSort(CPU); _TestCSRSort(CPU); -#ifdef DGL_USE_CUDA +#ifdef DGL_USE_ROCM _TestCSRSort(GPU); _TestCSRSort(GPU); #endif diff --git a/tests/cpp/test_spmat_csr.cc.prehip b/tests/cpp/test_spmat_csr.cc.prehip new file mode 100644 index 000000000000..4604df64cdb6 --- /dev/null +++ b/tests/cpp/test_spmat_csr.cc.prehip @@ -0,0 +1,760 @@ +#include +#include + +#include "./common.h" + +using namespace dgl; +using namespace dgl::runtime; + +namespace { + +template +aten::CSRMatrix CSR1(DGLContext ctx = CTX) { + // [[0, 1, 1, 0, 0], + // [1, 0, 0, 0, 0], + // [0, 0, 1, 1, 0], + // [0, 0, 0, 0, 0]] + // data: [0, 2, 3, 1, 4] + return aten::CSRMatrix( + 4, 5, + aten::VecToIdArray( + std::vector({0, 2, 3, 5, 5}), sizeof(IDX) * 8, ctx), + aten::VecToIdArray( + std::vector({1, 2, 0, 3, 2}), sizeof(IDX) * 8, ctx), + aten::VecToIdArray( + std::vector({0, 2, 3, 4, 1}), sizeof(IDX) * 8, ctx), + false); +} + +template +aten::CSRMatrix CSR2(DGLContext ctx = CTX) { + // has duplicate entries + // [[0, 1, 2, 0, 0], + // [1, 0, 0, 0, 0], + // [0, 0, 1, 1, 0], + // [0, 0, 0, 0, 0]] + // data: [0, 2, 5, 3, 1, 4] + return aten::CSRMatrix( + 4, 5, + aten::VecToIdArray( + std::vector({0, 3, 4, 6, 6}), sizeof(IDX) * 8, ctx), + aten::VecToIdArray( + std::vector({1, 2, 2, 0, 2, 3}), sizeof(IDX) * 8, ctx), + aten::VecToIdArray( + std::vector({0, 2, 5, 3, 1, 4}), sizeof(IDX) * 8, ctx), + false); +} + +template +aten::CSRMatrix CSR3(DGLContext ctx = CTX) { + // has duplicate entries and the columns are not sorted + // [[0, 1, 1, 1, 0, 0], + // [1, 0, 0, 0, 0, 0], + // [0, 0, 1, 1, 0, 0], + // [0, 0, 0, 0, 0, 0], + // [1, 1, 1, 0, 0, 0], + // [0, 0, 0, 1, 0, 0], + // [0, 0, 0, 0, 0, 0], + // [1, 2, 1, 1, 0, 0], + // [0, 1, 0, 0, 0, 1]], + // data: [5, 2, 0, 3, 1, 4, 8, 7, 6, 9, 12, 13, 11, 10, 14, 15, 16] + return aten::CSRMatrix( + 9, 6, + aten::VecToIdArray( + std::vector({0, 3, 4, 6, 6, 9, 10, 10, 15, 17}), sizeof(IDX) * 8, + ctx), + aten::VecToIdArray( + std::vector({3, 2, 1, 0, 2, 3, 1, 2, 0, 3, 1, 2, 1, 3, 0, 5, 1}), + sizeof(IDX) * 8, ctx), + aten::VecToIdArray( + std::vector( + {0, 2, 5, 3, 1, 4, 6, 8, 7, 9, 13, 10, 11, 14, 12, 16, 15}), + sizeof(IDX) * 8, ctx), + false); +} + +template +aten::COOMatrix COO1(DGLContext ctx = CTX) { + // [[0, 1, 1, 0, 0], + // [1, 0, 0, 0, 0], + // [0, 0, 1, 1, 0], + // [0, 0, 0, 0, 0]] + // data: [0, 2, 3, 1, 4] + // row : [0, 2, 0, 1, 2] + // col : [1, 2, 2, 0, 3] + return aten::COOMatrix( + 4, 5, + aten::VecToIdArray( + std::vector({0, 2, 0, 1, 2}), sizeof(IDX) * 8, ctx), + aten::VecToIdArray( + std::vector({1, 2, 2, 0, 3}), sizeof(IDX) * 8, ctx), + aten::VecToIdArray( + std::vector({0, 3, 1, 2, 4}), sizeof(IDX) * 8, ctx)); +} + +template +aten::COOMatrix COO2(DGLContext ctx = CTX) { + // has duplicate entries + // [[0, 1, 2, 0, 0], + // [1, 0, 0, 0, 0], + // [0, 0, 1, 1, 0], + // [0, 0, 0, 0, 0]] + // 
data: [0, 2, 5, 3, 1, 4] + // row : [0, 2, 0, 1, 2, 0] + // col : [1, 2, 2, 0, 3, 2] + return aten::COOMatrix( + 4, 5, + aten::VecToIdArray( + std::vector({0, 2, 0, 1, 2, 0}), sizeof(IDX) * 8, ctx), + aten::VecToIdArray( + std::vector({1, 2, 2, 0, 3, 2}), sizeof(IDX) * 8, ctx), + aten::VecToIdArray( + std::vector({0, 1, 2, 3, 4, 5}), sizeof(IDX) * 8, ctx)); +} + +template +aten::CSRMatrix SR_CSR3(DGLContext ctx) { + // [[0, 1, 2, 0, 0], + // [1, 0, 0, 0, 0], + // [0, 0, 1, 1, 0], + // [0, 0, 0, 0, 0]] + return aten::CSRMatrix( + 4, 5, + aten::VecToIdArray( + std::vector({0, 3, 4, 6, 6}), sizeof(IDX) * 8, ctx), + aten::VecToIdArray( + std::vector({2, 1, 2, 0, 2, 3}), sizeof(IDX) * 8, ctx), + aten::VecToIdArray( + std::vector({0, 2, 5, 3, 1, 4}), sizeof(IDX) * 8, ctx), + false); +} + +template +aten::CSRMatrix SRC_CSR3(DGLContext ctx) { + // [[0, 1, 2, 0, 0], + // [1, 0, 0, 0, 0], + // [0, 0, 1, 1, 0], + // [0, 0, 0, 0, 0]] + return aten::CSRMatrix( + 4, 5, + aten::VecToIdArray( + std::vector({0, 3, 4, 6, 6}), sizeof(IDX) * 8, ctx), + aten::VecToIdArray( + std::vector({1, 2, 2, 0, 2, 3}), sizeof(IDX) * 8, ctx), + aten::VecToIdArray( + std::vector({2, 0, 5, 3, 1, 4}), sizeof(IDX) * 8, ctx), + false); +} + +template +aten::COOMatrix COO3(DGLContext ctx) { + // has duplicate entries + // [[0, 1, 2, 0, 0], + // [1, 0, 0, 0, 0], + // [0, 0, 1, 1, 0], + // [0, 0, 0, 0, 0]] + // row : [0, 2, 0, 1, 2, 0] + // col : [2, 2, 1, 0, 3, 2] + return aten::COOMatrix( + 4, 5, + aten::VecToIdArray( + std::vector({0, 2, 0, 1, 2, 0}), sizeof(IDX) * 8, ctx), + aten::VecToIdArray( + std::vector({2, 2, 1, 0, 3, 2}), sizeof(IDX) * 8, ctx)); +} + +} // namespace + +template +void _TestCSRIsNonZero1(DGLContext ctx) { + auto csr = CSR1(ctx); + ASSERT_TRUE(aten::CSRIsNonZero(csr, 0, 1)); + ASSERT_FALSE(aten::CSRIsNonZero(csr, 0, 0)); + IdArray r = + aten::VecToIdArray(std::vector({2, 2, 0, 0}), sizeof(IDX) * 8, ctx); + IdArray c = + aten::VecToIdArray(std::vector({1, 1, 1, 3}), sizeof(IDX) * 8, ctx); + IdArray x = aten::CSRIsNonZero(csr, r, c); + IdArray tx = + aten::VecToIdArray(std::vector({0, 0, 1, 0}), sizeof(IDX) * 8, ctx); + ASSERT_TRUE(ArrayEQ(x, tx)); +} + +template +void _TestCSRIsNonZero2(DGLContext ctx) { + auto csr = CSR3(ctx); + ASSERT_TRUE(aten::CSRIsNonZero(csr, 0, 1)); + ASSERT_FALSE(aten::CSRIsNonZero(csr, 0, 0)); + IdArray r = aten::VecToIdArray( + std::vector({ + 0, + 0, + 0, + 0, + 0, + }), + sizeof(IDX) * 8, ctx); + IdArray c = aten::VecToIdArray( + std::vector({ + 0, + 1, + 2, + 3, + 4, + }), + sizeof(IDX) * 8, ctx); + IdArray x = aten::CSRIsNonZero(csr, r, c); + IdArray tx = aten::VecToIdArray( + std::vector({0, 1, 1, 1, 0}), sizeof(IDX) * 8, ctx); + ASSERT_TRUE(ArrayEQ(x, tx)) << " x = " << x << ", tx = " << tx; +} + +TEST(SpmatTest, TestCSRIsNonZero) { + _TestCSRIsNonZero1(CPU); + _TestCSRIsNonZero1(CPU); + _TestCSRIsNonZero2(CPU); + _TestCSRIsNonZero2(CPU); +#ifdef DGL_USE_CUDA + _TestCSRIsNonZero1(GPU); + _TestCSRIsNonZero1(GPU); + _TestCSRIsNonZero2(GPU); + _TestCSRIsNonZero2(GPU); +#endif +} + +template +void _TestCSRGetRowNNZ(DGLContext ctx) { + auto csr = CSR2(ctx); + ASSERT_EQ(aten::CSRGetRowNNZ(csr, 0), 3); + ASSERT_EQ(aten::CSRGetRowNNZ(csr, 3), 0); + IdArray r = + aten::VecToIdArray(std::vector({0, 3}), sizeof(IDX) * 8, ctx); + IdArray x = aten::CSRGetRowNNZ(csr, r); + IdArray tx = + aten::VecToIdArray(std::vector({3, 0}), sizeof(IDX) * 8, ctx); + ASSERT_TRUE(ArrayEQ(x, tx)); +} + +TEST(SpmatTest, TestCSRGetRowNNZ) { + _TestCSRGetRowNNZ(CPU); + _TestCSRGetRowNNZ(CPU); +#ifdef 
DGL_USE_CUDA + _TestCSRGetRowNNZ(GPU); + _TestCSRGetRowNNZ(GPU); +#endif +} + +template +void _TestCSRGetRowColumnIndices(DGLContext ctx) { + auto csr = CSR2(ctx); + auto x = aten::CSRGetRowColumnIndices(csr, 0); + auto tx = + aten::VecToIdArray(std::vector({1, 2, 2}), sizeof(IDX) * 8, ctx); + ASSERT_TRUE(ArrayEQ(x, tx)); + x = aten::CSRGetRowColumnIndices(csr, 1); + tx = aten::VecToIdArray(std::vector({0}), sizeof(IDX) * 8, ctx); + ASSERT_TRUE(ArrayEQ(x, tx)); + x = aten::CSRGetRowColumnIndices(csr, 3); + tx = aten::VecToIdArray(std::vector({}), sizeof(IDX) * 8, ctx); + ASSERT_TRUE(ArrayEQ(x, tx)); +} + +TEST(SpmatTest, TestCSRGetRowColumnIndices) { + _TestCSRGetRowColumnIndices(CPU); + _TestCSRGetRowColumnIndices(CPU); +#ifdef DGL_USE_CUDA + _TestCSRGetRowColumnIndices(GPU); + _TestCSRGetRowColumnIndices(GPU); +#endif +} + +template +void _TestCSRGetRowData(DGLContext ctx) { + auto csr = CSR2(ctx); + auto x = aten::CSRGetRowData(csr, 0); + auto tx = + aten::VecToIdArray(std::vector({0, 2, 5}), sizeof(IDX) * 8, ctx); + ASSERT_TRUE(ArrayEQ(x, tx)); + x = aten::CSRGetRowData(csr, 1); + tx = aten::VecToIdArray(std::vector({3}), sizeof(IDX) * 8, ctx); + ASSERT_TRUE(ArrayEQ(x, tx)); + x = aten::CSRGetRowData(csr, 3); + tx = aten::VecToIdArray(std::vector({}), sizeof(IDX) * 8, ctx); + ASSERT_TRUE(ArrayEQ(x, tx)); +} + +TEST(SpmatTest, TestCSRGetRowData) { + _TestCSRGetRowData(CPU); + _TestCSRGetRowData(CPU); +#ifdef DGL_USE_CUDA + _TestCSRGetRowData(GPU); + _TestCSRGetRowData(GPU); +#endif +} + +template +void _TestCSRGetData(DGLContext ctx) { + auto csr = CSR2(ctx); + // test get all data + auto x = aten::CSRGetAllData(csr, 0, 0); + auto tx = aten::VecToIdArray(std::vector({}), sizeof(IDX) * 8, ctx); + ASSERT_TRUE(ArrayEQ(x, tx)); + x = aten::CSRGetAllData(csr, 0, 2); + tx = aten::VecToIdArray(std::vector({2, 5}), sizeof(IDX) * 8, ctx); + ASSERT_TRUE(ArrayEQ(x, tx)); + + // test get data + auto r = + aten::VecToIdArray(std::vector({0, 0, 0}), sizeof(IDX) * 8, ctx); + auto c = + aten::VecToIdArray(std::vector({0, 1, 2}), sizeof(IDX) * 8, ctx); + x = aten::CSRGetData(csr, r, c); + tx = aten::VecToIdArray(std::vector({-1, 0, 2}), sizeof(IDX) * 8, ctx); + ASSERT_TRUE(ArrayEQ(x, tx)); + + // test get data on sorted + csr = aten::CSRSort(csr); + r = aten::VecToIdArray(std::vector({0, 0, 0}), sizeof(IDX) * 8, ctx); + c = aten::VecToIdArray(std::vector({0, 1, 2}), sizeof(IDX) * 8, ctx); + x = aten::CSRGetData(csr, r, c); + tx = aten::VecToIdArray(std::vector({-1, 0, 2}), sizeof(IDX) * 8, ctx); + ASSERT_TRUE(ArrayEQ(x, tx)); + + // test get data w/ broadcasting + r = aten::VecToIdArray(std::vector({0}), sizeof(IDX) * 8, ctx); + c = aten::VecToIdArray(std::vector({0, 1, 2}), sizeof(IDX) * 8, ctx); + x = aten::CSRGetData(csr, r, c); + tx = aten::VecToIdArray(std::vector({-1, 0, 2}), sizeof(IDX) * 8, ctx); + ASSERT_TRUE(ArrayEQ(x, tx)); +} + +TEST(SpmatTest, CSRGetData) { + _TestCSRGetData(CPU); + _TestCSRGetData(CPU); +#ifdef DGL_USE_CUDA + _TestCSRGetData(GPU); + _TestCSRGetData(GPU); +#endif +} + +template +void _TestCSRGetDataAndIndices(DGLContext ctx) { + auto csr = CSR2(ctx); + auto r = + aten::VecToIdArray(std::vector({0, 0, 0}), sizeof(IDX) * 8, ctx); + auto c = + aten::VecToIdArray(std::vector({0, 1, 2}), sizeof(IDX) * 8, ctx); + auto x = aten::CSRGetDataAndIndices(csr, r, c); + auto tr = + aten::VecToIdArray(std::vector({0, 0, 0}), sizeof(IDX) * 8, ctx); + auto tc = + aten::VecToIdArray(std::vector({1, 2, 2}), sizeof(IDX) * 8, ctx); + auto td = + aten::VecToIdArray(std::vector({0, 2, 5}), 
sizeof(IDX) * 8, ctx); + ASSERT_TRUE(ArrayEQ(x[0], tr)); + ASSERT_TRUE(ArrayEQ(x[1], tc)); + ASSERT_TRUE(ArrayEQ(x[2], td)); +} + +TEST(SpmatTest, CSRGetDataAndIndices) { + _TestCSRGetDataAndIndices(CPU); + _TestCSRGetDataAndIndices(CPU); +#ifdef DGL_USE_CUDA + _TestCSRGetDataAndIndices(GPU); + _TestCSRGetDataAndIndices(GPU); +#endif +} + +template +void _TestCSRTranspose(DGLContext ctx) { + auto csr = CSR2(ctx); + auto csr_t = aten::CSRTranspose(csr); + // [[0, 1, 0, 0], + // [1, 0, 0, 0], + // [2, 0, 1, 0], + // [0, 0, 1, 0], + // [0, 0, 0, 0]] + // data: [3, 0, 2, 5, 1, 4] + ASSERT_EQ(csr_t.num_rows, 5); + ASSERT_EQ(csr_t.num_cols, 4); + auto tp = aten::VecToIdArray( + std::vector({0, 1, 2, 5, 6, 6}), sizeof(IDX) * 8, ctx); + auto ti = aten::VecToIdArray( + std::vector({1, 0, 0, 0, 2, 2}), sizeof(IDX) * 8, ctx); + auto td = aten::VecToIdArray( + std::vector({3, 0, 2, 5, 1, 4}), sizeof(IDX) * 8, ctx); + ASSERT_TRUE(ArrayEQ(csr_t.indptr, tp)); + ASSERT_TRUE(ArrayEQ(csr_t.indices, ti)); + ASSERT_TRUE(ArrayEQ(csr_t.data, td)); +} + +TEST(SpmatTest, CSRTranspose) { + _TestCSRTranspose(CPU); + _TestCSRTranspose(CPU); +#ifdef DGL_USE_CUDA + _TestCSRTranspose(GPU); + _TestCSRTranspose(GPU); +#endif +} + +template +void _TestCSRToCOO(DGLContext ctx) { + auto csr = CSR2(ctx); + { + auto coo = CSRToCOO(csr, false); + ASSERT_EQ(coo.num_rows, 4); + ASSERT_EQ(coo.num_cols, 5); + ASSERT_TRUE(coo.row_sorted); + auto tr = aten::VecToIdArray( + std::vector({0, 0, 0, 1, 2, 2}), sizeof(IDX) * 8, ctx); + ASSERT_TRUE(ArrayEQ(coo.row, tr)); + ASSERT_TRUE(ArrayEQ(coo.col, csr.indices)); + ASSERT_TRUE(ArrayEQ(coo.data, csr.data)); + + // convert from sorted csr + auto s_csr = CSRSort(csr); + coo = CSRToCOO(s_csr, false); + ASSERT_EQ(coo.num_rows, 4); + ASSERT_EQ(coo.num_cols, 5); + ASSERT_TRUE(coo.row_sorted); + ASSERT_TRUE(coo.col_sorted); + tr = aten::VecToIdArray( + std::vector({0, 0, 0, 1, 2, 2}), sizeof(IDX) * 8, ctx); + ASSERT_TRUE(ArrayEQ(coo.row, tr)); + ASSERT_TRUE(ArrayEQ(coo.col, s_csr.indices)); + ASSERT_TRUE(ArrayEQ(coo.data, s_csr.data)); + } + { + auto coo = CSRToCOO(csr, true); + ASSERT_EQ(coo.num_rows, 4); + ASSERT_EQ(coo.num_cols, 5); + auto tcoo = COO2(ctx); + ASSERT_TRUE(ArrayEQ(coo.row, tcoo.row)); + ASSERT_TRUE(ArrayEQ(coo.col, tcoo.col)); + } +} + +TEST(SpmatTest, CSRToCOO) { + _TestCSRToCOO(CPU); + _TestCSRToCOO(CPU); +#if DGL_USE_CUDA + _TestCSRToCOO(GPU); + _TestCSRToCOO(GPU); +#endif +} + +template +void _TestCSRSliceRows(DGLContext ctx) { + auto csr = CSR2(ctx); + auto x = aten::CSRSliceRows(csr, 1, 4); + // [1, 0, 0, 0, 0], + // [0, 0, 1, 1, 0], + // [0, 0, 0, 0, 0]] + // data: [3, 1, 4] + ASSERT_EQ(x.num_rows, 3); + ASSERT_EQ(x.num_cols, 5); + auto tp = + aten::VecToIdArray(std::vector({0, 1, 3, 3}), sizeof(IDX) * 8, ctx); + auto ti = + aten::VecToIdArray(std::vector({0, 2, 3}), sizeof(IDX) * 8, ctx); + auto td = + aten::VecToIdArray(std::vector({3, 1, 4}), sizeof(IDX) * 8, ctx); + ASSERT_TRUE(ArrayEQ(x.indptr, tp)); + ASSERT_TRUE(ArrayEQ(x.indices, ti)); + ASSERT_TRUE(ArrayEQ(x.data, td)); + + auto r = + aten::VecToIdArray(std::vector({0, 1, 3}), sizeof(IDX) * 8, ctx); + x = aten::CSRSliceRows(csr, r); + // [[0, 1, 2, 0, 0], + // [1, 0, 0, 0, 0], + // [0, 0, 0, 0, 0]] + // data: [0, 2, 5, 3] + tp = aten::VecToIdArray(std::vector({0, 3, 4, 4}), sizeof(IDX) * 8, ctx); + ti = aten::VecToIdArray(std::vector({1, 2, 2, 0}), sizeof(IDX) * 8, ctx); + td = aten::VecToIdArray(std::vector({0, 2, 5, 3}), sizeof(IDX) * 8, ctx); + ASSERT_TRUE(ArrayEQ(x.indptr, tp)); + 
ASSERT_TRUE(ArrayEQ(x.indices, ti)); + ASSERT_TRUE(ArrayEQ(x.data, td)); + + // Testing non-increasing row id based slicing + r = aten::VecToIdArray(std::vector({3, 2, 1}), sizeof(IDX) * 8, ctx); + x = aten::CSRSliceRows(csr, r); + // [[0, 0, 0, 0, 0], + // [0, 0, 1, 1, 0], + // [1, 0, 0, 0, 0]] + // data: [1, 4, 3] + tp = aten::VecToIdArray(std::vector({0, 0, 2, 3}), sizeof(IDX) * 8, ctx); + ti = aten::VecToIdArray(std::vector({2, 3, 0}), sizeof(IDX) * 8, ctx); + td = aten::VecToIdArray(std::vector({1, 4, 3}), sizeof(IDX) * 8, ctx); + ASSERT_TRUE(ArrayEQ(x.indptr, tp)); + ASSERT_TRUE(ArrayEQ(x.indices, ti)); + ASSERT_TRUE(ArrayEQ(x.data, td)); + + // Testing zero-degree row slicing with different rows + r = aten::VecToIdArray( + std::vector({1, 3, 0, 3, 2}), sizeof(IDX) * 8, ctx); + x = aten::CSRSliceRows(csr, r); + // [[1, 0, 0, 0, 0], + // [0, 0, 0, 0, 0], + // [0, 1, 2, 0, 0], + // [0, 0, 0, 0, 0], + // [0, 0, 1, 1, 0]] + // data: [3, 0, 2, 5, 1, 4] + tp = aten::VecToIdArray( + std::vector({0, 1, 1, 4, 4, 6}), sizeof(IDX) * 8, ctx); + ti = aten::VecToIdArray( + std::vector({0, 1, 2, 2, 2, 3}), sizeof(IDX) * 8, ctx); + td = aten::VecToIdArray( + std::vector({3, 0, 2, 5, 1, 4}), sizeof(IDX) * 8, ctx); + ASSERT_TRUE(ArrayEQ(x.indptr, tp)); + ASSERT_TRUE(ArrayEQ(x.indices, ti)); + ASSERT_TRUE(ArrayEQ(x.data, td)); + + // Testing empty output (i.e. sliced rows will be zero-degree) + r = aten::VecToIdArray(std::vector({3, 3, 3}), sizeof(IDX) * 8, ctx); + x = aten::CSRSliceRows(csr, r); + // [[0, 0, 0, 0, 0], + // [0, 0, 0, 0, 0], + // [0, 0, 0, 0, 0]] + // data: [] + tp = aten::VecToIdArray(std::vector({0, 0, 0, 0}), sizeof(IDX) * 8, ctx); + ti = aten::VecToIdArray(std::vector({}), sizeof(IDX) * 8, ctx); + td = aten::VecToIdArray(std::vector({}), sizeof(IDX) * 8, ctx); + ASSERT_TRUE(ArrayEQ(x.indptr, tp)); + ASSERT_TRUE(ArrayEQ(x.indices, ti)); + ASSERT_TRUE(ArrayEQ(x.data, td)); + + // Testing constant output: we pick last row with at least one nnz + r = aten::VecToIdArray(std::vector({2, 2, 2}), sizeof(IDX) * 8, ctx); + x = aten::CSRSliceRows(csr, r); + // [[0, 0, 1, 1, 0], + // [0, 0, 1, 1, 0], + // [0, 0, 1, 1, 0]] + // data: [1, 4, 1, 4, 1, 4] + tp = aten::VecToIdArray(std::vector({0, 2, 4, 6}), sizeof(IDX) * 8, ctx); + ti = aten::VecToIdArray( + std::vector({2, 3, 2, 3, 2, 3}), sizeof(IDX) * 8, ctx); + td = aten::VecToIdArray( + std::vector({1, 4, 1, 4, 1, 4}), sizeof(IDX) * 8, ctx); + ASSERT_TRUE(ArrayEQ(x.indptr, tp)); + ASSERT_TRUE(ArrayEQ(x.indices, ti)); + ASSERT_TRUE(ArrayEQ(x.data, td)); +} + +TEST(SpmatTest, TestCSRSliceRows) { + _TestCSRSliceRows(CPU); + _TestCSRSliceRows(CPU); +#ifdef DGL_USE_CUDA + _TestCSRSliceRows(GPU); + _TestCSRSliceRows(GPU); +#endif +} + +template +void _TestCSRSliceMatrix1(DGLContext ctx) { + auto csr = CSR2(ctx); + { + // square + auto r = + aten::VecToIdArray(std::vector({0, 1, 3}), sizeof(IDX) * 8, ctx); + auto c = + aten::VecToIdArray(std::vector({1, 2, 3}), sizeof(IDX) * 8, ctx); + auto x = aten::CSRSliceMatrix(csr, r, c); + // [[1, 2, 0], + // [0, 0, 0], + // [0, 0, 0]] + // data: [0, 2, 5] + ASSERT_EQ(x.num_rows, 3); + ASSERT_EQ(x.num_cols, 3); + auto tp = aten::VecToIdArray( + std::vector({0, 3, 3, 3}), sizeof(IDX) * 8, ctx); + auto ti = + aten::VecToIdArray(std::vector({0, 1, 1}), sizeof(IDX) * 8, ctx); + auto td = + aten::VecToIdArray(std::vector({0, 2, 5}), sizeof(IDX) * 8, ctx); + ASSERT_TRUE(ArrayEQ(x.indptr, tp)); + ASSERT_TRUE(ArrayEQ(x.indices, ti)); + ASSERT_TRUE(ArrayEQ(x.data, td)); + } + { + // non-square + auto r = + 
aten::VecToIdArray(std::vector({0, 1, 2}), sizeof(IDX) * 8, ctx); + auto c = aten::VecToIdArray(std::vector({0, 1}), sizeof(IDX) * 8, ctx); + auto x = aten::CSRSliceMatrix(csr, r, c); + // [[0, 1], + // [1, 0], + // [0, 0]] + // data: [0, 3] + ASSERT_EQ(x.num_rows, 3); + ASSERT_EQ(x.num_cols, 2); + auto tp = aten::VecToIdArray( + std::vector({0, 1, 2, 2}), sizeof(IDX) * 8, ctx); + auto ti = + aten::VecToIdArray(std::vector({1, 0}), sizeof(IDX) * 8, ctx); + auto td = + aten::VecToIdArray(std::vector({0, 3}), sizeof(IDX) * 8, ctx); + ASSERT_TRUE(ArrayEQ(x.indptr, tp)); + ASSERT_TRUE(ArrayEQ(x.indices, ti)); + ASSERT_TRUE(ArrayEQ(x.data, td)); + } + { + // empty slice + auto r = aten::VecToIdArray(std::vector({2, 3}), sizeof(IDX) * 8, ctx); + auto c = aten::VecToIdArray(std::vector({0, 1}), sizeof(IDX) * 8, ctx); + auto x = aten::CSRSliceMatrix(csr, r, c); + // [[0, 0], + // [0, 0]] + // data: [] + ASSERT_EQ(x.num_rows, 2); + ASSERT_EQ(x.num_cols, 2); + auto tp = + aten::VecToIdArray(std::vector({0, 0, 0}), sizeof(IDX) * 8, ctx); + auto ti = aten::VecToIdArray(std::vector({}), sizeof(IDX) * 8, ctx); + auto td = aten::VecToIdArray(std::vector({}), sizeof(IDX) * 8, ctx); + ASSERT_TRUE(ArrayEQ(x.indptr, tp)); + ASSERT_TRUE(ArrayEQ(x.indices, ti)); + ASSERT_TRUE(ArrayEQ(x.data, td)); + } +} + +template +void _TestCSRSliceMatrix2(DGLContext ctx) { + auto csr = CSR3(ctx); + { + // square + auto r = + aten::VecToIdArray(std::vector({0, 1, 3}), sizeof(IDX) * 8, ctx); + auto c = + aten::VecToIdArray(std::vector({1, 2, 3}), sizeof(IDX) * 8, ctx); + auto x = aten::CSRSliceMatrix(csr, r, c); + // [[1, 1, 1], + // [0, 0, 0], + // [0, 0, 0]] + // data: [5, 2, 0] + ASSERT_EQ(x.num_rows, 3); + ASSERT_EQ(x.num_cols, 3); + auto tp = aten::VecToIdArray( + std::vector({0, 3, 3, 3}), sizeof(IDX) * 8, ctx); + // indexes are in reverse order in CSR3 + auto ti = + aten::VecToIdArray(std::vector({2, 1, 0}), sizeof(IDX) * 8, ctx); + auto td = + aten::VecToIdArray(std::vector({0, 2, 5}), sizeof(IDX) * 8, ctx); + ASSERT_TRUE(ArrayEQ(x.indptr, tp)); + ASSERT_TRUE(ArrayEQ(x.indices, ti)); + ASSERT_TRUE(ArrayEQ(x.data, td)); + } + { + // non-square + auto r = + aten::VecToIdArray(std::vector({0, 1, 2}), sizeof(IDX) * 8, ctx); + auto c = aten::VecToIdArray(std::vector({0, 1}), sizeof(IDX) * 8, ctx); + auto x = aten::CSRSliceMatrix(csr, r, c); + // [[0, 1], + // [1, 0], + // [0, 0]] + // data: [0, 3] + ASSERT_EQ(x.num_rows, 3); + ASSERT_EQ(x.num_cols, 2); + auto tp = aten::VecToIdArray( + std::vector({0, 1, 2, 2}), sizeof(IDX) * 8, ctx); + auto ti = + aten::VecToIdArray(std::vector({1, 0}), sizeof(IDX) * 8, ctx); + auto td = + aten::VecToIdArray(std::vector({5, 3}), sizeof(IDX) * 8, ctx); + ASSERT_TRUE(ArrayEQ(x.indptr, tp)); + ASSERT_TRUE(ArrayEQ(x.indices, ti)); + ASSERT_TRUE(ArrayEQ(x.data, td)); + } + { + // empty slice + auto r = aten::VecToIdArray(std::vector({2, 3}), sizeof(IDX) * 8, ctx); + auto c = aten::VecToIdArray(std::vector({0, 1}), sizeof(IDX) * 8, ctx); + auto x = aten::CSRSliceMatrix(csr, r, c); + // [[0, 0], + // [0, 0]] + // data: [] + ASSERT_EQ(x.num_rows, 2); + ASSERT_EQ(x.num_cols, 2); + auto tp = + aten::VecToIdArray(std::vector({0, 0, 0}), sizeof(IDX) * 8, ctx); + auto ti = aten::VecToIdArray(std::vector({}), sizeof(IDX) * 8, ctx); + auto td = aten::VecToIdArray(std::vector({}), sizeof(IDX) * 8, ctx); + ASSERT_TRUE(ArrayEQ(x.indptr, tp)); + ASSERT_TRUE(ArrayEQ(x.indices, ti)); + ASSERT_TRUE(ArrayEQ(x.data, td)); + } +} + +TEST(SpmatTest, CSRSliceMatrix) { + _TestCSRSliceMatrix1(CPU); + 
_TestCSRSliceMatrix1(CPU); + _TestCSRSliceMatrix2(CPU); + _TestCSRSliceMatrix2(CPU); +#ifdef DGL_USE_CUDA + _TestCSRSliceMatrix1(GPU); + _TestCSRSliceMatrix1(GPU); + _TestCSRSliceMatrix2(GPU); + _TestCSRSliceMatrix2(GPU); +#endif +} + +template +void _TestCSRHasDuplicate(DGLContext ctx) { + auto csr = CSR1(ctx); + ASSERT_FALSE(aten::CSRHasDuplicate(csr)); + csr = CSR2(ctx); + ASSERT_TRUE(aten::CSRHasDuplicate(csr)); +} + +TEST(SpmatTest, CSRHasDuplicate) { + _TestCSRHasDuplicate(CPU); + _TestCSRHasDuplicate(CPU); +#ifdef DGL_USE_CUDA + _TestCSRHasDuplicate(GPU); + _TestCSRHasDuplicate(GPU); +#endif +} + +template +void _TestCSRSort(DGLContext ctx) { + auto csr = CSR1(ctx); + ASSERT_FALSE(aten::CSRIsSorted(csr)); + auto csr1 = aten::CSRSort(csr); + ASSERT_FALSE(aten::CSRIsSorted(csr)); + ASSERT_TRUE(aten::CSRIsSorted(csr1)); + ASSERT_TRUE(csr1.sorted); + aten::CSRSort_(&csr); + ASSERT_TRUE(aten::CSRIsSorted(csr)); + ASSERT_TRUE(csr.sorted); + csr = CSR2(ctx); + ASSERT_TRUE(aten::CSRIsSorted(csr)); +} + +TEST(SpmatTest, CSRSort) { + _TestCSRSort(CPU); + _TestCSRSort(CPU); +#ifdef DGL_USE_CUDA + _TestCSRSort(GPU); + _TestCSRSort(GPU); +#endif +} + +template +void _TestCSRReorder() { + auto csr = CSR2(); + auto new_row = + aten::VecToIdArray(std::vector({2, 0, 3, 1}), sizeof(IDX) * 8, CTX); + auto new_col = aten::VecToIdArray( + std::vector({2, 0, 4, 3, 1}), sizeof(IDX) * 8, CTX); + auto new_csr = CSRReorder(csr, new_row, new_col); + ASSERT_EQ(new_csr.num_rows, csr.num_rows); + ASSERT_EQ(new_csr.num_cols, csr.num_cols); +} + +TEST(SpmatTest, TestCSRReorder) { + _TestCSRReorder(); + _TestCSRReorder(); +} diff --git a/tests/cpp/test_unit_graph.cc b/tests/cpp/test_unit_graph.cc index 20cc1ebe24cf..8371321df953 100644 --- a/tests/cpp/test_unit_graph.cc +++ b/tests/cpp/test_unit_graph.cc @@ -369,7 +369,7 @@ void _TestUnitGraph_CopyTo( TEST(UniGraphTest, TestUnitGraph_CopyTo) { _TestUnitGraph_CopyTo(CPU, CPU); _TestUnitGraph_CopyTo(CPU, CPU); -#ifdef DGL_USE_CUDA +#ifdef DGL_USE_ROCM _TestUnitGraph_CopyTo(CPU, GPU); _TestUnitGraph_CopyTo(GPU, GPU); _TestUnitGraph_CopyTo(GPU, CPU); @@ -382,7 +382,7 @@ TEST(UniGraphTest, TestUnitGraph_CopyTo) { TEST(UniGraphTest, TestUnitGraph_InOutDegrees) { _TestUnitGraph_InOutDegrees(CPU); _TestUnitGraph_InOutDegrees(CPU); -#ifdef DGL_USE_CUDA +#ifdef DGL_USE_ROCM _TestUnitGraph_InOutDegrees(GPU); _TestUnitGraph_InOutDegrees(GPU); #endif @@ -391,7 +391,7 @@ TEST(UniGraphTest, TestUnitGraph_InOutDegrees) { TEST(UniGraphTest, TestUnitGraph_Create) { _TestUnitGraph(CPU); _TestUnitGraph(CPU); -#ifdef DGL_USE_CUDA +#ifdef DGL_USE_ROCM _TestUnitGraph(GPU); _TestUnitGraph(GPU); #endif @@ -400,7 +400,7 @@ TEST(UniGraphTest, TestUnitGraph_Create) { TEST(UniGraphTest, TestUnitGraph_GetInCSR) { _TestUnitGraph_GetInCSR(CPU); _TestUnitGraph_GetInCSR(CPU); -#ifdef DGL_USE_CUDA +#ifdef DGL_USE_ROCM _TestUnitGraph_GetInCSR(GPU); _TestUnitGraph_GetInCSR(GPU); #endif @@ -409,7 +409,7 @@ TEST(UniGraphTest, TestUnitGraph_GetInCSR) { TEST(UniGraphTest, TestUnitGraph_GetOutCSR) { _TestUnitGraph_GetOutCSR(CPU); _TestUnitGraph_GetOutCSR(CPU); -#ifdef DGL_USE_CUDA +#ifdef DGL_USE_ROCM _TestUnitGraph_GetOutCSR(GPU); _TestUnitGraph_GetOutCSR(GPU); #endif @@ -418,7 +418,7 @@ TEST(UniGraphTest, TestUnitGraph_GetOutCSR) { TEST(UniGraphTest, TestUnitGraph_GetCOO) { _TestUnitGraph_GetCOO(CPU); _TestUnitGraph_GetCOO(CPU); -#ifdef DGL_USE_CUDA +#ifdef DGL_USE_ROCM _TestUnitGraph_GetCOO(GPU); _TestUnitGraph_GetCOO(GPU); #endif @@ -427,7 +427,7 @@ TEST(UniGraphTest, TestUnitGraph_GetCOO) { 
TEST(UniGraphTest, TestUnitGraph_Reserve) { _TestUnitGraph_Reserve(CPU); _TestUnitGraph_Reserve(CPU); -#ifdef DGL_USE_CUDA +#ifdef DGL_USE_ROCM _TestUnitGraph_Reserve(GPU); _TestUnitGraph_Reserve(GPU); #endif diff --git a/tests/cpp/test_unit_graph.cc.prehip b/tests/cpp/test_unit_graph.cc.prehip new file mode 100644 index 000000000000..20cc1ebe24cf --- /dev/null +++ b/tests/cpp/test_unit_graph.cc.prehip @@ -0,0 +1,434 @@ +/** + * Copyright (c) 2019 by Contributors + * @file test_unit_graph.cc + * @brief Test UnitGraph + */ +#include +#include +#include +#include + +#include +#include + +#include "../../src/graph/unit_graph.h" +#include "./../src/graph/heterograph.h" +#include "./common.h" + +using namespace dgl; +using namespace dgl::runtime; + +template +aten::CSRMatrix CSR1(DGLContext ctx) { + /** + * G = [[0, 0, 1], + * [1, 0, 1], + * [0, 1, 0], + * [1, 0, 1]] + */ + IdArray g_indptr = aten::VecToIdArray( + std::vector({0, 1, 3, 4, 6}), sizeof(IdType) * 8, CTX); + IdArray g_indices = aten::VecToIdArray( + std::vector({2, 0, 2, 1, 0, 2}), sizeof(IdType) * 8, CTX); + + const aten::CSRMatrix &csr_a = + aten::CSRMatrix(4, 3, g_indptr, g_indices, aten::NullArray(), false); + return csr_a; +} + +template aten::CSRMatrix CSR1(DGLContext ctx); +template aten::CSRMatrix CSR1(DGLContext ctx); + +template +aten::COOMatrix COO1(DGLContext ctx) { + /** + * G = [[1, 1, 0], + * [0, 1, 0]] + */ + IdArray g_row = aten::VecToIdArray( + std::vector({0, 0, 1}), sizeof(IdType) * 8, CTX); + IdArray g_col = aten::VecToIdArray( + std::vector({0, 1, 1}), sizeof(IdType) * 8, CTX); + const aten::COOMatrix &coo = + aten::COOMatrix(2, 3, g_row, g_col, aten::NullArray(), true, true); + + return coo; +} + +template aten::COOMatrix COO1(DGLContext ctx); +template aten::COOMatrix COO1(DGLContext ctx); + +template +void _TestUnitGraph_InOutDegrees(DGLContext ctx) { + /** + InDegree(s) is available only if COO or CSC formats permitted. + OutDegree(s) is available only if COO or CSR formats permitted. 
+ */ + + // COO + { + const aten::COOMatrix &coo = COO1(ctx); + auto &&g = CreateFromCOO(2, coo, COO_CODE); + ASSERT_EQ(g->InDegree(0, 0), 1); + auto &&nids = aten::Range(0, g->NumVertices(0), g->NumBits(), g->Context()); + ASSERT_TRUE(ArrayEQ( + g->InDegrees(0, nids), + aten::VecToIdArray({1, 2}, g->NumBits(), g->Context()))); + ASSERT_EQ(g->OutDegree(0, 0), 2); + ASSERT_TRUE(ArrayEQ( + g->OutDegrees(0, nids), + aten::VecToIdArray({2, 1}, g->NumBits(), g->Context()))); + } + // CSC + { + const aten::CSRMatrix &csr = CSR1(ctx); + auto &&g = CreateFromCSC(2, csr, CSC_CODE); + ASSERT_EQ(g->InDegree(0, 0), 1); + auto &&nids = aten::Range(0, g->NumVertices(0), g->NumBits(), g->Context()); + ASSERT_TRUE(ArrayEQ( + g->InDegrees(0, nids), + aten::VecToIdArray({1, 2, 1}, g->NumBits(), g->Context()))); + EXPECT_ANY_THROW(g->OutDegree(0, 0)); + EXPECT_ANY_THROW(g->OutDegrees(0, nids)); + } + // CSR + { + const aten::CSRMatrix &csr = CSR1(ctx); + auto &&g = CreateFromCSR(2, csr, CSR_CODE); + ASSERT_EQ(g->OutDegree(0, 0), 1); + auto &&nids = aten::Range(0, g->NumVertices(0), g->NumBits(), g->Context()); + ASSERT_TRUE(ArrayEQ( + g->OutDegrees(0, nids), + aten::VecToIdArray({1, 2, 1, 2}, g->NumBits(), g->Context()))); + EXPECT_ANY_THROW(g->InDegree(0, 0)); + EXPECT_ANY_THROW(g->InDegrees(0, nids)); + } +} + +template +void _TestUnitGraph(DGLContext ctx) { + const aten::CSRMatrix &csr = CSR1(ctx); + const aten::COOMatrix &coo = COO1(ctx); + + auto g = CreateFromCSC(2, csr); + ASSERT_EQ(g->GetCreatedFormats(), 4); + + g = CreateFromCSR(2, csr); + ASSERT_EQ(g->GetCreatedFormats(), 2); + + g = CreateFromCOO(2, coo); + ASSERT_EQ(g->GetCreatedFormats(), 1); + + auto src = aten::VecToIdArray({1, 2, 5, 3}); + auto dst = aten::VecToIdArray({1, 6, 2, 6}); + auto mg = dgl::UnitGraph::CreateFromCOO(2, 9, 8, src, dst, COO_CODE); + ASSERT_EQ(mg->GetCreatedFormats(), 1); + auto hmg = dgl::UnitGraph::CreateFromCOO(1, 8, 8, src, dst, COO_CODE); + auto img = std::dynamic_pointer_cast(hmg->AsImmutableGraph()); + ASSERT_TRUE(img != nullptr); + mg = dgl::UnitGraph::CreateFromCOO(2, 9, 8, src, dst, CSR_CODE | COO_CODE); + ASSERT_EQ(mg->GetCreatedFormats(), 1); + hmg = dgl::UnitGraph::CreateFromCOO(1, 8, 8, src, dst, CSR_CODE | COO_CODE); + img = std::dynamic_pointer_cast(hmg->AsImmutableGraph()); + ASSERT_TRUE(img != nullptr); + mg = dgl::UnitGraph::CreateFromCOO(2, 9, 8, src, dst, CSC_CODE | COO_CODE); + ASSERT_EQ(mg->GetCreatedFormats(), 1); + hmg = dgl::UnitGraph::CreateFromCOO(1, 8, 8, src, dst, CSC_CODE | COO_CODE); + img = std::dynamic_pointer_cast(hmg->AsImmutableGraph()); + ASSERT_TRUE(img != nullptr); + + g = CreateFromCSC(2, csr); + ASSERT_EQ(g->GetCreatedFormats(), 4); + + g = CreateFromCSR(2, csr); + ASSERT_EQ(g->GetCreatedFormats(), 2); + + g = CreateFromCOO(2, coo); + ASSERT_EQ(g->GetCreatedFormats(), 1); +} + +template +void _TestUnitGraph_GetInCSR(DGLContext ctx) { + const aten::CSRMatrix &csr = CSR1(ctx); + const aten::COOMatrix &coo = COO1(ctx); + + auto g = CreateFromCSC(2, csr); + auto in_csr_matrix = g->GetCSCMatrix(0); + ASSERT_EQ(in_csr_matrix.num_rows, csr.num_rows); + ASSERT_EQ(in_csr_matrix.num_cols, csr.num_cols); + ASSERT_EQ(g->GetCreatedFormats(), 4); + + // test out csr + g = CreateFromCSR(2, csr); + auto g_ptr = g->GetGraphInFormat(CSC_CODE); + in_csr_matrix = g_ptr->GetCSCMatrix(0); + ASSERT_EQ(in_csr_matrix.num_cols, csr.num_rows); + ASSERT_EQ(in_csr_matrix.num_rows, csr.num_cols); + ASSERT_EQ(g->GetCreatedFormats(), 2); + in_csr_matrix = g->GetCSCMatrix(0); + 
ASSERT_EQ(in_csr_matrix.num_cols, csr.num_rows); + ASSERT_EQ(in_csr_matrix.num_rows, csr.num_cols); + ASSERT_EQ(g->GetCreatedFormats(), 6); + + // test out coo + g = CreateFromCOO(2, coo); + g_ptr = g->GetGraphInFormat(CSC_CODE); + in_csr_matrix = g_ptr->GetCSCMatrix(0); + ASSERT_EQ(in_csr_matrix.num_cols, coo.num_rows); + ASSERT_EQ(in_csr_matrix.num_rows, coo.num_cols); + ASSERT_EQ(g->GetCreatedFormats(), 1); + + in_csr_matrix = g->GetCSCMatrix(0); + ASSERT_EQ(in_csr_matrix.num_cols, coo.num_rows); + ASSERT_EQ(in_csr_matrix.num_rows, coo.num_cols); + ASSERT_EQ(g->GetCreatedFormats(), 5); +} + +template +void _TestUnitGraph_GetOutCSR(DGLContext ctx) { + const aten::CSRMatrix &csr = CSR1(ctx); + const aten::COOMatrix &coo = COO1(ctx); + + auto g = CreateFromCSC(2, csr); + auto g_ptr = g->GetGraphInFormat(CSR_CODE); + auto out_csr_matrix = g_ptr->GetCSRMatrix(0); + ASSERT_EQ(out_csr_matrix.num_cols, csr.num_rows); + ASSERT_EQ(out_csr_matrix.num_rows, csr.num_cols); + ASSERT_EQ(g->GetCreatedFormats(), 4); + out_csr_matrix = g->GetCSRMatrix(0); + ASSERT_EQ(out_csr_matrix.num_cols, csr.num_rows); + ASSERT_EQ(out_csr_matrix.num_rows, csr.num_cols); + ASSERT_EQ(g->GetCreatedFormats(), 6); + + // test out csr + g = CreateFromCSR(2, csr); + out_csr_matrix = g->GetCSRMatrix(0); + ASSERT_EQ(out_csr_matrix.num_rows, csr.num_rows); + ASSERT_EQ(out_csr_matrix.num_cols, csr.num_cols); + ASSERT_EQ(g->GetCreatedFormats(), 2); + + // test out coo + g = CreateFromCOO(2, coo); + g_ptr = g->GetGraphInFormat(CSR_CODE); + out_csr_matrix = g_ptr->GetCSRMatrix(0); + ASSERT_EQ(out_csr_matrix.num_rows, coo.num_rows); + ASSERT_EQ(out_csr_matrix.num_cols, coo.num_cols); + ASSERT_EQ(g->GetCreatedFormats(), 1); + + out_csr_matrix = g->GetCSRMatrix(0); + ASSERT_EQ(out_csr_matrix.num_rows, coo.num_rows); + ASSERT_EQ(out_csr_matrix.num_cols, coo.num_cols); + ASSERT_EQ(g->GetCreatedFormats(), 3); +} + +template +void _TestUnitGraph_GetCOO(DGLContext ctx) { + const aten::CSRMatrix &csr = CSR1(ctx); + const aten::COOMatrix &coo = COO1(ctx); + + auto g = CreateFromCSC(2, csr); + auto g_ptr = g->GetGraphInFormat(COO_CODE); + auto out_coo_matrix = g_ptr->GetCOOMatrix(0); + ASSERT_EQ(out_coo_matrix.num_cols, csr.num_rows); + ASSERT_EQ(out_coo_matrix.num_rows, csr.num_cols); + ASSERT_EQ(g->GetCreatedFormats(), 4); + out_coo_matrix = g->GetCOOMatrix(0); + ASSERT_EQ(out_coo_matrix.num_cols, csr.num_rows); + ASSERT_EQ(out_coo_matrix.num_rows, csr.num_cols); + ASSERT_EQ(g->GetCreatedFormats(), 5); + + // test out csr + g = CreateFromCSR(2, csr); + g_ptr = g->GetGraphInFormat(COO_CODE); + out_coo_matrix = g_ptr->GetCOOMatrix(0); + ASSERT_EQ(out_coo_matrix.num_rows, csr.num_rows); + ASSERT_EQ(out_coo_matrix.num_cols, csr.num_cols); + ASSERT_EQ(g->GetCreatedFormats(), 2); + out_coo_matrix = g->GetCOOMatrix(0); + ASSERT_EQ(out_coo_matrix.num_rows, csr.num_rows); + ASSERT_EQ(out_coo_matrix.num_cols, csr.num_cols); + ASSERT_EQ(g->GetCreatedFormats(), 3); + + // test out coo + g = CreateFromCOO(2, coo); + out_coo_matrix = g->GetCOOMatrix(0); + ASSERT_EQ(out_coo_matrix.num_rows, coo.num_rows); + ASSERT_EQ(out_coo_matrix.num_cols, coo.num_cols); + ASSERT_EQ(g->GetCreatedFormats(), 1); +} + +template +void _TestUnitGraph_Reserve(DGLContext ctx) { + const aten::CSRMatrix &csr = CSR1(ctx); + const aten::COOMatrix &coo = COO1(ctx); + + auto g = CreateFromCSC(2, csr); + ASSERT_EQ(g->GetCreatedFormats(), 4); + auto r_g = + std::dynamic_pointer_cast(g->GetRelationGraph(0))->Reverse(); + ASSERT_EQ(r_g->GetCreatedFormats(), 2); + aten::CSRMatrix 
g_in_csr = g->GetCSCMatrix(0); + aten::CSRMatrix r_g_out_csr = r_g->GetCSRMatrix(0); + ASSERT_TRUE(g_in_csr.indptr->data == r_g_out_csr.indptr->data); + ASSERT_TRUE(g_in_csr.indices->data == r_g_out_csr.indices->data); + aten::CSRMatrix g_out_csr = g->GetCSRMatrix(0); + ASSERT_EQ(g->GetCreatedFormats(), 6); + ASSERT_EQ(r_g->GetCreatedFormats(), 6); + aten::CSRMatrix r_g_in_csr = r_g->GetCSCMatrix(0); + ASSERT_TRUE(g_out_csr.indptr->data == r_g_in_csr.indptr->data); + ASSERT_TRUE(g_out_csr.indices->data == r_g_in_csr.indices->data); + aten::COOMatrix g_coo = g->GetCOOMatrix(0); + ASSERT_EQ(g->GetCreatedFormats(), 7); + ASSERT_EQ(r_g->GetCreatedFormats(), 6); + aten::COOMatrix r_g_coo = r_g->GetCOOMatrix(0); + ASSERT_EQ(r_g->GetCreatedFormats(), 7); + ASSERT_EQ(g_coo.num_rows, r_g_coo.num_cols); + ASSERT_EQ(g_coo.num_cols, r_g_coo.num_rows); + ASSERT_TRUE(ArrayEQ(g_coo.row, r_g_coo.col)); + ASSERT_TRUE(ArrayEQ(g_coo.col, r_g_coo.row)); + + // test out csr + g = CreateFromCSR(2, csr); + ASSERT_EQ(g->GetCreatedFormats(), 2); + r_g = std::dynamic_pointer_cast(g->GetRelationGraph(0))->Reverse(); + ASSERT_EQ(r_g->GetCreatedFormats(), 4); + g_out_csr = g->GetCSRMatrix(0); + r_g_in_csr = r_g->GetCSCMatrix(0); + ASSERT_TRUE(g_out_csr.indptr->data == r_g_in_csr.indptr->data); + ASSERT_TRUE(g_out_csr.indices->data == r_g_in_csr.indices->data); + g_in_csr = g->GetCSCMatrix(0); + ASSERT_EQ(g->GetCreatedFormats(), 6); + ASSERT_EQ(r_g->GetCreatedFormats(), 6); + r_g_out_csr = r_g->GetCSRMatrix(0); + ASSERT_TRUE(g_in_csr.indptr->data == r_g_out_csr.indptr->data); + ASSERT_TRUE(g_in_csr.indices->data == r_g_out_csr.indices->data); + g_coo = g->GetCOOMatrix(0); + ASSERT_EQ(g->GetCreatedFormats(), 7); + ASSERT_EQ(r_g->GetCreatedFormats(), 6); + r_g_coo = r_g->GetCOOMatrix(0); + ASSERT_EQ(r_g->GetCreatedFormats(), 7); + ASSERT_EQ(g_coo.num_rows, r_g_coo.num_cols); + ASSERT_EQ(g_coo.num_cols, r_g_coo.num_rows); + ASSERT_TRUE(ArrayEQ(g_coo.row, r_g_coo.col)); + ASSERT_TRUE(ArrayEQ(g_coo.col, r_g_coo.row)); + + // test out coo + g = CreateFromCOO(2, coo); + ASSERT_EQ(g->GetCreatedFormats(), 1); + r_g = std::dynamic_pointer_cast(g->GetRelationGraph(0))->Reverse(); + ASSERT_EQ(r_g->GetCreatedFormats(), 1); + g_coo = g->GetCOOMatrix(0); + r_g_coo = r_g->GetCOOMatrix(0); + ASSERT_EQ(g_coo.num_rows, r_g_coo.num_cols); + ASSERT_EQ(g_coo.num_cols, r_g_coo.num_rows); + ASSERT_TRUE(g_coo.row->data == r_g_coo.col->data); + ASSERT_TRUE(g_coo.col->data == r_g_coo.row->data); + g_in_csr = g->GetCSCMatrix(0); + ASSERT_EQ(g->GetCreatedFormats(), 5); + ASSERT_EQ(r_g->GetCreatedFormats(), 3); + r_g_out_csr = r_g->GetCSRMatrix(0); + ASSERT_TRUE(g_in_csr.indptr->data == r_g_out_csr.indptr->data); + ASSERT_TRUE(g_in_csr.indices->data == r_g_out_csr.indices->data); + g_out_csr = g->GetCSRMatrix(0); + ASSERT_EQ(g->GetCreatedFormats(), 7); + ASSERT_EQ(r_g->GetCreatedFormats(), 7); + r_g_in_csr = r_g->GetCSCMatrix(0); + ASSERT_TRUE(g_out_csr.indptr->data == r_g_in_csr.indptr->data); + ASSERT_TRUE(g_out_csr.indices->data == r_g_in_csr.indices->data); +} + +template +void _TestUnitGraph_CopyTo( + const DGLContext &src_ctx, const DGLContext &dst_ctx) { + const aten::CSRMatrix &csr = CSR1(src_ctx); + const aten::COOMatrix &coo = COO1(src_ctx); + + auto device = dgl::runtime::DeviceAPI::Get(dst_ctx); + // We don't allow SetStream in DGL for now. 
+ auto stream = nullptr; + + auto g = dgl::UnitGraph::CreateFromCSC(2, csr); + ASSERT_EQ(g->GetCreatedFormats(), 4); + auto cg = dgl::UnitGraph::CopyTo(g, dst_ctx); + device->StreamSync(dst_ctx, stream); + ASSERT_EQ(cg->GetCreatedFormats(), 4); + + g = dgl::UnitGraph::CreateFromCSR(2, csr); + ASSERT_EQ(g->GetCreatedFormats(), 2); + cg = dgl::UnitGraph::CopyTo(g, dst_ctx); + device->StreamSync(dst_ctx, stream); + ASSERT_EQ(cg->GetCreatedFormats(), 2); + + g = dgl::UnitGraph::CreateFromCOO(2, coo); + ASSERT_EQ(g->GetCreatedFormats(), 1); + cg = dgl::UnitGraph::CopyTo(g, dst_ctx); + device->StreamSync(dst_ctx, stream); + ASSERT_EQ(cg->GetCreatedFormats(), 1); +} + +TEST(UniGraphTest, TestUnitGraph_CopyTo) { + _TestUnitGraph_CopyTo(CPU, CPU); + _TestUnitGraph_CopyTo(CPU, CPU); +#ifdef DGL_USE_CUDA + _TestUnitGraph_CopyTo(CPU, GPU); + _TestUnitGraph_CopyTo(GPU, GPU); + _TestUnitGraph_CopyTo(GPU, CPU); + _TestUnitGraph_CopyTo(CPU, GPU); + _TestUnitGraph_CopyTo(GPU, GPU); + _TestUnitGraph_CopyTo(GPU, CPU); +#endif +} + +TEST(UniGraphTest, TestUnitGraph_InOutDegrees) { + _TestUnitGraph_InOutDegrees(CPU); + _TestUnitGraph_InOutDegrees(CPU); +#ifdef DGL_USE_CUDA + _TestUnitGraph_InOutDegrees(GPU); + _TestUnitGraph_InOutDegrees(GPU); +#endif +} + +TEST(UniGraphTest, TestUnitGraph_Create) { + _TestUnitGraph(CPU); + _TestUnitGraph(CPU); +#ifdef DGL_USE_CUDA + _TestUnitGraph(GPU); + _TestUnitGraph(GPU); +#endif +} + +TEST(UniGraphTest, TestUnitGraph_GetInCSR) { + _TestUnitGraph_GetInCSR(CPU); + _TestUnitGraph_GetInCSR(CPU); +#ifdef DGL_USE_CUDA + _TestUnitGraph_GetInCSR(GPU); + _TestUnitGraph_GetInCSR(GPU); +#endif +} + +TEST(UniGraphTest, TestUnitGraph_GetOutCSR) { + _TestUnitGraph_GetOutCSR(CPU); + _TestUnitGraph_GetOutCSR(CPU); +#ifdef DGL_USE_CUDA + _TestUnitGraph_GetOutCSR(GPU); + _TestUnitGraph_GetOutCSR(GPU); +#endif +} + +TEST(UniGraphTest, TestUnitGraph_GetCOO) { + _TestUnitGraph_GetCOO(CPU); + _TestUnitGraph_GetCOO(CPU); +#ifdef DGL_USE_CUDA + _TestUnitGraph_GetCOO(GPU); + _TestUnitGraph_GetCOO(GPU); +#endif +} + +TEST(UniGraphTest, TestUnitGraph_Reserve) { + _TestUnitGraph_Reserve(CPU); + _TestUnitGraph_Reserve(CPU); +#ifdef DGL_USE_CUDA + _TestUnitGraph_Reserve(GPU); + _TestUnitGraph_Reserve(GPU); +#endif +} diff --git a/third_party/HugeCTR/gpu_cache/include/gpu_cache_api.hpp b/third_party/HugeCTR/gpu_cache/include/gpu_cache_api.hpp index 563aa5b5b75f..35d12531afd5 100644 --- a/third_party/HugeCTR/gpu_cache/include/gpu_cache_api.hpp +++ b/third_party/HugeCTR/gpu_cache/include/gpu_cache_api.hpp @@ -31,25 +31,25 @@ class gpu_cache_api { // Query API, i.e. A single read from the cache virtual void Query(const key_type* d_keys, const size_t len, float* d_values, uint64_t* d_missing_index, key_type* d_missing_keys, size_t* d_missing_len, - cudaStream_t stream, + hipStream_t stream, const size_t task_per_warp_tile = TASK_PER_WARP_TILE_MACRO) = 0; // Replace API, i.e. Follow the Query API to update the content of the cache to Most Recent virtual void Replace(const key_type* d_keys, const size_t len, const float* d_values, - cudaStream_t stream, + hipStream_t stream, const size_t task_per_warp_tile = TASK_PER_WARP_TILE_MACRO) = 0; // Update API, i.e. update the embeddings which exist in the cache virtual void Update(const key_type* d_keys, const size_t len, const float* d_values, - cudaStream_t stream, + hipStream_t stream, const size_t task_per_warp_tile = TASK_PER_WARP_TILE_MACRO) = 0; // Dump API, i.e. 
dump some slabsets' keys from the cache virtual void Dump(key_type* d_keys, size_t* d_dump_counter, const size_t start_set_index, - const size_t end_set_index, cudaStream_t stream) = 0; + const size_t end_set_index, hipStream_t stream) = 0; // Record all the lookup stream of a specific cache for Update/Replace sync - virtual void Record(cudaStream_t stream) = 0; + virtual void Record(hipStream_t stream) = 0; }; } // namespace gpu_cache diff --git a/third_party/HugeCTR/gpu_cache/include/gpu_cache_api.hpp.prehip b/third_party/HugeCTR/gpu_cache/include/gpu_cache_api.hpp.prehip new file mode 100644 index 000000000000..563aa5b5b75f --- /dev/null +++ b/third_party/HugeCTR/gpu_cache/include/gpu_cache_api.hpp.prehip @@ -0,0 +1,55 @@ +/* + * Copyright (c) 2021, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once +#include + +#define TASK_PER_WARP_TILE_MACRO 1 + +namespace gpu_cache { + +/////////////////////////////////////////////////////////////////////////////////////////////////// + +// GPU Cache API +template +class gpu_cache_api { + public: + virtual ~gpu_cache_api() noexcept(false) {} + // Query API, i.e. A single read from the cache + virtual void Query(const key_type* d_keys, const size_t len, float* d_values, + uint64_t* d_missing_index, key_type* d_missing_keys, size_t* d_missing_len, + cudaStream_t stream, + const size_t task_per_warp_tile = TASK_PER_WARP_TILE_MACRO) = 0; + + // Replace API, i.e. Follow the Query API to update the content of the cache to Most Recent + virtual void Replace(const key_type* d_keys, const size_t len, const float* d_values, + cudaStream_t stream, + const size_t task_per_warp_tile = TASK_PER_WARP_TILE_MACRO) = 0; + + // Update API, i.e. update the embeddings which exist in the cache + virtual void Update(const key_type* d_keys, const size_t len, const float* d_values, + cudaStream_t stream, + const size_t task_per_warp_tile = TASK_PER_WARP_TILE_MACRO) = 0; + + // Dump API, i.e. dump some slabsets' keys from the cache + virtual void Dump(key_type* d_keys, size_t* d_dump_counter, const size_t start_set_index, + const size_t end_set_index, cudaStream_t stream) = 0; + + // Record all the lookup stream of a specific cache for Update/Replace sync + virtual void Record(cudaStream_t stream) = 0; +}; + +} // namespace gpu_cache diff --git a/third_party/HugeCTR/gpu_cache/include/nv_gpu_cache.hpp b/third_party/HugeCTR/gpu_cache/include/nv_gpu_cache.hpp index 7cc61b58d78f..68045b10c588 100644 --- a/third_party/HugeCTR/gpu_cache/include/nv_gpu_cache.hpp +++ b/third_party/HugeCTR/gpu_cache/include/nv_gpu_cache.hpp @@ -61,22 +61,22 @@ class gpu_cache : public gpu_cache_api { // Query API, i.e. 
A single read from the cache void Query(const key_type* d_keys, const size_t len, float* d_values, uint64_t* d_missing_index, - key_type* d_missing_keys, size_t* d_missing_len, cudaStream_t stream, + key_type* d_missing_keys, size_t* d_missing_len, hipStream_t stream, const size_t task_per_warp_tile = TASK_PER_WARP_TILE_MACRO) override; // Replace API, i.e. Follow the Query API to update the content of the cache to Most Recent - void Replace(const key_type* d_keys, const size_t len, const float* d_values, cudaStream_t stream, + void Replace(const key_type* d_keys, const size_t len, const float* d_values, hipStream_t stream, const size_t task_per_warp_tile = TASK_PER_WARP_TILE_MACRO) override; // Update API, i.e. update the embeddings which exist in the cache - void Update(const key_type* d_keys, const size_t len, const float* d_values, cudaStream_t stream, + void Update(const key_type* d_keys, const size_t len, const float* d_values, hipStream_t stream, const size_t task_per_warp_tile = TASK_PER_WARP_TILE_MACRO) override; // Dump API, i.e. dump some slabsets' keys from the cache void Dump(key_type* d_keys, size_t* d_dump_counter, const size_t start_set_index, - const size_t end_set_index, cudaStream_t stream) override; + const size_t end_set_index, hipStream_t stream) override; - void Record(cudaStream_t stream) override {} + void Record(hipStream_t stream) override {} public: using slabset = slab_set; diff --git a/third_party/HugeCTR/gpu_cache/include/nv_gpu_cache.hpp.prehip b/third_party/HugeCTR/gpu_cache/include/nv_gpu_cache.hpp.prehip new file mode 100644 index 000000000000..7cc61b58d78f --- /dev/null +++ b/third_party/HugeCTR/gpu_cache/include/nv_gpu_cache.hpp.prehip @@ -0,0 +1,122 @@ +/* + * Copyright (c) 2023, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +#pragma once + +#include + +#include +#include +#include + +#include "gpu_cache_api.hpp" +#ifdef LIBCUDACXX_VERSION +#include +#include +#endif + +#define SET_ASSOCIATIVITY 2 +#define SLAB_SIZE 32 +#define TASK_PER_WARP_TILE_MACRO 1 + +namespace gpu_cache { + +// slab for static slab list +template +struct static_slab { + key_type slab_[warp_size]; +}; + +// Static slablist(slabset) for GPU Cache +template +struct slab_set { + static_slab set_[set_associativity]; +}; + +/////////////////////////////////////////////////////////////////////////////////////////////////// + +// GPU Cache +template , + typename slab_hasher = Mod_Hash> +class gpu_cache : public gpu_cache_api { + public: + // Ctor + gpu_cache(const size_t capacity_in_set, const size_t embedding_vec_size); + + // Dtor + ~gpu_cache(); + + // Query API, i.e. A single read from the cache + void Query(const key_type* d_keys, const size_t len, float* d_values, uint64_t* d_missing_index, + key_type* d_missing_keys, size_t* d_missing_len, cudaStream_t stream, + const size_t task_per_warp_tile = TASK_PER_WARP_TILE_MACRO) override; + + // Replace API, i.e. 
Follow the Query API to update the content of the cache to Most Recent + void Replace(const key_type* d_keys, const size_t len, const float* d_values, cudaStream_t stream, + const size_t task_per_warp_tile = TASK_PER_WARP_TILE_MACRO) override; + + // Update API, i.e. update the embeddings which exist in the cache + void Update(const key_type* d_keys, const size_t len, const float* d_values, cudaStream_t stream, + const size_t task_per_warp_tile = TASK_PER_WARP_TILE_MACRO) override; + + // Dump API, i.e. dump some slabsets' keys from the cache + void Dump(key_type* d_keys, size_t* d_dump_counter, const size_t start_set_index, + const size_t end_set_index, cudaStream_t stream) override; + + void Record(cudaStream_t stream) override {} + + public: + using slabset = slab_set; +#ifdef LIBCUDACXX_VERSION + using atomic_ref_counter_type = cuda::atomic; + using mutex = cuda::binary_semaphore; +#endif + + private: + static const size_t BLOCK_SIZE_ = 64; + + // Cache data + slabset* keys_; + float* vals_; + ref_counter_type* slot_counter_; + + // Global counter +#ifdef LIBCUDACXX_VERSION + atomic_ref_counter_type* global_counter_; +#else + ref_counter_type* global_counter_; +#endif + // CUDA device + int dev_; + + // Cache capacity + size_t capacity_in_set_; + size_t num_slot_; + + // Embedding vector size + size_t embedding_vec_size_; + +#ifdef LIBCUDACXX_VERSION + // Array of mutex to protect (sub-)warp-level data structure, each mutex protect 1 slab set + mutex* set_mutex_; +#else + // Array of flag to protect (sub-)warp-level data structure, each flag act as a mutex and protect + // 1 slab set 1 for unlock, 0 for lock + int* set_mutex_; +#endif +}; + +} // namespace gpu_cache diff --git a/third_party/HugeCTR/gpu_cache/include/nv_util.h b/third_party/HugeCTR/gpu_cache/include/nv_util.h index f67ad6be2daf..a0a73cc1472f 100644 --- a/third_party/HugeCTR/gpu_cache/include/nv_util.h +++ b/third_party/HugeCTR/gpu_cache/include/nv_util.h @@ -15,9 +15,9 @@ */ #pragma once -#include -#include -#include +#include +#include +#include #include #include @@ -31,27 +31,27 @@ template struct is_fp8 : std::false_type {}; template <> -struct is_fp8<__nv_fp8_e4m3> : std::true_type {}; +struct is_fp8<__hip_fp8_e4m3> : std::true_type {}; template <> -struct is_fp8<__nv_fp8_e5m2> : std::true_type {}; +struct is_fp8<__hip_fp8_e5m2> : std::true_type {}; class CudaException : public std::runtime_error { public: CudaException(const std::string& what) : runtime_error(what) {} }; -inline void cuda_check_(cudaError_t val, const char* file, int line) { - if (val != cudaSuccess) { +inline void cuda_check_(hipError_t val, const char* file, int line) { + if (val != hipSuccess) { throw CudaException(std::string(file) + ":" + std::to_string(line) + ": CUDA error " + - std::to_string(val) + ": " + cudaGetErrorString(val)); + std::to_string(val) + ": " + hipGetErrorString(val)); } } class CudaDeviceRestorer { public: - CudaDeviceRestorer() { CUDA_CHECK(cudaGetDevice(&dev_)); } - ~CudaDeviceRestorer() { CUDA_CHECK(cudaSetDevice(dev_)); } + CudaDeviceRestorer() { CUDA_CHECK(hipGetDevice(&dev_)); } + ~CudaDeviceRestorer() { CUDA_CHECK(hipSetDevice(dev_)); } void check_device(int device) const { if (device != dev_) { throw std::runtime_error( @@ -65,14 +65,14 @@ class CudaDeviceRestorer { }; inline int get_dev(const void* ptr) { - cudaPointerAttributes attr; - CUDA_CHECK(cudaPointerGetAttributes(&attr, ptr)); + hipPointerAttribute_t attr; + CUDA_CHECK(hipPointerGetAttributes(&attr, ptr)); int dev = -1; #if CUDART_VERSION >= 10000 - if 
(attr.type == cudaMemoryTypeDevice) + if (attr.type == hipMemoryTypeDevice) #else - if (attr.memoryType == cudaMemoryTypeDevice) + if (attr.memoryType == hipMemoryTypeDevice) #endif { dev = attr.device; @@ -83,7 +83,7 @@ inline int get_dev(const void* ptr) { inline void switch_to_dev(const void* ptr) { int dev = get_dev(ptr); if (dev >= 0) { - CUDA_CHECK(cudaSetDevice(dev)); + CUDA_CHECK(hipSetDevice(dev)); } } diff --git a/third_party/HugeCTR/gpu_cache/include/nv_util.h.prehip b/third_party/HugeCTR/gpu_cache/include/nv_util.h.prehip new file mode 100644 index 000000000000..f67ad6be2daf --- /dev/null +++ b/third_party/HugeCTR/gpu_cache/include/nv_util.h.prehip @@ -0,0 +1,90 @@ +/* + * Copyright (c) 2023, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +#pragma once + +#include +#include +#include + +#include +#include + +#define CUDA_CHECK(val) \ + { nv::cuda_check_((val), __FILE__, __LINE__); } + +namespace nv { + +template +struct is_fp8 : std::false_type {}; + +template <> +struct is_fp8<__nv_fp8_e4m3> : std::true_type {}; + +template <> +struct is_fp8<__nv_fp8_e5m2> : std::true_type {}; + +class CudaException : public std::runtime_error { + public: + CudaException(const std::string& what) : runtime_error(what) {} +}; + +inline void cuda_check_(cudaError_t val, const char* file, int line) { + if (val != cudaSuccess) { + throw CudaException(std::string(file) + ":" + std::to_string(line) + ": CUDA error " + + std::to_string(val) + ": " + cudaGetErrorString(val)); + } +} + +class CudaDeviceRestorer { + public: + CudaDeviceRestorer() { CUDA_CHECK(cudaGetDevice(&dev_)); } + ~CudaDeviceRestorer() { CUDA_CHECK(cudaSetDevice(dev_)); } + void check_device(int device) const { + if (device != dev_) { + throw std::runtime_error( + std::string(__FILE__) + ":" + std::to_string(__LINE__) + + ": Runtime Error: The device id in the context is not consistent with configuration"); + } + } + + private: + int dev_; +}; + +inline int get_dev(const void* ptr) { + cudaPointerAttributes attr; + CUDA_CHECK(cudaPointerGetAttributes(&attr, ptr)); + int dev = -1; + +#if CUDART_VERSION >= 10000 + if (attr.type == cudaMemoryTypeDevice) +#else + if (attr.memoryType == cudaMemoryTypeDevice) +#endif + { + dev = attr.device; + } + return dev; +} + +inline void switch_to_dev(const void* ptr) { + int dev = get_dev(ptr); + if (dev >= 0) { + CUDA_CHECK(cudaSetDevice(dev)); + } +} + +} // namespace nv diff --git a/third_party/HugeCTR/gpu_cache/src/nv_gpu_cache.cu b/third_party/HugeCTR/gpu_cache/src/nv_gpu_cache.cu index 8dc21395e329..4e982029fec1 100644 --- a/third_party/HugeCTR/gpu_cache/src/nv_gpu_cache.cu +++ b/third_party/HugeCTR/gpu_cache/src/nv_gpu_cache.cu @@ -1,3 +1,4 @@ +#include "hip/hip_runtime.h" /* * Copyright (c) 2023, NVIDIA CORPORATION. * @@ -14,7 +15,7 @@ * limitations under the License. 
*/ -#include +#include #include @@ -1253,27 +1254,27 @@ gpu_cache init_cache<<<((num_slot_ - 1) / BLOCK_SIZE_) + 1, BLOCK_SIZE_>>>( keys_, slot_counter_, global_counter_, num_slot_, empty_key, set_mutex_, capacity_in_set_); // Wait for initialization to finish - CUDA_CHECK(cudaStreamSynchronize(0)); - CUDA_CHECK(cudaGetLastError()); + CUDA_CHECK(hipStreamSynchronize(0)); + CUDA_CHECK(hipGetLastError()); } #else template init_cache<<<((num_slot_ - 1) / BLOCK_SIZE_) + 1, BLOCK_SIZE_>>>( keys_, slot_counter_, global_counter_, num_slot_, empty_key, set_mutex_, capacity_in_set_); // Wait for initialization to finish - CUDA_CHECK(cudaStreamSynchronize(0)); - CUDA_CHECK(cudaGetLastError()); + CUDA_CHECK(hipStreamSynchronize(0)); + CUDA_CHECK(hipGetLastError()); } #endif @@ -1340,15 +1341,15 @@ gpu_cache>>( global_counter_, set_mutex_, capacity_in_set_); // Wait for destruction to finish - CUDA_CHECK(cudaStreamSynchronize(0)); + CUDA_CHECK(hipStreamSynchronize(0)); // Free GPU memory for cache - CUDA_CHECK(cudaFree(keys_)); - CUDA_CHECK(cudaFree(vals_)); - CUDA_CHECK(cudaFree(slot_counter_)); - CUDA_CHECK(cudaFree(global_counter_)); + CUDA_CHECK(hipFree(keys_)); + CUDA_CHECK(hipFree(vals_)); + CUDA_CHECK(hipFree(slot_counter_)); + CUDA_CHECK(hipFree(global_counter_)); // Free GPU memory for set mutex - CUDA_CHECK(cudaFree(set_mutex_)); + CUDA_CHECK(hipFree(set_mutex_)); } #else template ::Query(const key_type* d_keys, const size_t len, float* d_values, uint64_t* d_missing_index, key_type* d_missing_keys, - size_t* d_missing_len, cudaStream_t stream, + size_t* d_missing_len, hipStream_t stream, const size_t task_per_warp_tile) { // Device Restorer nv::CudaDeviceRestorer dev_restorer; @@ -1387,7 +1388,7 @@ void gpu_cache::Query(const key_type* d_keys, const size_t len, float* d_values, uint64_t* d_missing_index, key_type* d_missing_keys, - size_t* d_missing_len, cudaStream_t stream, + size_t* d_missing_len, hipStream_t stream, const size_t task_per_warp_tile) { // Device Restorer nv::CudaDeviceRestorer dev_restorer; @@ -1425,7 +1426,7 @@ void gpu_cache void gpu_cache::Replace(const key_type* d_keys, const size_t len, - const float* d_values, cudaStream_t stream, + const float* d_values, hipStream_t stream, const size_t task_per_warp_tile) { // Check if it is a valid replacement if (len == 0) { @@ -1477,14 +1478,14 @@ void gpu_cache void gpu_cache::Replace(const key_type* d_keys, const size_t len, - const float* d_values, cudaStream_t stream, + const float* d_values, hipStream_t stream, const size_t task_per_warp_tile) { // Check if it is a valid replacement if (len == 0) { @@ -1506,7 +1507,7 @@ void gpu_cache void gpu_cache::Update(const key_type* d_keys, const size_t len, const float* d_values, - cudaStream_t stream, const size_t task_per_warp_tile) { + hipStream_t stream, const size_t task_per_warp_tile) { // Check if it is a valid update request if (len == 0) { return; @@ -1535,14 +1536,14 @@ void gpu_cache void gpu_cache::Update(const key_type* d_keys, const size_t len, const float* d_values, - cudaStream_t stream, const size_t task_per_warp_tile) { + hipStream_t stream, const size_t task_per_warp_tile) { // Check if it is a valid update request if (len == 0) { return; @@ -1562,7 +1563,7 @@ void gpu_cache::Dump(key_type* d_keys, size_t* d_dump_counter, const size_t start_set_index, const size_t end_set_index, - cudaStream_t stream) { + hipStream_t stream) { // Check if it is a valid dump request if (start_set_index >= capacity_in_set_) { printf("Error: Invalid value for start_set_index. 
Nothing dumped.\n"); @@ -1589,7 +1590,7 @@ void gpu_cache::Dump(key_type* d_keys, size_t* d_dump_counter, const size_t start_set_index, const size_t end_set_index, - cudaStream_t stream) { + hipStream_t stream) { // Check if it is a valid dump request if (start_set_index >= capacity_in_set_) { printf("Error: Invalid value for start_set_index. Nothing dumped.\n"); @@ -1624,7 +1625,7 @@ void gpu_cache + +#include + +namespace cg = cooperative_groups; + +// Overload CUDA atomic for other 64bit unsigned/signed integer type +__forceinline__ __device__ long atomicAdd(long* address, long val) { + return (long)atomicAdd((unsigned long long*)address, (unsigned long long)val); +} + +__forceinline__ __device__ long long atomicAdd(long long* address, long long val) { + return (long long)atomicAdd((unsigned long long*)address, (unsigned long long)val); +} + +__forceinline__ __device__ unsigned long atomicAdd(unsigned long* address, unsigned long val) { + return (unsigned long)atomicAdd((unsigned long long*)address, (unsigned long long)val); +} + +namespace gpu_cache { + +#ifdef LIBCUDACXX_VERSION +template +__forceinline__ __device__ void warp_tile_copy(const size_t lane_idx, + const size_t emb_vec_size_in_float, float* d_dst, + const float* d_src) { +#pragma unroll + for (size_t i = lane_idx; i < emb_vec_size_in_float; i += warp_size) { + d_dst[i] = d_src[i]; + } +} +#else +template +__forceinline__ __device__ void warp_tile_copy(const size_t lane_idx, + const size_t emb_vec_size_in_float, + volatile float* d_dst, volatile float* d_src) { + +#pragma unroll + for (size_t i = lane_idx; i < emb_vec_size_in_float; i += warp_size) { + d_dst[i] = d_src[i]; + } +} +#endif + +#ifdef LIBCUDACXX_VERSION +// Will be called by multiple thread_block_tile((sub-)warp) on the same mutex +// Expect only one thread_block_tile return to execute critical section at any time +template +__forceinline__ __device__ void warp_lock_mutex(const cg::thread_block_tile& warp_tile, + mutex& set_mutex) { + // The first thread of this (sub-)warp to acquire the lock + if (warp_tile.thread_rank() == 0) { + set_mutex.acquire(); + } + warp_tile.sync(); // Synchronize the threads in the (sub-)warp. Execution barrier + memory fence +} + +// The (sub-)warp holding the mutex will unlock the mutex after finishing the critical section on a +// set Expect any following (sub-)warp that acquire the mutex can see its modification done in the +// critical section +template +__forceinline__ __device__ void warp_unlock_mutex(const cg::thread_block_tile& warp_tile, + mutex& set_mutex) { + warp_tile.sync(); // Synchronize the threads in the (sub-)warp. Execution barrier + memory fence + // The first thread of this (sub-)warp to release the lock + if (warp_tile.thread_rank() == 0) { + set_mutex.release(); + } +} +#else +// Will be called by multiple thread_block_tile((sub-)warp) on the same mutex +// Expect only one thread_block_tile return to execute critical section at any time +template +__forceinline__ __device__ void warp_lock_mutex(const cg::thread_block_tile& warp_tile, + volatile int& set_mutex) { + // The first thread of this (sub-)warp to acquire the lock + if (warp_tile.thread_rank() == 0) { + while (0 == atomicCAS((int*)&set_mutex, 1, 0)) + ; + } + __threadfence(); + warp_tile.sync(); // Synchronize the threads in the (sub-)warp. 
Execution barrier + memory fence +} + +// The (sub-)warp holding the mutex will unlock the mutex after finishing the critical section on a +// set Expect any following (sub-)warp that acquire the mutex can see its modification done in the +// critical section +template +__forceinline__ __device__ void warp_unlock_mutex(const cg::thread_block_tile& warp_tile, + volatile int& set_mutex) { + __threadfence(); + warp_tile.sync(); // Synchronize the threads in the (sub-)warp. Execution barrier + memory fence + // The first thread of this (sub-)warp to release the lock + if (warp_tile.thread_rank() == 0) { + atomicExch((int*)&set_mutex, 1); + } +} +#endif + +// The (sub-)warp doing all reduction to find the slot with min slot_counter +// The slot with min slot_counter is the LR slot. +template +__forceinline__ __device__ void warp_min_reduction( + const cg::thread_block_tile& warp_tile, ref_counter_type& min_slot_counter_val, + size_t& slab_distance, size_t& slot_distance) { + const size_t lane_idx = warp_tile.thread_rank(); + slot_distance = lane_idx; + + for (size_t i = (warp_tile.size() >> 1); i > 0; i = i >> 1) { + ref_counter_type input_slot_counter_val = warp_tile.shfl_xor(min_slot_counter_val, (int)i); + size_t input_slab_distance = warp_tile.shfl_xor(slab_distance, (int)i); + size_t input_slot_distance = warp_tile.shfl_xor(slot_distance, (int)i); + + if (input_slot_counter_val == min_slot_counter_val) { + if (input_slab_distance == slab_distance) { + if (input_slot_distance < slot_distance) { + slot_distance = input_slot_distance; + } + } else if (input_slab_distance < slab_distance) { + slab_distance = input_slab_distance; + slot_distance = input_slot_distance; + } + } else if (input_slot_counter_val < min_slot_counter_val) { + min_slot_counter_val = input_slot_counter_val; + slab_distance = input_slab_distance; + slot_distance = input_slot_distance; + } + } +} + +/////////////////////////////////////////////////////////////////////////////////////////////////// + +#ifdef LIBCUDACXX_VERSION +// Kernel to initialize the GPU cache +// Init every entry of the cache with pair +template +__global__ void init_cache(slabset* keys, ref_counter_type* slot_counter, + atomic_ref_counter_type* global_counter, const size_t num_slot, + const key_type empty_key, mutex* set_mutex, + const size_t capacity_in_set) { + const size_t idx = blockIdx.x * blockDim.x + threadIdx.x; + if (idx < num_slot) { + // Set the key of this slot to unused key + // Flatten the cache + key_type* key_slot = (key_type*)keys; + key_slot[idx] = empty_key; + + // Clear the counter for this slot + slot_counter[idx] = 0; + } + // First CUDA thread clear the global counter + if (idx == 0) { + new (global_counter) atomic_ref_counter_type(0); + } + + // First capacity_in_set CUDA thread initialize mutex + if (idx < capacity_in_set) { + new (set_mutex + idx) mutex(1); + } +} + +template +__global__ void destruct_kernel(atomic_ref_counter_type* global_counter, mutex* set_mutex, + const size_t capacity_in_set) { + const size_t idx = blockIdx.x * blockDim.x + threadIdx.x; + // First CUDA thread destruct the global_counter + if (idx == 0) { + global_counter->~atomic_ref_counter_type(); + } + // First capacity_in_set CUDA thread destruct the set mutex + if (idx < capacity_in_set) { + (set_mutex + idx)->~mutex(); + } +} +#else +// Kernel to initialize the GPU cache +// Init every entry of the cache with pair +template +__global__ void init_cache(slabset* keys, ref_counter_type* slot_counter, + ref_counter_type* global_counter, const size_t 
num_slot, + const key_type empty_key, int* set_mutex, const size_t capacity_in_set) { + const size_t idx = blockIdx.x * blockDim.x + threadIdx.x; + if (idx < num_slot) { + // Set the key of this slot to unused key + // Flatten the cache + key_type* key_slot = (key_type*)keys; + key_slot[idx] = empty_key; + + // Clear the counter for this slot + slot_counter[idx] = 0; + } + // First CUDA thread clear the global counter + if (idx == 0) { + global_counter[idx] = 0; + } + + // First capacity_in_set CUDA thread initialize mutex + if (idx < capacity_in_set) { + set_mutex[idx] = 1; + } +} +#endif + +// Kernel to update global counter +// Resolve distance overflow issue as well +#ifdef LIBCUDACXX_VERSION +template +__global__ void update_kernel_overflow_ignore(atomic_ref_counter_type* global_counter, + size_t* d_missing_len) { + // Update global counter + global_counter->fetch_add(1, cuda::std::memory_order_relaxed); + *d_missing_len = 0; +} +#else +template +__global__ void update_kernel_overflow_ignore(ref_counter_type* global_counter, + size_t* d_missing_len) { + // Update global counter + atomicAdd(global_counter, 1); + *d_missing_len = 0; +} +#endif + +#ifdef LIBCUDACXX_VERSION +// Kernel to read from cache +// Also update locality information for touched slot +template +__global__ void get_kernel(const key_type* d_keys, const size_t len, float* d_values, + const size_t embedding_vec_size, uint64_t* d_missing_index, + key_type* d_missing_keys, size_t* d_missing_len, + const atomic_ref_counter_type* global_counter, + ref_counter_type* slot_counter, const size_t capacity_in_set, + const slabset* keys, const float* vals, mutex* set_mutex, + const size_t task_per_warp_tile) { + // Lane(thread) ID within a warp_tile + cg::thread_block_tile warp_tile = + cg::tiled_partition(cg::this_thread_block()); + const size_t lane_idx = warp_tile.thread_rank(); + // Warp tile global ID + const size_t warp_tile_global_idx = + (blockIdx.x * (blockDim.x / warp_size)) + warp_tile.meta_group_rank(); + // The index of key for this thread + const size_t key_idx = (warp_tile_global_idx * task_per_warp_tile) + lane_idx; + // The assigned key for this lane(thread) + key_type key; + // The dst slabset and the dst slab inside this set + size_t src_set; + size_t src_slab; + // The variable that contains the missing key + key_type missing_key; + // The variable that contains the index for the missing key + uint64_t missing_index; + // The counter for counting the missing key in this warp + uint8_t warp_missing_counter = 0; + // Active flag: whether current lane(thread) has unfinished task + bool active = false; + if (lane_idx < task_per_warp_tile) { + if (key_idx < len) { + active = true; + key = d_keys[key_idx]; + src_set = set_hasher::hash(key) % capacity_in_set; + src_slab = slab_hasher::hash(key) % set_associativity; + } + } + + // Lane participate in warp_tile ballot to produce warp-level work queue + unsigned active_mask = warp_tile.ballot(active); + + // The warp-level outer loop: finish all the tasks within the work queue + while (active_mask != 0) { + // Next task in the work quere, start from lower index lane(thread) + int next_lane = __ffs(active_mask) - 1; + // Broadcast the task and the global index to all lane in the warp_tile + key_type next_key = warp_tile.shfl(key, next_lane); + size_t next_idx = warp_tile.shfl(key_idx, next_lane); + size_t next_set = warp_tile.shfl(src_set, next_lane); + size_t next_slab = warp_tile.shfl(src_slab, next_lane); + + // Counter to record how many slab have been searched + 
size_t counter = 0; + + // Working queue before task started + const unsigned old_active_mask = active_mask; + + // Lock the slabset before operating the slabset + warp_lock_mutex(warp_tile, set_mutex[next_set]); + + // The warp-level inner loop: finish a single task in the work queue + while (active_mask == old_active_mask) { + // When all the slabs inside a slabset have been searched, mark missing task, task is + // completed + if (counter >= set_associativity) { + if (lane_idx == warp_missing_counter) { + missing_key = next_key; + missing_index = next_idx; + } + + if (lane_idx == (size_t)next_lane) { + active = false; + } + + warp_missing_counter++; + active_mask = warp_tile.ballot(active); + break; + } + + // The warp_tile read out the slab + key_type read_key = keys[next_set].set_[next_slab].slab_[lane_idx]; + + // Compare the slab data with the target key + int found_lane = __ffs(warp_tile.ballot(read_key == next_key)) - 1; + + // If found, mark hit task, copy the founded data, the task is completed + if (found_lane >= 0) { + size_t found_offset = (next_set * set_associativity + next_slab) * warp_size + found_lane; + if (lane_idx == (size_t)next_lane) { + slot_counter[found_offset] = global_counter->load(cuda::std::memory_order_relaxed); + active = false; + } + + warp_tile_copy(lane_idx, embedding_vec_size, + d_values + next_idx * embedding_vec_size, + vals + found_offset * embedding_vec_size); + + active_mask = warp_tile.ballot(active); + break; + } + + // Compare the slab data with empty key, if found empty key, mark missing task, task is + // completed + if (warp_tile.ballot(read_key == empty_key) != 0) { + if (lane_idx == warp_missing_counter) { + missing_key = next_key; + missing_index = next_idx; + } + + if (lane_idx == (size_t)next_lane) { + active = false; + } + + warp_missing_counter++; + active_mask = warp_tile.ballot(active); + break; + } + + // Not found in this slab, the task is not completed, goto searching next slab + counter++; + next_slab = (next_slab + 1) % set_associativity; + } + + // Unlock the slabset after operating the slabset + warp_unlock_mutex(warp_tile, set_mutex[next_set]); + } + + // After warp_tile complete the working queue, save the result for output + // First thread of the warp_tile accumulate the missing length to global variable + size_t warp_position; + if (lane_idx == 0) { + warp_position = atomicAdd(d_missing_len, (size_t)warp_missing_counter); + } + warp_position = warp_tile.shfl(warp_position, 0); + + if (lane_idx < warp_missing_counter) { + d_missing_keys[warp_position + lane_idx] = missing_key; + d_missing_index[warp_position + lane_idx] = missing_index; + } +} +#else +// Kernel to read from cache +// Also update locality information for touched slot +template +__global__ void get_kernel(const key_type* d_keys, const size_t len, float* d_values, + const size_t embedding_vec_size, uint64_t* d_missing_index, + key_type* d_missing_keys, size_t* d_missing_len, + ref_counter_type* global_counter, + volatile ref_counter_type* slot_counter, const size_t capacity_in_set, + volatile slabset* keys, volatile float* vals, volatile int* set_mutex, + const size_t task_per_warp_tile) { + // Lane(thread) ID within a warp_tile + cg::thread_block_tile warp_tile = + cg::tiled_partition(cg::this_thread_block()); + const size_t lane_idx = warp_tile.thread_rank(); + // Warp tile global ID + const size_t warp_tile_global_idx = + (blockIdx.x * (blockDim.x / warp_size)) + warp_tile.meta_group_rank(); + // The index of key for this thread + const size_t key_idx = 
(warp_tile_global_idx * task_per_warp_tile) + lane_idx; + // The assigned key for this lane(thread) + key_type key; + // The dst slabset and the dst slab inside this set + size_t src_set; + size_t src_slab; + // The variable that contains the missing key + key_type missing_key; + // The variable that contains the index for the missing key + uint64_t missing_index; + // The counter for counting the missing key in this warp + uint8_t warp_missing_counter = 0; + // Active flag: whether current lane(thread) has unfinished task + bool active = false; + if (lane_idx < task_per_warp_tile) { + if (key_idx < len) { + active = true; + key = d_keys[key_idx]; + src_set = set_hasher::hash(key) % capacity_in_set; + src_slab = slab_hasher::hash(key) % set_associativity; + } + } + + // Lane participate in warp_tile ballot to produce warp-level work queue + unsigned active_mask = warp_tile.ballot(active); + + // The warp-level outer loop: finish all the tasks within the work queue + while (active_mask != 0) { + // Next task in the work quere, start from lower index lane(thread) + int next_lane = __ffs(active_mask) - 1; + // Broadcast the task and the global index to all lane in the warp_tile + key_type next_key = warp_tile.shfl(key, next_lane); + size_t next_idx = warp_tile.shfl(key_idx, next_lane); + size_t next_set = warp_tile.shfl(src_set, next_lane); + size_t next_slab = warp_tile.shfl(src_slab, next_lane); + + // Counter to record how many slab have been searched + size_t counter = 0; + + // Working queue before task started + const unsigned old_active_mask = active_mask; + + // Lock the slabset before operating the slabset + warp_lock_mutex(warp_tile, set_mutex[next_set]); + + // The warp-level inner loop: finish a single task in the work queue + while (active_mask == old_active_mask) { + // When all the slabs inside a slabset have been searched, mark missing task, task is + // completed + if (counter >= set_associativity) { + if (lane_idx == warp_missing_counter) { + missing_key = next_key; + missing_index = next_idx; + } + + if (lane_idx == (size_t)next_lane) { + active = false; + } + + warp_missing_counter++; + active_mask = warp_tile.ballot(active); + break; + } + + // The warp_tile read out the slab + key_type read_key = ((volatile key_type*)(keys[next_set].set_[next_slab].slab_))[lane_idx]; + + // Compare the slab data with the target key + int found_lane = __ffs(warp_tile.ballot(read_key == next_key)) - 1; + + // If found, mark hit task, copy the founded data, the task is completed + if (found_lane >= 0) { + size_t found_offset = (next_set * set_associativity + next_slab) * warp_size + found_lane; + if (lane_idx == (size_t)next_lane) { + slot_counter[found_offset] = atomicAdd(global_counter, 0); + active = false; + } + + warp_tile_copy(lane_idx, embedding_vec_size, + (volatile float*)(d_values + next_idx * embedding_vec_size), + (volatile float*)(vals + found_offset * embedding_vec_size)); + + active_mask = warp_tile.ballot(active); + break; + } + + // Compare the slab data with empty key, if found empty key, mark missing task, task is + // completed + if (warp_tile.ballot(read_key == empty_key) != 0) { + if (lane_idx == warp_missing_counter) { + missing_key = next_key; + missing_index = next_idx; + } + + if (lane_idx == (size_t)next_lane) { + active = false; + } + + warp_missing_counter++; + active_mask = warp_tile.ballot(active); + break; + } + + // Not found in this slab, the task is not completed, goto searching next slab + counter++; + next_slab = (next_slab + 1) % set_associativity; + } 
+ + // Unlock the slabset after operating the slabset + warp_unlock_mutex(warp_tile, set_mutex[next_set]); + } + + // After warp_tile complete the working queue, save the result for output + // First thread of the warp_tile accumulate the missing length to global variable + size_t warp_position; + if (lane_idx == 0) { + warp_position = atomicAdd(d_missing_len, (size_t)warp_missing_counter); + } + warp_position = warp_tile.shfl(warp_position, 0); + + if (lane_idx < warp_missing_counter) { + d_missing_keys[warp_position + lane_idx] = missing_key; + d_missing_index[warp_position + lane_idx] = missing_index; + } +} +#endif + +#ifdef LIBCUDACXX_VERSION +// Kernel to insert or replace the pairs into the cache +template ::max(), + size_t max_slab_distance = std::numeric_limits::max()> +__global__ void insert_replace_kernel(const key_type* d_keys, const float* d_values, + const size_t embedding_vec_size, const size_t len, + slabset* keys, float* vals, ref_counter_type* slot_counter, + mutex* set_mutex, + const atomic_ref_counter_type* global_counter, + const size_t capacity_in_set, + const size_t task_per_warp_tile) { + // Lane(thread) ID within a warp_tile + cg::thread_block_tile warp_tile = + cg::tiled_partition(cg::this_thread_block()); + const size_t lane_idx = warp_tile.thread_rank(); + // Warp tile global ID + const size_t warp_tile_global_idx = + (blockIdx.x * (blockDim.x / warp_size)) + warp_tile.meta_group_rank(); + // The index of key for this thread + const size_t key_idx = (warp_tile_global_idx * task_per_warp_tile) + lane_idx; + // The assigned key for this lane(thread) + key_type key; + // The dst slabset and the dst slab inside this set + size_t src_set; + size_t src_slab; + // Active flag: whether current lane(thread) has unfinished task + bool active = false; + if (lane_idx < task_per_warp_tile) { + if (key_idx < len) { + active = true; + key = d_keys[key_idx]; + src_set = set_hasher::hash(key) % capacity_in_set; + src_slab = slab_hasher::hash(key) % set_associativity; + } + } + + // Lane participate in warp_tile ballot to produce warp-level work queue + unsigned active_mask = warp_tile.ballot(active); + + // The warp-level outer loop: finish all the tasks within the work queue + while (active_mask != 0) { + // Next task in the work quere, start from lower index lane(thread) + int next_lane = __ffs(active_mask) - 1; + // Broadcast the task, the global index and the src slabset and slab to all lane in a warp_tile + key_type next_key = warp_tile.shfl(key, next_lane); + size_t next_idx = warp_tile.shfl(key_idx, next_lane); + size_t next_set = warp_tile.shfl(src_set, next_lane); + size_t next_slab = warp_tile.shfl(src_slab, next_lane); + size_t first_slab = next_slab; + + // Counter to record how many slab have been searched + size_t counter = 0; + + // Variable to keep the min slot counter during the probing + ref_counter_type min_slot_counter_val = max_ref_counter_type; + // Variable to keep the slab distance for slot with min counter + size_t slab_distance = max_slab_distance; + // Variable to keep the slot distance for slot with min counter within the slab + size_t slot_distance; + // Working queue before task started + const unsigned old_active_mask = active_mask; + + // Lock the slabset before operating the slabset + warp_lock_mutex(warp_tile, set_mutex[next_set]); + + // The warp-level inner loop: finish a single task in the work queue + while (active_mask == old_active_mask) { + // When all the slabs inside a slabset have been searched + // and no empty slots or target slots 
are found. Replace with LRU + if (counter >= set_associativity) { + // (sub)Warp all-reduction, the reduction result store in all threads + warp_min_reduction(warp_tile, min_slot_counter_val, + slab_distance, slot_distance); + + // Calculate the position of LR slot + size_t target_slab = (first_slab + slab_distance) % set_associativity; + size_t slot_index = + (next_set * set_associativity + target_slab) * warp_size + slot_distance; + + // Replace the LR slot + if (lane_idx == (size_t)next_lane) { + keys[next_set].set_[target_slab].slab_[slot_distance] = key; + slot_counter[slot_index] = global_counter->load(cuda::std::memory_order_relaxed); + } + + warp_tile_copy(lane_idx, embedding_vec_size, + vals + slot_index * embedding_vec_size, + d_values + next_idx * embedding_vec_size); + + // Replace complete, mark this task completed + if (lane_idx == (size_t)next_lane) { + active = false; + } + + active_mask = warp_tile.ballot(active); + break; + } + + // The warp_tile read out the slab + key_type read_key = keys[next_set].set_[next_slab].slab_[lane_idx]; + + // Compare the slab data with the target key + int found_lane = __ffs(warp_tile.ballot(read_key == next_key)) - 1; + + // If found target key, the insertion/replace is no longer needed. + // Refresh the slot, the task is completed + if (found_lane >= 0) { + size_t found_offset = (next_set * set_associativity + next_slab) * warp_size + found_lane; + if (lane_idx == (size_t)next_lane) { + slot_counter[found_offset] = global_counter->load(cuda::std::memory_order_relaxed); + active = false; + } + + active_mask = warp_tile.ballot(active); + break; + } + + // Compare the slab data with empty key. + // If found empty key, do insertion,the task is complete + found_lane = __ffs(warp_tile.ballot(read_key == empty_key)) - 1; + if (found_lane >= 0) { + size_t found_offset = (next_set * set_associativity + next_slab) * warp_size + found_lane; + + if (lane_idx == (size_t)next_lane) { + keys[next_set].set_[next_slab].slab_[found_lane] = key; + slot_counter[found_offset] = global_counter->load(cuda::std::memory_order_relaxed); + } + + warp_tile_copy(lane_idx, embedding_vec_size, + vals + found_offset * embedding_vec_size, + d_values + next_idx * embedding_vec_size); + + if (lane_idx == (size_t)next_lane) { + active = false; + } + + active_mask = warp_tile.ballot(active); + break; + } + + // If no target or unused slot found in this slab, + // Refresh LR info, continue probing + ref_counter_type read_slot_counter = + slot_counter[(next_set * set_associativity + next_slab) * warp_size + lane_idx]; + if (read_slot_counter < min_slot_counter_val) { + min_slot_counter_val = read_slot_counter; + slab_distance = counter; + } + + counter++; + next_slab = (next_slab + 1) % set_associativity; + } + + // Unlock the slabset after operating the slabset + warp_unlock_mutex(warp_tile, set_mutex[next_set]); + } +} +#else +// Kernel to insert or replace the pairs into the cache +template ::max(), + size_t max_slab_distance = std::numeric_limits::max()> +__global__ void insert_replace_kernel(const key_type* d_keys, const float* d_values, + const size_t embedding_vec_size, const size_t len, + volatile slabset* keys, volatile float* vals, + volatile ref_counter_type* slot_counter, + volatile int* set_mutex, ref_counter_type* global_counter, + const size_t capacity_in_set, + const size_t task_per_warp_tile) { + // Lane(thread) ID within a warp_tile + cg::thread_block_tile warp_tile = + cg::tiled_partition(cg::this_thread_block()); + const size_t lane_idx = 
warp_tile.thread_rank(); + // Warp tile global ID + const size_t warp_tile_global_idx = + (blockIdx.x * (blockDim.x / warp_size)) + warp_tile.meta_group_rank(); + // The index of key for this thread + const size_t key_idx = (warp_tile_global_idx * task_per_warp_tile) + lane_idx; + // The assigned key for this lane(thread) + key_type key; + // The dst slabset and the dst slab inside this set + size_t src_set; + size_t src_slab; + // Active flag: whether current lane(thread) has unfinished task + bool active = false; + if (lane_idx < task_per_warp_tile) { + if (key_idx < len) { + active = true; + key = d_keys[key_idx]; + src_set = set_hasher::hash(key) % capacity_in_set; + src_slab = slab_hasher::hash(key) % set_associativity; + } + } + + // Lane participate in warp_tile ballot to produce warp-level work queue + unsigned active_mask = warp_tile.ballot(active); + + // The warp-level outer loop: finish all the tasks within the work queue + while (active_mask != 0) { + // Next task in the work quere, start from lower index lane(thread) + int next_lane = __ffs(active_mask) - 1; + // Broadcast the task, the global index and the src slabset and slab to all lane in a warp_tile + key_type next_key = warp_tile.shfl(key, next_lane); + size_t next_idx = warp_tile.shfl(key_idx, next_lane); + size_t next_set = warp_tile.shfl(src_set, next_lane); + size_t next_slab = warp_tile.shfl(src_slab, next_lane); + size_t first_slab = next_slab; + + // Counter to record how many slab have been searched + size_t counter = 0; + + // Variable to keep the min slot counter during the probing + ref_counter_type min_slot_counter_val = max_ref_counter_type; + // Variable to keep the slab distance for slot with min counter + size_t slab_distance = max_slab_distance; + // Variable to keep the slot distance for slot with min counter within the slab + size_t slot_distance; + // Working queue before task started + const unsigned old_active_mask = active_mask; + + // Lock the slabset before operating the slabset + warp_lock_mutex(warp_tile, set_mutex[next_set]); + + // The warp-level inner loop: finish a single task in the work queue + while (active_mask == old_active_mask) { + // When all the slabs inside a slabset have been searched + // and no empty slots or target slots are found. 
Replace with LRU + if (counter >= set_associativity) { + // (sub)Warp all-reduction, the reduction result store in all threads + warp_min_reduction(warp_tile, min_slot_counter_val, + slab_distance, slot_distance); + + // Calculate the position of LR slot + size_t target_slab = (first_slab + slab_distance) % set_associativity; + size_t slot_index = + (next_set * set_associativity + target_slab) * warp_size + slot_distance; + + // Replace the LR slot + if (lane_idx == (size_t)next_lane) { + ((volatile key_type*)(keys[next_set].set_[target_slab].slab_))[slot_distance] = key; + slot_counter[slot_index] = atomicAdd(global_counter, 0); + } + + warp_tile_copy(lane_idx, embedding_vec_size, + (volatile float*)(vals + slot_index * embedding_vec_size), + (volatile float*)(d_values + next_idx * embedding_vec_size)); + + // Replace complete, mark this task completed + if (lane_idx == (size_t)next_lane) { + active = false; + } + + active_mask = warp_tile.ballot(active); + break; + } + + // The warp_tile read out the slab + key_type read_key = ((volatile key_type*)(keys[next_set].set_[next_slab].slab_))[lane_idx]; + + // Compare the slab data with the target key + int found_lane = __ffs(warp_tile.ballot(read_key == next_key)) - 1; + + // If found target key, the insertion/replace is no longer needed. + // Refresh the slot, the task is completed + if (found_lane >= 0) { + size_t found_offset = (next_set * set_associativity + next_slab) * warp_size + found_lane; + if (lane_idx == (size_t)next_lane) { + slot_counter[found_offset] = atomicAdd(global_counter, 0); + active = false; + } + + active_mask = warp_tile.ballot(active); + break; + } + + // Compare the slab data with empty key. + // If found empty key, do insertion,the task is complete + found_lane = __ffs(warp_tile.ballot(read_key == empty_key)) - 1; + if (found_lane >= 0) { + size_t found_offset = (next_set * set_associativity + next_slab) * warp_size + found_lane; + + if (lane_idx == (size_t)next_lane) { + ((volatile key_type*)(keys[next_set].set_[next_slab].slab_))[found_lane] = key; + slot_counter[found_offset] = atomicAdd(global_counter, 0); + } + + warp_tile_copy(lane_idx, embedding_vec_size, + (volatile float*)(vals + found_offset * embedding_vec_size), + (volatile float*)(d_values + next_idx * embedding_vec_size)); + + if (lane_idx == (size_t)next_lane) { + active = false; + } + + active_mask = warp_tile.ballot(active); + break; + } + + // If no target or unused slot found in this slab, + // Refresh LR info, continue probing + ref_counter_type read_slot_counter = + slot_counter[(next_set * set_associativity + next_slab) * warp_size + lane_idx]; + if (read_slot_counter < min_slot_counter_val) { + min_slot_counter_val = read_slot_counter; + slab_distance = counter; + } + + counter++; + next_slab = (next_slab + 1) % set_associativity; + } + + // Unlock the slabset after operating the slabset + warp_unlock_mutex(warp_tile, set_mutex[next_set]); + } +} +#endif + +#ifdef LIBCUDACXX_VERSION +// Kernel to update the existing keys in the cache +// Will not change the locality information +template +__global__ void update_kernel(const key_type* d_keys, const size_t len, const float* d_values, + const size_t embedding_vec_size, const size_t capacity_in_set, + const slabset* keys, float* vals, mutex* set_mutex, + const size_t task_per_warp_tile) { + // Lane(thread) ID within a warp_tile + cg::thread_block_tile warp_tile = + cg::tiled_partition(cg::this_thread_block()); + const size_t lane_idx = warp_tile.thread_rank(); + // Warp tile global ID + const 
size_t warp_tile_global_idx = + (blockIdx.x * (blockDim.x / warp_size)) + warp_tile.meta_group_rank(); + // The index of key for this thread + const size_t key_idx = (warp_tile_global_idx * task_per_warp_tile) + lane_idx; + // The assigned key for this lane(thread) + key_type key; + // The dst slabset and the dst slab inside this set + size_t src_set; + size_t src_slab; + // Active flag: whether current lane(thread) has unfinished task + bool active = false; + if (lane_idx < task_per_warp_tile) { + if (key_idx < len) { + active = true; + key = d_keys[key_idx]; + src_set = set_hasher::hash(key) % capacity_in_set; + src_slab = slab_hasher::hash(key) % set_associativity; + } + } + + // Lane participate in warp_tile ballot to produce warp-level work queue + unsigned active_mask = warp_tile.ballot(active); + + // The warp-level outer loop: finish all the tasks within the work queue + while (active_mask != 0) { + // Next task in the work quere, start from lower index lane(thread) + int next_lane = __ffs(active_mask) - 1; + // Broadcast the task and the global index to all lane in the warp_tile + key_type next_key = warp_tile.shfl(key, next_lane); + size_t next_idx = warp_tile.shfl(key_idx, next_lane); + size_t next_set = warp_tile.shfl(src_set, next_lane); + size_t next_slab = warp_tile.shfl(src_slab, next_lane); + + // Counter to record how many slab have been searched + size_t counter = 0; + + // Working queue before task started + const unsigned old_active_mask = active_mask; + + // Lock the slabset before operating the slabset + warp_lock_mutex(warp_tile, set_mutex[next_set]); + + // The warp-level inner loop: finish a single task in the work queue + while (active_mask == old_active_mask) { + // When all the slabs inside a slabset have been searched, mark missing task, do nothing, task + // complete + if (counter >= set_associativity) { + if (lane_idx == (size_t)next_lane) { + active = false; + } + + active_mask = warp_tile.ballot(active); + break; + } + + // The warp_tile read out the slab + key_type read_key = keys[next_set].set_[next_slab].slab_[lane_idx]; + + // Compare the slab data with the target key + int found_lane = __ffs(warp_tile.ballot(read_key == next_key)) - 1; + + // If found, mark hit task, update the value, the task is completed + if (found_lane >= 0) { + size_t found_offset = (next_set * set_associativity + next_slab) * warp_size + found_lane; + if (lane_idx == (size_t)next_lane) { + active = false; + } + + warp_tile_copy(lane_idx, embedding_vec_size, + vals + found_offset * embedding_vec_size, + d_values + next_idx * embedding_vec_size); + + active_mask = warp_tile.ballot(active); + break; + } + + // Compare the slab data with empty key, if found empty key, mark missing task, do nothing, + // task is completed + if (warp_tile.ballot(read_key == empty_key) != 0) { + if (lane_idx == (size_t)next_lane) { + active = false; + } + + active_mask = warp_tile.ballot(active); + break; + } + + // Not found in this slab, the task is not completed, goto searching next slab + counter++; + next_slab = (next_slab + 1) % set_associativity; + } + + // Unlock the slabset after operating the slabset + warp_unlock_mutex(warp_tile, set_mutex[next_set]); + } +} +#else +// Kernel to update the existing keys in the cache +// Will not change the locality information +template +__global__ void update_kernel(const key_type* d_keys, const size_t len, const float* d_values, + const size_t embedding_vec_size, const size_t capacity_in_set, + volatile slabset* keys, volatile float* vals, volatile int* 
set_mutex, + const size_t task_per_warp_tile) { + // Lane(thread) ID within a warp_tile + cg::thread_block_tile warp_tile = + cg::tiled_partition(cg::this_thread_block()); + const size_t lane_idx = warp_tile.thread_rank(); + // Warp tile global ID + const size_t warp_tile_global_idx = + (blockIdx.x * (blockDim.x / warp_size)) + warp_tile.meta_group_rank(); + // The index of key for this thread + const size_t key_idx = (warp_tile_global_idx * task_per_warp_tile) + lane_idx; + // The assigned key for this lane(thread) + key_type key; + // The dst slabset and the dst slab inside this set + size_t src_set; + size_t src_slab; + // Active flag: whether current lane(thread) has unfinished task + bool active = false; + if (lane_idx < task_per_warp_tile) { + if (key_idx < len) { + active = true; + key = d_keys[key_idx]; + src_set = set_hasher::hash(key) % capacity_in_set; + src_slab = slab_hasher::hash(key) % set_associativity; + } + } + + // Lane participate in warp_tile ballot to produce warp-level work queue + unsigned active_mask = warp_tile.ballot(active); + + // The warp-level outer loop: finish all the tasks within the work queue + while (active_mask != 0) { + // Next task in the work quere, start from lower index lane(thread) + int next_lane = __ffs(active_mask) - 1; + // Broadcast the task and the global index to all lane in the warp_tile + key_type next_key = warp_tile.shfl(key, next_lane); + size_t next_idx = warp_tile.shfl(key_idx, next_lane); + size_t next_set = warp_tile.shfl(src_set, next_lane); + size_t next_slab = warp_tile.shfl(src_slab, next_lane); + + // Counter to record how many slab have been searched + size_t counter = 0; + + // Working queue before task started + const unsigned old_active_mask = active_mask; + + // Lock the slabset before operating the slabset + warp_lock_mutex(warp_tile, set_mutex[next_set]); + + // The warp-level inner loop: finish a single task in the work queue + while (active_mask == old_active_mask) { + // When all the slabs inside a slabset have been searched, mark missing task, do nothing, task + // complete + if (counter >= set_associativity) { + if (lane_idx == (size_t)next_lane) { + active = false; + } + + active_mask = warp_tile.ballot(active); + break; + } + + // The warp_tile read out the slab + key_type read_key = ((volatile key_type*)(keys[next_set].set_[next_slab].slab_))[lane_idx]; + + // Compare the slab data with the target key + int found_lane = __ffs(warp_tile.ballot(read_key == next_key)) - 1; + + // If found, mark hit task, update the value, the task is completed + if (found_lane >= 0) { + size_t found_offset = (next_set * set_associativity + next_slab) * warp_size + found_lane; + if (lane_idx == (size_t)next_lane) { + active = false; + } + + warp_tile_copy(lane_idx, embedding_vec_size, + (volatile float*)(vals + found_offset * embedding_vec_size), + (volatile float*)(d_values + next_idx * embedding_vec_size)); + + active_mask = warp_tile.ballot(active); + break; + } + + // Compare the slab data with empty key, if found empty key, mark missing task, do nothing, + // task is completed + if (warp_tile.ballot(read_key == empty_key) != 0) { + if (lane_idx == (size_t)next_lane) { + active = false; + } + + active_mask = warp_tile.ballot(active); + break; + } + + // Not found in this slab, the task is not completed, goto searching next slab + counter++; + next_slab = (next_slab + 1) % set_associativity; + } + + // Unlock the slabset after operating the slabset + warp_unlock_mutex(warp_tile, set_mutex[next_set]); + } +} +#endif + +#ifdef 
LIBCUDACXX_VERSION +template +__global__ void dump_kernel(key_type* d_keys, size_t* d_dump_counter, const slabset* keys, + mutex* set_mutex, const size_t start_set_index, + const size_t end_set_index) { + // Block-level counter used by all warp tiles within a block + __shared__ uint32_t block_acc; + // Initialize block-level counter + if (threadIdx.x == 0) { + block_acc = 0; + } + __syncthreads(); + // Lane(thread) ID within a warp tile + cg::thread_block_tile warp_tile = + cg::tiled_partition(cg::this_thread_block()); + const size_t lane_idx = warp_tile.thread_rank(); + // Warp tile target slabset id + const size_t set_idx = + ((blockIdx.x * (blockDim.x / warp_size)) + warp_tile.meta_group_rank()) + start_set_index; + // Keys dump from cache + key_type read_key[set_associativity]; + // Lane(thread) offset for storing each key + uint32_t thread_key_offset[set_associativity]; + // Warp offset for storing each key + uint32_t warp_key_offset; + // Block offset for storing each key + __shared__ size_t block_key_offset; + + // Warp tile dump target slabset + if (set_idx < end_set_index) { + // Lock the slabset before operating the slabset + warp_lock_mutex(warp_tile, set_mutex[set_idx]); + + // The warp tile read out the slabset + for (unsigned slab_id = 0; slab_id < set_associativity; slab_id++) { + // The warp tile read out a slab + read_key[slab_id] = keys[set_idx].set_[slab_id].slab_[lane_idx]; + } + + // Finish dumping the slabset, unlock the slabset + warp_unlock_mutex(warp_tile, set_mutex[set_idx]); + + // Each lane(thread) within the warp tile calculate the offset to store its keys + uint32_t warp_tile_total_keys = 0; + for (unsigned slab_id = 0; slab_id < set_associativity; slab_id++) { + unsigned valid_mask = warp_tile.ballot(read_key[slab_id] != empty_key); + thread_key_offset[slab_id] = + __popc(valid_mask & ((1U << lane_idx) - 1U)) + warp_tile_total_keys; + warp_tile_total_keys = warp_tile_total_keys + __popc(valid_mask); + } + + // Each warp tile request a unique place from the block-level counter + if (lane_idx == 0) { + warp_key_offset = atomicAdd(&block_acc, warp_tile_total_keys); + } + warp_key_offset = warp_tile.shfl(warp_key_offset, 0); + } + + // Each block request a unique place in global memory output buffer + __syncthreads(); + if (threadIdx.x == 0) { + block_key_offset = atomicAdd(d_dump_counter, (size_t)block_acc); + } + __syncthreads(); + + // Warp tile store the (non-empty)keys back to output buffer + if (set_idx < end_set_index) { + for (unsigned slab_id = 0; slab_id < set_associativity; slab_id++) { + if (read_key[slab_id] != empty_key) { + d_keys[block_key_offset + warp_key_offset + thread_key_offset[slab_id]] = read_key[slab_id]; + } + } + } +} +#else +template +__global__ void dump_kernel(key_type* d_keys, size_t* d_dump_counter, volatile slabset* keys, + volatile int* set_mutex, const size_t start_set_index, + const size_t end_set_index) { + // Block-level counter used by all warp tiles within a block + __shared__ uint32_t block_acc; + // Initialize block-level counter + if (threadIdx.x == 0) { + block_acc = 0; + } + __syncthreads(); + // Lane(thread) ID within a warp tile + cg::thread_block_tile warp_tile = + cg::tiled_partition(cg::this_thread_block()); + const size_t lane_idx = warp_tile.thread_rank(); + // Warp tile target slabset id + const size_t set_idx = + ((blockIdx.x * (blockDim.x / warp_size)) + warp_tile.meta_group_rank()) + start_set_index; + // Keys dump from cache + key_type read_key[set_associativity]; + // Lane(thread) offset for storing each 
key + uint32_t thread_key_offset[set_associativity]; + // Warp offset for storing each key + uint32_t warp_key_offset; + // Block offset for storing each key + __shared__ size_t block_key_offset; + + // Warp tile dump target slabset + if (set_idx < end_set_index) { + // Lock the slabset before operating the slabset + warp_lock_mutex(warp_tile, set_mutex[set_idx]); + + // The warp tile read out the slabset + for (unsigned slab_id = 0; slab_id < set_associativity; slab_id++) { + // The warp tile read out a slab + read_key[slab_id] = ((volatile key_type*)(keys[set_idx].set_[slab_id].slab_))[lane_idx]; + } + + // Finish dumping the slabset, unlock the slabset + warp_unlock_mutex(warp_tile, set_mutex[set_idx]); + + // Each lane(thread) within the warp tile calculate the offset to store its keys + uint32_t warp_tile_total_keys = 0; + for (unsigned slab_id = 0; slab_id < set_associativity; slab_id++) { + unsigned valid_mask = warp_tile.ballot(read_key[slab_id] != empty_key); + thread_key_offset[slab_id] = + __popc(valid_mask & ((1U << lane_idx) - 1U)) + warp_tile_total_keys; + warp_tile_total_keys = warp_tile_total_keys + __popc(valid_mask); + } + + // Each warp tile request a unique place from the block-level counter + if (lane_idx == 0) { + warp_key_offset = atomicAdd(&block_acc, warp_tile_total_keys); + } + warp_key_offset = warp_tile.shfl(warp_key_offset, 0); + } + + // Each block request a unique place in global memory output buffer + __syncthreads(); + if (threadIdx.x == 0) { + block_key_offset = atomicAdd(d_dump_counter, (size_t)block_acc); + } + __syncthreads(); + + // Warp tile store the (non-empty)keys back to output buffer + if (set_idx < end_set_index) { + for (unsigned slab_id = 0; slab_id < set_associativity; slab_id++) { + if (read_key[slab_id] != empty_key) { + d_keys[block_key_offset + warp_key_offset + thread_key_offset[slab_id]] = read_key[slab_id]; + } + } + } +} +#endif +/////////////////////////////////////////////////////////////////////////////////////////////////// + +#ifdef LIBCUDACXX_VERSION +template +gpu_cache::gpu_cache(const size_t capacity_in_set, const size_t embedding_vec_size) + : capacity_in_set_(capacity_in_set), embedding_vec_size_(embedding_vec_size) { + // Check parameter + if (capacity_in_set_ == 0) { + printf("Error: Invalid value for capacity_in_set.\n"); + return; + } + if (embedding_vec_size_ == 0) { + printf("Error: Invalid value for embedding_vec_size.\n"); + return; + } + if (set_associativity <= 0) { + printf("Error: Invalid value for set_associativity.\n"); + return; + } + if (warp_size != 1 && warp_size != 2 && warp_size != 4 && warp_size != 8 && warp_size != 16 && + warp_size != 32) { + printf("Error: Invalid value for warp_size.\n"); + return; + } + + // Get the current CUDA dev + CUDA_CHECK(cudaGetDevice(&dev_)); + + // Calculate # of slot + num_slot_ = capacity_in_set_ * set_associativity * warp_size; + + // Allocate GPU memory for cache + CUDA_CHECK(cudaMalloc((void**)&keys_, sizeof(slabset) * capacity_in_set_)); + CUDA_CHECK(cudaMalloc((void**)&vals_, sizeof(float) * embedding_vec_size_ * num_slot_)); + CUDA_CHECK(cudaMalloc((void**)&slot_counter_, sizeof(ref_counter_type) * num_slot_)); + CUDA_CHECK(cudaMalloc((void**)&global_counter_, sizeof(atomic_ref_counter_type))); + + // Allocate GPU memory for set mutex + CUDA_CHECK(cudaMalloc((void**)&set_mutex_, sizeof(mutex) * capacity_in_set_)); + + // Initialize the cache, set all entry to unused + init_cache<<<((num_slot_ - 1) / BLOCK_SIZE_) + 1, BLOCK_SIZE_>>>( + keys_, slot_counter_, 
global_counter_, num_slot_, empty_key, set_mutex_, capacity_in_set_); + + // Wait for initialization to finish + CUDA_CHECK(cudaStreamSynchronize(0)); + CUDA_CHECK(cudaGetLastError()); +} +#else +template +gpu_cache::gpu_cache(const size_t capacity_in_set, const size_t embedding_vec_size) + : capacity_in_set_(capacity_in_set), embedding_vec_size_(embedding_vec_size) { + // Check parameter + if (capacity_in_set_ == 0) { + printf("Error: Invalid value for capacity_in_set.\n"); + return; + } + if (embedding_vec_size_ == 0) { + printf("Error: Invalid value for embedding_vec_size.\n"); + return; + } + if (set_associativity <= 0) { + printf("Error: Invalid value for set_associativity.\n"); + return; + } + if (warp_size != 1 && warp_size != 2 && warp_size != 4 && warp_size != 8 && warp_size != 16 && + warp_size != 32) { + printf("Error: Invalid value for warp_size.\n"); + return; + } + + // Get the current CUDA dev + CUDA_CHECK(cudaGetDevice(&dev_)); + + // Calculate # of slot + num_slot_ = capacity_in_set_ * set_associativity * warp_size; + + // Allocate GPU memory for cache + CUDA_CHECK(cudaMalloc((void**)&keys_, sizeof(slabset) * capacity_in_set_)); + CUDA_CHECK(cudaMalloc((void**)&vals_, sizeof(float) * embedding_vec_size_ * num_slot_)); + CUDA_CHECK(cudaMalloc((void**)&slot_counter_, sizeof(ref_counter_type) * num_slot_)); + CUDA_CHECK(cudaMalloc((void**)&global_counter_, sizeof(ref_counter_type))); + + // Allocate GPU memory for set mutex + CUDA_CHECK(cudaMalloc((void**)&set_mutex_, sizeof(int) * capacity_in_set_)); + + // Initialize the cache, set all entry to unused + init_cache<<<((num_slot_ - 1) / BLOCK_SIZE_) + 1, BLOCK_SIZE_>>>( + keys_, slot_counter_, global_counter_, num_slot_, empty_key, set_mutex_, capacity_in_set_); + + // Wait for initialization to finish + CUDA_CHECK(cudaStreamSynchronize(0)); + CUDA_CHECK(cudaGetLastError()); +} +#endif + +#ifdef LIBCUDACXX_VERSION +template +gpu_cache::~gpu_cache() { + // Device Restorer + nv::CudaDeviceRestorer dev_restorer; + + // Check device + dev_restorer.check_device(dev_); + + // Destruct CUDA std object + destruct_kernel<<<((capacity_in_set_ - 1) / BLOCK_SIZE_) + 1, BLOCK_SIZE_>>>( + global_counter_, set_mutex_, capacity_in_set_); + // Wait for destruction to finish + CUDA_CHECK(cudaStreamSynchronize(0)); + + // Free GPU memory for cache + CUDA_CHECK(cudaFree(keys_)); + CUDA_CHECK(cudaFree(vals_)); + CUDA_CHECK(cudaFree(slot_counter_)); + CUDA_CHECK(cudaFree(global_counter_)); + // Free GPU memory for set mutex + CUDA_CHECK(cudaFree(set_mutex_)); +} +#else +template +gpu_cache::~gpu_cache() noexcept(false) { + // Device Restorer + nv::CudaDeviceRestorer dev_restorer; + + // Check device + dev_restorer.check_device(dev_); + + // Free GPU memory for cache + CUDA_CHECK(cudaFree(keys_)); + CUDA_CHECK(cudaFree(vals_)); + CUDA_CHECK(cudaFree(slot_counter_)); + CUDA_CHECK(cudaFree(global_counter_)); + // Free GPU memory for set mutex + CUDA_CHECK(cudaFree(set_mutex_)); +} +#endif + +#ifdef LIBCUDACXX_VERSION +template +void gpu_cache::Query(const key_type* d_keys, const size_t len, float* d_values, + uint64_t* d_missing_index, key_type* d_missing_keys, + size_t* d_missing_len, cudaStream_t stream, + const size_t task_per_warp_tile) { + // Device Restorer + nv::CudaDeviceRestorer dev_restorer; + // Check device + dev_restorer.check_device(dev_); + + // Check if it is a valid query + if (len == 0) { + // Set the d_missing_len to 0 before return + CUDA_CHECK(cudaMemsetAsync(d_missing_len, 0, sizeof(size_t), stream)); + return; + } + + // Update 
the global counter as user perform a new(most recent) read operation to the cache + // Resolve distance overflow issue as well. + update_kernel_overflow_ignore + <<<1, 1, 0, stream>>>(global_counter_, d_missing_len); + + // Read from the cache + // Touch and refresh the hitting slot + const size_t keys_per_block = (BLOCK_SIZE_ / warp_size) * task_per_warp_tile; + const size_t grid_size = ((len - 1) / keys_per_block) + 1; + get_kernel<<>>( + d_keys, len, d_values, embedding_vec_size_, d_missing_index, d_missing_keys, d_missing_len, + global_counter_, slot_counter_, capacity_in_set_, keys_, vals_, set_mutex_, + task_per_warp_tile); + + // Check for GPU error before return + CUDA_CHECK(cudaGetLastError()); +} +#else +template +void gpu_cache::Query(const key_type* d_keys, const size_t len, float* d_values, + uint64_t* d_missing_index, key_type* d_missing_keys, + size_t* d_missing_len, cudaStream_t stream, + const size_t task_per_warp_tile) { + // Device Restorer + nv::CudaDeviceRestorer dev_restorer; + // Check device + dev_restorer.check_device(dev_); + + // Check if it is a valid query + if (len == 0) { + // Set the d_missing_len to 0 before return + CUDA_CHECK(cudaMemsetAsync(d_missing_len, 0, sizeof(size_t), stream)); + return; + } + + // Update the global counter as user perform a new(most recent) read operation to the cache + // Resolve distance overflow issue as well. + update_kernel_overflow_ignore + <<<1, 1, 0, stream>>>(global_counter_, d_missing_len); + + // Read from the cache + // Touch and refresh the hitting slot + const size_t keys_per_block = (BLOCK_SIZE_ / warp_size) * task_per_warp_tile; + const size_t grid_size = ((len - 1) / keys_per_block) + 1; + get_kernel<<>>( + d_keys, len, d_values, embedding_vec_size_, d_missing_index, d_missing_keys, d_missing_len, + global_counter_, slot_counter_, capacity_in_set_, keys_, vals_, set_mutex_, + task_per_warp_tile); + + // Check for GPU error before return + CUDA_CHECK(cudaGetLastError()); +} +#endif + +#ifdef LIBCUDACXX_VERSION +template +void gpu_cache::Replace(const key_type* d_keys, const size_t len, + const float* d_values, cudaStream_t stream, + const size_t task_per_warp_tile) { + // Check if it is a valid replacement + if (len == 0) { + return; + } + + // Device Restorer + nv::CudaDeviceRestorer dev_restorer; + // Check device + dev_restorer.check_device(dev_); + + // Try to insert the paris into the cache as long as there are unused slot + // Then replace the pairs into the cache + const size_t keys_per_block = (BLOCK_SIZE_ / warp_size) * task_per_warp_tile; + const size_t grid_size = ((len - 1) / keys_per_block) + 1; + insert_replace_kernel + <<>>(d_keys, d_values, embedding_vec_size_, len, keys_, + vals_, slot_counter_, set_mutex_, global_counter_, + capacity_in_set_, task_per_warp_tile); + + // Check for GPU error before return + CUDA_CHECK(cudaGetLastError()); +} +#else +template +void gpu_cache::Replace(const key_type* d_keys, const size_t len, + const float* d_values, cudaStream_t stream, + const size_t task_per_warp_tile) { + // Check if it is a valid replacement + if (len == 0) { + return; + } + + // Device Restorer + nv::CudaDeviceRestorer dev_restorer; + // Check device + dev_restorer.check_device(dev_); + + // Try to insert the paris into the cache as long as there are unused slot + // Then replace the pairs into the cache + const size_t keys_per_block = (BLOCK_SIZE_ / warp_size) * task_per_warp_tile; + const size_t grid_size = ((len - 1) / keys_per_block) + 1; + insert_replace_kernel<<>>( + d_keys, d_values, 
embedding_vec_size_, len, keys_, vals_, slot_counter_, set_mutex_, + global_counter_, capacity_in_set_, task_per_warp_tile); + + // Check for GPU error before return + CUDA_CHECK(cudaGetLastError()); +} +#endif + +#ifdef LIBCUDACXX_VERSION +template +void gpu_cache::Update(const key_type* d_keys, const size_t len, const float* d_values, + cudaStream_t stream, const size_t task_per_warp_tile) { + // Check if it is a valid update request + if (len == 0) { + return; + } + + // Device Restorer + nv::CudaDeviceRestorer dev_restorer; + // Check device + dev_restorer.check_device(dev_); + + // Update the value of input keys that are existed in the cache + const size_t keys_per_block = (BLOCK_SIZE_ / warp_size) * task_per_warp_tile; + const size_t grid_size = ((len - 1) / keys_per_block) + 1; + update_kernel<<>>( + d_keys, len, d_values, embedding_vec_size_, capacity_in_set_, keys_, vals_, set_mutex_, + task_per_warp_tile); + + // Check for GPU error before return + CUDA_CHECK(cudaGetLastError()); +} +#else +template +void gpu_cache::Update(const key_type* d_keys, const size_t len, const float* d_values, + cudaStream_t stream, const size_t task_per_warp_tile) { + // Check if it is a valid update request + if (len == 0) { + return; + } + + // Device Restorer + nv::CudaDeviceRestorer dev_restorer; + // Check device + dev_restorer.check_device(dev_); + + // Update the value of input keys that are existed in the cache + const size_t keys_per_block = (BLOCK_SIZE_ / warp_size) * task_per_warp_tile; + const size_t grid_size = ((len - 1) / keys_per_block) + 1; + update_kernel + <<>>(d_keys, len, d_values, embedding_vec_size_, + capacity_in_set_, keys_, vals_, set_mutex_, + task_per_warp_tile); + + // Check for GPU error before return + CUDA_CHECK(cudaGetLastError()); +} +#endif + +#ifdef LIBCUDACXX_VERSION +template +void gpu_cache::Dump(key_type* d_keys, size_t* d_dump_counter, + const size_t start_set_index, const size_t end_set_index, + cudaStream_t stream) { + // Check if it is a valid dump request + if (start_set_index >= capacity_in_set_) { + printf("Error: Invalid value for start_set_index. Nothing dumped.\n"); + return; + } + if (end_set_index <= start_set_index || end_set_index > capacity_in_set_) { + printf("Error: Invalid value for end_set_index. Nothing dumped.\n"); + return; + } + + // Device Restorer + nv::CudaDeviceRestorer dev_restorer; + // Check device + dev_restorer.check_device(dev_); + + // Set the global counter to 0 first + CUDA_CHECK(cudaMemsetAsync(d_dump_counter, 0, sizeof(size_t), stream)); + + // Dump keys from the cache + const size_t grid_size = + (((end_set_index - start_set_index) - 1) / (BLOCK_SIZE_ / warp_size)) + 1; + dump_kernel + <<>>(d_keys, d_dump_counter, keys_, set_mutex_, + start_set_index, end_set_index); + + // Check for GPU error before return + CUDA_CHECK(cudaGetLastError()); +} +#else +template +void gpu_cache::Dump(key_type* d_keys, size_t* d_dump_counter, + const size_t start_set_index, const size_t end_set_index, + cudaStream_t stream) { + // Check if it is a valid dump request + if (start_set_index >= capacity_in_set_) { + printf("Error: Invalid value for start_set_index. Nothing dumped.\n"); + return; + } + if (end_set_index <= start_set_index || end_set_index > capacity_in_set_) { + printf("Error: Invalid value for end_set_index. 
Nothing dumped.\n"); + return; + } + + // Device Restorer + nv::CudaDeviceRestorer dev_restorer; + // Check device + dev_restorer.check_device(dev_); + + // Set the global counter to 0 first + CUDA_CHECK(cudaMemsetAsync(d_dump_counter, 0, sizeof(size_t), stream)); + + // Dump keys from the cache + const size_t grid_size = + (((end_set_index - start_set_index) - 1) / (BLOCK_SIZE_ / warp_size)) + 1; + dump_kernel + <<>>(d_keys, d_dump_counter, keys_, set_mutex_, + start_set_index, end_set_index); + + // Check for GPU error before return + CUDA_CHECK(cudaGetLastError()); +} +#endif + +template class gpu_cache::max(), + SET_ASSOCIATIVITY, SLAB_SIZE>; +template class gpu_cache::max(), + SET_ASSOCIATIVITY, SLAB_SIZE>; +} // namespace gpu_cache
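
Reviewer note: the nv_util.h hunk above reroutes the existing CUDA_CHECK helper at the HIP runtime (hipSetDevice, hipMemoryTypeDevice, and so on). The snippet below is a minimal sketch of what the fully hipified checker plausibly looks like, assuming only the standard HIP runtime API (hipError_t, hipSuccess, hipGetErrorString); the macro and helper names mirror the CUDA originals, the fp8 trait specializations are omitted, and this is an illustration rather than the verbatim hipify output.

// Hedged sketch only: the shape nv_util.h's error checking plausibly takes
// after hipify. hipError_t / hipSuccess / hipGetErrorString are standard HIP
// runtime API; macro and helper names mirror the CUDA originals.
#include <hip/hip_runtime.h>

#include <stdexcept>
#include <string>

#define CUDA_CHECK(val) \
  { nv::cuda_check_((val), __FILE__, __LINE__); }

namespace nv {

class CudaException : public std::runtime_error {
 public:
  CudaException(const std::string& what) : runtime_error(what) {}
};

// Same contract as the CUDA version, but consuming hipError_t and reporting
// through the HIP runtime's error-string helper.
inline void cuda_check_(hipError_t val, const char* file, int line) {
  if (val != hipSuccess) {
    throw CudaException(std::string(file) + ":" + std::to_string(line) +
                        ": HIP error " + std::to_string(val) + ": " +
                        hipGetErrorString(val));
  }
}

}  // namespace nv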
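
Reviewer note: on the non-libcu++ path, nv_gpu_cache.cu guards every slab set with a per-set spin lock driven by a single lane of the warp tile (warp_lock_mutex / warp_unlock_mutex in the hunk above). The standalone sketch below restates that pattern outside the kernels so it is easier to review in the flattened diff; it assumes HIP's cooperative-groups tile API from <hip/hip_cooperative_groups.h> and does not introduce any new behavior.

// Hedged sketch of the per-set spin lock used on the non-libcu++ path.
#include <hip/hip_runtime.h>

#include <hip/hip_cooperative_groups.h>

namespace cg = cooperative_groups;

// Acquire: lane 0 spins until it flips the mutex from 1 (free) to 0 (held).
// The fence plus tile sync then orders the critical section after the acquire
// for every lane in the (sub-)warp.
template <int warp_size>
__device__ void warp_lock_mutex(const cg::thread_block_tile<warp_size>& warp_tile,
                                volatile int& set_mutex) {
  if (warp_tile.thread_rank() == 0) {
    while (0 == atomicCAS((int*)&set_mutex, 1, 0)) {
    }
  }
  __threadfence();
  warp_tile.sync();
}

// Release: fence plus tile sync first, so the critical section's writes are
// globally visible, then lane 0 sets the mutex back to 1 (free).
template <int warp_size>
__device__ void warp_unlock_mutex(const cg::thread_block_tile<warp_size>& warp_tile,
                                  volatile int& set_mutex) {
  __threadfence();
  warp_tile.sync();
  if (warp_tile.thread_rank() == 0) {
    atomicExch((int*)&set_mutex, 1);
  }
}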
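
Reviewer note: insert_replace_kernel picks its eviction victim with warp_min_reduction, a shfl_xor butterfly that leaves every lane holding the smallest slot counter in the tile together with the slab/slot distances that produced it. The sketch below is a compact restatement under the same tie-breaking rules (smaller slab distance, then smaller slot distance); the folded condition is illustrative and not the exact control flow of the original.

// Hedged sketch of the LRU-victim selection via an XOR butterfly reduction.
#include <hip/hip_runtime.h>

#include <hip/hip_cooperative_groups.h>

#include <cstddef>

namespace cg = cooperative_groups;

// After log2(warp_size) rounds every lane holds the minimum slot counter seen
// in the tile plus the slab/slot distances that produced it, so any lane can
// compute the least-recently-used slot's position.
template <int warp_size, typename ref_counter_type>
__device__ void warp_min_reduction(const cg::thread_block_tile<warp_size>& warp_tile,
                                   ref_counter_type& min_slot_counter_val,
                                   size_t& slab_distance, size_t& slot_distance) {
  slot_distance = warp_tile.thread_rank();

  for (size_t i = (warp_tile.size() >> 1); i > 0; i >>= 1) {
    ref_counter_type other_counter = warp_tile.shfl_xor(min_slot_counter_val, (int)i);
    size_t other_slab = warp_tile.shfl_xor(slab_distance, (int)i);
    size_t other_slot = warp_tile.shfl_xor(slot_distance, (int)i);

    // Prefer the smaller counter; break ties by smaller slab distance, then
    // smaller slot distance (same ordering as the kernel's nested ifs).
    if (other_counter < min_slot_counter_val ||
        (other_counter == min_slot_counter_val &&
         (other_slab < slab_distance ||
          (other_slab == slab_distance && other_slot < slot_distance)))) {
      min_slot_counter_val = other_counter;
      slab_distance = other_slab;
      slot_distance = other_slot;
    }
  }
}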