From 95243efe17e66e769f8e829dc25f3f71909257ba Mon Sep 17 00:00:00 2001
From: Geoffrey Martin-Noble
Date: Tue, 28 Jan 2025 16:58:34 -0800
Subject: [PATCH] Hipify source in place (#2)

This is just the output of the hipify-inplace.sh and hipify-tensoradapter.py
scripts with no further modifications, so you'll probably want to just
spot-check it. I think it will be easier to review changes to the actual HIP
source directly rather than trying to think about what the hipify script will
do, and since there are a decent number of changes needed to get the build and
tests working, that seems worth it.

In the end we should have a bunch of HIP source files and .prehip CUDA files
that they can be generated from. Then we can handle organization however we
want: restore the originals and have hipification be part of a build process,
keep the HIP versions on a separate branch, etc. I put the .prehip files in a
separate commit to keep things a bit cleaner.

Note that this PR is based off of #1. I didn't make that the base branch,
though, because then it would all be in my fork.

---
 include/dgl/aten/macro.h | 20 +- include/dgl/aten/macro.h.prehip | 434 +++++ include/dgl/runtime/device_api.h | 6 +- include/dgl/runtime/device_api.h.prehip | 268 +++ include/dgl/runtime/ndarray.h | 24 +- include/dgl/runtime/ndarray.h.prehip | 890 +++++++++ include/dgl/runtime/tensordispatch.h | 38 +- include/dgl/runtime/tensordispatch.h.prehip | 281 +++ src/array/arith.h | 4 +- src/array/arith.h.prehip | 109 ++ src/array/cuda/array_cumsum.cu | 8 +- src/array/cuda/array_cumsum.cu.prehip | 57 + src/array/cuda/array_index_select.cu | 8 +- src/array/cuda/array_index_select.cu.prehip | 98 + src/array/cuda/array_index_select.cuh | 1 + src/array/cuda/array_index_select.cuh.prehip | 87 + src/array/cuda/array_nonzero.cu | 10 +- src/array/cuda/array_nonzero.cu.prehip | 71 + src/array/cuda/array_op_impl.cu | 23 +- src/array/cuda/array_op_impl.cu.prehip | 441 +++++ src/array/cuda/array_scatter.cu | 7 +- src/array/cuda/array_scatter.cu.prehip | 61 + src/array/cuda/array_sort.cu | 8 +- src/array/cuda/array_sort.cu.prehip | 61 + src/array/cuda/atomic.cuh | 32 +- src/array/cuda/atomic.cuh.prehip | 336 ++++ src/array/cuda/bf16.cuh | 112 +- src/array/cuda/bf16.cuh.prehip | 149 ++ src/array/cuda/coo2csr.cu | 13 +- src/array/cuda/coo2csr.cu.prehip | 137 ++ src/array/cuda/coo_sort.cu | 5 +- src/array/cuda/coo_sort.cu.prehip | 168 ++ src/array/cuda/csr2coo.cu | 28 +- src/array/cuda/csr2coo.cu.prehip | 183 ++ src/array/cuda/csr_get_data.cu | 10 +- src/array/cuda/csr_get_data.cu.prehip | 100 + src/array/cuda/csr_mm.cu | 126 +- src/array/cuda/csr_mm.cu.prehip | 332 ++++ src/array/cuda/csr_sort.cu | 31 +- src/array/cuda/csr_sort.cu.prehip | 151 ++ src/array/cuda/csr_sum.cu | 28 +- src/array/cuda/csr_sum.cu.prehip | 177 ++ src/array/cuda/csr_transpose.cc | 22 +- src/array/cuda/csr_transpose.cc.prehip | 95 + src/array/cuda/cuda_filter.cu | 11 +- src/array/cuda/cuda_filter.cu.prehip | 140 ++ src/array/cuda/cusparse_dispatcher.cuh | 126 +- src/array/cuda/cusparse_dispatcher.cuh.prehip | 238 +++ src/array/cuda/disjoint_union.cu | 5 +- src/array/cuda/disjoint_union.cu.prehip | 185 ++ src/array/cuda/fp16.cuh | 8 +- src/array/cuda/fp16.cuh.prehip | 134 ++ src/array/cuda/functor.cuh | 54 +- src/array/cuda/functor.cuh.prehip | 456 +++++ src/array/cuda/gather_mm.cu | 83 +- src/array/cuda/gather_mm.cu.prehip | 464 +++++ src/array/cuda/ge_spmm.cuh | 3 +- src/array/cuda/ge_spmm.cuh.prehip | 144 ++ src/array/cuda/labor_sampling.cu | 31 +- src/array/cuda/labor_sampling.cu.prehip | 
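For context on how the .prehip files come about: they match what HIPIFY's in-place mode produces, where hipify-perl -inplace rewrites a source file and saves the original next to it as <file>.prehip. The actual hipify-inplace.sh and hipify-tensoradapter.py are not part of this patch, so the snippet below is only a rough sketch of that flow; the source roots and file extensions it walks are assumptions, not the real scripts' configuration.

    #!/usr/bin/env python3
    # Illustrative sketch only -- not the actual hipify-inplace.sh or
    # hipify-tensoradapter.py from this PR. Assumes HIPIFY's hipify-perl is on
    # PATH; its -inplace flag rewrites each file in place and backs up the
    # original as <file>.prehip.
    import pathlib
    import subprocess

    ROOTS = ["include", "src", "tests"]   # assumed source roots
    EXTS = {".cu", ".cuh", ".h", ".cc"}   # assumed extensions to convert

    for root in ROOTS:
        for path in pathlib.Path(root).rglob("*"):
            if path.is_file() and path.suffix in EXTS:
                subprocess.run(["hipify-perl", "-inplace", str(path)], check=True)

Restoring the original CUDA sources later is then just a matter of moving each <file>.prehip back over <file> before re-running the pass.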
833 +++++++++ src/array/cuda/macro.cuh | 8 +- src/array/cuda/macro.cuh.prehip | 53 + src/array/cuda/negative_sampling.cu | 29 +- src/array/cuda/negative_sampling.cu.prehip | 220 +++ src/array/cuda/rowwise_sampling.cu | 37 +- src/array/cuda/rowwise_sampling.cu.prehip | 366 ++++ src/array/cuda/rowwise_sampling_prob.cu | 53 +- .../cuda/rowwise_sampling_prob.cu.prehip | 696 +++++++ src/array/cuda/sddmm.cu | 8 +- src/array/cuda/sddmm.cu.prehip | 99 + src/array/cuda/sddmm.cuh | 5 +- src/array/cuda/sddmm.cuh.prehip | 368 ++++ src/array/cuda/sddmm_hetero_coo.cu | 4 +- src/array/cuda/sddmm_hetero_coo.cu.prehip | 91 + src/array/cuda/sddmm_hetero_csr.cu | 4 +- src/array/cuda/sddmm_hetero_csr.cu.prehip | 90 + src/array/cuda/segment_reduce.cu | 16 +- src/array/cuda/segment_reduce.cu.prehip | 157 ++ src/array/cuda/segment_reduce.cuh | 9 +- src/array/cuda/segment_reduce.cuh.prehip | 262 +++ src/array/cuda/spmat_op_impl_coo.cu | 5 +- src/array/cuda/spmat_op_impl_coo.cu.prehip | 139 ++ src/array/cuda/spmat_op_impl_csr.cu | 23 +- src/array/cuda/spmat_op_impl_csr.cu.prehip | 654 +++++++ src/array/cuda/spmm.cu | 8 +- src/array/cuda/spmm.cu.prehip | 179 ++ src/array/cuda/spmm.cuh | 201 +- src/array/cuda/spmm.cuh.prehip | 802 ++++++++ src/array/cuda/spmm_hetero.cu | 8 +- src/array/cuda/spmm_hetero.cu.prehip | 262 +++ src/array/cuda/utils.cu | 8 +- src/array/cuda/utils.cu.prehip | 33 + src/array/cuda/utils.h | 19 +- src/array/cuda/utils.h.prehip | 301 +++ src/array/cuda/uvm/array_index_select_uvm.cu | 4 +- .../cuda/uvm/array_index_select_uvm.cu.prehip | 131 ++ src/array/cuda/uvm/array_index_select_uvm.cuh | 1 + .../uvm/array_index_select_uvm.cuh.prehip | 52 + src/array/filter.cc | 2 +- src/array/filter.cc.prehip | 54 + src/array/selector.h | 4 +- src/array/selector.h.prehip | 59 + src/array/uvm_array.cc | 4 +- src/array/uvm_array.cc.prehip | 74 + src/geometry/cuda/edge_coarsening_impl.cu | 19 +- .../cuda/edge_coarsening_impl.cu.prehip | 239 +++ src/geometry/cuda/geometry_op_impl.cu | 5 +- src/geometry/cuda/geometry_op_impl.cu.prehip | 135 ++ src/graph/heterograph_capi.cc | 2 +- src/graph/heterograph_capi.cc.prehip | 841 +++++++++ .../sampling/randomwalks/frequency_hashmap.cu | 37 +- .../randomwalks/frequency_hashmap.cu.prehip | 471 +++++ .../randomwalks/frequency_hashmap.cuh | 4 +- .../randomwalks/frequency_hashmap.cuh.prehip | 79 + .../randomwalks/get_node_types_gpu.cu | 2 +- .../randomwalks/get_node_types_gpu.cu.prehip | 72 + .../sampling/randomwalks/randomwalk_gpu.cu | 43 +- .../randomwalks/randomwalk_gpu.cu.prehip | 496 +++++ .../transform/cuda/cuda_compact_graph.cu | 8 +- .../cuda/cuda_compact_graph.cu.prehip | 247 +++ src/graph/transform/cuda/cuda_map_edges.cuh | 7 +- .../transform/cuda/cuda_map_edges.cuh.prehip | 240 +++ src/graph/transform/cuda/cuda_to_block.cu | 24 +- .../transform/cuda/cuda_to_block.cu.prehip | 258 +++ src/graph/transform/cuda/knn.cu | 45 +- src/graph/transform/cuda/knn.cu.prehip | 997 ++++++++++ src/graph/transform/to_block.cc | 4 +- src/graph/transform/to_block.cc.prehip | 383 ++++ src/partition/cuda/partition_op.cu | 35 +- src/partition/cuda/partition_op.cu.prehip | 613 ++++++ src/partition/ndarray_partition.cc | 12 +- src/partition/ndarray_partition.cc.prehip | 266 +++ src/random/continuous_seed.h | 14 +- src/random/continuous_seed.h.prehip | 100 + src/runtime/c_runtime_api.cc | 6 +- src/runtime/c_runtime_api.cc.prehip | 418 +++++ src/runtime/cuda/cuda_common.h | 126 +- src/runtime/cuda/cuda_common.h.prehip | 259 +++ src/runtime/cuda/cuda_device_api.cc | 158 +- 
src/runtime/cuda/cuda_device_api.cc.prehip | 377 ++++ src/runtime/cuda/cuda_hashtable.cu | 19 +- src/runtime/cuda/cuda_hashtable.cu.prehip | 443 +++++ src/runtime/cuda/cuda_hashtable.cuh | 9 +- src/runtime/cuda/cuda_hashtable.cuh.prehip | 284 +++ src/runtime/cuda/gpu_cache.cu | 8 +- src/runtime/cuda/gpu_cache.cu.prehip | 189 ++ src/runtime/ndarray.cc | 8 +- src/runtime/ndarray.cc.prehip | 505 +++++ tensoradapter/include/tensoradapter.h | 19 +- tensoradapter/include/tensoradapter.h.prehip | 113 ++ tensoradapter/pytorch/torch.cpp | 47 +- tensoradapter/pytorch/torch.cpp.prehip | 106 ++ tests/cpp/common.h | 2 +- tests/cpp/common.h.prehip | 56 + tests/cpp/test_aten.cc | 28 +- tests/cpp/test_aten.cc.prehip | 1437 ++++++++++++++ tests/cpp/test_csrmm.cc | 6 +- tests/cpp/test_csrmm.cc.prehip | 215 +++ tests/cpp/test_partition.cc | 4 +- tests/cpp/test_partition.cc.prehip | 196 ++ tests/cpp/test_spmat_coo.cc | 6 +- tests/cpp/test_spmat_coo.cc.prehip | 576 ++++++ tests/cpp/test_spmat_csr.cc | 24 +- tests/cpp/test_spmat_csr.cc.prehip | 760 ++++++++ tests/cpp/test_unit_graph.cc | 14 +- tests/cpp/test_unit_graph.cc.prehip | 434 +++++ .../gpu_cache/include/gpu_cache_api.hpp | 10 +- .../include/gpu_cache_api.hpp.prehip | 55 + .../gpu_cache/include/nv_gpu_cache.hpp | 10 +- .../gpu_cache/include/nv_gpu_cache.hpp.prehip | 122 ++ .../HugeCTR/gpu_cache/include/nv_util.h | 30 +- .../gpu_cache/include/nv_util.h.prehip | 90 + .../HugeCTR/gpu_cache/src/nv_gpu_cache.cu | 97 +- .../gpu_cache/src/nv_gpu_cache.cu.prehip | 1645 +++++++++++++++++ 174 files changed, 27274 insertions(+), 1102 deletions(-) create mode 100644 include/dgl/aten/macro.h.prehip create mode 100644 include/dgl/runtime/device_api.h.prehip create mode 100644 include/dgl/runtime/ndarray.h.prehip create mode 100644 include/dgl/runtime/tensordispatch.h.prehip create mode 100644 src/array/arith.h.prehip create mode 100644 src/array/cuda/array_cumsum.cu.prehip create mode 100644 src/array/cuda/array_index_select.cu.prehip create mode 100644 src/array/cuda/array_index_select.cuh.prehip create mode 100644 src/array/cuda/array_nonzero.cu.prehip create mode 100644 src/array/cuda/array_op_impl.cu.prehip create mode 100644 src/array/cuda/array_scatter.cu.prehip create mode 100644 src/array/cuda/array_sort.cu.prehip create mode 100644 src/array/cuda/atomic.cuh.prehip create mode 100644 src/array/cuda/bf16.cuh.prehip create mode 100644 src/array/cuda/coo2csr.cu.prehip create mode 100644 src/array/cuda/coo_sort.cu.prehip create mode 100644 src/array/cuda/csr2coo.cu.prehip create mode 100644 src/array/cuda/csr_get_data.cu.prehip create mode 100644 src/array/cuda/csr_mm.cu.prehip create mode 100644 src/array/cuda/csr_sort.cu.prehip create mode 100644 src/array/cuda/csr_sum.cu.prehip create mode 100644 src/array/cuda/csr_transpose.cc.prehip create mode 100644 src/array/cuda/cuda_filter.cu.prehip create mode 100644 src/array/cuda/cusparse_dispatcher.cuh.prehip create mode 100644 src/array/cuda/disjoint_union.cu.prehip create mode 100644 src/array/cuda/fp16.cuh.prehip create mode 100644 src/array/cuda/functor.cuh.prehip create mode 100644 src/array/cuda/gather_mm.cu.prehip create mode 100644 src/array/cuda/ge_spmm.cuh.prehip create mode 100644 src/array/cuda/labor_sampling.cu.prehip create mode 100644 src/array/cuda/macro.cuh.prehip create mode 100644 src/array/cuda/negative_sampling.cu.prehip create mode 100644 src/array/cuda/rowwise_sampling.cu.prehip create mode 100644 src/array/cuda/rowwise_sampling_prob.cu.prehip create mode 100644 src/array/cuda/sddmm.cu.prehip 
create mode 100644 src/array/cuda/sddmm.cuh.prehip create mode 100644 src/array/cuda/sddmm_hetero_coo.cu.prehip create mode 100644 src/array/cuda/sddmm_hetero_csr.cu.prehip create mode 100644 src/array/cuda/segment_reduce.cu.prehip create mode 100644 src/array/cuda/segment_reduce.cuh.prehip create mode 100644 src/array/cuda/spmat_op_impl_coo.cu.prehip create mode 100644 src/array/cuda/spmat_op_impl_csr.cu.prehip create mode 100644 src/array/cuda/spmm.cu.prehip create mode 100644 src/array/cuda/spmm.cuh.prehip create mode 100644 src/array/cuda/spmm_hetero.cu.prehip create mode 100644 src/array/cuda/utils.cu.prehip create mode 100644 src/array/cuda/utils.h.prehip create mode 100644 src/array/cuda/uvm/array_index_select_uvm.cu.prehip create mode 100644 src/array/cuda/uvm/array_index_select_uvm.cuh.prehip create mode 100644 src/array/filter.cc.prehip create mode 100644 src/array/selector.h.prehip create mode 100644 src/array/uvm_array.cc.prehip create mode 100644 src/geometry/cuda/edge_coarsening_impl.cu.prehip create mode 100644 src/geometry/cuda/geometry_op_impl.cu.prehip create mode 100644 src/graph/heterograph_capi.cc.prehip create mode 100644 src/graph/sampling/randomwalks/frequency_hashmap.cu.prehip create mode 100644 src/graph/sampling/randomwalks/frequency_hashmap.cuh.prehip create mode 100644 src/graph/sampling/randomwalks/get_node_types_gpu.cu.prehip create mode 100644 src/graph/sampling/randomwalks/randomwalk_gpu.cu.prehip create mode 100644 src/graph/transform/cuda/cuda_compact_graph.cu.prehip create mode 100644 src/graph/transform/cuda/cuda_map_edges.cuh.prehip create mode 100644 src/graph/transform/cuda/cuda_to_block.cu.prehip create mode 100644 src/graph/transform/cuda/knn.cu.prehip create mode 100644 src/graph/transform/to_block.cc.prehip create mode 100644 src/partition/cuda/partition_op.cu.prehip create mode 100644 src/partition/ndarray_partition.cc.prehip create mode 100644 src/random/continuous_seed.h.prehip create mode 100644 src/runtime/c_runtime_api.cc.prehip create mode 100644 src/runtime/cuda/cuda_common.h.prehip create mode 100644 src/runtime/cuda/cuda_device_api.cc.prehip create mode 100644 src/runtime/cuda/cuda_hashtable.cu.prehip create mode 100644 src/runtime/cuda/cuda_hashtable.cuh.prehip create mode 100644 src/runtime/cuda/gpu_cache.cu.prehip create mode 100644 src/runtime/ndarray.cc.prehip create mode 100644 tensoradapter/include/tensoradapter.h.prehip create mode 100644 tensoradapter/pytorch/torch.cpp.prehip create mode 100644 tests/cpp/common.h.prehip create mode 100644 tests/cpp/test_aten.cc.prehip create mode 100644 tests/cpp/test_csrmm.cc.prehip create mode 100644 tests/cpp/test_partition.cc.prehip create mode 100644 tests/cpp/test_spmat_coo.cc.prehip create mode 100644 tests/cpp/test_spmat_csr.cc.prehip create mode 100644 tests/cpp/test_unit_graph.cc.prehip create mode 100644 third_party/HugeCTR/gpu_cache/include/gpu_cache_api.hpp.prehip create mode 100644 third_party/HugeCTR/gpu_cache/include/nv_gpu_cache.hpp.prehip create mode 100644 third_party/HugeCTR/gpu_cache/include/nv_util.h.prehip create mode 100644 third_party/HugeCTR/gpu_cache/src/nv_gpu_cache.cu.prehip diff --git a/include/dgl/aten/macro.h b/include/dgl/aten/macro.h index b760f3e88b41..ef97762d94c2 100644 --- a/include/dgl/aten/macro.h +++ b/include/dgl/aten/macro.h @@ -41,7 +41,7 @@ * We treat pinned memory as normal host memory if we don't want * to enable CUDA UVA access for this operator */ -#ifdef DGL_USE_CUDA +#ifdef DGL_USE_ROCM #define ATEN_XPU_SWITCH_CUDA(val, XPU, op, ...) 
\ do { \ if ((val) == kDGLCPU) { \ @@ -55,9 +55,9 @@ << dgl::runtime::DeviceTypeCode2Str(val) << " device."; \ } \ } while (0) -#else // DGL_USE_CUDA +#else // DGL_USE_ROCM #define ATEN_XPU_SWITCH_CUDA ATEN_XPU_SWITCH -#endif // DGL_USE_CUDA +#endif // DGL_USE_ROCM /** * Dispatch according to integral type (either int32 or int64): @@ -132,7 +132,7 @@ * Dispatch according to float type, including 16bits * (float16/bfloat16/float32/float64). */ -#ifdef DGL_USE_CUDA +#ifdef DGL_USE_ROCM #if BF16_ENABLED #define ATEN_FLOAT_TYPE_SWITCH_16BITS(val, FloatType, XPU, val_name, ...) \ do { \ @@ -150,7 +150,7 @@ { __VA_ARGS__ } \ } else if ( \ XPU == kDGLCUDA && (val).bits == 16 && (val).code == kDGLBfloat) { \ - typedef __nv_bfloat16 FloatType; \ + typedef __hip_bfloat16 FloatType; \ { __VA_ARGS__ } \ } else if ( \ XPU == kDGLCPU && (val).bits == 16 && (val).code == kDGLFloat) { \ @@ -195,7 +195,7 @@ } \ } while (0) #endif // BF16_ENABLED -#else // DGL_USE_CUDA +#else // DGL_USE_ROCM #define ATEN_FLOAT_TYPE_SWITCH_16BITS(val, FloatType, XPU, val_name, ...) \ do { \ CHECK((val).code == kDGLFloat || (val.code == kDGLBfloat)) \ @@ -215,7 +215,7 @@ << " can only be bfloat16/float32/float64 on CPU"; \ } \ } while (0) -#endif // DGL_USE_CUDA +#endif // DGL_USE_ROCM /** * Dispatch according to data type (int32, int64, float32 or float64): @@ -361,7 +361,7 @@ } while (0) // Macro to dispatch according to device context (allowing cuda) -#ifdef DGL_USE_CUDA +#ifdef DGL_USE_ROCM #define ATEN_CSR_SWITCH_CUDA(csr, XPU, IdType, op, ...) \ ATEN_XPU_SWITCH_CUDA((csr).indptr->ctx.device_type, XPU, op, { \ ATEN_ID_TYPE_SWITCH((csr).indptr->dtype, IdType, {{__VA_ARGS__}}); \ @@ -372,10 +372,10 @@ ATEN_XPU_SWITCH_CUDA((coo).row->ctx.device_type, XPU, op, { \ ATEN_ID_TYPE_SWITCH((coo).row->dtype, IdType, {{__VA_ARGS__}}); \ }); -#else // DGL_USE_CUDA +#else // DGL_USE_ROCM #define ATEN_CSR_SWITCH_CUDA ATEN_CSR_SWITCH #define ATEN_COO_SWITCH_CUDA ATEN_COO_SWITCH -#endif // DGL_USE_CUDA +#endif // DGL_USE_ROCM ///////////////////////// Array checks ////////////////////////// diff --git a/include/dgl/aten/macro.h.prehip b/include/dgl/aten/macro.h.prehip new file mode 100644 index 000000000000..b760f3e88b41 --- /dev/null +++ b/include/dgl/aten/macro.h.prehip @@ -0,0 +1,434 @@ +/** + * Copyright (c) 2020 by Contributors + * @file dgl/aten/macro.h + * @brief Common macros for aten package. + */ + +#ifndef DGL_ATEN_MACRO_H_ +#define DGL_ATEN_MACRO_H_ + +///////////////////////// Dispatchers ////////////////////////// + +/** + * Dispatch according to device: + * + * ATEN_XPU_SWITCH(array->ctx.device_type, XPU, { + * // Now XPU is a placeholder for array->ctx.device_type + * DeviceSpecificImplementation(...); + * }); + */ +#define ATEN_XPU_SWITCH(val, XPU, op, ...) \ + do { \ + if ((val) == kDGLCPU) { \ + constexpr auto XPU = kDGLCPU; \ + { __VA_ARGS__ } \ + } else { \ + LOG(FATAL) << "Operator " << (op) << " does not support " \ + << dgl::runtime::DeviceTypeCode2Str(val) << " device."; \ + } \ + } while (0) + +/** + * Dispatch according to device: + * + * XXX(minjie): temporary macro that allows CUDA operator + * + * ATEN_XPU_SWITCH(array->ctx.device_type, XPU, { + * // Now XPU is a placeholder for array->ctx.device_type + * DeviceSpecificImplementation(...); + * }); + * + * We treat pinned memory as normal host memory if we don't want + * to enable CUDA UVA access for this operator + */ +#ifdef DGL_USE_CUDA +#define ATEN_XPU_SWITCH_CUDA(val, XPU, op, ...) 
\ + do { \ + if ((val) == kDGLCPU) { \ + constexpr auto XPU = kDGLCPU; \ + { __VA_ARGS__ } \ + } else if ((val) == kDGLCUDA) { \ + constexpr auto XPU = kDGLCUDA; \ + { __VA_ARGS__ } \ + } else { \ + LOG(FATAL) << "Operator " << (op) << " does not support " \ + << dgl::runtime::DeviceTypeCode2Str(val) << " device."; \ + } \ + } while (0) +#else // DGL_USE_CUDA +#define ATEN_XPU_SWITCH_CUDA ATEN_XPU_SWITCH +#endif // DGL_USE_CUDA + +/** + * Dispatch according to integral type (either int32 or int64): + * + * ATEN_ID_TYPE_SWITCH(array->dtype, IdType, { + * // Now IdType is the type corresponding to data type in array. + * // For instance, one can do this for a CPU array: + * DType *data = static_cast(array->data); + * }); + */ +#define ATEN_ID_TYPE_SWITCH(val, IdType, ...) \ + do { \ + CHECK_EQ((val).code, kDGLInt) << "ID must be integer type"; \ + if ((val).bits == 32) { \ + typedef int32_t IdType; \ + { __VA_ARGS__ } \ + } else if ((val).bits == 64) { \ + typedef int64_t IdType; \ + { __VA_ARGS__ } \ + } else { \ + LOG(FATAL) << "ID can only be int32 or int64"; \ + } \ + } while (0) + +/** + * Dispatch according to bits (either int32 or int64): + * + * ATEN_ID_BITS_SWITCH(bits, IdType, { + * // Now IdType is the type corresponding to data type in array. + * // For instance, one can do this for a CPU array: + * DType *data = static_cast(array->data); + * }); + */ +#define ATEN_ID_BITS_SWITCH(bits, IdType, ...) \ + do { \ + CHECK((bits) == 32 || (bits) == 64) << "bits must be 32 or 64"; \ + if ((bits) == 32) { \ + typedef int32_t IdType; \ + { __VA_ARGS__ } \ + } else if ((bits) == 64) { \ + typedef int64_t IdType; \ + { __VA_ARGS__ } \ + } else { \ + LOG(FATAL) << "ID can only be int32 or int64"; \ + } \ + } while (0) + +/** + * Dispatch according to float type (either float32 or float64): + * + * ATEN_FLOAT_TYPE_SWITCH(array->dtype, FloatType, { + * // Now FloatType is the type corresponding to data type in array. + * // For instance, one can do this for a CPU array: + * FloatType *data = static_cast(array->data); + * }); + */ +#define ATEN_FLOAT_TYPE_SWITCH(val, FloatType, val_name, ...) \ + do { \ + CHECK_EQ((val).code, kDGLFloat) << (val_name) << " must be float type"; \ + if ((val).bits == 32) { \ + typedef float FloatType; \ + { __VA_ARGS__ } \ + } else if ((val).bits == 64) { \ + typedef double FloatType; \ + { __VA_ARGS__ } \ + } else { \ + LOG(FATAL) << (val_name) << " can only be float32 or float64"; \ + } \ + } while (0) + +/** + * Dispatch according to float type, including 16bits + * (float16/bfloat16/float32/float64). + */ +#ifdef DGL_USE_CUDA +#if BF16_ENABLED +#define ATEN_FLOAT_TYPE_SWITCH_16BITS(val, FloatType, XPU, val_name, ...) 
\ + do { \ + CHECK((val).code == kDGLFloat || (val.code == kDGLBfloat)) \ + << (val_name) << " must be float type"; \ + if ((val).bits == 32) { \ + typedef float FloatType; \ + { __VA_ARGS__ } \ + } else if ((val).bits == 64) { \ + typedef double FloatType; \ + { __VA_ARGS__ } \ + } else if ( \ + XPU == kDGLCUDA && (val).bits == 16 && (val).code == kDGLFloat) { \ + typedef __half FloatType; \ + { __VA_ARGS__ } \ + } else if ( \ + XPU == kDGLCUDA && (val).bits == 16 && (val).code == kDGLBfloat) { \ + typedef __nv_bfloat16 FloatType; \ + { __VA_ARGS__ } \ + } else if ( \ + XPU == kDGLCPU && (val).bits == 16 && (val).code == kDGLFloat) { \ + LOG(FATAL) << (val_name) << " can't be float16 on CPU"; \ + } else if ( \ + XPU == kDGLCPU && (val).bits == 16 && (val).code == kDGLBfloat) { \ + typedef BFloat16 FloatType; \ + { __VA_ARGS__ } \ + } else { \ + LOG(FATAL) << (val_name) \ + << " can only be float16/bfloat16/float32/float64 on GPU"; \ + } \ + } while (0) +#else // BF16_ENABLED +#define ATEN_FLOAT_TYPE_SWITCH_16BITS(val, FloatType, XPU, val_name, ...) \ + do { \ + CHECK((val).code == kDGLFloat || (val.code == kDGLBfloat)) \ + << (val_name) << " must be float type"; \ + if ((val).bits == 32) { \ + typedef float FloatType; \ + { __VA_ARGS__ } \ + } else if ((val).bits == 64) { \ + typedef double FloatType; \ + { __VA_ARGS__ } \ + } else if ( \ + XPU == kDGLCUDA && (val).bits == 16 && (val).code == kDGLFloat) { \ + typedef __half FloatType; \ + { __VA_ARGS__ } \ + } else if ( \ + XPU == kDGLCUDA && (val).bits == 16 && (val).code == kDGLBfloat) { \ + LOG(FATAL) << "bfloat16 requires CUDA >= 11.0"; \ + } else if ( \ + XPU == kDGLCPU && (val).bits == 16 && (val).code == kDGLFloat) { \ + LOG(FATAL) << (val_name) << " can't be float16 on CPU"; \ + } else if ( \ + XPU == kDGLCPU && (val).bits == 16 && (val).code == kDGLBfloat) { \ + typedef BFloat16 FloatType; \ + { __VA_ARGS__ } \ + } else { \ + LOG(FATAL) << (val_name) \ + << " can only be float16/float32/float64 on GPU"; \ + } \ + } while (0) +#endif // BF16_ENABLED +#else // DGL_USE_CUDA +#define ATEN_FLOAT_TYPE_SWITCH_16BITS(val, FloatType, XPU, val_name, ...) \ + do { \ + CHECK((val).code == kDGLFloat || (val.code == kDGLBfloat)) \ + << (val_name) << " must be float type"; \ + if ((val).bits == 32) { \ + typedef float FloatType; \ + { __VA_ARGS__ } \ + } else if ((val).bits == 64) { \ + typedef double FloatType; \ + { __VA_ARGS__ } \ + } else if ( \ + XPU == kDGLCPU && (val).bits == 16 && (val).code == kDGLBfloat) { \ + typedef BFloat16 FloatType; \ + { __VA_ARGS__ } \ + } else { \ + LOG(FATAL) << (val_name) \ + << " can only be bfloat16/float32/float64 on CPU"; \ + } \ + } while (0) +#endif // DGL_USE_CUDA + +/** + * Dispatch according to data type (int32, int64, float32 or float64): + * + * ATEN_DTYPE_SWITCH(array->dtype, DType, { + * // Now DType is the type corresponding to data type in array. + * // For instance, one can do this for a CPU array: + * DType *data = static_cast(array->data); + * }); + */ +#define ATEN_DTYPE_SWITCH(val, DType, val_name, ...) 
\ + do { \ + if ((val).code == kDGLInt && (val).bits == 32) { \ + typedef int32_t DType; \ + { __VA_ARGS__ } \ + } else if ((val).code == kDGLInt && (val).bits == 64) { \ + typedef int64_t DType; \ + { __VA_ARGS__ } \ + } else if ((val).code == kDGLFloat && (val).bits == 32) { \ + typedef float DType; \ + { __VA_ARGS__ } \ + } else if ((val).code == kDGLFloat && (val).bits == 64) { \ + typedef double DType; \ + { __VA_ARGS__ } \ + } else { \ + LOG(FATAL) << (val_name) \ + << " can only be int32, int64, float32 or float64"; \ + } \ + } while (0) + +/** + * Dispatch according to data type (int8, uint8, float32 or float64): + * + * ATEN_FLOAT_INT8_UINT8_TYPE_SWITCH(array->dtype, DType, { + * // Now DType is the type corresponding to data type in array. + * // For instance, one can do this for a CPU array: + * DType *data = static_cast(array->data); + * }); + */ +#define ATEN_FLOAT_INT8_UINT8_TYPE_SWITCH(val, DType, val_name, ...) \ + do { \ + if ((val).code == kDGLInt && (val).bits == 8) { \ + typedef int8_t DType; \ + { __VA_ARGS__ } \ + } else if ((val).code == kDGLUInt && (val).bits == 8) { \ + typedef uint8_t DType; \ + { __VA_ARGS__ } \ + } else if ((val).code == kDGLFloat && (val).bits == 32) { \ + typedef float DType; \ + { __VA_ARGS__ } \ + } else if ((val).code == kDGLFloat && (val).bits == 64) { \ + typedef double DType; \ + { __VA_ARGS__ } \ + } else { \ + LOG(FATAL) << (val_name) \ + << " can only be int8, uint8, float32 or float64"; \ + } \ + } while (0) + +/** + * Dispatch data type only based on bit-width (8-bit, 16-bit, 32-bit, 64-bit): + * + * ATEN_DTYPE_BITS_ONLY_SWITCH(array->dtype, DType, { + * // Now DType is the type which has the same bit-width with the + * // data type in array. + * // Do not use for computation, but only for read and write. + * // For instance, one can do this for a CPU array: + * DType *data = static_cast(array->data); + * }); + */ +#define ATEN_DTYPE_BITS_ONLY_SWITCH(val, DType, val_name, ...) \ + do { \ + if ((val).bits == 8) { \ + typedef int8_t DType; \ + { __VA_ARGS__ } \ + } else if ((val).bits == 16) { \ + typedef int16_t DType; \ + { __VA_ARGS__ } \ + } else if ((val).bits == 32) { \ + typedef int32_t DType; \ + { __VA_ARGS__ } \ + } else if ((val).bits == 64) { \ + typedef int64_t DType; \ + { __VA_ARGS__ } \ + } else { \ + LOG(FATAL) << (val_name) \ + << " can only be 8-bit, 16-bit, 32-bit, or 64-bit"; \ + } \ + } while (0) + +/** + * Dispatch according to integral type of CSR graphs. + * Identical to ATEN_ID_TYPE_SWITCH except for a different error message. + */ +#define ATEN_CSR_DTYPE_SWITCH(val, DType, ...) \ + do { \ + if ((val).code == kDGLInt && (val).bits == 32) { \ + typedef int32_t DType; \ + { __VA_ARGS__ } \ + } else if ((val).code == kDGLInt && (val).bits == 64) { \ + typedef int64_t DType; \ + { __VA_ARGS__ } \ + } else { \ + LOG(FATAL) << "CSR matrix data can only be int32 or int64"; \ + } \ + } while (0) + +// Macro to dispatch according to device context and index type. +#define ATEN_CSR_SWITCH(csr, XPU, IdType, op, ...) \ + ATEN_XPU_SWITCH((csr).indptr->ctx.device_type, XPU, op, { \ + ATEN_ID_TYPE_SWITCH((csr).indptr->dtype, IdType, {{__VA_ARGS__}}); \ + }); + +// Macro to dispatch according to device context and index type. +#define ATEN_COO_SWITCH(coo, XPU, IdType, op, ...) 
\ + ATEN_XPU_SWITCH((coo).row->ctx.device_type, XPU, op, { \ + ATEN_ID_TYPE_SWITCH((coo).row->dtype, IdType, {{__VA_ARGS__}}); \ + }); + +#define CHECK_VALID_CONTEXT(VAR1, VAR2) \ + CHECK( \ + ((VAR1)->ctx == (VAR2)->ctx) || (VAR1).IsPinned() || \ + ((VAR1).NumElements() == 0)) /* Let empty arrays pass */ \ + << "Expected " << (#VAR2) << "(" << (VAR2)->ctx << ")" \ + << " to have the same device " \ + << "context as " << (#VAR1) << "(" << (VAR1)->ctx << "). " \ + << "Or " << (#VAR1) << "(" << (VAR1)->ctx << ")" \ + << " is pinned"; + +/** + * Macro to dispatch according to the context of array and dtype of csr + * to enable CUDA UVA ops. + * Context check is covered here to avoid confusion with CHECK_SAME_CONTEXT. + * If csr has the same context with array, same behivor as ATEN_CSR_SWITCH_CUDA. + * If csr is pinned, array's context will conduct the actual operation. + */ +#define ATEN_CSR_SWITCH_CUDA_UVA(csr, array, XPU, IdType, op, ...) \ + do { \ + CHECK_VALID_CONTEXT(csr.indices, array); \ + ATEN_XPU_SWITCH_CUDA(array->ctx.device_type, XPU, op, { \ + ATEN_ID_TYPE_SWITCH((csr).indptr->dtype, IdType, {{__VA_ARGS__}}); \ + }); \ + } while (0) + +// Macro to dispatch according to device context (allowing cuda) +#ifdef DGL_USE_CUDA +#define ATEN_CSR_SWITCH_CUDA(csr, XPU, IdType, op, ...) \ + ATEN_XPU_SWITCH_CUDA((csr).indptr->ctx.device_type, XPU, op, { \ + ATEN_ID_TYPE_SWITCH((csr).indptr->dtype, IdType, {{__VA_ARGS__}}); \ + }); + +// Macro to dispatch according to device context and index type. +#define ATEN_COO_SWITCH_CUDA(coo, XPU, IdType, op, ...) \ + ATEN_XPU_SWITCH_CUDA((coo).row->ctx.device_type, XPU, op, { \ + ATEN_ID_TYPE_SWITCH((coo).row->dtype, IdType, {{__VA_ARGS__}}); \ + }); +#else // DGL_USE_CUDA +#define ATEN_CSR_SWITCH_CUDA ATEN_CSR_SWITCH +#define ATEN_COO_SWITCH_CUDA ATEN_COO_SWITCH +#endif // DGL_USE_CUDA + +///////////////////////// Array checks ////////////////////////// + +#define IS_INT32(a) ((a)->dtype.code == kDGLInt && (a)->dtype.bits == 32) +#define IS_INT64(a) ((a)->dtype.code == kDGLInt && (a)->dtype.bits == 64) +#define IS_FLOAT32(a) ((a)->dtype.code == kDGLFloat && (a)->dtype.bits == 32) +#define IS_FLOAT64(a) ((a)->dtype.code == kDGLFloat && (a)->dtype.bits == 64) + +#define CHECK_IF(cond, prop, value_name, dtype_name) \ + CHECK(cond) << "Expecting " << (prop) << " of " << (value_name) << " to be " \ + << (dtype_name) + +#define CHECK_INT32(value, value_name) \ + CHECK_IF(IS_INT32(value), "dtype", value_name, "int32") +#define CHECK_INT64(value, value_name) \ + CHECK_IF(IS_INT64(value), "dtype", value_name, "int64") +#define CHECK_INT(value, value_name) \ + CHECK_IF( \ + IS_INT32(value) || IS_INT64(value), "dtype", value_name, \ + "int32 or int64") +#define CHECK_FLOAT32(value, value_name) \ + CHECK_IF(IS_FLOAT32(value), "dtype", value_name, "float32") +#define CHECK_FLOAT64(value, value_name) \ + CHECK_IF(IS_FLOAT64(value), "dtype", value_name, "float64") +#define CHECK_FLOAT(value, value_name) \ + CHECK_IF( \ + IS_FLOAT32(value) || IS_FLOAT64(value), "dtype", value_name, \ + "float32 or float64") + +#define CHECK_NDIM(value, _ndim, value_name) \ + CHECK_IF((value)->ndim == (_ndim), "ndim", value_name, _ndim) + +#define CHECK_SAME_DTYPE(VAR1, VAR2) \ + CHECK((VAR1)->dtype == (VAR2)->dtype) \ + << "Expected " << (#VAR2) << " to be the same type as " << (#VAR1) \ + << "(" << (VAR1)->dtype << ")" \ + << ". 
But got " << (VAR2)->dtype << "."; + +#define CHECK_SAME_CONTEXT(VAR1, VAR2) \ + CHECK((VAR1)->ctx == (VAR2)->ctx) \ + << "Expected " << (#VAR2) << " to have the same device context as " \ + << (#VAR1) << "(" << (VAR1)->ctx << ")" \ + << ". But got " << (VAR2)->ctx << "."; + +#define CHECK_NO_OVERFLOW(dtype, val) \ + do { \ + if (sizeof(val) == 8 && (dtype).bits == 32) \ + CHECK_LE((val), 0x7FFFFFFFL) \ + << "int32 overflow for argument " << (#val) << "."; \ + } while (0); + +#define CHECK_IS_ID_ARRAY(VAR) \ + CHECK((VAR)->ndim == 1 && (IS_INT32(VAR) || IS_INT64(VAR))) \ + << "Expected argument " << (#VAR) << " to be an 1D integer array."; + +#endif // DGL_ATEN_MACRO_H_ diff --git a/include/dgl/runtime/device_api.h b/include/dgl/runtime/device_api.h index d085d26c0fe1..f758c35ee744 100644 --- a/include/dgl/runtime/device_api.h +++ b/include/dgl/runtime/device_api.h @@ -174,7 +174,7 @@ class DeviceAPI { DGLContext ctx, DGLStreamHandle event_src, DGLStreamHandle event_dst); /** - * @brief Pin host memory using cudaHostRegister(). + * @brief Pin host memory using hipHostRegister(). * * @param ptr The host memory pointer to be pinned. * @param nbytes The size to be pinned. @@ -183,7 +183,7 @@ class DeviceAPI { DGL_DLL virtual bool PinData(void* ptr, size_t nbytes); /** - * @brief Unpin host memory using cudaHostUnregister(). + * @brief Unpin host memory using hipHostUnregister(). * * @param ptr The host memory pointer to be unpinned. */ @@ -203,7 +203,7 @@ class DeviceAPI { /** * @brief 'Deallocate' the pinned memory from PyTorch CachingHostAllocator. - * @note It avoids unnecessary cudaFreeHost calls and puts the memory + * @note It avoids unnecessary hipHostFree calls and puts the memory * block into CachingHostAllocator's free list. * @param deleter Pointer to the deleter function from PyTorch's * CachingHostAllocator. diff --git a/include/dgl/runtime/device_api.h.prehip b/include/dgl/runtime/device_api.h.prehip new file mode 100644 index 000000000000..d085d26c0fe1 --- /dev/null +++ b/include/dgl/runtime/device_api.h.prehip @@ -0,0 +1,268 @@ +/** + * Copyright (c) 2016 by Contributors + * @file dgl/runtime/device_api.h + * @brief Abstract device memory management API + */ +#ifndef DGL_RUNTIME_DEVICE_API_H_ +#define DGL_RUNTIME_DEVICE_API_H_ + +#include + +#include "c_runtime_api.h" +#include "packed_func.h" + +namespace dgl { +namespace runtime { +/** + * @brief the query type into GetAttr + */ +enum DeviceAttrKind : int { + kExist = 0, + kMaxThreadsPerBlock = 1, + kWarpSize = 2, + kMaxSharedMemoryPerBlock = 3, + kComputeVersion = 4, + kDeviceName = 5, + kMaxClockRate = 6, + kMultiProcessorCount = 7, + kMaxThreadDimensions = 8 +}; + +/** @brief Number of bytes each allocation must align to */ +constexpr int kAllocAlignment = 64; + +/** @brief Number of bytes each allocation must align to in temporary allocation + */ +constexpr int kTempAllocaAlignment = 64; + +/** @brief Maximum size that can be allocated on stack */ +constexpr int kMaxStackAlloca = 1024; + +/** + * @brief DGL Runtime Device API, abstracts the device + * specific interface for memory management. + */ +class DeviceAPI { + public: + /** @brief virtual destructor */ + virtual ~DeviceAPI() {} + /** + * @brief Check whether the device is available. + */ + virtual bool IsAvailable() { return true; } + + /** + * @brief Set the environment device id to ctx + * @param ctx The context to be set. + */ + virtual void SetDevice(DGLContext ctx) = 0; + + /** + * @brief Get attribute of specified device. 
+ * @param ctx The device context + * @param kind The result kind + * @param rv The return value. + * @sa DeviceAttrKind + */ + virtual void GetAttr( + DGLContext ctx, DeviceAttrKind kind, DGLRetValue* rv) = 0; + + /** + * @brief Allocate a data space on device. + * @param ctx The device context to perform operation. + * @param nbytes The number of bytes in memory. + * @param alignment The alignment of the memory. + * @param type_hint The type of elements. Only needed by certain backends such + * as OpenGL, as nbytes & alignment are sufficient for most backends. + * @return The allocated device pointer. + */ + virtual void* AllocDataSpace( + DGLContext ctx, size_t nbytes, size_t alignment, + DGLDataType type_hint) = 0; + + /** + * @brief Free a data space on device. + * @param ctx The device context to perform operation. + * @param ptr The data space. + */ + virtual void FreeDataSpace(DGLContext ctx, void* ptr) = 0; + + /** + * @brief copy data from one place to another + * @param from The source array. + * @param from_offset The byte offeset in the from. + * @param to The target array. + * @param to_offset The byte offset in the to. + * @param num_bytes The size of the memory in bytes. + * @param ctx_from The source context. + * @param ctx_to The target context. + * @param type_hint The type of elements, only needed by certain backends, + * can be useful for cross device endian converison. + */ + virtual void CopyDataFromTo( + const void* from, size_t from_offset, void* to, size_t to_offset, + size_t num_bytes, DGLContext ctx_from, DGLContext ctx_to, + DGLDataType type_hint) = 0; + + /** + * @brief copy data between device and CPU while recording the event. + * @param from The source array. + * @param from_offset The byte offeset in the from. + * @param to The target array. + * @param to_offset The byte offset in the to. + * @param num_bytes The size of the memory in bytes. + * @param ctx_from The source context. + * @param ctx_to The target context. + * @param type_hint The type of elements, only needed by certain backends, + * can be useful for cross device endian converison. + * @param pytorch_ctx The context pointer from PyTorch's CachingHostAllocator. + * @note This function only works when PyTorch CachingHostAllocator is + * available. + */ + virtual void RecordedCopyDataFromTo( + void* from, size_t from_offset, void* to, size_t to_offset, + size_t num_bytes, DGLContext ctx_from, DGLContext ctx_to, + DGLDataType type_hint, void* pytorch_ctx) = 0; + + /** + * @brief Create a new stream of execution. + * + * @param ctx The context of allocation. + */ + DGL_DLL virtual DGLStreamHandle CreateStream(DGLContext ctx); + + /** + * @brief Free a stream of execution + * + * @param ctx The context of the stream + * @param stream The pointer to be freed. + */ + DGL_DLL virtual void FreeStream(DGLContext ctx, DGLStreamHandle stream); + + /** + * @brief Synchronize the stream + * @param ctx The context to perform operation. + * @param stream The stream to be sync. + */ + virtual void StreamSync(DGLContext ctx, DGLStreamHandle stream) = 0; + + /** + * @brief Set the stream + * @param ctx The context to set stream. + * @param stream The stream to be set. + */ + virtual void SetStream(DGLContext ctx, DGLStreamHandle stream) {} + + /** + * @brief Get the stream + */ + virtual DGLStreamHandle GetStream() const { return nullptr; } + + /** + * @brief Synchronize 2 streams of execution. + * + * An event is created in event_src stream that the second then + * stream waits on. 
Neither event_src or event_dst need to be of + * the same device ID as the context, but they must be of the same + * device type. + * + * @param ctx The context of the streams. + * @param event_src The source stream to synchronize. + * @param event_dst The destination stream to synchronize. + */ + DGL_DLL virtual void SyncStreamFromTo( + DGLContext ctx, DGLStreamHandle event_src, DGLStreamHandle event_dst); + + /** + * @brief Pin host memory using cudaHostRegister(). + * + * @param ptr The host memory pointer to be pinned. + * @param nbytes The size to be pinned. + * @return false when pinning an empty tensor. true otherwise. + */ + DGL_DLL virtual bool PinData(void* ptr, size_t nbytes); + + /** + * @brief Unpin host memory using cudaHostUnregister(). + * + * @param ptr The host memory pointer to be unpinned. + */ + DGL_DLL virtual void UnpinData(void* ptr); + + /** + * @brief Allocate the pinned memory using PyTorch CachingHostAllocator. + * + * @param nbytes The size to be pinned. + * @param ctx Pointer to the context pointer from PyTorch's + * CachingHostAllocator. + * @param deleter Pointer to the deleter function from PyTorch's + * CachingHostAllocator. + */ + DGL_DLL virtual void* AllocPinnedDataSpace( + size_t nbytes, void** ctx, void** deleter); + + /** + * @brief 'Deallocate' the pinned memory from PyTorch CachingHostAllocator. + * @note It avoids unnecessary cudaFreeHost calls and puts the memory + * block into CachingHostAllocator's free list. + * @param deleter Pointer to the deleter function from PyTorch's + * CachingHostAllocator. + */ + DGL_DLL virtual void FreePinnedDataSpace(void** deleter); + + /** + * @brief Check whether the memory is in pinned memory. + */ + DGL_DLL virtual bool IsPinned(const void* ptr) { return false; } + + /** + * @brief Allocate temporal workspace for backend execution. + * + * \note We have the following assumption about backend temporal + * workspace allocation, and backend will optimize for such assumption: + * + * - Only a few allocation will happen, and space will be released after use. + * - The release order is usually in reverse order of allocate (stack style). + * - Repeative pattern of same allocations over different runs. + * - Workspace should not overlap between different threads(i.e. be + * threadlocal) + * + * @param ctx The context of allocation. + * @param nbytes The size to be allocated. + * @param type_hint The type of elements. Only needed by certain backends such + * as OpenGL, as nbytes is sufficient for most backends. + */ + DGL_DLL virtual void* AllocWorkspace( + DGLContext ctx, size_t nbytes, DGLDataType type_hint = {}); + + /** + * @brief Free temporal workspace in backend execution. + * + * @param ctx The context of allocation. + * @param ptr The pointer to be freed. + */ + DGL_DLL virtual void FreeWorkspace(DGLContext ctx, void* ptr); + + /** + * @brief Get device API based on context. + * @param ctx The context + * @param allow_missing Whether allow missing + * @return The corresponding device API. + */ + DGL_DLL static DeviceAPI* Get(DGLContext ctx, bool allow_missing = false); + + /** + * @brief Get device API based on device type. + * @param dev_type The device type + * @param allow_missing Whether allow missing + * @return The corresponding device API. 
+ */ + DGL_DLL static DeviceAPI* Get( + DGLDeviceType dev_type, bool allow_missing = false); +}; + +/** @brief The device type bigger than this is RPC device */ +constexpr int kRPCSessMask = 128; +} // namespace runtime +} // namespace dgl +#endif // DGL_RUNTIME_DEVICE_API_H_ diff --git a/include/dgl/runtime/ndarray.h b/include/dgl/runtime/ndarray.h index 40bbbed3631f..cde14d1cbc84 100644 --- a/include/dgl/runtime/ndarray.h +++ b/include/dgl/runtime/ndarray.h @@ -17,16 +17,16 @@ #include "serializer.h" #include "shared_mem.h" -#ifdef DGL_USE_CUDA -#include +#ifdef DGL_USE_ROCM +#include #define BF16_ENABLED (defined(CUDART_VERSION) && CUDART_VERSION >= 11000) -#include +#include #if BF16_ENABLED -#include +#include #endif // BF16_ENABLED -#endif // DGL_USE_CUDA +#endif // DGL_USE_ROCM // forward declaration inline std::ostream& operator<<(std::ostream& os, DGLDataType t); @@ -57,12 +57,12 @@ GEN_DGLDATATYPETRAITS_FOR(int64_t, kDGLInt, 64); // arrays, so I'm just converting uints to signed DTypes. GEN_DGLDATATYPETRAITS_FOR(uint32_t, kDGLInt, 32); GEN_DGLDATATYPETRAITS_FOR(uint64_t, kDGLInt, 64); -#ifdef DGL_USE_CUDA +#ifdef DGL_USE_ROCM GEN_DGLDATATYPETRAITS_FOR(__half, kDGLFloat, 16); #if BF16_ENABLED -GEN_DGLDATATYPETRAITS_FOR(__nv_bfloat16, kDGLBfloat, 16); +GEN_DGLDATATYPETRAITS_FOR(__hip_bfloat16, kDGLBfloat, 16); #endif // BF16_ENABLED -#endif // DGL_USE_CUDA +#endif // DGL_USE_ROCM GEN_DGLDATATYPETRAITS_FOR(float, kDGLFloat, 32); GEN_DGLDATATYPETRAITS_FOR(double, kDGLFloat, 64); #undef GEN_DGLDATATYPETRAITS_FOR @@ -185,7 +185,7 @@ class NDArray { * CachingHostAllocator for allocating pinned memory and copying data * from the current NDAarray. As a result, PyTorch is responsible for * managing the lifecycle of the returned NDArray, including deciding - * when to flush the data for reuse or call cudaFreeHost. The current + * when to flush the data for reuse or call hipHostFree. The current * context must be kDGLCPU, otherwise, an error will be thrown. */ inline NDArray PinMemory(); @@ -194,7 +194,7 @@ class NDArray { * @brief In-place method to pin the current array by calling PinContainer * on the underlying NDArray:Container. * @note This is an in-place method that flags the memory as page-locked by - * utilizing cudaHostRegister at the underlying level to pin the current + * utilizing hipHostRegister at the underlying level to pin the current * instance of NDArray. The current context must be kDGLCPU, otherwise, * an error will be thrown. */ @@ -523,7 +523,7 @@ inline void NDArray::CopyFrom(const NDArray& other) { // Pinned by PyTorch if (cpu_data->pinned_by_pytorch_) { // To ensure correct behavior, the event must be recorded after - // cudaMemcpyAsync as long as the memory is pinned by PyTorch. + // hipMemcpyAsync as long as the memory is pinned by PyTorch. void* pytorch_ctx = cpu_data->pytorch_ctx_; RecordedCopyFromTo( &(other.data_->dl_tensor), &(data_->dl_tensor), pytorch_ctx); @@ -549,7 +549,7 @@ inline void NDArray::CopyTo(const NDArray& other) const { // pinned by PyTorch if (cpu_data->pinned_by_pytorch_) { // To ensure correct behavior, the event must be recorded after - // cudaMemcpyAsync as long as the memory is pinned by PyTorch. + // hipMemcpyAsync as long as the memory is pinned by PyTorch. 
void* pytorch_ctx = cpu_data->pytorch_ctx_; RecordedCopyFromTo( &(data_->dl_tensor), &(other.data_->dl_tensor), pytorch_ctx); diff --git a/include/dgl/runtime/ndarray.h.prehip b/include/dgl/runtime/ndarray.h.prehip new file mode 100644 index 000000000000..40bbbed3631f --- /dev/null +++ b/include/dgl/runtime/ndarray.h.prehip @@ -0,0 +1,890 @@ +/** + * Copyright (c) 2017-2022 by Contributors + * @file dgl/runtime/ndarray.h + * @brief Abstract device memory management API + */ +#ifndef DGL_RUNTIME_NDARRAY_H_ +#define DGL_RUNTIME_NDARRAY_H_ + +#include +#include +#include +#include +#include + +#include "bfloat16.h" +#include "c_runtime_api.h" +#include "serializer.h" +#include "shared_mem.h" + +#ifdef DGL_USE_CUDA +#include + +#define BF16_ENABLED (defined(CUDART_VERSION) && CUDART_VERSION >= 11000) + +#include +#if BF16_ENABLED +#include +#endif // BF16_ENABLED +#endif // DGL_USE_CUDA + +// forward declaration +inline std::ostream& operator<<(std::ostream& os, DGLDataType t); + +namespace dgl { + +/** + * @brief Type traits that converts a C type to a DGLDataType. + * + * Usage: + * DGLDataTypeTraits::dtype == dtype + */ +template +struct DGLDataTypeTraits { + static constexpr DGLDataType dtype{0, 0, 0}; // dummy +}; +#define GEN_DGLDATATYPETRAITS_FOR(T, code, bits) \ + template <> \ + struct DGLDataTypeTraits { \ + static constexpr DGLDataType dtype{code, bits, 1}; \ + } +GEN_DGLDATATYPETRAITS_FOR(int8_t, kDGLInt, 8); +GEN_DGLDATATYPETRAITS_FOR(uint8_t, kDGLUInt, 8); +GEN_DGLDATATYPETRAITS_FOR(int16_t, kDGLInt, 16); +GEN_DGLDATATYPETRAITS_FOR(int32_t, kDGLInt, 32); +GEN_DGLDATATYPETRAITS_FOR(int64_t, kDGLInt, 64); +// XXX(BarclayII) most DL frameworks do not support unsigned int and long +// arrays, so I'm just converting uints to signed DTypes. +GEN_DGLDATATYPETRAITS_FOR(uint32_t, kDGLInt, 32); +GEN_DGLDATATYPETRAITS_FOR(uint64_t, kDGLInt, 64); +#ifdef DGL_USE_CUDA +GEN_DGLDATATYPETRAITS_FOR(__half, kDGLFloat, 16); +#if BF16_ENABLED +GEN_DGLDATATYPETRAITS_FOR(__nv_bfloat16, kDGLBfloat, 16); +#endif // BF16_ENABLED +#endif // DGL_USE_CUDA +GEN_DGLDATATYPETRAITS_FOR(float, kDGLFloat, 32); +GEN_DGLDATATYPETRAITS_FOR(double, kDGLFloat, 64); +#undef GEN_DGLDATATYPETRAITS_FOR + +namespace runtime { + +/** + * @brief DLPack converter. + */ +struct DLPackConvert; + +/** + * @brief Managed NDArray. + * The array is backed by reference counted blocks. + */ +class NDArray { + public: + // internal container type + struct Container; + /** @brief default constructor */ + NDArray() {} + /** + * @brief cosntruct a NDArray that refers to data + * @param data The data this NDArray refers to + */ + explicit inline NDArray(Container* data); + /** + * @brief copy constructor + * @param other The value to be copied + */ + inline NDArray(const NDArray& other); // NOLINT(*) + /** + * @brief move constructor + * @param other The value to be moved + */ + NDArray(NDArray&& other) // NOLINT(*) + : data_(other.data_) { + other.data_ = nullptr; + } + /** @brief destructor */ + ~NDArray() { this->reset(); } + /** + * @brief Swap this array with another NDArray + * @param other The other NDArray + */ + void swap(NDArray& other) { // NOLINT(*) + std::swap(data_, other.data_); + } + /** + * @brief copy assignmemt + * @param other The value to be assigned. + * @return reference to self. + */ + NDArray& operator=(const NDArray& other) { // NOLINT(*) + // copy-and-swap idiom + NDArray(other).swap(*this); // NOLINT(*) + return *this; + } + /** + * @brief move assignmemt + * @param other The value to be assigned. 
+ * @return reference to self. + */ + NDArray& operator=(NDArray&& other) { // NOLINT(*) + // copy-and-swap idiom + NDArray(std::move(other)).swap(*this); // NOLINT(*) + return *this; + } + /** @return If NDArray is defined */ + bool defined() const { return data_ != nullptr; } + /** @return If both NDArray reference the same container */ + bool same_as(const NDArray& other) const { return data_ == other.data_; } + /** @brief reset the content of NDArray to be nullptr */ + inline void reset(); + /** + * @return the reference counter + * @note this number is approximate in multi-threaded setting. + */ + inline int use_count() const; + /** @return Pointer to content of DGLArray */ + inline const DGLArray* operator->() const; + /** @return True if the ndarray is contiguous. */ + bool IsContiguous() const; + /** @return the data pointer with type. */ + template + inline T* Ptr() const { + if (!defined()) + return nullptr; + else + return static_cast(operator->()->data); + } + + /** + * @brief Copy data content from/into another array. + * @param other The source array to be copied from. + * @note The copy runs on the dgl internal stream if it involves a GPU + * context. + */ + inline void CopyFrom(DGLArray* other); + inline void CopyFrom(const NDArray& other); + inline void CopyTo(DGLArray* other) const; + inline void CopyTo(const NDArray& other) const; + + /** + * @brief Copy the data to another context. + * @param ctx The target context. + * @return The array under another context. + */ + inline NDArray CopyTo(const DGLContext& ctx) const; + + /** + * @brief Return a new array with a copy of the content. + */ + inline NDArray Clone() const; + + /** + * @brief Return a copy of the current instance of NDArray in pinned + * (page-locked) memory. + * @note This is an out-of-place method, which utilizes PyTorch's + * CachingHostAllocator for allocating pinned memory and copying data + * from the current NDAarray. As a result, PyTorch is responsible for + * managing the lifecycle of the returned NDArray, including deciding + * when to flush the data for reuse or call cudaFreeHost. The current + * context must be kDGLCPU, otherwise, an error will be thrown. + */ + inline NDArray PinMemory(); + + /** + * @brief In-place method to pin the current array by calling PinContainer + * on the underlying NDArray:Container. + * @note This is an in-place method that flags the memory as page-locked by + * utilizing cudaHostRegister at the underlying level to pin the current + * instance of NDArray. The current context must be kDGLCPU, otherwise, + * an error will be thrown. + */ + inline void PinMemory_(); + + /** + * @brief In-place method to unpin the current array by calling UnpinContainer + * on the underlying NDArray:Container. + * @note This is an in-place method. Behavior depends on the current context, + * IsPinned: will be unpinned; + * others: directly return. + */ + inline void UnpinMemory_(); + + /** + * @brief Check if the array is pinned. + */ + inline bool IsPinned() const; + + /** + * @brief Record streams that are using the underlying tensor. + * @param stream The stream that is using the underlying tensor. 
+ */ + inline void RecordStream(DGLStreamHandle stream) const; + + /** + * @brief Load NDArray from stream + * @param stream The input data stream + * @return Whether load is successful + */ + bool Load(dmlc::Stream* stream); + + /** + * @brief Save NDArray to stream + * @param stream The output data stream + */ + void Save(dmlc::Stream* stream) const; + + /** + * @brief Create a NDArray that shares the data memory with the current one. + * @param shape The shape of the new array. + * @param dtype The data type of the new array. + * @param offset The offset (in bytes) of the starting pointer. + * @note The memory size of new array must be smaller than the current one. + */ + DGL_DLL NDArray + CreateView(std::vector shape, DGLDataType dtype, int64_t offset = 0); + + /** + * @brief Create an empty NDArray. + * @param shape The shape of the new array. + * @param dtype The data type of the new array. + * @param ctx The context of the array. + * @return The created Array + */ + DGL_DLL static NDArray Empty( + std::vector shape, DGLDataType dtype, DGLContext ctx); + + /** + * @brief Create an empty NDArray in pinned memory. + * @param shape The shape of the new array. + * @param dtype The data type of the new array. + * @param ctx The context of the array. + * @return The created array. + */ + DGL_DLL static NDArray PinnedEmpty( + std::vector shape, DGLDataType dtype, DGLContext ctx); + + /** + * @brief Create an empty NDArray with shared memory. + * @param name The name of shared memory. + * @param shape The shape of the new array. + * @param dtype The data type of the new array. + * @param ctx The context of the array. + * @param is_create whether to create shared memory. + * @return The created Array + */ + DGL_DLL static NDArray EmptyShared( + const std::string& name, std::vector shape, DGLDataType dtype, + DGLContext ctx, bool is_create); + + /** + * @brief Get the size of the array in the number of bytes. + */ + size_t GetSize() const; + + /** + * @brief Get the number of elements in this array. + */ + int64_t NumElements() const; + + /** + * @brief Create a NDArray by copying from std::vector. + * @tparam T Type of vector data. Determines the dtype of returned array. + */ + template + DGL_DLL static NDArray FromVector( + const std::vector& vec, DGLContext ctx = DGLContext{kDGLCPU, 0}); + + /** + * @brief Create a NDArray from a raw pointer. + */ + DGL_DLL static NDArray CreateFromRaw( + const std::vector& shape, DGLDataType dtype, DGLContext ctx, + void* raw, bool auto_free); + + /** + * @brief Create a std::vector from a 1D NDArray. + * @tparam T Type of vector data. + * @note Type casting is NOT performed. The caller has to make sure that the + * vector type matches the dtype of NDArray. + */ + template + std::vector ToVector() const; + + std::shared_ptr GetSharedMem() const; + + /** + * @brief Function to copy data from one array to another. + * @param from The source array. + * @param to The target array. + * @param (optional) stream The stream used in copy. + */ + DGL_DLL static void CopyFromTo(DGLArray* from, DGLArray* to); + DGL_DLL static void CopyFromTo( + DGLArray* from, DGLArray* to, DGLStreamHandle stream); + + /** + * @brief Function to copy data between device and CPU while recording the + * event. + * @param from The source array. + * @param to The target array. + * @param pytorch_ctx The context pointer from PyTorch's CachingHostAllocator. + * @note This function fuses data-copy and event recording to ensure + * CachingHostAllocator works properly. 
+ */ + DGL_DLL static void RecordedCopyFromTo( + DGLArray* from, DGLArray* to, void* pytorch_ctx); + + /** + * @brief Function to pin the DGLArray of a Container. + * @param ptr The container to be pinned. + * @note Data of the given array will be pinned inplace. + * Behavior depends on the current context, + * kDGLCPU: will be pinned; + * IsPinned: directly return; + * kDGLCUDA: invalid, will throw an error. + */ + DGL_DLL static void PinContainer(Container* ptr); + + /** + * @brief Function to unpin the DGLArray of a Container. + * @param ptr The container to be unpinned. + * @note Data of the given array will be unpinned inplace. + * Behavior depends on the current context, + * IsPinned: will be unpinned; + * others: directly return. + */ + DGL_DLL static void UnpinContainer(Container* ptr); + + /** + * @brief Function check if the DGLArray of a Container is pinned. + * @param ptr The container to be checked. + * @return true if pinned. + */ + DGL_DLL static bool IsContainerPinned(Container* ptr); + + /** + * @brief Record streams that are using this tensor. + * @param ptr Pointer of the tensor to be recorded. + * @param stream The stream that is using this tensor. + */ + DGL_DLL static void RecordStream(DGLArray* tensor, DGLStreamHandle stream); + + // internal namespace + struct Internal { + // Default deleter for the container + static void DefaultDeleter(NDArray::Container* ptr); + // Local create function which allocates tensor metadata + // but does not allocate space for the data. + static NDArray Create( + std::vector shape, DGLDataType dtype, DGLContext ctx); + // Implementation of API function + static DGLArray* MoveAsDGLArray(NDArray arr); + }; + + private: + /** @brief Internal Data content */ + Container* data_{nullptr}; + // enable internal functions + friend struct Internal; + friend struct DLPackConvert; + friend class DGLRetValue; + friend class DGLArgsSetter; +}; + +/** + * @brief Save a DGLArray to stream + * @param strm The outpu stream + * @param tensor The tensor to be saved. + */ +inline bool SaveDGLArray(dmlc::Stream* strm, const DGLArray* tensor); + +/** + * @brief Reference counted Container object used to back NDArray. + * + * This object is DGLArray compatible: + * the pointer to the NDArrayContainer can be directly + * interpreted as a DGLArray* + * + * @note: do not use this function directly, use NDArray. + */ +struct NDArray::Container { + public: + /** NOTE: the first part of this structure is the same as + * DLManagedTensor, note that, however, the deleter + * is only called when the reference counter goes to 0 + */ + /** + * @brief Tensor structure. + * @note it is important that the first field is DGLArray + * So that this data structure is DGLArray compatible. + * The head ptr of this struct can be viewed as DGLArray*. + */ + DGLArray dl_tensor; + /** + * @brief addtional context, reserved for recycling + * @note We can attach additional content here + * which the current container depend on + * (e.g. reference to original memory when creating views). + */ + void* manager_ctx{nullptr}; + /** + * @brief Customized deleter + * + * @note The customized deleter is helpful to enable + * different ways of memory allocator that are not + * currently defined by the system. 
+ */ + void (*deleter)(Container* self) = nullptr; + /** @brief default constructor */ + Container() { + dl_tensor.data = nullptr; + dl_tensor.ndim = 0; + dl_tensor.shape = nullptr; + dl_tensor.strides = nullptr; + dl_tensor.byte_offset = 0; + } + /** @brief pointer to shared memory */ + std::shared_ptr mem; + /** @brief developer function, increases reference counter */ + void IncRef() { ref_counter_.fetch_add(1, std::memory_order_relaxed); } + /** @brief developer function, decrease reference counter */ + void DecRef() { + if (ref_counter_.fetch_sub(1, std::memory_order_release) == 1) { + std::atomic_thread_fence(std::memory_order_acquire); + if (this->deleter != nullptr) { + (*this->deleter)(this); + } + } + } + + private: + friend struct DLPackConvert; + friend class NDArray; + friend class RPCWrappedFunc; + /** + * @brief The shape container, + * can be used for shape data. + */ + std::vector shape_; + /** + * @brief The stride container, + * can be used for stride data. + */ + std::vector stride_; + /** @brief The internal array object */ + std::atomic ref_counter_{0}; + + /** @brief Whether underlying dl_tensor is pinned by DGL. */ + bool pinned_by_dgl_{false}; + + /** @brief Whether underlying dl_tensor is pinned by PyTorch + * (CachingHostAllocator). */ + bool pinned_by_pytorch_{false}; + + /** @brief The PyTorch storage ctx ptr if pinned_by_pytorch_ = True. */ + void* pytorch_ctx_{nullptr}; + + /** @brief Pointer to the corresp. PyTorch deleter if pinned_by_pytorch_ = + * True. + */ + void* pytorch_raw_deleter_{nullptr}; +}; + +// implementations of inline functions +// the usages of functions are documented in place. +inline NDArray::NDArray(Container* data) : data_(data) { + if (data_) data_->IncRef(); +} + +inline NDArray::NDArray(const NDArray& other) : data_(other.data_) { + if (data_) data_->IncRef(); +} + +inline void NDArray::reset() { + if (data_) { + data_->DecRef(); + data_ = nullptr; + } +} + +inline void NDArray::CopyFrom(DGLArray* other) { + CHECK(data_ != nullptr); + CopyFromTo(other, &(data_->dl_tensor)); +} + +inline void NDArray::CopyFrom(const NDArray& other) { + CHECK(other.data_ != nullptr); + // Copy between two devices + if (data_->dl_tensor.ctx.device_type != + other.data_->dl_tensor.ctx.device_type) { + CHECK(data_ != nullptr); + auto to_ctx_type = data_->dl_tensor.ctx.device_type; + auto cpu_data = (to_ctx_type == kDGLCPU ? data_ : other.data_); + // Pinned by PyTorch + if (cpu_data->pinned_by_pytorch_) { + // To ensure correct behavior, the event must be recorded after + // cudaMemcpyAsync as long as the memory is pinned by PyTorch. + void* pytorch_ctx = cpu_data->pytorch_ctx_; + RecordedCopyFromTo( + &(other.data_->dl_tensor), &(data_->dl_tensor), pytorch_ctx); + return; + } + } + CopyFrom(&(other.data_->dl_tensor)); +} + +inline void NDArray::CopyTo(DGLArray* other) const { + CHECK(data_ != nullptr); + CopyFromTo(&(data_->dl_tensor), other); +} + +inline void NDArray::CopyTo(const NDArray& other) const { + CHECK(other.data_ != nullptr); + // copy between two devices + if (data_->dl_tensor.ctx.device_type != + other.data_->dl_tensor.ctx.device_type) { + CHECK(data_ != nullptr); + auto from_ctx_type = data_->dl_tensor.ctx.device_type; + auto cpu_data = (from_ctx_type == kDGLCPU ? data_ : other.data_); + // pinned by PyTorch + if (cpu_data->pinned_by_pytorch_) { + // To ensure correct behavior, the event must be recorded after + // cudaMemcpyAsync as long as the memory is pinned by PyTorch. 
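+      // `cpu_data` is the host-side container (the device types differ, so
+      // exactly one side is a kDGLCPU array), and only host memory carries the
+      // CachingHostAllocator bookkeeping checked above.  Its storage context is
+      // handed to RecordedCopyFromTo so the adapter can record the event
+      // against that allocation.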
+ void* pytorch_ctx = cpu_data->pytorch_ctx_; + RecordedCopyFromTo( + &(data_->dl_tensor), &(other.data_->dl_tensor), pytorch_ctx); + return; + } + } + CopyTo(&(other.data_->dl_tensor)); +} + +inline NDArray NDArray::CopyTo(const DGLContext& ctx) const { + CHECK(data_ != nullptr); + const DGLArray* array = operator->(); + NDArray ret = Empty( + std::vector(array->shape, array->shape + array->ndim), + array->dtype, ctx); + this->CopyTo(ret); + return ret; +} + +inline NDArray NDArray::Clone() const { + CHECK(data_ != nullptr); + const DGLArray* array = operator->(); + return this->CopyTo(array->ctx); +} + +inline NDArray NDArray::PinMemory() { + CHECK(data_ != nullptr); + const DGLArray* array = operator->(); + auto ctx = array->ctx; + NDArray ret = PinnedEmpty( + std::vector(array->shape, array->shape + array->ndim), + array->dtype, ctx); + this->CopyTo(ret); + return ret; +} + +inline void NDArray::PinMemory_() { + CHECK(data_ != nullptr); + PinContainer(data_); +} + +inline void NDArray::UnpinMemory_() { + CHECK(data_ != nullptr); + UnpinContainer(data_); +} + +inline bool NDArray::IsPinned() const { + CHECK(data_ != nullptr); + return IsContainerPinned(data_); +} + +inline void NDArray::RecordStream(DGLStreamHandle stream) const { + CHECK(data_ != nullptr); + RecordStream(&(data_->dl_tensor), stream); +} + +inline int NDArray::use_count() const { + if (data_ == nullptr) return 0; + return data_->ref_counter_.load(std::memory_order_relaxed); +} + +inline const DGLArray* NDArray::operator->() const { + return &(data_->dl_tensor); +} + +/** @brief Magic number for NDArray file */ +constexpr uint64_t kDGLNDArrayMagic = 0xDD5E40F096B4A13F; + +inline bool SaveDGLArray(dmlc::Stream* strm, DGLArray* tensor) { + uint64_t header = kDGLNDArrayMagic, reserved = 0; + strm->Write(header); + strm->Write(reserved); + // Always save data as CPU context + // + // Parameters that get serialized should be in CPU by default. + // So even the array's context is GPU, it will be stored as CPU array. + // This is used to prevent case when another user loads the parameters + // back on machine that do not have GPU or related context. + // + // We can always do array.CopyTo(target_ctx) to get a corresponding + // array in the target context. + DGLContext cpu_ctx; + cpu_ctx.device_type = kDGLCPU; + cpu_ctx.device_id = 0; + strm->Write(cpu_ctx); + strm->Write(tensor->ndim); + strm->Write(tensor->dtype); + int ndim = tensor->ndim; + strm->WriteArray(tensor->shape, ndim); + int type_bytes = tensor->dtype.bits / 8; + int64_t num_elems = 1; + for (int i = 0; i < ndim; ++i) { + num_elems *= tensor->shape[i]; + } + int64_t data_byte_size = type_bytes * num_elems; + strm->Write(data_byte_size); + + if (DMLC_IO_NO_ENDIAN_SWAP && tensor->ctx.device_type == kDGLCPU && + tensor->strides == nullptr && tensor->byte_offset == 0) { + // quick path + strm->Write(tensor->data, data_byte_size); + } else { + std::vector bytes(data_byte_size); + CHECK_EQ( + DGLArrayCopyToBytes(tensor, dmlc::BeginPtr(bytes), data_byte_size), 0) + << DGLGetLastError(); + if (!DMLC_IO_NO_ENDIAN_SWAP) { + dmlc::ByteSwap(dmlc::BeginPtr(bytes), type_bytes, num_elems); + } + strm->Write(dmlc::BeginPtr(bytes), data_byte_size); + } + return true; +} + +/** + * @brief Convert type code to its name + * @param type_code The type code . + * @return The name of type code. 
+ */ +inline const char* TypeCode2Str(int type_code) { + switch (type_code) { + case kDGLInt: + return "int"; + case kDGLUInt: + return "uint"; + case kDGLFloat: + return "float"; + case kStr: + return "str"; + case kBytes: + return "bytes"; + case kHandle: + return "handle"; + case kNull: + return "NULL"; + case kObjectHandle: + return "ObjectHandle"; + case kArrayHandle: + return "ArrayHandle"; + case kDGLDataType: + return "DGLDataType"; + case kDGLContext: + return "DGLContext"; + case kFuncHandle: + return "FunctionHandle"; + case kModuleHandle: + return "ModuleHandle"; + case kNDArrayContainer: + return "NDArrayContainer"; + default: + LOG(FATAL) << "unknown type_code=" << static_cast(type_code); + return ""; + } +} + +/** + * @brief Convert device type code to its name + * @param device_type The device type code. + * @return The name of the device. + */ +inline const char* DeviceTypeCode2Str(DGLDeviceType device_type) { + switch (device_type) { + case kDGLCPU: + return "cpu"; + case kDGLCUDA: + return "cuda"; + default: + LOG(FATAL) << "Unsupported device type code=" + << static_cast(device_type); + return ""; + } +} + +/** + * @brief convert a string to DGL type. + * @param s The string to be converted. + * @return The corresponding dgl type. + */ +inline DGLDataType String2DGLDataType(std::string s) { + DGLDataType t; + t.bits = 32; + t.lanes = 1; + const char* scan; + if (s.substr(0, 3) == "int") { + t.code = kDGLInt; + scan = s.c_str() + 3; + } else if (s.substr(0, 4) == "uint") { + t.code = kDGLUInt; + scan = s.c_str() + 4; + } else if (s.substr(0, 5) == "float") { + t.code = kDGLFloat; + scan = s.c_str() + 5; + } else if (s.substr(0, 6) == "handle") { + t.code = kHandle; + t.bits = 64; // handle uses 64 bit by default. + scan = s.c_str() + 6; + } else { + scan = s.c_str(); + LOG(FATAL) << "unknown type " << s; + } + char* xdelim; // emulate sscanf("%ux%u", bits, lanes) + uint8_t bits = static_cast(strtoul(scan, &xdelim, 10)); + if (bits != 0) t.bits = bits; + if (*xdelim == 'x') { + t.lanes = static_cast(strtoul(xdelim + 1, nullptr, 10)); + } + return t; +} + +/** + * @brief convert a DGL type to string. + * @param t The type to be converted. + * @return The corresponding dgl type in string. + */ +inline std::string DGLDataType2String(DGLDataType t) { +#ifndef _LIBCPP_SGX_NO_IOSTREAMS + std::ostringstream os; + os << t; + return os.str(); +#else + std::string repr = ""; + repr += TypeCode2Str(t.code); + if (t.code == kHandle) return repr; + repr += std::to_string(static_cast(t.bits)); + if (t.lanes != 1) { + repr += "x" + std::to_string(static_cast(t.lanes)); + } + return repr; +#endif +} + +// macro to check type code. 
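+// Editorial example (not part of the original header): the macro below appends
+// a readable message to the CHECK_EQ failure via TypeCode2Str(), e.g.
+//
+//   DGL_CHECK_TYPE_CODE(kDGLFloat, kDGLInt);  // fails: " expected int but get float"
+//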
+#define DGL_CHECK_TYPE_CODE(CODE, T) \ + CHECK_EQ(CODE, T) << " expected " << TypeCode2Str(T) << " but get " \ + << TypeCode2Str(CODE) + +} // namespace runtime +} // namespace dgl + +namespace dmlc { +DMLC_DECLARE_TRAITS(has_saveload, dgl::runtime::NDArray, true); +} // namespace dmlc + +///////////////// Operator overloading for NDArray ///////////////// +dgl::runtime::NDArray operator+( + const dgl::runtime::NDArray& a1, const dgl::runtime::NDArray& a2); +dgl::runtime::NDArray operator-( + const dgl::runtime::NDArray& a1, const dgl::runtime::NDArray& a2); +dgl::runtime::NDArray operator*( + const dgl::runtime::NDArray& a1, const dgl::runtime::NDArray& a2); +dgl::runtime::NDArray operator/( + const dgl::runtime::NDArray& a1, const dgl::runtime::NDArray& a2); +dgl::runtime::NDArray operator%( + const dgl::runtime::NDArray& a1, const dgl::runtime::NDArray& a2); +dgl::runtime::NDArray operator+(const dgl::runtime::NDArray& a1, int64_t rhs); +dgl::runtime::NDArray operator-(const dgl::runtime::NDArray& a1, int64_t rhs); +dgl::runtime::NDArray operator*(const dgl::runtime::NDArray& a1, int64_t rhs); +dgl::runtime::NDArray operator/(const dgl::runtime::NDArray& a1, int64_t rhs); +dgl::runtime::NDArray operator%(const dgl::runtime::NDArray& a1, int64_t rhs); +dgl::runtime::NDArray operator+(int64_t lhs, const dgl::runtime::NDArray& a2); +dgl::runtime::NDArray operator-(int64_t lhs, const dgl::runtime::NDArray& a2); +dgl::runtime::NDArray operator*(int64_t lhs, const dgl::runtime::NDArray& a2); +dgl::runtime::NDArray operator/(int64_t lhs, const dgl::runtime::NDArray& a2); +dgl::runtime::NDArray operator%(int64_t lhs, const dgl::runtime::NDArray& a2); +dgl::runtime::NDArray operator-(const dgl::runtime::NDArray& array); + +dgl::runtime::NDArray operator>( + const dgl::runtime::NDArray& a1, const dgl::runtime::NDArray& a2); +dgl::runtime::NDArray operator<( + const dgl::runtime::NDArray& a1, const dgl::runtime::NDArray& a2); +dgl::runtime::NDArray operator>=( + const dgl::runtime::NDArray& a1, const dgl::runtime::NDArray& a2); +dgl::runtime::NDArray operator<=( + const dgl::runtime::NDArray& a1, const dgl::runtime::NDArray& a2); +dgl::runtime::NDArray operator==( + const dgl::runtime::NDArray& a1, const dgl::runtime::NDArray& a2); +dgl::runtime::NDArray operator!=( + const dgl::runtime::NDArray& a1, const dgl::runtime::NDArray& a2); +dgl::runtime::NDArray operator>(const dgl::runtime::NDArray& a1, int64_t rhs); +dgl::runtime::NDArray operator<(const dgl::runtime::NDArray& a1, int64_t rhs); +dgl::runtime::NDArray operator>=(const dgl::runtime::NDArray& a1, int64_t rhs); +dgl::runtime::NDArray operator<=(const dgl::runtime::NDArray& a1, int64_t rhs); +dgl::runtime::NDArray operator==(const dgl::runtime::NDArray& a1, int64_t rhs); +dgl::runtime::NDArray operator!=(const dgl::runtime::NDArray& a1, int64_t rhs); +dgl::runtime::NDArray operator>(int64_t lhs, const dgl::runtime::NDArray& a2); +dgl::runtime::NDArray operator<(int64_t lhs, const dgl::runtime::NDArray& a2); +dgl::runtime::NDArray operator>=(int64_t lhs, const dgl::runtime::NDArray& a2); +dgl::runtime::NDArray operator<=(int64_t lhs, const dgl::runtime::NDArray& a2); +dgl::runtime::NDArray operator==(int64_t lhs, const dgl::runtime::NDArray& a2); +dgl::runtime::NDArray operator!=(int64_t lhs, const dgl::runtime::NDArray& a2); + +std::ostream& operator<<(std::ostream& os, dgl::runtime::NDArray array); + +///////////////// Operator overloading for DGLDataType ///////////////// + +/** @brief Check whether two data types are the same.*/ 
+inline bool operator==(const DGLDataType& ty1, const DGLDataType& ty2) { + return ty1.code == ty2.code && ty1.bits == ty2.bits && ty1.lanes == ty2.lanes; +} + +/** @brief Check whether two data types are different.*/ +inline bool operator!=(const DGLDataType& ty1, const DGLDataType& ty2) { + return !(ty1 == ty2); +} + +#ifndef _LIBCPP_SGX_NO_IOSTREAMS +inline std::ostream& operator<<(std::ostream& os, DGLDataType t) { + os << dgl::runtime::TypeCode2Str(t.code); + if (t.code == kHandle) return os; + os << static_cast(t.bits); + if (t.lanes != 1) { + os << 'x' << static_cast(t.lanes); + } + return os; +} +#endif + +///////////////// Operator overloading for DGLContext ///////////////// + +/** @brief Check whether two device contexts are the same.*/ +inline bool operator==(const DGLContext& ctx1, const DGLContext& ctx2) { + return ctx1.device_type == ctx2.device_type && + ctx1.device_id == ctx2.device_id; +} + +/** @brief Check whether two device contexts are different.*/ +inline bool operator!=(const DGLContext& ctx1, const DGLContext& ctx2) { + return !(ctx1 == ctx2); +} + +#ifndef _LIBCPP_SGX_NO_IOSTREAMS +inline std::ostream& operator<<(std::ostream& os, const DGLContext& ctx) { + return os << dgl::runtime::DeviceTypeCode2Str(ctx.device_type) << ":" + << ctx.device_id; +} +#endif + +#endif // DGL_RUNTIME_NDARRAY_H_ diff --git a/include/dgl/runtime/tensordispatch.h b/include/dgl/runtime/tensordispatch.h index 872c1d68e582..6dd5748bb01e 100644 --- a/include/dgl/runtime/tensordispatch.h +++ b/include/dgl/runtime/tensordispatch.h @@ -33,9 +33,9 @@ #if defined(WIN32) || defined(_WIN32) #include #endif // WIN32 -#ifdef DGL_USE_CUDA -#include -#endif // DGL_USE_CUDA +#ifdef DGL_USE_ROCM +#include +#endif // DGL_USE_ROCM #include "ndarray.h" /** @@ -90,21 +90,21 @@ class TensorDispatcher { FUNCCAST(tensoradapter::CPURawDelete, entry)(ptr); } -#ifdef DGL_USE_CUDA +#ifdef DGL_USE_ROCM /** * @brief Allocate a piece of GPU memory via * PyTorch's THCCachingAllocator. * Used in CUDADeviceAPI::AllocWorkspace(). * * @note THCCachingAllocator specify the device to allocate on - * via cudaGetDevice(). Make sure to call cudaSetDevice() + * via hipGetDevice(). Make sure to call hipSetDevice() * before invoking this function. * * @param nbytes The size to be allocated. * @param stream The stream to be allocated on. * @return Pointer to the allocated memory. */ - inline void* CUDAAllocWorkspace(size_t nbytes, cudaStream_t stream) { + inline void* CUDAAllocWorkspace(size_t nbytes, hipStream_t stream) { auto entry = entrypoints_[Op::kCUDARawAlloc]; return FUNCCAST(tensoradapter::CUDARawAlloc, entry)(nbytes, stream); } @@ -125,12 +125,12 @@ class TensorDispatcher { * Used in runtime::getCurrentCUDAStream(). * * @note PyTorch pre-allocates/sets the current CUDA stream - * on current device via cudaGetDevice(). Make sure to call cudaSetDevice() + * on current device via hipGetDevice(). Make sure to call hipSetDevice() * before invoking this function. * - * @return cudaStream_t stream handle + * @return hipStream_t stream handle */ - inline cudaStream_t CUDAGetCurrentStream() { + inline hipStream_t CUDAGetCurrentStream() { auto entry = entrypoints_[Op::kCUDACurrentStream]; return FUNCCAST(tensoradapter::CUDACurrentStream, entry)(); } @@ -183,7 +183,7 @@ class TensorDispatcher { * @param device_id Device of the tensor. 
*/ inline void CUDARecordHostAlloc( - void* data, void* ctx, cudaStream_t stream, int device_id) { + void* data, void* ctx, hipStream_t stream, int device_id) { auto entry = entrypoints_[Op::kCUDARecordHostAlloc]; auto recorded_alloc = FUNCCAST(tensoradapter::CUDARecordHostAlloc, entry); recorded_alloc(data, ctx, stream, device_id); @@ -198,7 +198,7 @@ class TensorDispatcher { auto entry = entrypoints_[Op::kCUDAHostAllocatorEmptyCache]; FUNCCAST(tensoradapter::CUDAHostAllocatorEmptyCache, entry)(); } -#endif // DGL_USE_CUDA +#endif // DGL_USE_ROCM /** * @brief Record streams that are using this tensor. @@ -209,10 +209,10 @@ class TensorDispatcher { * @param device_id Device of the tensor. */ inline void RecordStream(void* ptr, DGLStreamHandle stream, int device_id) { -#ifdef DGL_USE_CUDA +#ifdef DGL_USE_ROCM auto entry = entrypoints_[Op::kRecordStream]; FUNCCAST(tensoradapter::RecordStream, entry) - (ptr, static_cast(stream), device_id); + (ptr, static_cast(stream), device_id); #endif } @@ -229,12 +229,12 @@ class TensorDispatcher { */ static constexpr const char* names_[] = { "CPURawAlloc", "CPURawDelete", -#ifdef DGL_USE_CUDA +#ifdef DGL_USE_ROCM "CUDARawAlloc", "CUDARawDelete", "CUDACurrentStream", "RecordStream", "CUDARawHostAlloc", "CUDARawHostDelete", "CUDARecordHostAlloc", "CUDAHostAllocatorEmptyCache", -#endif // DGL_USE_CUDA +#endif // DGL_USE_ROCM }; /** @brief Index of each function to the symbol list */ @@ -242,7 +242,7 @@ class TensorDispatcher { public: static constexpr int kCPURawAlloc = 0; static constexpr int kCPURawDelete = 1; -#ifdef DGL_USE_CUDA +#ifdef DGL_USE_ROCM static constexpr int kCUDARawAlloc = 2; static constexpr int kCUDARawDelete = 3; static constexpr int kCUDACurrentStream = 4; @@ -251,7 +251,7 @@ class TensorDispatcher { static constexpr int kCUDARawHostDelete = 7; static constexpr int kCUDARecordHostAlloc = 8; static constexpr int kCUDAHostAllocatorEmptyCache = 9; -#endif // DGL_USE_CUDA +#endif // DGL_USE_ROCM }; /** @brief Number of functions */ @@ -260,9 +260,9 @@ class TensorDispatcher { /** @brief Entrypoints of each function */ void* entrypoints_[num_entries_] = { nullptr, nullptr, -#ifdef DGL_USE_CUDA +#ifdef DGL_USE_ROCM nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, -#endif // DGL_USE_CUDA +#endif // DGL_USE_ROCM }; bool available_ = false; diff --git a/include/dgl/runtime/tensordispatch.h.prehip b/include/dgl/runtime/tensordispatch.h.prehip new file mode 100644 index 000000000000..872c1d68e582 --- /dev/null +++ b/include/dgl/runtime/tensordispatch.h.prehip @@ -0,0 +1,281 @@ +/** + * Copyright (c) 2020-2022 by Contributors + * @file array/tensordispatch.h + * @brief This file defines the dispatcher of tensor operators to + * framework-specific implementations. + * + * The dispatcher consists of a TensorDispatcher singleton in DGL C library and + * one separately-built shared library per supported backend. + * + * Those shared libraries contain wrappers of the framework-specific operators. + * The wrappers are defined with extern "C", meaning that the C++ compiler will + * not do name mangling for those functions so that DGL can conveniently locate + * them using dlsym(3) (or GetProcAddress in Windows). + * + * The TensorDispatcher singleton maintains a mapping from an array operator to + * the address of the corresponding symbol in the shared library. During + * initialization, the TensorDispatcher checks which backend DGL is using. 
+ * It then locates and opens the corresponding shared library using dlopen(3) + * (or LoadLibrary in Windows), and populates the said mapping above with + * dlsym(3) (or GetProcAddress in Windows). + * + * A tensor operator in TensorDispatcher first checks whether the corresponding + * symbol address is found in the mapping. If so, it calls the function located + * at the symbol address instead, allocate/free pieces of memory on CPU/GPU. If + * not, it falls back to DeviceAPI::AllocWorkspace/FreeWorkspace. + */ + +#ifndef DGL_RUNTIME_TENSORDISPATCH_H_ +#define DGL_RUNTIME_TENSORDISPATCH_H_ + +#include +#include +#if defined(WIN32) || defined(_WIN32) +#include +#endif // WIN32 +#ifdef DGL_USE_CUDA +#include +#endif // DGL_USE_CUDA +#include "ndarray.h" + +/** + * @brief Casts a pointer \c entry to a function pointer with signature of \c + * func. + */ +#define FUNCCAST(func, entry) (*reinterpret_cast(entry)) + +namespace dgl { +namespace runtime { + +/** + * @brief Dispatcher that delegates the function calls to framework-specific C++ + * APIs. + * + * This class is not thread-safe. + */ +class TensorDispatcher { + public: + /** @brief Get the singleton instance. */ + static TensorDispatcher* Global() { + static TensorDispatcher inst; + return &inst; + } + + /** @brief Whether an adapter library is available. */ + inline bool IsAvailable() { return available_; } + + /** @brief Load symbols from the given tensor adapter library path. */ + bool Load(const char* path_cstr); + + /** + * @brief Allocate a piece of CPU memory via PyTorch's CPUAllocator. + * Used in CPUDeviceAPI::AllocWorkspace(). + * + * @param nbytes The size to be allocated. + * @return Pointer to the allocated memory. + */ + inline void* CPUAllocWorkspace(size_t nbytes) { + auto entry = entrypoints_[Op::kCPURawAlloc]; + return FUNCCAST(tensoradapter::CPURawAlloc, entry)(nbytes); + } + + /** + * @brief Free the CPU memory. + * Used in CPUDeviceAPI::FreeWorkspace(). + * + * @param ptr Pointer to the memory to be freed. + */ + inline void CPUFreeWorkspace(void* ptr) { + auto entry = entrypoints_[Op::kCPURawDelete]; + FUNCCAST(tensoradapter::CPURawDelete, entry)(ptr); + } + +#ifdef DGL_USE_CUDA + /** + * @brief Allocate a piece of GPU memory via + * PyTorch's THCCachingAllocator. + * Used in CUDADeviceAPI::AllocWorkspace(). + * + * @note THCCachingAllocator specify the device to allocate on + * via cudaGetDevice(). Make sure to call cudaSetDevice() + * before invoking this function. + * + * @param nbytes The size to be allocated. + * @param stream The stream to be allocated on. + * @return Pointer to the allocated memory. + */ + inline void* CUDAAllocWorkspace(size_t nbytes, cudaStream_t stream) { + auto entry = entrypoints_[Op::kCUDARawAlloc]; + return FUNCCAST(tensoradapter::CUDARawAlloc, entry)(nbytes, stream); + } + + /** + * @brief Free the GPU memory. + * Used in CUDADeviceAPI::FreeWorkspace(). + * + * @param ptr Pointer to the memory to be freed. + */ + inline void CUDAFreeWorkspace(void* ptr) { + auto entry = entrypoints_[Op::kCUDARawDelete]; + FUNCCAST(tensoradapter::CUDARawDelete, entry)(ptr); + } + + /** + * @brief Find the current PyTorch CUDA stream + * Used in runtime::getCurrentCUDAStream(). + * + * @note PyTorch pre-allocates/sets the current CUDA stream + * on current device via cudaGetDevice(). Make sure to call cudaSetDevice() + * before invoking this function. 
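+ *
+ * A minimal calling sketch (editorial illustration; `dev_id` is a placeholder
+ * for the device you intend to use):
+ *
+ *   cudaSetDevice(dev_id);
+ *   cudaStream_t s = TensorDispatcher::Global()->CUDAGetCurrentStream();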
+ * + * @return cudaStream_t stream handle + */ + inline cudaStream_t CUDAGetCurrentStream() { + auto entry = entrypoints_[Op::kCUDACurrentStream]; + return FUNCCAST(tensoradapter::CUDACurrentStream, entry)(); + } + + /** + * @brief Allocate a piece of pinned CPU memory via PyTorch + * CachingHostAllocator. + * @note Used in CUDADeviceAPI::AllocPinnedDataSpace(). + * @param nbytes The size to be allocated. + * @param ctx Pointer to the PyTorch storage ctx ptr returned from the + * allocator. + * @param deleter Pointer to the delete function ptr returned from the + * allocator. + * @return Raw pointer to the allocated memory. + */ + inline void* CUDAAllocHostWorkspace( + size_t nbytes, void** ctx, void** deleter) { + auto entry = entrypoints_[Op::kCUDARawHostAlloc]; + + auto alloc_func = FUNCCAST(tensoradapter::CUDARawHostAlloc, entry); + return alloc_func(nbytes, ctx, deleter); + } + + /** + * @brief Insert the pinned memory block (allocated via PyTorch + * CachingHostAllocator) back to the free list for future usage.(ref: + * pytorch/pytorch/blob/master/aten/src/ATen/cuda/CachingHostAllocator.cpp). + * @note Used in CUDADeviceAPI::FreePinnedDataSpace(). + * @param deleter Pointer to the delete function ptr returned from the + * allocator. + */ + inline void CUDAFreeHostWorkspace(void** deleter) { + auto entry = entrypoints_[Op::kCUDARawHostDelete]; + FUNCCAST(tensoradapter::CUDARawHostDelete, entry)(deleter); + } + + /** + * @brief Invoke the record_event function call from PyTorch + * CachingHostAllocator. + * @note This function assoicates a CUDA stream (used by a copy kernel) to the + * pinned data. In the free path of this data, which is achieved by + * calling CUDAFreeHostWorkspace, the set of associated streams is then + * consumed to ensure proper functionlity. (ref: + * pytorch/pytorch/blob/master/aten/src/ATen/cuda/CachingHostAllocator.cpp). + * Used in CUDADeviceAPI::RecordedCopyDataFromTo(). + * + * @param data Pointer of the tensor to be recorded. + * @param ctx PyTorch storage ctx ptr returned from the allocator. + * @param stream The stream that currently consumes this tensor. + * @param device_id Device of the tensor. + */ + inline void CUDARecordHostAlloc( + void* data, void* ctx, cudaStream_t stream, int device_id) { + auto entry = entrypoints_[Op::kCUDARecordHostAlloc]; + auto recorded_alloc = FUNCCAST(tensoradapter::CUDARecordHostAlloc, entry); + recorded_alloc(data, ctx, stream, device_id); + } + + /** + * @brief Release cached pinned memory allocations via cudaHostFree. + * @note Used in CUDADeviceAPI::PinData() before pinning any host memory by + * DGL. + */ + inline void CUDAHostAllocatorEmptyCache() { + auto entry = entrypoints_[Op::kCUDAHostAllocatorEmptyCache]; + FUNCCAST(tensoradapter::CUDAHostAllocatorEmptyCache, entry)(); + } +#endif // DGL_USE_CUDA + + /** + * @brief Record streams that are using this tensor. + * Used in NDArray::RecordStream(). + * + * @param ptr Pointer of the tensor to be recorded. + * @param stream The stream that is using this tensor. + * @param device_id Device of the tensor. + */ + inline void RecordStream(void* ptr, DGLStreamHandle stream, int device_id) { +#ifdef DGL_USE_CUDA + auto entry = entrypoints_[Op::kRecordStream]; + FUNCCAST(tensoradapter::RecordStream, entry) + (ptr, static_cast(stream), device_id); +#endif + } + + private: + /** @brief ctor */ + TensorDispatcher() = default; + /** @brief dtor */ + ~TensorDispatcher(); + + /** + * @brief List of symbols in the adapter library. 
+ * + * Must match the functions in tensoradapter/include/tensoradapter.h. + */ + static constexpr const char* names_[] = { + "CPURawAlloc", "CPURawDelete", +#ifdef DGL_USE_CUDA + "CUDARawAlloc", "CUDARawDelete", + "CUDACurrentStream", "RecordStream", + "CUDARawHostAlloc", "CUDARawHostDelete", + "CUDARecordHostAlloc", "CUDAHostAllocatorEmptyCache", +#endif // DGL_USE_CUDA + }; + + /** @brief Index of each function to the symbol list */ + class Op { + public: + static constexpr int kCPURawAlloc = 0; + static constexpr int kCPURawDelete = 1; +#ifdef DGL_USE_CUDA + static constexpr int kCUDARawAlloc = 2; + static constexpr int kCUDARawDelete = 3; + static constexpr int kCUDACurrentStream = 4; + static constexpr int kRecordStream = 5; + static constexpr int kCUDARawHostAlloc = 6; + static constexpr int kCUDARawHostDelete = 7; + static constexpr int kCUDARecordHostAlloc = 8; + static constexpr int kCUDAHostAllocatorEmptyCache = 9; +#endif // DGL_USE_CUDA + }; + + /** @brief Number of functions */ + static constexpr int num_entries_ = sizeof(names_) / sizeof(names_[0]); + + /** @brief Entrypoints of each function */ + void* entrypoints_[num_entries_] = { + nullptr, nullptr, +#ifdef DGL_USE_CUDA + nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, +#endif // DGL_USE_CUDA + }; + + bool available_ = false; +#if defined(WIN32) || defined(_WIN32) + HINSTANCE handle_; +#else // !WIN32 + void* handle_; +#endif // WIN32 +}; + +}; // namespace runtime +}; // namespace dgl + +#undef FUNCCAST + +#endif // DGL_RUNTIME_TENSORDISPATCH_H_ diff --git a/src/array/arith.h b/src/array/arith.h index 9526c694eb74..808f5f458168 100644 --- a/src/array/arith.h +++ b/src/array/arith.h @@ -6,13 +6,13 @@ #ifndef DGL_ARRAY_ARITH_H_ #define DGL_ARRAY_ARITH_H_ -#ifdef __CUDACC__ +#ifdef __HIPCC__ #define DGLDEVICE __device__ #define DGLINLINE __forceinline__ #else #define DGLDEVICE #define DGLINLINE inline -#endif // __CUDACC__ +#endif // __HIPCC__ namespace dgl { namespace aten { diff --git a/src/array/arith.h.prehip b/src/array/arith.h.prehip new file mode 100644 index 000000000000..9526c694eb74 --- /dev/null +++ b/src/array/arith.h.prehip @@ -0,0 +1,109 @@ +/** + * Copyright (c) 2019 by Contributors + * @file array/arith.h + * @brief Arithmetic functors + */ +#ifndef DGL_ARRAY_ARITH_H_ +#define DGL_ARRAY_ARITH_H_ + +#ifdef __CUDACC__ +#define DGLDEVICE __device__ +#define DGLINLINE __forceinline__ +#else +#define DGLDEVICE +#define DGLINLINE inline +#endif // __CUDACC__ + +namespace dgl { +namespace aten { +namespace arith { + +struct Add { + template + static DGLINLINE DGLDEVICE T Call(const T& t1, const T& t2) { + return t1 + t2; + } +}; + +struct Sub { + template + static DGLINLINE DGLDEVICE T Call(const T& t1, const T& t2) { + return t1 - t2; + } +}; + +struct Mul { + template + static DGLINLINE DGLDEVICE T Call(const T& t1, const T& t2) { + return t1 * t2; + } +}; + +struct Div { + template + static DGLINLINE DGLDEVICE T Call(const T& t1, const T& t2) { + return t1 / t2; + } +}; + +struct Mod { + template + static DGLINLINE DGLDEVICE T Call(const T& t1, const T& t2) { + return t1 % t2; + } +}; + +struct GT { + template + static DGLINLINE DGLDEVICE bool Call(const T& t1, const T& t2) { + return t1 > t2; + } +}; + +struct LT { + template + static DGLINLINE DGLDEVICE bool Call(const T& t1, const T& t2) { + return t1 < t2; + } +}; + +struct GE { + template + static DGLINLINE DGLDEVICE bool Call(const T& t1, const T& t2) { + return t1 >= t2; + } +}; + +struct LE { + template + static DGLINLINE 
DGLDEVICE bool Call(const T& t1, const T& t2) { + return t1 <= t2; + } +}; + +struct EQ { + template + static DGLINLINE DGLDEVICE bool Call(const T& t1, const T& t2) { + return t1 == t2; + } +}; + +struct NE { + template + static DGLINLINE DGLDEVICE bool Call(const T& t1, const T& t2) { + return t1 != t2; + } +}; + +struct Neg { + template + static DGLINLINE DGLDEVICE T Call(const T& t1) { + return -t1; + } +}; + +} // namespace arith +} // namespace aten +} // namespace dgl + +#endif // DGL_ARRAY_ARITH_H_ diff --git a/src/array/cuda/array_cumsum.cu b/src/array/cuda/array_cumsum.cu index 5d5ef1603c46..c372c75abf4a 100644 --- a/src/array/cuda/array_cumsum.cu +++ b/src/array/cuda/array_cumsum.cu @@ -5,7 +5,7 @@ */ #include -#include +#include #include "../../runtime/cuda/cuda_common.h" #include "./utils.h" @@ -23,7 +23,7 @@ IdArray CumSum(IdArray array, bool prepend_zero) { : aten::Full(0, 1, array->dtype.bits, array->ctx); auto device = runtime::DeviceAPI::Get(array->ctx); - cudaStream_t stream = runtime::getCurrentCUDAStream(); + hipStream_t stream = runtime::getCurrentCUDAStream(); const IdType* in_d = array.Ptr(); IdArray ret; IdType* out_d = nullptr; @@ -36,12 +36,12 @@ IdArray CumSum(IdArray array, bool prepend_zero) { } // Allocate workspace size_t workspace_size = 0; - CUDA_CALL(cub::DeviceScan::InclusiveSum( + CUDA_CALL(hipcub::DeviceScan::InclusiveSum( nullptr, workspace_size, in_d, out_d, len, stream)); void* workspace = device->AllocWorkspace(array->ctx, workspace_size); // Compute cumsum - CUDA_CALL(cub::DeviceScan::InclusiveSum( + CUDA_CALL(hipcub::DeviceScan::InclusiveSum( workspace, workspace_size, in_d, out_d, len, stream)); device->FreeWorkspace(array->ctx, workspace); diff --git a/src/array/cuda/array_cumsum.cu.prehip b/src/array/cuda/array_cumsum.cu.prehip new file mode 100644 index 000000000000..5d5ef1603c46 --- /dev/null +++ b/src/array/cuda/array_cumsum.cu.prehip @@ -0,0 +1,57 @@ +/** + * Copyright (c) 2020 by Contributors + * @file array/cpu/array_cumsum.cu + * @brief Array cumsum GPU implementation + */ +#include + +#include + +#include "../../runtime/cuda/cuda_common.h" +#include "./utils.h" + +namespace dgl { +using runtime::NDArray; +namespace aten { +namespace impl { + +template +IdArray CumSum(IdArray array, bool prepend_zero) { + const int64_t len = array.NumElements(); + if (len == 0) + return !prepend_zero ? 
array + : aten::Full(0, 1, array->dtype.bits, array->ctx); + + auto device = runtime::DeviceAPI::Get(array->ctx); + cudaStream_t stream = runtime::getCurrentCUDAStream(); + const IdType* in_d = array.Ptr(); + IdArray ret; + IdType* out_d = nullptr; + if (prepend_zero) { + ret = aten::Full(0, len + 1, array->dtype.bits, array->ctx); + out_d = ret.Ptr() + 1; + } else { + ret = aten::NewIdArray(len, array->ctx, array->dtype.bits); + out_d = ret.Ptr(); + } + // Allocate workspace + size_t workspace_size = 0; + CUDA_CALL(cub::DeviceScan::InclusiveSum( + nullptr, workspace_size, in_d, out_d, len, stream)); + void* workspace = device->AllocWorkspace(array->ctx, workspace_size); + + // Compute cumsum + CUDA_CALL(cub::DeviceScan::InclusiveSum( + workspace, workspace_size, in_d, out_d, len, stream)); + + device->FreeWorkspace(array->ctx, workspace); + + return ret; +} + +template IdArray CumSum(IdArray, bool); +template IdArray CumSum(IdArray, bool); + +} // namespace impl +} // namespace aten +} // namespace dgl diff --git a/src/array/cuda/array_index_select.cu b/src/array/cuda/array_index_select.cu index 6e29e996a1c3..e5370a4c27c7 100644 --- a/src/array/cuda/array_index_select.cu +++ b/src/array/cuda/array_index_select.cu @@ -33,7 +33,7 @@ NDArray IndexSelect(NDArray array, IdArray index) { const DType* array_data = static_cast(cuda::GetDevicePointer(array)); const IdType* idx_data = static_cast(index->data); - cudaStream_t stream = runtime::getCurrentCUDAStream(); + hipStream_t stream = runtime::getCurrentCUDAStream(); if (num_feat == 1) { const int nt = cuda::FindNumThreads(len); const int nb = (len + nt - 1) / nt; @@ -61,9 +61,9 @@ template NDArray IndexSelect(NDArray, IdArray); template NDArray IndexSelect(NDArray, IdArray); template NDArray IndexSelect(NDArray, IdArray); #if BF16_ENABLED -template NDArray IndexSelect( +template NDArray IndexSelect( NDArray, IdArray); -template NDArray IndexSelect( +template NDArray IndexSelect( NDArray, IdArray); #endif // BF16_ENABLED template NDArray IndexSelect(NDArray, IdArray); @@ -87,7 +87,7 @@ template uint32_t IndexSelect(NDArray array, int64_t index); template uint64_t IndexSelect(NDArray array, int64_t index); template __half IndexSelect(NDArray array, int64_t index); #if BF16_ENABLED -template __nv_bfloat16 IndexSelect( +template __hip_bfloat16 IndexSelect( NDArray array, int64_t index); #endif // BF16_ENABLED template float IndexSelect(NDArray array, int64_t index); diff --git a/src/array/cuda/array_index_select.cu.prehip b/src/array/cuda/array_index_select.cu.prehip new file mode 100644 index 000000000000..6e29e996a1c3 --- /dev/null +++ b/src/array/cuda/array_index_select.cu.prehip @@ -0,0 +1,98 @@ +/** + * Copyright (c) 2019 by Contributors + * @file array/cpu/array_index_select.cu + * @brief Array index select GPU implementation + */ +#include + +#include "../../runtime/cuda/cuda_common.h" +#include "./array_index_select.cuh" +#include "./utils.h" + +namespace dgl { +using runtime::NDArray; +namespace aten { +namespace impl { + +template +NDArray IndexSelect(NDArray array, IdArray index) { + const int64_t arr_len = array->shape[0]; + const int64_t len = index->shape[0]; + int64_t num_feat = 1; + std::vector shape{len}; + for (int d = 1; d < array->ndim; ++d) { + num_feat *= array->shape[d]; + shape.emplace_back(array->shape[d]); + } + + // use index->ctx for pinned array + NDArray ret = NDArray::Empty(shape, array->dtype, index->ctx); + if (len == 0 || arr_len * num_feat == 0) return ret; + DType* ret_data = static_cast(ret->data); + + const 
DType* array_data = static_cast(cuda::GetDevicePointer(array)); + const IdType* idx_data = static_cast(index->data); + + cudaStream_t stream = runtime::getCurrentCUDAStream(); + if (num_feat == 1) { + const int nt = cuda::FindNumThreads(len); + const int nb = (len + nt - 1) / nt; + CUDA_KERNEL_CALL( + IndexSelectSingleKernel, nb, nt, 0, stream, array_data, idx_data, len, + arr_len, ret_data); + } else { + dim3 block(256, 1); + while (static_cast(block.x) >= 2 * num_feat) { + block.x /= 2; + block.y *= 2; + } + const dim3 grid((len + block.y - 1) / block.y); + CUDA_KERNEL_CALL( + IndexSelectMultiKernel, grid, block, 0, stream, array_data, num_feat, + idx_data, len, arr_len, ret_data); + } + return ret; +} + +template NDArray IndexSelect(NDArray, IdArray); +template NDArray IndexSelect(NDArray, IdArray); +template NDArray IndexSelect(NDArray, IdArray); +template NDArray IndexSelect(NDArray, IdArray); +template NDArray IndexSelect(NDArray, IdArray); +template NDArray IndexSelect(NDArray, IdArray); +#if BF16_ENABLED +template NDArray IndexSelect( + NDArray, IdArray); +template NDArray IndexSelect( + NDArray, IdArray); +#endif // BF16_ENABLED +template NDArray IndexSelect(NDArray, IdArray); +template NDArray IndexSelect(NDArray, IdArray); +template NDArray IndexSelect(NDArray, IdArray); +template NDArray IndexSelect(NDArray, IdArray); + +template +DType IndexSelect(NDArray array, int64_t index) { + auto device = runtime::DeviceAPI::Get(array->ctx); + DType ret = static_cast(0.0f); + device->CopyDataFromTo( + static_cast(array->data) + index, 0, &ret, 0, sizeof(DType), + array->ctx, DGLContext{kDGLCPU, 0}, array->dtype); + return ret; +} + +template int32_t IndexSelect(NDArray array, int64_t index); +template int64_t IndexSelect(NDArray array, int64_t index); +template uint32_t IndexSelect(NDArray array, int64_t index); +template uint64_t IndexSelect(NDArray array, int64_t index); +template __half IndexSelect(NDArray array, int64_t index); +#if BF16_ENABLED +template __nv_bfloat16 IndexSelect( + NDArray array, int64_t index); +#endif // BF16_ENABLED +template float IndexSelect(NDArray array, int64_t index); +template double IndexSelect(NDArray array, int64_t index); + +} // namespace impl +} // namespace aten +} // namespace dgl diff --git a/src/array/cuda/array_index_select.cuh b/src/array/cuda/array_index_select.cuh index e4d8673ca026..441bd8e505a4 100644 --- a/src/array/cuda/array_index_select.cuh +++ b/src/array/cuda/array_index_select.cuh @@ -1,3 +1,4 @@ +#include "hip/hip_runtime.h" /** * Copyright (c) 2021-2022 by Contributors * @file array/cuda/array_index_select.cuh diff --git a/src/array/cuda/array_index_select.cuh.prehip b/src/array/cuda/array_index_select.cuh.prehip new file mode 100644 index 000000000000..e4d8673ca026 --- /dev/null +++ b/src/array/cuda/array_index_select.cuh.prehip @@ -0,0 +1,87 @@ +/** + * Copyright (c) 2021-2022 by Contributors + * @file array/cuda/array_index_select.cuh + * @brief Array index select GPU kernel implementation + */ + +#ifndef DGL_ARRAY_CUDA_ARRAY_INDEX_SELECT_CUH_ +#define DGL_ARRAY_CUDA_ARRAY_INDEX_SELECT_CUH_ + +namespace dgl { +namespace aten { +namespace impl { + +template +__global__ void IndexSelectSingleKernel( + const DType* array, const IdType* index, const int64_t length, + const int64_t arr_len, DType* out, const int64_t* perm = nullptr) { + int64_t tx = blockIdx.x * blockDim.x + threadIdx.x; + int stride_x = gridDim.x * blockDim.x; + while (tx < length) { + assert(index[tx] >= 0 && index[tx] < arr_len); + const auto out_row = perm ? 
perm[tx] : tx; + out[out_row] = array[index[tx]]; + tx += stride_x; + } +} + +template +__global__ void IndexSelectMultiKernel( + const DType* const array, const int64_t num_feat, const IdType* const index, + const int64_t length, const int64_t arr_len, DType* const out, + const int64_t* perm = nullptr) { + int64_t out_row_index = blockIdx.x * blockDim.y + threadIdx.y; + + const int64_t stride = blockDim.y * gridDim.x; + + while (out_row_index < length) { + int64_t col = threadIdx.x; + const int64_t in_row = index[out_row_index]; + assert(in_row >= 0 && in_row < arr_len); + const auto out_row = perm ? perm[out_row_index] : out_row_index; + while (col < num_feat) { + out[out_row * num_feat + col] = array[in_row * num_feat + col]; + col += blockDim.x; + } + out_row_index += stride; + } +} + +template +__global__ void IndexScatterSingleKernel( + const DType* array, const IdType* index, const int64_t length, + const int64_t arr_len, DType* out) { + int tx = blockIdx.x * blockDim.x + threadIdx.x; + int stride_x = gridDim.x * blockDim.x; + while (tx < length) { + assert(index[tx] >= 0 && index[tx] < arr_len); + out[index[tx]] = array[tx]; + tx += stride_x; + } +} + +template +__global__ void IndexScatterMultiKernel( + const DType* const array, const int64_t num_feat, const IdType* const index, + const int64_t length, const int64_t arr_len, DType* const out) { + int64_t in_row = blockIdx.x * blockDim.y + threadIdx.y; + + const int64_t stride = blockDim.y * gridDim.x; + + while (in_row < length) { + int64_t col = threadIdx.x; + const int64_t out_row = index[in_row]; + assert(out_row >= 0 && out_row < arr_len); + while (col < num_feat) { + out[out_row * num_feat + col] = array[in_row * num_feat + col]; + col += blockDim.x; + } + in_row += stride; + } +} + +} // namespace impl +} // namespace aten +} // namespace dgl + +#endif // DGL_ARRAY_CUDA_ARRAY_INDEX_SELECT_CUH_ diff --git a/src/array/cuda/array_nonzero.cu b/src/array/cuda/array_nonzero.cu index 3ffaad2d657a..299afd4cc235 100644 --- a/src/array/cuda/array_nonzero.cu +++ b/src/array/cuda/array_nonzero.cu @@ -6,7 +6,7 @@ #include -#include +#include #include "../../runtime/cuda/cuda_common.h" #include "./utils.h" @@ -33,24 +33,24 @@ IdArray NonZero(IdArray array) { const int64_t len = array->shape[0]; IdArray ret = NewIdArray(len, ctx, 64); - cudaStream_t stream = runtime::getCurrentCUDAStream(); + hipStream_t stream = runtime::getCurrentCUDAStream(); const IdType* const in_data = static_cast(array->data); int64_t* const out_data = static_cast(ret->data); IsNonZeroIndex comp(in_data); - cub::CountingInputIterator counter(0); + hipcub::CountingInputIterator counter(0); // room for cub to output on GPU int64_t* d_num_nonzeros = static_cast(device->AllocWorkspace(ctx, sizeof(int64_t))); size_t temp_size = 0; - CUDA_CALL(cub::DeviceSelect::If( + CUDA_CALL(hipcub::DeviceSelect::If( nullptr, temp_size, counter, out_data, d_num_nonzeros, len, comp, stream)); void* temp = device->AllocWorkspace(ctx, temp_size); - CUDA_CALL(cub::DeviceSelect::If( + CUDA_CALL(hipcub::DeviceSelect::If( temp, temp_size, counter, out_data, d_num_nonzeros, len, comp, stream)); device->FreeWorkspace(ctx, temp); diff --git a/src/array/cuda/array_nonzero.cu.prehip b/src/array/cuda/array_nonzero.cu.prehip new file mode 100644 index 000000000000..3ffaad2d657a --- /dev/null +++ b/src/array/cuda/array_nonzero.cu.prehip @@ -0,0 +1,71 @@ +/** + * Copyright (c) 2020 by Contributors + * @file array/cpu/array_nonzero.cc + * @brief Array nonzero CPU implementation + */ + +#include + 
+#include + +#include "../../runtime/cuda/cuda_common.h" +#include "./utils.h" + +namespace dgl { +using runtime::NDArray; +namespace aten { +namespace impl { + +template +struct IsNonZeroIndex { + explicit IsNonZeroIndex(const IdType* array) : array_(array) {} + + __device__ bool operator()(const int64_t index) { return array_[index] != 0; } + + const IdType* array_; +}; + +template +IdArray NonZero(IdArray array) { + const auto& ctx = array->ctx; + auto device = runtime::DeviceAPI::Get(ctx); + + const int64_t len = array->shape[0]; + IdArray ret = NewIdArray(len, ctx, 64); + + cudaStream_t stream = runtime::getCurrentCUDAStream(); + + const IdType* const in_data = static_cast(array->data); + int64_t* const out_data = static_cast(ret->data); + + IsNonZeroIndex comp(in_data); + cub::CountingInputIterator counter(0); + + // room for cub to output on GPU + int64_t* d_num_nonzeros = + static_cast(device->AllocWorkspace(ctx, sizeof(int64_t))); + + size_t temp_size = 0; + CUDA_CALL(cub::DeviceSelect::If( + nullptr, temp_size, counter, out_data, d_num_nonzeros, len, comp, + stream)); + void* temp = device->AllocWorkspace(ctx, temp_size); + CUDA_CALL(cub::DeviceSelect::If( + temp, temp_size, counter, out_data, d_num_nonzeros, len, comp, stream)); + device->FreeWorkspace(ctx, temp); + + // copy number of selected elements from GPU to CPU + int64_t num_nonzeros = cuda::GetCUDAScalar(device, ctx, d_num_nonzeros); + device->FreeWorkspace(ctx, d_num_nonzeros); + device->StreamSync(ctx, stream); + + // truncate array to size + return ret.CreateView({num_nonzeros}, ret->dtype, 0); +} + +template IdArray NonZero(IdArray); +template IdArray NonZero(IdArray); + +} // namespace impl +} // namespace aten +} // namespace dgl diff --git a/src/array/cuda/array_op_impl.cu b/src/array/cuda/array_op_impl.cu index f5f19be81ba3..f517e1057468 100644 --- a/src/array/cuda/array_op_impl.cu +++ b/src/array/cuda/array_op_impl.cu @@ -1,3 +1,4 @@ +#include "hip/hip_runtime.h" /** * Copyright (c) 2020-2021 by Contributors * @file array/cuda/array_op_impl.cu @@ -36,7 +37,7 @@ IdArray BinaryElewise(IdArray lhs, IdArray rhs) { const IdType* lhs_data = static_cast(lhs->data); const IdType* rhs_data = static_cast(rhs->data); IdType* ret_data = static_cast(ret->data); - cudaStream_t stream = runtime::getCurrentCUDAStream(); + hipStream_t stream = runtime::getCurrentCUDAStream(); int nt = cuda::FindNumThreads(len); int nb = (len + nt - 1) / nt; CUDA_KERNEL_CALL( @@ -107,7 +108,7 @@ IdArray BinaryElewise(IdArray lhs, IdType rhs) { IdArray ret = NewIdArray(lhs->shape[0], lhs->ctx, lhs->dtype.bits); const IdType* lhs_data = static_cast(lhs->data); IdType* ret_data = static_cast(ret->data); - cudaStream_t stream = runtime::getCurrentCUDAStream(); + hipStream_t stream = runtime::getCurrentCUDAStream(); int nt = cuda::FindNumThreads(len); int nb = (len + nt - 1) / nt; CUDA_KERNEL_CALL( @@ -178,7 +179,7 @@ IdArray BinaryElewise(IdType lhs, IdArray rhs) { IdArray ret = NewIdArray(rhs->shape[0], rhs->ctx, rhs->dtype.bits); const IdType* rhs_data = static_cast(rhs->data); IdType* ret_data = static_cast(ret->data); - cudaStream_t stream = runtime::getCurrentCUDAStream(); + hipStream_t stream = runtime::getCurrentCUDAStream(); int nt = cuda::FindNumThreads(len); int nb = (len + nt - 1) / nt; CUDA_KERNEL_CALL( @@ -249,7 +250,7 @@ IdArray UnaryElewise(IdArray lhs) { IdArray ret = NewIdArray(lhs->shape[0], lhs->ctx, lhs->dtype.bits); const IdType* lhs_data = static_cast(lhs->data); IdType* ret_data = static_cast(ret->data); - cudaStream_t stream 
= runtime::getCurrentCUDAStream(); + hipStream_t stream = runtime::getCurrentCUDAStream(); int nt = cuda::FindNumThreads(len); int nb = (len + nt - 1) / nt; CUDA_KERNEL_CALL( @@ -277,7 +278,7 @@ template NDArray Full(DType val, int64_t length, DGLContext ctx) { NDArray ret = NDArray::Empty({length}, DGLDataTypeTraits::dtype, ctx); DType* ret_data = static_cast(ret->data); - cudaStream_t stream = runtime::getCurrentCUDAStream(); + hipStream_t stream = runtime::getCurrentCUDAStream(); int nt = cuda::FindNumThreads(length); int nb = (length + nt - 1) / nt; CUDA_KERNEL_CALL( @@ -292,8 +293,8 @@ template IdArray Full( template IdArray Full( __half val, int64_t length, DGLContext ctx); #if BF16_ENABLED -template IdArray Full( - __nv_bfloat16 val, int64_t length, DGLContext ctx); +template IdArray Full( + __hip_bfloat16 val, int64_t length, DGLContext ctx); #endif // BF16_ENABLED template IdArray Full( float val, int64_t length, DGLContext ctx); @@ -319,7 +320,7 @@ IdArray Range(IdType low, IdType high, DGLContext ctx) { IdArray ret = NewIdArray(length, ctx, sizeof(IdType) * 8); if (length == 0) return ret; IdType* ret_data = static_cast(ret->data); - cudaStream_t stream = runtime::getCurrentCUDAStream(); + hipStream_t stream = runtime::getCurrentCUDAStream(); int nt = cuda::FindNumThreads(length); int nb = (length + nt - 1) / nt; CUDA_KERNEL_CALL( @@ -355,7 +356,7 @@ IdArray Relabel_(const std::vector& arrays) { const auto& ctx = arrays[0]->ctx; auto device = runtime::DeviceAPI::Get(ctx); - cudaStream_t stream = runtime::getCurrentCUDAStream(); + hipStream_t stream = runtime::getCurrentCUDAStream(); // build node maps and get the induced nodes OrderedHashTable node_map(total_length, ctx, stream); @@ -364,7 +365,7 @@ IdArray Relabel_(const std::vector& arrays) { static_cast(device->AllocWorkspace(ctx, sizeof(int64_t))); IdArray induced_nodes = NewIdArray(total_length, ctx, sizeof(IdType) * 8); - CUDA_CALL(cudaMemsetAsync( + CUDA_CALL(hipMemsetAsync( num_induced_device, 0, sizeof(*num_induced_device), stream)); node_map.FillWithDuplicates( @@ -416,7 +417,7 @@ IdArray AsNumBits(IdArray arr, uint8_t bits) { const std::vector shape(arr->shape, arr->shape + arr->ndim); IdArray ret = IdArray::Empty(shape, DGLDataType{kDGLInt, bits, 1}, arr->ctx); const int64_t length = ret.NumElements(); - cudaStream_t stream = runtime::getCurrentCUDAStream(); + hipStream_t stream = runtime::getCurrentCUDAStream(); int nt = cuda::FindNumThreads(length); int nb = (length + nt - 1) / nt; if (bits == 32) { diff --git a/src/array/cuda/array_op_impl.cu.prehip b/src/array/cuda/array_op_impl.cu.prehip new file mode 100644 index 000000000000..f5f19be81ba3 --- /dev/null +++ b/src/array/cuda/array_op_impl.cu.prehip @@ -0,0 +1,441 @@ +/** + * Copyright (c) 2020-2021 by Contributors + * @file array/cuda/array_op_impl.cu + * @brief Array operator GPU implementation + */ +#include + +#include "../../runtime/cuda/cuda_common.h" +#include "../../runtime/cuda/cuda_hashtable.cuh" +#include "../arith.h" +#include "./utils.h" + +namespace dgl { +using runtime::NDArray; +using namespace runtime::cuda; +namespace aten { +namespace impl { + +///////////////////////////// BinaryElewise ///////////////////////////// + +template +__global__ void _BinaryElewiseKernel( + const IdType* lhs, const IdType* rhs, IdType* out, int64_t length) { + int tx = blockIdx.x * blockDim.x + threadIdx.x; + int stride_x = gridDim.x * blockDim.x; + while (tx < length) { + out[tx] = Op::Call(lhs[tx], rhs[tx]); + tx += stride_x; + } +} + +template +IdArray 
BinaryElewise(IdArray lhs, IdArray rhs) { + const int64_t len = lhs->shape[0]; + IdArray ret = NewIdArray(lhs->shape[0], lhs->ctx, lhs->dtype.bits); + const IdType* lhs_data = static_cast(lhs->data); + const IdType* rhs_data = static_cast(rhs->data); + IdType* ret_data = static_cast(ret->data); + cudaStream_t stream = runtime::getCurrentCUDAStream(); + int nt = cuda::FindNumThreads(len); + int nb = (len + nt - 1) / nt; + CUDA_KERNEL_CALL( + (_BinaryElewiseKernel), nb, nt, 0, stream, lhs_data, rhs_data, + ret_data, len); + return ret; +} + +template IdArray BinaryElewise( + IdArray lhs, IdArray rhs); +template IdArray BinaryElewise( + IdArray lhs, IdArray rhs); +template IdArray BinaryElewise( + IdArray lhs, IdArray rhs); +template IdArray BinaryElewise( + IdArray lhs, IdArray rhs); +template IdArray BinaryElewise( + IdArray lhs, IdArray rhs); +template IdArray BinaryElewise( + IdArray lhs, IdArray rhs); +template IdArray BinaryElewise( + IdArray lhs, IdArray rhs); +template IdArray BinaryElewise( + IdArray lhs, IdArray rhs); +template IdArray BinaryElewise( + IdArray lhs, IdArray rhs); +template IdArray BinaryElewise( + IdArray lhs, IdArray rhs); +template IdArray BinaryElewise( + IdArray lhs, IdArray rhs); +template IdArray BinaryElewise( + IdArray lhs, IdArray rhs); +template IdArray BinaryElewise( + IdArray lhs, IdArray rhs); +template IdArray BinaryElewise( + IdArray lhs, IdArray rhs); +template IdArray BinaryElewise( + IdArray lhs, IdArray rhs); +template IdArray BinaryElewise( + IdArray lhs, IdArray rhs); +template IdArray BinaryElewise( + IdArray lhs, IdArray rhs); +template IdArray BinaryElewise( + IdArray lhs, IdArray rhs); +template IdArray BinaryElewise( + IdArray lhs, IdArray rhs); +template IdArray BinaryElewise( + IdArray lhs, IdArray rhs); +template IdArray BinaryElewise( + IdArray lhs, IdArray rhs); +template IdArray BinaryElewise( + IdArray lhs, IdArray rhs); + +template +__global__ void _BinaryElewiseKernel( + const IdType* lhs, IdType rhs, IdType* out, int64_t length) { + int tx = blockIdx.x * blockDim.x + threadIdx.x; + int stride_x = gridDim.x * blockDim.x; + while (tx < length) { + out[tx] = Op::Call(lhs[tx], rhs); + tx += stride_x; + } +} + +template +IdArray BinaryElewise(IdArray lhs, IdType rhs) { + const int64_t len = lhs->shape[0]; + IdArray ret = NewIdArray(lhs->shape[0], lhs->ctx, lhs->dtype.bits); + const IdType* lhs_data = static_cast(lhs->data); + IdType* ret_data = static_cast(ret->data); + cudaStream_t stream = runtime::getCurrentCUDAStream(); + int nt = cuda::FindNumThreads(len); + int nb = (len + nt - 1) / nt; + CUDA_KERNEL_CALL( + (_BinaryElewiseKernel), nb, nt, 0, stream, lhs_data, rhs, + ret_data, len); + return ret; +} + +template IdArray BinaryElewise( + IdArray lhs, int32_t rhs); +template IdArray BinaryElewise( + IdArray lhs, int32_t rhs); +template IdArray BinaryElewise( + IdArray lhs, int32_t rhs); +template IdArray BinaryElewise( + IdArray lhs, int32_t rhs); +template IdArray BinaryElewise( + IdArray lhs, int32_t rhs); +template IdArray BinaryElewise( + IdArray lhs, int32_t rhs); +template IdArray BinaryElewise( + IdArray lhs, int32_t rhs); +template IdArray BinaryElewise( + IdArray lhs, int32_t rhs); +template IdArray BinaryElewise( + IdArray lhs, int32_t rhs); +template IdArray BinaryElewise( + IdArray lhs, int32_t rhs); +template IdArray BinaryElewise( + IdArray lhs, int32_t rhs); +template IdArray BinaryElewise( + IdArray lhs, int64_t rhs); +template IdArray BinaryElewise( + IdArray lhs, int64_t rhs); +template IdArray BinaryElewise( + 
IdArray lhs, int64_t rhs); +template IdArray BinaryElewise( + IdArray lhs, int64_t rhs); +template IdArray BinaryElewise( + IdArray lhs, int64_t rhs); +template IdArray BinaryElewise( + IdArray lhs, int64_t rhs); +template IdArray BinaryElewise( + IdArray lhs, int64_t rhs); +template IdArray BinaryElewise( + IdArray lhs, int64_t rhs); +template IdArray BinaryElewise( + IdArray lhs, int64_t rhs); +template IdArray BinaryElewise( + IdArray lhs, int64_t rhs); +template IdArray BinaryElewise( + IdArray lhs, int64_t rhs); + +template +__global__ void _BinaryElewiseKernel( + IdType lhs, const IdType* rhs, IdType* out, int64_t length) { + int tx = blockIdx.x * blockDim.x + threadIdx.x; + int stride_x = gridDim.x * blockDim.x; + while (tx < length) { + out[tx] = Op::Call(lhs, rhs[tx]); + tx += stride_x; + } +} + +template +IdArray BinaryElewise(IdType lhs, IdArray rhs) { + const int64_t len = rhs->shape[0]; + IdArray ret = NewIdArray(rhs->shape[0], rhs->ctx, rhs->dtype.bits); + const IdType* rhs_data = static_cast(rhs->data); + IdType* ret_data = static_cast(ret->data); + cudaStream_t stream = runtime::getCurrentCUDAStream(); + int nt = cuda::FindNumThreads(len); + int nb = (len + nt - 1) / nt; + CUDA_KERNEL_CALL( + (_BinaryElewiseKernel), nb, nt, 0, stream, lhs, rhs_data, + ret_data, len); + return ret; +} + +template IdArray BinaryElewise( + int32_t lhs, IdArray rhs); +template IdArray BinaryElewise( + int32_t lhs, IdArray rhs); +template IdArray BinaryElewise( + int32_t lhs, IdArray rhs); +template IdArray BinaryElewise( + int32_t lhs, IdArray rhs); +template IdArray BinaryElewise( + int32_t lhs, IdArray rhs); +template IdArray BinaryElewise( + int32_t lhs, IdArray rhs); +template IdArray BinaryElewise( + int32_t lhs, IdArray rhs); +template IdArray BinaryElewise( + int32_t lhs, IdArray rhs); +template IdArray BinaryElewise( + int32_t lhs, IdArray rhs); +template IdArray BinaryElewise( + int32_t lhs, IdArray rhs); +template IdArray BinaryElewise( + int32_t lhs, IdArray rhs); +template IdArray BinaryElewise( + int64_t lhs, IdArray rhs); +template IdArray BinaryElewise( + int64_t lhs, IdArray rhs); +template IdArray BinaryElewise( + int64_t lhs, IdArray rhs); +template IdArray BinaryElewise( + int64_t lhs, IdArray rhs); +template IdArray BinaryElewise( + int64_t lhs, IdArray rhs); +template IdArray BinaryElewise( + int64_t lhs, IdArray rhs); +template IdArray BinaryElewise( + int64_t lhs, IdArray rhs); +template IdArray BinaryElewise( + int64_t lhs, IdArray rhs); +template IdArray BinaryElewise( + int64_t lhs, IdArray rhs); +template IdArray BinaryElewise( + int64_t lhs, IdArray rhs); +template IdArray BinaryElewise( + int64_t lhs, IdArray rhs); + +template +__global__ void _UnaryElewiseKernel( + const IdType* lhs, IdType* out, int64_t length) { + int tx = blockIdx.x * blockDim.x + threadIdx.x; + int stride_x = gridDim.x * blockDim.x; + while (tx < length) { + out[tx] = Op::Call(lhs[tx]); + tx += stride_x; + } +} + +template +IdArray UnaryElewise(IdArray lhs) { + const int64_t len = lhs->shape[0]; + IdArray ret = NewIdArray(lhs->shape[0], lhs->ctx, lhs->dtype.bits); + const IdType* lhs_data = static_cast(lhs->data); + IdType* ret_data = static_cast(ret->data); + cudaStream_t stream = runtime::getCurrentCUDAStream(); + int nt = cuda::FindNumThreads(len); + int nb = (len + nt - 1) / nt; + CUDA_KERNEL_CALL( + (_UnaryElewiseKernel), nb, nt, 0, stream, lhs_data, ret_data, + len); + return ret; +} + +template IdArray UnaryElewise(IdArray lhs); +template IdArray UnaryElewise(IdArray lhs); + 
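+// Editorial sketch (not part of the original file): every helper above follows
+// the same recipe: allocate the output IdArray, pick a launch shape with
+// cuda::FindNumThreads(), and apply one functor from ../arith.h per element in
+// a grid-stride loop.  Assuming the usual <DGLDeviceType XPU, typename IdType,
+// typename Op> template parameters, a call would look roughly like:
+//
+//   IdArray c = BinaryElewise<kDGLCUDA, int64_t, arith::Add>(a, b);  // c[i] = a[i] + b[i]
+//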
+///////////////////////////// Full ///////////////////////////// + +template +__global__ void _FullKernel(DType* out, int64_t length, DType val) { + int tx = blockIdx.x * blockDim.x + threadIdx.x; + int stride_x = gridDim.x * blockDim.x; + while (tx < length) { + out[tx] = val; + tx += stride_x; + } +} + +template +NDArray Full(DType val, int64_t length, DGLContext ctx) { + NDArray ret = NDArray::Empty({length}, DGLDataTypeTraits::dtype, ctx); + DType* ret_data = static_cast(ret->data); + cudaStream_t stream = runtime::getCurrentCUDAStream(); + int nt = cuda::FindNumThreads(length); + int nb = (length + nt - 1) / nt; + CUDA_KERNEL_CALL( + (_FullKernel), nb, nt, 0, stream, ret_data, length, val); + return ret; +} + +template IdArray Full( + int32_t val, int64_t length, DGLContext ctx); +template IdArray Full( + int64_t val, int64_t length, DGLContext ctx); +template IdArray Full( + __half val, int64_t length, DGLContext ctx); +#if BF16_ENABLED +template IdArray Full( + __nv_bfloat16 val, int64_t length, DGLContext ctx); +#endif // BF16_ENABLED +template IdArray Full( + float val, int64_t length, DGLContext ctx); +template IdArray Full( + double val, int64_t length, DGLContext ctx); + +///////////////////////////// Range ///////////////////////////// + +template +__global__ void _RangeKernel(IdType* out, IdType low, IdType length) { + int tx = blockIdx.x * blockDim.x + threadIdx.x; + int stride_x = gridDim.x * blockDim.x; + while (tx < length) { + out[tx] = low + tx; + tx += stride_x; + } +} + +template +IdArray Range(IdType low, IdType high, DGLContext ctx) { + CHECK(high >= low) << "high must be bigger than low"; + const IdType length = high - low; + IdArray ret = NewIdArray(length, ctx, sizeof(IdType) * 8); + if (length == 0) return ret; + IdType* ret_data = static_cast(ret->data); + cudaStream_t stream = runtime::getCurrentCUDAStream(); + int nt = cuda::FindNumThreads(length); + int nb = (length + nt - 1) / nt; + CUDA_KERNEL_CALL( + (_RangeKernel), nb, nt, 0, stream, ret_data, low, length); + return ret; +} + +template IdArray Range(int32_t, int32_t, DGLContext); +template IdArray Range(int64_t, int64_t, DGLContext); + +///////////////////////////// Relabel_ ////////////////////////////// + +template +__global__ void _RelabelKernel( + IdType* out, int64_t length, DeviceOrderedHashTable table) { + int tx = blockIdx.x * blockDim.x + threadIdx.x; + int stride_x = gridDim.x * blockDim.x; + + while (tx < length) { + out[tx] = table.Search(out[tx])->local; + tx += stride_x; + } +} + +template +IdArray Relabel_(const std::vector& arrays) { + IdArray all_nodes = Concat(arrays); + const int64_t total_length = all_nodes->shape[0]; + + if (total_length == 0) { + return all_nodes; + } + + const auto& ctx = arrays[0]->ctx; + auto device = runtime::DeviceAPI::Get(ctx); + cudaStream_t stream = runtime::getCurrentCUDAStream(); + + // build node maps and get the induced nodes + OrderedHashTable node_map(total_length, ctx, stream); + int64_t num_induced = 0; + int64_t* num_induced_device = + static_cast(device->AllocWorkspace(ctx, sizeof(int64_t))); + IdArray induced_nodes = NewIdArray(total_length, ctx, sizeof(IdType) * 8); + + CUDA_CALL(cudaMemsetAsync( + num_induced_device, 0, sizeof(*num_induced_device), stream)); + + node_map.FillWithDuplicates( + all_nodes.Ptr(), all_nodes->shape[0], induced_nodes.Ptr(), + num_induced_device, stream); + // copy using the internal current stream + device->CopyDataFromTo( + num_induced_device, 0, &num_induced, 0, sizeof(num_induced), ctx, + DGLContext{kDGLCPU, 0}, 
DGLDataType{kDGLInt, 64, 1}); + + device->StreamSync(ctx, stream); + device->FreeWorkspace(ctx, num_induced_device); + + // resize the induced nodes + induced_nodes->shape[0] = num_induced; + + // relabel + const int nt = 128; + for (IdArray arr : arrays) { + const int64_t length = arr->shape[0]; + int nb = (length + nt - 1) / nt; + CUDA_KERNEL_CALL( + (_RelabelKernel), nb, nt, 0, stream, arr.Ptr(), length, + node_map.DeviceHandle()); + } + + return induced_nodes; +} + +template IdArray Relabel_( + const std::vector& arrays); +template IdArray Relabel_( + const std::vector& arrays); + +///////////////////////////// AsNumBits ///////////////////////////// + +template +__global__ void _CastKernel(const InType* in, OutType* out, size_t length) { + int tx = blockIdx.x * blockDim.x + threadIdx.x; + int stride_x = gridDim.x * blockDim.x; + while (tx < length) { + out[tx] = in[tx]; + tx += stride_x; + } +} + +template +IdArray AsNumBits(IdArray arr, uint8_t bits) { + const std::vector shape(arr->shape, arr->shape + arr->ndim); + IdArray ret = IdArray::Empty(shape, DGLDataType{kDGLInt, bits, 1}, arr->ctx); + const int64_t length = ret.NumElements(); + cudaStream_t stream = runtime::getCurrentCUDAStream(); + int nt = cuda::FindNumThreads(length); + int nb = (length + nt - 1) / nt; + if (bits == 32) { + CUDA_KERNEL_CALL( + (_CastKernel), nb, nt, 0, stream, + static_cast(arr->data), static_cast(ret->data), + length); + } else { + CUDA_KERNEL_CALL( + (_CastKernel), nb, nt, 0, stream, + static_cast(arr->data), static_cast(ret->data), + length); + } + return ret; +} + +template IdArray AsNumBits(IdArray arr, uint8_t bits); +template IdArray AsNumBits(IdArray arr, uint8_t bits); + +} // namespace impl +} // namespace aten +} // namespace dgl diff --git a/src/array/cuda/array_scatter.cu b/src/array/cuda/array_scatter.cu index 41acbba92581..83480f7b6be8 100644 --- a/src/array/cuda/array_scatter.cu +++ b/src/array/cuda/array_scatter.cu @@ -1,3 +1,4 @@ +#include "hip/hip_runtime.h" /** * Copyright (c) 2019 by Contributors * @file array/cuda/array_scatter.cu @@ -31,7 +32,7 @@ void Scatter_(IdArray index, NDArray value, NDArray out) { const DType* val = value.Ptr(); DType* outd = out.Ptr(); - cudaStream_t stream = runtime::getCurrentCUDAStream(); + hipStream_t stream = runtime::getCurrentCUDAStream(); const int nt = cuda::FindNumThreads(len); const int nb = (len + nt - 1) / nt; CUDA_KERNEL_CALL(_ScatterKernel, nb, nt, 0, stream, idx, val, len, outd); @@ -41,7 +42,7 @@ template void Scatter_(IdArray, NDArray, NDArray); template void Scatter_(IdArray, NDArray, NDArray); template void Scatter_(IdArray, NDArray, NDArray); #if BF16_ENABLED -template void Scatter_( +template void Scatter_( IdArray, NDArray, NDArray); #endif // BF16_ENABLED template void Scatter_(IdArray, NDArray, NDArray); @@ -50,7 +51,7 @@ template void Scatter_(IdArray, NDArray, NDArray); template void Scatter_(IdArray, NDArray, NDArray); template void Scatter_(IdArray, NDArray, NDArray); #if BF16_ENABLED -template void Scatter_( +template void Scatter_( IdArray, NDArray, NDArray); #endif // BF16_ENABLED template void Scatter_(IdArray, NDArray, NDArray); diff --git a/src/array/cuda/array_scatter.cu.prehip b/src/array/cuda/array_scatter.cu.prehip new file mode 100644 index 000000000000..41acbba92581 --- /dev/null +++ b/src/array/cuda/array_scatter.cu.prehip @@ -0,0 +1,61 @@ +/** + * Copyright (c) 2019 by Contributors + * @file array/cuda/array_scatter.cu + * @brief Array scatter GPU implementation + */ +#include + +#include 
"../../runtime/cuda/cuda_common.h" +#include "./utils.h" + +namespace dgl { +using runtime::NDArray; +namespace aten { +namespace impl { + +template +__global__ void _ScatterKernel( + const IdType* index, const DType* value, int64_t length, DType* out) { + int tx = blockIdx.x * blockDim.x + threadIdx.x; + int stride_x = gridDim.x * blockDim.x; + while (tx < length) { + out[index[tx]] = value[tx]; + tx += stride_x; + } +} + +template +void Scatter_(IdArray index, NDArray value, NDArray out) { + const int64_t len = index->shape[0]; + const IdType* idx = index.Ptr(); + const DType* val = value.Ptr(); + DType* outd = out.Ptr(); + + cudaStream_t stream = runtime::getCurrentCUDAStream(); + const int nt = cuda::FindNumThreads(len); + const int nb = (len + nt - 1) / nt; + CUDA_KERNEL_CALL(_ScatterKernel, nb, nt, 0, stream, idx, val, len, outd); +} + +template void Scatter_(IdArray, NDArray, NDArray); +template void Scatter_(IdArray, NDArray, NDArray); +template void Scatter_(IdArray, NDArray, NDArray); +#if BF16_ENABLED +template void Scatter_( + IdArray, NDArray, NDArray); +#endif // BF16_ENABLED +template void Scatter_(IdArray, NDArray, NDArray); +template void Scatter_(IdArray, NDArray, NDArray); +template void Scatter_(IdArray, NDArray, NDArray); +template void Scatter_(IdArray, NDArray, NDArray); +template void Scatter_(IdArray, NDArray, NDArray); +#if BF16_ENABLED +template void Scatter_( + IdArray, NDArray, NDArray); +#endif // BF16_ENABLED +template void Scatter_(IdArray, NDArray, NDArray); +template void Scatter_(IdArray, NDArray, NDArray); + +}; // namespace impl +}; // namespace aten +}; // namespace dgl diff --git a/src/array/cuda/array_sort.cu b/src/array/cuda/array_sort.cu index 390483e4a85b..5f697abcf76a 100644 --- a/src/array/cuda/array_sort.cu +++ b/src/array/cuda/array_sort.cu @@ -5,7 +5,7 @@ */ #include -#include +#include #include "../../runtime/cuda/cuda_common.h" #include "./utils.h" @@ -29,20 +29,20 @@ std::pair Sort(IdArray array, int num_bits) { IdType* keys_out = sorted_array.Ptr(); int64_t* values_out = sorted_idx.Ptr(); - cudaStream_t stream = runtime::getCurrentCUDAStream(); + hipStream_t stream = runtime::getCurrentCUDAStream(); if (num_bits == 0) { num_bits = sizeof(IdType) * 8; } // Allocate workspace size_t workspace_size = 0; - CUDA_CALL(cub::DeviceRadixSort::SortPairs( + CUDA_CALL(hipcub::DeviceRadixSort::SortPairs( nullptr, workspace_size, keys_in, keys_out, values_in, values_out, nitems, 0, num_bits, stream)); void* workspace = device->AllocWorkspace(ctx, workspace_size); // Compute - CUDA_CALL(cub::DeviceRadixSort::SortPairs( + CUDA_CALL(hipcub::DeviceRadixSort::SortPairs( workspace, workspace_size, keys_in, keys_out, values_in, values_out, nitems, 0, num_bits, stream)); diff --git a/src/array/cuda/array_sort.cu.prehip b/src/array/cuda/array_sort.cu.prehip new file mode 100644 index 000000000000..390483e4a85b --- /dev/null +++ b/src/array/cuda/array_sort.cu.prehip @@ -0,0 +1,61 @@ +/** + * Copyright (c) 2020 by Contributors + * @file array/cpu/array_sort.cu + * @brief Array sort GPU implementation + */ +#include + +#include + +#include "../../runtime/cuda/cuda_common.h" +#include "./utils.h" + +namespace dgl { +using runtime::NDArray; +namespace aten { +namespace impl { + +template +std::pair Sort(IdArray array, int num_bits) { + const auto& ctx = array->ctx; + auto device = runtime::DeviceAPI::Get(ctx); + const int64_t nitems = array->shape[0]; + IdArray orig_idx = Range(0, nitems, 64, ctx); + IdArray sorted_array = NewIdArray(nitems, ctx, array->dtype.bits); 
+ IdArray sorted_idx = NewIdArray(nitems, ctx, 64); + + const IdType* keys_in = array.Ptr(); + const int64_t* values_in = orig_idx.Ptr(); + IdType* keys_out = sorted_array.Ptr(); + int64_t* values_out = sorted_idx.Ptr(); + + cudaStream_t stream = runtime::getCurrentCUDAStream(); + if (num_bits == 0) { + num_bits = sizeof(IdType) * 8; + } + + // Allocate workspace + size_t workspace_size = 0; + CUDA_CALL(cub::DeviceRadixSort::SortPairs( + nullptr, workspace_size, keys_in, keys_out, values_in, values_out, nitems, + 0, num_bits, stream)); + void* workspace = device->AllocWorkspace(ctx, workspace_size); + + // Compute + CUDA_CALL(cub::DeviceRadixSort::SortPairs( + workspace, workspace_size, keys_in, keys_out, values_in, values_out, + nitems, 0, num_bits, stream)); + + device->FreeWorkspace(ctx, workspace); + + return std::make_pair(sorted_array, sorted_idx); +} + +template std::pair Sort( + IdArray, int num_bits); +template std::pair Sort( + IdArray, int num_bits); + +} // namespace impl +} // namespace aten +} // namespace dgl diff --git a/src/array/cuda/atomic.cuh b/src/array/cuda/atomic.cuh index 0b88594a1dbb..85391945ae9f 100644 --- a/src/array/cuda/atomic.cuh +++ b/src/array/cuda/atomic.cuh @@ -6,7 +6,7 @@ #ifndef DGL_ARRAY_CUDA_ATOMIC_CUH_ #define DGL_ARRAY_CUDA_ATOMIC_CUH_ -#include +#include #include #include @@ -16,7 +16,7 @@ #include "fp16.cuh" #if __CUDA_ARCH__ >= 600 -#include +#include #endif namespace dgl { @@ -67,28 +67,28 @@ struct Cast { #if BF16_ENABLED template <> -struct Cast<__nv_bfloat16> { - typedef Code::Type Type; - static __device__ __forceinline__ Type Encode(__nv_bfloat16 val) { +struct Cast<__hip_bfloat16> { + typedef Code::Type Type; + static __device__ __forceinline__ Type Encode(__hip_bfloat16 val) { #if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 800 return __bfloat16_as_ushort(val); #else printf( "Atomic operations are not supported for bfloat16 (BF16) " "on GPUs with compute capability less than 8.0.\n"); - __trap(); + abort(); return static_cast(0); #endif } - static __device__ __forceinline__ __nv_bfloat16 Decode(Type code) { + static __device__ __forceinline__ __hip_bfloat16 Decode(Type code) { #if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 800 return __ushort_as_bfloat16(code); #else printf( "Atomic operations are not supported for bfloat16 (BF16) " "on GPUs with compute capability less than 8.0.\n"); - __trap(); - return static_cast<__nv_bfloat16>(0.0f); + abort(); + return static_cast<__hip_bfloat16>(0.0f); #endif } }; @@ -130,7 +130,7 @@ static __device__ __forceinline__ unsigned short int atomicCASshort( // NOLINT printf( "Atomic operations are not supported for half precision (FP16) " "on this GPU.\n"); - __trap(); + abort(); return val; #endif // (defined(__CUDA_ARCH__) && (__CUDA_ARCH__) >= 700) } @@ -172,7 +172,7 @@ static __device__ __forceinline__ unsigned short int atomicCASshort( // NOLINT DEFINE_ATOMIC(Max) DEFINE_ATOMIC_16BIT(Max, half) #if BF16_ENABLED -DEFINE_ATOMIC_16BIT(Max, __nv_bfloat16) +DEFINE_ATOMIC_16BIT(Max, __hip_bfloat16) #endif // BF16_ENABLED #undef OP @@ -180,7 +180,7 @@ DEFINE_ATOMIC_16BIT(Max, __nv_bfloat16) DEFINE_ATOMIC(Min) DEFINE_ATOMIC_16BIT(Min, half) #if BF16_ENABLED -DEFINE_ATOMIC_16BIT(Min, __nv_bfloat16) +DEFINE_ATOMIC_16BIT(Min, __hip_bfloat16) #endif // BF16_ENABLED #undef OP @@ -304,7 +304,7 @@ __device__ __forceinline__ half AtomicAdd(half* addr, half val) { printf( "Atomic operations are not supported for half precision (FP16) " "on this GPU.\n"); - __trap(); + abort(); return val; #endif // __CUDA_ARCH__ >= 
700 } @@ -312,8 +312,8 @@ __device__ __forceinline__ half AtomicAdd(half* addr, half val) { #if BF16_ENABLED template <> -__device__ __forceinline__ __nv_bfloat16 -AtomicAdd<__nv_bfloat16>(__nv_bfloat16* addr, __nv_bfloat16 val) { +__device__ __forceinline__ __hip_bfloat16 +AtomicAdd<__hip_bfloat16>(__hip_bfloat16* addr, __hip_bfloat16 val) { // make sure we have bfloat16 support #if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 800 return atomicAdd(addr, val); @@ -323,7 +323,7 @@ AtomicAdd<__nv_bfloat16>(__nv_bfloat16* addr, __nv_bfloat16 val) { printf( "Atomic operations are not supported for bfloat16 (BF16) " "on GPUs with compute capability less than 8.0.\n"); - __trap(); + abort(); return val; #endif // defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 800 } diff --git a/src/array/cuda/atomic.cuh.prehip b/src/array/cuda/atomic.cuh.prehip new file mode 100644 index 000000000000..0b88594a1dbb --- /dev/null +++ b/src/array/cuda/atomic.cuh.prehip @@ -0,0 +1,336 @@ +/** + * Copyright (c) 2019 by Contributors + * @file array/cuda/atomic.cuh + * @brief Atomic functions + */ +#ifndef DGL_ARRAY_CUDA_ATOMIC_CUH_ +#define DGL_ARRAY_CUDA_ATOMIC_CUH_ + +#include + +#include +#include +#include + +#include "bf16.cuh" +#include "fp16.cuh" + +#if __CUDA_ARCH__ >= 600 +#include +#endif + +namespace dgl { +namespace aten { +namespace cuda { + +// Type trait for selecting code type +template +struct Code {}; + +template <> +struct Code<2> { + typedef unsigned short int Type; // NOLINT +}; + +template <> +struct Code<4> { + typedef unsigned int Type; // NOLINT +}; + +template <> +struct Code<8> { + typedef unsigned long long int Type; // NOLINT +}; + +// Helper class for converting to/from atomicCAS compatible types. +template +struct Cast { + typedef typename Code::Type Type; + static __device__ __forceinline__ Type Encode(T val) { + return static_cast(val); + } + static __device__ __forceinline__ T Decode(Type code) { + return static_cast(code); + } +}; + +template <> +struct Cast { + typedef Code::Type Type; + static __device__ __forceinline__ Type Encode(half val) { + return __half_as_ushort(val); + } + static __device__ __forceinline__ half Decode(Type code) { + return __ushort_as_half(code); + } +}; + +#if BF16_ENABLED +template <> +struct Cast<__nv_bfloat16> { + typedef Code::Type Type; + static __device__ __forceinline__ Type Encode(__nv_bfloat16 val) { +#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 800 + return __bfloat16_as_ushort(val); +#else + printf( + "Atomic operations are not supported for bfloat16 (BF16) " + "on GPUs with compute capability less than 8.0.\n"); + __trap(); + return static_cast(0); +#endif + } + static __device__ __forceinline__ __nv_bfloat16 Decode(Type code) { +#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 800 + return __ushort_as_bfloat16(code); +#else + printf( + "Atomic operations are not supported for bfloat16 (BF16) " + "on GPUs with compute capability less than 8.0.\n"); + __trap(); + return static_cast<__nv_bfloat16>(0.0f); +#endif + } +}; +#endif // BF16_ENABLED + +template <> +struct Cast { + typedef Code::Type Type; + static __device__ __forceinline__ Type Encode(float val) { + return __float_as_uint(val); + } + static __device__ __forceinline__ float Decode(Type code) { + return __uint_as_float(code); + } +}; + +template <> +struct Cast { + typedef Code::Type Type; + static __device__ __forceinline__ Type Encode(double val) { + return __double_as_longlong(val); + } + static __device__ __forceinline__ double Decode(Type code) { + return __longlong_as_double(code); + } 
+}; + +static __device__ __forceinline__ unsigned short int atomicCASshort( // NOLINT + unsigned short int* address, // NOLINT + unsigned short int compare, // NOLINT + unsigned short int val) { // NOLINT + static_assert(CUDART_VERSION >= 10000, "Requires at least CUDA 10"); +#if (defined(__CUDA_ARCH__) && (__CUDA_ARCH__) >= 700) + return atomicCAS(address, compare, val); +#else + (void)address; + (void)compare; + (void)val; + printf( + "Atomic operations are not supported for half precision (FP16) " + "on this GPU.\n"); + __trap(); + return val; +#endif // (defined(__CUDA_ARCH__) && (__CUDA_ARCH__) >= 700) +} + +#define DEFINE_ATOMIC(NAME) \ + template \ + __device__ __forceinline__ T Atomic##NAME(T* addr, T val) { \ + typedef typename Cast::Type CT; \ + CT* addr_as_ui = reinterpret_cast(addr); \ + CT old = *addr_as_ui; \ + CT assumed = old; \ + do { \ + assumed = old; \ + old = atomicCAS( \ + addr_as_ui, assumed, \ + Cast::Encode(OP(val, Cast::Decode(old)))); \ + } while (assumed != old); \ + return Cast::Decode(old); \ + } + +#define DEFINE_ATOMIC_16BIT(NAME, dtype) \ + template <> \ + __device__ __forceinline__ dtype Atomic##NAME( \ + dtype * addr, dtype val) { \ + typedef uint16_t CT; \ + CT* addr_as_ui = reinterpret_cast(addr); \ + CT old = *addr_as_ui; \ + CT assumed = old; \ + do { \ + assumed = old; \ + old = atomicCASshort( \ + addr_as_ui, assumed, \ + Cast::Encode(OP(val, Cast::Decode(old)))); \ + } while (assumed != old); \ + return Cast::Decode(old); \ + } + +#define OP(a, b) max(a, b) +DEFINE_ATOMIC(Max) +DEFINE_ATOMIC_16BIT(Max, half) +#if BF16_ENABLED +DEFINE_ATOMIC_16BIT(Max, __nv_bfloat16) +#endif // BF16_ENABLED +#undef OP + +#define OP(a, b) min(a, b) +DEFINE_ATOMIC(Min) +DEFINE_ATOMIC_16BIT(Min, half) +#if BF16_ENABLED +DEFINE_ATOMIC_16BIT(Min, __nv_bfloat16) +#endif // BF16_ENABLED +#undef OP + +#define OP(a, b) a + b +DEFINE_ATOMIC(Add) +#undef OP + +/** + * @brief Performs an atomic compare-and-swap on 64 bit integers. That is, + * it the word `old` at the memory location `address`, computes + * `(old == compare ? val : old)` , and stores the result back to memory at + * the same address. + * + * @param address The address to perform the atomic operation on. + * @param compare The value to compare to. + * @param val The new value to conditionally store. + * + * @return The old value at the address. + */ +inline __device__ int64_t +AtomicCAS(int64_t* const address, const int64_t compare, const int64_t val) { + // match the type of "::atomicCAS", so ignore lint warning + using Type = unsigned long long int; // NOLINT + + static_assert(sizeof(Type) == sizeof(*address), "Type width must match"); + + return atomicCAS( + reinterpret_cast(address), static_cast(compare), + static_cast(val)); +} + +/** + * @brief Performs an atomic compare-and-swap on 32 bit integers. That is, + * it the word `old` at the memory location `address`, computes + * `(old == compare ? val : old)` , and stores the result back to memory at + * the same address. + * + * @param address The address to perform the atomic operation on. + * @param compare The value to compare to. + * @param val The new value to conditionally store. + * + * @return The old value at the address. 
+ */ +inline __device__ int32_t +AtomicCAS(int32_t* const address, const int32_t compare, const int32_t val) { + // match the type of "::atomicCAS", so ignore lint warning + using Type = int; // NOLINT + + static_assert(sizeof(Type) == sizeof(*address), "Type width must match"); + + return atomicCAS( + reinterpret_cast(address), static_cast(compare), + static_cast(val)); +} + +inline __device__ int64_t AtomicMax(int64_t* const address, const int64_t val) { + // match the type of "::atomicCAS", so ignore lint warning + using Type = unsigned long long int; // NOLINT + + static_assert(sizeof(Type) == sizeof(*address), "Type width must match"); + + return atomicMax(reinterpret_cast(address), static_cast(val)); +} + +inline __device__ int32_t AtomicMax(int32_t* const address, const int32_t val) { + // match the type of "::atomicCAS", so ignore lint warning + using Type = int; // NOLINT + + static_assert(sizeof(Type) == sizeof(*address), "Type width must match"); + + return atomicMax(reinterpret_cast(address), static_cast(val)); +} + +template <> +__device__ __forceinline__ float AtomicAdd(float* addr, float val) { +#if __CUDA_ARCH__ >= 200 + return atomicAdd(addr, val); +#else + typedef float T; + typedef typename Cast::Type CT; + CT* addr_as_ui = reinterpret_cast(addr); + CT old = *addr_as_ui; + CT assumed = old; + do { + assumed = old; + old = atomicCAS( + addr_as_ui, assumed, Cast::Encode(Cast::Decode(old) + val)); + } while (assumed != old); + return Cast::Decode(old); +#endif // __CUDA_ARCH__ +} + +template <> +__device__ __forceinline__ double AtomicAdd(double* addr, double val) { +#if __CUDA_ARCH__ >= 600 + return atomicAdd(addr, val); +#else + typedef double T; + typedef typename Cast::Type CT; + CT* addr_as_ui = reinterpret_cast(addr); + CT old = *addr_as_ui; + CT assumed = old; + do { + assumed = old; + old = atomicCAS( + addr_as_ui, assumed, Cast::Encode(Cast::Decode(old) + val)); + } while (assumed != old); + return Cast::Decode(old); +#endif +} + +#if defined(CUDART_VERSION) && CUDART_VERSION >= 10000 +template <> +__device__ __forceinline__ half AtomicAdd(half* addr, half val) { +// make sure we have half support +#if __CUDA_ARCH__ >= 700 + return atomicAdd(addr, val); +#else + (void)addr; + (void)val; + printf( + "Atomic operations are not supported for half precision (FP16) " + "on this GPU.\n"); + __trap(); + return val; +#endif // __CUDA_ARCH__ >= 700 +} +#endif // defined(CUDART_VERSION) && CUDART_VERSION >= 10000 + +#if BF16_ENABLED +template <> +__device__ __forceinline__ __nv_bfloat16 +AtomicAdd<__nv_bfloat16>(__nv_bfloat16* addr, __nv_bfloat16 val) { +// make sure we have bfloat16 support +#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 800 + return atomicAdd(addr, val); +#else + (void)addr; + (void)val; + printf( + "Atomic operations are not supported for bfloat16 (BF16) " + "on GPUs with compute capability less than 8.0.\n"); + __trap(); + return val; +#endif // defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 800 +} +#endif // BF16_ENABLED + +} // namespace cuda +} // namespace aten +} // namespace dgl + +#endif // DGL_ARRAY_CUDA_ATOMIC_CUH_ diff --git a/src/array/cuda/bf16.cuh b/src/array/cuda/bf16.cuh index 82fd0f332297..425df014a87c 100644 --- a/src/array/cuda/bf16.cuh +++ b/src/array/cuda/bf16.cuh @@ -20,129 +20,129 @@ #define DGL_ARRAY_CUDA_BF16_CUH_ #if BF16_ENABLED -#include +#include #include -static __device__ __forceinline__ __nv_bfloat16 -max(__nv_bfloat16 a, __nv_bfloat16 b) { +static __device__ __forceinline__ __hip_bfloat16 +max(__hip_bfloat16 a, __hip_bfloat16 
b) { #if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 800 return __hmax(a, b); #else - return __nv_bfloat16(max(float(a), float(b))); // NOLINT + return __hip_bfloat16(max(float(a), float(b))); // NOLINT #endif } -static __device__ __forceinline__ __nv_bfloat16 -min(__nv_bfloat16 a, __nv_bfloat16 b) { +static __device__ __forceinline__ __hip_bfloat16 +min(__hip_bfloat16 a, __hip_bfloat16 b) { #if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 800 return __hmin(a, b); #else - return __nv_bfloat16(min(float(a), float(b))); // NOLINT + return __hip_bfloat16(min(float(a), float(b))); // NOLINT #endif } -#ifdef __CUDACC__ +#ifdef __HIPCC__ // Arithmetic BF16 operations for architecture >= 8.0 are already defined in // cuda_bf16.h #if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ < 800) // CUDA 12.2 adds "emulated" support for older architectures. #if defined(CUDART_VERSION) && (CUDART_VERSION < 12020) -__device__ __forceinline__ __nv_bfloat16 -operator+(const __nv_bfloat16& lh, const __nv_bfloat16& rh) { - return __nv_bfloat16(float(lh) + float(rh)); // NOLINT +__device__ __forceinline__ __hip_bfloat16 +operator+(const __hip_bfloat16& lh, const __hip_bfloat16& rh) { + return __hip_bfloat16(float(lh) + float(rh)); // NOLINT } -__device__ __forceinline__ __nv_bfloat16 -operator-(const __nv_bfloat16& lh, const __nv_bfloat16& rh) { - return __nv_bfloat16(float(lh) - float(rh)); // NOLINT +__device__ __forceinline__ __hip_bfloat16 +operator-(const __hip_bfloat16& lh, const __hip_bfloat16& rh) { + return __hip_bfloat16(float(lh) - float(rh)); // NOLINT } -__device__ __forceinline__ __nv_bfloat16 -operator*(const __nv_bfloat16& lh, const __nv_bfloat16& rh) { - return __nv_bfloat16(float(lh) * float(rh)); // NOLINT +__device__ __forceinline__ __hip_bfloat16 +operator*(const __hip_bfloat16& lh, const __hip_bfloat16& rh) { + return __hip_bfloat16(float(lh) * float(rh)); // NOLINT } -__device__ __forceinline__ __nv_bfloat16 -operator/(const __nv_bfloat16& lh, const __nv_bfloat16& rh) { - return __nv_bfloat16(float(lh) / float(rh)); // NOLINT +__device__ __forceinline__ __hip_bfloat16 +operator/(const __hip_bfloat16& lh, const __hip_bfloat16& rh) { + return __hip_bfloat16(float(lh) / float(rh)); // NOLINT } -__device__ __forceinline__ __nv_bfloat16& operator+=( - __nv_bfloat16& lh, const __nv_bfloat16& rh) { // NOLINT - lh = __nv_bfloat16(float(lh) + float(rh)); // NOLINT +__device__ __forceinline__ __hip_bfloat16& operator+=( + __hip_bfloat16& lh, const __hip_bfloat16& rh) { // NOLINT + lh = __hip_bfloat16(float(lh) + float(rh)); // NOLINT return lh; } -__device__ __forceinline__ __nv_bfloat16& operator-=( - __nv_bfloat16& lh, const __nv_bfloat16& rh) { // NOLINT - lh = __nv_bfloat16(float(lh) - float(rh)); // NOLINT +__device__ __forceinline__ __hip_bfloat16& operator-=( + __hip_bfloat16& lh, const __hip_bfloat16& rh) { // NOLINT + lh = __hip_bfloat16(float(lh) - float(rh)); // NOLINT return lh; } -__device__ __forceinline__ __nv_bfloat16& operator*=( - __nv_bfloat16& lh, const __nv_bfloat16& rh) { // NOLINT - lh = __nv_bfloat16(float(lh) * float(rh)); // NOLINT +__device__ __forceinline__ __hip_bfloat16& operator*=( + __hip_bfloat16& lh, const __hip_bfloat16& rh) { // NOLINT + lh = __hip_bfloat16(float(lh) * float(rh)); // NOLINT return lh; } -__device__ __forceinline__ __nv_bfloat16& operator/=( - __nv_bfloat16& lh, const __nv_bfloat16& rh) { // NOLINT - lh = __nv_bfloat16(float(lh) / float(rh)); // NOLINT +__device__ __forceinline__ __hip_bfloat16& operator/=( + __hip_bfloat16& lh, const __hip_bfloat16& rh) { // 
NOLINT + lh = __hip_bfloat16(float(lh) / float(rh)); // NOLINT return lh; } -__device__ __forceinline__ __nv_bfloat16& operator++( - __nv_bfloat16& h) { // NOLINT - h = __nv_bfloat16(float(h) + 1.0f); // NOLINT +__device__ __forceinline__ __hip_bfloat16& operator++( + __hip_bfloat16& h) { // NOLINT + h = __hip_bfloat16(float(h) + 1.0f); // NOLINT return h; } -__device__ __forceinline__ __nv_bfloat16& operator--( - __nv_bfloat16& h) { // NOLINT - h = __nv_bfloat16(float(h) - 1.0f); // NOLINT +__device__ __forceinline__ __hip_bfloat16& operator--( + __hip_bfloat16& h) { // NOLINT + h = __hip_bfloat16(float(h) - 1.0f); // NOLINT return h; } -__device__ __forceinline__ __nv_bfloat16 -operator++(__nv_bfloat16& h, int) { // NOLINT - __nv_bfloat16 ret = h; - h = __nv_bfloat16(float(h) + 1.0f); // NOLINT +__device__ __forceinline__ __hip_bfloat16 +operator++(__hip_bfloat16& h, int) { // NOLINT + __hip_bfloat16 ret = h; + h = __hip_bfloat16(float(h) + 1.0f); // NOLINT return ret; } -__device__ __forceinline__ __nv_bfloat16 -operator--(__nv_bfloat16& h, int) { // NOLINT - __nv_bfloat16 ret = h; - h = __nv_bfloat16(float(h) - 1.0f); // NOLINT +__device__ __forceinline__ __hip_bfloat16 +operator--(__hip_bfloat16& h, int) { // NOLINT + __hip_bfloat16 ret = h; + h = __hip_bfloat16(float(h) - 1.0f); // NOLINT return ret; } -__device__ __forceinline__ __nv_bfloat16 operator+(const __nv_bfloat16& h) { +__device__ __forceinline__ __hip_bfloat16 operator+(const __hip_bfloat16& h) { return h; } -__device__ __forceinline__ __nv_bfloat16 operator-(const __nv_bfloat16& h) { - return __nv_bfloat16(-float(h)); // NOLINT +__device__ __forceinline__ __hip_bfloat16 operator-(const __hip_bfloat16& h) { + return __hip_bfloat16(-float(h)); // NOLINT } __device__ __forceinline__ bool operator==( - const __nv_bfloat16& lh, const __nv_bfloat16& rh) { + const __hip_bfloat16& lh, const __hip_bfloat16& rh) { return float(lh) == float(rh); // NOLINT } __device__ __forceinline__ bool operator!=( - const __nv_bfloat16& lh, const __nv_bfloat16& rh) { + const __hip_bfloat16& lh, const __hip_bfloat16& rh) { return float(lh) != float(rh); // NOLINT } __device__ __forceinline__ bool operator>( - const __nv_bfloat16& lh, const __nv_bfloat16& rh) { + const __hip_bfloat16& lh, const __hip_bfloat16& rh) { return float(lh) > float(rh); // NOLINT } __device__ __forceinline__ bool operator<( - const __nv_bfloat16& lh, const __nv_bfloat16& rh) { + const __hip_bfloat16& lh, const __hip_bfloat16& rh) { return float(lh) < float(rh); // NOLINT } __device__ __forceinline__ bool operator>=( - const __nv_bfloat16& lh, const __nv_bfloat16& rh) { + const __hip_bfloat16& lh, const __hip_bfloat16& rh) { return float(lh) >= float(rh); // NOLINT } __device__ __forceinline__ bool operator<=( - const __nv_bfloat16& lh, const __nv_bfloat16& rh) { + const __hip_bfloat16& lh, const __hip_bfloat16& rh) { return float(lh) <= float(rh); // NOLINT } #endif // defined(CUDART_VERSION) && (CUDART_VERSION < 12020) #endif // defined(__CUDA_ARCH__) && (__CUDA_ARCH__ < 800) -#endif // __CUDACC__ +#endif // __HIPCC__ #endif // BF16_ENABLED diff --git a/src/array/cuda/bf16.cuh.prehip b/src/array/cuda/bf16.cuh.prehip new file mode 100644 index 000000000000..82fd0f332297 --- /dev/null +++ b/src/array/cuda/bf16.cuh.prehip @@ -0,0 +1,149 @@ +/** + * Copyright (c) 2022 by Contributors + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + * + * @file array/cuda/bf16.cuh + * @brief bfloat16 related functions. + */ +#ifndef DGL_ARRAY_CUDA_BF16_CUH_ +#define DGL_ARRAY_CUDA_BF16_CUH_ + +#if BF16_ENABLED +#include + +#include + +static __device__ __forceinline__ __nv_bfloat16 +max(__nv_bfloat16 a, __nv_bfloat16 b) { +#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 800 + return __hmax(a, b); +#else + return __nv_bfloat16(max(float(a), float(b))); // NOLINT +#endif +} + +static __device__ __forceinline__ __nv_bfloat16 +min(__nv_bfloat16 a, __nv_bfloat16 b) { +#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 800 + return __hmin(a, b); +#else + return __nv_bfloat16(min(float(a), float(b))); // NOLINT +#endif +} + +#ifdef __CUDACC__ +// Arithmetic BF16 operations for architecture >= 8.0 are already defined in +// cuda_bf16.h +#if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ < 800) +// CUDA 12.2 adds "emulated" support for older architectures. +#if defined(CUDART_VERSION) && (CUDART_VERSION < 12020) +__device__ __forceinline__ __nv_bfloat16 +operator+(const __nv_bfloat16& lh, const __nv_bfloat16& rh) { + return __nv_bfloat16(float(lh) + float(rh)); // NOLINT +} +__device__ __forceinline__ __nv_bfloat16 +operator-(const __nv_bfloat16& lh, const __nv_bfloat16& rh) { + return __nv_bfloat16(float(lh) - float(rh)); // NOLINT +} +__device__ __forceinline__ __nv_bfloat16 +operator*(const __nv_bfloat16& lh, const __nv_bfloat16& rh) { + return __nv_bfloat16(float(lh) * float(rh)); // NOLINT +} +__device__ __forceinline__ __nv_bfloat16 +operator/(const __nv_bfloat16& lh, const __nv_bfloat16& rh) { + return __nv_bfloat16(float(lh) / float(rh)); // NOLINT +} + +__device__ __forceinline__ __nv_bfloat16& operator+=( + __nv_bfloat16& lh, const __nv_bfloat16& rh) { // NOLINT + lh = __nv_bfloat16(float(lh) + float(rh)); // NOLINT + return lh; +} +__device__ __forceinline__ __nv_bfloat16& operator-=( + __nv_bfloat16& lh, const __nv_bfloat16& rh) { // NOLINT + lh = __nv_bfloat16(float(lh) - float(rh)); // NOLINT + return lh; +} +__device__ __forceinline__ __nv_bfloat16& operator*=( + __nv_bfloat16& lh, const __nv_bfloat16& rh) { // NOLINT + lh = __nv_bfloat16(float(lh) * float(rh)); // NOLINT + return lh; +} +__device__ __forceinline__ __nv_bfloat16& operator/=( + __nv_bfloat16& lh, const __nv_bfloat16& rh) { // NOLINT + lh = __nv_bfloat16(float(lh) / float(rh)); // NOLINT + return lh; +} + +__device__ __forceinline__ __nv_bfloat16& operator++( + __nv_bfloat16& h) { // NOLINT + h = __nv_bfloat16(float(h) + 1.0f); // NOLINT + return h; +} +__device__ __forceinline__ __nv_bfloat16& operator--( + __nv_bfloat16& h) { // NOLINT + h = __nv_bfloat16(float(h) - 1.0f); // NOLINT + return h; +} +__device__ __forceinline__ __nv_bfloat16 +operator++(__nv_bfloat16& h, int) { // NOLINT + __nv_bfloat16 ret = h; + h = __nv_bfloat16(float(h) + 1.0f); // NOLINT + return ret; +} +__device__ __forceinline__ __nv_bfloat16 +operator--(__nv_bfloat16& h, int) { // NOLINT + __nv_bfloat16 ret = h; + h = __nv_bfloat16(float(h) - 1.0f); // NOLINT + return ret; +} + +__device__ __forceinline__ __nv_bfloat16 operator+(const __nv_bfloat16& h) { + return h; +} +__device__ 
__forceinline__ __nv_bfloat16 operator-(const __nv_bfloat16& h) { + return __nv_bfloat16(-float(h)); // NOLINT +} + +__device__ __forceinline__ bool operator==( + const __nv_bfloat16& lh, const __nv_bfloat16& rh) { + return float(lh) == float(rh); // NOLINT +} +__device__ __forceinline__ bool operator!=( + const __nv_bfloat16& lh, const __nv_bfloat16& rh) { + return float(lh) != float(rh); // NOLINT +} +__device__ __forceinline__ bool operator>( + const __nv_bfloat16& lh, const __nv_bfloat16& rh) { + return float(lh) > float(rh); // NOLINT +} +__device__ __forceinline__ bool operator<( + const __nv_bfloat16& lh, const __nv_bfloat16& rh) { + return float(lh) < float(rh); // NOLINT +} +__device__ __forceinline__ bool operator>=( + const __nv_bfloat16& lh, const __nv_bfloat16& rh) { + return float(lh) >= float(rh); // NOLINT +} +__device__ __forceinline__ bool operator<=( + const __nv_bfloat16& lh, const __nv_bfloat16& rh) { + return float(lh) <= float(rh); // NOLINT +} +#endif // defined(CUDART_VERSION) && (CUDART_VERSION < 12020) +#endif // defined(__CUDA_ARCH__) && (__CUDA_ARCH__ < 800) +#endif // __CUDACC__ + +#endif // BF16_ENABLED + +#endif // DGL_ARRAY_CUDA_BF16_CUH_ diff --git a/src/array/cuda/coo2csr.cu b/src/array/cuda/coo2csr.cu index 237a35a26b3e..b55b0a3812dd 100644 --- a/src/array/cuda/coo2csr.cu +++ b/src/array/cuda/coo2csr.cu @@ -1,3 +1,4 @@ +#include "hip/hip_runtime.h" /** * Copyright (c) 2020 by Contributors * @file array/cuda/coo2csr.cc @@ -24,12 +25,12 @@ CSRMatrix COOToCSR(COOMatrix coo) { template <> CSRMatrix COOToCSR(COOMatrix coo) { auto* thr_entry = runtime::CUDAThreadEntry::ThreadLocal(); - cudaStream_t stream = runtime::getCurrentCUDAStream(); + hipStream_t stream = runtime::getCurrentCUDAStream(); // allocate cusparse handle if needed if (!thr_entry->cusparse_handle) { - CUSPARSE_CALL(cusparseCreate(&(thr_entry->cusparse_handle))); + CUSPARSE_CALL(hipsparseCreate(&(thr_entry->cusparse_handle))); } - CUSPARSE_CALL(cusparseSetStream(thr_entry->cusparse_handle, stream)); + CUSPARSE_CALL(hipsparseSetStream(thr_entry->cusparse_handle, stream)); bool row_sorted = coo.row_sorted; bool col_sorted = coo.col_sorted; @@ -51,9 +52,9 @@ CSRMatrix COOToCSR(COOMatrix coo) { NDArray indptr = aten::NewIdArray(coo.num_rows + 1, coo.row->ctx, coo.row->dtype.bits); int32_t* indptr_ptr = static_cast(indptr->data); - CUSPARSE_CALL(cusparseXcoo2csr( + CUSPARSE_CALL(hipsparseXcoo2csr( thr_entry->cusparse_handle, coo.row.Ptr(), nnz, coo.num_rows, - indptr_ptr, CUSPARSE_INDEX_BASE_ZERO)); + indptr_ptr, HIPSPARSE_INDEX_BASE_ZERO)); return CSRMatrix( coo.num_rows, coo.num_cols, indptr, coo.col, coo.data, col_sorted); @@ -101,7 +102,7 @@ template <> CSRMatrix COOToCSR(COOMatrix coo) { const auto& ctx = coo.row->ctx; const auto nbits = coo.row->dtype.bits; - cudaStream_t stream = runtime::getCurrentCUDAStream(); + hipStream_t stream = runtime::getCurrentCUDAStream(); bool row_sorted = coo.row_sorted; bool col_sorted = coo.col_sorted; if (!row_sorted) { diff --git a/src/array/cuda/coo2csr.cu.prehip b/src/array/cuda/coo2csr.cu.prehip new file mode 100644 index 000000000000..237a35a26b3e --- /dev/null +++ b/src/array/cuda/coo2csr.cu.prehip @@ -0,0 +1,137 @@ +/** + * Copyright (c) 2020 by Contributors + * @file array/cuda/coo2csr.cc + * @brief COO2CSR + */ +#include + +#include "../../runtime/cuda/cuda_common.h" +#include "./utils.h" + +namespace dgl { + +using runtime::NDArray; + +namespace aten { +namespace impl { + +template +CSRMatrix COOToCSR(COOMatrix coo) { + LOG(FATAL) << "Unreachable 
code."; + return {}; +} + +template <> +CSRMatrix COOToCSR(COOMatrix coo) { + auto* thr_entry = runtime::CUDAThreadEntry::ThreadLocal(); + cudaStream_t stream = runtime::getCurrentCUDAStream(); + // allocate cusparse handle if needed + if (!thr_entry->cusparse_handle) { + CUSPARSE_CALL(cusparseCreate(&(thr_entry->cusparse_handle))); + } + CUSPARSE_CALL(cusparseSetStream(thr_entry->cusparse_handle, stream)); + + bool row_sorted = coo.row_sorted; + bool col_sorted = coo.col_sorted; + if (!row_sorted) { + // we only need to sort the rows to perform conversion + coo = COOSort(coo, false); + col_sorted = coo.col_sorted; + } + + const int64_t nnz = coo.row->shape[0]; + CHECK_NO_OVERFLOW(coo.row->dtype, nnz); + // TODO(minjie): Many of our current implementation assumes that CSR must have + // a data array. This is a temporary workaround. Remove this after: + // - The old immutable graph implementation is deprecated. + // - The old binary reduce kernel is deprecated. + if (!COOHasData(coo)) + coo.data = aten::Range(0, nnz, coo.row->dtype.bits, coo.row->ctx); + + NDArray indptr = + aten::NewIdArray(coo.num_rows + 1, coo.row->ctx, coo.row->dtype.bits); + int32_t* indptr_ptr = static_cast(indptr->data); + CUSPARSE_CALL(cusparseXcoo2csr( + thr_entry->cusparse_handle, coo.row.Ptr(), nnz, coo.num_rows, + indptr_ptr, CUSPARSE_INDEX_BASE_ZERO)); + + return CSRMatrix( + coo.num_rows, coo.num_cols, indptr, coo.col, coo.data, col_sorted); +} + +/** + * @brief Search for the insertion positions for needle in the hay. + * + * The hay is a list of sorted elements and the result is the insertion position + * of each needle so that the insertion still gives sorted order. + * + * It essentially perform binary search to find upper bound for each needle + * elements. + * + * For example: + * hay = [0, 0, 1, 2, 2] + * needle = [0, 1, 2, 3] + * then, + * out = [2, 3, 5, 5] + */ +template +__global__ void _SortedSearchKernelUpperBound( + const IdType* hay, int64_t hay_size, const IdType* needles, + int64_t num_needles, IdType* pos) { + int tx = blockIdx.x * blockDim.x + threadIdx.x; + const int stride_x = gridDim.x * blockDim.x; + while (tx < num_needles) { + const IdType ele = needles[tx]; + // binary search + IdType lo = 0, hi = hay_size; + while (lo < hi) { + IdType mid = (lo + hi) >> 1; + if (hay[mid] <= ele) { + lo = mid + 1; + } else { + hi = mid; + } + } + pos[tx] = lo; + tx += stride_x; + } +} + +template <> +CSRMatrix COOToCSR(COOMatrix coo) { + const auto& ctx = coo.row->ctx; + const auto nbits = coo.row->dtype.bits; + cudaStream_t stream = runtime::getCurrentCUDAStream(); + bool row_sorted = coo.row_sorted; + bool col_sorted = coo.col_sorted; + if (!row_sorted) { + coo = COOSort(coo, false); + col_sorted = coo.col_sorted; + } + + const int64_t nnz = coo.row->shape[0]; + // TODO(minjie): Many of our current implementation assumes that CSR must have + // a data array. This is a temporary workaround. Remove this after: + // - The old immutable graph implementation is deprecated. + // - The old binary reduce kernel is deprecated. 
+ if (!COOHasData(coo)) + coo.data = aten::Range(0, nnz, coo.row->dtype.bits, coo.row->ctx); + + IdArray rowids = Range(0, coo.num_rows, nbits, ctx); + const int nt = cuda::FindNumThreads(coo.num_rows); + const int nb = (coo.num_rows + nt - 1) / nt; + IdArray indptr = Full(0, coo.num_rows + 1, nbits, ctx); + CUDA_KERNEL_CALL( + _SortedSearchKernelUpperBound, nb, nt, 0, stream, coo.row.Ptr(), + nnz, rowids.Ptr(), coo.num_rows, indptr.Ptr() + 1); + + return CSRMatrix( + coo.num_rows, coo.num_cols, indptr, coo.col, coo.data, col_sorted); +} + +template CSRMatrix COOToCSR(COOMatrix coo); +template CSRMatrix COOToCSR(COOMatrix coo); + +} // namespace impl +} // namespace aten +} // namespace dgl diff --git a/src/array/cuda/coo_sort.cu b/src/array/cuda/coo_sort.cu index fc0bc67925de..585eeb8f644c 100644 --- a/src/array/cuda/coo_sort.cu +++ b/src/array/cuda/coo_sort.cu @@ -1,3 +1,4 @@ +#include "hip/hip_runtime.h" /** * Copyright (c) 2020 by Contributors * @file array/cuda/coo_sort.cc @@ -65,7 +66,7 @@ __global__ void _COODecodeEdgesKernel( template void COOSort_(COOMatrix* coo, bool sort_column) { - cudaStream_t stream = runtime::getCurrentCUDAStream(); + hipStream_t stream = runtime::getCurrentCUDAStream(); const int row_bits = cuda::_NumberOfBits(coo->num_rows); const int64_t nnz = coo->row->shape[0]; @@ -138,7 +139,7 @@ template std::pair COOIsSorted(COOMatrix coo) { const int64_t nnz = coo.row->shape[0]; const auto& ctx = coo.row->ctx; - cudaStream_t stream = runtime::getCurrentCUDAStream(); + hipStream_t stream = runtime::getCurrentCUDAStream(); auto device = runtime::DeviceAPI::Get(ctx); // We allocate a workspace of 2*nnz bytes. It wastes a little bit memory but // should be fine. diff --git a/src/array/cuda/coo_sort.cu.prehip b/src/array/cuda/coo_sort.cu.prehip new file mode 100644 index 000000000000..fc0bc67925de --- /dev/null +++ b/src/array/cuda/coo_sort.cu.prehip @@ -0,0 +1,168 @@ +/** + * Copyright (c) 2020 by Contributors + * @file array/cuda/coo_sort.cc + * @brief Sort COO index + */ +#include + +#include "../../c_api_common.h" +#include "../../runtime/cuda/cuda_common.h" +#include "./utils.h" + +namespace dgl { + +using runtime::NDArray; + +namespace aten { +namespace impl { + +///////////////////////////// COOSort_ ///////////////////////////// + +/** + * @brief Encode row and column IDs into a single scalar per edge. + * + * @tparam IdType The type to encode as. + * @param row The row (src) IDs per edge. + * @param col The column (dst) IDs per edge. + * @param nnz The number of edges. + * @param col_bits The number of bits used to encode the destination. The row + * information is packed into the remaining bits. + * @param key The encoded edges (output). + */ +template +__global__ void _COOEncodeEdgesKernel( + const IdType* const row, const IdType* const col, const int64_t nnz, + const int col_bits, IdType* const key) { + int64_t tx = static_cast(blockIdx.x) * blockDim.x + threadIdx.x; + + if (tx < nnz) { + key[tx] = row[tx] << col_bits | col[tx]; + } +} + +/** + * @brief Decode row and column IDs from the encoded edges. + * + * @tparam IdType The type the edges are encoded as. + * @param key The encoded edges. + * @param nnz The number of edges. + * @param col_bits The number of bits used to store the column/dst ID. + * @param row The row (src) IDs per edge (output). + * @param col The col (dst) IDs per edge (output). 
+ */ +template +__global__ void _COODecodeEdgesKernel( + const IdType* const key, const int64_t nnz, const int col_bits, + IdType* const row, IdType* const col) { + int64_t tx = static_cast(blockIdx.x) * blockDim.x + threadIdx.x; + + if (tx < nnz) { + const IdType k = key[tx]; + row[tx] = k >> col_bits; + col[tx] = k & ((1 << col_bits) - 1); + } +} + +template +void COOSort_(COOMatrix* coo, bool sort_column) { + cudaStream_t stream = runtime::getCurrentCUDAStream(); + const int row_bits = cuda::_NumberOfBits(coo->num_rows); + + const int64_t nnz = coo->row->shape[0]; + if (sort_column) { + const int col_bits = cuda::_NumberOfBits(coo->num_cols); + const int num_bits = row_bits + col_bits; + + const int nt = 256; + const int nb = (nnz + nt - 1) / nt; + CHECK(static_cast(nb) * nt >= nnz); + + IdArray pos = aten::NewIdArray(nnz, coo->row->ctx, coo->row->dtype.bits); + + CUDA_KERNEL_CALL( + _COOEncodeEdgesKernel, nb, nt, 0, stream, coo->row.Ptr(), + coo->col.Ptr(), nnz, col_bits, pos.Ptr()); + + auto sorted = Sort(pos, num_bits); + + CUDA_KERNEL_CALL( + _COODecodeEdgesKernel, nb, nt, 0, stream, sorted.first.Ptr(), + nnz, col_bits, coo->row.Ptr(), coo->col.Ptr()); + + if (aten::COOHasData(*coo)) + coo->data = IndexSelect(coo->data, sorted.second); + else + coo->data = AsNumBits(sorted.second, coo->row->dtype.bits); + coo->row_sorted = coo->col_sorted = true; + } else { + const int num_bits = row_bits; + + auto sorted = Sort(coo->row, num_bits); + + coo->row = sorted.first; + coo->col = IndexSelect(coo->col, sorted.second); + + if (aten::COOHasData(*coo)) + coo->data = IndexSelect(coo->data, sorted.second); + else + coo->data = AsNumBits(sorted.second, coo->row->dtype.bits); + coo->row_sorted = true; + } +} + +template void COOSort_(COOMatrix* coo, bool sort_column); +template void COOSort_(COOMatrix* coo, bool sort_column); + +///////////////////////////// COOIsSorted ///////////////////////////// + +template +__global__ void _COOIsSortedKernel( + const IdType* row, const IdType* col, int64_t nnz, int8_t* row_sorted, + int8_t* col_sorted) { + int tx = blockIdx.x * blockDim.x + threadIdx.x; + const int stride_x = gridDim.x * blockDim.x; + while (tx < nnz) { + if (tx == 0) { + row_sorted[0] = 1; + col_sorted[0] = 1; + } else { + row_sorted[tx] = static_cast(row[tx - 1] <= row[tx]); + col_sorted[tx] = + static_cast(row[tx - 1] < row[tx] || col[tx - 1] <= col[tx]); + } + tx += stride_x; + } +} + +template +std::pair COOIsSorted(COOMatrix coo) { + const int64_t nnz = coo.row->shape[0]; + const auto& ctx = coo.row->ctx; + cudaStream_t stream = runtime::getCurrentCUDAStream(); + auto device = runtime::DeviceAPI::Get(ctx); + // We allocate a workspace of 2*nnz bytes. It wastes a little bit memory but + // should be fine. + int8_t* row_flags = static_cast(device->AllocWorkspace(ctx, nnz)); + int8_t* col_flags = static_cast(device->AllocWorkspace(ctx, nnz)); + const int nt = cuda::FindNumThreads(nnz); + const int nb = (nnz + nt - 1) / nt; + CUDA_KERNEL_CALL( + _COOIsSortedKernel, nb, nt, 0, stream, coo.row.Ptr(), + coo.col.Ptr(), nnz, row_flags, col_flags); + + const bool row_sorted = cuda::AllTrue(row_flags, nnz, ctx); + const bool col_sorted = + row_sorted ? 
cuda::AllTrue(col_flags, nnz, ctx) : false; + + device->FreeWorkspace(ctx, row_flags); + device->FreeWorkspace(ctx, col_flags); + + return {row_sorted, col_sorted}; +} + +template std::pair COOIsSorted(COOMatrix coo); +template std::pair COOIsSorted(COOMatrix coo); + +} // namespace impl +} // namespace aten +} // namespace dgl diff --git a/src/array/cuda/csr2coo.cu b/src/array/cuda/csr2coo.cu index e7eecdcda125..c116a599eac5 100644 --- a/src/array/cuda/csr2coo.cu +++ b/src/array/cuda/csr2coo.cu @@ -8,7 +8,7 @@ #include #include -#include +#include #include "../../runtime/cuda/cuda_common.h" #include "./utils.h" @@ -29,12 +29,12 @@ COOMatrix CSRToCOO(CSRMatrix csr) { template <> COOMatrix CSRToCOO(CSRMatrix csr) { auto* thr_entry = runtime::CUDAThreadEntry::ThreadLocal(); - cudaStream_t stream = runtime::getCurrentCUDAStream(); + hipStream_t stream = runtime::getCurrentCUDAStream(); // allocate cusparse handle if needed if (!thr_entry->cusparse_handle) { - CUSPARSE_CALL(cusparseCreate(&(thr_entry->cusparse_handle))); + CUSPARSE_CALL(hipsparseCreate(&(thr_entry->cusparse_handle))); } - CUSPARSE_CALL(cusparseSetStream(thr_entry->cusparse_handle, stream)); + CUSPARSE_CALL(hipsparseSetStream(thr_entry->cusparse_handle, stream)); NDArray indptr = csr.indptr, indices = csr.indices, data = csr.data; const int32_t* indptr_ptr = static_cast(indptr->data); @@ -42,9 +42,9 @@ COOMatrix CSRToCOO(CSRMatrix csr) { aten::NewIdArray(indices->shape[0], indptr->ctx, indptr->dtype.bits); int32_t* row_ptr = static_cast(row->data); - CUSPARSE_CALL(cusparseXcsr2coo( + CUSPARSE_CALL(hipsparseXcsr2coo( thr_entry->cusparse_handle, indptr_ptr, indices->shape[0], csr.num_rows, - row_ptr, CUSPARSE_INDEX_BASE_ZERO)); + row_ptr, HIPSPARSE_INDEX_BASE_ZERO)); return COOMatrix( csr.num_rows, csr.num_cols, row, indices, data, true, csr.sorted); @@ -75,7 +75,7 @@ struct AdjacentDifference { template <> COOMatrix CSRToCOO(CSRMatrix csr) { const auto& ctx = csr.indptr->ctx; - cudaStream_t stream = runtime::getCurrentCUDAStream(); + hipStream_t stream = runtime::getCurrentCUDAStream(); const int64_t nnz = csr.indices->shape[0]; const auto nbits = csr.indptr->dtype.bits; @@ -94,14 +94,14 @@ COOMatrix CSRToCOO(CSRMatrix csr) { constexpr int64_t max_copy_at_once = std::numeric_limits::max(); for (int64_t i = 0; i < csr.num_rows; i += max_copy_at_once) { std::size_t temp_storage_bytes = 0; - CUDA_CALL(cub::DeviceCopy::Batched( + CUDA_CALL(hipcub::DeviceCopy::Batched( nullptr, temp_storage_bytes, input_buffer + i, output_buffer + i, buffer_sizes + i, std::min(csr.num_rows - i, max_copy_at_once), stream)); auto temp = allocator.alloc_unique(temp_storage_bytes); - CUDA_CALL(cub::DeviceCopy::Batched( + CUDA_CALL(hipcub::DeviceCopy::Batched( temp.get(), temp_storage_bytes, input_buffer + i, output_buffer + i, buffer_sizes + i, std::min(csr.num_rows - i, max_copy_at_once), stream)); @@ -128,12 +128,12 @@ COOMatrix CSRToCOODataAsOrder(CSRMatrix csr) { auto* thr_entry = runtime::CUDAThreadEntry::ThreadLocal(); auto device = runtime::DeviceAPI::Get(coo.row->ctx); - cudaStream_t stream = runtime::getCurrentCUDAStream(); + hipStream_t stream = runtime::getCurrentCUDAStream(); // allocate cusparse handle if needed if (!thr_entry->cusparse_handle) { - CUSPARSE_CALL(cusparseCreate(&(thr_entry->cusparse_handle))); + CUSPARSE_CALL(hipsparseCreate(&(thr_entry->cusparse_handle))); } - CUSPARSE_CALL(cusparseSetStream(thr_entry->cusparse_handle, stream)); + CUSPARSE_CALL(hipsparseSetStream(thr_entry->cusparse_handle, stream)); NDArray row = coo.row, 
col = coo.col, data = coo.data; int32_t* row_ptr = static_cast(row->data); @@ -141,11 +141,11 @@ COOMatrix CSRToCOODataAsOrder(CSRMatrix csr) { int32_t* data_ptr = static_cast(data->data); size_t workspace_size = 0; - CUSPARSE_CALL(cusparseXcoosort_bufferSizeExt( + CUSPARSE_CALL(hipsparseXcoosort_bufferSizeExt( thr_entry->cusparse_handle, coo.num_rows, coo.num_cols, row->shape[0], data_ptr, row_ptr, &workspace_size)); void* workspace = device->AllocWorkspace(row->ctx, workspace_size); - CUSPARSE_CALL(cusparseXcoosortByRow( + CUSPARSE_CALL(hipsparseXcoosortByRow( thr_entry->cusparse_handle, coo.num_rows, coo.num_cols, row->shape[0], data_ptr, row_ptr, col_ptr, workspace)); device->FreeWorkspace(row->ctx, workspace); diff --git a/src/array/cuda/csr2coo.cu.prehip b/src/array/cuda/csr2coo.cu.prehip new file mode 100644 index 000000000000..e7eecdcda125 --- /dev/null +++ b/src/array/cuda/csr2coo.cu.prehip @@ -0,0 +1,183 @@ +/** + * Copyright (c) 2020 by Contributors + * @file array/cuda/csr2coo.cc + * @brief CSR2COO + */ +#include +#include +#include +#include + +#include + +#include "../../runtime/cuda/cuda_common.h" +#include "./utils.h" + +namespace dgl { + +using runtime::NDArray; + +namespace aten { +namespace impl { + +template +COOMatrix CSRToCOO(CSRMatrix csr) { + LOG(FATAL) << "Unreachable codes"; + return {}; +} + +template <> +COOMatrix CSRToCOO(CSRMatrix csr) { + auto* thr_entry = runtime::CUDAThreadEntry::ThreadLocal(); + cudaStream_t stream = runtime::getCurrentCUDAStream(); + // allocate cusparse handle if needed + if (!thr_entry->cusparse_handle) { + CUSPARSE_CALL(cusparseCreate(&(thr_entry->cusparse_handle))); + } + CUSPARSE_CALL(cusparseSetStream(thr_entry->cusparse_handle, stream)); + + NDArray indptr = csr.indptr, indices = csr.indices, data = csr.data; + const int32_t* indptr_ptr = static_cast(indptr->data); + NDArray row = + aten::NewIdArray(indices->shape[0], indptr->ctx, indptr->dtype.bits); + int32_t* row_ptr = static_cast(row->data); + + CUSPARSE_CALL(cusparseXcsr2coo( + thr_entry->cusparse_handle, indptr_ptr, indices->shape[0], csr.num_rows, + row_ptr, CUSPARSE_INDEX_BASE_ZERO)); + + return COOMatrix( + csr.num_rows, csr.num_cols, row, indices, data, true, csr.sorted); +} + +struct RepeatIndex { + template + __host__ __device__ auto operator()(IdType i) { + return thrust::make_constant_iterator(i); + } +}; + +template +struct OutputBufferIndexer { + const IdType* indptr; + IdType* buffer; + __host__ __device__ auto operator()(IdType i) { return buffer + indptr[i]; } +}; + +template +struct AdjacentDifference { + const IdType* indptr; + __host__ __device__ auto operator()(IdType i) { + return indptr[i + 1] - indptr[i]; + } +}; + +template <> +COOMatrix CSRToCOO(CSRMatrix csr) { + const auto& ctx = csr.indptr->ctx; + cudaStream_t stream = runtime::getCurrentCUDAStream(); + + const int64_t nnz = csr.indices->shape[0]; + const auto nbits = csr.indptr->dtype.bits; + IdArray ret_row = NewIdArray(nnz, ctx, nbits); + + runtime::CUDAWorkspaceAllocator allocator(csr.indptr->ctx); + thrust::counting_iterator iota(0); + + auto input_buffer = thrust::make_transform_iterator(iota, RepeatIndex{}); + auto output_buffer = thrust::make_transform_iterator( + iota, OutputBufferIndexer{ + csr.indptr.Ptr(), ret_row.Ptr()}); + auto buffer_sizes = thrust::make_transform_iterator( + iota, AdjacentDifference{csr.indptr.Ptr()}); + + constexpr int64_t max_copy_at_once = std::numeric_limits::max(); + for (int64_t i = 0; i < csr.num_rows; i += max_copy_at_once) { + std::size_t temp_storage_bytes = 
0; + CUDA_CALL(cub::DeviceCopy::Batched( + nullptr, temp_storage_bytes, input_buffer + i, output_buffer + i, + buffer_sizes + i, std::min(csr.num_rows - i, max_copy_at_once), + stream)); + + auto temp = allocator.alloc_unique(temp_storage_bytes); + + CUDA_CALL(cub::DeviceCopy::Batched( + temp.get(), temp_storage_bytes, input_buffer + i, output_buffer + i, + buffer_sizes + i, std::min(csr.num_rows - i, max_copy_at_once), + stream)); + } + + return COOMatrix( + csr.num_rows, csr.num_cols, ret_row, csr.indices, csr.data, true, + csr.sorted); +} + +template COOMatrix CSRToCOO(CSRMatrix csr); +template COOMatrix CSRToCOO(CSRMatrix csr); + +template +COOMatrix CSRToCOODataAsOrder(CSRMatrix csr) { + LOG(FATAL) << "Unreachable codes"; + return {}; +} + +template <> +COOMatrix CSRToCOODataAsOrder(CSRMatrix csr) { + COOMatrix coo = CSRToCOO(csr); + if (aten::IsNullArray(coo.data)) return coo; + + auto* thr_entry = runtime::CUDAThreadEntry::ThreadLocal(); + auto device = runtime::DeviceAPI::Get(coo.row->ctx); + cudaStream_t stream = runtime::getCurrentCUDAStream(); + // allocate cusparse handle if needed + if (!thr_entry->cusparse_handle) { + CUSPARSE_CALL(cusparseCreate(&(thr_entry->cusparse_handle))); + } + CUSPARSE_CALL(cusparseSetStream(thr_entry->cusparse_handle, stream)); + + NDArray row = coo.row, col = coo.col, data = coo.data; + int32_t* row_ptr = static_cast(row->data); + int32_t* col_ptr = static_cast(col->data); + int32_t* data_ptr = static_cast(data->data); + + size_t workspace_size = 0; + CUSPARSE_CALL(cusparseXcoosort_bufferSizeExt( + thr_entry->cusparse_handle, coo.num_rows, coo.num_cols, row->shape[0], + data_ptr, row_ptr, &workspace_size)); + void* workspace = device->AllocWorkspace(row->ctx, workspace_size); + CUSPARSE_CALL(cusparseXcoosortByRow( + thr_entry->cusparse_handle, coo.num_rows, coo.num_cols, row->shape[0], + data_ptr, row_ptr, col_ptr, workspace)); + device->FreeWorkspace(row->ctx, workspace); + + // The row and column field have already been reordered according + // to data, thus the data field will be deprecated. + coo.data = aten::NullArray(); + coo.row_sorted = false; + coo.col_sorted = false; + return coo; +} + +template <> +COOMatrix CSRToCOODataAsOrder(CSRMatrix csr) { + COOMatrix coo = CSRToCOO(csr); + if (aten::IsNullArray(coo.data)) return coo; + const auto& sorted = Sort(coo.data); + + coo.row = IndexSelect(coo.row, sorted.second); + coo.col = IndexSelect(coo.col, sorted.second); + + // The row and column field have already been reordered according + // to data, thus the data field will be deprecated. 
+ coo.data = aten::NullArray(); + coo.row_sorted = false; + coo.col_sorted = false; + return coo; +} + +template COOMatrix CSRToCOODataAsOrder(CSRMatrix csr); +template COOMatrix CSRToCOODataAsOrder(CSRMatrix csr); + +} // namespace impl +} // namespace aten +} // namespace dgl diff --git a/src/array/cuda/csr_get_data.cu b/src/array/cuda/csr_get_data.cu index 9574b2a5e849..ecb3e6796992 100644 --- a/src/array/cuda/csr_get_data.cu +++ b/src/array/cuda/csr_get_data.cu @@ -36,7 +36,7 @@ NDArray CSRGetData( IdArray rst = NDArray::Empty({rstlen}, weights->dtype, rows->ctx); if (rstlen == 0) return rst; - cudaStream_t stream = runtime::getCurrentCUDAStream(); + hipStream_t stream = runtime::getCurrentCUDAStream(); const int nt = cuda::FindNumThreads(rstlen); const int nb = (rstlen + nt - 1) / nt; if (return_eids) @@ -67,12 +67,12 @@ template NDArray CSRGetData( CSRMatrix csr, NDArray rows, NDArray cols, bool return_eids, NDArray weights, __half filler); #if BF16_ENABLED -template NDArray CSRGetData( +template NDArray CSRGetData( CSRMatrix csr, NDArray rows, NDArray cols, bool return_eids, - NDArray weights, __nv_bfloat16 filler); -template NDArray CSRGetData( + NDArray weights, __hip_bfloat16 filler); +template NDArray CSRGetData( CSRMatrix csr, NDArray rows, NDArray cols, bool return_eids, - NDArray weights, __nv_bfloat16 filler); + NDArray weights, __hip_bfloat16 filler); #endif // BF16_ENABLED template NDArray CSRGetData( CSRMatrix csr, NDArray rows, NDArray cols, bool return_eids, diff --git a/src/array/cuda/csr_get_data.cu.prehip b/src/array/cuda/csr_get_data.cu.prehip new file mode 100644 index 000000000000..9574b2a5e849 --- /dev/null +++ b/src/array/cuda/csr_get_data.cu.prehip @@ -0,0 +1,100 @@ +/** + * Copyright (c) 2021 by Contributors + * @file array/cuda/csr_get_data.cu + * @brief Retrieve entries of a CSR matrix + */ +#include + +#include +#include +#include + +#include "../../runtime/cuda/cuda_common.h" +#include "./utils.h" + +namespace dgl { + +using runtime::NDArray; + +namespace aten { +namespace impl { + +template +NDArray CSRGetData( + CSRMatrix csr, NDArray rows, NDArray cols, bool return_eids, + NDArray weights, DType filler) { + const int64_t rowlen = rows->shape[0]; + const int64_t collen = cols->shape[0]; + + CHECK((rowlen == collen) || (rowlen == 1) || (collen == 1)) + << "Invalid row and col id array."; + + const int64_t row_stride = (rowlen == 1 && collen != 1) ? 0 : 1; + const int64_t col_stride = (collen == 1 && rowlen != 1) ? 0 : 1; + + const int64_t rstlen = std::max(rowlen, collen); + IdArray rst = NDArray::Empty({rstlen}, weights->dtype, rows->ctx); + if (rstlen == 0) return rst; + + cudaStream_t stream = runtime::getCurrentCUDAStream(); + const int nt = cuda::FindNumThreads(rstlen); + const int nb = (rstlen + nt - 1) / nt; + if (return_eids) + BUG_IF_FAIL(DGLDataTypeTraits::dtype == rows->dtype) + << "DType does not match row's dtype."; + + const IdType* indptr_data = + static_cast(cuda::GetDevicePointer(csr.indptr)); + const IdType* indices_data = + static_cast(cuda::GetDevicePointer(csr.indices)); + const IdType* data_data = + CSRHasData(csr) ? static_cast(cuda::GetDevicePointer(csr.data)) + : nullptr; + + // TODO(minjie): use binary search for sorted csr + CUDA_KERNEL_CALL( + cuda::_LinearSearchKernel, nb, nt, 0, stream, indptr_data, indices_data, + data_data, rows.Ptr(), cols.Ptr(), row_stride, col_stride, + rstlen, return_eids ? 
nullptr : weights.Ptr(), filler, + rst.Ptr()); + return rst; +} + +template NDArray CSRGetData( + CSRMatrix csr, NDArray rows, NDArray cols, bool return_eids, + NDArray weights, __half filler); +template NDArray CSRGetData( + CSRMatrix csr, NDArray rows, NDArray cols, bool return_eids, + NDArray weights, __half filler); +#if BF16_ENABLED +template NDArray CSRGetData( + CSRMatrix csr, NDArray rows, NDArray cols, bool return_eids, + NDArray weights, __nv_bfloat16 filler); +template NDArray CSRGetData( + CSRMatrix csr, NDArray rows, NDArray cols, bool return_eids, + NDArray weights, __nv_bfloat16 filler); +#endif // BF16_ENABLED +template NDArray CSRGetData( + CSRMatrix csr, NDArray rows, NDArray cols, bool return_eids, + NDArray weights, float filler); +template NDArray CSRGetData( + CSRMatrix csr, NDArray rows, NDArray cols, bool return_eids, + NDArray weights, float filler); +template NDArray CSRGetData( + CSRMatrix csr, NDArray rows, NDArray cols, bool return_eids, + NDArray weights, double filler); +template NDArray CSRGetData( + CSRMatrix csr, NDArray rows, NDArray cols, bool return_eids, + NDArray weights, double filler); + +// For CSRGetData(CSRMatrix, NDArray, NDArray) +template NDArray CSRGetData( + CSRMatrix csr, NDArray rows, NDArray cols, bool return_eids, + NDArray weights, int32_t filler); +template NDArray CSRGetData( + CSRMatrix csr, NDArray rows, NDArray cols, bool return_eids, + NDArray weights, int64_t filler); + +} // namespace impl +} // namespace aten +} // namespace dgl diff --git a/src/array/cuda/csr_mm.cu b/src/array/cuda/csr_mm.cu index ad05c5f4155c..35b8a80d7c2e 100644 --- a/src/array/cuda/csr_mm.cu +++ b/src/array/cuda/csr_mm.cu @@ -31,74 +31,74 @@ std::pair CusparseSpgemm( const int nnzB = B.indices->shape[0]; const DType alpha = 1.0; const DType beta = 0.0; - auto transA = CUSPARSE_OPERATION_NON_TRANSPOSE; - auto transB = CUSPARSE_OPERATION_NON_TRANSPOSE; + auto transA = HIPSPARSE_OPERATION_NON_TRANSPOSE; + auto transB = HIPSPARSE_OPERATION_NON_TRANSPOSE; // device auto ctx = A.indptr->ctx; auto device = runtime::DeviceAPI::Get(ctx); auto* thr_entry = runtime::CUDAThreadEntry::ThreadLocal(); - cudaStream_t stream = runtime::getCurrentCUDAStream(); + hipStream_t stream = runtime::getCurrentCUDAStream(); const DType* A_weights = A_weights_array.Ptr(); const DType* B_weights = B_weights_array.Ptr(); // allocate cusparse handle if needed if (!thr_entry->cusparse_handle) { - CUSPARSE_CALL(cusparseCreate(&(thr_entry->cusparse_handle))); + CUSPARSE_CALL(hipsparseCreate(&(thr_entry->cusparse_handle))); } - CUSPARSE_CALL(cusparseSetStream(thr_entry->cusparse_handle, stream)); + CUSPARSE_CALL(hipsparseSetStream(thr_entry->cusparse_handle, stream)); // all one data array - cusparseSpMatDescr_t matA, matB, matC; + hipsparseSpMatDescr_t matA, matB, matC; IdArray dC_csrOffsets = IdArray::Empty({A.num_rows + 1}, A.indptr->dtype, A.indptr->ctx); IdType* dC_csrOffsets_data = dC_csrOffsets.Ptr(); constexpr auto idtype = cusparse_idtype::value; constexpr auto dtype = cuda_dtype::value; // Create sparse matrix A, B and C in CSR format - CUSPARSE_CALL(cusparseCreateCsr( + CUSPARSE_CALL(hipsparseCreateCsr( &matA, A.num_rows, A.num_cols, nnzA, A.indptr.Ptr(), A.indices.Ptr(), - // cusparseCreateCsr only accepts non-const pointers. - const_cast(A_weights), idtype, idtype, CUSPARSE_INDEX_BASE_ZERO, + // hipsparseCreateCsr only accepts non-const pointers. 
+ const_cast(A_weights), idtype, idtype, HIPSPARSE_INDEX_BASE_ZERO, dtype)); - CUSPARSE_CALL(cusparseCreateCsr( + CUSPARSE_CALL(hipsparseCreateCsr( &matB, B.num_rows, B.num_cols, nnzB, B.indptr.Ptr(), B.indices.Ptr(), - // cusparseCreateCsr only accepts non-const pointers. - const_cast(B_weights), idtype, idtype, CUSPARSE_INDEX_BASE_ZERO, + // hipsparseCreateCsr only accepts non-const pointers. + const_cast(B_weights), idtype, idtype, HIPSPARSE_INDEX_BASE_ZERO, dtype)); - CUSPARSE_CALL(cusparseCreateCsr( + CUSPARSE_CALL(hipsparseCreateCsr( &matC, A.num_rows, B.num_cols, 0, dC_csrOffsets_data, nullptr, nullptr, - idtype, idtype, CUSPARSE_INDEX_BASE_ZERO, dtype)); + idtype, idtype, HIPSPARSE_INDEX_BASE_ZERO, dtype)); // SpGEMM Computation - cusparseSpGEMMDescr_t spgemmDesc; - cusparseSpGEMMAlg_t alg = CUSPARSE_SPGEMM_DEFAULT; + hipsparseSpGEMMDescr_t spgemmDesc; + hipsparseSpGEMMAlg_t alg = HIPSPARSE_SPGEMM_DEFAULT; - CUSPARSE_CALL(cusparseSpGEMM_createDescr(&spgemmDesc)); + CUSPARSE_CALL(hipsparseSpGEMM_createDescr(&spgemmDesc)); size_t workspace_size1 = 0, workspace_size2 = 0, workspace_size3 = 0; // ask bufferSize1 bytes for external memory - CUSPARSE_CALL(cusparseSpGEMM_workEstimation( + CUSPARSE_CALL(hipsparseSpGEMM_workEstimation( thr_entry->cusparse_handle, transA, transB, &alpha, matA, matB, &beta, matC, dtype, alg, spgemmDesc, &workspace_size1, NULL)); void* workspace1 = (device->AllocWorkspace(ctx, workspace_size1)); // inspect the matrices A and B to understand the memory requiremnent - cusparseStatus_t e = cusparseSpGEMM_workEstimation( + hipsparseStatus_t e = hipsparseSpGEMM_workEstimation( thr_entry->cusparse_handle, transA, transB, &alpha, matA, matB, &beta, matC, dtype, alg, spgemmDesc, &workspace_size1, workspace1); - // CUSPARSE_SPGEMM_DEFAULT not support getting num_prods > 2^31 -1 + // HIPSPARSE_SPGEMM_DEFAULT not support getting num_prods > 2^31 -1 // and throws insufficient memory error within workEstimation call - if (e == CUSPARSE_STATUS_INSUFFICIENT_RESOURCES) { + if (e == HIPSPARSE_STATUS_INSUFFICIENT_RESOURCES) { // fall back to ALG2 to estimate num_prods - alg = CUSPARSE_SPGEMM_ALG2; + alg = HIPSPARSE_SPGEMM_ALG2; device->FreeWorkspace(ctx, workspace1); - // rerun cusparseSpGEMM_workEstimation - CUSPARSE_CALL(cusparseSpGEMM_workEstimation( + // rerun hipsparseSpGEMM_workEstimation + CUSPARSE_CALL(hipsparseSpGEMM_workEstimation( thr_entry->cusparse_handle, transA, transB, &alpha, matA, matB, &beta, matC, dtype, alg, spgemmDesc, &workspace_size1, NULL)); workspace1 = (device->AllocWorkspace(ctx, workspace_size1)); - CUSPARSE_CALL(cusparseSpGEMM_workEstimation( + CUSPARSE_CALL(hipsparseSpGEMM_workEstimation( thr_entry->cusparse_handle, transA, transB, &alpha, matA, matB, &beta, matC, dtype, alg, spgemmDesc, &workspace_size1, workspace1)); } else { - CHECK(e == CUSPARSE_STATUS_SUCCESS) << "CUSPARSE ERROR in SpGEMM: " << e; + CHECK(e == HIPSPARSE_STATUS_SUCCESS) << "CUSPARSE ERROR in SpGEMM: " << e; } // get the number of intermediate products required for SpGEMM compute @@ -113,26 +113,26 @@ std::pair CusparseSpgemm( int64_t LARGE_NUM_PRODUCTS = 800000000; // 800*1000*1000; // switch to ALG2/ALG3 for medium & large problem size - if (alg == CUSPARSE_SPGEMM_DEFAULT && num_prods > MEDIUM_NUM_PRODUCTS) { + if (alg == HIPSPARSE_SPGEMM_DEFAULT && num_prods > MEDIUM_NUM_PRODUCTS) { // use ALG3 for very large problem - alg = num_prods > LARGE_NUM_PRODUCTS ? CUSPARSE_SPGEMM_ALG3 - : CUSPARSE_SPGEMM_ALG2; + alg = num_prods > LARGE_NUM_PRODUCTS ? 
HIPSPARSE_SPGEMM_ALG3 + : HIPSPARSE_SPGEMM_ALG2; device->FreeWorkspace(ctx, workspace1); - // rerun cusparseSpGEMM_workEstimation - CUSPARSE_CALL(cusparseSpGEMM_workEstimation( + // rerun hipsparseSpGEMM_workEstimation + CUSPARSE_CALL(hipsparseSpGEMM_workEstimation( thr_entry->cusparse_handle, transA, transB, &alpha, matA, matB, &beta, matC, dtype, alg, spgemmDesc, &workspace_size1, NULL)); workspace1 = (device->AllocWorkspace(ctx, workspace_size1)); - CUSPARSE_CALL(cusparseSpGEMM_workEstimation( + CUSPARSE_CALL(hipsparseSpGEMM_workEstimation( thr_entry->cusparse_handle, transA, transB, &alpha, matA, matB, &beta, matC, dtype, alg, spgemmDesc, &workspace_size1, workspace1)); - } else if (alg == CUSPARSE_SPGEMM_ALG2 && num_prods > LARGE_NUM_PRODUCTS) { - // no need to rerun cusparseSpGEMM_workEstimation between ALG2 and ALG3 - alg = CUSPARSE_SPGEMM_ALG3; + } else if (alg == HIPSPARSE_SPGEMM_ALG2 && num_prods > LARGE_NUM_PRODUCTS) { + // no need to rerun hipsparseSpGEMM_workEstimation between ALG2 and ALG3 + alg = HIPSPARSE_SPGEMM_ALG3; } - if (alg == CUSPARSE_SPGEMM_ALG2 || alg == CUSPARSE_SPGEMM_ALG3) { + if (alg == HIPSPARSE_SPGEMM_ALG2 || alg == HIPSPARSE_SPGEMM_ALG3) { // estimate memory for ALG2/ALG3; note chunk_fraction is only used by ALG3 // reduce chunk_fraction if crash due to mem., but it trades off speed float chunk_fraction = num_prods < 4 * LARGE_NUM_PRODUCTS ? 0.15 : 0.05; @@ -147,40 +147,40 @@ std::pair CusparseSpgemm( workspace3, &workspace_size2)); device->FreeWorkspace(ctx, workspace3); } else { - CUSPARSE_CALL(cusparseSpGEMM_compute( + CUSPARSE_CALL(hipsparseSpGEMM_compute( thr_entry->cusparse_handle, transA, transB, &alpha, matA, matB, &beta, matC, dtype, alg, spgemmDesc, &workspace_size2, NULL)); } // ask bufferSize2 bytes for external memory void* workspace2 = device->AllocWorkspace(ctx, workspace_size2); // compute the intermediate product of A * B - CUSPARSE_CALL(cusparseSpGEMM_compute( + CUSPARSE_CALL(hipsparseSpGEMM_compute( thr_entry->cusparse_handle, transA, transB, &alpha, matA, matB, &beta, matC, dtype, alg, spgemmDesc, &workspace_size2, workspace2)); // get matrix C non-zero entries C_nnz1 int64_t C_num_rows1, C_num_cols1, C_nnz1; CUSPARSE_CALL( - cusparseSpMatGetSize(matC, &C_num_rows1, &C_num_cols1, &C_nnz1)); + hipsparseSpMatGetSize(matC, &C_num_rows1, &C_num_cols1, &C_nnz1)); IdArray dC_columns = IdArray::Empty({C_nnz1}, A.indptr->dtype, A.indptr->ctx); NDArray dC_weights = NDArray::Empty({C_nnz1}, A_weights_array->dtype, A.indptr->ctx); IdType* dC_columns_data = dC_columns.Ptr(); DType* dC_weights_data = dC_weights.Ptr(); // update matC with the new pointers - CUSPARSE_CALL(cusparseCsrSetPointers( + CUSPARSE_CALL(hipsparseCsrSetPointers( matC, dC_csrOffsets_data, dC_columns_data, dC_weights_data)); // copy the final products to the matrix C - CUSPARSE_CALL(cusparseSpGEMM_copy( + CUSPARSE_CALL(hipsparseSpGEMM_copy( thr_entry->cusparse_handle, transA, transB, &alpha, matA, matB, &beta, matC, dtype, alg, spgemmDesc)); device->FreeWorkspace(ctx, workspace1); device->FreeWorkspace(ctx, workspace2); // destroy matrix/vector descriptors - CUSPARSE_CALL(cusparseSpGEMM_destroyDescr(spgemmDesc)); - CUSPARSE_CALL(cusparseDestroySpMat(matA)); - CUSPARSE_CALL(cusparseDestroySpMat(matB)); - CUSPARSE_CALL(cusparseDestroySpMat(matC)); + CUSPARSE_CALL(hipsparseSpGEMM_destroyDescr(spgemmDesc)); + CUSPARSE_CALL(hipsparseDestroySpMat(matA)); + CUSPARSE_CALL(hipsparseDestroySpMat(matB)); + CUSPARSE_CALL(hipsparseDestroySpMat(matC)); return { CSRMatrix( A.num_rows, B.num_cols, 
dC_csrOffsets, dC_columns, @@ -208,25 +208,25 @@ std::pair CusparseSpgemm( auto ctx = A.indptr->ctx; auto device = runtime::DeviceAPI::Get(ctx); auto* thr_entry = runtime::CUDAThreadEntry::ThreadLocal(); - cudaStream_t stream = runtime::getCurrentCUDAStream(); + hipStream_t stream = runtime::getCurrentCUDAStream(); auto idtype = A.indptr->dtype; auto dtype = A_weights_array->dtype; const DType* A_weights = A_weights_array.Ptr(); const DType* B_weights = B_weights_array.Ptr(); if (!thr_entry->cusparse_handle) { - CUSPARSE_CALL(cusparseCreate(&(thr_entry->cusparse_handle))); + CUSPARSE_CALL(hipsparseCreate(&(thr_entry->cusparse_handle))); } - CUSPARSE_CALL(cusparseSetStream(thr_entry->cusparse_handle, stream)); - CUSPARSE_CALL(cusparseSetPointerMode( - thr_entry->cusparse_handle, CUSPARSE_POINTER_MODE_HOST)); + CUSPARSE_CALL(hipsparseSetStream(thr_entry->cusparse_handle, stream)); + CUSPARSE_CALL(hipsparseSetPointerMode( + thr_entry->cusparse_handle, HIPSPARSE_POINTER_MODE_HOST)); - CUSPARSE_CALL(cusparseCreateCsrgemm2Info(&info)); + CUSPARSE_CALL(hipsparseCreateCsrgemm2Info(&info)); - cusparseMatDescr_t matA, matB, matC, matD; - CUSPARSE_CALL(cusparseCreateMatDescr(&matA)); - CUSPARSE_CALL(cusparseCreateMatDescr(&matB)); - CUSPARSE_CALL(cusparseCreateMatDescr(&matC)); - CUSPARSE_CALL(cusparseCreateMatDescr(&matD)); // needed even if D is null + hipsparseMatDescr_t matA, matB, matC, matD; + CUSPARSE_CALL(hipsparseCreateMatDescr(&matA)); + CUSPARSE_CALL(hipsparseCreateMatDescr(&matB)); + CUSPARSE_CALL(hipsparseCreateMatDescr(&matC)); + CUSPARSE_CALL(hipsparseCreateMatDescr(&matD)); // needed even if D is null CUSPARSE_CALL(CSRGEMM::bufferSizeExt( thr_entry->cusparse_handle, m, n, k, &alpha, matA, nnzA, @@ -252,11 +252,11 @@ std::pair CusparseSpgemm( C_indptr.Ptr(), C_indices.Ptr(), info, workspace)); device->FreeWorkspace(ctx, workspace); - CUSPARSE_CALL(cusparseDestroyCsrgemm2Info(info)); - CUSPARSE_CALL(cusparseDestroyMatDescr(matA)); - CUSPARSE_CALL(cusparseDestroyMatDescr(matB)); - CUSPARSE_CALL(cusparseDestroyMatDescr(matC)); - CUSPARSE_CALL(cusparseDestroyMatDescr(matD)); + CUSPARSE_CALL(hipsparseDestroyCsrgemm2Info(info)); + CUSPARSE_CALL(hipsparseDestroyMatDescr(matA)); + CUSPARSE_CALL(hipsparseDestroyMatDescr(matB)); + CUSPARSE_CALL(hipsparseDestroyMatDescr(matC)); + CUSPARSE_CALL(hipsparseDestroyMatDescr(matD)); return { CSRMatrix( @@ -314,9 +314,9 @@ template std::pair CSRMM( template std::pair CSRMM( const CSRMatrix&, NDArray, const CSRMatrix&, NDArray); #if BF16_ENABLED -template std::pair CSRMM( +template std::pair CSRMM( const CSRMatrix&, NDArray, const CSRMatrix&, NDArray); -template std::pair CSRMM( +template std::pair CSRMM( const CSRMatrix&, NDArray, const CSRMatrix&, NDArray); #endif // BF16_ENABLED template std::pair CSRMM( diff --git a/src/array/cuda/csr_mm.cu.prehip b/src/array/cuda/csr_mm.cu.prehip new file mode 100644 index 000000000000..ad05c5f4155c --- /dev/null +++ b/src/array/cuda/csr_mm.cu.prehip @@ -0,0 +1,332 @@ +/** + * Copyright (c) 2020 by Contributors + * @file array/cuda/csr_mm.cu + * @brief SpSpMM/SpGEMM C APIs and definitions. 
+ */ +#include +#include + +#include + +#include "../../runtime/cuda/cuda_common.h" +#include "./cusparse_dispatcher.cuh" +#include "./functor.cuh" +namespace dgl { + +using namespace dgl::runtime; + +namespace aten { +namespace cusparse { + +#if CUDART_VERSION >= 12000 + +/** @brief Cusparse implementation of SpGEMM on Csr format for CUDA 12.0+ */ +template +std::pair CusparseSpgemm( + const CSRMatrix& A, const NDArray A_weights_array, const CSRMatrix& B, + const NDArray B_weights_array) { + // We use Spgemm (SpSpMM) to perform following operation: + // C = A x B, where A, B and C are sparse matrices in csr format. + const int nnzA = A.indices->shape[0]; + const int nnzB = B.indices->shape[0]; + const DType alpha = 1.0; + const DType beta = 0.0; + auto transA = CUSPARSE_OPERATION_NON_TRANSPOSE; + auto transB = CUSPARSE_OPERATION_NON_TRANSPOSE; + // device + auto ctx = A.indptr->ctx; + auto device = runtime::DeviceAPI::Get(ctx); + auto* thr_entry = runtime::CUDAThreadEntry::ThreadLocal(); + cudaStream_t stream = runtime::getCurrentCUDAStream(); + const DType* A_weights = A_weights_array.Ptr(); + const DType* B_weights = B_weights_array.Ptr(); + // allocate cusparse handle if needed + if (!thr_entry->cusparse_handle) { + CUSPARSE_CALL(cusparseCreate(&(thr_entry->cusparse_handle))); + } + CUSPARSE_CALL(cusparseSetStream(thr_entry->cusparse_handle, stream)); + // all one data array + cusparseSpMatDescr_t matA, matB, matC; + IdArray dC_csrOffsets = + IdArray::Empty({A.num_rows + 1}, A.indptr->dtype, A.indptr->ctx); + IdType* dC_csrOffsets_data = dC_csrOffsets.Ptr(); + constexpr auto idtype = cusparse_idtype::value; + constexpr auto dtype = cuda_dtype::value; + // Create sparse matrix A, B and C in CSR format + CUSPARSE_CALL(cusparseCreateCsr( + &matA, A.num_rows, A.num_cols, nnzA, A.indptr.Ptr(), + A.indices.Ptr(), + // cusparseCreateCsr only accepts non-const pointers. + const_cast(A_weights), idtype, idtype, CUSPARSE_INDEX_BASE_ZERO, + dtype)); + CUSPARSE_CALL(cusparseCreateCsr( + &matB, B.num_rows, B.num_cols, nnzB, B.indptr.Ptr(), + B.indices.Ptr(), + // cusparseCreateCsr only accepts non-const pointers. 
+ const_cast(B_weights), idtype, idtype, CUSPARSE_INDEX_BASE_ZERO, + dtype)); + CUSPARSE_CALL(cusparseCreateCsr( + &matC, A.num_rows, B.num_cols, 0, dC_csrOffsets_data, nullptr, nullptr, + idtype, idtype, CUSPARSE_INDEX_BASE_ZERO, dtype)); + // SpGEMM Computation + cusparseSpGEMMDescr_t spgemmDesc; + cusparseSpGEMMAlg_t alg = CUSPARSE_SPGEMM_DEFAULT; + + CUSPARSE_CALL(cusparseSpGEMM_createDescr(&spgemmDesc)); + size_t workspace_size1 = 0, workspace_size2 = 0, workspace_size3 = 0; + // ask bufferSize1 bytes for external memory + CUSPARSE_CALL(cusparseSpGEMM_workEstimation( + thr_entry->cusparse_handle, transA, transB, &alpha, matA, matB, &beta, + matC, dtype, alg, spgemmDesc, &workspace_size1, NULL)); + void* workspace1 = (device->AllocWorkspace(ctx, workspace_size1)); + // inspect the matrices A and B to understand the memory requiremnent + cusparseStatus_t e = cusparseSpGEMM_workEstimation( + thr_entry->cusparse_handle, transA, transB, &alpha, matA, matB, &beta, + matC, dtype, alg, spgemmDesc, &workspace_size1, workspace1); + // CUSPARSE_SPGEMM_DEFAULT not support getting num_prods > 2^31 -1 + // and throws insufficient memory error within workEstimation call + if (e == CUSPARSE_STATUS_INSUFFICIENT_RESOURCES) { + // fall back to ALG2 to estimate num_prods + alg = CUSPARSE_SPGEMM_ALG2; + device->FreeWorkspace(ctx, workspace1); + // rerun cusparseSpGEMM_workEstimation + CUSPARSE_CALL(cusparseSpGEMM_workEstimation( + thr_entry->cusparse_handle, transA, transB, &alpha, matA, matB, &beta, + matC, dtype, alg, spgemmDesc, &workspace_size1, NULL)); + workspace1 = (device->AllocWorkspace(ctx, workspace_size1)); + CUSPARSE_CALL(cusparseSpGEMM_workEstimation( + thr_entry->cusparse_handle, transA, transB, &alpha, matA, matB, &beta, + matC, dtype, alg, spgemmDesc, &workspace_size1, workspace1)); + } else { + CHECK(e == CUSPARSE_STATUS_SUCCESS) << "CUSPARSE ERROR in SpGEMM: " << e; + } + + // get the number of intermediate products required for SpGEMM compute + // num_prods indicates device memory consumption for SpGEMM if using ALG2/3 + int64_t num_prods; + CUSPARSE_CALL(cusparseSpGEMM_getNumProducts(spgemmDesc, &num_prods)); + + // assume free GPU mem at least ~15G for below heuristics to work + // user-defined medium problem size (below will use DEFAULT) + int64_t MEDIUM_NUM_PRODUCTS = 400000000; // 400*1000*1000; + // user-defined large problem size (above will use ALG3) + int64_t LARGE_NUM_PRODUCTS = 800000000; // 800*1000*1000; + + // switch to ALG2/ALG3 for medium & large problem size + if (alg == CUSPARSE_SPGEMM_DEFAULT && num_prods > MEDIUM_NUM_PRODUCTS) { + // use ALG3 for very large problem + alg = num_prods > LARGE_NUM_PRODUCTS ? 
CUSPARSE_SPGEMM_ALG3 + : CUSPARSE_SPGEMM_ALG2; + + device->FreeWorkspace(ctx, workspace1); + // rerun cusparseSpGEMM_workEstimation + CUSPARSE_CALL(cusparseSpGEMM_workEstimation( + thr_entry->cusparse_handle, transA, transB, &alpha, matA, matB, &beta, + matC, dtype, alg, spgemmDesc, &workspace_size1, NULL)); + workspace1 = (device->AllocWorkspace(ctx, workspace_size1)); + CUSPARSE_CALL(cusparseSpGEMM_workEstimation( + thr_entry->cusparse_handle, transA, transB, &alpha, matA, matB, &beta, + matC, dtype, alg, spgemmDesc, &workspace_size1, workspace1)); + } else if (alg == CUSPARSE_SPGEMM_ALG2 && num_prods > LARGE_NUM_PRODUCTS) { + // no need to rerun cusparseSpGEMM_workEstimation between ALG2 and ALG3 + alg = CUSPARSE_SPGEMM_ALG3; + } + + if (alg == CUSPARSE_SPGEMM_ALG2 || alg == CUSPARSE_SPGEMM_ALG3) { + // estimate memory for ALG2/ALG3; note chunk_fraction is only used by ALG3 + // reduce chunk_fraction if crash due to mem., but it trades off speed + float chunk_fraction = num_prods < 4 * LARGE_NUM_PRODUCTS ? 0.15 : 0.05; + CUSPARSE_CALL(cusparseSpGEMM_estimateMemory( + thr_entry->cusparse_handle, transA, transB, &alpha, matA, matB, &beta, + matC, dtype, alg, spgemmDesc, chunk_fraction, &workspace_size3, NULL, + NULL)); + void* workspace3 = (device->AllocWorkspace(ctx, workspace_size3)); + CUSPARSE_CALL(cusparseSpGEMM_estimateMemory( + thr_entry->cusparse_handle, transA, transB, &alpha, matA, matB, &beta, + matC, dtype, alg, spgemmDesc, chunk_fraction, &workspace_size3, + workspace3, &workspace_size2)); + device->FreeWorkspace(ctx, workspace3); + } else { + CUSPARSE_CALL(cusparseSpGEMM_compute( + thr_entry->cusparse_handle, transA, transB, &alpha, matA, matB, &beta, + matC, dtype, alg, spgemmDesc, &workspace_size2, NULL)); + } + // ask bufferSize2 bytes for external memory + void* workspace2 = device->AllocWorkspace(ctx, workspace_size2); + // compute the intermediate product of A * B + CUSPARSE_CALL(cusparseSpGEMM_compute( + thr_entry->cusparse_handle, transA, transB, &alpha, matA, matB, &beta, + matC, dtype, alg, spgemmDesc, &workspace_size2, workspace2)); + // get matrix C non-zero entries C_nnz1 + int64_t C_num_rows1, C_num_cols1, C_nnz1; + CUSPARSE_CALL( + cusparseSpMatGetSize(matC, &C_num_rows1, &C_num_cols1, &C_nnz1)); + IdArray dC_columns = IdArray::Empty({C_nnz1}, A.indptr->dtype, A.indptr->ctx); + NDArray dC_weights = + NDArray::Empty({C_nnz1}, A_weights_array->dtype, A.indptr->ctx); + IdType* dC_columns_data = dC_columns.Ptr(); + DType* dC_weights_data = dC_weights.Ptr(); + // update matC with the new pointers + CUSPARSE_CALL(cusparseCsrSetPointers( + matC, dC_csrOffsets_data, dC_columns_data, dC_weights_data)); + // copy the final products to the matrix C + CUSPARSE_CALL(cusparseSpGEMM_copy( + thr_entry->cusparse_handle, transA, transB, &alpha, matA, matB, &beta, + matC, dtype, alg, spgemmDesc)); + + device->FreeWorkspace(ctx, workspace1); + device->FreeWorkspace(ctx, workspace2); + // destroy matrix/vector descriptors + CUSPARSE_CALL(cusparseSpGEMM_destroyDescr(spgemmDesc)); + CUSPARSE_CALL(cusparseDestroySpMat(matA)); + CUSPARSE_CALL(cusparseDestroySpMat(matB)); + CUSPARSE_CALL(cusparseDestroySpMat(matC)); + return { + CSRMatrix( + A.num_rows, B.num_cols, dC_csrOffsets, dC_columns, + NullArray(dC_csrOffsets->dtype, dC_csrOffsets->ctx)), + dC_weights}; +} + +#else // CUDART_VERSION < 12000 + +/** @brief Cusparse implementation of SpGEMM on Csr format for older CUDA + * versions */ +template +std::pair CusparseSpgemm( + const CSRMatrix& A, const NDArray A_weights_array, const 
CSRMatrix& B, + const NDArray B_weights_array) { + int nnzC; + csrgemm2Info_t info = nullptr; + size_t workspace_size; + const DType alpha = 1.; + const int nnzA = A.indices->shape[0]; + const int nnzB = B.indices->shape[0]; + const int m = A.num_rows; + const int n = A.num_cols; + const int k = B.num_cols; + auto ctx = A.indptr->ctx; + auto device = runtime::DeviceAPI::Get(ctx); + auto* thr_entry = runtime::CUDAThreadEntry::ThreadLocal(); + cudaStream_t stream = runtime::getCurrentCUDAStream(); + auto idtype = A.indptr->dtype; + auto dtype = A_weights_array->dtype; + const DType* A_weights = A_weights_array.Ptr(); + const DType* B_weights = B_weights_array.Ptr(); + if (!thr_entry->cusparse_handle) { + CUSPARSE_CALL(cusparseCreate(&(thr_entry->cusparse_handle))); + } + CUSPARSE_CALL(cusparseSetStream(thr_entry->cusparse_handle, stream)); + CUSPARSE_CALL(cusparseSetPointerMode( + thr_entry->cusparse_handle, CUSPARSE_POINTER_MODE_HOST)); + + CUSPARSE_CALL(cusparseCreateCsrgemm2Info(&info)); + + cusparseMatDescr_t matA, matB, matC, matD; + CUSPARSE_CALL(cusparseCreateMatDescr(&matA)); + CUSPARSE_CALL(cusparseCreateMatDescr(&matB)); + CUSPARSE_CALL(cusparseCreateMatDescr(&matC)); + CUSPARSE_CALL(cusparseCreateMatDescr(&matD)); // needed even if D is null + + CUSPARSE_CALL(CSRGEMM::bufferSizeExt( + thr_entry->cusparse_handle, m, n, k, &alpha, matA, nnzA, + A.indptr.Ptr(), A.indices.Ptr(), matB, nnzB, + B.indptr.Ptr(), B.indices.Ptr(), nullptr, matD, 0, + nullptr, nullptr, info, &workspace_size)); + + void* workspace = device->AllocWorkspace(ctx, workspace_size); + IdArray C_indptr = IdArray::Empty({m + 1}, idtype, ctx); + CUSPARSE_CALL(CSRGEMM::nnz( + thr_entry->cusparse_handle, m, n, k, matA, nnzA, A.indptr.Ptr(), + A.indices.Ptr(), matB, nnzB, B.indptr.Ptr(), + B.indices.Ptr(), matD, 0, nullptr, nullptr, matC, + C_indptr.Ptr(), &nnzC, info, workspace)); + + IdArray C_indices = IdArray::Empty({nnzC}, idtype, ctx); + NDArray C_weights = NDArray::Empty({nnzC}, dtype, ctx); + CUSPARSE_CALL(CSRGEMM::compute( + thr_entry->cusparse_handle, m, n, k, &alpha, matA, nnzA, A_weights, + A.indptr.Ptr(), A.indices.Ptr(), matB, nnzB, B_weights, + B.indptr.Ptr(), B.indices.Ptr(), nullptr, matD, 0, + nullptr, nullptr, nullptr, matC, C_weights.Ptr(), + C_indptr.Ptr(), C_indices.Ptr(), info, workspace)); + + device->FreeWorkspace(ctx, workspace); + CUSPARSE_CALL(cusparseDestroyCsrgemm2Info(info)); + CUSPARSE_CALL(cusparseDestroyMatDescr(matA)); + CUSPARSE_CALL(cusparseDestroyMatDescr(matB)); + CUSPARSE_CALL(cusparseDestroyMatDescr(matC)); + CUSPARSE_CALL(cusparseDestroyMatDescr(matD)); + + return { + CSRMatrix( + m, k, C_indptr, C_indices, NullArray(C_indptr->dtype, C_indptr->ctx)), + C_weights}; +} + +#endif // CUDART_VERSION >= 12000 +} // namespace cusparse + +template +std::pair CSRMM( + const CSRMatrix& A, NDArray A_weights, const CSRMatrix& B, + NDArray B_weights) { + auto ctx = A.indptr->ctx; + auto device = runtime::DeviceAPI::Get(ctx); + CSRMatrix newA, newB; + bool cast = false; + + // Cast 64 bit indices to 32 bit. 
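For illustration only — not part of this patch: the branch just below narrows 64-bit CSR indices to 32-bit via AsNumBits(..., 32) because the csrgemm2/SpGEMM path operates on int indices. A hypothetical guard (not DGL code) that could be run before narrowing to confirm the values actually fit:

#include <cstdint>
#include <limits>
#include <stdexcept>

// Throws if a 64-bit CSR dimension or index cannot be represented in int32_t.
inline int32_t NarrowIndexOrThrow(int64_t v) {
  if (v < 0 || v > std::numeric_limits<int32_t>::max())
    throw std::overflow_error("CSR index does not fit in int32_t");
  return static_cast<int32_t>(v);
}

// Example: NarrowIndexOrThrow(A.num_rows); NarrowIndexOrThrow(nnzA);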
+ if (A.indptr->dtype.bits == 64) { + newA = CSRMatrix( + A.num_rows, A.num_cols, AsNumBits(A.indptr, 32), + AsNumBits(A.indices, 32), AsNumBits(A.data, 32)); + newB = CSRMatrix( + B.num_rows, B.num_cols, AsNumBits(B.indptr, 32), + AsNumBits(B.indices, 32), AsNumBits(B.data, 32)); + cast = true; + } + + // Reorder weights if A or B has edge IDs + NDArray newA_weights, newB_weights; + if (CSRHasData(A)) newA_weights = IndexSelect(A_weights, A.data); + if (CSRHasData(B)) newB_weights = IndexSelect(B_weights, B.data); + + auto result = cusparse::CusparseSpgemm( + cast ? newA : A, CSRHasData(A) ? newA_weights : A_weights, + cast ? newB : B, CSRHasData(B) ? newB_weights : B_weights); + + // Cast 32 bit indices back to 64 bit if necessary + if (cast) { + CSRMatrix C = result.first; + return { + CSRMatrix( + C.num_rows, C.num_cols, AsNumBits(C.indptr, 64), + AsNumBits(C.indices, 64), AsNumBits(C.data, 64)), + result.second}; + } else { + return result; + } +} + +template std::pair CSRMM( + const CSRMatrix&, NDArray, const CSRMatrix&, NDArray); +template std::pair CSRMM( + const CSRMatrix&, NDArray, const CSRMatrix&, NDArray); +#if BF16_ENABLED +template std::pair CSRMM( + const CSRMatrix&, NDArray, const CSRMatrix&, NDArray); +template std::pair CSRMM( + const CSRMatrix&, NDArray, const CSRMatrix&, NDArray); +#endif // BF16_ENABLED +template std::pair CSRMM( + const CSRMatrix&, NDArray, const CSRMatrix&, NDArray); +template std::pair CSRMM( + const CSRMatrix&, NDArray, const CSRMatrix&, NDArray); +template std::pair CSRMM( + const CSRMatrix&, NDArray, const CSRMatrix&, NDArray); +template std::pair CSRMM( + const CSRMatrix&, NDArray, const CSRMatrix&, NDArray); + +} // namespace aten +} // namespace dgl diff --git a/src/array/cuda/csr_sort.cu b/src/array/cuda/csr_sort.cu index 448d36e44529..e6f8d65c7d13 100644 --- a/src/array/cuda/csr_sort.cu +++ b/src/array/cuda/csr_sort.cu @@ -1,3 +1,4 @@ +#include "hip/hip_runtime.h" /** * Copyright (c) 2020 by Contributors * @file array/cuda/csr_sort.cc @@ -5,7 +6,7 @@ */ #include -#include +#include #include "../../runtime/cuda/cuda_common.h" #include "./utils.h" @@ -39,7 +40,7 @@ __global__ void _SegmentIsSorted( template bool CSRIsSorted(CSRMatrix csr) { const auto& ctx = csr.indptr->ctx; - cudaStream_t stream = runtime::getCurrentCUDAStream(); + hipStream_t stream = runtime::getCurrentCUDAStream(); auto device = runtime::DeviceAPI::Get(ctx); // We allocate a workspace of num_rows bytes. It wastes a little bit memory // but should be fine. 
@@ -67,12 +68,12 @@ template <> void CSRSort_(CSRMatrix* csr) { auto* thr_entry = runtime::CUDAThreadEntry::ThreadLocal(); auto device = runtime::DeviceAPI::Get(csr->indptr->ctx); - cudaStream_t stream = runtime::getCurrentCUDAStream(); + hipStream_t stream = runtime::getCurrentCUDAStream(); // allocate cusparse handle if needed if (!thr_entry->cusparse_handle) { - CUSPARSE_CALL(cusparseCreate(&(thr_entry->cusparse_handle))); + CUSPARSE_CALL(hipsparseCreate(&(thr_entry->cusparse_handle))); } - CUSPARSE_CALL(cusparseSetStream(thr_entry->cusparse_handle, stream)); + CUSPARSE_CALL(hipsparseSetStream(thr_entry->cusparse_handle, stream)); NDArray indptr = csr->indptr; NDArray indices = csr->indices; @@ -83,16 +84,16 @@ void CSRSort_(CSRMatrix* csr) { NDArray data = csr->data; size_t workspace_size = 0; - CUSPARSE_CALL(cusparseXcsrsort_bufferSizeExt( + CUSPARSE_CALL(hipsparseXcsrsort_bufferSizeExt( thr_entry->cusparse_handle, csr->num_rows, csr->num_cols, nnz, indptr.Ptr(), indices.Ptr(), &workspace_size)); void* workspace = device->AllocWorkspace(ctx, workspace_size); - cusparseMatDescr_t descr; - CUSPARSE_CALL(cusparseCreateMatDescr(&descr)); - CUSPARSE_CALL(cusparseSetMatType(descr, CUSPARSE_MATRIX_TYPE_GENERAL)); - CUSPARSE_CALL(cusparseSetMatIndexBase(descr, CUSPARSE_INDEX_BASE_ZERO)); - CUSPARSE_CALL(cusparseXcsrsort( + hipsparseMatDescr_t descr; + CUSPARSE_CALL(hipsparseCreateMatDescr(&descr)); + CUSPARSE_CALL(hipsparseSetMatType(descr, HIPSPARSE_MATRIX_TYPE_GENERAL)); + CUSPARSE_CALL(hipsparseSetMatIndexBase(descr, HIPSPARSE_INDEX_BASE_ZERO)); + CUSPARSE_CALL(hipsparseXcsrsort( thr_entry->cusparse_handle, csr->num_rows, csr->num_cols, nnz, descr, indptr.Ptr(), indices.Ptr(), data.Ptr(), workspace)); @@ -100,13 +101,13 @@ void CSRSort_(CSRMatrix* csr) { csr->sorted = true; // free resources - CUSPARSE_CALL(cusparseDestroyMatDescr(descr)); + CUSPARSE_CALL(hipsparseDestroyMatDescr(descr)); device->FreeWorkspace(ctx, workspace); } template <> void CSRSort_(CSRMatrix* csr) { - cudaStream_t stream = runtime::getCurrentCUDAStream(); + hipStream_t stream = runtime::getCurrentCUDAStream(); auto device = runtime::DeviceAPI::Get(csr->indptr->ctx); const auto& ctx = csr->indptr->ctx; @@ -125,13 +126,13 @@ void CSRSort_(CSRMatrix* csr) { // Allocate workspace size_t workspace_size = 0; - CUDA_CALL(cub::DeviceSegmentedRadixSort::SortPairs( + CUDA_CALL(hipcub::DeviceSegmentedRadixSort::SortPairs( nullptr, workspace_size, key_in, key_out, value_in, value_out, nnz, csr->num_rows, offsets, offsets + 1, 0, sizeof(int64_t) * 8, stream)); void* workspace = device->AllocWorkspace(ctx, workspace_size); // Compute - CUDA_CALL(cub::DeviceSegmentedRadixSort::SortPairs( + CUDA_CALL(hipcub::DeviceSegmentedRadixSort::SortPairs( workspace, workspace_size, key_in, key_out, value_in, value_out, nnz, csr->num_rows, offsets, offsets + 1, 0, sizeof(int64_t) * 8, stream)); diff --git a/src/array/cuda/csr_sort.cu.prehip b/src/array/cuda/csr_sort.cu.prehip new file mode 100644 index 000000000000..448d36e44529 --- /dev/null +++ b/src/array/cuda/csr_sort.cu.prehip @@ -0,0 +1,151 @@ +/** + * Copyright (c) 2020 by Contributors + * @file array/cuda/csr_sort.cc + * @brief Sort CSR index + */ +#include + +#include + +#include "../../runtime/cuda/cuda_common.h" +#include "./utils.h" + +namespace dgl { + +using runtime::NDArray; + +namespace aten { +namespace impl { + +/** + * @brief Check whether each row is sorted. 
+ */ +template +__global__ void _SegmentIsSorted( + const IdType* indptr, const IdType* indices, int64_t num_rows, + int8_t* flags) { + int tx = blockIdx.x * blockDim.x + threadIdx.x; + const int stride_x = gridDim.x * blockDim.x; + while (tx < num_rows) { + bool f = true; + for (IdType i = indptr[tx] + 1; f && i < indptr[tx + 1]; ++i) { + f = (indices[i - 1] <= indices[i]); + } + flags[tx] = static_cast(f); + tx += stride_x; + } +} + +template +bool CSRIsSorted(CSRMatrix csr) { + const auto& ctx = csr.indptr->ctx; + cudaStream_t stream = runtime::getCurrentCUDAStream(); + auto device = runtime::DeviceAPI::Get(ctx); + // We allocate a workspace of num_rows bytes. It wastes a little bit memory + // but should be fine. + int8_t* flags = + static_cast(device->AllocWorkspace(ctx, csr.num_rows)); + const int nt = cuda::FindNumThreads(csr.num_rows); + const int nb = (csr.num_rows + nt - 1) / nt; + CUDA_KERNEL_CALL( + _SegmentIsSorted, nb, nt, 0, stream, csr.indptr.Ptr(), + csr.indices.Ptr(), csr.num_rows, flags); + bool ret = cuda::AllTrue(flags, csr.num_rows, ctx); + device->FreeWorkspace(ctx, flags); + return ret; +} + +template bool CSRIsSorted(CSRMatrix csr); +template bool CSRIsSorted(CSRMatrix csr); + +template +void CSRSort_(CSRMatrix* csr) { + LOG(FATAL) << "Unreachable codes"; +} + +template <> +void CSRSort_(CSRMatrix* csr) { + auto* thr_entry = runtime::CUDAThreadEntry::ThreadLocal(); + auto device = runtime::DeviceAPI::Get(csr->indptr->ctx); + cudaStream_t stream = runtime::getCurrentCUDAStream(); + // allocate cusparse handle if needed + if (!thr_entry->cusparse_handle) { + CUSPARSE_CALL(cusparseCreate(&(thr_entry->cusparse_handle))); + } + CUSPARSE_CALL(cusparseSetStream(thr_entry->cusparse_handle, stream)); + + NDArray indptr = csr->indptr; + NDArray indices = csr->indices; + const auto& ctx = indptr->ctx; + const int64_t nnz = indices->shape[0]; + if (!aten::CSRHasData(*csr)) + csr->data = aten::Range(0, nnz, indices->dtype.bits, ctx); + NDArray data = csr->data; + + size_t workspace_size = 0; + CUSPARSE_CALL(cusparseXcsrsort_bufferSizeExt( + thr_entry->cusparse_handle, csr->num_rows, csr->num_cols, nnz, + indptr.Ptr(), indices.Ptr(), &workspace_size)); + void* workspace = device->AllocWorkspace(ctx, workspace_size); + + cusparseMatDescr_t descr; + CUSPARSE_CALL(cusparseCreateMatDescr(&descr)); + CUSPARSE_CALL(cusparseSetMatType(descr, CUSPARSE_MATRIX_TYPE_GENERAL)); + CUSPARSE_CALL(cusparseSetMatIndexBase(descr, CUSPARSE_INDEX_BASE_ZERO)); + CUSPARSE_CALL(cusparseXcsrsort( + thr_entry->cusparse_handle, csr->num_rows, csr->num_cols, nnz, descr, + indptr.Ptr(), indices.Ptr(), data.Ptr(), + workspace)); + + csr->sorted = true; + + // free resources + CUSPARSE_CALL(cusparseDestroyMatDescr(descr)); + device->FreeWorkspace(ctx, workspace); +} + +template <> +void CSRSort_(CSRMatrix* csr) { + cudaStream_t stream = runtime::getCurrentCUDAStream(); + auto device = runtime::DeviceAPI::Get(csr->indptr->ctx); + + const auto& ctx = csr->indptr->ctx; + const int64_t nnz = csr->indices->shape[0]; + const auto nbits = csr->indptr->dtype.bits; + if (!aten::CSRHasData(*csr)) csr->data = aten::Range(0, nnz, nbits, ctx); + + IdArray new_indices = csr->indices.Clone(); + IdArray new_data = csr->data.Clone(); + + const int64_t* offsets = csr->indptr.Ptr(); + const int64_t* key_in = csr->indices.Ptr(); + int64_t* key_out = new_indices.Ptr(); + const int64_t* value_in = csr->data.Ptr(); + int64_t* value_out = new_data.Ptr(); + + // Allocate workspace + size_t workspace_size = 0; + 
CUDA_CALL(cub::DeviceSegmentedRadixSort::SortPairs( + nullptr, workspace_size, key_in, key_out, value_in, value_out, nnz, + csr->num_rows, offsets, offsets + 1, 0, sizeof(int64_t) * 8, stream)); + void* workspace = device->AllocWorkspace(ctx, workspace_size); + + // Compute + CUDA_CALL(cub::DeviceSegmentedRadixSort::SortPairs( + workspace, workspace_size, key_in, key_out, value_in, value_out, nnz, + csr->num_rows, offsets, offsets + 1, 0, sizeof(int64_t) * 8, stream)); + + csr->sorted = true; + csr->indices = new_indices; + csr->data = new_data; + + // free resources + device->FreeWorkspace(ctx, workspace); +} + +template void CSRSort_(CSRMatrix* csr); +template void CSRSort_(CSRMatrix* csr); + +} // namespace impl +} // namespace aten +} // namespace dgl diff --git a/src/array/cuda/csr_sum.cu b/src/array/cuda/csr_sum.cu index b7564309c5cf..5ca340ef568f 100644 --- a/src/array/cuda/csr_sum.cu +++ b/src/array/cuda/csr_sum.cu @@ -32,21 +32,21 @@ std::pair CusparseCsrgeam2( auto ctx = A.indptr->ctx; auto device = runtime::DeviceAPI::Get(ctx); auto* thr_entry = runtime::CUDAThreadEntry::ThreadLocal(); - cudaStream_t stream = runtime::getCurrentCUDAStream(); + hipStream_t stream = runtime::getCurrentCUDAStream(); const DType* A_weights = A_weights_array.Ptr(); const DType* B_weights = B_weights_array.Ptr(); // allocate cusparse handle if needed if (!thr_entry->cusparse_handle) - CUSPARSE_CALL(cusparseCreate(&(thr_entry->cusparse_handle))); - CUSPARSE_CALL(cusparseSetStream(thr_entry->cusparse_handle, stream)); + CUSPARSE_CALL(hipsparseCreate(&(thr_entry->cusparse_handle))); + CUSPARSE_CALL(hipsparseSetStream(thr_entry->cusparse_handle, stream)); - cusparseMatDescr_t matA, matB, matC; - CUSPARSE_CALL(cusparseCreateMatDescr(&matA)); - CUSPARSE_CALL(cusparseCreateMatDescr(&matB)); - CUSPARSE_CALL(cusparseCreateMatDescr(&matC)); + hipsparseMatDescr_t matA, matB, matC; + CUSPARSE_CALL(hipsparseCreateMatDescr(&matA)); + CUSPARSE_CALL(hipsparseCreateMatDescr(&matB)); + CUSPARSE_CALL(hipsparseCreateMatDescr(&matC)); - cusparseSetPointerMode( - thr_entry->cusparse_handle, CUSPARSE_POINTER_MODE_HOST); + hipsparseSetPointerMode( + thr_entry->cusparse_handle, HIPSPARSE_POINTER_MODE_HOST); size_t workspace_size = 0; /* prepare output C */ IdArray dC_csrOffsets = IdArray::Empty({m + 1}, A.indptr->dtype, ctx); @@ -81,9 +81,9 @@ std::pair CusparseCsrgeam2( device->FreeWorkspace(ctx, workspace); // destroy matrix/vector descriptors - CUSPARSE_CALL(cusparseDestroyMatDescr(matA)); - CUSPARSE_CALL(cusparseDestroyMatDescr(matB)); - CUSPARSE_CALL(cusparseDestroyMatDescr(matC)); + CUSPARSE_CALL(hipsparseDestroyMatDescr(matA)); + CUSPARSE_CALL(hipsparseDestroyMatDescr(matB)); + CUSPARSE_CALL(hipsparseDestroyMatDescr(matC)); return { CSRMatrix( A.num_rows, A.num_cols, dC_csrOffsets, dC_columns, @@ -159,9 +159,9 @@ template std::pair CSRSum( template std::pair CSRSum( const std::vector&, const std::vector&); #if BF16_ENABLED -template std::pair CSRSum( +template std::pair CSRSum( const std::vector&, const std::vector&); -template std::pair CSRSum( +template std::pair CSRSum( const std::vector&, const std::vector&); #endif // BF16_ENABLED template std::pair CSRSum( diff --git a/src/array/cuda/csr_sum.cu.prehip b/src/array/cuda/csr_sum.cu.prehip new file mode 100644 index 000000000000..b7564309c5cf --- /dev/null +++ b/src/array/cuda/csr_sum.cu.prehip @@ -0,0 +1,177 @@ +/** + * Copyright (c) 2020 by Contributors + * @file array/cuda/spmm.cu + * @brief SpGEAM C APIs and definitions. 
+ */ +#include +#include + +#include "../../runtime/cuda/cuda_common.h" +#include "./cusparse_dispatcher.cuh" +#include "./functor.cuh" + +namespace dgl { + +using namespace dgl::runtime; + +namespace aten { +namespace cusparse { + +/** Cusparse implementation of SpSum on Csr format. */ +template +std::pair CusparseCsrgeam2( + const CSRMatrix& A, const NDArray A_weights_array, const CSRMatrix& B, + const NDArray B_weights_array) { + const int m = A.num_rows; + const int n = A.num_cols; + const int nnzA = A.indices->shape[0]; + const int nnzB = B.indices->shape[0]; + int nnzC; + const DType alpha = 1.0; + const DType beta = 1.0; + auto ctx = A.indptr->ctx; + auto device = runtime::DeviceAPI::Get(ctx); + auto* thr_entry = runtime::CUDAThreadEntry::ThreadLocal(); + cudaStream_t stream = runtime::getCurrentCUDAStream(); + const DType* A_weights = A_weights_array.Ptr(); + const DType* B_weights = B_weights_array.Ptr(); + // allocate cusparse handle if needed + if (!thr_entry->cusparse_handle) + CUSPARSE_CALL(cusparseCreate(&(thr_entry->cusparse_handle))); + CUSPARSE_CALL(cusparseSetStream(thr_entry->cusparse_handle, stream)); + + cusparseMatDescr_t matA, matB, matC; + CUSPARSE_CALL(cusparseCreateMatDescr(&matA)); + CUSPARSE_CALL(cusparseCreateMatDescr(&matB)); + CUSPARSE_CALL(cusparseCreateMatDescr(&matC)); + + cusparseSetPointerMode( + thr_entry->cusparse_handle, CUSPARSE_POINTER_MODE_HOST); + size_t workspace_size = 0; + /* prepare output C */ + IdArray dC_csrOffsets = IdArray::Empty({m + 1}, A.indptr->dtype, ctx); + IdType* dC_csrOffsets_data = dC_csrOffsets.Ptr(); + IdArray dC_columns; + NDArray dC_weights; + IdType* dC_columns_data = dC_columns.Ptr(); + DType* dC_weights_data = dC_weights.Ptr(); + /* prepare buffer */ + CUSPARSE_CALL(CSRGEAM::bufferSizeExt( + thr_entry->cusparse_handle, m, n, &alpha, matA, nnzA, A_weights, + A.indptr.Ptr(), A.indices.Ptr(), &beta, matB, nnzB, + B_weights, B.indptr.Ptr(), B.indices.Ptr(), matC, + dC_weights_data, dC_csrOffsets_data, dC_columns_data, &workspace_size)); + + void* workspace = device->AllocWorkspace(ctx, workspace_size); + CUSPARSE_CALL(CSRGEAM::nnz( + thr_entry->cusparse_handle, m, n, matA, nnzA, A.indptr.Ptr(), + A.indices.Ptr(), matB, nnzB, B.indptr.Ptr(), + B.indices.Ptr(), matC, dC_csrOffsets_data, &nnzC, workspace)); + + dC_columns = IdArray::Empty({nnzC}, A.indptr->dtype, ctx); + dC_weights = NDArray::Empty({nnzC}, A_weights_array->dtype, ctx); + dC_columns_data = dC_columns.Ptr(); + dC_weights_data = dC_weights.Ptr(); + + CUSPARSE_CALL(CSRGEAM::compute( + thr_entry->cusparse_handle, m, n, &alpha, matA, nnzA, A_weights, + A.indptr.Ptr(), A.indices.Ptr(), &beta, matB, nnzB, + B_weights, B.indptr.Ptr(), B.indices.Ptr(), matC, + dC_weights_data, dC_csrOffsets_data, dC_columns_data, workspace)); + + device->FreeWorkspace(ctx, workspace); + // destroy matrix/vector descriptors + CUSPARSE_CALL(cusparseDestroyMatDescr(matA)); + CUSPARSE_CALL(cusparseDestroyMatDescr(matB)); + CUSPARSE_CALL(cusparseDestroyMatDescr(matC)); + return { + CSRMatrix( + A.num_rows, A.num_cols, dC_csrOffsets, dC_columns, + NullArray(dC_csrOffsets->dtype, dC_csrOffsets->ctx), true), + dC_weights}; +} +} // namespace cusparse + +template +std::pair CSRSum( + const std::vector& As, const std::vector& A_weights) { + const int64_t M = As[0].num_rows; + const int64_t N = As[0].num_cols; + const int64_t n = As.size(); + + // Cast 64 bit indices to 32 bit + std::vector newAs; + newAs.reserve(n); + bool cast = false; + if (As[0].indptr->dtype.bits == 64) { + for (int i = 0; i < 
n; ++i) + newAs.emplace_back( + As[i].num_rows, As[i].num_cols, AsNumBits(As[i].indptr, 32), + AsNumBits(As[i].indices, 32), AsNumBits(As[i].data, 32)); + cast = true; + } else { + for (int i = 0; i < n; ++i) newAs.push_back(As[i]); + } + + // cuSPARSE csrgeam2 requires the CSR to be sorted. + // TODO(BarclayII): ideally the sorted CSR should be cached but I'm not sure + // how to do it. + for (int i = 0; i < n; ++i) { + if (!newAs[i].sorted) newAs[i] = CSRSort(newAs[i]); + } + + // Reorder weights if A[i] has edge IDs + std::vector A_weights_reordered(n); + for (int i = 0; i < n; ++i) { + if (CSRHasData(newAs[i])) + A_weights_reordered[i] = IndexSelect(A_weights[i], newAs[i].data); + else + A_weights_reordered[i] = A_weights[i]; + } + + // Loop and sum + auto result = std::make_pair( + CSRMatrix( + newAs[0].num_rows, newAs[0].num_cols, newAs[0].indptr, + newAs[0].indices, + NullArray(newAs[0].indptr->dtype, newAs[0].indptr->ctx)), + A_weights_reordered[0]); // Weights already reordered so we don't need + // As[0].data + for (int64_t i = 1; i < n; ++i) + result = cusparse::CusparseCsrgeam2( + result.first, result.second, newAs[i], A_weights_reordered[i]); + + // Cast 32 bit indices back to 64 bit if necessary + if (cast) { + CSRMatrix C = result.first; + return { + CSRMatrix( + C.num_rows, C.num_cols, AsNumBits(C.indptr, 64), + AsNumBits(C.indices, 64), AsNumBits(C.data, 64), true), + result.second}; + } else { + return result; + } +} + +template std::pair CSRSum( + const std::vector&, const std::vector&); +template std::pair CSRSum( + const std::vector&, const std::vector&); +#if BF16_ENABLED +template std::pair CSRSum( + const std::vector&, const std::vector&); +template std::pair CSRSum( + const std::vector&, const std::vector&); +#endif // BF16_ENABLED +template std::pair CSRSum( + const std::vector&, const std::vector&); +template std::pair CSRSum( + const std::vector&, const std::vector&); +template std::pair CSRSum( + const std::vector&, const std::vector&); +template std::pair CSRSum( + const std::vector&, const std::vector&); + +} // namespace aten +} // namespace dgl diff --git a/src/array/cuda/csr_transpose.cc b/src/array/cuda/csr_transpose.cc index 60dee39048cb..c5321f431fe4 100644 --- a/src/array/cuda/csr_transpose.cc +++ b/src/array/cuda/csr_transpose.cc @@ -24,12 +24,12 @@ template <> CSRMatrix CSRTranspose(CSRMatrix csr) { #if CUDART_VERSION < 12000 auto* thr_entry = runtime::CUDAThreadEntry::ThreadLocal(); - cudaStream_t stream = runtime::getCurrentCUDAStream(); + hipStream_t stream = runtime::getCurrentCUDAStream(); // allocate cusparse handle if needed if (!thr_entry->cusparse_handle) { - CUSPARSE_CALL(cusparseCreate(&(thr_entry->cusparse_handle))); + CUSPARSE_CALL(hipsparseCreate(&(thr_entry->cusparse_handle))); } - CUSPARSE_CALL(cusparseSetStream(thr_entry->cusparse_handle, stream)); + CUSPARSE_CALL(hipsparseSetStream(thr_entry->cusparse_handle, stream)); NDArray indptr = csr.indptr, indices = csr.indices, data = csr.data; const int64_t nnz = indices->shape[0]; @@ -53,26 +53,26 @@ CSRMatrix CSRTranspose(CSRMatrix csr) { auto device = runtime::DeviceAPI::Get(csr.indptr->ctx); // workspace size_t workspace_size; - CUSPARSE_CALL(cusparseCsr2cscEx2_bufferSize( + CUSPARSE_CALL(hipsparseCsr2cscEx2_bufferSize( thr_entry->cusparse_handle, csr.num_rows, csr.num_cols, nnz, data_ptr, indptr_ptr, indices_ptr, t_data_ptr, t_indptr_ptr, t_indices_ptr, - CUDA_R_32F, CUSPARSE_ACTION_NUMERIC, CUSPARSE_INDEX_BASE_ZERO, - CUSPARSE_CSR2CSC_ALG1, // see cusparse doc for reference + 
HIP_R_32F, HIPSPARSE_ACTION_NUMERIC, HIPSPARSE_INDEX_BASE_ZERO, + HIPSPARSE_CSR2CSC_ALG1, // see cusparse doc for reference &workspace_size)); void* workspace = device->AllocWorkspace(ctx, workspace_size); - CUSPARSE_CALL(cusparseCsr2cscEx2( + CUSPARSE_CALL(hipsparseCsr2cscEx2( thr_entry->cusparse_handle, csr.num_rows, csr.num_cols, nnz, data_ptr, indptr_ptr, indices_ptr, t_data_ptr, t_indptr_ptr, t_indices_ptr, - CUDA_R_32F, CUSPARSE_ACTION_NUMERIC, CUSPARSE_INDEX_BASE_ZERO, - CUSPARSE_CSR2CSC_ALG1, // see cusparse doc for reference + HIP_R_32F, HIPSPARSE_ACTION_NUMERIC, HIPSPARSE_INDEX_BASE_ZERO, + HIPSPARSE_CSR2CSC_ALG1, // see cusparse doc for reference workspace)); device->FreeWorkspace(ctx, workspace); #else - CUSPARSE_CALL(cusparseScsr2csc( + CUSPARSE_CALL(hipsparseScsr2csc( thr_entry->cusparse_handle, csr.num_rows, csr.num_cols, nnz, static_cast(data_ptr), indptr_ptr, indices_ptr, static_cast(t_data_ptr), t_indices_ptr, t_indptr_ptr, - CUSPARSE_ACTION_NUMERIC, CUSPARSE_INDEX_BASE_ZERO)); + HIPSPARSE_ACTION_NUMERIC, HIPSPARSE_INDEX_BASE_ZERO)); #endif return CSRMatrix( diff --git a/src/array/cuda/csr_transpose.cc.prehip b/src/array/cuda/csr_transpose.cc.prehip new file mode 100644 index 000000000000..60dee39048cb --- /dev/null +++ b/src/array/cuda/csr_transpose.cc.prehip @@ -0,0 +1,95 @@ +/** + * Copyright (c) 2020 by Contributors + * @file array/cuda/csr_transpose.cc + * @brief CSR transpose (convert to CSC) + */ +#include + +#include "../../runtime/cuda/cuda_common.h" + +namespace dgl { + +using runtime::NDArray; + +namespace aten { +namespace impl { + +template +CSRMatrix CSRTranspose(CSRMatrix csr) { + LOG(FATAL) << "Unreachable codes"; + return {}; +} + +template <> +CSRMatrix CSRTranspose(CSRMatrix csr) { +#if CUDART_VERSION < 12000 + auto* thr_entry = runtime::CUDAThreadEntry::ThreadLocal(); + cudaStream_t stream = runtime::getCurrentCUDAStream(); + // allocate cusparse handle if needed + if (!thr_entry->cusparse_handle) { + CUSPARSE_CALL(cusparseCreate(&(thr_entry->cusparse_handle))); + } + CUSPARSE_CALL(cusparseSetStream(thr_entry->cusparse_handle, stream)); + + NDArray indptr = csr.indptr, indices = csr.indices, data = csr.data; + const int64_t nnz = indices->shape[0]; + const auto& ctx = indptr->ctx; + const auto bits = indptr->dtype.bits; + if (aten::IsNullArray(data)) data = aten::Range(0, nnz, bits, ctx); + const int32_t* indptr_ptr = static_cast(indptr->data); + const int32_t* indices_ptr = static_cast(indices->data); + const void* data_ptr = data->data; + + // (BarclayII) csr2csc doesn't seem to clear the content of cscColPtr if nnz + // == 0. We need to do it ourselves. 
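For reference only — not part of this patch: this is what the csr2csc conversion below computes, written as plain CPU C++. Count nonzeros per column, prefix-sum the counts into column pointers, then scatter each entry; zero-initializing the column pointers also covers the nnz == 0 case noted in the comment above.

#include <cstdint>
#include <vector>

void CsrToCscCpu(
    int64_t num_rows, int64_t num_cols, const std::vector<int32_t>& indptr,
    const std::vector<int32_t>& indices, const std::vector<int32_t>& data,
    std::vector<int32_t>* t_indptr, std::vector<int32_t>* t_indices,
    std::vector<int32_t>* t_data) {
  const int64_t nnz = static_cast<int64_t>(indices.size());
  t_indptr->assign(num_cols + 1, 0);  // all zeros: also correct when nnz == 0
  t_indices->assign(nnz, 0);
  t_data->assign(nnz, 0);
  // Count entries per column, then turn counts into column offsets.
  for (int64_t e = 0; e < nnz; ++e) (*t_indptr)[indices[e] + 1] += 1;
  for (int64_t c = 0; c < num_cols; ++c) (*t_indptr)[c + 1] += (*t_indptr)[c];
  // Scatter each nonzero into its column, tracking a per-column cursor.
  std::vector<int32_t> cursor(t_indptr->begin(), t_indptr->end() - 1);
  for (int64_t r = 0; r < num_rows; ++r) {
    for (int32_t e = indptr[r]; e < indptr[r + 1]; ++e) {
      const int32_t c = indices[e];
      const int32_t pos = cursor[c]++;
      (*t_indices)[pos] = static_cast<int32_t>(r);
      (*t_data)[pos] = data[e];
    }
  }
}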
+ NDArray t_indptr = aten::Full(0, csr.num_cols + 1, bits, ctx); + NDArray t_indices = aten::NewIdArray(nnz, ctx, bits); + NDArray t_data = aten::NewIdArray(nnz, ctx, bits); + int32_t* t_indptr_ptr = static_cast(t_indptr->data); + int32_t* t_indices_ptr = static_cast(t_indices->data); + void* t_data_ptr = t_data->data; + +#if CUDART_VERSION >= 10010 + auto device = runtime::DeviceAPI::Get(csr.indptr->ctx); + // workspace + size_t workspace_size; + CUSPARSE_CALL(cusparseCsr2cscEx2_bufferSize( + thr_entry->cusparse_handle, csr.num_rows, csr.num_cols, nnz, data_ptr, + indptr_ptr, indices_ptr, t_data_ptr, t_indptr_ptr, t_indices_ptr, + CUDA_R_32F, CUSPARSE_ACTION_NUMERIC, CUSPARSE_INDEX_BASE_ZERO, + CUSPARSE_CSR2CSC_ALG1, // see cusparse doc for reference + &workspace_size)); + void* workspace = device->AllocWorkspace(ctx, workspace_size); + CUSPARSE_CALL(cusparseCsr2cscEx2( + thr_entry->cusparse_handle, csr.num_rows, csr.num_cols, nnz, data_ptr, + indptr_ptr, indices_ptr, t_data_ptr, t_indptr_ptr, t_indices_ptr, + CUDA_R_32F, CUSPARSE_ACTION_NUMERIC, CUSPARSE_INDEX_BASE_ZERO, + CUSPARSE_CSR2CSC_ALG1, // see cusparse doc for reference + workspace)); + device->FreeWorkspace(ctx, workspace); +#else + CUSPARSE_CALL(cusparseScsr2csc( + thr_entry->cusparse_handle, csr.num_rows, csr.num_cols, nnz, + static_cast(data_ptr), indptr_ptr, indices_ptr, + static_cast(t_data_ptr), t_indices_ptr, t_indptr_ptr, + CUSPARSE_ACTION_NUMERIC, CUSPARSE_INDEX_BASE_ZERO)); +#endif + + return CSRMatrix( + csr.num_cols, csr.num_rows, t_indptr, t_indices, t_data, false); +#else + return COOToCSR(COOTranspose(CSRToCOO(csr, false))); +#endif +} + +template <> +CSRMatrix CSRTranspose(CSRMatrix csr) { + return COOToCSR(COOTranspose(CSRToCOO(csr, false))); +} + +template CSRMatrix CSRTranspose(CSRMatrix csr); +template CSRMatrix CSRTranspose(CSRMatrix csr); + +} // namespace impl +} // namespace aten +} // namespace dgl diff --git a/src/array/cuda/cuda_filter.cu b/src/array/cuda/cuda_filter.cu index 9bbd18f42307..72f727d40554 100644 --- a/src/array/cuda/cuda_filter.cu +++ b/src/array/cuda/cuda_filter.cu @@ -1,3 +1,4 @@ +#include "hip/hip_runtime.h" /** * Copyright (c) 2021 by Contributors * @file array/cuda/cuda_filter.cc @@ -6,7 +7,7 @@ #include -#include +#include #include "../../runtime/cuda/cuda_common.h" #include "../../runtime/cuda/cuda_hashtable.cuh" @@ -45,7 +46,7 @@ IdArray _PerformFilter(const OrderedHashTable& table, IdArray test) { const auto& ctx = test->ctx; auto device = runtime::DeviceAPI::Get(ctx); const int64_t size = test->shape[0]; - cudaStream_t cudaStream = runtime::getCurrentCUDAStream(); + hipStream_t cudaStream = runtime::getCurrentCUDAStream(); if (size == 0) { return test; @@ -74,12 +75,12 @@ IdArray _PerformFilter(const OrderedHashTable& table, IdArray test) { // generate prefix-sum { size_t workspace_bytes; - CUDA_CALL(cub::DeviceScan::ExclusiveSum( + CUDA_CALL(hipcub::DeviceScan::ExclusiveSum( nullptr, workspace_bytes, static_cast(nullptr), static_cast(nullptr), size + 1, cudaStream)); void* workspace = device->AllocWorkspace(ctx, workspace_bytes); - CUDA_CALL(cub::DeviceScan::ExclusiveSum( + CUDA_CALL(hipcub::DeviceScan::ExclusiveSum( workspace, workspace_bytes, prefix, prefix, size + 1, cudaStream)); device->FreeWorkspace(ctx, workspace); } @@ -109,7 +110,7 @@ class CudaFilterSet : public Filter { public: explicit CudaFilterSet(IdArray array) : table_(array->shape[0], array->ctx, runtime::getCurrentCUDAStream()) { - cudaStream_t cudaStream = runtime::getCurrentCUDAStream(); + hipStream_t 
cudaStream = runtime::getCurrentCUDAStream(); table_.FillWithUnique( static_cast(array->data), array->shape[0], cudaStream); } diff --git a/src/array/cuda/cuda_filter.cu.prehip b/src/array/cuda/cuda_filter.cu.prehip new file mode 100644 index 000000000000..9bbd18f42307 --- /dev/null +++ b/src/array/cuda/cuda_filter.cu.prehip @@ -0,0 +1,140 @@ +/** + * Copyright (c) 2021 by Contributors + * @file array/cuda/cuda_filter.cc + * @brief Object for selecting items in a set, or selecting items not in a set. + */ + +#include + +#include + +#include "../../runtime/cuda/cuda_common.h" +#include "../../runtime/cuda/cuda_hashtable.cuh" +#include "../filter.h" + +using namespace dgl::runtime::cuda; + +namespace dgl { +namespace array { + +namespace { + +template +__global__ void _IsInKernel( + DeviceOrderedHashTable table, const IdType* const array, + const int64_t size, IdType* const mark) { + const int64_t idx = threadIdx.x + blockDim.x * blockIdx.x; + if (idx < size) { + mark[idx] = table.Contains(array[idx]) ^ (!include); + } +} + +template +__global__ void _InsertKernel( + const IdType* const prefix, const int64_t size, IdType* const result) { + const int64_t idx = threadIdx.x + blockDim.x * blockIdx.x; + if (idx < size) { + if (prefix[idx] != prefix[idx + 1]) { + result[prefix[idx]] = idx; + } + } +} + +template +IdArray _PerformFilter(const OrderedHashTable& table, IdArray test) { + const auto& ctx = test->ctx; + auto device = runtime::DeviceAPI::Get(ctx); + const int64_t size = test->shape[0]; + cudaStream_t cudaStream = runtime::getCurrentCUDAStream(); + + if (size == 0) { + return test; + } + + // we need two arrays: 1) to act as a prefixsum + // for the number of entries that will be inserted, and + // 2) to collect the included items. + IdType* prefix = static_cast( + device->AllocWorkspace(ctx, sizeof(IdType) * (size + 1))); + + // will resize down later + IdArray result = aten::NewIdArray(size, ctx, sizeof(IdType) * 8); + + // mark each index based on it's existence in the hashtable + { + const dim3 block(256); + const dim3 grid((size + block.x - 1) / block.x); + + CUDA_KERNEL_CALL( + (_IsInKernel), grid, block, 0, cudaStream, + table.DeviceHandle(), static_cast(test->data), size, + prefix); + } + + // generate prefix-sum + { + size_t workspace_bytes; + CUDA_CALL(cub::DeviceScan::ExclusiveSum( + nullptr, workspace_bytes, static_cast(nullptr), + static_cast(nullptr), size + 1, cudaStream)); + void* workspace = device->AllocWorkspace(ctx, workspace_bytes); + + CUDA_CALL(cub::DeviceScan::ExclusiveSum( + workspace, workspace_bytes, prefix, prefix, size + 1, cudaStream)); + device->FreeWorkspace(ctx, workspace); + } + + // copy number using the internal current stream; + IdType num_unique; + device->CopyDataFromTo( + prefix + size, 0, &num_unique, 0, sizeof(num_unique), ctx, + DGLContext{kDGLCPU, 0}, test->dtype); + + // insert items into set + { + const dim3 block(256); + const dim3 grid((size + block.x - 1) / block.x); + + CUDA_KERNEL_CALL( + _InsertKernel, grid, block, 0, cudaStream, prefix, size, + static_cast(result->data)); + } + device->FreeWorkspace(ctx, prefix); + + return result.CreateView({num_unique}, result->dtype); +} + +template +class CudaFilterSet : public Filter { + public: + explicit CudaFilterSet(IdArray array) + : table_(array->shape[0], array->ctx, runtime::getCurrentCUDAStream()) { + cudaStream_t cudaStream = runtime::getCurrentCUDAStream(); + table_.FillWithUnique( + static_cast(array->data), array->shape[0], cudaStream); + } + + IdArray find_included_indices(IdArray 
test) override { + return _PerformFilter(table_, test); + } + + IdArray find_excluded_indices(IdArray test) override { + return _PerformFilter(table_, test); + } + + private: + OrderedHashTable table_; +}; + +} // namespace + +template +FilterRef CreateSetFilter(IdArray set) { + return FilterRef(std::make_shared>(set)); +} + +template FilterRef CreateSetFilter(IdArray set); +template FilterRef CreateSetFilter(IdArray set); + +} // namespace array +} // namespace dgl diff --git a/src/array/cuda/cusparse_dispatcher.cuh b/src/array/cuda/cusparse_dispatcher.cuh index 0d9152b59145..5781654985f6 100644 --- a/src/array/cuda/cusparse_dispatcher.cuh +++ b/src/array/cuda/cusparse_dispatcher.cuh @@ -7,7 +7,7 @@ #ifndef DGL_ARRAY_CUDA_CUSPARSE_DISPATCHER_CUH_ #define DGL_ARRAY_CUDA_CUSPARSE_DISPATCHER_CUH_ -#include +#include #include #include "bf16.cuh" @@ -20,70 +20,70 @@ namespace aten { template struct CSRGEMM { template - static inline cusparseStatus_t bufferSizeExt(Args... args) { + static inline hipsparseStatus_t bufferSizeExt(Args... args) { BUG_IF_FAIL(false) << "This piece of code should not be reached."; - return static_cast(0); + return static_cast(0); } template - static inline cusparseStatus_t nnz(Args... args) { - return cusparseXcsrgemm2Nnz(args...); + static inline hipsparseStatus_t nnz(Args... args) { + return hipsparseXcsrgemm2Nnz(args...); } template - static inline cusparseStatus_t compute(Args... args) { + static inline hipsparseStatus_t compute(Args... args) { BUG_IF_FAIL(false) << "This piece of code should not be reached."; - return static_cast(0); + return static_cast(0); } }; template <> struct CSRGEMM<__half> { template - static inline cusparseStatus_t bufferSizeExt(Args... args) { + static inline hipsparseStatus_t bufferSizeExt(Args... args) { // TODO(ndickson): There is no cusparseHcsrgemm2_bufferSizeExt, so a // different implementation would be required. LOG(FATAL) << "CSRGEMM::bufferSizeExt does not support dtype half (FP16)."; - return static_cast(0); + return static_cast(0); } template - static inline cusparseStatus_t nnz(Args... args) { - return cusparseXcsrgemm2Nnz(args...); + static inline hipsparseStatus_t nnz(Args... args) { + return hipsparseXcsrgemm2Nnz(args...); } template - static inline cusparseStatus_t compute(Args... args) { + static inline hipsparseStatus_t compute(Args... args) { // TODO(ndickson): There is no cusparseHcsrgemm2, so a different // implementation would be required. LOG(FATAL) << "CSRGEMM::compute does not support dtype half (FP16)."; - return static_cast(0); + return static_cast(0); } }; #if BF16_ENABLED template <> -struct CSRGEMM<__nv_bfloat16> { +struct CSRGEMM<__hip_bfloat16> { template - static inline cusparseStatus_t bufferSizeExt(Args... args) { + static inline hipsparseStatus_t bufferSizeExt(Args... args) { // TODO(ndickson): There is no cusparseHcsrgemm2_bufferSizeExt, so a // different implementation would be required. LOG(FATAL) << "CSRGEMM::bufferSizeExt does not support dtype bfloat16 (BF16)."; - return static_cast(0); + return static_cast(0); } template - static inline cusparseStatus_t nnz(Args... args) { - return cusparseXcsrgemm2Nnz(args...); + static inline hipsparseStatus_t nnz(Args... args) { + return hipsparseXcsrgemm2Nnz(args...); } template - static inline cusparseStatus_t compute(Args... args) { + static inline hipsparseStatus_t compute(Args... args) { // TODO(ndickson): There is no cusparseHcsrgemm2, so a different // implementation would be required. 
LOG(FATAL) << "CSRGEMM::compute does not support dtype bfloat16 (BF16)."; - return static_cast(0); + return static_cast(0); } }; #endif // BF16_ENABLED @@ -91,36 +91,36 @@ struct CSRGEMM<__nv_bfloat16> { template <> struct CSRGEMM { template - static inline cusparseStatus_t bufferSizeExt(Args... args) { - return cusparseScsrgemm2_bufferSizeExt(args...); + static inline hipsparseStatus_t bufferSizeExt(Args... args) { + return hipsparseScsrgemm2_bufferSizeExt(args...); } template - static inline cusparseStatus_t nnz(Args... args) { - return cusparseXcsrgemm2Nnz(args...); + static inline hipsparseStatus_t nnz(Args... args) { + return hipsparseXcsrgemm2Nnz(args...); } template - static inline cusparseStatus_t compute(Args... args) { - return cusparseScsrgemm2(args...); + static inline hipsparseStatus_t compute(Args... args) { + return hipsparseScsrgemm2(args...); } }; template <> struct CSRGEMM { template - static inline cusparseStatus_t bufferSizeExt(Args... args) { - return cusparseDcsrgemm2_bufferSizeExt(args...); + static inline hipsparseStatus_t bufferSizeExt(Args... args) { + return hipsparseDcsrgemm2_bufferSizeExt(args...); } template - static inline cusparseStatus_t nnz(Args... args) { - return cusparseXcsrgemm2Nnz(args...); + static inline hipsparseStatus_t nnz(Args... args) { + return hipsparseXcsrgemm2Nnz(args...); } template - static inline cusparseStatus_t compute(Args... args) { - return cusparseDcsrgemm2(args...); + static inline hipsparseStatus_t compute(Args... args) { + return hipsparseDcsrgemm2(args...); } }; @@ -128,70 +128,70 @@ struct CSRGEMM { template struct CSRGEAM { template - static inline cusparseStatus_t bufferSizeExt(Args... args) { + static inline hipsparseStatus_t bufferSizeExt(Args... args) { BUG_IF_FAIL(false) << "This piece of code should not be reached."; - return static_cast(0); + return static_cast(0); } template - static inline cusparseStatus_t nnz(Args... args) { - return cusparseXcsrgeam2Nnz(args...); + static inline hipsparseStatus_t nnz(Args... args) { + return hipsparseXcsrgeam2Nnz(args...); } template - static inline cusparseStatus_t compute(Args... args) { + static inline hipsparseStatus_t compute(Args... args) { BUG_IF_FAIL(false) << "This piece of code should not be reached."; - return static_cast(0); + return static_cast(0); } }; template <> struct CSRGEAM<__half> { template - static inline cusparseStatus_t bufferSizeExt(Args... args) { + static inline hipsparseStatus_t bufferSizeExt(Args... args) { // TODO(ndickson): There is no cusparseHcsrgeam2_bufferSizeExt, so a // different implementation would be required. LOG(FATAL) << "CSRGEAM::bufferSizeExt does not support dtype half (FP16)."; - return static_cast(0); + return static_cast(0); } template - static inline cusparseStatus_t nnz(Args... args) { - return cusparseXcsrgeam2Nnz(args...); + static inline hipsparseStatus_t nnz(Args... args) { + return hipsparseXcsrgeam2Nnz(args...); } template - static inline cusparseStatus_t compute(Args... args) { + static inline hipsparseStatus_t compute(Args... args) { // TODO(ndickson): There is no cusparseHcsrgeam2, so a different // implementation would be required. LOG(FATAL) << "CSRGEAM::compute does not support dtype half (FP16)."; - return static_cast(0); + return static_cast(0); } }; #if BF16_ENABLED template <> -struct CSRGEAM<__nv_bfloat16> { +struct CSRGEAM<__hip_bfloat16> { template - static inline cusparseStatus_t bufferSizeExt(Args... args) { + static inline hipsparseStatus_t bufferSizeExt(Args... 
args) { // TODO(ndickson): There is no cusparseHcsrgeam2_bufferSizeExt, so a // different implementation would be required. LOG(FATAL) << "CSRGEAM::bufferSizeExt does not support dtype bfloat16 (BF16)."; - return static_cast(0); + return static_cast(0); } template - static inline cusparseStatus_t nnz(Args... args) { - return cusparseXcsrgeam2Nnz(args...); + static inline hipsparseStatus_t nnz(Args... args) { + return hipsparseXcsrgeam2Nnz(args...); } template - static inline cusparseStatus_t compute(Args... args) { + static inline hipsparseStatus_t compute(Args... args) { // TODO(ndickson): There is no cusparseHcsrgeam2, so a different // implementation would be required. LOG(FATAL) << "CSRGEAM::compute does not support dtype bfloat16 (BF16)."; - return static_cast(0); + return static_cast(0); } }; #endif // BF16_ENABLED @@ -199,36 +199,36 @@ struct CSRGEAM<__nv_bfloat16> { template <> struct CSRGEAM { template - static inline cusparseStatus_t bufferSizeExt(Args... args) { - return cusparseScsrgeam2_bufferSizeExt(args...); + static inline hipsparseStatus_t bufferSizeExt(Args... args) { + return hipsparseScsrgeam2_bufferSizeExt(args...); } template - static inline cusparseStatus_t nnz(Args... args) { - return cusparseXcsrgeam2Nnz(args...); + static inline hipsparseStatus_t nnz(Args... args) { + return hipsparseXcsrgeam2Nnz(args...); } template - static inline cusparseStatus_t compute(Args... args) { - return cusparseScsrgeam2(args...); + static inline hipsparseStatus_t compute(Args... args) { + return hipsparseScsrgeam2(args...); } }; template <> struct CSRGEAM { template - static inline cusparseStatus_t bufferSizeExt(Args... args) { - return cusparseDcsrgeam2_bufferSizeExt(args...); + static inline hipsparseStatus_t bufferSizeExt(Args... args) { + return hipsparseDcsrgeam2_bufferSizeExt(args...); } template - static inline cusparseStatus_t nnz(Args... args) { - return cusparseXcsrgeam2Nnz(args...); + static inline hipsparseStatus_t nnz(Args... args) { + return hipsparseXcsrgeam2Nnz(args...); } template - static inline cusparseStatus_t compute(Args... args) { - return cusparseDcsrgeam2(args...); + static inline hipsparseStatus_t compute(Args... args) { + return hipsparseDcsrgeam2(args...); } }; diff --git a/src/array/cuda/cusparse_dispatcher.cuh.prehip b/src/array/cuda/cusparse_dispatcher.cuh.prehip new file mode 100644 index 000000000000..0d9152b59145 --- /dev/null +++ b/src/array/cuda/cusparse_dispatcher.cuh.prehip @@ -0,0 +1,238 @@ +/** + * Copyright (c) 2020 by Contributors + * @file array/cuda/dispatcher.cuh + * @brief Templates to dispatch into different cuSPARSE routines based on the + * type argument. + */ +#ifndef DGL_ARRAY_CUDA_CUSPARSE_DISPATCHER_CUH_ +#define DGL_ARRAY_CUDA_CUSPARSE_DISPATCHER_CUH_ + +#include +#include + +#include "bf16.cuh" +#include "fp16.cuh" + +namespace dgl { +namespace aten { + +/** @brief cusparseXcsrgemm dispatcher */ +template +struct CSRGEMM { + template + static inline cusparseStatus_t bufferSizeExt(Args... args) { + BUG_IF_FAIL(false) << "This piece of code should not be reached."; + return static_cast(0); + } + + template + static inline cusparseStatus_t nnz(Args... args) { + return cusparseXcsrgemm2Nnz(args...); + } + + template + static inline cusparseStatus_t compute(Args... args) { + BUG_IF_FAIL(false) << "This piece of code should not be reached."; + return static_cast(0); + } +}; + +template <> +struct CSRGEMM<__half> { + template + static inline cusparseStatus_t bufferSizeExt(Args... 
args) { + // TODO(ndickson): There is no cusparseHcsrgemm2_bufferSizeExt, so a + // different implementation would be required. + LOG(FATAL) << "CSRGEMM::bufferSizeExt does not support dtype half (FP16)."; + return static_cast(0); + } + + template + static inline cusparseStatus_t nnz(Args... args) { + return cusparseXcsrgemm2Nnz(args...); + } + + template + static inline cusparseStatus_t compute(Args... args) { + // TODO(ndickson): There is no cusparseHcsrgemm2, so a different + // implementation would be required. + LOG(FATAL) << "CSRGEMM::compute does not support dtype half (FP16)."; + return static_cast(0); + } +}; + +#if BF16_ENABLED +template <> +struct CSRGEMM<__nv_bfloat16> { + template + static inline cusparseStatus_t bufferSizeExt(Args... args) { + // TODO(ndickson): There is no cusparseHcsrgemm2_bufferSizeExt, so a + // different implementation would be required. + LOG(FATAL) + << "CSRGEMM::bufferSizeExt does not support dtype bfloat16 (BF16)."; + return static_cast(0); + } + + template + static inline cusparseStatus_t nnz(Args... args) { + return cusparseXcsrgemm2Nnz(args...); + } + + template + static inline cusparseStatus_t compute(Args... args) { + // TODO(ndickson): There is no cusparseHcsrgemm2, so a different + // implementation would be required. + LOG(FATAL) << "CSRGEMM::compute does not support dtype bfloat16 (BF16)."; + return static_cast(0); + } +}; +#endif // BF16_ENABLED + +template <> +struct CSRGEMM { + template + static inline cusparseStatus_t bufferSizeExt(Args... args) { + return cusparseScsrgemm2_bufferSizeExt(args...); + } + + template + static inline cusparseStatus_t nnz(Args... args) { + return cusparseXcsrgemm2Nnz(args...); + } + + template + static inline cusparseStatus_t compute(Args... args) { + return cusparseScsrgemm2(args...); + } +}; + +template <> +struct CSRGEMM { + template + static inline cusparseStatus_t bufferSizeExt(Args... args) { + return cusparseDcsrgemm2_bufferSizeExt(args...); + } + + template + static inline cusparseStatus_t nnz(Args... args) { + return cusparseXcsrgemm2Nnz(args...); + } + + template + static inline cusparseStatus_t compute(Args... args) { + return cusparseDcsrgemm2(args...); + } +}; + +/** @brief cusparseXcsrgeam dispatcher */ +template +struct CSRGEAM { + template + static inline cusparseStatus_t bufferSizeExt(Args... args) { + BUG_IF_FAIL(false) << "This piece of code should not be reached."; + return static_cast(0); + } + + template + static inline cusparseStatus_t nnz(Args... args) { + return cusparseXcsrgeam2Nnz(args...); + } + + template + static inline cusparseStatus_t compute(Args... args) { + BUG_IF_FAIL(false) << "This piece of code should not be reached."; + return static_cast(0); + } +}; + +template <> +struct CSRGEAM<__half> { + template + static inline cusparseStatus_t bufferSizeExt(Args... args) { + // TODO(ndickson): There is no cusparseHcsrgeam2_bufferSizeExt, so a + // different implementation would be required. + LOG(FATAL) << "CSRGEAM::bufferSizeExt does not support dtype half (FP16)."; + return static_cast(0); + } + + template + static inline cusparseStatus_t nnz(Args... args) { + return cusparseXcsrgeam2Nnz(args...); + } + + template + static inline cusparseStatus_t compute(Args... args) { + // TODO(ndickson): There is no cusparseHcsrgeam2, so a different + // implementation would be required. 
+ LOG(FATAL) << "CSRGEAM::compute does not support dtype half (FP16)."; + return static_cast(0); + } +}; + +#if BF16_ENABLED +template <> +struct CSRGEAM<__nv_bfloat16> { + template + static inline cusparseStatus_t bufferSizeExt(Args... args) { + // TODO(ndickson): There is no cusparseHcsrgeam2_bufferSizeExt, so a + // different implementation would be required. + LOG(FATAL) + << "CSRGEAM::bufferSizeExt does not support dtype bfloat16 (BF16)."; + return static_cast(0); + } + + template + static inline cusparseStatus_t nnz(Args... args) { + return cusparseXcsrgeam2Nnz(args...); + } + + template + static inline cusparseStatus_t compute(Args... args) { + // TODO(ndickson): There is no cusparseHcsrgeam2, so a different + // implementation would be required. + LOG(FATAL) << "CSRGEAM::compute does not support dtype bfloat16 (BF16)."; + return static_cast(0); + } +}; +#endif // BF16_ENABLED + +template <> +struct CSRGEAM { + template + static inline cusparseStatus_t bufferSizeExt(Args... args) { + return cusparseScsrgeam2_bufferSizeExt(args...); + } + + template + static inline cusparseStatus_t nnz(Args... args) { + return cusparseXcsrgeam2Nnz(args...); + } + + template + static inline cusparseStatus_t compute(Args... args) { + return cusparseScsrgeam2(args...); + } +}; + +template <> +struct CSRGEAM { + template + static inline cusparseStatus_t bufferSizeExt(Args... args) { + return cusparseDcsrgeam2_bufferSizeExt(args...); + } + + template + static inline cusparseStatus_t nnz(Args... args) { + return cusparseXcsrgeam2Nnz(args...); + } + + template + static inline cusparseStatus_t compute(Args... args) { + return cusparseDcsrgeam2(args...); + } +}; + +}; // namespace aten +}; // namespace dgl + +#endif // DGL_ARRAY_CUDA_CUSPARSE_DISPATCHER_CUH_ diff --git a/src/array/cuda/disjoint_union.cu b/src/array/cuda/disjoint_union.cu index 90cc07bd3f4c..d5b602900850 100644 --- a/src/array/cuda/disjoint_union.cu +++ b/src/array/cuda/disjoint_union.cu @@ -1,3 +1,4 @@ +#include "hip/hip_runtime.h" /** * Copyright (c) 2022, NVIDIA CORPORATION. * @@ -78,7 +79,7 @@ std::tuple _ComputePrefixSums( template void _Merge( IdType** arrs, IdType* prefix, IdType* offset, IdType* out, int64_t n_arrs, - int n_elms, DGLContext ctx, DGLDataType dtype, cudaStream_t stream) { + int n_elms, DGLContext ctx, DGLDataType dtype, hipStream_t stream) { auto device = runtime::DeviceAPI::Get(ctx); int nt = 256; int nb = (n_elms + nt - 1) / nt; @@ -99,7 +100,7 @@ void _Merge( template COOMatrix DisjointUnionCoo(const std::vector& coos) { - cudaStream_t stream = runtime::getCurrentCUDAStream(); + hipStream_t stream = runtime::getCurrentCUDAStream(); auto device = runtime::DeviceAPI::Get(coos[0].row->ctx); uint64_t src_offset = 0, dst_offset = 0; bool has_data = false; diff --git a/src/array/cuda/disjoint_union.cu.prehip b/src/array/cuda/disjoint_union.cu.prehip new file mode 100644 index 000000000000..90cc07bd3f4c --- /dev/null +++ b/src/array/cuda/disjoint_union.cu.prehip @@ -0,0 +1,185 @@ +/** + * Copyright (c) 2022, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ * See the License for the specific language governing permissions and + * limitations under the License. + * + * @file array/gpu/disjoint_union.cu + * @brief Disjoint union GPU implementation. + */ + +#include +#include + +#include +#include + +#include "../../runtime/cuda/cuda_common.h" +#include "./utils.h" + +namespace dgl { +using runtime::NDArray; +namespace aten { +namespace impl { + +template +__global__ void _DisjointUnionKernel( + IdType** arrs, IdType* prefix, IdType* offset, IdType* out, int64_t n_arrs, + int n_elms) { + IdType tx = static_cast(blockIdx.x) * blockDim.x + threadIdx.x; + const int stride_x = gridDim.x * blockDim.x; + while (tx < n_elms) { + IdType i = dgl::cuda::_UpperBound(offset, n_arrs, tx) - 1; + if (arrs[i] == NULL) { + out[tx] = tx; + } else { + IdType j = tx - offset[i]; + out[tx] = arrs[i][j] + prefix[i]; + } + tx += stride_x; + } +} + +template +std::tuple _ComputePrefixSums( + const std::vector& coos) { + IdType n = coos.size(), nbits = coos[0].row->dtype.bits; + IdArray n_rows = NewIdArray(n, CPU, nbits); + IdArray n_cols = NewIdArray(n, CPU, nbits); + IdArray n_elms = NewIdArray(n, CPU, nbits); + + IdType* n_rows_data = n_rows.Ptr(); + IdType* n_cols_data = n_cols.Ptr(); + IdType* n_elms_data = n_elms.Ptr(); + + dgl::runtime::parallel_for(0, coos.size(), [&](IdType b, IdType e) { + for (IdType i = b; i < e; ++i) { + n_rows_data[i] = coos[i].num_rows; + n_cols_data[i] = coos[i].num_cols; + n_elms_data[i] = coos[i].row->shape[0]; + } + }); + + return std::make_tuple( + CumSum(n_rows.CopyTo(coos[0].row->ctx), true), + CumSum(n_cols.CopyTo(coos[0].row->ctx), true), + CumSum(n_elms.CopyTo(coos[0].row->ctx), true)); +} + +template +void _Merge( + IdType** arrs, IdType* prefix, IdType* offset, IdType* out, int64_t n_arrs, + int n_elms, DGLContext ctx, DGLDataType dtype, cudaStream_t stream) { + auto device = runtime::DeviceAPI::Get(ctx); + int nt = 256; + int nb = (n_elms + nt - 1) / nt; + + IdType** arrs_dev = static_cast( + device->AllocWorkspace(ctx, n_arrs * sizeof(IdType*))); + + device->CopyDataFromTo( + arrs, 0, arrs_dev, 0, sizeof(IdType*) * n_arrs, DGLContext{kDGLCPU, 0}, + ctx, dtype); + + CUDA_KERNEL_CALL( + _DisjointUnionKernel, nb, nt, 0, stream, arrs_dev, prefix, offset, out, + n_arrs, n_elms); + + device->FreeWorkspace(ctx, arrs_dev); +} + +template +COOMatrix DisjointUnionCoo(const std::vector& coos) { + cudaStream_t stream = runtime::getCurrentCUDAStream(); + auto device = runtime::DeviceAPI::Get(coos[0].row->ctx); + uint64_t src_offset = 0, dst_offset = 0; + bool has_data = false; + bool row_sorted = true; + bool col_sorted = true; + + // check if data index array + for (size_t i = 0; i < coos.size(); ++i) { + CHECK_SAME_DTYPE(coos[0].row, coos[i].row); + CHECK_SAME_CONTEXT(coos[0].row, coos[i].row); + has_data |= COOHasData(coos[i]); + } + + auto prefixes = _ComputePrefixSums(coos); + auto prefix_src = static_cast(std::get<0>(prefixes)->data); + auto prefix_dst = static_cast(std::get<1>(prefixes)->data); + auto prefix_elm = static_cast(std::get<2>(prefixes)->data); + + std::unique_ptr rows(new IdType*[coos.size()]); + std::unique_ptr cols(new IdType*[coos.size()]); + std::unique_ptr data(new IdType*[coos.size()]); + + for (size_t i = 0; i < coos.size(); i++) { + row_sorted &= coos[i].row_sorted; + col_sorted &= coos[i].col_sorted; + rows[i] = coos[i].row.Ptr(); + cols[i] = coos[i].col.Ptr(); + data[i] = coos[i].data.Ptr(); + } + + auto ctx = coos[0].row->ctx; + auto dtype = coos[0].row->dtype; + + IdType n_elements = 0; + 
device->CopyDataFromTo( + &prefix_elm[coos.size()], 0, &n_elements, 0, sizeof(IdType), + coos[0].row->ctx, DGLContext{kDGLCPU, 0}, coos[0].row->dtype); + + device->CopyDataFromTo( + &prefix_src[coos.size()], 0, &src_offset, 0, sizeof(IdType), + coos[0].row->ctx, DGLContext{kDGLCPU, 0}, coos[0].row->dtype); + + device->CopyDataFromTo( + &prefix_dst[coos.size()], 0, &dst_offset, 0, sizeof(IdType), + coos[0].row->ctx, DGLContext{kDGLCPU, 0}, coos[0].row->dtype); + + // Union src array + IdArray result_src = + NewIdArray(n_elements, coos[0].row->ctx, coos[0].row->dtype.bits); + _Merge( + rows.get(), prefix_src, prefix_elm, result_src.Ptr(), coos.size(), + n_elements, ctx, dtype, stream); + + // Union dst array + IdArray result_dst = + NewIdArray(n_elements, coos[0].col->ctx, coos[0].col->dtype.bits); + _Merge( + cols.get(), prefix_dst, prefix_elm, result_dst.Ptr(), coos.size(), + n_elements, ctx, dtype, stream); + + // Union data array if exists and fetch number of elements + IdArray result_dat = NullArray(); + if (has_data) { + result_dat = + NewIdArray(n_elements, coos[0].row->ctx, coos[0].row->dtype.bits); + _Merge( + data.get(), prefix_elm, prefix_elm, result_dat.Ptr(), + coos.size(), n_elements, ctx, dtype, stream); + } + + return COOMatrix( + src_offset, dst_offset, result_src, result_dst, result_dat, row_sorted, + col_sorted); +} + +template COOMatrix DisjointUnionCoo( + const std::vector& coos); +template COOMatrix DisjointUnionCoo( + const std::vector& coos); + +} // namespace impl +} // namespace aten +} // namespace dgl diff --git a/src/array/cuda/fp16.cuh b/src/array/cuda/fp16.cuh index e23837e019a1..8e7ca60b51c3 100644 --- a/src/array/cuda/fp16.cuh +++ b/src/array/cuda/fp16.cuh @@ -21,7 +21,7 @@ #ifndef DGL_ARRAY_CUDA_FP16_CUH_ #define DGL_ARRAY_CUDA_FP16_CUH_ -#include +#include #include @@ -41,9 +41,9 @@ static __device__ __forceinline__ half min(half a, half b) { #endif } -#ifdef __CUDACC__ +#ifdef __HIPCC__ // Arithmetic FP16 operations for architecture >= 5.3 are already defined in -// cuda_fp16.h +// hip/hip_fp16.h #if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ < 530) // CUDA 12.2 adds "emulated" support for older architectures. #if defined(CUDART_VERSION) && (CUDART_VERSION < 12020) @@ -129,6 +129,6 @@ __device__ __forceinline__ bool operator<=(const __half& lh, const __half& rh) { } #endif // defined(CUDART_VERSION) && (CUDART_VERSION < 12020) #endif // defined(__CUDA_ARCH__) && (__CUDA_ARCH__ < 530) -#endif // __CUDACC__ +#endif // __HIPCC__ #endif // DGL_ARRAY_CUDA_FP16_CUH_ diff --git a/src/array/cuda/fp16.cuh.prehip b/src/array/cuda/fp16.cuh.prehip new file mode 100644 index 000000000000..e23837e019a1 --- /dev/null +++ b/src/array/cuda/fp16.cuh.prehip @@ -0,0 +1,134 @@ +/** + * Copyright (c) 2020-2022 by Contributors + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + * + * @file array/cuda/fp16.cuh + * @brief float16 related functions. 
+ * @note this file is modified from TVM project: + * https://github.com/apache/tvm/blob/e561007f0c330e3d14c2bc8a3ef40fb741db9004/src/target/source/literal/cuda_half_t.h. + */ +#ifndef DGL_ARRAY_CUDA_FP16_CUH_ +#define DGL_ARRAY_CUDA_FP16_CUH_ + +#include + +#include + +static __device__ __forceinline__ half max(half a, half b) { +#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 530 + return __hgt(__half(a), __half(b)) ? a : b; +#else + return __half(max(float(a), float(b))); // NOLINT +#endif +} + +static __device__ __forceinline__ half min(half a, half b) { +#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 530 + return __hlt(__half(a), __half(b)) ? a : b; +#else + return __half(min(float(a), float(b))); // NOLINT +#endif +} + +#ifdef __CUDACC__ +// Arithmetic FP16 operations for architecture >= 5.3 are already defined in +// cuda_fp16.h +#if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ < 530) +// CUDA 12.2 adds "emulated" support for older architectures. +#if defined(CUDART_VERSION) && (CUDART_VERSION < 12020) +__device__ __forceinline__ __half +operator+(const __half& lh, const __half& rh) { + return __half(float(lh) + float(rh)); // NOLINT +} +__device__ __forceinline__ __half +operator-(const __half& lh, const __half& rh) { + return __half(float(lh) - float(rh)); // NOLINT +} +__device__ __forceinline__ __half +operator*(const __half& lh, const __half& rh) { + return __half(float(lh) * float(rh)); // NOLINT +} +__device__ __forceinline__ __half +operator/(const __half& lh, const __half& rh) { + return __half(float(lh) / float(rh)); // NOLINT +} + +__device__ __forceinline__ __half& operator+=( + __half& lh, const __half& rh) { // NOLINT + lh = __half(float(lh) + float(rh)); // NOLINT + return lh; +} +__device__ __forceinline__ __half& operator-=( + __half& lh, const __half& rh) { // NOLINT + lh = __half(float(lh) - float(rh)); // NOLINT + return lh; +} +__device__ __forceinline__ __half& operator*=( + __half& lh, const __half& rh) { // NOLINT + lh = __half(float(lh) * float(rh)); // NOLINT + return lh; +} +__device__ __forceinline__ __half& operator/=( + __half& lh, const __half& rh) { // NOLINT + lh = __half(float(lh) / float(rh)); // NOLINT + return lh; +} + +__device__ __forceinline__ __half& operator++(__half& h) { // NOLINT + h = __half(float(h) + 1.0f); // NOLINT + return h; +} +__device__ __forceinline__ __half& operator--(__half& h) { // NOLINT + h = __half(float(h) - 1.0f); // NOLINT + return h; +} +__device__ __forceinline__ __half operator++(__half& h, int) { // NOLINT + __half ret = h; + h = __half(float(h) + 1.0f); // NOLINT + return ret; +} +__device__ __forceinline__ __half operator--(__half& h, int) { // NOLINT + __half ret = h; + h = __half(float(h) - 1.0f); // NOLINT + return ret; +} + +__device__ __forceinline__ __half operator+(const __half& h) { return h; } +__device__ __forceinline__ __half operator-(const __half& h) { + return __half(-float(h)); // NOLINT +} + +__device__ __forceinline__ bool operator==(const __half& lh, const __half& rh) { + return float(lh) == float(rh); // NOLINT +} +__device__ __forceinline__ bool operator!=(const __half& lh, const __half& rh) { + return float(lh) != float(rh); // NOLINT +} +__device__ __forceinline__ bool operator>(const __half& lh, const __half& rh) { + return float(lh) > float(rh); // NOLINT +} +__device__ __forceinline__ bool operator<(const __half& lh, const __half& rh) { + return float(lh) < float(rh); // NOLINT +} +__device__ __forceinline__ bool operator>=(const __half& lh, const __half& rh) { + return float(lh) >= 
float(rh); // NOLINT +} +__device__ __forceinline__ bool operator<=(const __half& lh, const __half& rh) { + return float(lh) <= float(rh); // NOLINT +} +#endif // defined(CUDART_VERSION) && (CUDART_VERSION < 12020) +#endif // defined(__CUDA_ARCH__) && (__CUDA_ARCH__ < 530) +#endif // __CUDACC__ + +#endif // DGL_ARRAY_CUDA_FP16_CUH_ diff --git a/src/array/cuda/functor.cuh b/src/array/cuda/functor.cuh index 226600cb9a84..1dc05e34e008 100644 --- a/src/array/cuda/functor.cuh +++ b/src/array/cuda/functor.cuh @@ -208,29 +208,29 @@ struct Sum : _Sum { #if BF16_ENABLED template -struct Sum : _Sum { - static constexpr __host__ __device__ __forceinline__ __nv_bfloat16 zero() { +struct Sum : _Sum { + static constexpr __host__ __device__ __forceinline__ __hip_bfloat16 zero() { return __float2bfloat16_rn(0.); } static __device__ __forceinline__ void Call( - __nv_bfloat16 *out_buf, Idx *arg_u_buf, Idx *arg_e_buf, - __nv_bfloat16 val, Idx uid, Idx eid) { - _Sum::Call( + __hip_bfloat16 *out_buf, Idx *arg_u_buf, Idx *arg_e_buf, + __hip_bfloat16 val, Idx uid, Idx eid) { + _Sum::Call( out_buf, arg_u_buf, arg_e_buf, val, uid, eid); } static __device__ __forceinline__ void Call( - __nv_bfloat16 *out_buf, Idx *arg_buf, __nv_bfloat16 val, Idx id) { - _Sum::Call(out_buf, arg_buf, val, id); + __hip_bfloat16 *out_buf, Idx *arg_buf, __hip_bfloat16 val, Idx id) { + _Sum::Call(out_buf, arg_buf, val, id); } // sometimes we have to use float in reduction for better precision static __device__ __forceinline__ void Call( float *out_buf, Idx *arg_u_buf, Idx *arg_e_buf, - __nv_bfloat16 val, Idx uid, Idx eid) { + __hip_bfloat16 val, Idx uid, Idx eid) { _Sum::Call(out_buf, arg_u_buf, arg_e_buf, static_cast(val), uid, eid); } static __device__ __forceinline__ void Call( - float *out_buf, Idx *arg_buf, __nv_bfloat16 val, Idx id) { + float *out_buf, Idx *arg_buf, __hip_bfloat16 val, Idx id) { _Sum::Call(out_buf, arg_buf, static_cast(val), id); } @@ -313,29 +313,29 @@ struct Max : _Max { #if BF16_ENABLED template -struct Max : _Max { - static constexpr __host__ __device__ __forceinline__ __nv_bfloat16 zero() { +struct Max : _Max { + static constexpr __host__ __device__ __forceinline__ __hip_bfloat16 zero() { return __float2bfloat16_rn(-std::numeric_limits::infinity()); } static __device__ __forceinline__ void Call( - __nv_bfloat16 *out_buf, Idx *arg_u_buf, Idx *arg_e_buf, - __nv_bfloat16 val, Idx uid, Idx eid) { - _Max::Call( + __hip_bfloat16 *out_buf, Idx *arg_u_buf, Idx *arg_e_buf, + __hip_bfloat16 val, Idx uid, Idx eid) { + _Max::Call( out_buf, arg_u_buf, arg_e_buf, val, uid, eid); } static __device__ __forceinline__ void Call( - __nv_bfloat16 *out_buf, Idx *arg_buf, __nv_bfloat16 val, Idx id) { - _Max::Call(out_buf, arg_buf, val, id); + __hip_bfloat16 *out_buf, Idx *arg_buf, __hip_bfloat16 val, Idx id) { + _Max::Call(out_buf, arg_buf, val, id); } // sometimes we have to use float in reduction for better precision static __device__ __forceinline__ void Call( float *out_buf, Idx *arg_u_buf, Idx *arg_e_buf, - __nv_bfloat16 val, Idx uid, Idx eid) { + __hip_bfloat16 val, Idx uid, Idx eid) { _Max::Call(out_buf, arg_u_buf, arg_e_buf, static_cast(val), uid, eid); } static __device__ __forceinline__ void Call( - float *out_buf, Idx *arg_buf, __nv_bfloat16 val, Idx id) { + float *out_buf, Idx *arg_buf, __hip_bfloat16 val, Idx id) { _Max::Call(out_buf, arg_buf, static_cast(val), id); } @@ -418,29 +418,29 @@ struct Min : _Min { #if BF16_ENABLED template -struct Min : _Min { - static constexpr __host__ __device__ __forceinline__ 
__nv_bfloat16 zero() { +struct Min : _Min { + static constexpr __host__ __device__ __forceinline__ __hip_bfloat16 zero() { return __float2bfloat16_rn(std::numeric_limits::infinity()); } static __device__ __forceinline__ void Call( - __nv_bfloat16 *out_buf, Idx *arg_u_buf, Idx *arg_e_buf, - __nv_bfloat16 val, Idx uid, Idx eid) { - _Min::Call( + __hip_bfloat16 *out_buf, Idx *arg_u_buf, Idx *arg_e_buf, + __hip_bfloat16 val, Idx uid, Idx eid) { + _Min::Call( out_buf, arg_u_buf, arg_e_buf, val, uid, eid); } static __device__ __forceinline__ void Call( - __nv_bfloat16 *out_buf, Idx *arg_buf, __nv_bfloat16 val, Idx id) { - _Min::Call(out_buf, arg_buf, val, id); + __hip_bfloat16 *out_buf, Idx *arg_buf, __hip_bfloat16 val, Idx id) { + _Min::Call(out_buf, arg_buf, val, id); } // sometimes we have to use float in reduction for better precision static __device__ __forceinline__ void Call( float *out_buf, Idx *arg_u_buf, Idx *arg_e_buf, - __nv_bfloat16 val, Idx uid, Idx eid) { + __hip_bfloat16 val, Idx uid, Idx eid) { _Min::Call(out_buf, arg_u_buf, arg_e_buf, static_cast(val), uid, eid); } static __device__ __forceinline__ void Call( - float *out_buf, Idx *arg_buf, __nv_bfloat16 val, Idx id) { + float *out_buf, Idx *arg_buf, __hip_bfloat16 val, Idx id) { _Min::Call(out_buf, arg_buf, static_cast(val), id); } diff --git a/src/array/cuda/functor.cuh.prehip b/src/array/cuda/functor.cuh.prehip new file mode 100644 index 000000000000..226600cb9a84 --- /dev/null +++ b/src/array/cuda/functor.cuh.prehip @@ -0,0 +1,456 @@ +/** + * Copyright (c) 2020 by Contributors + * @file array/cuda/functor.cuh + * @brief Functors for template on CUDA + */ +#ifndef DGL_ARRAY_CUDA_FUNCTOR_CUH_ +#define DGL_ARRAY_CUDA_FUNCTOR_CUH_ + +#include +#include + +#include "./atomic.cuh" +#include "./fp16.cuh" +#include "bf16.cuh" + +namespace dgl { +namespace aten { +namespace cuda { + +/////////////////////////// CUDA binary operators ////////////////////////////// +namespace binary { +template +struct Add { + static constexpr bool use_lhs = true; + static constexpr bool use_rhs = true; + static constexpr bool reduce_last_dim = false; + static __device__ __forceinline__ DType + Call(const DType *lhs, const DType *rhs, int64_t len = 1) { + return lhs[0] + rhs[0]; + } +}; +template +constexpr bool Add::use_lhs; +template +constexpr bool Add::use_rhs; +template +constexpr bool Add::reduce_last_dim; + +template +struct Sub { + static constexpr bool use_lhs = true; + static constexpr bool use_rhs = true; + static constexpr bool reduce_last_dim = false; + static __device__ __forceinline__ DType + Call(const DType *lhs, const DType *rhs, int64_t len = 1) { + return lhs[0] - rhs[0]; + } +}; +template +constexpr bool Sub::use_lhs; +template +constexpr bool Sub::use_rhs; +template +constexpr bool Sub::reduce_last_dim; + +template +struct Mul { + static constexpr bool use_lhs = true; + static constexpr bool use_rhs = true; + static constexpr bool reduce_last_dim = false; + static __device__ __forceinline__ DType + Call(const DType *lhs, const DType *rhs, int64_t len = 1) { + return lhs[0] * rhs[0]; + } +}; +template +constexpr bool Mul::use_lhs; +template +constexpr bool Mul::use_rhs; +template +constexpr bool Mul::reduce_last_dim; + +template +struct Div { + static constexpr bool use_lhs = true; + static constexpr bool use_rhs = true; + static constexpr bool reduce_last_dim = false; + static __device__ __forceinline__ DType + Call(const DType *lhs, const DType *rhs, int64_t len = 1) { + return lhs[0] / rhs[0]; + } +}; +template +constexpr bool 
Div::use_lhs; +template +constexpr bool Div::use_rhs; +template +constexpr bool Div::reduce_last_dim; + +template +struct CopyLhs { + static constexpr bool use_lhs = true; + static constexpr bool use_rhs = false; + static constexpr bool reduce_last_dim = false; + static __device__ __forceinline__ DType + Call(const DType *lhs, const DType *rhs, int64_t len = 1) { + return lhs[0]; + } +}; +template +constexpr bool CopyLhs::use_lhs; +template +constexpr bool CopyLhs::use_rhs; +template +constexpr bool CopyLhs::reduce_last_dim; + +template +struct CopyRhs { + static constexpr bool use_lhs = false; + static constexpr bool use_rhs = true; + static constexpr bool reduce_last_dim = false; + static __device__ __forceinline__ DType + Call(const DType *lhs, const DType *rhs, int64_t len = 1) { + return rhs[0]; + } +}; +template +constexpr bool CopyRhs::use_lhs; +template +constexpr bool CopyRhs::use_rhs; +template +constexpr bool CopyRhs::reduce_last_dim; + +template +struct Dot { + static constexpr bool use_lhs = true; + static constexpr bool use_rhs = true; + static constexpr bool reduce_last_dim = true; + static __device__ __forceinline__ DType + Call(const DType *lhs, const DType *rhs, int64_t len = 1) { + DType rst = static_cast(0.0f); + for (int64_t i = 0; i < len; ++i) { + rst += lhs[i] * rhs[i]; + } + return rst; + } +}; +template +constexpr bool Dot::use_lhs; +template +constexpr bool Dot::use_rhs; +template +constexpr bool Dot::reduce_last_dim; + +} // end of namespace binary + +/////////////////////////// CUDA reduce operators ////////////////////////////// +namespace reduce { +template +struct _Sum { + static constexpr __host__ __device__ __forceinline__ DType zero() { + return 0.; + } + static constexpr bool require_arg = false; + static __device__ __forceinline__ void Call( + DType *out_buf, Idx *arg_u_buf, Idx *arg_e_buf, DType val, Idx uid, + Idx eid) { + if (!atomic) { + *out_buf += val; + } else { + cuda::AtomicAdd(out_buf, val); + } + } + static __device__ __forceinline__ void Call( + DType *out_buf, Idx *arg_buf, DType val, Idx id) { + if (!atomic) { + *out_buf += val; + } else { + cuda::AtomicAdd(out_buf, val); + } + } + static __device__ __forceinline__ void CallArg( + Idx fid, Idx *arg_u_buf, Idx *arg_e_buf, DType val, DType val_ref, + Idx uid, Idx eid) {} +}; + +template +struct Sum : _Sum {}; + +template +struct Sum : _Sum { + static constexpr __host__ __device__ __forceinline__ __half zero() { + return __float2half_rn(0.); + } + static __device__ __forceinline__ void Call( + __half *out_buf, Idx *arg_u_buf, Idx *arg_e_buf, + __half val, Idx uid, Idx eid) { + _Sum::Call( + out_buf, arg_u_buf, arg_e_buf, val, uid, eid); + } + static __device__ __forceinline__ void Call( + __half *out_buf, Idx *arg_buf, __half val, Idx id) { + _Sum::Call(out_buf, arg_buf, val, id); + } + // sometimes we have to use float in reduction for better precision + static __device__ __forceinline__ void Call( + float *out_buf, Idx *arg_u_buf, Idx *arg_e_buf, + __half val, Idx uid, Idx eid) { + _Sum::Call(out_buf, arg_u_buf, arg_e_buf, + static_cast(val), uid, eid); + } + static __device__ __forceinline__ void Call( + float *out_buf, Idx *arg_buf, __half val, Idx id) { + _Sum::Call(out_buf, arg_buf, + static_cast(val), id); + } +}; + +#if BF16_ENABLED +template +struct Sum : _Sum { + static constexpr __host__ __device__ __forceinline__ __nv_bfloat16 zero() { + return __float2bfloat16_rn(0.); + } + static __device__ __forceinline__ void Call( + __nv_bfloat16 *out_buf, Idx *arg_u_buf, Idx *arg_e_buf, + 
__nv_bfloat16 val, Idx uid, Idx eid) { + _Sum::Call( + out_buf, arg_u_buf, arg_e_buf, val, uid, eid); + } + static __device__ __forceinline__ void Call( + __nv_bfloat16 *out_buf, Idx *arg_buf, __nv_bfloat16 val, Idx id) { + _Sum::Call(out_buf, arg_buf, val, id); + } + // sometimes we have to use float in reduction for better precision + static __device__ __forceinline__ void Call( + float *out_buf, Idx *arg_u_buf, Idx *arg_e_buf, + __nv_bfloat16 val, Idx uid, Idx eid) { + _Sum::Call(out_buf, arg_u_buf, arg_e_buf, + static_cast(val), uid, eid); + } + static __device__ __forceinline__ void Call( + float *out_buf, Idx *arg_buf, __nv_bfloat16 val, Idx id) { + _Sum::Call(out_buf, arg_buf, + static_cast(val), id); + } +}; +#endif // BF16_ENABLED + +template +struct _Max { + static constexpr __host__ __device__ __forceinline__ DType zero() { + return -std::numeric_limits::infinity(); + } + static constexpr bool require_arg = true; + static __device__ __forceinline__ void Call( + DType *out_buf, Idx *arg_u_buf, Idx *arg_e_buf, DType val, Idx uid, + Idx eid) { + if (!atomic) { + if (*out_buf < val) { + *out_buf = val; + *arg_u_buf = uid; + *arg_e_buf = eid; + } + } else { + cuda::AtomicMax(out_buf, val); + } + } + static __device__ __forceinline__ void Call( + DType *out_buf, Idx *arg_buf, DType val, Idx id) { + if (!atomic) { + if (*out_buf < val) { + *out_buf = val; + *arg_buf = id; + } + } else { + cuda::AtomicMax(out_buf, val); + } + } + static __device__ __forceinline__ void CallArg( + Idx fid, Idx *arg_u_buf, Idx *arg_e_buf, DType val, DType val_ref, + Idx uid, Idx eid) { + if (atomic) { + if (val == val_ref) { + if (arg_u_buf) arg_u_buf[fid] = uid; + if (arg_e_buf) arg_e_buf[fid] = eid; + } + } + } +}; + +template +struct Max : _Max {}; + +template +struct Max : _Max { + static constexpr __host__ __device__ __forceinline__ __half zero() { + return __float2half_rn(-6.550400e+04f); + } + static __device__ __forceinline__ void Call( + __half *out_buf, Idx *arg_u_buf, Idx *arg_e_buf, + __half val, Idx uid, Idx eid) { + _Max::Call( + out_buf, arg_u_buf, arg_e_buf, val, uid, eid); + } + static __device__ __forceinline__ void Call( + __half *out_buf, Idx *arg_buf, __half val, Idx id) { + _Max::Call(out_buf, arg_buf, val, id); + } + // sometimes we have to use float in reduction for better precision + static __device__ __forceinline__ void Call( + float *out_buf, Idx *arg_u_buf, Idx *arg_e_buf, + __half val, Idx uid, Idx eid) { + _Max::Call(out_buf, arg_u_buf, arg_e_buf, + static_cast(val), uid, eid); + } + static __device__ __forceinline__ void Call( + float *out_buf, Idx *arg_buf, __half val, Idx id) { + _Max::Call(out_buf, arg_buf, + static_cast(val), id); + } +}; + +#if BF16_ENABLED +template +struct Max : _Max { + static constexpr __host__ __device__ __forceinline__ __nv_bfloat16 zero() { + return __float2bfloat16_rn(-std::numeric_limits::infinity()); + } + static __device__ __forceinline__ void Call( + __nv_bfloat16 *out_buf, Idx *arg_u_buf, Idx *arg_e_buf, + __nv_bfloat16 val, Idx uid, Idx eid) { + _Max::Call( + out_buf, arg_u_buf, arg_e_buf, val, uid, eid); + } + static __device__ __forceinline__ void Call( + __nv_bfloat16 *out_buf, Idx *arg_buf, __nv_bfloat16 val, Idx id) { + _Max::Call(out_buf, arg_buf, val, id); + } + // sometimes we have to use float in reduction for better precision + static __device__ __forceinline__ void Call( + float *out_buf, Idx *arg_u_buf, Idx *arg_e_buf, + __nv_bfloat16 val, Idx uid, Idx eid) { + _Max::Call(out_buf, arg_u_buf, arg_e_buf, + static_cast(val), uid, 
eid); + } + static __device__ __forceinline__ void Call( + float *out_buf, Idx *arg_buf, __nv_bfloat16 val, Idx id) { + _Max::Call(out_buf, arg_buf, + static_cast(val), id); + } +}; +#endif // BF16_ENABLED + +template +struct _Min { + static constexpr __host__ __device__ __forceinline__ DType zero() { + return std::numeric_limits::infinity(); + } + static constexpr bool require_arg = true; + static __device__ __forceinline__ void Call( + DType *out_buf, Idx *arg_u_buf, Idx *arg_e_buf, DType val, Idx uid, + Idx eid) { + if (!atomic) { + if (*out_buf > val) { + *out_buf = val; + *arg_u_buf = uid; + *arg_e_buf = eid; + } + } else { + cuda::AtomicMin(out_buf, val); + } + } + static __device__ __forceinline__ void Call( + DType *out_buf, Idx *arg_buf, DType val, Idx id) { + if (!atomic) { + if (*out_buf > val) { + *out_buf = val; + *arg_buf = id; + } + } else { + cuda::AtomicMin(out_buf, val); + } + } + static __device__ __forceinline__ void CallArg( + Idx fid, Idx *arg_u_buf, Idx *arg_e_buf, DType val, DType val_ref, + Idx uid, Idx eid) { + if (atomic) { + if (val == val_ref) { + if (arg_u_buf) arg_u_buf[fid] = uid; + if (arg_e_buf) arg_e_buf[fid] = eid; + } + } + } +}; + +template +struct Min : _Min {}; + +template +struct Min : _Min { + static constexpr __host__ __device__ __forceinline__ __half zero() { + return __float2half_rn(6.550400e+04f); + } + static __device__ __forceinline__ void Call( + __half *out_buf, Idx *arg_u_buf, Idx *arg_e_buf, + __half val, Idx uid, Idx eid) { + _Min::Call( + out_buf, arg_u_buf, arg_e_buf, val, uid, eid); + } + static __device__ __forceinline__ void Call( + __half *out_buf, Idx *arg_buf, __half val, Idx id) { + _Min::Call(out_buf, arg_buf, val, id); + } + // sometimes we have to use float in reduction for better precision + static __device__ __forceinline__ void Call( + float *out_buf, Idx *arg_u_buf, Idx *arg_e_buf, + __half val, Idx uid, Idx eid) { + _Min::Call(out_buf, arg_u_buf, arg_e_buf, + static_cast(val), uid, eid); + } + static __device__ __forceinline__ void Call( + float *out_buf, Idx *arg_buf, __half val, Idx id) { + _Min::Call(out_buf, arg_buf, + static_cast(val), id); + } +}; + +#if BF16_ENABLED +template +struct Min : _Min { + static constexpr __host__ __device__ __forceinline__ __nv_bfloat16 zero() { + return __float2bfloat16_rn(std::numeric_limits::infinity()); + } + static __device__ __forceinline__ void Call( + __nv_bfloat16 *out_buf, Idx *arg_u_buf, Idx *arg_e_buf, + __nv_bfloat16 val, Idx uid, Idx eid) { + _Min::Call( + out_buf, arg_u_buf, arg_e_buf, val, uid, eid); + } + static __device__ __forceinline__ void Call( + __nv_bfloat16 *out_buf, Idx *arg_buf, __nv_bfloat16 val, Idx id) { + _Min::Call(out_buf, arg_buf, val, id); + } + // sometimes we have to use float in reduction for better precision + static __device__ __forceinline__ void Call( + float *out_buf, Idx *arg_u_buf, Idx *arg_e_buf, + __nv_bfloat16 val, Idx uid, Idx eid) { + _Min::Call(out_buf, arg_u_buf, arg_e_buf, + static_cast(val), uid, eid); + } + static __device__ __forceinline__ void Call( + float *out_buf, Idx *arg_buf, __nv_bfloat16 val, Idx id) { + _Min::Call(out_buf, arg_buf, + static_cast(val), id); + } +}; +#endif // BF16_ENABLED + +} // namespace reduce + +} // namespace cuda +} // namespace aten +} // namespace dgl + +#endif // DGL_ARRAY_CUDA_FUNCTOR_CUH_ diff --git a/src/array/cuda/gather_mm.cu b/src/array/cuda/gather_mm.cu index c40d53bb05ec..cf7a87d9d844 100644 --- a/src/array/cuda/gather_mm.cu +++ b/src/array/cuda/gather_mm.cu @@ -1,3 +1,4 @@ +#include 
"hip/hip_runtime.h" /** * Copyright (c) 2020 by Contributors * @file array/cuda/gather_mm.cu @@ -20,54 +21,54 @@ namespace { /** @brief Call cuBLAS GEMM API for dense matmul operation for float and double. */ template -cublasStatus_t cublasGemm( - cublasHandle_t handle, cublasOperation_t transa, cublasOperation_t transb, +hipblasStatus_t cublasGemm( + hipblasHandle_t handle, hipblasOperation_t transa, hipblasOperation_t transb, int m, int n, int k, const DType* alpha, const DType* A, int lda, const DType* B, int ldb, const DType* beta, DType* C, int ldc) { LOG(INFO) << "Not supported dtype"; - return CUBLAS_STATUS_EXECUTION_FAILED; + return HIPBLAS_STATUS_EXECUTION_FAILED; } template <> -cublasStatus_t cublasGemm<__half>( - cublasHandle_t handle, cublasOperation_t transa, cublasOperation_t transb, +hipblasStatus_t cublasGemm<__half>( + hipblasHandle_t handle, hipblasOperation_t transa, hipblasOperation_t transb, int m, int n, int k, const __half* alpha, const __half* A, int lda, const __half* B, int ldb, const __half* beta, __half* C, int ldc) { - return cublasHgemm( + return hipblasHgemm( handle, transa, transb, m, n, k, alpha, A, lda, B, ldb, beta, C, ldc); } #if BF16_ENABLED template <> -cublasStatus_t cublasGemm<__nv_bfloat16>( - cublasHandle_t handle, cublasOperation_t transa, cublasOperation_t transb, - int m, int n, int k, const __nv_bfloat16* alpha, const __nv_bfloat16* A, - int lda, const __nv_bfloat16* B, int ldb, const __nv_bfloat16* beta, - __nv_bfloat16* C, int ldc) { +hipblasStatus_t cublasGemm<__hip_bfloat16>( + hipblasHandle_t handle, hipblasOperation_t transa, hipblasOperation_t transb, + int m, int n, int k, const __hip_bfloat16* alpha, const __hip_bfloat16* A, + int lda, const __hip_bfloat16* B, int ldb, const __hip_bfloat16* beta, + __hip_bfloat16* C, int ldc) { float alpha_float = __bfloat162float(*alpha); float beta_float = __bfloat162float(*beta); - return cublasGemmEx( - handle, transa, transb, m, n, k, &alpha_float, A, CUDA_R_16BF, lda, B, - CUDA_R_16BF, ldb, &beta_float, C, CUDA_R_16BF, ldc, CUBLAS_COMPUTE_32F, + return hipblasGemmEx_v2( + handle, transa, transb, m, n, k, &alpha_float, A, HIP_R_16BF, lda, B, + HIP_R_16BF, ldb, &beta_float, C, HIP_R_16BF, ldc, HIPBLAS_COMPUTE_32F, CUBLAS_GEMM_DEFAULT_TENSOR_OP); } #endif // BF16_ENABLED template <> -cublasStatus_t cublasGemm( - cublasHandle_t handle, cublasOperation_t transa, cublasOperation_t transb, +hipblasStatus_t cublasGemm( + hipblasHandle_t handle, hipblasOperation_t transa, hipblasOperation_t transb, int m, int n, int k, const float* alpha, const float* A, int lda, const float* B, int ldb, const float* beta, float* C, int ldc) { - return cublasSgemm( + return hipblasSgemm( handle, transa, transb, m, n, k, alpha, A, lda, B, ldb, beta, C, ldc); } template <> -cublasStatus_t cublasGemm( - cublasHandle_t handle, cublasOperation_t transa, cublasOperation_t transb, +hipblasStatus_t cublasGemm( + hipblasHandle_t handle, hipblasOperation_t transa, hipblasOperation_t transb, int m, int n, int k, const double* alpha, const double* A, int lda, const double* B, int ldb, const double* beta, double* C, int ldc) { - return cublasDgemm( + return hipblasDgemm( handle, transa, transb, m, n, k, alpha, A, lda, B, ldb, beta, C, ldc); } @@ -203,7 +204,7 @@ void SegmentMM( const NDArray A, const NDArray B, NDArray C, const NDArray seglen_A, bool a_trans, bool b_trans) { auto device = runtime::DeviceAPI::Get(A->ctx); - cudaStream_t stream = runtime::getCurrentCUDAStream(); + hipStream_t stream = runtime::getCurrentCUDAStream(); 
const DType* A_data = A.Ptr(); const DType* B_data = B.Ptr(); const IdType* seglen_A_data = seglen_A.Ptr(); @@ -215,8 +216,8 @@ void SegmentMM( auto* thr_entry = runtime::CUDAThreadEntry::ThreadLocal(); if (!thr_entry->cublas_handle) - CUBLAS_CALL(cublasCreate(&(thr_entry->cublas_handle))); - CUBLAS_CALL(cublasSetStream(thr_entry->cublas_handle, stream)); + CUBLAS_CALL(hipblasCreate(&(thr_entry->cublas_handle))); + CUBLAS_CALL(hipblasSetStream(thr_entry->cublas_handle, stream)); IdType m_offset = 0; for (IdType etype = 0; etype < num_rel; ++etype) { @@ -226,10 +227,10 @@ void SegmentMM( n = B->shape[2]; // cols of B k = B->shape[1]; // cols of A == rows of B int ldb = n, lda = k, ldc = n; - cublasOperation_t transB = CUBLAS_OP_N; - cublasOperation_t transA = CUBLAS_OP_N; + hipblasOperation_t transB = HIPBLAS_OP_N; + hipblasOperation_t transA = HIPBLAS_OP_N; if (b_trans) { - transB = CUBLAS_OP_T; + transB = HIPBLAS_OP_T; ldb = n, lda = n, ldc = k; std::swap(n, k); } @@ -248,7 +249,7 @@ template void SegmentMMBackwardB( const NDArray A, const NDArray dC, NDArray dB, const NDArray seglen) { auto device = runtime::DeviceAPI::Get(A->ctx); - cudaStream_t stream = runtime::getCurrentCUDAStream(); + hipStream_t stream = runtime::getCurrentCUDAStream(); const DType* A_data = A.Ptr(); const DType* dC_data = dC.Ptr(); const IdType* seglen_data = seglen.Ptr(); @@ -260,8 +261,8 @@ void SegmentMMBackwardB( auto* thr_entry = runtime::CUDAThreadEntry::ThreadLocal(); if (!thr_entry->cublas_handle) - CUBLAS_CALL(cublasCreate(&(thr_entry->cublas_handle))); - CUBLAS_CALL(cublasSetStream(thr_entry->cublas_handle, stream)); + CUBLAS_CALL(hipblasCreate(&(thr_entry->cublas_handle))); + CUBLAS_CALL(hipblasSetStream(thr_entry->cublas_handle, stream)); IdType k_offset = 0; for (IdType etype = 0; etype < num_rel; ++etype) { @@ -271,8 +272,8 @@ void SegmentMMBackwardB( CHECK_LE(k_offset + k, A->shape[0]) << "Segement index out of bound of A->shape[0]."; int lddC = m, ldA = n, lddB = m; - cublasOperation_t trans_dC = CUBLAS_OP_N; - cublasOperation_t trans_A = CUBLAS_OP_T; + hipblasOperation_t trans_dC = HIPBLAS_OP_N; + hipblasOperation_t trans_A = HIPBLAS_OP_T; CUBLAS_CALL(cublasGemm( thr_entry->cublas_handle, trans_dC, trans_A, m, n, k, &alpha, dC_data + dC_offset, lddC, A_data + A_offset, ldA, &beta, @@ -299,7 +300,7 @@ void GatherMM( const NDArray A, const NDArray B, NDArray C, const NDArray idx_a, const NDArray idx_b) { auto device = runtime::DeviceAPI::Get(A->ctx); - cudaStream_t stream = runtime::getCurrentCUDAStream(); + hipStream_t stream = runtime::getCurrentCUDAStream(); int64_t out_len = B->shape[2]; // cols of B int64_t in_len = A->shape[1]; // cols of A const int64_t tot_num_rows = A->shape[0]; @@ -332,7 +333,7 @@ void GatherMMScatter( const NDArray A, const NDArray B, NDArray C, const NDArray idx_a, const NDArray idx_b, const NDArray idx_c) { auto device = runtime::DeviceAPI::Get(A->ctx); - cudaStream_t stream = runtime::getCurrentCUDAStream(); + hipStream_t stream = runtime::getCurrentCUDAStream(); const IdType* idx_c_data = idx_c.Ptr(); int64_t out_len = (B->ndim == 2) ? 
B->shape[1] : B->shape[2]; // cols of B int64_t in_len = A->shape[1]; // cols of A @@ -367,10 +368,10 @@ template void GatherMM( const NDArray A, const NDArray B, NDArray C, const NDArray idx_a, const NDArray idx_b); #if BF16_ENABLED -template void GatherMM( +template void GatherMM( const NDArray A, const NDArray B, NDArray C, const NDArray idx_a, const NDArray idx_b); -template void GatherMM( +template void GatherMM( const NDArray A, const NDArray B, NDArray C, const NDArray idx_a, const NDArray idx_b); #endif // BF16_ENABLED @@ -394,10 +395,10 @@ template void GatherMMScatter( const NDArray A, const NDArray B, NDArray C, const NDArray idx_a, const NDArray idx_b, const NDArray idx_c); #if BF16_ENABLED -template void GatherMMScatter( +template void GatherMMScatter( const NDArray A, const NDArray B, NDArray C, const NDArray idx_a, const NDArray idx_b, const NDArray idx_c); -template void GatherMMScatter( +template void GatherMMScatter( const NDArray A, const NDArray B, NDArray C, const NDArray idx_a, const NDArray idx_b, const NDArray idx_c); #endif // BF16_ENABLED @@ -421,10 +422,10 @@ template void SegmentMM( const NDArray A, const NDArray B, NDArray C, const NDArray seglen_A, bool a_trans, bool b_trans); #if BF16_ENABLED -template void SegmentMM( +template void SegmentMM( const NDArray A, const NDArray B, NDArray C, const NDArray seglen_A, bool a_trans, bool b_trans); -template void SegmentMM( +template void SegmentMM( const NDArray A, const NDArray B, NDArray C, const NDArray seglen_A, bool a_trans, bool b_trans); #endif // BF16_ENABLED @@ -446,9 +447,9 @@ template void SegmentMMBackwardB( template void SegmentMMBackwardB( const NDArray A, const NDArray dC, NDArray dB, const NDArray seglen); #if BF16_ENABLED -template void SegmentMMBackwardB( +template void SegmentMMBackwardB( const NDArray A, const NDArray dC, NDArray dB, const NDArray seglen); -template void SegmentMMBackwardB( +template void SegmentMMBackwardB( const NDArray A, const NDArray dC, NDArray dB, const NDArray seglen); #endif // BF16_ENABLED template void SegmentMMBackwardB( diff --git a/src/array/cuda/gather_mm.cu.prehip b/src/array/cuda/gather_mm.cu.prehip new file mode 100644 index 000000000000..c40d53bb05ec --- /dev/null +++ b/src/array/cuda/gather_mm.cu.prehip @@ -0,0 +1,464 @@ +/** + * Copyright (c) 2020 by Contributors + * @file array/cuda/gather_mm.cu + * @brief GatherMM C APIs and definitions. + */ +#include + +#include // std::swap + +#include "./atomic.cuh" +#include "./functor.cuh" +#include "./utils.h" + +namespace dgl { +using namespace cuda; +namespace aten { + +namespace { + +/** @brief Call cuBLAS GEMM API for dense matmul operation for float and double. 
+ */ +template +cublasStatus_t cublasGemm( + cublasHandle_t handle, cublasOperation_t transa, cublasOperation_t transb, + int m, int n, int k, const DType* alpha, const DType* A, int lda, + const DType* B, int ldb, const DType* beta, DType* C, int ldc) { + LOG(INFO) << "Not supported dtype"; + return CUBLAS_STATUS_EXECUTION_FAILED; +} + +template <> +cublasStatus_t cublasGemm<__half>( + cublasHandle_t handle, cublasOperation_t transa, cublasOperation_t transb, + int m, int n, int k, const __half* alpha, const __half* A, int lda, + const __half* B, int ldb, const __half* beta, __half* C, int ldc) { + return cublasHgemm( + handle, transa, transb, m, n, k, alpha, A, lda, B, ldb, beta, C, ldc); +} + +#if BF16_ENABLED +template <> +cublasStatus_t cublasGemm<__nv_bfloat16>( + cublasHandle_t handle, cublasOperation_t transa, cublasOperation_t transb, + int m, int n, int k, const __nv_bfloat16* alpha, const __nv_bfloat16* A, + int lda, const __nv_bfloat16* B, int ldb, const __nv_bfloat16* beta, + __nv_bfloat16* C, int ldc) { + float alpha_float = __bfloat162float(*alpha); + float beta_float = __bfloat162float(*beta); + return cublasGemmEx( + handle, transa, transb, m, n, k, &alpha_float, A, CUDA_R_16BF, lda, B, + CUDA_R_16BF, ldb, &beta_float, C, CUDA_R_16BF, ldc, CUBLAS_COMPUTE_32F, + CUBLAS_GEMM_DEFAULT_TENSOR_OP); +} +#endif // BF16_ENABLED + +template <> +cublasStatus_t cublasGemm( + cublasHandle_t handle, cublasOperation_t transa, cublasOperation_t transb, + int m, int n, int k, const float* alpha, const float* A, int lda, + const float* B, int ldb, const float* beta, float* C, int ldc) { + return cublasSgemm( + handle, transa, transb, m, n, k, alpha, A, lda, B, ldb, beta, C, ldc); +} + +template <> +cublasStatus_t cublasGemm( + cublasHandle_t handle, cublasOperation_t transa, cublasOperation_t transb, + int m, int n, int k, const double* alpha, const double* A, int lda, + const double* B, int ldb, const double* beta, double* C, int ldc) { + return cublasDgemm( + handle, transa, transb, m, n, k, alpha, A, lda, B, ldb, beta, C, ldc); +} + +} // namespace + +namespace cuda { + +/** + * @note Each row of A multiplies a segment of matrix of B of dimension in_len * + * outlen. One warp is assigned to process one row of A. Each WARP sequentially + * multiplies one element of A and a row of B to compute partial result of the + * output. A is loaded in shared memory in a coalesced way. Output matrix is + * loaded in registers. B should get benefit from L2 cache. + */ +template +__global__ void GatherMMScatterKernel( + const DType* __restrict__ A, const DType* __restrict__ B, + DType* __restrict__ C, const Idx* __restrict__ idx_a, + const Idx* __restrict__ idx_b, const Idx* __restrict__ idx_c, + const int64_t num_rows, const int64_t in_len, const int64_t out_len) { + unsigned int tId = threadIdx.x; + unsigned int laneId = tId & 31; + unsigned int gId = (blockIdx.x * blockDim.x + threadIdx.x); + unsigned int warpId = gId >> 5; + unsigned int row = warpId; + if (row < num_rows) { + const unsigned int local_row = + row & 3; // hardcoded for TB size 128 (4 warps) + const Idx cur_rowA = (idx_a) ? idx_a[row] : row; + const Idx cur_rowB = (idx_b) ? idx_b[row] : row; + const Idx cur_rowC = (idx_c) ? 
idx_c[row] : row; + const Idx B_offset = cur_rowB * in_len * out_len; + const int sh_a_tile = 64; + __shared__ DType sh_A[4 * sh_a_tile]; + int a_tile = sh_a_tile; + for (unsigned int k_start = 0; k_start < in_len; k_start += 64) { + if ((in_len - k_start) < a_tile) a_tile = in_len - k_start; + // Load A in shared mem in a coalesced way + for (unsigned int l = laneId; l < a_tile; l += 32) + sh_A[local_row * sh_a_tile + l] = A[cur_rowA * in_len + (k_start + l)]; + __syncwarp(); + + for (unsigned int outloop = 0; outloop < out_len; outloop += 32) { + DType out_reg = static_cast(0.0f); // thread private + const unsigned int l = laneId; + if (l < out_len) { + // iterate over elements of a row of A + for (unsigned int i = 0; i < a_tile; i++) { + const DType a_val = sh_A[local_row * sh_a_tile + i]; + // iterate over elements of a row of B in parallel + out_reg += + a_val * B[B_offset + ((i + k_start) * out_len + (outloop + l))]; + } + if (idx_c) { + AtomicAdd(C + cur_rowC * out_len + (outloop + l), out_reg); + } else { + C[cur_rowC * out_len + (outloop + l)] += out_reg; + } + } + } + } + } +} + +/** + * @note Output matrix is accumulated via atomic operations. Rest of the + * strategies are similar to GatherMMKernel. One warp is assigned to process one + * row of A. Each WARP sequentially multiplies one element of A and a row of B + * to compute partial result of the output. A is loaded in shared memory in a + * coalesced way. B should get benefit from L2 cache. + */ +template +__global__ void GatherMMScatterKernel2( + const DType* __restrict__ A, const DType* __restrict__ B, + DType* __restrict__ C, const Idx* __restrict__ idx_a, + const Idx* __restrict__ idx_b, const Idx* __restrict__ idx_c, + const int64_t num_rows, const int64_t in_len, const int64_t out_len) { + unsigned int tId = threadIdx.x; + unsigned int laneId = tId & 31; + unsigned int gId = (blockIdx.x * blockDim.x + threadIdx.x); + unsigned int warpId = gId >> 5; + unsigned int row = warpId; + if (row < num_rows) { + const unsigned int local_row = + row & 3; // hardcoded for TB size 128 (4 warps) + const Idx row_a = (idx_a) ? idx_a[row] : row; + const Idx row_b = (idx_b) ? idx_b[row] : row; + const Idx row_c = (idx_c) ? idx_c[row] : row; + const Idx C_offset = row_c * in_len * out_len; + const int sh_a_tile = 64; + __shared__ DType sh_A[4 * sh_a_tile]; + int a_tile = sh_a_tile; + for (unsigned int k_start = 0; k_start < in_len; k_start += 64) { + if ((in_len - k_start) < a_tile) a_tile = in_len - k_start; + /* Load A in shared mem in a coalesced way */ + for (unsigned int l = laneId; l < a_tile; l += 32) + sh_A[local_row * sh_a_tile + l] = A[row_a * in_len + (k_start + l)]; + __syncwarp(); + + for (unsigned int outloop = 0; outloop < out_len; outloop += 32) { + DType out_reg = static_cast(0.0f); // thread private + const unsigned int l = laneId; + if (l < out_len) { + const DType b_val = B[row_b * out_len + (outloop + l)]; + /* iterate over elements of a row of A */ + for (unsigned int i = 0; i < a_tile; i++) { + const DType a_val = sh_A[local_row * sh_a_tile + i]; + const Idx C_idx = + C_offset + ((i + k_start) * out_len + (outloop + l)); + AtomicAdd(C + C_idx, a_val * b_val); + } + } + } + } + } +} + +} // namespace cuda + +/** + * @brief Implementation of Gather_mm operator. The input matrix A is + * expected to be sorted according to relation type. 
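+ * Rows of A are consumed segment by segment: for each relation type the next + * seglen_A[etype] rows of A are multiplied by the corresponding k x n block of B + * with a single cuBLAS GEMM call, and the A/B/C offsets are advanced by m*k, k*n + * and m*n before moving to the next segment.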
+ * @param A The input dense matrix of dimension m x k + * @param B The input dense matrix of dimension k x n + * @param C The output dense matrix of dimension m x n + * @param seglen_A The input vector of size R. Each element + * is the length of segments of input ``A`` + * @param a_trans Matrix A to be transposed + * @param b_trans Matrix B to be transposed + */ +template +void SegmentMM( + const NDArray A, const NDArray B, NDArray C, const NDArray seglen_A, + bool a_trans, bool b_trans) { + auto device = runtime::DeviceAPI::Get(A->ctx); + cudaStream_t stream = runtime::getCurrentCUDAStream(); + const DType* A_data = A.Ptr(); + const DType* B_data = B.Ptr(); + const IdType* seglen_A_data = seglen_A.Ptr(); + DType* C_data = C.Ptr(); + int64_t A_offset = 0, B_offset = 0, C_offset = 0; + int64_t m, n, k; + int64_t num_rel = seglen_A.NumElements(); + DType alpha = 1., beta = 0.; + + auto* thr_entry = runtime::CUDAThreadEntry::ThreadLocal(); + if (!thr_entry->cublas_handle) + CUBLAS_CALL(cublasCreate(&(thr_entry->cublas_handle))); + CUBLAS_CALL(cublasSetStream(thr_entry->cublas_handle, stream)); + + IdType m_offset = 0; + for (IdType etype = 0; etype < num_rel; ++etype) { + m = seglen_A_data[etype]; // rows of A + CHECK_LE(m_offset + m, A->shape[0]) + << "Segment index out of bound of A->shape[0]."; + n = B->shape[2]; // cols of B + k = B->shape[1]; // cols of A == rows of B + int ldb = n, lda = k, ldc = n; + cublasOperation_t transB = CUBLAS_OP_N; + cublasOperation_t transA = CUBLAS_OP_N; + if (b_trans) { + transB = CUBLAS_OP_T; + ldb = n, lda = n, ldc = k; + std::swap(n, k); + } + CUBLAS_CALL(cublasGemm( + thr_entry->cublas_handle, transB, transA, n, m, k, &alpha, + B_data + B_offset, ldb, A_data + A_offset, lda, &beta, + C_data + C_offset, ldc)); + A_offset += m * k; + B_offset += k * n; + C_offset += m * n; + m_offset += m; + } +} + +template +void SegmentMMBackwardB( + const NDArray A, const NDArray dC, NDArray dB, const NDArray seglen) { + auto device = runtime::DeviceAPI::Get(A->ctx); + cudaStream_t stream = runtime::getCurrentCUDAStream(); + const DType* A_data = A.Ptr(); + const DType* dC_data = dC.Ptr(); + const IdType* seglen_data = seglen.Ptr(); + DType* dB_data = dB.Ptr(); + int64_t A_offset = 0, dC_offset = 0, dB_offset = 0; + int64_t m, n, k; + int64_t num_rel = seglen.NumElements(); + DType alpha = 1., beta = 0.; + + auto* thr_entry = runtime::CUDAThreadEntry::ThreadLocal(); + if (!thr_entry->cublas_handle) + CUBLAS_CALL(cublasCreate(&(thr_entry->cublas_handle))); + CUBLAS_CALL(cublasSetStream(thr_entry->cublas_handle, stream)); + + IdType k_offset = 0; + for (IdType etype = 0; etype < num_rel; ++etype) { + m = dC->shape[1]; + n = A->shape[1]; + k = seglen_data[etype]; + CHECK_LE(k_offset + k, A->shape[0]) + << "Segement index out of bound of A->shape[0]."; + int lddC = m, ldA = n, lddB = m; + cublasOperation_t trans_dC = CUBLAS_OP_N; + cublasOperation_t trans_A = CUBLAS_OP_T; + CUBLAS_CALL(cublasGemm( + thr_entry->cublas_handle, trans_dC, trans_A, m, n, k, &alpha, + dC_data + dC_offset, lddC, A_data + A_offset, ldA, &beta, + dB_data + dB_offset, lddB)); + dC_offset += m * k; + A_offset += n * k; + dB_offset += m * n; + k_offset += k; + } +} + +/** + * @brief Implementation of Gather_mm operator. The input matrix A is + * expected to be sorted according to relation type. 
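+ * This entry point launches GatherMMScatterKernel with one warp per gathered row + * and a null idx_c, so each warp accumulates its partial products straight into + * the matching row of C instead of scattering them with atomic adds.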
+ * @param A The input dense matrix of dimension m x k + * @param B The input dense matrix of dimension k x n + * @param C The output dense matrix of dimension m x n + * @param idx_a The input vector to gather left hand operand on + * @param idx_b The input vector to gather right hand operand on + */ + +template +void GatherMM( + const NDArray A, const NDArray B, NDArray C, const NDArray idx_a, + const NDArray idx_b) { + auto device = runtime::DeviceAPI::Get(A->ctx); + cudaStream_t stream = runtime::getCurrentCUDAStream(); + int64_t out_len = B->shape[2]; // cols of B + int64_t in_len = A->shape[1]; // cols of A + const int64_t tot_num_rows = A->shape[0]; + const int ntx = 128; + const int warp_size = 32; + const int nbx = ((tot_num_rows * warp_size + ntx - 1) / ntx); + const dim3 nblks(nbx); + const dim3 nthrs(ntx); + CUDA_KERNEL_CALL( + (cuda::GatherMMScatterKernel), nblks, nthrs, 0, stream, + A.Ptr(), B.Ptr(), C.Ptr(), idx_a.Ptr(), + idx_b.Ptr(), nullptr, tot_num_rows, in_len, out_len); +} + +/** + * @brief Implementation of Gather_mm operator. The input matrix A is + * expected to be sorted according to relation type. + * @param A The input dense matrix of dimension m x k + * @param B The input dense matrix of dimension k x n + * @param C The output dense matrix of dimension m x n + * @param idx_a The input vector to gather left hand operand on + * @param idx_b The input vector to gather right hand operand on + * @param idx_c The input vector to gather output operand on + * @param num_rel The number of idx types in idx_b + * @param a_trans Matrix A to be transposed + * @param b_trans Matrix B to be transposed + */ +template +void GatherMMScatter( + const NDArray A, const NDArray B, NDArray C, const NDArray idx_a, + const NDArray idx_b, const NDArray idx_c) { + auto device = runtime::DeviceAPI::Get(A->ctx); + cudaStream_t stream = runtime::getCurrentCUDAStream(); + const IdType* idx_c_data = idx_c.Ptr(); + int64_t out_len = (B->ndim == 2) ? 
B->shape[1] : B->shape[2]; // cols of B + int64_t in_len = A->shape[1]; // cols of A + int64_t tot_num_rows = A->shape[0]; + const int ntx = 128; + const int warp_size = 32; + const int nbx = ((tot_num_rows * warp_size + ntx - 1) / ntx); + const dim3 nblks(nbx); + const dim3 nthrs(ntx); + if (B->ndim == 3) { + CUDA_KERNEL_CALL( + (cuda::GatherMMScatterKernel), nblks, nthrs, 0, stream, + A.Ptr(), B.Ptr(), C.Ptr(), idx_a.Ptr(), + idx_b.Ptr(), idx_c.Ptr(), tot_num_rows, in_len, + out_len); + } else { + // Custom kernel for W_grad[idx_c[i]] = H^T[i] * C.grad[i] + // This kernel accesses rows of A in a transposed way w/o explicitly + // converting A + CUDA_KERNEL_CALL( + (cuda::GatherMMScatterKernel2), nblks, nthrs, 0, stream, + A.Ptr(), B.Ptr(), C.Ptr(), idx_a.Ptr(), + idx_b.Ptr(), idx_c.Ptr(), tot_num_rows, in_len, + out_len); + } +} + +template void GatherMM( + const NDArray A, const NDArray B, NDArray C, const NDArray idx_a, + const NDArray idx_b); +template void GatherMM( + const NDArray A, const NDArray B, NDArray C, const NDArray idx_a, + const NDArray idx_b); +#if BF16_ENABLED +template void GatherMM( + const NDArray A, const NDArray B, NDArray C, const NDArray idx_a, + const NDArray idx_b); +template void GatherMM( + const NDArray A, const NDArray B, NDArray C, const NDArray idx_a, + const NDArray idx_b); +#endif // BF16_ENABLED +template void GatherMM( + const NDArray A, const NDArray B, NDArray C, const NDArray idx_a, + const NDArray idx_b); +template void GatherMM( + const NDArray A, const NDArray B, NDArray C, const NDArray idx_a, + const NDArray idx_b); +template void GatherMM( + const NDArray A, const NDArray B, NDArray C, const NDArray idx_a, + const NDArray idx_b); +template void GatherMM( + const NDArray A, const NDArray B, NDArray C, const NDArray idx_a, + const NDArray idx_b); + +template void GatherMMScatter( + const NDArray A, const NDArray B, NDArray C, const NDArray idx_a, + const NDArray idx_b, const NDArray idx_c); +template void GatherMMScatter( + const NDArray A, const NDArray B, NDArray C, const NDArray idx_a, + const NDArray idx_b, const NDArray idx_c); +#if BF16_ENABLED +template void GatherMMScatter( + const NDArray A, const NDArray B, NDArray C, const NDArray idx_a, + const NDArray idx_b, const NDArray idx_c); +template void GatherMMScatter( + const NDArray A, const NDArray B, NDArray C, const NDArray idx_a, + const NDArray idx_b, const NDArray idx_c); +#endif // BF16_ENABLED +template void GatherMMScatter( + const NDArray A, const NDArray B, NDArray C, const NDArray idx_a, + const NDArray idx_b, const NDArray idx_c); +template void GatherMMScatter( + const NDArray A, const NDArray B, NDArray C, const NDArray idx_a, + const NDArray idx_b, const NDArray idx_c); +template void GatherMMScatter( + const NDArray A, const NDArray B, NDArray C, const NDArray idx_a, + const NDArray idx_b, const NDArray idx_c); +template void GatherMMScatter( + const NDArray A, const NDArray B, NDArray C, const NDArray idx_a, + const NDArray idx_b, const NDArray idx_c); + +template void SegmentMM( + const NDArray A, const NDArray B, NDArray C, const NDArray seglen_A, + bool a_trans, bool b_trans); +template void SegmentMM( + const NDArray A, const NDArray B, NDArray C, const NDArray seglen_A, + bool a_trans, bool b_trans); +#if BF16_ENABLED +template void SegmentMM( + const NDArray A, const NDArray B, NDArray C, const NDArray seglen_A, + bool a_trans, bool b_trans); +template void SegmentMM( + const NDArray A, const NDArray B, NDArray C, const NDArray seglen_A, + bool a_trans, bool 
b_trans); +#endif // BF16_ENABLED +template void SegmentMM( + const NDArray A, const NDArray B, NDArray C, const NDArray seglen_A, + bool a_trans, bool b_trans); +template void SegmentMM( + const NDArray A, const NDArray B, NDArray C, const NDArray seglen_A, + bool a_trans, bool b_trans); +template void SegmentMM( + const NDArray A, const NDArray B, NDArray C, const NDArray seglen_A, + bool a_trans, bool b_trans); +template void SegmentMM( + const NDArray A, const NDArray B, NDArray C, const NDArray seglen_A, + bool a_trans, bool b_trans); + +template void SegmentMMBackwardB( + const NDArray A, const NDArray dC, NDArray dB, const NDArray seglen); +template void SegmentMMBackwardB( + const NDArray A, const NDArray dC, NDArray dB, const NDArray seglen); +#if BF16_ENABLED +template void SegmentMMBackwardB( + const NDArray A, const NDArray dC, NDArray dB, const NDArray seglen); +template void SegmentMMBackwardB( + const NDArray A, const NDArray dC, NDArray dB, const NDArray seglen); +#endif // BF16_ENABLED +template void SegmentMMBackwardB( + const NDArray A, const NDArray dC, NDArray dB, const NDArray seglen); +template void SegmentMMBackwardB( + const NDArray A, const NDArray dC, NDArray dB, const NDArray seglen); +template void SegmentMMBackwardB( + const NDArray A, const NDArray dC, NDArray dB, const NDArray seglen); +template void SegmentMMBackwardB( + const NDArray A, const NDArray dC, NDArray dB, const NDArray seglen); + +} // namespace aten +} // namespace dgl diff --git a/src/array/cuda/ge_spmm.cuh b/src/array/cuda/ge_spmm.cuh index 863f90e97876..2f8939fbb86f 100644 --- a/src/array/cuda/ge_spmm.cuh +++ b/src/array/cuda/ge_spmm.cuh @@ -1,3 +1,4 @@ +#include "hip/hip_runtime.h" /** * Copyright (c) 2020 by Contributors * @file array/cuda/ge_spmm.cuh @@ -121,7 +122,7 @@ void GESpMMCsr( const DType* efeat_data = efeat.Ptr(); DType* out_data = out.Ptr(); - cudaStream_t stream = runtime::getCurrentCUDAStream(); + hipStream_t stream = runtime::getCurrentCUDAStream(); const int ntx = 32; const int nty = 32; diff --git a/src/array/cuda/ge_spmm.cuh.prehip b/src/array/cuda/ge_spmm.cuh.prehip new file mode 100644 index 000000000000..863f90e97876 --- /dev/null +++ b/src/array/cuda/ge_spmm.cuh.prehip @@ -0,0 +1,144 @@ +/** + * Copyright (c) 2020 by Contributors + * @file array/cuda/ge_spmm.cuh + * @brief GE-SpMM CUDA kernel function header. + */ +#ifndef DGL_ARRAY_CUDA_GE_SPMM_CUH_ +#define DGL_ARRAY_CUDA_GE_SPMM_CUH_ + +#include "../../runtime/cuda/cuda_common.h" +#include "./utils.h" +#include "atomic.cuh" +#include "macro.cuh" + +namespace dgl { + +using namespace cuda; + +namespace aten { +namespace cuda { + +/** + * @brief CUDA kernel of GE-SpMM on Csr. + * @note GE-SpMM: https://arxiv.org/pdf/2007.03179.pdf + * The grid dimension x and y are reordered for better performance. 
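+ * Each thread accumulates two feature columns 32 apart (64 columns per block + * along y), the nonzeros of a row are consumed in chunks of 32 via __ldg loads, + * and the last feature block takes a guarded path so columns beyond feat_len are + * never written.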
+ */ +template +__global__ void GESpMMKernel( + const DType* __restrict__ ufeat, const DType* __restrict__ efeat, + DType* __restrict__ out, const Idx* __restrict__ indptr, + const Idx* __restrict__ indices, const int64_t num_rows, + const int64_t num_cols, const int64_t feat_len) { + const Idx rid = + blockIdx.x * blockDim.y + threadIdx.y; // over vertices dimension + const Idx fid = (blockIdx.y * 64) + threadIdx.x; // over feature dimension + + if (rid < num_rows && fid < feat_len) { + const Idx low = __ldg(indptr + rid), high = __ldg(indptr + rid + 1); + DType accum_0 = 0., accum_1 = 0.; + + if (blockIdx.y != gridDim.y - 1) { // fid + 32 < feat_len + for (Idx left = low; left < high; left += 32) { + if (left + 32 <= high) { +#pragma unroll + for (Idx i = 0; i < 32; ++i) { + const Idx eid = left + i; + const Idx cid = __ldg(indices + eid); + const Idx offset = feat_len * cid + fid; + if (BinaryOp::use_rhs) { + accum_0 += BinaryOp::Call(ufeat + offset, efeat + eid); + accum_1 += BinaryOp::Call(ufeat + offset + 32, efeat + eid); + } else { + accum_0 += ufeat[offset]; + accum_1 += ufeat[offset + 32]; + } + } + } else { + for (Idx i = 0; left + i < high; ++i) { + const Idx eid = left + i; + const Idx cid = __ldg(indices + eid); + const Idx offset = feat_len * cid + fid; + if (BinaryOp::use_rhs) { + accum_0 += BinaryOp::Call(ufeat + offset, efeat + eid); + accum_1 += BinaryOp::Call(ufeat + offset + 32, efeat + eid); + } else { + accum_0 += ufeat[offset]; + accum_1 += ufeat[offset + 32]; + } + } + } + + out[feat_len * rid + fid] = accum_0; + out[feat_len * rid + fid + 32] = accum_1; + } + } else { + const Idx fid_0 = fid < feat_len ? fid : 0, + fid_1 = fid + 32 < feat_len ? fid + 32 : 0; + for (int left = low; left < high; left += 32) { + if (left + 32 <= high) { +#pragma unroll + for (int i = 0; i < 32; ++i) { + const Idx eid = left + i; + const Idx cid = __ldg(indices + eid); + const Idx offset = feat_len * cid; + if (BinaryOp::use_rhs) { + accum_0 += BinaryOp::Call(ufeat + offset + fid_0, efeat + eid); + accum_1 += BinaryOp::Call(ufeat + offset + fid_1, efeat + eid); + } else { + accum_0 += ufeat[offset + fid_0]; + accum_1 += ufeat[offset + fid_1]; + } + } + } else { + for (int i = 0; i + left < high; ++i) { + const Idx eid = left + i; + const Idx cid = __ldg(indices + eid); + const Idx offset = feat_len * cid; + if (BinaryOp::use_rhs) { + accum_0 += BinaryOp::Call(ufeat + offset + fid_0, efeat + eid); + accum_1 += BinaryOp::Call(ufeat + offset + fid_1, efeat + eid); + } else { + accum_0 += ufeat[offset + fid_0]; + accum_1 += ufeat[offset + fid_1]; + } + } + } + + out[feat_len * rid + fid] = accum_0; + if (fid + 32 < feat_len) out[feat_len * rid + fid + 32] = accum_1; + } + } + } +} + +template +void GESpMMCsr( + const CSRMatrix& csr, NDArray ufeat, NDArray efeat, NDArray out, + int64_t feat_len) { + const Idx* indptr = csr.indptr.Ptr(); + const Idx* indices = csr.indices.Ptr(); + const DType* ufeat_data = ufeat.Ptr(); + const DType* efeat_data = efeat.Ptr(); + DType* out_data = out.Ptr(); + + cudaStream_t stream = runtime::getCurrentCUDAStream(); + + const int ntx = 32; + const int nty = 32; + const int nby = (feat_len + (ntx * 2) - 1) / (ntx * 2); + const int nbx = (csr.num_rows + nty - 1) / nty; + const dim3 nblks(nbx, nby); + const dim3 nthrs(ntx, nty); + const int sh_mem_size = 0; + + CUDA_KERNEL_CALL( + (GESpMMKernel), nblks, nthrs, sh_mem_size, stream, + ufeat_data, efeat_data, out_data, indptr, indices, csr.num_rows, + csr.num_cols, feat_len); +} + +} // namespace cuda +} // 
namespace aten +} // namespace dgl + +#endif // DGL_ARRAY_CUDA_GE_SPMM_CUH_ diff --git a/src/array/cuda/labor_sampling.cu b/src/array/cuda/labor_sampling.cu index c5076e62981d..3125a1bc3390 100644 --- a/src/array/cuda/labor_sampling.cu +++ b/src/array/cuda/labor_sampling.cu @@ -1,3 +1,4 @@ +#include "hip/hip_runtime.h" /*! * Copyright (c) 2022, NVIDIA Corporation * Copyright (c) 2022, GT-TDAlab (Muhammed Fatih Balin & Umit V. Catalyurek) @@ -35,7 +36,7 @@ #include #include -#include // NOLINT +#include // NOLINT #include #include #include @@ -276,7 +277,7 @@ __global__ void _CSRRowWiseLayerSampleDegreeKernel( const FloatType* const ds, const FloatType* const d2s, const IdType* const indptr, const FloatType* const probs, const FloatType* const A, const IdType* const subindptr) { - typedef cub::BlockReduce BlockReduce; + typedef hipcub::BlockReduce BlockReduce; __shared__ typename BlockReduce::TempStorage temp_storage; __shared__ FloatType var_1_bcast[BLOCK_CTAS]; @@ -350,7 +351,7 @@ int log_size(const IdType size) { template void compute_importance_sampling_probabilities( - CSRMatrix mat, const IdType hop_size, cudaStream_t stream, + CSRMatrix mat, const IdType hop_size, hipStream_t stream, const continuous_seed seed, const IdType num_rows, const IdType* indptr, const IdType* subindptr, const IdType* indices, IdArray idx_coo_arr, const IdType* nids, @@ -397,17 +398,17 @@ void compute_importance_sampling_probabilities( hop_1, 0, hop_2.get(), 0, sizeof(IdType) * hop_size, ctx, ctx, mat.indptr->dtype); - cub::DoubleBuffer hop_b(hop_2.get(), hop_3.get()); + hipcub::DoubleBuffer hop_b(hop_2.get(), hop_3.get()); { std::size_t temp_storage_bytes = 0; - CUDA_CALL(cub::DeviceRadixSort::SortKeys( + CUDA_CALL(hipcub::DeviceRadixSort::SortKeys( nullptr, temp_storage_bytes, hop_b, hop_size, 0, max_log_num_vertices, stream)); auto temp = allocator.alloc_unique(temp_storage_bytes); - CUDA_CALL(cub::DeviceRadixSort::SortKeys( + CUDA_CALL(hipcub::DeviceRadixSort::SortKeys( temp.get(), temp_storage_bytes, hop_b, hop_size, 0, max_log_num_vertices, stream)); } @@ -417,13 +418,13 @@ void compute_importance_sampling_probabilities( { std::size_t temp_storage_bytes = 0; - CUDA_CALL(cub::DeviceRunLengthEncode::Encode( + CUDA_CALL(hipcub::DeviceRunLengthEncode::Encode( nullptr, temp_storage_bytes, hop_b.Current(), hop_unique.get(), hop_counts.get(), hop_unique_size.get(), hop_size, stream)); auto temp = allocator.alloc_unique(temp_storage_bytes); - CUDA_CALL(cub::DeviceRunLengthEncode::Encode( + CUDA_CALL(hipcub::DeviceRunLengthEncode::Encode( temp.get(), temp_storage_bytes, hop_b.Current(), hop_unique.get(), hop_counts.get(), hop_unique_size.get(), hop_size, stream)); @@ -521,7 +522,7 @@ std::pair CSRLaborSampling( runtime::CUDAWorkspaceAllocator allocator(ctx); const auto stream = runtime::getCurrentCUDAStream(); - const auto exec_policy = thrust::cuda::par_nosync(allocator).on(stream); + const auto exec_policy = thrust::hip::par_nosync(allocator).on(stream); auto device = runtime::DeviceAPI::Get(ctx); @@ -568,11 +569,11 @@ std::pair CSRLaborSampling( auto ds_d2s = thrust::make_zip_iterator(ds, d2s); size_t prefix_temp_size = 0; - CUDA_CALL(cub::DeviceSegmentedReduce::Reduce( + CUDA_CALL(hipcub::DeviceSegmentedReduce::Reduce( nullptr, prefix_temp_size, A_A2, ds_d2s, num_rows, b_offsets, e_offsets, TupleSum{}, thrust::make_tuple((FloatType)0, (FloatType)0), stream)); auto temp = allocator.alloc_unique(prefix_temp_size); - CUDA_CALL(cub::DeviceSegmentedReduce::Reduce( + 
CUDA_CALL(hipcub::DeviceSegmentedReduce::Reduce( temp.get(), prefix_temp_size, A_A2, ds_d2s, num_rows, b_offsets, e_offsets, TupleSum{}, thrust::make_tuple((FloatType)0, (FloatType)0), stream)); @@ -585,11 +586,11 @@ std::pair CSRLaborSampling( IdType hop_size; { size_t prefix_temp_size = 0; - CUDA_CALL(cub::DeviceScan::ExclusiveSum( + CUDA_CALL(hipcub::DeviceScan::ExclusiveSum( nullptr, prefix_temp_size, in_deg.get(), subindptr, num_rows + 1, stream)); auto temp = allocator.alloc_unique(prefix_temp_size); - CUDA_CALL(cub::DeviceScan::ExclusiveSum( + CUDA_CALL(hipcub::DeviceScan::ExclusiveSum( temp.get(), prefix_temp_size, in_deg.get(), subindptr, num_rows + 1, stream)); @@ -618,11 +619,11 @@ std::pair CSRLaborSampling( auto modified_in_deg = thrust::make_transform_iterator( iota, AlignmentFunc{in_deg.get(), perm, num_rows}); size_t prefix_temp_size = 0; - CUDA_CALL(cub::DeviceScan::ExclusiveSum( + CUDA_CALL(hipcub::DeviceScan::ExclusiveSum( nullptr, prefix_temp_size, modified_in_deg, subindptr_aligned.get(), num_rows + 1, stream)); auto temp = allocator.alloc_unique(prefix_temp_size); - CUDA_CALL(cub::DeviceScan::ExclusiveSum( + CUDA_CALL(hipcub::DeviceScan::ExclusiveSum( temp.get(), prefix_temp_size, modified_in_deg, subindptr_aligned.get(), num_rows + 1, stream)); diff --git a/src/array/cuda/labor_sampling.cu.prehip b/src/array/cuda/labor_sampling.cu.prehip new file mode 100644 index 000000000000..c5076e62981d --- /dev/null +++ b/src/array/cuda/labor_sampling.cu.prehip @@ -0,0 +1,833 @@ +/*! + * Copyright (c) 2022, NVIDIA Corporation + * Copyright (c) 2022, GT-TDAlab (Muhammed Fatih Balin & Umit V. Catalyurek) + * All rights reserved. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + * + * @file array/cuda/labor_sampling.cu + * @brief labor sampling + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include // NOLINT +#include +#include +#include +#include + +#include "../../array/cuda/utils.h" +#include "../../random/continuous_seed.h" +#include "../../runtime/cuda/cuda_common.h" +#include "./functor.cuh" +#include "./spmm.cuh" + +namespace dgl { +namespace aten { +namespace impl { + +using dgl::random::continuous_seed; + +constexpr int BLOCK_SIZE = 128; +constexpr int CTA_SIZE = 128; +constexpr double eps = 0.0001; + +namespace { + +template +struct TransformOp { + const IdType* idx_coo; + const IdType* rows; + const IdType* indptr; + const IdType* subindptr; + const IdType* indices; + const IdType* data_arr; + bool is_pinned; + __host__ __device__ auto operator()(IdType idx) { + const auto in_row = idx_coo[idx]; + const auto row = rows[in_row]; + const auto in_idx = indptr[in_row] + idx - subindptr[in_row]; + const auto u = indices[is_pinned ? idx : in_idx]; + const auto data = data_arr ? 
data_arr[in_idx] : in_idx; + return thrust::make_tuple(row, u, data); + } +}; + +template < + typename IdType, typename FloatType, typename probs_t, typename A_t, + typename B_t> +struct TransformOpImp { + probs_t probs; + A_t A; + B_t B; + const IdType* idx_coo; + const IdType* rows; + const FloatType* cs; + const IdType* indptr; + const IdType* subindptr; + const IdType* indices; + const IdType* data_arr; + bool is_pinned; + __host__ __device__ auto operator()(IdType idx) { + const auto ps = probs[idx]; + const auto in_row = idx_coo[idx]; + const auto c = cs[in_row]; + const auto row = rows[in_row]; + const auto in_idx = indptr[in_row] + idx - subindptr[in_row]; + const auto u = indices[is_pinned ? idx : in_idx]; + const auto w = A[in_idx]; + const auto w2 = B[in_idx]; + const auto data = data_arr ? data_arr[in_idx] : in_idx; + return thrust::make_tuple( + in_row, row, u, data, w / min((FloatType)1, c * w2 * ps)); + } +}; + +template +struct StencilOp { + const FloatType* cs; + template + __host__ __device__ auto operator()( + IdType in_row, FloatType ps, FloatType rnd) { + return rnd <= cs[in_row] * ps; + } +}; + +template +struct StencilOpFused { + const continuous_seed seed; + const IdType* idx_coo; + const FloatType* cs; + const ps_t probs; + const A_t A; + const IdType* subindptr; + const IdType* indptr; + const IdType* indices; + const IdType* nids; + bool is_pinned; + __device__ auto operator()(IdType idx) { + const auto in_row = idx_coo[idx]; + const auto ps = probs[idx]; + IdType rofs = idx - subindptr[in_row]; + const auto in_idx = indptr[in_row] + rofs; + const auto u = indices[is_pinned ? idx : in_idx]; + const auto t = nids ? nids[u] : u; // t in the paper + // rolled random number r_t is a function of the random_seed and t + const float rnd = seed.uniform(t); + return rnd <= cs[in_row] * A[in_idx] * ps; + } +}; + +template +struct TransformOpMean { + const IdType* ds; + const FloatType* ws; + __host__ __device__ auto operator()(IdType idx, FloatType ps) { + return ps * ds[idx] / ws[idx]; + } +}; + +struct TransformOpMinWith1 { + template + __host__ __device__ auto operator()(FloatType x) { + return min((FloatType)1, x); + } +}; + +template +struct IndptrFunc { + const IdType* indptr; + const IdType* in_deg; + __host__ __device__ auto operator()(IdType row) { + return indptr[row] + (in_deg ? 
in_deg[row] : 0); + } +}; + +template +struct SquareFunc { + __host__ __device__ auto operator()(FloatType x) { + return thrust::make_tuple(x, x * x); + } +}; + +struct TupleSum { + template + __host__ __device__ T operator()(const T& a, const T& b) const { + return thrust::make_tuple( + thrust::get<0>(a) + thrust::get<0>(b), + thrust::get<1>(a) + thrust::get<1>(b)); + } +}; + +template +struct DegreeFunc { + const IdType num_picks; + const IdType* rows; + const IdType* indptr; + IdType* in_deg; + IdType* inrow_indptr; + FloatType* cs; + __host__ __device__ auto operator()(IdType tIdx) { + const auto out_row = rows[tIdx]; + const auto indptr_val = indptr[out_row]; + const auto d = indptr[out_row + 1] - indptr_val; + in_deg[tIdx] = d; + inrow_indptr[tIdx] = indptr_val; + cs[tIdx] = num_picks / (FloatType)d; + } +}; + +template +__global__ void _CSRRowWiseOneHopExtractorKernel( + const continuous_seed seed, const IdType hop_size, + const IdType* const indptr, const IdType* const subindptr, + const IdType* const indices, const IdType* const idx_coo, + const IdType* const nids, const FloatType* const A, FloatType* const rands, + IdType* const hop, FloatType* const A_l) { + IdType tx = static_cast(blockIdx.x) * blockDim.x + threadIdx.x; + const int stride_x = gridDim.x * blockDim.x; + + while (tx < hop_size) { + IdType rpos = idx_coo[tx]; + IdType rofs = tx - subindptr[rpos]; + const auto in_idx = indptr[rpos] + rofs; + const auto not_pinned = indices != hop; + const auto u = indices[not_pinned ? in_idx : tx]; + if (not_pinned) hop[tx] = u; + const auto t = nids ? nids[u] : u; + if (A) A_l[tx] = A[in_idx]; + // rolled random number r_t is a function of the random_seed and t + rands[tx] = (FloatType)seed.uniform(t); + tx += stride_x; + } +} + +constexpr int CACHE_LINE_SIZE = 128; + +template +struct AlignmentFunc { + static_assert(CACHE_LINE_SIZE % sizeof(IdType) == 0); + const IdType* in_deg; + const int64_t* perm; + IdType num_rows; + __host__ __device__ auto operator()(IdType row) { + constexpr int num_elements = CACHE_LINE_SIZE / sizeof(IdType); + return in_deg[perm ? perm[row % num_rows] : row] + num_elements - 1; + } +}; + +template +__global__ void _CSRRowWiseOneHopExtractorAlignedKernel( + const IdType hop_size, const IdType num_rows, const IdType* const indptr, + const IdType* const subindptr, const IdType* const subindptr_aligned, + const IdType* const indices, IdType* const hop, const int64_t* const perm) { + IdType tx = static_cast(blockIdx.x) * blockDim.x + threadIdx.x; + const int stride_x = gridDim.x * blockDim.x; + + while (tx < hop_size) { + const IdType rpos_ = + dgl::cuda::_UpperBound(subindptr_aligned, num_rows, tx) - 1; + const IdType rpos = perm ? 
perm[rpos_] : rpos_; + const auto out_row = subindptr[rpos]; + const auto d = subindptr[rpos + 1] - out_row; + const int offset = + ((uint64_t)(indices + indptr[rpos] - subindptr_aligned[rpos_]) % + CACHE_LINE_SIZE) / + sizeof(IdType); + const IdType rofs = tx - subindptr_aligned[rpos_] - offset; + if (rofs >= 0 && rofs < d) { + const auto in_idx = indptr[rpos] + rofs; + assert((uint64_t)(indices + in_idx - tx) % CACHE_LINE_SIZE == 0); + const auto u = indices[in_idx]; + hop[out_row + rofs] = u; + } + tx += stride_x; + } +} + +template +__global__ void _CSRRowWiseLayerSampleDegreeKernel( + const IdType num_picks, const IdType num_rows, FloatType* const cs, + const FloatType* const ds, const FloatType* const d2s, + const IdType* const indptr, const FloatType* const probs, + const FloatType* const A, const IdType* const subindptr) { + typedef cub::BlockReduce BlockReduce; + __shared__ typename BlockReduce::TempStorage temp_storage; + __shared__ FloatType var_1_bcast[BLOCK_CTAS]; + + // we assign one warp per row + assert(blockDim.x == CTA_SIZE); + assert(blockDim.y == BLOCK_CTAS); + + IdType out_row = blockIdx.x * TILE_SIZE + threadIdx.y; + const auto last_row = + min(static_cast(blockIdx.x + 1) * TILE_SIZE, num_rows); + + constexpr FloatType ONE = 1; + + while (out_row < last_row) { + const auto in_row_start = indptr[out_row]; + const auto out_row_start = subindptr[out_row]; + + const IdType degree = subindptr[out_row + 1] - out_row_start; + + if (degree > 0) { + // stands for k in in arXiv:2210.13339, i.e. fanout + const auto k = min(num_picks, degree); + // slightly better than NS + const FloatType d_ = ds ? ds[out_row] : degree; + // stands for right handside of Equation (22) in arXiv:2210.13339 + FloatType var_target = + d_ * d_ / k + (ds ? d2s[out_row] - d_ * d_ / degree : 0); + + auto c = cs[out_row]; + const int num_valid = min(degree, (IdType)CTA_SIZE); + // stands for left handside of Equation (22) in arXiv:2210.13339 + FloatType var_1; + do { + var_1 = 0; + if (A) { + for (int idx = threadIdx.x; idx < degree; idx += CTA_SIZE) { + const auto w = A[in_row_start + idx]; + const auto ps = probs ? probs[out_row_start + idx] : w; + var_1 += w > 0 ? 
w * w / min(ONE, c * ps) : 0; + } + } else { + for (int idx = threadIdx.x; idx < degree; idx += CTA_SIZE) { + const auto ps = probs[out_row_start + idx]; + var_1 += 1 / min(ONE, c * ps); + } + } + var_1 = BlockReduce(temp_storage).Sum(var_1, num_valid); + if (threadIdx.x == 0) var_1_bcast[threadIdx.y] = var_1; + __syncthreads(); + var_1 = var_1_bcast[threadIdx.y]; + + c *= var_1 / var_target; + } while (min(var_1, var_target) / max(var_1, var_target) < 1 - eps); + + if (threadIdx.x == 0) cs[out_row] = c; + } + + out_row += BLOCK_CTAS; + } +} + +} // namespace + +template +int log_size(const IdType size) { + if (size <= 0) return 0; + for (int i = 0; i < static_cast(sizeof(IdType)) * 8; i++) + if (((size - 1) >> i) == 0) return i; + return sizeof(IdType) * 8; +} + +template +void compute_importance_sampling_probabilities( + CSRMatrix mat, const IdType hop_size, cudaStream_t stream, + const continuous_seed seed, const IdType num_rows, const IdType* indptr, + const IdType* subindptr, const IdType* indices, IdArray idx_coo_arr, + const IdType* nids, + FloatArray cs_arr, // holds the computed cs values, has size num_rows + const bool weighted, const FloatType* A, const FloatType* ds, + const FloatType* d2s, const IdType num_picks, DGLContext ctx, + const runtime::CUDAWorkspaceAllocator& allocator, + const exec_policy_t& exec_policy, const int importance_sampling, + IdType* hop_1, // holds the contiguous one-hop neighborhood, has size |E| + FloatType* rands, // holds the rolled random numbers r_t for each edge, has + // size |E| + FloatType* probs_found) { // holds the computed pi_t values for each edge, + // has size |E| + auto device = runtime::DeviceAPI::Get(ctx); + auto idx_coo = idx_coo_arr.Ptr(); + auto cs = cs_arr.Ptr(); + FloatArray A_l_arr = weighted + ? NewFloatArray(hop_size, ctx, sizeof(FloatType) * 8) + : NullArray(); + auto A_l = A_l_arr.Ptr(); + + const int max_log_num_vertices = log_size(mat.num_cols); + + { // extracts the onehop neighborhood cols to a contiguous range into hop_1 + const dim3 block(BLOCK_SIZE); + const dim3 grid((hop_size + BLOCK_SIZE - 1) / BLOCK_SIZE); + CUDA_KERNEL_CALL( + (_CSRRowWiseOneHopExtractorKernel), grid, block, 0, + stream, seed, hop_size, indptr, subindptr, indices, idx_coo, nids, + weighted ? A : nullptr, rands, hop_1, A_l); + } + int64_t hop_uniq_size = 0; + IdArray hop_new_arr = NewIdArray(hop_size, ctx, sizeof(IdType) * 8); + auto hop_new = hop_new_arr.Ptr(); + auto hop_unique = allocator.alloc_unique(hop_size); + // After this block, hop_unique holds the unique set of one-hop neighborhood + // and hop_new holds the relabeled hop_1, idx_coo already holds relabeled + // destination. 
hop_unique[hop_new] == hop_1 holds + { + auto hop_2 = allocator.alloc_unique(hop_size); + auto hop_3 = allocator.alloc_unique(hop_size); + + device->CopyDataFromTo( + hop_1, 0, hop_2.get(), 0, sizeof(IdType) * hop_size, ctx, ctx, + mat.indptr->dtype); + + cub::DoubleBuffer hop_b(hop_2.get(), hop_3.get()); + + { + std::size_t temp_storage_bytes = 0; + CUDA_CALL(cub::DeviceRadixSort::SortKeys( + nullptr, temp_storage_bytes, hop_b, hop_size, 0, max_log_num_vertices, + stream)); + + auto temp = allocator.alloc_unique(temp_storage_bytes); + + CUDA_CALL(cub::DeviceRadixSort::SortKeys( + temp.get(), temp_storage_bytes, hop_b, hop_size, 0, + max_log_num_vertices, stream)); + } + + auto hop_counts = allocator.alloc_unique(hop_size + 1); + auto hop_unique_size = allocator.alloc_unique(1); + + { + std::size_t temp_storage_bytes = 0; + CUDA_CALL(cub::DeviceRunLengthEncode::Encode( + nullptr, temp_storage_bytes, hop_b.Current(), hop_unique.get(), + hop_counts.get(), hop_unique_size.get(), hop_size, stream)); + + auto temp = allocator.alloc_unique(temp_storage_bytes); + + CUDA_CALL(cub::DeviceRunLengthEncode::Encode( + temp.get(), temp_storage_bytes, hop_b.Current(), hop_unique.get(), + hop_counts.get(), hop_unique_size.get(), hop_size, stream)); + + device->CopyDataFromTo( + hop_unique_size.get(), 0, &hop_uniq_size, 0, sizeof(hop_uniq_size), + ctx, DGLContext{kDGLCPU, 0}, mat.indptr->dtype); + } + + thrust::lower_bound( + exec_policy, hop_unique.get(), hop_unique.get() + hop_uniq_size, hop_1, + hop_1 + hop_size, hop_new); + } + + // @todo Consider creating a CSC because the SpMV will be done multiple times. + COOMatrix rmat( + num_rows, hop_uniq_size, idx_coo_arr, hop_new_arr, NullArray(), true, + mat.sorted); + + BcastOff bcast_off; + bcast_off.use_bcast = false; + bcast_off.out_len = 1; + bcast_off.lhs_len = 1; + bcast_off.rhs_len = 1; + + FloatArray probs_arr = + NewFloatArray(hop_uniq_size, ctx, sizeof(FloatType) * 8); + auto probs_1 = probs_arr.Ptr(); + FloatArray probs_arr_2 = + NewFloatArray(hop_uniq_size, ctx, sizeof(FloatType) * 8); + auto probs = probs_arr_2.Ptr(); + auto arg_u = NewIdArray(hop_uniq_size, ctx, sizeof(IdType) * 8); + auto arg_e = NewIdArray(hop_size, ctx, sizeof(IdType) * 8); + + double prev_ex_nodes = hop_uniq_size; + + for (int iters = 0; iters < importance_sampling || importance_sampling < 0; + iters++) { + if (weighted && iters == 0) { + cuda::SpMMCoo< + IdType, FloatType, cuda::binary::Mul, + cuda::reduce::Max>( + bcast_off, rmat, cs_arr, A_l_arr, probs_arr_2, arg_u, arg_e); + } else { + cuda::SpMMCoo< + IdType, FloatType, cuda::binary::CopyLhs, + cuda::reduce::Max>( + bcast_off, rmat, cs_arr, NullArray(), iters ? probs_arr : probs_arr_2, + arg_u, arg_e); + } + + if (iters) + thrust::transform( + exec_policy, probs_1, probs_1 + hop_uniq_size, probs, probs, + thrust::multiplies{}); + + thrust::gather( + exec_policy, hop_new, hop_new + hop_size, probs, probs_found); + + { + constexpr int BLOCK_CTAS = BLOCK_SIZE / CTA_SIZE; + // the number of rows each thread block will cover + constexpr int TILE_SIZE = BLOCK_CTAS; + const dim3 block(CTA_SIZE, BLOCK_CTAS); + const dim3 grid((num_rows + TILE_SIZE - 1) / TILE_SIZE); + CUDA_KERNEL_CALL( + (_CSRRowWiseLayerSampleDegreeKernel< + IdType, FloatType, BLOCK_CTAS, TILE_SIZE>), + grid, block, 0, stream, (IdType)num_picks, num_rows, cs, + weighted ? ds : nullptr, weighted ? 
d2s : nullptr, indptr, + probs_found, A, subindptr); + } + + { + auto probs_min_1 = + thrust::make_transform_iterator(probs, TransformOpMinWith1{}); + const double cur_ex_nodes = thrust::reduce( + exec_policy, probs_min_1, probs_min_1 + hop_uniq_size, 0.0); + if (cur_ex_nodes / prev_ex_nodes >= 1 - eps) break; + prev_ex_nodes = cur_ex_nodes; + } + } +} + +/////////////////////////////// CSR /////////////////////////////// + +template +std::pair CSRLaborSampling( + CSRMatrix mat, IdArray rows_arr, const int64_t num_picks, + FloatArray prob_arr, const int importance_sampling, IdArray random_seed_arr, + float seed2_contribution, IdArray NIDs) { + const bool weighted = !IsNullArray(prob_arr); + + const auto& ctx = rows_arr->ctx; + + runtime::CUDAWorkspaceAllocator allocator(ctx); + + const auto stream = runtime::getCurrentCUDAStream(); + const auto exec_policy = thrust::cuda::par_nosync(allocator).on(stream); + + auto device = runtime::DeviceAPI::Get(ctx); + + const IdType num_rows = rows_arr->shape[0]; + IdType* const rows = rows_arr.Ptr(); + IdType* const nids = IsNullArray(NIDs) ? nullptr : NIDs.Ptr(); + FloatType* const A = prob_arr.Ptr(); + + IdType* const indptr_ = mat.indptr.Ptr(); + IdType* const indices_ = mat.indices.Ptr(); + IdType* const data = CSRHasData(mat) ? mat.data.Ptr() : nullptr; + + // Read indptr only once in case it is pinned and access is slow. + auto indptr = allocator.alloc_unique(num_rows); + // compute in-degrees + auto in_deg = allocator.alloc_unique(num_rows + 1); + // cs stands for c_s in arXiv:2210.13339 + FloatArray cs_arr = NewFloatArray(num_rows, ctx, sizeof(FloatType) * 8); + auto cs = cs_arr.Ptr(); + // ds stands for A_{*s} in arXiv:2210.13339 + FloatArray ds_arr = weighted + ? NewFloatArray(num_rows, ctx, sizeof(FloatType) * 8) + : NullArray(); + auto ds = ds_arr.Ptr(); + // d2s stands for (A^2)_{*s} in arXiv:2210.13339, ^2 is elementwise. + FloatArray d2s_arr = weighted + ? 
NewFloatArray(num_rows, ctx, sizeof(FloatType) * 8) + : NullArray(); + auto d2s = d2s_arr.Ptr(); + + thrust::counting_iterator iota(0); + thrust::for_each( + exec_policy, iota, iota + num_rows, + DegreeFunc{ + (IdType)num_picks, rows, indptr_, in_deg.get(), indptr.get(), cs}); + + if (weighted) { + auto b_offsets = thrust::make_transform_iterator( + iota, IndptrFunc{indptr.get(), nullptr}); + auto e_offsets = thrust::make_transform_iterator( + iota, IndptrFunc{indptr.get(), in_deg.get()}); + + auto A_A2 = thrust::make_transform_iterator(A, SquareFunc{}); + auto ds_d2s = thrust::make_zip_iterator(ds, d2s); + + size_t prefix_temp_size = 0; + CUDA_CALL(cub::DeviceSegmentedReduce::Reduce( + nullptr, prefix_temp_size, A_A2, ds_d2s, num_rows, b_offsets, e_offsets, + TupleSum{}, thrust::make_tuple((FloatType)0, (FloatType)0), stream)); + auto temp = allocator.alloc_unique(prefix_temp_size); + CUDA_CALL(cub::DeviceSegmentedReduce::Reduce( + temp.get(), prefix_temp_size, A_A2, ds_d2s, num_rows, b_offsets, + e_offsets, TupleSum{}, thrust::make_tuple((FloatType)0, (FloatType)0), + stream)); + } + + // fill subindptr + IdArray subindptr_arr = NewIdArray(num_rows + 1, ctx, sizeof(IdType) * 8); + auto subindptr = subindptr_arr.Ptr(); + + IdType hop_size; + { + size_t prefix_temp_size = 0; + CUDA_CALL(cub::DeviceScan::ExclusiveSum( + nullptr, prefix_temp_size, in_deg.get(), subindptr, num_rows + 1, + stream)); + auto temp = allocator.alloc_unique(prefix_temp_size); + CUDA_CALL(cub::DeviceScan::ExclusiveSum( + temp.get(), prefix_temp_size, in_deg.get(), subindptr, num_rows + 1, + stream)); + + device->CopyDataFromTo( + subindptr, num_rows * sizeof(hop_size), &hop_size, 0, sizeof(hop_size), + ctx, DGLContext{kDGLCPU, 0}, mat.indptr->dtype); + } + IdArray hop_arr = NewIdArray(hop_size, ctx, sizeof(IdType) * 8); + CSRMatrix smat( + num_rows, mat.num_cols, subindptr_arr, hop_arr, NullArray(), mat.sorted); + // @todo Consider fusing CSRToCOO into StencilOpFused kernel + auto smatcoo = CSRToCOO(smat, false); + + auto idx_coo_arr = smatcoo.row; + auto idx_coo = idx_coo_arr.Ptr(); + + auto hop_1 = hop_arr.Ptr(); + const bool is_pinned = mat.indices.IsPinned(); + if (is_pinned) { + const auto res = Sort(rows_arr, log_size(mat.num_rows)); + const int64_t* perm = static_cast(res.second->data); + + IdType hop_size; // Shadows the original one as this is temporary + auto subindptr_aligned = allocator.alloc_unique(num_rows + 1); + { + auto modified_in_deg = thrust::make_transform_iterator( + iota, AlignmentFunc{in_deg.get(), perm, num_rows}); + size_t prefix_temp_size = 0; + CUDA_CALL(cub::DeviceScan::ExclusiveSum( + nullptr, prefix_temp_size, modified_in_deg, subindptr_aligned.get(), + num_rows + 1, stream)); + auto temp = allocator.alloc_unique(prefix_temp_size); + CUDA_CALL(cub::DeviceScan::ExclusiveSum( + temp.get(), prefix_temp_size, modified_in_deg, + subindptr_aligned.get(), num_rows + 1, stream)); + + device->CopyDataFromTo( + subindptr_aligned.get(), num_rows * sizeof(hop_size), &hop_size, 0, + sizeof(hop_size), ctx, DGLContext{kDGLCPU, 0}, mat.indptr->dtype); + } + const dim3 block(BLOCK_SIZE); + const dim3 grid((hop_size + BLOCK_SIZE - 1) / BLOCK_SIZE); + CUDA_KERNEL_CALL( + (_CSRRowWiseOneHopExtractorAlignedKernel), grid, block, 0, + stream, hop_size, num_rows, indptr.get(), subindptr, + subindptr_aligned.get(), indices_, hop_1, perm); + } + const auto indices = is_pinned ? hop_1 : indices_; + + auto rands = + allocator.alloc_unique(importance_sampling ? 
hop_size : 1); + auto probs_found = + allocator.alloc_unique(importance_sampling ? hop_size : 1); + + if (weighted) { + // Recompute c for weighted graphs. + constexpr int BLOCK_CTAS = BLOCK_SIZE / CTA_SIZE; + // the number of rows each thread block will cover + constexpr int TILE_SIZE = BLOCK_CTAS; + const dim3 block(CTA_SIZE, BLOCK_CTAS); + const dim3 grid((num_rows + TILE_SIZE - 1) / TILE_SIZE); + CUDA_KERNEL_CALL( + (_CSRRowWiseLayerSampleDegreeKernel< + IdType, FloatType, BLOCK_CTAS, TILE_SIZE>), + grid, block, 0, stream, (IdType)num_picks, num_rows, cs, ds, d2s, + indptr.get(), nullptr, A, subindptr); + } + + const continuous_seed random_seed = + IsNullArray(random_seed_arr) + ? continuous_seed(RandomEngine::ThreadLocal()->RandInt(1000000000)) + : continuous_seed(random_seed_arr, seed2_contribution); + + if (importance_sampling) + compute_importance_sampling_probabilities< + IdType, FloatType, decltype(exec_policy)>( + mat, hop_size, stream, random_seed, num_rows, indptr.get(), subindptr, + indices, idx_coo_arr, nids, cs_arr, weighted, A, ds, d2s, + (IdType)num_picks, ctx, allocator, exec_policy, importance_sampling, + hop_1, rands.get(), probs_found.get()); + + IdArray picked_row = NewIdArray(hop_size, ctx, sizeof(IdType) * 8); + IdArray picked_col = NewIdArray(hop_size, ctx, sizeof(IdType) * 8); + IdArray picked_idx = NewIdArray(hop_size, ctx, sizeof(IdType) * 8); + FloatArray picked_imp = + importance_sampling || weighted + ? NewFloatArray(hop_size, ctx, sizeof(FloatType) * 8) + : NullArray(); + + IdType* const picked_row_data = picked_row.Ptr(); + IdType* const picked_col_data = picked_col.Ptr(); + IdType* const picked_idx_data = picked_idx.Ptr(); + FloatType* const picked_imp_data = picked_imp.Ptr(); + + auto picked_inrow = allocator.alloc_unique( + importance_sampling || weighted ? 
hop_size : 1); + + // Sample edges here + IdType num_edges; + { + thrust::constant_iterator one(1); + if (importance_sampling) { + auto output = thrust::make_zip_iterator( + picked_inrow.get(), picked_row_data, picked_col_data, picked_idx_data, + picked_imp_data); + if (weighted) { + auto transformed_output = thrust::make_transform_output_iterator( + output, + TransformOpImp< + IdType, FloatType, FloatType*, FloatType*, decltype(one)>{ + probs_found.get(), A, one, idx_coo, rows, cs, indptr.get(), + subindptr, indices, data, is_pinned}); + auto stencil = + thrust::make_zip_iterator(idx_coo, probs_found.get(), rands.get()); + num_edges = + thrust::copy_if( + exec_policy, iota, iota + hop_size, stencil, transformed_output, + thrust::make_zip_function(StencilOp{cs})) - + transformed_output; + } else { + auto transformed_output = thrust::make_transform_output_iterator( + output, + TransformOpImp< + IdType, FloatType, FloatType*, decltype(one), decltype(one)>{ + probs_found.get(), one, one, idx_coo, rows, cs, indptr.get(), + subindptr, indices, data, is_pinned}); + auto stencil = + thrust::make_zip_iterator(idx_coo, probs_found.get(), rands.get()); + num_edges = + thrust::copy_if( + exec_policy, iota, iota + hop_size, stencil, transformed_output, + thrust::make_zip_function(StencilOp{cs})) - + transformed_output; + } + } else { + if (weighted) { + auto output = thrust::make_zip_iterator( + picked_inrow.get(), picked_row_data, picked_col_data, + picked_idx_data, picked_imp_data); + auto transformed_output = thrust::make_transform_output_iterator( + output, + TransformOpImp< + IdType, FloatType, decltype(one), FloatType*, FloatType*>{ + one, A, A, idx_coo, rows, cs, indptr.get(), subindptr, indices, + data, is_pinned}); + const auto pred = + StencilOpFused{ + random_seed, idx_coo, cs, one, A, + subindptr, indptr.get(), indices, nids, is_pinned}; + num_edges = thrust::copy_if( + exec_policy, iota, iota + hop_size, iota, + transformed_output, pred) - + transformed_output; + } else { + auto output = thrust::make_zip_iterator( + picked_row_data, picked_col_data, picked_idx_data); + auto transformed_output = thrust::make_transform_output_iterator( + output, TransformOp{ + idx_coo, rows, indptr.get(), subindptr, indices, data, + is_pinned}); + const auto pred = + StencilOpFused{ + random_seed, idx_coo, cs, one, one, + subindptr, indptr.get(), indices, nids, is_pinned}; + num_edges = thrust::copy_if( + exec_policy, iota, iota + hop_size, iota, + transformed_output, pred) - + transformed_output; + } + } + } + + // Normalize edge weights here + if (importance_sampling || weighted) { + thrust::constant_iterator one(1); + // contains degree information + auto ds = allocator.alloc_unique(num_rows); + // contains sum of edge weights + auto ws = allocator.alloc_unique(num_rows); + // contains degree information only for vertices with nonzero degree + auto ds_2 = allocator.alloc_unique(num_rows); + // contains sum of edge weights only for vertices with nonzero degree + auto ws_2 = allocator.alloc_unique(num_rows); + auto output_ = thrust::make_zip_iterator(ds.get(), ws.get()); + // contains row ids only for vertices with nonzero degree + auto keys = allocator.alloc_unique(num_rows); + auto input = thrust::make_zip_iterator(one, picked_imp_data); + auto new_end = thrust::reduce_by_key( + exec_policy, picked_inrow.get(), picked_inrow.get() + num_edges, input, + keys.get(), output_, thrust::equal_to{}, TupleSum{}); + { + thrust::constant_iterator zero_int(0); + thrust::constant_iterator zero_float(0); + auto input 
= thrust::make_zip_iterator(zero_int, zero_float); + auto output = thrust::make_zip_iterator(ds_2.get(), ws_2.get()); + thrust::copy(exec_policy, input, input + num_rows, output); + { + const auto num_rows_2 = new_end.first - keys.get(); + thrust::scatter( + exec_policy, output_, output_ + num_rows_2, keys.get(), output); + } + } + { + auto input = + thrust::make_zip_iterator(picked_inrow.get(), picked_imp_data); + auto transformed_input = thrust::make_transform_iterator( + input, thrust::make_zip_function(TransformOpMean{ + ds_2.get(), ws_2.get()})); + thrust::copy( + exec_policy, transformed_input, transformed_input + num_edges, + picked_imp_data); + } + } + + picked_row = picked_row.CreateView({num_edges}, picked_row->dtype); + picked_col = picked_col.CreateView({num_edges}, picked_col->dtype); + picked_idx = picked_idx.CreateView({num_edges}, picked_idx->dtype); + if (importance_sampling || weighted) + picked_imp = picked_imp.CreateView({num_edges}, picked_imp->dtype); + + return std::make_pair( + COOMatrix(mat.num_rows, mat.num_cols, picked_row, picked_col, picked_idx), + picked_imp); +} + +template std::pair +CSRLaborSampling( + CSRMatrix, IdArray, int64_t, FloatArray, int, IdArray, float, IdArray); +template std::pair +CSRLaborSampling( + CSRMatrix, IdArray, int64_t, FloatArray, int, IdArray, float, IdArray); +template std::pair +CSRLaborSampling( + CSRMatrix, IdArray, int64_t, FloatArray, int, IdArray, float, IdArray); +template std::pair +CSRLaborSampling( + CSRMatrix, IdArray, int64_t, FloatArray, int, IdArray, float, IdArray); + +} // namespace impl +} // namespace aten +} // namespace dgl diff --git a/src/array/cuda/macro.cuh b/src/array/cuda/macro.cuh index ad24a9445273..ea7fb90ac1a0 100644 --- a/src/array/cuda/macro.cuh +++ b/src/array/cuda/macro.cuh @@ -30,14 +30,14 @@ const auto device = runtime::DeviceAPI::Get(ctx); \ (LHS_OFF) = static_cast(device->AllocWorkspace( \ ctx, sizeof(int64_t) * info.lhs_offset.size())); \ - CUDA_CALL(cudaMemcpy( \ + CUDA_CALL(hipMemcpy( \ (LHS_OFF), &info.lhs_offset[0], \ - sizeof(int64_t) * info.lhs_offset.size(), cudaMemcpyHostToDevice)); \ + sizeof(int64_t) * info.lhs_offset.size(), hipMemcpyHostToDevice)); \ (RHS_OFF) = static_cast(device->AllocWorkspace( \ ctx, sizeof(int64_t) * info.rhs_offset.size())); \ - CUDA_CALL(cudaMemcpy( \ + CUDA_CALL(hipMemcpy( \ (RHS_OFF), &info.rhs_offset[0], \ - sizeof(int64_t) * info.rhs_offset.size(), cudaMemcpyHostToDevice)); \ + sizeof(int64_t) * info.rhs_offset.size(), hipMemcpyHostToDevice)); \ if ((EDGE_MAP)) { \ constexpr bool UseIdx = true; \ { __VA_ARGS__ } \ diff --git a/src/array/cuda/macro.cuh.prehip b/src/array/cuda/macro.cuh.prehip new file mode 100644 index 000000000000..ad24a9445273 --- /dev/null +++ b/src/array/cuda/macro.cuh.prehip @@ -0,0 +1,53 @@ +/** + * Copyright (c) 2020 by Contributors + * @file array/cuda/macro.cuh + * @brief Macro to call SPMM/SDDMM cuda kernels. + */ +#ifndef DGL_ARRAY_CUDA_MACRO_CUH_ +#define DGL_ARRAY_CUDA_MACRO_CUH_ + +///////////////////////// Dispatchers ////////////////////////// + +/* Macro used for switching between broadcasting and non-broadcasting kernels. + * It also copies the auxiliary information for calculating broadcasting offsets + * to GPU. + */ +#define BCAST_IDX_CTX_SWITCH(BCAST, EDGE_MAP, CTX, LHS_OFF, RHS_OFF, ...) 
\ + do { \ + const BcastOff &info = (BCAST); \ + if (!info.use_bcast) { \ + constexpr bool UseBcast = false; \ + if ((EDGE_MAP)) { \ + constexpr bool UseIdx = true; \ + { __VA_ARGS__ } \ + } else { \ + constexpr bool UseIdx = false; \ + { __VA_ARGS__ } \ + } \ + } else { \ + constexpr bool UseBcast = true; \ + const DGLContext ctx = (CTX); \ + const auto device = runtime::DeviceAPI::Get(ctx); \ + (LHS_OFF) = static_cast(device->AllocWorkspace( \ + ctx, sizeof(int64_t) * info.lhs_offset.size())); \ + CUDA_CALL(cudaMemcpy( \ + (LHS_OFF), &info.lhs_offset[0], \ + sizeof(int64_t) * info.lhs_offset.size(), cudaMemcpyHostToDevice)); \ + (RHS_OFF) = static_cast(device->AllocWorkspace( \ + ctx, sizeof(int64_t) * info.rhs_offset.size())); \ + CUDA_CALL(cudaMemcpy( \ + (RHS_OFF), &info.rhs_offset[0], \ + sizeof(int64_t) * info.rhs_offset.size(), cudaMemcpyHostToDevice)); \ + if ((EDGE_MAP)) { \ + constexpr bool UseIdx = true; \ + { __VA_ARGS__ } \ + } else { \ + constexpr bool UseIdx = false; \ + { __VA_ARGS__ } \ + } \ + device->FreeWorkspace(ctx, (LHS_OFF)); \ + device->FreeWorkspace(ctx, (RHS_OFF)); \ + } \ + } while (0) + +#endif // DGL_ARRAY_CUDA_MACRO_CUH_ diff --git a/src/array/cuda/negative_sampling.cu b/src/array/cuda/negative_sampling.cu index 2cf2623306b1..041982e040f2 100644 --- a/src/array/cuda/negative_sampling.cu +++ b/src/array/cuda/negative_sampling.cu @@ -1,15 +1,16 @@ +#include "hip/hip_runtime.h" /** * Copyright (c) 2021 by Contributors * @file array/cuda/negative_sampling.cu * @brief rowwise sampling */ -#include +#include #include #include #include -#include +#include #include "../../runtime/cuda/cuda_common.h" #include "./utils.h" @@ -31,9 +32,9 @@ __global__ void _GlobalUniformNegativeSamplingKernel( int64_t tx = blockIdx.x * blockDim.x + threadIdx.x; const int stride_x = gridDim.x * blockDim.x; - curandStatePhilox4_32_10_t + hiprandStatePhilox4_32_10_t rng; // this allows generating 4 32-bit ints at a time - curand_init(random_seed * gridDim.x + blockIdx.x, threadIdx.x, 0, &rng); + hiprand_init(random_seed * gridDim.x + blockIdx.x, threadIdx.x, 0, &rng); while (tx < num_samples) { for (int i = 0; i < num_trials; ++i) { @@ -88,7 +89,7 @@ struct IsNotMinusOne { template void SortOrderedPairs( runtime::DeviceAPI* device, DGLContext ctx, IdType* major, IdType* minor, - IdType* tmp_major, IdType* tmp_minor, int64_t n, cudaStream_t stream) { + IdType* tmp_major, IdType* tmp_minor, int64_t n, hipStream_t stream) { // Sort ordered pairs in lexicographical order by two radix sorts since // cub's radix sorts are stable. // We need a 2*n auxiliary storage to store the results form the first radix @@ -98,21 +99,21 @@ void SortOrderedPairs( void* tmp2 = nullptr; // Radix sort by minor key first, reorder the major key in the progress. - CUDA_CALL(cub::DeviceRadixSort::SortPairs( + CUDA_CALL(hipcub::DeviceRadixSort::SortPairs( tmp1, s1, minor, tmp_minor, major, tmp_major, n, 0, sizeof(IdType) * 8, stream)); tmp1 = device->AllocWorkspace(ctx, s1); - CUDA_CALL(cub::DeviceRadixSort::SortPairs( + CUDA_CALL(hipcub::DeviceRadixSort::SortPairs( tmp1, s1, minor, tmp_minor, major, tmp_major, n, 0, sizeof(IdType) * 8, stream)); // Radix sort by major key next. - CUDA_CALL(cub::DeviceRadixSort::SortPairs( + CUDA_CALL(hipcub::DeviceRadixSort::SortPairs( tmp2, s2, tmp_major, major, tmp_minor, minor, n, 0, sizeof(IdType) * 8, stream)); tmp2 = (s2 > s1) ? 
device->AllocWorkspace(ctx, s2) : tmp1; // reuse buffer if s2 <= s1 - CUDA_CALL(cub::DeviceRadixSort::SortPairs( + CUDA_CALL(hipcub::DeviceRadixSort::SortPairs( tmp2, s2, tmp_major, major, tmp_minor, minor, n, 0, sizeof(IdType) * 8, stream)); @@ -141,7 +142,7 @@ std::pair CSRGlobalUniformNegativeSampling( IdType* out_row_data = out_row.Ptr(); IdType* out_col_data = out_col.Ptr(); auto device = runtime::DeviceAPI::Get(ctx); - cudaStream_t stream = runtime::getCurrentCUDAStream(); + hipStream_t stream = runtime::getCurrentCUDAStream(); const int nt = cuda::FindNumThreads(num_actual_samples); const int nb = (num_actual_samples + nt - 1) / nt; std::pair result; @@ -159,11 +160,11 @@ std::pair CSRGlobalUniformNegativeSampling( IsNotMinusOne op; PairIterator begin(row_data, col_data); PairIterator out_begin(out_row_data, out_col_data); - CUDA_CALL(cub::DeviceSelect::If( + CUDA_CALL(hipcub::DeviceSelect::If( nullptr, tmp_size, begin, out_begin, num_out_cuda, num_actual_samples, op, stream)); void* tmp = device->AllocWorkspace(ctx, tmp_size); - CUDA_CALL(cub::DeviceSelect::If( + CUDA_CALL(hipcub::DeviceSelect::If( tmp, tmp_size, begin, out_begin, num_out_cuda, num_actual_samples, op, stream)); num_out = cuda::GetCUDAScalar(device, ctx, num_out_cuda); @@ -181,13 +182,13 @@ std::pair CSRGlobalUniformNegativeSampling( size_t tmp_size_unique = 0; void* tmp_unique = nullptr; - CUDA_CALL(cub::DeviceSelect::Unique( + CUDA_CALL(hipcub::DeviceSelect::Unique( nullptr, tmp_size_unique, out_begin, unique_begin, num_out_cuda, num_out, stream)); tmp_unique = (tmp_size_unique > tmp_size) ? device->AllocWorkspace(ctx, tmp_size_unique) : tmp; // reuse buffer - CUDA_CALL(cub::DeviceSelect::Unique( + CUDA_CALL(hipcub::DeviceSelect::Unique( tmp_unique, tmp_size_unique, out_begin, unique_begin, num_out_cuda, num_out, stream)); num_out = cuda::GetCUDAScalar(device, ctx, num_out_cuda); diff --git a/src/array/cuda/negative_sampling.cu.prehip b/src/array/cuda/negative_sampling.cu.prehip new file mode 100644 index 000000000000..2cf2623306b1 --- /dev/null +++ b/src/array/cuda/negative_sampling.cu.prehip @@ -0,0 +1,220 @@ +/** + * Copyright (c) 2021 by Contributors + * @file array/cuda/negative_sampling.cu + * @brief rowwise sampling + */ + +#include +#include +#include +#include + +#include + +#include "../../runtime/cuda/cuda_common.h" +#include "./utils.h" + +using namespace dgl::runtime; + +namespace dgl { +namespace aten { +namespace impl { + +namespace { + +template +__global__ void _GlobalUniformNegativeSamplingKernel( + const IdType* __restrict__ indptr, const IdType* __restrict__ indices, + IdType* __restrict__ row, IdType* __restrict__ col, int64_t num_row, + int64_t num_col, int64_t num_samples, int num_trials, + bool exclude_self_loops, int32_t random_seed) { + int64_t tx = blockIdx.x * blockDim.x + threadIdx.x; + const int stride_x = gridDim.x * blockDim.x; + + curandStatePhilox4_32_10_t + rng; // this allows generating 4 32-bit ints at a time + curand_init(random_seed * gridDim.x + blockIdx.x, threadIdx.x, 0, &rng); + + while (tx < num_samples) { + for (int i = 0; i < num_trials; ++i) { + uint4 result = curand4(&rng); + // Turns out that result.x is always 0 with the above RNG. 
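+      // The remaining 96 random bits are combined into two 48-bit values: u in + // [0, num_row) from the low 16 bits of y together with z, and v in + // [0, num_col) from the high 16 bits of y together with w. The candidate + // pair (u, v) is emitted only if it passes the optional self-loop check and + // the binary search below does not find v among indices[indptr[u]..indptr[u+1]).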
+ uint64_t y_hi = result.y >> 16; + uint64_t y_lo = result.y & 0xFFFF; + uint64_t z = static_cast(result.z); + uint64_t w = static_cast(result.w); + int64_t u = static_cast(((y_lo << 32L) | z) % num_row); + int64_t v = static_cast(((y_hi << 32L) | w) % num_col); + + if (exclude_self_loops && (u == v)) continue; + + // binary search of v among indptr[u:u+1] + int64_t b = indptr[u], e = indptr[u + 1] - 1; + bool found = false; + while (b <= e) { + int64_t m = (b + e) / 2; + if (indices[m] == v) { + found = true; + break; + } else if (indices[m] < v) { + b = m + 1; + } else { + e = m - 1; + } + } + + if (!found) { + row[tx] = u; + col[tx] = v; + break; + } + } + + tx += stride_x; + } +} + +template +struct IsNotMinusOne { + __device__ __forceinline__ bool operator()(const std::pair& a) { + return a.first != -1; + } +}; + +/** + * @brief Sort ordered pairs in ascending order, using \a tmp_major and \a + * tmp_minor as temporary buffers, each with \a n elements. + */ +template +void SortOrderedPairs( + runtime::DeviceAPI* device, DGLContext ctx, IdType* major, IdType* minor, + IdType* tmp_major, IdType* tmp_minor, int64_t n, cudaStream_t stream) { + // Sort ordered pairs in lexicographical order by two radix sorts since + // cub's radix sorts are stable. + // We need a 2*n auxiliary storage to store the results form the first radix + // sort. + size_t s1 = 0, s2 = 0; + void* tmp1 = nullptr; + void* tmp2 = nullptr; + + // Radix sort by minor key first, reorder the major key in the progress. + CUDA_CALL(cub::DeviceRadixSort::SortPairs( + tmp1, s1, minor, tmp_minor, major, tmp_major, n, 0, sizeof(IdType) * 8, + stream)); + tmp1 = device->AllocWorkspace(ctx, s1); + CUDA_CALL(cub::DeviceRadixSort::SortPairs( + tmp1, s1, minor, tmp_minor, major, tmp_major, n, 0, sizeof(IdType) * 8, + stream)); + + // Radix sort by major key next. + CUDA_CALL(cub::DeviceRadixSort::SortPairs( + tmp2, s2, tmp_major, major, tmp_minor, minor, n, 0, sizeof(IdType) * 8, + stream)); + tmp2 = (s2 > s1) ? 
device->AllocWorkspace(ctx, s2) + : tmp1; // reuse buffer if s2 <= s1 + CUDA_CALL(cub::DeviceRadixSort::SortPairs( + tmp2, s2, tmp_major, major, tmp_minor, minor, n, 0, sizeof(IdType) * 8, + stream)); + + if (tmp1 != tmp2) device->FreeWorkspace(ctx, tmp2); + device->FreeWorkspace(ctx, tmp1); +} + +}; // namespace + +template +std::pair CSRGlobalUniformNegativeSampling( + const CSRMatrix& csr, int64_t num_samples, int num_trials, + bool exclude_self_loops, bool replace, double redundancy) { + auto ctx = csr.indptr->ctx; + auto dtype = csr.indptr->dtype; + const int64_t num_row = csr.num_rows; + const int64_t num_col = csr.num_cols; + const int64_t num_actual_samples = + static_cast(num_samples * (1 + redundancy)); + IdArray row = Full(-1, num_actual_samples, ctx); + IdArray col = Full(-1, num_actual_samples, ctx); + IdArray out_row = IdArray::Empty({num_actual_samples}, dtype, ctx); + IdArray out_col = IdArray::Empty({num_actual_samples}, dtype, ctx); + IdType* row_data = row.Ptr(); + IdType* col_data = col.Ptr(); + IdType* out_row_data = out_row.Ptr(); + IdType* out_col_data = out_col.Ptr(); + auto device = runtime::DeviceAPI::Get(ctx); + cudaStream_t stream = runtime::getCurrentCUDAStream(); + const int nt = cuda::FindNumThreads(num_actual_samples); + const int nb = (num_actual_samples + nt - 1) / nt; + std::pair result; + int64_t num_out; + + CUDA_KERNEL_CALL( + _GlobalUniformNegativeSamplingKernel, nb, nt, 0, stream, + csr.indptr.Ptr(), csr.indices.Ptr(), row_data, col_data, + num_row, num_col, num_actual_samples, num_trials, exclude_self_loops, + RandomEngine::ThreadLocal()->RandInt32()); + + size_t tmp_size = 0; + int64_t* num_out_cuda = + static_cast(device->AllocWorkspace(ctx, sizeof(int64_t))); + IsNotMinusOne op; + PairIterator begin(row_data, col_data); + PairIterator out_begin(out_row_data, out_col_data); + CUDA_CALL(cub::DeviceSelect::If( + nullptr, tmp_size, begin, out_begin, num_out_cuda, num_actual_samples, op, + stream)); + void* tmp = device->AllocWorkspace(ctx, tmp_size); + CUDA_CALL(cub::DeviceSelect::If( + tmp, tmp_size, begin, out_begin, num_out_cuda, num_actual_samples, op, + stream)); + num_out = cuda::GetCUDAScalar(device, ctx, num_out_cuda); + + if (!replace) { + IdArray unique_row = IdArray::Empty({num_out}, dtype, ctx); + IdArray unique_col = IdArray::Empty({num_out}, dtype, ctx); + IdType* unique_row_data = unique_row.Ptr(); + IdType* unique_col_data = unique_col.Ptr(); + PairIterator unique_begin(unique_row_data, unique_col_data); + + SortOrderedPairs( + device, ctx, out_row_data, out_col_data, unique_row_data, + unique_col_data, num_out, stream); + + size_t tmp_size_unique = 0; + void* tmp_unique = nullptr; + CUDA_CALL(cub::DeviceSelect::Unique( + nullptr, tmp_size_unique, out_begin, unique_begin, num_out_cuda, + num_out, stream)); + tmp_unique = (tmp_size_unique > tmp_size) + ? 
device->AllocWorkspace(ctx, tmp_size_unique) + : tmp; // reuse buffer + CUDA_CALL(cub::DeviceSelect::Unique( + tmp_unique, tmp_size_unique, out_begin, unique_begin, num_out_cuda, + num_out, stream)); + num_out = cuda::GetCUDAScalar(device, ctx, num_out_cuda); + + num_out = std::min(num_samples, num_out); + result = { + unique_row.CreateView({num_out}, dtype), + unique_col.CreateView({num_out}, dtype)}; + + if (tmp_unique != tmp) device->FreeWorkspace(ctx, tmp_unique); + } else { + num_out = std::min(num_samples, num_out); + result = { + out_row.CreateView({num_out}, dtype), + out_col.CreateView({num_out}, dtype)}; + } + + device->FreeWorkspace(ctx, tmp); + device->FreeWorkspace(ctx, num_out_cuda); + return result; +} + +template std::pair CSRGlobalUniformNegativeSampling< + kDGLCUDA, int32_t>(const CSRMatrix&, int64_t, int, bool, bool, double); +template std::pair CSRGlobalUniformNegativeSampling< + kDGLCUDA, int64_t>(const CSRMatrix&, int64_t, int, bool, bool, double); + +}; // namespace impl +}; // namespace aten +}; // namespace dgl diff --git a/src/array/cuda/rowwise_sampling.cu b/src/array/cuda/rowwise_sampling.cu index 38f08e7b277d..aaa777a179cf 100644 --- a/src/array/cuda/rowwise_sampling.cu +++ b/src/array/cuda/rowwise_sampling.cu @@ -1,15 +1,16 @@ +#include "hip/hip_runtime.h" /** * Copyright (c) 2021 by Contributors * @file array/cuda/rowwise_sampling.cu * @brief uniform rowwise sampling */ -#include +#include #include #include #include -#include +#include #include #include "../../array/cuda/atomic.cuh" @@ -126,8 +127,8 @@ __global__ void _CSRRowWiseSampleUniformKernel( const int64_t last_row = min(static_cast(blockIdx.x + 1) * TILE_SIZE, num_rows); - curandStatePhilox4_32_10_t rng; - curand_init(rand_seed * gridDim.x + blockIdx.x, threadIdx.x, 0, &rng); + hiprandStatePhilox4_32_10_t rng; + hiprand_init(rand_seed * gridDim.x + blockIdx.x, threadIdx.x, 0, &rng); while (out_row < last_row) { const int64_t row = in_rows[out_row]; @@ -151,7 +152,7 @@ __global__ void _CSRRowWiseSampleUniformKernel( __syncthreads(); for (int idx = num_picks + threadIdx.x; idx < deg; idx += BLOCK_SIZE) { - const int num = curand(&rng) % (idx + 1); + const int num = hiprand(&rng) % (idx + 1); if (num < num_picks) { // use max so as to achieve the replacement order the serial // algorithm would have @@ -204,8 +205,8 @@ __global__ void _CSRRowWiseSampleUniformReplaceKernel( const int64_t last_row = min(static_cast(blockIdx.x + 1) * TILE_SIZE, num_rows); - curandStatePhilox4_32_10_t rng; - curand_init(rand_seed * gridDim.x + blockIdx.x, threadIdx.x, 0, &rng); + hiprandStatePhilox4_32_10_t rng; + hiprand_init(rand_seed * gridDim.x + blockIdx.x, threadIdx.x, 0, &rng); while (out_row < last_row) { const int64_t row = in_rows[out_row]; @@ -216,7 +217,7 @@ __global__ void _CSRRowWiseSampleUniformReplaceKernel( if (deg > 0) { // each thread then blindly copies in rows only if deg > 0. 
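// The without-replacement kernel above parallelizes classic reservoir sampling
// (Algorithm R): seed the reservoir with the first num_picks neighbors, then
// let the idx-th neighbor replace a random slot with probability
// num_picks / (idx + 1); AtomicMax reproduces the replacement order the serial
// algorithm would give.  A sequential host sketch of the same idea, assuming
// std::mt19937 in place of Philox (illustration only):
//
#include <algorithm>
#include <cstdint>
#include <random>
#include <vector>

// Uniformly pick num_picks positions out of 0..deg-1 without replacement.
static std::vector<int64_t> ReservoirSampleHost(
    int64_t deg, int64_t num_picks, uint64_t seed) {
  std::mt19937 rng(seed);
  std::vector<int64_t> picks(static_cast<size_t>(std::min(deg, num_picks)));
  for (int64_t i = 0; i < static_cast<int64_t>(picks.size()); ++i)
    picks[i] = i;  // fill the reservoir with the first positions
  for (int64_t i = num_picks; i < deg; ++i) {
    std::uniform_int_distribution<int64_t> dist(0, i);
    const int64_t j = dist(rng);
    if (j < num_picks) picks[j] = i;  // replace a random reservoir slot
  }
  return picks;
}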
for (int idx = threadIdx.x; idx < num_picks; idx += BLOCK_SIZE) { - const int64_t edge = curand(&rng) % deg; + const int64_t edge = hiprand(&rng) % deg; const int64_t out_idx = out_row_start + idx; out_rows[out_idx] = row; out_cols[out_idx] = in_index[in_row_start + edge]; @@ -237,7 +238,7 @@ COOMatrix _CSRRowWiseSamplingUniform( CSRMatrix mat, IdArray rows, const int64_t num_picks, const bool replace) { const auto& ctx = rows->ctx; auto device = runtime::DeviceAPI::Get(ctx); - cudaStream_t stream = runtime::getCurrentCUDAStream(); + hipStream_t stream = runtime::getCurrentCUDAStream(); const int64_t num_rows = rows->shape[0]; const IdType* const slice_rows = static_cast(rows->data); @@ -279,16 +280,16 @@ COOMatrix _CSRRowWiseSamplingUniform( IdType* out_ptr = static_cast( device->AllocWorkspace(ctx, (num_rows + 1) * sizeof(IdType))); size_t prefix_temp_size = 0; - CUDA_CALL(cub::DeviceScan::ExclusiveSum( + CUDA_CALL(hipcub::DeviceScan::ExclusiveSum( nullptr, prefix_temp_size, out_deg, out_ptr, num_rows + 1, stream)); void* prefix_temp = device->AllocWorkspace(ctx, prefix_temp_size); - CUDA_CALL(cub::DeviceScan::ExclusiveSum( + CUDA_CALL(hipcub::DeviceScan::ExclusiveSum( prefix_temp, prefix_temp_size, out_deg, out_ptr, num_rows + 1, stream)); device->FreeWorkspace(ctx, prefix_temp); device->FreeWorkspace(ctx, out_deg); - cudaEvent_t copyEvent; - CUDA_CALL(cudaEventCreate(©Event)); + hipEvent_t copyEvent; + CUDA_CALL(hipEventCreate(©Event)); NDArray new_len_tensor; if (TensorDispatcher::Global()->IsAvailable()) { @@ -301,10 +302,10 @@ COOMatrix _CSRRowWiseSamplingUniform( } // copy using the internal current stream - CUDA_CALL(cudaMemcpyAsync( + CUDA_CALL(hipMemcpyAsync( new_len_tensor->data, out_ptr + num_rows, sizeof(IdType), - cudaMemcpyDeviceToHost, stream)); - CUDA_CALL(cudaEventRecord(copyEvent, stream)); + hipMemcpyDeviceToHost, stream)); + CUDA_CALL(hipEventRecord(copyEvent, stream)); const uint64_t random_seed = RandomEngine::ThreadLocal()->RandInt(1000000000); @@ -329,8 +330,8 @@ COOMatrix _CSRRowWiseSamplingUniform( device->FreeWorkspace(ctx, out_ptr); // wait for copying `new_len` to finish - CUDA_CALL(cudaEventSynchronize(copyEvent)); - CUDA_CALL(cudaEventDestroy(copyEvent)); + CUDA_CALL(hipEventSynchronize(copyEvent)); + CUDA_CALL(hipEventDestroy(copyEvent)); const IdType new_len = static_cast(new_len_tensor->data)[0]; picked_row = picked_row.CreateView({new_len}, picked_row->dtype); diff --git a/src/array/cuda/rowwise_sampling.cu.prehip b/src/array/cuda/rowwise_sampling.cu.prehip new file mode 100644 index 000000000000..38f08e7b277d --- /dev/null +++ b/src/array/cuda/rowwise_sampling.cu.prehip @@ -0,0 +1,366 @@ +/** + * Copyright (c) 2021 by Contributors + * @file array/cuda/rowwise_sampling.cu + * @brief uniform rowwise sampling + */ + +#include +#include +#include +#include + +#include +#include + +#include "../../array/cuda/atomic.cuh" +#include "../../runtime/cuda/cuda_common.h" +#include "./utils.h" + +using namespace dgl::cuda; +using namespace dgl::aten::cuda; +using TensorDispatcher = dgl::runtime::TensorDispatcher; + +namespace dgl { +namespace aten { +namespace impl { + +namespace { + +constexpr int BLOCK_SIZE = 128; + +/** + * @brief Compute the size of each row in the sampled CSR, without replacement. + * + * @tparam IdType The type of node and edge indexes. + * @param num_picks The number of non-zero entries to pick per row. + * @param num_rows The number of rows to pick. + * @param in_rows The set of rows to pick. 
+ * @param in_ptr The index where each row's edges start. + * @param out_deg The size of each row in the sampled matrix, as indexed by + * `in_rows` (output). + */ +template +__global__ void _CSRRowWiseSampleDegreeKernel( + const int64_t num_picks, const int64_t num_rows, + const IdType* const in_rows, const IdType* const in_ptr, + IdType* const out_deg) { + const int tIdx = threadIdx.x + blockIdx.x * blockDim.x; + + if (tIdx < num_rows) { + const int in_row = in_rows[tIdx]; + const int out_row = tIdx; + out_deg[out_row] = min( + static_cast(num_picks), in_ptr[in_row + 1] - in_ptr[in_row]); + + if (out_row == num_rows - 1) { + // make the prefixsum work + out_deg[num_rows] = 0; + } + } +} + +/** + * @brief Compute the size of each row in the sampled CSR, with replacement. + * + * @tparam IdType The type of node and edge indexes. + * @param num_picks The number of non-zero entries to pick per row. + * @param num_rows The number of rows to pick. + * @param in_rows The set of rows to pick. + * @param in_ptr The index where each row's edges start. + * @param out_deg The size of each row in the sampled matrix, as indexed by + * `in_rows` (output). + */ +template +__global__ void _CSRRowWiseSampleDegreeReplaceKernel( + const int64_t num_picks, const int64_t num_rows, + const IdType* const in_rows, const IdType* const in_ptr, + IdType* const out_deg) { + const int tIdx = threadIdx.x + blockIdx.x * blockDim.x; + + if (tIdx < num_rows) { + const int64_t in_row = in_rows[tIdx]; + const int64_t out_row = tIdx; + + if (in_ptr[in_row + 1] - in_ptr[in_row] == 0) { + out_deg[out_row] = 0; + } else { + out_deg[out_row] = static_cast(num_picks); + } + + if (out_row == num_rows - 1) { + // make the prefixsum work + out_deg[num_rows] = 0; + } + } +} + +/** + * @brief Perform row-wise uniform sampling on a CSR matrix, + * and generate a COO matrix, without replacement. + * + * @tparam IdType The ID type used for matrices. + * @tparam TILE_SIZE The number of rows covered by each threadblock. + * @param rand_seed The random seed to use. + * @param num_picks The number of non-zeros to pick per row. + * @param num_rows The number of rows to pick. + * @param in_rows The set of rows to pick. + * @param in_ptr The indptr array of the input CSR. + * @param in_index The indices array of the input CSR. + * @param data The data array of the input CSR. + * @param out_ptr The offset to write each row to in the output COO. + * @param out_rows The rows of the output COO (output). + * @param out_cols The columns of the output COO (output). + * @param out_idxs The data array of the output COO (output). 
+ */ +template +__global__ void _CSRRowWiseSampleUniformKernel( + const uint64_t rand_seed, const int64_t num_picks, const int64_t num_rows, + const IdType* const in_rows, const IdType* const in_ptr, + const IdType* const in_index, const IdType* const data, + const IdType* const out_ptr, IdType* const out_rows, IdType* const out_cols, + IdType* const out_idxs) { + // we assign one warp per row + assert(blockDim.x == BLOCK_SIZE); + + int64_t out_row = blockIdx.x * TILE_SIZE; + const int64_t last_row = + min(static_cast(blockIdx.x + 1) * TILE_SIZE, num_rows); + + curandStatePhilox4_32_10_t rng; + curand_init(rand_seed * gridDim.x + blockIdx.x, threadIdx.x, 0, &rng); + + while (out_row < last_row) { + const int64_t row = in_rows[out_row]; + const int64_t in_row_start = in_ptr[row]; + const int64_t deg = in_ptr[row + 1] - in_row_start; + const int64_t out_row_start = out_ptr[out_row]; + + if (deg <= num_picks) { + // just copy row when there is not enough nodes to sample. + for (int idx = threadIdx.x; idx < deg; idx += BLOCK_SIZE) { + const IdType in_idx = in_row_start + idx; + out_rows[out_row_start + idx] = row; + out_cols[out_row_start + idx] = in_index[in_idx]; + out_idxs[out_row_start + idx] = data ? data[in_idx] : in_idx; + } + } else { + // generate permutation list via reservoir algorithm + for (int idx = threadIdx.x; idx < num_picks; idx += BLOCK_SIZE) { + out_idxs[out_row_start + idx] = idx; + } + __syncthreads(); + + for (int idx = num_picks + threadIdx.x; idx < deg; idx += BLOCK_SIZE) { + const int num = curand(&rng) % (idx + 1); + if (num < num_picks) { + // use max so as to achieve the replacement order the serial + // algorithm would have + AtomicMax(out_idxs + out_row_start + num, idx); + } + } + __syncthreads(); + + // copy permutation over + for (int idx = threadIdx.x; idx < num_picks; idx += BLOCK_SIZE) { + const IdType perm_idx = out_idxs[out_row_start + idx] + in_row_start; + out_rows[out_row_start + idx] = row; + out_cols[out_row_start + idx] = in_index[perm_idx]; + out_idxs[out_row_start + idx] = data ? data[perm_idx] : perm_idx; + } + } + out_row += 1; + } +} + +/** + * @brief Perform row-wise uniform sampling on a CSR matrix, + * and generate a COO matrix, with replacement. + * + * @tparam IdType The ID type used for matrices. + * @tparam TILE_SIZE The number of rows covered by each threadblock. + * @param rand_seed The random seed to use. + * @param num_picks The number of non-zeros to pick per row. + * @param num_rows The number of rows to pick. + * @param in_rows The set of rows to pick. + * @param in_ptr The indptr array of the input CSR. + * @param in_index The indices array of the input CSR. + * @param data The data array of the input CSR. + * @param out_ptr The offset to write each row to in the output COO. + * @param out_rows The rows of the output COO (output). + * @param out_cols The columns of the output COO (output). + * @param out_idxs The data array of the output COO (output). 
+ */ +template +__global__ void _CSRRowWiseSampleUniformReplaceKernel( + const uint64_t rand_seed, const int64_t num_picks, const int64_t num_rows, + const IdType* const in_rows, const IdType* const in_ptr, + const IdType* const in_index, const IdType* const data, + const IdType* const out_ptr, IdType* const out_rows, IdType* const out_cols, + IdType* const out_idxs) { + // we assign one warp per row + assert(blockDim.x == BLOCK_SIZE); + + int64_t out_row = blockIdx.x * TILE_SIZE; + const int64_t last_row = + min(static_cast(blockIdx.x + 1) * TILE_SIZE, num_rows); + + curandStatePhilox4_32_10_t rng; + curand_init(rand_seed * gridDim.x + blockIdx.x, threadIdx.x, 0, &rng); + + while (out_row < last_row) { + const int64_t row = in_rows[out_row]; + const int64_t in_row_start = in_ptr[row]; + const int64_t out_row_start = out_ptr[out_row]; + const int64_t deg = in_ptr[row + 1] - in_row_start; + + if (deg > 0) { + // each thread then blindly copies in rows only if deg > 0. + for (int idx = threadIdx.x; idx < num_picks; idx += BLOCK_SIZE) { + const int64_t edge = curand(&rng) % deg; + const int64_t out_idx = out_row_start + idx; + out_rows[out_idx] = row; + out_cols[out_idx] = in_index[in_row_start + edge]; + out_idxs[out_idx] = + data ? data[in_row_start + edge] : in_row_start + edge; + } + } + out_row += 1; + } +} + +} // namespace + +///////////////////////////// CSR sampling ////////////////////////// + +template +COOMatrix _CSRRowWiseSamplingUniform( + CSRMatrix mat, IdArray rows, const int64_t num_picks, const bool replace) { + const auto& ctx = rows->ctx; + auto device = runtime::DeviceAPI::Get(ctx); + cudaStream_t stream = runtime::getCurrentCUDAStream(); + + const int64_t num_rows = rows->shape[0]; + const IdType* const slice_rows = static_cast(rows->data); + + IdArray picked_row = + NewIdArray(num_rows * num_picks, ctx, sizeof(IdType) * 8); + IdArray picked_col = + NewIdArray(num_rows * num_picks, ctx, sizeof(IdType) * 8); + IdArray picked_idx = + NewIdArray(num_rows * num_picks, ctx, sizeof(IdType) * 8); + IdType* const out_rows = static_cast(picked_row->data); + IdType* const out_cols = static_cast(picked_col->data); + IdType* const out_idxs = static_cast(picked_idx->data); + + const IdType* in_ptr = static_cast(GetDevicePointer(mat.indptr)); + const IdType* in_cols = static_cast(GetDevicePointer(mat.indices)); + const IdType* data = CSRHasData(mat) + ? 
static_cast(GetDevicePointer(mat.data)) + : nullptr; + + // compute degree + IdType* out_deg = static_cast( + device->AllocWorkspace(ctx, (num_rows + 1) * sizeof(IdType))); + if (replace) { + const dim3 block(512); + const dim3 grid((num_rows + block.x - 1) / block.x); + CUDA_KERNEL_CALL( + _CSRRowWiseSampleDegreeReplaceKernel, grid, block, 0, stream, num_picks, + num_rows, slice_rows, in_ptr, out_deg); + } else { + const dim3 block(512); + const dim3 grid((num_rows + block.x - 1) / block.x); + CUDA_KERNEL_CALL( + _CSRRowWiseSampleDegreeKernel, grid, block, 0, stream, num_picks, + num_rows, slice_rows, in_ptr, out_deg); + } + + // fill out_ptr + IdType* out_ptr = static_cast( + device->AllocWorkspace(ctx, (num_rows + 1) * sizeof(IdType))); + size_t prefix_temp_size = 0; + CUDA_CALL(cub::DeviceScan::ExclusiveSum( + nullptr, prefix_temp_size, out_deg, out_ptr, num_rows + 1, stream)); + void* prefix_temp = device->AllocWorkspace(ctx, prefix_temp_size); + CUDA_CALL(cub::DeviceScan::ExclusiveSum( + prefix_temp, prefix_temp_size, out_deg, out_ptr, num_rows + 1, stream)); + device->FreeWorkspace(ctx, prefix_temp); + device->FreeWorkspace(ctx, out_deg); + + cudaEvent_t copyEvent; + CUDA_CALL(cudaEventCreate(©Event)); + + NDArray new_len_tensor; + if (TensorDispatcher::Global()->IsAvailable()) { + new_len_tensor = NDArray::PinnedEmpty( + {1}, DGLDataTypeTraits::dtype, DGLContext{kDGLCPU, 0}); + } else { + // use pageable memory, it will unecessarily block but be functional + new_len_tensor = NDArray::Empty( + {1}, DGLDataTypeTraits::dtype, DGLContext{kDGLCPU, 0}); + } + + // copy using the internal current stream + CUDA_CALL(cudaMemcpyAsync( + new_len_tensor->data, out_ptr + num_rows, sizeof(IdType), + cudaMemcpyDeviceToHost, stream)); + CUDA_CALL(cudaEventRecord(copyEvent, stream)); + + const uint64_t random_seed = RandomEngine::ThreadLocal()->RandInt(1000000000); + + // select edges + // the number of rows each thread block will cover + constexpr int TILE_SIZE = 128 / BLOCK_SIZE; + if (replace) { // with replacement + const dim3 block(BLOCK_SIZE); + const dim3 grid((num_rows + TILE_SIZE - 1) / TILE_SIZE); + CUDA_KERNEL_CALL( + (_CSRRowWiseSampleUniformReplaceKernel), grid, block, + 0, stream, random_seed, num_picks, num_rows, slice_rows, in_ptr, + in_cols, data, out_ptr, out_rows, out_cols, out_idxs); + } else { // without replacement + const dim3 block(BLOCK_SIZE); + const dim3 grid((num_rows + TILE_SIZE - 1) / TILE_SIZE); + CUDA_KERNEL_CALL( + (_CSRRowWiseSampleUniformKernel), grid, block, 0, + stream, random_seed, num_picks, num_rows, slice_rows, in_ptr, in_cols, + data, out_ptr, out_rows, out_cols, out_idxs); + } + device->FreeWorkspace(ctx, out_ptr); + + // wait for copying `new_len` to finish + CUDA_CALL(cudaEventSynchronize(copyEvent)); + CUDA_CALL(cudaEventDestroy(copyEvent)); + + const IdType new_len = static_cast(new_len_tensor->data)[0]; + picked_row = picked_row.CreateView({new_len}, picked_row->dtype); + picked_col = picked_col.CreateView({new_len}, picked_col->dtype); + picked_idx = picked_idx.CreateView({new_len}, picked_idx->dtype); + + return COOMatrix( + mat.num_rows, mat.num_cols, picked_row, picked_col, picked_idx); +} + +template +COOMatrix CSRRowWiseSamplingUniform( + CSRMatrix mat, IdArray rows, const int64_t num_picks, const bool replace) { + if (num_picks == -1) { + // Basically this is UnitGraph::InEdges(). 
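// When num_picks is -1 the branch below skips sampling entirely and returns
// every neighbor of the requested rows (a row slice of the CSR flattened to
// COO).  A host sketch of that degenerate case, with a hypothetical CooEdge
// struct standing in for the COOMatrix arrays (illustration only):
//
#include <cstdint>
#include <vector>

struct CooEdge { int64_t row, col, eid; };

static std::vector<CooEdge> AllInEdgesHost(
    const std::vector<int64_t>& indptr, const std::vector<int64_t>& indices,
    const std::vector<int64_t>& rows) {
  std::vector<CooEdge> out;
  for (int64_t r : rows) {
    for (int64_t e = indptr[r]; e < indptr[r + 1]; ++e)
      out.push_back({r, indices[e], e});  // no data array: eid is the position
  }
  return out;
}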
+ COOMatrix coo = CSRToCOO(CSRSliceRows(mat, rows), false); + IdArray sliced_rows = IndexSelect(rows, coo.row); + return COOMatrix( + mat.num_rows, mat.num_cols, sliced_rows, coo.col, coo.data); + } else { + return _CSRRowWiseSamplingUniform( + mat, rows, num_picks, replace); + } +} + +template COOMatrix CSRRowWiseSamplingUniform( + CSRMatrix, IdArray, int64_t, bool); +template COOMatrix CSRRowWiseSamplingUniform( + CSRMatrix, IdArray, int64_t, bool); + +} // namespace impl +} // namespace aten +} // namespace dgl diff --git a/src/array/cuda/rowwise_sampling_prob.cu b/src/array/cuda/rowwise_sampling_prob.cu index bab553ca8b56..229923cb720d 100644 --- a/src/array/cuda/rowwise_sampling_prob.cu +++ b/src/array/cuda/rowwise_sampling_prob.cu @@ -1,3 +1,4 @@ +#include "hip/hip_runtime.h" /** * Copyright (c) 2022 by Contributors * @file array/cuda/rowwise_sampling_prob.cu @@ -6,11 +7,11 @@ * sampling code rowwise_sampling.cu. * @author pengqirong (OPPO), dlasalle and Xin from Nvidia. */ -#include +#include #include #include -#include +#include #include #include "../../array/cuda/atomic.cuh" @@ -19,7 +20,7 @@ // require CUB 1.17 to use DeviceSegmentedSort static_assert( - CUB_VERSION >= 101700, "Require CUB >= 1.17 to use DeviceSegmentedSort"); + HIPCUB_VERSION >= 101700, "Require CUB >= 1.17 to use DeviceSegmentedSort"); namespace dgl { using namespace cuda; @@ -159,8 +160,8 @@ __global__ void _CSRAResValueKernel( const int64_t last_row = min(static_cast(blockIdx.x + 1) * TILE_SIZE, num_rows); - curandStatePhilox4_32_10_t rng; - curand_init(rand_seed * gridDim.x + blockIdx.x, threadIdx.x, 0, &rng); + hiprandStatePhilox4_32_10_t rng; + hiprand_init(rand_seed * gridDim.x + blockIdx.x, threadIdx.x, 0, &rng); while (out_row < last_row) { const int64_t row = in_rows[out_row]; @@ -179,7 +180,7 @@ __global__ void _CSRAResValueKernel( prob, data, idx, in_row_start, &item_prob); // compute A-Res value ares[ares_idx] = static_cast( - __powf(curand_uniform(&rng), 1.0f / item_prob)); + __powf(hiprand_uniform(&rng), 1.0f / item_prob)); ares_idxs[ares_idx] = static_cast(in_idx); } } @@ -317,8 +318,8 @@ __global__ void _CSRRowWiseSampleReplaceKernel( const int64_t last_row = min(static_cast(blockIdx.x + 1) * TILE_SIZE, num_rows); - curandStatePhilox4_32_10_t rng; - curand_init(rand_seed * gridDim.x + blockIdx.x, threadIdx.x, 0, &rng); + hiprandStatePhilox4_32_10_t rng; + hiprand_init(rand_seed * gridDim.x + blockIdx.x, threadIdx.x, 0, &rng); while (out_row < last_row) { const int64_t row = in_rows[out_row]; @@ -330,7 +331,7 @@ __global__ void _CSRRowWiseSampleReplaceKernel( if (deg > 0) { // Specialize BlockScan for a 1D block of BLOCK_SIZE threads - typedef cub::BlockScan BlockScan; + typedef hipcub::BlockScan BlockScan; // Allocate shared memory for BlockScan __shared__ typename BlockScan::TempStorage temp_storage; // Initialize running total @@ -362,10 +363,10 @@ __global__ void _CSRRowWiseSampleReplaceKernel( for (int64_t idx = threadIdx.x; idx < num_picks; idx += BLOCK_SIZE) { // get random value FloatType sum = cdf[cdf_row_start + deg - 1]; - FloatType rand = static_cast(curand_uniform(&rng) * sum); + FloatType rand = static_cast(hiprand_uniform(&rng) * sum); // get the offset of the first value within cdf array which is greater // than random value. 
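// The with-replacement path draws each pick by inverse-transform sampling:
// build the (unnormalized) inclusive prefix sum of the weights once, then
// binary-search a uniform draw scaled by the total weight; UpperBound below is
// the device-side counterpart of that search.  A host sketch assuming
// std::mt19937 in place of Philox (illustration only):
//
#include <algorithm>
#include <cstdint>
#include <numeric>
#include <random>
#include <vector>

// Draw num_picks indices in [0, weights.size()) with replacement, with
// probability proportional to the non-negative weights.
static std::vector<int64_t> WeightedPicksWithReplacementHost(
    const std::vector<float>& weights, int64_t num_picks, uint64_t seed) {
  std::vector<float> cdf(weights.size());
  std::partial_sum(weights.begin(), weights.end(), cdf.begin());
  std::mt19937 rng(seed);
  std::uniform_real_distribution<float> uni(0.0f, 1.0f);
  std::vector<int64_t> picks(static_cast<size_t>(num_picks));
  for (int64_t i = 0; i < num_picks; ++i) {
    const float r = uni(rng) * cdf.back();  // scale into [0, total weight)
    auto it = std::upper_bound(cdf.begin(), cdf.end(), r);
    picks[i] = std::min<int64_t>(
        it - cdf.begin(), static_cast<int64_t>(cdf.size()) - 1);
  }
  return picks;
}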
- int64_t item = cub::UpperBound( + int64_t item = hipcub::UpperBound( &cdf[cdf_row_start], deg, rand); item = min(item, deg - 1); // get in and out index @@ -441,7 +442,7 @@ COOMatrix _COORemoveIf( const COOMatrix& coo, const NDArray& values, DType criteria) { const DType* val = values.Ptr(); auto maskgen = [val, criteria]( - int nb, int nt, cudaStream_t stream, int64_t nnz, + int nb, int nt, hipStream_t stream, int64_t nnz, const IdType* data, int8_t* flags) { CUDA_KERNEL_CALL( (_GenerateFlagsKernel), nb, nt, 0, stream, nnz, @@ -481,7 +482,7 @@ COOMatrix _CSRRowWiseSampling( const FloatArray& prob, bool replace) { const auto& ctx = rows->ctx; auto device = runtime::DeviceAPI::Get(ctx); - cudaStream_t stream = runtime::getCurrentCUDAStream(); + hipStream_t stream = runtime::getCurrentCUDAStream(); const int64_t num_rows = rows->shape[0]; const IdType* const slice_rows = static_cast(rows->data); @@ -530,10 +531,10 @@ COOMatrix _CSRRowWiseSampling( IdType* temp_ptr = static_cast( device->AllocWorkspace(ctx, (num_rows + 1) * sizeof(IdType))); size_t prefix_temp_size = 0; - CUDA_CALL(cub::DeviceScan::ExclusiveSum( + CUDA_CALL(hipcub::DeviceScan::ExclusiveSum( nullptr, prefix_temp_size, temp_deg, temp_ptr, num_rows + 1, stream)); void* prefix_temp = device->AllocWorkspace(ctx, prefix_temp_size); - CUDA_CALL(cub::DeviceScan::ExclusiveSum( + CUDA_CALL(hipcub::DeviceScan::ExclusiveSum( prefix_temp, prefix_temp_size, temp_deg, temp_ptr, num_rows + 1, stream)); device->FreeWorkspace(ctx, prefix_temp); device->FreeWorkspace(ctx, temp_deg); @@ -551,16 +552,16 @@ COOMatrix _CSRRowWiseSampling( IdType* out_ptr = static_cast( device->AllocWorkspace(ctx, (num_rows + 1) * sizeof(IdType))); prefix_temp_size = 0; - CUDA_CALL(cub::DeviceScan::ExclusiveSum( + CUDA_CALL(hipcub::DeviceScan::ExclusiveSum( nullptr, prefix_temp_size, out_deg, out_ptr, num_rows + 1, stream)); prefix_temp = device->AllocWorkspace(ctx, prefix_temp_size); - CUDA_CALL(cub::DeviceScan::ExclusiveSum( + CUDA_CALL(hipcub::DeviceScan::ExclusiveSum( prefix_temp, prefix_temp_size, out_deg, out_ptr, num_rows + 1, stream)); device->FreeWorkspace(ctx, prefix_temp); device->FreeWorkspace(ctx, out_deg); - cudaEvent_t copyEvent; - CUDA_CALL(cudaEventCreate(©Event)); + hipEvent_t copyEvent; + CUDA_CALL(hipEventCreate(©Event)); // TODO(dlasalle): use pinned memory to overlap with the actual sampling, and // wait on a cudaevent IdType new_len; @@ -568,7 +569,7 @@ COOMatrix _CSRRowWiseSampling( device->CopyDataFromTo( out_ptr, num_rows * sizeof(new_len), &new_len, 0, sizeof(new_len), ctx, DGLContext{kDGLCPU, 0}, mat.indptr->dtype); - CUDA_CALL(cudaEventRecord(copyEvent, stream)); + CUDA_CALL(hipEventRecord(copyEvent, stream)); // allocate workspace // 1) for w/ replacement, it's a global buffer to store cdf segments (one @@ -612,16 +613,16 @@ COOMatrix _CSRRowWiseSampling( IdType* sort_temp_idxs = static_cast( device->AllocWorkspace(ctx, temp_len * sizeof(IdType))); - cub::DoubleBuffer sort_keys(temp, sort_temp); - cub::DoubleBuffer sort_values(temp_idxs, sort_temp_idxs); + hipcub::DoubleBuffer sort_keys(temp, sort_temp); + hipcub::DoubleBuffer sort_values(temp_idxs, sort_temp_idxs); void* d_temp_storage = nullptr; size_t temp_storage_bytes = 0; - CUDA_CALL(cub::DeviceSegmentedSort::SortPairsDescending( + CUDA_CALL(hipcub::DeviceSegmentedSort::SortPairsDescending( d_temp_storage, temp_storage_bytes, sort_keys, sort_values, temp_len, num_rows, temp_ptr, temp_ptr + 1, stream)); d_temp_storage = device->AllocWorkspace(ctx, temp_storage_bytes); - 
CUDA_CALL(cub::DeviceSegmentedSort::SortPairsDescending( + CUDA_CALL(hipcub::DeviceSegmentedSort::SortPairsDescending( d_temp_storage, temp_storage_bytes, sort_keys, sort_values, temp_len, num_rows, temp_ptr, temp_ptr + 1, stream)); device->FreeWorkspace(ctx, d_temp_storage); @@ -641,8 +642,8 @@ COOMatrix _CSRRowWiseSampling( device->FreeWorkspace(ctx, out_ptr); // wait for copying `new_len` to finish - CUDA_CALL(cudaEventSynchronize(copyEvent)); - CUDA_CALL(cudaEventDestroy(copyEvent)); + CUDA_CALL(hipEventSynchronize(copyEvent)); + CUDA_CALL(hipEventDestroy(copyEvent)); picked_row = picked_row.CreateView({new_len}, picked_row->dtype); picked_col = picked_col.CreateView({new_len}, picked_col->dtype); diff --git a/src/array/cuda/rowwise_sampling_prob.cu.prehip b/src/array/cuda/rowwise_sampling_prob.cu.prehip new file mode 100644 index 000000000000..bab553ca8b56 --- /dev/null +++ b/src/array/cuda/rowwise_sampling_prob.cu.prehip @@ -0,0 +1,696 @@ +/** + * Copyright (c) 2022 by Contributors + * @file array/cuda/rowwise_sampling_prob.cu + * @brief weighted rowwise sampling. The degree computing kernels and + * host-side functions are partially borrowed from the uniform rowwise + * sampling code rowwise_sampling.cu. + * @author pengqirong (OPPO), dlasalle and Xin from Nvidia. + */ +#include +#include +#include + +#include +#include + +#include "../../array/cuda/atomic.cuh" +#include "../../runtime/cuda/cuda_common.h" +#include "./utils.h" + +// require CUB 1.17 to use DeviceSegmentedSort +static_assert( + CUB_VERSION >= 101700, "Require CUB >= 1.17 to use DeviceSegmentedSort"); + +namespace dgl { +using namespace cuda; +using namespace aten::cuda; +namespace aten { +namespace impl { + +namespace { + +constexpr int BLOCK_SIZE = 128; + +/** + * @brief Compute the size of each row in the sampled CSR, without replacement. + * temp_deg is calculated for rows with deg > num_picks. + * For these rows, we will calculate their A-Res values and sort them to get + * top-num_picks. + * + * @tparam IdType The type of node and edge indexes. + * @param num_picks The number of non-zero entries to pick per row. + * @param num_rows The number of rows to pick. + * @param in_rows The set of rows to pick. + * @param in_ptr The index where each row's edges start. + * @param out_deg The size of each row in the sampled matrix, as indexed by + * `in_rows` (output). + * @param temp_deg The size of each row in the input matrix, as indexed by + * `in_rows` (output). + */ +template +__global__ void _CSRRowWiseSampleDegreeKernel( + const int64_t num_picks, const int64_t num_rows, + const IdType* const in_rows, const IdType* const in_ptr, + IdType* const out_deg, IdType* const temp_deg) { + const int64_t tIdx = threadIdx.x + blockIdx.x * blockDim.x; + + if (tIdx < num_rows) { + const int64_t in_row = in_rows[tIdx]; + const int64_t out_row = tIdx; + const IdType deg = in_ptr[in_row + 1] - in_ptr[in_row]; + // temp_deg is used to generate ares_ptr + temp_deg[out_row] = deg > static_cast(num_picks) ? deg : 0; + out_deg[out_row] = min(static_cast(num_picks), deg); + + if (out_row == num_rows - 1) { + // make the prefixsum work + out_deg[num_rows] = 0; + temp_deg[num_rows] = 0; + } + } +} + +/** + * @brief Compute the size of each row in the sampled CSR, with replacement. + * We need the actual in degree of each row to store CDF values. + * + * @tparam IdType The type of node and edge indexes. + * @param num_picks The number of non-zero entries to pick per row. + * @param num_rows The number of rows to pick. 
+ * @param in_rows The set of rows to pick. + * @param in_ptr The index where each row's edges start. + * @param out_deg The size of each row in the sampled matrix, as indexed by + * `in_rows` (output). + * @param temp_deg The size of each row in the input matrix, as indexed by + * `in_rows` (output). + */ +template +__global__ void _CSRRowWiseSampleDegreeReplaceKernel( + const int64_t num_picks, const int64_t num_rows, + const IdType* const in_rows, const IdType* const in_ptr, + IdType* const out_deg, IdType* const temp_deg) { + const int64_t tIdx = threadIdx.x + blockIdx.x * blockDim.x; + + if (tIdx < num_rows) { + const int64_t in_row = in_rows[tIdx]; + const int64_t out_row = tIdx; + const IdType deg = in_ptr[in_row + 1] - in_ptr[in_row]; + temp_deg[out_row] = deg; + out_deg[out_row] = deg == 0 ? 0 : static_cast(num_picks); + + if (out_row == num_rows - 1) { + // make the prefixsum work + out_deg[num_rows] = 0; + temp_deg[num_rows] = 0; + } + } +} + +/** + * @brief Equivalent to numpy expression: array[idx[off:off + len]] + * + * @tparam IdType The ID type used for indices. + * @tparam FloatType The float type used for array values. + * @param array The array to be selected. + * @param idx_data The index mapping array. + * @param index The index of value to be selected. + * @param offset The offset to start. + * @param out The selected value (output). + */ +template +__device__ void _DoubleSlice( + const FloatType* const array, const IdType* const idx_data, + const IdType idx, const IdType offset, FloatType* const out) { + if (idx_data) { + *out = array[idx_data[offset + idx]]; + } else { + *out = array[offset + idx]; + } +} + +/** + * @brief Compute A-Res value. A-Res value needs to be calculated only if deg + * is greater than num_picks in weighted rowwise sampling without replacement. + * + * @tparam IdType The ID type used for matrices. + * @tparam FloatType The Float type used for matrices. + * @tparam TILE_SIZE The number of rows covered by each threadblock. + * @param rand_seed The random seed to use. + * @param num_picks The number of non-zeros to pick per row. + * @param num_rows The number of rows to pick. + * @param in_rows The set of rows to pick. + * @param in_ptr The indptr array of the input CSR. + * @param data The data array of the input CSR. + * @param prob The probability array of the input CSR. + * @param ares_ptr The offset to write each row to in the A-res array. + * @param ares_idxs The A-Res value corresponding index array, the index of + * input CSR (output). + * @param ares The A-Res value array (output). 
+ * @author pengqirong (OPPO) + */ +template +__global__ void _CSRAResValueKernel( + const uint64_t rand_seed, const int64_t num_picks, const int64_t num_rows, + const IdType* const in_rows, const IdType* const in_ptr, + const IdType* const data, const FloatType* const prob, + const IdType* const ares_ptr, IdType* const ares_idxs, + FloatType* const ares) { + int64_t out_row = blockIdx.x * TILE_SIZE; + const int64_t last_row = + min(static_cast(blockIdx.x + 1) * TILE_SIZE, num_rows); + + curandStatePhilox4_32_10_t rng; + curand_init(rand_seed * gridDim.x + blockIdx.x, threadIdx.x, 0, &rng); + + while (out_row < last_row) { + const int64_t row = in_rows[out_row]; + const int64_t in_row_start = in_ptr[row]; + const int64_t deg = in_ptr[row + 1] - in_row_start; + // A-Res value needs to be calculated only if deg is greater than num_picks + // in weighted rowwise sampling without replacement + if (deg > num_picks) { + const int64_t ares_row_start = ares_ptr[out_row]; + + for (int64_t idx = threadIdx.x; idx < deg; idx += BLOCK_SIZE) { + const int64_t in_idx = in_row_start + idx; + const int64_t ares_idx = ares_row_start + idx; + FloatType item_prob; + _DoubleSlice( + prob, data, idx, in_row_start, &item_prob); + // compute A-Res value + ares[ares_idx] = static_cast( + __powf(curand_uniform(&rng), 1.0f / item_prob)); + ares_idxs[ares_idx] = static_cast(in_idx); + } + } + out_row += 1; + } +} + +/** + * @brief Perform weighted row-wise sampling on a CSR matrix, and generate a COO + * matrix, without replacement. After sorting, we select top-num_picks items. + * + * @tparam IdType The ID type used for matrices. + * @tparam FloatType The Float type used for matrices. + * @tparam TILE_SIZE The number of rows covered by each threadblock. + * @param num_picks The number of non-zeros to pick per row. + * @param num_rows The number of rows to pick. + * @param in_rows The set of rows to pick. + * @param in_ptr The indptr array of the input CSR. + * @param in_cols The columns array of the input CSR. + * @param data The data array of the input CSR. + * @param out_ptr The offset to write each row to in the output COO. + * @param ares_ptr The offset to write each row to in the ares array. + * @param sort_ares_idxs The sorted A-Res value corresponding index array, the + * index of input CSR. + * @param out_rows The rows of the output COO (output). + * @param out_cols The columns of the output COO (output). + * @param out_idxs The data array of the output COO (output). 
+ * @author pengqirong (OPPO) + */ +template +__global__ void _CSRRowWiseSampleKernel( + const int64_t num_picks, const int64_t num_rows, + const IdType* const in_rows, const IdType* const in_ptr, + const IdType* const in_cols, const IdType* const data, + const IdType* const out_ptr, const IdType* const ares_ptr, + const IdType* const sort_ares_idxs, IdType* const out_rows, + IdType* const out_cols, IdType* const out_idxs) { + // we assign one warp per row + assert(blockDim.x == BLOCK_SIZE); + + int64_t out_row = blockIdx.x * TILE_SIZE; + const int64_t last_row = + min(static_cast(blockIdx.x + 1) * TILE_SIZE, num_rows); + + while (out_row < last_row) { + const int64_t row = in_rows[out_row]; + const int64_t in_row_start = in_ptr[row]; + const int64_t out_row_start = out_ptr[out_row]; + const int64_t deg = in_ptr[row + 1] - in_row_start; + + if (deg > num_picks) { + const int64_t ares_row_start = ares_ptr[out_row]; + for (int64_t idx = threadIdx.x; idx < num_picks; idx += BLOCK_SIZE) { + // get in and out index, the in_idx is one of top num_picks A-Res value + // corresponding index in input CSR. + const int64_t out_idx = out_row_start + idx; + const int64_t ares_idx = ares_row_start + idx; + const int64_t in_idx = sort_ares_idxs[ares_idx]; + // copy permutation over + out_rows[out_idx] = static_cast(row); + out_cols[out_idx] = in_cols[in_idx]; + out_idxs[out_idx] = static_cast(data ? data[in_idx] : in_idx); + } + } else { + for (int64_t idx = threadIdx.x; idx < deg; idx += BLOCK_SIZE) { + // get in and out index + const int64_t out_idx = out_row_start + idx; + const int64_t in_idx = in_row_start + idx; + // copy permutation over + out_rows[out_idx] = static_cast(row); + out_cols[out_idx] = in_cols[in_idx]; + out_idxs[out_idx] = static_cast(data ? data[in_idx] : in_idx); + } + } + out_row += 1; + } +} + +// A stateful callback functor that maintains a running prefix to be applied +// during consecutive scan operations. +template +struct BlockPrefixCallbackOp { + // Running prefix + FloatType running_total; + // Constructor + __device__ BlockPrefixCallbackOp(FloatType running_total) + : running_total(running_total) {} + // Callback operator to be entered by the first warp of threads in the block. + // Thread-0 is responsible for returning a value for seeding the block-wide + // scan. + __device__ FloatType operator()(FloatType block_aggregate) { + FloatType old_prefix = running_total; + running_total += block_aggregate; + return old_prefix; + } +}; + +/** + * @brief Perform weighted row-wise sampling on a CSR matrix, and generate a COO + * matrix, with replacement. We store the CDF (unnormalized) of all neighbors of + * a row in global memory and use binary search to find inverse indices as + * selected items. + * + * @tparam IdType The ID type used for matrices. + * @tparam FloatType The Float type used for matrices. + * @tparam TILE_SIZE The number of rows covered by each threadblock. + * @param rand_seed The random seed to use. + * @param num_picks The number of non-zeros to pick per row. + * @param num_rows The number of rows to pick. + * @param in_rows The set of rows to pick. + * @param in_ptr The indptr array of the input CSR. + * @param in_cols The columns array of the input CSR. + * @param data The data array of the input CSR. + * @param prob The probability array of the input CSR. + * @param out_ptr The offset to write each row to in the output COO. + * @param cdf_ptr The offset of each cdf segment. + * @param cdf The global buffer to store cdf segments. 
+ * @param out_rows The rows of the output COO (output). + * @param out_cols The columns of the output COO (output). + * @param out_idxs The data array of the output COO (output). + * @author pengqirong (OPPO) + */ +template +__global__ void _CSRRowWiseSampleReplaceKernel( + const uint64_t rand_seed, const int64_t num_picks, const int64_t num_rows, + const IdType* const in_rows, const IdType* const in_ptr, + const IdType* const in_cols, const IdType* const data, + const FloatType* const prob, const IdType* const out_ptr, + const IdType* const cdf_ptr, FloatType* const cdf, IdType* const out_rows, + IdType* const out_cols, IdType* const out_idxs) { + // we assign one warp per row + assert(blockDim.x == BLOCK_SIZE); + + int64_t out_row = blockIdx.x * TILE_SIZE; + const int64_t last_row = + min(static_cast(blockIdx.x + 1) * TILE_SIZE, num_rows); + + curandStatePhilox4_32_10_t rng; + curand_init(rand_seed * gridDim.x + blockIdx.x, threadIdx.x, 0, &rng); + + while (out_row < last_row) { + const int64_t row = in_rows[out_row]; + const int64_t in_row_start = in_ptr[row]; + const int64_t out_row_start = out_ptr[out_row]; + const int64_t cdf_row_start = cdf_ptr[out_row]; + const int64_t deg = in_ptr[row + 1] - in_row_start; + const FloatType MIN_THREAD_DATA = static_cast(0.0f); + + if (deg > 0) { + // Specialize BlockScan for a 1D block of BLOCK_SIZE threads + typedef cub::BlockScan BlockScan; + // Allocate shared memory for BlockScan + __shared__ typename BlockScan::TempStorage temp_storage; + // Initialize running total + BlockPrefixCallbackOp prefix_op(MIN_THREAD_DATA); + + int64_t max_iter = (1 + (deg - 1) / BLOCK_SIZE) * BLOCK_SIZE; + // Have the block iterate over segments of items + for (int64_t idx = threadIdx.x; idx < max_iter; idx += BLOCK_SIZE) { + // Load a segment of consecutive items that are blocked across threads + FloatType thread_data; + if (idx < deg) + _DoubleSlice( + prob, data, idx, in_row_start, &thread_data); + else + thread_data = MIN_THREAD_DATA; + thread_data = max(thread_data, MIN_THREAD_DATA); + // Collectively compute the block-wide inclusive prefix sum + BlockScan(temp_storage) + .InclusiveSum(thread_data, thread_data, prefix_op); + __syncthreads(); + + // Store scanned items to cdf array + if (idx < deg) { + cdf[cdf_row_start + idx] = thread_data; + } + } + __syncthreads(); + + for (int64_t idx = threadIdx.x; idx < num_picks; idx += BLOCK_SIZE) { + // get random value + FloatType sum = cdf[cdf_row_start + deg - 1]; + FloatType rand = static_cast(curand_uniform(&rng) * sum); + // get the offset of the first value within cdf array which is greater + // than random value. + int64_t item = cub::UpperBound( + &cdf[cdf_row_start], deg, rand); + item = min(item, deg - 1); + // get in and out index + const int64_t in_idx = in_row_start + item; + const int64_t out_idx = out_row_start + idx; + // copy permutation over + out_rows[out_idx] = static_cast(row); + out_cols[out_idx] = in_cols[in_idx]; + out_idxs[out_idx] = static_cast(data ? data[in_idx] : in_idx); + } + } + out_row += 1; + } +} + +template +__global__ void _GenerateFlagsKernel( + int64_t n, const IdType* idx, const DType* values, DType criteria, + BoolType* output) { + int tx = blockIdx.x * blockDim.x + threadIdx.x; + const int stride_x = gridDim.x * blockDim.x; + while (tx < n) { + output[tx] = (values[idx ? 
idx[tx] : tx] != criteria); + tx += stride_x; + } +} + +template +COOMatrix COOGeneralRemoveIf(const COOMatrix& coo, MaskGen maskgen) { + using namespace dgl::cuda; + + const auto idtype = coo.row->dtype; + const auto ctx = coo.row->ctx; + const int64_t nnz = coo.row->shape[0]; + const IdType* row = coo.row.Ptr(); + const IdType* col = coo.col.Ptr(); + const IdArray& eid = + COOHasData(coo) ? coo.data : Range(0, nnz, sizeof(IdType) * 8, ctx); + const IdType* data = coo.data.Ptr(); + IdArray new_row = IdArray::Empty({nnz}, idtype, ctx); + IdArray new_col = IdArray::Empty({nnz}, idtype, ctx); + IdArray new_eid = IdArray::Empty({nnz}, idtype, ctx); + IdType* new_row_data = new_row.Ptr(); + IdType* new_col_data = new_col.Ptr(); + IdType* new_eid_data = new_eid.Ptr(); + auto stream = runtime::getCurrentCUDAStream(); + auto device = runtime::DeviceAPI::Get(ctx); + + int8_t* flags = static_cast(device->AllocWorkspace(ctx, nnz)); + int nt = dgl::cuda::FindNumThreads(nnz); + int64_t nb = (nnz + nt - 1) / nt; + + maskgen(nb, nt, stream, nnz, data, flags); + + int64_t* rst = + static_cast(device->AllocWorkspace(ctx, sizeof(int64_t))); + MaskSelect(device, ctx, row, flags, new_row_data, nnz, rst, stream); + MaskSelect(device, ctx, col, flags, new_col_data, nnz, rst, stream); + MaskSelect(device, ctx, data, flags, new_eid_data, nnz, rst, stream); + + int64_t new_len = GetCUDAScalar(device, ctx, rst); + + device->FreeWorkspace(ctx, flags); + device->FreeWorkspace(ctx, rst); + return COOMatrix( + coo.num_rows, coo.num_cols, new_row.CreateView({new_len}, idtype, 0), + new_col.CreateView({new_len}, idtype, 0), + new_eid.CreateView({new_len}, idtype, 0)); +} + +template +COOMatrix _COORemoveIf( + const COOMatrix& coo, const NDArray& values, DType criteria) { + const DType* val = values.Ptr(); + auto maskgen = [val, criteria]( + int nb, int nt, cudaStream_t stream, int64_t nnz, + const IdType* data, int8_t* flags) { + CUDA_KERNEL_CALL( + (_GenerateFlagsKernel), nb, nt, 0, stream, nnz, + data, val, criteria, flags); + }; + return COOGeneralRemoveIf( + coo, maskgen); +} + +} // namespace + +/////////////////////////////// CSR /////////////////////////////// + +/** + * @brief Perform weighted row-wise sampling on a CSR matrix, and generate a COO + * matrix. Use CDF sampling algorithm for with replacement: + * 1) Calculate the CDF of all neighbor's prob. + * 2) For each [0, num_picks), generate a rand ~ U(0, 1). Use binary search to + * find its index in the CDF array as a chosen item. + * Use A-Res sampling algorithm for without replacement: + * 1) For rows with deg > num_picks, calculate A-Res values for all neighbors. + * 2) Sort the A-Res array and select top-num_picks as chosen items. + * + * @tparam XPU The device type used for matrices. + * @tparam IdType The ID type used for matrices. + * @tparam FloatType The Float type used for matrices. + * @param mat The CSR matrix. + * @param rows The set of rows to pick. + * @param num_picks The number of non-zeros to pick per row. + * @param prob The probability array of the input CSR. + * @param replace Is replacement sampling? + * @author pengqirong (OPPO), dlasalle and Xin from Nvidia. 
+ */ +template +COOMatrix _CSRRowWiseSampling( + const CSRMatrix& mat, const IdArray& rows, int64_t num_picks, + const FloatArray& prob, bool replace) { + const auto& ctx = rows->ctx; + auto device = runtime::DeviceAPI::Get(ctx); + cudaStream_t stream = runtime::getCurrentCUDAStream(); + + const int64_t num_rows = rows->shape[0]; + const IdType* const slice_rows = static_cast(rows->data); + + IdArray picked_row = + NewIdArray(num_rows * num_picks, ctx, sizeof(IdType) * 8); + IdArray picked_col = + NewIdArray(num_rows * num_picks, ctx, sizeof(IdType) * 8); + IdArray picked_idx = + NewIdArray(num_rows * num_picks, ctx, sizeof(IdType) * 8); + IdType* const out_rows = static_cast(picked_row->data); + IdType* const out_cols = static_cast(picked_col->data); + IdType* const out_idxs = static_cast(picked_idx->data); + + const IdType* in_ptr = static_cast(GetDevicePointer(mat.indptr)); + const IdType* in_cols = static_cast(GetDevicePointer(mat.indices)); + const IdType* data = CSRHasData(mat) + ? static_cast(GetDevicePointer(mat.data)) + : nullptr; + const FloatType* prob_data = static_cast(GetDevicePointer(prob)); + + // compute degree + // out_deg: the size of each row in the sampled matrix + // temp_deg: the size of each row we will manipulate in sampling + // 1) for w/o replacement: in degree if it's greater than num_picks else 0 + // 2) for w/ replacement: in degree + IdType* out_deg = static_cast( + device->AllocWorkspace(ctx, (num_rows + 1) * sizeof(IdType))); + IdType* temp_deg = static_cast( + device->AllocWorkspace(ctx, (num_rows + 1) * sizeof(IdType))); + if (replace) { + const dim3 block(512); + const dim3 grid((num_rows + block.x - 1) / block.x); + CUDA_KERNEL_CALL( + _CSRRowWiseSampleDegreeReplaceKernel, grid, block, 0, stream, num_picks, + num_rows, slice_rows, in_ptr, out_deg, temp_deg); + } else { + const dim3 block(512); + const dim3 grid((num_rows + block.x - 1) / block.x); + CUDA_KERNEL_CALL( + _CSRRowWiseSampleDegreeKernel, grid, block, 0, stream, num_picks, + num_rows, slice_rows, in_ptr, out_deg, temp_deg); + } + + // fill temp_ptr + IdType* temp_ptr = static_cast( + device->AllocWorkspace(ctx, (num_rows + 1) * sizeof(IdType))); + size_t prefix_temp_size = 0; + CUDA_CALL(cub::DeviceScan::ExclusiveSum( + nullptr, prefix_temp_size, temp_deg, temp_ptr, num_rows + 1, stream)); + void* prefix_temp = device->AllocWorkspace(ctx, prefix_temp_size); + CUDA_CALL(cub::DeviceScan::ExclusiveSum( + prefix_temp, prefix_temp_size, temp_deg, temp_ptr, num_rows + 1, stream)); + device->FreeWorkspace(ctx, prefix_temp); + device->FreeWorkspace(ctx, temp_deg); + + // TODO(Xin): The copy here is too small, and the overhead of creating + // cuda events cannot be ignored. Just use synchronized copy. + IdType temp_len; + // copy using the internal current stream. 
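// Reading a single length value back from the GPU follows the same pattern in
// both samplers: issue the copy on the current stream, then block only on that
// copy (StreamSync here, a recorded event in the uniform sampler).  A
// standalone sketch of the event-based variant with the plain CUDA runtime API
// (hipify rewrites each cuda* call to its hip* equivalent one-for-one);
// illustration only, with error checking reduced to void casts:
//
#include <cuda_runtime.h>
#include <cstdint>

static int64_t ReadDeviceScalar(const int64_t* d_value, cudaStream_t stream) {
  int64_t h_value = 0;  // pageable memory: the copy may block; pin it to overlap
  cudaEvent_t done;
  (void)cudaEventCreate(&done);
  (void)cudaMemcpyAsync(
      &h_value, d_value, sizeof(h_value), cudaMemcpyDeviceToHost, stream);
  (void)cudaEventRecord(done, stream);
  (void)cudaEventSynchronize(done);  // wait for the copy, not the whole device
  (void)cudaEventDestroy(done);
  return h_value;
}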
+ device->CopyDataFromTo( + temp_ptr, num_rows * sizeof(temp_len), &temp_len, 0, sizeof(temp_len), + ctx, DGLContext{kDGLCPU, 0}, mat.indptr->dtype); + device->StreamSync(ctx, stream); + + // fill out_ptr + IdType* out_ptr = static_cast( + device->AllocWorkspace(ctx, (num_rows + 1) * sizeof(IdType))); + prefix_temp_size = 0; + CUDA_CALL(cub::DeviceScan::ExclusiveSum( + nullptr, prefix_temp_size, out_deg, out_ptr, num_rows + 1, stream)); + prefix_temp = device->AllocWorkspace(ctx, prefix_temp_size); + CUDA_CALL(cub::DeviceScan::ExclusiveSum( + prefix_temp, prefix_temp_size, out_deg, out_ptr, num_rows + 1, stream)); + device->FreeWorkspace(ctx, prefix_temp); + device->FreeWorkspace(ctx, out_deg); + + cudaEvent_t copyEvent; + CUDA_CALL(cudaEventCreate(©Event)); + // TODO(dlasalle): use pinned memory to overlap with the actual sampling, and + // wait on a cudaevent + IdType new_len; + // copy using the internal current stream. + device->CopyDataFromTo( + out_ptr, num_rows * sizeof(new_len), &new_len, 0, sizeof(new_len), ctx, + DGLContext{kDGLCPU, 0}, mat.indptr->dtype); + CUDA_CALL(cudaEventRecord(copyEvent, stream)); + + // allocate workspace + // 1) for w/ replacement, it's a global buffer to store cdf segments (one + // segment for each row). + // 2) for w/o replacement, it's used to store a-res segments (one segment for + // each row with degree > num_picks) + FloatType* temp = static_cast( + device->AllocWorkspace(ctx, temp_len * sizeof(FloatType))); + + const uint64_t rand_seed = RandomEngine::ThreadLocal()->RandInt(1000000000); + + // select edges + // the number of rows each thread block will cover + constexpr int TILE_SIZE = 128 / BLOCK_SIZE; + if (replace) { // with replacement. + const dim3 block(BLOCK_SIZE); + const dim3 grid((num_rows + TILE_SIZE - 1) / TILE_SIZE); + CUDA_KERNEL_CALL( + (_CSRRowWiseSampleReplaceKernel), grid, + block, 0, stream, rand_seed, num_picks, num_rows, slice_rows, in_ptr, + in_cols, data, prob_data, out_ptr, temp_ptr, temp, out_rows, out_cols, + out_idxs); + device->FreeWorkspace(ctx, temp); + } else { // without replacement + IdType* temp_idxs = static_cast( + device->AllocWorkspace(ctx, (temp_len) * sizeof(IdType))); + + // Compute A-Res value. A-Res value needs to be calculated only if deg + // is greater than num_picks in weighted rowwise sampling without + // replacement. + const dim3 block(BLOCK_SIZE); + const dim3 grid((num_rows + TILE_SIZE - 1) / TILE_SIZE); + CUDA_KERNEL_CALL( + (_CSRAResValueKernel), grid, block, 0, + stream, rand_seed, num_picks, num_rows, slice_rows, in_ptr, data, + prob_data, temp_ptr, temp_idxs, temp); + + // sort A-Res value array. 
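// The A-Res key for an item with weight w is u^(1/w) with u ~ U(0, 1); keeping
// the num_picks largest keys gives a weighted sample without replacement
// (Efraimidis-Spirakis), which is what the segmented descending sort below
// performs per row.  A per-row host sketch that uses a full sort instead,
// assuming std::mt19937 (illustration only):
//
#include <algorithm>
#include <cmath>
#include <cstdint>
#include <numeric>
#include <random>
#include <vector>

// Return the positions of the num_picks items with the largest A-Res keys.
static std::vector<int64_t> AResSampleHost(
    const std::vector<float>& weights, int64_t num_picks, uint64_t seed) {
  std::mt19937 rng(seed);
  std::uniform_real_distribution<float> uni(0.0f, 1.0f);
  std::vector<float> key(weights.size());
  std::vector<int64_t> order(weights.size());
  std::iota(order.begin(), order.end(), 0);
  for (size_t i = 0; i < weights.size(); ++i)
    key[i] = std::pow(uni(rng), 1.0f / weights[i]);  // A-Res key u^(1/w)
  std::sort(order.begin(), order.end(),
            [&](int64_t a, int64_t b) { return key[a] > key[b]; });
  if (static_cast<int64_t>(order.size()) > num_picks) order.resize(num_picks);
  return order;  // selected positions, highest key first
}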
+ FloatType* sort_temp = static_cast( + device->AllocWorkspace(ctx, temp_len * sizeof(FloatType))); + IdType* sort_temp_idxs = static_cast( + device->AllocWorkspace(ctx, temp_len * sizeof(IdType))); + + cub::DoubleBuffer sort_keys(temp, sort_temp); + cub::DoubleBuffer sort_values(temp_idxs, sort_temp_idxs); + + void* d_temp_storage = nullptr; + size_t temp_storage_bytes = 0; + CUDA_CALL(cub::DeviceSegmentedSort::SortPairsDescending( + d_temp_storage, temp_storage_bytes, sort_keys, sort_values, temp_len, + num_rows, temp_ptr, temp_ptr + 1, stream)); + d_temp_storage = device->AllocWorkspace(ctx, temp_storage_bytes); + CUDA_CALL(cub::DeviceSegmentedSort::SortPairsDescending( + d_temp_storage, temp_storage_bytes, sort_keys, sort_values, temp_len, + num_rows, temp_ptr, temp_ptr + 1, stream)); + device->FreeWorkspace(ctx, d_temp_storage); + device->FreeWorkspace(ctx, temp); + device->FreeWorkspace(ctx, temp_idxs); + device->FreeWorkspace(ctx, sort_temp); + device->FreeWorkspace(ctx, sort_temp_idxs); + + // select tok-num_picks as results + CUDA_KERNEL_CALL( + (_CSRRowWiseSampleKernel), grid, block, 0, + stream, num_picks, num_rows, slice_rows, in_ptr, in_cols, data, out_ptr, + temp_ptr, sort_values.Current(), out_rows, out_cols, out_idxs); + } + + device->FreeWorkspace(ctx, temp_ptr); + device->FreeWorkspace(ctx, out_ptr); + + // wait for copying `new_len` to finish + CUDA_CALL(cudaEventSynchronize(copyEvent)); + CUDA_CALL(cudaEventDestroy(copyEvent)); + + picked_row = picked_row.CreateView({new_len}, picked_row->dtype); + picked_col = picked_col.CreateView({new_len}, picked_col->dtype); + picked_idx = picked_idx.CreateView({new_len}, picked_idx->dtype); + + return COOMatrix( + mat.num_rows, mat.num_cols, picked_row, picked_col, picked_idx); +} + +template +COOMatrix CSRRowWiseSampling( + CSRMatrix mat, IdArray rows, int64_t num_picks, FloatArray prob, + bool replace) { + COOMatrix result; + if (num_picks == -1) { + // Basically this is UnitGraph::InEdges(). + COOMatrix coo = CSRToCOO(CSRSliceRows(mat, rows), false); + IdArray sliced_rows = IndexSelect(rows, coo.row); + result = + COOMatrix(mat.num_rows, mat.num_cols, sliced_rows, coo.col, coo.data); + } else { + result = _CSRRowWiseSampling( + mat, rows, num_picks, prob, replace); + } + // NOTE(BarclayII): I'm removing the entries with zero probability after + // sampling. Is there a better way? 
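// _COORemoveIf below keeps an edge only if its probability differs from the
// criteria (0 here): a kernel writes one flag per edge and MaskSelect then
// compacts row, col and eid with the same mask.  A host sketch of that filter
// over COO triples, with a hypothetical Coo struct (illustration only):
//
#include <cstdint>
#include <vector>

struct Coo { std::vector<int64_t> row, col, eid; };

static Coo RemoveZeroProbHost(const Coo& in, const std::vector<float>& prob) {
  Coo out;
  for (size_t i = 0; i < in.row.size(); ++i) {
    if (prob[in.eid[i]] != 0.0f) {  // flag = (values[data[i]] != criteria)
      out.row.push_back(in.row[i]);
      out.col.push_back(in.col[i]);
      out.eid.push_back(in.eid[i]);
    }
  }
  return out;
}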
+ return _COORemoveIf(result, prob, static_cast(0)); +} + +template COOMatrix CSRRowWiseSampling( + CSRMatrix, IdArray, int64_t, FloatArray, bool); +template COOMatrix CSRRowWiseSampling( + CSRMatrix, IdArray, int64_t, FloatArray, bool); +template COOMatrix CSRRowWiseSampling( + CSRMatrix, IdArray, int64_t, FloatArray, bool); +template COOMatrix CSRRowWiseSampling( + CSRMatrix, IdArray, int64_t, FloatArray, bool); +// These are not being called, but we instantiate them anyway to prevent missing +// symbols in Debug build +template COOMatrix CSRRowWiseSampling( + CSRMatrix, IdArray, int64_t, FloatArray, bool); +template COOMatrix CSRRowWiseSampling( + CSRMatrix, IdArray, int64_t, FloatArray, bool); +template COOMatrix CSRRowWiseSampling( + CSRMatrix, IdArray, int64_t, FloatArray, bool); +template COOMatrix CSRRowWiseSampling( + CSRMatrix, IdArray, int64_t, FloatArray, bool); + +} // namespace impl +} // namespace aten +} // namespace dgl diff --git a/src/array/cuda/sddmm.cu b/src/array/cuda/sddmm.cu index 957d1a02a377..3f00596f6af2 100644 --- a/src/array/cuda/sddmm.cu +++ b/src/array/cuda/sddmm.cu @@ -48,10 +48,10 @@ template void SDDMMCsr( const std::string& op, const BcastOff& bcast, const CSRMatrix& csr, NDArray lhs, NDArray rhs, NDArray out, int lhs_target, int rhs_target); #if BF16_ENABLED -template void SDDMMCsr( +template void SDDMMCsr( const std::string& op, const BcastOff& bcast, const CSRMatrix& csr, NDArray lhs, NDArray rhs, NDArray out, int lhs_target, int rhs_target); -template void SDDMMCsr( +template void SDDMMCsr( const std::string& op, const BcastOff& bcast, const CSRMatrix& csr, NDArray lhs, NDArray rhs, NDArray out, int lhs_target, int rhs_target); #endif // BF16_ENABLED @@ -75,10 +75,10 @@ template void SDDMMCoo( const std::string& op, const BcastOff& bcast, const COOMatrix& coo, NDArray lhs, NDArray rhs, NDArray out, int lhs_target, int rhs_target); #if BF16_ENABLED -template void SDDMMCoo( +template void SDDMMCoo( const std::string& op, const BcastOff& bcast, const COOMatrix& coo, NDArray lhs, NDArray rhs, NDArray out, int lhs_target, int rhs_target); -template void SDDMMCoo( +template void SDDMMCoo( const std::string& op, const BcastOff& bcast, const COOMatrix& coo, NDArray lhs, NDArray rhs, NDArray out, int lhs_target, int rhs_target); #endif // BF16_ENABLED diff --git a/src/array/cuda/sddmm.cu.prehip b/src/array/cuda/sddmm.cu.prehip new file mode 100644 index 000000000000..957d1a02a377 --- /dev/null +++ b/src/array/cuda/sddmm.cu.prehip @@ -0,0 +1,99 @@ +/** + * Copyright (c) 2020 by Contributors + * @file array/cuda/sddmm.cu + * @brief SDDMM C APIs and definitions. + */ +#include + +#include "./functor.cuh" +#include "./sddmm.cuh" + +namespace dgl { +namespace aten { + +/** + * @brief CUDA implementation of g-SDDMM on Csr format. + */ +template +void SDDMMCsr( + const std::string& op, const BcastOff& bcast, const CSRMatrix& csr, + NDArray lhs, NDArray rhs, NDArray out, int lhs_target, int rhs_target) { + SWITCH_OP(op, Op, { + SWITCH_TARGET(lhs_target, rhs_target, LhsTarget, RhsTarget, { + cuda::SDDMMCsr( + bcast, csr, lhs, rhs, out); + }); + }); +} + +/** + * @brief CUDA implementation of g-SDDMM on Coo format. 
+ */ +template +void SDDMMCoo( + const std::string& op, const BcastOff& bcast, const COOMatrix& coo, + NDArray lhs, NDArray rhs, NDArray out, int lhs_target, int rhs_target) { + SWITCH_OP(op, Op, { + SWITCH_TARGET(lhs_target, rhs_target, LhsTarget, RhsTarget, { + cuda::SDDMMCoo( + bcast, coo, lhs, rhs, out); + }); + }); +} + +template void SDDMMCsr( + const std::string& op, const BcastOff& bcast, const CSRMatrix& csr, + NDArray lhs, NDArray rhs, NDArray out, int lhs_target, int rhs_target); +template void SDDMMCsr( + const std::string& op, const BcastOff& bcast, const CSRMatrix& csr, + NDArray lhs, NDArray rhs, NDArray out, int lhs_target, int rhs_target); +#if BF16_ENABLED +template void SDDMMCsr( + const std::string& op, const BcastOff& bcast, const CSRMatrix& csr, + NDArray lhs, NDArray rhs, NDArray out, int lhs_target, int rhs_target); +template void SDDMMCsr( + const std::string& op, const BcastOff& bcast, const CSRMatrix& csr, + NDArray lhs, NDArray rhs, NDArray out, int lhs_target, int rhs_target); +#endif // BF16_ENABLED +template void SDDMMCsr( + const std::string& op, const BcastOff& bcast, const CSRMatrix& csr, + NDArray lhs, NDArray rhs, NDArray out, int lhs_target, int rhs_target); +template void SDDMMCsr( + const std::string& op, const BcastOff& bcast, const CSRMatrix& csr, + NDArray lhs, NDArray rhs, NDArray out, int lhs_target, int rhs_target); +template void SDDMMCsr( + const std::string& op, const BcastOff& bcast, const CSRMatrix& csr, + NDArray lhs, NDArray rhs, NDArray out, int lhs_target, int rhs_target); +template void SDDMMCsr( + const std::string& op, const BcastOff& bcast, const CSRMatrix& csr, + NDArray lhs, NDArray rhs, NDArray out, int lhs_target, int rhs_target); + +template void SDDMMCoo( + const std::string& op, const BcastOff& bcast, const COOMatrix& coo, + NDArray lhs, NDArray rhs, NDArray out, int lhs_target, int rhs_target); +template void SDDMMCoo( + const std::string& op, const BcastOff& bcast, const COOMatrix& coo, + NDArray lhs, NDArray rhs, NDArray out, int lhs_target, int rhs_target); +#if BF16_ENABLED +template void SDDMMCoo( + const std::string& op, const BcastOff& bcast, const COOMatrix& coo, + NDArray lhs, NDArray rhs, NDArray out, int lhs_target, int rhs_target); +template void SDDMMCoo( + const std::string& op, const BcastOff& bcast, const COOMatrix& coo, + NDArray lhs, NDArray rhs, NDArray out, int lhs_target, int rhs_target); +#endif // BF16_ENABLED +template void SDDMMCoo( + const std::string& op, const BcastOff& bcast, const COOMatrix& coo, + NDArray lhs, NDArray rhs, NDArray out, int lhs_target, int rhs_target); +template void SDDMMCoo( + const std::string& op, const BcastOff& bcast, const COOMatrix& coo, + NDArray lhs, NDArray rhs, NDArray out, int lhs_target, int rhs_target); +template void SDDMMCoo( + const std::string& op, const BcastOff& bcast, const COOMatrix& coo, + NDArray lhs, NDArray rhs, NDArray out, int lhs_target, int rhs_target); +template void SDDMMCoo( + const std::string& op, const BcastOff& bcast, const COOMatrix& coo, + NDArray lhs, NDArray rhs, NDArray out, int lhs_target, int rhs_target); + +} // namespace aten +} // namespace dgl diff --git a/src/array/cuda/sddmm.cuh b/src/array/cuda/sddmm.cuh index bc1cadfa01c3..33d4b5999892 100644 --- a/src/array/cuda/sddmm.cuh +++ b/src/array/cuda/sddmm.cuh @@ -1,3 +1,4 @@ +#include "hip/hip_runtime.h" /** * Copyright (c) 2020 by Contributors * @file array/cuda/sddmm.cuh @@ -275,7 +276,7 @@ void SDDMMCoo( const DType* lhs_data = lhs.Ptr(); const DType* rhs_data = rhs.Ptr(); DType* 
out_data = out.Ptr(); - cudaStream_t stream = runtime::getCurrentCUDAStream(); + hipStream_t stream = runtime::getCurrentCUDAStream(); int64_t *lhs_off = nullptr, *rhs_off = nullptr; int64_t len = bcast.out_len, lhs_len = bcast.lhs_len, rhs_len = bcast.rhs_len; @@ -337,7 +338,7 @@ void SDDMMCsr( const DType* lhs_data = lhs.Ptr(); const DType* rhs_data = rhs.Ptr(); DType* out_data = out.Ptr(); - cudaStream_t stream = runtime::getCurrentCUDAStream(); + hipStream_t stream = runtime::getCurrentCUDAStream(); int64_t N = csr.num_rows, M = csr.num_cols, E = csr.indices->shape[0]; int64_t *lhs_off = nullptr, *rhs_off = nullptr; diff --git a/src/array/cuda/sddmm.cuh.prehip b/src/array/cuda/sddmm.cuh.prehip new file mode 100644 index 000000000000..bc1cadfa01c3 --- /dev/null +++ b/src/array/cuda/sddmm.cuh.prehip @@ -0,0 +1,368 @@ +/** + * Copyright (c) 2020 by Contributors + * @file array/cuda/sddmm.cuh + * @brief SDDMM CUDA kernel function header. + */ +#ifndef DGL_ARRAY_CUDA_SDDMM_CUH_ +#define DGL_ARRAY_CUDA_SDDMM_CUH_ + +#include + +#include "../../runtime/cuda/cuda_common.h" +#include "../selector.h" +#include "./functor.cuh" +#include "./utils.h" +#include "atomic.cuh" +#include "bf16.cuh" +#include "fp16.cuh" +#include "functor.cuh" +#include "macro.cuh" + +namespace dgl { + +using namespace cuda; + +namespace aten { +namespace cuda { + +#define SWITCH_OP(op, Op, ...) \ + do { \ + if ((op) == "add") { \ + typedef cuda::binary::Add Op; \ + { __VA_ARGS__ } \ + } else if ((op) == "sub") { \ + typedef cuda::binary::Sub Op; \ + { __VA_ARGS__ } \ + } else if ((op) == "mul") { \ + typedef cuda::binary::Mul Op; \ + { __VA_ARGS__ } \ + } else if ((op) == "div") { \ + typedef cuda::binary::Div Op; \ + { __VA_ARGS__ } \ + } else if ((op) == "copy_lhs") { \ + typedef cuda::binary::CopyLhs Op; \ + { __VA_ARGS__ } \ + } else if ((op) == "copy_rhs") { \ + typedef cuda::binary::CopyRhs Op; \ + { __VA_ARGS__ } \ + } else if ((op) == "dot") { \ + typedef cuda::binary::Dot Op; \ + { __VA_ARGS__ } \ + } else { \ + LOG(FATAL) << "Unsupported SpMM/SDDMM binary operator: " << op; \ + } \ + } while (0) + +#define SWITCH_RHS(rhs_target, RhsTarget, ...) \ + do { \ + if ((rhs_target) == 0) { \ + constexpr int RhsTarget = 0; \ + { __VA_ARGS__ } \ + } else if ((rhs_target) == 1) { \ + constexpr int RhsTarget = 1; \ + { __VA_ARGS__ } \ + } else if ((rhs_target) == 2) { \ + constexpr int RhsTarget = 2; \ + { __VA_ARGS__ } \ + } else { \ + LOG(INFO) << "Invalid rhs target: " << (rhs_target); \ + } \ + } while (0) + +#define SWITCH_TARGET(lhs_target, rhs_target, LhsTarget, RhsTarget, ...) \ + do { \ + if ((lhs_target) == 0) { \ + constexpr int LhsTarget = 0; \ + SWITCH_RHS(rhs_target, RhsTarget, __VA_ARGS__); \ + } else if ((lhs_target) == 1) { \ + constexpr int LhsTarget = 1; \ + SWITCH_RHS(rhs_target, RhsTarget, __VA_ARGS__); \ + } else if ((lhs_target) == 2) { \ + constexpr int LhsTarget = 2; \ + SWITCH_RHS(rhs_target, RhsTarget, __VA_ARGS__); \ + } else { \ + LOG(INFO) << "Invalid lhs target: " << (lhs_target); \ + } \ + } while (0) + +constexpr unsigned int full_mask = 0xffffffff; + +/** + * @brief CUDA kernel of g-SDDMM on Coo format. + * @note it uses edge parallel strategy, different threadblocks (on y-axis) + * is responsible for the computation on different edges. Threadblocks + * on the x-axis are responsible for the computation on different + * positions in feature dimension. 
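[Editor's aside, not part of the patch: SWITCH_OP / SWITCH_TARGET above turn a runtime string or integer into a compile-time type or constant by expanding the body once per branch. A trimmed-down, self-contained version of the same idiom (names are illustrative):]

#include <stdexcept>
#include <string>

struct Add { template <typename T> static T Call(T a, T b) { return a + b; } };
struct Mul { template <typename T> static T Call(T a, T b) { return a * b; } };

// Reduced SWITCH_OP: the op string selects a functor type, and the body is
// compiled once per branch with that type bound to `Op`.
#define MY_SWITCH_OP(op, Op, ...)                                       \
  do {                                                                  \
    if ((op) == "add") { typedef Add Op; { __VA_ARGS__ } }              \
    else if ((op) == "mul") { typedef Mul Op; { __VA_ARGS__ } }         \
    else { throw std::invalid_argument("unknown op: " + (op)); }        \
  } while (0)

float Combine(const std::string& op, float a, float b) {
  float out = 0.f;
  MY_SWITCH_OP(op, Op, { out = Op::Call(a, b); });
  return out;  // Combine("add", 2, 3) == 5, Combine("mul", 2, 3) == 6
}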
+ */ +template < + typename Idx, typename DType, typename BinaryOp, bool UseBcast = false, + bool UseIdx = false, int LhsTarget = 0, int RhsTarget = 2> +__global__ void SDDMMCooKernel( + const DType* __restrict__ lhs, const DType* __restrict__ rhs, + DType* __restrict__ out, const Idx* __restrict__ row, + const Idx* __restrict__ col, const Idx* __restrict__ edge_map, int64_t N, + int64_t M, int64_t E, int64_t reduce_size, + const int64_t* __restrict__ lhs_off, const int64_t* __restrict__ rhs_off, + int64_t lhs_len, int64_t rhs_len, int64_t out_len) { + // SDDMM with COO. + Idx ty = blockIdx.y * blockDim.y + threadIdx.y; + const Idx stride_y = blockDim.y * gridDim.y; + while (ty < E) { + const Idx src = _ldg(row + ty); + const Idx dst = _ldg(col + ty); + const Idx eid = UseIdx ? _ldg(edge_map + ty) : ty; + const DType* lhsoff = + BinaryOp::use_lhs + ? (lhs + Selector::Call(src, eid, dst) * lhs_len) + : nullptr; + const DType* rhsoff = + BinaryOp::use_rhs + ? (rhs + Selector::Call(src, eid, dst) * rhs_len) + : nullptr; + DType* outoff = out + eid * out_len; + int tx = blockIdx.x * blockDim.x + threadIdx.x; + const int stride_x = blockDim.x * gridDim.x; + while (tx < out_len) { + const Idx lhs_add = UseBcast ? lhs_off[tx] : tx; + const Idx rhs_add = UseBcast ? rhs_off[tx] : tx; + DType val = BinaryOp::Call( + lhsoff + lhs_add * reduce_size, rhsoff + rhs_add * reduce_size, + reduce_size); + outoff[tx] = val; + tx += stride_x; + } + ty += stride_y; + } +} + +/** + * @brief CUDA kernel of SDDMM-dot on Coo format, accelerated with tree + * reduction. + * @note it uses edge parallel strategy, different threadblocks (on y-axis) + * is responsible for the computation on different edges. Threadblocks + * on the x-axis are responsible for the computation on different + * positions in feature dimension. + */ +template < + typename Idx, typename DType, bool UseBcast = false, bool UseIdx = false, + int LhsTarget = 0, int RhsTarget = 2> +__global__ void SDDMMCooTreeReduceKernel( + const DType* __restrict__ lhs, const DType* __restrict__ rhs, + DType* __restrict__ out, const Idx* __restrict__ row, + const Idx* __restrict__ col, const Idx* __restrict__ edge_map, int64_t N, + int64_t M, int64_t E, int64_t reduce_size, + const int64_t* __restrict__ lhs_off, const int64_t* __restrict__ rhs_off, + int64_t lhs_len, int64_t rhs_len, int64_t out_len) { + Idx ty = blockIdx.x * blockDim.y + threadIdx.y; + if (ty < E) { + const Idx src = _ldg(row + ty); + const Idx dst = _ldg(col + ty); + const Idx eid = UseIdx ? _ldg(edge_map + ty) : ty; + const DType* lhsoff = + lhs + Selector::Call(src, eid, dst) * lhs_len; + const DType* rhsoff = + rhs + Selector::Call(src, eid, dst) * rhs_len; + DType* outoff = out + eid * out_len; + int tx = threadIdx.x; // tx < 32 + for (int i = blockIdx.y; i < out_len; + i += gridDim.y) { // over output feature dimension + const Idx lhs_add = UseBcast ? __ldg(lhs_off + i) : i; + const Idx rhs_add = UseBcast ? __ldg(rhs_off + i) : i; + DType val = reduce::Sum::zero(); + for (int j = tx; j < reduce_size; j += 64) { + val += lhsoff[lhs_add * reduce_size + j] * + rhsoff[rhs_add * reduce_size + j]; + if (j + 32 < reduce_size) + val += lhsoff[lhs_add * reduce_size + j + 32] * + rhsoff[rhs_add * reduce_size + j + 32]; + } +#pragma unroll + for (int offset = 16; offset > 0; offset /= 2) + val += __shfl_down_sync(full_mask, val, offset); + if (tx == 0) outoff[i] = val; + } + } +} + +// Binary search the row_offsets to find the source node of the edge id. 
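[Editor's aside, not part of the patch: the BinarySearchSrc helper defined next maps an edge id back to its source row by searching the CSR indptr. As a host-side reference of the same lookup (illustrative names only), the owning row is the last position whose offset is <= eid:]

#include <algorithm>
#include <cassert>
#include <cstdint>
#include <vector>

// Host reference: row r owns edge eid iff indptr[r] <= eid < indptr[r + 1],
// i.e. std::upper_bound minus one.
int64_t RowOfEdge(const std::vector<int64_t>& indptr, int64_t eid) {
  auto it = std::upper_bound(indptr.begin(), indptr.end(), eid);
  return static_cast<int64_t>(it - indptr.begin()) - 1;
}

void RowOfEdgeExample() {
  // CSR with rows of 2, 0 and 3 edges.
  std::vector<int64_t> indptr = {0, 2, 2, 5};
  assert(RowOfEdge(indptr, 0) == 0);
  assert(RowOfEdge(indptr, 1) == 0);
  assert(RowOfEdge(indptr, 2) == 2);  // row 1 is empty, so edge 2 is in row 2
  assert(RowOfEdge(indptr, 4) == 2);
}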
+template +__device__ __forceinline__ Idx +BinarySearchSrc(const Idx* array, Idx length, Idx eid) { + Idx lo = 0, hi = length - 1; + while (lo < hi) { + Idx mid = (lo + hi) >> 1; + if (_ldg(array + mid) <= eid) { + lo = mid + 1; + } else { + hi = mid; + } + } + // INVARIANT: lo == hi + if (_ldg(array + hi) == eid) { + return hi; + } else { + return hi - 1; + } +} + +/** + * @brief CUDA kernel of g-SDDMM on Csr format. + * @note it uses edge parallel strategy, different threadblocks (on y-axis) + * is responsible for the computation on different edges. Threadblocks + * on the x-axis are responsible for the computation on different + * positions in feature dimension. To efficiently find the source node idx and + * destination node index of an given edge on Csr format, it uses binary search + * (time complexity O(log N)). + */ +template < + typename Idx, typename DType, typename BinaryOp, bool UseBcast = false, + bool UseIdx = false, int LhsTarget = 0, int RhsTarget = 2> +__global__ void SDDMMCsrKernel( + const DType* __restrict__ lhs, const DType* __restrict__ rhs, + DType* __restrict__ out, const Idx* __restrict__ indptr, + const Idx* __restrict__ indices, const Idx* __restrict__ edge_map, + int64_t N, int64_t M, int64_t E, int64_t reduce_size, + const int64_t* __restrict__ lhs_off, const int64_t* __restrict__ rhs_off, + int64_t lhs_len, int64_t rhs_len, int64_t out_len) { + // SDDMM with Csr. + Idx ty = blockIdx.y * blockDim.y + threadIdx.y; + const Idx stride_y = blockDim.y * gridDim.y; + while (ty < E) { + const Idx src = BinarySearchSrc(indptr, N + 1, ty); + const Idx dst = _ldg(indices + ty); + const Idx eid = UseIdx ? _ldg(edge_map + ty) : ty; + int64_t tx = blockIdx.x * blockDim.x + threadIdx.x; + const int64_t stride_x = blockDim.x * gridDim.x; + const DType* lhsoff = + BinaryOp::use_lhs + ? (lhs + Selector::Call(src, eid, dst) * lhs_len) + : nullptr; + const DType* rhsoff = + BinaryOp::use_rhs + ? (rhs + Selector::Call(src, eid, dst) * rhs_len) + : nullptr; + DType* outoff = out + eid * out_len; + while (tx < out_len) { + const Idx lhs_add = UseBcast ? lhs_off[tx] : tx; + const Idx rhs_add = UseBcast ? rhs_off[tx] : tx; + DType val = BinaryOp::Call( + lhsoff + lhs_add * reduce_size, rhsoff + rhs_add * reduce_size, + reduce_size); + outoff[tx] = val; + tx += stride_x; + } + ty += stride_y; + } +} + +/** + * @brief CUDA implementation of g-SDDMM on Coo format. + * @param bcast Broadcast information. + * @param coo The Coo matrix. + * @param lhs The left hand side operand feature. + * @param rhs The right hand size operand feature. + * @param out The result feature on edges. 
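[Editor's aside, not part of the patch: as a plain host-side reference of what the SDDMM kernels above compute for the common "dot" operator on a COO graph (lhs taken from the source node, rhs from the destination node), with names and layout chosen for illustration:]

#include <cstdint>
#include <vector>

// out[e] = dot(lhs[row[e], :], rhs[col[e], :]); features are row-major
// with k entries per node, matching the reduce_size loop in the kernels.
std::vector<float> SddmmDotReference(
    const std::vector<int64_t>& row, const std::vector<int64_t>& col,
    const std::vector<float>& lhs, const std::vector<float>& rhs, int64_t k) {
  const int64_t num_edges = static_cast<int64_t>(row.size());
  std::vector<float> out(num_edges, 0.f);
  for (int64_t e = 0; e < num_edges; ++e) {
    const float* u = &lhs[row[e] * k];  // source-node feature
    const float* v = &rhs[col[e] * k];  // destination-node feature
    for (int64_t j = 0; j < k; ++j) out[e] += u[j] * v[j];
  }
  return out;
}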
+ */ +template < + typename Idx, typename DType, typename Op, int LhsTarget = 0, + int RhsTarget = 2> +void SDDMMCoo( + const BcastOff& bcast, const COOMatrix& coo, NDArray lhs, NDArray rhs, + NDArray out) { + const Idx* row = coo.row.Ptr(); + const Idx* col = coo.col.Ptr(); + const Idx* edge_map = coo.data.Ptr(); + const DType* lhs_data = lhs.Ptr(); + const DType* rhs_data = rhs.Ptr(); + DType* out_data = out.Ptr(); + cudaStream_t stream = runtime::getCurrentCUDAStream(); + + int64_t *lhs_off = nullptr, *rhs_off = nullptr; + int64_t len = bcast.out_len, lhs_len = bcast.lhs_len, rhs_len = bcast.rhs_len; + int64_t reduce_dim = bcast.reduce_size; + + const int64_t nnz = coo.row->shape[0]; + const bool use_idx = !IsNullArray(coo.data); + + if (std::is_same >::value && reduce_dim >= 32) { + const int ntx = 32; // on feature dimension + const int nty = 8; // on out dimension + const int nbx = (nnz + nty - 1) / nty; + const int nby = FindNumBlocks<'y'>(len); + const dim3 nblks(nbx, nby); + const dim3 nthrs(ntx, nty); + BCAST_IDX_CTX_SWITCH(bcast, use_idx, out->ctx, lhs_off, rhs_off, { + CUDA_KERNEL_CALL( + (SDDMMCooTreeReduceKernel< + Idx, DType, UseBcast, UseIdx, LhsTarget, RhsTarget>), + nblks, nthrs, 0, stream, lhs_data, rhs_data, out_data, row, col, + edge_map, coo.num_rows, coo.num_cols, nnz, reduce_dim, lhs_off, + rhs_off, lhs_len, rhs_len, len); + }); + } else { + const int ntx = FindNumThreads(len); + const int nty = CUDA_MAX_NUM_THREADS / ntx; + const int nbx = (len + ntx - 1) / ntx; + const int nby = FindNumBlocks<'y'>((nnz + nty - 1) / nty); + const dim3 nblks(nbx, nby); + const dim3 nthrs(ntx, nty); + BCAST_IDX_CTX_SWITCH(bcast, use_idx, out->ctx, lhs_off, rhs_off, { + CUDA_KERNEL_CALL( + (SDDMMCooKernel< + Idx, DType, Op, UseBcast, UseIdx, LhsTarget, RhsTarget>), + nblks, nthrs, 0, stream, lhs_data, rhs_data, out_data, row, col, + edge_map, coo.num_rows, coo.num_cols, nnz, reduce_dim, lhs_off, + rhs_off, lhs_len, rhs_len, len); + }); + } +} + +/** + * @brief CUDA implementation of g-SDDMM on Csr format. + * @param bcast Broadcast information. + * @param csr The Csr matrix. + * @param lhs The left hand side operand feature. + * @param rhs The right hand size operand feature. + * @param out The result feature on edges. 
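[Editor's aside, not part of the patch: the tree-reduce path selected above (dot operator with reduce_dim >= 32) relies on warp shuffles to fold 32 partial sums into lane 0. A minimal standalone kernel showing the same reduction, launched with a single warp, e.g. <<<1, 32>>>; names are illustrative:]

#include <cuda_runtime.h>

__global__ void WarpDotKernel(const float* a, const float* b, int k,
                              float* out) {
  const unsigned full_mask = 0xffffffffu;
  // Each of the 32 lanes accumulates a strided partial dot product.
  float val = 0.f;
  for (int j = threadIdx.x; j < k; j += 32) val += a[j] * b[j];
  // Tree reduction across the warp: after log2(32) steps lane 0 has the sum.
#pragma unroll
  for (int offset = 16; offset > 0; offset /= 2)
    val += __shfl_down_sync(full_mask, val, offset);
  if (threadIdx.x == 0) *out = val;
}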
+ */ +template < + typename Idx, typename DType, typename Op, int LhsTarget = 0, + int RhsTarget = 2> +void SDDMMCsr( + const BcastOff& bcast, const CSRMatrix& csr, NDArray lhs, NDArray rhs, + NDArray out) { + const Idx* indptr = csr.indptr.Ptr(); + const Idx* indices = csr.indices.Ptr(); + const Idx* edge_map = csr.data.Ptr(); + const DType* lhs_data = lhs.Ptr(); + const DType* rhs_data = rhs.Ptr(); + DType* out_data = out.Ptr(); + cudaStream_t stream = runtime::getCurrentCUDAStream(); + int64_t N = csr.num_rows, M = csr.num_cols, E = csr.indices->shape[0]; + + int64_t *lhs_off = nullptr, *rhs_off = nullptr; + int64_t len = bcast.out_len, lhs_len = bcast.lhs_len, rhs_len = bcast.rhs_len; + int64_t reduce_dim = bcast.reduce_size; + + const int ntx = FindNumThreads(len); + const int nty = CUDA_MAX_NUM_THREADS / ntx; + const int nbx = (len + ntx - 1) / ntx; + const int nby = FindNumBlocks<'y'>((E + nty - 1) / nty); + const dim3 nblks(nbx, nby); + const dim3 nthrs(ntx, nty); + const bool use_idx = !IsNullArray(csr.data); + + BCAST_IDX_CTX_SWITCH(bcast, use_idx, out->ctx, lhs_off, rhs_off, { + CUDA_KERNEL_CALL( + (SDDMMCsrKernel< + Idx, DType, Op, UseBcast, UseIdx, LhsTarget, RhsTarget>), + nblks, nthrs, 0, stream, lhs_data, rhs_data, out_data, indptr, indices, + edge_map, N, M, E, reduce_dim, lhs_off, rhs_off, lhs_len, rhs_len, len); + }); +} + +} // namespace cuda +} // namespace aten +} // namespace dgl + +#endif // DGL_ARRAY_CUDA_SDDMM_CUH_ diff --git a/src/array/cuda/sddmm_hetero_coo.cu b/src/array/cuda/sddmm_hetero_coo.cu index 180e189e7152..278c9d60cafe 100644 --- a/src/array/cuda/sddmm_hetero_coo.cu +++ b/src/array/cuda/sddmm_hetero_coo.cu @@ -49,13 +49,13 @@ template void SDDMMCooHetero( int rhs_target, const std::vector& in_eid, const std::vector& out_eid); #if BF16_ENABLED -template void SDDMMCooHetero( +template void SDDMMCooHetero( const std::string& op, const BcastOff& bcast, const std::vector& vec_coo, const std::vector& lhs, const std::vector& rhs, std::vector out, int lhs_target, int rhs_target, const std::vector& in_eid, const std::vector& out_eid); -template void SDDMMCooHetero( +template void SDDMMCooHetero( const std::string& op, const BcastOff& bcast, const std::vector& vec_coo, const std::vector& lhs, const std::vector& rhs, std::vector out, int lhs_target, diff --git a/src/array/cuda/sddmm_hetero_coo.cu.prehip b/src/array/cuda/sddmm_hetero_coo.cu.prehip new file mode 100644 index 000000000000..180e189e7152 --- /dev/null +++ b/src/array/cuda/sddmm_hetero_coo.cu.prehip @@ -0,0 +1,91 @@ +/** + * Copyright (c) 2020 by Contributors + * @file array/cuda/sddmm.cu + * @brief SDDMM C APIs and definitions. + */ +#include + +#include "./sddmm.cuh" + +namespace dgl { +namespace aten { + +/** + * @brief CUDA implementation of g-SDDMM on heterograph using + Csr format. 
+ */ +template +void SDDMMCooHetero( + const std::string& op, const BcastOff& bcast, + const std::vector& vec_coo, const std::vector& vec_lhs, + const std::vector& vec_rhs, std::vector vec_out, + int lhs_target, int rhs_target, const std::vector& lhs_eid, + const std::vector& rhs_eid) { + SWITCH_OP(op, Op, { + SWITCH_TARGET(lhs_target, rhs_target, LhsTarget, RhsTarget, { + /* Call SDDMM CUDA kernel for each relation type sequentially */ + for (dgl_type_t etype = 0; etype < lhs_eid.size(); ++etype) { + COOMatrix coo = vec_coo[etype]; + NDArray lhs = vec_lhs[lhs_eid[etype]]; + NDArray rhs = vec_rhs[rhs_eid[etype]]; + NDArray out = vec_out[etype]; + cuda::SDDMMCoo( + bcast, coo, lhs, rhs, out); + } + }); + }); +} + +template void SDDMMCooHetero( + const std::string& op, const BcastOff& bcast, + const std::vector& vec_coo, const std::vector& lhs, + const std::vector& rhs, std::vector out, int lhs_target, + int rhs_target, const std::vector& in_eid, + const std::vector& out_eid); +template void SDDMMCooHetero( + const std::string& op, const BcastOff& bcast, + const std::vector& vec_coo, const std::vector& lhs, + const std::vector& rhs, std::vector out, int lhs_target, + int rhs_target, const std::vector& in_eid, + const std::vector& out_eid); +#if BF16_ENABLED +template void SDDMMCooHetero( + const std::string& op, const BcastOff& bcast, + const std::vector& vec_coo, const std::vector& lhs, + const std::vector& rhs, std::vector out, int lhs_target, + int rhs_target, const std::vector& in_eid, + const std::vector& out_eid); +template void SDDMMCooHetero( + const std::string& op, const BcastOff& bcast, + const std::vector& vec_coo, const std::vector& lhs, + const std::vector& rhs, std::vector out, int lhs_target, + int rhs_target, const std::vector& in_eid, + const std::vector& out_eid); +#endif // BF16_ENABLED +template void SDDMMCooHetero( + const std::string& op, const BcastOff& bcast, + const std::vector& vec_coo, const std::vector& lhs, + const std::vector& rhs, std::vector out, int lhs_target, + int rhs_target, const std::vector& in_eid, + const std::vector& out_eid); +template void SDDMMCooHetero( + const std::string& op, const BcastOff& bcast, + const std::vector& vec_coo, const std::vector& lhs, + const std::vector& rhs, std::vector out, int lhs_target, + int rhs_target, const std::vector& in_eid, + const std::vector& out_eid); +template void SDDMMCooHetero( + const std::string& op, const BcastOff& bcast, + const std::vector& vec_coo, const std::vector& lhs, + const std::vector& rhs, std::vector out, int lhs_target, + int rhs_target, const std::vector& in_eid, + const std::vector& out_eid); +template void SDDMMCooHetero( + const std::string& op, const BcastOff& bcast, + const std::vector& vec_coo, const std::vector& lhs, + const std::vector& rhs, std::vector out, int lhs_target, + int rhs_target, const std::vector& in_eid, + const std::vector& out_eid); + +} // namespace aten +} // namespace dgl diff --git a/src/array/cuda/sddmm_hetero_csr.cu b/src/array/cuda/sddmm_hetero_csr.cu index 7a0331e700b9..cfeaf3cfc348 100644 --- a/src/array/cuda/sddmm_hetero_csr.cu +++ b/src/array/cuda/sddmm_hetero_csr.cu @@ -48,13 +48,13 @@ template void SDDMMCsrHetero( int rhs_target, const std::vector& in_eid, const std::vector& out_eid); #if BF16_ENABLED -template void SDDMMCsrHetero( +template void SDDMMCsrHetero( const std::string& op, const BcastOff& bcast, const std::vector& vec_csr, const std::vector& lhs, const std::vector& rhs, std::vector out, int lhs_target, int rhs_target, const std::vector& 
in_eid, const std::vector& out_eid); -template void SDDMMCsrHetero( +template void SDDMMCsrHetero( const std::string& op, const BcastOff& bcast, const std::vector& vec_csr, const std::vector& lhs, const std::vector& rhs, std::vector out, int lhs_target, diff --git a/src/array/cuda/sddmm_hetero_csr.cu.prehip b/src/array/cuda/sddmm_hetero_csr.cu.prehip new file mode 100644 index 000000000000..7a0331e700b9 --- /dev/null +++ b/src/array/cuda/sddmm_hetero_csr.cu.prehip @@ -0,0 +1,90 @@ +/** + * Copyright (c) 2020 by Contributors + * @file array/cuda/sddmm.cu + * @brief SDDMM C APIs and definitions. + */ +#include + +#include "./sddmm.cuh" + +namespace dgl { +namespace aten { + +/** + * @brief CUDA implementation of g-SDDMM on heterograph using Csr format. + */ +template +void SDDMMCsrHetero( + const std::string& op, const BcastOff& bcast, + const std::vector& vec_csr, const std::vector& vec_lhs, + const std::vector& vec_rhs, std::vector vec_out, + int lhs_target, int rhs_target, const std::vector& lhs_eid, + const std::vector& rhs_eid) { + SWITCH_OP(op, Op, { + SWITCH_TARGET(lhs_target, rhs_target, LhsTarget, RhsTarget, { + /* Call SDDMM CUDA kernel for each relation type sequentially */ + for (dgl_type_t etype = 0; etype < lhs_eid.size(); ++etype) { + CSRMatrix csr = vec_csr[etype]; + NDArray lhs = vec_lhs[lhs_eid[etype]]; + NDArray rhs = vec_rhs[rhs_eid[etype]]; + NDArray out = vec_out[etype]; + cuda::SDDMMCsr( + bcast, csr, lhs, rhs, out); + } + }); + }); +} + +template void SDDMMCsrHetero( + const std::string& op, const BcastOff& bcast, + const std::vector& vec_csr, const std::vector& lhs, + const std::vector& rhs, std::vector out, int lhs_target, + int rhs_target, const std::vector& in_eid, + const std::vector& out_eid); +template void SDDMMCsrHetero( + const std::string& op, const BcastOff& bcast, + const std::vector& vec_csr, const std::vector& lhs, + const std::vector& rhs, std::vector out, int lhs_target, + int rhs_target, const std::vector& in_eid, + const std::vector& out_eid); +#if BF16_ENABLED +template void SDDMMCsrHetero( + const std::string& op, const BcastOff& bcast, + const std::vector& vec_csr, const std::vector& lhs, + const std::vector& rhs, std::vector out, int lhs_target, + int rhs_target, const std::vector& in_eid, + const std::vector& out_eid); +template void SDDMMCsrHetero( + const std::string& op, const BcastOff& bcast, + const std::vector& vec_csr, const std::vector& lhs, + const std::vector& rhs, std::vector out, int lhs_target, + int rhs_target, const std::vector& in_eid, + const std::vector& out_eid); +#endif // BF16_ENABLED +template void SDDMMCsrHetero( + const std::string& op, const BcastOff& bcast, + const std::vector& vec_csr, const std::vector& lhs, + const std::vector& rhs, std::vector out, int lhs_target, + int rhs_target, const std::vector& in_eid, + const std::vector& out_eid); +template void SDDMMCsrHetero( + const std::string& op, const BcastOff& bcast, + const std::vector& vec_csr, const std::vector& lhs, + const std::vector& rhs, std::vector out, int lhs_target, + int rhs_target, const std::vector& in_eid, + const std::vector& out_eid); +template void SDDMMCsrHetero( + const std::string& op, const BcastOff& bcast, + const std::vector& vec_csr, const std::vector& lhs, + const std::vector& rhs, std::vector out, int lhs_target, + int rhs_target, const std::vector& in_eid, + const std::vector& out_eid); +template void SDDMMCsrHetero( + const std::string& op, const BcastOff& bcast, + const std::vector& vec_csr, const std::vector& lhs, + const std::vector& 
rhs, std::vector out, int lhs_target, + int rhs_target, const std::vector& in_eid, + const std::vector& out_eid); + +} // namespace aten +} // namespace dgl diff --git a/src/array/cuda/segment_reduce.cu b/src/array/cuda/segment_reduce.cu index d83c1e68eeb1..7417ca178e24 100644 --- a/src/array/cuda/segment_reduce.cu +++ b/src/array/cuda/segment_reduce.cu @@ -60,10 +60,10 @@ template void SegmentReduce( const std::string& op, NDArray feat, NDArray offsets, NDArray out, NDArray arg); #if BF16_ENABLED -template void SegmentReduce( +template void SegmentReduce( const std::string& op, NDArray feat, NDArray offsets, NDArray out, NDArray arg); -template void SegmentReduce( +template void SegmentReduce( const std::string& op, NDArray feat, NDArray offsets, NDArray out, NDArray arg); #endif // BF16_ENABLED @@ -85,9 +85,9 @@ template void ScatterAdd( template void ScatterAdd( NDArray feat, NDArray idx, NDArray out); #if BF16_ENABLED -template void ScatterAdd( +template void ScatterAdd( NDArray feat, NDArray idx, NDArray out); -template void ScatterAdd( +template void ScatterAdd( NDArray feat, NDArray idx, NDArray out); #endif // BF16_ENABLED template void ScatterAdd( @@ -108,11 +108,11 @@ template void UpdateGradMinMax_hetero( const std::vector& feat, const std::vector& idx, const std::vector& idx_etype, std::vector* out); #if BF16_ENABLED -template void UpdateGradMinMax_hetero( +template void UpdateGradMinMax_hetero( const HeteroGraphPtr& g, const std::string& op, const std::vector& feat, const std::vector& idx, const std::vector& idx_etype, std::vector* out); -template void UpdateGradMinMax_hetero( +template void UpdateGradMinMax_hetero( const HeteroGraphPtr& g, const std::string& op, const std::vector& feat, const std::vector& idx, const std::vector& idx_etype, std::vector* out); @@ -139,9 +139,9 @@ template void BackwardSegmentCmp( template void BackwardSegmentCmp( NDArray feat, NDArray arg, NDArray out); #if BF16_ENABLED -template void BackwardSegmentCmp( +template void BackwardSegmentCmp( NDArray feat, NDArray arg, NDArray out); -template void BackwardSegmentCmp( +template void BackwardSegmentCmp( NDArray feat, NDArray arg, NDArray out); #endif // BF16_ENABLED template void BackwardSegmentCmp( diff --git a/src/array/cuda/segment_reduce.cu.prehip b/src/array/cuda/segment_reduce.cu.prehip new file mode 100644 index 000000000000..d83c1e68eeb1 --- /dev/null +++ b/src/array/cuda/segment_reduce.cu.prehip @@ -0,0 +1,157 @@ +/** + * Copyright (c) 2020 by Contributors + * @file array/cuda/segment_reduce.cu + * @brief Segment reduce C APIs and definitions. 
+ */ +#include +#include + +#include "./functor.cuh" +#include "./segment_reduce.cuh" +#include "./utils.h" + +namespace dgl { + +using namespace cuda; + +namespace aten { + +template +void SegmentReduce( + const std::string& op, NDArray feat, NDArray offsets, NDArray out, + NDArray arg) { + if (op == "sum") { + cuda::SegmentReduce>( + feat, offsets, out, arg); + } else if (op == "max") { + cuda::SegmentReduce>( + feat, offsets, out, arg); + } else if (op == "min") { + cuda::SegmentReduce>( + feat, offsets, out, arg); + } else { + LOG(FATAL) << "Not implemented"; + } +} + +template +void ScatterAdd(NDArray feat, NDArray idx, NDArray out) { + cuda::ScatterAdd(feat, idx, out); +} + +template +void UpdateGradMinMax_hetero( + const HeteroGraphPtr& g, const std::string& op, + const std::vector& feat, const std::vector& idx, + const std::vector& idx_etype, std::vector* out) { + cuda::UpdateGradMinMax_hetero( + g, op, feat, idx, idx_etype, out); +} + +template +void BackwardSegmentCmp(NDArray feat, NDArray arg, NDArray out) { + cuda::BackwardSegmentCmp(feat, arg, out); +} + +template void SegmentReduce( + const std::string& op, NDArray feat, NDArray offsets, NDArray out, + NDArray arg); +template void SegmentReduce( + const std::string& op, NDArray feat, NDArray offsets, NDArray out, + NDArray arg); +#if BF16_ENABLED +template void SegmentReduce( + const std::string& op, NDArray feat, NDArray offsets, NDArray out, + NDArray arg); +template void SegmentReduce( + const std::string& op, NDArray feat, NDArray offsets, NDArray out, + NDArray arg); +#endif // BF16_ENABLED +template void SegmentReduce( + const std::string& op, NDArray feat, NDArray offsets, NDArray out, + NDArray arg); +template void SegmentReduce( + const std::string& op, NDArray feat, NDArray offsets, NDArray out, + NDArray arg); +template void SegmentReduce( + const std::string& op, NDArray feat, NDArray offsets, NDArray out, + NDArray arg); +template void SegmentReduce( + const std::string& op, NDArray feat, NDArray offsets, NDArray out, + NDArray arg); + +template void ScatterAdd( + NDArray feat, NDArray idx, NDArray out); +template void ScatterAdd( + NDArray feat, NDArray idx, NDArray out); +#if BF16_ENABLED +template void ScatterAdd( + NDArray feat, NDArray idx, NDArray out); +template void ScatterAdd( + NDArray feat, NDArray idx, NDArray out); +#endif // BF16_ENABLED +template void ScatterAdd( + NDArray feat, NDArray idx, NDArray out); +template void ScatterAdd( + NDArray feat, NDArray idx, NDArray out); +template void ScatterAdd( + NDArray feat, NDArray idx, NDArray out); +template void ScatterAdd( + NDArray feat, NDArray idx, NDArray out); + +template void UpdateGradMinMax_hetero( + const HeteroGraphPtr& g, const std::string& op, + const std::vector& feat, const std::vector& idx, + const std::vector& idx_etype, std::vector* out); +template void UpdateGradMinMax_hetero( + const HeteroGraphPtr& g, const std::string& op, + const std::vector& feat, const std::vector& idx, + const std::vector& idx_etype, std::vector* out); +#if BF16_ENABLED +template void UpdateGradMinMax_hetero( + const HeteroGraphPtr& g, const std::string& op, + const std::vector& feat, const std::vector& idx, + const std::vector& idx_etype, std::vector* out); +template void UpdateGradMinMax_hetero( + const HeteroGraphPtr& g, const std::string& op, + const std::vector& feat, const std::vector& idx, + const std::vector& idx_etype, std::vector* out); +#endif // BF16_ENABLED +template void UpdateGradMinMax_hetero( + const HeteroGraphPtr& g, const std::string& op, + 
const std::vector& feat, const std::vector& idx, + const std::vector& idx_etype, std::vector* out); +template void UpdateGradMinMax_hetero( + const HeteroGraphPtr& g, const std::string& op, + const std::vector& feat, const std::vector& idx, + const std::vector& idx_etype, std::vector* out); +template void UpdateGradMinMax_hetero( + const HeteroGraphPtr& g, const std::string& op, + const std::vector& feat, const std::vector& idx, + const std::vector& idx_etype, std::vector* out); +template void UpdateGradMinMax_hetero( + const HeteroGraphPtr& g, const std::string& op, + const std::vector& feat, const std::vector& idx, + const std::vector& idx_etype, std::vector* out); + +template void BackwardSegmentCmp( + NDArray feat, NDArray arg, NDArray out); +template void BackwardSegmentCmp( + NDArray feat, NDArray arg, NDArray out); +#if BF16_ENABLED +template void BackwardSegmentCmp( + NDArray feat, NDArray arg, NDArray out); +template void BackwardSegmentCmp( + NDArray feat, NDArray arg, NDArray out); +#endif // BF16_ENABLED +template void BackwardSegmentCmp( + NDArray feat, NDArray arg, NDArray out); +template void BackwardSegmentCmp( + NDArray feat, NDArray arg, NDArray out); +template void BackwardSegmentCmp( + NDArray feat, NDArray arg, NDArray out); +template void BackwardSegmentCmp( + NDArray feat, NDArray arg, NDArray out); + +} // namespace aten +} // namespace dgl diff --git a/src/array/cuda/segment_reduce.cuh b/src/array/cuda/segment_reduce.cuh index b1be03fadd71..90f0616993cc 100644 --- a/src/array/cuda/segment_reduce.cuh +++ b/src/array/cuda/segment_reduce.cuh @@ -1,3 +1,4 @@ +#include "hip/hip_runtime.h" /** * Copyright (c) 2020 by Contributors * @file array/cuda/segment_reduce.cuh @@ -125,7 +126,7 @@ void SegmentReduce(NDArray feat, NDArray offsets, NDArray out, NDArray arg) { DType* out_data = out.Ptr(); IdType* arg_data = arg.Ptr(); - cudaStream_t stream = runtime::getCurrentCUDAStream(); + hipStream_t stream = runtime::getCurrentCUDAStream(); int64_t n = out->shape[0]; int64_t dim = 1; for (int i = 1; i < out->ndim; ++i) dim *= out->shape[i]; @@ -155,7 +156,7 @@ void ScatterAdd(NDArray feat, NDArray idx, NDArray out) { const IdType* idx_data = idx.Ptr(); DType* out_data = out.Ptr(); - cudaStream_t stream = runtime::getCurrentCUDAStream(); + hipStream_t stream = runtime::getCurrentCUDAStream(); int64_t n = feat->shape[0]; int64_t dim = 1; for (int i = 1; i < out->ndim; ++i) dim *= out->shape[i]; @@ -186,7 +187,7 @@ void UpdateGradMinMax_hetero( const std::vector& list_feat, const std::vector& list_idx, const std::vector& list_idx_types, std::vector* list_out) { - cudaStream_t stream = runtime::getCurrentCUDAStream(); + hipStream_t stream = runtime::getCurrentCUDAStream(); if (op == "copy_lhs" || op == "copy_rhs") { std::vector> src_dst_ntypes( graph->NumVertexTypes(), std::vector()); @@ -239,7 +240,7 @@ void BackwardSegmentCmp(NDArray feat, NDArray arg, NDArray out) { const IdType* arg_data = arg.Ptr(); DType* out_data = out.Ptr(); - cudaStream_t stream = runtime::getCurrentCUDAStream(); + hipStream_t stream = runtime::getCurrentCUDAStream(); int64_t n = feat->shape[0]; int64_t dim = 1; for (int i = 1; i < out->ndim; ++i) dim *= out->shape[i]; diff --git a/src/array/cuda/segment_reduce.cuh.prehip b/src/array/cuda/segment_reduce.cuh.prehip new file mode 100644 index 000000000000..b1be03fadd71 --- /dev/null +++ b/src/array/cuda/segment_reduce.cuh.prehip @@ -0,0 +1,262 @@ +/** + * Copyright (c) 2020 by Contributors + * @file array/cuda/segment_reduce.cuh + * @brief Segment reduce 
kernel function header. + */ +#ifndef DGL_ARRAY_CUDA_SEGMENT_REDUCE_CUH_ +#define DGL_ARRAY_CUDA_SEGMENT_REDUCE_CUH_ + +#include +#include + +#include "../../runtime/cuda/cuda_common.h" +#include "./atomic.cuh" +#include "./utils.h" + +namespace dgl { + +using namespace cuda; +using namespace runtime; + +namespace aten { +namespace cuda { + +/** + * @brief CUDA kernel of segment reduce. + * @note each blockthread is responsible for aggregation on a row + * in the result tensor. + */ +template +__global__ void SegmentReduceKernel( + const DType* feat, const IdType* offsets, DType* out, IdType* arg, + int64_t n, int64_t dim) { + for (int row = blockIdx.x; row < n; row += gridDim.x) { + int col = blockIdx.y * blockDim.x + threadIdx.x; + while (col < dim) { + typename accum_dtype::type local_accum = ReduceOp::zero(); + IdType local_arg = -1; + for (IdType i = offsets[row]; i < offsets[row + 1]; ++i) { + ReduceOp::Call(&local_accum, &local_arg, feat[i * dim + col], i); + } + out[row * dim + col] = static_cast(local_accum); + if (ReduceOp::require_arg) arg[row * dim + col] = local_arg; + col += gridDim.y * blockDim.x; + } + } +} + +/** + * @brief CUDA kernel of scatter add. + * @note each blockthread is responsible for adding a row in feature tensor + * to a target row in output tensor. + */ +template +__global__ void ScatterAddKernel( + const DType* feat, const IdType* idx, DType* out, int64_t n, int64_t dim) { + for (int row = blockIdx.x; row < n; row += gridDim.x) { + const int write_row = idx[row]; + int col = blockIdx.y * blockDim.x + threadIdx.x; + while (col < dim) { + cuda::AtomicAdd(out + write_row * dim + col, feat[row * dim + col]); + col += gridDim.y * blockDim.x; + } + } +} + +/** + * @brief CUDA kernel to update gradients for reduce op max/min + * @note each WARP (group of 32 threads) is responsible for adding a row in + * feature tensor to a target row in output tensor. + */ + +template +__global__ void UpdateGradMinMaxHeteroKernel( + const DType* feat, const IdType* idx, const IdType* idx_type, DType* out, + int64_t n, int64_t dim, int type) { + unsigned int tId = threadIdx.x; + unsigned int laneId = tId & 31; + unsigned int gId = blockIdx.x * blockDim.x + threadIdx.x; + unsigned int warpId = gId >> 5; + unsigned int warp_size = 32; + unsigned int row = warpId; + + while (row < n) { + for (unsigned int col = laneId; col < dim; col += warp_size) { + if (type == idx_type[row * dim + col]) { + const int write_row = idx[row * dim + col]; + cuda::AtomicAdd(out + write_row * dim + col, feat[row * dim + col]); + } + } + row += blockDim.x * gridDim.x; + } +} + +/** + * @brief CUDA kernel of backward phase in segment min/max. + * @note each blockthread is responsible for writing a row in the + * result gradient tensor by lookup the ArgMin/Max for index information. + */ +template +__global__ void BackwardSegmentCmpKernel( + const DType* feat, const IdType* arg, DType* out, int64_t n, int64_t dim) { + for (int row = blockIdx.x; row < n; row += gridDim.x) { + int col = blockIdx.y * blockDim.x + threadIdx.x; + while (col < dim) { + int write_row = arg[row * dim + col]; + if (write_row >= 0) { + out[write_row * dim + col] = feat[row * dim + col]; + } + col += gridDim.y * blockDim.x; + } + } +} + +/** + * @brief CUDA implementation of forward phase of Segment Reduce. + * @param feat The input tensor. + * @param offsets The offsets tensor. + * @param out The output tensor. 
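[Editor's aside, not part of the patch: a host-side reference of the segment-reduce forward pass with a "max" reducer, matching the semantics described in the kernel comments above (out[r, c] is reduced over feat rows in [offsets[r], offsets[r+1]), and arg records the winning row for the backward pass). Names and the float/int64 choice are illustrative:]

#include <cstdint>
#include <limits>
#include <vector>

void SegmentMaxReference(const std::vector<float>& feat,       // [n, dim]
                         const std::vector<int64_t>& offsets,  // [num_segments + 1]
                         int64_t dim,
                         std::vector<float>* out,              // [num_segments, dim]
                         std::vector<int64_t>* arg) {
  const int64_t num_segments = static_cast<int64_t>(offsets.size()) - 1;
  out->assign(num_segments * dim, -std::numeric_limits<float>::infinity());
  arg->assign(num_segments * dim, -1);  // -1 marks empty segments
  for (int64_t r = 0; r < num_segments; ++r)
    for (int64_t i = offsets[r]; i < offsets[r + 1]; ++i)
      for (int64_t c = 0; c < dim; ++c)
        if (feat[i * dim + c] > (*out)[r * dim + c]) {
          (*out)[r * dim + c] = feat[i * dim + c];
          (*arg)[r * dim + c] = i;
        }
}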
+ * @param arg An auxiliary tensor storing ArgMax/Min information, + */ +template +void SegmentReduce(NDArray feat, NDArray offsets, NDArray out, NDArray arg) { + const DType* feat_data = feat.Ptr(); + const IdType* offsets_data = offsets.Ptr(); + DType* out_data = out.Ptr(); + IdType* arg_data = arg.Ptr(); + + cudaStream_t stream = runtime::getCurrentCUDAStream(); + int64_t n = out->shape[0]; + int64_t dim = 1; + for (int i = 1; i < out->ndim; ++i) dim *= out->shape[i]; + + const int nbx = FindNumBlocks<'x'>(n); + const int ntx = FindNumThreads(dim); + const int nby = FindNumBlocks<'y'>((dim + ntx - 1) / ntx); + const int nty = 1; + const dim3 nblks(nbx, nby); + const dim3 nthrs(ntx, nty); + // TODO(zihao): try cub's DeviceSegmentedReduce and compare the performance. + CUDA_KERNEL_CALL( + (SegmentReduceKernel), nblks, nthrs, 0, stream, + feat_data, offsets_data, out_data, arg_data, n, dim); +} + +/** + * @brief CUDA implementation of Scatter Add (on first dimension). + * @note math equation: out[idx[i], *] += feat[i, *] + * @param feat The input tensor. + * @param idx The indices tensor. + * @param out The output tensor. + */ +template +void ScatterAdd(NDArray feat, NDArray idx, NDArray out) { + const DType* feat_data = feat.Ptr(); + const IdType* idx_data = idx.Ptr(); + DType* out_data = out.Ptr(); + + cudaStream_t stream = runtime::getCurrentCUDAStream(); + int64_t n = feat->shape[0]; + int64_t dim = 1; + for (int i = 1; i < out->ndim; ++i) dim *= out->shape[i]; + + const int nbx = FindNumBlocks<'x'>(n); + const int ntx = FindNumThreads(dim); + const int nby = FindNumBlocks<'y'>((dim + ntx - 1) / ntx); + const int nty = 1; + const dim3 nblks(nbx, nby); + const dim3 nthrs(ntx, nty); + CUDA_KERNEL_CALL( + (ScatterAddKernel), nblks, nthrs, 0, stream, feat_data, + idx_data, out_data, n, dim); +} + +/** + * @brief CUDA implementation to update gradients for reduce op max/min + * @param graph The input heterogeneous graph. + * @param op The binary operator, could be `copy_u`, `copy_e'. + * @param list_feat List of the input tensors. + * @param list_idx List of the indices tensors. + * @param list_idx_etype List of the node- or edge-type tensors. + * @param list_out List of the output tensors. + */ +template +void UpdateGradMinMax_hetero( + const HeteroGraphPtr& graph, const std::string& op, + const std::vector& list_feat, const std::vector& list_idx, + const std::vector& list_idx_types, + std::vector* list_out) { + cudaStream_t stream = runtime::getCurrentCUDAStream(); + if (op == "copy_lhs" || op == "copy_rhs") { + std::vector> src_dst_ntypes( + graph->NumVertexTypes(), std::vector()); + for (dgl_type_t etype = 0; etype < graph->NumEdgeTypes(); ++etype) { + auto pair = graph->meta_graph()->FindEdge(etype); + const dgl_id_t dst_ntype = pair.first; // graph is reversed + const dgl_id_t src_ntype = pair.second; + auto same_src_dst_ntype = std::find( + std::begin(src_dst_ntypes[dst_ntype]), + std::end(src_dst_ntypes[dst_ntype]), src_ntype); + // if op is "copy_lhs", relation type with same src and dst node type will + // be updated once + if (op == "copy_lhs" && + same_src_dst_ntype != std::end(src_dst_ntypes[dst_ntype])) + continue; + src_dst_ntypes[dst_ntype].push_back(src_ntype); + const DType* feat_data = list_feat[dst_ntype].Ptr(); + const IdType* idx_data = list_idx[dst_ntype].Ptr(); + const IdType* idx_type_data = list_idx_types[dst_ntype].Ptr(); + int type = (op == "copy_lhs") ? 
src_ntype : etype; + DType* out_data = (*list_out)[type].Ptr(); + int dim = 1; + for (int i = 1; i < (*list_out)[type]->ndim; ++i) + dim *= (*list_out)[type]->shape[i]; + int n = list_feat[dst_ntype]->shape[0]; + const int th_per_row = 32; + const int ntx = 128; + const int nbx = FindNumBlocks<'x'>((n * th_per_row + ntx - 1) / ntx); + const dim3 nblks(nbx); + const dim3 nthrs(ntx); + CUDA_KERNEL_CALL( + (UpdateGradMinMaxHeteroKernel), nblks, nthrs, 0, + stream, feat_data, idx_data, idx_type_data, out_data, n, dim, type); + } + } +} + +/** + * @brief CUDA implementation of backward phase of Segment Reduce with Min/Max + * reducer. + * @note math equation: out[arg[i, k], k] = feat[i, k] + * @param feat The input + * tensor. + * @param arg The ArgMin/Max information, used for indexing. + * @param out The output tensor. + */ +template +void BackwardSegmentCmp(NDArray feat, NDArray arg, NDArray out) { + const DType* feat_data = feat.Ptr(); + const IdType* arg_data = arg.Ptr(); + DType* out_data = out.Ptr(); + + cudaStream_t stream = runtime::getCurrentCUDAStream(); + int64_t n = feat->shape[0]; + int64_t dim = 1; + for (int i = 1; i < out->ndim; ++i) dim *= out->shape[i]; + + const int nbx = FindNumBlocks<'x'>(n); + const int ntx = FindNumThreads(dim); + const int nby = FindNumBlocks<'y'>((dim + ntx - 1) / ntx); + const int nty = 1; + const dim3 nblks(nbx, nby); + const dim3 nthrs(ntx, nty); + CUDA_KERNEL_CALL( + (BackwardSegmentCmpKernel), nblks, nthrs, 0, stream, + feat_data, arg_data, out_data, n, dim); +} + +} // namespace cuda +} // namespace aten +} // namespace dgl + +#endif // DGL_ARRAY_CUDA_SEGMENT_REDUCE_CUH_ diff --git a/src/array/cuda/spmat_op_impl_coo.cu b/src/array/cuda/spmat_op_impl_coo.cu index dddcb6c01413..24c53d4ea9ce 100644 --- a/src/array/cuda/spmat_op_impl_coo.cu +++ b/src/array/cuda/spmat_op_impl_coo.cu @@ -1,3 +1,4 @@ +#include "hip/hip_runtime.h" /** * Copyright (c) 2021 by contributors. * @file array/cuda/spmat_op_impl_coo.cu @@ -72,7 +73,7 @@ __global__ void _COOGetRowNNZKernel( template int64_t COOGetRowNNZ(COOMatrix coo, int64_t row) { - cudaStream_t stream = runtime::getCurrentCUDAStream(); + hipStream_t stream = runtime::getCurrentCUDAStream(); const auto& ctx = coo.row->ctx; IdType nnz = coo.row->shape[0]; IdType nt = 1024; @@ -103,7 +104,7 @@ __global__ void _COOGetAllRowNNZKernel( template NDArray COOGetRowNNZ(COOMatrix coo, NDArray rows) { - cudaStream_t stream = runtime::getCurrentCUDAStream(); + hipStream_t stream = runtime::getCurrentCUDAStream(); const auto& ctx = coo.row->ctx; IdType nnz = coo.row->shape[0]; IdType num_rows = coo.num_rows; diff --git a/src/array/cuda/spmat_op_impl_coo.cu.prehip b/src/array/cuda/spmat_op_impl_coo.cu.prehip new file mode 100644 index 000000000000..dddcb6c01413 --- /dev/null +++ b/src/array/cuda/spmat_op_impl_coo.cu.prehip @@ -0,0 +1,139 @@ +/** + * Copyright (c) 2021 by contributors. 
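[Editor's aside, not part of the patch: host-side references for the ScatterAdd and BackwardSegmentCmp routines documented above, following the stated math (out[idx[i], *] += feat[i, *] and out[arg[i, k], k] = feat[i, k], skipping arg entries of -1). Names are illustrative:]

#include <cstdint>
#include <vector>

void ScatterAddReference(const std::vector<float>& feat,
                         const std::vector<int64_t>& idx, int64_t dim,
                         std::vector<float>* out) {
  const int64_t n = static_cast<int64_t>(idx.size());
  for (int64_t i = 0; i < n; ++i)
    for (int64_t c = 0; c < dim; ++c)
      (*out)[idx[i] * dim + c] += feat[i * dim + c];
}

void BackwardSegmentCmpReference(const std::vector<float>& feat,
                                 const std::vector<int64_t>& arg, int64_t dim,
                                 std::vector<float>* out) {
  const int64_t n = static_cast<int64_t>(arg.size()) / dim;
  for (int64_t i = 0; i < n; ++i)
    for (int64_t c = 0; c < dim; ++c)
      if (arg[i * dim + c] >= 0)  // -1 means the segment was empty
        (*out)[arg[i * dim + c] * dim + c] = feat[i * dim + c];
}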
+ * @file array/cuda/spmat_op_impl_coo.cu + * @brief COO operator GPU implementation + */ +#include + +#include +#include +#include + +#include "../../runtime/cuda/cuda_common.h" +#include "./atomic.cuh" +#include "./utils.h" + +namespace dgl { + +using runtime::NDArray; +using namespace cuda; + +namespace aten { +namespace impl { + +template +__device__ void _warpReduce(volatile IdType* sdata, IdType tid) { + sdata[tid] += sdata[tid + 32]; + sdata[tid] += sdata[tid + 16]; + sdata[tid] += sdata[tid + 8]; + sdata[tid] += sdata[tid + 4]; + sdata[tid] += sdata[tid + 2]; + sdata[tid] += sdata[tid + 1]; +} + +template +__global__ void _COOGetRowNNZKernel( + const IdType* __restrict__ row_indices, IdType* __restrict__ glb_cnt, + const int64_t row_query, IdType nnz) { + __shared__ IdType local_cnt[1024]; + IdType tx = threadIdx.x; + IdType bx = blockIdx.x; + local_cnt[tx] = 0; + IdType start = bx * blockDim.x; + while (start < nnz) { + if (start + tx < nnz) + local_cnt[tx] = (row_indices[start + tx] == row_query); + __syncthreads(); + if (tx < 512) { + local_cnt[tx] += local_cnt[tx + 512]; + __syncthreads(); + } + if (tx < 256) { + local_cnt[tx] += local_cnt[tx + 256]; + __syncthreads(); + } + if (tx < 128) { + local_cnt[tx] += local_cnt[tx + 128]; + __syncthreads(); + } + if (tx < 64) { + local_cnt[tx] += local_cnt[tx + 64]; + __syncthreads(); + } + if (tx < 32) { + _warpReduce(local_cnt, tx); + } + if (tx == 0) { + cuda::AtomicAdd(glb_cnt, local_cnt[tx]); + } + start += blockDim.x * gridDim.x; + } +} + +template +int64_t COOGetRowNNZ(COOMatrix coo, int64_t row) { + cudaStream_t stream = runtime::getCurrentCUDAStream(); + const auto& ctx = coo.row->ctx; + IdType nnz = coo.row->shape[0]; + IdType nt = 1024; + IdType nb = dgl::cuda::FindNumBlocks<'x'>((nnz + nt - 1) / nt); + NDArray rst = NDArray::Empty({1}, coo.row->dtype, coo.row->ctx); + _Fill(rst.Ptr(), 1, IdType(0)); + CUDA_KERNEL_CALL( + _COOGetRowNNZKernel, nb, nt, 0, stream, coo.row.Ptr(), + rst.Ptr(), row, nnz); + rst = rst.CopyTo(DGLContext{kDGLCPU, 0}); + return *rst.Ptr(); +} + +template int64_t COOGetRowNNZ(COOMatrix, int64_t); +template int64_t COOGetRowNNZ(COOMatrix, int64_t); + +template +__global__ void _COOGetAllRowNNZKernel( + const IdType* __restrict__ row_indices, IdType* __restrict__ glb_cnts, + IdType nnz) { + IdType eid = blockIdx.x * blockDim.x + threadIdx.x; + while (eid < nnz) { + IdType row = row_indices[eid]; + cuda::AtomicAdd(glb_cnts + row, IdType(1)); + eid += blockDim.x * gridDim.x; + } +} + +template +NDArray COOGetRowNNZ(COOMatrix coo, NDArray rows) { + cudaStream_t stream = runtime::getCurrentCUDAStream(); + const auto& ctx = coo.row->ctx; + IdType nnz = coo.row->shape[0]; + IdType num_rows = coo.num_rows; + IdType num_queries = rows->shape[0]; + if (num_queries == 1) { + auto rows_cpu = rows.CopyTo(DGLContext{kDGLCPU, 0}); + int64_t row = *rows_cpu.Ptr(); + IdType nt = 1024; + IdType nb = dgl::cuda::FindNumBlocks<'x'>((nnz + nt - 1) / nt); + NDArray rst = NDArray::Empty({1}, coo.row->dtype, coo.row->ctx); + _Fill(rst.Ptr(), 1, IdType(0)); + CUDA_KERNEL_CALL( + _COOGetRowNNZKernel, nb, nt, 0, stream, coo.row.Ptr(), + rst.Ptr(), row, nnz); + return rst; + } else { + IdType nt = 1024; + IdType nb = dgl::cuda::FindNumBlocks<'x'>((nnz + nt - 1) / nt); + NDArray in_degrees = NDArray::Empty({num_rows}, rows->dtype, rows->ctx); + _Fill(in_degrees.Ptr(), num_rows, IdType(0)); + CUDA_KERNEL_CALL( + _COOGetAllRowNNZKernel, nb, nt, 0, stream, coo.row.Ptr(), + in_degrees.Ptr(), nnz); + return IndexSelect(in_degrees, rows); 
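[Editor's aside, not part of the patch: the batched COOGetRowNNZ above first builds per-row counts (the atomic-add kernel) and then gathers the counts for the queried rows via IndexSelect. A host-side reference of the same two steps, with illustrative names:]

#include <cstdint>
#include <vector>

std::vector<int64_t> CooRowNnzReference(const std::vector<int64_t>& coo_row,
                                        int64_t num_rows,
                                        const std::vector<int64_t>& queries) {
  // Step 1: count how many COO entries land in each row (in-degrees).
  std::vector<int64_t> in_degrees(num_rows, 0);
  for (int64_t r : coo_row) ++in_degrees[r];
  // Step 2: gather the counts for the queried rows.
  std::vector<int64_t> out;
  out.reserve(queries.size());
  for (int64_t q : queries) out.push_back(in_degrees[q]);
  return out;
}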
+ } +} + +template NDArray COOGetRowNNZ(COOMatrix, NDArray); +template NDArray COOGetRowNNZ(COOMatrix, NDArray); + +} // namespace impl +} // namespace aten +} // namespace dgl diff --git a/src/array/cuda/spmat_op_impl_csr.cu b/src/array/cuda/spmat_op_impl_csr.cu index 7ee1f2ddcecc..4e07fcb0416c 100644 --- a/src/array/cuda/spmat_op_impl_csr.cu +++ b/src/array/cuda/spmat_op_impl_csr.cu @@ -1,3 +1,4 @@ +#include "hip/hip_runtime.h" /** * Copyright (c) 2020 by Contributors * @file array/cuda/spmat_op_impl_csr.cu @@ -7,7 +8,7 @@ #include #include -#include +#include #include #include #include @@ -28,7 +29,7 @@ namespace impl { template bool CSRIsNonZero(CSRMatrix csr, int64_t row, int64_t col) { - cudaStream_t stream = runtime::getCurrentCUDAStream(); + hipStream_t stream = runtime::getCurrentCUDAStream(); const auto& ctx = csr.indptr->ctx; IdArray rows = aten::VecToIdArray({row}, sizeof(IdType) * 8, ctx); IdArray cols = aten::VecToIdArray({col}, sizeof(IdType) * 8, ctx); @@ -58,7 +59,7 @@ NDArray CSRIsNonZero(CSRMatrix csr, NDArray row, NDArray col) { if (rstlen == 0) return rst; const int64_t row_stride = (rowlen == 1 && collen != 1) ? 0 : 1; const int64_t col_stride = (collen == 1 && rowlen != 1) ? 0 : 1; - cudaStream_t stream = runtime::getCurrentCUDAStream(); + hipStream_t stream = runtime::getCurrentCUDAStream(); const int nt = dgl::cuda::FindNumThreads(rstlen); const int nb = (rstlen + nt - 1) / nt; const IdType* data = nullptr; @@ -104,7 +105,7 @@ template bool CSRHasDuplicate(CSRMatrix csr) { if (!csr.sorted) csr = CSRSort(csr); const auto& ctx = csr.indptr->ctx; - cudaStream_t stream = runtime::getCurrentCUDAStream(); + hipStream_t stream = runtime::getCurrentCUDAStream(); auto device = runtime::DeviceAPI::Get(ctx); // We allocate a workspace of num_rows bytes. It wastes a little bit memory // but should be fine. @@ -149,7 +150,7 @@ __global__ void _CSRGetRowNNZKernel( template NDArray CSRGetRowNNZ(CSRMatrix csr, NDArray rows) { - cudaStream_t stream = runtime::getCurrentCUDAStream(); + hipStream_t stream = runtime::getCurrentCUDAStream(); const auto len = rows->shape[0]; const IdType* vid_data = rows.Ptr(); const IdType* indptr_data = @@ -250,7 +251,7 @@ __global__ void _SegmentCopyKernel( template CSRMatrix CSRSliceRows(CSRMatrix csr, NDArray rows) { - cudaStream_t stream = runtime::getCurrentCUDAStream(); + hipStream_t stream = runtime::getCurrentCUDAStream(); const int64_t len = rows->shape[0]; IdArray ret_indptr = aten::CumSum(aten::CSRGetRowNNZ(csr, rows), true); const int64_t nnz = aten::IndexSelect(ret_indptr, len); @@ -367,7 +368,7 @@ std::vector CSRGetDataAndIndices( const int64_t nnz = csr.indices->shape[0]; const int64_t row_stride = (rowlen == 1 && collen != 1) ? 0 : 1; const int64_t col_stride = (collen == 1 && rowlen != 1) ? 
0 : 1; - cudaStream_t stream = runtime::getCurrentCUDAStream(); + hipStream_t stream = runtime::getCurrentCUDAStream(); const IdType* indptr_data = static_cast(GetDevicePointer(csr.indptr)); @@ -532,7 +533,7 @@ __global__ void _SegmentMaskColKernel( static_cast(num_rows)); NodeQueryHashmap hashmap(hashmap_buffer, buffer_size); - typedef cub::WarpReduce WarpReduce; + typedef hipcub::WarpReduce WarpReduce; __shared__ typename WarpReduce::TempStorage temp_storage[BLOCK_WARPS]; while (out_row < last_row) { @@ -557,7 +558,7 @@ __global__ void _SegmentMaskColKernel( template CSRMatrix CSRSliceMatrix( CSRMatrix csr, runtime::NDArray rows, runtime::NDArray cols) { - cudaStream_t stream = runtime::getCurrentCUDAStream(); + hipStream_t stream = runtime::getCurrentCUDAStream(); const auto& ctx = rows->ctx; const auto& dtype = rows->dtype; const auto nbits = dtype.bits; @@ -582,7 +583,7 @@ CSRMatrix CSRSliceMatrix( // A count for how many masked values per row. IdArray count = NewIdArray(csr.num_rows, ctx, nbits); CUDA_CALL( - cudaMemset(count.Ptr(), 0, sizeof(IdType) * (csr.num_rows))); + hipMemset(count.Ptr(), 0, sizeof(IdType) * (csr.num_rows))); // Generate a NodeQueryHashmap buffer. The key of the hashmap is col. // For performance, the load factor of the hashmap is in (0.25, 0.5); @@ -593,7 +594,7 @@ CSRMatrix CSRSliceMatrix( using it = thrust::counting_iterator; runtime::CUDAWorkspaceAllocator allocator(ctx); - const auto exec_policy = thrust::cuda::par_nosync(allocator).on(stream); + const auto exec_policy = thrust::hip::par_nosync(allocator).on(stream); thrust::for_each( exec_policy, it(0), it(new_ncols), [key = cols.Ptr(), buffer = hashmap_buffer.Ptr(), diff --git a/src/array/cuda/spmat_op_impl_csr.cu.prehip b/src/array/cuda/spmat_op_impl_csr.cu.prehip new file mode 100644 index 000000000000..7ee1f2ddcecc --- /dev/null +++ b/src/array/cuda/spmat_op_impl_csr.cu.prehip @@ -0,0 +1,654 @@ +/** + * Copyright (c) 2020 by Contributors + * @file array/cuda/spmat_op_impl_csr.cu + * @brief CSR operator CPU implementation + */ +#include +#include +#include + +#include +#include +#include +#include + +#include "../../runtime/cuda/cuda_common.h" +#include "./atomic.cuh" +#include "./utils.h" + +namespace dgl { + +using runtime::NDArray; +using namespace cuda; + +namespace aten { +namespace impl { + +///////////////////////////// CSRIsNonZero ///////////////////////////// + +template +bool CSRIsNonZero(CSRMatrix csr, int64_t row, int64_t col) { + cudaStream_t stream = runtime::getCurrentCUDAStream(); + const auto& ctx = csr.indptr->ctx; + IdArray rows = aten::VecToIdArray({row}, sizeof(IdType) * 8, ctx); + IdArray cols = aten::VecToIdArray({col}, sizeof(IdType) * 8, ctx); + rows = rows.CopyTo(ctx); + cols = cols.CopyTo(ctx); + IdArray out = aten::NewIdArray(1, ctx, sizeof(IdType) * 8); + const IdType* data = nullptr; + // TODO(minjie): use binary search for sorted csr + CUDA_KERNEL_CALL( + dgl::cuda::_LinearSearchKernel, 1, 1, 0, stream, csr.indptr.Ptr(), + csr.indices.Ptr(), data, rows.Ptr(), cols.Ptr(), + 1, 1, 1, static_cast(nullptr), static_cast(-1), + out.Ptr()); + out = out.CopyTo(DGLContext{kDGLCPU, 0}); + return *out.Ptr() != -1; +} + +template bool CSRIsNonZero(CSRMatrix, int64_t, int64_t); +template bool CSRIsNonZero(CSRMatrix, int64_t, int64_t); + +template +NDArray CSRIsNonZero(CSRMatrix csr, NDArray row, NDArray col) { + const auto rowlen = row->shape[0]; + const auto collen = col->shape[0]; + const auto rstlen = std::max(rowlen, collen); + NDArray rst = NDArray::Empty({rstlen}, row->dtype, 
row->ctx); + if (rstlen == 0) return rst; + const int64_t row_stride = (rowlen == 1 && collen != 1) ? 0 : 1; + const int64_t col_stride = (collen == 1 && rowlen != 1) ? 0 : 1; + cudaStream_t stream = runtime::getCurrentCUDAStream(); + const int nt = dgl::cuda::FindNumThreads(rstlen); + const int nb = (rstlen + nt - 1) / nt; + const IdType* data = nullptr; + const IdType* indptr_data = + static_cast(GetDevicePointer(csr.indptr)); + const IdType* indices_data = + static_cast(GetDevicePointer(csr.indices)); + // TODO(minjie): use binary search for sorted csr + CUDA_KERNEL_CALL( + dgl::cuda::_LinearSearchKernel, nb, nt, 0, stream, indptr_data, + indices_data, data, row.Ptr(), col.Ptr(), row_stride, + col_stride, rstlen, static_cast(nullptr), + static_cast(-1), rst.Ptr()); + return rst != -1; +} + +template NDArray CSRIsNonZero(CSRMatrix, NDArray, NDArray); +template NDArray CSRIsNonZero(CSRMatrix, NDArray, NDArray); + +///////////////////////////// CSRHasDuplicate ///////////////////////////// + +/** + * @brief Check whether each row does not have any duplicate entries. + * Assume the CSR is sorted. + */ +template +__global__ void _SegmentHasNoDuplicate( + const IdType* indptr, const IdType* indices, int64_t num_rows, + int8_t* flags) { + int tx = blockIdx.x * blockDim.x + threadIdx.x; + const int stride_x = gridDim.x * blockDim.x; + while (tx < num_rows) { + bool f = true; + for (IdType i = indptr[tx] + 1; f && i < indptr[tx + 1]; ++i) { + f = (indices[i - 1] != indices[i]); + } + flags[tx] = static_cast(f); + tx += stride_x; + } +} + +template +bool CSRHasDuplicate(CSRMatrix csr) { + if (!csr.sorted) csr = CSRSort(csr); + const auto& ctx = csr.indptr->ctx; + cudaStream_t stream = runtime::getCurrentCUDAStream(); + auto device = runtime::DeviceAPI::Get(ctx); + // We allocate a workspace of num_rows bytes. It wastes a little bit memory + // but should be fine. 
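[Editor's aside, not part of the patch: the duplicate check above assumes the CSR is sorted, so a duplicate appears as two equal adjacent column ids within a row. A host-side reference with illustrative names:]

#include <cstdint>
#include <vector>

bool CsrHasDuplicateReference(const std::vector<int64_t>& indptr,
                              const std::vector<int64_t>& indices) {
  const int64_t num_rows = static_cast<int64_t>(indptr.size()) - 1;
  for (int64_t r = 0; r < num_rows; ++r)
    for (int64_t i = indptr[r] + 1; i < indptr[r + 1]; ++i)
      if (indices[i - 1] == indices[i]) return true;  // adjacent equal columns
  return false;
}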
+ int8_t* flags = + static_cast(device->AllocWorkspace(ctx, csr.num_rows)); + const int nt = dgl::cuda::FindNumThreads(csr.num_rows); + const int nb = (csr.num_rows + nt - 1) / nt; + CUDA_KERNEL_CALL( + _SegmentHasNoDuplicate, nb, nt, 0, stream, csr.indptr.Ptr(), + csr.indices.Ptr(), csr.num_rows, flags); + bool ret = dgl::cuda::AllTrue(flags, csr.num_rows, ctx); + device->FreeWorkspace(ctx, flags); + return !ret; +} + +template bool CSRHasDuplicate(CSRMatrix csr); +template bool CSRHasDuplicate(CSRMatrix csr); + +///////////////////////////// CSRGetRowNNZ ///////////////////////////// + +template +int64_t CSRGetRowNNZ(CSRMatrix csr, int64_t row) { + const IdType cur = aten::IndexSelect(csr.indptr, row); + const IdType next = aten::IndexSelect(csr.indptr, row + 1); + return next - cur; +} + +template int64_t CSRGetRowNNZ(CSRMatrix, int64_t); +template int64_t CSRGetRowNNZ(CSRMatrix, int64_t); + +template +__global__ void _CSRGetRowNNZKernel( + const IdType* vid, const IdType* indptr, IdType* out, int64_t length) { + int tx = blockIdx.x * blockDim.x + threadIdx.x; + int stride_x = gridDim.x * blockDim.x; + while (tx < length) { + const IdType vv = vid[tx]; + out[tx] = indptr[vv + 1] - indptr[vv]; + tx += stride_x; + } +} + +template +NDArray CSRGetRowNNZ(CSRMatrix csr, NDArray rows) { + cudaStream_t stream = runtime::getCurrentCUDAStream(); + const auto len = rows->shape[0]; + const IdType* vid_data = rows.Ptr(); + const IdType* indptr_data = + static_cast(GetDevicePointer(csr.indptr)); + NDArray rst = NDArray::Empty({len}, rows->dtype, rows->ctx); + IdType* rst_data = static_cast(rst->data); + const int nt = dgl::cuda::FindNumThreads(len); + const int nb = (len + nt - 1) / nt; + CUDA_KERNEL_CALL( + _CSRGetRowNNZKernel, nb, nt, 0, stream, vid_data, indptr_data, rst_data, + len); + return rst; +} + +template NDArray CSRGetRowNNZ(CSRMatrix, NDArray); +template NDArray CSRGetRowNNZ(CSRMatrix, NDArray); + +////////////////////////// CSRGetRowColumnIndices ////////////////////////////// + +template +NDArray CSRGetRowColumnIndices(CSRMatrix csr, int64_t row) { + const int64_t len = impl::CSRGetRowNNZ(csr, row); + const int64_t offset = + aten::IndexSelect(csr.indptr, row) * sizeof(IdType); + return csr.indices.CreateView({len}, csr.indices->dtype, offset); +} + +template NDArray CSRGetRowColumnIndices(CSRMatrix, int64_t); +template NDArray CSRGetRowColumnIndices(CSRMatrix, int64_t); + +///////////////////////////// CSRGetRowData ///////////////////////////// + +template +NDArray CSRGetRowData(CSRMatrix csr, int64_t row) { + const int64_t len = impl::CSRGetRowNNZ(csr, row); + const int64_t offset = + aten::IndexSelect(csr.indptr, row) * sizeof(IdType); + if (aten::CSRHasData(csr)) + return csr.data.CreateView({len}, csr.data->dtype, offset); + else + return aten::Range( + offset, offset + len, csr.indptr->dtype.bits, csr.indptr->ctx); +} + +template NDArray CSRGetRowData(CSRMatrix, int64_t); +template NDArray CSRGetRowData(CSRMatrix, int64_t); + +///////////////////////////// CSRSliceRows ///////////////////////////// + +template +CSRMatrix CSRSliceRows(CSRMatrix csr, int64_t start, int64_t end) { + const int64_t num_rows = end - start; + const IdType st_pos = aten::IndexSelect(csr.indptr, start); + const IdType ed_pos = aten::IndexSelect(csr.indptr, end); + const IdType nnz = ed_pos - st_pos; + IdArray ret_indptr = aten::IndexSelect(csr.indptr, start, end + 1) - st_pos; + // indices and data can be view arrays + IdArray ret_indices = csr.indices.CreateView( + {nnz}, csr.indices->dtype, st_pos * 
sizeof(IdType)); + IdArray ret_data; + if (CSRHasData(csr)) + ret_data = + csr.data.CreateView({nnz}, csr.data->dtype, st_pos * sizeof(IdType)); + else + ret_data = + aten::Range(st_pos, ed_pos, csr.indptr->dtype.bits, csr.indptr->ctx); + return CSRMatrix( + num_rows, csr.num_cols, ret_indptr, ret_indices, ret_data, csr.sorted); +} + +template CSRMatrix CSRSliceRows(CSRMatrix, int64_t, int64_t); +template CSRMatrix CSRSliceRows(CSRMatrix, int64_t, int64_t); + +/** + * @brief Copy data segment to output buffers + * + * For the i^th row r = row[i], copy the data from indptr[r] ~ indptr[r+1] + * to the out_data from out_indptr[i] ~ out_indptr[i+1] + * + * If the provided `data` array is nullptr, write the read index to the + * out_data. + * + */ +template +__global__ void _SegmentCopyKernel( + const IdType* indptr, const DType* data, const IdType* row, int64_t length, + int64_t n_row, const IdType* out_indptr, DType* out_data) { + IdType tx = static_cast(blockIdx.x) * blockDim.x + threadIdx.x; + const int stride_x = gridDim.x * blockDim.x; + while (tx < length) { + IdType rpos = dgl::cuda::_UpperBound(out_indptr, n_row, tx) - 1; + IdType rofs = tx - out_indptr[rpos]; + const IdType u = row[rpos]; + out_data[tx] = data ? data[indptr[u] + rofs] : indptr[u] + rofs; + tx += stride_x; + } +} + +template +CSRMatrix CSRSliceRows(CSRMatrix csr, NDArray rows) { + cudaStream_t stream = runtime::getCurrentCUDAStream(); + const int64_t len = rows->shape[0]; + IdArray ret_indptr = aten::CumSum(aten::CSRGetRowNNZ(csr, rows), true); + const int64_t nnz = aten::IndexSelect(ret_indptr, len); + + const int nt = 256; // for better GPU usage of small invocations + const int nb = (nnz + nt - 1) / nt; + + // Copy indices. + IdArray ret_indices = NDArray::Empty({nnz}, csr.indptr->dtype, rows->ctx); + + const IdType* indptr_data = + static_cast(GetDevicePointer(csr.indptr)); + const IdType* indices_data = + static_cast(GetDevicePointer(csr.indices)); + const IdType* data_data = + CSRHasData(csr) ? static_cast(GetDevicePointer(csr.data)) + : nullptr; + + CUDA_KERNEL_CALL( + _SegmentCopyKernel, nb, nt, 0, stream, indptr_data, indices_data, + rows.Ptr(), nnz, len, ret_indptr.Ptr(), + ret_indices.Ptr()); + // Copy data. + IdArray ret_data = NDArray::Empty({nnz}, csr.indptr->dtype, rows->ctx); + CUDA_KERNEL_CALL( + _SegmentCopyKernel, nb, nt, 0, stream, indptr_data, data_data, + rows.Ptr(), nnz, len, ret_indptr.Ptr(), + ret_data.Ptr()); + return CSRMatrix( + len, csr.num_cols, ret_indptr, ret_indices, ret_data, csr.sorted); +} + +template CSRMatrix CSRSliceRows(CSRMatrix, NDArray); +template CSRMatrix CSRSliceRows(CSRMatrix, NDArray); + +///////////////////////////// CSRGetDataAndIndices ///////////////////////////// + +/** + * @brief Generate a 0-1 mask for each index that hits the provided (row, col) + * index. 
+ * + * Examples: + * Given a CSR matrix (with duplicate entries) as follows: + * [[0, 1, 2, 0, 0], + * [1, 0, 0, 0, 0], + * [0, 0, 1, 1, 0], + * [0, 0, 0, 0, 0]] + * Given rows: [0, 1], cols: [0, 2, 3] + * The result mask is: [0, 1, 1, 1, 0, 0] + */ +template <typename IdType> +__global__ void _SegmentMaskKernel( + const IdType* indptr, const IdType* indices, const IdType* row, + const IdType* col, int64_t row_stride, int64_t col_stride, int64_t length, + IdType* mask) { + int tx = blockIdx.x * blockDim.x + threadIdx.x; + const int stride_x = gridDim.x * blockDim.x; + while (tx < length) { + int rpos = tx * row_stride, cpos = tx * col_stride; + const IdType r = row[rpos], c = col[cpos]; + for (IdType i = indptr[r]; i < indptr[r + 1]; ++i) { + if (indices[i] == c) { + mask[i] = 1; + } + } + tx += stride_x; + } +} + +/** + * @brief Search for the insertion positions of the needles in the hay. + * + * The hay is a sorted list of elements and the result is the insertion position + * of each needle such that the insertion still gives sorted order. + * + * It essentially performs a binary search to find the lower bound for each + * needle element. It requires that the largest element in the hay is larger + * than the given needle elements. Commonly used in searching for the row IDs + * of a given set of coordinates. + */ +template <typename IdType> +__global__ void _SortedSearchKernel( + const IdType* hay, int64_t hay_size, const IdType* needles, + int64_t num_needles, IdType* pos) { + int tx = blockIdx.x * blockDim.x + threadIdx.x; + const int stride_x = gridDim.x * blockDim.x; + while (tx < num_needles) { + const IdType ele = needles[tx]; + // binary search + IdType lo = 0, hi = hay_size - 1; + while (lo < hi) { + IdType mid = (lo + hi) >> 1; + if (hay[mid] <= ele) { + lo = mid + 1; + } else { + hi = mid; + } + } + pos[tx] = (hay[hi] == ele) ? hi : hi - 1; + tx += stride_x; + } +} + +template <DGLDeviceType XPU, typename IdType> +std::vector<NDArray> CSRGetDataAndIndices( + CSRMatrix csr, NDArray row, NDArray col) { + const auto rowlen = row->shape[0]; + const auto collen = col->shape[0]; + const auto len = std::max(rowlen, collen); + if (len == 0) return {NullArray(), NullArray(), NullArray()}; + + const auto& ctx = row->ctx; + const auto nbits = row->dtype.bits; + const int64_t nnz = csr.indices->shape[0]; + const int64_t row_stride = (rowlen == 1 && collen != 1) ? 0 : 1; + const int64_t col_stride = (collen == 1 && rowlen != 1) ? 0 : 1; + cudaStream_t stream = runtime::getCurrentCUDAStream(); + + const IdType* indptr_data = + static_cast<IdType*>(GetDevicePointer(csr.indptr)); + const IdType* indices_data = + static_cast<IdType*>(GetDevicePointer(csr.indices)); + + // Generate a 0-1 mask for matched (row, col) positions. + IdArray mask = Full(0, nnz, nbits, ctx); + const int nt = dgl::cuda::FindNumThreads(len); + const int nb = (len + nt - 1) / nt; + CUDA_KERNEL_CALL( + _SegmentMaskKernel, nb, nt, 0, stream, indptr_data, indices_data, + row.Ptr<IdType>(), col.Ptr<IdType>(), row_stride, col_stride, len, + mask.Ptr<IdType>()); + + IdArray idx = AsNumBits(NonZero(mask), nbits); + if (idx->shape[0] == 0) + // No data. Return three empty arrays. + return {idx, idx, idx}; + + // Search for row index + IdArray ret_row = NewIdArray(idx->shape[0], ctx, nbits); + const int nt2 = dgl::cuda::FindNumThreads(idx->shape[0]); + const int nb2 = (idx->shape[0] + nt - 1) / nt; + CUDA_KERNEL_CALL( + _SortedSearchKernel, nb2, nt2, 0, stream, indptr_data, csr.num_rows, + idx.Ptr<IdType>(), idx->shape[0], ret_row.Ptr<IdType>()); + + // Column & data can be obtained by index select. + IdArray ret_col = IndexSelect(csr.indices, idx); + IdArray ret_data = CSRHasData(csr) ?
IndexSelect(csr.data, idx) : idx; + return {ret_row, ret_col, ret_data}; +} + +template std::vector CSRGetDataAndIndices( + CSRMatrix csr, NDArray rows, NDArray cols); +template std::vector CSRGetDataAndIndices( + CSRMatrix csr, NDArray rows, NDArray cols); + +///////////////////////////// CSRSliceMatrix ///////////////////////////// + +int64_t _UpPower(int64_t numel) { + uint64_t ret = 1 << static_cast(std::log2(numel) + 1); + return ret; +} + +/** + * @brief Thomas Wang's 32 bit Mix Function. + * Source link: https://gist.github.com/badboy/6267743 + */ +__device__ inline uint32_t _Hash32Shift(uint32_t key) { + key = ~key + (key << 15); + key = key ^ (key >> 12); + key = key + (key << 2); + key = key ^ (key >> 4); + key = key * 2057; + key = key ^ (key >> 16); + return key; +} + +/** + * @brief Thomas Wang's 64 bit Mix Function. + * Source link: https://gist.github.com/badboy/6267743 + */ +__device__ inline uint64_t _Hash64Shift(uint64_t key) { + key = (~key) + (key << 21); + key = key ^ (key >> 24); + key = (key + (key << 3)) + (key << 8); + key = key ^ (key >> 14); + key = (key + (key << 2)) + (key << 4); + key = key ^ (key >> 28); + key = key + (key << 31); + return key; +} + +/** + * @brief A hashmap designed for CSRSliceMatrix, similar in function to set. For + * performance, it can only be created and called in the cuda kernel. + */ +template +struct NodeQueryHashmap { + __device__ inline NodeQueryHashmap(IdType* Kptr, size_t numel) + : kptr_(Kptr), capacity_(numel) {} + + /** + * @brief Insert a key. It must be called by cuda threads. + * + * @param key The key to be inserted. + */ + __device__ inline void Insert(IdType key) { + uint32_t delta = 1; + uint32_t pos = Hash(key); + IdType prev = dgl::aten::cuda::AtomicCAS(&kptr_[pos], kEmptyKey_, key); + while (prev != key && prev != kEmptyKey_) { + pos = Hash(pos + delta); + delta += 1; + prev = dgl::aten::cuda::AtomicCAS(&kptr_[pos], kEmptyKey_, key); + } + } + + /** + * @brief Check whether a key exists within the hashtable. It must be called + * by cuda threads. + * + * @param key The key to check for. + * @return True if the key exists in the hashtable. + */ + __device__ inline bool Query(IdType key) { + uint32_t delta = 1; + uint32_t pos = Hash(key); + while (true) { + if (kptr_[pos] == key) return true; + if (kptr_[pos] == kEmptyKey_) return false; + pos = Hash(pos + delta); + delta += 1; + } + return false; + } + + __device__ inline uint32_t Hash(int32_t key) { + return _Hash32Shift(key) & (capacity_ - 1); + } + + __device__ inline uint32_t Hash(uint32_t key) { + return _Hash32Shift(key) & (capacity_ - 1); + } + + __device__ inline uint32_t Hash(int64_t key) { + return static_cast(_Hash64Shift(key)) & (capacity_ - 1); + } + + __device__ inline uint32_t Hash(uint64_t key) { + return static_cast(_Hash64Shift(key)) & (capacity_ - 1); + } + + IdType kEmptyKey_{-1}; + IdType* kptr_; + uint32_t capacity_{0}; +}; + +/** + * @brief Generate a 0-1 mask for each index whose column is in the provided + * hashmap. It also counts the number of masked values per row. + * + * @tparam IdType The ID type used for matrices. + * @tparam WARP_SIZE The number of cuda threads in a cuda warp. + * @tparam BLOCK_WARPS The number of warps in a cuda block. + * @tparam TILE_SIZE The number of rows covered by each threadblock. 
+ */ +template +__global__ void _SegmentMaskColKernel( + const IdType* indptr, const IdType* indices, int64_t num_rows, + IdType* hashmap_buffer, int64_t buffer_size, IdType* mask, IdType* count) { + assert(blockDim.x == WARP_SIZE); + assert(blockDim.y == BLOCK_WARPS); + + int warp_id = threadIdx.y; + int laneid = threadIdx.x; + IdType out_row = blockIdx.x * TILE_SIZE + threadIdx.y; + IdType last_row = + min(static_cast((blockIdx.x + 1) * TILE_SIZE), + static_cast(num_rows)); + + NodeQueryHashmap hashmap(hashmap_buffer, buffer_size); + typedef cub::WarpReduce WarpReduce; + __shared__ typename WarpReduce::TempStorage temp_storage[BLOCK_WARPS]; + + while (out_row < last_row) { + IdType local_count = 0; + IdType in_row_start = indptr[out_row]; + IdType in_row_end = indptr[out_row + 1]; + for (int idx = in_row_start + laneid; idx < in_row_end; idx += WARP_SIZE) { + bool is_in = hashmap.Query(indices[idx]); + if (is_in) { + local_count += 1; + mask[idx] = 1; + } + } + IdType reduce_count = WarpReduce(temp_storage[warp_id]).Sum(local_count); + if (laneid == 0) { + count[out_row] = reduce_count; + } + out_row += BLOCK_WARPS; + } +} + +template +CSRMatrix CSRSliceMatrix( + CSRMatrix csr, runtime::NDArray rows, runtime::NDArray cols) { + cudaStream_t stream = runtime::getCurrentCUDAStream(); + const auto& ctx = rows->ctx; + const auto& dtype = rows->dtype; + const auto nbits = dtype.bits; + const int64_t new_nrows = rows->shape[0]; + const int64_t new_ncols = cols->shape[0]; + + if (new_nrows == 0 || new_ncols == 0) + return CSRMatrix( + new_nrows, new_ncols, Full(0, new_nrows + 1, nbits, ctx), + NullArray(dtype, ctx), NullArray(dtype, ctx)); + + // First slice rows + csr = CSRSliceRows(csr, rows); + + if (csr.indices->shape[0] == 0) + return CSRMatrix( + new_nrows, new_ncols, Full(0, new_nrows + 1, nbits, ctx), + NullArray(dtype, ctx), NullArray(dtype, ctx)); + + // Generate a 0-1 mask for matched (row, col) positions. + IdArray mask = Full(0, csr.indices->shape[0], nbits, ctx); + // A count for how many masked values per row. + IdArray count = NewIdArray(csr.num_rows, ctx, nbits); + CUDA_CALL( + cudaMemset(count.Ptr(), 0, sizeof(IdType) * (csr.num_rows))); + + // Generate a NodeQueryHashmap buffer. The key of the hashmap is col. + // For performance, the load factor of the hashmap is in (0.25, 0.5); + // Because num_cols is usually less than 1 Million (on GPU), the + // memory overhead is not significant (less than 31MB) at a low load factor. + int64_t buffer_size = _UpPower(new_ncols) * 2; + IdArray hashmap_buffer = Full(-1, buffer_size, nbits, ctx); + + using it = thrust::counting_iterator; + runtime::CUDAWorkspaceAllocator allocator(ctx); + const auto exec_policy = thrust::cuda::par_nosync(allocator).on(stream); + thrust::for_each( + exec_policy, it(0), it(new_ncols), + [key = cols.Ptr(), buffer = hashmap_buffer.Ptr(), + buffer_size] __device__(int64_t i) { + NodeQueryHashmap hashmap(buffer, buffer_size); + hashmap.Insert(key[i]); + }); + + const IdType* indptr_data = + static_cast(GetDevicePointer(csr.indptr)); + const IdType* indices_data = + static_cast(GetDevicePointer(csr.indices)); + + // Execute SegmentMaskColKernel + const int64_t num_rows = csr.num_rows; + constexpr int WARP_SIZE = 32; + // With a simple fine-tuning, TILE_SIZE=16 gives a good performance. 
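+ // Launch geometry (illustrative summary): each block is WARP_SIZE x
+ // BLOCK_WARPS threads, one warp scans one row of the sliced CSR at a time,
+ // and each block covers TILE_SIZE rows, so the grid needs roughly
+ // (num_rows + TILE_SIZE - 1) / TILE_SIZE blocks along x.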
+ constexpr int TILE_SIZE = 16; + constexpr int BLOCK_WARPS = CUDA_MAX_NUM_THREADS / WARP_SIZE; + IdType nb = + dgl::cuda::FindNumBlocks<'x'>((num_rows + TILE_SIZE - 1) / TILE_SIZE); + const dim3 nthrs(WARP_SIZE, BLOCK_WARPS); + const dim3 nblks(nb); + CUDA_KERNEL_CALL( + (_SegmentMaskColKernel), nblks, + nthrs, 0, stream, indptr_data, indices_data, num_rows, + hashmap_buffer.Ptr(), buffer_size, mask.Ptr(), + count.Ptr()); + + IdArray idx = AsNumBits(NonZero(mask), nbits); + if (idx->shape[0] == 0) + return CSRMatrix( + new_nrows, new_ncols, Full(0, new_nrows + 1, nbits, ctx), + NullArray(dtype, ctx), NullArray(dtype, ctx)); + + // Indptr needs to be adjusted according to the new nnz per row. + IdArray ret_indptr = CumSum(count, true); + + // Column & data can be obtained by index select. + IdArray ret_col = IndexSelect(csr.indices, idx); + IdArray ret_data = CSRHasData(csr) ? IndexSelect(csr.data, idx) : idx; + + // Relabel column + IdArray col_hash = NewIdArray(csr.num_cols, ctx, nbits); + Scatter_(cols, Range(0, cols->shape[0], nbits, ctx), col_hash); + ret_col = IndexSelect(col_hash, ret_col); + + return CSRMatrix(new_nrows, new_ncols, ret_indptr, ret_col, ret_data); +} + +template CSRMatrix CSRSliceMatrix( + CSRMatrix csr, runtime::NDArray rows, runtime::NDArray cols); +template CSRMatrix CSRSliceMatrix( + CSRMatrix csr, runtime::NDArray rows, runtime::NDArray cols); + +} // namespace impl +} // namespace aten +} // namespace dgl diff --git a/src/array/cuda/spmm.cu b/src/array/cuda/spmm.cu index a91927896277..e7d4ca952f89 100644 --- a/src/array/cuda/spmm.cu +++ b/src/array/cuda/spmm.cu @@ -114,11 +114,11 @@ template void SpMMCsr( const CSRMatrix& csr, NDArray ufeat, NDArray efeat, NDArray out, std::vector out_aux); #if BF16_ENABLED -template void SpMMCsr( +template void SpMMCsr( const std::string& op, const std::string& reduce, const BcastOff& bcast, const CSRMatrix& csr, NDArray ufeat, NDArray efeat, NDArray out, std::vector out_aux); -template void SpMMCsr( +template void SpMMCsr( const std::string& op, const std::string& reduce, const BcastOff& bcast, const CSRMatrix& csr, NDArray ufeat, NDArray efeat, NDArray out, std::vector out_aux); @@ -149,11 +149,11 @@ template void SpMMCoo( const COOMatrix& coo, NDArray ufeat, NDArray efeat, NDArray out, std::vector out_aux); #if BF16_ENABLED -template void SpMMCoo( +template void SpMMCoo( const std::string& op, const std::string& reduce, const BcastOff& bcast, const COOMatrix& coo, NDArray ufeat, NDArray efeat, NDArray out, std::vector out_aux); -template void SpMMCoo( +template void SpMMCoo( const std::string& op, const std::string& reduce, const BcastOff& bcast, const COOMatrix& coo, NDArray ufeat, NDArray efeat, NDArray out, std::vector out_aux); diff --git a/src/array/cuda/spmm.cu.prehip b/src/array/cuda/spmm.cu.prehip new file mode 100644 index 000000000000..a91927896277 --- /dev/null +++ b/src/array/cuda/spmm.cu.prehip @@ -0,0 +1,179 @@ +/** + * Copyright (c) 2020 by Contributors + * @file array/cuda/spmm.cu + * @brief SPMM C APIs and definitions. + */ +#include + +#include + +#include "../../runtime/cuda/cuda_common.h" +#include "./functor.cuh" +#include "./ge_spmm.cuh" +#include "./spmm.cuh" + +namespace dgl { + +using namespace cuda; + +namespace aten { + +/** + * @brief CUDA implementation of g-SpMM on Csr format. + * @note use cusparse if the reduce operator is `sum` and there is + * no broadcast, use dgl's kernel in other cases. 
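+ * (Concretely: `sum` with `copy_lhs`, or `sum` with `mul` on a scalar edge
+ * feature, is routed to cuSPARSE's SpMM whenever cusparse_available()
+ * permits it; every other op/reduce combination, including `max` and `min`,
+ * uses DGL's own SpMMCsr kernel.)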
+ */ +template +void SpMMCsr( + const std::string& op, const std::string& reduce, const BcastOff& bcast, + const CSRMatrix& csr, NDArray ufeat, NDArray efeat, NDArray out, + std::vector out_aux) { + bool is_scalar_efeat = efeat.NumElements() == csr.indices->shape[0]; + bool use_efeat = op != "copy_lhs"; + bool use_deterministic_alg_only = false; + if (NULL != std::getenv("USE_DETERMINISTIC_ALG")) + use_deterministic_alg_only = true; + + if (reduce == "sum") { + bool more_nnz = (csr.indices->shape[0] > csr.num_rows * csr.num_cols); + if (op == "copy_lhs" && cusparse_available(more_nnz)) { + // cusparse + int64_t x_length = 1; + for (int i = 1; i < ufeat->ndim; ++i) x_length *= ufeat->shape[i]; + CusparseCsrmm2( + ufeat->ctx, csr, static_cast(ufeat->data), nullptr, + static_cast(out->data), x_length, use_deterministic_alg_only); + } else if ( + op == "mul" && is_scalar_efeat && + cusparse_available(more_nnz)) { + // cusparse + int64_t x_length = 1; + for (int i = 1; i < ufeat->ndim; ++i) x_length *= ufeat->shape[i]; + if (!IsNullArray(csr.data)) { + efeat = IndexSelect(efeat, csr.data); + } + CusparseCsrmm2( + ufeat->ctx, csr, static_cast(ufeat->data), + static_cast(efeat->data), static_cast(out->data), + x_length, use_deterministic_alg_only); + } else { // general kernel + SWITCH_OP(op, Op, { + cuda::SpMMCsr >( + bcast, csr, ufeat, efeat, out, NullArray(), NullArray()); + }); + } + } else if (reduce == "max") { + SWITCH_OP(op, Op, { + cuda::SpMMCsr >( + bcast, csr, ufeat, efeat, out, out_aux[0], out_aux[1]); + }); + } else if (reduce == "min") { + SWITCH_OP(op, Op, { + cuda::SpMMCsr >( + bcast, csr, ufeat, efeat, out, out_aux[0], out_aux[1]); + }); + } else { + LOG(FATAL) << "Not implemented"; + } +} + +/** + * @brief CUDA implementation of g-SpMM on Coo format. 
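+ * @note Unlike the CSR path above there is no cuSPARSE branch here: every
+ * op/reduce combination dispatches to DGL's own COO kernels (see spmm.cuh).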
+ */ +template +void SpMMCoo( + const std::string& op, const std::string& reduce, const BcastOff& bcast, + const COOMatrix& coo, NDArray ufeat, NDArray efeat, NDArray out, + std::vector out_aux) { + if (reduce == "sum") { + SWITCH_OP(op, Op, { + cuda::SpMMCoo >( + bcast, coo, ufeat, efeat, out, NullArray(), NullArray()); + }); + } else if (reduce == "max") { + SWITCH_OP(op, Op, { + cuda::SpMMCoo >( + bcast, coo, ufeat, efeat, out, out_aux[0], out_aux[1]); + }); + } else if (reduce == "min") { + SWITCH_OP(op, Op, { + cuda::SpMMCoo >( + bcast, coo, ufeat, efeat, out, out_aux[0], out_aux[1]); + }); + } else { + LOG(FATAL) << "Not implemented"; + } +} + +template void SpMMCsr( + const std::string& op, const std::string& reduce, const BcastOff& bcast, + const CSRMatrix& csr, NDArray ufeat, NDArray efeat, NDArray out, + std::vector out_aux); +template void SpMMCsr( + const std::string& op, const std::string& reduce, const BcastOff& bcast, + const CSRMatrix& csr, NDArray ufeat, NDArray efeat, NDArray out, + std::vector out_aux); +#if BF16_ENABLED +template void SpMMCsr( + const std::string& op, const std::string& reduce, const BcastOff& bcast, + const CSRMatrix& csr, NDArray ufeat, NDArray efeat, NDArray out, + std::vector out_aux); +template void SpMMCsr( + const std::string& op, const std::string& reduce, const BcastOff& bcast, + const CSRMatrix& csr, NDArray ufeat, NDArray efeat, NDArray out, + std::vector out_aux); +#endif // BF16_ENABLED +template void SpMMCsr( + const std::string& op, const std::string& reduce, const BcastOff& bcast, + const CSRMatrix& csr, NDArray ufeat, NDArray efeat, NDArray out, + std::vector out_aux); +template void SpMMCsr( + const std::string& op, const std::string& reduce, const BcastOff& bcast, + const CSRMatrix& csr, NDArray ufeat, NDArray efeat, NDArray out, + std::vector out_aux); +template void SpMMCsr( + const std::string& op, const std::string& reduce, const BcastOff& bcast, + const CSRMatrix& csr, NDArray ufeat, NDArray efeat, NDArray out, + std::vector out_aux); +template void SpMMCsr( + const std::string& op, const std::string& reduce, const BcastOff& bcast, + const CSRMatrix& csr, NDArray ufeat, NDArray efeat, NDArray out, + std::vector out_aux); + +template void SpMMCoo( + const std::string& op, const std::string& reduce, const BcastOff& bcast, + const COOMatrix& coo, NDArray ufeat, NDArray efeat, NDArray out, + std::vector out_aux); +template void SpMMCoo( + const std::string& op, const std::string& reduce, const BcastOff& bcast, + const COOMatrix& coo, NDArray ufeat, NDArray efeat, NDArray out, + std::vector out_aux); +#if BF16_ENABLED +template void SpMMCoo( + const std::string& op, const std::string& reduce, const BcastOff& bcast, + const COOMatrix& coo, NDArray ufeat, NDArray efeat, NDArray out, + std::vector out_aux); +template void SpMMCoo( + const std::string& op, const std::string& reduce, const BcastOff& bcast, + const COOMatrix& coo, NDArray ufeat, NDArray efeat, NDArray out, + std::vector out_aux); +#endif // BF16_ENABLED +template void SpMMCoo( + const std::string& op, const std::string& reduce, const BcastOff& bcast, + const COOMatrix& coo, NDArray ufeat, NDArray efeat, NDArray out, + std::vector out_aux); +template void SpMMCoo( + const std::string& op, const std::string& reduce, const BcastOff& bcast, + const COOMatrix& coo, NDArray ufeat, NDArray efeat, NDArray out, + std::vector out_aux); +template void SpMMCoo( + const std::string& op, const std::string& reduce, const BcastOff& bcast, + const COOMatrix& coo, NDArray ufeat, NDArray 
efeat, NDArray out, + std::vector out_aux); +template void SpMMCoo( + const std::string& op, const std::string& reduce, const BcastOff& bcast, + const COOMatrix& coo, NDArray ufeat, NDArray efeat, NDArray out, + std::vector out_aux); + +} // namespace aten +} // namespace dgl diff --git a/src/array/cuda/spmm.cuh b/src/array/cuda/spmm.cuh index 9ebed71cc134..3677648682cf 100644 --- a/src/array/cuda/spmm.cuh +++ b/src/array/cuda/spmm.cuh @@ -1,3 +1,4 @@ +#include "hip/hip_runtime.h" /** * Copyright (c) 2020 by Contributors * @file array/cuda/spmm.cuh @@ -35,7 +36,7 @@ inline bool cusparse_available(bool more_nnz_than_matrix_size) { return false; #else if (std::is_same::value || - std::is_same::value) + std::is_same::value) return false; // cusparse's SpMM on fp16 is slow, temporally disabled. // If the CSR matrix has more NNZ than matrix size, we should not use // cuSPARSE 11.1. @@ -47,54 +48,54 @@ namespace { /** @brief Call cuBLAS geam API for transpose operation for float and double. */ template -cublasStatus_t Xgeam( - cublasHandle_t handle, cublasOperation_t transa, cublasOperation_t transb, +hipblasStatus_t Xgeam( + hipblasHandle_t handle, hipblasOperation_t transa, hipblasOperation_t transb, int m, int n, const DType* alpha, const DType* A, int lda, const DType* beta, const DType* B, int ldb, DType* C, int ldc) { LOG(FATAL) << "Not supported dtype"; - return CUBLAS_STATUS_EXECUTION_FAILED; + return HIPBLAS_STATUS_EXECUTION_FAILED; } template <> -cublasStatus_t Xgeam<__half>( - cublasHandle_t handle, cublasOperation_t transa, cublasOperation_t transb, +hipblasStatus_t Xgeam<__half>( + hipblasHandle_t handle, hipblasOperation_t transa, hipblasOperation_t transb, int m, int n, const __half* alpha, const __half* A, int lda, const __half* beta, const __half* B, int ldb, __half* C, int ldc) { // TODO(ndickson): There is no cublasHgeam, so a different // implementation would be required. LOG(FATAL) << "Xgeam does not support dtype half (FP16)"; - return CUBLAS_STATUS_EXECUTION_FAILED; + return HIPBLAS_STATUS_EXECUTION_FAILED; } #if BF16_ENABLED template <> -cublasStatus_t Xgeam<__nv_bfloat16>( - cublasHandle_t handle, cublasOperation_t transa, cublasOperation_t transb, - int m, int n, const __nv_bfloat16* alpha, const __nv_bfloat16* A, int lda, - const __nv_bfloat16* beta, const __nv_bfloat16* B, int ldb, - __nv_bfloat16* C, int ldc) { +hipblasStatus_t Xgeam<__hip_bfloat16>( + hipblasHandle_t handle, hipblasOperation_t transa, hipblasOperation_t transb, + int m, int n, const __hip_bfloat16* alpha, const __hip_bfloat16* A, int lda, + const __hip_bfloat16* beta, const __hip_bfloat16* B, int ldb, + __hip_bfloat16* C, int ldc) { // TODO(ndickson): There is no cublasHgeam, so a different // implementation would be required. 
LOG(FATAL) << "Xgeam does not support dtype bfloat16 (BF16)"; - return CUBLAS_STATUS_EXECUTION_FAILED; + return HIPBLAS_STATUS_EXECUTION_FAILED; } #endif // BF16_ENABLED template <> -cublasStatus_t Xgeam( - cublasHandle_t handle, cublasOperation_t transa, cublasOperation_t transb, +hipblasStatus_t Xgeam( + hipblasHandle_t handle, hipblasOperation_t transa, hipblasOperation_t transb, int m, int n, const float* alpha, const float* A, int lda, const float* beta, const float* B, int ldb, float* C, int ldc) { - return cublasSgeam( + return hipblasSgeam( handle, transa, transb, m, n, alpha, A, lda, beta, B, ldb, C, ldc); } template <> -cublasStatus_t Xgeam( - cublasHandle_t handle, cublasOperation_t transa, cublasOperation_t transb, +hipblasStatus_t Xgeam( + hipblasHandle_t handle, hipblasOperation_t transa, hipblasOperation_t transb, int m, int n, const double* alpha, const double* A, int lda, const double* beta, const double* B, int ldb, double* C, int ldc) { - return cublasDgeam( + return hipblasDgeam( handle, transa, transb, m, n, alpha, A, lda, beta, B, ldb, C, ldc); } @@ -119,12 +120,12 @@ template void _Transpose(const DType* in, DType* out, int row, int col) { DType alpha = 1., beta = 0.; auto* thr_entry = runtime::CUDAThreadEntry::ThreadLocal(); - cudaStream_t stream = runtime::getCurrentCUDAStream(); + hipStream_t stream = runtime::getCurrentCUDAStream(); if (!thr_entry->cublas_handle) - CUBLAS_CALL(cublasCreate(&(thr_entry->cublas_handle))); - CUBLAS_CALL(cublasSetStream(thr_entry->cublas_handle, stream)); + CUBLAS_CALL(hipblasCreate(&(thr_entry->cublas_handle))); + CUBLAS_CALL(hipblasSetStream(thr_entry->cublas_handle, stream)); CUBLAS_CALL(Xgeam( - thr_entry->cublas_handle, CUBLAS_OP_T, CUBLAS_OP_N, row, col, &alpha, in, + thr_entry->cublas_handle, HIPBLAS_OP_T, HIPBLAS_OP_N, row, col, &alpha, in, col, &beta, nullptr, row, out, row)); } @@ -134,7 +135,7 @@ void _Transpose(const DType* in, DType* out, int row, int col) { */ template <> void _Transpose<__half>(const __half* in, __half* out, int row, int col) { - cudaStream_t stream = runtime::getCurrentCUDAStream(); + hipStream_t stream = runtime::getCurrentCUDAStream(); int nt = FindNumThreads(row); int nb = col; CUDA_KERNEL_CALL(_TransposeKernel, nb, nt, 0, stream, in, out, col, row); @@ -146,9 +147,9 @@ void _Transpose<__half>(const __half* in, __half* out, int row, int col) { * @note cuBLAS has no geam API for bf16 data type, fallback to our kernel. 
*/ template <> -void _Transpose<__nv_bfloat16>( - const __nv_bfloat16* in, __nv_bfloat16* out, int row, int col) { - cudaStream_t stream = runtime::getCurrentCUDAStream(); +void _Transpose<__hip_bfloat16>( + const __hip_bfloat16* in, __hip_bfloat16* out, int row, int col) { + hipStream_t stream = runtime::getCurrentCUDAStream(); int nt = FindNumThreads(row); int nb = col; CUDA_KERNEL_CALL(_TransposeKernel, nb, nt, 0, stream, in, out, col, row); @@ -157,36 +158,36 @@ void _Transpose<__nv_bfloat16>( #if CUDART_VERSION < 11000 template -cusparseStatus_t Xcsrmm2( - cusparseHandle_t handle, cusparseOperation_t transA, - cusparseOperation_t transB, int m, int n, int k, int nnz, - const DType* alpha, const cusparseMatDescr_t descrA, const DType* csrValA, +hipsparseStatus_t Xcsrmm2( + hipsparseHandle_t handle, hipsparseOperation_t transA, + hipsparseOperation_t transB, int m, int n, int k, int nnz, + const DType* alpha, const hipsparseMatDescr_t descrA, const DType* csrValA, const int* csrRowPtrA, const int* csrColIndA, const DType* B, int ldb, const DType* beta, DType* C, int ldc) { LOG(INFO) << "Not supported dtype"; - return CUSPARSE_STATUS_EXECUTION_FAILED; + return HIPSPARSE_STATUS_EXECUTION_FAILED; } template <> -cusparseStatus_t Xcsrmm2( - cusparseHandle_t handle, cusparseOperation_t transA, - cusparseOperation_t transB, int m, int n, int k, int nnz, - const float* alpha, const cusparseMatDescr_t descrA, const float* csrValA, +hipsparseStatus_t Xcsrmm2( + hipsparseHandle_t handle, hipsparseOperation_t transA, + hipsparseOperation_t transB, int m, int n, int k, int nnz, + const float* alpha, const hipsparseMatDescr_t descrA, const float* csrValA, const int* csrRowPtrA, const int* csrColIndA, const float* B, int ldb, const float* beta, float* C, int ldc) { - return cusparseScsrmm2( + return hipsparseScsrmm2( handle, transA, transB, m, n, k, nnz, alpha, descrA, csrValA, csrRowPtrA, csrColIndA, B, ldb, beta, C, ldc); } template <> -cusparseStatus_t Xcsrmm2( - cusparseHandle_t handle, cusparseOperation_t transA, - cusparseOperation_t transB, int m, int n, int k, int nnz, - const double* alpha, const cusparseMatDescr_t descrA, const double* csrValA, +hipsparseStatus_t Xcsrmm2( + hipsparseHandle_t handle, hipsparseOperation_t transA, + hipsparseOperation_t transB, int m, int n, int k, int nnz, + const double* alpha, const hipsparseMatDescr_t descrA, const double* csrValA, const int* csrRowPtrA, const int* csrColIndA, const double* B, int ldb, const double* beta, double* C, int ldc) { - return cusparseDcsrmm2( + return hipsparseDcsrmm2( handle, transA, transB, m, n, k, nnz, alpha, descrA, csrValA, csrRowPtrA, csrColIndA, B, ldb, beta, C, ldc); } @@ -214,12 +215,12 @@ void CusparseCsrmm2( // device auto device = runtime::DeviceAPI::Get(ctx); auto* thr_entry = runtime::CUDAThreadEntry::ThreadLocal(); - cudaStream_t stream = runtime::getCurrentCUDAStream(); + hipStream_t stream = runtime::getCurrentCUDAStream(); // allocate cusparse handle if needed if (!thr_entry->cusparse_handle) { - CUSPARSE_CALL(cusparseCreate(&(thr_entry->cusparse_handle))); + CUSPARSE_CALL(hipsparseCreate(&(thr_entry->cusparse_handle))); } - CUSPARSE_CALL(cusparseSetStream(thr_entry->cusparse_handle, stream)); + CUSPARSE_CALL(hipsparseSetStream(thr_entry->cusparse_handle, stream)); // all one data array DType* valptr = nullptr; if (!A_data) { @@ -228,54 +229,54 @@ void CusparseCsrmm2( _Fill(valptr, nnz, static_cast(1.)); } #if CUDART_VERSION >= 11000 - cusparseSpMatDescr_t matA; - cusparseDnMatDescr_t matB, matC; + 
hipsparseSpMatDescr_t matA; + hipsparseDnMatDescr_t matB, matC; constexpr auto dtype = cuda_dtype::value; constexpr auto idtype = cusparse_idtype::value; - CUSPARSE_CALL(cusparseCreateCsr( + CUSPARSE_CALL(hipsparseCreateCsr( &matA, m, k, nnz, static_cast(csr.indptr->data), static_cast(csr.indices->data), const_cast(valptr ? valptr : A_data), idtype, idtype, - CUSPARSE_INDEX_BASE_ZERO, dtype)); - CUSPARSE_CALL(cusparseCreateDnMat( - &matB, k, n, n, const_cast(B_data), dtype, CUSPARSE_ORDER_ROW)); + HIPSPARSE_INDEX_BASE_ZERO, dtype)); + CUSPARSE_CALL(hipsparseCreateDnMat( + &matB, k, n, n, const_cast(B_data), dtype, HIPSPARSE_ORDER_ROW)); CUSPARSE_CALL( - cusparseCreateDnMat(&matC, m, n, n, C_data, dtype, CUSPARSE_ORDER_ROW)); + hipsparseCreateDnMat(&matC, m, n, n, C_data, dtype, HIPSPARSE_ORDER_ROW)); - auto transA = CUSPARSE_OPERATION_NON_TRANSPOSE; - auto transB = CUSPARSE_OPERATION_NON_TRANSPOSE; + auto transA = HIPSPARSE_OPERATION_NON_TRANSPOSE; + auto transB = HIPSPARSE_OPERATION_NON_TRANSPOSE; size_t workspace_size; - cusparseSpMMAlg_t spmm_alg = use_deterministic_alg_only - ? CUSPARSE_SPMM_CSR_ALG3 - : CUSPARSE_SPMM_CSR_ALG2; - CUSPARSE_CALL(cusparseSpMM_bufferSize( + hipsparseSpMMAlg_t spmm_alg = use_deterministic_alg_only + ? HIPSPARSE_SPMM_CSR_ALG3 + : HIPSPARSE_SPMM_CSR_ALG2; + CUSPARSE_CALL(hipsparseSpMM_bufferSize( thr_entry->cusparse_handle, transA, transB, &alpha, matA, matB, &beta, matC, dtype, spmm_alg, &workspace_size)); void* workspace = device->AllocWorkspace(ctx, workspace_size); - CUSPARSE_CALL(cusparseSpMM( + CUSPARSE_CALL(hipsparseSpMM( thr_entry->cusparse_handle, transA, transB, &alpha, matA, matB, &beta, matC, dtype, spmm_alg, workspace)); device->FreeWorkspace(ctx, workspace); - CUSPARSE_CALL(cusparseDestroySpMat(matA)); - CUSPARSE_CALL(cusparseDestroyDnMat(matB)); - CUSPARSE_CALL(cusparseDestroyDnMat(matC)); + CUSPARSE_CALL(hipsparseDestroySpMat(matA)); + CUSPARSE_CALL(hipsparseDestroyDnMat(matB)); + CUSPARSE_CALL(hipsparseDestroyDnMat(matC)); #else // allocate matrix for temporary transposed output DType* trans_out = static_cast(device->AllocWorkspace(ctx, m * n * sizeof(DType))); - cusparseMatDescr_t descr; - CUSPARSE_CALL(cusparseCreateMatDescr(&descr)); - CUSPARSE_CALL(cusparseSetMatType(descr, CUSPARSE_MATRIX_TYPE_GENERAL)); - CUSPARSE_CALL(cusparseSetMatIndexBase(descr, CUSPARSE_INDEX_BASE_ZERO)); + hipsparseMatDescr_t descr; + CUSPARSE_CALL(hipsparseCreateMatDescr(&descr)); + CUSPARSE_CALL(hipsparseSetMatType(descr, HIPSPARSE_MATRIX_TYPE_GENERAL)); + CUSPARSE_CALL(hipsparseSetMatIndexBase(descr, HIPSPARSE_INDEX_BASE_ZERO)); CUSPARSE_CALL(Xcsrmm2( - thr_entry->cusparse_handle, CUSPARSE_OPERATION_NON_TRANSPOSE, - CUSPARSE_OPERATION_TRANSPOSE, m, n, k, nnz, &alpha, descr, + thr_entry->cusparse_handle, HIPSPARSE_OPERATION_NON_TRANSPOSE, + HIPSPARSE_OPERATION_TRANSPOSE, m, n, k, nnz, &alpha, descr, (valptr) ? 
valptr : A_data, static_cast(csr.indptr->data), static_cast(csr.indices->data), B_data, n, &beta, trans_out, m)); - CUSPARSE_CALL(cusparseDestroyMatDescr(descr)); + CUSPARSE_CALL(hipsparseDestroyMatDescr(descr)); // transpose the output matrix _Transpose(trans_out, C_data, n, m); device->FreeWorkspace(ctx, trans_out); @@ -287,7 +288,7 @@ void CusparseCsrmm2( template void CusparseCsrmm2Hetero( const DGLContext& ctx, const CSRMatrix& csr, const DType* B_data, - const DType* A_data, DType* C_data, int64_t x_length, cudaStream_t strm_id, + const DType* A_data, DType* C_data, int64_t x_length, hipStream_t strm_id, bool use_deterministic_alg_only = false) { // We use csrmm2 to perform following operation: // C = A x B, where A is a sparse matrix in csr format, B is the dense matrix @@ -311,9 +312,9 @@ void CusparseCsrmm2Hetero( auto* thr_entry = runtime::CUDAThreadEntry::ThreadLocal(); // allocate cusparse handle if needed if (!thr_entry->cusparse_handle) { - CUSPARSE_CALL(cusparseCreate(&(thr_entry->cusparse_handle))); + CUSPARSE_CALL(hipsparseCreate(&(thr_entry->cusparse_handle))); } - CUSPARSE_CALL(cusparseSetStream(thr_entry->cusparse_handle, strm_id)); + CUSPARSE_CALL(hipsparseSetStream(thr_entry->cusparse_handle, strm_id)); // all one data array DType* valptr = nullptr; if (!A_data) { @@ -322,50 +323,50 @@ void CusparseCsrmm2Hetero( _Fill(valptr, nnz, static_cast(1.)); } #if CUDART_VERSION >= 11000 - cusparseSpMatDescr_t matA; - cusparseDnMatDescr_t matB, matC; + hipsparseSpMatDescr_t matA; + hipsparseDnMatDescr_t matB, matC; constexpr auto dtype = cuda_dtype::value; constexpr auto idtype = cusparse_idtype::value; - CUSPARSE_CALL(cusparseCreateCsr( + CUSPARSE_CALL(hipsparseCreateCsr( &matA, m, k, nnz, static_cast(csr.indptr->data), static_cast(csr.indices->data), const_cast(valptr ? valptr : A_data), idtype, idtype, - CUSPARSE_INDEX_BASE_ZERO, dtype)); - CUSPARSE_CALL(cusparseCreateDnMat( - &matB, k, n, n, const_cast(B_data), dtype, CUSPARSE_ORDER_ROW)); + HIPSPARSE_INDEX_BASE_ZERO, dtype)); + CUSPARSE_CALL(hipsparseCreateDnMat( + &matB, k, n, n, const_cast(B_data), dtype, HIPSPARSE_ORDER_ROW)); CUSPARSE_CALL( - cusparseCreateDnMat(&matC, m, n, n, C_data, dtype, CUSPARSE_ORDER_ROW)); + hipsparseCreateDnMat(&matC, m, n, n, C_data, dtype, HIPSPARSE_ORDER_ROW)); - auto transA = CUSPARSE_OPERATION_NON_TRANSPOSE; - auto transB = CUSPARSE_OPERATION_NON_TRANSPOSE; + auto transA = HIPSPARSE_OPERATION_NON_TRANSPOSE; + auto transB = HIPSPARSE_OPERATION_NON_TRANSPOSE; size_t workspace_size; - cusparseSpMMAlg_t spmm_alg = use_deterministic_alg_only - ? CUSPARSE_SPMM_CSR_ALG3 - : CUSPARSE_SPMM_CSR_ALG2; - CUSPARSE_CALL(cusparseSpMM_bufferSize( + hipsparseSpMMAlg_t spmm_alg = use_deterministic_alg_only + ? 
HIPSPARSE_SPMM_CSR_ALG3 + : HIPSPARSE_SPMM_CSR_ALG2; + CUSPARSE_CALL(hipsparseSpMM_bufferSize( thr_entry->cusparse_handle, transA, transB, &alpha, matA, matB, &beta, matC, dtype, spmm_alg, &workspace_size)); void* workspace = device->AllocWorkspace(ctx, workspace_size); - CUSPARSE_CALL(cusparseSpMM( + CUSPARSE_CALL(hipsparseSpMM( thr_entry->cusparse_handle, transA, transB, &alpha, matA, matB, &beta, matC, dtype, spmm_alg, workspace)); device->FreeWorkspace(ctx, workspace); - CUSPARSE_CALL(cusparseDestroySpMat(matA)); - CUSPARSE_CALL(cusparseDestroyDnMat(matB)); - CUSPARSE_CALL(cusparseDestroyDnMat(matC)); + CUSPARSE_CALL(hipsparseDestroySpMat(matA)); + CUSPARSE_CALL(hipsparseDestroyDnMat(matB)); + CUSPARSE_CALL(hipsparseDestroyDnMat(matC)); #else - cusparseMatDescr_t descr; - CUSPARSE_CALL(cusparseCreateMatDescr(&descr)); - CUSPARSE_CALL(cusparseSetMatType(descr, CUSPARSE_MATRIX_TYPE_GENERAL)); - CUSPARSE_CALL(cusparseSetMatIndexBase(descr, CUSPARSE_INDEX_BASE_ZERO)); + hipsparseMatDescr_t descr; + CUSPARSE_CALL(hipsparseCreateMatDescr(&descr)); + CUSPARSE_CALL(hipsparseSetMatType(descr, HIPSPARSE_MATRIX_TYPE_GENERAL)); + CUSPARSE_CALL(hipsparseSetMatIndexBase(descr, HIPSPARSE_INDEX_BASE_ZERO)); CHECK_EQ(sizeof(IdType), sizeof(int32_t)); CUSPARSE_CALL(Xcsrmm2( - thr_entry->cusparse_handle, CUSPARSE_OPERATION_NON_TRANSPOSE, - CUSPARSE_OPERATION_TRANSPOSE, m, n, k, nnz, &alpha, descr, + thr_entry->cusparse_handle, HIPSPARSE_OPERATION_NON_TRANSPOSE, + HIPSPARSE_OPERATION_TRANSPOSE, m, n, k, nnz, &alpha, descr, (valptr) ? valptr : A_data, static_cast(csr.indptr->data), static_cast(csr.indices->data), B_data, n, &beta, C_data, m)); - CUSPARSE_CALL(cusparseDestroyMatDescr(descr)); + CUSPARSE_CALL(hipsparseDestroyMatDescr(descr)); #endif if (valptr) device->FreeWorkspace(ctx, valptr); } @@ -632,7 +633,7 @@ void SpMMCoo( */ #if BF16_ENABLED if (std::is_same::value || - std::is_same::value) + std::is_same::value) #else if (std::is_same::value) #endif // BF16_ENABLED @@ -645,7 +646,7 @@ void SpMMCoo( *efeat_data = efeat.Ptr(); DType* out_data = out.Ptr(); Idx *argu_data = argu.Ptr(), *arge_data = arge.Ptr(); - cudaStream_t stream = runtime::getCurrentCUDAStream(); + hipStream_t stream = runtime::getCurrentCUDAStream(); const int64_t N = coo.num_rows, M = coo.num_cols, E = coo.row->shape[0]; int64_t *ubcast_off = nullptr, *ebcast_off = nullptr; @@ -710,7 +711,7 @@ void SpMMCsr( Idx* argu_data = argu.Ptr(); Idx* arge_data = arge.Ptr(); - cudaStream_t stream = runtime::getCurrentCUDAStream(); + hipStream_t stream = runtime::getCurrentCUDAStream(); int64_t *ubcast_off = nullptr, *ebcast_off = nullptr; int64_t len = bcast.out_len, lhs_len = bcast.lhs_len, rhs_len = bcast.rhs_len; @@ -771,7 +772,7 @@ void SpMMCmpCsrHetero( Idx* argu_data = argu.Ptr(); Idx* arge_data = arge.Ptr(); - cudaStream_t stream = runtime::getCurrentCUDAStream(); + hipStream_t stream = runtime::getCurrentCUDAStream(); int64_t *ubcast_off = nullptr, *ebcast_off = nullptr; int64_t len = bcast.out_len, lhs_len = bcast.lhs_len, rhs_len = bcast.rhs_len; diff --git a/src/array/cuda/spmm.cuh.prehip b/src/array/cuda/spmm.cuh.prehip new file mode 100644 index 000000000000..9ebed71cc134 --- /dev/null +++ b/src/array/cuda/spmm.cuh.prehip @@ -0,0 +1,802 @@ +/** + * Copyright (c) 2020 by Contributors + * @file array/cuda/spmm.cuh + * @brief SPMM CUDA kernel function header. 
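+ *
+ * Declares the cuSPARSE csrmm2 wrappers (CusparseCsrmm2 and
+ * CusparseCsrmm2Hetero) together with DGL's own COO/CSR g-SpMM kernels that
+ * are used whenever cuSPARSE does not apply.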
+ */ +#ifndef DGL_ARRAY_CUDA_SPMM_CUH_ +#define DGL_ARRAY_CUDA_SPMM_CUH_ + +#include + +#include + +#include "../../runtime/cuda/cuda_common.h" +#include "./utils.h" +#include "atomic.cuh" +#include "bf16.cuh" +#include "fp16.cuh" +#include "macro.cuh" + +namespace dgl { + +using namespace cuda; + +namespace aten { + +/** + * @brief Determine whether cusparse SpMM function is applicable. + */ +template +inline bool cusparse_available(bool more_nnz_than_matrix_size) { +#if CUDART_VERSION < 11000 + if (std::is_same::value && + (std::is_same::value || std::is_same::value)) + return true; + return false; +#else + if (std::is_same::value || + std::is_same::value) + return false; // cusparse's SpMM on fp16 is slow, temporally disabled. + // If the CSR matrix has more NNZ than matrix size, we should not use + // cuSPARSE 11.1. + return !more_nnz_than_matrix_size; +#endif +} + +namespace { + +/** @brief Call cuBLAS geam API for transpose operation for float and double. */ +template +cublasStatus_t Xgeam( + cublasHandle_t handle, cublasOperation_t transa, cublasOperation_t transb, + int m, int n, const DType* alpha, const DType* A, int lda, + const DType* beta, const DType* B, int ldb, DType* C, int ldc) { + LOG(FATAL) << "Not supported dtype"; + return CUBLAS_STATUS_EXECUTION_FAILED; +} + +template <> +cublasStatus_t Xgeam<__half>( + cublasHandle_t handle, cublasOperation_t transa, cublasOperation_t transb, + int m, int n, const __half* alpha, const __half* A, int lda, + const __half* beta, const __half* B, int ldb, __half* C, int ldc) { + // TODO(ndickson): There is no cublasHgeam, so a different + // implementation would be required. + LOG(FATAL) << "Xgeam does not support dtype half (FP16)"; + return CUBLAS_STATUS_EXECUTION_FAILED; +} + +#if BF16_ENABLED +template <> +cublasStatus_t Xgeam<__nv_bfloat16>( + cublasHandle_t handle, cublasOperation_t transa, cublasOperation_t transb, + int m, int n, const __nv_bfloat16* alpha, const __nv_bfloat16* A, int lda, + const __nv_bfloat16* beta, const __nv_bfloat16* B, int ldb, + __nv_bfloat16* C, int ldc) { + // TODO(ndickson): There is no cublasHgeam, so a different + // implementation would be required. + LOG(FATAL) << "Xgeam does not support dtype bfloat16 (BF16)"; + return CUBLAS_STATUS_EXECUTION_FAILED; +} +#endif // BF16_ENABLED + +template <> +cublasStatus_t Xgeam( + cublasHandle_t handle, cublasOperation_t transa, cublasOperation_t transb, + int m, int n, const float* alpha, const float* A, int lda, + const float* beta, const float* B, int ldb, float* C, int ldc) { + return cublasSgeam( + handle, transa, transb, m, n, alpha, A, lda, beta, B, ldb, C, ldc); +} + +template <> +cublasStatus_t Xgeam( + cublasHandle_t handle, cublasOperation_t transa, cublasOperation_t transb, + int m, int n, const double* alpha, const double* A, int lda, + const double* beta, const double* B, int ldb, double* C, int ldc) { + return cublasDgeam( + handle, transa, transb, m, n, alpha, A, lda, beta, B, ldb, C, ldc); +} + +/** + * @brief Transpose operator kernel implementation. + * @note not efficient but it's not a bottleneck, used for float16 dtype. + */ +template +__global__ void _TransposeKernel( + const DType* __restrict__ in, DType* __restrict__ out, int n, int m) { + int i = blockIdx.x; + for (int j = threadIdx.x; j < m; j += blockDim.x) + out[i * m + j] = in[j * n + i]; +} + +/** + * @brief Tranpose the input matrix. + * @param row number of rows of input matrix. + * @param col number of columns of input matrix. 
+ */ +template +void _Transpose(const DType* in, DType* out, int row, int col) { + DType alpha = 1., beta = 0.; + auto* thr_entry = runtime::CUDAThreadEntry::ThreadLocal(); + cudaStream_t stream = runtime::getCurrentCUDAStream(); + if (!thr_entry->cublas_handle) + CUBLAS_CALL(cublasCreate(&(thr_entry->cublas_handle))); + CUBLAS_CALL(cublasSetStream(thr_entry->cublas_handle, stream)); + CUBLAS_CALL(Xgeam( + thr_entry->cublas_handle, CUBLAS_OP_T, CUBLAS_OP_N, row, col, &alpha, in, + col, &beta, nullptr, row, out, row)); +} + +/** + * @brief Tranpose the input matrix for data type half. + * @note cuBLAS has no geam API for half data type, fallback to our kernel. + */ +template <> +void _Transpose<__half>(const __half* in, __half* out, int row, int col) { + cudaStream_t stream = runtime::getCurrentCUDAStream(); + int nt = FindNumThreads(row); + int nb = col; + CUDA_KERNEL_CALL(_TransposeKernel, nb, nt, 0, stream, in, out, col, row); +} + +#if BF16_ENABLED +/** + * @brief Tranpose the input matrix for data type half. + * @note cuBLAS has no geam API for bf16 data type, fallback to our kernel. + */ +template <> +void _Transpose<__nv_bfloat16>( + const __nv_bfloat16* in, __nv_bfloat16* out, int row, int col) { + cudaStream_t stream = runtime::getCurrentCUDAStream(); + int nt = FindNumThreads(row); + int nb = col; + CUDA_KERNEL_CALL(_TransposeKernel, nb, nt, 0, stream, in, out, col, row); +} +#endif // BF16_ENABLED + +#if CUDART_VERSION < 11000 +template +cusparseStatus_t Xcsrmm2( + cusparseHandle_t handle, cusparseOperation_t transA, + cusparseOperation_t transB, int m, int n, int k, int nnz, + const DType* alpha, const cusparseMatDescr_t descrA, const DType* csrValA, + const int* csrRowPtrA, const int* csrColIndA, const DType* B, int ldb, + const DType* beta, DType* C, int ldc) { + LOG(INFO) << "Not supported dtype"; + return CUSPARSE_STATUS_EXECUTION_FAILED; +} + +template <> +cusparseStatus_t Xcsrmm2( + cusparseHandle_t handle, cusparseOperation_t transA, + cusparseOperation_t transB, int m, int n, int k, int nnz, + const float* alpha, const cusparseMatDescr_t descrA, const float* csrValA, + const int* csrRowPtrA, const int* csrColIndA, const float* B, int ldb, + const float* beta, float* C, int ldc) { + return cusparseScsrmm2( + handle, transA, transB, m, n, k, nnz, alpha, descrA, csrValA, csrRowPtrA, + csrColIndA, B, ldb, beta, C, ldc); +} + +template <> +cusparseStatus_t Xcsrmm2( + cusparseHandle_t handle, cusparseOperation_t transA, + cusparseOperation_t transB, int m, int n, int k, int nnz, + const double* alpha, const cusparseMatDescr_t descrA, const double* csrValA, + const int* csrRowPtrA, const int* csrColIndA, const double* B, int ldb, + const double* beta, double* C, int ldc) { + return cusparseDcsrmm2( + handle, transA, transB, m, n, k, nnz, alpha, descrA, csrValA, csrRowPtrA, + csrColIndA, B, ldb, beta, C, ldc); +} +#endif + +/** Cusparse implementation of SpMM on Csr format. */ +template +void CusparseCsrmm2( + const DGLContext& ctx, const CSRMatrix& csr, const DType* B_data, + const DType* A_data, DType* C_data, int x_length, + bool use_deterministic_alg_only = false) { + // We use csrmm2 to perform following operation: + // C = A x B, where A is a sparse matrix in csr format, B is the dense matrix + // for node feature tensor. However, since cusparse only supports + // column-major, while our tensor is stored in row-major, the actual + // computation is: C = trans(A x trans(B)). 
Currently, we use cublasXgeam to + // implement transposition and allocate intermediate workspace memory for + // this. + const int m = csr.num_rows; + const int n = x_length; + const int k = csr.num_cols; + const int nnz = csr.indices->shape[0]; + const DType alpha = 1.0; + const DType beta = 0.0; + // device + auto device = runtime::DeviceAPI::Get(ctx); + auto* thr_entry = runtime::CUDAThreadEntry::ThreadLocal(); + cudaStream_t stream = runtime::getCurrentCUDAStream(); + // allocate cusparse handle if needed + if (!thr_entry->cusparse_handle) { + CUSPARSE_CALL(cusparseCreate(&(thr_entry->cusparse_handle))); + } + CUSPARSE_CALL(cusparseSetStream(thr_entry->cusparse_handle, stream)); + // all one data array + DType* valptr = nullptr; + if (!A_data) { + valptr = + static_cast(device->AllocWorkspace(ctx, nnz * sizeof(DType))); + _Fill(valptr, nnz, static_cast(1.)); + } +#if CUDART_VERSION >= 11000 + cusparseSpMatDescr_t matA; + cusparseDnMatDescr_t matB, matC; + constexpr auto dtype = cuda_dtype::value; + constexpr auto idtype = cusparse_idtype::value; + CUSPARSE_CALL(cusparseCreateCsr( + &matA, m, k, nnz, static_cast(csr.indptr->data), + static_cast(csr.indices->data), + const_cast(valptr ? valptr : A_data), idtype, idtype, + CUSPARSE_INDEX_BASE_ZERO, dtype)); + CUSPARSE_CALL(cusparseCreateDnMat( + &matB, k, n, n, const_cast(B_data), dtype, CUSPARSE_ORDER_ROW)); + CUSPARSE_CALL( + cusparseCreateDnMat(&matC, m, n, n, C_data, dtype, CUSPARSE_ORDER_ROW)); + + auto transA = CUSPARSE_OPERATION_NON_TRANSPOSE; + auto transB = CUSPARSE_OPERATION_NON_TRANSPOSE; + size_t workspace_size; + cusparseSpMMAlg_t spmm_alg = use_deterministic_alg_only + ? CUSPARSE_SPMM_CSR_ALG3 + : CUSPARSE_SPMM_CSR_ALG2; + CUSPARSE_CALL(cusparseSpMM_bufferSize( + thr_entry->cusparse_handle, transA, transB, &alpha, matA, matB, &beta, + matC, dtype, spmm_alg, &workspace_size)); + void* workspace = device->AllocWorkspace(ctx, workspace_size); + CUSPARSE_CALL(cusparseSpMM( + thr_entry->cusparse_handle, transA, transB, &alpha, matA, matB, &beta, + matC, dtype, spmm_alg, workspace)); + device->FreeWorkspace(ctx, workspace); + + CUSPARSE_CALL(cusparseDestroySpMat(matA)); + CUSPARSE_CALL(cusparseDestroyDnMat(matB)); + CUSPARSE_CALL(cusparseDestroyDnMat(matC)); +#else + // allocate matrix for temporary transposed output + DType* trans_out = + static_cast(device->AllocWorkspace(ctx, m * n * sizeof(DType))); + + cusparseMatDescr_t descr; + CUSPARSE_CALL(cusparseCreateMatDescr(&descr)); + CUSPARSE_CALL(cusparseSetMatType(descr, CUSPARSE_MATRIX_TYPE_GENERAL)); + CUSPARSE_CALL(cusparseSetMatIndexBase(descr, CUSPARSE_INDEX_BASE_ZERO)); + CUSPARSE_CALL(Xcsrmm2( + thr_entry->cusparse_handle, CUSPARSE_OPERATION_NON_TRANSPOSE, + CUSPARSE_OPERATION_TRANSPOSE, m, n, k, nnz, &alpha, descr, + (valptr) ? valptr : A_data, static_cast(csr.indptr->data), + static_cast(csr.indices->data), B_data, n, &beta, trans_out, + m)); + CUSPARSE_CALL(cusparseDestroyMatDescr(descr)); + // transpose the output matrix + _Transpose(trans_out, C_data, n, m); + device->FreeWorkspace(ctx, trans_out); +#endif + if (valptr) device->FreeWorkspace(ctx, valptr); +} + +/** Cusparse implementation of SpMM on Csr format. 
*/ +template +void CusparseCsrmm2Hetero( + const DGLContext& ctx, const CSRMatrix& csr, const DType* B_data, + const DType* A_data, DType* C_data, int64_t x_length, cudaStream_t strm_id, + bool use_deterministic_alg_only = false) { + // We use csrmm2 to perform following operation: + // C = A x B, where A is a sparse matrix in csr format, B is the dense matrix + // for node feature tensor. However, since cusparse only supports + // column-major, while our tensor is stored in row-major, the actual + // computation is: C = trans(A x trans(B)). Currently, we use cublasXgeam to + // implement transposition and allocate intermediate workspace memory for + // this. + int int_maxlimit = std::numeric_limits::max(); + CHECK_GE(int_maxlimit, (csr.num_rows)); + CHECK_GE(int_maxlimit, csr.num_cols); + CHECK_GE(int_maxlimit, csr.indices->shape[0]); + const int m = csr.num_rows; + const int n = x_length; + const int k = csr.num_cols; + const int nnz = csr.indices->shape[0]; + const DType alpha = 1.0; + const DType beta = 1.0; + // device + auto device = runtime::DeviceAPI::Get(ctx); + auto* thr_entry = runtime::CUDAThreadEntry::ThreadLocal(); + // allocate cusparse handle if needed + if (!thr_entry->cusparse_handle) { + CUSPARSE_CALL(cusparseCreate(&(thr_entry->cusparse_handle))); + } + CUSPARSE_CALL(cusparseSetStream(thr_entry->cusparse_handle, strm_id)); + // all one data array + DType* valptr = nullptr; + if (!A_data) { + valptr = + static_cast(device->AllocWorkspace(ctx, nnz * sizeof(DType))); + _Fill(valptr, nnz, static_cast(1.)); + } +#if CUDART_VERSION >= 11000 + cusparseSpMatDescr_t matA; + cusparseDnMatDescr_t matB, matC; + constexpr auto dtype = cuda_dtype::value; + constexpr auto idtype = cusparse_idtype::value; + CUSPARSE_CALL(cusparseCreateCsr( + &matA, m, k, nnz, static_cast(csr.indptr->data), + static_cast(csr.indices->data), + const_cast(valptr ? valptr : A_data), idtype, idtype, + CUSPARSE_INDEX_BASE_ZERO, dtype)); + CUSPARSE_CALL(cusparseCreateDnMat( + &matB, k, n, n, const_cast(B_data), dtype, CUSPARSE_ORDER_ROW)); + CUSPARSE_CALL( + cusparseCreateDnMat(&matC, m, n, n, C_data, dtype, CUSPARSE_ORDER_ROW)); + + auto transA = CUSPARSE_OPERATION_NON_TRANSPOSE; + auto transB = CUSPARSE_OPERATION_NON_TRANSPOSE; + size_t workspace_size; + cusparseSpMMAlg_t spmm_alg = use_deterministic_alg_only + ? CUSPARSE_SPMM_CSR_ALG3 + : CUSPARSE_SPMM_CSR_ALG2; + CUSPARSE_CALL(cusparseSpMM_bufferSize( + thr_entry->cusparse_handle, transA, transB, &alpha, matA, matB, &beta, + matC, dtype, spmm_alg, &workspace_size)); + void* workspace = device->AllocWorkspace(ctx, workspace_size); + CUSPARSE_CALL(cusparseSpMM( + thr_entry->cusparse_handle, transA, transB, &alpha, matA, matB, &beta, + matC, dtype, spmm_alg, workspace)); + device->FreeWorkspace(ctx, workspace); + + CUSPARSE_CALL(cusparseDestroySpMat(matA)); + CUSPARSE_CALL(cusparseDestroyDnMat(matB)); + CUSPARSE_CALL(cusparseDestroyDnMat(matC)); +#else + cusparseMatDescr_t descr; + CUSPARSE_CALL(cusparseCreateMatDescr(&descr)); + CUSPARSE_CALL(cusparseSetMatType(descr, CUSPARSE_MATRIX_TYPE_GENERAL)); + CUSPARSE_CALL(cusparseSetMatIndexBase(descr, CUSPARSE_INDEX_BASE_ZERO)); + CHECK_EQ(sizeof(IdType), sizeof(int32_t)); + CUSPARSE_CALL(Xcsrmm2( + thr_entry->cusparse_handle, CUSPARSE_OPERATION_NON_TRANSPOSE, + CUSPARSE_OPERATION_TRANSPOSE, m, n, k, nnz, &alpha, descr, + (valptr) ? 
valptr : A_data, static_cast(csr.indptr->data), + static_cast(csr.indices->data), B_data, n, &beta, C_data, m)); + CUSPARSE_CALL(cusparseDestroyMatDescr(descr)); +#endif + if (valptr) device->FreeWorkspace(ctx, valptr); +} + +} // namespace + +#define SWITCH_OP(op, Op, ...) \ + do { \ + if ((op) == "add") { \ + typedef cuda::binary::Add Op; \ + { __VA_ARGS__ } \ + } else if ((op) == "sub") { \ + typedef cuda::binary::Sub Op; \ + { __VA_ARGS__ } \ + } else if ((op) == "mul") { \ + typedef cuda::binary::Mul Op; \ + { __VA_ARGS__ } \ + } else if ((op) == "div") { \ + typedef cuda::binary::Div Op; \ + { __VA_ARGS__ } \ + } else if ((op) == "copy_lhs") { \ + typedef cuda::binary::CopyLhs Op; \ + { __VA_ARGS__ } \ + } else if ((op) == "copy_rhs") { \ + typedef cuda::binary::CopyRhs Op; \ + { __VA_ARGS__ } \ + } else { \ + LOG(FATAL) << "Unsupported SpMM binary operator: " << op; \ + } \ + } while (0) + +namespace cuda { + +/** + * @brief CUDA kernel of g-SpMM on Coo format. + * @note it uses edge parallel strategy, different threadblocks (on y-axis) + * is responsible for the computation on different edges. Threadblocks + * on the x-axis are responsible for the computation on different + * positions in feature dimension. To avoid possible data hazards, it uses + * atomic operators for reduction. + */ +template < + typename Idx, typename DType, typename BinaryOp, typename ReduceOp, + bool UseBcast = false, bool UseIdx = false> +__global__ void SpMMCooKernel( + const DType* __restrict__ ufeat, const DType* __restrict__ efeat, + DType* __restrict__ out, Idx* __restrict__ arg_u, Idx* __restrict__ arg_e, + const Idx* __restrict__ row, const Idx* __restrict__ col, + const Idx* __restrict__ edge_map, int64_t N, int64_t M, int64_t E, + const int64_t* __restrict__ ubcast_off, + const int64_t* __restrict__ ebcast_off, int64_t ufeat_len, + int64_t efeat_len, int64_t out_len) { + // SPMM with COO. + Idx ty = blockIdx.y * blockDim.y + threadIdx.y; + const Idx stride_y = blockDim.y * gridDim.y; + while (ty < E) { + const Idx src = _ldg(row + ty); + const Idx dst = _ldg(col + ty); + const Idx eid = UseIdx ? _ldg(edge_map + ty) : ty; + int64_t tx = blockIdx.x * blockDim.x + threadIdx.x; + const int64_t stride_x = blockDim.x * gridDim.x; + const DType* uoff = BinaryOp::use_lhs ? (ufeat + src * ufeat_len) : nullptr; + const DType* eoff = BinaryOp::use_rhs ? (efeat + eid * efeat_len) : nullptr; + DType* outoff = out + dst * out_len; + while (tx < out_len) { + const int64_t lhs_add = UseBcast ? ubcast_off[tx] : tx; + const int64_t rhs_add = UseBcast ? ebcast_off[tx] : tx; + DType val = BinaryOp::Call(uoff + lhs_add, eoff + rhs_add); + Idx* arguoff = nullptr; // arguoff is not used in SpMMCoo. + Idx* argeoff = nullptr; // argeoff is not used in SpMMCoo. + ReduceOp::Call(outoff + tx, arguoff, argeoff, val, src, eid); + tx += stride_x; + } + ty += stride_y; + } +} + +/** + * @brief CUDA kernel to compute argu and arge in g-SpMM on Coo format. + * @note it uses edge parallel strategy, different threadblocks (on y-axis) + * is responsible for the computation on different edges. Threadblocks + * on the x-axis are responsible for the computation on different + * positions in feature dimension. 
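+ * It runs after SpMMCooKernel has produced the reduced output: for each
+ * output position it records which source node (arg_u) and which edge
+ * (arg_e) attains the stored max/min value.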
+ */ +template < + typename Idx, typename DType, typename BinaryOp, typename ReduceOp, + bool UseBcast = false, bool UseIdx = false> +__global__ void ArgSpMMCooKernel( + const DType* __restrict__ ufeat, const DType* __restrict__ efeat, + DType* __restrict__ out, Idx* __restrict__ arg_u, Idx* __restrict__ arg_e, + const Idx* __restrict__ row, const Idx* __restrict__ col, + const Idx* __restrict__ edge_map, int64_t N, int64_t M, int64_t E, + const int64_t* __restrict__ ubcast_off, + const int64_t* __restrict__ ebcast_off, int64_t ufeat_len, + int64_t efeat_len, int64_t out_len) { + // SPMM with COO arg max/min. + Idx ty = blockIdx.y * blockDim.y + threadIdx.y; + const Idx stride_y = blockDim.y * gridDim.y; + while (ty < E) { + const Idx src = _ldg(row + ty); + const Idx dst = _ldg(col + ty); + const Idx eid = UseIdx ? _ldg(edge_map + ty) : ty; + int64_t tx = blockIdx.x * blockDim.x + threadIdx.x; + const int64_t stride_x = blockDim.x * gridDim.x; + const DType* uoff = BinaryOp::use_lhs ? (ufeat + src * ufeat_len) : nullptr; + const DType* eoff = BinaryOp::use_rhs ? (efeat + eid * efeat_len) : nullptr; + const DType* outoff = out + dst * out_len; + Idx* arguoff = BinaryOp::use_lhs ? (arg_u + dst * out_len) : nullptr; + Idx* argeoff = BinaryOp::use_rhs ? (arg_e + dst * out_len) : nullptr; + while (tx < out_len) { + int64_t lhs_add = UseBcast ? ubcast_off[tx] : tx; + int64_t rhs_add = UseBcast ? ebcast_off[tx] : tx; + DType val = BinaryOp::Call(uoff + lhs_add, eoff + rhs_add); + ReduceOp::CallArg(tx, arguoff, argeoff, val, outoff[tx], src, eid); + tx += stride_x; + } + ty += stride_y; + } +} + +/** + * @brief CUDA kernel of g-SpMM on Csr format. + * @note it uses node parallel strategy, different threadblocks (on y-axis) + * is responsible for the computation on different destination nodes. + * Threadblocks on the x-axis are responsible for the computation on + * different positions in feature dimension. + */ +template < + typename Idx, typename DType, typename BinaryOp, typename ReduceOp, + bool UseBcast = false, bool UseIdx = false> +__global__ void SpMMCsrKernel( + const DType* __restrict__ ufeat, const DType* __restrict__ efeat, + DType* __restrict__ out, Idx* __restrict__ arg_u, Idx* __restrict__ arg_e, + const Idx* __restrict__ indptr, const Idx* __restrict__ indices, + const Idx* __restrict__ edge_map, int64_t num_rows, int64_t num_cols, + const int64_t* __restrict__ ubcast_off, + const int64_t* __restrict__ ebcast_off, int64_t ufeat_len, + int64_t efeat_len, int64_t out_len) { + // SPMM with CSR. + int ty = blockIdx.x * blockDim.y + threadIdx.y; + const Idx stride_y = blockDim.y * gridDim.x; + const int stride_x = blockDim.x * gridDim.y; + while (ty < num_rows) { + int tx = blockIdx.y * blockDim.x + threadIdx.x; + while (tx < out_len) { + typename accum_dtype::type local_accum = ReduceOp::zero(); + Idx local_argu = 0, local_arge = 0; + const int lhs_add = UseBcast ? ubcast_off[tx] : tx; + const int rhs_add = UseBcast ? ebcast_off[tx] : tx; + for (Idx i = indptr[ty]; i < indptr[ty + 1]; ++i) { + const Idx eid = UseIdx ? _ldg(edge_map + i) : i; + const Idx cid = _ldg(indices + i); + const DType* uoff = + BinaryOp::use_lhs ? (ufeat + cid * ufeat_len) : nullptr; + const DType* eoff = + BinaryOp::use_rhs ? (efeat + eid * efeat_len) : nullptr; + DType out = BinaryOp::Call(uoff + lhs_add, eoff + rhs_add); + ReduceOp::Call(&local_accum, &local_argu, &local_arge, out, cid, eid); + } + // The use of += is to compute cross-type reducing on heterogeneous graph + // when reduce op is `sum`. 
+ // C = SpMM(SpA, B) + C + // Separate kernel `SpMMCmpCsrHeteroKernel` is used for max- and + // min-reducer. It does not affect the output on homogeneous graph as + // `out` is initialized to zero. + out[ty * out_len + tx] += static_cast(local_accum); + if (ReduceOp::require_arg && BinaryOp::use_lhs) + arg_u[ty * out_len + tx] = local_argu; + if (ReduceOp::require_arg && BinaryOp::use_rhs) + arg_e[ty * out_len + tx] = local_arge; + tx += stride_x; + } + ty += stride_y; + } +} + +/** + * @brief CUDA kernel of SpMM-Min/Max on Csr format. + * @note it uses node parallel strategy, different threadblocks (on y-axis) + * is responsible for the computation on different destination nodes. + * Threadblocks on the x-axis are responsible for the computation on + * different positions in feature dimension. + */ +template < + typename Idx, typename DType, typename BinaryOp, typename ReduceOp, + bool UseBcast = false, bool UseIdx = false> +__global__ void SpMMCmpCsrHeteroKernel( + const DType* __restrict__ ufeat, const DType* __restrict__ efeat, + DType* __restrict__ out, Idx* __restrict__ arg_u, Idx* __restrict__ arg_e, + Idx* __restrict__ arg_u_ntype, Idx* __restrict__ arg_e_etype, + const Idx* __restrict__ indptr, const Idx* __restrict__ indices, + const Idx* __restrict__ edge_map, int64_t num_rows, int64_t num_cols, + const int64_t* __restrict__ ubcast_off, + const int64_t* __restrict__ ebcast_off, int64_t ufeat_len, + int64_t efeat_len, int64_t out_len, const int src_type, const int etype) { + // SPMM with CSR. + int ty = blockIdx.y * blockDim.y + threadIdx.y; + const Idx stride_y = blockDim.y * gridDim.y; + const int stride_x = blockDim.x * gridDim.x; + while (ty < num_rows) { + int tx = blockIdx.x * blockDim.x + threadIdx.x; + while (tx < out_len) { + using accum_type = typename accum_dtype::type; + accum_type local_accum = + static_cast(out[ty * out_len + tx]); // ReduceOp::zero(); + Idx local_argu = 0, local_arge = 0; + const int lhs_add = UseBcast ? ubcast_off[tx] : tx; + const int rhs_add = UseBcast ? ebcast_off[tx] : tx; + for (Idx i = indptr[ty]; i < indptr[ty + 1]; ++i) { + const Idx eid = UseIdx ? _ldg(edge_map + i) : i; + const Idx cid = _ldg(indices + i); + const DType* uoff = + BinaryOp::use_lhs ? (ufeat + cid * ufeat_len) : nullptr; + const DType* eoff = + BinaryOp::use_rhs ? (efeat + eid * efeat_len) : nullptr; + DType tmp_out = BinaryOp::Call(uoff + lhs_add, eoff + rhs_add); + ReduceOp::Call( + &local_accum, &local_argu, &local_arge, tmp_out, cid, eid); + } + // Update output only when max/min values are different that original + // output + DType new_out = static_cast(local_accum); + if (out[ty * out_len + tx] != new_out) { + out[ty * out_len + tx] = new_out; + if (ReduceOp::require_arg && BinaryOp::use_lhs) { + arg_u[ty * out_len + tx] = local_argu; + arg_u_ntype[ty * out_len + tx] = src_type; + } + if (ReduceOp::require_arg && BinaryOp::use_rhs) { + arg_e[ty * out_len + tx] = local_arge; + arg_e_etype[ty * out_len + tx] = etype; + } + } + tx += stride_x; + } + ty += stride_y; + } +} + +/** + * @brief CUDA implementation of g-SpMM on Coo format. + * @param bcast Broadcast information. + * @param coo The Coo matrix. + * @param ufeat The feature on source nodes. + * @param efeat The feature on edges. + * @param out The result feature on destination nodes. + * @param argu Arg-Min/Max on source nodes, which refers the source node indices + * correspond to the minimum/maximum values of reduction result on + * destination nodes. 
It's useful in computing gradients of Min/Max + * reducer. + * @param arge Arg-Min/Max on edges. which refers the source node indices + * correspond to the minimum/maximum values of reduction result on + * destination nodes. It's useful in computing gradients of Min/Max + * reducer. + */ +template +void SpMMCoo( + const BcastOff& bcast, const COOMatrix& coo, NDArray ufeat, NDArray efeat, + NDArray out, NDArray argu, NDArray arge) { + /** + * TODO(Xin): Disable half precision for SpMMCoo due to the round-off error. + * We should use fp32 for the accumulation but it's hard to modify the + * current implementation. + */ +#if BF16_ENABLED + if (std::is_same::value || + std::is_same::value) +#else + if (std::is_same::value) +#endif // BF16_ENABLED + LOG(FATAL) << "SpMMCoo doesn't support half precision fow now. " + << "Please use SpMMCsr instead by allowing the graph " + << "materialize CSR/CSC formats."; + const Idx *row = coo.row.Ptr(), *col = coo.col.Ptr(), + *edge_map = coo.data.Ptr(); + const DType *ufeat_data = ufeat.Ptr(), + *efeat_data = efeat.Ptr(); + DType* out_data = out.Ptr(); + Idx *argu_data = argu.Ptr(), *arge_data = arge.Ptr(); + cudaStream_t stream = runtime::getCurrentCUDAStream(); + const int64_t N = coo.num_rows, M = coo.num_cols, E = coo.row->shape[0]; + + int64_t *ubcast_off = nullptr, *ebcast_off = nullptr; + int64_t len = bcast.out_len, lhs_len = bcast.lhs_len, rhs_len = bcast.rhs_len; + + int64_t out_size = out.NumElements(); + const int nt = FindNumThreads(out_size); + const int nb = (out_size + nt - 1) / nt; + CUDA_KERNEL_CALL( + _FillKernel, nb, nt, 0, stream, out_data, out_size, ReduceOp::zero()); + + const int ntx = FindNumThreads(len); + const int nty = CUDA_MAX_NUM_THREADS / ntx; + const int nbx = (len + ntx - 1) / ntx; + const int nby = FindNumBlocks<'y'>((E + nty - 1) / nty); + const dim3 nblks(nbx, nby); + const dim3 nthrs(ntx, nty); + const bool use_idx = !IsNullArray(coo.data); + + BCAST_IDX_CTX_SWITCH(bcast, use_idx, ufeat->ctx, ubcast_off, ebcast_off, { + CUDA_KERNEL_CALL( + (SpMMCooKernel), + nblks, nthrs, 0, stream, ufeat_data, efeat_data, out_data, argu_data, + arge_data, row, col, edge_map, N, M, E, ubcast_off, ebcast_off, lhs_len, + rhs_len, len); + if (ReduceOp::require_arg) { + CUDA_KERNEL_CALL( + (ArgSpMMCooKernel), + nblks, nthrs, 0, stream, ufeat_data, efeat_data, out_data, argu_data, + arge_data, row, col, edge_map, N, M, E, ubcast_off, ebcast_off, + lhs_len, rhs_len, len); + } + }); +} + +/** + * @brief CUDA implementation of g-SpMM on Csr format. + * @param bcast Broadcast information. + * @param csr The Csr matrix. + * @param ufeat The feature on source nodes. + * @param efeat The feature on edges. + * @param out The result feature on destination nodes. + * @param argu Arg-Min/Max on source nodes, which refers the source node indices + * correspond to the minimum/maximum values of reduction result on + * destination nodes. It's useful in computing gradients of Min/Max + * reducer. + * @param arge Arg-Min/Max on edges. which refers the source node indices + * correspond to the minimum/maximum values of reduction result on + * destination nodes. It's useful in computing gradients of Min/Max + * reducer. 
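SpMMCoo and SpMMCsr size their launches by packing feature positions and edges/rows into a 2D grid, capping thread and block counts at device limits. A hedged host-side sketch of that arithmetic, with kMaxThreads/kMaxBlocksY as illustrative caps standing in for CUDA_MAX_NUM_THREADS and the FindNumBlocks limits:

#include <algorithm>
#include <cstdint>
#include <cstdio>
#include <utility>

// Illustrative caps; the real values come from src/array/cuda/utils.h.
constexpr int kMaxThreads = 256;
constexpr int kMaxBlocksY = 65535;

// Largest power of two <= min(dim, kMaxThreads), mirroring FindNumThreads.
int FindNumThreadsSketch(int dim) {
  if (dim <= 0) return 1;
  int ret = kMaxThreads;
  while (ret > dim) ret >>= 1;
  return ret;
}

// Returns {nbx, nby}: feature positions on x, edges/rows on y.
std::pair<int, int> LaunchGrid(int64_t len, int64_t rows) {
  const int ntx = FindNumThreadsSketch(static_cast<int>(len));
  const int nty = kMaxThreads / ntx;
  const int nbx = static_cast<int>((len + ntx - 1) / ntx);
  const int nby = static_cast<int>(
      std::min<int64_t>((rows + nty - 1) / nty, kMaxBlocksY));
  return {nbx, nby};
}

int main() {
  auto grid = LaunchGrid(/*len=*/100, /*rows=*/1 << 20);
  std::printf("nbx=%d nby=%d\n", grid.first, grid.second);
  return 0;
}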
+ */ +template +void SpMMCsr( + const BcastOff& bcast, const CSRMatrix& csr, NDArray ufeat, NDArray efeat, + NDArray out, NDArray argu, NDArray arge) { + const Idx* indptr = csr.indptr.Ptr(); + const Idx* indices = csr.indices.Ptr(); + const Idx* edge_map = csr.data.Ptr(); + const DType* ufeat_data = ufeat.Ptr(); + const DType* efeat_data = efeat.Ptr(); + DType* out_data = out.Ptr(); + Idx* argu_data = argu.Ptr(); + Idx* arge_data = arge.Ptr(); + + cudaStream_t stream = runtime::getCurrentCUDAStream(); + + int64_t *ubcast_off = nullptr, *ebcast_off = nullptr; + int64_t len = bcast.out_len, lhs_len = bcast.lhs_len, rhs_len = bcast.rhs_len; + const int ntx = FindNumThreads(len); + const int nty = CUDA_MAX_NUM_THREADS / ntx; + const int nby = (len + ntx - 1) / ntx; + const int nbx = FindNumBlocks<'x'>((csr.num_rows + nty - 1) / nty); + const dim3 nblks(nbx, nby); + const dim3 nthrs(ntx, nty); + const bool use_idx = !IsNullArray(csr.data); + + BCAST_IDX_CTX_SWITCH( + bcast, use_idx, ufeat->ctx, ubcast_off, ebcast_off, + {CUDA_KERNEL_CALL( + (SpMMCsrKernel), + nblks, nthrs, 0, stream, ufeat_data, efeat_data, out_data, argu_data, + arge_data, indptr, indices, edge_map, csr.num_rows, csr.num_cols, + ubcast_off, ebcast_off, lhs_len, rhs_len, len)}); +} + +/** + * @brief CUDA kernel of SpMM-Min/Max on Csr format on heterogeneous graph. + * @param bcast Broadcast information. + * @param csr The Csr matrix. + * @param ufeat The feature on source nodes. + * @param efeat The feature on edges. + * @param out The result feature on destination nodes. + * @param argu Arg-Min/Max on source nodes, which refers the source node indices + * correspond to the minimum/maximum values of reduction result on + * destination nodes. It's useful in computing gradients of Min/Max + * reducer. + * @param arge Arg-Min/Max on edges. which refers the source node indices + * correspond to the minimum/maximum values of reduction result on + * destination nodes. It's useful in computing gradients of Min/Max + * reducer. + * @param argu_ntype Node type of the arg-Min/Max on source nodes, which refers + * the source node types correspond to the minimum/maximum values of reduction + * result on destination nodes. It's useful in computing gradients of Min/Max + * reducer. + * @param arge_etype Edge-type of the arg-Min/Max on edges. which refers the + * source node indices correspond to the minimum/maximum values of reduction + * result on destination nodes. It's useful in computing gradients of Min/Max + * reducer. 
+ * @param src_type Node type of the source nodes of an etype + * @param etype Edge type + */ +template +void SpMMCmpCsrHetero( + const BcastOff& bcast, const CSRMatrix& csr, NDArray ufeat, NDArray efeat, + NDArray out, NDArray argu, NDArray arge, NDArray argu_ntype, + NDArray arge_etype, const int src_type, const int etype) { + const Idx* indptr = csr.indptr.Ptr(); + const Idx* indices = csr.indices.Ptr(); + const Idx* edge_map = csr.data.Ptr(); + const DType* ufeat_data = ufeat.Ptr(); + const DType* efeat_data = efeat.Ptr(); + DType* out_data = out.Ptr(); + Idx* argu_data = argu.Ptr(); + Idx* arge_data = arge.Ptr(); + + cudaStream_t stream = runtime::getCurrentCUDAStream(); + + int64_t *ubcast_off = nullptr, *ebcast_off = nullptr; + int64_t len = bcast.out_len, lhs_len = bcast.lhs_len, rhs_len = bcast.rhs_len; + const int ntx = FindNumThreads(len); + const int nty = CUDA_MAX_NUM_THREADS / ntx; + const int nbx = (len + ntx - 1) / ntx; + const int nby = FindNumBlocks<'y'>((csr.num_rows + nty - 1) / nty); + const dim3 nblks(nbx, nby); + const dim3 nthrs(ntx, nty); + const bool use_idx = !IsNullArray(csr.data); + + BCAST_IDX_CTX_SWITCH( + bcast, use_idx, ufeat->ctx, ubcast_off, ebcast_off, + {CUDA_KERNEL_CALL( + (SpMMCmpCsrHeteroKernel< + Idx, DType, BinaryOp, ReduceOp, UseBcast, UseIdx>), + nblks, nthrs, 0, stream, ufeat_data, efeat_data, out_data, argu_data, + arge_data, static_cast(argu_ntype->data), + static_cast(arge_etype->data), indptr, indices, edge_map, + csr.num_rows, csr.num_cols, ubcast_off, ebcast_off, lhs_len, rhs_len, + len, src_type, etype)}); +} + +} // namespace cuda +} // namespace aten +} // namespace dgl + +#endif // DGL_ARRAY_CUDA_SPMM_CUH_ diff --git a/src/array/cuda/spmm_hetero.cu b/src/array/cuda/spmm_hetero.cu index b5b258df2ef6..6bcd98a898be 100644 --- a/src/array/cuda/spmm_hetero.cu +++ b/src/array/cuda/spmm_hetero.cu @@ -55,7 +55,7 @@ void SpMMCsrHetero( if (m == 0) continue; DType* out = static_cast(device->AllocWorkspace( vec_csr[0].indptr->ctx, m * n * sizeof(DType))); - CUDA_CALL(cudaMemset(out, 0, m * n * sizeof(DType))); + CUDA_CALL(hipMemset(out, 0, m * n * sizeof(DType))); trans_out[ntype] = out; } } @@ -116,7 +116,7 @@ void SpMMCsrHetero( } } - cudaStream_t stream = runtime::getCurrentCUDAStream(); + hipStream_t stream = runtime::getCurrentCUDAStream(); for (dgl_type_t etype = 0; etype < ufeat_ntids.size(); ++etype) { const dgl_type_t src_id = ufeat_ntids[etype]; const dgl_type_t dst_id = out_ntids[etype]; @@ -214,14 +214,14 @@ template void SpMMCsrHetero( const std::vector& ufeat_ntids, const std::vector& out_ntids); #if BF16_ENABLED -template void SpMMCsrHetero( +template void SpMMCsrHetero( const std::string& op, const std::string& reduce, const BcastOff& bcast, const std::vector& csr, const std::vector& ufeat, const std::vector& efeat, std::vector* out, std::vector>* out_aux, const std::vector& ufeat_ntids, const std::vector& out_ntids); -template void SpMMCsrHetero( +template void SpMMCsrHetero( const std::string& op, const std::string& reduce, const BcastOff& bcast, const std::vector& csr, const std::vector& ufeat, const std::vector& efeat, std::vector* out, diff --git a/src/array/cuda/spmm_hetero.cu.prehip b/src/array/cuda/spmm_hetero.cu.prehip new file mode 100644 index 000000000000..b5b258df2ef6 --- /dev/null +++ b/src/array/cuda/spmm_hetero.cu.prehip @@ -0,0 +1,262 @@ +/** + * Copyright (c) 2020 by Contributors + * @file array/cuda/spmm.cu + * @brief SPMM C APIs and definitions. 
+ */ +#include + +#include + +#include "../../runtime/cuda/cuda_common.h" +#include "./functor.cuh" +#include "./ge_spmm.cuh" +#include "./spmm.cuh" + +namespace dgl { + +using namespace cuda; + +namespace aten { + +/** + * @brief CUDA implementation of g-SpMM on Csr format. + * @note use cusparse if the reduce operator is `sum` and there is + * no broadcast, use dgl's kernel in other cases. + */ +template +void SpMMCsrHetero( + const std::string& op, const std::string& reduce, const BcastOff& bcast, + const std::vector& vec_csr, + const std::vector& vec_ufeat, + const std::vector& vec_efeat, std::vector* vec_out, + std::vector>* out_aux, + const std::vector& ufeat_ntids, // ufeat node type id + const std::vector& out_ntids) { // output node type id + bool is_scalar_efeat = + vec_efeat[0].NumElements() == vec_csr[0].indices->shape[0]; + bool use_efeat = op != "copy_lhs"; + auto device = runtime::DeviceAPI::Get(vec_csr[0].indptr->ctx); + std::vector trans_out((*vec_out).size(), NULL); + bool use_deterministic_alg_only = false; + if (NULL != std::getenv("USE_DETERMINISTIC_ALG")) + use_deterministic_alg_only = true; + + bool use_legacy_cusparsemm = + (CUDART_VERSION < 11000) && (reduce == "sum") && + // legacy cuSPARSE does not care about NNZ, hence the argument "false". + ((op == "copy_lhs" && cusparse_available(false)) || + (op == "mul" && is_scalar_efeat && + cusparse_available(false))); + // Create temporary output buffer to store non-transposed output + if (use_legacy_cusparsemm) { + for (dgl_type_t ntype = 0; ntype < (*vec_out).size(); ++ntype) { + const int m = (*vec_out)[ntype]->shape[0]; + const int n = (*vec_out)[ntype]->shape[1]; + if (m == 0) continue; + DType* out = static_cast(device->AllocWorkspace( + vec_csr[0].indptr->ctx, m * n * sizeof(DType))); + CUDA_CALL(cudaMemset(out, 0, m * n * sizeof(DType))); + trans_out[ntype] = out; + } + } + // Check shape of ufeat for all relation type and compute feature size + int64_t x_length = 1; + for (dgl_type_t etype = 0; etype < (ufeat_ntids.size() - 1); ++etype) { + NDArray ufeat = vec_ufeat[ufeat_ntids[etype]]; + NDArray next_ufeat = vec_ufeat[ufeat_ntids[etype + 1]]; + CHECK_EQ(ufeat->ndim, next_ufeat->ndim) + << "Input features have different shapes"; + for (int i = 1; i < ufeat->ndim; ++i) { + if (ufeat->shape[i] != next_ufeat->shape[i]) { + if (ufeat->shape[i] == 1 || next_ufeat->shape[i] == 1) + LOG(FATAL) << "Homogenized message passing on heterogeneous graphs " + "does not support " + << "automatic broadcasting. Please manually broadcast it " + "before calling " + << "message passing functions."; + else + LOG(FATAL) << "Input features have different shapes."; + return; + } + + if (etype == 0) x_length *= ufeat->shape[i]; + } + } + // TODO(Israt): Can python do the following initializations while creating the + // tensors? 
+ if (reduce == "max" || reduce == "min") { + const int64_t dim = bcast.out_len; + std::vector updated((*vec_out).size(), false); + for (dgl_type_t etype = 0; etype < ufeat_ntids.size(); ++etype) { + DType* out_off = (*vec_out)[out_ntids[etype]].Ptr(); + if (reduce == "max") + _Fill( + out_off, vec_csr[etype].num_rows * dim, + cuda::reduce::Max::zero()); + else // min + _Fill( + out_off, vec_csr[etype].num_rows * dim, + cuda::reduce::Min::zero()); + const dgl_type_t dst_id = out_ntids[etype]; + if (!updated[dst_id]) { + updated[dst_id] = true; + if (op == "copy_lhs") { + IdType* argu_ntype = (*out_aux)[2][dst_id].Ptr(); + _Fill( + argu_ntype, vec_csr[etype].num_rows * dim, + static_cast(-1)); + } + if (op == "copy_rhs") { + IdType* arge_etype = (*out_aux)[3][dst_id].Ptr(); + _Fill( + arge_etype, vec_csr[etype].num_rows * dim, + static_cast(-1)); + } + } + } + } + + cudaStream_t stream = runtime::getCurrentCUDAStream(); + for (dgl_type_t etype = 0; etype < ufeat_ntids.size(); ++etype) { + const dgl_type_t src_id = ufeat_ntids[etype]; + const dgl_type_t dst_id = out_ntids[etype]; + CSRMatrix csr = vec_csr[etype]; + if (reduce == "sum") { + bool more_nnz = (csr.indices->shape[0] > csr.num_rows * csr.num_cols); + /* Call SpMM for each relation type */ + if (op == "copy_lhs" && + cusparse_available(more_nnz)) { // cusparse + /* If CUDA is less than 11.0, put the output in trans_out for later + * transposition */ + DType* out = (CUDART_VERSION < 11000) + ? trans_out[dst_id] + : static_cast((*vec_out)[dst_id]->data); + CusparseCsrmm2Hetero( + csr.indptr->ctx, csr, static_cast(vec_ufeat[src_id]->data), + nullptr, out, x_length, stream, use_deterministic_alg_only); + } else if ( + op == "mul" && is_scalar_efeat && + cusparse_available(more_nnz)) { // cusparse + NDArray efeat = vec_efeat[etype]; + if (!IsNullArray(csr.data)) efeat = IndexSelect(efeat, csr.data); + CusparseCsrmm2Hetero( + csr.indptr->ctx, csr, static_cast(vec_ufeat[src_id]->data), + static_cast(efeat->data), + // TODO(Israt): Change (*vec_out) to trans_out to support CUDA + // version < 11 + static_cast((*vec_out)[dst_id]->data), x_length, stream, + use_deterministic_alg_only); + } else { // general kernel + NDArray ufeat = + (vec_ufeat.size() == 0) ? NullArray() : vec_ufeat[src_id]; + NDArray efeat = + (vec_efeat.size() == 0) ? NullArray() : vec_efeat[etype]; + SWITCH_OP(op, Op, { + cuda::SpMMCsr>( + bcast, csr, ufeat, efeat, (*vec_out)[dst_id], NullArray(), + NullArray()); + }); + } + } else if (reduce == "max") { + SWITCH_OP(op, Op, { + NDArray ufeat = + (vec_ufeat.size() == 0) ? NullArray() : vec_ufeat[src_id]; + NDArray efeat = + (vec_efeat.size() == 0) ? NullArray() : vec_efeat[etype]; + cuda::SpMMCmpCsrHetero< + IdType, DType, Op, cuda::reduce::Max>( + bcast, csr, ufeat, efeat, (*vec_out)[dst_id], (*out_aux)[0][dst_id], + (*out_aux)[1][dst_id], (*out_aux)[2][dst_id], (*out_aux)[3][dst_id], + src_id, etype); + }); + } else if (reduce == "min") { + SWITCH_OP(op, Op, { + NDArray ufeat = + (vec_ufeat.size() == 0) ? NullArray() : vec_ufeat[src_id]; + NDArray efeat = + (vec_efeat.size() == 0) ? 
NullArray() : vec_efeat[etype]; + cuda::SpMMCmpCsrHetero< + IdType, DType, Op, cuda::reduce::Min>( + bcast, csr, ufeat, efeat, (*vec_out)[dst_id], (*out_aux)[0][dst_id], + (*out_aux)[1][dst_id], (*out_aux)[2][dst_id], (*out_aux)[3][dst_id], + src_id, etype); + }); + } else { + LOG(FATAL) << "Not implemented"; + } + } + + if (use_legacy_cusparsemm) { + // transpose output + for (dgl_type_t ntype = 0; ntype < (*vec_out).size(); ++ntype) { + const int m = (*vec_out)[ntype]->shape[0]; + const int n = (*vec_out)[ntype]->shape[1]; + if (m == 0) continue; + DType* C_data = static_cast((*vec_out)[ntype]->data); + _Transpose(trans_out[ntype], C_data, n, m); + device->FreeWorkspace(vec_csr[0].indptr->ctx, trans_out[ntype]); + } + } +} + +template void SpMMCsrHetero( + const std::string& op, const std::string& reduce, const BcastOff& bcast, + const std::vector& csr, const std::vector& ufeat, + const std::vector& efeat, std::vector* out, + std::vector>* out_aux, + const std::vector& ufeat_ntids, + const std::vector& out_ntids); +template void SpMMCsrHetero( + const std::string& op, const std::string& reduce, const BcastOff& bcast, + const std::vector& csr, const std::vector& ufeat, + const std::vector& efeat, std::vector* out, + std::vector>* out_aux, + const std::vector& ufeat_ntids, + const std::vector& out_ntids); +#if BF16_ENABLED +template void SpMMCsrHetero( + const std::string& op, const std::string& reduce, const BcastOff& bcast, + const std::vector& csr, const std::vector& ufeat, + const std::vector& efeat, std::vector* out, + std::vector>* out_aux, + const std::vector& ufeat_ntids, + const std::vector& out_ntids); +template void SpMMCsrHetero( + const std::string& op, const std::string& reduce, const BcastOff& bcast, + const std::vector& csr, const std::vector& ufeat, + const std::vector& efeat, std::vector* out, + std::vector>* out_aux, + const std::vector& ufeat_ntids, + const std::vector& out_ntids); +#endif // BF16_ENABLED +template void SpMMCsrHetero( + const std::string& op, const std::string& reduce, const BcastOff& bcast, + const std::vector& csr, const std::vector& ufeat, + const std::vector& efeat, std::vector* out, + std::vector>* out_aux, + const std::vector& ufeat_ntids, + const std::vector& out_ntids); +template void SpMMCsrHetero( + const std::string& op, const std::string& reduce, const BcastOff& bcast, + const std::vector& csr, const std::vector& ufeat, + const std::vector& efeat, std::vector* out, + std::vector>* out_aux, + const std::vector& ufeat_ntids, + const std::vector& out_ntids); +template void SpMMCsrHetero( + const std::string& op, const std::string& reduce, const BcastOff& bcast, + const std::vector& csr, const std::vector& ufeat, + const std::vector& efeat, std::vector* out, + std::vector>* out_aux, + const std::vector& ufeat_ntids, + const std::vector& out_ntids); +template void SpMMCsrHetero( + const std::string& op, const std::string& reduce, const BcastOff& bcast, + const std::vector& csr, const std::vector& ufeat, + const std::vector& efeat, std::vector* out, + std::vector>* out_aux, + const std::vector& ufeat_ntids, + const std::vector& out_ntids); + +} // namespace aten +} // namespace dgl diff --git a/src/array/cuda/utils.cu b/src/array/cuda/utils.cu index 1006e7a70732..48f42a9e081f 100644 --- a/src/array/cuda/utils.cu +++ b/src/array/cuda/utils.cu @@ -4,7 +4,7 @@ * @brief Utilities for CUDA kernels. 
*/ -#include +#include #include "../../runtime/cuda/cuda_common.h" #include "./utils.h" @@ -17,11 +17,11 @@ bool AllTrue(int8_t* flags, int64_t length, const DGLContext& ctx) { int8_t* rst = static_cast(device->AllocWorkspace(ctx, 1)); // Call CUB's reduction size_t workspace_size = 0; - cudaStream_t stream = runtime::getCurrentCUDAStream(); - CUDA_CALL(cub::DeviceReduce::Min( + hipStream_t stream = runtime::getCurrentCUDAStream(); + CUDA_CALL(hipcub::DeviceReduce::Min( nullptr, workspace_size, flags, rst, length, stream)); void* workspace = device->AllocWorkspace(ctx, workspace_size); - CUDA_CALL(cub::DeviceReduce::Min( + CUDA_CALL(hipcub::DeviceReduce::Min( workspace, workspace_size, flags, rst, length, stream)); int8_t cpu_rst = GetCUDAScalar(device, ctx, rst); device->FreeWorkspace(ctx, workspace); diff --git a/src/array/cuda/utils.cu.prehip b/src/array/cuda/utils.cu.prehip new file mode 100644 index 000000000000..1006e7a70732 --- /dev/null +++ b/src/array/cuda/utils.cu.prehip @@ -0,0 +1,33 @@ +/** + * Copyright (c) 2020 by Contributors + * @file array/cuda/utils.cu + * @brief Utilities for CUDA kernels. + */ + +#include + +#include "../../runtime/cuda/cuda_common.h" +#include "./utils.h" + +namespace dgl { +namespace cuda { + +bool AllTrue(int8_t* flags, int64_t length, const DGLContext& ctx) { + auto device = runtime::DeviceAPI::Get(ctx); + int8_t* rst = static_cast(device->AllocWorkspace(ctx, 1)); + // Call CUB's reduction + size_t workspace_size = 0; + cudaStream_t stream = runtime::getCurrentCUDAStream(); + CUDA_CALL(cub::DeviceReduce::Min( + nullptr, workspace_size, flags, rst, length, stream)); + void* workspace = device->AllocWorkspace(ctx, workspace_size); + CUDA_CALL(cub::DeviceReduce::Min( + workspace, workspace_size, flags, rst, length, stream)); + int8_t cpu_rst = GetCUDAScalar(device, ctx, rst); + device->FreeWorkspace(ctx, workspace); + device->FreeWorkspace(ctx, rst); + return cpu_rst == 1; +} + +} // namespace cuda +} // namespace dgl diff --git a/src/array/cuda/utils.h b/src/array/cuda/utils.h index 157bdb295150..502ccaddefc3 100644 --- a/src/array/cuda/utils.h +++ b/src/array/cuda/utils.h @@ -1,3 +1,4 @@ +#include "hip/hip_runtime.h" /** * Copyright (c) 2020 by Contributors * @file array/cuda/utils.h @@ -11,7 +12,7 @@ #include #include -#include +#include #include #include "../../runtime/cuda/cuda_common.h" @@ -126,7 +127,7 @@ __global__ void _FillKernel(DType* ptr, size_t length, DType val) { /** @brief Fill the vector started from ptr of size length with val */ template void _Fill(DType* ptr, size_t length, DType val) { - cudaStream_t stream = runtime::getCurrentCUDAStream(); + hipStream_t stream = runtime::getCurrentCUDAStream(); int nt = FindNumThreads(length); int nb = (length + nt - 1) / nt; // on x-axis, no need to worry about upperbound. @@ -185,8 +186,8 @@ template __global__ void _LinearSearchKernel( const IdType* indptr, const IdType* indices, const IdType* data, const IdType* row, const IdType* col, int64_t row_stride, - int64_t col_stride, int64_t length, const __nv_bfloat16* weights, - __nv_bfloat16 filler, __nv_bfloat16* out) { + int64_t col_stride, int64_t length, const __hip_bfloat16* weights, + __hip_bfloat16 filler, __hip_bfloat16* out) { int tx = blockIdx.x * blockDim.x + threadIdx.x; const int stride_x = gridDim.x * blockDim.x; while (tx < length) { @@ -204,7 +205,7 @@ __global__ void _LinearSearchKernel( } else { // If the result is saved in bf16, it should be fine to convert it to // float first - out[tx] = weights ? 
weights[v] : __nv_bfloat16(static_cast(v)); + out[tx] = weights ? weights[v] : __hip_bfloat16(static_cast(v)); } tx += stride_x; } @@ -277,12 +278,12 @@ template void MaskSelect( runtime::DeviceAPI* device, const DGLContext& ctx, const DType* input, const BoolType* mask, DType* output, int64_t n, int64_t* rst, - cudaStream_t stream) { + hipStream_t stream) { size_t workspace_size = 0; - CUDA_CALL(cub::DeviceSelect::Flagged( + CUDA_CALL(hipcub::DeviceSelect::Flagged( nullptr, workspace_size, input, mask, output, rst, n, stream)); void* workspace = device->AllocWorkspace(ctx, workspace_size); - CUDA_CALL(cub::DeviceSelect::Flagged( + CUDA_CALL(hipcub::DeviceSelect::Flagged( workspace, workspace_size, input, mask, output, rst, n, stream)); device->FreeWorkspace(ctx, workspace); } @@ -290,7 +291,7 @@ void MaskSelect( inline void* GetDevicePointer(runtime::NDArray array) { void* ptr = array->data; if (array.IsPinned()) { - CUDA_CALL(cudaHostGetDevicePointer(&ptr, ptr, 0)); + CUDA_CALL(hipHostGetDevicePointer(&ptr, ptr, 0)); } return ptr; } diff --git a/src/array/cuda/utils.h.prehip b/src/array/cuda/utils.h.prehip new file mode 100644 index 000000000000..157bdb295150 --- /dev/null +++ b/src/array/cuda/utils.h.prehip @@ -0,0 +1,301 @@ +/** + * Copyright (c) 2020 by Contributors + * @file array/cuda/utils.h + * @brief Utilities for CUDA kernels. + */ +#ifndef DGL_ARRAY_CUDA_UTILS_H_ +#define DGL_ARRAY_CUDA_UTILS_H_ + +#include +#include +#include +#include + +#include +#include + +#include "../../runtime/cuda/cuda_common.h" + +namespace dgl { +namespace cuda { + +#define CUDA_MAX_NUM_BLOCKS_X 0x7FFFFFFF +#define CUDA_MAX_NUM_BLOCKS_Y 0xFFFF +#define CUDA_MAX_NUM_BLOCKS_Z 0xFFFF +// The max number of threads per block +#define CUDA_MAX_NUM_THREADS 256 + +/** @brief Calculate the number of threads needed given the dimension length. + * + * It finds the biggest number that is smaller than min(dim, max_nthrs) + * and is also power of two. + */ +inline int FindNumThreads(int dim, int max_nthrs = CUDA_MAX_NUM_THREADS) { + CHECK_GE(dim, 0); + if (dim == 0) return 1; + int ret = max_nthrs; + while (ret > dim) { + ret = ret >> 1; + } + return ret; +} + +template +int _NumberOfBits(const T& range) { + if (range <= 1) { + // ranges of 0 or 1 require no bits to store + return 0; + } + + int bits = 1; + const auto urange = static_cast>(range); + while (bits < static_cast(sizeof(T) * 8) && (1ull << bits) < urange) { + ++bits; + } + + if (bits < static_cast(sizeof(T) * 8)) { + CHECK_EQ((range - 1) >> bits, 0); + } + CHECK_NE((range - 1) >> (bits - 1), 0); + + return bits; +} + +/** + * @brief Find number of blocks is smaller than nblks and max_nblks + * on the given axis ('x', 'y' or 'z'). + */ +template +inline int FindNumBlocks(int nblks, int max_nblks = -1) { + int default_max_nblks = -1; + switch (axis) { + case 'x': + default_max_nblks = CUDA_MAX_NUM_BLOCKS_X; + break; + case 'y': + default_max_nblks = CUDA_MAX_NUM_BLOCKS_Y; + break; + case 'z': + default_max_nblks = CUDA_MAX_NUM_BLOCKS_Z; + break; + default: + LOG(FATAL) << "Axis " << axis << " not recognized"; + break; + } + if (max_nblks == -1) max_nblks = default_max_nblks; + CHECK_NE(nblks, 0); + if (nblks < max_nblks) return nblks; + return max_nblks; +} + +template +__device__ __forceinline__ T _ldg(T* addr) { +#if __CUDA_ARCH__ >= 350 + return __ldg(addr); +#else + return *addr; +#endif +} + +/** + * @brief Return true if the given bool flag array is all true. + * The input bool array is in int8_t type so it is aligned with byte address. 
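AllTrue above follows the usual two-phase CUB/hipCUB pattern: a first call with a null workspace pointer only reports the required temporary-storage size, and a second call performs the reduction. A hedged sketch of that pattern with plain hipMalloc in place of DGL's AllocWorkspace (MinFlag is an illustrative name, and error checking is omitted):

#include <hip/hip_runtime.h>
#include <hipcub/hipcub.hpp>
#include <cstdint>

int8_t MinFlag(const int8_t* d_flags, int64_t length, hipStream_t stream) {
  int8_t* d_result = nullptr;
  hipMalloc(&d_result, sizeof(int8_t));

  // Pass 1: query workspace size (temporary storage pointer is null).
  size_t workspace_bytes = 0;
  hipcub::DeviceReduce::Min(nullptr, workspace_bytes, d_flags, d_result,
                            static_cast<int>(length), stream);

  // Pass 2: run the reduction with the allocated workspace.
  void* d_workspace = nullptr;
  hipMalloc(&d_workspace, workspace_bytes);
  hipcub::DeviceReduce::Min(d_workspace, workspace_bytes, d_flags, d_result,
                            static_cast<int>(length), stream);

  int8_t host_result = 0;
  hipMemcpyAsync(&host_result, d_result, sizeof(int8_t),
                 hipMemcpyDeviceToHost, stream);
  hipStreamSynchronize(stream);
  hipFree(d_workspace);
  hipFree(d_result);
  return host_result;  // 1 means every flag was set
}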
+ * + * @param flags The bool array. + * @param length The length. + * @param ctx Device context. + * @return True if all the flags are true. + */ +bool AllTrue(int8_t* flags, int64_t length, const DGLContext& ctx); + +/** + * @brief CUDA Kernel of filling the vector started from ptr of size length + * with val. + * @note internal use only. + */ +template +__global__ void _FillKernel(DType* ptr, size_t length, DType val) { + int tx = blockIdx.x * blockDim.x + threadIdx.x; + int stride_x = gridDim.x * blockDim.x; + while (tx < length) { + ptr[tx] = val; + tx += stride_x; + } +} + +/** @brief Fill the vector started from ptr of size length with val */ +template +void _Fill(DType* ptr, size_t length, DType val) { + cudaStream_t stream = runtime::getCurrentCUDAStream(); + int nt = FindNumThreads(length); + int nb = + (length + nt - 1) / nt; // on x-axis, no need to worry about upperbound. + CUDA_KERNEL_CALL(cuda::_FillKernel, nb, nt, 0, stream, ptr, length, val); +} + +/** + * @brief Search adjacency list linearly for each (row, col) pair and + * write the data under the matched position in the indices array to the output. + * + * If there is no match, the value in \c filler is written. + * If there are multiple matches, only the first match is written. + * If the given data array is null, write the matched position to the output. + */ +template +__global__ void _LinearSearchKernel( + const IdType* indptr, const IdType* indices, const IdType* data, + const IdType* row, const IdType* col, int64_t row_stride, + int64_t col_stride, int64_t length, const DType* weights, DType filler, + DType* out) { + int tx = blockIdx.x * blockDim.x + threadIdx.x; + const int stride_x = gridDim.x * blockDim.x; + while (tx < length) { + int rpos = tx * row_stride, cpos = tx * col_stride; + IdType v = -1; + const IdType r = row[rpos], c = col[cpos]; + for (IdType i = indptr[r]; i < indptr[r + 1]; ++i) { + if (indices[i] == c) { + v = data ? data[i] : i; + break; + } + } + if (v == -1) { + out[tx] = filler; + } else { + // The casts here are to be able to handle DType being __half. + // GCC treats int64_t as a distinct type from long long, so + // without the explcit cast to long long, it errors out saying + // that the implicit cast results in an ambiguous choice of + // constructor for __half. + // The using statement is to avoid a linter error about using + // long or long long. + using LongLong = long long; // NOLINT + out[tx] = weights ? weights[v] : DType(LongLong(v)); + } + tx += stride_x; + } +} + +#if BF16_ENABLED +/** + * @brief Specialization for bf16 because conversion from long long to bfloat16 + * doesn't exist before SM80. + */ +template +__global__ void _LinearSearchKernel( + const IdType* indptr, const IdType* indices, const IdType* data, + const IdType* row, const IdType* col, int64_t row_stride, + int64_t col_stride, int64_t length, const __nv_bfloat16* weights, + __nv_bfloat16 filler, __nv_bfloat16* out) { + int tx = blockIdx.x * blockDim.x + threadIdx.x; + const int stride_x = gridDim.x * blockDim.x; + while (tx < length) { + int rpos = tx * row_stride, cpos = tx * col_stride; + IdType v = -1; + const IdType r = row[rpos], c = col[cpos]; + for (IdType i = indptr[r]; i < indptr[r + 1]; ++i) { + if (indices[i] == c) { + v = data ? data[i] : i; + break; + } + } + if (v == -1) { + out[tx] = filler; + } else { + // If the result is saved in bf16, it should be fine to convert it to + // float first + out[tx] = weights ? 
weights[v] : __nv_bfloat16(static_cast(v)); + } + tx += stride_x; + } +} +#endif // BF16_ENABLED + +template +inline DType GetCUDAScalar( + runtime::DeviceAPI* device_api, DGLContext ctx, const DType* cuda_ptr) { + DType result; + device_api->CopyDataFromTo( + cuda_ptr, 0, &result, 0, sizeof(result), ctx, DGLContext{kDGLCPU, 0}, + DGLDataTypeTraits::dtype); + return result; +} + +/** + * @brief Given a sorted array and a value this function returns the index + * of the first element which compares greater than value. + * + * This function assumes 0-based index + * @param A: ascending sorted array + * @param n: size of the A + * @param x: value to search in A + * @return index, i, of the first element st. A[i]>x. If x>=A[n-1] returns n. + * if x +__device__ IdType _UpperBound(const IdType* A, int64_t n, IdType x) { + IdType l = 0, r = n, m = 0; + while (l < r) { + m = l + (r - l) / 2; + if (x >= A[m]) { + l = m + 1; + } else { + r = m; + } + } + return l; +} + +/** + * @brief Given a sorted array and a value this function returns the index + * of the element who is equal to val. If not exist returns n+1 + * + * This function assumes 0-based index + * @param A: ascending sorted array + * @param n: size of the A + * @param x: value to search in A + * @return index, i, st. A[i]==x. If such an index not exists returns 'n'. + */ +template +__device__ IdType _BinarySearch(const IdType* A, int64_t n, IdType x) { + IdType l = 0, r = n - 1, m = 0; + while (l <= r) { + m = l + (r - l) / 2; + if (A[m] == x) { + return m; + } + if (A[m] < x) { + l = m + 1; + } else { + r = m - 1; + } + } + return n; // not found +} + +template +void MaskSelect( + runtime::DeviceAPI* device, const DGLContext& ctx, const DType* input, + const BoolType* mask, DType* output, int64_t n, int64_t* rst, + cudaStream_t stream) { + size_t workspace_size = 0; + CUDA_CALL(cub::DeviceSelect::Flagged( + nullptr, workspace_size, input, mask, output, rst, n, stream)); + void* workspace = device->AllocWorkspace(ctx, workspace_size); + CUDA_CALL(cub::DeviceSelect::Flagged( + workspace, workspace_size, input, mask, output, rst, n, stream)); + device->FreeWorkspace(ctx, workspace); +} + +inline void* GetDevicePointer(runtime::NDArray array) { + void* ptr = array->data; + if (array.IsPinned()) { + CUDA_CALL(cudaHostGetDevicePointer(&ptr, ptr, 0)); + } + return ptr; +} + +} // namespace cuda +} // namespace dgl + +#endif // DGL_ARRAY_CUDA_UTILS_H_ diff --git a/src/array/cuda/uvm/array_index_select_uvm.cu b/src/array/cuda/uvm/array_index_select_uvm.cu index df067f259f24..7e8bb13fb370 100644 --- a/src/array/cuda/uvm/array_index_select_uvm.cu +++ b/src/array/cuda/uvm/array_index_select_uvm.cu @@ -17,7 +17,7 @@ namespace impl { template NDArray IndexSelectCPUFromGPU(NDArray array, IdArray index) { - cudaStream_t stream = runtime::getCurrentCUDAStream(); + hipStream_t stream = runtime::getCurrentCUDAStream(); const int64_t arr_len = array->shape[0]; const int64_t len = index->shape[0]; int64_t num_feat = 1; @@ -78,7 +78,7 @@ template NDArray IndexSelectCPUFromGPU(NDArray, IdArray); template void IndexScatterGPUToCPU(NDArray dest, IdArray index, NDArray source) { - cudaStream_t stream = runtime::getCurrentCUDAStream(); + hipStream_t stream = runtime::getCurrentCUDAStream(); const DType* source_data = static_cast(source->data); const IdType* idx_data = static_cast(index->data); const int64_t arr_len = dest->shape[0]; diff --git a/src/array/cuda/uvm/array_index_select_uvm.cu.prehip b/src/array/cuda/uvm/array_index_select_uvm.cu.prehip new file mode 
100644 index 000000000000..df067f259f24 --- /dev/null +++ b/src/array/cuda/uvm/array_index_select_uvm.cu.prehip @@ -0,0 +1,131 @@ +/** + * Copyright (c) 2019-2022 by Contributors + * @file array/cuda/uvm/array_index_select_uvm.cu + * @brief Array index select GPU implementation + */ +#include + +#include "../../../runtime/cuda/cuda_common.h" +#include "../array_index_select.cuh" +#include "../utils.h" +#include "./array_index_select_uvm.cuh" + +namespace dgl { +using runtime::NDArray; +namespace aten { +namespace impl { + +template +NDArray IndexSelectCPUFromGPU(NDArray array, IdArray index) { + cudaStream_t stream = runtime::getCurrentCUDAStream(); + const int64_t arr_len = array->shape[0]; + const int64_t len = index->shape[0]; + int64_t num_feat = 1; + std::vector shape{len}; + + CHECK(array.IsPinned()); + const DType* array_data = static_cast(cuda::GetDevicePointer(array)); + CHECK_EQ(index->ctx.device_type, kDGLCUDA); + + for (int d = 1; d < array->ndim; ++d) { + num_feat *= array->shape[d]; + shape.emplace_back(array->shape[d]); + } + + NDArray ret = NDArray::Empty(shape, array->dtype, index->ctx); + if (len == 0 || arr_len * num_feat == 0) return ret; + DType* ret_data = static_cast(ret->data); + + auto res = Sort(index, cuda::_NumberOfBits(arr_len)); + const IdType* idx_data = static_cast(res.first->data); + const int64_t* perm_data = static_cast(res.second->data); + + if (num_feat == 1) { + const int nt = cuda::FindNumThreads(len); + const int nb = (len + nt - 1) / nt; + CUDA_KERNEL_CALL( + IndexSelectSingleKernel, nb, nt, 0, stream, array_data, idx_data, len, + arr_len, ret_data, perm_data); + } else { + dim3 block(256, 1); + while (static_cast(block.x) >= 2 * num_feat) { + block.x /= 2; + block.y *= 2; + } + const dim3 grid((len + block.y - 1) / block.y); + if (num_feat * sizeof(DType) < 2 * CACHE_LINE_SIZE) { + CUDA_KERNEL_CALL( + IndexSelectMultiKernel, grid, block, 0, stream, array_data, num_feat, + idx_data, len, arr_len, ret_data, perm_data); + } else { + CUDA_KERNEL_CALL( + IndexSelectMultiKernelAligned, grid, block, 0, stream, array_data, + num_feat, idx_data, len, arr_len, ret_data, perm_data); + } + } + return ret; +} + +// floating point types are treated as their equal width integer types +template NDArray IndexSelectCPUFromGPU(NDArray, IdArray); +template NDArray IndexSelectCPUFromGPU(NDArray, IdArray); +template NDArray IndexSelectCPUFromGPU(NDArray, IdArray); +template NDArray IndexSelectCPUFromGPU(NDArray, IdArray); +template NDArray IndexSelectCPUFromGPU(NDArray, IdArray); +template NDArray IndexSelectCPUFromGPU(NDArray, IdArray); +template NDArray IndexSelectCPUFromGPU(NDArray, IdArray); +template NDArray IndexSelectCPUFromGPU(NDArray, IdArray); + +template +void IndexScatterGPUToCPU(NDArray dest, IdArray index, NDArray source) { + cudaStream_t stream = runtime::getCurrentCUDAStream(); + const DType* source_data = static_cast(source->data); + const IdType* idx_data = static_cast(index->data); + const int64_t arr_len = dest->shape[0]; + const int64_t len = index->shape[0]; + int64_t num_feat = 1; + std::vector shape{len}; + + CHECK(dest.IsPinned()); + DType* dest_data = static_cast(cuda::GetDevicePointer(dest)); + CHECK_EQ(index->ctx.device_type, kDGLCUDA); + CHECK_EQ(source->ctx.device_type, kDGLCUDA); + + for (int d = 1; d < source->ndim; ++d) { + num_feat *= source->shape[d]; + } + + if (len == 0) return; + + if (num_feat == 1) { + const int nt = cuda::FindNumThreads(len); + const int nb = (len + nt - 1) / nt; + CUDA_KERNEL_CALL( + IndexScatterSingleKernel, 
nb, nt, 0, stream, source_data, idx_data, len, + arr_len, dest_data); + } else { + dim3 block(256, 1); + while (static_cast(block.x) >= 2 * num_feat) { + block.x /= 2; + block.y *= 2; + } + const dim3 grid((len + block.y - 1) / block.y); + CUDA_KERNEL_CALL( + IndexScatterMultiKernel, grid, block, 0, stream, source_data, num_feat, + idx_data, len, arr_len, dest_data); + } +} + +// floating point types are treated as their equal width integer types +template void IndexScatterGPUToCPU(NDArray, IdArray, NDArray); +template void IndexScatterGPUToCPU(NDArray, IdArray, NDArray); +template void IndexScatterGPUToCPU(NDArray, IdArray, NDArray); +template void IndexScatterGPUToCPU(NDArray, IdArray, NDArray); +template void IndexScatterGPUToCPU(NDArray, IdArray, NDArray); +template void IndexScatterGPUToCPU(NDArray, IdArray, NDArray); +template void IndexScatterGPUToCPU(NDArray, IdArray, NDArray); +template void IndexScatterGPUToCPU(NDArray, IdArray, NDArray); + +} // namespace impl +} // namespace aten +} // namespace dgl diff --git a/src/array/cuda/uvm/array_index_select_uvm.cuh b/src/array/cuda/uvm/array_index_select_uvm.cuh index 5a7d222e55ce..1ab7f05ba1db 100644 --- a/src/array/cuda/uvm/array_index_select_uvm.cuh +++ b/src/array/cuda/uvm/array_index_select_uvm.cuh @@ -1,3 +1,4 @@ +#include "hip/hip_runtime.h" /** * Copyright (c) 2021 by Contributors * @file array/cpu/array_index_select_uvm.cuh diff --git a/src/array/cuda/uvm/array_index_select_uvm.cuh.prehip b/src/array/cuda/uvm/array_index_select_uvm.cuh.prehip new file mode 100644 index 000000000000..5a7d222e55ce --- /dev/null +++ b/src/array/cuda/uvm/array_index_select_uvm.cuh.prehip @@ -0,0 +1,52 @@ +/** + * Copyright (c) 2021 by Contributors + * @file array/cpu/array_index_select_uvm.cuh + * @brief Array index select GPU kernel implementation + */ + +#ifndef DGL_ARRAY_CUDA_UVM_ARRAY_INDEX_SELECT_UVM_CUH_ +#define DGL_ARRAY_CUDA_UVM_ARRAY_INDEX_SELECT_UVM_CUH_ + +#define CACHE_LINE_SIZE 128 + +namespace dgl { +namespace aten { +namespace impl { + +/** + * This is a cross-device access version of IndexSelectMultiKernel. + * Since the memory access over PCIe is more sensitive to the + * data access aligment (cacheline), we need a separate version here. + */ +template +__global__ void IndexSelectMultiKernelAligned( + const DType* const array, const int64_t num_feat, const IdType* const index, + const int64_t length, const int64_t arr_len, DType* const out, + const int64_t* perm = nullptr) { + int64_t out_row_index = blockIdx.x * blockDim.y + threadIdx.y; + + const int64_t stride = blockDim.y * gridDim.x; + + while (out_row_index < length) { + int64_t col = threadIdx.x; + const int64_t in_row = index[out_row_index]; + assert(in_row >= 0 && in_row < arr_len); + const int64_t idx_offset = + ((uint64_t)(&array[in_row * num_feat]) % CACHE_LINE_SIZE) / + sizeof(DType); + col = col - idx_offset; + const auto out_row = perm ? perm[out_row_index] : out_row_index; + while (col < num_feat) { + if (col >= 0) + out[out_row * num_feat + col] = array[in_row * num_feat + col]; + col += blockDim.x; + } + out_row_index += stride; + } +} + +} // namespace impl +} // namespace aten +} // namespace dgl + +#endif // DGL_ARRAY_CUDA_UVM_ARRAY_INDEX_SELECT_UVM_CUH_ diff --git a/src/array/filter.cc b/src/array/filter.cc index 658816d60199..01306e0d2458 100644 --- a/src/array/filter.cc +++ b/src/array/filter.cc @@ -24,7 +24,7 @@ DGL_REGISTER_GLOBAL("utils.filter._CAPI_DGLFilterCreateFromSet") auto ctx = array->ctx; // TODO(nv-dlasalle): Implement CPU version. 
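IndexSelectCPUFromGPU and IndexScatterGPUToCPU work because the host-side NDArray is pinned, so kernels can read or write it through a device alias of the host pointer. A hedged sketch of that zero-copy setup with raw HIP calls (PinAndMapHostBuffer is an illustrative helper, not DGL API; DGL goes through NDArray pinning and cuda::GetDevicePointer, and error checking is omitted here):

#include <hip/hip_runtime.h>
#include <cstddef>

// Register (pin) an existing host buffer and obtain a device-visible alias
// that kernels launched on the GPU may dereference directly over PCIe.
float* PinAndMapHostBuffer(float* host_ptr, size_t num_elems) {
  hipHostRegister(host_ptr, num_elems * sizeof(float), hipHostRegisterMapped);
  void* device_alias = nullptr;
  hipHostGetDevicePointer(&device_alias, host_ptr, /*flags=*/0);
  return static_cast<float*>(device_alias);
}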
if (ctx.device_type == kDGLCUDA) { -#ifdef DGL_USE_CUDA +#ifdef DGL_USE_ROCM ATEN_ID_TYPE_SWITCH(array->dtype, IdType, { *rv = CreateSetFilter(array); }); diff --git a/src/array/filter.cc.prehip b/src/array/filter.cc.prehip new file mode 100644 index 000000000000..658816d60199 --- /dev/null +++ b/src/array/filter.cc.prehip @@ -0,0 +1,54 @@ +/** + * Copyright (c) 2021 by Contributors + * @file array/filter.cc + * @brief Object for selecting items in a set, or selecting items not in a set. + */ + +#include "./filter.h" + +#include +#include +#include + +namespace dgl { +namespace array { + +using namespace dgl::runtime; + +template +FilterRef CreateSetFilter(IdArray set); + +DGL_REGISTER_GLOBAL("utils.filter._CAPI_DGLFilterCreateFromSet") + .set_body([](DGLArgs args, DGLRetValue* rv) { + IdArray array = args[0]; + auto ctx = array->ctx; + // TODO(nv-dlasalle): Implement CPU version. + if (ctx.device_type == kDGLCUDA) { +#ifdef DGL_USE_CUDA + ATEN_ID_TYPE_SWITCH(array->dtype, IdType, { + *rv = CreateSetFilter(array); + }); +#else + LOG(FATAL) << "GPU support not compiled."; +#endif + } else { + LOG(FATAL) << "CPU support not yet implemented."; + } + }); + +DGL_REGISTER_GLOBAL("utils.filter._CAPI_DGLFilterFindIncludedIndices") + .set_body([](DGLArgs args, DGLRetValue* rv) { + FilterRef filter = args[0]; + IdArray array = args[1]; + *rv = filter->find_included_indices(array); + }); + +DGL_REGISTER_GLOBAL("utils.filter._CAPI_DGLFilterFindExcludedIndices") + .set_body([](DGLArgs args, DGLRetValue* rv) { + FilterRef filter = args[0]; + IdArray array = args[1]; + *rv = filter->find_excluded_indices(array); + }); + +} // namespace array +} // namespace dgl diff --git a/src/array/selector.h b/src/array/selector.h index 1257cf18be31..b72c66a0f697 100644 --- a/src/array/selector.h +++ b/src/array/selector.h @@ -12,13 +12,13 @@ namespace dgl { namespace { -#ifdef __CUDACC__ +#ifdef __HIPCC__ #define DGLDEVICE __device__ #define DGLINLINE __forceinline__ #else #define DGLDEVICE #define DGLINLINE inline -#endif // __CUDACC__ +#endif // __HIPCC__ } // namespace diff --git a/src/array/selector.h.prehip b/src/array/selector.h.prehip new file mode 100644 index 000000000000..1257cf18be31 --- /dev/null +++ b/src/array/selector.h.prehip @@ -0,0 +1,59 @@ +/** + * Copyright (c) 2020 by Contributors + * @file array/selector.h + * @brief Selector functions to select among src/edge/dst attributes. + */ +#ifndef DGL_ARRAY_SELECTOR_H_ +#define DGL_ARRAY_SELECTOR_H_ + +#include + +namespace dgl { + +namespace { + +#ifdef __CUDACC__ +#define DGLDEVICE __device__ +#define DGLINLINE __forceinline__ +#else +#define DGLDEVICE +#define DGLINLINE inline +#endif // __CUDACC__ + +} // namespace + +/** + * @brief Select among src/edge/dst feature/idx. + * @note the integer argument target specifies which target + * to choose, 0: src, 1: edge, 2: dst. 
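selector.h now keys its device qualifiers on __HIPCC__ instead of __CUDACC__ so the same header works for both host-only and hipcc compilation. A small sketch of that guard pattern with shortened, illustrative macro and function names:

// Minimal host/device qualifier guard, mirroring DGLDEVICE / DGLINLINE.
#if defined(__HIPCC__) || defined(__CUDACC__)
#define DEMO_DEVICE __device__
#define DEMO_INLINE __forceinline__
#else
#define DEMO_DEVICE
#define DEMO_INLINE inline
#endif

template <int target>
DEMO_DEVICE DEMO_INLINE int PickDemo(int src, int edge, int dst) {
  // 0: src, 1: edge, 2: dst (same convention as Selector<target>).
  return target == 0 ? src : (target == 1 ? edge : dst);
}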
+ */ +template +struct Selector { + template + static DGLDEVICE DGLINLINE T Call(T src, T edge, T dst) { + LOG(INFO) << "Target " << target << " not recognized."; + return src; + } +}; + +template <> +template +DGLDEVICE DGLINLINE T Selector<0>::Call(T src, T edge, T dst) { + return src; +} + +template <> +template +DGLDEVICE DGLINLINE T Selector<1>::Call(T src, T edge, T dst) { + return edge; +} + +template <> +template +DGLDEVICE DGLINLINE T Selector<2>::Call(T src, T edge, T dst) { + return dst; +} + +} // namespace dgl + +#endif // DGL_ARRAY_SELECTOR_H_ diff --git a/src/array/uvm_array.cc b/src/array/uvm_array.cc index 671c4f262520..e29d9f192b56 100644 --- a/src/array/uvm_array.cc +++ b/src/array/uvm_array.cc @@ -16,7 +16,7 @@ namespace dgl { namespace aten { NDArray IndexSelectCPUFromGPU(NDArray array, IdArray index) { -#ifdef DGL_USE_CUDA +#ifdef DGL_USE_ROCM CHECK(array.IsPinned()) << "Input array must be in pinned memory."; CHECK_EQ(index->ctx.device_type, kDGLCUDA) << "Index must be on the GPU."; CHECK_GE(array->ndim, 1) << "Input array must have at least 1 dimension."; @@ -34,7 +34,7 @@ NDArray IndexSelectCPUFromGPU(NDArray array, IdArray index) { } void IndexScatterGPUToCPU(NDArray dest, IdArray index, NDArray source) { -#ifdef DGL_USE_CUDA +#ifdef DGL_USE_ROCM CHECK(dest.IsPinned()) << "Destination array must be in pinned memory."; CHECK_EQ(index->ctx.device_type, kDGLCUDA) << "Index must be on the GPU."; CHECK_EQ(source->ctx.device_type, kDGLCUDA) diff --git a/src/array/uvm_array.cc.prehip b/src/array/uvm_array.cc.prehip new file mode 100644 index 000000000000..671c4f262520 --- /dev/null +++ b/src/array/uvm_array.cc.prehip @@ -0,0 +1,74 @@ +/** + * Copyright (c) 2019-2022 by Contributors + * @file array/uvm_array.cc + * @brief DGL array utilities implementation + */ +#include + +#include + +#include "../c_api_common.h" +#include "./uvm_array_op.h" + +using namespace dgl::runtime; + +namespace dgl { +namespace aten { + +NDArray IndexSelectCPUFromGPU(NDArray array, IdArray index) { +#ifdef DGL_USE_CUDA + CHECK(array.IsPinned()) << "Input array must be in pinned memory."; + CHECK_EQ(index->ctx.device_type, kDGLCUDA) << "Index must be on the GPU."; + CHECK_GE(array->ndim, 1) << "Input array must have at least 1 dimension."; + CHECK_EQ(index->ndim, 1) << "Index must be a 1D array."; + + ATEN_DTYPE_BITS_ONLY_SWITCH(array->dtype, DType, "values", { + ATEN_ID_TYPE_SWITCH(index->dtype, IdType, { + return impl::IndexSelectCPUFromGPU(array, index); + }); + }); +#endif + LOG(FATAL) << "IndexSelectCPUFromGPU requires CUDA."; + // Should be unreachable + return NDArray{}; +} + +void IndexScatterGPUToCPU(NDArray dest, IdArray index, NDArray source) { +#ifdef DGL_USE_CUDA + CHECK(dest.IsPinned()) << "Destination array must be in pinned memory."; + CHECK_EQ(index->ctx.device_type, kDGLCUDA) << "Index must be on the GPU."; + CHECK_EQ(source->ctx.device_type, kDGLCUDA) + << "Source array must be on the GPU."; + CHECK_EQ(dest->dtype, source->dtype) << "Destination array and source " + "array must have the same dtype."; + CHECK_GE(dest->ndim, 1) + << "Destination array must have at least 1 dimension."; + CHECK_EQ(index->ndim, 1) << "Index must be a 1D array."; + + ATEN_DTYPE_BITS_ONLY_SWITCH(source->dtype, DType, "values", { + ATEN_ID_TYPE_SWITCH(index->dtype, IdType, { + impl::IndexScatterGPUToCPU(dest, index, source); + }); + }); +#else + LOG(FATAL) << "IndexScatterGPUToCPU requires CUDA."; +#endif +} + +DGL_REGISTER_GLOBAL("ndarray.uvm._CAPI_DGLIndexSelectCPUFromGPU") + .set_body([](DGLArgs 
args, DGLRetValue* rv) { + NDArray array = args[0]; + IdArray index = args[1]; + *rv = IndexSelectCPUFromGPU(array, index); + }); + +DGL_REGISTER_GLOBAL("ndarray.uvm._CAPI_DGLIndexScatterGPUToCPU") + .set_body([](DGLArgs args, DGLRetValue* rv) { + NDArray dest = args[0]; + IdArray index = args[1]; + NDArray source = args[2]; + IndexScatterGPUToCPU(dest, index, source); + }); + +} // namespace aten +} // namespace dgl diff --git a/src/geometry/cuda/edge_coarsening_impl.cu b/src/geometry/cuda/edge_coarsening_impl.cu index 6907410c6477..1d1630444d30 100644 --- a/src/geometry/cuda/edge_coarsening_impl.cu +++ b/src/geometry/cuda/edge_coarsening_impl.cu @@ -1,9 +1,10 @@ +#include "hip/hip_runtime.h" /** * Copyright (c) 2019 by Contributors * @file geometry/cuda/edge_coarsening_impl.cu * @brief Edge coarsening CUDA implementation */ -#include +#include #include #include #include @@ -32,9 +33,9 @@ __global__ void generate_uniform_kernel( float *ret_values, size_t num, uint64_t seed) { size_t id = blockIdx.x * blockDim.x + threadIdx.x; if (id < num) { - curandState state; - curand_init(seed, id, 0, &state); - ret_values[id] = curand_uniform(&state); + hiprandState state; + hiprand_init(seed, id, 0, &state); + ret_values[id] = hiprand_uniform(&state); } } @@ -116,7 +117,7 @@ __global__ void weighted_respond_kernel( template bool Colorize(IdType *result_data, int64_t num_nodes, float *const prop) { // initial done signal - cudaStream_t stream = runtime::getCurrentCUDAStream(); + hipStream_t stream = runtime::getCurrentCUDAStream(); CUDA_KERNEL_CALL(init_done_kernel, 1, 1, 0, stream); // generate color prop for each node @@ -132,8 +133,8 @@ bool Colorize(IdType *result_data, int64_t num_nodes, float *const prop) { colorize_kernel, num_blocks, num_threads, 0, stream, prop, num_nodes, result_data); bool done_h = false; - CUDA_CALL(cudaMemcpyFromSymbol( - &done_h, done_d, sizeof(done_h), 0, cudaMemcpyDeviceToHost)); + CUDA_CALL(hipMemcpyFromSymbol( + &done_h, HIP_SYMBOL(done_d), sizeof(done_h), 0, hipMemcpyDeviceToHost)); return done_h; } @@ -155,7 +156,7 @@ bool Colorize(IdType *result_data, int64_t num_nodes, float *const prop) { template void WeightedNeighborMatching( const aten::CSRMatrix &csr, const NDArray weight, IdArray result) { - cudaStream_t stream = runtime::getCurrentCUDAStream(); + hipStream_t stream = runtime::getCurrentCUDAStream(); const auto &ctx = result->ctx; auto device = runtime::DeviceAPI::Get(ctx); device->SetDevice(ctx); @@ -216,7 +217,7 @@ void NeighborMatching(const aten::CSRMatrix &csr, IdArray result) { device->SetDevice(ctx); // generate random weights - cudaStream_t stream = runtime::getCurrentCUDAStream(); + hipStream_t stream = runtime::getCurrentCUDAStream(); NDArray weight = NDArray::Empty( {num_edges}, DGLDataType{kDGLFloat, sizeof(float) * 8, 1}, ctx); float *weight_data = static_cast(weight->data); diff --git a/src/geometry/cuda/edge_coarsening_impl.cu.prehip b/src/geometry/cuda/edge_coarsening_impl.cu.prehip new file mode 100644 index 000000000000..6907410c6477 --- /dev/null +++ b/src/geometry/cuda/edge_coarsening_impl.cu.prehip @@ -0,0 +1,239 @@ +/** + * Copyright (c) 2019 by Contributors + * @file geometry/cuda/edge_coarsening_impl.cu + * @brief Edge coarsening CUDA implementation + */ +#include +#include +#include +#include + +#include + +#include "../../array/cuda/utils.h" +#include "../../runtime/cuda/cuda_common.h" +#include "../geometry_op.h" + +#define BLOCKS(N, T) (N + T - 1) / T + +namespace dgl { +namespace geometry { +namespace impl { + +constexpr float 
BLUE_P = 0.53406; +constexpr int BLUE = -1; +constexpr int RED = -2; +constexpr int EMPTY_IDX = -1; + +__device__ bool done_d; +__global__ void init_done_kernel() { done_d = true; } + +__global__ void generate_uniform_kernel( + float *ret_values, size_t num, uint64_t seed) { + size_t id = blockIdx.x * blockDim.x + threadIdx.x; + if (id < num) { + curandState state; + curand_init(seed, id, 0, &state); + ret_values[id] = curand_uniform(&state); + } +} + +template +__global__ void colorize_kernel( + const float *prop, int64_t num_elem, IdType *result) { + const IdType idx = blockIdx.x * blockDim.x + threadIdx.x; + if (idx < num_elem) { + if (result[idx] < 0) { // if unmatched + result[idx] = (prop[idx] > BLUE_P) ? RED : BLUE; + done_d = false; + } + } +} + +template +__global__ void weighted_propose_kernel( + const IdType *indptr, const IdType *indices, const FloatType *weights, + int64_t num_elem, IdType *proposal, IdType *result) { + const IdType idx = blockIdx.x * blockDim.x + threadIdx.x; + if (idx < num_elem) { + if (result[idx] != BLUE) return; + + bool has_unmatched_neighbor = false; + FloatType weight_max = 0.; + IdType v_max = EMPTY_IDX; + + for (IdType i = indptr[idx]; i < indptr[idx + 1]; ++i) { + auto v = indices[i]; + + if (result[v] < 0) has_unmatched_neighbor = true; + if (result[v] == RED && weights[i] >= weight_max) { + v_max = v; + weight_max = weights[i]; + } + } + + proposal[idx] = v_max; + if (!has_unmatched_neighbor) result[idx] = idx; + } +} + +template +__global__ void weighted_respond_kernel( + const IdType *indptr, const IdType *indices, const FloatType *weights, + int64_t num_elem, IdType *proposal, IdType *result) { + const IdType idx = blockIdx.x * blockDim.x + threadIdx.x; + if (idx < num_elem) { + if (result[idx] != RED) return; + + bool has_unmatched_neighbors = false; + IdType v_max = -1; + FloatType weight_max = 0.; + + for (IdType i = indptr[idx]; i < indptr[idx + 1]; ++i) { + auto v = indices[i]; + + if (result[v] < 0) { + has_unmatched_neighbors = true; + } + if (result[v] == BLUE && proposal[v] == idx && weights[i] >= weight_max) { + v_max = v; + weight_max = weights[i]; + } + } + if (v_max >= 0) { + result[v_max] = min(idx, v_max); + result[idx] = min(idx, v_max); + } + + if (!has_unmatched_neighbors) result[idx] = idx; + } +} + +/** @brief The colorize procedure. This procedure randomly marks unmarked + * nodes with BLUE(-1) and RED(-2) and checks whether the node matching + * process has finished. + */ +template +bool Colorize(IdType *result_data, int64_t num_nodes, float *const prop) { + // initial done signal + cudaStream_t stream = runtime::getCurrentCUDAStream(); + CUDA_KERNEL_CALL(init_done_kernel, 1, 1, 0, stream); + + // generate color prop for each node + uint64_t seed = dgl::RandomEngine::ThreadLocal()->RandInt(UINT64_MAX); + auto num_threads = cuda::FindNumThreads(num_nodes); + auto num_blocks = cuda::FindNumBlocks<'x'>(BLOCKS(num_nodes, num_threads)); + CUDA_KERNEL_CALL( + generate_uniform_kernel, num_blocks, num_threads, 0, stream, prop, + num_nodes, seed); + + // call kernel + CUDA_KERNEL_CALL( + colorize_kernel, num_blocks, num_threads, 0, stream, prop, num_nodes, + result_data); + bool done_h = false; + CUDA_CALL(cudaMemcpyFromSymbol( + &done_h, done_d, sizeof(done_h), 0, cudaMemcpyDeviceToHost)); + return done_h; +} + +/** @brief Weighted neighbor matching procedure (GPU version). + * This implementation is from `A GPU Algorithm for Greedy Graph Matching + * `__ + * + * This algorithm has three parts: colorize, propose and respond. 
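One HIP-specific detail in the hipified Colorize above is that hipMemcpyFromSymbol takes the device variable wrapped in HIP_SYMBOL(...) rather than the bare symbol. A hedged sketch of that device-side convergence-flag pattern with illustrative kernel and symbol names:

#include <hip/hip_runtime.h>

__device__ bool converged_d;  // illustrative device-side flag

__global__ void reset_flag_kernel() { converged_d = true; }

__global__ void work_kernel(int n) {
  int i = blockIdx.x * blockDim.x + threadIdx.x;
  // Benign race: many threads may write the same value, as in colorize_kernel.
  if (i < n) converged_d = false;
}

bool RunOneRound(int n, hipStream_t stream) {
  hipLaunchKernelGGL(reset_flag_kernel, dim3(1), dim3(1), 0, stream);
  hipLaunchKernelGGL(work_kernel, dim3((n + 255) / 256), dim3(256), 0, stream,
                     n);
  hipStreamSynchronize(stream);
  bool converged_h = false;
  // HIP requires wrapping the device symbol with HIP_SYMBOL(...).
  hipMemcpyFromSymbol(&converged_h, HIP_SYMBOL(converged_d),
                      sizeof(converged_h), 0, hipMemcpyDeviceToHost);
  return converged_h;
}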
+ * In colorize procedure, each unmarked node will be marked as BLUE or + * RED randomly. If all nodes are marked, finish and return. + * In propose procedure, each BLUE node will propose to the RED + * neighbor with the largest weight (or randomly choose one if without weight). + * If all its neighbors are marked, mark this node with its id. + * In respond procedure, each RED node will respond to the BLUE neighbor + * that has proposed to it and has the largest weight. If all neighbors + * are marked, mark this node with its id. Else match this (BLUE, RED) node + * pair and mark them with the smaller id between them. + */ +template +void WeightedNeighborMatching( + const aten::CSRMatrix &csr, const NDArray weight, IdArray result) { + cudaStream_t stream = runtime::getCurrentCUDAStream(); + const auto &ctx = result->ctx; + auto device = runtime::DeviceAPI::Get(ctx); + device->SetDevice(ctx); + + // create proposal tensor + const int64_t num_nodes = result->shape[0]; + IdArray proposal = aten::Full(-1, num_nodes, sizeof(IdType) * 8, ctx); + + // get data ptrs + IdType *indptr_data = static_cast(csr.indptr->data); + IdType *indices_data = static_cast(csr.indices->data); + IdType *result_data = static_cast(result->data); + IdType *proposal_data = static_cast(proposal->data); + FloatType *weight_data = static_cast(weight->data); + + // allocate workspace for prop used in Colorize() + float *prop = static_cast( + device->AllocWorkspace(ctx, num_nodes * sizeof(float))); + + auto num_threads = cuda::FindNumThreads(num_nodes); + auto num_blocks = cuda::FindNumBlocks<'x'>(BLOCKS(num_nodes, num_threads)); + while (!Colorize(result_data, num_nodes, prop)) { + CUDA_KERNEL_CALL( + weighted_propose_kernel, num_blocks, num_threads, 0, stream, + indptr_data, indices_data, weight_data, num_nodes, proposal_data, + result_data); + CUDA_KERNEL_CALL( + weighted_respond_kernel, num_blocks, num_threads, 0, stream, + indptr_data, indices_data, weight_data, num_nodes, proposal_data, + result_data); + } + device->FreeWorkspace(ctx, prop); +} +template void WeightedNeighborMatching( + const aten::CSRMatrix &csr, const NDArray weight, IdArray result); +template void WeightedNeighborMatching( + const aten::CSRMatrix &csr, const NDArray weight, IdArray result); +template void WeightedNeighborMatching( + const aten::CSRMatrix &csr, const NDArray weight, IdArray result); +template void WeightedNeighborMatching( + const aten::CSRMatrix &csr, const NDArray weight, IdArray result); + +/** @brief Unweighted neighbor matching procedure (GPU version). + * Instead of directly sample neighbors, we assign each neighbor + * with a random weight. We use random weight for 2 reasons: + * 1. Random sample for each node in GPU is expensive. Although + * we can perform a global group-wise (neighborhood of each + * node as a group) random permutation as in CPU version, + * it still cost too much compared to directly using random weights. + * 2. Graph is sparse, thus neighborhood of each node is small, + * which is suitable for GPU implementation. 
+ */ +template +void NeighborMatching(const aten::CSRMatrix &csr, IdArray result) { + const int64_t num_edges = csr.indices->shape[0]; + const auto &ctx = result->ctx; + auto device = runtime::DeviceAPI::Get(ctx); + device->SetDevice(ctx); + + // generate random weights + cudaStream_t stream = runtime::getCurrentCUDAStream(); + NDArray weight = NDArray::Empty( + {num_edges}, DGLDataType{kDGLFloat, sizeof(float) * 8, 1}, ctx); + float *weight_data = static_cast(weight->data); + uint64_t seed = dgl::RandomEngine::ThreadLocal()->RandInt(UINT64_MAX); + auto num_threads = cuda::FindNumThreads(num_edges); + auto num_blocks = cuda::FindNumBlocks<'x'>(BLOCKS(num_edges, num_threads)); + CUDA_KERNEL_CALL( + generate_uniform_kernel, num_blocks, num_threads, 0, stream, weight_data, + num_edges, seed); + + WeightedNeighborMatching(csr, weight, result); +} +template void NeighborMatching( + const aten::CSRMatrix &csr, IdArray result); +template void NeighborMatching( + const aten::CSRMatrix &csr, IdArray result); + +} // namespace impl +} // namespace geometry +} // namespace dgl diff --git a/src/geometry/cuda/geometry_op_impl.cu b/src/geometry/cuda/geometry_op_impl.cu index ac3b05966322..eb8a38de3070 100644 --- a/src/geometry/cuda/geometry_op_impl.cu +++ b/src/geometry/cuda/geometry_op_impl.cu @@ -1,3 +1,4 @@ +#include "hip/hip_runtime.h" /** * Copyright (c) 2019 by Contributors * @file geometry/cuda/geometry_op_impl.cc @@ -95,7 +96,7 @@ template void FarthestPointSampler( NDArray array, int64_t batch_size, int64_t sample_points, NDArray dist, IdArray start_idx, IdArray result) { - cudaStream_t stream = runtime::getCurrentCUDAStream(); + hipStream_t stream = runtime::getCurrentCUDAStream(); const FloatType* array_data = static_cast(array->data); @@ -110,7 +111,7 @@ void FarthestPointSampler( // sample for each cloud in the batch IdType* start_idx_data = static_cast(start_idx->data); - CUDA_CALL(cudaSetDevice(array->ctx.device_id)); + CUDA_CALL(hipSetDevice(array->ctx.device_id)); CUDA_KERNEL_CALL( fps_kernel, batch_size, THREADS, 0, stream, array_data, batch_size, diff --git a/src/geometry/cuda/geometry_op_impl.cu.prehip b/src/geometry/cuda/geometry_op_impl.cu.prehip new file mode 100644 index 000000000000..ac3b05966322 --- /dev/null +++ b/src/geometry/cuda/geometry_op_impl.cu.prehip @@ -0,0 +1,135 @@ +/** + * Copyright (c) 2019 by Contributors + * @file geometry/cuda/geometry_op_impl.cc + * @brief Geometry operator CUDA implementation + */ +#include + +#include "../../c_api_common.h" +#include "../../runtime/cuda/cuda_common.h" +#include "../geometry_op.h" + +#define THREADS 1024 + +namespace dgl { +namespace geometry { +namespace impl { + +/** + * @brief Farthest Point Sampler without the need to compute all pairs of + * distance. + * + * The input array has shape (N, d), where N is the number of points, and d is + * the dimension. It consists of a (flatten) batch of point clouds. + * + * In each batch, the algorithm starts with the sample index specified by + * ``start_idx``. Then for each point, we maintain the minimum to-sample + * distance. Finally, we pick the point with the maximum such distance. This + * process will be repeated for ``sample_points`` - 1 times. 
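(Aside: a hedged single-cloud CPU reference of the procedure just described, i.e. keep each point's minimum squared distance to the picked set and repeatedly pick the argmax. The function name and signature are illustrative, not DGL API.)

#include <algorithm>
#include <cstdint>
#include <limits>
#include <vector>

std::vector<int64_t> FarthestPointSampleCPU(
    const std::vector<float> &xyz,  // flattened (n, d) point cloud
    int64_t n, int64_t d, int64_t sample_points, int64_t start_idx) {
  std::vector<int64_t> picked = {start_idx};
  std::vector<float> min_dist(n, std::numeric_limits<float>::max());
  for (int64_t s = 1; s < sample_points; ++s) {
    const int64_t last = picked.back();
    int64_t argmax = 0;
    float best = -1.f;
    for (int64_t j = 0; j < n; ++j) {
      float dist = 0.f;
      for (int64_t k = 0; k < d; ++k) {
        const float diff = xyz[j * d + k] - xyz[last * d + k];
        dist += diff * diff;
      }
      // Track the minimum squared distance to any already-picked point ...
      min_dist[j] = std::min(min_dist[j], dist);
      // ... and sample the point that maximizes that minimum.
      if (min_dist[j] > best) {
        best = min_dist[j];
        argmax = j;
      }
    }
    picked.push_back(argmax);
  }
  return picked;
}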
+ */ +template +__global__ void fps_kernel( + const FloatType* array_data, const int64_t batch_size, + const int64_t sample_points, const int64_t point_in_batch, + const int64_t dim, const IdType* start_idx, FloatType* dist_data, + IdType* ret_data) { + const int64_t thread_idx = threadIdx.x; + const int64_t batch_idx = blockIdx.x; + + const int64_t array_start = point_in_batch * batch_idx; + const int64_t ret_start = sample_points * batch_idx; + + __shared__ FloatType dist_max_ht[THREADS]; + __shared__ int64_t dist_argmax_ht[THREADS]; + + // start with random initialization + if (thread_idx == 0) { + ret_data[ret_start] = (IdType)(start_idx[batch_idx]); + } + + // sample the rest `sample_points - 1` points + for (auto i = 0; i < sample_points - 1; i++) { + __syncthreads(); + + // the last sampled point + int64_t sample_idx = (int64_t)(ret_data[ret_start + i]); + dist_argmax_ht[thread_idx] = 0; + dist_max_ht[thread_idx] = (FloatType)(-1.); + + // multi-thread distance calculation + for (auto j = thread_idx; j < point_in_batch; j += THREADS) { + FloatType one_dist = (FloatType)(0.); + for (auto d = 0; d < dim; d++) { + FloatType tmp = array_data[(array_start + j) * dim + d] - + array_data[(array_start + sample_idx) * dim + d]; + one_dist += tmp * tmp; + } + + if (i == 0 || dist_data[array_start + j] > one_dist) { + dist_data[array_start + j] = one_dist; + } + + if (dist_data[array_start + j] > dist_max_ht[thread_idx]) { + dist_argmax_ht[thread_idx] = j; + dist_max_ht[thread_idx] = dist_data[array_start + j]; + } + } + + __syncthreads(); + + if (thread_idx == 0) { + FloatType best = dist_max_ht[0]; + int64_t best_idx = dist_argmax_ht[0]; + for (auto j = 1; j < THREADS; j++) { + if (dist_max_ht[j] > best) { + best = dist_max_ht[j]; + best_idx = dist_argmax_ht[j]; + } + } + ret_data[ret_start + i + 1] = (IdType)(best_idx); + } + } +} + +template +void FarthestPointSampler( + NDArray array, int64_t batch_size, int64_t sample_points, NDArray dist, + IdArray start_idx, IdArray result) { + cudaStream_t stream = runtime::getCurrentCUDAStream(); + + const FloatType* array_data = static_cast(array->data); + + const int64_t point_in_batch = array->shape[0] / batch_size; + const int64_t dim = array->shape[1]; + + // return value + IdType* ret_data = static_cast(result->data); + + // distance + FloatType* dist_data = static_cast(dist->data); + + // sample for each cloud in the batch + IdType* start_idx_data = static_cast(start_idx->data); + CUDA_CALL(cudaSetDevice(array->ctx.device_id)); + + CUDA_KERNEL_CALL( + fps_kernel, batch_size, THREADS, 0, stream, array_data, batch_size, + sample_points, point_in_batch, dim, start_idx_data, dist_data, ret_data); +} + +template void FarthestPointSampler( + NDArray array, int64_t batch_size, int64_t sample_points, NDArray dist, + IdArray start_idx, IdArray result); +template void FarthestPointSampler( + NDArray array, int64_t batch_size, int64_t sample_points, NDArray dist, + IdArray start_idx, IdArray result); +template void FarthestPointSampler( + NDArray array, int64_t batch_size, int64_t sample_points, NDArray dist, + IdArray start_idx, IdArray result); +template void FarthestPointSampler( + NDArray array, int64_t batch_size, int64_t sample_points, NDArray dist, + IdArray start_idx, IdArray result); + +} // namespace impl +} // namespace geometry +} // namespace dgl diff --git a/src/graph/heterograph_capi.cc b/src/graph/heterograph_capi.cc index 9e88d01c1ba5..f8dcd95162ac 100644 --- a/src/graph/heterograph_capi.cc +++ b/src/graph/heterograph_capi.cc @@ -687,7 
+687,7 @@ DGL_REGISTER_GLOBAL("heterograph_index._CAPI_DGLHeteroCreateFormat") } }; -#if !(defined(DGL_USE_CUDA)) +#if !(defined(DGL_USE_ROCM)) runtime::parallel_for(0, hg->NumEdgeTypes(), get_format_f); #else get_format_f(0, hg->NumEdgeTypes()); diff --git a/src/graph/heterograph_capi.cc.prehip b/src/graph/heterograph_capi.cc.prehip new file mode 100644 index 000000000000..9e88d01c1ba5 --- /dev/null +++ b/src/graph/heterograph_capi.cc.prehip @@ -0,0 +1,841 @@ +/** + * Copyright (c) 2020 by Contributors + * @file graph/heterograph_capi.cc + * @brief Heterograph CAPI bindings. + */ +#include +#include +#include +#include +#include +#include +#include + +#include + +#include "../c_api_common.h" +#include "./heterograph.h" +#include "unit_graph.h" + +using namespace dgl::runtime; + +namespace dgl { + +///////////////////////// Unitgraph functions ///////////////////////// + +// XXX(minjie): Ideally, Unitgraph should be invisible to python side + +DGL_REGISTER_GLOBAL("heterograph_index._CAPI_DGLHeteroCreateUnitGraphFromCOO") + .set_body([](DGLArgs args, DGLRetValue* rv) { + int64_t nvtypes = args[0]; + int64_t num_src = args[1]; + int64_t num_dst = args[2]; + IdArray row = args[3]; + IdArray col = args[4]; + List formats = args[5]; + bool row_sorted = args[6]; + bool col_sorted = args[7]; + std::vector formats_vec; + for (Value val : formats) { + std::string fmt = val->data; + formats_vec.push_back(ParseSparseFormat(fmt)); + } + const auto code = SparseFormatsToCode(formats_vec); + auto hgptr = CreateFromCOO( + nvtypes, num_src, num_dst, row, col, row_sorted, col_sorted, code); + *rv = HeteroGraphRef(hgptr); + }); + +DGL_REGISTER_GLOBAL("heterograph_index._CAPI_DGLHeteroCreateUnitGraphFromCSR") + .set_body([](DGLArgs args, DGLRetValue* rv) { + int64_t nvtypes = args[0]; + int64_t num_src = args[1]; + int64_t num_dst = args[2]; + IdArray indptr = args[3]; + IdArray indices = args[4]; + IdArray edge_ids = args[5]; + List formats = args[6]; + bool transpose = args[7]; + std::vector formats_vec; + for (Value val : formats) { + std::string fmt = val->data; + formats_vec.push_back(ParseSparseFormat(fmt)); + } + const auto code = SparseFormatsToCode(formats_vec); + if (!transpose) { + auto hgptr = CreateFromCSR( + nvtypes, num_src, num_dst, indptr, indices, edge_ids, code); + *rv = HeteroGraphRef(hgptr); + } else { + auto hgptr = CreateFromCSC( + nvtypes, num_src, num_dst, indptr, indices, edge_ids, code); + *rv = HeteroGraphRef(hgptr); + } + }); + +DGL_REGISTER_GLOBAL("heterograph_index._CAPI_DGLHeteroCreateHeteroGraph") + .set_body([](DGLArgs args, DGLRetValue* rv) { + GraphRef meta_graph = args[0]; + List rel_graphs = args[1]; + std::vector rel_ptrs; + rel_ptrs.reserve(rel_graphs.size()); + for (const auto& ref : rel_graphs) { + rel_ptrs.push_back(ref.sptr()); + } + auto hgptr = CreateHeteroGraph(meta_graph.sptr(), rel_ptrs); + *rv = HeteroGraphRef(hgptr); + }); + +DGL_REGISTER_GLOBAL( + "heterograph_index._CAPI_DGLHeteroCreateHeteroGraphWithNumNodes") + .set_body([](DGLArgs args, DGLRetValue* rv) { + GraphRef meta_graph = args[0]; + List rel_graphs = args[1]; + IdArray num_nodes_per_type = args[2]; + std::vector rel_ptrs; + rel_ptrs.reserve(rel_graphs.size()); + for (const auto& ref : rel_graphs) { + rel_ptrs.push_back(ref.sptr()); + } + auto hgptr = CreateHeteroGraph( + meta_graph.sptr(), rel_ptrs, num_nodes_per_type.ToVector()); + *rv = HeteroGraphRef(hgptr); + }); + +///////////////////////// HeteroGraph member functions ///////////////////////// + 
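(Aside on the one functional hunk in this file, shown above: the parallel_for over edge types is disabled when building for a GPU backend, and hipify only renames the guard from DGL_USE_CUDA to DGL_USE_ROCM. A minimal sketch of that conditional-compilation pattern follows; DGL_USE_GPU is a purely hypothetical alias used here for illustration, not an existing flag.)

// Hypothetical alias covering both GPU backends; the real sources test the
// two macros DGL_USE_CUDA and DGL_USE_ROCM directly.
#if defined(DGL_USE_CUDA) || defined(DGL_USE_ROCM)
#define DGL_USE_GPU 1
#endif

#if !defined(DGL_USE_GPU)
  // CPU-only build: materialize formats for all edge types in parallel.
  runtime::parallel_for(0, hg->NumEdgeTypes(), get_format_f);
#else
  // GPU build: run sequentially on the current thread.
  get_format_f(0, hg->NumEdgeTypes());
#endif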
+DGL_REGISTER_GLOBAL("heterograph_index._CAPI_DGLHeteroGetMetaGraph") + .set_body([](DGLArgs args, DGLRetValue* rv) { + HeteroGraphRef hg = args[0]; + *rv = hg->meta_graph(); + }); + +DGL_REGISTER_GLOBAL("heterograph_index._CAPI_DGLHeteroIsMetaGraphUniBipartite") + .set_body([](DGLArgs args, DGLRetValue* rv) { + HeteroGraphRef hg = args[0]; + GraphPtr mg = hg->meta_graph(); + *rv = mg->IsUniBipartite(); + }); + +DGL_REGISTER_GLOBAL("heterograph_index._CAPI_DGLHeteroGetRelationGraph") + .set_body([](DGLArgs args, DGLRetValue* rv) { + HeteroGraphRef hg = args[0]; + dgl_type_t etype = args[1]; + CHECK_LE(etype, hg->NumEdgeTypes()) << "invalid edge type " << etype; + auto unit_graph = hg->GetRelationGraph(etype); + auto meta_graph = unit_graph->meta_graph(); + auto hgptr = CreateHeteroGraph( + meta_graph, {unit_graph}, unit_graph->NumVerticesPerType()); + *rv = HeteroGraphRef(hgptr); + }); + +DGL_REGISTER_GLOBAL("heterograph_index._CAPI_DGLHeteroGetFlattenedGraph") + .set_body([](DGLArgs args, DGLRetValue* rv) { + HeteroGraphRef hg = args[0]; + List etypes = args[1]; + std::vector etypes_vec; + for (Value val : etypes) { + // (gq) have to decompose it into two statements because of a weird MSVC + // internal error + dgl_id_t id = val->data; + etypes_vec.push_back(id); + } + + *rv = FlattenedHeteroGraphRef(hg->Flatten(etypes_vec)); + }); + +DGL_REGISTER_GLOBAL("heterograph_index._CAPI_DGLHeteroAddVertices") + .set_body([](DGLArgs args, DGLRetValue* rv) { + HeteroGraphRef hg = args[0]; + dgl_type_t vtype = args[1]; + int64_t num = args[2]; + hg->AddVertices(vtype, num); + }); + +DGL_REGISTER_GLOBAL("heterograph_index._CAPI_DGLHeteroAddEdge") + .set_body([](DGLArgs args, DGLRetValue* rv) { + HeteroGraphRef hg = args[0]; + dgl_type_t etype = args[1]; + dgl_id_t src = args[2]; + dgl_id_t dst = args[3]; + hg->AddEdge(etype, src, dst); + }); + +DGL_REGISTER_GLOBAL("heterograph_index._CAPI_DGLHeteroAddEdges") + .set_body([](DGLArgs args, DGLRetValue* rv) { + HeteroGraphRef hg = args[0]; + dgl_type_t etype = args[1]; + IdArray src = args[2]; + IdArray dst = args[3]; + hg->AddEdges(etype, src, dst); + }); + +DGL_REGISTER_GLOBAL("heterograph_index._CAPI_DGLHeteroClear") + .set_body([](DGLArgs args, DGLRetValue* rv) { + HeteroGraphRef hg = args[0]; + hg->Clear(); + }); + +DGL_REGISTER_GLOBAL("heterograph_index._CAPI_DGLHeteroDataType") + .set_body([](DGLArgs args, DGLRetValue* rv) { + HeteroGraphRef hg = args[0]; + *rv = hg->DataType(); + }); + +DGL_REGISTER_GLOBAL("heterograph_index._CAPI_DGLHeteroContext") + .set_body([](DGLArgs args, DGLRetValue* rv) { + HeteroGraphRef hg = args[0]; + *rv = hg->Context(); + }); + +DGL_REGISTER_GLOBAL("heterograph_index._CAPI_DGLHeteroIsPinned") + .set_body([](DGLArgs args, DGLRetValue* rv) { + HeteroGraphRef hg = args[0]; + *rv = hg->IsPinned(); + }); + +DGL_REGISTER_GLOBAL("heterograph_index._CAPI_DGLHeteroNumBits") + .set_body([](DGLArgs args, DGLRetValue* rv) { + HeteroGraphRef hg = args[0]; + *rv = hg->NumBits(); + }); + +DGL_REGISTER_GLOBAL("heterograph_index._CAPI_DGLHeteroIsMultigraph") + .set_body([](DGLArgs args, DGLRetValue* rv) { + HeteroGraphRef hg = args[0]; + *rv = hg->IsMultigraph(); + }); + +DGL_REGISTER_GLOBAL("heterograph_index._CAPI_DGLHeteroIsReadonly") + .set_body([](DGLArgs args, DGLRetValue* rv) { + HeteroGraphRef hg = args[0]; + *rv = hg->IsReadonly(); + }); + +DGL_REGISTER_GLOBAL("heterograph_index._CAPI_DGLHeteroNumVertices") + .set_body([](DGLArgs args, DGLRetValue* rv) { + HeteroGraphRef hg = args[0]; + dgl_type_t vtype = args[1]; + *rv = 
static_cast(hg->NumVertices(vtype)); + }); + +DGL_REGISTER_GLOBAL("heterograph_index._CAPI_DGLHeteroNumEdges") + .set_body([](DGLArgs args, DGLRetValue* rv) { + HeteroGraphRef hg = args[0]; + dgl_type_t etype = args[1]; + *rv = static_cast(hg->NumEdges(etype)); + }); + +DGL_REGISTER_GLOBAL("heterograph_index._CAPI_DGLHeteroHasVertex") + .set_body([](DGLArgs args, DGLRetValue* rv) { + HeteroGraphRef hg = args[0]; + dgl_type_t vtype = args[1]; + dgl_id_t vid = args[2]; + *rv = hg->HasVertex(vtype, vid); + }); + +DGL_REGISTER_GLOBAL("heterograph_index._CAPI_DGLHeteroHasVertices") + .set_body([](DGLArgs args, DGLRetValue* rv) { + HeteroGraphRef hg = args[0]; + dgl_type_t vtype = args[1]; + IdArray vids = args[2]; + *rv = hg->HasVertices(vtype, vids); + }); + +DGL_REGISTER_GLOBAL("heterograph_index._CAPI_DGLHeteroHasEdgeBetween") + .set_body([](DGLArgs args, DGLRetValue* rv) { + HeteroGraphRef hg = args[0]; + dgl_type_t etype = args[1]; + dgl_id_t src = args[2]; + dgl_id_t dst = args[3]; + *rv = hg->HasEdgeBetween(etype, src, dst); + }); + +DGL_REGISTER_GLOBAL("heterograph_index._CAPI_DGLHeteroHasEdgesBetween") + .set_body([](DGLArgs args, DGLRetValue* rv) { + HeteroGraphRef hg = args[0]; + dgl_type_t etype = args[1]; + IdArray src = args[2]; + IdArray dst = args[3]; + *rv = hg->HasEdgesBetween(etype, src, dst); + }); + +DGL_REGISTER_GLOBAL("heterograph_index._CAPI_DGLHeteroPredecessors") + .set_body([](DGLArgs args, DGLRetValue* rv) { + HeteroGraphRef hg = args[0]; + dgl_type_t etype = args[1]; + dgl_id_t dst = args[2]; + *rv = hg->Predecessors(etype, dst); + }); + +DGL_REGISTER_GLOBAL("heterograph_index._CAPI_DGLHeteroSuccessors") + .set_body([](DGLArgs args, DGLRetValue* rv) { + HeteroGraphRef hg = args[0]; + dgl_type_t etype = args[1]; + dgl_id_t src = args[2]; + *rv = hg->Successors(etype, src); + }); + +DGL_REGISTER_GLOBAL("heterograph_index._CAPI_DGLHeteroEdgeId") + .set_body([](DGLArgs args, DGLRetValue* rv) { + HeteroGraphRef hg = args[0]; + dgl_type_t etype = args[1]; + dgl_id_t src = args[2]; + dgl_id_t dst = args[3]; + *rv = hg->EdgeId(etype, src, dst); + }); + +DGL_REGISTER_GLOBAL("heterograph_index._CAPI_DGLHeteroEdgeIdsAll") + .set_body([](DGLArgs args, DGLRetValue* rv) { + HeteroGraphRef hg = args[0]; + dgl_type_t etype = args[1]; + IdArray src = args[2]; + IdArray dst = args[3]; + const auto& ret = hg->EdgeIdsAll(etype, src, dst); + *rv = ConvertEdgeArrayToPackedFunc(ret); + }); + +DGL_REGISTER_GLOBAL("heterograph_index._CAPI_DGLHeteroEdgeIdsOne") + .set_body([](DGLArgs args, DGLRetValue* rv) { + HeteroGraphRef hg = args[0]; + dgl_type_t etype = args[1]; + IdArray src = args[2]; + IdArray dst = args[3]; + *rv = hg->EdgeIdsOne(etype, src, dst); + }); + +DGL_REGISTER_GLOBAL("heterograph_index._CAPI_DGLHeteroFindEdges") + .set_body([](DGLArgs args, DGLRetValue* rv) { + HeteroGraphRef hg = args[0]; + dgl_type_t etype = args[1]; + IdArray eids = args[2]; + const auto& ret = hg->FindEdges(etype, eids); + *rv = ConvertEdgeArrayToPackedFunc(ret); + }); + +DGL_REGISTER_GLOBAL("heterograph_index._CAPI_DGLHeteroInEdges_1") + .set_body([](DGLArgs args, DGLRetValue* rv) { + HeteroGraphRef hg = args[0]; + dgl_type_t etype = args[1]; + dgl_id_t vid = args[2]; + const auto& ret = hg->InEdges(etype, vid); + *rv = ConvertEdgeArrayToPackedFunc(ret); + }); + +DGL_REGISTER_GLOBAL("heterograph_index._CAPI_DGLHeteroInEdges_2") + .set_body([](DGLArgs args, DGLRetValue* rv) { + HeteroGraphRef hg = args[0]; + dgl_type_t etype = args[1]; + IdArray vids = args[2]; + const auto& ret = hg->InEdges(etype, 
vids); + *rv = ConvertEdgeArrayToPackedFunc(ret); + }); + +DGL_REGISTER_GLOBAL("heterograph_index._CAPI_DGLHeteroOutEdges_1") + .set_body([](DGLArgs args, DGLRetValue* rv) { + HeteroGraphRef hg = args[0]; + dgl_type_t etype = args[1]; + dgl_id_t vid = args[2]; + const auto& ret = hg->OutEdges(etype, vid); + *rv = ConvertEdgeArrayToPackedFunc(ret); + }); + +DGL_REGISTER_GLOBAL("heterograph_index._CAPI_DGLHeteroOutEdges_2") + .set_body([](DGLArgs args, DGLRetValue* rv) { + HeteroGraphRef hg = args[0]; + dgl_type_t etype = args[1]; + IdArray vids = args[2]; + const auto& ret = hg->OutEdges(etype, vids); + *rv = ConvertEdgeArrayToPackedFunc(ret); + }); + +DGL_REGISTER_GLOBAL("heterograph_index._CAPI_DGLHeteroEdges") + .set_body([](DGLArgs args, DGLRetValue* rv) { + HeteroGraphRef hg = args[0]; + dgl_type_t etype = args[1]; + std::string order = args[2]; + const auto& ret = hg->Edges(etype, order); + *rv = ConvertEdgeArrayToPackedFunc(ret); + }); + +DGL_REGISTER_GLOBAL("heterograph_index._CAPI_DGLHeteroInDegree") + .set_body([](DGLArgs args, DGLRetValue* rv) { + HeteroGraphRef hg = args[0]; + dgl_type_t etype = args[1]; + dgl_id_t vid = args[2]; + *rv = static_cast(hg->InDegree(etype, vid)); + }); + +DGL_REGISTER_GLOBAL("heterograph_index._CAPI_DGLHeteroInDegrees") + .set_body([](DGLArgs args, DGLRetValue* rv) { + HeteroGraphRef hg = args[0]; + dgl_type_t etype = args[1]; + IdArray vids = args[2]; + *rv = hg->InDegrees(etype, vids); + }); + +DGL_REGISTER_GLOBAL("heterograph_index._CAPI_DGLHeteroOutDegree") + .set_body([](DGLArgs args, DGLRetValue* rv) { + HeteroGraphRef hg = args[0]; + dgl_type_t etype = args[1]; + dgl_id_t vid = args[2]; + *rv = static_cast(hg->OutDegree(etype, vid)); + }); + +DGL_REGISTER_GLOBAL("heterograph_index._CAPI_DGLHeteroOutDegrees") + .set_body([](DGLArgs args, DGLRetValue* rv) { + HeteroGraphRef hg = args[0]; + dgl_type_t etype = args[1]; + IdArray vids = args[2]; + *rv = hg->OutDegrees(etype, vids); + }); + +DGL_REGISTER_GLOBAL("heterograph_index._CAPI_DGLHeteroGetAdj") + .set_body([](DGLArgs args, DGLRetValue* rv) { + HeteroGraphRef hg = args[0]; + dgl_type_t etype = args[1]; + bool transpose = args[2]; + std::string fmt = args[3]; + *rv = ConvertNDArrayVectorToPackedFunc(hg->GetAdj(etype, transpose, fmt)); + }); + +DGL_REGISTER_GLOBAL("heterograph_index._CAPI_DGLHeteroVertexSubgraph") + .set_body([](DGLArgs args, DGLRetValue* rv) { + HeteroGraphRef hg = args[0]; + List vids = args[1]; + std::vector vid_vec; + vid_vec.reserve(vids.size()); + for (Value val : vids) { + vid_vec.push_back(val->data); + } + std::shared_ptr subg( + new HeteroSubgraph(hg->VertexSubgraph(vid_vec))); + *rv = HeteroSubgraphRef(subg); + }); + +DGL_REGISTER_GLOBAL("heterograph_index._CAPI_DGLHeteroEdgeSubgraph") + .set_body([](DGLArgs args, DGLRetValue* rv) { + HeteroGraphRef hg = args[0]; + List eids = args[1]; + bool preserve_nodes = args[2]; + std::vector eid_vec; + eid_vec.reserve(eids.size()); + for (Value val : eids) { + eid_vec.push_back(val->data); + } + std::shared_ptr subg( + new HeteroSubgraph(hg->EdgeSubgraph(eid_vec, preserve_nodes))); + *rv = HeteroSubgraphRef(subg); + }); + +///////////////////////// HeteroSubgraph members ///////////////////////// + +DGL_REGISTER_GLOBAL("heterograph_index._CAPI_DGLHeteroSubgraphGetGraph") + .set_body([](DGLArgs args, DGLRetValue* rv) { + HeteroSubgraphRef subg = args[0]; + *rv = HeteroGraphRef(subg->graph); + }); + +DGL_REGISTER_GLOBAL( + "heterograph_index._CAPI_DGLHeteroSubgraphGetInducedVertices") + .set_body([](DGLArgs args, DGLRetValue* 
rv) { + HeteroSubgraphRef subg = args[0]; + List induced_verts; + for (IdArray arr : subg->induced_vertices) { + induced_verts.push_back(Value(MakeValue(arr))); + } + *rv = induced_verts; + }); + +DGL_REGISTER_GLOBAL("heterograph_index._CAPI_DGLHeteroSubgraphGetInducedEdges") + .set_body([](DGLArgs args, DGLRetValue* rv) { + HeteroSubgraphRef subg = args[0]; + List induced_edges; + for (IdArray arr : subg->induced_edges) { + induced_edges.push_back(Value(MakeValue(arr))); + } + *rv = induced_edges; + }); + +///////////////////////// Global functions and algorithms +//////////////////////////// + +DGL_REGISTER_GLOBAL("heterograph_index._CAPI_DGLHeteroAsNumBits") + .set_body([](DGLArgs args, DGLRetValue* rv) { + HeteroGraphRef hg = args[0]; + int bits = args[1]; + HeteroGraphPtr bhg_ptr = hg.sptr(); + auto hg_ptr = std::dynamic_pointer_cast(bhg_ptr); + HeteroGraphPtr hg_new; + if (hg_ptr) { + hg_new = HeteroGraph::AsNumBits(hg_ptr, bits); + } else { + hg_new = UnitGraph::AsNumBits(bhg_ptr, bits); + } + *rv = HeteroGraphRef(hg_new); + }); + +DGL_REGISTER_GLOBAL("heterograph_index._CAPI_DGLHeteroCopyTo") + .set_body([](DGLArgs args, DGLRetValue* rv) { + HeteroGraphRef hg = args[0]; + int device_type = args[1]; + int device_id = args[2]; + DGLContext ctx; + ctx.device_type = static_cast(device_type); + ctx.device_id = device_id; + HeteroGraphPtr hg_new = HeteroGraph::CopyTo(hg.sptr(), ctx); + *rv = HeteroGraphRef(hg_new); + }); + +DGL_REGISTER_GLOBAL("heterograph_index._CAPI_DGLHeteroPinMemory") + .set_body([](DGLArgs args, DGLRetValue* rv) { + HeteroGraphRef hg = args[0]; + HeteroGraphPtr hg_new = HeteroGraph::PinMemory(hg.sptr()); + *rv = HeteroGraphRef(hg_new); + }); + +DGL_REGISTER_GLOBAL("heterograph_index._CAPI_DGLHeteroPinMemory_") + .set_body([](DGLArgs args, DGLRetValue* rv) { + HeteroGraphRef hg = args[0]; + auto hgindex = std::dynamic_pointer_cast(hg.sptr()); + hgindex->PinMemory_(); + *rv = hg; + }); + +DGL_REGISTER_GLOBAL("heterograph_index._CAPI_DGLHeteroUnpinMemory_") + .set_body([](DGLArgs args, DGLRetValue* rv) { + HeteroGraphRef hg = args[0]; + auto hgindex = std::dynamic_pointer_cast(hg.sptr()); + hgindex->UnpinMemory_(); + *rv = hg; + }); + +DGL_REGISTER_GLOBAL("heterograph_index._CAPI_DGLHeteroRecordStream") + .set_body([](DGLArgs args, DGLRetValue* rv) { + HeteroGraphRef hg = args[0]; + DGLStreamHandle stream = args[1]; + auto hgindex = std::dynamic_pointer_cast(hg.sptr()); + hgindex->RecordStream(stream); + *rv = hg; + }); + +DGL_REGISTER_GLOBAL("heterograph_index._CAPI_DGLHeteroCopyToSharedMem") + .set_body([](DGLArgs args, DGLRetValue* rv) { + HeteroGraphRef hg = args[0]; + std::string name = args[1]; + List ntypes = args[2]; + List etypes = args[3]; + List fmts = args[4]; + auto ntypes_vec = ListValueToVector(ntypes); + auto etypes_vec = ListValueToVector(etypes); + std::set fmts_set; + for (const auto& fmt : fmts) { + std::string fmt_data = fmt->data; + fmts_set.insert(fmt_data); + } + auto hg_share = HeteroGraph::CopyToSharedMem( + hg.sptr(), name, ntypes_vec, etypes_vec, fmts_set); + *rv = HeteroGraphRef(hg_share); + }); + +DGL_REGISTER_GLOBAL("heterograph_index._CAPI_DGLHeteroCreateFromSharedMem") + .set_body([](DGLArgs args, DGLRetValue* rv) { + std::string name = args[0]; + HeteroGraphPtr hg; + std::vector ntypes; + std::vector etypes; + std::tie(hg, ntypes, etypes) = HeteroGraph::CreateFromSharedMem(name); + List ntypes_list; + List etypes_list; + for (const auto& ntype : ntypes) + ntypes_list.push_back(Value(MakeValue(ntype))); + for (const auto& etype : 
etypes) + etypes_list.push_back(Value(MakeValue(etype))); + List ret; + ret.push_back(HeteroGraphRef(hg)); + ret.push_back(ntypes_list); + ret.push_back(etypes_list); + *rv = ret; + }); + +DGL_REGISTER_GLOBAL("heterograph_index._CAPI_DGLHeteroJointUnion") + .set_body([](DGLArgs args, DGLRetValue* rv) { + GraphRef meta_graph = args[0]; + List component_graphs = args[1]; + CHECK(component_graphs.size() > 1) + << "Expect graph list to have at least two graphs"; + std::vector component_ptrs; + component_ptrs.reserve(component_graphs.size()); + const int64_t bits = component_graphs[0]->NumBits(); + const DGLContext ctx = component_graphs[0]->Context(); + for (const auto& component : component_graphs) { + component_ptrs.push_back(component.sptr()); + CHECK_EQ(component->NumBits(), bits) + << "Expect graphs to joint union have the same index dtype(int" + << bits << "), but got int" << component->NumBits(); + CHECK_EQ(component->Context(), ctx) + << "Expect graphs to joint union have the same context" << ctx + << "), but got " << component->Context(); + } + + auto hgptr = JointUnionHeteroGraph(meta_graph.sptr(), component_ptrs); + *rv = HeteroGraphRef(hgptr); + }); + +DGL_REGISTER_GLOBAL("heterograph_index._CAPI_DGLHeteroDisjointUnion_v2") + .set_body([](DGLArgs args, DGLRetValue* rv) { + GraphRef meta_graph = args[0]; + List component_graphs = args[1]; + CHECK(component_graphs.size() > 0) + << "Expect graph list has at least one graph"; + std::vector component_ptrs; + component_ptrs.reserve(component_graphs.size()); + const int64_t bits = component_graphs[0]->NumBits(); + const DGLContext ctx = component_graphs[0]->Context(); + for (const auto& component : component_graphs) { + component_ptrs.push_back(component.sptr()); + CHECK_EQ(component->NumBits(), bits) + << "Expect graphs to batch have the same index dtype(int" << bits + << "), but got int" << component->NumBits(); + CHECK_EQ(component->Context(), ctx) + << "Expect graphs to batch have the same context" << ctx + << "), but got " << component->Context(); + } + + auto hgptr = DisjointUnionHeteroGraph2(meta_graph.sptr(), component_ptrs); + *rv = HeteroGraphRef(hgptr); + }); + +DGL_REGISTER_GLOBAL( + "heterograph_index._CAPI_DGLHeteroDisjointPartitionBySizes_v2") + .set_body([](DGLArgs args, DGLRetValue* rv) { + HeteroGraphRef hg = args[0]; + const IdArray vertex_sizes = args[1]; + const IdArray edge_sizes = args[2]; + std::vector ret; + ret = DisjointPartitionHeteroBySizes2( + hg->meta_graph(), hg.sptr(), vertex_sizes, edge_sizes); + List ret_list; + for (HeteroGraphPtr hgptr : ret) { + ret_list.push_back(HeteroGraphRef(hgptr)); + } + *rv = ret_list; + }); + +DGL_REGISTER_GLOBAL("heterograph_index._CAPI_DGLHeteroDisjointPartitionBySizes") + .set_body([](DGLArgs args, DGLRetValue* rv) { + HeteroGraphRef hg = args[0]; + const IdArray vertex_sizes = args[1]; + const IdArray edge_sizes = args[2]; + const int64_t bits = hg->NumBits(); + std::vector ret; + ATEN_ID_BITS_SWITCH(bits, IdType, { + ret = DisjointPartitionHeteroBySizes( + hg->meta_graph(), hg.sptr(), vertex_sizes, edge_sizes); + }); + List ret_list; + for (HeteroGraphPtr hgptr : ret) { + ret_list.push_back(HeteroGraphRef(hgptr)); + } + *rv = ret_list; + }); + +DGL_REGISTER_GLOBAL("heterograph_index._CAPI_DGLHeteroSlice") + .set_body([](DGLArgs args, DGLRetValue* rv) { + HeteroGraphRef hg = args[0]; + const IdArray num_nodes_per_type = args[1]; + const IdArray start_nid_per_type = args[2]; + const IdArray num_edges_per_type = args[3]; + const IdArray start_eid_per_type = args[4]; + auto 
hgptr = SliceHeteroGraph( + hg->meta_graph(), hg.sptr(), num_nodes_per_type, start_nid_per_type, + num_edges_per_type, start_eid_per_type); + *rv = HeteroGraphRef(hgptr); + }); + +DGL_REGISTER_GLOBAL("heterograph_index._CAPI_DGLHeteroGetCreatedFormats") + .set_body([](DGLArgs args, DGLRetValue* rv) { + HeteroGraphRef hg = args[0]; + List format_list; + dgl_format_code_t code = hg->GetRelationGraph(0)->GetCreatedFormats(); + for (auto format : CodeToSparseFormats(code)) { + format_list.push_back(Value(MakeValue(ToStringSparseFormat(format)))); + } + *rv = format_list; + }); + +DGL_REGISTER_GLOBAL("heterograph_index._CAPI_DGLHeteroGetAllowedFormats") + .set_body([](DGLArgs args, DGLRetValue* rv) { + HeteroGraphRef hg = args[0]; + List format_list; + dgl_format_code_t code = hg->GetRelationGraph(0)->GetAllowedFormats(); + for (auto format : CodeToSparseFormats(code)) { + format_list.push_back(Value(MakeValue(ToStringSparseFormat(format)))); + } + *rv = format_list; + }); + +DGL_REGISTER_GLOBAL("heterograph_index._CAPI_DGLHeteroCreateFormat") + .set_body([](DGLArgs args, DGLRetValue* rv) { + HeteroGraphRef hg = args[0]; + dgl_format_code_t code = hg->GetRelationGraph(0)->GetAllowedFormats(); + auto get_format_f = [&](size_t etype_b, size_t etype_e) { + for (auto etype = etype_b; etype < etype_e; ++etype) { + auto bg = + std::dynamic_pointer_cast(hg->GetRelationGraph(etype)); + for (auto format : CodeToSparseFormats(code)) bg->GetFormat(format); + } + }; + +#if !(defined(DGL_USE_CUDA)) + runtime::parallel_for(0, hg->NumEdgeTypes(), get_format_f); +#else + get_format_f(0, hg->NumEdgeTypes()); +#endif + }); + +DGL_REGISTER_GLOBAL("heterograph_index._CAPI_DGLHeteroGetFormatGraph") + .set_body([](DGLArgs args, DGLRetValue* rv) { + HeteroGraphRef hg = args[0]; + List formats = args[1]; + std::vector formats_vec; + for (Value val : formats) { + std::string fmt = val->data; + formats_vec.push_back(ParseSparseFormat(fmt)); + } + auto hgptr = hg->GetGraphInFormat(SparseFormatsToCode(formats_vec)); + *rv = HeteroGraphRef(hgptr); + }); + +DGL_REGISTER_GLOBAL("subgraph._CAPI_DGLInSubgraph") + .set_body([](DGLArgs args, DGLRetValue* rv) { + HeteroGraphRef hg = args[0]; + const auto& nodes = ListValueToVector(args[1]); + bool relabel_nodes = args[2]; + std::shared_ptr ret(new HeteroSubgraph); + *ret = InEdgeGraph(hg.sptr(), nodes, relabel_nodes); + *rv = HeteroGraphRef(ret); + }); + +DGL_REGISTER_GLOBAL("subgraph._CAPI_DGLOutSubgraph") + .set_body([](DGLArgs args, DGLRetValue* rv) { + HeteroGraphRef hg = args[0]; + const auto& nodes = ListValueToVector(args[1]); + bool relabel_nodes = args[2]; + std::shared_ptr ret(new HeteroSubgraph); + *ret = OutEdgeGraph(hg.sptr(), nodes, relabel_nodes); + *rv = HeteroGraphRef(ret); + }); + +DGL_REGISTER_GLOBAL("transform._CAPI_DGLAsImmutableGraph") + .set_body([](DGLArgs args, DGLRetValue* rv) { + HeteroGraphRef hg = args[0]; + *rv = GraphRef(hg->AsImmutableGraph()); + }); + +DGL_REGISTER_GLOBAL("transform._CAPI_DGLHeteroSortOutEdges") + .set_body([](DGLArgs args, DGLRetValue* rv) { + HeteroGraphRef hg = args[0]; + NDArray tag = args[1]; + int64_t num_tag = args[2]; + + CHECK_EQ(hg->Context().device_type, kDGLCPU) + << "Only support sorting by tag on cpu"; + CHECK(aten::IsValidIdArray(tag)); + CHECK_EQ(tag->ctx.device_type, kDGLCPU) + << "Only support sorting by tag on cpu"; + + const auto csr = hg->GetCSRMatrix(0); + + NDArray tag_pos = aten::NullArray(); + aten::CSRMatrix output; + std::tie(output, tag_pos) = aten::CSRSortByTag(csr, tag, num_tag); + HeteroGraphPtr 
output_hg = + CreateFromCSR(hg->NumVertexTypes(), output, ALL_CODE); + List ret; + ret.push_back(HeteroGraphRef(output_hg)); + ret.push_back(Value(MakeValue(tag_pos))); + *rv = ret; + }); + +DGL_REGISTER_GLOBAL("transform._CAPI_DGLHeteroSortInEdges") + .set_body([](DGLArgs args, DGLRetValue* rv) { + HeteroGraphRef hg = args[0]; + NDArray tag = args[1]; + int64_t num_tag = args[2]; + + CHECK_EQ(hg->Context().device_type, kDGLCPU) + << "Only support sorting by tag on cpu"; + CHECK(aten::IsValidIdArray(tag)); + CHECK_EQ(tag->ctx.device_type, kDGLCPU) + << "Only support sorting by tag on cpu"; + + const auto csc = hg->GetCSCMatrix(0); + + NDArray tag_pos = aten::NullArray(); + aten::CSRMatrix output; + std::tie(output, tag_pos) = aten::CSRSortByTag(csc, tag, num_tag); + + HeteroGraphPtr output_hg = + CreateFromCSC(hg->NumVertexTypes(), output, ALL_CODE); + List ret; + ret.push_back(HeteroGraphRef(output_hg)); + ret.push_back(Value(MakeValue(tag_pos))); + *rv = ret; + }); + +DGL_REGISTER_GLOBAL("heterograph._CAPI_DGLFindSrcDstNtypes") + .set_body([](DGLArgs args, DGLRetValue* rv) { + GraphRef metagraph = args[0]; + std::unordered_set dst_set; + std::unordered_set src_set; + + for (uint64_t eid = 0; eid < metagraph->NumEdges(); ++eid) { + auto edge = metagraph->FindEdge(eid); + auto src = edge.first; + auto dst = edge.second; + dst_set.insert(dst); + src_set.insert(src); + } + + List srclist, dstlist; + List> ret_list; + for (uint64_t nid = 0; nid < metagraph->NumVertices(); ++nid) { + auto is_dst = dst_set.count(nid); + auto is_src = src_set.count(nid); + if (is_dst && is_src) + return; + else if (is_dst) + dstlist.push_back(Value(MakeValue(static_cast(nid)))); + else + // If a node type is isolated, put it in srctype as defined in the + // Python docstring. 
+ srclist.push_back(Value(MakeValue(static_cast(nid)))); + } + ret_list.push_back(srclist); + ret_list.push_back(dstlist); + *rv = ret_list; + }); + +DGL_REGISTER_GLOBAL("heterograph_index._CAPI_DGLHeteroReverse") + .set_body([](DGLArgs args, DGLRetValue* rv) { + HeteroGraphRef hg = args[0]; + CHECK_GT(hg->NumEdgeTypes(), 0); + auto g = std::dynamic_pointer_cast(hg.sptr()); + std::vector rev_ugs; + const auto& ugs = g->relation_graphs(); + rev_ugs.resize(ugs.size()); + + for (size_t i = 0; i < ugs.size(); ++i) { + const auto& rev_ug = ugs[i]->Reverse(); + rev_ugs[i] = rev_ug; + } + // node types are not changed + const auto& num_nodes = g->NumVerticesPerType(); + const auto& meta_edges = hg->meta_graph()->Edges("eid"); + // reverse the metagraph + const auto& rev_meta = ImmutableGraph::CreateFromCOO( + hg->meta_graph()->NumVertices(), meta_edges.dst, meta_edges.src); + *rv = CreateHeteroGraph(rev_meta, rev_ugs, num_nodes); + }); +} // namespace dgl diff --git a/src/graph/sampling/randomwalks/frequency_hashmap.cu b/src/graph/sampling/randomwalks/frequency_hashmap.cu index feb88b4c86e8..d805be0d8f97 100644 --- a/src/graph/sampling/randomwalks/frequency_hashmap.cu +++ b/src/graph/sampling/randomwalks/frequency_hashmap.cu @@ -1,3 +1,4 @@ +#include "hip/hip_runtime.h" /** * Copyright (c) 2021 by Contributors * @file graph/sampling/frequency_hashmap.cu @@ -5,7 +6,7 @@ */ #include -#include // NOLINT +#include // NOLINT #include #include @@ -71,7 +72,7 @@ __global__ void _count_frequency( } } - using BlockReduce = typename cub::BlockReduce; + using BlockReduce = typename hipcub::BlockReduce; __shared__ typename BlockReduce::TempStorage temp_space; count = BlockReduce(temp_space).Sum(count); @@ -112,7 +113,7 @@ __global__ void _compact_frequency( int64_t last_idx = start_idx + TILE_SIZE; const IdxType block_offset = edge_blocks_prefix[blockIdx.x]; - using BlockScan = typename cub::BlockScan; + using BlockScan = typename hipcub::BlockScan; __shared__ typename BlockScan::TempStorage temp_space; BlockPrefixCallbackOp prefix_op(0); @@ -246,7 +247,7 @@ inline __device__ IdxType DeviceEdgeHashmap::GetEdgeCount( template FrequencyHashmap::FrequencyHashmap( int64_t num_dst, int64_t num_items_each_dst, DGLContext ctx, - cudaStream_t stream, int64_t edge_table_scale) { + hipStream_t stream, int64_t edge_table_scale) { _ctx = ctx; _stream = stream; num_items_each_dst = _table_size(num_items_each_dst, edge_table_scale); @@ -259,7 +260,7 @@ FrequencyHashmap::FrequencyHashmap( constexpr int TILE_SIZE = BLOCK_SIZE * 8; dim3 block(BLOCK_SIZE); dim3 grid((num_dst * num_items_each_dst + TILE_SIZE - 1) / TILE_SIZE); - CUDA_CALL(cudaMemset(dst_unique_edges, 0, (num_dst) * sizeof(IdxType))); + CUDA_CALL(hipMemset(dst_unique_edges, 0, (num_dst) * sizeof(IdxType))); CUDA_KERNEL_CALL( (_init_edge_table), grid, block, 0, _stream, edge_hashmap, (num_dst * num_items_each_dst)); @@ -300,7 +301,7 @@ std::tuple FrequencyHashmap::Topk( // _edge_hashmap bool *is_first_position = static_cast( device->AllocWorkspace(_ctx, sizeof(bool) * (num_edges))); - CUDA_CALL(cudaMemset(is_first_position, 0, sizeof(bool) * (num_edges))); + CUDA_CALL(hipMemset(is_first_position, 0, sizeof(bool) * (num_edges))); // double space to use ExclusiveSum auto edge_blocks_prefix_data = static_cast(device->AllocWorkspace( _ctx, 2 * sizeof(IdxType) * (num_edge_blocks + 1))); @@ -327,11 +328,11 @@ std::tuple FrequencyHashmap::Topk( // 2.1 ExclusiveSum the edge_blocks_prefix void *d_temp_storage = nullptr; size_t temp_storage_bytes = 0; - 
CUDA_CALL(cub::DeviceScan::ExclusiveSum( + CUDA_CALL(hipcub::DeviceScan::ExclusiveSum( d_temp_storage, temp_storage_bytes, edge_blocks_prefix, edge_blocks_prefix_alternate, num_edge_blocks + 1, _stream)); d_temp_storage = device->AllocWorkspace(_ctx, temp_storage_bytes); - CUDA_CALL(cub::DeviceScan::ExclusiveSum( + CUDA_CALL(hipcub::DeviceScan::ExclusiveSum( d_temp_storage, temp_storage_bytes, edge_blocks_prefix, edge_blocks_prefix_alternate, num_edge_blocks + 1, _stream)); device->FreeWorkspace(_ctx, d_temp_storage); @@ -365,19 +366,19 @@ std::tuple FrequencyHashmap::Topk( // 3.1 ExclusiveSum the num_unique_each_node d_temp_storage = nullptr; temp_storage_bytes = 0; - CUDA_CALL(cub::DeviceScan::ExclusiveSum( + CUDA_CALL(hipcub::DeviceScan::ExclusiveSum( d_temp_storage, temp_storage_bytes, num_unique_each_node, num_unique_each_node_alternate, num_dst_nodes + 1, _stream)); d_temp_storage = device->AllocWorkspace(_ctx, temp_storage_bytes); - CUDA_CALL(cub::DeviceScan::ExclusiveSum( + CUDA_CALL(hipcub::DeviceScan::ExclusiveSum( d_temp_storage, temp_storage_bytes, num_unique_each_node, num_unique_each_node_alternate, num_dst_nodes + 1, _stream)); device->FreeWorkspace(_ctx, d_temp_storage); // 3.2 SegmentedRadixSort the unique_src_edges and unique_frequency // Create a set of DoubleBuffers to wrap pairs of device pointers - cub::DoubleBuffer d_unique_frequency( + hipcub::DoubleBuffer d_unique_frequency( unique_frequency, unique_frequency_alternate); - cub::DoubleBuffer d_unique_src_edges( + hipcub::DoubleBuffer d_unique_src_edges( unique_src_edges, unique_src_edges_alternate); // Determine temporary device storage requirements d_temp_storage = nullptr; @@ -385,12 +386,12 @@ std::tuple FrequencyHashmap::Topk( // the DeviceRadixSort is faster than DeviceSegmentedRadixSort, // especially when num_dst_nodes is large (about ~10000) if (dtype.bits == 32) { - CUDA_CALL(cub::DeviceRadixSort::SortPairsDescending( + CUDA_CALL(hipcub::DeviceRadixSort::SortPairsDescending( d_temp_storage, temp_storage_bytes, d_unique_frequency, d_unique_src_edges, num_unique_edges, 0, sizeof(Idx64Type) * 8, _stream)); } else { - CUDA_CALL(cub::DeviceSegmentedRadixSort::SortPairsDescending( + CUDA_CALL(hipcub::DeviceSegmentedRadixSort::SortPairsDescending( d_temp_storage, temp_storage_bytes, d_unique_frequency, d_unique_src_edges, num_unique_edges, num_dst_nodes, num_unique_each_node_alternate, num_unique_each_node_alternate + 1, 0, @@ -398,12 +399,12 @@ std::tuple FrequencyHashmap::Topk( } d_temp_storage = device->AllocWorkspace(_ctx, temp_storage_bytes); if (dtype.bits == 32) { - CUDA_CALL(cub::DeviceRadixSort::SortPairsDescending( + CUDA_CALL(hipcub::DeviceRadixSort::SortPairsDescending( d_temp_storage, temp_storage_bytes, d_unique_frequency, d_unique_src_edges, num_unique_edges, 0, sizeof(Idx64Type) * 8, _stream)); } else { - CUDA_CALL(cub::DeviceSegmentedRadixSort::SortPairsDescending( + CUDA_CALL(hipcub::DeviceSegmentedRadixSort::SortPairsDescending( d_temp_storage, temp_storage_bytes, d_unique_frequency, d_unique_src_edges, num_unique_edges, num_dst_nodes, num_unique_each_node_alternate, num_unique_each_node_alternate + 1, 0, @@ -422,11 +423,11 @@ std::tuple FrequencyHashmap::Topk( // use unique_output_offsets; d_temp_storage = nullptr; temp_storage_bytes = 0; - CUDA_CALL(cub::DeviceScan::ExclusiveSum( + CUDA_CALL(hipcub::DeviceScan::ExclusiveSum( d_temp_storage, temp_storage_bytes, num_unique_each_node, unique_output_offsets, num_dst_nodes + 1, _stream)); d_temp_storage = device->AllocWorkspace(_ctx, 
temp_storage_bytes); - CUDA_CALL(cub::DeviceScan::ExclusiveSum( + CUDA_CALL(hipcub::DeviceScan::ExclusiveSum( d_temp_storage, temp_storage_bytes, num_unique_each_node, unique_output_offsets, num_dst_nodes + 1, _stream)); device->FreeWorkspace(_ctx, d_temp_storage); diff --git a/src/graph/sampling/randomwalks/frequency_hashmap.cu.prehip b/src/graph/sampling/randomwalks/frequency_hashmap.cu.prehip new file mode 100644 index 000000000000..feb88b4c86e8 --- /dev/null +++ b/src/graph/sampling/randomwalks/frequency_hashmap.cu.prehip @@ -0,0 +1,471 @@ +/** + * Copyright (c) 2021 by Contributors + * @file graph/sampling/frequency_hashmap.cu + * @brief frequency hashmap - used to select top-k frequency edges of each node + */ + +#include +#include // NOLINT +#include +#include + +#include "../../../array/cuda/atomic.cuh" +#include "../../../runtime/cuda/cuda_common.h" +#include "frequency_hashmap.cuh" + +namespace dgl { + +namespace sampling { + +namespace impl { + +namespace { + +int64_t _table_size(const int64_t num, const int64_t scale) { + /** + * Calculate the number of buckets in the hashtable. To guarantee we can + * fill the hashtable in the worst case, we must use a number of buckets which + * is a power of two. + * https://en.wikipedia.org/wiki/Quadratic_probing#Limitations + */ + const int64_t next_pow2 = 1 << static_cast(1 + std::log2(num >> 1)); + return next_pow2 << scale; +} + +template +__global__ void _init_edge_table(void *edge_hashmap, int64_t edges_len) { + using EdgeItem = typename DeviceEdgeHashmap::EdgeItem; + auto edge_hashmap_t = static_cast(edge_hashmap); + int64_t start_idx = (blockIdx.x * TILE_SIZE) + threadIdx.x; + int64_t last_idx = start_idx + TILE_SIZE; +#pragma unroll(4) + for (int64_t idx = start_idx; idx < last_idx; idx += BLOCK_SIZE) { + if (idx < edges_len) { + EdgeItem *edge = (edge_hashmap_t + idx); + edge->src = static_cast(-1); + edge->cnt = static_cast(0); + } + } +} + +template +__global__ void _count_frequency( + const IdxType *src_data, const int64_t num_edges, + const int64_t num_edges_per_node, IdxType *edge_blocks_prefix, + bool *is_first_position, DeviceEdgeHashmap device_edge_hashmap) { + int64_t start_idx = (blockIdx.x * TILE_SIZE) + threadIdx.x; + int64_t last_idx = start_idx + TILE_SIZE; + + IdxType count = 0; + for (int64_t idx = start_idx; idx < last_idx; idx += BLOCK_SIZE) { + if (idx < num_edges) { + IdxType src = src_data[idx]; + if (src == static_cast(-1)) { + continue; + } + IdxType dst_idx = (idx / num_edges_per_node); + if (device_edge_hashmap.InsertEdge(src, dst_idx) == 0) { + is_first_position[idx] = true; + ++count; + } + } + } + + using BlockReduce = typename cub::BlockReduce; + __shared__ typename BlockReduce::TempStorage temp_space; + + count = BlockReduce(temp_space).Sum(count); + if (threadIdx.x == 0) { + edge_blocks_prefix[blockIdx.x] = count; + if (blockIdx.x == 0) { + edge_blocks_prefix[gridDim.x] = 0; + } + } +} + +/** + * This structure is used with cub's block-level prefixscan in order to + * keep a running sum as items are iteratively processed. 
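(Aside: a hedged, minimal CUDA kernel showing how this kind of prefix callback is used. Each tile's aggregate is fed back through the functor so consecutive tiles scanned by one block form a single running exclusive prefix sum; the cub spelling is shown, and the hipified copy earlier in this patch uses hipcub with the same interface. The kernel and its names are illustrative, not DGL code.)

#include <cub/cub.cuh>

// Running-total functor: thread 0 of the block invokes it once per tile with
// the tile aggregate and receives the offset accumulated so far.
struct RunningTotal {
  int total;
  __device__ explicit RunningTotal(int t) : total(t) {}
  __device__ int operator()(int block_aggregate) {
    const int old = total;
    total += block_aggregate;
    return old;
  }
};

template <int BLOCK_SIZE, int TILE_SIZE>
__global__ void compact_positions(const bool *flags, int n, int *positions) {
  using BlockScan = cub::BlockScan<int, BLOCK_SIZE>;
  __shared__ typename BlockScan::TempStorage temp;
  RunningTotal prefix_op(0);

  const int start = blockIdx.x * TILE_SIZE + threadIdx.x;
  for (int idx = start; idx < start + TILE_SIZE; idx += BLOCK_SIZE) {
    int flag = (idx < n && flags[idx]) ? 1 : 0;
    // Exclusive sum within the tile, offset by the running total of all
    // earlier tiles handled by this block; temp storage is reused, hence
    // the barrier below.
    BlockScan(temp).ExclusiveSum(flag, flag, prefix_op);
    __syncthreads();
    // flag now holds this element's rank among flagged items within the
    // block's tile range (a per-block offset would make it global).
    if (idx < n && flags[idx]) positions[idx] = flag;
  }
}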
+ */ +template +struct BlockPrefixCallbackOp { + T _running_total; + + __device__ BlockPrefixCallbackOp(const T running_total) + : _running_total(running_total) {} + + __device__ T operator()(const T block_aggregate) { + const T old_prefix = _running_total; + _running_total += block_aggregate; + return old_prefix; + } +}; + +template +__global__ void _compact_frequency( + const IdxType *src_data, const IdxType *dst_data, const int64_t num_edges, + const int64_t num_edges_per_node, const IdxType *edge_blocks_prefix, + const bool *is_first_position, IdxType *num_unique_each_node, + IdxType *unique_src_edges, Idx64Type *unique_frequency, + DeviceEdgeHashmap device_edge_hashmap) { + int64_t start_idx = (blockIdx.x * TILE_SIZE) + threadIdx.x; + int64_t last_idx = start_idx + TILE_SIZE; + const IdxType block_offset = edge_blocks_prefix[blockIdx.x]; + + using BlockScan = typename cub::BlockScan; + __shared__ typename BlockScan::TempStorage temp_space; + BlockPrefixCallbackOp prefix_op(0); + + for (int64_t idx = start_idx; idx < last_idx; idx += BLOCK_SIZE) { + IdxType flag = 0; + if (idx < num_edges) { + IdxType src = src_data[idx]; + IdxType dst_idx = (idx / num_edges_per_node); + if (idx % num_edges_per_node == 0) { + num_unique_each_node[dst_idx] = + device_edge_hashmap.GetDstCount(dst_idx); + } + if (is_first_position[idx] == true) { + flag = 1; + } + BlockScan(temp_space).ExclusiveSum(flag, flag, prefix_op); + __syncthreads(); + if (is_first_position[idx] == true) { + const IdxType pos = (block_offset + flag); + unique_src_edges[pos] = src; + if (sizeof(IdxType) != sizeof(Idx64Type) && + sizeof(IdxType) == 4) { // if IdxType is a 32-bit data + unique_frequency[pos] = + ((static_cast(num_edges / num_edges_per_node - dst_idx) + << 32) | + device_edge_hashmap.GetEdgeCount(src, dst_idx)); + } else { + unique_frequency[pos] = + device_edge_hashmap.GetEdgeCount(src, dst_idx); + } + } + } + } +} + +template +__global__ void _get_pick_num( + IdxType *num_unique_each_node, const int64_t num_pick, + const int64_t num_dst_nodes) { + int64_t start_idx = (blockIdx.x * TILE_SIZE) + threadIdx.x; + int64_t last_idx = start_idx + TILE_SIZE; +#pragma unroll(4) + for (int64_t idx = start_idx; idx < last_idx; idx += BLOCK_SIZE) { + if (idx < num_dst_nodes) { + IdxType &num_unique = num_unique_each_node[idx]; + num_unique = min(num_unique, static_cast(num_pick)); + } + } +} + +template +__global__ void _pick_data( + const Idx64Type *unique_frequency, const IdxType *unique_src_edges, + const IdxType *unique_input_offsets, const IdxType *dst_data, + const int64_t num_edges_per_node, const int64_t num_dst_nodes, + const int64_t num_edges, const IdxType *unique_output_offsets, + IdxType *output_src, IdxType *output_dst, IdxType *output_frequency) { + int64_t start_idx = (blockIdx.x * TILE_SIZE) + threadIdx.x; + int64_t last_idx = start_idx + TILE_SIZE; + + for (int64_t idx = start_idx; idx < last_idx; idx += BLOCK_SIZE) { + if (idx < num_dst_nodes) { + const int64_t dst_pos = (idx * num_edges_per_node); + assert(dst_pos < num_edges); + const IdxType dst = dst_data[dst_pos]; + const IdxType last_output_offset = unique_output_offsets[idx + 1]; + assert( + (last_output_offset - unique_output_offsets[idx]) <= + (unique_input_offsets[idx + 1] - unique_input_offsets[idx])); + for (IdxType output_idx = unique_output_offsets[idx], + input_idx = unique_input_offsets[idx]; + output_idx < last_output_offset; ++output_idx, ++input_idx) { + output_src[output_idx] = unique_src_edges[input_idx]; + output_dst[output_idx] = dst; + 
output_frequency[output_idx] = + static_cast(unique_frequency[input_idx]); + } + } + } +} + +} // namespace + +// return the old cnt of this edge +template +inline __device__ IdxType DeviceEdgeHashmap::InsertEdge( + const IdxType &src, const IdxType &dst_idx) { + IdxType start_off = dst_idx * _num_items_each_dst; + IdxType pos = EdgeHash(src); + IdxType delta = 1; + IdxType old_cnt = static_cast(-1); + while (true) { + IdxType old_src = dgl::aten::cuda::AtomicCAS( + &_edge_hashmap[start_off + pos].src, static_cast(-1), src); + if (old_src == static_cast(-1) || old_src == src) { + // first insert + old_cnt = dgl::aten::cuda::AtomicAdd( + &_edge_hashmap[start_off + pos].cnt, static_cast(1)); + if (old_src == static_cast(-1)) { + assert(dst_idx < _num_dst); + dgl::aten::cuda::AtomicAdd( + &_dst_unique_edges[dst_idx], static_cast(1)); + } + break; + } + pos = EdgeHash(pos + delta); + delta += 1; + } + return old_cnt; +} + +template +inline __device__ IdxType +DeviceEdgeHashmap::GetDstCount(const IdxType &dst_idx) { + return _dst_unique_edges[dst_idx]; +} + +template +inline __device__ IdxType DeviceEdgeHashmap::GetEdgeCount( + const IdxType &src, const IdxType &dst_idx) { + IdxType start_off = dst_idx * _num_items_each_dst; + IdxType pos = EdgeHash(src); + IdxType delta = 1; + while (_edge_hashmap[start_off + pos].src != src) { + pos = EdgeHash(pos + delta); + delta += 1; + } + return _edge_hashmap[start_off + pos].cnt; +} + +template +FrequencyHashmap::FrequencyHashmap( + int64_t num_dst, int64_t num_items_each_dst, DGLContext ctx, + cudaStream_t stream, int64_t edge_table_scale) { + _ctx = ctx; + _stream = stream; + num_items_each_dst = _table_size(num_items_each_dst, edge_table_scale); + auto device = dgl::runtime::DeviceAPI::Get(_ctx); + auto dst_unique_edges = static_cast( + device->AllocWorkspace(_ctx, (num_dst) * sizeof(IdxType))); + auto edge_hashmap = static_cast(device->AllocWorkspace( + _ctx, (num_dst * num_items_each_dst) * sizeof(EdgeItem))); + constexpr int BLOCK_SIZE = 256; + constexpr int TILE_SIZE = BLOCK_SIZE * 8; + dim3 block(BLOCK_SIZE); + dim3 grid((num_dst * num_items_each_dst + TILE_SIZE - 1) / TILE_SIZE); + CUDA_CALL(cudaMemset(dst_unique_edges, 0, (num_dst) * sizeof(IdxType))); + CUDA_KERNEL_CALL( + (_init_edge_table), grid, block, 0, + _stream, edge_hashmap, (num_dst * num_items_each_dst)); + _device_edge_hashmap = new DeviceEdgeHashmap( + num_dst, num_items_each_dst, dst_unique_edges, edge_hashmap); + _dst_unique_edges = dst_unique_edges; + _edge_hashmap = edge_hashmap; +} + +template +FrequencyHashmap::~FrequencyHashmap() { + auto device = dgl::runtime::DeviceAPI::Get(_ctx); + delete _device_edge_hashmap; + _device_edge_hashmap = nullptr; + device->FreeWorkspace(_ctx, _dst_unique_edges); + _dst_unique_edges = nullptr; + device->FreeWorkspace(_ctx, _edge_hashmap); + _edge_hashmap = nullptr; +} + +template +std::tuple FrequencyHashmap::Topk( + const IdxType *src_data, const IdxType *dst_data, DGLDataType dtype, + const int64_t num_edges, const int64_t num_edges_per_node, + const int64_t num_pick) { + using Idx64Type = int64_t; + const int64_t num_dst_nodes = (num_edges / num_edges_per_node); + constexpr int BLOCK_SIZE = 256; + // XXX: a experienced value, best performance in GV100 + constexpr int TILE_SIZE = BLOCK_SIZE * 32; + const dim3 block(BLOCK_SIZE); + const dim3 edges_grid((num_edges + TILE_SIZE - 1) / TILE_SIZE); + auto device = dgl::runtime::DeviceAPI::Get(_ctx); + const IdxType num_edge_blocks = static_cast(edges_grid.x); + IdxType num_unique_edges = 0; + + 
// to mark if this position of edges is the first inserting position for + // _edge_hashmap + bool *is_first_position = static_cast( + device->AllocWorkspace(_ctx, sizeof(bool) * (num_edges))); + CUDA_CALL(cudaMemset(is_first_position, 0, sizeof(bool) * (num_edges))); + // double space to use ExclusiveSum + auto edge_blocks_prefix_data = static_cast(device->AllocWorkspace( + _ctx, 2 * sizeof(IdxType) * (num_edge_blocks + 1))); + IdxType *edge_blocks_prefix = edge_blocks_prefix_data; + IdxType *edge_blocks_prefix_alternate = + (edge_blocks_prefix_data + (num_edge_blocks + 1)); + // triple space to use ExclusiveSum and unique_output_offsets + auto num_unique_each_node_data = static_cast( + device->AllocWorkspace(_ctx, 3 * sizeof(IdxType) * (num_dst_nodes + 1))); + IdxType *num_unique_each_node = num_unique_each_node_data; + IdxType *num_unique_each_node_alternate = + (num_unique_each_node_data + (num_dst_nodes + 1)); + IdxType *unique_output_offsets = + (num_unique_each_node_data + 2 * (num_dst_nodes + 1)); + + // 1. Scan the all edges and count the unique edges and unique edges for each + // dst node + CUDA_KERNEL_CALL( + (_count_frequency), edges_grid, block, 0, + _stream, src_data, num_edges, num_edges_per_node, edge_blocks_prefix, + is_first_position, *_device_edge_hashmap); + + // 2. Compact the unique edges frequency + // 2.1 ExclusiveSum the edge_blocks_prefix + void *d_temp_storage = nullptr; + size_t temp_storage_bytes = 0; + CUDA_CALL(cub::DeviceScan::ExclusiveSum( + d_temp_storage, temp_storage_bytes, edge_blocks_prefix, + edge_blocks_prefix_alternate, num_edge_blocks + 1, _stream)); + d_temp_storage = device->AllocWorkspace(_ctx, temp_storage_bytes); + CUDA_CALL(cub::DeviceScan::ExclusiveSum( + d_temp_storage, temp_storage_bytes, edge_blocks_prefix, + edge_blocks_prefix_alternate, num_edge_blocks + 1, _stream)); + device->FreeWorkspace(_ctx, d_temp_storage); + std::swap(edge_blocks_prefix, edge_blocks_prefix_alternate); + device->CopyDataFromTo( + &edge_blocks_prefix[num_edge_blocks], 0, &num_unique_edges, 0, + sizeof(num_unique_edges), _ctx, DGLContext{kDGLCPU, 0}, dtype); + device->StreamSync(_ctx, _stream); + // 2.2 Allocate the data of unique edges and frequency + // double space to use SegmentedRadixSort + auto unique_src_edges_data = static_cast( + device->AllocWorkspace(_ctx, 2 * sizeof(IdxType) * (num_unique_edges))); + IdxType *unique_src_edges = unique_src_edges_data; + IdxType *unique_src_edges_alternate = + unique_src_edges_data + num_unique_edges; + // double space to use SegmentedRadixSort + auto unique_frequency_data = static_cast( + device->AllocWorkspace(_ctx, 2 * sizeof(Idx64Type) * (num_unique_edges))); + Idx64Type *unique_frequency = unique_frequency_data; + Idx64Type *unique_frequency_alternate = + unique_frequency_data + num_unique_edges; + // 2.3 Compact the unique edges and their frequency + CUDA_KERNEL_CALL( + (_compact_frequency), + edges_grid, block, 0, _stream, src_data, dst_data, num_edges, + num_edges_per_node, edge_blocks_prefix, is_first_position, + num_unique_each_node, unique_src_edges, unique_frequency, + *_device_edge_hashmap); + + // 3. 
SegmentedRadixSort the unique edges and unique_frequency + // 3.1 ExclusiveSum the num_unique_each_node + d_temp_storage = nullptr; + temp_storage_bytes = 0; + CUDA_CALL(cub::DeviceScan::ExclusiveSum( + d_temp_storage, temp_storage_bytes, num_unique_each_node, + num_unique_each_node_alternate, num_dst_nodes + 1, _stream)); + d_temp_storage = device->AllocWorkspace(_ctx, temp_storage_bytes); + CUDA_CALL(cub::DeviceScan::ExclusiveSum( + d_temp_storage, temp_storage_bytes, num_unique_each_node, + num_unique_each_node_alternate, num_dst_nodes + 1, _stream)); + device->FreeWorkspace(_ctx, d_temp_storage); + // 3.2 SegmentedRadixSort the unique_src_edges and unique_frequency + // Create a set of DoubleBuffers to wrap pairs of device pointers + cub::DoubleBuffer d_unique_frequency( + unique_frequency, unique_frequency_alternate); + cub::DoubleBuffer d_unique_src_edges( + unique_src_edges, unique_src_edges_alternate); + // Determine temporary device storage requirements + d_temp_storage = nullptr; + temp_storage_bytes = 0; + // the DeviceRadixSort is faster than DeviceSegmentedRadixSort, + // especially when num_dst_nodes is large (about ~10000) + if (dtype.bits == 32) { + CUDA_CALL(cub::DeviceRadixSort::SortPairsDescending( + d_temp_storage, temp_storage_bytes, d_unique_frequency, + d_unique_src_edges, num_unique_edges, 0, sizeof(Idx64Type) * 8, + _stream)); + } else { + CUDA_CALL(cub::DeviceSegmentedRadixSort::SortPairsDescending( + d_temp_storage, temp_storage_bytes, d_unique_frequency, + d_unique_src_edges, num_unique_edges, num_dst_nodes, + num_unique_each_node_alternate, num_unique_each_node_alternate + 1, 0, + sizeof(Idx64Type) * 8, _stream)); + } + d_temp_storage = device->AllocWorkspace(_ctx, temp_storage_bytes); + if (dtype.bits == 32) { + CUDA_CALL(cub::DeviceRadixSort::SortPairsDescending( + d_temp_storage, temp_storage_bytes, d_unique_frequency, + d_unique_src_edges, num_unique_edges, 0, sizeof(Idx64Type) * 8, + _stream)); + } else { + CUDA_CALL(cub::DeviceSegmentedRadixSort::SortPairsDescending( + d_temp_storage, temp_storage_bytes, d_unique_frequency, + d_unique_src_edges, num_unique_edges, num_dst_nodes, + num_unique_each_node_alternate, num_unique_each_node_alternate + 1, 0, + sizeof(Idx64Type) * 8, _stream)); + } + device->FreeWorkspace(_ctx, d_temp_storage); + + // 4. Get the final pick number for each dst node + // 4.1 Reset the min(num_pick, num_unique_each_node) to num_unique_each_node + constexpr int NODE_TILE_SIZE = BLOCK_SIZE * 2; + const dim3 nodes_grid((num_dst_nodes + NODE_TILE_SIZE - 1) / NODE_TILE_SIZE); + CUDA_KERNEL_CALL( + (_get_pick_num), nodes_grid, block, + 0, _stream, num_unique_each_node, num_pick, num_dst_nodes); + // 4.2 ExclusiveSum the new num_unique_each_node as unique_output_offsets + // use unique_output_offsets; + d_temp_storage = nullptr; + temp_storage_bytes = 0; + CUDA_CALL(cub::DeviceScan::ExclusiveSum( + d_temp_storage, temp_storage_bytes, num_unique_each_node, + unique_output_offsets, num_dst_nodes + 1, _stream)); + d_temp_storage = device->AllocWorkspace(_ctx, temp_storage_bytes); + CUDA_CALL(cub::DeviceScan::ExclusiveSum( + d_temp_storage, temp_storage_bytes, num_unique_each_node, + unique_output_offsets, num_dst_nodes + 1, _stream)); + device->FreeWorkspace(_ctx, d_temp_storage); + + // 5. 
Pick the data to result + IdxType num_output = 0; + device->CopyDataFromTo( + &unique_output_offsets[num_dst_nodes], 0, &num_output, 0, + sizeof(num_output), _ctx, DGLContext{kDGLCPU, 0}, dtype); + device->StreamSync(_ctx, _stream); + + IdArray res_src = + IdArray::Empty({static_cast(num_output)}, dtype, _ctx); + IdArray res_dst = + IdArray::Empty({static_cast(num_output)}, dtype, _ctx); + IdArray res_cnt = + IdArray::Empty({static_cast(num_output)}, dtype, _ctx); + CUDA_KERNEL_CALL( + (_pick_data), nodes_grid, + block, 0, _stream, d_unique_frequency.Current(), + d_unique_src_edges.Current(), num_unique_each_node_alternate, dst_data, + num_edges_per_node, num_dst_nodes, num_edges, unique_output_offsets, + res_src.Ptr(), res_dst.Ptr(), res_cnt.Ptr()); + + device->FreeWorkspace(_ctx, is_first_position); + device->FreeWorkspace(_ctx, edge_blocks_prefix_data); + device->FreeWorkspace(_ctx, num_unique_each_node_data); + device->FreeWorkspace(_ctx, unique_src_edges_data); + device->FreeWorkspace(_ctx, unique_frequency_data); + + return std::make_tuple(res_src, res_dst, res_cnt); +} + +template class FrequencyHashmap; + +template class FrequencyHashmap; + +}; // namespace impl + +}; // namespace sampling + +}; // namespace dgl diff --git a/src/graph/sampling/randomwalks/frequency_hashmap.cuh b/src/graph/sampling/randomwalks/frequency_hashmap.cuh index 3c1a1c0649b6..622bcc59671e 100644 --- a/src/graph/sampling/randomwalks/frequency_hashmap.cuh +++ b/src/graph/sampling/randomwalks/frequency_hashmap.cuh @@ -56,7 +56,7 @@ class FrequencyHashmap { FrequencyHashmap() = delete; FrequencyHashmap( int64_t num_dst, int64_t num_items_each_dst, DGLContext ctx, - cudaStream_t stream, int64_t edge_table_scale = kDefaultEdgeTableScale); + hipStream_t stream, int64_t edge_table_scale = kDefaultEdgeTableScale); ~FrequencyHashmap(); using EdgeItem = typename DeviceEdgeHashmap::EdgeItem; std::tuple Topk( @@ -66,7 +66,7 @@ class FrequencyHashmap { private: DGLContext _ctx; - cudaStream_t _stream; + hipStream_t _stream; DeviceEdgeHashmap *_device_edge_hashmap; IdxType *_dst_unique_edges; EdgeItem *_edge_hashmap; diff --git a/src/graph/sampling/randomwalks/frequency_hashmap.cuh.prehip b/src/graph/sampling/randomwalks/frequency_hashmap.cuh.prehip new file mode 100644 index 000000000000..3c1a1c0649b6 --- /dev/null +++ b/src/graph/sampling/randomwalks/frequency_hashmap.cuh.prehip @@ -0,0 +1,79 @@ +/** + * Copyright (c) 2021 by Contributors + * @file graph/sampling/frequency_hashmap.cuh + * @brief frequency hashmap - used to select top-k frequency edges of each node + */ + +#ifndef DGL_GRAPH_SAMPLING_RANDOMWALKS_FREQUENCY_HASHMAP_CUH_ +#define DGL_GRAPH_SAMPLING_RANDOMWALKS_FREQUENCY_HASHMAP_CUH_ + +#include +#include + +#include + +namespace dgl { +namespace sampling { +namespace impl { + +template +class DeviceEdgeHashmap { + public: + struct EdgeItem { + IdxType src; + IdxType cnt; + }; + DeviceEdgeHashmap() = delete; + DeviceEdgeHashmap( + int64_t num_dst, int64_t num_items_each_dst, IdxType *dst_unique_edges, + EdgeItem *edge_hashmap) + : _num_dst(num_dst), + _num_items_each_dst(num_items_each_dst), + _dst_unique_edges(dst_unique_edges), + _edge_hashmap(edge_hashmap) {} + // return the old cnt of this edge + inline __device__ IdxType + InsertEdge(const IdxType &src, const IdxType &dst_idx); + inline __device__ IdxType GetDstCount(const IdxType &dst_idx); + inline __device__ IdxType + GetEdgeCount(const IdxType &src, const IdxType &dst_idx); + + private: + int64_t _num_dst; + int64_t _num_items_each_dst; + IdxType 
*_dst_unique_edges; + EdgeItem *_edge_hashmap; + + inline __device__ IdxType EdgeHash(const IdxType &id) const { + return id % _num_items_each_dst; + } +}; + +template +class FrequencyHashmap { + public: + static constexpr int64_t kDefaultEdgeTableScale = 3; + FrequencyHashmap() = delete; + FrequencyHashmap( + int64_t num_dst, int64_t num_items_each_dst, DGLContext ctx, + cudaStream_t stream, int64_t edge_table_scale = kDefaultEdgeTableScale); + ~FrequencyHashmap(); + using EdgeItem = typename DeviceEdgeHashmap::EdgeItem; + std::tuple Topk( + const IdxType *src_data, const IdxType *dst_data, DGLDataType dtype, + const int64_t num_edges, const int64_t num_edges_per_node, + const int64_t num_pick); + + private: + DGLContext _ctx; + cudaStream_t _stream; + DeviceEdgeHashmap *_device_edge_hashmap; + IdxType *_dst_unique_edges; + EdgeItem *_edge_hashmap; +}; + +}; // namespace impl +}; // namespace sampling +}; // namespace dgl + +#endif // DGL_GRAPH_SAMPLING_RANDOMWALKS_FREQUENCY_HASHMAP_CUH_ diff --git a/src/graph/sampling/randomwalks/get_node_types_gpu.cu b/src/graph/sampling/randomwalks/get_node_types_gpu.cu index 79e8d2596e9a..72d2addb6e26 100644 --- a/src/graph/sampling/randomwalks/get_node_types_gpu.cu +++ b/src/graph/sampling/randomwalks/get_node_types_gpu.cu @@ -4,7 +4,7 @@ * @brief DGL sampler */ -#include +#include #include #include #include diff --git a/src/graph/sampling/randomwalks/get_node_types_gpu.cu.prehip b/src/graph/sampling/randomwalks/get_node_types_gpu.cu.prehip new file mode 100644 index 000000000000..79e8d2596e9a --- /dev/null +++ b/src/graph/sampling/randomwalks/get_node_types_gpu.cu.prehip @@ -0,0 +1,72 @@ +/** + * Copyright (c) 2021 by Contributors + * @file graph/sampling/get_node_types_gpu.cu + * @brief DGL sampler + */ + +#include +#include +#include +#include + +#include + +#include "randomwalks_impl.h" + +namespace dgl { + +using namespace dgl::runtime; +using namespace dgl::aten; + +namespace sampling { + +namespace impl { + +template +TypeArray GetNodeTypesFromMetapath( + const HeteroGraphPtr hg, const TypeArray metapath) { + uint64_t num_etypes = metapath->shape[0]; + + auto cpu_ctx = DGLContext{kDGLCPU, 0}; + auto metapath_ctx = metapath->ctx; + auto stream = DeviceAPI::Get(metapath_ctx)->GetStream(); + + TypeArray h_result = + TypeArray::Empty({metapath->shape[0] + 1}, metapath->dtype, cpu_ctx); + auto h_result_data = h_result.Ptr(); + + auto h_metapath = metapath.CopyTo(cpu_ctx); + DeviceAPI::Get(metapath_ctx)->StreamSync(metapath_ctx, stream); + const IdxType *h_metapath_data = h_metapath.Ptr(); + + dgl_type_t curr_type = hg->GetEndpointTypes(h_metapath_data[0]).first; + h_result_data[0] = curr_type; + + for (uint64_t i = 0; i < num_etypes; ++i) { + auto src_dst_type = hg->GetEndpointTypes(h_metapath_data[i]); + dgl_type_t srctype = src_dst_type.first; + dgl_type_t dsttype = src_dst_type.second; + + if (srctype != curr_type) { + LOG(FATAL) << "source of edge type #" << i + << " does not match destination of edge type #" << i - 1; + } + curr_type = dsttype; + h_result_data[i + 1] = dsttype; + } + + auto result = h_result.CopyTo(metapath->ctx); + DeviceAPI::Get(metapath_ctx)->StreamSync(metapath_ctx, stream); + return result; +} + +template TypeArray GetNodeTypesFromMetapath( + const HeteroGraphPtr hg, const TypeArray metapath); +template TypeArray GetNodeTypesFromMetapath( + const HeteroGraphPtr hg, const TypeArray metapath); + +}; // namespace impl + +}; // namespace sampling + +}; // namespace dgl diff --git 
a/src/graph/sampling/randomwalks/randomwalk_gpu.cu b/src/graph/sampling/randomwalks/randomwalk_gpu.cu index bb7a48906acf..7e822b961476 100644 --- a/src/graph/sampling/randomwalks/randomwalk_gpu.cu +++ b/src/graph/sampling/randomwalks/randomwalk_gpu.cu @@ -1,16 +1,17 @@ +#include "hip/hip_runtime.h" /** * Copyright (c) 2021-2022 by Contributors * @file graph/sampling/randomwalk_gpu.cu * @brief CUDA random walk sampleing */ -#include +#include #include #include #include #include -#include +#include #include #include #include @@ -48,10 +49,10 @@ __global__ void _RandomWalkKernel( int64_t last_idx = min(static_cast(blockIdx.x + 1) * TILE_SIZE, num_seeds); int64_t trace_length = (max_num_steps + 1); - curandState rng; + hiprandState rng; // reference: - // https://docs.nvidia.com/cuda/curand/device-api-overview.html#performance-notes - curand_init(rand_seed + idx, 0, 0, &rng); + // https://docs.nvidia.com/cuda/hiprand/device-api-overview.html#performance-notes + hiprand_init(rand_seed + idx, 0, 0, &rng); while (idx < last_idx) { IdType curr = seed_data[idx]; @@ -68,18 +69,18 @@ __global__ void _RandomWalkKernel( if (deg == 0) { // the degree is zero break; } - const int64_t num = curand(&rng) % deg; + const int64_t num = hiprand(&rng) % deg; IdType pick = graph.in_cols[in_row_start + num]; IdType eid = (graph.data ? graph.data[in_row_start + num] : in_row_start + num); *traces_data_ptr = pick; *eids_data_ptr = eid; if ((restart_prob_size > 1) && - (curand_uniform(&rng) < restart_prob_data[step_idx])) { + (hiprand_uniform(&rng) < restart_prob_data[step_idx])) { break; } else if ( (restart_prob_size == 1) && - (curand_uniform(&rng) < restart_prob_data[0])) { + (hiprand_uniform(&rng) < restart_prob_data[0])) { break; } ++traces_data_ptr; @@ -107,10 +108,10 @@ __global__ void _RandomWalkBiasedKernel( int64_t last_idx = min(static_cast(blockIdx.x + 1) * TILE_SIZE, num_seeds); int64_t trace_length = (max_num_steps + 1); - curandState rng; + hiprandState rng; // reference: - // https://docs.nvidia.com/cuda/curand/device-api-overview.html#performance-notes - curand_init(rand_seed + idx, 0, 0, &rng); + // https://docs.nvidia.com/cuda/hiprand/device-api-overview.html#performance-notes + hiprand_init(rand_seed + idx, 0, 0, &rng); while (idx < last_idx) { IdType curr = seed_data[idx]; @@ -133,9 +134,9 @@ __global__ void _RandomWalkBiasedKernel( const FloatType *prob = probs[metapath_id]; int64_t num; if (prob == nullptr) { - num = curand(&rng) % deg; + num = hiprand(&rng) % deg; } else { - auto rnd_sum_w = prob_sum[curr] * curand_uniform(&rng); + auto rnd_sum_w = prob_sum[curr] * hiprand_uniform(&rng); FloatType sum_w{0.}; for (num = 0; num < deg; ++num) { sum_w += prob[in_row_start + num]; @@ -149,11 +150,11 @@ __global__ void _RandomWalkBiasedKernel( *traces_data_ptr = pick; *eids_data_ptr = eid; if ((restart_prob_size > 1) && - (curand_uniform(&rng) < restart_prob_data[step_idx])) { + (hiprand_uniform(&rng) < restart_prob_data[step_idx])) { break; } else if ( (restart_prob_size == 1) && - (curand_uniform(&rng) < restart_prob_data[0])) { + (hiprand_uniform(&rng) < restart_prob_data[0])) { break; } ++traces_data_ptr; @@ -202,7 +203,7 @@ std::pair RandomWalkUniform( : nullptr); } // use cuda stream from local thread - cudaStream_t stream = runtime::getCurrentCUDAStream(); + hipStream_t stream = runtime::getCurrentCUDAStream(); auto device = DeviceAPI::Get(ctx); auto d_graphs = static_cast *>(device->AllocWorkspace( ctx, (num_etypes) * sizeof(GraphKernelData))); @@ -263,7 +264,7 @@ std::pair 
RandomWalkBiased( IdType *traces_data = traces.Ptr(); IdType *eids_data = eids.Ptr(); - cudaStream_t stream = runtime::getCurrentCUDAStream(); + hipStream_t stream = runtime::getCurrentCUDAStream(); auto device = DeviceAPI::Get(ctx); // new probs and prob sums pointers assert(num_etypes == static_cast(prob.size())); @@ -297,11 +298,11 @@ std::pair RandomWalkBiased( // calculate the sum of the neighbor weights const IdType *d_offsets = static_cast(csr.indptr->data); size_t temp_storage_size = 0; - CUDA_CALL(cub::DeviceSegmentedReduce::Sum( + CUDA_CALL(hipcub::DeviceSegmentedReduce::Sum( nullptr, temp_storage_size, probs[etype], prob_sums[etype], num_segments, d_offsets, d_offsets + 1, stream)); void *temp_storage = device->AllocWorkspace(ctx, temp_storage_size); - CUDA_CALL(cub::DeviceSegmentedReduce::Sum( + CUDA_CALL(hipcub::DeviceSegmentedReduce::Sum( temp_storage, temp_storage_size, probs[etype], prob_sums[etype], num_segments, d_offsets, d_offsets + 1, stream)); device->FreeWorkspace(ctx, temp_storage); @@ -396,7 +397,7 @@ std::pair RandomWalkWithRestart( auto device = dgl::runtime::DeviceAPI::Get(device_ctx); // use cuda stream from local thread - cudaStream_t stream = runtime::getCurrentCUDAStream(); + hipStream_t stream = runtime::getCurrentCUDAStream(); device->CopyDataFromTo( &restart_prob, 0, restart_prob_array.Ptr(), 0, sizeof(double), DGLContext{kDGLCPU, 0}, device_ctx, restart_prob_array->dtype); @@ -449,7 +450,7 @@ std::tuple SelectPinSageNeighbors( const int64_t num_dst_nodes = (dst->shape[0] / num_samples_per_node); auto ctx = src->ctx; // use cuda stream from local thread - cudaStream_t stream = runtime::getCurrentCUDAStream(); + hipStream_t stream = runtime::getCurrentCUDAStream(); auto frequency_hashmap = FrequencyHashmap( num_dst_nodes, num_samples_per_node, ctx, stream); auto ret = frequency_hashmap.Topk( diff --git a/src/graph/sampling/randomwalks/randomwalk_gpu.cu.prehip b/src/graph/sampling/randomwalks/randomwalk_gpu.cu.prehip new file mode 100644 index 000000000000..bb7a48906acf --- /dev/null +++ b/src/graph/sampling/randomwalks/randomwalk_gpu.cu.prehip @@ -0,0 +1,496 @@ +/** + * Copyright (c) 2021-2022 by Contributors + * @file graph/sampling/randomwalk_gpu.cu + * @brief CUDA random walk sampleing + */ + +#include +#include +#include +#include +#include + +#include +#include +#include +#include + +#include "../../../runtime/cuda/cuda_common.h" +#include "frequency_hashmap.cuh" + +namespace dgl { + +using namespace dgl::runtime; +using namespace dgl::aten; + +namespace sampling { + +namespace impl { + +namespace { + +template +struct GraphKernelData { + const IdType *in_ptr; + const IdType *in_cols; + const IdType *data; +}; + +template +__global__ void _RandomWalkKernel( + const uint64_t rand_seed, const IdType *seed_data, const int64_t num_seeds, + const IdType *metapath_data, const uint64_t max_num_steps, + const GraphKernelData *graphs, const FloatType *restart_prob_data, + const int64_t restart_prob_size, const int64_t max_nodes, + IdType *out_traces_data, IdType *out_eids_data) { + assert(BLOCK_SIZE == blockDim.x); + int64_t idx = blockIdx.x * TILE_SIZE + threadIdx.x; + int64_t last_idx = + min(static_cast(blockIdx.x + 1) * TILE_SIZE, num_seeds); + int64_t trace_length = (max_num_steps + 1); + curandState rng; + // reference: + // https://docs.nvidia.com/cuda/curand/device-api-overview.html#performance-notes + curand_init(rand_seed + idx, 0, 0, &rng); + + while (idx < last_idx) { + IdType curr = seed_data[idx]; + assert(curr < max_nodes); + IdType 
*traces_data_ptr = &out_traces_data[idx * trace_length]; + IdType *eids_data_ptr = &out_eids_data[idx * max_num_steps]; + *(traces_data_ptr++) = curr; + int64_t step_idx; + for (step_idx = 0; step_idx < max_num_steps; ++step_idx) { + IdType metapath_id = metapath_data[step_idx]; + const GraphKernelData &graph = graphs[metapath_id]; + const int64_t in_row_start = graph.in_ptr[curr]; + const int64_t deg = graph.in_ptr[curr + 1] - graph.in_ptr[curr]; + if (deg == 0) { // the degree is zero + break; + } + const int64_t num = curand(&rng) % deg; + IdType pick = graph.in_cols[in_row_start + num]; + IdType eid = + (graph.data ? graph.data[in_row_start + num] : in_row_start + num); + *traces_data_ptr = pick; + *eids_data_ptr = eid; + if ((restart_prob_size > 1) && + (curand_uniform(&rng) < restart_prob_data[step_idx])) { + break; + } else if ( + (restart_prob_size == 1) && + (curand_uniform(&rng) < restart_prob_data[0])) { + break; + } + ++traces_data_ptr; + ++eids_data_ptr; + curr = pick; + } + for (; step_idx < max_num_steps; ++step_idx) { + *(traces_data_ptr++) = -1; + *(eids_data_ptr++) = -1; + } + idx += BLOCK_SIZE; + } +} + +template +__global__ void _RandomWalkBiasedKernel( + const uint64_t rand_seed, const IdType *seed_data, const int64_t num_seeds, + const IdType *metapath_data, const uint64_t max_num_steps, + const GraphKernelData *graphs, const FloatType **probs, + const FloatType **prob_sums, const FloatType *restart_prob_data, + const int64_t restart_prob_size, const int64_t max_nodes, + IdType *out_traces_data, IdType *out_eids_data) { + assert(BLOCK_SIZE == blockDim.x); + int64_t idx = blockIdx.x * TILE_SIZE + threadIdx.x; + int64_t last_idx = + min(static_cast(blockIdx.x + 1) * TILE_SIZE, num_seeds); + int64_t trace_length = (max_num_steps + 1); + curandState rng; + // reference: + // https://docs.nvidia.com/cuda/curand/device-api-overview.html#performance-notes + curand_init(rand_seed + idx, 0, 0, &rng); + + while (idx < last_idx) { + IdType curr = seed_data[idx]; + assert(curr < max_nodes); + IdType *traces_data_ptr = &out_traces_data[idx * trace_length]; + IdType *eids_data_ptr = &out_eids_data[idx * max_num_steps]; + *(traces_data_ptr++) = curr; + int64_t step_idx; + for (step_idx = 0; step_idx < max_num_steps; ++step_idx) { + IdType metapath_id = metapath_data[step_idx]; + const GraphKernelData &graph = graphs[metapath_id]; + const int64_t in_row_start = graph.in_ptr[curr]; + const int64_t deg = graph.in_ptr[curr + 1] - graph.in_ptr[curr]; + if (deg == 0) { // the degree is zero + break; + } + + // randomly select by weight + const FloatType *prob_sum = prob_sums[metapath_id]; + const FloatType *prob = probs[metapath_id]; + int64_t num; + if (prob == nullptr) { + num = curand(&rng) % deg; + } else { + auto rnd_sum_w = prob_sum[curr] * curand_uniform(&rng); + FloatType sum_w{0.}; + for (num = 0; num < deg; ++num) { + sum_w += prob[in_row_start + num]; + if (sum_w >= rnd_sum_w) break; + } + } + + IdType pick = graph.in_cols[in_row_start + num]; + IdType eid = + (graph.data ? 
graph.data[in_row_start + num] : in_row_start + num); + *traces_data_ptr = pick; + *eids_data_ptr = eid; + if ((restart_prob_size > 1) && + (curand_uniform(&rng) < restart_prob_data[step_idx])) { + break; + } else if ( + (restart_prob_size == 1) && + (curand_uniform(&rng) < restart_prob_data[0])) { + break; + } + ++traces_data_ptr; + ++eids_data_ptr; + curr = pick; + } + for (; step_idx < max_num_steps; ++step_idx) { + *(traces_data_ptr++) = -1; + *(eids_data_ptr++) = -1; + } + idx += BLOCK_SIZE; + } +} + +} // namespace + +// random walk for uniform choice +template +std::pair RandomWalkUniform( + const HeteroGraphPtr hg, const IdArray seeds, const TypeArray metapath, + FloatArray restart_prob) { + const int64_t max_num_steps = metapath->shape[0]; + const IdType *metapath_data = static_cast(metapath->data); + const int64_t begin_ntype = + hg->meta_graph()->FindEdge(metapath_data[0]).first; + const int64_t max_nodes = hg->NumVertices(begin_ntype); + int64_t num_etypes = hg->NumEdgeTypes(); + auto ctx = seeds->ctx; + + const IdType *seed_data = static_cast(seeds->data); + CHECK(seeds->ndim == 1) << "seeds shape is not one dimension."; + const int64_t num_seeds = seeds->shape[0]; + int64_t trace_length = max_num_steps + 1; + IdArray traces = IdArray::Empty({num_seeds, trace_length}, seeds->dtype, ctx); + IdArray eids = IdArray::Empty({num_seeds, max_num_steps}, seeds->dtype, ctx); + IdType *traces_data = traces.Ptr(); + IdType *eids_data = eids.Ptr(); + + std::vector> h_graphs(num_etypes); + for (int64_t etype = 0; etype < num_etypes; ++etype) { + const CSRMatrix &csr = hg->GetCSRMatrix(etype); + h_graphs[etype].in_ptr = static_cast(csr.indptr->data); + h_graphs[etype].in_cols = static_cast(csr.indices->data); + h_graphs[etype].data = + (CSRHasData(csr) ? static_cast(csr.data->data) + : nullptr); + } + // use cuda stream from local thread + cudaStream_t stream = runtime::getCurrentCUDAStream(); + auto device = DeviceAPI::Get(ctx); + auto d_graphs = static_cast *>(device->AllocWorkspace( + ctx, (num_etypes) * sizeof(GraphKernelData))); + // copy graph metadata pointers to GPU + device->CopyDataFromTo( + h_graphs.data(), 0, d_graphs, 0, + (num_etypes) * sizeof(GraphKernelData), DGLContext{kDGLCPU, 0}, + ctx, hg->GetCSRMatrix(0).indptr->dtype); + // copy metapath to GPU + auto d_metapath = metapath.CopyTo(ctx); + const IdType *d_metapath_data = static_cast(d_metapath->data); + + constexpr int BLOCK_SIZE = 256; + constexpr int TILE_SIZE = BLOCK_SIZE * 4; + dim3 block(256); + dim3 grid((num_seeds + TILE_SIZE - 1) / TILE_SIZE); + const uint64_t random_seed = RandomEngine::ThreadLocal()->RandInt(1000000000); + ATEN_FLOAT_TYPE_SWITCH( + restart_prob->dtype, FloatType, "random walk GPU kernel", { + CHECK(restart_prob->ctx.device_type == kDGLCUDA) + << "restart prob should be in GPU."; + CHECK(restart_prob->ndim == 1) << "restart prob dimension should be 1."; + const FloatType *restart_prob_data = restart_prob.Ptr(); + const int64_t restart_prob_size = restart_prob->shape[0]; + CUDA_KERNEL_CALL( + (_RandomWalkKernel), grid, + block, 0, stream, random_seed, seed_data, num_seeds, + d_metapath_data, max_num_steps, d_graphs, restart_prob_data, + restart_prob_size, max_nodes, traces_data, eids_data); + }); + + device->FreeWorkspace(ctx, d_graphs); + return std::make_pair(traces, eids); +} + +/** + * @brief Random walk for biased choice. We use inverse transform sampling to + * choose the next step. 
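The biased kernel above draws the next neighbor by inverse transform sampling: a uniform variate is scaled by the precomputed weight sum of the current node, and the walk steps to the first neighbor whose running prefix of weights covers it. Below is a compilable sketch of just that step using the hipRAND device API; the helper name, the toy driver, and the single-thread launch are inventions of this sketch, and the hipRAND header path can differ between ROCm releases:

#include <hip/hip_runtime.h>
#include <hiprand/hiprand_kernel.h>
#include <cstdint>
#include <cstdio>

// Pick index i in [0, deg) with probability weights[i] / sum(weights).
__device__ int64_t PickByWeight(
    const float *weights, int64_t deg, float total, hiprandState *rng) {
  // u is uniform in (0, total]; walk the prefix sums until they cover u.
  float u = total * hiprand_uniform(rng);
  float acc = 0.f;
  int64_t i = 0;
  for (; i < deg; ++i) {
    acc += weights[i];
    if (acc >= u) break;
  }
  return i < deg ? i : deg - 1;  // guard against round-off at the tail
}

__global__ void DemoKernel(const float *weights, int64_t deg, float total,
                           uint64_t seed, int64_t *out) {
  hiprandState rng;
  hiprand_init(seed, /*subsequence=*/0, /*offset=*/0, &rng);
  *out = PickByWeight(weights, deg, total, &rng);
}

int main() {
  const float h_w[4] = {0.1f, 0.2f, 0.3f, 0.4f};
  float *d_w;
  int64_t *d_out;
  hipMalloc(&d_w, sizeof(h_w));
  hipMalloc(&d_out, sizeof(int64_t));
  hipMemcpy(d_w, h_w, sizeof(h_w), hipMemcpyHostToDevice);
  DemoKernel<<<1, 1>>>(d_w, 4, 1.0f, 42ULL, d_out);
  int64_t h_out;
  hipMemcpy(&h_out, d_out, sizeof(h_out), hipMemcpyDeviceToHost);
  std::printf("picked neighbour %lld\n", static_cast<long long>(h_out));
  hipFree(d_w);
  hipFree(d_out);
  return 0;
}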
+ */ +template +std::pair RandomWalkBiased( + const HeteroGraphPtr hg, const IdArray seeds, const TypeArray metapath, + const std::vector &prob, FloatArray restart_prob) { + const int64_t max_num_steps = metapath->shape[0]; + const IdType *metapath_data = static_cast(metapath->data); + const int64_t begin_ntype = + hg->meta_graph()->FindEdge(metapath_data[0]).first; + const int64_t max_nodes = hg->NumVertices(begin_ntype); + int64_t num_etypes = hg->NumEdgeTypes(); + auto ctx = seeds->ctx; + + const IdType *seed_data = static_cast(seeds->data); + CHECK(seeds->ndim == 1) << "seeds shape is not one dimension."; + const int64_t num_seeds = seeds->shape[0]; + int64_t trace_length = max_num_steps + 1; + IdArray traces = IdArray::Empty({num_seeds, trace_length}, seeds->dtype, ctx); + IdArray eids = IdArray::Empty({num_seeds, max_num_steps}, seeds->dtype, ctx); + IdType *traces_data = traces.Ptr(); + IdType *eids_data = eids.Ptr(); + + cudaStream_t stream = runtime::getCurrentCUDAStream(); + auto device = DeviceAPI::Get(ctx); + // new probs and prob sums pointers + assert(num_etypes == static_cast(prob.size())); + std::unique_ptr probs(new FloatType *[prob.size()]); + std::unique_ptr prob_sums(new FloatType *[prob.size()]); + std::vector prob_sums_arr; + prob_sums_arr.reserve(prob.size()); + + // graphs + std::vector> h_graphs(num_etypes); + for (int64_t etype = 0; etype < num_etypes; ++etype) { + const CSRMatrix &csr = hg->GetCSRMatrix(etype); + h_graphs[etype].in_ptr = static_cast(csr.indptr->data); + h_graphs[etype].in_cols = static_cast(csr.indices->data); + h_graphs[etype].data = + (CSRHasData(csr) ? static_cast(csr.data->data) + : nullptr); + + int64_t num_segments = csr.indptr->shape[0] - 1; + // will handle empty probs in the kernel + if (IsNullArray(prob[etype])) { + probs[etype] = nullptr; + prob_sums[etype] = nullptr; + continue; + } + probs[etype] = prob[etype].Ptr(); + prob_sums_arr.push_back( + FloatArray::Empty({num_segments}, prob[etype]->dtype, ctx)); + prob_sums[etype] = prob_sums_arr[etype].Ptr(); + + // calculate the sum of the neighbor weights + const IdType *d_offsets = static_cast(csr.indptr->data); + size_t temp_storage_size = 0; + CUDA_CALL(cub::DeviceSegmentedReduce::Sum( + nullptr, temp_storage_size, probs[etype], prob_sums[etype], + num_segments, d_offsets, d_offsets + 1, stream)); + void *temp_storage = device->AllocWorkspace(ctx, temp_storage_size); + CUDA_CALL(cub::DeviceSegmentedReduce::Sum( + temp_storage, temp_storage_size, probs[etype], prob_sums[etype], + num_segments, d_offsets, d_offsets + 1, stream)); + device->FreeWorkspace(ctx, temp_storage); + } + + // copy graph metadata pointers to GPU + auto d_graphs = static_cast *>(device->AllocWorkspace( + ctx, (num_etypes) * sizeof(GraphKernelData))); + device->CopyDataFromTo( + h_graphs.data(), 0, d_graphs, 0, + (num_etypes) * sizeof(GraphKernelData), DGLContext{kDGLCPU, 0}, + ctx, hg->GetCSRMatrix(0).indptr->dtype); + // copy probs pointers to GPU + const FloatType **probs_dev = static_cast( + device->AllocWorkspace(ctx, num_etypes * sizeof(FloatType *))); + device->CopyDataFromTo( + probs.get(), 0, probs_dev, 0, (num_etypes) * sizeof(FloatType *), + DGLContext{kDGLCPU, 0}, ctx, prob[0]->dtype); + // copy probs_sum pointers to GPU + const FloatType **prob_sums_dev = static_cast( + device->AllocWorkspace(ctx, num_etypes * sizeof(FloatType *))); + device->CopyDataFromTo( + prob_sums.get(), 0, prob_sums_dev, 0, (num_etypes) * sizeof(FloatType *), + DGLContext{kDGLCPU, 0}, ctx, prob[0]->dtype); + // copy metapath to 
GPU + auto d_metapath = metapath.CopyTo(ctx); + const IdType *d_metapath_data = static_cast(d_metapath->data); + + constexpr int BLOCK_SIZE = 256; + constexpr int TILE_SIZE = BLOCK_SIZE * 4; + dim3 block(256); + dim3 grid((num_seeds + TILE_SIZE - 1) / TILE_SIZE); + const uint64_t random_seed = RandomEngine::ThreadLocal()->RandInt(1000000000); + CHECK(restart_prob->ctx.device_type == kDGLCUDA) + << "restart prob should be in GPU."; + CHECK(restart_prob->ndim == 1) << "restart prob dimension should be 1."; + const FloatType *restart_prob_data = restart_prob.Ptr(); + const int64_t restart_prob_size = restart_prob->shape[0]; + CUDA_KERNEL_CALL( + (_RandomWalkBiasedKernel), grid, + block, 0, stream, random_seed, seed_data, num_seeds, d_metapath_data, + max_num_steps, d_graphs, probs_dev, prob_sums_dev, restart_prob_data, + restart_prob_size, max_nodes, traces_data, eids_data); + + device->FreeWorkspace(ctx, d_graphs); + device->FreeWorkspace(ctx, probs_dev); + device->FreeWorkspace(ctx, prob_sums_dev); + return std::make_pair(traces, eids); +} + +template +std::pair RandomWalk( + const HeteroGraphPtr hg, const IdArray seeds, const TypeArray metapath, + const std::vector &prob) { + bool isUniform = true; + for (const auto &etype_prob : prob) { + if (!IsNullArray(etype_prob)) { + isUniform = false; + break; + } + } + + auto restart_prob = + NDArray::Empty({0}, DGLDataType{kDGLFloat, 32, 1}, DGLContext{XPU, 0}); + if (!isUniform) { + std::pair ret; + ATEN_FLOAT_TYPE_SWITCH(prob[0]->dtype, FloatType, "probability", { + ret = RandomWalkBiased( + hg, seeds, metapath, prob, restart_prob); + }); + return ret; + } else { + return RandomWalkUniform(hg, seeds, metapath, restart_prob); + } +} + +template +std::pair RandomWalkWithRestart( + const HeteroGraphPtr hg, const IdArray seeds, const TypeArray metapath, + const std::vector &prob, double restart_prob) { + bool isUniform = true; + for (const auto &etype_prob : prob) { + if (!IsNullArray(etype_prob)) { + isUniform = false; + break; + } + } + + auto device_ctx = seeds->ctx; + auto restart_prob_array = + NDArray::Empty({1}, DGLDataType{kDGLFloat, 64, 1}, device_ctx); + auto device = dgl::runtime::DeviceAPI::Get(device_ctx); + + // use cuda stream from local thread + cudaStream_t stream = runtime::getCurrentCUDAStream(); + device->CopyDataFromTo( + &restart_prob, 0, restart_prob_array.Ptr(), 0, sizeof(double), + DGLContext{kDGLCPU, 0}, device_ctx, restart_prob_array->dtype); + device->StreamSync(device_ctx, stream); + + if (!isUniform) { + std::pair ret; + ATEN_FLOAT_TYPE_SWITCH(prob[0]->dtype, FloatType, "probability", { + ret = RandomWalkBiased( + hg, seeds, metapath, prob, restart_prob_array); + }); + return ret; + } else { + return RandomWalkUniform( + hg, seeds, metapath, restart_prob_array); + } +} + +template +std::pair RandomWalkWithStepwiseRestart( + const HeteroGraphPtr hg, const IdArray seeds, const TypeArray metapath, + const std::vector &prob, FloatArray restart_prob) { + bool isUniform = true; + for (const auto &etype_prob : prob) { + if (!IsNullArray(etype_prob)) { + isUniform = false; + break; + } + } + + if (!isUniform) { + std::pair ret; + ATEN_FLOAT_TYPE_SWITCH(prob[0]->dtype, FloatType, "probability", { + ret = RandomWalkBiased( + hg, seeds, metapath, prob, restart_prob); + }); + return ret; + } else { + return RandomWalkUniform(hg, seeds, metapath, restart_prob); + } +} + +template +std::tuple SelectPinSageNeighbors( + const IdArray src, const IdArray dst, const int64_t num_samples_per_node, + const int64_t k) { + 
CHECK(src->ctx.device_type == kDGLCUDA) << "IdArray needs be on GPU!"; + const IdxType *src_data = src.Ptr(); + const IdxType *dst_data = dst.Ptr(); + const int64_t num_dst_nodes = (dst->shape[0] / num_samples_per_node); + auto ctx = src->ctx; + // use cuda stream from local thread + cudaStream_t stream = runtime::getCurrentCUDAStream(); + auto frequency_hashmap = FrequencyHashmap( + num_dst_nodes, num_samples_per_node, ctx, stream); + auto ret = frequency_hashmap.Topk( + src_data, dst_data, src->dtype, src->shape[0], num_samples_per_node, k); + return ret; +} + +template std::pair RandomWalk( + const HeteroGraphPtr hg, const IdArray seeds, const TypeArray metapath, + const std::vector &prob); +template std::pair RandomWalk( + const HeteroGraphPtr hg, const IdArray seeds, const TypeArray metapath, + const std::vector &prob); + +template std::pair RandomWalkWithRestart( + const HeteroGraphPtr hg, const IdArray seeds, const TypeArray metapath, + const std::vector &prob, double restart_prob); +template std::pair RandomWalkWithRestart( + const HeteroGraphPtr hg, const IdArray seeds, const TypeArray metapath, + const std::vector &prob, double restart_prob); + +template std::pair +RandomWalkWithStepwiseRestart( + const HeteroGraphPtr hg, const IdArray seeds, const TypeArray metapath, + const std::vector &prob, FloatArray restart_prob); +template std::pair +RandomWalkWithStepwiseRestart( + const HeteroGraphPtr hg, const IdArray seeds, const TypeArray metapath, + const std::vector &prob, FloatArray restart_prob); + +template std::tuple +SelectPinSageNeighbors( + const IdArray src, const IdArray dst, const int64_t num_samples_per_node, + const int64_t k); +template std::tuple +SelectPinSageNeighbors( + const IdArray src, const IdArray dst, const int64_t num_samples_per_node, + const int64_t k); + +}; // namespace impl + +}; // namespace sampling + +}; // namespace dgl diff --git a/src/graph/transform/cuda/cuda_compact_graph.cu b/src/graph/transform/cuda/cuda_compact_graph.cu index 359da3f0d41e..576fecba8e01 100644 --- a/src/graph/transform/cuda/cuda_compact_graph.cu +++ b/src/graph/transform/cuda/cuda_compact_graph.cu @@ -18,7 +18,7 @@ * all given graphs with the same set of nodes. */ -#include +#include #include #include @@ -55,10 +55,10 @@ template void BuildNodeMaps( const std::vector &input_nodes, DeviceNodeMap *const node_maps, int64_t *const count_unique_device, - std::vector *const unique_nodes_device, cudaStream_t stream) { + std::vector *const unique_nodes_device, hipStream_t stream) { const int64_t num_ntypes = static_cast(input_nodes.size()); - CUDA_CALL(cudaMemsetAsync( + CUDA_CALL(hipMemsetAsync( count_unique_device, 0, num_ntypes * sizeof(*count_unique_device), stream)); @@ -81,7 +81,7 @@ std::pair, std::vector> CompactGraphsGPU( const std::vector &always_preserve) { const auto &ctx = graphs[0]->Context(); auto device = runtime::DeviceAPI::Get(ctx); - cudaStream_t stream = runtime::getCurrentCUDAStream(); + hipStream_t stream = runtime::getCurrentCUDAStream(); CHECK_EQ(ctx.device_type, kDGLCUDA); diff --git a/src/graph/transform/cuda/cuda_compact_graph.cu.prehip b/src/graph/transform/cuda/cuda_compact_graph.cu.prehip new file mode 100644 index 000000000000..359da3f0d41e --- /dev/null +++ b/src/graph/transform/cuda/cuda_compact_graph.cu.prehip @@ -0,0 +1,247 @@ +/** + * Copyright 2021 Contributors + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + * + * @file graph/transform/cuda/cuda_compact_graph.cu + * @brief Functions to find and eliminate the common isolated nodes across + * all given graphs with the same set of nodes. + */ + +#include +#include +#include + +#include +#include +#include + +#include "../../../runtime/cuda/cuda_common.h" +#include "../../heterograph.h" +#include "../compact.h" +#include "cuda_map_edges.cuh" + +using namespace dgl::aten; +using namespace dgl::runtime::cuda; +using namespace dgl::transform::cuda; + +namespace dgl { +namespace transform { + +namespace { + +/** + * @brief This function builds node maps for each node type, preserving the + * order of the input nodes. Here it is assumed the nodes are not unique, + * and thus a unique list is generated. + * + * @param input_nodes The set of input nodes. + * @param node_maps The node maps to be constructed. + * @param count_unique_device The number of unique nodes (on the GPU). + * @param unique_nodes_device The unique nodes (on the GPU). + * @param stream The stream to operate on. + */ +template +void BuildNodeMaps( + const std::vector &input_nodes, + DeviceNodeMap *const node_maps, int64_t *const count_unique_device, + std::vector *const unique_nodes_device, cudaStream_t stream) { + const int64_t num_ntypes = static_cast(input_nodes.size()); + + CUDA_CALL(cudaMemsetAsync( + count_unique_device, 0, num_ntypes * sizeof(*count_unique_device), + stream)); + + // possibly duplicated nodes + for (int64_t ntype = 0; ntype < num_ntypes; ++ntype) { + const IdArray &nodes = input_nodes[ntype]; + if (nodes->shape[0] > 0) { + CHECK_EQ(nodes->ctx.device_type, kDGLCUDA); + node_maps->LhsHashTable(ntype).FillWithDuplicates( + nodes.Ptr(), nodes->shape[0], + (*unique_nodes_device)[ntype].Ptr(), + count_unique_device + ntype, stream); + } + } +} + +template +std::pair, std::vector> CompactGraphsGPU( + const std::vector &graphs, + const std::vector &always_preserve) { + const auto &ctx = graphs[0]->Context(); + auto device = runtime::DeviceAPI::Get(ctx); + cudaStream_t stream = runtime::getCurrentCUDAStream(); + + CHECK_EQ(ctx.device_type, kDGLCUDA); + + // Step 1: Collect the nodes that has connections for each type. 
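BuildNodeMaps above first clears one counter per node type with cudaMemsetAsync (hipMemsetAsync in the hipified version) on the work stream, then lets each per-type fill bump its own counter; because everything is ordered on that one stream, no synchronization is needed until the counts are copied back later in CompactGraphsGPU. A reduced, self-contained sketch of that idiom follows; the kernel and buffer names are made up, and a plain atomicAdd stands in for DGL's OrderedHashTable fill:

#include <hip/hip_runtime.h>
#include <cstdio>

// Stand-in for the per-type "fill" work: every thread bumps its type's counter.
__global__ void CountPerType(const int *types, int n, int *counters) {
  int i = blockIdx.x * blockDim.x + threadIdx.x;
  if (i < n) atomicAdd(&counters[types[i]], 1);
}

int main() {
  const int kTypes = 3, n = 6;
  const int h_types[n] = {0, 1, 1, 2, 2, 2};
  int *d_types, *d_counters;
  hipMalloc(&d_types, n * sizeof(int));
  hipMalloc(&d_counters, kTypes * sizeof(int));
  hipStream_t stream;
  hipStreamCreate(&stream);
  hipMemcpyAsync(d_types, h_types, n * sizeof(int), hipMemcpyHostToDevice, stream);

  // 1. Zero the counters asynchronously on the work stream.
  hipMemsetAsync(d_counters, 0, kTypes * sizeof(int), stream);
  // 2. Launch the counting kernel on the same stream; stream order guarantees
  //    it sees the zeroed counters.
  CountPerType<<<1, 64, 0, stream>>>(d_types, n, d_counters);
  // 3. Copy the counts back and only now synchronize the stream.
  int h_counters[kTypes];
  hipMemcpyAsync(h_counters, d_counters, kTypes * sizeof(int),
                 hipMemcpyDeviceToHost, stream);
  hipStreamSynchronize(stream);

  for (int t = 0; t < kTypes; ++t) std::printf("type %d: %d\n", t, h_counters[t]);
  hipStreamDestroy(stream);
  hipFree(d_types);
  hipFree(d_counters);
  return 0;
}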
+ const uint64_t num_ntypes = graphs[0]->NumVertexTypes(); + std::vector> all_edges( + graphs.size()); // all_edges[i][etype] + + // count the number of nodes per type + std::vector max_vertex_cnt(num_ntypes, 0); + for (size_t i = 0; i < graphs.size(); ++i) { + const HeteroGraphPtr curr_graph = graphs[i]; + const int64_t num_etypes = curr_graph->NumEdgeTypes(); + + for (IdType etype = 0; etype < num_etypes; ++etype) { + IdType srctype, dsttype; + std::tie(srctype, dsttype) = curr_graph->GetEndpointTypes(etype); + + const int64_t n_edges = curr_graph->NumEdges(etype); + max_vertex_cnt[srctype] += n_edges; + max_vertex_cnt[dsttype] += n_edges; + } + } + + for (size_t i = 0; i < always_preserve.size(); ++i) { + max_vertex_cnt[i] += always_preserve[i]->shape[0]; + } + + // gather all nodes + std::vector all_nodes(num_ntypes); + std::vector node_offsets(num_ntypes, 0); + + for (uint64_t ntype = 0; ntype < num_ntypes; ++ntype) { + all_nodes[ntype] = + NewIdArray(max_vertex_cnt[ntype], ctx, sizeof(IdType) * 8); + // copy the nodes in always_preserve + if (ntype < always_preserve.size() && + always_preserve[ntype]->shape[0] > 0) { + device->CopyDataFromTo( + always_preserve[ntype].Ptr(), 0, + all_nodes[ntype].Ptr(), node_offsets[ntype], + sizeof(IdType) * always_preserve[ntype]->shape[0], + always_preserve[ntype]->ctx, all_nodes[ntype]->ctx, + always_preserve[ntype]->dtype); + node_offsets[ntype] += sizeof(IdType) * always_preserve[ntype]->shape[0]; + } + } + + for (size_t i = 0; i < graphs.size(); ++i) { + const HeteroGraphPtr curr_graph = graphs[i]; + const int64_t num_etypes = curr_graph->NumEdgeTypes(); + + all_edges[i].reserve(num_etypes); + for (int64_t etype = 0; etype < num_etypes; ++etype) { + dgl_type_t srctype, dsttype; + std::tie(srctype, dsttype) = curr_graph->GetEndpointTypes(etype); + + const EdgeArray edges = curr_graph->Edges(etype, "eid"); + + if (edges.src.defined()) { + device->CopyDataFromTo( + edges.src.Ptr(), 0, all_nodes[srctype].Ptr(), + node_offsets[srctype], sizeof(IdType) * edges.src->shape[0], + edges.src->ctx, all_nodes[srctype]->ctx, edges.src->dtype); + node_offsets[srctype] += sizeof(IdType) * edges.src->shape[0]; + } + if (edges.dst.defined()) { + device->CopyDataFromTo( + edges.dst.Ptr(), 0, all_nodes[dsttype].Ptr(), + node_offsets[dsttype], sizeof(IdType) * edges.dst->shape[0], + edges.dst->ctx, all_nodes[dsttype]->ctx, edges.dst->dtype); + node_offsets[dsttype] += sizeof(IdType) * edges.dst->shape[0]; + } + all_edges[i].push_back(edges); + } + } + + // Step 2: Relabel the nodes for each type to a smaller ID space + // using BuildNodeMaps + + // allocate space for map creation + // the hashmap on GPU + DeviceNodeMap node_maps(max_vertex_cnt, 0, ctx, stream); + // number of unique nodes per type on CPU + std::vector num_induced_nodes(num_ntypes); + // number of unique nodes per type on GPU + int64_t *count_unique_device = static_cast( + device->AllocWorkspace(ctx, sizeof(int64_t) * num_ntypes)); + // the set of unique nodes per type + std::vector induced_nodes(num_ntypes); + for (uint64_t ntype = 0; ntype < num_ntypes; ++ntype) { + induced_nodes[ntype] = + NewIdArray(max_vertex_cnt[ntype], ctx, sizeof(IdType) * 8); + } + + BuildNodeMaps( + all_nodes, &node_maps, count_unique_device, &induced_nodes, stream); + + device->CopyDataFromTo( + count_unique_device, 0, num_induced_nodes.data(), 0, + sizeof(*num_induced_nodes.data()) * num_ntypes, ctx, + DGLContext{kDGLCPU, 0}, DGLDataType{kDGLInt, 64, 1}); + device->StreamSync(ctx, stream); + + // wait for the node counts 
to finish transferring + device->FreeWorkspace(ctx, count_unique_device); + + // resize induced nodes + for (uint64_t ntype = 0; ntype < num_ntypes; ++ntype) { + induced_nodes[ntype]->shape[0] = num_induced_nodes[ntype]; + } + + // Step 3: Remap the edges of each graph using MapEdges + std::vector new_graphs; + for (size_t i = 0; i < graphs.size(); ++i) { + const HeteroGraphPtr curr_graph = graphs[i]; + const auto meta_graph = curr_graph->meta_graph(); + const int64_t num_etypes = curr_graph->NumEdgeTypes(); + + std::vector rel_graphs; + rel_graphs.reserve(num_etypes); + + std::vector new_src; + std::vector new_dst; + std::tie(new_src, new_dst) = + MapEdges(curr_graph, all_edges[i], node_maps, stream); + + for (IdType etype = 0; etype < num_etypes; ++etype) { + IdType srctype, dsttype; + std::tie(srctype, dsttype) = curr_graph->GetEndpointTypes(etype); + + rel_graphs.push_back(UnitGraph::CreateFromCOO( + srctype == dsttype ? 1 : 2, induced_nodes[srctype]->shape[0], + induced_nodes[dsttype]->shape[0], new_src[etype], new_dst[etype])); + } + + new_graphs.push_back( + CreateHeteroGraph(meta_graph, rel_graphs, num_induced_nodes)); + } + + return std::make_pair(new_graphs, induced_nodes); +} + +} // namespace + +template <> +std::pair, std::vector> +CompactGraphs( + const std::vector &graphs, + const std::vector &always_preserve) { + return CompactGraphsGPU(graphs, always_preserve); +} + +template <> +std::pair, std::vector> +CompactGraphs( + const std::vector &graphs, + const std::vector &always_preserve) { + return CompactGraphsGPU(graphs, always_preserve); +} + +} // namespace transform +} // namespace dgl diff --git a/src/graph/transform/cuda/cuda_map_edges.cuh b/src/graph/transform/cuda/cuda_map_edges.cuh index 93f1f3e2d927..9ae512f62926 100644 --- a/src/graph/transform/cuda/cuda_map_edges.cuh +++ b/src/graph/transform/cuda/cuda_map_edges.cuh @@ -1,3 +1,4 @@ +#include "hip/hip_runtime.h" /** * Copyright 2020-2022 Contributors * @@ -22,7 +23,7 @@ #include #include -#include +#include #include #include @@ -113,7 +114,7 @@ class DeviceNodeMap { DeviceNodeMap( const std::vector& num_nodes, const int64_t offset, - DGLContext ctx, cudaStream_t stream) + DGLContext ctx, hipStream_t stream) : num_types_(num_nodes.size()), rhs_offset_(offset), hash_tables_(), @@ -185,7 +186,7 @@ inline IdType RoundUp(const IdType num, const size_t unit) { template std::tuple, std::vector> MapEdges( HeteroGraphPtr graph, const std::vector& edge_sets, - const DeviceNodeMap& node_map, cudaStream_t stream) { + const DeviceNodeMap& node_map, hipStream_t stream) { constexpr const int BLOCK_SIZE = 128; constexpr const size_t TILE_SIZE = 1024; diff --git a/src/graph/transform/cuda/cuda_map_edges.cuh.prehip b/src/graph/transform/cuda/cuda_map_edges.cuh.prehip new file mode 100644 index 000000000000..93f1f3e2d927 --- /dev/null +++ b/src/graph/transform/cuda/cuda_map_edges.cuh.prehip @@ -0,0 +1,240 @@ +/** + * Copyright 2020-2022 Contributors + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ * + * @file graph/transform/cuda/cuda_map_edges.cuh + * @brief Device level functions for mapping edges. + */ + +#ifndef DGL_GRAPH_TRANSFORM_CUDA_CUDA_MAP_EDGES_CUH_ +#define DGL_GRAPH_TRANSFORM_CUDA_CUDA_MAP_EDGES_CUH_ + +#include +#include +#include +#include + +#include +#include +#include +#include +#include + +#include "../../../runtime/cuda/cuda_common.h" +#include "../../../runtime/cuda/cuda_hashtable.cuh" + +using namespace dgl::aten; +using namespace dgl::runtime::cuda; + +namespace dgl { +namespace transform { + +namespace cuda { + +template +__device__ void map_vertex_ids( + const IdType* const global, IdType* const new_global, + const IdType num_vertices, const DeviceOrderedHashTable& table) { + assert(BLOCK_SIZE == blockDim.x); + + using Mapping = typename OrderedHashTable::Mapping; + + const IdType tile_start = TILE_SIZE * blockIdx.x; + const IdType tile_end = min(TILE_SIZE * (blockIdx.x + 1), num_vertices); + + for (IdType idx = threadIdx.x + tile_start; idx < tile_end; + idx += BLOCK_SIZE) { + const Mapping& mapping = *table.Search(global[idx]); + new_global[idx] = mapping.local; + } +} + +/** + * @brief Generate mapped edge endpoint ids. + * + * @tparam IdType The type of id. + * @tparam BLOCK_SIZE The size of each thread block. + * @tparam TILE_SIZE The number of edges to process per thread block. + * @param global_srcs_device The source ids to map. + * @param new_global_srcs_device The mapped source ids (output). + * @param global_dsts_device The destination ids to map. + * @param new_global_dsts_device The mapped destination ids (output). + * @param num_edges The number of edges to map. + * @param src_mapping The mapping of sources ids. + * @param src_hash_size The the size of source id hash table/mapping. + * @param dst_mapping The mapping of destination ids. + * @param dst_hash_size The the size of destination id hash table/mapping. + */ +template +__global__ void map_edge_ids( + const IdType* const global_srcs_device, + IdType* const new_global_srcs_device, + const IdType* const global_dsts_device, + IdType* const new_global_dsts_device, const IdType num_edges, + DeviceOrderedHashTable src_mapping, + DeviceOrderedHashTable dst_mapping) { + assert(BLOCK_SIZE == blockDim.x); + assert(2 == gridDim.y); + + if (blockIdx.y == 0) { + map_vertex_ids( + global_srcs_device, new_global_srcs_device, num_edges, src_mapping); + } else { + map_vertex_ids( + global_dsts_device, new_global_dsts_device, num_edges, dst_mapping); + } +} + +/** + * @brief Device level node maps for each node type. + * + * @param num_nodes Number of nodes per type. + * @param offset When offset is set to 0, LhsHashTable is identical to + * RhsHashTable. Or set to num_nodes.size()/2 to use seperated + * LhsHashTable and RhsHashTable. + * @param ctx The DGL context. + * @param stream The stream to operate on. 
+ */ +template +class DeviceNodeMap { + public: + using Mapping = typename OrderedHashTable::Mapping; + + DeviceNodeMap( + const std::vector& num_nodes, const int64_t offset, + DGLContext ctx, cudaStream_t stream) + : num_types_(num_nodes.size()), + rhs_offset_(offset), + hash_tables_(), + ctx_(ctx) { + auto device = runtime::DeviceAPI::Get(ctx); + + hash_tables_.reserve(num_types_); + for (int64_t i = 0; i < num_types_; ++i) { + hash_tables_.emplace_back( + new OrderedHashTable(num_nodes[i], ctx_, stream)); + } + } + + OrderedHashTable& LhsHashTable(const size_t index) { + return HashData(index); + } + + OrderedHashTable& RhsHashTable(const size_t index) { + return HashData(index + rhs_offset_); + } + + const OrderedHashTable& LhsHashTable(const size_t index) const { + return HashData(index); + } + + const OrderedHashTable& RhsHashTable(const size_t index) const { + return HashData(index + rhs_offset_); + } + + IdType LhsHashSize(const size_t index) const { return HashSize(index); } + + IdType RhsHashSize(const size_t index) const { + return HashSize(rhs_offset_ + index); + } + + size_t Size() const { return hash_tables_.size(); } + + private: + int64_t num_types_; + size_t rhs_offset_; + std::vector>> hash_tables_; + DGLContext ctx_; + + inline OrderedHashTable& HashData(const size_t index) { + CHECK_LT(index, hash_tables_.size()); + return *hash_tables_[index]; + } + + inline const OrderedHashTable& HashData(const size_t index) const { + CHECK_LT(index, hash_tables_.size()); + return *hash_tables_[index]; + } + + inline IdType HashSize(const size_t index) const { + return HashData(index).size(); + } +}; + +template +inline size_t RoundUpDiv(const IdType num, const size_t divisor) { + return static_cast(num / divisor) + (num % divisor == 0 ? 0 : 1); +} + +template +inline IdType RoundUp(const IdType num, const size_t unit) { + return RoundUpDiv(num, unit) * unit; +} + +template +std::tuple, std::vector> MapEdges( + HeteroGraphPtr graph, const std::vector& edge_sets, + const DeviceNodeMap& node_map, cudaStream_t stream) { + constexpr const int BLOCK_SIZE = 128; + constexpr const size_t TILE_SIZE = 1024; + + const auto& ctx = graph->Context(); + + std::vector new_lhs; + new_lhs.reserve(edge_sets.size()); + std::vector new_rhs; + new_rhs.reserve(edge_sets.size()); + + // The next peformance optimization here, is to perform mapping of all edge + // types in a single kernel launch. 
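The map_vertex_ids and map_edge_ids kernels above expect a two-row grid (the assert on gridDim.y), with blockIdx.y selecting whether sources or destinations are remapped, and MapEdges, continuing below, builds exactly that grid with RoundUpDiv(num_edges, TILE_SIZE) tiles in x, BLOCK_SIZE = 128 and TILE_SIZE = 1024; each block then strides through its tile in steps of BLOCK_SIZE. A stripped-down, compilable version of that tiling convention; a plain remap array replaces the DeviceOrderedHashTable lookup, so this is only the launch and indexing skeleton, not DGL's mapping logic:

#include <hip/hip_runtime.h>
#include <cstdio>

constexpr int BLOCK_SIZE = 128;
constexpr int TILE_SIZE = 1024;

// Each block owns one TILE_SIZE slice; threads stride through it by BLOCK_SIZE.
// blockIdx.y == 0 remaps the "src" array, blockIdx.y == 1 the "dst" array.
__global__ void RemapIdsKernel(const int *remap, const int *src, const int *dst,
                               int *new_src, int *new_dst, int num_edges) {
  const int *in = (blockIdx.y == 0) ? src : dst;
  int *out = (blockIdx.y == 0) ? new_src : new_dst;
  const int tile_start = blockIdx.x * TILE_SIZE;
  const int tile_end = min(tile_start + TILE_SIZE, num_edges);
  for (int i = tile_start + threadIdx.x; i < tile_end; i += BLOCK_SIZE) {
    out[i] = remap[in[i]];
  }
}

int main() {
  const int num_edges = 3, num_nodes = 4;
  const int h_remap[num_nodes] = {10, 11, 12, 13};
  const int h_src[num_edges] = {0, 2, 3}, h_dst[num_edges] = {1, 1, 0};
  int *d_remap, *d_src, *d_dst, *d_new_src, *d_new_dst;
  hipMalloc(&d_remap, sizeof(h_remap));
  hipMalloc(&d_src, sizeof(h_src));
  hipMalloc(&d_dst, sizeof(h_dst));
  hipMalloc(&d_new_src, sizeof(h_src));
  hipMalloc(&d_new_dst, sizeof(h_dst));
  hipMemcpy(d_remap, h_remap, sizeof(h_remap), hipMemcpyHostToDevice);
  hipMemcpy(d_src, h_src, sizeof(h_src), hipMemcpyHostToDevice);
  hipMemcpy(d_dst, h_dst, sizeof(h_dst), hipMemcpyHostToDevice);

  // Same grid shape as MapEdges: x covers the tiles, y == 2 covers src and dst.
  dim3 grid((num_edges + TILE_SIZE - 1) / TILE_SIZE, 2);
  dim3 block(BLOCK_SIZE);
  RemapIdsKernel<<<grid, block>>>(d_remap, d_src, d_dst, d_new_src, d_new_dst,
                                  num_edges);

  int h_new_src[num_edges], h_new_dst[num_edges];
  hipMemcpy(h_new_src, d_new_src, sizeof(h_new_src), hipMemcpyDeviceToHost);
  hipMemcpy(h_new_dst, d_new_dst, sizeof(h_new_dst), hipMemcpyDeviceToHost);
  for (int i = 0; i < num_edges; ++i)
    std::printf("%d->%d\n", h_new_src[i], h_new_dst[i]);  // 10->11, 12->11, 13->10
  hipFree(d_remap); hipFree(d_src); hipFree(d_dst);
  hipFree(d_new_src); hipFree(d_new_dst);
  return 0;
}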
+ const int64_t num_edge_sets = static_cast(edge_sets.size()); + for (int64_t etype = 0; etype < num_edge_sets; ++etype) { + const EdgeArray& edges = edge_sets[etype]; + if (edges.id.defined() && edges.src->shape[0] > 0) { + const int64_t num_edges = edges.src->shape[0]; + + new_lhs.emplace_back(NewIdArray(num_edges, ctx, sizeof(IdType) * 8)); + new_rhs.emplace_back(NewIdArray(num_edges, ctx, sizeof(IdType) * 8)); + + const auto src_dst_types = graph->GetEndpointTypes(etype); + const int src_type = src_dst_types.first; + const int dst_type = src_dst_types.second; + + const dim3 grid(RoundUpDiv(num_edges, TILE_SIZE), 2); + const dim3 block(BLOCK_SIZE); + + // map the srcs + CUDA_KERNEL_CALL( + (map_edge_ids), grid, block, 0, stream, + edges.src.Ptr(), new_lhs.back().Ptr(), + edges.dst.Ptr(), new_rhs.back().Ptr(), num_edges, + node_map.LhsHashTable(src_type).DeviceHandle(), + node_map.RhsHashTable(dst_type).DeviceHandle()); + } else { + new_lhs.emplace_back( + aten::NullArray(DGLDataType{kDGLInt, sizeof(IdType) * 8, 1}, ctx)); + new_rhs.emplace_back( + aten::NullArray(DGLDataType{kDGLInt, sizeof(IdType) * 8, 1}, ctx)); + } + } + + return std::tuple, std::vector>( + std::move(new_lhs), std::move(new_rhs)); +} + +} // namespace cuda +} // namespace transform +} // namespace dgl + +#endif // DGL_GRAPH_TRANSFORM_CUDA_CUDA_MAP_EDGES_CUH_ diff --git a/src/graph/transform/cuda/cuda_to_block.cu b/src/graph/transform/cuda/cuda_to_block.cu index a8bffc8cc6e8..f09bd694aaf4 100644 --- a/src/graph/transform/cuda/cuda_to_block.cu +++ b/src/graph/transform/cuda/cuda_to_block.cu @@ -20,7 +20,7 @@ * Tested via python wrapper: python/dgl/path/to/to_block.py */ -#include +#include #include #include #include @@ -69,10 +69,10 @@ class DeviceNodeMapMaker { const std::vector& lhs_nodes, const std::vector& rhs_nodes, DeviceNodeMap* const node_maps, int64_t* const count_lhs_device, - std::vector* const lhs_device, cudaStream_t stream) { + std::vector* const lhs_device, hipStream_t stream) { const int64_t num_ntypes = lhs_nodes.size() + rhs_nodes.size(); - CUDA_CALL(cudaMemsetAsync( + CUDA_CALL(hipMemsetAsync( count_lhs_device, 0, num_ntypes * sizeof(*count_lhs_device), stream)); // possibly dublicate lhs nodes @@ -112,7 +112,7 @@ class DeviceNodeMapMaker { void Make( const std::vector& lhs_nodes, const std::vector& rhs_nodes, - DeviceNodeMap* const node_maps, cudaStream_t stream) { + DeviceNodeMap* const node_maps, hipStream_t stream) { const int64_t num_ntypes = lhs_nodes.size() + rhs_nodes.size(); // unique lhs nodes @@ -155,7 +155,7 @@ struct CUDAIdsMapper { std::vector& num_nodes_per_type = *num_nodes_per_type_ptr; const bool generate_lhs_nodes = lhs_nodes.empty(); auto device = runtime::DeviceAPI::Get(ctx); - cudaStream_t stream = runtime::getCurrentCUDAStream(); + hipStream_t stream = runtime::getCurrentCUDAStream(); // Allocate space for map creation process. DeviceNodeMapMaker maker(maxNodesPerType); @@ -168,7 +168,7 @@ struct CUDAIdsMapper { } } - cudaEvent_t copyEvent; + hipEvent_t copyEvent; NDArray new_len_tensor; // Populate the mappings. 
if (generate_lhs_nodes) { @@ -179,7 +179,7 @@ struct CUDAIdsMapper { src_nodes, rhs_nodes, &node_maps, count_lhs_device, &lhs_nodes, stream); - CUDA_CALL(cudaEventCreate(©Event)); + CUDA_CALL(hipEventCreate(©Event)); if (TensorDispatcher::Global()->IsAvailable()) { new_len_tensor = NDArray::PinnedEmpty( {num_ntypes}, DGLDataTypeTraits::dtype, @@ -190,11 +190,11 @@ struct CUDAIdsMapper { {num_ntypes}, DGLDataTypeTraits::dtype, DGLContext{kDGLCPU, 0}); } - CUDA_CALL(cudaMemcpyAsync( + CUDA_CALL(hipMemcpyAsync( new_len_tensor->data, count_lhs_device, sizeof(*num_nodes_per_type.data()) * num_ntypes, - cudaMemcpyDeviceToHost, stream)); - CUDA_CALL(cudaEventRecord(copyEvent, stream)); + hipMemcpyDeviceToHost, stream)); + CUDA_CALL(hipEventRecord(copyEvent, stream)); device->FreeWorkspace(ctx, count_lhs_device); } else { @@ -209,8 +209,8 @@ struct CUDAIdsMapper { if (generate_lhs_nodes) { // wait for the previous copy - CUDA_CALL(cudaEventSynchronize(copyEvent)); - CUDA_CALL(cudaEventDestroy(copyEvent)); + CUDA_CALL(hipEventSynchronize(copyEvent)); + CUDA_CALL(hipEventDestroy(copyEvent)); // Resize lhs nodes. for (int64_t ntype = 0; ntype < num_ntypes; ++ntype) { diff --git a/src/graph/transform/cuda/cuda_to_block.cu.prehip b/src/graph/transform/cuda/cuda_to_block.cu.prehip new file mode 100644 index 000000000000..a8bffc8cc6e8 --- /dev/null +++ b/src/graph/transform/cuda/cuda_to_block.cu.prehip @@ -0,0 +1,258 @@ +/** + * Copyright 2020-2021 Contributors + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + * + * @file graph/transform/cuda/cuda_to_block.cu + * @brief Functions to convert a set of edges into a graph block with local + * ids. + * + * Tested via python wrapper: python/dgl/path/to/to_block.py + */ + +#include +#include +#include +#include + +#include +#include +#include + +#include "../../../runtime/cuda/cuda_common.h" +#include "../../heterograph.h" +#include "../to_block.h" +#include "cuda_map_edges.cuh" + +using namespace dgl::aten; +using namespace dgl::runtime::cuda; +using namespace dgl::transform::cuda; +using TensorDispatcher = dgl::runtime::TensorDispatcher; + +namespace dgl { +namespace transform { + +namespace { + +template +class DeviceNodeMapMaker { + public: + explicit DeviceNodeMapMaker(const std::vector& maxNodesPerType) + : max_num_nodes_(0) { + max_num_nodes_ = + *std::max_element(maxNodesPerType.begin(), maxNodesPerType.end()); + } + + /** + * @brief This function builds node maps for each node type, preserving the + * order of the input nodes. Here it is assumed the lhs_nodes are not unique, + * and thus a unique list is generated. + * + * @param lhs_nodes The set of source input nodes. + * @param rhs_nodes The set of destination input nodes. + * @param node_maps The node maps to be constructed. + * @param count_lhs_device The number of unique source nodes (on the GPU). + * @param lhs_device The unique source nodes (on the GPU). + * @param stream The stream to operate on. 
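The CUDAIdsMapper changes above keep the count transfer asynchronous: the per-type node counts are staged with hipMemcpyAsync into a host tensor (pinned when the tensor adapter is available, pageable otherwise), an event is recorded right behind the copy, and the event is only waited on after the edge mapping has been queued. A minimal sketch of that event pattern with plain HIP allocations; hipHostMalloc stands in for NDArray::PinnedEmpty here and the kernel is invented for the example:

#include <hip/hip_runtime.h>
#include <cstdio>

__global__ void FillKernel(int *data, int n) {
  int i = blockIdx.x * blockDim.x + threadIdx.x;
  if (i < n) data[i] = i;
}

int main() {
  const int n = 4;
  int *d_data = nullptr, *h_data = nullptr;
  hipMalloc(&d_data, n * sizeof(int));
  // Pinned host memory so the device-to-host copy can actually run async.
  hipHostMalloc(reinterpret_cast<void **>(&h_data), n * sizeof(int),
                hipHostMallocDefault);
  hipStream_t stream;
  hipStreamCreate(&stream);
  hipEvent_t copy_done;
  hipEventCreate(&copy_done);

  FillKernel<<<1, 64, 0, stream>>>(d_data, n);
  // Queue the device-to-host copy and record an event right behind it.
  hipMemcpyAsync(h_data, d_data, n * sizeof(int), hipMemcpyDeviceToHost, stream);
  hipEventRecord(copy_done, stream);

  // ... more work could be enqueued on `stream` here, as MapEdges is above ...

  // Block only when the host actually needs the values.
  hipEventSynchronize(copy_done);
  for (int i = 0; i < n; ++i) std::printf("%d ", h_data[i]);  // 0 1 2 3
  std::printf("\n");

  hipEventDestroy(copy_done);
  hipStreamDestroy(stream);
  hipHostFree(h_data);
  hipFree(d_data);
  return 0;
}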
+ */ + void Make( + const std::vector& lhs_nodes, + const std::vector& rhs_nodes, + DeviceNodeMap* const node_maps, int64_t* const count_lhs_device, + std::vector* const lhs_device, cudaStream_t stream) { + const int64_t num_ntypes = lhs_nodes.size() + rhs_nodes.size(); + + CUDA_CALL(cudaMemsetAsync( + count_lhs_device, 0, num_ntypes * sizeof(*count_lhs_device), stream)); + + // possibly dublicate lhs nodes + const int64_t lhs_num_ntypes = static_cast(lhs_nodes.size()); + for (int64_t ntype = 0; ntype < lhs_num_ntypes; ++ntype) { + const IdArray& nodes = lhs_nodes[ntype]; + if (nodes->shape[0] > 0) { + CHECK_EQ(nodes->ctx.device_type, kDGLCUDA); + node_maps->LhsHashTable(ntype).FillWithDuplicates( + nodes.Ptr(), nodes->shape[0], + (*lhs_device)[ntype].Ptr(), count_lhs_device + ntype, + stream); + } + } + + // unique rhs nodes + const int64_t rhs_num_ntypes = static_cast(rhs_nodes.size()); + for (int64_t ntype = 0; ntype < rhs_num_ntypes; ++ntype) { + const IdArray& nodes = rhs_nodes[ntype]; + if (nodes->shape[0] > 0) { + node_maps->RhsHashTable(ntype).FillWithUnique( + nodes.Ptr(), nodes->shape[0], stream); + } + } + } + + /** + * @brief This function builds node maps for each node type, preserving the + * order of the input nodes. Here it is assumed both lhs_nodes and rhs_nodes + * are unique. + * + * @param lhs_nodes The set of source input nodes. + * @param rhs_nodes The set of destination input nodes. + * @param node_maps The node maps to be constructed. + * @param stream The stream to operate on. + */ + void Make( + const std::vector& lhs_nodes, + const std::vector& rhs_nodes, + DeviceNodeMap* const node_maps, cudaStream_t stream) { + const int64_t num_ntypes = lhs_nodes.size() + rhs_nodes.size(); + + // unique lhs nodes + const int64_t lhs_num_ntypes = static_cast(lhs_nodes.size()); + for (int64_t ntype = 0; ntype < lhs_num_ntypes; ++ntype) { + const IdArray& nodes = lhs_nodes[ntype]; + if (nodes->shape[0] > 0) { + CHECK_EQ(nodes->ctx.device_type, kDGLCUDA); + node_maps->LhsHashTable(ntype).FillWithUnique( + nodes.Ptr(), nodes->shape[0], stream); + } + } + + // unique rhs nodes + const int64_t rhs_num_ntypes = static_cast(rhs_nodes.size()); + for (int64_t ntype = 0; ntype < rhs_num_ntypes; ++ntype) { + const IdArray& nodes = rhs_nodes[ntype]; + if (nodes->shape[0] > 0) { + node_maps->RhsHashTable(ntype).FillWithUnique( + nodes.Ptr(), nodes->shape[0], stream); + } + } + } + + private: + IdType max_num_nodes_; +}; + +template +struct CUDAIdsMapper { + std::tuple, std::vector> operator()( + const HeteroGraphPtr& graph, bool include_rhs_in_lhs, int64_t num_ntypes, + const DGLContext& ctx, const std::vector& maxNodesPerType, + const std::vector& edge_arrays, + const std::vector& src_nodes, + const std::vector& rhs_nodes, + std::vector* const lhs_nodes_ptr, + std::vector* const num_nodes_per_type_ptr) { + std::vector& lhs_nodes = *lhs_nodes_ptr; + std::vector& num_nodes_per_type = *num_nodes_per_type_ptr; + const bool generate_lhs_nodes = lhs_nodes.empty(); + auto device = runtime::DeviceAPI::Get(ctx); + cudaStream_t stream = runtime::getCurrentCUDAStream(); + + // Allocate space for map creation process. 
+ DeviceNodeMapMaker maker(maxNodesPerType); + DeviceNodeMap node_maps(maxNodesPerType, num_ntypes, ctx, stream); + if (generate_lhs_nodes) { + lhs_nodes.reserve(num_ntypes); + for (int64_t ntype = 0; ntype < num_ntypes; ++ntype) { + lhs_nodes.emplace_back( + NewIdArray(maxNodesPerType[ntype], ctx, sizeof(IdType) * 8)); + } + } + + cudaEvent_t copyEvent; + NDArray new_len_tensor; + // Populate the mappings. + if (generate_lhs_nodes) { + int64_t* count_lhs_device = static_cast( + device->AllocWorkspace(ctx, sizeof(int64_t) * num_ntypes * 2)); + + maker.Make( + src_nodes, rhs_nodes, &node_maps, count_lhs_device, &lhs_nodes, + stream); + + CUDA_CALL(cudaEventCreate(©Event)); + if (TensorDispatcher::Global()->IsAvailable()) { + new_len_tensor = NDArray::PinnedEmpty( + {num_ntypes}, DGLDataTypeTraits::dtype, + DGLContext{kDGLCPU, 0}); + } else { + // use pageable memory, it will unecessarily block but be functional + new_len_tensor = NDArray::Empty( + {num_ntypes}, DGLDataTypeTraits::dtype, + DGLContext{kDGLCPU, 0}); + } + CUDA_CALL(cudaMemcpyAsync( + new_len_tensor->data, count_lhs_device, + sizeof(*num_nodes_per_type.data()) * num_ntypes, + cudaMemcpyDeviceToHost, stream)); + CUDA_CALL(cudaEventRecord(copyEvent, stream)); + + device->FreeWorkspace(ctx, count_lhs_device); + } else { + maker.Make(lhs_nodes, rhs_nodes, &node_maps, stream); + + for (int64_t ntype = 0; ntype < num_ntypes; ++ntype) { + num_nodes_per_type[ntype] = lhs_nodes[ntype]->shape[0]; + } + } + // Map node numberings from global to local, and build pointer for CSR. + auto ret = MapEdges(graph, edge_arrays, node_maps, stream); + + if (generate_lhs_nodes) { + // wait for the previous copy + CUDA_CALL(cudaEventSynchronize(copyEvent)); + CUDA_CALL(cudaEventDestroy(copyEvent)); + + // Resize lhs nodes. + for (int64_t ntype = 0; ntype < num_ntypes; ++ntype) { + num_nodes_per_type[ntype] = + static_cast(new_len_tensor->data)[ntype]; + lhs_nodes[ntype]->shape[0] = num_nodes_per_type[ntype]; + } + } + + return ret; + } +}; + +template +std::tuple> ToBlockGPU( + HeteroGraphPtr graph, const std::vector& rhs_nodes, + bool include_rhs_in_lhs, std::vector* const lhs_nodes_ptr) { + return dgl::transform::ProcessToBlock( + graph, rhs_nodes, include_rhs_in_lhs, lhs_nodes_ptr, + CUDAIdsMapper()); +} + +} // namespace + +// Use explicit names to get around MSVC's broken mangling that thinks the +// following two functions are the same. Using template<> fails to export the +// symbols. 
+std::tuple> +// ToBlock +ToBlockGPU32( + HeteroGraphPtr graph, const std::vector& rhs_nodes, + bool include_rhs_in_lhs, std::vector* const lhs_nodes) { + return ToBlockGPU(graph, rhs_nodes, include_rhs_in_lhs, lhs_nodes); +} + +std::tuple> +// ToBlock +ToBlockGPU64( + HeteroGraphPtr graph, const std::vector& rhs_nodes, + bool include_rhs_in_lhs, std::vector* const lhs_nodes) { + return ToBlockGPU(graph, rhs_nodes, include_rhs_in_lhs, lhs_nodes); +} + +} // namespace transform +} // namespace dgl diff --git a/src/graph/transform/cuda/knn.cu b/src/graph/transform/cuda/knn.cu index 988ff4f3e9e7..606352be5a03 100644 --- a/src/graph/transform/cuda/knn.cu +++ b/src/graph/transform/cuda/knn.cu @@ -1,16 +1,17 @@ +#include "hip/hip_runtime.h" /** * Copyright (c) 2020 by Contributors * @file graph/transform/cuda/knn.cu * @brief k-nearest-neighbor (KNN) implementation (cuda) */ -#include +#include #include #include #include #include -#include // NOLINT +#include // NOLINT #include #include #include @@ -467,7 +468,7 @@ void BruteForceKNNCuda( const NDArray& data_points, const IdArray& data_offsets, const NDArray& query_points, const IdArray& query_offsets, const int k, IdArray result) { - cudaStream_t stream = runtime::getCurrentCUDAStream(); + hipStream_t stream = runtime::getCurrentCUDAStream(); const auto& ctx = data_points->ctx; auto device = runtime::DeviceAPI::Get(ctx); const int64_t batch_size = data_offsets->shape[0] - 1; @@ -512,7 +513,7 @@ void BruteForceKNNSharedCuda( const NDArray& data_points, const IdArray& data_offsets, const NDArray& query_points, const IdArray& query_offsets, const int k, IdArray result) { - cudaStream_t stream = runtime::getCurrentCUDAStream(); + hipStream_t stream = runtime::getCurrentCUDAStream(); const auto& ctx = data_points->ctx; auto device = runtime::DeviceAPI::Get(ctx); const int64_t batch_size = data_offsets->shape[0] - 1; @@ -528,8 +529,8 @@ void BruteForceKNNSharedCuda( // get max shared memory per block in bytes // determine block size according to this value int max_sharedmem_per_block = 0; - CUDA_CALL(cudaDeviceGetAttribute( - &max_sharedmem_per_block, cudaDevAttrMaxSharedMemoryPerBlock, + CUDA_CALL(hipDeviceGetAttribute( + &max_sharedmem_per_block, hipDeviceAttributeMaxSharedMemoryPerBlock, ctx.device_id)); const int64_t single_shared_mem = static_cast(Pow2Align( (k + 2 * feature_size) * sizeof(FloatType) + k * sizeof(IdType), @@ -552,17 +553,17 @@ void BruteForceKNNSharedCuda( GetNumBlockPerSegment, temp_num_blocks, temp_block_size, 0, stream, query_offsets_data, num_block_per_segment, batch_size, block_size); size_t prefix_temp_size = 0; - CUDA_CALL(cub::DeviceScan::ExclusiveSum( + CUDA_CALL(hipcub::DeviceScan::ExclusiveSum( nullptr, prefix_temp_size, num_block_per_segment, num_block_prefixsum, batch_size, stream)); void* prefix_temp = device->AllocWorkspace(ctx, prefix_temp_size); - CUDA_CALL(cub::DeviceScan::ExclusiveSum( + CUDA_CALL(hipcub::DeviceScan::ExclusiveSum( prefix_temp, prefix_temp_size, num_block_per_segment, num_block_prefixsum, batch_size, stream)); device->FreeWorkspace(ctx, prefix_temp); // wait for results - CUDA_CALL(cudaStreamSynchronize(stream)); + CUDA_CALL(hipStreamSynchronize(stream)); int64_t num_blocks = 0, final_elem = 0, copyoffset = (batch_size - 1) * sizeof(IdType); @@ -603,10 +604,10 @@ void BruteForceKNNSharedCuda( /** @brief Setup rng state for nn-descent */ __global__ void SetupRngKernel( - curandState* states, const uint64_t seed, const size_t n) { + hiprandState* states, const uint64_t seed, const size_t n) { 
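  // hipify maps the cuRAND device API used below onto hipRAND one-for-one:
  //   curandState -> hiprandState, curand_init -> hiprand_init,
  //   curand -> hiprand
  // (the seed / subsequence / offset arguments keep the same meaning).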
size_t id = blockIdx.x * blockDim.x + threadIdx.x; if (id < n) { - curand_init(seed, id, 0, states + id); + hiprand_init(seed, id, 0, states + id); } } @@ -622,8 +623,8 @@ __global__ void RandomInitNeighborsKernel( const IdType point_idx = blockIdx.x * blockDim.x + threadIdx.x; IdType batch_idx = 0; if (point_idx >= offsets[batch_size]) return; - curandState state; - curand_init(seed, point_idx, 0, &state); + hiprandState state; + hiprand_init(seed, point_idx, 0, &state); // find the segment location in the input batch for (IdType b = 0; b < batch_size + 1; ++b) { @@ -646,7 +647,7 @@ __global__ void RandomInitNeighborsKernel( current_central_nodes[i] = point_idx; } for (IdType i = k; i < segment_size; ++i) { - const IdType j = static_cast(curand(&state) % (i + 1)); + const IdType j = static_cast(hiprand(&state) % (i + 1)); if (j < k) current_neighbors[j] = i + segment_start; } @@ -674,8 +675,8 @@ __global__ void FindCandidatesKernel( const IdType point_idx = blockIdx.x * blockDim.x + threadIdx.x; IdType batch_idx = 0; if (point_idx >= offsets[batch_size]) return; - curandState state; - curand_init(seed, point_idx, 0, &state); + hiprandState state; + hiprand_init(seed, point_idx, 0, &state); // find the segment location in the input batch for (IdType b = 0; b < batch_size + 1; ++b) { @@ -711,7 +712,7 @@ __global__ void FindCandidatesKernel( if (curr_num < num_candidates) { candidate_data[curr_num] = candidate; } else { - IdType pos = static_cast(curand(&state) % (curr_num + 1)); + IdType pos = static_cast(hiprand(&state) % (curr_num + 1)); if (pos < num_candidates) candidate_data[pos] = candidate; } ++candidate_array[0]; @@ -732,7 +733,7 @@ __global__ void FindCandidatesKernel( if (curr_num < num_candidates) { candidate_data[curr_num] = reverse_candidate; } else { - IdType pos = static_cast(curand(&state) % (curr_num + 1)); + IdType pos = static_cast(hiprand(&state) % (curr_num + 1)); if (pos < num_candidates) candidate_data[pos] = reverse_candidate; } ++candidate_array[0]; @@ -873,7 +874,7 @@ template void NNDescent( const NDArray& points, const IdArray& offsets, IdArray result, const int k, const int num_iters, const int num_candidates, const double delta) { - cudaStream_t stream = runtime::getCurrentCUDAStream(); + hipStream_t stream = runtime::getCurrentCUDAStream(); const auto& ctx = points->ctx; auto device = runtime::DeviceAPI::Get(ctx); const int64_t num_nodes = points->shape[0]; @@ -887,7 +888,7 @@ void NNDescent( uint64_t seed; int warp_size = 0; CUDA_CALL( - cudaDeviceGetAttribute(&warp_size, cudaDevAttrWarpSize, ctx.device_id)); + hipDeviceGetAttribute(&warp_size, hipDeviceAttributeWarpSize, ctx.device_id)); // We don't need large block sizes, since there's not much inter-thread // communication int64_t block_size = warp_size; @@ -911,7 +912,7 @@ void NNDescent( IdType* total_num_updates_d = static_cast(device->AllocWorkspace(ctx, sizeof(IdType))); - CUDA_CALL(cub::DeviceReduce::Sum( + CUDA_CALL(hipcub::DeviceReduce::Sum( nullptr, sum_temp_size, num_updates, total_num_updates_d, num_nodes, stream)); IdType* sum_temp_storage = @@ -942,7 +943,7 @@ void NNDescent( feature_size); total_num_updates = 0; - CUDA_CALL(cub::DeviceReduce::Sum( + CUDA_CALL(hipcub::DeviceReduce::Sum( sum_temp_storage, sum_temp_size, num_updates, total_num_updates_d, num_nodes, stream)); device->CopyDataFromTo( diff --git a/src/graph/transform/cuda/knn.cu.prehip b/src/graph/transform/cuda/knn.cu.prehip new file mode 100644 index 000000000000..988ff4f3e9e7 --- /dev/null +++ 
b/src/graph/transform/cuda/knn.cu.prehip @@ -0,0 +1,997 @@ +/** + * Copyright (c) 2020 by Contributors + * @file graph/transform/cuda/knn.cu + * @brief k-nearest-neighbor (KNN) implementation (cuda) + */ + +#include +#include +#include +#include + +#include +#include // NOLINT +#include +#include +#include +#include + +#include "../../../array/cuda/utils.h" +#include "../../../runtime/cuda/cuda_common.h" +#include "../knn.h" + +namespace dgl { +namespace transform { +namespace impl { + +/** + * @brief Given input `size`, find the smallest value + * greater or equal to `size` that is a multiple of `align`. + * + * e.g. Pow2Align(17, 4) = 20, Pow2Align(17, 8) = 24 + */ +template +static __host__ __device__ std::enable_if_t::value, Type> +Pow2Align(Type size, Type align) { + if (align <= 1 || size <= 0) return size; + return ((size - 1) | (align - 1)) + 1; +} + +/** + * @brief Utility class used to avoid linker errors with extern + * unsized shared memory arrays with templated type + */ +template +struct SharedMemory { + __device__ inline operator Type*() { + extern __shared__ int __smem[]; + return reinterpret_cast(__smem); + } + + __device__ inline operator const Type*() const { + extern __shared__ int __smem[]; + return reinterpret_cast(__smem); + } +}; + +// specialize for double to avoid unaligned memory +// access compile errors +template <> +struct SharedMemory { + __device__ inline operator double*() { + extern __shared__ double __smem_d[]; + return reinterpret_cast(__smem_d); + } + + __device__ inline operator const double*() const { + extern __shared__ double __smem_d[]; + return reinterpret_cast(__smem_d); + } +}; + +/** @brief Compute Euclidean distance between two vectors in a cuda kernel */ +template +__device__ FloatType +EuclideanDist(const FloatType* vec1, const FloatType* vec2, const int64_t dim) { + FloatType dist = 0; + IdType idx = 0; + for (; idx < dim - 3; idx += 4) { + FloatType diff0 = vec1[idx] - vec2[idx]; + FloatType diff1 = vec1[idx + 1] - vec2[idx + 1]; + FloatType diff2 = vec1[idx + 2] - vec2[idx + 2]; + FloatType diff3 = vec1[idx + 3] - vec2[idx + 3]; + + dist += diff0 * diff0 + diff1 * diff1 + diff2 * diff2 + diff3 * diff3; + } + + for (; idx < dim; ++idx) { + FloatType diff = vec1[idx] - vec2[idx]; + dist += diff * diff; + } + + return dist; +} + +/** + * @brief Compute Euclidean distance between two vectors in a cuda kernel, + * return positive infinite value if the intermediate distance is greater + * than the worst distance. 
+ */ +template +__device__ FloatType EuclideanDistWithCheck( + const FloatType* vec1, const FloatType* vec2, const int64_t dim, + const FloatType worst_dist) { + FloatType dist = 0; + IdType idx = 0; + bool early_stop = false; + + for (; idx < dim - 3; idx += 4) { + FloatType diff0 = vec1[idx] - vec2[idx]; + FloatType diff1 = vec1[idx + 1] - vec2[idx + 1]; + FloatType diff2 = vec1[idx + 2] - vec2[idx + 2]; + FloatType diff3 = vec1[idx + 3] - vec2[idx + 3]; + + dist += diff0 * diff0 + diff1 * diff1 + diff2 * diff2 + diff3 * diff3; + if (dist > worst_dist) { + early_stop = true; + idx = dim; + break; + } + } + + for (; idx < dim; ++idx) { + FloatType diff = vec1[idx] - vec2[idx]; + dist += diff * diff; + if (dist > worst_dist) { + early_stop = true; + break; + } + } + + if (early_stop) { + return std::numeric_limits::max(); + } else { + return dist; + } +} + +template +__device__ void BuildHeap(IdType* indices, FloatType* dists, int size) { + for (int i = size / 2 - 1; i >= 0; --i) { + IdType idx = i; + while (true) { + IdType largest = idx; + IdType left = idx * 2 + 1; + IdType right = left + 1; + if (left < size && dists[left] > dists[largest]) { + largest = left; + } + if (right < size && dists[right] > dists[largest]) { + largest = right; + } + if (largest != idx) { + IdType tmp_idx = indices[largest]; + indices[largest] = indices[idx]; + indices[idx] = tmp_idx; + + FloatType tmp_dist = dists[largest]; + dists[largest] = dists[idx]; + dists[idx] = tmp_dist; + idx = largest; + } else { + break; + } + } + } +} + +template +__device__ void HeapInsert( + IdType* indices, FloatType* dist, IdType new_idx, FloatType new_dist, + int size, bool check_repeat = false) { + if (new_dist > dist[0]) return; + + // check if we have it + if (check_repeat) { + for (IdType i = 0; i < size; ++i) { + if (indices[i] == new_idx) return; + } + } + + IdType left = 0, right = 0, idx = 0, largest = 0; + dist[0] = new_dist; + indices[0] = new_idx; + while (true) { + left = idx * 2 + 1; + right = left + 1; + if (left < size && dist[left] > dist[largest]) { + largest = left; + } + if (right < size && dist[right] > dist[largest]) { + largest = right; + } + if (largest != idx) { + IdType tmp_idx = indices[idx]; + indices[idx] = indices[largest]; + indices[largest] = tmp_idx; + + FloatType tmp_dist = dist[idx]; + dist[idx] = dist[largest]; + dist[largest] = tmp_dist; + + idx = largest; + } else { + break; + } + } +} + +template +__device__ bool FlaggedHeapInsert( + IdType* indices, FloatType* dist, bool* flags, IdType new_idx, + FloatType new_dist, bool new_flag, int size, bool check_repeat = false) { + if (new_dist > dist[0]) return false; + + // check if we have it + if (check_repeat) { + for (IdType i = 0; i < size; ++i) { + if (indices[i] == new_idx) return false; + } + } + + IdType left = 0, right = 0, idx = 0, largest = 0; + dist[0] = new_dist; + indices[0] = new_idx; + flags[0] = new_flag; + while (true) { + left = idx * 2 + 1; + right = left + 1; + if (left < size && dist[left] > dist[largest]) { + largest = left; + } + if (right < size && dist[right] > dist[largest]) { + largest = right; + } + if (largest != idx) { + IdType tmp_idx = indices[idx]; + indices[idx] = indices[largest]; + indices[largest] = tmp_idx; + + FloatType tmp_dist = dist[idx]; + dist[idx] = dist[largest]; + dist[largest] = tmp_dist; + + bool tmp_flag = flags[idx]; + flags[idx] = flags[largest]; + flags[largest] = tmp_flag; + + idx = largest; + } else { + break; + } + } + return true; +} + +/** + * @brief Brute force kNN kernel. 
Compute distance for each pair of input points + * and get the result directly (without a distance matrix). + */ +template +__global__ void BruteforceKnnKernel( + const FloatType* data_points, const IdType* data_offsets, + const FloatType* query_points, const IdType* query_offsets, const int k, + FloatType* dists, IdType* query_out, IdType* data_out, + const int64_t num_batches, const int64_t feature_size) { + const IdType q_idx = blockIdx.x * blockDim.x + threadIdx.x; + if (q_idx >= query_offsets[num_batches]) return; + IdType batch_idx = 0; + for (IdType b = 0; b < num_batches + 1; ++b) { + if (query_offsets[b] > q_idx) { + batch_idx = b - 1; + break; + } + } + const IdType data_start = data_offsets[batch_idx], + data_end = data_offsets[batch_idx + 1]; + + for (IdType k_idx = 0; k_idx < k; ++k_idx) { + query_out[q_idx * k + k_idx] = q_idx; + dists[q_idx * k + k_idx] = std::numeric_limits::max(); + } + FloatType worst_dist = std::numeric_limits::max(); + + for (IdType d_idx = data_start; d_idx < data_end; ++d_idx) { + FloatType tmp_dist = EuclideanDistWithCheck( + query_points + q_idx * feature_size, data_points + d_idx * feature_size, + feature_size, worst_dist); + + IdType out_offset = q_idx * k; + HeapInsert( + data_out + out_offset, dists + out_offset, d_idx, tmp_dist, k); + worst_dist = dists[q_idx * k]; + } +} + +/** + * @brief Same as BruteforceKnnKernel, but use shared memory as buffer. + * This kernel divides query points and data points into blocks. For each + * query block, it will make a loop over all data blocks and compute distances. + * This kernel is faster when the dimension of input points is not large. + */ +template +__global__ void BruteforceKnnShareKernel( + const FloatType* data_points, const IdType* data_offsets, + const FloatType* query_points, const IdType* query_offsets, + const IdType* block_batch_id, const IdType* local_block_id, const int k, + FloatType* dists, IdType* query_out, IdType* data_out, + const int64_t num_batches, const int64_t feature_size) { + const IdType block_idx = static_cast(blockIdx.x); + const IdType block_size = static_cast(blockDim.x); + const IdType batch_idx = block_batch_id[block_idx]; + const IdType local_bid = local_block_id[block_idx]; + const IdType query_start = query_offsets[batch_idx] + block_size * local_bid; + const IdType query_end = + min(query_start + block_size, query_offsets[batch_idx + 1]); + if (query_start >= query_end) return; + const IdType query_idx = query_start + threadIdx.x; + const IdType data_start = data_offsets[batch_idx]; + const IdType data_end = data_offsets[batch_idx + 1]; + + // shared memory: points in block + distance buffer + result buffer + FloatType* data_buff = SharedMemory(); + FloatType* query_buff = data_buff + block_size * feature_size; + FloatType* dist_buff = query_buff + block_size * feature_size; + IdType* res_buff = reinterpret_cast(Pow2Align( + reinterpret_cast(dist_buff + block_size * k), sizeof(IdType))); + FloatType worst_dist = std::numeric_limits::max(); + + // initialize dist buff with inf value + for (auto i = 0; i < k; ++i) { + dist_buff[threadIdx.x + i * block_size] = + std::numeric_limits::max(); + } + + // load query data to shared memory + // TODO(tianqi): could be better here to exploit coalesce global memory + // access. 
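+  // Layout note: the query buffer is stored transposed, so element
+  // (thread t, dimension i) lives at query_buff[t + i * block_size]; threads
+  // of a warp then hit consecutive addresses (distinct banks) when they all
+  // read the same dimension, while data_buff stays row-major per point.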
+ if (query_idx < query_end) { + for (auto i = 0; i < feature_size; ++i) { + // to avoid bank conflict, we use transpose here + query_buff[threadIdx.x + i * block_size] = + query_points[query_idx * feature_size + i]; + } + } + + // perform computation on each tile + for (auto tile_start = data_start; tile_start < data_end; + tile_start += block_size) { + // each thread load one data point into the shared memory + IdType load_idx = tile_start + threadIdx.x; + if (load_idx < data_end) { + for (auto i = 0; i < feature_size; ++i) { + data_buff[threadIdx.x * feature_size + i] = + data_points[load_idx * feature_size + i]; + } + } + __syncthreads(); + + // compute distance for one tile + IdType true_block_size = min(data_end - tile_start, block_size); + if (query_idx < query_end) { + for (IdType d_idx = 0; d_idx < true_block_size; ++d_idx) { + FloatType tmp_dist = 0; + bool early_stop = false; + IdType dim_idx = 0; + + for (; dim_idx < feature_size - 3; dim_idx += 4) { + FloatType diff0 = query_buff[threadIdx.x + block_size * (dim_idx)] - + data_buff[d_idx * feature_size + dim_idx]; + FloatType diff1 = + query_buff[threadIdx.x + block_size * (dim_idx + 1)] - + data_buff[d_idx * feature_size + dim_idx + 1]; + FloatType diff2 = + query_buff[threadIdx.x + block_size * (dim_idx + 2)] - + data_buff[d_idx * feature_size + dim_idx + 2]; + FloatType diff3 = + query_buff[threadIdx.x + block_size * (dim_idx + 3)] - + data_buff[d_idx * feature_size + dim_idx + 3]; + + tmp_dist += + diff0 * diff0 + diff1 * diff1 + diff2 * diff2 + diff3 * diff3; + + if (tmp_dist > worst_dist) { + early_stop = true; + dim_idx = feature_size; + break; + } + } + + for (; dim_idx < feature_size; ++dim_idx) { + const FloatType diff = + query_buff[threadIdx.x + dim_idx * block_size] - + data_buff[d_idx * feature_size + dim_idx]; + tmp_dist += diff * diff; + + if (tmp_dist > worst_dist) { + early_stop = true; + break; + } + } + + if (early_stop) continue; + + HeapInsert( + res_buff + threadIdx.x * k, dist_buff + threadIdx.x * k, + d_idx + tile_start, tmp_dist, k); + worst_dist = dist_buff[threadIdx.x * k]; + } + } + __syncthreads(); + } + + // copy result to global memory + if (query_idx < query_end) { + for (auto i = 0; i < k; ++i) { + dists[query_idx * k + i] = dist_buff[threadIdx.x * k + i]; + data_out[query_idx * k + i] = res_buff[threadIdx.x * k + i]; + query_out[query_idx * k + i] = query_idx; + } + } +} + +/** @brief determine the number of blocks for each segment */ +template +__global__ void GetNumBlockPerSegment( + const IdType* offsets, IdType* out, const int64_t batch_size, + const int64_t block_size) { + const IdType idx = blockIdx.x * blockDim.x + threadIdx.x; + if (idx < batch_size) { + out[idx] = (offsets[idx + 1] - offsets[idx] - 1) / block_size + 1; + } +} + +/** @brief Get the batch index and local index in segment for each block */ +template +__global__ void GetBlockInfo( + const IdType* num_block_prefixsum, IdType* block_batch_id, + IdType* local_block_id, size_t batch_size, size_t num_blocks) { + const IdType idx = blockIdx.x * blockDim.x + threadIdx.x; + IdType i = 0; + + if (idx < num_blocks) { + for (; i < batch_size; ++i) { + if (num_block_prefixsum[i] > idx) break; + } + i--; + block_batch_id[idx] = i; + local_block_id[idx] = idx - num_block_prefixsum[i]; + } +} + +/** + * @brief Brute force kNN. Compute distance for each pair of input points and + * get the result directly (without a distance matrix). + * + * @tparam FloatType The type of input points. + * @tparam IdType The type of id. 
+ * @param data_points NDArray of dataset points. + * @param data_offsets offsets of point index in data points. + * @param query_points NDArray of query points + * @param query_offsets offsets of point index in query points. + * @param k the number of nearest points + * @param result output array + */ +template +void BruteForceKNNCuda( + const NDArray& data_points, const IdArray& data_offsets, + const NDArray& query_points, const IdArray& query_offsets, const int k, + IdArray result) { + cudaStream_t stream = runtime::getCurrentCUDAStream(); + const auto& ctx = data_points->ctx; + auto device = runtime::DeviceAPI::Get(ctx); + const int64_t batch_size = data_offsets->shape[0] - 1; + const int64_t feature_size = data_points->shape[1]; + const IdType* data_offsets_data = data_offsets.Ptr(); + const IdType* query_offsets_data = query_offsets.Ptr(); + const FloatType* data_points_data = data_points.Ptr(); + const FloatType* query_points_data = query_points.Ptr(); + IdType* query_out = result.Ptr(); + IdType* data_out = query_out + k * query_points->shape[0]; + + FloatType* dists = static_cast(device->AllocWorkspace( + ctx, k * query_points->shape[0] * sizeof(FloatType))); + + const int64_t block_size = cuda::FindNumThreads(query_points->shape[0]); + const int64_t num_blocks = (query_points->shape[0] - 1) / block_size + 1; + CUDA_KERNEL_CALL( + BruteforceKnnKernel, num_blocks, block_size, 0, stream, data_points_data, + data_offsets_data, query_points_data, query_offsets_data, k, dists, + query_out, data_out, batch_size, feature_size); + + device->FreeWorkspace(ctx, dists); +} + +/** + * @brief Brute force kNN with shared memory. + * This function divides query points and data points into blocks. For each + * query block, it will make a loop over all data blocks and compute distances. + * It will be faster when the dimension of input points is not large. + * + * @tparam FloatType The type of input points. + * @tparam IdType The type of id. + * @param data_points NDArray of dataset points. + * @param data_offsets offsets of point index in data points. + * @param query_points NDArray of query points + * @param query_offsets offsets of point index in query points. 
+ * @param k the number of nearest points + * @param result output array + */ +template +void BruteForceKNNSharedCuda( + const NDArray& data_points, const IdArray& data_offsets, + const NDArray& query_points, const IdArray& query_offsets, const int k, + IdArray result) { + cudaStream_t stream = runtime::getCurrentCUDAStream(); + const auto& ctx = data_points->ctx; + auto device = runtime::DeviceAPI::Get(ctx); + const int64_t batch_size = data_offsets->shape[0] - 1; + const int64_t feature_size = data_points->shape[1]; + const IdType* data_offsets_data = data_offsets.Ptr(); + const IdType* query_offsets_data = query_offsets.Ptr(); + const FloatType* data_points_data = data_points.Ptr(); + const FloatType* query_points_data = query_points.Ptr(); + IdType* query_out = result.Ptr(); + IdType* data_out = query_out + k * query_points->shape[0]; + constexpr size_t smem_align = std::max(sizeof(IdType), sizeof(FloatType)); + + // get max shared memory per block in bytes + // determine block size according to this value + int max_sharedmem_per_block = 0; + CUDA_CALL(cudaDeviceGetAttribute( + &max_sharedmem_per_block, cudaDevAttrMaxSharedMemoryPerBlock, + ctx.device_id)); + const int64_t single_shared_mem = static_cast(Pow2Align( + (k + 2 * feature_size) * sizeof(FloatType) + k * sizeof(IdType), + smem_align)); + + const int64_t block_size = + cuda::FindNumThreads(max_sharedmem_per_block / single_shared_mem); + + // Determine the number of blocks. We first get the number of blocks for each + // segment. Then we get the block id offset via prefix sum. + IdType* num_block_per_segment = static_cast( + device->AllocWorkspace(ctx, batch_size * sizeof(IdType))); + IdType* num_block_prefixsum = static_cast( + device->AllocWorkspace(ctx, batch_size * sizeof(IdType))); + + // block size for GetNumBlockPerSegment computation + int64_t temp_block_size = cuda::FindNumThreads(batch_size); + int64_t temp_num_blocks = (batch_size - 1) / temp_block_size + 1; + CUDA_KERNEL_CALL( + GetNumBlockPerSegment, temp_num_blocks, temp_block_size, 0, stream, + query_offsets_data, num_block_per_segment, batch_size, block_size); + size_t prefix_temp_size = 0; + CUDA_CALL(cub::DeviceScan::ExclusiveSum( + nullptr, prefix_temp_size, num_block_per_segment, num_block_prefixsum, + batch_size, stream)); + void* prefix_temp = device->AllocWorkspace(ctx, prefix_temp_size); + CUDA_CALL(cub::DeviceScan::ExclusiveSum( + prefix_temp, prefix_temp_size, num_block_per_segment, num_block_prefixsum, + batch_size, stream)); + device->FreeWorkspace(ctx, prefix_temp); + + // wait for results + CUDA_CALL(cudaStreamSynchronize(stream)); + + int64_t num_blocks = 0, final_elem = 0, + copyoffset = (batch_size - 1) * sizeof(IdType); + device->CopyDataFromTo( + num_block_prefixsum, copyoffset, &num_blocks, 0, sizeof(IdType), ctx, + DGLContext{kDGLCPU, 0}, query_offsets->dtype); + device->CopyDataFromTo( + num_block_per_segment, copyoffset, &final_elem, 0, sizeof(IdType), ctx, + DGLContext{kDGLCPU, 0}, query_offsets->dtype); + num_blocks += final_elem; + device->FreeWorkspace(ctx, num_block_per_segment); + + // get batch id and local id in segment + temp_block_size = cuda::FindNumThreads(num_blocks); + temp_num_blocks = (num_blocks - 1) / temp_block_size + 1; + IdType* block_batch_id = static_cast( + device->AllocWorkspace(ctx, num_blocks * sizeof(IdType))); + IdType* local_block_id = static_cast( + device->AllocWorkspace(ctx, num_blocks * sizeof(IdType))); + CUDA_KERNEL_CALL( + GetBlockInfo, temp_num_blocks, temp_block_size, 0, stream, + 
num_block_prefixsum, block_batch_id, local_block_id, batch_size, + num_blocks); + + FloatType* dists = static_cast(device->AllocWorkspace( + ctx, k * query_points->shape[0] * sizeof(FloatType))); + CUDA_KERNEL_CALL( + BruteforceKnnShareKernel, num_blocks, block_size, + single_shared_mem * block_size, stream, data_points_data, + data_offsets_data, query_points_data, query_offsets_data, block_batch_id, + local_block_id, k, dists, query_out, data_out, batch_size, feature_size); + + device->FreeWorkspace(ctx, num_block_prefixsum); + device->FreeWorkspace(ctx, dists); + device->FreeWorkspace(ctx, local_block_id); + device->FreeWorkspace(ctx, block_batch_id); +} + +/** @brief Setup rng state for nn-descent */ +__global__ void SetupRngKernel( + curandState* states, const uint64_t seed, const size_t n) { + size_t id = blockIdx.x * blockDim.x + threadIdx.x; + if (id < n) { + curand_init(seed, id, 0, states + id); + } +} + +/** + * @brief Randomly initialize neighbors (sampling without replacement) + * for each nodes + */ +template +__global__ void RandomInitNeighborsKernel( + const FloatType* points, const IdType* offsets, IdType* central_nodes, + IdType* neighbors, FloatType* dists, bool* flags, const int k, + const int64_t feature_size, const int64_t batch_size, const uint64_t seed) { + const IdType point_idx = blockIdx.x * blockDim.x + threadIdx.x; + IdType batch_idx = 0; + if (point_idx >= offsets[batch_size]) return; + curandState state; + curand_init(seed, point_idx, 0, &state); + + // find the segment location in the input batch + for (IdType b = 0; b < batch_size + 1; ++b) { + if (offsets[b] > point_idx) { + batch_idx = b - 1; + break; + } + } + + const IdType segment_size = offsets[batch_idx + 1] - offsets[batch_idx]; + IdType* current_neighbors = neighbors + point_idx * k; + IdType* current_central_nodes = central_nodes + point_idx * k; + bool* current_flags = flags + point_idx * k; + FloatType* current_dists = dists + point_idx * k; + IdType segment_start = offsets[batch_idx]; + + // reservoir sampling + for (IdType i = 0; i < k; ++i) { + current_neighbors[i] = i + segment_start; + current_central_nodes[i] = point_idx; + } + for (IdType i = k; i < segment_size; ++i) { + const IdType j = static_cast(curand(&state) % (i + 1)); + if (j < k) current_neighbors[j] = i + segment_start; + } + + // compute distances and set flags + for (IdType i = 0; i < k; ++i) { + current_flags[i] = true; + current_dists[i] = EuclideanDist( + points + point_idx * feature_size, + points + current_neighbors[i] * feature_size, feature_size); + } + + // build heap + BuildHeap(neighbors + point_idx * k, current_dists, k); +} + +/** + * @brief Randomly select candidates from current knn and reverse-knn graph for + * nn-descent. 
+ */ +template +__global__ void FindCandidatesKernel( + const IdType* offsets, IdType* new_candidates, IdType* old_candidates, + IdType* neighbors, bool* flags, const uint64_t seed, + const int64_t batch_size, const int num_candidates, const int k) { + const IdType point_idx = blockIdx.x * blockDim.x + threadIdx.x; + IdType batch_idx = 0; + if (point_idx >= offsets[batch_size]) return; + curandState state; + curand_init(seed, point_idx, 0, &state); + + // find the segment location in the input batch + for (IdType b = 0; b < batch_size + 1; ++b) { + if (offsets[b] > point_idx) { + batch_idx = b - 1; + break; + } + } + + IdType segment_start = offsets[batch_idx], + segment_end = offsets[batch_idx + 1]; + IdType* current_neighbors = neighbors + point_idx * k; + bool* current_flags = flags + point_idx * k; + + // reset candidates + IdType* new_candidates_ptr = + new_candidates + point_idx * (num_candidates + 1); + IdType* old_candidates_ptr = + old_candidates + point_idx * (num_candidates + 1); + new_candidates_ptr[0] = 0; + old_candidates_ptr[0] = 0; + + // select candidates from current knn graph + // here we use candidate[0] for reservoir sampling temporarily + for (IdType i = 0; i < k; ++i) { + IdType candidate = current_neighbors[i]; + IdType* candidate_array = + current_flags[i] ? new_candidates_ptr : old_candidates_ptr; + IdType curr_num = candidate_array[0]; + IdType* candidate_data = candidate_array + 1; + + // reservoir sampling + if (curr_num < num_candidates) { + candidate_data[curr_num] = candidate; + } else { + IdType pos = static_cast(curand(&state) % (curr_num + 1)); + if (pos < num_candidates) candidate_data[pos] = candidate; + } + ++candidate_array[0]; + } + + // select candidates from current reverse knn graph + // here we use candidate[0] for reservoir sampling temporarily + IdType index_start = segment_start * k, index_end = segment_end * k; + for (IdType i = index_start; i < index_end; ++i) { + if (neighbors[i] == point_idx) { + IdType reverse_candidate = (i - index_start) / k + segment_start; + IdType* candidate_array = + flags[i] ? 
new_candidates_ptr : old_candidates_ptr; + IdType curr_num = candidate_array[0]; + IdType* candidate_data = candidate_array + 1; + + // reservoir sampling + if (curr_num < num_candidates) { + candidate_data[curr_num] = reverse_candidate; + } else { + IdType pos = static_cast(curand(&state) % (curr_num + 1)); + if (pos < num_candidates) candidate_data[pos] = reverse_candidate; + } + ++candidate_array[0]; + } + } + + // set candidate[0] back to length + if (new_candidates_ptr[0] > num_candidates) + new_candidates_ptr[0] = num_candidates; + if (old_candidates_ptr[0] > num_candidates) + old_candidates_ptr[0] = num_candidates; + + // mark new_candidates as old + IdType num_new_candidates = new_candidates_ptr[0]; + for (IdType i = 0; i < k; ++i) { + IdType neighbor_idx = current_neighbors[i]; + + if (current_flags[i]) { + for (IdType j = 1; j < num_new_candidates + 1; ++j) { + if (new_candidates_ptr[j] == neighbor_idx) { + current_flags[i] = false; + break; + } + } + } + } +} + +/** @brief Update knn graph according to selected candidates for nn-descent */ +template +__global__ void UpdateNeighborsKernel( + const FloatType* points, const IdType* offsets, IdType* neighbors, + IdType* new_candidates, IdType* old_candidates, FloatType* distances, + bool* flags, IdType* num_updates, const int64_t batch_size, + const int num_candidates, const int k, const int64_t feature_size) { + const IdType point_idx = blockIdx.x * blockDim.x + threadIdx.x; + if (point_idx >= offsets[batch_size]) return; + IdType* current_neighbors = neighbors + point_idx * k; + bool* current_flags = flags + point_idx * k; + FloatType* current_dists = distances + point_idx * k; + IdType* new_candidates_ptr = + new_candidates + point_idx * (num_candidates + 1); + IdType* old_candidates_ptr = + old_candidates + point_idx * (num_candidates + 1); + IdType num_new_candidates = new_candidates_ptr[0]; + IdType num_old_candidates = old_candidates_ptr[0]; + IdType current_num_updates = 0; + + // process new candidates + for (IdType i = 1; i <= num_new_candidates; ++i) { + IdType new_c = new_candidates_ptr[i]; + + // new/old candidates of the current new candidate + IdType* twohop_new_ptr = new_candidates + new_c * (num_candidates + 1); + IdType* twohop_old_ptr = old_candidates + new_c * (num_candidates + 1); + IdType num_twohop_new = twohop_new_ptr[0]; + IdType num_twohop_old = twohop_old_ptr[0]; + FloatType worst_dist = current_dists[0]; + + // new - new + for (IdType j = 1; j <= num_twohop_new; ++j) { + IdType twohop_new_c = twohop_new_ptr[j]; + FloatType new_dist = EuclideanDistWithCheck( + points + point_idx * feature_size, + points + twohop_new_c * feature_size, feature_size, worst_dist); + + if (FlaggedHeapInsert( + current_neighbors, current_dists, current_flags, twohop_new_c, + new_dist, true, k, true)) { + ++current_num_updates; + worst_dist = current_dists[0]; + } + } + + // new - old + for (IdType j = 1; j <= num_twohop_old; ++j) { + IdType twohop_old_c = twohop_old_ptr[j]; + FloatType new_dist = EuclideanDistWithCheck( + points + point_idx * feature_size, + points + twohop_old_c * feature_size, feature_size, worst_dist); + + if (FlaggedHeapInsert( + current_neighbors, current_dists, current_flags, twohop_old_c, + new_dist, true, k, true)) { + ++current_num_updates; + worst_dist = current_dists[0]; + } + } + } + + // process old candidates + for (IdType i = 1; i <= num_old_candidates; ++i) { + IdType old_c = old_candidates_ptr[i]; + + // new candidates of the current old candidate + IdType* twohop_new_ptr = new_candidates + 
old_c * (num_candidates + 1); + IdType num_twohop_new = twohop_new_ptr[0]; + FloatType worst_dist = current_dists[0]; + + // old - new + for (IdType j = 1; j <= num_twohop_new; ++j) { + IdType twohop_new_c = twohop_new_ptr[j]; + FloatType new_dist = EuclideanDistWithCheck( + points + point_idx * feature_size, + points + twohop_new_c * feature_size, feature_size, worst_dist); + + if (FlaggedHeapInsert( + current_neighbors, current_dists, current_flags, twohop_new_c, + new_dist, true, k, true)) { + ++current_num_updates; + worst_dist = current_dists[0]; + } + } + } + + num_updates[point_idx] = current_num_updates; +} + +} // namespace impl + +template +void KNN( + const NDArray& data_points, const IdArray& data_offsets, + const NDArray& query_points, const IdArray& query_offsets, const int k, + IdArray result, const std::string& algorithm) { + if (algorithm == std::string("bruteforce")) { + impl::BruteForceKNNCuda( + data_points, data_offsets, query_points, query_offsets, k, result); + } else if (algorithm == std::string("bruteforce-sharemem")) { + impl::BruteForceKNNSharedCuda( + data_points, data_offsets, query_points, query_offsets, k, result); + } else { + LOG(FATAL) << "Algorithm " << algorithm << " is not supported on CUDA."; + } +} + +template +void NNDescent( + const NDArray& points, const IdArray& offsets, IdArray result, const int k, + const int num_iters, const int num_candidates, const double delta) { + cudaStream_t stream = runtime::getCurrentCUDAStream(); + const auto& ctx = points->ctx; + auto device = runtime::DeviceAPI::Get(ctx); + const int64_t num_nodes = points->shape[0]; + const int64_t feature_size = points->shape[1]; + const int64_t batch_size = offsets->shape[0] - 1; + const IdType* offsets_data = offsets.Ptr(); + const FloatType* points_data = points.Ptr(); + + IdType* central_nodes = result.Ptr(); + IdType* neighbors = central_nodes + k * num_nodes; + uint64_t seed; + int warp_size = 0; + CUDA_CALL( + cudaDeviceGetAttribute(&warp_size, cudaDevAttrWarpSize, ctx.device_id)); + // We don't need large block sizes, since there's not much inter-thread + // communication + int64_t block_size = warp_size; + int64_t num_blocks = (num_nodes - 1) / block_size + 1; + + // allocate space for candidates, distances and flags + // we use the first element in candidate array to represent length + IdType* new_candidates = static_cast(device->AllocWorkspace( + ctx, num_nodes * (num_candidates + 1) * sizeof(IdType))); + IdType* old_candidates = static_cast(device->AllocWorkspace( + ctx, num_nodes * (num_candidates + 1) * sizeof(IdType))); + IdType* num_updates = static_cast( + device->AllocWorkspace(ctx, num_nodes * sizeof(IdType))); + FloatType* distances = static_cast( + device->AllocWorkspace(ctx, num_nodes * k * sizeof(IdType))); + bool* flags = static_cast( + device->AllocWorkspace(ctx, num_nodes * k * sizeof(IdType))); + + size_t sum_temp_size = 0; + IdType total_num_updates = 0; + IdType* total_num_updates_d = + static_cast(device->AllocWorkspace(ctx, sizeof(IdType))); + + CUDA_CALL(cub::DeviceReduce::Sum( + nullptr, sum_temp_size, num_updates, total_num_updates_d, num_nodes, + stream)); + IdType* sum_temp_storage = + static_cast(device->AllocWorkspace(ctx, sum_temp_size)); + + // random initialize neighbors + seed = RandomEngine::ThreadLocal()->RandInt( + std::numeric_limits::max()); + CUDA_KERNEL_CALL( + impl::RandomInitNeighborsKernel, num_blocks, block_size, 0, stream, + points_data, offsets_data, central_nodes, neighbors, distances, flags, k, + feature_size, batch_size, 
seed); + + for (int i = 0; i < num_iters; ++i) { + // select candidates + seed = RandomEngine::ThreadLocal()->RandInt( + std::numeric_limits::max()); + CUDA_KERNEL_CALL( + impl::FindCandidatesKernel, num_blocks, block_size, 0, stream, + offsets_data, new_candidates, old_candidates, neighbors, flags, seed, + batch_size, num_candidates, k); + + // update + CUDA_KERNEL_CALL( + impl::UpdateNeighborsKernel, num_blocks, block_size, 0, stream, + points_data, offsets_data, neighbors, new_candidates, old_candidates, + distances, flags, num_updates, batch_size, num_candidates, k, + feature_size); + + total_num_updates = 0; + CUDA_CALL(cub::DeviceReduce::Sum( + sum_temp_storage, sum_temp_size, num_updates, total_num_updates_d, + num_nodes, stream)); + device->CopyDataFromTo( + total_num_updates_d, 0, &total_num_updates, 0, sizeof(IdType), ctx, + DGLContext{kDGLCPU, 0}, offsets->dtype); + + if (total_num_updates <= static_cast(delta * k * num_nodes)) { + break; + } + } + + device->FreeWorkspace(ctx, new_candidates); + device->FreeWorkspace(ctx, old_candidates); + device->FreeWorkspace(ctx, num_updates); + device->FreeWorkspace(ctx, distances); + device->FreeWorkspace(ctx, flags); + device->FreeWorkspace(ctx, total_num_updates_d); + device->FreeWorkspace(ctx, sum_temp_storage); +} + +template void KNN( + const NDArray& data_points, const IdArray& data_offsets, + const NDArray& query_points, const IdArray& query_offsets, const int k, + IdArray result, const std::string& algorithm); +template void KNN( + const NDArray& data_points, const IdArray& data_offsets, + const NDArray& query_points, const IdArray& query_offsets, const int k, + IdArray result, const std::string& algorithm); +template void KNN( + const NDArray& data_points, const IdArray& data_offsets, + const NDArray& query_points, const IdArray& query_offsets, const int k, + IdArray result, const std::string& algorithm); +template void KNN( + const NDArray& data_points, const IdArray& data_offsets, + const NDArray& query_points, const IdArray& query_offsets, const int k, + IdArray result, const std::string& algorithm); + +template void NNDescent( + const NDArray& points, const IdArray& offsets, IdArray result, const int k, + const int num_iters, const int num_candidates, const double delta); +template void NNDescent( + const NDArray& points, const IdArray& offsets, IdArray result, const int k, + const int num_iters, const int num_candidates, const double delta); +template void NNDescent( + const NDArray& points, const IdArray& offsets, IdArray result, const int k, + const int num_iters, const int num_candidates, const double delta); +template void NNDescent( + const NDArray& points, const IdArray& offsets, IdArray result, const int k, + const int num_iters, const int num_candidates, const double delta); + +} // namespace transform +} // namespace dgl diff --git a/src/graph/transform/to_block.cc b/src/graph/transform/to_block.cc index 00f4769a9a25..963a6dfe0402 100644 --- a/src/graph/transform/to_block.cc +++ b/src/graph/transform/to_block.cc @@ -314,7 +314,7 @@ std::tuple> ToBlock( return ToBlockCPU(graph, rhs_nodes, include_rhs_in_lhs, lhs_nodes); } -#ifdef DGL_USE_CUDA +#ifdef DGL_USE_ROCM // Forward declaration of GPU ToBlock implementations - actual implementation is // in @@ -343,7 +343,7 @@ std::tuple> ToBlock( return ToBlockGPU64(graph, rhs_nodes, include_rhs_in_lhs, lhs_nodes); } -#endif // DGL_USE_CUDA +#endif // DGL_USE_ROCM DGL_REGISTER_GLOBAL("capi._CAPI_DGLToBlock") .set_body([](DGLArgs args, DGLRetValue *rv) { diff --git 
a/src/graph/transform/to_block.cc.prehip b/src/graph/transform/to_block.cc.prehip new file mode 100644 index 000000000000..00f4769a9a25 --- /dev/null +++ b/src/graph/transform/to_block.cc.prehip @@ -0,0 +1,383 @@ +/** + * Copyright 2019-2021 Contributors + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + * + * @file graph/transform/to_block.cc + * @brief Convert a graph to a bipartite-structured graph. + * + * Tested via python wrapper: python/dgl/path/to/to_block.py + */ + +#include "to_block.h" + +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include + +#include "../../array/cpu/concurrent_id_hash_map.h" + +namespace dgl { + +using namespace dgl::runtime; +using namespace dgl::aten; + +namespace transform { + +namespace { + +template +struct CPUIdsMapper { + std::tuple, std::vector> operator()( + const HeteroGraphPtr &graph, bool include_rhs_in_lhs, int64_t num_ntypes, + const DGLContext &ctx, const std::vector &max_nodes_per_type, + const std::vector &edge_arrays, + const std::vector &src_nodes, + const std::vector &rhs_nodes, + std::vector *const lhs_nodes_ptr, + std::vector *const num_nodes_per_type_ptr) { + std::vector &lhs_nodes = *lhs_nodes_ptr; + std::vector &num_nodes_per_type = *num_nodes_per_type_ptr; + + const bool generate_lhs_nodes = lhs_nodes.empty(); + if (generate_lhs_nodes) { + lhs_nodes.reserve(num_ntypes); + } + + std::vector> lhs_nodes_map(num_ntypes); + std::vector> rhs_nodes_map(num_ntypes); + for (int64_t ntype = 0; ntype < num_ntypes; ++ntype) { + IdArray unique_ids = + aten::NullArray(DGLDataTypeTraits::dtype, ctx); + if (!aten::IsNullArray(src_nodes[ntype])) { + auto num_seeds = include_rhs_in_lhs ? rhs_nodes[ntype]->shape[0] : 0; + unique_ids = lhs_nodes_map[ntype].Init(src_nodes[ntype], num_seeds); + } + if (generate_lhs_nodes) { + num_nodes_per_type[ntype] = unique_ids->shape[0]; + lhs_nodes.emplace_back(unique_ids); + } + } + + // Skip rhs mapping construction to save efforts when rhs is already + // contained in lhs. + if (!include_rhs_in_lhs) { + for (int64_t ntype = 0; ntype < num_ntypes; ++ntype) { + if (!aten::IsNullArray(rhs_nodes[ntype])) { + rhs_nodes_map[ntype].Init( + rhs_nodes[ntype], rhs_nodes[ntype]->shape[0]); + } + } + } + + // Map node numberings from global to local, and build pointer for CSR. 
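+    // Each edge type remaps its global endpoint IDs through the per-type hash
+    // maps built above; essentially:
+    //   new_lhs[etype] = lhs_nodes_map[src_type].MapIds(edges.src);
+    //   new_rhs[etype] = include_rhs_in_lhs
+    //       ? lhs_nodes_map[dst_type].MapIds(edges.dst)
+    //       : rhs_nodes_map[dst_type].MapIds(edges.dst);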
+ std::vector new_lhs; + std::vector new_rhs; + new_lhs.reserve(edge_arrays.size()); + new_rhs.reserve(edge_arrays.size()); + const int64_t num_etypes = static_cast(edge_arrays.size()); + for (int64_t etype = 0; etype < num_etypes; ++etype) { + const EdgeArray &edges = edge_arrays[etype]; + if (edges.id.defined() && !aten::IsNullArray(edges.src)) { + const auto src_dst_types = graph->GetEndpointTypes(etype); + const int src_type = src_dst_types.first; + const int dst_type = src_dst_types.second; + new_lhs.emplace_back(lhs_nodes_map[src_type].MapIds(edges.src)); + if (include_rhs_in_lhs) { + new_rhs.emplace_back(lhs_nodes_map[dst_type].MapIds(edges.dst)); + } else { + new_rhs.emplace_back(rhs_nodes_map[dst_type].MapIds(edges.dst)); + } + } else { + new_lhs.emplace_back( + aten::NullArray(DGLDataTypeTraits::dtype, ctx)); + new_rhs.emplace_back( + aten::NullArray(DGLDataTypeTraits::dtype, ctx)); + } + } + return std::tuple, std::vector>( + std::move(new_lhs), std::move(new_rhs)); + } +}; + +// Since partial specialization is not allowed for functions, use this as an +// intermediate for ToBlock where XPU = kDGLCPU. +template +std::tuple> ToBlockCPU( + HeteroGraphPtr graph, const std::vector &rhs_nodes, + bool include_rhs_in_lhs, std::vector *const lhs_nodes_ptr) { + return dgl::transform::ProcessToBlock( + graph, rhs_nodes, include_rhs_in_lhs, lhs_nodes_ptr, + CPUIdsMapper()); +} + +} // namespace + +template +std::tuple> ProcessToBlock( + HeteroGraphPtr graph, const std::vector &rhs_nodes, + bool include_rhs_in_lhs, std::vector *const lhs_nodes_ptr, + IdsMapper &&ids_mapper) { + std::vector &lhs_nodes = *lhs_nodes_ptr; + const bool generate_lhs_nodes = lhs_nodes.empty(); + + const auto &ctx = graph->Context(); + auto device = runtime::DeviceAPI::Get(ctx); + + // Since DST nodes are included in SRC nodes, a common requirement is to fetch + // the DST node features from the SRC nodes features. To avoid expensive + // sparse lookup, the function assures that the DST nodes in both SRC and DST + // sets have the same ids. As a result, given the node feature tensor ``X`` of + // type ``utype``, the following code finds the corresponding DST node + // features of type ``vtype``: + + const int64_t num_etypes = graph->NumEdgeTypes(); + const int64_t num_ntypes = graph->NumVertexTypes(); + + CHECK(rhs_nodes.size() == static_cast(num_ntypes)) + << "rhs_nodes not given for every node type"; + + std::vector edge_arrays(num_etypes); + for (int64_t etype = 0; etype < num_etypes; ++etype) { + const auto src_dst_types = graph->GetEndpointTypes(etype); + const dgl_type_t dsttype = src_dst_types.second; + if (!aten::IsNullArray(rhs_nodes[dsttype])) { + edge_arrays[etype] = graph->Edges(etype); + } + } + + // Count lhs and rhs nodes. + std::vector maxNodesPerType(num_ntypes * 2, 0); + for (int64_t ntype = 0; ntype < num_ntypes; ++ntype) { + maxNodesPerType[ntype + num_ntypes] += rhs_nodes[ntype]->shape[0]; + + if (generate_lhs_nodes) { + if (include_rhs_in_lhs) { + maxNodesPerType[ntype] += rhs_nodes[ntype]->shape[0]; + } + } else { + maxNodesPerType[ntype] += lhs_nodes[ntype]->shape[0]; + } + } + if (generate_lhs_nodes) { + // We don't have lhs_nodes, see we need to count inbound edges to get an + // upper bound. 
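+    // E.g. (hypothetical node type "user"): with include_rhs_in_lhs and one
+    // edge type carrying E edges whose sources are "user" nodes,
+    // maxNodesPerType["user"] becomes |rhs_nodes["user"]| + E. This is only an
+    // upper bound; the mapper later shrinks it to the number of unique IDs.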
+ for (int64_t etype = 0; etype < num_etypes; ++etype) { + const auto src_dst_types = graph->GetEndpointTypes(etype); + const dgl_type_t srctype = src_dst_types.first; + if (edge_arrays[etype].src.defined()) { + maxNodesPerType[srctype] += edge_arrays[etype].src->shape[0]; + } + } + } + + // Gather lhs_nodes. + std::vector src_nodes(num_ntypes); + if (generate_lhs_nodes) { + std::vector src_node_offsets(num_ntypes, 0); + for (int64_t ntype = 0; ntype < num_ntypes; ++ntype) { + src_nodes[ntype] = + NewIdArray(maxNodesPerType[ntype], ctx, sizeof(IdType) * 8); + if (include_rhs_in_lhs) { + // Place rhs nodes first. + device->CopyDataFromTo( + rhs_nodes[ntype].Ptr(), 0, src_nodes[ntype].Ptr(), + src_node_offsets[ntype], + sizeof(IdType) * rhs_nodes[ntype]->shape[0], rhs_nodes[ntype]->ctx, + src_nodes[ntype]->ctx, rhs_nodes[ntype]->dtype); + src_node_offsets[ntype] += sizeof(IdType) * rhs_nodes[ntype]->shape[0]; + } + } + for (int64_t etype = 0; etype < num_etypes; ++etype) { + const auto src_dst_types = graph->GetEndpointTypes(etype); + const dgl_type_t srctype = src_dst_types.first; + if (edge_arrays[etype].src.defined()) { + device->CopyDataFromTo( + edge_arrays[etype].src.Ptr(), 0, + src_nodes[srctype].Ptr(), src_node_offsets[srctype], + sizeof(IdType) * edge_arrays[etype].src->shape[0], + rhs_nodes[srctype]->ctx, src_nodes[srctype]->ctx, + rhs_nodes[srctype]->dtype); + + src_node_offsets[srctype] += + sizeof(IdType) * edge_arrays[etype].src->shape[0]; + } + } + } else { + for (int64_t ntype = 0; ntype < num_ntypes; ++ntype) { + src_nodes[ntype] = lhs_nodes[ntype]; + } + } + + std::vector num_nodes_per_type(num_ntypes * 2); + // Populate RHS nodes from what we already know. + for (int64_t ntype = 0; ntype < num_ntypes; ++ntype) { + num_nodes_per_type[num_ntypes + ntype] = rhs_nodes[ntype]->shape[0]; + } + + std::vector new_lhs; + std::vector new_rhs; + std::tie(new_lhs, new_rhs) = ids_mapper( + graph, include_rhs_in_lhs, num_ntypes, ctx, maxNodesPerType, edge_arrays, + src_nodes, rhs_nodes, lhs_nodes_ptr, &num_nodes_per_type); + + std::vector induced_edges; + induced_edges.reserve(num_etypes); + for (int64_t etype = 0; etype < num_etypes; ++etype) { + if (edge_arrays[etype].id.defined()) { + induced_edges.push_back(edge_arrays[etype].id); + } else { + induced_edges.push_back( + aten::NullArray(DGLDataType{kDGLInt, sizeof(IdType) * 8, 1}, ctx)); + } + } + + // Build metagraph. + const auto meta_graph = graph->meta_graph(); + const EdgeArray etypes = meta_graph->Edges("eid"); + const IdArray new_dst = Add(etypes.dst, num_ntypes); + const auto new_meta_graph = + ImmutableGraph::CreateFromCOO(num_ntypes * 2, etypes.src, new_dst); + + // Allocate vector for graph relations while GPU is busy. + std::vector rel_graphs; + rel_graphs.reserve(num_etypes); + + // Build the heterograph. + for (int64_t etype = 0; etype < num_etypes; ++etype) { + const auto src_dst_types = graph->GetEndpointTypes(etype); + const dgl_type_t srctype = src_dst_types.first; + const dgl_type_t dsttype = src_dst_types.second; + + if (rhs_nodes[dsttype]->shape[0] == 0) { + // No rhs nodes are given for this edge type. Create an empty graph. 
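+      // A placeholder unit graph is still pushed for the empty relation so
+      // that rel_graphs keeps exactly one entry per edge type, staying aligned
+      // with the metagraph constructed above.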
+ rel_graphs.push_back(CreateFromCOO( + 2, lhs_nodes[srctype]->shape[0], rhs_nodes[dsttype]->shape[0], + aten::NullArray(DGLDataType{kDGLInt, sizeof(IdType) * 8, 1}, ctx), + aten::NullArray(DGLDataType{kDGLInt, sizeof(IdType) * 8, 1}, ctx))); + } else { + rel_graphs.push_back(CreateFromCOO( + 2, lhs_nodes[srctype]->shape[0], rhs_nodes[dsttype]->shape[0], + new_lhs[etype], new_rhs[etype])); + } + } + + HeteroGraphPtr new_graph = + CreateHeteroGraph(new_meta_graph, rel_graphs, num_nodes_per_type); + + // Return the new graph, the new src nodes, and new edges. + return std::make_tuple(new_graph, induced_edges); +} + +template std::tuple> +ProcessToBlock( + HeteroGraphPtr graph, const std::vector &rhs_nodes, + bool include_rhs_in_lhs, std::vector *const lhs_nodes_ptr, + IdsMapper &&get_maping_ids); + +template std::tuple> +ProcessToBlock( + HeteroGraphPtr graph, const std::vector &rhs_nodes, + bool include_rhs_in_lhs, std::vector *const lhs_nodes_ptr, + IdsMapper &&get_maping_ids); + +template <> +std::tuple> ToBlock( + HeteroGraphPtr graph, const std::vector &rhs_nodes, + bool include_rhs_in_lhs, std::vector *const lhs_nodes) { + return ToBlockCPU(graph, rhs_nodes, include_rhs_in_lhs, lhs_nodes); +} + +template <> +std::tuple> ToBlock( + HeteroGraphPtr graph, const std::vector &rhs_nodes, + bool include_rhs_in_lhs, std::vector *const lhs_nodes) { + return ToBlockCPU(graph, rhs_nodes, include_rhs_in_lhs, lhs_nodes); +} + +#ifdef DGL_USE_CUDA + +// Forward declaration of GPU ToBlock implementations - actual implementation is +// in +// ./cuda/cuda_to_block.cu +// This is to get around the broken name mangling in VS2019 CL 16.5.5 + +// CUDA 11.3 which complains that the two template specializations have the same +// signature. +std::tuple> ToBlockGPU32( + HeteroGraphPtr, const std::vector &, bool, + std::vector *const); +std::tuple> ToBlockGPU64( + HeteroGraphPtr, const std::vector &, bool, + std::vector *const); + +template <> +std::tuple> ToBlock( + HeteroGraphPtr graph, const std::vector &rhs_nodes, + bool include_rhs_in_lhs, std::vector *const lhs_nodes) { + return ToBlockGPU32(graph, rhs_nodes, include_rhs_in_lhs, lhs_nodes); +} + +template <> +std::tuple> ToBlock( + HeteroGraphPtr graph, const std::vector &rhs_nodes, + bool include_rhs_in_lhs, std::vector *const lhs_nodes) { + return ToBlockGPU64(graph, rhs_nodes, include_rhs_in_lhs, lhs_nodes); +} + +#endif // DGL_USE_CUDA + +DGL_REGISTER_GLOBAL("capi._CAPI_DGLToBlock") + .set_body([](DGLArgs args, DGLRetValue *rv) { + const HeteroGraphRef graph_ref = args[0]; + const std::vector &rhs_nodes = + ListValueToVector(args[1]); + const bool include_rhs_in_lhs = args[2]; + std::vector lhs_nodes = ListValueToVector(args[3]); + + HeteroGraphPtr new_graph; + std::vector induced_edges; + + ATEN_XPU_SWITCH_CUDA(graph_ref->Context().device_type, XPU, "ToBlock", { + ATEN_ID_TYPE_SWITCH(graph_ref->DataType(), IdType, { + std::tie(new_graph, induced_edges) = ToBlock( + graph_ref.sptr(), rhs_nodes, include_rhs_in_lhs, &lhs_nodes); + }); + }); + + List lhs_nodes_ref; + for (IdArray &array : lhs_nodes) + lhs_nodes_ref.push_back(Value(MakeValue(array))); + List induced_edges_ref; + for (IdArray &array : induced_edges) + induced_edges_ref.push_back(Value(MakeValue(array))); + + List ret; + ret.push_back(HeteroGraphRef(new_graph)); + ret.push_back(lhs_nodes_ref); + ret.push_back(induced_edges_ref); + + *rv = ret; + }); + +}; // namespace transform + +}; // namespace dgl diff --git a/src/partition/cuda/partition_op.cu b/src/partition/cuda/partition_op.cu index 
04d10acd1e09..4db276b873b1 100644 --- a/src/partition/cuda/partition_op.cu +++ b/src/partition/cuda/partition_op.cu @@ -1,3 +1,4 @@ +#include "hip/hip_runtime.h" /** * Copyright (c) 2021 by Contributors * @file ndarray_partition.h @@ -6,7 +7,7 @@ #include -#include +#include #include "../../runtime/cuda/cuda_common.h" #include "../../runtime/workspace.h" @@ -239,7 +240,7 @@ std::pair GeneratePermutationFromRemainder( const auto& ctx = in_idx->ctx; auto device = DeviceAPI::Get(ctx); - cudaStream_t stream = runtime::getCurrentCUDAStream(); + hipStream_t stream = runtime::getCurrentCUDAStream(); const int64_t num_in = in_idx->shape[0]; @@ -295,13 +296,13 @@ std::pair GeneratePermutationFromRemainder( IdArray perm_in = aten::Range(0, num_in, sizeof(IdType) * 8, ctx); size_t sort_workspace_size; - CUDA_CALL(cub::DeviceRadixSort::SortPairs( + CUDA_CALL(hipcub::DeviceRadixSort::SortPairs( nullptr, sort_workspace_size, proc_id_in.get(), proc_id_out.get(), static_cast(perm_in->data), perm_out, num_in, 0, part_bits, stream)); Workspace sort_workspace(device, ctx, sort_workspace_size); - CUDA_CALL(cub::DeviceRadixSort::SortPairs( + CUDA_CALL(hipcub::DeviceRadixSort::SortPairs( sort_workspace.get(), sort_workspace_size, proc_id_in.get(), proc_id_out.get(), static_cast(perm_in->data), perm_out, num_in, 0, part_bits, stream)); @@ -317,7 +318,7 @@ std::pair GeneratePermutationFromRemainder( static_assert( sizeof(AtomicCount) == sizeof(*out_counts), "AtomicCount must be the same width as int64_t for atomicAdd " - "in cub::DeviceHistogram::HistogramEven() to work"); + "in hipcub::DeviceHistogram::HistogramEven() to work"); // TODO(dlasalle): Once https://github.com/NVIDIA/cub/pull/287 is merged, // add a compile time check against the cub version to allow @@ -327,14 +328,14 @@ std::pair GeneratePermutationFromRemainder( "value of int."; size_t hist_workspace_size; - CUDA_CALL(cub::DeviceHistogram::HistogramEven( + CUDA_CALL(hipcub::DeviceHistogram::HistogramEven( nullptr, hist_workspace_size, proc_id_out.get(), reinterpret_cast(out_counts), num_parts + 1, static_cast(0), static_cast(num_parts), static_cast(num_in), stream)); Workspace hist_workspace(device, ctx, hist_workspace_size); - CUDA_CALL(cub::DeviceHistogram::HistogramEven( + CUDA_CALL(hipcub::DeviceHistogram::HistogramEven( hist_workspace.get(), hist_workspace_size, proc_id_out.get(), reinterpret_cast(out_counts), num_parts + 1, static_cast(0), static_cast(num_parts), @@ -352,7 +353,7 @@ template std::pair GeneratePermutationFromRemainder< template IdArray MapToLocalFromRemainder(const int num_parts, IdArray global_idx) { const auto& ctx = global_idx->ctx; - cudaStream_t stream = runtime::getCurrentCUDAStream(); + hipStream_t stream = runtime::getCurrentCUDAStream(); if (num_parts > 1) { IdArray local_idx = @@ -387,7 +388,7 @@ IdArray MapToGlobalFromRemainder( << num_parts; const auto& ctx = local_idx->ctx; - cudaStream_t stream = runtime::getCurrentCUDAStream(); + hipStream_t stream = runtime::getCurrentCUDAStream(); if (num_parts > 1) { IdArray global_idx = @@ -423,7 +424,7 @@ std::pair GeneratePermutationFromRange( const auto& ctx = in_idx->ctx; auto device = DeviceAPI::Get(ctx); - cudaStream_t stream = runtime::getCurrentCUDAStream(); + hipStream_t stream = runtime::getCurrentCUDAStream(); const int64_t num_in = in_idx->shape[0]; @@ -470,13 +471,13 @@ std::pair GeneratePermutationFromRange( IdArray perm_in = aten::Range(0, num_in, sizeof(IdType) * 8, ctx); size_t sort_workspace_size; - CUDA_CALL(cub::DeviceRadixSort::SortPairs( + 
CUDA_CALL(hipcub::DeviceRadixSort::SortPairs( nullptr, sort_workspace_size, proc_id_in.get(), proc_id_out.get(), static_cast(perm_in->data), perm_out, num_in, 0, part_bits, stream)); Workspace sort_workspace(device, ctx, sort_workspace_size); - CUDA_CALL(cub::DeviceRadixSort::SortPairs( + CUDA_CALL(hipcub::DeviceRadixSort::SortPairs( sort_workspace.get(), sort_workspace_size, proc_id_in.get(), proc_id_out.get(), static_cast(perm_in->data), perm_out, num_in, 0, part_bits, stream)); @@ -492,7 +493,7 @@ std::pair GeneratePermutationFromRange( static_assert( sizeof(AtomicCount) == sizeof(*out_counts), "AtomicCount must be the same width as int64_t for atomicAdd " - "in cub::DeviceHistogram::HistogramEven() to work"); + "in hipcub::DeviceHistogram::HistogramEven() to work"); // TODO(dlasalle): Once https://github.com/NVIDIA/cub/pull/287 is merged, // add a compile time check against the cub version to allow @@ -502,14 +503,14 @@ std::pair GeneratePermutationFromRange( "value of int."; size_t hist_workspace_size; - CUDA_CALL(cub::DeviceHistogram::HistogramEven( + CUDA_CALL(hipcub::DeviceHistogram::HistogramEven( nullptr, hist_workspace_size, proc_id_out.get(), reinterpret_cast(out_counts), num_parts + 1, static_cast(0), static_cast(num_parts), static_cast(num_in), stream)); Workspace hist_workspace(device, ctx, hist_workspace_size); - CUDA_CALL(cub::DeviceHistogram::HistogramEven( + CUDA_CALL(hipcub::DeviceHistogram::HistogramEven( hist_workspace.get(), hist_workspace_size, proc_id_out.get(), reinterpret_cast(out_counts), num_parts + 1, static_cast(0), static_cast(num_parts), @@ -536,7 +537,7 @@ template IdArray MapToLocalFromRange( const int num_parts, IdArray range, IdArray global_idx) { const auto& ctx = global_idx->ctx; - cudaStream_t stream = runtime::getCurrentCUDAStream(); + hipStream_t stream = runtime::getCurrentCUDAStream(); if (num_parts > 1 && global_idx->shape[0] > 0) { IdArray local_idx = @@ -576,7 +577,7 @@ IdArray MapToGlobalFromRange( << num_parts; const auto& ctx = local_idx->ctx; - cudaStream_t stream = runtime::getCurrentCUDAStream(); + hipStream_t stream = runtime::getCurrentCUDAStream(); if (num_parts > 1 && local_idx->shape[0] > 0) { IdArray global_idx = diff --git a/src/partition/cuda/partition_op.cu.prehip b/src/partition/cuda/partition_op.cu.prehip new file mode 100644 index 000000000000..04d10acd1e09 --- /dev/null +++ b/src/partition/cuda/partition_op.cu.prehip @@ -0,0 +1,613 @@ +/** + * Copyright (c) 2021 by Contributors + * @file ndarray_partition.h + * @brief Operations on partition implemented in CUDA. + */ + +#include + +#include + +#include "../../runtime/cuda/cuda_common.h" +#include "../../runtime/workspace.h" +#include "../partition_op.h" + +using namespace dgl::runtime; + +namespace dgl { +namespace partition { +namespace impl { + +namespace { + +/** + * @brief Kernel to map global element IDs to partition IDs by remainder. + * + * @tparam IdType The type of ID. + * @param global The global element IDs. + * @param num_elements The number of element IDs. + * @param num_parts The number of partitions. + * @param part_id The mapped partition ID (outupt). 
+ */ +template +__global__ void _MapProcByRemainderKernel( + const IdType* const global, const int64_t num_elements, + const int64_t num_parts, IdType* const part_id) { + assert(num_elements <= gridDim.x * blockDim.x); + const int64_t idx = + blockDim.x * static_cast(blockIdx.x) + threadIdx.x; + + if (idx < num_elements) { + part_id[idx] = global[idx] % num_parts; + } +} + +/** + * @brief Kernel to map global element IDs to partition IDs, using a bit-mask. + * The number of partitions must be a power a two. + * + * @tparam IdType The type of ID. + * @param global The global element IDs. + * @param num_elements The number of element IDs. + * @param mask The bit-mask with 1's for each bit to keep from the element ID to + * extract the partition ID (e.g., an 8 partition mask would be 0x07). + * @param part_id The mapped partition ID (outupt). + */ +template +__global__ void _MapProcByMaskRemainderKernel( + const IdType* const global, const int64_t num_elements, const IdType mask, + IdType* const part_id) { + assert(num_elements <= gridDim.x * blockDim.x); + const int64_t idx = + blockDim.x * static_cast(blockIdx.x) + threadIdx.x; + + if (idx < num_elements) { + part_id[idx] = global[idx] & mask; + } +} + +/** + * @brief Kernel to map global element IDs to local element IDs. + * + * @tparam IdType The type of ID. + * @param global The global element IDs. + * @param num_elements The number of IDs. + * @param num_parts The number of partitions. + * @param local The local element IDs (output). + */ +template +__global__ void _MapLocalIndexByRemainderKernel( + const IdType* const global, const int64_t num_elements, const int num_parts, + IdType* const local) { + assert(num_elements <= gridDim.x * blockDim.x); + const int64_t idx = threadIdx.x + blockDim.x * blockIdx.x; + + if (idx < num_elements) { + local[idx] = global[idx] / num_parts; + } +} + +/** + * @brief Kernel to map local element IDs within a partition to their global + * IDs, using the remainder over the number of partitions. + * + * @tparam IdType The type of ID. + * @param local The local element IDs. + * @param part_id The partition to map local elements from. + * @param num_elements The number of elements to map. + * @param num_parts The number of partitions. + * @param global The global element IDs (output). + */ +template +__global__ void _MapGlobalIndexByRemainderKernel( + const IdType* const local, const int part_id, const int64_t num_elements, + const int num_parts, IdType* const global) { + assert(num_elements <= gridDim.x * blockDim.x); + const int64_t idx = threadIdx.x + blockDim.x * blockIdx.x; + + assert(part_id < num_parts); + + if (idx < num_elements) { + global[idx] = (local[idx] * num_parts) + part_id; + } +} + +/** + * @brief Device function to perform a binary search to find to which partition + * a given ID belongs. + * + * @tparam RangeType The type of range. + * @param range The prefix-sum of IDs assigned to partitions. + * @param num_parts The number of partitions. + * @param target The element ID to find the partition of. + * + * @return The partition. 
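 *
 * Illustrative example (added for clarity; not in the original source): with
 * num_parts = 2 and range = {0, 4, 9}, IDs 0-3 belong to partition 0 and IDs
 * 4-8 to partition 1, so a target of 5 resolves to partition 1.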
+ */ +template +__device__ RangeType _SearchRange( + const RangeType* const range, const int num_parts, const RangeType target) { + int start = 0; + int end = num_parts; + int cur = (end + start) / 2; + + assert(range[0] == 0); + assert(target < range[num_parts]); + + while (start + 1 < end) { + if (target < range[cur]) { + end = cur; + } else { + start = cur; + } + cur = (start + end) / 2; + } + + return cur; +} + +/** + * @brief Kernel to map element IDs to partition IDs. + * + * @tparam IdType The type of element ID. + * @tparam RangeType The type of of the range. + * @param range The prefix-sum of IDs assigned to partitions. + * @param global The global element IDs. + * @param num_elements The number of element IDs. + * @param num_parts The number of partitions. + * @param part_id The partition ID assigned to each element (output). + */ +template +__global__ void _MapProcByRangeKernel( + const RangeType* const range, const IdType* const global, + const int64_t num_elements, const int64_t num_parts, + IdType* const part_id) { + assert(num_elements <= gridDim.x * blockDim.x); + const int64_t idx = + blockDim.x * static_cast(blockIdx.x) + threadIdx.x; + + // rely on caching to load the range into L1 cache + if (idx < num_elements) { + part_id[idx] = static_cast(_SearchRange( + range, static_cast(num_parts), + static_cast(global[idx]))); + } +} + +/** + * @brief Kernel to map global element IDs to their ID within their respective + * partition. + * + * @tparam IdType The type of element ID. + * @tparam RangeType The type of the range. + * @param range The prefix-sum of IDs assigned to partitions. + * @param global The global element IDs. + * @param num_elements The number of elements. + * @param num_parts The number of partitions. + * @param local The local element IDs (output). + */ +template +__global__ void _MapLocalIndexByRangeKernel( + const RangeType* const range, const IdType* const global, + const int64_t num_elements, const int num_parts, IdType* const local) { + assert(num_elements <= gridDim.x * blockDim.x); + const int64_t idx = threadIdx.x + blockDim.x * blockIdx.x; + + // rely on caching to load the range into L1 cache + if (idx < num_elements) { + const int proc = _SearchRange( + range, static_cast(num_parts), + static_cast(global[idx])); + local[idx] = global[idx] - range[proc]; + } +} + +/** + * @brief Kernel to map local element IDs within a partition to their global + * IDs. + * + * @tparam IdType The type of ID. + * @tparam RangeType The type of the range. + * @param range The prefix-sum of IDs assigend to partitions. + * @param local The local element IDs. + * @param part_id The partition to map local elements from. + * @param num_elements The number of elements to map. + * @param num_parts The number of partitions. + * @param global The global element IDs (output). 
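 *
 * Illustrative example (added for clarity; not in the original source):
 * continuing range = {0, 4, 9}, local ID 2 within partition 1 maps back to
 * global ID 2 + range[1] = 6.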
+ */ +template +__global__ void _MapGlobalIndexByRangeKernel( + const RangeType* const range, const IdType* const local, const int part_id, + const int64_t num_elements, const int num_parts, IdType* const global) { + assert(num_elements <= gridDim.x * blockDim.x); + const int64_t idx = threadIdx.x + blockDim.x * blockIdx.x; + + assert(part_id < num_parts); + + // rely on caching to load the range into L1 cache + if (idx < num_elements) { + global[idx] = local[idx] + range[part_id]; + } +} +} // namespace + +// Remainder Based Partition Operations + +template +std::pair GeneratePermutationFromRemainder( + int64_t array_size, int num_parts, IdArray in_idx) { + std::pair result; + + const auto& ctx = in_idx->ctx; + auto device = DeviceAPI::Get(ctx); + cudaStream_t stream = runtime::getCurrentCUDAStream(); + + const int64_t num_in = in_idx->shape[0]; + + CHECK_GE(num_parts, 1) << "The number of partitions (" << num_parts + << ") must be at least 1."; + if (num_parts == 1) { + // no permutation + result.first = aten::Range(0, num_in, sizeof(IdType) * 8, ctx); + result.second = aten::Full(num_in, num_parts, sizeof(int64_t) * 8, ctx); + + return result; + } + + result.first = aten::NewIdArray(num_in, ctx, sizeof(IdType) * 8); + result.second = aten::Full(0, num_parts, sizeof(int64_t) * 8, ctx); + int64_t* out_counts = static_cast(result.second->data); + if (num_in == 0) { + // now that we've zero'd out_counts, nothing left to do for an empty + // mapping + return result; + } + + const int64_t part_bits = + static_cast(std::ceil(std::log2(num_parts))); + + // First, generate a mapping of indexes to processors + Workspace proc_id_in(device, ctx, num_in); + { + const dim3 block(256); + const dim3 grid((num_in + block.x - 1) / block.x); + + if (num_parts < (1 << part_bits)) { + // num_parts is not a power of 2 + CUDA_KERNEL_CALL( + _MapProcByRemainderKernel, grid, block, 0, stream, + static_cast(in_idx->data), num_in, num_parts, + proc_id_in.get()); + } else { + // num_parts is a power of 2 + CUDA_KERNEL_CALL( + _MapProcByMaskRemainderKernel, grid, block, 0, stream, + static_cast(in_idx->data), num_in, + static_cast(num_parts - 1), // bit mask + proc_id_in.get()); + } + } + + // then create a permutation array that groups processors together by + // performing a radix sort + Workspace proc_id_out(device, ctx, num_in); + IdType* perm_out = static_cast(result.first->data); + { + IdArray perm_in = aten::Range(0, num_in, sizeof(IdType) * 8, ctx); + + size_t sort_workspace_size; + CUDA_CALL(cub::DeviceRadixSort::SortPairs( + nullptr, sort_workspace_size, proc_id_in.get(), proc_id_out.get(), + static_cast(perm_in->data), perm_out, num_in, 0, part_bits, + stream)); + + Workspace sort_workspace(device, ctx, sort_workspace_size); + CUDA_CALL(cub::DeviceRadixSort::SortPairs( + sort_workspace.get(), sort_workspace_size, proc_id_in.get(), + proc_id_out.get(), static_cast(perm_in->data), perm_out, + num_in, 0, part_bits, stream)); + } + // explicitly free so workspace can be re-used + proc_id_in.free(); + + // perform a histogram and then prefixsum on the sorted proc_id vector + + // Count the number of values to be sent to each processor + { + using AtomicCount = unsigned long long; // NOLINT + static_assert( + sizeof(AtomicCount) == sizeof(*out_counts), + "AtomicCount must be the same width as int64_t for atomicAdd " + "in cub::DeviceHistogram::HistogramEven() to work"); + + // TODO(dlasalle): Once https://github.com/NVIDIA/cub/pull/287 is merged, + // add a compile time check against the cub version to allow + 
// num_in > (2 << 31). + CHECK(num_in < static_cast(std::numeric_limits::max())) + << "number of values to insert into histogram must be less than max " + "value of int."; + + size_t hist_workspace_size; + CUDA_CALL(cub::DeviceHistogram::HistogramEven( + nullptr, hist_workspace_size, proc_id_out.get(), + reinterpret_cast(out_counts), num_parts + 1, + static_cast(0), static_cast(num_parts), + static_cast(num_in), stream)); + + Workspace hist_workspace(device, ctx, hist_workspace_size); + CUDA_CALL(cub::DeviceHistogram::HistogramEven( + hist_workspace.get(), hist_workspace_size, proc_id_out.get(), + reinterpret_cast(out_counts), num_parts + 1, + static_cast(0), static_cast(num_parts), + static_cast(num_in), stream)); + } + + return result; +} + +template std::pair GeneratePermutationFromRemainder< + kDGLCUDA, int32_t>(int64_t array_size, int num_parts, IdArray in_idx); +template std::pair GeneratePermutationFromRemainder< + kDGLCUDA, int64_t>(int64_t array_size, int num_parts, IdArray in_idx); + +template +IdArray MapToLocalFromRemainder(const int num_parts, IdArray global_idx) { + const auto& ctx = global_idx->ctx; + cudaStream_t stream = runtime::getCurrentCUDAStream(); + + if (num_parts > 1) { + IdArray local_idx = + aten::NewIdArray(global_idx->shape[0], ctx, sizeof(IdType) * 8); + + const dim3 block(128); + const dim3 grid((global_idx->shape[0] + block.x - 1) / block.x); + + CUDA_KERNEL_CALL( + _MapLocalIndexByRemainderKernel, grid, block, 0, stream, + static_cast(global_idx->data), global_idx->shape[0], + num_parts, static_cast(local_idx->data)); + + return local_idx; + } else { + // no mapping to be done + return global_idx; + } +} + +template IdArray MapToLocalFromRemainder( + int num_parts, IdArray in_idx); +template IdArray MapToLocalFromRemainder( + int num_parts, IdArray in_idx); + +template +IdArray MapToGlobalFromRemainder( + const int num_parts, IdArray local_idx, const int part_id) { + CHECK_LT(part_id, num_parts) + << "Invalid partition id " << part_id << "/" << num_parts; + CHECK_GE(part_id, 0) << "Invalid partition id " << part_id << "/" + << num_parts; + + const auto& ctx = local_idx->ctx; + cudaStream_t stream = runtime::getCurrentCUDAStream(); + + if (num_parts > 1) { + IdArray global_idx = + aten::NewIdArray(local_idx->shape[0], ctx, sizeof(IdType) * 8); + + const dim3 block(128); + const dim3 grid((local_idx->shape[0] + block.x - 1) / block.x); + + CUDA_KERNEL_CALL( + _MapGlobalIndexByRemainderKernel, grid, block, 0, stream, + static_cast(local_idx->data), part_id, + global_idx->shape[0], num_parts, + static_cast(global_idx->data)); + + return global_idx; + } else { + // no mapping to be done + return local_idx; + } +} + +template IdArray MapToGlobalFromRemainder( + int num_parts, IdArray in_idx, int part_id); +template IdArray MapToGlobalFromRemainder( + int num_parts, IdArray in_idx, int part_id); + +// Range Based Partition Operations + +template +std::pair GeneratePermutationFromRange( + int64_t array_size, int num_parts, IdArray range, IdArray in_idx) { + std::pair result; + + const auto& ctx = in_idx->ctx; + auto device = DeviceAPI::Get(ctx); + cudaStream_t stream = runtime::getCurrentCUDAStream(); + + const int64_t num_in = in_idx->shape[0]; + + CHECK_GE(num_parts, 1) << "The number of partitions (" << num_parts + << ") must be at least 1."; + if (num_parts == 1) { + // no permutation + result.first = aten::Range(0, num_in, sizeof(IdType) * 8, ctx); + result.second = aten::Full(num_in, num_parts, sizeof(int64_t) * 8, ctx); + + return result; + } + + 
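  // Added note (not in the original source): the permutation below is built
  // in three steps: (1) map each input ID to its owning partition with a
  // binary search over the range prefix-sum, (2) radix-sort an identity
  // permutation by partition ID so indices bound for the same partition
  // become contiguous, and (3) histogram the sorted partition IDs to obtain
  // the per-partition counts.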
result.first = aten::NewIdArray(num_in, ctx, sizeof(IdType) * 8); + result.second = aten::Full(0, num_parts, sizeof(int64_t) * 8, ctx); + int64_t* out_counts = static_cast(result.second->data); + if (num_in == 0) { + // now that we've zero'd out_counts, nothing left to do for an empty + // mapping + return result; + } + + const int64_t part_bits = + static_cast(std::ceil(std::log2(num_parts))); + + // First, generate a mapping of indexes to processors + Workspace proc_id_in(device, ctx, num_in); + { + const dim3 block(256); + const dim3 grid((num_in + block.x - 1) / block.x); + + CUDA_KERNEL_CALL( + _MapProcByRangeKernel, grid, block, 0, stream, + static_cast(range->data), + static_cast(in_idx->data), num_in, num_parts, + proc_id_in.get()); + } + + // then create a permutation array that groups processors together by + // performing a radix sort + Workspace proc_id_out(device, ctx, num_in); + IdType* perm_out = static_cast(result.first->data); + { + IdArray perm_in = aten::Range(0, num_in, sizeof(IdType) * 8, ctx); + + size_t sort_workspace_size; + CUDA_CALL(cub::DeviceRadixSort::SortPairs( + nullptr, sort_workspace_size, proc_id_in.get(), proc_id_out.get(), + static_cast(perm_in->data), perm_out, num_in, 0, part_bits, + stream)); + + Workspace sort_workspace(device, ctx, sort_workspace_size); + CUDA_CALL(cub::DeviceRadixSort::SortPairs( + sort_workspace.get(), sort_workspace_size, proc_id_in.get(), + proc_id_out.get(), static_cast(perm_in->data), perm_out, + num_in, 0, part_bits, stream)); + } + // explicitly free so workspace can be re-used + proc_id_in.free(); + + // perform a histogram and then prefixsum on the sorted proc_id vector + + // Count the number of values to be sent to each processor + { + using AtomicCount = unsigned long long; // NOLINT + static_assert( + sizeof(AtomicCount) == sizeof(*out_counts), + "AtomicCount must be the same width as int64_t for atomicAdd " + "in cub::DeviceHistogram::HistogramEven() to work"); + + // TODO(dlasalle): Once https://github.com/NVIDIA/cub/pull/287 is merged, + // add a compile time check against the cub version to allow + // num_in > (2 << 31). 
+ CHECK(num_in < static_cast(std::numeric_limits::max())) + << "number of values to insert into histogram must be less than max " + "value of int."; + + size_t hist_workspace_size; + CUDA_CALL(cub::DeviceHistogram::HistogramEven( + nullptr, hist_workspace_size, proc_id_out.get(), + reinterpret_cast(out_counts), num_parts + 1, + static_cast(0), static_cast(num_parts), + static_cast(num_in), stream)); + + Workspace hist_workspace(device, ctx, hist_workspace_size); + CUDA_CALL(cub::DeviceHistogram::HistogramEven( + hist_workspace.get(), hist_workspace_size, proc_id_out.get(), + reinterpret_cast(out_counts), num_parts + 1, + static_cast(0), static_cast(num_parts), + static_cast(num_in), stream)); + } + + return result; +} + +template std::pair +GeneratePermutationFromRange( + int64_t array_size, int num_parts, IdArray range, IdArray in_idx); +template std::pair +GeneratePermutationFromRange( + int64_t array_size, int num_parts, IdArray range, IdArray in_idx); +template std::pair +GeneratePermutationFromRange( + int64_t array_size, int num_parts, IdArray range, IdArray in_idx); +template std::pair +GeneratePermutationFromRange( + int64_t array_size, int num_parts, IdArray range, IdArray in_idx); + +template +IdArray MapToLocalFromRange( + const int num_parts, IdArray range, IdArray global_idx) { + const auto& ctx = global_idx->ctx; + cudaStream_t stream = runtime::getCurrentCUDAStream(); + + if (num_parts > 1 && global_idx->shape[0] > 0) { + IdArray local_idx = + aten::NewIdArray(global_idx->shape[0], ctx, sizeof(IdType) * 8); + + const dim3 block(128); + const dim3 grid((global_idx->shape[0] + block.x - 1) / block.x); + + CUDA_KERNEL_CALL( + _MapLocalIndexByRangeKernel, grid, block, 0, stream, + static_cast(range->data), + static_cast(global_idx->data), global_idx->shape[0], + num_parts, static_cast(local_idx->data)); + + return local_idx; + } else { + // no mapping to be done + return global_idx; + } +} + +template IdArray MapToLocalFromRange( + int num_parts, IdArray range, IdArray in_idx); +template IdArray MapToLocalFromRange( + int num_parts, IdArray range, IdArray in_idx); +template IdArray MapToLocalFromRange( + int num_parts, IdArray range, IdArray in_idx); +template IdArray MapToLocalFromRange( + int num_parts, IdArray range, IdArray in_idx); + +template +IdArray MapToGlobalFromRange( + const int num_parts, IdArray range, IdArray local_idx, const int part_id) { + CHECK_LT(part_id, num_parts) + << "Invalid partition id " << part_id << "/" << num_parts; + CHECK_GE(part_id, 0) << "Invalid partition id " << part_id << "/" + << num_parts; + + const auto& ctx = local_idx->ctx; + cudaStream_t stream = runtime::getCurrentCUDAStream(); + + if (num_parts > 1 && local_idx->shape[0] > 0) { + IdArray global_idx = + aten::NewIdArray(local_idx->shape[0], ctx, sizeof(IdType) * 8); + + const dim3 block(128); + const dim3 grid((local_idx->shape[0] + block.x - 1) / block.x); + + CUDA_KERNEL_CALL( + _MapGlobalIndexByRangeKernel, grid, block, 0, stream, + static_cast(range->data), + static_cast(local_idx->data), part_id, + global_idx->shape[0], num_parts, + static_cast(global_idx->data)); + + return global_idx; + } else { + // no mapping to be done + return local_idx; + } +} + +template IdArray MapToGlobalFromRange( + int num_parts, IdArray range, IdArray in_idx, int part_id); +template IdArray MapToGlobalFromRange( + int num_parts, IdArray range, IdArray in_idx, int part_id); +template IdArray MapToGlobalFromRange( + int num_parts, IdArray range, IdArray in_idx, int part_id); +template IdArray 
MapToGlobalFromRange( + int num_parts, IdArray range, IdArray in_idx, int part_id); + +} // namespace impl +} // namespace partition +} // namespace dgl diff --git a/src/partition/ndarray_partition.cc b/src/partition/ndarray_partition.cc index 196f9c7535fb..f83326ed9c72 100644 --- a/src/partition/ndarray_partition.cc +++ b/src/partition/ndarray_partition.cc @@ -37,7 +37,7 @@ class RemainderPartition : public NDArrayPartition { std::pair GeneratePermutation( IdArray in_idx) const override { -#ifdef DGL_USE_CUDA +#ifdef DGL_USE_ROCM auto ctx = in_idx->ctx; if (ctx.device_type == kDGLCUDA) { ATEN_ID_TYPE_SWITCH(in_idx->dtype, IdType, { @@ -54,7 +54,7 @@ class RemainderPartition : public NDArrayPartition { } IdArray MapToLocal(IdArray in_idx) const override { -#ifdef DGL_USE_CUDA +#ifdef DGL_USE_ROCM auto ctx = in_idx->ctx; if (ctx.device_type == kDGLCUDA) { ATEN_ID_TYPE_SWITCH(in_idx->dtype, IdType, { @@ -71,7 +71,7 @@ class RemainderPartition : public NDArrayPartition { } IdArray MapToGlobal(IdArray in_idx, const int part_id) const override { -#ifdef DGL_USE_CUDA +#ifdef DGL_USE_ROCM auto ctx = in_idx->ctx; if (ctx.device_type == kDGLCUDA) { ATEN_ID_TYPE_SWITCH(in_idx->dtype, IdType, { @@ -116,7 +116,7 @@ class RangePartition : public NDArrayPartition { std::pair GeneratePermutation( IdArray in_idx) const override { -#ifdef DGL_USE_CUDA +#ifdef DGL_USE_ROCM auto ctx = in_idx->ctx; if (ctx.device_type == kDGLCUDA) { if (ctx.device_type != range_->ctx.device_type || @@ -142,7 +142,7 @@ class RangePartition : public NDArrayPartition { } IdArray MapToLocal(IdArray in_idx) const override { -#ifdef DGL_USE_CUDA +#ifdef DGL_USE_ROCM auto ctx = in_idx->ctx; if (ctx.device_type == kDGLCUDA) { ATEN_ID_TYPE_SWITCH(in_idx->dtype, IdType, { @@ -161,7 +161,7 @@ class RangePartition : public NDArrayPartition { } IdArray MapToGlobal(IdArray in_idx, const int part_id) const override { -#ifdef DGL_USE_CUDA +#ifdef DGL_USE_ROCM auto ctx = in_idx->ctx; if (ctx.device_type == kDGLCUDA) { ATEN_ID_TYPE_SWITCH(in_idx->dtype, IdType, { diff --git a/src/partition/ndarray_partition.cc.prehip b/src/partition/ndarray_partition.cc.prehip new file mode 100644 index 000000000000..196f9c7535fb --- /dev/null +++ b/src/partition/ndarray_partition.cc.prehip @@ -0,0 +1,266 @@ +/** + * Copyright (c) 2021 by Contributors + * @file ndarray_partition.cc + * @brief DGL utilities for working with the partitioned NDArrays + */ + +#include "ndarray_partition.h" + +#include +#include + +#include +#include + +#include "../c_api_common.h" +#include "partition_op.h" + +using namespace dgl::runtime; + +namespace dgl { +namespace partition { + +NDArrayPartition::NDArrayPartition( + const int64_t array_size, const int num_parts) + : array_size_(array_size), num_parts_(num_parts) {} + +int64_t NDArrayPartition::ArraySize() const { return array_size_; } + +int NDArrayPartition::NumParts() const { return num_parts_; } + +class RemainderPartition : public NDArrayPartition { + public: + RemainderPartition(const int64_t array_size, const int num_parts) + : NDArrayPartition(array_size, num_parts) { + // do nothing + } + + std::pair GeneratePermutation( + IdArray in_idx) const override { +#ifdef DGL_USE_CUDA + auto ctx = in_idx->ctx; + if (ctx.device_type == kDGLCUDA) { + ATEN_ID_TYPE_SWITCH(in_idx->dtype, IdType, { + return impl::GeneratePermutationFromRemainder( + ArraySize(), NumParts(), in_idx); + }); + } +#endif + + LOG(FATAL) << "Remainder based partitioning for the CPU is not yet " + "implemented."; + // should be unreachable + return 
std::pair{}; + } + + IdArray MapToLocal(IdArray in_idx) const override { +#ifdef DGL_USE_CUDA + auto ctx = in_idx->ctx; + if (ctx.device_type == kDGLCUDA) { + ATEN_ID_TYPE_SWITCH(in_idx->dtype, IdType, { + return impl::MapToLocalFromRemainder( + NumParts(), in_idx); + }); + } +#endif + + LOG(FATAL) << "Remainder based partitioning for the CPU is not yet " + "implemented."; + // should be unreachable + return IdArray{}; + } + + IdArray MapToGlobal(IdArray in_idx, const int part_id) const override { +#ifdef DGL_USE_CUDA + auto ctx = in_idx->ctx; + if (ctx.device_type == kDGLCUDA) { + ATEN_ID_TYPE_SWITCH(in_idx->dtype, IdType, { + return impl::MapToGlobalFromRemainder( + NumParts(), in_idx, part_id); + }); + } +#endif + + LOG(FATAL) << "Remainder based partitioning for the CPU is not yet " + "implemented."; + // should be unreachable + return IdArray{}; + } + + int64_t PartSize(const int part_id) const override { + CHECK_LT(part_id, NumParts()) << "Invalid part ID (" << part_id + << ") for " + "partition of size " + << NumParts() << "."; + return ArraySize() / NumParts() + (part_id < ArraySize() % NumParts()); + } +}; + +class RangePartition : public NDArrayPartition { + public: + RangePartition(const int64_t array_size, const int num_parts, IdArray range) + : NDArrayPartition(array_size, num_parts), + range_(range), + // We also need a copy of the range on the CPU, to compute partition + // sizes. We require the input range on the GPU, as if we have multiple + // GPUs, we can't know which is the proper one to copy the array to, but + // we have only one CPU context, and can safely copy the array to that. + range_cpu_(range.CopyTo(DGLContext{kDGLCPU, 0})) { + auto ctx = range->ctx; + if (ctx.device_type != kDGLCUDA) { + LOG(FATAL) << "The range for an NDArrayPartition is only supported " + " on GPUs. Transfer the range to the target device before " + "creating the partition."; + } + } + + std::pair GeneratePermutation( + IdArray in_idx) const override { +#ifdef DGL_USE_CUDA + auto ctx = in_idx->ctx; + if (ctx.device_type == kDGLCUDA) { + if (ctx.device_type != range_->ctx.device_type || + ctx.device_id != range_->ctx.device_id) { + LOG(FATAL) << "The range for the NDArrayPartition and the input " + "array must be on the same device: " + << ctx << " vs. 
" << range_->ctx; + } + ATEN_ID_TYPE_SWITCH(in_idx->dtype, IdType, { + ATEN_ID_TYPE_SWITCH(range_->dtype, RangeType, { + return impl::GeneratePermutationFromRange< + kDGLCUDA, IdType, RangeType>( + ArraySize(), NumParts(), range_, in_idx); + }); + }); + } +#endif + + LOG(FATAL) << "Remainder based partitioning for the CPU is not yet " + "implemented."; + // should be unreachable + return std::pair{}; + } + + IdArray MapToLocal(IdArray in_idx) const override { +#ifdef DGL_USE_CUDA + auto ctx = in_idx->ctx; + if (ctx.device_type == kDGLCUDA) { + ATEN_ID_TYPE_SWITCH(in_idx->dtype, IdType, { + ATEN_ID_TYPE_SWITCH(range_->dtype, RangeType, { + return impl::MapToLocalFromRange( + NumParts(), range_, in_idx); + }); + }); + } +#endif + + LOG(FATAL) << "Remainder based partitioning for the CPU is not yet " + "implemented."; + // should be unreachable + return IdArray{}; + } + + IdArray MapToGlobal(IdArray in_idx, const int part_id) const override { +#ifdef DGL_USE_CUDA + auto ctx = in_idx->ctx; + if (ctx.device_type == kDGLCUDA) { + ATEN_ID_TYPE_SWITCH(in_idx->dtype, IdType, { + ATEN_ID_TYPE_SWITCH(range_->dtype, RangeType, { + return impl::MapToGlobalFromRange( + NumParts(), range_, in_idx, part_id); + }); + }); + } +#endif + + LOG(FATAL) << "Remainder based partitioning for the CPU is not yet " + "implemented."; + // should be unreachable + return IdArray{}; + } + + int64_t PartSize(const int part_id) const override { + CHECK_LT(part_id, NumParts()) << "Invalid part ID (" << part_id + << ") for " + "partition of size " + << NumParts() << "."; + int64_t part_size = -1; + ATEN_ID_TYPE_SWITCH(range_cpu_->dtype, RangeType, { + const RangeType* const ptr = + static_cast(range_cpu_->data); + part_size = ptr[part_id + 1] - ptr[part_id]; + }); + return part_size; + } + + private: + IdArray range_; + IdArray range_cpu_; +}; + +NDArrayPartitionRef CreatePartitionRemainderBased( + const int64_t array_size, const int num_parts) { + return NDArrayPartitionRef( + std::make_shared(array_size, num_parts)); +} + +NDArrayPartitionRef CreatePartitionRangeBased( + const int64_t array_size, const int num_parts, IdArray range) { + return NDArrayPartitionRef( + std::make_shared(array_size, num_parts, range)); +} + +DGL_REGISTER_GLOBAL("partition._CAPI_DGLNDArrayPartitionCreateRemainderBased") + .set_body([](DGLArgs args, DGLRetValue* rv) { + int64_t array_size = args[0]; + int num_parts = args[1]; + + *rv = CreatePartitionRemainderBased(array_size, num_parts); + }); + +DGL_REGISTER_GLOBAL("partition._CAPI_DGLNDArrayPartitionCreateRangeBased") + .set_body([](DGLArgs args, DGLRetValue* rv) { + const int64_t array_size = args[0]; + const int num_parts = args[1]; + IdArray range = args[2]; + + *rv = CreatePartitionRangeBased(array_size, num_parts, range); + }); + +DGL_REGISTER_GLOBAL("partition._CAPI_DGLNDArrayPartitionGetPartSize") + .set_body([](DGLArgs args, DGLRetValue* rv) { + NDArrayPartitionRef part = args[0]; + int part_id = args[1]; + + *rv = part->PartSize(part_id); + }); + +DGL_REGISTER_GLOBAL("partition._CAPI_DGLNDArrayPartitionMapToLocal") + .set_body([](DGLArgs args, DGLRetValue* rv) { + NDArrayPartitionRef part = args[0]; + IdArray idxs = args[1]; + + *rv = part->MapToLocal(idxs); + }); + +DGL_REGISTER_GLOBAL("partition._CAPI_DGLNDArrayPartitionMapToGlobal") + .set_body([](DGLArgs args, DGLRetValue* rv) { + NDArrayPartitionRef part = args[0]; + IdArray idxs = args[1]; + const int part_id = args[2]; + + *rv = part->MapToGlobal(idxs, part_id); + }); + 
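// Illustrative sketch (added for review; not part of the patch): a plain CPU
// reference for the contract of GeneratePermutation as used by the
// remainder-based partition above -- a permutation that groups input indices
// by owning partition, plus the per-partition counts. The function name and
// the std::vector types are invented for illustration; the real code operates
// on IdArray and runs on the GPU.
#include <algorithm>
#include <cstdint>
#include <numeric>
#include <utility>
#include <vector>

std::pair<std::vector<int64_t>, std::vector<int64_t>> GeneratePermutationCPU(
    int num_parts, const std::vector<int64_t>& ids) {
  std::vector<int64_t> perm(ids.size());
  std::iota(perm.begin(), perm.end(), int64_t{0});
  // Group indices by owning partition (id % num_parts); the stable sort
  // mirrors the stable radix sort used in the CUDA implementation.
  std::stable_sort(perm.begin(), perm.end(), [&](int64_t a, int64_t b) {
    return ids[a] % num_parts < ids[b] % num_parts;
  });
  // Per-partition counts, mirroring the DeviceHistogram::HistogramEven call.
  std::vector<int64_t> counts(num_parts, 0);
  for (int64_t id : ids) ++counts[id % num_parts];
  return {perm, counts};
}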
+DGL_REGISTER_GLOBAL("partition._CAPI_DGLNDArrayPartitionGeneratePermutation") + .set_body([](DGLArgs args, DGLRetValue* rv) { + NDArrayPartitionRef part = args[0]; + IdArray idxs = args[1]; + + std::pair part_perm = part->GeneratePermutation(idxs); + *rv = + ConvertNDArrayVectorToPackedFunc({part_perm.first, part_perm.second}); + }); + +} // namespace partition +} // namespace dgl diff --git a/src/random/continuous_seed.h b/src/random/continuous_seed.h index 60ce6762806b..a4478f4e8084 100644 --- a/src/random/continuous_seed.h +++ b/src/random/continuous_seed.h @@ -25,7 +25,7 @@ #include #ifdef __NVCC__ -#include +#include #else #include @@ -61,16 +61,16 @@ class continuous_seed { #ifdef __CUDA_ARCH__ __device__ inline float uniform(const uint64_t t) const { const uint64_t kCurandSeed = 999961; // Could be any random number. - curandStatePhilox4_32_10_t rng; - curand_init(kCurandSeed, s[0], t, &rng); + hiprandStatePhilox4_32_10_t rng; + hiprand_init(kCurandSeed, s[0], t, &rng); float rnd; if (s[0] != s[1]) { - rnd = c[0] * curand_normal(&rng); - curand_init(kCurandSeed, s[1], t, &rng); - rnd += c[1] * curand_normal(&rng); + rnd = c[0] * hiprand_normal(&rng); + hiprand_init(kCurandSeed, s[1], t, &rng); + rnd += c[1] * hiprand_normal(&rng); rnd = normcdff(rnd); } else { - rnd = curand_uniform(&rng); + rnd = hiprand_uniform(&rng); } return rnd; } diff --git a/src/random/continuous_seed.h.prehip b/src/random/continuous_seed.h.prehip new file mode 100644 index 000000000000..60ce6762806b --- /dev/null +++ b/src/random/continuous_seed.h.prehip @@ -0,0 +1,100 @@ +/*! + * Copyright (c) 2023, GT-TDAlab (Muhammed Fatih Balin & Umit V. Catalyurek) + * All rights reserved. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + * + * @file dgl/continuous_seed.h + * @brief CPU and CUDA implementation for continuous random seeds + */ +#ifndef DGL_RANDOM_CONTINUOUS_SEED_H_ +#define DGL_RANDOM_CONTINUOUS_SEED_H_ + +#include + +#include + +#ifdef __NVCC__ +#include +#else +#include + +#include "pcg_random.hpp" +#endif // __CUDA_ARCH__ + +#ifndef M_SQRT1_2 +#define M_SQRT1_2 0.707106781186547524401 +#endif // M_SQRT1_2 + +namespace dgl { +namespace random { + +class continuous_seed { + uint64_t s[2]; + float c[2]; + + public: + /* implicit */ continuous_seed(const int64_t seed) { // NOLINT + s[0] = s[1] = seed; + c[0] = c[1] = 0; + } + + continuous_seed(IdArray seed_arr, float r) { + auto seed = seed_arr.Ptr(); + s[0] = seed[0]; + s[1] = seed[seed_arr->shape[0] - 1]; + const auto pi = std::acos(-1.0); + c[0] = std::cos(pi * r / 2); + c[1] = std::sin(pi * r / 2); + } + +#ifdef __CUDA_ARCH__ + __device__ inline float uniform(const uint64_t t) const { + const uint64_t kCurandSeed = 999961; // Could be any random number. 
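    // Added note (not in the original source): when the two seeds differ, the
    // code below blends two Philox-generated normal samples with weights
    // c[0] = cos(pi*r/2) and c[1] = sin(pi*r/2); since c[0]^2 + c[1]^2 = 1 the
    // blend is again standard normal, and normcdff() maps it back to a
    // uniform value in (0, 1).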
+ curandStatePhilox4_32_10_t rng; + curand_init(kCurandSeed, s[0], t, &rng); + float rnd; + if (s[0] != s[1]) { + rnd = c[0] * curand_normal(&rng); + curand_init(kCurandSeed, s[1], t, &rng); + rnd += c[1] * curand_normal(&rng); + rnd = normcdff(rnd); + } else { + rnd = curand_uniform(&rng); + } + return rnd; + } +#else + inline float uniform(const uint64_t t) const { + pcg32 ng0(s[0], t); + float rnd; + if (s[0] != s[1]) { + std::normal_distribution norm; + rnd = c[0] * norm(ng0); + pcg32 ng1(s[1], t); + norm.reset(); + rnd += c[1] * norm(ng1); + rnd = std::erfc(-rnd * static_cast(M_SQRT1_2)) / 2.0f; + } else { + std::uniform_real_distribution uni; + rnd = uni(ng0); + } + return rnd; + } +#endif // __CUDA_ARCH__ +}; + +} // namespace random +} // namespace dgl + +#endif // DGL_RANDOM_CONTINUOUS_SEED_H_ diff --git a/src/runtime/c_runtime_api.cc b/src/runtime/c_runtime_api.cc index a679e3576395..583a1816a6d1 100644 --- a/src/runtime/c_runtime_api.cc +++ b/src/runtime/c_runtime_api.cc @@ -122,13 +122,13 @@ void DeviceAPI::SyncStreamFromTo( } bool DeviceAPI::PinData(void* ptr, size_t nbytes) { - LOG(FATAL) << "Device does not support cudaHostRegister api."; + LOG(FATAL) << "Device does not support hipHostRegister api."; return false; } void* DeviceAPI::AllocPinnedDataSpace( size_t nbytes, void** ctx, void** deleter) { - LOG(FATAL) << "Device does not support cudaHostAlloc api."; + LOG(FATAL) << "Device does not support hipHostAlloc api."; return nullptr; } @@ -137,7 +137,7 @@ void DeviceAPI::FreePinnedDataSpace(void** deleter) { } void DeviceAPI::UnpinData(void* ptr) { - LOG(FATAL) << "Device does not support cudaHostUnregister api."; + LOG(FATAL) << "Device does not support hipHostUnregister api."; } } // namespace runtime } // namespace dgl diff --git a/src/runtime/c_runtime_api.cc.prehip b/src/runtime/c_runtime_api.cc.prehip new file mode 100644 index 000000000000..a679e3576395 --- /dev/null +++ b/src/runtime/c_runtime_api.cc.prehip @@ -0,0 +1,418 @@ +/** + * Copyright (c) 2016-2022 by Contributors + * @file c_runtime_api.cc + * @brief Runtime API implementation + */ +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include + +#include "runtime_base.h" + +namespace dgl { +namespace runtime { + +/** + * @brief The name of Device API factory. + * @param type The device type. + */ +inline std::string DeviceName(int type) { + switch (type) { + case kDGLCPU: + return "cpu"; + case kDGLCUDA: + return "cuda"; + // add more device here once supported + default: + LOG(FATAL) << "unknown type =" << type; + return "Unknown"; + } +} + +class DeviceAPIManager { + public: + static const int kMaxDeviceAPI = 32; + // Get API + static DeviceAPI* Get(const DGLContext& ctx) { return Get(ctx.device_type); } + static DeviceAPI* Get(int dev_type, bool allow_missing = false) { + return Global()->GetAPI(dev_type, allow_missing); + } + + private: + std::array api_; + DeviceAPI* rpc_api_{nullptr}; + std::mutex mutex_; + // constructor + DeviceAPIManager() { std::fill(api_.begin(), api_.end(), nullptr); } + // Global static variable. + static DeviceAPIManager* Global() { + static DeviceAPIManager inst; + return &inst; + } + // Get or initialize API. 
+ DeviceAPI* GetAPI(int type, bool allow_missing) { + if (type < kRPCSessMask) { + if (api_[type] != nullptr) return api_[type]; + std::lock_guard lock(mutex_); + if (api_[type] != nullptr) return api_[type]; + api_[type] = GetAPI(DeviceName(type), allow_missing); + return api_[type]; + } else { + if (rpc_api_ != nullptr) return rpc_api_; + std::lock_guard lock(mutex_); + if (rpc_api_ != nullptr) return rpc_api_; + rpc_api_ = GetAPI("rpc", allow_missing); + return rpc_api_; + } + } + DeviceAPI* GetAPI(const std::string name, bool allow_missing) { + std::string factory = "device_api." + name; + auto* f = Registry::Get(factory); + if (f == nullptr) { + CHECK(allow_missing) + << "Device API " << name + << " is not enabled. Please install the cuda version of dgl."; + return nullptr; + } + void* ptr = (*f)(); + return static_cast(ptr); + } +}; + +DeviceAPI* DeviceAPI::Get(DGLContext ctx, bool allow_missing) { + return DeviceAPIManager::Get( + static_cast(ctx.device_type), allow_missing); +} + +DeviceAPI* DeviceAPI::Get(DGLDeviceType dev_type, bool allow_missing) { + return DeviceAPIManager::Get(static_cast(dev_type), allow_missing); +} + +void* DeviceAPI::AllocWorkspace( + DGLContext ctx, size_t size, DGLDataType type_hint) { + return AllocDataSpace(ctx, size, kTempAllocaAlignment, type_hint); +} + +void DeviceAPI::FreeWorkspace(DGLContext ctx, void* ptr) { + FreeDataSpace(ctx, ptr); +} + +DGLStreamHandle DeviceAPI::CreateStream(DGLContext ctx) { + LOG(FATAL) << "Device does not support stream api."; + return 0; +} + +void DeviceAPI::FreeStream(DGLContext ctx, DGLStreamHandle stream) { + LOG(FATAL) << "Device does not support stream api."; +} + +void DeviceAPI::SyncStreamFromTo( + DGLContext ctx, DGLStreamHandle event_src, DGLStreamHandle event_dst) { + LOG(FATAL) << "Device does not support stream api."; +} + +bool DeviceAPI::PinData(void* ptr, size_t nbytes) { + LOG(FATAL) << "Device does not support cudaHostRegister api."; + return false; +} + +void* DeviceAPI::AllocPinnedDataSpace( + size_t nbytes, void** ctx, void** deleter) { + LOG(FATAL) << "Device does not support cudaHostAlloc api."; + return nullptr; +} + +void DeviceAPI::FreePinnedDataSpace(void** deleter) { + LOG(FATAL) << "Device does not support cudaHostFree api."; +} + +void DeviceAPI::UnpinData(void* ptr) { + LOG(FATAL) << "Device does not support cudaHostUnregister api."; +} +} // namespace runtime +} // namespace dgl + +using namespace dgl::runtime; + +struct DGLRuntimeEntry { + std::string ret_str; + std::string last_error; + DGLByteArray ret_bytes; +}; + +typedef dmlc::ThreadLocalStore DGLAPIRuntimeStore; + +const char* DGLGetLastError() { + return DGLAPIRuntimeStore::Get()->last_error.c_str(); +} + +void DGLAPISetLastError(const char* msg) { +#ifndef _LIBCPP_SGX_CONFIG + DGLAPIRuntimeStore::Get()->last_error = msg; +#else + sgx::OCallPackedFunc("__sgx_set_last_error__", msg); +#endif +} + +int DGLModLoadFromFile( + const char* file_name, const char* format, DGLModuleHandle* out) { + API_BEGIN(); + Module m = Module::LoadFromFile(file_name, format); + *out = new Module(m); + API_END(); +} + +int DGLModImport(DGLModuleHandle mod, DGLModuleHandle dep) { + API_BEGIN(); + static_cast(mod)->Import(*static_cast(dep)); + API_END(); +} + +int DGLModGetFunction( + DGLModuleHandle mod, const char* func_name, int query_imports, + DGLFunctionHandle* func) { + API_BEGIN(); + PackedFunc pf = + static_cast(mod)->GetFunction(func_name, query_imports != 0); + if (pf != nullptr) { + *func = new PackedFunc(pf); + } else { + *func = nullptr; + 
} + API_END(); +} + +int DGLModFree(DGLModuleHandle mod) { + API_BEGIN(); + delete static_cast(mod); + API_END(); +} + +int DGLBackendGetFuncFromEnv( + void* mod_node, const char* func_name, DGLFunctionHandle* func) { + API_BEGIN(); + *func = + (DGLFunctionHandle)(static_cast(mod_node)->GetFuncFromEnv( + func_name)); + API_END(); +} + +void* DGLBackendAllocWorkspace( + int device_type, int device_id, uint64_t size, int dtype_code_hint, + int dtype_bits_hint) { + DGLContext ctx; + ctx.device_type = static_cast(device_type); + ctx.device_id = device_id; + + DGLDataType type_hint; + type_hint.code = static_cast(dtype_code_hint); + type_hint.bits = static_cast(dtype_bits_hint); + type_hint.lanes = 1; + + return DeviceAPIManager::Get(ctx)->AllocWorkspace( + ctx, static_cast(size), type_hint); +} + +int DGLBackendFreeWorkspace(int device_type, int device_id, void* ptr) { + DGLContext ctx; + ctx.device_type = static_cast(device_type); + ctx.device_id = device_id; + DeviceAPIManager::Get(ctx)->FreeWorkspace(ctx, ptr); + return 0; +} + +int DGLBackendRunOnce(void** handle, int (*f)(void*), void* cdata, int nbytes) { + if (*handle == nullptr) { + *handle = reinterpret_cast(1); + return (*f)(cdata); + } + return 0; +} + +int DGLFuncFree(DGLFunctionHandle func) { + API_BEGIN(); + delete static_cast(func); + API_END(); +} + +int DGLFuncCall( + DGLFunctionHandle func, DGLValue* args, int* arg_type_codes, int num_args, + DGLValue* ret_val, int* ret_type_code) { + API_BEGIN(); + DGLRetValue rv; + (*static_cast(func)) + .CallPacked(DGLArgs(args, arg_type_codes, num_args), &rv); + // handle return string. + if (rv.type_code() == kStr || rv.type_code() == kDGLDataType || + rv.type_code() == kBytes) { + DGLRuntimeEntry* e = DGLAPIRuntimeStore::Get(); + if (rv.type_code() != kDGLDataType) { + e->ret_str = *rv.ptr(); + } else { + e->ret_str = rv.operator std::string(); + } + if (rv.type_code() == kBytes) { + e->ret_bytes.data = e->ret_str.c_str(); + e->ret_bytes.size = e->ret_str.length(); + *ret_type_code = kBytes; + ret_val->v_handle = &(e->ret_bytes); + } else { + *ret_type_code = kStr; + ret_val->v_str = e->ret_str.c_str(); + } + } else { + rv.MoveToCHost(ret_val, ret_type_code); + } + API_END(); +} + +int DGLCFuncSetReturn( + DGLRetValueHandle ret, DGLValue* value, int* type_code, int num_ret) { + API_BEGIN(); + CHECK_EQ(num_ret, 1); + DGLRetValue* rv = static_cast(ret); + *rv = DGLArgValue(value[0], type_code[0]); + API_END(); +} + +int DGLFuncCreateFromCFunc( + DGLPackedCFunc func, void* resource_handle, DGLPackedCFuncFinalizer fin, + DGLFunctionHandle* out) { + API_BEGIN(); + if (fin == nullptr) { + *out = + new PackedFunc([func, resource_handle](DGLArgs args, DGLRetValue* rv) { + int ret = func( + (DGLValue*)args.values, (int*)args.type_codes, // NOLINT(*) + args.num_args, rv, resource_handle); + if (ret != 0) { + std::string err = "DGLCall CFunc Error:\n"; + err += DGLGetLastError(); + throw dmlc::Error(err); + } + }); + } else { + // wrap it in a shared_ptr, with fin as deleter. + // so fin will be called when the lambda went out of scope. 
+ std::shared_ptr rpack(resource_handle, fin); + *out = new PackedFunc([func, rpack](DGLArgs args, DGLRetValue* rv) { + int ret = func( + (DGLValue*)args.values, (int*)args.type_codes, // NOLINT(*) + args.num_args, rv, rpack.get()); + if (ret != 0) { + std::string err = "DGLCall CFunc Error:\n"; + err += DGLGetLastError(); + throw dmlc::Error(err); + } + }); + } + API_END(); +} + +int DGLStreamCreate(int device_type, int device_id, DGLStreamHandle* out) { + API_BEGIN(); + DGLContext ctx; + ctx.device_type = static_cast(device_type); + ctx.device_id = device_id; + *out = DeviceAPIManager::Get(ctx)->CreateStream(ctx); + API_END(); +} + +int DGLStreamFree(int device_type, int device_id, DGLStreamHandle stream) { + API_BEGIN(); + DGLContext ctx; + ctx.device_type = static_cast(device_type); + ctx.device_id = device_id; + DeviceAPIManager::Get(ctx)->FreeStream(ctx, stream); + API_END(); +} + +int DGLSetStream(int device_type, int device_id, DGLStreamHandle stream) { + API_BEGIN(); + DGLContext ctx; + ctx.device_type = static_cast(device_type); + ctx.device_id = device_id; + DeviceAPIManager::Get(ctx)->SetStream(ctx, stream); + API_END(); +} + +int DGLGetStream(int device_type, int device_id, DGLStreamHandle* stream) { + API_BEGIN(); + DGLContext ctx; + ctx.device_type = static_cast(device_type); + ctx.device_id = device_id; + *stream = DeviceAPIManager::Get(ctx)->GetStream(); + API_END(); +} + +int DGLSynchronize(int device_type, int device_id, DGLStreamHandle stream) { + API_BEGIN(); + DGLContext ctx; + ctx.device_type = static_cast(device_type); + ctx.device_id = device_id; + DeviceAPIManager::Get(ctx)->StreamSync(ctx, stream); + API_END(); +} + +int DGLStreamStreamSynchronize( + int device_type, int device_id, DGLStreamHandle src, DGLStreamHandle dst) { + API_BEGIN(); + DGLContext ctx; + ctx.device_type = static_cast(device_type); + ctx.device_id = device_id; + DeviceAPIManager::Get(ctx)->SyncStreamFromTo(ctx, src, dst); + API_END(); +} + +int DGLCbArgToReturn(DGLValue* value, int code) { + API_BEGIN(); + dgl::runtime::DGLRetValue rv; + rv = dgl::runtime::DGLArgValue(*value, code); + int tcode; + rv.MoveToCHost(value, &tcode); + CHECK_EQ(tcode, code); + API_END(); +} + +int DGLLoadTensorAdapter(const char* path) { + return TensorDispatcher::Global()->Load(path) ? 
0 : -1; +} + +// set device api +DGL_REGISTER_GLOBAL(dgl::runtime::symbol::dgl_set_device) + .set_body([](DGLArgs args, DGLRetValue* ret) { + DGLContext ctx; + ctx.device_type = static_cast(args[0].operator int()); + ctx.device_id = args[1]; + DeviceAPIManager::Get(ctx)->SetDevice(ctx); + }); + +// set device api +DGL_REGISTER_GLOBAL("_GetDeviceAttr") + .set_body([](DGLArgs args, DGLRetValue* ret) { + DGLContext ctx; + ctx.device_type = static_cast(args[0].operator int()); + ctx.device_id = args[1]; + + DeviceAttrKind kind = static_cast(args[2].operator int()); + if (kind == kExist) { + DeviceAPI* api = DeviceAPIManager::Get(ctx.device_type, true); + if (api != nullptr) { + api->GetAttr(ctx, kind, ret); + } else { + *ret = 0; + } + } else { + DeviceAPIManager::Get(ctx)->GetAttr(ctx, kind, ret); + } + }); diff --git a/src/runtime/cuda/cuda_common.h b/src/runtime/cuda/cuda_common.h index 2fa26e101342..1534b5ce0c38 100644 --- a/src/runtime/cuda/cuda_common.h +++ b/src/runtime/cuda/cuda_common.h @@ -6,10 +6,10 @@ #ifndef DGL_RUNTIME_CUDA_CUDA_COMMON_H_ #define DGL_RUNTIME_CUDA_CUDA_COMMON_H_ -#include -#include -#include -#include +#include +#include +#include +#include #include #include @@ -26,7 +26,7 @@ namespace runtime { runtime::CUDAWorkspaceAllocator allocator(ctx); const auto stream = runtime::getCurrentCUDAStream(); - const auto exec_policy = thrust::cuda::par_nosync(allocator).on(stream); + const auto exec_policy = thrust::hip::par_nosync(allocator).on(stream); now, one can pass exec_policy to thrust functions @@ -79,112 +79,112 @@ inline bool is_zero(dim3 size) { #define CUDA_DRIVER_CALL(x) \ { \ - CUresult result = x; \ - if (result != CUDA_SUCCESS && result != CUDA_ERROR_DEINITIALIZED) { \ + hipError_t result = x; \ + if (result != hipSuccess && result != hipErrorDeinitialized) { \ const char* msg; \ - cuGetErrorName(result, &msg); \ + hipDrvGetErrorName(result, &msg); \ LOG(FATAL) << "CUDAError: " #x " failed with error: " << msg; \ } \ } #define CUDA_CALL(func) \ { \ - cudaError_t e = (func); \ - CHECK(e == cudaSuccess || e == cudaErrorCudartUnloading) \ - << "CUDA: " << cudaGetErrorString(e); \ + hipError_t e = (func); \ + CHECK(e == hipSuccess || e == hipErrorDeinitialized) \ + << "CUDA: " << hipGetErrorString(e); \ } #define CUDA_KERNEL_CALL(kernel, nblks, nthrs, shmem, stream, ...) 
\ { \ if (!dgl::runtime::is_zero((nblks)) && !dgl::runtime::is_zero((nthrs))) { \ (kernel)<<<(nblks), (nthrs), (shmem), (stream)>>>(__VA_ARGS__); \ - cudaError_t e = cudaGetLastError(); \ - CHECK(e == cudaSuccess || e == cudaErrorCudartUnloading) \ - << "CUDA kernel launch error: " << cudaGetErrorString(e); \ + hipError_t e = hipGetLastError(); \ + CHECK(e == hipSuccess || e == hipErrorDeinitialized) \ + << "CUDA kernel launch error: " << hipGetErrorString(e); \ } \ } #define CUSPARSE_CALL(func) \ { \ - cusparseStatus_t e = (func); \ - CHECK(e == CUSPARSE_STATUS_SUCCESS) << "CUSPARSE ERROR: " << e; \ + hipsparseStatus_t e = (func); \ + CHECK(e == HIPSPARSE_STATUS_SUCCESS) << "CUSPARSE ERROR: " << e; \ } #define CUBLAS_CALL(func) \ { \ - cublasStatus_t e = (func); \ - CHECK(e == CUBLAS_STATUS_SUCCESS) << "CUBLAS ERROR: " << e; \ + hipblasStatus_t e = (func); \ + CHECK(e == HIPBLAS_STATUS_SUCCESS) << "CUBLAS ERROR: " << e; \ } #define CURAND_CALL(func) \ { \ - curandStatus_t e = (func); \ - CHECK(e == CURAND_STATUS_SUCCESS) \ + hiprandStatus_t e = (func); \ + CHECK(e == HIPRAND_STATUS_SUCCESS) \ << "CURAND Error: " << dgl::runtime::curandGetErrorString(e) << " at " \ << __FILE__ << ":" << __LINE__; \ } -inline const char* curandGetErrorString(curandStatus_t error) { +inline const char* curandGetErrorString(hiprandStatus_t error) { switch (error) { - case CURAND_STATUS_SUCCESS: - return "CURAND_STATUS_SUCCESS"; - case CURAND_STATUS_VERSION_MISMATCH: - return "CURAND_STATUS_VERSION_MISMATCH"; - case CURAND_STATUS_NOT_INITIALIZED: - return "CURAND_STATUS_NOT_INITIALIZED"; - case CURAND_STATUS_ALLOCATION_FAILED: - return "CURAND_STATUS_ALLOCATION_FAILED"; - case CURAND_STATUS_TYPE_ERROR: - return "CURAND_STATUS_TYPE_ERROR"; - case CURAND_STATUS_OUT_OF_RANGE: - return "CURAND_STATUS_OUT_OF_RANGE"; - case CURAND_STATUS_LENGTH_NOT_MULTIPLE: - return "CURAND_STATUS_LENGTH_NOT_MULTIPLE"; - case CURAND_STATUS_DOUBLE_PRECISION_REQUIRED: - return "CURAND_STATUS_DOUBLE_PRECISION_REQUIRED"; - case CURAND_STATUS_LAUNCH_FAILURE: - return "CURAND_STATUS_LAUNCH_FAILURE"; - case CURAND_STATUS_PREEXISTING_FAILURE: - return "CURAND_STATUS_PREEXISTING_FAILURE"; - case CURAND_STATUS_INITIALIZATION_FAILED: - return "CURAND_STATUS_INITIALIZATION_FAILED"; - case CURAND_STATUS_ARCH_MISMATCH: - return "CURAND_STATUS_ARCH_MISMATCH"; - case CURAND_STATUS_INTERNAL_ERROR: - return "CURAND_STATUS_INTERNAL_ERROR"; + case HIPRAND_STATUS_SUCCESS: + return "HIPRAND_STATUS_SUCCESS"; + case HIPRAND_STATUS_VERSION_MISMATCH: + return "HIPRAND_STATUS_VERSION_MISMATCH"; + case HIPRAND_STATUS_NOT_INITIALIZED: + return "HIPRAND_STATUS_NOT_INITIALIZED"; + case HIPRAND_STATUS_ALLOCATION_FAILED: + return "HIPRAND_STATUS_ALLOCATION_FAILED"; + case HIPRAND_STATUS_TYPE_ERROR: + return "HIPRAND_STATUS_TYPE_ERROR"; + case HIPRAND_STATUS_OUT_OF_RANGE: + return "HIPRAND_STATUS_OUT_OF_RANGE"; + case HIPRAND_STATUS_LENGTH_NOT_MULTIPLE: + return "HIPRAND_STATUS_LENGTH_NOT_MULTIPLE"; + case HIPRAND_STATUS_DOUBLE_PRECISION_REQUIRED: + return "HIPRAND_STATUS_DOUBLE_PRECISION_REQUIRED"; + case HIPRAND_STATUS_LAUNCH_FAILURE: + return "HIPRAND_STATUS_LAUNCH_FAILURE"; + case HIPRAND_STATUS_PREEXISTING_FAILURE: + return "HIPRAND_STATUS_PREEXISTING_FAILURE"; + case HIPRAND_STATUS_INITIALIZATION_FAILED: + return "HIPRAND_STATUS_INITIALIZATION_FAILED"; + case HIPRAND_STATUS_ARCH_MISMATCH: + return "HIPRAND_STATUS_ARCH_MISMATCH"; + case HIPRAND_STATUS_INTERNAL_ERROR: + return "HIPRAND_STATUS_INTERNAL_ERROR"; } // To suppress compiler warning. 
- return "Unrecognized curand error string"; + return "Unrecognized hiprand error string"; } /** - * @brief Cast data type to cudaDataType_t. + * @brief Cast data type to hipDataType. */ template struct cuda_dtype { - static constexpr cudaDataType_t value = CUDA_R_32F; + static constexpr hipDataType value = HIP_R_32F; }; template <> struct cuda_dtype<__half> { - static constexpr cudaDataType_t value = CUDA_R_16F; + static constexpr hipDataType value = HIP_R_16F; }; #if BF16_ENABLED template <> -struct cuda_dtype<__nv_bfloat16> { - static constexpr cudaDataType_t value = CUDA_R_16BF; +struct cuda_dtype<__hip_bfloat16> { + static constexpr hipDataType value = HIP_R_16BF; }; #endif // BF16_ENABLED template <> struct cuda_dtype { - static constexpr cudaDataType_t value = CUDA_R_32F; + static constexpr hipDataType value = HIP_R_32F; }; template <> struct cuda_dtype { - static constexpr cudaDataType_t value = CUDA_R_64F; + static constexpr hipDataType value = HIP_R_64F; }; /* @@ -202,7 +202,7 @@ struct accum_dtype<__half> { #if BF16_ENABLED template <> -struct accum_dtype<__nv_bfloat16> { +struct accum_dtype<__hip_bfloat16> { typedef float type; }; #endif // BF16_ENABLED @@ -219,21 +219,21 @@ struct accum_dtype { #if CUDART_VERSION >= 11000 /** - * @brief Cast index data type to cusparseIndexType_t. + * @brief Cast index data type to hipsparseIndexType_t. */ template struct cusparse_idtype { - static constexpr cusparseIndexType_t value = CUSPARSE_INDEX_32I; + static constexpr hipsparseIndexType_t value = HIPSPARSE_INDEX_32I; }; template <> struct cusparse_idtype { - static constexpr cusparseIndexType_t value = CUSPARSE_INDEX_32I; + static constexpr hipsparseIndexType_t value = HIPSPARSE_INDEX_32I; }; template <> struct cusparse_idtype { - static constexpr cusparseIndexType_t value = CUSPARSE_INDEX_64I; + static constexpr hipsparseIndexType_t value = HIPSPARSE_INDEX_64I; }; #endif @@ -241,9 +241,9 @@ struct cusparse_idtype { class CUDAThreadEntry { public: /** @brief The cusparse handler */ - cusparseHandle_t cusparse_handle{nullptr}; + hipsparseHandle_t cusparse_handle{nullptr}; /** @brief The cublas handler */ - cublasHandle_t cublas_handle{nullptr}; + hipblasHandle_t cublas_handle{nullptr}; /** @brief thread local pool*/ WorkspacePool pool; /** @brief constructor */ @@ -253,7 +253,7 @@ class CUDAThreadEntry { }; /** @brief Get the current CUDA stream */ -cudaStream_t getCurrentCUDAStream(); +hipStream_t getCurrentCUDAStream(); } // namespace runtime } // namespace dgl #endif // DGL_RUNTIME_CUDA_CUDA_COMMON_H_ diff --git a/src/runtime/cuda/cuda_common.h.prehip b/src/runtime/cuda/cuda_common.h.prehip new file mode 100644 index 000000000000..2fa26e101342 --- /dev/null +++ b/src/runtime/cuda/cuda_common.h.prehip @@ -0,0 +1,259 @@ +/** + * Copyright (c) 2017 by Contributors + * @file cuda_common.h + * @brief Common utilities for CUDA + */ +#ifndef DGL_RUNTIME_CUDA_CUDA_COMMON_H_ +#define DGL_RUNTIME_CUDA_CUDA_COMMON_H_ + +#include +#include +#include +#include +#include + +#include +#include + +#include "../workspace_pool.h" + +namespace dgl { +namespace runtime { + +/* + How to use this class to get a nonblocking thrust execution policy that uses + DGL's memory pool and the current cuda stream + + runtime::CUDAWorkspaceAllocator allocator(ctx); + const auto stream = runtime::getCurrentCUDAStream(); + const auto exec_policy = thrust::cuda::par_nosync(allocator).on(stream); + + now, one can pass exec_policy to thrust functions + + to get an integer array of size 1000 whose lifetime is managed by 
unique_ptr, + use: auto int_array = allocator.alloc_unique(1000); int_array.get() gives + the raw pointer. +*/ +class CUDAWorkspaceAllocator { + DGLContext ctx; + + public: + typedef char value_type; + + void operator()(void* ptr) const { + runtime::DeviceAPI::Get(ctx)->FreeWorkspace(ctx, ptr); + } + + explicit CUDAWorkspaceAllocator(DGLContext ctx) : ctx(ctx) {} + + CUDAWorkspaceAllocator& operator=(const CUDAWorkspaceAllocator&) = default; + + template + std::unique_ptr alloc_unique( + std::size_t size) const { + return std::unique_ptr( + reinterpret_cast(runtime::DeviceAPI::Get(ctx)->AllocWorkspace( + ctx, sizeof(T) * size)), + *this); + } + + char* allocate(std::ptrdiff_t size) const { + return reinterpret_cast( + runtime::DeviceAPI::Get(ctx)->AllocWorkspace(ctx, size)); + } + + void deallocate(char* ptr, std::size_t) const { + runtime::DeviceAPI::Get(ctx)->FreeWorkspace(ctx, ptr); + } +}; + +template +inline bool is_zero(T size) { + return size == 0; +} + +template <> +inline bool is_zero(dim3 size) { + return size.x == 0 || size.y == 0 || size.z == 0; +} + +#define CUDA_DRIVER_CALL(x) \ + { \ + CUresult result = x; \ + if (result != CUDA_SUCCESS && result != CUDA_ERROR_DEINITIALIZED) { \ + const char* msg; \ + cuGetErrorName(result, &msg); \ + LOG(FATAL) << "CUDAError: " #x " failed with error: " << msg; \ + } \ + } + +#define CUDA_CALL(func) \ + { \ + cudaError_t e = (func); \ + CHECK(e == cudaSuccess || e == cudaErrorCudartUnloading) \ + << "CUDA: " << cudaGetErrorString(e); \ + } + +#define CUDA_KERNEL_CALL(kernel, nblks, nthrs, shmem, stream, ...) \ + { \ + if (!dgl::runtime::is_zero((nblks)) && !dgl::runtime::is_zero((nthrs))) { \ + (kernel)<<<(nblks), (nthrs), (shmem), (stream)>>>(__VA_ARGS__); \ + cudaError_t e = cudaGetLastError(); \ + CHECK(e == cudaSuccess || e == cudaErrorCudartUnloading) \ + << "CUDA kernel launch error: " << cudaGetErrorString(e); \ + } \ + } + +#define CUSPARSE_CALL(func) \ + { \ + cusparseStatus_t e = (func); \ + CHECK(e == CUSPARSE_STATUS_SUCCESS) << "CUSPARSE ERROR: " << e; \ + } + +#define CUBLAS_CALL(func) \ + { \ + cublasStatus_t e = (func); \ + CHECK(e == CUBLAS_STATUS_SUCCESS) << "CUBLAS ERROR: " << e; \ + } + +#define CURAND_CALL(func) \ + { \ + curandStatus_t e = (func); \ + CHECK(e == CURAND_STATUS_SUCCESS) \ + << "CURAND Error: " << dgl::runtime::curandGetErrorString(e) << " at " \ + << __FILE__ << ":" << __LINE__; \ + } + +inline const char* curandGetErrorString(curandStatus_t error) { + switch (error) { + case CURAND_STATUS_SUCCESS: + return "CURAND_STATUS_SUCCESS"; + case CURAND_STATUS_VERSION_MISMATCH: + return "CURAND_STATUS_VERSION_MISMATCH"; + case CURAND_STATUS_NOT_INITIALIZED: + return "CURAND_STATUS_NOT_INITIALIZED"; + case CURAND_STATUS_ALLOCATION_FAILED: + return "CURAND_STATUS_ALLOCATION_FAILED"; + case CURAND_STATUS_TYPE_ERROR: + return "CURAND_STATUS_TYPE_ERROR"; + case CURAND_STATUS_OUT_OF_RANGE: + return "CURAND_STATUS_OUT_OF_RANGE"; + case CURAND_STATUS_LENGTH_NOT_MULTIPLE: + return "CURAND_STATUS_LENGTH_NOT_MULTIPLE"; + case CURAND_STATUS_DOUBLE_PRECISION_REQUIRED: + return "CURAND_STATUS_DOUBLE_PRECISION_REQUIRED"; + case CURAND_STATUS_LAUNCH_FAILURE: + return "CURAND_STATUS_LAUNCH_FAILURE"; + case CURAND_STATUS_PREEXISTING_FAILURE: + return "CURAND_STATUS_PREEXISTING_FAILURE"; + case CURAND_STATUS_INITIALIZATION_FAILED: + return "CURAND_STATUS_INITIALIZATION_FAILED"; + case CURAND_STATUS_ARCH_MISMATCH: + return "CURAND_STATUS_ARCH_MISMATCH"; + case CURAND_STATUS_INTERNAL_ERROR: + return "CURAND_STATUS_INTERNAL_ERROR"; + 
} + // To suppress compiler warning. + return "Unrecognized curand error string"; +} + +/** + * @brief Cast data type to cudaDataType_t. + */ +template +struct cuda_dtype { + static constexpr cudaDataType_t value = CUDA_R_32F; +}; + +template <> +struct cuda_dtype<__half> { + static constexpr cudaDataType_t value = CUDA_R_16F; +}; + +#if BF16_ENABLED +template <> +struct cuda_dtype<__nv_bfloat16> { + static constexpr cudaDataType_t value = CUDA_R_16BF; +}; +#endif // BF16_ENABLED + +template <> +struct cuda_dtype { + static constexpr cudaDataType_t value = CUDA_R_32F; +}; + +template <> +struct cuda_dtype { + static constexpr cudaDataType_t value = CUDA_R_64F; +}; + +/* + * \brief Accumulator type for SpMM. + */ +template +struct accum_dtype { + typedef float type; +}; + +template <> +struct accum_dtype<__half> { + typedef float type; +}; + +#if BF16_ENABLED +template <> +struct accum_dtype<__nv_bfloat16> { + typedef float type; +}; +#endif // BF16_ENABLED + +template <> +struct accum_dtype { + typedef float type; +}; + +template <> +struct accum_dtype { + typedef double type; +}; + +#if CUDART_VERSION >= 11000 +/** + * @brief Cast index data type to cusparseIndexType_t. + */ +template +struct cusparse_idtype { + static constexpr cusparseIndexType_t value = CUSPARSE_INDEX_32I; +}; + +template <> +struct cusparse_idtype { + static constexpr cusparseIndexType_t value = CUSPARSE_INDEX_32I; +}; + +template <> +struct cusparse_idtype { + static constexpr cusparseIndexType_t value = CUSPARSE_INDEX_64I; +}; +#endif + +/** @brief Thread local workspace */ +class CUDAThreadEntry { + public: + /** @brief The cusparse handler */ + cusparseHandle_t cusparse_handle{nullptr}; + /** @brief The cublas handler */ + cublasHandle_t cublas_handle{nullptr}; + /** @brief thread local pool*/ + WorkspacePool pool; + /** @brief constructor */ + CUDAThreadEntry(); + // get the threadlocal workspace + static CUDAThreadEntry* ThreadLocal(); +}; + +/** @brief Get the current CUDA stream */ +cudaStream_t getCurrentCUDAStream(); +} // namespace runtime +} // namespace dgl +#endif // DGL_RUNTIME_CUDA_CUDA_COMMON_H_ diff --git a/src/runtime/cuda/cuda_device_api.cc b/src/runtime/cuda/cuda_device_api.cc index aa97d7a79d08..069938df9aab 100644 --- a/src/runtime/cuda/cuda_device_api.cc +++ b/src/runtime/cuda/cuda_device_api.cc @@ -3,7 +3,7 @@ * @file cuda_device_api.cc * @brief GPU specific API */ -#include +#include #include #include #include @@ -18,13 +18,13 @@ class CUDADeviceAPI final : public DeviceAPI { public: CUDADeviceAPI() { int count; - auto err = cudaGetDeviceCount(&count); + auto err = hipGetDeviceCount(&count); switch (err) { - case cudaSuccess: + case hipSuccess: break; default: count = 0; - cudaGetLastError(); + hipGetLastError(); } is_available_ = count > 0; } @@ -32,67 +32,67 @@ class CUDADeviceAPI final : public DeviceAPI { bool IsAvailable() final { return is_available_; } void SetDevice(DGLContext ctx) final { - CUDA_CALL(cudaSetDevice(ctx.device_id)); + CUDA_CALL(hipSetDevice(ctx.device_id)); } void GetAttr(DGLContext ctx, DeviceAttrKind kind, DGLRetValue* rv) final { int value = 0; switch (kind) { case kExist: value = - (cudaDeviceGetAttribute( - &value, cudaDevAttrMaxThreadsPerBlock, ctx.device_id) == - cudaSuccess); + (hipDeviceGetAttribute( + &value, hipDeviceAttributeMaxThreadsPerBlock, ctx.device_id) == + hipSuccess); break; case kMaxThreadsPerBlock: { - CUDA_CALL(cudaDeviceGetAttribute( - &value, cudaDevAttrMaxThreadsPerBlock, ctx.device_id)); + CUDA_CALL(hipDeviceGetAttribute( + &value, 
hipDeviceAttributeMaxThreadsPerBlock, ctx.device_id)); break; } case kWarpSize: { CUDA_CALL( - cudaDeviceGetAttribute(&value, cudaDevAttrWarpSize, ctx.device_id)); + hipDeviceGetAttribute(&value, hipDeviceAttributeWarpSize, ctx.device_id)); break; } case kMaxSharedMemoryPerBlock: { - CUDA_CALL(cudaDeviceGetAttribute( - &value, cudaDevAttrMaxSharedMemoryPerBlock, ctx.device_id)); + CUDA_CALL(hipDeviceGetAttribute( + &value, hipDeviceAttributeMaxSharedMemoryPerBlock, ctx.device_id)); break; } case kComputeVersion: { std::ostringstream os; - CUDA_CALL(cudaDeviceGetAttribute( - &value, cudaDevAttrComputeCapabilityMajor, ctx.device_id)); + CUDA_CALL(hipDeviceGetAttribute( + &value, hipDeviceAttributeComputeCapabilityMajor, ctx.device_id)); os << value << "."; - CUDA_CALL(cudaDeviceGetAttribute( - &value, cudaDevAttrComputeCapabilityMinor, ctx.device_id)); + CUDA_CALL(hipDeviceGetAttribute( + &value, hipDeviceAttributeComputeCapabilityMinor, ctx.device_id)); os << value; *rv = os.str(); return; } case kDeviceName: { - cudaDeviceProp props; - CUDA_CALL(cudaGetDeviceProperties(&props, ctx.device_id)); + hipDeviceProp_t props; + CUDA_CALL(hipGetDeviceProperties(&props, ctx.device_id)); *rv = std::string(props.name); return; } case kMaxClockRate: { - CUDA_CALL(cudaDeviceGetAttribute( - &value, cudaDevAttrClockRate, ctx.device_id)); + CUDA_CALL(hipDeviceGetAttribute( + &value, hipDeviceAttributeClockRate, ctx.device_id)); break; } case kMultiProcessorCount: { - CUDA_CALL(cudaDeviceGetAttribute( - &value, cudaDevAttrMultiProcessorCount, ctx.device_id)); + CUDA_CALL(hipDeviceGetAttribute( + &value, hipDeviceAttributeMultiprocessorCount, ctx.device_id)); break; } case kMaxThreadDimensions: { int dims[3]; - CUDA_CALL(cudaDeviceGetAttribute( - &dims[0], cudaDevAttrMaxBlockDimX, ctx.device_id)); - CUDA_CALL(cudaDeviceGetAttribute( - &dims[1], cudaDevAttrMaxBlockDimY, ctx.device_id)); - CUDA_CALL(cudaDeviceGetAttribute( - &dims[2], cudaDevAttrMaxBlockDimZ, ctx.device_id)); + CUDA_CALL(hipDeviceGetAttribute( + &dims[0], hipDeviceAttributeMaxBlockDimX, ctx.device_id)); + CUDA_CALL(hipDeviceGetAttribute( + &dims[1], hipDeviceAttributeMaxBlockDimY, ctx.device_id)); + CUDA_CALL(hipDeviceGetAttribute( + &dims[2], hipDeviceAttributeMaxBlockDimZ, ctx.device_id)); std::stringstream ss; // use json string to return multiple int values; ss << "[" << dims[0] << ", " << dims[1] << ", " << dims[2] << "]"; @@ -114,7 +114,7 @@ class CUDADeviceAPI final : public DeviceAPI { } CHECK_EQ(256 % alignment, 0U) << "CUDA space is aligned at 256 bytes"; void* ret; - CUDA_CALL(cudaMalloc(&ret, nbytes)); + CUDA_CALL(hipMalloc(&ret, nbytes)); return ret; } @@ -124,32 +124,32 @@ class CUDADeviceAPI final : public DeviceAPI { if (tensor_dispatcher->IsAvailable()) { return tensor_dispatcher->CUDAFreeWorkspace(ptr); } - CUDA_CALL(cudaFree(ptr)); + CUDA_CALL(hipFree(ptr)); } void CopyDataFromTo( const void* from, size_t from_offset, void* to, size_t to_offset, size_t size, DGLContext ctx_from, DGLContext ctx_to, DGLDataType type_hint, DGLStreamHandle stream) { - cudaStream_t cu_stream = static_cast(stream); + hipStream_t cu_stream = static_cast(stream); from = static_cast(from) + from_offset; to = static_cast(to) + to_offset; if (ctx_from.device_type == kDGLCUDA && ctx_to.device_type == kDGLCUDA) { - CUDA_CALL(cudaSetDevice(ctx_from.device_id)); + CUDA_CALL(hipSetDevice(ctx_from.device_id)); if (ctx_from.device_id == ctx_to.device_id) { - GPUCopy(from, to, size, cudaMemcpyDeviceToDevice, cu_stream); + GPUCopy(from, to, size, 
hipMemcpyDeviceToDevice, cu_stream); } else { - CUDA_CALL(cudaMemcpyPeerAsync( + CUDA_CALL(hipMemcpyPeerAsync( to, ctx_to.device_id, from, ctx_from.device_id, size, cu_stream)); } } else if ( ctx_from.device_type == kDGLCUDA && ctx_to.device_type == kDGLCPU) { - CUDA_CALL(cudaSetDevice(ctx_from.device_id)); - GPUCopy(from, to, size, cudaMemcpyDeviceToHost, cu_stream); + CUDA_CALL(hipSetDevice(ctx_from.device_id)); + GPUCopy(from, to, size, hipMemcpyDeviceToHost, cu_stream); } else if ( ctx_from.device_type == kDGLCPU && ctx_to.device_type == kDGLCUDA) { - CUDA_CALL(cudaSetDevice(ctx_to.device_id)); - GPUCopy(from, to, size, cudaMemcpyHostToDevice, cu_stream); + CUDA_CALL(hipSetDevice(ctx_to.device_id)); + GPUCopy(from, to, size, hipMemcpyHostToDevice, cu_stream); } else { LOG(FATAL) << "expect copy from/to GPU or between GPU"; } @@ -166,9 +166,9 @@ class CUDADeviceAPI final : public DeviceAPI { } // To ensure correct behavior, `record_event` must be invoked anytime a - // pointer from PyTorch CachingHostAllocator is used in a cudaMemcpyAsync + // pointer from PyTorch CachingHostAllocator is used in a hipMemcpyAsync // call. It provides a way to re-use freed pinned (page-locked) memory - // allocations and avoid device sync due to cudaFreeHost calls. + // allocations and avoid device sync due to hipHostFree calls. void RecordedCopyDataFromTo( void* from, size_t from_offset, void* to, size_t to_offset, size_t size, DGLContext ctx_from, DGLContext ctx_to, DGLDataType type_hint, @@ -179,7 +179,7 @@ class CUDADeviceAPI final : public DeviceAPI { stream); auto tensor_dispatcher = TensorDispatcher::Global(); if (tensor_dispatcher->IsAvailable()) { - auto custream = static_cast(stream); + auto custream = static_cast(stream); void* ptr = ctx_to.device_type == kDGLCPU ? to : from; int id = ctx_to.device_type == kDGLCPU ? 
ctx_from.device_id : ctx_to.device_id; @@ -188,34 +188,34 @@ class CUDADeviceAPI final : public DeviceAPI { } DGLStreamHandle CreateStream(DGLContext ctx) { - CUDA_CALL(cudaSetDevice(ctx.device_id)); - cudaStream_t retval; + CUDA_CALL(hipSetDevice(ctx.device_id)); + hipStream_t retval; // make sure the legacy default stream won't block on this stream - CUDA_CALL(cudaStreamCreateWithFlags(&retval, cudaStreamNonBlocking)); + CUDA_CALL(hipStreamCreateWithFlags(&retval, hipStreamNonBlocking)); return static_cast(retval); } void FreeStream(DGLContext ctx, DGLStreamHandle stream) { - CUDA_CALL(cudaSetDevice(ctx.device_id)); - cudaStream_t cu_stream = static_cast(stream); - CUDA_CALL(cudaStreamDestroy(cu_stream)); + CUDA_CALL(hipSetDevice(ctx.device_id)); + hipStream_t cu_stream = static_cast(stream); + CUDA_CALL(hipStreamDestroy(cu_stream)); } void SyncStreamFromTo( DGLContext ctx, DGLStreamHandle event_src, DGLStreamHandle event_dst) { - CUDA_CALL(cudaSetDevice(ctx.device_id)); - cudaStream_t src_stream = static_cast(event_src); - cudaStream_t dst_stream = static_cast(event_dst); - cudaEvent_t evt; - CUDA_CALL(cudaEventCreate(&evt)); - CUDA_CALL(cudaEventRecord(evt, src_stream)); - CUDA_CALL(cudaStreamWaitEvent(dst_stream, evt, 0)); - CUDA_CALL(cudaEventDestroy(evt)); + CUDA_CALL(hipSetDevice(ctx.device_id)); + hipStream_t src_stream = static_cast(event_src); + hipStream_t dst_stream = static_cast(event_dst); + hipEvent_t evt; + CUDA_CALL(hipEventCreate(&evt)); + CUDA_CALL(hipEventRecord(evt, src_stream)); + CUDA_CALL(hipStreamWaitEvent(dst_stream, evt, 0)); + CUDA_CALL(hipEventDestroy(evt)); } void StreamSync(DGLContext ctx, DGLStreamHandle stream) final { - CUDA_CALL(cudaSetDevice(ctx.device_id)); - CUDA_CALL(cudaStreamSynchronize(static_cast(stream))); + CUDA_CALL(hipSetDevice(ctx.device_id)); + CUDA_CALL(hipStreamSynchronize(static_cast(stream))); } /** NOTE: If the backend is PyTorch, we will use PyTorch's stream management, @@ -230,7 +230,7 @@ class CUDADeviceAPI final : public DeviceAPI { return static_cast(getCurrentCUDAStream()); } - /** NOTE: cudaHostRegister can be called from an arbitrary GPU device, + /** NOTE: hipHostRegister can be called from an arbitrary GPU device, * so we don't need to specify a ctx. * The pinned memory can be seen by all CUDA contexts, * not just the one that performed the allocation @@ -244,13 +244,13 @@ class CUDADeviceAPI final : public DeviceAPI { if (tensor_dispatcher->IsAvailable()) { tensor_dispatcher->CUDAHostAllocatorEmptyCache(); } - CUDA_CALL(cudaHostRegister(ptr, nbytes, cudaHostRegisterDefault)); + CUDA_CALL(hipHostRegister(ptr, nbytes, hipHostRegisterDefault)); return true; } void UnpinData(void* ptr) { if (ptr == nullptr) return; - CUDA_CALL(cudaHostUnregister(ptr)); + CUDA_CALL(hipHostUnregister(ptr)); } void* AllocPinnedDataSpace( @@ -276,33 +276,33 @@ class CUDADeviceAPI final : public DeviceAPI { // can't be a pinned tensor if CUDA context is unavailable. 
if (!is_available_) return false; - cudaPointerAttributes attr; - cudaError_t status = cudaPointerGetAttributes(&attr, ptr); + hipPointerAttribute_t attr; + hipError_t status = hipPointerGetAttributes(&attr, ptr); bool result = false; switch (status) { - case cudaErrorInvalidValue: + case hipErrorInvalidValue: // might be a normal CPU tensor in CUDA 10.2- - cudaGetLastError(); // clear error + hipGetLastError(); // clear error break; - case cudaSuccess: - result = (attr.type == cudaMemoryTypeHost); + case hipSuccess: + result = (attr.type == hipMemoryTypeHost); break; - case cudaErrorInitializationError: - case cudaErrorNoDevice: - case cudaErrorInsufficientDriver: - case cudaErrorInvalidDevice: + case hipErrorNotInitialized: + case hipErrorNoDevice: + case hipErrorInsufficientDriver: + case hipErrorInvalidDevice: // We don't want to fail in these particular cases since this function // can be called when users only want to run on CPU even if CUDA API is // enabled, or in a forked subprocess where CUDA context cannot be // initialized. So we just mark the CUDA context to unavailable and // return. is_available_ = false; - cudaGetLastError(); // clear error + hipGetLastError(); // clear error break; default: LOG(FATAL) << "error while determining memory status: " - << cudaGetErrorString(status); + << hipGetErrorString(status); break; } @@ -338,13 +338,13 @@ class CUDADeviceAPI final : public DeviceAPI { private: static void GPUCopy( - const void* from, void* to, size_t size, cudaMemcpyKind kind, - cudaStream_t stream) { - CUDA_CALL(cudaMemcpyAsync(to, from, size, kind, stream)); - if (stream == 0 && kind == cudaMemcpyDeviceToHost) { + const void* from, void* to, size_t size, hipMemcpyKind kind, + hipStream_t stream) { + CUDA_CALL(hipMemcpyAsync(to, from, size, kind, stream)); + if (stream == 0 && kind == hipMemcpyDeviceToHost) { // only wait for the copy, when it's on the default stream, and it's to // host memory - CUDA_CALL(cudaStreamSynchronize(stream)); + CUDA_CALL(hipStreamSynchronize(stream)); } } @@ -359,7 +359,7 @@ CUDAThreadEntry* CUDAThreadEntry::ThreadLocal() { return CUDAThreadStore::Get(); } -cudaStream_t getCurrentCUDAStream() { +hipStream_t getCurrentCUDAStream() { TensorDispatcher* tensor_dispatcher = TensorDispatcher::Global(); if (tensor_dispatcher->IsAvailable()) return tensor_dispatcher->CUDAGetCurrentStream(); diff --git a/src/runtime/cuda/cuda_device_api.cc.prehip b/src/runtime/cuda/cuda_device_api.cc.prehip new file mode 100644 index 000000000000..aa97d7a79d08 --- /dev/null +++ b/src/runtime/cuda/cuda_device_api.cc.prehip @@ -0,0 +1,377 @@ +/** + * Copyright (c) 2017-2022 by Contributors + * @file cuda_device_api.cc + * @brief GPU specific API + */ +#include +#include +#include +#include +#include + +#include "cuda_common.h" + +namespace dgl { +namespace runtime { + +class CUDADeviceAPI final : public DeviceAPI { + public: + CUDADeviceAPI() { + int count; + auto err = cudaGetDeviceCount(&count); + switch (err) { + case cudaSuccess: + break; + default: + count = 0; + cudaGetLastError(); + } + is_available_ = count > 0; + } + + bool IsAvailable() final { return is_available_; } + + void SetDevice(DGLContext ctx) final { + CUDA_CALL(cudaSetDevice(ctx.device_id)); + } + void GetAttr(DGLContext ctx, DeviceAttrKind kind, DGLRetValue* rv) final { + int value = 0; + switch (kind) { + case kExist: + value = + (cudaDeviceGetAttribute( + &value, cudaDevAttrMaxThreadsPerBlock, ctx.device_id) == + cudaSuccess); + break; + case kMaxThreadsPerBlock: { + 
CUDA_CALL(cudaDeviceGetAttribute( + &value, cudaDevAttrMaxThreadsPerBlock, ctx.device_id)); + break; + } + case kWarpSize: { + CUDA_CALL( + cudaDeviceGetAttribute(&value, cudaDevAttrWarpSize, ctx.device_id)); + break; + } + case kMaxSharedMemoryPerBlock: { + CUDA_CALL(cudaDeviceGetAttribute( + &value, cudaDevAttrMaxSharedMemoryPerBlock, ctx.device_id)); + break; + } + case kComputeVersion: { + std::ostringstream os; + CUDA_CALL(cudaDeviceGetAttribute( + &value, cudaDevAttrComputeCapabilityMajor, ctx.device_id)); + os << value << "."; + CUDA_CALL(cudaDeviceGetAttribute( + &value, cudaDevAttrComputeCapabilityMinor, ctx.device_id)); + os << value; + *rv = os.str(); + return; + } + case kDeviceName: { + cudaDeviceProp props; + CUDA_CALL(cudaGetDeviceProperties(&props, ctx.device_id)); + *rv = std::string(props.name); + return; + } + case kMaxClockRate: { + CUDA_CALL(cudaDeviceGetAttribute( + &value, cudaDevAttrClockRate, ctx.device_id)); + break; + } + case kMultiProcessorCount: { + CUDA_CALL(cudaDeviceGetAttribute( + &value, cudaDevAttrMultiProcessorCount, ctx.device_id)); + break; + } + case kMaxThreadDimensions: { + int dims[3]; + CUDA_CALL(cudaDeviceGetAttribute( + &dims[0], cudaDevAttrMaxBlockDimX, ctx.device_id)); + CUDA_CALL(cudaDeviceGetAttribute( + &dims[1], cudaDevAttrMaxBlockDimY, ctx.device_id)); + CUDA_CALL(cudaDeviceGetAttribute( + &dims[2], cudaDevAttrMaxBlockDimZ, ctx.device_id)); + + std::stringstream ss; // use json string to return multiple int values; + ss << "[" << dims[0] << ", " << dims[1] << ", " << dims[2] << "]"; + *rv = ss.str(); + return; + } + } + *rv = value; + } + void* AllocDataSpace( + DGLContext ctx, size_t nbytes, size_t alignment, + DGLDataType type_hint) final { + SetDevice(ctx); + // Redirect to PyTorch's allocator when available. 
+ TensorDispatcher* tensor_dispatcher = TensorDispatcher::Global(); + if (tensor_dispatcher->IsAvailable()) { + return tensor_dispatcher->CUDAAllocWorkspace( + nbytes, getCurrentCUDAStream()); + } + CHECK_EQ(256 % alignment, 0U) << "CUDA space is aligned at 256 bytes"; + void* ret; + CUDA_CALL(cudaMalloc(&ret, nbytes)); + return ret; + } + + void FreeDataSpace(DGLContext ctx, void* ptr) final { + SetDevice(ctx); + TensorDispatcher* tensor_dispatcher = TensorDispatcher::Global(); + if (tensor_dispatcher->IsAvailable()) { + return tensor_dispatcher->CUDAFreeWorkspace(ptr); + } + CUDA_CALL(cudaFree(ptr)); + } + + void CopyDataFromTo( + const void* from, size_t from_offset, void* to, size_t to_offset, + size_t size, DGLContext ctx_from, DGLContext ctx_to, + DGLDataType type_hint, DGLStreamHandle stream) { + cudaStream_t cu_stream = static_cast(stream); + from = static_cast(from) + from_offset; + to = static_cast(to) + to_offset; + if (ctx_from.device_type == kDGLCUDA && ctx_to.device_type == kDGLCUDA) { + CUDA_CALL(cudaSetDevice(ctx_from.device_id)); + if (ctx_from.device_id == ctx_to.device_id) { + GPUCopy(from, to, size, cudaMemcpyDeviceToDevice, cu_stream); + } else { + CUDA_CALL(cudaMemcpyPeerAsync( + to, ctx_to.device_id, from, ctx_from.device_id, size, cu_stream)); + } + } else if ( + ctx_from.device_type == kDGLCUDA && ctx_to.device_type == kDGLCPU) { + CUDA_CALL(cudaSetDevice(ctx_from.device_id)); + GPUCopy(from, to, size, cudaMemcpyDeviceToHost, cu_stream); + } else if ( + ctx_from.device_type == kDGLCPU && ctx_to.device_type == kDGLCUDA) { + CUDA_CALL(cudaSetDevice(ctx_to.device_id)); + GPUCopy(from, to, size, cudaMemcpyHostToDevice, cu_stream); + } else { + LOG(FATAL) << "expect copy from/to GPU or between GPU"; + } + } + + void CopyDataFromTo( + const void* from, size_t from_offset, void* to, size_t to_offset, + size_t size, DGLContext ctx_from, DGLContext ctx_to, + DGLDataType type_hint) final { + auto stream = GetStream(); + CopyDataFromTo( + from, from_offset, to, to_offset, size, ctx_from, ctx_to, type_hint, + stream); + } + + // To ensure correct behavior, `record_event` must be invoked anytime a + // pointer from PyTorch CachingHostAllocator is used in a cudaMemcpyAsync + // call. It provides a way to re-use freed pinned (page-locked) memory + // allocations and avoid device sync due to cudaFreeHost calls. + void RecordedCopyDataFromTo( + void* from, size_t from_offset, void* to, size_t to_offset, size_t size, + DGLContext ctx_from, DGLContext ctx_to, DGLDataType type_hint, + void* pytorch_ctx) final { + auto stream = GetStream(); + CopyDataFromTo( + from, from_offset, to, to_offset, size, ctx_from, ctx_to, type_hint, + stream); + auto tensor_dispatcher = TensorDispatcher::Global(); + if (tensor_dispatcher->IsAvailable()) { + auto custream = static_cast(stream); + void* ptr = ctx_to.device_type == kDGLCPU ? to : from; + int id = + ctx_to.device_type == kDGLCPU ? 
ctx_from.device_id : ctx_to.device_id; + tensor_dispatcher->CUDARecordHostAlloc(ptr, pytorch_ctx, custream, id); + } + } + + DGLStreamHandle CreateStream(DGLContext ctx) { + CUDA_CALL(cudaSetDevice(ctx.device_id)); + cudaStream_t retval; + // make sure the legacy default stream won't block on this stream + CUDA_CALL(cudaStreamCreateWithFlags(&retval, cudaStreamNonBlocking)); + return static_cast(retval); + } + + void FreeStream(DGLContext ctx, DGLStreamHandle stream) { + CUDA_CALL(cudaSetDevice(ctx.device_id)); + cudaStream_t cu_stream = static_cast(stream); + CUDA_CALL(cudaStreamDestroy(cu_stream)); + } + + void SyncStreamFromTo( + DGLContext ctx, DGLStreamHandle event_src, DGLStreamHandle event_dst) { + CUDA_CALL(cudaSetDevice(ctx.device_id)); + cudaStream_t src_stream = static_cast(event_src); + cudaStream_t dst_stream = static_cast(event_dst); + cudaEvent_t evt; + CUDA_CALL(cudaEventCreate(&evt)); + CUDA_CALL(cudaEventRecord(evt, src_stream)); + CUDA_CALL(cudaStreamWaitEvent(dst_stream, evt, 0)); + CUDA_CALL(cudaEventDestroy(evt)); + } + + void StreamSync(DGLContext ctx, DGLStreamHandle stream) final { + CUDA_CALL(cudaSetDevice(ctx.device_id)); + CUDA_CALL(cudaStreamSynchronize(static_cast(stream))); + } + + /** NOTE: If the backend is PyTorch, we will use PyTorch's stream management, + * so just avoid calling our SetStream/CreateStream unless + * you really need advanced stream control. + * TODO(Xin): Redirect this to PyTorch or remove it. + * PyTorch allows external CUDA streams to be set as current since v1.11. + */ + void SetStream(DGLContext ctx, DGLStreamHandle stream) final {} + + DGLStreamHandle GetStream() const final { + return static_cast(getCurrentCUDAStream()); + } + + /** NOTE: cudaHostRegister can be called from an arbitrary GPU device, + * so we don't need to specify a ctx. + * The pinned memory can be seen by all CUDA contexts, + * not just the one that performed the allocation + */ + bool PinData(void* ptr, size_t nbytes) override { + // prevent users from pinning empty tensors or graphs + if (ptr == nullptr || nbytes == 0) return false; + TensorDispatcher* tensor_dispatcher = TensorDispatcher::Global(); + // Minimize the pinned memory pool allocated by backend (via tensoradapter) + // to preserve enough memory for DGL inherited in-place pin-memory operation + if (tensor_dispatcher->IsAvailable()) { + tensor_dispatcher->CUDAHostAllocatorEmptyCache(); + } + CUDA_CALL(cudaHostRegister(ptr, nbytes, cudaHostRegisterDefault)); + return true; + } + + void UnpinData(void* ptr) { + if (ptr == nullptr) return; + CUDA_CALL(cudaHostUnregister(ptr)); + } + + void* AllocPinnedDataSpace( + size_t nbytes, void** ctx, void** deleter) override { + // prevent pinning empty tensors or graphs + if (nbytes == 0) return nullptr; + TensorDispatcher* tensor_dispatcher = TensorDispatcher::Global(); + CHECK(tensor_dispatcher->IsAvailable()) + << "CachingHostAllocator is not available in the current backend " + "PyTorch. Please update the PyTorch version to 1.11+"; + return tensor_dispatcher->CUDAAllocHostWorkspace(nbytes, ctx, deleter); + } + + void FreePinnedDataSpace(void** deleter) override { + TensorDispatcher* tensor_dispatcher = TensorDispatcher::Global(); + CHECK(tensor_dispatcher->IsAvailable()) + << "CachingHostAllocator is not available in the current backend " + "PyTorch. Please update the PyTorch version to 1.11+"; + tensor_dispatcher->CUDAFreeHostWorkspace(deleter); + } + + bool IsPinned(const void* ptr) override { + // can't be a pinned tensor if CUDA context is unavailable. 
+ if (!is_available_) return false; + + cudaPointerAttributes attr; + cudaError_t status = cudaPointerGetAttributes(&attr, ptr); + bool result = false; + + switch (status) { + case cudaErrorInvalidValue: + // might be a normal CPU tensor in CUDA 10.2- + cudaGetLastError(); // clear error + break; + case cudaSuccess: + result = (attr.type == cudaMemoryTypeHost); + break; + case cudaErrorInitializationError: + case cudaErrorNoDevice: + case cudaErrorInsufficientDriver: + case cudaErrorInvalidDevice: + // We don't want to fail in these particular cases since this function + // can be called when users only want to run on CPU even if CUDA API is + // enabled, or in a forked subprocess where CUDA context cannot be + // initialized. So we just mark the CUDA context to unavailable and + // return. + is_available_ = false; + cudaGetLastError(); // clear error + break; + default: + LOG(FATAL) << "error while determining memory status: " + << cudaGetErrorString(status); + break; + } + + return result; + } + + void* AllocWorkspace( + DGLContext ctx, size_t size, DGLDataType type_hint) final { + SetDevice(ctx); + // Redirect to PyTorch's allocator when available. + TensorDispatcher* tensor_dispatcher = TensorDispatcher::Global(); + if (tensor_dispatcher->IsAvailable()) + return tensor_dispatcher->CUDAAllocWorkspace( + size, getCurrentCUDAStream()); + + return CUDAThreadEntry::ThreadLocal()->pool.AllocWorkspace(ctx, size); + } + + void FreeWorkspace(DGLContext ctx, void* data) final { + SetDevice(ctx); + TensorDispatcher* tensor_dispatcher = TensorDispatcher::Global(); + if (tensor_dispatcher->IsAvailable()) + return tensor_dispatcher->CUDAFreeWorkspace(data); + + CUDAThreadEntry::ThreadLocal()->pool.FreeWorkspace(ctx, data); + } + + static const std::shared_ptr& Global() { + static std::shared_ptr inst = + std::make_shared(); + return inst; + } + + private: + static void GPUCopy( + const void* from, void* to, size_t size, cudaMemcpyKind kind, + cudaStream_t stream) { + CUDA_CALL(cudaMemcpyAsync(to, from, size, kind, stream)); + if (stream == 0 && kind == cudaMemcpyDeviceToHost) { + // only wait for the copy, when it's on the default stream, and it's to + // host memory + CUDA_CALL(cudaStreamSynchronize(stream)); + } + } + + bool is_available_ = true; +}; + +typedef dmlc::ThreadLocalStore CUDAThreadStore; + +CUDAThreadEntry::CUDAThreadEntry() : pool(kDGLCUDA, CUDADeviceAPI::Global()) {} + +CUDAThreadEntry* CUDAThreadEntry::ThreadLocal() { + return CUDAThreadStore::Get(); +} + +cudaStream_t getCurrentCUDAStream() { + TensorDispatcher* tensor_dispatcher = TensorDispatcher::Global(); + if (tensor_dispatcher->IsAvailable()) + return tensor_dispatcher->CUDAGetCurrentStream(); + else // return the default stream when TA is not available + return nullptr; +} + +DGL_REGISTER_GLOBAL("device_api.cuda") + .set_body([](DGLArgs args, DGLRetValue* rv) { + DeviceAPI* ptr = CUDADeviceAPI::Global().get(); + *rv = static_cast(ptr); + }); + +} // namespace runtime +} // namespace dgl diff --git a/src/runtime/cuda/cuda_hashtable.cu b/src/runtime/cuda/cuda_hashtable.cu index e24ead9ee336..cb8ccbb3a47a 100644 --- a/src/runtime/cuda/cuda_hashtable.cu +++ b/src/runtime/cuda/cuda_hashtable.cu @@ -1,3 +1,4 @@ +#include "hip/hip_runtime.h" /** * Copyright (c) 2021 by Contributors * @file runtime/cuda/cuda_device_common.cuh @@ -5,7 +6,7 @@ */ #include -#include // NOLINT +#include // NOLINT #include "../../array/cuda/atomic.cuh" #include "cuda_common.h" @@ -246,7 +247,7 @@ __global__ void count_hashmap( DeviceOrderedHashTable 
table, IdType* const num_unique) { assert(BLOCK_SIZE == blockDim.x); - using BlockReduce = typename cub::BlockReduce; + using BlockReduce = typename hipcub::BlockReduce; using Mapping = typename DeviceOrderedHashTable::Mapping; const size_t block_start = TILE_SIZE * blockIdx.x; @@ -300,7 +301,7 @@ __global__ void compact_hashmap( assert(BLOCK_SIZE == blockDim.x); using FlagType = uint16_t; - using BlockScan = typename cub::BlockScan; + using BlockScan = typename hipcub::BlockScan; using Mapping = typename DeviceOrderedHashTable::Mapping; constexpr const int32_t VALS_PER_THREAD = TILE_SIZE / BLOCK_SIZE; @@ -359,7 +360,7 @@ DeviceOrderedHashTable OrderedHashTable::DeviceHandle() const { template OrderedHashTable::OrderedHashTable( - const size_t size, DGLContext ctx, cudaStream_t stream, const int scale) + const size_t size, DGLContext ctx, hipStream_t stream, const int scale) : table_(nullptr), size_(TableSize(size, scale)), ctx_(ctx) { // make sure we will at least as many buckets as items. CHECK_GT(scale, 0); @@ -368,7 +369,7 @@ OrderedHashTable::OrderedHashTable( table_ = static_cast( device->AllocWorkspace(ctx_, sizeof(Mapping) * size_)); - CUDA_CALL(cudaMemsetAsync( + CUDA_CALL(hipMemsetAsync( table_, DeviceOrderedHashTable::kEmptyKey, sizeof(Mapping) * size_, stream)); } @@ -382,7 +383,7 @@ OrderedHashTable::~OrderedHashTable() { template void OrderedHashTable::FillWithDuplicates( const IdType* const input, const size_t num_input, IdType* const unique, - int64_t* const num_unique, cudaStream_t stream) { + int64_t* const num_unique, hipStream_t stream) { auto device = runtime::DeviceAPI::Get(ctx_); const int64_t num_tiles = (num_input + TILE_SIZE - 1) / TILE_SIZE; @@ -404,12 +405,12 @@ void OrderedHashTable::FillWithDuplicates( input, num_input, device_table, item_prefix); size_t workspace_bytes; - CUDA_CALL(cub::DeviceScan::ExclusiveSum( + CUDA_CALL(hipcub::DeviceScan::ExclusiveSum( nullptr, workspace_bytes, static_cast(nullptr), static_cast(nullptr), grid.x + 1, stream)); void* workspace = device->AllocWorkspace(ctx_, workspace_bytes); - CUDA_CALL(cub::DeviceScan::ExclusiveSum( + CUDA_CALL(hipcub::DeviceScan::ExclusiveSum( workspace, workspace_bytes, item_prefix, item_prefix, grid.x + 1, stream)); device->FreeWorkspace(ctx_, workspace); @@ -422,7 +423,7 @@ void OrderedHashTable::FillWithDuplicates( template void OrderedHashTable::FillWithUnique( - const IdType* const input, const size_t num_input, cudaStream_t stream) { + const IdType* const input, const size_t num_input, hipStream_t stream) { const int64_t num_tiles = (num_input + TILE_SIZE - 1) / TILE_SIZE; const dim3 grid(num_tiles); diff --git a/src/runtime/cuda/cuda_hashtable.cu.prehip b/src/runtime/cuda/cuda_hashtable.cu.prehip new file mode 100644 index 000000000000..e24ead9ee336 --- /dev/null +++ b/src/runtime/cuda/cuda_hashtable.cu.prehip @@ -0,0 +1,443 @@ +/** + * Copyright (c) 2021 by Contributors + * @file runtime/cuda/cuda_device_common.cuh + * @brief Device level functions for within cuda kernels. + */ + +#include +#include // NOLINT + +#include "../../array/cuda/atomic.cuh" +#include "cuda_common.h" +#include "cuda_hashtable.cuh" + +using namespace dgl::aten::cuda; + +namespace dgl { +namespace runtime { +namespace cuda { + +namespace { + +constexpr static const int BLOCK_SIZE = 256; +constexpr static const size_t TILE_SIZE = 1024; + +/** + * @brief This is the mutable version of the DeviceOrderedHashTable, for use in + * inserting elements into the hashtable. 
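The hunks above swap cub for hipcub while leaving the kernel structure untouched, which is exactly what a reviewer needs to spot-check. As a minimal, self-contained illustration that hipcub's block-level API is call-compatible (the kernel name block_sum and the int payload are placeholders of mine, not code from this patch):

#include <hipcub/hipcub.hpp>

// Launch with exactly BLOCK_SIZE threads per block; mirrors the
// TempStorage / Sum(...) shape that count_hashmap keeps after the rename.
template <int BLOCK_SIZE>
__global__ void block_sum(const int* in, int* out) {
  using BlockReduce = hipcub::BlockReduce<int, BLOCK_SIZE>;
  __shared__ typename BlockReduce::TempStorage temp_space;
  const int val = in[blockIdx.x * BLOCK_SIZE + threadIdx.x];
  const int sum = BlockReduce(temp_space).Sum(val);  // valid in thread 0 only
  if (threadIdx.x == 0) out[blockIdx.x] = sum;
}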
+ * + * @tparam IdType The type of ID to store in the hashtable. + */ +template +class MutableDeviceOrderedHashTable : public DeviceOrderedHashTable { + public: + typedef typename DeviceOrderedHashTable::Mapping* Iterator; + static constexpr IdType kEmptyKey = DeviceOrderedHashTable::kEmptyKey; + + /** + * @brief Create a new mutable hashtable for use on the device. + * + * @param hostTable The original hash table on the host. + */ + explicit MutableDeviceOrderedHashTable( + OrderedHashTable* const hostTable) + : DeviceOrderedHashTable(hostTable->DeviceHandle()) {} + + /** + * @brief Find the mutable mapping of a given key within the hash table. + * + * WARNING: The key must exist within the hashtable. Searching for a key not + * in the hashtable is undefined behavior. + * + * @param id The key to search for. + * + * @return The mapping. + */ + inline __device__ Iterator Search(const IdType id) { + const IdType pos = SearchForPosition(id); + + return GetMutable(pos); + } + + /** + * @brief Attempt to insert into the hash table at a specific location. + * + * @param pos The position to insert at. + * @param id The ID to insert into the hash table. + * @param index The original index of the item being inserted. + * + * @return True, if the insertion was successful. + */ + inline __device__ bool AttemptInsertAt( + const size_t pos, const IdType id, const size_t index) { + const IdType key = AtomicCAS(&GetMutable(pos)->key, kEmptyKey, id); + if (key == kEmptyKey || key == id) { + // we either set a match key, or found a matching key, so then place the + // minimum index in position. Match the type of atomicMin, so ignore + // linting + atomicMin( + reinterpret_cast( // NOLINT + &GetMutable(pos)->index), + static_cast(index)); // NOLINT + return true; + } else { + // we need to search elsewhere + return false; + } + } + + /** + * @brief Insert key-index pair into the hashtable. + * + * @param id The ID to insert. + * @param index The index at which the ID occured. + * + * @return An iterator to inserted mapping. + */ + inline __device__ Iterator Insert(const IdType id, const size_t index) { + size_t pos = Hash(id); + + // linearly scan for an empty slot or matching entry + IdType delta = 1; + while (!AttemptInsertAt(pos, id, index)) { + pos = Hash(pos + delta); + delta += 1; + } + + return GetMutable(pos); + } + + private: + /** + * @brief Get a mutable iterator to the given bucket in the hashtable. + * + * @param pos The given bucket. + * + * @return The iterator. + */ + inline __device__ Iterator GetMutable(const size_t pos) { + assert(pos < this->size_); + // The parent class Device is read-only, but we ensure this can only be + // constructed from a mutable version of OrderedHashTable, making this + // a safe cast to perform. + return const_cast(this->table_ + pos); + } +}; + +/** + * @brief Calculate the number of buckets in the hashtable. To guarantee we can + * fill the hashtable in the worst case, we must use a number of buckets which + * is a power of two. + * https://en.wikipedia.org/wiki/Quadratic_probing#Limitations + * + * @param num The number of items to insert (should be an upper bound on the + * number of unique keys). + * @param scale The power of two larger the number of buckets should be than the + * unique keys. + * + * @return The number of buckets the table should contain. 
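To make the sizing rule above concrete: TableSize(), defined right after this comment, rounds the item count to a nearby power of two and then shifts it left by scale. A standalone sketch of the same arithmetic (TableSizeSketch is my name for it, and like the original it assumes num >= 2); the default scale of 3 comes from kDefaultScale in cuda_hashtable.cuh later in this patch:

#include <cmath>
#include <cstddef>

// Same arithmetic as TableSize() below, spelled out so it can be checked by hand.
std::size_t TableSizeSketch(const std::size_t num, const int scale) {
  // Nearby power of two for num, then 2^scale times more buckets so that
  // quadratic probing always has free slots to land on.
  const std::size_t next_pow2 =
      std::size_t(1) << static_cast<std::size_t>(1 + std::log2(num >> 1));
  return next_pow2 << scale;
}
// Example: num = 1000, scale = 3
//   num >> 1 = 500, 1 + log2(500) ~= 9.97, truncated to 9
//   next_pow2 = 512, so the table gets 512 << 3 = 4096 buckets,
//   roughly 4x the number of inserted items.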
+ */ +size_t TableSize(const size_t num, const int scale) { + const size_t next_pow2 = 1 << static_cast(1 + std::log2(num >> 1)); + return next_pow2 << scale; +} + +/** + * @brief This structure is used with cub's block-level prefixscan in order to + * keep a running sum as items are iteratively processed. + * + * @tparam IdType The type to perform the prefixsum on. + */ +template +struct BlockPrefixCallbackOp { + IdType running_total_; + + __device__ BlockPrefixCallbackOp(const IdType running_total) + : running_total_(running_total) {} + + __device__ IdType operator()(const IdType block_aggregate) { + const IdType old_prefix = running_total_; + running_total_ += block_aggregate; + return old_prefix; + } +}; + +} // namespace + +/** + * @brief This generates a hash map where the keys are the global item numbers, + * and the values are indexes, and inputs may have duplciates. + * + * @tparam IdType The type of of id. + * @tparam BLOCK_SIZE The size of the thread block. + * @tparam TILE_SIZE The number of entries each thread block will process. + * @param items The items to insert. + * @param num_items The number of items to insert. + * @param table The hash table. + */ +template +__global__ void generate_hashmap_duplicates( + const IdType* const items, const int64_t num_items, + MutableDeviceOrderedHashTable table) { + assert(BLOCK_SIZE == blockDim.x); + + const size_t block_start = TILE_SIZE * blockIdx.x; + const size_t block_end = TILE_SIZE * (blockIdx.x + 1); + +#pragma unroll + for (size_t index = threadIdx.x + block_start; index < block_end; + index += BLOCK_SIZE) { + if (index < num_items) { + table.Insert(items[index], index); + } + } +} + +/** + * @brief This generates a hash map where the keys are the global item numbers, + * and the values are indexes, and all inputs are unique. + * + * @tparam IdType The type of of id. + * @tparam BLOCK_SIZE The size of the thread block. + * @tparam TILE_SIZE The number of entries each thread block will process. + * @param items The unique items to insert. + * @param num_items The number of items to insert. + * @param table The hash table. + */ +template +__global__ void generate_hashmap_unique( + const IdType* const items, const int64_t num_items, + MutableDeviceOrderedHashTable table) { + assert(BLOCK_SIZE == blockDim.x); + + using Iterator = typename MutableDeviceOrderedHashTable::Iterator; + + const size_t block_start = TILE_SIZE * blockIdx.x; + const size_t block_end = TILE_SIZE * (blockIdx.x + 1); + +#pragma unroll + for (size_t index = threadIdx.x + block_start; index < block_end; + index += BLOCK_SIZE) { + if (index < num_items) { + const Iterator pos = table.Insert(items[index], index); + + // since we are only inserting unique items, we know their local id + // will be equal to their index + pos->local = static_cast(index); + } + } +} + +/** + * @brief This counts the number of nodes inserted per thread block. + * + * @tparam IdType The type of of id. + * @tparam BLOCK_SIZE The size of the thread block. + * @tparam TILE_SIZE The number of entries each thread block will process. + * @param input The nodes to insert. + * @param num_input The number of nodes to insert. + * @param table The hash table. + * @param num_unique The number of nodes inserted into the hash table per thread + * block. 
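count_hashmap, documented just above and defined just below, only produces a per-block count of newly inserted keys (plus a trailing zero written by block 0). FillWithDuplicates then runs an exclusive prefix sum over those grid.x + 1 entries to turn counts into the write offsets that compact_hashmap consumes. A tiny host-side illustration of that step, with made-up counts:

#include <cstdio>
#include <vector>

int main() {
  // Per-block counts from count_hashmap, plus the trailing 0 at index gridDim.x.
  std::vector<int> item_prefix = {3, 1, 2, 0};
  // In-place exclusive prefix sum, as DeviceScan::ExclusiveSum does on the GPU.
  int running = 0;
  for (int& v : item_prefix) {
    const int count = v;
    v = running;
    running += count;
  }
  // item_prefix is now {0, 3, 4, 6}: entry i is the output offset for block i,
  // and the final entry (6) is the total number of unique keys, which
  // compact_hashmap copies into *num_unique_items.
  for (int v : item_prefix) printf("%d ", v);
  printf("\n");
  return 0;
}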
+ */ +template +__global__ void count_hashmap( + const IdType* items, const size_t num_items, + DeviceOrderedHashTable table, IdType* const num_unique) { + assert(BLOCK_SIZE == blockDim.x); + + using BlockReduce = typename cub::BlockReduce; + using Mapping = typename DeviceOrderedHashTable::Mapping; + + const size_t block_start = TILE_SIZE * blockIdx.x; + const size_t block_end = TILE_SIZE * (blockIdx.x + 1); + + IdType count = 0; + +#pragma unroll + for (size_t index = threadIdx.x + block_start; index < block_end; + index += BLOCK_SIZE) { + if (index < num_items) { + const Mapping& mapping = *table.Search(items[index]); + if (mapping.index == index) { + ++count; + } + } + } + + __shared__ typename BlockReduce::TempStorage temp_space; + + count = BlockReduce(temp_space).Sum(count); + + if (threadIdx.x == 0) { + num_unique[blockIdx.x] = count; + if (blockIdx.x == 0) { + num_unique[gridDim.x] = 0; + } + } +} + +/** + * @brief Update the local numbering of elements in the hashmap. + * + * @tparam IdType The type of id. + * @tparam BLOCK_SIZE The size of the thread blocks. + * @tparam TILE_SIZE The number of elements each thread block works on. + * @param items The set of non-unique items to update from. + * @param num_items The number of non-unique items. + * @param table The hash table. + * @param num_items_prefix The number of unique items preceding each thread + * block. + * @param unique_items The set of unique items (output). + * @param num_unique_items The number of unique items (output). + */ +template +__global__ void compact_hashmap( + const IdType* const items, const size_t num_items, + MutableDeviceOrderedHashTable table, + const IdType* const num_items_prefix, IdType* const unique_items, + int64_t* const num_unique_items) { + assert(BLOCK_SIZE == blockDim.x); + + using FlagType = uint16_t; + using BlockScan = typename cub::BlockScan; + using Mapping = typename DeviceOrderedHashTable::Mapping; + + constexpr const int32_t VALS_PER_THREAD = TILE_SIZE / BLOCK_SIZE; + + __shared__ typename BlockScan::TempStorage temp_space; + + const IdType offset = num_items_prefix[blockIdx.x]; + + BlockPrefixCallbackOp prefix_op(0); + + // count successful placements + for (int32_t i = 0; i < VALS_PER_THREAD; ++i) { + const IdType index = threadIdx.x + i * BLOCK_SIZE + blockIdx.x * TILE_SIZE; + + FlagType flag; + Mapping* kv; + if (index < num_items) { + kv = table.Search(items[index]); + flag = kv->index == index; + } else { + flag = 0; + } + + if (!flag) { + kv = nullptr; + } + + BlockScan(temp_space).ExclusiveSum(flag, flag, prefix_op); + __syncthreads(); + + if (kv) { + const IdType pos = offset + flag; + kv->local = pos; + unique_items[pos] = items[index]; + } + } + + if (threadIdx.x == 0 && blockIdx.x == 0) { + *num_unique_items = num_items_prefix[gridDim.x]; + } +} + +// DeviceOrderedHashTable implementation + +template +DeviceOrderedHashTable::DeviceOrderedHashTable( + const Mapping* const table, const size_t size) + : table_(table), size_(size) {} + +template +DeviceOrderedHashTable OrderedHashTable::DeviceHandle() const { + return DeviceOrderedHashTable(table_, size_); +} + +// OrderedHashTable implementation + +template +OrderedHashTable::OrderedHashTable( + const size_t size, DGLContext ctx, cudaStream_t stream, const int scale) + : table_(nullptr), size_(TableSize(size, scale)), ctx_(ctx) { + // make sure we will at least as many buckets as items. 
+ CHECK_GT(scale, 0); + + auto device = runtime::DeviceAPI::Get(ctx_); + table_ = static_cast( + device->AllocWorkspace(ctx_, sizeof(Mapping) * size_)); + + CUDA_CALL(cudaMemsetAsync( + table_, DeviceOrderedHashTable::kEmptyKey, + sizeof(Mapping) * size_, stream)); +} + +template +OrderedHashTable::~OrderedHashTable() { + auto device = runtime::DeviceAPI::Get(ctx_); + device->FreeWorkspace(ctx_, table_); +} + +template +void OrderedHashTable::FillWithDuplicates( + const IdType* const input, const size_t num_input, IdType* const unique, + int64_t* const num_unique, cudaStream_t stream) { + auto device = runtime::DeviceAPI::Get(ctx_); + + const int64_t num_tiles = (num_input + TILE_SIZE - 1) / TILE_SIZE; + + const dim3 grid(num_tiles); + const dim3 block(BLOCK_SIZE); + + auto device_table = MutableDeviceOrderedHashTable(this); + + CUDA_KERNEL_CALL( + (generate_hashmap_duplicates), grid, block, + 0, stream, input, num_input, device_table); + + IdType* item_prefix = static_cast( + device->AllocWorkspace(ctx_, sizeof(IdType) * (num_input + 1))); + + CUDA_KERNEL_CALL( + (count_hashmap), grid, block, 0, stream, + input, num_input, device_table, item_prefix); + + size_t workspace_bytes; + CUDA_CALL(cub::DeviceScan::ExclusiveSum( + nullptr, workspace_bytes, static_cast(nullptr), + static_cast(nullptr), grid.x + 1, stream)); + void* workspace = device->AllocWorkspace(ctx_, workspace_bytes); + + CUDA_CALL(cub::DeviceScan::ExclusiveSum( + workspace, workspace_bytes, item_prefix, item_prefix, grid.x + 1, + stream)); + device->FreeWorkspace(ctx_, workspace); + + CUDA_KERNEL_CALL( + (compact_hashmap), grid, block, 0, stream, + input, num_input, device_table, item_prefix, unique, num_unique); + device->FreeWorkspace(ctx_, item_prefix); +} + +template +void OrderedHashTable::FillWithUnique( + const IdType* const input, const size_t num_input, cudaStream_t stream) { + const int64_t num_tiles = (num_input + TILE_SIZE - 1) / TILE_SIZE; + + const dim3 grid(num_tiles); + const dim3 block(BLOCK_SIZE); + + auto device_table = MutableDeviceOrderedHashTable(this); + + CUDA_KERNEL_CALL( + (generate_hashmap_unique), grid, block, 0, + stream, input, num_input, device_table); +} + +template class OrderedHashTable; +template class OrderedHashTable; + +} // namespace cuda +} // namespace runtime +} // namespace dgl diff --git a/src/runtime/cuda/cuda_hashtable.cuh b/src/runtime/cuda/cuda_hashtable.cuh index 8a3c3a2c990f..cae772674fef 100644 --- a/src/runtime/cuda/cuda_hashtable.cuh +++ b/src/runtime/cuda/cuda_hashtable.cuh @@ -1,3 +1,4 @@ +#include "hip/hip_runtime.h" /** * Copyright (c) 2021 by Contributors * @file runtime/cuda/cuda_device_common.cuh @@ -10,7 +11,7 @@ #include #include "cuda_common.h" -#include "cuda_runtime.h" +#include "hip/hip_runtime.h" namespace dgl { namespace runtime { @@ -228,7 +229,7 @@ class OrderedHashTable { * @param stream The stream to use for initializing the hashtable. */ OrderedHashTable( - const size_t size, DGLContext ctx, cudaStream_t stream, + const size_t size, DGLContext ctx, hipStream_t stream, const int scale = kDefaultScale); /** @@ -252,7 +253,7 @@ class OrderedHashTable { */ void FillWithDuplicates( const IdType* const input, const size_t num_input, IdType* const unique, - int64_t* const num_unique, cudaStream_t stream); + int64_t* const num_unique, hipStream_t stream); /** * @brief Fill the hashtable with an array of unique keys. @@ -262,7 +263,7 @@ class OrderedHashTable { * @param stream The stream to perform operations on. 
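FillWithDuplicates above follows the usual two-call DeviceScan idiom: the first call passes a null temporary buffer purely to query workspace_bytes, the second performs the scan, and the hipified hunk keeps the identical shape with hipcub. A minimal standalone sketch of that idiom; it uses plain hipMalloc instead of DGL's workspace pool just to stay self-contained, and error checking is omitted:

#include <cstddef>
#include <hip/hip_runtime.h>
#include <hipcub/hipcub.hpp>

// Exclusive-sum n ints already resident on the device, in place.
void ExclusiveSumInPlace(int* d_data, int n, hipStream_t stream) {
  std::size_t workspace_bytes = 0;
  // Pass 1: null workspace only computes the required size.
  hipcub::DeviceScan::ExclusiveSum(
      nullptr, workspace_bytes, d_data, d_data, n, stream);
  void* d_workspace = nullptr;
  hipMalloc(&d_workspace, workspace_bytes);
  // Pass 2: the actual scan.
  hipcub::DeviceScan::ExclusiveSum(
      d_workspace, workspace_bytes, d_data, d_data, n, stream);
  hipFree(d_workspace);
}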
*/ void FillWithUnique( - const IdType* const input, const size_t num_input, cudaStream_t stream); + const IdType* const input, const size_t num_input, hipStream_t stream); /** * @brief Get a verison of the hashtable usable from device functions. diff --git a/src/runtime/cuda/cuda_hashtable.cuh.prehip b/src/runtime/cuda/cuda_hashtable.cuh.prehip new file mode 100644 index 000000000000..8a3c3a2c990f --- /dev/null +++ b/src/runtime/cuda/cuda_hashtable.cuh.prehip @@ -0,0 +1,284 @@ +/** + * Copyright (c) 2021 by Contributors + * @file runtime/cuda/cuda_device_common.cuh + * @brief Device level functions for within cuda kernels. + */ + +#ifndef DGL_RUNTIME_CUDA_CUDA_HASHTABLE_CUH_ +#define DGL_RUNTIME_CUDA_CUDA_HASHTABLE_CUH_ + +#include + +#include "cuda_common.h" +#include "cuda_runtime.h" + +namespace dgl { +namespace runtime { +namespace cuda { + +template +class OrderedHashTable; + +/** + * @brief A device-side handle for a GPU hashtable for mapping items to the + * first index at which they appear in the provided data array. + * + * For any ID array A, one can view it as a mapping from the index `i` + * (continuous integer range from zero) to its element `A[i]`. This hashtable + * serves as a reverse mapping, i.e., from element `A[i]` to its index `i`. + * Quadratic probing is used for collision resolution. See + * DeviceOrderedHashTable's documentation for how the Mapping structure is + * used. + * + * The hash table should be used in two phases, with the first being populating + * the hash table with the OrderedHashTable object, and then generating this + * handle from it. This object can then be used to search the hash table, + * to find mappings, from with CUDA code. + * + * If a device-side handle is created from a hash table with the following + * entries: + * [ + * {key: 0, local: 0, index: 0}, + * {key: 3, local: 1, index: 1}, + * {key: 2, local: 2, index: 2}, + * {key: 8, local: 3, index: 4}, + * {key: 4, local: 4, index: 5}, + * {key: 1, local: 5, index: 8} + * ] + * The array [0, 3, 2, 0, 8, 4, 3, 2, 1, 8] could have `Search()` called on + * each id, to be mapped via: + * ``` + * __global__ void map(int32_t * array, + * size_t size, + * DeviceOrderedHashTable table) { + * int idx = threadIdx.x + blockIdx.x*blockDim.x; + * if (idx < size) { + * array[idx] = table.Search(array[idx])->local; + * } + * } + * ``` + * to get the remaped array: + * [0, 1, 2, 0, 3, 4, 1, 2, 5, 3] + * + * @tparam IdType The type of the IDs. + */ +template +class DeviceOrderedHashTable { + public: + /** + * @brief An entry in the hashtable. + */ + struct Mapping { + /** + * @brief The ID of the item inserted. + */ + IdType key; + /** + * @brief The index of the item in the unique list. + */ + IdType local; + /** + * @brief The index of the item when inserted into the hashtable (e.g., + * the index within the array passed into FillWithDuplicates()). + */ + int64_t index; + }; + + typedef const Mapping* ConstIterator; + + DeviceOrderedHashTable(const DeviceOrderedHashTable& other) = default; + DeviceOrderedHashTable& operator=(const DeviceOrderedHashTable& other) = + default; + + /** + * @brief Find the non-mutable mapping of a given key within the hash table. + * + * WARNING: The key must exist within the hashtable. Searching for a key not + * in the hashtable is undefined behavior. + * + * @param id The key to search for. + * + * @return An iterator to the mapping. 
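Search() below and Insert() in the .cu file walk the buckets with an increasing step, pos = Hash(pos + delta) with delta growing by one each try, which is the quadratic probing scheme referenced earlier. A worked probe sequence for intuition, using a deliberately tiny table (the size of 8 and the key 5 are arbitrary choices of mine):

#include <cstdio>

int main() {
  const int size = 8;   // stands in for table_ size
  int pos = 5 % size;   // Hash(id) for id = 5
  int delta = 1;
  for (int step = 0; step < 5; ++step) {
    printf("%d ", pos);            // prints: 5 6 0 3 7
    pos = (pos + delta) % size;    // Hash(pos + delta)
    ++delta;
  }
  printf("\n");
  // The offsets from the home bucket are 0, 1, 3, 6, 10, ... (triangular
  // numbers), which is what makes the probing quadratic rather than linear.
  return 0;
}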
+ */ + inline __device__ ConstIterator Search(const IdType id) const { + const IdType pos = SearchForPosition(id); + + return &table_[pos]; + } + + /** + * @brief Check whether a key exists within the hashtable. + * + * @param id The key to check for. + * + * @return True if the key exists in the hashtable. + */ + inline __device__ bool Contains(const IdType id) const { + IdType pos = Hash(id); + + IdType delta = 1; + while (table_[pos].key != kEmptyKey) { + if (table_[pos].key == id) { + return true; + } + pos = Hash(pos + delta); + delta += 1; + } + return false; + } + + protected: + // Must be uniform bytes for memset to work + static constexpr IdType kEmptyKey = static_cast(-1); + + const Mapping* table_; + size_t size_; + + /** + * @brief Create a new device-side handle to the hash table. + * + * @param table The table stored in GPU memory. + * @param size The size of the table. + */ + explicit DeviceOrderedHashTable(const Mapping* table, size_t size); + + /** + * @brief Search for an item in the hash table which is known to exist. + * + * WARNING: If the ID searched for does not exist within the hashtable, this + * function will never return. + * + * @param id The ID of the item to search for. + * + * @return The the position of the item in the hashtable. + */ + inline __device__ IdType SearchForPosition(const IdType id) const { + IdType pos = Hash(id); + + // linearly scan for matching entry + IdType delta = 1; + while (table_[pos].key != id) { + assert(table_[pos].key != kEmptyKey); + pos = Hash(pos + delta); + delta += 1; + } + assert(pos < size_); + + return pos; + } + + /** + * @brief Hash an ID to a to a position in the hash table. + * + * @param id The ID to hash. + * + * @return The hash. + */ + inline __device__ size_t Hash(const IdType id) const { return id % size_; } + + friend class OrderedHashTable; +}; + +/** + * @brief A host-side handle for a GPU hashtable for mapping items to the + * first index at which they appear in the provided data array. This host-side + * handle is responsible for allocating and free the GPU memory of the + * hashtable. + * + * For any ID array A, one can view it as a mapping from the index `i` + * (continuous integer range from zero) to its element `A[i]`. This hashtable + * serves as a reverse mapping, i.e., from element `A[i]` to its index `i`. + * Quadratic probing is used for collision resolution. + * + * The hash table should be used in two phases, the first is filling the hash + * table via 'FillWithDuplicates()' or 'FillWithUnique()'. Then, the + * 'DeviceHandle()' method can be called, to get a version suitable for + * searching from device and kernel functions. + * + * If 'FillWithDuplicates()' was called with an array of: + * [0, 3, 2, 0, 8, 4, 3, 2, 1, 8] + * + * The resulting entries in the hash-table would be: + * [ + * {key: 0, local: 0, index: 0}, + * {key: 3, local: 1, index: 1}, + * {key: 2, local: 2, index: 2}, + * {key: 8, local: 3, index: 4}, + * {key: 4, local: 4, index: 5}, + * {key: 1, local: 5, index: 8} + * ] + * + * @tparam IdType The type of the IDs. + */ +template +class OrderedHashTable { + public: + static constexpr int kDefaultScale = 3; + + using Mapping = typename DeviceOrderedHashTable::Mapping; + + /** + * @brief Create a new ordered hash table. The amoutn of GPU memory + * consumed by the resulting hashtable is O(`size` * 2^`scale`). + * + * @param size The number of items to insert into the hashtable. + * @param ctx The device context to store the hashtable on. 
+ * @param scale The power of two times larger the number of buckets should + * be than the number of items. + * @param stream The stream to use for initializing the hashtable. + */ + OrderedHashTable( + const size_t size, DGLContext ctx, cudaStream_t stream, + const int scale = kDefaultScale); + + /** + * @brief Cleanup after the hashtable. + */ + ~OrderedHashTable(); + + // Disable copying + OrderedHashTable(const OrderedHashTable& other) = delete; + OrderedHashTable& operator=(const OrderedHashTable& other) = delete; + + /** + * @brief Fill the hashtable with the array containing possibly duplicate + * IDs. + * + * @param input The array of IDs to insert. + * @param num_input The number of IDs to insert. + * @param unique The list of unique IDs inserted. + * @param num_unique The number of unique IDs inserted. + * @param stream The stream to perform operations on. + */ + void FillWithDuplicates( + const IdType* const input, const size_t num_input, IdType* const unique, + int64_t* const num_unique, cudaStream_t stream); + + /** + * @brief Fill the hashtable with an array of unique keys. + * + * @param input The array of unique IDs. + * @param num_input The number of keys. + * @param stream The stream to perform operations on. + */ + void FillWithUnique( + const IdType* const input, const size_t num_input, cudaStream_t stream); + + /** + * @brief Get a verison of the hashtable usable from device functions. + * + * @return This hashtable. + */ + DeviceOrderedHashTable DeviceHandle() const; + + private: + Mapping* table_; + size_t size_; + DGLContext ctx_; +}; + +} // namespace cuda +} // namespace runtime +} // namespace dgl + +#endif // DGL_RUNTIME_CUDA_CUDA_HASHTABLE_CUH_ diff --git a/src/runtime/cuda/gpu_cache.cu b/src/runtime/cuda/gpu_cache.cu index e2b6767e2792..f0f4eb73a43b 100644 --- a/src/runtime/cuda/gpu_cache.cu +++ b/src/runtime/cuda/gpu_cache.cu @@ -20,7 +20,7 @@ #ifndef DGL_RUNTIME_CUDA_GPU_CACHE_H_ #define DGL_RUNTIME_CUDA_GPU_CACHE_H_ -#include +#include #include #include #include @@ -55,12 +55,12 @@ class GpuCache : public runtime::Object { : num_feats(num_feats), cache(std::make_unique( (num_items + bucket_size - 1) / bucket_size, num_feats)) { - CUDA_CALL(cudaGetDevice(&cuda_device)); + CUDA_CALL(hipGetDevice(&cuda_device)); } std::tuple Query(IdArray keys) { const auto &ctx = keys->ctx; - cudaStream_t stream = dgl::runtime::getCurrentCUDAStream(); + hipStream_t stream = dgl::runtime::getCurrentCUDAStream(); auto device = dgl::runtime::DeviceAPI::Get(ctx); CHECK_EQ(ctx.device_type, kDGLCUDA) << "The keys should be on a CUDA device"; @@ -94,7 +94,7 @@ class GpuCache : public runtime::Object { } void Replace(IdArray keys, NDArray values) { - cudaStream_t stream = dgl::runtime::getCurrentCUDAStream(); + hipStream_t stream = dgl::runtime::getCurrentCUDAStream(); CHECK_EQ(keys->ctx.device_type, kDGLCUDA) << "The keys should be on a CUDA device"; CHECK_EQ(keys->ctx.device_id, cuda_device) diff --git a/src/runtime/cuda/gpu_cache.cu.prehip b/src/runtime/cuda/gpu_cache.cu.prehip new file mode 100644 index 000000000000..e2b6767e2792 --- /dev/null +++ b/src/runtime/cuda/gpu_cache.cu.prehip @@ -0,0 +1,189 @@ +/*! + * Copyright (c) 2022 by Contributors + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + * + * \file gpu_cache.cu + * \brief Implementation of wrapper HugeCTR gpu_cache routines. + */ + +#ifndef DGL_RUNTIME_CUDA_GPU_CACHE_H_ +#define DGL_RUNTIME_CUDA_GPU_CACHE_H_ + +#include +#include +#include +#include +#include +#include +#include +#include + +#include + +#include "../../runtime/cuda/cuda_common.h" + +namespace dgl { +namespace runtime { +namespace cuda { + +template +class GpuCache : public runtime::Object { + constexpr static int set_associativity = 2; + constexpr static int WARP_SIZE = 32; + constexpr static int bucket_size = WARP_SIZE * set_associativity; + using gpu_cache_t = gpu_cache::gpu_cache< + key_t, uint64_t, std::numeric_limits::max(), set_associativity, + WARP_SIZE>; + + public: + static constexpr const char *_type_key = + sizeof(key_t) == 4 ? "cuda.GpuCache32" : "cuda.GpuCache64"; + DGL_DECLARE_OBJECT_TYPE_INFO(GpuCache, Object); + + GpuCache(size_t num_items, size_t num_feats) + : num_feats(num_feats), + cache(std::make_unique( + (num_items + bucket_size - 1) / bucket_size, num_feats)) { + CUDA_CALL(cudaGetDevice(&cuda_device)); + } + + std::tuple Query(IdArray keys) { + const auto &ctx = keys->ctx; + cudaStream_t stream = dgl::runtime::getCurrentCUDAStream(); + auto device = dgl::runtime::DeviceAPI::Get(ctx); + CHECK_EQ(ctx.device_type, kDGLCUDA) + << "The keys should be on a CUDA device"; + CHECK_EQ(ctx.device_id, cuda_device) + << "The keys should be on the correct CUDA device"; + CHECK_EQ(keys->ndim, 1) + << "The tensor of requested indices must be of dimension one."; + NDArray values = NDArray::Empty( + {keys->shape[0], (int64_t)num_feats}, DGLDataType{kDGLFloat, 32, 1}, + ctx); + IdArray missing_index = aten::NewIdArray(keys->shape[0], ctx, 64); + IdArray missing_keys = + aten::NewIdArray(keys->shape[0], ctx, sizeof(key_t) * 8); + size_t *missing_len = + static_cast(device->AllocWorkspace(ctx, sizeof(size_t))); + cache->Query( + static_cast(keys->data), keys->shape[0], + static_cast(values->data), + static_cast(missing_index->data), + static_cast(missing_keys->data), missing_len, stream); + size_t missing_len_host; + device->CopyDataFromTo( + missing_len, 0, &missing_len_host, 0, sizeof(missing_len_host), ctx, + DGLContext{kDGLCPU, 0}, keys->dtype); + device->FreeWorkspace(ctx, missing_len); + missing_index = missing_index.CreateView( + {(int64_t)missing_len_host}, missing_index->dtype); + missing_keys = + missing_keys.CreateView({(int64_t)missing_len_host}, keys->dtype); + return std::make_tuple(values, missing_index, missing_keys); + } + + void Replace(IdArray keys, NDArray values) { + cudaStream_t stream = dgl::runtime::getCurrentCUDAStream(); + CHECK_EQ(keys->ctx.device_type, kDGLCUDA) + << "The keys should be on a CUDA device"; + CHECK_EQ(keys->ctx.device_id, cuda_device) + << "The keys should be on the correct CUDA device"; + CHECK_EQ(values->ctx.device_type, kDGLCUDA) + << "The values should be on a CUDA device"; + CHECK_EQ(values->ctx.device_id, cuda_device) + << "The values should be on the correct CUDA device"; + CHECK_EQ(keys->shape[0], values->shape[0]) + << "First dimensions of keys and values must match"; + 
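Query() above hands back the cached rows together with the positions and keys of the misses, and Replace() below is how a caller backfills them. The round trip is roughly the following; this is a caller-side sketch reconstructed from the two signatures, not code in this patch, FetchFromHost and ScatterRows are placeholder helpers, and the DGL types plus <tuple> are assumed to be available:

// Placeholder helpers for storage that lives outside the cache (assumed).
NDArray FetchFromHost(IdArray missing_keys);
void ScatterRows(NDArray dst, IdArray row_index, NDArray src);

void LookupThroughCache(GpuCache<unsigned int>* cache, IdArray keys) {
  NDArray values;
  IdArray missing_index, missing_keys;
  std::tie(values, missing_index, missing_keys) = cache->Query(keys);
  // Rows listed in missing_index were not cached; fetch them by key ...
  NDArray fetched = FetchFromHost(missing_keys);
  // ... fill the corresponding rows of the Query output ...
  ScatterRows(values, missing_index, fetched);
  // ... and insert them so the next Query for these keys is a hit.
  cache->Replace(missing_keys, fetched);
}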
CHECK_EQ(values->shape[1], num_feats) << "Embedding dimension must match"; + cache->Replace( + static_cast(keys->data), keys->shape[0], + static_cast(values->data), stream); + } + + private: + size_t num_feats; + std::unique_ptr cache; + int cuda_device; +}; + +static_assert(sizeof(unsigned int) == 4); +DGL_DEFINE_OBJECT_REF(GpuCacheRef32, GpuCache); +// The cu file in HugeCTR gpu cache uses unsigned int and long long. +// Changing to int64_t results in a mismatch of template arguments. +static_assert(sizeof(long long) == 8); // NOLINT +DGL_DEFINE_OBJECT_REF(GpuCacheRef64, GpuCache); // NOLINT + +/* CAPI **********************************************************************/ + +using namespace dgl::runtime; + +DGL_REGISTER_GLOBAL("cuda._CAPI_DGLGpuCacheCreate") + .set_body([](DGLArgs args, DGLRetValue *rv) { + const size_t num_items = args[0]; + const size_t num_feats = args[1]; + const int num_bits = args[2]; + + if (num_bits == 32) + *rv = GpuCacheRef32( + std::make_shared>(num_items, num_feats)); + else + *rv = GpuCacheRef64(std::make_shared>( // NOLINT + num_items, num_feats)); + }); + +DGL_REGISTER_GLOBAL("cuda._CAPI_DGLGpuCacheQuery") + .set_body([](DGLArgs args, DGLRetValue *rv) { + IdArray keys = args[1]; + + List ret; + if (keys->dtype.bits == 32) { + GpuCacheRef32 cache = args[0]; + auto result = cache->Query(keys); + + ret.push_back(Value(MakeValue(std::get<0>(result)))); + ret.push_back(Value(MakeValue(std::get<1>(result)))); + ret.push_back(Value(MakeValue(std::get<2>(result)))); + } else { + GpuCacheRef64 cache = args[0]; + auto result = cache->Query(keys); + + ret.push_back(Value(MakeValue(std::get<0>(result)))); + ret.push_back(Value(MakeValue(std::get<1>(result)))); + ret.push_back(Value(MakeValue(std::get<2>(result)))); + } + + *rv = ret; + }); + +DGL_REGISTER_GLOBAL("cuda._CAPI_DGLGpuCacheReplace") + .set_body([](DGLArgs args, DGLRetValue *rv) { + IdArray keys = args[1]; + NDArray values = args[2]; + + if (keys->dtype.bits == 32) { + GpuCacheRef32 cache = args[0]; + cache->Replace(keys, values); + } else { + GpuCacheRef64 cache = args[0]; + cache->Replace(keys, values); + } + + *rv = List{}; + }); + +} // namespace cuda +} // namespace runtime +} // namespace dgl + +#endif diff --git a/src/runtime/ndarray.cc b/src/runtime/ndarray.cc index abbe3b1310f4..797dfeff9012 100644 --- a/src/runtime/ndarray.cc +++ b/src/runtime/ndarray.cc @@ -23,12 +23,12 @@ constexpr DGLDataType DGLDataTypeTraits::dtype; constexpr DGLDataType DGLDataTypeTraits::dtype; constexpr DGLDataType DGLDataTypeTraits::dtype; constexpr DGLDataType DGLDataTypeTraits::dtype; -#ifdef DGL_USE_CUDA +#ifdef DGL_USE_ROCM constexpr DGLDataType DGLDataTypeTraits<__half>::dtype; #if BF16_ENABLED -constexpr DGLDataType DGLDataTypeTraits<__nv_bfloat16>::dtype; +constexpr DGLDataType DGLDataTypeTraits<__hip_bfloat16>::dtype; #endif // BF16_ENABLED -#endif // DGL_USE_CUDA +#endif // DGL_USE_ROCM constexpr DGLDataType DGLDataTypeTraits::dtype; constexpr DGLDataType DGLDataTypeTraits::dtype; @@ -262,7 +262,7 @@ void NDArray::PinContainer(NDArray::Container* ptr) { void NDArray::UnpinContainer(NDArray::Container* ptr) { auto container_is_pinned = IsContainerPinned(ptr); // The tensor may be pinned outside of DGL via a different CUDA API, - // so we cannot unpin it with cudaHostUnregister. + // so we cannot unpin it with hipHostUnregister. CHECK(ptr->pinned_by_dgl_ || !container_is_pinned) << "Cannot unpin a tensor that is pinned outside of DGL."; // 1. 
not pinned, do nothing diff --git a/src/runtime/ndarray.cc.prehip b/src/runtime/ndarray.cc.prehip new file mode 100644 index 000000000000..abbe3b1310f4 --- /dev/null +++ b/src/runtime/ndarray.cc.prehip @@ -0,0 +1,505 @@ +/** + * Copyright (c) 2017-2022 by Contributors + * @file ndarray.cc + * @brief NDArray container infratructure. + */ +#include +#include +#include +#include +#include +#include +#include +#include + +#include "runtime_base.h" + +namespace dgl { + +constexpr DGLDataType DGLDataTypeTraits::dtype; +constexpr DGLDataType DGLDataTypeTraits::dtype; +constexpr DGLDataType DGLDataTypeTraits::dtype; +constexpr DGLDataType DGLDataTypeTraits::dtype; +constexpr DGLDataType DGLDataTypeTraits::dtype; +constexpr DGLDataType DGLDataTypeTraits::dtype; +constexpr DGLDataType DGLDataTypeTraits::dtype; +#ifdef DGL_USE_CUDA +constexpr DGLDataType DGLDataTypeTraits<__half>::dtype; +#if BF16_ENABLED +constexpr DGLDataType DGLDataTypeTraits<__nv_bfloat16>::dtype; +#endif // BF16_ENABLED +#endif // DGL_USE_CUDA +constexpr DGLDataType DGLDataTypeTraits::dtype; +constexpr DGLDataType DGLDataTypeTraits::dtype; + +namespace runtime { + +inline void VerifyDataType(DGLDataType dtype) { + CHECK_GE(dtype.lanes, 1); + if (dtype.code == kDGLFloat) { + CHECK_EQ(dtype.bits % 8, 0); + } else { + CHECK_EQ(dtype.bits % 8, 0); + } + CHECK_EQ(dtype.bits & (dtype.bits - 1), 0); +} + +inline size_t GetDataSize(const DGLArray& arr) { + size_t size = 1; + for (dgl_index_t i = 0; i < arr.ndim; ++i) { + size *= arr.shape[i]; + } + size *= (arr.dtype.bits * arr.dtype.lanes + 7) / 8; + return size; +} + +inline size_t GetDataAlignment(const DGLArray& arr) { + size_t align = (arr.dtype.bits / 8) * arr.dtype.lanes; + if (align < kAllocAlignment) return kAllocAlignment; + return align; +} + +void NDArray::Internal::DefaultDeleter(NDArray::Container* ptr) { + using dgl::runtime::NDArray; + if (ptr->manager_ctx != nullptr) { + static_cast(ptr->manager_ctx)->DecRef(); + } else if (ptr->mem) { + ptr->mem = nullptr; + } else if (ptr->dl_tensor.data != nullptr) { + // if the array is still pinned before freeing, unpin it. + if (ptr->pinned_by_dgl_) UnpinContainer(ptr); + if (ptr->pinned_by_pytorch_) { + DeviceAPI::Get(kDGLCUDA)->FreePinnedDataSpace( + &(ptr->pytorch_raw_deleter_)); + CHECK(ptr->pytorch_raw_deleter_ == nullptr); + ptr->pinned_by_pytorch_ = false; + ptr->pytorch_ctx_ = nullptr; + } else { + dgl::runtime::DeviceAPI::Get(ptr->dl_tensor.ctx) + ->FreeDataSpace(ptr->dl_tensor.ctx, ptr->dl_tensor.data); + } + } + delete ptr; +} + +NDArray NDArray::Internal::Create( + std::vector shape, DGLDataType dtype, DGLContext ctx) { + VerifyDataType(dtype); + // critical zone + NDArray::Container* data = new NDArray::Container(); + data->deleter = DefaultDeleter; + NDArray ret(data); + ret.data_ = data; + // RAII now in effect + // setup shape + data->shape_ = std::move(shape); + data->dl_tensor.shape = dmlc::BeginPtr(data->shape_); + data->dl_tensor.ndim = static_cast(data->shape_.size()); + // setup stride (this should be optional, but some framework + // does not support NULL stride and thus will crash the program). 
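+  // For example, a row-major tensor of shape {2, 3, 4} ends up with strides
+  // {12, 4, 1}: stride[ndim - 1] = 1 and stride[i] = shape[i + 1] * stride[i + 1],
+  // which is exactly what the resize-to-1 plus the loop below compute.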
+ data->stride_.resize(data->dl_tensor.ndim, 1); + for (int i = data->dl_tensor.ndim - 2; i >= 0; --i) { + data->stride_[i] = data->shape_[i + 1] * data->stride_[i + 1]; + } + data->dl_tensor.strides = dmlc::BeginPtr(data->stride_); + // setup dtype + data->dl_tensor.dtype = dtype; + // setup ctx + data->dl_tensor.ctx = ctx; + return ret; +} + +DGLArray* NDArray::Internal::MoveAsDGLArray(NDArray arr) { + DGLArray* tensor = reinterpret_cast(arr.data_); + CHECK(tensor == const_cast(arr.operator->())); + arr.data_ = nullptr; + return tensor; +} + +size_t NDArray::GetSize() const { return GetDataSize(data_->dl_tensor); } + +int64_t NDArray::NumElements() const { + if (data_->dl_tensor.ndim == 0) return 0; + int64_t size = 1; + for (int i = 0; i < data_->dl_tensor.ndim; ++i) { + size *= data_->dl_tensor.shape[i]; + } + return size; +} + +bool NDArray::IsContiguous() const { + CHECK(data_ != nullptr); + if (data_->dl_tensor.strides == nullptr) return true; + + // See https://github.com/dmlc/dgl/issues/2118 and PyTorch's + // compute_contiguous() implementation + int64_t z = 1; + for (int64_t i = data_->dl_tensor.ndim - 1; i >= 0; --i) { + if (data_->dl_tensor.shape[i] != 1) { + if (data_->dl_tensor.strides[i] == z) + z *= data_->dl_tensor.shape[i]; + else + return false; + } + } + return true; +} + +NDArray NDArray::CreateView( + std::vector shape, DGLDataType dtype, int64_t offset) { + CHECK(data_ != nullptr); + CHECK(IsContiguous()) << "Can only create view for compact tensor"; + NDArray ret = Internal::Create(shape, dtype, data_->dl_tensor.ctx); + ret.data_->dl_tensor.byte_offset = this->data_->dl_tensor.byte_offset; + size_t curr_size = GetDataSize(this->data_->dl_tensor); + size_t view_size = GetDataSize(ret.data_->dl_tensor); + CHECK_LE(view_size, curr_size) + << "Tries to create a view that has bigger memory than current one"; + // increase ref count + this->data_->IncRef(); + ret.data_->manager_ctx = this->data_; + ret.data_->dl_tensor.data = + static_cast(this->data_->dl_tensor.data) + offset; + return ret; +} + +NDArray NDArray::EmptyShared( + const std::string& name, std::vector shape, DGLDataType dtype, + DGLContext ctx, bool is_create) { + NDArray ret = Internal::Create(shape, dtype, ctx); + size_t size = GetDataSize(ret.data_->dl_tensor); + auto mem = std::make_shared(name); + if (is_create) { + ret.data_->dl_tensor.data = mem->CreateNew(size); + } else { + ret.data_->dl_tensor.data = mem->Open(size); + } + + ret.data_->mem = mem; + return ret; +} + +NDArray NDArray::Empty( + std::vector shape, DGLDataType dtype, DGLContext ctx) { + NDArray ret = Internal::Create(shape, dtype, ctx); + size_t size = GetDataSize(ret.data_->dl_tensor); + size_t alignment = GetDataAlignment(ret.data_->dl_tensor); + if (size > 0) + ret.data_->dl_tensor.data = DeviceAPI::Get(ret->ctx)->AllocDataSpace( + ret->ctx, size, alignment, ret->dtype); + return ret; +} + +void NDArray::CopyFromTo(DGLArray* from, DGLArray* to) { + size_t from_size = GetDataSize(*from); + size_t to_size = GetDataSize(*to); + CHECK_EQ(from_size, to_size) + << "DGLArrayCopyFromTo: The size must exactly match"; + + CHECK( + from->ctx.device_type == to->ctx.device_type || + from->ctx.device_type == kDGLCPU || to->ctx.device_type == kDGLCPU) + << "Can not copy across different ctx types directly"; + + // Use the context that is *not* a cpu context to get the correct device + // api manager. + DGLContext ctx = from->ctx.device_type != kDGLCPU ? 
from->ctx : to->ctx; + + // default: local current cuda stream + DeviceAPI::Get(ctx)->CopyDataFromTo( + from->data, static_cast(from->byte_offset), to->data, + static_cast(to->byte_offset), from_size, from->ctx, to->ctx, + from->dtype); +} + +void NDArray::RecordedCopyFromTo( + DGLArray* from, DGLArray* to, void* pytorch_ctx) { + size_t from_size = GetDataSize(*from); + size_t to_size = GetDataSize(*to); + CHECK_EQ(from_size, to_size) + << "DGLArrayCopyFromTo: The size must exactly match."; + + CHECK(from->ctx.device_type != to->ctx.device_type) + << "Recoding event is only called for the copy between CPU and GPU."; + + CHECK(from->ctx.device_type == kDGLCUDA || to->ctx.device_type == kDGLCUDA) + << "At least one CUDA ctx needs to be involved."; + + DeviceAPI::Get(kDGLCUDA)->RecordedCopyDataFromTo( + from->data, static_cast(from->byte_offset), to->data, + static_cast(to->byte_offset), from_size, from->ctx, to->ctx, + from->dtype, pytorch_ctx); +} + +NDArray NDArray::PinnedEmpty( + std::vector shape, DGLDataType dtype, DGLContext ctx) { + CHECK_EQ(ctx.device_type, kDGLCPU) << "Only NDArray on CPU can be pinned"; + NDArray ret = Internal::Create(shape, dtype, ctx); + size_t size = GetDataSize(ret.data_->dl_tensor); + if (size > 0) { + ret.data_->dl_tensor.data = DeviceAPI::Get(kDGLCUDA)->AllocPinnedDataSpace( + size, &(ret.data_->pytorch_ctx_), &(ret.data_->pytorch_raw_deleter_)); + CHECK( + ret.data_->pytorch_ctx_ != nullptr && + ret.data_->pytorch_raw_deleter_ != nullptr) + << "The allocation failed in PyTorch's CachingHostAllocator. " + << "The returned context pointer is " << ret.data_->pytorch_ctx_ + << " and the function deleter is " << ret.data_->pytorch_raw_deleter_; + ret.data_->pinned_by_pytorch_ = true; + } + return ret; +} + +void NDArray::PinContainer(NDArray::Container* ptr) { + if (IsContainerPinned(ptr)) return; + auto* tensor = &(ptr->dl_tensor); + CHECK_EQ(tensor->ctx.device_type, kDGLCPU) + << "Only NDArray on CPU can be pinned"; + ptr->pinned_by_dgl_ = + DeviceAPI::Get(kDGLCUDA)->PinData(tensor->data, GetDataSize(*tensor)); +} + +void NDArray::UnpinContainer(NDArray::Container* ptr) { + auto container_is_pinned = IsContainerPinned(ptr); + // The tensor may be pinned outside of DGL via a different CUDA API, + // so we cannot unpin it with cudaHostUnregister. + CHECK(ptr->pinned_by_dgl_ || !container_is_pinned) + << "Cannot unpin a tensor that is pinned outside of DGL."; + // 1. not pinned, do nothing + if (!container_is_pinned) return; + // 2. 
pinned by DGL, unpin it + DeviceAPI::Get(kDGLCUDA)->UnpinData(ptr->dl_tensor.data); + ptr->pinned_by_dgl_ = false; +} + +void NDArray::RecordStream(DGLArray* tensor, DGLStreamHandle stream) { + TensorDispatcher* tensor_dispatcher = TensorDispatcher::Global(); + CHECK(tensor_dispatcher->IsAvailable()) + << "RecordStream only works when TensorAdapter is available."; + CHECK_EQ(tensor->ctx.device_type, kDGLCUDA) + << "RecordStream only works with GPU tensors."; + + tensor_dispatcher->RecordStream(tensor->data, stream, tensor->ctx.device_id); +} + +template +NDArray NDArray::FromVector(const std::vector& vec, DGLContext ctx) { + const DGLDataType dtype = DGLDataTypeTraits::dtype; + int64_t size = static_cast(vec.size()); + NDArray ret = NDArray::Empty({size}, dtype, ctx); + DeviceAPI::Get(ctx)->CopyDataFromTo( + vec.data(), 0, static_cast(ret->data), 0, size * sizeof(T), + DGLContext{kDGLCPU, 0}, ctx, dtype); + return ret; +} + +NDArray NDArray::CreateFromRaw( + const std::vector& shape, DGLDataType dtype, DGLContext ctx, + void* raw, bool auto_free) { + NDArray ret = Internal::Create(shape, dtype, ctx); + ret.data_->dl_tensor.data = raw; + if (!auto_free) ret.data_->deleter = nullptr; + return ret; +} + +// export specializations +template NDArray NDArray::FromVector( + const std::vector&, DGLContext); +template NDArray NDArray::FromVector( + const std::vector&, DGLContext); +template NDArray NDArray::FromVector( + const std::vector&, DGLContext); +template NDArray NDArray::FromVector( + const std::vector&, DGLContext); +template NDArray NDArray::FromVector( + const std::vector&, DGLContext); +template NDArray NDArray::FromVector( + const std::vector&, DGLContext); + +template +std::vector NDArray::ToVector() const { + const DGLDataType dtype = DGLDataTypeTraits::dtype; + CHECK(data_->dl_tensor.ndim == 1) + << "ToVector() only supported for 1D arrays"; + CHECK(data_->dl_tensor.dtype == dtype) << "dtype mismatch"; + + int64_t size = data_->dl_tensor.shape[0]; + std::vector vec(size); + const DGLContext& ctx = data_->dl_tensor.ctx; + DeviceAPI::Get(ctx)->CopyDataFromTo( + static_cast(data_->dl_tensor.data), 0, vec.data(), 0, + size * sizeof(T), ctx, DGLContext{kDGLCPU, 0}, dtype); + return vec; +} + +template std::vector NDArray::ToVector() const; +template std::vector NDArray::ToVector() const; +template std::vector NDArray::ToVector() const; +template std::vector NDArray::ToVector() const; +template std::vector NDArray::ToVector() const; +template std::vector NDArray::ToVector() const; + +std::shared_ptr NDArray::GetSharedMem() const { + return this->data_->mem; +} + +bool NDArray::IsContainerPinned(NDArray::Container* ptr) { + if (ptr->pinned_by_dgl_ || ptr->pinned_by_pytorch_) return true; + auto* tensor = &(ptr->dl_tensor); + // Can only be pinned if on CPU... + if (tensor->ctx.device_type != kDGLCPU) return false; + // ... and CUDA device API is enabled, and the tensor is indeed in pinned + // memory. 
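+  // The `true` argument asks DeviceAPI::Get() not to fail when no GPU runtime
+  // is available; in that case it returns nullptr and the check below simply
+  // reports the tensor as not pinned.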
+ auto device = DeviceAPI::Get(kDGLCUDA, true); + return device && device->IsPinned(tensor->data); +} + +void NDArray::Save(dmlc::Stream* strm) const { + auto zc_strm = dynamic_cast(strm); + if (zc_strm) { + zc_strm->PushNDArray(*this); + return; + } + SaveDGLArray(strm, const_cast(operator->())); +} + +bool NDArray::Load(dmlc::Stream* strm) { + auto zc_strm = dynamic_cast(strm); + if (zc_strm) { + *this = zc_strm->PopNDArray(); + return true; + } + uint64_t header, reserved; + CHECK(strm->Read(&header)) << "Invalid DGLArray file format"; + CHECK(strm->Read(&reserved)) << "Invalid DGLArray file format"; + CHECK(header == kDGLNDArrayMagic) << "Invalid DGLArray file format"; + DGLContext ctx; + int ndim; + DGLDataType dtype; + CHECK(strm->Read(&ctx)) << "Invalid DGLArray file format"; + CHECK(strm->Read(&ndim)) << "Invalid DGLArray file format"; + CHECK(strm->Read(&dtype)) << "Invalid DGLArray file format"; + CHECK_EQ(ctx.device_type, kDGLCPU) + << "Invalid DGLArray context: can only save as CPU tensor"; + std::vector shape(ndim); + if (ndim != 0) { + CHECK(strm->ReadArray(&shape[0], ndim)) << "Invalid DGLArray file format"; + } + NDArray ret = NDArray::Empty(shape, dtype, ctx); + int64_t num_elems = 1; + int elem_bytes = (ret->dtype.bits + 7) / 8; + for (int i = 0; i < ret->ndim; ++i) { + num_elems *= ret->shape[i]; + } + int64_t data_byte_size; + CHECK(strm->Read(&data_byte_size)) << "Invalid DGLArray file format"; + CHECK(data_byte_size == num_elems * elem_bytes) + << "Invalid DGLArray file format"; + if (data_byte_size != 0) { + // strm->Read will return the total number of elements successfully read. + // Therefore if data_byte_size is zero, the CHECK below would fail. + CHECK(strm->Read(ret->data, data_byte_size)) + << "Invalid DGLArray file format"; + } + if (!DMLC_IO_NO_ENDIAN_SWAP) { + dmlc::ByteSwap(ret->data, elem_bytes, num_elems); + } + *this = ret; + return true; +} + +} // namespace runtime +} // namespace dgl + +using namespace dgl::runtime; + +int DGLArrayAlloc( + const dgl_index_t* shape, int ndim, int dtype_code, int dtype_bits, + int dtype_lanes, int device_type, int device_id, DGLArrayHandle* out) { + API_BEGIN(); + DGLDataType dtype; + dtype.code = static_cast(dtype_code); + dtype.bits = static_cast(dtype_bits); + dtype.lanes = static_cast(dtype_lanes); + DGLContext ctx; + ctx.device_type = static_cast(device_type); + ctx.device_id = device_id; + *out = NDArray::Internal::MoveAsDGLArray( + NDArray::Empty(std::vector(shape, shape + ndim), dtype, ctx)); + API_END(); +} + +int DGLArrayAllocSharedMem( + const char* mem_name, const dgl_index_t* shape, int ndim, int dtype_code, + int dtype_bits, int dtype_lanes, bool is_create, DGLArrayHandle* out) { + API_BEGIN(); + DGLDataType dtype; + dtype.code = static_cast(dtype_code); + dtype.bits = static_cast(dtype_bits); + dtype.lanes = static_cast(dtype_lanes); + std::vector shape_vec(shape, shape + ndim); + NDArray arr = NDArray::EmptyShared( + mem_name, shape_vec, dtype, DGLContext{kDGLCPU, 0}, is_create); + *out = NDArray::Internal::MoveAsDGLArray(arr); + API_END(); +} + +int DGLArrayFree(DGLArrayHandle handle) { + API_BEGIN(); + reinterpret_cast(handle)->DecRef(); + API_END(); +} + +int DGLArrayCopyFromTo(DGLArrayHandle from, DGLArrayHandle to) { + API_BEGIN(); + NDArray::CopyFromTo(from, to); + API_END(); +} + +int DGLArrayCopyFromBytes(DGLArrayHandle handle, void* data, size_t nbytes) { + API_BEGIN(); + DGLContext cpu_ctx; + cpu_ctx.device_type = kDGLCPU; + cpu_ctx.device_id = 0; + size_t arr_size = GetDataSize(*handle); + 
CHECK_EQ(arr_size, nbytes) << "DGLArrayCopyFromBytes: size mismatch"; + DeviceAPI::Get(handle->ctx) + ->CopyDataFromTo( + data, 0, handle->data, static_cast(handle->byte_offset), + nbytes, cpu_ctx, handle->ctx, handle->dtype); + API_END(); +} + +int DGLArrayCopyToBytes(DGLArrayHandle handle, void* data, size_t nbytes) { + API_BEGIN(); + DGLContext cpu_ctx; + cpu_ctx.device_type = kDGLCPU; + cpu_ctx.device_id = 0; + size_t arr_size = GetDataSize(*handle); + CHECK_EQ(arr_size, nbytes) << "DGLArrayCopyToBytes: size mismatch"; + DeviceAPI::Get(handle->ctx) + ->CopyDataFromTo( + handle->data, static_cast(handle->byte_offset), data, 0, + nbytes, handle->ctx, cpu_ctx, handle->dtype); + API_END(); +} + +int DGLArrayPinData(DGLArrayHandle handle, DGLContext ctx) { + API_BEGIN(); + auto* nd_container = reinterpret_cast(handle); + NDArray::PinContainer(nd_container); + API_END(); +} + +int DGLArrayUnpinData(DGLArrayHandle handle, DGLContext ctx) { + API_BEGIN(); + auto* nd_container = reinterpret_cast(handle); + NDArray::UnpinContainer(nd_container); + API_END(); +} + +int DGLArrayRecordStream(DGLArrayHandle handle, DGLStreamHandle stream) { + API_BEGIN(); + NDArray::RecordStream(handle, stream); + API_END(); +} diff --git a/tensoradapter/include/tensoradapter.h b/tensoradapter/include/tensoradapter.h index cf7341cac105..990043920ab8 100644 --- a/tensoradapter/include/tensoradapter.h +++ b/tensoradapter/include/tensoradapter.h @@ -1,3 +1,4 @@ +// !!! This is a file automatically generated by hipify!!! /** * Copyright (c) 2020-2022 by Contributors * @file tensoradapter.h @@ -10,9 +11,9 @@ #ifndef TENSORADAPTER_H_ #define TENSORADAPTER_H_ -#ifdef DGL_USE_CUDA -#include -#endif // DGL_USE_CUDA +#ifdef DGL_USE_ROCM +#include +#endif // DGL_USE_ROCM namespace tensoradapter { @@ -34,7 +35,7 @@ void* CPURawAlloc(size_t nbytes); */ void CPURawDelete(void* ptr); -#ifdef DGL_USE_CUDA +#ifdef DGL_USE_ROCM /** * @brief Allocate a piece of GPU memory via * PyTorch's THCCachingAllocator. @@ -43,7 +44,7 @@ void CPURawDelete(void* ptr); * @param stream The stream to be allocated on. * @return Pointer to the allocated memory. */ -void* CUDARawAlloc(size_t nbytes, cudaStream_t stream); +void* CUDARawAlloc(size_t nbytes, hipStream_t stream); /** * @brief Free the GPU memory. @@ -55,7 +56,7 @@ void CUDARawDelete(void* ptr); /** * @brief Get the current CUDA stream. */ -cudaStream_t CUDACurrentStream(); +hipStream_t CUDACurrentStream(); /** * @brief Let the caching allocator know which streams are using this tensor. @@ -64,7 +65,7 @@ cudaStream_t CUDACurrentStream(); * @param stream The stream that is using this tensor. * @param device_id Device of the tensor. */ -void RecordStream(void* ptr, cudaStream_t stream, int device_id); +void RecordStream(void* ptr, hipStream_t stream, int device_id); /** * @brief Allocate a piece of pinned CPU memory via @@ -98,14 +99,14 @@ void CUDARawHostDelete(void** raw_deleter); * @param device_id Device of the tensor. */ void CUDARecordHostAlloc( - void* data, void* ctx, cudaStream_t stream, int device_id); + void* data, void* ctx, hipStream_t stream, int device_id); /** * @brief Release cached pinned memory allocations via cudaHostFree. 
*/ void CUDAHostAllocatorEmptyCache(); -#endif // DGL_USE_CUDA +#endif // DGL_USE_ROCM } }; // namespace tensoradapter diff --git a/tensoradapter/include/tensoradapter.h.prehip b/tensoradapter/include/tensoradapter.h.prehip new file mode 100644 index 000000000000..cf7341cac105 --- /dev/null +++ b/tensoradapter/include/tensoradapter.h.prehip @@ -0,0 +1,113 @@ +/** + * Copyright (c) 2020-2022 by Contributors + * @file tensoradapter.h + * @brief Header file for functions exposed by the adapter library. + * + * Functions in this library must be exported with extern "C" so that DGL can + * locate them with dlsym(3) (or GetProcAddress on Windows). + */ + +#ifndef TENSORADAPTER_H_ +#define TENSORADAPTER_H_ + +#ifdef DGL_USE_CUDA +#include +#endif // DGL_USE_CUDA + +namespace tensoradapter { + +extern "C" { + +/** + * @brief Allocate a piece of CPU memory via + * PyTorch's CPUAllocator + * + * @param nbytes The size to be allocated. + * @return Pointer to the allocated memory. + */ +void* CPURawAlloc(size_t nbytes); + +/** + * @brief Free the CPU memory. + * + * @param ptr Pointer to the memory to be freed. + */ +void CPURawDelete(void* ptr); + +#ifdef DGL_USE_CUDA +/** + * @brief Allocate a piece of GPU memory via + * PyTorch's THCCachingAllocator. + * + * @param nbytes The size to be allocated. + * @param stream The stream to be allocated on. + * @return Pointer to the allocated memory. + */ +void* CUDARawAlloc(size_t nbytes, cudaStream_t stream); + +/** + * @brief Free the GPU memory. + * + * @param ptr Pointer to the memory to be freed. + */ +void CUDARawDelete(void* ptr); + +/** + * @brief Get the current CUDA stream. + */ +cudaStream_t CUDACurrentStream(); + +/** + * @brief Let the caching allocator know which streams are using this tensor. + * + * @param ptr Pointer of the tensor to be recorded. + * @param stream The stream that is using this tensor. + * @param device_id Device of the tensor. + */ +void RecordStream(void* ptr, cudaStream_t stream, int device_id); + +/** + * @brief Allocate a piece of pinned CPU memory via + * PyTorch's CachingHostAllocator. + * + * @param nbytes The size to be allocated. + * @param ctx Pointer to the PyTorch storage ctx ptr returned from the + * allocator. + * @param deleter Pointer to the delete function ptr returned from the + * allocator. + * @return Raw pointer to the allocated memory. + */ +void* CUDARawHostAlloc(size_t nbytes, void** ctx, void** raw_deleter); + +/** + * @brief 'Free' the pinned CPU memory via + * inserting the memory block back to the free list. + * + * @param deleter Pointer to the delete function ptr returned from the + * allocator. + */ +void CUDARawHostDelete(void** raw_deleter); + +/** + * @brief 'Record' a CUDA stream (usually from a copy kernel) for the pinned + * memory via PyTorch's CachingHostAllocator. + * + * @param data Pointer of the tensor to be recorded. + * @param ctx PyTorch storage ctx ptr returned from the allocator. + * @param stream The stream that currently consumes this tensor. + * @param device_id Device of the tensor. + */ +void CUDARecordHostAlloc( + void* data, void* ctx, cudaStream_t stream, int device_id); + +/** + * @brief Release cached pinned memory allocations via cudaHostFree. 
+ */ +void CUDAHostAllocatorEmptyCache(); + +#endif // DGL_USE_CUDA +} + +}; // namespace tensoradapter + +#endif // TENSORADAPTER_H_ diff --git a/tensoradapter/pytorch/torch.cpp b/tensoradapter/pytorch/torch.cpp index e02b02e46dda..55183595b1a7 100644 --- a/tensoradapter/pytorch/torch.cpp +++ b/tensoradapter/pytorch/torch.cpp @@ -1,3 +1,4 @@ +// !!! This is a file automatically generated by hipify!!! /** * Copyright (c) 2020-2022 by Contributors * @file torch/torch.cpp @@ -6,13 +7,13 @@ #include #include -#ifdef DGL_USE_CUDA -#include -#include -#include -#include -#include -#endif // DGL_USE_CUDA +#ifdef DGL_USE_ROCM +#include +#include +#include +#include +#include +#endif // DGL_USE_ROCM namespace tensoradapter { @@ -26,30 +27,30 @@ TA_EXPORTS void CPURawDelete(void* ptr) { c10::GetCPUAllocator()->raw_deallocate(ptr); } -#ifdef DGL_USE_CUDA -TA_EXPORTS void* CUDARawAlloc(size_t nbytes, cudaStream_t stream) { +#ifdef DGL_USE_ROCM +TA_EXPORTS void* CUDARawAlloc(size_t nbytes, hipStream_t stream) { at::globalContext().lazyInitCUDA(); - return c10::cuda::CUDACachingAllocator::raw_alloc_with_stream(nbytes, stream); + return c10::hip::HIPCachingAllocator::raw_alloc_with_stream(nbytes, stream); } TA_EXPORTS void CUDARawDelete(void* ptr) { - c10::cuda::CUDACachingAllocator::raw_delete(ptr); + c10::hip::HIPCachingAllocator::raw_delete(ptr); } -TA_EXPORTS cudaStream_t CUDACurrentStream() { - return at::cuda::getCurrentCUDAStream(); +TA_EXPORTS hipStream_t CUDACurrentStream() { + return at::hip::getCurrentHIPStreamMasqueradingAsCUDA(); } -TA_EXPORTS void RecordStream(void* ptr, cudaStream_t stream, int device_id) { +TA_EXPORTS void RecordStream(void* ptr, hipStream_t stream, int device_id) { c10::DataPtr data_ptr{ - ptr, ptr, c10::cuda::CUDACachingAllocator::get()->raw_deleter(), + ptr, ptr, c10::hip::HIPCachingAllocatorMasqueradingAsCUDA::get()->raw_deleter(), c10::Device(c10::DeviceType::CUDA, device_id)}; - c10::cuda::CUDACachingAllocator::recordStream( + c10::hip::HIPCachingAllocatorMasqueradingAsCUDA::recordStreamMasqueradingAsCUDA( data_ptr, - // getStreamFromExternal doesn't exist before PyTorch 1.10, just copy it + // getStreamFromExternalMasqueradingAsCUDA doesn't exist before PyTorch 1.10, just copy it // here - c10::cuda::CUDAStream( - c10::cuda::CUDAStream::UNCHECKED, + c10::hip::HIPStreamMasqueradingAsCUDA( + c10::hip::HIPStreamMasqueradingAsCUDA::UNCHECKED, c10::Stream( c10::Stream::UNSAFE, c10::Device(c10::DeviceType::CUDA, device_id), @@ -86,11 +87,11 @@ TA_EXPORTS void CUDARawHostDelete(void** raw_deleter) { } TA_EXPORTS void CUDARecordHostAlloc( - void* ptr, void* ctx, cudaStream_t stream, int device_id) { + void* ptr, void* ctx, hipStream_t stream, int device_id) { at::cuda::CachingHostAllocator_recordEvent( ptr, ctx, - c10::cuda::CUDAStream( - c10::cuda::CUDAStream::UNCHECKED, + c10::hip::HIPStreamMasqueradingAsCUDA( + c10::hip::HIPStreamMasqueradingAsCUDA::UNCHECKED, c10::Stream( c10::Stream::UNSAFE, c10::Device(c10::DeviceType::CUDA, device_id), @@ -100,7 +101,7 @@ TA_EXPORTS void CUDARecordHostAlloc( TA_EXPORTS void CUDAHostAllocatorEmptyCache() { at::cuda::CachingHostAllocator_emptyCache(); } -#endif // DGL_USE_CUDA +#endif // DGL_USE_ROCM }; }; // namespace tensoradapter diff --git a/tensoradapter/pytorch/torch.cpp.prehip b/tensoradapter/pytorch/torch.cpp.prehip new file mode 100644 index 000000000000..e02b02e46dda --- /dev/null +++ b/tensoradapter/pytorch/torch.cpp.prehip @@ -0,0 +1,106 @@ +/** + * Copyright (c) 2020-2022 by Contributors + * @file torch/torch.cpp + * 
@brief Implementation of PyTorch adapter library. + */ + +#include +#include +#ifdef DGL_USE_CUDA +#include +#include +#include +#include +#include +#endif // DGL_USE_CUDA + +namespace tensoradapter { + +extern "C" { + +TA_EXPORTS void* CPURawAlloc(size_t nbytes) { + return c10::GetCPUAllocator()->raw_allocate(nbytes); +} + +TA_EXPORTS void CPURawDelete(void* ptr) { + c10::GetCPUAllocator()->raw_deallocate(ptr); +} + +#ifdef DGL_USE_CUDA +TA_EXPORTS void* CUDARawAlloc(size_t nbytes, cudaStream_t stream) { + at::globalContext().lazyInitCUDA(); + return c10::cuda::CUDACachingAllocator::raw_alloc_with_stream(nbytes, stream); +} + +TA_EXPORTS void CUDARawDelete(void* ptr) { + c10::cuda::CUDACachingAllocator::raw_delete(ptr); +} + +TA_EXPORTS cudaStream_t CUDACurrentStream() { + return at::cuda::getCurrentCUDAStream(); +} + +TA_EXPORTS void RecordStream(void* ptr, cudaStream_t stream, int device_id) { + c10::DataPtr data_ptr{ + ptr, ptr, c10::cuda::CUDACachingAllocator::get()->raw_deleter(), + c10::Device(c10::DeviceType::CUDA, device_id)}; + c10::cuda::CUDACachingAllocator::recordStream( + data_ptr, + // getStreamFromExternal doesn't exist before PyTorch 1.10, just copy it + // here + c10::cuda::CUDAStream( + c10::cuda::CUDAStream::UNCHECKED, + c10::Stream( + c10::Stream::UNSAFE, + c10::Device(c10::DeviceType::CUDA, device_id), + reinterpret_cast(stream)))); + data_ptr.release_context(); +} + +class CUDAHostDeleter { + public: + explicit CUDAHostDeleter(std::unique_ptr ptr) + : ptr_(std::move(ptr)) {} + + private: + std::unique_ptr ptr_; +}; + +TA_EXPORTS void* CUDARawHostAlloc( + size_t nbytes, void** ctx, void** raw_deleter) { + auto data_ptr = at::cuda::getCachingHostAllocator()->allocate(nbytes); + auto raw = data_ptr.get(); + // Return the raw ctx ptr for recording event. + *ctx = data_ptr.get_context(); + + // Transfer ownership to raw_deleter. + auto* data_deleter = new CUDAHostDeleter(data_ptr.move_context()); + *raw_deleter = static_cast(data_deleter); + return raw; +} + +// Designated CUDAHostDeleter for CUDARawHostAlloc. 
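+// Illustrative call sequence from the DGL runtime side (see the declarations
+// in tensoradapter.h above; names are as documented there):
+//   void *ctx, *deleter;
+//   void *p = CUDARawHostAlloc(nbytes, &ctx, &deleter);  // pinned host buffer
+//   CUDARecordHostAlloc(p, ctx, stream, device_id);      // record consuming stream
+//   CUDARawHostDelete(&deleter);                         // return block to the pool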
+TA_EXPORTS void CUDARawHostDelete(void** raw_deleter) { + delete static_cast(*raw_deleter); + *raw_deleter = nullptr; +} + +TA_EXPORTS void CUDARecordHostAlloc( + void* ptr, void* ctx, cudaStream_t stream, int device_id) { + at::cuda::CachingHostAllocator_recordEvent( + ptr, ctx, + c10::cuda::CUDAStream( + c10::cuda::CUDAStream::UNCHECKED, + c10::Stream( + c10::Stream::UNSAFE, + c10::Device(c10::DeviceType::CUDA, device_id), + reinterpret_cast(stream)))); +} + +TA_EXPORTS void CUDAHostAllocatorEmptyCache() { + at::cuda::CachingHostAllocator_emptyCache(); +} +#endif // DGL_USE_CUDA +}; + +}; // namespace tensoradapter diff --git a/tests/cpp/common.h b/tests/cpp/common.h index b1e871f565b4..d0fdede622bf 100644 --- a/tests/cpp/common.h +++ b/tests/cpp/common.h @@ -5,7 +5,7 @@ static constexpr DGLContext CTX = DGLContext{kDGLCPU, 0}; static constexpr DGLContext CPU = DGLContext{kDGLCPU, 0}; -#ifdef DGL_USE_CUDA +#ifdef DGL_USE_ROCM static constexpr DGLContext GPU = DGLContext{kDGLCUDA, 0}; #endif diff --git a/tests/cpp/common.h.prehip b/tests/cpp/common.h.prehip new file mode 100644 index 000000000000..b1e871f565b4 --- /dev/null +++ b/tests/cpp/common.h.prehip @@ -0,0 +1,56 @@ +#ifndef TEST_COMMON_H_ +#define TEST_COMMON_H_ + +#include + +static constexpr DGLContext CTX = DGLContext{kDGLCPU, 0}; +static constexpr DGLContext CPU = DGLContext{kDGLCPU, 0}; +#ifdef DGL_USE_CUDA +static constexpr DGLContext GPU = DGLContext{kDGLCUDA, 0}; +#endif + +template +inline T* Ptr(dgl::runtime::NDArray nd) { + return static_cast(nd->data); +} + +inline int64_t* PI64(dgl::runtime::NDArray nd) { + return static_cast(nd->data); +} + +inline int32_t* PI32(dgl::runtime::NDArray nd) { + return static_cast(nd->data); +} + +inline int64_t Len(dgl::runtime::NDArray nd) { return nd->shape[0]; } + +template +inline bool ArrayEQ(dgl::runtime::NDArray a1, dgl::runtime::NDArray a2) { + if (a1->ndim != a2->ndim) return false; + if (a1->dtype != a2->dtype) return false; + if (a1->ctx != a2->ctx) return false; + if (a1.NumElements() != a2.NumElements()) return false; + if (a1.NumElements() == 0) return true; + int64_t num = 1; + for (int i = 0; i < a1->ndim; ++i) { + if (a1->shape[i] != a2->shape[i]) return false; + num *= a1->shape[i]; + } + a1 = a1.CopyTo(CPU); + a2 = a2.CopyTo(CPU); + for (int64_t i = 0; i < num; ++i) + if (static_cast(a1->data)[i] != static_cast(a2->data)[i]) + return false; + return true; +} + +template +inline bool IsInArray(dgl::runtime::NDArray a, T x) { + if (!a.defined() || a->shape[0] == 0) return false; + for (int64_t i = 0; i < a->shape[0]; ++i) { + if (x == static_cast(a->data)[i]) return true; + } + return false; +} + +#endif // TEST_COMMON_H_ diff --git a/tests/cpp/test_aten.cc b/tests/cpp/test_aten.cc index 7edca973a90d..57b93c52942b 100644 --- a/tests/cpp/test_aten.cc +++ b/tests/cpp/test_aten.cc @@ -38,7 +38,7 @@ void _TestRange(DGLContext ctx) { TEST(ArrayTest, TestRange) { _TestRange(CPU); -#ifdef DGL_USE_CUDA +#ifdef DGL_USE_ROCM _TestRange(GPU); #endif }; @@ -78,7 +78,7 @@ void _TestNumBits(DGLContext ctx) { TEST(ArrayTest, TestAsNumBits) { _TestNumBits(CPU); -#ifdef DGL_USE_CUDA +#ifdef DGL_USE_ROCM _TestNumBits(GPU); #endif }; @@ -170,7 +170,7 @@ void _TestArith(DGLContext ctx) { TEST(ArrayTest, Arith) { _TestArith(CPU); _TestArith(CPU); -#ifdef DGL_USE_CUDA +#ifdef DGL_USE_ROCM _TestArith(GPU); _TestArith(GPU); #endif @@ -189,7 +189,7 @@ void _TestHStack(DGLContext ctx) { TEST(ArrayTest, HStack) { _TestHStack(CPU); _TestHStack(CPU); -#ifdef DGL_USE_CUDA +#ifdef DGL_USE_ROCM 
_TestHStack(GPU); _TestHStack(GPU); #endif @@ -210,7 +210,7 @@ void _TestIndexSelect(DGLContext ctx) { TEST(ArrayTest, TestIndexSelect) { _TestIndexSelect(CPU); _TestIndexSelect(CPU); -#ifdef DGL_USE_CUDA +#ifdef DGL_USE_ROCM _TestIndexSelect(GPU); _TestIndexSelect(GPU); #endif @@ -239,7 +239,7 @@ void _TestRelabel_(DGLContext ctx) { TEST(ArrayTest, TestRelabel_) { _TestRelabel_(CPU); _TestRelabel_(CPU); -#ifdef DGL_USE_CUDA +#ifdef DGL_USE_ROCM _TestRelabel_(GPU); _TestRelabel_(GPU); #endif @@ -556,7 +556,7 @@ void _TestDisjointUnionPartitionCoo(DGLContext ctx) { TEST(DisjointUnionTest, TestDisjointUnionPartitionCoo) { _TestDisjointUnionPartitionCoo(CPU); _TestDisjointUnionPartitionCoo(CPU); -#ifdef DGL_USE_CUDA +#ifdef DGL_USE_ROCM _TestDisjointUnionPartitionCoo(GPU); _TestDisjointUnionPartitionCoo(GPU); #endif @@ -682,7 +682,7 @@ void _TestDisjointUnionPartitionCsr(DGLContext ctx) { TEST(DisjointUnionTest, TestDisjointUnionPartitionCsr) { _TestDisjointUnionPartitionCsr(CPU); _TestDisjointUnionPartitionCsr(CPU); -#ifdef DGL_USE_CUDA +#ifdef DGL_USE_ROCM _TestDisjointUnionPartitionCsr(GPU); _TestDisjointUnionPartitionCsr(GPU); #endif @@ -750,7 +750,7 @@ void _TestSliceContiguousChunkCoo(DGLContext ctx) { TEST(SliceContiguousChunk, TestSliceContiguousChunkCoo) { _TestSliceContiguousChunkCoo(CPU); _TestSliceContiguousChunkCoo(CPU); -#ifdef DGL_USE_CUDA +#ifdef DGL_USE_ROCM _TestSliceContiguousChunkCoo(GPU); _TestSliceContiguousChunkCoo(GPU); #endif @@ -817,7 +817,7 @@ void _TestSliceContiguousChunkCsr(DGLContext ctx) { TEST(SliceContiguousChunk, TestSliceContiguousChunkCsr) { _TestSliceContiguousChunkCsr(CPU); _TestSliceContiguousChunkCsr(CPU); -#ifdef DGL_USE_CUDA +#ifdef DGL_USE_ROCM _TestSliceContiguousChunkCsr(GPU); _TestSliceContiguousChunkCsr(GPU); #endif @@ -1224,7 +1224,7 @@ void _TestCumSum(DGLContext ctx) { TEST(ArrayTest, CumSum) { _TestCumSum(CPU); _TestCumSum(CPU); -#ifdef DGL_USE_CUDA +#ifdef DGL_USE_ROCM _TestCumSum(GPU); _TestCumSum(GPU); #endif @@ -1249,7 +1249,7 @@ TEST(ArrayTest, Scatter_) { _TestScatter_(CPU); _TestScatter_(CPU); _TestScatter_(CPU); -#ifdef DGL_USE_CUDA +#ifdef DGL_USE_ROCM _TestScatter_(GPU); _TestScatter_(GPU); _TestScatter_(GPU); @@ -1285,7 +1285,7 @@ void _TestNonZero(DGLContext ctx) { TEST(ArrayTest, NonZero) { _TestNonZero(CPU); _TestNonZero(CPU); -#ifdef DGL_USE_CUDA +#ifdef DGL_USE_ROCM _TestNonZero(GPU); _TestNonZero(GPU); #endif @@ -1421,7 +1421,7 @@ void _TestSort(DGLContext ctx) { TEST(ArrayTest, Sort) { _TestSort(CPU); _TestSort(CPU); -#ifdef DGL_USE_CUDA +#ifdef DGL_USE_ROCM _TestSort(GPU); _TestSort(GPU); #endif diff --git a/tests/cpp/test_aten.cc.prehip b/tests/cpp/test_aten.cc.prehip new file mode 100644 index 000000000000..7edca973a90d --- /dev/null +++ b/tests/cpp/test_aten.cc.prehip @@ -0,0 +1,1437 @@ +#include +#include + +#include "./common.h" + +using namespace dgl; +using namespace dgl::runtime; + +TEST(ArrayTest, TestCreate) { + IdArray a = aten::NewIdArray(100, CTX, 32); + ASSERT_EQ(a->dtype.bits, 32); + ASSERT_EQ(a->shape[0], 100); + + a = aten::NewIdArray(0); + ASSERT_EQ(a->shape[0], 0); + + std::vector vec = {2, 94, 232, 30}; + a = aten::VecToIdArray(vec, 32); + ASSERT_EQ(Len(a), vec.size()); + ASSERT_EQ(a->dtype.bits, 32); + for (int i = 0; i < Len(a); ++i) { + ASSERT_EQ(Ptr(a)[i], vec[i]); + } + + a = aten::VecToIdArray(std::vector()); + ASSERT_EQ(Len(a), 0); +}; + +void _TestRange(DGLContext ctx) { + IdArray a = aten::Range(10, 10, 64, ctx); + ASSERT_EQ(Len(a), 0); + a = aten::Range(10, 20, 32, ctx); + ASSERT_EQ(Len(a), 
10); + ASSERT_EQ(a->dtype.bits, 32); + a = a.CopyTo(CPU); + for (int i = 0; i < 10; ++i) ASSERT_EQ(Ptr(a)[i], i + 10); +} + +TEST(ArrayTest, TestRange) { + _TestRange(CPU); +#ifdef DGL_USE_CUDA + _TestRange(GPU); +#endif +}; + +TEST(ArrayTest, TestFull) { + IdArray a = aten::Full(-100, 0, 32, CTX); + ASSERT_EQ(Len(a), 0); + a = aten::Full(-100, 13, 64, CTX); + ASSERT_EQ(Len(a), 13); + ASSERT_EQ(a->dtype.bits, 64); + for (int i = 0; i < 13; ++i) ASSERT_EQ(Ptr(a)[i], -100); +}; + +TEST(ArrayTest, TestClone) { + IdArray a = aten::NewIdArray(0); + IdArray b = aten::Clone(a); + ASSERT_EQ(Len(b), 0); + + a = aten::Range(0, 10, 32, CTX); + b = aten::Clone(a); + for (int i = 0; i < 10; ++i) { + ASSERT_EQ(PI32(b)[i], i); + } + PI32(b)[0] = -1; + for (int i = 0; i < 10; ++i) { + ASSERT_EQ(PI32(a)[i], i); + } +}; + +void _TestNumBits(DGLContext ctx) { + IdArray a = aten::Range(0, 10, 32, ctx); + a = aten::AsNumBits(a, 64); + ASSERT_EQ(a->dtype.bits, 64); + a = a.CopyTo(CPU); + for (int i = 0; i < 10; ++i) ASSERT_EQ(PI64(a)[i], i); +} + +TEST(ArrayTest, TestAsNumBits) { + _TestNumBits(CPU); +#ifdef DGL_USE_CUDA + _TestNumBits(GPU); +#endif +}; + +template +void _TestArith(DGLContext ctx) { + const int N = 100; + IdArray a = aten::Full(-10, N, sizeof(IDX) * 8, ctx); + IdArray b = aten::Full(7, N, sizeof(IDX) * 8, ctx); + + IdArray c = a + b; + c = c.CopyTo(CPU); + for (int i = 0; i < N; ++i) ASSERT_EQ(Ptr(c)[i], -3); + c = a - b; + c = c.CopyTo(CPU); + for (int i = 0; i < N; ++i) ASSERT_EQ(Ptr(c)[i], -17); + c = a * b; + c = c.CopyTo(CPU); + for (int i = 0; i < N; ++i) ASSERT_EQ(Ptr(c)[i], -70); + c = a / b; + c = c.CopyTo(CPU); + for (int i = 0; i < N; ++i) ASSERT_EQ(Ptr(c)[i], -1); + c = -a; + c = c.CopyTo(CPU); + for (int i = 0; i < N; ++i) ASSERT_EQ(Ptr(c)[i], 10); + c = (-a) % b; + c = c.CopyTo(CPU); + for (int i = 0; i < N; ++i) ASSERT_EQ(Ptr(c)[i], 3); + + const int val = -3; + c = aten::Add(a, val); + c = c.CopyTo(CPU); + for (int i = 0; i < N; ++i) ASSERT_EQ(Ptr(c)[i], -13); + c = aten::Sub(a, val); + c = c.CopyTo(CPU); + for (int i = 0; i < N; ++i) ASSERT_EQ(Ptr(c)[i], -7); + c = aten::Mul(a, val); + c = c.CopyTo(CPU); + for (int i = 0; i < N; ++i) ASSERT_EQ(Ptr(c)[i], 30); + c = aten::Div(a, val); + c = c.CopyTo(CPU); + for (int i = 0; i < N; ++i) ASSERT_EQ(Ptr(c)[i], 3); + c = b % 3; + c = c.CopyTo(CPU); + for (int i = 0; i < N; ++i) ASSERT_EQ(Ptr(c)[i], 1); + + c = aten::Add(val, b); + c = c.CopyTo(CPU); + for (int i = 0; i < N; ++i) ASSERT_EQ(Ptr(c)[i], 4); + c = aten::Sub(val, b); + c = c.CopyTo(CPU); + for (int i = 0; i < N; ++i) ASSERT_EQ(Ptr(c)[i], -10); + c = aten::Mul(val, b); + c = c.CopyTo(CPU); + for (int i = 0; i < N; ++i) ASSERT_EQ(Ptr(c)[i], -21); + c = aten::Div(val, b); + c = c.CopyTo(CPU); + for (int i = 0; i < N; ++i) ASSERT_EQ(Ptr(c)[i], 0); + c = 3 % b; + c = c.CopyTo(CPU); + for (int i = 0; i < N; ++i) ASSERT_EQ(Ptr(c)[i], 3); + + a = aten::Range(0, N, sizeof(IDX) * 8, ctx); + c = a < 50; + c = c.CopyTo(CPU); + for (int i = 0; i < N; ++i) ASSERT_EQ(Ptr(c)[i], (int)(i < 50)); + + c = a > 50; + c = c.CopyTo(CPU); + for (int i = 0; i < N; ++i) ASSERT_EQ(Ptr(c)[i], (int)(i > 50)); + + c = a >= 50; + c = c.CopyTo(CPU); + for (int i = 0; i < N; ++i) ASSERT_EQ(Ptr(c)[i], (int)(i >= 50)); + + c = a <= 50; + c = c.CopyTo(CPU); + for (int i = 0; i < N; ++i) ASSERT_EQ(Ptr(c)[i], (int)(i <= 50)); + + c = a == 50; + c = c.CopyTo(CPU); + for (int i = 0; i < N; ++i) ASSERT_EQ(Ptr(c)[i], (int)(i == 50)); + + c = a != 50; + c = c.CopyTo(CPU); + for (int i = 0; i < N; ++i) 
ASSERT_EQ(Ptr(c)[i], (int)(i != 50)); +} + +TEST(ArrayTest, Arith) { + _TestArith(CPU); + _TestArith(CPU); +#ifdef DGL_USE_CUDA + _TestArith(GPU); + _TestArith(GPU); +#endif +}; + +template +void _TestHStack(DGLContext ctx) { + IdArray a = aten::Range(0, 100, sizeof(IDX) * 8, ctx); + IdArray b = aten::Range(100, 200, sizeof(IDX) * 8, ctx); + IdArray c = aten::HStack(a, b).CopyTo(aten::CPU); + ASSERT_EQ(c->ndim, 1); + ASSERT_EQ(c->shape[0], 200); + for (int i = 0; i < 200; ++i) ASSERT_EQ(Ptr(c)[i], i); +} + +TEST(ArrayTest, HStack) { + _TestHStack(CPU); + _TestHStack(CPU); +#ifdef DGL_USE_CUDA + _TestHStack(GPU); + _TestHStack(GPU); +#endif +} + +template +void _TestIndexSelect(DGLContext ctx) { + IdArray a = aten::Range(0, 100, sizeof(IDX) * 8, ctx); + ASSERT_EQ(aten::IndexSelect(a, 50), 50); + ASSERT_TRUE(ArrayEQ( + aten::IndexSelect(a, 10, 20), aten::Range(10, 20, sizeof(IDX) * 8, ctx))); + IdArray b = + aten::VecToIdArray(std::vector({0, 20, 10}), sizeof(IDX) * 8, ctx); + IdArray c = aten::IndexSelect(a, b); + ASSERT_TRUE(ArrayEQ(b, c)); +} + +TEST(ArrayTest, TestIndexSelect) { + _TestIndexSelect(CPU); + _TestIndexSelect(CPU); +#ifdef DGL_USE_CUDA + _TestIndexSelect(GPU); + _TestIndexSelect(GPU); +#endif +} + +template +void _TestRelabel_(DGLContext ctx) { + IdArray a = + aten::VecToIdArray(std::vector({0, 20, 10}), sizeof(IDX) * 8, ctx); + IdArray b = + aten::VecToIdArray(std::vector({20, 5, 6}), sizeof(IDX) * 8, ctx); + IdArray c = aten::Relabel_({a, b}); + + IdArray ta = + aten::VecToIdArray(std::vector({0, 1, 2}), sizeof(IDX) * 8, ctx); + IdArray tb = + aten::VecToIdArray(std::vector({1, 3, 4}), sizeof(IDX) * 8, ctx); + IdArray tc = aten::VecToIdArray( + std::vector({0, 20, 10, 5, 6}), sizeof(IDX) * 8, ctx); + + ASSERT_TRUE(ArrayEQ(a, ta)); + ASSERT_TRUE(ArrayEQ(b, tb)); + ASSERT_TRUE(ArrayEQ(c, tc)); +} + +TEST(ArrayTest, TestRelabel_) { + _TestRelabel_(CPU); + _TestRelabel_(CPU); +#ifdef DGL_USE_CUDA + _TestRelabel_(GPU); + _TestRelabel_(GPU); +#endif +} + +template +void _TestConcat(DGLContext ctx) { + IdArray a = + aten::VecToIdArray(std::vector({1, 2, 3}), sizeof(IDX) * 8, CTX); + IdArray b = + aten::VecToIdArray(std::vector({4, 5, 6}), sizeof(IDX) * 8, CTX); + IdArray tc = aten::VecToIdArray( + std::vector({1, 2, 3, 4, 5, 6}), sizeof(IDX) * 8, CTX); + IdArray c = aten::Concat(std::vector{a, b}); + ASSERT_TRUE(ArrayEQ(c, tc)); + IdArray d = aten::Concat(std::vector{a, b, c}); + IdArray td = aten::VecToIdArray( + std::vector({1, 2, 3, 4, 5, 6, 1, 2, 3, 4, 5, 6}), sizeof(IDX) * 8, + CTX); + ASSERT_TRUE(ArrayEQ(d, td)); +} + +template +void _TestToSimpleCsr(DGLContext ctx) { + /** + * A = [[0, 0, 0, 0], + * [1, 0, 0, 1], + * [1, 1, 1, 1], + * [3, 2, 2, 3], + * [2, 0, 0, 2]] + * + * B = CSRToSimple(A) + * B = [[0, 0, 0, 0], + * [1, 0, 0, 1], + * [1, 1, 1, 1], + * [1, 1, 1, 1], + * [1, 0, 0, 1]] + */ + IdArray a_indptr = aten::VecToIdArray( + std::vector({0, 0, 2, 6, 16, 20}), sizeof(IdType) * 8, CTX); + IdArray a_indices = aten::VecToIdArray( + std::vector( + {0, 3, 0, 1, 2, 3, 0, 0, 0, 1, 1, 2, 2, 3, 3, 3, 0, 0, 3, 3}), + sizeof(IdType) * 8, CTX); + IdArray b_indptr = aten::VecToIdArray( + std::vector({0, 0, 2, 6, 10, 12}), sizeof(IdType) * 8, CTX); + IdArray b_indices = aten::VecToIdArray( + std::vector({0, 3, 0, 1, 2, 3, 0, 1, 2, 3, 0, 3}), + sizeof(IdType) * 8, CTX); + IdArray cnt = aten::VecToIdArray( + std::vector({1, 1, 1, 1, 1, 1, 3, 2, 2, 3, 2, 2}), + sizeof(IdType) * 8, CTX); + IdArray map = aten::VecToIdArray( + std::vector( + {0, 1, 2, 3, 4, 5, 6, 6, 6, 7, 7, 8, 8, 9, 
9, 9, 10, 10, 11, 11}), + sizeof(IdType) * 8, CTX); + const aten::CSRMatrix &csr_a = + aten::CSRMatrix(5, 4, a_indptr, a_indices, aten::NullArray(), true); + auto ret = CSRToSimple(csr_a); + aten::CSRMatrix csr_b = std::get<0>(ret); + IdArray ecnt = std::get<1>(ret); + IdArray emap = std::get<2>(ret); + ASSERT_EQ(csr_b.num_rows, 5); + ASSERT_EQ(csr_b.num_cols, 4); + ASSERT_TRUE(ArrayEQ(csr_b.indptr, b_indptr)); + ASSERT_TRUE(ArrayEQ(csr_b.indices, b_indices)); + ASSERT_TRUE(ArrayEQ(ecnt, cnt)); + ASSERT_TRUE(ArrayEQ(emap, map)); + ASSERT_TRUE(csr_b.sorted); + + // a not sorted + a_indices = aten::VecToIdArray( + std::vector( + {0, 3, 0, 1, 2, 3, 3, 0, 0, 1, 1, 2, 2, 3, 3, 0, 0, 3, 0, 3}), + sizeof(IdType) * 8, CTX); + map = aten::VecToIdArray( + std::vector( + {0, 1, 2, 3, 4, 5, 9, 6, 6, 7, 7, 8, 8, 9, 9, 6, 10, 11, 10, 11}), + sizeof(IdType) * 8, CTX); + const aten::CSRMatrix &csr_a2 = + aten::CSRMatrix(5, 4, a_indptr, a_indices, aten::NullArray(), false); + ret = CSRToSimple(csr_a2); + csr_b = std::get<0>(ret); + ecnt = std::get<1>(ret); + emap = std::get<2>(ret); + ASSERT_EQ(csr_b.num_rows, 5); + ASSERT_EQ(csr_b.num_cols, 4); + ASSERT_TRUE(ArrayEQ(csr_b.indptr, b_indptr)); + ASSERT_TRUE(ArrayEQ(csr_b.indices, b_indices)); + ASSERT_TRUE(ArrayEQ(ecnt, cnt)); + ASSERT_TRUE(ArrayEQ(emap, map)); + ASSERT_TRUE(csr_b.sorted); +} + +TEST(MatrixTest, TestToSimpleCsr) { + _TestToSimpleCsr(CPU); + _TestToSimpleCsr(CPU); +} + +template +void _TestToSimpleCoo(DGLContext ctx) { + /** + * A = [[0, 0, 0, 0], + * [1, 0, 0, 1], + * [1, 1, 1, 1], + * [3, 2, 2, 3], + * [2, 0, 0, 2]] + * + * B = CSRToSimple(A) + * B = [[0, 0, 0, 0], + * [1, 0, 0, 1], + * [1, 1, 1, 1], + * [1, 1, 1, 1], + * [1, 0, 0, 1]] + */ + IdArray a_row = aten::VecToIdArray( + std::vector( + {1, 1, 2, 2, 2, 2, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 4, 4, 4, 4}), + sizeof(IdType) * 8, CTX); + IdArray a_col = aten::VecToIdArray( + std::vector( + {0, 3, 0, 1, 2, 3, 0, 0, 0, 1, 1, 2, 2, 3, 3, 3, 0, 0, 3, 3}), + sizeof(IdType) * 8, CTX); + IdArray b_row = aten::VecToIdArray( + std::vector({1, 1, 2, 2, 2, 2, 3, 3, 3, 3, 4, 4}), + sizeof(IdType) * 8, CTX); + IdArray b_col = aten::VecToIdArray( + std::vector({0, 3, 0, 1, 2, 3, 0, 1, 2, 3, 0, 3}), + sizeof(IdType) * 8, CTX); + IdArray cnt = aten::VecToIdArray( + std::vector({1, 1, 1, 1, 1, 1, 3, 2, 2, 3, 2, 2}), + sizeof(IdType) * 8, CTX); + IdArray map = aten::VecToIdArray( + std::vector( + {0, 1, 2, 3, 4, 5, 6, 6, 6, 7, 7, 8, 8, 9, 9, 9, 10, 10, 11, 11}), + sizeof(IdType) * 8, CTX); + const aten::COOMatrix &coo_a = + aten::COOMatrix(5, 4, a_row, a_col, aten::NullArray(), true, true); + auto ret = COOToSimple(coo_a); + aten::COOMatrix coo_b = std::get<0>(ret); + IdArray ecnt = std::get<1>(ret); + IdArray emap = std::get<2>(ret); + ASSERT_EQ(coo_b.num_rows, 5); + ASSERT_EQ(coo_b.num_cols, 4); + ASSERT_TRUE(ArrayEQ(coo_b.row, b_row)); + ASSERT_TRUE(ArrayEQ(coo_b.col, b_col)); + ASSERT_TRUE(ArrayEQ(ecnt, cnt)); + ASSERT_TRUE(ArrayEQ(emap, map)); + ASSERT_FALSE(COOHasData(coo_b)); + ASSERT_TRUE(coo_b.row_sorted); + ASSERT_TRUE(coo_b.col_sorted); + + // a not sorted + a_row = aten::VecToIdArray( + std::vector( + {1, 2, 1, 2, 2, 2, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 4, 4, 4, 4}), + sizeof(IdType) * 8, CTX); + a_col = aten::VecToIdArray( + std::vector( + {0, 0, 3, 1, 2, 3, 0, 0, 0, 1, 1, 2, 2, 3, 3, 3, 0, 3, 0, 3}), + sizeof(IdType) * 8, CTX); + map = aten::VecToIdArray( + std::vector( + {0, 2, 1, 3, 4, 5, 6, 6, 6, 7, 7, 8, 8, 9, 9, 9, 10, 11, 10, 11}), + sizeof(IdType) * 8, CTX); + const aten::COOMatrix &coo_a2 = 
+ aten::COOMatrix(5, 4, a_row, a_col, aten::NullArray(), false, false); + ret = COOToSimple(coo_a2); + coo_b = std::get<0>(ret); + ecnt = std::get<1>(ret); + emap = std::get<2>(ret); + ASSERT_EQ(coo_b.num_rows, 5); + ASSERT_EQ(coo_b.num_cols, 4); + ASSERT_TRUE(ArrayEQ(coo_b.row, b_row)); + ASSERT_TRUE(ArrayEQ(coo_b.col, b_col)); + ASSERT_TRUE(ArrayEQ(ecnt, cnt)); + ASSERT_TRUE(ArrayEQ(emap, map)); + ASSERT_FALSE(COOHasData(coo_b)); + ASSERT_TRUE(coo_b.row_sorted); + ASSERT_TRUE(coo_b.col_sorted); +} + +TEST(MatrixTest, TestToSimpleCoo) { + _TestToSimpleCoo(CPU); + _TestToSimpleCoo(CPU); +} + +template +void _TestDisjointUnionPartitionCoo(DGLContext ctx) { + /** + * A = [[0, 0, 1], + * [1, 0, 1], + * [0, 1, 0]] + * + * B = [[1, 1, 0], + * [0, 1, 0]] + * + * C = [[1]] + * + * AB = [[0, 0, 1, 0, 0, 0], + * [1, 0, 1, 0, 0, 0], + * [0, 1, 0, 0, 0, 0], + * [0, 0, 0, 1, 1, 0], + * [0, 0, 0, 0, 1, 0]] + * + * ABC = [[0, 0, 1, 0, 0, 0, 0], + * [1, 0, 1, 0, 0, 0, 0], + * [0, 1, 0, 0, 0, 0, 0], + * [0, 0, 0, 1, 1, 0, 0], + * [0, 0, 0, 0, 1, 0, 0], + * [0, 0, 0, 0, 0, 0, 1]] + */ + IdArray a_row = aten::VecToIdArray( + std::vector({0, 1, 1, 2}), sizeof(IdType) * 8, CTX); + IdArray a_col = aten::VecToIdArray( + std::vector({2, 0, 2, 1}), sizeof(IdType) * 8, CTX); + IdArray b_row = aten::VecToIdArray( + std::vector({0, 0, 1}), sizeof(IdType) * 8, CTX); + IdArray b_col = aten::VecToIdArray( + std::vector({0, 1, 1}), sizeof(IdType) * 8, CTX); + IdArray b_data = aten::VecToIdArray( + std::vector({2, 0, 1}), sizeof(IdType) * 8, CTX); + IdArray c_row = + aten::VecToIdArray(std::vector({0}), sizeof(IdType) * 8, CTX); + IdArray c_col = + aten::VecToIdArray(std::vector({0}), sizeof(IdType) * 8, CTX); + IdArray ab_row = aten::VecToIdArray( + std::vector({0, 1, 1, 2, 3, 3, 4}), sizeof(IdType) * 8, CTX); + IdArray ab_col = aten::VecToIdArray( + std::vector({2, 0, 2, 1, 3, 4, 4}), sizeof(IdType) * 8, CTX); + IdArray ab_data = aten::VecToIdArray( + std::vector({0, 1, 2, 3, 6, 4, 5}), sizeof(IdType) * 8, CTX); + IdArray abc_row = aten::VecToIdArray( + std::vector({0, 1, 1, 2, 3, 3, 4, 5}), sizeof(IdType) * 8, CTX); + IdArray abc_col = aten::VecToIdArray( + std::vector({2, 0, 2, 1, 3, 4, 4, 6}), sizeof(IdType) * 8, CTX); + IdArray abc_data = aten::VecToIdArray( + std::vector({0, 1, 2, 3, 6, 4, 5, 7}), sizeof(IdType) * 8, CTX); + const aten::COOMatrix &coo_a = + aten::COOMatrix(3, 3, a_row, a_col, aten::NullArray(), true, false); + const aten::COOMatrix &coo_b = + aten::COOMatrix(2, 3, b_row, b_col, b_data, true, true); + const aten::COOMatrix &coo_c = + aten::COOMatrix(1, 1, c_row, c_col, aten::NullArray(), true, true); + + const std::vector coos_ab({coo_a, coo_b}); + const aten::COOMatrix &coo_ab = aten::DisjointUnionCoo(coos_ab); + ASSERT_EQ(coo_ab.num_rows, 5); + ASSERT_EQ(coo_ab.num_cols, 6); + ASSERT_TRUE(ArrayEQ(coo_ab.row, ab_row)); + ASSERT_TRUE(ArrayEQ(coo_ab.col, ab_col)); + ASSERT_TRUE(ArrayEQ(coo_ab.data, ab_data)); + ASSERT_TRUE(coo_ab.row_sorted); + ASSERT_FALSE(coo_ab.col_sorted); + + const std::vector edge_cumsum({0, 4, 7}); + const std::vector src_vertex_cumsum({0, 3, 5}); + const std::vector dst_vertex_cumsum({0, 3, 6}); + const std::vector &p_coos = + aten::DisjointPartitionCooBySizes( + coo_ab, 2, edge_cumsum, src_vertex_cumsum, dst_vertex_cumsum); + ASSERT_EQ(p_coos[0].num_rows, coo_a.num_rows); + ASSERT_EQ(p_coos[0].num_cols, coo_a.num_cols); + ASSERT_EQ(p_coos[1].num_rows, coo_b.num_rows); + ASSERT_EQ(p_coos[1].num_cols, coo_b.num_cols); + ASSERT_TRUE(ArrayEQ(p_coos[0].row, coo_a.row)); + 
ASSERT_TRUE(ArrayEQ(p_coos[0].col, coo_a.col)); + ASSERT_TRUE(ArrayEQ(p_coos[1].row, coo_b.row)); + ASSERT_TRUE(ArrayEQ(p_coos[1].col, coo_b.col)); + ASSERT_TRUE(ArrayEQ(p_coos[1].data, coo_b.data)); + ASSERT_TRUE(p_coos[0].row_sorted); + ASSERT_FALSE(p_coos[0].col_sorted); + ASSERT_TRUE(p_coos[1].row_sorted); + ASSERT_FALSE(p_coos[1].col_sorted); + + const std::vector coos_abc({coo_a, coo_b, coo_c}); + const aten::COOMatrix &coo_abc = aten::DisjointUnionCoo(coos_abc); + ASSERT_EQ(coo_abc.num_rows, 6); + ASSERT_EQ(coo_abc.num_cols, 7); + ASSERT_TRUE(ArrayEQ(coo_abc.row, abc_row)); + ASSERT_TRUE(ArrayEQ(coo_abc.col, abc_col)); + ASSERT_TRUE(ArrayEQ(coo_abc.data, abc_data)); + ASSERT_TRUE(coo_abc.row_sorted); + ASSERT_FALSE(coo_abc.col_sorted); + + const std::vector edge_cumsum_abc({0, 4, 7, 8}); + const std::vector src_vertex_cumsum_abc({0, 3, 5, 6}); + const std::vector dst_vertex_cumsum_abc({0, 3, 6, 7}); + const std::vector &p_coos_abc = + aten::DisjointPartitionCooBySizes( + coo_abc, 3, edge_cumsum_abc, src_vertex_cumsum_abc, + dst_vertex_cumsum_abc); + ASSERT_EQ(p_coos_abc[0].num_rows, coo_a.num_rows); + ASSERT_EQ(p_coos_abc[0].num_cols, coo_a.num_cols); + ASSERT_EQ(p_coos_abc[1].num_rows, coo_b.num_rows); + ASSERT_EQ(p_coos_abc[1].num_cols, coo_b.num_cols); + ASSERT_EQ(p_coos_abc[2].num_rows, coo_c.num_rows); + ASSERT_EQ(p_coos_abc[2].num_cols, coo_c.num_cols); + ASSERT_TRUE(ArrayEQ(p_coos_abc[0].row, coo_a.row)); + ASSERT_TRUE(ArrayEQ(p_coos_abc[0].col, coo_a.col)); + ASSERT_TRUE(ArrayEQ(p_coos_abc[1].row, coo_b.row)); + ASSERT_TRUE(ArrayEQ(p_coos_abc[1].col, coo_b.col)); + ASSERT_TRUE(ArrayEQ(p_coos_abc[1].data, coo_b.data)); + ASSERT_TRUE(ArrayEQ(p_coos_abc[2].row, coo_c.row)); + ASSERT_TRUE(ArrayEQ(p_coos_abc[2].col, coo_c.col)); + ASSERT_TRUE(p_coos_abc[0].row_sorted); + ASSERT_FALSE(p_coos_abc[0].col_sorted); + ASSERT_TRUE(p_coos_abc[1].row_sorted); + ASSERT_FALSE(p_coos_abc[1].col_sorted); + ASSERT_TRUE(p_coos_abc[2].row_sorted); + ASSERT_FALSE(p_coos_abc[2].col_sorted); +} + +TEST(DisjointUnionTest, TestDisjointUnionPartitionCoo) { + _TestDisjointUnionPartitionCoo(CPU); + _TestDisjointUnionPartitionCoo(CPU); +#ifdef DGL_USE_CUDA + _TestDisjointUnionPartitionCoo(GPU); + _TestDisjointUnionPartitionCoo(GPU); +#endif +} + +template +void _TestDisjointUnionPartitionCsr(DGLContext ctx) { + /** + * A = [[0, 0, 1], + * [1, 0, 1], + * [0, 1, 0]] + * + * B = [[1, 1, 0], + * [0, 1, 0]] + * + * C = [[1]] + * + * BC = [[1, 1, 0, 0], + * [0, 1, 0, 0], + * [0, 0, 0, 1]], + * + * ABC = [[0, 0, 1, 0, 0, 0, 0], + * [1, 0, 1, 0, 0, 0, 0], + * [0, 1, 0, 0, 0, 0, 0], + * [0, 0, 0, 1, 1, 0, 0], + * [0, 0, 0, 0, 1, 0, 0], + * [0, 0, 0, 0, 0, 0, 1]] + */ + IdArray a_indptr = aten::VecToIdArray( + std::vector({0, 1, 3, 4}), sizeof(IdType) * 8, CTX); + IdArray a_indices = aten::VecToIdArray( + std::vector({2, 0, 2, 1}), sizeof(IdType) * 8, CTX); + IdArray b_indptr = aten::VecToIdArray( + std::vector({0, 2, 3}), sizeof(IdType) * 8, CTX); + IdArray b_indices = aten::VecToIdArray( + std::vector({0, 1, 1}), sizeof(IdType) * 8, CTX); + IdArray b_data = aten::VecToIdArray( + std::vector({2, 0, 1}), sizeof(IdType) * 8, CTX); + IdArray c_indptr = + aten::VecToIdArray(std::vector({0, 1}), sizeof(IdType) * 8, CTX); + IdArray c_indices = + aten::VecToIdArray(std::vector({0}), sizeof(IdType) * 8, CTX); + IdArray bc_indptr = aten::VecToIdArray( + std::vector({0, 2, 3, 4}), sizeof(IdType) * 8, CTX); + IdArray bc_indices = aten::VecToIdArray( + std::vector({0, 1, 1, 3}), sizeof(IdType) * 8, CTX); + IdArray bc_data = 
aten::VecToIdArray( + std::vector({2, 0, 1, 3}), sizeof(IdType) * 8, CTX); + IdArray abc_indptr = aten::VecToIdArray( + std::vector({0, 1, 3, 4, 6, 7, 8}), sizeof(IdType) * 8, CTX); + IdArray abc_indices = aten::VecToIdArray( + std::vector({2, 0, 2, 1, 3, 4, 4, 6}), sizeof(IdType) * 8, CTX); + IdArray abc_data = aten::VecToIdArray( + std::vector({0, 1, 2, 3, 6, 4, 5, 7}), sizeof(IdType) * 8, CTX); + const aten::CSRMatrix &csr_a = + aten::CSRMatrix(3, 3, a_indptr, a_indices, aten::NullArray(), false); + const aten::CSRMatrix &csr_b = + aten::CSRMatrix(2, 3, b_indptr, b_indices, b_data, true); + const aten::CSRMatrix &csr_c = + aten::CSRMatrix(1, 1, c_indptr, c_indices, aten::NullArray(), true); + + const std::vector csrs_bc({csr_b, csr_c}); + const aten::CSRMatrix &csr_bc = aten::DisjointUnionCsr(csrs_bc); + ASSERT_EQ(csr_bc.num_rows, 3); + ASSERT_EQ(csr_bc.num_cols, 4); + ASSERT_TRUE(ArrayEQ(csr_bc.indptr, bc_indptr)); + ASSERT_TRUE(ArrayEQ(csr_bc.indices, bc_indices)); + ASSERT_TRUE(ArrayEQ(csr_bc.data, bc_data)); + ASSERT_TRUE(csr_bc.sorted); + + const std::vector edge_cumsum({0, 3, 4}); + const std::vector src_vertex_cumsum({0, 2, 3}); + const std::vector dst_vertex_cumsum({0, 3, 4}); + const std::vector &p_csrs = + aten::DisjointPartitionCsrBySizes( + csr_bc, 2, edge_cumsum, src_vertex_cumsum, dst_vertex_cumsum); + ASSERT_EQ(p_csrs[0].num_rows, csr_b.num_rows); + ASSERT_EQ(p_csrs[0].num_cols, csr_b.num_cols); + ASSERT_EQ(p_csrs[1].num_rows, csr_c.num_rows); + ASSERT_EQ(p_csrs[1].num_cols, csr_c.num_cols); + ASSERT_TRUE(ArrayEQ(p_csrs[0].indptr, csr_b.indptr)); + ASSERT_TRUE(ArrayEQ(p_csrs[0].indices, csr_b.indices)); + ASSERT_TRUE(ArrayEQ(p_csrs[0].data, csr_b.data)); + ASSERT_TRUE(ArrayEQ(p_csrs[1].indptr, csr_c.indptr)); + ASSERT_TRUE(ArrayEQ(p_csrs[1].indices, csr_c.indices)); + ASSERT_TRUE(p_csrs[0].sorted); + ASSERT_TRUE(p_csrs[1].sorted); + + const std::vector csrs_abc({csr_a, csr_b, csr_c}); + const aten::CSRMatrix &csr_abc = aten::DisjointUnionCsr(csrs_abc); + ASSERT_EQ(csr_abc.num_rows, 6); + ASSERT_EQ(csr_abc.num_cols, 7); + ASSERT_TRUE(ArrayEQ(csr_abc.indptr, abc_indptr)); + ASSERT_TRUE(ArrayEQ(csr_abc.indices, abc_indices)); + ASSERT_TRUE(ArrayEQ(csr_abc.data, abc_data)); + ASSERT_FALSE(csr_abc.sorted); + + const std::vector edge_cumsum_abc({0, 4, 7, 8}); + const std::vector src_vertex_cumsum_abc({0, 3, 5, 6}); + const std::vector dst_vertex_cumsum_abc({0, 3, 6, 7}); + const std::vector &p_csrs_abc = + aten::DisjointPartitionCsrBySizes( + csr_abc, 3, edge_cumsum_abc, src_vertex_cumsum_abc, + dst_vertex_cumsum_abc); + ASSERT_EQ(p_csrs_abc[0].num_rows, csr_a.num_rows); + ASSERT_EQ(p_csrs_abc[0].num_cols, csr_a.num_cols); + ASSERT_EQ(p_csrs_abc[1].num_rows, csr_b.num_rows); + ASSERT_EQ(p_csrs_abc[1].num_cols, csr_b.num_cols); + ASSERT_EQ(p_csrs_abc[2].num_rows, csr_c.num_rows); + ASSERT_EQ(p_csrs_abc[2].num_cols, csr_c.num_cols); + ASSERT_TRUE(ArrayEQ(p_csrs_abc[0].indptr, csr_a.indptr)); + ASSERT_TRUE(ArrayEQ(p_csrs_abc[0].indices, csr_a.indices)); + ASSERT_TRUE(ArrayEQ(p_csrs_abc[1].indptr, csr_b.indptr)); + ASSERT_TRUE(ArrayEQ(p_csrs_abc[1].indices, csr_b.indices)); + ASSERT_TRUE(ArrayEQ(p_csrs_abc[1].data, csr_b.data)); + ASSERT_TRUE(ArrayEQ(p_csrs_abc[2].indptr, csr_c.indptr)); + ASSERT_TRUE(ArrayEQ(p_csrs_abc[2].indices, csr_c.indices)); + ASSERT_FALSE(p_csrs_abc[0].sorted); + ASSERT_FALSE(p_csrs_abc[1].sorted); + ASSERT_FALSE(p_csrs_abc[2].sorted); +} + +TEST(DisjointUnionTest, TestDisjointUnionPartitionCsr) { + _TestDisjointUnionPartitionCsr(CPU); + 
_TestDisjointUnionPartitionCsr(CPU); +#ifdef DGL_USE_CUDA + _TestDisjointUnionPartitionCsr(GPU); + _TestDisjointUnionPartitionCsr(GPU); +#endif +} + +template +void _TestSliceContiguousChunkCoo(DGLContext ctx) { + /** + * A = [[1, 0, 0, 0], + * [0, 0, 1, 0], + * [0, 0, 0, 0]] + * + * B = [[1, 0, 0], + * [0, 0, 1]] + * + * C = [[0]] + * + */ + IdArray a_row = + aten::VecToIdArray(std::vector({0, 1}), sizeof(IdType) * 8, CTX); + IdArray a_col = + aten::VecToIdArray(std::vector({0, 2}), sizeof(IdType) * 8, CTX); + const aten::COOMatrix &coo_a = + aten::COOMatrix(3, 4, a_row, a_col, aten::NullArray(), true, false); + + IdArray b_row = + aten::VecToIdArray(std::vector({0, 1}), sizeof(IdType) * 8, CTX); + IdArray b_col = + aten::VecToIdArray(std::vector({0, 2}), sizeof(IdType) * 8, CTX); + const aten::COOMatrix &coo_b_raw = + aten::COOMatrix(2, 3, b_row, b_col, aten::NullArray(), true, false); + + const std::vector edge_range_b({0, 2}); + const std::vector src_vertex_range_b({0, 2}); + const std::vector dst_vertex_range_b({0, 3}); + const aten::COOMatrix &coo_b = aten::COOSliceContiguousChunk( + coo_a, edge_range_b, src_vertex_range_b, dst_vertex_range_b); + ASSERT_EQ(coo_b_raw.num_rows, coo_b.num_rows); + ASSERT_EQ(coo_b_raw.num_cols, coo_b.num_cols); + ASSERT_TRUE(ArrayEQ(coo_b_raw.row, coo_b.row)); + ASSERT_TRUE(ArrayEQ(coo_b_raw.col, coo_b.col)); + ASSERT_TRUE(coo_b.row_sorted); + ASSERT_FALSE(coo_b.col_sorted); + + IdArray c_row = + aten::VecToIdArray(std::vector({}), sizeof(IdType) * 8, CTX); + IdArray c_col = + aten::VecToIdArray(std::vector({}), sizeof(IdType) * 8, CTX); + const aten::COOMatrix &coo_c_raw = + aten::COOMatrix(1, 1, c_row, c_col, aten::NullArray(), true, false); + + const std::vector edge_range_c({2, 2}); + const std::vector src_vertex_range_c({2, 3}); + const std::vector dst_vertex_range_c({3, 4}); + const aten::COOMatrix &coo_c = aten::COOSliceContiguousChunk( + coo_a, edge_range_c, src_vertex_range_c, dst_vertex_range_c); + ASSERT_EQ(coo_c_raw.num_rows, coo_c.num_rows); + ASSERT_EQ(coo_c_raw.num_cols, coo_c.num_cols); + ASSERT_TRUE(ArrayEQ(coo_c.row, c_row)); + ASSERT_TRUE(ArrayEQ(coo_c.col, c_col)); + ASSERT_TRUE(coo_c.row_sorted); + ASSERT_FALSE(coo_c.col_sorted); +} + +TEST(SliceContiguousChunk, TestSliceContiguousChunkCoo) { + _TestSliceContiguousChunkCoo(CPU); + _TestSliceContiguousChunkCoo(CPU); +#ifdef DGL_USE_CUDA + _TestSliceContiguousChunkCoo(GPU); + _TestSliceContiguousChunkCoo(GPU); +#endif +} + +template +void _TestSliceContiguousChunkCsr(DGLContext ctx) { + /** + * A = [[1, 0, 0, 0], + * [0, 0, 1, 0], + * [0, 0, 0, 0]] + * + * B = [[1, 0, 0], + * [0, 0, 1]] + * + * C = [[0]] + * + */ + IdArray a_indptr = aten::VecToIdArray( + std::vector({0, 1, 2, 2}), sizeof(IdType) * 8, CTX); + IdArray a_indices = + aten::VecToIdArray(std::vector({0, 2}), sizeof(IdType) * 8, CTX); + const aten::CSRMatrix &csr_a = + aten::CSRMatrix(3, 4, a_indptr, a_indices, aten::NullArray(), false); + + IdArray b_indptr = aten::VecToIdArray( + std::vector({0, 1, 2}), sizeof(IdType) * 8, CTX); + IdArray b_indices = + aten::VecToIdArray(std::vector({0, 2}), sizeof(IdType) * 8, CTX); + const aten::CSRMatrix &csr_b_raw = + aten::CSRMatrix(2, 3, b_indptr, b_indices, aten::NullArray(), false); + + const std::vector edge_range_b({0, 2}); + const std::vector src_vertex_range_b({0, 2}); + const std::vector dst_vertex_range_b({0, 3}); + const aten::CSRMatrix &csr_b = aten::CSRSliceContiguousChunk( + csr_a, edge_range_b, src_vertex_range_b, dst_vertex_range_b); + ASSERT_EQ(csr_b.num_rows, 
csr_b_raw.num_rows); + ASSERT_EQ(csr_b.num_cols, csr_b_raw.num_cols); + ASSERT_TRUE(ArrayEQ(csr_b.indptr, csr_b_raw.indptr)); + ASSERT_TRUE(ArrayEQ(csr_b.indices, csr_b_raw.indices)); + ASSERT_FALSE(csr_b.sorted); + + const std::vector edge_range_c({2, 2}); + const std::vector src_vertex_range_c({2, 3}); + const std::vector dst_vertex_range_c({3, 4}); + const aten::CSRMatrix &csr_c = aten::CSRSliceContiguousChunk( + csr_a, edge_range_c, src_vertex_range_c, dst_vertex_range_c); + + int64_t indptr_len = src_vertex_range_c[1] - src_vertex_range_c[0] + 1; + IdArray c_indptr = aten::Full(0, indptr_len, sizeof(IdType) * 8, CTX); + IdArray c_indices = + aten::VecToIdArray(std::vector({}), sizeof(IdType) * 8, CTX); + const aten::CSRMatrix &csr_c_raw = + aten::CSRMatrix(1, 1, c_indptr, c_indices, aten::NullArray(), false); + + ASSERT_EQ(csr_c.num_rows, csr_c_raw.num_rows); + ASSERT_EQ(csr_c.num_cols, csr_c_raw.num_cols); + ASSERT_TRUE(ArrayEQ(csr_c.indptr, c_indptr)); + ASSERT_TRUE(ArrayEQ(csr_c.indices, c_indices)); + ASSERT_FALSE(csr_c.sorted); +} + +TEST(SliceContiguousChunk, TestSliceContiguousChunkCsr) { + _TestSliceContiguousChunkCsr(CPU); + _TestSliceContiguousChunkCsr(CPU); +#ifdef DGL_USE_CUDA + _TestSliceContiguousChunkCsr(GPU); + _TestSliceContiguousChunkCsr(GPU); +#endif +} + +template +void _TestMatrixUnionCsr(DGLContext ctx) { + /** + * A = [[0, 0, 0, 0], + * [0, 0, 0, 0], + * [0, 1, 0, 0], + * [1, 1, 1, 1], + * [0, 1, 1, 0], + * [1, 0, 0, 1]] + * + * B = [[0, 0, 0, 0], + * [1, 0, 0, 1], + * [0, 0, 1, 0], + * [1, 0, 0, 1], + * [1, 0, 0, 1]] + * [1, 0, 0, 1]] + * + * C = UnionCsr({A, B}) + * + * C = [[0, 0, 0, 0], + * [1, 0, 0, 1], + * [0, 1, 1, 0], + * [2, 1, 1, 2], + * [1, 1, 1, 1]] + * [2, 0, 0, 2]] + * + * D = [[1, 0, 0, 0], + * [0, 0, 0, 0], + * [0, 0, 0, 0], + * [0, 0, 0, 0], + * [0, 0, 0, 0], + * [1, 0, 0, 1]] + * + * C = UnionCsr({A, B, D}) + * + * C = [[1, 0, 0, 0], + * [1, 0, 0, 1], + * [0, 1, 1, 0], + * [2, 1, 1, 2], + * [1, 1, 1, 1]] + * [3, 0, 0, 3]] + */ + IdArray a_indptr = aten::VecToIdArray( + std::vector({0, 0, 0, 1, 5, 7, 9}), sizeof(IdType) * 8, CTX); + IdArray a_indices = aten::VecToIdArray( + std::vector({1, 0, 1, 2, 3, 1, 2, 0, 3}), sizeof(IdType) * 8, + CTX); + IdArray b_indptr = aten::VecToIdArray( + std::vector({0, 0, 2, 3, 5, 7, 9}), sizeof(IdType) * 8, CTX); + IdArray b_indices = aten::VecToIdArray( + std::vector({0, 3, 2, 0, 3, 0, 3, 0, 3}), sizeof(IdType) * 8, + CTX); + IdArray c_indptr = aten::VecToIdArray( + std::vector({0, 0, 2, 4, 10, 14, 18}), sizeof(IdType) * 8, CTX); + IdArray c_indices = aten::VecToIdArray( + std::vector( + {0, 3, 1, 2, 0, 0, 1, 2, 3, 3, 0, 1, 2, 3, 0, 0, 3, 3}), + sizeof(IdType) * 8, CTX); + IdArray c_data = aten::VecToIdArray( + std::vector( + {9, 10, 0, 11, 1, 12, 2, 3, 4, 13, 14, 5, 6, 15, 7, 16, 8, 17}), + sizeof(IdType) * 8, CTX); + + const aten::CSRMatrix &csr_a = + aten::CSRMatrix(6, 4, a_indptr, a_indices, aten::NullArray(), true); + const aten::CSRMatrix &csr_b = + aten::CSRMatrix(6, 4, b_indptr, b_indices, aten::NullArray(), true); + + const aten::CSRMatrix &csr_aUb = aten::UnionCsr({csr_a, csr_b}); + ASSERT_EQ(csr_aUb.num_rows, 6); + ASSERT_EQ(csr_aUb.num_cols, 4); + ASSERT_TRUE(ArrayEQ(csr_aUb.indptr, c_indptr)); + ASSERT_TRUE(ArrayEQ(csr_aUb.indices, c_indices)); + ASSERT_TRUE(ArrayEQ(csr_aUb.data, c_data)); + ASSERT_TRUE(csr_aUb.sorted); + + IdArray a_data = aten::VecToIdArray( + std::vector({8, 7, 6, 5, 4, 3, 2, 1, 0}), sizeof(IdType) * 8, + CTX); + + c_data = aten::VecToIdArray( + std::vector( + {9, 10, 8, 11, 7, 
12, 6, 5, 4, 13, 14, 3, 2, 15, 1, 16, 0, 17}), + sizeof(IdType) * 8, CTX); + const aten::CSRMatrix &csr_ad = + aten::CSRMatrix(6, 4, a_indptr, a_indices, a_data, true); + const aten::CSRMatrix &csr_adUb = aten::UnionCsr({csr_ad, csr_b}); + ASSERT_EQ(csr_adUb.num_rows, 6); + ASSERT_EQ(csr_adUb.num_cols, 4); + ASSERT_TRUE(ArrayEQ(csr_adUb.indptr, c_indptr)); + ASSERT_TRUE(ArrayEQ(csr_adUb.indices, c_indices)); + ASSERT_TRUE(ArrayEQ(csr_adUb.data, c_data)); + ASSERT_TRUE(csr_adUb.sorted); + + IdArray b_indices2 = aten::VecToIdArray( + std::vector({0, 3, 2, 0, 3, 3, 0, 0, 3}), sizeof(IdType) * 8, + CTX); + c_indices = aten::VecToIdArray( + std::vector( + {0, 3, 1, 2, 0, 1, 2, 3, 0, 3, 1, 2, 3, 0, 0, 3, 0, 3}), + sizeof(IdType) * 8, CTX); + c_data = aten::VecToIdArray( + std::vector( + {9, 10, 0, 11, 1, 2, 3, 4, 12, 13, 5, 6, 14, 15, 7, 8, 16, 17}), + sizeof(IdType) * 8, CTX); + const aten::CSRMatrix &csr_b2 = + aten::CSRMatrix(6, 4, b_indptr, b_indices2, aten::NullArray(), false); + const aten::CSRMatrix &csr_aUb2 = aten::UnionCsr({csr_a, csr_b2}); + ASSERT_EQ(csr_aUb2.num_rows, 6); + ASSERT_EQ(csr_aUb2.num_cols, 4); + ASSERT_TRUE(ArrayEQ(csr_aUb2.indptr, c_indptr)); + ASSERT_TRUE(ArrayEQ(csr_aUb2.indices, c_indices)); + ASSERT_TRUE(ArrayEQ(csr_aUb2.data, c_data)); + ASSERT_FALSE(csr_aUb2.sorted); + + IdArray a_indices2 = aten::VecToIdArray( + std::vector({1, 3, 2, 1, 0, 1, 2, 0, 3}), sizeof(IdType) * 8, + CTX); + c_indices = aten::VecToIdArray( + std::vector( + {0, 3, 1, 2, 3, 2, 1, 0, 0, 3, 1, 2, 0, 3, 0, 3, 0, 3}), + sizeof(IdType) * 8, CTX); + const aten::CSRMatrix &csr_a2 = + aten::CSRMatrix(6, 4, a_indptr, a_indices2, aten::NullArray(), false); + const aten::CSRMatrix &csr_aUb3 = aten::UnionCsr({csr_a2, csr_b}); + ASSERT_EQ(csr_aUb3.num_rows, 6); + ASSERT_EQ(csr_aUb3.num_cols, 4); + ASSERT_TRUE(ArrayEQ(csr_aUb3.indptr, c_indptr)); + ASSERT_TRUE(ArrayEQ(csr_aUb3.indices, c_indices)); + ASSERT_TRUE(ArrayEQ(csr_aUb3.data, c_data)); + ASSERT_FALSE(csr_aUb3.sorted); + + c_indices = aten::VecToIdArray( + std::vector( + {0, 3, 1, 2, 3, 2, 1, 0, 0, 3, 1, 2, 3, 0, 0, 3, 0, 3}), + sizeof(IdType) * 8, CTX); + const aten::CSRMatrix &csr_aUb4 = aten::UnionCsr({csr_a2, csr_b2}); + ASSERT_EQ(csr_aUb4.num_rows, 6); + ASSERT_EQ(csr_aUb4.num_cols, 4); + ASSERT_TRUE(ArrayEQ(csr_aUb4.indptr, c_indptr)); + ASSERT_TRUE(ArrayEQ(csr_aUb4.indices, c_indices)); + ASSERT_TRUE(ArrayEQ(csr_aUb4.data, c_data)); + ASSERT_FALSE(csr_aUb4.sorted); + + IdArray d_indptr = aten::VecToIdArray( + std::vector({0, 1, 1, 1, 1, 1, 3}), sizeof(IdType) * 8, CTX); + IdArray d_indices = aten::VecToIdArray( + std::vector({0, 0, 3}), sizeof(IdType) * 8, CTX); + c_indptr = aten::VecToIdArray( + std::vector({0, 1, 3, 5, 11, 15, 21}), sizeof(IdType) * 8, CTX); + c_indices = aten::VecToIdArray( + std::vector( + {0, 0, 3, 1, 2, 0, 0, 1, 2, 3, 3, 0, 1, 2, 3, 0, 0, 0, 3, 3, 3}), + sizeof(IdType) * 8, CTX); + c_data = aten::VecToIdArray( + std::vector({18, 9, 10, 8, 11, 7, 12, 6, 5, 4, 13, + 14, 3, 2, 15, 1, 16, 19, 0, 17, 20}), + sizeof(IdType) * 8, CTX); + const aten::CSRMatrix &csr_d = + aten::CSRMatrix(6, 4, d_indptr, d_indices, aten::NullArray(), true); + const aten::CSRMatrix &csr_aUbUd = aten::UnionCsr({csr_ad, csr_b, csr_d}); + ASSERT_EQ(csr_aUbUd.num_rows, 6); + ASSERT_EQ(csr_aUbUd.num_cols, 4); + ASSERT_TRUE(ArrayEQ(csr_aUbUd.indptr, c_indptr)); + ASSERT_TRUE(ArrayEQ(csr_aUbUd.indices, c_indices)); + ASSERT_TRUE(ArrayEQ(csr_aUbUd.data, c_data)); + ASSERT_TRUE(csr_aUbUd.sorted); + + c_indices = aten::VecToIdArray( + std::vector( + {0, 
0, 3, 1, 2, 3, 2, 1, 0, 0, 3, 1, 2, 3, 0, 0, 3, 0, 3, 0, 3}), + sizeof(IdType) * 8, CTX); + c_data = aten::VecToIdArray( + std::vector({18, 9, 10, 0, 11, 1, 2, 3, 4, 12, 13, + 5, 6, 14, 15, 7, 8, 16, 17, 19, 20}), + sizeof(IdType) * 8, CTX); + + const aten::CSRMatrix &csr_aUbUd2 = aten::UnionCsr({csr_a2, csr_b2, csr_d}); + ASSERT_EQ(csr_aUbUd2.num_rows, 6); + ASSERT_EQ(csr_aUbUd2.num_cols, 4); + ASSERT_TRUE(ArrayEQ(csr_aUbUd2.indptr, c_indptr)); + ASSERT_TRUE(ArrayEQ(csr_aUbUd2.indices, c_indices)); + ASSERT_TRUE(ArrayEQ(csr_aUbUd2.data, c_data)); + ASSERT_FALSE(csr_aUbUd2.sorted); +} + +TEST(MatrixUnionTest, TestMatrixUnionCsr) { + _TestMatrixUnionCsr(CPU); + _TestMatrixUnionCsr(CPU); +} + +template +void _TestMatrixUnionCoo(DGLContext ctx) { + /** + * A = [[0, 0, 0, 0], + * [0, 0, 0, 0], + * [0, 1, 0, 0], + * [1, 1, 1, 1], + * [0, 1, 1, 0], + * [1, 0, 0, 1]] + * + * B = [[0, 0, 0, 0], + * [1, 0, 0, 1], + * [0, 0, 1, 0], + * [1, 0, 0, 1], + * [1, 0, 0, 1]] + * [1, 0, 0, 1]] + * + * C = UnionCsr({A, B}) + * + * C = [[0, 0, 0, 0], + * [1, 0, 0, 1], + * [0, 1, 1, 0], + * [2, 1, 1, 2], + * [1, 1, 1, 1]] + * [2, 0, 0, 2]] + * + * D = [[1, 0, 0, 0], + * [0, 0, 0, 0], + * [0, 0, 0, 0], + * [0, 0, 0, 0], + * [0, 0, 0, 0], + * [1, 0, 0, 1]] + * + * C = UnionCsr({A, B, D}) + * + * C = [[1, 0, 0, 0], + * [1, 0, 0, 1], + * [0, 1, 1, 0], + * [2, 1, 1, 2], + * [1, 1, 1, 1]] + * [3, 0, 0, 3]] + */ + IdArray a_row = aten::VecToIdArray( + std::vector({2, 3, 3, 3, 3, 4, 4, 5, 5}), sizeof(IdType) * 8, + CTX); + IdArray a_col = aten::VecToIdArray( + std::vector({1, 0, 1, 2, 3, 1, 2, 0, 3}), sizeof(IdType) * 8, + CTX); + IdArray b_row = aten::VecToIdArray( + std::vector({1, 1, 2, 3, 3, 4, 4, 5, 5}), sizeof(IdType) * 8, + CTX); + IdArray b_col = aten::VecToIdArray( + std::vector({0, 3, 2, 0, 3, 0, 3, 0, 3}), sizeof(IdType) * 8, + CTX); + IdArray c_row = aten::VecToIdArray( + std::vector( + {2, 3, 3, 3, 3, 4, 4, 5, 5, 1, 1, 2, 3, 3, 4, 4, 5, 5}), + sizeof(IdType) * 8, CTX); + IdArray c_col = aten::VecToIdArray( + std::vector( + {1, 0, 1, 2, 3, 1, 2, 0, 3, 0, 3, 2, 0, 3, 0, 3, 0, 3}), + sizeof(IdType) * 8, CTX); + const aten::COOMatrix &coo_a = + aten::COOMatrix(6, 4, a_row, a_col, aten::NullArray(), true, true); + const aten::COOMatrix &coo_b = + aten::COOMatrix(6, 4, b_row, b_col, aten::NullArray(), true, true); + const std::vector coos_ab({coo_a, coo_b}); + const aten::COOMatrix &coo_ab = aten::UnionCoo(coos_ab); + ASSERT_EQ(coo_ab.num_rows, 6); + ASSERT_EQ(coo_ab.num_cols, 4); + ASSERT_TRUE(ArrayEQ(coo_ab.row, c_row)); + ASSERT_TRUE(ArrayEQ(coo_ab.col, c_col)); + ASSERT_FALSE(COOHasData(coo_ab)); + ASSERT_FALSE(coo_ab.row_sorted); + ASSERT_FALSE(coo_ab.col_sorted); + + IdArray a_data = aten::VecToIdArray( + std::vector({2, 1, 0, 3, 4, 5, 6, 7, 8}), sizeof(IdType) * 8, + CTX); + + IdArray c_data = aten::VecToIdArray( + std::vector( + {2, 1, 0, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17}), + sizeof(IdType) * 8, CTX); + const aten::COOMatrix &coo_a2 = + aten::COOMatrix(6, 4, a_row, a_col, a_data, true, true); + const std::vector coos_ab2({coo_a2, coo_b}); + const aten::COOMatrix &coo_ab2 = aten::UnionCoo(coos_ab2); + ASSERT_EQ(coo_ab2.num_rows, 6); + ASSERT_EQ(coo_ab2.num_cols, 4); + ASSERT_TRUE(ArrayEQ(coo_ab2.row, c_row)); + ASSERT_TRUE(ArrayEQ(coo_ab2.col, c_col)); + ASSERT_TRUE(COOHasData(coo_ab2)); + ASSERT_TRUE(ArrayEQ(coo_ab2.data, c_data)); + ASSERT_FALSE(coo_ab2.row_sorted); + ASSERT_FALSE(coo_ab2.col_sorted); + + IdArray b_data = aten::VecToIdArray( + std::vector({0, 1, 2, 3, 4, 5, 6, 8, 7}), 
sizeof(IdType) * 8, + CTX); + c_data = aten::VecToIdArray( + std::vector( + {2, 1, 0, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 17, 16}), + sizeof(IdType) * 8, CTX); + const aten::COOMatrix &coo_b2 = + aten::COOMatrix(6, 4, b_row, b_col, b_data, true, true); + const std::vector coos_ab3({coo_a2, coo_b2}); + const aten::COOMatrix &coo_ab3 = aten::UnionCoo(coos_ab3); + ASSERT_EQ(coo_ab3.num_rows, 6); + ASSERT_EQ(coo_ab3.num_cols, 4); + ASSERT_TRUE(ArrayEQ(coo_ab3.row, c_row)); + ASSERT_TRUE(ArrayEQ(coo_ab3.col, c_col)); + ASSERT_TRUE(COOHasData(coo_ab3)); + ASSERT_TRUE(ArrayEQ(coo_ab3.data, c_data)); + ASSERT_FALSE(coo_ab3.row_sorted); + ASSERT_FALSE(coo_ab3.col_sorted); + + c_data = aten::VecToIdArray( + std::vector( + {2, 1, 0, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 17, 16}), + sizeof(IdType) * 8, CTX); + + const std::vector coos_ab4({coo_a2, coo_b2}); + const aten::COOMatrix &coo_ab4 = aten::UnionCoo(coos_ab4); + ASSERT_EQ(coo_ab4.num_rows, 6); + ASSERT_EQ(coo_ab4.num_cols, 4); + ASSERT_TRUE(ArrayEQ(coo_ab4.row, c_row)); + ASSERT_TRUE(ArrayEQ(coo_ab4.col, c_col)); + ASSERT_TRUE(COOHasData(coo_ab4)); + ASSERT_TRUE(ArrayEQ(coo_ab4.data, c_data)); + ASSERT_FALSE(coo_ab4.row_sorted); + ASSERT_FALSE(coo_ab4.col_sorted); + + IdArray d_row = aten::VecToIdArray( + std::vector({0, 5, 5}), sizeof(IdType) * 8, CTX); + IdArray d_col = aten::VecToIdArray( + std::vector({0, 0, 3}), sizeof(IdType) * 8, CTX); + c_row = aten::VecToIdArray( + std::vector( + {2, 3, 3, 3, 3, 4, 4, 5, 5, 1, 1, 2, 3, 3, 4, 4, 5, 5, 0, 5, 5}), + sizeof(IdType) * 8, CTX); + c_col = aten::VecToIdArray( + std::vector( + {1, 0, 1, 2, 3, 1, 2, 0, 3, 0, 3, 2, 0, 3, 0, 3, 0, 3, 0, 0, 3}), + sizeof(IdType) * 8, CTX); + + const aten::COOMatrix &coo_d = + aten::COOMatrix(6, 4, d_row, d_col, aten::NullArray(), true, true); + const aten::COOMatrix &csr_aUbUd = aten::UnionCoo({coo_a, coo_b, coo_d}); + ASSERT_EQ(csr_aUbUd.num_rows, 6); + ASSERT_EQ(csr_aUbUd.num_cols, 4); + ASSERT_TRUE(ArrayEQ(csr_aUbUd.row, c_row)); + ASSERT_TRUE(ArrayEQ(csr_aUbUd.col, c_col)); + ASSERT_FALSE(COOHasData(csr_aUbUd)); + ASSERT_FALSE(csr_aUbUd.row_sorted); + ASSERT_FALSE(csr_aUbUd.col_sorted); + + c_data = aten::VecToIdArray( + std::vector({2, 1, 0, 3, 4, 5, 6, 7, 8, 9, 10, + 11, 12, 13, 14, 15, 17, 16, 18, 19, 20}), + sizeof(IdType) * 8, CTX); + + const aten::COOMatrix &csr_aUbUd2 = aten::UnionCoo({coo_a2, coo_b2, coo_d}); + ASSERT_EQ(csr_aUbUd2.num_rows, 6); + ASSERT_EQ(csr_aUbUd2.num_cols, 4); + ASSERT_TRUE(ArrayEQ(csr_aUbUd2.row, c_row)); + ASSERT_TRUE(ArrayEQ(csr_aUbUd2.col, c_col)); + ASSERT_TRUE(COOHasData(csr_aUbUd2)); + ASSERT_TRUE(ArrayEQ(csr_aUbUd2.data, c_data)); + ASSERT_FALSE(csr_aUbUd2.row_sorted); + ASSERT_FALSE(csr_aUbUd2.col_sorted); +} + +TEST(MatrixUnionTest, TestMatrixUnionCoo) { + _TestMatrixUnionCoo(CPU); + _TestMatrixUnionCoo(CPU); +} + +template +void _TestCumSum(DGLContext ctx) { + IdArray a = aten::VecToIdArray( + std::vector({8, 6, 7, 5, 3, 0, 9}), sizeof(IDX) * 8, ctx); + { + IdArray tb = aten::VecToIdArray( + std::vector({8, 14, 21, 26, 29, 29, 38}), sizeof(IDX) * 8, ctx); + IdArray b = aten::CumSum(a); + ASSERT_TRUE(ArrayEQ(b, tb)); + } + { + IdArray tb = aten::VecToIdArray( + std::vector({0, 8, 14, 21, 26, 29, 29, 38}), sizeof(IDX) * 8, ctx); + IdArray b = aten::CumSum(a, true); + ASSERT_TRUE(ArrayEQ(b, tb)); + } + a = aten::VecToIdArray(std::vector({}), sizeof(IDX) * 8, ctx); + { + IdArray tb = aten::VecToIdArray(std::vector({}), sizeof(IDX) * 8, ctx); + IdArray b = aten::CumSum(a); + ASSERT_TRUE(ArrayEQ(b, tb)); + } + 
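+  // CumSum of an empty array should itself be empty.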
{ + IdArray tb = aten::VecToIdArray(std::vector({}), sizeof(IDX) * 8, ctx); + IdArray b = aten::CumSum(a); + ASSERT_TRUE(ArrayEQ(b, tb)); + } +} + +TEST(ArrayTest, CumSum) { + _TestCumSum(CPU); + _TestCumSum(CPU); +#ifdef DGL_USE_CUDA + _TestCumSum(GPU); + _TestCumSum(GPU); +#endif +} + +template +void _TestScatter_(DGLContext ctx) { + IdArray out = aten::Full(1, 10, 8 * sizeof(IDX), ctx); + IdArray idx = + aten::VecToIdArray(std::vector({2, 3, 9}), sizeof(IDX) * 8, ctx); + IdArray val = + aten::VecToIdArray(std::vector({-20, 30, 90}), sizeof(IDX) * 8, ctx); + aten::Scatter_(idx, val, out); + IdArray tout = aten::VecToIdArray( + std::vector({1, 1, -20, 30, 1, 1, 1, 1, 1, 90}), sizeof(IDX) * 8, + ctx); + ASSERT_TRUE(ArrayEQ(out, tout)); +} + +TEST(ArrayTest, Scatter_) { + _TestScatter_(CPU); + _TestScatter_(CPU); + _TestScatter_(CPU); + _TestScatter_(CPU); +#ifdef DGL_USE_CUDA + _TestScatter_(GPU); + _TestScatter_(GPU); + _TestScatter_(GPU); + _TestScatter_(GPU); +#endif +} + +template +void _TestNonZero(DGLContext ctx) { + auto val = aten::VecToIdArray( + std::vector({0, 1, 2, 0, -10, 0, 0, 23}), sizeof(IDX) * 8, ctx); + auto idx = aten::NonZero(val); + auto tidx = aten::VecToIdArray(std::vector({1, 2, 4, 7}), 64, ctx); + ASSERT_TRUE(ArrayEQ(idx, tidx)); + + val = aten::VecToIdArray(std::vector({}), sizeof(IDX) * 8, ctx); + idx = aten::NonZero(val); + tidx = aten::VecToIdArray(std::vector({}), 64, ctx); + ASSERT_TRUE(ArrayEQ(idx, tidx)); + + val = + aten::VecToIdArray(std::vector({0, 0, 0, 0}), sizeof(IDX) * 8, ctx); + idx = aten::NonZero(val); + tidx = aten::VecToIdArray(std::vector({}), 64, ctx); + ASSERT_TRUE(ArrayEQ(idx, tidx)); + + val = aten::Full(1, 3, sizeof(IDX) * 8, ctx); + idx = aten::NonZero(val); + tidx = aten::VecToIdArray(std::vector({0, 1, 2}), 64, ctx); + ASSERT_TRUE(ArrayEQ(idx, tidx)); +} + +TEST(ArrayTest, NonZero) { + _TestNonZero(CPU); + _TestNonZero(CPU); +#ifdef DGL_USE_CUDA + _TestNonZero(GPU); + _TestNonZero(GPU); +#endif +} + +template +void _TestLineGraphCOO(DGLContext ctx) { + /** + * A = [[0, 0, 1, 0], + * [1, 0, 1, 0], + * [1, 1, 0, 0], + * [0, 0, 0, 1]] + * row: 0 1 1 2 2 3 + * col: 2 0 2 0 1 3 + * ID: 0 1 2 3 4 5 + * + * B = COOLineGraph(A, backtracking=False) + * + * B = [[0, 0, 0, 0, 1, 0], + * [1, 0, 0, 0, 0, 0], + * [0, 0, 0, 1, 0, 0], + * [0, 0, 0, 0, 0, 0], + * [0, 1, 0, 0, 0, 0], + * [0, 0, 0, 0, 0, 0]] + * + * C = COOLineGraph(A, backtracking=True) + * + * C = [[0, 0, 0, 1, 1, 0], + * [1, 0, 0, 0, 0, 0], + * [0, 0, 0, 1, 1, 0], + * [1, 0, 0, 0, 0, 0], + * [0, 1, 1, 0, 0, 0], + * [0, 0, 0, 0, 0, 0]] + */ + IdArray a_row = aten::VecToIdArray( + std::vector({0, 1, 1, 2, 2, 3}), sizeof(IdType) * 8, ctx); + IdArray a_col = aten::VecToIdArray( + std::vector({2, 0, 2, 0, 1, 3}), sizeof(IdType) * 8, ctx); + IdArray b_row = aten::VecToIdArray( + std::vector({0, 1, 2, 4}), sizeof(IdType) * 8, ctx); + IdArray b_col = aten::VecToIdArray( + std::vector({4, 0, 3, 1}), sizeof(IdType) * 8, ctx); + IdArray c_row = aten::VecToIdArray( + std::vector({0, 0, 1, 2, 2, 3, 4, 4}), sizeof(IdType) * 8, ctx); + IdArray c_col = aten::VecToIdArray( + std::vector({3, 4, 0, 3, 4, 0, 1, 2}), sizeof(IdType) * 8, ctx); + + const aten::COOMatrix &coo_a = + aten::COOMatrix(4, 4, a_row, a_col, aten::NullArray(), true, false); + + const aten::COOMatrix &l_coo = COOLineGraph(coo_a, false); + ASSERT_EQ(l_coo.num_rows, 6); + ASSERT_EQ(l_coo.num_cols, 6); + ASSERT_TRUE(ArrayEQ(l_coo.row, b_row)); + ASSERT_TRUE(ArrayEQ(l_coo.col, b_col)); + ASSERT_FALSE(l_coo.row_sorted); + 
ASSERT_FALSE(l_coo.col_sorted);
+
+  const aten::COOMatrix &l_coo2 = COOLineGraph(coo_a, true);
+  ASSERT_EQ(l_coo2.num_rows, 6);
+  ASSERT_EQ(l_coo2.num_cols, 6);
+  ASSERT_TRUE(ArrayEQ(l_coo2.row, c_row));
+  ASSERT_TRUE(ArrayEQ(l_coo2.col, c_col));
+  ASSERT_FALSE(l_coo2.row_sorted);
+  ASSERT_FALSE(l_coo2.col_sorted);
+
+  IdArray a_data = aten::VecToIdArray(
+      std::vector({4, 5, 0, 1, 2, 3}), sizeof(IdType) * 8, ctx);
+  b_row = aten::VecToIdArray(
+      std::vector({4, 5, 0, 2}), sizeof(IdType) * 8, ctx);
+  b_col = aten::VecToIdArray(
+      std::vector({2, 4, 1, 5}), sizeof(IdType) * 8, ctx);
+  c_row = aten::VecToIdArray(
+      std::vector({4, 4, 5, 0, 0, 1, 2, 2}), sizeof(IdType) * 8, ctx);
+  c_col = aten::VecToIdArray(
+      std::vector({1, 2, 4, 1, 2, 4, 5, 0}), sizeof(IdType) * 8, ctx);
+  const aten::COOMatrix &coo_ad =
+      aten::COOMatrix(4, 4, a_row, a_col, a_data, true, false);
+  const aten::COOMatrix &ld_coo = COOLineGraph(coo_ad, false);
+  ASSERT_EQ(ld_coo.num_rows, 6);
+  ASSERT_EQ(ld_coo.num_cols, 6);
+  ASSERT_TRUE(ArrayEQ(ld_coo.row, b_row));
+  ASSERT_TRUE(ArrayEQ(ld_coo.col, b_col));
+  ASSERT_FALSE(ld_coo.row_sorted);
+  ASSERT_FALSE(ld_coo.col_sorted);
+
+  const aten::COOMatrix &ld_coo2 = COOLineGraph(coo_ad, true);
+  ASSERT_EQ(ld_coo2.num_rows, 6);
+  ASSERT_EQ(ld_coo2.num_cols, 6);
+  ASSERT_TRUE(ArrayEQ(ld_coo2.row, c_row));
+  ASSERT_TRUE(ArrayEQ(ld_coo2.col, c_col));
+  ASSERT_FALSE(ld_coo2.row_sorted);
+  ASSERT_FALSE(ld_coo2.col_sorted);
+}
+
+TEST(LineGraphTest, LineGraphCOO) {
+  _TestLineGraphCOO(CPU);
+  _TestLineGraphCOO(CPU);
+}
+
+template
+void _TestSort(DGLContext ctx) {
+  // case 1
+  IdArray a = aten::VecToIdArray(
+      std::vector({8, 6, 7, 5, 3, 0, 9}), sizeof(IDX) * 8, ctx);
+  IdArray sorted_a = aten::VecToIdArray(
+      std::vector({0, 3, 5, 6, 7, 8, 9}), sizeof(IDX) * 8, ctx);
+  IdArray sorted_idx =
+      aten::VecToIdArray(std::vector({5, 4, 3, 1, 2, 0, 6}), 64, ctx);
+
+  IdArray sorted, idx;
+  std::tie(sorted, idx) = aten::Sort(a);
+  ASSERT_TRUE(ArrayEQ(sorted, sorted_a));
+  ASSERT_TRUE(ArrayEQ(idx, sorted_idx));
+
+  // case 2: empty array
+  a = aten::VecToIdArray(std::vector({}), sizeof(IDX) * 8, ctx);
+  sorted_a = aten::VecToIdArray(std::vector({}), sizeof(IDX) * 8, ctx);
+  sorted_idx = aten::VecToIdArray(std::vector({}), 64, ctx);
+  std::tie(sorted, idx) = aten::Sort(a);
+  ASSERT_TRUE(ArrayEQ(sorted, sorted_a));
+  ASSERT_TRUE(ArrayEQ(idx, sorted_idx));
+
+  // case 3: array with one element
+  a = aten::VecToIdArray(std::vector({2}), sizeof(IDX) * 8, ctx);
+  sorted_a = aten::VecToIdArray(std::vector({2}), sizeof(IDX) * 8, ctx);
+  sorted_idx = aten::VecToIdArray(std::vector({0}), 64, ctx);
+  std::tie(sorted, idx) = aten::Sort(a);
+  ASSERT_TRUE(ArrayEQ(sorted, sorted_a));
+  ASSERT_TRUE(ArrayEQ(idx, sorted_idx));
+}
+
+TEST(ArrayTest, Sort) {
+  _TestSort(CPU);
+  _TestSort(CPU);
+#ifdef DGL_USE_CUDA
+  _TestSort(GPU);
+  _TestSort(GPU);
+#endif
+}
+
+TEST(ArrayTest, BFloatCast) {
+  for (int i = -100; i < 100; ++i) {
+    float a = i;
+    BFloat16 b = a;
+    float a_casted = b;
+    ASSERT_FLOAT_EQ(a, a_casted);
+  }
+}
diff --git a/tests/cpp/test_csrmm.cc b/tests/cpp/test_csrmm.cc
index df8d60775dc6..880b138037a3 100644
--- a/tests/cpp/test_csrmm.cc
+++ b/tests/cpp/test_csrmm.cc
@@ -178,7 +178,7 @@ TEST(CsrmmTest, TestCsrmm) {
   _TestCsrmm(CPU);
   _TestCsrmm(CPU);
   _TestCsrmm(CPU);
-#ifdef DGL_USE_CUDA
+#ifdef DGL_USE_ROCM
   _TestCsrmm(GPU);
   _TestCsrmm(GPU);
   _TestCsrmm(GPU);
@@ -191,7 +191,7 @@ TEST(CsrmmTest, TestCsrsum) {
   _TestCsrsum(CPU);
   _TestCsrsum(CPU);
   _TestCsrsum(CPU);
-#ifdef DGL_USE_CUDA
+#ifdef DGL_USE_ROCM
   _TestCsrsum(GPU);
   _TestCsrsum(GPU);
   _TestCsrsum(GPU);
@@ -204,7 +204,7 @@ TEST(CsrmmTest, TestCsrmask) {
   _TestCsrmask(CPU);
   _TestCsrmask(CPU);
   _TestCsrmask(CPU);
-#ifdef DGL_USE_CUDA
+#ifdef DGL_USE_ROCM
   _TestCsrmask(GPU);
   _TestCsrmask(GPU);
   _TestCsrmask(GPU);
diff --git a/tests/cpp/test_csrmm.cc.prehip b/tests/cpp/test_csrmm.cc.prehip
new file mode 100644
index 000000000000..df8d60775dc6
--- /dev/null
+++ b/tests/cpp/test_csrmm.cc.prehip
@@ -0,0 +1,215 @@
+#include
+#include
+#include
+
+#include "../../src/array/cpu/array_utils.h"  // PairHash
+#include "./common.h"
+
+using namespace dgl;
+using namespace dgl::runtime;
+
+namespace {
+
+// Unit tests:
+// CSRMM(A, B) == A_mm_B
+// CSRSum({A, C}) == A_plus_C
+// CSRMask(A, C) = A_mask_C
+
+template
+std::unordered_map, DType, aten::PairHash> COOToMap(
+    aten::COOMatrix coo, NDArray weights) {
+  std::unordered_map, DType, aten::PairHash> map;
+
+  for (int64_t i = 0; i < coo.row->shape[0]; ++i) {
+    IdType irow = aten::IndexSelect(coo.row, i);
+    IdType icol = aten::IndexSelect(coo.col, i);
+    IdType ieid =
+        aten::COOHasData(coo) ? aten::IndexSelect(coo.data, i) : i;
+    DType idata = aten::IndexSelect(weights, ieid);
+    map.insert({{irow, icol}, idata});
+  }
+  return map;
+}
+
+template
+bool CSRIsClose(
+    aten::CSRMatrix A, aten::CSRMatrix B, NDArray A_weights, NDArray B_weights,
+    DType rtol, DType atol) {
+  auto Amap = COOToMap(CSRToCOO(A, false), A_weights);
+  auto Bmap = COOToMap(CSRToCOO(B, false), B_weights);
+
+  if (Amap.size() != Bmap.size()) return false;
+
+  for (auto itA : Amap) {
+    auto itB = Bmap.find(itA.first);
+    if (itB == Bmap.end()) return false;
+    if (fabs(itA.second - itB->second) >= rtol * fabs(itA.second) + atol)
+      return false;
+  }
+
+  return true;
+}
+
+template
+std::pair CSR_A(DGLContext ctx = CTX) {
+  // matrix([[0. , 0. , 1. , 0.7, 0. ],
+  //         [0. , 0. , 0.5, 0.+, 0. ],
+  //         [0.4, 0.7, 0. , 0.2, 0. ],
+  //         [0. , 0. , 0. , 0. , 0.2]])
+  // (0.+ indicates that the entry exists but the value is 0.)
+  auto csr = aten::CSRMatrix(
+      4, 5, NDArray::FromVector(std::vector({0, 2, 4, 7, 8}), ctx),
+      NDArray::FromVector(std::vector({2, 3, 2, 3, 0, 1, 3, 4}), ctx),
+      NDArray::FromVector(std::vector({1, 0, 2, 3, 4, 5, 6, 7}), ctx));
+  auto weights = NDArray::FromVector(
+      std::vector({0.7, 1.0, 0.5, 0.0, 0.4, 0.7, 0.2, 0.2}), ctx);
+  return {csr, weights};
+}
+
+template
+std::pair CSR_B(DGLContext ctx = CTX) {
+  // matrix([[0. , 0.9, 0. , 0.6, 0. , 0.3],
+  //         [0. , 0. , 0. , 0. , 0. , 0.4],
+  //         [0.+, 0. , 0. , 0. , 0. , 0.9],
+  //         [0.8, 0.2, 0.3, 0.2, 0. , 0. ],
+  //         [0.2, 0.4, 0. , 0. , 0. , 0. ]])
+  // (0.+ indicates that the entry exists but the value is 0.)
+  auto csr = aten::CSRMatrix(
+      5, 6, NDArray::FromVector(std::vector({0, 3, 4, 6, 10, 12}), ctx),
+      NDArray::FromVector(
+          std::vector({1, 3, 5, 5, 0, 5, 0, 1, 2, 3, 0, 1}), ctx));
+  auto weights = NDArray::FromVector(
+      std::vector(
+          {0.9, 0.6, 0.3, 0.4, 0.0, 0.9, 0.8, 0.2, 0.3, 0.2, 0.2, 0.4}),
+      ctx);
+  return {csr, weights};
+}
+
+template
+std::pair CSR_C(DGLContext ctx = CTX) {
+  // matrix([[0. , 0. , 0. , 0.2, 0. ],
+  //         [0. , 0. , 0. , 0.5, 0.4],
+  //         [0. , 0.2, 0. , 0.9, 0.2],
+  //         [0. , 1. , 0. , 0.7, 0.
]]) + auto csr = aten::CSRMatrix( + 4, 5, NDArray::FromVector(std::vector({0, 1, 3, 6, 8}), ctx), + NDArray::FromVector(std::vector({3, 3, 4, 1, 3, 4, 1, 3}), ctx)); + auto weights = NDArray::FromVector( + std::vector({0.2, 0.5, 0.4, 0.2, 0.9, 0.2, 1., 0.7}), ctx); + return {csr, weights}; +} + +template +std::pair CSR_A_mm_B(DGLContext ctx = CTX) { + // matrix([[0.56, 0.14, 0.21, 0.14, 0. , 0.9 ], + // [0.+ , 0.+ , 0.+ , 0.+ , 0. , 0.45], + // [0.16, 0.4 , 0.06, 0.28, 0. , 0.4 ], + // [0.04, 0.08, 0. , 0. , 0. , 0. ]]) + // (0.+ indicates that the entry exists but the value is 0.) + auto csr = aten::CSRMatrix( + 4, 6, NDArray::FromVector(std::vector({0, 5, 10, 15, 17}), ctx), + NDArray::FromVector( + std::vector( + {0, 1, 2, 3, 5, 0, 1, 2, 3, 5, 0, 1, 2, 3, 5, 0, 1}), + ctx)); + auto weights = NDArray::FromVector( + std::vector( + {0.56, 0.14, 0.21, 0.14, 0.9, 0., 0., 0., 0., 0.45, 0.16, 0.4, 0.06, + 0.28, 0.4, 0.04, 0.08}), + ctx); + return {csr, weights}; +} + +template +std::pair CSR_A_plus_C(DGLContext ctx = CTX) { + auto csr = aten::CSRMatrix( + 4, 5, NDArray::FromVector(std::vector({0, 2, 5, 9, 12}), ctx), + NDArray::FromVector( + std::vector({2, 3, 2, 3, 4, 0, 1, 3, 4, 1, 3, 4}), ctx)); + auto weights = NDArray::FromVector( + std::vector( + {1., 0.9, 0.5, 0.5, 0.4, 0.4, 0.9, 1.1, 0.2, 1., 0.7, 0.2}), + ctx); + return {csr, weights}; +} + +template +NDArray CSR_A_mask_C(DGLContext ctx = CTX) { + return NDArray::FromVector( + std::vector({0.7, 0.0, 0.0, 0.7, 0.2, 0.0, 0.0, 0.0}), ctx); +} + +template +void _TestCsrmm(DGLContext ctx = CTX) { + auto A = CSR_A(ctx); + auto B = CSR_B(ctx); + auto A_mm_B = aten::CSRMM(A.first, A.second, B.first, B.second); + auto A_mm_B2 = CSR_A_mm_B(ctx); + bool result = CSRIsClose( + A_mm_B.first, A_mm_B2.first, A_mm_B.second, A_mm_B2.second, 1e-4, 1e-4); + ASSERT_TRUE(result); +} + +template +void _TestCsrsum(DGLContext ctx = CTX) { + auto A = CSR_A(ctx); + auto C = CSR_C(ctx); + auto A_plus_C = aten::CSRSum({A.first, C.first}, {A.second, C.second}); + auto A_plus_C2 = CSR_A_plus_C(ctx); + bool result = CSRIsClose( + A_plus_C.first, A_plus_C2.first, A_plus_C.second, A_plus_C2.second, 1e-4, + 1e-4); + ASSERT_TRUE(result); +} + +template +void _TestCsrmask(DGLContext ctx = CTX) { + auto A = CSR_A(ctx); + auto C = CSR_C(ctx); + auto C_coo = CSRToCOO(C.first, false); + auto A_mask_C = + aten::CSRGetData(A.first, C_coo.row, C_coo.col, A.second, 0); + auto A_mask_C2 = CSR_A_mask_C(ctx); + ASSERT_TRUE(ArrayEQ(A_mask_C, A_mask_C2)); +} + +TEST(CsrmmTest, TestCsrmm) { + _TestCsrmm(CPU); + _TestCsrmm(CPU); + _TestCsrmm(CPU); + _TestCsrmm(CPU); +#ifdef DGL_USE_CUDA + _TestCsrmm(GPU); + _TestCsrmm(GPU); + _TestCsrmm(GPU); + _TestCsrmm(GPU); +#endif +} + +TEST(CsrmmTest, TestCsrsum) { + _TestCsrsum(CPU); + _TestCsrsum(CPU); + _TestCsrsum(CPU); + _TestCsrsum(CPU); +#ifdef DGL_USE_CUDA + _TestCsrsum(GPU); + _TestCsrsum(GPU); + _TestCsrsum(GPU); + _TestCsrsum(GPU); +#endif +} + +TEST(CsrmmTest, TestCsrmask) { + _TestCsrmask(CPU); + _TestCsrmask(CPU); + _TestCsrmask(CPU); + _TestCsrmask(CPU); +#ifdef DGL_USE_CUDA + _TestCsrmask(GPU); + _TestCsrmask(GPU); + _TestCsrmask(GPU); + _TestCsrmask(GPU); +#endif +} + +}; // namespace diff --git a/tests/cpp/test_partition.cc b/tests/cpp/test_partition.cc index 4281e2646ce6..beac56204f59 100644 --- a/tests/cpp/test_partition.cc +++ b/tests/cpp/test_partition.cc @@ -77,7 +77,7 @@ void _TestRemainder_MapToX() { } TEST(PartitionTest, TestRemainderPartition) { -#ifdef DGL_USE_CUDA +#ifdef DGL_USE_ROCM 
_TestRemainder_GeneratePermutation(); _TestRemainder_GeneratePermutation(); @@ -185,7 +185,7 @@ void _TestRange_MapToX() { } TEST(PartitionTest, TestRangePartition) { -#ifdef DGL_USE_CUDA +#ifdef DGL_USE_ROCM _TestRange_GeneratePermutation(); _TestRange_GeneratePermutation(); diff --git a/tests/cpp/test_partition.cc.prehip b/tests/cpp/test_partition.cc.prehip new file mode 100644 index 000000000000..4281e2646ce6 --- /dev/null +++ b/tests/cpp/test_partition.cc.prehip @@ -0,0 +1,196 @@ +#include + +#include "../../src/partition/ndarray_partition.h" +#include "./common.h" + +using namespace dgl; +using namespace dgl::partition; + +template +void _TestRemainder_GeneratePermutation() { + const int64_t size = 160000; + const int num_parts = 7; + NDArrayPartitionRef part = CreatePartitionRemainderBased(size, num_parts); + + IdArray idxs = + aten::Range(0, size / 10, sizeof(IdType) * 8, DGLContext{XPU, 0}); + + std::pair result = part->GeneratePermutation(idxs); + + // first part of result should be the permutation + IdArray perm = result.first.CopyTo(DGLContext{kDGLCPU, 0}); + ASSERT_TRUE(perm.Ptr() != nullptr); + ASSERT_EQ(perm->shape[0], idxs->shape[0]); + const IdType* const perm_cpu = static_cast(perm->data); + + // second part of result should be the counts + IdArray counts = result.second.CopyTo(DGLContext{kDGLCPU, 0}); + ASSERT_TRUE(counts.Ptr() != nullptr); + ASSERT_EQ(counts->shape[0], num_parts); + const int64_t* const counts_cpu = static_cast(counts->data); + + std::vector prefix(num_parts + 1, 0); + for (int p = 0; p < num_parts; ++p) { + prefix[p + 1] = prefix[p] + counts_cpu[p]; + } + ASSERT_EQ(prefix.back(), idxs->shape[0]); + + // copy original indexes to cpu + idxs = idxs.CopyTo(DGLContext{kDGLCPU, 0}); + const IdType* const idxs_cpu = static_cast(idxs->data); + + for (int p = 0; p < num_parts; ++p) { + for (int64_t i = prefix[p]; i < prefix[p + 1]; ++i) { + EXPECT_EQ(idxs_cpu[perm_cpu[i]] % num_parts, p); + } + } +} + +template +void _TestRemainder_MapToX() { + const int64_t size = 160000; + const int num_parts = 7; + NDArrayPartitionRef part = CreatePartitionRemainderBased(size, num_parts); + + for (int part_id = 0; part_id < num_parts; ++part_id) { + IdArray local = aten::Range( + 0, part->PartSize(part_id), sizeof(IdType) * 8, DGLContext{XPU, 0}); + IdArray global = part->MapToGlobal(local, part_id); + IdArray act_local = part->MapToLocal(global).CopyTo(CPU); + + // every global index should have the same remainder as the part id + ASSERT_EQ(global->shape[0], local->shape[0]); + global = global.CopyTo(CPU); + for (int64_t i = 0; i < global->shape[0]; ++i) { + EXPECT_EQ(Ptr(global)[i] % num_parts, part_id) + << "i=" << i << ", num_parts=" << num_parts + << ", part_id=" << part_id; + } + + // the remapped local indices to should match the original + local = local.CopyTo(CPU); + ASSERT_EQ(local->shape[0], act_local->shape[0]); + for (int64_t i = 0; i < act_local->shape[0]; ++i) { + EXPECT_EQ(Ptr(local)[i], Ptr(act_local)[i]); + } + } +} + +TEST(PartitionTest, TestRemainderPartition) { +#ifdef DGL_USE_CUDA + _TestRemainder_GeneratePermutation(); + _TestRemainder_GeneratePermutation(); + + _TestRemainder_MapToX(); + _TestRemainder_MapToX(); +#endif + // CPU is not implemented +} + +template +int _FindPart(const INDEX idx, const RANGE* const range, const int num_parts) { + for (int i = 0; i < num_parts; ++i) { + if (range[i + 1] > idx) { + return i; + } + } + + return -1; +} + +template +void _TestRange_GeneratePermutation() { + const int64_t size = 160000; + const int num_parts = 
7; + IdArray range = aten::NewIdArray( + num_parts + 1, DGLContext{kDGLCPU, 0}, sizeof(IdType) * 8); + for (int i = 0; i < num_parts; ++i) { + range.Ptr()[i] = (size / num_parts) * i; + } + range.Ptr()[num_parts] = size; + NDArrayPartitionRef part = CreatePartitionRangeBased( + size, num_parts, range.CopyTo(DGLContext{XPU, 0})); + + IdArray idxs = + aten::Range(0, size / 10, sizeof(IdType) * 8, DGLContext{XPU, 0}); + + std::pair result = part->GeneratePermutation(idxs); + + // first part of result should be the permutation + IdArray perm = result.first.CopyTo(DGLContext{kDGLCPU, 0}); + ASSERT_TRUE(perm.Ptr() != nullptr); + ASSERT_EQ(perm->shape[0], idxs->shape[0]); + const IdType* const perm_cpu = static_cast(perm->data); + + // second part of result should be the counts + IdArray counts = result.second.CopyTo(DGLContext{kDGLCPU, 0}); + ASSERT_TRUE(counts.Ptr() != nullptr); + ASSERT_EQ(counts->shape[0], num_parts); + const int64_t* const counts_cpu = static_cast(counts->data); + + std::vector prefix(num_parts + 1, 0); + for (int p = 0; p < num_parts; ++p) { + prefix[p + 1] = prefix[p] + counts_cpu[p]; + } + ASSERT_EQ(prefix.back(), idxs->shape[0]); + + // copy original indexes to cpu + idxs = idxs.CopyTo(DGLContext{kDGLCPU, 0}); + const IdType* const idxs_cpu = static_cast(idxs->data); + + for (int p = 0; p < num_parts; ++p) { + for (int64_t i = prefix[p]; i < prefix[p + 1]; ++i) { + EXPECT_EQ( + _FindPart(idxs_cpu[perm_cpu[i]], range.Ptr(), num_parts), p); + } + } +} + +template +void _TestRange_MapToX() { + const int64_t size = 160000; + const int num_parts = 7; + IdArray range = aten::NewIdArray( + num_parts + 1, DGLContext{kDGLCPU, 0}, sizeof(IdType) * 8); + for (int i = 0; i < num_parts; ++i) { + Ptr(range)[i] = (size / num_parts) * i; + } + range.Ptr()[num_parts] = size; + NDArrayPartitionRef part = CreatePartitionRangeBased( + size, num_parts, range.CopyTo(DGLContext{XPU, 0})); + + for (int part_id = 0; part_id < num_parts; ++part_id) { + IdArray local = aten::Range( + 0, part->PartSize(part_id), sizeof(IdType) * 8, DGLContext{XPU, 0}); + IdArray global = part->MapToGlobal(local, part_id); + IdArray act_local = part->MapToLocal(global).CopyTo(CPU); + + ASSERT_EQ(global->shape[0], local->shape[0]); + global = global.CopyTo(CPU); + for (int64_t i = 0; i < global->shape[0]; ++i) { + EXPECT_EQ( + _FindPart(Ptr(global)[i], Ptr(range), num_parts), + part_id) + << "i=" << i << ", num_parts=" << num_parts << ", part_id=" << part_id + << ", shape=" << global->shape[0]; + } + + // the remapped local indices to should match the original + local = local.CopyTo(CPU); + ASSERT_EQ(local->shape[0], act_local->shape[0]); + for (int64_t i = 0; i < act_local->shape[0]; ++i) { + EXPECT_EQ(Ptr(local)[i], Ptr(act_local)[i]); + } + } +} + +TEST(PartitionTest, TestRangePartition) { +#ifdef DGL_USE_CUDA + _TestRange_GeneratePermutation(); + _TestRange_GeneratePermutation(); + + _TestRange_MapToX(); + _TestRange_MapToX(); +#endif + // CPU is not implemented +} diff --git a/tests/cpp/test_spmat_coo.cc b/tests/cpp/test_spmat_coo.cc index 4b663a5895b6..05acf8eb624a 100644 --- a/tests/cpp/test_spmat_coo.cc +++ b/tests/cpp/test_spmat_coo.cc @@ -331,7 +331,7 @@ void _TestCOOToCSR(DGLContext ctx) { TEST(SpmatTest, COOToCSR) { _TestCOOToCSR(CPU); _TestCOOToCSR(CPU); -#ifdef DGL_USE_CUDA +#ifdef DGL_USE_ROCM _TestCOOToCSR(GPU); _TestCOOToCSR(GPU); #endif @@ -417,7 +417,7 @@ void _TestCOOSort(DGLContext ctx) { TEST(SpmatTest, COOSort) { _TestCOOSort(CPU); _TestCOOSort(CPU); -#ifdef DGL_USE_CUDA +#ifdef DGL_USE_ROCM 
_TestCOOSort(GPU); _TestCOOSort(GPU); #endif @@ -479,7 +479,7 @@ void _TestCOOGetData(DGLContext ctx) { TEST(SpmatTest, COOGetData) { _TestCOOGetData(CPU); _TestCOOGetData(CPU); - // #ifdef DGL_USE_CUDA + // #ifdef DGL_USE_ROCM //_TestCOOGetData(GPU); //_TestCOOGetData(GPU); // #endif diff --git a/tests/cpp/test_spmat_coo.cc.prehip b/tests/cpp/test_spmat_coo.cc.prehip new file mode 100644 index 000000000000..4b663a5895b6 --- /dev/null +++ b/tests/cpp/test_spmat_coo.cc.prehip @@ -0,0 +1,576 @@ +#include +#include +#include +#include + +#include + +#include "./common.h" + +using namespace dgl; +using namespace dgl::runtime; + +namespace { + +template +aten::CSRMatrix CSR1(DGLContext ctx = CTX) { + // [[0, 1, 1, 0, 0], + // [1, 0, 0, 0, 0], + // [0, 0, 1, 1, 0], + // [0, 0, 0, 0, 0]] + // data: [0, 2, 3, 1, 4] + return aten::CSRMatrix( + 4, 5, + aten::VecToIdArray( + std::vector({0, 2, 3, 5, 5}), sizeof(IDX) * 8, ctx), + aten::VecToIdArray( + std::vector({1, 2, 0, 2, 3}), sizeof(IDX) * 8, ctx), + aten::VecToIdArray( + std::vector({0, 2, 3, 4, 1}), sizeof(IDX) * 8, ctx), + false); +} + +template +aten::CSRMatrix CSR2(DGLContext ctx = CTX) { + // has duplicate entries + // [[0, 1, 2, 0, 0], + // [1, 0, 0, 0, 0], + // [0, 0, 1, 1, 0], + // [0, 0, 0, 0, 0]] + // data: [0, 2, 5, 3, 1, 4] + return aten::CSRMatrix( + 4, 5, + aten::VecToIdArray( + std::vector({0, 3, 4, 6, 6}), sizeof(IDX) * 8, ctx), + aten::VecToIdArray( + std::vector({1, 2, 2, 0, 2, 3}), sizeof(IDX) * 8, ctx), + aten::VecToIdArray( + std::vector({0, 2, 5, 3, 1, 4}), sizeof(IDX) * 8, ctx), + false); +} + +template +aten::COOMatrix COO1(DGLContext ctx = CTX) { + // [[0, 1, 1, 0, 0], + // [1, 0, 0, 0, 0], + // [0, 0, 1, 1, 0], + // [0, 0, 0, 0, 0]] + // data: [0, 2, 3, 1, 4] + // row : [0, 2, 0, 1, 2] + // col : [1, 2, 2, 0, 3] + return aten::COOMatrix( + 4, 5, + aten::VecToIdArray( + std::vector({0, 2, 0, 1, 2}), sizeof(IDX) * 8, ctx), + aten::VecToIdArray( + std::vector({1, 2, 2, 0, 3}), sizeof(IDX) * 8, ctx), + aten::VecToIdArray( + std::vector({0, 3, 1, 2, 4}), sizeof(IDX) * 8, ctx)); +} + +template +aten::COOMatrix COO2(DGLContext ctx = CTX) { + // has duplicate entries + // [[0, 1, 2, 0, 0], + // [1, 0, 0, 0, 0], + // [0, 0, 1, 1, 0], + // [0, 0, 0, 0, 0]] + // data: [0, 2, 5, 3, 1, 4] + // row : [0, 2, 0, 1, 2, 0] + // col : [1, 2, 2, 0, 3, 2] + return aten::COOMatrix( + 4, 5, + aten::VecToIdArray( + std::vector({0, 2, 0, 1, 2, 0}), sizeof(IDX) * 8, ctx), + aten::VecToIdArray( + std::vector({1, 2, 2, 0, 3, 2}), sizeof(IDX) * 8, ctx), + aten::VecToIdArray( + std::vector({0, 1, 2, 3, 4, 5}), sizeof(IDX) * 8, ctx)); +} + +template +aten::CSRMatrix SR_CSR3(DGLContext ctx) { + // [[0, 1, 2, 0, 0], + // [1, 0, 0, 0, 0], + // [0, 0, 1, 1, 0], + // [0, 0, 0, 0, 0]] + return aten::CSRMatrix( + 4, 5, + aten::VecToIdArray( + std::vector({0, 3, 4, 6, 6}), sizeof(IDX) * 8, ctx), + aten::VecToIdArray( + std::vector({2, 1, 2, 0, 2, 3}), sizeof(IDX) * 8, ctx), + aten::VecToIdArray( + std::vector({0, 2, 5, 3, 1, 4}), sizeof(IDX) * 8, ctx), + false); +} + +template +aten::CSRMatrix SRC_CSR3(DGLContext ctx) { + // [[0, 1, 2, 0, 0], + // [1, 0, 0, 0, 0], + // [0, 0, 1, 1, 0], + // [0, 0, 0, 0, 0]] + return aten::CSRMatrix( + 4, 5, + aten::VecToIdArray( + std::vector({0, 3, 4, 6, 6}), sizeof(IDX) * 8, ctx), + aten::VecToIdArray( + std::vector({1, 2, 2, 0, 2, 3}), sizeof(IDX) * 8, ctx), + aten::VecToIdArray( + std::vector({2, 0, 5, 3, 1, 4}), sizeof(IDX) * 8, ctx), + false); +} + +template +aten::COOMatrix COO3(DGLContext ctx) { + // has duplicate 
entries + // [[0, 1, 2, 0, 0], + // [1, 0, 0, 0, 0], + // [0, 0, 1, 1, 0], + // [0, 0, 0, 0, 0]] + // row : [0, 2, 0, 1, 2, 0] + // col : [2, 2, 1, 0, 3, 2] + return aten::COOMatrix( + 4, 5, + aten::VecToIdArray( + std::vector({0, 2, 0, 1, 2, 0}), sizeof(IDX) * 8, ctx), + aten::VecToIdArray( + std::vector({2, 2, 1, 0, 3, 2}), sizeof(IDX) * 8, ctx)); +} + +template +aten::COOMatrix COORandomized(IDX rows_and_cols, int64_t nnz, int seed) { + std::vector vec_rows(nnz); + std::vector vec_cols(nnz); + std::vector vec_data(nnz); + +#pragma omp parallel + { + const int64_t num_threads = omp_get_num_threads(); + const int64_t thread_id = omp_get_thread_num(); + const int64_t chunk = nnz / num_threads; + const int64_t size = (thread_id == num_threads - 1) + ? nnz - chunk * (num_threads - 1) + : chunk; + auto rows = vec_rows.data() + thread_id * chunk; + auto cols = vec_cols.data() + thread_id * chunk; + auto data = vec_data.data() + thread_id * chunk; + + std::mt19937_64 gen64(seed + thread_id); + std::mt19937 gen32(seed + thread_id); + + for (int64_t i = 0; i < size; ++i) { + rows[i] = gen64() % rows_and_cols; + cols[i] = gen64() % rows_and_cols; + data[i] = gen32() % 90 + 1; + } + } + + return aten::COOMatrix( + rows_and_cols, rows_and_cols, + aten::VecToIdArray(vec_rows, sizeof(IDX) * 8, CTX), + aten::VecToIdArray(vec_cols, sizeof(IDX) * 8, CTX), + aten::VecToIdArray(vec_data, sizeof(IDX) * 8, CTX), false, false); +} + +struct SparseCOOCSR { + static constexpr uint64_t NUM_ROWS = 100; + static constexpr uint64_t NUM_COLS = 150; + static constexpr uint64_t NUM_NZ = 5; + template + static aten::COOMatrix COOSparse(const DGLContext &ctx = CTX) { + return aten::COOMatrix( + NUM_ROWS, NUM_COLS, + aten::VecToIdArray( + std::vector({0, 1, 2, 3, 4}), sizeof(IDX) * 8, ctx), + aten::VecToIdArray( + std::vector({1, 2, 3, 4, 5}), sizeof(IDX) * 8, ctx)); + } + + template + static aten::CSRMatrix CSRSparse(const DGLContext &ctx = CTX) { + auto &&indptr = std::vector(NUM_ROWS + 1, NUM_NZ); + for (size_t i = 0; i < NUM_NZ; ++i) { + indptr[i + 1] = static_cast(i + 1); + } + indptr[0] = 0; + return aten::CSRMatrix( + NUM_ROWS, NUM_COLS, aten::VecToIdArray(indptr, sizeof(IDX) * 8, ctx), + aten::VecToIdArray( + std::vector({1, 2, 3, 4, 5}), sizeof(IDX) * 8, ctx), + aten::VecToIdArray( + std::vector({1, 1, 1, 1, 1}), sizeof(IDX) * 8, ctx), + false); + } +}; + +template +aten::COOMatrix RowSorted_NullData_COO(DGLContext ctx = CTX) { + // [[0, 1, 1, 0, 0], + // [1, 0, 0, 0, 0], + // [0, 0, 1, 1, 0], + // [0, 0, 0, 0, 0]] + // row : [0, 0, 1, 2, 2] + // col : [1, 2, 0, 2, 3] + return aten::COOMatrix( + 4, 5, + aten::VecToIdArray( + std::vector({0, 0, 1, 2, 2}), sizeof(IDX) * 8, ctx), + aten::VecToIdArray( + std::vector({1, 2, 0, 2, 3}), sizeof(IDX) * 8, ctx), + aten::NullArray(), true, false); +} + +template +aten::CSRMatrix RowSorted_NullData_CSR(DGLContext ctx = CTX) { + // [[0, 1, 1, 0, 0], + // [1, 0, 0, 0, 0], + // [0, 0, 1, 1, 0], + // [0, 0, 0, 0, 0]] + // data: [0, 1, 2, 3, 4] + return aten::CSRMatrix( + 4, 5, + aten::VecToIdArray( + std::vector({0, 2, 3, 5, 5}), sizeof(IDX) * 8, ctx), + aten::VecToIdArray( + std::vector({1, 2, 0, 2, 3}), sizeof(IDX) * 8, ctx), + aten::VecToIdArray( + std::vector({0, 1, 2, 3, 4}), sizeof(IDX) * 8, ctx), + false); +} +} // namespace + +template +void _TestCOOToCSR(DGLContext ctx) { + auto coo = COO1(ctx); + auto csr = CSR1(ctx); + auto tcsr = aten::COOToCSR(coo); + ASSERT_FALSE(coo.row_sorted); + ASSERT_EQ(csr.num_rows, tcsr.num_rows); + ASSERT_EQ(csr.num_cols, tcsr.num_cols); + 
ASSERT_TRUE(ArrayEQ(csr.indptr, tcsr.indptr)); + ASSERT_TRUE(ArrayEQ(csr.indices, tcsr.indices)); + + coo = COO2(ctx); + csr = CSR2(ctx); + tcsr = aten::COOToCSR(coo); + ASSERT_EQ(coo.num_rows, csr.num_rows); + ASSERT_EQ(coo.num_cols, csr.num_cols); + ASSERT_TRUE(ArrayEQ(csr.indptr, tcsr.indptr)); + + // Convert from row sorted coo + coo = COO1(ctx); + auto rs_coo = aten::COOSort(coo, false); + auto rs_csr = CSR1(ctx); + auto rs_tcsr = aten::COOToCSR(rs_coo); + ASSERT_TRUE(rs_coo.row_sorted); + ASSERT_EQ(coo.num_rows, rs_tcsr.num_rows); + ASSERT_EQ(coo.num_cols, rs_tcsr.num_cols); + ASSERT_TRUE(ArrayEQ(rs_csr.indptr, rs_tcsr.indptr)); + ASSERT_TRUE(ArrayEQ(rs_tcsr.indices, rs_coo.col)); + ASSERT_TRUE(ArrayEQ(rs_tcsr.data, rs_coo.data)); + + coo = COO3(ctx); + rs_coo = aten::COOSort(coo, false); + rs_csr = SR_CSR3(ctx); + rs_tcsr = aten::COOToCSR(rs_coo); + ASSERT_EQ(coo.num_rows, rs_tcsr.num_rows); + ASSERT_EQ(coo.num_cols, rs_tcsr.num_cols); + ASSERT_TRUE(ArrayEQ(rs_csr.indptr, rs_tcsr.indptr)); + ASSERT_TRUE(ArrayEQ(rs_tcsr.indices, rs_coo.col)); + ASSERT_TRUE(ArrayEQ(rs_tcsr.data, rs_coo.data)); + + rs_coo = RowSorted_NullData_COO(ctx); + ASSERT_TRUE(rs_coo.row_sorted); + rs_csr = RowSorted_NullData_CSR(ctx); + rs_tcsr = aten::COOToCSR(rs_coo); + ASSERT_EQ(coo.num_rows, rs_tcsr.num_rows); + ASSERT_EQ(rs_csr.num_rows, rs_tcsr.num_rows); + ASSERT_EQ(coo.num_cols, rs_tcsr.num_cols); + ASSERT_EQ(rs_csr.num_cols, rs_tcsr.num_cols); + ASSERT_TRUE(ArrayEQ(rs_csr.indptr, rs_tcsr.indptr)); + ASSERT_TRUE(ArrayEQ(rs_csr.indices, rs_tcsr.indices)); + ASSERT_TRUE(ArrayEQ(rs_csr.data, rs_tcsr.data)); + ASSERT_TRUE(ArrayEQ(rs_coo.col, rs_tcsr.indices)); + ASSERT_FALSE(ArrayEQ(rs_coo.data, rs_tcsr.data)); + + // Convert from col sorted coo + coo = COO1(ctx); + auto src_coo = aten::COOSort(coo, true); + auto src_csr = CSR1(ctx); + auto src_tcsr = aten::COOToCSR(src_coo); + ASSERT_EQ(coo.num_rows, src_tcsr.num_rows); + ASSERT_EQ(coo.num_cols, src_tcsr.num_cols); + ASSERT_TRUE(src_tcsr.sorted); + ASSERT_TRUE(ArrayEQ(src_tcsr.indptr, src_csr.indptr)); + ASSERT_TRUE(ArrayEQ(src_tcsr.indices, src_coo.col)); + ASSERT_TRUE(ArrayEQ(src_tcsr.data, src_coo.data)); + + coo = COO3(ctx); + src_coo = aten::COOSort(coo, true); + src_csr = SRC_CSR3(ctx); + src_tcsr = aten::COOToCSR(src_coo); + ASSERT_EQ(coo.num_rows, src_tcsr.num_rows); + ASSERT_EQ(coo.num_cols, src_tcsr.num_cols); + ASSERT_TRUE(src_tcsr.sorted); + ASSERT_TRUE(ArrayEQ(src_tcsr.indptr, src_csr.indptr)); + ASSERT_TRUE(ArrayEQ(src_tcsr.indices, src_coo.col)); + ASSERT_TRUE(ArrayEQ(src_tcsr.data, src_coo.data)); + + coo = SparseCOOCSR::COOSparse(ctx); + csr = SparseCOOCSR::CSRSparse(ctx); + tcsr = aten::COOToCSR(coo); + ASSERT_FALSE(coo.row_sorted); + ASSERT_EQ(csr.num_rows, tcsr.num_rows); + ASSERT_EQ(csr.num_cols, tcsr.num_cols); + ASSERT_TRUE(ArrayEQ(csr.indptr, tcsr.indptr)); + ASSERT_TRUE(ArrayEQ(csr.indices, tcsr.indices)); +} + +TEST(SpmatTest, COOToCSR) { + _TestCOOToCSR(CPU); + _TestCOOToCSR(CPU); +#ifdef DGL_USE_CUDA + _TestCOOToCSR(GPU); + _TestCOOToCSR(GPU); +#endif +} + +template +void _TestCOOHasDuplicate() { + auto coo = COO1(); + ASSERT_FALSE(aten::COOHasDuplicate(coo)); + coo = COO2(); + ASSERT_TRUE(aten::COOHasDuplicate(coo)); +} + +TEST(SpmatTest, TestCOOHasDuplicate) { + _TestCOOHasDuplicate(); + _TestCOOHasDuplicate(); +} + +template +void _TestCOOSort(DGLContext ctx) { + auto coo = COO3(ctx); + + auto sr_coo = COOSort(coo, false); + ASSERT_EQ(coo.num_rows, sr_coo.num_rows); + ASSERT_EQ(coo.num_cols, sr_coo.num_cols); + 
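+  // Row-only sort: the result is flagged row-sorted, while the input COO is
+  // left untouched (checked further below).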
ASSERT_TRUE(sr_coo.row_sorted); + auto flags = COOIsSorted(sr_coo); + ASSERT_TRUE(flags.first); + flags = COOIsSorted(coo); // original coo should stay the same + ASSERT_FALSE(flags.first); + ASSERT_FALSE(flags.second); + + auto src_coo = COOSort(coo, true); + ASSERT_EQ(coo.num_rows, src_coo.num_rows); + ASSERT_EQ(coo.num_cols, src_coo.num_cols); + ASSERT_TRUE(src_coo.row_sorted); + ASSERT_TRUE(src_coo.col_sorted); + flags = COOIsSorted(src_coo); + ASSERT_TRUE(flags.first); + ASSERT_TRUE(flags.second); + + // sort inplace + COOSort_(&coo); + ASSERT_TRUE(coo.row_sorted); + flags = COOIsSorted(coo); + ASSERT_TRUE(flags.first); + COOSort_(&coo, true); + ASSERT_TRUE(coo.row_sorted); + ASSERT_TRUE(coo.col_sorted); + flags = COOIsSorted(coo); + ASSERT_TRUE(flags.first); + ASSERT_TRUE(flags.second); + + // COO3 + // [[0, 1, 2, 0, 0], + // [1, 0, 0, 0, 0], + // [0, 0, 1, 1, 0], + // [0, 0, 0, 0, 0]] + // data: [0, 1, 2, 3, 4, 5] + // row : [0, 2, 0, 1, 2, 0] + // col : [2, 2, 1, 0, 3, 2] + // Row Sorted + // data: [0, 2, 5, 3, 1, 4] + // row : [0, 0, 0, 1, 2, 2] + // col : [2, 1, 2, 0, 2, 3] + // Row Col Sorted + // data: [2, 0, 5, 3, 1, 4] + // row : [0, 0, 0, 1, 2, 2] + // col : [1, 2, 2, 0, 2, 3] + auto sort_row = aten::VecToIdArray( + std::vector({0, 0, 0, 1, 2, 2}), sizeof(IDX) * 8, ctx); + auto sort_col = aten::VecToIdArray( + std::vector({1, 2, 2, 0, 2, 3}), sizeof(IDX) * 8, ctx); + auto sort_col_data = aten::VecToIdArray( + std::vector({2, 0, 5, 3, 1, 4}), sizeof(IDX) * 8, ctx); + + ASSERT_TRUE(ArrayEQ(sr_coo.row, sort_row)); + ASSERT_TRUE(ArrayEQ(src_coo.row, sort_row)); + ASSERT_TRUE(ArrayEQ(src_coo.col, sort_col)); + ASSERT_TRUE(ArrayEQ(src_coo.data, sort_col_data)); +} + +TEST(SpmatTest, COOSort) { + _TestCOOSort(CPU); + _TestCOOSort(CPU); +#ifdef DGL_USE_CUDA + _TestCOOSort(GPU); + _TestCOOSort(GPU); +#endif +} + +template +void _TestCOOReorder() { + auto coo = COO2(); + auto new_row = + aten::VecToIdArray(std::vector({2, 0, 3, 1}), sizeof(IDX) * 8, CTX); + auto new_col = aten::VecToIdArray( + std::vector({2, 0, 4, 3, 1}), sizeof(IDX) * 8, CTX); + auto new_coo = COOReorder(coo, new_row, new_col); + ASSERT_EQ(new_coo.num_rows, coo.num_rows); + ASSERT_EQ(new_coo.num_cols, coo.num_cols); +} + +TEST(SpmatTest, TestCOOReorder) { + _TestCOOReorder(); + _TestCOOReorder(); +} + +template +void _TestCOOGetData(DGLContext ctx) { + auto coo = COO2(ctx); + // test get all data + auto x = aten::COOGetAllData(coo, 0, 0); + auto tx = aten::VecToIdArray(std::vector({}), sizeof(IDX) * 8, ctx); + ASSERT_TRUE(ArrayEQ(x, tx)); + x = aten::COOGetAllData(coo, 0, 2); + tx = aten::VecToIdArray(std::vector({2, 5}), sizeof(IDX) * 8, ctx); + ASSERT_TRUE(ArrayEQ(x, tx)); + + // test get data + auto r = + aten::VecToIdArray(std::vector({0, 0, 0}), sizeof(IDX) * 8, ctx); + auto c = + aten::VecToIdArray(std::vector({0, 1, 2}), sizeof(IDX) * 8, ctx); + x = aten::COOGetData(coo, r, c); + tx = aten::VecToIdArray(std::vector({-1, 0, 2}), sizeof(IDX) * 8, ctx); + ASSERT_TRUE(ArrayEQ(x, tx)); + + // test get data on sorted + coo = aten::COOSort(coo); + r = aten::VecToIdArray(std::vector({0, 0, 0}), sizeof(IDX) * 8, ctx); + c = aten::VecToIdArray(std::vector({0, 1, 2}), sizeof(IDX) * 8, ctx); + x = aten::COOGetData(coo, r, c); + tx = aten::VecToIdArray(std::vector({-1, 0, 2}), sizeof(IDX) * 8, ctx); + ASSERT_TRUE(ArrayEQ(x, tx)); + + // test get data w/ broadcasting + r = aten::VecToIdArray(std::vector({0}), sizeof(IDX) * 8, ctx); + c = aten::VecToIdArray(std::vector({0, 1, 2}), sizeof(IDX) * 8, ctx); + x = 
aten::COOGetData(coo, r, c); + tx = aten::VecToIdArray(std::vector({-1, 0, 2}), sizeof(IDX) * 8, ctx); + ASSERT_TRUE(ArrayEQ(x, tx)); +} + +TEST(SpmatTest, COOGetData) { + _TestCOOGetData(CPU); + _TestCOOGetData(CPU); + // #ifdef DGL_USE_CUDA + //_TestCOOGetData(GPU); + //_TestCOOGetData(GPU); + // #endif +} + +template +void _TestCOOGetDataAndIndices() { + auto coo = COO2(); + auto r = + aten::VecToIdArray(std::vector({0, 0, 0}), sizeof(IDX) * 8, CTX); + auto c = + aten::VecToIdArray(std::vector({0, 1, 2}), sizeof(IDX) * 8, CTX); + auto x = aten::COOGetDataAndIndices(coo, r, c); + auto tr = + aten::VecToIdArray(std::vector({0, 0, 0}), sizeof(IDX) * 8, CTX); + auto tc = + aten::VecToIdArray(std::vector({1, 2, 2}), sizeof(IDX) * 8, CTX); + auto td = + aten::VecToIdArray(std::vector({0, 2, 5}), sizeof(IDX) * 8, CTX); + ASSERT_TRUE(ArrayEQ(x[0], tr)); + ASSERT_TRUE(ArrayEQ(x[1], tc)); + ASSERT_TRUE(ArrayEQ(x[2], td)); +} + +TEST(SpmatTest, COOGetDataAndIndices) { + _TestCOOGetDataAndIndices(); + _TestCOOGetDataAndIndices(); +} + +template +void _TestCOOToCSRAlgs() { + // Compare results between different CPU COOToCSR implementations. + // NNZ is chosen to be bigger than the limit for the "small" matrix algorithm. + // N is set to lay on border between "sparse" and "dense" algorithm choice. + + const int64_t num_threads = std::min(256, omp_get_max_threads()); + const int64_t min_num_threads = 3; + + if (num_threads < min_num_threads) { + std::cerr << "[ ] [ INFO ]" + << "This test requires at least 3 OMP threads to work properly" + << std::endl; + GTEST_SKIP(); + return; + } + + // Select N and NNZ for COO matrix in a way than depending on number of + // threads different algorithm will be used. + // See WhichCOOToCSR in src/array/cpu/spmat_op_impl_coo.cc for details + const int64_t type_scale = sizeof(IDX) >> 1; + const int64_t small = 50 * num_threads * type_scale * type_scale; + // NNZ should be bigger than limit for small matrix algorithm + const int64_t nnz = small + 1234; + // N is chosen to lay on sparse/dense border + const int64_t n = type_scale * nnz / num_threads; + const IDX rows_nad_cols = n + 1; // should be bigger than sparse/dense border + + // Note that it will be better to set the seed to a random value when gtest + // allows to use --gtest_random_seed without --gtest_shuffle and report this + // value for reproduction. This way we can find unforeseen situations and + // potential bugs. 
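+  // The same randomized matrix is converted three times below under different
+  // OMP thread counts so that the small, dense, and sparse COOToCSR code
+  // paths are each exercised and their results compared against one another.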
+ const auto seed = 123321; + auto coo = COORandomized(rows_nad_cols, nnz, seed); + + omp_set_num_threads(1); + // UnSortedSmallCOOToCSR will be used + auto tcsr_small = aten::COOToCSR(coo); + ASSERT_EQ(coo.num_rows, tcsr_small.num_rows); + ASSERT_EQ(coo.num_cols, tcsr_small.num_cols); + + omp_set_num_threads(num_threads - 1); + // UnSortedDenseCOOToCSR will be used + auto tcsr_dense = aten::COOToCSR(coo); + ASSERT_EQ(tcsr_small.num_rows, tcsr_dense.num_rows); + ASSERT_EQ(tcsr_small.num_cols, tcsr_dense.num_cols); + ASSERT_TRUE(ArrayEQ(tcsr_small.indptr, tcsr_dense.indptr)); + ASSERT_TRUE(ArrayEQ(tcsr_small.indices, tcsr_dense.indices)); + ASSERT_TRUE(ArrayEQ(tcsr_small.data, tcsr_dense.data)); + + omp_set_num_threads(num_threads); + // UnSortedSparseCOOToCSR will be used + auto tcsr_sparse = aten::COOToCSR(coo); + ASSERT_EQ(tcsr_small.num_rows, tcsr_sparse.num_rows); + ASSERT_EQ(tcsr_small.num_cols, tcsr_sparse.num_cols); + ASSERT_TRUE(ArrayEQ(tcsr_small.indptr, tcsr_sparse.indptr)); + ASSERT_TRUE(ArrayEQ(tcsr_small.indices, tcsr_sparse.indices)); + ASSERT_TRUE(ArrayEQ(tcsr_small.data, tcsr_sparse.data)); + return; +} + +TEST(SpmatTest, COOToCSRAlgs) { + _TestCOOToCSRAlgs(); + _TestCOOToCSRAlgs(); +} diff --git a/tests/cpp/test_spmat_csr.cc b/tests/cpp/test_spmat_csr.cc index 4604df64cdb6..2477bee84eef 100644 --- a/tests/cpp/test_spmat_csr.cc +++ b/tests/cpp/test_spmat_csr.cc @@ -214,7 +214,7 @@ TEST(SpmatTest, TestCSRIsNonZero) { _TestCSRIsNonZero1(CPU); _TestCSRIsNonZero2(CPU); _TestCSRIsNonZero2(CPU); -#ifdef DGL_USE_CUDA +#ifdef DGL_USE_ROCM _TestCSRIsNonZero1(GPU); _TestCSRIsNonZero1(GPU); _TestCSRIsNonZero2(GPU); @@ -238,7 +238,7 @@ void _TestCSRGetRowNNZ(DGLContext ctx) { TEST(SpmatTest, TestCSRGetRowNNZ) { _TestCSRGetRowNNZ(CPU); _TestCSRGetRowNNZ(CPU); -#ifdef DGL_USE_CUDA +#ifdef DGL_USE_ROCM _TestCSRGetRowNNZ(GPU); _TestCSRGetRowNNZ(GPU); #endif @@ -262,7 +262,7 @@ void _TestCSRGetRowColumnIndices(DGLContext ctx) { TEST(SpmatTest, TestCSRGetRowColumnIndices) { _TestCSRGetRowColumnIndices(CPU); _TestCSRGetRowColumnIndices(CPU); -#ifdef DGL_USE_CUDA +#ifdef DGL_USE_ROCM _TestCSRGetRowColumnIndices(GPU); _TestCSRGetRowColumnIndices(GPU); #endif @@ -286,7 +286,7 @@ void _TestCSRGetRowData(DGLContext ctx) { TEST(SpmatTest, TestCSRGetRowData) { _TestCSRGetRowData(CPU); _TestCSRGetRowData(CPU); -#ifdef DGL_USE_CUDA +#ifdef DGL_USE_ROCM _TestCSRGetRowData(GPU); _TestCSRGetRowData(GPU); #endif @@ -331,7 +331,7 @@ void _TestCSRGetData(DGLContext ctx) { TEST(SpmatTest, CSRGetData) { _TestCSRGetData(CPU); _TestCSRGetData(CPU); -#ifdef DGL_USE_CUDA +#ifdef DGL_USE_ROCM _TestCSRGetData(GPU); _TestCSRGetData(GPU); #endif @@ -359,7 +359,7 @@ void _TestCSRGetDataAndIndices(DGLContext ctx) { TEST(SpmatTest, CSRGetDataAndIndices) { _TestCSRGetDataAndIndices(CPU); _TestCSRGetDataAndIndices(CPU); -#ifdef DGL_USE_CUDA +#ifdef DGL_USE_ROCM _TestCSRGetDataAndIndices(GPU); _TestCSRGetDataAndIndices(GPU); #endif @@ -391,7 +391,7 @@ void _TestCSRTranspose(DGLContext ctx) { TEST(SpmatTest, CSRTranspose) { _TestCSRTranspose(CPU); _TestCSRTranspose(CPU); -#ifdef DGL_USE_CUDA +#ifdef DGL_USE_ROCM _TestCSRTranspose(GPU); _TestCSRTranspose(GPU); #endif @@ -437,7 +437,7 @@ void _TestCSRToCOO(DGLContext ctx) { TEST(SpmatTest, CSRToCOO) { _TestCSRToCOO(CPU); _TestCSRToCOO(CPU); -#if DGL_USE_CUDA +#if DGL_USE_ROCM _TestCSRToCOO(GPU); _TestCSRToCOO(GPU); #endif @@ -545,7 +545,7 @@ void _TestCSRSliceRows(DGLContext ctx) { TEST(SpmatTest, TestCSRSliceRows) { _TestCSRSliceRows(CPU); _TestCSRSliceRows(CPU); -#ifdef 
DGL_USE_CUDA +#ifdef DGL_USE_ROCM _TestCSRSliceRows(GPU); _TestCSRSliceRows(GPU); #endif @@ -693,7 +693,7 @@ TEST(SpmatTest, CSRSliceMatrix) { _TestCSRSliceMatrix1(CPU); _TestCSRSliceMatrix2(CPU); _TestCSRSliceMatrix2(CPU); -#ifdef DGL_USE_CUDA +#ifdef DGL_USE_ROCM _TestCSRSliceMatrix1(GPU); _TestCSRSliceMatrix1(GPU); _TestCSRSliceMatrix2(GPU); @@ -712,7 +712,7 @@ void _TestCSRHasDuplicate(DGLContext ctx) { TEST(SpmatTest, CSRHasDuplicate) { _TestCSRHasDuplicate(CPU); _TestCSRHasDuplicate(CPU); -#ifdef DGL_USE_CUDA +#ifdef DGL_USE_ROCM _TestCSRHasDuplicate(GPU); _TestCSRHasDuplicate(GPU); #endif @@ -736,7 +736,7 @@ void _TestCSRSort(DGLContext ctx) { TEST(SpmatTest, CSRSort) { _TestCSRSort(CPU); _TestCSRSort(CPU); -#ifdef DGL_USE_CUDA +#ifdef DGL_USE_ROCM _TestCSRSort(GPU); _TestCSRSort(GPU); #endif diff --git a/tests/cpp/test_spmat_csr.cc.prehip b/tests/cpp/test_spmat_csr.cc.prehip new file mode 100644 index 000000000000..4604df64cdb6 --- /dev/null +++ b/tests/cpp/test_spmat_csr.cc.prehip @@ -0,0 +1,760 @@ +#include +#include + +#include "./common.h" + +using namespace dgl; +using namespace dgl::runtime; + +namespace { + +template +aten::CSRMatrix CSR1(DGLContext ctx = CTX) { + // [[0, 1, 1, 0, 0], + // [1, 0, 0, 0, 0], + // [0, 0, 1, 1, 0], + // [0, 0, 0, 0, 0]] + // data: [0, 2, 3, 1, 4] + return aten::CSRMatrix( + 4, 5, + aten::VecToIdArray( + std::vector({0, 2, 3, 5, 5}), sizeof(IDX) * 8, ctx), + aten::VecToIdArray( + std::vector({1, 2, 0, 3, 2}), sizeof(IDX) * 8, ctx), + aten::VecToIdArray( + std::vector({0, 2, 3, 4, 1}), sizeof(IDX) * 8, ctx), + false); +} + +template +aten::CSRMatrix CSR2(DGLContext ctx = CTX) { + // has duplicate entries + // [[0, 1, 2, 0, 0], + // [1, 0, 0, 0, 0], + // [0, 0, 1, 1, 0], + // [0, 0, 0, 0, 0]] + // data: [0, 2, 5, 3, 1, 4] + return aten::CSRMatrix( + 4, 5, + aten::VecToIdArray( + std::vector({0, 3, 4, 6, 6}), sizeof(IDX) * 8, ctx), + aten::VecToIdArray( + std::vector({1, 2, 2, 0, 2, 3}), sizeof(IDX) * 8, ctx), + aten::VecToIdArray( + std::vector({0, 2, 5, 3, 1, 4}), sizeof(IDX) * 8, ctx), + false); +} + +template +aten::CSRMatrix CSR3(DGLContext ctx = CTX) { + // has duplicate entries and the columns are not sorted + // [[0, 1, 1, 1, 0, 0], + // [1, 0, 0, 0, 0, 0], + // [0, 0, 1, 1, 0, 0], + // [0, 0, 0, 0, 0, 0], + // [1, 1, 1, 0, 0, 0], + // [0, 0, 0, 1, 0, 0], + // [0, 0, 0, 0, 0, 0], + // [1, 2, 1, 1, 0, 0], + // [0, 1, 0, 0, 0, 1]], + // data: [5, 2, 0, 3, 1, 4, 8, 7, 6, 9, 12, 13, 11, 10, 14, 15, 16] + return aten::CSRMatrix( + 9, 6, + aten::VecToIdArray( + std::vector({0, 3, 4, 6, 6, 9, 10, 10, 15, 17}), sizeof(IDX) * 8, + ctx), + aten::VecToIdArray( + std::vector({3, 2, 1, 0, 2, 3, 1, 2, 0, 3, 1, 2, 1, 3, 0, 5, 1}), + sizeof(IDX) * 8, ctx), + aten::VecToIdArray( + std::vector( + {0, 2, 5, 3, 1, 4, 6, 8, 7, 9, 13, 10, 11, 14, 12, 16, 15}), + sizeof(IDX) * 8, ctx), + false); +} + +template +aten::COOMatrix COO1(DGLContext ctx = CTX) { + // [[0, 1, 1, 0, 0], + // [1, 0, 0, 0, 0], + // [0, 0, 1, 1, 0], + // [0, 0, 0, 0, 0]] + // data: [0, 2, 3, 1, 4] + // row : [0, 2, 0, 1, 2] + // col : [1, 2, 2, 0, 3] + return aten::COOMatrix( + 4, 5, + aten::VecToIdArray( + std::vector({0, 2, 0, 1, 2}), sizeof(IDX) * 8, ctx), + aten::VecToIdArray( + std::vector({1, 2, 2, 0, 3}), sizeof(IDX) * 8, ctx), + aten::VecToIdArray( + std::vector({0, 3, 1, 2, 4}), sizeof(IDX) * 8, ctx)); +} + +template +aten::COOMatrix COO2(DGLContext ctx = CTX) { + // has duplicate entries + // [[0, 1, 2, 0, 0], + // [1, 0, 0, 0, 0], + // [0, 0, 1, 1, 0], + // [0, 0, 0, 0, 0]] + // 
data: [0, 2, 5, 3, 1, 4] + // row : [0, 2, 0, 1, 2, 0] + // col : [1, 2, 2, 0, 3, 2] + return aten::COOMatrix( + 4, 5, + aten::VecToIdArray( + std::vector({0, 2, 0, 1, 2, 0}), sizeof(IDX) * 8, ctx), + aten::VecToIdArray( + std::vector({1, 2, 2, 0, 3, 2}), sizeof(IDX) * 8, ctx), + aten::VecToIdArray( + std::vector({0, 1, 2, 3, 4, 5}), sizeof(IDX) * 8, ctx)); +} + +template +aten::CSRMatrix SR_CSR3(DGLContext ctx) { + // [[0, 1, 2, 0, 0], + // [1, 0, 0, 0, 0], + // [0, 0, 1, 1, 0], + // [0, 0, 0, 0, 0]] + return aten::CSRMatrix( + 4, 5, + aten::VecToIdArray( + std::vector({0, 3, 4, 6, 6}), sizeof(IDX) * 8, ctx), + aten::VecToIdArray( + std::vector({2, 1, 2, 0, 2, 3}), sizeof(IDX) * 8, ctx), + aten::VecToIdArray( + std::vector({0, 2, 5, 3, 1, 4}), sizeof(IDX) * 8, ctx), + false); +} + +template +aten::CSRMatrix SRC_CSR3(DGLContext ctx) { + // [[0, 1, 2, 0, 0], + // [1, 0, 0, 0, 0], + // [0, 0, 1, 1, 0], + // [0, 0, 0, 0, 0]] + return aten::CSRMatrix( + 4, 5, + aten::VecToIdArray( + std::vector({0, 3, 4, 6, 6}), sizeof(IDX) * 8, ctx), + aten::VecToIdArray( + std::vector({1, 2, 2, 0, 2, 3}), sizeof(IDX) * 8, ctx), + aten::VecToIdArray( + std::vector({2, 0, 5, 3, 1, 4}), sizeof(IDX) * 8, ctx), + false); +} + +template +aten::COOMatrix COO3(DGLContext ctx) { + // has duplicate entries + // [[0, 1, 2, 0, 0], + // [1, 0, 0, 0, 0], + // [0, 0, 1, 1, 0], + // [0, 0, 0, 0, 0]] + // row : [0, 2, 0, 1, 2, 0] + // col : [2, 2, 1, 0, 3, 2] + return aten::COOMatrix( + 4, 5, + aten::VecToIdArray( + std::vector({0, 2, 0, 1, 2, 0}), sizeof(IDX) * 8, ctx), + aten::VecToIdArray( + std::vector({2, 2, 1, 0, 3, 2}), sizeof(IDX) * 8, ctx)); +} + +} // namespace + +template +void _TestCSRIsNonZero1(DGLContext ctx) { + auto csr = CSR1(ctx); + ASSERT_TRUE(aten::CSRIsNonZero(csr, 0, 1)); + ASSERT_FALSE(aten::CSRIsNonZero(csr, 0, 0)); + IdArray r = + aten::VecToIdArray(std::vector({2, 2, 0, 0}), sizeof(IDX) * 8, ctx); + IdArray c = + aten::VecToIdArray(std::vector({1, 1, 1, 3}), sizeof(IDX) * 8, ctx); + IdArray x = aten::CSRIsNonZero(csr, r, c); + IdArray tx = + aten::VecToIdArray(std::vector({0, 0, 1, 0}), sizeof(IDX) * 8, ctx); + ASSERT_TRUE(ArrayEQ(x, tx)); +} + +template +void _TestCSRIsNonZero2(DGLContext ctx) { + auto csr = CSR3(ctx); + ASSERT_TRUE(aten::CSRIsNonZero(csr, 0, 1)); + ASSERT_FALSE(aten::CSRIsNonZero(csr, 0, 0)); + IdArray r = aten::VecToIdArray( + std::vector({ + 0, + 0, + 0, + 0, + 0, + }), + sizeof(IDX) * 8, ctx); + IdArray c = aten::VecToIdArray( + std::vector({ + 0, + 1, + 2, + 3, + 4, + }), + sizeof(IDX) * 8, ctx); + IdArray x = aten::CSRIsNonZero(csr, r, c); + IdArray tx = aten::VecToIdArray( + std::vector({0, 1, 1, 1, 0}), sizeof(IDX) * 8, ctx); + ASSERT_TRUE(ArrayEQ(x, tx)) << " x = " << x << ", tx = " << tx; +} + +TEST(SpmatTest, TestCSRIsNonZero) { + _TestCSRIsNonZero1(CPU); + _TestCSRIsNonZero1(CPU); + _TestCSRIsNonZero2(CPU); + _TestCSRIsNonZero2(CPU); +#ifdef DGL_USE_CUDA + _TestCSRIsNonZero1(GPU); + _TestCSRIsNonZero1(GPU); + _TestCSRIsNonZero2(GPU); + _TestCSRIsNonZero2(GPU); +#endif +} + +template +void _TestCSRGetRowNNZ(DGLContext ctx) { + auto csr = CSR2(ctx); + ASSERT_EQ(aten::CSRGetRowNNZ(csr, 0), 3); + ASSERT_EQ(aten::CSRGetRowNNZ(csr, 3), 0); + IdArray r = + aten::VecToIdArray(std::vector({0, 3}), sizeof(IDX) * 8, ctx); + IdArray x = aten::CSRGetRowNNZ(csr, r); + IdArray tx = + aten::VecToIdArray(std::vector({3, 0}), sizeof(IDX) * 8, ctx); + ASSERT_TRUE(ArrayEQ(x, tx)); +} + +TEST(SpmatTest, TestCSRGetRowNNZ) { + _TestCSRGetRowNNZ(CPU); + _TestCSRGetRowNNZ(CPU); +#ifdef 
DGL_USE_CUDA + _TestCSRGetRowNNZ(GPU); + _TestCSRGetRowNNZ(GPU); +#endif +} + +template +void _TestCSRGetRowColumnIndices(DGLContext ctx) { + auto csr = CSR2(ctx); + auto x = aten::CSRGetRowColumnIndices(csr, 0); + auto tx = + aten::VecToIdArray(std::vector({1, 2, 2}), sizeof(IDX) * 8, ctx); + ASSERT_TRUE(ArrayEQ(x, tx)); + x = aten::CSRGetRowColumnIndices(csr, 1); + tx = aten::VecToIdArray(std::vector({0}), sizeof(IDX) * 8, ctx); + ASSERT_TRUE(ArrayEQ(x, tx)); + x = aten::CSRGetRowColumnIndices(csr, 3); + tx = aten::VecToIdArray(std::vector({}), sizeof(IDX) * 8, ctx); + ASSERT_TRUE(ArrayEQ(x, tx)); +} + +TEST(SpmatTest, TestCSRGetRowColumnIndices) { + _TestCSRGetRowColumnIndices(CPU); + _TestCSRGetRowColumnIndices(CPU); +#ifdef DGL_USE_CUDA + _TestCSRGetRowColumnIndices(GPU); + _TestCSRGetRowColumnIndices(GPU); +#endif +} + +template +void _TestCSRGetRowData(DGLContext ctx) { + auto csr = CSR2(ctx); + auto x = aten::CSRGetRowData(csr, 0); + auto tx = + aten::VecToIdArray(std::vector({0, 2, 5}), sizeof(IDX) * 8, ctx); + ASSERT_TRUE(ArrayEQ(x, tx)); + x = aten::CSRGetRowData(csr, 1); + tx = aten::VecToIdArray(std::vector({3}), sizeof(IDX) * 8, ctx); + ASSERT_TRUE(ArrayEQ(x, tx)); + x = aten::CSRGetRowData(csr, 3); + tx = aten::VecToIdArray(std::vector({}), sizeof(IDX) * 8, ctx); + ASSERT_TRUE(ArrayEQ(x, tx)); +} + +TEST(SpmatTest, TestCSRGetRowData) { + _TestCSRGetRowData(CPU); + _TestCSRGetRowData(CPU); +#ifdef DGL_USE_CUDA + _TestCSRGetRowData(GPU); + _TestCSRGetRowData(GPU); +#endif +} + +template +void _TestCSRGetData(DGLContext ctx) { + auto csr = CSR2(ctx); + // test get all data + auto x = aten::CSRGetAllData(csr, 0, 0); + auto tx = aten::VecToIdArray(std::vector({}), sizeof(IDX) * 8, ctx); + ASSERT_TRUE(ArrayEQ(x, tx)); + x = aten::CSRGetAllData(csr, 0, 2); + tx = aten::VecToIdArray(std::vector({2, 5}), sizeof(IDX) * 8, ctx); + ASSERT_TRUE(ArrayEQ(x, tx)); + + // test get data + auto r = + aten::VecToIdArray(std::vector({0, 0, 0}), sizeof(IDX) * 8, ctx); + auto c = + aten::VecToIdArray(std::vector({0, 1, 2}), sizeof(IDX) * 8, ctx); + x = aten::CSRGetData(csr, r, c); + tx = aten::VecToIdArray(std::vector({-1, 0, 2}), sizeof(IDX) * 8, ctx); + ASSERT_TRUE(ArrayEQ(x, tx)); + + // test get data on sorted + csr = aten::CSRSort(csr); + r = aten::VecToIdArray(std::vector({0, 0, 0}), sizeof(IDX) * 8, ctx); + c = aten::VecToIdArray(std::vector({0, 1, 2}), sizeof(IDX) * 8, ctx); + x = aten::CSRGetData(csr, r, c); + tx = aten::VecToIdArray(std::vector({-1, 0, 2}), sizeof(IDX) * 8, ctx); + ASSERT_TRUE(ArrayEQ(x, tx)); + + // test get data w/ broadcasting + r = aten::VecToIdArray(std::vector({0}), sizeof(IDX) * 8, ctx); + c = aten::VecToIdArray(std::vector({0, 1, 2}), sizeof(IDX) * 8, ctx); + x = aten::CSRGetData(csr, r, c); + tx = aten::VecToIdArray(std::vector({-1, 0, 2}), sizeof(IDX) * 8, ctx); + ASSERT_TRUE(ArrayEQ(x, tx)); +} + +TEST(SpmatTest, CSRGetData) { + _TestCSRGetData(CPU); + _TestCSRGetData(CPU); +#ifdef DGL_USE_CUDA + _TestCSRGetData(GPU); + _TestCSRGetData(GPU); +#endif +} + +template +void _TestCSRGetDataAndIndices(DGLContext ctx) { + auto csr = CSR2(ctx); + auto r = + aten::VecToIdArray(std::vector({0, 0, 0}), sizeof(IDX) * 8, ctx); + auto c = + aten::VecToIdArray(std::vector({0, 1, 2}), sizeof(IDX) * 8, ctx); + auto x = aten::CSRGetDataAndIndices(csr, r, c); + auto tr = + aten::VecToIdArray(std::vector({0, 0, 0}), sizeof(IDX) * 8, ctx); + auto tc = + aten::VecToIdArray(std::vector({1, 2, 2}), sizeof(IDX) * 8, ctx); + auto td = + aten::VecToIdArray(std::vector({0, 2, 5}), 
sizeof(IDX) * 8, ctx); + ASSERT_TRUE(ArrayEQ(x[0], tr)); + ASSERT_TRUE(ArrayEQ(x[1], tc)); + ASSERT_TRUE(ArrayEQ(x[2], td)); +} + +TEST(SpmatTest, CSRGetDataAndIndices) { + _TestCSRGetDataAndIndices(CPU); + _TestCSRGetDataAndIndices(CPU); +#ifdef DGL_USE_CUDA + _TestCSRGetDataAndIndices(GPU); + _TestCSRGetDataAndIndices(GPU); +#endif +} + +template +void _TestCSRTranspose(DGLContext ctx) { + auto csr = CSR2(ctx); + auto csr_t = aten::CSRTranspose(csr); + // [[0, 1, 0, 0], + // [1, 0, 0, 0], + // [2, 0, 1, 0], + // [0, 0, 1, 0], + // [0, 0, 0, 0]] + // data: [3, 0, 2, 5, 1, 4] + ASSERT_EQ(csr_t.num_rows, 5); + ASSERT_EQ(csr_t.num_cols, 4); + auto tp = aten::VecToIdArray( + std::vector({0, 1, 2, 5, 6, 6}), sizeof(IDX) * 8, ctx); + auto ti = aten::VecToIdArray( + std::vector({1, 0, 0, 0, 2, 2}), sizeof(IDX) * 8, ctx); + auto td = aten::VecToIdArray( + std::vector({3, 0, 2, 5, 1, 4}), sizeof(IDX) * 8, ctx); + ASSERT_TRUE(ArrayEQ(csr_t.indptr, tp)); + ASSERT_TRUE(ArrayEQ(csr_t.indices, ti)); + ASSERT_TRUE(ArrayEQ(csr_t.data, td)); +} + +TEST(SpmatTest, CSRTranspose) { + _TestCSRTranspose(CPU); + _TestCSRTranspose(CPU); +#ifdef DGL_USE_CUDA + _TestCSRTranspose(GPU); + _TestCSRTranspose(GPU); +#endif +} + +template +void _TestCSRToCOO(DGLContext ctx) { + auto csr = CSR2(ctx); + { + auto coo = CSRToCOO(csr, false); + ASSERT_EQ(coo.num_rows, 4); + ASSERT_EQ(coo.num_cols, 5); + ASSERT_TRUE(coo.row_sorted); + auto tr = aten::VecToIdArray( + std::vector({0, 0, 0, 1, 2, 2}), sizeof(IDX) * 8, ctx); + ASSERT_TRUE(ArrayEQ(coo.row, tr)); + ASSERT_TRUE(ArrayEQ(coo.col, csr.indices)); + ASSERT_TRUE(ArrayEQ(coo.data, csr.data)); + + // convert from sorted csr + auto s_csr = CSRSort(csr); + coo = CSRToCOO(s_csr, false); + ASSERT_EQ(coo.num_rows, 4); + ASSERT_EQ(coo.num_cols, 5); + ASSERT_TRUE(coo.row_sorted); + ASSERT_TRUE(coo.col_sorted); + tr = aten::VecToIdArray( + std::vector({0, 0, 0, 1, 2, 2}), sizeof(IDX) * 8, ctx); + ASSERT_TRUE(ArrayEQ(coo.row, tr)); + ASSERT_TRUE(ArrayEQ(coo.col, s_csr.indices)); + ASSERT_TRUE(ArrayEQ(coo.data, s_csr.data)); + } + { + auto coo = CSRToCOO(csr, true); + ASSERT_EQ(coo.num_rows, 4); + ASSERT_EQ(coo.num_cols, 5); + auto tcoo = COO2(ctx); + ASSERT_TRUE(ArrayEQ(coo.row, tcoo.row)); + ASSERT_TRUE(ArrayEQ(coo.col, tcoo.col)); + } +} + +TEST(SpmatTest, CSRToCOO) { + _TestCSRToCOO(CPU); + _TestCSRToCOO(CPU); +#if DGL_USE_CUDA + _TestCSRToCOO(GPU); + _TestCSRToCOO(GPU); +#endif +} + +template +void _TestCSRSliceRows(DGLContext ctx) { + auto csr = CSR2(ctx); + auto x = aten::CSRSliceRows(csr, 1, 4); + // [1, 0, 0, 0, 0], + // [0, 0, 1, 1, 0], + // [0, 0, 0, 0, 0]] + // data: [3, 1, 4] + ASSERT_EQ(x.num_rows, 3); + ASSERT_EQ(x.num_cols, 5); + auto tp = + aten::VecToIdArray(std::vector({0, 1, 3, 3}), sizeof(IDX) * 8, ctx); + auto ti = + aten::VecToIdArray(std::vector({0, 2, 3}), sizeof(IDX) * 8, ctx); + auto td = + aten::VecToIdArray(std::vector({3, 1, 4}), sizeof(IDX) * 8, ctx); + ASSERT_TRUE(ArrayEQ(x.indptr, tp)); + ASSERT_TRUE(ArrayEQ(x.indices, ti)); + ASSERT_TRUE(ArrayEQ(x.data, td)); + + auto r = + aten::VecToIdArray(std::vector({0, 1, 3}), sizeof(IDX) * 8, ctx); + x = aten::CSRSliceRows(csr, r); + // [[0, 1, 2, 0, 0], + // [1, 0, 0, 0, 0], + // [0, 0, 0, 0, 0]] + // data: [0, 2, 5, 3] + tp = aten::VecToIdArray(std::vector({0, 3, 4, 4}), sizeof(IDX) * 8, ctx); + ti = aten::VecToIdArray(std::vector({1, 2, 2, 0}), sizeof(IDX) * 8, ctx); + td = aten::VecToIdArray(std::vector({0, 2, 5, 3}), sizeof(IDX) * 8, ctx); + ASSERT_TRUE(ArrayEQ(x.indptr, tp)); + 
ASSERT_TRUE(ArrayEQ(x.indices, ti)); + ASSERT_TRUE(ArrayEQ(x.data, td)); + + // Testing non-increasing row id based slicing + r = aten::VecToIdArray(std::vector({3, 2, 1}), sizeof(IDX) * 8, ctx); + x = aten::CSRSliceRows(csr, r); + // [[0, 0, 0, 0, 0], + // [0, 0, 1, 1, 0], + // [1, 0, 0, 0, 0]] + // data: [1, 4, 3] + tp = aten::VecToIdArray(std::vector({0, 0, 2, 3}), sizeof(IDX) * 8, ctx); + ti = aten::VecToIdArray(std::vector({2, 3, 0}), sizeof(IDX) * 8, ctx); + td = aten::VecToIdArray(std::vector({1, 4, 3}), sizeof(IDX) * 8, ctx); + ASSERT_TRUE(ArrayEQ(x.indptr, tp)); + ASSERT_TRUE(ArrayEQ(x.indices, ti)); + ASSERT_TRUE(ArrayEQ(x.data, td)); + + // Testing zero-degree row slicing with different rows + r = aten::VecToIdArray( + std::vector({1, 3, 0, 3, 2}), sizeof(IDX) * 8, ctx); + x = aten::CSRSliceRows(csr, r); + // [[1, 0, 0, 0, 0], + // [0, 0, 0, 0, 0], + // [0, 1, 2, 0, 0], + // [0, 0, 0, 0, 0], + // [0, 0, 1, 1, 0]] + // data: [3, 0, 2, 5, 1, 4] + tp = aten::VecToIdArray( + std::vector({0, 1, 1, 4, 4, 6}), sizeof(IDX) * 8, ctx); + ti = aten::VecToIdArray( + std::vector({0, 1, 2, 2, 2, 3}), sizeof(IDX) * 8, ctx); + td = aten::VecToIdArray( + std::vector({3, 0, 2, 5, 1, 4}), sizeof(IDX) * 8, ctx); + ASSERT_TRUE(ArrayEQ(x.indptr, tp)); + ASSERT_TRUE(ArrayEQ(x.indices, ti)); + ASSERT_TRUE(ArrayEQ(x.data, td)); + + // Testing empty output (i.e. sliced rows will be zero-degree) + r = aten::VecToIdArray(std::vector({3, 3, 3}), sizeof(IDX) * 8, ctx); + x = aten::CSRSliceRows(csr, r); + // [[0, 0, 0, 0, 0], + // [0, 0, 0, 0, 0], + // [0, 0, 0, 0, 0]] + // data: [] + tp = aten::VecToIdArray(std::vector({0, 0, 0, 0}), sizeof(IDX) * 8, ctx); + ti = aten::VecToIdArray(std::vector({}), sizeof(IDX) * 8, ctx); + td = aten::VecToIdArray(std::vector({}), sizeof(IDX) * 8, ctx); + ASSERT_TRUE(ArrayEQ(x.indptr, tp)); + ASSERT_TRUE(ArrayEQ(x.indices, ti)); + ASSERT_TRUE(ArrayEQ(x.data, td)); + + // Testing constant output: we pick last row with at least one nnz + r = aten::VecToIdArray(std::vector({2, 2, 2}), sizeof(IDX) * 8, ctx); + x = aten::CSRSliceRows(csr, r); + // [[0, 0, 1, 1, 0], + // [0, 0, 1, 1, 0], + // [0, 0, 1, 1, 0]] + // data: [1, 4, 1, 4, 1, 4] + tp = aten::VecToIdArray(std::vector({0, 2, 4, 6}), sizeof(IDX) * 8, ctx); + ti = aten::VecToIdArray( + std::vector({2, 3, 2, 3, 2, 3}), sizeof(IDX) * 8, ctx); + td = aten::VecToIdArray( + std::vector({1, 4, 1, 4, 1, 4}), sizeof(IDX) * 8, ctx); + ASSERT_TRUE(ArrayEQ(x.indptr, tp)); + ASSERT_TRUE(ArrayEQ(x.indices, ti)); + ASSERT_TRUE(ArrayEQ(x.data, td)); +} + +TEST(SpmatTest, TestCSRSliceRows) { + _TestCSRSliceRows(CPU); + _TestCSRSliceRows(CPU); +#ifdef DGL_USE_CUDA + _TestCSRSliceRows(GPU); + _TestCSRSliceRows(GPU); +#endif +} + +template +void _TestCSRSliceMatrix1(DGLContext ctx) { + auto csr = CSR2(ctx); + { + // square + auto r = + aten::VecToIdArray(std::vector({0, 1, 3}), sizeof(IDX) * 8, ctx); + auto c = + aten::VecToIdArray(std::vector({1, 2, 3}), sizeof(IDX) * 8, ctx); + auto x = aten::CSRSliceMatrix(csr, r, c); + // [[1, 2, 0], + // [0, 0, 0], + // [0, 0, 0]] + // data: [0, 2, 5] + ASSERT_EQ(x.num_rows, 3); + ASSERT_EQ(x.num_cols, 3); + auto tp = aten::VecToIdArray( + std::vector({0, 3, 3, 3}), sizeof(IDX) * 8, ctx); + auto ti = + aten::VecToIdArray(std::vector({0, 1, 1}), sizeof(IDX) * 8, ctx); + auto td = + aten::VecToIdArray(std::vector({0, 2, 5}), sizeof(IDX) * 8, ctx); + ASSERT_TRUE(ArrayEQ(x.indptr, tp)); + ASSERT_TRUE(ArrayEQ(x.indices, ti)); + ASSERT_TRUE(ArrayEQ(x.data, td)); + } + { + // non-square + auto r = + 
aten::VecToIdArray(std::vector({0, 1, 2}), sizeof(IDX) * 8, ctx); + auto c = aten::VecToIdArray(std::vector({0, 1}), sizeof(IDX) * 8, ctx); + auto x = aten::CSRSliceMatrix(csr, r, c); + // [[0, 1], + // [1, 0], + // [0, 0]] + // data: [0, 3] + ASSERT_EQ(x.num_rows, 3); + ASSERT_EQ(x.num_cols, 2); + auto tp = aten::VecToIdArray( + std::vector({0, 1, 2, 2}), sizeof(IDX) * 8, ctx); + auto ti = + aten::VecToIdArray(std::vector({1, 0}), sizeof(IDX) * 8, ctx); + auto td = + aten::VecToIdArray(std::vector({0, 3}), sizeof(IDX) * 8, ctx); + ASSERT_TRUE(ArrayEQ(x.indptr, tp)); + ASSERT_TRUE(ArrayEQ(x.indices, ti)); + ASSERT_TRUE(ArrayEQ(x.data, td)); + } + { + // empty slice + auto r = aten::VecToIdArray(std::vector({2, 3}), sizeof(IDX) * 8, ctx); + auto c = aten::VecToIdArray(std::vector({0, 1}), sizeof(IDX) * 8, ctx); + auto x = aten::CSRSliceMatrix(csr, r, c); + // [[0, 0], + // [0, 0]] + // data: [] + ASSERT_EQ(x.num_rows, 2); + ASSERT_EQ(x.num_cols, 2); + auto tp = + aten::VecToIdArray(std::vector({0, 0, 0}), sizeof(IDX) * 8, ctx); + auto ti = aten::VecToIdArray(std::vector({}), sizeof(IDX) * 8, ctx); + auto td = aten::VecToIdArray(std::vector({}), sizeof(IDX) * 8, ctx); + ASSERT_TRUE(ArrayEQ(x.indptr, tp)); + ASSERT_TRUE(ArrayEQ(x.indices, ti)); + ASSERT_TRUE(ArrayEQ(x.data, td)); + } +} + +template +void _TestCSRSliceMatrix2(DGLContext ctx) { + auto csr = CSR3(ctx); + { + // square + auto r = + aten::VecToIdArray(std::vector({0, 1, 3}), sizeof(IDX) * 8, ctx); + auto c = + aten::VecToIdArray(std::vector({1, 2, 3}), sizeof(IDX) * 8, ctx); + auto x = aten::CSRSliceMatrix(csr, r, c); + // [[1, 1, 1], + // [0, 0, 0], + // [0, 0, 0]] + // data: [5, 2, 0] + ASSERT_EQ(x.num_rows, 3); + ASSERT_EQ(x.num_cols, 3); + auto tp = aten::VecToIdArray( + std::vector({0, 3, 3, 3}), sizeof(IDX) * 8, ctx); + // indexes are in reverse order in CSR3 + auto ti = + aten::VecToIdArray(std::vector({2, 1, 0}), sizeof(IDX) * 8, ctx); + auto td = + aten::VecToIdArray(std::vector({0, 2, 5}), sizeof(IDX) * 8, ctx); + ASSERT_TRUE(ArrayEQ(x.indptr, tp)); + ASSERT_TRUE(ArrayEQ(x.indices, ti)); + ASSERT_TRUE(ArrayEQ(x.data, td)); + } + { + // non-square + auto r = + aten::VecToIdArray(std::vector({0, 1, 2}), sizeof(IDX) * 8, ctx); + auto c = aten::VecToIdArray(std::vector({0, 1}), sizeof(IDX) * 8, ctx); + auto x = aten::CSRSliceMatrix(csr, r, c); + // [[0, 1], + // [1, 0], + // [0, 0]] + // data: [0, 3] + ASSERT_EQ(x.num_rows, 3); + ASSERT_EQ(x.num_cols, 2); + auto tp = aten::VecToIdArray( + std::vector({0, 1, 2, 2}), sizeof(IDX) * 8, ctx); + auto ti = + aten::VecToIdArray(std::vector({1, 0}), sizeof(IDX) * 8, ctx); + auto td = + aten::VecToIdArray(std::vector({5, 3}), sizeof(IDX) * 8, ctx); + ASSERT_TRUE(ArrayEQ(x.indptr, tp)); + ASSERT_TRUE(ArrayEQ(x.indices, ti)); + ASSERT_TRUE(ArrayEQ(x.data, td)); + } + { + // empty slice + auto r = aten::VecToIdArray(std::vector({2, 3}), sizeof(IDX) * 8, ctx); + auto c = aten::VecToIdArray(std::vector({0, 1}), sizeof(IDX) * 8, ctx); + auto x = aten::CSRSliceMatrix(csr, r, c); + // [[0, 0], + // [0, 0]] + // data: [] + ASSERT_EQ(x.num_rows, 2); + ASSERT_EQ(x.num_cols, 2); + auto tp = + aten::VecToIdArray(std::vector({0, 0, 0}), sizeof(IDX) * 8, ctx); + auto ti = aten::VecToIdArray(std::vector({}), sizeof(IDX) * 8, ctx); + auto td = aten::VecToIdArray(std::vector({}), sizeof(IDX) * 8, ctx); + ASSERT_TRUE(ArrayEQ(x.indptr, tp)); + ASSERT_TRUE(ArrayEQ(x.indices, ti)); + ASSERT_TRUE(ArrayEQ(x.data, td)); + } +} + +TEST(SpmatTest, CSRSliceMatrix) { + _TestCSRSliceMatrix1(CPU); + 
_TestCSRSliceMatrix1(CPU); + _TestCSRSliceMatrix2(CPU); + _TestCSRSliceMatrix2(CPU); +#ifdef DGL_USE_CUDA + _TestCSRSliceMatrix1(GPU); + _TestCSRSliceMatrix1(GPU); + _TestCSRSliceMatrix2(GPU); + _TestCSRSliceMatrix2(GPU); +#endif +} + +template +void _TestCSRHasDuplicate(DGLContext ctx) { + auto csr = CSR1(ctx); + ASSERT_FALSE(aten::CSRHasDuplicate(csr)); + csr = CSR2(ctx); + ASSERT_TRUE(aten::CSRHasDuplicate(csr)); +} + +TEST(SpmatTest, CSRHasDuplicate) { + _TestCSRHasDuplicate(CPU); + _TestCSRHasDuplicate(CPU); +#ifdef DGL_USE_CUDA + _TestCSRHasDuplicate(GPU); + _TestCSRHasDuplicate(GPU); +#endif +} + +template +void _TestCSRSort(DGLContext ctx) { + auto csr = CSR1(ctx); + ASSERT_FALSE(aten::CSRIsSorted(csr)); + auto csr1 = aten::CSRSort(csr); + ASSERT_FALSE(aten::CSRIsSorted(csr)); + ASSERT_TRUE(aten::CSRIsSorted(csr1)); + ASSERT_TRUE(csr1.sorted); + aten::CSRSort_(&csr); + ASSERT_TRUE(aten::CSRIsSorted(csr)); + ASSERT_TRUE(csr.sorted); + csr = CSR2(ctx); + ASSERT_TRUE(aten::CSRIsSorted(csr)); +} + +TEST(SpmatTest, CSRSort) { + _TestCSRSort(CPU); + _TestCSRSort(CPU); +#ifdef DGL_USE_CUDA + _TestCSRSort(GPU); + _TestCSRSort(GPU); +#endif +} + +template +void _TestCSRReorder() { + auto csr = CSR2(); + auto new_row = + aten::VecToIdArray(std::vector({2, 0, 3, 1}), sizeof(IDX) * 8, CTX); + auto new_col = aten::VecToIdArray( + std::vector({2, 0, 4, 3, 1}), sizeof(IDX) * 8, CTX); + auto new_csr = CSRReorder(csr, new_row, new_col); + ASSERT_EQ(new_csr.num_rows, csr.num_rows); + ASSERT_EQ(new_csr.num_cols, csr.num_cols); +} + +TEST(SpmatTest, TestCSRReorder) { + _TestCSRReorder(); + _TestCSRReorder(); +} diff --git a/tests/cpp/test_unit_graph.cc b/tests/cpp/test_unit_graph.cc index 20cc1ebe24cf..8371321df953 100644 --- a/tests/cpp/test_unit_graph.cc +++ b/tests/cpp/test_unit_graph.cc @@ -369,7 +369,7 @@ void _TestUnitGraph_CopyTo( TEST(UniGraphTest, TestUnitGraph_CopyTo) { _TestUnitGraph_CopyTo(CPU, CPU); _TestUnitGraph_CopyTo(CPU, CPU); -#ifdef DGL_USE_CUDA +#ifdef DGL_USE_ROCM _TestUnitGraph_CopyTo(CPU, GPU); _TestUnitGraph_CopyTo(GPU, GPU); _TestUnitGraph_CopyTo(GPU, CPU); @@ -382,7 +382,7 @@ TEST(UniGraphTest, TestUnitGraph_CopyTo) { TEST(UniGraphTest, TestUnitGraph_InOutDegrees) { _TestUnitGraph_InOutDegrees(CPU); _TestUnitGraph_InOutDegrees(CPU); -#ifdef DGL_USE_CUDA +#ifdef DGL_USE_ROCM _TestUnitGraph_InOutDegrees(GPU); _TestUnitGraph_InOutDegrees(GPU); #endif @@ -391,7 +391,7 @@ TEST(UniGraphTest, TestUnitGraph_InOutDegrees) { TEST(UniGraphTest, TestUnitGraph_Create) { _TestUnitGraph(CPU); _TestUnitGraph(CPU); -#ifdef DGL_USE_CUDA +#ifdef DGL_USE_ROCM _TestUnitGraph(GPU); _TestUnitGraph(GPU); #endif @@ -400,7 +400,7 @@ TEST(UniGraphTest, TestUnitGraph_Create) { TEST(UniGraphTest, TestUnitGraph_GetInCSR) { _TestUnitGraph_GetInCSR(CPU); _TestUnitGraph_GetInCSR(CPU); -#ifdef DGL_USE_CUDA +#ifdef DGL_USE_ROCM _TestUnitGraph_GetInCSR(GPU); _TestUnitGraph_GetInCSR(GPU); #endif @@ -409,7 +409,7 @@ TEST(UniGraphTest, TestUnitGraph_GetInCSR) { TEST(UniGraphTest, TestUnitGraph_GetOutCSR) { _TestUnitGraph_GetOutCSR(CPU); _TestUnitGraph_GetOutCSR(CPU); -#ifdef DGL_USE_CUDA +#ifdef DGL_USE_ROCM _TestUnitGraph_GetOutCSR(GPU); _TestUnitGraph_GetOutCSR(GPU); #endif @@ -418,7 +418,7 @@ TEST(UniGraphTest, TestUnitGraph_GetOutCSR) { TEST(UniGraphTest, TestUnitGraph_GetCOO) { _TestUnitGraph_GetCOO(CPU); _TestUnitGraph_GetCOO(CPU); -#ifdef DGL_USE_CUDA +#ifdef DGL_USE_ROCM _TestUnitGraph_GetCOO(GPU); _TestUnitGraph_GetCOO(GPU); #endif @@ -427,7 +427,7 @@ TEST(UniGraphTest, TestUnitGraph_GetCOO) { 
TEST(UniGraphTest, TestUnitGraph_Reserve) { _TestUnitGraph_Reserve(CPU); _TestUnitGraph_Reserve(CPU); -#ifdef DGL_USE_CUDA +#ifdef DGL_USE_ROCM _TestUnitGraph_Reserve(GPU); _TestUnitGraph_Reserve(GPU); #endif diff --git a/tests/cpp/test_unit_graph.cc.prehip b/tests/cpp/test_unit_graph.cc.prehip new file mode 100644 index 000000000000..20cc1ebe24cf --- /dev/null +++ b/tests/cpp/test_unit_graph.cc.prehip @@ -0,0 +1,434 @@ +/** + * Copyright (c) 2019 by Contributors + * @file test_unit_graph.cc + * @brief Test UnitGraph + */ +#include +#include +#include +#include + +#include +#include + +#include "../../src/graph/unit_graph.h" +#include "./../src/graph/heterograph.h" +#include "./common.h" + +using namespace dgl; +using namespace dgl::runtime; + +template +aten::CSRMatrix CSR1(DGLContext ctx) { + /** + * G = [[0, 0, 1], + * [1, 0, 1], + * [0, 1, 0], + * [1, 0, 1]] + */ + IdArray g_indptr = aten::VecToIdArray( + std::vector({0, 1, 3, 4, 6}), sizeof(IdType) * 8, CTX); + IdArray g_indices = aten::VecToIdArray( + std::vector({2, 0, 2, 1, 0, 2}), sizeof(IdType) * 8, CTX); + + const aten::CSRMatrix &csr_a = + aten::CSRMatrix(4, 3, g_indptr, g_indices, aten::NullArray(), false); + return csr_a; +} + +template aten::CSRMatrix CSR1(DGLContext ctx); +template aten::CSRMatrix CSR1(DGLContext ctx); + +template +aten::COOMatrix COO1(DGLContext ctx) { + /** + * G = [[1, 1, 0], + * [0, 1, 0]] + */ + IdArray g_row = aten::VecToIdArray( + std::vector({0, 0, 1}), sizeof(IdType) * 8, CTX); + IdArray g_col = aten::VecToIdArray( + std::vector({0, 1, 1}), sizeof(IdType) * 8, CTX); + const aten::COOMatrix &coo = + aten::COOMatrix(2, 3, g_row, g_col, aten::NullArray(), true, true); + + return coo; +} + +template aten::COOMatrix COO1(DGLContext ctx); +template aten::COOMatrix COO1(DGLContext ctx); + +template +void _TestUnitGraph_InOutDegrees(DGLContext ctx) { + /** + InDegree(s) is available only if COO or CSC formats permitted. + OutDegree(s) is available only if COO or CSR formats permitted. 
+ */ + + // COO + { + const aten::COOMatrix &coo = COO1(ctx); + auto &&g = CreateFromCOO(2, coo, COO_CODE); + ASSERT_EQ(g->InDegree(0, 0), 1); + auto &&nids = aten::Range(0, g->NumVertices(0), g->NumBits(), g->Context()); + ASSERT_TRUE(ArrayEQ( + g->InDegrees(0, nids), + aten::VecToIdArray({1, 2}, g->NumBits(), g->Context()))); + ASSERT_EQ(g->OutDegree(0, 0), 2); + ASSERT_TRUE(ArrayEQ( + g->OutDegrees(0, nids), + aten::VecToIdArray({2, 1}, g->NumBits(), g->Context()))); + } + // CSC + { + const aten::CSRMatrix &csr = CSR1(ctx); + auto &&g = CreateFromCSC(2, csr, CSC_CODE); + ASSERT_EQ(g->InDegree(0, 0), 1); + auto &&nids = aten::Range(0, g->NumVertices(0), g->NumBits(), g->Context()); + ASSERT_TRUE(ArrayEQ( + g->InDegrees(0, nids), + aten::VecToIdArray({1, 2, 1}, g->NumBits(), g->Context()))); + EXPECT_ANY_THROW(g->OutDegree(0, 0)); + EXPECT_ANY_THROW(g->OutDegrees(0, nids)); + } + // CSR + { + const aten::CSRMatrix &csr = CSR1(ctx); + auto &&g = CreateFromCSR(2, csr, CSR_CODE); + ASSERT_EQ(g->OutDegree(0, 0), 1); + auto &&nids = aten::Range(0, g->NumVertices(0), g->NumBits(), g->Context()); + ASSERT_TRUE(ArrayEQ( + g->OutDegrees(0, nids), + aten::VecToIdArray({1, 2, 1, 2}, g->NumBits(), g->Context()))); + EXPECT_ANY_THROW(g->InDegree(0, 0)); + EXPECT_ANY_THROW(g->InDegrees(0, nids)); + } +} + +template +void _TestUnitGraph(DGLContext ctx) { + const aten::CSRMatrix &csr = CSR1(ctx); + const aten::COOMatrix &coo = COO1(ctx); + + auto g = CreateFromCSC(2, csr); + ASSERT_EQ(g->GetCreatedFormats(), 4); + + g = CreateFromCSR(2, csr); + ASSERT_EQ(g->GetCreatedFormats(), 2); + + g = CreateFromCOO(2, coo); + ASSERT_EQ(g->GetCreatedFormats(), 1); + + auto src = aten::VecToIdArray({1, 2, 5, 3}); + auto dst = aten::VecToIdArray({1, 6, 2, 6}); + auto mg = dgl::UnitGraph::CreateFromCOO(2, 9, 8, src, dst, COO_CODE); + ASSERT_EQ(mg->GetCreatedFormats(), 1); + auto hmg = dgl::UnitGraph::CreateFromCOO(1, 8, 8, src, dst, COO_CODE); + auto img = std::dynamic_pointer_cast(hmg->AsImmutableGraph()); + ASSERT_TRUE(img != nullptr); + mg = dgl::UnitGraph::CreateFromCOO(2, 9, 8, src, dst, CSR_CODE | COO_CODE); + ASSERT_EQ(mg->GetCreatedFormats(), 1); + hmg = dgl::UnitGraph::CreateFromCOO(1, 8, 8, src, dst, CSR_CODE | COO_CODE); + img = std::dynamic_pointer_cast(hmg->AsImmutableGraph()); + ASSERT_TRUE(img != nullptr); + mg = dgl::UnitGraph::CreateFromCOO(2, 9, 8, src, dst, CSC_CODE | COO_CODE); + ASSERT_EQ(mg->GetCreatedFormats(), 1); + hmg = dgl::UnitGraph::CreateFromCOO(1, 8, 8, src, dst, CSC_CODE | COO_CODE); + img = std::dynamic_pointer_cast(hmg->AsImmutableGraph()); + ASSERT_TRUE(img != nullptr); + + g = CreateFromCSC(2, csr); + ASSERT_EQ(g->GetCreatedFormats(), 4); + + g = CreateFromCSR(2, csr); + ASSERT_EQ(g->GetCreatedFormats(), 2); + + g = CreateFromCOO(2, coo); + ASSERT_EQ(g->GetCreatedFormats(), 1); +} + +template +void _TestUnitGraph_GetInCSR(DGLContext ctx) { + const aten::CSRMatrix &csr = CSR1(ctx); + const aten::COOMatrix &coo = COO1(ctx); + + auto g = CreateFromCSC(2, csr); + auto in_csr_matrix = g->GetCSCMatrix(0); + ASSERT_EQ(in_csr_matrix.num_rows, csr.num_rows); + ASSERT_EQ(in_csr_matrix.num_cols, csr.num_cols); + ASSERT_EQ(g->GetCreatedFormats(), 4); + + // test out csr + g = CreateFromCSR(2, csr); + auto g_ptr = g->GetGraphInFormat(CSC_CODE); + in_csr_matrix = g_ptr->GetCSCMatrix(0); + ASSERT_EQ(in_csr_matrix.num_cols, csr.num_rows); + ASSERT_EQ(in_csr_matrix.num_rows, csr.num_cols); + ASSERT_EQ(g->GetCreatedFormats(), 2); + in_csr_matrix = g->GetCSCMatrix(0); + 
ASSERT_EQ(in_csr_matrix.num_cols, csr.num_rows); + ASSERT_EQ(in_csr_matrix.num_rows, csr.num_cols); + ASSERT_EQ(g->GetCreatedFormats(), 6); + + // test out coo + g = CreateFromCOO(2, coo); + g_ptr = g->GetGraphInFormat(CSC_CODE); + in_csr_matrix = g_ptr->GetCSCMatrix(0); + ASSERT_EQ(in_csr_matrix.num_cols, coo.num_rows); + ASSERT_EQ(in_csr_matrix.num_rows, coo.num_cols); + ASSERT_EQ(g->GetCreatedFormats(), 1); + + in_csr_matrix = g->GetCSCMatrix(0); + ASSERT_EQ(in_csr_matrix.num_cols, coo.num_rows); + ASSERT_EQ(in_csr_matrix.num_rows, coo.num_cols); + ASSERT_EQ(g->GetCreatedFormats(), 5); +} + +template +void _TestUnitGraph_GetOutCSR(DGLContext ctx) { + const aten::CSRMatrix &csr = CSR1(ctx); + const aten::COOMatrix &coo = COO1(ctx); + + auto g = CreateFromCSC(2, csr); + auto g_ptr = g->GetGraphInFormat(CSR_CODE); + auto out_csr_matrix = g_ptr->GetCSRMatrix(0); + ASSERT_EQ(out_csr_matrix.num_cols, csr.num_rows); + ASSERT_EQ(out_csr_matrix.num_rows, csr.num_cols); + ASSERT_EQ(g->GetCreatedFormats(), 4); + out_csr_matrix = g->GetCSRMatrix(0); + ASSERT_EQ(out_csr_matrix.num_cols, csr.num_rows); + ASSERT_EQ(out_csr_matrix.num_rows, csr.num_cols); + ASSERT_EQ(g->GetCreatedFormats(), 6); + + // test out csr + g = CreateFromCSR(2, csr); + out_csr_matrix = g->GetCSRMatrix(0); + ASSERT_EQ(out_csr_matrix.num_rows, csr.num_rows); + ASSERT_EQ(out_csr_matrix.num_cols, csr.num_cols); + ASSERT_EQ(g->GetCreatedFormats(), 2); + + // test out coo + g = CreateFromCOO(2, coo); + g_ptr = g->GetGraphInFormat(CSR_CODE); + out_csr_matrix = g_ptr->GetCSRMatrix(0); + ASSERT_EQ(out_csr_matrix.num_rows, coo.num_rows); + ASSERT_EQ(out_csr_matrix.num_cols, coo.num_cols); + ASSERT_EQ(g->GetCreatedFormats(), 1); + + out_csr_matrix = g->GetCSRMatrix(0); + ASSERT_EQ(out_csr_matrix.num_rows, coo.num_rows); + ASSERT_EQ(out_csr_matrix.num_cols, coo.num_cols); + ASSERT_EQ(g->GetCreatedFormats(), 3); +} + +template +void _TestUnitGraph_GetCOO(DGLContext ctx) { + const aten::CSRMatrix &csr = CSR1(ctx); + const aten::COOMatrix &coo = COO1(ctx); + + auto g = CreateFromCSC(2, csr); + auto g_ptr = g->GetGraphInFormat(COO_CODE); + auto out_coo_matrix = g_ptr->GetCOOMatrix(0); + ASSERT_EQ(out_coo_matrix.num_cols, csr.num_rows); + ASSERT_EQ(out_coo_matrix.num_rows, csr.num_cols); + ASSERT_EQ(g->GetCreatedFormats(), 4); + out_coo_matrix = g->GetCOOMatrix(0); + ASSERT_EQ(out_coo_matrix.num_cols, csr.num_rows); + ASSERT_EQ(out_coo_matrix.num_rows, csr.num_cols); + ASSERT_EQ(g->GetCreatedFormats(), 5); + + // test out csr + g = CreateFromCSR(2, csr); + g_ptr = g->GetGraphInFormat(COO_CODE); + out_coo_matrix = g_ptr->GetCOOMatrix(0); + ASSERT_EQ(out_coo_matrix.num_rows, csr.num_rows); + ASSERT_EQ(out_coo_matrix.num_cols, csr.num_cols); + ASSERT_EQ(g->GetCreatedFormats(), 2); + out_coo_matrix = g->GetCOOMatrix(0); + ASSERT_EQ(out_coo_matrix.num_rows, csr.num_rows); + ASSERT_EQ(out_coo_matrix.num_cols, csr.num_cols); + ASSERT_EQ(g->GetCreatedFormats(), 3); + + // test out coo + g = CreateFromCOO(2, coo); + out_coo_matrix = g->GetCOOMatrix(0); + ASSERT_EQ(out_coo_matrix.num_rows, coo.num_rows); + ASSERT_EQ(out_coo_matrix.num_cols, coo.num_cols); + ASSERT_EQ(g->GetCreatedFormats(), 1); +} + +template +void _TestUnitGraph_Reserve(DGLContext ctx) { + const aten::CSRMatrix &csr = CSR1(ctx); + const aten::COOMatrix &coo = COO1(ctx); + + auto g = CreateFromCSC(2, csr); + ASSERT_EQ(g->GetCreatedFormats(), 4); + auto r_g = + std::dynamic_pointer_cast(g->GetRelationGraph(0))->Reverse(); + ASSERT_EQ(r_g->GetCreatedFormats(), 2); + aten::CSRMatrix 
g_in_csr = g->GetCSCMatrix(0); + aten::CSRMatrix r_g_out_csr = r_g->GetCSRMatrix(0); + ASSERT_TRUE(g_in_csr.indptr->data == r_g_out_csr.indptr->data); + ASSERT_TRUE(g_in_csr.indices->data == r_g_out_csr.indices->data); + aten::CSRMatrix g_out_csr = g->GetCSRMatrix(0); + ASSERT_EQ(g->GetCreatedFormats(), 6); + ASSERT_EQ(r_g->GetCreatedFormats(), 6); + aten::CSRMatrix r_g_in_csr = r_g->GetCSCMatrix(0); + ASSERT_TRUE(g_out_csr.indptr->data == r_g_in_csr.indptr->data); + ASSERT_TRUE(g_out_csr.indices->data == r_g_in_csr.indices->data); + aten::COOMatrix g_coo = g->GetCOOMatrix(0); + ASSERT_EQ(g->GetCreatedFormats(), 7); + ASSERT_EQ(r_g->GetCreatedFormats(), 6); + aten::COOMatrix r_g_coo = r_g->GetCOOMatrix(0); + ASSERT_EQ(r_g->GetCreatedFormats(), 7); + ASSERT_EQ(g_coo.num_rows, r_g_coo.num_cols); + ASSERT_EQ(g_coo.num_cols, r_g_coo.num_rows); + ASSERT_TRUE(ArrayEQ(g_coo.row, r_g_coo.col)); + ASSERT_TRUE(ArrayEQ(g_coo.col, r_g_coo.row)); + + // test out csr + g = CreateFromCSR(2, csr); + ASSERT_EQ(g->GetCreatedFormats(), 2); + r_g = std::dynamic_pointer_cast(g->GetRelationGraph(0))->Reverse(); + ASSERT_EQ(r_g->GetCreatedFormats(), 4); + g_out_csr = g->GetCSRMatrix(0); + r_g_in_csr = r_g->GetCSCMatrix(0); + ASSERT_TRUE(g_out_csr.indptr->data == r_g_in_csr.indptr->data); + ASSERT_TRUE(g_out_csr.indices->data == r_g_in_csr.indices->data); + g_in_csr = g->GetCSCMatrix(0); + ASSERT_EQ(g->GetCreatedFormats(), 6); + ASSERT_EQ(r_g->GetCreatedFormats(), 6); + r_g_out_csr = r_g->GetCSRMatrix(0); + ASSERT_TRUE(g_in_csr.indptr->data == r_g_out_csr.indptr->data); + ASSERT_TRUE(g_in_csr.indices->data == r_g_out_csr.indices->data); + g_coo = g->GetCOOMatrix(0); + ASSERT_EQ(g->GetCreatedFormats(), 7); + ASSERT_EQ(r_g->GetCreatedFormats(), 6); + r_g_coo = r_g->GetCOOMatrix(0); + ASSERT_EQ(r_g->GetCreatedFormats(), 7); + ASSERT_EQ(g_coo.num_rows, r_g_coo.num_cols); + ASSERT_EQ(g_coo.num_cols, r_g_coo.num_rows); + ASSERT_TRUE(ArrayEQ(g_coo.row, r_g_coo.col)); + ASSERT_TRUE(ArrayEQ(g_coo.col, r_g_coo.row)); + + // test out coo + g = CreateFromCOO(2, coo); + ASSERT_EQ(g->GetCreatedFormats(), 1); + r_g = std::dynamic_pointer_cast(g->GetRelationGraph(0))->Reverse(); + ASSERT_EQ(r_g->GetCreatedFormats(), 1); + g_coo = g->GetCOOMatrix(0); + r_g_coo = r_g->GetCOOMatrix(0); + ASSERT_EQ(g_coo.num_rows, r_g_coo.num_cols); + ASSERT_EQ(g_coo.num_cols, r_g_coo.num_rows); + ASSERT_TRUE(g_coo.row->data == r_g_coo.col->data); + ASSERT_TRUE(g_coo.col->data == r_g_coo.row->data); + g_in_csr = g->GetCSCMatrix(0); + ASSERT_EQ(g->GetCreatedFormats(), 5); + ASSERT_EQ(r_g->GetCreatedFormats(), 3); + r_g_out_csr = r_g->GetCSRMatrix(0); + ASSERT_TRUE(g_in_csr.indptr->data == r_g_out_csr.indptr->data); + ASSERT_TRUE(g_in_csr.indices->data == r_g_out_csr.indices->data); + g_out_csr = g->GetCSRMatrix(0); + ASSERT_EQ(g->GetCreatedFormats(), 7); + ASSERT_EQ(r_g->GetCreatedFormats(), 7); + r_g_in_csr = r_g->GetCSCMatrix(0); + ASSERT_TRUE(g_out_csr.indptr->data == r_g_in_csr.indptr->data); + ASSERT_TRUE(g_out_csr.indices->data == r_g_in_csr.indices->data); +} + +template +void _TestUnitGraph_CopyTo( + const DGLContext &src_ctx, const DGLContext &dst_ctx) { + const aten::CSRMatrix &csr = CSR1(src_ctx); + const aten::COOMatrix &coo = COO1(src_ctx); + + auto device = dgl::runtime::DeviceAPI::Get(dst_ctx); + // We don't allow SetStream in DGL for now. 
+ auto stream = nullptr; + + auto g = dgl::UnitGraph::CreateFromCSC(2, csr); + ASSERT_EQ(g->GetCreatedFormats(), 4); + auto cg = dgl::UnitGraph::CopyTo(g, dst_ctx); + device->StreamSync(dst_ctx, stream); + ASSERT_EQ(cg->GetCreatedFormats(), 4); + + g = dgl::UnitGraph::CreateFromCSR(2, csr); + ASSERT_EQ(g->GetCreatedFormats(), 2); + cg = dgl::UnitGraph::CopyTo(g, dst_ctx); + device->StreamSync(dst_ctx, stream); + ASSERT_EQ(cg->GetCreatedFormats(), 2); + + g = dgl::UnitGraph::CreateFromCOO(2, coo); + ASSERT_EQ(g->GetCreatedFormats(), 1); + cg = dgl::UnitGraph::CopyTo(g, dst_ctx); + device->StreamSync(dst_ctx, stream); + ASSERT_EQ(cg->GetCreatedFormats(), 1); +} + +TEST(UniGraphTest, TestUnitGraph_CopyTo) { + _TestUnitGraph_CopyTo(CPU, CPU); + _TestUnitGraph_CopyTo(CPU, CPU); +#ifdef DGL_USE_CUDA + _TestUnitGraph_CopyTo(CPU, GPU); + _TestUnitGraph_CopyTo(GPU, GPU); + _TestUnitGraph_CopyTo(GPU, CPU); + _TestUnitGraph_CopyTo(CPU, GPU); + _TestUnitGraph_CopyTo(GPU, GPU); + _TestUnitGraph_CopyTo(GPU, CPU); +#endif +} + +TEST(UniGraphTest, TestUnitGraph_InOutDegrees) { + _TestUnitGraph_InOutDegrees(CPU); + _TestUnitGraph_InOutDegrees(CPU); +#ifdef DGL_USE_CUDA + _TestUnitGraph_InOutDegrees(GPU); + _TestUnitGraph_InOutDegrees(GPU); +#endif +} + +TEST(UniGraphTest, TestUnitGraph_Create) { + _TestUnitGraph(CPU); + _TestUnitGraph(CPU); +#ifdef DGL_USE_CUDA + _TestUnitGraph(GPU); + _TestUnitGraph(GPU); +#endif +} + +TEST(UniGraphTest, TestUnitGraph_GetInCSR) { + _TestUnitGraph_GetInCSR(CPU); + _TestUnitGraph_GetInCSR(CPU); +#ifdef DGL_USE_CUDA + _TestUnitGraph_GetInCSR(GPU); + _TestUnitGraph_GetInCSR(GPU); +#endif +} + +TEST(UniGraphTest, TestUnitGraph_GetOutCSR) { + _TestUnitGraph_GetOutCSR(CPU); + _TestUnitGraph_GetOutCSR(CPU); +#ifdef DGL_USE_CUDA + _TestUnitGraph_GetOutCSR(GPU); + _TestUnitGraph_GetOutCSR(GPU); +#endif +} + +TEST(UniGraphTest, TestUnitGraph_GetCOO) { + _TestUnitGraph_GetCOO(CPU); + _TestUnitGraph_GetCOO(CPU); +#ifdef DGL_USE_CUDA + _TestUnitGraph_GetCOO(GPU); + _TestUnitGraph_GetCOO(GPU); +#endif +} + +TEST(UniGraphTest, TestUnitGraph_Reserve) { + _TestUnitGraph_Reserve(CPU); + _TestUnitGraph_Reserve(CPU); +#ifdef DGL_USE_CUDA + _TestUnitGraph_Reserve(GPU); + _TestUnitGraph_Reserve(GPU); +#endif +} diff --git a/third_party/HugeCTR/gpu_cache/include/gpu_cache_api.hpp b/third_party/HugeCTR/gpu_cache/include/gpu_cache_api.hpp index 563aa5b5b75f..35d12531afd5 100644 --- a/third_party/HugeCTR/gpu_cache/include/gpu_cache_api.hpp +++ b/third_party/HugeCTR/gpu_cache/include/gpu_cache_api.hpp @@ -31,25 +31,25 @@ class gpu_cache_api { // Query API, i.e. A single read from the cache virtual void Query(const key_type* d_keys, const size_t len, float* d_values, uint64_t* d_missing_index, key_type* d_missing_keys, size_t* d_missing_len, - cudaStream_t stream, + hipStream_t stream, const size_t task_per_warp_tile = TASK_PER_WARP_TILE_MACRO) = 0; // Replace API, i.e. Follow the Query API to update the content of the cache to Most Recent virtual void Replace(const key_type* d_keys, const size_t len, const float* d_values, - cudaStream_t stream, + hipStream_t stream, const size_t task_per_warp_tile = TASK_PER_WARP_TILE_MACRO) = 0; // Update API, i.e. update the embeddings which exist in the cache virtual void Update(const key_type* d_keys, const size_t len, const float* d_values, - cudaStream_t stream, + hipStream_t stream, const size_t task_per_warp_tile = TASK_PER_WARP_TILE_MACRO) = 0; // Dump API, i.e. 
dump some slabsets' keys from the cache virtual void Dump(key_type* d_keys, size_t* d_dump_counter, const size_t start_set_index, - const size_t end_set_index, cudaStream_t stream) = 0; + const size_t end_set_index, hipStream_t stream) = 0; // Record all the lookup stream of a specific cache for Update/Replace sync - virtual void Record(cudaStream_t stream) = 0; + virtual void Record(hipStream_t stream) = 0; }; } // namespace gpu_cache diff --git a/third_party/HugeCTR/gpu_cache/include/gpu_cache_api.hpp.prehip b/third_party/HugeCTR/gpu_cache/include/gpu_cache_api.hpp.prehip new file mode 100644 index 000000000000..563aa5b5b75f --- /dev/null +++ b/third_party/HugeCTR/gpu_cache/include/gpu_cache_api.hpp.prehip @@ -0,0 +1,55 @@ +/* + * Copyright (c) 2021, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once +#include + +#define TASK_PER_WARP_TILE_MACRO 1 + +namespace gpu_cache { + +/////////////////////////////////////////////////////////////////////////////////////////////////// + +// GPU Cache API +template +class gpu_cache_api { + public: + virtual ~gpu_cache_api() noexcept(false) {} + // Query API, i.e. A single read from the cache + virtual void Query(const key_type* d_keys, const size_t len, float* d_values, + uint64_t* d_missing_index, key_type* d_missing_keys, size_t* d_missing_len, + cudaStream_t stream, + const size_t task_per_warp_tile = TASK_PER_WARP_TILE_MACRO) = 0; + + // Replace API, i.e. Follow the Query API to update the content of the cache to Most Recent + virtual void Replace(const key_type* d_keys, const size_t len, const float* d_values, + cudaStream_t stream, + const size_t task_per_warp_tile = TASK_PER_WARP_TILE_MACRO) = 0; + + // Update API, i.e. update the embeddings which exist in the cache + virtual void Update(const key_type* d_keys, const size_t len, const float* d_values, + cudaStream_t stream, + const size_t task_per_warp_tile = TASK_PER_WARP_TILE_MACRO) = 0; + + // Dump API, i.e. dump some slabsets' keys from the cache + virtual void Dump(key_type* d_keys, size_t* d_dump_counter, const size_t start_set_index, + const size_t end_set_index, cudaStream_t stream) = 0; + + // Record all the lookup stream of a specific cache for Update/Replace sync + virtual void Record(cudaStream_t stream) = 0; +}; + +} // namespace gpu_cache diff --git a/third_party/HugeCTR/gpu_cache/include/nv_gpu_cache.hpp b/third_party/HugeCTR/gpu_cache/include/nv_gpu_cache.hpp index 7cc61b58d78f..68045b10c588 100644 --- a/third_party/HugeCTR/gpu_cache/include/nv_gpu_cache.hpp +++ b/third_party/HugeCTR/gpu_cache/include/nv_gpu_cache.hpp @@ -61,22 +61,22 @@ class gpu_cache : public gpu_cache_api { // Query API, i.e. 
A single read from the cache void Query(const key_type* d_keys, const size_t len, float* d_values, uint64_t* d_missing_index, - key_type* d_missing_keys, size_t* d_missing_len, cudaStream_t stream, + key_type* d_missing_keys, size_t* d_missing_len, hipStream_t stream, const size_t task_per_warp_tile = TASK_PER_WARP_TILE_MACRO) override; // Replace API, i.e. Follow the Query API to update the content of the cache to Most Recent - void Replace(const key_type* d_keys, const size_t len, const float* d_values, cudaStream_t stream, + void Replace(const key_type* d_keys, const size_t len, const float* d_values, hipStream_t stream, const size_t task_per_warp_tile = TASK_PER_WARP_TILE_MACRO) override; // Update API, i.e. update the embeddings which exist in the cache - void Update(const key_type* d_keys, const size_t len, const float* d_values, cudaStream_t stream, + void Update(const key_type* d_keys, const size_t len, const float* d_values, hipStream_t stream, const size_t task_per_warp_tile = TASK_PER_WARP_TILE_MACRO) override; // Dump API, i.e. dump some slabsets' keys from the cache void Dump(key_type* d_keys, size_t* d_dump_counter, const size_t start_set_index, - const size_t end_set_index, cudaStream_t stream) override; + const size_t end_set_index, hipStream_t stream) override; - void Record(cudaStream_t stream) override {} + void Record(hipStream_t stream) override {} public: using slabset = slab_set; diff --git a/third_party/HugeCTR/gpu_cache/include/nv_gpu_cache.hpp.prehip b/third_party/HugeCTR/gpu_cache/include/nv_gpu_cache.hpp.prehip new file mode 100644 index 000000000000..7cc61b58d78f --- /dev/null +++ b/third_party/HugeCTR/gpu_cache/include/nv_gpu_cache.hpp.prehip @@ -0,0 +1,122 @@ +/* + * Copyright (c) 2023, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +#pragma once + +#include + +#include +#include +#include + +#include "gpu_cache_api.hpp" +#ifdef LIBCUDACXX_VERSION +#include +#include +#endif + +#define SET_ASSOCIATIVITY 2 +#define SLAB_SIZE 32 +#define TASK_PER_WARP_TILE_MACRO 1 + +namespace gpu_cache { + +// slab for static slab list +template +struct static_slab { + key_type slab_[warp_size]; +}; + +// Static slablist(slabset) for GPU Cache +template +struct slab_set { + static_slab set_[set_associativity]; +}; + +/////////////////////////////////////////////////////////////////////////////////////////////////// + +// GPU Cache +template , + typename slab_hasher = Mod_Hash> +class gpu_cache : public gpu_cache_api { + public: + // Ctor + gpu_cache(const size_t capacity_in_set, const size_t embedding_vec_size); + + // Dtor + ~gpu_cache(); + + // Query API, i.e. A single read from the cache + void Query(const key_type* d_keys, const size_t len, float* d_values, uint64_t* d_missing_index, + key_type* d_missing_keys, size_t* d_missing_len, cudaStream_t stream, + const size_t task_per_warp_tile = TASK_PER_WARP_TILE_MACRO) override; + + // Replace API, i.e. 
Follow the Query API to update the content of the cache to Most Recent + void Replace(const key_type* d_keys, const size_t len, const float* d_values, cudaStream_t stream, + const size_t task_per_warp_tile = TASK_PER_WARP_TILE_MACRO) override; + + // Update API, i.e. update the embeddings which exist in the cache + void Update(const key_type* d_keys, const size_t len, const float* d_values, cudaStream_t stream, + const size_t task_per_warp_tile = TASK_PER_WARP_TILE_MACRO) override; + + // Dump API, i.e. dump some slabsets' keys from the cache + void Dump(key_type* d_keys, size_t* d_dump_counter, const size_t start_set_index, + const size_t end_set_index, cudaStream_t stream) override; + + void Record(cudaStream_t stream) override {} + + public: + using slabset = slab_set; +#ifdef LIBCUDACXX_VERSION + using atomic_ref_counter_type = cuda::atomic; + using mutex = cuda::binary_semaphore; +#endif + + private: + static const size_t BLOCK_SIZE_ = 64; + + // Cache data + slabset* keys_; + float* vals_; + ref_counter_type* slot_counter_; + + // Global counter +#ifdef LIBCUDACXX_VERSION + atomic_ref_counter_type* global_counter_; +#else + ref_counter_type* global_counter_; +#endif + // CUDA device + int dev_; + + // Cache capacity + size_t capacity_in_set_; + size_t num_slot_; + + // Embedding vector size + size_t embedding_vec_size_; + +#ifdef LIBCUDACXX_VERSION + // Array of mutex to protect (sub-)warp-level data structure, each mutex protect 1 slab set + mutex* set_mutex_; +#else + // Array of flag to protect (sub-)warp-level data structure, each flag act as a mutex and protect + // 1 slab set 1 for unlock, 0 for lock + int* set_mutex_; +#endif +}; + +} // namespace gpu_cache diff --git a/third_party/HugeCTR/gpu_cache/include/nv_util.h b/third_party/HugeCTR/gpu_cache/include/nv_util.h index f67ad6be2daf..a0a73cc1472f 100644 --- a/third_party/HugeCTR/gpu_cache/include/nv_util.h +++ b/third_party/HugeCTR/gpu_cache/include/nv_util.h @@ -15,9 +15,9 @@ */ #pragma once -#include -#include -#include +#include +#include +#include #include #include @@ -31,27 +31,27 @@ template struct is_fp8 : std::false_type {}; template <> -struct is_fp8<__nv_fp8_e4m3> : std::true_type {}; +struct is_fp8<__hip_fp8_e4m3> : std::true_type {}; template <> -struct is_fp8<__nv_fp8_e5m2> : std::true_type {}; +struct is_fp8<__hip_fp8_e5m2> : std::true_type {}; class CudaException : public std::runtime_error { public: CudaException(const std::string& what) : runtime_error(what) {} }; -inline void cuda_check_(cudaError_t val, const char* file, int line) { - if (val != cudaSuccess) { +inline void cuda_check_(hipError_t val, const char* file, int line) { + if (val != hipSuccess) { throw CudaException(std::string(file) + ":" + std::to_string(line) + ": CUDA error " + - std::to_string(val) + ": " + cudaGetErrorString(val)); + std::to_string(val) + ": " + hipGetErrorString(val)); } } class CudaDeviceRestorer { public: - CudaDeviceRestorer() { CUDA_CHECK(cudaGetDevice(&dev_)); } - ~CudaDeviceRestorer() { CUDA_CHECK(cudaSetDevice(dev_)); } + CudaDeviceRestorer() { CUDA_CHECK(hipGetDevice(&dev_)); } + ~CudaDeviceRestorer() { CUDA_CHECK(hipSetDevice(dev_)); } void check_device(int device) const { if (device != dev_) { throw std::runtime_error( @@ -65,14 +65,14 @@ class CudaDeviceRestorer { }; inline int get_dev(const void* ptr) { - cudaPointerAttributes attr; - CUDA_CHECK(cudaPointerGetAttributes(&attr, ptr)); + hipPointerAttribute_t attr; + CUDA_CHECK(hipPointerGetAttributes(&attr, ptr)); int dev = -1; #if CUDART_VERSION >= 10000 - if 
(attr.type == cudaMemoryTypeDevice) + if (attr.type == hipMemoryTypeDevice) #else - if (attr.memoryType == cudaMemoryTypeDevice) + if (attr.memoryType == hipMemoryTypeDevice) #endif { dev = attr.device; @@ -83,7 +83,7 @@ inline int get_dev(const void* ptr) { inline void switch_to_dev(const void* ptr) { int dev = get_dev(ptr); if (dev >= 0) { - CUDA_CHECK(cudaSetDevice(dev)); + CUDA_CHECK(hipSetDevice(dev)); } } diff --git a/third_party/HugeCTR/gpu_cache/include/nv_util.h.prehip b/third_party/HugeCTR/gpu_cache/include/nv_util.h.prehip new file mode 100644 index 000000000000..f67ad6be2daf --- /dev/null +++ b/third_party/HugeCTR/gpu_cache/include/nv_util.h.prehip @@ -0,0 +1,90 @@ +/* + * Copyright (c) 2023, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +#pragma once + +#include +#include +#include + +#include +#include + +#define CUDA_CHECK(val) \ + { nv::cuda_check_((val), __FILE__, __LINE__); } + +namespace nv { + +template +struct is_fp8 : std::false_type {}; + +template <> +struct is_fp8<__nv_fp8_e4m3> : std::true_type {}; + +template <> +struct is_fp8<__nv_fp8_e5m2> : std::true_type {}; + +class CudaException : public std::runtime_error { + public: + CudaException(const std::string& what) : runtime_error(what) {} +}; + +inline void cuda_check_(cudaError_t val, const char* file, int line) { + if (val != cudaSuccess) { + throw CudaException(std::string(file) + ":" + std::to_string(line) + ": CUDA error " + + std::to_string(val) + ": " + cudaGetErrorString(val)); + } +} + +class CudaDeviceRestorer { + public: + CudaDeviceRestorer() { CUDA_CHECK(cudaGetDevice(&dev_)); } + ~CudaDeviceRestorer() { CUDA_CHECK(cudaSetDevice(dev_)); } + void check_device(int device) const { + if (device != dev_) { + throw std::runtime_error( + std::string(__FILE__) + ":" + std::to_string(__LINE__) + + ": Runtime Error: The device id in the context is not consistent with configuration"); + } + } + + private: + int dev_; +}; + +inline int get_dev(const void* ptr) { + cudaPointerAttributes attr; + CUDA_CHECK(cudaPointerGetAttributes(&attr, ptr)); + int dev = -1; + +#if CUDART_VERSION >= 10000 + if (attr.type == cudaMemoryTypeDevice) +#else + if (attr.memoryType == cudaMemoryTypeDevice) +#endif + { + dev = attr.device; + } + return dev; +} + +inline void switch_to_dev(const void* ptr) { + int dev = get_dev(ptr); + if (dev >= 0) { + CUDA_CHECK(cudaSetDevice(dev)); + } +} + +} // namespace nv diff --git a/third_party/HugeCTR/gpu_cache/src/nv_gpu_cache.cu b/third_party/HugeCTR/gpu_cache/src/nv_gpu_cache.cu index 8dc21395e329..4e982029fec1 100644 --- a/third_party/HugeCTR/gpu_cache/src/nv_gpu_cache.cu +++ b/third_party/HugeCTR/gpu_cache/src/nv_gpu_cache.cu @@ -1,3 +1,4 @@ +#include "hip/hip_runtime.h" /* * Copyright (c) 2023, NVIDIA CORPORATION. * @@ -14,7 +15,7 @@ * limitations under the License. 
*/ -#include +#include #include @@ -1253,27 +1254,27 @@ gpu_cache init_cache<<<((num_slot_ - 1) / BLOCK_SIZE_) + 1, BLOCK_SIZE_>>>( keys_, slot_counter_, global_counter_, num_slot_, empty_key, set_mutex_, capacity_in_set_); // Wait for initialization to finish - CUDA_CHECK(cudaStreamSynchronize(0)); - CUDA_CHECK(cudaGetLastError()); + CUDA_CHECK(hipStreamSynchronize(0)); + CUDA_CHECK(hipGetLastError()); } #else template init_cache<<<((num_slot_ - 1) / BLOCK_SIZE_) + 1, BLOCK_SIZE_>>>( keys_, slot_counter_, global_counter_, num_slot_, empty_key, set_mutex_, capacity_in_set_); // Wait for initialization to finish - CUDA_CHECK(cudaStreamSynchronize(0)); - CUDA_CHECK(cudaGetLastError()); + CUDA_CHECK(hipStreamSynchronize(0)); + CUDA_CHECK(hipGetLastError()); } #endif @@ -1340,15 +1341,15 @@ gpu_cache>>( global_counter_, set_mutex_, capacity_in_set_); // Wait for destruction to finish - CUDA_CHECK(cudaStreamSynchronize(0)); + CUDA_CHECK(hipStreamSynchronize(0)); // Free GPU memory for cache - CUDA_CHECK(cudaFree(keys_)); - CUDA_CHECK(cudaFree(vals_)); - CUDA_CHECK(cudaFree(slot_counter_)); - CUDA_CHECK(cudaFree(global_counter_)); + CUDA_CHECK(hipFree(keys_)); + CUDA_CHECK(hipFree(vals_)); + CUDA_CHECK(hipFree(slot_counter_)); + CUDA_CHECK(hipFree(global_counter_)); // Free GPU memory for set mutex - CUDA_CHECK(cudaFree(set_mutex_)); + CUDA_CHECK(hipFree(set_mutex_)); } #else template ::Query(const key_type* d_keys, const size_t len, float* d_values, uint64_t* d_missing_index, key_type* d_missing_keys, - size_t* d_missing_len, cudaStream_t stream, + size_t* d_missing_len, hipStream_t stream, const size_t task_per_warp_tile) { // Device Restorer nv::CudaDeviceRestorer dev_restorer; @@ -1387,7 +1388,7 @@ void gpu_cache::Query(const key_type* d_keys, const size_t len, float* d_values, uint64_t* d_missing_index, key_type* d_missing_keys, - size_t* d_missing_len, cudaStream_t stream, + size_t* d_missing_len, hipStream_t stream, const size_t task_per_warp_tile) { // Device Restorer nv::CudaDeviceRestorer dev_restorer; @@ -1425,7 +1426,7 @@ void gpu_cache void gpu_cache::Replace(const key_type* d_keys, const size_t len, - const float* d_values, cudaStream_t stream, + const float* d_values, hipStream_t stream, const size_t task_per_warp_tile) { // Check if it is a valid replacement if (len == 0) { @@ -1477,14 +1478,14 @@ void gpu_cache void gpu_cache::Replace(const key_type* d_keys, const size_t len, - const float* d_values, cudaStream_t stream, + const float* d_values, hipStream_t stream, const size_t task_per_warp_tile) { // Check if it is a valid replacement if (len == 0) { @@ -1506,7 +1507,7 @@ void gpu_cache void gpu_cache::Update(const key_type* d_keys, const size_t len, const float* d_values, - cudaStream_t stream, const size_t task_per_warp_tile) { + hipStream_t stream, const size_t task_per_warp_tile) { // Check if it is a valid update request if (len == 0) { return; @@ -1535,14 +1536,14 @@ void gpu_cache void gpu_cache::Update(const key_type* d_keys, const size_t len, const float* d_values, - cudaStream_t stream, const size_t task_per_warp_tile) { + hipStream_t stream, const size_t task_per_warp_tile) { // Check if it is a valid update request if (len == 0) { return; @@ -1562,7 +1563,7 @@ void gpu_cache::Dump(key_type* d_keys, size_t* d_dump_counter, const size_t start_set_index, const size_t end_set_index, - cudaStream_t stream) { + hipStream_t stream) { // Check if it is a valid dump request if (start_set_index >= capacity_in_set_) { printf("Error: Invalid value for start_set_index. 
Nothing dumped.\n"); @@ -1589,7 +1590,7 @@ void gpu_cache::Dump(key_type* d_keys, size_t* d_dump_counter, const size_t start_set_index, const size_t end_set_index, - cudaStream_t stream) { + hipStream_t stream) { // Check if it is a valid dump request if (start_set_index >= capacity_in_set_) { printf("Error: Invalid value for start_set_index. Nothing dumped.\n"); @@ -1624,7 +1625,7 @@ void gpu_cache + +#include + +namespace cg = cooperative_groups; + +// Overload CUDA atomic for other 64bit unsigned/signed integer type +__forceinline__ __device__ long atomicAdd(long* address, long val) { + return (long)atomicAdd((unsigned long long*)address, (unsigned long long)val); +} + +__forceinline__ __device__ long long atomicAdd(long long* address, long long val) { + return (long long)atomicAdd((unsigned long long*)address, (unsigned long long)val); +} + +__forceinline__ __device__ unsigned long atomicAdd(unsigned long* address, unsigned long val) { + return (unsigned long)atomicAdd((unsigned long long*)address, (unsigned long long)val); +} + +namespace gpu_cache { + +#ifdef LIBCUDACXX_VERSION +template +__forceinline__ __device__ void warp_tile_copy(const size_t lane_idx, + const size_t emb_vec_size_in_float, float* d_dst, + const float* d_src) { +#pragma unroll + for (size_t i = lane_idx; i < emb_vec_size_in_float; i += warp_size) { + d_dst[i] = d_src[i]; + } +} +#else +template +__forceinline__ __device__ void warp_tile_copy(const size_t lane_idx, + const size_t emb_vec_size_in_float, + volatile float* d_dst, volatile float* d_src) { + +#pragma unroll + for (size_t i = lane_idx; i < emb_vec_size_in_float; i += warp_size) { + d_dst[i] = d_src[i]; + } +} +#endif + +#ifdef LIBCUDACXX_VERSION +// Will be called by multiple thread_block_tile((sub-)warp) on the same mutex +// Expect only one thread_block_tile return to execute critical section at any time +template +__forceinline__ __device__ void warp_lock_mutex(const cg::thread_block_tile& warp_tile, + mutex& set_mutex) { + // The first thread of this (sub-)warp to acquire the lock + if (warp_tile.thread_rank() == 0) { + set_mutex.acquire(); + } + warp_tile.sync(); // Synchronize the threads in the (sub-)warp. Execution barrier + memory fence +} + +// The (sub-)warp holding the mutex will unlock the mutex after finishing the critical section on a +// set Expect any following (sub-)warp that acquire the mutex can see its modification done in the +// critical section +template +__forceinline__ __device__ void warp_unlock_mutex(const cg::thread_block_tile& warp_tile, + mutex& set_mutex) { + warp_tile.sync(); // Synchronize the threads in the (sub-)warp. Execution barrier + memory fence + // The first thread of this (sub-)warp to release the lock + if (warp_tile.thread_rank() == 0) { + set_mutex.release(); + } +} +#else +// Will be called by multiple thread_block_tile((sub-)warp) on the same mutex +// Expect only one thread_block_tile return to execute critical section at any time +template +__forceinline__ __device__ void warp_lock_mutex(const cg::thread_block_tile& warp_tile, + volatile int& set_mutex) { + // The first thread of this (sub-)warp to acquire the lock + if (warp_tile.thread_rank() == 0) { + while (0 == atomicCAS((int*)&set_mutex, 1, 0)) + ; + } + __threadfence(); + warp_tile.sync(); // Synchronize the threads in the (sub-)warp. 
Execution barrier + memory fence +} + +// The (sub-)warp holding the mutex will unlock the mutex after finishing the critical section on a +// set Expect any following (sub-)warp that acquire the mutex can see its modification done in the +// critical section +template +__forceinline__ __device__ void warp_unlock_mutex(const cg::thread_block_tile& warp_tile, + volatile int& set_mutex) { + __threadfence(); + warp_tile.sync(); // Synchronize the threads in the (sub-)warp. Execution barrier + memory fence + // The first thread of this (sub-)warp to release the lock + if (warp_tile.thread_rank() == 0) { + atomicExch((int*)&set_mutex, 1); + } +} +#endif + +// The (sub-)warp doing all reduction to find the slot with min slot_counter +// The slot with min slot_counter is the LR slot. +template +__forceinline__ __device__ void warp_min_reduction( + const cg::thread_block_tile& warp_tile, ref_counter_type& min_slot_counter_val, + size_t& slab_distance, size_t& slot_distance) { + const size_t lane_idx = warp_tile.thread_rank(); + slot_distance = lane_idx; + + for (size_t i = (warp_tile.size() >> 1); i > 0; i = i >> 1) { + ref_counter_type input_slot_counter_val = warp_tile.shfl_xor(min_slot_counter_val, (int)i); + size_t input_slab_distance = warp_tile.shfl_xor(slab_distance, (int)i); + size_t input_slot_distance = warp_tile.shfl_xor(slot_distance, (int)i); + + if (input_slot_counter_val == min_slot_counter_val) { + if (input_slab_distance == slab_distance) { + if (input_slot_distance < slot_distance) { + slot_distance = input_slot_distance; + } + } else if (input_slab_distance < slab_distance) { + slab_distance = input_slab_distance; + slot_distance = input_slot_distance; + } + } else if (input_slot_counter_val < min_slot_counter_val) { + min_slot_counter_val = input_slot_counter_val; + slab_distance = input_slab_distance; + slot_distance = input_slot_distance; + } + } +} + +/////////////////////////////////////////////////////////////////////////////////////////////////// + +#ifdef LIBCUDACXX_VERSION +// Kernel to initialize the GPU cache +// Init every entry of the cache with pair +template +__global__ void init_cache(slabset* keys, ref_counter_type* slot_counter, + atomic_ref_counter_type* global_counter, const size_t num_slot, + const key_type empty_key, mutex* set_mutex, + const size_t capacity_in_set) { + const size_t idx = blockIdx.x * blockDim.x + threadIdx.x; + if (idx < num_slot) { + // Set the key of this slot to unused key + // Flatten the cache + key_type* key_slot = (key_type*)keys; + key_slot[idx] = empty_key; + + // Clear the counter for this slot + slot_counter[idx] = 0; + } + // First CUDA thread clear the global counter + if (idx == 0) { + new (global_counter) atomic_ref_counter_type(0); + } + + // First capacity_in_set CUDA thread initialize mutex + if (idx < capacity_in_set) { + new (set_mutex + idx) mutex(1); + } +} + +template +__global__ void destruct_kernel(atomic_ref_counter_type* global_counter, mutex* set_mutex, + const size_t capacity_in_set) { + const size_t idx = blockIdx.x * blockDim.x + threadIdx.x; + // First CUDA thread destruct the global_counter + if (idx == 0) { + global_counter->~atomic_ref_counter_type(); + } + // First capacity_in_set CUDA thread destruct the set mutex + if (idx < capacity_in_set) { + (set_mutex + idx)->~mutex(); + } +} +#else +// Kernel to initialize the GPU cache +// Init every entry of the cache with pair +template +__global__ void init_cache(slabset* keys, ref_counter_type* slot_counter, + ref_counter_type* global_counter, const size_t 
num_slot, + const key_type empty_key, int* set_mutex, const size_t capacity_in_set) { + const size_t idx = blockIdx.x * blockDim.x + threadIdx.x; + if (idx < num_slot) { + // Set the key of this slot to unused key + // Flatten the cache + key_type* key_slot = (key_type*)keys; + key_slot[idx] = empty_key; + + // Clear the counter for this slot + slot_counter[idx] = 0; + } + // First CUDA thread clear the global counter + if (idx == 0) { + global_counter[idx] = 0; + } + + // First capacity_in_set CUDA thread initialize mutex + if (idx < capacity_in_set) { + set_mutex[idx] = 1; + } +} +#endif + +// Kernel to update global counter +// Resolve distance overflow issue as well +#ifdef LIBCUDACXX_VERSION +template +__global__ void update_kernel_overflow_ignore(atomic_ref_counter_type* global_counter, + size_t* d_missing_len) { + // Update global counter + global_counter->fetch_add(1, cuda::std::memory_order_relaxed); + *d_missing_len = 0; +} +#else +template +__global__ void update_kernel_overflow_ignore(ref_counter_type* global_counter, + size_t* d_missing_len) { + // Update global counter + atomicAdd(global_counter, 1); + *d_missing_len = 0; +} +#endif + +#ifdef LIBCUDACXX_VERSION +// Kernel to read from cache +// Also update locality information for touched slot +template +__global__ void get_kernel(const key_type* d_keys, const size_t len, float* d_values, + const size_t embedding_vec_size, uint64_t* d_missing_index, + key_type* d_missing_keys, size_t* d_missing_len, + const atomic_ref_counter_type* global_counter, + ref_counter_type* slot_counter, const size_t capacity_in_set, + const slabset* keys, const float* vals, mutex* set_mutex, + const size_t task_per_warp_tile) { + // Lane(thread) ID within a warp_tile + cg::thread_block_tile warp_tile = + cg::tiled_partition(cg::this_thread_block()); + const size_t lane_idx = warp_tile.thread_rank(); + // Warp tile global ID + const size_t warp_tile_global_idx = + (blockIdx.x * (blockDim.x / warp_size)) + warp_tile.meta_group_rank(); + // The index of key for this thread + const size_t key_idx = (warp_tile_global_idx * task_per_warp_tile) + lane_idx; + // The assigned key for this lane(thread) + key_type key; + // The dst slabset and the dst slab inside this set + size_t src_set; + size_t src_slab; + // The variable that contains the missing key + key_type missing_key; + // The variable that contains the index for the missing key + uint64_t missing_index; + // The counter for counting the missing key in this warp + uint8_t warp_missing_counter = 0; + // Active flag: whether current lane(thread) has unfinished task + bool active = false; + if (lane_idx < task_per_warp_tile) { + if (key_idx < len) { + active = true; + key = d_keys[key_idx]; + src_set = set_hasher::hash(key) % capacity_in_set; + src_slab = slab_hasher::hash(key) % set_associativity; + } + } + + // Lane participate in warp_tile ballot to produce warp-level work queue + unsigned active_mask = warp_tile.ballot(active); + + // The warp-level outer loop: finish all the tasks within the work queue + while (active_mask != 0) { + // Next task in the work quere, start from lower index lane(thread) + int next_lane = __ffs(active_mask) - 1; + // Broadcast the task and the global index to all lane in the warp_tile + key_type next_key = warp_tile.shfl(key, next_lane); + size_t next_idx = warp_tile.shfl(key_idx, next_lane); + size_t next_set = warp_tile.shfl(src_set, next_lane); + size_t next_slab = warp_tile.shfl(src_slab, next_lane); + + // Counter to record how many slab have been searched + 
size_t counter = 0; + + // Working queue before task started + const unsigned old_active_mask = active_mask; + + // Lock the slabset before operating the slabset + warp_lock_mutex(warp_tile, set_mutex[next_set]); + + // The warp-level inner loop: finish a single task in the work queue + while (active_mask == old_active_mask) { + // When all the slabs inside a slabset have been searched, mark missing task, task is + // completed + if (counter >= set_associativity) { + if (lane_idx == warp_missing_counter) { + missing_key = next_key; + missing_index = next_idx; + } + + if (lane_idx == (size_t)next_lane) { + active = false; + } + + warp_missing_counter++; + active_mask = warp_tile.ballot(active); + break; + } + + // The warp_tile read out the slab + key_type read_key = keys[next_set].set_[next_slab].slab_[lane_idx]; + + // Compare the slab data with the target key + int found_lane = __ffs(warp_tile.ballot(read_key == next_key)) - 1; + + // If found, mark hit task, copy the founded data, the task is completed + if (found_lane >= 0) { + size_t found_offset = (next_set * set_associativity + next_slab) * warp_size + found_lane; + if (lane_idx == (size_t)next_lane) { + slot_counter[found_offset] = global_counter->load(cuda::std::memory_order_relaxed); + active = false; + } + + warp_tile_copy(lane_idx, embedding_vec_size, + d_values + next_idx * embedding_vec_size, + vals + found_offset * embedding_vec_size); + + active_mask = warp_tile.ballot(active); + break; + } + + // Compare the slab data with empty key, if found empty key, mark missing task, task is + // completed + if (warp_tile.ballot(read_key == empty_key) != 0) { + if (lane_idx == warp_missing_counter) { + missing_key = next_key; + missing_index = next_idx; + } + + if (lane_idx == (size_t)next_lane) { + active = false; + } + + warp_missing_counter++; + active_mask = warp_tile.ballot(active); + break; + } + + // Not found in this slab, the task is not completed, goto searching next slab + counter++; + next_slab = (next_slab + 1) % set_associativity; + } + + // Unlock the slabset after operating the slabset + warp_unlock_mutex(warp_tile, set_mutex[next_set]); + } + + // After warp_tile complete the working queue, save the result for output + // First thread of the warp_tile accumulate the missing length to global variable + size_t warp_position; + if (lane_idx == 0) { + warp_position = atomicAdd(d_missing_len, (size_t)warp_missing_counter); + } + warp_position = warp_tile.shfl(warp_position, 0); + + if (lane_idx < warp_missing_counter) { + d_missing_keys[warp_position + lane_idx] = missing_key; + d_missing_index[warp_position + lane_idx] = missing_index; + } +} +#else +// Kernel to read from cache +// Also update locality information for touched slot +template +__global__ void get_kernel(const key_type* d_keys, const size_t len, float* d_values, + const size_t embedding_vec_size, uint64_t* d_missing_index, + key_type* d_missing_keys, size_t* d_missing_len, + ref_counter_type* global_counter, + volatile ref_counter_type* slot_counter, const size_t capacity_in_set, + volatile slabset* keys, volatile float* vals, volatile int* set_mutex, + const size_t task_per_warp_tile) { + // Lane(thread) ID within a warp_tile + cg::thread_block_tile warp_tile = + cg::tiled_partition(cg::this_thread_block()); + const size_t lane_idx = warp_tile.thread_rank(); + // Warp tile global ID + const size_t warp_tile_global_idx = + (blockIdx.x * (blockDim.x / warp_size)) + warp_tile.meta_group_rank(); + // The index of key for this thread + const size_t key_idx = 
(warp_tile_global_idx * task_per_warp_tile) + lane_idx; + // The assigned key for this lane(thread) + key_type key; + // The dst slabset and the dst slab inside this set + size_t src_set; + size_t src_slab; + // The variable that contains the missing key + key_type missing_key; + // The variable that contains the index for the missing key + uint64_t missing_index; + // The counter for counting the missing key in this warp + uint8_t warp_missing_counter = 0; + // Active flag: whether current lane(thread) has unfinished task + bool active = false; + if (lane_idx < task_per_warp_tile) { + if (key_idx < len) { + active = true; + key = d_keys[key_idx]; + src_set = set_hasher::hash(key) % capacity_in_set; + src_slab = slab_hasher::hash(key) % set_associativity; + } + } + + // Lane participate in warp_tile ballot to produce warp-level work queue + unsigned active_mask = warp_tile.ballot(active); + + // The warp-level outer loop: finish all the tasks within the work queue + while (active_mask != 0) { + // Next task in the work quere, start from lower index lane(thread) + int next_lane = __ffs(active_mask) - 1; + // Broadcast the task and the global index to all lane in the warp_tile + key_type next_key = warp_tile.shfl(key, next_lane); + size_t next_idx = warp_tile.shfl(key_idx, next_lane); + size_t next_set = warp_tile.shfl(src_set, next_lane); + size_t next_slab = warp_tile.shfl(src_slab, next_lane); + + // Counter to record how many slab have been searched + size_t counter = 0; + + // Working queue before task started + const unsigned old_active_mask = active_mask; + + // Lock the slabset before operating the slabset + warp_lock_mutex(warp_tile, set_mutex[next_set]); + + // The warp-level inner loop: finish a single task in the work queue + while (active_mask == old_active_mask) { + // When all the slabs inside a slabset have been searched, mark missing task, task is + // completed + if (counter >= set_associativity) { + if (lane_idx == warp_missing_counter) { + missing_key = next_key; + missing_index = next_idx; + } + + if (lane_idx == (size_t)next_lane) { + active = false; + } + + warp_missing_counter++; + active_mask = warp_tile.ballot(active); + break; + } + + // The warp_tile read out the slab + key_type read_key = ((volatile key_type*)(keys[next_set].set_[next_slab].slab_))[lane_idx]; + + // Compare the slab data with the target key + int found_lane = __ffs(warp_tile.ballot(read_key == next_key)) - 1; + + // If found, mark hit task, copy the founded data, the task is completed + if (found_lane >= 0) { + size_t found_offset = (next_set * set_associativity + next_slab) * warp_size + found_lane; + if (lane_idx == (size_t)next_lane) { + slot_counter[found_offset] = atomicAdd(global_counter, 0); + active = false; + } + + warp_tile_copy(lane_idx, embedding_vec_size, + (volatile float*)(d_values + next_idx * embedding_vec_size), + (volatile float*)(vals + found_offset * embedding_vec_size)); + + active_mask = warp_tile.ballot(active); + break; + } + + // Compare the slab data with empty key, if found empty key, mark missing task, task is + // completed + if (warp_tile.ballot(read_key == empty_key) != 0) { + if (lane_idx == warp_missing_counter) { + missing_key = next_key; + missing_index = next_idx; + } + + if (lane_idx == (size_t)next_lane) { + active = false; + } + + warp_missing_counter++; + active_mask = warp_tile.ballot(active); + break; + } + + // Not found in this slab, the task is not completed, goto searching next slab + counter++; + next_slab = (next_slab + 1) % set_associativity; + } 
+ + // Unlock the slabset after operating the slabset + warp_unlock_mutex(warp_tile, set_mutex[next_set]); + } + + // After warp_tile complete the working queue, save the result for output + // First thread of the warp_tile accumulate the missing length to global variable + size_t warp_position; + if (lane_idx == 0) { + warp_position = atomicAdd(d_missing_len, (size_t)warp_missing_counter); + } + warp_position = warp_tile.shfl(warp_position, 0); + + if (lane_idx < warp_missing_counter) { + d_missing_keys[warp_position + lane_idx] = missing_key; + d_missing_index[warp_position + lane_idx] = missing_index; + } +} +#endif + +#ifdef LIBCUDACXX_VERSION +// Kernel to insert or replace the pairs into the cache +template ::max(), + size_t max_slab_distance = std::numeric_limits::max()> +__global__ void insert_replace_kernel(const key_type* d_keys, const float* d_values, + const size_t embedding_vec_size, const size_t len, + slabset* keys, float* vals, ref_counter_type* slot_counter, + mutex* set_mutex, + const atomic_ref_counter_type* global_counter, + const size_t capacity_in_set, + const size_t task_per_warp_tile) { + // Lane(thread) ID within a warp_tile + cg::thread_block_tile warp_tile = + cg::tiled_partition(cg::this_thread_block()); + const size_t lane_idx = warp_tile.thread_rank(); + // Warp tile global ID + const size_t warp_tile_global_idx = + (blockIdx.x * (blockDim.x / warp_size)) + warp_tile.meta_group_rank(); + // The index of key for this thread + const size_t key_idx = (warp_tile_global_idx * task_per_warp_tile) + lane_idx; + // The assigned key for this lane(thread) + key_type key; + // The dst slabset and the dst slab inside this set + size_t src_set; + size_t src_slab; + // Active flag: whether current lane(thread) has unfinished task + bool active = false; + if (lane_idx < task_per_warp_tile) { + if (key_idx < len) { + active = true; + key = d_keys[key_idx]; + src_set = set_hasher::hash(key) % capacity_in_set; + src_slab = slab_hasher::hash(key) % set_associativity; + } + } + + // Lane participate in warp_tile ballot to produce warp-level work queue + unsigned active_mask = warp_tile.ballot(active); + + // The warp-level outer loop: finish all the tasks within the work queue + while (active_mask != 0) { + // Next task in the work quere, start from lower index lane(thread) + int next_lane = __ffs(active_mask) - 1; + // Broadcast the task, the global index and the src slabset and slab to all lane in a warp_tile + key_type next_key = warp_tile.shfl(key, next_lane); + size_t next_idx = warp_tile.shfl(key_idx, next_lane); + size_t next_set = warp_tile.shfl(src_set, next_lane); + size_t next_slab = warp_tile.shfl(src_slab, next_lane); + size_t first_slab = next_slab; + + // Counter to record how many slab have been searched + size_t counter = 0; + + // Variable to keep the min slot counter during the probing + ref_counter_type min_slot_counter_val = max_ref_counter_type; + // Variable to keep the slab distance for slot with min counter + size_t slab_distance = max_slab_distance; + // Variable to keep the slot distance for slot with min counter within the slab + size_t slot_distance; + // Working queue before task started + const unsigned old_active_mask = active_mask; + + // Lock the slabset before operating the slabset + warp_lock_mutex(warp_tile, set_mutex[next_set]); + + // The warp-level inner loop: finish a single task in the work queue + while (active_mask == old_active_mask) { + // When all the slabs inside a slabset have been searched + // and no empty slots or target slots 
are found. Replace with LRU + if (counter >= set_associativity) { + // (sub)Warp all-reduction, the reduction result store in all threads + warp_min_reduction(warp_tile, min_slot_counter_val, + slab_distance, slot_distance); + + // Calculate the position of LR slot + size_t target_slab = (first_slab + slab_distance) % set_associativity; + size_t slot_index = + (next_set * set_associativity + target_slab) * warp_size + slot_distance; + + // Replace the LR slot + if (lane_idx == (size_t)next_lane) { + keys[next_set].set_[target_slab].slab_[slot_distance] = key; + slot_counter[slot_index] = global_counter->load(cuda::std::memory_order_relaxed); + } + + warp_tile_copy(lane_idx, embedding_vec_size, + vals + slot_index * embedding_vec_size, + d_values + next_idx * embedding_vec_size); + + // Replace complete, mark this task completed + if (lane_idx == (size_t)next_lane) { + active = false; + } + + active_mask = warp_tile.ballot(active); + break; + } + + // The warp_tile read out the slab + key_type read_key = keys[next_set].set_[next_slab].slab_[lane_idx]; + + // Compare the slab data with the target key + int found_lane = __ffs(warp_tile.ballot(read_key == next_key)) - 1; + + // If found target key, the insertion/replace is no longer needed. + // Refresh the slot, the task is completed + if (found_lane >= 0) { + size_t found_offset = (next_set * set_associativity + next_slab) * warp_size + found_lane; + if (lane_idx == (size_t)next_lane) { + slot_counter[found_offset] = global_counter->load(cuda::std::memory_order_relaxed); + active = false; + } + + active_mask = warp_tile.ballot(active); + break; + } + + // Compare the slab data with empty key. + // If found empty key, do insertion,the task is complete + found_lane = __ffs(warp_tile.ballot(read_key == empty_key)) - 1; + if (found_lane >= 0) { + size_t found_offset = (next_set * set_associativity + next_slab) * warp_size + found_lane; + + if (lane_idx == (size_t)next_lane) { + keys[next_set].set_[next_slab].slab_[found_lane] = key; + slot_counter[found_offset] = global_counter->load(cuda::std::memory_order_relaxed); + } + + warp_tile_copy(lane_idx, embedding_vec_size, + vals + found_offset * embedding_vec_size, + d_values + next_idx * embedding_vec_size); + + if (lane_idx == (size_t)next_lane) { + active = false; + } + + active_mask = warp_tile.ballot(active); + break; + } + + // If no target or unused slot found in this slab, + // Refresh LR info, continue probing + ref_counter_type read_slot_counter = + slot_counter[(next_set * set_associativity + next_slab) * warp_size + lane_idx]; + if (read_slot_counter < min_slot_counter_val) { + min_slot_counter_val = read_slot_counter; + slab_distance = counter; + } + + counter++; + next_slab = (next_slab + 1) % set_associativity; + } + + // Unlock the slabset after operating the slabset + warp_unlock_mutex(warp_tile, set_mutex[next_set]); + } +} +#else +// Kernel to insert or replace the pairs into the cache +template ::max(), + size_t max_slab_distance = std::numeric_limits::max()> +__global__ void insert_replace_kernel(const key_type* d_keys, const float* d_values, + const size_t embedding_vec_size, const size_t len, + volatile slabset* keys, volatile float* vals, + volatile ref_counter_type* slot_counter, + volatile int* set_mutex, ref_counter_type* global_counter, + const size_t capacity_in_set, + const size_t task_per_warp_tile) { + // Lane(thread) ID within a warp_tile + cg::thread_block_tile warp_tile = + cg::tiled_partition(cg::this_thread_block()); + const size_t lane_idx = 
warp_tile.thread_rank(); + // Warp tile global ID + const size_t warp_tile_global_idx = + (blockIdx.x * (blockDim.x / warp_size)) + warp_tile.meta_group_rank(); + // The index of key for this thread + const size_t key_idx = (warp_tile_global_idx * task_per_warp_tile) + lane_idx; + // The assigned key for this lane(thread) + key_type key; + // The dst slabset and the dst slab inside this set + size_t src_set; + size_t src_slab; + // Active flag: whether current lane(thread) has unfinished task + bool active = false; + if (lane_idx < task_per_warp_tile) { + if (key_idx < len) { + active = true; + key = d_keys[key_idx]; + src_set = set_hasher::hash(key) % capacity_in_set; + src_slab = slab_hasher::hash(key) % set_associativity; + } + } + + // Lane participate in warp_tile ballot to produce warp-level work queue + unsigned active_mask = warp_tile.ballot(active); + + // The warp-level outer loop: finish all the tasks within the work queue + while (active_mask != 0) { + // Next task in the work quere, start from lower index lane(thread) + int next_lane = __ffs(active_mask) - 1; + // Broadcast the task, the global index and the src slabset and slab to all lane in a warp_tile + key_type next_key = warp_tile.shfl(key, next_lane); + size_t next_idx = warp_tile.shfl(key_idx, next_lane); + size_t next_set = warp_tile.shfl(src_set, next_lane); + size_t next_slab = warp_tile.shfl(src_slab, next_lane); + size_t first_slab = next_slab; + + // Counter to record how many slab have been searched + size_t counter = 0; + + // Variable to keep the min slot counter during the probing + ref_counter_type min_slot_counter_val = max_ref_counter_type; + // Variable to keep the slab distance for slot with min counter + size_t slab_distance = max_slab_distance; + // Variable to keep the slot distance for slot with min counter within the slab + size_t slot_distance; + // Working queue before task started + const unsigned old_active_mask = active_mask; + + // Lock the slabset before operating the slabset + warp_lock_mutex(warp_tile, set_mutex[next_set]); + + // The warp-level inner loop: finish a single task in the work queue + while (active_mask == old_active_mask) { + // When all the slabs inside a slabset have been searched + // and no empty slots or target slots are found. 
Replace with LRU + if (counter >= set_associativity) { + // (sub)Warp all-reduction, the reduction result store in all threads + warp_min_reduction(warp_tile, min_slot_counter_val, + slab_distance, slot_distance); + + // Calculate the position of LR slot + size_t target_slab = (first_slab + slab_distance) % set_associativity; + size_t slot_index = + (next_set * set_associativity + target_slab) * warp_size + slot_distance; + + // Replace the LR slot + if (lane_idx == (size_t)next_lane) { + ((volatile key_type*)(keys[next_set].set_[target_slab].slab_))[slot_distance] = key; + slot_counter[slot_index] = atomicAdd(global_counter, 0); + } + + warp_tile_copy(lane_idx, embedding_vec_size, + (volatile float*)(vals + slot_index * embedding_vec_size), + (volatile float*)(d_values + next_idx * embedding_vec_size)); + + // Replace complete, mark this task completed + if (lane_idx == (size_t)next_lane) { + active = false; + } + + active_mask = warp_tile.ballot(active); + break; + } + + // The warp_tile read out the slab + key_type read_key = ((volatile key_type*)(keys[next_set].set_[next_slab].slab_))[lane_idx]; + + // Compare the slab data with the target key + int found_lane = __ffs(warp_tile.ballot(read_key == next_key)) - 1; + + // If found target key, the insertion/replace is no longer needed. + // Refresh the slot, the task is completed + if (found_lane >= 0) { + size_t found_offset = (next_set * set_associativity + next_slab) * warp_size + found_lane; + if (lane_idx == (size_t)next_lane) { + slot_counter[found_offset] = atomicAdd(global_counter, 0); + active = false; + } + + active_mask = warp_tile.ballot(active); + break; + } + + // Compare the slab data with empty key. + // If found empty key, do insertion,the task is complete + found_lane = __ffs(warp_tile.ballot(read_key == empty_key)) - 1; + if (found_lane >= 0) { + size_t found_offset = (next_set * set_associativity + next_slab) * warp_size + found_lane; + + if (lane_idx == (size_t)next_lane) { + ((volatile key_type*)(keys[next_set].set_[next_slab].slab_))[found_lane] = key; + slot_counter[found_offset] = atomicAdd(global_counter, 0); + } + + warp_tile_copy(lane_idx, embedding_vec_size, + (volatile float*)(vals + found_offset * embedding_vec_size), + (volatile float*)(d_values + next_idx * embedding_vec_size)); + + if (lane_idx == (size_t)next_lane) { + active = false; + } + + active_mask = warp_tile.ballot(active); + break; + } + + // If no target or unused slot found in this slab, + // Refresh LR info, continue probing + ref_counter_type read_slot_counter = + slot_counter[(next_set * set_associativity + next_slab) * warp_size + lane_idx]; + if (read_slot_counter < min_slot_counter_val) { + min_slot_counter_val = read_slot_counter; + slab_distance = counter; + } + + counter++; + next_slab = (next_slab + 1) % set_associativity; + } + + // Unlock the slabset after operating the slabset + warp_unlock_mutex(warp_tile, set_mutex[next_set]); + } +} +#endif + +#ifdef LIBCUDACXX_VERSION +// Kernel to update the existing keys in the cache +// Will not change the locality information +template +__global__ void update_kernel(const key_type* d_keys, const size_t len, const float* d_values, + const size_t embedding_vec_size, const size_t capacity_in_set, + const slabset* keys, float* vals, mutex* set_mutex, + const size_t task_per_warp_tile) { + // Lane(thread) ID within a warp_tile + cg::thread_block_tile warp_tile = + cg::tiled_partition(cg::this_thread_block()); + const size_t lane_idx = warp_tile.thread_rank(); + // Warp tile global ID + const 
size_t warp_tile_global_idx = + (blockIdx.x * (blockDim.x / warp_size)) + warp_tile.meta_group_rank(); + // The index of key for this thread + const size_t key_idx = (warp_tile_global_idx * task_per_warp_tile) + lane_idx; + // The assigned key for this lane(thread) + key_type key; + // The dst slabset and the dst slab inside this set + size_t src_set; + size_t src_slab; + // Active flag: whether current lane(thread) has unfinished task + bool active = false; + if (lane_idx < task_per_warp_tile) { + if (key_idx < len) { + active = true; + key = d_keys[key_idx]; + src_set = set_hasher::hash(key) % capacity_in_set; + src_slab = slab_hasher::hash(key) % set_associativity; + } + } + + // Lane participate in warp_tile ballot to produce warp-level work queue + unsigned active_mask = warp_tile.ballot(active); + + // The warp-level outer loop: finish all the tasks within the work queue + while (active_mask != 0) { + // Next task in the work quere, start from lower index lane(thread) + int next_lane = __ffs(active_mask) - 1; + // Broadcast the task and the global index to all lane in the warp_tile + key_type next_key = warp_tile.shfl(key, next_lane); + size_t next_idx = warp_tile.shfl(key_idx, next_lane); + size_t next_set = warp_tile.shfl(src_set, next_lane); + size_t next_slab = warp_tile.shfl(src_slab, next_lane); + + // Counter to record how many slab have been searched + size_t counter = 0; + + // Working queue before task started + const unsigned old_active_mask = active_mask; + + // Lock the slabset before operating the slabset + warp_lock_mutex(warp_tile, set_mutex[next_set]); + + // The warp-level inner loop: finish a single task in the work queue + while (active_mask == old_active_mask) { + // When all the slabs inside a slabset have been searched, mark missing task, do nothing, task + // complete + if (counter >= set_associativity) { + if (lane_idx == (size_t)next_lane) { + active = false; + } + + active_mask = warp_tile.ballot(active); + break; + } + + // The warp_tile read out the slab + key_type read_key = keys[next_set].set_[next_slab].slab_[lane_idx]; + + // Compare the slab data with the target key + int found_lane = __ffs(warp_tile.ballot(read_key == next_key)) - 1; + + // If found, mark hit task, update the value, the task is completed + if (found_lane >= 0) { + size_t found_offset = (next_set * set_associativity + next_slab) * warp_size + found_lane; + if (lane_idx == (size_t)next_lane) { + active = false; + } + + warp_tile_copy(lane_idx, embedding_vec_size, + vals + found_offset * embedding_vec_size, + d_values + next_idx * embedding_vec_size); + + active_mask = warp_tile.ballot(active); + break; + } + + // Compare the slab data with empty key, if found empty key, mark missing task, do nothing, + // task is completed + if (warp_tile.ballot(read_key == empty_key) != 0) { + if (lane_idx == (size_t)next_lane) { + active = false; + } + + active_mask = warp_tile.ballot(active); + break; + } + + // Not found in this slab, the task is not completed, goto searching next slab + counter++; + next_slab = (next_slab + 1) % set_associativity; + } + + // Unlock the slabset after operating the slabset + warp_unlock_mutex(warp_tile, set_mutex[next_set]); + } +} +#else +// Kernel to update the existing keys in the cache +// Will not change the locality information +template +__global__ void update_kernel(const key_type* d_keys, const size_t len, const float* d_values, + const size_t embedding_vec_size, const size_t capacity_in_set, + volatile slabset* keys, volatile float* vals, volatile int* 
set_mutex, + const size_t task_per_warp_tile) { + // Lane(thread) ID within a warp_tile + cg::thread_block_tile warp_tile = + cg::tiled_partition(cg::this_thread_block()); + const size_t lane_idx = warp_tile.thread_rank(); + // Warp tile global ID + const size_t warp_tile_global_idx = + (blockIdx.x * (blockDim.x / warp_size)) + warp_tile.meta_group_rank(); + // The index of key for this thread + const size_t key_idx = (warp_tile_global_idx * task_per_warp_tile) + lane_idx; + // The assigned key for this lane(thread) + key_type key; + // The dst slabset and the dst slab inside this set + size_t src_set; + size_t src_slab; + // Active flag: whether current lane(thread) has unfinished task + bool active = false; + if (lane_idx < task_per_warp_tile) { + if (key_idx < len) { + active = true; + key = d_keys[key_idx]; + src_set = set_hasher::hash(key) % capacity_in_set; + src_slab = slab_hasher::hash(key) % set_associativity; + } + } + + // Lane participate in warp_tile ballot to produce warp-level work queue + unsigned active_mask = warp_tile.ballot(active); + + // The warp-level outer loop: finish all the tasks within the work queue + while (active_mask != 0) { + // Next task in the work quere, start from lower index lane(thread) + int next_lane = __ffs(active_mask) - 1; + // Broadcast the task and the global index to all lane in the warp_tile + key_type next_key = warp_tile.shfl(key, next_lane); + size_t next_idx = warp_tile.shfl(key_idx, next_lane); + size_t next_set = warp_tile.shfl(src_set, next_lane); + size_t next_slab = warp_tile.shfl(src_slab, next_lane); + + // Counter to record how many slab have been searched + size_t counter = 0; + + // Working queue before task started + const unsigned old_active_mask = active_mask; + + // Lock the slabset before operating the slabset + warp_lock_mutex(warp_tile, set_mutex[next_set]); + + // The warp-level inner loop: finish a single task in the work queue + while (active_mask == old_active_mask) { + // When all the slabs inside a slabset have been searched, mark missing task, do nothing, task + // complete + if (counter >= set_associativity) { + if (lane_idx == (size_t)next_lane) { + active = false; + } + + active_mask = warp_tile.ballot(active); + break; + } + + // The warp_tile read out the slab + key_type read_key = ((volatile key_type*)(keys[next_set].set_[next_slab].slab_))[lane_idx]; + + // Compare the slab data with the target key + int found_lane = __ffs(warp_tile.ballot(read_key == next_key)) - 1; + + // If found, mark hit task, update the value, the task is completed + if (found_lane >= 0) { + size_t found_offset = (next_set * set_associativity + next_slab) * warp_size + found_lane; + if (lane_idx == (size_t)next_lane) { + active = false; + } + + warp_tile_copy(lane_idx, embedding_vec_size, + (volatile float*)(vals + found_offset * embedding_vec_size), + (volatile float*)(d_values + next_idx * embedding_vec_size)); + + active_mask = warp_tile.ballot(active); + break; + } + + // Compare the slab data with empty key, if found empty key, mark missing task, do nothing, + // task is completed + if (warp_tile.ballot(read_key == empty_key) != 0) { + if (lane_idx == (size_t)next_lane) { + active = false; + } + + active_mask = warp_tile.ballot(active); + break; + } + + // Not found in this slab, the task is not completed, goto searching next slab + counter++; + next_slab = (next_slab + 1) % set_associativity; + } + + // Unlock the slabset after operating the slabset + warp_unlock_mutex(warp_tile, set_mutex[next_set]); + } +} +#endif + +#ifdef 
LIBCUDACXX_VERSION +template +__global__ void dump_kernel(key_type* d_keys, size_t* d_dump_counter, const slabset* keys, + mutex* set_mutex, const size_t start_set_index, + const size_t end_set_index) { + // Block-level counter used by all warp tiles within a block + __shared__ uint32_t block_acc; + // Initialize block-level counter + if (threadIdx.x == 0) { + block_acc = 0; + } + __syncthreads(); + // Lane(thread) ID within a warp tile + cg::thread_block_tile warp_tile = + cg::tiled_partition(cg::this_thread_block()); + const size_t lane_idx = warp_tile.thread_rank(); + // Warp tile target slabset id + const size_t set_idx = + ((blockIdx.x * (blockDim.x / warp_size)) + warp_tile.meta_group_rank()) + start_set_index; + // Keys dump from cache + key_type read_key[set_associativity]; + // Lane(thread) offset for storing each key + uint32_t thread_key_offset[set_associativity]; + // Warp offset for storing each key + uint32_t warp_key_offset; + // Block offset for storing each key + __shared__ size_t block_key_offset; + + // Warp tile dump target slabset + if (set_idx < end_set_index) { + // Lock the slabset before operating the slabset + warp_lock_mutex(warp_tile, set_mutex[set_idx]); + + // The warp tile read out the slabset + for (unsigned slab_id = 0; slab_id < set_associativity; slab_id++) { + // The warp tile read out a slab + read_key[slab_id] = keys[set_idx].set_[slab_id].slab_[lane_idx]; + } + + // Finish dumping the slabset, unlock the slabset + warp_unlock_mutex(warp_tile, set_mutex[set_idx]); + + // Each lane(thread) within the warp tile calculate the offset to store its keys + uint32_t warp_tile_total_keys = 0; + for (unsigned slab_id = 0; slab_id < set_associativity; slab_id++) { + unsigned valid_mask = warp_tile.ballot(read_key[slab_id] != empty_key); + thread_key_offset[slab_id] = + __popc(valid_mask & ((1U << lane_idx) - 1U)) + warp_tile_total_keys; + warp_tile_total_keys = warp_tile_total_keys + __popc(valid_mask); + } + + // Each warp tile request a unique place from the block-level counter + if (lane_idx == 0) { + warp_key_offset = atomicAdd(&block_acc, warp_tile_total_keys); + } + warp_key_offset = warp_tile.shfl(warp_key_offset, 0); + } + + // Each block request a unique place in global memory output buffer + __syncthreads(); + if (threadIdx.x == 0) { + block_key_offset = atomicAdd(d_dump_counter, (size_t)block_acc); + } + __syncthreads(); + + // Warp tile store the (non-empty)keys back to output buffer + if (set_idx < end_set_index) { + for (unsigned slab_id = 0; slab_id < set_associativity; slab_id++) { + if (read_key[slab_id] != empty_key) { + d_keys[block_key_offset + warp_key_offset + thread_key_offset[slab_id]] = read_key[slab_id]; + } + } + } +} +#else +template +__global__ void dump_kernel(key_type* d_keys, size_t* d_dump_counter, volatile slabset* keys, + volatile int* set_mutex, const size_t start_set_index, + const size_t end_set_index) { + // Block-level counter used by all warp tiles within a block + __shared__ uint32_t block_acc; + // Initialize block-level counter + if (threadIdx.x == 0) { + block_acc = 0; + } + __syncthreads(); + // Lane(thread) ID within a warp tile + cg::thread_block_tile warp_tile = + cg::tiled_partition(cg::this_thread_block()); + const size_t lane_idx = warp_tile.thread_rank(); + // Warp tile target slabset id + const size_t set_idx = + ((blockIdx.x * (blockDim.x / warp_size)) + warp_tile.meta_group_rank()) + start_set_index; + // Keys dump from cache + key_type read_key[set_associativity]; + // Lane(thread) offset for storing each 
key + uint32_t thread_key_offset[set_associativity]; + // Warp offset for storing each key + uint32_t warp_key_offset; + // Block offset for storing each key + __shared__ size_t block_key_offset; + + // Warp tile dump target slabset + if (set_idx < end_set_index) { + // Lock the slabset before operating the slabset + warp_lock_mutex(warp_tile, set_mutex[set_idx]); + + // The warp tile read out the slabset + for (unsigned slab_id = 0; slab_id < set_associativity; slab_id++) { + // The warp tile read out a slab + read_key[slab_id] = ((volatile key_type*)(keys[set_idx].set_[slab_id].slab_))[lane_idx]; + } + + // Finish dumping the slabset, unlock the slabset + warp_unlock_mutex(warp_tile, set_mutex[set_idx]); + + // Each lane(thread) within the warp tile calculate the offset to store its keys + uint32_t warp_tile_total_keys = 0; + for (unsigned slab_id = 0; slab_id < set_associativity; slab_id++) { + unsigned valid_mask = warp_tile.ballot(read_key[slab_id] != empty_key); + thread_key_offset[slab_id] = + __popc(valid_mask & ((1U << lane_idx) - 1U)) + warp_tile_total_keys; + warp_tile_total_keys = warp_tile_total_keys + __popc(valid_mask); + } + + // Each warp tile request a unique place from the block-level counter + if (lane_idx == 0) { + warp_key_offset = atomicAdd(&block_acc, warp_tile_total_keys); + } + warp_key_offset = warp_tile.shfl(warp_key_offset, 0); + } + + // Each block request a unique place in global memory output buffer + __syncthreads(); + if (threadIdx.x == 0) { + block_key_offset = atomicAdd(d_dump_counter, (size_t)block_acc); + } + __syncthreads(); + + // Warp tile store the (non-empty)keys back to output buffer + if (set_idx < end_set_index) { + for (unsigned slab_id = 0; slab_id < set_associativity; slab_id++) { + if (read_key[slab_id] != empty_key) { + d_keys[block_key_offset + warp_key_offset + thread_key_offset[slab_id]] = read_key[slab_id]; + } + } + } +} +#endif +/////////////////////////////////////////////////////////////////////////////////////////////////// + +#ifdef LIBCUDACXX_VERSION +template +gpu_cache::gpu_cache(const size_t capacity_in_set, const size_t embedding_vec_size) + : capacity_in_set_(capacity_in_set), embedding_vec_size_(embedding_vec_size) { + // Check parameter + if (capacity_in_set_ == 0) { + printf("Error: Invalid value for capacity_in_set.\n"); + return; + } + if (embedding_vec_size_ == 0) { + printf("Error: Invalid value for embedding_vec_size.\n"); + return; + } + if (set_associativity <= 0) { + printf("Error: Invalid value for set_associativity.\n"); + return; + } + if (warp_size != 1 && warp_size != 2 && warp_size != 4 && warp_size != 8 && warp_size != 16 && + warp_size != 32) { + printf("Error: Invalid value for warp_size.\n"); + return; + } + + // Get the current CUDA dev + CUDA_CHECK(cudaGetDevice(&dev_)); + + // Calculate # of slot + num_slot_ = capacity_in_set_ * set_associativity * warp_size; + + // Allocate GPU memory for cache + CUDA_CHECK(cudaMalloc((void**)&keys_, sizeof(slabset) * capacity_in_set_)); + CUDA_CHECK(cudaMalloc((void**)&vals_, sizeof(float) * embedding_vec_size_ * num_slot_)); + CUDA_CHECK(cudaMalloc((void**)&slot_counter_, sizeof(ref_counter_type) * num_slot_)); + CUDA_CHECK(cudaMalloc((void**)&global_counter_, sizeof(atomic_ref_counter_type))); + + // Allocate GPU memory for set mutex + CUDA_CHECK(cudaMalloc((void**)&set_mutex_, sizeof(mutex) * capacity_in_set_)); + + // Initialize the cache, set all entry to unused + init_cache<<<((num_slot_ - 1) / BLOCK_SIZE_) + 1, BLOCK_SIZE_>>>( + keys_, slot_counter_, 
global_counter_, num_slot_, empty_key, set_mutex_, capacity_in_set_); + + // Wait for initialization to finish + CUDA_CHECK(cudaStreamSynchronize(0)); + CUDA_CHECK(cudaGetLastError()); +} +#else +template +gpu_cache::gpu_cache(const size_t capacity_in_set, const size_t embedding_vec_size) + : capacity_in_set_(capacity_in_set), embedding_vec_size_(embedding_vec_size) { + // Check parameter + if (capacity_in_set_ == 0) { + printf("Error: Invalid value for capacity_in_set.\n"); + return; + } + if (embedding_vec_size_ == 0) { + printf("Error: Invalid value for embedding_vec_size.\n"); + return; + } + if (set_associativity <= 0) { + printf("Error: Invalid value for set_associativity.\n"); + return; + } + if (warp_size != 1 && warp_size != 2 && warp_size != 4 && warp_size != 8 && warp_size != 16 && + warp_size != 32) { + printf("Error: Invalid value for warp_size.\n"); + return; + } + + // Get the current CUDA dev + CUDA_CHECK(cudaGetDevice(&dev_)); + + // Calculate # of slot + num_slot_ = capacity_in_set_ * set_associativity * warp_size; + + // Allocate GPU memory for cache + CUDA_CHECK(cudaMalloc((void**)&keys_, sizeof(slabset) * capacity_in_set_)); + CUDA_CHECK(cudaMalloc((void**)&vals_, sizeof(float) * embedding_vec_size_ * num_slot_)); + CUDA_CHECK(cudaMalloc((void**)&slot_counter_, sizeof(ref_counter_type) * num_slot_)); + CUDA_CHECK(cudaMalloc((void**)&global_counter_, sizeof(ref_counter_type))); + + // Allocate GPU memory for set mutex + CUDA_CHECK(cudaMalloc((void**)&set_mutex_, sizeof(int) * capacity_in_set_)); + + // Initialize the cache, set all entry to unused + init_cache<<<((num_slot_ - 1) / BLOCK_SIZE_) + 1, BLOCK_SIZE_>>>( + keys_, slot_counter_, global_counter_, num_slot_, empty_key, set_mutex_, capacity_in_set_); + + // Wait for initialization to finish + CUDA_CHECK(cudaStreamSynchronize(0)); + CUDA_CHECK(cudaGetLastError()); +} +#endif + +#ifdef LIBCUDACXX_VERSION +template +gpu_cache::~gpu_cache() { + // Device Restorer + nv::CudaDeviceRestorer dev_restorer; + + // Check device + dev_restorer.check_device(dev_); + + // Destruct CUDA std object + destruct_kernel<<<((capacity_in_set_ - 1) / BLOCK_SIZE_) + 1, BLOCK_SIZE_>>>( + global_counter_, set_mutex_, capacity_in_set_); + // Wait for destruction to finish + CUDA_CHECK(cudaStreamSynchronize(0)); + + // Free GPU memory for cache + CUDA_CHECK(cudaFree(keys_)); + CUDA_CHECK(cudaFree(vals_)); + CUDA_CHECK(cudaFree(slot_counter_)); + CUDA_CHECK(cudaFree(global_counter_)); + // Free GPU memory for set mutex + CUDA_CHECK(cudaFree(set_mutex_)); +} +#else +template +gpu_cache::~gpu_cache() noexcept(false) { + // Device Restorer + nv::CudaDeviceRestorer dev_restorer; + + // Check device + dev_restorer.check_device(dev_); + + // Free GPU memory for cache + CUDA_CHECK(cudaFree(keys_)); + CUDA_CHECK(cudaFree(vals_)); + CUDA_CHECK(cudaFree(slot_counter_)); + CUDA_CHECK(cudaFree(global_counter_)); + // Free GPU memory for set mutex + CUDA_CHECK(cudaFree(set_mutex_)); +} +#endif + +#ifdef LIBCUDACXX_VERSION +template +void gpu_cache::Query(const key_type* d_keys, const size_t len, float* d_values, + uint64_t* d_missing_index, key_type* d_missing_keys, + size_t* d_missing_len, cudaStream_t stream, + const size_t task_per_warp_tile) { + // Device Restorer + nv::CudaDeviceRestorer dev_restorer; + // Check device + dev_restorer.check_device(dev_); + + // Check if it is a valid query + if (len == 0) { + // Set the d_missing_len to 0 before return + CUDA_CHECK(cudaMemsetAsync(d_missing_len, 0, sizeof(size_t), stream)); + return; + } + + // Update 
the global counter as user perform a new(most recent) read operation to the cache + // Resolve distance overflow issue as well. + update_kernel_overflow_ignore + <<<1, 1, 0, stream>>>(global_counter_, d_missing_len); + + // Read from the cache + // Touch and refresh the hitting slot + const size_t keys_per_block = (BLOCK_SIZE_ / warp_size) * task_per_warp_tile; + const size_t grid_size = ((len - 1) / keys_per_block) + 1; + get_kernel<<>>( + d_keys, len, d_values, embedding_vec_size_, d_missing_index, d_missing_keys, d_missing_len, + global_counter_, slot_counter_, capacity_in_set_, keys_, vals_, set_mutex_, + task_per_warp_tile); + + // Check for GPU error before return + CUDA_CHECK(cudaGetLastError()); +} +#else +template +void gpu_cache::Query(const key_type* d_keys, const size_t len, float* d_values, + uint64_t* d_missing_index, key_type* d_missing_keys, + size_t* d_missing_len, cudaStream_t stream, + const size_t task_per_warp_tile) { + // Device Restorer + nv::CudaDeviceRestorer dev_restorer; + // Check device + dev_restorer.check_device(dev_); + + // Check if it is a valid query + if (len == 0) { + // Set the d_missing_len to 0 before return + CUDA_CHECK(cudaMemsetAsync(d_missing_len, 0, sizeof(size_t), stream)); + return; + } + + // Update the global counter as user perform a new(most recent) read operation to the cache + // Resolve distance overflow issue as well. + update_kernel_overflow_ignore + <<<1, 1, 0, stream>>>(global_counter_, d_missing_len); + + // Read from the cache + // Touch and refresh the hitting slot + const size_t keys_per_block = (BLOCK_SIZE_ / warp_size) * task_per_warp_tile; + const size_t grid_size = ((len - 1) / keys_per_block) + 1; + get_kernel<<>>( + d_keys, len, d_values, embedding_vec_size_, d_missing_index, d_missing_keys, d_missing_len, + global_counter_, slot_counter_, capacity_in_set_, keys_, vals_, set_mutex_, + task_per_warp_tile); + + // Check for GPU error before return + CUDA_CHECK(cudaGetLastError()); +} +#endif + +#ifdef LIBCUDACXX_VERSION +template +void gpu_cache::Replace(const key_type* d_keys, const size_t len, + const float* d_values, cudaStream_t stream, + const size_t task_per_warp_tile) { + // Check if it is a valid replacement + if (len == 0) { + return; + } + + // Device Restorer + nv::CudaDeviceRestorer dev_restorer; + // Check device + dev_restorer.check_device(dev_); + + // Try to insert the paris into the cache as long as there are unused slot + // Then replace the pairs into the cache + const size_t keys_per_block = (BLOCK_SIZE_ / warp_size) * task_per_warp_tile; + const size_t grid_size = ((len - 1) / keys_per_block) + 1; + insert_replace_kernel + <<>>(d_keys, d_values, embedding_vec_size_, len, keys_, + vals_, slot_counter_, set_mutex_, global_counter_, + capacity_in_set_, task_per_warp_tile); + + // Check for GPU error before return + CUDA_CHECK(cudaGetLastError()); +} +#else +template +void gpu_cache::Replace(const key_type* d_keys, const size_t len, + const float* d_values, cudaStream_t stream, + const size_t task_per_warp_tile) { + // Check if it is a valid replacement + if (len == 0) { + return; + } + + // Device Restorer + nv::CudaDeviceRestorer dev_restorer; + // Check device + dev_restorer.check_device(dev_); + + // Try to insert the paris into the cache as long as there are unused slot + // Then replace the pairs into the cache + const size_t keys_per_block = (BLOCK_SIZE_ / warp_size) * task_per_warp_tile; + const size_t grid_size = ((len - 1) / keys_per_block) + 1; + insert_replace_kernel<<>>( + d_keys, d_values, 
embedding_vec_size_, len, keys_, vals_, slot_counter_, set_mutex_, + global_counter_, capacity_in_set_, task_per_warp_tile); + + // Check for GPU error before return + CUDA_CHECK(cudaGetLastError()); +} +#endif + +#ifdef LIBCUDACXX_VERSION +template +void gpu_cache::Update(const key_type* d_keys, const size_t len, const float* d_values, + cudaStream_t stream, const size_t task_per_warp_tile) { + // Check if it is a valid update request + if (len == 0) { + return; + } + + // Device Restorer + nv::CudaDeviceRestorer dev_restorer; + // Check device + dev_restorer.check_device(dev_); + + // Update the value of input keys that are existed in the cache + const size_t keys_per_block = (BLOCK_SIZE_ / warp_size) * task_per_warp_tile; + const size_t grid_size = ((len - 1) / keys_per_block) + 1; + update_kernel<<>>( + d_keys, len, d_values, embedding_vec_size_, capacity_in_set_, keys_, vals_, set_mutex_, + task_per_warp_tile); + + // Check for GPU error before return + CUDA_CHECK(cudaGetLastError()); +} +#else +template +void gpu_cache::Update(const key_type* d_keys, const size_t len, const float* d_values, + cudaStream_t stream, const size_t task_per_warp_tile) { + // Check if it is a valid update request + if (len == 0) { + return; + } + + // Device Restorer + nv::CudaDeviceRestorer dev_restorer; + // Check device + dev_restorer.check_device(dev_); + + // Update the value of input keys that are existed in the cache + const size_t keys_per_block = (BLOCK_SIZE_ / warp_size) * task_per_warp_tile; + const size_t grid_size = ((len - 1) / keys_per_block) + 1; + update_kernel + <<>>(d_keys, len, d_values, embedding_vec_size_, + capacity_in_set_, keys_, vals_, set_mutex_, + task_per_warp_tile); + + // Check for GPU error before return + CUDA_CHECK(cudaGetLastError()); +} +#endif + +#ifdef LIBCUDACXX_VERSION +template +void gpu_cache::Dump(key_type* d_keys, size_t* d_dump_counter, + const size_t start_set_index, const size_t end_set_index, + cudaStream_t stream) { + // Check if it is a valid dump request + if (start_set_index >= capacity_in_set_) { + printf("Error: Invalid value for start_set_index. Nothing dumped.\n"); + return; + } + if (end_set_index <= start_set_index || end_set_index > capacity_in_set_) { + printf("Error: Invalid value for end_set_index. Nothing dumped.\n"); + return; + } + + // Device Restorer + nv::CudaDeviceRestorer dev_restorer; + // Check device + dev_restorer.check_device(dev_); + + // Set the global counter to 0 first + CUDA_CHECK(cudaMemsetAsync(d_dump_counter, 0, sizeof(size_t), stream)); + + // Dump keys from the cache + const size_t grid_size = + (((end_set_index - start_set_index) - 1) / (BLOCK_SIZE_ / warp_size)) + 1; + dump_kernel + <<>>(d_keys, d_dump_counter, keys_, set_mutex_, + start_set_index, end_set_index); + + // Check for GPU error before return + CUDA_CHECK(cudaGetLastError()); +} +#else +template +void gpu_cache::Dump(key_type* d_keys, size_t* d_dump_counter, + const size_t start_set_index, const size_t end_set_index, + cudaStream_t stream) { + // Check if it is a valid dump request + if (start_set_index >= capacity_in_set_) { + printf("Error: Invalid value for start_set_index. Nothing dumped.\n"); + return; + } + if (end_set_index <= start_set_index || end_set_index > capacity_in_set_) { + printf("Error: Invalid value for end_set_index. 
Nothing dumped.\n"); + return; + } + + // Device Restorer + nv::CudaDeviceRestorer dev_restorer; + // Check device + dev_restorer.check_device(dev_); + + // Set the global counter to 0 first + CUDA_CHECK(cudaMemsetAsync(d_dump_counter, 0, sizeof(size_t), stream)); + + // Dump keys from the cache + const size_t grid_size = + (((end_set_index - start_set_index) - 1) / (BLOCK_SIZE_ / warp_size)) + 1; + dump_kernel + <<>>(d_keys, d_dump_counter, keys_, set_mutex_, + start_set_index, end_set_index); + + // Check for GPU error before return + CUDA_CHECK(cudaGetLastError()); +} +#endif + +template class gpu_cache::max(), + SET_ASSOCIATIVITY, SLAB_SIZE>; +template class gpu_cache::max(), + SET_ASSOCIATIVITY, SLAB_SIZE>; +} // namespace gpu_cache
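
Reviewer note: the nv_util.h hunk above reroutes the existing CUDA_CHECK helper at the HIP runtime (hipSetDevice, hipMemoryTypeDevice, and so on). The snippet below is a minimal sketch of what the fully hipified checker plausibly looks like, assuming only the standard HIP runtime API (hipError_t, hipSuccess, hipGetErrorString); the macro and helper names mirror the CUDA originals, the fp8 trait specializations are omitted, and this is an illustration rather than the verbatim hipify output.

// Hedged sketch only: the shape nv_util.h's error checking plausibly takes
// after hipify. hipError_t / hipSuccess / hipGetErrorString are standard HIP
// runtime API; macro and helper names mirror the CUDA originals.
#include <hip/hip_runtime.h>

#include <stdexcept>
#include <string>

#define CUDA_CHECK(val) \
  { nv::cuda_check_((val), __FILE__, __LINE__); }

namespace nv {

class CudaException : public std::runtime_error {
 public:
  CudaException(const std::string& what) : runtime_error(what) {}
};

// Same contract as the CUDA version, but consuming hipError_t and reporting
// through the HIP runtime's error-string helper.
inline void cuda_check_(hipError_t val, const char* file, int line) {
  if (val != hipSuccess) {
    throw CudaException(std::string(file) + ":" + std::to_string(line) +
                        ": HIP error " + std::to_string(val) + ": " +
                        hipGetErrorString(val));
  }
}

}  // namespace nv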
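
Reviewer note: on the non-libcu++ path, nv_gpu_cache.cu guards every slab set with a per-set spin lock driven by a single lane of the warp tile (warp_lock_mutex / warp_unlock_mutex in the hunk above). The standalone sketch below restates that pattern outside the kernels so it is easier to review in the flattened diff; it assumes HIP's cooperative-groups tile API from <hip/hip_cooperative_groups.h> and does not introduce any new behavior.

// Hedged sketch of the per-set spin lock used on the non-libcu++ path.
#include <hip/hip_runtime.h>

#include <hip/hip_cooperative_groups.h>

namespace cg = cooperative_groups;

// Acquire: lane 0 spins until it flips the mutex from 1 (free) to 0 (held).
// The fence plus tile sync then orders the critical section after the acquire
// for every lane in the (sub-)warp.
template <int warp_size>
__device__ void warp_lock_mutex(const cg::thread_block_tile<warp_size>& warp_tile,
                                volatile int& set_mutex) {
  if (warp_tile.thread_rank() == 0) {
    while (0 == atomicCAS((int*)&set_mutex, 1, 0)) {
    }
  }
  __threadfence();
  warp_tile.sync();
}

// Release: fence plus tile sync first, so the critical section's writes are
// globally visible, then lane 0 sets the mutex back to 1 (free).
template <int warp_size>
__device__ void warp_unlock_mutex(const cg::thread_block_tile<warp_size>& warp_tile,
                                  volatile int& set_mutex) {
  __threadfence();
  warp_tile.sync();
  if (warp_tile.thread_rank() == 0) {
    atomicExch((int*)&set_mutex, 1);
  }
}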
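
Reviewer note: insert_replace_kernel picks its eviction victim with warp_min_reduction, a shfl_xor butterfly that leaves every lane holding the smallest slot counter in the tile together with the slab/slot distances that produced it. The sketch below is a compact restatement under the same tie-breaking rules (smaller slab distance, then smaller slot distance); the folded condition is illustrative and not the exact control flow of the original.

// Hedged sketch of the LRU-victim selection via an XOR butterfly reduction.
#include <hip/hip_runtime.h>

#include <hip/hip_cooperative_groups.h>

#include <cstddef>

namespace cg = cooperative_groups;

// After log2(warp_size) rounds every lane holds the minimum slot counter seen
// in the tile plus the slab/slot distances that produced it, so any lane can
// compute the least-recently-used slot's position.
template <int warp_size, typename ref_counter_type>
__device__ void warp_min_reduction(const cg::thread_block_tile<warp_size>& warp_tile,
                                   ref_counter_type& min_slot_counter_val,
                                   size_t& slab_distance, size_t& slot_distance) {
  slot_distance = warp_tile.thread_rank();

  for (size_t i = (warp_tile.size() >> 1); i > 0; i >>= 1) {
    ref_counter_type other_counter = warp_tile.shfl_xor(min_slot_counter_val, (int)i);
    size_t other_slab = warp_tile.shfl_xor(slab_distance, (int)i);
    size_t other_slot = warp_tile.shfl_xor(slot_distance, (int)i);

    // Prefer the smaller counter; break ties by smaller slab distance, then
    // smaller slot distance (same ordering as the kernel's nested ifs).
    if (other_counter < min_slot_counter_val ||
        (other_counter == min_slot_counter_val &&
         (other_slab < slab_distance ||
          (other_slab == slab_distance && other_slot < slot_distance)))) {
      min_slot_counter_val = other_counter;
      slab_distance = other_slab;
      slot_distance = other_slot;
    }
  }
}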