From 10c8b9e78b0abcd627c24b6bdb82c4b4cbec6d09 Mon Sep 17 00:00:00 2001
From: Danielmic <30855238+Danielmic@users.noreply.github.com>
Date: Thu, 29 Jun 2023 17:13:57 +0800
Subject: [PATCH] [Feature] Add the implementation of dynamic_scatter with mlu-ops (#2847)

---
 docs/en/understand_mmcv/ops.md                |   2 +-
 docs/zh_cn/understand_mmcv/ops.md             |   2 +-
 .../csrc/pytorch/mlu/mlu_common_helper.cpp    |  13 ++
 mmcv/ops/csrc/pytorch/mlu/mlu_common_helper.h |  32 +++-
 .../csrc/pytorch/mlu/ms_deform_attn_mlu.cpp   |  13 --
 .../csrc/pytorch/mlu/scatter_points_mlu.cpp   | 178 ++++++++++++++++++
 tests/test_ops/test_scatter_points.py         |  39 ++--
 7 files changed, 248 insertions(+), 31 deletions(-)
 create mode 100644 mmcv/ops/csrc/pytorch/mlu/scatter_points_mlu.cpp

diff --git a/docs/en/understand_mmcv/ops.md b/docs/en/understand_mmcv/ops.md
index 34604c05f4..c4212742d8 100644
--- a/docs/en/understand_mmcv/ops.md
+++ b/docs/en/understand_mmcv/ops.md
@@ -21,7 +21,7 @@ We implement common ops used in detection, segmentation, etc.
 | Deformable Convolution v1/v2 | √ | √ | | | √ |
 | Deformable RoIPool | | √ | √ | | √ |
 | DiffIoURotated | | √ | | | |
-| DynamicScatter | | √ | | | |
+| DynamicScatter | | √ | √ | | |
 | FurthestPointSample | | √ | | | |
 | FurthestPointSampleWithDist | | √ | | | |
 | FusedBiasLeakyrelu | | √ | | | √ |
diff --git a/docs/zh_cn/understand_mmcv/ops.md b/docs/zh_cn/understand_mmcv/ops.md
index 67ba3e681b..3e34793685 100644
--- a/docs/zh_cn/understand_mmcv/ops.md
+++ b/docs/zh_cn/understand_mmcv/ops.md
@@ -21,7 +21,7 @@ MMCV 提供了检测、分割等任务中常用的算子
 | Deformable Convolution v1/v2 | √ | √ | | | √ |
 | Deformable RoIPool | | √ | √ | | √ |
 | DiffIoURotated | | √ | | | |
-| DynamicScatter | | √ | | | |
+| DynamicScatter | | √ | √ | | |
 | FurthestPointSample | | √ | | | |
 | FurthestPointSampleWithDist | | √ | | | |
 | FusedBiasLeakyrelu | | √ | | | √ |
diff --git a/mmcv/ops/csrc/pytorch/mlu/mlu_common_helper.cpp b/mmcv/ops/csrc/pytorch/mlu/mlu_common_helper.cpp
index 3a76b49715..2799d3aa1b 100644
--- a/mmcv/ops/csrc/pytorch/mlu/mlu_common_helper.cpp
+++ b/mmcv/ops/csrc/pytorch/mlu/mlu_common_helper.cpp
@@ -56,6 +56,19 @@ mluOpTensorLayout_t getMluOpSuggestLayout(const at::Tensor& input) {
   return layout;
 }
 
+mluOpReduceMode_t getMluOpReduceMode(const reduce_t reduce_type) {
+  const std::map<reduce_t, mluOpReduceMode_t> mapping_type = {
+      {reduce_t::MAX, MLUOP_REDUCE_DMAX},
+      {reduce_t::SUM, MLUOP_REDUCE_DSUM},
+      {reduce_t::MEAN, MLUOP_REDUCE_DMEAN}};
+  if (mapping_type.find(reduce_type) != mapping_type.end()) {
+    return mapping_type.find(reduce_type)->second;
+  } else {
+    TORCH_CHECK(false, "Unsupported reduce type: ", to_string(reduce_type));
+    return MLUOP_REDUCE_DSUM;
+  }
+}
+
 void MluOpTensorDescriptor::set(Tensor t) {
   mluOpDataType_t data_type = getMluOpDataType(t.dtype());
   mluOpTensorLayout_t layout = getMluOpSuggestLayout(t);
diff --git a/mmcv/ops/csrc/pytorch/mlu/mlu_common_helper.h b/mmcv/ops/csrc/pytorch/mlu/mlu_common_helper.h
index 37e125aacb..91a63a415e 100644
--- a/mmcv/ops/csrc/pytorch/mlu/mlu_common_helper.h
+++ b/mmcv/ops/csrc/pytorch/mlu/mlu_common_helper.h
@@ -18,11 +18,39 @@
 #include "pytorch_device_registry.hpp"
 
 #define MLUOP_MAJOR 0
-#define MLUOP_MINOR 6
-#define MLUOP_PATCHLEVEL 0
+#define MLUOP_MINOR 7
+#define MLUOP_PATCHLEVEL 1
+
+/*************************************************************************
+ * This MACRO contains operations of simple tensor to mlu-tensor.
+ * _contiguous, _desc, _impl, _ptr will be automatically generated in
+ * this MACRO.
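+ * As an illustration (a sketch of the expansion, with NAME = feats),
+ * the macro below generates roughly:
+ *   auto feats_contigous = torch_mlu::cnnl::ops::cnnl_contiguous(
+ *       feats, feats.suggest_memory_format());
+ *   MluOpTensorDescriptor feats_desc;
+ *   feats_desc.set(feats_contigous);
+ *   auto feats_impl = torch_mlu::getMluTensorImpl(feats_contigous);
+ *   auto feats_ptr = feats_impl->cnnlMalloc();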
+ *************************************************************************/
+#define INITIAL_MLU_PARAM_WITH_TENSOR(NAME)                          \
+  auto NAME##_contigous = torch_mlu::cnnl::ops::cnnl_contiguous(     \
+      NAME, NAME.suggest_memory_format());                           \
+  MluOpTensorDescriptor NAME##_desc;                                 \
+  NAME##_desc.set(NAME##_contigous);                                 \
+  auto NAME##_impl = torch_mlu::getMluTensorImpl(NAME##_contigous);  \
+  auto NAME##_ptr = NAME##_impl->cnnlMalloc();
+
+enum class reduce_t { SUM = 0, MEAN = 1, MAX = 2 };
+
+inline std::string to_string(reduce_t reduce_type) {
+  if (reduce_type == reduce_t::MAX) {
+    return "max";
+  } else if (reduce_type == reduce_t::MEAN) {
+    return "mean";
+  } else if (reduce_type == reduce_t::SUM) {
+    return "sum";
+  } else {
+    return "unknown reduce type";
+  }
+}
 
 mluOpDataType_t getMluOpDataType(const caffe2::TypeMeta& data_type);
 mluOpTensorLayout_t getMluOpSuggestLayout(const at::Tensor& input);
+mluOpReduceMode_t getMluOpReduceMode(const reduce_t reduce_type);
 
 class MluOpTensorDescriptor {
  public:
diff --git a/mmcv/ops/csrc/pytorch/mlu/ms_deform_attn_mlu.cpp b/mmcv/ops/csrc/pytorch/mlu/ms_deform_attn_mlu.cpp
index ead293d1b5..2643bc537e 100644
--- a/mmcv/ops/csrc/pytorch/mlu/ms_deform_attn_mlu.cpp
+++ b/mmcv/ops/csrc/pytorch/mlu/ms_deform_attn_mlu.cpp
@@ -13,19 +13,6 @@
 #include "pytorch_device_registry.hpp"
 #include "pytorch_mlu_helper.hpp"
 
-/*************************************************************************
- * This MACRO contains operations of simple tensor to mlu-tensor.
- * _contiguous, _desc, _impl, _ptr will be automatically generated in
- * this MACRO.
- *************************************************************************/
-#define INITIAL_MLU_PARAM_WITH_TENSOR(NAME)                          \
-  auto NAME##_contigous = torch_mlu::cnnl::ops::cnnl_contiguous(     \
-      NAME, NAME.suggest_memory_format());                           \
-  MluOpTensorDescriptor NAME##_desc;                                 \
-  NAME##_desc.set(NAME##_contigous);                                 \
-  auto NAME##_impl = torch_mlu::getMluTensorImpl(NAME##_contigous);  \
-  auto NAME##_ptr = NAME##_impl->cnnlMalloc();
-
 Tensor MsDeformAttnForwardLauncher(const Tensor& value,
                                    const Tensor& spatial_shapes,
                                    const Tensor& level_start_index,
diff --git a/mmcv/ops/csrc/pytorch/mlu/scatter_points_mlu.cpp b/mmcv/ops/csrc/pytorch/mlu/scatter_points_mlu.cpp
new file mode 100644
index 0000000000..c45e8c5e18
--- /dev/null
+++ b/mmcv/ops/csrc/pytorch/mlu/scatter_points_mlu.cpp
@@ -0,0 +1,178 @@
+/*************************************************************************
+ * Copyright (C) 2023 Cambricon.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+ * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
+ * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+ * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ *************************************************************************/
+#include "mlu_common_helper.h"
+
+std::vector<torch::Tensor> dynamic_point_to_voxel_forward_mlu(
+    const Tensor &feats, const Tensor &coors, const reduce_t reduce_type) {
+  // params check
+  TORCH_CHECK(feats.scalar_type() == at::kFloat,
+              "feats type should be Float, got ", feats.scalar_type());
+  TORCH_CHECK(coors.scalar_type() == at::kInt,
+              "coors type should be Int32, got ", coors.scalar_type());
+  TORCH_CHECK(feats.size(0) == coors.size(0),
+              "feats.dim(0) and coors.dim(0) should be same, got ",
+              feats.size(0), " vs ", coors.size(0));
+
+  const int num_input = feats.size(0);
+  const int num_feats = feats.size(1);
+  // zero-element check
+  if (num_input == 0)
+    return {feats.clone().detach(), coors.clone().detach(),
+            coors.new_empty({0}, torch::kInt32),
+            coors.new_empty({0}, torch::kInt32)};
+
+  auto mlu_reduce_type = getMluOpReduceMode(reduce_type);
+  auto reduced_feats = at::empty({num_input, num_feats}, feats.options());
+  auto out_coors = at::empty({num_input, 3}, coors.options());
+  auto coors_map = at::empty({num_input}, coors.options());
+  auto reduce_count = at::empty({num_input}, coors.options());
+  auto voxel_num = at::empty({1}, coors.options());
+
+  INITIAL_MLU_PARAM_WITH_TENSOR(feats);
+  INITIAL_MLU_PARAM_WITH_TENSOR(coors);
+  INITIAL_MLU_PARAM_WITH_TENSOR(reduced_feats);
+  INITIAL_MLU_PARAM_WITH_TENSOR(out_coors);
+  INITIAL_MLU_PARAM_WITH_TENSOR(coors_map);
+  INITIAL_MLU_PARAM_WITH_TENSOR(reduce_count);
+  INITIAL_MLU_PARAM_WITH_TENSOR(voxel_num);
+
+  // get compute handle
+  auto handle = mluOpGetCurrentHandle();
+
+  size_t workspace_size;
+  mluOpGetDynamicPointToVoxelForwardWorkspaceSize(handle,
+                                                  feats_desc.desc(),
+                                                  coors_desc.desc(),
+                                                  &workspace_size);
+  auto workspace_tensor =
+      at::empty(workspace_size, feats.options().dtype(at::kByte));
+  INITIAL_MLU_PARAM_WITH_TENSOR(workspace_tensor);
+
+  // launch kernel
+  mluOpDynamicPointToVoxelForward(handle,
+                                  mlu_reduce_type,
+                                  feats_desc.desc(),
+                                  feats_ptr,
+                                  coors_desc.desc(),
+                                  coors_ptr,
+                                  workspace_tensor_ptr,
+                                  workspace_size,
+                                  reduced_feats_desc.desc(),
+                                  reduced_feats_ptr,
+                                  out_coors_desc.desc(),
+                                  out_coors_ptr,
+                                  coors_map_desc.desc(),
+                                  coors_map_ptr,
+                                  reduce_count_desc.desc(),
+                                  reduce_count_ptr,
+                                  voxel_num_desc.desc(),
+                                  voxel_num_ptr);
+
+  // copy the voxel count back to host and slice the outputs to the valid part
+  int voxel_num_value = *static_cast<int *>(voxel_num.cpu().data_ptr());
+  TORCH_CHECK(voxel_num_value <= feats.size(0),
+              "voxel_num should be less than or equal to feats_num, got ",
+              voxel_num_value, " vs ", feats.size(0));
+  return {reduced_feats.slice(0, 0, voxel_num_value),
+          out_coors.slice(0, 0, voxel_num_value), coors_map,
+          reduce_count.slice(0, 0, voxel_num_value)};
+}
+
+void dynamic_point_to_voxel_backward_mlu(Tensor &grad_feats,
+                                         const Tensor &grad_reduced_feats,
+                                         const Tensor &feats,
+                                         const Tensor &reduced_feats,
+                                         const Tensor &coors_idx,
+                                         const Tensor &reduce_count,
+                                         const reduce_t reduce_type) {
+  // params check
+  TORCH_CHECK(grad_reduced_feats.scalar_type() == at::kFloat,
+              "grad_reduced_feats type should be Float, got ",
+              grad_reduced_feats.scalar_type());
+  TORCH_CHECK(feats.scalar_type() == at::kFloat,
+              "feats type should be Float, got ", feats.scalar_type());
+  TORCH_CHECK(reduced_feats.scalar_type() == at::kFloat,
+              "reduced_feats type should be Float, got ",
+              reduced_feats.scalar_type());
+  TORCH_CHECK(coors_idx.scalar_type() == at::kInt,
+              "coors_idx type should be Int32, got ", coors_idx.scalar_type());
+  TORCH_CHECK(reduce_count.scalar_type() == at::kInt,
+              "reduce_count type should be Int32, got ",
+              reduce_count.scalar_type());
+
+  const int num_input = feats.size(0);
+  const int num_reduced = reduced_feats.size(0);
+  const int num_feats = feats.size(1);
+
+  grad_feats.fill_(0);
+  // zero-element check
+  if (num_input == 0 || num_reduced == 0) return;
+
+  // TODO(miaochen): remove this after mlu-ops supports other mode of reduce.
+  TORCH_CHECK(reduce_type == reduce_t::MAX,
+              "only supports max reduce in current version, got ",
+              to_string(reduce_type));
+
+  int voxel_num_value = reduced_feats.size(0);
+  auto opts = torch::TensorOptions().dtype(torch::kInt32);
+  auto voxel_num =
+      torch::from_blob(&voxel_num_value, {1}, opts).clone().to(at::kMLU);
+  auto mlu_reduce_type = getMluOpReduceMode(reduce_type);
+
+  INITIAL_MLU_PARAM_WITH_TENSOR(grad_feats);
+  INITIAL_MLU_PARAM_WITH_TENSOR(grad_reduced_feats);
+  INITIAL_MLU_PARAM_WITH_TENSOR(feats);
+  INITIAL_MLU_PARAM_WITH_TENSOR(reduced_feats);
+  INITIAL_MLU_PARAM_WITH_TENSOR(coors_idx);
+  INITIAL_MLU_PARAM_WITH_TENSOR(reduce_count);
+  INITIAL_MLU_PARAM_WITH_TENSOR(voxel_num);
+
+  // get compute handle
+  auto handle = mluOpGetCurrentHandle();
+
+  size_t workspace_size;
+  mluOpGetDynamicPointToVoxelBackwardWorkspaceSize(
+      handle, mlu_reduce_type,
+      grad_feats_desc.desc(),
+      feats_desc.desc(),
+      grad_reduced_feats_desc.desc(),
+      coors_idx_desc.desc(),
+      reduce_count_desc.desc(),
+      voxel_num_desc.desc(),
+      &workspace_size);
+  auto workspace_tensor =
+      at::empty(workspace_size, feats.options().dtype(at::kByte));
+  INITIAL_MLU_PARAM_WITH_TENSOR(workspace_tensor);
+
+  // launch kernel
+  mluOpDynamicPointToVoxelBackward(
+      handle, mlu_reduce_type,
+      grad_reduced_feats_desc.desc(),
+      grad_reduced_feats_ptr,
+      feats_desc.desc(), feats_ptr,
+      reduced_feats_desc.desc(), reduced_feats_ptr,
+      coors_idx_desc.desc(), coors_idx_ptr,
+      reduce_count_desc.desc(), reduce_count_ptr,
+      voxel_num_desc.desc(), voxel_num_ptr,
+      workspace_tensor_ptr, workspace_size,
+      grad_feats_desc.desc(), grad_feats_ptr);
+}
+
+std::vector<torch::Tensor> dynamic_point_to_voxel_forward_impl(
+    const Tensor &feats, const Tensor &coors, const reduce_t reduce_type);
+
+void dynamic_point_to_voxel_backward_impl(Tensor &grad_feats,
+                                          const Tensor &grad_reduced_feats,
+                                          const Tensor &feats,
+                                          const Tensor &reduced_feats,
+                                          const Tensor &coors_idx,
+                                          const Tensor &reduce_count,
+                                          const reduce_t reduce_type);
+
+REGISTER_DEVICE_IMPL(dynamic_point_to_voxel_forward_impl, MLU,
+                     dynamic_point_to_voxel_forward_mlu);
+REGISTER_DEVICE_IMPL(dynamic_point_to_voxel_backward_impl, MLU,
+                     dynamic_point_to_voxel_backward_mlu);
diff --git a/tests/test_ops/test_scatter_points.py b/tests/test_ops/test_scatter_points.py
index cf4516047a..b8b569481a 100644
--- a/tests/test_ops/test_scatter_points.py
+++ b/tests/test_ops/test_scatter_points.py
@@ -4,22 +4,31 @@
 from torch.autograd import gradcheck
 
 from mmcv.ops import DynamicScatter
+from mmcv.utils import IS_CUDA_AVAILABLE, IS_MLU_AVAILABLE
 
 if torch.__version__ == 'parrots':
     pytest.skip('not supported in parrots now', allow_module_level=True)
 
 
-@pytest.mark.skipif(
-    not torch.cuda.is_available(), reason='requires CUDA support')
-def test_dynamic_scatter():
+@pytest.mark.parametrize('device', [
+    pytest.param(
+        'cuda',
+        marks=pytest.mark.skipif(
+            not IS_CUDA_AVAILABLE, reason='requires CUDA support')),
+    pytest.param(
+        'mlu',
+        marks=pytest.mark.skipif(
+            not IS_MLU_AVAILABLE, reason='requires MLU support'))
+])
+def test_dynamic_scatter(device):
     dsmean = DynamicScatter([0.32, 0.32, 6],
                             [-74.88, -74.88, -2, 74.88, 74.88, 4], True)
     dsmax = DynamicScatter([0.32, 0.32, 6],
                            [-74.88, -74.88, -2, 74.88, 74.88, 4], False)
 
     # test empty input
-    empty_feats = torch.empty(size=(0, 3), dtype=torch.float32, device='cuda')
-    empty_coors = torch.empty(size=(0, 3), dtype=torch.int32, device='cuda')
+    empty_feats = torch.empty(size=(0, 3), dtype=torch.float32, device=device)
+    empty_coors = torch.empty(size=(0, 3), dtype=torch.int32, device=device)
     empty_feats.requires_grad_()
 
     empty_feats_out_mean, empty_coors_out_mean = dsmean(
@@ -35,9 +44,9 @@ def test_dynamic_scatter():
 
     # test empty reduced output
     empty_o_feats = torch.rand(
-        size=(200000, 3), dtype=torch.float32, device='cuda') * 100 - 50
+        size=(200000, 3), dtype=torch.float32, device=device) * 100 - 50
     empty_o_coors = torch.randint(
-        low=-1, high=0, size=(200000, 3), dtype=torch.int32, device='cuda')
+        low=-1, high=0, size=(200000, 3), dtype=torch.int32, device=device)
     empty_o_feats.requires_grad_()
 
     empty_o_feats_out_mean, empty_o_coors_out_mean = dsmean(
@@ -52,9 +61,9 @@ def test_dynamic_scatter():
 
     # test non-empty input
     feats = torch.rand(
-        size=(200000, 3), dtype=torch.float32, device='cuda') * 100 - 50
+        size=(200000, 3), dtype=torch.float32, device=device) * 100 - 50
     coors = torch.randint(
-        low=-1, high=20, size=(200000, 3), dtype=torch.int32, device='cuda')
+        low=-1, high=20, size=(200000, 3), dtype=torch.int32, device=device)
 
     ref_voxel_coors = coors.unique(dim=0, sorted=True)
     ref_voxel_coors = ref_voxel_coors[ref_voxel_coors.min(dim=-1).values >= 0]
@@ -88,9 +97,9 @@ def test_dynamic_scatter():
 
     # test non-empty input without any point out of bound
     feats = torch.rand(
-        size=(200000, 3), dtype=torch.float32, device='cuda') * 100 - 50
+        size=(200000, 3), dtype=torch.float32, device=device) * 100 - 50
     coors = torch.randint(
-        low=0, high=20, size=(200000, 3), dtype=torch.int32, device='cuda')
+        low=0, high=20, size=(200000, 3), dtype=torch.int32, device=device)
 
     ref_voxel_coors = coors.unique(dim=0, sorted=True)
     ref_voxel_coors = ref_voxel_coors[ref_voxel_coors.min(dim=-1).values >= 0]
@@ -124,9 +133,11 @@ def test_dynamic_scatter():
 
     # test grad #
     feats = torch.rand(
-        size=(100, 4), dtype=torch.float32, device='cuda') * 100 - 50
+        size=(100, 4), dtype=torch.float32, device=device) * 100 - 50
     coors = torch.randint(
-        low=-1, high=3, size=(100, 3), dtype=torch.int32, device='cuda')
+        low=-1, high=3, size=(100, 3), dtype=torch.int32, device=device)
     feats.requires_grad_()
-    gradcheck(dsmean, (feats, coors), eps=1e-2, atol=1e-2, rtol=1e-5)
+    # TODO(Cambricon): mlu only support max reduce in current version.
+    if not IS_MLU_AVAILABLE:
+        gradcheck(dsmean, (feats, coors), eps=1e-2, atol=1e-2, rtol=1e-5)
     gradcheck(dsmax, (feats, coors), eps=1e-2, atol=1e-2, rtol=1e-5)
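
Usage note (outside the patch): with the MLU registration above, DynamicScatter is driven the same way as on CUDA, only with tensors placed on the 'mlu' device. The sketch below is illustrative and rests on assumptions beyond this patch: a torch_mlu build where mmcv.utils.IS_MLU_AVAILABLE is True, hypothetical input sizes, and the max reduction (average_points=False), since the backward currently supports only max reduce per the TODO above.

    import torch
    from mmcv.ops import DynamicScatter

    # Same voxel size and point-cloud range as in the test above.
    dsmax = DynamicScatter([0.32, 0.32, 6],
                           [-74.88, -74.88, -2, 74.88, 74.88, 4], False)
    # Hypothetical inputs: float32 point features and int32 voxel coordinates.
    feats = torch.rand(1000, 3, device='mlu') * 100 - 50
    coors = torch.randint(-1, 20, (1000, 3), dtype=torch.int32, device='mlu')
    voxel_feats, voxel_coors = dsmax(feats, coors)  # max-reduced features per voxel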