open-mmlab · VVsssssk · Dec 16, 2022 · Dec 16, 2022
diff --git a/mmcv/ops/__init__.py b/mmcv/ops/__init__.py
@@ -66,6 +66,7 @@
 from .sync_bn import SyncBatchNorm
 from .three_interpolate import three_interpolate
 from .three_nn import three_nn
+from .three_nn_vector_pool import three_nn_vector_pool_by_two_step
 from .tin_shift import TINShift, tin_shift
 from .upfirdn2d import upfirdn2d
 from .voxelize import Voxelization, voxelization
@@ -102,5 +103,5 @@
     'points_in_boxes_cpu', 'points_in_boxes_all', 'points_in_polygons',
     'min_area_polygons', 'active_rotated_filter', 'convex_iou', 'convex_giou',
     'diff_iou_rotated_2d', 'diff_iou_rotated_3d', 'chamfer_distance',
-    'PrRoIPool', 'prroi_pool'
+    'PrRoIPool', 'prroi_pool', 'three_nn_vector_pool_by_two_step'
 ]
diff --git a/mmcv/ops/csrc/common/cuda/three_nn_vector_pool_cuda_kernel.cuh b/mmcv/ops/csrc/common/cuda/three_nn_vector_pool_cuda_kernel.cuh
@@ -0,0 +1,180 @@
+#ifndef THREE_NN_VECTOR_POOL_CUDA_KERNEL_CUH
+#define THREE_NN_VECTOR_POOL_CUDA_KERNEL_CUH
+
+#ifdef MMCV_USE_PARROTS
+#include "parrots_cuda_helper.hpp"
+#else
+#include "pytorch_cuda_helper.hpp"
+#endif
+
+__global__ void query_stacked_local_neighbor_idxs_cuda_kernel(
+    const float *support_xyz, const int *xyz_batch_cnt, const float *new_xyz,
+    const int *new_xyz_batch_cnt, int *stack_neighbor_idxs, int *start_len,
+    int *cumsum, int avg_length_of_neighbor_idxs, float max_neighbour_distance,
+    int batch_size, int M, int nsample, int neighbor_type) {
+  // support_xyz: (N1 + N2 ..., 3) xyz coordinates of the features
+  // xyz_batch_cnt: (batch_size), [N1, N2, ...]
+  // new_xyz: (M1 + M2 ..., 3) centers of the ball query
+  // new_xyz_batch_cnt: (batch_size), [M1, M2, ...]
+  // stack_neighbor_idxs: (max_length_of_neighbor_idxs)
+  // start_len: (M1 + M2, 2)  [start_offset, neighbor_length]
+  // cumsum: (1), max offset of current data in stack_neighbor_idxs
+  // max_neighbour_distance: float
+  // nsample: find all (-1), find limited number(>0)
+  // neighbor_type: 1: ball, others: cube
+  CUDA_1D_KERNEL_LOOP(pt_idx, M) {
+    const float *cur_support_xyz = support_xyz;
+    const float *cur_new_xyz = new_xyz;
+    int *cur_start_len = start_len;
+    int *cur_stack_neighbor_idxs = stack_neighbor_idxs;
+
+    int bs_idx = 0, pt_cnt = new_xyz_batch_cnt[0];
+    for (int k = 1; k < batch_size; k++) {
+      if (pt_idx < pt_cnt) break;
+      pt_cnt += new_xyz_batch_cnt[k];
+      bs_idx = k;
+    }
+
+    int xyz_batch_start_idx = 0;
+    for (int k = 0; k < bs_idx; k++) xyz_batch_start_idx += xyz_batch_cnt[k];
+
+    cur_support_xyz += xyz_batch_start_idx * 3;
+    cur_new_xyz += pt_idx * 3;
+    cur_start_len += pt_idx * 2;
+
+    float new_x = cur_new_xyz[0];
+    float new_y = cur_new_xyz[1];
+    float new_z = cur_new_xyz[2];
+    int n = xyz_batch_cnt[bs_idx];
+
+    float local_x, local_y, local_z;
+    float radius2 = max_neighbour_distance * max_neighbour_distance;
+
+    int temp_idxs[1000];
+
+    int sample_cnt = 0;
+    for (int k = 0; k < n; ++k) {
+      local_x = cur_support_xyz[k * 3 + 0] - new_x;
+      local_y = cur_support_xyz[k * 3 + 1] - new_y;
+      local_z = cur_support_xyz[k * 3 + 2] - new_z;
+
+      if (neighbor_type == 1) {
+        // ball
+        if (local_x * local_x + local_y * local_y + local_z * local_z >
+            radius2) {
+          continue;
+        }
+      } else {
+        // voxel
+        if ((fabs(local_x) > max_neighbour_distance) |
+            (fabs(local_y) > max_neighbour_distance) |
+            (fabs(local_z) > max_neighbour_distance)) {
+          continue;
+        }
+      }
+      if (sample_cnt < 1000) {
+        temp_idxs[sample_cnt] = k;
+      } else {
+        break;
+      }
+      sample_cnt++;
+      if (nsample > 0 && sample_cnt >= nsample) break;
+    }
+    cur_start_len[0] = atomicAdd(cumsum, sample_cnt);
+    cur_start_len[1] = sample_cnt;
+
+    int max_thresh = avg_length_of_neighbor_idxs * M;
+    if (cur_start_len[0] >= max_thresh) continue;
+
+    cur_stack_neighbor_idxs += cur_start_len[0];
+    if (cur_start_len[0] + sample_cnt >= max_thresh)
+      sample_cnt = max_thresh - cur_start_len[0];
+
+    for (int k = 0; k < sample_cnt; k++) {
+      cur_stack_neighbor_idxs[k] = temp_idxs[k] + xyz_batch_start_idx;
+    }
+  }
+}
+
+__global__ void query_three_nn_by_stacked_local_idxs_cuda_kernel(
+    const float *support_xyz, const float *new_xyz,
+    const float *new_xyz_grid_centers, int *new_xyz_grid_idxs,
+    float *new_xyz_grid_dist2, const int *stack_neighbor_idxs,
+    const int *start_len, int M, int num_total_grids) {
+  // support_xyz: (N1 + N2 ..., 3) xyz coordinates of the features
+  // new_xyz: (M1 + M2 ..., 3) centers of the ball query
+  // new_xyz_grid_centers: (M1 + M2 ..., num_total_grids, 3) grids centers of
+  // each grid new_xyz_grid_idxs: (M1 + M2 ..., num_total_grids, 3) three-nn
+  // new_xyz_grid_dist2: (M1 + M2 ..., num_total_grids, 3) square of dist of
+  // three-nn stack_neighbor_idxs: (max_length_of_neighbor_idxs) start_len: (M1
+  // + M2, 2)  [start_offset, neighbor_length]
+  int grid_idx = blockIdx.y;
+  if (grid_idx >= num_total_grids) return;
+  CUDA_1D_KERNEL_LOOP(pt_idx, M) {
+    const float *cur_new_xyz = new_xyz;
+    const float *cur_new_xyz_grid_centers = new_xyz_grid_centers;
+    int *cur_new_xyz_grid_idxs = new_xyz_grid_idxs;
+    float *cur_new_xyz_grid_dist2 = new_xyz_grid_dist2;
+    const int *cur_start_len = start_len;
+    const int *cur_stack_neighbor_idxs = stack_neighbor_idxs;
+
+    cur_new_xyz += pt_idx * 3;
+    cur_new_xyz_grid_centers += pt_idx * num_total_grids * 3 + grid_idx * 3;
+    cur_new_xyz_grid_idxs += pt_idx * num_total_grids * 3 + grid_idx * 3;
+    cur_new_xyz_grid_dist2 += pt_idx * num_total_grids * 3 + grid_idx * 3;
+
+    cur_start_len += pt_idx * 2;
+    cur_stack_neighbor_idxs += cur_start_len[0];
+    int neighbor_length = cur_start_len[1];
+
+    float center_x = cur_new_xyz_grid_centers[0];
+    float center_y = cur_new_xyz_grid_centers[1];
+    float center_z = cur_new_xyz_grid_centers[2];
+
+    double best1 = 1e40, best2 = 1e40, best3 = 1e40;
+    int besti1 = -1, besti2 = -1, besti3 = -1;
+    for (int k = 0; k < neighbor_length; k++) {
+      int cur_neighbor_idx = cur_stack_neighbor_idxs[k];
+
+      float x = support_xyz[cur_neighbor_idx * 3 + 0];
+      float y = support_xyz[cur_neighbor_idx * 3 + 1];
+      float z = support_xyz[cur_neighbor_idx * 3 + 2];
+
+      float d = (center_x - x) * (center_x - x) +
+                (center_y - y) * (center_y - y) +
+                (center_z - z) * (center_z - z);
+
+      if (d < best1) {
+        best3 = best2;
+        besti3 = besti2;
+        best2 = best1;
+        besti2 = besti1;
+        best1 = d;
+        besti1 = cur_neighbor_idx;
+      } else if (d < best2) {
+        best3 = best2;
+        besti3 = besti2;
+        best2 = d;
+        besti2 = cur_neighbor_idx;
+      } else if (d < best3) {
+        best3 = d;
+        besti3 = cur_neighbor_idx;
+      }
+    }
+    if (besti2 == -1) {
+      besti2 = besti1;
+      best2 = best1;
+    }
+    if (besti3 == -1) {
+      besti3 = besti1;
+      best3 = best1;
+    }
+    cur_new_xyz_grid_dist2[0] = best1;
+    cur_new_xyz_grid_dist2[1] = best2;
+    cur_new_xyz_grid_dist2[2] = best3;
+    cur_new_xyz_grid_idxs[0] = besti1;
+    cur_new_xyz_grid_idxs[1] = besti2;
+    cur_new_xyz_grid_idxs[2] = besti3;
+  }
+}
+#endif
diff --git a/mmcv/ops/csrc/pytorch/cuda/cudabind.cpp b/mmcv/ops/csrc/pytorch/cuda/cudabind.cpp
@@ -1867,3 +1867,60 @@ REGISTER_DEVICE_IMPL(prroi_pool_forward_impl, CUDA, prroi_pool_forward_cuda);
 REGISTER_DEVICE_IMPL(prroi_pool_backward_impl, CUDA, prroi_pool_backward_cuda);
 REGISTER_DEVICE_IMPL(prroi_pool_coor_backward_impl, CUDA,
                      prroi_pool_coor_backward_cuda);
+
+void StackQueryLocalNeighborIdxsCUDAKernelLauncher(
+    const Tensor support_xyz_tensor, const Tensor xyz_batch_cnt_tensor,
+    const Tensor new_xyz_tensor, const Tensor new_xyz_batch_cnt_tensor,
+    Tensor stack_neighbor_idxs_tensor, Tensor start_len_tensor,
+    Tensor cumsum_tensor, const int avg_length_of_neighbor_idxs,
+    const float max_neighbour_distance, const int nsample,
+    const int neighbor_type);
+
+void stack_query_local_neighbor_idxs_cuda(
+    const Tensor support_xyz_tensor, const Tensor xyz_batch_cnt_tensor,
+    const Tensor new_xyz_tensor, const Tensor new_xyz_batch_cnt_tensor,
+    Tensor stack_neighbor_idxs_tensor, Tensor start_len_tensor,
+    Tensor cumsum_tensor, const int avg_length_of_neighbor_idxs,
+    const float max_neighbour_distance, const int nsample,
+    const int neighbor_type) {
+  StackQueryLocalNeighborIdxsCUDAKernelLauncher(
+      support_xyz_tensor, xyz_batch_cnt_tensor, new_xyz_tensor,
+      new_xyz_batch_cnt_tensor, stack_neighbor_idxs_tensor, start_len_tensor,
+      cumsum_tensor, avg_length_of_neighbor_idxs, max_neighbour_distance,
+      nsample, neighbor_type);
+}
+
+void stack_query_local_neighbor_idxs_impl(
+    const Tensor support_xyz_tensor, const Tensor xyz_batch_cnt_tensor,
+    const Tensor new_xyz_tensor, const Tensor new_xyz_batch_cnt_tensor,
+    Tensor stack_neighbor_idxs_tensor, Tensor start_len_tensor,
+    Tensor cumsum_tensor, const int avg_length_of_neighbor_idxs,
+    const float max_neighbour_distance, const int nsample,
+    const int neighbor_type);
+
+void StackQueryThreeNNLocalIdxsCUDAKernelLauncher(
+    const Tensor support_xyz_tensor, const Tensor new_xyz_tensor,
+    const Tensor new_xyz_grid_centers_tensor, Tensor new_xyz_grid_idxs_tensor,
+    Tensor new_xyz_grid_dist2_tensor, Tensor stack_neighbor_idxs_tensor,
+    Tensor start_len_tensor, const int M, const int num_total_grids);
+
+void stack_query_three_nn_local_idxs_cuda(
+    const Tensor support_xyz_tensor, const Tensor new_xyz_tensor,
+    const Tensor new_xyz_grid_centers_tensor, Tensor new_xyz_grid_idxs_tensor,
+    Tensor new_xyz_grid_dist2_tensor, Tensor stack_neighbor_idxs_tensor,
+    Tensor start_len_tensor, const int M, const int num_total_grids) {
+  StackQueryThreeNNLocalIdxsCUDAKernelLauncher(
+      support_xyz_tensor, new_xyz_tensor, new_xyz_grid_centers_tensor,
+      new_xyz_grid_idxs_tensor, new_xyz_grid_dist2_tensor,
+      stack_neighbor_idxs_tensor, start_len_tensor, M, num_total_grids);
+}
+
+void stack_query_three_nn_local_idxs_impl(
+    const Tensor support_xyz_tensor, const Tensor new_xyz_tensor,
+    const Tensor new_xyz_grid_centers_tensor, Tensor new_xyz_grid_idxs_tensor,
+    Tensor new_xyz_grid_dist2_tensor, Tensor stack_neighbor_idxs_tensor,
+    Tensor start_len_tensor, const int M, const int num_total_grids);
+REGISTER_DEVICE_IMPL(stack_query_three_nn_local_idxs_impl, CUDA,
+                     stack_query_three_nn_local_idxs_cuda);
+REGISTER_DEVICE_IMPL(stack_query_local_neighbor_idxs_impl, CUDA,
+                     stack_query_local_neighbor_idxs_cuda);
diff --git a/mmcv/ops/csrc/pytorch/cuda/three_nn_vector_pool.cu b/mmcv/ops/csrc/pytorch/cuda/three_nn_vector_pool.cu
@@ -0,0 +1,58 @@
+// Copyright (c) OpenMMLab. All rights reserved.
+#include <math.h>
+#include <stdio.h>
+#include <stdlib.h>
+
+#include "pytorch_cuda_helper.hpp"
+#include "vector_pool.cuh"
+#define DIVUP(m, n) ((m) / (n) + ((m) % (n) > 0))
+
+
+void StackQueryLocalNeighborIdxsCUDAKernelLauncher(
+    const Tensor support_xyz_tensor, const Tensor xyz_batch_cnt_tensor,
+    const Tensor new_xyz_tensor, const Tensor new_xyz_batch_cnt_tensor,
+    Tensor stack_neighbor_idxs_tensor, Tensor start_len_tensor,
+    Tensor cumsum_tensor, const int avg_length_of_neighbor_idxs,
+    const float max_neighbour_distance, const int nsample,
+    const int neighbor_type) {
+  int batch_size = xyz_batch_cnt_tensor.size(0);
+  int M = new_xyz_tensor.size(0);
+  at::cuda::CUDAGuard device_guard(support_xyz_tensor.device());
+  cudaStream_t stream = at::cuda::getCurrentCUDAStream();
+
+  // blockIdx.x(col), blockIdx.y(row)
+  dim3 blocks(DIVUP(M, THREADS_PER_BLOCK));
+  dim3 threads(THREADS_PER_BLOCK);
+
+  query_stacked_local_neighbor_idxs_cuda_kernel<<<blocks, threads>>>(
+      support_xyz_tensor.data_ptr<float>(),
+      xyz_batch_cnt_tensor.data_ptr<int>(), new_xyz_tensor.data_ptr<float>(),
+      new_xyz_batch_cnt_tensor.data_ptr<int>(),
+      stack_neighbor_idxs_tensor.data_ptr<int>(),
+      start_len_tensor.data_ptr<int>(), cumsum_tensor.data_ptr<int>(),
+      avg_length_of_neighbor_idxs, max_neighbour_distance, batch_size, M,
+      nsample, neighbor_type);
+  AT_CUDA_CHECK(cudaGetLastError());
+}
+
+void StackQueryThreeNNLocalIdxsCUDAKernelLauncher(
+    const Tensor support_xyz_tensor, const Tensor new_xyz_tensor,
+    const Tensor new_xyz_grid_centers_tensor, Tensor new_xyz_grid_idxs_tensor,
+    Tensor new_xyz_grid_dist2_tensor, Tensor stack_neighbor_idxs_tensor,
+    Tensor start_len_tensor, const int M, const int num_total_grids) {
+  at::cuda::CUDAGuard device_guard(support_xyz_tensor.device());
+  cudaStream_t stream = at::cuda::getCurrentCUDAStream();
+
+  // blockIdx.x(col), blockIdx.y(row)
+  dim3 blocks(DIVUP(M, THREADS_PER_BLOCK), num_total_grids);
+  dim3 threads(THREADS_PER_BLOCK);
+
+  query_three_nn_by_stacked_local_idxs_cuda_kernel<<<blocks, threads>>>(
+      support_xyz_tensor.data_ptr<float>(), new_xyz_tensor.data_ptr<float>(),
+      new_xyz_grid_centers_tensor.data_ptr<float>(),
+      new_xyz_grid_idxs_tensor.data_ptr<int>(),
+      new_xyz_grid_dist2_tensor.data_ptr<float>(),
+      stack_neighbor_idxs_tensor.data_ptr<int>(),
+      start_len_tensor.data_ptr<int>(), M, num_total_grids);
+  AT_CUDA_CHECK(cudaGetLastError());
+}
diff --git a/mmcv/ops/csrc/pytorch/pybind.cpp b/mmcv/ops/csrc/pytorch/pybind.cpp
@@ -446,6 +446,20 @@ Tensor nms_quadri(const Tensor dets, const Tensor scores, const Tensor order,
                   const Tensor dets_sorted, const float iou_threshold,
                   const int multi_label);
 
+void stack_query_local_neighbor_idxs(
+    const Tensor support_xyz_tensor, const Tensor xyz_batch_cnt_tensor,
+    const Tensor new_xyz_tensor, const Tensor new_xyz_batch_cnt_tensor,
+    Tensor stack_neighbor_idxs_tensor, Tensor start_len_tensor,
+    Tensor cumsum_tensor, const int avg_length_of_neighbor_idxs,
+    const float max_neighbour_distance, const int nsample,
+    const int neighbor_type);
+
+void stack_query_three_nn_local_idxs(
+    const Tensor support_xyz_tensor, const Tensor new_xyz_tensor,
+    const Tensor new_xyz_grid_centers_tensor, Tensor new_xyz_grid_idxs_tensor,
+    Tensor new_xyz_grid_dist2_tensor, Tensor stack_neighbor_idxs_tensor,
+    Tensor start_len_tensor, const int M, const int num_total_grids);
+
 PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) {
   m.def("upfirdn2d", &upfirdn2d, "upfirdn2d (CUDA)", py::arg("input"),
         py::arg("kernel"), py::arg("up_x"), py::arg("up_y"), py::arg("down_x"),
@@ -899,4 +913,19 @@ PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) {
         py::arg("dets"), py::arg("scores"), py::arg("order"),
         py::arg("dets_sorted"), py::arg("iou_threshold"),
         py::arg("multi_label"));
+  m.def("stack_query_local_neighbor_idxs", &stack_query_local_neighbor_idxs,
+        "stack quert local neighbor indexes", py::arg("support_xyz_tensor"),
+        py::arg("xyz_batch_cnt_tensor"), py::arg("new_xyz_tensor"),
+        py::arg("new_xyz_batch_cnt_tensor"),
+        py::arg("stack_neighbor_idxs_tensor"), py::arg("start_len_tensor"),
+        py::arg("cumsum_tensor"), py::arg("avg_length_of_neighbor_idxs"),
+        py::arg("max_neighbour_distance"), py::arg("nsample"),
+        py::arg("neighbor_type"));
+  m.def("stack_query_three_nn_local_idxs", &stack_query_three_nn_local_idxs,
+        "stack quert three nn local indexes", py::arg("support_xyz_tensor"),
+        py::arg("new_xyz_tensor"), py::arg("new_xyz_grid_centers_tensor"),
+        py::arg("new_xyz_grid_idxs_tensor"),
+        py::arg("new_xyz_grid_dist2_tensor"),
+        py::arg("stack_neighbor_idxs_tensor"), py::arg("start_len_tensor"),
+        py::arg("M"), py::arg("num_total_grids"));
 }