Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[Ops]Add three nn vector pool ops #2494

Open
wants to merge 2 commits into
base: 2.x
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 2 additions & 1 deletion mmcv/ops/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -66,6 +66,7 @@
from .sync_bn import SyncBatchNorm
from .three_interpolate import three_interpolate
from .three_nn import three_nn
from .three_nn_vector_pool import three_nn_vector_pool_by_two_step
from .tin_shift import TINShift, tin_shift
from .upfirdn2d import upfirdn2d
from .voxelize import Voxelization, voxelization
Expand Down Expand Up @@ -102,5 +103,5 @@
'points_in_boxes_cpu', 'points_in_boxes_all', 'points_in_polygons',
'min_area_polygons', 'active_rotated_filter', 'convex_iou', 'convex_giou',
'diff_iou_rotated_2d', 'diff_iou_rotated_3d', 'chamfer_distance',
'PrRoIPool', 'prroi_pool'
'PrRoIPool', 'prroi_pool', 'three_nn_vector_pool_by_two_step'
]
180 changes: 180 additions & 0 deletions mmcv/ops/csrc/common/cuda/three_nn_vector_pool_cuda_kernel.cuh
Original file line number Diff line number Diff line change
@@ -0,0 +1,180 @@
#ifndef THREE_NN_VECTOR_POOL_CUDA_KERNEL_CUH
#define THREE_NN_VECTOR_POOL_CUDA_KERNEL_CUH

#ifdef MMCV_USE_PARROTS
#include "parrots_cuda_helper.hpp"
#else
#include "pytorch_cuda_helper.hpp"
#endif

__global__ void query_stacked_local_neighbor_idxs_cuda_kernel(
const float *support_xyz, const int *xyz_batch_cnt, const float *new_xyz,
const int *new_xyz_batch_cnt, int *stack_neighbor_idxs, int *start_len,
int *cumsum, int avg_length_of_neighbor_idxs, float max_neighbour_distance,
int batch_size, int M, int nsample, int neighbor_type) {
// support_xyz: (N1 + N2 ..., 3) xyz coordinates of the features
// xyz_batch_cnt: (batch_size), [N1, N2, ...]
// new_xyz: (M1 + M2 ..., 3) centers of the ball query
// new_xyz_batch_cnt: (batch_size), [M1, M2, ...]
// stack_neighbor_idxs: (max_length_of_neighbor_idxs)
// start_len: (M1 + M2, 2) [start_offset, neighbor_length]
// cumsum: (1), max offset of current data in stack_neighbor_idxs
// max_neighbour_distance: float
// nsample: find all (-1), find limited number(>0)
// neighbor_type: 1: ball, others: cube
CUDA_1D_KERNEL_LOOP(pt_idx, M) {
const float *cur_support_xyz = support_xyz;
const float *cur_new_xyz = new_xyz;
int *cur_start_len = start_len;
int *cur_stack_neighbor_idxs = stack_neighbor_idxs;

int bs_idx = 0, pt_cnt = new_xyz_batch_cnt[0];
for (int k = 1; k < batch_size; k++) {
if (pt_idx < pt_cnt) break;
pt_cnt += new_xyz_batch_cnt[k];
bs_idx = k;
}

int xyz_batch_start_idx = 0;
for (int k = 0; k < bs_idx; k++) xyz_batch_start_idx += xyz_batch_cnt[k];

cur_support_xyz += xyz_batch_start_idx * 3;
cur_new_xyz += pt_idx * 3;
cur_start_len += pt_idx * 2;

float new_x = cur_new_xyz[0];
float new_y = cur_new_xyz[1];
float new_z = cur_new_xyz[2];
int n = xyz_batch_cnt[bs_idx];

float local_x, local_y, local_z;
float radius2 = max_neighbour_distance * max_neighbour_distance;

int temp_idxs[1000];

int sample_cnt = 0;
for (int k = 0; k < n; ++k) {
local_x = cur_support_xyz[k * 3 + 0] - new_x;
local_y = cur_support_xyz[k * 3 + 1] - new_y;
local_z = cur_support_xyz[k * 3 + 2] - new_z;

if (neighbor_type == 1) {
// ball
if (local_x * local_x + local_y * local_y + local_z * local_z >
radius2) {
continue;
}
} else {
// voxel
if ((fabs(local_x) > max_neighbour_distance) |
(fabs(local_y) > max_neighbour_distance) |
(fabs(local_z) > max_neighbour_distance)) {
continue;
}
}
if (sample_cnt < 1000) {
temp_idxs[sample_cnt] = k;
} else {
break;
}
sample_cnt++;
if (nsample > 0 && sample_cnt >= nsample) break;
}
cur_start_len[0] = atomicAdd(cumsum, sample_cnt);
cur_start_len[1] = sample_cnt;

int max_thresh = avg_length_of_neighbor_idxs * M;
if (cur_start_len[0] >= max_thresh) continue;

cur_stack_neighbor_idxs += cur_start_len[0];
if (cur_start_len[0] + sample_cnt >= max_thresh)
sample_cnt = max_thresh - cur_start_len[0];

for (int k = 0; k < sample_cnt; k++) {
cur_stack_neighbor_idxs[k] = temp_idxs[k] + xyz_batch_start_idx;
}
}
}

__global__ void query_three_nn_by_stacked_local_idxs_cuda_kernel(
const float *support_xyz, const float *new_xyz,
const float *new_xyz_grid_centers, int *new_xyz_grid_idxs,
float *new_xyz_grid_dist2, const int *stack_neighbor_idxs,
const int *start_len, int M, int num_total_grids) {
// support_xyz: (N1 + N2 ..., 3) xyz coordinates of the features
// new_xyz: (M1 + M2 ..., 3) centers of the ball query
// new_xyz_grid_centers: (M1 + M2 ..., num_total_grids, 3) grids centers of
// each grid new_xyz_grid_idxs: (M1 + M2 ..., num_total_grids, 3) three-nn
// new_xyz_grid_dist2: (M1 + M2 ..., num_total_grids, 3) square of dist of
// three-nn stack_neighbor_idxs: (max_length_of_neighbor_idxs) start_len: (M1
// + M2, 2) [start_offset, neighbor_length]
int grid_idx = blockIdx.y;
if (grid_idx >= num_total_grids) return;
CUDA_1D_KERNEL_LOOP(pt_idx, M) {
const float *cur_new_xyz = new_xyz;
const float *cur_new_xyz_grid_centers = new_xyz_grid_centers;
int *cur_new_xyz_grid_idxs = new_xyz_grid_idxs;
float *cur_new_xyz_grid_dist2 = new_xyz_grid_dist2;
const int *cur_start_len = start_len;
const int *cur_stack_neighbor_idxs = stack_neighbor_idxs;

cur_new_xyz += pt_idx * 3;
cur_new_xyz_grid_centers += pt_idx * num_total_grids * 3 + grid_idx * 3;
cur_new_xyz_grid_idxs += pt_idx * num_total_grids * 3 + grid_idx * 3;
cur_new_xyz_grid_dist2 += pt_idx * num_total_grids * 3 + grid_idx * 3;

cur_start_len += pt_idx * 2;
cur_stack_neighbor_idxs += cur_start_len[0];
int neighbor_length = cur_start_len[1];

float center_x = cur_new_xyz_grid_centers[0];
float center_y = cur_new_xyz_grid_centers[1];
float center_z = cur_new_xyz_grid_centers[2];

double best1 = 1e40, best2 = 1e40, best3 = 1e40;
int besti1 = -1, besti2 = -1, besti3 = -1;
for (int k = 0; k < neighbor_length; k++) {
int cur_neighbor_idx = cur_stack_neighbor_idxs[k];

float x = support_xyz[cur_neighbor_idx * 3 + 0];
float y = support_xyz[cur_neighbor_idx * 3 + 1];
float z = support_xyz[cur_neighbor_idx * 3 + 2];

float d = (center_x - x) * (center_x - x) +
(center_y - y) * (center_y - y) +
(center_z - z) * (center_z - z);

if (d < best1) {
best3 = best2;
besti3 = besti2;
best2 = best1;
besti2 = besti1;
best1 = d;
besti1 = cur_neighbor_idx;
} else if (d < best2) {
best3 = best2;
besti3 = besti2;
best2 = d;
besti2 = cur_neighbor_idx;
} else if (d < best3) {
best3 = d;
besti3 = cur_neighbor_idx;
}
}
if (besti2 == -1) {
besti2 = besti1;
best2 = best1;
}
if (besti3 == -1) {
besti3 = besti1;
best3 = best1;
}
cur_new_xyz_grid_dist2[0] = best1;
cur_new_xyz_grid_dist2[1] = best2;
cur_new_xyz_grid_dist2[2] = best3;
cur_new_xyz_grid_idxs[0] = besti1;
cur_new_xyz_grid_idxs[1] = besti2;
cur_new_xyz_grid_idxs[2] = besti3;
}
}
#endif
57 changes: 57 additions & 0 deletions mmcv/ops/csrc/pytorch/cuda/cudabind.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -1867,3 +1867,60 @@ REGISTER_DEVICE_IMPL(prroi_pool_forward_impl, CUDA, prroi_pool_forward_cuda);
REGISTER_DEVICE_IMPL(prroi_pool_backward_impl, CUDA, prroi_pool_backward_cuda);
REGISTER_DEVICE_IMPL(prroi_pool_coor_backward_impl, CUDA,
prroi_pool_coor_backward_cuda);

void StackQueryLocalNeighborIdxsCUDAKernelLauncher(
const Tensor support_xyz_tensor, const Tensor xyz_batch_cnt_tensor,
const Tensor new_xyz_tensor, const Tensor new_xyz_batch_cnt_tensor,
Tensor stack_neighbor_idxs_tensor, Tensor start_len_tensor,
Tensor cumsum_tensor, const int avg_length_of_neighbor_idxs,
const float max_neighbour_distance, const int nsample,
const int neighbor_type);

void stack_query_local_neighbor_idxs_cuda(
const Tensor support_xyz_tensor, const Tensor xyz_batch_cnt_tensor,
const Tensor new_xyz_tensor, const Tensor new_xyz_batch_cnt_tensor,
Tensor stack_neighbor_idxs_tensor, Tensor start_len_tensor,
Tensor cumsum_tensor, const int avg_length_of_neighbor_idxs,
const float max_neighbour_distance, const int nsample,
const int neighbor_type) {
StackQueryLocalNeighborIdxsCUDAKernelLauncher(
support_xyz_tensor, xyz_batch_cnt_tensor, new_xyz_tensor,
new_xyz_batch_cnt_tensor, stack_neighbor_idxs_tensor, start_len_tensor,
cumsum_tensor, avg_length_of_neighbor_idxs, max_neighbour_distance,
nsample, neighbor_type);
}

void stack_query_local_neighbor_idxs_impl(
const Tensor support_xyz_tensor, const Tensor xyz_batch_cnt_tensor,
const Tensor new_xyz_tensor, const Tensor new_xyz_batch_cnt_tensor,
Tensor stack_neighbor_idxs_tensor, Tensor start_len_tensor,
Tensor cumsum_tensor, const int avg_length_of_neighbor_idxs,
const float max_neighbour_distance, const int nsample,
const int neighbor_type);

void StackQueryThreeNNLocalIdxsCUDAKernelLauncher(
const Tensor support_xyz_tensor, const Tensor new_xyz_tensor,
const Tensor new_xyz_grid_centers_tensor, Tensor new_xyz_grid_idxs_tensor,
Tensor new_xyz_grid_dist2_tensor, Tensor stack_neighbor_idxs_tensor,
Tensor start_len_tensor, const int M, const int num_total_grids);

void stack_query_three_nn_local_idxs_cuda(
const Tensor support_xyz_tensor, const Tensor new_xyz_tensor,
const Tensor new_xyz_grid_centers_tensor, Tensor new_xyz_grid_idxs_tensor,
Tensor new_xyz_grid_dist2_tensor, Tensor stack_neighbor_idxs_tensor,
Tensor start_len_tensor, const int M, const int num_total_grids) {
StackQueryThreeNNLocalIdxsCUDAKernelLauncher(
support_xyz_tensor, new_xyz_tensor, new_xyz_grid_centers_tensor,
new_xyz_grid_idxs_tensor, new_xyz_grid_dist2_tensor,
stack_neighbor_idxs_tensor, start_len_tensor, M, num_total_grids);
}

void stack_query_three_nn_local_idxs_impl(
const Tensor support_xyz_tensor, const Tensor new_xyz_tensor,
const Tensor new_xyz_grid_centers_tensor, Tensor new_xyz_grid_idxs_tensor,
Tensor new_xyz_grid_dist2_tensor, Tensor stack_neighbor_idxs_tensor,
Tensor start_len_tensor, const int M, const int num_total_grids);
REGISTER_DEVICE_IMPL(stack_query_three_nn_local_idxs_impl, CUDA,
stack_query_three_nn_local_idxs_cuda);
REGISTER_DEVICE_IMPL(stack_query_local_neighbor_idxs_impl, CUDA,
stack_query_local_neighbor_idxs_cuda);
58 changes: 58 additions & 0 deletions mmcv/ops/csrc/pytorch/cuda/three_nn_vector_pool.cu
Original file line number Diff line number Diff line change
@@ -0,0 +1,58 @@
// Copyright (c) OpenMMLab. All rights reserved.
#include <math.h>
#include <stdio.h>
#include <stdlib.h>

#include "pytorch_cuda_helper.hpp"
#include "vector_pool.cuh"
#define DIVUP(m, n) ((m) / (n) + ((m) % (n) > 0))


void StackQueryLocalNeighborIdxsCUDAKernelLauncher(
const Tensor support_xyz_tensor, const Tensor xyz_batch_cnt_tensor,
const Tensor new_xyz_tensor, const Tensor new_xyz_batch_cnt_tensor,
Tensor stack_neighbor_idxs_tensor, Tensor start_len_tensor,
Tensor cumsum_tensor, const int avg_length_of_neighbor_idxs,
const float max_neighbour_distance, const int nsample,
const int neighbor_type) {
int batch_size = xyz_batch_cnt_tensor.size(0);
int M = new_xyz_tensor.size(0);
at::cuda::CUDAGuard device_guard(support_xyz_tensor.device());
cudaStream_t stream = at::cuda::getCurrentCUDAStream();

// blockIdx.x(col), blockIdx.y(row)
dim3 blocks(DIVUP(M, THREADS_PER_BLOCK));
dim3 threads(THREADS_PER_BLOCK);

query_stacked_local_neighbor_idxs_cuda_kernel<<<blocks, threads>>>(
support_xyz_tensor.data_ptr<float>(),
xyz_batch_cnt_tensor.data_ptr<int>(), new_xyz_tensor.data_ptr<float>(),
new_xyz_batch_cnt_tensor.data_ptr<int>(),
stack_neighbor_idxs_tensor.data_ptr<int>(),
start_len_tensor.data_ptr<int>(), cumsum_tensor.data_ptr<int>(),
avg_length_of_neighbor_idxs, max_neighbour_distance, batch_size, M,
nsample, neighbor_type);
AT_CUDA_CHECK(cudaGetLastError());
}

void StackQueryThreeNNLocalIdxsCUDAKernelLauncher(
const Tensor support_xyz_tensor, const Tensor new_xyz_tensor,
const Tensor new_xyz_grid_centers_tensor, Tensor new_xyz_grid_idxs_tensor,
Tensor new_xyz_grid_dist2_tensor, Tensor stack_neighbor_idxs_tensor,
Tensor start_len_tensor, const int M, const int num_total_grids) {
at::cuda::CUDAGuard device_guard(support_xyz_tensor.device());
cudaStream_t stream = at::cuda::getCurrentCUDAStream();

// blockIdx.x(col), blockIdx.y(row)
dim3 blocks(DIVUP(M, THREADS_PER_BLOCK), num_total_grids);
dim3 threads(THREADS_PER_BLOCK);

query_three_nn_by_stacked_local_idxs_cuda_kernel<<<blocks, threads>>>(
support_xyz_tensor.data_ptr<float>(), new_xyz_tensor.data_ptr<float>(),
new_xyz_grid_centers_tensor.data_ptr<float>(),
new_xyz_grid_idxs_tensor.data_ptr<int>(),
new_xyz_grid_dist2_tensor.data_ptr<float>(),
stack_neighbor_idxs_tensor.data_ptr<int>(),
start_len_tensor.data_ptr<int>(), M, num_total_grids);
AT_CUDA_CHECK(cudaGetLastError());
}
29 changes: 29 additions & 0 deletions mmcv/ops/csrc/pytorch/pybind.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -446,6 +446,20 @@ Tensor nms_quadri(const Tensor dets, const Tensor scores, const Tensor order,
const Tensor dets_sorted, const float iou_threshold,
const int multi_label);

void stack_query_local_neighbor_idxs(
const Tensor support_xyz_tensor, const Tensor xyz_batch_cnt_tensor,
const Tensor new_xyz_tensor, const Tensor new_xyz_batch_cnt_tensor,
Tensor stack_neighbor_idxs_tensor, Tensor start_len_tensor,
Tensor cumsum_tensor, const int avg_length_of_neighbor_idxs,
const float max_neighbour_distance, const int nsample,
const int neighbor_type);

void stack_query_three_nn_local_idxs(
const Tensor support_xyz_tensor, const Tensor new_xyz_tensor,
const Tensor new_xyz_grid_centers_tensor, Tensor new_xyz_grid_idxs_tensor,
Tensor new_xyz_grid_dist2_tensor, Tensor stack_neighbor_idxs_tensor,
Tensor start_len_tensor, const int M, const int num_total_grids);

PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) {
m.def("upfirdn2d", &upfirdn2d, "upfirdn2d (CUDA)", py::arg("input"),
py::arg("kernel"), py::arg("up_x"), py::arg("up_y"), py::arg("down_x"),
Expand Down Expand Up @@ -899,4 +913,19 @@ PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) {
py::arg("dets"), py::arg("scores"), py::arg("order"),
py::arg("dets_sorted"), py::arg("iou_threshold"),
py::arg("multi_label"));
m.def("stack_query_local_neighbor_idxs", &stack_query_local_neighbor_idxs,
"stack quert local neighbor indexes", py::arg("support_xyz_tensor"),
py::arg("xyz_batch_cnt_tensor"), py::arg("new_xyz_tensor"),
py::arg("new_xyz_batch_cnt_tensor"),
py::arg("stack_neighbor_idxs_tensor"), py::arg("start_len_tensor"),
py::arg("cumsum_tensor"), py::arg("avg_length_of_neighbor_idxs"),
py::arg("max_neighbour_distance"), py::arg("nsample"),
py::arg("neighbor_type"));
m.def("stack_query_three_nn_local_idxs", &stack_query_three_nn_local_idxs,
"stack quert three nn local indexes", py::arg("support_xyz_tensor"),
py::arg("new_xyz_tensor"), py::arg("new_xyz_grid_centers_tensor"),
py::arg("new_xyz_grid_idxs_tensor"),
py::arg("new_xyz_grid_dist2_tensor"),
py::arg("stack_neighbor_idxs_tensor"), py::arg("start_len_tensor"),
py::arg("M"), py::arg("num_total_grids"));
}
Loading