Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

feat(lidar_centerpoint): accelerated preprocessing for centerpoint #6989

Merged
Merged
Show file tree
Hide file tree
Changes from 3 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,8 @@
#ifndef LIDAR_CENTERPOINT__PREPROCESS__POINTCLOUD_DENSIFICATION_HPP_
#define LIDAR_CENTERPOINT__PREPROCESS__POINTCLOUD_DENSIFICATION_HPP_

#include <lidar_centerpoint/cuda_utils.hpp>
knzo25 marked this conversation as resolved.
Show resolved Hide resolved

#include <tf2_ros/buffer.h>
#include <tf2_ros/transform_listener.h>
#ifdef ROS_DISTRO_GALACTIC
Expand Down Expand Up @@ -48,7 +50,10 @@ class DensificationParam

struct PointCloudWithTransform
{
sensor_msgs::msg::PointCloud2 pointcloud_msg;
cuda::unique_ptr<float[]> points_d{nullptr};
std_msgs::msg::Header header;
size_t num_points{0};
size_t point_step{0};
Eigen::Affine3f affine_past2world;
};

Expand All @@ -58,7 +63,8 @@ class PointCloudDensification
explicit PointCloudDensification(const DensificationParam & param);

bool enqueuePointCloud(
const sensor_msgs::msg::PointCloud2 & input_pointcloud_msg, const tf2_ros::Buffer & tf_buffer);
const sensor_msgs::msg::PointCloud2 & input_pointcloud_msg, const tf2_ros::Buffer & tf_buffer,
cudaStream_t streams);
amadeuszsz marked this conversation as resolved.
Show resolved Hide resolved

double getCurrentTimestamp() const { return current_timestamp_; }
Eigen::Affine3f getAffineWorldToCurrent() const { return affine_world2current_; }
Expand All @@ -73,7 +79,8 @@ class PointCloudDensification
unsigned int pointcloud_cache_size() const { return param_.pointcloud_cache_size(); }

private:
void enqueue(const sensor_msgs::msg::PointCloud2 & msg, const Eigen::Affine3f & affine);
void enqueue(
const sensor_msgs::msg::PointCloud2 & msg, const Eigen::Affine3f & affine, cudaStream_t stream);
void dequeue();

DensificationParam param_;
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,10 @@

namespace centerpoint
{
cudaError_t generateSweepPoints_launch(
const float * input_points, size_t points_size, int input_point_step, float time_lag,
const float * transform, int num_features, float * output_points, cudaStream_t stream);

cudaError_t generateVoxels_random_launch(
const float * points, size_t points_size, float min_x_range, float max_x_range, float min_y_range,
float max_y_range, float min_z_range, float max_z_range, float pillar_x_size, float pillar_y_size,
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -31,10 +31,11 @@ class VoxelGeneratorTemplate
explicit VoxelGeneratorTemplate(
const DensificationParam & param, const CenterPointConfig & config);

virtual std::size_t generateSweepPoints(std::vector<float> & points) = 0;
virtual std::size_t generateSweepPoints(float * d_points, cudaStream_t stream) = 0;

bool enqueuePointCloud(
const sensor_msgs::msg::PointCloud2 & input_pointcloud_msg, const tf2_ros::Buffer & tf_buffer);
const sensor_msgs::msg::PointCloud2 & input_pointcloud_msg, const tf2_ros::Buffer & tf_buffer,
cudaStream_t stream);

protected:
std::unique_ptr<PointCloudDensification> pd_ptr_{nullptr};
Expand All @@ -50,7 +51,7 @@ class VoxelGenerator : public VoxelGeneratorTemplate
public:
using VoxelGeneratorTemplate::VoxelGeneratorTemplate;

std::size_t generateSweepPoints(std::vector<float> & points) override;
std::size_t generateSweepPoints(float * d_points, cudaStream_t stream) override;
};

} // namespace centerpoint
Expand Down
7 changes: 2 additions & 5 deletions perception/lidar_centerpoint/lib/centerpoint_trt.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -130,14 +130,11 @@ bool CenterPointTRT::detect(
bool CenterPointTRT::preprocess(
const sensor_msgs::msg::PointCloud2 & input_pointcloud_msg, const tf2_ros::Buffer & tf_buffer)
{
bool is_success = vg_ptr_->enqueuePointCloud(input_pointcloud_msg, tf_buffer);
bool is_success = vg_ptr_->enqueuePointCloud(input_pointcloud_msg, tf_buffer, stream_);
if (!is_success) {
return false;
}
const auto count = vg_ptr_->generateSweepPoints(points_);
CHECK_CUDA_ERROR(cudaMemcpyAsync(
points_d_.get(), points_.data(), count * config_.point_feature_size_ * sizeof(float),
cudaMemcpyHostToDevice, stream_));
const auto count = vg_ptr_->generateSweepPoints(points_d_.get(), stream_);
CHECK_CUDA_ERROR(cudaMemsetAsync(num_voxels_d_.get(), 0, sizeof(unsigned int), stream_));
CHECK_CUDA_ERROR(
cudaMemsetAsync(voxels_buffer_d_.get(), 0, voxels_buffer_size_ * sizeof(float), stream_));
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -61,7 +61,8 @@ PointCloudDensification::PointCloudDensification(const DensificationParam & para
}

bool PointCloudDensification::enqueuePointCloud(
const sensor_msgs::msg::PointCloud2 & pointcloud_msg, const tf2_ros::Buffer & tf_buffer)
const sensor_msgs::msg::PointCloud2 & pointcloud_msg, const tf2_ros::Buffer & tf_buffer,
cudaStream_t stream)
{
const auto header = pointcloud_msg.header;

Expand All @@ -73,9 +74,9 @@ bool PointCloudDensification::enqueuePointCloud(
}
auto affine_world2current = transformToEigen(transform_world2current.get());

enqueue(pointcloud_msg, affine_world2current);
enqueue(pointcloud_msg, affine_world2current, stream);
} else {
enqueue(pointcloud_msg, Eigen::Affine3f::Identity());
enqueue(pointcloud_msg, Eigen::Affine3f::Identity(), stream);
}

dequeue();
Expand All @@ -84,12 +85,24 @@ bool PointCloudDensification::enqueuePointCloud(
}

void PointCloudDensification::enqueue(
const sensor_msgs::msg::PointCloud2 & msg, const Eigen::Affine3f & affine_world2current)
const sensor_msgs::msg::PointCloud2 & msg, const Eigen::Affine3f & affine_world2current,
cudaStream_t stream)
{
affine_world2current_ = affine_world2current;
current_timestamp_ = rclcpp::Time(msg.header.stamp).seconds();
PointCloudWithTransform pointcloud = {msg, affine_world2current.inverse()};
pointcloud_cache_.push_front(pointcloud);

assert(sizeof(uint8_t) * msg.width * msg.height * msg.point_step % sizeof(float) == 0);
auto points_d = cuda::make_unique<float[]>(
sizeof(uint8_t) * msg.width * msg.height * msg.point_step / sizeof(float));
CHECK_CUDA_ERROR(cudaMemcpyAsync(
points_d.get(), msg.data.data(), sizeof(uint8_t) * msg.width * msg.height * msg.point_step,
cudaMemcpyHostToDevice, stream));
kminoda marked this conversation as resolved.
Show resolved Hide resolved

PointCloudWithTransform pointcloud = {
std::move(points_d), msg.header, msg.width * msg.height, msg.point_step,
affine_world2current.inverse()};

pointcloud_cache_.push_front(std::move(pointcloud));
}

void PointCloudDensification::dequeue()
Expand Down
49 changes: 47 additions & 2 deletions perception/lidar_centerpoint/lib/preprocess/preprocess_kernel.cu
Original file line number Diff line number Diff line change
Expand Up @@ -28,10 +28,11 @@
* limitations under the License.
*/

#include "lidar_centerpoint/preprocess/preprocess_kernel.hpp"

#include <lidar_centerpoint/cuda_utils.hpp>
#include <lidar_centerpoint/preprocess/preprocess_kernel.hpp>
#include <lidar_centerpoint/utils.hpp>

#include <cassert>
namespace
{
const std::size_t MAX_POINT_IN_VOXEL_SIZE = 32; // the same as max_point_in_voxel_size_ in config
Expand All @@ -41,6 +42,50 @@ const std::size_t ENCODER_IN_FEATURE_SIZE = 9; // the same as encoder_in_featur

namespace centerpoint
{

__global__ void generateSweepPoints_kernel(
const float * input_points, size_t points_size, int input_point_step, float time_lag,
const float * transform_array, int num_features, float * output_points)
{
int point_idx = blockIdx.x * blockDim.x + threadIdx.x;
if (point_idx >= points_size) return;

const float input_x = input_points[point_idx * input_point_step + 0];
const float input_y = input_points[point_idx * input_point_step + 1];
const float input_z = input_points[point_idx * input_point_step + 2];

output_points[point_idx * num_features + 0] = transform_array[0] * input_x +
transform_array[1] * input_y +
transform_array[2] * input_z + transform_array[3];
output_points[point_idx * num_features + 1] = transform_array[4] * input_x +
transform_array[5] * input_y +
transform_array[6] * input_z + transform_array[7];
output_points[point_idx * num_features + 2] = transform_array[8] * input_x +
transform_array[9] * input_y +
transform_array[10] * input_z + transform_array[11];
output_points[point_idx * num_features + 3] = time_lag;
}

cudaError_t generateSweepPoints_launch(
const float * input_points, size_t points_size, int input_point_step, float time_lag,
const float * transform_array, int num_features, float * output_points, cudaStream_t stream)
{
auto transform_d = cuda::make_unique<float[]>(16);
CHECK_CUDA_ERROR(cudaMemcpyAsync(
transform_d.get(), transform_array, 16 * sizeof(float), cudaMemcpyHostToDevice, stream));

dim3 blocks((points_size + 256 - 1) / 256);
dim3 threads(256);
assert(num_features == 4);

generateSweepPoints_kernel<<<blocks, threads, 0, stream>>>(
input_points, points_size, input_point_step, time_lag, transform_d.get(), num_features,
output_points);

cudaError_t err = cudaGetLastError();
return err;
}

__global__ void generateVoxels_random_kernel(
const float * points, size_t points_size, float min_x_range, float max_x_range, float min_y_range,
float max_y_range, float min_z_range, float max_z_range, float pillar_x_size, float pillar_y_size,
Expand Down
33 changes: 15 additions & 18 deletions perception/lidar_centerpoint/lib/preprocess/voxel_generator.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,8 @@

#include "lidar_centerpoint/preprocess/voxel_generator.hpp"

#include <lidar_centerpoint/preprocess/preprocess_kernel.hpp>

#include <sensor_msgs/point_cloud2_iterator.hpp>

namespace centerpoint
Expand All @@ -38,35 +40,30 @@ VoxelGeneratorTemplate::VoxelGeneratorTemplate(
}

bool VoxelGeneratorTemplate::enqueuePointCloud(
const sensor_msgs::msg::PointCloud2 & input_pointcloud_msg, const tf2_ros::Buffer & tf_buffer)
const sensor_msgs::msg::PointCloud2 & input_pointcloud_msg, const tf2_ros::Buffer & tf_buffer,
cudaStream_t stream)
{
return pd_ptr_->enqueuePointCloud(input_pointcloud_msg, tf_buffer);
return pd_ptr_->enqueuePointCloud(input_pointcloud_msg, tf_buffer, stream);
}

std::size_t VoxelGenerator::generateSweepPoints(std::vector<float> & points)
std::size_t VoxelGenerator::generateSweepPoints(float * points_d, cudaStream_t stream)
{
Eigen::Vector3f point_current, point_past;
size_t point_counter{};
size_t point_counter = 0;
for (auto pc_cache_iter = pd_ptr_->getPointCloudCacheIter(); !pd_ptr_->isCacheEnd(pc_cache_iter);
amadeuszsz marked this conversation as resolved.
Show resolved Hide resolved
pc_cache_iter++) {
auto pc_msg = pc_cache_iter->pointcloud_msg;
auto sweep_num_points = pc_cache_iter->num_points;
auto point_step = pc_cache_iter->point_step;
auto affine_past2current =
pd_ptr_->getAffineWorldToCurrent() * pc_cache_iter->affine_past2world;
float time_lag = static_cast<float>(
pd_ptr_->getCurrentTimestamp() - rclcpp::Time(pc_msg.header.stamp).seconds());
pd_ptr_->getCurrentTimestamp() - rclcpp::Time(pc_cache_iter->header.stamp).seconds());

for (sensor_msgs::PointCloud2ConstIterator<float> x_iter(pc_msg, "x"), y_iter(pc_msg, "y"),
z_iter(pc_msg, "z");
x_iter != x_iter.end(); ++x_iter, ++y_iter, ++z_iter) {
point_past << *x_iter, *y_iter, *z_iter;
point_current = affine_past2current * point_past;
generateSweepPoints_launch(
pc_cache_iter->points_d.get(), sweep_num_points, point_step / sizeof(float), time_lag,
affine_past2current.matrix().data(), config_.point_feature_size_, points_d + point_counter,
amadeuszsz marked this conversation as resolved.
Show resolved Hide resolved
stream);

points.at(point_counter * config_.point_feature_size_) = point_current.x();
points.at(point_counter * config_.point_feature_size_ + 1) = point_current.y();
points.at(point_counter * config_.point_feature_size_ + 2) = point_current.z();
points.at(point_counter * config_.point_feature_size_ + 3) = time_lag;
++point_counter;
}
point_counter += sweep_num_points;
}
return point_counter;
}
Expand Down
Loading