From 8752e454fac3214539a694536a2661f0cd14c337 Mon Sep 17 00:00:00 2001 From: Kris Thielemans Date: Fri, 17 May 2024 22:19:21 +0100 Subject: [PATCH] Avoid image copies in Parallelproj if data is contiguous, we don't need an extra copy. --- .../BackProjectorByBinParallelproj.cxx | 30 +++++++++++++++---- .../ForwardProjectorByBinParallelproj.cxx | 24 +++++++++++---- 2 files changed, 43 insertions(+), 11 deletions(-) diff --git a/src/recon_buildblock/Parallelproj_projector/BackProjectorByBinParallelproj.cxx b/src/recon_buildblock/Parallelproj_projector/BackProjectorByBinParallelproj.cxx index 2589c62b4d..474372018a 100644 --- a/src/recon_buildblock/Parallelproj_projector/BackProjectorByBinParallelproj.cxx +++ b/src/recon_buildblock/Parallelproj_projector/BackProjectorByBinParallelproj.cxx @@ -133,8 +133,19 @@ TOF_transpose(std::vector& mem_for_PP_back, void BackProjectorByBinParallelproj::get_output(DiscretisedDensity<3, float>& density) const { + std::vector image_vec; + float* image_ptr; + if (_density_sptr->is_contiguous()) + { + image_ptr = _density_sptr->get_full_data_ptr(); + } + else + { + image_vec.resize(density.size_all()); + std::copy(_density_sptr->begin_all(), _density_sptr->end_all(), image_vec.begin()); + image_ptr = image_vec.data(); + } - std::vector image_vec(density.size_all()); // create an alias for the projection data const ProjDataInMemory& p(*_proj_data_to_backproject_sptr); @@ -149,7 +160,7 @@ BackProjectorByBinParallelproj::get_output(DiscretisedDensity<3, float>& density long long offset = 0; // send image to all visible CUDA devices - float** image_on_cuda_devices = copy_float_array_to_all_devices(image_vec.data(), _helper->num_image_voxel); + float** image_on_cuda_devices = copy_float_array_to_all_devices(image_ptr, _helper->num_image_voxel); // do (chuck-wise) back projection on the CUDA devices for (int chunk_num = 0; chunk_num < _num_gpu_chunks; chunk_num++) @@ -211,7 +222,7 @@ BackProjectorByBinParallelproj::get_output(DiscretisedDensity<3, float>& density sum_float_arrays_on_first_device(image_on_cuda_devices, _helper->num_image_voxel); // copy summed image back to host - get_float_array_from_device(image_on_cuda_devices, _helper->num_image_voxel, 0, image_vec.data()); + get_float_array_from_device(image_on_cuda_devices, _helper->num_image_voxel, 0, image_ptr); // free image array from CUDA devices free_float_array_on_all_devices(image_on_cuda_devices); @@ -226,7 +237,7 @@ BackProjectorByBinParallelproj::get_output(DiscretisedDensity<3, float>& density joseph3d_back_tof_sino(_helper->xend.data(), _helper->xstart.data(), - image_vec.data(), + image_ptr, _helper->origin.data(), _helper->voxsize.data(), mem_for_PP_back.data(), @@ -245,7 +256,7 @@ BackProjectorByBinParallelproj::get_output(DiscretisedDensity<3, float>& density { joseph3d_back(_helper->xstart.data(), _helper->xend.data(), - image_vec.data(), + image_ptr, _helper->origin.data(), _helper->voxsize.data(), p.get_const_data_ptr(), @@ -260,7 +271,14 @@ BackProjectorByBinParallelproj::get_output(DiscretisedDensity<3, float>& density // --------------------------------------------------------------- // // Parallelproj -> STIR image conversion // --------------------------------------------------------------- // - std::copy(image_vec.begin(), image_vec.end(), density.begin_all()); + if (_density_sptr->is_contiguous()) + { + _density_sptr->release_full_data_ptr(); + } + else + { + std::copy(image_vec.begin(), image_vec.end(), density.begin_all()); + } // After the back projection, we enforce a truncation outside of the FOV. // This is because the parallelproj projector seems to have some trouble at the edges and this diff --git a/src/recon_buildblock/Parallelproj_projector/ForwardProjectorByBinParallelproj.cxx b/src/recon_buildblock/Parallelproj_projector/ForwardProjectorByBinParallelproj.cxx index 9c445a9d1f..ac4a3704c1 100644 --- a/src/recon_buildblock/Parallelproj_projector/ForwardProjectorByBinParallelproj.cxx +++ b/src/recon_buildblock/Parallelproj_projector/ForwardProjectorByBinParallelproj.cxx @@ -154,8 +154,18 @@ ForwardProjectorByBinParallelproj::set_input(const DiscretisedDensity<3, float>& truncate_rim(*_density_sptr, static_cast(std::max((image_radius - radius) / _helper->voxsize[2], 0.F))); } - std::vector image_vec(density.size_all()); - std::copy(_density_sptr->begin_all(), _density_sptr->end_all(), image_vec.begin()); + std::vector image_vec; + float* image_ptr; + if (_density_sptr->is_contiguous()) + { + image_ptr = _density_sptr->get_full_data_ptr(); + } + else + { + image_vec.resize(density.size_all()); + std::copy(_density_sptr->begin_all(), _density_sptr->end_all(), image_vec.begin()); + image_ptr = image_vec.data(); + } #if 0 // needed to set output to zero as parallelproj accumulates but is no longer the case @@ -174,7 +184,7 @@ ForwardProjectorByBinParallelproj::set_input(const DiscretisedDensity<3, float>& long long offset = 0; // send image to all visible CUDA devices - float** image_on_cuda_devices = copy_float_array_to_all_devices(image_vec.data(), _helper->num_image_voxel); + float** image_on_cuda_devices = copy_float_array_to_all_devices(image_ptr, _helper->num_image_voxel); // do (chuck-wise) projection on the CUDA devices for (int chunk_num = 0; chunk_num < _num_gpu_chunks; chunk_num++) @@ -246,7 +256,7 @@ ForwardProjectorByBinParallelproj::set_input(const DiscretisedDensity<3, float>& std::vector mem_for_PP(_helper->num_lors * _helper->num_tof_bins); joseph3d_fwd_tof_sino(_helper->xend.data(), _helper->xstart.data(), - image_vec.data(), + image_ptr, _helper->origin.data(), _helper->voxsize.data(), mem_for_PP.data(), @@ -268,7 +278,7 @@ ForwardProjectorByBinParallelproj::set_input(const DiscretisedDensity<3, float>& { joseph3d_fwd(_helper->xstart.data(), _helper->xend.data(), - image_vec.data(), + image_ptr, _helper->origin.data(), _helper->voxsize.data(), _projected_data_sptr->get_data_ptr(), @@ -278,6 +288,10 @@ ForwardProjectorByBinParallelproj::set_input(const DiscretisedDensity<3, float>& #endif info("done", 2); + if (_density_sptr->is_contiguous()) + { + _density_sptr->release_full_data_ptr(); + } _projected_data_sptr->release_data_ptr(); }