Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

GPUvsCPU: updates to modules and workflows for pixel reconstruction #37617

Merged
merged 2 commits into from
May 3, 2022
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
142 changes: 119 additions & 23 deletions CUDADataFormats/TrackingRecHit/interface/TrackingRecHit2DHeterogeneous.h
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,27 @@
template <typename Traits>
class TrackingRecHit2DHeterogeneous {
public:
// Slot index of each 32-bit quantity packed in m_store32.
// A slot `s` starts at m_store32 + static_cast<int>(s) * nHits (see the get32
// helpers in the constructors). Values are consecutive, starting from 0.
enum class Storage32 {
  kXLocal = 0,  // 0: m_xl
  kYLocal,      // 1: m_yl
  kXerror,      // 2: m_xerr
  kYerror,      // 3: m_yerr
  kCharge,      // 4: m_chargeAndStatus (accessed as uint32_t)
  kXGlobal,     // 5: m_xg
  kYGlobal,     // 6: m_yg
  kZGlobal,     // 7: m_zg
  kRGlobal,     // 8: m_rg
  kPhiStorage,  // 9: m_phiBinnerStorage
  kLayers       // 10: m_hitsLayerStart; statically asserted to equal n32
};

// Slot index of each 16-bit quantity packed in m_store16.
// A slot `s` starts at m_store16 + static_cast<int>(s) * nHits (see the get16
// helpers in the constructors). Values are consecutive, starting from 0.
enum class Storage16 {
  kDetId = 0,  // 0: m_detInd
  kPhi,        // 1: m_iphi (accessed as int16_t)
  kXSize,      // 2: m_xsize (accessed as int16_t)
  kYSize       // 3: m_ysize (accessed as int16_t)
};

template <typename T>
using unique_ptr = typename Traits::template unique_ptr<T>;

Expand All @@ -24,6 +45,8 @@ class TrackingRecHit2DHeterogeneous {
cudaStream_t stream,
TrackingRecHit2DHeterogeneous<cms::cudacompat::GPUTraits> const* input = nullptr);

explicit TrackingRecHit2DHeterogeneous(
float* store32, uint16_t* store16, uint32_t* modules, int nHits, cudaStream_t stream = nullptr);
~TrackingRecHit2DHeterogeneous() = default;

TrackingRecHit2DHeterogeneous(const TrackingRecHit2DHeterogeneous&) = delete;
Expand All @@ -44,18 +67,21 @@ class TrackingRecHit2DHeterogeneous {
auto phiBinnerStorage() { return m_phiBinnerStorage; }
auto iphi() { return m_iphi; }

// only the local coord and detector index
cms::cuda::host::unique_ptr<float[]> localCoordToHostAsync(cudaStream_t stream) const;

cms::cuda::host::unique_ptr<uint32_t[]> hitsModuleStartToHostAsync(cudaStream_t stream) const;

cms::cuda::host::unique_ptr<uint16_t[]> store16ToHostAsync(cudaStream_t stream) const;
cms::cuda::host::unique_ptr<float[]> store32ToHostAsync(cudaStream_t stream) const;

// needs specialization for Host
void copyFromGPU(TrackingRecHit2DHeterogeneous<cms::cudacompat::GPUTraits> const* input, cudaStream_t stream);

private:
static constexpr uint32_t n16 = 4; // number of elements in m_store16
static constexpr uint32_t n32 = 10; // number of elements in m_store32
static_assert(sizeof(uint32_t) == sizeof(float)); // just stating the obvious

static_assert(n32 == static_cast<uint32_t>(Storage32::kLayers));
unique_ptr<uint16_t[]> m_store16; //!
unique_ptr<float[]> m_store32; //!

Expand Down Expand Up @@ -108,7 +134,7 @@ TrackingRecHit2DHeterogeneous<Traits>::TrackingRecHit2DHeterogeneous(

// if empty do not bother
if (0 == nHits) {
if constexpr (std::is_same<Traits, cms::cudacompat::GPUTraits>::value) {
if constexpr (std::is_same_v<Traits, cms::cudacompat::GPUTraits>) {
cms::cuda::copyAsync(m_view, view, stream);
} else {
m_view.reset(view.release()); // NOLINT: std::move() breaks CUDA version
Expand All @@ -123,7 +149,7 @@ TrackingRecHit2DHeterogeneous<Traits>::TrackingRecHit2DHeterogeneous(
// so unless proven VERY inefficient we keep it ordered as generated

// host copy is "reduced" (to be reviewed at some point)
if constexpr (std::is_same<Traits, cms::cudacompat::HostTraits>::value) {
if constexpr (std::is_same_v<Traits, cms::cudacompat::HostTraits>) {
// it has to compile for ALL cases
copyFromGPU(input, stream);
} else {
Expand All @@ -139,43 +165,113 @@ TrackingRecHit2DHeterogeneous<Traits>::TrackingRecHit2DHeterogeneous(
static_assert(sizeof(TrackingRecHit2DSOAView::hindex_type) == sizeof(float));
static_assert(sizeof(TrackingRecHit2DSOAView::hindex_type) == sizeof(TrackingRecHit2DSOAView::PhiBinner::index_type));

auto get32 = [&](int i) { return m_store32.get() + i * nHits; };
auto get32 = [&](Storage32 i) { return m_store32.get() + static_cast<int>(i) * nHits; };

// copy all the pointers
m_phiBinner = view->m_phiBinner = m_PhiBinnerStore.get();
m_phiBinnerStorage = view->m_phiBinnerStorage =
reinterpret_cast<TrackingRecHit2DSOAView::PhiBinner::index_type*>(get32(9));
reinterpret_cast<TrackingRecHit2DSOAView::PhiBinner::index_type*>(get32(Storage32::kPhiStorage));

view->m_xl = get32(0);
view->m_yl = get32(1);
view->m_xerr = get32(2);
view->m_yerr = get32(3);
view->m_chargeAndStatus = reinterpret_cast<uint32_t*>(get32(4));
view->m_xl = get32(Storage32::kXLocal);
view->m_yl = get32(Storage32::kYLocal);
view->m_xerr = get32(Storage32::kXerror);
view->m_yerr = get32(Storage32::kYerror);
view->m_chargeAndStatus = reinterpret_cast<uint32_t*>(get32(Storage32::kCharge));

if constexpr (!std::is_same<Traits, cms::cudacompat::HostTraits>::value) {
if constexpr (!std::is_same_v<Traits, cms::cudacompat::HostTraits>) {
assert(input == nullptr);
view->m_xg = get32(5);
view->m_yg = get32(6);
view->m_zg = get32(7);
view->m_rg = get32(8);
view->m_xg = get32(Storage32::kXGlobal);
view->m_yg = get32(Storage32::kYGlobal);
view->m_zg = get32(Storage32::kZGlobal);
view->m_rg = get32(Storage32::kRGlobal);

auto get16 = [&](int i) { return m_store16.get() + i * nHits; };
m_iphi = view->m_iphi = reinterpret_cast<int16_t*>(get16(1));
auto get16 = [&](Storage16 i) { return m_store16.get() + static_cast<int>(i) * nHits; };
m_iphi = view->m_iphi = reinterpret_cast<int16_t*>(get16(Storage16::kPhi));

view->m_xsize = reinterpret_cast<int16_t*>(get16(2));
view->m_ysize = reinterpret_cast<int16_t*>(get16(3));
view->m_detInd = get16(0);
view->m_xsize = reinterpret_cast<int16_t*>(get16(Storage16::kXSize));
view->m_ysize = reinterpret_cast<int16_t*>(get16(Storage16::kYSize));
view->m_detInd = get16(Storage16::kDetId);

m_phiBinner = view->m_phiBinner = m_PhiBinnerStore.get();
m_hitsLayerStart = view->m_hitsLayerStart = reinterpret_cast<uint32_t*>(get32(n32));
m_hitsLayerStart = view->m_hitsLayerStart = reinterpret_cast<uint32_t*>(get32(Storage32::kLayers));
}

// transfer view
if constexpr (std::is_same<Traits, cms::cudacompat::GPUTraits>::value) {
if constexpr (std::is_same_v<Traits, cms::cudacompat::GPUTraits>) {
cms::cuda::copyAsync(m_view, view, stream);
} else {
m_view.reset(view.release()); // NOLINT: std::move() breaks CUDA version
}
}

// Builds the container from two flat, already filled buffers.
//
//   store32 : nHits * n32 floats, laid out slot by slot as described by Storage32
//   store16 : nHits * n16 uint16_t, laid out slot by slot as described by Storage16
//   modules : pointer stored as-is in m_hitsModuleStart (not copied here)
//   nHits   : number of hits in each slot
//   stream  : forwarded to the Traits allocators and to copyAsync
//
// Both stores are copied into buffers owned by this object and the view
// pointers are set to the individual slots of those copies.
// Meant for the CPU SoA, but written so that it is available for every Traits.
// NOTE(review): m_hitsLayerStart is not set by this constructor and the 32-bit
// store is allocated without room for the kLayers slot - confirm that consumers
// of objects built this way do not need it.
template <typename Traits>
TrackingRecHit2DHeterogeneous<Traits>::TrackingRecHit2DHeterogeneous(
    float* store32, uint16_t* store16, uint32_t* modules, int nHits, cudaStream_t stream)
    : m_nHits(nHits), m_hitsModuleStart(modules) {
  constexpr bool kIsGPU = std::is_same_v<Traits, cms::cudacompat::GPUTraits>;

  // the view is filled on the host first, then handed over to m_view
  auto hostView = Traits::template make_host_unique<TrackingRecHit2DSOAView>(stream);
  m_view = Traits::template make_unique<TrackingRecHit2DSOAView>(stream);
  hostView->m_nHits = nHits;

  // nothing to store: publish the (almost empty) view and stop here
  if (nHits == 0) {
    if constexpr (kIsGPU) {
      cms::cuda::copyAsync(m_view, hostView, stream);
    } else {
      m_view = std::move(hostView);
    }
    return;
  }

  // owned storage
  m_store16 = Traits::template make_unique<uint16_t[]>(nHits * n16, stream);
  m_store32 = Traits::template make_unique<float[]>(nHits * n32, stream);
  m_PhiBinnerStore = Traits::template make_unique<TrackingRecHit2DSOAView::PhiBinner>(stream);
  m_AverageGeometryStore = Traits::template make_unique<TrackingRecHit2DSOAView::AverageGeometry>(stream);

  hostView->m_averageGeometry = m_AverageGeometryStore.get();
  hostView->m_hitsModuleStart = m_hitsModuleStart;

  // bring the caller's data into the owned storage
  if constexpr (kIsGPU) {
    cms::cuda::copyAsync(m_store16, store16, stream);
    cms::cuda::copyAsync(m_store32, store32, stream);
  } else {
    std::copy(store32, store32 + nHits * n32, m_store32.get());
    std::copy(store16, store16 + nHits * n16, m_store16.get());
  }

  // start address of a given slot inside the owned storage
  auto slot32 = [&](Storage32 s) { return m_store32.get() + static_cast<int>(s) * nHits; };
  auto slot16 = [&](Storage16 s) { return m_store16.get() + static_cast<int>(s) * nHits; };

  // 16-bit quantities
  hostView->m_detInd = slot16(Storage16::kDetId);
  m_iphi = hostView->m_iphi = reinterpret_cast<int16_t*>(slot16(Storage16::kPhi));
  hostView->m_xsize = reinterpret_cast<int16_t*>(slot16(Storage16::kXSize));
  hostView->m_ysize = reinterpret_cast<int16_t*>(slot16(Storage16::kYSize));

  // 32-bit quantities: local coordinates, errors and charge
  hostView->m_xl = slot32(Storage32::kXLocal);
  hostView->m_yl = slot32(Storage32::kYLocal);
  hostView->m_xerr = slot32(Storage32::kXerror);
  hostView->m_yerr = slot32(Storage32::kYerror);
  hostView->m_chargeAndStatus = reinterpret_cast<uint32_t*>(slot32(Storage32::kCharge));

  // 32-bit quantities: global coordinates
  hostView->m_xg = slot32(Storage32::kXGlobal);
  hostView->m_yg = slot32(Storage32::kYGlobal);
  hostView->m_zg = slot32(Storage32::kZGlobal);
  hostView->m_rg = slot32(Storage32::kRGlobal);

  // phi binner and its index storage
  m_phiBinner = hostView->m_phiBinner = m_PhiBinnerStore.get();
  m_phiBinnerStorage = hostView->m_phiBinnerStorage =
      reinterpret_cast<TrackingRecHit2DSOAView::PhiBinner::index_type*>(slot32(Storage32::kPhiStorage));

  // publish the fully populated view
  if constexpr (kIsGPU) {
    cms::cuda::copyAsync(m_view, hostView, stream);
  } else {
    m_view = std::move(hostView);
  }
}

#endif // CUDADataFormats_TrackingRecHit_interface_TrackingRecHit2DHeterogeneous_h
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,20 @@ cms::cuda::host::unique_ptr<float[]> TrackingRecHit2DGPU::localCoordToHostAsync(
return ret;
}

// GPU specialisation: allocates a host buffer of n32 * nHits() floats and
// enqueues on `stream` a copy of the whole m_store32 into it.
// NOTE(review): the copy is issued through copyAsync, so presumably the caller
// has to wait for `stream` before reading the returned buffer - confirm.
template <>
cms::cuda::host::unique_ptr<float[]> TrackingRecHit2DGPU::store32ToHostAsync(cudaStream_t stream) const {
  auto const nElements = static_cast<int>(n32) * nHits();
  auto hostCopy = cms::cuda::make_host_unique<float[]>(nElements, stream);
  cms::cuda::copyAsync(hostCopy, m_store32, nElements, stream);
  return hostCopy;
}

// GPU specialisation: allocates a host buffer of n16 * nHits() uint16_t and
// enqueues on `stream` a copy of the whole m_store16 into it.
// NOTE(review): the copy is issued through copyAsync, so presumably the caller
// has to wait for `stream` before reading the returned buffer - confirm.
template <>
cms::cuda::host::unique_ptr<uint16_t[]> TrackingRecHit2DGPU::store16ToHostAsync(cudaStream_t stream) const {
  auto const nElements = static_cast<int>(n16) * nHits();
  auto hostCopy = cms::cuda::make_host_unique<uint16_t[]>(nElements, stream);
  cms::cuda::copyAsync(hostCopy, m_store16, nElements, stream);
  return hostCopy;
}

template <>
cms::cuda::host::unique_ptr<uint32_t[]> TrackingRecHit2DGPU::hitsModuleStartToHostAsync(cudaStream_t stream) const {
auto ret = cms::cuda::make_host_unique<uint32_t[]>(nMaxModules() + 1, stream);
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,91 @@
#include <cuda_runtime.h>

#include <fmt/printf.h>

#include "CUDADataFormats/Common/interface/HostProduct.h"
#include "CUDADataFormats/Common/interface/Product.h"
#include "CUDADataFormats/TrackingRecHit/interface/TrackingRecHit2DHeterogeneous.h"
#include "DataFormats/Common/interface/DetSetVectorNew.h"
#include "DataFormats/Common/interface/Handle.h"
#include "DataFormats/SiPixelCluster/interface/SiPixelCluster.h"
#include "DataFormats/TrackerRecHit2D/interface/SiPixelRecHitCollection.h"
#include "FWCore/Framework/interface/Event.h"
#include "FWCore/Framework/interface/EventSetup.h"
#include "FWCore/Framework/interface/MakerMacros.h"
#include "FWCore/Framework/interface/stream/EDProducer.h"
#include "FWCore/MessageLogger/interface/MessageLogger.h"
#include "FWCore/ParameterSet/interface/ConfigurationDescriptions.h"
#include "FWCore/ParameterSet/interface/ParameterSet.h"
#include "FWCore/ParameterSet/interface/ParameterSetDescription.h"
#include "FWCore/Utilities/interface/InputTag.h"
#include "Geometry/CommonDetUnit/interface/PixelGeomDetUnit.h"
#include "Geometry/Records/interface/TrackerDigiGeometryRecord.h"
#include "Geometry/TrackerGeometryBuilder/interface/TrackerGeometry.h"
#include "HeterogeneousCore/CUDACore/interface/ScopedContext.h"
#include "RecoLocalTracker/SiPixelRecHits/interface/pixelCPEforGPU.h"

/// Stream producer that takes the pixel rechit SoA wrapped in a
/// cms::cuda::Product<TrackingRecHit2DGPU> and puts into the event
///  - a TrackingRecHit2DCPU built from host copies of the 32-bit and 16-bit stores;
///  - a HostProduct<uint32_t[]> holding a copy of the hitsModuleStart array.
///
/// The host copies are requested in acquire() and consumed in produce(); the
/// data members below carry the state between the two calls.
class SiPixelRecHitSoAFromCUDA : public edm::stream::EDProducer<edm::ExternalWork> {
public:
  explicit SiPixelRecHitSoAFromCUDA(const edm::ParameterSet& iConfig);
  ~SiPixelRecHitSoAFromCUDA() override = default;

  static void fillDescriptions(edm::ConfigurationDescriptions& descriptions);
  using HMSstorage = HostProduct<uint32_t[]>;

private:
  void acquire(edm::Event const& iEvent,
               edm::EventSetup const& iSetup,
               edm::WaitingTaskWithArenaHolder waitingTaskHolder) override;
  void produce(edm::Event& iEvent, edm::EventSetup const& iSetup) override;

  const edm::EDGetTokenT<cms::cuda::Product<TrackingRecHit2DGPU>> hitsTokenGPU_;  // CUDA hits
  const edm::EDPutTokenT<TrackingRecHit2DCPU> hitsPutTokenCPU_;
  const edm::EDPutTokenT<HMSstorage> hostPutToken_;

  // Per-event state filled by acquire() and read by produce().
  // Zero-initialised so that produce() never reads an indeterminate value,
  // e.g. when acquire() returns early because the event has no hits.
  uint32_t nHits_ = 0;
  uint32_t nMaxModules_ = 0;

  // Host-side buffers receiving the copies requested in acquire().
  cms::cuda::host::unique_ptr<float[]> store32_;
  cms::cuda::host::unique_ptr<uint16_t[]> store16_;
  cms::cuda::host::unique_ptr<uint32_t[]> hitsModuleStart_;
};

// Sets up the three tokens of the module:
//  - hitsTokenGPU_    : consumes the cms::cuda::Product<TrackingRecHit2DGPU> whose
//                       InputTag is read from the "pixelRecHitSrc" parameter;
//  - hitsPutTokenCPU_ : produces the TrackingRecHit2DCPU;
//  - hostPutToken_    : produces the HMSstorage (HostProduct<uint32_t[]>).
// The body is empty: all the work is done in the initializer list.
SiPixelRecHitSoAFromCUDA::SiPixelRecHitSoAFromCUDA(const edm::ParameterSet& iConfig)
: hitsTokenGPU_(
consumes<cms::cuda::Product<TrackingRecHit2DGPU>>(iConfig.getParameter<edm::InputTag>("pixelRecHitSrc"))),
hitsPutTokenCPU_(produces<TrackingRecHit2DCPU>()),
hostPutToken_(produces<HMSstorage>()) {}

// Describes the configuration of the module: a single InputTag parameter,
// "pixelRecHitSrc", defaulting to "siPixelRecHitsPreSplittingCUDA".
void SiPixelRecHitSoAFromCUDA::fillDescriptions(edm::ConfigurationDescriptions& descriptions) {
  const edm::InputTag defaultHitSource("siPixelRecHitsPreSplittingCUDA");

  edm::ParameterSetDescription desc;
  desc.add<edm::InputTag>("pixelRecHitSrc", defaultHitSource);
  descriptions.addWithDefaultLabel(desc);
}

// First half of the ExternalWork pair: reads the GPU hit collection and
// requests the host copies that produce() will consume.
//
// nHits_ and nMaxModules_ are both recorded before the empty-event early
// return: produce() sizes its output buffer from nMaxModules_ for every event,
// so it must not be left at the value of a previous event (or unset) when the
// current event has no hits.
// For events without hits no copy is requested and the host buffers are left
// untouched.
void SiPixelRecHitSoAFromCUDA::acquire(edm::Event const& iEvent,
                                       edm::EventSetup const& iSetup,
                                       edm::WaitingTaskWithArenaHolder waitingTaskHolder) {
  cms::cuda::Product<TrackingRecHit2DGPU> const& inputDataWrapped = iEvent.get(hitsTokenGPU_);
  cms::cuda::ScopedContextAcquire ctx{inputDataWrapped, std::move(waitingTaskHolder)};
  auto const& inputData = ctx.get(inputDataWrapped);

  nHits_ = inputData.nHits();
  nMaxModules_ = inputData.nMaxModules();
  LogDebug("SiPixelRecHitSoAFromCUDA") << "copying to cpu SoA" << inputData.nHits() << " Hits";

  // nothing to transfer for an empty collection
  if (0 == nHits_)
    return;

  // enqueue the copies on the stream of the acquire context
  store32_ = inputData.store32ToHostAsync(ctx.stream());
  store16_ = inputData.store16ToHostAsync(ctx.stream());
  hitsModuleStart_ = inputData.hitsModuleStartToHostAsync(ctx.stream());
}

// Second half of the ExternalWork pair: puts into the event
//  - a HostProduct<uint32_t[]> with nMaxModules_ + 1 entries copied from
//    hitsModuleStart_;
//  - a TrackingRecHit2DCPU built from store32_, store16_, hitsModuleStart_ and
//    nHits_.
//
// acquire() does not fill hitsModuleStart_ when the event has no hits, so the
// copy is done only for nHits_ > 0; otherwise the output array keeps the zeros
// it is value-initialised with by std::make_unique, instead of being filled
// from a null or stale buffer.
void SiPixelRecHitSoAFromCUDA::produce(edm::Event& iEvent, edm::EventSetup const& es) {
  auto hmsp = std::make_unique<uint32_t[]>(nMaxModules_ + 1);

  if (nHits_ > 0) {
    std::copy(hitsModuleStart_.get(), hitsModuleStart_.get() + nMaxModules_ + 1, hmsp.get());
  }

  iEvent.emplace(hostPutToken_, std::move(hmsp));
  iEvent.emplace(hitsPutTokenCPU_, store32_.get(), store16_.get(), hitsModuleStart_.get(), nHits_);
}

// Declare SiPixelRecHitSoAFromCUDA to the framework through the DEFINE_FWK_MODULE macro.
DEFINE_FWK_MODULE(SiPixelRecHitSoAFromCUDA);
31 changes: 23 additions & 8 deletions RecoLocalTracker/SiPixelRecHits/python/SiPixelRecHits_cfi.py
Original file line number Diff line number Diff line change
Expand Up @@ -23,15 +23,18 @@

# convert the pixel rechits from legacy to SoA format
from RecoLocalTracker.SiPixelRecHits.siPixelRecHitSoAFromLegacy_cfi import siPixelRecHitSoAFromLegacy as _siPixelRecHitsPreSplittingSoA
siPixelRecHitsPreSplittingSoA = _siPixelRecHitsPreSplittingSoA.clone(convertToLegacy=True)
from RecoLocalTracker.SiPixelRecHits.siPixelRecHitSoAFromCUDA_cfi import siPixelRecHitSoAFromCUDA as _siPixelRecHitSoAFromCUDA

siPixelRecHitsPreSplittingCPU = _siPixelRecHitsPreSplittingSoA.clone(convertToLegacy=True)

# phase 2 tracker modifier
from Configuration.Eras.Modifier_phase2_tracker_cff import phase2_tracker
phase2_tracker.toModify(siPixelRecHitsPreSplittingSoA,
phase2_tracker.toModify(siPixelRecHitsPreSplittingCPU,
isPhase2 = True)
# modifier used to prompt patatrack pixel tracks reconstruction on cpu
from Configuration.ProcessModifiers.pixelNtupletFit_cff import pixelNtupletFit
pixelNtupletFit.toModify(siPixelRecHitsPreSplitting,
cpu = siPixelRecHitsPreSplittingSoA.clone()
cpu = _siPixelRecHitsPreSplittingSoA.clone(convertToLegacy=True)
)

siPixelRecHitsPreSplittingTask = cms.Task(
Expand All @@ -48,20 +51,32 @@
# transfer the pixel rechits to the host and convert them from SoA
from RecoLocalTracker.SiPixelRecHits.siPixelRecHitFromCUDA_cfi import siPixelRecHitFromCUDA as _siPixelRecHitFromCUDA

#this is an alias for the SoA on GPU or CPU to be used for DQM
siPixelRecHitsPreSplittingSoA = SwitchProducerCUDA(
cpu = cms.EDAlias(
siPixelRecHitsPreSplittingCPU = cms.VPSet(
cms.PSet(type = cms.string("cmscudacompatCPUTraitsTrackingRecHit2DHeterogeneous")),
cms.PSet(type = cms.string("uintAsHostProduct"))
)),
cuda = _siPixelRecHitSoAFromCUDA.clone()
)

(gpu & pixelNtupletFit).toModify(siPixelRecHitsPreSplitting,
cpu = cms.EDAlias(
siPixelRecHitsPreSplittingSoA = cms.VPSet(
siPixelRecHitsPreSplittingCPU = cms.VPSet(
cms.PSet(type = cms.string("SiPixelRecHitedmNewDetSetVector")),
cms.PSet(type = cms.string("uintAsHostProduct"))
)
),
cuda = _siPixelRecHitFromCUDA.clone())

(gpu & pixelNtupletFit).toReplaceWith(siPixelRecHitsPreSplittingTask, cms.Task(
# reconstruct the pixel rechits on the gpu
# reconstruct the pixel rechits on the gpu or on the cpu
# (normally only one of the two is run because only one is consumed from later stages)
siPixelRecHitsPreSplittingCUDA,
# producing and converting on cpu
siPixelRecHitsPreSplittingSoA,
siPixelRecHitsPreSplittingCPU,
# SwitchProducer wrapping an EDAlias on cpu or the converter from SoA to legacy on gpu
siPixelRecHitsPreSplittingTask.copy()
siPixelRecHitsPreSplittingTask.copy(),
# producing and converting on cpu (if needed)
siPixelRecHitsPreSplittingSoA
))
Original file line number Diff line number Diff line change
Expand Up @@ -53,6 +53,13 @@
)
)

## GPU vs CPU validation
# force CPU vertexing to use track SoA from CPU chain and not the converted one from GPU chain
from Configuration.ProcessModifiers.gpuValidationPixel_cff import gpuValidationPixel
(pixelNtupletFit & gpu & gpuValidationPixel).toModify(pixelVerticesSoA.cpu,
pixelTrackSrc = "pixelTracksSoA@cpu"
)

(pixelNtupletFit & gpu).toReplaceWith(pixelVerticesTask, cms.Task(
# build pixel vertices in SoA format on the GPU
pixelVerticesCUDA,
Expand Down
Loading