diff --git a/CUDADataFormats/Vertex/BuildFile.xml b/CUDADataFormats/Vertex/BuildFile.xml new file mode 100644 index 0000000000000..f61e4aff7403f --- /dev/null +++ b/CUDADataFormats/Vertex/BuildFile.xml @@ -0,0 +1,9 @@ + + + + + + + + + diff --git a/CUDADataFormats/Vertex/interface/ZVertexHeterogeneous.h b/CUDADataFormats/Vertex/interface/ZVertexHeterogeneous.h new file mode 100644 index 0000000000000..aacfddc6fe7e2 --- /dev/null +++ b/CUDADataFormats/Vertex/interface/ZVertexHeterogeneous.h @@ -0,0 +1,14 @@ +#ifndef CUDADataFormatsVertexZVertexHeterogeneous_H +#define CUDADataFormatsVertexZVertexHeterogeneous_H + +#include "CUDADataFormats/Vertex/interface/ZVertexSoA.h" +#include "CUDADataFormats/Common/interface/HeterogeneousSoA.h" +#include "CUDADataFormats/Track/interface/PixelTrackHeterogeneous.h" + +using ZVertexHeterogeneous = HeterogeneousSoA; +#ifndef __CUDACC__ +#include "CUDADataFormats/Common/interface/Product.h" +using ZVertexCUDAProduct = cms::cuda::Product; +#endif + +#endif diff --git a/CUDADataFormats/Vertex/interface/ZVertexSoA.h b/CUDADataFormats/Vertex/interface/ZVertexSoA.h new file mode 100644 index 0000000000000..e31b87f30fa11 --- /dev/null +++ b/CUDADataFormats/Vertex/interface/ZVertexSoA.h @@ -0,0 +1,26 @@ +#ifndef CUDADataFormats_Vertex_ZVertexSoA_h +#define CUDADataFormats_Vertex_ZVertexSoA_h + +#include +#include "HeterogeneousCore/CUDAUtilities/interface/cudaCompat.h" + +// SOA for vertices +// These vertices are clusterized and fitted only along the beam line (z) +// to obtain their global coordinate the beam spot position shall be added (eventually correcting for the beam angle as well) +struct ZVertexSoA { + static constexpr uint32_t MAXTRACKS = 32 * 1024; + static constexpr uint32_t MAXVTX = 1024; + + int16_t idv[MAXTRACKS]; // vertex index for each associated (original) track (-1 == not associate) + float zv[MAXVTX]; // output z-posistion of found vertices + float wv[MAXVTX]; // output weight (1/error^2) on the above + float chi2[MAXVTX]; // vertices chi2 + float ptv2[MAXVTX]; // vertices pt^2 + int32_t ndof[MAXTRACKS]; // vertices number of dof (reused as workspace for the number of nearest neighbours FIXME) + uint16_t sortInd[MAXVTX]; // sorted index (by pt2) ascending + uint32_t nvFinal; // the number of vertices + + __host__ __device__ void init() { nvFinal = 0; } +}; + +#endif // CUDADataFormats_Vertex_ZVertexSoA_h diff --git a/CUDADataFormats/Vertex/src/classes.h b/CUDADataFormats/Vertex/src/classes.h new file mode 100644 index 0000000000000..7931beaa8f4bd --- /dev/null +++ b/CUDADataFormats/Vertex/src/classes.h @@ -0,0 +1,8 @@ +#ifndef CUDADataFormats_Vertex_src_classes_h +#define CUDADataFormats_Vertex_src_classes_h + +#include "CUDADataFormats/Vertex/interface/ZVertexHeterogeneous.h" +#include "CUDADataFormats/Common/interface/Product.h" +#include "DataFormats/Common/interface/Wrapper.h" + +#endif // CUDADataFormats_Vertex_src_classes_h diff --git a/CUDADataFormats/Vertex/src/classes_def.xml b/CUDADataFormats/Vertex/src/classes_def.xml new file mode 100644 index 0000000000000..ea633080af9af --- /dev/null +++ b/CUDADataFormats/Vertex/src/classes_def.xml @@ -0,0 +1,6 @@ + + + + + + diff --git a/DQM/TrackingMonitorClient/python/pixelVertexResolutionClient_cfi.py b/DQM/TrackingMonitorClient/python/pixelVertexResolutionClient_cfi.py new file mode 100644 index 0000000000000..2558e88d26012 --- /dev/null +++ b/DQM/TrackingMonitorClient/python/pixelVertexResolutionClient_cfi.py @@ -0,0 +1,7 @@ +import FWCore.ParameterSet.Config as cms + +from 
DQM.TrackingMonitorClient.primaryVertexResolutionClient_cfi import primaryVertexResolutionClient as _primaryVertexResolutionClient + +pixelVertexResolutionClient = _primaryVertexResolutionClient.clone( + subDirs = ["OfflinePixelPV/Resolution/*"] +) diff --git a/DQMOffline/Configuration/python/DQMOffline_SecondStep_cff.py b/DQMOffline/Configuration/python/DQMOffline_SecondStep_cff.py index 368b328632fd8..29bf311c474d4 100644 --- a/DQMOffline/Configuration/python/DQMOffline_SecondStep_cff.py +++ b/DQMOffline/Configuration/python/DQMOffline_SecondStep_cff.py @@ -122,6 +122,7 @@ from DQM.CTPPS.ctppsDQM_cff import * from Validation.RecoTau.DQMSequences_cfi import * from DQM.TrackingMonitorClient.pixelTrackingEffFromHitPattern_cff import * +from DQM.TrackingMonitorClient.pixelVertexResolutionClient_cfi import * DQMHarvestTrackerStrip = cms.Sequence ( SiStripOfflineDQMClient ) @@ -179,7 +180,8 @@ DQMHarvestTrackingZeroBias = cms.Sequence( TrackingOfflineDQMClientZeroBias * dqmFastTimerServiceClient ) -DQMHarvestPixelTracking = cms.Sequence( pixelTrackingEffFromHitPattern ) +DQMHarvestPixelTracking = cms.Sequence( pixelTrackingEffFromHitPattern * + pixelVertexResolutionClient ) DQMHarvestOuterTracker = cms.Sequence( OuterTrackerClient * diff --git a/DQMOffline/Configuration/python/DQMOffline_cff.py b/DQMOffline/Configuration/python/DQMOffline_cff.py index 2001c22352a48..ac28700d4eaf4 100644 --- a/DQMOffline/Configuration/python/DQMOffline_cff.py +++ b/DQMOffline/Configuration/python/DQMOffline_cff.py @@ -157,10 +157,12 @@ #DQMOfflineCommon from DQM.TrackingMonitorSource.pixelTracksMonitoring_cff import * +from DQMOffline.RecoB.PixelVertexMonitor_cff import * from DQM.SiOuterTracker.OuterTrackerSourceConfig_cff import * from Validation.RecoTau.DQMSequences_cfi import * -DQMOfflinePixelTracking = cms.Sequence( pixelTracksMonitoring ) +DQMOfflinePixelTracking = cms.Sequence( pixelTracksMonitoring * + pixelPVMonitor ) DQMOuterTracker = cms.Sequence( DQMOfflineDCS * OuterTrackerSource * diff --git a/DQMOffline/RecoB/python/PixelVertexMonitor_cff.py b/DQMOffline/RecoB/python/PixelVertexMonitor_cff.py new file mode 100644 index 0000000000000..9e293f4478bd6 --- /dev/null +++ b/DQMOffline/RecoB/python/PixelVertexMonitor_cff.py @@ -0,0 +1,8 @@ +import FWCore.ParameterSet.Config as cms + +from DQMOffline.RecoB.PrimaryVertexMonitor_cff import pvMonitor as _pvMonitor +pixelPVMonitor = _pvMonitor.clone( + TopFolderName = "OfflinePixelPV", + vertexLabel = "pixelVertices", + ndof = cms.int32( 1 ) +) diff --git a/RecoPixelVertexing/Configuration/python/RecoPixelVertexing_cff.py b/RecoPixelVertexing/Configuration/python/RecoPixelVertexing_cff.py index 34ee6fadb04de..424ac13a43627 100644 --- a/RecoPixelVertexing/Configuration/python/RecoPixelVertexing_cff.py +++ b/RecoPixelVertexing/Configuration/python/RecoPixelVertexing_cff.py @@ -4,7 +4,21 @@ # # for STARTUP ONLY use try and use Offline 3D PV from pixelTracks, with adaptive vertex # -#from RecoPixelVertexing.PixelVertexFinding.PixelVertexes_cff import * -from RecoVertex.PrimaryVertexProducer.OfflinePixel3DPrimaryVertices_cfi import * +from RecoPixelVertexing.PixelVertexFinding.PixelVertexes_cff import * +#from RecoVertex.PrimaryVertexProducer.OfflinePixel3DPrimaryVertices_cfi import * recopixelvertexingTask = cms.Task(pixelTracksTask,pixelVertices) recopixelvertexing = cms.Sequence(recopixelvertexingTask) + +from Configuration.ProcessModifiers.gpu_cff import gpu + +from RecoPixelVertexing.PixelVertexFinding.pixelVertexCUDA_cfi import pixelVertexCUDA +from 
RecoPixelVertexing.PixelVertexFinding.pixelVertexSoA_cfi import pixelVertexSoA +from RecoPixelVertexing.PixelVertexFinding.pixelVertexFromSoA_cfi import pixelVertexFromSoA as _pixelVertexFromSoA + +_pixelVertexingCUDATask = cms.Task(pixelTracksTask,pixelVertexCUDA,pixelVertexSoA,pixelVertices) + +# pixelVertexSoAonCPU = pixelVertexCUDA.clone() +# pixelVertexSoAonCPU.onGPU = False; + +gpu.toReplaceWith(pixelVertices,_pixelVertexFromSoA) +gpu.toReplaceWith(recopixelvertexingTask,_pixelVertexingCUDATask) diff --git a/RecoPixelVertexing/Configuration/python/customizePixelTracksSoAonCPU.py b/RecoPixelVertexing/Configuration/python/customizePixelTracksSoAonCPU.py index 24cc16e02b463..1661cac832b8b 100644 --- a/RecoPixelVertexing/Configuration/python/customizePixelTracksSoAonCPU.py +++ b/RecoPixelVertexing/Configuration/python/customizePixelTracksSoAonCPU.py @@ -20,12 +20,21 @@ def customizePixelTracksSoAonCPU(process): pixelRecHitSrc = 'siPixelRecHitsPreSplitting' ) + from RecoPixelVertexing.PixelVertexFinding.pixelVertexCUDA_cfi import pixelVertexCUDA + process.pixelVertexSoA = pixelVertexCUDA.clone( + onGPU = False, + pixelTrackSrc = 'pixelTrackSoA' + ) + from RecoPixelVertexing.PixelTrackFitting.pixelTrackProducerFromSoA_cfi import pixelTrackProducerFromSoA process.pixelTracks = pixelTrackProducerFromSoA.clone( pixelRecHitLegacySrc = 'siPixelRecHitsPreSplitting' ) - process.reconstruction_step += process.siPixelRecHitsPreSplitting + process.pixelTrackSoA + from RecoPixelVertexing.PixelVertexFinding.pixelVertexFromSoA_cfi import pixelVertexFromSoA + process.pixelVertices = pixelVertexFromSoA.clone() + + process.reconstruction_step += process.siPixelRecHitsPreSplitting + process.pixelTrackSoA + process.pixelVertexSoA return process @@ -46,7 +55,7 @@ def customizePixelTracksSoAonCPUForProfiling(process): process.siPixelRecHitSoAFromLegacy.convertToLegacy = False - process.TkSoA = cms.Path(process.offlineBeamSpot + process.siPixelDigis + process.siPixelClustersPreSplitting + process.siPixelRecHitSoAFromLegacy + process.pixelTrackSoA) + process.TkSoA = cms.Path(process.offlineBeamSpot + process.siPixelDigis + process.siPixelClustersPreSplitting + process.siPixelRecHitSoAFromLegacy + process.pixelTrackSoA + process.pixelVertexSoA) process.schedule = cms.Schedule(process.TkSoA) diff --git a/RecoPixelVertexing/PixelTrackFitting/plugins/PixelTrackDumpCUDA.cc b/RecoPixelVertexing/PixelTrackFitting/plugins/PixelTrackDumpCUDA.cc new file mode 100644 index 0000000000000..2f0965be50eb8 --- /dev/null +++ b/RecoPixelVertexing/PixelTrackFitting/plugins/PixelTrackDumpCUDA.cc @@ -0,0 +1,86 @@ +#include + +#include "CUDADataFormats/Common/interface/Product.h" +#include "CUDADataFormats/Track/interface/PixelTrackHeterogeneous.h" +#include "CUDADataFormats/TrackingRecHit/interface/TrackingRecHit2DHeterogeneous.h" +#include "CUDADataFormats/Vertex/interface/ZVertexHeterogeneous.h" +#include "DataFormats/Common/interface/Handle.h" +#include "FWCore/Framework/interface/ConsumesCollector.h" +#include "FWCore/Framework/interface/ESHandle.h" +#include "FWCore/Framework/interface/Event.h" +#include "FWCore/Framework/interface/EventSetup.h" +#include "FWCore/Framework/interface/MakerMacros.h" +#include "FWCore/Framework/interface/global/EDAnalyzer.h" +#include "FWCore/ParameterSet/interface/ConfigurationDescriptions.h" +#include "FWCore/ParameterSet/interface/ParameterSet.h" +#include "FWCore/ParameterSet/interface/ParameterSetDescription.h" +#include "FWCore/PluginManager/interface/ModuleDef.h" +#include 
"FWCore/Utilities/interface/EDGetToken.h" +#include "FWCore/Utilities/interface/InputTag.h" +#include "FWCore/Utilities/interface/RunningAverage.h" +#include "HeterogeneousCore/CUDACore/interface/ScopedContext.h" +#include "RecoTracker/TkMSParametrization/interface/PixelRecoUtilities.h" + +class PixelTrackDumpCUDA : public edm::global::EDAnalyzer<> { +public: + explicit PixelTrackDumpCUDA(const edm::ParameterSet& iConfig); + ~PixelTrackDumpCUDA() override = default; + + static void fillDescriptions(edm::ConfigurationDescriptions& descriptions); + +private: + void analyze(edm::StreamID streamID, edm::Event const& iEvent, const edm::EventSetup& iSetup) const override; + const bool m_onGPU; + edm::EDGetTokenT> tokenGPUTrack_; + edm::EDGetTokenT> tokenGPUVertex_; + edm::EDGetTokenT tokenSoATrack_; + edm::EDGetTokenT tokenSoAVertex_; +}; + +PixelTrackDumpCUDA::PixelTrackDumpCUDA(const edm::ParameterSet& iConfig) + : m_onGPU(iConfig.getParameter("onGPU")) { + if (m_onGPU) { + tokenGPUTrack_ = + consumes>(iConfig.getParameter("pixelTrackSrc")); + tokenGPUVertex_ = + consumes>(iConfig.getParameter("pixelVertexSrc")); + } else { + tokenSoATrack_ = consumes(iConfig.getParameter("pixelTrackSrc")); + tokenSoAVertex_ = consumes(iConfig.getParameter("pixelVertexSrc")); + } +} + +void PixelTrackDumpCUDA::fillDescriptions(edm::ConfigurationDescriptions& descriptions) { + edm::ParameterSetDescription desc; + + desc.add("onGPU", true); + desc.add("pixelTrackSrc", edm::InputTag("caHitNtupletCUDA")); + desc.add("pixelVertexSrc", edm::InputTag("pixelVertexCUDA")); + descriptions.add("pixelTrackDumpCUDA", desc); +} + +void PixelTrackDumpCUDA::analyze(edm::StreamID streamID, + edm::Event const& iEvent, + const edm::EventSetup& iSetup) const { + if (m_onGPU) { + auto const& hTracks = iEvent.get(tokenGPUTrack_); + cms::cuda::ScopedContextProduce ctx{hTracks}; + + auto const& tracks = ctx.get(hTracks); + auto const* tsoa = tracks.get(); + assert(tsoa); + + auto const& vertices = ctx.get(iEvent.get(tokenGPUVertex_)); + auto const* vsoa = vertices.get(); + assert(vsoa); + + } else { + auto const* tsoa = iEvent.get(tokenSoATrack_).get(); + assert(tsoa); + + auto const* vsoa = iEvent.get(tokenSoAVertex_).get(); + assert(vsoa); + } +} + +DEFINE_FWK_MODULE(PixelTrackDumpCUDA); diff --git a/RecoPixelVertexing/PixelVertexFinding/plugins/BuildFile.xml b/RecoPixelVertexing/PixelVertexFinding/plugins/BuildFile.xml index 427799cb122b5..99b91b2587bcf 100644 --- a/RecoPixelVertexing/PixelVertexFinding/plugins/BuildFile.xml +++ b/RecoPixelVertexing/PixelVertexFinding/plugins/BuildFile.xml @@ -1,3 +1,4 @@ + @@ -15,10 +16,12 @@ + + - + diff --git a/RecoPixelVertexing/PixelVertexFinding/plugins/PixelVertexProducerCUDA.cc b/RecoPixelVertexing/PixelVertexFinding/plugins/PixelVertexProducerCUDA.cc new file mode 100644 index 0000000000000..e2c2bc76c8612 --- /dev/null +++ b/RecoPixelVertexing/PixelVertexFinding/plugins/PixelVertexProducerCUDA.cc @@ -0,0 +1,141 @@ +#include + +#include "CUDADataFormats/Common/interface/Product.h" +#include "DataFormats/Common/interface/Handle.h" +#include "FWCore/Framework/interface/ESHandle.h" +#include "FWCore/Framework/interface/Event.h" +#include "FWCore/Framework/interface/EventSetup.h" +#include "FWCore/Framework/interface/MakerMacros.h" +#include "FWCore/Framework/interface/global/EDProducer.h" +#include "FWCore/Framework/interface/ConsumesCollector.h" +#include "FWCore/ParameterSet/interface/ConfigurationDescriptions.h" +#include "FWCore/ParameterSet/interface/ParameterSet.h" +#include 
"FWCore/ParameterSet/interface/ParameterSetDescription.h" +#include "FWCore/Utilities/interface/InputTag.h" +#include "FWCore/PluginManager/interface/ModuleDef.h" +#include "FWCore/Utilities/interface/EDGetToken.h" +#include "FWCore/Utilities/interface/RunningAverage.h" +#include "HeterogeneousCore/CUDACore/interface/ScopedContext.h" + +#include "gpuVertexFinder.h" + +#undef PIXVERTEX_DEBUG_PRODUCE + +class PixelVertexProducerCUDA : public edm::global::EDProducer<> { +public: + explicit PixelVertexProducerCUDA(const edm::ParameterSet& iConfig); + ~PixelVertexProducerCUDA() override = default; + + static void fillDescriptions(edm::ConfigurationDescriptions& descriptions); + +private: + void produceOnGPU(edm::StreamID streamID, edm::Event& iEvent, const edm::EventSetup& iSetup) const; + void produceOnCPU(edm::StreamID streamID, edm::Event& iEvent, const edm::EventSetup& iSetup) const; + void produce(edm::StreamID streamID, edm::Event& iEvent, const edm::EventSetup& iSetup) const override; + + bool onGPU_; + + edm::EDGetTokenT> tokenGPUTrack_; + edm::EDPutTokenT tokenGPUVertex_; + edm::EDGetTokenT tokenCPUTrack_; + edm::EDPutTokenT tokenCPUVertex_; + + const gpuVertexFinder::Producer gpuAlgo_; + + // Tracking cuts before sending tracks to vertex algo + const float ptMin_; +}; + +PixelVertexProducerCUDA::PixelVertexProducerCUDA(const edm::ParameterSet& conf) + : onGPU_(conf.getParameter("onGPU")), + gpuAlgo_(conf.getParameter("oneKernel"), + conf.getParameter("useDensity"), + conf.getParameter("useDBSCAN"), + conf.getParameter("useIterative"), + conf.getParameter("minT"), + conf.getParameter("eps"), + conf.getParameter("errmax"), + conf.getParameter("chi2max")), + ptMin_(conf.getParameter("PtMin")) // 0.5 GeV +{ + if (onGPU_) { + tokenGPUTrack_ = + consumes>(conf.getParameter("pixelTrackSrc")); + tokenGPUVertex_ = produces(); + } else { + tokenCPUTrack_ = consumes(conf.getParameter("pixelTrackSrc")); + tokenCPUVertex_ = produces(); + } +} + +void PixelVertexProducerCUDA::fillDescriptions(edm::ConfigurationDescriptions& descriptions) { + edm::ParameterSetDescription desc; + + // Only one of these three algos can be used at once. 
+ // Maybe this should become a Plugin Factory + desc.add("onGPU", true); + desc.add("oneKernel", true); + desc.add("useDensity", true); + desc.add("useDBSCAN", false); + desc.add("useIterative", false); + + desc.add("minT", 2); // min number of neighbours to be "core" + desc.add("eps", 0.07); // max absolute distance to cluster + desc.add("errmax", 0.01); // max error to be "seed" + desc.add("chi2max", 9.); // max normalized distance to cluster + + desc.add("PtMin", 0.5); + desc.add("pixelTrackSrc", edm::InputTag("caHitNtupletCUDA")); + + auto label = "pixelVertexCUDA"; + descriptions.add(label, desc); +} + +void PixelVertexProducerCUDA::produceOnGPU(edm::StreamID streamID, + edm::Event& iEvent, + const edm::EventSetup& iSetup) const { + edm::Handle> hTracks; + iEvent.getByToken(tokenGPUTrack_, hTracks); + + cms::cuda::ScopedContextProduce ctx{*hTracks}; + auto const* tracks = ctx.get(*hTracks).get(); + + assert(tracks); + + ctx.emplace(iEvent, tokenGPUVertex_, gpuAlgo_.makeAsync(ctx.stream(), tracks, ptMin_)); +} + +void PixelVertexProducerCUDA::produceOnCPU(edm::StreamID streamID, + edm::Event& iEvent, + const edm::EventSetup& iSetup) const { + auto const* tracks = iEvent.get(tokenCPUTrack_).get(); + assert(tracks); + +#ifdef PIXVERTEX_DEBUG_PRODUCE + auto const& tsoa = *tracks; + auto maxTracks = tsoa.stride(); + std::cout << "size of SoA " << sizeof(tsoa) << " stride " << maxTracks << std::endl; + + int32_t nt = 0; + for (int32_t it = 0; it < maxTracks; ++it) { + auto nHits = tsoa.nHits(it); + assert(nHits == int(tsoa.hitIndices.size(it))); + if (nHits == 0) + break; // this is a guard: maybe we need to move to nTracks... + nt++; + } + std::cout << "found " << nt << " tracks in cpu SoA for Vertexing at " << tracks << std::endl; +#endif // PIXVERTEX_DEBUG_PRODUCE + + iEvent.emplace(tokenCPUVertex_, gpuAlgo_.make(tracks, ptMin_)); +} + +void PixelVertexProducerCUDA::produce(edm::StreamID streamID, edm::Event& iEvent, const edm::EventSetup& iSetup) const { + if (onGPU_) { + produceOnGPU(streamID, iEvent, iSetup); + } else { + produceOnCPU(streamID, iEvent, iSetup); + } +} + +DEFINE_FWK_MODULE(PixelVertexProducerCUDA); diff --git a/RecoPixelVertexing/PixelVertexFinding/plugins/PixelVertexProducerFromSoA.cc b/RecoPixelVertexing/PixelVertexFinding/plugins/PixelVertexProducerFromSoA.cc new file mode 100644 index 0000000000000..62b9bb46bd4a5 --- /dev/null +++ b/RecoPixelVertexing/PixelVertexFinding/plugins/PixelVertexProducerFromSoA.cc @@ -0,0 +1,175 @@ +#include "CUDADataFormats/Vertex/interface/ZVertexHeterogeneous.h" +#include "DataFormats/BeamSpot/interface/BeamSpot.h" +#include "DataFormats/Common/interface/OrphanHandle.h" +#include "DataFormats/TrackReco/interface/Track.h" +#include "DataFormats/TrackReco/interface/TrackExtra.h" +#include "DataFormats/TrackReco/interface/TrackFwd.h" +#include "DataFormats/VertexReco/interface/Vertex.h" +#include "DataFormats/VertexReco/interface/VertexFwd.h" +#include "FWCore/Framework/interface/ConsumesCollector.h" +#include "FWCore/Framework/interface/ESHandle.h" +#include "FWCore/Framework/interface/Event.h" +#include "FWCore/Framework/interface/EventSetup.h" +#include "FWCore/Framework/interface/MakerMacros.h" +#include "FWCore/Framework/interface/global/EDProducer.h" +#include "FWCore/ParameterSet/interface/ConfigurationDescriptions.h" +#include "FWCore/ParameterSet/interface/ParameterSet.h" +#include "FWCore/ParameterSet/interface/ParameterSetDescription.h" +#include "FWCore/PluginManager/interface/ModuleDef.h" +#include 
"FWCore/Utilities/interface/EDGetToken.h" +#include "FWCore/Utilities/interface/InputTag.h" +#include "Geometry/Records/interface/TrackerTopologyRcd.h" +#include "MagneticField/Records/interface/IdealMagneticFieldRecord.h" + +#undef PIXVERTEX_DEBUG_PRODUCE + +class PixelVertexProducerFromSoA : public edm::global::EDProducer<> { +public: + using IndToEdm = std::vector; + + explicit PixelVertexProducerFromSoA(const edm::ParameterSet &iConfig); + ~PixelVertexProducerFromSoA() override = default; + + static void fillDescriptions(edm::ConfigurationDescriptions &descriptions); + +private: + void produce(edm::StreamID streamID, edm::Event &iEvent, const edm::EventSetup &iSetup) const override; + + edm::EDGetTokenT tokenVertex_; + edm::EDGetTokenT tokenBeamSpot_; + edm::EDGetTokenT tokenTracks_; + edm::EDGetTokenT tokenIndToEdm_; +}; + +PixelVertexProducerFromSoA::PixelVertexProducerFromSoA(const edm::ParameterSet &conf) + : tokenVertex_(consumes(conf.getParameter("src"))), + tokenBeamSpot_(consumes(conf.getParameter("beamSpot"))), + tokenTracks_(consumes(conf.getParameter("TrackCollection"))), + tokenIndToEdm_(consumes(conf.getParameter("TrackCollection"))) { + produces(); +} + +void PixelVertexProducerFromSoA::fillDescriptions(edm::ConfigurationDescriptions &descriptions) { + edm::ParameterSetDescription desc; + + desc.add("TrackCollection", edm::InputTag("pixelTracks")); + desc.add("beamSpot", edm::InputTag("offlineBeamSpot")); + desc.add("src", edm::InputTag("pixelVertexSoA")); + + descriptions.add("pixelVertexFromSoA", desc); +} + +void PixelVertexProducerFromSoA::produce(edm::StreamID streamID, edm::Event &iEvent, const edm::EventSetup &) const { + auto vertexes = std::make_unique(); + + auto tracksHandle = iEvent.getHandle(tokenTracks_); + auto tracksSize = tracksHandle->size(); + auto const &indToEdm = iEvent.get(tokenIndToEdm_); + auto bsHandle = iEvent.getHandle(tokenBeamSpot_); + + float x0 = 0, y0 = 0, z0 = 0, dxdz = 0, dydz = 0; + std::vector itrk; + if (!bsHandle.isValid()) { + edm::LogWarning("PixelVertexProducer") << "No beamspot found. returning vertexes with (0,0,Z) "; + } else { + const reco::BeamSpot &bs = *bsHandle; + x0 = bs.x0(); + y0 = bs.y0(); + z0 = bs.z0(); + dxdz = bs.dxdz(); + dydz = bs.dydz(); + } + + auto const &soa = *(iEvent.get(tokenVertex_).get()); + + int nv = soa.nvFinal; + +#ifdef PIXVERTEX_DEBUG_PRODUCE + std::cout << "converting " << nv << " vertices " + << " from " << indToEdm.size() << " tracks" << std::endl; +#endif // PIXVERTEX_DEBUG_PRODUCE + + std::set uind; // fort verifing index consistency + for (int j = nv - 1; j >= 0; --j) { + auto i = soa.sortInd[j]; // on gpu sorted in ascending order.... + assert(i < nv); + uind.insert(i); + assert(itrk.empty()); + auto z = soa.zv[i]; + auto x = x0 + dxdz * z; + auto y = y0 + dydz * z; + z += z0; + reco::Vertex::Error err; + err(2, 2) = 1.f / soa.wv[i]; + err(2, 2) *= 2.; // artifically inflate error + //Copy also the tracks (no intention to be efficient....) + for (auto k = 0U; k < indToEdm.size(); ++k) { + if (soa.idv[k] == int16_t(i)) + itrk.push_back(k); + } + auto nt = itrk.size(); + if (nt == 0) { +#ifdef PIXVERTEX_DEBUG_PRODUCE + std::cout << "vertex " << i << " with no tracks..." 
<< std::endl; +#endif // PIXVERTEX_DEBUG_PRODUCE + continue; + } + if (nt < 2) { + itrk.clear(); + continue; + } // remove outliers + (*vertexes).emplace_back(reco::Vertex::Point(x, y, z), err, soa.chi2[i], soa.ndof[i], nt); + auto &v = (*vertexes).back(); + for (auto it : itrk) { + assert(it < int(indToEdm.size())); + auto k = indToEdm[it]; + if (k > tracksSize) { + edm::LogWarning("PixelVertexProducer") << "oops track " << it << " does not exists on CPU " << k; + continue; + } + auto tk = reco::TrackRef(tracksHandle, k); + v.add(reco::TrackBaseRef(tk)); + } + itrk.clear(); + } + + LogDebug("PixelVertexProducer") << ": Found " << vertexes->size() << " vertexes\n"; + for (unsigned int i = 0; i < vertexes->size(); ++i) { + LogDebug("PixelVertexProducer") << "Vertex number " << i << " has " << (*vertexes)[i].tracksSize() + << " tracks with a position of " << (*vertexes)[i].z() << " +- " + << std::sqrt((*vertexes)[i].covariance(2, 2)); + } + + // legacy logic.... + if (vertexes->empty() && bsHandle.isValid()) { + const reco::BeamSpot &bs = *bsHandle; + + GlobalError bse(bs.rotatedCovariance3D()); + if ((bse.cxx() <= 0.) || (bse.cyy() <= 0.) || (bse.czz() <= 0.)) { + AlgebraicSymMatrix33 we; + we(0, 0) = 10000; + we(1, 1) = 10000; + we(2, 2) = 10000; + vertexes->push_back(reco::Vertex(bs.position(), we, 0., 0., 0)); + + edm::LogInfo("PixelVertexProducer") << "No vertices found. Beamspot with invalid errors " << bse.matrix() + << "\nWill put Vertex derived from dummy-fake BeamSpot into Event.\n" + << (*vertexes)[0].x() << "\n" + << (*vertexes)[0].y() << "\n" + << (*vertexes)[0].z() << "\n"; + } else { + vertexes->push_back(reco::Vertex(bs.position(), bs.rotatedCovariance3D(), 0., 0., 0)); + + edm::LogInfo("PixelVertexProducer") << "No vertices found. Will put Vertex derived from BeamSpot into Event:\n" + << (*vertexes)[0].x() << "\n" + << (*vertexes)[0].y() << "\n" + << (*vertexes)[0].z() << "\n"; + } + } else if (vertexes->empty() && !bsHandle.isValid()) { + edm::LogWarning("PixelVertexProducer") << "No beamspot and no vertex found. 
No vertex returned."; + } + + iEvent.put(std::move(vertexes)); +} + +DEFINE_FWK_MODULE(PixelVertexProducerFromSoA); diff --git a/RecoPixelVertexing/PixelVertexFinding/plugins/PixelVertexSoAFromCUDA.cc b/RecoPixelVertexing/PixelVertexFinding/plugins/PixelVertexSoAFromCUDA.cc new file mode 100644 index 0000000000000..0cadf24580cf7 --- /dev/null +++ b/RecoPixelVertexing/PixelVertexFinding/plugins/PixelVertexSoAFromCUDA.cc @@ -0,0 +1,65 @@ +#include + +#include "CUDADataFormats/Common/interface/Product.h" +#include "CUDADataFormats/Common/interface/HostProduct.h" +#include "CUDADataFormats/Vertex/interface/ZVertexHeterogeneous.h" +#include "DataFormats/Common/interface/Handle.h" +#include "FWCore/Framework/interface/ESHandle.h" +#include "FWCore/Framework/interface/Event.h" +#include "FWCore/Framework/interface/EventSetup.h" +#include "FWCore/Framework/interface/MakerMacros.h" +#include "FWCore/Framework/interface/stream/EDProducer.h" +#include "FWCore/ParameterSet/interface/ConfigurationDescriptions.h" +#include "FWCore/ParameterSet/interface/ParameterSet.h" +#include "FWCore/ParameterSet/interface/ParameterSetDescription.h" +#include "FWCore/PluginManager/interface/ModuleDef.h" +#include "FWCore/Utilities/interface/EDGetToken.h" +#include "FWCore/Utilities/interface/InputTag.h" +#include "HeterogeneousCore/CUDACore/interface/ScopedContext.h" + +class PixelVertexSoAFromCUDA : public edm::stream::EDProducer { +public: + explicit PixelVertexSoAFromCUDA(const edm::ParameterSet& iConfig); + ~PixelVertexSoAFromCUDA() override = default; + + static void fillDescriptions(edm::ConfigurationDescriptions& descriptions); + +private: + void acquire(edm::Event const& iEvent, + edm::EventSetup const& iSetup, + edm::WaitingTaskWithArenaHolder waitingTaskHolder) override; + void produce(edm::Event& iEvent, edm::EventSetup const& iSetup) override; + + edm::EDGetTokenT> tokenCUDA_; + edm::EDPutTokenT tokenSOA_; + + cms::cuda::host::unique_ptr m_soa; +}; + +PixelVertexSoAFromCUDA::PixelVertexSoAFromCUDA(const edm::ParameterSet& iConfig) + : tokenCUDA_(consumes>(iConfig.getParameter("src"))), + tokenSOA_(produces()) {} + +void PixelVertexSoAFromCUDA::fillDescriptions(edm::ConfigurationDescriptions& descriptions) { + edm::ParameterSetDescription desc; + + desc.add("src", edm::InputTag("pixelVertexCUDA")); + descriptions.add("pixelVertexSoA", desc); +} + +void PixelVertexSoAFromCUDA::acquire(edm::Event const& iEvent, + edm::EventSetup const& iSetup, + edm::WaitingTaskWithArenaHolder waitingTaskHolder) { + auto const& inputDataWrapped = iEvent.get(tokenCUDA_); + cms::cuda::ScopedContextAcquire ctx{inputDataWrapped, std::move(waitingTaskHolder)}; + auto const& inputData = ctx.get(inputDataWrapped); + + m_soa = inputData.toHostAsync(ctx.stream()); +} + +void PixelVertexSoAFromCUDA::produce(edm::Event& iEvent, edm::EventSetup const& iSetup) { + // No copies.... 
+ iEvent.emplace(tokenSOA_, ZVertexHeterogeneous(std::move(m_soa))); +} + +DEFINE_FWK_MODULE(PixelVertexSoAFromCUDA); diff --git a/RecoPixelVertexing/PixelVertexFinding/plugins/gpuClusterTracksByDensity.h b/RecoPixelVertexing/PixelVertexFinding/plugins/gpuClusterTracksByDensity.h new file mode 100644 index 0000000000000..b19aeb5930fc6 --- /dev/null +++ b/RecoPixelVertexing/PixelVertexFinding/plugins/gpuClusterTracksByDensity.h @@ -0,0 +1,234 @@ +#ifndef RecoPixelVertexing_PixelVertexFinding_plugins_gpuClusterTracksByDensity_h +#define RecoPixelVertexing_PixelVertexFinding_plugins_gpuClusterTracksByDensity_h + +#include +#include +#include + +#include "HeterogeneousCore/CUDAUtilities/interface/HistoContainer.h" +#include "HeterogeneousCore/CUDAUtilities/interface/cuda_assert.h" + +#include "gpuVertexFinder.h" + +namespace gpuVertexFinder { + + // this algo does not really scale as it works in a single block... + // enough for <10K tracks we have + // + // based on Rodrighez&Laio algo + // + __device__ __forceinline__ void clusterTracksByDensity(gpuVertexFinder::ZVertices* pdata, + gpuVertexFinder::WorkSpace* pws, + int minT, // min number of neighbours to be "seed" + float eps, // max absolute distance to cluster + float errmax, // max error to be "seed" + float chi2max // max normalized distance to cluster + ) { + using namespace gpuVertexFinder; + constexpr bool verbose = false; // in principle the compiler should optmize out if false + + if (verbose && 0 == threadIdx.x) + printf("params %d %f %f %f\n", minT, eps, errmax, chi2max); + + auto er2mx = errmax * errmax; + + auto& __restrict__ data = *pdata; + auto& __restrict__ ws = *pws; + auto nt = ws.ntrks; + float const* __restrict__ zt = ws.zt; + float const* __restrict__ ezt2 = ws.ezt2; + + uint32_t& nvFinal = data.nvFinal; + uint32_t& nvIntermediate = ws.nvIntermediate; + + uint8_t* __restrict__ izt = ws.izt; + int32_t* __restrict__ nn = data.ndof; + int32_t* __restrict__ iv = ws.iv; + + assert(pdata); + assert(zt); + + using Hist = cms::cuda::HistoContainer; + __shared__ Hist hist; + __shared__ typename Hist::Counter hws[32]; + for (auto j = threadIdx.x; j < Hist::totbins(); j += blockDim.x) { + hist.off[j] = 0; + } + __syncthreads(); + + if (verbose && 0 == threadIdx.x) + printf("booked hist with %d bins, size %d for %d tracks\n", hist.nbins(), hist.capacity(), nt); + + assert(nt <= hist.capacity()); + + // fill hist (bin shall be wider than "eps") + for (auto i = threadIdx.x; i < nt; i += blockDim.x) { + assert(i < ZVertices::MAXTRACKS); + int iz = int(zt[i] * 10.); // valid if eps<=0.1 + // iz = std::clamp(iz, INT8_MIN, INT8_MAX); // sorry c++17 only + iz = std::min(std::max(iz, INT8_MIN), INT8_MAX); + izt[i] = iz - INT8_MIN; + assert(iz - INT8_MIN >= 0); + assert(iz - INT8_MIN < 256); + hist.count(izt[i]); + iv[i] = i; + nn[i] = 0; + } + __syncthreads(); + if (threadIdx.x < 32) + hws[threadIdx.x] = 0; // used by prefix scan... 
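+ // hist.finalize() turns the per-bin counts accumulated by count() into cumulative offsets (a prefix scan using hws), so the fill() loop below can place each track index into its z bin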
+ __syncthreads(); + hist.finalize(hws); + __syncthreads(); + assert(hist.size() == nt); + for (auto i = threadIdx.x; i < nt; i += blockDim.x) { + hist.fill(izt[i], uint16_t(i)); + } + __syncthreads(); + + // count neighbours + for (auto i = threadIdx.x; i < nt; i += blockDim.x) { + if (ezt2[i] > er2mx) + continue; + auto loop = [&](uint32_t j) { + if (i == j) + return; + auto dist = std::abs(zt[i] - zt[j]); + if (dist > eps) + return; + if (dist * dist > chi2max * (ezt2[i] + ezt2[j])) + return; + nn[i]++; + }; + + cms::cuda::forEachInBins(hist, izt[i], 1, loop); + } + + __syncthreads(); + + // find closest above me .... (we ignore the possibility of two j at same distance from i) + for (auto i = threadIdx.x; i < nt; i += blockDim.x) { + float mdist = eps; + auto loop = [&](uint32_t j) { + if (nn[j] < nn[i]) + return; + if (nn[j] == nn[i] && zt[j] >= zt[i]) + return; // if equal use natural order... + auto dist = std::abs(zt[i] - zt[j]); + if (dist > mdist) + return; + if (dist * dist > chi2max * (ezt2[i] + ezt2[j])) + return; // (break natural order???) + mdist = dist; + iv[i] = j; // assign to cluster (better be unique??) + }; + cms::cuda::forEachInBins(hist, izt[i], 1, loop); + } + + __syncthreads(); + +#ifdef GPU_DEBUG + // mini verification + for (auto i = threadIdx.x; i < nt; i += blockDim.x) { + if (iv[i] != int(i)) + assert(iv[iv[i]] != int(i)); + } + __syncthreads(); +#endif + + // consolidate graph (percolate index of seed) + for (auto i = threadIdx.x; i < nt; i += blockDim.x) { + auto m = iv[i]; + while (m != iv[m]) + m = iv[m]; + iv[i] = m; + } + +#ifdef GPU_DEBUG + __syncthreads(); + // mini verification + for (auto i = threadIdx.x; i < nt; i += blockDim.x) { + if (iv[i] != int(i)) + assert(iv[iv[i]] != int(i)); + } +#endif + +#ifdef GPU_DEBUG + // and verify that we did not spit any cluster... + for (auto i = threadIdx.x; i < nt; i += blockDim.x) { + auto minJ = i; + auto mdist = eps; + auto loop = [&](uint32_t j) { + if (nn[j] < nn[i]) + return; + if (nn[j] == nn[i] && zt[j] >= zt[i]) + return; // if equal use natural order... + auto dist = std::abs(zt[i] - zt[j]); + if (dist > mdist) + return; + if (dist * dist > chi2max * (ezt2[i] + ezt2[j])) + return; + mdist = dist; + minJ = j; + }; + cms::cuda::forEachInBins(hist, izt[i], 1, loop); + // should belong to the same cluster... + assert(iv[i] == iv[minJ]); + assert(nn[i] <= nn[iv[i]]); + } + __syncthreads(); +#endif + + __shared__ unsigned int foundClusters; + foundClusters = 0; + __syncthreads(); + + // find the number of different clusters, identified by a tracks with clus[i] == i and density larger than threshold; + // mark these tracks with a negative id. + for (auto i = threadIdx.x; i < nt; i += blockDim.x) { + if (iv[i] == int(i)) { + if (nn[i] >= minT) { + auto old = atomicInc(&foundClusters, 0xffffffff); + iv[i] = -(old + 1); + } else { // noise + iv[i] = -9998; + } + } + } + __syncthreads(); + + assert(foundClusters < ZVertices::MAXVTX); + + // propagate the negative id to all the tracks in the cluster. 
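+ // tracks whose iv is still non-negative point at their cluster seed; copy the seed's (negative) id so every track in a cluster carries the same id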
+ for (auto i = threadIdx.x; i < nt; i += blockDim.x) { + if (iv[i] >= 0) { + // mark each track in a cluster with the same id as the first one + iv[i] = iv[iv[i]]; + } + } + __syncthreads(); + + // adjust the cluster id to be a positive value starting from 0 + for (auto i = threadIdx.x; i < nt; i += blockDim.x) { + iv[i] = -iv[i] - 1; + } + + nvIntermediate = nvFinal = foundClusters; + + if (verbose && 0 == threadIdx.x) + printf("found %d proto vertices\n", foundClusters); + } + + __global__ void clusterTracksByDensityKernel(gpuVertexFinder::ZVertices* pdata, + gpuVertexFinder::WorkSpace* pws, + int minT, // min number of neighbours to be "seed" + float eps, // max absolute distance to cluster + float errmax, // max error to be "seed" + float chi2max // max normalized distance to cluster + ) { + clusterTracksByDensity(pdata, pws, minT, eps, errmax, chi2max); + } + +} // namespace gpuVertexFinder + +#endif // RecoPixelVertexing_PixelVertexFinding_plugins_gpuClusterTracksByDensity_h diff --git a/RecoPixelVertexing/PixelVertexFinding/plugins/gpuClusterTracksDBSCAN.h b/RecoPixelVertexing/PixelVertexFinding/plugins/gpuClusterTracksDBSCAN.h new file mode 100644 index 0000000000000..22ba1e15b4e05 --- /dev/null +++ b/RecoPixelVertexing/PixelVertexFinding/plugins/gpuClusterTracksDBSCAN.h @@ -0,0 +1,241 @@ +#ifndef RecoPixelVertexing_PixelVertexFinding_plugins_gpuClusterTracksDBSCAN_h +#define RecoPixelVertexing_PixelVertexFinding_plugins_gpuClusterTracksDBSCAN_h + +#include +#include +#include + +#include "HeterogeneousCore/CUDAUtilities/interface/HistoContainer.h" +#include "HeterogeneousCore/CUDAUtilities/interface/cuda_assert.h" + +#include "gpuVertexFinder.h" + +namespace gpuVertexFinder { + + // this algo does not really scale as it works in a single block... 
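+ // (same single-block structure as clusterTracksByDensity above, but clusters are formed following the DBSCAN core/edge rules)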
+ // enough for <10K tracks we have + __global__ void clusterTracksDBSCAN(ZVertices* pdata, + WorkSpace* pws, + int minT, // min number of neighbours to be "core" + float eps, // max absolute distance to cluster + float errmax, // max error to be "seed" + float chi2max // max normalized distance to cluster + ) { + constexpr bool verbose = false; // in principle the compiler should optmize out if false + + if (verbose && 0 == threadIdx.x) + printf("params %d %f %f %f\n", minT, eps, errmax, chi2max); + + auto er2mx = errmax * errmax; + + auto& __restrict__ data = *pdata; + auto& __restrict__ ws = *pws; + auto nt = ws.ntrks; + float const* __restrict__ zt = ws.zt; + float const* __restrict__ ezt2 = ws.ezt2; + + uint32_t& nvFinal = data.nvFinal; + uint32_t& nvIntermediate = ws.nvIntermediate; + + uint8_t* __restrict__ izt = ws.izt; + int32_t* __restrict__ nn = data.ndof; + int32_t* __restrict__ iv = ws.iv; + + assert(pdata); + assert(zt); + + using Hist = cms::cuda::HistoContainer; + __shared__ Hist hist; + __shared__ typename Hist::Counter hws[32]; + for (auto j = threadIdx.x; j < Hist::totbins(); j += blockDim.x) { + hist.off[j] = 0; + } + __syncthreads(); + + if (verbose && 0 == threadIdx.x) + printf("booked hist with %d bins, size %d for %d tracks\n", hist.nbins(), hist.capacity(), nt); + + assert(nt <= hist.capacity()); + + // fill hist (bin shall be wider than "eps") + for (auto i = threadIdx.x; i < nt; i += blockDim.x) { + assert(i < ZVertices::MAXTRACKS); + int iz = int(zt[i] * 10.); // valid if eps<=0.1 + iz = std::clamp(iz, INT8_MIN, INT8_MAX); + izt[i] = iz - INT8_MIN; + assert(iz - INT8_MIN >= 0); + assert(iz - INT8_MIN < 256); + hist.count(izt[i]); + iv[i] = i; + nn[i] = 0; + } + __syncthreads(); + if (threadIdx.x < 32) + hws[threadIdx.x] = 0; // used by prefix scan... + __syncthreads(); + hist.finalize(hws); + __syncthreads(); + assert(hist.size() == nt); + for (auto i = threadIdx.x; i < nt; i += blockDim.x) { + hist.fill(izt[i], uint16_t(i)); + } + __syncthreads(); + + // count neighbours + for (auto i = threadIdx.x; i < nt; i += blockDim.x) { + if (ezt2[i] > er2mx) + continue; + auto loop = [&](uint32_t j) { + if (i == j) + return; + auto dist = std::abs(zt[i] - zt[j]); + if (dist > eps) + return; + // if (dist*dist>chi2max*(ezt2[i]+ezt2[j])) return; + nn[i]++; + }; + + cms::cuda::forEachInBins(hist, izt[i], 1, loop); + } + + __syncthreads(); + + // find NN with smaller z... + for (auto i = threadIdx.x; i < nt; i += blockDim.x) { + if (nn[i] < minT) + continue; // DBSCAN core rule + float mz = zt[i]; + auto loop = [&](uint32_t j) { + if (zt[j] >= mz) + return; + if (nn[j] < minT) + return; // DBSCAN core rule + auto dist = std::abs(zt[i] - zt[j]); + if (dist > eps) + return; + // if (dist*dist>chi2max*(ezt2[i]+ezt2[j])) return; + mz = zt[j]; + iv[i] = j; // assign to cluster (better be unique??) 
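+ // keep the core neighbour with the smallest z within eps; forEachInBins below also visits the neighbouring z bins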
+ }; + cms::cuda::forEachInBins(hist, izt[i], 1, loop); + } + + __syncthreads(); + +#ifdef GPU_DEBUG + // mini verification + for (auto i = threadIdx.x; i < nt; i += blockDim.x) { + if (iv[i] != int(i)) + assert(iv[iv[i]] != int(i)); + } + __syncthreads(); +#endif + + // consolidate graph (percolate index of seed) + for (auto i = threadIdx.x; i < nt; i += blockDim.x) { + auto m = iv[i]; + while (m != iv[m]) + m = iv[m]; + iv[i] = m; + } + + __syncthreads(); + +#ifdef GPU_DEBUG + // mini verification + for (auto i = threadIdx.x; i < nt; i += blockDim.x) { + if (iv[i] != int(i)) + assert(iv[iv[i]] != int(i)); + } + __syncthreads(); +#endif + +#ifdef GPU_DEBUG + // and verify that we did not spit any cluster... + for (auto i = threadIdx.x; i < nt; i += blockDim.x) { + if (nn[i] < minT) + continue; // DBSCAN core rule + assert(zt[iv[i]] <= zt[i]); + auto loop = [&](uint32_t j) { + if (nn[j] < minT) + return; // DBSCAN core rule + auto dist = std::abs(zt[i] - zt[j]); + if (dist > eps) + return; + // if (dist*dist>chi2max*(ezt2[i]+ezt2[j])) return; + // they should belong to the same cluster, isn't it? + if (iv[i] != iv[j]) { + printf("ERROR %d %d %f %f %d\n", i, iv[i], zt[i], zt[iv[i]], iv[iv[i]]); + printf(" %d %d %f %f %d\n", j, iv[j], zt[j], zt[iv[j]], iv[iv[j]]); + ; + } + assert(iv[i] == iv[j]); + }; + cms::cuda::forEachInBins(hist, izt[i], 1, loop); + } + __syncthreads(); +#endif + + // collect edges (assign to closest cluster of closest point??? here to closest point) + for (auto i = threadIdx.x; i < nt; i += blockDim.x) { + // if (nn[i]==0 || nn[i]>=minT) continue; // DBSCAN edge rule + if (nn[i] >= minT) + continue; // DBSCAN edge rule + float mdist = eps; + auto loop = [&](uint32_t j) { + if (nn[j] < minT) + return; // DBSCAN core rule + auto dist = std::abs(zt[i] - zt[j]); + if (dist > mdist) + return; + if (dist * dist > chi2max * (ezt2[i] + ezt2[j])) + return; // needed? + mdist = dist; + iv[i] = iv[j]; // assign to cluster (better be unique??) + }; + cms::cuda::forEachInBins(hist, izt[i], 1, loop); + } + + __shared__ unsigned int foundClusters; + foundClusters = 0; + __syncthreads(); + + // find the number of different clusters, identified by a tracks with clus[i] == i; + // mark these tracks with a negative id. + for (auto i = threadIdx.x; i < nt; i += blockDim.x) { + if (iv[i] == int(i)) { + if (nn[i] >= minT) { + auto old = atomicInc(&foundClusters, 0xffffffff); + iv[i] = -(old + 1); + } else { // noise + iv[i] = -9998; + } + } + } + __syncthreads(); + + assert(foundClusters < ZVertices::MAXVTX); + + // propagate the negative id to all the tracks in the cluster. 
+ for (auto i = threadIdx.x; i < nt; i += blockDim.x) { + if (iv[i] >= 0) { + // mark each track in a cluster with the same id as the first one + iv[i] = iv[iv[i]]; + } + } + __syncthreads(); + + // adjust the cluster id to be a positive value starting from 0 + for (auto i = threadIdx.x; i < nt; i += blockDim.x) { + iv[i] = -iv[i] - 1; + } + + nvIntermediate = nvFinal = foundClusters; + + if (verbose && 0 == threadIdx.x) + printf("found %d proto vertices\n", foundClusters); + } + +} // namespace gpuVertexFinder + +#endif // RecoPixelVertexing_PixelVertexFinding_plugins_gpuClusterTracksDBSCAN_h diff --git a/RecoPixelVertexing/PixelVertexFinding/plugins/gpuClusterTracksIterative.h b/RecoPixelVertexing/PixelVertexFinding/plugins/gpuClusterTracksIterative.h new file mode 100644 index 0000000000000..1f2934ba15d0c --- /dev/null +++ b/RecoPixelVertexing/PixelVertexFinding/plugins/gpuClusterTracksIterative.h @@ -0,0 +1,212 @@ +#ifndef RecoPixelVertexing_PixelVertexFinding_plugins_gpuClusterTracksIterative_h +#define RecoPixelVertexing_PixelVertexFinding_plugins_gpuClusterTracksIterative_h + +#include +#include +#include + +#include "HeterogeneousCore/CUDAUtilities/interface/HistoContainer.h" +#include "HeterogeneousCore/CUDAUtilities/interface/cuda_assert.h" + +#include "gpuVertexFinder.h" + +namespace gpuVertexFinder { + + // this algo does not really scale as it works in a single block... + // enough for <10K tracks we have + __global__ void clusterTracksIterative(ZVertices* pdata, + WorkSpace* pws, + int minT, // min number of neighbours to be "core" + float eps, // max absolute distance to cluster + float errmax, // max error to be "seed" + float chi2max // max normalized distance to cluster + ) { + constexpr bool verbose = false; // in principle the compiler should optmize out if false + + if (verbose && 0 == threadIdx.x) + printf("params %d %f %f %f\n", minT, eps, errmax, chi2max); + + auto er2mx = errmax * errmax; + + auto& __restrict__ data = *pdata; + auto& __restrict__ ws = *pws; + auto nt = ws.ntrks; + float const* __restrict__ zt = ws.zt; + float const* __restrict__ ezt2 = ws.ezt2; + + uint32_t& nvFinal = data.nvFinal; + uint32_t& nvIntermediate = ws.nvIntermediate; + + uint8_t* __restrict__ izt = ws.izt; + int32_t* __restrict__ nn = data.ndof; + int32_t* __restrict__ iv = ws.iv; + + assert(pdata); + assert(zt); + + using Hist = cms::cuda::HistoContainer; + __shared__ Hist hist; + __shared__ typename Hist::Counter hws[32]; + for (auto j = threadIdx.x; j < Hist::totbins(); j += blockDim.x) { + hist.off[j] = 0; + } + __syncthreads(); + + if (verbose && 0 == threadIdx.x) + printf("booked hist with %d bins, size %d for %d tracks\n", hist.nbins(), hist.capacity(), nt); + + assert(nt <= hist.capacity()); + + // fill hist (bin shall be wider than "eps") + for (auto i = threadIdx.x; i < nt; i += blockDim.x) { + assert(i < ZVertices::MAXTRACKS); + int iz = int(zt[i] * 10.); // valid if eps<=0.1 + iz = std::clamp(iz, INT8_MIN, INT8_MAX); + izt[i] = iz - INT8_MIN; + assert(iz - INT8_MIN >= 0); + assert(iz - INT8_MIN < 256); + hist.count(izt[i]); + iv[i] = i; + nn[i] = 0; + } + __syncthreads(); + if (threadIdx.x < 32) + hws[threadIdx.x] = 0; // used by prefix scan... 
+ __syncthreads(); + hist.finalize(hws); + __syncthreads(); + assert(hist.size() == nt); + for (auto i = threadIdx.x; i < nt; i += blockDim.x) { + hist.fill(izt[i], uint16_t(i)); + } + __syncthreads(); + + // count neighbours + for (auto i = threadIdx.x; i < nt; i += blockDim.x) { + if (ezt2[i] > er2mx) + continue; + auto loop = [&](uint32_t j) { + if (i == j) + return; + auto dist = std::abs(zt[i] - zt[j]); + if (dist > eps) + return; + if (dist * dist > chi2max * (ezt2[i] + ezt2[j])) + return; + nn[i]++; + }; + + cms::cuda::forEachInBins(hist, izt[i], 1, loop); + } + + __shared__ int nloops; + nloops = 0; + + __syncthreads(); + + // cluster seeds only + bool more = true; + while (__syncthreads_or(more)) { + if (1 == nloops % 2) { + for (auto i = threadIdx.x; i < nt; i += blockDim.x) { + auto m = iv[i]; + while (m != iv[m]) + m = iv[m]; + iv[i] = m; + } + } else { + more = false; + for (auto k = threadIdx.x; k < hist.size(); k += blockDim.x) { + auto p = hist.begin() + k; + auto i = (*p); + auto be = std::min(Hist::bin(izt[i]) + 1, int(hist.nbins() - 1)); + if (nn[i] < minT) + continue; // DBSCAN core rule + auto loop = [&](uint32_t j) { + assert(i != j); + if (nn[j] < minT) + return; // DBSCAN core rule + auto dist = std::abs(zt[i] - zt[j]); + if (dist > eps) + return; + if (dist * dist > chi2max * (ezt2[i] + ezt2[j])) + return; + auto old = atomicMin(&iv[j], iv[i]); + if (old != iv[i]) { + // end the loop only if no changes were applied + more = true; + } + atomicMin(&iv[i], old); + }; + ++p; + for (; p < hist.end(be); ++p) + loop(*p); + } // for i + } + if (threadIdx.x == 0) + ++nloops; + } // while + + // collect edges (assign to closest cluster of closest point??? here to closest point) + for (auto i = threadIdx.x; i < nt; i += blockDim.x) { + // if (nn[i]==0 || nn[i]>=minT) continue; // DBSCAN edge rule + if (nn[i] >= minT) + continue; // DBSCAN edge rule + float mdist = eps; + auto loop = [&](int j) { + if (nn[j] < minT) + return; // DBSCAN core rule + auto dist = std::abs(zt[i] - zt[j]); + if (dist > mdist) + return; + if (dist * dist > chi2max * (ezt2[i] + ezt2[j])) + return; // needed? + mdist = dist; + iv[i] = iv[j]; // assign to cluster (better be unique??) + }; + cms::cuda::forEachInBins(hist, izt[i], 1, loop); + } + + __shared__ unsigned int foundClusters; + foundClusters = 0; + __syncthreads(); + + // find the number of different clusters, identified by a tracks with clus[i] == i; + // mark these tracks with a negative id. + for (auto i = threadIdx.x; i < nt; i += blockDim.x) { + if (iv[i] == int(i)) { + if (nn[i] >= minT) { + auto old = atomicInc(&foundClusters, 0xffffffff); + iv[i] = -(old + 1); + } else { // noise + iv[i] = -9998; + } + } + } + __syncthreads(); + + assert(foundClusters < ZVertices::MAXVTX); + + // propagate the negative id to all the tracks in the cluster. 
+ for (auto i = threadIdx.x; i < nt; i += blockDim.x) { + if (iv[i] >= 0) { + // mark each track in a cluster with the same id as the first one + iv[i] = iv[iv[i]]; + } + } + __syncthreads(); + + // adjust the cluster id to be a positive value starting from 0 + for (auto i = threadIdx.x; i < nt; i += blockDim.x) { + iv[i] = -iv[i] - 1; + } + + nvIntermediate = nvFinal = foundClusters; + + if (verbose && 0 == threadIdx.x) + printf("found %d proto vertices\n", foundClusters); + } + +} // namespace gpuVertexFinder + +#endif // RecoPixelVertexing_PixelVertexFinding_plugins_gpuClusterTracksIterative_h diff --git a/RecoPixelVertexing/PixelVertexFinding/plugins/gpuFitVertices.h b/RecoPixelVertexing/PixelVertexFinding/plugins/gpuFitVertices.h new file mode 100644 index 0000000000000..b8bbd0f601cb6 --- /dev/null +++ b/RecoPixelVertexing/PixelVertexFinding/plugins/gpuFitVertices.h @@ -0,0 +1,113 @@ +#ifndef RecoPixelVertexing_PixelVertexFinding_plugins_gpuFitVertices_h +#define RecoPixelVertexing_PixelVertexFinding_plugins_gpuFitVertices_h + +#include +#include +#include + +#include "HeterogeneousCore/CUDAUtilities/interface/HistoContainer.h" +#include "HeterogeneousCore/CUDAUtilities/interface/cuda_assert.h" + +#include "gpuVertexFinder.h" + +namespace gpuVertexFinder { + + __device__ __forceinline__ void fitVertices(ZVertices* pdata, + WorkSpace* pws, + float chi2Max // for outlier rejection + ) { + constexpr bool verbose = false; // in principle the compiler should optmize out if false + + auto& __restrict__ data = *pdata; + auto& __restrict__ ws = *pws; + auto nt = ws.ntrks; + float const* __restrict__ zt = ws.zt; + float const* __restrict__ ezt2 = ws.ezt2; + float* __restrict__ zv = data.zv; + float* __restrict__ wv = data.wv; + float* __restrict__ chi2 = data.chi2; + uint32_t& nvFinal = data.nvFinal; + uint32_t& nvIntermediate = ws.nvIntermediate; + + int32_t* __restrict__ nn = data.ndof; + int32_t* __restrict__ iv = ws.iv; + + assert(pdata); + assert(zt); + + assert(nvFinal <= nvIntermediate); + nvFinal = nvIntermediate; + auto foundClusters = nvFinal; + + // zero + for (auto i = threadIdx.x; i < foundClusters; i += blockDim.x) { + zv[i] = 0; + wv[i] = 0; + chi2[i] = 0; + } + + // only for test + __shared__ int noise; + if (verbose && 0 == threadIdx.x) + noise = 0; + + __syncthreads(); + + // compute cluster location + for (auto i = threadIdx.x; i < nt; i += blockDim.x) { + if (iv[i] > 9990) { + if (verbose) + atomicAdd(&noise, 1); + continue; + } + assert(iv[i] >= 0); + assert(iv[i] < int(foundClusters)); + auto w = 1.f / ezt2[i]; + atomicAdd(&zv[iv[i]], zt[i] * w); + atomicAdd(&wv[iv[i]], w); + } + + __syncthreads(); + // reuse nn + for (auto i = threadIdx.x; i < foundClusters; i += blockDim.x) { + assert(wv[i] > 0.f); + zv[i] /= wv[i]; + nn[i] = -1; // ndof + } + __syncthreads(); + + // compute chi2 + for (auto i = threadIdx.x; i < nt; i += blockDim.x) { + if (iv[i] > 9990) + continue; + + auto c2 = zv[iv[i]] - zt[i]; + c2 *= c2 / ezt2[i]; + if (c2 > chi2Max) { + iv[i] = 9999; + continue; + } + atomicAdd(&chi2[iv[i]], c2); + atomicAdd(&nn[iv[i]], 1); + } + __syncthreads(); + for (auto i = threadIdx.x; i < foundClusters; i += blockDim.x) + if (nn[i] > 0) + wv[i] *= float(nn[i]) / chi2[i]; + + if (verbose && 0 == threadIdx.x) + printf("found %d proto clusters ", foundClusters); + if (verbose && 0 == threadIdx.x) + printf("and %d noise\n", noise); + } + + __global__ void fitVerticesKernel(ZVertices* pdata, + WorkSpace* pws, + float chi2Max // for outlier rejection + ) { + fitVertices(pdata, 
pws, chi2Max); + } + +} // namespace gpuVertexFinder + +#endif // RecoPixelVertexing_PixelVertexFinding_plugins_gpuFitVertices_h diff --git a/RecoPixelVertexing/PixelVertexFinding/plugins/gpuSortByPt2.h b/RecoPixelVertexing/PixelVertexFinding/plugins/gpuSortByPt2.h new file mode 100644 index 0000000000000..841eab3901965 --- /dev/null +++ b/RecoPixelVertexing/PixelVertexFinding/plugins/gpuSortByPt2.h @@ -0,0 +1,73 @@ +#ifndef RecoPixelVertexing_PixelVertexFinding_plugins_gpuSortByPt2_h +#define RecoPixelVertexing_PixelVertexFinding_plugins_gpuSortByPt2_h + +#include +#include +#include + +#include "HeterogeneousCore/CUDAUtilities/interface/HistoContainer.h" +#include "HeterogeneousCore/CUDAUtilities/interface/cuda_assert.h" +#ifdef __CUDA_ARCH__ +#include "HeterogeneousCore/CUDAUtilities/interface/radixSort.h" +#endif + +#include "gpuVertexFinder.h" + +namespace gpuVertexFinder { + + __device__ __forceinline__ void sortByPt2(ZVertices* pdata, WorkSpace* pws) { + auto& __restrict__ data = *pdata; + auto& __restrict__ ws = *pws; + auto nt = ws.ntrks; + float const* __restrict__ ptt2 = ws.ptt2; + uint32_t const& nvFinal = data.nvFinal; + + int32_t const* __restrict__ iv = ws.iv; + float* __restrict__ ptv2 = data.ptv2; + uint16_t* __restrict__ sortInd = data.sortInd; + + // if (threadIdx.x == 0) + // printf("sorting %d vertices\n",nvFinal); + + if (nvFinal < 1) + return; + + // fill indexing + for (auto i = threadIdx.x; i < nt; i += blockDim.x) { + data.idv[ws.itrk[i]] = iv[i]; + } + + // can be done asynchronoisly at the end of previous event + for (auto i = threadIdx.x; i < nvFinal; i += blockDim.x) { + ptv2[i] = 0; + } + __syncthreads(); + + for (auto i = threadIdx.x; i < nt; i += blockDim.x) { + if (iv[i] > 9990) + continue; + atomicAdd(&ptv2[iv[i]], ptt2[i]); + } + __syncthreads(); + + if (1 == nvFinal) { + if (threadIdx.x == 0) + sortInd[0] = 0; + return; + } +#ifdef __CUDA_ARCH__ + __shared__ uint16_t sws[1024]; + // sort using only 16 bits + radixSort(ptv2, sortInd, sws, nvFinal); +#else + for (uint16_t i = 0; i < nvFinal; ++i) + sortInd[i] = i; + std::sort(sortInd, sortInd + nvFinal, [&](auto i, auto j) { return ptv2[i] < ptv2[j]; }); +#endif + } + + __global__ void sortByPt2Kernel(ZVertices* pdata, WorkSpace* pws) { sortByPt2(pdata, pws); } + +} // namespace gpuVertexFinder + +#endif // RecoPixelVertexing_PixelVertexFinding_plugins_gpuSortByPt2_h diff --git a/RecoPixelVertexing/PixelVertexFinding/plugins/gpuSplitVertices.h b/RecoPixelVertexing/PixelVertexFinding/plugins/gpuSplitVertices.h new file mode 100644 index 0000000000000..0fe8bd882dcc5 --- /dev/null +++ b/RecoPixelVertexing/PixelVertexFinding/plugins/gpuSplitVertices.h @@ -0,0 +1,139 @@ +#ifndef RecoPixelVertexing_PixelVertexFinding_plugins_gpuSplitVertices_h +#define RecoPixelVertexing_PixelVertexFinding_plugins_gpuSplitVertices_h + +#include +#include +#include + +#include "HeterogeneousCore/CUDAUtilities/interface/HistoContainer.h" +#include "HeterogeneousCore/CUDAUtilities/interface/cuda_assert.h" + +#include "gpuVertexFinder.h" + +namespace gpuVertexFinder { + + __device__ __forceinline__ void splitVertices(ZVertices* pdata, WorkSpace* pws, float maxChi2) { + constexpr bool verbose = false; // in principle the compiler should optmize out if false + + auto& __restrict__ data = *pdata; + auto& __restrict__ ws = *pws; + auto nt = ws.ntrks; + float const* __restrict__ zt = ws.zt; + float const* __restrict__ ezt2 = ws.ezt2; + float* __restrict__ zv = data.zv; + float* __restrict__ wv = data.wv; + float const* __restrict__ chi2 
= data.chi2; + uint32_t& nvFinal = data.nvFinal; + + int32_t const* __restrict__ nn = data.ndof; + int32_t* __restrict__ iv = ws.iv; + + assert(pdata); + assert(zt); + + // one vertex per block + for (auto kv = blockIdx.x; kv < nvFinal; kv += gridDim.x) { + if (nn[kv] < 4) + continue; + if (chi2[kv] < maxChi2 * float(nn[kv])) + continue; + + constexpr int MAXTK = 512; + assert(nn[kv] < MAXTK); + if (nn[kv] >= MAXTK) + continue; // too bad FIXME + __shared__ uint32_t it[MAXTK]; // track index + __shared__ float zz[MAXTK]; // z pos + __shared__ uint8_t newV[MAXTK]; // 0 or 1 + __shared__ float ww[MAXTK]; // z weight + + __shared__ uint32_t nq; // number of track for this vertex + nq = 0; + __syncthreads(); + + // copy to local + for (auto k = threadIdx.x; k < nt; k += blockDim.x) { + if (iv[k] == int(kv)) { + auto old = atomicInc(&nq, MAXTK); + zz[old] = zt[k] - zv[kv]; + newV[old] = zz[old] < 0 ? 0 : 1; + ww[old] = 1.f / ezt2[k]; + it[old] = k; + } + } + + __shared__ float znew[2], wnew[2]; // the new vertices + + __syncthreads(); + assert(int(nq) == nn[kv] + 1); + + int maxiter = 20; + // kt-min.... + bool more = true; + while (__syncthreads_or(more)) { + more = false; + if (0 == threadIdx.x) { + znew[0] = 0; + znew[1] = 0; + wnew[0] = 0; + wnew[1] = 0; + } + __syncthreads(); + for (auto k = threadIdx.x; k < nq; k += blockDim.x) { + auto i = newV[k]; + atomicAdd(&znew[i], zz[k] * ww[k]); + atomicAdd(&wnew[i], ww[k]); + } + __syncthreads(); + if (0 == threadIdx.x) { + znew[0] /= wnew[0]; + znew[1] /= wnew[1]; + } + __syncthreads(); + for (auto k = threadIdx.x; k < nq; k += blockDim.x) { + auto d0 = fabs(zz[k] - znew[0]); + auto d1 = fabs(zz[k] - znew[1]); + auto newer = d0 < d1 ? 0 : 1; + more |= newer != newV[k]; + newV[k] = newer; + } + --maxiter; + if (maxiter <= 0) + more = false; + } + + // avoid empty vertices + if (0 == wnew[0] || 0 == wnew[1]) + continue; + + // quality cut + auto dist2 = (znew[0] - znew[1]) * (znew[0] - znew[1]); + + auto chi2Dist = dist2 / (1.f / wnew[0] + 1.f / wnew[1]); + + if (verbose && 0 == threadIdx.x) + printf("inter %d %f %f\n", 20 - maxiter, chi2Dist, dist2 * wv[kv]); + + if (chi2Dist < 4) + continue; + + // get a new global vertex + __shared__ uint32_t igv; + if (0 == threadIdx.x) + igv = atomicAdd(&ws.nvIntermediate, 1); + __syncthreads(); + for (auto k = threadIdx.x; k < nq; k += blockDim.x) { + if (1 == newV[k]) + iv[it[k]] = igv; + } + + } // loop on vertices + } + + __global__ void splitVerticesKernel(ZVertices* pdata, WorkSpace* pws, float maxChi2) { + splitVertices(pdata, pws, maxChi2); + } + +} // namespace gpuVertexFinder + +#endif // RecoPixelVertexing_PixelVertexFinding_plugins_gpuSplitVertices_h diff --git a/RecoPixelVertexing/PixelVertexFinding/plugins/gpuVertexFinder.cc b/RecoPixelVertexing/PixelVertexFinding/plugins/gpuVertexFinder.cc new file mode 100644 index 0000000000000..084763385bdb4 --- /dev/null +++ b/RecoPixelVertexing/PixelVertexFinding/plugins/gpuVertexFinder.cc @@ -0,0 +1 @@ +#include "gpuVertexFinderImpl.h" diff --git a/RecoPixelVertexing/PixelVertexFinding/plugins/gpuVertexFinder.cu b/RecoPixelVertexing/PixelVertexFinding/plugins/gpuVertexFinder.cu new file mode 100644 index 0000000000000..084763385bdb4 --- /dev/null +++ b/RecoPixelVertexing/PixelVertexFinding/plugins/gpuVertexFinder.cu @@ -0,0 +1 @@ +#include "gpuVertexFinderImpl.h" diff --git a/RecoPixelVertexing/PixelVertexFinding/plugins/gpuVertexFinder.h b/RecoPixelVertexing/PixelVertexFinding/plugins/gpuVertexFinder.h new file mode 100644 index 
0000000000000..5f8238c3ea8c8 --- /dev/null +++ b/RecoPixelVertexing/PixelVertexFinding/plugins/gpuVertexFinder.h @@ -0,0 +1,83 @@ +#ifndef RecoPixelVertexing_PixelVertexFinding_plugins_gpuVertexFinder_h +#define RecoPixelVertexing_PixelVertexFinding_plugins_gpuVertexFinder_h + +#include +#include + +#include "CUDADataFormats/Vertex/interface/ZVertexHeterogeneous.h" + +namespace gpuVertexFinder { + + using ZVertices = ZVertexSoA; + using TkSoA = pixelTrack::TrackSoA; + + // workspace used in the vertex reco algos + struct WorkSpace { + static constexpr uint32_t MAXTRACKS = ZVertexSoA::MAXTRACKS; + static constexpr uint32_t MAXVTX = ZVertexSoA::MAXVTX; + + uint32_t ntrks; // number of "selected tracks" + uint16_t itrk[MAXTRACKS]; // index of original track + float zt[MAXTRACKS]; // input track z at bs + float ezt2[MAXTRACKS]; // input error^2 on the above + float ptt2[MAXTRACKS]; // input pt^2 on the above + uint8_t izt[MAXTRACKS]; // interized z-position of input tracks + int32_t iv[MAXTRACKS]; // vertex index for each associated track + + uint32_t nvIntermediate; // the number of vertices after splitting pruning etc. + + __host__ __device__ void init() { + ntrks = 0; + nvIntermediate = 0; + } + }; + + __global__ void init(ZVertexSoA* pdata, WorkSpace* pws) { + pdata->init(); + pws->init(); + } + + class Producer { + public: + using ZVertices = ZVertexSoA; + using WorkSpace = gpuVertexFinder::WorkSpace; + using TkSoA = pixelTrack::TrackSoA; + + Producer(bool oneKernel, + bool useDensity, + bool useDBSCAN, + bool useIterative, + int iminT, // min number of neighbours to be "core" + float ieps, // max absolute distance to cluster + float ierrmax, // max error to be "seed" + float ichi2max // max normalized distance to cluster + ) + : oneKernel_(oneKernel && !(useDBSCAN || useIterative)), + useDensity_(useDensity), + useDBSCAN_(useDBSCAN), + useIterative_(useIterative), + minT(iminT), + eps(ieps), + errmax(ierrmax), + chi2max(ichi2max) {} + + ~Producer() = default; + + ZVertexHeterogeneous makeAsync(cudaStream_t stream, TkSoA const* tksoa, float ptMin) const; + ZVertexHeterogeneous make(TkSoA const* tksoa, float ptMin) const; + + private: + const bool oneKernel_; + const bool useDensity_; + const bool useDBSCAN_; + const bool useIterative_; + + int minT; // min number of neighbours to be "core" + float eps; // max absolute distance to cluster + float errmax; // max error to be "seed" + float chi2max; // max normalized distance to cluster + }; + +} // namespace gpuVertexFinder + +#endif // RecoPixelVertexing_PixelVertexFinding_plugins_gpuVertexFinder_h diff --git a/RecoPixelVertexing/PixelVertexFinding/plugins/gpuVertexFinderImpl.h b/RecoPixelVertexing/PixelVertexFinding/plugins/gpuVertexFinderImpl.h new file mode 100644 index 0000000000000..d685ced488233 --- /dev/null +++ b/RecoPixelVertexing/PixelVertexFinding/plugins/gpuVertexFinderImpl.h @@ -0,0 +1,192 @@ +#ifndef RecoPixelVertexing_PixelVertexFinding_plugins_gpuVertexFinderImpl_h +#define RecoPixelVertexing_PixelVertexFinding_plugins_gpuVertexFinderImpl_h + +#include "HeterogeneousCore/CUDAUtilities/interface/cudaCheck.h" + +#include "gpuClusterTracksByDensity.h" +#include "gpuClusterTracksDBSCAN.h" +#include "gpuClusterTracksIterative.h" +#include "gpuFitVertices.h" +#include "gpuSortByPt2.h" +#include "gpuSplitVertices.h" + +#undef PIXVERTEX_DEBUG_PRODUCE + +namespace gpuVertexFinder { + + // reject outlier tracks that contribute more than this to the chi2 of the vertex fit + constexpr float maxChi2ForFirstFit = 50.f; + constexpr float 
maxChi2ForFinalFit = 5000.f; + + // split vertices with a chi2/NDoF greater than this + constexpr float maxChi2ForSplit = 9.f; + + __global__ void loadTracks(TkSoA const* ptracks, ZVertexSoA* soa, WorkSpace* pws, float ptMin) { + assert(ptracks); + assert(soa); + auto const& tracks = *ptracks; + auto const& fit = tracks.stateAtBS; + auto const* quality = tracks.qualityData(); + + auto first = blockIdx.x * blockDim.x + threadIdx.x; + for (int idx = first, nt = TkSoA::stride(); idx < nt; idx += gridDim.x * blockDim.x) { + auto nHits = tracks.nHits(idx); + if (nHits == 0) + break; // this is a guard: maybe we need to move to nTracks... + + // initialize soa... + soa->idv[idx] = -1; + + if (nHits < 4) + continue; // no triplets + if (quality[idx] != pixelTrack::Quality::loose) + continue; + + auto pt = tracks.pt(idx); + + if (pt < ptMin) + continue; + + auto& data = *pws; + auto it = atomicAdd(&data.ntrks, 1); + data.itrk[it] = idx; + data.zt[it] = tracks.zip(idx); + data.ezt2[it] = fit.covariance(idx)(14); + data.ptt2[it] = pt * pt; + } + } + +// #define THREE_KERNELS +#ifndef THREE_KERNELS + __global__ void vertexFinderOneKernel(gpuVertexFinder::ZVertices* pdata, + gpuVertexFinder::WorkSpace* pws, + int minT, // min number of neighbours to be "seed" + float eps, // max absolute distance to cluster + float errmax, // max error to be "seed" + float chi2max // max normalized distance to cluster, + ) { + clusterTracksByDensity(pdata, pws, minT, eps, errmax, chi2max); + __syncthreads(); + fitVertices(pdata, pws, maxChi2ForFirstFit); + __syncthreads(); + splitVertices(pdata, pws, maxChi2ForSplit); + __syncthreads(); + fitVertices(pdata, pws, maxChi2ForFinalFit); + __syncthreads(); + sortByPt2(pdata, pws); + } +#else + __global__ void vertexFinderKernel1(gpuVertexFinder::ZVertices* pdata, + gpuVertexFinder::WorkSpace* pws, + int minT, // min number of neighbours to be "seed" + float eps, // max absolute distance to cluster + float errmax, // max error to be "seed" + float chi2max // max normalized distance to cluster, + ) { + clusterTracksByDensity(pdata, pws, minT, eps, errmax, chi2max); + __syncthreads(); + fitVertices(pdata, pws, maxChi2ForFirstFit); + } + + __global__ void vertexFinderKernel2(gpuVertexFinder::ZVertices* pdata, gpuVertexFinder::WorkSpace* pws) { + fitVertices(pdata, pws, maxChi2ForFinalFit); + __syncthreads(); + sortByPt2(pdata, pws); + } +#endif + +#ifdef __CUDACC__ + ZVertexHeterogeneous Producer::makeAsync(cudaStream_t stream, TkSoA const* tksoa, float ptMin) const { +#ifdef PIXVERTEX_DEBUG_PRODUCE + std::cout << "producing Vertices on GPU" << std::endl; +#endif // PIXVERTEX_DEBUG_PRODUCE + ZVertexHeterogeneous vertices(cms::cuda::make_device_unique<ZVertexSoA>(stream)); +#else + ZVertexHeterogeneous Producer::make(TkSoA const* tksoa, float ptMin) const { +#ifdef PIXVERTEX_DEBUG_PRODUCE + std::cout << "producing Vertices on CPU" << std::endl; +#endif // PIXVERTEX_DEBUG_PRODUCE + ZVertexHeterogeneous vertices(std::make_unique<ZVertexSoA>()); +#endif + assert(tksoa); + auto* soa = vertices.get(); + assert(soa); + +#ifdef __CUDACC__ + auto ws_d = cms::cuda::make_device_unique<WorkSpace>(stream); +#else + auto ws_d = std::make_unique<WorkSpace>(); +#endif + +#ifdef __CUDACC__ + init<<<1, 1, 0, stream>>>(soa, ws_d.get()); + auto blockSize = 128; + auto numberOfBlocks = (TkSoA::stride() + blockSize - 1) / blockSize; + loadTracks<<<numberOfBlocks, blockSize, 0, stream>>>(tksoa, soa, ws_d.get(), ptMin); + cudaCheck(cudaGetLastError()); +#else + init(soa, ws_d.get()); + loadTracks(tksoa, soa, ws_d.get(), ptMin); +#endif + +#ifdef __CUDACC__ + // Running too many
threads leads to problems when printf is enabled. + constexpr int maxThreadsForPrint = 1024 - 256; + constexpr int numBlocks = 1024; + constexpr int threadsPerBlock = 128; + + if (oneKernel_) { + // implemented only for density clusters +#ifndef THREE_KERNELS + vertexFinderOneKernel<<<1, maxThreadsForPrint, 0, stream>>>(soa, ws_d.get(), minT, eps, errmax, chi2max); +#else + vertexFinderKernel1<<<1, maxThreadsForPrint, 0, stream>>>(soa, ws_d.get(), minT, eps, errmax, chi2max); + cudaCheck(cudaGetLastError()); + // one block per vertex... + splitVerticesKernel<<<numBlocks, threadsPerBlock, 0, stream>>>(soa, ws_d.get(), maxChi2ForSplit); + cudaCheck(cudaGetLastError()); + vertexFinderKernel2<<<1, maxThreadsForPrint, 0, stream>>>(soa, ws_d.get()); +#endif + } else { // five kernels + if (useDensity_) { + clusterTracksByDensityKernel<<<1, maxThreadsForPrint, 0, stream>>>(soa, ws_d.get(), minT, eps, errmax, chi2max); + } else if (useDBSCAN_) { + clusterTracksDBSCAN<<<1, maxThreadsForPrint, 0, stream>>>(soa, ws_d.get(), minT, eps, errmax, chi2max); + } else if (useIterative_) { + clusterTracksIterative<<<1, maxThreadsForPrint, 0, stream>>>(soa, ws_d.get(), minT, eps, errmax, chi2max); + } + cudaCheck(cudaGetLastError()); + fitVerticesKernel<<<1, maxThreadsForPrint, 0, stream>>>(soa, ws_d.get(), maxChi2ForFirstFit); + cudaCheck(cudaGetLastError()); + // one block per vertex... + splitVerticesKernel<<<numBlocks, threadsPerBlock, 0, stream>>>(soa, ws_d.get(), maxChi2ForSplit); + cudaCheck(cudaGetLastError()); + fitVerticesKernel<<<1, maxThreadsForPrint, 0, stream>>>(soa, ws_d.get(), maxChi2ForFinalFit); + cudaCheck(cudaGetLastError()); + sortByPt2Kernel<<<1, maxThreadsForPrint, 0, stream>>>(soa, ws_d.get()); + } + cudaCheck(cudaGetLastError()); +#else // __CUDACC__ + if (useDensity_) { + clusterTracksByDensity(soa, ws_d.get(), minT, eps, errmax, chi2max); + } else if (useDBSCAN_) { + clusterTracksDBSCAN(soa, ws_d.get(), minT, eps, errmax, chi2max); + } else if (useIterative_) { + clusterTracksIterative(soa, ws_d.get(), minT, eps, errmax, chi2max); + } +#ifdef PIXVERTEX_DEBUG_PRODUCE + std::cout << "found " << (*ws_d).nvIntermediate << " vertices " << std::endl; +#endif // PIXVERTEX_DEBUG_PRODUCE + fitVertices(soa, ws_d.get(), maxChi2ForFirstFit); + // one block per vertex!
+ splitVertices(soa, ws_d.get(), maxChi2ForSplit); + fitVertices(soa, ws_d.get(), maxChi2ForFinalFit); + sortByPt2(soa, ws_d.get()); +#endif + + return vertices; + } + +} // namespace gpuVertexFinder + +#endif // RecoPixelVertexing_PixelVertexFinding_plugins_gpuVertexFinderImpl_h diff --git a/RecoPixelVertexing/PixelVertexFinding/python/PixelVertexes_cfi.py b/RecoPixelVertexing/PixelVertexFinding/python/PixelVertexes_cfi.py index 77a9f367b9d9b..903c2a894ff86 100644 --- a/RecoPixelVertexing/PixelVertexFinding/python/PixelVertexes_cfi.py +++ b/RecoPixelVertexing/PixelVertexFinding/python/PixelVertexes_cfi.py @@ -18,5 +18,3 @@ refToPSet_ = cms.string('pvClusterComparer') ) ) - - diff --git a/RecoPixelVertexing/PixelVertexFinding/test/BuildFile.xml b/RecoPixelVertexing/PixelVertexFinding/test/BuildFile.xml index 0f4f4dee63832..f5c154b298574 100644 --- a/RecoPixelVertexing/PixelVertexFinding/test/BuildFile.xml +++ b/RecoPixelVertexing/PixelVertexFinding/test/BuildFile.xml @@ -2,8 +2,41 @@ - + - + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/RecoPixelVertexing/PixelVertexFinding/test/VertexFinder_t.h b/RecoPixelVertexing/PixelVertexFinding/test/VertexFinder_t.h new file mode 100644 index 0000000000000..52253a1e4bbfe --- /dev/null +++ b/RecoPixelVertexing/PixelVertexFinding/test/VertexFinder_t.h @@ -0,0 +1,347 @@ +#include +#include +#include +#include +#include + +#include "HeterogeneousCore/CUDAUtilities/interface/cudaCheck.h" +#include "HeterogeneousCore/CUDAUtilities/interface/requireDevices.h" +#include "HeterogeneousCore/CUDAUtilities/interface/launch.h" +#ifdef USE_DBSCAN +#include "RecoPixelVertexing/PixelVertexFinding/plugins/gpuClusterTracksDBSCAN.h" +#define CLUSTERIZE gpuVertexFinder::clusterTracksDBSCAN +#elif USE_ITERATIVE +#include "RecoPixelVertexing/PixelVertexFinding/plugins/gpuClusterTracksIterative.h" +#define CLUSTERIZE gpuVertexFinder::clusterTracksIterative +#else +#include "RecoPixelVertexing/PixelVertexFinding/plugins/gpuClusterTracksByDensity.h" +#define CLUSTERIZE gpuVertexFinder::clusterTracksByDensityKernel +#endif +#include "RecoPixelVertexing/PixelVertexFinding/plugins/gpuFitVertices.h" +#include "RecoPixelVertexing/PixelVertexFinding/plugins/gpuSortByPt2.h" +#include "RecoPixelVertexing/PixelVertexFinding/plugins/gpuSplitVertices.h" + +#ifdef ONE_KERNEL +#ifdef __CUDACC__ +__global__ void vertexFinderOneKernel(gpuVertexFinder::ZVertices* pdata, + gpuVertexFinder::WorkSpace* pws, + int minT, // min number of neighbours to be "seed" + float eps, // max absolute distance to cluster + float errmax, // max error to be "seed" + float chi2max // max normalized distance to cluster, +) { + clusterTracksByDensity(pdata, pws, minT, eps, errmax, chi2max); + __syncthreads(); + fitVertices(pdata, pws, 50.); + __syncthreads(); + splitVertices(pdata, pws, 9.f); + __syncthreads(); + fitVertices(pdata, pws, 5000.); + __syncthreads(); + sortByPt2(pdata, pws); +} +#endif +#endif + +struct Event { + std::vector zvert; + std::vector itrack; + std::vector ztrack; + std::vector eztrack; + std::vector pttrack; + std::vector ivert; +}; + +struct ClusterGenerator { + explicit ClusterGenerator(float nvert, float ntrack) + : rgen(-13., 13), errgen(0.005, 0.025), clusGen(nvert), trackGen(ntrack), gauss(0., 1.), ptGen(1.) 
{} + + void operator()(Event& ev) { + int nclus = clusGen(reng); + ev.zvert.resize(nclus); + ev.itrack.resize(nclus); + for (auto& z : ev.zvert) { + z = 3.5f * gauss(reng); + } + + ev.ztrack.clear(); + ev.eztrack.clear(); + ev.ivert.clear(); + for (int iv = 0; iv < nclus; ++iv) { + auto nt = trackGen(reng); + ev.itrack[nclus] = nt; + for (int it = 0; it < nt; ++it) { + auto err = errgen(reng); // reality is not flat.... + ev.ztrack.push_back(ev.zvert[iv] + err * gauss(reng)); + ev.eztrack.push_back(err * err); + ev.ivert.push_back(iv); + ev.pttrack.push_back((iv == 5 ? 1.f : 0.5f) + ptGen(reng)); + ev.pttrack.back() *= ev.pttrack.back(); + } + } + // add noise + auto nt = 2 * trackGen(reng); + for (int it = 0; it < nt; ++it) { + auto err = 0.03f; + ev.ztrack.push_back(rgen(reng)); + ev.eztrack.push_back(err * err); + ev.ivert.push_back(9999); + ev.pttrack.push_back(0.5f + ptGen(reng)); + ev.pttrack.back() *= ev.pttrack.back(); + } + } + + std::mt19937 reng; + std::uniform_real_distribution rgen; + std::uniform_real_distribution errgen; + std::poisson_distribution clusGen; + std::poisson_distribution trackGen; + std::normal_distribution gauss; + std::exponential_distribution ptGen; +}; + +// a macro SORRY +#define LOC_ONGPU(M) ((char*)(onGPU_d.get()) + offsetof(gpuVertexFinder::ZVertices, M)) +#define LOC_WS(M) ((char*)(ws_d.get()) + offsetof(gpuVertexFinder::WorkSpace, M)) + +__global__ void print(gpuVertexFinder::ZVertices const* pdata, gpuVertexFinder::WorkSpace const* pws) { + auto const& __restrict__ data = *pdata; + auto const& __restrict__ ws = *pws; + printf("nt,nv %d %d,%d\n", ws.ntrks, data.nvFinal, ws.nvIntermediate); +} + +int main() { +#ifdef __CUDACC__ + cms::cudatest::requireDevices(); + + auto onGPU_d = cms::cuda::make_device_unique(1, nullptr); + auto ws_d = cms::cuda::make_device_unique(1, nullptr); +#else + auto onGPU_d = std::make_unique(); + auto ws_d = std::make_unique(); +#endif + + Event ev; + + float eps = 0.1f; + std::array par{{eps, 0.01f, 9.0f}}; + for (int nav = 30; nav < 80; nav += 20) { + ClusterGenerator gen(nav, 10); + + for (int i = 8; i < 20; ++i) { + auto kk = i / 4; // M param + + gen(ev); + +#ifdef __CUDACC__ + init<<<1, 1, 0, 0>>>(onGPU_d.get(), ws_d.get()); +#else + onGPU_d->init(); + ws_d->init(); +#endif + + std::cout << "v,t size " << ev.zvert.size() << ' ' << ev.ztrack.size() << std::endl; + auto nt = ev.ztrack.size(); +#ifdef __CUDACC__ + cudaCheck(cudaMemcpy(LOC_WS(ntrks), &nt, sizeof(uint32_t), cudaMemcpyHostToDevice)); + cudaCheck(cudaMemcpy(LOC_WS(zt), ev.ztrack.data(), sizeof(float) * ev.ztrack.size(), cudaMemcpyHostToDevice)); + cudaCheck(cudaMemcpy(LOC_WS(ezt2), ev.eztrack.data(), sizeof(float) * ev.eztrack.size(), cudaMemcpyHostToDevice)); + cudaCheck(cudaMemcpy(LOC_WS(ptt2), ev.pttrack.data(), sizeof(float) * ev.eztrack.size(), cudaMemcpyHostToDevice)); +#else + ::memcpy(LOC_WS(ntrks), &nt, sizeof(uint32_t)); + ::memcpy(LOC_WS(zt), ev.ztrack.data(), sizeof(float) * ev.ztrack.size()); + ::memcpy(LOC_WS(ezt2), ev.eztrack.data(), sizeof(float) * ev.eztrack.size()); + ::memcpy(LOC_WS(ptt2), ev.pttrack.data(), sizeof(float) * ev.eztrack.size()); +#endif + + std::cout << "M eps, pset " << kk << ' ' << eps << ' ' << (i % 4) << std::endl; + + if ((i % 4) == 0) + par = {{eps, 0.02f, 12.0f}}; + if ((i % 4) == 1) + par = {{eps, 0.02f, 9.0f}}; + if ((i % 4) == 2) + par = {{eps, 0.01f, 9.0f}}; + if ((i % 4) == 3) + par = {{0.7f * eps, 0.01f, 9.0f}}; + + uint32_t nv = 0; +#ifdef __CUDACC__ + print<<<1, 1, 0, 0>>>(onGPU_d.get(), ws_d.get()); + 
cudaCheck(cudaGetLastError()); + cudaDeviceSynchronize(); + +#ifdef ONE_KERNEL + cms::cuda::launch(vertexFinderOneKernel, {1, 512 + 256}, onGPU_d.get(), ws_d.get(), kk, par[0], par[1], par[2]); +#else + cms::cuda::launch(CLUSTERIZE, {1, 512 + 256}, onGPU_d.get(), ws_d.get(), kk, par[0], par[1], par[2]); +#endif + print<<<1, 1, 0, 0>>>(onGPU_d.get(), ws_d.get()); + + cudaCheck(cudaGetLastError()); + cudaDeviceSynchronize(); + + cms::cuda::launch(gpuVertexFinder::fitVerticesKernel, {1, 1024 - 256}, onGPU_d.get(), ws_d.get(), 50.f); + cudaCheck(cudaGetLastError()); + cudaCheck(cudaMemcpy(&nv, LOC_ONGPU(nvFinal), sizeof(uint32_t), cudaMemcpyDeviceToHost)); + +#else + print(onGPU_d.get(), ws_d.get()); + CLUSTERIZE(onGPU_d.get(), ws_d.get(), kk, par[0], par[1], par[2]); + print(onGPU_d.get(), ws_d.get()); + fitVertices(onGPU_d.get(), ws_d.get(), 50.f); + nv = onGPU_d->nvFinal; +#endif + + if (nv == 0) { + std::cout << "NO VERTICES???" << std::endl; + continue; + } + + float* zv = nullptr; + float* wv = nullptr; + float* ptv2 = nullptr; + int32_t* nn = nullptr; + uint16_t* ind = nullptr; + + // keep chi2 separated... + float chi2[2 * nv]; // make space for splitting... + +#ifdef __CUDACC__ + float hzv[2 * nv]; + float hwv[2 * nv]; + float hptv2[2 * nv]; + int32_t hnn[2 * nv]; + uint16_t hind[2 * nv]; + + zv = hzv; + wv = hwv; + ptv2 = hptv2; + nn = hnn; + ind = hind; +#else + zv = onGPU_d->zv; + wv = onGPU_d->wv; + ptv2 = onGPU_d->ptv2; + nn = onGPU_d->ndof; + ind = onGPU_d->sortInd; +#endif + +#ifdef __CUDACC__ + cudaCheck(cudaMemcpy(nn, LOC_ONGPU(ndof), nv * sizeof(int32_t), cudaMemcpyDeviceToHost)); + cudaCheck(cudaMemcpy(chi2, LOC_ONGPU(chi2), nv * sizeof(float), cudaMemcpyDeviceToHost)); +#else + memcpy(chi2, LOC_ONGPU(chi2), nv * sizeof(float)); +#endif + + for (auto j = 0U; j < nv; ++j) + if (nn[j] > 0) + chi2[j] /= float(nn[j]); + { + auto mx = std::minmax_element(chi2, chi2 + nv); + std::cout << "after fit nv, min max chi2 " << nv << " " << *mx.first << ' ' << *mx.second << std::endl; + } + +#ifdef __CUDACC__ + cms::cuda::launch(gpuVertexFinder::fitVerticesKernel, {1, 1024 - 256}, onGPU_d.get(), ws_d.get(), 50.f); + cudaCheck(cudaMemcpy(&nv, LOC_ONGPU(nvFinal), sizeof(uint32_t), cudaMemcpyDeviceToHost)); + cudaCheck(cudaMemcpy(nn, LOC_ONGPU(ndof), nv * sizeof(int32_t), cudaMemcpyDeviceToHost)); + cudaCheck(cudaMemcpy(chi2, LOC_ONGPU(chi2), nv * sizeof(float), cudaMemcpyDeviceToHost)); +#else + fitVertices(onGPU_d.get(), ws_d.get(), 50.f); + nv = onGPU_d->nvFinal; + memcpy(chi2, LOC_ONGPU(chi2), nv * sizeof(float)); +#endif + + for (auto j = 0U; j < nv; ++j) + if (nn[j] > 0) + chi2[j] /= float(nn[j]); + { + auto mx = std::minmax_element(chi2, chi2 + nv); + std::cout << "before splitting nv, min max chi2 " << nv << " " << *mx.first << ' ' << *mx.second << std::endl; + } + +#ifdef __CUDACC__ + // one vertex per block!!! 
+ cms::cuda::launch(gpuVertexFinder::splitVerticesKernel, {1024, 64}, onGPU_d.get(), ws_d.get(), 9.f); + cudaCheck(cudaMemcpy(&nv, LOC_WS(nvIntermediate), sizeof(uint32_t), cudaMemcpyDeviceToHost)); +#else + splitVertices(onGPU_d.get(), ws_d.get(), 9.f); + nv = ws_d->nvIntermediate; +#endif + std::cout << "after split " << nv << std::endl; + +#ifdef __CUDACC__ + cms::cuda::launch(gpuVertexFinder::fitVerticesKernel, {1, 1024 - 256}, onGPU_d.get(), ws_d.get(), 5000.f); + cudaCheck(cudaGetLastError()); + + cms::cuda::launch(gpuVertexFinder::sortByPt2Kernel, {1, 256}, onGPU_d.get(), ws_d.get()); + cudaCheck(cudaGetLastError()); + cudaCheck(cudaMemcpy(&nv, LOC_ONGPU(nvFinal), sizeof(uint32_t), cudaMemcpyDeviceToHost)); +#else + fitVertices(onGPU_d.get(), ws_d.get(), 5000.f); + sortByPt2(onGPU_d.get(), ws_d.get()); + nv = onGPU_d->nvFinal; + memcpy(chi2, LOC_ONGPU(chi2), nv * sizeof(float)); +#endif + + if (nv == 0) { + std::cout << "NO VERTICES???" << std::endl; + continue; + } + +#ifdef __CUDACC__ + cudaCheck(cudaMemcpy(zv, LOC_ONGPU(zv), nv * sizeof(float), cudaMemcpyDeviceToHost)); + cudaCheck(cudaMemcpy(wv, LOC_ONGPU(wv), nv * sizeof(float), cudaMemcpyDeviceToHost)); + cudaCheck(cudaMemcpy(chi2, LOC_ONGPU(chi2), nv * sizeof(float), cudaMemcpyDeviceToHost)); + cudaCheck(cudaMemcpy(ptv2, LOC_ONGPU(ptv2), nv * sizeof(float), cudaMemcpyDeviceToHost)); + cudaCheck(cudaMemcpy(nn, LOC_ONGPU(ndof), nv * sizeof(int32_t), cudaMemcpyDeviceToHost)); + cudaCheck(cudaMemcpy(ind, LOC_ONGPU(sortInd), nv * sizeof(uint16_t), cudaMemcpyDeviceToHost)); +#endif + for (auto j = 0U; j < nv; ++j) + if (nn[j] > 0) + chi2[j] /= float(nn[j]); + { + auto mx = std::minmax_element(chi2, chi2 + nv); + std::cout << "nv, min max chi2 " << nv << " " << *mx.first << ' ' << *mx.second << std::endl; + } + + { + auto mx = std::minmax_element(wv, wv + nv); + std::cout << "min max error " << 1. / std::sqrt(*mx.first) << ' ' << 1. / std::sqrt(*mx.second) << std::endl; + } + + { + auto mx = std::minmax_element(ptv2, ptv2 + nv); + std::cout << "min max ptv2 " << *mx.first << ' ' << *mx.second << std::endl; + std::cout << "min max ptv2 " << ptv2[ind[0]] << ' ' << ptv2[ind[nv - 1]] << " at " << ind[0] << ' ' + << ind[nv - 1] << std::endl; + } + + float dd[nv]; + for (auto kv = 0U; kv < nv; ++kv) { + auto zr = zv[kv]; + auto md = 500.0f; + for (auto zs : ev.ztrack) { + auto d = std::abs(zr - zs); + md = std::min(d, md); + } + dd[kv] = md; + } + if (i == 6) { + for (auto d : dd) + std::cout << d << ' '; + std::cout << std::endl; + } + auto mx = std::minmax_element(dd, dd + nv); + float rms = 0; + for (auto d : dd) + rms += d * d; + rms = std::sqrt(rms) / (nv - 1); + std::cout << "min max rms " << *mx.first << ' ' << *mx.second << ' ' << rms << std::endl; + + } // loop on events + } // lopp on ave vert + + return 0; +} diff --git a/RecoPixelVertexing/PixelVertexFinding/test/cpuVertexFinder_t.cpp b/RecoPixelVertexing/PixelVertexFinding/test/cpuVertexFinder_t.cpp new file mode 100644 index 0000000000000..a7906fe0d03f5 --- /dev/null +++ b/RecoPixelVertexing/PixelVertexFinding/test/cpuVertexFinder_t.cpp @@ -0,0 +1 @@ +#include "VertexFinder_t.h" diff --git a/RecoPixelVertexing/PixelVertexFinding/test/gpuVertexFinder_t.cu b/RecoPixelVertexing/PixelVertexFinding/test/gpuVertexFinder_t.cu new file mode 100644 index 0000000000000..a7906fe0d03f5 --- /dev/null +++ b/RecoPixelVertexing/PixelVertexFinding/test/gpuVertexFinder_t.cu @@ -0,0 +1 @@ +#include "VertexFinder_t.h"
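A minimal host-side sketch of the logic used by the CPU fallback branch of gpuVertexFinder::sortByPt2 in the patch above: sum pt^2 per vertex from the per-track assignment, then sort vertex indices in ascending order of that sum. The sketch is not part of the patch; the main() driver and the sample values for iv and ptt2 are invented purely for illustration, and plain std:: containers stand in for the SoA fields.

// sort_by_pt2_sketch.cpp -- illustrative only, mirrors the #else branch of sortByPt2
#include <algorithm>
#include <cstdint>
#include <iostream>
#include <numeric>
#include <vector>

int main() {
  // per-track input: assigned vertex index and pt^2 (iv > 9990 marks unassociated tracks)
  std::vector<int32_t> iv{0, 1, 1, 2, 9999, 0};
  std::vector<float> ptt2{1.0f, 4.0f, 0.5f, 9.0f, 2.0f, 0.25f};
  uint32_t nvFinal = 3;

  // accumulate pt^2 per vertex (the GPU version does this with atomicAdd)
  std::vector<float> ptv2(nvFinal, 0.f);
  for (size_t i = 0; i < iv.size(); ++i)
    if (iv[i] <= 9990)
      ptv2[iv[i]] += ptt2[i];

  // ascending sort of vertex indices by summed pt^2, as in the CPU fallback
  std::vector<uint16_t> sortInd(nvFinal);
  std::iota(sortInd.begin(), sortInd.end(), 0);
  std::sort(sortInd.begin(), sortInd.end(), [&](auto i, auto j) { return ptv2[i] < ptv2[j]; });

  for (auto k : sortInd)
    std::cout << "vertex " << k << " sum pt^2 = " << ptv2[k] << '\n';
  return 0;
}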
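Likewise, a standalone sketch of one splitVertices pass for a single vertex, run on the host: tracks are expressed relative to the vertex (zz = zt - zv, ww = 1/ezt2), alternately assigned to the nearer of two candidate positions whose weighted means are recomputed until stable, and the split is kept only if the chi2 distance between the two means exceeds 4. The sample numbers and the main() driver are invented; the kernel in the patch additionally handles shared memory, atomics and the empty-cluster guard.

// split_vertices_sketch.cpp -- illustrative only, host analogue of one splitVertices pass
#include <array>
#include <cmath>
#include <cstdio>
#include <vector>

int main() {
  std::vector<float> zz{-0.12f, -0.10f, -0.09f, 0.08f, 0.10f, 0.11f};  // z - zv
  std::vector<float> ww(zz.size(), 1.f / (0.03f * 0.03f));             // 1/ezt2
  std::vector<int> newV(zz.size());
  for (size_t k = 0; k < zz.size(); ++k)
    newV[k] = zz[k] < 0 ? 0 : 1;  // initial two-way assignment, as in the kernel

  std::array<float, 2> znew{}, wnew{};
  bool more = true;
  int maxiter = 20;
  while (more && maxiter-- > 0) {
    znew = {0.f, 0.f};
    wnew = {0.f, 0.f};
    for (size_t k = 0; k < zz.size(); ++k) {
      znew[newV[k]] += zz[k] * ww[k];
      wnew[newV[k]] += ww[k];
    }
    znew[0] /= wnew[0];
    znew[1] /= wnew[1];
    more = false;
    for (size_t k = 0; k < zz.size(); ++k) {
      int newer = std::fabs(zz[k] - znew[0]) < std::fabs(zz[k] - znew[1]) ? 0 : 1;
      more |= (newer != newV[k]);
      newV[k] = newer;
    }
  }

  // separation cut: accept the split only if the two means are well apart
  float dist2 = (znew[0] - znew[1]) * (znew[0] - znew[1]);
  float chi2Dist = dist2 / (1.f / wnew[0] + 1.f / wnew[1]);
  std::printf("znew = (%f, %f), chi2Dist = %f -> %s\n", znew[0], znew[1], chi2Dist,
              chi2Dist < 4.f ? "keep merged" : "split");
  return 0;
}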