diff --git a/CUDADataFormats/Vertex/BuildFile.xml b/CUDADataFormats/Vertex/BuildFile.xml
new file mode 100644
index 0000000000000..f61e4aff7403f
--- /dev/null
+++ b/CUDADataFormats/Vertex/BuildFile.xml
@@ -0,0 +1,9 @@
+
+
+
+
+
+
+
+
+
diff --git a/CUDADataFormats/Vertex/interface/ZVertexHeterogeneous.h b/CUDADataFormats/Vertex/interface/ZVertexHeterogeneous.h
new file mode 100644
index 0000000000000..aacfddc6fe7e2
--- /dev/null
+++ b/CUDADataFormats/Vertex/interface/ZVertexHeterogeneous.h
@@ -0,0 +1,14 @@
+#ifndef CUDADataFormatsVertexZVertexHeterogeneous_H
+#define CUDADataFormatsVertexZVertexHeterogeneous_H
+
+#include "CUDADataFormats/Vertex/interface/ZVertexSoA.h"
+#include "CUDADataFormats/Common/interface/HeterogeneousSoA.h"
+#include "CUDADataFormats/Track/interface/PixelTrackHeterogeneous.h"
+
+using ZVertexHeterogeneous = HeterogeneousSoA<ZVertexSoA>;
+#ifndef __CUDACC__
+#include "CUDADataFormats/Common/interface/Product.h"
+using ZVertexCUDAProduct = cms::cuda::Product<ZVertexHeterogeneous>;
+#endif
+
+#endif
diff --git a/CUDADataFormats/Vertex/interface/ZVertexSoA.h b/CUDADataFormats/Vertex/interface/ZVertexSoA.h
new file mode 100644
index 0000000000000..e31b87f30fa11
--- /dev/null
+++ b/CUDADataFormats/Vertex/interface/ZVertexSoA.h
@@ -0,0 +1,26 @@
+#ifndef CUDADataFormats_Vertex_ZVertexSoA_h
+#define CUDADataFormats_Vertex_ZVertexSoA_h
+
+#include <cstdint>
+#include "HeterogeneousCore/CUDAUtilities/interface/cudaCompat.h"
+
+// SOA for vertices
+// These vertices are clusterized and fitted only along the beam line (z)
+// to obtain their global coordinate, the beam spot position must be added (possibly correcting for the beam angle as well)
+struct ZVertexSoA {
+ static constexpr uint32_t MAXTRACKS = 32 * 1024;
+ static constexpr uint32_t MAXVTX = 1024;
+
+ int16_t idv[MAXTRACKS]; // vertex index for each associated (original) track (-1 == not associated)
+ float zv[MAXVTX]; // output z-position of found vertices
+ float wv[MAXVTX]; // output weight (1/error^2) on the above
+ float chi2[MAXVTX]; // vertices chi2
+ float ptv2[MAXVTX]; // vertices pt^2
+ int32_t ndof[MAXTRACKS]; // vertices number of dof (reused as workspace for the number of nearest neighbours FIXME)
+ uint16_t sortInd[MAXVTX]; // sorted index (by pt2) ascending
+ uint32_t nvFinal; // the number of vertices
+
+ __host__ __device__ void init() { nvFinal = 0; }
+};
+
+#endif // CUDADataFormats_Vertex_ZVertexSoA_h
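For orientation, a minimal host-side sketch (not part of the patch; the function name dumpZVertexSoA and the nTracks argument are illustrative) of how this SoA is meant to be read: sortInd orders the vertices by pt^2 in ascending order, and idv maps each original track back to its vertex slot.

#include <cmath>
#include <cstdint>
#include <cstdio>
#include "CUDADataFormats/Vertex/interface/ZVertexSoA.h"

// Walk the vertices from highest to lowest sum-pt^2, then scan idv to list the
// tracks attached to each vertex. wv is 1/sigma_z^2, so sigma_z = 1/sqrt(wv).
void dumpZVertexSoA(ZVertexSoA const& soa, uint32_t nTracks) {
  for (int j = int(soa.nvFinal) - 1; j >= 0; --j) {
    uint16_t i = soa.sortInd[j];
    std::printf("vtx %u: z=%.4f sigma_z=%.4f ndof=%d pt2=%.2f\n",
                unsigned(i), soa.zv[i], 1.f / std::sqrt(soa.wv[i]), soa.ndof[i], soa.ptv2[i]);
    for (uint32_t k = 0; k < nTracks; ++k)
      if (soa.idv[k] == int16_t(i))
        std::printf("  track %u\n", k);
  }
}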
diff --git a/CUDADataFormats/Vertex/src/classes.h b/CUDADataFormats/Vertex/src/classes.h
new file mode 100644
index 0000000000000..7931beaa8f4bd
--- /dev/null
+++ b/CUDADataFormats/Vertex/src/classes.h
@@ -0,0 +1,8 @@
+#ifndef CUDADataFormats_Vertex_src_classes_h
+#define CUDADataFormats_Vertex_src_classes_h
+
+#include "CUDADataFormats/Vertex/interface/ZVertexHeterogeneous.h"
+#include "CUDADataFormats/Common/interface/Product.h"
+#include "DataFormats/Common/interface/Wrapper.h"
+
+#endif // CUDADataFormats_Vertex_src_classes_h
diff --git a/CUDADataFormats/Vertex/src/classes_def.xml b/CUDADataFormats/Vertex/src/classes_def.xml
new file mode 100644
index 0000000000000..ea633080af9af
--- /dev/null
+++ b/CUDADataFormats/Vertex/src/classes_def.xml
@@ -0,0 +1,6 @@
+
+
+
+
+
+
diff --git a/DQM/TrackingMonitorClient/python/pixelVertexResolutionClient_cfi.py b/DQM/TrackingMonitorClient/python/pixelVertexResolutionClient_cfi.py
new file mode 100644
index 0000000000000..2558e88d26012
--- /dev/null
+++ b/DQM/TrackingMonitorClient/python/pixelVertexResolutionClient_cfi.py
@@ -0,0 +1,7 @@
+import FWCore.ParameterSet.Config as cms
+
+from DQM.TrackingMonitorClient.primaryVertexResolutionClient_cfi import primaryVertexResolutionClient as _primaryVertexResolutionClient
+
+pixelVertexResolutionClient = _primaryVertexResolutionClient.clone(
+ subDirs = ["OfflinePixelPV/Resolution/*"]
+)
diff --git a/DQMOffline/Configuration/python/DQMOffline_SecondStep_cff.py b/DQMOffline/Configuration/python/DQMOffline_SecondStep_cff.py
index 368b328632fd8..29bf311c474d4 100644
--- a/DQMOffline/Configuration/python/DQMOffline_SecondStep_cff.py
+++ b/DQMOffline/Configuration/python/DQMOffline_SecondStep_cff.py
@@ -122,6 +122,7 @@
from DQM.CTPPS.ctppsDQM_cff import *
from Validation.RecoTau.DQMSequences_cfi import *
from DQM.TrackingMonitorClient.pixelTrackingEffFromHitPattern_cff import *
+from DQM.TrackingMonitorClient.pixelVertexResolutionClient_cfi import *
DQMHarvestTrackerStrip = cms.Sequence ( SiStripOfflineDQMClient )
@@ -179,7 +180,8 @@
DQMHarvestTrackingZeroBias = cms.Sequence( TrackingOfflineDQMClientZeroBias *
dqmFastTimerServiceClient )
-DQMHarvestPixelTracking = cms.Sequence( pixelTrackingEffFromHitPattern )
+DQMHarvestPixelTracking = cms.Sequence( pixelTrackingEffFromHitPattern *
+ pixelVertexResolutionClient )
DQMHarvestOuterTracker = cms.Sequence(
OuterTrackerClient *
diff --git a/DQMOffline/Configuration/python/DQMOffline_cff.py b/DQMOffline/Configuration/python/DQMOffline_cff.py
index 2001c22352a48..ac28700d4eaf4 100644
--- a/DQMOffline/Configuration/python/DQMOffline_cff.py
+++ b/DQMOffline/Configuration/python/DQMOffline_cff.py
@@ -157,10 +157,12 @@
#DQMOfflineCommon
from DQM.TrackingMonitorSource.pixelTracksMonitoring_cff import *
+from DQMOffline.RecoB.PixelVertexMonitor_cff import *
from DQM.SiOuterTracker.OuterTrackerSourceConfig_cff import *
from Validation.RecoTau.DQMSequences_cfi import *
-DQMOfflinePixelTracking = cms.Sequence( pixelTracksMonitoring )
+DQMOfflinePixelTracking = cms.Sequence( pixelTracksMonitoring *
+ pixelPVMonitor )
DQMOuterTracker = cms.Sequence( DQMOfflineDCS *
OuterTrackerSource *
diff --git a/DQMOffline/RecoB/python/PixelVertexMonitor_cff.py b/DQMOffline/RecoB/python/PixelVertexMonitor_cff.py
new file mode 100644
index 0000000000000..9e293f4478bd6
--- /dev/null
+++ b/DQMOffline/RecoB/python/PixelVertexMonitor_cff.py
@@ -0,0 +1,8 @@
+import FWCore.ParameterSet.Config as cms
+
+from DQMOffline.RecoB.PrimaryVertexMonitor_cff import pvMonitor as _pvMonitor
+pixelPVMonitor = _pvMonitor.clone(
+ TopFolderName = "OfflinePixelPV",
+ vertexLabel = "pixelVertices",
+ ndof = cms.int32( 1 )
+)
diff --git a/RecoPixelVertexing/Configuration/python/RecoPixelVertexing_cff.py b/RecoPixelVertexing/Configuration/python/RecoPixelVertexing_cff.py
index 34ee6fadb04de..424ac13a43627 100644
--- a/RecoPixelVertexing/Configuration/python/RecoPixelVertexing_cff.py
+++ b/RecoPixelVertexing/Configuration/python/RecoPixelVertexing_cff.py
@@ -4,7 +4,21 @@
#
# for STARTUP ONLY use try and use Offline 3D PV from pixelTracks, with adaptive vertex
#
-#from RecoPixelVertexing.PixelVertexFinding.PixelVertexes_cff import *
-from RecoVertex.PrimaryVertexProducer.OfflinePixel3DPrimaryVertices_cfi import *
+from RecoPixelVertexing.PixelVertexFinding.PixelVertexes_cff import *
+#from RecoVertex.PrimaryVertexProducer.OfflinePixel3DPrimaryVertices_cfi import *
recopixelvertexingTask = cms.Task(pixelTracksTask,pixelVertices)
recopixelvertexing = cms.Sequence(recopixelvertexingTask)
+
+from Configuration.ProcessModifiers.gpu_cff import gpu
+
+from RecoPixelVertexing.PixelVertexFinding.pixelVertexCUDA_cfi import pixelVertexCUDA
+from RecoPixelVertexing.PixelVertexFinding.pixelVertexSoA_cfi import pixelVertexSoA
+from RecoPixelVertexing.PixelVertexFinding.pixelVertexFromSoA_cfi import pixelVertexFromSoA as _pixelVertexFromSoA
+
+_pixelVertexingCUDATask = cms.Task(pixelTracksTask,pixelVertexCUDA,pixelVertexSoA,pixelVertices)
+
+# pixelVertexSoAonCPU = pixelVertexCUDA.clone()
+# pixelVertexSoAonCPU.onGPU = False;
+
+gpu.toReplaceWith(pixelVertices,_pixelVertexFromSoA)
+gpu.toReplaceWith(recopixelvertexingTask,_pixelVertexingCUDATask)
diff --git a/RecoPixelVertexing/Configuration/python/customizePixelTracksSoAonCPU.py b/RecoPixelVertexing/Configuration/python/customizePixelTracksSoAonCPU.py
index 24cc16e02b463..1661cac832b8b 100644
--- a/RecoPixelVertexing/Configuration/python/customizePixelTracksSoAonCPU.py
+++ b/RecoPixelVertexing/Configuration/python/customizePixelTracksSoAonCPU.py
@@ -20,12 +20,21 @@ def customizePixelTracksSoAonCPU(process):
pixelRecHitSrc = 'siPixelRecHitsPreSplitting'
)
+ from RecoPixelVertexing.PixelVertexFinding.pixelVertexCUDA_cfi import pixelVertexCUDA
+ process.pixelVertexSoA = pixelVertexCUDA.clone(
+ onGPU = False,
+ pixelTrackSrc = 'pixelTrackSoA'
+ )
+
from RecoPixelVertexing.PixelTrackFitting.pixelTrackProducerFromSoA_cfi import pixelTrackProducerFromSoA
process.pixelTracks = pixelTrackProducerFromSoA.clone(
pixelRecHitLegacySrc = 'siPixelRecHitsPreSplitting'
)
- process.reconstruction_step += process.siPixelRecHitsPreSplitting + process.pixelTrackSoA
+ from RecoPixelVertexing.PixelVertexFinding.pixelVertexFromSoA_cfi import pixelVertexFromSoA
+ process.pixelVertices = pixelVertexFromSoA.clone()
+
+ process.reconstruction_step += process.siPixelRecHitsPreSplitting + process.pixelTrackSoA + process.pixelVertexSoA
return process
@@ -46,7 +55,7 @@ def customizePixelTracksSoAonCPUForProfiling(process):
process.siPixelRecHitSoAFromLegacy.convertToLegacy = False
- process.TkSoA = cms.Path(process.offlineBeamSpot + process.siPixelDigis + process.siPixelClustersPreSplitting + process.siPixelRecHitSoAFromLegacy + process.pixelTrackSoA)
+ process.TkSoA = cms.Path(process.offlineBeamSpot + process.siPixelDigis + process.siPixelClustersPreSplitting + process.siPixelRecHitSoAFromLegacy + process.pixelTrackSoA + process.pixelVertexSoA)
process.schedule = cms.Schedule(process.TkSoA)
diff --git a/RecoPixelVertexing/PixelTrackFitting/plugins/PixelTrackDumpCUDA.cc b/RecoPixelVertexing/PixelTrackFitting/plugins/PixelTrackDumpCUDA.cc
new file mode 100644
index 0000000000000..2f0965be50eb8
--- /dev/null
+++ b/RecoPixelVertexing/PixelTrackFitting/plugins/PixelTrackDumpCUDA.cc
@@ -0,0 +1,86 @@
+#include <cuda_runtime.h>
+
+#include "CUDADataFormats/Common/interface/Product.h"
+#include "CUDADataFormats/Track/interface/PixelTrackHeterogeneous.h"
+#include "CUDADataFormats/TrackingRecHit/interface/TrackingRecHit2DHeterogeneous.h"
+#include "CUDADataFormats/Vertex/interface/ZVertexHeterogeneous.h"
+#include "DataFormats/Common/interface/Handle.h"
+#include "FWCore/Framework/interface/ConsumesCollector.h"
+#include "FWCore/Framework/interface/ESHandle.h"
+#include "FWCore/Framework/interface/Event.h"
+#include "FWCore/Framework/interface/EventSetup.h"
+#include "FWCore/Framework/interface/MakerMacros.h"
+#include "FWCore/Framework/interface/global/EDAnalyzer.h"
+#include "FWCore/ParameterSet/interface/ConfigurationDescriptions.h"
+#include "FWCore/ParameterSet/interface/ParameterSet.h"
+#include "FWCore/ParameterSet/interface/ParameterSetDescription.h"
+#include "FWCore/PluginManager/interface/ModuleDef.h"
+#include "FWCore/Utilities/interface/EDGetToken.h"
+#include "FWCore/Utilities/interface/InputTag.h"
+#include "FWCore/Utilities/interface/RunningAverage.h"
+#include "HeterogeneousCore/CUDACore/interface/ScopedContext.h"
+#include "RecoTracker/TkMSParametrization/interface/PixelRecoUtilities.h"
+
+class PixelTrackDumpCUDA : public edm::global::EDAnalyzer<> {
+public:
+ explicit PixelTrackDumpCUDA(const edm::ParameterSet& iConfig);
+ ~PixelTrackDumpCUDA() override = default;
+
+ static void fillDescriptions(edm::ConfigurationDescriptions& descriptions);
+
+private:
+ void analyze(edm::StreamID streamID, edm::Event const& iEvent, const edm::EventSetup& iSetup) const override;
+ const bool m_onGPU;
+ edm::EDGetTokenT<cms::cuda::Product<PixelTrackHeterogeneous>> tokenGPUTrack_;
+ edm::EDGetTokenT<cms::cuda::Product<ZVertexHeterogeneous>> tokenGPUVertex_;
+ edm::EDGetTokenT<PixelTrackHeterogeneous> tokenSoATrack_;
+ edm::EDGetTokenT<ZVertexHeterogeneous> tokenSoAVertex_;
+};
+
+PixelTrackDumpCUDA::PixelTrackDumpCUDA(const edm::ParameterSet& iConfig)
+ : m_onGPU(iConfig.getParameter<bool>("onGPU")) {
+ if (m_onGPU) {
+ tokenGPUTrack_ =
+ consumes<cms::cuda::Product<PixelTrackHeterogeneous>>(iConfig.getParameter<edm::InputTag>("pixelTrackSrc"));
+ tokenGPUVertex_ =
+ consumes<cms::cuda::Product<ZVertexHeterogeneous>>(iConfig.getParameter<edm::InputTag>("pixelVertexSrc"));
+ } else {
+ tokenSoATrack_ = consumes<PixelTrackHeterogeneous>(iConfig.getParameter<edm::InputTag>("pixelTrackSrc"));
+ tokenSoAVertex_ = consumes<ZVertexHeterogeneous>(iConfig.getParameter<edm::InputTag>("pixelVertexSrc"));
+ }
+}
+
+void PixelTrackDumpCUDA::fillDescriptions(edm::ConfigurationDescriptions& descriptions) {
+ edm::ParameterSetDescription desc;
+
+ desc.add("onGPU", true);
+ desc.add("pixelTrackSrc", edm::InputTag("caHitNtupletCUDA"));
+ desc.add("pixelVertexSrc", edm::InputTag("pixelVertexCUDA"));
+ descriptions.add("pixelTrackDumpCUDA", desc);
+}
+
+void PixelTrackDumpCUDA::analyze(edm::StreamID streamID,
+ edm::Event const& iEvent,
+ const edm::EventSetup& iSetup) const {
+ if (m_onGPU) {
+ auto const& hTracks = iEvent.get(tokenGPUTrack_);
+ cms::cuda::ScopedContextProduce ctx{hTracks};
+
+ auto const& tracks = ctx.get(hTracks);
+ auto const* tsoa = tracks.get();
+ assert(tsoa);
+
+ auto const& vertices = ctx.get(iEvent.get(tokenGPUVertex_));
+ auto const* vsoa = vertices.get();
+ assert(vsoa);
+
+ } else {
+ auto const* tsoa = iEvent.get(tokenSoATrack_).get();
+ assert(tsoa);
+
+ auto const* vsoa = iEvent.get(tokenSoAVertex_).get();
+ assert(vsoa);
+ }
+}
+
+DEFINE_FWK_MODULE(PixelTrackDumpCUDA);
diff --git a/RecoPixelVertexing/PixelVertexFinding/plugins/BuildFile.xml b/RecoPixelVertexing/PixelVertexFinding/plugins/BuildFile.xml
index 427799cb122b5..99b91b2587bcf 100644
--- a/RecoPixelVertexing/PixelVertexFinding/plugins/BuildFile.xml
+++ b/RecoPixelVertexing/PixelVertexFinding/plugins/BuildFile.xml
@@ -1,3 +1,4 @@
+
@@ -15,10 +16,12 @@
+
+
-
+
diff --git a/RecoPixelVertexing/PixelVertexFinding/plugins/PixelVertexProducerCUDA.cc b/RecoPixelVertexing/PixelVertexFinding/plugins/PixelVertexProducerCUDA.cc
new file mode 100644
index 0000000000000..e2c2bc76c8612
--- /dev/null
+++ b/RecoPixelVertexing/PixelVertexFinding/plugins/PixelVertexProducerCUDA.cc
@@ -0,0 +1,141 @@
+#include <cuda_runtime.h>
+
+#include "CUDADataFormats/Common/interface/Product.h"
+#include "DataFormats/Common/interface/Handle.h"
+#include "FWCore/Framework/interface/ESHandle.h"
+#include "FWCore/Framework/interface/Event.h"
+#include "FWCore/Framework/interface/EventSetup.h"
+#include "FWCore/Framework/interface/MakerMacros.h"
+#include "FWCore/Framework/interface/global/EDProducer.h"
+#include "FWCore/Framework/interface/ConsumesCollector.h"
+#include "FWCore/ParameterSet/interface/ConfigurationDescriptions.h"
+#include "FWCore/ParameterSet/interface/ParameterSet.h"
+#include "FWCore/ParameterSet/interface/ParameterSetDescription.h"
+#include "FWCore/Utilities/interface/InputTag.h"
+#include "FWCore/PluginManager/interface/ModuleDef.h"
+#include "FWCore/Utilities/interface/EDGetToken.h"
+#include "FWCore/Utilities/interface/RunningAverage.h"
+#include "HeterogeneousCore/CUDACore/interface/ScopedContext.h"
+
+#include "gpuVertexFinder.h"
+
+#undef PIXVERTEX_DEBUG_PRODUCE
+
+class PixelVertexProducerCUDA : public edm::global::EDProducer<> {
+public:
+ explicit PixelVertexProducerCUDA(const edm::ParameterSet& iConfig);
+ ~PixelVertexProducerCUDA() override = default;
+
+ static void fillDescriptions(edm::ConfigurationDescriptions& descriptions);
+
+private:
+ void produceOnGPU(edm::StreamID streamID, edm::Event& iEvent, const edm::EventSetup& iSetup) const;
+ void produceOnCPU(edm::StreamID streamID, edm::Event& iEvent, const edm::EventSetup& iSetup) const;
+ void produce(edm::StreamID streamID, edm::Event& iEvent, const edm::EventSetup& iSetup) const override;
+
+ bool onGPU_;
+
+ edm::EDGetTokenT<cms::cuda::Product<PixelTrackHeterogeneous>> tokenGPUTrack_;
+ edm::EDPutTokenT<ZVertexCUDAProduct> tokenGPUVertex_;
+ edm::EDGetTokenT<PixelTrackHeterogeneous> tokenCPUTrack_;
+ edm::EDPutTokenT<ZVertexHeterogeneous> tokenCPUVertex_;
+
+ const gpuVertexFinder::Producer gpuAlgo_;
+
+ // Tracking cuts before sending tracks to vertex algo
+ const float ptMin_;
+};
+
+PixelVertexProducerCUDA::PixelVertexProducerCUDA(const edm::ParameterSet& conf)
+ : onGPU_(conf.getParameter<bool>("onGPU")),
+ gpuAlgo_(conf.getParameter<bool>("oneKernel"),
+ conf.getParameter<bool>("useDensity"),
+ conf.getParameter<bool>("useDBSCAN"),
+ conf.getParameter<bool>("useIterative"),
+ conf.getParameter<int>("minT"),
+ conf.getParameter<double>("eps"),
+ conf.getParameter<double>("errmax"),
+ conf.getParameter<double>("chi2max")),
+ ptMin_(conf.getParameter<double>("PtMin")) // 0.5 GeV
+{
+ if (onGPU_) {
+ tokenGPUTrack_ =
+ consumes<cms::cuda::Product<PixelTrackHeterogeneous>>(conf.getParameter<edm::InputTag>("pixelTrackSrc"));
+ tokenGPUVertex_ = produces<ZVertexCUDAProduct>();
+ } else {
+ tokenCPUTrack_ = consumes<PixelTrackHeterogeneous>(conf.getParameter<edm::InputTag>("pixelTrackSrc"));
+ tokenCPUVertex_ = produces<ZVertexHeterogeneous>();
+ }
+}
+
+void PixelVertexProducerCUDA::fillDescriptions(edm::ConfigurationDescriptions& descriptions) {
+ edm::ParameterSetDescription desc;
+
+ // Only one of these three algos can be used at once.
+ // Maybe this should become a Plugin Factory
+ desc.add("onGPU", true);
+ desc.add("oneKernel", true);
+ desc.add("useDensity", true);
+ desc.add("useDBSCAN", false);
+ desc.add("useIterative", false);
+
+ desc.add("minT", 2); // min number of neighbours to be "core"
+ desc.add("eps", 0.07); // max absolute distance to cluster
+ desc.add("errmax", 0.01); // max error to be "seed"
+ desc.add("chi2max", 9.); // max normalized distance to cluster
+
+ desc.add("PtMin", 0.5);
+ desc.add("pixelTrackSrc", edm::InputTag("caHitNtupletCUDA"));
+
+ auto label = "pixelVertexCUDA";
+ descriptions.add(label, desc);
+}
+
+void PixelVertexProducerCUDA::produceOnGPU(edm::StreamID streamID,
+ edm::Event& iEvent,
+ const edm::EventSetup& iSetup) const {
+ edm::Handle<cms::cuda::Product<PixelTrackHeterogeneous>> hTracks;
+ iEvent.getByToken(tokenGPUTrack_, hTracks);
+
+ cms::cuda::ScopedContextProduce ctx{*hTracks};
+ auto const* tracks = ctx.get(*hTracks).get();
+
+ assert(tracks);
+
+ ctx.emplace(iEvent, tokenGPUVertex_, gpuAlgo_.makeAsync(ctx.stream(), tracks, ptMin_));
+}
+
+void PixelVertexProducerCUDA::produceOnCPU(edm::StreamID streamID,
+ edm::Event& iEvent,
+ const edm::EventSetup& iSetup) const {
+ auto const* tracks = iEvent.get(tokenCPUTrack_).get();
+ assert(tracks);
+
+#ifdef PIXVERTEX_DEBUG_PRODUCE
+ auto const& tsoa = *tracks;
+ auto maxTracks = tsoa.stride();
+ std::cout << "size of SoA " << sizeof(tsoa) << " stride " << maxTracks << std::endl;
+
+ int32_t nt = 0;
+ for (int32_t it = 0; it < maxTracks; ++it) {
+ auto nHits = tsoa.nHits(it);
+ assert(nHits == int(tsoa.hitIndices.size(it)));
+ if (nHits == 0)
+ break; // this is a guard: maybe we need to move to nTracks...
+ nt++;
+ }
+ std::cout << "found " << nt << " tracks in cpu SoA for Vertexing at " << tracks << std::endl;
+#endif // PIXVERTEX_DEBUG_PRODUCE
+
+ iEvent.emplace(tokenCPUVertex_, gpuAlgo_.make(tracks, ptMin_));
+}
+
+void PixelVertexProducerCUDA::produce(edm::StreamID streamID, edm::Event& iEvent, const edm::EventSetup& iSetup) const {
+ if (onGPU_) {
+ produceOnGPU(streamID, iEvent, iSetup);
+ } else {
+ produceOnCPU(streamID, iEvent, iSetup);
+ }
+}
+
+DEFINE_FWK_MODULE(PixelVertexProducerCUDA);
diff --git a/RecoPixelVertexing/PixelVertexFinding/plugins/PixelVertexProducerFromSoA.cc b/RecoPixelVertexing/PixelVertexFinding/plugins/PixelVertexProducerFromSoA.cc
new file mode 100644
index 0000000000000..62b9bb46bd4a5
--- /dev/null
+++ b/RecoPixelVertexing/PixelVertexFinding/plugins/PixelVertexProducerFromSoA.cc
@@ -0,0 +1,175 @@
+#include "CUDADataFormats/Vertex/interface/ZVertexHeterogeneous.h"
+#include "DataFormats/BeamSpot/interface/BeamSpot.h"
+#include "DataFormats/Common/interface/OrphanHandle.h"
+#include "DataFormats/TrackReco/interface/Track.h"
+#include "DataFormats/TrackReco/interface/TrackExtra.h"
+#include "DataFormats/TrackReco/interface/TrackFwd.h"
+#include "DataFormats/VertexReco/interface/Vertex.h"
+#include "DataFormats/VertexReco/interface/VertexFwd.h"
+#include "FWCore/Framework/interface/ConsumesCollector.h"
+#include "FWCore/Framework/interface/ESHandle.h"
+#include "FWCore/Framework/interface/Event.h"
+#include "FWCore/Framework/interface/EventSetup.h"
+#include "FWCore/Framework/interface/MakerMacros.h"
+#include "FWCore/Framework/interface/global/EDProducer.h"
+#include "FWCore/ParameterSet/interface/ConfigurationDescriptions.h"
+#include "FWCore/ParameterSet/interface/ParameterSet.h"
+#include "FWCore/ParameterSet/interface/ParameterSetDescription.h"
+#include "FWCore/PluginManager/interface/ModuleDef.h"
+#include "FWCore/Utilities/interface/EDGetToken.h"
+#include "FWCore/Utilities/interface/InputTag.h"
+#include "Geometry/Records/interface/TrackerTopologyRcd.h"
+#include "MagneticField/Records/interface/IdealMagneticFieldRecord.h"
+
+#undef PIXVERTEX_DEBUG_PRODUCE
+
+class PixelVertexProducerFromSoA : public edm::global::EDProducer<> {
+public:
+ using IndToEdm = std::vector<uint16_t>;
+
+ explicit PixelVertexProducerFromSoA(const edm::ParameterSet &iConfig);
+ ~PixelVertexProducerFromSoA() override = default;
+
+ static void fillDescriptions(edm::ConfigurationDescriptions &descriptions);
+
+private:
+ void produce(edm::StreamID streamID, edm::Event &iEvent, const edm::EventSetup &iSetup) const override;
+
+ edm::EDGetTokenT<ZVertexHeterogeneous> tokenVertex_;
+ edm::EDGetTokenT<reco::BeamSpot> tokenBeamSpot_;
+ edm::EDGetTokenT<reco::TrackCollection> tokenTracks_;
+ edm::EDGetTokenT<IndToEdm> tokenIndToEdm_;
+};
+
+PixelVertexProducerFromSoA::PixelVertexProducerFromSoA(const edm::ParameterSet &conf)
+ : tokenVertex_(consumes<ZVertexHeterogeneous>(conf.getParameter<edm::InputTag>("src"))),
+ tokenBeamSpot_(consumes<reco::BeamSpot>(conf.getParameter<edm::InputTag>("beamSpot"))),
+ tokenTracks_(consumes<reco::TrackCollection>(conf.getParameter<edm::InputTag>("TrackCollection"))),
+ tokenIndToEdm_(consumes<IndToEdm>(conf.getParameter<edm::InputTag>("TrackCollection"))) {
+ produces<reco::VertexCollection>();
+}
+
+void PixelVertexProducerFromSoA::fillDescriptions(edm::ConfigurationDescriptions &descriptions) {
+ edm::ParameterSetDescription desc;
+
+ desc.add("TrackCollection", edm::InputTag("pixelTracks"));
+ desc.add("beamSpot", edm::InputTag("offlineBeamSpot"));
+ desc.add("src", edm::InputTag("pixelVertexSoA"));
+
+ descriptions.add("pixelVertexFromSoA", desc);
+}
+
+void PixelVertexProducerFromSoA::produce(edm::StreamID streamID, edm::Event &iEvent, const edm::EventSetup &) const {
+ auto vertexes = std::make_unique<reco::VertexCollection>();
+
+ auto tracksHandle = iEvent.getHandle(tokenTracks_);
+ auto tracksSize = tracksHandle->size();
+ auto const &indToEdm = iEvent.get(tokenIndToEdm_);
+ auto bsHandle = iEvent.getHandle(tokenBeamSpot_);
+
+ float x0 = 0, y0 = 0, z0 = 0, dxdz = 0, dydz = 0;
+ std::vector<int32_t> itrk;
+ if (!bsHandle.isValid()) {
+ edm::LogWarning("PixelVertexProducer") << "No beamspot found. returning vertexes with (0,0,Z) ";
+ } else {
+ const reco::BeamSpot &bs = *bsHandle;
+ x0 = bs.x0();
+ y0 = bs.y0();
+ z0 = bs.z0();
+ dxdz = bs.dxdz();
+ dydz = bs.dydz();
+ }
+
+ auto const &soa = *(iEvent.get(tokenVertex_).get());
+
+ int nv = soa.nvFinal;
+
+#ifdef PIXVERTEX_DEBUG_PRODUCE
+ std::cout << "converting " << nv << " vertices "
+ << " from " << indToEdm.size() << " tracks" << std::endl;
+#endif // PIXVERTEX_DEBUG_PRODUCE
+
+ std::set<uint16_t> uind; // for verifying index consistency
+ for (int j = nv - 1; j >= 0; --j) {
+ auto i = soa.sortInd[j]; // on gpu sorted in ascending order....
+ assert(i < nv);
+ uind.insert(i);
+ assert(itrk.empty());
+ auto z = soa.zv[i];
+ auto x = x0 + dxdz * z;
+ auto y = y0 + dydz * z;
+ z += z0;
+ reco::Vertex::Error err;
+ err(2, 2) = 1.f / soa.wv[i];
+ err(2, 2) *= 2.; // artificially inflate error
+ //Copy also the tracks (no intention to be efficient....)
+ for (auto k = 0U; k < indToEdm.size(); ++k) {
+ if (soa.idv[k] == int16_t(i))
+ itrk.push_back(k);
+ }
+ auto nt = itrk.size();
+ if (nt == 0) {
+#ifdef PIXVERTEX_DEBUG_PRODUCE
+ std::cout << "vertex " << i << " with no tracks..." << std::endl;
+#endif // PIXVERTEX_DEBUG_PRODUCE
+ continue;
+ }
+ if (nt < 2) {
+ itrk.clear();
+ continue;
+ } // remove outliers
+ (*vertexes).emplace_back(reco::Vertex::Point(x, y, z), err, soa.chi2[i], soa.ndof[i], nt);
+ auto &v = (*vertexes).back();
+ for (auto it : itrk) {
+ assert(it < int(indToEdm.size()));
+ auto k = indToEdm[it];
+ if (k > tracksSize) {
+ edm::LogWarning("PixelVertexProducer") << "oops track " << it << " does not exists on CPU " << k;
+ continue;
+ }
+ auto tk = reco::TrackRef(tracksHandle, k);
+ v.add(reco::TrackBaseRef(tk));
+ }
+ itrk.clear();
+ }
+
+ LogDebug("PixelVertexProducer") << ": Found " << vertexes->size() << " vertexes\n";
+ for (unsigned int i = 0; i < vertexes->size(); ++i) {
+ LogDebug("PixelVertexProducer") << "Vertex number " << i << " has " << (*vertexes)[i].tracksSize()
+ << " tracks with a position of " << (*vertexes)[i].z() << " +- "
+ << std::sqrt((*vertexes)[i].covariance(2, 2));
+ }
+
+ // legacy logic....
+ if (vertexes->empty() && bsHandle.isValid()) {
+ const reco::BeamSpot &bs = *bsHandle;
+
+ GlobalError bse(bs.rotatedCovariance3D());
+ if ((bse.cxx() <= 0.) || (bse.cyy() <= 0.) || (bse.czz() <= 0.)) {
+ AlgebraicSymMatrix33 we;
+ we(0, 0) = 10000;
+ we(1, 1) = 10000;
+ we(2, 2) = 10000;
+ vertexes->push_back(reco::Vertex(bs.position(), we, 0., 0., 0));
+
+ edm::LogInfo("PixelVertexProducer") << "No vertices found. Beamspot with invalid errors " << bse.matrix()
+ << "\nWill put Vertex derived from dummy-fake BeamSpot into Event.\n"
+ << (*vertexes)[0].x() << "\n"
+ << (*vertexes)[0].y() << "\n"
+ << (*vertexes)[0].z() << "\n";
+ } else {
+ vertexes->push_back(reco::Vertex(bs.position(), bs.rotatedCovariance3D(), 0., 0., 0));
+
+ edm::LogInfo("PixelVertexProducer") << "No vertices found. Will put Vertex derived from BeamSpot into Event:\n"
+ << (*vertexes)[0].x() << "\n"
+ << (*vertexes)[0].y() << "\n"
+ << (*vertexes)[0].z() << "\n";
+ }
+ } else if (vertexes->empty() && !bsHandle.isValid()) {
+ edm::LogWarning("PixelVertexProducer") << "No beamspot and no vertex found. No vertex returned.";
+ }
+
+ iEvent.put(std::move(vertexes));
+}
+
+DEFINE_FWK_MODULE(PixelVertexProducerFromSoA);
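The beam-spot handling in produce() amounts to propagating the beam line to the fitted z before shifting into the global frame. A standalone sketch of that geometry (SimpleBS is a hypothetical stand-in for the few reco::BeamSpot fields actually used):

#include "CUDADataFormats/Vertex/interface/ZVertexSoA.h"

struct SimpleBS {
  float x0, y0, z0, dxdz, dydz;
};

// Same arithmetic as the loop above: the vertex is fitted only along z (relative
// to the beam spot), so x and y follow the beam slope at that z.
void globalPosition(ZVertexSoA const& soa, uint16_t i, SimpleBS const& bs, float& x, float& y, float& z) {
  z = soa.zv[i];            // z relative to the beam spot
  x = bs.x0 + bs.dxdz * z;  // correct for the beam tilt in x
  y = bs.y0 + bs.dydz * z;  // ... and in y
  z += bs.z0;               // shift into the global frame
}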
diff --git a/RecoPixelVertexing/PixelVertexFinding/plugins/PixelVertexSoAFromCUDA.cc b/RecoPixelVertexing/PixelVertexFinding/plugins/PixelVertexSoAFromCUDA.cc
new file mode 100644
index 0000000000000..0cadf24580cf7
--- /dev/null
+++ b/RecoPixelVertexing/PixelVertexFinding/plugins/PixelVertexSoAFromCUDA.cc
@@ -0,0 +1,65 @@
+#include <cuda_runtime.h>
+
+#include "CUDADataFormats/Common/interface/Product.h"
+#include "CUDADataFormats/Common/interface/HostProduct.h"
+#include "CUDADataFormats/Vertex/interface/ZVertexHeterogeneous.h"
+#include "DataFormats/Common/interface/Handle.h"
+#include "FWCore/Framework/interface/ESHandle.h"
+#include "FWCore/Framework/interface/Event.h"
+#include "FWCore/Framework/interface/EventSetup.h"
+#include "FWCore/Framework/interface/MakerMacros.h"
+#include "FWCore/Framework/interface/stream/EDProducer.h"
+#include "FWCore/ParameterSet/interface/ConfigurationDescriptions.h"
+#include "FWCore/ParameterSet/interface/ParameterSet.h"
+#include "FWCore/ParameterSet/interface/ParameterSetDescription.h"
+#include "FWCore/PluginManager/interface/ModuleDef.h"
+#include "FWCore/Utilities/interface/EDGetToken.h"
+#include "FWCore/Utilities/interface/InputTag.h"
+#include "HeterogeneousCore/CUDACore/interface/ScopedContext.h"
+
+class PixelVertexSoAFromCUDA : public edm::stream::EDProducer<edm::ExternalWork> {
+public:
+ explicit PixelVertexSoAFromCUDA(const edm::ParameterSet& iConfig);
+ ~PixelVertexSoAFromCUDA() override = default;
+
+ static void fillDescriptions(edm::ConfigurationDescriptions& descriptions);
+
+private:
+ void acquire(edm::Event const& iEvent,
+ edm::EventSetup const& iSetup,
+ edm::WaitingTaskWithArenaHolder waitingTaskHolder) override;
+ void produce(edm::Event& iEvent, edm::EventSetup const& iSetup) override;
+
+ edm::EDGetTokenT<cms::cuda::Product<ZVertexHeterogeneous>> tokenCUDA_;
+ edm::EDPutTokenT<ZVertexHeterogeneous> tokenSOA_;
+
+ cms::cuda::host::unique_ptr<ZVertexSoA> m_soa;
+};
+
+PixelVertexSoAFromCUDA::PixelVertexSoAFromCUDA(const edm::ParameterSet& iConfig)
+ : tokenCUDA_(consumes<cms::cuda::Product<ZVertexHeterogeneous>>(iConfig.getParameter<edm::InputTag>("src"))),
+ tokenSOA_(produces<ZVertexHeterogeneous>()) {}
+
+void PixelVertexSoAFromCUDA::fillDescriptions(edm::ConfigurationDescriptions& descriptions) {
+ edm::ParameterSetDescription desc;
+
+ desc.add("src", edm::InputTag("pixelVertexCUDA"));
+ descriptions.add("pixelVertexSoA", desc);
+}
+
+void PixelVertexSoAFromCUDA::acquire(edm::Event const& iEvent,
+ edm::EventSetup const& iSetup,
+ edm::WaitingTaskWithArenaHolder waitingTaskHolder) {
+ auto const& inputDataWrapped = iEvent.get(tokenCUDA_);
+ cms::cuda::ScopedContextAcquire ctx{inputDataWrapped, std::move(waitingTaskHolder)};
+ auto const& inputData = ctx.get(inputDataWrapped);
+
+ m_soa = inputData.toHostAsync(ctx.stream());
+}
+
+void PixelVertexSoAFromCUDA::produce(edm::Event& iEvent, edm::EventSetup const& iSetup) {
+ // No copies....
+ iEvent.emplace(tokenSOA_, ZVertexHeterogeneous(std::move(m_soa)));
+}
+
+DEFINE_FWK_MODULE(PixelVertexSoAFromCUDA);
diff --git a/RecoPixelVertexing/PixelVertexFinding/plugins/gpuClusterTracksByDensity.h b/RecoPixelVertexing/PixelVertexFinding/plugins/gpuClusterTracksByDensity.h
new file mode 100644
index 0000000000000..b19aeb5930fc6
--- /dev/null
+++ b/RecoPixelVertexing/PixelVertexFinding/plugins/gpuClusterTracksByDensity.h
@@ -0,0 +1,234 @@
+#ifndef RecoPixelVertexing_PixelVertexFinding_plugins_gpuClusterTracksByDensity_h
+#define RecoPixelVertexing_PixelVertexFinding_plugins_gpuClusterTracksByDensity_h
+
+#include <algorithm>
+#include <cmath>
+#include <cstdint>
+
+#include "HeterogeneousCore/CUDAUtilities/interface/HistoContainer.h"
+#include "HeterogeneousCore/CUDAUtilities/interface/cuda_assert.h"
+
+#include "gpuVertexFinder.h"
+
+namespace gpuVertexFinder {
+
+ // this algo does not really scale as it works in a single block...
+ // enough for the <10K tracks we have
+ //
+ // based on the Rodriguez & Laio algorithm
+ //
+ __device__ __forceinline__ void clusterTracksByDensity(gpuVertexFinder::ZVertices* pdata,
+ gpuVertexFinder::WorkSpace* pws,
+ int minT, // min number of neighbours to be "seed"
+ float eps, // max absolute distance to cluster
+ float errmax, // max error to be "seed"
+ float chi2max // max normalized distance to cluster
+ ) {
+ using namespace gpuVertexFinder;
+ constexpr bool verbose = false; // in principle the compiler should optimize it out if false
+
+ if (verbose && 0 == threadIdx.x)
+ printf("params %d %f %f %f\n", minT, eps, errmax, chi2max);
+
+ auto er2mx = errmax * errmax;
+
+ auto& __restrict__ data = *pdata;
+ auto& __restrict__ ws = *pws;
+ auto nt = ws.ntrks;
+ float const* __restrict__ zt = ws.zt;
+ float const* __restrict__ ezt2 = ws.ezt2;
+
+ uint32_t& nvFinal = data.nvFinal;
+ uint32_t& nvIntermediate = ws.nvIntermediate;
+
+ uint8_t* __restrict__ izt = ws.izt;
+ int32_t* __restrict__ nn = data.ndof;
+ int32_t* __restrict__ iv = ws.iv;
+
+ assert(pdata);
+ assert(zt);
+
+ using Hist = cms::cuda::HistoContainer<uint8_t, 256, 16000, 8, uint16_t>;
+ __shared__ Hist hist;
+ __shared__ typename Hist::Counter hws[32];
+ for (auto j = threadIdx.x; j < Hist::totbins(); j += blockDim.x) {
+ hist.off[j] = 0;
+ }
+ __syncthreads();
+
+ if (verbose && 0 == threadIdx.x)
+ printf("booked hist with %d bins, size %d for %d tracks\n", hist.nbins(), hist.capacity(), nt);
+
+ assert(nt <= hist.capacity());
+
+ // fill hist (bin shall be wider than "eps")
+ for (auto i = threadIdx.x; i < nt; i += blockDim.x) {
+ assert(i < ZVertices::MAXTRACKS);
+ int iz = int(zt[i] * 10.); // valid if eps<=0.1
+ // iz = std::clamp(iz, INT8_MIN, INT8_MAX); // sorry c++17 only
+ iz = std::min(std::max(iz, INT8_MIN), INT8_MAX);
+ izt[i] = iz - INT8_MIN;
+ assert(iz - INT8_MIN >= 0);
+ assert(iz - INT8_MIN < 256);
+ hist.count(izt[i]);
+ iv[i] = i;
+ nn[i] = 0;
+ }
+ __syncthreads();
+ if (threadIdx.x < 32)
+ hws[threadIdx.x] = 0; // used by prefix scan...
+ __syncthreads();
+ hist.finalize(hws);
+ __syncthreads();
+ assert(hist.size() == nt);
+ for (auto i = threadIdx.x; i < nt; i += blockDim.x) {
+ hist.fill(izt[i], uint16_t(i));
+ }
+ __syncthreads();
+
+ // count neighbours
+ for (auto i = threadIdx.x; i < nt; i += blockDim.x) {
+ if (ezt2[i] > er2mx)
+ continue;
+ auto loop = [&](uint32_t j) {
+ if (i == j)
+ return;
+ auto dist = std::abs(zt[i] - zt[j]);
+ if (dist > eps)
+ return;
+ if (dist * dist > chi2max * (ezt2[i] + ezt2[j]))
+ return;
+ nn[i]++;
+ };
+
+ cms::cuda::forEachInBins(hist, izt[i], 1, loop);
+ }
+
+ __syncthreads();
+
+ // find closest above me .... (we ignore the possibility of two j at same distance from i)
+ for (auto i = threadIdx.x; i < nt; i += blockDim.x) {
+ float mdist = eps;
+ auto loop = [&](uint32_t j) {
+ if (nn[j] < nn[i])
+ return;
+ if (nn[j] == nn[i] && zt[j] >= zt[i])
+ return; // if equal use natural order...
+ auto dist = std::abs(zt[i] - zt[j]);
+ if (dist > mdist)
+ return;
+ if (dist * dist > chi2max * (ezt2[i] + ezt2[j]))
+ return; // (break natural order???)
+ mdist = dist;
+ iv[i] = j; // assign to cluster (better be unique??)
+ };
+ cms::cuda::forEachInBins(hist, izt[i], 1, loop);
+ }
+
+ __syncthreads();
+
+#ifdef GPU_DEBUG
+ // mini verification
+ for (auto i = threadIdx.x; i < nt; i += blockDim.x) {
+ if (iv[i] != int(i))
+ assert(iv[iv[i]] != int(i));
+ }
+ __syncthreads();
+#endif
+
+ // consolidate graph (percolate index of seed)
+ for (auto i = threadIdx.x; i < nt; i += blockDim.x) {
+ auto m = iv[i];
+ while (m != iv[m])
+ m = iv[m];
+ iv[i] = m;
+ }
+
+#ifdef GPU_DEBUG
+ __syncthreads();
+ // mini verification
+ for (auto i = threadIdx.x; i < nt; i += blockDim.x) {
+ if (iv[i] != int(i))
+ assert(iv[iv[i]] != int(i));
+ }
+#endif
+
+#ifdef GPU_DEBUG
+ // and verify that we did not split any cluster...
+ for (auto i = threadIdx.x; i < nt; i += blockDim.x) {
+ auto minJ = i;
+ auto mdist = eps;
+ auto loop = [&](uint32_t j) {
+ if (nn[j] < nn[i])
+ return;
+ if (nn[j] == nn[i] && zt[j] >= zt[i])
+ return; // if equal use natural order...
+ auto dist = std::abs(zt[i] - zt[j]);
+ if (dist > mdist)
+ return;
+ if (dist * dist > chi2max * (ezt2[i] + ezt2[j]))
+ return;
+ mdist = dist;
+ minJ = j;
+ };
+ cms::cuda::forEachInBins(hist, izt[i], 1, loop);
+ // should belong to the same cluster...
+ assert(iv[i] == iv[minJ]);
+ assert(nn[i] <= nn[iv[i]]);
+ }
+ __syncthreads();
+#endif
+
+ __shared__ unsigned int foundClusters;
+ foundClusters = 0;
+ __syncthreads();
+
+ // find the number of different clusters, identified by tracks with clus[i] == i and density larger than threshold;
+ // mark these tracks with a negative id.
+ for (auto i = threadIdx.x; i < nt; i += blockDim.x) {
+ if (iv[i] == int(i)) {
+ if (nn[i] >= minT) {
+ auto old = atomicInc(&foundClusters, 0xffffffff);
+ iv[i] = -(old + 1);
+ } else { // noise
+ iv[i] = -9998;
+ }
+ }
+ }
+ __syncthreads();
+
+ assert(foundClusters < ZVertices::MAXVTX);
+
+ // propagate the negative id to all the tracks in the cluster.
+ for (auto i = threadIdx.x; i < nt; i += blockDim.x) {
+ if (iv[i] >= 0) {
+ // mark each track in a cluster with the same id as the first one
+ iv[i] = iv[iv[i]];
+ }
+ }
+ __syncthreads();
+
+ // adjust the cluster id to be a positive value starting from 0
+ for (auto i = threadIdx.x; i < nt; i += blockDim.x) {
+ iv[i] = -iv[i] - 1;
+ }
+
+ nvIntermediate = nvFinal = foundClusters;
+
+ if (verbose && 0 == threadIdx.x)
+ printf("found %d proto vertices\n", foundClusters);
+ }
+
+ __global__ void clusterTracksByDensityKernel(gpuVertexFinder::ZVertices* pdata,
+ gpuVertexFinder::WorkSpace* pws,
+ int minT, // min number of neighbours to be "seed"
+ float eps, // max absolute distance to cluster
+ float errmax, // max error to be "seed"
+ float chi2max // max normalized distance to cluster
+ ) {
+ clusterTracksByDensity(pdata, pws, minT, eps, errmax, chi2max);
+ }
+
+} // namespace gpuVertexFinder
+
+#endif // RecoPixelVertexing_PixelVertexFinding_plugins_gpuClusterTracksByDensity_h
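For reference, the clustering above reduces to the following serial, brute-force logic (a hypothetical CPU sketch, without the histogram, shared memory, or the bin-limited neighbour search used by the kernel): count neighbours within eps, link every track to its densest compatible neighbour, percolate the links to the seed, then promote seeds with at least minT neighbours to vertices and flag the rest as noise.

#include <cmath>
#include <vector>

std::vector<int> clusterByDensityCPU(std::vector<float> const& zt,
                                     std::vector<float> const& ezt2,
                                     int minT, float eps, float errmax, float chi2max) {
  int nt = int(zt.size());
  std::vector<int> nn(nt, 0), iv(nt);
  for (int i = 0; i < nt; ++i)
    iv[i] = i;
  // count neighbours (only for tracks with small enough error)
  for (int i = 0; i < nt; ++i) {
    if (ezt2[i] > errmax * errmax)
      continue;
    for (int j = 0; j < nt; ++j) {
      if (i == j)
        continue;
      float dist = std::abs(zt[i] - zt[j]);
      if (dist > eps || dist * dist > chi2max * (ezt2[i] + ezt2[j]))
        continue;
      nn[i]++;
    }
  }
  // link each track to its closest denser (or equally dense, smaller-z) neighbour
  for (int i = 0; i < nt; ++i) {
    float mdist = eps;
    for (int j = 0; j < nt; ++j) {
      if (nn[j] < nn[i] || (nn[j] == nn[i] && zt[j] >= zt[i]))
        continue;
      float dist = std::abs(zt[i] - zt[j]);
      if (dist > mdist || dist * dist > chi2max * (ezt2[i] + ezt2[j]))
        continue;
      mdist = dist;
      iv[i] = j;
    }
  }
  // percolate to the seed
  for (int i = 0; i < nt; ++i) {
    int m = iv[i];
    while (m != iv[m])
      m = iv[m];
    iv[i] = m;
  }
  // seeds with enough neighbours become clusters, the others become noise
  int nclus = 0;
  for (int i = 0; i < nt; ++i)
    if (iv[i] == i)
      iv[i] = (nn[i] >= minT) ? -(++nclus) : -9998;
  for (int i = 0; i < nt; ++i)
    if (iv[i] >= 0)
      iv[i] = iv[iv[i]];
  for (int i = 0; i < nt; ++i)
    iv[i] = -iv[i] - 1;  // cluster id per track; noise ends up at 9997
  return iv;
}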
diff --git a/RecoPixelVertexing/PixelVertexFinding/plugins/gpuClusterTracksDBSCAN.h b/RecoPixelVertexing/PixelVertexFinding/plugins/gpuClusterTracksDBSCAN.h
new file mode 100644
index 0000000000000..22ba1e15b4e05
--- /dev/null
+++ b/RecoPixelVertexing/PixelVertexFinding/plugins/gpuClusterTracksDBSCAN.h
@@ -0,0 +1,241 @@
+#ifndef RecoPixelVertexing_PixelVertexFinding_plugins_gpuClusterTracksDBSCAN_h
+#define RecoPixelVertexing_PixelVertexFinding_plugins_gpuClusterTracksDBSCAN_h
+
+#include <algorithm>
+#include <cmath>
+#include <cstdint>
+
+#include "HeterogeneousCore/CUDAUtilities/interface/HistoContainer.h"
+#include "HeterogeneousCore/CUDAUtilities/interface/cuda_assert.h"
+
+#include "gpuVertexFinder.h"
+
+namespace gpuVertexFinder {
+
+ // this algo does not really scale as it works in a single block...
+ // enough for <10K tracks we have
+ __global__ void clusterTracksDBSCAN(ZVertices* pdata,
+ WorkSpace* pws,
+ int minT, // min number of neighbours to be "core"
+ float eps, // max absolute distance to cluster
+ float errmax, // max error to be "seed"
+ float chi2max // max normalized distance to cluster
+ ) {
+ constexpr bool verbose = false; // in principle the compiler should optimize it out if false
+
+ if (verbose && 0 == threadIdx.x)
+ printf("params %d %f %f %f\n", minT, eps, errmax, chi2max);
+
+ auto er2mx = errmax * errmax;
+
+ auto& __restrict__ data = *pdata;
+ auto& __restrict__ ws = *pws;
+ auto nt = ws.ntrks;
+ float const* __restrict__ zt = ws.zt;
+ float const* __restrict__ ezt2 = ws.ezt2;
+
+ uint32_t& nvFinal = data.nvFinal;
+ uint32_t& nvIntermediate = ws.nvIntermediate;
+
+ uint8_t* __restrict__ izt = ws.izt;
+ int32_t* __restrict__ nn = data.ndof;
+ int32_t* __restrict__ iv = ws.iv;
+
+ assert(pdata);
+ assert(zt);
+
+ using Hist = cms::cuda::HistoContainer<uint8_t, 256, 16000, 8, uint16_t>;
+ __shared__ Hist hist;
+ __shared__ typename Hist::Counter hws[32];
+ for (auto j = threadIdx.x; j < Hist::totbins(); j += blockDim.x) {
+ hist.off[j] = 0;
+ }
+ __syncthreads();
+
+ if (verbose && 0 == threadIdx.x)
+ printf("booked hist with %d bins, size %d for %d tracks\n", hist.nbins(), hist.capacity(), nt);
+
+ assert(nt <= hist.capacity());
+
+ // fill hist (bin shall be wider than "eps")
+ for (auto i = threadIdx.x; i < nt; i += blockDim.x) {
+ assert(i < ZVertices::MAXTRACKS);
+ int iz = int(zt[i] * 10.); // valid if eps<=0.1
+ iz = std::clamp(iz, INT8_MIN, INT8_MAX);
+ izt[i] = iz - INT8_MIN;
+ assert(iz - INT8_MIN >= 0);
+ assert(iz - INT8_MIN < 256);
+ hist.count(izt[i]);
+ iv[i] = i;
+ nn[i] = 0;
+ }
+ __syncthreads();
+ if (threadIdx.x < 32)
+ hws[threadIdx.x] = 0; // used by prefix scan...
+ __syncthreads();
+ hist.finalize(hws);
+ __syncthreads();
+ assert(hist.size() == nt);
+ for (auto i = threadIdx.x; i < nt; i += blockDim.x) {
+ hist.fill(izt[i], uint16_t(i));
+ }
+ __syncthreads();
+
+ // count neighbours
+ for (auto i = threadIdx.x; i < nt; i += blockDim.x) {
+ if (ezt2[i] > er2mx)
+ continue;
+ auto loop = [&](uint32_t j) {
+ if (i == j)
+ return;
+ auto dist = std::abs(zt[i] - zt[j]);
+ if (dist > eps)
+ return;
+ // if (dist*dist>chi2max*(ezt2[i]+ezt2[j])) return;
+ nn[i]++;
+ };
+
+ cms::cuda::forEachInBins(hist, izt[i], 1, loop);
+ }
+
+ __syncthreads();
+
+ // find NN with smaller z...
+ for (auto i = threadIdx.x; i < nt; i += blockDim.x) {
+ if (nn[i] < minT)
+ continue; // DBSCAN core rule
+ float mz = zt[i];
+ auto loop = [&](uint32_t j) {
+ if (zt[j] >= mz)
+ return;
+ if (nn[j] < minT)
+ return; // DBSCAN core rule
+ auto dist = std::abs(zt[i] - zt[j]);
+ if (dist > eps)
+ return;
+ // if (dist*dist>chi2max*(ezt2[i]+ezt2[j])) return;
+ mz = zt[j];
+ iv[i] = j; // assign to cluster (better be unique??)
+ };
+ cms::cuda::forEachInBins(hist, izt[i], 1, loop);
+ }
+
+ __syncthreads();
+
+#ifdef GPU_DEBUG
+ // mini verification
+ for (auto i = threadIdx.x; i < nt; i += blockDim.x) {
+ if (iv[i] != int(i))
+ assert(iv[iv[i]] != int(i));
+ }
+ __syncthreads();
+#endif
+
+ // consolidate graph (percolate index of seed)
+ for (auto i = threadIdx.x; i < nt; i += blockDim.x) {
+ auto m = iv[i];
+ while (m != iv[m])
+ m = iv[m];
+ iv[i] = m;
+ }
+
+ __syncthreads();
+
+#ifdef GPU_DEBUG
+ // mini verification
+ for (auto i = threadIdx.x; i < nt; i += blockDim.x) {
+ if (iv[i] != int(i))
+ assert(iv[iv[i]] != int(i));
+ }
+ __syncthreads();
+#endif
+
+#ifdef GPU_DEBUG
+ // and verify that we did not split any cluster...
+ for (auto i = threadIdx.x; i < nt; i += blockDim.x) {
+ if (nn[i] < minT)
+ continue; // DBSCAN core rule
+ assert(zt[iv[i]] <= zt[i]);
+ auto loop = [&](uint32_t j) {
+ if (nn[j] < minT)
+ return; // DBSCAN core rule
+ auto dist = std::abs(zt[i] - zt[j]);
+ if (dist > eps)
+ return;
+ // if (dist*dist>chi2max*(ezt2[i]+ezt2[j])) return;
+ // they should belong to the same cluster, shouldn't they?
+ if (iv[i] != iv[j]) {
+ printf("ERROR %d %d %f %f %d\n", i, iv[i], zt[i], zt[iv[i]], iv[iv[i]]);
+ printf(" %d %d %f %f %d\n", j, iv[j], zt[j], zt[iv[j]], iv[iv[j]]);
+ ;
+ }
+ assert(iv[i] == iv[j]);
+ };
+ cms::cuda::forEachInBins(hist, izt[i], 1, loop);
+ }
+ __syncthreads();
+#endif
+
+ // collect edges (assign to closest cluster of closest point??? here to closest point)
+ for (auto i = threadIdx.x; i < nt; i += blockDim.x) {
+ // if (nn[i]==0 || nn[i]>=minT) continue; // DBSCAN edge rule
+ if (nn[i] >= minT)
+ continue; // DBSCAN edge rule
+ float mdist = eps;
+ auto loop = [&](uint32_t j) {
+ if (nn[j] < minT)
+ return; // DBSCAN core rule
+ auto dist = std::abs(zt[i] - zt[j]);
+ if (dist > mdist)
+ return;
+ if (dist * dist > chi2max * (ezt2[i] + ezt2[j]))
+ return; // needed?
+ mdist = dist;
+ iv[i] = iv[j]; // assign to cluster (better be unique??)
+ };
+ cms::cuda::forEachInBins(hist, izt[i], 1, loop);
+ }
+
+ __shared__ unsigned int foundClusters;
+ foundClusters = 0;
+ __syncthreads();
+
+ // find the number of different clusters, identified by tracks with clus[i] == i;
+ // mark these tracks with a negative id.
+ for (auto i = threadIdx.x; i < nt; i += blockDim.x) {
+ if (iv[i] == int(i)) {
+ if (nn[i] >= minT) {
+ auto old = atomicInc(&foundClusters, 0xffffffff);
+ iv[i] = -(old + 1);
+ } else { // noise
+ iv[i] = -9998;
+ }
+ }
+ }
+ __syncthreads();
+
+ assert(foundClusters < ZVertices::MAXVTX);
+
+ // propagate the negative id to all the tracks in the cluster.
+ for (auto i = threadIdx.x; i < nt; i += blockDim.x) {
+ if (iv[i] >= 0) {
+ // mark each track in a cluster with the same id as the first one
+ iv[i] = iv[iv[i]];
+ }
+ }
+ __syncthreads();
+
+ // adjust the cluster id to be a positive value starting from 0
+ for (auto i = threadIdx.x; i < nt; i += blockDim.x) {
+ iv[i] = -iv[i] - 1;
+ }
+
+ nvIntermediate = nvFinal = foundClusters;
+
+ if (verbose && 0 == threadIdx.x)
+ printf("found %d proto vertices\n", foundClusters);
+ }
+
+} // namespace gpuVertexFinder
+
+#endif // RecoPixelVertexing_PixelVertexFinding_plugins_gpuClusterTracksDBSCAN_h
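The difference with the density-based version above is the DBSCAN core/edge rule: only tracks with at least minT neighbours ("core") can seed or propagate a cluster, and a non-core track is attached only if a compatible core track is nearby. A hypothetical helper spelling out that classification:

#include <cmath>
#include <vector>

enum class DbscanRole { Core, Edge, Noise };

DbscanRole classify(int i,
                    std::vector<float> const& zt,
                    std::vector<float> const& ezt2,
                    std::vector<int> const& nn,  // neighbour counts within eps
                    int minT, float eps, float chi2max) {
  if (nn[i] >= minT)
    return DbscanRole::Core;
  for (size_t j = 0; j < zt.size(); ++j) {
    float dist = std::abs(zt[i] - zt[j]);
    if (nn[j] >= minT && dist <= eps && dist * dist <= chi2max * (ezt2[i] + ezt2[j]))
      return DbscanRole::Edge;  // in the kernel it joins the cluster of the closest such core track
  }
  return DbscanRole::Noise;
}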
diff --git a/RecoPixelVertexing/PixelVertexFinding/plugins/gpuClusterTracksIterative.h b/RecoPixelVertexing/PixelVertexFinding/plugins/gpuClusterTracksIterative.h
new file mode 100644
index 0000000000000..1f2934ba15d0c
--- /dev/null
+++ b/RecoPixelVertexing/PixelVertexFinding/plugins/gpuClusterTracksIterative.h
@@ -0,0 +1,212 @@
+#ifndef RecoPixelVertexing_PixelVertexFinding_plugins_gpuClusterTracksIterative_h
+#define RecoPixelVertexing_PixelVertexFinding_plugins_gpuClusterTracksIterative_h
+
+#include <algorithm>
+#include <cmath>
+#include <cstdint>
+
+#include "HeterogeneousCore/CUDAUtilities/interface/HistoContainer.h"
+#include "HeterogeneousCore/CUDAUtilities/interface/cuda_assert.h"
+
+#include "gpuVertexFinder.h"
+
+namespace gpuVertexFinder {
+
+ // this algo does not really scale as it works in a single block...
+ // enough for <10K tracks we have
+ __global__ void clusterTracksIterative(ZVertices* pdata,
+ WorkSpace* pws,
+ int minT, // min number of neighbours to be "core"
+ float eps, // max absolute distance to cluster
+ float errmax, // max error to be "seed"
+ float chi2max // max normalized distance to cluster
+ ) {
+ constexpr bool verbose = false; // in principle the compiler should optimize it out if false
+
+ if (verbose && 0 == threadIdx.x)
+ printf("params %d %f %f %f\n", minT, eps, errmax, chi2max);
+
+ auto er2mx = errmax * errmax;
+
+ auto& __restrict__ data = *pdata;
+ auto& __restrict__ ws = *pws;
+ auto nt = ws.ntrks;
+ float const* __restrict__ zt = ws.zt;
+ float const* __restrict__ ezt2 = ws.ezt2;
+
+ uint32_t& nvFinal = data.nvFinal;
+ uint32_t& nvIntermediate = ws.nvIntermediate;
+
+ uint8_t* __restrict__ izt = ws.izt;
+ int32_t* __restrict__ nn = data.ndof;
+ int32_t* __restrict__ iv = ws.iv;
+
+ assert(pdata);
+ assert(zt);
+
+ using Hist = cms::cuda::HistoContainer<uint8_t, 256, 16000, 8, uint16_t>;
+ __shared__ Hist hist;
+ __shared__ typename Hist::Counter hws[32];
+ for (auto j = threadIdx.x; j < Hist::totbins(); j += blockDim.x) {
+ hist.off[j] = 0;
+ }
+ __syncthreads();
+
+ if (verbose && 0 == threadIdx.x)
+ printf("booked hist with %d bins, size %d for %d tracks\n", hist.nbins(), hist.capacity(), nt);
+
+ assert(nt <= hist.capacity());
+
+ // fill hist (bin shall be wider than "eps")
+ for (auto i = threadIdx.x; i < nt; i += blockDim.x) {
+ assert(i < ZVertices::MAXTRACKS);
+ int iz = int(zt[i] * 10.); // valid if eps<=0.1
+ iz = std::clamp(iz, INT8_MIN, INT8_MAX);
+ izt[i] = iz - INT8_MIN;
+ assert(iz - INT8_MIN >= 0);
+ assert(iz - INT8_MIN < 256);
+ hist.count(izt[i]);
+ iv[i] = i;
+ nn[i] = 0;
+ }
+ __syncthreads();
+ if (threadIdx.x < 32)
+ hws[threadIdx.x] = 0; // used by prefix scan...
+ __syncthreads();
+ hist.finalize(hws);
+ __syncthreads();
+ assert(hist.size() == nt);
+ for (auto i = threadIdx.x; i < nt; i += blockDim.x) {
+ hist.fill(izt[i], uint16_t(i));
+ }
+ __syncthreads();
+
+ // count neighbours
+ for (auto i = threadIdx.x; i < nt; i += blockDim.x) {
+ if (ezt2[i] > er2mx)
+ continue;
+ auto loop = [&](uint32_t j) {
+ if (i == j)
+ return;
+ auto dist = std::abs(zt[i] - zt[j]);
+ if (dist > eps)
+ return;
+ if (dist * dist > chi2max * (ezt2[i] + ezt2[j]))
+ return;
+ nn[i]++;
+ };
+
+ cms::cuda::forEachInBins(hist, izt[i], 1, loop);
+ }
+
+ __shared__ int nloops;
+ nloops = 0;
+
+ __syncthreads();
+
+ // cluster seeds only
+ bool more = true;
+ while (__syncthreads_or(more)) {
+ if (1 == nloops % 2) {
+ for (auto i = threadIdx.x; i < nt; i += blockDim.x) {
+ auto m = iv[i];
+ while (m != iv[m])
+ m = iv[m];
+ iv[i] = m;
+ }
+ } else {
+ more = false;
+ for (auto k = threadIdx.x; k < hist.size(); k += blockDim.x) {
+ auto p = hist.begin() + k;
+ auto i = (*p);
+ auto be = std::min(Hist::bin(izt[i]) + 1, int(hist.nbins() - 1));
+ if (nn[i] < minT)
+ continue; // DBSCAN core rule
+ auto loop = [&](uint32_t j) {
+ assert(i != j);
+ if (nn[j] < minT)
+ return; // DBSCAN core rule
+ auto dist = std::abs(zt[i] - zt[j]);
+ if (dist > eps)
+ return;
+ if (dist * dist > chi2max * (ezt2[i] + ezt2[j]))
+ return;
+ auto old = atomicMin(&iv[j], iv[i]);
+ if (old != iv[i]) {
+ // end the loop only if no changes were applied
+ more = true;
+ }
+ atomicMin(&iv[i], old);
+ };
+ ++p;
+ for (; p < hist.end(be); ++p)
+ loop(*p);
+ } // for i
+ }
+ if (threadIdx.x == 0)
+ ++nloops;
+ } // while
+
+ // collect edges (assign to closest cluster of closest point??? here to closest point)
+ for (auto i = threadIdx.x; i < nt; i += blockDim.x) {
+ // if (nn[i]==0 || nn[i]>=minT) continue; // DBSCAN edge rule
+ if (nn[i] >= minT)
+ continue; // DBSCAN edge rule
+ float mdist = eps;
+ auto loop = [&](int j) {
+ if (nn[j] < minT)
+ return; // DBSCAN core rule
+ auto dist = std::abs(zt[i] - zt[j]);
+ if (dist > mdist)
+ return;
+ if (dist * dist > chi2max * (ezt2[i] + ezt2[j]))
+ return; // needed?
+ mdist = dist;
+ iv[i] = iv[j]; // assign to cluster (better be unique??)
+ };
+ cms::cuda::forEachInBins(hist, izt[i], 1, loop);
+ }
+
+ __shared__ unsigned int foundClusters;
+ foundClusters = 0;
+ __syncthreads();
+
+ // find the number of different clusters, identified by tracks with clus[i] == i;
+ // mark these tracks with a negative id.
+ for (auto i = threadIdx.x; i < nt; i += blockDim.x) {
+ if (iv[i] == int(i)) {
+ if (nn[i] >= minT) {
+ auto old = atomicInc(&foundClusters, 0xffffffff);
+ iv[i] = -(old + 1);
+ } else { // noise
+ iv[i] = -9998;
+ }
+ }
+ }
+ __syncthreads();
+
+ assert(foundClusters < ZVertices::MAXVTX);
+
+ // propagate the negative id to all the tracks in the cluster.
+ for (auto i = threadIdx.x; i < nt; i += blockDim.x) {
+ if (iv[i] >= 0) {
+ // mark each track in a cluster with the same id as the first one
+ iv[i] = iv[iv[i]];
+ }
+ }
+ __syncthreads();
+
+ // adjust the cluster id to be a positive value starting from 0
+ for (auto i = threadIdx.x; i < nt; i += blockDim.x) {
+ iv[i] = -iv[i] - 1;
+ }
+
+ nvIntermediate = nvFinal = foundClusters;
+
+ if (verbose && 0 == threadIdx.x)
+ printf("found %d proto vertices\n", foundClusters);
+ }
+
+} // namespace gpuVertexFinder
+
+#endif // RecoPixelVertexing_PixelVertexFinding_plugins_gpuClusterTracksIterative_h
diff --git a/RecoPixelVertexing/PixelVertexFinding/plugins/gpuFitVertices.h b/RecoPixelVertexing/PixelVertexFinding/plugins/gpuFitVertices.h
new file mode 100644
index 0000000000000..b8bbd0f601cb6
--- /dev/null
+++ b/RecoPixelVertexing/PixelVertexFinding/plugins/gpuFitVertices.h
@@ -0,0 +1,113 @@
+#ifndef RecoPixelVertexing_PixelVertexFinding_plugins_gpuFitVertices_h
+#define RecoPixelVertexing_PixelVertexFinding_plugins_gpuFitVertices_h
+
+#include <algorithm>
+#include <cmath>
+#include <cstdint>
+
+#include "HeterogeneousCore/CUDAUtilities/interface/HistoContainer.h"
+#include "HeterogeneousCore/CUDAUtilities/interface/cuda_assert.h"
+
+#include "gpuVertexFinder.h"
+
+namespace gpuVertexFinder {
+
+ __device__ __forceinline__ void fitVertices(ZVertices* pdata,
+ WorkSpace* pws,
+ float chi2Max // for outlier rejection
+ ) {
+ constexpr bool verbose = false; // in principle the compiler should optimize it out if false
+
+ auto& __restrict__ data = *pdata;
+ auto& __restrict__ ws = *pws;
+ auto nt = ws.ntrks;
+ float const* __restrict__ zt = ws.zt;
+ float const* __restrict__ ezt2 = ws.ezt2;
+ float* __restrict__ zv = data.zv;
+ float* __restrict__ wv = data.wv;
+ float* __restrict__ chi2 = data.chi2;
+ uint32_t& nvFinal = data.nvFinal;
+ uint32_t& nvIntermediate = ws.nvIntermediate;
+
+ int32_t* __restrict__ nn = data.ndof;
+ int32_t* __restrict__ iv = ws.iv;
+
+ assert(pdata);
+ assert(zt);
+
+ assert(nvFinal <= nvIntermediate);
+ nvFinal = nvIntermediate;
+ auto foundClusters = nvFinal;
+
+ // zero
+ for (auto i = threadIdx.x; i < foundClusters; i += blockDim.x) {
+ zv[i] = 0;
+ wv[i] = 0;
+ chi2[i] = 0;
+ }
+
+ // only for test
+ __shared__ int noise;
+ if (verbose && 0 == threadIdx.x)
+ noise = 0;
+
+ __syncthreads();
+
+ // compute cluster location
+ for (auto i = threadIdx.x; i < nt; i += blockDim.x) {
+ if (iv[i] > 9990) {
+ if (verbose)
+ atomicAdd(&noise, 1);
+ continue;
+ }
+ assert(iv[i] >= 0);
+ assert(iv[i] < int(foundClusters));
+ auto w = 1.f / ezt2[i];
+ atomicAdd(&zv[iv[i]], zt[i] * w);
+ atomicAdd(&wv[iv[i]], w);
+ }
+
+ __syncthreads();
+ // reuse nn
+ for (auto i = threadIdx.x; i < foundClusters; i += blockDim.x) {
+ assert(wv[i] > 0.f);
+ zv[i] /= wv[i];
+ nn[i] = -1; // ndof
+ }
+ __syncthreads();
+
+ // compute chi2
+ for (auto i = threadIdx.x; i < nt; i += blockDim.x) {
+ if (iv[i] > 9990)
+ continue;
+
+ auto c2 = zv[iv[i]] - zt[i];
+ c2 *= c2 / ezt2[i];
+ if (c2 > chi2Max) {
+ iv[i] = 9999;
+ continue;
+ }
+ atomicAdd(&chi2[iv[i]], c2);
+ atomicAdd(&nn[iv[i]], 1);
+ }
+ __syncthreads();
+ for (auto i = threadIdx.x; i < foundClusters; i += blockDim.x)
+ if (nn[i] > 0)
+ wv[i] *= float(nn[i]) / chi2[i];
+
+ if (verbose && 0 == threadIdx.x)
+ printf("found %d proto clusters ", foundClusters);
+ if (verbose && 0 == threadIdx.x)
+ printf("and %d noise\n", noise);
+ }
+
+ __global__ void fitVerticesKernel(ZVertices* pdata,
+ WorkSpace* pws,
+ float chi2Max // for outlier rejection
+ ) {
+ fitVertices(pdata, pws, chi2Max);
+ }
+
+} // namespace gpuVertexFinder
+
+#endif // RecoPixelVertexing_PixelVertexFinding_plugins_gpuFitVertices_h
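The fit is a plain 1/sigma^2-weighted mean in z, followed by an outlier-aware chi2: tracks contributing more than chi2Max are excluded from chi2 and ndof (the kernel flags them with iv = 9999), and the vertex weight is finally rescaled by ndof/chi2 so that 1/wv tracks the observed spread. A hypothetical single-vertex version:

#include <vector>

struct FittedVertex {
  float z, w, chi2;
  int ndof;
};

FittedVertex fitOne(std::vector<float> const& z, std::vector<float> const& ezt2, float chi2Max) {
  FittedVertex v{0.f, 0.f, 0.f, -1};  // ndof starts at -1, as in fitVertices()
  for (size_t i = 0; i < z.size(); ++i) {
    float w = 1.f / ezt2[i];
    v.z += z[i] * w;
    v.w += w;
  }
  v.z /= v.w;
  for (size_t i = 0; i < z.size(); ++i) {
    float c2 = (v.z - z[i]) * (v.z - z[i]) / ezt2[i];
    if (c2 > chi2Max)
      continue;  // outlier: excluded from chi2 and ndof
    v.chi2 += c2;
    ++v.ndof;
  }
  if (v.ndof > 0)
    v.w *= float(v.ndof) / v.chi2;  // inflate the error to the observed scatter
  return v;
}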
diff --git a/RecoPixelVertexing/PixelVertexFinding/plugins/gpuSortByPt2.h b/RecoPixelVertexing/PixelVertexFinding/plugins/gpuSortByPt2.h
new file mode 100644
index 0000000000000..841eab3901965
--- /dev/null
+++ b/RecoPixelVertexing/PixelVertexFinding/plugins/gpuSortByPt2.h
@@ -0,0 +1,73 @@
+#ifndef RecoPixelVertexing_PixelVertexFinding_plugins_gpuSortByPt2_h
+#define RecoPixelVertexing_PixelVertexFinding_plugins_gpuSortByPt2_h
+
+#include <algorithm>
+#include <cmath>
+#include <cstdint>
+
+#include "HeterogeneousCore/CUDAUtilities/interface/HistoContainer.h"
+#include "HeterogeneousCore/CUDAUtilities/interface/cuda_assert.h"
+#ifdef __CUDA_ARCH__
+#include "HeterogeneousCore/CUDAUtilities/interface/radixSort.h"
+#endif
+
+#include "gpuVertexFinder.h"
+
+namespace gpuVertexFinder {
+
+ __device__ __forceinline__ void sortByPt2(ZVertices* pdata, WorkSpace* pws) {
+ auto& __restrict__ data = *pdata;
+ auto& __restrict__ ws = *pws;
+ auto nt = ws.ntrks;
+ float const* __restrict__ ptt2 = ws.ptt2;
+ uint32_t const& nvFinal = data.nvFinal;
+
+ int32_t const* __restrict__ iv = ws.iv;
+ float* __restrict__ ptv2 = data.ptv2;
+ uint16_t* __restrict__ sortInd = data.sortInd;
+
+ // if (threadIdx.x == 0)
+ // printf("sorting %d vertices\n",nvFinal);
+
+ if (nvFinal < 1)
+ return;
+
+ // fill indexing
+ for (auto i = threadIdx.x; i < nt; i += blockDim.x) {
+ data.idv[ws.itrk[i]] = iv[i];
+ }
+
+ // can be done asynchronously at the end of the previous event
+ for (auto i = threadIdx.x; i < nvFinal; i += blockDim.x) {
+ ptv2[i] = 0;
+ }
+ __syncthreads();
+
+ for (auto i = threadIdx.x; i < nt; i += blockDim.x) {
+ if (iv[i] > 9990)
+ continue;
+ atomicAdd(&ptv2[iv[i]], ptt2[i]);
+ }
+ __syncthreads();
+
+ if (1 == nvFinal) {
+ if (threadIdx.x == 0)
+ sortInd[0] = 0;
+ return;
+ }
+#ifdef __CUDA_ARCH__
+ __shared__ uint16_t sws[1024];
+ // sort using only 16 bits
+ radixSort(ptv2, sortInd, sws, nvFinal);
+#else
+ for (uint16_t i = 0; i < nvFinal; ++i)
+ sortInd[i] = i;
+ std::sort(sortInd, sortInd + nvFinal, [&](auto i, auto j) { return ptv2[i] < ptv2[j]; });
+#endif
+ }
+
+ __global__ void sortByPt2Kernel(ZVertices* pdata, WorkSpace* pws) { sortByPt2(pdata, pws); }
+
+} // namespace gpuVertexFinder
+
+#endif // RecoPixelVertexing_PixelVertexFinding_plugins_gpuSortByPt2_h
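Since sortInd is filled in ascending sum-pt^2 order, a consumer looking for the most likely hard-scatter vertex reads the array from the back; a hypothetical one-liner:

#include <cstdint>
#include "CUDADataFormats/Vertex/interface/ZVertexSoA.h"

// Index (into zv, wv, chi2, ...) of the vertex with the largest sum pt^2.
// The caller must check that nvFinal > 0 first.
uint16_t primaryVertexIndex(ZVertexSoA const& soa) { return soa.sortInd[soa.nvFinal - 1]; }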
diff --git a/RecoPixelVertexing/PixelVertexFinding/plugins/gpuSplitVertices.h b/RecoPixelVertexing/PixelVertexFinding/plugins/gpuSplitVertices.h
new file mode 100644
index 0000000000000..0fe8bd882dcc5
--- /dev/null
+++ b/RecoPixelVertexing/PixelVertexFinding/plugins/gpuSplitVertices.h
@@ -0,0 +1,139 @@
+#ifndef RecoPixelVertexing_PixelVertexFinding_plugins_gpuSplitVertices_h
+#define RecoPixelVertexing_PixelVertexFinding_plugins_gpuSplitVertices_h
+
+#include <algorithm>
+#include <cmath>
+#include <cstdint>
+
+#include "HeterogeneousCore/CUDAUtilities/interface/HistoContainer.h"
+#include "HeterogeneousCore/CUDAUtilities/interface/cuda_assert.h"
+
+#include "gpuVertexFinder.h"
+
+namespace gpuVertexFinder {
+
+ __device__ __forceinline__ void splitVertices(ZVertices* pdata, WorkSpace* pws, float maxChi2) {
+ constexpr bool verbose = false; // in principle the compiler should optimize it out if false
+
+ auto& __restrict__ data = *pdata;
+ auto& __restrict__ ws = *pws;
+ auto nt = ws.ntrks;
+ float const* __restrict__ zt = ws.zt;
+ float const* __restrict__ ezt2 = ws.ezt2;
+ float* __restrict__ zv = data.zv;
+ float* __restrict__ wv = data.wv;
+ float const* __restrict__ chi2 = data.chi2;
+ uint32_t& nvFinal = data.nvFinal;
+
+ int32_t const* __restrict__ nn = data.ndof;
+ int32_t* __restrict__ iv = ws.iv;
+
+ assert(pdata);
+ assert(zt);
+
+ // one vertex per block
+ for (auto kv = blockIdx.x; kv < nvFinal; kv += gridDim.x) {
+ if (nn[kv] < 4)
+ continue;
+ if (chi2[kv] < maxChi2 * float(nn[kv]))
+ continue;
+
+ constexpr int MAXTK = 512;
+ assert(nn[kv] < MAXTK);
+ if (nn[kv] >= MAXTK)
+ continue; // too bad FIXME
+ __shared__ uint32_t it[MAXTK]; // track index
+ __shared__ float zz[MAXTK]; // z pos
+ __shared__ uint8_t newV[MAXTK]; // 0 or 1
+ __shared__ float ww[MAXTK]; // z weight
+
+ __shared__ uint32_t nq; // number of tracks for this vertex
+ nq = 0;
+ __syncthreads();
+
+ // copy to local
+ for (auto k = threadIdx.x; k < nt; k += blockDim.x) {
+ if (iv[k] == int(kv)) {
+ auto old = atomicInc(&nq, MAXTK);
+ zz[old] = zt[k] - zv[kv];
+ newV[old] = zz[old] < 0 ? 0 : 1;
+ ww[old] = 1.f / ezt2[k];
+ it[old] = k;
+ }
+ }
+
+ __shared__ float znew[2], wnew[2]; // the new vertices
+
+ __syncthreads();
+ assert(int(nq) == nn[kv] + 1);
+
+ int maxiter = 20;
+ // kt-min....
+ bool more = true;
+ while (__syncthreads_or(more)) {
+ more = false;
+ if (0 == threadIdx.x) {
+ znew[0] = 0;
+ znew[1] = 0;
+ wnew[0] = 0;
+ wnew[1] = 0;
+ }
+ __syncthreads();
+ for (auto k = threadIdx.x; k < nq; k += blockDim.x) {
+ auto i = newV[k];
+ atomicAdd(&znew[i], zz[k] * ww[k]);
+ atomicAdd(&wnew[i], ww[k]);
+ }
+ __syncthreads();
+ if (0 == threadIdx.x) {
+ znew[0] /= wnew[0];
+ znew[1] /= wnew[1];
+ }
+ __syncthreads();
+ for (auto k = threadIdx.x; k < nq; k += blockDim.x) {
+ auto d0 = fabs(zz[k] - znew[0]);
+ auto d1 = fabs(zz[k] - znew[1]);
+ auto newer = d0 < d1 ? 0 : 1;
+ more |= newer != newV[k];
+ newV[k] = newer;
+ }
+ --maxiter;
+ if (maxiter <= 0)
+ more = false;
+ }
+
+ // avoid empty vertices
+ if (0 == wnew[0] || 0 == wnew[1])
+ continue;
+
+ // quality cut
+ auto dist2 = (znew[0] - znew[1]) * (znew[0] - znew[1]);
+
+ auto chi2Dist = dist2 / (1.f / wnew[0] + 1.f / wnew[1]);
+
+ if (verbose && 0 == threadIdx.x)
+ printf("inter %d %f %f\n", 20 - maxiter, chi2Dist, dist2 * wv[kv]);
+
+ if (chi2Dist < 4)
+ continue;
+
+ // get a new global vertex
+ __shared__ uint32_t igv;
+ if (0 == threadIdx.x)
+ igv = atomicAdd(&ws.nvIntermediate, 1);
+ __syncthreads();
+ for (auto k = threadIdx.x; k < nq; k += blockDim.x) {
+ if (1 == newV[k])
+ iv[it[k]] = igv;
+ }
+
+ } // loop on vertices
+ }
+
+ __global__ void splitVerticesKernel(ZVertices* pdata, WorkSpace* pws, float maxChi2) {
+ splitVertices(pdata, pws, maxChi2);
+ }
+
+} // namespace gpuVertexFinder
+
+#endif // RecoPixelVertexing_PixelVertexFinding_plugins_gpuSplitVertices_h
diff --git a/RecoPixelVertexing/PixelVertexFinding/plugins/gpuVertexFinder.cc b/RecoPixelVertexing/PixelVertexFinding/plugins/gpuVertexFinder.cc
new file mode 100644
index 0000000000000..084763385bdb4
--- /dev/null
+++ b/RecoPixelVertexing/PixelVertexFinding/plugins/gpuVertexFinder.cc
@@ -0,0 +1 @@
+#include "gpuVertexFinderImpl.h"
diff --git a/RecoPixelVertexing/PixelVertexFinding/plugins/gpuVertexFinder.cu b/RecoPixelVertexing/PixelVertexFinding/plugins/gpuVertexFinder.cu
new file mode 100644
index 0000000000000..084763385bdb4
--- /dev/null
+++ b/RecoPixelVertexing/PixelVertexFinding/plugins/gpuVertexFinder.cu
@@ -0,0 +1 @@
+#include "gpuVertexFinderImpl.h"
diff --git a/RecoPixelVertexing/PixelVertexFinding/plugins/gpuVertexFinder.h b/RecoPixelVertexing/PixelVertexFinding/plugins/gpuVertexFinder.h
new file mode 100644
index 0000000000000..5f8238c3ea8c8
--- /dev/null
+++ b/RecoPixelVertexing/PixelVertexFinding/plugins/gpuVertexFinder.h
@@ -0,0 +1,83 @@
+#ifndef RecoPixelVertexing_PixelVertexFinding_plugins_gpuVertexFinder_h
+#define RecoPixelVertexing_PixelVertexFinding_plugins_gpuVertexFinder_h
+
+#include <cstddef>
+#include <cstdint>
+
+#include "CUDADataFormats/Vertex/interface/ZVertexHeterogeneous.h"
+
+namespace gpuVertexFinder {
+
+ using ZVertices = ZVertexSoA;
+ using TkSoA = pixelTrack::TrackSoA;
+
+ // workspace used in the vertex reco algos
+ struct WorkSpace {
+ static constexpr uint32_t MAXTRACKS = ZVertexSoA::MAXTRACKS;
+ static constexpr uint32_t MAXVTX = ZVertexSoA::MAXVTX;
+
+ uint32_t ntrks; // number of "selected tracks"
+ uint16_t itrk[MAXTRACKS]; // index of original track
+ float zt[MAXTRACKS]; // input track z at bs
+ float ezt2[MAXTRACKS]; // input error^2 on the above
+ float ptt2[MAXTRACKS]; // input pt^2 on the above
+    uint8_t izt[MAXTRACKS];   // integerized z-position of input tracks
+ int32_t iv[MAXTRACKS]; // vertex index for each associated track
+
+    uint32_t nvIntermediate;  // the number of vertices after splitting, pruning, etc.
+
+ __host__ __device__ void init() {
+ ntrks = 0;
+ nvIntermediate = 0;
+ }
+ };
+
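+  // initialize the vertex SoA and the workspace (run as a kernel on the GPU, called directly in the CPU build)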
+ __global__ void init(ZVertexSoA* pdata, WorkSpace* pws) {
+ pdata->init();
+ pws->init();
+ }
+
+ class Producer {
+ public:
+ using ZVertices = ZVertexSoA;
+ using WorkSpace = gpuVertexFinder::WorkSpace;
+ using TkSoA = pixelTrack::TrackSoA;
+
+ Producer(bool oneKernel,
+ bool useDensity,
+ bool useDBSCAN,
+ bool useIterative,
+ int iminT, // min number of neighbours to be "core"
+ float ieps, // max absolute distance to cluster
+ float ierrmax, // max error to be "seed"
+ float ichi2max // max normalized distance to cluster
+ )
+ : oneKernel_(oneKernel && !(useDBSCAN || useIterative)),
+ useDensity_(useDensity),
+ useDBSCAN_(useDBSCAN),
+ useIterative_(useIterative),
+ minT(iminT),
+ eps(ieps),
+ errmax(ierrmax),
+ chi2max(ichi2max) {}
+
+ ~Producer() = default;
+
+ ZVertexHeterogeneous makeAsync(cudaStream_t stream, TkSoA const* tksoa, float ptMin) const;
+ ZVertexHeterogeneous make(TkSoA const* tksoa, float ptMin) const;
+
+ private:
+ const bool oneKernel_;
+ const bool useDensity_;
+ const bool useDBSCAN_;
+ const bool useIterative_;
+
+ int minT; // min number of neighbours to be "core"
+ float eps; // max absolute distance to cluster
+ float errmax; // max error to be "seed"
+ float chi2max; // max normalized distance to cluster
+ };
+
+} // namespace gpuVertexFinder
+
+#endif // RecoPixelVertexing_PixelVertexFinding_plugins_gpuVertexFinder_h
diff --git a/RecoPixelVertexing/PixelVertexFinding/plugins/gpuVertexFinderImpl.h b/RecoPixelVertexing/PixelVertexFinding/plugins/gpuVertexFinderImpl.h
new file mode 100644
index 0000000000000..d685ced488233
--- /dev/null
+++ b/RecoPixelVertexing/PixelVertexFinding/plugins/gpuVertexFinderImpl.h
@@ -0,0 +1,192 @@
+#ifndef RecoPixelVertexing_PixelVertexFinding_plugins_gpuVertexFinderImpl_h
+#define RecoPixelVertexing_PixelVertexFinding_plugins_gpuVertexFinderImpl_h
+
+#include "HeterogeneousCore/CUDAUtilities/interface/cudaCheck.h"
+
+#include "gpuClusterTracksByDensity.h"
+#include "gpuClusterTracksDBSCAN.h"
+#include "gpuClusterTracksIterative.h"
+#include "gpuFitVertices.h"
+#include "gpuSortByPt2.h"
+#include "gpuSplitVertices.h"
+
+#undef PIXVERTEX_DEBUG_PRODUCE
+
+namespace gpuVertexFinder {
+
+ // reject outlier tracks that contribute more than this to the chi2 of the vertex fit
+ constexpr float maxChi2ForFirstFit = 50.f;
+ constexpr float maxChi2ForFinalFit = 5000.f;
+
+ // split vertices with a chi2/NDoF greater than this
+ constexpr float maxChi2ForSplit = 9.f;
+
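+  // Fill the vertexing workspace from the track SoA: reset the vertex index of
+  // every track, then keep only tracks with at least 4 hits, "loose" quality
+  // and pt above ptMin, storing their z at the beam spot, its squared error
+  // and their squared pt.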
+ __global__ void loadTracks(TkSoA const* ptracks, ZVertexSoA* soa, WorkSpace* pws, float ptMin) {
+ assert(ptracks);
+ assert(soa);
+ auto const& tracks = *ptracks;
+ auto const& fit = tracks.stateAtBS;
+ auto const* quality = tracks.qualityData();
+
+ auto first = blockIdx.x * blockDim.x + threadIdx.x;
+ for (int idx = first, nt = TkSoA::stride(); idx < nt; idx += gridDim.x * blockDim.x) {
+ auto nHits = tracks.nHits(idx);
+ if (nHits == 0)
+ break; // this is a guard: maybe we need to move to nTracks...
+
+ // initialize soa...
+ soa->idv[idx] = -1;
+
+ if (nHits < 4)
+ continue; // no triplets
+ if (quality[idx] != pixelTrack::Quality::loose)
+ continue;
+
+ auto pt = tracks.pt(idx);
+
+ if (pt < ptMin)
+ continue;
+
+ auto& data = *pws;
+ auto it = atomicAdd(&data.ntrks, 1);
+ data.itrk[it] = idx;
+ data.zt[it] = tracks.zip(idx);
+ data.ezt2[it] = fit.covariance(idx)(14);
+ data.ptt2[it] = pt * pt;
+ }
+ }
+
+// #define THREE_KERNELS
+#ifndef THREE_KERNELS
+ __global__ void vertexFinderOneKernel(gpuVertexFinder::ZVertices* pdata,
+ gpuVertexFinder::WorkSpace* pws,
+ int minT, // min number of neighbours to be "seed"
+ float eps, // max absolute distance to cluster
+ float errmax, // max error to be "seed"
+ float chi2max // max normalized distance to cluster,
+ ) {
+ clusterTracksByDensity(pdata, pws, minT, eps, errmax, chi2max);
+ __syncthreads();
+ fitVertices(pdata, pws, maxChi2ForFirstFit);
+ __syncthreads();
+ splitVertices(pdata, pws, maxChi2ForSplit);
+ __syncthreads();
+ fitVertices(pdata, pws, maxChi2ForFinalFit);
+ __syncthreads();
+ sortByPt2(pdata, pws);
+ }
+#else
+ __global__ void vertexFinderKernel1(gpuVertexFinder::ZVertices* pdata,
+ gpuVertexFinder::WorkSpace* pws,
+ int minT, // min number of neighbours to be "seed"
+ float eps, // max absolute distance to cluster
+ float errmax, // max error to be "seed"
+ float chi2max // max normalized distance to cluster,
+ ) {
+ clusterTracksByDensity(pdata, pws, minT, eps, errmax, chi2max);
+ __syncthreads();
+ fitVertices(pdata, pws, maxChi2ForFirstFit);
+ }
+
+ __global__ void vertexFinderKernel2(gpuVertexFinder::ZVertices* pdata, gpuVertexFinder::WorkSpace* pws) {
+ fitVertices(pdata, pws, maxChi2ForFinalFit);
+ __syncthreads();
+ sortByPt2(pdata, pws);
+ }
+#endif
+
+#ifdef __CUDACC__
+ ZVertexHeterogeneous Producer::makeAsync(cudaStream_t stream, TkSoA const* tksoa, float ptMin) const {
+#ifdef PIXVERTEX_DEBUG_PRODUCE
+ std::cout << "producing Vertices on GPU" << std::endl;
+#endif // PIXVERTEX_DEBUG_PRODUCE
+    ZVertexHeterogeneous vertices(cms::cuda::make_device_unique<ZVertexSoA>(stream));
+#else
+ ZVertexHeterogeneous Producer::make(TkSoA const* tksoa, float ptMin) const {
+#ifdef PIXVERTEX_DEBUG_PRODUCE
+ std::cout << "producing Vertices on CPU" << std::endl;
+#endif // PIXVERTEX_DEBUG_PRODUCE
+    ZVertexHeterogeneous vertices(std::make_unique<ZVertexSoA>());
+#endif
+ assert(tksoa);
+ auto* soa = vertices.get();
+ assert(soa);
+
+#ifdef __CUDACC__
+    auto ws_d = cms::cuda::make_device_unique<WorkSpace>(stream);
+#else
+    auto ws_d = std::make_unique<WorkSpace>();
+#endif
+
+#ifdef __CUDACC__
+ init<<<1, 1, 0, stream>>>(soa, ws_d.get());
+ auto blockSize = 128;
+ auto numberOfBlocks = (TkSoA::stride() + blockSize - 1) / blockSize;
+    loadTracks<<<numberOfBlocks, blockSize, 0, stream>>>(tksoa, soa, ws_d.get(), ptMin);
+ cudaCheck(cudaGetLastError());
+#else
+ init(soa, ws_d.get());
+ loadTracks(tksoa, soa, ws_d.get(), ptMin);
+#endif
+
+#ifdef __CUDACC__
+ // Running too many thread lead to problems when printf is enabled.
+ constexpr int maxThreadsForPrint = 1024 - 256;
+ constexpr int numBlocks = 1024;
+ constexpr int threadsPerBlock = 128;
+
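+    // With the fused kernel the whole chain (clusterize, fit, split, final fit,
+    // sort) runs in a single block; in the three- and five-kernel paths each
+    // step is a separate launch, and splitVerticesKernel gets many blocks since
+    // it processes one candidate vertex per block.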
+ if (oneKernel_) {
+      // implemented only for the density clusterizer
+#ifndef THREE_KERNELS
+ vertexFinderOneKernel<<<1, maxThreadsForPrint, 0, stream>>>(soa, ws_d.get(), minT, eps, errmax, chi2max);
+#else
+ vertexFinderKernel1<<<1, maxThreadsForPrint, 0, stream>>>(soa, ws_d.get(), minT, eps, errmax, chi2max);
+ cudaCheck(cudaGetLastError());
+ // one block per vertex...
+      splitVerticesKernel<<<numBlocks, threadsPerBlock, 0, stream>>>(soa, ws_d.get(), maxChi2ForSplit);
+ cudaCheck(cudaGetLastError());
+ vertexFinderKernel2<<<1, maxThreadsForPrint, 0, stream>>>(soa, ws_d.get());
+#endif
+ } else { // five kernels
+ if (useDensity_) {
+ clusterTracksByDensityKernel<<<1, maxThreadsForPrint, 0, stream>>>(soa, ws_d.get(), minT, eps, errmax, chi2max);
+ } else if (useDBSCAN_) {
+ clusterTracksDBSCAN<<<1, maxThreadsForPrint, 0, stream>>>(soa, ws_d.get(), minT, eps, errmax, chi2max);
+ } else if (useIterative_) {
+ clusterTracksIterative<<<1, maxThreadsForPrint, 0, stream>>>(soa, ws_d.get(), minT, eps, errmax, chi2max);
+ }
+ cudaCheck(cudaGetLastError());
+ fitVerticesKernel<<<1, maxThreadsForPrint, 0, stream>>>(soa, ws_d.get(), maxChi2ForFirstFit);
+ cudaCheck(cudaGetLastError());
+ // one block per vertex...
+      splitVerticesKernel<<<numBlocks, threadsPerBlock, 0, stream>>>(soa, ws_d.get(), maxChi2ForSplit);
+ cudaCheck(cudaGetLastError());
+ fitVerticesKernel<<<1, maxThreadsForPrint, 0, stream>>>(soa, ws_d.get(), maxChi2ForFinalFit);
+ cudaCheck(cudaGetLastError());
+ sortByPt2Kernel<<<1, maxThreadsForPrint, 0, stream>>>(soa, ws_d.get());
+ }
+ cudaCheck(cudaGetLastError());
+#else // __CUDACC__
+ if (useDensity_) {
+ clusterTracksByDensity(soa, ws_d.get(), minT, eps, errmax, chi2max);
+ } else if (useDBSCAN_) {
+ clusterTracksDBSCAN(soa, ws_d.get(), minT, eps, errmax, chi2max);
+ } else if (useIterative_) {
+ clusterTracksIterative(soa, ws_d.get(), minT, eps, errmax, chi2max);
+ }
+#ifdef PIXVERTEX_DEBUG_PRODUCE
+ std::cout << "found " << (*ws_d).nvIntermediate << " vertices " << std::endl;
+#endif // PIXVERTEX_DEBUG_PRODUCE
+ fitVertices(soa, ws_d.get(), maxChi2ForFirstFit);
+ // one block per vertex!
+ splitVertices(soa, ws_d.get(), maxChi2ForSplit);
+ fitVertices(soa, ws_d.get(), maxChi2ForFinalFit);
+ sortByPt2(soa, ws_d.get());
+#endif
+
+ return vertices;
+ }
+
+} // namespace gpuVertexFinder
+
+#endif // RecoPixelVertexing_PixelVertexFinding_plugins_gpuVertexFinderImpl_h
diff --git a/RecoPixelVertexing/PixelVertexFinding/python/PixelVertexes_cfi.py b/RecoPixelVertexing/PixelVertexFinding/python/PixelVertexes_cfi.py
index 77a9f367b9d9b..903c2a894ff86 100644
--- a/RecoPixelVertexing/PixelVertexFinding/python/PixelVertexes_cfi.py
+++ b/RecoPixelVertexing/PixelVertexFinding/python/PixelVertexes_cfi.py
@@ -18,5 +18,3 @@
refToPSet_ = cms.string('pvClusterComparer')
)
)
-
-
diff --git a/RecoPixelVertexing/PixelVertexFinding/test/BuildFile.xml b/RecoPixelVertexing/PixelVertexFinding/test/BuildFile.xml
index 0f4f4dee63832..f5c154b298574 100644
--- a/RecoPixelVertexing/PixelVertexFinding/test/BuildFile.xml
+++ b/RecoPixelVertexing/PixelVertexFinding/test/BuildFile.xml
@@ -2,8 +2,41 @@
-
+
-
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
diff --git a/RecoPixelVertexing/PixelVertexFinding/test/VertexFinder_t.h b/RecoPixelVertexing/PixelVertexFinding/test/VertexFinder_t.h
new file mode 100644
index 0000000000000..52253a1e4bbfe
--- /dev/null
+++ b/RecoPixelVertexing/PixelVertexFinding/test/VertexFinder_t.h
@@ -0,0 +1,347 @@
+#include <algorithm>
+#include <array>
+#include <cmath>
+#include <cstdint>
+#include <cstring>
+#include <iostream>
+#include <random>
+#include <vector>
+
+#include "HeterogeneousCore/CUDAUtilities/interface/cudaCheck.h"
+#include "HeterogeneousCore/CUDAUtilities/interface/requireDevices.h"
+#include "HeterogeneousCore/CUDAUtilities/interface/launch.h"
+#ifdef USE_DBSCAN
+#include "RecoPixelVertexing/PixelVertexFinding/plugins/gpuClusterTracksDBSCAN.h"
+#define CLUSTERIZE gpuVertexFinder::clusterTracksDBSCAN
+#elif USE_ITERATIVE
+#include "RecoPixelVertexing/PixelVertexFinding/plugins/gpuClusterTracksIterative.h"
+#define CLUSTERIZE gpuVertexFinder::clusterTracksIterative
+#else
+#include "RecoPixelVertexing/PixelVertexFinding/plugins/gpuClusterTracksByDensity.h"
+#define CLUSTERIZE gpuVertexFinder::clusterTracksByDensityKernel
+#endif
+#include "RecoPixelVertexing/PixelVertexFinding/plugins/gpuFitVertices.h"
+#include "RecoPixelVertexing/PixelVertexFinding/plugins/gpuSortByPt2.h"
+#include "RecoPixelVertexing/PixelVertexFinding/plugins/gpuSplitVertices.h"
+
+#ifdef ONE_KERNEL
+#ifdef __CUDACC__
+__global__ void vertexFinderOneKernel(gpuVertexFinder::ZVertices* pdata,
+ gpuVertexFinder::WorkSpace* pws,
+ int minT, // min number of neighbours to be "seed"
+ float eps, // max absolute distance to cluster
+ float errmax, // max error to be "seed"
+ float chi2max // max normalized distance to cluster,
+) {
+ clusterTracksByDensity(pdata, pws, minT, eps, errmax, chi2max);
+ __syncthreads();
+ fitVertices(pdata, pws, 50.);
+ __syncthreads();
+ splitVertices(pdata, pws, 9.f);
+ __syncthreads();
+ fitVertices(pdata, pws, 5000.);
+ __syncthreads();
+ sortByPt2(pdata, pws);
+}
+#endif
+#endif
+
+struct Event {
+  std::vector<float> zvert;
+  std::vector<uint16_t> itrack;
+  std::vector<float> ztrack;
+  std::vector<float> eztrack;
+  std::vector<float> pttrack;
+  std::vector<uint16_t> ivert;
+};
+
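+// Toy event generator: a Poisson number of vertices (mean nvert) at Gaussian z
+// (sigma 3.5), each with a Poisson number of tracks (mean ntrack) smeared by a
+// flat per-track resolution, plus a layer of flat-in-z noise tracks.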
+struct ClusterGenerator {
+ explicit ClusterGenerator(float nvert, float ntrack)
+ : rgen(-13., 13), errgen(0.005, 0.025), clusGen(nvert), trackGen(ntrack), gauss(0., 1.), ptGen(1.) {}
+
+ void operator()(Event& ev) {
+ int nclus = clusGen(reng);
+ ev.zvert.resize(nclus);
+ ev.itrack.resize(nclus);
+ for (auto& z : ev.zvert) {
+ z = 3.5f * gauss(reng);
+ }
+
+ ev.ztrack.clear();
+ ev.eztrack.clear();
+ ev.ivert.clear();
+ for (int iv = 0; iv < nclus; ++iv) {
+ auto nt = trackGen(reng);
+      ev.itrack[iv] = nt;
+ for (int it = 0; it < nt; ++it) {
+ auto err = errgen(reng); // reality is not flat....
+ ev.ztrack.push_back(ev.zvert[iv] + err * gauss(reng));
+ ev.eztrack.push_back(err * err);
+ ev.ivert.push_back(iv);
+ ev.pttrack.push_back((iv == 5 ? 1.f : 0.5f) + ptGen(reng));
+ ev.pttrack.back() *= ev.pttrack.back();
+ }
+ }
+ // add noise
+ auto nt = 2 * trackGen(reng);
+ for (int it = 0; it < nt; ++it) {
+ auto err = 0.03f;
+ ev.ztrack.push_back(rgen(reng));
+ ev.eztrack.push_back(err * err);
+ ev.ivert.push_back(9999);
+ ev.pttrack.push_back(0.5f + ptGen(reng));
+ ev.pttrack.back() *= ev.pttrack.back();
+ }
+ }
+
+ std::mt19937 reng;
+  std::uniform_real_distribution<float> rgen;
+  std::uniform_real_distribution<float> errgen;
+  std::poisson_distribution<int> clusGen;
+  std::poisson_distribution<int> trackGen;
+  std::normal_distribution<float> gauss;
+  std::exponential_distribution<float> ptGen;
+};
+
+// a macro SORRY
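+// compute the raw address of an individual SoA data member on the device (or
+// on the host in the CPU build), so single fields can be copied with
+// cudaMemcpy / memcpy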
+#define LOC_ONGPU(M) ((char*)(onGPU_d.get()) + offsetof(gpuVertexFinder::ZVertices, M))
+#define LOC_WS(M) ((char*)(ws_d.get()) + offsetof(gpuVertexFinder::WorkSpace, M))
+
+__global__ void print(gpuVertexFinder::ZVertices const* pdata, gpuVertexFinder::WorkSpace const* pws) {
+ auto const& __restrict__ data = *pdata;
+ auto const& __restrict__ ws = *pws;
+ printf("nt,nv %d %d,%d\n", ws.ntrks, data.nvFinal, ws.nvIntermediate);
+}
+
+int main() {
+#ifdef __CUDACC__
+ cms::cudatest::requireDevices();
+
+  auto onGPU_d = cms::cuda::make_device_unique<gpuVertexFinder::ZVertices[]>(1, nullptr);
+  auto ws_d = cms::cuda::make_device_unique<gpuVertexFinder::WorkSpace[]>(1, nullptr);
+#else
+  auto onGPU_d = std::make_unique<gpuVertexFinder::ZVertices>();
+  auto ws_d = std::make_unique<gpuVertexFinder::WorkSpace>();
+#endif
+
+ Event ev;
+
+ float eps = 0.1f;
+  std::array<float, 3> par{{eps, 0.01f, 9.0f}};
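+  // par = {eps, errmax, chi2max}, forwarded to the clusterizer as par[0..2]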
+ for (int nav = 30; nav < 80; nav += 20) {
+ ClusterGenerator gen(nav, 10);
+
+ for (int i = 8; i < 20; ++i) {
+ auto kk = i / 4; // M param
+
+ gen(ev);
+
+#ifdef __CUDACC__
+ init<<<1, 1, 0, 0>>>(onGPU_d.get(), ws_d.get());
+#else
+ onGPU_d->init();
+ ws_d->init();
+#endif
+
+ std::cout << "v,t size " << ev.zvert.size() << ' ' << ev.ztrack.size() << std::endl;
+ auto nt = ev.ztrack.size();
+#ifdef __CUDACC__
+ cudaCheck(cudaMemcpy(LOC_WS(ntrks), &nt, sizeof(uint32_t), cudaMemcpyHostToDevice));
+ cudaCheck(cudaMemcpy(LOC_WS(zt), ev.ztrack.data(), sizeof(float) * ev.ztrack.size(), cudaMemcpyHostToDevice));
+ cudaCheck(cudaMemcpy(LOC_WS(ezt2), ev.eztrack.data(), sizeof(float) * ev.eztrack.size(), cudaMemcpyHostToDevice));
+ cudaCheck(cudaMemcpy(LOC_WS(ptt2), ev.pttrack.data(), sizeof(float) * ev.eztrack.size(), cudaMemcpyHostToDevice));
+#else
+ ::memcpy(LOC_WS(ntrks), &nt, sizeof(uint32_t));
+ ::memcpy(LOC_WS(zt), ev.ztrack.data(), sizeof(float) * ev.ztrack.size());
+ ::memcpy(LOC_WS(ezt2), ev.eztrack.data(), sizeof(float) * ev.eztrack.size());
+ ::memcpy(LOC_WS(ptt2), ev.pttrack.data(), sizeof(float) * ev.eztrack.size());
+#endif
+
+ std::cout << "M eps, pset " << kk << ' ' << eps << ' ' << (i % 4) << std::endl;
+
+ if ((i % 4) == 0)
+ par = {{eps, 0.02f, 12.0f}};
+ if ((i % 4) == 1)
+ par = {{eps, 0.02f, 9.0f}};
+ if ((i % 4) == 2)
+ par = {{eps, 0.01f, 9.0f}};
+ if ((i % 4) == 3)
+ par = {{0.7f * eps, 0.01f, 9.0f}};
+
+ uint32_t nv = 0;
+#ifdef __CUDACC__
+ print<<<1, 1, 0, 0>>>(onGPU_d.get(), ws_d.get());
+ cudaCheck(cudaGetLastError());
+ cudaDeviceSynchronize();
+
+#ifdef ONE_KERNEL
+ cms::cuda::launch(vertexFinderOneKernel, {1, 512 + 256}, onGPU_d.get(), ws_d.get(), kk, par[0], par[1], par[2]);
+#else
+ cms::cuda::launch(CLUSTERIZE, {1, 512 + 256}, onGPU_d.get(), ws_d.get(), kk, par[0], par[1], par[2]);
+#endif
+ print<<<1, 1, 0, 0>>>(onGPU_d.get(), ws_d.get());
+
+ cudaCheck(cudaGetLastError());
+ cudaDeviceSynchronize();
+
+ cms::cuda::launch(gpuVertexFinder::fitVerticesKernel, {1, 1024 - 256}, onGPU_d.get(), ws_d.get(), 50.f);
+ cudaCheck(cudaGetLastError());
+ cudaCheck(cudaMemcpy(&nv, LOC_ONGPU(nvFinal), sizeof(uint32_t), cudaMemcpyDeviceToHost));
+
+#else
+ print(onGPU_d.get(), ws_d.get());
+ CLUSTERIZE(onGPU_d.get(), ws_d.get(), kk, par[0], par[1], par[2]);
+ print(onGPU_d.get(), ws_d.get());
+ fitVertices(onGPU_d.get(), ws_d.get(), 50.f);
+ nv = onGPU_d->nvFinal;
+#endif
+
+ if (nv == 0) {
+ std::cout << "NO VERTICES???" << std::endl;
+ continue;
+ }
+
+ float* zv = nullptr;
+ float* wv = nullptr;
+ float* ptv2 = nullptr;
+ int32_t* nn = nullptr;
+ uint16_t* ind = nullptr;
+
+ // keep chi2 separated...
+ float chi2[2 * nv]; // make space for splitting...
+
+#ifdef __CUDACC__
+ float hzv[2 * nv];
+ float hwv[2 * nv];
+ float hptv2[2 * nv];
+ int32_t hnn[2 * nv];
+ uint16_t hind[2 * nv];
+
+ zv = hzv;
+ wv = hwv;
+ ptv2 = hptv2;
+ nn = hnn;
+ ind = hind;
+#else
+ zv = onGPU_d->zv;
+ wv = onGPU_d->wv;
+ ptv2 = onGPU_d->ptv2;
+ nn = onGPU_d->ndof;
+ ind = onGPU_d->sortInd;
+#endif
+
+#ifdef __CUDACC__
+ cudaCheck(cudaMemcpy(nn, LOC_ONGPU(ndof), nv * sizeof(int32_t), cudaMemcpyDeviceToHost));
+ cudaCheck(cudaMemcpy(chi2, LOC_ONGPU(chi2), nv * sizeof(float), cudaMemcpyDeviceToHost));
+#else
+ memcpy(chi2, LOC_ONGPU(chi2), nv * sizeof(float));
+#endif
+
+ for (auto j = 0U; j < nv; ++j)
+ if (nn[j] > 0)
+ chi2[j] /= float(nn[j]);
+ {
+ auto mx = std::minmax_element(chi2, chi2 + nv);
+ std::cout << "after fit nv, min max chi2 " << nv << " " << *mx.first << ' ' << *mx.second << std::endl;
+ }
+
+#ifdef __CUDACC__
+ cms::cuda::launch(gpuVertexFinder::fitVerticesKernel, {1, 1024 - 256}, onGPU_d.get(), ws_d.get(), 50.f);
+ cudaCheck(cudaMemcpy(&nv, LOC_ONGPU(nvFinal), sizeof(uint32_t), cudaMemcpyDeviceToHost));
+ cudaCheck(cudaMemcpy(nn, LOC_ONGPU(ndof), nv * sizeof(int32_t), cudaMemcpyDeviceToHost));
+ cudaCheck(cudaMemcpy(chi2, LOC_ONGPU(chi2), nv * sizeof(float), cudaMemcpyDeviceToHost));
+#else
+ fitVertices(onGPU_d.get(), ws_d.get(), 50.f);
+ nv = onGPU_d->nvFinal;
+ memcpy(chi2, LOC_ONGPU(chi2), nv * sizeof(float));
+#endif
+
+ for (auto j = 0U; j < nv; ++j)
+ if (nn[j] > 0)
+ chi2[j] /= float(nn[j]);
+ {
+ auto mx = std::minmax_element(chi2, chi2 + nv);
+ std::cout << "before splitting nv, min max chi2 " << nv << " " << *mx.first << ' ' << *mx.second << std::endl;
+ }
+
+#ifdef __CUDACC__
+ // one vertex per block!!!
+ cms::cuda::launch(gpuVertexFinder::splitVerticesKernel, {1024, 64}, onGPU_d.get(), ws_d.get(), 9.f);
+ cudaCheck(cudaMemcpy(&nv, LOC_WS(nvIntermediate), sizeof(uint32_t), cudaMemcpyDeviceToHost));
+#else
+ splitVertices(onGPU_d.get(), ws_d.get(), 9.f);
+ nv = ws_d->nvIntermediate;
+#endif
+ std::cout << "after split " << nv << std::endl;
+
+#ifdef __CUDACC__
+ cms::cuda::launch(gpuVertexFinder::fitVerticesKernel, {1, 1024 - 256}, onGPU_d.get(), ws_d.get(), 5000.f);
+ cudaCheck(cudaGetLastError());
+
+ cms::cuda::launch(gpuVertexFinder::sortByPt2Kernel, {1, 256}, onGPU_d.get(), ws_d.get());
+ cudaCheck(cudaGetLastError());
+ cudaCheck(cudaMemcpy(&nv, LOC_ONGPU(nvFinal), sizeof(uint32_t), cudaMemcpyDeviceToHost));
+#else
+ fitVertices(onGPU_d.get(), ws_d.get(), 5000.f);
+ sortByPt2(onGPU_d.get(), ws_d.get());
+ nv = onGPU_d->nvFinal;
+ memcpy(chi2, LOC_ONGPU(chi2), nv * sizeof(float));
+#endif
+
+ if (nv == 0) {
+ std::cout << "NO VERTICES???" << std::endl;
+ continue;
+ }
+
+#ifdef __CUDACC__
+ cudaCheck(cudaMemcpy(zv, LOC_ONGPU(zv), nv * sizeof(float), cudaMemcpyDeviceToHost));
+ cudaCheck(cudaMemcpy(wv, LOC_ONGPU(wv), nv * sizeof(float), cudaMemcpyDeviceToHost));
+ cudaCheck(cudaMemcpy(chi2, LOC_ONGPU(chi2), nv * sizeof(float), cudaMemcpyDeviceToHost));
+ cudaCheck(cudaMemcpy(ptv2, LOC_ONGPU(ptv2), nv * sizeof(float), cudaMemcpyDeviceToHost));
+ cudaCheck(cudaMemcpy(nn, LOC_ONGPU(ndof), nv * sizeof(int32_t), cudaMemcpyDeviceToHost));
+ cudaCheck(cudaMemcpy(ind, LOC_ONGPU(sortInd), nv * sizeof(uint16_t), cudaMemcpyDeviceToHost));
+#endif
+ for (auto j = 0U; j < nv; ++j)
+ if (nn[j] > 0)
+ chi2[j] /= float(nn[j]);
+ {
+ auto mx = std::minmax_element(chi2, chi2 + nv);
+ std::cout << "nv, min max chi2 " << nv << " " << *mx.first << ' ' << *mx.second << std::endl;
+ }
+
+ {
+ auto mx = std::minmax_element(wv, wv + nv);
+ std::cout << "min max error " << 1. / std::sqrt(*mx.first) << ' ' << 1. / std::sqrt(*mx.second) << std::endl;
+ }
+
+ {
+ auto mx = std::minmax_element(ptv2, ptv2 + nv);
+ std::cout << "min max ptv2 " << *mx.first << ' ' << *mx.second << std::endl;
+ std::cout << "min max ptv2 " << ptv2[ind[0]] << ' ' << ptv2[ind[nv - 1]] << " at " << ind[0] << ' '
+ << ind[nv - 1] << std::endl;
+ }
+
+ float dd[nv];
+ for (auto kv = 0U; kv < nv; ++kv) {
+ auto zr = zv[kv];
+ auto md = 500.0f;
+ for (auto zs : ev.ztrack) {
+ auto d = std::abs(zr - zs);
+ md = std::min(d, md);
+ }
+ dd[kv] = md;
+ }
+ if (i == 6) {
+ for (auto d : dd)
+ std::cout << d << ' ';
+ std::cout << std::endl;
+ }
+ auto mx = std::minmax_element(dd, dd + nv);
+ float rms = 0;
+ for (auto d : dd)
+ rms += d * d;
+ rms = std::sqrt(rms) / (nv - 1);
+ std::cout << "min max rms " << *mx.first << ' ' << *mx.second << ' ' << rms << std::endl;
+
+ } // loop on events
+  } // loop on average number of vertices
+
+ return 0;
+}
diff --git a/RecoPixelVertexing/PixelVertexFinding/test/cpuVertexFinder_t.cpp b/RecoPixelVertexing/PixelVertexFinding/test/cpuVertexFinder_t.cpp
new file mode 100644
index 0000000000000..a7906fe0d03f5
--- /dev/null
+++ b/RecoPixelVertexing/PixelVertexFinding/test/cpuVertexFinder_t.cpp
@@ -0,0 +1 @@
+#include "VertexFinder_t.h"
diff --git a/RecoPixelVertexing/PixelVertexFinding/test/gpuVertexFinder_t.cu b/RecoPixelVertexing/PixelVertexFinding/test/gpuVertexFinder_t.cu
new file mode 100644
index 0000000000000..a7906fe0d03f5
--- /dev/null
+++ b/RecoPixelVertexing/PixelVertexFinding/test/gpuVertexFinder_t.cu
@@ -0,0 +1 @@
+#include "VertexFinder_t.h"