diff --git a/CUDADataFormats/Vertex/BuildFile.xml b/CUDADataFormats/Vertex/BuildFile.xml
new file mode 100644
index 0000000000000..f61e4aff7403f
--- /dev/null
+++ b/CUDADataFormats/Vertex/BuildFile.xml
@@ -0,0 +1,9 @@
+
+
+
+
+
+
+
+
+
diff --git a/CUDADataFormats/Vertex/interface/ZVertexHeterogeneous.h b/CUDADataFormats/Vertex/interface/ZVertexHeterogeneous.h
new file mode 100644
index 0000000000000..aacfddc6fe7e2
--- /dev/null
+++ b/CUDADataFormats/Vertex/interface/ZVertexHeterogeneous.h
@@ -0,0 +1,14 @@
+#ifndef CUDADataFormatsVertexZVertexHeterogeneous_H
+#define CUDADataFormatsVertexZVertexHeterogeneous_H
+
+#include "CUDADataFormats/Vertex/interface/ZVertexSoA.h"
+#include "CUDADataFormats/Common/interface/HeterogeneousSoA.h"
+#include "CUDADataFormats/Track/interface/PixelTrackHeterogeneous.h"
+
+using ZVertexHeterogeneous = HeterogeneousSoA<ZVertexSoA>;
+#ifndef __CUDACC__
+#include "CUDADataFormats/Common/interface/Product.h"
+using ZVertexCUDAProduct = cms::cuda::Product<ZVertexHeterogeneous>;
+#endif
+
+#endif
diff --git a/CUDADataFormats/Vertex/interface/ZVertexSoA.h b/CUDADataFormats/Vertex/interface/ZVertexSoA.h
new file mode 100644
index 0000000000000..e31b87f30fa11
--- /dev/null
+++ b/CUDADataFormats/Vertex/interface/ZVertexSoA.h
@@ -0,0 +1,26 @@
+#ifndef CUDADataFormats_Vertex_ZVertexSoA_h
+#define CUDADataFormats_Vertex_ZVertexSoA_h
+
+#include <cstdint>
+#include "HeterogeneousCore/CUDAUtilities/interface/cudaCompat.h"
+
+// SOA for vertices
+// These vertices are clusterized and fitted only along the beam line (z)
+// to obtain their global coordinate, the beam spot position must be added (possibly correcting for the beam angle as well)
+struct ZVertexSoA {
+ static constexpr uint32_t MAXTRACKS = 32 * 1024;
+ static constexpr uint32_t MAXVTX = 1024;
+
+ int16_t idv[MAXTRACKS]; // vertex index for each associated (original) track (-1 == not associated)
+ float zv[MAXVTX]; // output z-position of found vertices
+ float wv[MAXVTX]; // output weight (1/error^2) on the above
+ float chi2[MAXVTX]; // vertices chi2
+ float ptv2[MAXVTX]; // vertices pt^2
+ int32_t ndof[MAXTRACKS]; // vertices number of dof (reused as workspace for the number of nearest neighbours FIXME)
+ uint16_t sortInd[MAXVTX]; // sorted index (by pt2) ascending
+ uint32_t nvFinal; // the number of vertices
+
+ __host__ __device__ void init() { nvFinal = 0; }
+};
+
+#endif // CUDADataFormats_Vertex_ZVertexSoA_h
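For orientation, a minimal host-side sketch (not part of the patch; the function name dumpZVertexSoA and the nTracks argument are illustrative) of how this SoA is meant to be read: sortInd orders the vertices by pt^2 in ascending order, and idv maps each original track back to its vertex slot.

#include <cmath>
#include <cstdint>
#include <cstdio>
#include "CUDADataFormats/Vertex/interface/ZVertexSoA.h"

// Walk the vertices from highest to lowest sum-pt^2, then scan idv to list the
// tracks attached to each vertex. wv is 1/sigma_z^2, so sigma_z = 1/sqrt(wv).
void dumpZVertexSoA(ZVertexSoA const& soa, uint32_t nTracks) {
  for (int j = int(soa.nvFinal) - 1; j >= 0; --j) {
    uint16_t i = soa.sortInd[j];
    std::printf("vtx %u: z=%.4f sigma_z=%.4f ndof=%d pt2=%.2f\n",
                unsigned(i), soa.zv[i], 1.f / std::sqrt(soa.wv[i]), soa.ndof[i], soa.ptv2[i]);
    for (uint32_t k = 0; k < nTracks; ++k)
      if (soa.idv[k] == int16_t(i))
        std::printf("  track %u\n", k);
  }
}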
diff --git a/CUDADataFormats/Vertex/src/classes.h b/CUDADataFormats/Vertex/src/classes.h
new file mode 100644
index 0000000000000..7931beaa8f4bd
--- /dev/null
+++ b/CUDADataFormats/Vertex/src/classes.h
@@ -0,0 +1,8 @@
+#ifndef CUDADataFormats_Vertex_src_classes_h
+#define CUDADataFormats_Vertex_src_classes_h
+
+#include "CUDADataFormats/Vertex/interface/ZVertexHeterogeneous.h"
+#include "CUDADataFormats/Common/interface/Product.h"
+#include "DataFormats/Common/interface/Wrapper.h"
+
+#endif // CUDADataFormats_Vertex_src_classes_h
diff --git a/CUDADataFormats/Vertex/src/classes_def.xml b/CUDADataFormats/Vertex/src/classes_def.xml
new file mode 100644
index 0000000000000..ea633080af9af
--- /dev/null
+++ b/CUDADataFormats/Vertex/src/classes_def.xml
@@ -0,0 +1,6 @@
+
+
+
+
+
+
diff --git a/DQM/TrackingMonitorClient/python/pixelVertexResolutionClient_cfi.py b/DQM/TrackingMonitorClient/python/pixelVertexResolutionClient_cfi.py
new file mode 100644
index 0000000000000..2558e88d26012
--- /dev/null
+++ b/DQM/TrackingMonitorClient/python/pixelVertexResolutionClient_cfi.py
@@ -0,0 +1,7 @@
+import FWCore.ParameterSet.Config as cms
+
+from DQM.TrackingMonitorClient.primaryVertexResolutionClient_cfi import primaryVertexResolutionClient as _primaryVertexResolutionClient
+
+pixelVertexResolutionClient = _primaryVertexResolutionClient.clone(
+ subDirs = ["OfflinePixelPV/Resolution/*"]
+)
diff --git a/DQMOffline/Configuration/python/DQMOffline_SecondStep_cff.py b/DQMOffline/Configuration/python/DQMOffline_SecondStep_cff.py
index 368b328632fd8..29bf311c474d4 100644
--- a/DQMOffline/Configuration/python/DQMOffline_SecondStep_cff.py
+++ b/DQMOffline/Configuration/python/DQMOffline_SecondStep_cff.py
@@ -122,6 +122,7 @@
from DQM.CTPPS.ctppsDQM_cff import *
from Validation.RecoTau.DQMSequences_cfi import *
from DQM.TrackingMonitorClient.pixelTrackingEffFromHitPattern_cff import *
+from DQM.TrackingMonitorClient.pixelVertexResolutionClient_cfi import *
DQMHarvestTrackerStrip = cms.Sequence ( SiStripOfflineDQMClient )
@@ -179,7 +180,8 @@
DQMHarvestTrackingZeroBias = cms.Sequence( TrackingOfflineDQMClientZeroBias *
dqmFastTimerServiceClient )
-DQMHarvestPixelTracking = cms.Sequence( pixelTrackingEffFromHitPattern )
+DQMHarvestPixelTracking = cms.Sequence( pixelTrackingEffFromHitPattern *
+ pixelVertexResolutionClient )
DQMHarvestOuterTracker = cms.Sequence(
OuterTrackerClient *
diff --git a/DQMOffline/Configuration/python/DQMOffline_cff.py b/DQMOffline/Configuration/python/DQMOffline_cff.py
index 2001c22352a48..ac28700d4eaf4 100644
--- a/DQMOffline/Configuration/python/DQMOffline_cff.py
+++ b/DQMOffline/Configuration/python/DQMOffline_cff.py
@@ -157,10 +157,12 @@
#DQMOfflineCommon
from DQM.TrackingMonitorSource.pixelTracksMonitoring_cff import *
+from DQMOffline.RecoB.PixelVertexMonitor_cff import *
from DQM.SiOuterTracker.OuterTrackerSourceConfig_cff import *
from Validation.RecoTau.DQMSequences_cfi import *
-DQMOfflinePixelTracking = cms.Sequence( pixelTracksMonitoring )
+DQMOfflinePixelTracking = cms.Sequence( pixelTracksMonitoring *
+ pixelPVMonitor )
DQMOuterTracker = cms.Sequence( DQMOfflineDCS *
OuterTrackerSource *
diff --git a/DQMOffline/RecoB/python/PixelVertexMonitor_cff.py b/DQMOffline/RecoB/python/PixelVertexMonitor_cff.py
new file mode 100644
index 0000000000000..9e293f4478bd6
--- /dev/null
+++ b/DQMOffline/RecoB/python/PixelVertexMonitor_cff.py
@@ -0,0 +1,8 @@
+import FWCore.ParameterSet.Config as cms
+
+from DQMOffline.RecoB.PrimaryVertexMonitor_cff import pvMonitor as _pvMonitor
+pixelPVMonitor = _pvMonitor.clone(
+ TopFolderName = "OfflinePixelPV",
+ vertexLabel = "pixelVertices",
+ ndof = cms.int32( 1 )
+)
diff --git a/RecoPixelVertexing/Configuration/python/RecoPixelVertexing_cff.py b/RecoPixelVertexing/Configuration/python/RecoPixelVertexing_cff.py
index 34ee6fadb04de..424ac13a43627 100644
--- a/RecoPixelVertexing/Configuration/python/RecoPixelVertexing_cff.py
+++ b/RecoPixelVertexing/Configuration/python/RecoPixelVertexing_cff.py
@@ -4,7 +4,21 @@
#
# for STARTUP ONLY use try and use Offline 3D PV from pixelTracks, with adaptive vertex
#
-#from RecoPixelVertexing.PixelVertexFinding.PixelVertexes_cff import *
-from RecoVertex.PrimaryVertexProducer.OfflinePixel3DPrimaryVertices_cfi import *
+from RecoPixelVertexing.PixelVertexFinding.PixelVertexes_cff import *
+#from RecoVertex.PrimaryVertexProducer.OfflinePixel3DPrimaryVertices_cfi import *
recopixelvertexingTask = cms.Task(pixelTracksTask,pixelVertices)
recopixelvertexing = cms.Sequence(recopixelvertexingTask)
+
+from Configuration.ProcessModifiers.gpu_cff import gpu
+
+from RecoPixelVertexing.PixelVertexFinding.pixelVertexCUDA_cfi import pixelVertexCUDA
+from RecoPixelVertexing.PixelVertexFinding.pixelVertexSoA_cfi import pixelVertexSoA
+from RecoPixelVertexing.PixelVertexFinding.pixelVertexFromSoA_cfi import pixelVertexFromSoA as _pixelVertexFromSoA
+
+_pixelVertexingCUDATask = cms.Task(pixelTracksTask,pixelVertexCUDA,pixelVertexSoA,pixelVertices)
+
+# pixelVertexSoAonCPU = pixelVertexCUDA.clone()
+# pixelVertexSoAonCPU.onGPU = False;
+
+gpu.toReplaceWith(pixelVertices,_pixelVertexFromSoA)
+gpu.toReplaceWith(recopixelvertexingTask,_pixelVertexingCUDATask)
diff --git a/RecoPixelVertexing/Configuration/python/customizePixelTracksSoAonCPU.py b/RecoPixelVertexing/Configuration/python/customizePixelTracksSoAonCPU.py
index 24cc16e02b463..1661cac832b8b 100644
--- a/RecoPixelVertexing/Configuration/python/customizePixelTracksSoAonCPU.py
+++ b/RecoPixelVertexing/Configuration/python/customizePixelTracksSoAonCPU.py
@@ -20,12 +20,21 @@ def customizePixelTracksSoAonCPU(process):
pixelRecHitSrc = 'siPixelRecHitsPreSplitting'
)
+ from RecoPixelVertexing.PixelVertexFinding.pixelVertexCUDA_cfi import pixelVertexCUDA
+ process.pixelVertexSoA = pixelVertexCUDA.clone(
+ onGPU = False,
+ pixelTrackSrc = 'pixelTrackSoA'
+ )
+
from RecoPixelVertexing.PixelTrackFitting.pixelTrackProducerFromSoA_cfi import pixelTrackProducerFromSoA
process.pixelTracks = pixelTrackProducerFromSoA.clone(
pixelRecHitLegacySrc = 'siPixelRecHitsPreSplitting'
)
- process.reconstruction_step += process.siPixelRecHitsPreSplitting + process.pixelTrackSoA
+ from RecoPixelVertexing.PixelVertexFinding.pixelVertexFromSoA_cfi import pixelVertexFromSoA
+ process.pixelVertices = pixelVertexFromSoA.clone()
+
+ process.reconstruction_step += process.siPixelRecHitsPreSplitting + process.pixelTrackSoA + process.pixelVertexSoA
return process
@@ -46,7 +55,7 @@ def customizePixelTracksSoAonCPUForProfiling(process):
process.siPixelRecHitSoAFromLegacy.convertToLegacy = False
- process.TkSoA = cms.Path(process.offlineBeamSpot + process.siPixelDigis + process.siPixelClustersPreSplitting + process.siPixelRecHitSoAFromLegacy + process.pixelTrackSoA)
+ process.TkSoA = cms.Path(process.offlineBeamSpot + process.siPixelDigis + process.siPixelClustersPreSplitting + process.siPixelRecHitSoAFromLegacy + process.pixelTrackSoA + process.pixelVertexSoA)
process.schedule = cms.Schedule(process.TkSoA)
diff --git a/RecoPixelVertexing/PixelTrackFitting/plugins/PixelTrackDumpCUDA.cc b/RecoPixelVertexing/PixelTrackFitting/plugins/PixelTrackDumpCUDA.cc
new file mode 100644
index 0000000000000..2f0965be50eb8
--- /dev/null
+++ b/RecoPixelVertexing/PixelTrackFitting/plugins/PixelTrackDumpCUDA.cc
@@ -0,0 +1,86 @@
+#include <cuda_runtime.h>
+
+#include "CUDADataFormats/Common/interface/Product.h"
+#include "CUDADataFormats/Track/interface/PixelTrackHeterogeneous.h"
+#include "CUDADataFormats/TrackingRecHit/interface/TrackingRecHit2DHeterogeneous.h"
+#include "CUDADataFormats/Vertex/interface/ZVertexHeterogeneous.h"
+#include "DataFormats/Common/interface/Handle.h"
+#include "FWCore/Framework/interface/ConsumesCollector.h"
+#include "FWCore/Framework/interface/ESHandle.h"
+#include "FWCore/Framework/interface/Event.h"
+#include "FWCore/Framework/interface/EventSetup.h"
+#include "FWCore/Framework/interface/MakerMacros.h"
+#include "FWCore/Framework/interface/global/EDAnalyzer.h"
+#include "FWCore/ParameterSet/interface/ConfigurationDescriptions.h"
+#include "FWCore/ParameterSet/interface/ParameterSet.h"
+#include "FWCore/ParameterSet/interface/ParameterSetDescription.h"
+#include "FWCore/PluginManager/interface/ModuleDef.h"
+#include "FWCore/Utilities/interface/EDGetToken.h"
+#include "FWCore/Utilities/interface/InputTag.h"
+#include "FWCore/Utilities/interface/RunningAverage.h"
+#include "HeterogeneousCore/CUDACore/interface/ScopedContext.h"
+#include "RecoTracker/TkMSParametrization/interface/PixelRecoUtilities.h"
+
+class PixelTrackDumpCUDA : public edm::global::EDAnalyzer<> {
+public:
+ explicit PixelTrackDumpCUDA(const edm::ParameterSet& iConfig);
+ ~PixelTrackDumpCUDA() override = default;
+
+ static void fillDescriptions(edm::ConfigurationDescriptions& descriptions);
+
+private:
+ void analyze(edm::StreamID streamID, edm::Event const& iEvent, const edm::EventSetup& iSetup) const override;
+ const bool m_onGPU;
+ edm::EDGetTokenT<cms::cuda::Product<PixelTrackHeterogeneous>> tokenGPUTrack_;
+ edm::EDGetTokenT<cms::cuda::Product<ZVertexHeterogeneous>> tokenGPUVertex_;
+ edm::EDGetTokenT<PixelTrackHeterogeneous> tokenSoATrack_;
+ edm::EDGetTokenT<ZVertexHeterogeneous> tokenSoAVertex_;
+};
+
+PixelTrackDumpCUDA::PixelTrackDumpCUDA(const edm::ParameterSet& iConfig)
+ : m_onGPU(iConfig.getParameter<bool>("onGPU")) {
+ if (m_onGPU) {
+ tokenGPUTrack_ =
+ consumes<cms::cuda::Product<PixelTrackHeterogeneous>>(iConfig.getParameter<edm::InputTag>("pixelTrackSrc"));
+ tokenGPUVertex_ =
+ consumes<cms::cuda::Product<ZVertexHeterogeneous>>(iConfig.getParameter<edm::InputTag>("pixelVertexSrc"));
+ } else {
+ tokenSoATrack_ = consumes<PixelTrackHeterogeneous>(iConfig.getParameter<edm::InputTag>("pixelTrackSrc"));
+ tokenSoAVertex_ = consumes<ZVertexHeterogeneous>(iConfig.getParameter<edm::InputTag>("pixelVertexSrc"));
+ }
+}
+
+void PixelTrackDumpCUDA::fillDescriptions(edm::ConfigurationDescriptions& descriptions) {
+ edm::ParameterSetDescription desc;
+
+ desc.add("onGPU", true);
+ desc.add("pixelTrackSrc", edm::InputTag("caHitNtupletCUDA"));
+ desc.add("pixelVertexSrc", edm::InputTag("pixelVertexCUDA"));
+ descriptions.add("pixelTrackDumpCUDA", desc);
+}
+
+void PixelTrackDumpCUDA::analyze(edm::StreamID streamID,
+ edm::Event const& iEvent,
+ const edm::EventSetup& iSetup) const {
+ if (m_onGPU) {
+ auto const& hTracks = iEvent.get(tokenGPUTrack_);
+ cms::cuda::ScopedContextProduce ctx{hTracks};
+
+ auto const& tracks = ctx.get(hTracks);
+ auto const* tsoa = tracks.get();
+ assert(tsoa);
+
+ auto const& vertices = ctx.get(iEvent.get(tokenGPUVertex_));
+ auto const* vsoa = vertices.get();
+ assert(vsoa);
+
+ } else {
+ auto const* tsoa = iEvent.get(tokenSoATrack_).get();
+ assert(tsoa);
+
+ auto const* vsoa = iEvent.get(tokenSoAVertex_).get();
+ assert(vsoa);
+ }
+}
+
+DEFINE_FWK_MODULE(PixelTrackDumpCUDA);
diff --git a/RecoPixelVertexing/PixelVertexFinding/plugins/BuildFile.xml b/RecoPixelVertexing/PixelVertexFinding/plugins/BuildFile.xml
index 427799cb122b5..99b91b2587bcf 100644
--- a/RecoPixelVertexing/PixelVertexFinding/plugins/BuildFile.xml
+++ b/RecoPixelVertexing/PixelVertexFinding/plugins/BuildFile.xml
@@ -1,3 +1,4 @@
+
@@ -15,10 +16,12 @@
+
+
-
+
diff --git a/RecoPixelVertexing/PixelVertexFinding/plugins/PixelVertexProducerCUDA.cc b/RecoPixelVertexing/PixelVertexFinding/plugins/PixelVertexProducerCUDA.cc
new file mode 100644
index 0000000000000..e2c2bc76c8612
--- /dev/null
+++ b/RecoPixelVertexing/PixelVertexFinding/plugins/PixelVertexProducerCUDA.cc
@@ -0,0 +1,141 @@
+#include <cuda_runtime.h>
+
+#include "CUDADataFormats/Common/interface/Product.h"
+#include "DataFormats/Common/interface/Handle.h"
+#include "FWCore/Framework/interface/ESHandle.h"
+#include "FWCore/Framework/interface/Event.h"
+#include "FWCore/Framework/interface/EventSetup.h"
+#include "FWCore/Framework/interface/MakerMacros.h"
+#include "FWCore/Framework/interface/global/EDProducer.h"
+#include "FWCore/Framework/interface/ConsumesCollector.h"
+#include "FWCore/ParameterSet/interface/ConfigurationDescriptions.h"
+#include "FWCore/ParameterSet/interface/ParameterSet.h"
+#include "FWCore/ParameterSet/interface/ParameterSetDescription.h"
+#include "FWCore/Utilities/interface/InputTag.h"
+#include "FWCore/PluginManager/interface/ModuleDef.h"
+#include "FWCore/Utilities/interface/EDGetToken.h"
+#include "FWCore/Utilities/interface/RunningAverage.h"
+#include "HeterogeneousCore/CUDACore/interface/ScopedContext.h"
+
+#include "gpuVertexFinder.h"
+
+#undef PIXVERTEX_DEBUG_PRODUCE
+
+class PixelVertexProducerCUDA : public edm::global::EDProducer<> {
+public:
+ explicit PixelVertexProducerCUDA(const edm::ParameterSet& iConfig);
+ ~PixelVertexProducerCUDA() override = default;
+
+ static void fillDescriptions(edm::ConfigurationDescriptions& descriptions);
+
+private:
+ void produceOnGPU(edm::StreamID streamID, edm::Event& iEvent, const edm::EventSetup& iSetup) const;
+ void produceOnCPU(edm::StreamID streamID, edm::Event& iEvent, const edm::EventSetup& iSetup) const;
+ void produce(edm::StreamID streamID, edm::Event& iEvent, const edm::EventSetup& iSetup) const override;
+
+ bool onGPU_;
+
+ edm::EDGetTokenT<cms::cuda::Product<PixelTrackHeterogeneous>> tokenGPUTrack_;
+ edm::EDPutTokenT<ZVertexCUDAProduct> tokenGPUVertex_;
+ edm::EDGetTokenT<PixelTrackHeterogeneous> tokenCPUTrack_;
+ edm::EDPutTokenT<ZVertexHeterogeneous> tokenCPUVertex_;
+
+ const gpuVertexFinder::Producer gpuAlgo_;
+
+ // Tracking cuts before sending tracks to vertex algo
+ const float ptMin_;
+};
+
+PixelVertexProducerCUDA::PixelVertexProducerCUDA(const edm::ParameterSet& conf)
+ : onGPU_(conf.getParameter<bool>("onGPU")),
+ gpuAlgo_(conf.getParameter<bool>("oneKernel"),
+ conf.getParameter<bool>("useDensity"),
+ conf.getParameter<bool>("useDBSCAN"),
+ conf.getParameter<bool>("useIterative"),
+ conf.getParameter<int>("minT"),
+ conf.getParameter<double>("eps"),
+ conf.getParameter<double>("errmax"),
+ conf.getParameter<double>("chi2max")),
+ ptMin_(conf.getParameter<double>("PtMin")) // 0.5 GeV
+{
+ if (onGPU_) {
+ tokenGPUTrack_ =
+ consumes<cms::cuda::Product<PixelTrackHeterogeneous>>(conf.getParameter<edm::InputTag>("pixelTrackSrc"));
+ tokenGPUVertex_ = produces<ZVertexCUDAProduct>();
+ } else {
+ tokenCPUTrack_ = consumes<PixelTrackHeterogeneous>(conf.getParameter<edm::InputTag>("pixelTrackSrc"));
+ tokenCPUVertex_ = produces<ZVertexHeterogeneous>();
+ }
+}
+
+void PixelVertexProducerCUDA::fillDescriptions(edm::ConfigurationDescriptions& descriptions) {
+ edm::ParameterSetDescription desc;
+
+ // Only one of these three algos can be used at once.
+ // Maybe this should become a Plugin Factory
+ desc.add("onGPU", true);
+ desc.add("oneKernel", true);
+ desc.add("useDensity", true);
+ desc.add("useDBSCAN", false);
+ desc.add("useIterative", false);
+
+ desc.add("minT", 2); // min number of neighbours to be "core"
+ desc.add("eps", 0.07); // max absolute distance to cluster
+ desc.add("errmax", 0.01); // max error to be "seed"
+ desc.add("chi2max", 9.); // max normalized distance to cluster
+
+ desc.add("PtMin", 0.5);
+ desc.add("pixelTrackSrc", edm::InputTag("caHitNtupletCUDA"));
+
+ auto label = "pixelVertexCUDA";
+ descriptions.add(label, desc);
+}
+
+void PixelVertexProducerCUDA::produceOnGPU(edm::StreamID streamID,
+ edm::Event& iEvent,
+ const edm::EventSetup& iSetup) const {
+ edm::Handle<cms::cuda::Product<PixelTrackHeterogeneous>> hTracks;
+ iEvent.getByToken(tokenGPUTrack_, hTracks);
+
+ cms::cuda::ScopedContextProduce ctx{*hTracks};
+ auto const* tracks = ctx.get(*hTracks).get();
+
+ assert(tracks);
+
+ ctx.emplace(iEvent, tokenGPUVertex_, gpuAlgo_.makeAsync(ctx.stream(), tracks, ptMin_));
+}
+
+void PixelVertexProducerCUDA::produceOnCPU(edm::StreamID streamID,
+ edm::Event& iEvent,
+ const edm::EventSetup& iSetup) const {
+ auto const* tracks = iEvent.get(tokenCPUTrack_).get();
+ assert(tracks);
+
+#ifdef PIXVERTEX_DEBUG_PRODUCE
+ auto const& tsoa = *tracks;
+ auto maxTracks = tsoa.stride();
+ std::cout << "size of SoA " << sizeof(tsoa) << " stride " << maxTracks << std::endl;
+
+ int32_t nt = 0;
+ for (int32_t it = 0; it < maxTracks; ++it) {
+ auto nHits = tsoa.nHits(it);
+ assert(nHits == int(tsoa.hitIndices.size(it)));
+ if (nHits == 0)
+ break; // this is a guard: maybe we need to move to nTracks...
+ nt++;
+ }
+ std::cout << "found " << nt << " tracks in cpu SoA for Vertexing at " << tracks << std::endl;
+#endif // PIXVERTEX_DEBUG_PRODUCE
+
+ iEvent.emplace(tokenCPUVertex_, gpuAlgo_.make(tracks, ptMin_));
+}
+
+void PixelVertexProducerCUDA::produce(edm::StreamID streamID, edm::Event& iEvent, const edm::EventSetup& iSetup) const {
+ if (onGPU_) {
+ produceOnGPU(streamID, iEvent, iSetup);
+ } else {
+ produceOnCPU(streamID, iEvent, iSetup);
+ }
+}
+
+DEFINE_FWK_MODULE(PixelVertexProducerCUDA);
diff --git a/RecoPixelVertexing/PixelVertexFinding/plugins/PixelVertexProducerFromSoA.cc b/RecoPixelVertexing/PixelVertexFinding/plugins/PixelVertexProducerFromSoA.cc
new file mode 100644
index 0000000000000..62b9bb46bd4a5
--- /dev/null
+++ b/RecoPixelVertexing/PixelVertexFinding/plugins/PixelVertexProducerFromSoA.cc
@@ -0,0 +1,175 @@
+#include "CUDADataFormats/Vertex/interface/ZVertexHeterogeneous.h"
+#include "DataFormats/BeamSpot/interface/BeamSpot.h"
+#include "DataFormats/Common/interface/OrphanHandle.h"
+#include "DataFormats/TrackReco/interface/Track.h"
+#include "DataFormats/TrackReco/interface/TrackExtra.h"
+#include "DataFormats/TrackReco/interface/TrackFwd.h"
+#include "DataFormats/VertexReco/interface/Vertex.h"
+#include "DataFormats/VertexReco/interface/VertexFwd.h"
+#include "FWCore/Framework/interface/ConsumesCollector.h"
+#include "FWCore/Framework/interface/ESHandle.h"
+#include "FWCore/Framework/interface/Event.h"
+#include "FWCore/Framework/interface/EventSetup.h"
+#include "FWCore/Framework/interface/MakerMacros.h"
+#include "FWCore/Framework/interface/global/EDProducer.h"
+#include "FWCore/ParameterSet/interface/ConfigurationDescriptions.h"
+#include "FWCore/ParameterSet/interface/ParameterSet.h"
+#include "FWCore/ParameterSet/interface/ParameterSetDescription.h"
+#include "FWCore/PluginManager/interface/ModuleDef.h"
+#include "FWCore/Utilities/interface/EDGetToken.h"
+#include "FWCore/Utilities/interface/InputTag.h"
+#include "Geometry/Records/interface/TrackerTopologyRcd.h"
+#include "MagneticField/Records/interface/IdealMagneticFieldRecord.h"
+
+#undef PIXVERTEX_DEBUG_PRODUCE
+
+class PixelVertexProducerFromSoA : public edm::global::EDProducer<> {
+public:
+ using IndToEdm = std::vector<uint16_t>;
+
+ explicit PixelVertexProducerFromSoA(const edm::ParameterSet &iConfig);
+ ~PixelVertexProducerFromSoA() override = default;
+
+ static void fillDescriptions(edm::ConfigurationDescriptions &descriptions);
+
+private:
+ void produce(edm::StreamID streamID, edm::Event &iEvent, const edm::EventSetup &iSetup) const override;
+
+ edm::EDGetTokenT<ZVertexHeterogeneous> tokenVertex_;
+ edm::EDGetTokenT<reco::BeamSpot> tokenBeamSpot_;
+ edm::EDGetTokenT<reco::TrackCollection> tokenTracks_;
+ edm::EDGetTokenT<IndToEdm> tokenIndToEdm_;
+};
+
+PixelVertexProducerFromSoA::PixelVertexProducerFromSoA(const edm::ParameterSet &conf)
+ : tokenVertex_(consumes<ZVertexHeterogeneous>(conf.getParameter<edm::InputTag>("src"))),
+ tokenBeamSpot_(consumes<reco::BeamSpot>(conf.getParameter<edm::InputTag>("beamSpot"))),
+ tokenTracks_(consumes<reco::TrackCollection>(conf.getParameter<edm::InputTag>("TrackCollection"))),
+ tokenIndToEdm_(consumes<IndToEdm>(conf.getParameter<edm::InputTag>("TrackCollection"))) {
+ produces<reco::VertexCollection>();
+}
+
+void PixelVertexProducerFromSoA::fillDescriptions(edm::ConfigurationDescriptions &descriptions) {
+ edm::ParameterSetDescription desc;
+
+ desc.add("TrackCollection", edm::InputTag("pixelTracks"));
+ desc.add("beamSpot", edm::InputTag("offlineBeamSpot"));
+ desc.add("src", edm::InputTag("pixelVertexSoA"));
+
+ descriptions.add("pixelVertexFromSoA", desc);
+}
+
+void PixelVertexProducerFromSoA::produce(edm::StreamID streamID, edm::Event &iEvent, const edm::EventSetup &) const {
+ auto vertexes = std::make_unique<reco::VertexCollection>();
+
+ auto tracksHandle = iEvent.getHandle(tokenTracks_);
+ auto tracksSize = tracksHandle->size();
+ auto const &indToEdm = iEvent.get(tokenIndToEdm_);
+ auto bsHandle = iEvent.getHandle(tokenBeamSpot_);
+
+ float x0 = 0, y0 = 0, z0 = 0, dxdz = 0, dydz = 0;
+ std::vector<int32_t> itrk;
+ if (!bsHandle.isValid()) {
+ edm::LogWarning("PixelVertexProducer") << "No beamspot found. returning vertexes with (0,0,Z) ";
+ } else {
+ const reco::BeamSpot &bs = *bsHandle;
+ x0 = bs.x0();
+ y0 = bs.y0();
+ z0 = bs.z0();
+ dxdz = bs.dxdz();
+ dydz = bs.dydz();
+ }
+
+ auto const &soa = *(iEvent.get(tokenVertex_).get());
+
+ int nv = soa.nvFinal;
+
+#ifdef PIXVERTEX_DEBUG_PRODUCE
+ std::cout << "converting " << nv << " vertices "
+ << " from " << indToEdm.size() << " tracks" << std::endl;
+#endif // PIXVERTEX_DEBUG_PRODUCE
+
+ std::set<uint16_t> uind; // for verifying index consistency
+ for (int j = nv - 1; j >= 0; --j) {
+ auto i = soa.sortInd[j]; // on gpu sorted in ascending order....
+ assert(i < nv);
+ uind.insert(i);
+ assert(itrk.empty());
+ auto z = soa.zv[i];
+ auto x = x0 + dxdz * z;
+ auto y = y0 + dydz * z;
+ z += z0;
+ reco::Vertex::Error err;
+ err(2, 2) = 1.f / soa.wv[i];
+ err(2, 2) *= 2.; // artificially inflate error
+ //Copy also the tracks (no intention to be efficient....)
+ for (auto k = 0U; k < indToEdm.size(); ++k) {
+ if (soa.idv[k] == int16_t(i))
+ itrk.push_back(k);
+ }
+ auto nt = itrk.size();
+ if (nt == 0) {
+#ifdef PIXVERTEX_DEBUG_PRODUCE
+ std::cout << "vertex " << i << " with no tracks..." << std::endl;
+#endif // PIXVERTEX_DEBUG_PRODUCE
+ continue;
+ }
+ if (nt < 2) {
+ itrk.clear();
+ continue;
+ } // remove outliers
+ (*vertexes).emplace_back(reco::Vertex::Point(x, y, z), err, soa.chi2[i], soa.ndof[i], nt);
+ auto &v = (*vertexes).back();
+ for (auto it : itrk) {
+ assert(it < int(indToEdm.size()));
+ auto k = indToEdm[it];
+ if (k > tracksSize) {
+ edm::LogWarning("PixelVertexProducer") << "oops track " << it << " does not exists on CPU " << k;
+ continue;
+ }
+ auto tk = reco::TrackRef(tracksHandle, k);
+ v.add(reco::TrackBaseRef(tk));
+ }
+ itrk.clear();
+ }
+
+ LogDebug("PixelVertexProducer") << ": Found " << vertexes->size() << " vertexes\n";
+ for (unsigned int i = 0; i < vertexes->size(); ++i) {
+ LogDebug("PixelVertexProducer") << "Vertex number " << i << " has " << (*vertexes)[i].tracksSize()
+ << " tracks with a position of " << (*vertexes)[i].z() << " +- "
+ << std::sqrt((*vertexes)[i].covariance(2, 2));
+ }
+
+ // legacy logic....
+ if (vertexes->empty() && bsHandle.isValid()) {
+ const reco::BeamSpot &bs = *bsHandle;
+
+ GlobalError bse(bs.rotatedCovariance3D());
+ if ((bse.cxx() <= 0.) || (bse.cyy() <= 0.) || (bse.czz() <= 0.)) {
+ AlgebraicSymMatrix33 we;
+ we(0, 0) = 10000;
+ we(1, 1) = 10000;
+ we(2, 2) = 10000;
+ vertexes->push_back(reco::Vertex(bs.position(), we, 0., 0., 0));
+
+ edm::LogInfo("PixelVertexProducer") << "No vertices found. Beamspot with invalid errors " << bse.matrix()
+ << "\nWill put Vertex derived from dummy-fake BeamSpot into Event.\n"
+ << (*vertexes)[0].x() << "\n"
+ << (*vertexes)[0].y() << "\n"
+ << (*vertexes)[0].z() << "\n";
+ } else {
+ vertexes->push_back(reco::Vertex(bs.position(), bs.rotatedCovariance3D(), 0., 0., 0));
+
+ edm::LogInfo("PixelVertexProducer") << "No vertices found. Will put Vertex derived from BeamSpot into Event:\n"
+ << (*vertexes)[0].x() << "\n"
+ << (*vertexes)[0].y() << "\n"
+ << (*vertexes)[0].z() << "\n";
+ }
+ } else if (vertexes->empty() && !bsHandle.isValid()) {
+ edm::LogWarning("PixelVertexProducer") << "No beamspot and no vertex found. No vertex returned.";
+ }
+
+ iEvent.put(std::move(vertexes));
+}
+
+DEFINE_FWK_MODULE(PixelVertexProducerFromSoA);
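The beam-spot handling in produce() amounts to propagating the beam line to the fitted z before shifting into the global frame. A standalone sketch of that geometry (SimpleBS is a hypothetical stand-in for the few reco::BeamSpot fields actually used):

#include "CUDADataFormats/Vertex/interface/ZVertexSoA.h"

struct SimpleBS {
  float x0, y0, z0, dxdz, dydz;
};

// Same arithmetic as the loop above: the vertex is fitted only along z (relative
// to the beam spot), so x and y follow the beam slope at that z.
void globalPosition(ZVertexSoA const& soa, uint16_t i, SimpleBS const& bs, float& x, float& y, float& z) {
  z = soa.zv[i];            // z relative to the beam spot
  x = bs.x0 + bs.dxdz * z;  // correct for the beam tilt in x
  y = bs.y0 + bs.dydz * z;  // ... and in y
  z += bs.z0;               // shift into the global frame
}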
diff --git a/RecoPixelVertexing/PixelVertexFinding/plugins/PixelVertexSoAFromCUDA.cc b/RecoPixelVertexing/PixelVertexFinding/plugins/PixelVertexSoAFromCUDA.cc
new file mode 100644
index 0000000000000..0cadf24580cf7
--- /dev/null
+++ b/RecoPixelVertexing/PixelVertexFinding/plugins/PixelVertexSoAFromCUDA.cc
@@ -0,0 +1,65 @@
+#include <cuda_runtime.h>
+
+#include "CUDADataFormats/Common/interface/Product.h"
+#include "CUDADataFormats/Common/interface/HostProduct.h"
+#include "CUDADataFormats/Vertex/interface/ZVertexHeterogeneous.h"
+#include "DataFormats/Common/interface/Handle.h"
+#include "FWCore/Framework/interface/ESHandle.h"
+#include "FWCore/Framework/interface/Event.h"
+#include "FWCore/Framework/interface/EventSetup.h"
+#include "FWCore/Framework/interface/MakerMacros.h"
+#include "FWCore/Framework/interface/stream/EDProducer.h"
+#include "FWCore/ParameterSet/interface/ConfigurationDescriptions.h"
+#include "FWCore/ParameterSet/interface/ParameterSet.h"
+#include "FWCore/ParameterSet/interface/ParameterSetDescription.h"
+#include "FWCore/PluginManager/interface/ModuleDef.h"
+#include "FWCore/Utilities/interface/EDGetToken.h"
+#include "FWCore/Utilities/interface/InputTag.h"
+#include "HeterogeneousCore/CUDACore/interface/ScopedContext.h"
+
+class PixelVertexSoAFromCUDA : public edm::stream::EDProducer<edm::ExternalWork> {
+public:
+ explicit PixelVertexSoAFromCUDA(const edm::ParameterSet& iConfig);
+ ~PixelVertexSoAFromCUDA() override = default;
+
+ static void fillDescriptions(edm::ConfigurationDescriptions& descriptions);
+
+private:
+ void acquire(edm::Event const& iEvent,
+ edm::EventSetup const& iSetup,
+ edm::WaitingTaskWithArenaHolder waitingTaskHolder) override;
+ void produce(edm::Event& iEvent, edm::EventSetup const& iSetup) override;
+
+ edm::EDGetTokenT<cms::cuda::Product<ZVertexHeterogeneous>> tokenCUDA_;
+ edm::EDPutTokenT<ZVertexHeterogeneous> tokenSOA_;
+
+ cms::cuda::host::unique_ptr<ZVertexSoA> m_soa;
+};
+
+PixelVertexSoAFromCUDA::PixelVertexSoAFromCUDA(const edm::ParameterSet& iConfig)
+ : tokenCUDA_(consumes<cms::cuda::Product<ZVertexHeterogeneous>>(iConfig.getParameter<edm::InputTag>("src"))),
+ tokenSOA_(produces<ZVertexHeterogeneous>()) {}
+
+void PixelVertexSoAFromCUDA::fillDescriptions(edm::ConfigurationDescriptions& descriptions) {
+ edm::ParameterSetDescription desc;
+
+ desc.add("src", edm::InputTag("pixelVertexCUDA"));
+ descriptions.add("pixelVertexSoA", desc);
+}
+
+void PixelVertexSoAFromCUDA::acquire(edm::Event const& iEvent,
+ edm::EventSetup const& iSetup,
+ edm::WaitingTaskWithArenaHolder waitingTaskHolder) {
+ auto const& inputDataWrapped = iEvent.get(tokenCUDA_);
+ cms::cuda::ScopedContextAcquire ctx{inputDataWrapped, std::move(waitingTaskHolder)};
+ auto const& inputData = ctx.get(inputDataWrapped);
+
+ m_soa = inputData.toHostAsync(ctx.stream());
+}
+
+void PixelVertexSoAFromCUDA::produce(edm::Event& iEvent, edm::EventSetup const& iSetup) {
+ // No copies....
+ iEvent.emplace(tokenSOA_, ZVertexHeterogeneous(std::move(m_soa)));
+}
+
+DEFINE_FWK_MODULE(PixelVertexSoAFromCUDA);
diff --git a/RecoPixelVertexing/PixelVertexFinding/plugins/gpuClusterTracksByDensity.h b/RecoPixelVertexing/PixelVertexFinding/plugins/gpuClusterTracksByDensity.h
new file mode 100644
index 0000000000000..b19aeb5930fc6
--- /dev/null
+++ b/RecoPixelVertexing/PixelVertexFinding/plugins/gpuClusterTracksByDensity.h
@@ -0,0 +1,234 @@
+#ifndef RecoPixelVertexing_PixelVertexFinding_plugins_gpuClusterTracksByDensity_h
+#define RecoPixelVertexing_PixelVertexFinding_plugins_gpuClusterTracksByDensity_h
+
+#include <algorithm>
+#include <cmath>
+#include <cstdint>
+
+#include "HeterogeneousCore/CUDAUtilities/interface/HistoContainer.h"
+#include "HeterogeneousCore/CUDAUtilities/interface/cuda_assert.h"
+
+#include "gpuVertexFinder.h"
+
+namespace gpuVertexFinder {
+
+ // this algo does not really scale as it works in a single block...
+ // enough for the <10K tracks we have
+ //
+ // based on the Rodriguez & Laio algorithm
+ //
+ __device__ __forceinline__ void clusterTracksByDensity(gpuVertexFinder::ZVertices* pdata,
+ gpuVertexFinder::WorkSpace* pws,
+ int minT, // min number of neighbours to be "seed"
+ float eps, // max absolute distance to cluster
+ float errmax, // max error to be "seed"
+ float chi2max // max normalized distance to cluster
+ ) {
+ using namespace gpuVertexFinder;
+ constexpr bool verbose = false; // in principle the compiler should optimize it out if false
+
+ if (verbose && 0 == threadIdx.x)
+ printf("params %d %f %f %f\n", minT, eps, errmax, chi2max);
+
+ auto er2mx = errmax * errmax;
+
+ auto& __restrict__ data = *pdata;
+ auto& __restrict__ ws = *pws;
+ auto nt = ws.ntrks;
+ float const* __restrict__ zt = ws.zt;
+ float const* __restrict__ ezt2 = ws.ezt2;
+
+ uint32_t& nvFinal = data.nvFinal;
+ uint32_t& nvIntermediate = ws.nvIntermediate;
+
+ uint8_t* __restrict__ izt = ws.izt;
+ int32_t* __restrict__ nn = data.ndof;
+ int32_t* __restrict__ iv = ws.iv;
+
+ assert(pdata);
+ assert(zt);
+
+ using Hist = cms::cuda::HistoContainer<uint8_t, 256, 16000, 8, uint16_t>;
+ __shared__ Hist hist;
+ __shared__ typename Hist::Counter hws[32];
+ for (auto j = threadIdx.x; j < Hist::totbins(); j += blockDim.x) {
+ hist.off[j] = 0;
+ }
+ __syncthreads();
+
+ if (verbose && 0 == threadIdx.x)
+ printf("booked hist with %d bins, size %d for %d tracks\n", hist.nbins(), hist.capacity(), nt);
+
+ assert(nt <= hist.capacity());
+
+ // fill hist (bin shall be wider than "eps")
+ for (auto i = threadIdx.x; i < nt; i += blockDim.x) {
+ assert(i < ZVertices::MAXTRACKS);
+ int iz = int(zt[i] * 10.); // valid if eps<=0.1
+ // iz = std::clamp(iz, INT8_MIN, INT8_MAX); // sorry c++17 only
+ iz = std::min(std::max(iz, INT8_MIN), INT8_MAX);
+ izt[i] = iz - INT8_MIN;
+ assert(iz - INT8_MIN >= 0);
+ assert(iz - INT8_MIN < 256);
+ hist.count(izt[i]);
+ iv[i] = i;
+ nn[i] = 0;
+ }
+ __syncthreads();
+ if (threadIdx.x < 32)
+ hws[threadIdx.x] = 0; // used by prefix scan...
+ __syncthreads();
+ hist.finalize(hws);
+ __syncthreads();
+ assert(hist.size() == nt);
+ for (auto i = threadIdx.x; i < nt; i += blockDim.x) {
+ hist.fill(izt[i], uint16_t(i));
+ }
+ __syncthreads();
+
+ // count neighbours
+ for (auto i = threadIdx.x; i < nt; i += blockDim.x) {
+ if (ezt2[i] > er2mx)
+ continue;
+ auto loop = [&](uint32_t j) {
+ if (i == j)
+ return;
+ auto dist = std::abs(zt[i] - zt[j]);
+ if (dist > eps)
+ return;
+ if (dist * dist > chi2max * (ezt2[i] + ezt2[j]))
+ return;
+ nn[i]++;
+ };
+
+ cms::cuda::forEachInBins(hist, izt[i], 1, loop);
+ }
+
+ __syncthreads();
+
+ // find closest above me .... (we ignore the possibility of two j at same distance from i)
+ for (auto i = threadIdx.x; i < nt; i += blockDim.x) {
+ float mdist = eps;
+ auto loop = [&](uint32_t j) {
+ if (nn[j] < nn[i])
+ return;
+ if (nn[j] == nn[i] && zt[j] >= zt[i])
+ return; // if equal use natural order...
+ auto dist = std::abs(zt[i] - zt[j]);
+ if (dist > mdist)
+ return;
+ if (dist * dist > chi2max * (ezt2[i] + ezt2[j]))
+ return; // (break natural order???)
+ mdist = dist;
+ iv[i] = j; // assign to cluster (better be unique??)
+ };
+ cms::cuda::forEachInBins(hist, izt[i], 1, loop);
+ }
+
+ __syncthreads();
+
+#ifdef GPU_DEBUG
+ // mini verification
+ for (auto i = threadIdx.x; i < nt; i += blockDim.x) {
+ if (iv[i] != int(i))
+ assert(iv[iv[i]] != int(i));
+ }
+ __syncthreads();
+#endif
+
+ // consolidate graph (percolate index of seed)
+ for (auto i = threadIdx.x; i < nt; i += blockDim.x) {
+ auto m = iv[i];
+ while (m != iv[m])
+ m = iv[m];
+ iv[i] = m;
+ }
+
+#ifdef GPU_DEBUG
+ __syncthreads();
+ // mini verification
+ for (auto i = threadIdx.x; i < nt; i += blockDim.x) {
+ if (iv[i] != int(i))
+ assert(iv[iv[i]] != int(i));
+ }
+#endif
+
+#ifdef GPU_DEBUG
+ // and verify that we did not split any cluster...
+ for (auto i = threadIdx.x; i < nt; i += blockDim.x) {
+ auto minJ = i;
+ auto mdist = eps;
+ auto loop = [&](uint32_t j) {
+ if (nn[j] < nn[i])
+ return;
+ if (nn[j] == nn[i] && zt[j] >= zt[i])
+ return; // if equal use natural order...
+ auto dist = std::abs(zt[i] - zt[j]);
+ if (dist > mdist)
+ return;
+ if (dist * dist > chi2max * (ezt2[i] + ezt2[j]))
+ return;
+ mdist = dist;
+ minJ = j;
+ };
+ cms::cuda::forEachInBins(hist, izt[i], 1, loop);
+ // should belong to the same cluster...
+ assert(iv[i] == iv[minJ]);
+ assert(nn[i] <= nn[iv[i]]);
+ }
+ __syncthreads();
+#endif
+
+ __shared__ unsigned int foundClusters;
+ foundClusters = 0;
+ __syncthreads();
+
+ // find the number of different clusters, identified by tracks with clus[i] == i and density larger than threshold;
+ // mark these tracks with a negative id.
+ for (auto i = threadIdx.x; i < nt; i += blockDim.x) {
+ if (iv[i] == int(i)) {
+ if (nn[i] >= minT) {
+ auto old = atomicInc(&foundClusters, 0xffffffff);
+ iv[i] = -(old + 1);
+ } else { // noise
+ iv[i] = -9998;
+ }
+ }
+ }
+ __syncthreads();
+
+ assert(foundClusters < ZVertices::MAXVTX);
+
+ // propagate the negative id to all the tracks in the cluster.
+ for (auto i = threadIdx.x; i < nt; i += blockDim.x) {
+ if (iv[i] >= 0) {
+ // mark each track in a cluster with the same id as the first one
+ iv[i] = iv[iv[i]];
+ }
+ }
+ __syncthreads();
+
+ // adjust the cluster id to be a positive value starting from 0
+ for (auto i = threadIdx.x; i < nt; i += blockDim.x) {
+ iv[i] = -iv[i] - 1;
+ }
+
+ nvIntermediate = nvFinal = foundClusters;
+
+ if (verbose && 0 == threadIdx.x)
+ printf("found %d proto vertices\n", foundClusters);
+ }
+
+ __global__ void clusterTracksByDensityKernel(gpuVertexFinder::ZVertices* pdata,
+ gpuVertexFinder::WorkSpace* pws,
+ int minT, // min number of neighbours to be "seed"
+ float eps, // max absolute distance to cluster
+ float errmax, // max error to be "seed"
+ float chi2max // max normalized distance to cluster
+ ) {
+ clusterTracksByDensity(pdata, pws, minT, eps, errmax, chi2max);
+ }
+
+} // namespace gpuVertexFinder
+
+#endif // RecoPixelVertexing_PixelVertexFinding_plugins_gpuClusterTracksByDensity_h
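For reference, the clustering above reduces to the following serial, brute-force logic (a hypothetical CPU sketch, without the histogram, shared memory, or the bin-limited neighbour search used by the kernel): count neighbours within eps, link every track to its densest compatible neighbour, percolate the links to the seed, then promote seeds with at least minT neighbours to vertices and flag the rest as noise.

#include <cmath>
#include <vector>

std::vector<int> clusterByDensityCPU(std::vector<float> const& zt,
                                     std::vector<float> const& ezt2,
                                     int minT, float eps, float errmax, float chi2max) {
  int nt = int(zt.size());
  std::vector<int> nn(nt, 0), iv(nt);
  for (int i = 0; i < nt; ++i)
    iv[i] = i;
  // count neighbours (only for tracks with small enough error)
  for (int i = 0; i < nt; ++i) {
    if (ezt2[i] > errmax * errmax)
      continue;
    for (int j = 0; j < nt; ++j) {
      if (i == j)
        continue;
      float dist = std::abs(zt[i] - zt[j]);
      if (dist > eps || dist * dist > chi2max * (ezt2[i] + ezt2[j]))
        continue;
      nn[i]++;
    }
  }
  // link each track to its closest denser (or equally dense, smaller-z) neighbour
  for (int i = 0; i < nt; ++i) {
    float mdist = eps;
    for (int j = 0; j < nt; ++j) {
      if (nn[j] < nn[i] || (nn[j] == nn[i] && zt[j] >= zt[i]))
        continue;
      float dist = std::abs(zt[i] - zt[j]);
      if (dist > mdist || dist * dist > chi2max * (ezt2[i] + ezt2[j]))
        continue;
      mdist = dist;
      iv[i] = j;
    }
  }
  // percolate to the seed
  for (int i = 0; i < nt; ++i) {
    int m = iv[i];
    while (m != iv[m])
      m = iv[m];
    iv[i] = m;
  }
  // seeds with enough neighbours become clusters, the others become noise
  int nclus = 0;
  for (int i = 0; i < nt; ++i)
    if (iv[i] == i)
      iv[i] = (nn[i] >= minT) ? -(++nclus) : -9998;
  for (int i = 0; i < nt; ++i)
    if (iv[i] >= 0)
      iv[i] = iv[iv[i]];
  for (int i = 0; i < nt; ++i)
    iv[i] = -iv[i] - 1;  // cluster id per track; noise ends up at 9997
  return iv;
}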
diff --git a/RecoPixelVertexing/PixelVertexFinding/plugins/gpuClusterTracksDBSCAN.h b/RecoPixelVertexing/PixelVertexFinding/plugins/gpuClusterTracksDBSCAN.h
new file mode 100644
index 0000000000000..22ba1e15b4e05
--- /dev/null
+++ b/RecoPixelVertexing/PixelVertexFinding/plugins/gpuClusterTracksDBSCAN.h
@@ -0,0 +1,241 @@
+#ifndef RecoPixelVertexing_PixelVertexFinding_plugins_gpuClusterTracksDBSCAN_h
+#define RecoPixelVertexing_PixelVertexFinding_plugins_gpuClusterTracksDBSCAN_h
+
+#include <algorithm>
+#include <cmath>
+#include <cstdint>
+
+#include "HeterogeneousCore/CUDAUtilities/interface/HistoContainer.h"
+#include "HeterogeneousCore/CUDAUtilities/interface/cuda_assert.h"
+
+#include "gpuVertexFinder.h"
+
+namespace gpuVertexFinder {
+
+ // this algo does not really scale as it works in a single block...
+ // enough for <10K tracks we have
+ __global__ void clusterTracksDBSCAN(ZVertices* pdata,
+ WorkSpace* pws,
+ int minT, // min number of neighbours to be "core"
+ float eps, // max absolute distance to cluster
+ float errmax, // max error to be "seed"
+ float chi2max // max normalized distance to cluster
+ ) {
+ constexpr bool verbose = false; // in principle the compiler should optimize it out if false
+
+ if (verbose && 0 == threadIdx.x)
+ printf("params %d %f %f %f\n", minT, eps, errmax, chi2max);
+
+ auto er2mx = errmax * errmax;
+
+ auto& __restrict__ data = *pdata;
+ auto& __restrict__ ws = *pws;
+ auto nt = ws.ntrks;
+ float const* __restrict__ zt = ws.zt;
+ float const* __restrict__ ezt2 = ws.ezt2;
+
+ uint32_t& nvFinal = data.nvFinal;
+ uint32_t& nvIntermediate = ws.nvIntermediate;
+
+ uint8_t* __restrict__ izt = ws.izt;
+ int32_t* __restrict__ nn = data.ndof;
+ int32_t* __restrict__ iv = ws.iv;
+
+ assert(pdata);
+ assert(zt);
+
+ using Hist = cms::cuda::HistoContainer<uint8_t, 256, 16000, 8, uint16_t>;
+ __shared__ Hist hist;
+ __shared__ typename Hist::Counter hws[32];
+ for (auto j = threadIdx.x; j < Hist::totbins(); j += blockDim.x) {
+ hist.off[j] = 0;
+ }
+ __syncthreads();
+
+ if (verbose && 0 == threadIdx.x)
+ printf("booked hist with %d bins, size %d for %d tracks\n", hist.nbins(), hist.capacity(), nt);
+
+ assert(nt <= hist.capacity());
+
+ // fill hist (bin shall be wider than "eps")
+ for (auto i = threadIdx.x; i < nt; i += blockDim.x) {
+ assert(i < ZVertices::MAXTRACKS);
+ int iz = int(zt[i] * 10.); // valid if eps<=0.1
+ iz = std::clamp(iz, INT8_MIN, INT8_MAX);
+ izt[i] = iz - INT8_MIN;
+ assert(iz - INT8_MIN >= 0);
+ assert(iz - INT8_MIN < 256);
+ hist.count(izt[i]);
+ iv[i] = i;
+ nn[i] = 0;
+ }
+ __syncthreads();
+ if (threadIdx.x < 32)
+ hws[threadIdx.x] = 0; // used by prefix scan...
+ __syncthreads();
+ hist.finalize(hws);
+ __syncthreads();
+ assert(hist.size() == nt);
+ for (auto i = threadIdx.x; i < nt; i += blockDim.x) {
+ hist.fill(izt[i], uint16_t(i));
+ }
+ __syncthreads();
+
+ // count neighbours
+ for (auto i = threadIdx.x; i < nt; i += blockDim.x) {
+ if (ezt2[i] > er2mx)
+ continue;
+ auto loop = [&](uint32_t j) {
+ if (i == j)
+ return;
+ auto dist = std::abs(zt[i] - zt[j]);
+ if (dist > eps)
+ return;
+ // if (dist*dist>chi2max*(ezt2[i]+ezt2[j])) return;
+ nn[i]++;
+ };
+
+ cms::cuda::forEachInBins(hist, izt[i], 1, loop);
+ }
+
+ __syncthreads();
+
+ // find NN with smaller z...
+ for (auto i = threadIdx.x; i < nt; i += blockDim.x) {
+ if (nn[i] < minT)
+ continue; // DBSCAN core rule
+ float mz = zt[i];
+ auto loop = [&](uint32_t j) {
+ if (zt[j] >= mz)
+ return;
+ if (nn[j] < minT)
+ return; // DBSCAN core rule
+ auto dist = std::abs(zt[i] - zt[j]);
+ if (dist > eps)
+ return;
+ // if (dist*dist>chi2max*(ezt2[i]+ezt2[j])) return;
+ mz = zt[j];
+ iv[i] = j; // assign to cluster (better be unique??)
+ };
+ cms::cuda::forEachInBins(hist, izt[i], 1, loop);
+ }
+
+ __syncthreads();
+
+#ifdef GPU_DEBUG
+ // mini verification
+ for (auto i = threadIdx.x; i < nt; i += blockDim.x) {
+ if (iv[i] != int(i))
+ assert(iv[iv[i]] != int(i));
+ }
+ __syncthreads();
+#endif
+
+ // consolidate graph (percolate index of seed)
+ for (auto i = threadIdx.x; i < nt; i += blockDim.x) {
+ auto m = iv[i];
+ while (m != iv[m])
+ m = iv[m];
+ iv[i] = m;
+ }
+
+ __syncthreads();
+
+#ifdef GPU_DEBUG
+ // mini verification
+ for (auto i = threadIdx.x; i < nt; i += blockDim.x) {
+ if (iv[i] != int(i))
+ assert(iv[iv[i]] != int(i));
+ }
+ __syncthreads();
+#endif
+
+#ifdef GPU_DEBUG
+ // and verify that we did not split any cluster...
+ for (auto i = threadIdx.x; i < nt; i += blockDim.x) {
+ if (nn[i] < minT)
+ continue; // DBSCAN core rule
+ assert(zt[iv[i]] <= zt[i]);
+ auto loop = [&](uint32_t j) {
+ if (nn[j] < minT)
+ return; // DBSCAN core rule
+ auto dist = std::abs(zt[i] - zt[j]);
+ if (dist > eps)
+ return;
+ // if (dist*dist>chi2max*(ezt2[i]+ezt2[j])) return;
+ // they should belong to the same cluster, shouldn't they?
+ if (iv[i] != iv[j]) {
+ printf("ERROR %d %d %f %f %d\n", i, iv[i], zt[i], zt[iv[i]], iv[iv[i]]);
+ printf(" %d %d %f %f %d\n", j, iv[j], zt[j], zt[iv[j]], iv[iv[j]]);
+ ;
+ }
+ assert(iv[i] == iv[j]);
+ };
+ cms::cuda::forEachInBins(hist, izt[i], 1, loop);
+ }
+ __syncthreads();
+#endif
+
+ // collect edges (assign to closest cluster of closest point??? here to closest point)
+ for (auto i = threadIdx.x; i < nt; i += blockDim.x) {
+ // if (nn[i]==0 || nn[i]>=minT) continue; // DBSCAN edge rule
+ if (nn[i] >= minT)
+ continue; // DBSCAN edge rule
+ float mdist = eps;
+ auto loop = [&](uint32_t j) {
+ if (nn[j] < minT)
+ return; // DBSCAN core rule
+ auto dist = std::abs(zt[i] - zt[j]);
+ if (dist > mdist)
+ return;
+ if (dist * dist > chi2max * (ezt2[i] + ezt2[j]))
+ return; // needed?
+ mdist = dist;
+ iv[i] = iv[j]; // assign to cluster (better be unique??)
+ };
+ cms::cuda::forEachInBins(hist, izt[i], 1, loop);
+ }
+
+ __shared__ unsigned int foundClusters;
+ foundClusters = 0;
+ __syncthreads();
+
+ // find the number of different clusters, identified by tracks with clus[i] == i;
+ // mark these tracks with a negative id.
+ for (auto i = threadIdx.x; i < nt; i += blockDim.x) {
+ if (iv[i] == int(i)) {
+ if (nn[i] >= minT) {
+ auto old = atomicInc(&foundClusters, 0xffffffff);
+ iv[i] = -(old + 1);
+ } else { // noise
+ iv[i] = -9998;
+ }
+ }
+ }
+ __syncthreads();
+
+ assert(foundClusters < ZVertices::MAXVTX);
+
+ // propagate the negative id to all the tracks in the cluster.
+ for (auto i = threadIdx.x; i < nt; i += blockDim.x) {
+ if (iv[i] >= 0) {
+ // mark each track in a cluster with the same id as the first one
+ iv[i] = iv[iv[i]];
+ }
+ }
+ __syncthreads();
+
+ // adjust the cluster id to be a positive value starting from 0
+ for (auto i = threadIdx.x; i < nt; i += blockDim.x) {
+ iv[i] = -iv[i] - 1;
+ }
+
+ nvIntermediate = nvFinal = foundClusters;
+
+ if (verbose && 0 == threadIdx.x)
+ printf("found %d proto vertices\n", foundClusters);
+ }
+
+} // namespace gpuVertexFinder
+
+#endif // RecoPixelVertexing_PixelVertexFinding_plugins_gpuClusterTracksDBSCAN_h
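The difference with the density-based version above is the DBSCAN core/edge rule: only tracks with at least minT neighbours ("core") can seed or propagate a cluster, and a non-core track is attached only if a compatible core track is nearby. A hypothetical helper spelling out that classification:

#include <cmath>
#include <vector>

enum class DbscanRole { Core, Edge, Noise };

DbscanRole classify(int i,
                    std::vector<float> const& zt,
                    std::vector<float> const& ezt2,
                    std::vector<int> const& nn,  // neighbour counts within eps
                    int minT, float eps, float chi2max) {
  if (nn[i] >= minT)
    return DbscanRole::Core;
  for (size_t j = 0; j < zt.size(); ++j) {
    float dist = std::abs(zt[i] - zt[j]);
    if (nn[j] >= minT && dist <= eps && dist * dist <= chi2max * (ezt2[i] + ezt2[j]))
      return DbscanRole::Edge;  // in the kernel it joins the cluster of the closest such core track
  }
  return DbscanRole::Noise;
}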
diff --git a/RecoPixelVertexing/PixelVertexFinding/plugins/gpuClusterTracksIterative.h b/RecoPixelVertexing/PixelVertexFinding/plugins/gpuClusterTracksIterative.h
new file mode 100644
index 0000000000000..1f2934ba15d0c
--- /dev/null
+++ b/RecoPixelVertexing/PixelVertexFinding/plugins/gpuClusterTracksIterative.h
@@ -0,0 +1,212 @@
+#ifndef RecoPixelVertexing_PixelVertexFinding_plugins_gpuClusterTracksIterative_h
+#define RecoPixelVertexing_PixelVertexFinding_plugins_gpuClusterTracksIterative_h
+
+#include <algorithm>
+#include <cmath>
+#include <cstdint>
+
+#include "HeterogeneousCore/CUDAUtilities/interface/HistoContainer.h"
+#include "HeterogeneousCore/CUDAUtilities/interface/cuda_assert.h"
+
+#include "gpuVertexFinder.h"
+
+namespace gpuVertexFinder {
+
+ // this algo does not really scale as it works in a single block...
+ // enough for <10K tracks we have
+ __global__ void clusterTracksIterative(ZVertices* pdata,
+ WorkSpace* pws,
+ int minT, // min number of neighbours to be "core"
+ float eps, // max absolute distance to cluster
+ float errmax, // max error to be "seed"
+ float chi2max // max normalized distance to cluster
+ ) {
+ constexpr bool verbose = false; // in principle the compiler should optimize it out if false
+
+ if (verbose && 0 == threadIdx.x)
+ printf("params %d %f %f %f\n", minT, eps, errmax, chi2max);
+
+ auto er2mx = errmax * errmax;
+
+ auto& __restrict__ data = *pdata;
+ auto& __restrict__ ws = *pws;
+ auto nt = ws.ntrks;
+ float const* __restrict__ zt = ws.zt;
+ float const* __restrict__ ezt2 = ws.ezt2;
+
+ uint32_t& nvFinal = data.nvFinal;
+ uint32_t& nvIntermediate = ws.nvIntermediate;
+
+ uint8_t* __restrict__ izt = ws.izt;
+ int32_t* __restrict__ nn = data.ndof;
+ int32_t* __restrict__ iv = ws.iv;
+
+ assert(pdata);
+ assert(zt);
+
+ using Hist = cms::cuda::HistoContainer<uint8_t, 256, 16000, 8, uint16_t>;
+ __shared__ Hist hist;
+ __shared__ typename Hist::Counter hws[32];
+ for (auto j = threadIdx.x; j < Hist::totbins(); j += blockDim.x) {
+ hist.off[j] = 0;
+ }
+ __syncthreads();
+
+ if (verbose && 0 == threadIdx.x)
+ printf("booked hist with %d bins, size %d for %d tracks\n", hist.nbins(), hist.capacity(), nt);
+
+ assert(nt <= hist.capacity());
+
+ // fill hist (bin shall be wider than "eps")
+ for (auto i = threadIdx.x; i < nt; i += blockDim.x) {
+ assert(i < ZVertices::MAXTRACKS);
+ int iz = int(zt[i] * 10.); // valid if eps<=0.1
+ iz = std::clamp(iz, INT8_MIN, INT8_MAX);
+ izt[i] = iz - INT8_MIN;
+ assert(iz - INT8_MIN >= 0);
+ assert(iz - INT8_MIN < 256);
+ hist.count(izt[i]);
+ iv[i] = i;
+ nn[i] = 0;
+ }
+ __syncthreads();
+ if (threadIdx.x < 32)
+ hws[threadIdx.x] = 0; // used by prefix scan...
+ __syncthreads();
+ hist.finalize(hws);
+ __syncthreads();
+ assert(hist.size() == nt);
+ for (auto i = threadIdx.x; i < nt; i += blockDim.x) {
+ hist.fill(izt[i], uint16_t(i));
+ }
+ __syncthreads();
+
+ // count neighbours
+ for (auto i = threadIdx.x; i < nt; i += blockDim.x) {
+ if (ezt2[i] > er2mx)
+ continue;
+ auto loop = [&](uint32_t j) {
+ if (i == j)
+ return;
+ auto dist = std::abs(zt[i] - zt[j]);
+ if (dist > eps)
+ return;
+ if (dist * dist > chi2max * (ezt2[i] + ezt2[j]))
+ return;
+ nn[i]++;
+ };
+
+ cms::cuda::forEachInBins(hist, izt[i], 1, loop);
+ }
+
+ __shared__ int nloops;
+ nloops = 0;
+
+ __syncthreads();
+
+ // cluster seeds only
+ bool more = true;
+ while (__syncthreads_or(more)) {
+ if (1 == nloops % 2) {
+ for (auto i = threadIdx.x; i < nt; i += blockDim.x) {
+ auto m = iv[i];
+ while (m != iv[m])
+ m = iv[m];
+ iv[i] = m;
+ }
+ } else {
+ more = false;
+ for (auto k = threadIdx.x; k < hist.size(); k += blockDim.x) {
+ auto p = hist.begin() + k;
+ auto i = (*p);
+ auto be = std::min(Hist::bin(izt[i]) + 1, int(hist.nbins() - 1));
+ if (nn[i] < minT)
+ continue; // DBSCAN core rule
+ auto loop = [&](uint32_t j) {
+ assert(i != j);
+ if (nn[j] < minT)
+ return; // DBSCAN core rule
+ auto dist = std::abs(zt[i] - zt[j]);
+ if (dist > eps)
+ return;
+ if (dist * dist > chi2max * (ezt2[i] + ezt2[j]))
+ return;
+ auto old = atomicMin(&iv[j], iv[i]);
+ if (old != iv[i]) {
+ // end the loop only if no changes were applied
+ more = true;
+ }
+ atomicMin(&iv[i], old);
+ };
+ ++p;
+ for (; p < hist.end(be); ++p)
+ loop(*p);
+ } // for i
+ }
+ if (threadIdx.x == 0)
+ ++nloops;
+ } // while
+
+ // collect edges (assign to closest cluster of closest point??? here to closest point)
+ for (auto i = threadIdx.x; i < nt; i += blockDim.x) {
+ // if (nn[i]==0 || nn[i]>=minT) continue; // DBSCAN edge rule
+ if (nn[i] >= minT)
+ continue; // DBSCAN edge rule
+ float mdist = eps;
+ auto loop = [&](int j) {
+ if (nn[j] < minT)
+ return; // DBSCAN core rule
+ auto dist = std::abs(zt[i] - zt[j]);
+ if (dist > mdist)
+ return;
+ if (dist * dist > chi2max * (ezt2[i] + ezt2[j]))
+ return; // needed?
+ mdist = dist;
+ iv[i] = iv[j]; // assign to cluster (better be unique??)
+ };
+ cms::cuda::forEachInBins(hist, izt[i], 1, loop);
+ }
+
+ __shared__ unsigned int foundClusters;
+ foundClusters = 0;
+ __syncthreads();
+
+ // find the number of different clusters, identified by tracks with clus[i] == i;
+ // mark these tracks with a negative id.
+ for (auto i = threadIdx.x; i < nt; i += blockDim.x) {
+ if (iv[i] == int(i)) {
+ if (nn[i] >= minT) {
+ auto old = atomicInc(&foundClusters, 0xffffffff);
+ iv[i] = -(old + 1);
+ } else { // noise
+ iv[i] = -9998;
+ }
+ }
+ }
+ __syncthreads();
+
+ assert(foundClusters < ZVertices::MAXVTX);
+
+ // propagate the negative id to all the tracks in the cluster.
+ for (auto i = threadIdx.x; i < nt; i += blockDim.x) {
+ if (iv[i] >= 0) {
+ // mark each track in a cluster with the same id as the first one
+ iv[i] = iv[iv[i]];
+ }
+ }
+ __syncthreads();
+
+ // adjust the cluster id to be a positive value starting from 0
+ for (auto i = threadIdx.x; i < nt; i += blockDim.x) {
+ iv[i] = -iv[i] - 1;
+ }
+
+ nvIntermediate = nvFinal = foundClusters;
+
+ if (verbose && 0 == threadIdx.x)
+ printf("found %d proto vertices\n", foundClusters);
+ }
+
+} // namespace gpuVertexFinder
+
+#endif // RecoPixelVertexing_PixelVertexFinding_plugins_gpuClusterTracksIterative_h
diff --git a/RecoPixelVertexing/PixelVertexFinding/plugins/gpuFitVertices.h b/RecoPixelVertexing/PixelVertexFinding/plugins/gpuFitVertices.h
new file mode 100644
index 0000000000000..b8bbd0f601cb6
--- /dev/null
+++ b/RecoPixelVertexing/PixelVertexFinding/plugins/gpuFitVertices.h
@@ -0,0 +1,113 @@
+#ifndef RecoPixelVertexing_PixelVertexFinding_plugins_gpuFitVertices_h
+#define RecoPixelVertexing_PixelVertexFinding_plugins_gpuFitVertices_h
+
+#include <algorithm>
+#include <cmath>
+#include <cstdint>
+
+#include "HeterogeneousCore/CUDAUtilities/interface/HistoContainer.h"
+#include "HeterogeneousCore/CUDAUtilities/interface/cuda_assert.h"
+
+#include "gpuVertexFinder.h"
+
+namespace gpuVertexFinder {
+
+ __device__ __forceinline__ void fitVertices(ZVertices* pdata,
+ WorkSpace* pws,
+ float chi2Max // for outlier rejection
+ ) {
+ constexpr bool verbose = false; // in principle the compiler should optimize it out if false
+
+ auto& __restrict__ data = *pdata;
+ auto& __restrict__ ws = *pws;
+ auto nt = ws.ntrks;
+ float const* __restrict__ zt = ws.zt;
+ float const* __restrict__ ezt2 = ws.ezt2;
+ float* __restrict__ zv = data.zv;
+ float* __restrict__ wv = data.wv;
+ float* __restrict__ chi2 = data.chi2;
+ uint32_t& nvFinal = data.nvFinal;
+ uint32_t& nvIntermediate = ws.nvIntermediate;
+
+ int32_t* __restrict__ nn = data.ndof;
+ int32_t* __restrict__ iv = ws.iv;
+
+ assert(pdata);
+ assert(zt);
+
+ assert(nvFinal <= nvIntermediate);
+ nvFinal = nvIntermediate;
+ auto foundClusters = nvFinal;
+
+ // zero
+ for (auto i = threadIdx.x; i < foundClusters; i += blockDim.x) {
+ zv[i] = 0;
+ wv[i] = 0;
+ chi2[i] = 0;
+ }
+
+ // only for test
+ __shared__ int noise;
+ if (verbose && 0 == threadIdx.x)
+ noise = 0;
+
+ __syncthreads();
+
+ // compute cluster location
+ for (auto i = threadIdx.x; i < nt; i += blockDim.x) {
+ if (iv[i] > 9990) {
+ if (verbose)
+ atomicAdd(&noise, 1);
+ continue;
+ }
+ assert(iv[i] >= 0);
+ assert(iv[i] < int(foundClusters));
+ auto w = 1.f / ezt2[i];
+ atomicAdd(&zv[iv[i]], zt[i] * w);
+ atomicAdd(&wv[iv[i]], w);
+ }
+
+ __syncthreads();
+ // reuse nn
+ for (auto i = threadIdx.x; i < foundClusters; i += blockDim.x) {
+ assert(wv[i] > 0.f);
+ zv[i] /= wv[i];
+ nn[i] = -1; // ndof
+ }
+ __syncthreads();
+
+ // compute chi2
+ for (auto i = threadIdx.x; i < nt; i += blockDim.x) {
+ if (iv[i] > 9990)
+ continue;
+
+ auto c2 = zv[iv[i]] - zt[i];
+ c2 *= c2 / ezt2[i];
+ if (c2 > chi2Max) {
+ iv[i] = 9999;
+ continue;
+ }
+ atomicAdd(&chi2[iv[i]], c2);
+ atomicAdd(&nn[iv[i]], 1);
+ }
+ __syncthreads();
+ for (auto i = threadIdx.x; i < foundClusters; i += blockDim.x)
+ if (nn[i] > 0)
+ wv[i] *= float(nn[i]) / chi2[i];
+
+ if (verbose && 0 == threadIdx.x)
+ printf("found %d proto clusters ", foundClusters);
+ if (verbose && 0 == threadIdx.x)
+ printf("and %d noise\n", noise);
+ }
+
+ __global__ void fitVerticesKernel(ZVertices* pdata,
+ WorkSpace* pws,
+ float chi2Max // for outlier rejection
+ ) {
+ fitVertices(pdata, pws, chi2Max);
+ }
+
+} // namespace gpuVertexFinder
+
+#endif // RecoPixelVertexing_PixelVertexFinding_plugins_gpuFitVertices_h
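The fit is a plain 1/sigma^2-weighted mean in z, followed by an outlier-aware chi2: tracks contributing more than chi2Max are excluded from chi2 and ndof (the kernel flags them with iv = 9999), and the vertex weight is finally rescaled by ndof/chi2 so that 1/wv tracks the observed spread. A hypothetical single-vertex version:

#include <vector>

struct FittedVertex {
  float z, w, chi2;
  int ndof;
};

FittedVertex fitOne(std::vector<float> const& z, std::vector<float> const& ezt2, float chi2Max) {
  FittedVertex v{0.f, 0.f, 0.f, -1};  // ndof starts at -1, as in fitVertices()
  for (size_t i = 0; i < z.size(); ++i) {
    float w = 1.f / ezt2[i];
    v.z += z[i] * w;
    v.w += w;
  }
  v.z /= v.w;
  for (size_t i = 0; i < z.size(); ++i) {
    float c2 = (v.z - z[i]) * (v.z - z[i]) / ezt2[i];
    if (c2 > chi2Max)
      continue;  // outlier: excluded from chi2 and ndof
    v.chi2 += c2;
    ++v.ndof;
  }
  if (v.ndof > 0)
    v.w *= float(v.ndof) / v.chi2;  // inflate the error to the observed scatter
  return v;
}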
diff --git a/RecoPixelVertexing/PixelVertexFinding/plugins/gpuSortByPt2.h b/RecoPixelVertexing/PixelVertexFinding/plugins/gpuSortByPt2.h
new file mode 100644
index 0000000000000..841eab3901965
--- /dev/null
+++ b/RecoPixelVertexing/PixelVertexFinding/plugins/gpuSortByPt2.h
@@ -0,0 +1,73 @@
+#ifndef RecoPixelVertexing_PixelVertexFinding_plugins_gpuSortByPt2_h
+#define RecoPixelVertexing_PixelVertexFinding_plugins_gpuSortByPt2_h
+
+#include <algorithm>
+#include <cmath>
+#include <cstdint>
+
+#include "HeterogeneousCore/CUDAUtilities/interface/HistoContainer.h"
+#include "HeterogeneousCore/CUDAUtilities/interface/cuda_assert.h"
+#ifdef __CUDA_ARCH__
+#include "HeterogeneousCore/CUDAUtilities/interface/radixSort.h"
+#endif
+
+#include "gpuVertexFinder.h"
+
+namespace gpuVertexFinder {
+
+ __device__ __forceinline__ void sortByPt2(ZVertices* pdata, WorkSpace* pws) {
+ auto& __restrict__ data = *pdata;
+ auto& __restrict__ ws = *pws;
+ auto nt = ws.ntrks;
+ float const* __restrict__ ptt2 = ws.ptt2;
+ uint32_t const& nvFinal = data.nvFinal;
+
+ int32_t const* __restrict__ iv = ws.iv;
+ float* __restrict__ ptv2 = data.ptv2;
+ uint16_t* __restrict__ sortInd = data.sortInd;
+
+ // if (threadIdx.x == 0)
+ // printf("sorting %d vertices\n",nvFinal);
+
+ if (nvFinal < 1)
+ return;
+
+ // fill indexing
+ for (auto i = threadIdx.x; i < nt; i += blockDim.x) {
+ data.idv[ws.itrk[i]] = iv[i];
+ }
+
+ // can be done asynchronously at the end of the previous event
+ for (auto i = threadIdx.x; i < nvFinal; i += blockDim.x) {
+ ptv2[i] = 0;
+ }
+ __syncthreads();
+
+ for (auto i = threadIdx.x; i < nt; i += blockDim.x) {
+ if (iv[i] > 9990)
+ continue;
+ atomicAdd(&ptv2[iv[i]], ptt2[i]);
+ }
+ __syncthreads();
+
+ if (1 == nvFinal) {
+ if (threadIdx.x == 0)
+ sortInd[0] = 0;
+ return;
+ }
+#ifdef __CUDA_ARCH__
+ __shared__ uint16_t sws[1024];
+ // sort using only 16 bits
+ radixSort(ptv2, sortInd, sws, nvFinal);
+#else
+ for (uint16_t i = 0; i < nvFinal; ++i)
+ sortInd[i] = i;
+ std::sort(sortInd, sortInd + nvFinal, [&](auto i, auto j) { return ptv2[i] < ptv2[j]; });
+#endif
+ }
+
+ __global__ void sortByPt2Kernel(ZVertices* pdata, WorkSpace* pws) { sortByPt2(pdata, pws); }
+
+} // namespace gpuVertexFinder
+
+#endif // RecoPixelVertexing_PixelVertexFinding_plugins_gpuSortByPt2_h
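Since sortInd is filled in ascending sum-pt^2 order, a consumer looking for the most likely hard-scatter vertex reads the array from the back; a hypothetical one-liner:

#include <cstdint>
#include "CUDADataFormats/Vertex/interface/ZVertexSoA.h"

// Index (into zv, wv, chi2, ...) of the vertex with the largest sum pt^2.
// The caller must check that nvFinal > 0 first.
uint16_t primaryVertexIndex(ZVertexSoA const& soa) { return soa.sortInd[soa.nvFinal - 1]; }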
diff --git a/RecoPixelVertexing/PixelVertexFinding/plugins/gpuSplitVertices.h b/RecoPixelVertexing/PixelVertexFinding/plugins/gpuSplitVertices.h
new file mode 100644
index 0000000000000..0fe8bd882dcc5
--- /dev/null
+++ b/RecoPixelVertexing/PixelVertexFinding/plugins/gpuSplitVertices.h
@@ -0,0 +1,139 @@
+#ifndef RecoPixelVertexing_PixelVertexFinding_plugins_gpuSplitVertices_h
+#define RecoPixelVertexing_PixelVertexFinding_plugins_gpuSplitVertices_h
+
+#include <algorithm>
+#include <cmath>
+#include <cstdint>
+
+#include "HeterogeneousCore/CUDAUtilities/interface/HistoContainer.h"
+#include "HeterogeneousCore/CUDAUtilities/interface/cuda_assert.h"
+
+#include "gpuVertexFinder.h"
+
+namespace gpuVertexFinder {
+
+ __device__ __forceinline__ void splitVertices(ZVertices* pdata, WorkSpace* pws, float maxChi2) {
+ constexpr bool verbose = false; // in principle the compiler should optimize it out if false
+
+ auto& __restrict__ data = *pdata;
+ auto& __restrict__ ws = *pws;
+ auto nt = ws.ntrks;
+ float const* __restrict__ zt = ws.zt;
+ float const* __restrict__ ezt2 = ws.ezt2;
+ float* __restrict__ zv = data.zv;
+ float* __restrict__ wv = data.wv;
+ float const* __restrict__ chi2 = data.chi2;
+ uint32_t& nvFinal = data.nvFinal;
+
+ int32_t const* __restrict__ nn = data.ndof;
+ int32_t* __restrict__ iv = ws.iv;
+
+ assert(pdata);
+ assert(zt);
+
+ // one vertex per block
+ for (auto kv = blockIdx.x; kv < nvFinal; kv += gridDim.x) {
+ if (nn[kv] < 4)
+ continue;
+ if (chi2[kv] < maxChi2 * float(nn[kv]))
+ continue;
+
+ constexpr int MAXTK = 512;
+ assert(nn[kv] < MAXTK);
+ if (nn[kv] >= MAXTK)
+ continue; // too bad FIXME
+ __shared__ uint32_t it[MAXTK]; // track index
+ __shared__ float zz[MAXTK]; // z pos
+ __shared__ uint8_t newV[MAXTK]; // 0 or 1
+ __shared__ float ww[MAXTK]; // z weight
+
+ __shared__ uint32_t nq; // number of tracks for this vertex
+ nq = 0;
+ __syncthreads();
+
+ // copy to local
+ for (auto k = threadIdx.x; k < nt; k += blockDim.x) {
+ if (iv[k] == int(kv)) {
+ auto old = atomicInc(&nq, MAXTK);
+ zz[old] = zt[k] - zv[kv];
+ newV[old] = zz[old] < 0 ? 0 : 1;
+ ww[old] = 1.f / ezt2[k];
+ it[old] = k;
+ }
+ }
+
+ __shared__ float znew[2], wnew[2]; // the new vertices
+
+ __syncthreads();
+ assert(int(nq) == nn[kv] + 1);
+
+ int maxiter = 20;
+ // kt-min....
+ bool more = true;
+ while (__syncthreads_or(more)) {
+ more = false;
+ if (0 == threadIdx.x) {
+ znew[0] = 0;
+ znew[1] = 0;
+ wnew[0] = 0;
+ wnew[1] = 0;
+ }
+ __syncthreads();
+ for (auto k = threadIdx.x; k < nq; k += blockDim.x) {
+ auto i = newV[k];
+ atomicAdd(&znew[i], zz[k] * ww[k]);
+ atomicAdd(&wnew[i], ww[k]);
+ }
+ __syncthreads();
+ if (0 == threadIdx.x) {
+ znew[0] /= wnew[0];
+ znew[1] /= wnew[1];
+ }
+ __syncthreads();
+ for (auto k = threadIdx.x; k < nq; k += blockDim.x) {
+ auto d0 = fabs(zz[k] - znew[0]);
+ auto d1 = fabs(zz[k] - znew[1]);
+ auto newer = d0 < d1 ? 0 : 1;
+ more |= newer != newV[k];
+ newV[k] = newer;
+ }
+ --maxiter;
+ if (maxiter <= 0)
+ more = false;
+ }
+
+ // avoid empty vertices
+ if (0 == wnew[0] || 0 == wnew[1])
+ continue;
+
+ // quality cut
+ auto dist2 = (znew[0] - znew[1]) * (znew[0] - znew[1]);
+
+ auto chi2Dist = dist2 / (1.f / wnew[0] + 1.f / wnew[1]);
+
+ if (verbose && 0 == threadIdx.x)
+ printf("inter %d %f %f\n", 20 - maxiter, chi2Dist, dist2 * wv[kv]);
+
+ if (chi2Dist < 4)
+ continue;
+
+ // get a new global vertex
+ __shared__ uint32_t igv;
+ if (0 == threadIdx.x)
+ igv = atomicAdd(&ws.nvIntermediate, 1);
+ __syncthreads();
+ for (auto k = threadIdx.x; k < nq; k += blockDim.x) {
+ if (1 == newV[k])
+ iv[it[k]] = igv;
+ }
+
+ } // loop on vertices
+ }
+
+ __global__ void splitVerticesKernel(ZVertices* pdata, WorkSpace* pws, float maxChi2) {
+ splitVertices(pdata, pws, maxChi2);
+ }
+
+} // namespace gpuVertexFinder
+
+#endif // RecoPixelVertexing_PixelVertexFinding_plugins_gpuSplitVertices_h
diff --git a/RecoPixelVertexing/PixelVertexFinding/plugins/gpuVertexFinder.cc b/RecoPixelVertexing/PixelVertexFinding/plugins/gpuVertexFinder.cc
new file mode 100644
index 0000000000000..084763385bdb4
--- /dev/null
+++ b/RecoPixelVertexing/PixelVertexFinding/plugins/gpuVertexFinder.cc
@@ -0,0 +1 @@
+#include "gpuVertexFinderImpl.h"
diff --git a/RecoPixelVertexing/PixelVertexFinding/plugins/gpuVertexFinder.cu b/RecoPixelVertexing/PixelVertexFinding/plugins/gpuVertexFinder.cu
new file mode 100644
index 0000000000000..084763385bdb4
--- /dev/null
+++ b/RecoPixelVertexing/PixelVertexFinding/plugins/gpuVertexFinder.cu
@@ -0,0 +1 @@
+#include "gpuVertexFinderImpl.h"
diff --git a/RecoPixelVertexing/PixelVertexFinding/plugins/gpuVertexFinder.h b/RecoPixelVertexing/PixelVertexFinding/plugins/gpuVertexFinder.h
new file mode 100644
index 0000000000000..5f8238c3ea8c8
--- /dev/null
+++ b/RecoPixelVertexing/PixelVertexFinding/plugins/gpuVertexFinder.h
@@ -0,0 +1,83 @@
+#ifndef RecoPixelVertexing_PixelVertexFinding_plugins_gpuVertexFinder_h
+#define RecoPixelVertexing_PixelVertexFinding_plugins_gpuVertexFinder_h
+
+#include <cstddef>
+#include <cstdint>
+
+#include "CUDADataFormats/Vertex/interface/ZVertexHeterogeneous.h"
+
+namespace gpuVertexFinder {
+
+ using ZVertices = ZVertexSoA;
+ using TkSoA = pixelTrack::TrackSoA;
+
+ // workspace used in the vertex reco algos
+ struct WorkSpace {
+ static constexpr uint32_t MAXTRACKS = ZVertexSoA::MAXTRACKS;
+ static constexpr uint32_t MAXVTX = ZVertexSoA::MAXVTX;
+
+ uint32_t ntrks; // number of "selected tracks"
+ uint16_t itrk[MAXTRACKS]; // index of original track
+ float zt[MAXTRACKS]; // input track z at bs
+ float ezt2[MAXTRACKS]; // input error^2 on the above
+ float ptt2[MAXTRACKS]; // input pt^2 on the above
+    uint8_t izt[MAXTRACKS];   // integerized z-position of input tracks
+ int32_t iv[MAXTRACKS]; // vertex index for each associated track
+
+    uint32_t nvIntermediate;  // the number of vertices after splitting, pruning, etc.
+
+ __host__ __device__ void init() {
+ ntrks = 0;
+ nvIntermediate = 0;
+ }
+ };
+
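+  // initialize the vertex SoA and the workspace (run as a kernel on the GPU, called directly in the CPU build)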
+ __global__ void init(ZVertexSoA* pdata, WorkSpace* pws) {
+ pdata->init();
+ pws->init();
+ }
+
+ class Producer {
+ public:
+ using ZVertices = ZVertexSoA;
+ using WorkSpace = gpuVertexFinder::WorkSpace;
+ using TkSoA = pixelTrack::TrackSoA;
+
+ Producer(bool oneKernel,
+ bool useDensity,
+ bool useDBSCAN,
+ bool useIterative,
+ int iminT, // min number of neighbours to be "core"
+ float ieps, // max absolute distance to cluster
+ float ierrmax, // max error to be "seed"
+ float ichi2max // max normalized distance to cluster
+ )
+ : oneKernel_(oneKernel && !(useDBSCAN || useIterative)),
+ useDensity_(useDensity),
+ useDBSCAN_(useDBSCAN),
+ useIterative_(useIterative),
+ minT(iminT),
+ eps(ieps),
+ errmax(ierrmax),
+ chi2max(ichi2max) {}
+
+ ~Producer() = default;
+
+ ZVertexHeterogeneous makeAsync(cudaStream_t stream, TkSoA const* tksoa, float ptMin) const;
+ ZVertexHeterogeneous make(TkSoA const* tksoa, float ptMin) const;
+
+ private:
+ const bool oneKernel_;
+ const bool useDensity_;
+ const bool useDBSCAN_;
+ const bool useIterative_;
+
+ int minT; // min number of neighbours to be "core"
+ float eps; // max absolute distance to cluster
+ float errmax; // max error to be "seed"
+ float chi2max; // max normalized distance to cluster
+ };
+
+} // namespace gpuVertexFinder
+
+#endif // RecoPixelVertexing_PixelVertexFinding_plugins_gpuVertexFinder_h
diff --git a/RecoPixelVertexing/PixelVertexFinding/plugins/gpuVertexFinderImpl.h b/RecoPixelVertexing/PixelVertexFinding/plugins/gpuVertexFinderImpl.h
new file mode 100644
index 0000000000000..d685ced488233
--- /dev/null
+++ b/RecoPixelVertexing/PixelVertexFinding/plugins/gpuVertexFinderImpl.h
@@ -0,0 +1,192 @@
+#ifndef RecoPixelVertexing_PixelVertexFinding_plugins_gpuVertexFinderImpl_h
+#define RecoPixelVertexing_PixelVertexFinding_plugins_gpuVertexFinderImpl_h
+
+#include "HeterogeneousCore/CUDAUtilities/interface/cudaCheck.h"
+
+#include "gpuClusterTracksByDensity.h"
+#include "gpuClusterTracksDBSCAN.h"
+#include "gpuClusterTracksIterative.h"
+#include "gpuFitVertices.h"
+#include "gpuSortByPt2.h"
+#include "gpuSplitVertices.h"
+
+#undef PIXVERTEX_DEBUG_PRODUCE
+
+namespace gpuVertexFinder {
+
+ // reject outlier tracks that contribute more than this to the chi2 of the vertex fit
+ constexpr float maxChi2ForFirstFit = 50.f;
+ constexpr float maxChi2ForFinalFit = 5000.f;
+
+ // split vertices with a chi2/NDoF greater than this
+ constexpr float maxChi2ForSplit = 9.f;
+
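+  // Fill the vertexing workspace from the track SoA: reset the vertex index of
+  // every track, then keep only tracks with at least 4 hits, "loose" quality
+  // and pt above ptMin, storing their z at the beam spot, its squared error
+  // and their squared pt.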
+ __global__ void loadTracks(TkSoA const* ptracks, ZVertexSoA* soa, WorkSpace* pws, float ptMin) {
+ assert(ptracks);
+ assert(soa);
+ auto const& tracks = *ptracks;
+ auto const& fit = tracks.stateAtBS;
+ auto const* quality = tracks.qualityData();
+
+ auto first = blockIdx.x * blockDim.x + threadIdx.x;
+ for (int idx = first, nt = TkSoA::stride(); idx < nt; idx += gridDim.x * blockDim.x) {
+ auto nHits = tracks.nHits(idx);
+ if (nHits == 0)
+ break; // this is a guard: maybe we need to move to nTracks...
+
+ // initialize soa...
+ soa->idv[idx] = -1;
+
+ if (nHits < 4)
+ continue; // no triplets
+ if (quality[idx] != pixelTrack::Quality::loose)
+ continue;
+
+ auto pt = tracks.pt(idx);
+
+ if (pt < ptMin)
+ continue;
+
+ auto& data = *pws;
+ auto it = atomicAdd(&data.ntrks, 1);
+ data.itrk[it] = idx;
+ data.zt[it] = tracks.zip(idx);
+ data.ezt2[it] = fit.covariance(idx)(14);
+ data.ptt2[it] = pt * pt;
+ }
+ }
+
+// #define THREE_KERNELS
+#ifndef THREE_KERNELS
+ __global__ void vertexFinderOneKernel(gpuVertexFinder::ZVertices* pdata,
+ gpuVertexFinder::WorkSpace* pws,
+ int minT, // min number of neighbours to be "seed"
+ float eps, // max absolute distance to cluster
+ float errmax, // max error to be "seed"
+ float chi2max // max normalized distance to cluster,
+ ) {
+ clusterTracksByDensity(pdata, pws, minT, eps, errmax, chi2max);
+ __syncthreads();
+ fitVertices(pdata, pws, maxChi2ForFirstFit);
+ __syncthreads();
+ splitVertices(pdata, pws, maxChi2ForSplit);
+ __syncthreads();
+ fitVertices(pdata, pws, maxChi2ForFinalFit);
+ __syncthreads();
+ sortByPt2(pdata, pws);
+ }
+#else
+ __global__ void vertexFinderKernel1(gpuVertexFinder::ZVertices* pdata,
+ gpuVertexFinder::WorkSpace* pws,
+ int minT, // min number of neighbours to be "seed"
+ float eps, // max absolute distance to cluster
+ float errmax, // max error to be "seed"
+ float chi2max // max normalized distance to cluster,
+ ) {
+ clusterTracksByDensity(pdata, pws, minT, eps, errmax, chi2max);
+ __syncthreads();
+ fitVertices(pdata, pws, maxChi2ForFirstFit);
+ }
+
+ __global__ void vertexFinderKernel2(gpuVertexFinder::ZVertices* pdata, gpuVertexFinder::WorkSpace* pws) {
+ fitVertices(pdata, pws, maxChi2ForFinalFit);
+ __syncthreads();
+ sortByPt2(pdata, pws);
+ }
+#endif
+
+#ifdef __CUDACC__
+ ZVertexHeterogeneous Producer::makeAsync(cudaStream_t stream, TkSoA const* tksoa, float ptMin) const {
+#ifdef PIXVERTEX_DEBUG_PRODUCE
+ std::cout << "producing Vertices on GPU" << std::endl;
+#endif // PIXVERTEX_DEBUG_PRODUCE
+    ZVertexHeterogeneous vertices(cms::cuda::make_device_unique<ZVertexSoA>(stream));
+#else
+ ZVertexHeterogeneous Producer::make(TkSoA const* tksoa, float ptMin) const {
+#ifdef PIXVERTEX_DEBUG_PRODUCE
+ std::cout << "producing Vertices on CPU" << std::endl;
+#endif // PIXVERTEX_DEBUG_PRODUCE
+    ZVertexHeterogeneous vertices(std::make_unique<ZVertexSoA>());
+#endif
+ assert(tksoa);
+ auto* soa = vertices.get();
+ assert(soa);
+
+#ifdef __CUDACC__
+    auto ws_d = cms::cuda::make_device_unique<WorkSpace>(stream);
+#else
+    auto ws_d = std::make_unique<WorkSpace>();
+#endif
+
+#ifdef __CUDACC__
+ init<<<1, 1, 0, stream>>>(soa, ws_d.get());
+ auto blockSize = 128;
+ auto numberOfBlocks = (TkSoA::stride() + blockSize - 1) / blockSize;
+    loadTracks<<<numberOfBlocks, blockSize, 0, stream>>>(tksoa, soa, ws_d.get(), ptMin);
+ cudaCheck(cudaGetLastError());
+#else
+ init(soa, ws_d.get());
+ loadTracks(tksoa, soa, ws_d.get(), ptMin);
+#endif
+
+#ifdef __CUDACC__
+ // Running too many thread lead to problems when printf is enabled.
+ constexpr int maxThreadsForPrint = 1024 - 256;
+ constexpr int numBlocks = 1024;
+ constexpr int threadsPerBlock = 128;
+
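+    // With the fused kernel the whole chain (clusterize, fit, split, final fit,
+    // sort) runs in a single block; in the three- and five-kernel paths each
+    // step is a separate launch, and splitVerticesKernel gets many blocks since
+    // it processes one candidate vertex per block.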
+ if (oneKernel_) {
+      // implemented only for the density clusterizer
+#ifndef THREE_KERNELS
+ vertexFinderOneKernel<<<1, maxThreadsForPrint, 0, stream>>>(soa, ws_d.get(), minT, eps, errmax, chi2max);
+#else
+ vertexFinderKernel1<<<1, maxThreadsForPrint, 0, stream>>>(soa, ws_d.get(), minT, eps, errmax, chi2max);
+ cudaCheck(cudaGetLastError());
+ // one block per vertex...
+      splitVerticesKernel<<<numBlocks, threadsPerBlock, 0, stream>>>(soa, ws_d.get(), maxChi2ForSplit);
+ cudaCheck(cudaGetLastError());
+ vertexFinderKernel2<<<1, maxThreadsForPrint, 0, stream>>>(soa, ws_d.get());
+#endif
+ } else { // five kernels
+ if (useDensity_) {
+ clusterTracksByDensityKernel<<<1, maxThreadsForPrint, 0, stream>>>(soa, ws_d.get(), minT, eps, errmax, chi2max);
+ } else if (useDBSCAN_) {
+ clusterTracksDBSCAN<<<1, maxThreadsForPrint, 0, stream>>>(soa, ws_d.get(), minT, eps, errmax, chi2max);
+ } else if (useIterative_) {
+ clusterTracksIterative<<<1, maxThreadsForPrint, 0, stream>>>(soa, ws_d.get(), minT, eps, errmax, chi2max);
+ }
+ cudaCheck(cudaGetLastError());
+ fitVerticesKernel<<<1, maxThreadsForPrint, 0, stream>>>(soa, ws_d.get(), maxChi2ForFirstFit);
+ cudaCheck(cudaGetLastError());
+ // one block per vertex...
+      splitVerticesKernel<<<numBlocks, threadsPerBlock, 0, stream>>>(soa, ws_d.get(), maxChi2ForSplit);
+ cudaCheck(cudaGetLastError());
+ fitVerticesKernel<<<1, maxThreadsForPrint, 0, stream>>>(soa, ws_d.get(), maxChi2ForFinalFit);
+ cudaCheck(cudaGetLastError());
+ sortByPt2Kernel<<<1, maxThreadsForPrint, 0, stream>>>(soa, ws_d.get());
+ }
+ cudaCheck(cudaGetLastError());
+#else // __CUDACC__
+ if (useDensity_) {
+ clusterTracksByDensity(soa, ws_d.get(), minT, eps, errmax, chi2max);
+ } else if (useDBSCAN_) {
+ clusterTracksDBSCAN(soa, ws_d.get(), minT, eps, errmax, chi2max);
+ } else if (useIterative_) {
+ clusterTracksIterative(soa, ws_d.get(), minT, eps, errmax, chi2max);
+ }
+#ifdef PIXVERTEX_DEBUG_PRODUCE
+ std::cout << "found " << (*ws_d).nvIntermediate << " vertices " << std::endl;
+#endif // PIXVERTEX_DEBUG_PRODUCE
+ fitVertices(soa, ws_d.get(), maxChi2ForFirstFit);
+ // one block per vertex!
+ splitVertices(soa, ws_d.get(), maxChi2ForSplit);
+ fitVertices(soa, ws_d.get(), maxChi2ForFinalFit);
+ sortByPt2(soa, ws_d.get());
+#endif
+
+ return vertices;
+ }
+
+} // namespace gpuVertexFinder
+
+#endif // RecoPixelVertexing_PixelVertexFinding_plugins_gpuVertexFinderImpl_h
diff --git a/RecoPixelVertexing/PixelVertexFinding/python/PixelVertexes_cfi.py b/RecoPixelVertexing/PixelVertexFinding/python/PixelVertexes_cfi.py
index 77a9f367b9d9b..903c2a894ff86 100644
--- a/RecoPixelVertexing/PixelVertexFinding/python/PixelVertexes_cfi.py
+++ b/RecoPixelVertexing/PixelVertexFinding/python/PixelVertexes_cfi.py
@@ -18,5 +18,3 @@
refToPSet_ = cms.string('pvClusterComparer')
)
)
-
-
diff --git a/RecoPixelVertexing/PixelVertexFinding/test/BuildFile.xml b/RecoPixelVertexing/PixelVertexFinding/test/BuildFile.xml
index 0f4f4dee63832..f5c154b298574 100644
--- a/RecoPixelVertexing/PixelVertexFinding/test/BuildFile.xml
+++ b/RecoPixelVertexing/PixelVertexFinding/test/BuildFile.xml
@@ -2,8 +2,41 @@
-
+
-
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
diff --git a/RecoPixelVertexing/PixelVertexFinding/test/VertexFinder_t.h b/RecoPixelVertexing/PixelVertexFinding/test/VertexFinder_t.h
new file mode 100644
index 0000000000000..52253a1e4bbfe
--- /dev/null
+++ b/RecoPixelVertexing/PixelVertexFinding/test/VertexFinder_t.h
@@ -0,0 +1,347 @@
+#include <algorithm>
+#include <array>
+#include <cmath>
+#include <cstdint>
+#include <cstring>
+#include <iostream>
+#include <random>
+#include <vector>
+
+#include "HeterogeneousCore/CUDAUtilities/interface/cudaCheck.h"
+#include "HeterogeneousCore/CUDAUtilities/interface/requireDevices.h"
+#include "HeterogeneousCore/CUDAUtilities/interface/launch.h"
+#ifdef USE_DBSCAN
+#include "RecoPixelVertexing/PixelVertexFinding/plugins/gpuClusterTracksDBSCAN.h"
+#define CLUSTERIZE gpuVertexFinder::clusterTracksDBSCAN
+#elif USE_ITERATIVE
+#include "RecoPixelVertexing/PixelVertexFinding/plugins/gpuClusterTracksIterative.h"
+#define CLUSTERIZE gpuVertexFinder::clusterTracksIterative
+#else
+#include "RecoPixelVertexing/PixelVertexFinding/plugins/gpuClusterTracksByDensity.h"
+#define CLUSTERIZE gpuVertexFinder::clusterTracksByDensityKernel
+#endif
+#include "RecoPixelVertexing/PixelVertexFinding/plugins/gpuFitVertices.h"
+#include "RecoPixelVertexing/PixelVertexFinding/plugins/gpuSortByPt2.h"
+#include "RecoPixelVertexing/PixelVertexFinding/plugins/gpuSplitVertices.h"
+
+#ifdef ONE_KERNEL
+#ifdef __CUDACC__
+__global__ void vertexFinderOneKernel(gpuVertexFinder::ZVertices* pdata,
+ gpuVertexFinder::WorkSpace* pws,
+ int minT, // min number of neighbours to be "seed"
+ float eps, // max absolute distance to cluster
+ float errmax, // max error to be "seed"
+ float chi2max // max normalized distance to cluster,
+) {
+ clusterTracksByDensity(pdata, pws, minT, eps, errmax, chi2max);
+ __syncthreads();
+ fitVertices(pdata, pws, 50.);
+ __syncthreads();
+ splitVertices(pdata, pws, 9.f);
+ __syncthreads();
+ fitVertices(pdata, pws, 5000.);
+ __syncthreads();
+ sortByPt2(pdata, pws);
+}
+#endif
+#endif
+
+struct Event {
+  std::vector<float> zvert;
+  std::vector<uint16_t> itrack;
+  std::vector<float> ztrack;
+  std::vector<float> eztrack;
+  std::vector<float> pttrack;
+  std::vector<uint16_t> ivert;
+};
+
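+// Toy event generator: a Poisson number of vertices (mean nvert) at Gaussian z
+// (sigma 3.5), each with a Poisson number of tracks (mean ntrack) smeared by a
+// flat per-track resolution, plus a layer of flat-in-z noise tracks.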
+struct ClusterGenerator {
+ explicit ClusterGenerator(float nvert, float ntrack)
+ : rgen(-13., 13), errgen(0.005, 0.025), clusGen(nvert), trackGen(ntrack), gauss(0., 1.), ptGen(1.) {}
+
+ void operator()(Event& ev) {
+ int nclus = clusGen(reng);
+ ev.zvert.resize(nclus);
+ ev.itrack.resize(nclus);
+ for (auto& z : ev.zvert) {
+ z = 3.5f * gauss(reng);
+ }
+
+ ev.ztrack.clear();
+ ev.eztrack.clear();
+ ev.ivert.clear();
+ for (int iv = 0; iv < nclus; ++iv) {
+ auto nt = trackGen(reng);
+      ev.itrack[iv] = nt;
+ for (int it = 0; it < nt; ++it) {
+ auto err = errgen(reng); // reality is not flat....
+ ev.ztrack.push_back(ev.zvert[iv] + err * gauss(reng));
+ ev.eztrack.push_back(err * err);
+ ev.ivert.push_back(iv);
+ ev.pttrack.push_back((iv == 5 ? 1.f : 0.5f) + ptGen(reng));
+ ev.pttrack.back() *= ev.pttrack.back();
+ }
+ }
+ // add noise
+ auto nt = 2 * trackGen(reng);
+ for (int it = 0; it < nt; ++it) {
+ auto err = 0.03f;
+ ev.ztrack.push_back(rgen(reng));
+ ev.eztrack.push_back(err * err);
+ ev.ivert.push_back(9999);
+ ev.pttrack.push_back(0.5f + ptGen(reng));
+ ev.pttrack.back() *= ev.pttrack.back();
+ }
+ }
+
+ std::mt19937 reng;
+  std::uniform_real_distribution<float> rgen;
+  std::uniform_real_distribution<float> errgen;
+  std::poisson_distribution<int> clusGen;
+  std::poisson_distribution<int> trackGen;
+  std::normal_distribution<float> gauss;
+  std::exponential_distribution<float> ptGen;
+};
+
+// a macro SORRY
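+// compute the raw address of an individual SoA data member on the device (or
+// on the host in the CPU build), so single fields can be copied with
+// cudaMemcpy / memcpy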
+#define LOC_ONGPU(M) ((char*)(onGPU_d.get()) + offsetof(gpuVertexFinder::ZVertices, M))
+#define LOC_WS(M) ((char*)(ws_d.get()) + offsetof(gpuVertexFinder::WorkSpace, M))
+
+__global__ void print(gpuVertexFinder::ZVertices const* pdata, gpuVertexFinder::WorkSpace const* pws) {
+ auto const& __restrict__ data = *pdata;
+ auto const& __restrict__ ws = *pws;
+ printf("nt,nv %d %d,%d\n", ws.ntrks, data.nvFinal, ws.nvIntermediate);
+}
+
+int main() {
+#ifdef __CUDACC__
+ cms::cudatest::requireDevices();
+
+  auto onGPU_d = cms::cuda::make_device_unique<gpuVertexFinder::ZVertices[]>(1, nullptr);
+  auto ws_d = cms::cuda::make_device_unique<gpuVertexFinder::WorkSpace[]>(1, nullptr);
+#else
+  auto onGPU_d = std::make_unique<gpuVertexFinder::ZVertices>();
+  auto ws_d = std::make_unique<gpuVertexFinder::WorkSpace>();
+#endif
+
+ Event ev;
+
+ float eps = 0.1f;
+  std::array<float, 3> par{{eps, 0.01f, 9.0f}};
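+  // par = {eps, errmax, chi2max}, forwarded to the clusterizer as par[0..2]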
+ for (int nav = 30; nav < 80; nav += 20) {
+ ClusterGenerator gen(nav, 10);
+
+ for (int i = 8; i < 20; ++i) {
+ auto kk = i / 4; // M param
+
+ gen(ev);
+
+#ifdef __CUDACC__
+ init<<<1, 1, 0, 0>>>(onGPU_d.get(), ws_d.get());
+#else
+ onGPU_d->init();
+ ws_d->init();
+#endif
+
+ std::cout << "v,t size " << ev.zvert.size() << ' ' << ev.ztrack.size() << std::endl;
+ auto nt = ev.ztrack.size();
+#ifdef __CUDACC__
+ cudaCheck(cudaMemcpy(LOC_WS(ntrks), &nt, sizeof(uint32_t), cudaMemcpyHostToDevice));
+ cudaCheck(cudaMemcpy(LOC_WS(zt), ev.ztrack.data(), sizeof(float) * ev.ztrack.size(), cudaMemcpyHostToDevice));
+ cudaCheck(cudaMemcpy(LOC_WS(ezt2), ev.eztrack.data(), sizeof(float) * ev.eztrack.size(), cudaMemcpyHostToDevice));
+ cudaCheck(cudaMemcpy(LOC_WS(ptt2), ev.pttrack.data(), sizeof(float) * ev.eztrack.size(), cudaMemcpyHostToDevice));
+#else
+ ::memcpy(LOC_WS(ntrks), &nt, sizeof(uint32_t));
+ ::memcpy(LOC_WS(zt), ev.ztrack.data(), sizeof(float) * ev.ztrack.size());
+ ::memcpy(LOC_WS(ezt2), ev.eztrack.data(), sizeof(float) * ev.eztrack.size());
+ ::memcpy(LOC_WS(ptt2), ev.pttrack.data(), sizeof(float) * ev.eztrack.size());
+#endif
+
+ std::cout << "M eps, pset " << kk << ' ' << eps << ' ' << (i % 4) << std::endl;
+
+ if ((i % 4) == 0)
+ par = {{eps, 0.02f, 12.0f}};
+ if ((i % 4) == 1)
+ par = {{eps, 0.02f, 9.0f}};
+ if ((i % 4) == 2)
+ par = {{eps, 0.01f, 9.0f}};
+ if ((i % 4) == 3)
+ par = {{0.7f * eps, 0.01f, 9.0f}};
+
+ uint32_t nv = 0;
+#ifdef __CUDACC__
+ print<<<1, 1, 0, 0>>>(onGPU_d.get(), ws_d.get());
+ cudaCheck(cudaGetLastError());
+ cudaDeviceSynchronize();
+
+#ifdef ONE_KERNEL
+ cms::cuda::launch(vertexFinderOneKernel, {1, 512 + 256}, onGPU_d.get(), ws_d.get(), kk, par[0], par[1], par[2]);
+#else
+ cms::cuda::launch(CLUSTERIZE, {1, 512 + 256}, onGPU_d.get(), ws_d.get(), kk, par[0], par[1], par[2]);
+#endif
+ print<<<1, 1, 0, 0>>>(onGPU_d.get(), ws_d.get());
+
+ cudaCheck(cudaGetLastError());
+ cudaDeviceSynchronize();
+
+ cms::cuda::launch(gpuVertexFinder::fitVerticesKernel, {1, 1024 - 256}, onGPU_d.get(), ws_d.get(), 50.f);
+ cudaCheck(cudaGetLastError());
+ cudaCheck(cudaMemcpy(&nv, LOC_ONGPU(nvFinal), sizeof(uint32_t), cudaMemcpyDeviceToHost));
+
+#else
+ print(onGPU_d.get(), ws_d.get());
+ CLUSTERIZE(onGPU_d.get(), ws_d.get(), kk, par[0], par[1], par[2]);
+ print(onGPU_d.get(), ws_d.get());
+ fitVertices(onGPU_d.get(), ws_d.get(), 50.f);
+ nv = onGPU_d->nvFinal;
+#endif
+
+ if (nv == 0) {
+ std::cout << "NO VERTICES???" << std::endl;
+ continue;
+ }
+
+ float* zv = nullptr;
+ float* wv = nullptr;
+ float* ptv2 = nullptr;
+ int32_t* nn = nullptr;
+ uint16_t* ind = nullptr;
+
+ // keep chi2 separated...
+ float chi2[2 * nv]; // make space for splitting...
+
+#ifdef __CUDACC__
+ float hzv[2 * nv];
+ float hwv[2 * nv];
+ float hptv2[2 * nv];
+ int32_t hnn[2 * nv];
+ uint16_t hind[2 * nv];
+
+ zv = hzv;
+ wv = hwv;
+ ptv2 = hptv2;
+ nn = hnn;
+ ind = hind;
+#else
+ zv = onGPU_d->zv;
+ wv = onGPU_d->wv;
+ ptv2 = onGPU_d->ptv2;
+ nn = onGPU_d->ndof;
+ ind = onGPU_d->sortInd;
+#endif
+
+#ifdef __CUDACC__
+ cudaCheck(cudaMemcpy(nn, LOC_ONGPU(ndof), nv * sizeof(int32_t), cudaMemcpyDeviceToHost));
+ cudaCheck(cudaMemcpy(chi2, LOC_ONGPU(chi2), nv * sizeof(float), cudaMemcpyDeviceToHost));
+#else
+ memcpy(chi2, LOC_ONGPU(chi2), nv * sizeof(float));
+#endif
+
+ for (auto j = 0U; j < nv; ++j)
+ if (nn[j] > 0)
+ chi2[j] /= float(nn[j]);
+ {
+ auto mx = std::minmax_element(chi2, chi2 + nv);
+ std::cout << "after fit nv, min max chi2 " << nv << " " << *mx.first << ' ' << *mx.second << std::endl;
+ }
+
+#ifdef __CUDACC__
+ cms::cuda::launch(gpuVertexFinder::fitVerticesKernel, {1, 1024 - 256}, onGPU_d.get(), ws_d.get(), 50.f);
+ cudaCheck(cudaMemcpy(&nv, LOC_ONGPU(nvFinal), sizeof(uint32_t), cudaMemcpyDeviceToHost));
+ cudaCheck(cudaMemcpy(nn, LOC_ONGPU(ndof), nv * sizeof(int32_t), cudaMemcpyDeviceToHost));
+ cudaCheck(cudaMemcpy(chi2, LOC_ONGPU(chi2), nv * sizeof(float), cudaMemcpyDeviceToHost));
+#else
+ fitVertices(onGPU_d.get(), ws_d.get(), 50.f);
+ nv = onGPU_d->nvFinal;
+ memcpy(chi2, LOC_ONGPU(chi2), nv * sizeof(float));
+#endif
+
+ for (auto j = 0U; j < nv; ++j)
+ if (nn[j] > 0)
+ chi2[j] /= float(nn[j]);
+ {
+ auto mx = std::minmax_element(chi2, chi2 + nv);
+ std::cout << "before splitting nv, min max chi2 " << nv << " " << *mx.first << ' ' << *mx.second << std::endl;
+ }
+
+#ifdef __CUDACC__
+ // one vertex per block!!!
+ cms::cuda::launch(gpuVertexFinder::splitVerticesKernel, {1024, 64}, onGPU_d.get(), ws_d.get(), 9.f);
+ cudaCheck(cudaMemcpy(&nv, LOC_WS(nvIntermediate), sizeof(uint32_t), cudaMemcpyDeviceToHost));
+#else
+ splitVertices(onGPU_d.get(), ws_d.get(), 9.f);
+ nv = ws_d->nvIntermediate;
+#endif
+ std::cout << "after split " << nv << std::endl;
+
+#ifdef __CUDACC__
+ cms::cuda::launch(gpuVertexFinder::fitVerticesKernel, {1, 1024 - 256}, onGPU_d.get(), ws_d.get(), 5000.f);
+ cudaCheck(cudaGetLastError());
+
+ cms::cuda::launch(gpuVertexFinder::sortByPt2Kernel, {1, 256}, onGPU_d.get(), ws_d.get());
+ cudaCheck(cudaGetLastError());
+ cudaCheck(cudaMemcpy(&nv, LOC_ONGPU(nvFinal), sizeof(uint32_t), cudaMemcpyDeviceToHost));
+#else
+ fitVertices(onGPU_d.get(), ws_d.get(), 5000.f);
+ sortByPt2(onGPU_d.get(), ws_d.get());
+ nv = onGPU_d->nvFinal;
+ memcpy(chi2, LOC_ONGPU(chi2), nv * sizeof(float));
+#endif
+
+ if (nv == 0) {
+ std::cout << "NO VERTICES???" << std::endl;
+ continue;
+ }
+
+#ifdef __CUDACC__
+ cudaCheck(cudaMemcpy(zv, LOC_ONGPU(zv), nv * sizeof(float), cudaMemcpyDeviceToHost));
+ cudaCheck(cudaMemcpy(wv, LOC_ONGPU(wv), nv * sizeof(float), cudaMemcpyDeviceToHost));
+ cudaCheck(cudaMemcpy(chi2, LOC_ONGPU(chi2), nv * sizeof(float), cudaMemcpyDeviceToHost));
+ cudaCheck(cudaMemcpy(ptv2, LOC_ONGPU(ptv2), nv * sizeof(float), cudaMemcpyDeviceToHost));
+ cudaCheck(cudaMemcpy(nn, LOC_ONGPU(ndof), nv * sizeof(int32_t), cudaMemcpyDeviceToHost));
+ cudaCheck(cudaMemcpy(ind, LOC_ONGPU(sortInd), nv * sizeof(uint16_t), cudaMemcpyDeviceToHost));
+#endif
+ for (auto j = 0U; j < nv; ++j)
+ if (nn[j] > 0)
+ chi2[j] /= float(nn[j]);
+ {
+ auto mx = std::minmax_element(chi2, chi2 + nv);
+ std::cout << "nv, min max chi2 " << nv << " " << *mx.first << ' ' << *mx.second << std::endl;
+ }
+
+ {
+ auto mx = std::minmax_element(wv, wv + nv);
+ std::cout << "min max error " << 1. / std::sqrt(*mx.first) << ' ' << 1. / std::sqrt(*mx.second) << std::endl;
+ }
+
+ {
+ auto mx = std::minmax_element(ptv2, ptv2 + nv);
+ std::cout << "min max ptv2 " << *mx.first << ' ' << *mx.second << std::endl;
+ std::cout << "min max ptv2 " << ptv2[ind[0]] << ' ' << ptv2[ind[nv - 1]] << " at " << ind[0] << ' '
+ << ind[nv - 1] << std::endl;
+ }
+
+ float dd[nv];
+ for (auto kv = 0U; kv < nv; ++kv) {
+ auto zr = zv[kv];
+ auto md = 500.0f;
+ for (auto zs : ev.ztrack) {
+ auto d = std::abs(zr - zs);
+ md = std::min(d, md);
+ }
+ dd[kv] = md;
+ }
+ if (i == 6) {
+ for (auto d : dd)
+ std::cout << d << ' ';
+ std::cout << std::endl;
+ }
+ auto mx = std::minmax_element(dd, dd + nv);
+ float rms = 0;
+ for (auto d : dd)
+ rms += d * d;
+ rms = std::sqrt(rms) / (nv - 1);
+ std::cout << "min max rms " << *mx.first << ' ' << *mx.second << ' ' << rms << std::endl;
+
+ } // loop on events
+  } // loop on average number of vertices
+
+ return 0;
+}
diff --git a/RecoPixelVertexing/PixelVertexFinding/test/cpuVertexFinder_t.cpp b/RecoPixelVertexing/PixelVertexFinding/test/cpuVertexFinder_t.cpp
new file mode 100644
index 0000000000000..a7906fe0d03f5
--- /dev/null
+++ b/RecoPixelVertexing/PixelVertexFinding/test/cpuVertexFinder_t.cpp
@@ -0,0 +1 @@
+#include "VertexFinder_t.h"
diff --git a/RecoPixelVertexing/PixelVertexFinding/test/gpuVertexFinder_t.cu b/RecoPixelVertexing/PixelVertexFinding/test/gpuVertexFinder_t.cu
new file mode 100644
index 0000000000000..a7906fe0d03f5
--- /dev/null
+++ b/RecoPixelVertexing/PixelVertexFinding/test/gpuVertexFinder_t.cu
@@ -0,0 +1 @@
+#include "VertexFinder_t.h"