From 81bc63b9abeefa5d71f1209a792ddd72b7866e87 Mon Sep 17 00:00:00 2001
From: AdrianoDee <adriano.diflorio@ba.infn.it>
Date: Tue, 19 Apr 2022 09:18:10 +0200
Subject: [PATCH 1/2] GPUvsCPU DQM for pixels

- add a GPU-to-CPU copy of the pixel hits SoA;
- update the pixel-only workflows accordingly (.502, .503).
---
 .../interface/TrackingRecHit2DHeterogeneous.h | 142 +++++++++++++++---
 .../src/TrackingRecHit2DHeterogeneous.cc      |  14 ++
 .../plugins/SiPixelPhase1MonitorRecHitsSoA.cc |   4 +-
 .../plugins/SiPixelPhase1MonitorTrackSoA.cc   |   2 +-
 ...ixelPhase1HeterogenousDQM_FirstStep_cff.py |   6 +-
 .../plugins/SiPixelRecHitSoAFromCUDA.cc       |  91 +++++++++++
 .../python/SiPixelRecHits_cfi.py              |  31 +++-
 .../python/RecoPixelVertexing_cff.py          |   7 +
 .../python/PixelTracks_cff.py                 |  17 ++-
 9 files changed, 268 insertions(+), 46 deletions(-)
 create mode 100644 RecoLocalTracker/SiPixelRecHits/plugins/SiPixelRecHitSoAFromCUDA.cc
diff --git a/CUDADataFormats/TrackingRecHit/interface/TrackingRecHit2DHeterogeneous.h b/CUDADataFormats/TrackingRecHit/interface/TrackingRecHit2DHeterogeneous.h
index d85673238942b..8ce37f280ac6c 100644
--- a/CUDADataFormats/TrackingRecHit/interface/TrackingRecHit2DHeterogeneous.h
+++ b/CUDADataFormats/TrackingRecHit/interface/TrackingRecHit2DHeterogeneous.h
@@ -8,6 +8,27 @@
 template <typename Traits>
 class TrackingRecHit2DHeterogeneous {
 public:
+  enum class Storage32 {  // column index in m_store32: column i starts at m_store32.get() + i * nHits
+    kXLocal = 0,          // view->m_xl
+    kYLocal = 1,          // view->m_yl
+    kXerror = 2,          // view->m_xerr
+    kYerror = 3,          // view->m_yerr
+    kCharge = 4,          // view->m_chargeAndStatus (reinterpreted as uint32_t)
+    kXGlobal = 5,         // view->m_xg
+    kYGlobal = 6,         // view->m_yg
+    kZGlobal = 7,         // view->m_zg
+    kRGlobal = 8,         // view->m_rg
+    kPhiStorage = 9,      // m_phiBinnerStorage (reinterpreted as PhiBinner::index_type)
+    kLayers = 10          // m_hitsLayerStart (reinterpreted as uint32_t); must stay equal to n32
+  };
+
+  enum class Storage16 {  // column index in m_store16: column i starts at m_store16.get() + i * nHits
+    kDetId = 0,           // view->m_detInd
+    kPhi = 1,             // m_iphi (reinterpreted as int16_t)
+    kXSize = 2,           // view->m_xsize (reinterpreted as int16_t)
+    kYSize = 3,           // view->m_ysize (reinterpreted as int16_t)
+  };
+
   template <typename T>
   using unique_ptr = typename Traits::template unique_ptr<T>;
 
@@ -24,6 +45,8 @@ class TrackingRecHit2DHeterogeneous {
       cudaStream_t stream,
       TrackingRecHit2DHeterogeneous<cms::cudacompat::GPUTraits> const* input = nullptr);
 
+  explicit TrackingRecHit2DHeterogeneous(
+      float* store32, uint16_t* store16, uint32_t* modules, int nHits, cudaStream_t stream = nullptr);
   ~TrackingRecHit2DHeterogeneous() = default;
 
   TrackingRecHit2DHeterogeneous(const TrackingRecHit2DHeterogeneous&) = delete;
@@ -44,10 +67,13 @@ class TrackingRecHit2DHeterogeneous {
   auto phiBinnerStorage() { return m_phiBinnerStorage; }
   auto iphi() { return m_iphi; }
 
-  // only the local coord and detector index
   cms::cuda::host::unique_ptr<float[]> localCoordToHostAsync(cudaStream_t stream) const;
+
   cms::cuda::host::unique_ptr<uint32_t[]> hitsModuleStartToHostAsync(cudaStream_t stream) const;
 
+  cms::cuda::host::unique_ptr<uint16_t[]> store16ToHostAsync(cudaStream_t stream) const;
+  cms::cuda::host::unique_ptr<float[]> store32ToHostAsync(cudaStream_t stream) const;
+
   // needs specialization for Host
   void copyFromGPU(TrackingRecHit2DHeterogeneous<cms::cudacompat::GPUTraits> const* input, cudaStream_t stream);
 
@@ -55,7 +81,7 @@ class TrackingRecHit2DHeterogeneous {
   static constexpr uint32_t n16 = 4;                 // number of elements in m_store16
   static constexpr uint32_t n32 = 10;                // number of elements in m_store32
   static_assert(sizeof(uint32_t) == sizeof(float));  // just stating the obvious
-
+  static_assert(n32 == static_cast<uint32_t>(Storage32::kLayers));
   unique_ptr<uint16_t[]> m_store16;  //!
   unique_ptr<float[]> m_store32;     //!
 
@@ -108,7 +134,7 @@ TrackingRecHit2DHeterogeneous<Traits>::TrackingRecHit2DHeterogeneous(
 
   // if empy do not bother
   if (0 == nHits) {
-    if constexpr (std::is_same<Traits, cms::cudacompat::GPUTraits>::value) {
+    if constexpr (std::is_same_v<Traits, cms::cudacompat::GPUTraits>) {
       cms::cuda::copyAsync(m_view, view, stream);
     } else {
       m_view.reset(view.release());  // NOLINT: std::move() breaks CUDA version
@@ -123,7 +149,7 @@ TrackingRecHit2DHeterogeneous<Traits>::TrackingRecHit2DHeterogeneous(
   // so unless proven VERY inefficient we keep it ordered as generated
 
   // host copy is "reduced"  (to be reviewed at some point)
-  if constexpr (std::is_same<Traits, cms::cudacompat::HostTraits>::value) {
+  if constexpr (std::is_same_v<Traits, cms::cudacompat::HostTraits>) {
     // it has to compile for ALL cases
     copyFromGPU(input, stream);
   } else {
@@ -139,43 +165,113 @@ TrackingRecHit2DHeterogeneous<Traits>::TrackingRecHit2DHeterogeneous(
   static_assert(sizeof(TrackingRecHit2DSOAView::hindex_type) == sizeof(float));
   static_assert(sizeof(TrackingRecHit2DSOAView::hindex_type) == sizeof(TrackingRecHit2DSOAView::PhiBinner::index_type));
 
-  auto get32 = [&](int i) { return m_store32.get() + i * nHits; };
+  auto get32 = [&](Storage32 i) { return m_store32.get() + static_cast<int>(i) * nHits; };
 
   // copy all the pointers
   m_phiBinner = view->m_phiBinner = m_PhiBinnerStore.get();
   m_phiBinnerStorage = view->m_phiBinnerStorage =
-      reinterpret_cast<TrackingRecHit2DSOAView::PhiBinner::index_type*>(get32(9));
+      reinterpret_cast<TrackingRecHit2DSOAView::PhiBinner::index_type*>(get32(Storage32::kPhiStorage));
 
-  view->m_xl = get32(0);
-  view->m_yl = get32(1);
-  view->m_xerr = get32(2);
-  view->m_yerr = get32(3);
-  view->m_chargeAndStatus = reinterpret_cast<uint32_t*>(get32(4));
+  view->m_xl = get32(Storage32::kXLocal);
+  view->m_yl = get32(Storage32::kYLocal);
+  view->m_xerr = get32(Storage32::kXerror);
+  view->m_yerr = get32(Storage32::kYerror);
+  view->m_chargeAndStatus = reinterpret_cast<uint32_t*>(get32(Storage32::kCharge));
 
-  if constexpr (!std::is_same<Traits, cms::cudacompat::HostTraits>::value) {
+  if constexpr (!std::is_same_v<Traits, cms::cudacompat::HostTraits>) {
     assert(input == nullptr);
-    view->m_xg = get32(5);
-    view->m_yg = get32(6);
-    view->m_zg = get32(7);
-    view->m_rg = get32(8);
+    view->m_xg = get32(Storage32::kXGlobal);
+    view->m_yg = get32(Storage32::kYGlobal);
+    view->m_zg = get32(Storage32::kZGlobal);
+    view->m_rg = get32(Storage32::kRGlobal);
 
-    auto get16 = [&](int i) { return m_store16.get() + i * nHits; };
-    m_iphi = view->m_iphi = reinterpret_cast<int16_t*>(get16(1));
+    auto get16 = [&](Storage16 i) { return m_store16.get() + static_cast<int>(i) * nHits; };
+    m_iphi = view->m_iphi = reinterpret_cast<int16_t*>(get16(Storage16::kPhi));
 
-    view->m_xsize = reinterpret_cast<int16_t*>(get16(2));
-    view->m_ysize = reinterpret_cast<int16_t*>(get16(3));
-    view->m_detInd = get16(0);
+    view->m_xsize = reinterpret_cast<int16_t*>(get16(Storage16::kXSize));
+    view->m_ysize = reinterpret_cast<int16_t*>(get16(Storage16::kYSize));
+    view->m_detInd = get16(Storage16::kDetId);
 
     m_phiBinner = view->m_phiBinner = m_PhiBinnerStore.get();
-    m_hitsLayerStart = view->m_hitsLayerStart = reinterpret_cast<uint32_t*>(get32(n32));
+    m_hitsLayerStart = view->m_hitsLayerStart = reinterpret_cast<uint32_t*>(get32(Storage32::kLayers));
   }
 
   // transfer view
-  if constexpr (std::is_same<Traits, cms::cudacompat::GPUTraits>::value) {
+  if constexpr (std::is_same_v<Traits, cms::cudacompat::GPUTraits>) {
     cms::cuda::copyAsync(m_view, view, stream);
   } else {
     m_view.reset(view.release());  // NOLINT: std::move() breaks CUDA version
   }
 }
 
+// Intended for the CPU SoA only (kept generic): copies store32/store16 into owned storage, modules is not copied.
+template <typename Traits>
+TrackingRecHit2DHeterogeneous<Traits>::TrackingRecHit2DHeterogeneous(
+    float* store32, uint16_t* store16, uint32_t* modules, int nHits, cudaStream_t stream)
+    : m_nHits(nHits), m_hitsModuleStart(modules) {
+  auto view = Traits::template make_host_unique<TrackingRecHit2DSOAView>(stream);
+
+  m_view = Traits::template make_unique<TrackingRecHit2DSOAView>(stream);
+
+  view->m_nHits = nHits;
+
+  if (0 == nHits) {  // empty collection: publish a view carrying only m_nHits and allocate nothing
+    if constexpr (std::is_same_v<Traits, cms::cudacompat::GPUTraits>) {
+      cms::cuda::copyAsync(m_view, view, stream);
+    } else {
+      m_view = std::move(view);
+    }
+    return;
+  }
+
+  m_store16 = Traits::template make_unique<uint16_t[]>(nHits * n16, stream);
+  m_store32 = Traits::template make_unique<float[]>(nHits * n32, stream);
+  m_PhiBinnerStore = Traits::template make_unique<TrackingRecHit2DSOAView::PhiBinner>(stream);
+  m_AverageGeometryStore = Traits::template make_unique<TrackingRecHit2DSOAView::AverageGeometry>(stream);
+
+  view->m_averageGeometry = m_AverageGeometryStore.get();
+  view->m_hitsModuleStart = m_hitsModuleStart;
+
+  // copy the input buffers into the storage allocated above
+  if constexpr (std::is_same_v<Traits, cms::cudacompat::GPUTraits>) {
+    cms::cuda::copyAsync(m_store16, store16, stream);
+    cms::cuda::copyAsync(m_store32, store32, stream);
+  } else {
+    std::copy(store32, store32 + nHits * n32, m_store32.get());  // deep copy: the input buffer is not adopted
+    std::copy(store16, store16 + nHits * n16, m_store16.get());
+  }
+
+  // column accessors: column i of a store starts i * nHits elements after its beginning
+  auto get32 = [&](Storage32 i) { return m_store32.get() + static_cast<int>(i) * nHits; };
+  auto get16 = [&](Storage16 i) { return m_store16.get() + static_cast<int>(i) * nHits; };
+
+  // 32-bit columns; NOTE(review): m_hitsLayerStart is not set by this constructor - confirm it is not needed
+  view->m_xl = get32(Storage32::kXLocal);
+  view->m_yl = get32(Storage32::kYLocal);
+  view->m_xerr = get32(Storage32::kXerror);
+  view->m_yerr = get32(Storage32::kYerror);
+  view->m_chargeAndStatus = reinterpret_cast<uint32_t*>(get32(Storage32::kCharge));
+  view->m_xg = get32(Storage32::kXGlobal);
+  view->m_yg = get32(Storage32::kYGlobal);
+  view->m_zg = get32(Storage32::kZGlobal);
+  view->m_rg = get32(Storage32::kRGlobal);
+
+  m_phiBinner = view->m_phiBinner = m_PhiBinnerStore.get();
+  m_phiBinnerStorage = view->m_phiBinnerStorage =
+      reinterpret_cast<TrackingRecHit2DSOAView::PhiBinner::index_type*>(get32(Storage32::kPhiStorage));
+
+  // 16-bit columns
+  view->m_detInd = get16(Storage16::kDetId);
+  m_iphi = view->m_iphi = reinterpret_cast<int16_t*>(get16(Storage16::kPhi));
+  view->m_xsize = reinterpret_cast<int16_t*>(get16(Storage16::kXSize));
+  view->m_ysize = reinterpret_cast<int16_t*>(get16(Storage16::kYSize));
+
+  // publish the filled view: copyAsync into m_view for GPUTraits, moved into m_view otherwise
+  if constexpr (std::is_same_v<Traits, cms::cudacompat::GPUTraits>) {
+    cms::cuda::copyAsync(m_view, view, stream);
+  } else {
+    m_view = std::move(view);
+  }
+}
+
 #endif  // CUDADataFormats_TrackingRecHit_interface_TrackingRecHit2DHeterogeneous_h
diff --git a/CUDADataFormats/TrackingRecHit/src/TrackingRecHit2DHeterogeneous.cc b/CUDADataFormats/TrackingRecHit/src/TrackingRecHit2DHeterogeneous.cc
index 54622fcf62553..fc6a05ba9ed3e 100644
--- a/CUDADataFormats/TrackingRecHit/src/TrackingRecHit2DHeterogeneous.cc
+++ b/CUDADataFormats/TrackingRecHit/src/TrackingRecHit2DHeterogeneous.cc
@@ -11,6 +11,20 @@ cms::cuda::host::unique_ptr<float[]> TrackingRecHit2DGPU::localCoordToHostAsync(
   return ret;
 }
 
+template <>  // GPU flavour only: copies all n32 * nHits() floats of m_store32 into a new host buffer, on stream
+cms::cuda::host::unique_ptr<float[]> TrackingRecHit2DGPU::store32ToHostAsync(cudaStream_t stream) const {
+  auto ret = cms::cuda::make_host_unique<float[]>(static_cast<int>(n32) * nHits(), stream);
+  cms::cuda::copyAsync(ret, m_store32, static_cast<int>(n32) * nHits(), stream);
+  return ret;
+}
+
+template <>  // GPU flavour only: copies all n16 * nHits() values of m_store16 into a new host buffer, on stream
+cms::cuda::host::unique_ptr<uint16_t[]> TrackingRecHit2DGPU::store16ToHostAsync(cudaStream_t stream) const {
+  auto ret = cms::cuda::make_host_unique<uint16_t[]>(static_cast<int>(n16) * nHits(), stream);
+  cms::cuda::copyAsync(ret, m_store16, static_cast<int>(n16) * nHits(), stream);
+  return ret;
+}
+
 template <>
 cms::cuda::host::unique_ptr<uint32_t[]> TrackingRecHit2DGPU::hitsModuleStartToHostAsync(cudaStream_t stream) const {
   auto ret = cms::cuda::make_host_unique<uint32_t[]>(nMaxModules() + 1, stream);
diff --git a/DQM/SiPixelPhase1Heterogeneous/plugins/SiPixelPhase1MonitorRecHitsSoA.cc b/DQM/SiPixelPhase1Heterogeneous/plugins/SiPixelPhase1MonitorRecHitsSoA.cc
index f1d4894dc35db..df766e9156cf8 100644
--- a/DQM/SiPixelPhase1Heterogeneous/plugins/SiPixelPhase1MonitorRecHitsSoA.cc
+++ b/DQM/SiPixelPhase1Heterogeneous/plugins/SiPixelPhase1MonitorRecHitsSoA.cc
@@ -3,7 +3,7 @@
 // Package:    SiPixelPhase1MonitorRecHitsSoA
 // Class:      SiPixelPhase1MonitorRecHitsSoA
 //
-/**\class SiPixelPhase1MonitorRecHitsSoA SiPixelPhase1MonitorRecHitsSoA.cc 
+/**\class SiPixelPhase1MonitorRecHitsSoA SiPixelPhase1MonitorRecHitsSoA.cc
 */
 //
 // Author: Suvankar Roy Chowdhury, Alessandro Rossi
@@ -97,7 +97,6 @@ void SiPixelPhase1MonitorRecHitsSoA::analyze(const edm::Event& iEvent, const edm
   }
   auto const& rhsoa = *rhsoaHandle;
   const TrackingRecHit2DSOAView* soa2d = rhsoa.view();
-
   uint32_t nHits_ = soa2d->nHits();
   hnHits->Fill(nHits_);
   auto detIds = tkGeom_->detUnitIds();
@@ -111,6 +110,7 @@ void SiPixelPhase1MonitorRecHitsSoA::analyze(const edm::Event& iEvent, const edm
     uint32_t charge = soa2d->charge(i);
     int16_t sizeX = std::ceil(float(std::abs(soa2d->clusterSizeX(i)) / 8.));
     int16_t sizeY = std::ceil(float(std::abs(soa2d->clusterSizeY(i)) / 8.));
+
     hBFposZP->Fill(zG, fphi);
     int16_t ysign = yG >= 0 ? 1 : -1;
     hBFposZR->Fill(zG, rG * ysign);
diff --git a/DQM/SiPixelPhase1Heterogeneous/plugins/SiPixelPhase1MonitorTrackSoA.cc b/DQM/SiPixelPhase1Heterogeneous/plugins/SiPixelPhase1MonitorTrackSoA.cc
index 47bde4f171ede..aac487b0bdf71 100644
--- a/DQM/SiPixelPhase1Heterogeneous/plugins/SiPixelPhase1MonitorTrackSoA.cc
+++ b/DQM/SiPixelPhase1Heterogeneous/plugins/SiPixelPhase1MonitorTrackSoA.cc
@@ -3,7 +3,7 @@
 // Package:    SiPixelPhase1MonitorTrackSoA
 // Class:      SiPixelPhase1MonitorTrackSoA
 //
-/**\class SiPixelPhase1MonitorTrackSoA SiPixelPhase1MonitorTrackSoA.cc 
+/**\class SiPixelPhase1MonitorTrackSoA SiPixelPhase1MonitorTrackSoA.cc
 */
 //
 // Author: Suvankar Roy Chowdhury
diff --git a/DQM/SiPixelPhase1Heterogeneous/python/SiPixelPhase1HeterogenousDQM_FirstStep_cff.py b/DQM/SiPixelPhase1Heterogeneous/python/SiPixelPhase1HeterogenousDQM_FirstStep_cff.py
index dc19a2318a08d..07915be92d413 100644
--- a/DQM/SiPixelPhase1Heterogeneous/python/SiPixelPhase1HeterogenousDQM_FirstStep_cff.py
+++ b/DQM/SiPixelPhase1Heterogeneous/python/SiPixelPhase1HeterogenousDQM_FirstStep_cff.py
@@ -4,13 +4,11 @@
 from DQM.SiPixelPhase1Heterogeneous.siPixelPhase1MonitorRecHitsSoA_cfi import *
 
 from Configuration.ProcessModifiers.gpu_cff import gpu
-gpu.toModify(siPixelPhase1MonitorRecHitsSoA, pixelHitsSrc = "siPixelRecHitsPreSplittingSoA")
-
+gpu.toModify(siPixelPhase1MonitorRecHitsSoA, pixelHitsSrc = "siPixelRecHitsPreSplittingSoA") #would be obsloete if .501 is dropped
 
 monitorpixelSoASource = cms.Sequence(siPixelPhase1MonitorRecHitsSoA * siPixelPhase1MonitorTrackSoA * siPixelPhase1MonitorVertexSoA)
 
-
-#Define the sequence for GPU vs CPU validation
+Define the sequence for GPU vs CPU validation
 #This should run:- individual monitor for the 2 collections + comparison module
 from DQM.SiPixelPhase1Heterogeneous.siPixelPhase1CompareTrackSoA_cfi import *
 from DQM.SiPixelPhase1Heterogeneous.siPixelPhase1CompareVertexSoA_cfi import *
diff --git a/RecoLocalTracker/SiPixelRecHits/plugins/SiPixelRecHitSoAFromCUDA.cc b/RecoLocalTracker/SiPixelRecHits/plugins/SiPixelRecHitSoAFromCUDA.cc
new file mode 100644
index 0000000000000..fda418320e70a
--- /dev/null
+++ b/RecoLocalTracker/SiPixelRecHits/plugins/SiPixelRecHitSoAFromCUDA.cc
@@ -0,0 +1,91 @@
+#include <cuda_runtime.h>
+
+#include <fmt/printf.h>
+
+#include "CUDADataFormats/Common/interface/HostProduct.h"
+#include "CUDADataFormats/Common/interface/Product.h"
+#include "CUDADataFormats/TrackingRecHit/interface/TrackingRecHit2DHeterogeneous.h"
+#include "DataFormats/Common/interface/DetSetVectorNew.h"
+#include "DataFormats/Common/interface/Handle.h"
+#include "DataFormats/SiPixelCluster/interface/SiPixelCluster.h"
+#include "DataFormats/TrackerRecHit2D/interface/SiPixelRecHitCollection.h"
+#include "FWCore/Framework/interface/Event.h"
+#include "FWCore/Framework/interface/EventSetup.h"
+#include "FWCore/Framework/interface/MakerMacros.h"
+#include "FWCore/Framework/interface/stream/EDProducer.h"
+#include "FWCore/MessageLogger/interface/MessageLogger.h"
+#include "FWCore/ParameterSet/interface/ConfigurationDescriptions.h"
+#include "FWCore/ParameterSet/interface/ParameterSet.h"
+#include "FWCore/ParameterSet/interface/ParameterSetDescription.h"
+#include "FWCore/Utilities/interface/InputTag.h"
+#include "Geometry/CommonDetUnit/interface/PixelGeomDetUnit.h"
+#include "Geometry/Records/interface/TrackerDigiGeometryRecord.h"
+#include "Geometry/TrackerGeometryBuilder/interface/TrackerGeometry.h"
+#include "HeterogeneousCore/CUDACore/interface/ScopedContext.h"
+#include "RecoLocalTracker/SiPixelRecHits/interface/pixelCPEforGPU.h"
+
+// Copies the pixel rechit SoA produced on the GPU into a TrackingRecHit2DCPU host product.
+class SiPixelRecHitSoAFromCUDA : public edm::stream::EDProducer<edm::ExternalWork> {
+public:
+  explicit SiPixelRecHitSoAFromCUDA(const edm::ParameterSet& iConfig);
+  ~SiPixelRecHitSoAFromCUDA() override = default;
+  static void fillDescriptions(edm::ConfigurationDescriptions& descriptions);
+  using HMSstorage = HostProduct<uint32_t[]>;
+
+private:
+  void acquire(edm::Event const& iEvent,
+               edm::EventSetup const& iSetup,
+               edm::WaitingTaskWithArenaHolder waitingTaskHolder) override;
+  void produce(edm::Event& iEvent, edm::EventSetup const& iSetup) override;
+
+  const edm::EDGetTokenT<cms::cuda::Product<TrackingRecHit2DGPU>> hitsTokenGPU_;  // CUDA hits
+  const edm::EDPutTokenT<TrackingRecHit2DCPU> hitsPutTokenCPU_;
+  const edm::EDPutTokenT<HMSstorage> hostPutToken_;
+  // per-event state, written by acquire() and read by produce(); zero until the first event is seen
+  uint32_t nHits_ = 0;
+  uint32_t nMaxModules_ = 0;
+
+  cms::cuda::host::unique_ptr<float[]> store32_;
+  cms::cuda::host::unique_ptr<uint16_t[]> store16_;
+  cms::cuda::host::unique_ptr<uint32_t[]> hitsModuleStart_;
+};
+
+SiPixelRecHitSoAFromCUDA::SiPixelRecHitSoAFromCUDA(const edm::ParameterSet& iConfig)
+    : hitsTokenGPU_(
+          consumes<cms::cuda::Product<TrackingRecHit2DGPU>>(iConfig.getParameter<edm::InputTag>("pixelRecHitSrc"))),
+      hitsPutTokenCPU_(produces<TrackingRecHit2DCPU>()),
+      hostPutToken_(produces<HMSstorage>()) {}
+
+void SiPixelRecHitSoAFromCUDA::fillDescriptions(edm::ConfigurationDescriptions& descriptions) {
+  edm::ParameterSetDescription desc;
+  desc.add<edm::InputTag>("pixelRecHitSrc", edm::InputTag("siPixelRecHitsPreSplittingCUDA"));
+  descriptions.addWithDefaultLabel(desc);
+}
+
+// Enqueue the device-to-host copies of the hit SoA buffers on the CUDA stream; produce() reads them.
+void SiPixelRecHitSoAFromCUDA::acquire(edm::Event const& iEvent,
+                                       edm::EventSetup const& iSetup,
+                                       edm::WaitingTaskWithArenaHolder waitingTaskHolder) {
+  cms::cuda::Product<TrackingRecHit2DGPU> const& inputDataWrapped = iEvent.get(hitsTokenGPU_);
+  cms::cuda::ScopedContextAcquire ctx{inputDataWrapped, std::move(waitingTaskHolder)};
+  auto const& inputData = ctx.get(inputDataWrapped);
+
+  nHits_ = inputData.nHits();
+  nMaxModules_ = inputData.nMaxModules();  // set for every event: produce() sizes its buffer with it
+  LogDebug("SiPixelRecHitSoAFromCUDA") << "copying to cpu SoA " << inputData.nHits() << " Hits";
+  if (0 == nHits_)
+    return;  // nothing to transfer: the host buffers keep their previous (possibly null) content
+  store32_ = inputData.store32ToHostAsync(ctx.stream());
+  store16_ = inputData.store16ToHostAsync(ctx.stream());
+  hitsModuleStart_ = inputData.hitsModuleStartToHostAsync(ctx.stream());
+}
+
+void SiPixelRecHitSoAFromCUDA::produce(edm::Event& iEvent, edm::EventSetup const& es) {
+  auto hmsp = std::make_unique<uint32_t[]>(nMaxModules_ + 1);  // value-initialised, i.e. all zeros
+  if (nHits_ > 0)  // empty event: acquire() copied nothing, so hitsModuleStart_ must not be read
+    std::copy(hitsModuleStart_.get(), hitsModuleStart_.get() + nMaxModules_ + 1, hmsp.get());
+  iEvent.emplace(hostPutToken_, std::move(hmsp));
+  iEvent.emplace(hitsPutTokenCPU_, store32_.get(), store16_.get(), hitsModuleStart_.get(), nHits_);
+}
+
+DEFINE_FWK_MODULE(SiPixelRecHitSoAFromCUDA);
diff --git a/RecoLocalTracker/SiPixelRecHits/python/SiPixelRecHits_cfi.py b/RecoLocalTracker/SiPixelRecHits/python/SiPixelRecHits_cfi.py
index b2be63a4b6216..781447c70b512 100644
--- a/RecoLocalTracker/SiPixelRecHits/python/SiPixelRecHits_cfi.py
+++ b/RecoLocalTracker/SiPixelRecHits/python/SiPixelRecHits_cfi.py
@@ -23,15 +23,18 @@
 
 # convert the pixel rechits from legacy to SoA format
 from RecoLocalTracker.SiPixelRecHits.siPixelRecHitSoAFromLegacy_cfi import siPixelRecHitSoAFromLegacy as _siPixelRecHitsPreSplittingSoA
-siPixelRecHitsPreSplittingSoA = _siPixelRecHitsPreSplittingSoA.clone(convertToLegacy=True)
+from RecoLocalTracker.SiPixelRecHits.siPixelRecHitSoAFromCUDA_cfi import siPixelRecHitSoAFromCUDA as _siPixelRecHitSoAFromCUDA
+
+siPixelRecHitsPreSplittingCPU = _siPixelRecHitsPreSplittingSoA.clone(convertToLegacy=True)
+
 # phase 2 tracker modifier
 from Configuration.Eras.Modifier_phase2_tracker_cff import phase2_tracker
-phase2_tracker.toModify(siPixelRecHitsPreSplittingSoA,
+phase2_tracker.toModify(siPixelRecHitsPreSplittingCPU,
     isPhase2 = True)
 # modifier used to prompt patatrack pixel tracks reconstruction on cpu
 from Configuration.ProcessModifiers.pixelNtupletFit_cff import pixelNtupletFit
 pixelNtupletFit.toModify(siPixelRecHitsPreSplitting,
-    cpu = siPixelRecHitsPreSplittingSoA.clone()
+    cpu = _siPixelRecHitsPreSplittingSoA.clone(convertToLegacy=True)
 )
 
 siPixelRecHitsPreSplittingTask = cms.Task(
@@ -48,9 +51,19 @@
 # transfer the pixel rechits to the host and convert them from SoA
 from RecoLocalTracker.SiPixelRecHits.siPixelRecHitFromCUDA_cfi import siPixelRecHitFromCUDA as _siPixelRecHitFromCUDA
 
+#this is an alias for the SoA on GPU or CPU to be used for DQM
+siPixelRecHitsPreSplittingSoA = SwitchProducerCUDA(
+    cpu = cms.EDAlias(
+            siPixelRecHitsPreSplittingCPU = cms.VPSet(
+                 cms.PSet(type = cms.string("cmscudacompatCPUTraitsTrackingRecHit2DHeterogeneous")),
+                 cms.PSet(type = cms.string("uintAsHostProduct"))
+             )),
+    cuda = _siPixelRecHitSoAFromCUDA.clone()
+)
+
 (gpu & pixelNtupletFit).toModify(siPixelRecHitsPreSplitting,
     cpu = cms.EDAlias(
-            siPixelRecHitsPreSplittingSoA = cms.VPSet(
+            siPixelRecHitsPreSplittingCPU = cms.VPSet(
                  cms.PSet(type = cms.string("SiPixelRecHitedmNewDetSetVector")),
                  cms.PSet(type = cms.string("uintAsHostProduct"))
              )
@@ -58,10 +71,12 @@
     cuda = _siPixelRecHitFromCUDA.clone())
 
 (gpu & pixelNtupletFit).toReplaceWith(siPixelRecHitsPreSplittingTask, cms.Task(
-    # reconstruct the pixel rechits on the gpu
+    # reconstruct the pixel rechits on the gpu or on the cpu
+    # (normally only one of the two is run because only one is consumed from later stages)
     siPixelRecHitsPreSplittingCUDA,
-    # producing and converting on cpu
-    siPixelRecHitsPreSplittingSoA,
+    siPixelRecHitsPreSplittingCPU,
     # SwitchProducer wrapping an EDAlias on cpu or the converter from SoA to legacy on gpu
-    siPixelRecHitsPreSplittingTask.copy()
+    siPixelRecHitsPreSplittingTask.copy(),
+    # SwitchProducer wrapping an EDAlias to the CPU SoA on cpu or the copy of the GPU SoA to the host on gpu
+    siPixelRecHitsPreSplittingSoA
 ))
diff --git a/RecoPixelVertexing/Configuration/python/RecoPixelVertexing_cff.py b/RecoPixelVertexing/Configuration/python/RecoPixelVertexing_cff.py
index 380586bba9bbc..6954b536aba1f 100644
--- a/RecoPixelVertexing/Configuration/python/RecoPixelVertexing_cff.py
+++ b/RecoPixelVertexing/Configuration/python/RecoPixelVertexing_cff.py
@@ -53,6 +53,13 @@
     )
 )
 
+## GPU vs CPU validation
+# force CPU vertexing to use track SoA from CPU chain and not the converted one from GPU chain
+from Configuration.ProcessModifiers.gpuValidationPixel_cff import gpuValidationPixel
+(pixelNtupletFit & gpu & gpuValidationPixel).toModify(pixelVerticesSoA.cpu,
+    pixelTrackSrc = "pixelTracksSoA@cpu"
+)
+
 (pixelNtupletFit & gpu).toReplaceWith(pixelVerticesTask, cms.Task(
     # build pixel vertices in SoA format on the GPU
     pixelVerticesCUDA,
diff --git a/RecoPixelVertexing/PixelTrackFitting/python/PixelTracks_cff.py b/RecoPixelVertexing/PixelTrackFitting/python/PixelTracks_cff.py
index 449d9cdfd084d..143b062eb7c9d 100644
--- a/RecoPixelVertexing/PixelTrackFitting/python/PixelTracks_cff.py
+++ b/RecoPixelVertexing/PixelTrackFitting/python/PixelTracks_cff.py
@@ -114,12 +114,6 @@
 ))
 
 (pixelNtupletFit & ~phase2_tracker).toReplaceWith(pixelTracksTask, cms.Task(
-    #pixelTracksTrackingRegions,
-    #pixelFitterByHelixProjections,
-    #pixelTrackFilterByKinematics,
-    #pixelTracksSeedLayers,
-    #pixelTracksHitDoublets,
-    #pixelTracksHitQuadruplets,
     # build the pixel ntuplets and the pixel tracks in SoA format on the GPU
     pixelTracksSoA,
     # convert the pixel tracks from SoA to legacy format
@@ -129,9 +123,9 @@
 
 # "Patatrack" sequence running on GPU (or CPU if not available)
 from Configuration.ProcessModifiers.gpu_cff import gpu
+
 (pixelNtupletFit & gpu).toModify(pixelTracksSoA.cpu,
-    pixelRecHitSrc = "siPixelRecHitsPreSplittingSoA",
-)
+    pixelRecHitSrc = "siPixelRecHitsPreSplittingSoA")
 
 # build the pixel ntuplets and pixel tracks in SoA format on the GPU
 pixelTracksCUDA = _pixelTracksCUDA.clone(
@@ -159,3 +153,10 @@
     # transfer the pixel tracks in SoA format to the CPU, and convert them to legacy format
     pixelTracksTask.copy()
 ))
+
+## GPU vs CPU validation
+# force CPU pixel tracking to use the hit SoA from the CPU chain and not the one converted from the GPU chain
+from Configuration.ProcessModifiers.gpuValidationPixel_cff import gpuValidationPixel
+(pixelNtupletFit & gpu & gpuValidationPixel).toModify(pixelTracksSoA.cpu,
+    pixelRecHitSrc = "siPixelRecHitsPreSplittingSoA@cpu"
+    )

From f8cb46e1c3c8db47e3209cdc67ca777823c1e629 Mon Sep 17 00:00:00 2001
From: AdrianoDee <adriano.diflorio@ba.infn.it>
Date: Fri, 29 Apr 2022 12:28:26 +0200
Subject: [PATCH 2/2] Clean up

---
 .../plugins/SiPixelPhase1MonitorRecHitsSoA.cc               | 4 ++--
 .../plugins/SiPixelPhase1MonitorTrackSoA.cc                 | 2 +-
 .../python/SiPixelPhase1HeterogenousDQM_FirstStep_cff.py    | 6 ++++--
 3 files changed, 7 insertions(+), 5 deletions(-)

diff --git a/DQM/SiPixelPhase1Heterogeneous/plugins/SiPixelPhase1MonitorRecHitsSoA.cc b/DQM/SiPixelPhase1Heterogeneous/plugins/SiPixelPhase1MonitorRecHitsSoA.cc
index df766e9156cf8..f1d4894dc35db 100644
--- a/DQM/SiPixelPhase1Heterogeneous/plugins/SiPixelPhase1MonitorRecHitsSoA.cc
+++ b/DQM/SiPixelPhase1Heterogeneous/plugins/SiPixelPhase1MonitorRecHitsSoA.cc
@@ -3,7 +3,7 @@
 // Package:    SiPixelPhase1MonitorRecHitsSoA
 // Class:      SiPixelPhase1MonitorRecHitsSoA
 //
-/**\class SiPixelPhase1MonitorRecHitsSoA SiPixelPhase1MonitorRecHitsSoA.cc
+/**\class SiPixelPhase1MonitorRecHitsSoA SiPixelPhase1MonitorRecHitsSoA.cc 
 */
 //
 // Author: Suvankar Roy Chowdhury, Alessandro Rossi
@@ -97,6 +97,7 @@ void SiPixelPhase1MonitorRecHitsSoA::analyze(const edm::Event& iEvent, const edm
   }
   auto const& rhsoa = *rhsoaHandle;
   const TrackingRecHit2DSOAView* soa2d = rhsoa.view();
+
   uint32_t nHits_ = soa2d->nHits();
   hnHits->Fill(nHits_);
   auto detIds = tkGeom_->detUnitIds();
@@ -110,7 +111,6 @@ void SiPixelPhase1MonitorRecHitsSoA::analyze(const edm::Event& iEvent, const edm
     uint32_t charge = soa2d->charge(i);
     int16_t sizeX = std::ceil(float(std::abs(soa2d->clusterSizeX(i)) / 8.));
     int16_t sizeY = std::ceil(float(std::abs(soa2d->clusterSizeY(i)) / 8.));
-
     hBFposZP->Fill(zG, fphi);
     int16_t ysign = yG >= 0 ? 1 : -1;
     hBFposZR->Fill(zG, rG * ysign);
diff --git a/DQM/SiPixelPhase1Heterogeneous/plugins/SiPixelPhase1MonitorTrackSoA.cc b/DQM/SiPixelPhase1Heterogeneous/plugins/SiPixelPhase1MonitorTrackSoA.cc
index aac487b0bdf71..47bde4f171ede 100644
--- a/DQM/SiPixelPhase1Heterogeneous/plugins/SiPixelPhase1MonitorTrackSoA.cc
+++ b/DQM/SiPixelPhase1Heterogeneous/plugins/SiPixelPhase1MonitorTrackSoA.cc
@@ -3,7 +3,7 @@
 // Package:    SiPixelPhase1MonitorTrackSoA
 // Class:      SiPixelPhase1MonitorTrackSoA
 //
-/**\class SiPixelPhase1MonitorTrackSoA SiPixelPhase1MonitorTrackSoA.cc
+/**\class SiPixelPhase1MonitorTrackSoA SiPixelPhase1MonitorTrackSoA.cc 
 */
 //
 // Author: Suvankar Roy Chowdhury
diff --git a/DQM/SiPixelPhase1Heterogeneous/python/SiPixelPhase1HeterogenousDQM_FirstStep_cff.py b/DQM/SiPixelPhase1Heterogeneous/python/SiPixelPhase1HeterogenousDQM_FirstStep_cff.py
index 07915be92d413..dc19a2318a08d 100644
--- a/DQM/SiPixelPhase1Heterogeneous/python/SiPixelPhase1HeterogenousDQM_FirstStep_cff.py
+++ b/DQM/SiPixelPhase1Heterogeneous/python/SiPixelPhase1HeterogenousDQM_FirstStep_cff.py
@@ -4,11 +4,13 @@
 from DQM.SiPixelPhase1Heterogeneous.siPixelPhase1MonitorRecHitsSoA_cfi import *
 
 from Configuration.ProcessModifiers.gpu_cff import gpu
-gpu.toModify(siPixelPhase1MonitorRecHitsSoA, pixelHitsSrc = "siPixelRecHitsPreSplittingSoA") #would be obsloete if .501 is dropped
+gpu.toModify(siPixelPhase1MonitorRecHitsSoA, pixelHitsSrc = "siPixelRecHitsPreSplittingSoA")
+
 
 monitorpixelSoASource = cms.Sequence(siPixelPhase1MonitorRecHitsSoA * siPixelPhase1MonitorTrackSoA * siPixelPhase1MonitorVertexSoA)
 
-Define the sequence for GPU vs CPU validation
+
+#Define the sequence for GPU vs CPU validation
 #This should run:- individual monitor for the 2 collections + comparison module
 from DQM.SiPixelPhase1Heterogeneous.siPixelPhase1CompareTrackSoA_cfi import *
 from DQM.SiPixelPhase1Heterogeneous.siPixelPhase1CompareVertexSoA_cfi import *