cms-sw · cmsbuild · May 3, 2022 · Apr 19, 2022 · Apr 29, 2022
diff --git a/CUDADataFormats/TrackingRecHit/interface/TrackingRecHit2DHeterogeneous.h b/CUDADataFormats/TrackingRecHit/interface/TrackingRecHit2DHeterogeneous.h
@@ -8,6 +8,27 @@
 template <typename Traits>
 class TrackingRecHit2DHeterogeneous {
 public:
+  enum class Storage32 {  // column index into m_store32; column c starts at element c * nHits
+    kXLocal = 0,      // view->m_xl
+    kYLocal = 1,      // view->m_yl
+    kXerror = 2,      // view->m_xerr
+    kYerror = 3,      // view->m_yerr
+    kCharge = 4,      // view->m_chargeAndStatus (accessed as uint32_t)
+    kXGlobal = 5,     // view->m_xg
+    kYGlobal = 6,     // view->m_yg
+    kZGlobal = 7,     // view->m_zg
+    kRGlobal = 8,     // view->m_rg
+    kPhiStorage = 9,  // view->m_phiBinnerStorage (accessed as PhiBinner::index_type)
+    kLayers = 10      // must equal n32: offset of m_hitsLayerStart, right after the n32 per-hit columns
+  };
+
+  enum class Storage16 {  // column index into m_store16; column c starts at element c * nHits
+    kDetId = 0,  // view->m_detInd
+    kPhi = 1,    // view->m_iphi (accessed as int16_t)
+    kXSize = 2,  // view->m_xsize (accessed as int16_t)
+    kYSize = 3,  // view->m_ysize (accessed as int16_t)
+  };
+
   template <typename T>
   using unique_ptr = typename Traits::template unique_ptr<T>;
 
@@ -24,6 +45,8 @@ class TrackingRecHit2DHeterogeneous {
       cudaStream_t stream,
       TrackingRecHit2DHeterogeneous<cms::cudacompat::GPUTraits> const* input = nullptr);
 
+  explicit TrackingRecHit2DHeterogeneous(
+      float* store32, uint16_t* store16, uint32_t* modules, int nHits, cudaStream_t stream = nullptr);
   ~TrackingRecHit2DHeterogeneous() = default;
 
   TrackingRecHit2DHeterogeneous(const TrackingRecHit2DHeterogeneous&) = delete;
@@ -44,18 +67,21 @@ class TrackingRecHit2DHeterogeneous {
   auto phiBinnerStorage() { return m_phiBinnerStorage; }
   auto iphi() { return m_iphi; }
 
-  // only the local coord and detector index
   cms::cuda::host::unique_ptr<float[]> localCoordToHostAsync(cudaStream_t stream) const;
+
   cms::cuda::host::unique_ptr<uint32_t[]> hitsModuleStartToHostAsync(cudaStream_t stream) const;
 
+  cms::cuda::host::unique_ptr<uint16_t[]> store16ToHostAsync(cudaStream_t stream) const;
+  cms::cuda::host::unique_ptr<float[]> store32ToHostAsync(cudaStream_t stream) const;
+
   // needs specialization for Host
   void copyFromGPU(TrackingRecHit2DHeterogeneous<cms::cudacompat::GPUTraits> const* input, cudaStream_t stream);
 
 private:
   static constexpr uint32_t n16 = 4;                 // number of elements in m_store16
   static constexpr uint32_t n32 = 10;                // number of elements in m_store32
   static_assert(sizeof(uint32_t) == sizeof(float));  // just stating the obvious
-
+  static_assert(n32 == static_cast<uint32_t>(Storage32::kLayers));
   unique_ptr<uint16_t[]> m_store16;  //!
   unique_ptr<float[]> m_store32;     //!
 
@@ -108,7 +134,7 @@ TrackingRecHit2DHeterogeneous<Traits>::TrackingRecHit2DHeterogeneous(
 
   // if empy do not bother
   if (0 == nHits) {
-    if constexpr (std::is_same<Traits, cms::cudacompat::GPUTraits>::value) {
+    if constexpr (std::is_same_v<Traits, cms::cudacompat::GPUTraits>) {
       cms::cuda::copyAsync(m_view, view, stream);
     } else {
       m_view.reset(view.release());  // NOLINT: std::move() breaks CUDA version
@@ -123,7 +149,7 @@ TrackingRecHit2DHeterogeneous<Traits>::TrackingRecHit2DHeterogeneous(
   // so unless proven VERY inefficient we keep it ordered as generated
 
   // host copy is "reduced"  (to be reviewed at some point)
-  if constexpr (std::is_same<Traits, cms::cudacompat::HostTraits>::value) {
+  if constexpr (std::is_same_v<Traits, cms::cudacompat::HostTraits>) {
     // it has to compile for ALL cases
     copyFromGPU(input, stream);
   } else {
@@ -139,43 +165,113 @@ TrackingRecHit2DHeterogeneous<Traits>::TrackingRecHit2DHeterogeneous(
   static_assert(sizeof(TrackingRecHit2DSOAView::hindex_type) == sizeof(float));
   static_assert(sizeof(TrackingRecHit2DSOAView::hindex_type) == sizeof(TrackingRecHit2DSOAView::PhiBinner::index_type));
 
-  auto get32 = [&](int i) { return m_store32.get() + i * nHits; };
+  auto get32 = [&](Storage32 i) { return m_store32.get() + static_cast<int>(i) * nHits; };
 
   // copy all the pointers
   m_phiBinner = view->m_phiBinner = m_PhiBinnerStore.get();
   m_phiBinnerStorage = view->m_phiBinnerStorage =
-      reinterpret_cast<TrackingRecHit2DSOAView::PhiBinner::index_type*>(get32(9));
+      reinterpret_cast<TrackingRecHit2DSOAView::PhiBinner::index_type*>(get32(Storage32::kPhiStorage));
 
-  view->m_xl = get32(0);
-  view->m_yl = get32(1);
-  view->m_xerr = get32(2);
-  view->m_yerr = get32(3);
-  view->m_chargeAndStatus = reinterpret_cast<uint32_t*>(get32(4));
+  view->m_xl = get32(Storage32::kXLocal);
+  view->m_yl = get32(Storage32::kYLocal);
+  view->m_xerr = get32(Storage32::kXerror);
+  view->m_yerr = get32(Storage32::kYerror);
+  view->m_chargeAndStatus = reinterpret_cast<uint32_t*>(get32(Storage32::kCharge));
 
-  if constexpr (!std::is_same<Traits, cms::cudacompat::HostTraits>::value) {
+  if constexpr (!std::is_same_v<Traits, cms::cudacompat::HostTraits>) {
     assert(input == nullptr);
-    view->m_xg = get32(5);
-    view->m_yg = get32(6);
-    view->m_zg = get32(7);
-    view->m_rg = get32(8);
+    view->m_xg = get32(Storage32::kXGlobal);
+    view->m_yg = get32(Storage32::kYGlobal);
+    view->m_zg = get32(Storage32::kZGlobal);
+    view->m_rg = get32(Storage32::kRGlobal);
 
-    auto get16 = [&](int i) { return m_store16.get() + i * nHits; };
-    m_iphi = view->m_iphi = reinterpret_cast<int16_t*>(get16(1));
+    auto get16 = [&](Storage16 i) { return m_store16.get() + static_cast<int>(i) * nHits; };
+    m_iphi = view->m_iphi = reinterpret_cast<int16_t*>(get16(Storage16::kPhi));
 
-    view->m_xsize = reinterpret_cast<int16_t*>(get16(2));
-    view->m_ysize = reinterpret_cast<int16_t*>(get16(3));
-    view->m_detInd = get16(0);
+    view->m_xsize = reinterpret_cast<int16_t*>(get16(Storage16::kXSize));
+    view->m_ysize = reinterpret_cast<int16_t*>(get16(Storage16::kYSize));
+    view->m_detInd = get16(Storage16::kDetId);
 
     m_phiBinner = view->m_phiBinner = m_PhiBinnerStore.get();
-    m_hitsLayerStart = view->m_hitsLayerStart = reinterpret_cast<uint32_t*>(get32(n32));
+    m_hitsLayerStart = view->m_hitsLayerStart = reinterpret_cast<uint32_t*>(get32(Storage32::kLayers));
   }
 
   // transfer view
-  if constexpr (std::is_same<Traits, cms::cudacompat::GPUTraits>::value) {
+  if constexpr (std::is_same_v<Traits, cms::cudacompat::GPUTraits>) {
     cms::cuda::copyAsync(m_view, view, stream);
   } else {
     m_view.reset(view.release());  // NOLINT: std::move() breaks CUDA version
   }
 }
 
+// Assemble the SoA from caller-filled flat buffers; meant for the CPU SoA, but written to compile for any Traits
+template <typename Traits>
+TrackingRecHit2DHeterogeneous<Traits>::TrackingRecHit2DHeterogeneous(
+    float* store32, uint16_t* store16, uint32_t* modules, int nHits, cudaStream_t stream)
+    : m_nHits(nHits), m_hitsModuleStart(modules) {
+  constexpr bool isGPU = std::is_same_v<Traits, cms::cudacompat::GPUTraits>;
+  auto stagedView = Traits::template make_host_unique<TrackingRecHit2DSOAView>(stream);
+  m_view = Traits::template make_unique<TrackingRecHit2DSOAView>(stream);
+  stagedView->m_nHits = nHits;
+  // without hits only the hit count is published: nothing is allocated and store32/store16 are not read
+  if (nHits == 0) {
+    if constexpr (isGPU) {
+      cms::cuda::copyAsync(m_view, stagedView, stream);
+    } else {
+      m_view = std::move(stagedView);
+    }
+    return;
+  }
+
+  // owned storage: n16 / n32 columns of nHits elements, plus the phi binner and the average geometry
+  m_store16 = Traits::template make_unique<uint16_t[]>(nHits * n16, stream);
+  m_store32 = Traits::template make_unique<float[]>(nHits * n32, stream);
+  m_PhiBinnerStore = Traits::template make_unique<TrackingRecHit2DSOAView::PhiBinner>(stream);
+  m_AverageGeometryStore = Traits::template make_unique<TrackingRecHit2DSOAView::AverageGeometry>(stream);
+
+  stagedView->m_averageGeometry = m_AverageGeometryStore.get();
+  stagedView->m_hitsModuleStart = m_hitsModuleStart;
+
+  // fill the owned storage from the caller's buffers
+  if constexpr (isGPU) {
+    cms::cuda::copyAsync(m_store16, store16, stream);
+    cms::cuda::copyAsync(m_store32, store32, stream);
+  } else {
+    std::copy(store32, store32 + nHits * n32, m_store32.get());
+    std::copy(store16, store16 + nHits * n16, m_store16.get());
+  }
+
+  // column c of a store begins c * nHits elements after the start of that store
+  auto column32 = [&](Storage32 c) { return m_store32.get() + static_cast<int>(c) * nHits; };
+  auto column16 = [&](Storage16 c) { return m_store16.get() + static_cast<int>(c) * nHits; };
+
+  // wire the 32-bit columns into the view
+  stagedView->m_xl = column32(Storage32::kXLocal);
+  stagedView->m_yl = column32(Storage32::kYLocal);
+  stagedView->m_xerr = column32(Storage32::kXerror);
+  stagedView->m_yerr = column32(Storage32::kYerror);
+  stagedView->m_chargeAndStatus = reinterpret_cast<uint32_t*>(column32(Storage32::kCharge));
+  stagedView->m_xg = column32(Storage32::kXGlobal);
+  stagedView->m_yg = column32(Storage32::kYGlobal);
+  stagedView->m_zg = column32(Storage32::kZGlobal);
+  stagedView->m_rg = column32(Storage32::kRGlobal);
+
+  m_phiBinner = stagedView->m_phiBinner = m_PhiBinnerStore.get();
+  m_phiBinnerStorage = stagedView->m_phiBinnerStorage =
+      reinterpret_cast<TrackingRecHit2DSOAView::PhiBinner::index_type*>(column32(Storage32::kPhiStorage));
+
+  // wire the 16-bit columns into the view
+  stagedView->m_detInd = column16(Storage16::kDetId);
+  m_iphi = stagedView->m_iphi = reinterpret_cast<int16_t*>(column16(Storage16::kPhi));
+  stagedView->m_xsize = reinterpret_cast<int16_t*>(column16(Storage16::kXSize));
+  stagedView->m_ysize = reinterpret_cast<int16_t*>(column16(Storage16::kYSize));
+  // NOTE(review): m_hitsLayerStart is never assigned on this path and m_store32 has no room for it - confirm
+  // publish the assembled view
+  if constexpr (isGPU) {
+    cms::cuda::copyAsync(m_view, stagedView, stream);
+  } else {
+    m_view = std::move(stagedView);
+  }
+}
+
 #endif  // CUDADataFormats_TrackingRecHit_interface_TrackingRecHit2DHeterogeneous_h
diff --git a/CUDADataFormats/TrackingRecHit/src/TrackingRecHit2DHeterogeneous.cc b/CUDADataFormats/TrackingRecHit/src/TrackingRecHit2DHeterogeneous.cc
@@ -11,6 +11,20 @@ cms::cuda::host::unique_ptr<float[]> TrackingRecHit2DGPU::localCoordToHostAsync(
   return ret;
 }
 
+template <>
+cms::cuda::host::unique_ptr<float[]> TrackingRecHit2DGPU::store32ToHostAsync(cudaStream_t stream) const {
+  auto host32 = cms::cuda::make_host_unique<float[]>(static_cast<int>(n32) * nHits(), stream);
+  cms::cuda::copyAsync(host32, m_store32, static_cast<int>(n32) * nHits(), stream);  // all n32 columns
+  return host32;  // the copy is only queued on `stream` here
+}
+
+template <>
+cms::cuda::host::unique_ptr<uint16_t[]> TrackingRecHit2DGPU::store16ToHostAsync(cudaStream_t stream) const {
+  auto host16 = cms::cuda::make_host_unique<uint16_t[]>(static_cast<int>(n16) * nHits(), stream);
+  cms::cuda::copyAsync(host16, m_store16, static_cast<int>(n16) * nHits(), stream);  // all n16 columns
+  return host16;  // the copy is only queued on `stream` here
+}
+
 template <>
 cms::cuda::host::unique_ptr<uint32_t[]> TrackingRecHit2DGPU::hitsModuleStartToHostAsync(cudaStream_t stream) const {
   auto ret = cms::cuda::make_host_unique<uint32_t[]>(nMaxModules() + 1, stream);

diff --git a/RecoLocalTracker/SiPixelRecHits/plugins/SiPixelRecHitSoAFromCUDA.cc b/RecoLocalTracker/SiPixelRecHits/plugins/SiPixelRecHitSoAFromCUDA.cc
@@ -0,0 +1,91 @@
+#include <cuda_runtime.h>
+
+#include <fmt/printf.h>
+
+#include "CUDADataFormats/Common/interface/HostProduct.h"
+#include "CUDADataFormats/Common/interface/Product.h"
+#include "CUDADataFormats/TrackingRecHit/interface/TrackingRecHit2DHeterogeneous.h"
+#include "DataFormats/Common/interface/DetSetVectorNew.h"
+#include "DataFormats/Common/interface/Handle.h"
+#include "DataFormats/SiPixelCluster/interface/SiPixelCluster.h"
+#include "DataFormats/TrackerRecHit2D/interface/SiPixelRecHitCollection.h"
+#include "FWCore/Framework/interface/Event.h"
+#include "FWCore/Framework/interface/EventSetup.h"
+#include "FWCore/Framework/interface/MakerMacros.h"
+#include "FWCore/Framework/interface/stream/EDProducer.h"
+#include "FWCore/MessageLogger/interface/MessageLogger.h"
+#include "FWCore/ParameterSet/interface/ConfigurationDescriptions.h"
+#include "FWCore/ParameterSet/interface/ParameterSet.h"
+#include "FWCore/ParameterSet/interface/ParameterSetDescription.h"
+#include "FWCore/Utilities/interface/InputTag.h"
+#include "Geometry/CommonDetUnit/interface/PixelGeomDetUnit.h"
+#include "Geometry/Records/interface/TrackerDigiGeometryRecord.h"
+#include "Geometry/TrackerGeometryBuilder/interface/TrackerGeometry.h"
+#include "HeterogeneousCore/CUDACore/interface/ScopedContext.h"
+#include "RecoLocalTracker/SiPixelRecHits/interface/pixelCPEforGPU.h"
+
+// Stream producer that copies the pixel rechit SoA stores of a GPU product to the host as a TrackingRecHit2DCPU
+class SiPixelRecHitSoAFromCUDA : public edm::stream::EDProducer<edm::ExternalWork> {
+public:
+  explicit SiPixelRecHitSoAFromCUDA(const edm::ParameterSet& iConfig);
+  ~SiPixelRecHitSoAFromCUDA() override = default;
+  static void fillDescriptions(edm::ConfigurationDescriptions& descriptions);
+  using HMSstorage = HostProduct<uint32_t[]>;
+
+private:
+  void acquire(edm::Event const& iEvent,
+               edm::EventSetup const& iSetup,
+               edm::WaitingTaskWithArenaHolder waitingTaskHolder) override;
+  void produce(edm::Event& iEvent, edm::EventSetup const& iSetup) override;
+
+  const edm::EDGetTokenT<cms::cuda::Product<TrackingRecHit2DGPU>> hitsTokenGPU_;  // CUDA hits
+  const edm::EDPutTokenT<TrackingRecHit2DCPU> hitsPutTokenCPU_;
+  const edm::EDPutTokenT<HMSstorage> hostPutToken_;
+
+  uint32_t nHits_ = 0;        // hit count of the current event, set in acquire()
+  uint32_t nMaxModules_ = 0;  // zero-initialised so that produce() never reads an indeterminate size
+
+  cms::cuda::host::unique_ptr<float[]> store32_;
+  cms::cuda::host::unique_ptr<uint16_t[]> store16_;
+  cms::cuda::host::unique_ptr<uint32_t[]> hitsModuleStart_;
+};
+
+SiPixelRecHitSoAFromCUDA::SiPixelRecHitSoAFromCUDA(const edm::ParameterSet& iConfig)
+    : hitsTokenGPU_(
+          consumes<cms::cuda::Product<TrackingRecHit2DGPU>>(iConfig.getParameter<edm::InputTag>("pixelRecHitSrc"))),
+      hitsPutTokenCPU_(produces<TrackingRecHit2DCPU>()),  // host SoA emplaced in produce()
+      hostPutToken_(produces<HMSstorage>()) {}            // copy of the module-start array emplaced in produce()
+
+void SiPixelRecHitSoAFromCUDA::fillDescriptions(edm::ConfigurationDescriptions& descriptions) {
+  edm::ParameterSetDescription psd;  // one parameter only: the input tag read by the constructor
+  psd.add<edm::InputTag>("pixelRecHitSrc", edm::InputTag("siPixelRecHitsPreSplittingCUDA"));
+  descriptions.addWithDefaultLabel(psd);
+}
+
+// Queue the asynchronous host copies of the two stores and of the module starts; skipped when there are no hits
+void SiPixelRecHitSoAFromCUDA::acquire(edm::Event const& iEvent,
+                                       edm::EventSetup const& iSetup,
+                                       edm::WaitingTaskWithArenaHolder waitingTaskHolder) {
+  cms::cuda::Product<TrackingRecHit2DGPU> const& inputDataWrapped = iEvent.get(hitsTokenGPU_);
+  cms::cuda::ScopedContextAcquire ctx{inputDataWrapped, std::move(waitingTaskHolder)};
+  auto const& inputData = ctx.get(inputDataWrapped);
+  nHits_ = inputData.nHits();
+  nMaxModules_ = inputData.nMaxModules();  // set before the early return: produce() always sizes a buffer from it
+  LogDebug("SiPixelRecHitSoAFromCUDA") << "copying to cpu SoA " << inputData.nHits() << " Hits";
+  if (0 == nHits_)
+    return;  // the three host buffers keep whatever they held before (null or a previous event's data)
+  store32_ = inputData.store32ToHostAsync(ctx.stream());
+  store16_ = inputData.store16ToHostAsync(ctx.stream());
+  hitsModuleStart_ = inputData.hitsModuleStartToHostAsync(ctx.stream());
+}
+
+// Emplace the module-start copy and the host SoA built from the buffers requested in acquire()
+void SiPixelRecHitSoAFromCUDA::produce(edm::Event& iEvent, edm::EventSetup const& es) {
+  auto hmsp = std::make_unique<uint32_t[]>(nMaxModules_ + 1);  // value-initialised, i.e. zero-filled
+  if (nHits_ > 0)  // without hits acquire() requested no copy: hitsModuleStart_ is null or stale, leave zeros
+    std::copy(hitsModuleStart_.get(), hitsModuleStart_.get() + nMaxModules_ + 1, hmsp.get());
+  iEvent.emplace(hostPutToken_, std::move(hmsp));
+  iEvent.emplace(hitsPutTokenCPU_, store32_.get(), store16_.get(), hitsModuleStart_.get(), nHits_);
+}
+
+DEFINE_FWK_MODULE(SiPixelRecHitSoAFromCUDA);
diff --git a/RecoLocalTracker/SiPixelRecHits/python/SiPixelRecHits_cfi.py b/RecoLocalTracker/SiPixelRecHits/python/SiPixelRecHits_cfi.py
@@ -23,15 +23,18 @@
 
 # convert the pixel rechits from legacy to SoA format
 from RecoLocalTracker.SiPixelRecHits.siPixelRecHitSoAFromLegacy_cfi import siPixelRecHitSoAFromLegacy as _siPixelRecHitsPreSplittingSoA
-siPixelRecHitsPreSplittingSoA = _siPixelRecHitsPreSplittingSoA.clone(convertToLegacy=True)
+from RecoLocalTracker.SiPixelRecHits.siPixelRecHitSoAFromCUDA_cfi import siPixelRecHitSoAFromCUDA as _siPixelRecHitSoAFromCUDA
+
+siPixelRecHitsPreSplittingCPU = _siPixelRecHitsPreSplittingSoA.clone(convertToLegacy=True)
+
 # phase 2 tracker modifier
 from Configuration.Eras.Modifier_phase2_tracker_cff import phase2_tracker
-phase2_tracker.toModify(siPixelRecHitsPreSplittingSoA,
+phase2_tracker.toModify(siPixelRecHitsPreSplittingCPU,
     isPhase2 = True)
 # modifier used to prompt patatrack pixel tracks reconstruction on cpu
 from Configuration.ProcessModifiers.pixelNtupletFit_cff import pixelNtupletFit
 pixelNtupletFit.toModify(siPixelRecHitsPreSplitting,
-    cpu = siPixelRecHitsPreSplittingSoA.clone()
+    cpu = _siPixelRecHitsPreSplittingSoA.clone(convertToLegacy=True)
 )
 
 siPixelRecHitsPreSplittingTask = cms.Task(
@@ -48,20 +51,32 @@
 # transfer the pixel rechits to the host and convert them from SoA
 from RecoLocalTracker.SiPixelRecHits.siPixelRecHitFromCUDA_cfi import siPixelRecHitFromCUDA as _siPixelRecHitFromCUDA
 
+# SoA rechits for DQM: an alias to the SoA produced on the CPU, or the host copy of the SoA produced on the GPU
+siPixelRecHitsPreSplittingSoA = SwitchProducerCUDA(
+    cpu = cms.EDAlias(
+            siPixelRecHitsPreSplittingCPU = cms.VPSet(
+                 cms.PSet(type = cms.string("cmscudacompatCPUTraitsTrackingRecHit2DHeterogeneous")),
+                 cms.PSet(type = cms.string("uintAsHostProduct"))
+             )),
+    cuda = _siPixelRecHitSoAFromCUDA.clone()
+)
+
 (gpu & pixelNtupletFit).toModify(siPixelRecHitsPreSplitting,
     cpu = cms.EDAlias(
-            siPixelRecHitsPreSplittingSoA = cms.VPSet(
+            siPixelRecHitsPreSplittingCPU = cms.VPSet(
                  cms.PSet(type = cms.string("SiPixelRecHitedmNewDetSetVector")),
                  cms.PSet(type = cms.string("uintAsHostProduct"))
              )
          ),
     cuda = _siPixelRecHitFromCUDA.clone())
 
 (gpu & pixelNtupletFit).toReplaceWith(siPixelRecHitsPreSplittingTask, cms.Task(
-    # reconstruct the pixel rechits on the gpu
+    # reconstruct the pixel rechits on the gpu or on the cpu
+    # (normally only one of the two is run because only one is consumed from later stages)
     siPixelRecHitsPreSplittingCUDA,
-    # producing and converting on cpu
-    siPixelRecHitsPreSplittingSoA,
+    siPixelRecHitsPreSplittingCPU,
     # SwitchProducer wrapping an EDAlias on cpu or the converter from SoA to legacy on gpu
-    siPixelRecHitsPreSplittingTask.copy()
+    siPixelRecHitsPreSplittingTask.copy(),
+    # producing and converting on cpu (if needed)
+    siPixelRecHitsPreSplittingSoA
 ))
diff --git a/RecoPixelVertexing/Configuration/python/RecoPixelVertexing_cff.py b/RecoPixelVertexing/Configuration/python/RecoPixelVertexing_cff.py
@@ -53,6 +53,13 @@
     )
 )
 
+## GPU vs CPU validation
+# force CPU vertexing to use track SoA from CPU chain and not the converted one from GPU chain
+from Configuration.ProcessModifiers.gpuValidationPixel_cff import gpuValidationPixel
+(pixelNtupletFit & gpu & gpuValidationPixel).toModify(pixelVerticesSoA.cpu,
+    pixelTrackSrc = "pixelTracksSoA@cpu"
+)
+
 (pixelNtupletFit & gpu).toReplaceWith(pixelVerticesTask, cms.Task(
     # build pixel vertices in SoA format on the GPU
     pixelVerticesCUDA,