Migrated PixelRecHit to Heterogeneous producer (cms-sw#81)

Migrate PixelRecHit EDProducer to HeterogeneousEDProducer, including the cpu product. Data structures on gpu now include everything needed for Doublets, CA and fit. Layer splitting done: phi sorting (or partial sorting) requires cms-sw#69. Includes some cleanup and bug fixes.
makortel · Jun 29, 2018 · af7cd9e · af7cd9e
1 parent 10d59f2
commit af7cd9e
Show file tree

Hide file tree

Showing 9 changed files with 293 additions and 67 deletions.
diff --git a/Geometry/TrackerGeometryBuilder/interface/phase1PixelTopology.h b/Geometry/TrackerGeometryBuilder/interface/phase1PixelTopology.h
@@ -20,6 +20,15 @@ namespace phase1PixelTopology {
 
   constexpr uint32_t numPixsInModule = uint32_t(numRowsInModule)* uint32_t(numColsInModule);
 
+  constexpr uint32_t numberOfModules = 1856;
+
+  constexpr uint32_t layerStart[11] = {0,96,320,672,1184,1296,1408,1520,1632,1744,1856};
+  constexpr char const * layerName[10] = {"BL1","BL2","BL3","BL4",
+   		  	                  "E+1", "E+2", "E+3",
+			                  "E-1", "E-2", "E-3"
+                                          };
+
+
   // this is for the ROC n<512 (upgrade 1024)
   constexpr inline
   uint16_t  divu52(uint16_t n) {

diff --git a/RecoLocalTracker/Configuration/python/RecoLocalTracker_cff.py b/RecoLocalTracker/Configuration/python/RecoLocalTracker_cff.py
@@ -17,16 +17,23 @@
 striptrackerlocalreco = cms.Sequence(siStripZeroSuppression*siStripClusters*siStripMatchedRecHits)
 trackerlocalreco = cms.Sequence(pixeltrackerlocalreco*striptrackerlocalreco*clusterSummaryProducer)
 
+
 from RecoLocalTracker.SiPixelClusterizer.siPixelClustersHeterogeneous_cfi import *
 from RecoLocalTracker.SiPixelClusterizer.siPixelFedCablingMapGPUWrapper_cfi import *
 from CalibTracker.SiPixelESProducers.siPixelGainCalibrationForHLTGPU_cfi import *
 
+from RecoLocalTracker.SiPixelRecHits.siPixelRecHitHeterogeneous_cfi import *
+from RecoLocalTracker.SiPixelRecHits.siPixelRecHitHeterogeneousConverter_cfi import siPixelRecHitHeterogeneousConverter as _siPixelRecHitHeterogeneousConverter
+gpu.toReplaceWith(siPixelRecHitsPreSplitting, _siPixelRecHitHeterogeneousConverter.clone())
+
+
+
 from Configuration.ProcessModifiers.gpu_cff import gpu
 _pixeltrackerlocalreco_gpu = pixeltrackerlocalreco.copy()
 _pixeltrackerlocalreco_gpu.replace(siPixelClustersPreSplitting, siPixelClustersHeterogeneous+siPixelClustersPreSplitting)
+_pixeltrackerlocalreco_gpu.replace(siPixelRecHitsPreSplitting, siPixelRecHitHeterogeneous+siPixelRecHitsPreSplitting)
 gpu.toReplaceWith(pixeltrackerlocalreco, _pixeltrackerlocalreco_gpu)
 
-
 from RecoLocalTracker.SiPhase2Clusterizer.phase2TrackerClusterizer_cfi import *
 from RecoLocalTracker.Phase2TrackerRecHits.Phase2StripCPEGeometricESProducer_cfi import *
 

diff --git a/RecoLocalTracker/SiPixelRecHits/plugins/PixelRecHits.cu b/RecoLocalTracker/SiPixelRecHits/plugins/PixelRecHits.cu
@@ -17,31 +17,60 @@
 #include "gpuPixelRecHits.h"
 
 namespace pixelgpudetails {
-  PixelRecHitGPUKernel::PixelRecHitGPUKernel() {
+  PixelRecHitGPUKernel::PixelRecHitGPUKernel(cuda::stream_t<>& cudaStream) {
+
+    cudaCheck(cudaMalloc((void**) & gpu_.bs_d,3*sizeof(float)));
     cudaCheck(cudaMalloc((void**) & gpu_.hitsModuleStart_d,(gpuClustering::MaxNumModules+1)*sizeof(uint32_t)));
+    cudaCheck(cudaMalloc((void**) & gpu_.hitsLayerStart_d,(11)*sizeof(uint32_t)));
     cudaCheck(cudaMalloc((void**) & gpu_.charge_d,(gpuClustering::MaxNumModules*256)*sizeof(float)));
+    cudaCheck(cudaMalloc((void**) & gpu_.detInd_d,(gpuClustering::MaxNumModules*256)*sizeof(uint16_t)));
     cudaCheck(cudaMalloc((void**) & gpu_.xg_d,(gpuClustering::MaxNumModules*256)*sizeof(float)));
     cudaCheck(cudaMalloc((void**) & gpu_.yg_d,(gpuClustering::MaxNumModules*256)*sizeof(float)));
     cudaCheck(cudaMalloc((void**) & gpu_.zg_d,(gpuClustering::MaxNumModules*256)*sizeof(float)));
+    cudaCheck(cudaMalloc((void**) & gpu_.rg_d,(gpuClustering::MaxNumModules*256)*sizeof(float)));
+    cudaCheck(cudaMalloc((void**) & gpu_.xl_d,(gpuClustering::MaxNumModules*256)*sizeof(float)));
+    cudaCheck(cudaMalloc((void**) & gpu_.yl_d,(gpuClustering::MaxNumModules*256)*sizeof(float)));
     cudaCheck(cudaMalloc((void**) & gpu_.xerr_d,(gpuClustering::MaxNumModules*256)*sizeof(float)));
     cudaCheck(cudaMalloc((void**) & gpu_.yerr_d,(gpuClustering::MaxNumModules*256)*sizeof(float)));
+    cudaCheck(cudaMalloc((void**) & gpu_.iphi_d,(gpuClustering::MaxNumModules*256)*sizeof(int16_t)));
+    cudaCheck(cudaMalloc((void**) & gpu_.sortIndex_d,(gpuClustering::MaxNumModules*256)*sizeof(uint16_t)));
     cudaCheck(cudaMalloc((void**) & gpu_.mr_d,(gpuClustering::MaxNumModules*256)*sizeof(uint16_t)));
+    cudaCheck(cudaMalloc((void**) & gpu_.mc_d,(gpuClustering::MaxNumModules*256)*sizeof(uint16_t)));
+//    cudaCheck(cudaMalloc((void**) & gpu_.hist_d, 10*sizeof(HitsOnGPU::Hist)));
+
+    cudaCheck(cudaMalloc((void**) & gpu_d, sizeof(HitsOnGPU)));
+    cudaCheck(cudaMemcpyAsync(gpu_d, &gpu_, sizeof(HitsOnGPU), cudaMemcpyDefault,cudaStream.id()));
+
   }
 
   PixelRecHitGPUKernel::~PixelRecHitGPUKernel() {
     cudaCheck(cudaFree(gpu_.hitsModuleStart_d));
     cudaCheck(cudaFree(gpu_.charge_d));
+    cudaCheck(cudaFree(gpu_.detInd_d));
     cudaCheck(cudaFree(gpu_.xg_d));
     cudaCheck(cudaFree(gpu_.yg_d));
     cudaCheck(cudaFree(gpu_.zg_d));
+    cudaCheck(cudaFree(gpu_.rg_d));
+    cudaCheck(cudaFree(gpu_.xl_d));
+    cudaCheck(cudaFree(gpu_.yl_d));
     cudaCheck(cudaFree(gpu_.xerr_d));
     cudaCheck(cudaFree(gpu_.yerr_d));
+    cudaCheck(cudaFree(gpu_.iphi_d));
+    cudaCheck(cudaFree(gpu_.sortIndex_d));
     cudaCheck(cudaFree(gpu_.mr_d));
+    cudaCheck(cudaFree(gpu_.mc_d));
+    // cudaCheck(cudaFree(gpu_.hist_d));
+
+    cudaCheck(cudaFree(gpu_d));
   }
 
   void PixelRecHitGPUKernel::makeHitsAsync(const siPixelRawToClusterHeterogeneousProduct::GPUProduct& input,
+                                           float const * bs,
                                            pixelCPEforGPU::ParamsOnGPU const * cpeParams,
                                            cuda::stream_t<>& stream) {
+
+   cudaCheck(cudaMemcpyAsync(gpu_.bs_d, bs, 3*sizeof(float), cudaMemcpyDefault, stream.id()));
+
     thrust::exclusive_scan(thrust::cuda::par.on(stream.id()),
                            input.clusInModule_d,
                            input.clusInModule_d + gpuClustering::MaxNumModules + 1,
@@ -51,6 +80,7 @@ namespace pixelgpudetails {
     int blocks = input.nModules; // active modules (with digis)
     gpuPixelRecHits::getHits<<<blocks, threadsPerBlock, 0, stream.id()>>>(
       cpeParams,
+      gpu_.bs_d,
       input.moduleInd_d,
       input.xx_d, input.yy_d, input.adc_d,
       input.moduleStart_d,
@@ -59,27 +89,50 @@ namespace pixelgpudetails {
       input.nDigis,
       gpu_.hitsModuleStart_d,
       gpu_.charge_d,
-      gpu_.xg_d, gpu_.yg_d, gpu_.zg_d,
-      gpu_.xerr_d, gpu_.yerr_d, gpu_.mr_d,
-      true // for the time being stay local...
+      gpu_.detInd_d,
+      gpu_.xg_d, gpu_.yg_d, gpu_.zg_d, gpu_.rg_d,
+      gpu_.iphi_d,
+      gpu_.xl_d, gpu_.yl_d,
+      gpu_.xerr_d, gpu_.yerr_d, 
+      gpu_.mr_d, gpu_.mc_d
     );
 
     // needed only if hits on CPU are required...
     cudaCheck(cudaMemcpyAsync(hitsModuleStart_, gpu_.hitsModuleStart_d, (gpuClustering::MaxNumModules+1) * sizeof(uint32_t), cudaMemcpyDefault, stream.id()));
+
+    // to be moved to gpu?
+    auto nhits = hitsModuleStart_[gpuClustering::MaxNumModules];
+    for (int i=0;i<10;++i) hitsLayerStart_[i]=hitsModuleStart_[phase1PixelTopology::layerStart[i]];
+    hitsLayerStart_[10]=nhits;
+
+    std::cout << "hit layerStart "; 
+    for (int i=0;i<10;++i) std::cout << phase1PixelTopology::layerName[i] << ':' << hitsLayerStart_[i] << ' ';
+    std::cout << "end:" << hitsLayerStart_[10] << std::endl;
+
+    cudaCheck(cudaMemcpyAsync(gpu_.hitsLayerStart_d, hitsLayerStart_, (11) * sizeof(uint32_t), cudaMemcpyDefault, stream.id()));
+
+    // for timing test
+    // radixSortMultiWrapper<int16_t><<<10, 256, 0, c.stream>>>(gpu_.iphi_d,gpu_.sortIndex_d,gpu_.hitsLayerStart_d);
+
+    // fillManyFromVector(gpu_.hist_d,10,gpu_.iphi_d, gpu_.hitsLayerStart_d, nhits,256,c.stream);
+
+
   }
 
   HitsOnCPU PixelRecHitGPUKernel::getOutput(cuda::stream_t<>& stream) const {
     // needed only if hits on CPU are required...
     auto nhits = hitsModuleStart_[gpuClustering::MaxNumModules];
 
     HitsOnCPU hoc(nhits);
+    hoc.gpu_d = gpu_d;
     memcpy(hoc.hitsModuleStart, hitsModuleStart_, (gpuClustering::MaxNumModules+1) * sizeof(uint32_t));
     cudaCheck(cudaMemcpyAsync(hoc.charge.data(), gpu_.charge_d, nhits*sizeof(uint32_t), cudaMemcpyDefault, stream.id()));
-    cudaCheck(cudaMemcpyAsync(hoc.xl.data(), gpu_.xg_d, nhits*sizeof(uint32_t), cudaMemcpyDefault, stream.id()));
-    cudaCheck(cudaMemcpyAsync(hoc.yl.data(), gpu_.yg_d, nhits*sizeof(uint32_t), cudaMemcpyDefault, stream.id()));
+    cudaCheck(cudaMemcpyAsync(hoc.xl.data(), gpu_.xl_d, nhits*sizeof(uint32_t), cudaMemcpyDefault, stream.id()));
+    cudaCheck(cudaMemcpyAsync(hoc.yl.data(), gpu_.yl_d, nhits*sizeof(uint32_t), cudaMemcpyDefault, stream.id()));
     cudaCheck(cudaMemcpyAsync(hoc.xe.data(), gpu_.xerr_d, nhits*sizeof(uint32_t), cudaMemcpyDefault, stream.id()));
     cudaCheck(cudaMemcpyAsync(hoc.ye.data(), gpu_.yerr_d, nhits*sizeof(uint32_t), cudaMemcpyDefault, stream.id()));
     cudaCheck(cudaMemcpyAsync(hoc.mr.data(), gpu_.mr_d, nhits*sizeof(uint16_t), cudaMemcpyDefault, stream.id()));
+    cudaCheck(cudaMemcpyAsync(hoc.mc.data(), gpu_.mc_d, nhits*sizeof(uint16_t), cudaMemcpyDefault, stream.id()));
     cudaCheck(cudaStreamSynchronize(stream.id()));
     return hoc;
   }

diff --git a/RecoLocalTracker/SiPixelRecHits/plugins/PixelRecHits.h b/RecoLocalTracker/SiPixelRecHits/plugins/PixelRecHits.h
@@ -9,32 +9,21 @@
 #include <cstdint>
 #include <vector>
 
+#include "RecoLocalTracker/SiPixelRecHits/plugins/siPixelRecHitsHeterogeneousProduct.h" 
+
+
 namespace pixelCPEforGPU {
   struct ParamsOnGPU;
 }
 
 namespace pixelgpudetails {
-  struct HitsOnGPU{
-    uint32_t * hitsModuleStart_d;
-    int32_t  * charge_d;
-    float *xg_d, *yg_d, *zg_d;
-    float *xerr_d, *yerr_d;
-    uint16_t * mr_d;
-  };
+  using HitsOnGPU = siPixelRecHitsHeterogeneousProduct::HitsOnGPU;
 
-  struct HitsOnCPU {
-    explicit HitsOnCPU(uint32_t nhits) :
-      charge(nhits),xl(nhits),yl(nhits),xe(nhits),ye(nhits), mr(nhits){}
-    uint32_t hitsModuleStart[2001];
-    std::vector<int32_t> charge;
-    std::vector<float> xl, yl;
-    std::vector<float> xe, ye;
-    std::vector<uint16_t> mr;
-  };
+  using HitsOnCPU = siPixelRecHitsHeterogeneousProduct::HitsOnCPU;
 
   class PixelRecHitGPUKernel {
   public:
-    PixelRecHitGPUKernel();
+    PixelRecHitGPUKernel(cuda::stream_t<>& cudaStream);
     ~PixelRecHitGPUKernel();
 
     PixelRecHitGPUKernel(const PixelRecHitGPUKernel&) = delete;
@@ -43,14 +32,17 @@ namespace pixelgpudetails {
     PixelRecHitGPUKernel& operator=(PixelRecHitGPUKernel&&) = delete;
 
     void makeHitsAsync(const siPixelRawToClusterHeterogeneousProduct::GPUProduct& input,
+                       float const * bs,
                        pixelCPEforGPU::ParamsOnGPU const * cpeParams,
                        cuda::stream_t<>& stream);
 
     HitsOnCPU getOutput(cuda::stream_t<>& stream) const;
 
   private:
+    HitsOnGPU * gpu_d;  // copy of the structure on the gpu itself: this is the "Product" 
     HitsOnGPU gpu_;
     uint32_t hitsModuleStart_[gpuClustering::MaxNumModules+1];
+    uint32_t hitsLayerStart_[11];
   };
 }