cms-patatrack · fwyzard · Jun 29, 2018 · Jun 11, 2018 · Jun 11, 2018 · Jun 11, 2018
diff --git a/Geometry/TrackerGeometryBuilder/interface/phase1PixelTopology.h b/Geometry/TrackerGeometryBuilder/interface/phase1PixelTopology.h
@@ -20,6 +20,15 @@ namespace phase1PixelTopology {
 
   constexpr uint32_t numPixsInModule = uint32_t(numRowsInModule)* uint32_t(numColsInModule);
 
+  constexpr uint32_t numberOfModules = 1856;  // same value as the last entry of layerStart
+
+  constexpr uint32_t layerStart[11] = {0,96,320,672,1184,1296,1408,1520,1632,1744,1856};  // 11 boundaries for the 10 layerName entries; used as module indices by the rechit code, presumably the first module of each layer (confirm)
+  constexpr char const * layerName[10] = {"BL1","BL2","BL3","BL4",  // printable labels, one per [layerStart[i], layerStart[i+1]) range
+   		  	                  "E+1", "E+2", "E+3",
+			                  "E-1", "E-2", "E-3"
+                                          };
 PixelLayerTriplets.layerList = cms.vstring('BPix1+BPix2+BPix3',  
     'BPix1+BPix2+FPix1_pos',  
     'BPix1+BPix2+FPix1_neg',  
     'BPix1+FPix1_pos+FPix2_pos',  
     'BPix1+FPix1_neg+FPix2_neg' 
 ) 
 enum SubDetector {PixelBarrel=1,PixelEndcap=2}; 
 enum SubDetector {PixelBarrel, PixelEndcap, TIB, TOB, TID, TEC, CSC, DT, RPCBarrel, RPCEndcap, GEM, ME0, P2OTB, P2OTEC, P1PXB, P1PXEC, P2PXB, P2PXEC, TimingBarrel, TimingEndcap, invalidDet}; 
 unsigned int pxbModule(const DetId &id) const { 
   return ((id.rawId()>>pbVals_.moduleStartBit_)& pbVals_.moduleMask_); 
 } 
 unsigned int pxfModule(const DetId &id) const { 
 if subdet in [SubDet.FPix, SubDet.TID, SubDet.TEC] or isPhase2OTBarrel: 
     sideNum = get("side") 
     if sideNum == 1: 
         side = "-" 
     elif sideNum == 2: 
         side = "+" 
     elif isPhase2OTBarrel and sideNum == 3: 
         side = "" 
     else: 
         side = "?" 
 return "%s%d%s" % (SubDet.toString(subdet), 
                    getattr(self._tree, self._prefix+"_layer")[self._index], 
                    side) 
 PixelLayerTriplets.layerList = cms.vstring('BPix1+BPix2+BPix3',  
     'BPix1+BPix2+FPix1_pos',  
     'BPix1+BPix2+FPix1_neg',  
     'BPix1+FPix1_pos+FPix2_pos',  
     'BPix1+FPix1_neg+FPix2_neg' 
 ) 
 enum SubDetector {PixelBarrel=1,PixelEndcap=2}; 
 enum SubDetector {PixelBarrel, PixelEndcap, TIB, TOB, TID, TEC, CSC, DT, RPCBarrel, RPCEndcap, GEM, ME0, P2OTB, P2OTEC, P1PXB, P1PXEC, P2PXB, P2PXEC, TimingBarrel, TimingEndcap, invalidDet}; 
 unsigned int pxbModule(const DetId &id) const { 
   return ((id.rawId()>>pbVals_.moduleStartBit_)& pbVals_.moduleMask_); 
 } 
 unsigned int pxfModule(const DetId &id) const { 
 if subdet in [SubDet.FPix, SubDet.TID, SubDet.TEC] or isPhase2OTBarrel: 
     sideNum = get("side") 
     if sideNum == 1: 
         side = "-" 
     elif sideNum == 2: 
         side = "+" 
     elif isPhase2OTBarrel and sideNum == 3: 
         side = "" 
     else: 
         side = "?" 
 return "%s%d%s" % (SubDet.toString(subdet), 
                    getattr(self._tree, self._prefix+"_layer")[self._index], 
                    side) 
+
+
   // this is for the ROC n<512 (upgrade 1024)
   constexpr inline
   uint16_t  divu52(uint16_t n) {

diff --git a/RecoLocalTracker/Configuration/python/RecoLocalTracker_cff.py b/RecoLocalTracker/Configuration/python/RecoLocalTracker_cff.py
@@ -17,16 +17,23 @@
 striptrackerlocalreco = cms.Sequence(siStripZeroSuppression*siStripClusters*siStripMatchedRecHits)
 trackerlocalreco = cms.Sequence(pixeltrackerlocalreco*striptrackerlocalreco*clusterSummaryProducer)
 
+
 from RecoLocalTracker.SiPixelClusterizer.siPixelClustersHeterogeneous_cfi import *
 from RecoLocalTracker.SiPixelClusterizer.siPixelFedCablingMapGPUWrapper_cfi import *
 from CalibTracker.SiPixelESProducers.siPixelGainCalibrationForHLTGPU_cfi import *
 
+from RecoLocalTracker.SiPixelRecHits.siPixelRecHitHeterogeneous_cfi import *
+from RecoLocalTracker.SiPixelRecHits.siPixelRecHitHeterogeneousConverter_cfi import siPixelRecHitHeterogeneousConverter as _siPixelRecHitHeterogeneousConverter
+gpu.toReplaceWith(siPixelRecHitsPreSplitting, _siPixelRecHitHeterogeneousConverter.clone())
+
+
+
 from Configuration.ProcessModifiers.gpu_cff import gpu
 _pixeltrackerlocalreco_gpu = pixeltrackerlocalreco.copy()
 _pixeltrackerlocalreco_gpu.replace(siPixelClustersPreSplitting, siPixelClustersHeterogeneous+siPixelClustersPreSplitting)
+_pixeltrackerlocalreco_gpu.replace(siPixelRecHitsPreSplitting, siPixelRecHitHeterogeneous+siPixelRecHitsPreSplitting)
 gpu.toReplaceWith(pixeltrackerlocalreco, _pixeltrackerlocalreco_gpu)
 
-
 from RecoLocalTracker.SiPhase2Clusterizer.phase2TrackerClusterizer_cfi import *
 from RecoLocalTracker.Phase2TrackerRecHits.Phase2StripCPEGeometricESProducer_cfi import *
 

diff --git a/RecoLocalTracker/SiPixelRecHits/plugins/PixelRecHits.cu b/RecoLocalTracker/SiPixelRecHits/plugins/PixelRecHits.cu
@@ -17,31 +17,60 @@
 #include "gpuPixelRecHits.h"
 
 namespace pixelgpudetails {
-  PixelRecHitGPUKernel::PixelRecHitGPUKernel() {
+  PixelRecHitGPUKernel::PixelRecHitGPUKernel(cuda::stream_t<>& cudaStream) {  // allocates all device buffers once; cudaStream is used only for the final upload of the pointer table
+
+    cudaCheck(cudaMalloc((void**) & gpu_.bs_d,3*sizeof(float)));  // three floats, filled from the `bs` argument of makeHitsAsync
     cudaCheck(cudaMalloc((void**) & gpu_.hitsModuleStart_d,(gpuClustering::MaxNumModules+1)*sizeof(uint32_t)));
+    cudaCheck(cudaMalloc((void**) & gpu_.hitsLayerStart_d,(11)*sizeof(uint32_t)));  // same length as the host-side hitsLayerStart_ array
     cudaCheck(cudaMalloc((void**) & gpu_.charge_d,(gpuClustering::MaxNumModules*256)*sizeof(float)));  // this and the buffers below hold MaxNumModules*256 entries each
+    cudaCheck(cudaMalloc((void**) & gpu_.detInd_d,(gpuClustering::MaxNumModules*256)*sizeof(uint16_t)));
     cudaCheck(cudaMalloc((void**) & gpu_.xg_d,(gpuClustering::MaxNumModules*256)*sizeof(float)));
     cudaCheck(cudaMalloc((void**) & gpu_.yg_d,(gpuClustering::MaxNumModules*256)*sizeof(float)));
     cudaCheck(cudaMalloc((void**) & gpu_.zg_d,(gpuClustering::MaxNumModules*256)*sizeof(float)));
+    cudaCheck(cudaMalloc((void**) & gpu_.rg_d,(gpuClustering::MaxNumModules*256)*sizeof(float)));
+    cudaCheck(cudaMalloc((void**) & gpu_.xl_d,(gpuClustering::MaxNumModules*256)*sizeof(float)));
+    cudaCheck(cudaMalloc((void**) & gpu_.yl_d,(gpuClustering::MaxNumModules*256)*sizeof(float)));
     cudaCheck(cudaMalloc((void**) & gpu_.xerr_d,(gpuClustering::MaxNumModules*256)*sizeof(float)));
     cudaCheck(cudaMalloc((void**) & gpu_.yerr_d,(gpuClustering::MaxNumModules*256)*sizeof(float)));
+    cudaCheck(cudaMalloc((void**) & gpu_.iphi_d,(gpuClustering::MaxNumModules*256)*sizeof(int16_t)));
+    cudaCheck(cudaMalloc((void**) & gpu_.sortIndex_d,(gpuClustering::MaxNumModules*256)*sizeof(uint16_t)));
     cudaCheck(cudaMalloc((void**) & gpu_.mr_d,(gpuClustering::MaxNumModules*256)*sizeof(uint16_t)));
+    cudaCheck(cudaMalloc((void**) & gpu_.mc_d,(gpuClustering::MaxNumModules*256)*sizeof(uint16_t)));
+//    cudaCheck(cudaMalloc((void**) & gpu_.hist_d, 10*sizeof(HitsOnGPU::Hist)));
+
+    cudaCheck(cudaMalloc((void**) & gpu_d, sizeof(HitsOnGPU)));  // device-resident copy of the gpu_ pointer table
+    cudaCheck(cudaMemcpyAsync(gpu_d, &gpu_, sizeof(HitsOnGPU), cudaMemcpyDefault,cudaStream.id()));  // enqueued on cudaStream; the constructor does not wait for it
+
   }
 
   PixelRecHitGPUKernel::~PixelRecHitGPUKernel() {
     cudaCheck(cudaFree(gpu_.hitsModuleStart_d));
     cudaCheck(cudaFree(gpu_.charge_d));
+    cudaCheck(cudaFree(gpu_.detInd_d));
     cudaCheck(cudaFree(gpu_.xg_d));
     cudaCheck(cudaFree(gpu_.yg_d));
     cudaCheck(cudaFree(gpu_.zg_d));
+    cudaCheck(cudaFree(gpu_.rg_d));
+    cudaCheck(cudaFree(gpu_.xl_d));
+    cudaCheck(cudaFree(gpu_.yl_d));
     cudaCheck(cudaFree(gpu_.xerr_d));
     cudaCheck(cudaFree(gpu_.yerr_d));
+    cudaCheck(cudaFree(gpu_.iphi_d));
+    cudaCheck(cudaFree(gpu_.sortIndex_d));
     cudaCheck(cudaFree(gpu_.mr_d));
+    cudaCheck(cudaFree(gpu_.mc_d));
+    // cudaCheck(cudaFree(gpu_.hist_d));
+
+    cudaCheck(cudaFree(gpu_d));
   }
 
   void PixelRecHitGPUKernel::makeHitsAsync(const siPixelRawToClusterHeterogeneousProduct::GPUProduct& input,
+                                           float const * bs,
                                            pixelCPEforGPU::ParamsOnGPU const * cpeParams,
                                            cuda::stream_t<>& stream) {
+
+   cudaCheck(cudaMemcpyAsync(gpu_.bs_d, bs, 3*sizeof(float), cudaMemcpyDefault, stream.id()));
+
     thrust::exclusive_scan(thrust::cuda::par.on(stream.id()),
                            input.clusInModule_d,
                            input.clusInModule_d + gpuClustering::MaxNumModules + 1,
@@ -51,6 +80,7 @@ namespace pixelgpudetails {
     int blocks = input.nModules; // active modules (with digis)
     gpuPixelRecHits::getHits<<<blocks, threadsPerBlock, 0, stream.id()>>>(
       cpeParams,
+      gpu_.bs_d,
       input.moduleInd_d,
       input.xx_d, input.yy_d, input.adc_d,
       input.moduleStart_d,
@@ -59,27 +89,50 @@ namespace pixelgpudetails {
       input.nDigis,
       gpu_.hitsModuleStart_d,
       gpu_.charge_d,
-      gpu_.xg_d, gpu_.yg_d, gpu_.zg_d,
-      gpu_.xerr_d, gpu_.yerr_d, gpu_.mr_d,
-      true // for the time being stay local...
+      gpu_.detInd_d,
+      gpu_.xg_d, gpu_.yg_d, gpu_.zg_d, gpu_.rg_d,
+      gpu_.iphi_d,
+      gpu_.xl_d, gpu_.yl_d,
+      gpu_.xerr_d, gpu_.yerr_d, 
+      gpu_.mr_d, gpu_.mc_d
     );
 
     // needed only if hits on CPU are required...
     cudaCheck(cudaMemcpyAsync(hitsModuleStart_, gpu_.hitsModuleStart_d, (gpuClustering::MaxNumModules+1) * sizeof(uint32_t), cudaMemcpyDefault, stream.id()));
+
+    // to be moved to gpu?
+    auto nhits = hitsModuleStart_[gpuClustering::MaxNumModules];
+    for (int i=0;i<10;++i) hitsLayerStart_[i]=hitsModuleStart_[phase1PixelTopology::layerStart[i]];
+    hitsLayerStart_[10]=nhits;
+
+    std::cout << "hit layerStart "; 
+    for (int i=0;i<10;++i) std::cout << phase1PixelTopology::layerName[i] << ':' << hitsLayerStart_[i] << ' ';
+    std::cout << "end:" << hitsLayerStart_[10] << std::endl;
+
+    cudaCheck(cudaMemcpyAsync(gpu_.hitsLayerStart_d, hitsLayerStart_, (11) * sizeof(uint32_t), cudaMemcpyDefault, stream.id()));
+
+    // for timing test
+    // radixSortMultiWrapper<int16_t><<<10, 256, 0, c.stream>>>(gpu_.iphi_d,gpu_.sortIndex_d,gpu_.hitsLayerStart_d);
+
+    // fillManyFromVector(gpu_.hist_d,10,gpu_.iphi_d, gpu_.hitsLayerStart_d, nhits,256,c.stream);
+
+
   }
 
   HitsOnCPU PixelRecHitGPUKernel::getOutput(cuda::stream_t<>& stream) const {  // copies the per-hit arrays back to a HitsOnCPU and returns it once the copies are done
     // needed only if hits on CPU are required...
     auto nhits = hitsModuleStart_[gpuClustering::MaxNumModules];  // last entry of the host-side hitsModuleStart_ array
 
     HitsOnCPU hoc(nhits);
+    hoc.gpu_d = gpu_d;  // the device-side HitsOnGPU pointer travels with the host product
     memcpy(hoc.hitsModuleStart, hitsModuleStart_, (gpuClustering::MaxNumModules+1) * sizeof(uint32_t));
     cudaCheck(cudaMemcpyAsync(hoc.charge.data(), gpu_.charge_d, nhits*sizeof(uint32_t), cudaMemcpyDefault, stream.id()));  // NOTE(review): byte count uses sizeof(uint32_t) here and for the float arrays below; assumes 4-byte elements
-    cudaCheck(cudaMemcpyAsync(hoc.xl.data(), gpu_.xg_d, nhits*sizeof(uint32_t), cudaMemcpyDefault, stream.id()));
-    cudaCheck(cudaMemcpyAsync(hoc.yl.data(), gpu_.yg_d, nhits*sizeof(uint32_t), cudaMemcpyDefault, stream.id()));
+    cudaCheck(cudaMemcpyAsync(hoc.xl.data(), gpu_.xl_d, nhits*sizeof(uint32_t), cudaMemcpyDefault, stream.id()));
+    cudaCheck(cudaMemcpyAsync(hoc.yl.data(), gpu_.yl_d, nhits*sizeof(uint32_t), cudaMemcpyDefault, stream.id()));
     cudaCheck(cudaMemcpyAsync(hoc.xe.data(), gpu_.xerr_d, nhits*sizeof(uint32_t), cudaMemcpyDefault, stream.id()));
     cudaCheck(cudaMemcpyAsync(hoc.ye.data(), gpu_.yerr_d, nhits*sizeof(uint32_t), cudaMemcpyDefault, stream.id()));
     cudaCheck(cudaMemcpyAsync(hoc.mr.data(), gpu_.mr_d, nhits*sizeof(uint16_t), cudaMemcpyDefault, stream.id()));
+    cudaCheck(cudaMemcpyAsync(hoc.mc.data(), gpu_.mc_d, nhits*sizeof(uint16_t), cudaMemcpyDefault, stream.id()));
     cudaCheck(cudaStreamSynchronize(stream.id()));  // wait for every copy enqueued above before handing hoc to the caller
     return hoc;
   }

diff --git a/RecoLocalTracker/SiPixelRecHits/plugins/PixelRecHits.h b/RecoLocalTracker/SiPixelRecHits/plugins/PixelRecHits.h
@@ -9,32 +9,21 @@
 #include <cstdint>
 #include <vector>
 
+#include "RecoLocalTracker/SiPixelRecHits/plugins/siPixelRecHitsHeterogeneousProduct.h" 
+
+
 namespace pixelCPEforGPU {
   struct ParamsOnGPU;
 }
 
 namespace pixelgpudetails {
-  struct HitsOnGPU{
-    uint32_t * hitsModuleStart_d;
-    int32_t  * charge_d;
-    float *xg_d, *yg_d, *zg_d;
-    float *xerr_d, *yerr_d;
-    uint16_t * mr_d;
-  };
+  using HitsOnGPU = siPixelRecHitsHeterogeneousProduct::HitsOnGPU;
 
-  struct HitsOnCPU {
-    explicit HitsOnCPU(uint32_t nhits) :
-      charge(nhits),xl(nhits),yl(nhits),xe(nhits),ye(nhits), mr(nhits){}
-    uint32_t hitsModuleStart[2001];
-    std::vector<int32_t> charge;
-    std::vector<float> xl, yl;
-    std::vector<float> xe, ye;
-    std::vector<uint16_t> mr;
-  };
+  using HitsOnCPU = siPixelRecHitsHeterogeneousProduct::HitsOnCPU;
 
   class PixelRecHitGPUKernel {  // owns the device buffers for the pixel rechits and drives the rechit kernel
   public:
-    PixelRecHitGPUKernel();
+    PixelRecHitGPUKernel(cuda::stream_t<>& cudaStream);  // cudaStream: stream used to upload the pointer table to gpu_d
     ~PixelRecHitGPUKernel();
 
     PixelRecHitGPUKernel(const PixelRecHitGPUKernel&) = delete;
@@ -43,14 +32,17 @@ namespace pixelgpudetails {
     PixelRecHitGPUKernel& operator=(PixelRecHitGPUKernel&&) = delete;
 
     void makeHitsAsync(const siPixelRawToClusterHeterogeneousProduct::GPUProduct& input,
+                       float const * bs,
                        pixelCPEforGPU::ParamsOnGPU const * cpeParams,
                        cuda::stream_t<>& stream);
 
     HitsOnCPU getOutput(cuda::stream_t<>& stream) const;  // copies the hit arrays to the host and synchronizes the stream
 
   private:
+    HitsOnGPU * gpu_d;  // copy of the structure on the gpu itself: this is the "Product" 
     HitsOnGPU gpu_;  // host-side table of device pointers, filled by cudaMalloc in the constructor
     uint32_t hitsModuleStart_[gpuClustering::MaxNumModules+1];  // host copy of gpu_.hitsModuleStart_d, refreshed by makeHitsAsync
+    uint32_t hitsLayerStart_[11];  // filled by makeHitsAsync: entries 0-9 taken from hitsModuleStart_ at the layerStart indices, entry 10 is the total hit count
   };
 }