Skip to content

Commit

Permalink
Merge pull request #10 from hatakeyamak/PFRecHitAndCluster_GPU_12_5_h…
Browse files Browse the repository at this point in the history
…ackason2_tmp

kernel optimisations. utilize produceLegacy switch.
  • Loading branch information
hatakeyamak authored Oct 5, 2022
2 parents cd7994d + eb14f9a commit af2e681
Show file tree
Hide file tree
Showing 4 changed files with 19 additions and 42 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -23,7 +23,7 @@ def customizeHcalOnlyForProfilingGPUOnly(process):
# Currently, this means:
# - running the unpacker on CPU, converting the digis into SoA format and copying them to GPU
# - running the HBHE local reconstruction, including MAHI, on GPU.
# - running the HBHE PFRecHit and PFCluster producers on GPU [including copy of them to host, as we cannot untangle the copy back at the moment]
# - running the HBHE PFRecHit and PFCluster producers on GPU [without copy of them to host]
def customizeHcalPFOnlyForProfilingGPUOnly(process):

process.consumer = cms.EDAnalyzer("GenericConsumer",
Expand All @@ -33,6 +33,8 @@ def customizeHcalPFOnlyForProfilingGPUOnly(process):
process.consume_step = cms.EndPath(process.consumer)

process.schedule = cms.Schedule(process.raw2digi_step, process.reconstruction_step, process.consume_step)
process.particleFlowClusterHBHEOnly.cuda.produceLegacy = cms.bool(False)
process.particleFlowRecHitHBHEOnly.cuda.produceLegacy = cms.bool(False)

return process

Expand Down
50 changes: 9 additions & 41 deletions RecoParticleFlow/PFClusterProducer/plugins/PFClusterCudaHCAL.cu
Original file line number Diff line number Diff line change
Expand Up @@ -75,7 +75,7 @@ namespace PFClusterCudaHCAL {
// PFRechitToPFCluster_HCAL_entryPoint
// seedingTopoThreshKernel_HCAL: apply seeding/topo-clustering threshold to RecHits, also ensure a peak (outputs: pfrh_isSeed, pfrh_passTopoThresh) [OutputDataGPU]
// prepareTopoInputs: prepare "edge" data (outputs: nEdges, pfrh_edgeId, pfrh_edgeList [nEdges dimension])
// topoClusterLinking: run topo clustering (output: pfrh_topoId)
// topoClusterLinking(KH): run topo clustering (output: pfrh_topoId)
// topoClusterContraction: find parent of parent (or parent (of parent ...)) (outputs: pfrh_parent, topoSeedCount, topoSeedOffsets, topoSeedList, seedFracOffsets, pcrhfracind, pcrhfrac)
// fillRhfIndex: fill rhfracind (PFCluster RecHitFraction constituent PFRecHit indices)
// hcalFastCluster_selection
Expand Down Expand Up @@ -3899,6 +3899,8 @@ namespace PFClusterCudaHCAL {
*topoIter = 0;
nEdges = *nEdgesIn;
gridStride = blockDim.x * gridDim.x; // For single block kernel this is the number of threads
notDone = 0;
notDone2 = 0;
}

__syncthreads();
Expand All @@ -3915,35 +3917,22 @@ namespace PFClusterCudaHCAL {

// __syncthreads();

// Explicitly initialize pfrh_parent
for (int i = start; i < nRH; i += gridStride) {
pfrh_parent[i] = i;
}

__syncthreads();

// for notDone
if (threadIdx.x == 0) {
notDone = 0;
notDone2 = 0;
//printf("gridStride, blockDim.x %d %d\n",gridStride,blockDim.x);
}

__syncthreads();

// (1) First attempt
// First attempt of topo clustering
// First edge [set parents to those smaller numbers]
for (int idx = start; idx < nEdges; idx += gridStride) {
int i = pfrh_edgeId[idx]; // Get edge topo id
if (pfrh_edgeMask[idx] > 0 && isLeftEdgeKH(idx, nEdges, pfrh_edgeId, pfrh_edgeMask)) { // isLeftEdgeKH
pfrh_parent[i] = (int)min(i, pfrh_edgeList[idx]);
} else {
pfrh_parent[i] = i;
}
}

__syncthreads();

// KenH
for (int ii=0; ii<100; ii++) { // loop until topo clustering iteration converges
//
// loop until topo clustering iteration converges
for (int ii=0; ii<100; ii++) {

// for notDone
if (threadIdx.x == 0) {
Expand Down Expand Up @@ -3995,27 +3984,6 @@ namespace PFClusterCudaHCAL {

} // for-loop ii

__syncthreads();

// Follow parents of parents .... to contract parent structure
do {
volatile bool threadNotDone = false;
for (int i = threadIdx.x; i < nRH; i += blockDim.x) {
int parent = pfrh_parent[i];
if (parent >= 0 && parent != pfrh_parent[parent]) {
threadNotDone = true;
pfrh_parent[i] = pfrh_parent[parent];
}
}
if (threadIdx.x == 0)
notDone = 0;
__syncthreads();

atomicAdd(&notDone, (int)threadNotDone);
__syncthreads();

} while (notDone);

}

__device__ __forceinline__ void sortSwap(int* toSort, int a, int b) {
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -111,11 +111,14 @@
positionReCalc = cms.PSet(),
energyCorrector = cms.PSet()
)
_module_config_cuda = _module_config.clone()

#### PF CLUSTER HCAL ####
_particleFlowClusterHBHE_cpu = cms.EDProducer("PFClusterProducer", _module_config.clone())
_particleFlowClusterHBHE_cuda = cms.EDProducer("PFClusterProducerCudaHCAL", _module_config.clone())
_particleFlowClusterHBHE_cuda.PFRecHitsLabelIn = cms.InputTag("particleFlowRecHitHBHE","")
_particleFlowClusterHBHE_cuda.produceLegacy = cms.bool(True)
_particleFlowClusterHBHE_cuda.produceSoA = cms.bool(True)

#####

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -44,6 +44,10 @@

_module_pset_cuda = _module_pset.clone()
_module_pset_cuda.producers[0].src= "hbheRecHitProducerGPU" # use GPU version as input instead of legacy version
_module_pset_cuda.produceSoA = cms.bool(True)
_module_pset_cuda.produceLegacy = cms.bool(True)
_module_pset_cuda.produceCleanedLegacy = cms.bool(True)

for idx, x in enumerate(_module_pset_cuda.producers):
for idy, y in enumerate(x.qualityTests):
if y.name._value == "PFRecHitQTestHCALThresholdVsDepth": # when applying phase1 depth-dependent HCAL thresholds
Expand Down

0 comments on commit af2e681

Please sign in to comment.