Skip to content

Commit

Permalink
Merge pull request #10 from hatakeyamak/PFRecHitAndCluster_GPU_12_5_h…
Browse files Browse the repository at this point in the history
…ackason2_tmp

kernel optimisations. utilize produceLegacy switch.
  • Loading branch information
hatakeyamak authored Oct 5, 2022
2 parents cd7994d + eb14f9a commit af2e681
Show file tree
Hide file tree
Showing 4 changed files with 19 additions and 42 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -23,7 +23,7 @@ def customizeHcalOnlyForProfilingGPUOnly(process):
# Currently, this means:
# - running the unpacker on CPU, converting the digis into SoA format and copying them to GPU
# - running the HBHE local reconstruction, including MAHI, on GPU.
# - running the HBHE PFRecHit and PFCluster producers on GPU [including copy of them to host, as we cannot untangle the copy back at the moment]
# - running the HBHE PFRecHit and PFCluster producers on GPU [without copy of them to host]
def customizeHcalPFOnlyForProfilingGPUOnly(process):

process.consumer = cms.EDAnalyzer("GenericConsumer",
Expand All @@ -33,6 +33,8 @@ def customizeHcalPFOnlyForProfilingGPUOnly(process):
process.consume_step = cms.EndPath(process.consumer)

process.schedule = cms.Schedule(process.raw2digi_step, process.reconstruction_step, process.consume_step)
process.particleFlowClusterHBHEOnly.cuda.produceLegacy = cms.bool(False)
process.particleFlowRecHitHBHEOnly.cuda.produceLegacy = cms.bool(False)

return process

Expand Down
50 changes: 9 additions & 41 deletions RecoParticleFlow/PFClusterProducer/plugins/PFClusterCudaHCAL.cu
Original file line number Diff line number Diff line change
Expand Up @@ -75,7 +75,7 @@ namespace PFClusterCudaHCAL {
// PFRechitToPFCluster_HCAL_entryPoint
// seedingTopoThreshKernel_HCAL: apply seeding/topo-clustering threshold to RecHits, also ensure a peak (outputs: pfrh_isSeed, pfrh_passTopoThresh) [OutputDataGPU]
// prepareTopoInputs: prepare "edge" data (outputs: nEdges, pfrh_edgeId, pfrh_edgeList [nEdges dimension])
// topoClusterLinking: run topo clustering (output: pfrh_topoId)
// topoClusterLinking(KH): run topo clustering (output: pfrh_topoId)
// topoClusterContraction: find parent of parent (or parent (of parent ...)) (outputs: pfrh_parent, topoSeedCount, topoSeedOffsets, topoSeedList, seedFracOffsets, pcrhfracind, pcrhfrac)
// fillRhfIndex: fill rhfracind (PFCluster RecHitFraction constituent PFRecHit indices)
// hcalFastCluster_selection
Expand Down Expand Up @@ -3899,6 +3899,8 @@ namespace PFClusterCudaHCAL {
*topoIter = 0;
nEdges = *nEdgesIn;
gridStride = blockDim.x * gridDim.x; // For single block kernel this is the number of threads
notDone = 0;
notDone2 = 0;
}

__syncthreads();
Expand All @@ -3915,35 +3917,22 @@ namespace PFClusterCudaHCAL {

// __syncthreads();

// Explicitly initialize pfrh_parent
for (int i = start; i < nRH; i += gridStride) {
pfrh_parent[i] = i;
}

__syncthreads();

// for notDone
if (threadIdx.x == 0) {
notDone = 0;
notDone2 = 0;
//printf("gridStride, blockDim.x %d %d\n",gridStride,blockDim.x);
}

__syncthreads();

// (1) First attempt
// First attempt of topo clustering
// First edge [set parents to those smaller numbers]
for (int idx = start; idx < nEdges; idx += gridStride) {
int i = pfrh_edgeId[idx]; // Get edge topo id
if (pfrh_edgeMask[idx] > 0 && isLeftEdgeKH(idx, nEdges, pfrh_edgeId, pfrh_edgeMask)) { // isLeftEdgeKH
pfrh_parent[i] = (int)min(i, pfrh_edgeList[idx]);
} else {
pfrh_parent[i] = i;
}
}

__syncthreads();

// KenH
for (int ii=0; ii<100; ii++) { // loop until topo clustering iteration converges
//
// loop until topo clustering iteration converges
for (int ii=0; ii<100; ii++) {

// for notDone
if (threadIdx.x == 0) {
Expand Down Expand Up @@ -3995,27 +3984,6 @@ namespace PFClusterCudaHCAL {

} // for-loop ii

__syncthreads();

// Follow parents of parents .... to contract parent structure
do {
volatile bool threadNotDone = false;
for (int i = threadIdx.x; i < nRH; i += blockDim.x) {
int parent = pfrh_parent[i];
if (parent >= 0 && parent != pfrh_parent[parent]) {
threadNotDone = true;
pfrh_parent[i] = pfrh_parent[parent];
}
}
if (threadIdx.x == 0)
notDone = 0;
__syncthreads();

atomicAdd(&notDone, (int)threadNotDone);
__syncthreads();

} while (notDone);

}

__device__ __forceinline__ void sortSwap(int* toSort, int a, int b) {
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -111,11 +111,14 @@
positionReCalc = cms.PSet(),
energyCorrector = cms.PSet()
)
_module_config_cuda = _module_config.clone()

#### PF CLUSTER HCAL ####
_particleFlowClusterHBHE_cpu = cms.EDProducer("PFClusterProducer", _module_config.clone())
_particleFlowClusterHBHE_cuda = cms.EDProducer("PFClusterProducerCudaHCAL", _module_config.clone())
_particleFlowClusterHBHE_cuda.PFRecHitsLabelIn = cms.InputTag("particleFlowRecHitHBHE","")
_particleFlowClusterHBHE_cuda.produceLegacy = cms.bool(True)
_particleFlowClusterHBHE_cuda.produceSoA = cms.bool(True)

#####

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -44,6 +44,10 @@

_module_pset_cuda = _module_pset.clone()
_module_pset_cuda.producers[0].src= "hbheRecHitProducerGPU" # use GPU version as input instead of legacy version
_module_pset_cuda.produceSoA = cms.bool(True)
_module_pset_cuda.produceLegacy = cms.bool(True)
_module_pset_cuda.produceCleanedLegacy = cms.bool(True)

for idx, x in enumerate(_module_pset_cuda.producers):
for idy, y in enumerate(x.qualityTests):
if y.name._value == "PFRecHitQTestHCALThresholdVsDepth": # when applying phase1 depth-dependent HCAL thresholds
Expand Down

0 comments on commit af2e681

Please sign in to comment.