From 8ae2ba80def631aaecd104fb9c7c3a8f757f527d Mon Sep 17 00:00:00 2001 From: Kenichi Hatakeyama Date: Fri, 30 Sep 2022 07:27:30 -0500 Subject: [PATCH 01/11] temporary fix for shared memory bool going out of bounds. --- .../plugins/PFClusterCudaHCAL.cu | 20 +++++++++---------- 1 file changed, 10 insertions(+), 10 deletions(-) diff --git a/RecoParticleFlow/PFClusterProducer/plugins/PFClusterCudaHCAL.cu b/RecoParticleFlow/PFClusterProducer/plugins/PFClusterCudaHCAL.cu index d29e3794aa3b1..2a495a2d9fa79 100644 --- a/RecoParticleFlow/PFClusterProducer/plugins/PFClusterCudaHCAL.cu +++ b/RecoParticleFlow/PFClusterProducer/plugins/PFClusterCudaHCAL.cu @@ -73,9 +73,9 @@ namespace PFClusterCudaHCAL { // --- kernel summary -- // initializeCudaConstants // PFRechitToPFCluster_HCAL_entryPoint - // seedingTopoThreshKernel_HCAL - // prepareTopoInputs - // topoClusterLinking + // seedingTopoThreshKernel_HCAL: apply seeding/topo-clustering threshold to RecHits, also ensure a peak (outputs: pfrh_isSeed, pfrh_passTopoThresh) [OutputDataGPU] + // prepareTopoInputs: prepare "edge" data (output: nEdges, pfrh_edgeId, pfrh_edgeList [nEdges dimension]) + // topoClusterLinking: // topoClusterContraction // fillRhfIndex // hcalFastCluster_selection @@ -3766,7 +3766,7 @@ namespace PFClusterCudaHCAL { int* pfrh_edgeMask, const int* pfrh_passTopoThresh, int* topoIter) { - __shared__ bool notDone; + __shared__ int notDone; // This is better be bool, but somehow it leads to out of bound __shared__ int iter, gridStride, nEdges; int start = blockIdx.x * blockDim.x + threadIdx.x; @@ -3790,7 +3790,7 @@ namespace PFClusterCudaHCAL { do { if (threadIdx.x == 0) { - notDone = false; + notDone = 0; } __syncthreads(); @@ -3814,7 +3814,7 @@ namespace PFClusterCudaHCAL { // edgeMask set to true if elements of edgeId and edgeList are different if (pfrh_edgeId[idx] != pfrh_edgeList[idx]) { pfrh_edgeMask[idx] = 1; - notDone = true; + notDone = 1; } else { pfrh_edgeMask[idx] = 0; } @@ -3825,13 +3825,13 @@ namespace PFClusterCudaHCAL { __syncthreads(); - if (!notDone) + if (notDone==0) break; __syncthreads();//!! if (threadIdx.x == 0) { - notDone = false; + notDone = 0; // KenH is this necessary?
} __syncthreads(); @@ -3858,7 +3858,7 @@ namespace PFClusterCudaHCAL { // edgeMask set to true if elements of edgeId and edgeList are different if (pfrh_edgeId[idx] != pfrh_edgeList[idx]) { pfrh_edgeMask[idx] = 1; - notDone = true; + notDone = 1; } else { pfrh_edgeMask[idx] = 0; } @@ -3870,7 +3870,7 @@ namespace PFClusterCudaHCAL { __syncthreads(); - } while (notDone); + } while (notDone==1); *topoIter = iter; #ifdef DEBUG_GPU_HCAL From f5f6d63e4723598d6d4a5d5a413e470859e1d1a1 Mon Sep 17 00:00:00 2001 From: Kenichi Hatakeyama Date: Mon, 3 Oct 2022 01:29:53 -0500 Subject: [PATCH 02/11] memory allocation adjustment --- RecoParticleFlow/PFClusterProducer/plugins/DeclsForKernels.h | 3 ++- .../PFClusterProducer/plugins/PFClusterProducerCudaHCAL.cc | 1 + 2 files changed, 3 insertions(+), 1 deletion(-) diff --git a/RecoParticleFlow/PFClusterProducer/plugins/DeclsForKernels.h b/RecoParticleFlow/PFClusterProducer/plugins/DeclsForKernels.h index 46310729d5a5e..02b1930d66aec 100644 --- a/RecoParticleFlow/PFClusterProducer/plugins/DeclsForKernels.h +++ b/RecoParticleFlow/PFClusterProducer/plugins/DeclsForKernels.h @@ -161,7 +161,7 @@ namespace PFRecHit { namespace PFClustering { namespace HCAL { struct ConfigurationParameters { - uint32_t maxRH = 3000; // previously: 2000 + uint32_t maxRH = 4000; // previously: 2000 uint32_t maxPFCFracs = 600000; // previously: 80000 uint32_t maxNeighbors = 8; }; @@ -332,6 +332,7 @@ namespace PFClustering { } }; } // namespace HCAL + } // namespace PFClustering #endif // RecoParticleFlow_PFClusterProducer_plugins_DeclsForKernels_h diff --git a/RecoParticleFlow/PFClusterProducer/plugins/PFClusterProducerCudaHCAL.cc b/RecoParticleFlow/PFClusterProducer/plugins/PFClusterProducerCudaHCAL.cc index 4cf0d9525b98e..aace914db3ab7 100644 --- a/RecoParticleFlow/PFClusterProducer/plugins/PFClusterProducerCudaHCAL.cc +++ b/RecoParticleFlow/PFClusterProducer/plugins/PFClusterProducerCudaHCAL.cc @@ -317,6 +317,7 @@ void PFClusterProducerCudaHCAL::acquire(edm::Event const& event, nRH_ = PFRecHits.size; if (nRH_ == 0) return; + if (nRH_>3000) std::cout << "nRH(PFRecHitSize)>3000: " << nRH_ << std::endl; const int numbytes_int = nRH_ * sizeof(int); int totalNeighbours = 0; // Running count of 8 neighbour edges for edgeId, edgeList From 8f30bee3fe1d063d2ab497c9c994209bd9a08f47 Mon Sep 17 00:00:00 2001 From: Kenichi Hatakeyama Date: Mon, 3 Oct 2022 22:33:45 -0500 Subject: [PATCH 03/11] memory allocation param updates.
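Brings the producer-side warning in line with maxRH = 4000 and trims maxPFCFracs back from the temporary 600000 to 200000. For reference, the guard this series scatters around the fixed-size device buffers amounts to the following host-side sketch; the constant and function names here are hypothetical, and a production version would report through the CMSSW MessageLogger rather than printf:

    #include <cstdio>

    // Hypothetical mirrors of the caps in ConfigurationParameters (DeclsForKernels.h)
    constexpr unsigned int kMaxRH = 4000;         // max PFRecHits per event
    constexpr unsigned int kMaxPFCFracs = 200000; // max PFCluster rechit fractions per event

    // Returns false (and warns) when an event would overrun the preallocated
    // device buffers, instead of letting the kernels write out of bounds.
    inline bool withinAllocationBounds(unsigned int nRH, unsigned int nFracs) {
      bool ok = true;
      if (nRH > kMaxRH) {
        std::printf("nRH(PFRecHitSize)>%u: %u\n", kMaxRH, nRH);
        ok = false;
      }
      if (nFracs > kMaxPFCFracs) {
        std::printf("pcrhFracSize>%u: %u\n", kMaxPFCFracs, nFracs);
        ok = false;
      }
      return ok;
    }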
--- RecoParticleFlow/PFClusterProducer/plugins/DeclsForKernels.h | 2 +- .../PFClusterProducer/plugins/PFClusterProducerCudaHCAL.cc | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/RecoParticleFlow/PFClusterProducer/plugins/DeclsForKernels.h b/RecoParticleFlow/PFClusterProducer/plugins/DeclsForKernels.h index 02b1930d66aec..f27e576afe3b0 100644 --- a/RecoParticleFlow/PFClusterProducer/plugins/DeclsForKernels.h +++ b/RecoParticleFlow/PFClusterProducer/plugins/DeclsForKernels.h @@ -162,7 +162,7 @@ namespace PFClustering { namespace HCAL { struct ConfigurationParameters { uint32_t maxRH = 4000; // previously: 2000 - uint32_t maxPFCFracs = 600000; // previously: 80000 + uint32_t maxPFCFracs = 200000; // previously: 80000 uint32_t maxNeighbors = 8; }; diff --git a/RecoParticleFlow/PFClusterProducer/plugins/PFClusterProducerCudaHCAL.cc b/RecoParticleFlow/PFClusterProducer/plugins/PFClusterProducerCudaHCAL.cc index aace914db3ab7..3dd5a66718421 100644 --- a/RecoParticleFlow/PFClusterProducer/plugins/PFClusterProducerCudaHCAL.cc +++ b/RecoParticleFlow/PFClusterProducer/plugins/PFClusterProducerCudaHCAL.cc @@ -317,7 +317,7 @@ void PFClusterProducerCudaHCAL::acquire(edm::Event const& event, nRH_ = PFRecHits.size; if (nRH_ == 0) return; - if (nRH_>3000) std::cout << "nRH(PFRecHitSize)>3000: " << nRH_ << std::endl; + if (nRH_>4000) std::cout << "nRH(PFRecHitSize)>4000: " << nRH_ << std::endl; const int numbytes_int = nRH_ * sizeof(int); int totalNeighbours = 0; // Running count of 8 neighbour edges for edgeId, edgeList From a6fe13943b5646c2388edee1ef9a16527281cc96 Mon Sep 17 00:00:00 2001 From: Kenichi Hatakeyama Date: Mon, 3 Oct 2022 22:34:42 -0500 Subject: [PATCH 04/11] Ad-hoc kernel for topo linking. --- .../plugins/PFClusterCudaHCAL.cu | 312 ++++++++++++++---- 1 file changed, 252 insertions(+), 60 deletions(-) diff --git a/RecoParticleFlow/PFClusterProducer/plugins/PFClusterCudaHCAL.cu b/RecoParticleFlow/PFClusterProducer/plugins/PFClusterCudaHCAL.cu index 2a495a2d9fa79..16a1605e4247f 100644 --- a/RecoParticleFlow/PFClusterProducer/plugins/PFClusterCudaHCAL.cu +++ b/RecoParticleFlow/PFClusterProducer/plugins/PFClusterCudaHCAL.cu @@ -74,10 +74,10 @@ namespace PFClusterCudaHCAL { // initializeCudaConstants // PFRechitToPFCluster_HCAL_entryPoint // seedingTopoThreshKernel_HCAL: apply seeding/topo-clustering threshold to RecHits, also ensure a peak (outputs: pfrh_isSeed, pfrh_passTopoThresh) [OutputDataGPU] - // prepareTopoInputs: prepare "edge" data (output: nEdges, pfrh_edgeId, pfrh_edgeList [nEdges dimension]) - // topoClusterLinking: - // topoClusterContraction - // fillRhfIndex + // prepareTopoInputs: prepare "edge" data (outputs: nEdges, pfrh_edgeId, pfrh_edgeList [nEdges dimension]) + // topoClusterLinking: run topo clustering (output: pfrh_topoId) + // topoClusterContraction: find parent of parent (or parent (of parent ...)) (outputs: pfrh_parent, topoSeedCount, topoSeedOffsets, topoSeedList, seedFracOffsets, pcrhfracind, pcrhfrac) + // fillRhfIndex: fill rhfracind (PFCluster RecHitFraction constituent PFRecHit indices) // hcalFastCluster_selection // dev_hcalFastCluster_optimizedSimple // dev_hcalFastCluster_optimizedComplex @@ -1830,7 +1830,7 @@ namespace PFClusterCudaHCAL { pos4.z += rechitPos.z * norm; pos4.w += norm; // position_norm }; - /* + /* auto computeClusterPosAtomic = [&] (float4& pos4, float _frac, int rhInd, bool isDebug) { float4 rechitPos = make_float4(pfrh_x[rhInd], pfrh_y[rhInd], pfrh_z[rhInd], 1.0); @@ -1839,7 +1839,7 @@ namespace PFClusterCudaHCAL { 
(_frac < minFractionInCalc ? 0.0f : max(0.0f, logf(rh_energy * rhENormInv))); if (isDebug) printf("\t\t\trechit %d: norm = %f\tfrac = %f\trh_energy = %f\tpos = (%f, %f, %f)\n", rhInd, norm, _frac, rh_energy, rechitPos.x, rechitPos.y, rechitPos.z); - + atomicAdd(&pos4.x, rechitPos.x * norm); atomicAdd(&pos4.y, rechitPos.y * norm); atomicAdd(&pos4.z, rechitPos.z * norm); @@ -2168,7 +2168,7 @@ namespace PFClusterCudaHCAL { pos4.z += rechitPos.z * norm; pos4.w += norm; // position_norm }; - /* + /* auto computeClusterPosAtomic = [&] (float4& pos4, float _frac, int rhInd, bool isDebug) { float4 rechitPos = make_float4(pfrh_x[rhInd], pfrh_y[rhInd], pfrh_z[rhInd], 1.0); @@ -2177,7 +2177,7 @@ namespace PFClusterCudaHCAL { (_frac < minFractionInCalc ? 0.0f : max(0.0f, logf(rh_energy * rhENormInv))); if (isDebug) printf("\t\t\trechit %d: norm = %f\tfrac = %f\trh_energy = %f\tpos = (%f, %f, %f)\n", rhInd, norm, _frac, rh_energy, rechitPos.x, rechitPos.y, rechitPos.z); - + atomicAdd(&pos4.x, rechitPos.x * norm); atomicAdd(&pos4.y, rechitPos.y * norm); atomicAdd(&pos4.z, rechitPos.z * norm); @@ -3520,38 +3520,11 @@ namespace PFClusterCudaHCAL { } } - // Contraction in a single block - __global__ void topoClusterContraction(size_t size, int* pfrh_parent, int* pfrh_isSeed) { - __shared__ int notDone; - if (threadIdx.x == 0) - notDone = 0; - __syncthreads(); - - do { - volatile bool threadNotDone = false; - for (int i = threadIdx.x; i < size; i += blockDim.x) { - int parent = pfrh_parent[i]; - if (parent >= 0 && parent != pfrh_parent[parent]) { - threadNotDone = true; - pfrh_parent[i] = pfrh_parent[parent]; - } - } - if (threadIdx.x == 0) - notDone = 0; - __syncthreads(); - - atomicAdd(¬Done, (int)threadNotDone); - //if (threadNotDone) notDone = true; - //notDone |= threadNotDone; - __syncthreads(); - - } while (notDone); - } - // Contraction in a single block __global__ void topoClusterContraction(size_t size, int* pfrh_parent, int* pfrh_isSeed, + //const int* pfrh_neighbours, // temporary inputs for debuggi int* rhCount, int* topoSeedCount, int* topoRHCount, @@ -3588,6 +3561,30 @@ namespace PFClusterCudaHCAL { } while (notDone); + // // debugging printing block + // __syncthreads(); + // if (threadIdx.x == 0) { + // int nnode=0; + // for (int i = 0; i < size; i++) { + // //printf("final pfrh_id,parent: %d %d\n",i,pfrh_parent[i]); + // if (i==pfrh_parent[i]) nnode++; + // } + // printf("pfrh_parent 3 multiplicity: %d\n",nnode); + // for (int pos = 0; pos < size; pos++) { + // int parent_target = pfrh_parent[pos]; + // for (int i = 0; i < 8; i++) { + // int neighbor_id = pfrh_neighbours[pos * 8 + i]; + // if (neighbor_id>-1){ // valid neighbors + // int parent_neighbor = pfrh_parent[neighbor_id]; + // if (parent_target!=parent_neighbor){ + // printf("hmm. they should have the same parent, but they don't. why... 
%d %d\n",pos,neighbor_id); + // } + // } + // } + // } + // } + // __syncthreads(); + // Now determine the number of seeds and rechits in each topo cluster for (int rhIdx = threadIdx.x; rhIdx < size; rhIdx += blockDim.x) { int topoId = pfrh_parent[rhIdx]; @@ -3640,8 +3637,10 @@ namespace PFClusterCudaHCAL { __syncthreads(); if (threadIdx.x == 0) { *pcrhFracSize = totalSeedFracOffset; - //printf("At the end of topoClusterContraction, found *pcrhFracSize = %d\n", *pcrhFracSize); + if (*pcrhFracSize>200000) // DeclsForKernels.h maxPFCFracs + printf("At the end of topoClusterContraction, found large *pcrhFracSize = %d\n", *pcrhFracSize); } + } // Prefill the rechit index for all PFCluster fractions @@ -3676,7 +3675,7 @@ namespace PFClusterCudaHCAL { int* pcrhfracind) { //int debugSeedIdx = 500; - /* + /* printf("rhCount = \n["); for (int i = 0; i < (int)nRH; i++) { if (i != 0) printf(", "); @@ -3729,6 +3728,27 @@ namespace PFClusterCudaHCAL { return false; } + // when on the left edge of the edgeId/List block, returns true + __device__ __forceinline__ bool isLeftEdgeKH(const int idx, + const int nEdges, + const int* __restrict__ pfrh_edgeId, + const int* __restrict__ pfrh_edgeMask) { + int temp = idx - 1; + if (idx > 0) { + int edgeId = pfrh_edgeId[idx]; + int tempId = pfrh_edgeId[temp]; + if (edgeId != tempId) { + // Different topo Id here! + return true; + } + } else if (temp < 0) { // idx==0 + return true; + } + + // Invalid index + return false; + } + __device__ __forceinline__ bool isRightEdge(const int idx, const int nEdges, const int* __restrict__ pfrh_edgeId, @@ -3881,6 +3901,176 @@ namespace PFClusterCudaHCAL { #endif } + __global__ void topoClusterLinkingKH(int nRH, + int* nEdgesIn, + //float* pfrh_energy, // Temporary entry for debugging + int* pfrh_parent, + int* pfrh_edgeId, + int* pfrh_edgeList, + int* pfrh_edgeMask, + const int* pfrh_passTopoThresh, + int* topoIter) { + __shared__ int notDone; // This is better be bool, but somehow it leads to out of bound + __shared__ int notDone2; + __shared__ int gridStride, nEdges; + + // Initialization + int start = blockIdx.x * blockDim.x + threadIdx.x; + + if (threadIdx.x == 0) { + *topoIter = 0; + nEdges = *nEdgesIn; + gridStride = blockDim.x * gridDim.x; // For single block kernel this is the number of threads + } + + __syncthreads(); + + // Check if pairs in edgeId,edgeList contain a rh not passing topo threshold + // If found, set the mask to 0 + // But, for now, not using edgeMask hereafter, because the same threshold cut is applied at the PFRecHit level + // for (int idx = start; idx < nEdges; idx += gridStride) { + // if (pfrh_passTopoThresh[pfrh_edgeId[idx]] && pfrh_passTopoThresh[pfrh_edgeList[idx]]) + // pfrh_edgeMask[idx] = 1; + // else + // pfrh_edgeMask[idx] = 0; + // } + + // __syncthreads(); + + // // Print out debugging info + // if (threadIdx.x == 0) { + // // for (int idx = 0; idx < nEdges; idx++) { + // // printf("initial edge id, list, mask: %d %d %d\n",pfrh_edgeId[idx],pfrh_edgeList[idx],pfrh_edgeMask[idx]); + // // //printf("initial edge id, list, mask: %d %d\n",pfrh_edgeId[idx],pfrh_edgeList[idx]); + // // } + // printf("number of eges %d\n",nEdges); + // // for (int i = 0; i < nRH; i++) { + // // printf("initial pfrh_id,parent,energy: %d %d %8.3f\n",i,pfrh_parent[i],pfrh_energy[i]); + // // } + // } + + // __syncthreads(); + + // for notDone + if (threadIdx.x == 0) { + notDone = 0; + notDone2 = 0; + //printf("gridStride, blockDim.x %d %d\n",gridStride,blockDim.x); + } + + __syncthreads(); + + // (1) First attempt + 
// First edge [set parents to those smaller numbers] + for (int idx = start; idx < nEdges; idx += gridStride) { + int i = pfrh_edgeId[idx]; // Get edge topo id + if (pfrh_edgeMask[idx] > 0 && isLeftEdgeKH(idx, nEdges, pfrh_edgeId, pfrh_edgeMask)) { // isLeftEdgeKH + pfrh_parent[i] = (int)min(i, pfrh_edgeList[idx]); + } + } + + __syncthreads(); + + // KenH + for (int ii=0; ii<100; ii++) { // loop until topo clustering iteration converges + + // for notDone + if (threadIdx.x == 0) { + notDone2 = 0; + } + + // Follow parents of parents .... to contract parent structure + do { + volatile bool threadNotDone = false; + for (int i = threadIdx.x; i < nRH; i += blockDim.x) { + int parent = pfrh_parent[i]; + if (parent >= 0 && parent != pfrh_parent[parent]) { + threadNotDone = true; + pfrh_parent[i] = pfrh_parent[parent]; + } + } + if (threadIdx.x == 0) + notDone = 0; + __syncthreads(); + + atomicAdd(¬Done, (int)threadNotDone); + __syncthreads(); + + } while (notDone); + + __syncthreads(); + + // Print out debugging info + // Connect remaining links + // if (threadIdx.x == 0) { + // int nnode=0; + // for (int i = 0; i < nRH; i++) { + // //printf("middle pfrh_id,parent: %d %d\n",i,pfrh_parent[i]); + // if (i==pfrh_parent[i]) nnode++; + // } + // printf("pfrh_parent multiplicity: %d\n",nnode); + // } + + // __syncthreads(); + + for (int idx = start; idx < nEdges; idx += gridStride) { + //for (int idx = 0; idx < nEdges; idx++) { + int i = pfrh_edgeId[idx]; // Get edge topo id + int j = pfrh_edgeList[idx]; // Get edge neighbor list + int parent_target = pfrh_parent[i]; + int parent_neighbor = pfrh_parent[j]; + if (parent_target!=parent_neighbor){ + notDone2 = 1; + //printf("hmm. they should have the same parent, but they don't. why... %d %d %d\n",i,j,ii); + int min_parent = (int)min(parent_target,parent_neighbor); + int max_parent = (int)max(parent_target,parent_neighbor); + int idx_max = i; + if (parent_neighbor == max_parent) idx_max = j; + pfrh_parent[idx_max] = min_parent; + } + } + + __syncthreads(); + if (notDone2==0) // if topocluster finding is converged, terminate the for-ii loop + break; + + } // for-loop ii + + __syncthreads(); + + // Follow parents of parents .... 
to contract parent structure + do { + volatile bool threadNotDone = false; + for (int i = threadIdx.x; i < nRH; i += blockDim.x) { + int parent = pfrh_parent[i]; + if (parent >= 0 && parent != pfrh_parent[parent]) { + threadNotDone = true; + pfrh_parent[i] = pfrh_parent[parent]; + } + } + if (threadIdx.x == 0) + notDone = 0; + __syncthreads(); + + atomicAdd(¬Done, (int)threadNotDone); + __syncthreads(); + + } while (notDone); + + //__syncthreads(); + + // Print out debugging info + // if (threadIdx.x == 0) { + // int nnode=0; + // for (int i = 0; i < nRH; i++) { + // //printf("middle2 pfrh_id,parent: %d %d\n",i,pfrh_parent[i]); + // if (i==pfrh_parent[i]) nnode++; + // } + // printf("pfrh_parent 2 multiplicity: %d\n",nnode); + // } + + } + __device__ __forceinline__ void sortSwap(int* toSort, int a, int b) { const int tmp = min(toSort[a], toSort[b]); toSort[b] = max(toSort[a], toSort[b]); @@ -4475,23 +4665,23 @@ namespace PFClusterCudaHCAL { cudaEventRecord(start, cudaStream); #endif - // prepareTopoInputsSerial<<<1, 1, 4 * (8+4) * sizeof(int), cudaStream>>>( - // nRH, - // outputGPU.nEdges.get(), - // outputGPU.pfrh_passTopoThresh.get(), - // inputPFRecHits.pfrh_neighbours.get(), - // scratchGPU.pfrh_edgeId.get(), - // scratchGPU.pfrh_edgeList.get()); + prepareTopoInputsSerial<<<1, 1, 4 * (8+4) * sizeof(int), cudaStream>>>( + nRH, + outputGPU.nEdges.get(), + outputGPU.pfrh_passTopoThresh.get(), + inputPFRecHits.pfrh_neighbours.get(), + scratchGPU.pfrh_edgeId.get(), + scratchGPU.pfrh_edgeList.get()); // Topo clustering // Fill edgeId, edgeList arrays with rechit neighbors // Has a bug when using more than 128 threads.. - prepareTopoInputs<<<1, 128, 128 * (8 + 4) * sizeof(int), cudaStream>>>(nRH, - outputGPU.nEdges.get(), - outputGPU.pfrh_passTopoThresh.get(), - inputPFRecHits.pfrh_neighbours.get(), - scratchGPU.pfrh_edgeId.get(), - scratchGPU.pfrh_edgeList.get()); + // prepareTopoInputs<<<1, 128, 128 * (8 + 4) * sizeof(int), cudaStream>>>(nRH, + // outputGPU.nEdges.get(), + // outputGPU.pfrh_passTopoThresh.get(), + // inputPFRecHits.pfrh_neighbours.get(), + // scratchGPU.pfrh_edgeId.get(), + // scratchGPU.pfrh_edgeList.get()); cudaCheck(cudaStreamSynchronize(cudaStream)); // prepareTopoInputs<<<1, 256, 256 * (8+4) * sizeof(int), cudaStream>>>( @@ -4522,20 +4712,22 @@ namespace PFClusterCudaHCAL { #endif // Topo clustering - topoClusterLinking<<<1, 512, 0, cudaStream>>>(nRH, - outputGPU.nEdges.get(), - outputGPU.pfrh_topoId.get(), - scratchGPU.pfrh_edgeId.get(), - scratchGPU.pfrh_edgeList.get(), - scratchGPU.pfrh_edgeMask.get(), - //inputGPU.pfrh_edgeMask.get(), - outputGPU.pfrh_passTopoThresh.get(), - outputGPU.topoIter.get()); + topoClusterLinkingKH<<<1, 512, 0, cudaStream>>>(nRH, + outputGPU.nEdges.get(), + //inputPFRecHits.pfrh_energy.get(), // temporary entry for debugging + outputGPU.pfrh_topoId.get(), + scratchGPU.pfrh_edgeId.get(), + scratchGPU.pfrh_edgeList.get(), + scratchGPU.pfrh_edgeMask.get(), + //inputGPU.pfrh_edgeMask.get(), + outputGPU.pfrh_passTopoThresh.get(), + outputGPU.topoIter.get()); cudaCheck(cudaStreamSynchronize(cudaStream)); topoClusterContraction<<<1, 512, 0, cudaStream>>>(nRH, outputGPU.pfrh_topoId.get(), outputGPU.pfrh_isSeed.get(), + //inputPFRecHits.pfrh_neighbours.get(), // temporary entry for debugging scratchGPU.rhcount.get(), outputGPU.topoSeedCount.get(), outputGPU.topoRHCount.get(), From 80e88a586216033865212ff1ed4842755e1a909e Mon Sep 17 00:00:00 2001 From: Kenichi Hatakeyama Date: Tue, 4 Oct 2022 11:24:51 -0500 Subject: [PATCH 05/11] Add explicit 
initialization of pfrh_parent. --- .../plugins/PFClusterCudaHCAL.cu | 100 ++++++++++-------- 1 file changed, 53 insertions(+), 47 deletions(-) diff --git a/RecoParticleFlow/PFClusterProducer/plugins/PFClusterCudaHCAL.cu b/RecoParticleFlow/PFClusterProducer/plugins/PFClusterCudaHCAL.cu index 16a1605e4247f..99413662041ea 100644 --- a/RecoParticleFlow/PFClusterProducer/plugins/PFClusterCudaHCAL.cu +++ b/RecoParticleFlow/PFClusterProducer/plugins/PFClusterCudaHCAL.cu @@ -3851,7 +3851,7 @@ namespace PFClusterCudaHCAL { __syncthreads();//!! if (threadIdx.x == 0) { - notDone = 0; // KenH is this necessary? + notDone = 0; } __syncthreads(); @@ -3949,7 +3949,12 @@ namespace PFClusterCudaHCAL { // // } // } - // __syncthreads(); + // Explicitly initialize pfrh_parent + for (int i = start; i < nRH; i += gridStride) { + pfrh_parent[i] = i; + } + + __syncthreads(); // for notDone if (threadIdx.x == 0) { @@ -3976,25 +3981,25 @@ namespace PFClusterCudaHCAL { // for notDone if (threadIdx.x == 0) { - notDone2 = 0; + notDone2 = 0; } // Follow parents of parents .... to contract parent structure do { - volatile bool threadNotDone = false; - for (int i = threadIdx.x; i < nRH; i += blockDim.x) { - int parent = pfrh_parent[i]; - if (parent >= 0 && parent != pfrh_parent[parent]) { - threadNotDone = true; - pfrh_parent[i] = pfrh_parent[parent]; - } - } - if (threadIdx.x == 0) - notDone = 0; - __syncthreads(); - - atomicAdd(¬Done, (int)threadNotDone); - __syncthreads(); + volatile bool threadNotDone = false; + for (int i = threadIdx.x; i < nRH; i += blockDim.x) { + int parent = pfrh_parent[i]; + if (parent >= 0 && parent != pfrh_parent[parent]) { + threadNotDone = true; + pfrh_parent[i] = pfrh_parent[parent]; + } + } + if (threadIdx.x == 0) + notDone = 0; + __syncthreads(); + + atomicAdd(¬Done, (int)threadNotDone); + __syncthreads(); } while (notDone); @@ -4014,25 +4019,25 @@ namespace PFClusterCudaHCAL { // __syncthreads(); for (int idx = start; idx < nEdges; idx += gridStride) { - //for (int idx = 0; idx < nEdges; idx++) { - int i = pfrh_edgeId[idx]; // Get edge topo id - int j = pfrh_edgeList[idx]; // Get edge neighbor list - int parent_target = pfrh_parent[i]; - int parent_neighbor = pfrh_parent[j]; - if (parent_target!=parent_neighbor){ - notDone2 = 1; - //printf("hmm. they should have the same parent, but they don't. why... %d %d %d\n",i,j,ii); - int min_parent = (int)min(parent_target,parent_neighbor); - int max_parent = (int)max(parent_target,parent_neighbor); - int idx_max = i; - if (parent_neighbor == max_parent) idx_max = j; - pfrh_parent[idx_max] = min_parent; - } + //for (int idx = 0; idx < nEdges; idx++) { + int i = pfrh_edgeId[idx]; // Get edge topo id + int j = pfrh_edgeList[idx]; // Get edge neighbor list + int parent_target = pfrh_parent[i]; + int parent_neighbor = pfrh_parent[j]; + if (parent_target!=parent_neighbor){ + notDone2 = 1; + //printf("hmm. they should have the same parent, but they don't. why... 
%d %d %d\n",i,j,ii); + int min_parent = (int)min(parent_target,parent_neighbor); + int max_parent = (int)max(parent_target,parent_neighbor); + int idx_max = i; + if (parent_neighbor == max_parent) idx_max = j; + pfrh_parent[idx_max] = min_parent; + } } __syncthreads(); if (notDone2==0) // if topocluster finding is converged, terminate the for-ii loop - break; + break; } // for-loop ii @@ -4057,9 +4062,9 @@ namespace PFClusterCudaHCAL { } while (notDone); - //__syncthreads(); + // __syncthreads(); - // Print out debugging info + // // Print out debugging info // if (threadIdx.x == 0) { // int nnode=0; // for (int i = 0; i < nRH; i++) { @@ -4665,23 +4670,23 @@ namespace PFClusterCudaHCAL { cudaEventRecord(start, cudaStream); #endif - prepareTopoInputsSerial<<<1, 1, 4 * (8+4) * sizeof(int), cudaStream>>>( - nRH, - outputGPU.nEdges.get(), - outputGPU.pfrh_passTopoThresh.get(), - inputPFRecHits.pfrh_neighbours.get(), - scratchGPU.pfrh_edgeId.get(), - scratchGPU.pfrh_edgeList.get()); + // prepareTopoInputsSerial<<<1, 1, 4 * (8+4) * sizeof(int), cudaStream>>>( + // nRH, + // outputGPU.nEdges.get(), + // outputGPU.pfrh_passTopoThresh.get(), + // inputPFRecHits.pfrh_neighbours.get(), + // scratchGPU.pfrh_edgeId.get(), + // scratchGPU.pfrh_edgeList.get()); // Topo clustering // Fill edgeId, edgeList arrays with rechit neighbors // Has a bug when using more than 128 threads.. - // prepareTopoInputs<<<1, 128, 128 * (8 + 4) * sizeof(int), cudaStream>>>(nRH, - // outputGPU.nEdges.get(), - // outputGPU.pfrh_passTopoThresh.get(), - // inputPFRecHits.pfrh_neighbours.get(), - // scratchGPU.pfrh_edgeId.get(), - // scratchGPU.pfrh_edgeList.get()); + prepareTopoInputs<<<1, 128, 128 * (8 + 4) * sizeof(int), cudaStream>>>(nRH, + outputGPU.nEdges.get(), + outputGPU.pfrh_passTopoThresh.get(), + inputPFRecHits.pfrh_neighbours.get(), + scratchGPU.pfrh_edgeId.get(), + scratchGPU.pfrh_edgeList.get()); cudaCheck(cudaStreamSynchronize(cudaStream)); // prepareTopoInputs<<<1, 256, 256 * (8+4) * sizeof(int), cudaStream>>>( @@ -4712,6 +4717,7 @@ namespace PFClusterCudaHCAL { #endif // Topo clustering + //topoClusterLinking<<<1, 512, 0, cudaStream>>>(nRH, topoClusterLinkingKH<<<1, 512, 0, cudaStream>>>(nRH, outputGPU.nEdges.get(), //inputPFRecHits.pfrh_energy.get(), // temporary entry for debugging From a12d9442830884580bda93ba21fa8ff50689daab Mon Sep 17 00:00:00 2001 From: Marino Missiroli Date: Fri, 30 Sep 2022 00:05:55 +0200 Subject: [PATCH 06/11] more kernel improvements to avoid race conditions --- .../PFClusterProducer/plugins/PFClusterCudaHCAL.cu | 12 +++++------- 1 file changed, 5 insertions(+), 7 deletions(-) diff --git a/RecoParticleFlow/PFClusterProducer/plugins/PFClusterCudaHCAL.cu b/RecoParticleFlow/PFClusterProducer/plugins/PFClusterCudaHCAL.cu index 99413662041ea..78ccc4f2d5053 100644 --- a/RecoParticleFlow/PFClusterProducer/plugins/PFClusterCudaHCAL.cu +++ b/RecoParticleFlow/PFClusterProducer/plugins/PFClusterCudaHCAL.cu @@ -3892,13 +3892,8 @@ namespace PFClusterCudaHCAL { } while (notDone==1); - *topoIter = iter; -#ifdef DEBUG_GPU_HCAL -// if (threadIdx.x == 0) { -// printf("*** Topo clustering converged in %d iterations ***\n", iter); -// } -// __syncthreads(); -#endif + if (threadIdx.x == 0) + *topoIter = iter; } __global__ void topoClusterLinkingKH(int nRH, @@ -4133,6 +4128,7 @@ namespace PFClusterCudaHCAL { } __device__ __forceinline__ int scan1Inclusive(int idata, volatile int* s_Data, int size) { + assert(size == 32); int pos = 2 * threadIdx.x - (threadIdx.x & (size - 1)); s_Data[pos] = 0; pos += size; 
@@ -4140,7 +4136,9 @@ namespace PFClusterCudaHCAL { for (int offset = 1; offset < size; offset <<= 1) { int t = s_Data[pos] + s_Data[pos - offset]; + __syncwarp(); s_Data[pos] = t; + __syncwarp(); } return s_Data[pos]; From 171d2b6917e9ffe0e62458e71acf7185c0afb4c4 Mon Sep 17 00:00:00 2001 From: Marino Missiroli Date: Fri, 30 Sep 2022 01:01:41 +0200 Subject: [PATCH 07/11] more syncthreads calls in topoClusterLinking kernel --- .../plugins/PFClusterCudaHCAL.cu | 84 ++++++------------- 1 file changed, 25 insertions(+), 59 deletions(-) diff --git a/RecoParticleFlow/PFClusterProducer/plugins/PFClusterCudaHCAL.cu b/RecoParticleFlow/PFClusterProducer/plugins/PFClusterCudaHCAL.cu index 78ccc4f2d5053..bc27bff0a0420 100644 --- a/RecoParticleFlow/PFClusterProducer/plugins/PFClusterCudaHCAL.cu +++ b/RecoParticleFlow/PFClusterProducer/plugins/PFClusterCudaHCAL.cu @@ -3635,6 +3635,7 @@ namespace PFClusterCudaHCAL { } } __syncthreads(); + if (threadIdx.x == 0) { *pcrhFracSize = totalSeedFracOffset; if (*pcrhFracSize>200000) // DeclsForKernels.h maxPFCFracs @@ -3807,8 +3808,11 @@ namespace PFClusterCudaHCAL { else pfrh_edgeMask[idx] = 0; } + __syncthreads();//!! do { + __syncthreads();//!! + if (threadIdx.x == 0) { notDone = 0; } @@ -3840,6 +3844,9 @@ namespace PFClusterCudaHCAL { } } } + + __syncthreads();//!! + if (threadIdx.x == 0) iter++; @@ -3853,7 +3860,6 @@ namespace PFClusterCudaHCAL { if (threadIdx.x == 0) { notDone = 0; } - __syncthreads(); // Even linking @@ -3864,7 +3870,6 @@ namespace PFClusterCudaHCAL { pfrh_parent[i] = (int)max(i, pfrh_edgeList[idx]); } } - __syncthreads(); // edgeParent @@ -3885,6 +3890,8 @@ namespace PFClusterCudaHCAL { } } + __syncthreads();//!! + if (threadIdx.x == 0) iter++; @@ -3892,6 +3899,8 @@ namespace PFClusterCudaHCAL { } while (notDone==1); + __syncthreads();//!! + if (threadIdx.x == 0) *topoIter = iter; } @@ -4628,18 +4637,9 @@ namespace PFClusterCudaHCAL { ::PFClustering::HCAL::ScratchDataGPU& scratchGPU, float (&timer)[8]) { -#ifdef DEBUG_GPU_HCAL - cudaProfilerStart(); - cudaEvent_t start, stop; - cudaEventCreate(&start); - cudaEventCreate(&stop); - cudaEventRecord(start, cudaStream); -#endif - int nRH = inputPFRecHits.size; // Combined seeding & topo clustering thresholds, array initialization - seedingTopoThreshKernel_HCAL<<<(nRH + 31) / 32, 64, 0, cudaStream>>>(nRH, inputPFRecHits.pfrh_energy.get(), inputPFRecHits.pfrh_x.get(), @@ -4659,6 +4659,7 @@ namespace PFClusterCudaHCAL { outputGPU.topoSeedList.get(), outputGPU.pfc_iter.get()); +<<<<<<< HEAD cudaCheck(cudaStreamSynchronize(cudaStream)); #ifdef DEBUG_GPU_HCAL @@ -4676,6 +4677,8 @@ namespace PFClusterCudaHCAL { // scratchGPU.pfrh_edgeId.get(), // scratchGPU.pfrh_edgeList.get()); +======= +>>>>>>> 8189a9f0f08 (more syncthreads calls in topoClusterLinking kernel) // Topo clustering // Fill edgeId, edgeList arrays with rechit neighbors // Has a bug when using more than 128 threads.. 
@@ -4685,36 +4688,9 @@ namespace PFClusterCudaHCAL { inputPFRecHits.pfrh_neighbours.get(), scratchGPU.pfrh_edgeId.get(), scratchGPU.pfrh_edgeList.get()); - cudaCheck(cudaStreamSynchronize(cudaStream)); - - // prepareTopoInputs<<<1, 256, 256 * (8+4) * sizeof(int), cudaStream>>>( - // nRH, - // outputGPU.nEdges.get(), - // outputGPU.pfrh_passTopoThresh.get(), - // inputPFRecHits.pfrh_neighbours.get(), - // scratchGPU.pfrh_edgeId.get(), - // scratchGPU.pfrh_edgeList.get()); - -#ifdef DEBUG_GPU_HCAL - cudaEventRecord(stop, cudaStream); - cudaEventSynchronize(stop); - cudaEventElapsedTime(&timer[4], start, stop); - //printf("\nprepareTopoInputs took %f ms\n", timer[4]); - - compareEdgeArrays<<<1, 1, 0, cudaStream>>>(outputGPU.nEdges.get(), - scratchGPU.pfrh_edgeId.get(), - scratchGPU.pfrh_edgeList.get(), - nEdges, - inputGPU.pfrh_edgeId.get(), - inputGPU.pfrh_edgeList.get(), - nRH, - inputGPU.pfNeighFourInd.get(), - inputPFRecHits.pfrh_neighbours.get()); - - cudaEventRecord(start, cudaStream); -#endif // Topo clustering +<<<<<<< HEAD //topoClusterLinking<<<1, 512, 0, cudaStream>>>(nRH, topoClusterLinkingKH<<<1, 512, 0, cudaStream>>>(nRH, outputGPU.nEdges.get(), @@ -4727,6 +4703,16 @@ namespace PFClusterCudaHCAL { outputGPU.pfrh_passTopoThresh.get(), outputGPU.topoIter.get()); cudaCheck(cudaStreamSynchronize(cudaStream)); +======= + topoClusterLinking<<<1, 512, 0, cudaStream>>>(nRH, + outputGPU.nEdges.get(), + outputGPU.pfrh_topoId.get(), + scratchGPU.pfrh_edgeId.get(), + scratchGPU.pfrh_edgeList.get(), + scratchGPU.pfrh_edgeMask.get(), + outputGPU.pfrh_passTopoThresh.get(), + outputGPU.topoIter.get()); +>>>>>>> 8189a9f0f08 (more syncthreads calls in topoClusterLinking kernel) topoClusterContraction<<<1, 512, 0, cudaStream>>>(nRH, outputGPU.pfrh_topoId.get(), @@ -4742,13 +4728,6 @@ namespace PFClusterCudaHCAL { outputGPU.pcrh_frac.get(), outputGPU.pcrhFracSize.get()); -#ifdef DEBUG_GPU_HCAL - cudaEventRecord(stop, cudaStream); - cudaEventSynchronize(stop); - cudaEventElapsedTime(&timer[1], start, stop); - cudaEventRecord(start, cudaStream); -#endif - dim3 grid((nRH + 31) / 32, (nRH + 31) / 32); dim3 block(32, 32); @@ -4761,13 +4740,6 @@ namespace PFClusterCudaHCAL { scratchGPU.rhcount.get(), outputGPU.pcrh_fracInd.get()); -#ifdef DEBUG_GPU_HCAL - cudaEventRecord(stop, cudaStream); - cudaEventSynchronize(stop); - cudaEventElapsedTime(&timer[2], start, stop); - cudaEventRecord(start, cudaStream); -#endif - hcalFastCluster_selection<<>>(nRH, inputPFRecHits.pfrh_x.get(), inputPFRecHits.pfrh_y.get(), @@ -4791,11 +4763,5 @@ namespace PFClusterCudaHCAL { inputGPU.pfc_prevPos4.get(), inputGPU.pfc_energy.get(), outputGPU.pfc_iter.get()); -#ifdef DEBUG_GPU_HCAL - cudaEventRecord(stop, cudaStream); - cudaEventSynchronize(stop); - cudaEventElapsedTime(&timer[3], start, stop); - cudaProfilerStop(); -#endif } } // namespace PFClusterCudaHCAL From 6ca17e5c05665d50f4cd0daef8559b11179102d5 Mon Sep 17 00:00:00 2001 From: Kenichi Hatakeyama Date: Tue, 4 Oct 2022 12:41:48 -0500 Subject: [PATCH 08/11] resolve conflict --- .../plugins/PFClusterCudaHCAL.cu | 100 +----------------- 1 file changed, 2 insertions(+), 98 deletions(-) diff --git a/RecoParticleFlow/PFClusterProducer/plugins/PFClusterCudaHCAL.cu b/RecoParticleFlow/PFClusterProducer/plugins/PFClusterCudaHCAL.cu index bc27bff0a0420..140ed6cca5bb2 100644 --- a/RecoParticleFlow/PFClusterProducer/plugins/PFClusterCudaHCAL.cu +++ b/RecoParticleFlow/PFClusterProducer/plugins/PFClusterCudaHCAL.cu @@ -3524,7 +3524,6 @@ namespace PFClusterCudaHCAL { __global__ void 
topoClusterContraction(size_t size, int* pfrh_parent, int* pfrh_isSeed, - //const int* pfrh_neighbours, // temporary inputs for debuggi int* rhCount, int* topoSeedCount, int* topoRHCount, @@ -3561,30 +3560,6 @@ namespace PFClusterCudaHCAL { } while (notDone); - // // debugging printing block - // __syncthreads(); - // if (threadIdx.x == 0) { - // int nnode=0; - // for (int i = 0; i < size; i++) { - // //printf("final pfrh_id,parent: %d %d\n",i,pfrh_parent[i]); - // if (i==pfrh_parent[i]) nnode++; - // } - // printf("pfrh_parent 3 multiplicity: %d\n",nnode); - // for (int pos = 0; pos < size; pos++) { - // int parent_target = pfrh_parent[pos]; - // for (int i = 0; i < 8; i++) { - // int neighbor_id = pfrh_neighbours[pos * 8 + i]; - // if (neighbor_id>-1){ // valid neighbors - // int parent_neighbor = pfrh_parent[neighbor_id]; - // if (parent_target!=parent_neighbor){ - // printf("hmm. they should have the same parent, but they don't. why... %d %d\n",pos,neighbor_id); - // } - // } - // } - // } - // } - // __syncthreads(); - // Now determine the number of seeds and rechits in each topo cluster for (int rhIdx = threadIdx.x; rhIdx < size; rhIdx += blockDim.x) { int topoId = pfrh_parent[rhIdx]; @@ -3907,7 +3882,6 @@ namespace PFClusterCudaHCAL { __global__ void topoClusterLinkingKH(int nRH, int* nEdgesIn, - //float* pfrh_energy, // Temporary entry for debugging int* pfrh_parent, int* pfrh_edgeId, int* pfrh_edgeList, @@ -3941,18 +3915,6 @@ namespace PFClusterCudaHCAL { // __syncthreads(); - // // Print out debugging info - // if (threadIdx.x == 0) { - // // for (int idx = 0; idx < nEdges; idx++) { - // // printf("initial edge id, list, mask: %d %d %d\n",pfrh_edgeId[idx],pfrh_edgeList[idx],pfrh_edgeMask[idx]); - // // //printf("initial edge id, list, mask: %d %d\n",pfrh_edgeId[idx],pfrh_edgeList[idx]); - // // } - // printf("number of eges %d\n",nEdges); - // // for (int i = 0; i < nRH; i++) { - // // printf("initial pfrh_id,parent,energy: %d %d %8.3f\n",i,pfrh_parent[i],pfrh_energy[i]); - // // } - // } - // Explicitly initialize pfrh_parent for (int i = start; i < nRH; i += gridStride) { pfrh_parent[i] = i; @@ -4009,19 +3971,7 @@ namespace PFClusterCudaHCAL { __syncthreads(); - // Print out debugging info - // Connect remaining links - // if (threadIdx.x == 0) { - // int nnode=0; - // for (int i = 0; i < nRH; i++) { - // //printf("middle pfrh_id,parent: %d %d\n",i,pfrh_parent[i]); - // if (i==pfrh_parent[i]) nnode++; - // } - // printf("pfrh_parent multiplicity: %d\n",nnode); - // } - - // __syncthreads(); - + // All rechit pairs in edge id-list have the same topo cluster label? 
for (int idx = start; idx < nEdges; idx += gridStride) { //for (int idx = 0; idx < nEdges; idx++) { int i = pfrh_edgeId[idx]; // Get edge topo id @@ -4066,18 +4016,6 @@ namespace PFClusterCudaHCAL { } while (notDone); - // __syncthreads(); - - // // Print out debugging info - // if (threadIdx.x == 0) { - // int nnode=0; - // for (int i = 0; i < nRH; i++) { - // //printf("middle2 pfrh_id,parent: %d %d\n",i,pfrh_parent[i]); - // if (i==pfrh_parent[i]) nnode++; - // } - // printf("pfrh_parent 2 multiplicity: %d\n",nnode); - // } - } __device__ __forceinline__ void sortSwap(int* toSort, int a, int b) { @@ -4659,29 +4597,10 @@ namespace PFClusterCudaHCAL { outputGPU.topoSeedList.get(), outputGPU.pfc_iter.get()); -<<<<<<< HEAD - cudaCheck(cudaStreamSynchronize(cudaStream)); - -#ifdef DEBUG_GPU_HCAL - cudaEventRecord(stop, cudaStream); - cudaEventSynchronize(stop); - cudaEventElapsedTime(&timer[0], start, stop); - cudaEventRecord(start, cudaStream); -#endif - - // prepareTopoInputsSerial<<<1, 1, 4 * (8+4) * sizeof(int), cudaStream>>>( - // nRH, - // outputGPU.nEdges.get(), - // outputGPU.pfrh_passTopoThresh.get(), - // inputPFRecHits.pfrh_neighbours.get(), - // scratchGPU.pfrh_edgeId.get(), - // scratchGPU.pfrh_edgeList.get()); - -======= ->>>>>>> 8189a9f0f08 (more syncthreads calls in topoClusterLinking kernel) // Topo clustering // Fill edgeId, edgeList arrays with rechit neighbors // Has a bug when using more than 128 threads.. + // prepareTopoInputsSerial<<<1, 1, 4 * (8+4) * sizeof(int), cudaStream>>>( prepareTopoInputs<<<1, 128, 128 * (8 + 4) * sizeof(int), cudaStream>>>(nRH, outputGPU.nEdges.get(), outputGPU.pfrh_passTopoThresh.get(), @@ -4690,34 +4609,19 @@ namespace PFClusterCudaHCAL { scratchGPU.pfrh_edgeList.get()); // Topo clustering -<<<<<<< HEAD //topoClusterLinking<<<1, 512, 0, cudaStream>>>(nRH, topoClusterLinkingKH<<<1, 512, 0, cudaStream>>>(nRH, outputGPU.nEdges.get(), - //inputPFRecHits.pfrh_energy.get(), // temporary entry for debugging outputGPU.pfrh_topoId.get(), scratchGPU.pfrh_edgeId.get(), scratchGPU.pfrh_edgeList.get(), scratchGPU.pfrh_edgeMask.get(), - //inputGPU.pfrh_edgeMask.get(), outputGPU.pfrh_passTopoThresh.get(), outputGPU.topoIter.get()); - cudaCheck(cudaStreamSynchronize(cudaStream)); -======= - topoClusterLinking<<<1, 512, 0, cudaStream>>>(nRH, - outputGPU.nEdges.get(), - outputGPU.pfrh_topoId.get(), - scratchGPU.pfrh_edgeId.get(), - scratchGPU.pfrh_edgeList.get(), - scratchGPU.pfrh_edgeMask.get(), - outputGPU.pfrh_passTopoThresh.get(), - outputGPU.topoIter.get()); ->>>>>>> 8189a9f0f08 (more syncthreads calls in topoClusterLinking kernel) topoClusterContraction<<<1, 512, 0, cudaStream>>>(nRH, outputGPU.pfrh_topoId.get(), outputGPU.pfrh_isSeed.get(), - //inputPFRecHits.pfrh_neighbours.get(), // temporary entry for debugging scratchGPU.rhcount.get(), outputGPU.topoSeedCount.get(), outputGPU.topoRHCount.get(), From dd6b122d0a4d3a0f7c58bee77ba21da72d307d9e Mon Sep 17 00:00:00 2001 From: Kenichi Hatakeyama Date: Tue, 4 Oct 2022 21:11:23 -0500 Subject: [PATCH 09/11] Add machinery to toggle produceLegacy. 
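Besides the produceLegacy/produceSoA switches added to the python configuration, the kernel-side change folds the initialization of the shared notDone/notDone2 flags into the single thread-0 setup block of topoClusterLinkingKH. For reference, the block-wide convergence loop those flags drive follows roughly the pattern below; this is a simplified sketch of the parent-contraction (pointer-jumping) pass, not the production kernel:

    // One thread resets the shared flag, every thread does one pass of
    // pointer jumping (parent[i] -> grandparent), and the block repeats
    // until no thread changed anything. The barrier at the top of the loop
    // keeps thread 0 from resetting the flag while others still read it.
    __global__ void contractParentsSketch(int n, int* parent) {
      __shared__ int notDone;
      do {
        __syncthreads();  // everyone has consumed the previous flag value
        if (threadIdx.x == 0)
          notDone = 0;
        __syncthreads();  // flag reset visible before the next pass

        bool changed = false;
        for (int i = threadIdx.x; i < n; i += blockDim.x) {
          int p = parent[i];
          if (p >= 0 && p != parent[p]) {
            parent[i] = parent[p];  // jump to grandparent
            changed = true;
          }
        }
        __syncthreads();  // all updates done before counting

        if (changed)
          atomicAdd(&notDone, 1);
        __syncthreads();  // flag final before everyone tests it
      } while (notDone);
    }

The barriers on both sides of the flag reset are the same protection that the extra __syncthreads() calls of patch 07 retrofit into the older topoClusterLinking kernel.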
--- .../python/customizeHcalOnlyForProfiling.py | 2 ++ .../plugins/PFClusterCudaHCAL.cu | 18 ++++++------------ .../python/particleFlowClusterHBHE_cfi.py | 3 +++ .../python/particleFlowRecHitHBHE_cfi.py | 4 ++++ 4 files changed, 15 insertions(+), 12 deletions(-) diff --git a/RecoLocalCalo/Configuration/python/customizeHcalOnlyForProfiling.py b/RecoLocalCalo/Configuration/python/customizeHcalOnlyForProfiling.py index 93da82c580d99..052dd7f3d59c8 100644 --- a/RecoLocalCalo/Configuration/python/customizeHcalOnlyForProfiling.py +++ b/RecoLocalCalo/Configuration/python/customizeHcalOnlyForProfiling.py @@ -33,6 +33,8 @@ def customizeHcalPFOnlyForProfilingGPUOnly(process): process.consume_step = cms.EndPath(process.consumer) process.schedule = cms.Schedule(process.raw2digi_step, process.reconstruction_step, process.consume_step) + #process.particleFlowClusterHBHEOnly.cuda.produceLegacy = cms.bool(False) + #process.particleFlowRecHitHBHEOnly.cuda.produceLegacy = cms.bool(False) return process diff --git a/RecoParticleFlow/PFClusterProducer/plugins/PFClusterCudaHCAL.cu b/RecoParticleFlow/PFClusterProducer/plugins/PFClusterCudaHCAL.cu index 140ed6cca5bb2..42304ec139c2a 100644 --- a/RecoParticleFlow/PFClusterProducer/plugins/PFClusterCudaHCAL.cu +++ b/RecoParticleFlow/PFClusterProducer/plugins/PFClusterCudaHCAL.cu @@ -75,7 +75,7 @@ namespace PFClusterCudaHCAL { // PFRechitToPFCluster_HCAL_entryPoint // seedingTopoThreshKernel_HCAL: apply seeding/topo-clustering threshold to RecHits, also ensure a peak (outputs: pfrh_isSeed, pfrh_passTopoThresh) [OutputDataGPU] // prepareTopoInputs: prepare "edge" data (outputs: nEdges, pfrh_edgeId, pfrh_edgeList [nEdges dimension]) - // topoClusterLinking: run topo clustering (output: pfrh_topoId) + // topoClusterLinking(KH): run topo clustering (output: pfrh_topoId) // topoClusterContraction: find parent of parent (or parent (of parent ...)) (outputs: pfrh_parent, topoSeedCount, topoSeedOffsets, topoSeedList, seedFracOffsets, pcrhfracind, pcrhfrac) // fillRhfIndex: fill rhfracind (PFCluster RecHitFraction constituent PFRecHit indices) // hcalFastCluster_selection @@ -3899,6 +3899,8 @@ namespace PFClusterCudaHCAL { *topoIter = 0; nEdges = *nEdgesIn; gridStride = blockDim.x * gridDim.x; // For single block kernel this is the number of threads + notDone = 0; + notDone2 = 0; } __syncthreads(); @@ -3922,15 +3924,6 @@ namespace PFClusterCudaHCAL { __syncthreads(); - // for notDone - if (threadIdx.x == 0) { - notDone = 0; - notDone2 = 0; - //printf("gridStride, blockDim.x %d %d\n",gridStride,blockDim.x); - } - - __syncthreads(); - // (1) First attempt // First edge [set parents to those smaller numbers] for (int idx = start; idx < nEdges; idx += gridStride) { @@ -3942,8 +3935,9 @@ namespace PFClusterCudaHCAL { __syncthreads(); - // KenH - for (int ii=0; ii<100; ii++) { // loop until topo clustering iteration converges + // + // loop until topo clustering iteration converges + for (int ii=0; ii<100; ii++) { // for notDone if (threadIdx.x == 0) { diff --git a/RecoParticleFlow/PFClusterProducer/python/particleFlowClusterHBHE_cfi.py b/RecoParticleFlow/PFClusterProducer/python/particleFlowClusterHBHE_cfi.py index 7dea30a6d7dc7..ab713030f821b 100644 --- a/RecoParticleFlow/PFClusterProducer/python/particleFlowClusterHBHE_cfi.py +++ b/RecoParticleFlow/PFClusterProducer/python/particleFlowClusterHBHE_cfi.py @@ -111,11 +111,14 @@ positionReCalc = cms.PSet(), energyCorrector = cms.PSet() ) +_module_config_cuda = _module_config.clone() #### PF CLUSTER HCAL #### 
_particleFlowClusterHBHE_cpu = cms.EDProducer("PFClusterProducer", _module_config.clone()) _particleFlowClusterHBHE_cuda = cms.EDProducer("PFClusterProducerCudaHCAL", _module_config.clone()) _particleFlowClusterHBHE_cuda.PFRecHitsLabelIn = cms.InputTag("particleFlowRecHitHBHE","") +_particleFlowClusterHBHE_cuda.produceLegacy = cms.bool(True) +_particleFlowClusterHBHE_cuda.produceSoA = cms.bool(True) ##### diff --git a/RecoParticleFlow/PFClusterProducer/python/particleFlowRecHitHBHE_cfi.py b/RecoParticleFlow/PFClusterProducer/python/particleFlowRecHitHBHE_cfi.py index 33b77bccff92a..0dcec57965c03 100644 --- a/RecoParticleFlow/PFClusterProducer/python/particleFlowRecHitHBHE_cfi.py +++ b/RecoParticleFlow/PFClusterProducer/python/particleFlowRecHitHBHE_cfi.py @@ -44,6 +44,10 @@ _module_pset_cuda = _module_pset.clone() _module_pset_cuda.producers[0].src= "hbheRecHitProducerGPU" # use GPU version as input instead of legacy version +_module_pset_cuda.produceSoA = cms.bool(True) +_module_pset_cuda.produceLegacy = cms.bool(True) +_module_pset_cuda.produceCleanedLegacy = cms.bool(True) + for idx, x in enumerate(_module_pset_cuda.producers): for idy, y in enumerate(x.qualityTests): if y.name._value == "PFRecHitQTestHCALThresholdVsDepth": # when applying phase1 depth-dependent HCAL thresholds From 18f7e39c99aedad4c5b800fa2136a50299a75e65 Mon Sep 17 00:00:00 2001 From: Kenichi Hatakeyama Date: Wed, 5 Oct 2022 11:48:14 -0500 Subject: [PATCH 10/11] Optimising kernels. --- .../plugins/PFClusterCudaHCAL.cu | 32 ++----------------- 1 file changed, 3 insertions(+), 29 deletions(-) diff --git a/RecoParticleFlow/PFClusterProducer/plugins/PFClusterCudaHCAL.cu b/RecoParticleFlow/PFClusterProducer/plugins/PFClusterCudaHCAL.cu index 42304ec139c2a..9734f46f4e7eb 100644 --- a/RecoParticleFlow/PFClusterProducer/plugins/PFClusterCudaHCAL.cu +++ b/RecoParticleFlow/PFClusterProducer/plugins/PFClusterCudaHCAL.cu @@ -3917,19 +3917,14 @@ namespace PFClusterCudaHCAL { // __syncthreads(); - // Explicitly initialize pfrh_parent - for (int i = start; i < nRH; i += gridStride) { - pfrh_parent[i] = i; - } - - __syncthreads(); - - // (1) First attempt + // First attempt of topo clustering // First edge [set parents to those smaller numbers] for (int idx = start; idx < nEdges; idx += gridStride) { int i = pfrh_edgeId[idx]; // Get edge topo id if (pfrh_edgeMask[idx] > 0 && isLeftEdgeKH(idx, nEdges, pfrh_edgeId, pfrh_edgeMask)) { // isLeftEdgeKH pfrh_parent[i] = (int)min(i, pfrh_edgeList[idx]); + } else { + pfrh_parent[i] = i; } } @@ -3989,27 +3984,6 @@ namespace PFClusterCudaHCAL { } // for-loop ii - __syncthreads(); - - // Follow parents of parents .... to contract parent structure - do { - volatile bool threadNotDone = false; - for (int i = threadIdx.x; i < nRH; i += blockDim.x) { - int parent = pfrh_parent[i]; - if (parent >= 0 && parent != pfrh_parent[parent]) { - threadNotDone = true; - pfrh_parent[i] = pfrh_parent[parent]; - } - } - if (threadIdx.x == 0) - notDone = 0; - __syncthreads(); - - atomicAdd(¬Done, (int)threadNotDone); - __syncthreads(); - - } while (notDone); - } __device__ __forceinline__ void sortSwap(int* toSort, int a, int b) { From eb14f9abe135f2bdd6f64d7c6abd78e7ebfe3153 Mon Sep 17 00:00:00 2001 From: Kenichi Hatakeyama Date: Wed, 5 Oct 2022 12:34:58 -0500 Subject: [PATCH 11/11] turn off legacy data storing when running profiling customisation. 
--- .../Configuration/python/customizeHcalOnlyForProfiling.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/RecoLocalCalo/Configuration/python/customizeHcalOnlyForProfiling.py b/RecoLocalCalo/Configuration/python/customizeHcalOnlyForProfiling.py index 052dd7f3d59c8..1ba182960b82c 100644 --- a/RecoLocalCalo/Configuration/python/customizeHcalOnlyForProfiling.py +++ b/RecoLocalCalo/Configuration/python/customizeHcalOnlyForProfiling.py @@ -23,7 +23,7 @@ def customizeHcalOnlyForProfilingGPUOnly(process): # Currently, this means: # - running the unpacker on CPU, converting the digis into SoA format and copying them to GPU # - running the HBHE local reconstruction, including MAHI, on GPU. -# - running the HBHE PFRecHit and PFCluster producers on GPU [including copy of them to host, as we cannot untangle the copy back at a moment] +# - running the HBHE PFRecHit and PFCluster producers on GPU [without copy of them to host] def customizeHcalPFOnlyForProfilingGPUOnly(process): process.consumer = cms.EDAnalyzer("GenericConsumer", @@ -33,8 +33,8 @@ def customizeHcalPFOnlyForProfilingGPUOnly(process): process.consume_step = cms.EndPath(process.consumer) process.schedule = cms.Schedule(process.raw2digi_step, process.reconstruction_step, process.consume_step) - #process.particleFlowClusterHBHEOnly.cuda.produceLegacy = cms.bool(False) - #process.particleFlowRecHitHBHEOnly.cuda.produceLegacy = cms.bool(False) + process.particleFlowClusterHBHEOnly.cuda.produceLegacy = cms.bool(False) + process.particleFlowRecHitHBHEOnly.cuda.produceLegacy = cms.bool(False) return process
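
A closing note on the algorithm these patches converge on: topoClusterLinkingKH plus topoClusterContraction compute connected components over the rechit neighbour graph, with edges given as (edgeId, edgeList) pairs and every rechit ending up labelled by the smallest rechit index in its component (the min() convention used in the kernel). A serial host-side reference for the same computation, useful for validating the GPU output, could look like the sketch below; the helper name is hypothetical and not part of the patches above:

    #include <algorithm>
    #include <cstddef>
    #include <vector>

    // Serial reference for what topoClusterLinkingKH + topoClusterContraction
    // compute on the GPU: connected components over the (edgeId, edgeList)
    // pairs, each rechit labelled by the smallest index in its component.
    std::vector<int> topoClusterReference(int nRH,
                                          const std::vector<int>& edgeId,
                                          const std::vector<int>& edgeList) {
      std::vector<int> parent(nRH);
      for (int i = 0; i < nRH; ++i)
        parent[i] = i;  // every rechit starts as its own topo cluster

      // find with path halving
      auto find = [&](int x) {
        while (parent[x] != x) {
          parent[x] = parent[parent[x]];
          x = parent[x];
        }
        return x;
      };

      // union by smaller index, matching the kernel's
      // min(parent_target, parent_neighbor) convention
      for (std::size_t e = 0; e < edgeId.size(); ++e) {
        int a = find(edgeId[e]);
        int b = find(edgeList[e]);
        if (a != b)
          parent[std::max(a, b)] = std::min(a, b);
      }

      // final contraction: point every rechit directly at its root
      for (int i = 0; i < nRH; ++i)
        parent[i] = find(i);
      return parent;
    }

Comparing this helper's output against pfrh_topoId after the contraction kernel is a cheap cross-check that the iterative linking loop really converged.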