diff --git a/RecoParticleFlow/PFClusterProducer/plugins/PFClusterCudaHCAL.cu b/RecoParticleFlow/PFClusterProducer/plugins/PFClusterCudaHCAL.cu index ded8d9adbd1e8..ccd9c5722ea2a 100644 --- a/RecoParticleFlow/PFClusterProducer/plugins/PFClusterCudaHCAL.cu +++ b/RecoParticleFlow/PFClusterProducer/plugins/PFClusterCudaHCAL.cu @@ -487,6 +487,50 @@ namespace PFClusterCudaHCAL { } } + __device__ auto dev_getSeedRhIdx(int* seeds, int seedNum) { return seeds[seedNum]; } + + __device__ auto dev_getRhFracIdx(int* rechits, int rhNum) { + if (rhNum <= 0) { + printf("Invalid rhNum (%d) for get RhFracIdx!\n", rhNum); + } + return rechits[rhNum - 1]; + } + + __device__ auto dev_getRhFrac( + int* topoSeedList, int topoSeedBegin, float* pcrhfrac, int* seedFracOffsets, int seedNum, int rhNum) { + int seedIdx = topoSeedList[topoSeedBegin + seedNum]; + return pcrhfrac[seedFracOffsets[seedIdx] + rhNum]; + } + + __device__ auto dev_computeClusterPos(PFClusteringParamsGPU::DeviceProduct::ConstView pfClusParams, + float4& pos4, + float frac, + int rhInd, + bool isDebug, + const float* __restrict__ pfrh_x, + const float* __restrict__ pfrh_y, + const float* __restrict__ pfrh_z, + const float* __restrict__ pfrh_energy, + float rhENormInv) { + float4 rechitPos = make_float4(pfrh_x[rhInd], pfrh_y[rhInd], pfrh_z[rhInd], 1.0); + const auto rh_energy = pfrh_energy[rhInd] * frac; + const auto norm = (frac < pfClusParams.minFracInCalc() ? 0.0f : max(0.0f, logf(rh_energy * rhENormInv))); + if (isDebug) + printf("\t\t\trechit %d: norm = %f\tfrac = %f\trh_energy = %f\tpos = (%f, %f, %f)\n", + rhInd, + norm, + frac, + rh_energy, + rechitPos.x, + rechitPos.y, + rechitPos.z); + + pos4.x += rechitPos.x * norm; + pos4.y += rechitPos.y * norm; + pos4.z += rechitPos.z * norm; + pos4.w += norm; // position_norm + } + __device__ void dev_hcalFastCluster_optimizedSimple(PFClusteringParamsGPU::DeviceProduct::ConstView pfClusParams, int topoId, int nRHTopo, @@ -697,19 +741,6 @@ namespace PFClusterCudaHCAL { } __syncthreads(); - auto getSeedRhIdx = [&](int seedNum) { return seeds[seedNum]; }; - - auto getRhFracIdx = [&](int seedNum, int rhNum) { - if (rhNum <= 0) - printf("Invalid rhNum (%d) for getRhFracIdx!\n", rhNum); - return rechits[rhNum - 1]; - }; - - auto getRhFrac = [&](int seedNum, int rhNum) { - int seedIdx = topoSeedList[topoSeedBegin + seedNum]; - return pcrhfrac[seedFracOffsets[seedIdx] + rhNum]; - }; - if (debug) { if (threadIdx.x == 0) { printf("\n===========================================================================================\n"); @@ -717,7 +748,7 @@ namespace PFClusterCudaHCAL { for (int s = 0; s < nSeeds; s++) { if (s != 0) printf(", "); - printf("%d", getSeedRhIdx(s)); + printf("%d", dev_getSeedRhIdx(seeds, s)); } if (nRHTopo == nSeeds) { printf(")\n\n"); @@ -726,7 +757,7 @@ namespace PFClusterCudaHCAL { for (int r = 1; r < nRHNotSeed; r++) { if (r != 1) printf(", "); - printf("%d", getRhFracIdx(0, r)); + printf("%d", dev_getRhFracIdx(rechits, r)); } printf(")\n\n"); } @@ -734,33 +765,9 @@ namespace PFClusterCudaHCAL { __syncthreads(); } - auto computeClusterPos = [&](PFClusteringParamsGPU::DeviceProduct::ConstView pfClusParams, - float4& pos4, - float frac, - int rhInd, - bool isDebug) { - float4 rechitPos = make_float4(pfrh_x[rhInd], pfrh_y[rhInd], pfrh_z[rhInd], 1.0); - const auto rh_energy = pfrh_energy[rhInd] * frac; - const auto norm = (frac < pfClusParams.minFracInCalc() ? 0.0f : max(0.0f, logf(rh_energy * rhENormInv))); - if (isDebug) - printf("\t\t\trechit %d: norm = %f\tfrac = %f\trh_energy = %f\tpos = (%f, %f, %f)\n", - rhInd, - norm, - frac, - rh_energy, - rechitPos.x, - rechitPos.y, - rechitPos.z); - - pos4.x += rechitPos.x * norm; - pos4.y += rechitPos.y * norm; - pos4.z += rechitPos.z * norm; - pos4.w += norm; // position_norm - }; - // Set initial cluster position (energy) to seed rechit position (energy) for (int s = threadIdx.x; s < nSeeds; s += gridStride) { - int i = getSeedRhIdx(s); + int i = dev_getSeedRhIdx(seeds, s); clusterPos[s] = make_float4(pfrh_x[i], pfrh_y[i], pfrh_z[i], 1.0); prevClusterPos[s] = clusterPos[s]; clusterEnergy[s] = pfrh_energy[i]; @@ -784,7 +791,7 @@ namespace PFClusterCudaHCAL { float seedEnergy = -1.; float4 seedInitClusterPos = make_float4(0., 0., 0., 0.); if (tid < nSeeds) { - seedThreadIdx = getSeedRhIdx(tid); + seedThreadIdx = dev_getSeedRhIdx(seeds, tid); seedNeighbors = make_int4(pfrh_neighbours[8 * seedThreadIdx], pfrh_neighbours[8 * seedThreadIdx + 1], pfrh_neighbours[8 * seedThreadIdx + 2], @@ -792,7 +799,8 @@ namespace PFClusterCudaHCAL { seedEnergy = pfrh_energy[seedThreadIdx]; // Compute initial cluster position shift for seed - computeClusterPos(pfClusParams, seedInitClusterPos, 1., seedThreadIdx, debug); + dev_computeClusterPos( + pfClusParams, seedInitClusterPos, 1., seedThreadIdx, debug, pfrh_x, pfrh_y, pfrh_z, pfrh_energy, rhENormInv); } do { @@ -869,14 +877,15 @@ namespace PFClusterCudaHCAL { if (tid < nSeeds) { for (int r = 0; r < nRHNotSeed - 1; r++) { int j = rechits[r]; - float frac = getRhFrac(tid, r + 1); + float frac = dev_getRhFrac(topoSeedList, topoSeedBegin, pcrhfrac, seedFracOffsets, tid, r + 1); if (frac > -0.5) { clusterEnergy[tid] += frac * pfrh_energy[j]; if (nSeeds == 1 || j == seedNeighbors.x || j == seedNeighbors.y || j == seedNeighbors.z || j == seedNeighbors.w) - computeClusterPos(pfClusParams, clusterPos[tid], frac, j, debug); + dev_computeClusterPos( + pfClusParams, clusterPos[tid], frac, j, debug, pfrh_x, pfrh_y, pfrh_z, pfrh_energy, rhENormInv); } } } @@ -959,9 +968,16 @@ namespace PFClusterCudaHCAL { __shared__ int nRHNotSeed, topoSeedBegin, gridStride, iter; __shared__ float tol, diff2, rhENormInv; __shared__ bool notDone, debug; - __shared__ float4 clusterPos[400], prevClusterPos[400]; - __shared__ float clusterEnergy[400], rhFracSum[1500]; - __shared__ int seeds[400], rechits[1500]; + //__shared__ float4 clusterPos[400], prevClusterPos[400]; + //__shared__ float clusterEnergy[400], rhFracSum[1500]; + //__shared__ int seeds[400], rechits[1500]; + extern __shared__ float4 sharedArr[]; + float4* clusterPos = sharedArr; //nSeeds + float4* prevClusterPos = (float4*)&clusterPos[nSeeds]; //nSeeds + float* clusterEnergy = (float*)&prevClusterPos[nSeeds]; //nSeeds + float* rhFracSum = (float*)&clusterEnergy[nSeeds]; //nRHTopo - nSeeds + int* seeds = (int*)&rhFracSum[nRHTopo - nSeeds]; //nSeeds + int* rechits = (int*)&seeds[nSeeds]; //nRHTopo - nSeeds if (threadIdx.x == 0) { nRHNotSeed = nRHTopo - nSeeds + 1; // 1 + (# rechits per topoId that are NOT seeds) @@ -991,19 +1007,6 @@ namespace PFClusterCudaHCAL { } __syncthreads(); - auto getSeedRhIdx = [&](int seedNum) { return seeds[seedNum]; }; - - auto getRhFracIdx = [&](int seedNum, int rhNum) { - if (rhNum <= 0) - printf("Invalid rhNum (%d) for getRhFracIdx!\n", rhNum); - return rechits[rhNum - 1]; - }; - - auto getRhFrac = [&](int seedNum, int rhNum) { - int seedIdx = topoSeedList[topoSeedBegin + seedNum]; - return pcrhfrac[seedFracOffsets[seedIdx] + rhNum]; - }; - if (debug) { if (threadIdx.x == 0) { printf("\n===========================================================================================\n"); @@ -1011,7 +1014,7 @@ namespace PFClusterCudaHCAL { for (int s = 0; s < nSeeds; s++) { if (s != 0) printf(", "); - printf("%d", getSeedRhIdx(s)); + printf("%d", dev_getSeedRhIdx(seeds, s)); } if (nRHTopo == nSeeds) { printf(")\n\n"); @@ -1020,7 +1023,7 @@ namespace PFClusterCudaHCAL { for (int r = 1; r < nRHNotSeed; r++) { if (r != 1) printf(", "); - printf("%d", getRhFracIdx(0, r)); + printf("%d", dev_getRhFracIdx(rechits, r)); } printf(")\n\n"); } @@ -1028,30 +1031,6 @@ namespace PFClusterCudaHCAL { __syncthreads(); } - auto computeClusterPos = [&](PFClusteringParamsGPU::DeviceProduct::ConstView pfClusParams, - float4& pos4, - float frac, - int rhInd, - bool isDebug) { - float4 rechitPos = make_float4(pfrh_x[rhInd], pfrh_y[rhInd], pfrh_z[rhInd], 1.0); - const auto rh_energy = pfrh_energy[rhInd] * frac; - const auto norm = (frac < pfClusParams.minFracInCalc() ? 0.0f : max(0.0f, logf(rh_energy * rhENormInv))); - if (isDebug) - printf("\t\trechit %d: norm = %f\tfrac = %f\trh_energy = %f\tpos = (%f, %f, %f)\n", - rhInd, - norm, - frac, - rh_energy, - rechitPos.x, - rechitPos.y, - rechitPos.z); - - pos4.x += rechitPos.x * norm; - pos4.y += rechitPos.y * norm; - pos4.z += rechitPos.z * norm; - pos4.w += norm; // position_norm - }; - // Set initial cluster position (energy) to seed rechit position (energy) for (int s = threadIdx.x; s < nSeeds; s += gridStride) { int i = seeds[s]; @@ -1122,7 +1101,7 @@ namespace PFClusterCudaHCAL { // Reset cluster position and energy for (int s = threadIdx.x; s < nSeeds; s += gridStride) { - int seedRhIdx = getSeedRhIdx(s); + int seedRhIdx = dev_getSeedRhIdx(seeds, s); float norm = logf(pfrh_energy[seedRhIdx] * rhENormInv); clusterPos[s] = make_float4(pfrh_x[seedRhIdx] * norm, pfrh_y[seedRhIdx] * norm, pfrh_z[seedRhIdx] * norm, norm); clusterEnergy[s] = pfrh_energy[seedRhIdx]; @@ -1141,17 +1120,18 @@ namespace PFClusterCudaHCAL { // Recalculate position for (int s = threadIdx.x; s < nSeeds; s += gridStride) { - int seedRhIdx = getSeedRhIdx(s); + int seedRhIdx = dev_getSeedRhIdx(seeds, s); for (int r = 0; r < nRHNotSeed - 1; r++) { int j = rechits[r]; - float frac = getRhFrac(s, r + 1); + float frac = dev_getRhFrac(topoSeedList, topoSeedBegin, pcrhfrac, seedFracOffsets, s, r + 1); if (frac > -0.5) { clusterEnergy[s] += frac * pfrh_energy[j]; if (nSeeds == 1 || j == pfrh_neighbours[8 * seedRhIdx] || j == pfrh_neighbours[8 * seedRhIdx + 1] || j == pfrh_neighbours[8 * seedRhIdx + 2] || j == pfrh_neighbours[8 * seedRhIdx + 3]) - computeClusterPos(pfClusParams, clusterPos[s], frac, j, debug); + dev_computeClusterPos( + pfClusParams, clusterPos[s], frac, j, debug, pfrh_x, pfrh_y, pfrh_z, pfrh_energy, rhENormInv); } } } @@ -1743,10 +1723,13 @@ namespace PFClusterCudaHCAL { dim3 grid((nRH + 31) / 32, (nRH + 31) / 32); dim3 block(32, 32); - typeof(nTopo) h_nTopo; - - cudaCheck(cudaMemcpyFromSymbolAsync(&h_nTopo, nTopo, sizeof(int), 0, cudaMemcpyDeviceToHost, cudaStream)); - + int nRHTopo_h[nRH]; + int nSeedsTopo_h[nRH]; + cudaCheck(cudaMemcpyAsync(&nRHTopo_h, outputGPU.topoRHCount.get(), nRH*sizeof(int), cudaMemcpyDeviceToHost, cudaStream)); + cudaCheck(cudaMemcpyAsync(&nSeedsTopo_h, outputGPU.topoSeedCount.get(), nRH*sizeof(int), cudaMemcpyDeviceToHost, cudaStream)); + int nRHTopoMax = *std::max_element(nRHTopo_h, nRHTopo_h + (nRH-1)); + int nSeedsTopoMax = *std::max_element(nSeedsTopo_h, nSeedsTopo_h + (nRH-1)); + fillRhfIndex<<>>(nRH, outputGPU.pfrh_topoId.get(), outputGPU.pfrh_isSeed.get(), @@ -1755,8 +1738,16 @@ namespace PFClusterCudaHCAL { outputGPU.seedFracOffsets.get(), scratchGPU.rhcount.get(), outputGPU.pcrh_fracInd.get()); + int sharedMem = + nSeedsTopoMax*sizeof(float4)+ + nSeedsTopoMax*sizeof(float4)+ + nSeedsTopoMax*sizeof(float)+ + (nRHTopoMax-nSeedsTopoMax)*sizeof(float)+ + nSeedsTopoMax*sizeof(int)+ + (nRHTopoMax-nSeedsTopoMax)*sizeof(int); + - hcalFastCluster_selection<<>>(pfClusParams.const_view(), + hcalFastCluster_selection<<>>(pfClusParams.const_view(), nRH, inputPFRecHits.pfrh_x.get(), inputPFRecHits.pfrh_y.get(),