Merge pull request cms-sw#16 from jsamudio/devFactorAndShared
fixed device functions and shared memory implementation
jsamudio authored Mar 20, 2023
2 parents 338163d + eb77819 commit d3eb59b
Showing 1 changed file with 86 additions and 95 deletions.
181 changes: 86 additions & 95 deletions RecoParticleFlow/PFClusterProducer/plugins/PFClusterCudaHCAL.cu
@@ -487,6 +487,50 @@ namespace PFClusterCudaHCAL {
}
}

__device__ auto dev_getSeedRhIdx(int* seeds, int seedNum) { return seeds[seedNum]; }

__device__ auto dev_getRhFracIdx(int* rechits, int rhNum) {
if (rhNum <= 0) {
printf("Invalid rhNum (%d) for get RhFracIdx!\n", rhNum);
}
return rechits[rhNum - 1];
}

__device__ auto dev_getRhFrac(
int* topoSeedList, int topoSeedBegin, float* pcrhfrac, int* seedFracOffsets, int seedNum, int rhNum) {
int seedIdx = topoSeedList[topoSeedBegin + seedNum];
return pcrhfrac[seedFracOffsets[seedIdx] + rhNum];
}
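// Layout sketch for the lookups above (illustrative, inferred from the callers
// below): pcrhfrac is a single flattened array holding one block of fractions
// per seed, and seedFracOffsets[seedIdx] marks where that seed's block starts.
// Slot 0 of a block appears to belong to the seed itself, which is why callers
// pass rhNum = r + 1 for the r-th non-seed rechit:
//
//   pcrhfrac: [ seed0 | rh1 | rh2 | ... ][ seed1 | rh1 | ... ]
//               ^ seedFracOffsets[s0]      ^ seedFracOffsets[s1]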

__device__ auto dev_computeClusterPos(PFClusteringParamsGPU::DeviceProduct::ConstView pfClusParams,
float4& pos4,
float frac,
int rhInd,
bool isDebug,
const float* __restrict__ pfrh_x,
const float* __restrict__ pfrh_y,
const float* __restrict__ pfrh_z,
const float* __restrict__ pfrh_energy,
float rhENormInv) {
float4 rechitPos = make_float4(pfrh_x[rhInd], pfrh_y[rhInd], pfrh_z[rhInd], 1.0);
const auto rh_energy = pfrh_energy[rhInd] * frac;
const auto norm = (frac < pfClusParams.minFracInCalc() ? 0.0f : max(0.0f, logf(rh_energy * rhENormInv)));
if (isDebug)
printf("\t\t\trechit %d: norm = %f\tfrac = %f\trh_energy = %f\tpos = (%f, %f, %f)\n",
rhInd,
norm,
frac,
rh_energy,
rechitPos.x,
rechitPos.y,
rechitPos.z);

pos4.x += rechitPos.x * norm;
pos4.y += rechitPos.y * norm;
pos4.z += rechitPos.z * norm;
pos4.w += norm; // position_norm
}
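// dev_computeClusterPos accumulates one term of a log-energy-weighted centroid:
// a rechit enters with weight w = max(0, ln(E_rh * frac / E_norm)), and the
// weight is dropped to zero when frac is below minFracInCalc. pos4.w collects
// the total weight (position_norm) that the caller uses to normalize (x, y, z).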

__device__ void dev_hcalFastCluster_optimizedSimple(PFClusteringParamsGPU::DeviceProduct::ConstView pfClusParams,
int topoId,
int nRHTopo,
@@ -697,27 +741,14 @@
}
__syncthreads();

auto getSeedRhIdx = [&](int seedNum) { return seeds[seedNum]; };

auto getRhFracIdx = [&](int seedNum, int rhNum) {
if (rhNum <= 0)
printf("Invalid rhNum (%d) for getRhFracIdx!\n", rhNum);
return rechits[rhNum - 1];
};

auto getRhFrac = [&](int seedNum, int rhNum) {
int seedIdx = topoSeedList[topoSeedBegin + seedNum];
return pcrhfrac[seedFracOffsets[seedIdx] + rhNum];
};

if (debug) {
if (threadIdx.x == 0) {
printf("\n===========================================================================================\n");
printf("Processing topo cluster %d with nSeeds = %d nRHTopo = %d and seeds (", topoId, nSeeds, nRHTopo);
for (int s = 0; s < nSeeds; s++) {
if (s != 0)
printf(", ");
printf("%d", getSeedRhIdx(s));
printf("%d", dev_getSeedRhIdx(seeds, s));
}
if (nRHTopo == nSeeds) {
printf(")\n\n");
@@ -726,41 +757,17 @@
for (int r = 1; r < nRHNotSeed; r++) {
if (r != 1)
printf(", ");
printf("%d", getRhFracIdx(0, r));
printf("%d", dev_getRhFracIdx(rechits, r));
}
printf(")\n\n");
}
}
__syncthreads();
}

auto computeClusterPos = [&](PFClusteringParamsGPU::DeviceProduct::ConstView pfClusParams,
float4& pos4,
float frac,
int rhInd,
bool isDebug) {
float4 rechitPos = make_float4(pfrh_x[rhInd], pfrh_y[rhInd], pfrh_z[rhInd], 1.0);
const auto rh_energy = pfrh_energy[rhInd] * frac;
const auto norm = (frac < pfClusParams.minFracInCalc() ? 0.0f : max(0.0f, logf(rh_energy * rhENormInv)));
if (isDebug)
printf("\t\t\trechit %d: norm = %f\tfrac = %f\trh_energy = %f\tpos = (%f, %f, %f)\n",
rhInd,
norm,
frac,
rh_energy,
rechitPos.x,
rechitPos.y,
rechitPos.z);

pos4.x += rechitPos.x * norm;
pos4.y += rechitPos.y * norm;
pos4.z += rechitPos.z * norm;
pos4.w += norm; // position_norm
};

// Set initial cluster position (energy) to seed rechit position (energy)
for (int s = threadIdx.x; s < nSeeds; s += gridStride) {
int i = getSeedRhIdx(s);
int i = dev_getSeedRhIdx(seeds, s);
clusterPos[s] = make_float4(pfrh_x[i], pfrh_y[i], pfrh_z[i], 1.0);
prevClusterPos[s] = clusterPos[s];
clusterEnergy[s] = pfrh_energy[i];
@@ -784,15 +791,16 @@
float seedEnergy = -1.;
float4 seedInitClusterPos = make_float4(0., 0., 0., 0.);
if (tid < nSeeds) {
seedThreadIdx = getSeedRhIdx(tid);
seedThreadIdx = dev_getSeedRhIdx(seeds, tid);
seedNeighbors = make_int4(pfrh_neighbours[8 * seedThreadIdx],
pfrh_neighbours[8 * seedThreadIdx + 1],
pfrh_neighbours[8 * seedThreadIdx + 2],
pfrh_neighbours[8 * seedThreadIdx + 3]);
seedEnergy = pfrh_energy[seedThreadIdx];

// Compute initial cluster position shift for seed
computeClusterPos(pfClusParams, seedInitClusterPos, 1., seedThreadIdx, debug);
dev_computeClusterPos(
pfClusParams, seedInitClusterPos, 1., seedThreadIdx, debug, pfrh_x, pfrh_y, pfrh_z, pfrh_energy, rhENormInv);
}

do {
@@ -869,14 +877,15 @@
if (tid < nSeeds) {
for (int r = 0; r < nRHNotSeed - 1; r++) {
int j = rechits[r];
float frac = getRhFrac(tid, r + 1);
float frac = dev_getRhFrac(topoSeedList, topoSeedBegin, pcrhfrac, seedFracOffsets, tid, r + 1);

if (frac > -0.5) {
clusterEnergy[tid] += frac * pfrh_energy[j];

if (nSeeds == 1 || j == seedNeighbors.x || j == seedNeighbors.y || j == seedNeighbors.z ||
j == seedNeighbors.w)
computeClusterPos(pfClusParams, clusterPos[tid], frac, j, debug);
dev_computeClusterPos(
pfClusParams, clusterPos[tid], frac, j, debug, pfrh_x, pfrh_y, pfrh_z, pfrh_energy, rhENormInv);
}
}
}
@@ -959,9 +968,16 @@
__shared__ int nRHNotSeed, topoSeedBegin, gridStride, iter;
__shared__ float tol, diff2, rhENormInv;
__shared__ bool notDone, debug;
__shared__ float4 clusterPos[400], prevClusterPos[400];
__shared__ float clusterEnergy[400], rhFracSum[1500];
__shared__ int seeds[400], rechits[1500];
//__shared__ float4 clusterPos[400], prevClusterPos[400];
//__shared__ float clusterEnergy[400], rhFracSum[1500];
//__shared__ int seeds[400], rechits[1500];
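// The fixed-size arrays above capped each block at 400 seeds and 1500 rechits
// per topo cluster; the dynamically sized buffer below is instead sized by the
// host from the largest topo cluster (see the sharedMem computation at the
// launch site).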
extern __shared__ float4 sharedArr[];
float4* clusterPos = sharedArr; //nSeeds
float4* prevClusterPos = (float4*)&clusterPos[nSeeds]; //nSeeds
float* clusterEnergy = (float*)&prevClusterPos[nSeeds]; //nSeeds
float* rhFracSum = (float*)&clusterEnergy[nSeeds]; //nRHTopo - nSeeds
int* seeds = (int*)&rhFracSum[nRHTopo - nSeeds]; //nSeeds
int* rechits = (int*)&seeds[nSeeds]; //nRHTopo - nSeeds

if (threadIdx.x == 0) {
nRHNotSeed = nRHTopo - nSeeds + 1; // 1 + (# rechits per topoId that are NOT seeds)
@@ -991,27 +1007,14 @@
}
__syncthreads();

auto getSeedRhIdx = [&](int seedNum) { return seeds[seedNum]; };

auto getRhFracIdx = [&](int seedNum, int rhNum) {
if (rhNum <= 0)
printf("Invalid rhNum (%d) for getRhFracIdx!\n", rhNum);
return rechits[rhNum - 1];
};

auto getRhFrac = [&](int seedNum, int rhNum) {
int seedIdx = topoSeedList[topoSeedBegin + seedNum];
return pcrhfrac[seedFracOffsets[seedIdx] + rhNum];
};

if (debug) {
if (threadIdx.x == 0) {
printf("\n===========================================================================================\n");
printf("Processing topo cluster %d with nSeeds = %d nRHTopo = %d and seeds (", topoId, nSeeds, nRHTopo);
for (int s = 0; s < nSeeds; s++) {
if (s != 0)
printf(", ");
printf("%d", getSeedRhIdx(s));
printf("%d", dev_getSeedRhIdx(seeds, s));
}
if (nRHTopo == nSeeds) {
printf(")\n\n");
@@ -1020,38 +1023,14 @@
for (int r = 1; r < nRHNotSeed; r++) {
if (r != 1)
printf(", ");
printf("%d", getRhFracIdx(0, r));
printf("%d", dev_getRhFracIdx(rechits, r));
}
printf(")\n\n");
}
}
__syncthreads();
}

auto computeClusterPos = [&](PFClusteringParamsGPU::DeviceProduct::ConstView pfClusParams,
float4& pos4,
float frac,
int rhInd,
bool isDebug) {
float4 rechitPos = make_float4(pfrh_x[rhInd], pfrh_y[rhInd], pfrh_z[rhInd], 1.0);
const auto rh_energy = pfrh_energy[rhInd] * frac;
const auto norm = (frac < pfClusParams.minFracInCalc() ? 0.0f : max(0.0f, logf(rh_energy * rhENormInv)));
if (isDebug)
printf("\t\trechit %d: norm = %f\tfrac = %f\trh_energy = %f\tpos = (%f, %f, %f)\n",
rhInd,
norm,
frac,
rh_energy,
rechitPos.x,
rechitPos.y,
rechitPos.z);

pos4.x += rechitPos.x * norm;
pos4.y += rechitPos.y * norm;
pos4.z += rechitPos.z * norm;
pos4.w += norm; // position_norm
};

// Set initial cluster position (energy) to seed rechit position (energy)
for (int s = threadIdx.x; s < nSeeds; s += gridStride) {
int i = seeds[s];
@@ -1122,7 +1101,7 @@

// Reset cluster position and energy
for (int s = threadIdx.x; s < nSeeds; s += gridStride) {
int seedRhIdx = getSeedRhIdx(s);
int seedRhIdx = dev_getSeedRhIdx(seeds, s);
float norm = logf(pfrh_energy[seedRhIdx] * rhENormInv);
clusterPos[s] = make_float4(pfrh_x[seedRhIdx] * norm, pfrh_y[seedRhIdx] * norm, pfrh_z[seedRhIdx] * norm, norm);
clusterEnergy[s] = pfrh_energy[seedRhIdx];
@@ -1141,17 +1120,18 @@

// Recalculate position
for (int s = threadIdx.x; s < nSeeds; s += gridStride) {
int seedRhIdx = getSeedRhIdx(s);
int seedRhIdx = dev_getSeedRhIdx(seeds, s);
for (int r = 0; r < nRHNotSeed - 1; r++) {
int j = rechits[r];
float frac = getRhFrac(s, r + 1);
float frac = dev_getRhFrac(topoSeedList, topoSeedBegin, pcrhfrac, seedFracOffsets, s, r + 1);

if (frac > -0.5) {
clusterEnergy[s] += frac * pfrh_energy[j];

if (nSeeds == 1 || j == pfrh_neighbours[8 * seedRhIdx] || j == pfrh_neighbours[8 * seedRhIdx + 1] ||
j == pfrh_neighbours[8 * seedRhIdx + 2] || j == pfrh_neighbours[8 * seedRhIdx + 3])
computeClusterPos(pfClusParams, clusterPos[s], frac, j, debug);
dev_computeClusterPos(
pfClusParams, clusterPos[s], frac, j, debug, pfrh_x, pfrh_y, pfrh_z, pfrh_energy, rhENormInv);
}
}
}
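// Note on the loop above: every accepted fraction (frac > -0.5) adds to
// clusterEnergy, but only rechits matching the seed's first four stored
// neighbours (or all rechits when nSeeds == 1) enter the position update.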
@@ -1743,10 +1723,13 @@
dim3 grid((nRH + 31) / 32, (nRH + 31) / 32);
dim3 block(32, 32);

typeof(nTopo) h_nTopo;

cudaCheck(cudaMemcpyFromSymbolAsync(&h_nTopo, nTopo, sizeof(int), 0, cudaMemcpyDeviceToHost, cudaStream));

// std::vector instead of variable-length arrays, which are non-standard C++
// (assumes <vector> is included; <algorithm> is already needed for max_element).
std::vector<int> nRHTopo_h(nRH);
std::vector<int> nSeedsTopo_h(nRH);
cudaCheck(cudaMemcpyAsync(nRHTopo_h.data(), outputGPU.topoRHCount.get(), nRH * sizeof(int), cudaMemcpyDeviceToHost, cudaStream));
cudaCheck(cudaMemcpyAsync(nSeedsTopo_h.data(), outputGPU.topoSeedCount.get(), nRH * sizeof(int), cudaMemcpyDeviceToHost, cudaStream));
// Ensure both copies have landed before the host reads the buffers.
cudaCheck(cudaStreamSynchronize(cudaStream));
// Largest topo-cluster size and seed count across all nRH entries; these
// bound the dynamic shared memory each hcalFastCluster_selection block needs.
int nRHTopoMax = *std::max_element(nRHTopo_h.begin(), nRHTopo_h.end());
int nSeedsTopoMax = *std::max_element(nSeedsTopo_h.begin(), nSeedsTopo_h.end());

fillRhfIndex<<<grid, block, 0, cudaStream>>>(nRH,
outputGPU.pfrh_topoId.get(),
outputGPU.pfrh_isSeed.get(),
Expand All @@ -1755,8 +1738,16 @@ namespace PFClusterCudaHCAL {
outputGPU.seedFracOffsets.get(),
scratchGPU.rhcount.get(),
outputGPU.pcrh_fracInd.get());
int sharedMem =
nSeedsTopoMax * sizeof(float4) +                // clusterPos
nSeedsTopoMax * sizeof(float4) +                // prevClusterPos
nSeedsTopoMax * sizeof(float) +                 // clusterEnergy
(nRHTopoMax - nSeedsTopoMax) * sizeof(float) +  // rhFracSum
nSeedsTopoMax * sizeof(int) +                   // seeds
(nRHTopoMax - nSeedsTopoMax) * sizeof(int);     // rechits


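// Sketch (assumption, not part of this commit): sharedMem grows with the
// largest topo cluster, and a request beyond the default 48 KB per block
// must be opted into once before the launch, e.g.:
//
//   cudaCheck(cudaFuncSetAttribute(hcalFastCluster_selection,
//                                  cudaFuncAttributeMaxDynamicSharedMemorySize,
//                                  sharedMem));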
hcalFastCluster_selection<<<nRH, threadsPerBlock, 0, cudaStream>>>(pfClusParams.const_view(),
hcalFastCluster_selection<<<nRH, threadsPerBlock, sharedMem, cudaStream>>>(pfClusParams.const_view(),
nRH,
inputPFRecHits.pfrh_x.get(),
inputPFRecHits.pfrh_y.get(),
