From 8ae2ba80def631aaecd104fb9c7c3a8f757f527d Mon Sep 17 00:00:00 2001 From: Kenichi Hatakeyama Date: Fri, 30 Sep 2022 07:27:30 -0500 Subject: [PATCH 01/11] temporary fix for shared memory bool going out of bounds. --- .../plugins/PFClusterCudaHCAL.cu | 20 +++++++++---------- 1 file changed, 10 insertions(+), 10 deletions(-) diff --git a/RecoParticleFlow/PFClusterProducer/plugins/PFClusterCudaHCAL.cu b/RecoParticleFlow/PFClusterProducer/plugins/PFClusterCudaHCAL.cu index d29e3794aa3b1..2a495a2d9fa79 100644 --- a/RecoParticleFlow/PFClusterProducer/plugins/PFClusterCudaHCAL.cu +++ b/RecoParticleFlow/PFClusterProducer/plugins/PFClusterCudaHCAL.cu @@ -73,9 +73,9 @@ namespace PFClusterCudaHCAL { // --- kernel summary -- // initializeCudaConstants // PFRechitToPFCluster_HCAL_entryPoint - // seedingTopoThreshKernel_HCAL - // prepareTopoInputs - // topoClusterLinking + // seedingTopoThreshKernel_HCAL: apply seeding/topo-clustering threshold to RecHits, also ensure a peak (outputs: pfrh_isSeed, pfrh_passTopoThresh) [OutputDataGPU] + // prepareTopoInputs: prepare "edge" data (output: nEdges, pfrh_edgeId, pfrh_edgeList [nEdges dimension]) + // topoClusterLinking: // topoClusterContraction // fillRhfIndex // hcalFastCluster_selection @@ -3766,7 +3766,7 @@ namespace PFClusterCudaHCAL { int* pfrh_edgeMask, const int* pfrh_passTopoThresh, int* topoIter) { - __shared__ bool notDone; + __shared__ int notDone; // This is better be bool, but somehow it leads to out of bound __shared__ int iter, gridStride, nEdges; int start = blockIdx.x * blockDim.x + threadIdx.x; @@ -3790,7 +3790,7 @@ namespace PFClusterCudaHCAL { do { if (threadIdx.x == 0) { - notDone = false; + notDone = 0; } __syncthreads(); @@ -3814,7 +3814,7 @@ namespace PFClusterCudaHCAL { // edgeMask set to true if elements of edgeId and edgeList are different if (pfrh_edgeId[idx] != pfrh_edgeList[idx]) { pfrh_edgeMask[idx] = 1; - notDone = true; + notDone = 1; } else { pfrh_edgeMask[idx] = 0; } @@ -3825,13 +3825,13 @@ namespace PFClusterCudaHCAL { __syncthreads(); - if (!notDone) + if (notDone==0) break; __syncthreads();//!! if (threadIdx.x == 0) { - notDone = false; + notDone = 0; // KenH is this necessary?
} __syncthreads(); @@ -3858,7 +3858,7 @@ namespace PFClusterCudaHCAL { // edgeMask set to true if elements of edgeId and edgeList are different if (pfrh_edgeId[idx] != pfrh_edgeList[idx]) { pfrh_edgeMask[idx] = 1; - notDone = true; + notDone = 1; } else { pfrh_edgeMask[idx] = 0; } @@ -3870,7 +3870,7 @@ namespace PFClusterCudaHCAL { __syncthreads(); - } while (notDone); + } while (notDone==1); *topoIter = iter; #ifdef DEBUG_GPU_HCAL From f5f6d63e4723598d6d4a5d5a413e470859e1d1a1 Mon Sep 17 00:00:00 2001 From: Kenichi Hatakeyama Date: Mon, 3 Oct 2022 01:29:53 -0500 Subject: [PATCH 02/11] memory allocation adjustment --- RecoParticleFlow/PFClusterProducer/plugins/DeclsForKernels.h | 3 ++- .../PFClusterProducer/plugins/PFClusterProducerCudaHCAL.cc | 1 + 2 files changed, 3 insertions(+), 1 deletion(-) diff --git a/RecoParticleFlow/PFClusterProducer/plugins/DeclsForKernels.h b/RecoParticleFlow/PFClusterProducer/plugins/DeclsForKernels.h index 46310729d5a5e..02b1930d66aec 100644 --- a/RecoParticleFlow/PFClusterProducer/plugins/DeclsForKernels.h +++ b/RecoParticleFlow/PFClusterProducer/plugins/DeclsForKernels.h @@ -161,7 +161,7 @@ namespace PFRecHit { namespace PFClustering { namespace HCAL { struct ConfigurationParameters { - uint32_t maxRH = 3000; // previously: 2000 + uint32_t maxRH = 4000; // previously: 2000 uint32_t maxPFCFracs = 600000; // previously: 80000 uint32_t maxNeighbors = 8; }; @@ -332,6 +332,7 @@ namespace PFClustering { } }; } // namespace HCAL + } // namespace PFClustering #endif // RecoParticleFlow_PFClusterProducer_plugins_DeclsForKernels_h diff --git a/RecoParticleFlow/PFClusterProducer/plugins/PFClusterProducerCudaHCAL.cc b/RecoParticleFlow/PFClusterProducer/plugins/PFClusterProducerCudaHCAL.cc index 4cf0d9525b98e..aace914db3ab7 100644 --- a/RecoParticleFlow/PFClusterProducer/plugins/PFClusterProducerCudaHCAL.cc +++ b/RecoParticleFlow/PFClusterProducer/plugins/PFClusterProducerCudaHCAL.cc @@ -317,6 +317,7 @@ void PFClusterProducerCudaHCAL::acquire(edm::Event const& event, nRH_ = PFRecHits.size; if (nRH_ == 0) return; + if (nRH_>3000) std::cout << "nRH(PFRecHitSize)>3000: " << nRH_ << std::endl; const int numbytes_int = nRH_ * sizeof(int); int totalNeighbours = 0; // Running count of 8 neighbour edges for edgeId, edgeList From 8f30bee3fe1d063d2ab497c9c994209bd9a08f47 Mon Sep 17 00:00:00 2001 From: Kenichi Hatakeyama Date: Mon, 3 Oct 2022 22:33:45 -0500 Subject: [PATCH 03/11] memory allocation param updates.
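Brings the producer-side warning in line with maxRH = 4000 and trims maxPFCFracs back from the temporary 600000 to 200000. For reference, the guard this series scatters around the fixed-size device buffers amounts to the following host-side sketch; the constant and function names here are hypothetical, and a production version would report through the CMSSW MessageLogger rather than printf:

    #include <cstdio>

    // Hypothetical mirrors of the caps in ConfigurationParameters (DeclsForKernels.h)
    constexpr unsigned int kMaxRH = 4000;         // max PFRecHits per event
    constexpr unsigned int kMaxPFCFracs = 200000; // max PFCluster rechit fractions per event

    // Returns false (and warns) when an event would overrun the preallocated
    // device buffers, instead of letting the kernels write out of bounds.
    inline bool withinAllocationBounds(unsigned int nRH, unsigned int nFracs) {
      bool ok = true;
      if (nRH > kMaxRH) {
        std::printf("nRH(PFRecHitSize)>%u: %u\n", kMaxRH, nRH);
        ok = false;
      }
      if (nFracs > kMaxPFCFracs) {
        std::printf("pcrhFracSize>%u: %u\n", kMaxPFCFracs, nFracs);
        ok = false;
      }
      return ok;
    }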
--- RecoParticleFlow/PFClusterProducer/plugins/DeclsForKernels.h | 2 +- .../PFClusterProducer/plugins/PFClusterProducerCudaHCAL.cc | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/RecoParticleFlow/PFClusterProducer/plugins/DeclsForKernels.h b/RecoParticleFlow/PFClusterProducer/plugins/DeclsForKernels.h index 02b1930d66aec..f27e576afe3b0 100644 --- a/RecoParticleFlow/PFClusterProducer/plugins/DeclsForKernels.h +++ b/RecoParticleFlow/PFClusterProducer/plugins/DeclsForKernels.h @@ -162,7 +162,7 @@ namespace PFClustering { namespace HCAL { struct ConfigurationParameters { uint32_t maxRH = 4000; // previously: 2000 - uint32_t maxPFCFracs = 600000; // previously: 80000 + uint32_t maxPFCFracs = 200000; // previously: 80000 uint32_t maxNeighbors = 8; }; diff --git a/RecoParticleFlow/PFClusterProducer/plugins/PFClusterProducerCudaHCAL.cc b/RecoParticleFlow/PFClusterProducer/plugins/PFClusterProducerCudaHCAL.cc index aace914db3ab7..3dd5a66718421 100644 --- a/RecoParticleFlow/PFClusterProducer/plugins/PFClusterProducerCudaHCAL.cc +++ b/RecoParticleFlow/PFClusterProducer/plugins/PFClusterProducerCudaHCAL.cc @@ -317,7 +317,7 @@ void PFClusterProducerCudaHCAL::acquire(edm::Event const& event, nRH_ = PFRecHits.size; if (nRH_ == 0) return; - if (nRH_>3000) std::cout << "nRH(PFRecHitSize)>3000: " << nRH_ << std::endl; + if (nRH_>4000) std::cout << "nRH(PFRecHitSize)>4000: " << nRH_ << std::endl; const int numbytes_int = nRH_ * sizeof(int); int totalNeighbours = 0; // Running count of 8 neighbour edges for edgeId, edgeList From a6fe13943b5646c2388edee1ef9a16527281cc96 Mon Sep 17 00:00:00 2001 From: Kenichi Hatakeyama Date: Mon, 3 Oct 2022 22:34:42 -0500 Subject: [PATCH 04/11] Ad-hoc kernel for topo linking. --- .../plugins/PFClusterCudaHCAL.cu | 312 ++++++++++++++---- 1 file changed, 252 insertions(+), 60 deletions(-) diff --git a/RecoParticleFlow/PFClusterProducer/plugins/PFClusterCudaHCAL.cu b/RecoParticleFlow/PFClusterProducer/plugins/PFClusterCudaHCAL.cu index 2a495a2d9fa79..16a1605e4247f 100644 --- a/RecoParticleFlow/PFClusterProducer/plugins/PFClusterCudaHCAL.cu +++ b/RecoParticleFlow/PFClusterProducer/plugins/PFClusterCudaHCAL.cu @@ -74,10 +74,10 @@ namespace PFClusterCudaHCAL { // initializeCudaConstants // PFRechitToPFCluster_HCAL_entryPoint // seedingTopoThreshKernel_HCAL: apply seeding/topo-clustering threshold to RecHits, also ensure a peak (outputs: pfrh_isSeed, pfrh_passTopoThresh) [OutputDataGPU] - // prepareTopoInputs: prepare "edge" data (output: nEdges, pfrh_edgeId, pfrh_edgeList [nEdges dimension]) - // topoClusterLinking: - // topoClusterContraction - // fillRhfIndex + // prepareTopoInputs: prepare "edge" data (outputs: nEdges, pfrh_edgeId, pfrh_edgeList [nEdges dimension]) + // topoClusterLinking: run topo clustering (output: pfrh_topoId) + // topoClusterContraction: find parent of parent (or parent (of parent ...)) (outputs: pfrh_parent, topoSeedCount, topoSeedOffsets, topoSeedList, seedFracOffsets, pcrhfracind, pcrhfrac) + // fillRhfIndex: fill rhfracind (PFCluster RecHitFraction constituent PFRecHit indices) // hcalFastCluster_selection // dev_hcalFastCluster_optimizedSimple // dev_hcalFastCluster_optimizedComplex @@ -1830,7 +1830,7 @@ namespace PFClusterCudaHCAL { pos4.z += rechitPos.z * norm; pos4.w += norm; // position_norm }; - /* + /* auto computeClusterPosAtomic = [&] (float4& pos4, float _frac, int rhInd, bool isDebug) { float4 rechitPos = make_float4(pfrh_x[rhInd], pfrh_y[rhInd], pfrh_z[rhInd], 1.0); @@ -1839,7 +1839,7 @@ namespace PFClusterCudaHCAL { 
(_frac < minFractionInCalc ? 0.0f : max(0.0f, logf(rh_energy * rhENormInv))); if (isDebug) printf("\t\t\trechit %d: norm = %f\tfrac = %f\trh_energy = %f\tpos = (%f, %f, %f)\n", rhInd, norm, _frac, rh_energy, rechitPos.x, rechitPos.y, rechitPos.z); - + atomicAdd(&pos4.x, rechitPos.x * norm); atomicAdd(&pos4.y, rechitPos.y * norm); atomicAdd(&pos4.z, rechitPos.z * norm); @@ -2168,7 +2168,7 @@ namespace PFClusterCudaHCAL { pos4.z += rechitPos.z * norm; pos4.w += norm; // position_norm }; - /* + /* auto computeClusterPosAtomic = [&] (float4& pos4, float _frac, int rhInd, bool isDebug) { float4 rechitPos = make_float4(pfrh_x[rhInd], pfrh_y[rhInd], pfrh_z[rhInd], 1.0); @@ -2177,7 +2177,7 @@ namespace PFClusterCudaHCAL { (_frac < minFractionInCalc ? 0.0f : max(0.0f, logf(rh_energy * rhENormInv))); if (isDebug) printf("\t\t\trechit %d: norm = %f\tfrac = %f\trh_energy = %f\tpos = (%f, %f, %f)\n", rhInd, norm, _frac, rh_energy, rechitPos.x, rechitPos.y, rechitPos.z); - + atomicAdd(&pos4.x, rechitPos.x * norm); atomicAdd(&pos4.y, rechitPos.y * norm); atomicAdd(&pos4.z, rechitPos.z * norm); @@ -3520,38 +3520,11 @@ namespace PFClusterCudaHCAL { } } - // Contraction in a single block - __global__ void topoClusterContraction(size_t size, int* pfrh_parent, int* pfrh_isSeed) { - __shared__ int notDone; - if (threadIdx.x == 0) - notDone = 0; - __syncthreads(); - - do { - volatile bool threadNotDone = false; - for (int i = threadIdx.x; i < size; i += blockDim.x) { - int parent = pfrh_parent[i]; - if (parent >= 0 && parent != pfrh_parent[parent]) { - threadNotDone = true; - pfrh_parent[i] = pfrh_parent[parent]; - } - } - if (threadIdx.x == 0) - notDone = 0; - __syncthreads(); - - atomicAdd(¬Done, (int)threadNotDone); - //if (threadNotDone) notDone = true; - //notDone |= threadNotDone; - __syncthreads(); - - } while (notDone); - } - // Contraction in a single block __global__ void topoClusterContraction(size_t size, int* pfrh_parent, int* pfrh_isSeed, + //const int* pfrh_neighbours, // temporary inputs for debuggi int* rhCount, int* topoSeedCount, int* topoRHCount, @@ -3588,6 +3561,30 @@ namespace PFClusterCudaHCAL { } while (notDone); + // // debugging printing block + // __syncthreads(); + // if (threadIdx.x == 0) { + // int nnode=0; + // for (int i = 0; i < size; i++) { + // //printf("final pfrh_id,parent: %d %d\n",i,pfrh_parent[i]); + // if (i==pfrh_parent[i]) nnode++; + // } + // printf("pfrh_parent 3 multiplicity: %d\n",nnode); + // for (int pos = 0; pos < size; pos++) { + // int parent_target = pfrh_parent[pos]; + // for (int i = 0; i < 8; i++) { + // int neighbor_id = pfrh_neighbours[pos * 8 + i]; + // if (neighbor_id>-1){ // valid neighbors + // int parent_neighbor = pfrh_parent[neighbor_id]; + // if (parent_target!=parent_neighbor){ + // printf("hmm. they should have the same parent, but they don't. why... 
%d %d\n",pos,neighbor_id); + // } + // } + // } + // } + // } + // __syncthreads(); + // Now determine the number of seeds and rechits in each topo cluster for (int rhIdx = threadIdx.x; rhIdx < size; rhIdx += blockDim.x) { int topoId = pfrh_parent[rhIdx]; @@ -3640,8 +3637,10 @@ namespace PFClusterCudaHCAL { __syncthreads(); if (threadIdx.x == 0) { *pcrhFracSize = totalSeedFracOffset; - //printf("At the end of topoClusterContraction, found *pcrhFracSize = %d\n", *pcrhFracSize); + if (*pcrhFracSize>200000) // DeclsForKernels.h maxPFCFracs + printf("At the end of topoClusterContraction, found large *pcrhFracSize = %d\n", *pcrhFracSize); } + } // Prefill the rechit index for all PFCluster fractions @@ -3676,7 +3675,7 @@ namespace PFClusterCudaHCAL { int* pcrhfracind) { //int debugSeedIdx = 500; - /* + /* printf("rhCount = \n["); for (int i = 0; i < (int)nRH; i++) { if (i != 0) printf(", "); @@ -3729,6 +3728,27 @@ namespace PFClusterCudaHCAL { return false; } + // when on the left edge of the edgeId/List block, returns true + __device__ __forceinline__ bool isLeftEdgeKH(const int idx, + const int nEdges, + const int* __restrict__ pfrh_edgeId, + const int* __restrict__ pfrh_edgeMask) { + int temp = idx - 1; + if (idx > 0) { + int edgeId = pfrh_edgeId[idx]; + int tempId = pfrh_edgeId[temp]; + if (edgeId != tempId) { + // Different topo Id here! + return true; + } + } else if (temp < 0) { // idx==0 + return true; + } + + // Invalid index + return false; + } + __device__ __forceinline__ bool isRightEdge(const int idx, const int nEdges, const int* __restrict__ pfrh_edgeId, @@ -3881,6 +3901,176 @@ namespace PFClusterCudaHCAL { #endif } + __global__ void topoClusterLinkingKH(int nRH, + int* nEdgesIn, + //float* pfrh_energy, // Temporary entry for debugging + int* pfrh_parent, + int* pfrh_edgeId, + int* pfrh_edgeList, + int* pfrh_edgeMask, + const int* pfrh_passTopoThresh, + int* topoIter) { + __shared__ int notDone; // This is better be bool, but somehow it leads to out of bound + __shared__ int notDone2; + __shared__ int gridStride, nEdges; + + // Initialization + int start = blockIdx.x * blockDim.x + threadIdx.x; + + if (threadIdx.x == 0) { + *topoIter = 0; + nEdges = *nEdgesIn; + gridStride = blockDim.x * gridDim.x; // For single block kernel this is the number of threads + } + + __syncthreads(); + + // Check if pairs in edgeId,edgeList contain a rh not passing topo threshold + // If found, set the mask to 0 + // But, for now, not using edgeMask hereafter, because the same threshold cut is applied at the PFRecHit level + // for (int idx = start; idx < nEdges; idx += gridStride) { + // if (pfrh_passTopoThresh[pfrh_edgeId[idx]] && pfrh_passTopoThresh[pfrh_edgeList[idx]]) + // pfrh_edgeMask[idx] = 1; + // else + // pfrh_edgeMask[idx] = 0; + // } + + // __syncthreads(); + + // // Print out debugging info + // if (threadIdx.x == 0) { + // // for (int idx = 0; idx < nEdges; idx++) { + // // printf("initial edge id, list, mask: %d %d %d\n",pfrh_edgeId[idx],pfrh_edgeList[idx],pfrh_edgeMask[idx]); + // // //printf("initial edge id, list, mask: %d %d\n",pfrh_edgeId[idx],pfrh_edgeList[idx]); + // // } + // printf("number of eges %d\n",nEdges); + // // for (int i = 0; i < nRH; i++) { + // // printf("initial pfrh_id,parent,energy: %d %d %8.3f\n",i,pfrh_parent[i],pfrh_energy[i]); + // // } + // } + + // __syncthreads(); + + // for notDone + if (threadIdx.x == 0) { + notDone = 0; + notDone2 = 0; + //printf("gridStride, blockDim.x %d %d\n",gridStride,blockDim.x); + } + + __syncthreads(); + + // (1) First attempt + 
// First edge [set parents to those smaller numbers] + for (int idx = start; idx < nEdges; idx += gridStride) { + int i = pfrh_edgeId[idx]; // Get edge topo id + if (pfrh_edgeMask[idx] > 0 && isLeftEdgeKH(idx, nEdges, pfrh_edgeId, pfrh_edgeMask)) { // isLeftEdgeKH + pfrh_parent[i] = (int)min(i, pfrh_edgeList[idx]); + } + } + + __syncthreads(); + + // KenH + for (int ii=0; ii<100; ii++) { // loop until topo clustering iteration converges + + // for notDone + if (threadIdx.x == 0) { + notDone2 = 0; + } + + // Follow parents of parents .... to contract parent structure + do { + volatile bool threadNotDone = false; + for (int i = threadIdx.x; i < nRH; i += blockDim.x) { + int parent = pfrh_parent[i]; + if (parent >= 0 && parent != pfrh_parent[parent]) { + threadNotDone = true; + pfrh_parent[i] = pfrh_parent[parent]; + } + } + if (threadIdx.x == 0) + notDone = 0; + __syncthreads(); + + atomicAdd(¬Done, (int)threadNotDone); + __syncthreads(); + + } while (notDone); + + __syncthreads(); + + // Print out debugging info + // Connect remaining links + // if (threadIdx.x == 0) { + // int nnode=0; + // for (int i = 0; i < nRH; i++) { + // //printf("middle pfrh_id,parent: %d %d\n",i,pfrh_parent[i]); + // if (i==pfrh_parent[i]) nnode++; + // } + // printf("pfrh_parent multiplicity: %d\n",nnode); + // } + + // __syncthreads(); + + for (int idx = start; idx < nEdges; idx += gridStride) { + //for (int idx = 0; idx < nEdges; idx++) { + int i = pfrh_edgeId[idx]; // Get edge topo id + int j = pfrh_edgeList[idx]; // Get edge neighbor list + int parent_target = pfrh_parent[i]; + int parent_neighbor = pfrh_parent[j]; + if (parent_target!=parent_neighbor){ + notDone2 = 1; + //printf("hmm. they should have the same parent, but they don't. why... %d %d %d\n",i,j,ii); + int min_parent = (int)min(parent_target,parent_neighbor); + int max_parent = (int)max(parent_target,parent_neighbor); + int idx_max = i; + if (parent_neighbor == max_parent) idx_max = j; + pfrh_parent[idx_max] = min_parent; + } + } + + __syncthreads(); + if (notDone2==0) // if topocluster finding is converged, terminate the for-ii loop + break; + + } // for-loop ii + + __syncthreads(); + + // Follow parents of parents .... 
to contract parent structure + do { + volatile bool threadNotDone = false; + for (int i = threadIdx.x; i < nRH; i += blockDim.x) { + int parent = pfrh_parent[i]; + if (parent >= 0 && parent != pfrh_parent[parent]) { + threadNotDone = true; + pfrh_parent[i] = pfrh_parent[parent]; + } + } + if (threadIdx.x == 0) + notDone = 0; + __syncthreads(); + + atomicAdd(¬Done, (int)threadNotDone); + __syncthreads(); + + } while (notDone); + + //__syncthreads(); + + // Print out debugging info + // if (threadIdx.x == 0) { + // int nnode=0; + // for (int i = 0; i < nRH; i++) { + // //printf("middle2 pfrh_id,parent: %d %d\n",i,pfrh_parent[i]); + // if (i==pfrh_parent[i]) nnode++; + // } + // printf("pfrh_parent 2 multiplicity: %d\n",nnode); + // } + + } + __device__ __forceinline__ void sortSwap(int* toSort, int a, int b) { const int tmp = min(toSort[a], toSort[b]); toSort[b] = max(toSort[a], toSort[b]); @@ -4475,23 +4665,23 @@ namespace PFClusterCudaHCAL { cudaEventRecord(start, cudaStream); #endif - // prepareTopoInputsSerial<<<1, 1, 4 * (8+4) * sizeof(int), cudaStream>>>( - // nRH, - // outputGPU.nEdges.get(), - // outputGPU.pfrh_passTopoThresh.get(), - // inputPFRecHits.pfrh_neighbours.get(), - // scratchGPU.pfrh_edgeId.get(), - // scratchGPU.pfrh_edgeList.get()); + prepareTopoInputsSerial<<<1, 1, 4 * (8+4) * sizeof(int), cudaStream>>>( + nRH, + outputGPU.nEdges.get(), + outputGPU.pfrh_passTopoThresh.get(), + inputPFRecHits.pfrh_neighbours.get(), + scratchGPU.pfrh_edgeId.get(), + scratchGPU.pfrh_edgeList.get()); // Topo clustering // Fill edgeId, edgeList arrays with rechit neighbors // Has a bug when using more than 128 threads.. - prepareTopoInputs<<<1, 128, 128 * (8 + 4) * sizeof(int), cudaStream>>>(nRH, - outputGPU.nEdges.get(), - outputGPU.pfrh_passTopoThresh.get(), - inputPFRecHits.pfrh_neighbours.get(), - scratchGPU.pfrh_edgeId.get(), - scratchGPU.pfrh_edgeList.get()); + // prepareTopoInputs<<<1, 128, 128 * (8 + 4) * sizeof(int), cudaStream>>>(nRH, + // outputGPU.nEdges.get(), + // outputGPU.pfrh_passTopoThresh.get(), + // inputPFRecHits.pfrh_neighbours.get(), + // scratchGPU.pfrh_edgeId.get(), + // scratchGPU.pfrh_edgeList.get()); cudaCheck(cudaStreamSynchronize(cudaStream)); // prepareTopoInputs<<<1, 256, 256 * (8+4) * sizeof(int), cudaStream>>>( @@ -4522,20 +4712,22 @@ namespace PFClusterCudaHCAL { #endif // Topo clustering - topoClusterLinking<<<1, 512, 0, cudaStream>>>(nRH, - outputGPU.nEdges.get(), - outputGPU.pfrh_topoId.get(), - scratchGPU.pfrh_edgeId.get(), - scratchGPU.pfrh_edgeList.get(), - scratchGPU.pfrh_edgeMask.get(), - //inputGPU.pfrh_edgeMask.get(), - outputGPU.pfrh_passTopoThresh.get(), - outputGPU.topoIter.get()); + topoClusterLinkingKH<<<1, 512, 0, cudaStream>>>(nRH, + outputGPU.nEdges.get(), + //inputPFRecHits.pfrh_energy.get(), // temporary entry for debugging + outputGPU.pfrh_topoId.get(), + scratchGPU.pfrh_edgeId.get(), + scratchGPU.pfrh_edgeList.get(), + scratchGPU.pfrh_edgeMask.get(), + //inputGPU.pfrh_edgeMask.get(), + outputGPU.pfrh_passTopoThresh.get(), + outputGPU.topoIter.get()); cudaCheck(cudaStreamSynchronize(cudaStream)); topoClusterContraction<<<1, 512, 0, cudaStream>>>(nRH, outputGPU.pfrh_topoId.get(), outputGPU.pfrh_isSeed.get(), + //inputPFRecHits.pfrh_neighbours.get(), // temporary entry for debugging scratchGPU.rhcount.get(), outputGPU.topoSeedCount.get(), outputGPU.topoRHCount.get(), From 80e88a586216033865212ff1ed4842755e1a909e Mon Sep 17 00:00:00 2001 From: Kenichi Hatakeyama Date: Tue, 4 Oct 2022 11:24:51 -0500 Subject: [PATCH 05/11] Add explicit 
initialization of pfrh_parent. --- .../plugins/PFClusterCudaHCAL.cu | 100 ++++++++++-------- 1 file changed, 53 insertions(+), 47 deletions(-) diff --git a/RecoParticleFlow/PFClusterProducer/plugins/PFClusterCudaHCAL.cu b/RecoParticleFlow/PFClusterProducer/plugins/PFClusterCudaHCAL.cu index 16a1605e4247f..99413662041ea 100644 --- a/RecoParticleFlow/PFClusterProducer/plugins/PFClusterCudaHCAL.cu +++ b/RecoParticleFlow/PFClusterProducer/plugins/PFClusterCudaHCAL.cu @@ -3851,7 +3851,7 @@ namespace PFClusterCudaHCAL { __syncthreads();//!! if (threadIdx.x == 0) { - notDone = 0; // KenH is this necessary? + notDone = 0; } __syncthreads(); @@ -3949,7 +3949,12 @@ namespace PFClusterCudaHCAL { // // } // } - // __syncthreads(); + // Explicitly initialize pfrh_parent + for (int i = start; i < nRH; i += gridStride) { + pfrh_parent[i] = i; + } + + __syncthreads(); // for notDone if (threadIdx.x == 0) { @@ -3976,25 +3981,25 @@ namespace PFClusterCudaHCAL { // for notDone if (threadIdx.x == 0) { - notDone2 = 0; + notDone2 = 0; } // Follow parents of parents .... to contract parent structure do { - volatile bool threadNotDone = false; - for (int i = threadIdx.x; i < nRH; i += blockDim.x) { - int parent = pfrh_parent[i]; - if (parent >= 0 && parent != pfrh_parent[parent]) { - threadNotDone = true; - pfrh_parent[i] = pfrh_parent[parent]; - } - } - if (threadIdx.x == 0) - notDone = 0; - __syncthreads(); - - atomicAdd(¬Done, (int)threadNotDone); - __syncthreads(); + volatile bool threadNotDone = false; + for (int i = threadIdx.x; i < nRH; i += blockDim.x) { + int parent = pfrh_parent[i]; + if (parent >= 0 && parent != pfrh_parent[parent]) { + threadNotDone = true; + pfrh_parent[i] = pfrh_parent[parent]; + } + } + if (threadIdx.x == 0) + notDone = 0; + __syncthreads(); + + atomicAdd(¬Done, (int)threadNotDone); + __syncthreads(); } while (notDone); @@ -4014,25 +4019,25 @@ namespace PFClusterCudaHCAL { // __syncthreads(); for (int idx = start; idx < nEdges; idx += gridStride) { - //for (int idx = 0; idx < nEdges; idx++) { - int i = pfrh_edgeId[idx]; // Get edge topo id - int j = pfrh_edgeList[idx]; // Get edge neighbor list - int parent_target = pfrh_parent[i]; - int parent_neighbor = pfrh_parent[j]; - if (parent_target!=parent_neighbor){ - notDone2 = 1; - //printf("hmm. they should have the same parent, but they don't. why... %d %d %d\n",i,j,ii); - int min_parent = (int)min(parent_target,parent_neighbor); - int max_parent = (int)max(parent_target,parent_neighbor); - int idx_max = i; - if (parent_neighbor == max_parent) idx_max = j; - pfrh_parent[idx_max] = min_parent; - } + //for (int idx = 0; idx < nEdges; idx++) { + int i = pfrh_edgeId[idx]; // Get edge topo id + int j = pfrh_edgeList[idx]; // Get edge neighbor list + int parent_target = pfrh_parent[i]; + int parent_neighbor = pfrh_parent[j]; + if (parent_target!=parent_neighbor){ + notDone2 = 1; + //printf("hmm. they should have the same parent, but they don't. why... 
%d %d %d\n",i,j,ii); + int min_parent = (int)min(parent_target,parent_neighbor); + int max_parent = (int)max(parent_target,parent_neighbor); + int idx_max = i; + if (parent_neighbor == max_parent) idx_max = j; + pfrh_parent[idx_max] = min_parent; + } } __syncthreads(); if (notDone2==0) // if topocluster finding is converged, terminate the for-ii loop - break; + break; } // for-loop ii @@ -4057,9 +4062,9 @@ namespace PFClusterCudaHCAL { } while (notDone); - //__syncthreads(); + // __syncthreads(); - // Print out debugging info + // // Print out debugging info // if (threadIdx.x == 0) { // int nnode=0; // for (int i = 0; i < nRH; i++) { @@ -4665,23 +4670,23 @@ namespace PFClusterCudaHCAL { cudaEventRecord(start, cudaStream); #endif - prepareTopoInputsSerial<<<1, 1, 4 * (8+4) * sizeof(int), cudaStream>>>( - nRH, - outputGPU.nEdges.get(), - outputGPU.pfrh_passTopoThresh.get(), - inputPFRecHits.pfrh_neighbours.get(), - scratchGPU.pfrh_edgeId.get(), - scratchGPU.pfrh_edgeList.get()); + // prepareTopoInputsSerial<<<1, 1, 4 * (8+4) * sizeof(int), cudaStream>>>( + // nRH, + // outputGPU.nEdges.get(), + // outputGPU.pfrh_passTopoThresh.get(), + // inputPFRecHits.pfrh_neighbours.get(), + // scratchGPU.pfrh_edgeId.get(), + // scratchGPU.pfrh_edgeList.get()); // Topo clustering // Fill edgeId, edgeList arrays with rechit neighbors // Has a bug when using more than 128 threads.. - // prepareTopoInputs<<<1, 128, 128 * (8 + 4) * sizeof(int), cudaStream>>>(nRH, - // outputGPU.nEdges.get(), - // outputGPU.pfrh_passTopoThresh.get(), - // inputPFRecHits.pfrh_neighbours.get(), - // scratchGPU.pfrh_edgeId.get(), - // scratchGPU.pfrh_edgeList.get()); + prepareTopoInputs<<<1, 128, 128 * (8 + 4) * sizeof(int), cudaStream>>>(nRH, + outputGPU.nEdges.get(), + outputGPU.pfrh_passTopoThresh.get(), + inputPFRecHits.pfrh_neighbours.get(), + scratchGPU.pfrh_edgeId.get(), + scratchGPU.pfrh_edgeList.get()); cudaCheck(cudaStreamSynchronize(cudaStream)); // prepareTopoInputs<<<1, 256, 256 * (8+4) * sizeof(int), cudaStream>>>( @@ -4712,6 +4717,7 @@ namespace PFClusterCudaHCAL { #endif // Topo clustering + //topoClusterLinking<<<1, 512, 0, cudaStream>>>(nRH, topoClusterLinkingKH<<<1, 512, 0, cudaStream>>>(nRH, outputGPU.nEdges.get(), //inputPFRecHits.pfrh_energy.get(), // temporary entry for debugging From a12d9442830884580bda93ba21fa8ff50689daab Mon Sep 17 00:00:00 2001 From: Marino Missiroli Date: Fri, 30 Sep 2022 00:05:55 +0200 Subject: [PATCH 06/11] more kernel improvements to avoid race conditions --- .../PFClusterProducer/plugins/PFClusterCudaHCAL.cu | 12 +++++------- 1 file changed, 5 insertions(+), 7 deletions(-) diff --git a/RecoParticleFlow/PFClusterProducer/plugins/PFClusterCudaHCAL.cu b/RecoParticleFlow/PFClusterProducer/plugins/PFClusterCudaHCAL.cu index 99413662041ea..78ccc4f2d5053 100644 --- a/RecoParticleFlow/PFClusterProducer/plugins/PFClusterCudaHCAL.cu +++ b/RecoParticleFlow/PFClusterProducer/plugins/PFClusterCudaHCAL.cu @@ -3892,13 +3892,8 @@ namespace PFClusterCudaHCAL { } while (notDone==1); - *topoIter = iter; -#ifdef DEBUG_GPU_HCAL -// if (threadIdx.x == 0) { -// printf("*** Topo clustering converged in %d iterations ***\n", iter); -// } -// __syncthreads(); -#endif + if (threadIdx.x == 0) + *topoIter = iter; } __global__ void topoClusterLinkingKH(int nRH, @@ -4133,6 +4128,7 @@ namespace PFClusterCudaHCAL { } __device__ __forceinline__ int scan1Inclusive(int idata, volatile int* s_Data, int size) { + assert(size == 32); int pos = 2 * threadIdx.x - (threadIdx.x & (size - 1)); s_Data[pos] = 0; pos += size; 
@@ -4140,7 +4136,9 @@ namespace PFClusterCudaHCAL { for (int offset = 1; offset < size; offset <<= 1) { int t = s_Data[pos] + s_Data[pos - offset]; + __syncwarp(); s_Data[pos] = t; + __syncwarp(); } return s_Data[pos]; From 171d2b6917e9ffe0e62458e71acf7185c0afb4c4 Mon Sep 17 00:00:00 2001 From: Marino Missiroli Date: Fri, 30 Sep 2022 01:01:41 +0200 Subject: [PATCH 07/11] more syncthreads calls in topoClusterLinking kernel --- .../plugins/PFClusterCudaHCAL.cu | 84 ++++++------------- 1 file changed, 25 insertions(+), 59 deletions(-) diff --git a/RecoParticleFlow/PFClusterProducer/plugins/PFClusterCudaHCAL.cu b/RecoParticleFlow/PFClusterProducer/plugins/PFClusterCudaHCAL.cu index 78ccc4f2d5053..bc27bff0a0420 100644 --- a/RecoParticleFlow/PFClusterProducer/plugins/PFClusterCudaHCAL.cu +++ b/RecoParticleFlow/PFClusterProducer/plugins/PFClusterCudaHCAL.cu @@ -3635,6 +3635,7 @@ namespace PFClusterCudaHCAL { } } __syncthreads(); + if (threadIdx.x == 0) { *pcrhFracSize = totalSeedFracOffset; if (*pcrhFracSize>200000) // DeclsForKernels.h maxPFCFracs @@ -3807,8 +3808,11 @@ namespace PFClusterCudaHCAL { else pfrh_edgeMask[idx] = 0; } + __syncthreads();//!! do { + __syncthreads();//!! + if (threadIdx.x == 0) { notDone = 0; } @@ -3840,6 +3844,9 @@ namespace PFClusterCudaHCAL { } } } + + __syncthreads();//!! + if (threadIdx.x == 0) iter++; @@ -3853,7 +3860,6 @@ namespace PFClusterCudaHCAL { if (threadIdx.x == 0) { notDone = 0; } - __syncthreads(); // Even linking @@ -3864,7 +3870,6 @@ namespace PFClusterCudaHCAL { pfrh_parent[i] = (int)max(i, pfrh_edgeList[idx]); } } - __syncthreads(); // edgeParent @@ -3885,6 +3890,8 @@ namespace PFClusterCudaHCAL { } } + __syncthreads();//!! + if (threadIdx.x == 0) iter++; @@ -3892,6 +3899,8 @@ namespace PFClusterCudaHCAL { } while (notDone==1); + __syncthreads();//!! + if (threadIdx.x == 0) *topoIter = iter; } @@ -4628,18 +4637,9 @@ namespace PFClusterCudaHCAL { ::PFClustering::HCAL::ScratchDataGPU& scratchGPU, float (&timer)[8]) { -#ifdef DEBUG_GPU_HCAL - cudaProfilerStart(); - cudaEvent_t start, stop; - cudaEventCreate(&start); - cudaEventCreate(&stop); - cudaEventRecord(start, cudaStream); -#endif - int nRH = inputPFRecHits.size; // Combined seeding & topo clustering thresholds, array initialization - seedingTopoThreshKernel_HCAL<<<(nRH + 31) / 32, 64, 0, cudaStream>>>(nRH, inputPFRecHits.pfrh_energy.get(), inputPFRecHits.pfrh_x.get(), @@ -4659,6 +4659,7 @@ namespace PFClusterCudaHCAL { outputGPU.topoSeedList.get(), outputGPU.pfc_iter.get()); +<<<<<<< HEAD cudaCheck(cudaStreamSynchronize(cudaStream)); #ifdef DEBUG_GPU_HCAL @@ -4676,6 +4677,8 @@ namespace PFClusterCudaHCAL { // scratchGPU.pfrh_edgeId.get(), // scratchGPU.pfrh_edgeList.get()); +======= +>>>>>>> 8189a9f0f08 (more syncthreads calls in topoClusterLinking kernel) // Topo clustering // Fill edgeId, edgeList arrays with rechit neighbors // Has a bug when using more than 128 threads.. 
@@ -4685,36 +4688,9 @@ namespace PFClusterCudaHCAL { inputPFRecHits.pfrh_neighbours.get(), scratchGPU.pfrh_edgeId.get(), scratchGPU.pfrh_edgeList.get()); - cudaCheck(cudaStreamSynchronize(cudaStream)); - - // prepareTopoInputs<<<1, 256, 256 * (8+4) * sizeof(int), cudaStream>>>( - // nRH, - // outputGPU.nEdges.get(), - // outputGPU.pfrh_passTopoThresh.get(), - // inputPFRecHits.pfrh_neighbours.get(), - // scratchGPU.pfrh_edgeId.get(), - // scratchGPU.pfrh_edgeList.get()); - -#ifdef DEBUG_GPU_HCAL - cudaEventRecord(stop, cudaStream); - cudaEventSynchronize(stop); - cudaEventElapsedTime(&timer[4], start, stop); - //printf("\nprepareTopoInputs took %f ms\n", timer[4]); - - compareEdgeArrays<<<1, 1, 0, cudaStream>>>(outputGPU.nEdges.get(), - scratchGPU.pfrh_edgeId.get(), - scratchGPU.pfrh_edgeList.get(), - nEdges, - inputGPU.pfrh_edgeId.get(), - inputGPU.pfrh_edgeList.get(), - nRH, - inputGPU.pfNeighFourInd.get(), - inputPFRecHits.pfrh_neighbours.get()); - - cudaEventRecord(start, cudaStream); -#endif // Topo clustering +<<<<<<< HEAD //topoClusterLinking<<<1, 512, 0, cudaStream>>>(nRH, topoClusterLinkingKH<<<1, 512, 0, cudaStream>>>(nRH, outputGPU.nEdges.get(), @@ -4727,6 +4703,16 @@ namespace PFClusterCudaHCAL { outputGPU.pfrh_passTopoThresh.get(), outputGPU.topoIter.get()); cudaCheck(cudaStreamSynchronize(cudaStream)); +======= + topoClusterLinking<<<1, 512, 0, cudaStream>>>(nRH, + outputGPU.nEdges.get(), + outputGPU.pfrh_topoId.get(), + scratchGPU.pfrh_edgeId.get(), + scratchGPU.pfrh_edgeList.get(), + scratchGPU.pfrh_edgeMask.get(), + outputGPU.pfrh_passTopoThresh.get(), + outputGPU.topoIter.get()); +>>>>>>> 8189a9f0f08 (more syncthreads calls in topoClusterLinking kernel) topoClusterContraction<<<1, 512, 0, cudaStream>>>(nRH, outputGPU.pfrh_topoId.get(), @@ -4742,13 +4728,6 @@ namespace PFClusterCudaHCAL { outputGPU.pcrh_frac.get(), outputGPU.pcrhFracSize.get()); -#ifdef DEBUG_GPU_HCAL - cudaEventRecord(stop, cudaStream); - cudaEventSynchronize(stop); - cudaEventElapsedTime(&timer[1], start, stop); - cudaEventRecord(start, cudaStream); -#endif - dim3 grid((nRH + 31) / 32, (nRH + 31) / 32); dim3 block(32, 32); @@ -4761,13 +4740,6 @@ namespace PFClusterCudaHCAL { scratchGPU.rhcount.get(), outputGPU.pcrh_fracInd.get()); -#ifdef DEBUG_GPU_HCAL - cudaEventRecord(stop, cudaStream); - cudaEventSynchronize(stop); - cudaEventElapsedTime(&timer[2], start, stop); - cudaEventRecord(start, cudaStream); -#endif - hcalFastCluster_selection<<>>(nRH, inputPFRecHits.pfrh_x.get(), inputPFRecHits.pfrh_y.get(), @@ -4791,11 +4763,5 @@ namespace PFClusterCudaHCAL { inputGPU.pfc_prevPos4.get(), inputGPU.pfc_energy.get(), outputGPU.pfc_iter.get()); -#ifdef DEBUG_GPU_HCAL - cudaEventRecord(stop, cudaStream); - cudaEventSynchronize(stop); - cudaEventElapsedTime(&timer[3], start, stop); - cudaProfilerStop(); -#endif } } // namespace PFClusterCudaHCAL From 6ca17e5c05665d50f4cd0daef8559b11179102d5 Mon Sep 17 00:00:00 2001 From: Kenichi Hatakeyama Date: Tue, 4 Oct 2022 12:41:48 -0500 Subject: [PATCH 08/11] resolve conflict --- .../plugins/PFClusterCudaHCAL.cu | 100 +----------------- 1 file changed, 2 insertions(+), 98 deletions(-) diff --git a/RecoParticleFlow/PFClusterProducer/plugins/PFClusterCudaHCAL.cu b/RecoParticleFlow/PFClusterProducer/plugins/PFClusterCudaHCAL.cu index bc27bff0a0420..140ed6cca5bb2 100644 --- a/RecoParticleFlow/PFClusterProducer/plugins/PFClusterCudaHCAL.cu +++ b/RecoParticleFlow/PFClusterProducer/plugins/PFClusterCudaHCAL.cu @@ -3524,7 +3524,6 @@ namespace PFClusterCudaHCAL { __global__ void 
topoClusterContraction(size_t size, int* pfrh_parent, int* pfrh_isSeed, - //const int* pfrh_neighbours, // temporary inputs for debuggi int* rhCount, int* topoSeedCount, int* topoRHCount, @@ -3561,30 +3560,6 @@ namespace PFClusterCudaHCAL { } while (notDone); - // // debugging printing block - // __syncthreads(); - // if (threadIdx.x == 0) { - // int nnode=0; - // for (int i = 0; i < size; i++) { - // //printf("final pfrh_id,parent: %d %d\n",i,pfrh_parent[i]); - // if (i==pfrh_parent[i]) nnode++; - // } - // printf("pfrh_parent 3 multiplicity: %d\n",nnode); - // for (int pos = 0; pos < size; pos++) { - // int parent_target = pfrh_parent[pos]; - // for (int i = 0; i < 8; i++) { - // int neighbor_id = pfrh_neighbours[pos * 8 + i]; - // if (neighbor_id>-1){ // valid neighbors - // int parent_neighbor = pfrh_parent[neighbor_id]; - // if (parent_target!=parent_neighbor){ - // printf("hmm. they should have the same parent, but they don't. why... %d %d\n",pos,neighbor_id); - // } - // } - // } - // } - // } - // __syncthreads(); - // Now determine the number of seeds and rechits in each topo cluster for (int rhIdx = threadIdx.x; rhIdx < size; rhIdx += blockDim.x) { int topoId = pfrh_parent[rhIdx]; @@ -3907,7 +3882,6 @@ namespace PFClusterCudaHCAL { __global__ void topoClusterLinkingKH(int nRH, int* nEdgesIn, - //float* pfrh_energy, // Temporary entry for debugging int* pfrh_parent, int* pfrh_edgeId, int* pfrh_edgeList, @@ -3941,18 +3915,6 @@ namespace PFClusterCudaHCAL { // __syncthreads(); - // // Print out debugging info - // if (threadIdx.x == 0) { - // // for (int idx = 0; idx < nEdges; idx++) { - // // printf("initial edge id, list, mask: %d %d %d\n",pfrh_edgeId[idx],pfrh_edgeList[idx],pfrh_edgeMask[idx]); - // // //printf("initial edge id, list, mask: %d %d\n",pfrh_edgeId[idx],pfrh_edgeList[idx]); - // // } - // printf("number of eges %d\n",nEdges); - // // for (int i = 0; i < nRH; i++) { - // // printf("initial pfrh_id,parent,energy: %d %d %8.3f\n",i,pfrh_parent[i],pfrh_energy[i]); - // // } - // } - // Explicitly initialize pfrh_parent for (int i = start; i < nRH; i += gridStride) { pfrh_parent[i] = i; @@ -4009,19 +3971,7 @@ namespace PFClusterCudaHCAL { __syncthreads(); - // Print out debugging info - // Connect remaining links - // if (threadIdx.x == 0) { - // int nnode=0; - // for (int i = 0; i < nRH; i++) { - // //printf("middle pfrh_id,parent: %d %d\n",i,pfrh_parent[i]); - // if (i==pfrh_parent[i]) nnode++; - // } - // printf("pfrh_parent multiplicity: %d\n",nnode); - // } - - // __syncthreads(); - + // All rechit pairs in edge id-list have the same topo cluster label? 
for (int idx = start; idx < nEdges; idx += gridStride) { //for (int idx = 0; idx < nEdges; idx++) { int i = pfrh_edgeId[idx]; // Get edge topo id @@ -4066,18 +4016,6 @@ namespace PFClusterCudaHCAL { } while (notDone); - // __syncthreads(); - - // // Print out debugging info - // if (threadIdx.x == 0) { - // int nnode=0; - // for (int i = 0; i < nRH; i++) { - // //printf("middle2 pfrh_id,parent: %d %d\n",i,pfrh_parent[i]); - // if (i==pfrh_parent[i]) nnode++; - // } - // printf("pfrh_parent 2 multiplicity: %d\n",nnode); - // } - } __device__ __forceinline__ void sortSwap(int* toSort, int a, int b) { @@ -4659,29 +4597,10 @@ namespace PFClusterCudaHCAL { outputGPU.topoSeedList.get(), outputGPU.pfc_iter.get()); -<<<<<<< HEAD - cudaCheck(cudaStreamSynchronize(cudaStream)); - -#ifdef DEBUG_GPU_HCAL - cudaEventRecord(stop, cudaStream); - cudaEventSynchronize(stop); - cudaEventElapsedTime(&timer[0], start, stop); - cudaEventRecord(start, cudaStream); -#endif - - // prepareTopoInputsSerial<<<1, 1, 4 * (8+4) * sizeof(int), cudaStream>>>( - // nRH, - // outputGPU.nEdges.get(), - // outputGPU.pfrh_passTopoThresh.get(), - // inputPFRecHits.pfrh_neighbours.get(), - // scratchGPU.pfrh_edgeId.get(), - // scratchGPU.pfrh_edgeList.get()); - -======= ->>>>>>> 8189a9f0f08 (more syncthreads calls in topoClusterLinking kernel) // Topo clustering // Fill edgeId, edgeList arrays with rechit neighbors // Has a bug when using more than 128 threads.. + // prepareTopoInputsSerial<<<1, 1, 4 * (8+4) * sizeof(int), cudaStream>>>( prepareTopoInputs<<<1, 128, 128 * (8 + 4) * sizeof(int), cudaStream>>>(nRH, outputGPU.nEdges.get(), outputGPU.pfrh_passTopoThresh.get(), @@ -4690,34 +4609,19 @@ namespace PFClusterCudaHCAL { scratchGPU.pfrh_edgeList.get()); // Topo clustering -<<<<<<< HEAD //topoClusterLinking<<<1, 512, 0, cudaStream>>>(nRH, topoClusterLinkingKH<<<1, 512, 0, cudaStream>>>(nRH, outputGPU.nEdges.get(), - //inputPFRecHits.pfrh_energy.get(), // temporary entry for debugging outputGPU.pfrh_topoId.get(), scratchGPU.pfrh_edgeId.get(), scratchGPU.pfrh_edgeList.get(), scratchGPU.pfrh_edgeMask.get(), - //inputGPU.pfrh_edgeMask.get(), outputGPU.pfrh_passTopoThresh.get(), outputGPU.topoIter.get()); - cudaCheck(cudaStreamSynchronize(cudaStream)); -======= - topoClusterLinking<<<1, 512, 0, cudaStream>>>(nRH, - outputGPU.nEdges.get(), - outputGPU.pfrh_topoId.get(), - scratchGPU.pfrh_edgeId.get(), - scratchGPU.pfrh_edgeList.get(), - scratchGPU.pfrh_edgeMask.get(), - outputGPU.pfrh_passTopoThresh.get(), - outputGPU.topoIter.get()); ->>>>>>> 8189a9f0f08 (more syncthreads calls in topoClusterLinking kernel) topoClusterContraction<<<1, 512, 0, cudaStream>>>(nRH, outputGPU.pfrh_topoId.get(), outputGPU.pfrh_isSeed.get(), - //inputPFRecHits.pfrh_neighbours.get(), // temporary entry for debugging scratchGPU.rhcount.get(), outputGPU.topoSeedCount.get(), outputGPU.topoRHCount.get(), From dd6b122d0a4d3a0f7c58bee77ba21da72d307d9e Mon Sep 17 00:00:00 2001 From: Kenichi Hatakeyama Date: Tue, 4 Oct 2022 21:11:23 -0500 Subject: [PATCH 09/11] Add machinery to toggle produceLegacy. 
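Besides the produceLegacy/produceSoA switches added to the python configuration, the kernel-side change folds the initialization of the shared notDone/notDone2 flags into the single thread-0 setup block of topoClusterLinkingKH. For reference, the block-wide convergence loop those flags drive follows roughly the pattern below; this is a simplified sketch of the parent-contraction (pointer-jumping) pass, not the production kernel:

    // One thread resets the shared flag, every thread does one pass of
    // pointer jumping (parent[i] -> grandparent), and the block repeats
    // until no thread changed anything. The barrier at the top of the loop
    // keeps thread 0 from resetting the flag while others still read it.
    __global__ void contractParentsSketch(int n, int* parent) {
      __shared__ int notDone;
      do {
        __syncthreads();  // everyone has consumed the previous flag value
        if (threadIdx.x == 0)
          notDone = 0;
        __syncthreads();  // flag reset visible before the next pass

        bool changed = false;
        for (int i = threadIdx.x; i < n; i += blockDim.x) {
          int p = parent[i];
          if (p >= 0 && p != parent[p]) {
            parent[i] = parent[p];  // jump to grandparent
            changed = true;
          }
        }
        __syncthreads();  // all updates done before counting

        if (changed)
          atomicAdd(&notDone, 1);
        __syncthreads();  // flag final before everyone tests it
      } while (notDone);
    }

The barriers on both sides of the flag reset are the same protection that the extra __syncthreads() calls of patch 07 retrofit into the older topoClusterLinking kernel.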
--- .../python/customizeHcalOnlyForProfiling.py | 2 ++ .../plugins/PFClusterCudaHCAL.cu | 18 ++++++------------ .../python/particleFlowClusterHBHE_cfi.py | 3 +++ .../python/particleFlowRecHitHBHE_cfi.py | 4 ++++ 4 files changed, 15 insertions(+), 12 deletions(-) diff --git a/RecoLocalCalo/Configuration/python/customizeHcalOnlyForProfiling.py b/RecoLocalCalo/Configuration/python/customizeHcalOnlyForProfiling.py index 93da82c580d99..052dd7f3d59c8 100644 --- a/RecoLocalCalo/Configuration/python/customizeHcalOnlyForProfiling.py +++ b/RecoLocalCalo/Configuration/python/customizeHcalOnlyForProfiling.py @@ -33,6 +33,8 @@ def customizeHcalPFOnlyForProfilingGPUOnly(process): process.consume_step = cms.EndPath(process.consumer) process.schedule = cms.Schedule(process.raw2digi_step, process.reconstruction_step, process.consume_step) + #process.particleFlowClusterHBHEOnly.cuda.produceLegacy = cms.bool(False) + #process.particleFlowRecHitHBHEOnly.cuda.produceLegacy = cms.bool(False) return process diff --git a/RecoParticleFlow/PFClusterProducer/plugins/PFClusterCudaHCAL.cu b/RecoParticleFlow/PFClusterProducer/plugins/PFClusterCudaHCAL.cu index 140ed6cca5bb2..42304ec139c2a 100644 --- a/RecoParticleFlow/PFClusterProducer/plugins/PFClusterCudaHCAL.cu +++ b/RecoParticleFlow/PFClusterProducer/plugins/PFClusterCudaHCAL.cu @@ -75,7 +75,7 @@ namespace PFClusterCudaHCAL { // PFRechitToPFCluster_HCAL_entryPoint // seedingTopoThreshKernel_HCAL: apply seeding/topo-clustering threshold to RecHits, also ensure a peak (outputs: pfrh_isSeed, pfrh_passTopoThresh) [OutputDataGPU] // prepareTopoInputs: prepare "edge" data (outputs: nEdges, pfrh_edgeId, pfrh_edgeList [nEdges dimension]) - // topoClusterLinking: run topo clustering (output: pfrh_topoId) + // topoClusterLinking(KH): run topo clustering (output: pfrh_topoId) // topoClusterContraction: find parent of parent (or parent (of parent ...)) (outputs: pfrh_parent, topoSeedCount, topoSeedOffsets, topoSeedList, seedFracOffsets, pcrhfracind, pcrhfrac) // fillRhfIndex: fill rhfracind (PFCluster RecHitFraction constituent PFRecHit indices) // hcalFastCluster_selection @@ -3899,6 +3899,8 @@ namespace PFClusterCudaHCAL { *topoIter = 0; nEdges = *nEdgesIn; gridStride = blockDim.x * gridDim.x; // For single block kernel this is the number of threads + notDone = 0; + notDone2 = 0; } __syncthreads(); @@ -3922,15 +3924,6 @@ namespace PFClusterCudaHCAL { __syncthreads(); - // for notDone - if (threadIdx.x == 0) { - notDone = 0; - notDone2 = 0; - //printf("gridStride, blockDim.x %d %d\n",gridStride,blockDim.x); - } - - __syncthreads(); - // (1) First attempt // First edge [set parents to those smaller numbers] for (int idx = start; idx < nEdges; idx += gridStride) { @@ -3942,8 +3935,9 @@ namespace PFClusterCudaHCAL { __syncthreads(); - // KenH - for (int ii=0; ii<100; ii++) { // loop until topo clustering iteration converges + // + // loop until topo clustering iteration converges + for (int ii=0; ii<100; ii++) { // for notDone if (threadIdx.x == 0) { diff --git a/RecoParticleFlow/PFClusterProducer/python/particleFlowClusterHBHE_cfi.py b/RecoParticleFlow/PFClusterProducer/python/particleFlowClusterHBHE_cfi.py index 7dea30a6d7dc7..ab713030f821b 100644 --- a/RecoParticleFlow/PFClusterProducer/python/particleFlowClusterHBHE_cfi.py +++ b/RecoParticleFlow/PFClusterProducer/python/particleFlowClusterHBHE_cfi.py @@ -111,11 +111,14 @@ positionReCalc = cms.PSet(), energyCorrector = cms.PSet() ) +_module_config_cuda = _module_config.clone() #### PF CLUSTER HCAL #### 
_particleFlowClusterHBHE_cpu = cms.EDProducer("PFClusterProducer", _module_config.clone()) _particleFlowClusterHBHE_cuda = cms.EDProducer("PFClusterProducerCudaHCAL", _module_config.clone()) _particleFlowClusterHBHE_cuda.PFRecHitsLabelIn = cms.InputTag("particleFlowRecHitHBHE","") +_particleFlowClusterHBHE_cuda.produceLegacy = cms.bool(True) +_particleFlowClusterHBHE_cuda.produceSoA = cms.bool(True) ##### diff --git a/RecoParticleFlow/PFClusterProducer/python/particleFlowRecHitHBHE_cfi.py b/RecoParticleFlow/PFClusterProducer/python/particleFlowRecHitHBHE_cfi.py index 33b77bccff92a..0dcec57965c03 100644 --- a/RecoParticleFlow/PFClusterProducer/python/particleFlowRecHitHBHE_cfi.py +++ b/RecoParticleFlow/PFClusterProducer/python/particleFlowRecHitHBHE_cfi.py @@ -44,6 +44,10 @@ _module_pset_cuda = _module_pset.clone() _module_pset_cuda.producers[0].src= "hbheRecHitProducerGPU" # use GPU version as input instead of legacy version +_module_pset_cuda.produceSoA = cms.bool(True) +_module_pset_cuda.produceLegacy = cms.bool(True) +_module_pset_cuda.produceCleanedLegacy = cms.bool(True) + for idx, x in enumerate(_module_pset_cuda.producers): for idy, y in enumerate(x.qualityTests): if y.name._value == "PFRecHitQTestHCALThresholdVsDepth": # when applying phase1 depth-dependent HCAL thresholds From 18f7e39c99aedad4c5b800fa2136a50299a75e65 Mon Sep 17 00:00:00 2001 From: Kenichi Hatakeyama Date: Wed, 5 Oct 2022 11:48:14 -0500 Subject: [PATCH 10/11] Optimising kernels. --- .../plugins/PFClusterCudaHCAL.cu | 32 ++----------------- 1 file changed, 3 insertions(+), 29 deletions(-) diff --git a/RecoParticleFlow/PFClusterProducer/plugins/PFClusterCudaHCAL.cu b/RecoParticleFlow/PFClusterProducer/plugins/PFClusterCudaHCAL.cu index 42304ec139c2a..9734f46f4e7eb 100644 --- a/RecoParticleFlow/PFClusterProducer/plugins/PFClusterCudaHCAL.cu +++ b/RecoParticleFlow/PFClusterProducer/plugins/PFClusterCudaHCAL.cu @@ -3917,19 +3917,14 @@ namespace PFClusterCudaHCAL { // __syncthreads(); - // Explicitly initialize pfrh_parent - for (int i = start; i < nRH; i += gridStride) { - pfrh_parent[i] = i; - } - - __syncthreads(); - - // (1) First attempt + // First attempt of topo clustering // First edge [set parents to those smaller numbers] for (int idx = start; idx < nEdges; idx += gridStride) { int i = pfrh_edgeId[idx]; // Get edge topo id if (pfrh_edgeMask[idx] > 0 && isLeftEdgeKH(idx, nEdges, pfrh_edgeId, pfrh_edgeMask)) { // isLeftEdgeKH pfrh_parent[i] = (int)min(i, pfrh_edgeList[idx]); + } else { + pfrh_parent[i] = i; } } @@ -3989,27 +3984,6 @@ namespace PFClusterCudaHCAL { } // for-loop ii - __syncthreads(); - - // Follow parents of parents .... to contract parent structure - do { - volatile bool threadNotDone = false; - for (int i = threadIdx.x; i < nRH; i += blockDim.x) { - int parent = pfrh_parent[i]; - if (parent >= 0 && parent != pfrh_parent[parent]) { - threadNotDone = true; - pfrh_parent[i] = pfrh_parent[parent]; - } - } - if (threadIdx.x == 0) - notDone = 0; - __syncthreads(); - - atomicAdd(¬Done, (int)threadNotDone); - __syncthreads(); - - } while (notDone); - } __device__ __forceinline__ void sortSwap(int* toSort, int a, int b) { From eb14f9abe135f2bdd6f64d7c6abd78e7ebfe3153 Mon Sep 17 00:00:00 2001 From: Kenichi Hatakeyama Date: Wed, 5 Oct 2022 12:34:58 -0500 Subject: [PATCH 11/11] turn off legacy data storing when running profiling customisation. 
--- .../Configuration/python/customizeHcalOnlyForProfiling.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/RecoLocalCalo/Configuration/python/customizeHcalOnlyForProfiling.py b/RecoLocalCalo/Configuration/python/customizeHcalOnlyForProfiling.py index 052dd7f3d59c8..1ba182960b82c 100644 --- a/RecoLocalCalo/Configuration/python/customizeHcalOnlyForProfiling.py +++ b/RecoLocalCalo/Configuration/python/customizeHcalOnlyForProfiling.py @@ -23,7 +23,7 @@ def customizeHcalOnlyForProfilingGPUOnly(process): # Currently, this means: # - running the unpacker on CPU, converting the digis into SoA format and copying them to GPU # - running the HBHE local reconstruction, including MAHI, on GPU. -# - running the HBHE PFRecHit and PFCluster producers on GPU [including copy of them to host, as we cannot untangle the copy back at a moment] +# - running the HBHE PFRecHit and PFCluster producers on GPU [without copy of them to host] def customizeHcalPFOnlyForProfilingGPUOnly(process): process.consumer = cms.EDAnalyzer("GenericConsumer", @@ -33,8 +33,8 @@ def customizeHcalPFOnlyForProfilingGPUOnly(process): process.consume_step = cms.EndPath(process.consumer) process.schedule = cms.Schedule(process.raw2digi_step, process.reconstruction_step, process.consume_step) - #process.particleFlowClusterHBHEOnly.cuda.produceLegacy = cms.bool(False) - #process.particleFlowRecHitHBHEOnly.cuda.produceLegacy = cms.bool(False) + process.particleFlowClusterHBHEOnly.cuda.produceLegacy = cms.bool(False) + process.particleFlowRecHitHBHEOnly.cuda.produceLegacy = cms.bool(False) return process
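
A closing note on the algorithm these patches converge on: topoClusterLinkingKH plus topoClusterContraction compute connected components over the rechit neighbour graph, with edges given as (edgeId, edgeList) pairs and every rechit ending up labelled by the smallest rechit index in its component (the min() convention used in the kernel). A serial host-side reference for the same computation, useful for validating the GPU output, could look like the sketch below; the helper name is hypothetical and not part of the patches above:

    #include <algorithm>
    #include <cstddef>
    #include <vector>

    // Serial reference for what topoClusterLinkingKH + topoClusterContraction
    // compute on the GPU: connected components over the (edgeId, edgeList)
    // pairs, each rechit labelled by the smallest index in its component.
    std::vector<int> topoClusterReference(int nRH,
                                          const std::vector<int>& edgeId,
                                          const std::vector<int>& edgeList) {
      std::vector<int> parent(nRH);
      for (int i = 0; i < nRH; ++i)
        parent[i] = i;  // every rechit starts as its own topo cluster

      // find with path halving
      auto find = [&](int x) {
        while (parent[x] != x) {
          parent[x] = parent[parent[x]];
          x = parent[x];
        }
        return x;
      };

      // union by smaller index, matching the kernel's
      // min(parent_target, parent_neighbor) convention
      for (std::size_t e = 0; e < edgeId.size(); ++e) {
        int a = find(edgeId[e]);
        int b = find(edgeList[e]);
        if (a != b)
          parent[std::max(a, b)] = std::min(a, b);
      }

      // final contraction: point every rechit directly at its root
      for (int i = 0; i < nRH; ++i)
        parent[i] = find(i);
      return parent;
    }

Comparing this helper's output against pfrh_topoId after the contraction kernel is a cheap cross-check that the iterative linking loop really converged.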