From 339bb0fcbdc3955056712ca0c1ecfb026e59cb25 Mon Sep 17 00:00:00 2001
From: Vincenzo Innocente <vincenzo.innocente@cern.ch>
Date: Wed, 30 May 2018 15:08:12 +0200
Subject: [PATCH 1/3] faster cluster, now do not requires to know number of
 modules

---
 .../plugins/gpuClustering.h                   | 21 +++++++++++++------
 .../SiPixelClusterizer/test/gpuClustering.cu  | 11 +++++-----
 2 files changed, 20 insertions(+), 12 deletions(-)
diff --git a/RecoLocalTracker/SiPixelClusterizer/plugins/gpuClustering.h b/RecoLocalTracker/SiPixelClusterizer/plugins/gpuClustering.h
index 672a4dbe97450..77cb2cf608a76 100644
--- a/RecoLocalTracker/SiPixelClusterizer/plugins/gpuClustering.h
+++ b/RecoLocalTracker/SiPixelClusterizer/plugins/gpuClustering.h
@@ -1,9 +1,9 @@
 #ifndef RecoLocalTracker_SiPixelClusterizer_plugins_gpuClustering_h
 #define RecoLocalTracker_SiPixelClusterizer_plugins_gpuClustering_h
 
-#include <cassert>
 #include <cstdint>
 #include <cstdio>
+#include<cassert>
 
 namespace gpuClustering {
 
@@ -46,6 +46,8 @@ namespace gpuClustering {
 
     __shared__  int msize;    
 
+    if (blockIdx.x >=moduleStart[0]) return;
+
     auto first = moduleStart[1 + blockIdx.x];  
     
     auto me = id[first];
@@ -79,9 +81,11 @@ namespace gpuClustering {
     if (first>=msize) return;
 
     int jmax[10];
+    // int jmin[10];
     auto niter = (msize-first)/blockDim.x;
     assert(niter<10);
-    for (int i=0; i<niter+1; ++i) jmax[i]=msize;    
+    // for (int k=0; k<niter+1; ++k) jmin[k]=first+k*blockDim.x+1;    
+    for (int k=0; k<niter+1; ++k) jmax[k]=msize;
 
     while (go) {
       __syncthreads();
@@ -94,11 +98,16 @@ namespace gpuClustering {
 	if (id[i]==InvId) continue;  // not valid
 	assert(id[i]==me); //  break;  // end of module
 	++debug[i];
+        auto js = i+1;
+//        auto js = jmin[k]; 
         auto jm = jmax[k];
-	for (int j=i+1; j<jm; ++j) {
+        jmax[k]=i+1;
+//        bool first = true;
+	for (int j=js; j<jm; ++j) {
  	  if (id[j]==InvId) continue;  // not valid
-          if (std::abs(int(x[j])-int(x[i]))>1) continue;
-	  if (std::abs(int(y[j])-int(y[i]))>1) continue;
+          if (std::abs(int(x[j])-int(x[i]))>1 |
+	      std::abs(int(y[j])-int(y[i]))>1) continue;
+//          if (first) {jmin[k] = j; first=false;}
 	  auto old = atomicMin(&clus[j],clus[i]);
 	  if (old!=clus[i]) go=true;
 	  atomicMin(&clus[i],old);
@@ -167,4 +176,4 @@ namespace gpuClustering {
   
 } //namespace gpuClustering
 
-#endif // RecoLocalTracker_SiPixelClusterizer_plugins_gpuClustering_h
+#endif
diff --git a/RecoLocalTracker/SiPixelClusterizer/test/gpuClustering.cu b/RecoLocalTracker/SiPixelClusterizer/test/gpuClustering.cu
index ae0359cd683e0..beafe83ba60bd 100644
--- a/RecoLocalTracker/SiPixelClusterizer/test/gpuClustering.cu
+++ b/RecoLocalTracker/SiPixelClusterizer/test/gpuClustering.cu
@@ -198,13 +198,8 @@ int main(void)
 	       d_id.get(), d_moduleStart.get() ,d_clus.get(),n
 	       );
   
-  cuda::memory::copy(&nModules,d_moduleStart.get(),sizeof(uint32_t));
-		     
-  std::cout << "found " << nModules << " Modules active" << std::endl;
-
-  
   threadsPerBlock = 256;
-  blocksPerGrid = nModules;
+  blocksPerGrid = MaxNumModules;    //nModules;
 
 
 
@@ -226,6 +221,10 @@ int main(void)
 	       );
 
 
+  cuda::memory::copy(&nModules,d_moduleStart.get(),sizeof(uint32_t));
+  std::cout << "found " << nModules << " Modules active" << std::endl;
+
+
   uint32_t nclus[MaxNumModules], moduleId[nModules];  
   cuda::memory::copy(h_clus.get(), d_clus.get(), size32);
   cuda::memory::copy(&nclus,d_clusInModule.get(),MaxNumModules*sizeof(uint32_t));

From 8f0a694a96c92a07a3190679ccfefb59108d2588 Mon Sep 17 00:00:00 2001
From: Andrea Bocci <andrea.bocci@cern.ch>
Date: Mon, 4 Jun 2018 12:11:35 +0200
Subject: [PATCH 2/3] Clean up

---
 .../plugins/gpuClustering.h                   | 244 +++++++++---------
 1 file changed, 118 insertions(+), 126 deletions(-)

diff --git a/RecoLocalTracker/SiPixelClusterizer/plugins/gpuClustering.h b/RecoLocalTracker/SiPixelClusterizer/plugins/gpuClustering.h
index 77cb2cf608a76..11bcf5c41543c 100644
--- a/RecoLocalTracker/SiPixelClusterizer/plugins/gpuClustering.h
+++ b/RecoLocalTracker/SiPixelClusterizer/plugins/gpuClustering.h
@@ -1,179 +1,171 @@
 #ifndef RecoLocalTracker_SiPixelClusterizer_plugins_gpuClustering_h
 #define RecoLocalTracker_SiPixelClusterizer_plugins_gpuClustering_h
 
+#include <cassert>
 #include <cstdint>
 #include <cstdio>
-#include<cassert>
 
 namespace gpuClustering {
 
-  constexpr uint32_t MaxNumModules = 2000;
+  constexpr uint32_t MaxNumModules  = 2000;
+  constexpr uint32_t MaxNumPixels   = 256 * 2000;   // this does not mean maxPixelPerModule == 256!
+  constexpr uint16_t InvId          = 9999;         // must be > MaxNumModules
 
-  constexpr uint32_t MaxNumPixels = 256*2000;  // this does not mean maxPixelPerModule==256!
-  
-  constexpr uint16_t InvId=9999; // must be > MaxNumModules
-  
   __global__ void countModules(uint16_t const * id,
-			       uint32_t * moduleStart,
-			       int32_t * clus,
-			       int numElements){
-    
+                               uint32_t * moduleStart,
+                               int32_t * clus,
+                               int numElements){
+
     int i = blockDim.x * blockIdx.x + threadIdx.x;
-    if (i >= numElements) return;
-    clus[i]=i;
-    if (InvId==id[i]) return;
-    auto j=i-1;
-    while(j>=0 && id[j]==InvId) --j;
-    if(j<0 || id[j]!=id[i]) {
+    if (i >= numElements)
+      return;
+    clus[i] = i;
+    if (InvId == id[i])
+      return;
+    auto j = i - 1;
+    while (j >= 0 and id[j] == InvId)
+      --j;
+    if (j < 0 or id[j] != id[i]) {
       // boundary...
-      auto loc = atomicInc(moduleStart,MaxNumModules);
-      moduleStart[loc+1]=i;
+      auto loc = atomicInc(moduleStart, MaxNumModules);
+      moduleStart[loc + 1] = i;
     }
   }
-  
-  
+
   __global__ void findClus(uint16_t const * id,
-			   uint16_t const * x,
-			   uint16_t const * y,
-			   uint16_t const * adc,
-			   uint32_t const * moduleStart,
-			   uint32_t * clusInModule, uint32_t * moduleId,
-			   int32_t * clus, uint32_t * debug, 
-			   int numElements){
-    
+                           uint16_t const * x,
+                           uint16_t const * y,
+                           uint16_t const * adc,
+                           uint32_t const * moduleStart,
+                           uint32_t * clusInModule, uint32_t * moduleId,
+                           int32_t * clus, uint32_t * debug,
+                           int numElements)
+  {
     __shared__ bool go;
     __shared__ int nclus;
+    __shared__ int msize;
 
-    __shared__  int msize;    
-
-    if (blockIdx.x >=moduleStart[0]) return;
+    if (blockIdx.x >= moduleStart[0])
+      return;
 
-    auto first = moduleStart[1 + blockIdx.x];  
-    
+    auto first = moduleStart[1 + blockIdx.x];
     auto me = id[first];
 
-    assert(me<MaxNumModules);    
-
-    // auto debugme = (me==31 && threadIdx.x==127);
+    assert(me < MaxNumModules);
 
 #ifdef GPU_DEBUG
-    if (me%100==1)
-      if (threadIdx.x==0) printf("start clusterizer for module %d in block %d\n",me,blockIdx.x);
+    if (me%100 == 1)
+      if (threadIdx.x == 0)
+        printf("start clusterizer for module %d in block %d\n", me, blockIdx.x);
 #endif
 
-    first+=threadIdx.x;
-   
-    if (first>=numElements) return;
-    
-    go=true;
-    nclus=0;
+    first += threadIdx.x;
+    if (first>= numElements)
+      return;
+
+    go = true;
+    nclus = 0;
 
-    msize=numElements;
+    msize = numElements;
     __syncthreads();
 
-    for (int i=first; i<numElements; i+=blockDim.x) {
-      if (id[i]==InvId) continue;  // not valid
-      if (id[i]!=me) { atomicMin(&msize,i); break;}  // end of module
-     }
+    for (int i = first; i < numElements; i += blockDim.x) {
+      if (id[i] == InvId)               // not valid
+        continue;
+      if (id[i] != me) {                // end of module
+        atomicMin(&msize, i);
+        break;
+      }
+    }
     __syncthreads();
 
-    assert(msize<=numElements);    
-    if (first>=msize) return;
+    assert(msize<= numElements);
+    if (first>= msize)
+      return;
 
     int jmax[10];
-    // int jmin[10];
-    auto niter = (msize-first)/blockDim.x;
-    assert(niter<10);
-    // for (int k=0; k<niter+1; ++k) jmin[k]=first+k*blockDim.x+1;    
-    for (int k=0; k<niter+1; ++k) jmax[k]=msize;
+    auto niter = (msize - first) / blockDim.x;
+    assert(niter < 10);
+    for (int k = 0; k < niter + 1; ++k)
+      jmax[k] = msize;
 
     while (go) {
       __syncthreads();
-      go=false;
-      __syncthreads();
+      go = false;
 
-      int k=-1;
-      for (int i=first; i<msize; i+=blockDim.x) {
+      __syncthreads();
+      int k = -1;
+      for (int i = first; i < msize; i += blockDim.x) {
         ++k;
-	if (id[i]==InvId) continue;  // not valid
-	assert(id[i]==me); //  break;  // end of module
-	++debug[i];
-        auto js = i+1;
-//        auto js = jmin[k]; 
+        if (id[i] == InvId)             // not valid
+          continue;
+        assert(id[i] == me); // break;  // end of module
+        ++debug[i];
+        auto js = i + 1;
         auto jm = jmax[k];
-        jmax[k]=i+1;
-//        bool first = true;
-	for (int j=js; j<jm; ++j) {
- 	  if (id[j]==InvId) continue;  // not valid
-          if (std::abs(int(x[j])-int(x[i]))>1 |
-	      std::abs(int(y[j])-int(y[i]))>1) continue;
-//          if (first) {jmin[k] = j; first=false;}
-	  auto old = atomicMin(&clus[j],clus[i]);
-	  if (old!=clus[i]) go=true;
-	  atomicMin(&clus[i],old);
-          jmax[k]=j+1;
-	}
+        jmax[k] = i + 1;
+        for (int j = js; j < jm; ++j) {
+          if (id[j] == InvId)           // not valid
+            continue;
+          if (std::abs(int(x[j]) - int(x[i])) > 1 |
+              std::abs(int(y[j]) - int(y[i])) > 1)
+            continue;
+          auto old = atomicMin(&clus[j], clus[i]);
+          if (old != clus[i]) go = true;
+          atomicMin(&clus[i], old);
+          jmax[k] = j + 1;
+        }
       }
-      assert (k<=niter);
-    __syncthreads();
-    }
-    
-    /*
-    // fast count (nice but not much useful)
-    auto laneId = threadIdx.x & 0x1f;
-    
-    for (int i=first; i<numElements; i+=blockDim.x) {
-    if (id[i]==InvId) continue;  // not valid
-    if (id[i]!=me) break;  // end of module
-    auto value = clus[i]^i;
-    auto mask = __ballot_sync(0xffffffff,value==0);
-    if (laneId==0) atomicAdd(&nclus,__popc(mask));
+      assert (k<= niter);
+      __syncthreads();
     }
-    
-    __syncthreads();
-    if (me<30)
-    if (laneId==0) printf("%d clusters in module %d\n",nclus,me);
-    */
 
-    nclus=0;
+    nclus = 0;
     __syncthreads();
-    for (int i=first; i<numElements; i+=blockDim.x) {
-      if (id[i]==InvId) continue;  // not valid
-      if (id[i]!=me) break;  // end of module
-      if (clus[i]==i) {
-	auto old = atomicAdd(&nclus,1);
-	clus[i]=-(old+1);
+    for (int i = first; i < numElements; i += blockDim.x) {
+      if (id[i] == InvId)               // not valid
+        continue;
+      if (id[i] != me)                  // end of module
+        break;
+      if (clus[i] == i) {
+        auto old = atomicAdd(&nclus, 1);
+        clus[i] = -(old + 1);
       }
     }
-    
+
     __syncthreads();
-    
-    for (int i=first; i<numElements; i+=blockDim.x) {
-      if (id[i]==InvId) continue;  // not valid
-      if (id[i]!=me) break;  // end of module
-      if (clus[i]>=0) clus[i]=clus[clus[i]];
+    for (int i = first; i < numElements; i += blockDim.x) {
+      if (id[i] == InvId)               // not valid
+        continue;
+      if (id[i] != me)                  // end of module
+        break;
+      if (clus[i]>= 0) clus[i] = clus[clus[i]];
     }
-    
+
     __syncthreads();
-    for (int i=first; i<numElements; i+=blockDim.x) {
-      if (id[i]==InvId) {clus[i]=-9999; continue; } // not valid
-      if (id[i]!=me) break;  // end of module
-      clus[i] = -clus[i] -1;
+    for (int i = first; i < numElements; i += blockDim.x) {
+      if (id[i] == InvId) {             // not valid
+        clus[i] = -9999;
+        continue;
+      }
+      if (id[i] != me)                  // end of module
+        break;
+      clus[i] = - clus[i] - 1;
     }
-    
-  
+
     __syncthreads();
-    if (threadIdx.x==0) {
-      clusInModule[me]=nclus;
-      moduleId[blockIdx.x]=me;
+    if (threadIdx.x == 0) {
+      clusInModule[me] = nclus;
+      moduleId[blockIdx.x] = me;
     }
-    
-#ifdef GPU_DEBUG    
-    if (me%100==1)
-      if (threadIdx.x==0) printf("%d clusters in module %d\n",nclus,me);
-#endif    
+
+#ifdef GPU_DEBUG
+    if (me % 100 == 1)
+      if (threadIdx.x == 0)
+        printf("%d clusters in module %d\n", nclus, me);
+#endif
   }
-  
+
 } //namespace gpuClustering
 
-#endif
+#endif // RecoLocalTracker_SiPixelClusterizer_plugins_gpuClustering_h

From c0a703b95a4f30b4ea763012a107a1c960f7b895 Mon Sep 17 00:00:00 2001
From: Andrea Bocci <andrea.bocci@cern.ch>
Date: Mon, 4 Jun 2018 14:34:52 +0200
Subject: [PATCH 3/3] Clean up

---
 .../SiPixelClusterizer/test/gpuClustering.cu  | 169 +++++++-----------
 1 file changed, 69 insertions(+), 100 deletions(-)

diff --git a/RecoLocalTracker/SiPixelClusterizer/test/gpuClustering.cu b/RecoLocalTracker/SiPixelClusterizer/test/gpuClustering.cu
index beafe83ba60bd..8c6b6cdf7a821 100644
--- a/RecoLocalTracker/SiPixelClusterizer/test/gpuClustering.cu
+++ b/RecoLocalTracker/SiPixelClusterizer/test/gpuClustering.cu
@@ -1,37 +1,27 @@
-#include "RecoLocalTracker/SiPixelClusterizer/plugins/gpuClustering.h"
-
-
-#include "cuda/api_wrappers.h"
-
-#include <iostream>
-#include <iomanip>
-
+#include <algorithm>
+#include <cassert>
+#include <cmath>
 #include <cstdint>
+#include <iomanip>
+#include <iostream>
 #include <memory>
-#include <algorithm>
-#include<numeric>
-#include<cmath>
-#include<cassert>
-
-#include<set>
-#include<vector>
+#include <numeric>
+#include <set>
+#include <vector>
 
+#include <cuda/api_wrappers.h>
 
+#include "RecoLocalTracker/SiPixelClusterizer/plugins/gpuClustering.h"
 
-  
 int main(void)
 {
   if (cuda::device::count() == 0) {
     std::cerr << "No CUDA devices on this system" << "\n";
     exit(EXIT_FAILURE);
   }
-
-
   using namespace gpuClustering;
-  
-  int numElements = MaxNumPixels;
- 
 
+  int numElements = MaxNumPixels;
   // these in reality are already on GPU
   auto h_id = std::make_unique<uint16_t[]>(numElements);
   auto h_x = std::make_unique<uint16_t[]>(numElements);
@@ -39,27 +29,23 @@ int main(void)
   auto h_adc = std::make_unique<uint16_t[]>(numElements);
 
   auto h_clus = std::make_unique<int[]>(numElements);
-  
+
   auto h_debug = std::make_unique<unsigned int[]>(numElements);
- 
-  
   auto current_device = cuda::device::current::get();
   auto d_id = cuda::memory::device::make_unique<uint16_t[]>(current_device, numElements);
   auto d_x = cuda::memory::device::make_unique<uint16_t[]>(current_device, numElements);
   auto d_y = cuda::memory::device::make_unique<uint16_t[]>(current_device, numElements);
   auto d_adc = cuda::memory::device::make_unique<uint16_t[]>(current_device, numElements);
-  
+
   auto d_clus = cuda::memory::device::make_unique<int[]>(current_device, numElements);
 
   auto d_moduleStart = cuda::memory::device::make_unique<uint32_t[]>(current_device, MaxNumModules+1);
 
   auto d_clusInModule = cuda::memory::device::make_unique<uint32_t[]>(current_device, MaxNumModules);
   auto d_moduleId = cuda::memory::device::make_unique<uint32_t[]>(current_device, MaxNumModules);
-  
-  auto d_debug = cuda::memory::device::make_unique<unsigned int[]>(current_device, numElements);
 
-  
-    // later random number
+  auto d_debug = cuda::memory::device::make_unique<unsigned int[]>(current_device, numElements);
+  // later random number
   int n=0;
   int ncl=0;
   int y[10]={5,7,9,1,3,0,4,8,2,6};
@@ -112,10 +98,10 @@ int main(void)
       h_adc[n]=100;
       ++n;
       if (xx[k]%2==0) {
-	h_id[n]=id;
-	h_x[n]=xx[k];
-	h_y[n]=101;
-	h_adc[n]=100;
+        h_id[n]=id;
+        h_x[n]=xx[k];
+        h_y[n]=101;
+        h_adc[n]=100;
       ++n;
       }
     }
@@ -129,43 +115,41 @@ int main(void)
     h_x[n]=x;
     h_y[n]=x;
     h_adc[n]=100;
-    ++n;    
+    ++n;
   }
-
-  
   // all odd id
   for(int id=11; id<=1800; id+=2) {
     if ( (id/20)%2) h_id[n++]=InvId;  // error
-    for (int x=0; x<40; x+=4) {	 
+    for (int x=0; x<40; x+=4) {
       ++ncl;
       if ((id/10)%2) {
-	for (int k=0; k<10; ++k) {
-	  h_id[n]=id;
-	  h_x[n]=x;
-	  h_y[n]=x+y[k];
-	  h_adc[n]=100;
-	  ++n;
-	  h_id[n]=id;
-	  h_x[n]=x+1;
-	  h_y[n]=x+y[k]+2;
-	  h_adc[n]=100;
-	  ++n;
-	}
+        for (int k=0; k<10; ++k) {
+          h_id[n]=id;
+          h_x[n]=x;
+          h_y[n]=x+y[k];
+          h_adc[n]=100;
+          ++n;
+          h_id[n]=id;
+          h_x[n]=x+1;
+          h_y[n]=x+y[k]+2;
+          h_adc[n]=100;
+          ++n;
+        }
       } else {
-	for (int k=0; k<10; ++k) {
-	  h_id[n]=id;
-	  h_x[n]=x;
-	  h_y[n]=x+y[9-k];
-	  h_adc[n]=100;
-	  ++n;
-	  if (y[k]==3) continue; // hole
-	  if (id==51)  {h_id[n++]=InvId; h_id[n++]=InvId; }// error
-	  h_id[n]=id;
-	  h_x[n]=x+1;
-	  h_y[n]=x+y[k]+2;
-	  h_adc[n]=100;
-	  ++n;
-	}
+        for (int k=0; k<10; ++k) {
+          h_id[n]=id;
+          h_x[n]=x;
+          h_y[n]=x+y[9-k];
+          h_adc[n]=100;
+          ++n;
+          if (y[k]==3) continue; // hole
+          if (id==51)  {h_id[n++]=InvId; h_id[n++]=InvId; }// error
+          h_id[n]=id;
+          h_x[n]=x+1;
+          h_y[n]=x+y[k]+2;
+          h_adc[n]=100;
+          ++n;
+        }
       }
     }
   }
@@ -174,35 +158,31 @@ int main(void)
   size_t size32 = n * sizeof(unsigned int);
   size_t size16 = n * sizeof(unsigned short);
   size_t size8 = n * sizeof(uint8_t);
-	 
+
   uint32_t nModules=0;
   cuda::memory::copy(d_moduleStart.get(),&nModules,sizeof(uint32_t));
-  
+
   cuda::memory::copy(d_id.get(), h_id.get(), size16);
   cuda::memory::copy(d_x.get(), h_x.get(), size16);
   cuda::memory::copy(d_y.get(), h_y.get(), size16);
   cuda::memory::copy(d_adc.get(), h_adc.get(), size8);
-  cuda::memory::device::zero(d_debug.get(),size32);  
+  cuda::memory::device::zero(d_debug.get(),size32);
   // Launch CUDA Kernels
-
-  
   int threadsPerBlock = 256;
   int blocksPerGrid = (numElements + threadsPerBlock - 1) / threadsPerBlock;
   std::cout
     << "CUDA countModules kernel launch with " << blocksPerGrid
     << " blocks of " << threadsPerBlock << " threads\n";
-		     
+
   cuda::launch(
-	       countModules, 
-	       { blocksPerGrid, threadsPerBlock },
-	       d_id.get(), d_moduleStart.get() ,d_clus.get(),n
-	       );
-  
+               countModules,
+               { blocksPerGrid, threadsPerBlock },
+               d_id.get(), d_moduleStart.get() ,d_clus.get(),n
+               );
+
   threadsPerBlock = 256;
   blocksPerGrid = MaxNumModules;    //nModules;
 
-
-
   std::cout
     << "CUDA findModules kernel launch with " << blocksPerGrid
     << " blocks of " << threadsPerBlock << " threads\n";
@@ -210,35 +190,28 @@ int main(void)
   cuda::memory::device::zero(d_clusInModule.get(),MaxNumModules*sizeof(uint32_t));
 
   cuda::launch(
-	       findClus,
-	       { blocksPerGrid, threadsPerBlock },
-	       d_id.get(), d_x.get(), d_y.get(),  d_adc.get(),
-	       d_moduleStart.get(),
-	       d_clusInModule.get(), d_moduleId.get(),
-	       d_clus.get(),
-	       d_debug.get(),
-	       n
-	       );
-
+               findClus,
+               { blocksPerGrid, threadsPerBlock },
+               d_id.get(), d_x.get(), d_y.get(),  d_adc.get(),
+               d_moduleStart.get(),
+               d_clusInModule.get(), d_moduleId.get(),
+               d_clus.get(),
+               d_debug.get(),
+               n
+               );
 
   cuda::memory::copy(&nModules,d_moduleStart.get(),sizeof(uint32_t));
   std::cout << "found " << nModules << " Modules active" << std::endl;
 
-
-  uint32_t nclus[MaxNumModules], moduleId[nModules];  
+  uint32_t nclus[MaxNumModules], moduleId[nModules];
   cuda::memory::copy(h_clus.get(), d_clus.get(), size32);
   cuda::memory::copy(&nclus,d_clusInModule.get(),MaxNumModules*sizeof(uint32_t));
   cuda::memory::copy(&moduleId,d_moduleId.get(),nModules*sizeof(uint32_t));
-
-  
   cuda::memory::copy(h_debug.get(), d_debug.get(), size32);
 
   auto p = std::minmax_element(h_debug.get(),h_debug.get()+n);
-  std::cout << "debug " << *p.first << ' ' << *p.second << std::endl;  
+  std::cout << "debug " << *p.first << ' ' << *p.second << std::endl;
 
-
-  
-  
   std::set<unsigned int> clids;
   std::vector<unsigned int> seeds;
   for (int i=0; i<n; ++i) {
@@ -246,16 +219,12 @@ int main(void)
     assert(h_clus[i]>=0);
     assert(h_clus[i]<nclus[h_id[i]]);
     clids.insert(h_id[i]*100+h_clus[i]);
-		 // clids.insert(h_clus[i]);
+                 // clids.insert(h_clus[i]);
     // if (h_clus[i]==i) seeds.push_back(i); // only if no renumbering
   }
-   
+
   std::cout << "found " << std::accumulate(nclus,nclus+MaxNumModules,0) << ' ' <<  clids.size() << " clusters" << std::endl;
   // << " and " << seeds.size() << " seeds" << std::endl;
 
-
-
   return 0;
-
 }
-