From eb29a8b226e5428c126a3177237fd24cd7442d39 Mon Sep 17 00:00:00 2001 From: Vincenzo Innocente Date: Fri, 14 Sep 2018 16:08:56 +0200 Subject: [PATCH 01/36] use ext ws or shared --- .../CUDAUtilities/interface/radixSort.h | 53 +++++++++++----- .../CUDAUtilities/test/radixSort_t.cu | 63 ++++++++++++++----- 2 files changed, 86 insertions(+), 30 deletions(-) diff --git a/HeterogeneousCore/CUDAUtilities/interface/radixSort.h b/HeterogeneousCore/CUDAUtilities/interface/radixSort.h index 60d0a48164678..1311865ce2ae6 100644 --- a/HeterogeneousCore/CUDAUtilities/interface/radixSort.h +++ b/HeterogeneousCore/CUDAUtilities/interface/radixSort.h @@ -75,22 +75,21 @@ template< int NS, // number of significant bytes to use in sorting typename RF > -__device__ -void radixSortImpl(T const * a, uint16_t * ind, uint32_t size, RF reorder) { +__device__ +void +__forceinline__ +radixSortImpl(T const * __restrict__ a, uint16_t * ind, uint16_t * ind2, uint32_t size, RF reorder) { constexpr int d = 8, w = 8*sizeof(T); constexpr int sb = 1<0); - assert(size<=MaxSize); assert(blockDim.x==sb); // bool debug = false; // threadIdx.x==0 && blockIdx.x==5; @@ -203,8 +202,10 @@ template< typename std::enable_if::value,T>::type* = nullptr > __device__ -void radixSort(T const * a, uint16_t * ind, uint32_t size) { - radixSortImpl(a,ind,size,dummyReorder); +void +__forceinline__ +radixSort(T const * a, uint16_t * ind, uint16_t * ind2, uint32_t size) { + radixSortImpl(a,ind,ind2,size,dummyReorder); } template< @@ -213,8 +214,10 @@ template< typename std::enable_if::value&&std::is_signed::value,T>::type* = nullptr > __device__ -void radixSort(T const * a, uint16_t * ind, uint32_t size) { - radixSortImpl(a,ind,size,reorderSigned); +void +__forceinline__ +radixSort(T const * a, uint16_t * ind, uint16_t * ind2, uint32_t size) { + radixSortImpl(a,ind,ind2,size,reorderSigned); } template< @@ -223,29 +226,47 @@ template< typename std::enable_if::value,T>::type* = nullptr > __device__ -void radixSort(T const * a, uint16_t * ind, uint32_t size) { +void +__forceinline__ + radixSort(T const * a, uint16_t * ind, uint16_t * ind2, uint32_t size) { using I = int; - radixSortImpl((I const *)(a),ind,size,reorderFloat); + radixSortImpl((I const *)(a),ind,ind2, size,reorderFloat); } template __device__ -void radixSortMulti(T * v, uint16_t * index, uint32_t * offsets) { +void +__forceinline__ +radixSortMulti(T const * v, uint16_t * index, uint32_t const * offsets, uint16_t * workspace) { + + extern __shared__ uint16_t ws[]; auto a = v+offsets[blockIdx.x]; - auto ind = index+offsets[blockIdx.x];; + auto ind = index+offsets[blockIdx.x]; + auto ind2 = nullptr==workspace ? 
ws : workspace+offsets[blockIdx.x]; auto size = offsets[blockIdx.x+1]-offsets[blockIdx.x]; assert(offsets[blockIdx.x+1]>=offsets[blockIdx.x]); - if (size>0) radixSort(a,ind,size); + if (size>0) radixSort(a,ind,ind2,size); } template __global__ -void radixSortMultiWrapper(T * v, uint16_t * index, uint32_t * offsets) { - radixSortMulti(v,index,offsets); +void +__launch_bounds__(256, 4) +radixSortMultiWrapper(T const * v, uint16_t * index, uint32_t const * offsets, uint16_t * workspace) { + radixSortMulti(v,index,offsets, workspace); } +template +__global__ +void +// __launch_bounds__(256, 4) +radixSortMultiWrapper2(T const * v, uint16_t * index, uint32_t const * offsets, uint16_t * workspace) { + radixSortMulti(v,index,offsets, workspace); +} + + #endif // HeterogeneousCoreCUDAUtilities_radixSort_H diff --git a/HeterogeneousCore/CUDAUtilities/test/radixSort_t.cu b/HeterogeneousCore/CUDAUtilities/test/radixSort_t.cu index bdc1af0123637..020b657bbfd29 100644 --- a/HeterogeneousCore/CUDAUtilities/test/radixSort_t.cu +++ b/HeterogeneousCore/CUDAUtilities/test/radixSort_t.cu @@ -31,7 +31,7 @@ struct RS { template -void go() { +void go(bool useShared) { std::mt19937 eng; // std::mt19937 eng2; @@ -89,6 +89,7 @@ void go() { auto v_d = cuda::memory::device::make_unique(current_device, N); auto ind_d = cuda::memory::device::make_unique(current_device, N); + auto ws_d = cuda::memory::device::make_unique(current_device, N); auto off_d = cuda::memory::device::make_unique(current_device, blocks+1); cuda::memory::copy(v_d.get(), v, N*sizeof(T)); @@ -97,12 +98,21 @@ void go() { if (i<2) std::cout << "lauch for " << offsets[blocks] << std::endl; delta -= (std::chrono::high_resolution_clock::now()-start); + constexpr int MaxSize = 256*32; + if (useShared) cuda::launch( radixSortMultiWrapper, + { blocks, 256, MaxSize*2 }, + v_d.get(),ind_d.get(),off_d.get(),nullptr + ); + else + cuda::launch( + radixSortMultiWrapper2, { blocks, 256 }, - v_d.get(),ind_d.get(),off_d.get() + v_d.get(),ind_d.get(),off_d.get(),ws_d.get() ); + if (i==0) std::cout << "done for " << offsets[blocks] << std::endl; // cuda::memory::copy(v, v_d.get(), 2*N); @@ -140,17 +150,42 @@ void go() { int main() { - go(); - go(); - go(); - go(); - go(); - go(); - go(); - - go(); - go(); - go(); - // go(); + bool useShared=false; + + std::cout << "using Global memory" << std::endl; + + + go(useShared); + go(useShared); + go(useShared); + go(useShared); + go(useShared); + go(useShared); + go(useShared); + + go(useShared); + go(useShared); + go(useShared); + // go(v); + + useShared=true; + + std::cout << "using Shared memory" << std::endl; + + go(useShared); + go(useShared); + go(useShared); + go(useShared); + go(useShared); + go(useShared); + go(useShared); + + go(useShared); + go(useShared); + go(useShared); + // go(v); + + + return 0; } From fe2af2e50f983f4c6d43c8dafe4e4f8a826edb2f Mon Sep 17 00:00:00 2001 From: Vincenzo Innocente Date: Fri, 14 Sep 2018 16:26:52 +0200 Subject: [PATCH 02/36] add minBLperML and a debug printout --- .../plugins/SiPixelRecHitHeterogeneous.cc | 11 ++++++++++- .../PixelTriplets/plugins/gpuPixelDoublets.h | 3 ++- 2 files changed, 12 insertions(+), 2 deletions(-) diff --git a/RecoLocalTracker/SiPixelRecHits/plugins/SiPixelRecHitHeterogeneous.cc b/RecoLocalTracker/SiPixelRecHits/plugins/SiPixelRecHitHeterogeneous.cc index 6065fa3cb274a..f6e49ea7ddd08 100644 --- a/RecoLocalTracker/SiPixelRecHits/plugins/SiPixelRecHitHeterogeneous.cc +++ b/RecoLocalTracker/SiPixelRecHits/plugins/SiPixelRecHitHeterogeneous.cc @@ -218,6 +218,7 
@@ void SiPixelRecHitHeterogeneous::run(const edm::HandledetId(); @@ -234,7 +235,7 @@ void SiPixelRecHitHeterogeneous::run(const edm::Handle=96 && hoc.charge[fc+i]<4000) ) continue; + if( hoc.charge[fc+i]<2000 || (gind>=96 && hoc.charge[fc+i]<4000) ) { ++numberOfLostClusters; continue;} ind[ngh]=i;std::push_heap(ind, ind+ngh+1,[&](auto a, auto b) { return mrp[a] Date: Sat, 15 Sep 2018 17:36:54 +0200 Subject: [PATCH 03/36] a small prefix scan in single block --- .../CUDAUtilities/interface/prefixScan.h | 52 +++++++++++++++++++ .../CUDAUtilities/test/BuildFile.xml | 2 + .../CUDAUtilities/test/prefixScan_t.cu | 33 ++++++++++++ 3 files changed, 87 insertions(+) create mode 100644 HeterogeneousCore/CUDAUtilities/interface/prefixScan.h create mode 100644 HeterogeneousCore/CUDAUtilities/test/prefixScan_t.cu diff --git a/HeterogeneousCore/CUDAUtilities/interface/prefixScan.h b/HeterogeneousCore/CUDAUtilities/interface/prefixScan.h new file mode 100644 index 0000000000000..24925242950f6 --- /dev/null +++ b/HeterogeneousCore/CUDAUtilities/interface/prefixScan.h @@ -0,0 +1,52 @@ +#ifndef HeterogeneousCore_CUDAUtilities_prefixScan_h +#define HeterogeneousCore_CUDAUtilities_prefixScan_h + +#include +#include + +template +__device__ +void +__forceinline__ +warpPrefixScan(T * c, uint32_t i) { + auto x = c[i]; + auto laneId = threadIdx.x & 0x1f; + #pragma unroll + for( int offset = 1 ; offset < 32 ; offset <<= 1 ) { + auto y = __shfl_up_sync(0xffffffff,x, offset); + if(laneId >= offset) x += y; + } + c[i] = x; +} + +// limited to 32*32 elements.... +template +__device__ +void +__forceinline__ +blockPrefixScan(T * c, uint32_t size, T* ws) { + assert(size<=1024); + assert(0==blockDim.x%32); + + auto first = threadIdx.x; + + for (auto i=first; i + + diff --git a/HeterogeneousCore/CUDAUtilities/test/prefixScan_t.cu b/HeterogeneousCore/CUDAUtilities/test/prefixScan_t.cu new file mode 100644 index 0000000000000..3e8645439a28f --- /dev/null +++ b/HeterogeneousCore/CUDAUtilities/test/prefixScan_t.cu @@ -0,0 +1,33 @@ +#include "HeterogeneousCore/CUDAUtilities/interface/prefixScan.h" +#include + +template +__global__ +void testPrefixScan(uint32_t size) { + + __shared__ T ws[32]; + __shared__ T c[1024]; + auto first = threadIdx.x; + for (auto i=first; i<<<1,bs>>>(j); + testPrefixScan<<<1,bs>>>(j); + } + cudaDeviceSynchronize(); + + return 0; +} From 90344c76ebf98ca0737aab70b528727622360043 Mon Sep 17 00:00:00 2001 From: Vincenzo Innocente Date: Sat, 15 Sep 2018 19:43:07 +0200 Subject: [PATCH 04/36] add charge cut --- .../plugins/gpuClusterChargeCut.h | 92 +++++++++++++++++++ .../plugins/gpuClusteringConstants.h | 2 + .../SiPixelClusterizer/test/gpuClustering.cu | 45 ++++++--- 3 files changed, 127 insertions(+), 12 deletions(-) create mode 100644 RecoLocalTracker/SiPixelClusterizer/plugins/gpuClusterChargeCut.h diff --git a/RecoLocalTracker/SiPixelClusterizer/plugins/gpuClusterChargeCut.h b/RecoLocalTracker/SiPixelClusterizer/plugins/gpuClusterChargeCut.h new file mode 100644 index 0000000000000..c86b67e66d131 --- /dev/null +++ b/RecoLocalTracker/SiPixelClusterizer/plugins/gpuClusterChargeCut.h @@ -0,0 +1,92 @@ +#ifndef RecoLocalTracker_SiPixelClusterizer_plugins_gpuClusterChargeCut_h +#define RecoLocalTracker_SiPixelClusterizer_plugins_gpuClusterChargeCut_h + +#include +#include +#include + +#include "gpuClusteringConstants.h" + +#include "HeterogeneousCore/CUDAUtilities/interface/prefixScan.h" + +namespace gpuClustering { + + __global__ void clusterChargeCut( + uint16_t * __restrict__ id, // module id of each pixel 
(modified if bad cluster) + uint16_t const * __restrict__ adc, // charge of each pixel + uint32_t const * __restrict__ moduleStart, // index of the first pixel of each module + uint32_t * __restrict__ nClustersInModule, // modified: number of clusters found in each module + uint32_t const * __restrict__ moduleId, // module id of each module + int32_t * __restrict__ clusterId, // modified: cluster id of each pixel + int numElements) + { + + if (blockIdx.x >= moduleStart[0]) + return; + + auto firstPixel = moduleStart[1 + blockIdx.x]; + auto thisModuleId = id[firstPixel]; + assert(thisModuleId < MaxNumModules); + + auto nclus = nClustersInModule[thisModuleId]; + if (nclus==0) return; + + assert(nclus<=MaxNumClustersPerModules); + +#ifdef GPU_DEBUG + if (thisModuleId % 100 == 1) + if (threadIdx.x == 0) + printf("start clusterizer for module %d in block %d\n", thisModuleId, blockIdx.x); +#endif + + auto first = firstPixel + threadIdx.x; + + __shared__ int32_t charge[MaxNumClustersPerModules]; + for (int i=threadIdx.x; ichargeCut ? 1 : 0; + } + + // renumber + __shared__ uint16_t ws[32]; + blockPrefixScan(newclusId, nclus, ws); + + assert(nclus>=newclusId[nclus-1]); + + if(nclus==newclusId[nclus-1]) return; + + nClustersInModule[thisModuleId] = newclusId[nclus-1]; + + // mark bad cluster with 0 again + for (int i=threadIdx.x+1; i MaxNumModules + } #endif // RecoLocalTracker_SiPixelClusterizer_plugins_gpuClusteringConstants_h diff --git a/RecoLocalTracker/SiPixelClusterizer/test/gpuClustering.cu b/RecoLocalTracker/SiPixelClusterizer/test/gpuClustering.cu index 2264be0de02af..a94081c2d69b4 100644 --- a/RecoLocalTracker/SiPixelClusterizer/test/gpuClustering.cu +++ b/RecoLocalTracker/SiPixelClusterizer/test/gpuClustering.cu @@ -12,6 +12,7 @@ #include #include "RecoLocalTracker/SiPixelClusterizer/plugins/gpuClustering.h" +#include "RecoLocalTracker/SiPixelClusterizer/plugins/gpuClusterChargeCut.h" int main(void) { @@ -58,7 +59,7 @@ int main(void) h_id[n]=id; h_x[n]=x; h_y[n]=x; - h_adc[n]=100; + h_adc[n]=5000; ++n; // diagonal ++ncl; @@ -66,7 +67,7 @@ int main(void) h_id[n]=id; h_x[n]=x; h_y[n]=x; - h_adc[n]=100; + h_adc[n]=1000; ++n; } ++ncl; @@ -75,7 +76,7 @@ int main(void) h_id[n]=id; h_x[n]=x; h_y[n]=x; - h_adc[n]=100; + h_adc[n]=1000; ++n; } ++ncl; @@ -86,7 +87,7 @@ int main(void) h_id[n]=id; h_x[n]=xx[k]; h_y[n]=20+xx[k]; - h_adc[n]=100; + h_adc[n]=1000; ++n; } // holes @@ -95,13 +96,13 @@ int main(void) h_id[n]=id; h_x[n]=xx[k]; h_y[n]=100; - h_adc[n]=100; + h_adc[n]=1000; ++n; if (xx[k]%2==0) { h_id[n]=id; h_x[n]=xx[k]; h_y[n]=101; - h_adc[n]=100; + h_adc[n]=1000; ++n; } } @@ -114,7 +115,7 @@ int main(void) h_id[n]=id; h_x[n]=x; h_y[n]=x; - h_adc[n]=100; + h_adc[n]=1000; ++n; } // all odd id @@ -132,7 +133,7 @@ int main(void) h_id[n]=id; h_x[n]=x+1; h_y[n]=x+y[k]+2; - h_adc[n]=100; + h_adc[n]=1000; ++n; } } else { @@ -140,14 +141,14 @@ int main(void) h_id[n]=id; h_x[n]=x; h_y[n]=x+y[9-k]; - h_adc[n]=100; + h_adc[n]=1000; ++n; if (y[k]==3) continue; // hole if (id==51) {h_id[n++]=InvId; h_id[n++]=InvId; }// error h_id[n]=id; h_x[n]=x+1; h_y[n]=x+y[k]+2; - h_adc[n]=100; + h_adc[n]=1000; ++n; } } @@ -199,10 +200,30 @@ int main(void) n ); - cuda::memory::copy(&nModules,d_moduleStart.get(),sizeof(uint32_t)); + cuda::memory::copy(&nModules,d_moduleStart.get(),sizeof(uint32_t)); + + uint32_t nclus[MaxNumModules], moduleId[nModules]; + + cuda::memory::copy(&nclus,d_clusInModule.get(),MaxNumModules*sizeof(uint32_t)); + std::cout << "before charge cut found " << 
std::accumulate(nclus,nclus+MaxNumModules,0) << " clusters" << std::endl; + + + + cuda::launch( + clusterChargeCut, + { blocksPerGrid, threadsPerBlock }, + d_id.get(), d_adc.get(), + d_moduleStart.get(), + d_clusInModule.get(), d_moduleId.get(), + d_clus.get(), + n + ); + + + std::cout << "found " << nModules << " Modules active" << std::endl; - uint32_t nclus[MaxNumModules], moduleId[nModules]; + cuda::memory::copy(h_id.get(), d_id.get(), size16); cuda::memory::copy(h_clus.get(), d_clus.get(), size32); cuda::memory::copy(&nclus,d_clusInModule.get(),MaxNumModules*sizeof(uint32_t)); cuda::memory::copy(&moduleId,d_moduleId.get(),nModules*sizeof(uint32_t)); From b1e774689496d80fda4fd09b17b5159954ba9481 Mon Sep 17 00:00:00 2001 From: Vincenzo Innocente Date: Sat, 15 Sep 2018 20:13:03 +0200 Subject: [PATCH 05/36] apply charge cut --- .../plugins/SiPixelRawToClusterGPUKernel.cu | 13 +++++++++++++ 1 file changed, 13 insertions(+) diff --git a/RecoLocalTracker/SiPixelClusterizer/plugins/SiPixelRawToClusterGPUKernel.cu b/RecoLocalTracker/SiPixelClusterizer/plugins/SiPixelRawToClusterGPUKernel.cu index f3242a11d7ae6..0fc31fc1dc36b 100644 --- a/RecoLocalTracker/SiPixelClusterizer/plugins/SiPixelRawToClusterGPUKernel.cu +++ b/RecoLocalTracker/SiPixelClusterizer/plugins/SiPixelRawToClusterGPUKernel.cu @@ -34,6 +34,7 @@ #include "HeterogeneousCore/CUDAUtilities/interface/cudaCheck.h" #include "RecoLocalTracker/SiPixelClusterizer/plugins/gpuCalibPixel.h" #include "RecoLocalTracker/SiPixelClusterizer/plugins/gpuClustering.h" +#include "RecoLocalTracker/SiPixelClusterizer/plugins/gpuClusterChargeCut.h" #include "RecoLocalTracker/SiPixelClusterizer/interface/SiPixelFedCablingMapGPU.h" // local includes @@ -717,6 +718,18 @@ namespace pixelgpudetails { wordCounter); cudaCheck(cudaGetLastError()); + // apply charge cut + clusterChargeCut<<>>( + moduleInd_d, + adc_d, + moduleStart_d, + clusInModule_d, moduleId_d, + clus_d, + wordCounter); + cudaCheck(cudaGetLastError()); + + + // count the module start indices already here (instead of // rechits) so that the number of clusters/hits can be made // available in the rechit producer without additional points of From dca2f1e16854bf475b7b6f9d85fcb4b43b4a3d23 Mon Sep 17 00:00:00 2001 From: Vincenzo Innocente Date: Sun, 16 Sep 2018 10:38:37 +0200 Subject: [PATCH 06/36] ccc works --- .../plugins/SiPixelRawToClusterGPUKernel.cu | 10 +++++----- .../plugins/SiPixelRawToClusterHeterogeneous.cc | 1 + .../SiPixelRecHits/plugins/PixelRecHits.cu | 4 ++++ .../plugins/SiPixelRecHitHeterogeneous.cc | 3 ++- .../SiPixelRecHits/plugins/gpuPixelRecHits.h | 13 +++++++++++-- 5 files changed, 23 insertions(+), 8 deletions(-) diff --git a/RecoLocalTracker/SiPixelClusterizer/plugins/SiPixelRawToClusterGPUKernel.cu b/RecoLocalTracker/SiPixelClusterizer/plugins/SiPixelRawToClusterGPUKernel.cu index 0fc31fc1dc36b..7bd6eac473cc7 100644 --- a/RecoLocalTracker/SiPixelClusterizer/plugins/SiPixelRawToClusterGPUKernel.cu +++ b/RecoLocalTracker/SiPixelClusterizer/plugins/SiPixelRawToClusterGPUKernel.cu @@ -688,11 +688,11 @@ namespace pixelgpudetails { cudaCheck(cudaMemcpyAsync(adc_h, adc_d, wordCounter*sizeof(uint16_t), cudaMemcpyDefault, stream.id())); } - /* - std::cout +#ifdef GPU_DEBUG + std::cout << "CUDA countModules kernel launch with " << blocks << " blocks of " << threadsPerBlock << " threads\n"; - */ +#endif cudaCheck(cudaMemsetAsync(moduleStart_d, 0x00, sizeof(uint32_t), stream.id())); @@ -704,10 +704,10 @@ namespace pixelgpudetails { threadsPerBlock = 256; blocks = MaxNumModules; - /* 
+#ifdef GPU_DEBUG std::cout << "CUDA findClus kernel launch with " << blocks << " blocks of " << threadsPerBlock << " threads\n"; - */ +#endif cudaCheck(cudaMemsetAsync(clusInModule_d, 0, (MaxNumModules)*sizeof(uint32_t), stream.id())); findClus<<>>( moduleInd_d, diff --git a/RecoLocalTracker/SiPixelClusterizer/plugins/SiPixelRawToClusterHeterogeneous.cc b/RecoLocalTracker/SiPixelClusterizer/plugins/SiPixelRawToClusterHeterogeneous.cc index 7f7e65d2874f1..76f3536ab1706 100644 --- a/RecoLocalTracker/SiPixelClusterizer/plugins/SiPixelRawToClusterHeterogeneous.cc +++ b/RecoLocalTracker/SiPixelClusterizer/plugins/SiPixelRawToClusterHeterogeneous.cc @@ -611,6 +611,7 @@ void SiPixelRawToClusterHeterogeneous::convertGPUtoCPU(edm::Event& ev, for (uint32_t i = 0; i < gpu.nDigis; i++) { if (gpu.pdigi_h[i]==0) continue; + if (gpu.clus_h[i]>9000) continue; // not in cluster assert(gpu.rawIdArr_h[i] > 109999); if ( (*detDigis).detId() != gpu.rawIdArr_h[i]) { diff --git a/RecoLocalTracker/SiPixelRecHits/plugins/PixelRecHits.cu b/RecoLocalTracker/SiPixelRecHits/plugins/PixelRecHits.cu index e2165471c3386..bc376536dee12 100644 --- a/RecoLocalTracker/SiPixelRecHits/plugins/PixelRecHits.cu +++ b/RecoLocalTracker/SiPixelRecHits/plugins/PixelRecHits.cu @@ -132,6 +132,10 @@ namespace pixelgpudetails { int threadsPerBlock = 256; int blocks = input.nModules; // active modules (with digis) + +#ifdef GPU_DEBUG + std::cout << "launching getHits kernel for " << blocks << " blocks" << std::endl; +#endif gpuPixelRecHits::getHits<<>>( cpeParams, gpu_.bs_d, diff --git a/RecoLocalTracker/SiPixelRecHits/plugins/SiPixelRecHitHeterogeneous.cc b/RecoLocalTracker/SiPixelRecHits/plugins/SiPixelRecHitHeterogeneous.cc index f6e49ea7ddd08..f0c3b42b30eba 100644 --- a/RecoLocalTracker/SiPixelRecHits/plugins/SiPixelRecHitHeterogeneous.cc +++ b/RecoLocalTracker/SiPixelRecHits/plugins/SiPixelRecHitHeterogeneous.cc @@ -309,12 +309,13 @@ void SiPixelRecHitHeterogeneous::run(const edm::Handle Date: Sun, 16 Sep 2018 14:26:50 +0200 Subject: [PATCH 07/36] stable, but something fishy in test --- .../plugins/gpuClusterChargeCut.h | 15 +++-- .../SiPixelClusterizer/test/BuildFile.xml | 7 +++ .../SiPixelClusterizer/test/gpuClustering.cu | 58 +++++++++++++++++-- 3 files changed, 69 insertions(+), 11 deletions(-) diff --git a/RecoLocalTracker/SiPixelClusterizer/plugins/gpuClusterChargeCut.h b/RecoLocalTracker/SiPixelClusterizer/plugins/gpuClusterChargeCut.h index c86b67e66d131..c0e2174339bd6 100644 --- a/RecoLocalTracker/SiPixelClusterizer/plugins/gpuClusterChargeCut.h +++ b/RecoLocalTracker/SiPixelClusterizer/plugins/gpuClusterChargeCut.h @@ -45,6 +45,7 @@ namespace gpuClustering { for (int i=threadIdx.x; ichargeCut ? 1 : 0; + newclusId[i] = ok[i] = charge[i]>chargeCut ? 
1 : 0; } + __syncthreads(); + // renumber __shared__ uint16_t ws[32]; blockPrefixScan(newclusId, nclus, ws); - + assert(nclus>=newclusId[nclus-1]); if(nclus==newclusId[nclus-1]) return; nClustersInModule[thisModuleId] = newclusId[nclus-1]; - // mark bad cluster with 0 again - for (int i=threadIdx.x+1; i + + + + + + + diff --git a/RecoLocalTracker/SiPixelClusterizer/test/gpuClustering.cu b/RecoLocalTracker/SiPixelClusterizer/test/gpuClustering.cu index a94081c2d69b4..7e2bfd69a96a4 100644 --- a/RecoLocalTracker/SiPixelClusterizer/test/gpuClustering.cu +++ b/RecoLocalTracker/SiPixelClusterizer/test/gpuClustering.cu @@ -51,6 +51,23 @@ int main(void) int ncl=0; int y[10]={5,7,9,1,3,0,4,8,2,6}; + auto generateClusters = [&](bool addBigNoise) { + if (addBigNoise) { + constexpr int MaxPixels = 1000; + int id = 666; + for (int x=0; x<140; x+=3) { + for (int yy=0; yy<400; yy+=3) { + h_id[n]=id; + h_x[n]=x; + h_y[n]=yy; + h_adc[n]=1000; + ++n; ++ncl; + if (MaxPixels<=ncl) break; + } + if (MaxPixels<=ncl) break; + } + } + { // isolated int id = 42; @@ -154,8 +171,15 @@ int main(void) } } } + }; // end lambda + for (auto kkk=0; kkk<2; ++kkk) { + n=0; ncl=0; + generateClusters(1==kkk); + std::cout << "created " << n << " digis in " << ncl << " clusters" << std::endl; assert(n<=numElements); + + size_t size32 = n * sizeof(unsigned int); size_t size16 = n * sizeof(unsigned short); size_t size8 = n * sizeof(uint8_t); @@ -227,24 +251,48 @@ int main(void) cuda::memory::copy(h_clus.get(), d_clus.get(), size32); cuda::memory::copy(&nclus,d_clusInModule.get(),MaxNumModules*sizeof(uint32_t)); cuda::memory::copy(&moduleId,d_moduleId.get(),nModules*sizeof(uint32_t)); - cuda::memory::copy(h_debug.get(), d_debug.get(), size32); +// cuda::memory::copy(h_debug.get(), d_debug.get(), size32); - auto p = std::minmax_element(h_debug.get(),h_debug.get()+n); - std::cout << "debug " << *p.first << ' ' << *p.second << std::endl; std::set clids; std::vector seeds; for (int i=0; i=0); assert(h_clus[i] Date: Sun, 16 Sep 2018 15:12:21 +0200 Subject: [PATCH 08/36] of course de test had a bug --- .../plugins/gpuClusterChargeCut.h | 1 + .../SiPixelClusterizer/test/gpuClustering.cu | 35 +++++++++---------- 2 files changed, 18 insertions(+), 18 deletions(-) diff --git a/RecoLocalTracker/SiPixelClusterizer/plugins/gpuClusterChargeCut.h b/RecoLocalTracker/SiPixelClusterizer/plugins/gpuClusterChargeCut.h index c0e2174339bd6..4abc029e1942a 100644 --- a/RecoLocalTracker/SiPixelClusterizer/plugins/gpuClusterChargeCut.h +++ b/RecoLocalTracker/SiPixelClusterizer/plugins/gpuClusterChargeCut.h @@ -27,6 +27,7 @@ namespace gpuClustering { auto firstPixel = moduleStart[1 + blockIdx.x]; auto thisModuleId = id[firstPixel]; assert(thisModuleId < MaxNumModules); + assert(thisModuleId==moduleId[blockIdx.x]); auto nclus = nClustersInModule[thisModuleId]; if (nclus==0) return; diff --git a/RecoLocalTracker/SiPixelClusterizer/test/gpuClustering.cu b/RecoLocalTracker/SiPixelClusterizer/test/gpuClustering.cu index 7e2bfd69a96a4..af20ae17c745a 100644 --- a/RecoLocalTracker/SiPixelClusterizer/test/gpuClustering.cu +++ b/RecoLocalTracker/SiPixelClusterizer/test/gpuClustering.cu @@ -31,7 +31,6 @@ int main(void) auto h_clus = std::make_unique(numElements); - auto h_debug = std::make_unique(numElements); auto current_device = cuda::device::current::get(); auto d_id = cuda::memory::device::make_unique(current_device, numElements); auto d_x = cuda::memory::device::make_unique(current_device, numElements); @@ -45,13 +44,13 @@ int main(void) auto d_clusInModule = 
cuda::memory::device::make_unique(current_device, MaxNumModules); auto d_moduleId = cuda::memory::device::make_unique(current_device, MaxNumModules); - auto d_debug = cuda::memory::device::make_unique(current_device, numElements); // later random number int n=0; int ncl=0; int y[10]={5,7,9,1,3,0,4,8,2,6}; - auto generateClusters = [&](bool addBigNoise) { + auto generateClusters = [&](int kn) { + auto addBigNoise = 1==kn%2; if (addBigNoise) { constexpr int MaxPixels = 1000; int id = 666; @@ -76,7 +75,7 @@ int main(void) h_id[n]=id; h_x[n]=x; h_y[n]=x; - h_adc[n]=5000; + h_adc[n]= kn==0 ? 100 : 5000; ++n; // diagonal ++ncl; @@ -113,7 +112,7 @@ int main(void) h_id[n]=id; h_x[n]=xx[k]; h_y[n]=100; - h_adc[n]=1000; + h_adc[n]= kn==2 ? 100 : 1000; ++n; if (xx[k]%2==0) { h_id[n]=id; @@ -132,7 +131,7 @@ int main(void) h_id[n]=id; h_x[n]=x; h_y[n]=x; - h_adc[n]=1000; + h_adc[n]=5000; ++n; } // all odd id @@ -158,23 +157,23 @@ int main(void) h_id[n]=id; h_x[n]=x; h_y[n]=x+y[9-k]; - h_adc[n]=1000; + h_adc[n]= kn==2 ? 10 : 1000; ++n; if (y[k]==3) continue; // hole if (id==51) {h_id[n++]=InvId; h_id[n++]=InvId; }// error h_id[n]=id; h_x[n]=x+1; h_y[n]=x+y[k]+2; - h_adc[n]=1000; + h_adc[n]= kn==2 ? 10 : 1000; ++n; } } } } }; // end lambda - for (auto kkk=0; kkk<2; ++kkk) { + for (auto kkk=0; kkk<4; ++kkk) { n=0; ncl=0; - generateClusters(1==kkk); + generateClusters(kkk); std::cout << "created " << n << " digis in " << ncl << " clusters" << std::endl; assert(n<=numElements); @@ -182,7 +181,7 @@ int main(void) size_t size32 = n * sizeof(unsigned int); size_t size16 = n * sizeof(unsigned short); - size_t size8 = n * sizeof(uint8_t); + // size_t size8 = n * sizeof(uint8_t); uint32_t nModules=0; cuda::memory::copy(d_moduleStart.get(),&nModules,sizeof(uint32_t)); @@ -190,8 +189,7 @@ int main(void) cuda::memory::copy(d_id.get(), h_id.get(), size16); cuda::memory::copy(d_x.get(), h_x.get(), size16); cuda::memory::copy(d_y.get(), h_y.get(), size16); - cuda::memory::copy(d_adc.get(), h_adc.get(), size8); - cuda::memory::device::zero(d_debug.get(),size32); + cuda::memory::copy(d_adc.get(), h_adc.get(), size16); // Launch CUDA Kernels int threadsPerBlock = 256; int blocksPerGrid = (numElements + threadsPerBlock - 1) / threadsPerBlock; @@ -223,6 +221,7 @@ int main(void) d_clus.get(), n ); + cudaDeviceSynchronize(); cuda::memory::copy(&nModules,d_moduleStart.get(),sizeof(uint32_t)); @@ -230,10 +229,10 @@ int main(void) cuda::memory::copy(&nclus,d_clusInModule.get(),MaxNumModules*sizeof(uint32_t)); std::cout << "before charge cut found " << std::accumulate(nclus,nclus+MaxNumModules,0) << " clusters" << std::endl; + for (auto i=MaxNumModules; i>0; i--) if (nclus[i-1]>0) {std::cout << "last module is " << i-1 << ' ' << nclus[i-1] << std::endl; break;} - - cuda::launch( + cuda::launch( clusterChargeCut, { blocksPerGrid, threadsPerBlock }, d_id.get(), d_adc.get(), @@ -244,6 +243,7 @@ int main(void) ); + cudaDeviceSynchronize(); std::cout << "found " << nModules << " Modules active" << std::endl; @@ -251,11 +251,9 @@ int main(void) cuda::memory::copy(h_clus.get(), d_clus.get(), size32); cuda::memory::copy(&nclus,d_clusInModule.get(),MaxNumModules*sizeof(uint32_t)); cuda::memory::copy(&moduleId,d_moduleId.get(),nModules*sizeof(uint32_t)); -// cuda::memory::copy(h_debug.get(), d_debug.get(), size32); std::set clids; - std::vector seeds; for (int i=0; i0; i--) if (nclus[i-1]>0) {std::cout << "last module is " << i-1 << ' ' << nclus[i-1] << std::endl; break;} // << " and " << seeds.size() << " seeds" << std::endl; } /// end 
loop kkk return 0; From 9103a9c0f927abac842f3db75e9b4402ea5727fd Mon Sep 17 00:00:00 2001 From: Vincenzo Innocente Date: Mon, 17 Sep 2018 14:30:23 +0200 Subject: [PATCH 09/36] works, sort only with blksize 256 --- .../CUDAUtilities/interface/radixSort.h | 67 +++++++++++-------- .../CUDAUtilities/test/radixSort_t.cu | 28 +++++--- .../plugins/CAHitQuadrupletGeneratorGPU.cu | 14 ++-- .../PixelTriplets/plugins/GPUCACell.h | 2 +- .../PixelTriplets/plugins/gpuPixelDoublets.h | 1 + .../pixelVertexHeterogeneousProduct.h | 2 + .../src/PixelVertexHeterogeneousProducer.cc | 36 +++++----- .../PixelVertexFinding/src/gpuClusterTracks.h | 38 +++++++++++ .../PixelVertexFinding/src/gpuVertexFinder.cu | 23 +++++-- .../PixelVertexFinding/src/gpuVertexFinder.h | 9 ++- .../test/gpuVertexFinder_t.cu | 31 ++++++++- 11 files changed, 179 insertions(+), 72 deletions(-) diff --git a/HeterogeneousCore/CUDAUtilities/interface/radixSort.h b/HeterogeneousCore/CUDAUtilities/interface/radixSort.h index 1311865ce2ae6..83468d854b1e6 100644 --- a/HeterogeneousCore/CUDAUtilities/interface/radixSort.h +++ b/HeterogeneousCore/CUDAUtilities/interface/radixSort.h @@ -90,7 +90,7 @@ radixSortImpl(T const * __restrict__ a, uint16_t * ind, uint16_t * ind2, uint32_ __shared__ int p; assert(size>0); - assert(blockDim.x==sb); + assert(blockDim.x>=sb); // bool debug = false; // threadIdx.x==0 && blockIdx.x==5; @@ -104,8 +104,8 @@ radixSortImpl(T const * __restrict__ a, uint16_t * ind, uint16_t * ind2, uint32_ __syncthreads(); - while(p < w/d) { - c[threadIdx.x]=0; + while(__syncthreads_and(p < w/d)) { + if (threadIdx.x= offset) x += y; + if (threadIdx.x= offset) x += y; + } + ct[threadIdx.x] = x; } - ct[threadIdx.x] = x; __syncthreads(); - auto ss = (threadIdx.x/32)*32 -1; - c[threadIdx.x] = ct[threadIdx.x]; - for(int i=ss; i>0; i-=32) c[threadIdx.x] +=ct[i]; - + if (threadIdx.x0; i-=32) c[threadIdx.x] +=ct[i]; + } /* //prefix scan for the nulls (for documentation) if (threadIdx.x==0) @@ -139,27 +142,33 @@ radixSortImpl(T const * __restrict__ a, uint16_t * ind, uint16_t * ind2, uint32_ // broadcast ibs =size-1; __syncthreads(); - while (ibs>0) { + while (__syncthreads_and(ibs>0)) { int i = ibs - threadIdx.x; - cu[threadIdx.x]=-1; - ct[threadIdx.x]=-1; + if (threadIdx.x=0) { - bin = (a[j[i]] >> d*p)&(sb-1); - ct[threadIdx.x]=bin; - atomicMax(&cu[bin],int(i)); + if (threadIdx.x=0) { + bin = (a[j[i]] >> d*p)&(sb-1); + ct[threadIdx.x]=bin; + atomicMax(&cu[bin],int(i)); + } } __syncthreads(); - if (i>=0 && i==cu[bin]) // ensure to keep them in order - for (int ii=threadIdx.x; ii=oi);if(i>=oi) - k[--c[bin]] = j[i-oi]; - } + if (threadIdx.x=0 && i==cu[bin]) // ensure to keep them in order + for (int ii=threadIdx.x; ii=oi);if(i>=oi) + k[--c[bin]] = j[i-oi]; + } + } __syncthreads(); if (bin>=0) assert(c[bin]>=0); - if (threadIdx.x==0) ibs-=blockDim.x; + if (threadIdx.x==0) ibs-=sb; __syncthreads(); } @@ -190,6 +199,8 @@ radixSortImpl(T const * __restrict__ a, uint16_t * ind, uint16_t * ind2, uint32_ if (j!=ind) // odd... for (auto i=first; i #include #include - +#include #include #include @@ -97,18 +97,20 @@ void go(bool useShared) { if (i<2) std::cout << "lauch for " << offsets[blocks] << std::endl; + auto ntXBl = 1==i%4 ? 
256 : 256; + delta -= (std::chrono::high_resolution_clock::now()-start); constexpr int MaxSize = 256*32; if (useShared) cuda::launch( radixSortMultiWrapper, - { blocks, 256, MaxSize*2 }, + { blocks, ntXBl, MaxSize*2 }, v_d.get(),ind_d.get(),off_d.get(),nullptr ); else cuda::launch( radixSortMultiWrapper2, - { blocks, 256 }, + { blocks, ntXBl }, v_d.get(),ind_d.get(),off_d.get(),ws_d.get() ); @@ -127,10 +129,13 @@ void go(bool useShared) { std::cout << LL(v[ind[3]]) << ' ' << LL(v[ind[10]]) << ' ' << LL(v[ind[blockSize-1000]]) << std::endl; std::cout << LL(v[ind[blockSize/2-1]]) << ' ' << LL(v[ind[blockSize/2]]) << ' ' << LL(v[ind[blockSize/2+1]]) << std::endl; } - for (int ib=0; ib inds; + if (offsets[ib+1]> offsets[ib]) inds.insert(ind[offsets[ib]]); + for (auto j = offsets[ib]+1; j < offsets[ib+1]; j++) { + inds.insert(ind[j]); auto a = v+offsets[ib]; - auto k1=a[ind[i]]; auto k2=a[ind[i-1]]; + auto k1=a[ind[j]]; auto k2=a[ind[j-1]]; auto sh = sizeof(uint64_t)-NS; sh*=8; auto shorten = [sh](T& t) { auto k = (uint64_t *)(&t); @@ -138,8 +143,15 @@ void go(bool useShared) { }; shorten(k1);shorten(k2); if (k1 *foundNtuplets, } -__global__ void +__global__ +void kernel_connect(GPU::SimpleVector *foundNtuplets, GPUCACell *cells, uint32_t const * nCells, GPU::VecArray< unsigned int, 256> *isOuterHitOfCell, @@ -158,8 +159,9 @@ void CAHitQuadrupletGeneratorGPU::launchKernels(const TrackingRegion ®ion, auto nhits = hh.nHits; assert(nhits <= PixelGPUConstants::maxNumberOfHits); - auto numberOfBlocks = (maxNumberOfDoublets_ + 512 - 1)/512; - kernel_connect<<>>( + auto blockSize = 64; + auto numberOfBlocks = (maxNumberOfDoublets_ + blockSize - 1)/blockSize; + kernel_connect<<>>( d_foundNtupletsVec_[regionIndex], // needed only to be reset, ready for next kernel device_theCells_, device_nCells_, device_isOuterHitOfCell_, @@ -169,15 +171,15 @@ void CAHitQuadrupletGeneratorGPU::launchKernels(const TrackingRegion ®ion, ); cudaCheck(cudaGetLastError()); - kernel_find_ntuplets<<>>( + kernel_find_ntuplets<<>>( device_theCells_, device_nCells_, d_foundNtupletsVec_[regionIndex], 4, maxNumberOfDoublets_); cudaCheck(cudaGetLastError()); - numberOfBlocks = (std::max(int(nhits), maxNumberOfDoublets_) + 512 - 1)/512; - kernel_checkOverflows<<>>( + numberOfBlocks = (std::max(int(nhits), maxNumberOfDoublets_) + blockSize - 1)/blockSize; + kernel_checkOverflows<<>>( d_foundNtupletsVec_[regionIndex], device_theCells_, device_nCells_, device_isOuterHitOfCell_, nhits, diff --git a/RecoPixelVertexing/PixelTriplets/plugins/GPUCACell.h b/RecoPixelVertexing/PixelTriplets/plugins/GPUCACell.h index 5995d286fc38d..02cf0d6f91642 100644 --- a/RecoPixelVertexing/PixelTriplets/plugins/GPUCACell.h +++ b/RecoPixelVertexing/PixelTriplets/plugins/GPUCACell.h @@ -139,7 +139,7 @@ class GPUCACell { return distance_13_beamspot_squared < (region_origin_radius + phiCut) * (region_origin_radius + phiCut); - } + } // 87 cm/GeV = 1/(3.8T * 0.3) diff --git a/RecoPixelVertexing/PixelTriplets/plugins/gpuPixelDoublets.h b/RecoPixelVertexing/PixelTriplets/plugins/gpuPixelDoublets.h index c75c492d6953d..2d6ad83c63b2c 100644 --- a/RecoPixelVertexing/PixelTriplets/plugins/gpuPixelDoublets.h +++ b/RecoPixelVertexing/PixelTriplets/plugins/gpuPixelDoublets.h @@ -20,6 +20,7 @@ namespace gpuPixelDoublets { template __device__ + __forceinline__ void doubletsFromHisto(uint8_t const * __restrict__ layerPairs, uint32_t nPairs, GPUCACell * cells, diff --git a/RecoPixelVertexing/PixelVertexFinding/interface/pixelVertexHeterogeneousProduct.h 
b/RecoPixelVertexing/PixelVertexFinding/interface/pixelVertexHeterogeneousProduct.h index fcb19f855a9ba..ff3624cdafd65 100644 --- a/RecoPixelVertexing/PixelVertexFinding/interface/pixelVertexHeterogeneousProduct.h +++ b/RecoPixelVertexing/PixelVertexFinding/interface/pixelVertexHeterogeneousProduct.h @@ -16,6 +16,7 @@ namespace pixelVertexHeterogeneousProduct { float * z_d; float * zerr_d; float * chi2_d; + uint16_t * sortInd; int32_t * ivtx_d; // this should be indexed with the original tracks, not the reduced set (oops) }; @@ -31,6 +32,7 @@ namespace pixelVertexHeterogeneousProduct { { } std::vector> z,zerr, chi2; + std::vector> sortInd; std::vector> ivtx; uint32_t nVertices=0; diff --git a/RecoPixelVertexing/PixelVertexFinding/src/PixelVertexHeterogeneousProducer.cc b/RecoPixelVertexing/PixelVertexFinding/src/PixelVertexHeterogeneousProducer.cc index 3451655a282d1..0b8c31235abea 100644 --- a/RecoPixelVertexing/PixelVertexFinding/src/PixelVertexHeterogeneousProducer.cc +++ b/RecoPixelVertexing/PixelVertexFinding/src/PixelVertexHeterogeneousProducer.cc @@ -117,18 +117,20 @@ void PixelVertexHeterogeneousProducer::acquireGPUCuda( // Second, make a collection of pointers to the tracks we want for the vertex finder // fill z,ez - std::vector z,ez; + std::vector z,ez2,pt2; assert(m_trks.empty()); for (unsigned int i=0; ipt() > 20 ? 20*20 : m_trks[k]->pt()*m_trks[k]->pt(); - } - // sort - std::sort(ind,ind+gpuProduct.nVertices,[&](int i, int j){ return pt2[i]>pt2[j];}); - if(gpuProduct.nVertices>1) assert(pt2[ind[0]]>=pt2[ind[1]]); // fill legacy data format - for (unsigned int j=0; j uind; // fort verifing index consistency + for (int j=int(gpuProduct.nVertices)-1; j>=0; --j) { + auto i = gpuProduct.sortInd[j]; // on gpu sorted in ascending order.... + assert(i>=0); + assert(i0); (*vertexes).emplace_back(reco::Vertex::Point(x,y,z), err, gpuProduct.chi2[i], nt-1, nt ); auto & v = (*vertexes).back(); - pt2[i]=0; for (auto k: itrk) { v.add(reco::TrackBaseRef(m_trks[k])); } itrk.clear(); } + assert(uind.size()==(*vertexes).size()); + if (!uind.empty()) { + assert(0 == *uind.begin()); + assert(uind.size()-1 == *uind.rbegin()); + } - if (verbose_) { edm::LogInfo("PixelVertexHeterogeneousProducer") << ": Found " << vertexes->size() << " vertexes\n"; for (unsigned int i=0; isize(); ++i) { diff --git a/RecoPixelVertexing/PixelVertexFinding/src/gpuClusterTracks.h b/RecoPixelVertexing/PixelVertexFinding/src/gpuClusterTracks.h index 7c97665ece061..8d4c4d967a6b8 100644 --- a/RecoPixelVertexing/PixelVertexFinding/src/gpuClusterTracks.h +++ b/RecoPixelVertexing/PixelVertexFinding/src/gpuClusterTracks.h @@ -7,10 +7,48 @@ #include #include "HeterogeneousCore/CUDAUtilities/interface/HistoContainer.h" +#include "HeterogeneousCore/CUDAUtilities/interface/radixSort.h" + #include "gpuVertexFinder.h" namespace gpuVertexFinder { + + + __global__ + void sortByPt2(int nt, + OnGPU * pdata + ) { + auto & __restrict__ data = *pdata; + float const * __restrict__ ptt2 = data.ptt2; + uint32_t const & nv = *data.nv; + + int32_t const * __restrict__ iv = data.iv; + float * __restrict__ ptv2 = data.ptv2; + uint16_t * __restrict__ sortInd = data.sortInd; + + if (nv<2) return; + + // can be done asynchronoisly at the end of previous event + for (int i = threadIdx.x; i < nv; i += blockDim.x) { + ptv2[i]=0; + } + __syncthreads(); + + + for (int i = threadIdx.x; i < nt; i += blockDim.x) { + if (iv[i]>9990) continue; + atomicAdd(&ptv2[iv[i]], ptt2[i]); + } + __syncthreads(); + + __shared__ uint16_t ws[1024]; + 
radixSort(ptv2,sortInd,ws,nv); + + assert(ptv2[sortInd[nv-1]]>=ptv2[sortInd[nv-2]]); + assert(ptv2[sortInd[1]]>=ptv2[sortInd[0]]); + } + // this algo does not really scale as it works in a single block... // enough for <10K tracks we have diff --git a/RecoPixelVertexing/PixelVertexFinding/src/gpuVertexFinder.cu b/RecoPixelVertexing/PixelVertexFinding/src/gpuVertexFinder.cu index cb39e003a098d..e1e346253ceb5 100644 --- a/RecoPixelVertexing/PixelVertexFinding/src/gpuVertexFinder.cu +++ b/RecoPixelVertexing/PixelVertexFinding/src/gpuVertexFinder.cu @@ -6,13 +6,16 @@ namespace gpuVertexFinder { void Producer::allocateOnGPU() { cudaCheck(cudaMalloc(&onGPU.zt, OnGPU::MAXTRACKS*sizeof(float))); cudaCheck(cudaMalloc(&onGPU.ezt2, OnGPU::MAXTRACKS*sizeof(float))); + cudaCheck(cudaMalloc(&onGPU.ptt2, OnGPU::MAXTRACKS*sizeof(float))); cudaCheck(cudaMalloc(&onGPU.iv, OnGPU::MAXTRACKS*sizeof(int32_t))); cudaCheck(cudaMalloc(&onGPU.nv, sizeof(uint32_t))); cudaCheck(cudaMalloc(&onGPU.zv, OnGPU::MAXVTX*sizeof(float))); cudaCheck(cudaMalloc(&onGPU.wv, OnGPU::MAXVTX*sizeof(float))); cudaCheck(cudaMalloc(&onGPU.chi2, OnGPU::MAXVTX*sizeof(float))); - + cudaCheck(cudaMalloc(&onGPU.ptv2, OnGPU::MAXVTX*sizeof(float))); + cudaCheck(cudaMalloc(&onGPU.sortInd, OnGPU::MAXVTX*sizeof(uint16_t))); + cudaCheck(cudaMalloc(&onGPU.izt, OnGPU::MAXTRACKS*sizeof(int8_t))); cudaCheck(cudaMalloc(&onGPU.nn, OnGPU::MAXTRACKS*sizeof(int32_t))); @@ -25,13 +28,15 @@ namespace gpuVertexFinder { void Producer::deallocateOnGPU() { cudaCheck(cudaFree(onGPU.zt)); cudaCheck(cudaFree(onGPU.ezt2)); + cudaCheck(cudaFree(onGPU.ptt2)); cudaCheck(cudaFree(onGPU.iv)); cudaCheck(cudaFree(onGPU.nv)); cudaCheck(cudaFree(onGPU.zv)); cudaCheck(cudaFree(onGPU.wv)); cudaCheck(cudaFree(onGPU.chi2)); - + cudaCheck(cudaFree(onGPU.ptv2)); + cudaCheck(cudaFree(onGPU.sortInd)); cudaCheck(cudaFree(onGPU.izt)); cudaCheck(cudaFree(onGPU.nn)); @@ -44,6 +49,7 @@ namespace gpuVertexFinder { void Producer::produce(cudaStream_t stream, float const * __restrict__ zt, float const * __restrict__ ezt2, + float const * __restrict__ ptt2, uint32_t ntrks ) { @@ -52,10 +58,15 @@ namespace gpuVertexFinder { cudaMemcpyHostToDevice,stream)); cudaCheck(cudaMemcpyAsync(onGPU.ezt2,ezt2,ntrks*sizeof(float), cudaMemcpyHostToDevice,stream)); + + cudaCheck(cudaMemcpyAsync(onGPU.ptt2,ptt2,ntrks*sizeof(float), + cudaMemcpyHostToDevice,stream)); + assert(onGPU_d); clusterTracks<<<1,1024,0,stream>>>(ntrks,onGPU_d,minT,eps,errmax,chi2max); - + sortByPt2<<<1,256,0,stream>>>(ntrks,onGPU_d); + cudaCheck(cudaMemcpyAsync(&gpuProduct.nVertices, onGPU.nv, sizeof(uint32_t), cudaMemcpyDeviceToHost, stream)); @@ -78,7 +89,11 @@ namespace gpuVertexFinder { gpuProduct.chi2.resize(gpuProduct.nVertices); cudaCheck(cudaMemcpyAsync(gpuProduct.chi2.data(),onGPU.chi2,sizeof(float)*gpuProduct.nVertices, cudaMemcpyDeviceToHost, stream)); - + + gpuProduct.sortInd.resize(gpuProduct.nVertices); + cudaCheck(cudaMemcpyAsync(gpuProduct.sortInd.data(),onGPU.sortInd,sizeof(uint16_t)*gpuProduct.nVertices, + cudaMemcpyDeviceToHost, stream)); + cudaStreamSynchronize(stream); return gpuProduct; diff --git a/RecoPixelVertexing/PixelVertexFinding/src/gpuVertexFinder.h b/RecoPixelVertexing/PixelVertexFinding/src/gpuVertexFinder.h index 1f90641e3c260..bea3db7338a86 100644 --- a/RecoPixelVertexing/PixelVertexFinding/src/gpuVertexFinder.h +++ b/RecoPixelVertexing/PixelVertexFinding/src/gpuVertexFinder.h @@ -13,17 +13,21 @@ namespace gpuVertexFinder { static constexpr uint32_t MAXTRACKS = 16000; static constexpr uint32_t 
MAXVTX= 1024; - float * zt; // input track z at bs + float * zt; // input track z at bs float * ezt2; // input error^2 on the above + float * ptt2; // input pt^2 on the above + float * zv; // output z-posistion of found vertices float * wv; // output weight (1/error^2) on the above float * chi2; // vertices chi2 + float * ptv2; // vertices pt^2 uint32_t * nv; // the number of vertices int32_t * iv; // vertex index for each associated track + uint16_t * sortInd; // sorted index (by pt2) // workspace - int8_t * izt; // interized z-position of input tracks (reused as interize pt2 of vertices for sorting) + int8_t * izt; // interized z-position of input tracks int32_t * nn; // number of nearest neighbours (reused as number of dof for output vertices) }; @@ -53,6 +57,7 @@ namespace gpuVertexFinder { void produce(cudaStream_t stream, float const * zt, float const * ezt2, + float const * ptt2, uint32_t ntrks ); diff --git a/RecoPixelVertexing/PixelVertexFinding/test/gpuVertexFinder_t.cu b/RecoPixelVertexing/PixelVertexFinding/test/gpuVertexFinder_t.cu index f47c4362503ae..840b8947e7220 100644 --- a/RecoPixelVertexing/PixelVertexFinding/test/gpuVertexFinder_t.cu +++ b/RecoPixelVertexing/PixelVertexFinding/test/gpuVertexFinder_t.cu @@ -13,13 +13,14 @@ struct Event { std::vector itrack; std::vector ztrack; std::vector eztrack; + std::vector pttrack; std::vector ivert; }; struct ClusterGenerator { explicit ClusterGenerator(float nvert, float ntrack) : - rgen(-13.,13), errgen(0.005,0.025), clusGen(nvert), trackGen(ntrack), gauss(0.,1.) + rgen(-13.,13), errgen(0.005,0.025), clusGen(nvert), trackGen(ntrack), gauss(0.,1.), ptGen(1.) {} void operator()(Event & ev) { @@ -42,6 +43,8 @@ struct ClusterGenerator { ev.ztrack.push_back(ev.zvert[iv]+err*gauss(reng)); ev.eztrack.push_back(err*err); ev.ivert.push_back(iv); + ev.pttrack.push_back( (iv==5? 
1.f:0.5f) + ptGen(reng) ); + ev.pttrack.back()*=ev.pttrack.back(); } } // add noise @@ -51,6 +54,8 @@ struct ClusterGenerator { ev.ztrack.push_back(rgen(reng)); ev.eztrack.push_back(err*err); ev.ivert.push_back(9999); + ev.pttrack.push_back( 0.5f + ptGen(reng) ); + ev.pttrack.back()*=ev.pttrack.back(); } } @@ -61,7 +66,7 @@ struct ClusterGenerator { std::poisson_distribution clusGen; std::poisson_distribution trackGen; std::normal_distribution gauss; - + std::exponential_distribution ptGen; }; @@ -79,9 +84,12 @@ int main() { auto zt_d = cuda::memory::device::make_unique(current_device, 64000); auto ezt2_d = cuda::memory::device::make_unique(current_device, 64000); + auto ptt2_d = cuda::memory::device::make_unique(current_device, 64000); auto zv_d = cuda::memory::device::make_unique(current_device, 256); auto wv_d = cuda::memory::device::make_unique(current_device, 256); auto chi2_d = cuda::memory::device::make_unique(current_device, 256); + auto ptv2_d = cuda::memory::device::make_unique(current_device, 256); + auto ind_d = cuda::memory::device::make_unique(current_device, 256); auto izt_d = cuda::memory::device::make_unique(current_device, 64000); auto nn_d = cuda::memory::device::make_unique(current_device, 64000); @@ -95,9 +103,12 @@ int main() { onGPU.zt = zt_d.get(); onGPU.ezt2 = ezt2_d.get(); + onGPU.ptt2 = ptt2_d.get(); onGPU.zv = zv_d.get(); onGPU.wv = wv_d.get(); onGPU.chi2 = chi2_d.get(); + onGPU.ptv2 = ptv2_d.get(); + onGPU.sortInd = ind_d.get(); onGPU.nv = nv_d.get(); onGPU.izt = izt_d.get(); onGPU.nn = nn_d.get(); @@ -123,6 +134,7 @@ int main() { cuda::memory::copy(onGPU.zt,ev.ztrack.data(),sizeof(float)*ev.ztrack.size()); cuda::memory::copy(onGPU.ezt2,ev.eztrack.data(),sizeof(float)*ev.eztrack.size()); + cuda::memory::copy(onGPU.ptt2,ev.pttrack.data(),sizeof(float)*ev.eztrack.size()); float eps = 0.1f; @@ -157,17 +169,25 @@ int main() { ); + cuda::launch(sortByPt2, + { 1, 256 }, + ev.ztrack.size(), onGPU_d.get() + ); uint32_t nv; cuda::memory::copy(&nv, onGPU.nv, sizeof(uint32_t)); float zv[nv]; float wv[nv]; float chi2[nv]; + float ptv2[nv]; int32_t nn[nv]; + uint16_t ind[nv]; cuda::memory::copy(&zv, onGPU.zv, nv*sizeof(float)); cuda::memory::copy(&wv, onGPU.wv, nv*sizeof(float)); cuda::memory::copy(&chi2, onGPU.chi2, nv*sizeof(float)); + cuda::memory::copy(&ptv2, onGPU.ptv2, nv*sizeof(float)); cuda::memory::copy(&nn, onGPU.nn, nv*sizeof(int32_t)); + cuda::memory::copy(&ind, onGPU.sortInd, nv*sizeof(uint16_t)); for (auto j=0U; j0) chi2[j]/=float(nn[j]); { @@ -178,7 +198,12 @@ int main() { auto mx = std::minmax_element(chi2,chi2+nv); std::cout << "min max chi2 " << *mx.first << ' ' << *mx.second << std::endl; } - + { + auto mx = std::minmax_element(ptv2,ptv2+nv); + std::cout << "min max ptv2 " << *mx.first << ' ' << *mx.second << std::endl; + std::cout << "min max ptv2 " << ptv2[ind[0]] << ' ' << ptv2[ind[nv-1]] << " at " << ind[0] << ' ' << ind[nv-1] << std::endl; + + } float dd[nv]; uint32_t ii=0; From f05df88e6a978e510458b0fcd306b858f8cec06d Mon Sep 17 00:00:00 2001 From: Vincenzo Innocente Date: Tue, 18 Sep 2018 14:37:22 +0200 Subject: [PATCH 10/36] make the number of bins configurable --- .../CUDAUtilities/interface/HistoContainer.h | 71 +++++++++++++++---- .../CUDAUtilities/test/BuildFile.xml | 1 + .../CUDAUtilities/test/HistoContainer_t.cpp | 42 +++++++---- .../CUDAUtilities/test/HistoContainer_t.cu | 2 +- .../plugins/gpuClustering.h | 17 ++--- .../SiPixelClusterizer/test/gpuClustering.cu | 41 +++++++++++ .../siPixelRecHitsHeterogeneousProduct.h | 2 +- 
.../PixelVertexFinding/src/gpuClusterTracks.h | 2 +- 8 files changed, 137 insertions(+), 41 deletions(-) diff --git a/HeterogeneousCore/CUDAUtilities/interface/HistoContainer.h b/HeterogeneousCore/CUDAUtilities/interface/HistoContainer.h index 94acd0015edf6..bd0f60a1d0122 100644 --- a/HeterogeneousCore/CUDAUtilities/interface/HistoContainer.h +++ b/HeterogeneousCore/CUDAUtilities/interface/HistoContainer.h @@ -31,7 +31,7 @@ namespace cudautils { template __global__ - void fillFromVector(Histo * h, uint32_t nh, T const * v, uint32_t * offsets) { + void fillFromVector(Histo * h, uint32_t nh, T const * __restrict__ v, uint32_t * offsets) { auto i = blockIdx.x * blockDim.x + threadIdx.x; if(i >= offsets[nh]) return; auto off = cuda_std::upper_bound(offsets, offsets + nh + 1, i); @@ -44,7 +44,7 @@ namespace cudautils { template __global__ - void fillFromVector(Histo * h, T const * v, uint32_t size) { + void fillFromVector(Histo * h, T const * __restrict__ v, uint32_t size) { auto i = blockIdx.x * blockDim.x + threadIdx.x; if(i < size) h->fill(v[i], i); } @@ -57,7 +57,7 @@ namespace cudautils { } template - void fillOneFromVector(Histo * h, T const * v, uint32_t size, int nthreads, cudaStream_t stream) { + void fillOneFromVector(Histo * h, T const * __restrict__ v, uint32_t size, int nthreads, cudaStream_t stream) { zero(h, 1, nthreads, stream); auto nblocks = (size + nthreads - 1) / nthreads; fillFromVector<<>>(h, v, size); @@ -65,7 +65,7 @@ namespace cudautils { } template - void fillManyFromVector(Histo * h, uint32_t nh, T const * v, uint32_t * offsets, uint32_t totSize, int nthreads, cudaStream_t stream) { + void fillManyFromVector(Histo * h, uint32_t nh, T const * __restrict__ v, uint32_t * offsets, uint32_t totSize, int nthreads, cudaStream_t stream) { zero(h, nh, nthreads, stream); auto nblocks = (totSize + nthreads - 1) / nthreads; fillFromVector<<>>(h, nh, v, offsets); @@ -80,15 +80,41 @@ namespace cudautils { // including spillBin template __host__ __device__ +__forceinline__ void forEachInBins(Hist const & hist, V value, int n, Func func) { int bs = hist.bin(value); int be = std::min(int(hist.nbins()),bs+n+1); bs = std::max(0,bs-n); - assert(be>bs); + // assert(be>bs); +// bool tbc=false; for (auto b=bs; b +__host__ __device__ +__forceinline__ +void forEachInWindow(Hist const & hist, V wmin, V wmax, Func const & func) { + auto bs = hist.bin(wmin); + auto be = hist.bin(wmax); + // be = std::min(int(hist.nbins()),be+1); + // bs = std::max(0,bs); + // assert(be>=bs); +// bool tbc=false; + for (auto b=bs; b<=be; ++b){ +// tbc |= hist.full(b); + for (auto pj=hist.begin(b);pj @@ -111,15 +137,31 @@ class HistoContainer { using index_type = I; using UT = typename std::make_unsigned::type; + + static constexpr uint32_t ilog2(uint32_t v) { + + constexpr uint32_t b[] = {0x2, 0xC, 0xF0, 0xFF00, 0xFFFF0000}; + constexpr uint32_t s[] = {1, 2, 4, 8, 16}; + + uint32_t r = 0; // result of log2(v) will go here + for (auto i = 4; i >= 0; i--) if (v & b[i]) { + v >>= s[i]; + r |= s[i]; + } + return r; + } + + static constexpr uint32_t sizeT() { return S; } - static constexpr uint32_t nbins() { return 1 << N; } - static constexpr uint32_t shift() { return sizeT() - N; } - static constexpr uint32_t mask() { return nbins() - 1; } + static constexpr uint32_t nbins() { return NBINS;} + static constexpr uint32_t nbits() { return ilog2(NBINS-1)+1;} static constexpr uint32_t binSize() { return 1 << M; } static constexpr uint32_t spillSize() { return 16 * binSize(); } static constexpr UT bin(T t) { - return (t >> 
shift()) & mask(); + constexpr uint32_t shift = sizeT() - nbits(); + constexpr uint32_t mask = (1<> shift) & mask; } void zero() { @@ -128,7 +170,8 @@ class HistoContainer { i = 0; } - static constexpr + static __host__ __device__ + __forceinline__ uint32_t atomicIncrement(Counter & x) { #ifdef __CUDA_ARCH__ return atomicAdd(&x, 1); @@ -152,11 +195,11 @@ class HistoContainer { } constexpr bool fullSpill() const { - return nspills >= spillSize(); + return nspills > spillSize(); } constexpr bool full(uint32_t b) const { - return n[b] >= binSize(); + return n[b] > binSize(); } constexpr auto const * begin(uint32_t b) const { diff --git a/HeterogeneousCore/CUDAUtilities/test/BuildFile.xml b/HeterogeneousCore/CUDAUtilities/test/BuildFile.xml index 9957266fc064b..794a7e519b21f 100644 --- a/HeterogeneousCore/CUDAUtilities/test/BuildFile.xml +++ b/HeterogeneousCore/CUDAUtilities/test/BuildFile.xml @@ -19,6 +19,7 @@ + diff --git a/HeterogeneousCore/CUDAUtilities/test/HistoContainer_t.cpp b/HeterogeneousCore/CUDAUtilities/test/HistoContainer_t.cpp index 61c561f96267d..03218d87a2208 100644 --- a/HeterogeneousCore/CUDAUtilities/test/HistoContainer_t.cpp +++ b/HeterogeneousCore/CUDAUtilities/test/HistoContainer_t.cpp @@ -6,41 +6,54 @@ #include #include -template +template void go() { std::mt19937 eng; - std::uniform_int_distribution rgen(std::numeric_limits::min(),std::numeric_limits::max()); + + int rmin=std::numeric_limits::min(); + int rmax=std::numeric_limits::max(); + if (NBINS!=128) { + rmin=0; + rmax=NBINS*2-1; + } + + + + std::uniform_int_distribution rgen(rmin,rmax); constexpr int N=12000; T v[N]; - using Hist = HistoContainer; - std::cout << "HistoContainer " << Hist::nbins() << ' ' << Hist::binSize() << std::endl; - + using Hist = HistoContainer; + std::cout << "HistoContainer " << Hist::nbits() << ' ' << Hist::nbins() << ' ' << Hist::binSize() << ' ' << (rmax-rmin)/Hist::nbins() << std::endl; + std::cout << "bins " << int(Hist::bin(0)) << ' ' << int(Hist::bin(rmin)) << ' ' << int(Hist::bin(rmax)) << std::endl; + Hist h; for (int it=0; it<5; ++it) { for (long long j = 0; j < N; j++) v[j]=rgen(eng); + if (it==2) for (long long j = N/2; j < N/2+2*Hist::binSize(); j++) v[j]=4; h.zero(); for (long long j = 0; j < N; j++) h.fill(v[j],j); - + std::cout << "nspills " << h.nspills << std::endl; - auto verify = [&](uint32_t i, uint32_t k, uint32_t t1, uint32_t t2) { + auto verify = [&](uint32_t i, uint32_t j, uint32_t k, uint32_t t1, uint32_t t2) { assert(t1=i); } // std::cout << kl << ' ' << kh << std::endl; - for (auto j=h.begin(kl); j(); + go(); + go(); + return 0; } diff --git a/HeterogeneousCore/CUDAUtilities/test/HistoContainer_t.cu b/HeterogeneousCore/CUDAUtilities/test/HistoContainer_t.cu index 73e9f20ae589f..98c36492bacdc 100644 --- a/HeterogeneousCore/CUDAUtilities/test/HistoContainer_t.cu +++ b/HeterogeneousCore/CUDAUtilities/test/HistoContainer_t.cu @@ -34,7 +34,7 @@ void go() { constexpr uint32_t partSize = N/nParts; uint32_t offsets[nParts+1]; - using Hist = HistoContainer; + using Hist = HistoContainer; std::cout << "HistoContainer " << Hist::nbins() << ' ' << Hist::binSize() << ' ' << (std::numeric_limits::max()-std::numeric_limits::min())/Hist::nbins() << std::endl; Hist h[nParts]; diff --git a/RecoLocalTracker/SiPixelClusterizer/plugins/gpuClustering.h b/RecoLocalTracker/SiPixelClusterizer/plugins/gpuClustering.h index 16df3abdf8df9..973771e13ac01 100644 --- a/RecoLocalTracker/SiPixelClusterizer/plugins/gpuClustering.h +++ 
b/RecoLocalTracker/SiPixelClusterizer/plugins/gpuClustering.h @@ -6,6 +6,7 @@ #include #include "gpuClusteringConstants.h" +#include "Geometry/TrackerGeometryBuilder/interface/phase1PixelTopology.h" #include "HeterogeneousCore/CUDAUtilities/interface/HistoContainer.h" @@ -75,8 +76,9 @@ namespace gpuClustering { } } - //init hist (ymax < 512) - __shared__ HistoContainer hist; + //init hist (ymax=416 < 512 : 9bits) + constexpr auto nbins = phase1PixelTopology::numColsInModule/2+2; + __shared__ HistoContainer hist; hist.nspills = 0; for (auto k = threadIdx.x; k0 ? y[i]-1 : 0); - auto be = hist.bin(y[i]+1)+1; auto loop = [&](int j) { j+=firstPixel; if (i>=j or j>jm or @@ -145,12 +145,7 @@ namespace gpuClustering { // update the loop boundary for the next iteration jmax[k] = std::max(j + 1,jmax[k]); }; - for (auto b=bs; b0 ? y[i]-1 :0 ,y[i]+1,loop); } // pixel loop } // end active } // end while diff --git a/RecoLocalTracker/SiPixelClusterizer/test/gpuClustering.cu b/RecoLocalTracker/SiPixelClusterizer/test/gpuClustering.cu index af20ae17c745a..3e373663bab85 100644 --- a/RecoLocalTracker/SiPixelClusterizer/test/gpuClustering.cu +++ b/RecoLocalTracker/SiPixelClusterizer/test/gpuClustering.cu @@ -77,6 +77,47 @@ int main(void) h_y[n]=x; h_adc[n]= kn==0 ? 100 : 5000; ++n; + + // first column + ++ncl; + h_id[n]=id; + h_x[n]=x; + h_y[n]=0; + h_adc[n]= 5000; + ++n; + // first columns + ++ncl; + h_id[n]=id; + h_x[n]=x+80; + h_y[n]=2; + h_adc[n]= 5000; + ++n; + h_id[n]=id; + h_x[n]=x+80; + h_y[n]=1; + h_adc[n]= 5000; + ++n; + + // last column + ++ncl; + h_id[n]=id; + h_x[n]=x; + h_y[n]=415; + h_adc[n]= 5000; + ++n; + // last columns + ++ncl; + h_id[n]=id; + h_x[n]=x+80; + h_y[n]=415; + h_adc[n]= 2500; + ++n; + h_id[n]=id; + h_x[n]=x+80; + h_y[n]=414; + h_adc[n]= 2500; + ++n; + // diagonal ++ncl; for (int x=20; x<25; ++x) { diff --git a/RecoLocalTracker/SiPixelRecHits/plugins/siPixelRecHitsHeterogeneousProduct.h b/RecoLocalTracker/SiPixelRecHits/plugins/siPixelRecHitsHeterogeneousProduct.h index 778b763b28cff..a8b97e988e5e6 100644 --- a/RecoLocalTracker/SiPixelRecHits/plugins/siPixelRecHitsHeterogeneousProduct.h +++ b/RecoLocalTracker/SiPixelRecHits/plugins/siPixelRecHitsHeterogeneousProduct.h @@ -32,7 +32,7 @@ namespace siPixelRecHitsHeterogeneousProduct { uint16_t * mr_d; uint16_t * mc_d; - using Hist = HistoContainer; + using Hist = HistoContainer; Hist * hist_d; HitsOnGPU const * me_d = nullptr; diff --git a/RecoPixelVertexing/PixelVertexFinding/src/gpuClusterTracks.h b/RecoPixelVertexing/PixelVertexFinding/src/gpuClusterTracks.h index 8d4c4d967a6b8..062c802ed83a4 100644 --- a/RecoPixelVertexing/PixelVertexFinding/src/gpuClusterTracks.h +++ b/RecoPixelVertexing/PixelVertexFinding/src/gpuClusterTracks.h @@ -80,7 +80,7 @@ namespace gpuVertexFinder { assert(pdata); assert(zt); - __shared__ HistoContainer hist; + __shared__ HistoContainer hist; // if(0==threadIdx.x) printf("params %d %f\n",minT,eps); // if(0==threadIdx.x) printf("booked hist with %d bins, size %d for %d tracks\n",hist.nbins(),hist.binSize(),nt); From 304c6077da0a991c9b1a84e351679f56cd704f79 Mon Sep 17 00:00:00 2001 From: Vincenzo Innocente Date: Tue, 18 Sep 2018 16:03:09 +0200 Subject: [PATCH 11/36] fix race --- .../CUDAUtilities/interface/HistoContainer.h | 46 ++++++++++++------- .../CUDAUtilities/test/HistoContainer_t.cpp | 2 +- .../CUDAUtilities/test/HistoContainer_t.cu | 4 +- .../plugins/gpuClusterChargeCut.h | 1 + .../plugins/gpuClustering.h | 4 +- .../PixelTriplets/plugins/gpuPixelDoublets.h | 6 +-- 
.../PixelVertexFinding/src/gpuClusterTracks.h | 3 +- 7 files changed, 39 insertions(+), 27 deletions(-) diff --git a/HeterogeneousCore/CUDAUtilities/interface/HistoContainer.h b/HeterogeneousCore/CUDAUtilities/interface/HistoContainer.h index bd0f60a1d0122..e7bcaf3422945 100644 --- a/HeterogeneousCore/CUDAUtilities/interface/HistoContainer.h +++ b/HeterogeneousCore/CUDAUtilities/interface/HistoContainer.h @@ -20,11 +20,10 @@ namespace cudautils { __global__ void zeroMany(Histo * h, uint32_t nh) { auto i = blockIdx.x * blockDim.x + threadIdx.x; - auto ih = i / Histo::nbins(); - auto k = i - ih * Histo::nbins(); + auto ih = i / Histo::totbins(); + auto k = i - ih * Histo::totbins(); if (ih < nh) { - h[ih].nspills = 0; - if (k < Histo::nbins()) + if (k < Histo::totbins()) h[ih].n[k] = 0; } } @@ -51,7 +50,7 @@ namespace cudautils { template void zero(Histo * h, uint32_t nh, int nthreads, cudaStream_t stream) { - auto nblocks = (nh * Histo::nbins() + nthreads - 1) / nthreads; + auto nblocks = (nh * Histo::totbins() + nthreads - 1) / nthreads; zeroMany<<>>(h, nh); cudaCheck(cudaGetLastError()); } @@ -154,6 +153,7 @@ class HistoContainer { static constexpr uint32_t sizeT() { return S; } static constexpr uint32_t nbins() { return NBINS;} + static constexpr uint32_t totbins() { return NBINS+1;} // including spillbin static constexpr uint32_t nbits() { return ilog2(NBINS-1)+1;} static constexpr uint32_t binSize() { return 1 << M; } static constexpr uint32_t spillSize() { return 16 * binSize(); } @@ -165,7 +165,6 @@ class HistoContainer { } void zero() { - nspills = 0; for (auto & i : n) i = 0; } @@ -181,6 +180,7 @@ class HistoContainer { } __host__ __device__ + __forceinline__ void fill(T t, index_type j) { UT b = bin(t); assert(b spillSize(); + return nspills() > spillSize(); } constexpr bool full(uint32_t b) const { @@ -211,21 +216,30 @@ class HistoContainer { } constexpr auto size(uint32_t b) const { - return n[b]; + return uint32_t(n[b]); + } + + constexpr auto const * spillBin() const { + return bins + nbins()*binSize(); } constexpr auto const * beginSpill() const { - return spillBin; + return spillBin(); } constexpr auto const * endSpill() const { - return beginSpill() + std::min(spillSize(), uint32_t(nspills)); + return beginSpill() + std::min(spillSize(), uint32_t(nspills())); } - Counter n[nbins()]; - Counter nspills; - index_type bins[nbins()*binSize()]; - index_type spillBin[spillSize()]; + Counter n[nbins()+1]; // last is the spill bin + index_type bins[nbins()*binSize()+spillSize()]; +}; + +// a compactified version of above resuing the very same space +template +class CompactHistoContainer { + + }; #endif // HeterogeneousCore_CUDAUtilities_HistoContainer_h diff --git a/HeterogeneousCore/CUDAUtilities/test/HistoContainer_t.cpp b/HeterogeneousCore/CUDAUtilities/test/HistoContainer_t.cpp index 03218d87a2208..190a4dc18e3fa 100644 --- a/HeterogeneousCore/CUDAUtilities/test/HistoContainer_t.cpp +++ b/HeterogeneousCore/CUDAUtilities/test/HistoContainer_t.cpp @@ -36,7 +36,7 @@ void go() { h.zero(); for (long long j = 0; j < N; j++) h.fill(v[j],j); - std::cout << "nspills " << h.nspills << std::endl; + std::cout << "nspills " << h.nspills() << std::endl; auto verify = [&](uint32_t i, uint32_t j, uint32_t k, uint32_t t1, uint32_t t2) { assert(t1=nm)) { + if (h[j].nspills()==0 && !(tot>=nm)) { std::cout << "too bad " << j << ' ' << i <<' ' << me << '/'<< T(me-window)<< '/'<< T(me+window) << ": " << kl << '/' << kh << ' '<< khh << ' '<< tot<<'/'< hist; - hist.nspills = 0; - for (auto k = 
threadIdx.x; k 0) printf("OuterHitOfCell full for %d in layer %d/%d, %d:%d %d,%d\n", i, inner, outer, kl, kh, nmin, tot); - if (hist[outer].nspills > 0) - printf("spill bin to be checked in %d %d\n", outer, hist[outer].nspills); + if (hist[outer].nspills() > 0) + printf("spill bin to be checked in %d %d\n", outer, hist[outer].nspills()); - // if (0==hist[outer].nspills) assert(tot>=nmin); + // if (0==hist[outer].nspills()) assert(tot>=nmin); // look in spill bin as well.... } // loop in block... diff --git a/RecoPixelVertexing/PixelVertexFinding/src/gpuClusterTracks.h b/RecoPixelVertexing/PixelVertexFinding/src/gpuClusterTracks.h index 062c802ed83a4..54121d5babfb5 100644 --- a/RecoPixelVertexing/PixelVertexFinding/src/gpuClusterTracks.h +++ b/RecoPixelVertexing/PixelVertexFinding/src/gpuClusterTracks.h @@ -86,8 +86,7 @@ namespace gpuVertexFinder { // if(0==threadIdx.x) printf("booked hist with %d bins, size %d for %d tracks\n",hist.nbins(),hist.binSize(),nt); // zero hist - hist.nspills = 0; - for (auto k = threadIdx.x; k Date: Wed, 19 Sep 2018 15:31:14 +0200 Subject: [PATCH 12/36] compactify the histogram (to make itaration faster?) --- .../CUDAUtilities/interface/HistoContainer.h | 89 +++++++++++++++++++ .../CUDAUtilities/test/BuildFile.xml | 5 ++ .../CUDAUtilities/test/prefixScan_t.cu | 6 +- 3 files changed, 97 insertions(+), 3 deletions(-) diff --git a/HeterogeneousCore/CUDAUtilities/interface/HistoContainer.h b/HeterogeneousCore/CUDAUtilities/interface/HistoContainer.h index e7bcaf3422945..d85bc761df151 100644 --- a/HeterogeneousCore/CUDAUtilities/interface/HistoContainer.h +++ b/HeterogeneousCore/CUDAUtilities/interface/HistoContainer.h @@ -10,6 +10,9 @@ #endif // __CUDA_ARCH__ #include "HeterogeneousCore/CUDAUtilities/interface/cudastdAlgorithm.h" +#ifdef __CUDACC__ +#include "HeterogeneousCore/CUDAUtilities/interface/prefixScan.h" +#endif #include "HeterogeneousCore/CUDAUtilities/interface/cudaCheck.h" @@ -119,6 +122,23 @@ void forEachInWindow(Hist const & hist, V wmin, V wmax, Func const & func) { } +// same as above but for compactified histos +template +__host__ __device__ +__forceinline__ +void forEachInWindowCompact(Hist const & hist, V wmin, V wmax, Func const & func) { + auto bs = hist.getH().bin(wmin); + auto be = hist.getH().bin(wmax); + assert(be>=bs); + for (auto pj=hist.begin(bs);pj class CompactHistoContainer { +public: + + using index_type = typename H::index_type; + + static constexpr auto wsSize() { return std::max(H::spillSize(),32U);} + __host__ __device__ + __forceinline__ + H & getH() { return histo;} + + __host__ __device__ + __forceinline__ + H const & getH() const { return histo;} + + +#ifdef __CUDACC__ + __device__ + __forceinline__ + void compactify(typename H::Counter * ws) { + auto & h = histo; + // fix size + for (auto j=threadIdx.x; j=0); + assert(b<=h.begin(i)-h.begin(0)); + if (i + + + + + diff --git a/HeterogeneousCore/CUDAUtilities/test/prefixScan_t.cu b/HeterogeneousCore/CUDAUtilities/test/prefixScan_t.cu index 3e8645439a28f..e109f6672bc32 100644 --- a/HeterogeneousCore/CUDAUtilities/test/prefixScan_t.cu +++ b/HeterogeneousCore/CUDAUtilities/test/prefixScan_t.cu @@ -13,9 +13,9 @@ void testPrefixScan(uint32_t size) { blockPrefixScan(c, size, ws); assert(1==c[0]); - for (auto i=first+1; i Date: Thu, 20 Sep 2018 12:14:08 +0200 Subject: [PATCH 13/36] not faster, at least avoid spill --- .../interface/FancyHistoContainer.h | 172 ++++++++++++++++++ .../CUDAUtilities/interface/HistoContainer.h | 6 +- .../CUDAUtilities/test/HistoContainer_t.cpp | 26 ++- 
.../SiPixelClusterizer/test/BuildFile.xml | 4 +- .../SiPixelClusterizer/test/gpuClustering.cu | 4 +- 5 files changed, 195 insertions(+), 17 deletions(-) create mode 100644 HeterogeneousCore/CUDAUtilities/interface/FancyHistoContainer.h diff --git a/HeterogeneousCore/CUDAUtilities/interface/FancyHistoContainer.h b/HeterogeneousCore/CUDAUtilities/interface/FancyHistoContainer.h new file mode 100644 index 0000000000000..f5e669ca3d7ef --- /dev/null +++ b/HeterogeneousCore/CUDAUtilities/interface/FancyHistoContainer.h @@ -0,0 +1,172 @@ +#ifndef HeterogeneousCore_CUDAUtilities_HistoContainer_h +#define HeterogeneousCore_CUDAUtilities_HistoContainer_h + +#include +#include +#include +#include +#ifndef __CUDA_ARCH__ +#include +#endif // __CUDA_ARCH__ + +#include "HeterogeneousCore/CUDAUtilities/interface/cudastdAlgorithm.h" +#ifdef __CUDACC__ +#include "HeterogeneousCore/CUDAUtilities/interface/prefixScan.h" +#endif +#include "HeterogeneousCore/CUDAUtilities/interface/cudaCheck.h" + + +#ifdef __CUDACC__ +namespace cudautils { + + template + __global__ + void zeroMany(Histo * h, uint32_t nh) { + auto i = blockIdx.x * blockDim.x + threadIdx.x; + auto ih = i / Histo::totbins(); + auto k = i - ih * Histo::totbins(); + if (ih < nh) { + if (k < Histo::totbins()) + h[ih].n[k] = 0; + } + } + +} // namespace cudautils +#endif + + +// iteratate over N bins left and right of the one containing "v" +template +__host__ __device__ +__forceinline__ +void forEachInBins(Hist const & hist, V value, int n, Func func) { + int bs = Hist::bin(value); + int be = std::min(int(Hist::nbins()-1),bs+n); + bs = std::max(0,bs-n); + assert(be>=bs); + for (auto pj=hist.begin(bs);pj +__host__ __device__ +__forceinline__ +void forEachInWindow(Hist const & hist, V wmin, V wmax, Func const & func) { + auto bs = Hist::bin(wmin); + auto be = Hist::bin(wmax); + assert(be>=bs); + for (auto pj=hist.begin(bs);pj +class HistoContainer { +public: +#ifdef __CUDACC__ + using Counter = uint32_t; +#else + using Counter = std::atomic; +#endif + + using index_type = I; + using UT = typename std::make_unsigned::type; + + static constexpr uint32_t ilog2(uint32_t v) { + + constexpr uint32_t b[] = {0x2, 0xC, 0xF0, 0xFF00, 0xFFFF0000}; + constexpr uint32_t s[] = {1, 2, 4, 8, 16}; + + uint32_t r = 0; // result of log2(v) will go here + for (auto i = 4; i >= 0; i--) if (v & b[i]) { + v >>= s[i]; + r |= s[i]; + } + return r; + } + + + static constexpr uint32_t sizeT() { return S; } + static constexpr uint32_t nbins() { return NBINS;} + static constexpr uint32_t totbins() { return NBINS+1;} + static constexpr uint32_t nbits() { return ilog2(NBINS-1)+1;} + static constexpr uint32_t capacity() { return SIZE; } + + static constexpr UT bin(T t) { + constexpr uint32_t shift = sizeT() - nbits(); + constexpr uint32_t mask = (1<> shift) & mask; + } + + void zero() { + for (auto & i : off) + i = 0; + } + + static __host__ __device__ + __forceinline__ + uint32_t atomicIncrement(Counter & x) { + #ifdef __CUDA_ARCH__ + return atomicAdd(&x, 1); + #else + return x++; + #endif + } + + __host__ __device__ + __forceinline__ + void count(T t) { + UT b = bin(t); + assert(b=0); - assert(b<=h.begin(i)-h.begin(0)); - if (i=0); +// assert(b<=h.begin(i)-h.begin(0)); +// if (i #include @@ -25,18 +25,25 @@ void go() { constexpr int N=12000; T v[N]; - using Hist = HistoContainer; - std::cout << "HistoContainer " << Hist::nbits() << ' ' << Hist::nbins() << ' ' << Hist::binSize() << ' ' << (rmax-rmin)/Hist::nbins() << std::endl; + using Hist = HistoContainer; + std::cout << 
"HistoContainer " << Hist::nbits() << ' ' << Hist::nbins() << ' ' << Hist::capacity() << ' ' << (rmax-rmin)/Hist::nbins() << std::endl; std::cout << "bins " << int(Hist::bin(0)) << ' ' << int(Hist::bin(rmin)) << ' ' << int(Hist::bin(rmax)) << std::endl; Hist h; + typename Hist::Counter ws[Hist::totbins()]; for (int it=0; it<5; ++it) { for (long long j = 0; j < N; j++) v[j]=rgen(eng); - if (it==2) for (long long j = N/2; j < N/2+2*Hist::binSize(); j++) v[j]=4; + if (it==2) for (long long j = N/2; j < N/2+N/4; j++) v[j]=4; h.zero(); - for (long long j = 0; j < N; j++) h.fill(v[j],j); - - std::cout << "nspills " << h.nspills() << std::endl; + assert(h.size()==0); + for (auto & i: ws) i=0; + for (long long j = 0; j < N; j++) h.count(v[j]); + h.finalize(); + assert(h.off[0]==0); + assert(h.size()==N); + for (long long j = 0; j < N; j++) h.fill(v[j],j,ws); + assert(h.size()==N); + auto verify = [&](uint32_t i, uint32_t j, uint32_t k, uint32_t t1, uint32_t t2) { assert(t1 - - + + diff --git a/RecoLocalTracker/SiPixelClusterizer/test/gpuClustering.cu b/RecoLocalTracker/SiPixelClusterizer/test/gpuClustering.cu index 3e373663bab85..f517a7e7731bd 100644 --- a/RecoLocalTracker/SiPixelClusterizer/test/gpuClustering.cu +++ b/RecoLocalTracker/SiPixelClusterizer/test/gpuClustering.cu @@ -11,7 +11,7 @@ #include -#include "RecoLocalTracker/SiPixelClusterizer/plugins/gpuClustering.h" +#include "RecoLocalTracker/SiPixelClusterizer/plugins/gpuFancyClustering.h" #include "RecoLocalTracker/SiPixelClusterizer/plugins/gpuClusterChargeCut.h" int main(void) @@ -271,7 +271,7 @@ int main(void) cuda::memory::copy(&nclus,d_clusInModule.get(),MaxNumModules*sizeof(uint32_t)); std::cout << "before charge cut found " << std::accumulate(nclus,nclus+MaxNumModules,0) << " clusters" << std::endl; for (auto i=MaxNumModules; i>0; i--) if (nclus[i-1]>0) {std::cout << "last module is " << i-1 << ' ' << nclus[i-1] << std::endl; break;} - + if (ncl!=std::accumulate(nclus,nclus+MaxNumModules,0)) std::cout << "ERROR!!!!! 
wrong number of cluster found" << std::endl; cuda::launch( clusterChargeCut, From 81a15e6e5a2560739130d55723c0c3e56c1060db Mon Sep 17 00:00:00 2001 From: Vincenzo Innocente Date: Thu, 20 Sep 2018 12:14:46 +0200 Subject: [PATCH 14/36] will need some cleanup --- .../test/NaiveHistoContainer_t.cpp | 97 ++++++++ .../CUDAUtilities/test/OneHistoContainer_t.cu | 147 +++++++++++ .../plugins/gpuFancyClustering.h | 232 ++++++++++++++++++ 3 files changed, 476 insertions(+) create mode 100644 HeterogeneousCore/CUDAUtilities/test/NaiveHistoContainer_t.cpp create mode 100644 HeterogeneousCore/CUDAUtilities/test/OneHistoContainer_t.cu create mode 100644 RecoLocalTracker/SiPixelClusterizer/plugins/gpuFancyClustering.h diff --git a/HeterogeneousCore/CUDAUtilities/test/NaiveHistoContainer_t.cpp b/HeterogeneousCore/CUDAUtilities/test/NaiveHistoContainer_t.cpp new file mode 100644 index 0000000000000..190a4dc18e3fa --- /dev/null +++ b/HeterogeneousCore/CUDAUtilities/test/NaiveHistoContainer_t.cpp @@ -0,0 +1,97 @@ +#include "HeterogeneousCore/CUDAUtilities/interface/HistoContainer.h" + +#include +#include +#include +#include +#include + +template +void go() { + std::mt19937 eng; + + int rmin=std::numeric_limits::min(); + int rmax=std::numeric_limits::max(); + if (NBINS!=128) { + rmin=0; + rmax=NBINS*2-1; + } + + + + std::uniform_int_distribution rgen(rmin,rmax); + + + constexpr int N=12000; + T v[N]; + + using Hist = HistoContainer; + std::cout << "HistoContainer " << Hist::nbits() << ' ' << Hist::nbins() << ' ' << Hist::binSize() << ' ' << (rmax-rmin)/Hist::nbins() << std::endl; + std::cout << "bins " << int(Hist::bin(0)) << ' ' << int(Hist::bin(rmin)) << ' ' << int(Hist::bin(rmax)) << std::endl; + + Hist h; + for (int it=0; it<5; ++it) { + for (long long j = 0; j < N; j++) v[j]=rgen(eng); + if (it==2) for (long long j = N/2; j < N/2+2*Hist::binSize(); j++) v[j]=4; + h.zero(); + for (long long j = 0; j < N; j++) h.fill(v[j],j); + + std::cout << "nspills " << h.nspills() << std::endl; + + auto verify = [&](uint32_t i, uint32_t j, uint32_t k, uint32_t t1, uint32_t t2) { + assert(t1=i); } + // std::cout << kl << ' ' << kh << std::endl; + for (auto j=h.begin(kl); j=0 && k=0) rtot += h.end(bm)-h.begin(bm); + assert(tot==rtot); + w=2; tot=0; + forEachInBins(h,v[j],w,ftest); + bp++; + bm--; + if (bp=0) rtot += h.end(bm)-h.begin(bm); + assert(tot==rtot); + } + +} + +int main() { + go(); + go(); + go(); + + + return 0; +} diff --git a/HeterogeneousCore/CUDAUtilities/test/OneHistoContainer_t.cu b/HeterogeneousCore/CUDAUtilities/test/OneHistoContainer_t.cu new file mode 100644 index 0000000000000..9d520929a20c0 --- /dev/null +++ b/HeterogeneousCore/CUDAUtilities/test/OneHistoContainer_t.cu @@ -0,0 +1,147 @@ +#include "HeterogeneousCore/CUDAUtilities/interface/FancyHistoContainer.h" + +#include +#include +#include +#include +#include + +#include + +template +__global__ +void mykernel(T const * __restrict__ v, uint32_t N) { + + assert(v); + assert(N==12000); + + if (threadIdx.x==0) printf("start kernel for %d data\n",N); + + using Hist = HistoContainer; + constexpr auto wss = Hist::totbins(); + + if (threadIdx.x==0) printf("ws size %d\n",wss); + + __shared__ Hist hist; + __shared__ typename Hist::Counter ws[wss]; + + for (auto j=threadIdx.x; j=k1); + } + + for (auto i=threadIdx.x; i=0 && k::max(); + vm = std::max(vm, 0); + vm = std::min(vm,vmax); + vp = std::min(vp,vmax); + vp = std::max(vp, 0); + assert(vp>=vm); + forEachInWindow(hist, vm,vp, ftest); + int bp = Hist::bin(vp); + int bm = Hist::bin(vm); + rtot = 
hist.end(bp)-hist.begin(bm); + assert(tot==rtot); + } + + +} + +template +void go() { + + if (cuda::device::count() == 0) { + std::cerr << "No CUDA devices on this system" << "\n"; + exit(EXIT_FAILURE); + } + + auto current_device = cuda::device::current::get(); + + + std::mt19937 eng; + + int rmin=std::numeric_limits::min(); + int rmax=std::numeric_limits::max(); + if (NBINS!=128) { + rmin=0; + rmax=NBINS*2-1; + } + + + + std::uniform_int_distribution rgen(rmin,rmax); + + + constexpr int N=12000; + T v[N]; + + auto v_d = cuda::memory::device::make_unique(current_device, N); + assert(v_d.get()); + + using Hist = HistoContainer; + std::cout << "HistoContainer " << Hist::nbits() << ' ' << Hist::nbins() << ' ' << Hist::capacity() << ' ' << (rmax-rmin)/Hist::nbins() << std::endl; + std::cout << "bins " << int(Hist::bin(0)) << ' ' << int(Hist::bin(rmin)) << ' ' << int(Hist::bin(rmax)) << std::endl; + + for (int it=0; it<5; ++it) { + for (long long j = 0; j < N; j++) v[j]=rgen(eng); + if (it==2) for (long long j = N/2; j < N/2+N/4; j++) v[j]=4; + + assert(v_d.get()); + assert(v); + cuda::memory::copy(v_d.get(), v, N*sizeof(T)); + assert(v_d.get()); + cuda::launch(mykernel,{1,256},v_d.get(),N); + } + +} + +int main() { + go(); + go(); + go(); + + + return 0; +} diff --git a/RecoLocalTracker/SiPixelClusterizer/plugins/gpuFancyClustering.h b/RecoLocalTracker/SiPixelClusterizer/plugins/gpuFancyClustering.h new file mode 100644 index 0000000000000..a4d5c0f7a864a --- /dev/null +++ b/RecoLocalTracker/SiPixelClusterizer/plugins/gpuFancyClustering.h @@ -0,0 +1,232 @@ +#ifndef RecoLocalTracker_SiPixelClusterizer_plugins_gpuClustering_h +#define RecoLocalTracker_SiPixelClusterizer_plugins_gpuClustering_h + + +// #define CLUS_LIMIT_LOOP + +#include +#include +#include + +#include "gpuClusteringConstants.h" +#include "Geometry/TrackerGeometryBuilder/interface/phase1PixelTopology.h" + +#include "HeterogeneousCore/CUDAUtilities/interface/FancyHistoContainer.h" + +namespace gpuClustering { + + __global__ void countModules(uint16_t const * __restrict__ id, + uint32_t * __restrict__ moduleStart, + int32_t * __restrict__ clusterId, + int numElements) + { + int i = blockDim.x * blockIdx.x + threadIdx.x; + if (i >= numElements) + return; + clusterId[i] = i; + if (InvId == id[i]) + return; + auto j = i - 1; + while (j >= 0 and id[j] == InvId) + --j; + if (j < 0 or id[j] != id[i]) { + // boundary... 
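      // editor's note: when the nearest preceding valid pixel belongs to a different
      // module (or there is none), this pixel opens a new module. atomicInc on
      // moduleStart[0] (bounded by MaxNumModules) reserves a slot, and
      // moduleStart[1 + loc] records the index of the module's first pixel, so that
      // moduleStart[0] ends up holding the number of modules found.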
+ auto loc = atomicInc(moduleStart, MaxNumModules); + moduleStart[loc + 1] = i; + } + } + + __global__ void findClus(uint16_t const * __restrict__ id, // module id of each pixel + uint16_t const * __restrict__ x, // local coordinates of each pixel + uint16_t const * __restrict__ y, // + uint32_t const * __restrict__ moduleStart, // index of the first pixel of each module + uint32_t * __restrict__ nClustersInModule, // output: number of clusters found in each module + uint32_t * __restrict__ moduleId, // output: module id of each module + int32_t * __restrict__ clusterId, // output: cluster id of each pixel + int numElements) + { + + if (blockIdx.x >= moduleStart[0]) + return; + + auto firstPixel = moduleStart[1 + blockIdx.x]; + auto thisModuleId = id[firstPixel]; + assert(thisModuleId < MaxNumModules); + +#ifdef GPU_DEBUG + if (thisModuleId % 100 == 1) + if (threadIdx.x == 0) + printf("start clusterizer for module %d in block %d\n", thisModuleId, blockIdx.x); +#endif + + auto first = firstPixel + threadIdx.x; + + // find the index of the first pixel not belonging to this module (or invalid) + __shared__ int msize; + msize = numElements; + __syncthreads(); + + // skip threads not associated to an existing pixel + for (int i = first; i < numElements; i += blockDim.x) { + if (id[i] == InvId) // skip invalid pixels + continue; + if (id[i] != thisModuleId) { // find the first pixel in a different module + atomicMin(&msize, i); + break; + } + } + + //init hist (ymax=416 < 512 : 9bits) + constexpr uint32_t maxPixInModule = 4000; + constexpr auto nbins = phase1PixelTopology::numColsInModule/2+2; + using Hist = HistoContainer; + constexpr auto wss = Hist::totbins(); + __shared__ Hist hist; + __shared__ typename Hist::Counter ws[wss]; + for (auto j=threadIdx.x; j 1 or + std::abs(int(y[m]) - int(y[i])) > 1) return; + auto old = atomicMin(&clusterId[m], clusterId[i]); + if (old != clusterId[i]) { + // end the loop only if no changes were applied + done = false; + } + atomicMin(&clusterId[i], old); +#ifdef CLUS_LIMIT_LOOP + // update the loop boundary for the next iteration + jmax[k] = std::max(kk + 1,jmax[k]); +#endif + }; + ++p; + for (;p= 0) { + // mark each pixel in a cluster with the same id as the first one + clusterId[i] = clusterId[clusterId[i]]; + } + } + __syncthreads(); + + // adjust the cluster id to be a positive value starting from 0 + for (int i = first; i < msize; i += blockDim.x) { + if (id[i] == InvId) { // skip invalid pixels + clusterId[i] = -9999; + continue; + } + clusterId[i] = - clusterId[i] - 1; + } + __syncthreads(); + + if (threadIdx.x == 0) { + nClustersInModule[thisModuleId] = foundClusters; + moduleId[blockIdx.x] = thisModuleId; +#ifdef GPU_DEBUG + if (thisModuleId % 100 == 1) + if (threadIdx.x == 0) + printf("%d clusters in module %d\n", foundClusters, thisModuleId); +#endif + } + } + +} // namespace gpuClustering + +#endif // RecoLocalTracker_SiPixelClusterizer_plugins_gpuClustering_h From 8eb1210d815a835de36d9651ebf272ec6c774a9b Mon Sep 17 00:00:00 2001 From: Vincenzo Innocente Date: Thu, 20 Sep 2018 13:13:28 +0200 Subject: [PATCH 15/36] clean hist header --- .../CUDAUtilities/interface/HistoContainer.h | 96 ------------------- .../plugins/gpuFancyClustering.h | 4 +- .../SiPixelClusterizer/test/gpuClustering.cu | 5 +- 3 files changed, 5 insertions(+), 100 deletions(-) diff --git a/HeterogeneousCore/CUDAUtilities/interface/HistoContainer.h b/HeterogeneousCore/CUDAUtilities/interface/HistoContainer.h index ad446ad27c1bc..fb346c1406cb5 100644 --- 
a/HeterogeneousCore/CUDAUtilities/interface/HistoContainer.h +++ b/HeterogeneousCore/CUDAUtilities/interface/HistoContainer.h @@ -10,9 +10,6 @@ #endif // __CUDA_ARCH__ #include "HeterogeneousCore/CUDAUtilities/interface/cudastdAlgorithm.h" -#ifdef __CUDACC__ -#include "HeterogeneousCore/CUDAUtilities/interface/prefixScan.h" -#endif #include "HeterogeneousCore/CUDAUtilities/interface/cudaCheck.h" @@ -122,23 +119,6 @@ void forEachInWindow(Hist const & hist, V wmin, V wmax, Func const & func) { } -// same as above but for compactified histos -template -__host__ __device__ -__forceinline__ -void forEachInWindowCompact(Hist const & hist, V wmin, V wmax, Func const & func) { - auto bs = hist.getH().bin(wmin); - auto be = hist.getH().bin(wmax); - assert(be>=bs); - for (auto pj=hist.begin(bs);pj -class CompactHistoContainer { - -public: - - using index_type = typename H::index_type; - - static constexpr auto wsSize() { return std::max(H::spillSize(),32U);} - - __host__ __device__ - __forceinline__ - H & getH() { return histo;} - - __host__ __device__ - __forceinline__ - H const & getH() const { return histo;} - - -#ifdef __CUDACC__ - __device__ - __forceinline__ - void compactify(typename H::Counter * ws) { - auto & h = histo; - // fix size - for (auto j=threadIdx.x; j=0); -// assert(b<=h.begin(i)-h.begin(0)); -// if (i Date: Thu, 20 Sep 2018 13:44:56 +0200 Subject: [PATCH 16/36] use one bin per column --- .../SiPixelClusterizer/plugins/gpuFancyClustering.h | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/RecoLocalTracker/SiPixelClusterizer/plugins/gpuFancyClustering.h b/RecoLocalTracker/SiPixelClusterizer/plugins/gpuFancyClustering.h index 9783b8e1b08be..ab58d7d6154d3 100644 --- a/RecoLocalTracker/SiPixelClusterizer/plugins/gpuFancyClustering.h +++ b/RecoLocalTracker/SiPixelClusterizer/plugins/gpuFancyClustering.h @@ -80,7 +80,7 @@ namespace gpuClustering { //init hist (ymax=416 < 512 : 9bits) constexpr uint32_t maxPixInModule = 4000; - constexpr auto nbins = phase1PixelTopology::numColsInModule/2+2; + constexpr auto nbins = phase1PixelTopology::numColsInModule + 2; //2+2; using Hist = HistoContainer; constexpr auto wss = Hist::totbins(); __shared__ Hist hist; @@ -163,8 +163,8 @@ namespace gpuClustering { #ifdef GPU_DEBUG assert(m!=i); #endif - if (std::abs(int(x[m]) - int(x[i])) > 1 or - std::abs(int(y[m]) - int(y[i])) > 1) return; + if (std::abs(int(x[m]) - int(x[i])) > 1) return; + // if (std::abs(int(y[m]) - int(y[i])) > 1) return; // binssize is 1 auto old = atomicMin(&clusterId[m], clusterId[i]); if (old != clusterId[i]) { // end the loop only if no changes were applied From 25b24c5a5142a5e04886cf18be0556876b46a0f6 Mon Sep 17 00:00:00 2001 From: Vincenzo Innocente Date: Thu, 20 Sep 2018 14:21:23 +0200 Subject: [PATCH 17/36] from done to more, and debug --- .../plugins/gpuFancyClustering.h | 24 +++++++++++++++---- 1 file changed, 20 insertions(+), 4 deletions(-) diff --git a/RecoLocalTracker/SiPixelClusterizer/plugins/gpuFancyClustering.h b/RecoLocalTracker/SiPixelClusterizer/plugins/gpuFancyClustering.h index ab58d7d6154d3..2aa894b1d9f7f 100644 --- a/RecoLocalTracker/SiPixelClusterizer/plugins/gpuFancyClustering.h +++ b/RecoLocalTracker/SiPixelClusterizer/plugins/gpuFancyClustering.h @@ -134,15 +134,22 @@ namespace gpuClustering { for (int k = 0; k < maxiter; ++k) jmax[k] = hist.end(); #endif + +#ifdef GPU_DEBUG + __shared__ int nloops; + nloops=0; +#endif + + __syncthreads(); // for hit filling! 
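    // editor's note on the loop below: every pixel starts with clusterId[i] = i; each
    // pass lets a pixel propagate, via atomicMin, the smallest cluster id seen among
    // pixels in nearby column bins that lie within one pixel in x, and the passes are
    // repeated while __syncthreads_or(more) reports that at least one id still changed.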
// for each pixel, look at all the pixels until the end of the module; // when two valid pixels within +/- 1 in x or y are found, set their id to the minimum; // after the loop, all the pixel in each cluster should have the id equeal to the lowest // pixel in the cluster ( clus[i] == i ). - bool done = false; - while (not __syncthreads_and(done)) { - done = true; + bool more = true; + while (__syncthreads_or(more)) { + more = false; for (int j=threadIdx.x, k = 0; j Date: Thu, 20 Sep 2018 16:12:01 +0200 Subject: [PATCH 18/36] vertex finding is faster now --- .../PixelVertexFinding/src/gpuClusterTracks.h | 66 +++++++++++-------- .../PixelVertexFinding/src/gpuVertexFinder.cu | 4 +- .../PixelVertexFinding/src/gpuVertexFinder.h | 2 +- .../test/gpuVertexFinder_t.cu | 18 +++-- 4 files changed, 53 insertions(+), 37 deletions(-) diff --git a/RecoPixelVertexing/PixelVertexFinding/src/gpuClusterTracks.h b/RecoPixelVertexing/PixelVertexFinding/src/gpuClusterTracks.h index 54121d5babfb5..599ef57c7cd4a 100644 --- a/RecoPixelVertexing/PixelVertexFinding/src/gpuClusterTracks.h +++ b/RecoPixelVertexing/PixelVertexFinding/src/gpuClusterTracks.h @@ -6,7 +6,7 @@ #include #include -#include "HeterogeneousCore/CUDAUtilities/interface/HistoContainer.h" +#include "HeterogeneousCore/CUDAUtilities/interface/FancyHistoContainer.h" #include "HeterogeneousCore/CUDAUtilities/interface/radixSort.h" @@ -14,7 +14,6 @@ namespace gpuVertexFinder { - __global__ void sortByPt2(int nt, OnGPU * pdata @@ -62,6 +61,9 @@ namespace gpuVertexFinder { ) { constexpr bool verbose = false; // in principle the compiler should optmize out if false + + + if(verbose && 0==threadIdx.x) printf("params %d %f\n",minT,eps); auto er2mx = errmax*errmax; @@ -73,41 +75,49 @@ namespace gpuVertexFinder { float * __restrict__ chi2 = data.chi2; uint32_t & nv = *data.nv; - int8_t * __restrict__ izt = data.izt; + uint8_t * __restrict__ izt = data.izt; int32_t * __restrict__ nn = data.nn; int32_t * __restrict__ iv = data.iv; assert(pdata); assert(zt); - __shared__ HistoContainer hist; - - // if(0==threadIdx.x) printf("params %d %f\n",minT,eps); - // if(0==threadIdx.x) printf("booked hist with %d bins, size %d for %d tracks\n",hist.nbins(),hist.binSize(),nt); - - // zero hist - for (auto k = threadIdx.x; k; + constexpr auto wss = Hist::totbins(); + __shared__ Hist hist; + __shared__ typename Hist::Counter ws[wss]; + for (auto j=threadIdx.x; j= 0); + assert(iz-INT8_MIN < 256); + hist.count(izt[i]); iv[i]=i; nn[i]=0; } __syncthreads(); - - // if(0==threadIdx.x) printf("histo filled %d\n",hist.nspills); - if(0==threadIdx.x && hist.fullSpill()) printf("histo overflow\n"); - + hist.finalize(ws); + __syncthreads(); + assert(hist.size()==nt); + if (threadIdx.x<32) ws[threadIdx.x]=0; // used by prefix scan... 
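    // editor's note: the histogram is now filled in two passes. The count() loop above
    // only accumulates per-bin sizes; hist.finalize(ws) turns them into cumulative
    // offsets (the prefix scan that the comment above refers to, with ws as scratch),
    // and the fill() loop below drops each track index into its final slot inside its
    // z bin, again sharing ws as workspace.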
+ __syncthreads(); + for (int i = threadIdx.x; i < nt; i += blockDim.x) { + hist.fill(izt[i],uint16_t(i),ws); + } + __syncthreads(); + + // count neighbours for (int i = threadIdx.x; i < nt; i += blockDim.x) { if (ezt2[i]>er2mx) continue; @@ -124,20 +134,20 @@ namespace gpuVertexFinder { __syncthreads(); - // if(0==threadIdx.x) printf("nn counted\n"); - // cluster seeds only bool more = true; while (__syncthreads_or(more)) { more=false; - for (int i = threadIdx.x; i < nt; i += blockDim.x) { + for (int k = threadIdx.x; k < hist.size(); k += blockDim.x) { + auto p = hist.begin()+k; + auto i = (*p); + auto be = std::min(Hist::bin(izt[i])+1,int(hist.nbins()-1)); if (nn[i]eps) return; + auto dist = std::abs(zt[i]-zt[j]); + if (dist>eps) return; if (dist*dist>chi2max*(ezt2[i]+ezt2[j])) return; auto old = atomicMin(&iv[j], iv[i]); if (old != iv[i]) { @@ -146,8 +156,8 @@ namespace gpuVertexFinder { } atomicMin(&iv[i], old); }; - - forEachInBins(hist,izt[i],1,loop); + ++p; + for (;p>>(ntrks,onGPU_d,minT,eps,errmax,chi2max); + clusterTracks<<<1,1024-256,0,stream>>>(ntrks,onGPU_d,minT,eps,errmax,chi2max); sortByPt2<<<1,256,0,stream>>>(ntrks,onGPU_d); cudaCheck(cudaMemcpyAsync(&gpuProduct.nVertices, onGPU.nv, sizeof(uint32_t), diff --git a/RecoPixelVertexing/PixelVertexFinding/src/gpuVertexFinder.h b/RecoPixelVertexing/PixelVertexFinding/src/gpuVertexFinder.h index bea3db7338a86..ded6759a940bd 100644 --- a/RecoPixelVertexing/PixelVertexFinding/src/gpuVertexFinder.h +++ b/RecoPixelVertexing/PixelVertexFinding/src/gpuVertexFinder.h @@ -27,7 +27,7 @@ namespace gpuVertexFinder { uint16_t * sortInd; // sorted index (by pt2) // workspace - int8_t * izt; // interized z-position of input tracks + uint8_t * izt; // interized z-position of input tracks int32_t * nn; // number of nearest neighbours (reused as number of dof for output vertices) }; diff --git a/RecoPixelVertexing/PixelVertexFinding/test/gpuVertexFinder_t.cu b/RecoPixelVertexing/PixelVertexFinding/test/gpuVertexFinder_t.cu index 840b8947e7220..a92c116702231 100644 --- a/RecoPixelVertexing/PixelVertexFinding/test/gpuVertexFinder_t.cu +++ b/RecoPixelVertexing/PixelVertexFinding/test/gpuVertexFinder_t.cu @@ -91,7 +91,7 @@ int main() { auto ptv2_d = cuda::memory::device::make_unique(current_device, 256); auto ind_d = cuda::memory::device::make_unique(current_device, 256); - auto izt_d = cuda::memory::device::make_unique(current_device, 64000); + auto izt_d = cuda::memory::device::make_unique(current_device, 64000); auto nn_d = cuda::memory::device::make_unique(current_device, 64000); auto iv_d = cuda::memory::device::make_unique(current_device, 64000); @@ -142,33 +142,33 @@ int main() { if ( (i%4) == 0 ) cuda::launch(clusterTracks, - { 1, 1024 }, + { 1, 512+256 }, ev.ztrack.size(), onGPU_d.get(),kk,eps, 0.02f,12.0f ); if ( (i%4) == 1 ) cuda::launch(clusterTracks, - { 1, 1024 }, + { 1, 512+256 }, ev.ztrack.size(), onGPU_d.get(),kk,eps, 0.02f,9.0f ); if ( (i%4) == 2 ) cuda::launch(clusterTracks, - { 1, 1024 }, + { 1, 512+256 }, ev.ztrack.size(), onGPU_d.get(),kk,eps, 0.01f,9.0f ); if ( (i%4) == 3 ) cuda::launch(clusterTracks, - { 1, 1024 }, + { 1, 512+256 }, ev.ztrack.size(), onGPU_d.get(),kk,0.7f*eps, 0.01f,9.0f ); - + cudaDeviceSynchronize(); cuda::launch(sortByPt2, { 1, 256 }, ev.ztrack.size(), onGPU_d.get() @@ -176,6 +176,12 @@ int main() { uint32_t nv; cuda::memory::copy(&nv, onGPU.nv, sizeof(uint32_t)); + + if (nv==0) { + std::cout << "NO VERTICES???" 
<< std::endl; + continue; + } + float zv[nv]; float wv[nv]; float chi2[nv]; From a1085711b63db914a98358f8fb719a46323432b3 Mon Sep 17 00:00:00 2001 From: Vincenzo Innocente Date: Thu, 20 Sep 2018 19:48:15 +0200 Subject: [PATCH 19/36] use new clustering for real! --- .../SiPixelClusterizer/plugins/SiPixelRawToClusterGPUKernel.cu | 2 +- RecoLocalTracker/SiPixelClusterizer/plugins/gpuClustering.h | 2 ++ .../SiPixelClusterizer/plugins/gpuFancyClustering.h | 1 + RecoLocalTracker/SiPixelRecHits/plugins/PixelRecHits.cu | 2 +- 4 files changed, 5 insertions(+), 2 deletions(-) diff --git a/RecoLocalTracker/SiPixelClusterizer/plugins/SiPixelRawToClusterGPUKernel.cu b/RecoLocalTracker/SiPixelClusterizer/plugins/SiPixelRawToClusterGPUKernel.cu index 7bd6eac473cc7..bcceed91f2fc5 100644 --- a/RecoLocalTracker/SiPixelClusterizer/plugins/SiPixelRawToClusterGPUKernel.cu +++ b/RecoLocalTracker/SiPixelClusterizer/plugins/SiPixelRawToClusterGPUKernel.cu @@ -33,7 +33,7 @@ // CMSSW includes #include "HeterogeneousCore/CUDAUtilities/interface/cudaCheck.h" #include "RecoLocalTracker/SiPixelClusterizer/plugins/gpuCalibPixel.h" -#include "RecoLocalTracker/SiPixelClusterizer/plugins/gpuClustering.h" +#include "RecoLocalTracker/SiPixelClusterizer/plugins/gpuFancyClustering.h" #include "RecoLocalTracker/SiPixelClusterizer/plugins/gpuClusterChargeCut.h" #include "RecoLocalTracker/SiPixelClusterizer/interface/SiPixelFedCablingMapGPU.h" diff --git a/RecoLocalTracker/SiPixelClusterizer/plugins/gpuClustering.h b/RecoLocalTracker/SiPixelClusterizer/plugins/gpuClustering.h index 3cc1f97efed35..17d2fefc0ec93 100644 --- a/RecoLocalTracker/SiPixelClusterizer/plugins/gpuClustering.h +++ b/RecoLocalTracker/SiPixelClusterizer/plugins/gpuClustering.h @@ -1,6 +1,8 @@ #ifndef RecoLocalTracker_SiPixelClusterizer_plugins_gpuClustering_h #define RecoLocalTracker_SiPixelClusterizer_plugins_gpuClustering_h +#error "old version" + #include #include #include diff --git a/RecoLocalTracker/SiPixelClusterizer/plugins/gpuFancyClustering.h b/RecoLocalTracker/SiPixelClusterizer/plugins/gpuFancyClustering.h index 2aa894b1d9f7f..6475d6422557c 100644 --- a/RecoLocalTracker/SiPixelClusterizer/plugins/gpuFancyClustering.h +++ b/RecoLocalTracker/SiPixelClusterizer/plugins/gpuFancyClustering.h @@ -13,6 +13,7 @@ #include "HeterogeneousCore/CUDAUtilities/interface/FancyHistoContainer.h" + namespace gpuClustering { __global__ void countModules(uint16_t const * __restrict__ id, diff --git a/RecoLocalTracker/SiPixelRecHits/plugins/PixelRecHits.cu b/RecoLocalTracker/SiPixelRecHits/plugins/PixelRecHits.cu index bc376536dee12..d442fb8195562 100644 --- a/RecoLocalTracker/SiPixelRecHits/plugins/PixelRecHits.cu +++ b/RecoLocalTracker/SiPixelRecHits/plugins/PixelRecHits.cu @@ -8,7 +8,7 @@ // CMSSW headers #include "RecoLocalTracker/SiPixelClusterizer/plugins/SiPixelRawToClusterGPUKernel.h" #include "HeterogeneousCore/CUDAUtilities/interface/cudaCheck.h" -#include "RecoLocalTracker/SiPixelClusterizer/plugins/gpuClustering.h" +#include "RecoLocalTracker/SiPixelClusterizer/plugins/gpuClusteringConstants.h" #include "PixelRecHits.h" #include "gpuPixelRecHits.h" From c711b5dfc2c0d2221e253561e61f9c2f24d1a873 Mon Sep 17 00:00:00 2001 From: Vincenzo Innocente Date: Fri, 21 Sep 2018 11:00:00 +0200 Subject: [PATCH 20/36] test cub as well --- .../CUDAUtilities/test/BuildFile.xml | 5 +- .../CUDAUtilities/test/prefixScan_t.cu | 49 +++++++++++++++++++ 2 files changed, 53 insertions(+), 1 deletion(-) diff --git a/HeterogeneousCore/CUDAUtilities/test/BuildFile.xml 
b/HeterogeneousCore/CUDAUtilities/test/BuildFile.xml index f5a3dd3ae8207..db708e3328b4b 100644 --- a/HeterogeneousCore/CUDAUtilities/test/BuildFile.xml +++ b/HeterogeneousCore/CUDAUtilities/test/BuildFile.xml @@ -36,4 +36,7 @@ - + + + + diff --git a/HeterogeneousCore/CUDAUtilities/test/prefixScan_t.cu b/HeterogeneousCore/CUDAUtilities/test/prefixScan_t.cu index e109f6672bc32..240ca684121f0 100644 --- a/HeterogeneousCore/CUDAUtilities/test/prefixScan_t.cu +++ b/HeterogeneousCore/CUDAUtilities/test/prefixScan_t.cu @@ -19,6 +19,23 @@ void testPrefixScan(uint32_t size) { } } +#include + + +__global__ +void init(uint32_t * v, uint32_t val, uint32_t n) { + auto i = blockIdx.x * blockDim.x + threadIdx.x; + if(i>>(d_in, 1, num_items); + + // Determine temporary device storage requirements for inclusive prefix sum + void *d_temp_storage = nullptr; + size_t temp_storage_bytes = 0; + cub::DeviceScan::InclusiveSum(d_temp_storage, temp_storage_bytes, d_in, d_out, num_items); + + std::cout << "temp storage " << temp_storage_bytes << std::endl; + + // Allocate temporary storage for inclusive prefix sum + cudaMalloc(&d_temp_storage, temp_storage_bytes); + // Run inclusive prefix sum + CubDebugExit(cub::DeviceScan::InclusiveSum(d_temp_storage, temp_storage_bytes, d_in, d_out, num_items)); + + verify<<>>(d_out, num_items); + + cudaDeviceSynchronize(); + return 0; } From 83879cd899983970bc0fa9905c1f93c1f57da1c0 Mon Sep 17 00:00:00 2001 From: Vincenzo Innocente Date: Fri, 21 Sep 2018 14:27:54 +0200 Subject: [PATCH 21/36] works in c++ --- .../interface/FancyHistoContainer.h | 43 +++++++++++++++---- .../CUDAUtilities/test/HistoContainer_t.cpp | 25 +++++++---- 2 files changed, 52 insertions(+), 16 deletions(-) diff --git a/HeterogeneousCore/CUDAUtilities/interface/FancyHistoContainer.h b/HeterogeneousCore/CUDAUtilities/interface/FancyHistoContainer.h index f5e669ca3d7ef..d050240a6b810 100644 --- a/HeterogeneousCore/CUDAUtilities/interface/FancyHistoContainer.h +++ b/HeterogeneousCore/CUDAUtilities/interface/FancyHistoContainer.h @@ -69,7 +69,8 @@ template< uint32_t NBINS, // number of bins uint32_t SIZE, // max number of element uint32_t S=sizeof(T) * 8, // number of significant bits in T - typename I=uint32_t // type stored in the container (usually an index in a vector of the input values) + typename I=uint32_t, // type stored in the container (usually an index in a vector of the input values) + uint32_t NHISTS=1 // number of histos stored > class HistoContainer { public: @@ -98,10 +99,13 @@ class HistoContainer { static constexpr uint32_t sizeT() { return S; } static constexpr uint32_t nbins() { return NBINS;} - static constexpr uint32_t totbins() { return NBINS+1;} + static constexpr uint32_t nhists() { return NHISTS;} + static constexpr uint32_t totbins() { return NHISTS*NBINS+1;} static constexpr uint32_t nbits() { return ilog2(NBINS-1)+1;} static constexpr uint32_t capacity() { return SIZE; } + static constexpr auto histOff(uint32_t nh) { return NBINS*nh; } + static constexpr UT bin(T t) { constexpr uint32_t shift = sizeT() - nbits(); constexpr uint32_t mask = (1<; - std::cout << "HistoContainer " << Hist::nbits() << ' ' << Hist::nbins() << ' ' << Hist::capacity() << ' ' << (rmax-rmin)/Hist::nbins() << std::endl; + using Hist4 = HistoContainer; + std::cout << "HistoContainer " << Hist::nbits() << ' ' << Hist::nbins() << ' ' << Hist::totbins() << ' ' << Hist::capacity() << ' ' << (rmax-rmin)/Hist::nbins() << std::endl; std::cout << "bins " << int(Hist::bin(0)) << ' ' << int(Hist::bin(rmin)) << ' ' << 
int(Hist::bin(rmax)) << std::endl; + std::cout << "HistoContainer4 " << Hist4::nbits() << ' ' << Hist4::nbins() << ' ' << Hist4::totbins() << ' ' << Hist4::capacity() << ' ' << (rmax-rmin)/Hist::nbins() << std::endl; + for (auto nh=0; nh<4; ++nh) std::cout << "bins " << int(Hist4::bin(0))+Hist4::histOff(nh) << ' ' << int(Hist::bin(rmin))+Hist4::histOff(nh) << ' ' << int(Hist::bin(rmax))+Hist4::histOff(nh) << std::endl; + Hist h; + Hist4 h4; typename Hist::Counter ws[Hist::totbins()]; + typename Hist4::Counter ws4[Hist4::totbins()]; for (int it=0; it<5; ++it) { for (long long j = 0; j < N; j++) v[j]=rgen(eng); if (it==2) for (long long j = N/2; j < N/2+N/4; j++) v[j]=4; - h.zero(); - assert(h.size()==0); - for (auto & i: ws) i=0; - for (long long j = 0; j < N; j++) h.count(v[j]); - h.finalize(); + h.zero();h4.zero(); + assert(h.size()==0);assert(h4.size()==0); + for (auto & i: ws) i=0; + for (auto & i: ws4) i=0; + for (long long j = 0; j < N; j++) { h.count(v[j]); if(j<2000) h4.count(v[j],2); else h4.count(v[j],j%4); } + h.finalize(); h4.finalize(); assert(h.off[0]==0); assert(h.size()==N); - for (long long j = 0; j < N; j++) h.fill(v[j],j,ws); + assert(h4.off[0]==0); + assert(h4.size()==N); + for (long long j = 0; j < N; j++) { h.fill(v[j],j,ws); if(j<2000) h4.fill(v[j],j,ws4,2); else h4.fill(v[j],j,ws4,j%4); } assert(h.size()==N); - + assert(h4.size()==N); auto verify = [&](uint32_t i, uint32_t j, uint32_t k, uint32_t t1, uint32_t t2) { assert(t1 Date: Fri, 21 Sep 2018 17:46:46 +0200 Subject: [PATCH 22/36] works --- .../interface/FancyHistoContainer.h | 64 ++++++++++++++++--- .../CUDAUtilities/test/BuildFile.xml | 3 + .../CUDAUtilities/test/HistoContainer_t.cu | 55 +++++++++------- .../CUDAUtilities/test/prefixScan_t.cu | 6 +- 4 files changed, 94 insertions(+), 34 deletions(-) diff --git a/HeterogeneousCore/CUDAUtilities/interface/FancyHistoContainer.h b/HeterogeneousCore/CUDAUtilities/interface/FancyHistoContainer.h index d050240a6b810..e21d03a4d86e3 100644 --- a/HeterogeneousCore/CUDAUtilities/interface/FancyHistoContainer.h +++ b/HeterogeneousCore/CUDAUtilities/interface/FancyHistoContainer.h @@ -2,6 +2,7 @@ #define HeterogeneousCore_CUDAUtilities_HistoContainer_h #include +#include #include #include #include @@ -12,6 +13,7 @@ #include "HeterogeneousCore/CUDAUtilities/interface/cudastdAlgorithm.h" #ifdef __CUDACC__ #include "HeterogeneousCore/CUDAUtilities/interface/prefixScan.h" +#include #endif #include "HeterogeneousCore/CUDAUtilities/interface/cudaCheck.h" @@ -19,18 +21,50 @@ #ifdef __CUDACC__ namespace cudautils { - template + template __global__ - void zeroMany(Histo * h, uint32_t nh) { - auto i = blockIdx.x * blockDim.x + threadIdx.x; - auto ih = i / Histo::totbins(); - auto k = i - ih * Histo::totbins(); - if (ih < nh) { - if (k < Histo::totbins()) - h[ih].n[k] = 0; - } + void countFromVector(Histo * __restrict__ h, uint32_t nh, T const * __restrict__ v, uint32_t const * __restrict__ offsets) { + auto i = blockIdx.x * blockDim.x + threadIdx.x; + if(i >= offsets[nh]) return; + auto off = cuda_std::upper_bound(offsets, offsets + nh + 1, i); + assert((*off) > 0); + int32_t ih = off - offsets - 1; + assert(ih >= 0); + assert(ih < nh); + (*h).count(v[i], ih); + } + + template + __global__ + void fillFromVector(Histo * __restrict__ h, uint32_t nh, T const * __restrict__ v, uint32_t const * __restrict__ offsets, + uint32_t * __restrict__ ws ) { + auto i = blockIdx.x * blockDim.x + threadIdx.x; + if(i >= offsets[nh]) return; + auto off = cuda_std::upper_bound(offsets, offsets + nh + 1, 
i); + assert((*off) > 0); + int32_t ih = off - offsets - 1; + assert(ih >= 0); + assert(ih < nh); + (*h).fill(v[i], i, ws, ih); + } + + + template + void fillManyFromVector(Histo * __restrict__ h, typename Histo::Counter * __restrict__ ws, + uint32_t nh, T const * __restrict__ v, uint32_t const * __restrict__ offsets, uint32_t totSize, + int nthreads, cudaStream_t stream) { + uint32_t * off = (uint32_t *)( (char*)(h) +offsetof(Histo,off)); + cudaMemsetAsync(off,0, 4*Histo::totbins(),stream); + auto nblocks = (totSize + nthreads - 1) / nthreads; + countFromVector<<>>(h, nh, v, offsets); + size_t wss = Histo::totbins(); + CubDebugExit(cub::DeviceScan::InclusiveSum(ws, wss, off, off, Histo::totbins(), stream)); + cudaMemsetAsync(ws,0, 4*Histo::totbins(),stream); + fillFromVector<<>>(h, nh, v, offsets,ws); + cudaCheck(cudaGetLastError()); } + } // namespace cudautils #endif @@ -106,6 +140,18 @@ class HistoContainer { static constexpr auto histOff(uint32_t nh) { return NBINS*nh; } +#ifdef __CUDACC__ + __host__ + static size_t wsSize() { + uint32_t * v =nullptr; + void * d_temp_storage = nullptr; + size_t temp_storage_bytes = 0; + cub::DeviceScan::InclusiveSum(d_temp_storage, temp_storage_bytes, v, v, totbins()-1); + return std::max(temp_storage_bytes,size_t(totbins())); + } +#endif + + static constexpr UT bin(T t) { constexpr uint32_t shift = sizeT() - nbits(); constexpr uint32_t mask = (1< + + + diff --git a/HeterogeneousCore/CUDAUtilities/test/HistoContainer_t.cu b/HeterogeneousCore/CUDAUtilities/test/HistoContainer_t.cu index 2b6549361f182..d6dec63144f58 100644 --- a/HeterogeneousCore/CUDAUtilities/test/HistoContainer_t.cu +++ b/HeterogeneousCore/CUDAUtilities/test/HistoContainer_t.cu @@ -1,4 +1,4 @@ -#include "HeterogeneousCore/CUDAUtilities/interface/HistoContainer.h" +#include "HeterogeneousCore/CUDAUtilities/interface/FancyHistoContainer.h" #include #include @@ -34,12 +34,16 @@ void go() { constexpr uint32_t partSize = N/nParts; uint32_t offsets[nParts+1]; - using Hist = HistoContainer; - std::cout << "HistoContainer " << Hist::nbins() << ' ' << Hist::binSize() << ' ' << (std::numeric_limits::max()-std::numeric_limits::min())/Hist::nbins() << std::endl; + using Hist = HistoContainer; + std::cout << "HistoContainer " << (int)(offsetof(Hist,off)) << ' ' + << Hist::nbins() << ' ' << Hist::totbins() << ' ' << Hist::capacity() << ' ' << Hist::wsSize() << ' ' + << (std::numeric_limits::max()-std::numeric_limits::min())/Hist::nbins() << std::endl; - Hist h[nParts]; + Hist h; + + auto h_d = cuda::memory::device::make_unique(current_device, 1); + auto ws_d = cuda::memory::device::make_unique(current_device, Hist::totbins()); - auto h_d = cuda::memory::device::make_unique(current_device, nParts); auto off_d = cuda::memory::device::make_unique(current_device, nParts+1); @@ -62,18 +66,20 @@ void go() { for (long long j = 0; j < N; j++) v[j]=rgen(eng); - if (it==2) { // spill + if (it==2) { // big bin for (long long j = 1000; j < 2000; j++) v[j]= sizeof(T)==1 ? 
22 : 3456; } cuda::memory::copy(v_d.get(), v, N*sizeof(T)); - cudautils::fillManyFromVector(h_d.get(),nParts,v_d.get(),off_d.get(),offsets[10],256,0); - - cuda::memory::copy(&h, h_d.get(), nParts*sizeof(Hist)); + cudautils::fillManyFromVector(h_d.get(),ws_d.get(),nParts,v_d.get(),off_d.get(),offsets[10],256,0); - + cuda::memory::copy(&h, h_d.get(), sizeof(Hist)); + assert(0==h.off[0]); + assert(offsets[10]==h.size()); + + auto verify = [&](uint32_t i, uint32_t k, uint32_t t1, uint32_t t2) { assert(t1 window ) {} else {++tot;} } if (kk==i) { l=false; continue; } - if (l) for (auto p=h[j].begin(kk); p=nm)) { - std::cout << "too bad " << j << ' ' << i <<' ' << me << '/'<< T(me-window)<< '/'<< T(me+window) << ": " << kl << '/' << kh << ' '<< khh << ' '<< tot<<'/'<=nm)) { + std::cout << "too bad " << j << ' ' << i <<' ' << int(me) << '/'<< (int)T(me-window)<< '/'<< (int)T(me+window) << ": " << kl << '/' << kh << ' '<< khh << ' '<< tot<<'/'<>>(d_out, num_items); From b8df4f849819c29f523c4d3c72e86cffc4edf525 Mon Sep 17 00:00:00 2001 From: Vincenzo Innocente Date: Fri, 21 Sep 2018 17:49:03 +0200 Subject: [PATCH 23/36] old histo is gone --- .../interface/FancyHistoContainer.h | 245 ------------------ .../CUDAUtilities/interface/HistoContainer.h | 213 +++++++-------- .../CUDAUtilities/test/HistoContainer_t.cpp | 2 +- .../CUDAUtilities/test/HistoContainer_t.cu | 2 +- .../CUDAUtilities/test/OneHistoContainer_t.cu | 2 +- 5 files changed, 113 insertions(+), 351 deletions(-) delete mode 100644 HeterogeneousCore/CUDAUtilities/interface/FancyHistoContainer.h diff --git a/HeterogeneousCore/CUDAUtilities/interface/FancyHistoContainer.h b/HeterogeneousCore/CUDAUtilities/interface/FancyHistoContainer.h deleted file mode 100644 index e21d03a4d86e3..0000000000000 --- a/HeterogeneousCore/CUDAUtilities/interface/FancyHistoContainer.h +++ /dev/null @@ -1,245 +0,0 @@ -#ifndef HeterogeneousCore_CUDAUtilities_HistoContainer_h -#define HeterogeneousCore_CUDAUtilities_HistoContainer_h - -#include -#include -#include -#include -#include -#ifndef __CUDA_ARCH__ -#include -#endif // __CUDA_ARCH__ - -#include "HeterogeneousCore/CUDAUtilities/interface/cudastdAlgorithm.h" -#ifdef __CUDACC__ -#include "HeterogeneousCore/CUDAUtilities/interface/prefixScan.h" -#include -#endif -#include "HeterogeneousCore/CUDAUtilities/interface/cudaCheck.h" - - -#ifdef __CUDACC__ -namespace cudautils { - - template - __global__ - void countFromVector(Histo * __restrict__ h, uint32_t nh, T const * __restrict__ v, uint32_t const * __restrict__ offsets) { - auto i = blockIdx.x * blockDim.x + threadIdx.x; - if(i >= offsets[nh]) return; - auto off = cuda_std::upper_bound(offsets, offsets + nh + 1, i); - assert((*off) > 0); - int32_t ih = off - offsets - 1; - assert(ih >= 0); - assert(ih < nh); - (*h).count(v[i], ih); - } - - template - __global__ - void fillFromVector(Histo * __restrict__ h, uint32_t nh, T const * __restrict__ v, uint32_t const * __restrict__ offsets, - uint32_t * __restrict__ ws ) { - auto i = blockIdx.x * blockDim.x + threadIdx.x; - if(i >= offsets[nh]) return; - auto off = cuda_std::upper_bound(offsets, offsets + nh + 1, i); - assert((*off) > 0); - int32_t ih = off - offsets - 1; - assert(ih >= 0); - assert(ih < nh); - (*h).fill(v[i], i, ws, ih); - } - - - template - void fillManyFromVector(Histo * __restrict__ h, typename Histo::Counter * __restrict__ ws, - uint32_t nh, T const * __restrict__ v, uint32_t const * __restrict__ offsets, uint32_t totSize, - int nthreads, cudaStream_t stream) { - uint32_t * off = (uint32_t *)( 
(char*)(h) +offsetof(Histo,off)); - cudaMemsetAsync(off,0, 4*Histo::totbins(),stream); - auto nblocks = (totSize + nthreads - 1) / nthreads; - countFromVector<<>>(h, nh, v, offsets); - size_t wss = Histo::totbins(); - CubDebugExit(cub::DeviceScan::InclusiveSum(ws, wss, off, off, Histo::totbins(), stream)); - cudaMemsetAsync(ws,0, 4*Histo::totbins(),stream); - fillFromVector<<>>(h, nh, v, offsets,ws); - cudaCheck(cudaGetLastError()); - } - - -} // namespace cudautils -#endif - - -// iteratate over N bins left and right of the one containing "v" -template -__host__ __device__ -__forceinline__ -void forEachInBins(Hist const & hist, V value, int n, Func func) { - int bs = Hist::bin(value); - int be = std::min(int(Hist::nbins()-1),bs+n); - bs = std::max(0,bs-n); - assert(be>=bs); - for (auto pj=hist.begin(bs);pj -__host__ __device__ -__forceinline__ -void forEachInWindow(Hist const & hist, V wmin, V wmax, Func const & func) { - auto bs = Hist::bin(wmin); - auto be = Hist::bin(wmax); - assert(be>=bs); - for (auto pj=hist.begin(bs);pj -class HistoContainer { -public: -#ifdef __CUDACC__ - using Counter = uint32_t; -#else - using Counter = std::atomic; -#endif - - using index_type = I; - using UT = typename std::make_unsigned::type; - - static constexpr uint32_t ilog2(uint32_t v) { - - constexpr uint32_t b[] = {0x2, 0xC, 0xF0, 0xFF00, 0xFFFF0000}; - constexpr uint32_t s[] = {1, 2, 4, 8, 16}; - - uint32_t r = 0; // result of log2(v) will go here - for (auto i = 4; i >= 0; i--) if (v & b[i]) { - v >>= s[i]; - r |= s[i]; - } - return r; - } - - - static constexpr uint32_t sizeT() { return S; } - static constexpr uint32_t nbins() { return NBINS;} - static constexpr uint32_t nhists() { return NHISTS;} - static constexpr uint32_t totbins() { return NHISTS*NBINS+1;} - static constexpr uint32_t nbits() { return ilog2(NBINS-1)+1;} - static constexpr uint32_t capacity() { return SIZE; } - - static constexpr auto histOff(uint32_t nh) { return NBINS*nh; } - -#ifdef __CUDACC__ - __host__ - static size_t wsSize() { - uint32_t * v =nullptr; - void * d_temp_storage = nullptr; - size_t temp_storage_bytes = 0; - cub::DeviceScan::InclusiveSum(d_temp_storage, temp_storage_bytes, v, v, totbins()-1); - return std::max(temp_storage_bytes,size_t(totbins())); - } -#endif - - - static constexpr UT bin(T t) { - constexpr uint32_t shift = sizeT() - nbits(); - constexpr uint32_t mask = (1<> shift) & mask; - } - - void zero() { - for (auto & i : off) - i = 0; - } - - static __host__ __device__ - __forceinline__ - uint32_t atomicIncrement(Counter & x) { - #ifdef __CUDA_ARCH__ - return atomicAdd(&x, 1); - #else - return x++; - #endif - } - - __host__ __device__ - __forceinline__ - void count(T t) { - uint32_t b = bin(t); - assert(b +#include #include #include #include @@ -10,121 +11,100 @@ #endif // __CUDA_ARCH__ #include "HeterogeneousCore/CUDAUtilities/interface/cudastdAlgorithm.h" +#ifdef __CUDACC__ +#include "HeterogeneousCore/CUDAUtilities/interface/prefixScan.h" +#include +#endif #include "HeterogeneousCore/CUDAUtilities/interface/cudaCheck.h" #ifdef __CUDACC__ namespace cudautils { - template - __global__ - void zeroMany(Histo * h, uint32_t nh) { - auto i = blockIdx.x * blockDim.x + threadIdx.x; - auto ih = i / Histo::totbins(); - auto k = i - ih * Histo::totbins(); - if (ih < nh) { - if (k < Histo::totbins()) - h[ih].n[k] = 0; - } - } - template __global__ - void fillFromVector(Histo * h, uint32_t nh, T const * __restrict__ v, uint32_t * offsets) { + void countFromVector(Histo * __restrict__ h, uint32_t nh, T const * 
__restrict__ v, uint32_t const * __restrict__ offsets) { auto i = blockIdx.x * blockDim.x + threadIdx.x; if(i >= offsets[nh]) return; auto off = cuda_std::upper_bound(offsets, offsets + nh + 1, i); assert((*off) > 0); int32_t ih = off - offsets - 1; assert(ih >= 0); - assert(ih < nh); - h[ih].fill(v[i], i); + assert(ih < nh); + (*h).count(v[i], ih); } template __global__ - void fillFromVector(Histo * h, T const * __restrict__ v, uint32_t size) { + void fillFromVector(Histo * __restrict__ h, uint32_t nh, T const * __restrict__ v, uint32_t const * __restrict__ offsets, + uint32_t * __restrict__ ws ) { auto i = blockIdx.x * blockDim.x + threadIdx.x; - if(i < size) h->fill(v[i], i); - } - - template - void zero(Histo * h, uint32_t nh, int nthreads, cudaStream_t stream) { - auto nblocks = (nh * Histo::totbins() + nthreads - 1) / nthreads; - zeroMany<<>>(h, nh); - cudaCheck(cudaGetLastError()); + if(i >= offsets[nh]) return; + auto off = cuda_std::upper_bound(offsets, offsets + nh + 1, i); + assert((*off) > 0); + int32_t ih = off - offsets - 1; + assert(ih >= 0); + assert(ih < nh); + (*h).fill(v[i], i, ws, ih); } - template - void fillOneFromVector(Histo * h, T const * __restrict__ v, uint32_t size, int nthreads, cudaStream_t stream) { - zero(h, 1, nthreads, stream); - auto nblocks = (size + nthreads - 1) / nthreads; - fillFromVector<<>>(h, v, size); - cudaCheck(cudaGetLastError()); - } template - void fillManyFromVector(Histo * h, uint32_t nh, T const * __restrict__ v, uint32_t * offsets, uint32_t totSize, int nthreads, cudaStream_t stream) { - zero(h, nh, nthreads, stream); + void fillManyFromVector(Histo * __restrict__ h, typename Histo::Counter * __restrict__ ws, + uint32_t nh, T const * __restrict__ v, uint32_t const * __restrict__ offsets, uint32_t totSize, + int nthreads, cudaStream_t stream) { + uint32_t * off = (uint32_t *)( (char*)(h) +offsetof(Histo,off)); + cudaMemsetAsync(off,0, 4*Histo::totbins(),stream); auto nblocks = (totSize + nthreads - 1) / nthreads; - fillFromVector<<>>(h, nh, v, offsets); + countFromVector<<>>(h, nh, v, offsets); + size_t wss = Histo::totbins(); + CubDebugExit(cub::DeviceScan::InclusiveSum(ws, wss, off, off, Histo::totbins(), stream)); + cudaMemsetAsync(ws,0, 4*Histo::totbins(),stream); + fillFromVector<<>>(h, nh, v, offsets,ws); cudaCheck(cudaGetLastError()); } + } // namespace cudautils #endif // iteratate over N bins left and right of the one containing "v" -// including spillBin template __host__ __device__ __forceinline__ void forEachInBins(Hist const & hist, V value, int n, Func func) { - int bs = hist.bin(value); - int be = std::min(int(hist.nbins()),bs+n+1); + int bs = Hist::bin(value); + int be = std::min(int(Hist::nbins()-1),bs+n); bs = std::max(0,bs-n); - // assert(be>bs); -// bool tbc=false; - for (auto b=bs; b=bs); + for (auto pj=hist.begin(bs);pj __host__ __device__ __forceinline__ void forEachInWindow(Hist const & hist, V wmin, V wmax, Func const & func) { - auto bs = hist.bin(wmin); - auto be = hist.bin(wmax); - // be = std::min(int(hist.nbins()),be+1); - // bs = std::max(0,bs); - // assert(be>=bs); -// bool tbc=false; - for (auto b=bs; b<=be; ++b){ -// tbc |= hist.full(b); - for (auto pj=hist.begin(b);pj=bs); + for (auto pj=hist.begin(bs);pj class HistoContainer { public: @@ -153,10 +133,24 @@ class HistoContainer { static constexpr uint32_t sizeT() { return S; } static constexpr uint32_t nbins() { return NBINS;} - static constexpr uint32_t totbins() { return NBINS+1;} // including spillbin + static constexpr uint32_t nhists() { return 
NHISTS;} + static constexpr uint32_t totbins() { return NHISTS*NBINS+1;} static constexpr uint32_t nbits() { return ilog2(NBINS-1)+1;} - static constexpr uint32_t binSize() { return 1 << M; } - static constexpr uint32_t spillSize() { return 16 * binSize(); } + static constexpr uint32_t capacity() { return SIZE; } + + static constexpr auto histOff(uint32_t nh) { return NBINS*nh; } + +#ifdef __CUDACC__ + __host__ + static size_t wsSize() { + uint32_t * v =nullptr; + void * d_temp_storage = nullptr; + size_t temp_storage_bytes = 0; + cub::DeviceScan::InclusiveSum(d_temp_storage, temp_storage_bytes, v, v, totbins()-1); + return std::max(temp_storage_bytes,size_t(totbins())); + } +#endif + static constexpr UT bin(T t) { constexpr uint32_t shift = sizeT() - nbits(); @@ -165,7 +159,7 @@ class HistoContainer { } void zero() { - for (auto & i : n) + for (auto & i : off) i = 0; } @@ -181,58 +175,71 @@ class HistoContainer { __host__ __device__ __forceinline__ - void fill(T t, index_type j) { - UT b = bin(t); + void count(T t) { + uint32_t b = bin(t); assert(b spillSize(); - } - constexpr bool full(uint32_t b) const { - return n[b] > binSize(); + __host__ __device__ + __forceinline__ + void count(T t, uint32_t nh) { + uint32_t b = bin(t); + assert(b #include diff --git a/HeterogeneousCore/CUDAUtilities/test/HistoContainer_t.cu b/HeterogeneousCore/CUDAUtilities/test/HistoContainer_t.cu index d6dec63144f58..8d640b728e25c 100644 --- a/HeterogeneousCore/CUDAUtilities/test/HistoContainer_t.cu +++ b/HeterogeneousCore/CUDAUtilities/test/HistoContainer_t.cu @@ -1,4 +1,4 @@ -#include "HeterogeneousCore/CUDAUtilities/interface/FancyHistoContainer.h" +#include "HeterogeneousCore/CUDAUtilities/interface/HistoContainer.h" #include #include diff --git a/HeterogeneousCore/CUDAUtilities/test/OneHistoContainer_t.cu b/HeterogeneousCore/CUDAUtilities/test/OneHistoContainer_t.cu index 9d520929a20c0..c3682578163c8 100644 --- a/HeterogeneousCore/CUDAUtilities/test/OneHistoContainer_t.cu +++ b/HeterogeneousCore/CUDAUtilities/test/OneHistoContainer_t.cu @@ -1,4 +1,4 @@ -#include "HeterogeneousCore/CUDAUtilities/interface/FancyHistoContainer.h" +#include "HeterogeneousCore/CUDAUtilities/interface/HistoContainer.h" #include #include From 1bb3bdad60b5ed20ddef5f895f139d44485fb53d Mon Sep 17 00:00:00 2001 From: Vincenzo Innocente Date: Fri, 21 Sep 2018 19:13:09 +0200 Subject: [PATCH 24/36] wip --- .../test/NaiveHistoContainer_t.cpp | 97 ------- .../plugins/SiPixelRawToClusterGPUKernel.cu | 2 +- .../plugins/gpuClustering.h | 142 ++++++---- .../plugins/gpuFancyClustering.h | 251 ------------------ .../SiPixelClusterizer/test/gpuClustering.cu | 2 +- .../SiPixelRecHits/plugins/PixelRecHits.cu | 6 +- .../siPixelRecHitsHeterogeneousProduct.h | 4 +- .../PixelVertexFinding/src/gpuClusterTracks.h | 2 +- 8 files changed, 101 insertions(+), 405 deletions(-) delete mode 100644 HeterogeneousCore/CUDAUtilities/test/NaiveHistoContainer_t.cpp delete mode 100644 RecoLocalTracker/SiPixelClusterizer/plugins/gpuFancyClustering.h diff --git a/HeterogeneousCore/CUDAUtilities/test/NaiveHistoContainer_t.cpp b/HeterogeneousCore/CUDAUtilities/test/NaiveHistoContainer_t.cpp deleted file mode 100644 index 190a4dc18e3fa..0000000000000 --- a/HeterogeneousCore/CUDAUtilities/test/NaiveHistoContainer_t.cpp +++ /dev/null @@ -1,97 +0,0 @@ -#include "HeterogeneousCore/CUDAUtilities/interface/HistoContainer.h" - -#include -#include -#include -#include -#include - -template -void go() { - std::mt19937 eng; - - int rmin=std::numeric_limits::min(); - int 
rmax=std::numeric_limits::max(); - if (NBINS!=128) { - rmin=0; - rmax=NBINS*2-1; - } - - - - std::uniform_int_distribution rgen(rmin,rmax); - - - constexpr int N=12000; - T v[N]; - - using Hist = HistoContainer; - std::cout << "HistoContainer " << Hist::nbits() << ' ' << Hist::nbins() << ' ' << Hist::binSize() << ' ' << (rmax-rmin)/Hist::nbins() << std::endl; - std::cout << "bins " << int(Hist::bin(0)) << ' ' << int(Hist::bin(rmin)) << ' ' << int(Hist::bin(rmax)) << std::endl; - - Hist h; - for (int it=0; it<5; ++it) { - for (long long j = 0; j < N; j++) v[j]=rgen(eng); - if (it==2) for (long long j = N/2; j < N/2+2*Hist::binSize(); j++) v[j]=4; - h.zero(); - for (long long j = 0; j < N; j++) h.fill(v[j],j); - - std::cout << "nspills " << h.nspills() << std::endl; - - auto verify = [&](uint32_t i, uint32_t j, uint32_t k, uint32_t t1, uint32_t t2) { - assert(t1=i); } - // std::cout << kl << ' ' << kh << std::endl; - for (auto j=h.begin(kl); j=0 && k=0) rtot += h.end(bm)-h.begin(bm); - assert(tot==rtot); - w=2; tot=0; - forEachInBins(h,v[j],w,ftest); - bp++; - bm--; - if (bp=0) rtot += h.end(bm)-h.begin(bm); - assert(tot==rtot); - } - -} - -int main() { - go(); - go(); - go(); - - - return 0; -} diff --git a/RecoLocalTracker/SiPixelClusterizer/plugins/SiPixelRawToClusterGPUKernel.cu b/RecoLocalTracker/SiPixelClusterizer/plugins/SiPixelRawToClusterGPUKernel.cu index bcceed91f2fc5..7bd6eac473cc7 100644 --- a/RecoLocalTracker/SiPixelClusterizer/plugins/SiPixelRawToClusterGPUKernel.cu +++ b/RecoLocalTracker/SiPixelClusterizer/plugins/SiPixelRawToClusterGPUKernel.cu @@ -33,7 +33,7 @@ // CMSSW includes #include "HeterogeneousCore/CUDAUtilities/interface/cudaCheck.h" #include "RecoLocalTracker/SiPixelClusterizer/plugins/gpuCalibPixel.h" -#include "RecoLocalTracker/SiPixelClusterizer/plugins/gpuFancyClustering.h" +#include "RecoLocalTracker/SiPixelClusterizer/plugins/gpuClustering.h" #include "RecoLocalTracker/SiPixelClusterizer/plugins/gpuClusterChargeCut.h" #include "RecoLocalTracker/SiPixelClusterizer/interface/SiPixelFedCablingMapGPU.h" diff --git a/RecoLocalTracker/SiPixelClusterizer/plugins/gpuClustering.h b/RecoLocalTracker/SiPixelClusterizer/plugins/gpuClustering.h index 17d2fefc0ec93..163378c6adbf0 100644 --- a/RecoLocalTracker/SiPixelClusterizer/plugins/gpuClustering.h +++ b/RecoLocalTracker/SiPixelClusterizer/plugins/gpuClustering.h @@ -1,7 +1,8 @@ #ifndef RecoLocalTracker_SiPixelClusterizer_plugins_gpuClustering_h #define RecoLocalTracker_SiPixelClusterizer_plugins_gpuClustering_h -#error "old version" + +// #define CLUS_LIMIT_LOOP #include #include @@ -12,6 +13,7 @@ #include "HeterogeneousCore/CUDAUtilities/interface/HistoContainer.h" + namespace gpuClustering { __global__ void countModules(uint16_t const * __restrict__ id, @@ -35,7 +37,9 @@ namespace gpuClustering { } } - __global__ void findClus(uint16_t const * __restrict__ id, // module id of each pixel + __global__ +// __launch_bounds__(256,4) + void findClus(uint16_t const * __restrict__ id, // module id of each pixel uint16_t const * __restrict__ x, // local coordinates of each pixel uint16_t const * __restrict__ y, // uint32_t const * __restrict__ moduleStart, // index of the first pixel of each module @@ -66,8 +70,6 @@ namespace gpuClustering { __syncthreads(); // skip threads not associated to an existing pixel - bool active = (first < numElements); - if (active) { for (int i = first; i < numElements; i += blockDim.x) { if (id[i] == InvId) // skip invalid pixels continue; @@ -76,87 +78,132 @@ namespace gpuClustering { break; } 
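      // editor's note: atomicMin(&msize, i) lets all threads agree on the index of the
      // first pixel that does not belong to this module, so msize (initialised to
      // numElements) ends up marking the end of the pixel range this block works on.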
} - } //init hist (ymax=416 < 512 : 9bits) - constexpr auto nbins = phase1PixelTopology::numColsInModule/2+2; - __shared__ HistoContainer hist; - for (auto k = threadIdx.x; k; + constexpr auto wss = Hist::totbins(); + __shared__ Hist hist; + __shared__ typename Hist::Counter ws[wss]; + for (auto j=threadIdx.x; j=j or j>jm or - std::abs(int(x[j]) - int(x[i])) > 1 or - std::abs(int(y[j]) - int(y[i])) > 1) return; - auto old = atomicMin(&clusterId[j], clusterId[i]); + auto loop = [&](uint16_t const * kk) { + auto m = (*kk)+firstPixel; +#ifdef GPU_DEBUG + assert(m!=i); +#endif + if (std::abs(int(x[m]) - int(x[i])) > 1) return; + // if (std::abs(int(y[m]) - int(y[i])) > 1) return; // binssize is 1 + auto old = atomicMin(&clusterId[m], clusterId[i]); if (old != clusterId[i]) { // end the loop only if no changes were applied - done = false; + more = true; } atomicMin(&clusterId[i], old); +#ifdef CLUS_LIMIT_LOOP // update the loop boundary for the next iteration - jmax[k] = std::max(j + 1,jmax[k]); + jmax[k] = std::max(kk + 1,jmax[k]); +#endif }; - forEachInWindow(hist,y[i]>0 ? y[i]-1 :0 ,y[i]+1,loop); - } // pixel loop - } // end active + ++p; + for (;p -#include -#include - -#include "gpuClusteringConstants.h" -#include "Geometry/TrackerGeometryBuilder/interface/phase1PixelTopology.h" - -#include "HeterogeneousCore/CUDAUtilities/interface/FancyHistoContainer.h" - - -namespace gpuClustering { - - __global__ void countModules(uint16_t const * __restrict__ id, - uint32_t * __restrict__ moduleStart, - int32_t * __restrict__ clusterId, - int numElements) - { - int i = blockDim.x * blockIdx.x + threadIdx.x; - if (i >= numElements) - return; - clusterId[i] = i; - if (InvId == id[i]) - return; - auto j = i - 1; - while (j >= 0 and id[j] == InvId) - --j; - if (j < 0 or id[j] != id[i]) { - // boundary... 
- auto loc = atomicInc(moduleStart, MaxNumModules); - moduleStart[loc + 1] = i; - } - } - - __global__ -// __launch_bounds__(256,4) - void findClus(uint16_t const * __restrict__ id, // module id of each pixel - uint16_t const * __restrict__ x, // local coordinates of each pixel - uint16_t const * __restrict__ y, // - uint32_t const * __restrict__ moduleStart, // index of the first pixel of each module - uint32_t * __restrict__ nClustersInModule, // output: number of clusters found in each module - uint32_t * __restrict__ moduleId, // output: module id of each module - int32_t * __restrict__ clusterId, // output: cluster id of each pixel - int numElements) - { - - if (blockIdx.x >= moduleStart[0]) - return; - - auto firstPixel = moduleStart[1 + blockIdx.x]; - auto thisModuleId = id[firstPixel]; - assert(thisModuleId < MaxNumModules); - -#ifdef GPU_DEBUG - if (thisModuleId % 100 == 1) - if (threadIdx.x == 0) - printf("start clusterizer for module %d in block %d\n", thisModuleId, blockIdx.x); -#endif - - auto first = firstPixel + threadIdx.x; - - // find the index of the first pixel not belonging to this module (or invalid) - __shared__ int msize; - msize = numElements; - __syncthreads(); - - // skip threads not associated to an existing pixel - for (int i = first; i < numElements; i += blockDim.x) { - if (id[i] == InvId) // skip invalid pixels - continue; - if (id[i] != thisModuleId) { // find the first pixel in a different module - atomicMin(&msize, i); - break; - } - } - - //init hist (ymax=416 < 512 : 9bits) - constexpr uint32_t maxPixInModule = 4000; - constexpr auto nbins = phase1PixelTopology::numColsInModule + 2; //2+2; - using Hist = HistoContainer; - constexpr auto wss = Hist::totbins(); - __shared__ Hist hist; - __shared__ typename Hist::Counter ws[wss]; - for (auto j=threadIdx.x; j 1) return; - // if (std::abs(int(y[m]) - int(y[i])) > 1) return; // binssize is 1 - auto old = atomicMin(&clusterId[m], clusterId[i]); - if (old != clusterId[i]) { - // end the loop only if no changes were applied - more = true; - } - atomicMin(&clusterId[i], old); -#ifdef CLUS_LIMIT_LOOP - // update the loop boundary for the next iteration - jmax[k] = std::max(kk + 1,jmax[k]); -#endif - }; - ++p; - for (;p= 0) { - // mark each pixel in a cluster with the same id as the first one - clusterId[i] = clusterId[clusterId[i]]; - } - } - __syncthreads(); - - // adjust the cluster id to be a positive value starting from 0 - for (int i = first; i < msize; i += blockDim.x) { - if (id[i] == InvId) { // skip invalid pixels - clusterId[i] = -9999; - continue; - } - clusterId[i] = - clusterId[i] - 1; - } - __syncthreads(); - - if (threadIdx.x == 0) { - nClustersInModule[thisModuleId] = foundClusters; - moduleId[blockIdx.x] = thisModuleId; -#ifdef GPU_DEBUG - if (thisModuleId % 100 == 1) - if (threadIdx.x == 0) - printf("%d clusters in module %d\n", foundClusters, thisModuleId); -#endif - } - } - -} // namespace gpuClustering - -#endif // RecoLocalTracker_SiPixelClusterizer_plugins_gpuClustering_h diff --git a/RecoLocalTracker/SiPixelClusterizer/test/gpuClustering.cu b/RecoLocalTracker/SiPixelClusterizer/test/gpuClustering.cu index 2392822bab336..a01c1230586fe 100644 --- a/RecoLocalTracker/SiPixelClusterizer/test/gpuClustering.cu +++ b/RecoLocalTracker/SiPixelClusterizer/test/gpuClustering.cu @@ -11,7 +11,7 @@ #include -#include "RecoLocalTracker/SiPixelClusterizer/plugins/gpuFancyClustering.h" +#include "RecoLocalTracker/SiPixelClusterizer/plugins/gpuClustering.h" #include 
"RecoLocalTracker/SiPixelClusterizer/plugins/gpuClusterChargeCut.h" int main(void) diff --git a/RecoLocalTracker/SiPixelRecHits/plugins/PixelRecHits.cu b/RecoLocalTracker/SiPixelRecHits/plugins/PixelRecHits.cu index d442fb8195562..d10c5b7d07952 100644 --- a/RecoLocalTracker/SiPixelRecHits/plugins/PixelRecHits.cu +++ b/RecoLocalTracker/SiPixelRecHits/plugins/PixelRecHits.cu @@ -70,7 +70,8 @@ namespace pixelgpudetails { gpu_.iphi_d = slicePitch(gpu_.owner_16bit_, gpu_.owner_16bit_pitch_, 3); gpu_.sortIndex_d = slicePitch(gpu_.owner_16bit_, gpu_.owner_16bit_pitch_, 4); - cudaCheck(cudaMalloc((void **) & gpu_.hist_d, 10 * sizeof(HitsOnGPU::Hist))); + cudaCheck(cudaMalloc((void **) & gpu_.hist_d, sizeof(HitsOnGPU::Hist))); + cudaCheck(cudaMalloc((void **) & gpu_.hws_d, 4*HitsOnGPU::Hist::totbins()))); cudaCheck(cudaMalloc((void **) & gpu_d, sizeof(HitsOnGPU))); gpu_.me_d = gpu_d; cudaCheck(cudaMemcpyAsync(gpu_d, &gpu_, sizeof(HitsOnGPU), cudaMemcpyDefault, cudaStream.id())); @@ -109,6 +110,7 @@ namespace pixelgpudetails { cudaCheck(cudaFree(gpu_.owner_32bit_)); cudaCheck(cudaFree(gpu_.owner_16bit_)); cudaCheck(cudaFree(gpu_.hist_d)); + cudaCheck(cudaFree(gpu_.hws_d)); cudaCheck(cudaFree(gpu_d)); cudaCheck(cudaFree(d_phase1TopologyLayerStart_)); @@ -192,6 +194,6 @@ namespace pixelgpudetails { // radixSortMultiWrapper<<<10, 256, 0, c.stream>>>(gpu_.iphi_d, gpu_.sortIndex_d, gpu_.hitsLayerStart_d); } - cudautils::fillManyFromVector(gpu_.hist_d, 10, gpu_.iphi_d, gpu_.hitsLayerStart_d, nhits_, 256, stream.id()); + cudautils::fillManyFromVector(gpu_.hist_d, gpu_.hws_d, 10, gpu_.iphi_d, gpu_.hitsLayerStart_d, nhits_, 256, stream.id()); } } diff --git a/RecoLocalTracker/SiPixelRecHits/plugins/siPixelRecHitsHeterogeneousProduct.h b/RecoLocalTracker/SiPixelRecHits/plugins/siPixelRecHitsHeterogeneousProduct.h index a8b97e988e5e6..b8c2f06cd6854 100644 --- a/RecoLocalTracker/SiPixelRecHits/plugins/siPixelRecHitsHeterogeneousProduct.h +++ b/RecoLocalTracker/SiPixelRecHits/plugins/siPixelRecHitsHeterogeneousProduct.h @@ -18,6 +18,7 @@ namespace siPixelRecHitsHeterogeneousProduct { using CPUProduct = int; // dummy struct HitsOnGPU{ + constexpr auto MAX_HITS = gpuClustering::MaxNumModules * 256; pixelCPEforGPU::ParamsOnGPU const * cpeParams = nullptr; // forwarded from setup, NOT owned float * bs_d; const uint32_t * hitsModuleStart_d; // forwarded from clusters @@ -32,8 +33,9 @@ namespace siPixelRecHitsHeterogeneousProduct { uint16_t * mr_d; uint16_t * mc_d; - using Hist = HistoContainer; + using Hist = HistoContainer; Hist * hist_d; + uint32_t hws_d; HitsOnGPU const * me_d = nullptr; diff --git a/RecoPixelVertexing/PixelVertexFinding/src/gpuClusterTracks.h b/RecoPixelVertexing/PixelVertexFinding/src/gpuClusterTracks.h index 599ef57c7cd4a..32160e87275d2 100644 --- a/RecoPixelVertexing/PixelVertexFinding/src/gpuClusterTracks.h +++ b/RecoPixelVertexing/PixelVertexFinding/src/gpuClusterTracks.h @@ -6,7 +6,7 @@ #include #include -#include "HeterogeneousCore/CUDAUtilities/interface/FancyHistoContainer.h" +#include "HeterogeneousCore/CUDAUtilities/interface/HistoContainer.h" #include "HeterogeneousCore/CUDAUtilities/interface/radixSort.h" From 78e4aa5b6864f30a56a71d671e7dcbbf9d9e87d6 Mon Sep 17 00:00:00 2001 From: Vincenzo Innocente Date: Fri, 21 Sep 2018 19:46:41 +0200 Subject: [PATCH 25/36] wip --- HeterogeneousCore/CUDAUtilities/BuildFile.xml | 1 + RecoLocalTracker/SiPixelClusterizer/test/BuildFile.xml | 2 ++ RecoLocalTracker/SiPixelRecHits/BuildFile.xml | 3 ++- 
RecoLocalTracker/SiPixelRecHits/plugins/PixelRecHits.cu | 2 +- .../plugins/siPixelRecHitsHeterogeneousProduct.h | 5 +++-- RecoPixelVertexing/PixelVertexFinding/BuildFile.xml | 2 +- SimTracker/TrackerHitAssociation/plugins/BuildFile.xml | 1 + 7 files changed, 11 insertions(+), 5 deletions(-) diff --git a/HeterogeneousCore/CUDAUtilities/BuildFile.xml b/HeterogeneousCore/CUDAUtilities/BuildFile.xml index 38ee88b068999..fc6a61ffd433e 100644 --- a/HeterogeneousCore/CUDAUtilities/BuildFile.xml +++ b/HeterogeneousCore/CUDAUtilities/BuildFile.xml @@ -1 +1,2 @@ + diff --git a/RecoLocalTracker/SiPixelClusterizer/test/BuildFile.xml b/RecoLocalTracker/SiPixelClusterizer/test/BuildFile.xml index 17ce3cd5ce230..335591b583b58 100644 --- a/RecoLocalTracker/SiPixelClusterizer/test/BuildFile.xml +++ b/RecoLocalTracker/SiPixelClusterizer/test/BuildFile.xml @@ -34,12 +34,14 @@ + + diff --git a/RecoLocalTracker/SiPixelRecHits/BuildFile.xml b/RecoLocalTracker/SiPixelRecHits/BuildFile.xml index c52545a601341..7918c7a4f4d9a 100644 --- a/RecoLocalTracker/SiPixelRecHits/BuildFile.xml +++ b/RecoLocalTracker/SiPixelRecHits/BuildFile.xml @@ -14,7 +14,8 @@ - + + diff --git a/RecoLocalTracker/SiPixelRecHits/plugins/PixelRecHits.cu b/RecoLocalTracker/SiPixelRecHits/plugins/PixelRecHits.cu index d10c5b7d07952..bb9ff5eb84de4 100644 --- a/RecoLocalTracker/SiPixelRecHits/plugins/PixelRecHits.cu +++ b/RecoLocalTracker/SiPixelRecHits/plugins/PixelRecHits.cu @@ -34,7 +34,7 @@ namespace { namespace pixelgpudetails { PixelRecHitGPUKernel::PixelRecHitGPUKernel(cuda::stream_t<>& cudaStream) { - constexpr auto MAX_HITS = gpuClustering::MaxNumModules * 256; + constexpr auto MAX_HITS = siPixelRecHitsHeterogeneousProduct::maxHits(); cudaCheck(cudaMalloc((void **) & gpu_.bs_d, 3 * sizeof(float))); cudaCheck(cudaMalloc((void **) & gpu_.hitsLayerStart_d, 11 * sizeof(uint32_t))); diff --git a/RecoLocalTracker/SiPixelRecHits/plugins/siPixelRecHitsHeterogeneousProduct.h b/RecoLocalTracker/SiPixelRecHits/plugins/siPixelRecHitsHeterogeneousProduct.h index b8c2f06cd6854..4890333caaa6d 100644 --- a/RecoLocalTracker/SiPixelRecHits/plugins/siPixelRecHitsHeterogeneousProduct.h +++ b/RecoLocalTracker/SiPixelRecHits/plugins/siPixelRecHitsHeterogeneousProduct.h @@ -17,8 +17,9 @@ namespace siPixelRecHitsHeterogeneousProduct { using CPUProduct = int; // dummy + static constexpr uint32_t maxHits() { return 65536;} + struct HitsOnGPU{ - constexpr auto MAX_HITS = gpuClustering::MaxNumModules * 256; pixelCPEforGPU::ParamsOnGPU const * cpeParams = nullptr; // forwarded from setup, NOT owned float * bs_d; const uint32_t * hitsModuleStart_d; // forwarded from clusters @@ -33,7 +34,7 @@ namespace siPixelRecHitsHeterogeneousProduct { uint16_t * mr_d; uint16_t * mc_d; - using Hist = HistoContainer; + using Hist = HistoContainer; Hist * hist_d; uint32_t hws_d; diff --git a/RecoPixelVertexing/PixelVertexFinding/BuildFile.xml b/RecoPixelVertexing/PixelVertexFinding/BuildFile.xml index 5a3208da2c87f..43261b0417410 100644 --- a/RecoPixelVertexing/PixelVertexFinding/BuildFile.xml +++ b/RecoPixelVertexing/PixelVertexFinding/BuildFile.xml @@ -20,4 +20,4 @@ - + diff --git a/SimTracker/TrackerHitAssociation/plugins/BuildFile.xml b/SimTracker/TrackerHitAssociation/plugins/BuildFile.xml index c767b1e68936a..4f3e0d27878a1 100644 --- a/SimTracker/TrackerHitAssociation/plugins/BuildFile.xml +++ b/SimTracker/TrackerHitAssociation/plugins/BuildFile.xml @@ -9,6 +9,7 @@ + From 0c19cd105622201df5fc55d4efe5db3c8fc2bf3f Mon Sep 17 00:00:00 2001 From: Vincenzo Innocente Date: Sat, 22 
Sep 2018 11:36:45 +0200 Subject: [PATCH 26/36] does not crash --- .../SiPixelRecHits/plugins/PixelRecHits.cu | 2 +- .../siPixelRecHitsHeterogeneousProduct.h | 4 ++-- .../PixelTriplets/plugins/gpuPixelDoublets.h | 22 +++++++++---------- 3 files changed, 13 insertions(+), 15 deletions(-) diff --git a/RecoLocalTracker/SiPixelRecHits/plugins/PixelRecHits.cu b/RecoLocalTracker/SiPixelRecHits/plugins/PixelRecHits.cu index bb9ff5eb84de4..c63466f157a1b 100644 --- a/RecoLocalTracker/SiPixelRecHits/plugins/PixelRecHits.cu +++ b/RecoLocalTracker/SiPixelRecHits/plugins/PixelRecHits.cu @@ -71,7 +71,7 @@ namespace pixelgpudetails { gpu_.sortIndex_d = slicePitch(gpu_.owner_16bit_, gpu_.owner_16bit_pitch_, 4); cudaCheck(cudaMalloc((void **) & gpu_.hist_d, sizeof(HitsOnGPU::Hist))); - cudaCheck(cudaMalloc((void **) & gpu_.hws_d, 4*HitsOnGPU::Hist::totbins()))); + cudaCheck(cudaMalloc((void **) & gpu_.hws_d, 4*HitsOnGPU::Hist::totbins())); cudaCheck(cudaMalloc((void **) & gpu_d, sizeof(HitsOnGPU))); gpu_.me_d = gpu_d; cudaCheck(cudaMemcpyAsync(gpu_d, &gpu_, sizeof(HitsOnGPU), cudaMemcpyDefault, cudaStream.id())); diff --git a/RecoLocalTracker/SiPixelRecHits/plugins/siPixelRecHitsHeterogeneousProduct.h b/RecoLocalTracker/SiPixelRecHits/plugins/siPixelRecHitsHeterogeneousProduct.h index 4890333caaa6d..c06cb17e2e282 100644 --- a/RecoLocalTracker/SiPixelRecHits/plugins/siPixelRecHitsHeterogeneousProduct.h +++ b/RecoLocalTracker/SiPixelRecHits/plugins/siPixelRecHitsHeterogeneousProduct.h @@ -34,9 +34,9 @@ namespace siPixelRecHitsHeterogeneousProduct { uint16_t * mr_d; uint16_t * mc_d; - using Hist = HistoContainer; + using Hist = HistoContainer; Hist * hist_d; - uint32_t hws_d; + typename Hist::Counter * hws_d; HitsOnGPU const * me_d = nullptr; diff --git a/RecoPixelVertexing/PixelTriplets/plugins/gpuPixelDoublets.h b/RecoPixelVertexing/PixelTriplets/plugins/gpuPixelDoublets.h index 418964869dfda..ccbff0480e6c5 100644 --- a/RecoPixelVertexing/PixelTriplets/plugins/gpuPixelDoublets.h +++ b/RecoPixelVertexing/PixelTriplets/plugins/gpuPixelDoublets.h @@ -26,7 +26,7 @@ namespace gpuPixelDoublets { GPUCACell * cells, uint32_t * nCells, int16_t const * __restrict__ iphi, - Hist const * __restrict__ hist, + Hist const & __restrict__ hist, uint32_t const * __restrict__ offsets, siPixelRecHitsHeterogeneousProduct::HitsOnGPU const & __restrict__ hh, GPU::VecArray< unsigned int, 256> * isOuterHitOfCell, @@ -64,6 +64,8 @@ namespace gpuPixelDoublets { uint8_t outer = layerPairs[2*pairLayerId+1]; assert(outer > inner); + auto hoff = Hist::histOff(outer); + auto i = (0 == pairLayerId) ? 
j : j-innerLayerCumulativeSize[pairLayerId-1]; i += offsets[inner]; @@ -98,8 +100,8 @@ namespace gpuPixelDoublets { auto iphicut = phicuts[pairLayerId]; - auto kl = hist[outer].bin(int16_t(mep-iphicut)); - auto kh = hist[outer].bin(int16_t(mep+iphicut)); + auto kl = Hist::bin(int16_t(mep-iphicut)); + auto kh = Hist::bin(int16_t(mep+iphicut)); auto incr = [](auto & k) { return k = (k+1) % Hist::nbins();}; int tot = 0; int nmin = 0; @@ -109,8 +111,10 @@ namespace gpuPixelDoublets { int tooMany=0; for (auto kk = kl; kk != khh; incr(kk)) { if (kk != kl && kk != kh) - nmin += hist[outer].size(kk); - for (auto p = hist[outer].begin(kk); p < hist[outer].end(kk); ++p) { + nmin += hist.size(kk+hoff); + auto const * __restrict__ p = hist.begin(kk+hoff); + auto const * __restrict__ e = hist.end(kk+hoff); + for (;p < e; ++p) { auto oi=*p; assert(oi>=offsets[outer]); assert(oi 0) printf("OuterHitOfCell full for %d in layer %d/%d, %d:%d %d,%d\n", i, inner, outer, kl, kh, nmin, tot); - if (hist[outer].nspills() > 0) - printf("spill bin to be checked in %d %d\n", outer, hist[outer].nspills()); - - // if (0==hist[outer].nspills()) assert(tot>=nmin); - // look in spill bin as well.... - } // loop in block... } @@ -186,7 +184,7 @@ namespace gpuPixelDoublets { auto const & __restrict__ hh = *hhp; doubletsFromHisto(layerPairs, nPairs, cells, nCells, - hh.iphi_d, hh.hist_d, hh.hitsLayerStart_d, + hh.iphi_d, *hh.hist_d, hh.hitsLayerStart_d, hh, isOuterHitOfCell, phicuts, minz, maxz, maxr); } From 1b55356c1f43eccdace636494fc5b93a90f6fc67 Mon Sep 17 00:00:00 2001 From: Vincenzo Innocente Date: Sat, 22 Sep 2018 14:15:54 +0200 Subject: [PATCH 27/36] remove hard limit --- .../SiPixelRawToClusterHeterogeneous.cc | 4 ++-- .../SiPixelRecHits/plugins/gpuPixelRecHits.h | 24 ++++++++++--------- 2 files changed, 15 insertions(+), 13 deletions(-) diff --git a/RecoLocalTracker/SiPixelClusterizer/plugins/SiPixelRawToClusterHeterogeneous.cc b/RecoLocalTracker/SiPixelClusterizer/plugins/SiPixelRawToClusterHeterogeneous.cc index 76f3536ab1706..384a4732b32e1 100644 --- a/RecoLocalTracker/SiPixelClusterizer/plugins/SiPixelRawToClusterHeterogeneous.cc +++ b/RecoLocalTracker/SiPixelClusterizer/plugins/SiPixelRawToClusterHeterogeneous.cc @@ -584,7 +584,7 @@ void SiPixelRawToClusterHeterogeneous::convertGPUtoCPU(edm::Event& ev, } int32_t nclus=-1; - std::vector aclusters(256); + std::vector aclusters(1024); auto totCluseFilled=0; auto fillClusters = [&](uint32_t detId){ @@ -626,7 +626,7 @@ void SiPixelRawToClusterHeterogeneous::convertGPUtoCPU(edm::Event& ev, auto const & dig = (*detDigis).data.back(); // fill clusters assert(gpu.clus_h[i]>=0); - assert(gpu.clus_h[i]<256); + assert(gpu.clus_h[i]<1024); nclus = std::max(gpu.clus_h[i],nclus); auto row = dig.row(); auto col = dig.column(); diff --git a/RecoLocalTracker/SiPixelRecHits/plugins/gpuPixelRecHits.h b/RecoLocalTracker/SiPixelRecHits/plugins/gpuPixelRecHits.h index ec48333bde889..816a8492429ef 100644 --- a/RecoLocalTracker/SiPixelRecHits/plugins/gpuPixelRecHits.h +++ b/RecoLocalTracker/SiPixelRecHits/plugins/gpuPixelRecHits.h @@ -14,15 +14,6 @@ namespace gpuPixelRecHits { - // to be moved in common namespace... 
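// Illustrative note (not part of the commit): this commit enlarges the host-side
// cluster buffers to 1024 entries per module, while the GPU CPE kernel below
// still processes at most pixelCPEforGPU::MaxClusInModule clusters and only
// warns about any overflow. A hypothetical compile-time guard making that
// assumption explicit (not present in the patch) would be:
//   static_assert(pixelCPEforGPU::MaxClusInModule <= 1024,
//                 "host cluster buffers must hold at least MaxClusInModule clusters");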
- constexpr uint16_t InvId=9999; // must be > MaxNumModules - - - constexpr uint32_t MaxClusInModule = pixelCPEforGPU::MaxClusInModule; - - using ClusParams = pixelCPEforGPU::ClusParams; - - __global__ void getHits(pixelCPEforGPU::ParamsOnGPU const * __restrict__ cpeParams, float const * __restrict__ bs, uint16_t const * __restrict__ id, @@ -42,6 +33,14 @@ namespace gpuPixelRecHits { float * xe, float * ye, uint16_t * mr, uint16_t * mc) { + + // to be moved in common namespace... + constexpr uint16_t InvId=9999; // must be > MaxNumModules + constexpr uint32_t MaxClusInModule = pixelCPEforGPU::MaxClusInModule; + + using ClusParams = pixelCPEforGPU::ClusParams; + + // as usual one block per module __shared__ ClusParams clusParams; @@ -65,7 +64,9 @@ namespace gpuPixelRecHits { #endif assert(blockDim.x >= MaxClusInModule); - assert(nclus <= MaxClusInModule); + + if (threadIdx.x==0 && nclus > MaxClusInModule) printf("WARNING: too many clusters %d in Module %d. Only first %d processed\n", nclus,me,MaxClusInModule); + nclus = std::min(nclus, MaxClusInModule); auto ic = threadIdx.x; @@ -90,7 +91,7 @@ namespace gpuPixelRecHits { for (int i = first; i < numElements; i += blockDim.x) { if (id[i] == InvId) continue; // not valid if (id[i] != me) break; // end of module - assert(clus[i] < nclus); + if (clus[i] >= nclus) continue; atomicMin(&clusParams.minRow[clus[i]], x[i]); atomicMax(&clusParams.maxRow[clus[i]], x[i]); atomicMin(&clusParams.minCol[clus[i]], y[i]); @@ -102,6 +103,7 @@ namespace gpuPixelRecHits { for (int i = first; i < numElements; i += blockDim.x) { if (id[i] == InvId) continue; // not valid if (id[i] != me) break; // end of module + if (clus[i] >= nclus) continue; atomicAdd(&clusParams.charge[clus[i]], adc[i]); if (clusParams.minRow[clus[i]]==x[i]) atomicAdd(&clusParams.Q_f_X[clus[i]], adc[i]); if (clusParams.maxRow[clus[i]]==x[i]) atomicAdd(&clusParams.Q_l_X[clus[i]], adc[i]); From 70d2fcb5c20db8d0e7eb6a9a5c071400ab32dc59 Mon Sep 17 00:00:00 2001 From: Vincenzo Innocente Date: Sat, 22 Sep 2018 15:39:34 +0200 Subject: [PATCH 28/36] mark lost clusters --- RecoLocalTracker/SiPixelRecHits/plugins/gpuPixelRecHits.h | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/RecoLocalTracker/SiPixelRecHits/plugins/gpuPixelRecHits.h b/RecoLocalTracker/SiPixelRecHits/plugins/gpuPixelRecHits.h index 816a8492429ef..3f92e4833bc22 100644 --- a/RecoLocalTracker/SiPixelRecHits/plugins/gpuPixelRecHits.h +++ b/RecoLocalTracker/SiPixelRecHits/plugins/gpuPixelRecHits.h @@ -65,7 +65,11 @@ namespace gpuPixelRecHits { assert(blockDim.x >= MaxClusInModule); - if (threadIdx.x==0 && nclus > MaxClusInModule) printf("WARNING: too many clusters %d in Module %d. Only first %d processed\n", nclus,me,MaxClusInModule); + if (threadIdx.x==0 && nclus > MaxClusInModule) { + printf("WARNING: too many clusters %d in Module %d. 
Only first %d processed\n", nclus,me,MaxClusInModule); + // zero charge: do not bother to do it in parallel + for (auto d=MaxClusInModule; d Date: Sat, 22 Sep 2018 17:03:24 +0200 Subject: [PATCH 29/36] a bit of cleanup, no speedup --- .../plugins/CAHitQuadrupletGeneratorGPU.cu | 16 ++++++++-------- .../PixelTriplets/plugins/GPUCACell.h | 17 +++++++---------- 2 files changed, 15 insertions(+), 18 deletions(-) diff --git a/RecoPixelVertexing/PixelTriplets/plugins/CAHitQuadrupletGeneratorGPU.cu b/RecoPixelVertexing/PixelTriplets/plugins/CAHitQuadrupletGeneratorGPU.cu index ca7f8928873e4..4e17b580bbaf3 100644 --- a/RecoPixelVertexing/PixelTriplets/plugins/CAHitQuadrupletGeneratorGPU.cu +++ b/RecoPixelVertexing/PixelTriplets/plugins/CAHitQuadrupletGeneratorGPU.cu @@ -12,8 +12,8 @@ using HitsOnCPU = siPixelRecHitsHeterogeneousProduct::HitsOnCPU; __global__ void kernel_checkOverflows(GPU::SimpleVector *foundNtuplets, - GPUCACell *cells, uint32_t const * nCells, - GPU::VecArray< unsigned int, 256> *isOuterHitOfCell, + GPUCACell const * __restrict__ cells, uint32_t const * __restrict__ nCells, + GPU::VecArray< unsigned int, 256> const * __restrict__ isOuterHitOfCell, uint32_t nHits, uint32_t maxNumberOfDoublets) { auto idx = threadIdx.x + blockIdx.x * blockDim.x; @@ -36,8 +36,8 @@ kernel_checkOverflows(GPU::SimpleVector *foundNtuplets, __global__ void kernel_connect(GPU::SimpleVector *foundNtuplets, - GPUCACell *cells, uint32_t const * nCells, - GPU::VecArray< unsigned int, 256> *isOuterHitOfCell, + GPUCACell * cells, uint32_t const * __restrict__ nCells, + GPU::VecArray< unsigned int, 256> const * __restrict__ isOuterHitOfCell, float ptmin, float region_origin_radius, const float thetaCut, const float phiCut, const float hardPtCut, @@ -51,14 +51,14 @@ kernel_connect(GPU::SimpleVector *foundNtuplets, if (0==cellIndex) foundNtuplets->reset(); // ready for next kernel if (cellIndex >= (*nCells) ) return; - auto &thisCell = cells[cellIndex]; + auto const & thisCell = cells[cellIndex]; auto innerHitId = thisCell.get_inner_hit_id(); auto numberOfPossibleNeighbors = isOuterHitOfCell[innerHitId].size(); for (auto j = 0; j < numberOfPossibleNeighbors; ++j) { auto otherCell = isOuterHitOfCell[innerHitId][j]; - if (thisCell.check_alignment_and_tag( - cells, otherCell, ptmin, region_origin_x, region_origin_y, + if (thisCell.check_alignment( + cells[otherCell], ptmin, region_origin_x, region_origin_y, region_origin_radius, thetaCut, phiCut, hardPtCut) ) { cells[otherCell].theOuterNeighbors.push_back(cellIndex); @@ -67,7 +67,7 @@ kernel_connect(GPU::SimpleVector *foundNtuplets, } __global__ void kernel_find_ntuplets( - GPUCACell *cells, uint32_t const * nCells, + GPUCACell * const __restrict__ cells, uint32_t const * nCells, GPU::SimpleVector *foundNtuplets, unsigned int minHitsPerNtuplet, unsigned int maxNumberOfDoublets_) diff --git a/RecoPixelVertexing/PixelTriplets/plugins/GPUCACell.h b/RecoPixelVertexing/PixelTriplets/plugins/GPUCACell.h index 02cf0d6f91642..a9578e7dab9dc 100644 --- a/RecoPixelVertexing/PixelTriplets/plugins/GPUCACell.h +++ b/RecoPixelVertexing/PixelTriplets/plugins/GPUCACell.h @@ -20,7 +20,7 @@ class GPUCACell { GPUCACell() = default; __host__ __device__ - void init(siPixelRecHitsHeterogeneousProduct::HitsOnGPU const & hh, + void init(siPixelRecHitsHeterogeneousProduct::HitsOnGPU const & __restrict__ hh, int layerPairId, int doubletId, int innerHitId,int outerHitId) { theInnerHitId = innerHitId; @@ -64,21 +64,20 @@ class GPUCACell { } __host__ __device__ - bool check_alignment_and_tag( - 
const GPUCACell *cells, unsigned int innerCellId, const float ptmin, + bool check_alignment( + GPUCACell const & otherCell, const float ptmin, const float region_origin_x, const float region_origin_y, const float region_origin_radius, const float thetaCut, - const float phiCut, const float hardPtCut) + const float phiCut, const float hardPtCut) const { auto ro = get_outer_r(); auto zo = get_outer_z(); - const auto &otherCell = cells[innerCellId]; auto r1 = otherCell.get_inner_r(); auto z1 = otherCell.get_inner_z(); bool aligned = areAlignedRZ(r1, z1, ro, zo, ptmin, thetaCut); return (aligned && - haveSimilarCurvature(cells, innerCellId, ptmin, region_origin_x, + haveSimilarCurvature(otherCell, ptmin, region_origin_x, region_origin_y, region_origin_radius, phiCut, hardPtCut)); } @@ -101,14 +100,12 @@ class GPUCACell { } constexpr bool - haveSimilarCurvature(const GPUCACell *cells, unsigned int innerCellId, + haveSimilarCurvature(GPUCACell const & otherCell, const float ptmin, const float region_origin_x, const float region_origin_y, const float region_origin_radius, const float phiCut, const float hardPtCut) const { - const auto &otherCell = cells[innerCellId]; - auto x1 = otherCell.get_inner_x(); auto y1 = otherCell.get_inner_y(); @@ -190,7 +187,7 @@ class GPUCACell { __device__ inline void find_ntuplets( - const GPUCACell *cells, + GPUCACell const * __restrict__ cells, GPU::SimpleVector *foundNtuplets, GPU::VecArray &tmpNtuplet, const unsigned int minHitsPerNtuplet) const From 85453afd53160745b86f3f3d9fc61de7007b8ffe Mon Sep 17 00:00:00 2001 From: Vincenzo Innocente Date: Mon, 24 Sep 2018 12:39:00 +0200 Subject: [PATCH 30/36] force load in constant cache, reduce size of Cell --- .../siPixelRecHitsHeterogeneousProduct.h | 1 + .../plugins/CAHitQuadrupletGeneratorGPU.cu | 6 +- .../plugins/CAHitQuadrupletGeneratorGPU.h | 2 + .../PixelTriplets/plugins/GPUCACell.h | 108 +++++++++--------- .../PixelTriplets/plugins/gpuPixelDoublets.h | 12 +- 5 files changed, 68 insertions(+), 61 deletions(-) diff --git a/RecoLocalTracker/SiPixelRecHits/plugins/siPixelRecHitsHeterogeneousProduct.h b/RecoLocalTracker/SiPixelRecHits/plugins/siPixelRecHitsHeterogeneousProduct.h index c06cb17e2e282..9d0fe7a279799 100644 --- a/RecoLocalTracker/SiPixelRecHits/plugins/siPixelRecHitsHeterogeneousProduct.h +++ b/RecoLocalTracker/SiPixelRecHits/plugins/siPixelRecHitsHeterogeneousProduct.h @@ -18,6 +18,7 @@ namespace siPixelRecHitsHeterogeneousProduct { using CPUProduct = int; // dummy static constexpr uint32_t maxHits() { return 65536;} + using hindex_type = uint16_t; // if above is <=2^16 struct HitsOnGPU{ pixelCPEforGPU::ParamsOnGPU const * cpeParams = nullptr; // forwarded from setup, NOT owned diff --git a/RecoPixelVertexing/PixelTriplets/plugins/CAHitQuadrupletGeneratorGPU.cu b/RecoPixelVertexing/PixelTriplets/plugins/CAHitQuadrupletGeneratorGPU.cu index 4e17b580bbaf3..3574276b69ebd 100644 --- a/RecoPixelVertexing/PixelTriplets/plugins/CAHitQuadrupletGeneratorGPU.cu +++ b/RecoPixelVertexing/PixelTriplets/plugins/CAHitQuadrupletGeneratorGPU.cu @@ -36,6 +36,7 @@ kernel_checkOverflows(GPU::SimpleVector *foundNtuplets, __global__ void kernel_connect(GPU::SimpleVector *foundNtuplets, + GPUCACell::Hits const * __restrict__ hhp, GPUCACell * cells, uint32_t const * __restrict__ nCells, GPU::VecArray< unsigned int, 256> const * __restrict__ isOuterHitOfCell, float ptmin, @@ -43,6 +44,8 @@ kernel_connect(GPU::SimpleVector *foundNtuplets, const float phiCut, const float hardPtCut, unsigned int maxNumberOfDoublets_, unsigned int 
maxNumberOfHits_) { + auto const & hh = *hhp; + float region_origin_x = 0.; float region_origin_y = 0.; @@ -57,7 +60,7 @@ kernel_connect(GPU::SimpleVector *foundNtuplets, for (auto j = 0; j < numberOfPossibleNeighbors; ++j) { auto otherCell = isOuterHitOfCell[innerHitId][j]; - if (thisCell.check_alignment( + if (thisCell.check_alignment(hh, cells[otherCell], ptmin, region_origin_x, region_origin_y, region_origin_radius, thetaCut, phiCut, hardPtCut) ) { @@ -163,6 +166,7 @@ void CAHitQuadrupletGeneratorGPU::launchKernels(const TrackingRegion ®ion, auto numberOfBlocks = (maxNumberOfDoublets_ + blockSize - 1)/blockSize; kernel_connect<<>>( d_foundNtupletsVec_[regionIndex], // needed only to be reset, ready for next kernel + hh.gpu_d, device_theCells_, device_nCells_, device_isOuterHitOfCell_, region.ptMin(), diff --git a/RecoPixelVertexing/PixelTriplets/plugins/CAHitQuadrupletGeneratorGPU.h b/RecoPixelVertexing/PixelTriplets/plugins/CAHitQuadrupletGeneratorGPU.h index c06f1be2f257b..81841fdc9bd0c 100644 --- a/RecoPixelVertexing/PixelTriplets/plugins/CAHitQuadrupletGeneratorGPU.h +++ b/RecoPixelVertexing/PixelTriplets/plugins/CAHitQuadrupletGeneratorGPU.h @@ -21,6 +21,8 @@ #include "RecoTracker/TkSeedingLayers/interface/SeedComparitor.h" #include "RecoTracker/TkSeedingLayers/interface/SeedComparitorFactory.h" + +// FIXME (split header???) #include "GPUCACell.h" class TrackingRegion; diff --git a/RecoPixelVertexing/PixelTriplets/plugins/GPUCACell.h b/RecoPixelVertexing/PixelTriplets/plugins/GPUCACell.h index a9578e7dab9dc..5646b6db4f96b 100644 --- a/RecoPixelVertexing/PixelTriplets/plugins/GPUCACell.h +++ b/RecoPixelVertexing/PixelTriplets/plugins/GPUCACell.h @@ -6,7 +6,6 @@ #include -#include "GPUHitsAndDoublets.h" #include "RecoLocalTracker/SiPixelRecHits/plugins/siPixelRecHitsHeterogeneousProduct.h" #include "HeterogeneousCore/CUDAUtilities/interface/GPUVecArray.h" #include "HeterogeneousCore/CUDAUtilities/interface/GPUSimpleVector.h" @@ -15,40 +14,40 @@ struct Quadruplet { int hitId[4]; }; + class GPUCACell { public: + + using Hits = siPixelRecHitsHeterogeneousProduct::HitsOnGPU; + using hindex_type = siPixelRecHitsHeterogeneousProduct::hindex_type; + GPUCACell() = default; +#ifdef __CUDACC__ - __host__ __device__ - void init(siPixelRecHitsHeterogeneousProduct::HitsOnGPU const & __restrict__ hh, - int layerPairId, int doubletId, int innerHitId,int outerHitId) + __device__ __forceinline__ + void init(Hits const & hh, + int layerPairId, int doubletId, + hindex_type innerHitId, hindex_type outerHitId) { theInnerHitId = innerHitId; theOuterHitId = outerHitId; theDoubletId = doubletId; theLayerPairId = layerPairId; - theInnerX = hh.xg_d[innerHitId]; - theOuterX = hh.xg_d[outerHitId]; - - theInnerY = hh.yg_d[innerHitId]; - theOuterY = hh.yg_d[outerHitId]; - - theInnerZ = hh.zg_d[innerHitId]; - theOuterZ = hh.zg_d[outerHitId]; - theInnerR = hh.rg_d[innerHitId]; - theOuterR = hh.rg_d[outerHitId]; + theInnerZ = __ldg(hh.zg_d+innerHitId); + theInnerR = __ldg(hh.rg_d+innerHitId); theOuterNeighbors.reset(); } - constexpr float get_inner_x() const { return theInnerX; } - constexpr float get_outer_x() const { return theOuterX; } - constexpr float get_inner_y() const { return theInnerY; } - constexpr float get_outer_y() const { return theOuterY; } - constexpr float get_inner_z() const { return theInnerZ; } - constexpr float get_outer_z() const { return theOuterZ; } - constexpr float get_inner_r() const { return theInnerR; } - constexpr float get_outer_r() const { return theOuterR; } + __device__ 
__forceinline__ float get_inner_x(Hits const & hh) const { return __ldg(hh.xg_d+theInnerHitId); } + __device__ __forceinline__ float get_outer_x(Hits const & hh) const { return __ldg(hh.xg_d+theOuterHitId); } + __device__ __forceinline__ float get_inner_y(Hits const & hh) const { return __ldg(hh.yg_d+theInnerHitId); } + __device__ __forceinline__ float get_outer_y(Hits const & hh) const { return __ldg(hh.yg_d+theOuterHitId); } + __device__ __forceinline__ float get_inner_z(Hits const & hh) const { return theInnerZ; } // { return __ldg(hh.zg_d+theInnerHitId); } // { return theInnerZ; } + __device__ __forceinline__ float get_outer_z(Hits const & hh) const { return __ldg(hh.zg_d+theOuterHitId); } + __device__ __forceinline__ float get_inner_r(Hits const & hh) const { return theInnerR; } // { return __ldg(hh.rg_d+theInnerHitId); } // { return theInnerR; } + __device__ __forceinline__ float get_outer_r(Hits const & hh) const { return __ldg(hh.rg_d+theOuterHitId); } + constexpr unsigned int get_inner_hit_id() const { return theInnerHitId; } @@ -56,36 +55,42 @@ class GPUCACell { return theOuterHitId; } - constexpr void print_cell() const { +/* + __host__ __device__ + void print_cell() const { printf("printing cell: %d, on layerPair: %d, innerHitId: %d, outerHitId: " "%d, innerradius %f, outerRadius %f \n", - theDoubletId, theLayerPairId, theInnerHitId, theOuterHitId, - theInnerR, theOuterR); + theDoubletId, theLayerPairId, theInnerHitId, theOuterHitId + ); } +*/ - __host__ __device__ - bool check_alignment( + __device__ + bool check_alignment(Hits const & hh, GPUCACell const & otherCell, const float ptmin, const float region_origin_x, const float region_origin_y, const float region_origin_radius, const float thetaCut, const float phiCut, const float hardPtCut) const { - auto ro = get_outer_r(); - auto zo = get_outer_z(); + auto ri = get_inner_r(hh); + auto zi = get_inner_z(hh); - auto r1 = otherCell.get_inner_r(); - auto z1 = otherCell.get_inner_z(); - bool aligned = areAlignedRZ(r1, z1, ro, zo, ptmin, thetaCut); + auto ro = get_outer_r(hh); + auto zo = get_outer_z(hh); + + auto r1 = otherCell.get_inner_r(hh); + auto z1 = otherCell.get_inner_z(hh); + bool aligned = areAlignedRZ(r1, z1, ri, zi, ro, zo, ptmin, thetaCut); return (aligned && - haveSimilarCurvature(otherCell, ptmin, region_origin_x, + haveSimilarCurvature(hh, otherCell, ptmin, region_origin_x, region_origin_y, region_origin_radius, phiCut, hardPtCut)); } - - constexpr bool areAlignedRZ(float r1, float z1, float ro, float zo, + __device__ __forceinline__ + static bool areAlignedRZ(float r1, float z1, float ri, float zi, float ro, float zo, const float ptmin, - const float thetaCut) const { + const float thetaCut) { float radius_diff = std::abs(r1 - ro); float distance_13_squared = radius_diff * radius_diff + (z1 - zo) * (z1 - zo); @@ -95,25 +100,26 @@ class GPUCACell { // radius_diff later float tan_12_13_half_mul_distance_13_squared = - fabs(z1 * (get_inner_r() - ro) + get_inner_z() * (ro - r1) + zo * (r1 - get_inner_r())); + fabs(z1 * (ri - ro) + zi * (ro - r1) + zo * (r1 - ri)); return tan_12_13_half_mul_distance_13_squared * pMin <= thetaCut * distance_13_squared * radius_diff; } - constexpr bool - haveSimilarCurvature(GPUCACell const & otherCell, + __device__ + bool + haveSimilarCurvature(Hits const & hh, GPUCACell const & otherCell, const float ptmin, const float region_origin_x, const float region_origin_y, const float region_origin_radius, const float phiCut, const float hardPtCut) const { - auto x1 = otherCell.get_inner_x(); - 
auto y1 = otherCell.get_inner_y(); + auto x1 = otherCell.get_inner_x(hh); + auto y1 = otherCell.get_inner_y(hh); - auto x2 = get_inner_x(); - auto y2 = get_inner_y(); + auto x2 = get_inner_x(hh); + auto y2 = get_inner_y(hh); - auto x3 = get_outer_x(); - auto y3 = get_outer_y(); + auto x3 = get_outer_x(hh); + auto y3 = get_outer_y(hh); float distance_13_squared = (x1 - x3) * (x1 - x3) + (y1 - y3) * (y1 - y3); float tan_12_13_half_mul_distance_13_squared = @@ -183,7 +189,7 @@ class GPUCACell { // trying to free the track building process from hardcoded layers, leaving // the visit of the graph based on the neighborhood connections between cells. -#ifdef __CUDACC__ +// #ifdef __CUDACC__ __device__ inline void find_ntuplets( @@ -228,16 +234,10 @@ class GPUCACell { int theLayerPairId; private: - unsigned int theInnerHitId; - unsigned int theOuterHitId; - float theInnerX; - float theOuterX; - float theInnerY; - float theOuterY; float theInnerZ; - float theOuterZ; float theInnerR; - float theOuterR; + hindex_type theInnerHitId; + hindex_type theOuterHitId; }; #endif // RecoPixelVertexing_PixelTriplets_plugins_GPUCACell_h diff --git a/RecoPixelVertexing/PixelTriplets/plugins/gpuPixelDoublets.h b/RecoPixelVertexing/PixelTriplets/plugins/gpuPixelDoublets.h index ccbff0480e6c5..61d048637585c 100644 --- a/RecoPixelVertexing/PixelTriplets/plugins/gpuPixelDoublets.h +++ b/RecoPixelVertexing/PixelTriplets/plugins/gpuPixelDoublets.h @@ -76,8 +76,8 @@ namespace gpuPixelDoublets { // found hit corresponding to our cuda thread, now do the job auto mep = iphi[i]; - auto mez = hh.zg_d[i]; - auto mer = hh.rg_d[i]; + auto mez = __ldg(hh.zg_d+i); + auto mer = __ldg(hh.rg_d+i); constexpr float z0cut = 12.f; // cm constexpr float hardPtCut = 0.5f; // GeV @@ -86,13 +86,13 @@ namespace gpuPixelDoublets { auto ptcut = [&](int j) { auto r2t4 = minRadius2T4; auto ri = mer; - auto ro = hh.rg_d[j]; + auto ro = __ldg(hh.rg_d+j); auto dphi = short2phi( min( abs(int16_t(mep-iphi[j])), abs(int16_t(iphi[j]-mep)) ) ); return dphi*dphi * (r2t4 - ri*ro) > (ro-ri)*(ro-ri); }; auto z0cutoff = [&](int j) { - auto zo = hh.zg_d[j]; - auto ro = hh.rg_d[j]; + auto zo = __ldg(hh.zg_d+j); + auto ro = __ldg(hh.rg_d+j); auto dr = ro-mer; return dr > maxr[pairLayerId] || dr<0 || std::abs((mez*ro - mer*zo)) > z0cut*dr; @@ -115,7 +115,7 @@ namespace gpuPixelDoublets { auto const * __restrict__ p = hist.begin(kk+hoff); auto const * __restrict__ e = hist.end(kk+hoff); for (;p < e; ++p) { - auto oi=*p; + auto oi=__ldg(p); assert(oi>=offsets[outer]); assert(oi Date: Mon, 24 Sep 2018 15:56:31 +0200 Subject: [PATCH 31/36] be consistent with ht index type --- .../CUDAUtilities/interface/GPUSimpleVector.h | 2 +- .../CUDAUtilities/interface/GPUVecArray.h | 2 +- .../plugins/CAHitQuadrupletGeneratorGPU.cu | 9 +++++---- .../plugins/CAHitQuadrupletGeneratorGPU.h | 1 + RecoPixelVertexing/PixelTriplets/plugins/GPUCACell.h | 11 ++++++----- 5 files changed, 14 insertions(+), 11 deletions(-) diff --git a/HeterogeneousCore/CUDAUtilities/interface/GPUSimpleVector.h b/HeterogeneousCore/CUDAUtilities/interface/GPUSimpleVector.h index 74c8731da3712..47592fd2063d6 100644 --- a/HeterogeneousCore/CUDAUtilities/interface/GPUSimpleVector.h +++ b/HeterogeneousCore/CUDAUtilities/interface/GPUSimpleVector.h @@ -87,7 +87,7 @@ template struct SimpleVector { inline constexpr void reset() { m_size = 0; } inline constexpr int size() const { return m_size; } inline constexpr int capacity() const { return m_capacity; } - inline constexpr T *data() const { return m_data; } + inline 
constexpr T const * data() const { return m_data; } inline constexpr void resize(int size) { m_size = size; } inline constexpr void set_data(T * data) { m_data = data; } diff --git a/HeterogeneousCore/CUDAUtilities/interface/GPUVecArray.h b/HeterogeneousCore/CUDAUtilities/interface/GPUVecArray.h index 6083c0a55cd3b..8dcefdce65ab4 100644 --- a/HeterogeneousCore/CUDAUtilities/interface/GPUVecArray.h +++ b/HeterogeneousCore/CUDAUtilities/interface/GPUVecArray.h @@ -85,7 +85,7 @@ template struct VecArray { inline constexpr const T& operator[](int i) const { return m_data[i]; } inline constexpr void reset() { m_size = 0; } inline constexpr int capacity() const { return maxSize; } - inline constexpr T *data() const { return m_data; } + inline constexpr T const * data() const { return m_data; } inline constexpr void resize(int size) { m_size = size; } inline constexpr bool empty() const { return 0 == m_size; } inline constexpr bool full() const { return maxSize == m_size; } diff --git a/RecoPixelVertexing/PixelTriplets/plugins/CAHitQuadrupletGeneratorGPU.cu b/RecoPixelVertexing/PixelTriplets/plugins/CAHitQuadrupletGeneratorGPU.cu index 3574276b69ebd..69ca7f38a5b18 100644 --- a/RecoPixelVertexing/PixelTriplets/plugins/CAHitQuadrupletGeneratorGPU.cu +++ b/RecoPixelVertexing/PixelTriplets/plugins/CAHitQuadrupletGeneratorGPU.cu @@ -46,8 +46,8 @@ kernel_connect(GPU::SimpleVector *foundNtuplets, auto const & hh = *hhp; - float region_origin_x = 0.; - float region_origin_y = 0.; + constexpr float region_origin_x = 0.; + constexpr float region_origin_y = 0.; auto cellIndex = threadIdx.x + blockIdx.x * blockDim.x; @@ -57,8 +57,9 @@ kernel_connect(GPU::SimpleVector *foundNtuplets, auto const & thisCell = cells[cellIndex]; auto innerHitId = thisCell.get_inner_hit_id(); auto numberOfPossibleNeighbors = isOuterHitOfCell[innerHitId].size(); + auto vi = isOuterHitOfCell[innerHitId].data(); for (auto j = 0; j < numberOfPossibleNeighbors; ++j) { - auto otherCell = isOuterHitOfCell[innerHitId][j]; + auto otherCell = __ldg(vi+j); if (thisCell.check_alignment(hh, cells[otherCell], ptmin, region_origin_x, region_origin_y, @@ -80,7 +81,7 @@ __global__ void kernel_find_ntuplets( if (cellIndex >= (*nCells) ) return; auto &thisCell = cells[cellIndex]; if (thisCell.theLayerPairId!=0 && thisCell.theLayerPairId!=3 && thisCell.theLayerPairId!=8) return; // inner layer is 0 FIXME - GPU::VecArray stack; + GPU::VecArray stack; stack.reset(); thisCell.find_ntuplets(cells, foundNtuplets, stack, minHitsPerNtuplet); assert(stack.size()==0); diff --git a/RecoPixelVertexing/PixelTriplets/plugins/CAHitQuadrupletGeneratorGPU.h b/RecoPixelVertexing/PixelTriplets/plugins/CAHitQuadrupletGeneratorGPU.h index 81841fdc9bd0c..f34b323ed2346 100644 --- a/RecoPixelVertexing/PixelTriplets/plugins/CAHitQuadrupletGeneratorGPU.h +++ b/RecoPixelVertexing/PixelTriplets/plugins/CAHitQuadrupletGeneratorGPU.h @@ -38,6 +38,7 @@ class CAHitQuadrupletGeneratorGPU { using HitsOnGPU = siPixelRecHitsHeterogeneousProduct::HitsOnGPU; using HitsOnCPU = siPixelRecHitsHeterogeneousProduct::HitsOnCPU; + using hindex_type = siPixelRecHitsHeterogeneousProduct::hindex_type; static constexpr unsigned int minLayers = 4; typedef OrderedHitSeeds ResultType; diff --git a/RecoPixelVertexing/PixelTriplets/plugins/GPUCACell.h b/RecoPixelVertexing/PixelTriplets/plugins/GPUCACell.h index 5646b6db4f96b..772b802282d31 100644 --- a/RecoPixelVertexing/PixelTriplets/plugins/GPUCACell.h +++ b/RecoPixelVertexing/PixelTriplets/plugins/GPUCACell.h @@ -11,7 +11,8 @@ #include 
"HeterogeneousCore/CUDAUtilities/interface/GPUSimpleVector.h" struct Quadruplet { - int hitId[4]; + using hindex_type = siPixelRecHitsHeterogeneousProduct::hindex_type; + hindex_type hitId[4]; }; @@ -55,15 +56,15 @@ class GPUCACell { return theOuterHitId; } -/* - __host__ __device__ + + __device__ void print_cell() const { printf("printing cell: %d, on layerPair: %d, innerHitId: %d, outerHitId: " "%d, innerradius %f, outerRadius %f \n", theDoubletId, theLayerPairId, theInnerHitId, theOuterHitId ); } -*/ + __device__ bool check_alignment(Hits const & hh, @@ -195,7 +196,7 @@ class GPUCACell { inline void find_ntuplets( GPUCACell const * __restrict__ cells, GPU::SimpleVector *foundNtuplets, - GPU::VecArray &tmpNtuplet, + GPU::VecArray &tmpNtuplet, const unsigned int minHitsPerNtuplet) const { // the building process for a track ends if: From bda658ac0ba64b7a5b0788cb4acc8e74c839d913 Mon Sep 17 00:00:00 2001 From: Vincenzo Innocente Date: Tue, 25 Sep 2018 15:32:53 +0200 Subject: [PATCH 32/36] fill pt2 and index for nv==1 as well --- HeterogeneousCore/CUDAUtilities/interface/radixSort.h | 1 - .../PixelVertexFinding/src/gpuClusterTracks.h | 6 +++++- 2 files changed, 5 insertions(+), 2 deletions(-) diff --git a/HeterogeneousCore/CUDAUtilities/interface/radixSort.h b/HeterogeneousCore/CUDAUtilities/interface/radixSort.h index 83468d854b1e6..5de2a5b6f8807 100644 --- a/HeterogeneousCore/CUDAUtilities/interface/radixSort.h +++ b/HeterogeneousCore/CUDAUtilities/interface/radixSort.h @@ -103,7 +103,6 @@ radixSortImpl(T const * __restrict__ a, uint16_t * ind, uint16_t * ind2, uint32_ for (auto i=first; i Date: Tue, 25 Sep 2018 23:17:37 +0200 Subject: [PATCH 33/36] check cudaGetLastError() after launching each kernel --- HeterogeneousCore/CUDAUtilities/interface/HistoContainer.h | 1 + 1 file changed, 1 insertion(+) diff --git a/HeterogeneousCore/CUDAUtilities/interface/HistoContainer.h b/HeterogeneousCore/CUDAUtilities/interface/HistoContainer.h index e21d03a4d86e3..e13e30594ab6f 100644 --- a/HeterogeneousCore/CUDAUtilities/interface/HistoContainer.h +++ b/HeterogeneousCore/CUDAUtilities/interface/HistoContainer.h @@ -57,6 +57,7 @@ namespace cudautils { cudaMemsetAsync(off,0, 4*Histo::totbins(),stream); auto nblocks = (totSize + nthreads - 1) / nthreads; countFromVector<<>>(h, nh, v, offsets); + cudaCheck(cudaGetLastError()); size_t wss = Histo::totbins(); CubDebugExit(cub::DeviceScan::InclusiveSum(ws, wss, off, off, Histo::totbins(), stream)); cudaMemsetAsync(ws,0, 4*Histo::totbins(),stream); From 3f69a405558a782e8ca10ab786617e9e9dfb053b Mon Sep 17 00:00:00 2001 From: Andrea Bocci Date: Tue, 25 Sep 2018 23:21:38 +0200 Subject: [PATCH 34/36] check cudaGetLastError() after launching each kernel --- .../PixelTriplets/plugins/CAHitQuadrupletGeneratorGPU.cu | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/RecoPixelVertexing/PixelTriplets/plugins/CAHitQuadrupletGeneratorGPU.cu b/RecoPixelVertexing/PixelTriplets/plugins/CAHitQuadrupletGeneratorGPU.cu index 69ca7f38a5b18..53930f467a6c5 100644 --- a/RecoPixelVertexing/PixelTriplets/plugins/CAHitQuadrupletGeneratorGPU.cu +++ b/RecoPixelVertexing/PixelTriplets/plugins/CAHitQuadrupletGeneratorGPU.cu @@ -182,7 +182,6 @@ void CAHitQuadrupletGeneratorGPU::launchKernels(const TrackingRegion ®ion, 4, maxNumberOfDoublets_); cudaCheck(cudaGetLastError()); - numberOfBlocks = (std::max(int(nhits), maxNumberOfDoublets_) + blockSize - 1)/blockSize; kernel_checkOverflows<<>>( d_foundNtupletsVec_[regionIndex], @@ -190,11 +189,11 @@ void 
CAHitQuadrupletGeneratorGPU::launchKernels(const TrackingRegion ®ion, device_isOuterHitOfCell_, nhits, maxNumberOfDoublets_ ); - + cudaCheck(cudaGetLastError()); // kernel_print_found_ntuplets<<<1, 1, 0, cudaStream>>>(d_foundNtupletsVec_[regionIndex], 10); - if(transferToCPU) { + if (transferToCPU) { cudaCheck(cudaMemcpyAsync(h_foundNtupletsVec_[regionIndex], d_foundNtupletsVec_[regionIndex], sizeof(GPU::SimpleVector), cudaMemcpyDeviceToHost, cudaStream)); From a96066857080595c73f61d60d4eb6615e9569808 Mon Sep 17 00:00:00 2001 From: Andrea Bocci Date: Tue, 25 Sep 2018 23:22:59 +0200 Subject: [PATCH 35/36] check cudaGetLastError() after launching each kernel --- .../PixelVertexFinding/src/gpuVertexFinder.cu | 14 ++++---------- 1 file changed, 4 insertions(+), 10 deletions(-) diff --git a/RecoPixelVertexing/PixelVertexFinding/src/gpuVertexFinder.cu b/RecoPixelVertexing/PixelVertexFinding/src/gpuVertexFinder.cu index 6cb08ae817721..b4cb7fa24e6e5 100644 --- a/RecoPixelVertexing/PixelVertexFinding/src/gpuVertexFinder.cu +++ b/RecoPixelVertexing/PixelVertexFinding/src/gpuVertexFinder.cu @@ -53,19 +53,18 @@ namespace gpuVertexFinder { uint32_t ntrks ) { - cudaCheck(cudaMemcpyAsync(onGPU.zt,zt,ntrks*sizeof(float), cudaMemcpyHostToDevice,stream)); cudaCheck(cudaMemcpyAsync(onGPU.ezt2,ezt2,ntrks*sizeof(float), cudaMemcpyHostToDevice,stream)); - cudaCheck(cudaMemcpyAsync(onGPU.ptt2,ptt2,ntrks*sizeof(float), cudaMemcpyHostToDevice,stream)); - assert(onGPU_d); clusterTracks<<<1,1024-256,0,stream>>>(ntrks,onGPU_d,minT,eps,errmax,chi2max); - sortByPt2<<<1,256,0,stream>>>(ntrks,onGPU_d); + cudaCheck(cudaGetLastError()); + sortByPt2<<<1,256,0,stream>>>(ntrks,onGPU_d); + cudaCheck(cudaGetLastError()); cudaCheck(cudaMemcpyAsync(&gpuProduct.nVertices, onGPU.nv, sizeof(uint32_t), cudaMemcpyDeviceToHost, stream)); @@ -73,8 +72,6 @@ namespace gpuVertexFinder { gpuProduct.ivtx.resize(ntrks); cudaCheck(cudaMemcpyAsync(gpuProduct.ivtx.data(),onGPU.iv,sizeof(int32_t)*ntrks, cudaMemcpyDeviceToHost, stream)); - - } Producer::GPUProduct const & Producer::fillResults(cudaStream_t stream) { @@ -98,8 +95,5 @@ namespace gpuVertexFinder { return gpuProduct; } - - - + } // end namespace - From 0e2812cfc49d6f664163f82022739380501d5e68 Mon Sep 17 00:00:00 2001 From: Andrea Bocci Date: Wed, 26 Sep 2018 00:20:19 +0200 Subject: [PATCH 36/36] Declare the direct dependencies Declare the direct dependency on "HeterogeneousCore/CUDAUtilities" rather than the indirect dependency on "cub". --- SimTracker/TrackerHitAssociation/plugins/BuildFile.xml | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) diff --git a/SimTracker/TrackerHitAssociation/plugins/BuildFile.xml b/SimTracker/TrackerHitAssociation/plugins/BuildFile.xml index 4f3e0d27878a1..a63c38264c20d 100644 --- a/SimTracker/TrackerHitAssociation/plugins/BuildFile.xml +++ b/SimTracker/TrackerHitAssociation/plugins/BuildFile.xml @@ -1,15 +1,14 @@ + + + - - - - + -
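A recurring pattern in the commits titled "check cudaGetLastError() after launching each
kernel" is to follow every kernel launch with cudaCheck(cudaGetLastError()), so that launch
errors are reported at the call site instead of being silently deferred to the next CUDA call.
A minimal sketch of that convention is shown below; it assumes only the cudaCheck helper from
HeterogeneousCore/CUDAUtilities already used throughout this series, and the launchAndCheck
wrapper itself is hypothetical, not part of any of these patches.

    #include <cuda_runtime.h>
    #include "HeterogeneousCore/CUDAUtilities/interface/cudaCheck.h"

    // Hypothetical convenience wrapper: launch a kernel and surface any
    // launch error immediately, as the patches above do inline after each launch.
    template <typename Kernel, typename... Args>
    void launchAndCheck(Kernel kernel, dim3 blocks, dim3 threads,
                        size_t sharedBytes, cudaStream_t stream, Args... args) {
      kernel<<<blocks, threads, sharedBytes, stream>>>(args...);
      cudaCheck(cudaGetLastError());   // catches e.g. an invalid launch configuration
    }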