diff --git a/HeterogeneousCore/CUDAUtilities/BuildFile.xml b/HeterogeneousCore/CUDAUtilities/BuildFile.xml index 38ee88b068999..fc6a61ffd433e 100644 --- a/HeterogeneousCore/CUDAUtilities/BuildFile.xml +++ b/HeterogeneousCore/CUDAUtilities/BuildFile.xml @@ -1 +1,2 @@ + diff --git a/HeterogeneousCore/CUDAUtilities/interface/GPUSimpleVector.h b/HeterogeneousCore/CUDAUtilities/interface/GPUSimpleVector.h index 74c8731da3712..47592fd2063d6 100644 --- a/HeterogeneousCore/CUDAUtilities/interface/GPUSimpleVector.h +++ b/HeterogeneousCore/CUDAUtilities/interface/GPUSimpleVector.h @@ -87,7 +87,7 @@ template struct SimpleVector { inline constexpr void reset() { m_size = 0; } inline constexpr int size() const { return m_size; } inline constexpr int capacity() const { return m_capacity; } - inline constexpr T *data() const { return m_data; } + inline constexpr T const * data() const { return m_data; } inline constexpr void resize(int size) { m_size = size; } inline constexpr void set_data(T * data) { m_data = data; } diff --git a/HeterogeneousCore/CUDAUtilities/interface/GPUVecArray.h b/HeterogeneousCore/CUDAUtilities/interface/GPUVecArray.h index 6083c0a55cd3b..8dcefdce65ab4 100644 --- a/HeterogeneousCore/CUDAUtilities/interface/GPUVecArray.h +++ b/HeterogeneousCore/CUDAUtilities/interface/GPUVecArray.h @@ -85,7 +85,7 @@ template struct VecArray { inline constexpr const T& operator[](int i) const { return m_data[i]; } inline constexpr void reset() { m_size = 0; } inline constexpr int capacity() const { return maxSize; } - inline constexpr T *data() const { return m_data; } + inline constexpr T const * data() const { return m_data; } inline constexpr void resize(int size) { m_size = size; } inline constexpr bool empty() const { return 0 == m_size; } inline constexpr bool full() const { return maxSize == m_size; } diff --git a/HeterogeneousCore/CUDAUtilities/interface/HistoContainer.h b/HeterogeneousCore/CUDAUtilities/interface/HistoContainer.h index 94acd0015edf6..e13e30594ab6f 100644 --- a/HeterogeneousCore/CUDAUtilities/interface/HistoContainer.h +++ b/HeterogeneousCore/CUDAUtilities/interface/HistoContainer.h @@ -2,6 +2,7 @@ #define HeterogeneousCore_CUDAUtilities_HistoContainer_h #include +#include #include #include #include @@ -10,96 +11,101 @@ #endif // __CUDA_ARCH__ #include "HeterogeneousCore/CUDAUtilities/interface/cudastdAlgorithm.h" +#ifdef __CUDACC__ +#include "HeterogeneousCore/CUDAUtilities/interface/prefixScan.h" +#include +#endif #include "HeterogeneousCore/CUDAUtilities/interface/cudaCheck.h" #ifdef __CUDACC__ namespace cudautils { - template - __global__ - void zeroMany(Histo * h, uint32_t nh) { - auto i = blockIdx.x * blockDim.x + threadIdx.x; - auto ih = i / Histo::nbins(); - auto k = i - ih * Histo::nbins(); - if (ih < nh) { - h[ih].nspills = 0; - if (k < Histo::nbins()) - h[ih].n[k] = 0; - } - } - template __global__ - void fillFromVector(Histo * h, uint32_t nh, T const * v, uint32_t * offsets) { + void countFromVector(Histo * __restrict__ h, uint32_t nh, T const * __restrict__ v, uint32_t const * __restrict__ offsets) { auto i = blockIdx.x * blockDim.x + threadIdx.x; if(i >= offsets[nh]) return; auto off = cuda_std::upper_bound(offsets, offsets + nh + 1, i); assert((*off) > 0); int32_t ih = off - offsets - 1; assert(ih >= 0); - assert(ih < nh); - h[ih].fill(v[i], i); + assert(ih < nh); + (*h).count(v[i], ih); } template __global__ - void fillFromVector(Histo * h, T const * v, uint32_t size) { + void fillFromVector(Histo * __restrict__ h, uint32_t nh, T const * __restrict__ v, 
uint32_t const * __restrict__ offsets, + uint32_t * __restrict__ ws ) { auto i = blockIdx.x * blockDim.x + threadIdx.x; - if(i < size) h->fill(v[i], i); - } - - template - void zero(Histo * h, uint32_t nh, int nthreads, cudaStream_t stream) { - auto nblocks = (nh * Histo::nbins() + nthreads - 1) / nthreads; - zeroMany<<>>(h, nh); - cudaCheck(cudaGetLastError()); + if(i >= offsets[nh]) return; + auto off = cuda_std::upper_bound(offsets, offsets + nh + 1, i); + assert((*off) > 0); + int32_t ih = off - offsets - 1; + assert(ih >= 0); + assert(ih < nh); + (*h).fill(v[i], i, ws, ih); } - template - void fillOneFromVector(Histo * h, T const * v, uint32_t size, int nthreads, cudaStream_t stream) { - zero(h, 1, nthreads, stream); - auto nblocks = (size + nthreads - 1) / nthreads; - fillFromVector<<>>(h, v, size); - cudaCheck(cudaGetLastError()); - } template - void fillManyFromVector(Histo * h, uint32_t nh, T const * v, uint32_t * offsets, uint32_t totSize, int nthreads, cudaStream_t stream) { - zero(h, nh, nthreads, stream); + void fillManyFromVector(Histo * __restrict__ h, typename Histo::Counter * __restrict__ ws, + uint32_t nh, T const * __restrict__ v, uint32_t const * __restrict__ offsets, uint32_t totSize, + int nthreads, cudaStream_t stream) { + uint32_t * off = (uint32_t *)( (char*)(h) +offsetof(Histo,off)); + cudaMemsetAsync(off,0, 4*Histo::totbins(),stream); auto nblocks = (totSize + nthreads - 1) / nthreads; - fillFromVector<<>>(h, nh, v, offsets); + countFromVector<<>>(h, nh, v, offsets); + cudaCheck(cudaGetLastError()); + size_t wss = Histo::totbins(); + CubDebugExit(cub::DeviceScan::InclusiveSum(ws, wss, off, off, Histo::totbins(), stream)); + cudaMemsetAsync(ws,0, 4*Histo::totbins(),stream); + fillFromVector<<>>(h, nh, v, offsets,ws); cudaCheck(cudaGetLastError()); } + } // namespace cudautils #endif // iteratate over N bins left and right of the one containing "v" -// including spillBin template __host__ __device__ +__forceinline__ void forEachInBins(Hist const & hist, V value, int n, Func func) { - int bs = hist.bin(value); - int be = std::min(int(hist.nbins()),bs+n+1); + int bs = Hist::bin(value); + int be = std::min(int(Hist::nbins()-1),bs+n); bs = std::max(0,bs-n); - assert(be>bs); - for (auto b=bs; b=bs); + for (auto pj=hist.begin(bs);pj +__host__ __device__ +__forceinline__ +void forEachInWindow(Hist const & hist, V wmin, V wmax, Func const & func) { + auto bs = Hist::bin(wmin); + auto be = Hist::bin(wmax); + assert(be>=bs); + for (auto pj=hist.begin(bs);pj class HistoContainer { public: @@ -111,24 +117,55 @@ class HistoContainer { using index_type = I; using UT = typename std::make_unsigned::type; + + static constexpr uint32_t ilog2(uint32_t v) { + + constexpr uint32_t b[] = {0x2, 0xC, 0xF0, 0xFF00, 0xFFFF0000}; + constexpr uint32_t s[] = {1, 2, 4, 8, 16}; + + uint32_t r = 0; // result of log2(v) will go here + for (auto i = 4; i >= 0; i--) if (v & b[i]) { + v >>= s[i]; + r |= s[i]; + } + return r; + } + + static constexpr uint32_t sizeT() { return S; } - static constexpr uint32_t nbins() { return 1 << N; } - static constexpr uint32_t shift() { return sizeT() - N; } - static constexpr uint32_t mask() { return nbins() - 1; } - static constexpr uint32_t binSize() { return 1 << M; } - static constexpr uint32_t spillSize() { return 16 * binSize(); } + static constexpr uint32_t nbins() { return NBINS;} + static constexpr uint32_t nhists() { return NHISTS;} + static constexpr uint32_t totbins() { return NHISTS*NBINS+1;} + static constexpr uint32_t nbits() { return 
ilog2(NBINS-1)+1;} + static constexpr uint32_t capacity() { return SIZE; } + + static constexpr auto histOff(uint32_t nh) { return NBINS*nh; } + +#ifdef __CUDACC__ + __host__ + static size_t wsSize() { + uint32_t * v =nullptr; + void * d_temp_storage = nullptr; + size_t temp_storage_bytes = 0; + cub::DeviceScan::InclusiveSum(d_temp_storage, temp_storage_bytes, v, v, totbins()-1); + return std::max(temp_storage_bytes,size_t(totbins())); + } +#endif + static constexpr UT bin(T t) { - return (t >> shift()) & mask(); + constexpr uint32_t shift = sizeT() - nbits(); + constexpr uint32_t mask = (1<> shift) & mask; } void zero() { - nspills = 0; - for (auto & i : n) + for (auto & i : off) i = 0; } - static constexpr + static __host__ __device__ + __forceinline__ uint32_t atomicIncrement(Counter & x) { #ifdef __CUDA_ARCH__ return atomicAdd(&x, 1); @@ -138,51 +175,72 @@ class HistoContainer { } __host__ __device__ - void fill(T t, index_type j) { - UT b = bin(t); + __forceinline__ + void count(T t) { + uint32_t b = bin(t); assert(b= spillSize(); - } - - constexpr bool full(uint32_t b) const { - return n[b] >= binSize(); + __host__ __device__ + __forceinline__ + void fill(T t, index_type j, Counter * ws) { + uint32_t b = bin(t); + assert(b +#include + +template +__device__ +void +__forceinline__ +warpPrefixScan(T * c, uint32_t i) { + auto x = c[i]; + auto laneId = threadIdx.x & 0x1f; + #pragma unroll + for( int offset = 1 ; offset < 32 ; offset <<= 1 ) { + auto y = __shfl_up_sync(0xffffffff,x, offset); + if(laneId >= offset) x += y; + } + c[i] = x; +} + +// limited to 32*32 elements.... +template +__device__ +void +__forceinline__ +blockPrefixScan(T * c, uint32_t size, T* ws) { + assert(size<=1024); + assert(0==blockDim.x%32); + + auto first = threadIdx.x; + + for (auto i=first; i -__device__ -void radixSortImpl(T const * a, uint16_t * ind, uint32_t size, RF reorder) { +__device__ +void +__forceinline__ +radixSortImpl(T const * __restrict__ a, uint16_t * ind, uint16_t * ind2, uint32_t size, RF reorder) { constexpr int d = 8, w = 8*sizeof(T); constexpr int sb = 1<0); - assert(size<=MaxSize); - assert(blockDim.x==sb); + assert(blockDim.x>=sb); // bool debug = false; // threadIdx.x==0 && blockIdx.x==5; @@ -104,9 +103,8 @@ void radixSortImpl(T const * a, uint16_t * ind, uint32_t size, RF reorder) { for (auto i=first; i= offset) x += y; + if (threadIdx.x= offset) x += y; + } + ct[threadIdx.x] = x; } - ct[threadIdx.x] = x; __syncthreads(); - auto ss = (threadIdx.x/32)*32 -1; - c[threadIdx.x] = ct[threadIdx.x]; - for(int i=ss; i>0; i-=32) c[threadIdx.x] +=ct[i]; - + if (threadIdx.x0; i-=32) c[threadIdx.x] +=ct[i]; + } /* //prefix scan for the nulls (for documentation) if (threadIdx.x==0) @@ -140,27 +141,33 @@ void radixSortImpl(T const * a, uint16_t * ind, uint32_t size, RF reorder) { // broadcast ibs =size-1; __syncthreads(); - while (ibs>0) { + while (__syncthreads_and(ibs>0)) { int i = ibs - threadIdx.x; - cu[threadIdx.x]=-1; - ct[threadIdx.x]=-1; + if (threadIdx.x=0) { - bin = (a[j[i]] >> d*p)&(sb-1); - ct[threadIdx.x]=bin; - atomicMax(&cu[bin],int(i)); + if (threadIdx.x=0) { + bin = (a[j[i]] >> d*p)&(sb-1); + ct[threadIdx.x]=bin; + atomicMax(&cu[bin],int(i)); + } } __syncthreads(); - if (i>=0 && i==cu[bin]) // ensure to keep them in order - for (int ii=threadIdx.x; ii=oi);if(i>=oi) - k[--c[bin]] = j[i-oi]; - } + if (threadIdx.x=0 && i==cu[bin]) // ensure to keep them in order + for (int ii=threadIdx.x; ii=oi);if(i>=oi) + k[--c[bin]] = j[i-oi]; + } + } __syncthreads(); if (bin>=0) assert(c[bin]>=0); - 
if (threadIdx.x==0) ibs-=blockDim.x; + if (threadIdx.x==0) ibs-=sb; __syncthreads(); } @@ -191,6 +198,8 @@ void radixSortImpl(T const * a, uint16_t * ind, uint32_t size, RF reorder) { if (j!=ind) // odd... for (auto i=first; i::value,T>::type* = nullptr > __device__ -void radixSort(T const * a, uint16_t * ind, uint32_t size) { - radixSortImpl(a,ind,size,dummyReorder); +void +__forceinline__ +radixSort(T const * a, uint16_t * ind, uint16_t * ind2, uint32_t size) { + radixSortImpl(a,ind,ind2,size,dummyReorder); } template< @@ -213,8 +224,10 @@ template< typename std::enable_if::value&&std::is_signed::value,T>::type* = nullptr > __device__ -void radixSort(T const * a, uint16_t * ind, uint32_t size) { - radixSortImpl(a,ind,size,reorderSigned); +void +__forceinline__ +radixSort(T const * a, uint16_t * ind, uint16_t * ind2, uint32_t size) { + radixSortImpl(a,ind,ind2,size,reorderSigned); } template< @@ -223,29 +236,47 @@ template< typename std::enable_if::value,T>::type* = nullptr > __device__ -void radixSort(T const * a, uint16_t * ind, uint32_t size) { +void +__forceinline__ + radixSort(T const * a, uint16_t * ind, uint16_t * ind2, uint32_t size) { using I = int; - radixSortImpl((I const *)(a),ind,size,reorderFloat); + radixSortImpl((I const *)(a),ind,ind2, size,reorderFloat); } template __device__ -void radixSortMulti(T * v, uint16_t * index, uint32_t * offsets) { +void +__forceinline__ +radixSortMulti(T const * v, uint16_t * index, uint32_t const * offsets, uint16_t * workspace) { + + extern __shared__ uint16_t ws[]; auto a = v+offsets[blockIdx.x]; - auto ind = index+offsets[blockIdx.x];; + auto ind = index+offsets[blockIdx.x]; + auto ind2 = nullptr==workspace ? ws : workspace+offsets[blockIdx.x]; auto size = offsets[blockIdx.x+1]-offsets[blockIdx.x]; assert(offsets[blockIdx.x+1]>=offsets[blockIdx.x]); - if (size>0) radixSort(a,ind,size); + if (size>0) radixSort(a,ind,ind2,size); } template __global__ -void radixSortMultiWrapper(T * v, uint16_t * index, uint32_t * offsets) { - radixSortMulti(v,index,offsets); +void +__launch_bounds__(256, 4) +radixSortMultiWrapper(T const * v, uint16_t * index, uint32_t const * offsets, uint16_t * workspace) { + radixSortMulti(v,index,offsets, workspace); } +template +__global__ +void +// __launch_bounds__(256, 4) +radixSortMultiWrapper2(T const * v, uint16_t * index, uint32_t const * offsets, uint16_t * workspace) { + radixSortMulti(v,index,offsets, workspace); +} + + #endif // HeterogeneousCoreCUDAUtilities_radixSort_H diff --git a/HeterogeneousCore/CUDAUtilities/test/BuildFile.xml b/HeterogeneousCore/CUDAUtilities/test/BuildFile.xml index ab97c243c4385..1049aed41fd5f 100644 --- a/HeterogeneousCore/CUDAUtilities/test/BuildFile.xml +++ b/HeterogeneousCore/CUDAUtilities/test/BuildFile.xml @@ -19,13 +19,27 @@ + + + + + + + + + + + + + + diff --git a/HeterogeneousCore/CUDAUtilities/test/HistoContainer_t.cpp b/HeterogeneousCore/CUDAUtilities/test/HistoContainer_t.cpp index 61c561f96267d..ca8167ab10894 100644 --- a/HeterogeneousCore/CUDAUtilities/test/HistoContainer_t.cpp +++ b/HeterogeneousCore/CUDAUtilities/test/HistoContainer_t.cpp @@ -6,47 +6,75 @@ #include #include -template +template void go() { std::mt19937 eng; - std::uniform_int_distribution rgen(std::numeric_limits::min(),std::numeric_limits::max()); + + int rmin=std::numeric_limits::min(); + int rmax=std::numeric_limits::max(); + if (NBINS!=128) { + rmin=0; + rmax=NBINS*2-1; + } + + + + std::uniform_int_distribution rgen(rmin,rmax); constexpr int N=12000; T v[N]; - using Hist = HistoContainer; - 
std::cout << "HistoContainer " << Hist::nbins() << ' ' << Hist::binSize() << std::endl; - + using Hist = HistoContainer; + using Hist4 = HistoContainer; + std::cout << "HistoContainer " << Hist::nbits() << ' ' << Hist::nbins() << ' ' << Hist::totbins() << ' ' << Hist::capacity() << ' ' << (rmax-rmin)/Hist::nbins() << std::endl; + std::cout << "bins " << int(Hist::bin(0)) << ' ' << int(Hist::bin(rmin)) << ' ' << int(Hist::bin(rmax)) << std::endl; + std::cout << "HistoContainer4 " << Hist4::nbits() << ' ' << Hist4::nbins() << ' ' << Hist4::totbins() << ' ' << Hist4::capacity() << ' ' << (rmax-rmin)/Hist::nbins() << std::endl; + for (auto nh=0; nh<4; ++nh) std::cout << "bins " << int(Hist4::bin(0))+Hist4::histOff(nh) << ' ' << int(Hist::bin(rmin))+Hist4::histOff(nh) << ' ' << int(Hist::bin(rmax))+Hist4::histOff(nh) << std::endl; + + Hist h; + Hist4 h4; + typename Hist::Counter ws[Hist::totbins()]; + typename Hist4::Counter ws4[Hist4::totbins()]; for (int it=0; it<5; ++it) { for (long long j = 0; j < N; j++) v[j]=rgen(eng); - h.zero(); - for (long long j = 0; j < N; j++) h.fill(v[j],j); - - std::cout << "nspills " << h.nspills << std::endl; + if (it==2) for (long long j = N/2; j < N/2+N/4; j++) v[j]=4; + h.zero();h4.zero(); + assert(h.size()==0);assert(h4.size()==0); + for (auto & i: ws) i=0; + for (auto & i: ws4) i=0; + for (long long j = 0; j < N; j++) { h.count(v[j]); if(j<2000) h4.count(v[j],2); else h4.count(v[j],j%4); } + h.finalize(); h4.finalize(); + assert(h.off[0]==0); + assert(h.size()==N); + assert(h4.off[0]==0); + assert(h4.size()==N); + for (long long j = 0; j < N; j++) { h.fill(v[j],j,ws); if(j<2000) h4.fill(v[j],j,ws4,2); else h4.fill(v[j],j,ws4,j%4); } + assert(h.size()==N); + assert(h4.size()==N); - auto verify = [&](uint32_t i, uint32_t k, uint32_t t1, uint32_t t2) { + auto verify = [&](uint32_t i, uint32_t j, uint32_t k, uint32_t t1, uint32_t t2) { assert(t1=i); } // std::cout << kl << ' ' << kh << std::endl; - for (auto j=h.begin(kl); j(); + go(); + go(); + return 0; } diff --git a/HeterogeneousCore/CUDAUtilities/test/HistoContainer_t.cu b/HeterogeneousCore/CUDAUtilities/test/HistoContainer_t.cu index 73e9f20ae589f..8d640b728e25c 100644 --- a/HeterogeneousCore/CUDAUtilities/test/HistoContainer_t.cu +++ b/HeterogeneousCore/CUDAUtilities/test/HistoContainer_t.cu @@ -34,12 +34,16 @@ void go() { constexpr uint32_t partSize = N/nParts; uint32_t offsets[nParts+1]; - using Hist = HistoContainer; - std::cout << "HistoContainer " << Hist::nbins() << ' ' << Hist::binSize() << ' ' << (std::numeric_limits::max()-std::numeric_limits::min())/Hist::nbins() << std::endl; + using Hist = HistoContainer; + std::cout << "HistoContainer " << (int)(offsetof(Hist,off)) << ' ' + << Hist::nbins() << ' ' << Hist::totbins() << ' ' << Hist::capacity() << ' ' << Hist::wsSize() << ' ' + << (std::numeric_limits::max()-std::numeric_limits::min())/Hist::nbins() << std::endl; - Hist h[nParts]; + Hist h; + + auto h_d = cuda::memory::device::make_unique(current_device, 1); + auto ws_d = cuda::memory::device::make_unique(current_device, Hist::totbins()); - auto h_d = cuda::memory::device::make_unique(current_device, nParts); auto off_d = cuda::memory::device::make_unique(current_device, nParts+1); @@ -62,18 +66,20 @@ void go() { for (long long j = 0; j < N; j++) v[j]=rgen(eng); - if (it==2) { // spill + if (it==2) { // big bin for (long long j = 1000; j < 2000; j++) v[j]= sizeof(T)==1 ? 
22 : 3456; } cuda::memory::copy(v_d.get(), v, N*sizeof(T)); - cudautils::fillManyFromVector(h_d.get(),nParts,v_d.get(),off_d.get(),offsets[10],256,0); - - cuda::memory::copy(&h, h_d.get(), nParts*sizeof(Hist)); + cudautils::fillManyFromVector(h_d.get(),ws_d.get(),nParts,v_d.get(),off_d.get(),offsets[10],256,0); - + cuda::memory::copy(&h, h_d.get(), sizeof(Hist)); + assert(0==h.off[0]); + assert(offsets[10]==h.size()); + + auto verify = [&](uint32_t i, uint32_t k, uint32_t t1, uint32_t t2) { assert(t1 window ) {} else {++tot;} } if (kk==i) { l=false; continue; } - if (l) for (auto p=h[j].begin(kk); p=nm)) { - std::cout << "too bad " << j << ' ' << i <<' ' << me << '/'<< T(me-window)<< '/'<< T(me+window) << ": " << kl << '/' << kh << ' '<< khh << ' '<< tot<<'/'<=nm)) { + std::cout << "too bad " << j << ' ' << i <<' ' << int(me) << '/'<< (int)T(me-window)<< '/'<< (int)T(me+window) << ": " << kl << '/' << kh << ' '<< khh << ' '<< tot<<'/'< +#include +#include +#include +#include + +#include + +template +__global__ +void mykernel(T const * __restrict__ v, uint32_t N) { + + assert(v); + assert(N==12000); + + if (threadIdx.x==0) printf("start kernel for %d data\n",N); + + using Hist = HistoContainer; + constexpr auto wss = Hist::totbins(); + + if (threadIdx.x==0) printf("ws size %d\n",wss); + + __shared__ Hist hist; + __shared__ typename Hist::Counter ws[wss]; + + for (auto j=threadIdx.x; j=k1); + } + + for (auto i=threadIdx.x; i=0 && k::max(); + vm = std::max(vm, 0); + vm = std::min(vm,vmax); + vp = std::min(vp,vmax); + vp = std::max(vp, 0); + assert(vp>=vm); + forEachInWindow(hist, vm,vp, ftest); + int bp = Hist::bin(vp); + int bm = Hist::bin(vm); + rtot = hist.end(bp)-hist.begin(bm); + assert(tot==rtot); + } + + +} + +template +void go() { + + if (cuda::device::count() == 0) { + std::cerr << "No CUDA devices on this system" << "\n"; + exit(EXIT_FAILURE); + } + + auto current_device = cuda::device::current::get(); + + + std::mt19937 eng; + + int rmin=std::numeric_limits::min(); + int rmax=std::numeric_limits::max(); + if (NBINS!=128) { + rmin=0; + rmax=NBINS*2-1; + } + + + + std::uniform_int_distribution rgen(rmin,rmax); + + + constexpr int N=12000; + T v[N]; + + auto v_d = cuda::memory::device::make_unique(current_device, N); + assert(v_d.get()); + + using Hist = HistoContainer; + std::cout << "HistoContainer " << Hist::nbits() << ' ' << Hist::nbins() << ' ' << Hist::capacity() << ' ' << (rmax-rmin)/Hist::nbins() << std::endl; + std::cout << "bins " << int(Hist::bin(0)) << ' ' << int(Hist::bin(rmin)) << ' ' << int(Hist::bin(rmax)) << std::endl; + + for (int it=0; it<5; ++it) { + for (long long j = 0; j < N; j++) v[j]=rgen(eng); + if (it==2) for (long long j = N/2; j < N/2+N/4; j++) v[j]=4; + + assert(v_d.get()); + assert(v); + cuda::memory::copy(v_d.get(), v, N*sizeof(T)); + assert(v_d.get()); + cuda::launch(mykernel,{1,256},v_d.get(),N); + } + +} + +int main() { + go(); + go(); + go(); + + + return 0; +} diff --git a/HeterogeneousCore/CUDAUtilities/test/prefixScan_t.cu b/HeterogeneousCore/CUDAUtilities/test/prefixScan_t.cu new file mode 100644 index 0000000000000..36c4c3b88abe6 --- /dev/null +++ b/HeterogeneousCore/CUDAUtilities/test/prefixScan_t.cu @@ -0,0 +1,86 @@ +#include "HeterogeneousCore/CUDAUtilities/interface/prefixScan.h" +#include + +template +__global__ +void testPrefixScan(uint32_t size) { + + __shared__ T ws[32]; + __shared__ T c[1024]; + auto first = threadIdx.x; + for (auto i=first; i + + +__global__ +void init(uint32_t * v, uint32_t val, uint32_t n) { + auto i = blockIdx.x * 
blockDim.x + threadIdx.x; + if(i<<<1,bs>>>(j); + testPrefixScan<<<1,bs>>>(j); + } + cudaDeviceSynchronize(); + + // Declare, allocate, and initialize device-accessible pointers for input and output + int num_items = 10000; + uint32_t *d_in; + uint32_t *d_out; + + + cudaMalloc(&d_in,num_items*sizeof(uint32_t)); + // cudaMalloc(&d_out,num_items*sizeof(uint32_t)); + + d_out = d_in; + + auto nthreads = 256; + auto nblocks = (num_items + nthreads - 1) / nthreads; + + init<<>>(d_in, 1, num_items); + + // Determine temporary device storage requirements for inclusive prefix sum + void *d_temp_storage = nullptr; + size_t temp_storage_bytes = 0; + cub::DeviceScan::InclusiveSum(d_temp_storage, temp_storage_bytes, d_in, d_out, num_items); + + std::cout << "temp storage " << temp_storage_bytes << std::endl; + + // Allocate temporary storage for inclusive prefix sum + // fake larger ws already available + temp_storage_bytes *=8; + cudaMalloc(&d_temp_storage, temp_storage_bytes); + std::cout << "temp storage " << temp_storage_bytes << std::endl; + // Run inclusive prefix sum + CubDebugExit(cub::DeviceScan::InclusiveSum(d_temp_storage, temp_storage_bytes, d_in, d_out, num_items)); + std::cout << "temp storage " << temp_storage_bytes << std::endl; + + verify<<>>(d_out, num_items); + + cudaDeviceSynchronize(); + + return 0; +} diff --git a/HeterogeneousCore/CUDAUtilities/test/radixSort_t.cu b/HeterogeneousCore/CUDAUtilities/test/radixSort_t.cu index bdc1af0123637..08441c8cc9555 100644 --- a/HeterogeneousCore/CUDAUtilities/test/radixSort_t.cu +++ b/HeterogeneousCore/CUDAUtilities/test/radixSort_t.cu @@ -7,7 +7,7 @@ #include #include #include - +#include #include #include @@ -31,7 +31,7 @@ struct RS { template -void go() { +void go(bool useShared) { std::mt19937 eng; // std::mt19937 eng2; @@ -89,6 +89,7 @@ void go() { auto v_d = cuda::memory::device::make_unique(current_device, N); auto ind_d = cuda::memory::device::make_unique(current_device, N); + auto ws_d = cuda::memory::device::make_unique(current_device, N); auto off_d = cuda::memory::device::make_unique(current_device, blocks+1); cuda::memory::copy(v_d.get(), v, N*sizeof(T)); @@ -96,12 +97,23 @@ void go() { if (i<2) std::cout << "lauch for " << offsets[blocks] << std::endl; + auto ntXBl = 1==i%4 ? 
256 : 256; + delta -= (std::chrono::high_resolution_clock::now()-start); + constexpr int MaxSize = 256*32; + if (useShared) cuda::launch( radixSortMultiWrapper, - { blocks, 256 }, - v_d.get(),ind_d.get(),off_d.get() + { blocks, ntXBl, MaxSize*2 }, + v_d.get(),ind_d.get(),off_d.get(),nullptr ); + else + cuda::launch( + radixSortMultiWrapper2, + { blocks, ntXBl }, + v_d.get(),ind_d.get(),off_d.get(),ws_d.get() + ); + if (i==0) std::cout << "done for " << offsets[blocks] << std::endl; @@ -117,10 +129,13 @@ void go() { std::cout << LL(v[ind[3]]) << ' ' << LL(v[ind[10]]) << ' ' << LL(v[ind[blockSize-1000]]) << std::endl; std::cout << LL(v[ind[blockSize/2-1]]) << ' ' << LL(v[ind[blockSize/2]]) << ' ' << LL(v[ind[blockSize/2+1]]) << std::endl; } - for (int ib=0; ib inds; + if (offsets[ib+1]> offsets[ib]) inds.insert(ind[offsets[ib]]); + for (auto j = offsets[ib]+1; j < offsets[ib+1]; j++) { + inds.insert(ind[j]); auto a = v+offsets[ib]; - auto k1=a[ind[i]]; auto k2=a[ind[i-1]]; + auto k1=a[ind[j]]; auto k2=a[ind[j-1]]; auto sh = sizeof(uint64_t)-NS; sh*=8; auto shorten = [sh](T& t) { auto k = (uint64_t *)(&t); @@ -128,8 +143,15 @@ void go() { }; shorten(k1);shorten(k2); if (k1(); - go(); - go(); - go(); - go(); - go(); - go(); - - go(); - go(); - go(); - // go(); + bool useShared=false; + + std::cout << "using Global memory" << std::endl; + + + go(useShared); + go(useShared); + go(useShared); + go(useShared); + go(useShared); + go(useShared); + go(useShared); + + go(useShared); + go(useShared); + go(useShared); + // go(v); + + useShared=true; + + std::cout << "using Shared memory" << std::endl; + + go(useShared); + go(useShared); + go(useShared); + go(useShared); + go(useShared); + go(useShared); + go(useShared); + + go(useShared); + go(useShared); + go(useShared); + // go(v); + + + return 0; } diff --git a/RecoLocalTracker/SiPixelClusterizer/plugins/SiPixelRawToClusterGPUKernel.cu b/RecoLocalTracker/SiPixelClusterizer/plugins/SiPixelRawToClusterGPUKernel.cu index f3242a11d7ae6..7bd6eac473cc7 100644 --- a/RecoLocalTracker/SiPixelClusterizer/plugins/SiPixelRawToClusterGPUKernel.cu +++ b/RecoLocalTracker/SiPixelClusterizer/plugins/SiPixelRawToClusterGPUKernel.cu @@ -34,6 +34,7 @@ #include "HeterogeneousCore/CUDAUtilities/interface/cudaCheck.h" #include "RecoLocalTracker/SiPixelClusterizer/plugins/gpuCalibPixel.h" #include "RecoLocalTracker/SiPixelClusterizer/plugins/gpuClustering.h" +#include "RecoLocalTracker/SiPixelClusterizer/plugins/gpuClusterChargeCut.h" #include "RecoLocalTracker/SiPixelClusterizer/interface/SiPixelFedCablingMapGPU.h" // local includes @@ -687,11 +688,11 @@ namespace pixelgpudetails { cudaCheck(cudaMemcpyAsync(adc_h, adc_d, wordCounter*sizeof(uint16_t), cudaMemcpyDefault, stream.id())); } - /* - std::cout +#ifdef GPU_DEBUG + std::cout << "CUDA countModules kernel launch with " << blocks << " blocks of " << threadsPerBlock << " threads\n"; - */ +#endif cudaCheck(cudaMemsetAsync(moduleStart_d, 0x00, sizeof(uint32_t), stream.id())); @@ -703,10 +704,10 @@ namespace pixelgpudetails { threadsPerBlock = 256; blocks = MaxNumModules; - /* +#ifdef GPU_DEBUG std::cout << "CUDA findClus kernel launch with " << blocks << " blocks of " << threadsPerBlock << " threads\n"; - */ +#endif cudaCheck(cudaMemsetAsync(clusInModule_d, 0, (MaxNumModules)*sizeof(uint32_t), stream.id())); findClus<<>>( moduleInd_d, @@ -717,6 +718,18 @@ namespace pixelgpudetails { wordCounter); cudaCheck(cudaGetLastError()); + // apply charge cut + clusterChargeCut<<>>( + moduleInd_d, + adc_d, + moduleStart_d, + 
clusInModule_d, moduleId_d, + clus_d, + wordCounter); + cudaCheck(cudaGetLastError()); + + + // count the module start indices already here (instead of // rechits) so that the number of clusters/hits can be made // available in the rechit producer without additional points of diff --git a/RecoLocalTracker/SiPixelClusterizer/plugins/SiPixelRawToClusterHeterogeneous.cc b/RecoLocalTracker/SiPixelClusterizer/plugins/SiPixelRawToClusterHeterogeneous.cc index 7f7e65d2874f1..384a4732b32e1 100644 --- a/RecoLocalTracker/SiPixelClusterizer/plugins/SiPixelRawToClusterHeterogeneous.cc +++ b/RecoLocalTracker/SiPixelClusterizer/plugins/SiPixelRawToClusterHeterogeneous.cc @@ -584,7 +584,7 @@ void SiPixelRawToClusterHeterogeneous::convertGPUtoCPU(edm::Event& ev, } int32_t nclus=-1; - std::vector aclusters(256); + std::vector aclusters(1024); auto totCluseFilled=0; auto fillClusters = [&](uint32_t detId){ @@ -611,6 +611,7 @@ void SiPixelRawToClusterHeterogeneous::convertGPUtoCPU(edm::Event& ev, for (uint32_t i = 0; i < gpu.nDigis; i++) { if (gpu.pdigi_h[i]==0) continue; + if (gpu.clus_h[i]>9000) continue; // not in cluster assert(gpu.rawIdArr_h[i] > 109999); if ( (*detDigis).detId() != gpu.rawIdArr_h[i]) { @@ -625,7 +626,7 @@ void SiPixelRawToClusterHeterogeneous::convertGPUtoCPU(edm::Event& ev, auto const & dig = (*detDigis).data.back(); // fill clusters assert(gpu.clus_h[i]>=0); - assert(gpu.clus_h[i]<256); + assert(gpu.clus_h[i]<1024); nclus = std::max(gpu.clus_h[i],nclus); auto row = dig.row(); auto col = dig.column(); diff --git a/RecoLocalTracker/SiPixelClusterizer/plugins/gpuClusterChargeCut.h b/RecoLocalTracker/SiPixelClusterizer/plugins/gpuClusterChargeCut.h new file mode 100644 index 0000000000000..0284a378ecd39 --- /dev/null +++ b/RecoLocalTracker/SiPixelClusterizer/plugins/gpuClusterChargeCut.h @@ -0,0 +1,97 @@ +#ifndef RecoLocalTracker_SiPixelClusterizer_plugins_gpuClusterChargeCut_h +#define RecoLocalTracker_SiPixelClusterizer_plugins_gpuClusterChargeCut_h + +#include +#include +#include + +#include "gpuClusteringConstants.h" + +#include "HeterogeneousCore/CUDAUtilities/interface/prefixScan.h" + +namespace gpuClustering { + + __global__ void clusterChargeCut( + uint16_t * __restrict__ id, // module id of each pixel (modified if bad cluster) + uint16_t const * __restrict__ adc, // charge of each pixel + uint32_t const * __restrict__ moduleStart, // index of the first pixel of each module + uint32_t * __restrict__ nClustersInModule, // modified: number of clusters found in each module + uint32_t const * __restrict__ moduleId, // module id of each module + int32_t * __restrict__ clusterId, // modified: cluster id of each pixel + int numElements) + { + + if (blockIdx.x >= moduleStart[0]) + return; + + auto firstPixel = moduleStart[1 + blockIdx.x]; + auto thisModuleId = id[firstPixel]; + assert(thisModuleId < MaxNumModules); + assert(thisModuleId==moduleId[blockIdx.x]); + + auto nclus = nClustersInModule[thisModuleId]; + if (nclus==0) return; + + assert(nclus<=MaxNumClustersPerModules); + +#ifdef GPU_DEBUG + if (thisModuleId % 100 == 1) + if (threadIdx.x == 0) + printf("start clusterizer for module %d in block %d\n", thisModuleId, blockIdx.x); +#endif + + auto first = firstPixel + threadIdx.x; + + __shared__ int32_t charge[MaxNumClustersPerModules]; + for (int i=threadIdx.x; ichargeCut ? 
1 : 0; + } + + __syncthreads(); + + // renumber + __shared__ uint16_t ws[32]; + blockPrefixScan(newclusId, nclus, ws); + + assert(nclus>=newclusId[nclus-1]); + + if(nclus==newclusId[nclus-1]) return; + + nClustersInModule[thisModuleId] = newclusId[nclus-1]; + __syncthreads(); + + // mark bad cluster again + for (int i=threadIdx.x; i #include #include #include "gpuClusteringConstants.h" +#include "Geometry/TrackerGeometryBuilder/interface/phase1PixelTopology.h" #include "HeterogeneousCore/CUDAUtilities/interface/HistoContainer.h" + namespace gpuClustering { __global__ void countModules(uint16_t const * __restrict__ id, @@ -32,7 +37,9 @@ namespace gpuClustering { } } - __global__ void findClus(uint16_t const * __restrict__ id, // module id of each pixel + __global__ +// __launch_bounds__(256,4) + void findClus(uint16_t const * __restrict__ id, // module id of each pixel uint16_t const * __restrict__ x, // local coordinates of each pixel uint16_t const * __restrict__ y, // uint32_t const * __restrict__ moduleStart, // index of the first pixel of each module @@ -63,8 +70,6 @@ namespace gpuClustering { __syncthreads(); // skip threads not associated to an existing pixel - bool active = (first < numElements); - if (active) { for (int i = first; i < numElements; i += blockDim.x) { if (id[i] == InvId) // skip invalid pixels continue; @@ -73,95 +78,132 @@ namespace gpuClustering { break; } } - } - - //init hist (ymax < 512) - __shared__ HistoContainer hist; - hist.nspills = 0; - for (auto k = threadIdx.x; k; + constexpr auto wss = Hist::totbins(); + __shared__ Hist hist; + __shared__ typename Hist::Counter ws[wss]; + for (auto j=threadIdx.x; j0 ? y[i]-1 : 0); - auto be = hist.bin(y[i]+1)+1; - auto loop = [&](int j) { - j+=firstPixel; - if (i>=j or j>jm or - std::abs(int(x[j]) - int(x[i])) > 1 or - std::abs(int(y[j]) - int(y[i])) > 1) return; - auto old = atomicMin(&clusterId[j], clusterId[i]); + auto loop = [&](uint16_t const * kk) { + auto m = (*kk)+firstPixel; +#ifdef GPU_DEBUG + assert(m!=i); +#endif + if (std::abs(int(x[m]) - int(x[i])) > 1) return; + // if (std::abs(int(y[m]) - int(y[i])) > 1) return; // binssize is 1 + auto old = atomicMin(&clusterId[m], clusterId[i]); if (old != clusterId[i]) { // end the loop only if no changes were applied - done = false; + more = true; } atomicMin(&clusterId[i], old); +#ifdef CLUS_LIMIT_LOOP // update the loop boundary for the next iteration - jmax[k] = std::max(j + 1,jmax[k]); + jmax[k] = std::max(kk + 1,jmax[k]); +#endif }; - for (auto b=bs; b MaxNumModules + } #endif // RecoLocalTracker_SiPixelClusterizer_plugins_gpuClusteringConstants_h diff --git a/RecoLocalTracker/SiPixelClusterizer/test/BuildFile.xml b/RecoLocalTracker/SiPixelClusterizer/test/BuildFile.xml index 3fc830883ca58..335591b583b58 100644 --- a/RecoLocalTracker/SiPixelClusterizer/test/BuildFile.xml +++ b/RecoLocalTracker/SiPixelClusterizer/test/BuildFile.xml @@ -34,6 +34,15 @@ + + + + + + + + + diff --git a/RecoLocalTracker/SiPixelClusterizer/test/gpuClustering.cu b/RecoLocalTracker/SiPixelClusterizer/test/gpuClustering.cu index 2264be0de02af..a01c1230586fe 100644 --- a/RecoLocalTracker/SiPixelClusterizer/test/gpuClustering.cu +++ b/RecoLocalTracker/SiPixelClusterizer/test/gpuClustering.cu @@ -12,6 +12,7 @@ #include #include "RecoLocalTracker/SiPixelClusterizer/plugins/gpuClustering.h" +#include "RecoLocalTracker/SiPixelClusterizer/plugins/gpuClusterChargeCut.h" int main(void) { @@ -30,7 +31,6 @@ int main(void) auto h_clus = std::make_unique(numElements); - auto h_debug = 
std::make_unique(numElements); auto current_device = cuda::device::current::get(); auto d_id = cuda::memory::device::make_unique(current_device, numElements); auto d_x = cuda::memory::device::make_unique(current_device, numElements); @@ -44,12 +44,29 @@ int main(void) auto d_clusInModule = cuda::memory::device::make_unique(current_device, MaxNumModules); auto d_moduleId = cuda::memory::device::make_unique(current_device, MaxNumModules); - auto d_debug = cuda::memory::device::make_unique(current_device, numElements); // later random number int n=0; int ncl=0; int y[10]={5,7,9,1,3,0,4,8,2,6}; + auto generateClusters = [&](int kn) { + auto addBigNoise = 1==kn%2; + if (addBigNoise) { + constexpr int MaxPixels = 1000; + int id = 666; + for (int x=0; x<140; x+=3) { + for (int yy=0; yy<400; yy+=3) { + h_id[n]=id; + h_x[n]=x; + h_y[n]=yy; + h_adc[n]=1000; + ++n; ++ncl; + if (MaxPixels<=ncl) break; + } + if (MaxPixels<=ncl) break; + } + } + { // isolated int id = 42; @@ -58,15 +75,56 @@ int main(void) h_id[n]=id; h_x[n]=x; h_y[n]=x; - h_adc[n]=100; + h_adc[n]= kn==0 ? 100 : 5000; + ++n; + + // first column + ++ncl; + h_id[n]=id; + h_x[n]=x; + h_y[n]=0; + h_adc[n]= 5000; + ++n; + // first columns + ++ncl; + h_id[n]=id; + h_x[n]=x+80; + h_y[n]=2; + h_adc[n]= 5000; + ++n; + h_id[n]=id; + h_x[n]=x+80; + h_y[n]=1; + h_adc[n]= 5000; + ++n; + + // last column + ++ncl; + h_id[n]=id; + h_x[n]=x; + h_y[n]=415; + h_adc[n]= 5000; + ++n; + // last columns + ++ncl; + h_id[n]=id; + h_x[n]=x+80; + h_y[n]=415; + h_adc[n]= 2500; + ++n; + h_id[n]=id; + h_x[n]=x+80; + h_y[n]=414; + h_adc[n]= 2500; ++n; + // diagonal ++ncl; for (int x=20; x<25; ++x) { h_id[n]=id; h_x[n]=x; h_y[n]=x; - h_adc[n]=100; + h_adc[n]=1000; ++n; } ++ncl; @@ -75,7 +133,7 @@ int main(void) h_id[n]=id; h_x[n]=x; h_y[n]=x; - h_adc[n]=100; + h_adc[n]=1000; ++n; } ++ncl; @@ -86,7 +144,7 @@ int main(void) h_id[n]=id; h_x[n]=xx[k]; h_y[n]=20+xx[k]; - h_adc[n]=100; + h_adc[n]=1000; ++n; } // holes @@ -95,13 +153,13 @@ int main(void) h_id[n]=id; h_x[n]=xx[k]; h_y[n]=100; - h_adc[n]=100; + h_adc[n]= kn==2 ? 100 : 1000; ++n; if (xx[k]%2==0) { h_id[n]=id; h_x[n]=xx[k]; h_y[n]=101; - h_adc[n]=100; + h_adc[n]=1000; ++n; } } @@ -114,7 +172,7 @@ int main(void) h_id[n]=id; h_x[n]=x; h_y[n]=x; - h_adc[n]=100; + h_adc[n]=5000; ++n; } // all odd id @@ -132,7 +190,7 @@ int main(void) h_id[n]=id; h_x[n]=x+1; h_y[n]=x+y[k]+2; - h_adc[n]=100; + h_adc[n]=1000; ++n; } } else { @@ -140,24 +198,31 @@ int main(void) h_id[n]=id; h_x[n]=x; h_y[n]=x+y[9-k]; - h_adc[n]=100; + h_adc[n]= kn==2 ? 10 : 1000; ++n; if (y[k]==3) continue; // hole if (id==51) {h_id[n++]=InvId; h_id[n++]=InvId; }// error h_id[n]=id; h_x[n]=x+1; h_y[n]=x+y[k]+2; - h_adc[n]=100; + h_adc[n]= kn==2 ? 
10 : 1000; ++n; } } } } + }; // end lambda + for (auto kkk=0; kkk<5; ++kkk) { + n=0; ncl=0; + generateClusters(kkk); + std::cout << "created " << n << " digis in " << ncl << " clusters" << std::endl; assert(n<=numElements); + + size_t size32 = n * sizeof(unsigned int); size_t size16 = n * sizeof(unsigned short); - size_t size8 = n * sizeof(uint8_t); + // size_t size8 = n * sizeof(uint8_t); uint32_t nModules=0; cuda::memory::copy(d_moduleStart.get(),&nModules,sizeof(uint32_t)); @@ -165,10 +230,9 @@ int main(void) cuda::memory::copy(d_id.get(), h_id.get(), size16); cuda::memory::copy(d_x.get(), h_x.get(), size16); cuda::memory::copy(d_y.get(), h_y.get(), size16); - cuda::memory::copy(d_adc.get(), h_adc.get(), size8); - cuda::memory::device::zero(d_debug.get(),size32); + cuda::memory::copy(d_adc.get(), h_adc.get(), size16); // Launch CUDA Kernels - int threadsPerBlock = 256; + int threadsPerBlock = (kkk==5) ? 512 : ((kkk==3) ? 64 : 256); int blocksPerGrid = (numElements + threadsPerBlock - 1) / threadsPerBlock; std::cout << "CUDA countModules kernel launch with " << blocksPerGrid @@ -180,7 +244,6 @@ int main(void) d_id.get(), d_moduleStart.get() ,d_clus.get(),n ); - threadsPerBlock = 256; blocksPerGrid = MaxNumModules; //nModules; std::cout @@ -198,32 +261,77 @@ int main(void) d_clus.get(), n ); + cudaDeviceSynchronize(); + + cuda::memory::copy(&nModules,d_moduleStart.get(),sizeof(uint32_t)); + + uint32_t nclus[MaxNumModules], moduleId[nModules]; + + cuda::memory::copy(&nclus,d_clusInModule.get(),MaxNumModules*sizeof(uint32_t)); + std::cout << "before charge cut found " << std::accumulate(nclus,nclus+MaxNumModules,0) << " clusters" << std::endl; + for (auto i=MaxNumModules; i>0; i--) if (nclus[i-1]>0) {std::cout << "last module is " << i-1 << ' ' << nclus[i-1] << std::endl; break;} + if (ncl!=std::accumulate(nclus,nclus+MaxNumModules,0)) std::cout << "ERROR!!!!! 
wrong number of cluster found" << std::endl; + + cuda::launch( + clusterChargeCut, + { blocksPerGrid, threadsPerBlock }, + d_id.get(), d_adc.get(), + d_moduleStart.get(), + d_clusInModule.get(), d_moduleId.get(), + d_clus.get(), + n + ); + + + cudaDeviceSynchronize(); - cuda::memory::copy(&nModules,d_moduleStart.get(),sizeof(uint32_t)); std::cout << "found " << nModules << " Modules active" << std::endl; - uint32_t nclus[MaxNumModules], moduleId[nModules]; + cuda::memory::copy(h_id.get(), d_id.get(), size16); cuda::memory::copy(h_clus.get(), d_clus.get(), size32); cuda::memory::copy(&nclus,d_clusInModule.get(),MaxNumModules*sizeof(uint32_t)); cuda::memory::copy(&moduleId,d_moduleId.get(),nModules*sizeof(uint32_t)); - cuda::memory::copy(h_debug.get(), d_debug.get(), size32); - auto p = std::minmax_element(h_debug.get(),h_debug.get()+n); - std::cout << "debug " << *p.first << ' ' << *p.second << std::endl; std::set clids; - std::vector seeds; for (int i=0; i=0); assert(h_clus[i]0; i--) if (nclus[i-1]>0) {std::cout << "last module is " << i-1 << ' ' << nclus[i-1] << std::endl; break;} // << " and " << seeds.size() << " seeds" << std::endl; - + } /// end loop kkk return 0; } diff --git a/RecoLocalTracker/SiPixelRecHits/BuildFile.xml b/RecoLocalTracker/SiPixelRecHits/BuildFile.xml index c52545a601341..7918c7a4f4d9a 100644 --- a/RecoLocalTracker/SiPixelRecHits/BuildFile.xml +++ b/RecoLocalTracker/SiPixelRecHits/BuildFile.xml @@ -14,7 +14,8 @@ - + + diff --git a/RecoLocalTracker/SiPixelRecHits/plugins/PixelRecHits.cu b/RecoLocalTracker/SiPixelRecHits/plugins/PixelRecHits.cu index e2165471c3386..c63466f157a1b 100644 --- a/RecoLocalTracker/SiPixelRecHits/plugins/PixelRecHits.cu +++ b/RecoLocalTracker/SiPixelRecHits/plugins/PixelRecHits.cu @@ -8,7 +8,7 @@ // CMSSW headers #include "RecoLocalTracker/SiPixelClusterizer/plugins/SiPixelRawToClusterGPUKernel.h" #include "HeterogeneousCore/CUDAUtilities/interface/cudaCheck.h" -#include "RecoLocalTracker/SiPixelClusterizer/plugins/gpuClustering.h" +#include "RecoLocalTracker/SiPixelClusterizer/plugins/gpuClusteringConstants.h" #include "PixelRecHits.h" #include "gpuPixelRecHits.h" @@ -34,7 +34,7 @@ namespace { namespace pixelgpudetails { PixelRecHitGPUKernel::PixelRecHitGPUKernel(cuda::stream_t<>& cudaStream) { - constexpr auto MAX_HITS = gpuClustering::MaxNumModules * 256; + constexpr auto MAX_HITS = siPixelRecHitsHeterogeneousProduct::maxHits(); cudaCheck(cudaMalloc((void **) & gpu_.bs_d, 3 * sizeof(float))); cudaCheck(cudaMalloc((void **) & gpu_.hitsLayerStart_d, 11 * sizeof(uint32_t))); @@ -70,7 +70,8 @@ namespace pixelgpudetails { gpu_.iphi_d = slicePitch(gpu_.owner_16bit_, gpu_.owner_16bit_pitch_, 3); gpu_.sortIndex_d = slicePitch(gpu_.owner_16bit_, gpu_.owner_16bit_pitch_, 4); - cudaCheck(cudaMalloc((void **) & gpu_.hist_d, 10 * sizeof(HitsOnGPU::Hist))); + cudaCheck(cudaMalloc((void **) & gpu_.hist_d, sizeof(HitsOnGPU::Hist))); + cudaCheck(cudaMalloc((void **) & gpu_.hws_d, 4*HitsOnGPU::Hist::totbins())); cudaCheck(cudaMalloc((void **) & gpu_d, sizeof(HitsOnGPU))); gpu_.me_d = gpu_d; cudaCheck(cudaMemcpyAsync(gpu_d, &gpu_, sizeof(HitsOnGPU), cudaMemcpyDefault, cudaStream.id())); @@ -109,6 +110,7 @@ namespace pixelgpudetails { cudaCheck(cudaFree(gpu_.owner_32bit_)); cudaCheck(cudaFree(gpu_.owner_16bit_)); cudaCheck(cudaFree(gpu_.hist_d)); + cudaCheck(cudaFree(gpu_.hws_d)); cudaCheck(cudaFree(gpu_d)); cudaCheck(cudaFree(d_phase1TopologyLayerStart_)); @@ -132,6 +134,10 @@ namespace pixelgpudetails { int threadsPerBlock = 256; int blocks = 
input.nModules; // active modules (with digis) + +#ifdef GPU_DEBUG + std::cout << "launching getHits kernel for " << blocks << " blocks" << std::endl; +#endif gpuPixelRecHits::getHits<<>>( cpeParams, gpu_.bs_d, @@ -188,6 +194,6 @@ namespace pixelgpudetails { // radixSortMultiWrapper<<<10, 256, 0, c.stream>>>(gpu_.iphi_d, gpu_.sortIndex_d, gpu_.hitsLayerStart_d); } - cudautils::fillManyFromVector(gpu_.hist_d, 10, gpu_.iphi_d, gpu_.hitsLayerStart_d, nhits_, 256, stream.id()); + cudautils::fillManyFromVector(gpu_.hist_d, gpu_.hws_d, 10, gpu_.iphi_d, gpu_.hitsLayerStart_d, nhits_, 256, stream.id()); } } diff --git a/RecoLocalTracker/SiPixelRecHits/plugins/SiPixelRecHitHeterogeneous.cc b/RecoLocalTracker/SiPixelRecHits/plugins/SiPixelRecHitHeterogeneous.cc index 6065fa3cb274a..f0c3b42b30eba 100644 --- a/RecoLocalTracker/SiPixelRecHits/plugins/SiPixelRecHitHeterogeneous.cc +++ b/RecoLocalTracker/SiPixelRecHits/plugins/SiPixelRecHitHeterogeneous.cc @@ -218,6 +218,7 @@ void SiPixelRecHitHeterogeneous::run(const edm::HandledetId(); @@ -234,7 +235,7 @@ void SiPixelRecHitHeterogeneous::run(const edm::Handle=96 && hoc.charge[fc+i]<4000) ) continue; + if( hoc.charge[fc+i]<2000 || (gind>=96 && hoc.charge[fc+i]<4000) ) { ++numberOfLostClusters; continue;} ind[ngh]=i;std::push_heap(ind, ind+ngh+1,[&](auto a, auto b) { return mrp[a] MaxNumModules - - - constexpr uint32_t MaxClusInModule = pixelCPEforGPU::MaxClusInModule; - - using ClusParams = pixelCPEforGPU::ClusParams; - - __global__ void getHits(pixelCPEforGPU::ParamsOnGPU const * __restrict__ cpeParams, float const * __restrict__ bs, uint16_t const * __restrict__ id, @@ -42,21 +33,44 @@ namespace gpuPixelRecHits { float * xe, float * ye, uint16_t * mr, uint16_t * mc) { + + // to be moved in common namespace... + constexpr uint16_t InvId=9999; // must be > MaxNumModules + constexpr uint32_t MaxClusInModule = pixelCPEforGPU::MaxClusInModule; + + using ClusParams = pixelCPEforGPU::ClusParams; + + // as usual one block per module __shared__ ClusParams clusParams; auto first = digiModuleStart[1 + blockIdx.x]; - auto me = id[first]; - assert(moduleId[blockIdx.x] == me); + auto me = moduleId[blockIdx.x]; auto nclus = clusInModule[me]; + if (0==nclus) return; + +#ifdef GPU_DEBUG + if (threadIdx.x==0) { + auto k=first; + while (id[k]==InvId) ++k; + assert(id[k]==me); + } +#endif + #ifdef GPU_DEBUG if (me%100==1) if (threadIdx.x==0) printf("hitbuilder: %d clusters in module %d. will write at %d\n", nclus, me, hitsModuleStart[me]); #endif assert(blockDim.x >= MaxClusInModule); - assert(nclus <= MaxClusInModule); + + if (threadIdx.x==0 && nclus > MaxClusInModule) { + printf("WARNING: too many clusters %d in Module %d. 
Only first %d processed\n", nclus,me,MaxClusInModule); + // zero charge: do not bother to do it in parallel + for (auto d=MaxClusInModule; d= nclus) continue; atomicMin(&clusParams.minRow[clus[i]], x[i]); atomicMax(&clusParams.maxRow[clus[i]], x[i]); atomicMin(&clusParams.minCol[clus[i]], y[i]); @@ -93,6 +107,7 @@ namespace gpuPixelRecHits { for (int i = first; i < numElements; i += blockDim.x) { if (id[i] == InvId) continue; // not valid if (id[i] != me) break; // end of module + if (clus[i] >= nclus) continue; atomicAdd(&clusParams.charge[clus[i]], adc[i]); if (clusParams.minRow[clus[i]]==x[i]) atomicAdd(&clusParams.Q_f_X[clus[i]], adc[i]); if (clusParams.maxRow[clus[i]]==x[i]) atomicAdd(&clusParams.Q_l_X[clus[i]], adc[i]); diff --git a/RecoLocalTracker/SiPixelRecHits/plugins/siPixelRecHitsHeterogeneousProduct.h b/RecoLocalTracker/SiPixelRecHits/plugins/siPixelRecHitsHeterogeneousProduct.h index 778b763b28cff..9d0fe7a279799 100644 --- a/RecoLocalTracker/SiPixelRecHits/plugins/siPixelRecHitsHeterogeneousProduct.h +++ b/RecoLocalTracker/SiPixelRecHits/plugins/siPixelRecHitsHeterogeneousProduct.h @@ -17,6 +17,9 @@ namespace siPixelRecHitsHeterogeneousProduct { using CPUProduct = int; // dummy + static constexpr uint32_t maxHits() { return 65536;} + using hindex_type = uint16_t; // if above is <=2^16 + struct HitsOnGPU{ pixelCPEforGPU::ParamsOnGPU const * cpeParams = nullptr; // forwarded from setup, NOT owned float * bs_d; @@ -32,8 +35,9 @@ namespace siPixelRecHitsHeterogeneousProduct { uint16_t * mr_d; uint16_t * mc_d; - using Hist = HistoContainer; + using Hist = HistoContainer; Hist * hist_d; + typename Hist::Counter * hws_d; HitsOnGPU const * me_d = nullptr; diff --git a/RecoPixelVertexing/PixelTriplets/plugins/CAHitQuadrupletGeneratorGPU.cu b/RecoPixelVertexing/PixelTriplets/plugins/CAHitQuadrupletGeneratorGPU.cu index 101095c5a9e4d..66d6d56286008 100644 --- a/RecoPixelVertexing/PixelTriplets/plugins/CAHitQuadrupletGeneratorGPU.cu +++ b/RecoPixelVertexing/PixelTriplets/plugins/CAHitQuadrupletGeneratorGPU.cu @@ -164,8 +164,8 @@ KernelLineFitAllHits(GPU::SimpleVector * foundNtuplets, __global__ void kernel_checkOverflows(GPU::SimpleVector *foundNtuplets, - GPUCACell *cells, uint32_t const * nCells, - GPU::VecArray< unsigned int, 256> *isOuterHitOfCell, + GPUCACell const * __restrict__ cells, uint32_t const * __restrict__ nCells, + GPU::VecArray< unsigned int, 256> const * __restrict__ isOuterHitOfCell, uint32_t nHits, uint32_t maxNumberOfDoublets) { auto idx = threadIdx.x + blockIdx.x * blockDim.x; @@ -185,31 +185,36 @@ kernel_checkOverflows(GPU::SimpleVector *foundNtuplets, } -__global__ void +__global__ +void kernel_connect(GPU::SimpleVector *foundNtuplets, - GPUCACell *cells, uint32_t const * nCells, - GPU::VecArray< unsigned int, 256> *isOuterHitOfCell, + GPUCACell::Hits const * __restrict__ hhp, + GPUCACell * cells, uint32_t const * __restrict__ nCells, + GPU::VecArray< unsigned int, 256> const * __restrict__ isOuterHitOfCell, float ptmin, float region_origin_radius, const float thetaCut, const float phiCut, const float hardPtCut, unsigned int maxNumberOfDoublets_, unsigned int maxNumberOfHits_) { - float region_origin_x = 0.; - float region_origin_y = 0.; + auto const & hh = *hhp; + + constexpr float region_origin_x = 0.; + constexpr float region_origin_y = 0.; auto cellIndex = threadIdx.x + blockIdx.x * blockDim.x; if (0==cellIndex) foundNtuplets->reset(); // ready for next kernel if (cellIndex >= (*nCells) ) return; - auto &thisCell = cells[cellIndex]; + auto const & thisCell = 
cells[cellIndex]; auto innerHitId = thisCell.get_inner_hit_id(); auto numberOfPossibleNeighbors = isOuterHitOfCell[innerHitId].size(); + auto vi = isOuterHitOfCell[innerHitId].data(); for (auto j = 0; j < numberOfPossibleNeighbors; ++j) { - auto otherCell = isOuterHitOfCell[innerHitId][j]; + auto otherCell = __ldg(vi+j); - if (thisCell.check_alignment_and_tag( - cells, otherCell, ptmin, region_origin_x, region_origin_y, + if (thisCell.check_alignment(hh, + cells[otherCell], ptmin, region_origin_x, region_origin_y, region_origin_radius, thetaCut, phiCut, hardPtCut) ) { cells[otherCell].theOuterNeighbors.push_back(cellIndex); @@ -218,7 +223,7 @@ kernel_connect(GPU::SimpleVector *foundNtuplets, } __global__ void kernel_find_ntuplets( - GPUCACell *cells, uint32_t const * nCells, + GPUCACell * const __restrict__ cells, uint32_t const * nCells, GPU::SimpleVector *foundNtuplets, unsigned int minHitsPerNtuplet, unsigned int maxNumberOfDoublets_) @@ -228,7 +233,7 @@ __global__ void kernel_find_ntuplets( if (cellIndex >= (*nCells) ) return; auto &thisCell = cells[cellIndex]; if (thisCell.theLayerPairId!=0 && thisCell.theLayerPairId!=3 && thisCell.theLayerPairId!=8) return; // inner layer is 0 FIXME - GPU::VecArray stack; + GPU::VecArray stack; stack.reset(); thisCell.find_ntuplets(cells, foundNtuplets, stack, minHitsPerNtuplet); assert(stack.size()==0); @@ -336,9 +341,11 @@ void CAHitQuadrupletGeneratorGPU::launchKernels(const TrackingRegion ®ion, auto nhits = hh.nHits; assert(nhits <= PixelGPUConstants::maxNumberOfHits); - auto numberOfBlocks = (maxNumberOfDoublets_ + 512 - 1)/512; - kernel_connect<<>>( + auto blockSize = 64; + auto numberOfBlocks = (maxNumberOfDoublets_ + blockSize - 1)/blockSize; + kernel_connect<<>>( d_foundNtupletsVec_[regionIndex], // needed only to be reset, ready for next kernel + hh.gpu_d, device_theCells_, device_nCells_, device_isOuterHitOfCell_, region.ptMin(), @@ -347,21 +354,20 @@ void CAHitQuadrupletGeneratorGPU::launchKernels(const TrackingRegion ®ion, ); cudaCheck(cudaGetLastError()); - kernel_find_ntuplets<<>>( + kernel_find_ntuplets<<>>( device_theCells_, device_nCells_, d_foundNtupletsVec_[regionIndex], 4, maxNumberOfDoublets_); cudaCheck(cudaGetLastError()); - - numberOfBlocks = (std::max(int(nhits), maxNumberOfDoublets_) + 512 - 1)/512; - kernel_checkOverflows<<>>( + numberOfBlocks = (std::max(int(nhits), maxNumberOfDoublets_) + blockSize - 1)/blockSize; + kernel_checkOverflows<<>>( d_foundNtupletsVec_[regionIndex], device_theCells_, device_nCells_, device_isOuterHitOfCell_, nhits, maxNumberOfDoublets_ ); - + cudaCheck(cudaGetLastError()); // kernel_print_found_ntuplets<<<1, 1, 0, cudaStream>>>(d_foundNtupletsVec_[regionIndex], 10); @@ -383,7 +389,7 @@ void CAHitQuadrupletGeneratorGPU::launchKernels(const TrackingRegion ®ion, line_fit_resultsGPU_); cudaCheck(cudaGetLastError()); - if(transferToCPU) { + if (transferToCPU) { cudaCheck(cudaMemcpyAsync(h_foundNtupletsVec_[regionIndex], d_foundNtupletsVec_[regionIndex], sizeof(GPU::SimpleVector), cudaMemcpyDeviceToHost, cudaStream)); diff --git a/RecoPixelVertexing/PixelTriplets/plugins/CAHitQuadrupletGeneratorGPU.h b/RecoPixelVertexing/PixelTriplets/plugins/CAHitQuadrupletGeneratorGPU.h index 589ae994132ad..fdceec9225d76 100644 --- a/RecoPixelVertexing/PixelTriplets/plugins/CAHitQuadrupletGeneratorGPU.h +++ b/RecoPixelVertexing/PixelTriplets/plugins/CAHitQuadrupletGeneratorGPU.h @@ -23,6 +23,8 @@ #include "RecoPixelVertexing/PixelTriplets/plugins/RecHitsMap.h" #include 
"RecoPixelVertexing/PixelTrackFitting/interface/RiemannFit.h" + +// FIXME (split header???) #include "GPUCACell.h" class TrackingRegion; @@ -38,6 +40,7 @@ class CAHitQuadrupletGeneratorGPU { using HitsOnGPU = siPixelRecHitsHeterogeneousProduct::HitsOnGPU; using HitsOnCPU = siPixelRecHitsHeterogeneousProduct::HitsOnCPU; + using hindex_type = siPixelRecHitsHeterogeneousProduct::hindex_type; static constexpr unsigned int minLayers = 4; typedef OrderedHitSeeds ResultType; diff --git a/RecoPixelVertexing/PixelTriplets/plugins/GPUCACell.h b/RecoPixelVertexing/PixelTriplets/plugins/GPUCACell.h index 5995d286fc38d..772b802282d31 100644 --- a/RecoPixelVertexing/PixelTriplets/plugins/GPUCACell.h +++ b/RecoPixelVertexing/PixelTriplets/plugins/GPUCACell.h @@ -6,49 +6,49 @@ #include -#include "GPUHitsAndDoublets.h" #include "RecoLocalTracker/SiPixelRecHits/plugins/siPixelRecHitsHeterogeneousProduct.h" #include "HeterogeneousCore/CUDAUtilities/interface/GPUVecArray.h" #include "HeterogeneousCore/CUDAUtilities/interface/GPUSimpleVector.h" struct Quadruplet { - int hitId[4]; + using hindex_type = siPixelRecHitsHeterogeneousProduct::hindex_type; + hindex_type hitId[4]; }; + class GPUCACell { public: + + using Hits = siPixelRecHitsHeterogeneousProduct::HitsOnGPU; + using hindex_type = siPixelRecHitsHeterogeneousProduct::hindex_type; + GPUCACell() = default; +#ifdef __CUDACC__ - __host__ __device__ - void init(siPixelRecHitsHeterogeneousProduct::HitsOnGPU const & hh, - int layerPairId, int doubletId, int innerHitId,int outerHitId) + __device__ __forceinline__ + void init(Hits const & hh, + int layerPairId, int doubletId, + hindex_type innerHitId, hindex_type outerHitId) { theInnerHitId = innerHitId; theOuterHitId = outerHitId; theDoubletId = doubletId; theLayerPairId = layerPairId; - theInnerX = hh.xg_d[innerHitId]; - theOuterX = hh.xg_d[outerHitId]; - - theInnerY = hh.yg_d[innerHitId]; - theOuterY = hh.yg_d[outerHitId]; - - theInnerZ = hh.zg_d[innerHitId]; - theOuterZ = hh.zg_d[outerHitId]; - theInnerR = hh.rg_d[innerHitId]; - theOuterR = hh.rg_d[outerHitId]; + theInnerZ = __ldg(hh.zg_d+innerHitId); + theInnerR = __ldg(hh.rg_d+innerHitId); theOuterNeighbors.reset(); } - constexpr float get_inner_x() const { return theInnerX; } - constexpr float get_outer_x() const { return theOuterX; } - constexpr float get_inner_y() const { return theInnerY; } - constexpr float get_outer_y() const { return theOuterY; } - constexpr float get_inner_z() const { return theInnerZ; } - constexpr float get_outer_z() const { return theOuterZ; } - constexpr float get_inner_r() const { return theInnerR; } - constexpr float get_outer_r() const { return theOuterR; } + __device__ __forceinline__ float get_inner_x(Hits const & hh) const { return __ldg(hh.xg_d+theInnerHitId); } + __device__ __forceinline__ float get_outer_x(Hits const & hh) const { return __ldg(hh.xg_d+theOuterHitId); } + __device__ __forceinline__ float get_inner_y(Hits const & hh) const { return __ldg(hh.yg_d+theInnerHitId); } + __device__ __forceinline__ float get_outer_y(Hits const & hh) const { return __ldg(hh.yg_d+theOuterHitId); } + __device__ __forceinline__ float get_inner_z(Hits const & hh) const { return theInnerZ; } // { return __ldg(hh.zg_d+theInnerHitId); } // { return theInnerZ; } + __device__ __forceinline__ float get_outer_z(Hits const & hh) const { return __ldg(hh.zg_d+theOuterHitId); } + __device__ __forceinline__ float get_inner_r(Hits const & hh) const { return theInnerR; } // { return __ldg(hh.rg_d+theInnerHitId); } // { return theInnerR; } + 
__device__ __forceinline__ float get_outer_r(Hits const & hh) const { return __ldg(hh.rg_d+theOuterHitId); } + constexpr unsigned int get_inner_hit_id() const { return theInnerHitId; } @@ -56,37 +56,42 @@ class GPUCACell { return theOuterHitId; } - constexpr void print_cell() const { + + __device__ + void print_cell() const { printf("printing cell: %d, on layerPair: %d, innerHitId: %d, outerHitId: " "%d, innerradius %f, outerRadius %f \n", - theDoubletId, theLayerPairId, theInnerHitId, theOuterHitId, - theInnerR, theOuterR); + theDoubletId, theLayerPairId, theInnerHitId, theOuterHitId + ); } - __host__ __device__ - bool check_alignment_and_tag( - const GPUCACell *cells, unsigned int innerCellId, const float ptmin, + + __device__ + bool check_alignment(Hits const & hh, + GPUCACell const & otherCell, const float ptmin, const float region_origin_x, const float region_origin_y, const float region_origin_radius, const float thetaCut, - const float phiCut, const float hardPtCut) + const float phiCut, const float hardPtCut) const { - auto ro = get_outer_r(); - auto zo = get_outer_z(); - const auto &otherCell = cells[innerCellId]; + auto ri = get_inner_r(hh); + auto zi = get_inner_z(hh); + + auto ro = get_outer_r(hh); + auto zo = get_outer_z(hh); - auto r1 = otherCell.get_inner_r(); - auto z1 = otherCell.get_inner_z(); - bool aligned = areAlignedRZ(r1, z1, ro, zo, ptmin, thetaCut); + auto r1 = otherCell.get_inner_r(hh); + auto z1 = otherCell.get_inner_z(hh); + bool aligned = areAlignedRZ(r1, z1, ri, zi, ro, zo, ptmin, thetaCut); return (aligned && - haveSimilarCurvature(cells, innerCellId, ptmin, region_origin_x, + haveSimilarCurvature(hh, otherCell, ptmin, region_origin_x, region_origin_y, region_origin_radius, phiCut, hardPtCut)); } - - constexpr bool areAlignedRZ(float r1, float z1, float ro, float zo, + __device__ __forceinline__ + static bool areAlignedRZ(float r1, float z1, float ri, float zi, float ro, float zo, const float ptmin, - const float thetaCut) const { + const float thetaCut) { float radius_diff = std::abs(r1 - ro); float distance_13_squared = radius_diff * radius_diff + (z1 - zo) * (z1 - zo); @@ -96,27 +101,26 @@ class GPUCACell { // radius_diff later float tan_12_13_half_mul_distance_13_squared = - fabs(z1 * (get_inner_r() - ro) + get_inner_z() * (ro - r1) + zo * (r1 - get_inner_r())); + fabs(z1 * (ri - ro) + zi * (ro - r1) + zo * (r1 - ri)); return tan_12_13_half_mul_distance_13_squared * pMin <= thetaCut * distance_13_squared * radius_diff; } - constexpr bool - haveSimilarCurvature(const GPUCACell *cells, unsigned int innerCellId, + __device__ + bool + haveSimilarCurvature(Hits const & hh, GPUCACell const & otherCell, const float ptmin, const float region_origin_x, const float region_origin_y, const float region_origin_radius, const float phiCut, const float hardPtCut) const { - const auto &otherCell = cells[innerCellId]; - - auto x1 = otherCell.get_inner_x(); - auto y1 = otherCell.get_inner_y(); + auto x1 = otherCell.get_inner_x(hh); + auto y1 = otherCell.get_inner_y(hh); - auto x2 = get_inner_x(); - auto y2 = get_inner_y(); + auto x2 = get_inner_x(hh); + auto y2 = get_inner_y(hh); - auto x3 = get_outer_x(); - auto y3 = get_outer_y(); + auto x3 = get_outer_x(hh); + auto y3 = get_outer_y(hh); float distance_13_squared = (x1 - x3) * (x1 - x3) + (y1 - y3) * (y1 - y3); float tan_12_13_half_mul_distance_13_squared = @@ -139,7 +143,7 @@ class GPUCACell { return distance_13_beamspot_squared < (region_origin_radius + phiCut) * (region_origin_radius + phiCut); - } + } // 87 cm/GeV = 
1/(3.8T * 0.3) @@ -186,13 +190,13 @@ class GPUCACell { // trying to free the track building process from hardcoded layers, leaving // the visit of the graph based on the neighborhood connections between cells. -#ifdef __CUDACC__ +// #ifdef __CUDACC__ __device__ inline void find_ntuplets( - const GPUCACell *cells, + GPUCACell const * __restrict__ cells, GPU::SimpleVector *foundNtuplets, - GPU::VecArray &tmpNtuplet, + GPU::VecArray &tmpNtuplet, const unsigned int minHitsPerNtuplet) const { // the building process for a track ends if: @@ -231,16 +235,10 @@ class GPUCACell { int theLayerPairId; private: - unsigned int theInnerHitId; - unsigned int theOuterHitId; - float theInnerX; - float theOuterX; - float theInnerY; - float theOuterY; float theInnerZ; - float theOuterZ; float theInnerR; - float theOuterR; + hindex_type theInnerHitId; + hindex_type theOuterHitId; }; #endif // RecoPixelVertexing_PixelTriplets_plugins_GPUCACell_h diff --git a/RecoPixelVertexing/PixelTriplets/plugins/gpuPixelDoublets.h b/RecoPixelVertexing/PixelTriplets/plugins/gpuPixelDoublets.h index 31844f39f9727..61d048637585c 100644 --- a/RecoPixelVertexing/PixelTriplets/plugins/gpuPixelDoublets.h +++ b/RecoPixelVertexing/PixelTriplets/plugins/gpuPixelDoublets.h @@ -20,12 +20,13 @@ namespace gpuPixelDoublets { template __device__ + __forceinline__ void doubletsFromHisto(uint8_t const * __restrict__ layerPairs, uint32_t nPairs, GPUCACell * cells, uint32_t * nCells, int16_t const * __restrict__ iphi, - Hist const * __restrict__ hist, + Hist const & __restrict__ hist, uint32_t const * __restrict__ offsets, siPixelRecHitsHeterogeneousProduct::HitsOnGPU const & __restrict__ hh, GPU::VecArray< unsigned int, 256> * isOuterHitOfCell, @@ -63,6 +64,8 @@ namespace gpuPixelDoublets { uint8_t outer = layerPairs[2*pairLayerId+1]; assert(outer > inner); + auto hoff = Hist::histOff(outer); + auto i = (0 == pairLayerId) ? 
j : j-innerLayerCumulativeSize[pairLayerId-1]; i += offsets[inner]; @@ -73,8 +76,8 @@ namespace gpuPixelDoublets { // found hit corresponding to our cuda thread, now do the job auto mep = iphi[i]; - auto mez = hh.zg_d[i]; - auto mer = hh.rg_d[i]; + auto mez = __ldg(hh.zg_d+i); + auto mer = __ldg(hh.rg_d+i); constexpr float z0cut = 12.f; // cm constexpr float hardPtCut = 0.5f; // GeV @@ -83,13 +86,13 @@ namespace gpuPixelDoublets { auto ptcut = [&](int j) { auto r2t4 = minRadius2T4; auto ri = mer; - auto ro = hh.rg_d[j]; + auto ro = __ldg(hh.rg_d+j); auto dphi = short2phi( min( abs(int16_t(mep-iphi[j])), abs(int16_t(iphi[j]-mep)) ) ); return dphi*dphi * (r2t4 - ri*ro) > (ro-ri)*(ro-ri); }; auto z0cutoff = [&](int j) { - auto zo = hh.zg_d[j]; - auto ro = hh.rg_d[j]; + auto zo = __ldg(hh.zg_d+j); + auto ro = __ldg(hh.rg_d+j); auto dr = ro-mer; return dr > maxr[pairLayerId] || dr<0 || std::abs((mez*ro - mer*zo)) > z0cut*dr; @@ -97,8 +100,8 @@ namespace gpuPixelDoublets { auto iphicut = phicuts[pairLayerId]; - auto kl = hist[outer].bin(int16_t(mep-iphicut)); - auto kh = hist[outer].bin(int16_t(mep+iphicut)); + auto kl = Hist::bin(int16_t(mep-iphicut)); + auto kh = Hist::bin(int16_t(mep+iphicut)); auto incr = [](auto & k) { return k = (k+1) % Hist::nbins();}; int tot = 0; int nmin = 0; @@ -108,9 +111,11 @@ namespace gpuPixelDoublets { int tooMany=0; for (auto kk = kl; kk != khh; incr(kk)) { if (kk != kl && kk != kh) - nmin += hist[outer].size(kk); - for (auto p = hist[outer].begin(kk); p < hist[outer].end(kk); ++p) { - auto oi=*p; + nmin += hist.size(kk+hoff); + auto const * __restrict__ p = hist.begin(kk+hoff); + auto const * __restrict__ e = hist.end(kk+hoff); + for (;p < e; ++p) { + auto oi=__ldg(p); assert(oi>=offsets[outer]); assert(oi 0) printf("OuterHitOfCell full for %d in layer %d/%d, %d:%d %d,%d\n", i, inner, outer, kl, kh, nmin, tot); - if (hist[outer].nspills > 0) - printf("spill bin to be checked in %d %d\n", outer, hist[outer].nspills); - - // if (0==hist[outer].nspills) assert(tot>=nmin); - // look in spill bin as well.... - } // loop in block... 
} constexpr auto getDoubletsFromHistoMaxBlockSize = 64; + constexpr auto getDoubletsFromHistoMinBlocksPerMP = 16; __global__ - __launch_bounds__(getDoubletsFromHistoMaxBlockSize) + __launch_bounds__(getDoubletsFromHistoMaxBlockSize,getDoubletsFromHistoMinBlocksPerMP) void getDoubletsFromHisto(GPUCACell * cells, uint32_t * nCells, siPixelRecHitsHeterogeneousProduct::HitsOnGPU const * __restrict__ hhp, @@ -184,7 +184,7 @@ namespace gpuPixelDoublets { auto const & __restrict__ hh = *hhp; doubletsFromHisto(layerPairs, nPairs, cells, nCells, - hh.iphi_d, hh.hist_d, hh.hitsLayerStart_d, + hh.iphi_d, *hh.hist_d, hh.hitsLayerStart_d, hh, isOuterHitOfCell, phicuts, minz, maxz, maxr); } diff --git a/RecoPixelVertexing/PixelVertexFinding/BuildFile.xml b/RecoPixelVertexing/PixelVertexFinding/BuildFile.xml index 5a3208da2c87f..43261b0417410 100644 --- a/RecoPixelVertexing/PixelVertexFinding/BuildFile.xml +++ b/RecoPixelVertexing/PixelVertexFinding/BuildFile.xml @@ -20,4 +20,4 @@ - + diff --git a/RecoPixelVertexing/PixelVertexFinding/interface/pixelVertexHeterogeneousProduct.h b/RecoPixelVertexing/PixelVertexFinding/interface/pixelVertexHeterogeneousProduct.h index fcb19f855a9ba..ff3624cdafd65 100644 --- a/RecoPixelVertexing/PixelVertexFinding/interface/pixelVertexHeterogeneousProduct.h +++ b/RecoPixelVertexing/PixelVertexFinding/interface/pixelVertexHeterogeneousProduct.h @@ -16,6 +16,7 @@ namespace pixelVertexHeterogeneousProduct { float * z_d; float * zerr_d; float * chi2_d; + uint16_t * sortInd; int32_t * ivtx_d; // this should be indexed with the original tracks, not the reduced set (oops) }; @@ -31,6 +32,7 @@ namespace pixelVertexHeterogeneousProduct { { } std::vector> z,zerr, chi2; + std::vector> sortInd; std::vector> ivtx; uint32_t nVertices=0; diff --git a/RecoPixelVertexing/PixelVertexFinding/src/PixelVertexHeterogeneousProducer.cc b/RecoPixelVertexing/PixelVertexFinding/src/PixelVertexHeterogeneousProducer.cc index 3451655a282d1..0b8c31235abea 100644 --- a/RecoPixelVertexing/PixelVertexFinding/src/PixelVertexHeterogeneousProducer.cc +++ b/RecoPixelVertexing/PixelVertexFinding/src/PixelVertexHeterogeneousProducer.cc @@ -117,18 +117,20 @@ void PixelVertexHeterogeneousProducer::acquireGPUCuda( // Second, make a collection of pointers to the tracks we want for the vertex finder // fill z,ez - std::vector z,ez; + std::vector z,ez2,pt2; assert(m_trks.empty()); for (unsigned int i=0; ipt() > 20 ? 20*20 : m_trks[k]->pt()*m_trks[k]->pt(); - } - // sort - std::sort(ind,ind+gpuProduct.nVertices,[&](int i, int j){ return pt2[i]>pt2[j];}); - if(gpuProduct.nVertices>1) assert(pt2[ind[0]]>=pt2[ind[1]]); // fill legacy data format - for (unsigned int j=0; j uind; // fort verifing index consistency + for (int j=int(gpuProduct.nVertices)-1; j>=0; --j) { + auto i = gpuProduct.sortInd[j]; // on gpu sorted in ascending order.... 
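// sortInd comes back ascending in summed pt^2, so walking j downwards emits the vertices
// in decreasing pt^2 (the legacy ordering); uind collects the consumed GPU indices so the
// asserts after the loop can check that they cover 0..nVertices-1 with no repeats.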
+ assert(i>=0); + assert(i0); (*vertexes).emplace_back(reco::Vertex::Point(x,y,z), err, gpuProduct.chi2[i], nt-1, nt ); auto & v = (*vertexes).back(); - pt2[i]=0; for (auto k: itrk) { v.add(reco::TrackBaseRef(m_trks[k])); } itrk.clear(); } + assert(uind.size()==(*vertexes).size()); + if (!uind.empty()) { + assert(0 == *uind.begin()); + assert(uind.size()-1 == *uind.rbegin()); + } - if (verbose_) { edm::LogInfo("PixelVertexHeterogeneousProducer") << ": Found " << vertexes->size() << " vertexes\n"; for (unsigned int i=0; isize(); ++i) { diff --git a/RecoPixelVertexing/PixelVertexFinding/src/gpuClusterTracks.h b/RecoPixelVertexing/PixelVertexFinding/src/gpuClusterTracks.h index 7c97665ece061..ae17401d06625 100644 --- a/RecoPixelVertexing/PixelVertexFinding/src/gpuClusterTracks.h +++ b/RecoPixelVertexing/PixelVertexFinding/src/gpuClusterTracks.h @@ -7,10 +7,51 @@ #include #include "HeterogeneousCore/CUDAUtilities/interface/HistoContainer.h" +#include "HeterogeneousCore/CUDAUtilities/interface/radixSort.h" + #include "gpuVertexFinder.h" namespace gpuVertexFinder { + + __global__ + void sortByPt2(int nt, + OnGPU * pdata + ) { + auto & __restrict__ data = *pdata; + float const * __restrict__ ptt2 = data.ptt2; + uint32_t const & nv = *data.nv; + + int32_t const * __restrict__ iv = data.iv; + float * __restrict__ ptv2 = data.ptv2; + uint16_t * __restrict__ sortInd = data.sortInd; + + if (nv<1) return; + + // can be done asynchronoisly at the end of previous event + for (int i = threadIdx.x; i < nv; i += blockDim.x) { + ptv2[i]=0; + } + __syncthreads(); + + + for (int i = threadIdx.x; i < nt; i += blockDim.x) { + if (iv[i]>9990) continue; + atomicAdd(&ptv2[iv[i]], ptt2[i]); + } + __syncthreads(); + + if (1==nv) { + if (threadIdx.x==0) sortInd[0]=0; + return; + } + __shared__ uint16_t ws[1024]; + radixSort(ptv2,sortInd,ws,nv); + + assert(ptv2[sortInd[nv-1]]>=ptv2[sortInd[nv-2]]); + assert(ptv2[sortInd[1]]>=ptv2[sortInd[0]]); + } + // this algo does not really scale as it works in a single block... // enough for <10K tracks we have @@ -24,6 +65,9 @@ namespace gpuVertexFinder { ) { constexpr bool verbose = false; // in principle the compiler should optmize out if false + + + if(verbose && 0==threadIdx.x) printf("params %d %f\n",minT,eps); auto er2mx = errmax*errmax; @@ -35,42 +79,49 @@ namespace gpuVertexFinder { float * __restrict__ chi2 = data.chi2; uint32_t & nv = *data.nv; - int8_t * __restrict__ izt = data.izt; + uint8_t * __restrict__ izt = data.izt; int32_t * __restrict__ nn = data.nn; int32_t * __restrict__ iv = data.iv; assert(pdata); assert(zt); - __shared__ HistoContainer hist; - - // if(0==threadIdx.x) printf("params %d %f\n",minT,eps); - // if(0==threadIdx.x) printf("booked hist with %d bins, size %d for %d tracks\n",hist.nbins(),hist.binSize(),nt); - - // zero hist - hist.nspills = 0; - for (auto k = threadIdx.x; k; + constexpr auto wss = Hist::totbins(); + __shared__ Hist hist; + __shared__ typename Hist::Counter ws[wss]; + for (auto j=threadIdx.x; j= 0); + assert(iz-INT8_MIN < 256); + hist.count(izt[i]); iv[i]=i; nn[i]=0; } __syncthreads(); - - // if(0==threadIdx.x) printf("histo filled %d\n",hist.nspills); - if(0==threadIdx.x && hist.fullSpill()) printf("histo overflow\n"); - + hist.finalize(ws); + __syncthreads(); + assert(hist.size()==nt); + if (threadIdx.x<32) ws[threadIdx.x]=0; // used by prefix scan... 
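// A second pass over the tracks follows: count()/finalize() above turned the per-bin
// counts into start offsets, and fill() now scatters each track index into its bin,
// using ws as shared scratch counters while doing so.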
+ __syncthreads(); + for (int i = threadIdx.x; i < nt; i += blockDim.x) { + hist.fill(izt[i],uint16_t(i),ws); + } + __syncthreads(); + + // count neighbours for (int i = threadIdx.x; i < nt; i += blockDim.x) { if (ezt2[i]>er2mx) continue; @@ -87,20 +138,20 @@ namespace gpuVertexFinder { __syncthreads(); - // if(0==threadIdx.x) printf("nn counted\n"); - // cluster seeds only bool more = true; while (__syncthreads_or(more)) { more=false; - for (int i = threadIdx.x; i < nt; i += blockDim.x) { + for (int k = threadIdx.x; k < hist.size(); k += blockDim.x) { + auto p = hist.begin()+k; + auto i = (*p); + auto be = std::min(Hist::bin(izt[i])+1,int(hist.nbins()-1)); if (nn[i]eps) return; + auto dist = std::abs(zt[i]-zt[j]); + if (dist>eps) return; if (dist*dist>chi2max*(ezt2[i]+ezt2[j])) return; auto old = atomicMin(&iv[j], iv[i]); if (old != iv[i]) { @@ -109,8 +160,8 @@ namespace gpuVertexFinder { } atomicMin(&iv[i], old); }; - - forEachInBins(hist,izt[i],1,loop); + ++p; + for (;p>>(ntrks,onGPU_d,minT,eps,errmax,chi2max); - + clusterTracks<<<1,1024-256,0,stream>>>(ntrks,onGPU_d,minT,eps,errmax,chi2max); + cudaCheck(cudaGetLastError()); + sortByPt2<<<1,256,0,stream>>>(ntrks,onGPU_d); + cudaCheck(cudaGetLastError()); + cudaCheck(cudaMemcpyAsync(&gpuProduct.nVertices, onGPU.nv, sizeof(uint32_t), cudaMemcpyDeviceToHost, stream)); gpuProduct.ivtx.resize(ntrks); cudaCheck(cudaMemcpyAsync(gpuProduct.ivtx.data(),onGPU.iv,sizeof(int32_t)*ntrks, cudaMemcpyDeviceToHost, stream)); - - } Producer::GPUProduct const & Producer::fillResults(cudaStream_t stream) { @@ -78,13 +86,14 @@ namespace gpuVertexFinder { gpuProduct.chi2.resize(gpuProduct.nVertices); cudaCheck(cudaMemcpyAsync(gpuProduct.chi2.data(),onGPU.chi2,sizeof(float)*gpuProduct.nVertices, cudaMemcpyDeviceToHost, stream)); - + + gpuProduct.sortInd.resize(gpuProduct.nVertices); + cudaCheck(cudaMemcpyAsync(gpuProduct.sortInd.data(),onGPU.sortInd,sizeof(uint16_t)*gpuProduct.nVertices, + cudaMemcpyDeviceToHost, stream)); + cudaStreamSynchronize(stream); return gpuProduct; } - - - + } // end namespace - diff --git a/RecoPixelVertexing/PixelVertexFinding/src/gpuVertexFinder.h b/RecoPixelVertexing/PixelVertexFinding/src/gpuVertexFinder.h index 1f90641e3c260..ded6759a940bd 100644 --- a/RecoPixelVertexing/PixelVertexFinding/src/gpuVertexFinder.h +++ b/RecoPixelVertexing/PixelVertexFinding/src/gpuVertexFinder.h @@ -13,17 +13,21 @@ namespace gpuVertexFinder { static constexpr uint32_t MAXTRACKS = 16000; static constexpr uint32_t MAXVTX= 1024; - float * zt; // input track z at bs + float * zt; // input track z at bs float * ezt2; // input error^2 on the above + float * ptt2; // input pt^2 on the above + float * zv; // output z-posistion of found vertices float * wv; // output weight (1/error^2) on the above float * chi2; // vertices chi2 + float * ptv2; // vertices pt^2 uint32_t * nv; // the number of vertices int32_t * iv; // vertex index for each associated track + uint16_t * sortInd; // sorted index (by pt2) // workspace - int8_t * izt; // interized z-position of input tracks (reused as interize pt2 of vertices for sorting) + uint8_t * izt; // interized z-position of input tracks int32_t * nn; // number of nearest neighbours (reused as number of dof for output vertices) }; @@ -53,6 +57,7 @@ namespace gpuVertexFinder { void produce(cudaStream_t stream, float const * zt, float const * ezt2, + float const * ptt2, uint32_t ntrks ); diff --git a/RecoPixelVertexing/PixelVertexFinding/test/gpuVertexFinder_t.cu 
b/RecoPixelVertexing/PixelVertexFinding/test/gpuVertexFinder_t.cu index f47c4362503ae..a92c116702231 100644 --- a/RecoPixelVertexing/PixelVertexFinding/test/gpuVertexFinder_t.cu +++ b/RecoPixelVertexing/PixelVertexFinding/test/gpuVertexFinder_t.cu @@ -13,13 +13,14 @@ struct Event { std::vector itrack; std::vector ztrack; std::vector eztrack; + std::vector pttrack; std::vector ivert; }; struct ClusterGenerator { explicit ClusterGenerator(float nvert, float ntrack) : - rgen(-13.,13), errgen(0.005,0.025), clusGen(nvert), trackGen(ntrack), gauss(0.,1.) + rgen(-13.,13), errgen(0.005,0.025), clusGen(nvert), trackGen(ntrack), gauss(0.,1.), ptGen(1.) {} void operator()(Event & ev) { @@ -42,6 +43,8 @@ struct ClusterGenerator { ev.ztrack.push_back(ev.zvert[iv]+err*gauss(reng)); ev.eztrack.push_back(err*err); ev.ivert.push_back(iv); + ev.pttrack.push_back( (iv==5? 1.f:0.5f) + ptGen(reng) ); + ev.pttrack.back()*=ev.pttrack.back(); } } // add noise @@ -51,6 +54,8 @@ struct ClusterGenerator { ev.ztrack.push_back(rgen(reng)); ev.eztrack.push_back(err*err); ev.ivert.push_back(9999); + ev.pttrack.push_back( 0.5f + ptGen(reng) ); + ev.pttrack.back()*=ev.pttrack.back(); } } @@ -61,7 +66,7 @@ struct ClusterGenerator { std::poisson_distribution clusGen; std::poisson_distribution trackGen; std::normal_distribution gauss; - + std::exponential_distribution ptGen; }; @@ -79,11 +84,14 @@ int main() { auto zt_d = cuda::memory::device::make_unique(current_device, 64000); auto ezt2_d = cuda::memory::device::make_unique(current_device, 64000); + auto ptt2_d = cuda::memory::device::make_unique(current_device, 64000); auto zv_d = cuda::memory::device::make_unique(current_device, 256); auto wv_d = cuda::memory::device::make_unique(current_device, 256); auto chi2_d = cuda::memory::device::make_unique(current_device, 256); + auto ptv2_d = cuda::memory::device::make_unique(current_device, 256); + auto ind_d = cuda::memory::device::make_unique(current_device, 256); - auto izt_d = cuda::memory::device::make_unique(current_device, 64000); + auto izt_d = cuda::memory::device::make_unique(current_device, 64000); auto nn_d = cuda::memory::device::make_unique(current_device, 64000); auto iv_d = cuda::memory::device::make_unique(current_device, 64000); @@ -95,9 +103,12 @@ int main() { onGPU.zt = zt_d.get(); onGPU.ezt2 = ezt2_d.get(); + onGPU.ptt2 = ptt2_d.get(); onGPU.zv = zv_d.get(); onGPU.wv = wv_d.get(); onGPU.chi2 = chi2_d.get(); + onGPU.ptv2 = ptv2_d.get(); + onGPU.sortInd = ind_d.get(); onGPU.nv = nv_d.get(); onGPU.izt = izt_d.get(); onGPU.nn = nn_d.get(); @@ -123,6 +134,7 @@ int main() { cuda::memory::copy(onGPU.zt,ev.ztrack.data(),sizeof(float)*ev.ztrack.size()); cuda::memory::copy(onGPU.ezt2,ev.eztrack.data(),sizeof(float)*ev.eztrack.size()); + cuda::memory::copy(onGPU.ptt2,ev.pttrack.data(),sizeof(float)*ev.eztrack.size()); float eps = 0.1f; @@ -130,44 +142,58 @@ int main() { if ( (i%4) == 0 ) cuda::launch(clusterTracks, - { 1, 1024 }, + { 1, 512+256 }, ev.ztrack.size(), onGPU_d.get(),kk,eps, 0.02f,12.0f ); if ( (i%4) == 1 ) cuda::launch(clusterTracks, - { 1, 1024 }, + { 1, 512+256 }, ev.ztrack.size(), onGPU_d.get(),kk,eps, 0.02f,9.0f ); if ( (i%4) == 2 ) cuda::launch(clusterTracks, - { 1, 1024 }, + { 1, 512+256 }, ev.ztrack.size(), onGPU_d.get(),kk,eps, 0.01f,9.0f ); if ( (i%4) == 3 ) cuda::launch(clusterTracks, - { 1, 1024 }, + { 1, 512+256 }, ev.ztrack.size(), onGPU_d.get(),kk,0.7f*eps, 0.01f,9.0f ); - + cudaDeviceSynchronize(); + cuda::launch(sortByPt2, + { 1, 256 }, + ev.ztrack.size(), onGPU_d.get() + ); uint32_t nv; 
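// The vertex count is copied back first: every host-side buffer below (zv, wv, chi2,
// ptv2, nn, ind) is sized with nv, and the new nv==0 early exit guards an empty event.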
cuda::memory::copy(&nv, onGPU.nv, sizeof(uint32_t)); + + if (nv==0) { + std::cout << "NO VERTICES???" << std::endl; + continue; + } + float zv[nv]; float wv[nv]; float chi2[nv]; + float ptv2[nv]; int32_t nn[nv]; + uint16_t ind[nv]; cuda::memory::copy(&zv, onGPU.zv, nv*sizeof(float)); cuda::memory::copy(&wv, onGPU.wv, nv*sizeof(float)); cuda::memory::copy(&chi2, onGPU.chi2, nv*sizeof(float)); + cuda::memory::copy(&ptv2, onGPU.ptv2, nv*sizeof(float)); cuda::memory::copy(&nn, onGPU.nn, nv*sizeof(int32_t)); + cuda::memory::copy(&ind, onGPU.sortInd, nv*sizeof(uint16_t)); for (auto j=0U; j0) chi2[j]/=float(nn[j]); { @@ -178,7 +204,12 @@ int main() { auto mx = std::minmax_element(chi2,chi2+nv); std::cout << "min max chi2 " << *mx.first << ' ' << *mx.second << std::endl; } - + { + auto mx = std::minmax_element(ptv2,ptv2+nv); + std::cout << "min max ptv2 " << *mx.first << ' ' << *mx.second << std::endl; + std::cout << "min max ptv2 " << ptv2[ind[0]] << ' ' << ptv2[ind[nv-1]] << " at " << ind[0] << ' ' << ind[nv-1] << std::endl; + + } float dd[nv]; uint32_t ii=0; diff --git a/SimTracker/TrackerHitAssociation/plugins/BuildFile.xml b/SimTracker/TrackerHitAssociation/plugins/BuildFile.xml index c767b1e68936a..a63c38264c20d 100644 --- a/SimTracker/TrackerHitAssociation/plugins/BuildFile.xml +++ b/SimTracker/TrackerHitAssociation/plugins/BuildFile.xml @@ -1,11 +1,11 @@ + + + - - - - +
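Both the producer and the test above rely on the same contract for the new sortInd product: it lists vertex indices in ascending summed pt^2, with every index appearing exactly once. A minimal host-side cross-check of that contract, e.g. for use in gpuVertexFinder_t.cu, could look like the sketch below (checkSortInd is an illustrative helper, not part of the patch):

#include <algorithm>
#include <cstddef>
#include <cstdint>
#include <numeric>
#include <vector>

// Returns true if 'sortInd' enumerates every vertex index exactly once and orders
// the per-vertex summed pt^2 in ascending order (illustrative cross-check only).
inline bool checkSortInd(std::vector<float> const& ptv2,
                         std::vector<uint16_t> const& sortInd) {
  if (ptv2.size() != sortInd.size()) return false;
  // reference ordering computed on the host
  std::vector<uint16_t> ref(ptv2.size());
  std::iota(ref.begin(), ref.end(), uint16_t(0));
  std::sort(ref.begin(), ref.end(),
            [&](uint16_t i, uint16_t j) { return ptv2[i] < ptv2[j]; });
  // the GPU radix sort may break ties differently, so compare the keys, not the indices
  for (std::size_t k = 0; k < ref.size(); ++k)
    if (ptv2[ref[k]] != ptv2[sortInd[k]]) return false;
  // every vertex index must appear exactly once
  std::vector<uint16_t> seen(sortInd);
  std::sort(seen.begin(), seen.end());
  for (std::size_t k = 0; k < seen.size(); ++k)
    if (seen[k] != k) return false;
  return true;
}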