Introduce Cluster Charge Cut, optimize Histogram (bucket sorting) #171

Merged: 37 commits, Sep 26, 2018. Changes shown from 31 of the 37 commits.

Commits
eb29a8b
use ext ws or shared
VinInn Sep 14, 2018
fe2af2e
add minBLperML and a debug printout
VinInn Sep 14, 2018
8b47a72
a small prefix scan in single block
VinInn Sep 15, 2018
90344c7
add charge cut
VinInn Sep 15, 2018
b1e7746
apply charge cut
VinInn Sep 15, 2018
dca2f1e
ccc works
VinInn Sep 16, 2018
6a06ece
stable, but something fishy in test
VinInn Sep 16, 2018
50460d9
of course the test had a bug
VinInn Sep 16, 2018
9103a9c
works, sort only with blksize 256
VinInn Sep 17, 2018
f05df88
make the number of bins configurable
VinInn Sep 18, 2018
304c607
fix race
VinInn Sep 18, 2018
b1e2810
compactify the histogram (to make iteration faster?)
VinInn Sep 19, 2018
f9dad13
not faster, at least avoid spill
VinInn Sep 20, 2018
81a15e6
will need some cleanup
VinInn Sep 20, 2018
8eb1210
clean hist header
VinInn Sep 20, 2018
960de3d
use one bin per column
VinInn Sep 20, 2018
25b24c5
from done to more, and debug
VinInn Sep 20, 2018
7097934
vertex finding is faster now
VinInn Sep 20, 2018
a108571
use new clustering for real!
VinInn Sep 20, 2018
c711b5d
test cub as well
VinInn Sep 21, 2018
83879cd
works in c++
VinInn Sep 21, 2018
ba38606
works
VinInn Sep 21, 2018
b8df4f8
old histo is gone
VinInn Sep 21, 2018
1bb3bda
wip
VinInn Sep 21, 2018
78e4aa5
wip
VinInn Sep 21, 2018
0c19cd1
does not crash
VinInn Sep 22, 2018
1b55356
remove hard limit
VinInn Sep 22, 2018
70d2fcb
mark lost clusters
VinInn Sep 22, 2018
47a3f04
a bit of cleanup, no speedup
VinInn Sep 22, 2018
85453af
force load in constant cache, reduce size of Cell
VinInn Sep 24, 2018
d93e14f
be consistent with ht index type
VinInn Sep 24, 2018
bda658a
fill pt2 and index for nv==1 as well
VinInn Sep 25, 2018
75f2948
check cudaGetLastError() after launching each kernel
fwyzard Sep 25, 2018
3f69a40
check cudaGetLastError() after launching each kernel
fwyzard Sep 25, 2018
a960668
check cudaGetLastError() after launching each kernel
fwyzard Sep 25, 2018
742399a
Merge branch 'CMSSW_10_2_X_Patatrack' into TuningFor1025
fwyzard Sep 25, 2018
0e2812c
Declare the direct dependencies
fwyzard Sep 25, 2018
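
The core of the PR is a rewrite of HistoContainer from fixed-size bins plus spill storage to a bucket-sort (counting-sort) layout: a first pass counts how many entries land in each bin, a prefix scan over the counters turns the counts into per-bin offsets, and a second pass scatters the element indices into one contiguous array. A minimal host-side sketch of that pattern (illustrative names, not the actual CMSSW code):

#include <cstdint>
#include <vector>

// count / scan / fill: the three phases the new HistoContainer runs on the GPU
void bucketSort(std::vector<uint32_t> const& binOf,  // bin index of each element
                uint32_t nbins,
                std::vector<uint32_t>& bins) {       // out: element indices, ordered by bin
  std::vector<uint32_t> off(nbins + 1, 0);
  uint32_t n = uint32_t(binOf.size());
  for (auto b : binOf) ++off[b + 1];                            // 1) count
  for (uint32_t i = 1; i <= nbins; ++i) off[i] += off[i - 1];   // 2) prefix scan -> offsets
  std::vector<uint32_t> ws(nbins, 0);                           //    per-bin cursors ("ws")
  bins.resize(n);
  for (uint32_t j = 0; j < n; ++j) {                            // 3) fill
    auto b = binOf[j];
    bins[off[b] + ws[b]++] = j;
  }
}
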
1 change: 1 addition & 0 deletions HeterogeneousCore/CUDAUtilities/BuildFile.xml
@@ -1 +1,2 @@
<use name="cuda"/>
<use name="cub"/>
HeterogeneousCore/CUDAUtilities/interface/GPUSimpleVector.h
@@ -87,7 +87,7 @@ template <class T> struct SimpleVector {
inline constexpr void reset() { m_size = 0; }
inline constexpr int size() const { return m_size; }
inline constexpr int capacity() const { return m_capacity; }
inline constexpr T *data() const { return m_data; }
inline constexpr T const * data() const { return m_data; }
inline constexpr void resize(int size) { m_size = size; }
inline constexpr void set_data(T * data) { m_data = data; }

2 changes: 1 addition & 1 deletion HeterogeneousCore/CUDAUtilities/interface/GPUVecArray.h
@@ -85,7 +85,7 @@ template <class T, int maxSize> struct VecArray {
inline constexpr const T& operator[](int i) const { return m_data[i]; }
inline constexpr void reset() { m_size = 0; }
inline constexpr int capacity() const { return maxSize; }
inline constexpr T *data() const { return m_data; }
inline constexpr T const * data() const { return m_data; }
inline constexpr void resize(int size) { m_size = size; }
inline constexpr bool empty() const { return 0 == m_size; }
inline constexpr bool full() const { return maxSize == m_size; }
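
Both data() changes apply the same const-correctness idiom: the const accessor now returns a pointer-to-const, so code holding a const SimpleVector or VecArray can no longer mutate elements through data(). A sketch of the pattern (not the full class):

template <class T> struct Buffer {
  T const* data() const { return m_data; }  // read-only view through a const object
  T* m_data;
};
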
233 changes: 145 additions & 88 deletions HeterogeneousCore/CUDAUtilities/interface/HistoContainer.h
@@ -2,6 +2,7 @@
#define HeterogeneousCore_CUDAUtilities_HistoContainer_h

#include <cassert>
#include <cstddef>
#include <cstdint>
#include <algorithm>
#include <type_traits>
@@ -10,96 +11,100 @@
#endif // __CUDA_ARCH__

#include "HeterogeneousCore/CUDAUtilities/interface/cudastdAlgorithm.h"
#ifdef __CUDACC__
#include "HeterogeneousCore/CUDAUtilities/interface/prefixScan.h"
#include <cub/cub.cuh>
#endif
#include "HeterogeneousCore/CUDAUtilities/interface/cudaCheck.h"


#ifdef __CUDACC__
namespace cudautils {

template<typename Histo>
__global__
void zeroMany(Histo * h, uint32_t nh) {
auto i = blockIdx.x * blockDim.x + threadIdx.x;
auto ih = i / Histo::nbins();
auto k = i - ih * Histo::nbins();
if (ih < nh) {
h[ih].nspills = 0;
if (k < Histo::nbins())
h[ih].n[k] = 0;
}
}

template<typename Histo, typename T>
__global__
void fillFromVector(Histo * h, uint32_t nh, T const * v, uint32_t * offsets) {
void countFromVector(Histo * __restrict__ h, uint32_t nh, T const * __restrict__ v, uint32_t const * __restrict__ offsets) {
auto i = blockIdx.x * blockDim.x + threadIdx.x;
if(i >= offsets[nh]) return;
auto off = cuda_std::upper_bound(offsets, offsets + nh + 1, i);
assert((*off) > 0);
int32_t ih = off - offsets - 1;
assert(ih >= 0);
assert(ih < nh);
h[ih].fill(v[i], i);
assert(ih < nh);
(*h).count(v[i], ih);
}

template<typename Histo, typename T>
__global__
void fillFromVector(Histo * h, T const * v, uint32_t size) {
void fillFromVector(Histo * __restrict__ h, uint32_t nh, T const * __restrict__ v, uint32_t const * __restrict__ offsets,
uint32_t * __restrict__ ws ) {
auto i = blockIdx.x * blockDim.x + threadIdx.x;
if(i < size) h->fill(v[i], i);
}

template<typename Histo>
void zero(Histo * h, uint32_t nh, int nthreads, cudaStream_t stream) {
auto nblocks = (nh * Histo::nbins() + nthreads - 1) / nthreads;
zeroMany<<<nblocks, nthreads, 0, stream>>>(h, nh);
cudaCheck(cudaGetLastError());
if(i >= offsets[nh]) return;
auto off = cuda_std::upper_bound(offsets, offsets + nh + 1, i);
assert((*off) > 0);
int32_t ih = off - offsets - 1;
assert(ih >= 0);
assert(ih < nh);
(*h).fill(v[i], i, ws, ih);
}

template<typename Histo, typename T>
void fillOneFromVector(Histo * h, T const * v, uint32_t size, int nthreads, cudaStream_t stream) {
zero(h, 1, nthreads, stream);
auto nblocks = (size + nthreads - 1) / nthreads;
fillFromVector<<<nblocks, nthreads, 0, stream>>>(h, v, size);
cudaCheck(cudaGetLastError());
}

template<typename Histo, typename T>
void fillManyFromVector(Histo * h, uint32_t nh, T const * v, uint32_t * offsets, uint32_t totSize, int nthreads, cudaStream_t stream) {
zero(h, nh, nthreads, stream);
void fillManyFromVector(Histo * __restrict__ h, typename Histo::Counter * __restrict__ ws,
uint32_t nh, T const * __restrict__ v, uint32_t const * __restrict__ offsets, uint32_t totSize,
int nthreads, cudaStream_t stream) {
uint32_t * off = (uint32_t *)( (char*)(h) +offsetof(Histo,off));
cudaMemsetAsync(off,0, 4*Histo::totbins(),stream);
auto nblocks = (totSize + nthreads - 1) / nthreads;
fillFromVector<<<nblocks, nthreads, 0, stream>>>(h, nh, v, offsets);
countFromVector<<<nblocks, nthreads, 0, stream>>>(h, nh, v, offsets);
size_t wss = Histo::totbins();
CubDebugExit(cub::DeviceScan::InclusiveSum(ws, wss, off, off, Histo::totbins(), stream));
cudaMemsetAsync(ws,0, 4*Histo::totbins(),stream);
fillFromVector<<<nblocks, nthreads, 0, stream>>>(h, nh, v, offsets,ws);
cudaCheck(cudaGetLastError());
}


} // namespace cudautils
#endif
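
// Usage sketch for the two-pass fill above (hypothetical host code): "ws" is a
// device workspace sized per Histo::wsSize(), reused both as CUB temporary
// storage for the InclusiveSum and as the per-bin fill cursors:
//
//   Histo *d_hist = ...;             // device-side histogram
//   Histo::Counter *d_ws = ...;      // device-side workspace
//   cudautils::fillManyFromVector(d_hist, d_ws, nh, d_values, d_offsets,
//                                 totSize, 256, stream);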


// iterate over N bins left and right of the one containing "v"
// including spillBin
template<typename Hist, typename V, typename Func>
__host__ __device__
__forceinline__
void forEachInBins(Hist const & hist, V value, int n, Func func) {
int bs = hist.bin(value);
int be = std::min(int(hist.nbins()),bs+n+1);
int bs = Hist::bin(value);
int be = std::min(int(Hist::nbins()-1),bs+n);
bs = std::max(0,bs-n);
assert(be>bs);
for (auto b=bs; b<be; ++b){
for (auto pj=hist.begin(b);pj<hist.end(b);++pj) {
assert(be>=bs);
for (auto pj=hist.begin(bs);pj<hist.end(be);++pj) {
func(*pj);
}
}

// iterate over bins containing all values in window wmin, wmax
template<typename Hist, typename V, typename Func>
__host__ __device__
__forceinline__
void forEachInWindow(Hist const & hist, V wmin, V wmax, Func const & func) {
auto bs = Hist::bin(wmin);
auto be = Hist::bin(wmax);
assert(be>=bs);
for (auto pj=hist.begin(bs);pj<hist.end(be);++pj) {
func(*pj);
}}
for (auto pj=hist.beginSpill();pj<hist.endSpill();++pj)
func(*pj);
}
}
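
// Usage sketch (hypothetical): visit every stored index whose value falls in
// the window [wmin, wmax], e.g. all hits within a z range:
//
//   forEachInWindow(hist, zmin, zmax, [&](auto i) { use(values[i]); });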



template<
typename T, // the type of the discretized input values
uint32_t N, // number of bins (in bits)
uint32_t M, // max number of element a bin can contain
uint32_t NBINS, // number of bins
uint32_t SIZE, // max number of element
uint32_t S=sizeof(T) * 8, // number of significant bits in T
typename I=uint32_t // type stored in the container (usually an index in a vector of the input values)
typename I=uint32_t, // type stored in the container (usually an index in a vector of the input values)
uint32_t NHISTS=1 // number of histos stored
>
class HistoContainer {
public:
@@ -111,24 +116,55 @@ class HistoContainer {

using index_type = I;
using UT = typename std::make_unsigned<T>::type;

static constexpr uint32_t ilog2(uint32_t v) {

constexpr uint32_t b[] = {0x2, 0xC, 0xF0, 0xFF00, 0xFFFF0000};
constexpr uint32_t s[] = {1, 2, 4, 8, 16};

uint32_t r = 0; // result of log2(v) will go here
for (auto i = 4; i >= 0; i--) if (v & b[i]) {
v >>= s[i];
r |= s[i];
}
return r;
}
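// e.g. ilog2(1)==0, ilog2(3)==1, ilog2(1024)==10; nbits() below uses it to get
// the number of address bits needed for NBINS bins as ilog2(NBINS-1)+1.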


static constexpr uint32_t sizeT() { return S; }
static constexpr uint32_t nbins() { return 1 << N; }
static constexpr uint32_t shift() { return sizeT() - N; }
static constexpr uint32_t mask() { return nbins() - 1; }
static constexpr uint32_t binSize() { return 1 << M; }
static constexpr uint32_t spillSize() { return 16 * binSize(); }
static constexpr uint32_t nbins() { return NBINS;}
static constexpr uint32_t nhists() { return NHISTS;}
static constexpr uint32_t totbins() { return NHISTS*NBINS+1;}
static constexpr uint32_t nbits() { return ilog2(NBINS-1)+1;}
static constexpr uint32_t capacity() { return SIZE; }

static constexpr auto histOff(uint32_t nh) { return NBINS*nh; }

#ifdef __CUDACC__
__host__
static size_t wsSize() {

Review comment:
Is this function actually called anywhere (besides a printout)? Or is temp_storage_size in practice always smaller than 4*totbins() (which, IIUC, gets used as the temporary storage for InclusiveSum)?

VinInn (author):
In principle it should be used; I verified that temp_storage_size is in practice always smaller than 4*totbins().

VinInn (author):
BTW, I was also waiting for us to define a strategy for a reusable ws...

uint32_t * v =nullptr;
void * d_temp_storage = nullptr;
size_t temp_storage_bytes = 0;
cub::DeviceScan::InclusiveSum(d_temp_storage, temp_storage_bytes, v, v, totbins()-1);
return std::max(temp_storage_bytes,size_t(totbins()));
}
#endif
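
// Note: calling a CUB device-wide algorithm with a null temp-storage pointer
// is the standard way to query (not run) it; wsSize() above uses that to size
// the shared workspace. The general two-phase pattern (sketch):
//
//   size_t bytes = 0;
//   cub::DeviceScan::InclusiveSum(nullptr, bytes, d_in, d_out, n);  // query size
//   cudaMalloc(&d_temp, bytes);
//   cub::DeviceScan::InclusiveSum(d_temp, bytes, d_in, d_out, n);   // run scan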


static constexpr UT bin(T t) {
return (t >> shift()) & mask();
constexpr uint32_t shift = sizeT() - nbits();
constexpr uint32_t mask = (1<<nbits()) - 1;
return (t >> shift) & mask;
}
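// e.g. for T=int16_t and NBINS=128: nbits()==7, shift==9, mask==0x7f, so the
// bin is simply the top 7 bits of the 16-bit value.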

void zero() {
nspills = 0;
for (auto & i : n)
for (auto & i : off)
i = 0;
}

static constexpr
static __host__ __device__
__forceinline__
uint32_t atomicIncrement(Counter & x) {
#ifdef __CUDA_ARCH__
return atomicAdd(&x, 1);
@@ -138,51 +174,72 @@ class HistoContainer {
}

__host__ __device__
void fill(T t, index_type j) {
UT b = bin(t);
__forceinline__
void count(T t) {
uint32_t b = bin(t);
assert(b<nbins());
auto w = atomicIncrement(n[b]);
if (w < binSize()) {
bins[b * binSize() + w] = j;
} else {
auto w = atomicIncrement(nspills);
if (w < spillSize())
spillBin[w] = j;
}
atomicIncrement(off[b+1]);
}

constexpr bool fullSpill() const {
return nspills >= spillSize();
__host__ __device__
__forceinline__
void fill(T t, index_type j, Counter * ws) {
uint32_t b = bin(t);
assert(b<nbins());
auto w = atomicIncrement(ws[b]);
assert(w < size(b));
bins[off[b] + w] = j;
}

constexpr bool full(uint32_t b) const {
return n[b] >= binSize();
}

constexpr auto const * begin(uint32_t b) const {
return bins + b * binSize();
}

constexpr auto const * end(uint32_t b) const {
return begin(b) + std::min(binSize(), uint32_t(n[b]));
__host__ __device__
__forceinline__
void count(T t, uint32_t nh) {
uint32_t b = bin(t);
assert(b<nbins());
b+=histOff(nh);
assert(b<totbins());
atomicIncrement(off[b+1]);
}

constexpr auto size(uint32_t b) const {
return n[b];
__host__ __device__
__forceinline__
void fill(T t, index_type j, Counter * ws, uint32_t nh) {
uint32_t b = bin(t);
assert(b<nbins());
b+=histOff(nh);
assert(b<totbins());
auto w = atomicIncrement(ws[b]);
assert(w < size(b));
bins[off[b] + w] = j;
}
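
// Protocol note: count() must run for all elements first, the counters are
// then turned into offsets by a prefix scan (finalize() below, or the
// device-wide scan in fillManyFromVector), and only then may fill() scatter
// the indices; ws[] holds the per-bin write cursors during the fill pass.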

constexpr auto const * beginSpill() const {
return spillBin;
#ifdef __CUDACC__
__device__
__forceinline__
void finalize(Counter * ws) {
blockPrefixScan(off+1,totbins()-1,ws);
}

constexpr auto const * endSpill() const {
return beginSpill() + std::min(spillSize(), uint32_t(nspills));
__host__
#endif
void finalize() {
for(uint32_t i=2; i<totbins(); ++i) off[i]+=off[i-1];
}
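
// After finalize(), off[] holds the inclusive prefix sum of the per-bin
// counts, so bin b owns bins[off[b], off[b+1]): e.g. counts {3,1,2} stored in
// off[1..3] become offsets {0,3,4,6}.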

Counter n[nbins()];
Counter nspills;
index_type bins[nbins()*binSize()];
index_type spillBin[spillSize()];
constexpr auto size() const { return uint32_t(off[totbins()-1]);}
constexpr auto size(uint32_t b) const { return off[b+1]-off[b];}


constexpr index_type const * begin() const { return bins;}
constexpr index_type const * end() const { return begin() + size();}


constexpr index_type const * begin(uint32_t b) const { return bins + off[b];}
constexpr index_type const * end(uint32_t b) const { return bins + off[b+1];}


Counter off[totbins()];
index_type bins[capacity()];
};

#endif // HeterogeneousCore_CUDAUtilities_HistoContainer_h
52 changes: 52 additions & 0 deletions HeterogeneousCore/CUDAUtilities/interface/prefixScan.h
@@ -0,0 +1,52 @@
#ifndef HeterogeneousCore_CUDAUtilities_prefixScan_h
#define HeterogeneousCore_CUDAUtilities_prefixScan_h

#include <cstdint>
#include <cassert>

template<typename T>
__device__
void
__forceinline__
warpPrefixScan(T * c, uint32_t i) {
auto x = c[i];
auto laneId = threadIdx.x & 0x1f;
#pragma unroll
for( int offset = 1 ; offset < 32 ; offset <<= 1 ) {
auto y = __shfl_up_sync(0xffffffff,x, offset);
if(laneId >= offset) x += y;
}
c[i] = x;
}

// limited to 32*32 elements....
template<typename T>
__device__
void
__forceinline__
blockPrefixScan(T * c, uint32_t size, T* ws) {

Review comment:
CUB seems to provide something similarly named. Is there a reason for not using it (e.g. it does a different thing, the interface is awkward, etc.)?

VinInn (author):
The interface is a bit awkward; maybe I need to look at it again (now that I managed to use the host API).

assert(size<=1024);
assert(0==blockDim.x%32);

auto first = threadIdx.x;

for (auto i=first; i<size; i+=blockDim.x) {
warpPrefixScan(c,i);
auto laneId = threadIdx.x & 0x1f;
auto warpId = i/32;
assert(warpId<32);
if (31==laneId) ws[warpId]=c[i];
}
__syncthreads();
if (size<=32) return;
if (threadIdx.x<32) warpPrefixScan(ws,threadIdx.x);
__syncthreads();
for (auto i=first+32; i<size; i+=blockDim.x) {
auto warpId = i/32;
c[i]+=ws[warpId-1];
}
__syncthreads();
}


#endif
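
For reference, a minimal way to drive the scan above from a kernel, as a sketch under the stated constraints (single block, size <= 1024, blockDim.x a multiple of 32, and size ideally a multiple of the warp size given the full-warp shuffles; ws holds one partial sum per warp):

#include <cstdint>
#include "HeterogeneousCore/CUDAUtilities/interface/prefixScan.h"

__global__ void scanKernel(uint32_t *c, uint32_t size) {
  __shared__ uint32_t ws[32];    // inter-warp partial sums
  blockPrefixScan(c, size, ws);  // in-place inclusive scan of c[0..size)
}

// launch: scanKernel<<<1, 256>>>(d_data, n);

cub::BlockScan, raised in the review thread above, offers similar functionality; per the author, its interface was found a bit awkward for this use.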