Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Use cooperative groups to populate Associations (Histograms) in Pixel Patatrack #35713

Open
wants to merge 33 commits into
base: master
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
33 commits
Select commit Hold shift + click to select a range
650c971
add cooperative groups
VinInn Oct 12, 2021
e654481
works with coop
VinInn Oct 12, 2021
0614694
coop works in assoc
VinInn Oct 13, 2021
df68916
coops implelented in histo filling
VinInn Oct 13, 2021
f71031a
format
VinInn Oct 13, 2021
ac394e7
use in rechits
VinInn Oct 13, 2021
a3ab3ff
factorize away algos
VinInn Oct 14, 2021
8a5d69b
decapsulate and format
VinInn Oct 14, 2021
6974029
encapsulate
VinInn Oct 14, 2021
0c1e5f4
format
VinInn Oct 14, 2021
151aea7
use coop for other 2 assoc
VinInn Oct 15, 2021
e9a9bda
Merged CUDACOOP from repository VinInn with cms-merge-topic
VinInn Oct 15, 2021
b8e2760
drive performance tests by envvar
VinInn Oct 16, 2021
446d652
add comment
VinInn Oct 16, 2021
5f6f596
factorize, encapsulate
VinInn Oct 17, 2021
71631b2
propagate factorization
VinInn Oct 17, 2021
cf5b8ba
add const
VinInn Nov 16, 2021
da13a56
add const
VinInn Nov 16, 2021
fc4faa7
change to on const
VinInn Nov 16, 2021
ac764da
Update HeterogeneousCore/CUDAUtilities/interface/maxCoopBlocks.h
VinInn Nov 16, 2021
6c38fdf
Merge branch 'master' into CUDACOOP
VinInn Dec 1, 2021
699d28d
Merge branch 'master' into CUDACOOP
VinInn Dec 9, 2021
72f6df7
remove double check
VinInn Dec 9, 2021
6637f96
be more specific with preprocessor flag name
VinInn Dec 9, 2021
e9328e3
be more specific with flag name
VinInn Dec 9, 2021
0160c6d
be more specific with compiler flag (even if it is a test)
VinInn Dec 9, 2021
8f387b0
remove the possibility to tune grid-size using envvar
VinInn Dec 9, 2021
078f314
format code
VinInn Dec 9, 2021
90dec57
By default DO NOT use Cooperative Groups in CA
VinInn Dec 10, 2021
2e58827
fix misspell
VinInn Dec 10, 2021
c09b4de
no cooperative groups in rechits as well
VinInn Dec 11, 2021
9df0add
align text
VinInn Dec 11, 2021
418d0c4
Merge branch 'master' into CUDACOOP
VinInn Mar 3, 2022
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
90 changes: 5 additions & 85 deletions HeterogeneousCore/CUDAUtilities/interface/HistoContainer.h
Original file line number Diff line number Diff line change
Expand Up @@ -3,93 +3,13 @@

#include "HeterogeneousCore/CUDAUtilities/interface/OneToManyAssoc.h"

namespace cms {
namespace cuda {

// Count pass: grid-stride loop over all values of the nh concatenated
// sub-vectors (boundaries in offsets[0..nh]); each element increments
// the counter of the histogram it belongs to.
template <typename Histo, typename T>
__global__ void countFromVector(Histo *__restrict__ h,
                                uint32_t nh,
                                T const *__restrict__ v,
                                uint32_t const *__restrict__ offsets) {
  const int stride = gridDim.x * blockDim.x;
  const int total = offsets[nh];
  for (int idx = blockDim.x * blockIdx.x + threadIdx.x; idx < total; idx += stride) {
    // locate the sub-vector (histogram index) containing element idx
    auto bound = cuda_std::upper_bound(offsets, offsets + nh + 1, idx);
    assert((*bound) > 0);
    int32_t histIdx = bound - offsets - 1;
    assert(histIdx >= 0);
    assert(histIdx < int(nh));
    h->count(v[idx], histIdx);
  }
}

// Fill pass: same traversal as countFromVector, but stores each element's
// index into the (already counted and finalized) histogram.
template <typename Histo, typename T>
__global__ void fillFromVector(Histo *__restrict__ h,
                               uint32_t nh,
                               T const *__restrict__ v,
                               uint32_t const *__restrict__ offsets) {
  const int stride = gridDim.x * blockDim.x;
  const int total = offsets[nh];
  for (int idx = blockDim.x * blockIdx.x + threadIdx.x; idx < total; idx += stride) {
    // locate the sub-vector (histogram index) containing element idx
    auto bound = cuda_std::upper_bound(offsets, offsets + nh + 1, idx);
    assert((*bound) > 0);
    int32_t histIdx = bound - offsets - 1;
    assert(histIdx >= 0);
    assert(histIdx < int(nh));
    h->fill(v[idx], idx, histIdx);
  }
}

// Fill a multi-histogram from nh concatenated sub-vectors in v:
// zero counters, count, prefix-scan (finalize), then fill, as three
// kernel launches on `stream` (or sequentially on the host without nvcc).
// `mem` provides external content storage for Histo when needed.
template <typename Histo, typename T>
inline __attribute__((always_inline)) void fillManyFromVector(Histo *__restrict__ h,
uint32_t nh,
T const *__restrict__ v,
uint32_t const *__restrict__ offsets,
int32_t totSize,
int nthreads,
typename Histo::index_type *mem,
cudaStream_t stream
#ifndef __CUDACC__
= cudaStreamDefault
#endif
) {
typename Histo::View view = {h, nullptr, mem, -1, totSize};
launchZero(view, stream);
#ifdef __CUDACC__
// ceil-div; asserts totSize > 0 (nblocks would be 0 otherwise)
auto nblocks = (totSize + nthreads - 1) / nthreads;
assert(nblocks > 0);
countFromVector<<<nblocks, nthreads, 0, stream>>>(h, nh, v, offsets);
cudaCheck(cudaGetLastError());
launchFinalize(view, stream);
fillFromVector<<<nblocks, nthreads, 0, stream>>>(h, nh, v, offsets);
cudaCheck(cudaGetLastError());
#else
countFromVector(h, nh, v, offsets);
h->finalize();
fillFromVector(h, nh, v, offsets);
// NOTE(review): the two #include lines below are a diff-rendering artifact
// (added lines spliced into this deleted block); includes cannot appear
// inside a function body — verify against the actual file.
#include "HeterogeneousCore/CUDAUtilities/interface/device_unique_ptr.h"
#include "HeterogeneousCore/CUDAUtilities/interface/maxCoopBlocks.h"
#endif
}

// iterate over the (up to) n bins left and right of the one containing "v",
// invoking func on every stored index found in those bins
template <typename Hist, typename V, typename Func>
__host__ __device__ __forceinline__ void forEachInBins(Hist const &hist, V value, int n, Func func) {
int bs = Hist::bin(value);
int be = std::min(int(Hist::nbins() - 1), bs + n);  // clamp right edge to last bin
bs = std::max(0, bs - n);                           // clamp left edge to first bin
assert(be >= bs);
for (auto pj = hist.begin(bs); pj < hist.end(be); ++pj) {
func(*pj);
}
}

// iterate over the bins spanning the value window [wmin, wmax],
// invoking func on every stored index found in those bins
template <typename Hist, typename V, typename Func>
__host__ __device__ __forceinline__ void forEachInWindow(Hist const &hist, V wmin, V wmax, Func const &func) {
auto bs = Hist::bin(wmin);
auto be = Hist::bin(wmax);
assert(be >= bs);  // requires wmin <= wmax
for (auto pj = hist.begin(bs); pj < hist.end(be); ++pj) {
func(*pj);
}
}

namespace cms {
namespace cuda {

template <typename T, // the type of the discretized input values
uint32_t NBINS, // number of bins
Expand Down
171 changes: 171 additions & 0 deletions HeterogeneousCore/CUDAUtilities/interface/HistoContainerAlgo.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,171 @@
#ifndef HeterogeneousCore_CUDAUtilities_interface_HistoContainerAlgo_h
#define HeterogeneousCore_CUDAUtilities_interface_HistoContainerAlgo_h

#include "HeterogeneousCore/CUDAUtilities/interface/HistoContainer.h"

#ifdef __CUDACC__
#include "HeterogeneousCore/CUDAUtilities/interface/device_unique_ptr.h"
#include "HeterogeneousCore/CUDAUtilities/interface/maxCoopBlocks.h"
#endif

namespace cms {
namespace cuda {

// Generic cooperative-launch populate kernel: zero-initialize, count,
// prefix-scan (finalize) and fill in a single grid-wide kernel, with
// grid.sync() barriers separating the phases. Must be launched with
// cudaLaunchCooperativeKernel.
//
// Func<cof> must expose a static countOrFill(Histo*, Args...) member.
//
// NOTE(review, from PR discussion): "This is not used (yet?) It may make
// the syntax more complex, not simpler".
template <template <CountOrFill> typename Func, typename Histo, typename... Args>
__global__ void kernel_populate(typename Histo::View view, typename Histo::View::Counter *ws, Args... args) {
  namespace cg = cooperative_groups;
  auto grid = cg::this_grid();
  auto histo = static_cast<Histo *>(view.assoc);
  zeroAndInitCoop(view);
  grid.sync();
  // Pass the pack as plain lvalues: std::forward<Args>(args)... expanded
  // twice on the same pack could move from args in the count pass and
  // leave moved-from values for the fill pass.
  Func<CountOrFill::count>::countOrFill(histo, args...);
  grid.sync();
  finalizeCoop(view, ws);
  grid.sync();
  Func<CountOrFill::fill>::countOrFill(histo, args...);
}

// Shared device helper for the count and fill passes: grid-stride loop
// over all values of the nh concatenated sub-vectors (boundaries in
// offsets[0..nh]), dispatching on cof at compile time.
template <typename Histo, typename T, CountOrFill cof>
__device__ __inline__ void countOrFillFromVector(Histo *__restrict__ h,
                                                 uint32_t nh,
                                                 T const *__restrict__ v,
                                                 uint32_t const *__restrict__ offsets) {
  const int stride = gridDim.x * blockDim.x;
  const int total = offsets[nh];
  for (int idx = blockDim.x * blockIdx.x + threadIdx.x; idx < total; idx += stride) {
    // locate the sub-vector (histogram index) containing element idx
    auto bound = cuda_std::upper_bound(offsets, offsets + nh + 1, idx);
    assert((*bound) > 0);
    int32_t histIdx = bound - offsets - 1;
    assert(histIdx >= 0);
    assert(histIdx < int(nh));
    if constexpr (cof == CountOrFill::count) {
      h->count(v[idx], histIdx);
    } else {
      h->fill(v[idx], idx, histIdx);
    }
  }
}

// Thin __global__ wrapper over countOrFillFromVector so the same helper
// can be used both from plain kernel launches and from cooperative kernels.
template <typename Histo, typename T, CountOrFill cof>
__global__ void countOrFillFromVectorKernel(Histo *__restrict__ h,
uint32_t nh,
T const *__restrict__ v,
uint32_t const *__restrict__ offsets) {
countOrFillFromVector<Histo, T, cof>(h, nh, v, offsets);
}

// Fill a multi-histogram from nh concatenated sub-vectors in v using the
// classic multi-launch scheme: zero, count, prefix-scan (finalize), fill —
// four ordered operations on `stream` (sequential host calls without nvcc).
// `mem` provides external content storage for Histo when needed.
template <typename Histo, typename T>
inline __attribute__((always_inline)) void fillManyFromVector(Histo *__restrict__ h,
uint32_t nh,
T const *__restrict__ v,
uint32_t const *__restrict__ offsets,
int32_t totSize,
int nthreads,
typename Histo::index_type *mem,
cudaStream_t stream
#ifndef __CUDACC__
= cudaStreamDefault
#endif
) {
typename Histo::View view = {h, nullptr, mem, -1, totSize};
launchZero(view, stream);
#ifdef __CUDACC__
// ceil-div; asserts totSize > 0 (a zero-block launch is a CUDA error)
auto nblocks = (totSize + nthreads - 1) / nthreads;
assert(nblocks > 0);
countOrFillFromVectorKernel<Histo, T, CountOrFill::count><<<nblocks, nthreads, 0, stream>>>(h, nh, v, offsets);
cudaCheck(cudaGetLastError());
launchFinalize(view, stream);
countOrFillFromVectorKernel<Histo, T, CountOrFill::fill><<<nblocks, nthreads, 0, stream>>>(h, nh, v, offsets);
cudaCheck(cudaGetLastError());
#else
countOrFillFromVectorKernel<Histo, T, CountOrFill::count>(h, nh, v, offsets);
h->finalize();
countOrFillFromVectorKernel<Histo, T, CountOrFill::fill>(h, nh, v, offsets);
#endif
}

#ifdef __CUDACC__
// Cooperative-launch kernel performing the full fill sequence in one go:
// zero-init, count, grid-wide prefix scan, fill — with grid.sync()
// barriers between phases. `ws` is per-chunk scan workspace; must be
// launched via cudaLaunchCooperativeKernel (grid.sync requirement).
template <typename Histo, typename T>
__global__ void fillManyFromVectorCoopKernel(typename Histo::View view,
uint32_t nh,
T const *__restrict__ v,
uint32_t const *__restrict__ offsets,
int32_t totSize,
typename Histo::View::Counter *ws) {
namespace cg = cooperative_groups;
auto grid = cg::this_grid();
auto h = static_cast<Histo *>(view.assoc);
zeroAndInitCoop(view);
grid.sync();
countOrFillFromVector<Histo, T, CountOrFill::count>(h, nh, v, offsets);
grid.sync();
finalizeCoop(view, ws);
grid.sync();
countOrFillFromVector<Histo, T, CountOrFill::fill>(h, nh, v, offsets);
}
#endif

// Fill a multi-histogram from a vector using a single cooperative-groups
// kernel launch (zero + count + prefix-scan + fill in one kernel, see
// fillManyFromVectorCoopKernel). Falls back to a sequential host path
// when not compiled with nvcc.
template <typename Histo, typename T>
inline __attribute__((always_inline)) void fillManyFromVectorCoop(Histo *h,
                                                                  uint32_t nh,
                                                                  T const *v,
                                                                  uint32_t const *offsets,
                                                                  int32_t totSize,
                                                                  int nthreads,
                                                                  typename Histo::index_type *mem,
                                                                  cudaStream_t stream
#ifndef __CUDACC__
                                                                  = cudaStreamDefault
#endif
) {
  using View = typename Histo::View;
  View view = {h, nullptr, mem, -1, totSize};
#ifdef __CUDACC__
  auto kernel = fillManyFromVectorCoopKernel<Histo, T>;
  auto nblocks = (totSize + nthreads - 1) / nthreads;
  assert(nblocks > 0);
  auto nOnes = view.size();
  // NOTE(review, from PR discussion): this launch boilerplate could be
  // partially encapsulated in a "launch" interface as in launch.h;
  // launch_cooperative(...) exists there but is reportedly untested.
  // workspace for the grid-wide prefix scan, one counter per chunk
  auto nchunks = nOnes / nthreads + 1;
  auto ws = cms::cuda::make_device_unique<typename View::Counter[]>(nchunks, stream);
  auto wsp = ws.get();
  // FIXME: discuss with FW team: cuda calls are expensive and not needed for each event
  static int maxBlocks = maxCoopBlocks(kernel, nthreads, 0, 0);
  // a cooperative launch must fit on the device in a single wave
  auto ncoopblocks = std::min(nblocks, maxBlocks);
  assert(ncoopblocks > 0);
  void *kernelArgs[] = {&view, &nh, &v, &offsets, &totSize, &wsp};
  dim3 dimBlock(nthreads, 1, 1);
  dim3 dimGrid(ncoopblocks, 1, 1);
  // launch
  cudaCheck(cudaLaunchCooperativeKernel((void *)kernel, dimGrid, dimBlock, kernelArgs, 0, stream));
#else
  launchZero(view, stream);
  // Use the count/fill helpers defined in this header: the old free
  // functions countFromVector/fillFromVector were removed from
  // HistoContainer.h in this same change and no longer exist.
  countOrFillFromVectorKernel<Histo, T, CountOrFill::count>(h, nh, v, offsets);
  h->finalize();
  countOrFillFromVectorKernel<Histo, T, CountOrFill::fill>(h, nh, v, offsets);
#endif
}

// Visit every entry stored in the bins within +-n of the bin holding
// "value" (clamped to the valid bin range), calling func on each.
template <typename Hist, typename V, typename Func>
__host__ __device__ __forceinline__ void forEachInBins(Hist const &hist, V value, int n, Func func) {
  const int center = Hist::bin(value);
  const int hi = std::min(int(Hist::nbins() - 1), center + n);
  const int lo = std::max(0, center - n);
  assert(hi >= lo);
  for (auto entry = hist.begin(lo); entry < hist.end(hi); ++entry) {
    func(*entry);
  }
}

// Visit every entry stored in the bins covering the value window
// [wmin, wmax] (requires wmin <= wmax), calling func on each.
template <typename Hist, typename V, typename Func>
__host__ __device__ __forceinline__ void forEachInWindow(Hist const &hist, V wmin, V wmax, Func &&func) {
  const auto lo = Hist::bin(wmin);
  const auto hi = Hist::bin(wmax);
  assert(hi >= lo);
  for (auto entry = hist.begin(lo); entry < hist.end(hi); ++entry) {
    func(*entry);
  }
}
} // namespace cuda
} // namespace cms

#endif // HeterogeneousCore_CUDAUtilities_interface_HistoContainerAlgo_h
65 changes: 55 additions & 10 deletions HeterogeneousCore/CUDAUtilities/interface/OneToManyAssoc.h
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,8 @@
namespace cms {
namespace cuda {

enum class CountOrFill { count, fill };

template <typename Assoc>
struct OneToManyAssocView {
using Counter = typename Assoc::Counter;
Expand All @@ -29,6 +31,24 @@ namespace cms {
index_type *contentStorage = nullptr;
int32_t offSize = -1;
int32_t contentSize = -1;

// Pointer to the offsets (counters) array: the member `off` inside the
// Assoc when its size is known at compile time, otherwise the external
// offStorage buffer (ctNOnes() < 0 marks the runtime-sized case).
constexpr Counter *offsets() const {
Counter *poff = (Counter *)((char *)(assoc) + offsetof(Assoc, off));
if constexpr (Assoc::ctNOnes() < 0) {
assert(offStorage);
poff = offStorage;
}
return poff;
}

// Number of "ones" (offset entries): compile-time value from the Assoc,
// or the runtime offSize when ctNOnes() < 0. Must be positive.
constexpr int32_t size() const {
auto nOnes = Assoc::ctNOnes();
if constexpr (Assoc::ctNOnes() < 0) {
nOnes = offSize;
}
assert(nOnes > 0);
return nOnes;
}
};

// this MUST BE DONE in a single block (or in two kernels!)
Expand All @@ -50,6 +70,26 @@ namespace cms {
}
}

// Cooperative-launch counterpart of zero/init: thread 0 of the whole grid
// resets psws and binds external storage, then — after a grid-wide
// barrier — all threads zero the offset counters with a grid-stride loop.
// Must run inside a cooperative kernel (grid.sync requirement).
template <typename Assoc>
__device__ void zeroAndInitCoop(OneToManyAssocView<Assoc> view) {
namespace cg = cooperative_groups;
auto grid = cg::this_grid();

auto h = view.assoc;

auto first = blockDim.x * blockIdx.x + threadIdx.x;

// single-thread init: scan workspace state and storage binding
if (0 == first) {
h->psws = 0;
h->initStorage(view);
}

// everyone must see the init before zeroing the counters
grid.sync();
for (int i = first, nt = h->totOnes(); i < nt; i += gridDim.x * blockDim.x) {
h->off[i] = 0;
}
}

template <typename Assoc>
inline __attribute__((always_inline)) void launchZero(Assoc *h,
cudaStream_t stream
Expand Down Expand Up @@ -111,16 +151,8 @@ namespace cms {
auto h = view.assoc;
assert(h);
#ifdef __CUDACC__
using Counter = typename Assoc::Counter;
Counter *poff = (Counter *)((char *)(h) + offsetof(Assoc, off));
auto nOnes = Assoc::ctNOnes();
if constexpr (Assoc::ctNOnes() < 0) {
assert(view.offStorage);
assert(view.offSize > 0);
nOnes = view.offSize;
poff = view.offStorage;
}
assert(nOnes > 0);
auto poff = view.offsets();
auto nOnes = view.size();
int32_t *ppsws = (int32_t *)((char *)(h) + offsetof(Assoc, psws));
auto nthreads = 1024;
auto nblocks = (nOnes + nthreads - 1) / nthreads;
Expand All @@ -131,6 +163,19 @@ namespace cms {
#endif
}

// Cooperative-launch finalize: turn the per-bin counts into cumulative
// offsets via a grid-wide prefix scan (in place over the offsets array),
// using `ws` as per-chunk scan workspace. Host fallback just calls the
// Assoc's own finalize().
template <typename Assoc>
__device__ __inline__ void finalizeCoop(OneToManyAssocView<Assoc> view, typename Assoc::Counter *ws) {
#ifdef __CUDACC__
auto poff = view.offsets();
auto nOnes = view.size();
coopBlockPrefixScan(poff, poff, nOnes, ws);
#else
auto h = view.assoc;
assert(h);
h->finalize();
#endif
}

template <typename Assoc>
__global__ void finalizeBulk(AtomicPairCounter const *apc, Assoc *__restrict__ assoc) {
assoc->bulkFinalizeFill(*apc);
Expand Down
Loading