Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Use cooperative groups to populate Associations (Histograms) in Pixel Patatrack #35713

Open
wants to merge 33 commits into
base: master
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
33 commits
Select commit Hold shift + click to select a range
650c971
add cooperative groups
VinInn Oct 12, 2021
e654481
works with coop
VinInn Oct 12, 2021
0614694
coop works in assoc
VinInn Oct 13, 2021
df68916
coops implelented in histo filling
VinInn Oct 13, 2021
f71031a
format
VinInn Oct 13, 2021
ac394e7
use in rechits
VinInn Oct 13, 2021
a3ab3ff
factorize away algos
VinInn Oct 14, 2021
8a5d69b
decapsulate and format
VinInn Oct 14, 2021
6974029
encapsulate
VinInn Oct 14, 2021
0c1e5f4
format
VinInn Oct 14, 2021
151aea7
use coop for other 2 assoc
VinInn Oct 15, 2021
e9a9bda
Merged CUDACOOP from repository VinInn with cms-merge-topic
VinInn Oct 15, 2021
b8e2760
drive performance tests by envvar
VinInn Oct 16, 2021
446d652
add comment
VinInn Oct 16, 2021
5f6f596
factorize, encapsulate
VinInn Oct 17, 2021
71631b2
propagate factorization
VinInn Oct 17, 2021
cf5b8ba
add const
VinInn Nov 16, 2021
da13a56
add const
VinInn Nov 16, 2021
fc4faa7
change to on const
VinInn Nov 16, 2021
ac764da
Update HeterogeneousCore/CUDAUtilities/interface/maxCoopBlocks.h
VinInn Nov 16, 2021
6c38fdf
Merge branch 'master' into CUDACOOP
VinInn Dec 1, 2021
699d28d
Merge branch 'master' into CUDACOOP
VinInn Dec 9, 2021
72f6df7
remove double check
VinInn Dec 9, 2021
6637f96
be more specific with preprocessor flag name
VinInn Dec 9, 2021
e9328e3
be more specific with flag name
VinInn Dec 9, 2021
0160c6d
be more specific with compiler flag (even if it is a test)
VinInn Dec 9, 2021
8f387b0
remove the possibility to tune grid-size using envvar
VinInn Dec 9, 2021
078f314
format code
VinInn Dec 9, 2021
90dec57
By default DO NOT use Cooperative Groups in CA
VinInn Dec 10, 2021
2e58827
fix misspell
VinInn Dec 10, 2021
c09b4de
no cooperative groups in rechits as well
VinInn Dec 11, 2021
9df0add
align text
VinInn Dec 11, 2021
418d0c4
Merge branch 'master' into CUDACOOP
VinInn Mar 3, 2022
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
90 changes: 5 additions & 85 deletions HeterogeneousCore/CUDAUtilities/interface/HistoContainer.h
Original file line number Diff line number Diff line change
Expand Up @@ -3,93 +3,13 @@

#include "HeterogeneousCore/CUDAUtilities/interface/OneToManyAssoc.h"

namespace cms {
namespace cuda {

// Count pass: grid-stride loop over all values of the nh concatenated
// sub-vectors (boundaries in offsets[0..nh]); each element increments
// the counter of the histogram it belongs to.
template <typename Histo, typename T>
__global__ void countFromVector(Histo *__restrict__ h,
                                uint32_t nh,
                                T const *__restrict__ v,
                                uint32_t const *__restrict__ offsets) {
  const int stride = gridDim.x * blockDim.x;
  const int total = offsets[nh];
  for (int idx = blockDim.x * blockIdx.x + threadIdx.x; idx < total; idx += stride) {
    // locate the sub-vector (histogram index) containing element idx
    auto bound = cuda_std::upper_bound(offsets, offsets + nh + 1, idx);
    assert((*bound) > 0);
    int32_t histIdx = bound - offsets - 1;
    assert(histIdx >= 0);
    assert(histIdx < int(nh));
    h->count(v[idx], histIdx);
  }
}

// Fill pass: same traversal as countFromVector, but stores each element's
// index into the (already counted and finalized) histogram.
template <typename Histo, typename T>
__global__ void fillFromVector(Histo *__restrict__ h,
                               uint32_t nh,
                               T const *__restrict__ v,
                               uint32_t const *__restrict__ offsets) {
  const int stride = gridDim.x * blockDim.x;
  const int total = offsets[nh];
  for (int idx = blockDim.x * blockIdx.x + threadIdx.x; idx < total; idx += stride) {
    // locate the sub-vector (histogram index) containing element idx
    auto bound = cuda_std::upper_bound(offsets, offsets + nh + 1, idx);
    assert((*bound) > 0);
    int32_t histIdx = bound - offsets - 1;
    assert(histIdx >= 0);
    assert(histIdx < int(nh));
    h->fill(v[idx], idx, histIdx);
  }
}

// Fill a multi-histogram from nh concatenated sub-vectors in v:
// zero counters, count, prefix-scan (finalize), then fill, as three
// kernel launches on `stream` (or sequentially on the host without nvcc).
// `mem` provides external content storage for Histo when needed.
template <typename Histo, typename T>
inline __attribute__((always_inline)) void fillManyFromVector(Histo *__restrict__ h,
uint32_t nh,
T const *__restrict__ v,
uint32_t const *__restrict__ offsets,
int32_t totSize,
int nthreads,
typename Histo::index_type *mem,
cudaStream_t stream
#ifndef __CUDACC__
= cudaStreamDefault
#endif
) {
typename Histo::View view = {h, nullptr, mem, -1, totSize};
launchZero(view, stream);
#ifdef __CUDACC__
// ceil-div; asserts totSize > 0 (nblocks would be 0 otherwise)
auto nblocks = (totSize + nthreads - 1) / nthreads;
assert(nblocks > 0);
countFromVector<<<nblocks, nthreads, 0, stream>>>(h, nh, v, offsets);
cudaCheck(cudaGetLastError());
launchFinalize(view, stream);
fillFromVector<<<nblocks, nthreads, 0, stream>>>(h, nh, v, offsets);
cudaCheck(cudaGetLastError());
#else
countFromVector(h, nh, v, offsets);
h->finalize();
fillFromVector(h, nh, v, offsets);
// NOTE(review): the two #include lines below are a diff-rendering artifact
// (added lines spliced into this deleted block); includes cannot appear
// inside a function body — verify against the actual file.
#include "HeterogeneousCore/CUDAUtilities/interface/device_unique_ptr.h"
#include "HeterogeneousCore/CUDAUtilities/interface/maxCoopBlocks.h"
#endif
}

// iterate over the (up to) n bins left and right of the one containing "v",
// invoking func on every stored index found in those bins
template <typename Hist, typename V, typename Func>
__host__ __device__ __forceinline__ void forEachInBins(Hist const &hist, V value, int n, Func func) {
int bs = Hist::bin(value);
int be = std::min(int(Hist::nbins() - 1), bs + n);  // clamp right edge to last bin
bs = std::max(0, bs - n);                           // clamp left edge to first bin
assert(be >= bs);
for (auto pj = hist.begin(bs); pj < hist.end(be); ++pj) {
func(*pj);
}
}

// iterate over the bins spanning the value window [wmin, wmax],
// invoking func on every stored index found in those bins
template <typename Hist, typename V, typename Func>
__host__ __device__ __forceinline__ void forEachInWindow(Hist const &hist, V wmin, V wmax, Func const &func) {
auto bs = Hist::bin(wmin);
auto be = Hist::bin(wmax);
assert(be >= bs);  // requires wmin <= wmax
for (auto pj = hist.begin(bs); pj < hist.end(be); ++pj) {
func(*pj);
}
}

namespace cms {
namespace cuda {

template <typename T, // the type of the discretized input values
uint32_t NBINS, // number of bins
Expand Down
171 changes: 171 additions & 0 deletions HeterogeneousCore/CUDAUtilities/interface/HistoContainerAlgo.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,171 @@
#ifndef HeterogeneousCore_CUDAUtilities_interface_HistoContainerAlgo_h
#define HeterogeneousCore_CUDAUtilities_interface_HistoContainerAlgo_h

#include "HeterogeneousCore/CUDAUtilities/interface/HistoContainer.h"

#ifdef __CUDACC__
#include "HeterogeneousCore/CUDAUtilities/interface/device_unique_ptr.h"
#include "HeterogeneousCore/CUDAUtilities/interface/maxCoopBlocks.h"
#endif

namespace cms {
namespace cuda {

// Generic cooperative-launch populate kernel: zero-initialize, count,
// prefix-scan (finalize) and fill in a single grid-wide kernel, with
// grid.sync() barriers separating the phases. Must be launched with
// cudaLaunchCooperativeKernel.
//
// Func<cof> must expose a static countOrFill(Histo*, Args...) member.
//
// NOTE(review, from PR discussion): "This is not used (yet?) It may make
// the syntax more complex, not simpler".
template <template <CountOrFill> typename Func, typename Histo, typename... Args>
__global__ void kernel_populate(typename Histo::View view, typename Histo::View::Counter *ws, Args... args) {
  namespace cg = cooperative_groups;
  auto grid = cg::this_grid();
  auto histo = static_cast<Histo *>(view.assoc);
  zeroAndInitCoop(view);
  grid.sync();
  // Pass the pack as plain lvalues: std::forward<Args>(args)... expanded
  // twice on the same pack could move from args in the count pass and
  // leave moved-from values for the fill pass.
  Func<CountOrFill::count>::countOrFill(histo, args...);
  grid.sync();
  finalizeCoop(view, ws);
  grid.sync();
  Func<CountOrFill::fill>::countOrFill(histo, args...);
}

// Shared device helper for the count and fill passes: grid-stride loop
// over all values of the nh concatenated sub-vectors (boundaries in
// offsets[0..nh]), dispatching on cof at compile time.
template <typename Histo, typename T, CountOrFill cof>
__device__ __inline__ void countOrFillFromVector(Histo *__restrict__ h,
                                                 uint32_t nh,
                                                 T const *__restrict__ v,
                                                 uint32_t const *__restrict__ offsets) {
  const int stride = gridDim.x * blockDim.x;
  const int total = offsets[nh];
  for (int idx = blockDim.x * blockIdx.x + threadIdx.x; idx < total; idx += stride) {
    // locate the sub-vector (histogram index) containing element idx
    auto bound = cuda_std::upper_bound(offsets, offsets + nh + 1, idx);
    assert((*bound) > 0);
    int32_t histIdx = bound - offsets - 1;
    assert(histIdx >= 0);
    assert(histIdx < int(nh));
    if constexpr (cof == CountOrFill::count) {
      h->count(v[idx], histIdx);
    } else {
      h->fill(v[idx], idx, histIdx);
    }
  }
}

// Thin __global__ wrapper over countOrFillFromVector so the same helper
// can be used both from plain kernel launches and from cooperative kernels.
template <typename Histo, typename T, CountOrFill cof>
__global__ void countOrFillFromVectorKernel(Histo *__restrict__ h,
uint32_t nh,
T const *__restrict__ v,
uint32_t const *__restrict__ offsets) {
countOrFillFromVector<Histo, T, cof>(h, nh, v, offsets);
}

// Fill a multi-histogram from nh concatenated sub-vectors in v using the
// classic multi-launch scheme: zero, count, prefix-scan (finalize), fill —
// four ordered operations on `stream` (sequential host calls without nvcc).
// `mem` provides external content storage for Histo when needed.
template <typename Histo, typename T>
inline __attribute__((always_inline)) void fillManyFromVector(Histo *__restrict__ h,
uint32_t nh,
T const *__restrict__ v,
uint32_t const *__restrict__ offsets,
int32_t totSize,
int nthreads,
typename Histo::index_type *mem,
cudaStream_t stream
#ifndef __CUDACC__
= cudaStreamDefault
#endif
) {
typename Histo::View view = {h, nullptr, mem, -1, totSize};
launchZero(view, stream);
#ifdef __CUDACC__
// ceil-div; asserts totSize > 0 (a zero-block launch is a CUDA error)
auto nblocks = (totSize + nthreads - 1) / nthreads;
assert(nblocks > 0);
countOrFillFromVectorKernel<Histo, T, CountOrFill::count><<<nblocks, nthreads, 0, stream>>>(h, nh, v, offsets);
cudaCheck(cudaGetLastError());
launchFinalize(view, stream);
countOrFillFromVectorKernel<Histo, T, CountOrFill::fill><<<nblocks, nthreads, 0, stream>>>(h, nh, v, offsets);
cudaCheck(cudaGetLastError());
#else
countOrFillFromVectorKernel<Histo, T, CountOrFill::count>(h, nh, v, offsets);
h->finalize();
countOrFillFromVectorKernel<Histo, T, CountOrFill::fill>(h, nh, v, offsets);
#endif
}

#ifdef __CUDACC__
// Cooperative-launch kernel performing the full fill sequence in one go:
// zero-init, count, grid-wide prefix scan, fill — with grid.sync()
// barriers between phases. `ws` is per-chunk scan workspace; must be
// launched via cudaLaunchCooperativeKernel (grid.sync requirement).
template <typename Histo, typename T>
__global__ void fillManyFromVectorCoopKernel(typename Histo::View view,
uint32_t nh,
T const *__restrict__ v,
uint32_t const *__restrict__ offsets,
int32_t totSize,
typename Histo::View::Counter *ws) {
namespace cg = cooperative_groups;
auto grid = cg::this_grid();
auto h = static_cast<Histo *>(view.assoc);
zeroAndInitCoop(view);
grid.sync();
countOrFillFromVector<Histo, T, CountOrFill::count>(h, nh, v, offsets);
grid.sync();
finalizeCoop(view, ws);
grid.sync();
countOrFillFromVector<Histo, T, CountOrFill::fill>(h, nh, v, offsets);
}
#endif

// Fill a multi-histogram from a vector using a single cooperative-groups
// kernel launch (zero + count + prefix-scan + fill in one kernel, see
// fillManyFromVectorCoopKernel). Falls back to a sequential host path
// when not compiled with nvcc.
template <typename Histo, typename T>
inline __attribute__((always_inline)) void fillManyFromVectorCoop(Histo *h,
                                                                  uint32_t nh,
                                                                  T const *v,
                                                                  uint32_t const *offsets,
                                                                  int32_t totSize,
                                                                  int nthreads,
                                                                  typename Histo::index_type *mem,
                                                                  cudaStream_t stream
#ifndef __CUDACC__
                                                                  = cudaStreamDefault
#endif
) {
  using View = typename Histo::View;
  View view = {h, nullptr, mem, -1, totSize};
#ifdef __CUDACC__
  auto kernel = fillManyFromVectorCoopKernel<Histo, T>;
  auto nblocks = (totSize + nthreads - 1) / nthreads;
  assert(nblocks > 0);
  auto nOnes = view.size();
  // NOTE(review, from PR discussion): this launch boilerplate could be
  // partially encapsulated in a "launch" interface as in launch.h;
  // launch_cooperative(...) exists there but is reportedly untested.
  // workspace for the grid-wide prefix scan, one counter per chunk
  auto nchunks = nOnes / nthreads + 1;
  auto ws = cms::cuda::make_device_unique<typename View::Counter[]>(nchunks, stream);
  auto wsp = ws.get();
  // FIXME: discuss with FW team: cuda calls are expensive and not needed for each event
  static int maxBlocks = maxCoopBlocks(kernel, nthreads, 0, 0);
  // a cooperative launch must fit on the device in a single wave
  auto ncoopblocks = std::min(nblocks, maxBlocks);
  assert(ncoopblocks > 0);
  void *kernelArgs[] = {&view, &nh, &v, &offsets, &totSize, &wsp};
  dim3 dimBlock(nthreads, 1, 1);
  dim3 dimGrid(ncoopblocks, 1, 1);
  // launch
  cudaCheck(cudaLaunchCooperativeKernel((void *)kernel, dimGrid, dimBlock, kernelArgs, 0, stream));
#else
  launchZero(view, stream);
  // Use the count/fill helpers defined in this header: the old free
  // functions countFromVector/fillFromVector were removed from
  // HistoContainer.h in this same change and no longer exist.
  countOrFillFromVectorKernel<Histo, T, CountOrFill::count>(h, nh, v, offsets);
  h->finalize();
  countOrFillFromVectorKernel<Histo, T, CountOrFill::fill>(h, nh, v, offsets);
#endif
}

// Visit every entry stored in the bins within +-n of the bin holding
// "value" (clamped to the valid bin range), calling func on each.
template <typename Hist, typename V, typename Func>
__host__ __device__ __forceinline__ void forEachInBins(Hist const &hist, V value, int n, Func func) {
  const int center = Hist::bin(value);
  const int hi = std::min(int(Hist::nbins() - 1), center + n);
  const int lo = std::max(0, center - n);
  assert(hi >= lo);
  for (auto entry = hist.begin(lo); entry < hist.end(hi); ++entry) {
    func(*entry);
  }
}

// Visit every entry stored in the bins covering the value window
// [wmin, wmax] (requires wmin <= wmax), calling func on each.
template <typename Hist, typename V, typename Func>
__host__ __device__ __forceinline__ void forEachInWindow(Hist const &hist, V wmin, V wmax, Func &&func) {
  const auto lo = Hist::bin(wmin);
  const auto hi = Hist::bin(wmax);
  assert(hi >= lo);
  for (auto entry = hist.begin(lo); entry < hist.end(hi); ++entry) {
    func(*entry);
  }
}
} // namespace cuda
} // namespace cms

#endif // HeterogeneousCore_CUDAUtilities_interface_HistoContainerAlgo_h
65 changes: 55 additions & 10 deletions HeterogeneousCore/CUDAUtilities/interface/OneToManyAssoc.h
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,8 @@
namespace cms {
namespace cuda {

enum class CountOrFill { count, fill };

template <typename Assoc>
struct OneToManyAssocView {
using Counter = typename Assoc::Counter;
Expand All @@ -29,6 +31,24 @@ namespace cms {
index_type *contentStorage = nullptr;
int32_t offSize = -1;
int32_t contentSize = -1;

// Pointer to the offsets (counters) array: the member `off` inside the
// Assoc when its size is known at compile time, otherwise the external
// offStorage buffer (ctNOnes() < 0 marks the runtime-sized case).
constexpr Counter *offsets() const {
Counter *poff = (Counter *)((char *)(assoc) + offsetof(Assoc, off));
if constexpr (Assoc::ctNOnes() < 0) {
assert(offStorage);
poff = offStorage;
}
return poff;
}

// Number of "ones" (offset entries): compile-time value from the Assoc,
// or the runtime offSize when ctNOnes() < 0. Must be positive.
constexpr int32_t size() const {
auto nOnes = Assoc::ctNOnes();
if constexpr (Assoc::ctNOnes() < 0) {
nOnes = offSize;
}
assert(nOnes > 0);
return nOnes;
}
};

// this MUST BE DONE in a single block (or in two kernels!)
Expand All @@ -50,6 +70,26 @@ namespace cms {
}
}

// Cooperative-launch counterpart of zero/init: thread 0 of the whole grid
// resets psws and binds external storage, then — after a grid-wide
// barrier — all threads zero the offset counters with a grid-stride loop.
// Must run inside a cooperative kernel (grid.sync requirement).
template <typename Assoc>
__device__ void zeroAndInitCoop(OneToManyAssocView<Assoc> view) {
namespace cg = cooperative_groups;
auto grid = cg::this_grid();

auto h = view.assoc;

auto first = blockDim.x * blockIdx.x + threadIdx.x;

// single-thread init: scan workspace state and storage binding
if (0 == first) {
h->psws = 0;
h->initStorage(view);
}

// everyone must see the init before zeroing the counters
grid.sync();
for (int i = first, nt = h->totOnes(); i < nt; i += gridDim.x * blockDim.x) {
h->off[i] = 0;
}
}

template <typename Assoc>
inline __attribute__((always_inline)) void launchZero(Assoc *h,
cudaStream_t stream
Expand Down Expand Up @@ -111,16 +151,8 @@ namespace cms {
auto h = view.assoc;
assert(h);
#ifdef __CUDACC__
using Counter = typename Assoc::Counter;
Counter *poff = (Counter *)((char *)(h) + offsetof(Assoc, off));
auto nOnes = Assoc::ctNOnes();
if constexpr (Assoc::ctNOnes() < 0) {
assert(view.offStorage);
assert(view.offSize > 0);
nOnes = view.offSize;
poff = view.offStorage;
}
assert(nOnes > 0);
auto poff = view.offsets();
auto nOnes = view.size();
int32_t *ppsws = (int32_t *)((char *)(h) + offsetof(Assoc, psws));
auto nthreads = 1024;
auto nblocks = (nOnes + nthreads - 1) / nthreads;
Expand All @@ -131,6 +163,19 @@ namespace cms {
#endif
}

// Cooperative-launch finalize: turn the per-bin counts into cumulative
// offsets via a grid-wide prefix scan (in place over the offsets array),
// using `ws` as per-chunk scan workspace. Host fallback just calls the
// Assoc's own finalize().
template <typename Assoc>
__device__ __inline__ void finalizeCoop(OneToManyAssocView<Assoc> view, typename Assoc::Counter *ws) {
#ifdef __CUDACC__
auto poff = view.offsets();
auto nOnes = view.size();
coopBlockPrefixScan(poff, poff, nOnes, ws);
#else
auto h = view.assoc;
assert(h);
h->finalize();
#endif
}

template <typename Assoc>
__global__ void finalizeBulk(AtomicPairCounter const *apc, Assoc *__restrict__ assoc) {
assoc->bulkFinalizeFill(*apc);
Expand Down
Loading