Introduce Cluster Charge Cut, optimize Histogram (bucket sorting) #171

Merged: 37 commits, Sep 26, 2018. Changes shown from 31 of the 37 commits.

Commits
eb29a8b
use ext ws or shared
VinInn Sep 14, 2018
fe2af2e
add minBLperML and a debug printout
VinInn Sep 14, 2018
8b47a72
a small prefix scan in single block
VinInn Sep 15, 2018
90344c7
add charge cut
VinInn Sep 15, 2018
b1e7746
apply charge cut
VinInn Sep 15, 2018
dca2f1e
ccc works
VinInn Sep 16, 2018
6a06ece
stable, but something fishy in test
VinInn Sep 16, 2018
50460d9
of course the test had a bug
VinInn Sep 16, 2018
9103a9c
works, sort only with blksize 256
VinInn Sep 17, 2018
f05df88
make the number of bins configurable
VinInn Sep 18, 2018
304c607
fix race
VinInn Sep 18, 2018
b1e2810
compactify the histogram (to make iteration faster?)
VinInn Sep 19, 2018
f9dad13
not faster, at least avoid spill
VinInn Sep 20, 2018
81a15e6
will need some cleanup
VinInn Sep 20, 2018
8eb1210
clean hist header
VinInn Sep 20, 2018
960de3d
use one bin per column
VinInn Sep 20, 2018
25b24c5
from done to more, and debug
VinInn Sep 20, 2018
7097934
vertex finding is faster now
VinInn Sep 20, 2018
a108571
use new clustering for real!
VinInn Sep 20, 2018
c711b5d
test cub as well
VinInn Sep 21, 2018
83879cd
works in c++
VinInn Sep 21, 2018
ba38606
works
VinInn Sep 21, 2018
b8df4f8
old histo is gone
VinInn Sep 21, 2018
1bb3bda
wip
VinInn Sep 21, 2018
78e4aa5
wip
VinInn Sep 21, 2018
0c19cd1
does not crash
VinInn Sep 22, 2018
1b55356
remove hard limit
VinInn Sep 22, 2018
70d2fcb
mark lost clusters
VinInn Sep 22, 2018
47a3f04
a bit of cleanup, no speedup
VinInn Sep 22, 2018
85453af
force load in constant cache, reduce size of Cell
VinInn Sep 24, 2018
d93e14f
be consistent with ht index type
VinInn Sep 24, 2018
bda658a
fill pt2 and index for nv==1 as well
VinInn Sep 25, 2018
75f2948
check cudaGetLastError() after launching each kernel
fwyzard Sep 25, 2018
3f69a40
check cudaGetLastError() after launching each kernel
fwyzard Sep 25, 2018
a960668
check cudaGetLastError() after launching each kernel
fwyzard Sep 25, 2018
742399a
Merge branch 'CMSSW_10_2_X_Patatrack' into TuningFor1025
fwyzard Sep 25, 2018
0e2812c
Declare the direct dependencies
fwyzard Sep 25, 2018
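
The core of the PR is a rewrite of HistoContainer from fixed-size bins plus spill storage to a bucket-sort (counting-sort) layout: a first pass counts how many entries land in each bin, a prefix scan over the counters turns the counts into per-bin offsets, and a second pass scatters the element indices into one contiguous array. A minimal host-side sketch of that pattern (illustrative names, not the actual CMSSW code):

#include <cstdint>
#include <vector>

// count / scan / fill: the three phases the new HistoContainer runs on the GPU
void bucketSort(std::vector<uint32_t> const& binOf,  // bin index of each element
                uint32_t nbins,
                std::vector<uint32_t>& bins) {       // out: element indices, ordered by bin
  std::vector<uint32_t> off(nbins + 1, 0);
  uint32_t n = uint32_t(binOf.size());
  for (auto b : binOf) ++off[b + 1];                            // 1) count
  for (uint32_t i = 1; i <= nbins; ++i) off[i] += off[i - 1];   // 2) prefix scan -> offsets
  std::vector<uint32_t> ws(nbins, 0);                           //    per-bin cursors ("ws")
  bins.resize(n);
  for (uint32_t j = 0; j < n; ++j) {                            // 3) fill
    auto b = binOf[j];
    bins[off[b] + ws[b]++] = j;
  }
}
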
1 change: 1 addition & 0 deletions HeterogeneousCore/CUDAUtilities/BuildFile.xml
@@ -1 +1,2 @@
<use name="cuda"/>
<use name="cub"/>
HeterogeneousCore/CUDAUtilities/interface/GPUSimpleVector.h
@@ -87,7 +87,7 @@ template <class T> struct SimpleVector {
inline constexpr void reset() { m_size = 0; }
inline constexpr int size() const { return m_size; }
inline constexpr int capacity() const { return m_capacity; }
inline constexpr T *data() const { return m_data; }
inline constexpr T const * data() const { return m_data; }
inline constexpr void resize(int size) { m_size = size; }
inline constexpr void set_data(T * data) { m_data = data; }

2 changes: 1 addition & 1 deletion HeterogeneousCore/CUDAUtilities/interface/GPUVecArray.h
@@ -85,7 +85,7 @@ template <class T, int maxSize> struct VecArray {
inline constexpr const T& operator[](int i) const { return m_data[i]; }
inline constexpr void reset() { m_size = 0; }
inline constexpr int capacity() const { return maxSize; }
inline constexpr T *data() const { return m_data; }
inline constexpr T const * data() const { return m_data; }
inline constexpr void resize(int size) { m_size = size; }
inline constexpr bool empty() const { return 0 == m_size; }
inline constexpr bool full() const { return maxSize == m_size; }
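
Both data() changes apply the same const-correctness idiom: the const accessor now returns a pointer-to-const, so code holding a const SimpleVector or VecArray can no longer mutate elements through data(). A sketch of the pattern (not the full class):

template <class T> struct Buffer {
  T const* data() const { return m_data; }  // read-only view through a const object
  T* m_data;
};
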
233 changes: 145 additions & 88 deletions HeterogeneousCore/CUDAUtilities/interface/HistoContainer.h
@@ -2,6 +2,7 @@
#define HeterogeneousCore_CUDAUtilities_HistoContainer_h

#include <cassert>
#include <cstddef>
#include <cstdint>
#include <algorithm>
#include <type_traits>
@@ -10,96 +11,100 @@
#endif // __CUDA_ARCH__

#include "HeterogeneousCore/CUDAUtilities/interface/cudastdAlgorithm.h"
#ifdef __CUDACC__
#include "HeterogeneousCore/CUDAUtilities/interface/prefixScan.h"
#include <cub/cub.cuh>
#endif
#include "HeterogeneousCore/CUDAUtilities/interface/cudaCheck.h"


#ifdef __CUDACC__
namespace cudautils {

template<typename Histo>
__global__
void zeroMany(Histo * h, uint32_t nh) {
auto i = blockIdx.x * blockDim.x + threadIdx.x;
auto ih = i / Histo::nbins();
auto k = i - ih * Histo::nbins();
if (ih < nh) {
h[ih].nspills = 0;
if (k < Histo::nbins())
h[ih].n[k] = 0;
}
}

template<typename Histo, typename T>
__global__
void fillFromVector(Histo * h, uint32_t nh, T const * v, uint32_t * offsets) {
void countFromVector(Histo * __restrict__ h, uint32_t nh, T const * __restrict__ v, uint32_t const * __restrict__ offsets) {
auto i = blockIdx.x * blockDim.x + threadIdx.x;
if(i >= offsets[nh]) return;
auto off = cuda_std::upper_bound(offsets, offsets + nh + 1, i);
assert((*off) > 0);
int32_t ih = off - offsets - 1;
assert(ih >= 0);
assert(ih < nh);
h[ih].fill(v[i], i);
assert(ih < nh);
(*h).count(v[i], ih);
}

template<typename Histo, typename T>
__global__
void fillFromVector(Histo * h, T const * v, uint32_t size) {
void fillFromVector(Histo * __restrict__ h, uint32_t nh, T const * __restrict__ v, uint32_t const * __restrict__ offsets,
uint32_t * __restrict__ ws ) {
auto i = blockIdx.x * blockDim.x + threadIdx.x;
if(i < size) h->fill(v[i], i);
}

template<typename Histo>
void zero(Histo * h, uint32_t nh, int nthreads, cudaStream_t stream) {
auto nblocks = (nh * Histo::nbins() + nthreads - 1) / nthreads;
zeroMany<<<nblocks, nthreads, 0, stream>>>(h, nh);
cudaCheck(cudaGetLastError());
if(i >= offsets[nh]) return;
auto off = cuda_std::upper_bound(offsets, offsets + nh + 1, i);
assert((*off) > 0);
int32_t ih = off - offsets - 1;
assert(ih >= 0);
assert(ih < nh);
(*h).fill(v[i], i, ws, ih);
}

template<typename Histo, typename T>
void fillOneFromVector(Histo * h, T const * v, uint32_t size, int nthreads, cudaStream_t stream) {
zero(h, 1, nthreads, stream);
auto nblocks = (size + nthreads - 1) / nthreads;
fillFromVector<<<nblocks, nthreads, 0, stream>>>(h, v, size);
cudaCheck(cudaGetLastError());
}

template<typename Histo, typename T>
void fillManyFromVector(Histo * h, uint32_t nh, T const * v, uint32_t * offsets, uint32_t totSize, int nthreads, cudaStream_t stream) {
zero(h, nh, nthreads, stream);
void fillManyFromVector(Histo * __restrict__ h, typename Histo::Counter * __restrict__ ws,
uint32_t nh, T const * __restrict__ v, uint32_t const * __restrict__ offsets, uint32_t totSize,
int nthreads, cudaStream_t stream) {
uint32_t * off = (uint32_t *)( (char*)(h) +offsetof(Histo,off));
cudaMemsetAsync(off,0, 4*Histo::totbins(),stream);
auto nblocks = (totSize + nthreads - 1) / nthreads;
fillFromVector<<<nblocks, nthreads, 0, stream>>>(h, nh, v, offsets);
countFromVector<<<nblocks, nthreads, 0, stream>>>(h, nh, v, offsets);
size_t wss = Histo::totbins();
CubDebugExit(cub::DeviceScan::InclusiveSum(ws, wss, off, off, Histo::totbins(), stream));
cudaMemsetAsync(ws,0, 4*Histo::totbins(),stream);
fillFromVector<<<nblocks, nthreads, 0, stream>>>(h, nh, v, offsets,ws);
cudaCheck(cudaGetLastError());
}


} // namespace cudautils
#endif
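
// Usage sketch for the two-pass fill above (hypothetical host code): "ws" is a
// device workspace sized per Histo::wsSize(), reused both as CUB temporary
// storage for the InclusiveSum and as the per-bin fill cursors:
//
//   Histo *d_hist = ...;             // device-side histogram
//   Histo::Counter *d_ws = ...;      // device-side workspace
//   cudautils::fillManyFromVector(d_hist, d_ws, nh, d_values, d_offsets,
//                                 totSize, 256, stream);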


// iterate over N bins left and right of the one containing "v"
// including spillBin
template<typename Hist, typename V, typename Func>
__host__ __device__
__forceinline__
void forEachInBins(Hist const & hist, V value, int n, Func func) {
int bs = hist.bin(value);
int be = std::min(int(hist.nbins()),bs+n+1);
int bs = Hist::bin(value);
int be = std::min(int(Hist::nbins()-1),bs+n);
bs = std::max(0,bs-n);
assert(be>bs);
for (auto b=bs; b<be; ++b){
for (auto pj=hist.begin(b);pj<hist.end(b);++pj) {
assert(be>=bs);
for (auto pj=hist.begin(bs);pj<hist.end(be);++pj) {
func(*pj);
}
}

// iterate over bins containing all values in window wmin, wmax
template<typename Hist, typename V, typename Func>
__host__ __device__
__forceinline__
void forEachInWindow(Hist const & hist, V wmin, V wmax, Func const & func) {
auto bs = Hist::bin(wmin);
auto be = Hist::bin(wmax);
assert(be>=bs);
for (auto pj=hist.begin(bs);pj<hist.end(be);++pj) {
func(*pj);
}}
for (auto pj=hist.beginSpill();pj<hist.endSpill();++pj)
func(*pj);
}
}
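
// Usage sketch (hypothetical): visit every stored index whose value falls in
// the window [wmin, wmax], e.g. all hits within a z range:
//
//   forEachInWindow(hist, zmin, zmax, [&](auto i) { use(values[i]); });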



template<
typename T, // the type of the discretized input values
uint32_t N, // number of bins (in bits)
uint32_t M, // max number of element a bin can contain
uint32_t NBINS, // number of bins
uint32_t SIZE, // max number of element
uint32_t S=sizeof(T) * 8, // number of significant bits in T
typename I=uint32_t // type stored in the container (usually an index in a vector of the input values)
typename I=uint32_t, // type stored in the container (usually an index in a vector of the input values)
uint32_t NHISTS=1 // number of histos stored
>
class HistoContainer {
public:
@@ -111,24 +116,55 @@ class HistoContainer {

using index_type = I;
using UT = typename std::make_unsigned<T>::type;

static constexpr uint32_t ilog2(uint32_t v) {

constexpr uint32_t b[] = {0x2, 0xC, 0xF0, 0xFF00, 0xFFFF0000};
constexpr uint32_t s[] = {1, 2, 4, 8, 16};

uint32_t r = 0; // result of log2(v) will go here
for (auto i = 4; i >= 0; i--) if (v & b[i]) {
v >>= s[i];
r |= s[i];
}
return r;
}
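// e.g. ilog2(1)==0, ilog2(3)==1, ilog2(1024)==10; nbits() below uses it to get
// the number of address bits needed for NBINS bins as ilog2(NBINS-1)+1.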


static constexpr uint32_t sizeT() { return S; }
static constexpr uint32_t nbins() { return 1 << N; }
static constexpr uint32_t shift() { return sizeT() - N; }
static constexpr uint32_t mask() { return nbins() - 1; }
static constexpr uint32_t binSize() { return 1 << M; }
static constexpr uint32_t spillSize() { return 16 * binSize(); }
static constexpr uint32_t nbins() { return NBINS;}
static constexpr uint32_t nhists() { return NHISTS;}
static constexpr uint32_t totbins() { return NHISTS*NBINS+1;}
static constexpr uint32_t nbits() { return ilog2(NBINS-1)+1;}
static constexpr uint32_t capacity() { return SIZE; }

static constexpr auto histOff(uint32_t nh) { return NBINS*nh; }

#ifdef __CUDACC__
__host__
static size_t wsSize() {

Review comment:
Is this function actually called anywhere (besides a printout)? Or is temp_storage_size in practice always smaller than 4*totbins() (which, IIUC, gets used as the temporary storage for InclusiveSum)?

VinInn (author):
In principle it should be used; I verified that temp_storage_size is in practice always smaller than 4*totbins().

VinInn (author):
BTW, I was also waiting for us to define a strategy for a reusable ws...

uint32_t * v =nullptr;
void * d_temp_storage = nullptr;
size_t temp_storage_bytes = 0;
cub::DeviceScan::InclusiveSum(d_temp_storage, temp_storage_bytes, v, v, totbins()-1);
return std::max(temp_storage_bytes,size_t(totbins()));
}
#endif
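
// Note: calling a CUB device-wide algorithm with a null temp-storage pointer
// is the standard way to query (not run) it; wsSize() above uses that to size
// the shared workspace. The general two-phase pattern (sketch):
//
//   size_t bytes = 0;
//   cub::DeviceScan::InclusiveSum(nullptr, bytes, d_in, d_out, n);  // query size
//   cudaMalloc(&d_temp, bytes);
//   cub::DeviceScan::InclusiveSum(d_temp, bytes, d_in, d_out, n);   // run scan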


static constexpr UT bin(T t) {
return (t >> shift()) & mask();
constexpr uint32_t shift = sizeT() - nbits();
constexpr uint32_t mask = (1<<nbits()) - 1;
return (t >> shift) & mask;
}
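// e.g. for T=int16_t and NBINS=128: nbits()==7, shift==9, mask==0x7f, so the
// bin is simply the top 7 bits of the 16-bit value.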

void zero() {
nspills = 0;
for (auto & i : n)
for (auto & i : off)
i = 0;
}

static constexpr
static __host__ __device__
__forceinline__
uint32_t atomicIncrement(Counter & x) {
#ifdef __CUDA_ARCH__
return atomicAdd(&x, 1);
@@ -138,51 +174,72 @@ class HistoContainer {
}

__host__ __device__
void fill(T t, index_type j) {
UT b = bin(t);
__forceinline__
void count(T t) {
uint32_t b = bin(t);
assert(b<nbins());
auto w = atomicIncrement(n[b]);
if (w < binSize()) {
bins[b * binSize() + w] = j;
} else {
auto w = atomicIncrement(nspills);
if (w < spillSize())
spillBin[w] = j;
}
atomicIncrement(off[b+1]);
}

constexpr bool fullSpill() const {
return nspills >= spillSize();
__host__ __device__
__forceinline__
void fill(T t, index_type j, Counter * ws) {
uint32_t b = bin(t);
assert(b<nbins());
auto w = atomicIncrement(ws[b]);
assert(w < size(b));
bins[off[b] + w] = j;
}

constexpr bool full(uint32_t b) const {
return n[b] >= binSize();
}

constexpr auto const * begin(uint32_t b) const {
return bins + b * binSize();
}

constexpr auto const * end(uint32_t b) const {
return begin(b) + std::min(binSize(), uint32_t(n[b]));
__host__ __device__
__forceinline__
void count(T t, uint32_t nh) {
uint32_t b = bin(t);
assert(b<nbins());
b+=histOff(nh);
assert(b<totbins());
atomicIncrement(off[b+1]);
}

constexpr auto size(uint32_t b) const {
return n[b];
__host__ __device__
__forceinline__
void fill(T t, index_type j, Counter * ws, uint32_t nh) {
uint32_t b = bin(t);
assert(b<nbins());
b+=histOff(nh);
assert(b<totbins());
auto w = atomicIncrement(ws[b]);
assert(w < size(b));
bins[off[b] + w] = j;
}
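
// Protocol note: count() must run for all elements first, the counters are
// then turned into offsets by a prefix scan (finalize() below, or the
// device-wide scan in fillManyFromVector), and only then may fill() scatter
// the indices; ws[] holds the per-bin write cursors during the fill pass.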

constexpr auto const * beginSpill() const {
return spillBin;
#ifdef __CUDACC__
__device__
__forceinline__
void finalize(Counter * ws) {
blockPrefixScan(off+1,totbins()-1,ws);
}

constexpr auto const * endSpill() const {
return beginSpill() + std::min(spillSize(), uint32_t(nspills));
__host__
#endif
void finalize() {
for(uint32_t i=2; i<totbins(); ++i) off[i]+=off[i-1];
}
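
// After finalize(), off[] holds the inclusive prefix sum of the per-bin
// counts, so bin b owns bins[off[b], off[b+1]): e.g. counts {3,1,2} stored in
// off[1..3] become offsets {0,3,4,6}.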

Counter n[nbins()];
Counter nspills;
index_type bins[nbins()*binSize()];
index_type spillBin[spillSize()];
constexpr auto size() const { return uint32_t(off[totbins()-1]);}
constexpr auto size(uint32_t b) const { return off[b+1]-off[b];}


constexpr index_type const * begin() const { return bins;}
constexpr index_type const * end() const { return begin() + size();}


constexpr index_type const * begin(uint32_t b) const { return bins + off[b];}
constexpr index_type const * end(uint32_t b) const { return bins + off[b+1];}


Counter off[totbins()];
index_type bins[capacity()];
};

#endif // HeterogeneousCore_CUDAUtilities_HistoContainer_h
52 changes: 52 additions & 0 deletions HeterogeneousCore/CUDAUtilities/interface/prefixScan.h
@@ -0,0 +1,52 @@
#ifndef HeterogeneousCore_CUDAUtilities_prefixScan_h
#define HeterogeneousCore_CUDAUtilities_prefixScan_h

#include <cstdint>
#include <cassert>

template<typename T>
__device__
void
__forceinline__
warpPrefixScan(T * c, uint32_t i) {
auto x = c[i];
auto laneId = threadIdx.x & 0x1f;
#pragma unroll
for( int offset = 1 ; offset < 32 ; offset <<= 1 ) {
auto y = __shfl_up_sync(0xffffffff,x, offset);
if(laneId >= offset) x += y;
}
c[i] = x;
}

// limited to 32*32 elements....
template<typename T>
__device__
void
__forceinline__
blockPrefixScan(T * c, uint32_t size, T* ws) {

Review comment:
CUB seems to provide something similarly named. Is there a reason for not using it (e.g. it does a different thing, the interface is awkward, etc.)?

VinInn (author):
The interface is a bit awkward; maybe I need to look at it again (now that I managed to use the host API).

assert(size<=1024);
assert(0==blockDim.x%32);

auto first = threadIdx.x;

for (auto i=first; i<size; i+=blockDim.x) {
warpPrefixScan(c,i);
auto laneId = threadIdx.x & 0x1f;
auto warpId = i/32;
assert(warpId<32);
if (31==laneId) ws[warpId]=c[i];
}
__syncthreads();
if (size<=32) return;
if (threadIdx.x<32) warpPrefixScan(ws,threadIdx.x);
__syncthreads();
for (auto i=first+32; i<size; i+=blockDim.x) {
auto warpId = i/32;
c[i]+=ws[warpId-1];
}
__syncthreads();
}


#endif
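
For reference, a minimal way to drive the scan above from a kernel, as a sketch under the stated constraints (single block, size <= 1024, blockDim.x a multiple of 32, and size ideally a multiple of the warp size given the full-warp shuffles; ws holds one partial sum per warp):

#include <cstdint>
#include "HeterogeneousCore/CUDAUtilities/interface/prefixScan.h"

__global__ void scanKernel(uint32_t *c, uint32_t size) {
  __shared__ uint32_t ws[32];    // inter-warp partial sums
  blockPrefixScan(c, size, ws);  // in-place inclusive scan of c[0..size)
}

// launch: scanKernel<<<1, 256>>>(d_data, n);

cub::BlockScan, raised in the review thread above, offers similar functionality; per the author, its interface was found a bit awkward for this use.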