diff --git a/HeterogeneousCore/CUDAUtilities/BuildFile.xml b/HeterogeneousCore/CUDAUtilities/BuildFile.xml index 38ee88b068999..fc6a61ffd433e 100644 --- a/HeterogeneousCore/CUDAUtilities/BuildFile.xml +++ b/HeterogeneousCore/CUDAUtilities/BuildFile.xml @@ -1 +1,2 @@ + diff --git a/HeterogeneousCore/CUDAUtilities/interface/GPUSimpleVector.h b/HeterogeneousCore/CUDAUtilities/interface/GPUSimpleVector.h index 74c8731da3712..47592fd2063d6 100644 --- a/HeterogeneousCore/CUDAUtilities/interface/GPUSimpleVector.h +++ b/HeterogeneousCore/CUDAUtilities/interface/GPUSimpleVector.h @@ -87,7 +87,7 @@ template struct SimpleVector { inline constexpr void reset() { m_size = 0; } inline constexpr int size() const { return m_size; } inline constexpr int capacity() const { return m_capacity; } - inline constexpr T *data() const { return m_data; } + inline constexpr T const * data() const { return m_data; } inline constexpr void resize(int size) { m_size = size; } inline constexpr void set_data(T * data) { m_data = data; } diff --git a/HeterogeneousCore/CUDAUtilities/interface/GPUVecArray.h b/HeterogeneousCore/CUDAUtilities/interface/GPUVecArray.h index 6083c0a55cd3b..8dcefdce65ab4 100644 --- a/HeterogeneousCore/CUDAUtilities/interface/GPUVecArray.h +++ b/HeterogeneousCore/CUDAUtilities/interface/GPUVecArray.h @@ -85,7 +85,7 @@ template struct VecArray { inline constexpr const T& operator[](int i) const { return m_data[i]; } inline constexpr void reset() { m_size = 0; } inline constexpr int capacity() const { return maxSize; } - inline constexpr T *data() const { return m_data; } + inline constexpr T const * data() const { return m_data; } inline constexpr void resize(int size) { m_size = size; } inline constexpr bool empty() const { return 0 == m_size; } inline constexpr bool full() const { return maxSize == m_size; } diff --git a/HeterogeneousCore/CUDAUtilities/interface/HistoContainer.h b/HeterogeneousCore/CUDAUtilities/interface/HistoContainer.h index 94acd0015edf6..e13e30594ab6f 100644 --- a/HeterogeneousCore/CUDAUtilities/interface/HistoContainer.h +++ b/HeterogeneousCore/CUDAUtilities/interface/HistoContainer.h @@ -2,6 +2,7 @@ #define HeterogeneousCore_CUDAUtilities_HistoContainer_h #include +#include #include #include #include @@ -10,96 +11,101 @@ #endif // __CUDA_ARCH__ #include "HeterogeneousCore/CUDAUtilities/interface/cudastdAlgorithm.h" +#ifdef __CUDACC__ +#include "HeterogeneousCore/CUDAUtilities/interface/prefixScan.h" +#include +#endif #include "HeterogeneousCore/CUDAUtilities/interface/cudaCheck.h" #ifdef __CUDACC__ namespace cudautils { - template - __global__ - void zeroMany(Histo * h, uint32_t nh) { - auto i = blockIdx.x * blockDim.x + threadIdx.x; - auto ih = i / Histo::nbins(); - auto k = i - ih * Histo::nbins(); - if (ih < nh) { - h[ih].nspills = 0; - if (k < Histo::nbins()) - h[ih].n[k] = 0; - } - } - template __global__ - void fillFromVector(Histo * h, uint32_t nh, T const * v, uint32_t * offsets) { + void countFromVector(Histo * __restrict__ h, uint32_t nh, T const * __restrict__ v, uint32_t const * __restrict__ offsets) { auto i = blockIdx.x * blockDim.x + threadIdx.x; if(i >= offsets[nh]) return; auto off = cuda_std::upper_bound(offsets, offsets + nh + 1, i); assert((*off) > 0); int32_t ih = off - offsets - 1; assert(ih >= 0); - assert(ih < nh); - h[ih].fill(v[i], i); + assert(ih < nh); + (*h).count(v[i], ih); } template __global__ - void fillFromVector(Histo * h, T const * v, uint32_t size) { + void fillFromVector(Histo * __restrict__ h, uint32_t nh, T const * __restrict__ v, 
uint32_t const * __restrict__ offsets, + uint32_t * __restrict__ ws ) { auto i = blockIdx.x * blockDim.x + threadIdx.x; - if(i < size) h->fill(v[i], i); - } - - template - void zero(Histo * h, uint32_t nh, int nthreads, cudaStream_t stream) { - auto nblocks = (nh * Histo::nbins() + nthreads - 1) / nthreads; - zeroMany<<>>(h, nh); - cudaCheck(cudaGetLastError()); + if(i >= offsets[nh]) return; + auto off = cuda_std::upper_bound(offsets, offsets + nh + 1, i); + assert((*off) > 0); + int32_t ih = off - offsets - 1; + assert(ih >= 0); + assert(ih < nh); + (*h).fill(v[i], i, ws, ih); } - template - void fillOneFromVector(Histo * h, T const * v, uint32_t size, int nthreads, cudaStream_t stream) { - zero(h, 1, nthreads, stream); - auto nblocks = (size + nthreads - 1) / nthreads; - fillFromVector<<>>(h, v, size); - cudaCheck(cudaGetLastError()); - } template - void fillManyFromVector(Histo * h, uint32_t nh, T const * v, uint32_t * offsets, uint32_t totSize, int nthreads, cudaStream_t stream) { - zero(h, nh, nthreads, stream); + void fillManyFromVector(Histo * __restrict__ h, typename Histo::Counter * __restrict__ ws, + uint32_t nh, T const * __restrict__ v, uint32_t const * __restrict__ offsets, uint32_t totSize, + int nthreads, cudaStream_t stream) { + uint32_t * off = (uint32_t *)( (char*)(h) +offsetof(Histo,off)); + cudaMemsetAsync(off,0, 4*Histo::totbins(),stream); auto nblocks = (totSize + nthreads - 1) / nthreads; - fillFromVector<<>>(h, nh, v, offsets); + countFromVector<<>>(h, nh, v, offsets); + cudaCheck(cudaGetLastError()); + size_t wss = Histo::totbins(); + CubDebugExit(cub::DeviceScan::InclusiveSum(ws, wss, off, off, Histo::totbins(), stream)); + cudaMemsetAsync(ws,0, 4*Histo::totbins(),stream); + fillFromVector<<>>(h, nh, v, offsets,ws); cudaCheck(cudaGetLastError()); } + } // namespace cudautils #endif // iteratate over N bins left and right of the one containing "v" -// including spillBin template __host__ __device__ +__forceinline__ void forEachInBins(Hist const & hist, V value, int n, Func func) { - int bs = hist.bin(value); - int be = std::min(int(hist.nbins()),bs+n+1); + int bs = Hist::bin(value); + int be = std::min(int(Hist::nbins()-1),bs+n); bs = std::max(0,bs-n); - assert(be>bs); - for (auto b=bs; b=bs); + for (auto pj=hist.begin(bs);pj +__host__ __device__ +__forceinline__ +void forEachInWindow(Hist const & hist, V wmin, V wmax, Func const & func) { + auto bs = Hist::bin(wmin); + auto be = Hist::bin(wmax); + assert(be>=bs); + for (auto pj=hist.begin(bs);pj class HistoContainer { public: @@ -111,24 +117,55 @@ class HistoContainer { using index_type = I; using UT = typename std::make_unsigned::type; + + static constexpr uint32_t ilog2(uint32_t v) { + + constexpr uint32_t b[] = {0x2, 0xC, 0xF0, 0xFF00, 0xFFFF0000}; + constexpr uint32_t s[] = {1, 2, 4, 8, 16}; + + uint32_t r = 0; // result of log2(v) will go here + for (auto i = 4; i >= 0; i--) if (v & b[i]) { + v >>= s[i]; + r |= s[i]; + } + return r; + } + + static constexpr uint32_t sizeT() { return S; } - static constexpr uint32_t nbins() { return 1 << N; } - static constexpr uint32_t shift() { return sizeT() - N; } - static constexpr uint32_t mask() { return nbins() - 1; } - static constexpr uint32_t binSize() { return 1 << M; } - static constexpr uint32_t spillSize() { return 16 * binSize(); } + static constexpr uint32_t nbins() { return NBINS;} + static constexpr uint32_t nhists() { return NHISTS;} + static constexpr uint32_t totbins() { return NHISTS*NBINS+1;} + static constexpr uint32_t nbits() { return 
ilog2(NBINS-1)+1;} + static constexpr uint32_t capacity() { return SIZE; } + + static constexpr auto histOff(uint32_t nh) { return NBINS*nh; } + +#ifdef __CUDACC__ + __host__ + static size_t wsSize() { + uint32_t * v =nullptr; + void * d_temp_storage = nullptr; + size_t temp_storage_bytes = 0; + cub::DeviceScan::InclusiveSum(d_temp_storage, temp_storage_bytes, v, v, totbins()-1); + return std::max(temp_storage_bytes,size_t(totbins())); + } +#endif + static constexpr UT bin(T t) { - return (t >> shift()) & mask(); + constexpr uint32_t shift = sizeT() - nbits(); + constexpr uint32_t mask = (1<> shift) & mask; } void zero() { - nspills = 0; - for (auto & i : n) + for (auto & i : off) i = 0; } - static constexpr + static __host__ __device__ + __forceinline__ uint32_t atomicIncrement(Counter & x) { #ifdef __CUDA_ARCH__ return atomicAdd(&x, 1); @@ -138,51 +175,72 @@ class HistoContainer { } __host__ __device__ - void fill(T t, index_type j) { - UT b = bin(t); + __forceinline__ + void count(T t) { + uint32_t b = bin(t); assert(b= spillSize(); - } - - constexpr bool full(uint32_t b) const { - return n[b] >= binSize(); + __host__ __device__ + __forceinline__ + void fill(T t, index_type j, Counter * ws) { + uint32_t b = bin(t); + assert(b +#include + +template +__device__ +void +__forceinline__ +warpPrefixScan(T * c, uint32_t i) { + auto x = c[i]; + auto laneId = threadIdx.x & 0x1f; + #pragma unroll + for( int offset = 1 ; offset < 32 ; offset <<= 1 ) { + auto y = __shfl_up_sync(0xffffffff,x, offset); + if(laneId >= offset) x += y; + } + c[i] = x; +} + +// limited to 32*32 elements.... +template +__device__ +void +__forceinline__ +blockPrefixScan(T * c, uint32_t size, T* ws) { + assert(size<=1024); + assert(0==blockDim.x%32); + + auto first = threadIdx.x; + + for (auto i=first; i -__device__ -void radixSortImpl(T const * a, uint16_t * ind, uint32_t size, RF reorder) { +__device__ +void +__forceinline__ +radixSortImpl(T const * __restrict__ a, uint16_t * ind, uint16_t * ind2, uint32_t size, RF reorder) { constexpr int d = 8, w = 8*sizeof(T); constexpr int sb = 1<0); - assert(size<=MaxSize); - assert(blockDim.x==sb); + assert(blockDim.x>=sb); // bool debug = false; // threadIdx.x==0 && blockIdx.x==5; @@ -104,9 +103,8 @@ void radixSortImpl(T const * a, uint16_t * ind, uint32_t size, RF reorder) { for (auto i=first; i= offset) x += y; + if (threadIdx.x= offset) x += y; + } + ct[threadIdx.x] = x; } - ct[threadIdx.x] = x; __syncthreads(); - auto ss = (threadIdx.x/32)*32 -1; - c[threadIdx.x] = ct[threadIdx.x]; - for(int i=ss; i>0; i-=32) c[threadIdx.x] +=ct[i]; - + if (threadIdx.x0; i-=32) c[threadIdx.x] +=ct[i]; + } /* //prefix scan for the nulls (for documentation) if (threadIdx.x==0) @@ -140,27 +141,33 @@ void radixSortImpl(T const * a, uint16_t * ind, uint32_t size, RF reorder) { // broadcast ibs =size-1; __syncthreads(); - while (ibs>0) { + while (__syncthreads_and(ibs>0)) { int i = ibs - threadIdx.x; - cu[threadIdx.x]=-1; - ct[threadIdx.x]=-1; + if (threadIdx.x=0) { - bin = (a[j[i]] >> d*p)&(sb-1); - ct[threadIdx.x]=bin; - atomicMax(&cu[bin],int(i)); + if (threadIdx.x=0) { + bin = (a[j[i]] >> d*p)&(sb-1); + ct[threadIdx.x]=bin; + atomicMax(&cu[bin],int(i)); + } } __syncthreads(); - if (i>=0 && i==cu[bin]) // ensure to keep them in order - for (int ii=threadIdx.x; ii=oi);if(i>=oi) - k[--c[bin]] = j[i-oi]; - } + if (threadIdx.x=0 && i==cu[bin]) // ensure to keep them in order + for (int ii=threadIdx.x; ii=oi);if(i>=oi) + k[--c[bin]] = j[i-oi]; + } + } __syncthreads(); if (bin>=0) assert(c[bin]>=0); - 
if (threadIdx.x==0) ibs-=blockDim.x; + if (threadIdx.x==0) ibs-=sb; __syncthreads(); } @@ -191,6 +198,8 @@ void radixSortImpl(T const * a, uint16_t * ind, uint32_t size, RF reorder) { if (j!=ind) // odd... for (auto i=first; i::value,T>::type* = nullptr > __device__ -void radixSort(T const * a, uint16_t * ind, uint32_t size) { - radixSortImpl(a,ind,size,dummyReorder); +void +__forceinline__ +radixSort(T const * a, uint16_t * ind, uint16_t * ind2, uint32_t size) { + radixSortImpl(a,ind,ind2,size,dummyReorder); } template< @@ -213,8 +224,10 @@ template< typename std::enable_if::value&&std::is_signed::value,T>::type* = nullptr > __device__ -void radixSort(T const * a, uint16_t * ind, uint32_t size) { - radixSortImpl(a,ind,size,reorderSigned); +void +__forceinline__ +radixSort(T const * a, uint16_t * ind, uint16_t * ind2, uint32_t size) { + radixSortImpl(a,ind,ind2,size,reorderSigned); } template< @@ -223,29 +236,47 @@ template< typename std::enable_if::value,T>::type* = nullptr > __device__ -void radixSort(T const * a, uint16_t * ind, uint32_t size) { +void +__forceinline__ + radixSort(T const * a, uint16_t * ind, uint16_t * ind2, uint32_t size) { using I = int; - radixSortImpl((I const *)(a),ind,size,reorderFloat); + radixSortImpl((I const *)(a),ind,ind2, size,reorderFloat); } template __device__ -void radixSortMulti(T * v, uint16_t * index, uint32_t * offsets) { +void +__forceinline__ +radixSortMulti(T const * v, uint16_t * index, uint32_t const * offsets, uint16_t * workspace) { + + extern __shared__ uint16_t ws[]; auto a = v+offsets[blockIdx.x]; - auto ind = index+offsets[blockIdx.x];; + auto ind = index+offsets[blockIdx.x]; + auto ind2 = nullptr==workspace ? ws : workspace+offsets[blockIdx.x]; auto size = offsets[blockIdx.x+1]-offsets[blockIdx.x]; assert(offsets[blockIdx.x+1]>=offsets[blockIdx.x]); - if (size>0) radixSort(a,ind,size); + if (size>0) radixSort(a,ind,ind2,size); } template __global__ -void radixSortMultiWrapper(T * v, uint16_t * index, uint32_t * offsets) { - radixSortMulti(v,index,offsets); +void +__launch_bounds__(256, 4) +radixSortMultiWrapper(T const * v, uint16_t * index, uint32_t const * offsets, uint16_t * workspace) { + radixSortMulti(v,index,offsets, workspace); } +template +__global__ +void +// __launch_bounds__(256, 4) +radixSortMultiWrapper2(T const * v, uint16_t * index, uint32_t const * offsets, uint16_t * workspace) { + radixSortMulti(v,index,offsets, workspace); +} + + #endif // HeterogeneousCoreCUDAUtilities_radixSort_H diff --git a/HeterogeneousCore/CUDAUtilities/test/BuildFile.xml b/HeterogeneousCore/CUDAUtilities/test/BuildFile.xml index ab97c243c4385..1049aed41fd5f 100644 --- a/HeterogeneousCore/CUDAUtilities/test/BuildFile.xml +++ b/HeterogeneousCore/CUDAUtilities/test/BuildFile.xml @@ -19,13 +19,27 @@ + + + + + + + + + + + + + + diff --git a/HeterogeneousCore/CUDAUtilities/test/HistoContainer_t.cpp b/HeterogeneousCore/CUDAUtilities/test/HistoContainer_t.cpp index 61c561f96267d..ca8167ab10894 100644 --- a/HeterogeneousCore/CUDAUtilities/test/HistoContainer_t.cpp +++ b/HeterogeneousCore/CUDAUtilities/test/HistoContainer_t.cpp @@ -6,47 +6,75 @@ #include #include -template +template void go() { std::mt19937 eng; - std::uniform_int_distribution rgen(std::numeric_limits::min(),std::numeric_limits::max()); + + int rmin=std::numeric_limits::min(); + int rmax=std::numeric_limits::max(); + if (NBINS!=128) { + rmin=0; + rmax=NBINS*2-1; + } + + + + std::uniform_int_distribution rgen(rmin,rmax); constexpr int N=12000; T v[N]; - using Hist = HistoContainer; - 
std::cout << "HistoContainer " << Hist::nbins() << ' ' << Hist::binSize() << std::endl; - + using Hist = HistoContainer; + using Hist4 = HistoContainer; + std::cout << "HistoContainer " << Hist::nbits() << ' ' << Hist::nbins() << ' ' << Hist::totbins() << ' ' << Hist::capacity() << ' ' << (rmax-rmin)/Hist::nbins() << std::endl; + std::cout << "bins " << int(Hist::bin(0)) << ' ' << int(Hist::bin(rmin)) << ' ' << int(Hist::bin(rmax)) << std::endl; + std::cout << "HistoContainer4 " << Hist4::nbits() << ' ' << Hist4::nbins() << ' ' << Hist4::totbins() << ' ' << Hist4::capacity() << ' ' << (rmax-rmin)/Hist::nbins() << std::endl; + for (auto nh=0; nh<4; ++nh) std::cout << "bins " << int(Hist4::bin(0))+Hist4::histOff(nh) << ' ' << int(Hist::bin(rmin))+Hist4::histOff(nh) << ' ' << int(Hist::bin(rmax))+Hist4::histOff(nh) << std::endl; + + Hist h; + Hist4 h4; + typename Hist::Counter ws[Hist::totbins()]; + typename Hist4::Counter ws4[Hist4::totbins()]; for (int it=0; it<5; ++it) { for (long long j = 0; j < N; j++) v[j]=rgen(eng); - h.zero(); - for (long long j = 0; j < N; j++) h.fill(v[j],j); - - std::cout << "nspills " << h.nspills << std::endl; + if (it==2) for (long long j = N/2; j < N/2+N/4; j++) v[j]=4; + h.zero();h4.zero(); + assert(h.size()==0);assert(h4.size()==0); + for (auto & i: ws) i=0; + for (auto & i: ws4) i=0; + for (long long j = 0; j < N; j++) { h.count(v[j]); if(j<2000) h4.count(v[j],2); else h4.count(v[j],j%4); } + h.finalize(); h4.finalize(); + assert(h.off[0]==0); + assert(h.size()==N); + assert(h4.off[0]==0); + assert(h4.size()==N); + for (long long j = 0; j < N; j++) { h.fill(v[j],j,ws); if(j<2000) h4.fill(v[j],j,ws4,2); else h4.fill(v[j],j,ws4,j%4); } + assert(h.size()==N); + assert(h4.size()==N); - auto verify = [&](uint32_t i, uint32_t k, uint32_t t1, uint32_t t2) { + auto verify = [&](uint32_t i, uint32_t j, uint32_t k, uint32_t t1, uint32_t t2) { assert(t1=i); } // std::cout << kl << ' ' << kh << std::endl; - for (auto j=h.begin(kl); j(); + go(); + go(); + return 0; } diff --git a/HeterogeneousCore/CUDAUtilities/test/HistoContainer_t.cu b/HeterogeneousCore/CUDAUtilities/test/HistoContainer_t.cu index 73e9f20ae589f..8d640b728e25c 100644 --- a/HeterogeneousCore/CUDAUtilities/test/HistoContainer_t.cu +++ b/HeterogeneousCore/CUDAUtilities/test/HistoContainer_t.cu @@ -34,12 +34,16 @@ void go() { constexpr uint32_t partSize = N/nParts; uint32_t offsets[nParts+1]; - using Hist = HistoContainer; - std::cout << "HistoContainer " << Hist::nbins() << ' ' << Hist::binSize() << ' ' << (std::numeric_limits::max()-std::numeric_limits::min())/Hist::nbins() << std::endl; + using Hist = HistoContainer; + std::cout << "HistoContainer " << (int)(offsetof(Hist,off)) << ' ' + << Hist::nbins() << ' ' << Hist::totbins() << ' ' << Hist::capacity() << ' ' << Hist::wsSize() << ' ' + << (std::numeric_limits::max()-std::numeric_limits::min())/Hist::nbins() << std::endl; - Hist h[nParts]; + Hist h; + + auto h_d = cuda::memory::device::make_unique(current_device, 1); + auto ws_d = cuda::memory::device::make_unique(current_device, Hist::totbins()); - auto h_d = cuda::memory::device::make_unique(current_device, nParts); auto off_d = cuda::memory::device::make_unique(current_device, nParts+1); @@ -62,18 +66,20 @@ void go() { for (long long j = 0; j < N; j++) v[j]=rgen(eng); - if (it==2) { // spill + if (it==2) { // big bin for (long long j = 1000; j < 2000; j++) v[j]= sizeof(T)==1 ? 
22 : 3456; } cuda::memory::copy(v_d.get(), v, N*sizeof(T)); - cudautils::fillManyFromVector(h_d.get(),nParts,v_d.get(),off_d.get(),offsets[10],256,0); - - cuda::memory::copy(&h, h_d.get(), nParts*sizeof(Hist)); + cudautils::fillManyFromVector(h_d.get(),ws_d.get(),nParts,v_d.get(),off_d.get(),offsets[10],256,0); - + cuda::memory::copy(&h, h_d.get(), sizeof(Hist)); + assert(0==h.off[0]); + assert(offsets[10]==h.size()); + + auto verify = [&](uint32_t i, uint32_t k, uint32_t t1, uint32_t t2) { assert(t1 window ) {} else {++tot;} } if (kk==i) { l=false; continue; } - if (l) for (auto p=h[j].begin(kk); p=nm)) { - std::cout << "too bad " << j << ' ' << i <<' ' << me << '/'<< T(me-window)<< '/'<< T(me+window) << ": " << kl << '/' << kh << ' '<< khh << ' '<< tot<<'/'<=nm)) { + std::cout << "too bad " << j << ' ' << i <<' ' << int(me) << '/'<< (int)T(me-window)<< '/'<< (int)T(me+window) << ": " << kl << '/' << kh << ' '<< khh << ' '<< tot<<'/'< +#include +#include +#include +#include + +#include + +template +__global__ +void mykernel(T const * __restrict__ v, uint32_t N) { + + assert(v); + assert(N==12000); + + if (threadIdx.x==0) printf("start kernel for %d data\n",N); + + using Hist = HistoContainer; + constexpr auto wss = Hist::totbins(); + + if (threadIdx.x==0) printf("ws size %d\n",wss); + + __shared__ Hist hist; + __shared__ typename Hist::Counter ws[wss]; + + for (auto j=threadIdx.x; j=k1); + } + + for (auto i=threadIdx.x; i=0 && k::max(); + vm = std::max(vm, 0); + vm = std::min(vm,vmax); + vp = std::min(vp,vmax); + vp = std::max(vp, 0); + assert(vp>=vm); + forEachInWindow(hist, vm,vp, ftest); + int bp = Hist::bin(vp); + int bm = Hist::bin(vm); + rtot = hist.end(bp)-hist.begin(bm); + assert(tot==rtot); + } + + +} + +template +void go() { + + if (cuda::device::count() == 0) { + std::cerr << "No CUDA devices on this system" << "\n"; + exit(EXIT_FAILURE); + } + + auto current_device = cuda::device::current::get(); + + + std::mt19937 eng; + + int rmin=std::numeric_limits::min(); + int rmax=std::numeric_limits::max(); + if (NBINS!=128) { + rmin=0; + rmax=NBINS*2-1; + } + + + + std::uniform_int_distribution rgen(rmin,rmax); + + + constexpr int N=12000; + T v[N]; + + auto v_d = cuda::memory::device::make_unique(current_device, N); + assert(v_d.get()); + + using Hist = HistoContainer; + std::cout << "HistoContainer " << Hist::nbits() << ' ' << Hist::nbins() << ' ' << Hist::capacity() << ' ' << (rmax-rmin)/Hist::nbins() << std::endl; + std::cout << "bins " << int(Hist::bin(0)) << ' ' << int(Hist::bin(rmin)) << ' ' << int(Hist::bin(rmax)) << std::endl; + + for (int it=0; it<5; ++it) { + for (long long j = 0; j < N; j++) v[j]=rgen(eng); + if (it==2) for (long long j = N/2; j < N/2+N/4; j++) v[j]=4; + + assert(v_d.get()); + assert(v); + cuda::memory::copy(v_d.get(), v, N*sizeof(T)); + assert(v_d.get()); + cuda::launch(mykernel,{1,256},v_d.get(),N); + } + +} + +int main() { + go(); + go(); + go(); + + + return 0; +} diff --git a/HeterogeneousCore/CUDAUtilities/test/prefixScan_t.cu b/HeterogeneousCore/CUDAUtilities/test/prefixScan_t.cu new file mode 100644 index 0000000000000..36c4c3b88abe6 --- /dev/null +++ b/HeterogeneousCore/CUDAUtilities/test/prefixScan_t.cu @@ -0,0 +1,86 @@ +#include "HeterogeneousCore/CUDAUtilities/interface/prefixScan.h" +#include + +template +__global__ +void testPrefixScan(uint32_t size) { + + __shared__ T ws[32]; + __shared__ T c[1024]; + auto first = threadIdx.x; + for (auto i=first; i + + +__global__ +void init(uint32_t * v, uint32_t val, uint32_t n) { + auto i = blockIdx.x * 
blockDim.x + threadIdx.x; + if(i<<<1,bs>>>(j); + testPrefixScan<<<1,bs>>>(j); + } + cudaDeviceSynchronize(); + + // Declare, allocate, and initialize device-accessible pointers for input and output + int num_items = 10000; + uint32_t *d_in; + uint32_t *d_out; + + + cudaMalloc(&d_in,num_items*sizeof(uint32_t)); + // cudaMalloc(&d_out,num_items*sizeof(uint32_t)); + + d_out = d_in; + + auto nthreads = 256; + auto nblocks = (num_items + nthreads - 1) / nthreads; + + init<<>>(d_in, 1, num_items); + + // Determine temporary device storage requirements for inclusive prefix sum + void *d_temp_storage = nullptr; + size_t temp_storage_bytes = 0; + cub::DeviceScan::InclusiveSum(d_temp_storage, temp_storage_bytes, d_in, d_out, num_items); + + std::cout << "temp storage " << temp_storage_bytes << std::endl; + + // Allocate temporary storage for inclusive prefix sum + // fake larger ws already available + temp_storage_bytes *=8; + cudaMalloc(&d_temp_storage, temp_storage_bytes); + std::cout << "temp storage " << temp_storage_bytes << std::endl; + // Run inclusive prefix sum + CubDebugExit(cub::DeviceScan::InclusiveSum(d_temp_storage, temp_storage_bytes, d_in, d_out, num_items)); + std::cout << "temp storage " << temp_storage_bytes << std::endl; + + verify<<>>(d_out, num_items); + + cudaDeviceSynchronize(); + + return 0; +} diff --git a/HeterogeneousCore/CUDAUtilities/test/radixSort_t.cu b/HeterogeneousCore/CUDAUtilities/test/radixSort_t.cu index bdc1af0123637..08441c8cc9555 100644 --- a/HeterogeneousCore/CUDAUtilities/test/radixSort_t.cu +++ b/HeterogeneousCore/CUDAUtilities/test/radixSort_t.cu @@ -7,7 +7,7 @@ #include #include #include - +#include #include #include @@ -31,7 +31,7 @@ struct RS { template -void go() { +void go(bool useShared) { std::mt19937 eng; // std::mt19937 eng2; @@ -89,6 +89,7 @@ void go() { auto v_d = cuda::memory::device::make_unique(current_device, N); auto ind_d = cuda::memory::device::make_unique(current_device, N); + auto ws_d = cuda::memory::device::make_unique(current_device, N); auto off_d = cuda::memory::device::make_unique(current_device, blocks+1); cuda::memory::copy(v_d.get(), v, N*sizeof(T)); @@ -96,12 +97,23 @@ void go() { if (i<2) std::cout << "lauch for " << offsets[blocks] << std::endl; + auto ntXBl = 1==i%4 ? 
256 : 256; + delta -= (std::chrono::high_resolution_clock::now()-start); + constexpr int MaxSize = 256*32; + if (useShared) cuda::launch( radixSortMultiWrapper, - { blocks, 256 }, - v_d.get(),ind_d.get(),off_d.get() + { blocks, ntXBl, MaxSize*2 }, + v_d.get(),ind_d.get(),off_d.get(),nullptr ); + else + cuda::launch( + radixSortMultiWrapper2, + { blocks, ntXBl }, + v_d.get(),ind_d.get(),off_d.get(),ws_d.get() + ); + if (i==0) std::cout << "done for " << offsets[blocks] << std::endl; @@ -117,10 +129,13 @@ void go() { std::cout << LL(v[ind[3]]) << ' ' << LL(v[ind[10]]) << ' ' << LL(v[ind[blockSize-1000]]) << std::endl; std::cout << LL(v[ind[blockSize/2-1]]) << ' ' << LL(v[ind[blockSize/2]]) << ' ' << LL(v[ind[blockSize/2+1]]) << std::endl; } - for (int ib=0; ib inds; + if (offsets[ib+1]> offsets[ib]) inds.insert(ind[offsets[ib]]); + for (auto j = offsets[ib]+1; j < offsets[ib+1]; j++) { + inds.insert(ind[j]); auto a = v+offsets[ib]; - auto k1=a[ind[i]]; auto k2=a[ind[i-1]]; + auto k1=a[ind[j]]; auto k2=a[ind[j-1]]; auto sh = sizeof(uint64_t)-NS; sh*=8; auto shorten = [sh](T& t) { auto k = (uint64_t *)(&t); @@ -128,8 +143,15 @@ void go() { }; shorten(k1);shorten(k2); if (k1(); - go(); - go(); - go(); - go(); - go(); - go(); - - go(); - go(); - go(); - // go(); + bool useShared=false; + + std::cout << "using Global memory" << std::endl; + + + go(useShared); + go(useShared); + go(useShared); + go(useShared); + go(useShared); + go(useShared); + go(useShared); + + go(useShared); + go(useShared); + go(useShared); + // go(v); + + useShared=true; + + std::cout << "using Shared memory" << std::endl; + + go(useShared); + go(useShared); + go(useShared); + go(useShared); + go(useShared); + go(useShared); + go(useShared); + + go(useShared); + go(useShared); + go(useShared); + // go(v); + + + return 0; } diff --git a/RecoLocalTracker/SiPixelClusterizer/plugins/SiPixelRawToClusterGPUKernel.cu b/RecoLocalTracker/SiPixelClusterizer/plugins/SiPixelRawToClusterGPUKernel.cu index f3242a11d7ae6..7bd6eac473cc7 100644 --- a/RecoLocalTracker/SiPixelClusterizer/plugins/SiPixelRawToClusterGPUKernel.cu +++ b/RecoLocalTracker/SiPixelClusterizer/plugins/SiPixelRawToClusterGPUKernel.cu @@ -34,6 +34,7 @@ #include "HeterogeneousCore/CUDAUtilities/interface/cudaCheck.h" #include "RecoLocalTracker/SiPixelClusterizer/plugins/gpuCalibPixel.h" #include "RecoLocalTracker/SiPixelClusterizer/plugins/gpuClustering.h" +#include "RecoLocalTracker/SiPixelClusterizer/plugins/gpuClusterChargeCut.h" #include "RecoLocalTracker/SiPixelClusterizer/interface/SiPixelFedCablingMapGPU.h" // local includes @@ -687,11 +688,11 @@ namespace pixelgpudetails { cudaCheck(cudaMemcpyAsync(adc_h, adc_d, wordCounter*sizeof(uint16_t), cudaMemcpyDefault, stream.id())); } - /* - std::cout +#ifdef GPU_DEBUG + std::cout << "CUDA countModules kernel launch with " << blocks << " blocks of " << threadsPerBlock << " threads\n"; - */ +#endif cudaCheck(cudaMemsetAsync(moduleStart_d, 0x00, sizeof(uint32_t), stream.id())); @@ -703,10 +704,10 @@ namespace pixelgpudetails { threadsPerBlock = 256; blocks = MaxNumModules; - /* +#ifdef GPU_DEBUG std::cout << "CUDA findClus kernel launch with " << blocks << " blocks of " << threadsPerBlock << " threads\n"; - */ +#endif cudaCheck(cudaMemsetAsync(clusInModule_d, 0, (MaxNumModules)*sizeof(uint32_t), stream.id())); findClus<<>>( moduleInd_d, @@ -717,6 +718,18 @@ namespace pixelgpudetails { wordCounter); cudaCheck(cudaGetLastError()); + // apply charge cut + clusterChargeCut<<>>( + moduleInd_d, + adc_d, + moduleStart_d, + 
clusInModule_d, moduleId_d, + clus_d, + wordCounter); + cudaCheck(cudaGetLastError()); + + + // count the module start indices already here (instead of // rechits) so that the number of clusters/hits can be made // available in the rechit producer without additional points of diff --git a/RecoLocalTracker/SiPixelClusterizer/plugins/SiPixelRawToClusterHeterogeneous.cc b/RecoLocalTracker/SiPixelClusterizer/plugins/SiPixelRawToClusterHeterogeneous.cc index 7f7e65d2874f1..384a4732b32e1 100644 --- a/RecoLocalTracker/SiPixelClusterizer/plugins/SiPixelRawToClusterHeterogeneous.cc +++ b/RecoLocalTracker/SiPixelClusterizer/plugins/SiPixelRawToClusterHeterogeneous.cc @@ -584,7 +584,7 @@ void SiPixelRawToClusterHeterogeneous::convertGPUtoCPU(edm::Event& ev, } int32_t nclus=-1; - std::vector aclusters(256); + std::vector aclusters(1024); auto totCluseFilled=0; auto fillClusters = [&](uint32_t detId){ @@ -611,6 +611,7 @@ void SiPixelRawToClusterHeterogeneous::convertGPUtoCPU(edm::Event& ev, for (uint32_t i = 0; i < gpu.nDigis; i++) { if (gpu.pdigi_h[i]==0) continue; + if (gpu.clus_h[i]>9000) continue; // not in cluster assert(gpu.rawIdArr_h[i] > 109999); if ( (*detDigis).detId() != gpu.rawIdArr_h[i]) { @@ -625,7 +626,7 @@ void SiPixelRawToClusterHeterogeneous::convertGPUtoCPU(edm::Event& ev, auto const & dig = (*detDigis).data.back(); // fill clusters assert(gpu.clus_h[i]>=0); - assert(gpu.clus_h[i]<256); + assert(gpu.clus_h[i]<1024); nclus = std::max(gpu.clus_h[i],nclus); auto row = dig.row(); auto col = dig.column(); diff --git a/RecoLocalTracker/SiPixelClusterizer/plugins/gpuClusterChargeCut.h b/RecoLocalTracker/SiPixelClusterizer/plugins/gpuClusterChargeCut.h new file mode 100644 index 0000000000000..0284a378ecd39 --- /dev/null +++ b/RecoLocalTracker/SiPixelClusterizer/plugins/gpuClusterChargeCut.h @@ -0,0 +1,97 @@ +#ifndef RecoLocalTracker_SiPixelClusterizer_plugins_gpuClusterChargeCut_h +#define RecoLocalTracker_SiPixelClusterizer_plugins_gpuClusterChargeCut_h + +#include +#include +#include + +#include "gpuClusteringConstants.h" + +#include "HeterogeneousCore/CUDAUtilities/interface/prefixScan.h" + +namespace gpuClustering { + + __global__ void clusterChargeCut( + uint16_t * __restrict__ id, // module id of each pixel (modified if bad cluster) + uint16_t const * __restrict__ adc, // charge of each pixel + uint32_t const * __restrict__ moduleStart, // index of the first pixel of each module + uint32_t * __restrict__ nClustersInModule, // modified: number of clusters found in each module + uint32_t const * __restrict__ moduleId, // module id of each module + int32_t * __restrict__ clusterId, // modified: cluster id of each pixel + int numElements) + { + + if (blockIdx.x >= moduleStart[0]) + return; + + auto firstPixel = moduleStart[1 + blockIdx.x]; + auto thisModuleId = id[firstPixel]; + assert(thisModuleId < MaxNumModules); + assert(thisModuleId==moduleId[blockIdx.x]); + + auto nclus = nClustersInModule[thisModuleId]; + if (nclus==0) return; + + assert(nclus<=MaxNumClustersPerModules); + +#ifdef GPU_DEBUG + if (thisModuleId % 100 == 1) + if (threadIdx.x == 0) + printf("start clusterizer for module %d in block %d\n", thisModuleId, blockIdx.x); +#endif + + auto first = firstPixel + threadIdx.x; + + __shared__ int32_t charge[MaxNumClustersPerModules]; + for (int i=threadIdx.x; ichargeCut ? 
1 : 0; + } + + __syncthreads(); + + // renumber + __shared__ uint16_t ws[32]; + blockPrefixScan(newclusId, nclus, ws); + + assert(nclus>=newclusId[nclus-1]); + + if(nclus==newclusId[nclus-1]) return; + + nClustersInModule[thisModuleId] = newclusId[nclus-1]; + __syncthreads(); + + // mark bad cluster again + for (int i=threadIdx.x; i #include #include #include "gpuClusteringConstants.h" +#include "Geometry/TrackerGeometryBuilder/interface/phase1PixelTopology.h" #include "HeterogeneousCore/CUDAUtilities/interface/HistoContainer.h" + namespace gpuClustering { __global__ void countModules(uint16_t const * __restrict__ id, @@ -32,7 +37,9 @@ namespace gpuClustering { } } - __global__ void findClus(uint16_t const * __restrict__ id, // module id of each pixel + __global__ +// __launch_bounds__(256,4) + void findClus(uint16_t const * __restrict__ id, // module id of each pixel uint16_t const * __restrict__ x, // local coordinates of each pixel uint16_t const * __restrict__ y, // uint32_t const * __restrict__ moduleStart, // index of the first pixel of each module @@ -63,8 +70,6 @@ namespace gpuClustering { __syncthreads(); // skip threads not associated to an existing pixel - bool active = (first < numElements); - if (active) { for (int i = first; i < numElements; i += blockDim.x) { if (id[i] == InvId) // skip invalid pixels continue; @@ -73,95 +78,132 @@ namespace gpuClustering { break; } } - } - - //init hist (ymax < 512) - __shared__ HistoContainer hist; - hist.nspills = 0; - for (auto k = threadIdx.x; k; + constexpr auto wss = Hist::totbins(); + __shared__ Hist hist; + __shared__ typename Hist::Counter ws[wss]; + for (auto j=threadIdx.x; j0 ? y[i]-1 : 0); - auto be = hist.bin(y[i]+1)+1; - auto loop = [&](int j) { - j+=firstPixel; - if (i>=j or j>jm or - std::abs(int(x[j]) - int(x[i])) > 1 or - std::abs(int(y[j]) - int(y[i])) > 1) return; - auto old = atomicMin(&clusterId[j], clusterId[i]); + auto loop = [&](uint16_t const * kk) { + auto m = (*kk)+firstPixel; +#ifdef GPU_DEBUG + assert(m!=i); +#endif + if (std::abs(int(x[m]) - int(x[i])) > 1) return; + // if (std::abs(int(y[m]) - int(y[i])) > 1) return; // binssize is 1 + auto old = atomicMin(&clusterId[m], clusterId[i]); if (old != clusterId[i]) { // end the loop only if no changes were applied - done = false; + more = true; } atomicMin(&clusterId[i], old); +#ifdef CLUS_LIMIT_LOOP // update the loop boundary for the next iteration - jmax[k] = std::max(j + 1,jmax[k]); + jmax[k] = std::max(kk + 1,jmax[k]); +#endif }; - for (auto b=bs; b MaxNumModules + } #endif // RecoLocalTracker_SiPixelClusterizer_plugins_gpuClusteringConstants_h diff --git a/RecoLocalTracker/SiPixelClusterizer/test/BuildFile.xml b/RecoLocalTracker/SiPixelClusterizer/test/BuildFile.xml index 3fc830883ca58..335591b583b58 100644 --- a/RecoLocalTracker/SiPixelClusterizer/test/BuildFile.xml +++ b/RecoLocalTracker/SiPixelClusterizer/test/BuildFile.xml @@ -34,6 +34,15 @@ + + + + + + + + + diff --git a/RecoLocalTracker/SiPixelClusterizer/test/gpuClustering.cu b/RecoLocalTracker/SiPixelClusterizer/test/gpuClustering.cu index 2264be0de02af..a01c1230586fe 100644 --- a/RecoLocalTracker/SiPixelClusterizer/test/gpuClustering.cu +++ b/RecoLocalTracker/SiPixelClusterizer/test/gpuClustering.cu @@ -12,6 +12,7 @@ #include #include "RecoLocalTracker/SiPixelClusterizer/plugins/gpuClustering.h" +#include "RecoLocalTracker/SiPixelClusterizer/plugins/gpuClusterChargeCut.h" int main(void) { @@ -30,7 +31,6 @@ int main(void) auto h_clus = std::make_unique(numElements); - auto h_debug = 
std::make_unique(numElements); auto current_device = cuda::device::current::get(); auto d_id = cuda::memory::device::make_unique(current_device, numElements); auto d_x = cuda::memory::device::make_unique(current_device, numElements); @@ -44,12 +44,29 @@ int main(void) auto d_clusInModule = cuda::memory::device::make_unique(current_device, MaxNumModules); auto d_moduleId = cuda::memory::device::make_unique(current_device, MaxNumModules); - auto d_debug = cuda::memory::device::make_unique(current_device, numElements); // later random number int n=0; int ncl=0; int y[10]={5,7,9,1,3,0,4,8,2,6}; + auto generateClusters = [&](int kn) { + auto addBigNoise = 1==kn%2; + if (addBigNoise) { + constexpr int MaxPixels = 1000; + int id = 666; + for (int x=0; x<140; x+=3) { + for (int yy=0; yy<400; yy+=3) { + h_id[n]=id; + h_x[n]=x; + h_y[n]=yy; + h_adc[n]=1000; + ++n; ++ncl; + if (MaxPixels<=ncl) break; + } + if (MaxPixels<=ncl) break; + } + } + { // isolated int id = 42; @@ -58,15 +75,56 @@ int main(void) h_id[n]=id; h_x[n]=x; h_y[n]=x; - h_adc[n]=100; + h_adc[n]= kn==0 ? 100 : 5000; + ++n; + + // first column + ++ncl; + h_id[n]=id; + h_x[n]=x; + h_y[n]=0; + h_adc[n]= 5000; + ++n; + // first columns + ++ncl; + h_id[n]=id; + h_x[n]=x+80; + h_y[n]=2; + h_adc[n]= 5000; + ++n; + h_id[n]=id; + h_x[n]=x+80; + h_y[n]=1; + h_adc[n]= 5000; + ++n; + + // last column + ++ncl; + h_id[n]=id; + h_x[n]=x; + h_y[n]=415; + h_adc[n]= 5000; + ++n; + // last columns + ++ncl; + h_id[n]=id; + h_x[n]=x+80; + h_y[n]=415; + h_adc[n]= 2500; + ++n; + h_id[n]=id; + h_x[n]=x+80; + h_y[n]=414; + h_adc[n]= 2500; ++n; + // diagonal ++ncl; for (int x=20; x<25; ++x) { h_id[n]=id; h_x[n]=x; h_y[n]=x; - h_adc[n]=100; + h_adc[n]=1000; ++n; } ++ncl; @@ -75,7 +133,7 @@ int main(void) h_id[n]=id; h_x[n]=x; h_y[n]=x; - h_adc[n]=100; + h_adc[n]=1000; ++n; } ++ncl; @@ -86,7 +144,7 @@ int main(void) h_id[n]=id; h_x[n]=xx[k]; h_y[n]=20+xx[k]; - h_adc[n]=100; + h_adc[n]=1000; ++n; } // holes @@ -95,13 +153,13 @@ int main(void) h_id[n]=id; h_x[n]=xx[k]; h_y[n]=100; - h_adc[n]=100; + h_adc[n]= kn==2 ? 100 : 1000; ++n; if (xx[k]%2==0) { h_id[n]=id; h_x[n]=xx[k]; h_y[n]=101; - h_adc[n]=100; + h_adc[n]=1000; ++n; } } @@ -114,7 +172,7 @@ int main(void) h_id[n]=id; h_x[n]=x; h_y[n]=x; - h_adc[n]=100; + h_adc[n]=5000; ++n; } // all odd id @@ -132,7 +190,7 @@ int main(void) h_id[n]=id; h_x[n]=x+1; h_y[n]=x+y[k]+2; - h_adc[n]=100; + h_adc[n]=1000; ++n; } } else { @@ -140,24 +198,31 @@ int main(void) h_id[n]=id; h_x[n]=x; h_y[n]=x+y[9-k]; - h_adc[n]=100; + h_adc[n]= kn==2 ? 10 : 1000; ++n; if (y[k]==3) continue; // hole if (id==51) {h_id[n++]=InvId; h_id[n++]=InvId; }// error h_id[n]=id; h_x[n]=x+1; h_y[n]=x+y[k]+2; - h_adc[n]=100; + h_adc[n]= kn==2 ? 
10 : 1000; ++n; } } } } + }; // end lambda + for (auto kkk=0; kkk<5; ++kkk) { + n=0; ncl=0; + generateClusters(kkk); + std::cout << "created " << n << " digis in " << ncl << " clusters" << std::endl; assert(n<=numElements); + + size_t size32 = n * sizeof(unsigned int); size_t size16 = n * sizeof(unsigned short); - size_t size8 = n * sizeof(uint8_t); + // size_t size8 = n * sizeof(uint8_t); uint32_t nModules=0; cuda::memory::copy(d_moduleStart.get(),&nModules,sizeof(uint32_t)); @@ -165,10 +230,9 @@ int main(void) cuda::memory::copy(d_id.get(), h_id.get(), size16); cuda::memory::copy(d_x.get(), h_x.get(), size16); cuda::memory::copy(d_y.get(), h_y.get(), size16); - cuda::memory::copy(d_adc.get(), h_adc.get(), size8); - cuda::memory::device::zero(d_debug.get(),size32); + cuda::memory::copy(d_adc.get(), h_adc.get(), size16); // Launch CUDA Kernels - int threadsPerBlock = 256; + int threadsPerBlock = (kkk==5) ? 512 : ((kkk==3) ? 64 : 256); int blocksPerGrid = (numElements + threadsPerBlock - 1) / threadsPerBlock; std::cout << "CUDA countModules kernel launch with " << blocksPerGrid @@ -180,7 +244,6 @@ int main(void) d_id.get(), d_moduleStart.get() ,d_clus.get(),n ); - threadsPerBlock = 256; blocksPerGrid = MaxNumModules; //nModules; std::cout @@ -198,32 +261,77 @@ int main(void) d_clus.get(), n ); + cudaDeviceSynchronize(); + + cuda::memory::copy(&nModules,d_moduleStart.get(),sizeof(uint32_t)); + + uint32_t nclus[MaxNumModules], moduleId[nModules]; + + cuda::memory::copy(&nclus,d_clusInModule.get(),MaxNumModules*sizeof(uint32_t)); + std::cout << "before charge cut found " << std::accumulate(nclus,nclus+MaxNumModules,0) << " clusters" << std::endl; + for (auto i=MaxNumModules; i>0; i--) if (nclus[i-1]>0) {std::cout << "last module is " << i-1 << ' ' << nclus[i-1] << std::endl; break;} + if (ncl!=std::accumulate(nclus,nclus+MaxNumModules,0)) std::cout << "ERROR!!!!! 
wrong number of cluster found" << std::endl; + + cuda::launch( + clusterChargeCut, + { blocksPerGrid, threadsPerBlock }, + d_id.get(), d_adc.get(), + d_moduleStart.get(), + d_clusInModule.get(), d_moduleId.get(), + d_clus.get(), + n + ); + + + cudaDeviceSynchronize(); - cuda::memory::copy(&nModules,d_moduleStart.get(),sizeof(uint32_t)); std::cout << "found " << nModules << " Modules active" << std::endl; - uint32_t nclus[MaxNumModules], moduleId[nModules]; + cuda::memory::copy(h_id.get(), d_id.get(), size16); cuda::memory::copy(h_clus.get(), d_clus.get(), size32); cuda::memory::copy(&nclus,d_clusInModule.get(),MaxNumModules*sizeof(uint32_t)); cuda::memory::copy(&moduleId,d_moduleId.get(),nModules*sizeof(uint32_t)); - cuda::memory::copy(h_debug.get(), d_debug.get(), size32); - auto p = std::minmax_element(h_debug.get(),h_debug.get()+n); - std::cout << "debug " << *p.first << ' ' << *p.second << std::endl; std::set clids; - std::vector seeds; for (int i=0; i=0); assert(h_clus[i]0; i--) if (nclus[i-1]>0) {std::cout << "last module is " << i-1 << ' ' << nclus[i-1] << std::endl; break;} // << " and " << seeds.size() << " seeds" << std::endl; - + } /// end loop kkk return 0; } diff --git a/RecoLocalTracker/SiPixelRecHits/BuildFile.xml b/RecoLocalTracker/SiPixelRecHits/BuildFile.xml index c52545a601341..7918c7a4f4d9a 100644 --- a/RecoLocalTracker/SiPixelRecHits/BuildFile.xml +++ b/RecoLocalTracker/SiPixelRecHits/BuildFile.xml @@ -14,7 +14,8 @@ - + + diff --git a/RecoLocalTracker/SiPixelRecHits/plugins/PixelRecHits.cu b/RecoLocalTracker/SiPixelRecHits/plugins/PixelRecHits.cu index e2165471c3386..c63466f157a1b 100644 --- a/RecoLocalTracker/SiPixelRecHits/plugins/PixelRecHits.cu +++ b/RecoLocalTracker/SiPixelRecHits/plugins/PixelRecHits.cu @@ -8,7 +8,7 @@ // CMSSW headers #include "RecoLocalTracker/SiPixelClusterizer/plugins/SiPixelRawToClusterGPUKernel.h" #include "HeterogeneousCore/CUDAUtilities/interface/cudaCheck.h" -#include "RecoLocalTracker/SiPixelClusterizer/plugins/gpuClustering.h" +#include "RecoLocalTracker/SiPixelClusterizer/plugins/gpuClusteringConstants.h" #include "PixelRecHits.h" #include "gpuPixelRecHits.h" @@ -34,7 +34,7 @@ namespace { namespace pixelgpudetails { PixelRecHitGPUKernel::PixelRecHitGPUKernel(cuda::stream_t<>& cudaStream) { - constexpr auto MAX_HITS = gpuClustering::MaxNumModules * 256; + constexpr auto MAX_HITS = siPixelRecHitsHeterogeneousProduct::maxHits(); cudaCheck(cudaMalloc((void **) & gpu_.bs_d, 3 * sizeof(float))); cudaCheck(cudaMalloc((void **) & gpu_.hitsLayerStart_d, 11 * sizeof(uint32_t))); @@ -70,7 +70,8 @@ namespace pixelgpudetails { gpu_.iphi_d = slicePitch(gpu_.owner_16bit_, gpu_.owner_16bit_pitch_, 3); gpu_.sortIndex_d = slicePitch(gpu_.owner_16bit_, gpu_.owner_16bit_pitch_, 4); - cudaCheck(cudaMalloc((void **) & gpu_.hist_d, 10 * sizeof(HitsOnGPU::Hist))); + cudaCheck(cudaMalloc((void **) & gpu_.hist_d, sizeof(HitsOnGPU::Hist))); + cudaCheck(cudaMalloc((void **) & gpu_.hws_d, 4*HitsOnGPU::Hist::totbins())); cudaCheck(cudaMalloc((void **) & gpu_d, sizeof(HitsOnGPU))); gpu_.me_d = gpu_d; cudaCheck(cudaMemcpyAsync(gpu_d, &gpu_, sizeof(HitsOnGPU), cudaMemcpyDefault, cudaStream.id())); @@ -109,6 +110,7 @@ namespace pixelgpudetails { cudaCheck(cudaFree(gpu_.owner_32bit_)); cudaCheck(cudaFree(gpu_.owner_16bit_)); cudaCheck(cudaFree(gpu_.hist_d)); + cudaCheck(cudaFree(gpu_.hws_d)); cudaCheck(cudaFree(gpu_d)); cudaCheck(cudaFree(d_phase1TopologyLayerStart_)); @@ -132,6 +134,10 @@ namespace pixelgpudetails { int threadsPerBlock = 256; int blocks = 
input.nModules; // active modules (with digis) + +#ifdef GPU_DEBUG + std::cout << "launching getHits kernel for " << blocks << " blocks" << std::endl; +#endif gpuPixelRecHits::getHits<<>>( cpeParams, gpu_.bs_d, @@ -188,6 +194,6 @@ namespace pixelgpudetails { // radixSortMultiWrapper<<<10, 256, 0, c.stream>>>(gpu_.iphi_d, gpu_.sortIndex_d, gpu_.hitsLayerStart_d); } - cudautils::fillManyFromVector(gpu_.hist_d, 10, gpu_.iphi_d, gpu_.hitsLayerStart_d, nhits_, 256, stream.id()); + cudautils::fillManyFromVector(gpu_.hist_d, gpu_.hws_d, 10, gpu_.iphi_d, gpu_.hitsLayerStart_d, nhits_, 256, stream.id()); } } diff --git a/RecoLocalTracker/SiPixelRecHits/plugins/SiPixelRecHitHeterogeneous.cc b/RecoLocalTracker/SiPixelRecHits/plugins/SiPixelRecHitHeterogeneous.cc index 6065fa3cb274a..f0c3b42b30eba 100644 --- a/RecoLocalTracker/SiPixelRecHits/plugins/SiPixelRecHitHeterogeneous.cc +++ b/RecoLocalTracker/SiPixelRecHits/plugins/SiPixelRecHitHeterogeneous.cc @@ -218,6 +218,7 @@ void SiPixelRecHitHeterogeneous::run(const edm::HandledetId(); @@ -234,7 +235,7 @@ void SiPixelRecHitHeterogeneous::run(const edm::Handle=96 && hoc.charge[fc+i]<4000) ) continue; + if( hoc.charge[fc+i]<2000 || (gind>=96 && hoc.charge[fc+i]<4000) ) { ++numberOfLostClusters; continue;} ind[ngh]=i;std::push_heap(ind, ind+ngh+1,[&](auto a, auto b) { return mrp[a] MaxNumModules - - - constexpr uint32_t MaxClusInModule = pixelCPEforGPU::MaxClusInModule; - - using ClusParams = pixelCPEforGPU::ClusParams; - - __global__ void getHits(pixelCPEforGPU::ParamsOnGPU const * __restrict__ cpeParams, float const * __restrict__ bs, uint16_t const * __restrict__ id, @@ -42,21 +33,44 @@ namespace gpuPixelRecHits { float * xe, float * ye, uint16_t * mr, uint16_t * mc) { + + // to be moved in common namespace... + constexpr uint16_t InvId=9999; // must be > MaxNumModules + constexpr uint32_t MaxClusInModule = pixelCPEforGPU::MaxClusInModule; + + using ClusParams = pixelCPEforGPU::ClusParams; + + // as usual one block per module __shared__ ClusParams clusParams; auto first = digiModuleStart[1 + blockIdx.x]; - auto me = id[first]; - assert(moduleId[blockIdx.x] == me); + auto me = moduleId[blockIdx.x]; auto nclus = clusInModule[me]; + if (0==nclus) return; + +#ifdef GPU_DEBUG + if (threadIdx.x==0) { + auto k=first; + while (id[k]==InvId) ++k; + assert(id[k]==me); + } +#endif + #ifdef GPU_DEBUG if (me%100==1) if (threadIdx.x==0) printf("hitbuilder: %d clusters in module %d. will write at %d\n", nclus, me, hitsModuleStart[me]); #endif assert(blockDim.x >= MaxClusInModule); - assert(nclus <= MaxClusInModule); + + if (threadIdx.x==0 && nclus > MaxClusInModule) { + printf("WARNING: too many clusters %d in Module %d. 
Only first %d processed\n", nclus,me,MaxClusInModule); + // zero charge: do not bother to do it in parallel + for (auto d=MaxClusInModule; d= nclus) continue; atomicMin(&clusParams.minRow[clus[i]], x[i]); atomicMax(&clusParams.maxRow[clus[i]], x[i]); atomicMin(&clusParams.minCol[clus[i]], y[i]); @@ -93,6 +107,7 @@ namespace gpuPixelRecHits { for (int i = first; i < numElements; i += blockDim.x) { if (id[i] == InvId) continue; // not valid if (id[i] != me) break; // end of module + if (clus[i] >= nclus) continue; atomicAdd(&clusParams.charge[clus[i]], adc[i]); if (clusParams.minRow[clus[i]]==x[i]) atomicAdd(&clusParams.Q_f_X[clus[i]], adc[i]); if (clusParams.maxRow[clus[i]]==x[i]) atomicAdd(&clusParams.Q_l_X[clus[i]], adc[i]); diff --git a/RecoLocalTracker/SiPixelRecHits/plugins/siPixelRecHitsHeterogeneousProduct.h b/RecoLocalTracker/SiPixelRecHits/plugins/siPixelRecHitsHeterogeneousProduct.h index 778b763b28cff..9d0fe7a279799 100644 --- a/RecoLocalTracker/SiPixelRecHits/plugins/siPixelRecHitsHeterogeneousProduct.h +++ b/RecoLocalTracker/SiPixelRecHits/plugins/siPixelRecHitsHeterogeneousProduct.h @@ -17,6 +17,9 @@ namespace siPixelRecHitsHeterogeneousProduct { using CPUProduct = int; // dummy + static constexpr uint32_t maxHits() { return 65536;} + using hindex_type = uint16_t; // if above is <=2^16 + struct HitsOnGPU{ pixelCPEforGPU::ParamsOnGPU const * cpeParams = nullptr; // forwarded from setup, NOT owned float * bs_d; @@ -32,8 +35,9 @@ namespace siPixelRecHitsHeterogeneousProduct { uint16_t * mr_d; uint16_t * mc_d; - using Hist = HistoContainer; + using Hist = HistoContainer; Hist * hist_d; + typename Hist::Counter * hws_d; HitsOnGPU const * me_d = nullptr; diff --git a/RecoPixelVertexing/PixelTriplets/plugins/CAHitQuadrupletGeneratorGPU.cu b/RecoPixelVertexing/PixelTriplets/plugins/CAHitQuadrupletGeneratorGPU.cu index 101095c5a9e4d..66d6d56286008 100644 --- a/RecoPixelVertexing/PixelTriplets/plugins/CAHitQuadrupletGeneratorGPU.cu +++ b/RecoPixelVertexing/PixelTriplets/plugins/CAHitQuadrupletGeneratorGPU.cu @@ -164,8 +164,8 @@ KernelLineFitAllHits(GPU::SimpleVector * foundNtuplets, __global__ void kernel_checkOverflows(GPU::SimpleVector *foundNtuplets, - GPUCACell *cells, uint32_t const * nCells, - GPU::VecArray< unsigned int, 256> *isOuterHitOfCell, + GPUCACell const * __restrict__ cells, uint32_t const * __restrict__ nCells, + GPU::VecArray< unsigned int, 256> const * __restrict__ isOuterHitOfCell, uint32_t nHits, uint32_t maxNumberOfDoublets) { auto idx = threadIdx.x + blockIdx.x * blockDim.x; @@ -185,31 +185,36 @@ kernel_checkOverflows(GPU::SimpleVector *foundNtuplets, } -__global__ void +__global__ +void kernel_connect(GPU::SimpleVector *foundNtuplets, - GPUCACell *cells, uint32_t const * nCells, - GPU::VecArray< unsigned int, 256> *isOuterHitOfCell, + GPUCACell::Hits const * __restrict__ hhp, + GPUCACell * cells, uint32_t const * __restrict__ nCells, + GPU::VecArray< unsigned int, 256> const * __restrict__ isOuterHitOfCell, float ptmin, float region_origin_radius, const float thetaCut, const float phiCut, const float hardPtCut, unsigned int maxNumberOfDoublets_, unsigned int maxNumberOfHits_) { - float region_origin_x = 0.; - float region_origin_y = 0.; + auto const & hh = *hhp; + + constexpr float region_origin_x = 0.; + constexpr float region_origin_y = 0.; auto cellIndex = threadIdx.x + blockIdx.x * blockDim.x; if (0==cellIndex) foundNtuplets->reset(); // ready for next kernel if (cellIndex >= (*nCells) ) return; - auto &thisCell = cells[cellIndex]; + auto const & thisCell = 
cells[cellIndex]; auto innerHitId = thisCell.get_inner_hit_id(); auto numberOfPossibleNeighbors = isOuterHitOfCell[innerHitId].size(); + auto vi = isOuterHitOfCell[innerHitId].data(); for (auto j = 0; j < numberOfPossibleNeighbors; ++j) { - auto otherCell = isOuterHitOfCell[innerHitId][j]; + auto otherCell = __ldg(vi+j); - if (thisCell.check_alignment_and_tag( - cells, otherCell, ptmin, region_origin_x, region_origin_y, + if (thisCell.check_alignment(hh, + cells[otherCell], ptmin, region_origin_x, region_origin_y, region_origin_radius, thetaCut, phiCut, hardPtCut) ) { cells[otherCell].theOuterNeighbors.push_back(cellIndex); @@ -218,7 +223,7 @@ kernel_connect(GPU::SimpleVector *foundNtuplets, } __global__ void kernel_find_ntuplets( - GPUCACell *cells, uint32_t const * nCells, + GPUCACell * const __restrict__ cells, uint32_t const * nCells, GPU::SimpleVector *foundNtuplets, unsigned int minHitsPerNtuplet, unsigned int maxNumberOfDoublets_) @@ -228,7 +233,7 @@ __global__ void kernel_find_ntuplets( if (cellIndex >= (*nCells) ) return; auto &thisCell = cells[cellIndex]; if (thisCell.theLayerPairId!=0 && thisCell.theLayerPairId!=3 && thisCell.theLayerPairId!=8) return; // inner layer is 0 FIXME - GPU::VecArray stack; + GPU::VecArray stack; stack.reset(); thisCell.find_ntuplets(cells, foundNtuplets, stack, minHitsPerNtuplet); assert(stack.size()==0); @@ -336,9 +341,11 @@ void CAHitQuadrupletGeneratorGPU::launchKernels(const TrackingRegion ®ion, auto nhits = hh.nHits; assert(nhits <= PixelGPUConstants::maxNumberOfHits); - auto numberOfBlocks = (maxNumberOfDoublets_ + 512 - 1)/512; - kernel_connect<<>>( + auto blockSize = 64; + auto numberOfBlocks = (maxNumberOfDoublets_ + blockSize - 1)/blockSize; + kernel_connect<<>>( d_foundNtupletsVec_[regionIndex], // needed only to be reset, ready for next kernel + hh.gpu_d, device_theCells_, device_nCells_, device_isOuterHitOfCell_, region.ptMin(), @@ -347,21 +354,20 @@ void CAHitQuadrupletGeneratorGPU::launchKernels(const TrackingRegion ®ion, ); cudaCheck(cudaGetLastError()); - kernel_find_ntuplets<<>>( + kernel_find_ntuplets<<>>( device_theCells_, device_nCells_, d_foundNtupletsVec_[regionIndex], 4, maxNumberOfDoublets_); cudaCheck(cudaGetLastError()); - - numberOfBlocks = (std::max(int(nhits), maxNumberOfDoublets_) + 512 - 1)/512; - kernel_checkOverflows<<>>( + numberOfBlocks = (std::max(int(nhits), maxNumberOfDoublets_) + blockSize - 1)/blockSize; + kernel_checkOverflows<<>>( d_foundNtupletsVec_[regionIndex], device_theCells_, device_nCells_, device_isOuterHitOfCell_, nhits, maxNumberOfDoublets_ ); - + cudaCheck(cudaGetLastError()); // kernel_print_found_ntuplets<<<1, 1, 0, cudaStream>>>(d_foundNtupletsVec_[regionIndex], 10); @@ -383,7 +389,7 @@ void CAHitQuadrupletGeneratorGPU::launchKernels(const TrackingRegion ®ion, line_fit_resultsGPU_); cudaCheck(cudaGetLastError()); - if(transferToCPU) { + if (transferToCPU) { cudaCheck(cudaMemcpyAsync(h_foundNtupletsVec_[regionIndex], d_foundNtupletsVec_[regionIndex], sizeof(GPU::SimpleVector), cudaMemcpyDeviceToHost, cudaStream)); diff --git a/RecoPixelVertexing/PixelTriplets/plugins/CAHitQuadrupletGeneratorGPU.h b/RecoPixelVertexing/PixelTriplets/plugins/CAHitQuadrupletGeneratorGPU.h index 589ae994132ad..fdceec9225d76 100644 --- a/RecoPixelVertexing/PixelTriplets/plugins/CAHitQuadrupletGeneratorGPU.h +++ b/RecoPixelVertexing/PixelTriplets/plugins/CAHitQuadrupletGeneratorGPU.h @@ -23,6 +23,8 @@ #include "RecoPixelVertexing/PixelTriplets/plugins/RecHitsMap.h" #include 
"RecoPixelVertexing/PixelTrackFitting/interface/RiemannFit.h" + +// FIXME (split header???) #include "GPUCACell.h" class TrackingRegion; @@ -38,6 +40,7 @@ class CAHitQuadrupletGeneratorGPU { using HitsOnGPU = siPixelRecHitsHeterogeneousProduct::HitsOnGPU; using HitsOnCPU = siPixelRecHitsHeterogeneousProduct::HitsOnCPU; + using hindex_type = siPixelRecHitsHeterogeneousProduct::hindex_type; static constexpr unsigned int minLayers = 4; typedef OrderedHitSeeds ResultType; diff --git a/RecoPixelVertexing/PixelTriplets/plugins/GPUCACell.h b/RecoPixelVertexing/PixelTriplets/plugins/GPUCACell.h index 5995d286fc38d..772b802282d31 100644 --- a/RecoPixelVertexing/PixelTriplets/plugins/GPUCACell.h +++ b/RecoPixelVertexing/PixelTriplets/plugins/GPUCACell.h @@ -6,49 +6,49 @@ #include -#include "GPUHitsAndDoublets.h" #include "RecoLocalTracker/SiPixelRecHits/plugins/siPixelRecHitsHeterogeneousProduct.h" #include "HeterogeneousCore/CUDAUtilities/interface/GPUVecArray.h" #include "HeterogeneousCore/CUDAUtilities/interface/GPUSimpleVector.h" struct Quadruplet { - int hitId[4]; + using hindex_type = siPixelRecHitsHeterogeneousProduct::hindex_type; + hindex_type hitId[4]; }; + class GPUCACell { public: + + using Hits = siPixelRecHitsHeterogeneousProduct::HitsOnGPU; + using hindex_type = siPixelRecHitsHeterogeneousProduct::hindex_type; + GPUCACell() = default; +#ifdef __CUDACC__ - __host__ __device__ - void init(siPixelRecHitsHeterogeneousProduct::HitsOnGPU const & hh, - int layerPairId, int doubletId, int innerHitId,int outerHitId) + __device__ __forceinline__ + void init(Hits const & hh, + int layerPairId, int doubletId, + hindex_type innerHitId, hindex_type outerHitId) { theInnerHitId = innerHitId; theOuterHitId = outerHitId; theDoubletId = doubletId; theLayerPairId = layerPairId; - theInnerX = hh.xg_d[innerHitId]; - theOuterX = hh.xg_d[outerHitId]; - - theInnerY = hh.yg_d[innerHitId]; - theOuterY = hh.yg_d[outerHitId]; - - theInnerZ = hh.zg_d[innerHitId]; - theOuterZ = hh.zg_d[outerHitId]; - theInnerR = hh.rg_d[innerHitId]; - theOuterR = hh.rg_d[outerHitId]; + theInnerZ = __ldg(hh.zg_d+innerHitId); + theInnerR = __ldg(hh.rg_d+innerHitId); theOuterNeighbors.reset(); } - constexpr float get_inner_x() const { return theInnerX; } - constexpr float get_outer_x() const { return theOuterX; } - constexpr float get_inner_y() const { return theInnerY; } - constexpr float get_outer_y() const { return theOuterY; } - constexpr float get_inner_z() const { return theInnerZ; } - constexpr float get_outer_z() const { return theOuterZ; } - constexpr float get_inner_r() const { return theInnerR; } - constexpr float get_outer_r() const { return theOuterR; } + __device__ __forceinline__ float get_inner_x(Hits const & hh) const { return __ldg(hh.xg_d+theInnerHitId); } + __device__ __forceinline__ float get_outer_x(Hits const & hh) const { return __ldg(hh.xg_d+theOuterHitId); } + __device__ __forceinline__ float get_inner_y(Hits const & hh) const { return __ldg(hh.yg_d+theInnerHitId); } + __device__ __forceinline__ float get_outer_y(Hits const & hh) const { return __ldg(hh.yg_d+theOuterHitId); } + __device__ __forceinline__ float get_inner_z(Hits const & hh) const { return theInnerZ; } // { return __ldg(hh.zg_d+theInnerHitId); } // { return theInnerZ; } + __device__ __forceinline__ float get_outer_z(Hits const & hh) const { return __ldg(hh.zg_d+theOuterHitId); } + __device__ __forceinline__ float get_inner_r(Hits const & hh) const { return theInnerR; } // { return __ldg(hh.rg_d+theInnerHitId); } // { return theInnerR; } + 
__device__ __forceinline__ float get_outer_r(Hits const & hh) const { return __ldg(hh.rg_d+theOuterHitId); } + constexpr unsigned int get_inner_hit_id() const { return theInnerHitId; } @@ -56,37 +56,42 @@ class GPUCACell { return theOuterHitId; } - constexpr void print_cell() const { + + __device__ + void print_cell() const { printf("printing cell: %d, on layerPair: %d, innerHitId: %d, outerHitId: " "%d, innerradius %f, outerRadius %f \n", - theDoubletId, theLayerPairId, theInnerHitId, theOuterHitId, - theInnerR, theOuterR); + theDoubletId, theLayerPairId, theInnerHitId, theOuterHitId + ); } - __host__ __device__ - bool check_alignment_and_tag( - const GPUCACell *cells, unsigned int innerCellId, const float ptmin, + + __device__ + bool check_alignment(Hits const & hh, + GPUCACell const & otherCell, const float ptmin, const float region_origin_x, const float region_origin_y, const float region_origin_radius, const float thetaCut, - const float phiCut, const float hardPtCut) + const float phiCut, const float hardPtCut) const { - auto ro = get_outer_r(); - auto zo = get_outer_z(); - const auto &otherCell = cells[innerCellId]; + auto ri = get_inner_r(hh); + auto zi = get_inner_z(hh); + + auto ro = get_outer_r(hh); + auto zo = get_outer_z(hh); - auto r1 = otherCell.get_inner_r(); - auto z1 = otherCell.get_inner_z(); - bool aligned = areAlignedRZ(r1, z1, ro, zo, ptmin, thetaCut); + auto r1 = otherCell.get_inner_r(hh); + auto z1 = otherCell.get_inner_z(hh); + bool aligned = areAlignedRZ(r1, z1, ri, zi, ro, zo, ptmin, thetaCut); return (aligned && - haveSimilarCurvature(cells, innerCellId, ptmin, region_origin_x, + haveSimilarCurvature(hh, otherCell, ptmin, region_origin_x, region_origin_y, region_origin_radius, phiCut, hardPtCut)); } - - constexpr bool areAlignedRZ(float r1, float z1, float ro, float zo, + __device__ __forceinline__ + static bool areAlignedRZ(float r1, float z1, float ri, float zi, float ro, float zo, const float ptmin, - const float thetaCut) const { + const float thetaCut) { float radius_diff = std::abs(r1 - ro); float distance_13_squared = radius_diff * radius_diff + (z1 - zo) * (z1 - zo); @@ -96,27 +101,26 @@ class GPUCACell { // radius_diff later float tan_12_13_half_mul_distance_13_squared = - fabs(z1 * (get_inner_r() - ro) + get_inner_z() * (ro - r1) + zo * (r1 - get_inner_r())); + fabs(z1 * (ri - ro) + zi * (ro - r1) + zo * (r1 - ri)); return tan_12_13_half_mul_distance_13_squared * pMin <= thetaCut * distance_13_squared * radius_diff; } - constexpr bool - haveSimilarCurvature(const GPUCACell *cells, unsigned int innerCellId, + __device__ + bool + haveSimilarCurvature(Hits const & hh, GPUCACell const & otherCell, const float ptmin, const float region_origin_x, const float region_origin_y, const float region_origin_radius, const float phiCut, const float hardPtCut) const { - const auto &otherCell = cells[innerCellId]; - - auto x1 = otherCell.get_inner_x(); - auto y1 = otherCell.get_inner_y(); + auto x1 = otherCell.get_inner_x(hh); + auto y1 = otherCell.get_inner_y(hh); - auto x2 = get_inner_x(); - auto y2 = get_inner_y(); + auto x2 = get_inner_x(hh); + auto y2 = get_inner_y(hh); - auto x3 = get_outer_x(); - auto y3 = get_outer_y(); + auto x3 = get_outer_x(hh); + auto y3 = get_outer_y(hh); float distance_13_squared = (x1 - x3) * (x1 - x3) + (y1 - y3) * (y1 - y3); float tan_12_13_half_mul_distance_13_squared = @@ -139,7 +143,7 @@ class GPUCACell { return distance_13_beamspot_squared < (region_origin_radius + phiCut) * (region_origin_radius + phiCut); - } + } // 87 cm/GeV = 
1/(3.8T * 0.3) @@ -186,13 +190,13 @@ class GPUCACell { // trying to free the track building process from hardcoded layers, leaving // the visit of the graph based on the neighborhood connections between cells. -#ifdef __CUDACC__ +// #ifdef __CUDACC__ __device__ inline void find_ntuplets( - const GPUCACell *cells, + GPUCACell const * __restrict__ cells, GPU::SimpleVector *foundNtuplets, - GPU::VecArray &tmpNtuplet, + GPU::VecArray &tmpNtuplet, const unsigned int minHitsPerNtuplet) const { // the building process for a track ends if: @@ -231,16 +235,10 @@ class GPUCACell { int theLayerPairId; private: - unsigned int theInnerHitId; - unsigned int theOuterHitId; - float theInnerX; - float theOuterX; - float theInnerY; - float theOuterY; float theInnerZ; - float theOuterZ; float theInnerR; - float theOuterR; + hindex_type theInnerHitId; + hindex_type theOuterHitId; }; #endif // RecoPixelVertexing_PixelTriplets_plugins_GPUCACell_h diff --git a/RecoPixelVertexing/PixelTriplets/plugins/gpuPixelDoublets.h b/RecoPixelVertexing/PixelTriplets/plugins/gpuPixelDoublets.h index 31844f39f9727..61d048637585c 100644 --- a/RecoPixelVertexing/PixelTriplets/plugins/gpuPixelDoublets.h +++ b/RecoPixelVertexing/PixelTriplets/plugins/gpuPixelDoublets.h @@ -20,12 +20,13 @@ namespace gpuPixelDoublets { template __device__ + __forceinline__ void doubletsFromHisto(uint8_t const * __restrict__ layerPairs, uint32_t nPairs, GPUCACell * cells, uint32_t * nCells, int16_t const * __restrict__ iphi, - Hist const * __restrict__ hist, + Hist const & __restrict__ hist, uint32_t const * __restrict__ offsets, siPixelRecHitsHeterogeneousProduct::HitsOnGPU const & __restrict__ hh, GPU::VecArray< unsigned int, 256> * isOuterHitOfCell, @@ -63,6 +64,8 @@ namespace gpuPixelDoublets { uint8_t outer = layerPairs[2*pairLayerId+1]; assert(outer > inner); + auto hoff = Hist::histOff(outer); + auto i = (0 == pairLayerId) ? 
j : j-innerLayerCumulativeSize[pairLayerId-1]; i += offsets[inner]; @@ -73,8 +76,8 @@ namespace gpuPixelDoublets { // found hit corresponding to our cuda thread, now do the job auto mep = iphi[i]; - auto mez = hh.zg_d[i]; - auto mer = hh.rg_d[i]; + auto mez = __ldg(hh.zg_d+i); + auto mer = __ldg(hh.rg_d+i); constexpr float z0cut = 12.f; // cm constexpr float hardPtCut = 0.5f; // GeV @@ -83,13 +86,13 @@ namespace gpuPixelDoublets { auto ptcut = [&](int j) { auto r2t4 = minRadius2T4; auto ri = mer; - auto ro = hh.rg_d[j]; + auto ro = __ldg(hh.rg_d+j); auto dphi = short2phi( min( abs(int16_t(mep-iphi[j])), abs(int16_t(iphi[j]-mep)) ) ); return dphi*dphi * (r2t4 - ri*ro) > (ro-ri)*(ro-ri); }; auto z0cutoff = [&](int j) { - auto zo = hh.zg_d[j]; - auto ro = hh.rg_d[j]; + auto zo = __ldg(hh.zg_d+j); + auto ro = __ldg(hh.rg_d+j); auto dr = ro-mer; return dr > maxr[pairLayerId] || dr<0 || std::abs((mez*ro - mer*zo)) > z0cut*dr; @@ -97,8 +100,8 @@ namespace gpuPixelDoublets { auto iphicut = phicuts[pairLayerId]; - auto kl = hist[outer].bin(int16_t(mep-iphicut)); - auto kh = hist[outer].bin(int16_t(mep+iphicut)); + auto kl = Hist::bin(int16_t(mep-iphicut)); + auto kh = Hist::bin(int16_t(mep+iphicut)); auto incr = [](auto & k) { return k = (k+1) % Hist::nbins();}; int tot = 0; int nmin = 0; @@ -108,9 +111,11 @@ namespace gpuPixelDoublets { int tooMany=0; for (auto kk = kl; kk != khh; incr(kk)) { if (kk != kl && kk != kh) - nmin += hist[outer].size(kk); - for (auto p = hist[outer].begin(kk); p < hist[outer].end(kk); ++p) { - auto oi=*p; + nmin += hist.size(kk+hoff); + auto const * __restrict__ p = hist.begin(kk+hoff); + auto const * __restrict__ e = hist.end(kk+hoff); + for (;p < e; ++p) { + auto oi=__ldg(p); assert(oi>=offsets[outer]); assert(oi 0) printf("OuterHitOfCell full for %d in layer %d/%d, %d:%d %d,%d\n", i, inner, outer, kl, kh, nmin, tot); - if (hist[outer].nspills > 0) - printf("spill bin to be checked in %d %d\n", outer, hist[outer].nspills); - - // if (0==hist[outer].nspills) assert(tot>=nmin); - // look in spill bin as well.... - } // loop in block... 
} constexpr auto getDoubletsFromHistoMaxBlockSize = 64; + constexpr auto getDoubletsFromHistoMinBlocksPerMP = 16; __global__ - __launch_bounds__(getDoubletsFromHistoMaxBlockSize) + __launch_bounds__(getDoubletsFromHistoMaxBlockSize,getDoubletsFromHistoMinBlocksPerMP) void getDoubletsFromHisto(GPUCACell * cells, uint32_t * nCells, siPixelRecHitsHeterogeneousProduct::HitsOnGPU const * __restrict__ hhp, @@ -184,7 +184,7 @@ namespace gpuPixelDoublets { auto const & __restrict__ hh = *hhp; doubletsFromHisto(layerPairs, nPairs, cells, nCells, - hh.iphi_d, hh.hist_d, hh.hitsLayerStart_d, + hh.iphi_d, *hh.hist_d, hh.hitsLayerStart_d, hh, isOuterHitOfCell, phicuts, minz, maxz, maxr); } diff --git a/RecoPixelVertexing/PixelVertexFinding/BuildFile.xml b/RecoPixelVertexing/PixelVertexFinding/BuildFile.xml index 5a3208da2c87f..43261b0417410 100644 --- a/RecoPixelVertexing/PixelVertexFinding/BuildFile.xml +++ b/RecoPixelVertexing/PixelVertexFinding/BuildFile.xml @@ -20,4 +20,4 @@ - + diff --git a/RecoPixelVertexing/PixelVertexFinding/interface/pixelVertexHeterogeneousProduct.h b/RecoPixelVertexing/PixelVertexFinding/interface/pixelVertexHeterogeneousProduct.h index fcb19f855a9ba..ff3624cdafd65 100644 --- a/RecoPixelVertexing/PixelVertexFinding/interface/pixelVertexHeterogeneousProduct.h +++ b/RecoPixelVertexing/PixelVertexFinding/interface/pixelVertexHeterogeneousProduct.h @@ -16,6 +16,7 @@ namespace pixelVertexHeterogeneousProduct { float * z_d; float * zerr_d; float * chi2_d; + uint16_t * sortInd; int32_t * ivtx_d; // this should be indexed with the original tracks, not the reduced set (oops) }; @@ -31,6 +32,7 @@ namespace pixelVertexHeterogeneousProduct { { } std::vector> z,zerr, chi2; + std::vector> sortInd; std::vector> ivtx; uint32_t nVertices=0; diff --git a/RecoPixelVertexing/PixelVertexFinding/src/PixelVertexHeterogeneousProducer.cc b/RecoPixelVertexing/PixelVertexFinding/src/PixelVertexHeterogeneousProducer.cc index 3451655a282d1..0b8c31235abea 100644 --- a/RecoPixelVertexing/PixelVertexFinding/src/PixelVertexHeterogeneousProducer.cc +++ b/RecoPixelVertexing/PixelVertexFinding/src/PixelVertexHeterogeneousProducer.cc @@ -117,18 +117,20 @@ void PixelVertexHeterogeneousProducer::acquireGPUCuda( // Second, make a collection of pointers to the tracks we want for the vertex finder // fill z,ez - std::vector z,ez; + std::vector z,ez2,pt2; assert(m_trks.empty()); for (unsigned int i=0; ipt() > 20 ? 20*20 : m_trks[k]->pt()*m_trks[k]->pt(); - } - // sort - std::sort(ind,ind+gpuProduct.nVertices,[&](int i, int j){ return pt2[i]>pt2[j];}); - if(gpuProduct.nVertices>1) assert(pt2[ind[0]]>=pt2[ind[1]]); // fill legacy data format - for (unsigned int j=0; j uind; // fort verifing index consistency + for (int j=int(gpuProduct.nVertices)-1; j>=0; --j) { + auto i = gpuProduct.sortInd[j]; // on gpu sorted in ascending order.... 
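// sortInd comes back ascending in summed pt^2, so walking j downwards emits the vertices
// in decreasing pt^2 (the legacy ordering); uind collects the consumed GPU indices so the
// asserts after the loop can check that they cover 0..nVertices-1 with no repeats.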
+ assert(i>=0); + assert(i0); (*vertexes).emplace_back(reco::Vertex::Point(x,y,z), err, gpuProduct.chi2[i], nt-1, nt ); auto & v = (*vertexes).back(); - pt2[i]=0; for (auto k: itrk) { v.add(reco::TrackBaseRef(m_trks[k])); } itrk.clear(); } + assert(uind.size()==(*vertexes).size()); + if (!uind.empty()) { + assert(0 == *uind.begin()); + assert(uind.size()-1 == *uind.rbegin()); + } - if (verbose_) { edm::LogInfo("PixelVertexHeterogeneousProducer") << ": Found " << vertexes->size() << " vertexes\n"; for (unsigned int i=0; isize(); ++i) { diff --git a/RecoPixelVertexing/PixelVertexFinding/src/gpuClusterTracks.h b/RecoPixelVertexing/PixelVertexFinding/src/gpuClusterTracks.h index 7c97665ece061..ae17401d06625 100644 --- a/RecoPixelVertexing/PixelVertexFinding/src/gpuClusterTracks.h +++ b/RecoPixelVertexing/PixelVertexFinding/src/gpuClusterTracks.h @@ -7,10 +7,51 @@ #include #include "HeterogeneousCore/CUDAUtilities/interface/HistoContainer.h" +#include "HeterogeneousCore/CUDAUtilities/interface/radixSort.h" + #include "gpuVertexFinder.h" namespace gpuVertexFinder { + + __global__ + void sortByPt2(int nt, + OnGPU * pdata + ) { + auto & __restrict__ data = *pdata; + float const * __restrict__ ptt2 = data.ptt2; + uint32_t const & nv = *data.nv; + + int32_t const * __restrict__ iv = data.iv; + float * __restrict__ ptv2 = data.ptv2; + uint16_t * __restrict__ sortInd = data.sortInd; + + if (nv<1) return; + + // can be done asynchronoisly at the end of previous event + for (int i = threadIdx.x; i < nv; i += blockDim.x) { + ptv2[i]=0; + } + __syncthreads(); + + + for (int i = threadIdx.x; i < nt; i += blockDim.x) { + if (iv[i]>9990) continue; + atomicAdd(&ptv2[iv[i]], ptt2[i]); + } + __syncthreads(); + + if (1==nv) { + if (threadIdx.x==0) sortInd[0]=0; + return; + } + __shared__ uint16_t ws[1024]; + radixSort(ptv2,sortInd,ws,nv); + + assert(ptv2[sortInd[nv-1]]>=ptv2[sortInd[nv-2]]); + assert(ptv2[sortInd[1]]>=ptv2[sortInd[0]]); + } + // this algo does not really scale as it works in a single block... // enough for <10K tracks we have @@ -24,6 +65,9 @@ namespace gpuVertexFinder { ) { constexpr bool verbose = false; // in principle the compiler should optmize out if false + + + if(verbose && 0==threadIdx.x) printf("params %d %f\n",minT,eps); auto er2mx = errmax*errmax; @@ -35,42 +79,49 @@ namespace gpuVertexFinder { float * __restrict__ chi2 = data.chi2; uint32_t & nv = *data.nv; - int8_t * __restrict__ izt = data.izt; + uint8_t * __restrict__ izt = data.izt; int32_t * __restrict__ nn = data.nn; int32_t * __restrict__ iv = data.iv; assert(pdata); assert(zt); - __shared__ HistoContainer hist; - - // if(0==threadIdx.x) printf("params %d %f\n",minT,eps); - // if(0==threadIdx.x) printf("booked hist with %d bins, size %d for %d tracks\n",hist.nbins(),hist.binSize(),nt); - - // zero hist - hist.nspills = 0; - for (auto k = threadIdx.x; k; + constexpr auto wss = Hist::totbins(); + __shared__ Hist hist; + __shared__ typename Hist::Counter ws[wss]; + for (auto j=threadIdx.x; j= 0); + assert(iz-INT8_MIN < 256); + hist.count(izt[i]); iv[i]=i; nn[i]=0; } __syncthreads(); - - // if(0==threadIdx.x) printf("histo filled %d\n",hist.nspills); - if(0==threadIdx.x && hist.fullSpill()) printf("histo overflow\n"); - + hist.finalize(ws); + __syncthreads(); + assert(hist.size()==nt); + if (threadIdx.x<32) ws[threadIdx.x]=0; // used by prefix scan... 
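// A second pass over the tracks follows: count()/finalize() above turned the per-bin
// counts into start offsets, and fill() now scatters each track index into its bin,
// using ws as shared scratch counters while doing so.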
+ __syncthreads(); + for (int i = threadIdx.x; i < nt; i += blockDim.x) { + hist.fill(izt[i],uint16_t(i),ws); + } + __syncthreads(); + + // count neighbours for (int i = threadIdx.x; i < nt; i += blockDim.x) { if (ezt2[i]>er2mx) continue; @@ -87,20 +138,20 @@ namespace gpuVertexFinder { __syncthreads(); - // if(0==threadIdx.x) printf("nn counted\n"); - // cluster seeds only bool more = true; while (__syncthreads_or(more)) { more=false; - for (int i = threadIdx.x; i < nt; i += blockDim.x) { + for (int k = threadIdx.x; k < hist.size(); k += blockDim.x) { + auto p = hist.begin()+k; + auto i = (*p); + auto be = std::min(Hist::bin(izt[i])+1,int(hist.nbins()-1)); if (nn[i]eps) return; + auto dist = std::abs(zt[i]-zt[j]); + if (dist>eps) return; if (dist*dist>chi2max*(ezt2[i]+ezt2[j])) return; auto old = atomicMin(&iv[j], iv[i]); if (old != iv[i]) { @@ -109,8 +160,8 @@ namespace gpuVertexFinder { } atomicMin(&iv[i], old); }; - - forEachInBins(hist,izt[i],1,loop); + ++p; + for (;p>>(ntrks,onGPU_d,minT,eps,errmax,chi2max); - + clusterTracks<<<1,1024-256,0,stream>>>(ntrks,onGPU_d,minT,eps,errmax,chi2max); + cudaCheck(cudaGetLastError()); + sortByPt2<<<1,256,0,stream>>>(ntrks,onGPU_d); + cudaCheck(cudaGetLastError()); + cudaCheck(cudaMemcpyAsync(&gpuProduct.nVertices, onGPU.nv, sizeof(uint32_t), cudaMemcpyDeviceToHost, stream)); gpuProduct.ivtx.resize(ntrks); cudaCheck(cudaMemcpyAsync(gpuProduct.ivtx.data(),onGPU.iv,sizeof(int32_t)*ntrks, cudaMemcpyDeviceToHost, stream)); - - } Producer::GPUProduct const & Producer::fillResults(cudaStream_t stream) { @@ -78,13 +86,14 @@ namespace gpuVertexFinder { gpuProduct.chi2.resize(gpuProduct.nVertices); cudaCheck(cudaMemcpyAsync(gpuProduct.chi2.data(),onGPU.chi2,sizeof(float)*gpuProduct.nVertices, cudaMemcpyDeviceToHost, stream)); - + + gpuProduct.sortInd.resize(gpuProduct.nVertices); + cudaCheck(cudaMemcpyAsync(gpuProduct.sortInd.data(),onGPU.sortInd,sizeof(uint16_t)*gpuProduct.nVertices, + cudaMemcpyDeviceToHost, stream)); + cudaStreamSynchronize(stream); return gpuProduct; } - - - + } // end namespace - diff --git a/RecoPixelVertexing/PixelVertexFinding/src/gpuVertexFinder.h b/RecoPixelVertexing/PixelVertexFinding/src/gpuVertexFinder.h index 1f90641e3c260..ded6759a940bd 100644 --- a/RecoPixelVertexing/PixelVertexFinding/src/gpuVertexFinder.h +++ b/RecoPixelVertexing/PixelVertexFinding/src/gpuVertexFinder.h @@ -13,17 +13,21 @@ namespace gpuVertexFinder { static constexpr uint32_t MAXTRACKS = 16000; static constexpr uint32_t MAXVTX= 1024; - float * zt; // input track z at bs + float * zt; // input track z at bs float * ezt2; // input error^2 on the above + float * ptt2; // input pt^2 on the above + float * zv; // output z-posistion of found vertices float * wv; // output weight (1/error^2) on the above float * chi2; // vertices chi2 + float * ptv2; // vertices pt^2 uint32_t * nv; // the number of vertices int32_t * iv; // vertex index for each associated track + uint16_t * sortInd; // sorted index (by pt2) // workspace - int8_t * izt; // interized z-position of input tracks (reused as interize pt2 of vertices for sorting) + uint8_t * izt; // interized z-position of input tracks int32_t * nn; // number of nearest neighbours (reused as number of dof for output vertices) }; @@ -53,6 +57,7 @@ namespace gpuVertexFinder { void produce(cudaStream_t stream, float const * zt, float const * ezt2, + float const * ptt2, uint32_t ntrks ); diff --git a/RecoPixelVertexing/PixelVertexFinding/test/gpuVertexFinder_t.cu 
b/RecoPixelVertexing/PixelVertexFinding/test/gpuVertexFinder_t.cu index f47c4362503ae..a92c116702231 100644 --- a/RecoPixelVertexing/PixelVertexFinding/test/gpuVertexFinder_t.cu +++ b/RecoPixelVertexing/PixelVertexFinding/test/gpuVertexFinder_t.cu @@ -13,13 +13,14 @@ struct Event { std::vector itrack; std::vector ztrack; std::vector eztrack; + std::vector pttrack; std::vector ivert; }; struct ClusterGenerator { explicit ClusterGenerator(float nvert, float ntrack) : - rgen(-13.,13), errgen(0.005,0.025), clusGen(nvert), trackGen(ntrack), gauss(0.,1.) + rgen(-13.,13), errgen(0.005,0.025), clusGen(nvert), trackGen(ntrack), gauss(0.,1.), ptGen(1.) {} void operator()(Event & ev) { @@ -42,6 +43,8 @@ struct ClusterGenerator { ev.ztrack.push_back(ev.zvert[iv]+err*gauss(reng)); ev.eztrack.push_back(err*err); ev.ivert.push_back(iv); + ev.pttrack.push_back( (iv==5? 1.f:0.5f) + ptGen(reng) ); + ev.pttrack.back()*=ev.pttrack.back(); } } // add noise @@ -51,6 +54,8 @@ struct ClusterGenerator { ev.ztrack.push_back(rgen(reng)); ev.eztrack.push_back(err*err); ev.ivert.push_back(9999); + ev.pttrack.push_back( 0.5f + ptGen(reng) ); + ev.pttrack.back()*=ev.pttrack.back(); } } @@ -61,7 +66,7 @@ struct ClusterGenerator { std::poisson_distribution clusGen; std::poisson_distribution trackGen; std::normal_distribution gauss; - + std::exponential_distribution ptGen; }; @@ -79,11 +84,14 @@ int main() { auto zt_d = cuda::memory::device::make_unique(current_device, 64000); auto ezt2_d = cuda::memory::device::make_unique(current_device, 64000); + auto ptt2_d = cuda::memory::device::make_unique(current_device, 64000); auto zv_d = cuda::memory::device::make_unique(current_device, 256); auto wv_d = cuda::memory::device::make_unique(current_device, 256); auto chi2_d = cuda::memory::device::make_unique(current_device, 256); + auto ptv2_d = cuda::memory::device::make_unique(current_device, 256); + auto ind_d = cuda::memory::device::make_unique(current_device, 256); - auto izt_d = cuda::memory::device::make_unique(current_device, 64000); + auto izt_d = cuda::memory::device::make_unique(current_device, 64000); auto nn_d = cuda::memory::device::make_unique(current_device, 64000); auto iv_d = cuda::memory::device::make_unique(current_device, 64000); @@ -95,9 +103,12 @@ int main() { onGPU.zt = zt_d.get(); onGPU.ezt2 = ezt2_d.get(); + onGPU.ptt2 = ptt2_d.get(); onGPU.zv = zv_d.get(); onGPU.wv = wv_d.get(); onGPU.chi2 = chi2_d.get(); + onGPU.ptv2 = ptv2_d.get(); + onGPU.sortInd = ind_d.get(); onGPU.nv = nv_d.get(); onGPU.izt = izt_d.get(); onGPU.nn = nn_d.get(); @@ -123,6 +134,7 @@ int main() { cuda::memory::copy(onGPU.zt,ev.ztrack.data(),sizeof(float)*ev.ztrack.size()); cuda::memory::copy(onGPU.ezt2,ev.eztrack.data(),sizeof(float)*ev.eztrack.size()); + cuda::memory::copy(onGPU.ptt2,ev.pttrack.data(),sizeof(float)*ev.eztrack.size()); float eps = 0.1f; @@ -130,44 +142,58 @@ int main() { if ( (i%4) == 0 ) cuda::launch(clusterTracks, - { 1, 1024 }, + { 1, 512+256 }, ev.ztrack.size(), onGPU_d.get(),kk,eps, 0.02f,12.0f ); if ( (i%4) == 1 ) cuda::launch(clusterTracks, - { 1, 1024 }, + { 1, 512+256 }, ev.ztrack.size(), onGPU_d.get(),kk,eps, 0.02f,9.0f ); if ( (i%4) == 2 ) cuda::launch(clusterTracks, - { 1, 1024 }, + { 1, 512+256 }, ev.ztrack.size(), onGPU_d.get(),kk,eps, 0.01f,9.0f ); if ( (i%4) == 3 ) cuda::launch(clusterTracks, - { 1, 1024 }, + { 1, 512+256 }, ev.ztrack.size(), onGPU_d.get(),kk,0.7f*eps, 0.01f,9.0f ); - + cudaDeviceSynchronize(); + cuda::launch(sortByPt2, + { 1, 256 }, + ev.ztrack.size(), onGPU_d.get() + ); uint32_t nv; 
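// The vertex count is copied back first: every host-side buffer below (zv, wv, chi2,
// ptv2, nn, ind) is sized with nv, and the new nv==0 early exit guards an empty event.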
cuda::memory::copy(&nv, onGPU.nv, sizeof(uint32_t)); + + if (nv==0) { + std::cout << "NO VERTICES???" << std::endl; + continue; + } + float zv[nv]; float wv[nv]; float chi2[nv]; + float ptv2[nv]; int32_t nn[nv]; + uint16_t ind[nv]; cuda::memory::copy(&zv, onGPU.zv, nv*sizeof(float)); cuda::memory::copy(&wv, onGPU.wv, nv*sizeof(float)); cuda::memory::copy(&chi2, onGPU.chi2, nv*sizeof(float)); + cuda::memory::copy(&ptv2, onGPU.ptv2, nv*sizeof(float)); cuda::memory::copy(&nn, onGPU.nn, nv*sizeof(int32_t)); + cuda::memory::copy(&ind, onGPU.sortInd, nv*sizeof(uint16_t)); for (auto j=0U; j0) chi2[j]/=float(nn[j]); { @@ -178,7 +204,12 @@ int main() { auto mx = std::minmax_element(chi2,chi2+nv); std::cout << "min max chi2 " << *mx.first << ' ' << *mx.second << std::endl; } - + { + auto mx = std::minmax_element(ptv2,ptv2+nv); + std::cout << "min max ptv2 " << *mx.first << ' ' << *mx.second << std::endl; + std::cout << "min max ptv2 " << ptv2[ind[0]] << ' ' << ptv2[ind[nv-1]] << " at " << ind[0] << ' ' << ind[nv-1] << std::endl; + + } float dd[nv]; uint32_t ii=0; diff --git a/SimTracker/TrackerHitAssociation/plugins/BuildFile.xml b/SimTracker/TrackerHitAssociation/plugins/BuildFile.xml index c767b1e68936a..a63c38264c20d 100644 --- a/SimTracker/TrackerHitAssociation/plugins/BuildFile.xml +++ b/SimTracker/TrackerHitAssociation/plugins/BuildFile.xml @@ -1,11 +1,11 @@ + + + - - - - +
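Both the producer and the test above rely on the same contract for the new sortInd product: it lists vertex indices in ascending summed pt^2, with every index appearing exactly once. A minimal host-side cross-check of that contract, e.g. for use in gpuVertexFinder_t.cu, could look like the sketch below (checkSortInd is an illustrative helper, not part of the patch):

#include <algorithm>
#include <cstddef>
#include <cstdint>
#include <numeric>
#include <vector>

// Returns true if 'sortInd' enumerates every vertex index exactly once and orders
// the per-vertex summed pt^2 in ascending order (illustrative cross-check only).
inline bool checkSortInd(std::vector<float> const& ptv2,
                         std::vector<uint16_t> const& sortInd) {
  if (ptv2.size() != sortInd.size()) return false;
  // reference ordering computed on the host
  std::vector<uint16_t> ref(ptv2.size());
  std::iota(ref.begin(), ref.end(), uint16_t(0));
  std::sort(ref.begin(), ref.end(),
            [&](uint16_t i, uint16_t j) { return ptv2[i] < ptv2[j]; });
  // the GPU radix sort may break ties differently, so compare the keys, not the indices
  for (std::size_t k = 0; k < ref.size(); ++k)
    if (ptv2[ref[k]] != ptv2[sortInd[k]]) return false;
  // every vertex index must appear exactly once
  std::vector<uint16_t> seen(sortInd);
  std::sort(seen.begin(), seen.end());
  for (std::size_t k = 0; k < seen.size(); ++k)
    if (seen[k] != k) return false;
  return true;
}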