diff --git a/HeterogeneousCore/CUDAUtilities/BuildFile.xml b/HeterogeneousCore/CUDAUtilities/BuildFile.xml
index 38ee88b068999..fc6a61ffd433e 100644
--- a/HeterogeneousCore/CUDAUtilities/BuildFile.xml
+++ b/HeterogeneousCore/CUDAUtilities/BuildFile.xml
@@ -1 +1,2 @@
+
diff --git a/HeterogeneousCore/CUDAUtilities/interface/GPUSimpleVector.h b/HeterogeneousCore/CUDAUtilities/interface/GPUSimpleVector.h
index 74c8731da3712..47592fd2063d6 100644
--- a/HeterogeneousCore/CUDAUtilities/interface/GPUSimpleVector.h
+++ b/HeterogeneousCore/CUDAUtilities/interface/GPUSimpleVector.h
@@ -87,7 +87,7 @@ template <class T> struct SimpleVector {
inline constexpr void reset() { m_size = 0; }
inline constexpr int size() const { return m_size; }
inline constexpr int capacity() const { return m_capacity; }
- inline constexpr T *data() const { return m_data; }
+ inline constexpr T const * data() const { return m_data; }
inline constexpr void resize(int size) { m_size = size; }
inline constexpr void set_data(T * data) { m_data = data; }
diff --git a/HeterogeneousCore/CUDAUtilities/interface/GPUVecArray.h b/HeterogeneousCore/CUDAUtilities/interface/GPUVecArray.h
index 6083c0a55cd3b..8dcefdce65ab4 100644
--- a/HeterogeneousCore/CUDAUtilities/interface/GPUVecArray.h
+++ b/HeterogeneousCore/CUDAUtilities/interface/GPUVecArray.h
@@ -85,7 +85,7 @@ template <class T, int maxSize> struct VecArray {
inline constexpr const T& operator[](int i) const { return m_data[i]; }
inline constexpr void reset() { m_size = 0; }
inline constexpr int capacity() const { return maxSize; }
- inline constexpr T *data() const { return m_data; }
+ inline constexpr T const * data() const { return m_data; }
inline constexpr void resize(int size) { m_size = size; }
inline constexpr bool empty() const { return 0 == m_size; }
inline constexpr bool full() const { return maxSize == m_size; }
diff --git a/HeterogeneousCore/CUDAUtilities/interface/HistoContainer.h b/HeterogeneousCore/CUDAUtilities/interface/HistoContainer.h
index 94acd0015edf6..e13e30594ab6f 100644
--- a/HeterogeneousCore/CUDAUtilities/interface/HistoContainer.h
+++ b/HeterogeneousCore/CUDAUtilities/interface/HistoContainer.h
@@ -2,6 +2,7 @@
#define HeterogeneousCore_CUDAUtilities_HistoContainer_h
#include
+#include
#include
#include
#include
@@ -10,96 +11,101 @@
#endif // __CUDA_ARCH__
#include "HeterogeneousCore/CUDAUtilities/interface/cudastdAlgorithm.h"
+#ifdef __CUDACC__
+#include "HeterogeneousCore/CUDAUtilities/interface/prefixScan.h"
+#include <cub/cub.cuh>
+#endif
#include "HeterogeneousCore/CUDAUtilities/interface/cudaCheck.h"
#ifdef __CUDACC__
namespace cudautils {
- template
- __global__
- void zeroMany(Histo * h, uint32_t nh) {
- auto i = blockIdx.x * blockDim.x + threadIdx.x;
- auto ih = i / Histo::nbins();
- auto k = i - ih * Histo::nbins();
- if (ih < nh) {
- h[ih].nspills = 0;
- if (k < Histo::nbins())
- h[ih].n[k] = 0;
- }
- }
-
template
__global__
- void fillFromVector(Histo * h, uint32_t nh, T const * v, uint32_t * offsets) {
+ void countFromVector(Histo * __restrict__ h, uint32_t nh, T const * __restrict__ v, uint32_t const * __restrict__ offsets) {
auto i = blockIdx.x * blockDim.x + threadIdx.x;
if(i >= offsets[nh]) return;
auto off = cuda_std::upper_bound(offsets, offsets + nh + 1, i);
assert((*off) > 0);
int32_t ih = off - offsets - 1;
assert(ih >= 0);
- assert(ih < nh);
- h[ih].fill(v[i], i);
+ assert(ih < nh);
+ (*h).count(v[i], ih);
}
template
__global__
- void fillFromVector(Histo * h, T const * v, uint32_t size) {
+ void fillFromVector(Histo * __restrict__ h, uint32_t nh, T const * __restrict__ v, uint32_t const * __restrict__ offsets,
+ uint32_t * __restrict__ ws ) {
auto i = blockIdx.x * blockDim.x + threadIdx.x;
- if(i < size) h->fill(v[i], i);
- }
-
- template
- void zero(Histo * h, uint32_t nh, int nthreads, cudaStream_t stream) {
- auto nblocks = (nh * Histo::nbins() + nthreads - 1) / nthreads;
- zeroMany<<<nblocks, nthreads, 0, stream>>>(h, nh);
- cudaCheck(cudaGetLastError());
+ if(i >= offsets[nh]) return;
+ auto off = cuda_std::upper_bound(offsets, offsets + nh + 1, i);
+ assert((*off) > 0);
+ int32_t ih = off - offsets - 1;
+ assert(ih >= 0);
+ assert(ih < nh);
+ (*h).fill(v[i], i, ws, ih);
}
- template
- void fillOneFromVector(Histo * h, T const * v, uint32_t size, int nthreads, cudaStream_t stream) {
- zero(h, 1, nthreads, stream);
- auto nblocks = (size + nthreads - 1) / nthreads;
- fillFromVector<<<nblocks, nthreads, 0, stream>>>(h, v, size);
- cudaCheck(cudaGetLastError());
- }
template
- void fillManyFromVector(Histo * h, uint32_t nh, T const * v, uint32_t * offsets, uint32_t totSize, int nthreads, cudaStream_t stream) {
- zero(h, nh, nthreads, stream);
+ void fillManyFromVector(Histo * __restrict__ h, typename Histo::Counter * __restrict__ ws,
+ uint32_t nh, T const * __restrict__ v, uint32_t const * __restrict__ offsets, uint32_t totSize,
+ int nthreads, cudaStream_t stream) {
+ uint32_t * off = (uint32_t *)( (char*)(h) +offsetof(Histo,off));
+ cudaMemsetAsync(off,0, 4*Histo::totbins(),stream);
auto nblocks = (totSize + nthreads - 1) / nthreads;
- fillFromVector<<<nblocks, nthreads, 0, stream>>>(h, nh, v, offsets);
+ countFromVector<<<nblocks, nthreads, 0, stream>>>(h, nh, v, offsets);
+ cudaCheck(cudaGetLastError());
+ size_t wss = Histo::totbins();
+ CubDebugExit(cub::DeviceScan::InclusiveSum(ws, wss, off, off, Histo::totbins(), stream));
+ cudaMemsetAsync(ws,0, 4*Histo::totbins(),stream);
+ fillFromVector<<<nblocks, nthreads, 0, stream>>>(h, nh, v, offsets, ws);
cudaCheck(cudaGetLastError());
}
+
} // namespace cudautils
#endif
// iterate over N bins left and right of the one containing "v"
-// including spillBin
template
__host__ __device__
+__forceinline__
void forEachInBins(Hist const & hist, V value, int n, Func func) {
- int bs = hist.bin(value);
- int be = std::min(int(hist.nbins()),bs+n+1);
+ int bs = Hist::bin(value);
+ int be = std::min(int(Hist::nbins()-1),bs+n);
bs = std::max(0,bs-n);
- assert(be>bs);
- for (auto b=bs; b<be; ++b) func(hist, b);
+ assert(be>=bs);
+ for (auto pj=hist.begin(bs); pj<hist.end(be); ++pj) func(*pj);
+}
+
+// iterate over all the bins between the one containing "wmin" and the one containing "wmax"
+template<typename Hist, typename V, typename Func>
+__host__ __device__
+__forceinline__
+void forEachInWindow(Hist const & hist, V wmin, V wmax, Func const & func) {
+ auto bs = Hist::bin(wmin);
+ auto be = Hist::bin(wmax);
+ assert(be>=bs);
+ for (auto pj=hist.begin(bs); pj<hist.end(be); ++pj) func(*pj);
+}
class HistoContainer {
public:
@@ -111,24 +117,55 @@ class HistoContainer {
using index_type = I;
using UT = typename std::make_unsigned::type;
+
+ static constexpr uint32_t ilog2(uint32_t v) {
+
+ constexpr uint32_t b[] = {0x2, 0xC, 0xF0, 0xFF00, 0xFFFF0000};
+ constexpr uint32_t s[] = {1, 2, 4, 8, 16};
+
+ uint32_t r = 0; // result of log2(v) will go here
+ for (auto i = 4; i >= 0; i--) if (v & b[i]) {
+ v >>= s[i];
+ r |= s[i];
+ }
+ return r;
+ }
+
+
static constexpr uint32_t sizeT() { return S; }
- static constexpr uint32_t nbins() { return 1 << N; }
- static constexpr uint32_t shift() { return sizeT() - N; }
- static constexpr uint32_t mask() { return nbins() - 1; }
- static constexpr uint32_t binSize() { return 1 << M; }
- static constexpr uint32_t spillSize() { return 16 * binSize(); }
+ static constexpr uint32_t nbins() { return NBINS;}
+ static constexpr uint32_t nhists() { return NHISTS;}
+ static constexpr uint32_t totbins() { return NHISTS*NBINS+1;}
+ static constexpr uint32_t nbits() { return ilog2(NBINS-1)+1;}
+ static constexpr uint32_t capacity() { return SIZE; }
+
+ static constexpr auto histOff(uint32_t nh) { return NBINS*nh; }
+
+#ifdef __CUDACC__
+ __host__
+ static size_t wsSize() {
+ uint32_t * v =nullptr;
+ void * d_temp_storage = nullptr;
+ size_t temp_storage_bytes = 0;
+ cub::DeviceScan::InclusiveSum(d_temp_storage, temp_storage_bytes, v, v, totbins()-1);
+ return std::max(temp_storage_bytes,size_t(totbins()));
+ }
+#endif
+
static constexpr UT bin(T t) {
- return (t >> shift()) & mask();
+ constexpr uint32_t shift = sizeT() - nbits();
+ constexpr uint32_t mask = (1<<nbits()) - 1;
+ return (t >> shift) & mask;
}
void zero() {
- nspills = 0;
- for (auto & i : n)
+ for (auto & i : off)
i = 0;
}
- static constexpr
+ static __host__ __device__
+ __forceinline__
uint32_t atomicIncrement(Counter & x) {
#ifdef __CUDA_ARCH__
return atomicAdd(&x, 1);
@@ -138,51 +175,72 @@ class HistoContainer {
}
__host__ __device__
- void fill(T t, index_type j) {
- UT b = bin(t);
+ __forceinline__
+ void count(T t) {
+ uint32_t b = bin(t);
assert(b= spillSize();
- }
-
- constexpr bool full(uint32_t b) const {
- return n[b] >= binSize();
+ __host__ __device__
+ __forceinline__
+ void fill(T t, index_type j, Counter * ws) {
+ uint32_t b = bin(t);
+ assert(b
+#include
+
+template<typename T>
+__device__
+void
+__forceinline__
+warpPrefixScan(T * c, uint32_t i) {
+ auto x = c[i];
+ auto laneId = threadIdx.x & 0x1f;
+ #pragma unroll
+ for( int offset = 1 ; offset < 32 ; offset <<= 1 ) {
+ auto y = __shfl_up_sync(0xffffffff,x, offset);
+ if(laneId >= offset) x += y;
+ }
+ c[i] = x;
+}
+
+// limited to 32*32 elements....
+template<typename T>
+__device__
+void
+__forceinline__
+blockPrefixScan(T * c, uint32_t size, T* ws) {
+ assert(size<=1024);
+ assert(0==blockDim.x%32);
+
+ auto first = threadIdx.x;
+
+ for (auto i=first; i
-__device__
-void radixSortImpl(T const * a, uint16_t * ind, uint32_t size, RF reorder) {
+__device__
+void
+__forceinline__
+radixSortImpl(T const * __restrict__ a, uint16_t * ind, uint16_t * ind2, uint32_t size, RF reorder) {
constexpr int d = 8, w = 8*sizeof(T);
 constexpr int sb = 1<<d;
 assert(size>0);
- assert(size<=MaxSize);
- assert(blockDim.x==sb);
+ assert(blockDim.x>=sb);
// bool debug = false; // threadIdx.x==0 && blockIdx.x==5;
@@ -104,9 +103,8 @@ void radixSortImpl(T const * a, uint16_t * ind, uint32_t size, RF reorder) {
for (auto i=first; i= offset) x += y;
+ if (threadIdx.x= offset) x += y;
+ }
+ ct[threadIdx.x] = x;
}
- ct[threadIdx.x] = x;
__syncthreads();
- auto ss = (threadIdx.x/32)*32 -1;
- c[threadIdx.x] = ct[threadIdx.x];
- for(int i=ss; i>0; i-=32) c[threadIdx.x] +=ct[i];
-
+ if (threadIdx.x<sb) {
+ auto ss = (threadIdx.x/32)*32 -1;
+ c[threadIdx.x] = ct[threadIdx.x];
+ for(int i=ss; i>0; i-=32) c[threadIdx.x] +=ct[i];
+ }
/*
//prefix scan for the nulls (for documentation)
if (threadIdx.x==0)
@@ -140,27 +141,33 @@ void radixSortImpl(T const * a, uint16_t * ind, uint32_t size, RF reorder) {
// broadcast
ibs =size-1;
__syncthreads();
- while (ibs>0) {
+ while (__syncthreads_and(ibs>0)) {
int i = ibs - threadIdx.x;
- cu[threadIdx.x]=-1;
- ct[threadIdx.x]=-1;
+ if (threadIdx.x<sb) {
+ cu[threadIdx.x]=-1;
+ ct[threadIdx.x]=-1;
+ }
 __syncthreads();
- if (i>=0) {
- bin = (a[j[i]] >> d*p)&(sb-1);
- ct[threadIdx.x]=bin;
- atomicMax(&cu[bin],int(i));
+ if (threadIdx.x<sb && i>=0) {
+ bin = (a[j[i]] >> d*p)&(sb-1);
+ ct[threadIdx.x]=bin;
+ atomicMax(&cu[bin],int(i));
+ }
}
__syncthreads();
- if (i>=0 && i==cu[bin]) // ensure to keep them in order
- for (int ii=threadIdx.x; ii=oi);if(i>=oi)
- k[--c[bin]] = j[i-oi];
- }
+ if (threadIdx.x=0 && i==cu[bin]) // ensure to keep them in order
+ for (int ii=threadIdx.x; ii=oi);if(i>=oi)
+ k[--c[bin]] = j[i-oi];
+ }
+ }
__syncthreads();
if (bin>=0) assert(c[bin]>=0);
- if (threadIdx.x==0) ibs-=blockDim.x;
+ if (threadIdx.x==0) ibs-=sb;
__syncthreads();
}
@@ -191,6 +198,8 @@ void radixSortImpl(T const * a, uint16_t * ind, uint32_t size, RF reorder) {
if (j!=ind) // odd...
for (auto i=first; i::value,T>::type* = nullptr
>
__device__
-void radixSort(T const * a, uint16_t * ind, uint32_t size) {
- radixSortImpl(a,ind,size,dummyReorder);
+void
+__forceinline__
+radixSort(T const * a, uint16_t * ind, uint16_t * ind2, uint32_t size) {
+ radixSortImpl(a,ind,ind2,size,dummyReorder);
}
template<
@@ -213,8 +224,10 @@ template<
typename std::enable_if::value&&std::is_signed::value,T>::type* = nullptr
>
__device__
-void radixSort(T const * a, uint16_t * ind, uint32_t size) {
- radixSortImpl(a,ind,size,reorderSigned);
+void
+__forceinline__
+radixSort(T const * a, uint16_t * ind, uint16_t * ind2, uint32_t size) {
+ radixSortImpl(a,ind,ind2,size,reorderSigned);
}
template<
@@ -223,29 +236,47 @@ template<
typename std::enable_if::value,T>::type* = nullptr
>
__device__
-void radixSort(T const * a, uint16_t * ind, uint32_t size) {
+void
+__forceinline__
+ radixSort(T const * a, uint16_t * ind, uint16_t * ind2, uint32_t size) {
using I = int;
- radixSortImpl((I const *)(a),ind,size,reorderFloat);
+ radixSortImpl((I const *)(a),ind,ind2, size,reorderFloat);
}
template
__device__
-void radixSortMulti(T * v, uint16_t * index, uint32_t * offsets) {
+void
+__forceinline__
+radixSortMulti(T const * v, uint16_t * index, uint32_t const * offsets, uint16_t * workspace) {
+
+ extern __shared__ uint16_t ws[];
auto a = v+offsets[blockIdx.x];
- auto ind = index+offsets[blockIdx.x];;
+ auto ind = index+offsets[blockIdx.x];
+ auto ind2 = nullptr==workspace ? ws : workspace+offsets[blockIdx.x];
auto size = offsets[blockIdx.x+1]-offsets[blockIdx.x];
assert(offsets[blockIdx.x+1]>=offsets[blockIdx.x]);
- if (size>0) radixSort(a,ind,size);
+ if (size>0) radixSort(a,ind,ind2,size);
}
template
__global__
-void radixSortMultiWrapper(T * v, uint16_t * index, uint32_t * offsets) {
- radixSortMulti(v,index,offsets);
+void
+__launch_bounds__(256, 4)
+radixSortMultiWrapper(T const * v, uint16_t * index, uint32_t const * offsets, uint16_t * workspace) {
+ radixSortMulti(v,index,offsets, workspace);
}
+template
+__global__
+void
+// __launch_bounds__(256, 4)
+radixSortMultiWrapper2(T const * v, uint16_t * index, uint32_t const * offsets, uint16_t * workspace) {
+ radixSortMulti(v,index,offsets, workspace);
+}
+
+
#endif // HeterogeneousCoreCUDAUtilities_radixSort_H
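
For reference (not part of the patch): the warpPrefixScan added in prefixScan.h and the per-warp scan inside radixSortImpl use the same Hillis-Steele shuffle idiom, where every lane pulls the partial sum from "offset" lanes below it with __shfl_up_sync and the offset doubles each step. A minimal, self-contained CUDA sketch of that idiom (kernel and host names are hypothetical):

    #include <cstdio>
    #include <cuda_runtime.h>

    __global__ void warpScanDemo(int const* in, int* out, int n) {
      // one warp: all 32 lanes take part in the shuffle (full mask); lanes
      // beyond n just carry a 0 so the __shfl_up_sync mask stays valid
      int lane = threadIdx.x & 0x1f;
      int x = lane < n ? in[lane] : 0;
      for (int offset = 1; offset < 32; offset <<= 1) {
        int y = __shfl_up_sync(0xffffffff, x, offset);
        if (lane >= offset) x += y;
      }
      if (lane < n) out[lane] = x;        // out[lane] = in[0] + ... + in[lane]
    }

    int main() {
      int *in, *out;
      cudaMallocManaged(&in, 32 * sizeof(int));
      cudaMallocManaged(&out, 32 * sizeof(int));
      for (int i = 0; i < 32; ++i) in[i] = 1;
      warpScanDemo<<<1, 32>>>(in, out, 32);
      cudaDeviceSynchronize();
      printf("out[31] = %d\n", out[31]);  // expect 32
      cudaFree(in); cudaFree(out);
      return 0;
    }
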
diff --git a/HeterogeneousCore/CUDAUtilities/test/BuildFile.xml b/HeterogeneousCore/CUDAUtilities/test/BuildFile.xml
index ab97c243c4385..1049aed41fd5f 100644
--- a/HeterogeneousCore/CUDAUtilities/test/BuildFile.xml
+++ b/HeterogeneousCore/CUDAUtilities/test/BuildFile.xml
@@ -19,13 +19,27 @@
+
+
+
+
+
+
+
+
+
+
+
+
+
+
diff --git a/HeterogeneousCore/CUDAUtilities/test/HistoContainer_t.cpp b/HeterogeneousCore/CUDAUtilities/test/HistoContainer_t.cpp
index 61c561f96267d..ca8167ab10894 100644
--- a/HeterogeneousCore/CUDAUtilities/test/HistoContainer_t.cpp
+++ b/HeterogeneousCore/CUDAUtilities/test/HistoContainer_t.cpp
@@ -6,47 +6,75 @@
#include
#include
-template
+template
void go() {
std::mt19937 eng;
- std::uniform_int_distribution rgen(std::numeric_limits::min(),std::numeric_limits::max());
+
+ int rmin=std::numeric_limits::min();
+ int rmax=std::numeric_limits::max();
+ if (NBINS!=128) {
+ rmin=0;
+ rmax=NBINS*2-1;
+ }
+
+
+
+ std::uniform_int_distribution rgen(rmin,rmax);
constexpr int N=12000;
T v[N];
- using Hist = HistoContainer;
- std::cout << "HistoContainer " << Hist::nbins() << ' ' << Hist::binSize() << std::endl;
-
+ using Hist = HistoContainer;
+ using Hist4 = HistoContainer;
+ std::cout << "HistoContainer " << Hist::nbits() << ' ' << Hist::nbins() << ' ' << Hist::totbins() << ' ' << Hist::capacity() << ' ' << (rmax-rmin)/Hist::nbins() << std::endl;
+ std::cout << "bins " << int(Hist::bin(0)) << ' ' << int(Hist::bin(rmin)) << ' ' << int(Hist::bin(rmax)) << std::endl;
+ std::cout << "HistoContainer4 " << Hist4::nbits() << ' ' << Hist4::nbins() << ' ' << Hist4::totbins() << ' ' << Hist4::capacity() << ' ' << (rmax-rmin)/Hist::nbins() << std::endl;
+ for (auto nh=0; nh<4; ++nh) std::cout << "bins " << int(Hist4::bin(0))+Hist4::histOff(nh) << ' ' << int(Hist::bin(rmin))+Hist4::histOff(nh) << ' ' << int(Hist::bin(rmax))+Hist4::histOff(nh) << std::endl;
+
+
Hist h;
+ Hist4 h4;
+ typename Hist::Counter ws[Hist::totbins()];
+ typename Hist4::Counter ws4[Hist4::totbins()];
for (int it=0; it<5; ++it) {
for (long long j = 0; j < N; j++) v[j]=rgen(eng);
- h.zero();
- for (long long j = 0; j < N; j++) h.fill(v[j],j);
-
- std::cout << "nspills " << h.nspills << std::endl;
+ if (it==2) for (long long j = N/2; j < N/2+N/4; j++) v[j]=4;
+ h.zero();h4.zero();
+ assert(h.size()==0);assert(h4.size()==0);
+ for (auto & i: ws) i=0;
+ for (auto & i: ws4) i=0;
+ for (long long j = 0; j < N; j++) { h.count(v[j]); if(j<2000) h4.count(v[j],2); else h4.count(v[j],j%4); }
+ h.finalize(); h4.finalize();
+ assert(h.off[0]==0);
+ assert(h.size()==N);
+ assert(h4.off[0]==0);
+ assert(h4.size()==N);
+ for (long long j = 0; j < N; j++) { h.fill(v[j],j,ws); if(j<2000) h4.fill(v[j],j,ws4,2); else h4.fill(v[j],j,ws4,j%4); }
+ assert(h.size()==N);
+ assert(h4.size()==N);
- auto verify = [&](uint32_t i, uint32_t k, uint32_t t1, uint32_t t2) {
+ auto verify = [&](uint32_t i, uint32_t j, uint32_t k, uint32_t t1, uint32_t t2) {
assert(t1=i); }
// std::cout << kl << ' ' << kh << std::endl;
- for (auto j=h.begin(kl); j();
+ go();
+ go();
+
return 0;
}
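
For reference (not part of the patch): the test above exercises the new three-phase filling scheme of HistoContainer, where count() accumulates per-bin populations, finalize() turns them into offsets via a prefix sum, and fill() places each index into its slot with the help of a small counter workspace. A CPU-only sketch of the same scheme, with hypothetical names and a simplified bin function:

    #include <cassert>
    #include <cstdint>
    #include <vector>

    struct TinyHisto {
      static constexpr uint32_t nbins = 8;
      uint32_t off[nbins + 1] = {0};         // off[b]..off[b+1] delimit bin b after fill
      std::vector<uint32_t> content;

      static uint32_t bin(uint32_t v) { return v & (nbins - 1); }

      void count(uint32_t v) { ++off[bin(v) + 1]; }
      void finalize() {                      // prefix sum of the counts
        for (uint32_t b = 1; b <= nbins; ++b) off[b] += off[b - 1];
        content.resize(off[nbins]);
      }
      void fill(uint32_t v, uint32_t index, uint32_t* ws) {
        auto b = bin(v);
        content[off[b] + ws[b]++] = index;   // ws[] plays the role of the Counter workspace
      }
    };

    int main() {
      uint32_t vals[] = {3, 11, 3, 7, 42, 3};
      TinyHisto h;
      uint32_t ws[TinyHisto::nbins] = {0};
      for (auto v : vals) h.count(v);
      h.finalize();
      for (uint32_t i = 0; i < 6; ++i) h.fill(vals[i], i, ws);
      assert(h.off[TinyHisto::nbins] == 6);  // all indices placed
      return 0;
    }
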
diff --git a/HeterogeneousCore/CUDAUtilities/test/HistoContainer_t.cu b/HeterogeneousCore/CUDAUtilities/test/HistoContainer_t.cu
index 73e9f20ae589f..8d640b728e25c 100644
--- a/HeterogeneousCore/CUDAUtilities/test/HistoContainer_t.cu
+++ b/HeterogeneousCore/CUDAUtilities/test/HistoContainer_t.cu
@@ -34,12 +34,16 @@ void go() {
constexpr uint32_t partSize = N/nParts;
uint32_t offsets[nParts+1];
- using Hist = HistoContainer;
- std::cout << "HistoContainer " << Hist::nbins() << ' ' << Hist::binSize() << ' ' << (std::numeric_limits::max()-std::numeric_limits::min())/Hist::nbins() << std::endl;
+ using Hist = HistoContainer;
+ std::cout << "HistoContainer " << (int)(offsetof(Hist,off)) << ' '
+ << Hist::nbins() << ' ' << Hist::totbins() << ' ' << Hist::capacity() << ' ' << Hist::wsSize() << ' '
+ << (std::numeric_limits::max()-std::numeric_limits::min())/Hist::nbins() << std::endl;
- Hist h[nParts];
+ Hist h;
+
+ auto h_d = cuda::memory::device::make_unique(current_device, 1);
+ auto ws_d = cuda::memory::device::make_unique(current_device, Hist::totbins());
- auto h_d = cuda::memory::device::make_unique(current_device, nParts);
auto off_d = cuda::memory::device::make_unique(current_device, nParts+1);
@@ -62,18 +66,20 @@ void go() {
for (long long j = 0; j < N; j++) v[j]=rgen(eng);
- if (it==2) { // spill
+ if (it==2) { // big bin
for (long long j = 1000; j < 2000; j++) v[j]= sizeof(T)==1 ? 22 : 3456;
}
cuda::memory::copy(v_d.get(), v, N*sizeof(T));
- cudautils::fillManyFromVector(h_d.get(),nParts,v_d.get(),off_d.get(),offsets[10],256,0);
-
- cuda::memory::copy(&h, h_d.get(), nParts*sizeof(Hist));
+ cudautils::fillManyFromVector(h_d.get(),ws_d.get(),nParts,v_d.get(),off_d.get(),offsets[10],256,0);
-
+ cuda::memory::copy(&h, h_d.get(), sizeof(Hist));
+ assert(0==h.off[0]);
+ assert(offsets[10]==h.size());
+
+
auto verify = [&](uint32_t i, uint32_t k, uint32_t t1, uint32_t t2) {
assert(t1 window ) {} else {++tot;}
}
if (kk==i) { l=false; continue; }
- if (l) for (auto p=h[j].begin(kk); p=nm)) {
- std::cout << "too bad " << j << ' ' << i <<' ' << me << '/'<< T(me-window)<< '/'<< T(me+window) << ": " << kl << '/' << kh << ' '<< khh << ' '<< tot<<'/'<=nm)) {
+ std::cout << "too bad " << j << ' ' << i <<' ' << int(me) << '/'<< (int)T(me-window)<< '/'<< (int)T(me+window) << ": " << kl << '/' << kh << ' '<< khh << ' '<< tot<<'/'<
+#include
+#include
+#include
+#include
+
+#include
+
+template
+__global__
+void mykernel(T const * __restrict__ v, uint32_t N) {
+
+ assert(v);
+ assert(N==12000);
+
+ if (threadIdx.x==0) printf("start kernel for %d data\n",N);
+
+ using Hist = HistoContainer;
+ constexpr auto wss = Hist::totbins();
+
+ if (threadIdx.x==0) printf("ws size %d\n",wss);
+
+ __shared__ Hist hist;
+ __shared__ typename Hist::Counter ws[wss];
+
+ for (auto j=threadIdx.x; j=k1);
+ }
+
+ for (auto i=threadIdx.x; i=0 && k::max();
+ vm = std::max(vm, 0);
+ vm = std::min(vm,vmax);
+ vp = std::min(vp,vmax);
+ vp = std::max(vp, 0);
+ assert(vp>=vm);
+ forEachInWindow(hist, vm,vp, ftest);
+ int bp = Hist::bin(vp);
+ int bm = Hist::bin(vm);
+ rtot = hist.end(bp)-hist.begin(bm);
+ assert(tot==rtot);
+ }
+
+
+}
+
+template
+void go() {
+
+ if (cuda::device::count() == 0) {
+ std::cerr << "No CUDA devices on this system" << "\n";
+ exit(EXIT_FAILURE);
+ }
+
+ auto current_device = cuda::device::current::get();
+
+
+ std::mt19937 eng;
+
+ int rmin=std::numeric_limits::min();
+ int rmax=std::numeric_limits::max();
+ if (NBINS!=128) {
+ rmin=0;
+ rmax=NBINS*2-1;
+ }
+
+
+
+ std::uniform_int_distribution rgen(rmin,rmax);
+
+
+ constexpr int N=12000;
+ T v[N];
+
+ auto v_d = cuda::memory::device::make_unique(current_device, N);
+ assert(v_d.get());
+
+ using Hist = HistoContainer;
+ std::cout << "HistoContainer " << Hist::nbits() << ' ' << Hist::nbins() << ' ' << Hist::capacity() << ' ' << (rmax-rmin)/Hist::nbins() << std::endl;
+ std::cout << "bins " << int(Hist::bin(0)) << ' ' << int(Hist::bin(rmin)) << ' ' << int(Hist::bin(rmax)) << std::endl;
+
+ for (int it=0; it<5; ++it) {
+ for (long long j = 0; j < N; j++) v[j]=rgen(eng);
+ if (it==2) for (long long j = N/2; j < N/2+N/4; j++) v[j]=4;
+
+ assert(v_d.get());
+ assert(v);
+ cuda::memory::copy(v_d.get(), v, N*sizeof(T));
+ assert(v_d.get());
+ cuda::launch(mykernel,{1,256},v_d.get(),N);
+ }
+
+}
+
+int main() {
+ go();
+ go();
+ go();
+
+
+ return 0;
+}
diff --git a/HeterogeneousCore/CUDAUtilities/test/prefixScan_t.cu b/HeterogeneousCore/CUDAUtilities/test/prefixScan_t.cu
new file mode 100644
index 0000000000000..36c4c3b88abe6
--- /dev/null
+++ b/HeterogeneousCore/CUDAUtilities/test/prefixScan_t.cu
@@ -0,0 +1,86 @@
+#include "HeterogeneousCore/CUDAUtilities/interface/prefixScan.h"
+#include
+
+template
+__global__
+void testPrefixScan(uint32_t size) {
+
+ __shared__ T ws[32];
+ __shared__ T c[1024];
+ auto first = threadIdx.x;
+ for (auto i=first; i
+
+
+__global__
+void init(uint32_t * v, uint32_t val, uint32_t n) {
+ auto i = blockIdx.x * blockDim.x + threadIdx.x;
+ if(i<<<1,bs>>>(j);
+ testPrefixScan<<<1,bs>>>(j);
+ }
+ cudaDeviceSynchronize();
+
+ // Declare, allocate, and initialize device-accessible pointers for input and output
+ int num_items = 10000;
+ uint32_t *d_in;
+ uint32_t *d_out;
+
+
+ cudaMalloc(&d_in,num_items*sizeof(uint32_t));
+ // cudaMalloc(&d_out,num_items*sizeof(uint32_t));
+
+ d_out = d_in;
+
+ auto nthreads = 256;
+ auto nblocks = (num_items + nthreads - 1) / nthreads;
+
+ init<<>>(d_in, 1, num_items);
+
+ // Determine temporary device storage requirements for inclusive prefix sum
+ void *d_temp_storage = nullptr;
+ size_t temp_storage_bytes = 0;
+ cub::DeviceScan::InclusiveSum(d_temp_storage, temp_storage_bytes, d_in, d_out, num_items);
+
+ std::cout << "temp storage " << temp_storage_bytes << std::endl;
+
+ // Allocate temporary storage for inclusive prefix sum
+ // fake larger ws already available
+ temp_storage_bytes *=8;
+ cudaMalloc(&d_temp_storage, temp_storage_bytes);
+ std::cout << "temp storage " << temp_storage_bytes << std::endl;
+ // Run inclusive prefix sum
+ CubDebugExit(cub::DeviceScan::InclusiveSum(d_temp_storage, temp_storage_bytes, d_in, d_out, num_items));
+ std::cout << "temp storage " << temp_storage_bytes << std::endl;
+
+ verify<<>>(d_out, num_items);
+
+ cudaDeviceSynchronize();
+
+ return 0;
+}
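
For reference (not part of the patch): this test and cudautils::fillManyFromVector / Histo::wsSize() follow CUB's standard two-phase convention, where the first InclusiveSum call with a null workspace only reports the temporary-storage size and the second call performs the scan. A minimal standalone sketch (assuming CUB is available as cub/cub.cuh):

    #include <cstdint>
    #include <cstdio>
    #include <cub/cub.cuh>
    #include <cuda_runtime.h>

    int main() {
      int n = 1000;
      uint32_t* d = nullptr;
      cudaMalloc(&d, n * sizeof(uint32_t));
      cudaMemset(d, 0, n * sizeof(uint32_t));

      // phase 1: null workspace -> only query the temporary-storage size
      void* d_temp = nullptr;
      size_t temp_bytes = 0;
      cub::DeviceScan::InclusiveSum(d_temp, temp_bytes, d, d, n);

      // phase 2: allocate the workspace and run the scan in place
      cudaMalloc(&d_temp, temp_bytes);
      cub::DeviceScan::InclusiveSum(d_temp, temp_bytes, d, d, n);
      cudaDeviceSynchronize();

      printf("temp storage: %zu bytes\n", temp_bytes);
      cudaFree(d_temp);
      cudaFree(d);
      return 0;
    }
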
diff --git a/HeterogeneousCore/CUDAUtilities/test/radixSort_t.cu b/HeterogeneousCore/CUDAUtilities/test/radixSort_t.cu
index bdc1af0123637..08441c8cc9555 100644
--- a/HeterogeneousCore/CUDAUtilities/test/radixSort_t.cu
+++ b/HeterogeneousCore/CUDAUtilities/test/radixSort_t.cu
@@ -7,7 +7,7 @@
#include
#include
#include
-
+#include
#include
#include
@@ -31,7 +31,7 @@ struct RS {
template
-void go() {
+void go(bool useShared) {
std::mt19937 eng;
// std::mt19937 eng2;
@@ -89,6 +89,7 @@ void go() {
auto v_d = cuda::memory::device::make_unique(current_device, N);
auto ind_d = cuda::memory::device::make_unique(current_device, N);
+ auto ws_d = cuda::memory::device::make_unique(current_device, N);
auto off_d = cuda::memory::device::make_unique(current_device, blocks+1);
cuda::memory::copy(v_d.get(), v, N*sizeof(T));
@@ -96,12 +97,23 @@ void go() {
 if (i<2) std::cout << "launch for " << offsets[blocks] << std::endl;
+ auto ntXBl = 1==i%4 ? 256 : 256;
+
delta -= (std::chrono::high_resolution_clock::now()-start);
+ constexpr int MaxSize = 256*32;
+ if (useShared)
cuda::launch(
radixSortMultiWrapper,
- { blocks, 256 },
- v_d.get(),ind_d.get(),off_d.get()
+ { blocks, ntXBl, MaxSize*2 },
+ v_d.get(),ind_d.get(),off_d.get(),nullptr
);
+ else
+ cuda::launch(
+ radixSortMultiWrapper2,
+ { blocks, ntXBl },
+ v_d.get(),ind_d.get(),off_d.get(),ws_d.get()
+ );
+
if (i==0) std::cout << "done for " << offsets[blocks] << std::endl;
@@ -117,10 +129,13 @@ void go() {
std::cout << LL(v[ind[3]]) << ' ' << LL(v[ind[10]]) << ' ' << LL(v[ind[blockSize-1000]]) << std::endl;
std::cout << LL(v[ind[blockSize/2-1]]) << ' ' << LL(v[ind[blockSize/2]]) << ' ' << LL(v[ind[blockSize/2+1]]) << std::endl;
}
- for (int ib=0; ib inds;
+ if (offsets[ib+1]> offsets[ib]) inds.insert(ind[offsets[ib]]);
+ for (auto j = offsets[ib]+1; j < offsets[ib+1]; j++) {
+ inds.insert(ind[j]);
auto a = v+offsets[ib];
- auto k1=a[ind[i]]; auto k2=a[ind[i-1]];
+ auto k1=a[ind[j]]; auto k2=a[ind[j-1]];
auto sh = sizeof(uint64_t)-NS; sh*=8;
auto shorten = [sh](T& t) {
auto k = (uint64_t *)(&t);
@@ -128,8 +143,15 @@ void go() {
};
shorten(k1);shorten(k2);
if (k1();
- go();
- go();
- go();
- go();
- go();
- go();
-
- go();
- go();
- go();
- // go();
+ bool useShared=false;
+
+ std::cout << "using Global memory" << std::endl;
+
+
+ go(useShared);
+ go(useShared);
+ go(useShared);
+ go(useShared);
+ go(useShared);
+ go(useShared);
+ go(useShared);
+
+ go(useShared);
+ go(useShared);
+ go(useShared);
+ // go(v);
+
+ useShared=true;
+
+ std::cout << "using Shared memory" << std::endl;
+
+ go(useShared);
+ go(useShared);
+ go(useShared);
+ go(useShared);
+ go(useShared);
+ go(useShared);
+ go(useShared);
+
+ go(useShared);
+ go(useShared);
+ go(useShared);
+ // go(v);
+
+
+
return 0;
}
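
For reference (not part of the patch): when useShared is true the test passes MaxSize*2 as the third element of the launch configuration; with the cuda-api-wrappers launch helper that value is the dynamic shared-memory size in bytes, and it backs the "extern __shared__ uint16_t ws[]" scratch buffer that radixSortMulti falls back to when no global workspace is given. A minimal sketch of the dynamic shared-memory mechanism with a hypothetical kernel:

    #include <cstdint>
    #include <cuda_runtime.h>

    __global__ void sharedScratchDemo(uint16_t* out, int n) {
      extern __shared__ uint16_t scratch[];   // size fixed at launch time
      for (int i = threadIdx.x; i < n; i += blockDim.x) scratch[i] = uint16_t(n - i);
      __syncthreads();
      for (int i = threadIdx.x; i < n; i += blockDim.x) out[i] = scratch[i];
    }

    int main() {
      int n = 256;
      uint16_t* d_out;
      cudaMalloc(&d_out, n * sizeof(uint16_t));
      // third launch parameter = dynamic shared-memory bytes for scratch[]
      sharedScratchDemo<<<1, 128, n * sizeof(uint16_t)>>>(d_out, n);
      cudaDeviceSynchronize();
      cudaFree(d_out);
      return 0;
    }
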
diff --git a/RecoLocalTracker/SiPixelClusterizer/plugins/SiPixelRawToClusterGPUKernel.cu b/RecoLocalTracker/SiPixelClusterizer/plugins/SiPixelRawToClusterGPUKernel.cu
index f3242a11d7ae6..7bd6eac473cc7 100644
--- a/RecoLocalTracker/SiPixelClusterizer/plugins/SiPixelRawToClusterGPUKernel.cu
+++ b/RecoLocalTracker/SiPixelClusterizer/plugins/SiPixelRawToClusterGPUKernel.cu
@@ -34,6 +34,7 @@
#include "HeterogeneousCore/CUDAUtilities/interface/cudaCheck.h"
#include "RecoLocalTracker/SiPixelClusterizer/plugins/gpuCalibPixel.h"
#include "RecoLocalTracker/SiPixelClusterizer/plugins/gpuClustering.h"
+#include "RecoLocalTracker/SiPixelClusterizer/plugins/gpuClusterChargeCut.h"
#include "RecoLocalTracker/SiPixelClusterizer/interface/SiPixelFedCablingMapGPU.h"
// local includes
@@ -687,11 +688,11 @@ namespace pixelgpudetails {
cudaCheck(cudaMemcpyAsync(adc_h, adc_d, wordCounter*sizeof(uint16_t), cudaMemcpyDefault, stream.id()));
}
- /*
- std::cout
+#ifdef GPU_DEBUG
+ std::cout
<< "CUDA countModules kernel launch with " << blocks
<< " blocks of " << threadsPerBlock << " threads\n";
- */
+#endif
cudaCheck(cudaMemsetAsync(moduleStart_d, 0x00, sizeof(uint32_t), stream.id()));
@@ -703,10 +704,10 @@ namespace pixelgpudetails {
threadsPerBlock = 256;
blocks = MaxNumModules;
- /*
+#ifdef GPU_DEBUG
std::cout << "CUDA findClus kernel launch with " << blocks
<< " blocks of " << threadsPerBlock << " threads\n";
- */
+#endif
cudaCheck(cudaMemsetAsync(clusInModule_d, 0, (MaxNumModules)*sizeof(uint32_t), stream.id()));
findClus<<>>(
moduleInd_d,
@@ -717,6 +718,18 @@ namespace pixelgpudetails {
wordCounter);
cudaCheck(cudaGetLastError());
+ // apply charge cut
+ clusterChargeCut<<>>(
+ moduleInd_d,
+ adc_d,
+ moduleStart_d,
+ clusInModule_d, moduleId_d,
+ clus_d,
+ wordCounter);
+ cudaCheck(cudaGetLastError());
+
+
+
// count the module start indices already here (instead of
// rechits) so that the number of clusters/hits can be made
// available in the rechit producer without additional points of
diff --git a/RecoLocalTracker/SiPixelClusterizer/plugins/SiPixelRawToClusterHeterogeneous.cc b/RecoLocalTracker/SiPixelClusterizer/plugins/SiPixelRawToClusterHeterogeneous.cc
index 7f7e65d2874f1..384a4732b32e1 100644
--- a/RecoLocalTracker/SiPixelClusterizer/plugins/SiPixelRawToClusterHeterogeneous.cc
+++ b/RecoLocalTracker/SiPixelClusterizer/plugins/SiPixelRawToClusterHeterogeneous.cc
@@ -584,7 +584,7 @@ void SiPixelRawToClusterHeterogeneous::convertGPUtoCPU(edm::Event& ev,
}
int32_t nclus=-1;
- std::vector aclusters(256);
+ std::vector aclusters(1024);
auto totCluseFilled=0;
auto fillClusters = [&](uint32_t detId){
@@ -611,6 +611,7 @@ void SiPixelRawToClusterHeterogeneous::convertGPUtoCPU(edm::Event& ev,
for (uint32_t i = 0; i < gpu.nDigis; i++) {
if (gpu.pdigi_h[i]==0) continue;
+ if (gpu.clus_h[i]>9000) continue; // not in cluster
assert(gpu.rawIdArr_h[i] > 109999);
if ( (*detDigis).detId() != gpu.rawIdArr_h[i])
{
@@ -625,7 +626,7 @@ void SiPixelRawToClusterHeterogeneous::convertGPUtoCPU(edm::Event& ev,
auto const & dig = (*detDigis).data.back();
// fill clusters
assert(gpu.clus_h[i]>=0);
- assert(gpu.clus_h[i]<256);
+ assert(gpu.clus_h[i]<1024);
nclus = std::max(gpu.clus_h[i],nclus);
auto row = dig.row();
auto col = dig.column();
diff --git a/RecoLocalTracker/SiPixelClusterizer/plugins/gpuClusterChargeCut.h b/RecoLocalTracker/SiPixelClusterizer/plugins/gpuClusterChargeCut.h
new file mode 100644
index 0000000000000..0284a378ecd39
--- /dev/null
+++ b/RecoLocalTracker/SiPixelClusterizer/plugins/gpuClusterChargeCut.h
@@ -0,0 +1,97 @@
+#ifndef RecoLocalTracker_SiPixelClusterizer_plugins_gpuClusterChargeCut_h
+#define RecoLocalTracker_SiPixelClusterizer_plugins_gpuClusterChargeCut_h
+
+#include
+#include
+#include
+
+#include "gpuClusteringConstants.h"
+
+#include "HeterogeneousCore/CUDAUtilities/interface/prefixScan.h"
+
+namespace gpuClustering {
+
+ __global__ void clusterChargeCut(
+ uint16_t * __restrict__ id, // module id of each pixel (modified if bad cluster)
+ uint16_t const * __restrict__ adc, // charge of each pixel
+ uint32_t const * __restrict__ moduleStart, // index of the first pixel of each module
+ uint32_t * __restrict__ nClustersInModule, // modified: number of clusters found in each module
+ uint32_t const * __restrict__ moduleId, // module id of each module
+ int32_t * __restrict__ clusterId, // modified: cluster id of each pixel
+ int numElements)
+ {
+
+ if (blockIdx.x >= moduleStart[0])
+ return;
+
+ auto firstPixel = moduleStart[1 + blockIdx.x];
+ auto thisModuleId = id[firstPixel];
+ assert(thisModuleId < MaxNumModules);
+ assert(thisModuleId==moduleId[blockIdx.x]);
+
+ auto nclus = nClustersInModule[thisModuleId];
+ if (nclus==0) return;
+
+ assert(nclus<=MaxNumClustersPerModules);
+
+#ifdef GPU_DEBUG
+ if (thisModuleId % 100 == 1)
+ if (threadIdx.x == 0)
+ printf("start clusterizer for module %d in block %d\n", thisModuleId, blockIdx.x);
+#endif
+
+ auto first = firstPixel + threadIdx.x;
+
+ __shared__ int32_t charge[MaxNumClustersPerModules];
+ for (int i=threadIdx.x; ichargeCut ? 1 : 0;
+ }
+
+ __syncthreads();
+
+ // renumber
+ __shared__ uint16_t ws[32];
+ blockPrefixScan(newclusId, nclus, ws);
+
+ assert(nclus>=newclusId[nclus-1]);
+
+ if(nclus==newclusId[nclus-1]) return;
+
+ nClustersInModule[thisModuleId] = newclusId[nclus-1];
+ __syncthreads();
+
+ // mark bad cluster again
+ for (int i=threadIdx.x; i
#include
#include
#include "gpuClusteringConstants.h"
+#include "Geometry/TrackerGeometryBuilder/interface/phase1PixelTopology.h"
#include "HeterogeneousCore/CUDAUtilities/interface/HistoContainer.h"
+
namespace gpuClustering {
__global__ void countModules(uint16_t const * __restrict__ id,
@@ -32,7 +37,9 @@ namespace gpuClustering {
}
}
- __global__ void findClus(uint16_t const * __restrict__ id, // module id of each pixel
+ __global__
+// __launch_bounds__(256,4)
+ void findClus(uint16_t const * __restrict__ id, // module id of each pixel
uint16_t const * __restrict__ x, // local coordinates of each pixel
uint16_t const * __restrict__ y, //
uint32_t const * __restrict__ moduleStart, // index of the first pixel of each module
@@ -63,8 +70,6 @@ namespace gpuClustering {
__syncthreads();
// skip threads not associated to an existing pixel
- bool active = (first < numElements);
- if (active) {
for (int i = first; i < numElements; i += blockDim.x) {
if (id[i] == InvId) // skip invalid pixels
continue;
@@ -73,95 +78,132 @@ namespace gpuClustering {
break;
}
}
- }
-
- //init hist (ymax < 512)
- __shared__ HistoContainer hist;
- hist.nspills = 0;
- for (auto k = threadIdx.x; k;
+ constexpr auto wss = Hist::totbins();
+ __shared__ Hist hist;
+ __shared__ typename Hist::Counter ws[wss];
+ for (auto j=threadIdx.x; j0 ? y[i]-1 : 0);
- auto be = hist.bin(y[i]+1)+1;
- auto loop = [&](int j) {
- j+=firstPixel;
- if (i>=j or j>jm or
- std::abs(int(x[j]) - int(x[i])) > 1 or
- std::abs(int(y[j]) - int(y[i])) > 1) return;
- auto old = atomicMin(&clusterId[j], clusterId[i]);
+ auto loop = [&](uint16_t const * kk) {
+ auto m = (*kk)+firstPixel;
+#ifdef GPU_DEBUG
+ assert(m!=i);
+#endif
+ if (std::abs(int(x[m]) - int(x[i])) > 1) return;
+ // if (std::abs(int(y[m]) - int(y[i])) > 1) return; // bin size is 1
+ auto old = atomicMin(&clusterId[m], clusterId[i]);
if (old != clusterId[i]) {
// end the loop only if no changes were applied
- done = false;
+ more = true;
}
atomicMin(&clusterId[i], old);
+#ifdef CLUS_LIMIT_LOOP
// update the loop boundary for the next iteration
- jmax[k] = std::max(j + 1,jmax[k]);
+ jmax[k] = std::max(kk + 1,jmax[k]);
+#endif
};
- for (auto b=bs; b MaxNumModules
+
}
#endif // RecoLocalTracker_SiPixelClusterizer_plugins_gpuClusteringConstants_h
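
For reference (not part of the patch): the clusterChargeCut kernel added in gpuClusterChargeCut.h flags the clusters that pass the charge cut, compacts the surviving cluster ids with blockPrefixScan so the numbering stays contiguous per module, and invalidates the pixels of the dropped clusters. A CPU-only sketch of that flag / scan / remap pattern (names and the invalid marker are illustrative, not the kernel's actual constants):

    #include <cassert>
    #include <cstdint>
    #include <vector>

    int main() {
      std::vector<int32_t> charge = {6000, 1200, 4500, 800};  // summed charge per cluster
      int32_t chargeCut = 2000;

      // flag the good clusters, then prefix-scan the flags -> compact new ids
      std::vector<uint16_t> newclusId(charge.size());
      for (size_t c = 0; c < charge.size(); ++c) newclusId[c] = charge[c] > chargeCut ? 1 : 0;
      for (size_t c = 1; c < charge.size(); ++c) newclusId[c] += newclusId[c - 1];
      uint32_t nGood = newclusId.back();                      // clusters surviving the cut
      assert(nGood == 2);

      // remap every pixel: good clusters get their compact id, bad ones a marker
      std::vector<int32_t> clusterId = {0, 0, 1, 2, 3, 2};    // old cluster id per pixel
      constexpr int32_t Invalid = -9999;                      // illustrative marker
      for (auto& cl : clusterId)
        cl = (charge[cl] > chargeCut) ? int32_t(newclusId[cl]) - 1 : Invalid;
      // cluster 0 -> 0, cluster 2 -> 1, clusters 1 and 3 -> Invalid
      return 0;
    }
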
diff --git a/RecoLocalTracker/SiPixelClusterizer/test/BuildFile.xml b/RecoLocalTracker/SiPixelClusterizer/test/BuildFile.xml
index 3fc830883ca58..335591b583b58 100644
--- a/RecoLocalTracker/SiPixelClusterizer/test/BuildFile.xml
+++ b/RecoLocalTracker/SiPixelClusterizer/test/BuildFile.xml
@@ -34,6 +34,15 @@
+
+
+
+
+
+
+
+
+
diff --git a/RecoLocalTracker/SiPixelClusterizer/test/gpuClustering.cu b/RecoLocalTracker/SiPixelClusterizer/test/gpuClustering.cu
index 2264be0de02af..a01c1230586fe 100644
--- a/RecoLocalTracker/SiPixelClusterizer/test/gpuClustering.cu
+++ b/RecoLocalTracker/SiPixelClusterizer/test/gpuClustering.cu
@@ -12,6 +12,7 @@
#include
#include "RecoLocalTracker/SiPixelClusterizer/plugins/gpuClustering.h"
+#include "RecoLocalTracker/SiPixelClusterizer/plugins/gpuClusterChargeCut.h"
int main(void)
{
@@ -30,7 +31,6 @@ int main(void)
auto h_clus = std::make_unique(numElements);
- auto h_debug = std::make_unique(numElements);
auto current_device = cuda::device::current::get();
auto d_id = cuda::memory::device::make_unique(current_device, numElements);
auto d_x = cuda::memory::device::make_unique(current_device, numElements);
@@ -44,12 +44,29 @@ int main(void)
auto d_clusInModule = cuda::memory::device::make_unique(current_device, MaxNumModules);
auto d_moduleId = cuda::memory::device::make_unique(current_device, MaxNumModules);
- auto d_debug = cuda::memory::device::make_unique(current_device, numElements);
// later random number
int n=0;
int ncl=0;
int y[10]={5,7,9,1,3,0,4,8,2,6};
+ auto generateClusters = [&](int kn) {
+ auto addBigNoise = 1==kn%2;
+ if (addBigNoise) {
+ constexpr int MaxPixels = 1000;
+ int id = 666;
+ for (int x=0; x<140; x+=3) {
+ for (int yy=0; yy<400; yy+=3) {
+ h_id[n]=id;
+ h_x[n]=x;
+ h_y[n]=yy;
+ h_adc[n]=1000;
+ ++n; ++ncl;
+ if (MaxPixels<=ncl) break;
+ }
+ if (MaxPixels<=ncl) break;
+ }
+ }
+
{
// isolated
int id = 42;
@@ -58,15 +75,56 @@ int main(void)
h_id[n]=id;
h_x[n]=x;
h_y[n]=x;
- h_adc[n]=100;
+ h_adc[n]= kn==0 ? 100 : 5000;
+ ++n;
+
+ // first column
+ ++ncl;
+ h_id[n]=id;
+ h_x[n]=x;
+ h_y[n]=0;
+ h_adc[n]= 5000;
+ ++n;
+ // first columns
+ ++ncl;
+ h_id[n]=id;
+ h_x[n]=x+80;
+ h_y[n]=2;
+ h_adc[n]= 5000;
+ ++n;
+ h_id[n]=id;
+ h_x[n]=x+80;
+ h_y[n]=1;
+ h_adc[n]= 5000;
+ ++n;
+
+ // last column
+ ++ncl;
+ h_id[n]=id;
+ h_x[n]=x;
+ h_y[n]=415;
+ h_adc[n]= 5000;
+ ++n;
+ // last columns
+ ++ncl;
+ h_id[n]=id;
+ h_x[n]=x+80;
+ h_y[n]=415;
+ h_adc[n]= 2500;
+ ++n;
+ h_id[n]=id;
+ h_x[n]=x+80;
+ h_y[n]=414;
+ h_adc[n]= 2500;
++n;
+
// diagonal
++ncl;
for (int x=20; x<25; ++x) {
h_id[n]=id;
h_x[n]=x;
h_y[n]=x;
- h_adc[n]=100;
+ h_adc[n]=1000;
++n;
}
++ncl;
@@ -75,7 +133,7 @@ int main(void)
h_id[n]=id;
h_x[n]=x;
h_y[n]=x;
- h_adc[n]=100;
+ h_adc[n]=1000;
++n;
}
++ncl;
@@ -86,7 +144,7 @@ int main(void)
h_id[n]=id;
h_x[n]=xx[k];
h_y[n]=20+xx[k];
- h_adc[n]=100;
+ h_adc[n]=1000;
++n;
}
// holes
@@ -95,13 +153,13 @@ int main(void)
h_id[n]=id;
h_x[n]=xx[k];
h_y[n]=100;
- h_adc[n]=100;
+ h_adc[n]= kn==2 ? 100 : 1000;
++n;
if (xx[k]%2==0) {
h_id[n]=id;
h_x[n]=xx[k];
h_y[n]=101;
- h_adc[n]=100;
+ h_adc[n]=1000;
++n;
}
}
@@ -114,7 +172,7 @@ int main(void)
h_id[n]=id;
h_x[n]=x;
h_y[n]=x;
- h_adc[n]=100;
+ h_adc[n]=5000;
++n;
}
// all odd id
@@ -132,7 +190,7 @@ int main(void)
h_id[n]=id;
h_x[n]=x+1;
h_y[n]=x+y[k]+2;
- h_adc[n]=100;
+ h_adc[n]=1000;
++n;
}
} else {
@@ -140,24 +198,31 @@ int main(void)
h_id[n]=id;
h_x[n]=x;
h_y[n]=x+y[9-k];
- h_adc[n]=100;
+ h_adc[n]= kn==2 ? 10 : 1000;
++n;
if (y[k]==3) continue; // hole
if (id==51) {h_id[n++]=InvId; h_id[n++]=InvId; }// error
h_id[n]=id;
h_x[n]=x+1;
h_y[n]=x+y[k]+2;
- h_adc[n]=100;
+ h_adc[n]= kn==2 ? 10 : 1000;
++n;
}
}
}
}
+ }; // end lambda
+ for (auto kkk=0; kkk<5; ++kkk) {
+ n=0; ncl=0;
+ generateClusters(kkk);
+
std::cout << "created " << n << " digis in " << ncl << " clusters" << std::endl;
assert(n<=numElements);
+
+
size_t size32 = n * sizeof(unsigned int);
size_t size16 = n * sizeof(unsigned short);
- size_t size8 = n * sizeof(uint8_t);
+ // size_t size8 = n * sizeof(uint8_t);
uint32_t nModules=0;
cuda::memory::copy(d_moduleStart.get(),&nModules,sizeof(uint32_t));
@@ -165,10 +230,9 @@ int main(void)
cuda::memory::copy(d_id.get(), h_id.get(), size16);
cuda::memory::copy(d_x.get(), h_x.get(), size16);
cuda::memory::copy(d_y.get(), h_y.get(), size16);
- cuda::memory::copy(d_adc.get(), h_adc.get(), size8);
- cuda::memory::device::zero(d_debug.get(),size32);
+ cuda::memory::copy(d_adc.get(), h_adc.get(), size16);
// Launch CUDA Kernels
- int threadsPerBlock = 256;
+ int threadsPerBlock = (kkk==5) ? 512 : ((kkk==3) ? 64 : 256);
int blocksPerGrid = (numElements + threadsPerBlock - 1) / threadsPerBlock;
std::cout
<< "CUDA countModules kernel launch with " << blocksPerGrid
@@ -180,7 +244,6 @@ int main(void)
d_id.get(), d_moduleStart.get() ,d_clus.get(),n
);
- threadsPerBlock = 256;
blocksPerGrid = MaxNumModules; //nModules;
std::cout
@@ -198,32 +261,77 @@ int main(void)
d_clus.get(),
n
);
+ cudaDeviceSynchronize();
+
+ cuda::memory::copy(&nModules,d_moduleStart.get(),sizeof(uint32_t));
+
+ uint32_t nclus[MaxNumModules], moduleId[nModules];
+
+ cuda::memory::copy(&nclus,d_clusInModule.get(),MaxNumModules*sizeof(uint32_t));
+ std::cout << "before charge cut found " << std::accumulate(nclus,nclus+MaxNumModules,0) << " clusters" << std::endl;
+ for (auto i=MaxNumModules; i>0; i--) if (nclus[i-1]>0) {std::cout << "last module is " << i-1 << ' ' << nclus[i-1] << std::endl; break;}
+ if (ncl!=std::accumulate(nclus,nclus+MaxNumModules,0)) std::cout << "ERROR!!!!! wrong number of clusters found" << std::endl;
+
+ cuda::launch(
+ clusterChargeCut,
+ { blocksPerGrid, threadsPerBlock },
+ d_id.get(), d_adc.get(),
+ d_moduleStart.get(),
+ d_clusInModule.get(), d_moduleId.get(),
+ d_clus.get(),
+ n
+ );
+
+
+ cudaDeviceSynchronize();
- cuda::memory::copy(&nModules,d_moduleStart.get(),sizeof(uint32_t));
std::cout << "found " << nModules << " Modules active" << std::endl;
- uint32_t nclus[MaxNumModules], moduleId[nModules];
+ cuda::memory::copy(h_id.get(), d_id.get(), size16);
cuda::memory::copy(h_clus.get(), d_clus.get(), size32);
cuda::memory::copy(&nclus,d_clusInModule.get(),MaxNumModules*sizeof(uint32_t));
cuda::memory::copy(&moduleId,d_moduleId.get(),nModules*sizeof(uint32_t));
- cuda::memory::copy(h_debug.get(), d_debug.get(), size32);
- auto p = std::minmax_element(h_debug.get(),h_debug.get()+n);
- std::cout << "debug " << *p.first << ' ' << *p.second << std::endl;
std::set clids;
- std::vector seeds;
for (int i=0; i=0);
assert(h_clus[i]0; i--) if (nclus[i-1]>0) {std::cout << "last module is " << i-1 << ' ' << nclus[i-1] << std::endl; break;}
// << " and " << seeds.size() << " seeds" << std::endl;
-
+ } /// end loop kkk
return 0;
}
diff --git a/RecoLocalTracker/SiPixelRecHits/BuildFile.xml b/RecoLocalTracker/SiPixelRecHits/BuildFile.xml
index c52545a601341..7918c7a4f4d9a 100644
--- a/RecoLocalTracker/SiPixelRecHits/BuildFile.xml
+++ b/RecoLocalTracker/SiPixelRecHits/BuildFile.xml
@@ -14,7 +14,8 @@
-
+
+
diff --git a/RecoLocalTracker/SiPixelRecHits/plugins/PixelRecHits.cu b/RecoLocalTracker/SiPixelRecHits/plugins/PixelRecHits.cu
index e2165471c3386..c63466f157a1b 100644
--- a/RecoLocalTracker/SiPixelRecHits/plugins/PixelRecHits.cu
+++ b/RecoLocalTracker/SiPixelRecHits/plugins/PixelRecHits.cu
@@ -8,7 +8,7 @@
// CMSSW headers
#include "RecoLocalTracker/SiPixelClusterizer/plugins/SiPixelRawToClusterGPUKernel.h"
#include "HeterogeneousCore/CUDAUtilities/interface/cudaCheck.h"
-#include "RecoLocalTracker/SiPixelClusterizer/plugins/gpuClustering.h"
+#include "RecoLocalTracker/SiPixelClusterizer/plugins/gpuClusteringConstants.h"
#include "PixelRecHits.h"
#include "gpuPixelRecHits.h"
@@ -34,7 +34,7 @@ namespace {
namespace pixelgpudetails {
PixelRecHitGPUKernel::PixelRecHitGPUKernel(cuda::stream_t<>& cudaStream) {
- constexpr auto MAX_HITS = gpuClustering::MaxNumModules * 256;
+ constexpr auto MAX_HITS = siPixelRecHitsHeterogeneousProduct::maxHits();
cudaCheck(cudaMalloc((void **) & gpu_.bs_d, 3 * sizeof(float)));
cudaCheck(cudaMalloc((void **) & gpu_.hitsLayerStart_d, 11 * sizeof(uint32_t)));
@@ -70,7 +70,8 @@ namespace pixelgpudetails {
gpu_.iphi_d = slicePitch(gpu_.owner_16bit_, gpu_.owner_16bit_pitch_, 3);
gpu_.sortIndex_d = slicePitch(gpu_.owner_16bit_, gpu_.owner_16bit_pitch_, 4);
- cudaCheck(cudaMalloc((void **) & gpu_.hist_d, 10 * sizeof(HitsOnGPU::Hist)));
+ cudaCheck(cudaMalloc((void **) & gpu_.hist_d, sizeof(HitsOnGPU::Hist)));
+ cudaCheck(cudaMalloc((void **) & gpu_.hws_d, 4*HitsOnGPU::Hist::totbins()));
cudaCheck(cudaMalloc((void **) & gpu_d, sizeof(HitsOnGPU)));
gpu_.me_d = gpu_d;
cudaCheck(cudaMemcpyAsync(gpu_d, &gpu_, sizeof(HitsOnGPU), cudaMemcpyDefault, cudaStream.id()));
@@ -109,6 +110,7 @@ namespace pixelgpudetails {
cudaCheck(cudaFree(gpu_.owner_32bit_));
cudaCheck(cudaFree(gpu_.owner_16bit_));
cudaCheck(cudaFree(gpu_.hist_d));
+ cudaCheck(cudaFree(gpu_.hws_d));
cudaCheck(cudaFree(gpu_d));
cudaCheck(cudaFree(d_phase1TopologyLayerStart_));
@@ -132,6 +134,10 @@ namespace pixelgpudetails {
int threadsPerBlock = 256;
int blocks = input.nModules; // active modules (with digis)
+
+#ifdef GPU_DEBUG
+ std::cout << "launching getHits kernel for " << blocks << " blocks" << std::endl;
+#endif
gpuPixelRecHits::getHits<<>>(
cpeParams,
gpu_.bs_d,
@@ -188,6 +194,6 @@ namespace pixelgpudetails {
// radixSortMultiWrapper<<<10, 256, 0, c.stream>>>(gpu_.iphi_d, gpu_.sortIndex_d, gpu_.hitsLayerStart_d);
}
- cudautils::fillManyFromVector(gpu_.hist_d, 10, gpu_.iphi_d, gpu_.hitsLayerStart_d, nhits_, 256, stream.id());
+ cudautils::fillManyFromVector(gpu_.hist_d, gpu_.hws_d, 10, gpu_.iphi_d, gpu_.hitsLayerStart_d, nhits_, 256, stream.id());
}
}
diff --git a/RecoLocalTracker/SiPixelRecHits/plugins/SiPixelRecHitHeterogeneous.cc b/RecoLocalTracker/SiPixelRecHits/plugins/SiPixelRecHitHeterogeneous.cc
index 6065fa3cb274a..f0c3b42b30eba 100644
--- a/RecoLocalTracker/SiPixelRecHits/plugins/SiPixelRecHitHeterogeneous.cc
+++ b/RecoLocalTracker/SiPixelRecHits/plugins/SiPixelRecHitHeterogeneous.cc
@@ -218,6 +218,7 @@ void SiPixelRecHitHeterogeneous::run(const edm::HandledetId();
@@ -234,7 +235,7 @@ void SiPixelRecHitHeterogeneous::run(const edm::Handle=96 && hoc.charge[fc+i]<4000) ) continue;
+ if( hoc.charge[fc+i]<2000 || (gind>=96 && hoc.charge[fc+i]<4000) ) { ++numberOfLostClusters; continue;}
ind[ngh]=i;std::push_heap(ind, ind+ngh+1,[&](auto a, auto b) { return mrp[a] MaxNumModules
-
-
- constexpr uint32_t MaxClusInModule = pixelCPEforGPU::MaxClusInModule;
-
- using ClusParams = pixelCPEforGPU::ClusParams;
-
-
__global__ void getHits(pixelCPEforGPU::ParamsOnGPU const * __restrict__ cpeParams,
float const * __restrict__ bs,
uint16_t const * __restrict__ id,
@@ -42,21 +33,44 @@ namespace gpuPixelRecHits {
float * xe, float * ye,
uint16_t * mr, uint16_t * mc)
{
+
+ // to be moved in common namespace...
+ constexpr uint16_t InvId=9999; // must be > MaxNumModules
+ constexpr uint32_t MaxClusInModule = pixelCPEforGPU::MaxClusInModule;
+
+ using ClusParams = pixelCPEforGPU::ClusParams;
+
+
// as usual one block per module
__shared__ ClusParams clusParams;
auto first = digiModuleStart[1 + blockIdx.x];
- auto me = id[first];
- assert(moduleId[blockIdx.x] == me);
+ auto me = moduleId[blockIdx.x];
auto nclus = clusInModule[me];
+ if (0==nclus) return;
+
+#ifdef GPU_DEBUG
+ if (threadIdx.x==0) {
+ auto k=first;
+ while (id[k]==InvId) ++k;
+ assert(id[k]==me);
+ }
+#endif
+
#ifdef GPU_DEBUG
if (me%100==1)
if (threadIdx.x==0) printf("hitbuilder: %d clusters in module %d. will write at %d\n", nclus, me, hitsModuleStart[me]);
#endif
assert(blockDim.x >= MaxClusInModule);
- assert(nclus <= MaxClusInModule);
+
+ if (threadIdx.x==0 && nclus > MaxClusInModule) {
+ printf("WARNING: too many clusters %d in Module %d. Only first %d processed\n", nclus,me,MaxClusInModule);
+ // zero charge: do not bother to do it in parallel
+ for (auto d=MaxClusInModule; d= nclus) continue;
atomicMin(&clusParams.minRow[clus[i]], x[i]);
atomicMax(&clusParams.maxRow[clus[i]], x[i]);
atomicMin(&clusParams.minCol[clus[i]], y[i]);
@@ -93,6 +107,7 @@ namespace gpuPixelRecHits {
for (int i = first; i < numElements; i += blockDim.x) {
if (id[i] == InvId) continue; // not valid
if (id[i] != me) break; // end of module
+ if (clus[i] >= nclus) continue;
atomicAdd(&clusParams.charge[clus[i]], adc[i]);
if (clusParams.minRow[clus[i]]==x[i]) atomicAdd(&clusParams.Q_f_X[clus[i]], adc[i]);
if (clusParams.maxRow[clus[i]]==x[i]) atomicAdd(&clusParams.Q_l_X[clus[i]], adc[i]);
diff --git a/RecoLocalTracker/SiPixelRecHits/plugins/siPixelRecHitsHeterogeneousProduct.h b/RecoLocalTracker/SiPixelRecHits/plugins/siPixelRecHitsHeterogeneousProduct.h
index 778b763b28cff..9d0fe7a279799 100644
--- a/RecoLocalTracker/SiPixelRecHits/plugins/siPixelRecHitsHeterogeneousProduct.h
+++ b/RecoLocalTracker/SiPixelRecHits/plugins/siPixelRecHitsHeterogeneousProduct.h
@@ -17,6 +17,9 @@ namespace siPixelRecHitsHeterogeneousProduct {
using CPUProduct = int; // dummy
+ static constexpr uint32_t maxHits() { return 65536;}
+ using hindex_type = uint16_t; // if above is <=2^16
+
struct HitsOnGPU{
pixelCPEforGPU::ParamsOnGPU const * cpeParams = nullptr; // forwarded from setup, NOT owned
float * bs_d;
@@ -32,8 +35,9 @@ namespace siPixelRecHitsHeterogeneousProduct {
uint16_t * mr_d;
uint16_t * mc_d;
- using Hist = HistoContainer;
+ using Hist = HistoContainer;
Hist * hist_d;
+ typename Hist::Counter * hws_d;
HitsOnGPU const * me_d = nullptr;
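
For reference (not part of the patch): the comment "if above is <=2^16" next to hindex_type can be expressed as a compile-time check; a minimal sketch of the guarantee that every hit index up to maxHits()-1 fits in the chosen type:

    #include <cstdint>
    #include <limits>

    static constexpr uint32_t maxHits() { return 65536; }
    using hindex_type = uint16_t;
    static_assert(maxHits() - 1 <= std::numeric_limits<hindex_type>::max(),
                  "hindex_type must be able to address every hit");

    int main() { return 0; }
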
diff --git a/RecoPixelVertexing/PixelTriplets/plugins/CAHitQuadrupletGeneratorGPU.cu b/RecoPixelVertexing/PixelTriplets/plugins/CAHitQuadrupletGeneratorGPU.cu
index 101095c5a9e4d..66d6d56286008 100644
--- a/RecoPixelVertexing/PixelTriplets/plugins/CAHitQuadrupletGeneratorGPU.cu
+++ b/RecoPixelVertexing/PixelTriplets/plugins/CAHitQuadrupletGeneratorGPU.cu
@@ -164,8 +164,8 @@ KernelLineFitAllHits(GPU::SimpleVector * foundNtuplets,
__global__ void
kernel_checkOverflows(GPU::SimpleVector *foundNtuplets,
- GPUCACell *cells, uint32_t const * nCells,
- GPU::VecArray< unsigned int, 256> *isOuterHitOfCell,
+ GPUCACell const * __restrict__ cells, uint32_t const * __restrict__ nCells,
+ GPU::VecArray< unsigned int, 256> const * __restrict__ isOuterHitOfCell,
uint32_t nHits, uint32_t maxNumberOfDoublets) {
auto idx = threadIdx.x + blockIdx.x * blockDim.x;
@@ -185,31 +185,36 @@ kernel_checkOverflows(GPU::SimpleVector *foundNtuplets,
}
-__global__ void
+__global__
+void
kernel_connect(GPU::SimpleVector *foundNtuplets,
- GPUCACell *cells, uint32_t const * nCells,
- GPU::VecArray< unsigned int, 256> *isOuterHitOfCell,
+ GPUCACell::Hits const * __restrict__ hhp,
+ GPUCACell * cells, uint32_t const * __restrict__ nCells,
+ GPU::VecArray< unsigned int, 256> const * __restrict__ isOuterHitOfCell,
float ptmin,
float region_origin_radius, const float thetaCut,
const float phiCut, const float hardPtCut,
unsigned int maxNumberOfDoublets_, unsigned int maxNumberOfHits_) {
- float region_origin_x = 0.;
- float region_origin_y = 0.;
+ auto const & hh = *hhp;
+
+ constexpr float region_origin_x = 0.;
+ constexpr float region_origin_y = 0.;
auto cellIndex = threadIdx.x + blockIdx.x * blockDim.x;
if (0==cellIndex) foundNtuplets->reset(); // ready for next kernel
if (cellIndex >= (*nCells) ) return;
- auto &thisCell = cells[cellIndex];
+ auto const & thisCell = cells[cellIndex];
auto innerHitId = thisCell.get_inner_hit_id();
auto numberOfPossibleNeighbors = isOuterHitOfCell[innerHitId].size();
+ auto vi = isOuterHitOfCell[innerHitId].data();
for (auto j = 0; j < numberOfPossibleNeighbors; ++j) {
- auto otherCell = isOuterHitOfCell[innerHitId][j];
+ auto otherCell = __ldg(vi+j);
- if (thisCell.check_alignment_and_tag(
- cells, otherCell, ptmin, region_origin_x, region_origin_y,
+ if (thisCell.check_alignment(hh,
+ cells[otherCell], ptmin, region_origin_x, region_origin_y,
region_origin_radius, thetaCut, phiCut, hardPtCut)
) {
cells[otherCell].theOuterNeighbors.push_back(cellIndex);
@@ -218,7 +223,7 @@ kernel_connect(GPU::SimpleVector *foundNtuplets,
}
__global__ void kernel_find_ntuplets(
- GPUCACell *cells, uint32_t const * nCells,
+ GPUCACell * const __restrict__ cells, uint32_t const * nCells,
GPU::SimpleVector *foundNtuplets,
unsigned int minHitsPerNtuplet,
unsigned int maxNumberOfDoublets_)
@@ -228,7 +233,7 @@ __global__ void kernel_find_ntuplets(
if (cellIndex >= (*nCells) ) return;
auto &thisCell = cells[cellIndex];
if (thisCell.theLayerPairId!=0 && thisCell.theLayerPairId!=3 && thisCell.theLayerPairId!=8) return; // inner layer is 0 FIXME
- GPU::VecArray stack;
+ GPU::VecArray stack;
stack.reset();
thisCell.find_ntuplets(cells, foundNtuplets, stack, minHitsPerNtuplet);
assert(stack.size()==0);
@@ -336,9 +341,11 @@ void CAHitQuadrupletGeneratorGPU::launchKernels(const TrackingRegion ®ion,
auto nhits = hh.nHits;
assert(nhits <= PixelGPUConstants::maxNumberOfHits);
- auto numberOfBlocks = (maxNumberOfDoublets_ + 512 - 1)/512;
- kernel_connect<<>>(
+ auto blockSize = 64;
+ auto numberOfBlocks = (maxNumberOfDoublets_ + blockSize - 1)/blockSize;
+ kernel_connect<<>>(
d_foundNtupletsVec_[regionIndex], // needed only to be reset, ready for next kernel
+ hh.gpu_d,
device_theCells_, device_nCells_,
device_isOuterHitOfCell_,
region.ptMin(),
@@ -347,21 +354,20 @@ void CAHitQuadrupletGeneratorGPU::launchKernels(const TrackingRegion ®ion,
);
cudaCheck(cudaGetLastError());
- kernel_find_ntuplets<<>>(
+ kernel_find_ntuplets<<>>(
device_theCells_, device_nCells_,
d_foundNtupletsVec_[regionIndex],
4, maxNumberOfDoublets_);
cudaCheck(cudaGetLastError());
-
- numberOfBlocks = (std::max(int(nhits), maxNumberOfDoublets_) + 512 - 1)/512;
- kernel_checkOverflows<<>>(
+ numberOfBlocks = (std::max(int(nhits), maxNumberOfDoublets_) + blockSize - 1)/blockSize;
+ kernel_checkOverflows<<>>(
d_foundNtupletsVec_[regionIndex],
device_theCells_, device_nCells_,
device_isOuterHitOfCell_, nhits,
maxNumberOfDoublets_
);
-
+ cudaCheck(cudaGetLastError());
// kernel_print_found_ntuplets<<<1, 1, 0, cudaStream>>>(d_foundNtupletsVec_[regionIndex], 10);
@@ -383,7 +389,7 @@ void CAHitQuadrupletGeneratorGPU::launchKernels(const TrackingRegion ®ion,
line_fit_resultsGPU_);
cudaCheck(cudaGetLastError());
- if(transferToCPU) {
+ if (transferToCPU) {
cudaCheck(cudaMemcpyAsync(h_foundNtupletsVec_[regionIndex], d_foundNtupletsVec_[regionIndex],
sizeof(GPU::SimpleVector),
cudaMemcpyDeviceToHost, cudaStream));
diff --git a/RecoPixelVertexing/PixelTriplets/plugins/CAHitQuadrupletGeneratorGPU.h b/RecoPixelVertexing/PixelTriplets/plugins/CAHitQuadrupletGeneratorGPU.h
index 589ae994132ad..fdceec9225d76 100644
--- a/RecoPixelVertexing/PixelTriplets/plugins/CAHitQuadrupletGeneratorGPU.h
+++ b/RecoPixelVertexing/PixelTriplets/plugins/CAHitQuadrupletGeneratorGPU.h
@@ -23,6 +23,8 @@
#include "RecoPixelVertexing/PixelTriplets/plugins/RecHitsMap.h"
#include "RecoPixelVertexing/PixelTrackFitting/interface/RiemannFit.h"
+
+// FIXME (split header???)
#include "GPUCACell.h"
class TrackingRegion;
@@ -38,6 +40,7 @@ class CAHitQuadrupletGeneratorGPU {
using HitsOnGPU = siPixelRecHitsHeterogeneousProduct::HitsOnGPU;
using HitsOnCPU = siPixelRecHitsHeterogeneousProduct::HitsOnCPU;
+ using hindex_type = siPixelRecHitsHeterogeneousProduct::hindex_type;
static constexpr unsigned int minLayers = 4;
typedef OrderedHitSeeds ResultType;
diff --git a/RecoPixelVertexing/PixelTriplets/plugins/GPUCACell.h b/RecoPixelVertexing/PixelTriplets/plugins/GPUCACell.h
index 5995d286fc38d..772b802282d31 100644
--- a/RecoPixelVertexing/PixelTriplets/plugins/GPUCACell.h
+++ b/RecoPixelVertexing/PixelTriplets/plugins/GPUCACell.h
@@ -6,49 +6,49 @@
#include
-#include "GPUHitsAndDoublets.h"
#include "RecoLocalTracker/SiPixelRecHits/plugins/siPixelRecHitsHeterogeneousProduct.h"
#include "HeterogeneousCore/CUDAUtilities/interface/GPUVecArray.h"
#include "HeterogeneousCore/CUDAUtilities/interface/GPUSimpleVector.h"
struct Quadruplet {
- int hitId[4];
+ using hindex_type = siPixelRecHitsHeterogeneousProduct::hindex_type;
+ hindex_type hitId[4];
};
+
class GPUCACell {
public:
+
+ using Hits = siPixelRecHitsHeterogeneousProduct::HitsOnGPU;
+ using hindex_type = siPixelRecHitsHeterogeneousProduct::hindex_type;
+
GPUCACell() = default;
+#ifdef __CUDACC__
- __host__ __device__
- void init(siPixelRecHitsHeterogeneousProduct::HitsOnGPU const & hh,
- int layerPairId, int doubletId, int innerHitId,int outerHitId)
+ __device__ __forceinline__
+ void init(Hits const & hh,
+ int layerPairId, int doubletId,
+ hindex_type innerHitId, hindex_type outerHitId)
{
theInnerHitId = innerHitId;
theOuterHitId = outerHitId;
theDoubletId = doubletId;
theLayerPairId = layerPairId;
- theInnerX = hh.xg_d[innerHitId];
- theOuterX = hh.xg_d[outerHitId];
-
- theInnerY = hh.yg_d[innerHitId];
- theOuterY = hh.yg_d[outerHitId];
-
- theInnerZ = hh.zg_d[innerHitId];
- theOuterZ = hh.zg_d[outerHitId];
- theInnerR = hh.rg_d[innerHitId];
- theOuterR = hh.rg_d[outerHitId];
+ theInnerZ = __ldg(hh.zg_d+innerHitId);
+ theInnerR = __ldg(hh.rg_d+innerHitId);
theOuterNeighbors.reset();
}
- constexpr float get_inner_x() const { return theInnerX; }
- constexpr float get_outer_x() const { return theOuterX; }
- constexpr float get_inner_y() const { return theInnerY; }
- constexpr float get_outer_y() const { return theOuterY; }
- constexpr float get_inner_z() const { return theInnerZ; }
- constexpr float get_outer_z() const { return theOuterZ; }
- constexpr float get_inner_r() const { return theInnerR; }
- constexpr float get_outer_r() const { return theOuterR; }
+ __device__ __forceinline__ float get_inner_x(Hits const & hh) const { return __ldg(hh.xg_d+theInnerHitId); }
+ __device__ __forceinline__ float get_outer_x(Hits const & hh) const { return __ldg(hh.xg_d+theOuterHitId); }
+ __device__ __forceinline__ float get_inner_y(Hits const & hh) const { return __ldg(hh.yg_d+theInnerHitId); }
+ __device__ __forceinline__ float get_outer_y(Hits const & hh) const { return __ldg(hh.yg_d+theOuterHitId); }
+ __device__ __forceinline__ float get_inner_z(Hits const & hh) const { return theInnerZ; } // { return __ldg(hh.zg_d+theInnerHitId); } // { return theInnerZ; }
+ __device__ __forceinline__ float get_outer_z(Hits const & hh) const { return __ldg(hh.zg_d+theOuterHitId); }
+ __device__ __forceinline__ float get_inner_r(Hits const & hh) const { return theInnerR; } // { return __ldg(hh.rg_d+theInnerHitId); } // { return theInnerR; }
+ __device__ __forceinline__ float get_outer_r(Hits const & hh) const { return __ldg(hh.rg_d+theOuterHitId); }
+
constexpr unsigned int get_inner_hit_id() const {
return theInnerHitId;
}
@@ -56,37 +56,42 @@ class GPUCACell {
return theOuterHitId;
}
- constexpr void print_cell() const {
+
+ __device__
+ void print_cell() const {
printf("printing cell: %d, on layerPair: %d, innerHitId: %d, outerHitId: "
"%d, innerradius %f, outerRadius %f \n",
- theDoubletId, theLayerPairId, theInnerHitId, theOuterHitId,
- theInnerR, theOuterR);
+ theDoubletId, theLayerPairId, theInnerHitId, theOuterHitId
+ );
}
- __host__ __device__
- bool check_alignment_and_tag(
- const GPUCACell *cells, unsigned int innerCellId, const float ptmin,
+
+ __device__
+ bool check_alignment(Hits const & hh,
+ GPUCACell const & otherCell, const float ptmin,
const float region_origin_x, const float region_origin_y,
const float region_origin_radius, const float thetaCut,
- const float phiCut, const float hardPtCut)
+ const float phiCut, const float hardPtCut) const
{
- auto ro = get_outer_r();
- auto zo = get_outer_z();
- const auto &otherCell = cells[innerCellId];
+ auto ri = get_inner_r(hh);
+ auto zi = get_inner_z(hh);
+
+ auto ro = get_outer_r(hh);
+ auto zo = get_outer_z(hh);
- auto r1 = otherCell.get_inner_r();
- auto z1 = otherCell.get_inner_z();
- bool aligned = areAlignedRZ(r1, z1, ro, zo, ptmin, thetaCut);
+ auto r1 = otherCell.get_inner_r(hh);
+ auto z1 = otherCell.get_inner_z(hh);
+ bool aligned = areAlignedRZ(r1, z1, ri, zi, ro, zo, ptmin, thetaCut);
return (aligned &&
- haveSimilarCurvature(cells, innerCellId, ptmin, region_origin_x,
+ haveSimilarCurvature(hh, otherCell, ptmin, region_origin_x,
region_origin_y, region_origin_radius, phiCut,
hardPtCut));
}
-
- constexpr bool areAlignedRZ(float r1, float z1, float ro, float zo,
+ __device__ __forceinline__
+ static bool areAlignedRZ(float r1, float z1, float ri, float zi, float ro, float zo,
const float ptmin,
- const float thetaCut) const {
+ const float thetaCut) {
float radius_diff = std::abs(r1 - ro);
float distance_13_squared =
radius_diff * radius_diff + (z1 - zo) * (z1 - zo);
@@ -96,27 +101,26 @@ class GPUCACell {
// radius_diff later
float tan_12_13_half_mul_distance_13_squared =
- fabs(z1 * (get_inner_r() - ro) + get_inner_z() * (ro - r1) + zo * (r1 - get_inner_r()));
+ fabs(z1 * (ri - ro) + zi * (ro - r1) + zo * (r1 - ri));
return tan_12_13_half_mul_distance_13_squared * pMin <= thetaCut * distance_13_squared * radius_diff;
}
- constexpr bool
- haveSimilarCurvature(const GPUCACell *cells, unsigned int innerCellId,
+ __device__
+ bool
+ haveSimilarCurvature(Hits const & hh, GPUCACell const & otherCell,
const float ptmin, const float region_origin_x,
const float region_origin_y,
const float region_origin_radius, const float phiCut,
const float hardPtCut) const {
- const auto &otherCell = cells[innerCellId];
-
- auto x1 = otherCell.get_inner_x();
- auto y1 = otherCell.get_inner_y();
+ auto x1 = otherCell.get_inner_x(hh);
+ auto y1 = otherCell.get_inner_y(hh);
- auto x2 = get_inner_x();
- auto y2 = get_inner_y();
+ auto x2 = get_inner_x(hh);
+ auto y2 = get_inner_y(hh);
- auto x3 = get_outer_x();
- auto y3 = get_outer_y();
+ auto x3 = get_outer_x(hh);
+ auto y3 = get_outer_y(hh);
float distance_13_squared = (x1 - x3) * (x1 - x3) + (y1 - y3) * (y1 - y3);
float tan_12_13_half_mul_distance_13_squared =
@@ -139,7 +143,7 @@ class GPUCACell {
return distance_13_beamspot_squared <
(region_origin_radius + phiCut) * (region_origin_radius + phiCut);
- }
+ }
// 87 cm/GeV = 1/(3.8T * 0.3)
@@ -186,13 +190,13 @@ class GPUCACell {
// trying to free the track-building process from hardcoded layers, leaving
// the traversal of the graph to the neighborhood connections between cells.
-#ifdef __CUDACC__
+// #ifdef __CUDACC__
__device__
inline void find_ntuplets(
- const GPUCACell *cells,
+ GPUCACell const * __restrict__ cells,
GPU::SimpleVector *foundNtuplets,
- GPU::VecArray &tmpNtuplet,
+ GPU::VecArray &tmpNtuplet,
const unsigned int minHitsPerNtuplet) const
{
// the building process for a track ends if:
@@ -231,16 +235,10 @@ class GPUCACell {
int theLayerPairId;
private:
- unsigned int theInnerHitId;
- unsigned int theOuterHitId;
- float theInnerX;
- float theOuterX;
- float theInnerY;
- float theOuterY;
float theInnerZ;
- float theOuterZ;
float theInnerR;
- float theOuterR;
+ hindex_type theInnerHitId;
+ hindex_type theOuterHitId;
};
#endif // RecoPixelVertexing_PixelTriplets_plugins_GPUCACell_h
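Note on the GPUCACell change above: the cell no longer caches all of the hit coordinates; it keeps only the two hit indices plus the inner z and r, and reads everything else on demand through __ldg, which routes the load via the read-only (texture) cache path. Below is a minimal sketch of that pattern, not the actual CMSSW class: HitsSoA, CellSketch and the accessor names are illustrative, and hindex_type is assumed to be a 16-bit index here.

```cuda
#include <cstdint>
#include <cuda_runtime.h>

// Illustrative structure-of-arrays view of the hits in global memory
// (read-only while doublets and ntuplets are being built).
struct HitsSoA {
  float const *xg_d;
  float const *yg_d;
  float const *zg_d;
  float const *rg_d;
};

struct CellSketch {
  using hindex_type = uint16_t;  // compact hit index (assumption for this sketch)

  __device__ void init(HitsSoA const &hh, hindex_type inner, hindex_type outer) {
    theInnerHitId = inner;
    theOuterHitId = outer;
    // cache only the values reused many times per cell
    theInnerZ = __ldg(hh.zg_d + inner);
    theInnerR = __ldg(hh.rg_d + inner);
  }

  // everything else is re-read on demand through the read-only cache
  __device__ float outer_x(HitsSoA const &hh) const { return __ldg(hh.xg_d + theOuterHitId); }
  __device__ float outer_y(HitsSoA const &hh) const { return __ldg(hh.yg_d + theOuterHitId); }
  __device__ float outer_z(HitsSoA const &hh) const { return __ldg(hh.zg_d + theOuterHitId); }
  __device__ float outer_r(HitsSoA const &hh) const { return __ldg(hh.rg_d + theOuterHitId); }

  float theInnerZ;
  float theInnerR;
  hindex_type theInnerHitId;
  hindex_type theOuterHitId;
};
```

The effect is visible in the private section of the diff: six cached floats disappear, so each cell in global memory shrinks considerably, at the cost of re-reading coordinates that are only touched once or twice per cell.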
diff --git a/RecoPixelVertexing/PixelTriplets/plugins/gpuPixelDoublets.h b/RecoPixelVertexing/PixelTriplets/plugins/gpuPixelDoublets.h
index 31844f39f9727..61d048637585c 100644
--- a/RecoPixelVertexing/PixelTriplets/plugins/gpuPixelDoublets.h
+++ b/RecoPixelVertexing/PixelTriplets/plugins/gpuPixelDoublets.h
@@ -20,12 +20,13 @@ namespace gpuPixelDoublets {
template
__device__
+ __forceinline__
void doubletsFromHisto(uint8_t const * __restrict__ layerPairs,
uint32_t nPairs,
GPUCACell * cells,
uint32_t * nCells,
int16_t const * __restrict__ iphi,
- Hist const * __restrict__ hist,
+ Hist const & __restrict__ hist,
uint32_t const * __restrict__ offsets,
siPixelRecHitsHeterogeneousProduct::HitsOnGPU const & __restrict__ hh,
GPU::VecArray< unsigned int, 256> * isOuterHitOfCell,
@@ -63,6 +64,8 @@ namespace gpuPixelDoublets {
uint8_t outer = layerPairs[2*pairLayerId+1];
assert(outer > inner);
+ auto hoff = Hist::histOff(outer);
+
auto i = (0 == pairLayerId) ? j : j-innerLayerCumulativeSize[pairLayerId-1];
i += offsets[inner];
@@ -73,8 +76,8 @@ namespace gpuPixelDoublets {
// found hit corresponding to our cuda thread, now do the job
auto mep = iphi[i];
- auto mez = hh.zg_d[i];
- auto mer = hh.rg_d[i];
+ auto mez = __ldg(hh.zg_d+i);
+ auto mer = __ldg(hh.rg_d+i);
constexpr float z0cut = 12.f; // cm
constexpr float hardPtCut = 0.5f; // GeV
@@ -83,13 +86,13 @@ namespace gpuPixelDoublets {
auto ptcut = [&](int j) {
auto r2t4 = minRadius2T4;
auto ri = mer;
- auto ro = hh.rg_d[j];
+ auto ro = __ldg(hh.rg_d+j);
auto dphi = short2phi( min( abs(int16_t(mep-iphi[j])), abs(int16_t(iphi[j]-mep)) ) );
return dphi*dphi * (r2t4 - ri*ro) > (ro-ri)*(ro-ri);
};
auto z0cutoff = [&](int j) {
- auto zo = hh.zg_d[j];
- auto ro = hh.rg_d[j];
+ auto zo = __ldg(hh.zg_d+j);
+ auto ro = __ldg(hh.rg_d+j);
auto dr = ro-mer;
return dr > maxr[pairLayerId] ||
dr<0 || std::abs((mez*ro - mer*zo)) > z0cut*dr;
@@ -97,8 +100,8 @@ namespace gpuPixelDoublets {
auto iphicut = phicuts[pairLayerId];
- auto kl = hist[outer].bin(int16_t(mep-iphicut));
- auto kh = hist[outer].bin(int16_t(mep+iphicut));
+ auto kl = Hist::bin(int16_t(mep-iphicut));
+ auto kh = Hist::bin(int16_t(mep+iphicut));
auto incr = [](auto & k) { return k = (k+1) % Hist::nbins();};
int tot = 0;
int nmin = 0;
@@ -108,9 +111,11 @@ namespace gpuPixelDoublets {
int tooMany=0;
for (auto kk = kl; kk != khh; incr(kk)) {
if (kk != kl && kk != kh)
- nmin += hist[outer].size(kk);
- for (auto p = hist[outer].begin(kk); p < hist[outer].end(kk); ++p) {
- auto oi=*p;
+ nmin += hist.size(kk+hoff);
+ auto const * __restrict__ p = hist.begin(kk+hoff);
+ auto const * __restrict__ e = hist.end(kk+hoff);
+ for (;p < e; ++p) {
+ auto oi=__ldg(p);
assert(oi>=offsets[outer]);
assert(oi 0)
printf("OuterHitOfCell full for %d in layer %d/%d, %d:%d %d,%d\n", i, inner, outer, kl, kh, nmin, tot);
- if (hist[outer].nspills > 0)
- printf("spill bin to be checked in %d %d\n", outer, hist[outer].nspills);
-
- // if (0==hist[outer].nspills) assert(tot>=nmin);
- // look in spill bin as well....
-
} // loop in block...
}
constexpr auto getDoubletsFromHistoMaxBlockSize = 64;
+ constexpr auto getDoubletsFromHistoMinBlocksPerMP = 16;
__global__
- __launch_bounds__(getDoubletsFromHistoMaxBlockSize)
+ __launch_bounds__(getDoubletsFromHistoMaxBlockSize,getDoubletsFromHistoMinBlocksPerMP)
void getDoubletsFromHisto(GPUCACell * cells,
uint32_t * nCells,
siPixelRecHitsHeterogeneousProduct::HitsOnGPU const * __restrict__ hhp,
@@ -184,7 +184,7 @@ namespace gpuPixelDoublets {
auto const & __restrict__ hh = *hhp;
doubletsFromHisto(layerPairs, nPairs, cells, nCells,
- hh.iphi_d, hh.hist_d, hh.hitsLayerStart_d,
+ hh.iphi_d, *hh.hist_d, hh.hitsLayerStart_d,
hh, isOuterHitOfCell,
phicuts, minz, maxz, maxr);
}
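Note on the doublet finder above: all layers now share a single histogram, addressed with a per-layer offset (Hist::histOff(outer)), and the phi search window is walked as a circular range of bins, since the 16-bit phi value wraps at ±π. Below is a small sketch of that wrap-around scan with a simplified stand-in for HistoContainer; the bin count and bin mapping are illustrative.

```cuda
#include <cstdint>

// Simplified stand-in for the phi histogram used above.
struct Hist16 {
  __host__ __device__ static constexpr uint32_t nbins() { return 128; }
  // Map a 16-bit integer phi onto [0, nbins): the top bits select the bin,
  // so the mapping wraps naturally at +/- pi.
  __host__ __device__ static constexpr uint32_t bin(int16_t iphi) {
    return uint16_t(iphi) >> (16 - 7);  // 7 = log2(nbins())
  }
};

// Visit every bin in the window [mep - cut, mep + cut], including the wrap.
template <typename Visitor>
__device__ void scanPhiWindow(int16_t mep, int16_t cut, Visitor visit) {
  auto kl = Hist16::bin(int16_t(mep - cut));
  auto kh = Hist16::bin(int16_t(mep + cut));
  auto incr = [](uint32_t &k) { return k = (k + 1) % Hist16::nbins(); };
  auto khh = kh;
  incr(khh);                     // one past the last bin
  for (auto kk = kl; kk != khh; incr(kk))
    visit(kk);                   // the caller adds the per-layer offset, e.g. Hist::histOff(outer)
}
```

With kl == kh the loop still visits exactly one bin; when the window straddles the wrap, kl > kh and the modulo increment handles it.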
diff --git a/RecoPixelVertexing/PixelVertexFinding/BuildFile.xml b/RecoPixelVertexing/PixelVertexFinding/BuildFile.xml
index 5a3208da2c87f..43261b0417410 100644
--- a/RecoPixelVertexing/PixelVertexFinding/BuildFile.xml
+++ b/RecoPixelVertexing/PixelVertexFinding/BuildFile.xml
@@ -20,4 +20,4 @@
-
+
diff --git a/RecoPixelVertexing/PixelVertexFinding/interface/pixelVertexHeterogeneousProduct.h b/RecoPixelVertexing/PixelVertexFinding/interface/pixelVertexHeterogeneousProduct.h
index fcb19f855a9ba..ff3624cdafd65 100644
--- a/RecoPixelVertexing/PixelVertexFinding/interface/pixelVertexHeterogeneousProduct.h
+++ b/RecoPixelVertexing/PixelVertexFinding/interface/pixelVertexHeterogeneousProduct.h
@@ -16,6 +16,7 @@ namespace pixelVertexHeterogeneousProduct {
float * z_d;
float * zerr_d;
float * chi2_d;
+ uint16_t * sortInd;
int32_t * ivtx_d; // this should be indexed with the original tracks, not the reduced set (oops)
};
@@ -31,6 +32,7 @@ namespace pixelVertexHeterogeneousProduct {
{ }
std::vector> z,zerr, chi2;
+ std::vector> sortInd;
std::vector> ivtx;
uint32_t nVertices=0;
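The product now carries the pt² ordering on both sides: a device array of 16-bit indices written by the sorting kernel, mirrored by a host vector filled in fillResults. A 16-bit index is sufficient because the number of vertices is bounded by MAXVTX (1024 in gpuVertexFinder.h). A minimal sketch of that invariant and of the mirrored field follows; the struct names here are illustrative, not the actual product types.

```cuda
#include <cstdint>
#include <limits>
#include <vector>

constexpr uint32_t MAXVTX = 1024;  // bound from gpuVertexFinder.h
static_assert(MAXVTX - 1 <= std::numeric_limits<uint16_t>::max(),
              "sortInd must be able to address every vertex");

struct GpuSideSketch {   // raw device pointers, filled by the kernels
  uint16_t *sortInd;     // sortInd[j] = index of the vertex with the j-th smallest summed pt^2
};

struct CpuSideSketch {   // host mirror, filled by fillResults via cudaMemcpyAsync
  std::vector<uint16_t> sortInd;
};
```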
diff --git a/RecoPixelVertexing/PixelVertexFinding/src/PixelVertexHeterogeneousProducer.cc b/RecoPixelVertexing/PixelVertexFinding/src/PixelVertexHeterogeneousProducer.cc
index 3451655a282d1..0b8c31235abea 100644
--- a/RecoPixelVertexing/PixelVertexFinding/src/PixelVertexHeterogeneousProducer.cc
+++ b/RecoPixelVertexing/PixelVertexFinding/src/PixelVertexHeterogeneousProducer.cc
@@ -117,18 +117,20 @@ void PixelVertexHeterogeneousProducer::acquireGPUCuda(
// Second, make a collection of pointers to the tracks we want for the vertex finder
// fill z,ez
- std::vector z,ez;
+ std::vector z,ez2,pt2;
assert(m_trks.empty());
for (unsigned int i=0; ipt() > 20 ? 20*20 : m_trks[k]->pt()*m_trks[k]->pt();
- }
- // sort
- std::sort(ind,ind+gpuProduct.nVertices,[&](int i, int j){ return pt2[i]>pt2[j];});
- if(gpuProduct.nVertices>1) assert(pt2[ind[0]]>=pt2[ind[1]]);
// fill legacy data format
- for (unsigned int j=0; j uind; // for verifying index consistency
+ for (int j=int(gpuProduct.nVertices)-1; j>=0; --j) {
+ auto i = gpuProduct.sortInd[j]; // on the GPU these are sorted in ascending order
+ assert(i>=0);
+ assert(i0);
(*vertexes).emplace_back(reco::Vertex::Point(x,y,z), err, gpuProduct.chi2[i], nt-1, nt );
auto & v = (*vertexes).back();
- pt2[i]=0;
for (auto k: itrk) {
v.add(reco::TrackBaseRef(m_trks[k]));
}
itrk.clear();
}
+ assert(uind.size()==(*vertexes).size());
+ if (!uind.empty()) {
+ assert(0 == *uind.begin());
+ assert(uind.size()-1 == *uind.rbegin());
+ }
-
if (verbose_) {
edm::LogInfo("PixelVertexHeterogeneousProducer") << ": Found " << vertexes->size() << " vertexes\n";
for (unsigned int i=0; i<vertexes->size(); ++i) {
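The rewritten loop above is the host-side consumer of sortInd: the GPU sorts vertices in ascending summed pt², so the producer walks the index array backwards to emit the legacy collection in the usual decreasing-pt² order, and collects the indices in a std::set to verify that each vertex appears exactly once. Below is a self-contained sketch of that consumption pattern, with the vertex content reduced to a pt² value for brevity; the function name is illustrative.

```cuda
#include <cassert>
#include <cstdint>
#include <cstdio>
#include <set>
#include <vector>

// Emit vertices in decreasing pt^2 order from an ascending sortInd, checking index consistency.
void emitSorted(std::vector<float> const &ptv2, std::vector<uint16_t> const &sortInd) {
  std::set<uint16_t> seen;                       // for verifying that each vertex is used exactly once
  for (int j = int(sortInd.size()) - 1; j >= 0; --j) {
    auto i = sortInd[j];                         // the GPU sort is ascending, so walk backwards
    assert(i < ptv2.size());
    seen.insert(i);
    std::printf("vertex %d  pt2 %f\n", int(i), double(ptv2[i]));
  }
  assert(seen.size() == sortInd.size());         // no duplicates
  if (!seen.empty()) {
    assert(*seen.begin() == 0);                  // indices cover 0 .. nVertices-1
    assert(*seen.rbegin() == sortInd.size() - 1);
  }
}
```

If the GPU and host views ever disagree (for example, nVertices changing between kernels and copies), these asserts fire, which is exactly what the producer's uind checks are there for.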
diff --git a/RecoPixelVertexing/PixelVertexFinding/src/gpuClusterTracks.h b/RecoPixelVertexing/PixelVertexFinding/src/gpuClusterTracks.h
index 7c97665ece061..ae17401d06625 100644
--- a/RecoPixelVertexing/PixelVertexFinding/src/gpuClusterTracks.h
+++ b/RecoPixelVertexing/PixelVertexFinding/src/gpuClusterTracks.h
@@ -7,10 +7,51 @@
#include
#include "HeterogeneousCore/CUDAUtilities/interface/HistoContainer.h"
+#include "HeterogeneousCore/CUDAUtilities/interface/radixSort.h"
+
#include "gpuVertexFinder.h"
namespace gpuVertexFinder {
+
+ __global__
+ void sortByPt2(int nt,
+ OnGPU * pdata
+ ) {
+ auto & __restrict__ data = *pdata;
+ float const * __restrict__ ptt2 = data.ptt2;
+ uint32_t const & nv = *data.nv;
+
+ int32_t const * __restrict__ iv = data.iv;
+ float * __restrict__ ptv2 = data.ptv2;
+ uint16_t * __restrict__ sortInd = data.sortInd;
+
+ if (nv<1) return;
+
+ // can be done asynchronously at the end of the previous event
+ for (int i = threadIdx.x; i < nv; i += blockDim.x) {
+ ptv2[i]=0;
+ }
+ __syncthreads();
+
+
+ for (int i = threadIdx.x; i < nt; i += blockDim.x) {
+ if (iv[i]>9990) continue;
+ atomicAdd(&ptv2[iv[i]], ptt2[i]);
+ }
+ __syncthreads();
+
+ if (1==nv) {
+ if (threadIdx.x==0) sortInd[0]=0;
+ return;
+ }
+ __shared__ uint16_t ws[1024];
+ radixSort(ptv2,sortInd,ws,nv);
+
+ assert(ptv2[sortInd[nv-1]]>=ptv2[sortInd[nv-2]]);
+ assert(ptv2[sortInd[1]]>=ptv2[sortInd[0]]);
+ }
+
// this algo does not really scale as it works in a single block...
// enough for <10K tracks we have
@@ -24,6 +65,9 @@ namespace gpuVertexFinder {
) {
constexpr bool verbose = false; // in principle the compiler should optimize this out if false
+
+
+ if(verbose && 0==threadIdx.x) printf("params %d %f\n",minT,eps);
auto er2mx = errmax*errmax;
@@ -35,42 +79,49 @@ namespace gpuVertexFinder {
float * __restrict__ chi2 = data.chi2;
uint32_t & nv = *data.nv;
- int8_t * __restrict__ izt = data.izt;
+ uint8_t * __restrict__ izt = data.izt;
int32_t * __restrict__ nn = data.nn;
int32_t * __restrict__ iv = data.iv;
assert(pdata);
assert(zt);
- __shared__ HistoContainer hist;
-
- // if(0==threadIdx.x) printf("params %d %f\n",minT,eps);
- // if(0==threadIdx.x) printf("booked hist with %d bins, size %d for %d tracks\n",hist.nbins(),hist.binSize(),nt);
-
- // zero hist
- hist.nspills = 0;
- for (auto k = threadIdx.x; k;
+ constexpr auto wss = Hist::totbins();
+ __shared__ Hist hist;
+ __shared__ typename Hist::Counter ws[wss];
+ for (auto j=threadIdx.x; j= 0);
+ assert(iz-INT8_MIN < 256);
+ hist.count(izt[i]);
iv[i]=i;
nn[i]=0;
}
__syncthreads();
-
- // if(0==threadIdx.x) printf("histo filled %d\n",hist.nspills);
- if(0==threadIdx.x && hist.fullSpill()) printf("histo overflow\n");
-
+ hist.finalize(ws);
+ __syncthreads();
+ assert(hist.size()==nt);
+ if (threadIdx.x<32) ws[threadIdx.x]=0; // used by prefix scan...
+ __syncthreads();
+ for (int i = threadIdx.x; i < nt; i += blockDim.x) {
+ hist.fill(izt[i],uint16_t(i),ws);
+ }
+ __syncthreads();
+
+
// count neighbours
for (int i = threadIdx.x; i < nt; i += blockDim.x) {
if (ezt2[i]>er2mx) continue;
@@ -87,20 +138,20 @@ namespace gpuVertexFinder {
__syncthreads();
- // if(0==threadIdx.x) printf("nn counted\n");
-
// cluster seeds only
bool more = true;
while (__syncthreads_or(more)) {
more=false;
- for (int i = threadIdx.x; i < nt; i += blockDim.x) {
+ for (int k = threadIdx.x; k < hist.size(); k += blockDim.x) {
+ auto p = hist.begin()+k;
+ auto i = (*p);
+ auto be = std::min(Hist::bin(izt[i])+1,int(hist.nbins()-1));
if (nn[i]eps) return;
+ auto dist = std::abs(zt[i]-zt[j]);
+ if (dist>eps) return;
if (dist*dist>chi2max*(ezt2[i]+ezt2[j])) return;
auto old = atomicMin(&iv[j], iv[i]);
if (old != iv[i]) {
@@ -109,8 +160,8 @@ namespace gpuVertexFinder {
}
atomicMin(&iv[i], old);
};
-
- forEachInBins(hist,izt[i],1,loop);
+ ++p;
+ for (;p>>(ntrks,onGPU_d,minT,eps,errmax,chi2max);
-
+ clusterTracks<<<1,1024-256,0,stream>>>(ntrks,onGPU_d,minT,eps,errmax,chi2max);
+ cudaCheck(cudaGetLastError());
+ sortByPt2<<<1,256,0,stream>>>(ntrks,onGPU_d);
+ cudaCheck(cudaGetLastError());
+
cudaCheck(cudaMemcpyAsync(&gpuProduct.nVertices, onGPU.nv, sizeof(uint32_t),
cudaMemcpyDeviceToHost, stream));
gpuProduct.ivtx.resize(ntrks);
cudaCheck(cudaMemcpyAsync(gpuProduct.ivtx.data(),onGPU.iv,sizeof(int32_t)*ntrks,
cudaMemcpyDeviceToHost, stream));
-
-
}
Producer::GPUProduct const & Producer::fillResults(cudaStream_t stream) {
@@ -78,13 +86,14 @@ namespace gpuVertexFinder {
gpuProduct.chi2.resize(gpuProduct.nVertices);
cudaCheck(cudaMemcpyAsync(gpuProduct.chi2.data(),onGPU.chi2,sizeof(float)*gpuProduct.nVertices,
cudaMemcpyDeviceToHost, stream));
-
+
+ gpuProduct.sortInd.resize(gpuProduct.nVertices);
+ cudaCheck(cudaMemcpyAsync(gpuProduct.sortInd.data(),onGPU.sortInd,sizeof(uint16_t)*gpuProduct.nVertices,
+ cudaMemcpyDeviceToHost, stream));
+
cudaStreamSynchronize(stream);
return gpuProduct;
}
-
-
-
+
} // end namespace
-
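Note on the clustering change above: the per-bin arrays with a spill buffer are gone. clusterTracks now builds the z histogram in two passes over the tracks: count() tallies the entries per bin, finalize() converts the counts into bin start offsets with a block-wide prefix scan, and fill() drops each track index into its bin through a per-bin cursor. Below is a minimal single-block sketch of that count → finalize → fill pattern; the real HistoContainer uses prefixScan.h and different member names, and here the scan is done serially by one thread for clarity.

```cuda
#include <cstdint>
#include <cuda_runtime.h>

constexpr int NBINS = 256;

// Two-pass shared-memory histogram fill (single block).
// key[i] is the quantized z of track i; on exit content[off[b]..off[b+1]-1] holds the
// indices of all tracks falling in bin b.
__global__ void fillHisto(uint8_t const *key, uint16_t *content, uint32_t *off, int n) {
  __shared__ uint32_t count[NBINS];
  __shared__ uint32_t cursor[NBINS];

  for (int b = threadIdx.x; b < NBINS; b += blockDim.x) count[b] = 0;
  __syncthreads();

  // pass 1: count entries per bin
  for (int i = threadIdx.x; i < n; i += blockDim.x) atomicAdd(&count[key[i]], 1u);
  __syncthreads();

  // "finalize": exclusive prefix sum of the counts gives the start offset of each bin
  // (done by one thread here; the real code uses a block-wide prefix scan)
  if (threadIdx.x == 0) {
    uint32_t acc = 0;
    for (int b = 0; b < NBINS; ++b) { off[b] = acc; acc += count[b]; cursor[b] = off[b]; }
    off[NBINS] = acc;  // total size
  }
  __syncthreads();

  // pass 2: place every track index at the next free slot of its bin
  for (int i = threadIdx.x; i < n; i += blockDim.x) {
    auto slot = atomicAdd(&cursor[key[i]], 1u);
    content[slot] = uint16_t(i);
  }
}
```

After the fill, the neighbour search only has to iterate a track's own bin and the adjacent one, instead of checking a spill buffer that could overflow.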
diff --git a/RecoPixelVertexing/PixelVertexFinding/src/gpuVertexFinder.h b/RecoPixelVertexing/PixelVertexFinding/src/gpuVertexFinder.h
index 1f90641e3c260..ded6759a940bd 100644
--- a/RecoPixelVertexing/PixelVertexFinding/src/gpuVertexFinder.h
+++ b/RecoPixelVertexing/PixelVertexFinding/src/gpuVertexFinder.h
@@ -13,17 +13,21 @@ namespace gpuVertexFinder {
static constexpr uint32_t MAXTRACKS = 16000;
static constexpr uint32_t MAXVTX= 1024;
- float * zt; // input track z at bs
+ float * zt; // input track z at bs
float * ezt2; // input error^2 on the above
+ float * ptt2; // input pt^2 on the above
+
float * zv; // output z-position of found vertices
float * wv; // output weight (1/error^2) on the above
float * chi2; // vertices chi2
+ float * ptv2; // vertices pt^2
uint32_t * nv; // the number of vertices
int32_t * iv; // vertex index for each associated track
+ uint16_t * sortInd; // sorted index (by pt2)
// workspace
- int8_t * izt; // integerized z-position of input tracks (reused as integerized pt2 of vertices for sorting)
+ uint8_t * izt; // integerized z-position of input tracks
int32_t * nn; // number of nearest neighbours (reused as number of dof for output vertices)
};
@@ -53,6 +57,7 @@ namespace gpuVertexFinder {
void produce(cudaStream_t stream,
float const * zt,
float const * ezt2,
+ float const * ptt2,
uint32_t ntrks
);
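produce() now also receives the per-track pt², which sortByPt2 accumulates onto each vertex for the ranking. The producer caps the track pt before squaring (at 20 GeV in the change above) so that a single very hard or mismeasured track cannot dominate a vertex's score. Below is a small sketch of preparing that input; the function and variable names are illustrative, not part of the interface.

```cuda
#include <algorithm>
#include <vector>

// Build the pt^2 input handed to gpuVertexFinder::Producer::produce.
// Capping pt keeps one outlier track from dominating a vertex's summed pt^2.
std::vector<float> makePt2(std::vector<float> const &trackPt, float ptCap = 20.f) {
  std::vector<float> pt2;
  pt2.reserve(trackPt.size());
  for (auto pt : trackPt) {
    auto p = std::min(pt, ptCap);
    pt2.push_back(p * p);
  }
  return pt2;
}
```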
diff --git a/RecoPixelVertexing/PixelVertexFinding/test/gpuVertexFinder_t.cu b/RecoPixelVertexing/PixelVertexFinding/test/gpuVertexFinder_t.cu
index f47c4362503ae..a92c116702231 100644
--- a/RecoPixelVertexing/PixelVertexFinding/test/gpuVertexFinder_t.cu
+++ b/RecoPixelVertexing/PixelVertexFinding/test/gpuVertexFinder_t.cu
@@ -13,13 +13,14 @@ struct Event {
std::vector itrack;
std::vector ztrack;
std::vector eztrack;
+ std::vector pttrack;
std::vector ivert;
};
struct ClusterGenerator {
explicit ClusterGenerator(float nvert, float ntrack) :
- rgen(-13.,13), errgen(0.005,0.025), clusGen(nvert), trackGen(ntrack), gauss(0.,1.)
+ rgen(-13.,13), errgen(0.005,0.025), clusGen(nvert), trackGen(ntrack), gauss(0.,1.), ptGen(1.)
{}
void operator()(Event & ev) {
@@ -42,6 +43,8 @@ struct ClusterGenerator {
ev.ztrack.push_back(ev.zvert[iv]+err*gauss(reng));
ev.eztrack.push_back(err*err);
ev.ivert.push_back(iv);
+ ev.pttrack.push_back( (iv==5? 1.f:0.5f) + ptGen(reng) );
+ ev.pttrack.back()*=ev.pttrack.back();
}
}
// add noise
@@ -51,6 +54,8 @@ struct ClusterGenerator {
ev.ztrack.push_back(rgen(reng));
ev.eztrack.push_back(err*err);
ev.ivert.push_back(9999);
+ ev.pttrack.push_back( 0.5f + ptGen(reng) );
+ ev.pttrack.back()*=ev.pttrack.back();
}
}
@@ -61,7 +66,7 @@ struct ClusterGenerator {
std::poisson_distribution clusGen;
std::poisson_distribution trackGen;
std::normal_distribution gauss;
-
+ std::exponential_distribution ptGen;
};
@@ -79,11 +84,14 @@ int main() {
auto zt_d = cuda::memory::device::make_unique<float[]>(current_device, 64000);
auto ezt2_d = cuda::memory::device::make_unique