From eb29a8b226e5428c126a3177237fd24cd7442d39 Mon Sep 17 00:00:00 2001 From: Vincenzo Innocente Date: Fri, 14 Sep 2018 16:08:56 +0200 Subject: [PATCH 01/36] use ext ws or shared --- .../CUDAUtilities/interface/radixSort.h | 53 +++++++++++----- .../CUDAUtilities/test/radixSort_t.cu | 63 ++++++++++++++----- 2 files changed, 86 insertions(+), 30 deletions(-) diff --git a/HeterogeneousCore/CUDAUtilities/interface/radixSort.h b/HeterogeneousCore/CUDAUtilities/interface/radixSort.h index 60d0a48164678..1311865ce2ae6 100644 --- a/HeterogeneousCore/CUDAUtilities/interface/radixSort.h +++ b/HeterogeneousCore/CUDAUtilities/interface/radixSort.h @@ -75,22 +75,21 @@ template< int NS, // number of significant bytes to use in sorting typename RF > -__device__ -void radixSortImpl(T const * a, uint16_t * ind, uint32_t size, RF reorder) { +__device__ +void +__forceinline__ +radixSortImpl(T const * __restrict__ a, uint16_t * ind, uint16_t * ind2, uint32_t size, RF reorder) { constexpr int d = 8, w = 8*sizeof(T); constexpr int sb = 1<0); - assert(size<=MaxSize); assert(blockDim.x==sb); // bool debug = false; // threadIdx.x==0 && blockIdx.x==5; @@ -203,8 +202,10 @@ template< typename std::enable_if::value,T>::type* = nullptr > __device__ -void radixSort(T const * a, uint16_t * ind, uint32_t size) { - radixSortImpl(a,ind,size,dummyReorder); +void +__forceinline__ +radixSort(T const * a, uint16_t * ind, uint16_t * ind2, uint32_t size) { + radixSortImpl(a,ind,ind2,size,dummyReorder); } template< @@ -213,8 +214,10 @@ template< typename std::enable_if::value&&std::is_signed::value,T>::type* = nullptr > __device__ -void radixSort(T const * a, uint16_t * ind, uint32_t size) { - radixSortImpl(a,ind,size,reorderSigned); +void +__forceinline__ +radixSort(T const * a, uint16_t * ind, uint16_t * ind2, uint32_t size) { + radixSortImpl(a,ind,ind2,size,reorderSigned); } template< @@ -223,29 +226,47 @@ template< typename std::enable_if::value,T>::type* = nullptr > __device__ -void radixSort(T const * a, uint16_t * ind, uint32_t size) { +void +__forceinline__ + radixSort(T const * a, uint16_t * ind, uint16_t * ind2, uint32_t size) { using I = int; - radixSortImpl((I const *)(a),ind,size,reorderFloat); + radixSortImpl((I const *)(a),ind,ind2, size,reorderFloat); } template __device__ -void radixSortMulti(T * v, uint16_t * index, uint32_t * offsets) { +void +__forceinline__ +radixSortMulti(T const * v, uint16_t * index, uint32_t const * offsets, uint16_t * workspace) { + + extern __shared__ uint16_t ws[]; auto a = v+offsets[blockIdx.x]; - auto ind = index+offsets[blockIdx.x];; + auto ind = index+offsets[blockIdx.x]; + auto ind2 = nullptr==workspace ? 
ws : workspace+offsets[blockIdx.x]; auto size = offsets[blockIdx.x+1]-offsets[blockIdx.x]; assert(offsets[blockIdx.x+1]>=offsets[blockIdx.x]); - if (size>0) radixSort(a,ind,size); + if (size>0) radixSort(a,ind,ind2,size); } template __global__ -void radixSortMultiWrapper(T * v, uint16_t * index, uint32_t * offsets) { - radixSortMulti(v,index,offsets); +void +__launch_bounds__(256, 4) +radixSortMultiWrapper(T const * v, uint16_t * index, uint32_t const * offsets, uint16_t * workspace) { + radixSortMulti(v,index,offsets, workspace); } +template +__global__ +void +// __launch_bounds__(256, 4) +radixSortMultiWrapper2(T const * v, uint16_t * index, uint32_t const * offsets, uint16_t * workspace) { + radixSortMulti(v,index,offsets, workspace); +} + + #endif // HeterogeneousCoreCUDAUtilities_radixSort_H diff --git a/HeterogeneousCore/CUDAUtilities/test/radixSort_t.cu b/HeterogeneousCore/CUDAUtilities/test/radixSort_t.cu index bdc1af0123637..020b657bbfd29 100644 --- a/HeterogeneousCore/CUDAUtilities/test/radixSort_t.cu +++ b/HeterogeneousCore/CUDAUtilities/test/radixSort_t.cu @@ -31,7 +31,7 @@ struct RS { template -void go() { +void go(bool useShared) { std::mt19937 eng; // std::mt19937 eng2; @@ -89,6 +89,7 @@ void go() { auto v_d = cuda::memory::device::make_unique(current_device, N); auto ind_d = cuda::memory::device::make_unique(current_device, N); + auto ws_d = cuda::memory::device::make_unique(current_device, N); auto off_d = cuda::memory::device::make_unique(current_device, blocks+1); cuda::memory::copy(v_d.get(), v, N*sizeof(T)); @@ -97,12 +98,21 @@ void go() { if (i<2) std::cout << "lauch for " << offsets[blocks] << std::endl; delta -= (std::chrono::high_resolution_clock::now()-start); + constexpr int MaxSize = 256*32; + if (useShared) cuda::launch( radixSortMultiWrapper, + { blocks, 256, MaxSize*2 }, + v_d.get(),ind_d.get(),off_d.get(),nullptr + ); + else + cuda::launch( + radixSortMultiWrapper2, { blocks, 256 }, - v_d.get(),ind_d.get(),off_d.get() + v_d.get(),ind_d.get(),off_d.get(),ws_d.get() ); + if (i==0) std::cout << "done for " << offsets[blocks] << std::endl; // cuda::memory::copy(v, v_d.get(), 2*N); @@ -140,17 +150,42 @@ void go() { int main() { - go(); - go(); - go(); - go(); - go(); - go(); - go(); - - go(); - go(); - go(); - // go(); + bool useShared=false; + + std::cout << "using Global memory" << std::endl; + + + go(useShared); + go(useShared); + go(useShared); + go(useShared); + go(useShared); + go(useShared); + go(useShared); + + go(useShared); + go(useShared); + go(useShared); + // go(v); + + useShared=true; + + std::cout << "using Shared memory" << std::endl; + + go(useShared); + go(useShared); + go(useShared); + go(useShared); + go(useShared); + go(useShared); + go(useShared); + + go(useShared); + go(useShared); + go(useShared); + // go(v); + + + return 0; } From fe2af2e50f983f4c6d43c8dafe4e4f8a826edb2f Mon Sep 17 00:00:00 2001 From: Vincenzo Innocente Date: Fri, 14 Sep 2018 16:26:52 +0200 Subject: [PATCH 02/36] add minBLperML and a debug printout --- .../plugins/SiPixelRecHitHeterogeneous.cc | 11 ++++++++++- .../PixelTriplets/plugins/gpuPixelDoublets.h | 3 ++- 2 files changed, 12 insertions(+), 2 deletions(-) diff --git a/RecoLocalTracker/SiPixelRecHits/plugins/SiPixelRecHitHeterogeneous.cc b/RecoLocalTracker/SiPixelRecHits/plugins/SiPixelRecHitHeterogeneous.cc index 6065fa3cb274a..f6e49ea7ddd08 100644 --- a/RecoLocalTracker/SiPixelRecHits/plugins/SiPixelRecHitHeterogeneous.cc +++ b/RecoLocalTracker/SiPixelRecHits/plugins/SiPixelRecHitHeterogeneous.cc @@ -218,6 +218,7 
@@ void SiPixelRecHitHeterogeneous::run(const edm::HandledetId(); @@ -234,7 +235,7 @@ void SiPixelRecHitHeterogeneous::run(const edm::Handle=96 && hoc.charge[fc+i]<4000) ) continue; + if( hoc.charge[fc+i]<2000 || (gind>=96 && hoc.charge[fc+i]<4000) ) { ++numberOfLostClusters; continue;} ind[ngh]=i;std::push_heap(ind, ind+ngh+1,[&](auto a, auto b) { return mrp[a] Date: Sat, 15 Sep 2018 17:36:54 +0200 Subject: [PATCH 03/36] a small prefix scan in single block --- .../CUDAUtilities/interface/prefixScan.h | 52 +++++++++++++++++++ .../CUDAUtilities/test/BuildFile.xml | 2 + .../CUDAUtilities/test/prefixScan_t.cu | 33 ++++++++++++ 3 files changed, 87 insertions(+) create mode 100644 HeterogeneousCore/CUDAUtilities/interface/prefixScan.h create mode 100644 HeterogeneousCore/CUDAUtilities/test/prefixScan_t.cu diff --git a/HeterogeneousCore/CUDAUtilities/interface/prefixScan.h b/HeterogeneousCore/CUDAUtilities/interface/prefixScan.h new file mode 100644 index 0000000000000..24925242950f6 --- /dev/null +++ b/HeterogeneousCore/CUDAUtilities/interface/prefixScan.h @@ -0,0 +1,52 @@ +#ifndef HeterogeneousCore_CUDAUtilities_prefixScan_h +#define HeterogeneousCore_CUDAUtilities_prefixScan_h + +#include +#include + +template +__device__ +void +__forceinline__ +warpPrefixScan(T * c, uint32_t i) { + auto x = c[i]; + auto laneId = threadIdx.x & 0x1f; + #pragma unroll + for( int offset = 1 ; offset < 32 ; offset <<= 1 ) { + auto y = __shfl_up_sync(0xffffffff,x, offset); + if(laneId >= offset) x += y; + } + c[i] = x; +} + +// limited to 32*32 elements.... +template +__device__ +void +__forceinline__ +blockPrefixScan(T * c, uint32_t size, T* ws) { + assert(size<=1024); + assert(0==blockDim.x%32); + + auto first = threadIdx.x; + + for (auto i=first; i + + diff --git a/HeterogeneousCore/CUDAUtilities/test/prefixScan_t.cu b/HeterogeneousCore/CUDAUtilities/test/prefixScan_t.cu new file mode 100644 index 0000000000000..3e8645439a28f --- /dev/null +++ b/HeterogeneousCore/CUDAUtilities/test/prefixScan_t.cu @@ -0,0 +1,33 @@ +#include "HeterogeneousCore/CUDAUtilities/interface/prefixScan.h" +#include + +template +__global__ +void testPrefixScan(uint32_t size) { + + __shared__ T ws[32]; + __shared__ T c[1024]; + auto first = threadIdx.x; + for (auto i=first; i<<<1,bs>>>(j); + testPrefixScan<<<1,bs>>>(j); + } + cudaDeviceSynchronize(); + + return 0; +} From 90344c76ebf98ca0737aab70b528727622360043 Mon Sep 17 00:00:00 2001 From: Vincenzo Innocente Date: Sat, 15 Sep 2018 19:43:07 +0200 Subject: [PATCH 04/36] add charge cut --- .../plugins/gpuClusterChargeCut.h | 92 +++++++++++++++++++ .../plugins/gpuClusteringConstants.h | 2 + .../SiPixelClusterizer/test/gpuClustering.cu | 45 ++++++--- 3 files changed, 127 insertions(+), 12 deletions(-) create mode 100644 RecoLocalTracker/SiPixelClusterizer/plugins/gpuClusterChargeCut.h diff --git a/RecoLocalTracker/SiPixelClusterizer/plugins/gpuClusterChargeCut.h b/RecoLocalTracker/SiPixelClusterizer/plugins/gpuClusterChargeCut.h new file mode 100644 index 0000000000000..c86b67e66d131 --- /dev/null +++ b/RecoLocalTracker/SiPixelClusterizer/plugins/gpuClusterChargeCut.h @@ -0,0 +1,92 @@ +#ifndef RecoLocalTracker_SiPixelClusterizer_plugins_gpuClusterChargeCut_h +#define RecoLocalTracker_SiPixelClusterizer_plugins_gpuClusterChargeCut_h + +#include +#include +#include + +#include "gpuClusteringConstants.h" + +#include "HeterogeneousCore/CUDAUtilities/interface/prefixScan.h" + +namespace gpuClustering { + + __global__ void clusterChargeCut( + uint16_t * __restrict__ id, // module id of each pixel 
(modified if bad cluster) + uint16_t const * __restrict__ adc, // charge of each pixel + uint32_t const * __restrict__ moduleStart, // index of the first pixel of each module + uint32_t * __restrict__ nClustersInModule, // modified: number of clusters found in each module + uint32_t const * __restrict__ moduleId, // module id of each module + int32_t * __restrict__ clusterId, // modified: cluster id of each pixel + int numElements) + { + + if (blockIdx.x >= moduleStart[0]) + return; + + auto firstPixel = moduleStart[1 + blockIdx.x]; + auto thisModuleId = id[firstPixel]; + assert(thisModuleId < MaxNumModules); + + auto nclus = nClustersInModule[thisModuleId]; + if (nclus==0) return; + + assert(nclus<=MaxNumClustersPerModules); + +#ifdef GPU_DEBUG + if (thisModuleId % 100 == 1) + if (threadIdx.x == 0) + printf("start clusterizer for module %d in block %d\n", thisModuleId, blockIdx.x); +#endif + + auto first = firstPixel + threadIdx.x; + + __shared__ int32_t charge[MaxNumClustersPerModules]; + for (int i=threadIdx.x; ichargeCut ? 1 : 0; + } + + // renumber + __shared__ uint16_t ws[32]; + blockPrefixScan(newclusId, nclus, ws); + + assert(nclus>=newclusId[nclus-1]); + + if(nclus==newclusId[nclus-1]) return; + + nClustersInModule[thisModuleId] = newclusId[nclus-1]; + + // mark bad cluster with 0 again + for (int i=threadIdx.x+1; i MaxNumModules + } #endif // RecoLocalTracker_SiPixelClusterizer_plugins_gpuClusteringConstants_h diff --git a/RecoLocalTracker/SiPixelClusterizer/test/gpuClustering.cu b/RecoLocalTracker/SiPixelClusterizer/test/gpuClustering.cu index 2264be0de02af..a94081c2d69b4 100644 --- a/RecoLocalTracker/SiPixelClusterizer/test/gpuClustering.cu +++ b/RecoLocalTracker/SiPixelClusterizer/test/gpuClustering.cu @@ -12,6 +12,7 @@ #include #include "RecoLocalTracker/SiPixelClusterizer/plugins/gpuClustering.h" +#include "RecoLocalTracker/SiPixelClusterizer/plugins/gpuClusterChargeCut.h" int main(void) { @@ -58,7 +59,7 @@ int main(void) h_id[n]=id; h_x[n]=x; h_y[n]=x; - h_adc[n]=100; + h_adc[n]=5000; ++n; // diagonal ++ncl; @@ -66,7 +67,7 @@ int main(void) h_id[n]=id; h_x[n]=x; h_y[n]=x; - h_adc[n]=100; + h_adc[n]=1000; ++n; } ++ncl; @@ -75,7 +76,7 @@ int main(void) h_id[n]=id; h_x[n]=x; h_y[n]=x; - h_adc[n]=100; + h_adc[n]=1000; ++n; } ++ncl; @@ -86,7 +87,7 @@ int main(void) h_id[n]=id; h_x[n]=xx[k]; h_y[n]=20+xx[k]; - h_adc[n]=100; + h_adc[n]=1000; ++n; } // holes @@ -95,13 +96,13 @@ int main(void) h_id[n]=id; h_x[n]=xx[k]; h_y[n]=100; - h_adc[n]=100; + h_adc[n]=1000; ++n; if (xx[k]%2==0) { h_id[n]=id; h_x[n]=xx[k]; h_y[n]=101; - h_adc[n]=100; + h_adc[n]=1000; ++n; } } @@ -114,7 +115,7 @@ int main(void) h_id[n]=id; h_x[n]=x; h_y[n]=x; - h_adc[n]=100; + h_adc[n]=1000; ++n; } // all odd id @@ -132,7 +133,7 @@ int main(void) h_id[n]=id; h_x[n]=x+1; h_y[n]=x+y[k]+2; - h_adc[n]=100; + h_adc[n]=1000; ++n; } } else { @@ -140,14 +141,14 @@ int main(void) h_id[n]=id; h_x[n]=x; h_y[n]=x+y[9-k]; - h_adc[n]=100; + h_adc[n]=1000; ++n; if (y[k]==3) continue; // hole if (id==51) {h_id[n++]=InvId; h_id[n++]=InvId; }// error h_id[n]=id; h_x[n]=x+1; h_y[n]=x+y[k]+2; - h_adc[n]=100; + h_adc[n]=1000; ++n; } } @@ -199,10 +200,30 @@ int main(void) n ); - cuda::memory::copy(&nModules,d_moduleStart.get(),sizeof(uint32_t)); + cuda::memory::copy(&nModules,d_moduleStart.get(),sizeof(uint32_t)); + + uint32_t nclus[MaxNumModules], moduleId[nModules]; + + cuda::memory::copy(&nclus,d_clusInModule.get(),MaxNumModules*sizeof(uint32_t)); + std::cout << "before charge cut found " << 
std::accumulate(nclus,nclus+MaxNumModules,0) << " clusters" << std::endl; + + + + cuda::launch( + clusterChargeCut, + { blocksPerGrid, threadsPerBlock }, + d_id.get(), d_adc.get(), + d_moduleStart.get(), + d_clusInModule.get(), d_moduleId.get(), + d_clus.get(), + n + ); + + + std::cout << "found " << nModules << " Modules active" << std::endl; - uint32_t nclus[MaxNumModules], moduleId[nModules]; + cuda::memory::copy(h_id.get(), d_id.get(), size16); cuda::memory::copy(h_clus.get(), d_clus.get(), size32); cuda::memory::copy(&nclus,d_clusInModule.get(),MaxNumModules*sizeof(uint32_t)); cuda::memory::copy(&moduleId,d_moduleId.get(),nModules*sizeof(uint32_t)); From b1e774689496d80fda4fd09b17b5159954ba9481 Mon Sep 17 00:00:00 2001 From: Vincenzo Innocente Date: Sat, 15 Sep 2018 20:13:03 +0200 Subject: [PATCH 05/36] apply charge cut --- .../plugins/SiPixelRawToClusterGPUKernel.cu | 13 +++++++++++++ 1 file changed, 13 insertions(+) diff --git a/RecoLocalTracker/SiPixelClusterizer/plugins/SiPixelRawToClusterGPUKernel.cu b/RecoLocalTracker/SiPixelClusterizer/plugins/SiPixelRawToClusterGPUKernel.cu index f3242a11d7ae6..0fc31fc1dc36b 100644 --- a/RecoLocalTracker/SiPixelClusterizer/plugins/SiPixelRawToClusterGPUKernel.cu +++ b/RecoLocalTracker/SiPixelClusterizer/plugins/SiPixelRawToClusterGPUKernel.cu @@ -34,6 +34,7 @@ #include "HeterogeneousCore/CUDAUtilities/interface/cudaCheck.h" #include "RecoLocalTracker/SiPixelClusterizer/plugins/gpuCalibPixel.h" #include "RecoLocalTracker/SiPixelClusterizer/plugins/gpuClustering.h" +#include "RecoLocalTracker/SiPixelClusterizer/plugins/gpuClusterChargeCut.h" #include "RecoLocalTracker/SiPixelClusterizer/interface/SiPixelFedCablingMapGPU.h" // local includes @@ -717,6 +718,18 @@ namespace pixelgpudetails { wordCounter); cudaCheck(cudaGetLastError()); + // apply charge cut + clusterChargeCut<<>>( + moduleInd_d, + adc_d, + moduleStart_d, + clusInModule_d, moduleId_d, + clus_d, + wordCounter); + cudaCheck(cudaGetLastError()); + + + // count the module start indices already here (instead of // rechits) so that the number of clusters/hits can be made // available in the rechit producer without additional points of From dca2f1e16854bf475b7b6f9d85fcb4b43b4a3d23 Mon Sep 17 00:00:00 2001 From: Vincenzo Innocente Date: Sun, 16 Sep 2018 10:38:37 +0200 Subject: [PATCH 06/36] ccc works --- .../plugins/SiPixelRawToClusterGPUKernel.cu | 10 +++++----- .../plugins/SiPixelRawToClusterHeterogeneous.cc | 1 + .../SiPixelRecHits/plugins/PixelRecHits.cu | 4 ++++ .../plugins/SiPixelRecHitHeterogeneous.cc | 3 ++- .../SiPixelRecHits/plugins/gpuPixelRecHits.h | 13 +++++++++++-- 5 files changed, 23 insertions(+), 8 deletions(-) diff --git a/RecoLocalTracker/SiPixelClusterizer/plugins/SiPixelRawToClusterGPUKernel.cu b/RecoLocalTracker/SiPixelClusterizer/plugins/SiPixelRawToClusterGPUKernel.cu index 0fc31fc1dc36b..7bd6eac473cc7 100644 --- a/RecoLocalTracker/SiPixelClusterizer/plugins/SiPixelRawToClusterGPUKernel.cu +++ b/RecoLocalTracker/SiPixelClusterizer/plugins/SiPixelRawToClusterGPUKernel.cu @@ -688,11 +688,11 @@ namespace pixelgpudetails { cudaCheck(cudaMemcpyAsync(adc_h, adc_d, wordCounter*sizeof(uint16_t), cudaMemcpyDefault, stream.id())); } - /* - std::cout +#ifdef GPU_DEBUG + std::cout << "CUDA countModules kernel launch with " << blocks << " blocks of " << threadsPerBlock << " threads\n"; - */ +#endif cudaCheck(cudaMemsetAsync(moduleStart_d, 0x00, sizeof(uint32_t), stream.id())); @@ -704,10 +704,10 @@ namespace pixelgpudetails { threadsPerBlock = 256; blocks = MaxNumModules; - /* 
+#ifdef GPU_DEBUG std::cout << "CUDA findClus kernel launch with " << blocks << " blocks of " << threadsPerBlock << " threads\n"; - */ +#endif cudaCheck(cudaMemsetAsync(clusInModule_d, 0, (MaxNumModules)*sizeof(uint32_t), stream.id())); findClus<<>>( moduleInd_d, diff --git a/RecoLocalTracker/SiPixelClusterizer/plugins/SiPixelRawToClusterHeterogeneous.cc b/RecoLocalTracker/SiPixelClusterizer/plugins/SiPixelRawToClusterHeterogeneous.cc index 7f7e65d2874f1..76f3536ab1706 100644 --- a/RecoLocalTracker/SiPixelClusterizer/plugins/SiPixelRawToClusterHeterogeneous.cc +++ b/RecoLocalTracker/SiPixelClusterizer/plugins/SiPixelRawToClusterHeterogeneous.cc @@ -611,6 +611,7 @@ void SiPixelRawToClusterHeterogeneous::convertGPUtoCPU(edm::Event& ev, for (uint32_t i = 0; i < gpu.nDigis; i++) { if (gpu.pdigi_h[i]==0) continue; + if (gpu.clus_h[i]>9000) continue; // not in cluster assert(gpu.rawIdArr_h[i] > 109999); if ( (*detDigis).detId() != gpu.rawIdArr_h[i]) { diff --git a/RecoLocalTracker/SiPixelRecHits/plugins/PixelRecHits.cu b/RecoLocalTracker/SiPixelRecHits/plugins/PixelRecHits.cu index e2165471c3386..bc376536dee12 100644 --- a/RecoLocalTracker/SiPixelRecHits/plugins/PixelRecHits.cu +++ b/RecoLocalTracker/SiPixelRecHits/plugins/PixelRecHits.cu @@ -132,6 +132,10 @@ namespace pixelgpudetails { int threadsPerBlock = 256; int blocks = input.nModules; // active modules (with digis) + +#ifdef GPU_DEBUG + std::cout << "launching getHits kernel for " << blocks << " blocks" << std::endl; +#endif gpuPixelRecHits::getHits<<>>( cpeParams, gpu_.bs_d, diff --git a/RecoLocalTracker/SiPixelRecHits/plugins/SiPixelRecHitHeterogeneous.cc b/RecoLocalTracker/SiPixelRecHits/plugins/SiPixelRecHitHeterogeneous.cc index f6e49ea7ddd08..f0c3b42b30eba 100644 --- a/RecoLocalTracker/SiPixelRecHits/plugins/SiPixelRecHitHeterogeneous.cc +++ b/RecoLocalTracker/SiPixelRecHits/plugins/SiPixelRecHitHeterogeneous.cc @@ -309,12 +309,13 @@ void SiPixelRecHitHeterogeneous::run(const edm::Handle Date: Sun, 16 Sep 2018 14:26:50 +0200 Subject: [PATCH 07/36] stable, but something fishy in test --- .../plugins/gpuClusterChargeCut.h | 15 +++-- .../SiPixelClusterizer/test/BuildFile.xml | 7 +++ .../SiPixelClusterizer/test/gpuClustering.cu | 58 +++++++++++++++++-- 3 files changed, 69 insertions(+), 11 deletions(-) diff --git a/RecoLocalTracker/SiPixelClusterizer/plugins/gpuClusterChargeCut.h b/RecoLocalTracker/SiPixelClusterizer/plugins/gpuClusterChargeCut.h index c86b67e66d131..c0e2174339bd6 100644 --- a/RecoLocalTracker/SiPixelClusterizer/plugins/gpuClusterChargeCut.h +++ b/RecoLocalTracker/SiPixelClusterizer/plugins/gpuClusterChargeCut.h @@ -45,6 +45,7 @@ namespace gpuClustering { for (int i=threadIdx.x; ichargeCut ? 1 : 0; + newclusId[i] = ok[i] = charge[i]>chargeCut ? 
1 : 0; } + __syncthreads(); + // renumber __shared__ uint16_t ws[32]; blockPrefixScan(newclusId, nclus, ws); - + assert(nclus>=newclusId[nclus-1]); if(nclus==newclusId[nclus-1]) return; nClustersInModule[thisModuleId] = newclusId[nclus-1]; - // mark bad cluster with 0 again - for (int i=threadIdx.x+1; i + + + + + + + diff --git a/RecoLocalTracker/SiPixelClusterizer/test/gpuClustering.cu b/RecoLocalTracker/SiPixelClusterizer/test/gpuClustering.cu index a94081c2d69b4..7e2bfd69a96a4 100644 --- a/RecoLocalTracker/SiPixelClusterizer/test/gpuClustering.cu +++ b/RecoLocalTracker/SiPixelClusterizer/test/gpuClustering.cu @@ -51,6 +51,23 @@ int main(void) int ncl=0; int y[10]={5,7,9,1,3,0,4,8,2,6}; + auto generateClusters = [&](bool addBigNoise) { + if (addBigNoise) { + constexpr int MaxPixels = 1000; + int id = 666; + for (int x=0; x<140; x+=3) { + for (int yy=0; yy<400; yy+=3) { + h_id[n]=id; + h_x[n]=x; + h_y[n]=yy; + h_adc[n]=1000; + ++n; ++ncl; + if (MaxPixels<=ncl) break; + } + if (MaxPixels<=ncl) break; + } + } + { // isolated int id = 42; @@ -154,8 +171,15 @@ int main(void) } } } + }; // end lambda + for (auto kkk=0; kkk<2; ++kkk) { + n=0; ncl=0; + generateClusters(1==kkk); + std::cout << "created " << n << " digis in " << ncl << " clusters" << std::endl; assert(n<=numElements); + + size_t size32 = n * sizeof(unsigned int); size_t size16 = n * sizeof(unsigned short); size_t size8 = n * sizeof(uint8_t); @@ -227,24 +251,48 @@ int main(void) cuda::memory::copy(h_clus.get(), d_clus.get(), size32); cuda::memory::copy(&nclus,d_clusInModule.get(),MaxNumModules*sizeof(uint32_t)); cuda::memory::copy(&moduleId,d_moduleId.get(),nModules*sizeof(uint32_t)); - cuda::memory::copy(h_debug.get(), d_debug.get(), size32); +// cuda::memory::copy(h_debug.get(), d_debug.get(), size32); - auto p = std::minmax_element(h_debug.get(),h_debug.get()+n); - std::cout << "debug " << *p.first << ' ' << *p.second << std::endl; std::set clids; std::vector seeds; for (int i=0; i=0); assert(h_clus[i] Date: Sun, 16 Sep 2018 15:12:21 +0200 Subject: [PATCH 08/36] of course de test had a bug --- .../plugins/gpuClusterChargeCut.h | 1 + .../SiPixelClusterizer/test/gpuClustering.cu | 35 +++++++++---------- 2 files changed, 18 insertions(+), 18 deletions(-) diff --git a/RecoLocalTracker/SiPixelClusterizer/plugins/gpuClusterChargeCut.h b/RecoLocalTracker/SiPixelClusterizer/plugins/gpuClusterChargeCut.h index c0e2174339bd6..4abc029e1942a 100644 --- a/RecoLocalTracker/SiPixelClusterizer/plugins/gpuClusterChargeCut.h +++ b/RecoLocalTracker/SiPixelClusterizer/plugins/gpuClusterChargeCut.h @@ -27,6 +27,7 @@ namespace gpuClustering { auto firstPixel = moduleStart[1 + blockIdx.x]; auto thisModuleId = id[firstPixel]; assert(thisModuleId < MaxNumModules); + assert(thisModuleId==moduleId[blockIdx.x]); auto nclus = nClustersInModule[thisModuleId]; if (nclus==0) return; diff --git a/RecoLocalTracker/SiPixelClusterizer/test/gpuClustering.cu b/RecoLocalTracker/SiPixelClusterizer/test/gpuClustering.cu index 7e2bfd69a96a4..af20ae17c745a 100644 --- a/RecoLocalTracker/SiPixelClusterizer/test/gpuClustering.cu +++ b/RecoLocalTracker/SiPixelClusterizer/test/gpuClustering.cu @@ -31,7 +31,6 @@ int main(void) auto h_clus = std::make_unique(numElements); - auto h_debug = std::make_unique(numElements); auto current_device = cuda::device::current::get(); auto d_id = cuda::memory::device::make_unique(current_device, numElements); auto d_x = cuda::memory::device::make_unique(current_device, numElements); @@ -45,13 +44,13 @@ int main(void) auto d_clusInModule = 
cuda::memory::device::make_unique(current_device, MaxNumModules); auto d_moduleId = cuda::memory::device::make_unique(current_device, MaxNumModules); - auto d_debug = cuda::memory::device::make_unique(current_device, numElements); // later random number int n=0; int ncl=0; int y[10]={5,7,9,1,3,0,4,8,2,6}; - auto generateClusters = [&](bool addBigNoise) { + auto generateClusters = [&](int kn) { + auto addBigNoise = 1==kn%2; if (addBigNoise) { constexpr int MaxPixels = 1000; int id = 666; @@ -76,7 +75,7 @@ int main(void) h_id[n]=id; h_x[n]=x; h_y[n]=x; - h_adc[n]=5000; + h_adc[n]= kn==0 ? 100 : 5000; ++n; // diagonal ++ncl; @@ -113,7 +112,7 @@ int main(void) h_id[n]=id; h_x[n]=xx[k]; h_y[n]=100; - h_adc[n]=1000; + h_adc[n]= kn==2 ? 100 : 1000; ++n; if (xx[k]%2==0) { h_id[n]=id; @@ -132,7 +131,7 @@ int main(void) h_id[n]=id; h_x[n]=x; h_y[n]=x; - h_adc[n]=1000; + h_adc[n]=5000; ++n; } // all odd id @@ -158,23 +157,23 @@ int main(void) h_id[n]=id; h_x[n]=x; h_y[n]=x+y[9-k]; - h_adc[n]=1000; + h_adc[n]= kn==2 ? 10 : 1000; ++n; if (y[k]==3) continue; // hole if (id==51) {h_id[n++]=InvId; h_id[n++]=InvId; }// error h_id[n]=id; h_x[n]=x+1; h_y[n]=x+y[k]+2; - h_adc[n]=1000; + h_adc[n]= kn==2 ? 10 : 1000; ++n; } } } } }; // end lambda - for (auto kkk=0; kkk<2; ++kkk) { + for (auto kkk=0; kkk<4; ++kkk) { n=0; ncl=0; - generateClusters(1==kkk); + generateClusters(kkk); std::cout << "created " << n << " digis in " << ncl << " clusters" << std::endl; assert(n<=numElements); @@ -182,7 +181,7 @@ int main(void) size_t size32 = n * sizeof(unsigned int); size_t size16 = n * sizeof(unsigned short); - size_t size8 = n * sizeof(uint8_t); + // size_t size8 = n * sizeof(uint8_t); uint32_t nModules=0; cuda::memory::copy(d_moduleStart.get(),&nModules,sizeof(uint32_t)); @@ -190,8 +189,7 @@ int main(void) cuda::memory::copy(d_id.get(), h_id.get(), size16); cuda::memory::copy(d_x.get(), h_x.get(), size16); cuda::memory::copy(d_y.get(), h_y.get(), size16); - cuda::memory::copy(d_adc.get(), h_adc.get(), size8); - cuda::memory::device::zero(d_debug.get(),size32); + cuda::memory::copy(d_adc.get(), h_adc.get(), size16); // Launch CUDA Kernels int threadsPerBlock = 256; int blocksPerGrid = (numElements + threadsPerBlock - 1) / threadsPerBlock; @@ -223,6 +221,7 @@ int main(void) d_clus.get(), n ); + cudaDeviceSynchronize(); cuda::memory::copy(&nModules,d_moduleStart.get(),sizeof(uint32_t)); @@ -230,10 +229,10 @@ int main(void) cuda::memory::copy(&nclus,d_clusInModule.get(),MaxNumModules*sizeof(uint32_t)); std::cout << "before charge cut found " << std::accumulate(nclus,nclus+MaxNumModules,0) << " clusters" << std::endl; + for (auto i=MaxNumModules; i>0; i--) if (nclus[i-1]>0) {std::cout << "last module is " << i-1 << ' ' << nclus[i-1] << std::endl; break;} - - cuda::launch( + cuda::launch( clusterChargeCut, { blocksPerGrid, threadsPerBlock }, d_id.get(), d_adc.get(), @@ -244,6 +243,7 @@ int main(void) ); + cudaDeviceSynchronize(); std::cout << "found " << nModules << " Modules active" << std::endl; @@ -251,11 +251,9 @@ int main(void) cuda::memory::copy(h_clus.get(), d_clus.get(), size32); cuda::memory::copy(&nclus,d_clusInModule.get(),MaxNumModules*sizeof(uint32_t)); cuda::memory::copy(&moduleId,d_moduleId.get(),nModules*sizeof(uint32_t)); -// cuda::memory::copy(h_debug.get(), d_debug.get(), size32); std::set clids; - std::vector seeds; for (int i=0; i0; i--) if (nclus[i-1]>0) {std::cout << "last module is " << i-1 << ' ' << nclus[i-1] << std::endl; break;} // << " and " << seeds.size() << " seeds" << std::endl; } /// end 
loop kkk return 0; From 9103a9c0f927abac842f3db75e9b4402ea5727fd Mon Sep 17 00:00:00 2001 From: Vincenzo Innocente Date: Mon, 17 Sep 2018 14:30:23 +0200 Subject: [PATCH 09/36] works, sort only with blksize 256 --- .../CUDAUtilities/interface/radixSort.h | 67 +++++++++++-------- .../CUDAUtilities/test/radixSort_t.cu | 28 +++++--- .../plugins/CAHitQuadrupletGeneratorGPU.cu | 14 ++-- .../PixelTriplets/plugins/GPUCACell.h | 2 +- .../PixelTriplets/plugins/gpuPixelDoublets.h | 1 + .../pixelVertexHeterogeneousProduct.h | 2 + .../src/PixelVertexHeterogeneousProducer.cc | 36 +++++----- .../PixelVertexFinding/src/gpuClusterTracks.h | 38 +++++++++++ .../PixelVertexFinding/src/gpuVertexFinder.cu | 23 +++++-- .../PixelVertexFinding/src/gpuVertexFinder.h | 9 ++- .../test/gpuVertexFinder_t.cu | 31 ++++++++- 11 files changed, 179 insertions(+), 72 deletions(-) diff --git a/HeterogeneousCore/CUDAUtilities/interface/radixSort.h b/HeterogeneousCore/CUDAUtilities/interface/radixSort.h index 1311865ce2ae6..83468d854b1e6 100644 --- a/HeterogeneousCore/CUDAUtilities/interface/radixSort.h +++ b/HeterogeneousCore/CUDAUtilities/interface/radixSort.h @@ -90,7 +90,7 @@ radixSortImpl(T const * __restrict__ a, uint16_t * ind, uint16_t * ind2, uint32_ __shared__ int p; assert(size>0); - assert(blockDim.x==sb); + assert(blockDim.x>=sb); // bool debug = false; // threadIdx.x==0 && blockIdx.x==5; @@ -104,8 +104,8 @@ radixSortImpl(T const * __restrict__ a, uint16_t * ind, uint16_t * ind2, uint32_ __syncthreads(); - while(p < w/d) { - c[threadIdx.x]=0; + while(__syncthreads_and(p < w/d)) { + if (threadIdx.x= offset) x += y; + if (threadIdx.x= offset) x += y; + } + ct[threadIdx.x] = x; } - ct[threadIdx.x] = x; __syncthreads(); - auto ss = (threadIdx.x/32)*32 -1; - c[threadIdx.x] = ct[threadIdx.x]; - for(int i=ss; i>0; i-=32) c[threadIdx.x] +=ct[i]; - + if (threadIdx.x0; i-=32) c[threadIdx.x] +=ct[i]; + } /* //prefix scan for the nulls (for documentation) if (threadIdx.x==0) @@ -139,27 +142,33 @@ radixSortImpl(T const * __restrict__ a, uint16_t * ind, uint16_t * ind2, uint32_ // broadcast ibs =size-1; __syncthreads(); - while (ibs>0) { + while (__syncthreads_and(ibs>0)) { int i = ibs - threadIdx.x; - cu[threadIdx.x]=-1; - ct[threadIdx.x]=-1; + if (threadIdx.x=0) { - bin = (a[j[i]] >> d*p)&(sb-1); - ct[threadIdx.x]=bin; - atomicMax(&cu[bin],int(i)); + if (threadIdx.x=0) { + bin = (a[j[i]] >> d*p)&(sb-1); + ct[threadIdx.x]=bin; + atomicMax(&cu[bin],int(i)); + } } __syncthreads(); - if (i>=0 && i==cu[bin]) // ensure to keep them in order - for (int ii=threadIdx.x; ii=oi);if(i>=oi) - k[--c[bin]] = j[i-oi]; - } + if (threadIdx.x=0 && i==cu[bin]) // ensure to keep them in order + for (int ii=threadIdx.x; ii=oi);if(i>=oi) + k[--c[bin]] = j[i-oi]; + } + } __syncthreads(); if (bin>=0) assert(c[bin]>=0); - if (threadIdx.x==0) ibs-=blockDim.x; + if (threadIdx.x==0) ibs-=sb; __syncthreads(); } @@ -190,6 +199,8 @@ radixSortImpl(T const * __restrict__ a, uint16_t * ind, uint16_t * ind2, uint32_ if (j!=ind) // odd... for (auto i=first; i #include #include - +#include #include #include @@ -97,18 +97,20 @@ void go(bool useShared) { if (i<2) std::cout << "lauch for " << offsets[blocks] << std::endl; + auto ntXBl = 1==i%4 ? 
256 : 256; + delta -= (std::chrono::high_resolution_clock::now()-start); constexpr int MaxSize = 256*32; if (useShared) cuda::launch( radixSortMultiWrapper, - { blocks, 256, MaxSize*2 }, + { blocks, ntXBl, MaxSize*2 }, v_d.get(),ind_d.get(),off_d.get(),nullptr ); else cuda::launch( radixSortMultiWrapper2, - { blocks, 256 }, + { blocks, ntXBl }, v_d.get(),ind_d.get(),off_d.get(),ws_d.get() ); @@ -127,10 +129,13 @@ void go(bool useShared) { std::cout << LL(v[ind[3]]) << ' ' << LL(v[ind[10]]) << ' ' << LL(v[ind[blockSize-1000]]) << std::endl; std::cout << LL(v[ind[blockSize/2-1]]) << ' ' << LL(v[ind[blockSize/2]]) << ' ' << LL(v[ind[blockSize/2+1]]) << std::endl; } - for (int ib=0; ib inds; + if (offsets[ib+1]> offsets[ib]) inds.insert(ind[offsets[ib]]); + for (auto j = offsets[ib]+1; j < offsets[ib+1]; j++) { + inds.insert(ind[j]); auto a = v+offsets[ib]; - auto k1=a[ind[i]]; auto k2=a[ind[i-1]]; + auto k1=a[ind[j]]; auto k2=a[ind[j-1]]; auto sh = sizeof(uint64_t)-NS; sh*=8; auto shorten = [sh](T& t) { auto k = (uint64_t *)(&t); @@ -138,8 +143,15 @@ void go(bool useShared) { }; shorten(k1);shorten(k2); if (k1 *foundNtuplets, } -__global__ void +__global__ +void kernel_connect(GPU::SimpleVector *foundNtuplets, GPUCACell *cells, uint32_t const * nCells, GPU::VecArray< unsigned int, 256> *isOuterHitOfCell, @@ -158,8 +159,9 @@ void CAHitQuadrupletGeneratorGPU::launchKernels(const TrackingRegion ®ion, auto nhits = hh.nHits; assert(nhits <= PixelGPUConstants::maxNumberOfHits); - auto numberOfBlocks = (maxNumberOfDoublets_ + 512 - 1)/512; - kernel_connect<<>>( + auto blockSize = 64; + auto numberOfBlocks = (maxNumberOfDoublets_ + blockSize - 1)/blockSize; + kernel_connect<<>>( d_foundNtupletsVec_[regionIndex], // needed only to be reset, ready for next kernel device_theCells_, device_nCells_, device_isOuterHitOfCell_, @@ -169,15 +171,15 @@ void CAHitQuadrupletGeneratorGPU::launchKernels(const TrackingRegion ®ion, ); cudaCheck(cudaGetLastError()); - kernel_find_ntuplets<<>>( + kernel_find_ntuplets<<>>( device_theCells_, device_nCells_, d_foundNtupletsVec_[regionIndex], 4, maxNumberOfDoublets_); cudaCheck(cudaGetLastError()); - numberOfBlocks = (std::max(int(nhits), maxNumberOfDoublets_) + 512 - 1)/512; - kernel_checkOverflows<<>>( + numberOfBlocks = (std::max(int(nhits), maxNumberOfDoublets_) + blockSize - 1)/blockSize; + kernel_checkOverflows<<>>( d_foundNtupletsVec_[regionIndex], device_theCells_, device_nCells_, device_isOuterHitOfCell_, nhits, diff --git a/RecoPixelVertexing/PixelTriplets/plugins/GPUCACell.h b/RecoPixelVertexing/PixelTriplets/plugins/GPUCACell.h index 5995d286fc38d..02cf0d6f91642 100644 --- a/RecoPixelVertexing/PixelTriplets/plugins/GPUCACell.h +++ b/RecoPixelVertexing/PixelTriplets/plugins/GPUCACell.h @@ -139,7 +139,7 @@ class GPUCACell { return distance_13_beamspot_squared < (region_origin_radius + phiCut) * (region_origin_radius + phiCut); - } + } // 87 cm/GeV = 1/(3.8T * 0.3) diff --git a/RecoPixelVertexing/PixelTriplets/plugins/gpuPixelDoublets.h b/RecoPixelVertexing/PixelTriplets/plugins/gpuPixelDoublets.h index c75c492d6953d..2d6ad83c63b2c 100644 --- a/RecoPixelVertexing/PixelTriplets/plugins/gpuPixelDoublets.h +++ b/RecoPixelVertexing/PixelTriplets/plugins/gpuPixelDoublets.h @@ -20,6 +20,7 @@ namespace gpuPixelDoublets { template __device__ + __forceinline__ void doubletsFromHisto(uint8_t const * __restrict__ layerPairs, uint32_t nPairs, GPUCACell * cells, diff --git a/RecoPixelVertexing/PixelVertexFinding/interface/pixelVertexHeterogeneousProduct.h 
b/RecoPixelVertexing/PixelVertexFinding/interface/pixelVertexHeterogeneousProduct.h index fcb19f855a9ba..ff3624cdafd65 100644 --- a/RecoPixelVertexing/PixelVertexFinding/interface/pixelVertexHeterogeneousProduct.h +++ b/RecoPixelVertexing/PixelVertexFinding/interface/pixelVertexHeterogeneousProduct.h @@ -16,6 +16,7 @@ namespace pixelVertexHeterogeneousProduct { float * z_d; float * zerr_d; float * chi2_d; + uint16_t * sortInd; int32_t * ivtx_d; // this should be indexed with the original tracks, not the reduced set (oops) }; @@ -31,6 +32,7 @@ namespace pixelVertexHeterogeneousProduct { { } std::vector> z,zerr, chi2; + std::vector> sortInd; std::vector> ivtx; uint32_t nVertices=0; diff --git a/RecoPixelVertexing/PixelVertexFinding/src/PixelVertexHeterogeneousProducer.cc b/RecoPixelVertexing/PixelVertexFinding/src/PixelVertexHeterogeneousProducer.cc index 3451655a282d1..0b8c31235abea 100644 --- a/RecoPixelVertexing/PixelVertexFinding/src/PixelVertexHeterogeneousProducer.cc +++ b/RecoPixelVertexing/PixelVertexFinding/src/PixelVertexHeterogeneousProducer.cc @@ -117,18 +117,20 @@ void PixelVertexHeterogeneousProducer::acquireGPUCuda( // Second, make a collection of pointers to the tracks we want for the vertex finder // fill z,ez - std::vector z,ez; + std::vector z,ez2,pt2; assert(m_trks.empty()); for (unsigned int i=0; ipt() > 20 ? 20*20 : m_trks[k]->pt()*m_trks[k]->pt(); - } - // sort - std::sort(ind,ind+gpuProduct.nVertices,[&](int i, int j){ return pt2[i]>pt2[j];}); - if(gpuProduct.nVertices>1) assert(pt2[ind[0]]>=pt2[ind[1]]); // fill legacy data format - for (unsigned int j=0; j uind; // fort verifing index consistency + for (int j=int(gpuProduct.nVertices)-1; j>=0; --j) { + auto i = gpuProduct.sortInd[j]; // on gpu sorted in ascending order.... + assert(i>=0); + assert(i0); (*vertexes).emplace_back(reco::Vertex::Point(x,y,z), err, gpuProduct.chi2[i], nt-1, nt ); auto & v = (*vertexes).back(); - pt2[i]=0; for (auto k: itrk) { v.add(reco::TrackBaseRef(m_trks[k])); } itrk.clear(); } + assert(uind.size()==(*vertexes).size()); + if (!uind.empty()) { + assert(0 == *uind.begin()); + assert(uind.size()-1 == *uind.rbegin()); + } - if (verbose_) { edm::LogInfo("PixelVertexHeterogeneousProducer") << ": Found " << vertexes->size() << " vertexes\n"; for (unsigned int i=0; isize(); ++i) { diff --git a/RecoPixelVertexing/PixelVertexFinding/src/gpuClusterTracks.h b/RecoPixelVertexing/PixelVertexFinding/src/gpuClusterTracks.h index 7c97665ece061..8d4c4d967a6b8 100644 --- a/RecoPixelVertexing/PixelVertexFinding/src/gpuClusterTracks.h +++ b/RecoPixelVertexing/PixelVertexFinding/src/gpuClusterTracks.h @@ -7,10 +7,48 @@ #include #include "HeterogeneousCore/CUDAUtilities/interface/HistoContainer.h" +#include "HeterogeneousCore/CUDAUtilities/interface/radixSort.h" + #include "gpuVertexFinder.h" namespace gpuVertexFinder { + + + __global__ + void sortByPt2(int nt, + OnGPU * pdata + ) { + auto & __restrict__ data = *pdata; + float const * __restrict__ ptt2 = data.ptt2; + uint32_t const & nv = *data.nv; + + int32_t const * __restrict__ iv = data.iv; + float * __restrict__ ptv2 = data.ptv2; + uint16_t * __restrict__ sortInd = data.sortInd; + + if (nv<2) return; + + // can be done asynchronoisly at the end of previous event + for (int i = threadIdx.x; i < nv; i += blockDim.x) { + ptv2[i]=0; + } + __syncthreads(); + + + for (int i = threadIdx.x; i < nt; i += blockDim.x) { + if (iv[i]>9990) continue; + atomicAdd(&ptv2[iv[i]], ptt2[i]); + } + __syncthreads(); + + __shared__ uint16_t ws[1024]; + 
radixSort(ptv2,sortInd,ws,nv); + + assert(ptv2[sortInd[nv-1]]>=ptv2[sortInd[nv-2]]); + assert(ptv2[sortInd[1]]>=ptv2[sortInd[0]]); + } + // this algo does not really scale as it works in a single block... // enough for <10K tracks we have diff --git a/RecoPixelVertexing/PixelVertexFinding/src/gpuVertexFinder.cu b/RecoPixelVertexing/PixelVertexFinding/src/gpuVertexFinder.cu index cb39e003a098d..e1e346253ceb5 100644 --- a/RecoPixelVertexing/PixelVertexFinding/src/gpuVertexFinder.cu +++ b/RecoPixelVertexing/PixelVertexFinding/src/gpuVertexFinder.cu @@ -6,13 +6,16 @@ namespace gpuVertexFinder { void Producer::allocateOnGPU() { cudaCheck(cudaMalloc(&onGPU.zt, OnGPU::MAXTRACKS*sizeof(float))); cudaCheck(cudaMalloc(&onGPU.ezt2, OnGPU::MAXTRACKS*sizeof(float))); + cudaCheck(cudaMalloc(&onGPU.ptt2, OnGPU::MAXTRACKS*sizeof(float))); cudaCheck(cudaMalloc(&onGPU.iv, OnGPU::MAXTRACKS*sizeof(int32_t))); cudaCheck(cudaMalloc(&onGPU.nv, sizeof(uint32_t))); cudaCheck(cudaMalloc(&onGPU.zv, OnGPU::MAXVTX*sizeof(float))); cudaCheck(cudaMalloc(&onGPU.wv, OnGPU::MAXVTX*sizeof(float))); cudaCheck(cudaMalloc(&onGPU.chi2, OnGPU::MAXVTX*sizeof(float))); - + cudaCheck(cudaMalloc(&onGPU.ptv2, OnGPU::MAXVTX*sizeof(float))); + cudaCheck(cudaMalloc(&onGPU.sortInd, OnGPU::MAXVTX*sizeof(uint16_t))); + cudaCheck(cudaMalloc(&onGPU.izt, OnGPU::MAXTRACKS*sizeof(int8_t))); cudaCheck(cudaMalloc(&onGPU.nn, OnGPU::MAXTRACKS*sizeof(int32_t))); @@ -25,13 +28,15 @@ namespace gpuVertexFinder { void Producer::deallocateOnGPU() { cudaCheck(cudaFree(onGPU.zt)); cudaCheck(cudaFree(onGPU.ezt2)); + cudaCheck(cudaFree(onGPU.ptt2)); cudaCheck(cudaFree(onGPU.iv)); cudaCheck(cudaFree(onGPU.nv)); cudaCheck(cudaFree(onGPU.zv)); cudaCheck(cudaFree(onGPU.wv)); cudaCheck(cudaFree(onGPU.chi2)); - + cudaCheck(cudaFree(onGPU.ptv2)); + cudaCheck(cudaFree(onGPU.sortInd)); cudaCheck(cudaFree(onGPU.izt)); cudaCheck(cudaFree(onGPU.nn)); @@ -44,6 +49,7 @@ namespace gpuVertexFinder { void Producer::produce(cudaStream_t stream, float const * __restrict__ zt, float const * __restrict__ ezt2, + float const * __restrict__ ptt2, uint32_t ntrks ) { @@ -52,10 +58,15 @@ namespace gpuVertexFinder { cudaMemcpyHostToDevice,stream)); cudaCheck(cudaMemcpyAsync(onGPU.ezt2,ezt2,ntrks*sizeof(float), cudaMemcpyHostToDevice,stream)); + + cudaCheck(cudaMemcpyAsync(onGPU.ptt2,ptt2,ntrks*sizeof(float), + cudaMemcpyHostToDevice,stream)); + assert(onGPU_d); clusterTracks<<<1,1024,0,stream>>>(ntrks,onGPU_d,minT,eps,errmax,chi2max); - + sortByPt2<<<1,256,0,stream>>>(ntrks,onGPU_d); + cudaCheck(cudaMemcpyAsync(&gpuProduct.nVertices, onGPU.nv, sizeof(uint32_t), cudaMemcpyDeviceToHost, stream)); @@ -78,7 +89,11 @@ namespace gpuVertexFinder { gpuProduct.chi2.resize(gpuProduct.nVertices); cudaCheck(cudaMemcpyAsync(gpuProduct.chi2.data(),onGPU.chi2,sizeof(float)*gpuProduct.nVertices, cudaMemcpyDeviceToHost, stream)); - + + gpuProduct.sortInd.resize(gpuProduct.nVertices); + cudaCheck(cudaMemcpyAsync(gpuProduct.sortInd.data(),onGPU.sortInd,sizeof(uint16_t)*gpuProduct.nVertices, + cudaMemcpyDeviceToHost, stream)); + cudaStreamSynchronize(stream); return gpuProduct; diff --git a/RecoPixelVertexing/PixelVertexFinding/src/gpuVertexFinder.h b/RecoPixelVertexing/PixelVertexFinding/src/gpuVertexFinder.h index 1f90641e3c260..bea3db7338a86 100644 --- a/RecoPixelVertexing/PixelVertexFinding/src/gpuVertexFinder.h +++ b/RecoPixelVertexing/PixelVertexFinding/src/gpuVertexFinder.h @@ -13,17 +13,21 @@ namespace gpuVertexFinder { static constexpr uint32_t MAXTRACKS = 16000; static constexpr uint32_t 
MAXVTX= 1024; - float * zt; // input track z at bs + float * zt; // input track z at bs float * ezt2; // input error^2 on the above + float * ptt2; // input pt^2 on the above + float * zv; // output z-posistion of found vertices float * wv; // output weight (1/error^2) on the above float * chi2; // vertices chi2 + float * ptv2; // vertices pt^2 uint32_t * nv; // the number of vertices int32_t * iv; // vertex index for each associated track + uint16_t * sortInd; // sorted index (by pt2) // workspace - int8_t * izt; // interized z-position of input tracks (reused as interize pt2 of vertices for sorting) + int8_t * izt; // interized z-position of input tracks int32_t * nn; // number of nearest neighbours (reused as number of dof for output vertices) }; @@ -53,6 +57,7 @@ namespace gpuVertexFinder { void produce(cudaStream_t stream, float const * zt, float const * ezt2, + float const * ptt2, uint32_t ntrks ); diff --git a/RecoPixelVertexing/PixelVertexFinding/test/gpuVertexFinder_t.cu b/RecoPixelVertexing/PixelVertexFinding/test/gpuVertexFinder_t.cu index f47c4362503ae..840b8947e7220 100644 --- a/RecoPixelVertexing/PixelVertexFinding/test/gpuVertexFinder_t.cu +++ b/RecoPixelVertexing/PixelVertexFinding/test/gpuVertexFinder_t.cu @@ -13,13 +13,14 @@ struct Event { std::vector itrack; std::vector ztrack; std::vector eztrack; + std::vector pttrack; std::vector ivert; }; struct ClusterGenerator { explicit ClusterGenerator(float nvert, float ntrack) : - rgen(-13.,13), errgen(0.005,0.025), clusGen(nvert), trackGen(ntrack), gauss(0.,1.) + rgen(-13.,13), errgen(0.005,0.025), clusGen(nvert), trackGen(ntrack), gauss(0.,1.), ptGen(1.) {} void operator()(Event & ev) { @@ -42,6 +43,8 @@ struct ClusterGenerator { ev.ztrack.push_back(ev.zvert[iv]+err*gauss(reng)); ev.eztrack.push_back(err*err); ev.ivert.push_back(iv); + ev.pttrack.push_back( (iv==5? 
1.f:0.5f) + ptGen(reng) ); + ev.pttrack.back()*=ev.pttrack.back(); } } // add noise @@ -51,6 +54,8 @@ struct ClusterGenerator { ev.ztrack.push_back(rgen(reng)); ev.eztrack.push_back(err*err); ev.ivert.push_back(9999); + ev.pttrack.push_back( 0.5f + ptGen(reng) ); + ev.pttrack.back()*=ev.pttrack.back(); } } @@ -61,7 +66,7 @@ struct ClusterGenerator { std::poisson_distribution clusGen; std::poisson_distribution trackGen; std::normal_distribution gauss; - + std::exponential_distribution ptGen; }; @@ -79,9 +84,12 @@ int main() { auto zt_d = cuda::memory::device::make_unique(current_device, 64000); auto ezt2_d = cuda::memory::device::make_unique(current_device, 64000); + auto ptt2_d = cuda::memory::device::make_unique(current_device, 64000); auto zv_d = cuda::memory::device::make_unique(current_device, 256); auto wv_d = cuda::memory::device::make_unique(current_device, 256); auto chi2_d = cuda::memory::device::make_unique(current_device, 256); + auto ptv2_d = cuda::memory::device::make_unique(current_device, 256); + auto ind_d = cuda::memory::device::make_unique(current_device, 256); auto izt_d = cuda::memory::device::make_unique(current_device, 64000); auto nn_d = cuda::memory::device::make_unique(current_device, 64000); @@ -95,9 +103,12 @@ int main() { onGPU.zt = zt_d.get(); onGPU.ezt2 = ezt2_d.get(); + onGPU.ptt2 = ptt2_d.get(); onGPU.zv = zv_d.get(); onGPU.wv = wv_d.get(); onGPU.chi2 = chi2_d.get(); + onGPU.ptv2 = ptv2_d.get(); + onGPU.sortInd = ind_d.get(); onGPU.nv = nv_d.get(); onGPU.izt = izt_d.get(); onGPU.nn = nn_d.get(); @@ -123,6 +134,7 @@ int main() { cuda::memory::copy(onGPU.zt,ev.ztrack.data(),sizeof(float)*ev.ztrack.size()); cuda::memory::copy(onGPU.ezt2,ev.eztrack.data(),sizeof(float)*ev.eztrack.size()); + cuda::memory::copy(onGPU.ptt2,ev.pttrack.data(),sizeof(float)*ev.eztrack.size()); float eps = 0.1f; @@ -157,17 +169,25 @@ int main() { ); + cuda::launch(sortByPt2, + { 1, 256 }, + ev.ztrack.size(), onGPU_d.get() + ); uint32_t nv; cuda::memory::copy(&nv, onGPU.nv, sizeof(uint32_t)); float zv[nv]; float wv[nv]; float chi2[nv]; + float ptv2[nv]; int32_t nn[nv]; + uint16_t ind[nv]; cuda::memory::copy(&zv, onGPU.zv, nv*sizeof(float)); cuda::memory::copy(&wv, onGPU.wv, nv*sizeof(float)); cuda::memory::copy(&chi2, onGPU.chi2, nv*sizeof(float)); + cuda::memory::copy(&ptv2, onGPU.ptv2, nv*sizeof(float)); cuda::memory::copy(&nn, onGPU.nn, nv*sizeof(int32_t)); + cuda::memory::copy(&ind, onGPU.sortInd, nv*sizeof(uint16_t)); for (auto j=0U; j0) chi2[j]/=float(nn[j]); { @@ -178,7 +198,12 @@ int main() { auto mx = std::minmax_element(chi2,chi2+nv); std::cout << "min max chi2 " << *mx.first << ' ' << *mx.second << std::endl; } - + { + auto mx = std::minmax_element(ptv2,ptv2+nv); + std::cout << "min max ptv2 " << *mx.first << ' ' << *mx.second << std::endl; + std::cout << "min max ptv2 " << ptv2[ind[0]] << ' ' << ptv2[ind[nv-1]] << " at " << ind[0] << ' ' << ind[nv-1] << std::endl; + + } float dd[nv]; uint32_t ii=0; From f05df88e6a978e510458b0fcd306b858f8cec06d Mon Sep 17 00:00:00 2001 From: Vincenzo Innocente Date: Tue, 18 Sep 2018 14:37:22 +0200 Subject: [PATCH 10/36] make the number of bins configurable --- .../CUDAUtilities/interface/HistoContainer.h | 71 +++++++++++++++---- .../CUDAUtilities/test/BuildFile.xml | 1 + .../CUDAUtilities/test/HistoContainer_t.cpp | 42 +++++++---- .../CUDAUtilities/test/HistoContainer_t.cu | 2 +- .../plugins/gpuClustering.h | 17 ++--- .../SiPixelClusterizer/test/gpuClustering.cu | 41 +++++++++++ .../siPixelRecHitsHeterogeneousProduct.h | 2 +- 
.../PixelVertexFinding/src/gpuClusterTracks.h | 2 +- 8 files changed, 137 insertions(+), 41 deletions(-) diff --git a/HeterogeneousCore/CUDAUtilities/interface/HistoContainer.h b/HeterogeneousCore/CUDAUtilities/interface/HistoContainer.h index 94acd0015edf6..bd0f60a1d0122 100644 --- a/HeterogeneousCore/CUDAUtilities/interface/HistoContainer.h +++ b/HeterogeneousCore/CUDAUtilities/interface/HistoContainer.h @@ -31,7 +31,7 @@ namespace cudautils { template __global__ - void fillFromVector(Histo * h, uint32_t nh, T const * v, uint32_t * offsets) { + void fillFromVector(Histo * h, uint32_t nh, T const * __restrict__ v, uint32_t * offsets) { auto i = blockIdx.x * blockDim.x + threadIdx.x; if(i >= offsets[nh]) return; auto off = cuda_std::upper_bound(offsets, offsets + nh + 1, i); @@ -44,7 +44,7 @@ namespace cudautils { template __global__ - void fillFromVector(Histo * h, T const * v, uint32_t size) { + void fillFromVector(Histo * h, T const * __restrict__ v, uint32_t size) { auto i = blockIdx.x * blockDim.x + threadIdx.x; if(i < size) h->fill(v[i], i); } @@ -57,7 +57,7 @@ namespace cudautils { } template - void fillOneFromVector(Histo * h, T const * v, uint32_t size, int nthreads, cudaStream_t stream) { + void fillOneFromVector(Histo * h, T const * __restrict__ v, uint32_t size, int nthreads, cudaStream_t stream) { zero(h, 1, nthreads, stream); auto nblocks = (size + nthreads - 1) / nthreads; fillFromVector<<>>(h, v, size); @@ -65,7 +65,7 @@ namespace cudautils { } template - void fillManyFromVector(Histo * h, uint32_t nh, T const * v, uint32_t * offsets, uint32_t totSize, int nthreads, cudaStream_t stream) { + void fillManyFromVector(Histo * h, uint32_t nh, T const * __restrict__ v, uint32_t * offsets, uint32_t totSize, int nthreads, cudaStream_t stream) { zero(h, nh, nthreads, stream); auto nblocks = (totSize + nthreads - 1) / nthreads; fillFromVector<<>>(h, nh, v, offsets); @@ -80,15 +80,41 @@ namespace cudautils { // including spillBin template __host__ __device__ +__forceinline__ void forEachInBins(Hist const & hist, V value, int n, Func func) { int bs = hist.bin(value); int be = std::min(int(hist.nbins()),bs+n+1); bs = std::max(0,bs-n); - assert(be>bs); + // assert(be>bs); +// bool tbc=false; for (auto b=bs; b +__host__ __device__ +__forceinline__ +void forEachInWindow(Hist const & hist, V wmin, V wmax, Func const & func) { + auto bs = hist.bin(wmin); + auto be = hist.bin(wmax); + // be = std::min(int(hist.nbins()),be+1); + // bs = std::max(0,bs); + // assert(be>=bs); +// bool tbc=false; + for (auto b=bs; b<=be; ++b){ +// tbc |= hist.full(b); + for (auto pj=hist.begin(b);pj @@ -111,15 +137,31 @@ class HistoContainer { using index_type = I; using UT = typename std::make_unsigned::type; + + static constexpr uint32_t ilog2(uint32_t v) { + + constexpr uint32_t b[] = {0x2, 0xC, 0xF0, 0xFF00, 0xFFFF0000}; + constexpr uint32_t s[] = {1, 2, 4, 8, 16}; + + uint32_t r = 0; // result of log2(v) will go here + for (auto i = 4; i >= 0; i--) if (v & b[i]) { + v >>= s[i]; + r |= s[i]; + } + return r; + } + + static constexpr uint32_t sizeT() { return S; } - static constexpr uint32_t nbins() { return 1 << N; } - static constexpr uint32_t shift() { return sizeT() - N; } - static constexpr uint32_t mask() { return nbins() - 1; } + static constexpr uint32_t nbins() { return NBINS;} + static constexpr uint32_t nbits() { return ilog2(NBINS-1)+1;} static constexpr uint32_t binSize() { return 1 << M; } static constexpr uint32_t spillSize() { return 16 * binSize(); } static constexpr UT bin(T t) { - return (t >> 
shift()) & mask(); + constexpr uint32_t shift = sizeT() - nbits(); + constexpr uint32_t mask = (1<> shift) & mask; } void zero() { @@ -128,7 +170,8 @@ class HistoContainer { i = 0; } - static constexpr + static __host__ __device__ + __forceinline__ uint32_t atomicIncrement(Counter & x) { #ifdef __CUDA_ARCH__ return atomicAdd(&x, 1); @@ -152,11 +195,11 @@ class HistoContainer { } constexpr bool fullSpill() const { - return nspills >= spillSize(); + return nspills > spillSize(); } constexpr bool full(uint32_t b) const { - return n[b] >= binSize(); + return n[b] > binSize(); } constexpr auto const * begin(uint32_t b) const { diff --git a/HeterogeneousCore/CUDAUtilities/test/BuildFile.xml b/HeterogeneousCore/CUDAUtilities/test/BuildFile.xml index 9957266fc064b..794a7e519b21f 100644 --- a/HeterogeneousCore/CUDAUtilities/test/BuildFile.xml +++ b/HeterogeneousCore/CUDAUtilities/test/BuildFile.xml @@ -19,6 +19,7 @@ + diff --git a/HeterogeneousCore/CUDAUtilities/test/HistoContainer_t.cpp b/HeterogeneousCore/CUDAUtilities/test/HistoContainer_t.cpp index 61c561f96267d..03218d87a2208 100644 --- a/HeterogeneousCore/CUDAUtilities/test/HistoContainer_t.cpp +++ b/HeterogeneousCore/CUDAUtilities/test/HistoContainer_t.cpp @@ -6,41 +6,54 @@ #include #include -template +template void go() { std::mt19937 eng; - std::uniform_int_distribution rgen(std::numeric_limits::min(),std::numeric_limits::max()); + + int rmin=std::numeric_limits::min(); + int rmax=std::numeric_limits::max(); + if (NBINS!=128) { + rmin=0; + rmax=NBINS*2-1; + } + + + + std::uniform_int_distribution rgen(rmin,rmax); constexpr int N=12000; T v[N]; - using Hist = HistoContainer; - std::cout << "HistoContainer " << Hist::nbins() << ' ' << Hist::binSize() << std::endl; - + using Hist = HistoContainer; + std::cout << "HistoContainer " << Hist::nbits() << ' ' << Hist::nbins() << ' ' << Hist::binSize() << ' ' << (rmax-rmin)/Hist::nbins() << std::endl; + std::cout << "bins " << int(Hist::bin(0)) << ' ' << int(Hist::bin(rmin)) << ' ' << int(Hist::bin(rmax)) << std::endl; + Hist h; for (int it=0; it<5; ++it) { for (long long j = 0; j < N; j++) v[j]=rgen(eng); + if (it==2) for (long long j = N/2; j < N/2+2*Hist::binSize(); j++) v[j]=4; h.zero(); for (long long j = 0; j < N; j++) h.fill(v[j],j); - + std::cout << "nspills " << h.nspills << std::endl; - auto verify = [&](uint32_t i, uint32_t k, uint32_t t1, uint32_t t2) { + auto verify = [&](uint32_t i, uint32_t j, uint32_t k, uint32_t t1, uint32_t t2) { assert(t1=i); } // std::cout << kl << ' ' << kh << std::endl; - for (auto j=h.begin(kl); j(); + go(); + go(); + return 0; } diff --git a/HeterogeneousCore/CUDAUtilities/test/HistoContainer_t.cu b/HeterogeneousCore/CUDAUtilities/test/HistoContainer_t.cu index 73e9f20ae589f..98c36492bacdc 100644 --- a/HeterogeneousCore/CUDAUtilities/test/HistoContainer_t.cu +++ b/HeterogeneousCore/CUDAUtilities/test/HistoContainer_t.cu @@ -34,7 +34,7 @@ void go() { constexpr uint32_t partSize = N/nParts; uint32_t offsets[nParts+1]; - using Hist = HistoContainer; + using Hist = HistoContainer; std::cout << "HistoContainer " << Hist::nbins() << ' ' << Hist::binSize() << ' ' << (std::numeric_limits::max()-std::numeric_limits::min())/Hist::nbins() << std::endl; Hist h[nParts]; diff --git a/RecoLocalTracker/SiPixelClusterizer/plugins/gpuClustering.h b/RecoLocalTracker/SiPixelClusterizer/plugins/gpuClustering.h index 16df3abdf8df9..973771e13ac01 100644 --- a/RecoLocalTracker/SiPixelClusterizer/plugins/gpuClustering.h +++ 
b/RecoLocalTracker/SiPixelClusterizer/plugins/gpuClustering.h @@ -6,6 +6,7 @@ #include #include "gpuClusteringConstants.h" +#include "Geometry/TrackerGeometryBuilder/interface/phase1PixelTopology.h" #include "HeterogeneousCore/CUDAUtilities/interface/HistoContainer.h" @@ -75,8 +76,9 @@ namespace gpuClustering { } } - //init hist (ymax < 512) - __shared__ HistoContainer hist; + //init hist (ymax=416 < 512 : 9bits) + constexpr auto nbins = phase1PixelTopology::numColsInModule/2+2; + __shared__ HistoContainer hist; hist.nspills = 0; for (auto k = threadIdx.x; k0 ? y[i]-1 : 0); - auto be = hist.bin(y[i]+1)+1; auto loop = [&](int j) { j+=firstPixel; if (i>=j or j>jm or @@ -145,12 +145,7 @@ namespace gpuClustering { // update the loop boundary for the next iteration jmax[k] = std::max(j + 1,jmax[k]); }; - for (auto b=bs; b0 ? y[i]-1 :0 ,y[i]+1,loop); } // pixel loop } // end active } // end while diff --git a/RecoLocalTracker/SiPixelClusterizer/test/gpuClustering.cu b/RecoLocalTracker/SiPixelClusterizer/test/gpuClustering.cu index af20ae17c745a..3e373663bab85 100644 --- a/RecoLocalTracker/SiPixelClusterizer/test/gpuClustering.cu +++ b/RecoLocalTracker/SiPixelClusterizer/test/gpuClustering.cu @@ -77,6 +77,47 @@ int main(void) h_y[n]=x; h_adc[n]= kn==0 ? 100 : 5000; ++n; + + // first column + ++ncl; + h_id[n]=id; + h_x[n]=x; + h_y[n]=0; + h_adc[n]= 5000; + ++n; + // first columns + ++ncl; + h_id[n]=id; + h_x[n]=x+80; + h_y[n]=2; + h_adc[n]= 5000; + ++n; + h_id[n]=id; + h_x[n]=x+80; + h_y[n]=1; + h_adc[n]= 5000; + ++n; + + // last column + ++ncl; + h_id[n]=id; + h_x[n]=x; + h_y[n]=415; + h_adc[n]= 5000; + ++n; + // last columns + ++ncl; + h_id[n]=id; + h_x[n]=x+80; + h_y[n]=415; + h_adc[n]= 2500; + ++n; + h_id[n]=id; + h_x[n]=x+80; + h_y[n]=414; + h_adc[n]= 2500; + ++n; + // diagonal ++ncl; for (int x=20; x<25; ++x) { diff --git a/RecoLocalTracker/SiPixelRecHits/plugins/siPixelRecHitsHeterogeneousProduct.h b/RecoLocalTracker/SiPixelRecHits/plugins/siPixelRecHitsHeterogeneousProduct.h index 778b763b28cff..a8b97e988e5e6 100644 --- a/RecoLocalTracker/SiPixelRecHits/plugins/siPixelRecHitsHeterogeneousProduct.h +++ b/RecoLocalTracker/SiPixelRecHits/plugins/siPixelRecHitsHeterogeneousProduct.h @@ -32,7 +32,7 @@ namespace siPixelRecHitsHeterogeneousProduct { uint16_t * mr_d; uint16_t * mc_d; - using Hist = HistoContainer; + using Hist = HistoContainer; Hist * hist_d; HitsOnGPU const * me_d = nullptr; diff --git a/RecoPixelVertexing/PixelVertexFinding/src/gpuClusterTracks.h b/RecoPixelVertexing/PixelVertexFinding/src/gpuClusterTracks.h index 8d4c4d967a6b8..062c802ed83a4 100644 --- a/RecoPixelVertexing/PixelVertexFinding/src/gpuClusterTracks.h +++ b/RecoPixelVertexing/PixelVertexFinding/src/gpuClusterTracks.h @@ -80,7 +80,7 @@ namespace gpuVertexFinder { assert(pdata); assert(zt); - __shared__ HistoContainer hist; + __shared__ HistoContainer hist; // if(0==threadIdx.x) printf("params %d %f\n",minT,eps); // if(0==threadIdx.x) printf("booked hist with %d bins, size %d for %d tracks\n",hist.nbins(),hist.binSize(),nt); From 304c6077da0a991c9b1a84e351679f56cd704f79 Mon Sep 17 00:00:00 2001 From: Vincenzo Innocente Date: Tue, 18 Sep 2018 16:03:09 +0200 Subject: [PATCH 11/36] fix race --- .../CUDAUtilities/interface/HistoContainer.h | 46 ++++++++++++------- .../CUDAUtilities/test/HistoContainer_t.cpp | 2 +- .../CUDAUtilities/test/HistoContainer_t.cu | 4 +- .../plugins/gpuClusterChargeCut.h | 1 + .../plugins/gpuClustering.h | 4 +- .../PixelTriplets/plugins/gpuPixelDoublets.h | 6 +-- 
.../PixelVertexFinding/src/gpuClusterTracks.h | 3 +- 7 files changed, 39 insertions(+), 27 deletions(-) diff --git a/HeterogeneousCore/CUDAUtilities/interface/HistoContainer.h b/HeterogeneousCore/CUDAUtilities/interface/HistoContainer.h index bd0f60a1d0122..e7bcaf3422945 100644 --- a/HeterogeneousCore/CUDAUtilities/interface/HistoContainer.h +++ b/HeterogeneousCore/CUDAUtilities/interface/HistoContainer.h @@ -20,11 +20,10 @@ namespace cudautils { __global__ void zeroMany(Histo * h, uint32_t nh) { auto i = blockIdx.x * blockDim.x + threadIdx.x; - auto ih = i / Histo::nbins(); - auto k = i - ih * Histo::nbins(); + auto ih = i / Histo::totbins(); + auto k = i - ih * Histo::totbins(); if (ih < nh) { - h[ih].nspills = 0; - if (k < Histo::nbins()) + if (k < Histo::totbins()) h[ih].n[k] = 0; } } @@ -51,7 +50,7 @@ namespace cudautils { template void zero(Histo * h, uint32_t nh, int nthreads, cudaStream_t stream) { - auto nblocks = (nh * Histo::nbins() + nthreads - 1) / nthreads; + auto nblocks = (nh * Histo::totbins() + nthreads - 1) / nthreads; zeroMany<<>>(h, nh); cudaCheck(cudaGetLastError()); } @@ -154,6 +153,7 @@ class HistoContainer { static constexpr uint32_t sizeT() { return S; } static constexpr uint32_t nbins() { return NBINS;} + static constexpr uint32_t totbins() { return NBINS+1;} // including spillbin static constexpr uint32_t nbits() { return ilog2(NBINS-1)+1;} static constexpr uint32_t binSize() { return 1 << M; } static constexpr uint32_t spillSize() { return 16 * binSize(); } @@ -165,7 +165,6 @@ class HistoContainer { } void zero() { - nspills = 0; for (auto & i : n) i = 0; } @@ -181,6 +180,7 @@ class HistoContainer { } __host__ __device__ + __forceinline__ void fill(T t, index_type j) { UT b = bin(t); assert(b spillSize(); + return nspills() > spillSize(); } constexpr bool full(uint32_t b) const { @@ -211,21 +216,30 @@ class HistoContainer { } constexpr auto size(uint32_t b) const { - return n[b]; + return uint32_t(n[b]); + } + + constexpr auto const * spillBin() const { + return bins + nbins()*binSize(); } constexpr auto const * beginSpill() const { - return spillBin; + return spillBin(); } constexpr auto const * endSpill() const { - return beginSpill() + std::min(spillSize(), uint32_t(nspills)); + return beginSpill() + std::min(spillSize(), uint32_t(nspills())); } - Counter n[nbins()]; - Counter nspills; - index_type bins[nbins()*binSize()]; - index_type spillBin[spillSize()]; + Counter n[nbins()+1]; // last is the spill bin + index_type bins[nbins()*binSize()+spillSize()]; +}; + +// a compactified version of above resuing the very same space +template +class CompactHistoContainer { + + }; #endif // HeterogeneousCore_CUDAUtilities_HistoContainer_h diff --git a/HeterogeneousCore/CUDAUtilities/test/HistoContainer_t.cpp b/HeterogeneousCore/CUDAUtilities/test/HistoContainer_t.cpp index 03218d87a2208..190a4dc18e3fa 100644 --- a/HeterogeneousCore/CUDAUtilities/test/HistoContainer_t.cpp +++ b/HeterogeneousCore/CUDAUtilities/test/HistoContainer_t.cpp @@ -36,7 +36,7 @@ void go() { h.zero(); for (long long j = 0; j < N; j++) h.fill(v[j],j); - std::cout << "nspills " << h.nspills << std::endl; + std::cout << "nspills " << h.nspills() << std::endl; auto verify = [&](uint32_t i, uint32_t j, uint32_t k, uint32_t t1, uint32_t t2) { assert(t1=nm)) { + if (h[j].nspills()==0 && !(tot>=nm)) { std::cout << "too bad " << j << ' ' << i <<' ' << me << '/'<< T(me-window)<< '/'<< T(me+window) << ": " << kl << '/' << kh << ' '<< khh << ' '<< tot<<'/'< hist; - hist.nspills = 0; - for (auto k = 
threadIdx.x; k 0) printf("OuterHitOfCell full for %d in layer %d/%d, %d:%d %d,%d\n", i, inner, outer, kl, kh, nmin, tot); - if (hist[outer].nspills > 0) - printf("spill bin to be checked in %d %d\n", outer, hist[outer].nspills); + if (hist[outer].nspills() > 0) + printf("spill bin to be checked in %d %d\n", outer, hist[outer].nspills()); - // if (0==hist[outer].nspills) assert(tot>=nmin); + // if (0==hist[outer].nspills()) assert(tot>=nmin); // look in spill bin as well.... } // loop in block... diff --git a/RecoPixelVertexing/PixelVertexFinding/src/gpuClusterTracks.h b/RecoPixelVertexing/PixelVertexFinding/src/gpuClusterTracks.h index 062c802ed83a4..54121d5babfb5 100644 --- a/RecoPixelVertexing/PixelVertexFinding/src/gpuClusterTracks.h +++ b/RecoPixelVertexing/PixelVertexFinding/src/gpuClusterTracks.h @@ -86,8 +86,7 @@ namespace gpuVertexFinder { // if(0==threadIdx.x) printf("booked hist with %d bins, size %d for %d tracks\n",hist.nbins(),hist.binSize(),nt); // zero hist - hist.nspills = 0; - for (auto k = threadIdx.x; k Date: Wed, 19 Sep 2018 15:31:14 +0200 Subject: [PATCH 12/36] compactify the histogram (to make itaration faster?) --- .../CUDAUtilities/interface/HistoContainer.h | 89 +++++++++++++++++++ .../CUDAUtilities/test/BuildFile.xml | 5 ++ .../CUDAUtilities/test/prefixScan_t.cu | 6 +- 3 files changed, 97 insertions(+), 3 deletions(-) diff --git a/HeterogeneousCore/CUDAUtilities/interface/HistoContainer.h b/HeterogeneousCore/CUDAUtilities/interface/HistoContainer.h index e7bcaf3422945..d85bc761df151 100644 --- a/HeterogeneousCore/CUDAUtilities/interface/HistoContainer.h +++ b/HeterogeneousCore/CUDAUtilities/interface/HistoContainer.h @@ -10,6 +10,9 @@ #endif // __CUDA_ARCH__ #include "HeterogeneousCore/CUDAUtilities/interface/cudastdAlgorithm.h" +#ifdef __CUDACC__ +#include "HeterogeneousCore/CUDAUtilities/interface/prefixScan.h" +#endif #include "HeterogeneousCore/CUDAUtilities/interface/cudaCheck.h" @@ -119,6 +122,23 @@ void forEachInWindow(Hist const & hist, V wmin, V wmax, Func const & func) { } +// same as above but for compactified histos +template +__host__ __device__ +__forceinline__ +void forEachInWindowCompact(Hist const & hist, V wmin, V wmax, Func const & func) { + auto bs = hist.getH().bin(wmin); + auto be = hist.getH().bin(wmax); + assert(be>=bs); + for (auto pj=hist.begin(bs);pj class CompactHistoContainer { +public: + + using index_type = typename H::index_type; + + static constexpr auto wsSize() { return std::max(H::spillSize(),32U);} + __host__ __device__ + __forceinline__ + H & getH() { return histo;} + + __host__ __device__ + __forceinline__ + H const & getH() const { return histo;} + + +#ifdef __CUDACC__ + __device__ + __forceinline__ + void compactify(typename H::Counter * ws) { + auto & h = histo; + // fix size + for (auto j=threadIdx.x; j=0); + assert(b<=h.begin(i)-h.begin(0)); + if (i + + + + + diff --git a/HeterogeneousCore/CUDAUtilities/test/prefixScan_t.cu b/HeterogeneousCore/CUDAUtilities/test/prefixScan_t.cu index 3e8645439a28f..e109f6672bc32 100644 --- a/HeterogeneousCore/CUDAUtilities/test/prefixScan_t.cu +++ b/HeterogeneousCore/CUDAUtilities/test/prefixScan_t.cu @@ -13,9 +13,9 @@ void testPrefixScan(uint32_t size) { blockPrefixScan(c, size, ws); assert(1==c[0]); - for (auto i=first+1; i Date: Thu, 20 Sep 2018 12:14:08 +0200 Subject: [PATCH 13/36] not faster, at least avoid spill --- .../interface/FancyHistoContainer.h | 172 ++++++++++++++++++ .../CUDAUtilities/interface/HistoContainer.h | 6 +- .../CUDAUtilities/test/HistoContainer_t.cpp | 26 ++- 
.../SiPixelClusterizer/test/BuildFile.xml | 4 +- .../SiPixelClusterizer/test/gpuClustering.cu | 4 +- 5 files changed, 195 insertions(+), 17 deletions(-) create mode 100644 HeterogeneousCore/CUDAUtilities/interface/FancyHistoContainer.h diff --git a/HeterogeneousCore/CUDAUtilities/interface/FancyHistoContainer.h b/HeterogeneousCore/CUDAUtilities/interface/FancyHistoContainer.h new file mode 100644 index 0000000000000..f5e669ca3d7ef --- /dev/null +++ b/HeterogeneousCore/CUDAUtilities/interface/FancyHistoContainer.h @@ -0,0 +1,172 @@ +#ifndef HeterogeneousCore_CUDAUtilities_HistoContainer_h +#define HeterogeneousCore_CUDAUtilities_HistoContainer_h + +#include +#include +#include +#include +#ifndef __CUDA_ARCH__ +#include +#endif // __CUDA_ARCH__ + +#include "HeterogeneousCore/CUDAUtilities/interface/cudastdAlgorithm.h" +#ifdef __CUDACC__ +#include "HeterogeneousCore/CUDAUtilities/interface/prefixScan.h" +#endif +#include "HeterogeneousCore/CUDAUtilities/interface/cudaCheck.h" + + +#ifdef __CUDACC__ +namespace cudautils { + + template + __global__ + void zeroMany(Histo * h, uint32_t nh) { + auto i = blockIdx.x * blockDim.x + threadIdx.x; + auto ih = i / Histo::totbins(); + auto k = i - ih * Histo::totbins(); + if (ih < nh) { + if (k < Histo::totbins()) + h[ih].n[k] = 0; + } + } + +} // namespace cudautils +#endif + + +// iteratate over N bins left and right of the one containing "v" +template +__host__ __device__ +__forceinline__ +void forEachInBins(Hist const & hist, V value, int n, Func func) { + int bs = Hist::bin(value); + int be = std::min(int(Hist::nbins()-1),bs+n); + bs = std::max(0,bs-n); + assert(be>=bs); + for (auto pj=hist.begin(bs);pj +__host__ __device__ +__forceinline__ +void forEachInWindow(Hist const & hist, V wmin, V wmax, Func const & func) { + auto bs = Hist::bin(wmin); + auto be = Hist::bin(wmax); + assert(be>=bs); + for (auto pj=hist.begin(bs);pj +class HistoContainer { +public: +#ifdef __CUDACC__ + using Counter = uint32_t; +#else + using Counter = std::atomic; +#endif + + using index_type = I; + using UT = typename std::make_unsigned::type; + + static constexpr uint32_t ilog2(uint32_t v) { + + constexpr uint32_t b[] = {0x2, 0xC, 0xF0, 0xFF00, 0xFFFF0000}; + constexpr uint32_t s[] = {1, 2, 4, 8, 16}; + + uint32_t r = 0; // result of log2(v) will go here + for (auto i = 4; i >= 0; i--) if (v & b[i]) { + v >>= s[i]; + r |= s[i]; + } + return r; + } + + + static constexpr uint32_t sizeT() { return S; } + static constexpr uint32_t nbins() { return NBINS;} + static constexpr uint32_t totbins() { return NBINS+1;} + static constexpr uint32_t nbits() { return ilog2(NBINS-1)+1;} + static constexpr uint32_t capacity() { return SIZE; } + + static constexpr UT bin(T t) { + constexpr uint32_t shift = sizeT() - nbits(); + constexpr uint32_t mask = (1<> shift) & mask; + } + + void zero() { + for (auto & i : off) + i = 0; + } + + static __host__ __device__ + __forceinline__ + uint32_t atomicIncrement(Counter & x) { + #ifdef __CUDA_ARCH__ + return atomicAdd(&x, 1); + #else + return x++; + #endif + } + + __host__ __device__ + __forceinline__ + void count(T t) { + UT b = bin(t); + assert(b=0); - assert(b<=h.begin(i)-h.begin(0)); - if (i=0); +// assert(b<=h.begin(i)-h.begin(0)); +// if (i #include @@ -25,18 +25,25 @@ void go() { constexpr int N=12000; T v[N]; - using Hist = HistoContainer; - std::cout << "HistoContainer " << Hist::nbits() << ' ' << Hist::nbins() << ' ' << Hist::binSize() << ' ' << (rmax-rmin)/Hist::nbins() << std::endl; + using Hist = HistoContainer; + std::cout << 
"HistoContainer " << Hist::nbits() << ' ' << Hist::nbins() << ' ' << Hist::capacity() << ' ' << (rmax-rmin)/Hist::nbins() << std::endl; std::cout << "bins " << int(Hist::bin(0)) << ' ' << int(Hist::bin(rmin)) << ' ' << int(Hist::bin(rmax)) << std::endl; Hist h; + typename Hist::Counter ws[Hist::totbins()]; for (int it=0; it<5; ++it) { for (long long j = 0; j < N; j++) v[j]=rgen(eng); - if (it==2) for (long long j = N/2; j < N/2+2*Hist::binSize(); j++) v[j]=4; + if (it==2) for (long long j = N/2; j < N/2+N/4; j++) v[j]=4; h.zero(); - for (long long j = 0; j < N; j++) h.fill(v[j],j); - - std::cout << "nspills " << h.nspills() << std::endl; + assert(h.size()==0); + for (auto & i: ws) i=0; + for (long long j = 0; j < N; j++) h.count(v[j]); + h.finalize(); + assert(h.off[0]==0); + assert(h.size()==N); + for (long long j = 0; j < N; j++) h.fill(v[j],j,ws); + assert(h.size()==N); + auto verify = [&](uint32_t i, uint32_t j, uint32_t k, uint32_t t1, uint32_t t2) { assert(t1 - - + + diff --git a/RecoLocalTracker/SiPixelClusterizer/test/gpuClustering.cu b/RecoLocalTracker/SiPixelClusterizer/test/gpuClustering.cu index 3e373663bab85..f517a7e7731bd 100644 --- a/RecoLocalTracker/SiPixelClusterizer/test/gpuClustering.cu +++ b/RecoLocalTracker/SiPixelClusterizer/test/gpuClustering.cu @@ -11,7 +11,7 @@ #include -#include "RecoLocalTracker/SiPixelClusterizer/plugins/gpuClustering.h" +#include "RecoLocalTracker/SiPixelClusterizer/plugins/gpuFancyClustering.h" #include "RecoLocalTracker/SiPixelClusterizer/plugins/gpuClusterChargeCut.h" int main(void) @@ -271,7 +271,7 @@ int main(void) cuda::memory::copy(&nclus,d_clusInModule.get(),MaxNumModules*sizeof(uint32_t)); std::cout << "before charge cut found " << std::accumulate(nclus,nclus+MaxNumModules,0) << " clusters" << std::endl; for (auto i=MaxNumModules; i>0; i--) if (nclus[i-1]>0) {std::cout << "last module is " << i-1 << ' ' << nclus[i-1] << std::endl; break;} - + if (ncl!=std::accumulate(nclus,nclus+MaxNumModules,0)) std::cout << "ERROR!!!!! 
wrong number of cluster found" << std::endl; cuda::launch( clusterChargeCut, From 81a15e6e5a2560739130d55723c0c3e56c1060db Mon Sep 17 00:00:00 2001 From: Vincenzo Innocente Date: Thu, 20 Sep 2018 12:14:46 +0200 Subject: [PATCH 14/36] will need some cleanup --- .../test/NaiveHistoContainer_t.cpp | 97 ++++++++ .../CUDAUtilities/test/OneHistoContainer_t.cu | 147 +++++++++++ .../plugins/gpuFancyClustering.h | 232 ++++++++++++++++++ 3 files changed, 476 insertions(+) create mode 100644 HeterogeneousCore/CUDAUtilities/test/NaiveHistoContainer_t.cpp create mode 100644 HeterogeneousCore/CUDAUtilities/test/OneHistoContainer_t.cu create mode 100644 RecoLocalTracker/SiPixelClusterizer/plugins/gpuFancyClustering.h diff --git a/HeterogeneousCore/CUDAUtilities/test/NaiveHistoContainer_t.cpp b/HeterogeneousCore/CUDAUtilities/test/NaiveHistoContainer_t.cpp new file mode 100644 index 0000000000000..190a4dc18e3fa --- /dev/null +++ b/HeterogeneousCore/CUDAUtilities/test/NaiveHistoContainer_t.cpp @@ -0,0 +1,97 @@ +#include "HeterogeneousCore/CUDAUtilities/interface/HistoContainer.h" + +#include +#include +#include +#include +#include + +template +void go() { + std::mt19937 eng; + + int rmin=std::numeric_limits::min(); + int rmax=std::numeric_limits::max(); + if (NBINS!=128) { + rmin=0; + rmax=NBINS*2-1; + } + + + + std::uniform_int_distribution rgen(rmin,rmax); + + + constexpr int N=12000; + T v[N]; + + using Hist = HistoContainer; + std::cout << "HistoContainer " << Hist::nbits() << ' ' << Hist::nbins() << ' ' << Hist::binSize() << ' ' << (rmax-rmin)/Hist::nbins() << std::endl; + std::cout << "bins " << int(Hist::bin(0)) << ' ' << int(Hist::bin(rmin)) << ' ' << int(Hist::bin(rmax)) << std::endl; + + Hist h; + for (int it=0; it<5; ++it) { + for (long long j = 0; j < N; j++) v[j]=rgen(eng); + if (it==2) for (long long j = N/2; j < N/2+2*Hist::binSize(); j++) v[j]=4; + h.zero(); + for (long long j = 0; j < N; j++) h.fill(v[j],j); + + std::cout << "nspills " << h.nspills() << std::endl; + + auto verify = [&](uint32_t i, uint32_t j, uint32_t k, uint32_t t1, uint32_t t2) { + assert(t1=i); } + // std::cout << kl << ' ' << kh << std::endl; + for (auto j=h.begin(kl); j=0 && k=0) rtot += h.end(bm)-h.begin(bm); + assert(tot==rtot); + w=2; tot=0; + forEachInBins(h,v[j],w,ftest); + bp++; + bm--; + if (bp=0) rtot += h.end(bm)-h.begin(bm); + assert(tot==rtot); + } + +} + +int main() { + go(); + go(); + go(); + + + return 0; +} diff --git a/HeterogeneousCore/CUDAUtilities/test/OneHistoContainer_t.cu b/HeterogeneousCore/CUDAUtilities/test/OneHistoContainer_t.cu new file mode 100644 index 0000000000000..9d520929a20c0 --- /dev/null +++ b/HeterogeneousCore/CUDAUtilities/test/OneHistoContainer_t.cu @@ -0,0 +1,147 @@ +#include "HeterogeneousCore/CUDAUtilities/interface/FancyHistoContainer.h" + +#include +#include +#include +#include +#include + +#include + +template +__global__ +void mykernel(T const * __restrict__ v, uint32_t N) { + + assert(v); + assert(N==12000); + + if (threadIdx.x==0) printf("start kernel for %d data\n",N); + + using Hist = HistoContainer; + constexpr auto wss = Hist::totbins(); + + if (threadIdx.x==0) printf("ws size %d\n",wss); + + __shared__ Hist hist; + __shared__ typename Hist::Counter ws[wss]; + + for (auto j=threadIdx.x; j=k1); + } + + for (auto i=threadIdx.x; i=0 && k::max(); + vm = std::max(vm, 0); + vm = std::min(vm,vmax); + vp = std::min(vp,vmax); + vp = std::max(vp, 0); + assert(vp>=vm); + forEachInWindow(hist, vm,vp, ftest); + int bp = Hist::bin(vp); + int bm = Hist::bin(vm); + rtot = 
hist.end(bp)-hist.begin(bm); + assert(tot==rtot); + } + + +} + +template +void go() { + + if (cuda::device::count() == 0) { + std::cerr << "No CUDA devices on this system" << "\n"; + exit(EXIT_FAILURE); + } + + auto current_device = cuda::device::current::get(); + + + std::mt19937 eng; + + int rmin=std::numeric_limits::min(); + int rmax=std::numeric_limits::max(); + if (NBINS!=128) { + rmin=0; + rmax=NBINS*2-1; + } + + + + std::uniform_int_distribution rgen(rmin,rmax); + + + constexpr int N=12000; + T v[N]; + + auto v_d = cuda::memory::device::make_unique(current_device, N); + assert(v_d.get()); + + using Hist = HistoContainer; + std::cout << "HistoContainer " << Hist::nbits() << ' ' << Hist::nbins() << ' ' << Hist::capacity() << ' ' << (rmax-rmin)/Hist::nbins() << std::endl; + std::cout << "bins " << int(Hist::bin(0)) << ' ' << int(Hist::bin(rmin)) << ' ' << int(Hist::bin(rmax)) << std::endl; + + for (int it=0; it<5; ++it) { + for (long long j = 0; j < N; j++) v[j]=rgen(eng); + if (it==2) for (long long j = N/2; j < N/2+N/4; j++) v[j]=4; + + assert(v_d.get()); + assert(v); + cuda::memory::copy(v_d.get(), v, N*sizeof(T)); + assert(v_d.get()); + cuda::launch(mykernel,{1,256},v_d.get(),N); + } + +} + +int main() { + go(); + go(); + go(); + + + return 0; +} diff --git a/RecoLocalTracker/SiPixelClusterizer/plugins/gpuFancyClustering.h b/RecoLocalTracker/SiPixelClusterizer/plugins/gpuFancyClustering.h new file mode 100644 index 0000000000000..a4d5c0f7a864a --- /dev/null +++ b/RecoLocalTracker/SiPixelClusterizer/plugins/gpuFancyClustering.h @@ -0,0 +1,232 @@ +#ifndef RecoLocalTracker_SiPixelClusterizer_plugins_gpuClustering_h +#define RecoLocalTracker_SiPixelClusterizer_plugins_gpuClustering_h + + +// #define CLUS_LIMIT_LOOP + +#include +#include +#include + +#include "gpuClusteringConstants.h" +#include "Geometry/TrackerGeometryBuilder/interface/phase1PixelTopology.h" + +#include "HeterogeneousCore/CUDAUtilities/interface/FancyHistoContainer.h" + +namespace gpuClustering { + + __global__ void countModules(uint16_t const * __restrict__ id, + uint32_t * __restrict__ moduleStart, + int32_t * __restrict__ clusterId, + int numElements) + { + int i = blockDim.x * blockIdx.x + threadIdx.x; + if (i >= numElements) + return; + clusterId[i] = i; + if (InvId == id[i]) + return; + auto j = i - 1; + while (j >= 0 and id[j] == InvId) + --j; + if (j < 0 or id[j] != id[i]) { + // boundary... 
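      // editor's note: when the nearest preceding valid pixel belongs to a different
      // module (or there is none), this pixel opens a new module. atomicInc on
      // moduleStart[0] (bounded by MaxNumModules) reserves a slot, and
      // moduleStart[1 + loc] records the index of the module's first pixel, so that
      // moduleStart[0] ends up holding the number of modules found.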
+ auto loc = atomicInc(moduleStart, MaxNumModules); + moduleStart[loc + 1] = i; + } + } + + __global__ void findClus(uint16_t const * __restrict__ id, // module id of each pixel + uint16_t const * __restrict__ x, // local coordinates of each pixel + uint16_t const * __restrict__ y, // + uint32_t const * __restrict__ moduleStart, // index of the first pixel of each module + uint32_t * __restrict__ nClustersInModule, // output: number of clusters found in each module + uint32_t * __restrict__ moduleId, // output: module id of each module + int32_t * __restrict__ clusterId, // output: cluster id of each pixel + int numElements) + { + + if (blockIdx.x >= moduleStart[0]) + return; + + auto firstPixel = moduleStart[1 + blockIdx.x]; + auto thisModuleId = id[firstPixel]; + assert(thisModuleId < MaxNumModules); + +#ifdef GPU_DEBUG + if (thisModuleId % 100 == 1) + if (threadIdx.x == 0) + printf("start clusterizer for module %d in block %d\n", thisModuleId, blockIdx.x); +#endif + + auto first = firstPixel + threadIdx.x; + + // find the index of the first pixel not belonging to this module (or invalid) + __shared__ int msize; + msize = numElements; + __syncthreads(); + + // skip threads not associated to an existing pixel + for (int i = first; i < numElements; i += blockDim.x) { + if (id[i] == InvId) // skip invalid pixels + continue; + if (id[i] != thisModuleId) { // find the first pixel in a different module + atomicMin(&msize, i); + break; + } + } + + //init hist (ymax=416 < 512 : 9bits) + constexpr uint32_t maxPixInModule = 4000; + constexpr auto nbins = phase1PixelTopology::numColsInModule/2+2; + using Hist = HistoContainer; + constexpr auto wss = Hist::totbins(); + __shared__ Hist hist; + __shared__ typename Hist::Counter ws[wss]; + for (auto j=threadIdx.x; j 1 or + std::abs(int(y[m]) - int(y[i])) > 1) return; + auto old = atomicMin(&clusterId[m], clusterId[i]); + if (old != clusterId[i]) { + // end the loop only if no changes were applied + done = false; + } + atomicMin(&clusterId[i], old); +#ifdef CLUS_LIMIT_LOOP + // update the loop boundary for the next iteration + jmax[k] = std::max(kk + 1,jmax[k]); +#endif + }; + ++p; + for (;p= 0) { + // mark each pixel in a cluster with the same id as the first one + clusterId[i] = clusterId[clusterId[i]]; + } + } + __syncthreads(); + + // adjust the cluster id to be a positive value starting from 0 + for (int i = first; i < msize; i += blockDim.x) { + if (id[i] == InvId) { // skip invalid pixels + clusterId[i] = -9999; + continue; + } + clusterId[i] = - clusterId[i] - 1; + } + __syncthreads(); + + if (threadIdx.x == 0) { + nClustersInModule[thisModuleId] = foundClusters; + moduleId[blockIdx.x] = thisModuleId; +#ifdef GPU_DEBUG + if (thisModuleId % 100 == 1) + if (threadIdx.x == 0) + printf("%d clusters in module %d\n", foundClusters, thisModuleId); +#endif + } + } + +} // namespace gpuClustering + +#endif // RecoLocalTracker_SiPixelClusterizer_plugins_gpuClustering_h From 8eb1210d815a835de36d9651ebf272ec6c774a9b Mon Sep 17 00:00:00 2001 From: Vincenzo Innocente Date: Thu, 20 Sep 2018 13:13:28 +0200 Subject: [PATCH 15/36] clean hist header --- .../CUDAUtilities/interface/HistoContainer.h | 96 ------------------- .../plugins/gpuFancyClustering.h | 4 +- .../SiPixelClusterizer/test/gpuClustering.cu | 5 +- 3 files changed, 5 insertions(+), 100 deletions(-) diff --git a/HeterogeneousCore/CUDAUtilities/interface/HistoContainer.h b/HeterogeneousCore/CUDAUtilities/interface/HistoContainer.h index ad446ad27c1bc..fb346c1406cb5 100644 --- 
a/HeterogeneousCore/CUDAUtilities/interface/HistoContainer.h +++ b/HeterogeneousCore/CUDAUtilities/interface/HistoContainer.h @@ -10,9 +10,6 @@ #endif // __CUDA_ARCH__ #include "HeterogeneousCore/CUDAUtilities/interface/cudastdAlgorithm.h" -#ifdef __CUDACC__ -#include "HeterogeneousCore/CUDAUtilities/interface/prefixScan.h" -#endif #include "HeterogeneousCore/CUDAUtilities/interface/cudaCheck.h" @@ -122,23 +119,6 @@ void forEachInWindow(Hist const & hist, V wmin, V wmax, Func const & func) { } -// same as above but for compactified histos -template -__host__ __device__ -__forceinline__ -void forEachInWindowCompact(Hist const & hist, V wmin, V wmax, Func const & func) { - auto bs = hist.getH().bin(wmin); - auto be = hist.getH().bin(wmax); - assert(be>=bs); - for (auto pj=hist.begin(bs);pj -class CompactHistoContainer { - -public: - - using index_type = typename H::index_type; - - static constexpr auto wsSize() { return std::max(H::spillSize(),32U);} - - __host__ __device__ - __forceinline__ - H & getH() { return histo;} - - __host__ __device__ - __forceinline__ - H const & getH() const { return histo;} - - -#ifdef __CUDACC__ - __device__ - __forceinline__ - void compactify(typename H::Counter * ws) { - auto & h = histo; - // fix size - for (auto j=threadIdx.x; j=0); -// assert(b<=h.begin(i)-h.begin(0)); -// if (i Date: Thu, 20 Sep 2018 13:44:56 +0200 Subject: [PATCH 16/36] use one bin per column --- .../SiPixelClusterizer/plugins/gpuFancyClustering.h | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/RecoLocalTracker/SiPixelClusterizer/plugins/gpuFancyClustering.h b/RecoLocalTracker/SiPixelClusterizer/plugins/gpuFancyClustering.h index 9783b8e1b08be..ab58d7d6154d3 100644 --- a/RecoLocalTracker/SiPixelClusterizer/plugins/gpuFancyClustering.h +++ b/RecoLocalTracker/SiPixelClusterizer/plugins/gpuFancyClustering.h @@ -80,7 +80,7 @@ namespace gpuClustering { //init hist (ymax=416 < 512 : 9bits) constexpr uint32_t maxPixInModule = 4000; - constexpr auto nbins = phase1PixelTopology::numColsInModule/2+2; + constexpr auto nbins = phase1PixelTopology::numColsInModule + 2; //2+2; using Hist = HistoContainer; constexpr auto wss = Hist::totbins(); __shared__ Hist hist; @@ -163,8 +163,8 @@ namespace gpuClustering { #ifdef GPU_DEBUG assert(m!=i); #endif - if (std::abs(int(x[m]) - int(x[i])) > 1 or - std::abs(int(y[m]) - int(y[i])) > 1) return; + if (std::abs(int(x[m]) - int(x[i])) > 1) return; + // if (std::abs(int(y[m]) - int(y[i])) > 1) return; // binssize is 1 auto old = atomicMin(&clusterId[m], clusterId[i]); if (old != clusterId[i]) { // end the loop only if no changes were applied From 25b24c5a5142a5e04886cf18be0556876b46a0f6 Mon Sep 17 00:00:00 2001 From: Vincenzo Innocente Date: Thu, 20 Sep 2018 14:21:23 +0200 Subject: [PATCH 17/36] from done to more, and debug --- .../plugins/gpuFancyClustering.h | 24 +++++++++++++++---- 1 file changed, 20 insertions(+), 4 deletions(-) diff --git a/RecoLocalTracker/SiPixelClusterizer/plugins/gpuFancyClustering.h b/RecoLocalTracker/SiPixelClusterizer/plugins/gpuFancyClustering.h index ab58d7d6154d3..2aa894b1d9f7f 100644 --- a/RecoLocalTracker/SiPixelClusterizer/plugins/gpuFancyClustering.h +++ b/RecoLocalTracker/SiPixelClusterizer/plugins/gpuFancyClustering.h @@ -134,15 +134,22 @@ namespace gpuClustering { for (int k = 0; k < maxiter; ++k) jmax[k] = hist.end(); #endif + +#ifdef GPU_DEBUG + __shared__ int nloops; + nloops=0; +#endif + + __syncthreads(); // for hit filling! 
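    // editor's note on the loop below: every pixel starts with clusterId[i] = i; each
    // pass lets a pixel propagate, via atomicMin, the smallest cluster id seen among
    // pixels in nearby column bins that lie within one pixel in x, and the passes are
    // repeated while __syncthreads_or(more) reports that at least one id still changed.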
// for each pixel, look at all the pixels until the end of the module; // when two valid pixels within +/- 1 in x or y are found, set their id to the minimum; // after the loop, all the pixel in each cluster should have the id equeal to the lowest // pixel in the cluster ( clus[i] == i ). - bool done = false; - while (not __syncthreads_and(done)) { - done = true; + bool more = true; + while (__syncthreads_or(more)) { + more = false; for (int j=threadIdx.x, k = 0; j Date: Thu, 20 Sep 2018 16:12:01 +0200 Subject: [PATCH 18/36] vertex finding is faster now --- .../PixelVertexFinding/src/gpuClusterTracks.h | 66 +++++++++++-------- .../PixelVertexFinding/src/gpuVertexFinder.cu | 4 +- .../PixelVertexFinding/src/gpuVertexFinder.h | 2 +- .../test/gpuVertexFinder_t.cu | 18 +++-- 4 files changed, 53 insertions(+), 37 deletions(-) diff --git a/RecoPixelVertexing/PixelVertexFinding/src/gpuClusterTracks.h b/RecoPixelVertexing/PixelVertexFinding/src/gpuClusterTracks.h index 54121d5babfb5..599ef57c7cd4a 100644 --- a/RecoPixelVertexing/PixelVertexFinding/src/gpuClusterTracks.h +++ b/RecoPixelVertexing/PixelVertexFinding/src/gpuClusterTracks.h @@ -6,7 +6,7 @@ #include #include -#include "HeterogeneousCore/CUDAUtilities/interface/HistoContainer.h" +#include "HeterogeneousCore/CUDAUtilities/interface/FancyHistoContainer.h" #include "HeterogeneousCore/CUDAUtilities/interface/radixSort.h" @@ -14,7 +14,6 @@ namespace gpuVertexFinder { - __global__ void sortByPt2(int nt, OnGPU * pdata @@ -62,6 +61,9 @@ namespace gpuVertexFinder { ) { constexpr bool verbose = false; // in principle the compiler should optmize out if false + + + if(verbose && 0==threadIdx.x) printf("params %d %f\n",minT,eps); auto er2mx = errmax*errmax; @@ -73,41 +75,49 @@ namespace gpuVertexFinder { float * __restrict__ chi2 = data.chi2; uint32_t & nv = *data.nv; - int8_t * __restrict__ izt = data.izt; + uint8_t * __restrict__ izt = data.izt; int32_t * __restrict__ nn = data.nn; int32_t * __restrict__ iv = data.iv; assert(pdata); assert(zt); - __shared__ HistoContainer hist; - - // if(0==threadIdx.x) printf("params %d %f\n",minT,eps); - // if(0==threadIdx.x) printf("booked hist with %d bins, size %d for %d tracks\n",hist.nbins(),hist.binSize(),nt); - - // zero hist - for (auto k = threadIdx.x; k; + constexpr auto wss = Hist::totbins(); + __shared__ Hist hist; + __shared__ typename Hist::Counter ws[wss]; + for (auto j=threadIdx.x; j= 0); + assert(iz-INT8_MIN < 256); + hist.count(izt[i]); iv[i]=i; nn[i]=0; } __syncthreads(); - - // if(0==threadIdx.x) printf("histo filled %d\n",hist.nspills); - if(0==threadIdx.x && hist.fullSpill()) printf("histo overflow\n"); - + hist.finalize(ws); + __syncthreads(); + assert(hist.size()==nt); + if (threadIdx.x<32) ws[threadIdx.x]=0; // used by prefix scan... 
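    // editor's note: the histogram is now filled in two passes. The count() loop above
    // only accumulates per-bin sizes; hist.finalize(ws) turns them into cumulative
    // offsets (the prefix scan that the comment above refers to, with ws as scratch),
    // and the fill() loop below drops each track index into its final slot inside its
    // z bin, again sharing ws as workspace.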
+ __syncthreads(); + for (int i = threadIdx.x; i < nt; i += blockDim.x) { + hist.fill(izt[i],uint16_t(i),ws); + } + __syncthreads(); + + // count neighbours for (int i = threadIdx.x; i < nt; i += blockDim.x) { if (ezt2[i]>er2mx) continue; @@ -124,20 +134,20 @@ namespace gpuVertexFinder { __syncthreads(); - // if(0==threadIdx.x) printf("nn counted\n"); - // cluster seeds only bool more = true; while (__syncthreads_or(more)) { more=false; - for (int i = threadIdx.x; i < nt; i += blockDim.x) { + for (int k = threadIdx.x; k < hist.size(); k += blockDim.x) { + auto p = hist.begin()+k; + auto i = (*p); + auto be = std::min(Hist::bin(izt[i])+1,int(hist.nbins()-1)); if (nn[i]eps) return; + auto dist = std::abs(zt[i]-zt[j]); + if (dist>eps) return; if (dist*dist>chi2max*(ezt2[i]+ezt2[j])) return; auto old = atomicMin(&iv[j], iv[i]); if (old != iv[i]) { @@ -146,8 +156,8 @@ namespace gpuVertexFinder { } atomicMin(&iv[i], old); }; - - forEachInBins(hist,izt[i],1,loop); + ++p; + for (;p>>(ntrks,onGPU_d,minT,eps,errmax,chi2max); + clusterTracks<<<1,1024-256,0,stream>>>(ntrks,onGPU_d,minT,eps,errmax,chi2max); sortByPt2<<<1,256,0,stream>>>(ntrks,onGPU_d); cudaCheck(cudaMemcpyAsync(&gpuProduct.nVertices, onGPU.nv, sizeof(uint32_t), diff --git a/RecoPixelVertexing/PixelVertexFinding/src/gpuVertexFinder.h b/RecoPixelVertexing/PixelVertexFinding/src/gpuVertexFinder.h index bea3db7338a86..ded6759a940bd 100644 --- a/RecoPixelVertexing/PixelVertexFinding/src/gpuVertexFinder.h +++ b/RecoPixelVertexing/PixelVertexFinding/src/gpuVertexFinder.h @@ -27,7 +27,7 @@ namespace gpuVertexFinder { uint16_t * sortInd; // sorted index (by pt2) // workspace - int8_t * izt; // interized z-position of input tracks + uint8_t * izt; // interized z-position of input tracks int32_t * nn; // number of nearest neighbours (reused as number of dof for output vertices) }; diff --git a/RecoPixelVertexing/PixelVertexFinding/test/gpuVertexFinder_t.cu b/RecoPixelVertexing/PixelVertexFinding/test/gpuVertexFinder_t.cu index 840b8947e7220..a92c116702231 100644 --- a/RecoPixelVertexing/PixelVertexFinding/test/gpuVertexFinder_t.cu +++ b/RecoPixelVertexing/PixelVertexFinding/test/gpuVertexFinder_t.cu @@ -91,7 +91,7 @@ int main() { auto ptv2_d = cuda::memory::device::make_unique(current_device, 256); auto ind_d = cuda::memory::device::make_unique(current_device, 256); - auto izt_d = cuda::memory::device::make_unique(current_device, 64000); + auto izt_d = cuda::memory::device::make_unique(current_device, 64000); auto nn_d = cuda::memory::device::make_unique(current_device, 64000); auto iv_d = cuda::memory::device::make_unique(current_device, 64000); @@ -142,33 +142,33 @@ int main() { if ( (i%4) == 0 ) cuda::launch(clusterTracks, - { 1, 1024 }, + { 1, 512+256 }, ev.ztrack.size(), onGPU_d.get(),kk,eps, 0.02f,12.0f ); if ( (i%4) == 1 ) cuda::launch(clusterTracks, - { 1, 1024 }, + { 1, 512+256 }, ev.ztrack.size(), onGPU_d.get(),kk,eps, 0.02f,9.0f ); if ( (i%4) == 2 ) cuda::launch(clusterTracks, - { 1, 1024 }, + { 1, 512+256 }, ev.ztrack.size(), onGPU_d.get(),kk,eps, 0.01f,9.0f ); if ( (i%4) == 3 ) cuda::launch(clusterTracks, - { 1, 1024 }, + { 1, 512+256 }, ev.ztrack.size(), onGPU_d.get(),kk,0.7f*eps, 0.01f,9.0f ); - + cudaDeviceSynchronize(); cuda::launch(sortByPt2, { 1, 256 }, ev.ztrack.size(), onGPU_d.get() @@ -176,6 +176,12 @@ int main() { uint32_t nv; cuda::memory::copy(&nv, onGPU.nv, sizeof(uint32_t)); + + if (nv==0) { + std::cout << "NO VERTICES???" 
<< std::endl; + continue; + } + float zv[nv]; float wv[nv]; float chi2[nv]; From a1085711b63db914a98358f8fb719a46323432b3 Mon Sep 17 00:00:00 2001 From: Vincenzo Innocente Date: Thu, 20 Sep 2018 19:48:15 +0200 Subject: [PATCH 19/36] use new clustering for real! --- .../SiPixelClusterizer/plugins/SiPixelRawToClusterGPUKernel.cu | 2 +- RecoLocalTracker/SiPixelClusterizer/plugins/gpuClustering.h | 2 ++ .../SiPixelClusterizer/plugins/gpuFancyClustering.h | 1 + RecoLocalTracker/SiPixelRecHits/plugins/PixelRecHits.cu | 2 +- 4 files changed, 5 insertions(+), 2 deletions(-) diff --git a/RecoLocalTracker/SiPixelClusterizer/plugins/SiPixelRawToClusterGPUKernel.cu b/RecoLocalTracker/SiPixelClusterizer/plugins/SiPixelRawToClusterGPUKernel.cu index 7bd6eac473cc7..bcceed91f2fc5 100644 --- a/RecoLocalTracker/SiPixelClusterizer/plugins/SiPixelRawToClusterGPUKernel.cu +++ b/RecoLocalTracker/SiPixelClusterizer/plugins/SiPixelRawToClusterGPUKernel.cu @@ -33,7 +33,7 @@ // CMSSW includes #include "HeterogeneousCore/CUDAUtilities/interface/cudaCheck.h" #include "RecoLocalTracker/SiPixelClusterizer/plugins/gpuCalibPixel.h" -#include "RecoLocalTracker/SiPixelClusterizer/plugins/gpuClustering.h" +#include "RecoLocalTracker/SiPixelClusterizer/plugins/gpuFancyClustering.h" #include "RecoLocalTracker/SiPixelClusterizer/plugins/gpuClusterChargeCut.h" #include "RecoLocalTracker/SiPixelClusterizer/interface/SiPixelFedCablingMapGPU.h" diff --git a/RecoLocalTracker/SiPixelClusterizer/plugins/gpuClustering.h b/RecoLocalTracker/SiPixelClusterizer/plugins/gpuClustering.h index 3cc1f97efed35..17d2fefc0ec93 100644 --- a/RecoLocalTracker/SiPixelClusterizer/plugins/gpuClustering.h +++ b/RecoLocalTracker/SiPixelClusterizer/plugins/gpuClustering.h @@ -1,6 +1,8 @@ #ifndef RecoLocalTracker_SiPixelClusterizer_plugins_gpuClustering_h #define RecoLocalTracker_SiPixelClusterizer_plugins_gpuClustering_h +#error "old version" + #include #include #include diff --git a/RecoLocalTracker/SiPixelClusterizer/plugins/gpuFancyClustering.h b/RecoLocalTracker/SiPixelClusterizer/plugins/gpuFancyClustering.h index 2aa894b1d9f7f..6475d6422557c 100644 --- a/RecoLocalTracker/SiPixelClusterizer/plugins/gpuFancyClustering.h +++ b/RecoLocalTracker/SiPixelClusterizer/plugins/gpuFancyClustering.h @@ -13,6 +13,7 @@ #include "HeterogeneousCore/CUDAUtilities/interface/FancyHistoContainer.h" + namespace gpuClustering { __global__ void countModules(uint16_t const * __restrict__ id, diff --git a/RecoLocalTracker/SiPixelRecHits/plugins/PixelRecHits.cu b/RecoLocalTracker/SiPixelRecHits/plugins/PixelRecHits.cu index bc376536dee12..d442fb8195562 100644 --- a/RecoLocalTracker/SiPixelRecHits/plugins/PixelRecHits.cu +++ b/RecoLocalTracker/SiPixelRecHits/plugins/PixelRecHits.cu @@ -8,7 +8,7 @@ // CMSSW headers #include "RecoLocalTracker/SiPixelClusterizer/plugins/SiPixelRawToClusterGPUKernel.h" #include "HeterogeneousCore/CUDAUtilities/interface/cudaCheck.h" -#include "RecoLocalTracker/SiPixelClusterizer/plugins/gpuClustering.h" +#include "RecoLocalTracker/SiPixelClusterizer/plugins/gpuClusteringConstants.h" #include "PixelRecHits.h" #include "gpuPixelRecHits.h" From c711b5dfc2c0d2221e253561e61f9c2f24d1a873 Mon Sep 17 00:00:00 2001 From: Vincenzo Innocente Date: Fri, 21 Sep 2018 11:00:00 +0200 Subject: [PATCH 20/36] test cub as well --- .../CUDAUtilities/test/BuildFile.xml | 5 +- .../CUDAUtilities/test/prefixScan_t.cu | 49 +++++++++++++++++++ 2 files changed, 53 insertions(+), 1 deletion(-) diff --git a/HeterogeneousCore/CUDAUtilities/test/BuildFile.xml 
b/HeterogeneousCore/CUDAUtilities/test/BuildFile.xml index f5a3dd3ae8207..db708e3328b4b 100644 --- a/HeterogeneousCore/CUDAUtilities/test/BuildFile.xml +++ b/HeterogeneousCore/CUDAUtilities/test/BuildFile.xml @@ -36,4 +36,7 @@ - + + + + diff --git a/HeterogeneousCore/CUDAUtilities/test/prefixScan_t.cu b/HeterogeneousCore/CUDAUtilities/test/prefixScan_t.cu index e109f6672bc32..240ca684121f0 100644 --- a/HeterogeneousCore/CUDAUtilities/test/prefixScan_t.cu +++ b/HeterogeneousCore/CUDAUtilities/test/prefixScan_t.cu @@ -19,6 +19,23 @@ void testPrefixScan(uint32_t size) { } } +#include + + +__global__ +void init(uint32_t * v, uint32_t val, uint32_t n) { + auto i = blockIdx.x * blockDim.x + threadIdx.x; + if(i>>(d_in, 1, num_items); + + // Determine temporary device storage requirements for inclusive prefix sum + void *d_temp_storage = nullptr; + size_t temp_storage_bytes = 0; + cub::DeviceScan::InclusiveSum(d_temp_storage, temp_storage_bytes, d_in, d_out, num_items); + + std::cout << "temp storage " << temp_storage_bytes << std::endl; + + // Allocate temporary storage for inclusive prefix sum + cudaMalloc(&d_temp_storage, temp_storage_bytes); + // Run inclusive prefix sum + CubDebugExit(cub::DeviceScan::InclusiveSum(d_temp_storage, temp_storage_bytes, d_in, d_out, num_items)); + + verify<<>>(d_out, num_items); + + cudaDeviceSynchronize(); + return 0; } From 83879cd899983970bc0fa9905c1f93c1f57da1c0 Mon Sep 17 00:00:00 2001 From: Vincenzo Innocente Date: Fri, 21 Sep 2018 14:27:54 +0200 Subject: [PATCH 21/36] works in c++ --- .../interface/FancyHistoContainer.h | 43 +++++++++++++++---- .../CUDAUtilities/test/HistoContainer_t.cpp | 25 +++++++---- 2 files changed, 52 insertions(+), 16 deletions(-) diff --git a/HeterogeneousCore/CUDAUtilities/interface/FancyHistoContainer.h b/HeterogeneousCore/CUDAUtilities/interface/FancyHistoContainer.h index f5e669ca3d7ef..d050240a6b810 100644 --- a/HeterogeneousCore/CUDAUtilities/interface/FancyHistoContainer.h +++ b/HeterogeneousCore/CUDAUtilities/interface/FancyHistoContainer.h @@ -69,7 +69,8 @@ template< uint32_t NBINS, // number of bins uint32_t SIZE, // max number of element uint32_t S=sizeof(T) * 8, // number of significant bits in T - typename I=uint32_t // type stored in the container (usually an index in a vector of the input values) + typename I=uint32_t, // type stored in the container (usually an index in a vector of the input values) + uint32_t NHISTS=1 // number of histos stored > class HistoContainer { public: @@ -98,10 +99,13 @@ class HistoContainer { static constexpr uint32_t sizeT() { return S; } static constexpr uint32_t nbins() { return NBINS;} - static constexpr uint32_t totbins() { return NBINS+1;} + static constexpr uint32_t nhists() { return NHISTS;} + static constexpr uint32_t totbins() { return NHISTS*NBINS+1;} static constexpr uint32_t nbits() { return ilog2(NBINS-1)+1;} static constexpr uint32_t capacity() { return SIZE; } + static constexpr auto histOff(uint32_t nh) { return NBINS*nh; } + static constexpr UT bin(T t) { constexpr uint32_t shift = sizeT() - nbits(); constexpr uint32_t mask = (1<; - std::cout << "HistoContainer " << Hist::nbits() << ' ' << Hist::nbins() << ' ' << Hist::capacity() << ' ' << (rmax-rmin)/Hist::nbins() << std::endl; + using Hist4 = HistoContainer; + std::cout << "HistoContainer " << Hist::nbits() << ' ' << Hist::nbins() << ' ' << Hist::totbins() << ' ' << Hist::capacity() << ' ' << (rmax-rmin)/Hist::nbins() << std::endl; std::cout << "bins " << int(Hist::bin(0)) << ' ' << int(Hist::bin(rmin)) << ' ' << 
int(Hist::bin(rmax)) << std::endl; + std::cout << "HistoContainer4 " << Hist4::nbits() << ' ' << Hist4::nbins() << ' ' << Hist4::totbins() << ' ' << Hist4::capacity() << ' ' << (rmax-rmin)/Hist::nbins() << std::endl; + for (auto nh=0; nh<4; ++nh) std::cout << "bins " << int(Hist4::bin(0))+Hist4::histOff(nh) << ' ' << int(Hist::bin(rmin))+Hist4::histOff(nh) << ' ' << int(Hist::bin(rmax))+Hist4::histOff(nh) << std::endl; + Hist h; + Hist4 h4; typename Hist::Counter ws[Hist::totbins()]; + typename Hist4::Counter ws4[Hist4::totbins()]; for (int it=0; it<5; ++it) { for (long long j = 0; j < N; j++) v[j]=rgen(eng); if (it==2) for (long long j = N/2; j < N/2+N/4; j++) v[j]=4; - h.zero(); - assert(h.size()==0); - for (auto & i: ws) i=0; - for (long long j = 0; j < N; j++) h.count(v[j]); - h.finalize(); + h.zero();h4.zero(); + assert(h.size()==0);assert(h4.size()==0); + for (auto & i: ws) i=0; + for (auto & i: ws4) i=0; + for (long long j = 0; j < N; j++) { h.count(v[j]); if(j<2000) h4.count(v[j],2); else h4.count(v[j],j%4); } + h.finalize(); h4.finalize(); assert(h.off[0]==0); assert(h.size()==N); - for (long long j = 0; j < N; j++) h.fill(v[j],j,ws); + assert(h4.off[0]==0); + assert(h4.size()==N); + for (long long j = 0; j < N; j++) { h.fill(v[j],j,ws); if(j<2000) h4.fill(v[j],j,ws4,2); else h4.fill(v[j],j,ws4,j%4); } assert(h.size()==N); - + assert(h4.size()==N); auto verify = [&](uint32_t i, uint32_t j, uint32_t k, uint32_t t1, uint32_t t2) { assert(t1 Date: Fri, 21 Sep 2018 17:46:46 +0200 Subject: [PATCH 22/36] works --- .../interface/FancyHistoContainer.h | 64 ++++++++++++++++--- .../CUDAUtilities/test/BuildFile.xml | 3 + .../CUDAUtilities/test/HistoContainer_t.cu | 55 +++++++++------- .../CUDAUtilities/test/prefixScan_t.cu | 6 +- 4 files changed, 94 insertions(+), 34 deletions(-) diff --git a/HeterogeneousCore/CUDAUtilities/interface/FancyHistoContainer.h b/HeterogeneousCore/CUDAUtilities/interface/FancyHistoContainer.h index d050240a6b810..e21d03a4d86e3 100644 --- a/HeterogeneousCore/CUDAUtilities/interface/FancyHistoContainer.h +++ b/HeterogeneousCore/CUDAUtilities/interface/FancyHistoContainer.h @@ -2,6 +2,7 @@ #define HeterogeneousCore_CUDAUtilities_HistoContainer_h #include +#include #include #include #include @@ -12,6 +13,7 @@ #include "HeterogeneousCore/CUDAUtilities/interface/cudastdAlgorithm.h" #ifdef __CUDACC__ #include "HeterogeneousCore/CUDAUtilities/interface/prefixScan.h" +#include #endif #include "HeterogeneousCore/CUDAUtilities/interface/cudaCheck.h" @@ -19,18 +21,50 @@ #ifdef __CUDACC__ namespace cudautils { - template + template __global__ - void zeroMany(Histo * h, uint32_t nh) { - auto i = blockIdx.x * blockDim.x + threadIdx.x; - auto ih = i / Histo::totbins(); - auto k = i - ih * Histo::totbins(); - if (ih < nh) { - if (k < Histo::totbins()) - h[ih].n[k] = 0; - } + void countFromVector(Histo * __restrict__ h, uint32_t nh, T const * __restrict__ v, uint32_t const * __restrict__ offsets) { + auto i = blockIdx.x * blockDim.x + threadIdx.x; + if(i >= offsets[nh]) return; + auto off = cuda_std::upper_bound(offsets, offsets + nh + 1, i); + assert((*off) > 0); + int32_t ih = off - offsets - 1; + assert(ih >= 0); + assert(ih < nh); + (*h).count(v[i], ih); + } + + template + __global__ + void fillFromVector(Histo * __restrict__ h, uint32_t nh, T const * __restrict__ v, uint32_t const * __restrict__ offsets, + uint32_t * __restrict__ ws ) { + auto i = blockIdx.x * blockDim.x + threadIdx.x; + if(i >= offsets[nh]) return; + auto off = cuda_std::upper_bound(offsets, offsets + nh + 1, 
i); + assert((*off) > 0); + int32_t ih = off - offsets - 1; + assert(ih >= 0); + assert(ih < nh); + (*h).fill(v[i], i, ws, ih); + } + + + template + void fillManyFromVector(Histo * __restrict__ h, typename Histo::Counter * __restrict__ ws, + uint32_t nh, T const * __restrict__ v, uint32_t const * __restrict__ offsets, uint32_t totSize, + int nthreads, cudaStream_t stream) { + uint32_t * off = (uint32_t *)( (char*)(h) +offsetof(Histo,off)); + cudaMemsetAsync(off,0, 4*Histo::totbins(),stream); + auto nblocks = (totSize + nthreads - 1) / nthreads; + countFromVector<<>>(h, nh, v, offsets); + size_t wss = Histo::totbins(); + CubDebugExit(cub::DeviceScan::InclusiveSum(ws, wss, off, off, Histo::totbins(), stream)); + cudaMemsetAsync(ws,0, 4*Histo::totbins(),stream); + fillFromVector<<>>(h, nh, v, offsets,ws); + cudaCheck(cudaGetLastError()); } + } // namespace cudautils #endif @@ -106,6 +140,18 @@ class HistoContainer { static constexpr auto histOff(uint32_t nh) { return NBINS*nh; } +#ifdef __CUDACC__ + __host__ + static size_t wsSize() { + uint32_t * v =nullptr; + void * d_temp_storage = nullptr; + size_t temp_storage_bytes = 0; + cub::DeviceScan::InclusiveSum(d_temp_storage, temp_storage_bytes, v, v, totbins()-1); + return std::max(temp_storage_bytes,size_t(totbins())); + } +#endif + + static constexpr UT bin(T t) { constexpr uint32_t shift = sizeT() - nbits(); constexpr uint32_t mask = (1< + + + diff --git a/HeterogeneousCore/CUDAUtilities/test/HistoContainer_t.cu b/HeterogeneousCore/CUDAUtilities/test/HistoContainer_t.cu index 2b6549361f182..d6dec63144f58 100644 --- a/HeterogeneousCore/CUDAUtilities/test/HistoContainer_t.cu +++ b/HeterogeneousCore/CUDAUtilities/test/HistoContainer_t.cu @@ -1,4 +1,4 @@ -#include "HeterogeneousCore/CUDAUtilities/interface/HistoContainer.h" +#include "HeterogeneousCore/CUDAUtilities/interface/FancyHistoContainer.h" #include #include @@ -34,12 +34,16 @@ void go() { constexpr uint32_t partSize = N/nParts; uint32_t offsets[nParts+1]; - using Hist = HistoContainer; - std::cout << "HistoContainer " << Hist::nbins() << ' ' << Hist::binSize() << ' ' << (std::numeric_limits::max()-std::numeric_limits::min())/Hist::nbins() << std::endl; + using Hist = HistoContainer; + std::cout << "HistoContainer " << (int)(offsetof(Hist,off)) << ' ' + << Hist::nbins() << ' ' << Hist::totbins() << ' ' << Hist::capacity() << ' ' << Hist::wsSize() << ' ' + << (std::numeric_limits::max()-std::numeric_limits::min())/Hist::nbins() << std::endl; - Hist h[nParts]; + Hist h; + + auto h_d = cuda::memory::device::make_unique(current_device, 1); + auto ws_d = cuda::memory::device::make_unique(current_device, Hist::totbins()); - auto h_d = cuda::memory::device::make_unique(current_device, nParts); auto off_d = cuda::memory::device::make_unique(current_device, nParts+1); @@ -62,18 +66,20 @@ void go() { for (long long j = 0; j < N; j++) v[j]=rgen(eng); - if (it==2) { // spill + if (it==2) { // big bin for (long long j = 1000; j < 2000; j++) v[j]= sizeof(T)==1 ? 
22 : 3456; } cuda::memory::copy(v_d.get(), v, N*sizeof(T)); - cudautils::fillManyFromVector(h_d.get(),nParts,v_d.get(),off_d.get(),offsets[10],256,0); - - cuda::memory::copy(&h, h_d.get(), nParts*sizeof(Hist)); + cudautils::fillManyFromVector(h_d.get(),ws_d.get(),nParts,v_d.get(),off_d.get(),offsets[10],256,0); - + cuda::memory::copy(&h, h_d.get(), sizeof(Hist)); + assert(0==h.off[0]); + assert(offsets[10]==h.size()); + + auto verify = [&](uint32_t i, uint32_t k, uint32_t t1, uint32_t t2) { assert(t1 window ) {} else {++tot;} } if (kk==i) { l=false; continue; } - if (l) for (auto p=h[j].begin(kk); p=nm)) { - std::cout << "too bad " << j << ' ' << i <<' ' << me << '/'<< T(me-window)<< '/'<< T(me+window) << ": " << kl << '/' << kh << ' '<< khh << ' '<< tot<<'/'<=nm)) { + std::cout << "too bad " << j << ' ' << i <<' ' << int(me) << '/'<< (int)T(me-window)<< '/'<< (int)T(me+window) << ": " << kl << '/' << kh << ' '<< khh << ' '<< tot<<'/'<>>(d_out, num_items); From b8df4f849819c29f523c4d3c72e86cffc4edf525 Mon Sep 17 00:00:00 2001 From: Vincenzo Innocente Date: Fri, 21 Sep 2018 17:49:03 +0200 Subject: [PATCH 23/36] old histo is gone --- .../interface/FancyHistoContainer.h | 245 ------------------ .../CUDAUtilities/interface/HistoContainer.h | 213 +++++++-------- .../CUDAUtilities/test/HistoContainer_t.cpp | 2 +- .../CUDAUtilities/test/HistoContainer_t.cu | 2 +- .../CUDAUtilities/test/OneHistoContainer_t.cu | 2 +- 5 files changed, 113 insertions(+), 351 deletions(-) delete mode 100644 HeterogeneousCore/CUDAUtilities/interface/FancyHistoContainer.h diff --git a/HeterogeneousCore/CUDAUtilities/interface/FancyHistoContainer.h b/HeterogeneousCore/CUDAUtilities/interface/FancyHistoContainer.h deleted file mode 100644 index e21d03a4d86e3..0000000000000 --- a/HeterogeneousCore/CUDAUtilities/interface/FancyHistoContainer.h +++ /dev/null @@ -1,245 +0,0 @@ -#ifndef HeterogeneousCore_CUDAUtilities_HistoContainer_h -#define HeterogeneousCore_CUDAUtilities_HistoContainer_h - -#include -#include -#include -#include -#include -#ifndef __CUDA_ARCH__ -#include -#endif // __CUDA_ARCH__ - -#include "HeterogeneousCore/CUDAUtilities/interface/cudastdAlgorithm.h" -#ifdef __CUDACC__ -#include "HeterogeneousCore/CUDAUtilities/interface/prefixScan.h" -#include -#endif -#include "HeterogeneousCore/CUDAUtilities/interface/cudaCheck.h" - - -#ifdef __CUDACC__ -namespace cudautils { - - template - __global__ - void countFromVector(Histo * __restrict__ h, uint32_t nh, T const * __restrict__ v, uint32_t const * __restrict__ offsets) { - auto i = blockIdx.x * blockDim.x + threadIdx.x; - if(i >= offsets[nh]) return; - auto off = cuda_std::upper_bound(offsets, offsets + nh + 1, i); - assert((*off) > 0); - int32_t ih = off - offsets - 1; - assert(ih >= 0); - assert(ih < nh); - (*h).count(v[i], ih); - } - - template - __global__ - void fillFromVector(Histo * __restrict__ h, uint32_t nh, T const * __restrict__ v, uint32_t const * __restrict__ offsets, - uint32_t * __restrict__ ws ) { - auto i = blockIdx.x * blockDim.x + threadIdx.x; - if(i >= offsets[nh]) return; - auto off = cuda_std::upper_bound(offsets, offsets + nh + 1, i); - assert((*off) > 0); - int32_t ih = off - offsets - 1; - assert(ih >= 0); - assert(ih < nh); - (*h).fill(v[i], i, ws, ih); - } - - - template - void fillManyFromVector(Histo * __restrict__ h, typename Histo::Counter * __restrict__ ws, - uint32_t nh, T const * __restrict__ v, uint32_t const * __restrict__ offsets, uint32_t totSize, - int nthreads, cudaStream_t stream) { - uint32_t * off = (uint32_t *)( 
(char*)(h) +offsetof(Histo,off)); - cudaMemsetAsync(off,0, 4*Histo::totbins(),stream); - auto nblocks = (totSize + nthreads - 1) / nthreads; - countFromVector<<>>(h, nh, v, offsets); - size_t wss = Histo::totbins(); - CubDebugExit(cub::DeviceScan::InclusiveSum(ws, wss, off, off, Histo::totbins(), stream)); - cudaMemsetAsync(ws,0, 4*Histo::totbins(),stream); - fillFromVector<<>>(h, nh, v, offsets,ws); - cudaCheck(cudaGetLastError()); - } - - -} // namespace cudautils -#endif - - -// iteratate over N bins left and right of the one containing "v" -template -__host__ __device__ -__forceinline__ -void forEachInBins(Hist const & hist, V value, int n, Func func) { - int bs = Hist::bin(value); - int be = std::min(int(Hist::nbins()-1),bs+n); - bs = std::max(0,bs-n); - assert(be>=bs); - for (auto pj=hist.begin(bs);pj -__host__ __device__ -__forceinline__ -void forEachInWindow(Hist const & hist, V wmin, V wmax, Func const & func) { - auto bs = Hist::bin(wmin); - auto be = Hist::bin(wmax); - assert(be>=bs); - for (auto pj=hist.begin(bs);pj -class HistoContainer { -public: -#ifdef __CUDACC__ - using Counter = uint32_t; -#else - using Counter = std::atomic; -#endif - - using index_type = I; - using UT = typename std::make_unsigned::type; - - static constexpr uint32_t ilog2(uint32_t v) { - - constexpr uint32_t b[] = {0x2, 0xC, 0xF0, 0xFF00, 0xFFFF0000}; - constexpr uint32_t s[] = {1, 2, 4, 8, 16}; - - uint32_t r = 0; // result of log2(v) will go here - for (auto i = 4; i >= 0; i--) if (v & b[i]) { - v >>= s[i]; - r |= s[i]; - } - return r; - } - - - static constexpr uint32_t sizeT() { return S; } - static constexpr uint32_t nbins() { return NBINS;} - static constexpr uint32_t nhists() { return NHISTS;} - static constexpr uint32_t totbins() { return NHISTS*NBINS+1;} - static constexpr uint32_t nbits() { return ilog2(NBINS-1)+1;} - static constexpr uint32_t capacity() { return SIZE; } - - static constexpr auto histOff(uint32_t nh) { return NBINS*nh; } - -#ifdef __CUDACC__ - __host__ - static size_t wsSize() { - uint32_t * v =nullptr; - void * d_temp_storage = nullptr; - size_t temp_storage_bytes = 0; - cub::DeviceScan::InclusiveSum(d_temp_storage, temp_storage_bytes, v, v, totbins()-1); - return std::max(temp_storage_bytes,size_t(totbins())); - } -#endif - - - static constexpr UT bin(T t) { - constexpr uint32_t shift = sizeT() - nbits(); - constexpr uint32_t mask = (1<> shift) & mask; - } - - void zero() { - for (auto & i : off) - i = 0; - } - - static __host__ __device__ - __forceinline__ - uint32_t atomicIncrement(Counter & x) { - #ifdef __CUDA_ARCH__ - return atomicAdd(&x, 1); - #else - return x++; - #endif - } - - __host__ __device__ - __forceinline__ - void count(T t) { - uint32_t b = bin(t); - assert(b +#include #include #include #include @@ -10,121 +11,100 @@ #endif // __CUDA_ARCH__ #include "HeterogeneousCore/CUDAUtilities/interface/cudastdAlgorithm.h" +#ifdef __CUDACC__ +#include "HeterogeneousCore/CUDAUtilities/interface/prefixScan.h" +#include +#endif #include "HeterogeneousCore/CUDAUtilities/interface/cudaCheck.h" #ifdef __CUDACC__ namespace cudautils { - template - __global__ - void zeroMany(Histo * h, uint32_t nh) { - auto i = blockIdx.x * blockDim.x + threadIdx.x; - auto ih = i / Histo::totbins(); - auto k = i - ih * Histo::totbins(); - if (ih < nh) { - if (k < Histo::totbins()) - h[ih].n[k] = 0; - } - } - template __global__ - void fillFromVector(Histo * h, uint32_t nh, T const * __restrict__ v, uint32_t * offsets) { + void countFromVector(Histo * __restrict__ h, uint32_t nh, T const * 
__restrict__ v, uint32_t const * __restrict__ offsets) { auto i = blockIdx.x * blockDim.x + threadIdx.x; if(i >= offsets[nh]) return; auto off = cuda_std::upper_bound(offsets, offsets + nh + 1, i); assert((*off) > 0); int32_t ih = off - offsets - 1; assert(ih >= 0); - assert(ih < nh); - h[ih].fill(v[i], i); + assert(ih < nh); + (*h).count(v[i], ih); } template __global__ - void fillFromVector(Histo * h, T const * __restrict__ v, uint32_t size) { + void fillFromVector(Histo * __restrict__ h, uint32_t nh, T const * __restrict__ v, uint32_t const * __restrict__ offsets, + uint32_t * __restrict__ ws ) { auto i = blockIdx.x * blockDim.x + threadIdx.x; - if(i < size) h->fill(v[i], i); - } - - template - void zero(Histo * h, uint32_t nh, int nthreads, cudaStream_t stream) { - auto nblocks = (nh * Histo::totbins() + nthreads - 1) / nthreads; - zeroMany<<>>(h, nh); - cudaCheck(cudaGetLastError()); + if(i >= offsets[nh]) return; + auto off = cuda_std::upper_bound(offsets, offsets + nh + 1, i); + assert((*off) > 0); + int32_t ih = off - offsets - 1; + assert(ih >= 0); + assert(ih < nh); + (*h).fill(v[i], i, ws, ih); } - template - void fillOneFromVector(Histo * h, T const * __restrict__ v, uint32_t size, int nthreads, cudaStream_t stream) { - zero(h, 1, nthreads, stream); - auto nblocks = (size + nthreads - 1) / nthreads; - fillFromVector<<>>(h, v, size); - cudaCheck(cudaGetLastError()); - } template - void fillManyFromVector(Histo * h, uint32_t nh, T const * __restrict__ v, uint32_t * offsets, uint32_t totSize, int nthreads, cudaStream_t stream) { - zero(h, nh, nthreads, stream); + void fillManyFromVector(Histo * __restrict__ h, typename Histo::Counter * __restrict__ ws, + uint32_t nh, T const * __restrict__ v, uint32_t const * __restrict__ offsets, uint32_t totSize, + int nthreads, cudaStream_t stream) { + uint32_t * off = (uint32_t *)( (char*)(h) +offsetof(Histo,off)); + cudaMemsetAsync(off,0, 4*Histo::totbins(),stream); auto nblocks = (totSize + nthreads - 1) / nthreads; - fillFromVector<<>>(h, nh, v, offsets); + countFromVector<<>>(h, nh, v, offsets); + size_t wss = Histo::totbins(); + CubDebugExit(cub::DeviceScan::InclusiveSum(ws, wss, off, off, Histo::totbins(), stream)); + cudaMemsetAsync(ws,0, 4*Histo::totbins(),stream); + fillFromVector<<>>(h, nh, v, offsets,ws); cudaCheck(cudaGetLastError()); } + } // namespace cudautils #endif // iteratate over N bins left and right of the one containing "v" -// including spillBin template __host__ __device__ __forceinline__ void forEachInBins(Hist const & hist, V value, int n, Func func) { - int bs = hist.bin(value); - int be = std::min(int(hist.nbins()),bs+n+1); + int bs = Hist::bin(value); + int be = std::min(int(Hist::nbins()-1),bs+n); bs = std::max(0,bs-n); - // assert(be>bs); -// bool tbc=false; - for (auto b=bs; b=bs); + for (auto pj=hist.begin(bs);pj __host__ __device__ __forceinline__ void forEachInWindow(Hist const & hist, V wmin, V wmax, Func const & func) { - auto bs = hist.bin(wmin); - auto be = hist.bin(wmax); - // be = std::min(int(hist.nbins()),be+1); - // bs = std::max(0,bs); - // assert(be>=bs); -// bool tbc=false; - for (auto b=bs; b<=be; ++b){ -// tbc |= hist.full(b); - for (auto pj=hist.begin(b);pj=bs); + for (auto pj=hist.begin(bs);pj class HistoContainer { public: @@ -153,10 +133,24 @@ class HistoContainer { static constexpr uint32_t sizeT() { return S; } static constexpr uint32_t nbins() { return NBINS;} - static constexpr uint32_t totbins() { return NBINS+1;} // including spillbin + static constexpr uint32_t nhists() { return 
NHISTS;} + static constexpr uint32_t totbins() { return NHISTS*NBINS+1;} static constexpr uint32_t nbits() { return ilog2(NBINS-1)+1;} - static constexpr uint32_t binSize() { return 1 << M; } - static constexpr uint32_t spillSize() { return 16 * binSize(); } + static constexpr uint32_t capacity() { return SIZE; } + + static constexpr auto histOff(uint32_t nh) { return NBINS*nh; } + +#ifdef __CUDACC__ + __host__ + static size_t wsSize() { + uint32_t * v =nullptr; + void * d_temp_storage = nullptr; + size_t temp_storage_bytes = 0; + cub::DeviceScan::InclusiveSum(d_temp_storage, temp_storage_bytes, v, v, totbins()-1); + return std::max(temp_storage_bytes,size_t(totbins())); + } +#endif + static constexpr UT bin(T t) { constexpr uint32_t shift = sizeT() - nbits(); @@ -165,7 +159,7 @@ class HistoContainer { } void zero() { - for (auto & i : n) + for (auto & i : off) i = 0; } @@ -181,58 +175,71 @@ class HistoContainer { __host__ __device__ __forceinline__ - void fill(T t, index_type j) { - UT b = bin(t); + void count(T t) { + uint32_t b = bin(t); assert(b spillSize(); - } - constexpr bool full(uint32_t b) const { - return n[b] > binSize(); + __host__ __device__ + __forceinline__ + void count(T t, uint32_t nh) { + uint32_t b = bin(t); + assert(b #include diff --git a/HeterogeneousCore/CUDAUtilities/test/HistoContainer_t.cu b/HeterogeneousCore/CUDAUtilities/test/HistoContainer_t.cu index d6dec63144f58..8d640b728e25c 100644 --- a/HeterogeneousCore/CUDAUtilities/test/HistoContainer_t.cu +++ b/HeterogeneousCore/CUDAUtilities/test/HistoContainer_t.cu @@ -1,4 +1,4 @@ -#include "HeterogeneousCore/CUDAUtilities/interface/FancyHistoContainer.h" +#include "HeterogeneousCore/CUDAUtilities/interface/HistoContainer.h" #include #include diff --git a/HeterogeneousCore/CUDAUtilities/test/OneHistoContainer_t.cu b/HeterogeneousCore/CUDAUtilities/test/OneHistoContainer_t.cu index 9d520929a20c0..c3682578163c8 100644 --- a/HeterogeneousCore/CUDAUtilities/test/OneHistoContainer_t.cu +++ b/HeterogeneousCore/CUDAUtilities/test/OneHistoContainer_t.cu @@ -1,4 +1,4 @@ -#include "HeterogeneousCore/CUDAUtilities/interface/FancyHistoContainer.h" +#include "HeterogeneousCore/CUDAUtilities/interface/HistoContainer.h" #include #include From 1bb3bdad60b5ed20ddef5f895f139d44485fb53d Mon Sep 17 00:00:00 2001 From: Vincenzo Innocente Date: Fri, 21 Sep 2018 19:13:09 +0200 Subject: [PATCH 24/36] wip --- .../test/NaiveHistoContainer_t.cpp | 97 ------- .../plugins/SiPixelRawToClusterGPUKernel.cu | 2 +- .../plugins/gpuClustering.h | 142 ++++++---- .../plugins/gpuFancyClustering.h | 251 ------------------ .../SiPixelClusterizer/test/gpuClustering.cu | 2 +- .../SiPixelRecHits/plugins/PixelRecHits.cu | 6 +- .../siPixelRecHitsHeterogeneousProduct.h | 4 +- .../PixelVertexFinding/src/gpuClusterTracks.h | 2 +- 8 files changed, 101 insertions(+), 405 deletions(-) delete mode 100644 HeterogeneousCore/CUDAUtilities/test/NaiveHistoContainer_t.cpp delete mode 100644 RecoLocalTracker/SiPixelClusterizer/plugins/gpuFancyClustering.h diff --git a/HeterogeneousCore/CUDAUtilities/test/NaiveHistoContainer_t.cpp b/HeterogeneousCore/CUDAUtilities/test/NaiveHistoContainer_t.cpp deleted file mode 100644 index 190a4dc18e3fa..0000000000000 --- a/HeterogeneousCore/CUDAUtilities/test/NaiveHistoContainer_t.cpp +++ /dev/null @@ -1,97 +0,0 @@ -#include "HeterogeneousCore/CUDAUtilities/interface/HistoContainer.h" - -#include -#include -#include -#include -#include - -template -void go() { - std::mt19937 eng; - - int rmin=std::numeric_limits::min(); - int 
rmax=std::numeric_limits::max(); - if (NBINS!=128) { - rmin=0; - rmax=NBINS*2-1; - } - - - - std::uniform_int_distribution rgen(rmin,rmax); - - - constexpr int N=12000; - T v[N]; - - using Hist = HistoContainer; - std::cout << "HistoContainer " << Hist::nbits() << ' ' << Hist::nbins() << ' ' << Hist::binSize() << ' ' << (rmax-rmin)/Hist::nbins() << std::endl; - std::cout << "bins " << int(Hist::bin(0)) << ' ' << int(Hist::bin(rmin)) << ' ' << int(Hist::bin(rmax)) << std::endl; - - Hist h; - for (int it=0; it<5; ++it) { - for (long long j = 0; j < N; j++) v[j]=rgen(eng); - if (it==2) for (long long j = N/2; j < N/2+2*Hist::binSize(); j++) v[j]=4; - h.zero(); - for (long long j = 0; j < N; j++) h.fill(v[j],j); - - std::cout << "nspills " << h.nspills() << std::endl; - - auto verify = [&](uint32_t i, uint32_t j, uint32_t k, uint32_t t1, uint32_t t2) { - assert(t1=i); } - // std::cout << kl << ' ' << kh << std::endl; - for (auto j=h.begin(kl); j=0 && k=0) rtot += h.end(bm)-h.begin(bm); - assert(tot==rtot); - w=2; tot=0; - forEachInBins(h,v[j],w,ftest); - bp++; - bm--; - if (bp=0) rtot += h.end(bm)-h.begin(bm); - assert(tot==rtot); - } - -} - -int main() { - go(); - go(); - go(); - - - return 0; -} diff --git a/RecoLocalTracker/SiPixelClusterizer/plugins/SiPixelRawToClusterGPUKernel.cu b/RecoLocalTracker/SiPixelClusterizer/plugins/SiPixelRawToClusterGPUKernel.cu index bcceed91f2fc5..7bd6eac473cc7 100644 --- a/RecoLocalTracker/SiPixelClusterizer/plugins/SiPixelRawToClusterGPUKernel.cu +++ b/RecoLocalTracker/SiPixelClusterizer/plugins/SiPixelRawToClusterGPUKernel.cu @@ -33,7 +33,7 @@ // CMSSW includes #include "HeterogeneousCore/CUDAUtilities/interface/cudaCheck.h" #include "RecoLocalTracker/SiPixelClusterizer/plugins/gpuCalibPixel.h" -#include "RecoLocalTracker/SiPixelClusterizer/plugins/gpuFancyClustering.h" +#include "RecoLocalTracker/SiPixelClusterizer/plugins/gpuClustering.h" #include "RecoLocalTracker/SiPixelClusterizer/plugins/gpuClusterChargeCut.h" #include "RecoLocalTracker/SiPixelClusterizer/interface/SiPixelFedCablingMapGPU.h" diff --git a/RecoLocalTracker/SiPixelClusterizer/plugins/gpuClustering.h b/RecoLocalTracker/SiPixelClusterizer/plugins/gpuClustering.h index 17d2fefc0ec93..163378c6adbf0 100644 --- a/RecoLocalTracker/SiPixelClusterizer/plugins/gpuClustering.h +++ b/RecoLocalTracker/SiPixelClusterizer/plugins/gpuClustering.h @@ -1,7 +1,8 @@ #ifndef RecoLocalTracker_SiPixelClusterizer_plugins_gpuClustering_h #define RecoLocalTracker_SiPixelClusterizer_plugins_gpuClustering_h -#error "old version" + +// #define CLUS_LIMIT_LOOP #include #include @@ -12,6 +13,7 @@ #include "HeterogeneousCore/CUDAUtilities/interface/HistoContainer.h" + namespace gpuClustering { __global__ void countModules(uint16_t const * __restrict__ id, @@ -35,7 +37,9 @@ namespace gpuClustering { } } - __global__ void findClus(uint16_t const * __restrict__ id, // module id of each pixel + __global__ +// __launch_bounds__(256,4) + void findClus(uint16_t const * __restrict__ id, // module id of each pixel uint16_t const * __restrict__ x, // local coordinates of each pixel uint16_t const * __restrict__ y, // uint32_t const * __restrict__ moduleStart, // index of the first pixel of each module @@ -66,8 +70,6 @@ namespace gpuClustering { __syncthreads(); // skip threads not associated to an existing pixel - bool active = (first < numElements); - if (active) { for (int i = first; i < numElements; i += blockDim.x) { if (id[i] == InvId) // skip invalid pixels continue; @@ -76,87 +78,132 @@ namespace gpuClustering { break; } 
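      // editor's note: atomicMin(&msize, i) lets all threads agree on the index of the
      // first pixel that does not belong to this module, so msize (initialised to
      // numElements) ends up marking the end of the pixel range this block works on.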
} - } //init hist (ymax=416 < 512 : 9bits) - constexpr auto nbins = phase1PixelTopology::numColsInModule/2+2; - __shared__ HistoContainer hist; - for (auto k = threadIdx.x; k; + constexpr auto wss = Hist::totbins(); + __shared__ Hist hist; + __shared__ typename Hist::Counter ws[wss]; + for (auto j=threadIdx.x; j=j or j>jm or - std::abs(int(x[j]) - int(x[i])) > 1 or - std::abs(int(y[j]) - int(y[i])) > 1) return; - auto old = atomicMin(&clusterId[j], clusterId[i]); + auto loop = [&](uint16_t const * kk) { + auto m = (*kk)+firstPixel; +#ifdef GPU_DEBUG + assert(m!=i); +#endif + if (std::abs(int(x[m]) - int(x[i])) > 1) return; + // if (std::abs(int(y[m]) - int(y[i])) > 1) return; // binssize is 1 + auto old = atomicMin(&clusterId[m], clusterId[i]); if (old != clusterId[i]) { // end the loop only if no changes were applied - done = false; + more = true; } atomicMin(&clusterId[i], old); +#ifdef CLUS_LIMIT_LOOP // update the loop boundary for the next iteration - jmax[k] = std::max(j + 1,jmax[k]); + jmax[k] = std::max(kk + 1,jmax[k]); +#endif }; - forEachInWindow(hist,y[i]>0 ? y[i]-1 :0 ,y[i]+1,loop); - } // pixel loop - } // end active + ++p; + for (;p -#include -#include - -#include "gpuClusteringConstants.h" -#include "Geometry/TrackerGeometryBuilder/interface/phase1PixelTopology.h" - -#include "HeterogeneousCore/CUDAUtilities/interface/FancyHistoContainer.h" - - -namespace gpuClustering { - - __global__ void countModules(uint16_t const * __restrict__ id, - uint32_t * __restrict__ moduleStart, - int32_t * __restrict__ clusterId, - int numElements) - { - int i = blockDim.x * blockIdx.x + threadIdx.x; - if (i >= numElements) - return; - clusterId[i] = i; - if (InvId == id[i]) - return; - auto j = i - 1; - while (j >= 0 and id[j] == InvId) - --j; - if (j < 0 or id[j] != id[i]) { - // boundary... 
- auto loc = atomicInc(moduleStart, MaxNumModules); - moduleStart[loc + 1] = i; - } - } - - __global__ -// __launch_bounds__(256,4) - void findClus(uint16_t const * __restrict__ id, // module id of each pixel - uint16_t const * __restrict__ x, // local coordinates of each pixel - uint16_t const * __restrict__ y, // - uint32_t const * __restrict__ moduleStart, // index of the first pixel of each module - uint32_t * __restrict__ nClustersInModule, // output: number of clusters found in each module - uint32_t * __restrict__ moduleId, // output: module id of each module - int32_t * __restrict__ clusterId, // output: cluster id of each pixel - int numElements) - { - - if (blockIdx.x >= moduleStart[0]) - return; - - auto firstPixel = moduleStart[1 + blockIdx.x]; - auto thisModuleId = id[firstPixel]; - assert(thisModuleId < MaxNumModules); - -#ifdef GPU_DEBUG - if (thisModuleId % 100 == 1) - if (threadIdx.x == 0) - printf("start clusterizer for module %d in block %d\n", thisModuleId, blockIdx.x); -#endif - - auto first = firstPixel + threadIdx.x; - - // find the index of the first pixel not belonging to this module (or invalid) - __shared__ int msize; - msize = numElements; - __syncthreads(); - - // skip threads not associated to an existing pixel - for (int i = first; i < numElements; i += blockDim.x) { - if (id[i] == InvId) // skip invalid pixels - continue; - if (id[i] != thisModuleId) { // find the first pixel in a different module - atomicMin(&msize, i); - break; - } - } - - //init hist (ymax=416 < 512 : 9bits) - constexpr uint32_t maxPixInModule = 4000; - constexpr auto nbins = phase1PixelTopology::numColsInModule + 2; //2+2; - using Hist = HistoContainer; - constexpr auto wss = Hist::totbins(); - __shared__ Hist hist; - __shared__ typename Hist::Counter ws[wss]; - for (auto j=threadIdx.x; j 1) return; - // if (std::abs(int(y[m]) - int(y[i])) > 1) return; // binssize is 1 - auto old = atomicMin(&clusterId[m], clusterId[i]); - if (old != clusterId[i]) { - // end the loop only if no changes were applied - more = true; - } - atomicMin(&clusterId[i], old); -#ifdef CLUS_LIMIT_LOOP - // update the loop boundary for the next iteration - jmax[k] = std::max(kk + 1,jmax[k]); -#endif - }; - ++p; - for (;p= 0) { - // mark each pixel in a cluster with the same id as the first one - clusterId[i] = clusterId[clusterId[i]]; - } - } - __syncthreads(); - - // adjust the cluster id to be a positive value starting from 0 - for (int i = first; i < msize; i += blockDim.x) { - if (id[i] == InvId) { // skip invalid pixels - clusterId[i] = -9999; - continue; - } - clusterId[i] = - clusterId[i] - 1; - } - __syncthreads(); - - if (threadIdx.x == 0) { - nClustersInModule[thisModuleId] = foundClusters; - moduleId[blockIdx.x] = thisModuleId; -#ifdef GPU_DEBUG - if (thisModuleId % 100 == 1) - if (threadIdx.x == 0) - printf("%d clusters in module %d\n", foundClusters, thisModuleId); -#endif - } - } - -} // namespace gpuClustering - -#endif // RecoLocalTracker_SiPixelClusterizer_plugins_gpuClustering_h diff --git a/RecoLocalTracker/SiPixelClusterizer/test/gpuClustering.cu b/RecoLocalTracker/SiPixelClusterizer/test/gpuClustering.cu index 2392822bab336..a01c1230586fe 100644 --- a/RecoLocalTracker/SiPixelClusterizer/test/gpuClustering.cu +++ b/RecoLocalTracker/SiPixelClusterizer/test/gpuClustering.cu @@ -11,7 +11,7 @@ #include -#include "RecoLocalTracker/SiPixelClusterizer/plugins/gpuFancyClustering.h" +#include "RecoLocalTracker/SiPixelClusterizer/plugins/gpuClustering.h" #include 
"RecoLocalTracker/SiPixelClusterizer/plugins/gpuClusterChargeCut.h" int main(void) diff --git a/RecoLocalTracker/SiPixelRecHits/plugins/PixelRecHits.cu b/RecoLocalTracker/SiPixelRecHits/plugins/PixelRecHits.cu index d442fb8195562..d10c5b7d07952 100644 --- a/RecoLocalTracker/SiPixelRecHits/plugins/PixelRecHits.cu +++ b/RecoLocalTracker/SiPixelRecHits/plugins/PixelRecHits.cu @@ -70,7 +70,8 @@ namespace pixelgpudetails { gpu_.iphi_d = slicePitch(gpu_.owner_16bit_, gpu_.owner_16bit_pitch_, 3); gpu_.sortIndex_d = slicePitch(gpu_.owner_16bit_, gpu_.owner_16bit_pitch_, 4); - cudaCheck(cudaMalloc((void **) & gpu_.hist_d, 10 * sizeof(HitsOnGPU::Hist))); + cudaCheck(cudaMalloc((void **) & gpu_.hist_d, sizeof(HitsOnGPU::Hist))); + cudaCheck(cudaMalloc((void **) & gpu_.hws_d, 4*HitsOnGPU::Hist::totbins()))); cudaCheck(cudaMalloc((void **) & gpu_d, sizeof(HitsOnGPU))); gpu_.me_d = gpu_d; cudaCheck(cudaMemcpyAsync(gpu_d, &gpu_, sizeof(HitsOnGPU), cudaMemcpyDefault, cudaStream.id())); @@ -109,6 +110,7 @@ namespace pixelgpudetails { cudaCheck(cudaFree(gpu_.owner_32bit_)); cudaCheck(cudaFree(gpu_.owner_16bit_)); cudaCheck(cudaFree(gpu_.hist_d)); + cudaCheck(cudaFree(gpu_.hws_d)); cudaCheck(cudaFree(gpu_d)); cudaCheck(cudaFree(d_phase1TopologyLayerStart_)); @@ -192,6 +194,6 @@ namespace pixelgpudetails { // radixSortMultiWrapper<<<10, 256, 0, c.stream>>>(gpu_.iphi_d, gpu_.sortIndex_d, gpu_.hitsLayerStart_d); } - cudautils::fillManyFromVector(gpu_.hist_d, 10, gpu_.iphi_d, gpu_.hitsLayerStart_d, nhits_, 256, stream.id()); + cudautils::fillManyFromVector(gpu_.hist_d, gpu_.hws_d, 10, gpu_.iphi_d, gpu_.hitsLayerStart_d, nhits_, 256, stream.id()); } } diff --git a/RecoLocalTracker/SiPixelRecHits/plugins/siPixelRecHitsHeterogeneousProduct.h b/RecoLocalTracker/SiPixelRecHits/plugins/siPixelRecHitsHeterogeneousProduct.h index a8b97e988e5e6..b8c2f06cd6854 100644 --- a/RecoLocalTracker/SiPixelRecHits/plugins/siPixelRecHitsHeterogeneousProduct.h +++ b/RecoLocalTracker/SiPixelRecHits/plugins/siPixelRecHitsHeterogeneousProduct.h @@ -18,6 +18,7 @@ namespace siPixelRecHitsHeterogeneousProduct { using CPUProduct = int; // dummy struct HitsOnGPU{ + constexpr auto MAX_HITS = gpuClustering::MaxNumModules * 256; pixelCPEforGPU::ParamsOnGPU const * cpeParams = nullptr; // forwarded from setup, NOT owned float * bs_d; const uint32_t * hitsModuleStart_d; // forwarded from clusters @@ -32,8 +33,9 @@ namespace siPixelRecHitsHeterogeneousProduct { uint16_t * mr_d; uint16_t * mc_d; - using Hist = HistoContainer; + using Hist = HistoContainer; Hist * hist_d; + uint32_t hws_d; HitsOnGPU const * me_d = nullptr; diff --git a/RecoPixelVertexing/PixelVertexFinding/src/gpuClusterTracks.h b/RecoPixelVertexing/PixelVertexFinding/src/gpuClusterTracks.h index 599ef57c7cd4a..32160e87275d2 100644 --- a/RecoPixelVertexing/PixelVertexFinding/src/gpuClusterTracks.h +++ b/RecoPixelVertexing/PixelVertexFinding/src/gpuClusterTracks.h @@ -6,7 +6,7 @@ #include #include -#include "HeterogeneousCore/CUDAUtilities/interface/FancyHistoContainer.h" +#include "HeterogeneousCore/CUDAUtilities/interface/HistoContainer.h" #include "HeterogeneousCore/CUDAUtilities/interface/radixSort.h" From 78e4aa5b6864f30a56a71d671e7dcbbf9d9e87d6 Mon Sep 17 00:00:00 2001 From: Vincenzo Innocente Date: Fri, 21 Sep 2018 19:46:41 +0200 Subject: [PATCH 25/36] wip --- HeterogeneousCore/CUDAUtilities/BuildFile.xml | 1 + RecoLocalTracker/SiPixelClusterizer/test/BuildFile.xml | 2 ++ RecoLocalTracker/SiPixelRecHits/BuildFile.xml | 3 ++- 
RecoLocalTracker/SiPixelRecHits/plugins/PixelRecHits.cu | 2 +- .../plugins/siPixelRecHitsHeterogeneousProduct.h | 5 +++-- RecoPixelVertexing/PixelVertexFinding/BuildFile.xml | 2 +- SimTracker/TrackerHitAssociation/plugins/BuildFile.xml | 1 + 7 files changed, 11 insertions(+), 5 deletions(-) diff --git a/HeterogeneousCore/CUDAUtilities/BuildFile.xml b/HeterogeneousCore/CUDAUtilities/BuildFile.xml index 38ee88b068999..fc6a61ffd433e 100644 --- a/HeterogeneousCore/CUDAUtilities/BuildFile.xml +++ b/HeterogeneousCore/CUDAUtilities/BuildFile.xml @@ -1 +1,2 @@ + diff --git a/RecoLocalTracker/SiPixelClusterizer/test/BuildFile.xml b/RecoLocalTracker/SiPixelClusterizer/test/BuildFile.xml index 17ce3cd5ce230..335591b583b58 100644 --- a/RecoLocalTracker/SiPixelClusterizer/test/BuildFile.xml +++ b/RecoLocalTracker/SiPixelClusterizer/test/BuildFile.xml @@ -34,12 +34,14 @@ + + diff --git a/RecoLocalTracker/SiPixelRecHits/BuildFile.xml b/RecoLocalTracker/SiPixelRecHits/BuildFile.xml index c52545a601341..7918c7a4f4d9a 100644 --- a/RecoLocalTracker/SiPixelRecHits/BuildFile.xml +++ b/RecoLocalTracker/SiPixelRecHits/BuildFile.xml @@ -14,7 +14,8 @@ - + + diff --git a/RecoLocalTracker/SiPixelRecHits/plugins/PixelRecHits.cu b/RecoLocalTracker/SiPixelRecHits/plugins/PixelRecHits.cu index d10c5b7d07952..bb9ff5eb84de4 100644 --- a/RecoLocalTracker/SiPixelRecHits/plugins/PixelRecHits.cu +++ b/RecoLocalTracker/SiPixelRecHits/plugins/PixelRecHits.cu @@ -34,7 +34,7 @@ namespace { namespace pixelgpudetails { PixelRecHitGPUKernel::PixelRecHitGPUKernel(cuda::stream_t<>& cudaStream) { - constexpr auto MAX_HITS = gpuClustering::MaxNumModules * 256; + constexpr auto MAX_HITS = siPixelRecHitsHeterogeneousProduct::maxHits(); cudaCheck(cudaMalloc((void **) & gpu_.bs_d, 3 * sizeof(float))); cudaCheck(cudaMalloc((void **) & gpu_.hitsLayerStart_d, 11 * sizeof(uint32_t))); diff --git a/RecoLocalTracker/SiPixelRecHits/plugins/siPixelRecHitsHeterogeneousProduct.h b/RecoLocalTracker/SiPixelRecHits/plugins/siPixelRecHitsHeterogeneousProduct.h index b8c2f06cd6854..4890333caaa6d 100644 --- a/RecoLocalTracker/SiPixelRecHits/plugins/siPixelRecHitsHeterogeneousProduct.h +++ b/RecoLocalTracker/SiPixelRecHits/plugins/siPixelRecHitsHeterogeneousProduct.h @@ -17,8 +17,9 @@ namespace siPixelRecHitsHeterogeneousProduct { using CPUProduct = int; // dummy + static constexpr uint32_t maxHits() { return 65536;} + struct HitsOnGPU{ - constexpr auto MAX_HITS = gpuClustering::MaxNumModules * 256; pixelCPEforGPU::ParamsOnGPU const * cpeParams = nullptr; // forwarded from setup, NOT owned float * bs_d; const uint32_t * hitsModuleStart_d; // forwarded from clusters @@ -33,7 +34,7 @@ namespace siPixelRecHitsHeterogeneousProduct { uint16_t * mr_d; uint16_t * mc_d; - using Hist = HistoContainer; + using Hist = HistoContainer; Hist * hist_d; uint32_t hws_d; diff --git a/RecoPixelVertexing/PixelVertexFinding/BuildFile.xml b/RecoPixelVertexing/PixelVertexFinding/BuildFile.xml index 5a3208da2c87f..43261b0417410 100644 --- a/RecoPixelVertexing/PixelVertexFinding/BuildFile.xml +++ b/RecoPixelVertexing/PixelVertexFinding/BuildFile.xml @@ -20,4 +20,4 @@ - + diff --git a/SimTracker/TrackerHitAssociation/plugins/BuildFile.xml b/SimTracker/TrackerHitAssociation/plugins/BuildFile.xml index c767b1e68936a..4f3e0d27878a1 100644 --- a/SimTracker/TrackerHitAssociation/plugins/BuildFile.xml +++ b/SimTracker/TrackerHitAssociation/plugins/BuildFile.xml @@ -9,6 +9,7 @@ + From 0c19cd105622201df5fc55d4efe5db3c8fc2bf3f Mon Sep 17 00:00:00 2001 From: Vincenzo Innocente Date: Sat, 22 
Sep 2018 11:36:45 +0200 Subject: [PATCH 26/36] does not crash --- .../SiPixelRecHits/plugins/PixelRecHits.cu | 2 +- .../siPixelRecHitsHeterogeneousProduct.h | 4 ++-- .../PixelTriplets/plugins/gpuPixelDoublets.h | 22 +++++++++---------- 3 files changed, 13 insertions(+), 15 deletions(-) diff --git a/RecoLocalTracker/SiPixelRecHits/plugins/PixelRecHits.cu b/RecoLocalTracker/SiPixelRecHits/plugins/PixelRecHits.cu index bb9ff5eb84de4..c63466f157a1b 100644 --- a/RecoLocalTracker/SiPixelRecHits/plugins/PixelRecHits.cu +++ b/RecoLocalTracker/SiPixelRecHits/plugins/PixelRecHits.cu @@ -71,7 +71,7 @@ namespace pixelgpudetails { gpu_.sortIndex_d = slicePitch(gpu_.owner_16bit_, gpu_.owner_16bit_pitch_, 4); cudaCheck(cudaMalloc((void **) & gpu_.hist_d, sizeof(HitsOnGPU::Hist))); - cudaCheck(cudaMalloc((void **) & gpu_.hws_d, 4*HitsOnGPU::Hist::totbins()))); + cudaCheck(cudaMalloc((void **) & gpu_.hws_d, 4*HitsOnGPU::Hist::totbins())); cudaCheck(cudaMalloc((void **) & gpu_d, sizeof(HitsOnGPU))); gpu_.me_d = gpu_d; cudaCheck(cudaMemcpyAsync(gpu_d, &gpu_, sizeof(HitsOnGPU), cudaMemcpyDefault, cudaStream.id())); diff --git a/RecoLocalTracker/SiPixelRecHits/plugins/siPixelRecHitsHeterogeneousProduct.h b/RecoLocalTracker/SiPixelRecHits/plugins/siPixelRecHitsHeterogeneousProduct.h index 4890333caaa6d..c06cb17e2e282 100644 --- a/RecoLocalTracker/SiPixelRecHits/plugins/siPixelRecHitsHeterogeneousProduct.h +++ b/RecoLocalTracker/SiPixelRecHits/plugins/siPixelRecHitsHeterogeneousProduct.h @@ -34,9 +34,9 @@ namespace siPixelRecHitsHeterogeneousProduct { uint16_t * mr_d; uint16_t * mc_d; - using Hist = HistoContainer; + using Hist = HistoContainer; Hist * hist_d; - uint32_t hws_d; + typename Hist::Counter * hws_d; HitsOnGPU const * me_d = nullptr; diff --git a/RecoPixelVertexing/PixelTriplets/plugins/gpuPixelDoublets.h b/RecoPixelVertexing/PixelTriplets/plugins/gpuPixelDoublets.h index 418964869dfda..ccbff0480e6c5 100644 --- a/RecoPixelVertexing/PixelTriplets/plugins/gpuPixelDoublets.h +++ b/RecoPixelVertexing/PixelTriplets/plugins/gpuPixelDoublets.h @@ -26,7 +26,7 @@ namespace gpuPixelDoublets { GPUCACell * cells, uint32_t * nCells, int16_t const * __restrict__ iphi, - Hist const * __restrict__ hist, + Hist const & __restrict__ hist, uint32_t const * __restrict__ offsets, siPixelRecHitsHeterogeneousProduct::HitsOnGPU const & __restrict__ hh, GPU::VecArray< unsigned int, 256> * isOuterHitOfCell, @@ -64,6 +64,8 @@ namespace gpuPixelDoublets { uint8_t outer = layerPairs[2*pairLayerId+1]; assert(outer > inner); + auto hoff = Hist::histOff(outer); + auto i = (0 == pairLayerId) ? 
j : j-innerLayerCumulativeSize[pairLayerId-1]; i += offsets[inner]; @@ -98,8 +100,8 @@ namespace gpuPixelDoublets { auto iphicut = phicuts[pairLayerId]; - auto kl = hist[outer].bin(int16_t(mep-iphicut)); - auto kh = hist[outer].bin(int16_t(mep+iphicut)); + auto kl = Hist::bin(int16_t(mep-iphicut)); + auto kh = Hist::bin(int16_t(mep+iphicut)); auto incr = [](auto & k) { return k = (k+1) % Hist::nbins();}; int tot = 0; int nmin = 0; @@ -109,8 +111,10 @@ namespace gpuPixelDoublets { int tooMany=0; for (auto kk = kl; kk != khh; incr(kk)) { if (kk != kl && kk != kh) - nmin += hist[outer].size(kk); - for (auto p = hist[outer].begin(kk); p < hist[outer].end(kk); ++p) { + nmin += hist.size(kk+hoff); + auto const * __restrict__ p = hist.begin(kk+hoff); + auto const * __restrict__ e = hist.end(kk+hoff); + for (;p < e; ++p) { auto oi=*p; assert(oi>=offsets[outer]); assert(oi 0) printf("OuterHitOfCell full for %d in layer %d/%d, %d:%d %d,%d\n", i, inner, outer, kl, kh, nmin, tot); - if (hist[outer].nspills() > 0) - printf("spill bin to be checked in %d %d\n", outer, hist[outer].nspills()); - - // if (0==hist[outer].nspills()) assert(tot>=nmin); - // look in spill bin as well.... - } // loop in block... } @@ -186,7 +184,7 @@ namespace gpuPixelDoublets { auto const & __restrict__ hh = *hhp; doubletsFromHisto(layerPairs, nPairs, cells, nCells, - hh.iphi_d, hh.hist_d, hh.hitsLayerStart_d, + hh.iphi_d, *hh.hist_d, hh.hitsLayerStart_d, hh, isOuterHitOfCell, phicuts, minz, maxz, maxr); } From 1b55356c1f43eccdace636494fc5b93a90f6fc67 Mon Sep 17 00:00:00 2001 From: Vincenzo Innocente Date: Sat, 22 Sep 2018 14:15:54 +0200 Subject: [PATCH 27/36] remove hard limit --- .../SiPixelRawToClusterHeterogeneous.cc | 4 ++-- .../SiPixelRecHits/plugins/gpuPixelRecHits.h | 24 ++++++++++--------- 2 files changed, 15 insertions(+), 13 deletions(-) diff --git a/RecoLocalTracker/SiPixelClusterizer/plugins/SiPixelRawToClusterHeterogeneous.cc b/RecoLocalTracker/SiPixelClusterizer/plugins/SiPixelRawToClusterHeterogeneous.cc index 76f3536ab1706..384a4732b32e1 100644 --- a/RecoLocalTracker/SiPixelClusterizer/plugins/SiPixelRawToClusterHeterogeneous.cc +++ b/RecoLocalTracker/SiPixelClusterizer/plugins/SiPixelRawToClusterHeterogeneous.cc @@ -584,7 +584,7 @@ void SiPixelRawToClusterHeterogeneous::convertGPUtoCPU(edm::Event& ev, } int32_t nclus=-1; - std::vector aclusters(256); + std::vector aclusters(1024); auto totCluseFilled=0; auto fillClusters = [&](uint32_t detId){ @@ -626,7 +626,7 @@ void SiPixelRawToClusterHeterogeneous::convertGPUtoCPU(edm::Event& ev, auto const & dig = (*detDigis).data.back(); // fill clusters assert(gpu.clus_h[i]>=0); - assert(gpu.clus_h[i]<256); + assert(gpu.clus_h[i]<1024); nclus = std::max(gpu.clus_h[i],nclus); auto row = dig.row(); auto col = dig.column(); diff --git a/RecoLocalTracker/SiPixelRecHits/plugins/gpuPixelRecHits.h b/RecoLocalTracker/SiPixelRecHits/plugins/gpuPixelRecHits.h index ec48333bde889..816a8492429ef 100644 --- a/RecoLocalTracker/SiPixelRecHits/plugins/gpuPixelRecHits.h +++ b/RecoLocalTracker/SiPixelRecHits/plugins/gpuPixelRecHits.h @@ -14,15 +14,6 @@ namespace gpuPixelRecHits { - // to be moved in common namespace... 
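// Illustrative note (not part of the commit): this commit enlarges the host-side
// cluster buffers to 1024 entries per module, while the GPU CPE kernel below
// still processes at most pixelCPEforGPU::MaxClusInModule clusters and only
// warns about any overflow. A hypothetical compile-time guard making that
// assumption explicit (not present in the patch) would be:
//   static_assert(pixelCPEforGPU::MaxClusInModule <= 1024,
//                 "host cluster buffers must hold at least MaxClusInModule clusters");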
- constexpr uint16_t InvId=9999; // must be > MaxNumModules - - - constexpr uint32_t MaxClusInModule = pixelCPEforGPU::MaxClusInModule; - - using ClusParams = pixelCPEforGPU::ClusParams; - - __global__ void getHits(pixelCPEforGPU::ParamsOnGPU const * __restrict__ cpeParams, float const * __restrict__ bs, uint16_t const * __restrict__ id, @@ -42,6 +33,14 @@ namespace gpuPixelRecHits { float * xe, float * ye, uint16_t * mr, uint16_t * mc) { + + // to be moved in common namespace... + constexpr uint16_t InvId=9999; // must be > MaxNumModules + constexpr uint32_t MaxClusInModule = pixelCPEforGPU::MaxClusInModule; + + using ClusParams = pixelCPEforGPU::ClusParams; + + // as usual one block per module __shared__ ClusParams clusParams; @@ -65,7 +64,9 @@ namespace gpuPixelRecHits { #endif assert(blockDim.x >= MaxClusInModule); - assert(nclus <= MaxClusInModule); + + if (threadIdx.x==0 && nclus > MaxClusInModule) printf("WARNING: too many clusters %d in Module %d. Only first %d processed\n", nclus,me,MaxClusInModule); + nclus = std::min(nclus, MaxClusInModule); auto ic = threadIdx.x; @@ -90,7 +91,7 @@ namespace gpuPixelRecHits { for (int i = first; i < numElements; i += blockDim.x) { if (id[i] == InvId) continue; // not valid if (id[i] != me) break; // end of module - assert(clus[i] < nclus); + if (clus[i] >= nclus) continue; atomicMin(&clusParams.minRow[clus[i]], x[i]); atomicMax(&clusParams.maxRow[clus[i]], x[i]); atomicMin(&clusParams.minCol[clus[i]], y[i]); @@ -102,6 +103,7 @@ namespace gpuPixelRecHits { for (int i = first; i < numElements; i += blockDim.x) { if (id[i] == InvId) continue; // not valid if (id[i] != me) break; // end of module + if (clus[i] >= nclus) continue; atomicAdd(&clusParams.charge[clus[i]], adc[i]); if (clusParams.minRow[clus[i]]==x[i]) atomicAdd(&clusParams.Q_f_X[clus[i]], adc[i]); if (clusParams.maxRow[clus[i]]==x[i]) atomicAdd(&clusParams.Q_l_X[clus[i]], adc[i]); From 70d2fcb5c20db8d0e7eb6a9a5c071400ab32dc59 Mon Sep 17 00:00:00 2001 From: Vincenzo Innocente Date: Sat, 22 Sep 2018 15:39:34 +0200 Subject: [PATCH 28/36] mark lost clusters --- RecoLocalTracker/SiPixelRecHits/plugins/gpuPixelRecHits.h | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/RecoLocalTracker/SiPixelRecHits/plugins/gpuPixelRecHits.h b/RecoLocalTracker/SiPixelRecHits/plugins/gpuPixelRecHits.h index 816a8492429ef..3f92e4833bc22 100644 --- a/RecoLocalTracker/SiPixelRecHits/plugins/gpuPixelRecHits.h +++ b/RecoLocalTracker/SiPixelRecHits/plugins/gpuPixelRecHits.h @@ -65,7 +65,11 @@ namespace gpuPixelRecHits { assert(blockDim.x >= MaxClusInModule); - if (threadIdx.x==0 && nclus > MaxClusInModule) printf("WARNING: too many clusters %d in Module %d. Only first %d processed\n", nclus,me,MaxClusInModule); + if (threadIdx.x==0 && nclus > MaxClusInModule) { + printf("WARNING: too many clusters %d in Module %d. 
Only first %d processed\n", nclus,me,MaxClusInModule); + // zero charge: do not bother to do it in parallel + for (auto d=MaxClusInModule; d Date: Sat, 22 Sep 2018 17:03:24 +0200 Subject: [PATCH 29/36] a bit of cleanup, no speedup --- .../plugins/CAHitQuadrupletGeneratorGPU.cu | 16 ++++++++-------- .../PixelTriplets/plugins/GPUCACell.h | 17 +++++++---------- 2 files changed, 15 insertions(+), 18 deletions(-) diff --git a/RecoPixelVertexing/PixelTriplets/plugins/CAHitQuadrupletGeneratorGPU.cu b/RecoPixelVertexing/PixelTriplets/plugins/CAHitQuadrupletGeneratorGPU.cu index ca7f8928873e4..4e17b580bbaf3 100644 --- a/RecoPixelVertexing/PixelTriplets/plugins/CAHitQuadrupletGeneratorGPU.cu +++ b/RecoPixelVertexing/PixelTriplets/plugins/CAHitQuadrupletGeneratorGPU.cu @@ -12,8 +12,8 @@ using HitsOnCPU = siPixelRecHitsHeterogeneousProduct::HitsOnCPU; __global__ void kernel_checkOverflows(GPU::SimpleVector *foundNtuplets, - GPUCACell *cells, uint32_t const * nCells, - GPU::VecArray< unsigned int, 256> *isOuterHitOfCell, + GPUCACell const * __restrict__ cells, uint32_t const * __restrict__ nCells, + GPU::VecArray< unsigned int, 256> const * __restrict__ isOuterHitOfCell, uint32_t nHits, uint32_t maxNumberOfDoublets) { auto idx = threadIdx.x + blockIdx.x * blockDim.x; @@ -36,8 +36,8 @@ kernel_checkOverflows(GPU::SimpleVector *foundNtuplets, __global__ void kernel_connect(GPU::SimpleVector *foundNtuplets, - GPUCACell *cells, uint32_t const * nCells, - GPU::VecArray< unsigned int, 256> *isOuterHitOfCell, + GPUCACell * cells, uint32_t const * __restrict__ nCells, + GPU::VecArray< unsigned int, 256> const * __restrict__ isOuterHitOfCell, float ptmin, float region_origin_radius, const float thetaCut, const float phiCut, const float hardPtCut, @@ -51,14 +51,14 @@ kernel_connect(GPU::SimpleVector *foundNtuplets, if (0==cellIndex) foundNtuplets->reset(); // ready for next kernel if (cellIndex >= (*nCells) ) return; - auto &thisCell = cells[cellIndex]; + auto const & thisCell = cells[cellIndex]; auto innerHitId = thisCell.get_inner_hit_id(); auto numberOfPossibleNeighbors = isOuterHitOfCell[innerHitId].size(); for (auto j = 0; j < numberOfPossibleNeighbors; ++j) { auto otherCell = isOuterHitOfCell[innerHitId][j]; - if (thisCell.check_alignment_and_tag( - cells, otherCell, ptmin, region_origin_x, region_origin_y, + if (thisCell.check_alignment( + cells[otherCell], ptmin, region_origin_x, region_origin_y, region_origin_radius, thetaCut, phiCut, hardPtCut) ) { cells[otherCell].theOuterNeighbors.push_back(cellIndex); @@ -67,7 +67,7 @@ kernel_connect(GPU::SimpleVector *foundNtuplets, } __global__ void kernel_find_ntuplets( - GPUCACell *cells, uint32_t const * nCells, + GPUCACell * const __restrict__ cells, uint32_t const * nCells, GPU::SimpleVector *foundNtuplets, unsigned int minHitsPerNtuplet, unsigned int maxNumberOfDoublets_) diff --git a/RecoPixelVertexing/PixelTriplets/plugins/GPUCACell.h b/RecoPixelVertexing/PixelTriplets/plugins/GPUCACell.h index 02cf0d6f91642..a9578e7dab9dc 100644 --- a/RecoPixelVertexing/PixelTriplets/plugins/GPUCACell.h +++ b/RecoPixelVertexing/PixelTriplets/plugins/GPUCACell.h @@ -20,7 +20,7 @@ class GPUCACell { GPUCACell() = default; __host__ __device__ - void init(siPixelRecHitsHeterogeneousProduct::HitsOnGPU const & hh, + void init(siPixelRecHitsHeterogeneousProduct::HitsOnGPU const & __restrict__ hh, int layerPairId, int doubletId, int innerHitId,int outerHitId) { theInnerHitId = innerHitId; @@ -64,21 +64,20 @@ class GPUCACell { } __host__ __device__ - bool check_alignment_and_tag( - 
const GPUCACell *cells, unsigned int innerCellId, const float ptmin, + bool check_alignment( + GPUCACell const & otherCell, const float ptmin, const float region_origin_x, const float region_origin_y, const float region_origin_radius, const float thetaCut, - const float phiCut, const float hardPtCut) + const float phiCut, const float hardPtCut) const { auto ro = get_outer_r(); auto zo = get_outer_z(); - const auto &otherCell = cells[innerCellId]; auto r1 = otherCell.get_inner_r(); auto z1 = otherCell.get_inner_z(); bool aligned = areAlignedRZ(r1, z1, ro, zo, ptmin, thetaCut); return (aligned && - haveSimilarCurvature(cells, innerCellId, ptmin, region_origin_x, + haveSimilarCurvature(otherCell, ptmin, region_origin_x, region_origin_y, region_origin_radius, phiCut, hardPtCut)); } @@ -101,14 +100,12 @@ class GPUCACell { } constexpr bool - haveSimilarCurvature(const GPUCACell *cells, unsigned int innerCellId, + haveSimilarCurvature(GPUCACell const & otherCell, const float ptmin, const float region_origin_x, const float region_origin_y, const float region_origin_radius, const float phiCut, const float hardPtCut) const { - const auto &otherCell = cells[innerCellId]; - auto x1 = otherCell.get_inner_x(); auto y1 = otherCell.get_inner_y(); @@ -190,7 +187,7 @@ class GPUCACell { __device__ inline void find_ntuplets( - const GPUCACell *cells, + GPUCACell const * __restrict__ cells, GPU::SimpleVector *foundNtuplets, GPU::VecArray &tmpNtuplet, const unsigned int minHitsPerNtuplet) const From 85453afd53160745b86f3f3d9fc61de7007b8ffe Mon Sep 17 00:00:00 2001 From: Vincenzo Innocente Date: Mon, 24 Sep 2018 12:39:00 +0200 Subject: [PATCH 30/36] force load in constant cache, reduce size of Cell --- .../siPixelRecHitsHeterogeneousProduct.h | 1 + .../plugins/CAHitQuadrupletGeneratorGPU.cu | 6 +- .../plugins/CAHitQuadrupletGeneratorGPU.h | 2 + .../PixelTriplets/plugins/GPUCACell.h | 108 +++++++++--------- .../PixelTriplets/plugins/gpuPixelDoublets.h | 12 +- 5 files changed, 68 insertions(+), 61 deletions(-) diff --git a/RecoLocalTracker/SiPixelRecHits/plugins/siPixelRecHitsHeterogeneousProduct.h b/RecoLocalTracker/SiPixelRecHits/plugins/siPixelRecHitsHeterogeneousProduct.h index c06cb17e2e282..9d0fe7a279799 100644 --- a/RecoLocalTracker/SiPixelRecHits/plugins/siPixelRecHitsHeterogeneousProduct.h +++ b/RecoLocalTracker/SiPixelRecHits/plugins/siPixelRecHitsHeterogeneousProduct.h @@ -18,6 +18,7 @@ namespace siPixelRecHitsHeterogeneousProduct { using CPUProduct = int; // dummy static constexpr uint32_t maxHits() { return 65536;} + using hindex_type = uint16_t; // if above is <=2^16 struct HitsOnGPU{ pixelCPEforGPU::ParamsOnGPU const * cpeParams = nullptr; // forwarded from setup, NOT owned diff --git a/RecoPixelVertexing/PixelTriplets/plugins/CAHitQuadrupletGeneratorGPU.cu b/RecoPixelVertexing/PixelTriplets/plugins/CAHitQuadrupletGeneratorGPU.cu index 4e17b580bbaf3..3574276b69ebd 100644 --- a/RecoPixelVertexing/PixelTriplets/plugins/CAHitQuadrupletGeneratorGPU.cu +++ b/RecoPixelVertexing/PixelTriplets/plugins/CAHitQuadrupletGeneratorGPU.cu @@ -36,6 +36,7 @@ kernel_checkOverflows(GPU::SimpleVector *foundNtuplets, __global__ void kernel_connect(GPU::SimpleVector *foundNtuplets, + GPUCACell::Hits const * __restrict__ hhp, GPUCACell * cells, uint32_t const * __restrict__ nCells, GPU::VecArray< unsigned int, 256> const * __restrict__ isOuterHitOfCell, float ptmin, @@ -43,6 +44,8 @@ kernel_connect(GPU::SimpleVector *foundNtuplets, const float phiCut, const float hardPtCut, unsigned int maxNumberOfDoublets_, unsigned int 
maxNumberOfHits_) { + auto const & hh = *hhp; + float region_origin_x = 0.; float region_origin_y = 0.; @@ -57,7 +60,7 @@ kernel_connect(GPU::SimpleVector *foundNtuplets, for (auto j = 0; j < numberOfPossibleNeighbors; ++j) { auto otherCell = isOuterHitOfCell[innerHitId][j]; - if (thisCell.check_alignment( + if (thisCell.check_alignment(hh, cells[otherCell], ptmin, region_origin_x, region_origin_y, region_origin_radius, thetaCut, phiCut, hardPtCut) ) { @@ -163,6 +166,7 @@ void CAHitQuadrupletGeneratorGPU::launchKernels(const TrackingRegion ®ion, auto numberOfBlocks = (maxNumberOfDoublets_ + blockSize - 1)/blockSize; kernel_connect<<>>( d_foundNtupletsVec_[regionIndex], // needed only to be reset, ready for next kernel + hh.gpu_d, device_theCells_, device_nCells_, device_isOuterHitOfCell_, region.ptMin(), diff --git a/RecoPixelVertexing/PixelTriplets/plugins/CAHitQuadrupletGeneratorGPU.h b/RecoPixelVertexing/PixelTriplets/plugins/CAHitQuadrupletGeneratorGPU.h index c06f1be2f257b..81841fdc9bd0c 100644 --- a/RecoPixelVertexing/PixelTriplets/plugins/CAHitQuadrupletGeneratorGPU.h +++ b/RecoPixelVertexing/PixelTriplets/plugins/CAHitQuadrupletGeneratorGPU.h @@ -21,6 +21,8 @@ #include "RecoTracker/TkSeedingLayers/interface/SeedComparitor.h" #include "RecoTracker/TkSeedingLayers/interface/SeedComparitorFactory.h" + +// FIXME (split header???) #include "GPUCACell.h" class TrackingRegion; diff --git a/RecoPixelVertexing/PixelTriplets/plugins/GPUCACell.h b/RecoPixelVertexing/PixelTriplets/plugins/GPUCACell.h index a9578e7dab9dc..5646b6db4f96b 100644 --- a/RecoPixelVertexing/PixelTriplets/plugins/GPUCACell.h +++ b/RecoPixelVertexing/PixelTriplets/plugins/GPUCACell.h @@ -6,7 +6,6 @@ #include -#include "GPUHitsAndDoublets.h" #include "RecoLocalTracker/SiPixelRecHits/plugins/siPixelRecHitsHeterogeneousProduct.h" #include "HeterogeneousCore/CUDAUtilities/interface/GPUVecArray.h" #include "HeterogeneousCore/CUDAUtilities/interface/GPUSimpleVector.h" @@ -15,40 +14,40 @@ struct Quadruplet { int hitId[4]; }; + class GPUCACell { public: + + using Hits = siPixelRecHitsHeterogeneousProduct::HitsOnGPU; + using hindex_type = siPixelRecHitsHeterogeneousProduct::hindex_type; + GPUCACell() = default; +#ifdef __CUDACC__ - __host__ __device__ - void init(siPixelRecHitsHeterogeneousProduct::HitsOnGPU const & __restrict__ hh, - int layerPairId, int doubletId, int innerHitId,int outerHitId) + __device__ __forceinline__ + void init(Hits const & hh, + int layerPairId, int doubletId, + hindex_type innerHitId, hindex_type outerHitId) { theInnerHitId = innerHitId; theOuterHitId = outerHitId; theDoubletId = doubletId; theLayerPairId = layerPairId; - theInnerX = hh.xg_d[innerHitId]; - theOuterX = hh.xg_d[outerHitId]; - - theInnerY = hh.yg_d[innerHitId]; - theOuterY = hh.yg_d[outerHitId]; - - theInnerZ = hh.zg_d[innerHitId]; - theOuterZ = hh.zg_d[outerHitId]; - theInnerR = hh.rg_d[innerHitId]; - theOuterR = hh.rg_d[outerHitId]; + theInnerZ = __ldg(hh.zg_d+innerHitId); + theInnerR = __ldg(hh.rg_d+innerHitId); theOuterNeighbors.reset(); } - constexpr float get_inner_x() const { return theInnerX; } - constexpr float get_outer_x() const { return theOuterX; } - constexpr float get_inner_y() const { return theInnerY; } - constexpr float get_outer_y() const { return theOuterY; } - constexpr float get_inner_z() const { return theInnerZ; } - constexpr float get_outer_z() const { return theOuterZ; } - constexpr float get_inner_r() const { return theInnerR; } - constexpr float get_outer_r() const { return theOuterR; } + __device__ 
__forceinline__ float get_inner_x(Hits const & hh) const { return __ldg(hh.xg_d+theInnerHitId); } + __device__ __forceinline__ float get_outer_x(Hits const & hh) const { return __ldg(hh.xg_d+theOuterHitId); } + __device__ __forceinline__ float get_inner_y(Hits const & hh) const { return __ldg(hh.yg_d+theInnerHitId); } + __device__ __forceinline__ float get_outer_y(Hits const & hh) const { return __ldg(hh.yg_d+theOuterHitId); } + __device__ __forceinline__ float get_inner_z(Hits const & hh) const { return theInnerZ; } // { return __ldg(hh.zg_d+theInnerHitId); } // { return theInnerZ; } + __device__ __forceinline__ float get_outer_z(Hits const & hh) const { return __ldg(hh.zg_d+theOuterHitId); } + __device__ __forceinline__ float get_inner_r(Hits const & hh) const { return theInnerR; } // { return __ldg(hh.rg_d+theInnerHitId); } // { return theInnerR; } + __device__ __forceinline__ float get_outer_r(Hits const & hh) const { return __ldg(hh.rg_d+theOuterHitId); } + constexpr unsigned int get_inner_hit_id() const { return theInnerHitId; } @@ -56,36 +55,42 @@ class GPUCACell { return theOuterHitId; } - constexpr void print_cell() const { +/* + __host__ __device__ + void print_cell() const { printf("printing cell: %d, on layerPair: %d, innerHitId: %d, outerHitId: " "%d, innerradius %f, outerRadius %f \n", - theDoubletId, theLayerPairId, theInnerHitId, theOuterHitId, - theInnerR, theOuterR); + theDoubletId, theLayerPairId, theInnerHitId, theOuterHitId + ); } +*/ - __host__ __device__ - bool check_alignment( + __device__ + bool check_alignment(Hits const & hh, GPUCACell const & otherCell, const float ptmin, const float region_origin_x, const float region_origin_y, const float region_origin_radius, const float thetaCut, const float phiCut, const float hardPtCut) const { - auto ro = get_outer_r(); - auto zo = get_outer_z(); + auto ri = get_inner_r(hh); + auto zi = get_inner_z(hh); - auto r1 = otherCell.get_inner_r(); - auto z1 = otherCell.get_inner_z(); - bool aligned = areAlignedRZ(r1, z1, ro, zo, ptmin, thetaCut); + auto ro = get_outer_r(hh); + auto zo = get_outer_z(hh); + + auto r1 = otherCell.get_inner_r(hh); + auto z1 = otherCell.get_inner_z(hh); + bool aligned = areAlignedRZ(r1, z1, ri, zi, ro, zo, ptmin, thetaCut); return (aligned && - haveSimilarCurvature(otherCell, ptmin, region_origin_x, + haveSimilarCurvature(hh, otherCell, ptmin, region_origin_x, region_origin_y, region_origin_radius, phiCut, hardPtCut)); } - - constexpr bool areAlignedRZ(float r1, float z1, float ro, float zo, + __device__ __forceinline__ + static bool areAlignedRZ(float r1, float z1, float ri, float zi, float ro, float zo, const float ptmin, - const float thetaCut) const { + const float thetaCut) { float radius_diff = std::abs(r1 - ro); float distance_13_squared = radius_diff * radius_diff + (z1 - zo) * (z1 - zo); @@ -95,25 +100,26 @@ class GPUCACell { // radius_diff later float tan_12_13_half_mul_distance_13_squared = - fabs(z1 * (get_inner_r() - ro) + get_inner_z() * (ro - r1) + zo * (r1 - get_inner_r())); + fabs(z1 * (ri - ro) + zi * (ro - r1) + zo * (r1 - ri)); return tan_12_13_half_mul_distance_13_squared * pMin <= thetaCut * distance_13_squared * radius_diff; } - constexpr bool - haveSimilarCurvature(GPUCACell const & otherCell, + __device__ + bool + haveSimilarCurvature(Hits const & hh, GPUCACell const & otherCell, const float ptmin, const float region_origin_x, const float region_origin_y, const float region_origin_radius, const float phiCut, const float hardPtCut) const { - auto x1 = otherCell.get_inner_x(); - 
auto y1 = otherCell.get_inner_y(); + auto x1 = otherCell.get_inner_x(hh); + auto y1 = otherCell.get_inner_y(hh); - auto x2 = get_inner_x(); - auto y2 = get_inner_y(); + auto x2 = get_inner_x(hh); + auto y2 = get_inner_y(hh); - auto x3 = get_outer_x(); - auto y3 = get_outer_y(); + auto x3 = get_outer_x(hh); + auto y3 = get_outer_y(hh); float distance_13_squared = (x1 - x3) * (x1 - x3) + (y1 - y3) * (y1 - y3); float tan_12_13_half_mul_distance_13_squared = @@ -183,7 +189,7 @@ class GPUCACell { // trying to free the track building process from hardcoded layers, leaving // the visit of the graph based on the neighborhood connections between cells. -#ifdef __CUDACC__ +// #ifdef __CUDACC__ __device__ inline void find_ntuplets( @@ -228,16 +234,10 @@ class GPUCACell { int theLayerPairId; private: - unsigned int theInnerHitId; - unsigned int theOuterHitId; - float theInnerX; - float theOuterX; - float theInnerY; - float theOuterY; float theInnerZ; - float theOuterZ; float theInnerR; - float theOuterR; + hindex_type theInnerHitId; + hindex_type theOuterHitId; }; #endif // RecoPixelVertexing_PixelTriplets_plugins_GPUCACell_h diff --git a/RecoPixelVertexing/PixelTriplets/plugins/gpuPixelDoublets.h b/RecoPixelVertexing/PixelTriplets/plugins/gpuPixelDoublets.h index ccbff0480e6c5..61d048637585c 100644 --- a/RecoPixelVertexing/PixelTriplets/plugins/gpuPixelDoublets.h +++ b/RecoPixelVertexing/PixelTriplets/plugins/gpuPixelDoublets.h @@ -76,8 +76,8 @@ namespace gpuPixelDoublets { // found hit corresponding to our cuda thread, now do the job auto mep = iphi[i]; - auto mez = hh.zg_d[i]; - auto mer = hh.rg_d[i]; + auto mez = __ldg(hh.zg_d+i); + auto mer = __ldg(hh.rg_d+i); constexpr float z0cut = 12.f; // cm constexpr float hardPtCut = 0.5f; // GeV @@ -86,13 +86,13 @@ namespace gpuPixelDoublets { auto ptcut = [&](int j) { auto r2t4 = minRadius2T4; auto ri = mer; - auto ro = hh.rg_d[j]; + auto ro = __ldg(hh.rg_d+j); auto dphi = short2phi( min( abs(int16_t(mep-iphi[j])), abs(int16_t(iphi[j]-mep)) ) ); return dphi*dphi * (r2t4 - ri*ro) > (ro-ri)*(ro-ri); }; auto z0cutoff = [&](int j) { - auto zo = hh.zg_d[j]; - auto ro = hh.rg_d[j]; + auto zo = __ldg(hh.zg_d+j); + auto ro = __ldg(hh.rg_d+j); auto dr = ro-mer; return dr > maxr[pairLayerId] || dr<0 || std::abs((mez*ro - mer*zo)) > z0cut*dr; @@ -115,7 +115,7 @@ namespace gpuPixelDoublets { auto const * __restrict__ p = hist.begin(kk+hoff); auto const * __restrict__ e = hist.end(kk+hoff); for (;p < e; ++p) { - auto oi=*p; + auto oi=__ldg(p); assert(oi>=offsets[outer]); assert(oi Date: Mon, 24 Sep 2018 15:56:31 +0200 Subject: [PATCH 31/36] be consistent with ht index type --- .../CUDAUtilities/interface/GPUSimpleVector.h | 2 +- .../CUDAUtilities/interface/GPUVecArray.h | 2 +- .../plugins/CAHitQuadrupletGeneratorGPU.cu | 9 +++++---- .../plugins/CAHitQuadrupletGeneratorGPU.h | 1 + RecoPixelVertexing/PixelTriplets/plugins/GPUCACell.h | 11 ++++++----- 5 files changed, 14 insertions(+), 11 deletions(-) diff --git a/HeterogeneousCore/CUDAUtilities/interface/GPUSimpleVector.h b/HeterogeneousCore/CUDAUtilities/interface/GPUSimpleVector.h index 74c8731da3712..47592fd2063d6 100644 --- a/HeterogeneousCore/CUDAUtilities/interface/GPUSimpleVector.h +++ b/HeterogeneousCore/CUDAUtilities/interface/GPUSimpleVector.h @@ -87,7 +87,7 @@ template struct SimpleVector { inline constexpr void reset() { m_size = 0; } inline constexpr int size() const { return m_size; } inline constexpr int capacity() const { return m_capacity; } - inline constexpr T *data() const { return m_data; } + inline 
constexpr T const * data() const { return m_data; } inline constexpr void resize(int size) { m_size = size; } inline constexpr void set_data(T * data) { m_data = data; } diff --git a/HeterogeneousCore/CUDAUtilities/interface/GPUVecArray.h b/HeterogeneousCore/CUDAUtilities/interface/GPUVecArray.h index 6083c0a55cd3b..8dcefdce65ab4 100644 --- a/HeterogeneousCore/CUDAUtilities/interface/GPUVecArray.h +++ b/HeterogeneousCore/CUDAUtilities/interface/GPUVecArray.h @@ -85,7 +85,7 @@ template struct VecArray { inline constexpr const T& operator[](int i) const { return m_data[i]; } inline constexpr void reset() { m_size = 0; } inline constexpr int capacity() const { return maxSize; } - inline constexpr T *data() const { return m_data; } + inline constexpr T const * data() const { return m_data; } inline constexpr void resize(int size) { m_size = size; } inline constexpr bool empty() const { return 0 == m_size; } inline constexpr bool full() const { return maxSize == m_size; } diff --git a/RecoPixelVertexing/PixelTriplets/plugins/CAHitQuadrupletGeneratorGPU.cu b/RecoPixelVertexing/PixelTriplets/plugins/CAHitQuadrupletGeneratorGPU.cu index 3574276b69ebd..69ca7f38a5b18 100644 --- a/RecoPixelVertexing/PixelTriplets/plugins/CAHitQuadrupletGeneratorGPU.cu +++ b/RecoPixelVertexing/PixelTriplets/plugins/CAHitQuadrupletGeneratorGPU.cu @@ -46,8 +46,8 @@ kernel_connect(GPU::SimpleVector *foundNtuplets, auto const & hh = *hhp; - float region_origin_x = 0.; - float region_origin_y = 0.; + constexpr float region_origin_x = 0.; + constexpr float region_origin_y = 0.; auto cellIndex = threadIdx.x + blockIdx.x * blockDim.x; @@ -57,8 +57,9 @@ kernel_connect(GPU::SimpleVector *foundNtuplets, auto const & thisCell = cells[cellIndex]; auto innerHitId = thisCell.get_inner_hit_id(); auto numberOfPossibleNeighbors = isOuterHitOfCell[innerHitId].size(); + auto vi = isOuterHitOfCell[innerHitId].data(); for (auto j = 0; j < numberOfPossibleNeighbors; ++j) { - auto otherCell = isOuterHitOfCell[innerHitId][j]; + auto otherCell = __ldg(vi+j); if (thisCell.check_alignment(hh, cells[otherCell], ptmin, region_origin_x, region_origin_y, @@ -80,7 +81,7 @@ __global__ void kernel_find_ntuplets( if (cellIndex >= (*nCells) ) return; auto &thisCell = cells[cellIndex]; if (thisCell.theLayerPairId!=0 && thisCell.theLayerPairId!=3 && thisCell.theLayerPairId!=8) return; // inner layer is 0 FIXME - GPU::VecArray stack; + GPU::VecArray stack; stack.reset(); thisCell.find_ntuplets(cells, foundNtuplets, stack, minHitsPerNtuplet); assert(stack.size()==0); diff --git a/RecoPixelVertexing/PixelTriplets/plugins/CAHitQuadrupletGeneratorGPU.h b/RecoPixelVertexing/PixelTriplets/plugins/CAHitQuadrupletGeneratorGPU.h index 81841fdc9bd0c..f34b323ed2346 100644 --- a/RecoPixelVertexing/PixelTriplets/plugins/CAHitQuadrupletGeneratorGPU.h +++ b/RecoPixelVertexing/PixelTriplets/plugins/CAHitQuadrupletGeneratorGPU.h @@ -38,6 +38,7 @@ class CAHitQuadrupletGeneratorGPU { using HitsOnGPU = siPixelRecHitsHeterogeneousProduct::HitsOnGPU; using HitsOnCPU = siPixelRecHitsHeterogeneousProduct::HitsOnCPU; + using hindex_type = siPixelRecHitsHeterogeneousProduct::hindex_type; static constexpr unsigned int minLayers = 4; typedef OrderedHitSeeds ResultType; diff --git a/RecoPixelVertexing/PixelTriplets/plugins/GPUCACell.h b/RecoPixelVertexing/PixelTriplets/plugins/GPUCACell.h index 5646b6db4f96b..772b802282d31 100644 --- a/RecoPixelVertexing/PixelTriplets/plugins/GPUCACell.h +++ b/RecoPixelVertexing/PixelTriplets/plugins/GPUCACell.h @@ -11,7 +11,8 @@ #include 
"HeterogeneousCore/CUDAUtilities/interface/GPUSimpleVector.h" struct Quadruplet { - int hitId[4]; + using hindex_type = siPixelRecHitsHeterogeneousProduct::hindex_type; + hindex_type hitId[4]; }; @@ -55,15 +56,15 @@ class GPUCACell { return theOuterHitId; } -/* - __host__ __device__ + + __device__ void print_cell() const { printf("printing cell: %d, on layerPair: %d, innerHitId: %d, outerHitId: " "%d, innerradius %f, outerRadius %f \n", theDoubletId, theLayerPairId, theInnerHitId, theOuterHitId ); } -*/ + __device__ bool check_alignment(Hits const & hh, @@ -195,7 +196,7 @@ class GPUCACell { inline void find_ntuplets( GPUCACell const * __restrict__ cells, GPU::SimpleVector *foundNtuplets, - GPU::VecArray &tmpNtuplet, + GPU::VecArray &tmpNtuplet, const unsigned int minHitsPerNtuplet) const { // the building process for a track ends if: From bda658ac0ba64b7a5b0788cb4acc8e74c839d913 Mon Sep 17 00:00:00 2001 From: Vincenzo Innocente Date: Tue, 25 Sep 2018 15:32:53 +0200 Subject: [PATCH 32/36] fill pt2 and index for nv==1 as well --- HeterogeneousCore/CUDAUtilities/interface/radixSort.h | 1 - .../PixelVertexFinding/src/gpuClusterTracks.h | 6 +++++- 2 files changed, 5 insertions(+), 2 deletions(-) diff --git a/HeterogeneousCore/CUDAUtilities/interface/radixSort.h b/HeterogeneousCore/CUDAUtilities/interface/radixSort.h index 83468d854b1e6..5de2a5b6f8807 100644 --- a/HeterogeneousCore/CUDAUtilities/interface/radixSort.h +++ b/HeterogeneousCore/CUDAUtilities/interface/radixSort.h @@ -103,7 +103,6 @@ radixSortImpl(T const * __restrict__ a, uint16_t * ind, uint16_t * ind2, uint32_ for (auto i=first; i Date: Tue, 25 Sep 2018 23:17:37 +0200 Subject: [PATCH 33/36] check cudaGetLastError() after launching each kernel --- HeterogeneousCore/CUDAUtilities/interface/HistoContainer.h | 1 + 1 file changed, 1 insertion(+) diff --git a/HeterogeneousCore/CUDAUtilities/interface/HistoContainer.h b/HeterogeneousCore/CUDAUtilities/interface/HistoContainer.h index e21d03a4d86e3..e13e30594ab6f 100644 --- a/HeterogeneousCore/CUDAUtilities/interface/HistoContainer.h +++ b/HeterogeneousCore/CUDAUtilities/interface/HistoContainer.h @@ -57,6 +57,7 @@ namespace cudautils { cudaMemsetAsync(off,0, 4*Histo::totbins(),stream); auto nblocks = (totSize + nthreads - 1) / nthreads; countFromVector<<>>(h, nh, v, offsets); + cudaCheck(cudaGetLastError()); size_t wss = Histo::totbins(); CubDebugExit(cub::DeviceScan::InclusiveSum(ws, wss, off, off, Histo::totbins(), stream)); cudaMemsetAsync(ws,0, 4*Histo::totbins(),stream); From 3f69a405558a782e8ca10ab786617e9e9dfb053b Mon Sep 17 00:00:00 2001 From: Andrea Bocci Date: Tue, 25 Sep 2018 23:21:38 +0200 Subject: [PATCH 34/36] check cudaGetLastError() after launching each kernel --- .../PixelTriplets/plugins/CAHitQuadrupletGeneratorGPU.cu | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/RecoPixelVertexing/PixelTriplets/plugins/CAHitQuadrupletGeneratorGPU.cu b/RecoPixelVertexing/PixelTriplets/plugins/CAHitQuadrupletGeneratorGPU.cu index 69ca7f38a5b18..53930f467a6c5 100644 --- a/RecoPixelVertexing/PixelTriplets/plugins/CAHitQuadrupletGeneratorGPU.cu +++ b/RecoPixelVertexing/PixelTriplets/plugins/CAHitQuadrupletGeneratorGPU.cu @@ -182,7 +182,6 @@ void CAHitQuadrupletGeneratorGPU::launchKernels(const TrackingRegion ®ion, 4, maxNumberOfDoublets_); cudaCheck(cudaGetLastError()); - numberOfBlocks = (std::max(int(nhits), maxNumberOfDoublets_) + blockSize - 1)/blockSize; kernel_checkOverflows<<>>( d_foundNtupletsVec_[regionIndex], @@ -190,11 +189,11 @@ void 
CAHitQuadrupletGeneratorGPU::launchKernels(const TrackingRegion ®ion, device_isOuterHitOfCell_, nhits, maxNumberOfDoublets_ ); - + cudaCheck(cudaGetLastError()); // kernel_print_found_ntuplets<<<1, 1, 0, cudaStream>>>(d_foundNtupletsVec_[regionIndex], 10); - if(transferToCPU) { + if (transferToCPU) { cudaCheck(cudaMemcpyAsync(h_foundNtupletsVec_[regionIndex], d_foundNtupletsVec_[regionIndex], sizeof(GPU::SimpleVector), cudaMemcpyDeviceToHost, cudaStream)); From a96066857080595c73f61d60d4eb6615e9569808 Mon Sep 17 00:00:00 2001 From: Andrea Bocci Date: Tue, 25 Sep 2018 23:22:59 +0200 Subject: [PATCH 35/36] check cudaGetLastError() after launching each kernel --- .../PixelVertexFinding/src/gpuVertexFinder.cu | 14 ++++---------- 1 file changed, 4 insertions(+), 10 deletions(-) diff --git a/RecoPixelVertexing/PixelVertexFinding/src/gpuVertexFinder.cu b/RecoPixelVertexing/PixelVertexFinding/src/gpuVertexFinder.cu index 6cb08ae817721..b4cb7fa24e6e5 100644 --- a/RecoPixelVertexing/PixelVertexFinding/src/gpuVertexFinder.cu +++ b/RecoPixelVertexing/PixelVertexFinding/src/gpuVertexFinder.cu @@ -53,19 +53,18 @@ namespace gpuVertexFinder { uint32_t ntrks ) { - cudaCheck(cudaMemcpyAsync(onGPU.zt,zt,ntrks*sizeof(float), cudaMemcpyHostToDevice,stream)); cudaCheck(cudaMemcpyAsync(onGPU.ezt2,ezt2,ntrks*sizeof(float), cudaMemcpyHostToDevice,stream)); - cudaCheck(cudaMemcpyAsync(onGPU.ptt2,ptt2,ntrks*sizeof(float), cudaMemcpyHostToDevice,stream)); - assert(onGPU_d); clusterTracks<<<1,1024-256,0,stream>>>(ntrks,onGPU_d,minT,eps,errmax,chi2max); - sortByPt2<<<1,256,0,stream>>>(ntrks,onGPU_d); + cudaCheck(cudaGetLastError()); + sortByPt2<<<1,256,0,stream>>>(ntrks,onGPU_d); + cudaCheck(cudaGetLastError()); cudaCheck(cudaMemcpyAsync(&gpuProduct.nVertices, onGPU.nv, sizeof(uint32_t), cudaMemcpyDeviceToHost, stream)); @@ -73,8 +72,6 @@ namespace gpuVertexFinder { gpuProduct.ivtx.resize(ntrks); cudaCheck(cudaMemcpyAsync(gpuProduct.ivtx.data(),onGPU.iv,sizeof(int32_t)*ntrks, cudaMemcpyDeviceToHost, stream)); - - } Producer::GPUProduct const & Producer::fillResults(cudaStream_t stream) { @@ -98,8 +95,5 @@ namespace gpuVertexFinder { return gpuProduct; } - - - + } // end namespace - From 0e2812cfc49d6f664163f82022739380501d5e68 Mon Sep 17 00:00:00 2001 From: Andrea Bocci Date: Wed, 26 Sep 2018 00:20:19 +0200 Subject: [PATCH 36/36] Declare the direct dependencies Declare the direct dependency on "HeterogeneousCore/CUDAUtilities" rather than the indirect dependency on "cub". --- SimTracker/TrackerHitAssociation/plugins/BuildFile.xml | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) diff --git a/SimTracker/TrackerHitAssociation/plugins/BuildFile.xml b/SimTracker/TrackerHitAssociation/plugins/BuildFile.xml index 4f3e0d27878a1..a63c38264c20d 100644 --- a/SimTracker/TrackerHitAssociation/plugins/BuildFile.xml +++ b/SimTracker/TrackerHitAssociation/plugins/BuildFile.xml @@ -1,15 +1,14 @@ + + + - - - - + -
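A recurring pattern in the commits titled "check cudaGetLastError() after launching each
kernel" is to follow every kernel launch with cudaCheck(cudaGetLastError()), so that launch
errors are reported at the call site instead of being silently deferred to the next CUDA call.
A minimal sketch of that convention is shown below; it assumes only the cudaCheck helper from
HeterogeneousCore/CUDAUtilities already used throughout this series, and the launchAndCheck
wrapper itself is hypothetical, not part of any of these patches.

    #include <cuda_runtime.h>
    #include "HeterogeneousCore/CUDAUtilities/interface/cudaCheck.h"

    // Hypothetical convenience wrapper: launch a kernel and surface any
    // launch error immediately, as the patches above do inline after each launch.
    template <typename Kernel, typename... Args>
    void launchAndCheck(Kernel kernel, dim3 blocks, dim3 threads,
                        size_t sharedBytes, cudaStream_t stream, Args... args) {
      kernel<<<blocks, threads, sharedBytes, stream>>>(args...);
      cudaCheck(cudaGetLastError());   // catches e.g. an invalid launch configuration
    }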