From 339bb0fcbdc3955056712ca0c1ecfb026e59cb25 Mon Sep 17 00:00:00 2001 From: Vincenzo Innocente Date: Wed, 30 May 2018 15:08:12 +0200 Subject: [PATCH 1/3] faster cluster, now do not requires to know number of modules --- .../plugins/gpuClustering.h | 21 +++++++++++++------ .../SiPixelClusterizer/test/gpuClustering.cu | 11 +++++----- 2 files changed, 20 insertions(+), 12 deletions(-) diff --git a/RecoLocalTracker/SiPixelClusterizer/plugins/gpuClustering.h b/RecoLocalTracker/SiPixelClusterizer/plugins/gpuClustering.h index 672a4dbe97450..77cb2cf608a76 100644 --- a/RecoLocalTracker/SiPixelClusterizer/plugins/gpuClustering.h +++ b/RecoLocalTracker/SiPixelClusterizer/plugins/gpuClustering.h @@ -1,9 +1,9 @@ #ifndef RecoLocalTracker_SiPixelClusterizer_plugins_gpuClustering_h #define RecoLocalTracker_SiPixelClusterizer_plugins_gpuClustering_h -#include #include #include +#include namespace gpuClustering { @@ -46,6 +46,8 @@ namespace gpuClustering { __shared__ int msize; + if (blockIdx.x >=moduleStart[0]) return; + auto first = moduleStart[1 + blockIdx.x]; auto me = id[first]; @@ -79,9 +81,11 @@ namespace gpuClustering { if (first>=msize) return; int jmax[10]; + // int jmin[10]; auto niter = (msize-first)/blockDim.x; assert(niter<10); - for (int i=0; i1) continue; - if (std::abs(int(y[j])-int(y[i]))>1) continue; + if (std::abs(int(x[j])-int(x[i]))>1 | + std::abs(int(y[j])-int(y[i]))>1) continue; +// if (first) {jmin[k] = j; first=false;} auto old = atomicMin(&clus[j],clus[i]); if (old!=clus[i]) go=true; atomicMin(&clus[i],old); @@ -167,4 +176,4 @@ namespace gpuClustering { } //namespace gpuClustering -#endif // RecoLocalTracker_SiPixelClusterizer_plugins_gpuClustering_h +#endif diff --git a/RecoLocalTracker/SiPixelClusterizer/test/gpuClustering.cu b/RecoLocalTracker/SiPixelClusterizer/test/gpuClustering.cu index ae0359cd683e0..beafe83ba60bd 100644 --- a/RecoLocalTracker/SiPixelClusterizer/test/gpuClustering.cu +++ b/RecoLocalTracker/SiPixelClusterizer/test/gpuClustering.cu @@ -198,13 +198,8 @@ int main(void) d_id.get(), d_moduleStart.get() ,d_clus.get(),n ); - cuda::memory::copy(&nModules,d_moduleStart.get(),sizeof(uint32_t)); - - std::cout << "found " << nModules << " Modules active" << std::endl; - - threadsPerBlock = 256; - blocksPerGrid = nModules; + blocksPerGrid = MaxNumModules; //nModules; @@ -226,6 +221,10 @@ int main(void) ); + cuda::memory::copy(&nModules,d_moduleStart.get(),sizeof(uint32_t)); + std::cout << "found " << nModules << " Modules active" << std::endl; + + uint32_t nclus[MaxNumModules], moduleId[nModules]; cuda::memory::copy(h_clus.get(), d_clus.get(), size32); cuda::memory::copy(&nclus,d_clusInModule.get(),MaxNumModules*sizeof(uint32_t)); From 8f0a694a96c92a07a3190679ccfefb59108d2588 Mon Sep 17 00:00:00 2001 From: Andrea Bocci Date: Mon, 4 Jun 2018 12:11:35 +0200 Subject: [PATCH 2/3] Clean up --- .../plugins/gpuClustering.h | 244 +++++++++--------- 1 file changed, 118 insertions(+), 126 deletions(-) diff --git a/RecoLocalTracker/SiPixelClusterizer/plugins/gpuClustering.h b/RecoLocalTracker/SiPixelClusterizer/plugins/gpuClustering.h index 77cb2cf608a76..11bcf5c41543c 100644 --- a/RecoLocalTracker/SiPixelClusterizer/plugins/gpuClustering.h +++ b/RecoLocalTracker/SiPixelClusterizer/plugins/gpuClustering.h @@ -1,179 +1,171 @@ #ifndef RecoLocalTracker_SiPixelClusterizer_plugins_gpuClustering_h #define RecoLocalTracker_SiPixelClusterizer_plugins_gpuClustering_h +#include #include #include -#include namespace gpuClustering { - constexpr uint32_t MaxNumModules = 2000; + constexpr uint32_t MaxNumModules = 2000; + constexpr uint32_t MaxNumPixels = 256 * 2000; // this does not mean maxPixelPerModule == 256! + constexpr uint16_t InvId = 9999; // must be > MaxNumModules - constexpr uint32_t MaxNumPixels = 256*2000; // this does not mean maxPixelPerModule==256! - - constexpr uint16_t InvId=9999; // must be > MaxNumModules - __global__ void countModules(uint16_t const * id, - uint32_t * moduleStart, - int32_t * clus, - int numElements){ - + uint32_t * moduleStart, + int32_t * clus, + int numElements){ + int i = blockDim.x * blockIdx.x + threadIdx.x; - if (i >= numElements) return; - clus[i]=i; - if (InvId==id[i]) return; - auto j=i-1; - while(j>=0 && id[j]==InvId) --j; - if(j<0 || id[j]!=id[i]) { + if (i >= numElements) + return; + clus[i] = i; + if (InvId == id[i]) + return; + auto j = i - 1; + while (j >= 0 and id[j] == InvId) + --j; + if (j < 0 or id[j] != id[i]) { // boundary... - auto loc = atomicInc(moduleStart,MaxNumModules); - moduleStart[loc+1]=i; + auto loc = atomicInc(moduleStart, MaxNumModules); + moduleStart[loc + 1] = i; } } - - + __global__ void findClus(uint16_t const * id, - uint16_t const * x, - uint16_t const * y, - uint16_t const * adc, - uint32_t const * moduleStart, - uint32_t * clusInModule, uint32_t * moduleId, - int32_t * clus, uint32_t * debug, - int numElements){ - + uint16_t const * x, + uint16_t const * y, + uint16_t const * adc, + uint32_t const * moduleStart, + uint32_t * clusInModule, uint32_t * moduleId, + int32_t * clus, uint32_t * debug, + int numElements) + { __shared__ bool go; __shared__ int nclus; + __shared__ int msize; - __shared__ int msize; - - if (blockIdx.x >=moduleStart[0]) return; + if (blockIdx.x >= moduleStart[0]) + return; - auto first = moduleStart[1 + blockIdx.x]; - + auto first = moduleStart[1 + blockIdx.x]; auto me = id[first]; - assert(me=numElements) return; - - go=true; - nclus=0; + first += threadIdx.x; + if (first>= numElements) + return; + + go = true; + nclus = 0; - msize=numElements; + msize = numElements; __syncthreads(); - for (int i=first; i=msize) return; + assert(msize<= numElements); + if (first>= msize) + return; int jmax[10]; - // int jmin[10]; - auto niter = (msize-first)/blockDim.x; - assert(niter<10); - // for (int k=0; k1 | - std::abs(int(y[j])-int(y[i]))>1) continue; -// if (first) {jmin[k] = j; first=false;} - auto old = atomicMin(&clus[j],clus[i]); - if (old!=clus[i]) go=true; - atomicMin(&clus[i],old); - jmax[k]=j+1; - } + jmax[k] = i + 1; + for (int j = js; j < jm; ++j) { + if (id[j] == InvId) // not valid + continue; + if (std::abs(int(x[j]) - int(x[i])) > 1 | + std::abs(int(y[j]) - int(y[i])) > 1) + continue; + auto old = atomicMin(&clus[j], clus[i]); + if (old != clus[i]) go = true; + atomicMin(&clus[i], old); + jmax[k] = j + 1; + } } - assert (k<=niter); - __syncthreads(); - } - - /* - // fast count (nice but not much useful) - auto laneId = threadIdx.x & 0x1f; - - for (int i=first; i=0) clus[i]=clus[clus[i]]; + for (int i = first; i < numElements; i += blockDim.x) { + if (id[i] == InvId) // not valid + continue; + if (id[i] != me) // end of module + break; + if (clus[i]>= 0) clus[i] = clus[clus[i]]; } - + __syncthreads(); - for (int i=first; i Date: Mon, 4 Jun 2018 14:34:52 +0200 Subject: [PATCH 3/3] Clean up --- .../SiPixelClusterizer/test/gpuClustering.cu | 169 +++++++----------- 1 file changed, 69 insertions(+), 100 deletions(-) diff --git a/RecoLocalTracker/SiPixelClusterizer/test/gpuClustering.cu b/RecoLocalTracker/SiPixelClusterizer/test/gpuClustering.cu index beafe83ba60bd..8c6b6cdf7a821 100644 --- a/RecoLocalTracker/SiPixelClusterizer/test/gpuClustering.cu +++ b/RecoLocalTracker/SiPixelClusterizer/test/gpuClustering.cu @@ -1,37 +1,27 @@ -#include "RecoLocalTracker/SiPixelClusterizer/plugins/gpuClustering.h" - - -#include "cuda/api_wrappers.h" - -#include -#include - +#include +#include +#include #include +#include +#include #include -#include -#include -#include -#include - -#include -#include +#include +#include +#include +#include +#include "RecoLocalTracker/SiPixelClusterizer/plugins/gpuClustering.h" - int main(void) { if (cuda::device::count() == 0) { std::cerr << "No CUDA devices on this system" << "\n"; exit(EXIT_FAILURE); } - - using namespace gpuClustering; - - int numElements = MaxNumPixels; - + int numElements = MaxNumPixels; // these in reality are already on GPU auto h_id = std::make_unique(numElements); auto h_x = std::make_unique(numElements); @@ -39,27 +29,23 @@ int main(void) auto h_adc = std::make_unique(numElements); auto h_clus = std::make_unique(numElements); - + auto h_debug = std::make_unique(numElements); - - auto current_device = cuda::device::current::get(); auto d_id = cuda::memory::device::make_unique(current_device, numElements); auto d_x = cuda::memory::device::make_unique(current_device, numElements); auto d_y = cuda::memory::device::make_unique(current_device, numElements); auto d_adc = cuda::memory::device::make_unique(current_device, numElements); - + auto d_clus = cuda::memory::device::make_unique(current_device, numElements); auto d_moduleStart = cuda::memory::device::make_unique(current_device, MaxNumModules+1); auto d_clusInModule = cuda::memory::device::make_unique(current_device, MaxNumModules); auto d_moduleId = cuda::memory::device::make_unique(current_device, MaxNumModules); - - auto d_debug = cuda::memory::device::make_unique(current_device, numElements); - - // later random number + auto d_debug = cuda::memory::device::make_unique(current_device, numElements); + // later random number int n=0; int ncl=0; int y[10]={5,7,9,1,3,0,4,8,2,6}; @@ -112,10 +98,10 @@ int main(void) h_adc[n]=100; ++n; if (xx[k]%2==0) { - h_id[n]=id; - h_x[n]=xx[k]; - h_y[n]=101; - h_adc[n]=100; + h_id[n]=id; + h_x[n]=xx[k]; + h_y[n]=101; + h_adc[n]=100; ++n; } } @@ -129,43 +115,41 @@ int main(void) h_x[n]=x; h_y[n]=x; h_adc[n]=100; - ++n; + ++n; } - - // all odd id for(int id=11; id<=1800; id+=2) { if ( (id/20)%2) h_id[n++]=InvId; // error - for (int x=0; x<40; x+=4) { + for (int x=0; x<40; x+=4) { ++ncl; if ((id/10)%2) { - for (int k=0; k<10; ++k) { - h_id[n]=id; - h_x[n]=x; - h_y[n]=x+y[k]; - h_adc[n]=100; - ++n; - h_id[n]=id; - h_x[n]=x+1; - h_y[n]=x+y[k]+2; - h_adc[n]=100; - ++n; - } + for (int k=0; k<10; ++k) { + h_id[n]=id; + h_x[n]=x; + h_y[n]=x+y[k]; + h_adc[n]=100; + ++n; + h_id[n]=id; + h_x[n]=x+1; + h_y[n]=x+y[k]+2; + h_adc[n]=100; + ++n; + } } else { - for (int k=0; k<10; ++k) { - h_id[n]=id; - h_x[n]=x; - h_y[n]=x+y[9-k]; - h_adc[n]=100; - ++n; - if (y[k]==3) continue; // hole - if (id==51) {h_id[n++]=InvId; h_id[n++]=InvId; }// error - h_id[n]=id; - h_x[n]=x+1; - h_y[n]=x+y[k]+2; - h_adc[n]=100; - ++n; - } + for (int k=0; k<10; ++k) { + h_id[n]=id; + h_x[n]=x; + h_y[n]=x+y[9-k]; + h_adc[n]=100; + ++n; + if (y[k]==3) continue; // hole + if (id==51) {h_id[n++]=InvId; h_id[n++]=InvId; }// error + h_id[n]=id; + h_x[n]=x+1; + h_y[n]=x+y[k]+2; + h_adc[n]=100; + ++n; + } } } } @@ -174,35 +158,31 @@ int main(void) size_t size32 = n * sizeof(unsigned int); size_t size16 = n * sizeof(unsigned short); size_t size8 = n * sizeof(uint8_t); - + uint32_t nModules=0; cuda::memory::copy(d_moduleStart.get(),&nModules,sizeof(uint32_t)); - + cuda::memory::copy(d_id.get(), h_id.get(), size16); cuda::memory::copy(d_x.get(), h_x.get(), size16); cuda::memory::copy(d_y.get(), h_y.get(), size16); cuda::memory::copy(d_adc.get(), h_adc.get(), size8); - cuda::memory::device::zero(d_debug.get(),size32); + cuda::memory::device::zero(d_debug.get(),size32); // Launch CUDA Kernels - - int threadsPerBlock = 256; int blocksPerGrid = (numElements + threadsPerBlock - 1) / threadsPerBlock; std::cout << "CUDA countModules kernel launch with " << blocksPerGrid << " blocks of " << threadsPerBlock << " threads\n"; - + cuda::launch( - countModules, - { blocksPerGrid, threadsPerBlock }, - d_id.get(), d_moduleStart.get() ,d_clus.get(),n - ); - + countModules, + { blocksPerGrid, threadsPerBlock }, + d_id.get(), d_moduleStart.get() ,d_clus.get(),n + ); + threadsPerBlock = 256; blocksPerGrid = MaxNumModules; //nModules; - - std::cout << "CUDA findModules kernel launch with " << blocksPerGrid << " blocks of " << threadsPerBlock << " threads\n"; @@ -210,35 +190,28 @@ int main(void) cuda::memory::device::zero(d_clusInModule.get(),MaxNumModules*sizeof(uint32_t)); cuda::launch( - findClus, - { blocksPerGrid, threadsPerBlock }, - d_id.get(), d_x.get(), d_y.get(), d_adc.get(), - d_moduleStart.get(), - d_clusInModule.get(), d_moduleId.get(), - d_clus.get(), - d_debug.get(), - n - ); - + findClus, + { blocksPerGrid, threadsPerBlock }, + d_id.get(), d_x.get(), d_y.get(), d_adc.get(), + d_moduleStart.get(), + d_clusInModule.get(), d_moduleId.get(), + d_clus.get(), + d_debug.get(), + n + ); cuda::memory::copy(&nModules,d_moduleStart.get(),sizeof(uint32_t)); std::cout << "found " << nModules << " Modules active" << std::endl; - - uint32_t nclus[MaxNumModules], moduleId[nModules]; + uint32_t nclus[MaxNumModules], moduleId[nModules]; cuda::memory::copy(h_clus.get(), d_clus.get(), size32); cuda::memory::copy(&nclus,d_clusInModule.get(),MaxNumModules*sizeof(uint32_t)); cuda::memory::copy(&moduleId,d_moduleId.get(),nModules*sizeof(uint32_t)); - - cuda::memory::copy(h_debug.get(), d_debug.get(), size32); auto p = std::minmax_element(h_debug.get(),h_debug.get()+n); - std::cout << "debug " << *p.first << ' ' << *p.second << std::endl; + std::cout << "debug " << *p.first << ' ' << *p.second << std::endl; - - - std::set clids; std::vector seeds; for (int i=0; i=0); assert(h_clus[i]