-
Notifications
You must be signed in to change notification settings - Fork 5
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
GPU: better hits #81
GPU: better hits #81
Changes from all commits
5412113
777a84d
d713b2a
65a94a1
fd12ed6
b7ee911
72c5952
67bbc84
2fafecf
7a2edc2
dcedee7
8440462
4620fdf
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -17,31 +17,60 @@ | |
#include "gpuPixelRecHits.h" | ||
|
||
namespace pixelgpudetails { | ||
PixelRecHitGPUKernel::PixelRecHitGPUKernel() { | ||
PixelRecHitGPUKernel::PixelRecHitGPUKernel(cuda::stream_t<>& cudaStream) { | ||
|
||
cudaCheck(cudaMalloc((void**) & gpu_.bs_d,3*sizeof(float))); | ||
cudaCheck(cudaMalloc((void**) & gpu_.hitsModuleStart_d,(gpuClustering::MaxNumModules+1)*sizeof(uint32_t))); | ||
cudaCheck(cudaMalloc((void**) & gpu_.hitsLayerStart_d,(11)*sizeof(uint32_t))); | ||
cudaCheck(cudaMalloc((void**) & gpu_.charge_d,(gpuClustering::MaxNumModules*256)*sizeof(float))); | ||
cudaCheck(cudaMalloc((void**) & gpu_.detInd_d,(gpuClustering::MaxNumModules*256)*sizeof(uint16_t))); | ||
cudaCheck(cudaMalloc((void**) & gpu_.xg_d,(gpuClustering::MaxNumModules*256)*sizeof(float))); | ||
cudaCheck(cudaMalloc((void**) & gpu_.yg_d,(gpuClustering::MaxNumModules*256)*sizeof(float))); | ||
cudaCheck(cudaMalloc((void**) & gpu_.zg_d,(gpuClustering::MaxNumModules*256)*sizeof(float))); | ||
cudaCheck(cudaMalloc((void**) & gpu_.rg_d,(gpuClustering::MaxNumModules*256)*sizeof(float))); | ||
cudaCheck(cudaMalloc((void**) & gpu_.xl_d,(gpuClustering::MaxNumModules*256)*sizeof(float))); | ||
cudaCheck(cudaMalloc((void**) & gpu_.yl_d,(gpuClustering::MaxNumModules*256)*sizeof(float))); | ||
cudaCheck(cudaMalloc((void**) & gpu_.xerr_d,(gpuClustering::MaxNumModules*256)*sizeof(float))); | ||
cudaCheck(cudaMalloc((void**) & gpu_.yerr_d,(gpuClustering::MaxNumModules*256)*sizeof(float))); | ||
cudaCheck(cudaMalloc((void**) & gpu_.iphi_d,(gpuClustering::MaxNumModules*256)*sizeof(int16_t))); | ||
cudaCheck(cudaMalloc((void**) & gpu_.sortIndex_d,(gpuClustering::MaxNumModules*256)*sizeof(uint16_t))); | ||
cudaCheck(cudaMalloc((void**) & gpu_.mr_d,(gpuClustering::MaxNumModules*256)*sizeof(uint16_t))); | ||
cudaCheck(cudaMalloc((void**) & gpu_.mc_d,(gpuClustering::MaxNumModules*256)*sizeof(uint16_t))); | ||
// cudaCheck(cudaMalloc((void**) & gpu_.hist_d, 10*sizeof(HitsOnGPU::Hist))); | ||
|
||
cudaCheck(cudaMalloc((void**) & gpu_d, sizeof(HitsOnGPU))); | ||
cudaCheck(cudaMemcpyAsync(gpu_d, &gpu_, sizeof(HitsOnGPU), cudaMemcpyDefault,cudaStream.id())); | ||
|
||
} | ||
|
||
PixelRecHitGPUKernel::~PixelRecHitGPUKernel() { | ||
cudaCheck(cudaFree(gpu_.hitsModuleStart_d)); | ||
cudaCheck(cudaFree(gpu_.charge_d)); | ||
cudaCheck(cudaFree(gpu_.detInd_d)); | ||
cudaCheck(cudaFree(gpu_.xg_d)); | ||
cudaCheck(cudaFree(gpu_.yg_d)); | ||
cudaCheck(cudaFree(gpu_.zg_d)); | ||
cudaCheck(cudaFree(gpu_.rg_d)); | ||
cudaCheck(cudaFree(gpu_.xl_d)); | ||
cudaCheck(cudaFree(gpu_.yl_d)); | ||
cudaCheck(cudaFree(gpu_.xerr_d)); | ||
cudaCheck(cudaFree(gpu_.yerr_d)); | ||
cudaCheck(cudaFree(gpu_.iphi_d)); | ||
cudaCheck(cudaFree(gpu_.sortIndex_d)); | ||
cudaCheck(cudaFree(gpu_.mr_d)); | ||
cudaCheck(cudaFree(gpu_.mc_d)); | ||
// cudaCheck(cudaFree(gpu_.hist_d)); | ||
|
||
cudaCheck(cudaFree(gpu_d)); | ||
} | ||
|
||
void PixelRecHitGPUKernel::makeHitsAsync(const siPixelRawToClusterHeterogeneousProduct::GPUProduct& input, | ||
float const * bs, | ||
pixelCPEforGPU::ParamsOnGPU const * cpeParams, | ||
cuda::stream_t<>& stream) { | ||
|
||
cudaCheck(cudaMemcpyAsync(gpu_.bs_d, bs, 3*sizeof(float), cudaMemcpyDefault, stream.id())); | ||
|
||
thrust::exclusive_scan(thrust::cuda::par.on(stream.id()), | ||
input.clusInModule_d, | ||
input.clusInModule_d + gpuClustering::MaxNumModules + 1, | ||
|
@@ -51,6 +80,7 @@ namespace pixelgpudetails { | |
int blocks = input.nModules; // active modules (with digis) | ||
gpuPixelRecHits::getHits<<<blocks, threadsPerBlock, 0, stream.id()>>>( | ||
cpeParams, | ||
gpu_.bs_d, | ||
input.moduleInd_d, | ||
input.xx_d, input.yy_d, input.adc_d, | ||
input.moduleStart_d, | ||
|
@@ -59,27 +89,50 @@ namespace pixelgpudetails { | |
input.nDigis, | ||
gpu_.hitsModuleStart_d, | ||
gpu_.charge_d, | ||
gpu_.xg_d, gpu_.yg_d, gpu_.zg_d, | ||
gpu_.xerr_d, gpu_.yerr_d, gpu_.mr_d, | ||
true // for the time being stay local... | ||
gpu_.detInd_d, | ||
gpu_.xg_d, gpu_.yg_d, gpu_.zg_d, gpu_.rg_d, | ||
gpu_.iphi_d, | ||
gpu_.xl_d, gpu_.yl_d, | ||
gpu_.xerr_d, gpu_.yerr_d, | ||
gpu_.mr_d, gpu_.mc_d | ||
); | ||
|
||
// needed only if hits on CPU are required... | ||
cudaCheck(cudaMemcpyAsync(hitsModuleStart_, gpu_.hitsModuleStart_d, (gpuClustering::MaxNumModules+1) * sizeof(uint32_t), cudaMemcpyDefault, stream.id())); | ||
|
||
// to be moved to gpu? | ||
auto nhits = hitsModuleStart_[gpuClustering::MaxNumModules]; | ||
for (int i=0;i<10;++i) hitsLayerStart_[i]=hitsModuleStart_[phase1PixelTopology::layerStart[i]]; | ||
hitsLayerStart_[10]=nhits; | ||
|
||
std::cout << "hit layerStart "; | ||
for (int i=0;i<10;++i) std::cout << phase1PixelTopology::layerName[i] << ':' << hitsLayerStart_[i] << ' '; | ||
std::cout << "end:" << hitsLayerStart_[10] << std::endl; | ||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Reviewer: By the way, we now have a per-event (non-thread-safe) printout from here. There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Author: It will be removed at the next iteration, when the sorting is implemented as well. |
||
|
||
cudaCheck(cudaMemcpyAsync(gpu_.hitsLayerStart_d, hitsLayerStart_, (11) * sizeof(uint32_t), cudaMemcpyDefault, stream.id())); | ||
|
||
// for timing test | ||
// radixSortMultiWrapper<int16_t><<<10, 256, 0, c.stream>>>(gpu_.iphi_d,gpu_.sortIndex_d,gpu_.hitsLayerStart_d); | ||
|
||
// fillManyFromVector(gpu_.hist_d,10,gpu_.iphi_d, gpu_.hitsLayerStart_d, nhits,256,c.stream); | ||
|
||
|
||
} | ||
|
||
HitsOnCPU PixelRecHitGPUKernel::getOutput(cuda::stream_t<>& stream) const { | ||
// needed only if hits on CPU are required... | ||
auto nhits = hitsModuleStart_[gpuClustering::MaxNumModules]; | ||
|
||
HitsOnCPU hoc(nhits); | ||
hoc.gpu_d = gpu_d; | ||
memcpy(hoc.hitsModuleStart, hitsModuleStart_, (gpuClustering::MaxNumModules+1) * sizeof(uint32_t)); | ||
cudaCheck(cudaMemcpyAsync(hoc.charge.data(), gpu_.charge_d, nhits*sizeof(uint32_t), cudaMemcpyDefault, stream.id())); | ||
cudaCheck(cudaMemcpyAsync(hoc.xl.data(), gpu_.xg_d, nhits*sizeof(uint32_t), cudaMemcpyDefault, stream.id())); | ||
cudaCheck(cudaMemcpyAsync(hoc.yl.data(), gpu_.yg_d, nhits*sizeof(uint32_t), cudaMemcpyDefault, stream.id())); | ||
cudaCheck(cudaMemcpyAsync(hoc.xl.data(), gpu_.xl_d, nhits*sizeof(uint32_t), cudaMemcpyDefault, stream.id())); | ||
cudaCheck(cudaMemcpyAsync(hoc.yl.data(), gpu_.yl_d, nhits*sizeof(uint32_t), cudaMemcpyDefault, stream.id())); | ||
cudaCheck(cudaMemcpyAsync(hoc.xe.data(), gpu_.xerr_d, nhits*sizeof(uint32_t), cudaMemcpyDefault, stream.id())); | ||
cudaCheck(cudaMemcpyAsync(hoc.ye.data(), gpu_.yerr_d, nhits*sizeof(uint32_t), cudaMemcpyDefault, stream.id())); | ||
cudaCheck(cudaMemcpyAsync(hoc.mr.data(), gpu_.mr_d, nhits*sizeof(uint16_t), cudaMemcpyDefault, stream.id())); | ||
cudaCheck(cudaMemcpyAsync(hoc.mc.data(), gpu_.mc_d, nhits*sizeof(uint16_t), cudaMemcpyDefault, stream.id())); | ||
cudaCheck(cudaStreamSynchronize(stream.id())); | ||
return hoc; | ||
} | ||
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Just to note that this introduces yet another convention for naming pixel layers. It seems to be only for debug prints, so I don't object.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
@makortel what are the "usual" pixel layer names, and where are they defined ?
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
We (=CMS) have many conventions in different places
BPixN
and FPixN_pos
/ FPixN_neg
cmssw/RecoTracker/TkSeedingLayers/python/PixelLayerTriplets_cfi.py
Lines 6 to 11 in 96559f3
PixelSubdetector
uses enums
PixelBarrel
and PixelForward
cmssw/DataFormats/SiPixelDetId/interface/PixelSubdetector.h
Line 11 in 96559f3
GeomDetEnumerators
uses enums
PixelBarrel
, PixelEndcap
, P1PXB
, P1PXEC
, P2PXB
, P2PXEC
cmssw/Geometry/CommonDetUnit/interface/GeomDetEnumerators.h
Line 11 in 96559f3
TrackerTopology
uses pxb
and pxf
in function prefixes: cmssw/DataFormats/TrackerCommon/interface/TrackerTopology.h
Lines 160 to 163 in 96559f3
TrackingNtuple
python library uses BPixN
and FPixN+
/ FPixN-
in printouts: cmssw/Validation/RecoTrack/python/plotting/ntupleDataFormat.py
Lines 100 to 112 in 96559f3
I don't think any of these is authoritative enough to suggest a change in here (since they're for printouts only; for configuration inputs I'd probably suggest the seeding layers' convention). Anyway these have the nice property (on purpose?) that the lengths of the BPix and FPix strings are the same (which none of the other conventions have).