Skip to content

Commit 8e11466

Browse files
VinInnfwyzard
authored and committed
Migrate the pixel rechits producer and CA to the new heterogeneous framework (#338)
Use cleaned hits. Use pixel layer and ladders geometry, and use pixel triplets in the gaps. Optimise GPU memory usage: - reduce the number of memory allocations - fix the size of the cub workspace - allocate memory per event via the caching allocator - use constant memory for geometry and parameters - use shared memory where the content is the same for every thread Optimise kernel launches, and add a protection for empty events and overflows.
1 parent 9f61afb commit 8e11466

13 files changed

+455
-364
lines changed

RecoPixelVertexing/Configuration/python/customizePixelTracksForProfiling.py

-2
Original file line numberDiff line numberDiff line change
@@ -21,7 +21,6 @@ def customizePixelTracksForProfilingDisableConversion(process):
2121
process = customizePixelTracksForProfiling(process)
2222

2323
# Disable conversions to legacy
24-
process.siPixelRecHitsPreSplitting.gpuEnableConversion = False
2524
process.pixelTracksHitQuadruplets.gpuEnableConversion = False
2625
process.pixelTracks.gpuEnableConversion = False
2726
process.pixelVertices.gpuEnableConversion = False
@@ -32,7 +31,6 @@ def customizePixelTracksForProfilingDisableTransfer(process):
3231
process = customizePixelTracksForProfilingDisableConversion(process)
3332

3433
# Disable "unnecessary" transfers to CPU
35-
process.siPixelRecHitsPreSplitting.gpuEnableTransfer = False
3634
process.pixelTracksHitQuadruplets.gpuEnableTransfer = False
3735
process.pixelVertices.gpuEnableTransfer = False
3836

RecoPixelVertexing/PixelTrackFitting/interface/FitResult.h

+2-2
Original file line numberDiff line numberDiff line change
@@ -38,7 +38,7 @@ namespace Rfit
3838
|cov(X0, R)|cov(Y0, R)|cov( R, R)|
3939
*/
4040
int32_t q; //!< particle charge
41-
float chi2 = 0.0;
41+
float chi2;
4242
};
4343

4444
struct line_fit
@@ -49,7 +49,7 @@ namespace Rfit
4949
|cov(c_t,c_t)|cov(Zip,c_t)| \n
5050
|cov(c_t,Zip)|cov(Zip,Zip)|
5151
*/
52-
double chi2 = 0.0;
52+
double chi2;
5353
};
5454

5555
struct helix_fit

RecoPixelVertexing/PixelTriplets/plugins/BrokenLineFitOnGPU.cu

+37-31
Original file line numberDiff line numberDiff line change
@@ -11,12 +11,12 @@
1111
#include "HeterogeneousCore/CUDAUtilities/interface/cudaCheck.h"
1212
#include "HeterogeneousCore/CUDAUtilities/interface/cuda_assert.h"
1313
#include "RecoLocalTracker/SiPixelRecHits/interface/pixelCPEforGPU.h"
14-
#include "RecoLocalTracker/SiPixelRecHits/plugins/siPixelRecHitsHeterogeneousProduct.h"
1514

15+
#include "FWCore/ServiceRegistry/interface/Service.h"
16+
#include "HeterogeneousCore/CUDAServices/interface/CUDAService.h"
17+
#include "CUDADataFormats/TrackingRecHit/interface/TrackingRecHit2DCUDA.h"
1618

17-
using HitsOnCPU = siPixelRecHitsHeterogeneousProduct::HitsOnCPU;
18-
19-
using HitsOnGPU = siPixelRecHitsHeterogeneousProduct::HitsOnGPU;
19+
using HitsOnGPU = TrackingRecHit2DSOAView;
2020
using TuplesOnGPU = pixelTuplesHeterogeneousProduct::TuplesOnGPU;
2121

2222
using namespace Eigen;
@@ -75,14 +75,14 @@ void kernelBLFastFit(TuplesOnGPU::Container const * __restrict__ foundNtuplets,
7575
for (unsigned int i = 0; i < hitsInFit; ++i) {
7676
auto hit = hitId[i];
7777
float ge[6];
78-
hhp->cpeParams->detParams(hhp->detInd_d[hit]).frame.toGlobal(hhp->xerr_d[hit], 0, hhp->yerr_d[hit], ge);
78+
hhp->cpeParams().detParams(hhp->detectorIndex(hit)).frame.toGlobal(hhp->xerrLocal(hit), 0, hhp->yerrLocal(hit), ge);
7979
#ifdef BL_DUMP_HITS
8080
if (dump){
81-
printf("Hit global: %d: %d hits.col(%d) << %f,%f,%f\n", helix_start, hhp->detInd_d[hit],i,hhp->xg_d[hit],hhp->yg_d[hit],hhp->zg_d[hit]);
82-
printf("Error: %d: %d hits_ge.col(%d) << %e,%e,%e,%e,%e,%e\n",helix_start,hhp->detInd_d[hit],i,ge[0],ge[1],ge[2],ge[3],ge[4],ge[5]);
81+
printf("Hit global: %d: %d hits.col(%d) << %f,%f,%f\n", helix_start, hhp->detectorIndex(hit),i,hhp->xGlobal(hit),hhp->yGlobal(hit),hhp->zGlobal(hit));
82+
printf("Error: %d: %d hits_ge.col(%d) << %e,%e,%e,%e,%e,%e\n",helix_start,hhp->detetectorIndex(hit),i,ge[0],ge[1],ge[2],ge[3],ge[4],ge[5]);
8383
}
8484
#endif
85-
hits.col(i) << hhp->xg_d[hit], hhp->yg_d[hit], hhp->zg_d[hit];
85+
hits.col(i) << hhp->xGlobal(hit), hhp->yGlobal(hit), hhp->zGlobal(hit);
8686
hits_ge.col(i) << ge[0],ge[1],ge[2],ge[3],ge[4],ge[5];
8787
}
8888
BrokenLine::BL_Fast_fit(hits,fast_fit);
@@ -167,65 +167,71 @@ void kernelBLFit(
167167
}
168168

169169

170-
void HelixFitOnGPU::launchBrokenLineKernels(HitsOnCPU const & hh, uint32_t hitsInFit, uint32_t maxNumberOfTuples, cudaStream_t cudaStream)
170+
void HelixFitOnGPU::launchBrokenLineKernels(HitsOnCPU const & hh, uint32_t hitsInFit, uint32_t maxNumberOfTuples, cuda::stream_t<> & stream)
171171
{
172-
assert(tuples_d); assert(fast_fit_resultsGPU_);
172+
assert(tuples_d);
173173

174174
auto blockSize = 64;
175175
auto numberOfBlocks = (maxNumberOfConcurrentFits_ + blockSize - 1) / blockSize;
176176

177-
for (uint32_t offset=0; offset<maxNumberOfTuples; offset+=maxNumberOfConcurrentFits_) {
177+
// Fit internals
178+
edm::Service<CUDAService> cs;
179+
auto hitsGPU_ = cs->make_device_unique<double[]>(maxNumberOfConcurrentFits_ * sizeof(Rfit::Matrix3xNd<4>)/sizeof(double),stream);
180+
auto hits_geGPU_ = cs->make_device_unique<float[]>(maxNumberOfConcurrentFits_ * sizeof(Rfit::Matrix6x4f)/sizeof(float),stream);
181+
auto fast_fit_resultsGPU_ = cs->make_device_unique<double[]>(maxNumberOfConcurrentFits_ * sizeof(Rfit::Vector4d)/sizeof(double),stream);
182+
183+
for (uint32_t offset=0; offset<maxNumberOfTuples; offset+=maxNumberOfConcurrentFits_) {
178184

179185
// fit triplets
180-
kernelBLFastFit<3><<<numberOfBlocks, blockSize, 0, cudaStream>>>(
181-
tuples_d, tupleMultiplicity_d, hh.gpu_d,
182-
hitsGPU_, hits_geGPU_, fast_fit_resultsGPU_,
186+
kernelBLFastFit<3><<<numberOfBlocks, blockSize, 0, stream.id()>>>(
187+
tuples_d, tupleMultiplicity_d, hh.view(),
188+
hitsGPU_.get(), hits_geGPU_.get(), fast_fit_resultsGPU_.get(),
183189
3, offset);
184190
cudaCheck(cudaGetLastError());
185191

186-
kernelBLFit<3><<<numberOfBlocks, blockSize, 0, cudaStream>>>(
192+
kernelBLFit<3><<<numberOfBlocks, blockSize, 0, stream.id()>>>(
187193
tupleMultiplicity_d, bField_, helix_fit_results_d,
188-
hitsGPU_, hits_geGPU_, fast_fit_resultsGPU_,
194+
hitsGPU_.get(), hits_geGPU_.get(), fast_fit_resultsGPU_.get(),
189195
3, offset);
190196
cudaCheck(cudaGetLastError());
191197

192198
// fit quads
193-
kernelBLFastFit<4><<<numberOfBlocks, blockSize, 0, cudaStream>>>(
194-
tuples_d, tupleMultiplicity_d, hh.gpu_d,
195-
hitsGPU_, hits_geGPU_, fast_fit_resultsGPU_,
199+
kernelBLFastFit<4><<<numberOfBlocks, blockSize, 0, stream.id()>>>(
200+
tuples_d, tupleMultiplicity_d, hh.view(),
201+
hitsGPU_.get(), hits_geGPU_.get(), fast_fit_resultsGPU_.get(),
196202
4, offset);
197203
cudaCheck(cudaGetLastError());
198204

199-
kernelBLFit<4><<<numberOfBlocks, blockSize, 0, cudaStream>>>(
205+
kernelBLFit<4><<<numberOfBlocks, blockSize, 0, stream.id()>>>(
200206
tupleMultiplicity_d, bField_, helix_fit_results_d,
201-
hitsGPU_, hits_geGPU_, fast_fit_resultsGPU_,
207+
hitsGPU_.get(), hits_geGPU_.get(), fast_fit_resultsGPU_.get(),
202208
4, offset);
203209
cudaCheck(cudaGetLastError());
204210

205211
if (fit5as4_) {
206212
// fit penta (only first 4)
207-
kernelBLFastFit<4><<<numberOfBlocks, blockSize, 0, cudaStream>>>(
208-
tuples_d, tupleMultiplicity_d, hh.gpu_d,
209-
hitsGPU_, hits_geGPU_, fast_fit_resultsGPU_,
213+
kernelBLFastFit<4><<<numberOfBlocks, blockSize, 0, stream.id()>>>(
214+
tuples_d, tupleMultiplicity_d, hh.view(),
215+
hitsGPU_.get(), hits_geGPU_.get(), fast_fit_resultsGPU_.get(),
210216
5, offset);
211217
cudaCheck(cudaGetLastError());
212218

213-
kernelBLFit<4><<<numberOfBlocks, blockSize, 0, cudaStream>>>(
219+
kernelBLFit<4><<<numberOfBlocks, blockSize, 0, stream.id()>>>(
214220
tupleMultiplicity_d, bField_, helix_fit_results_d,
215-
hitsGPU_, hits_geGPU_, fast_fit_resultsGPU_,
221+
hitsGPU_.get(), hits_geGPU_.get(), fast_fit_resultsGPU_.get(),
216222
5, offset);
217223
cudaCheck(cudaGetLastError());
218224
} else {
219225
// fit penta (all 5)
220-
kernelBLFastFit<5><<<numberOfBlocks, blockSize, 0, cudaStream>>>(
221-
tuples_d, tupleMultiplicity_d, hh.gpu_d,
222-
hitsGPU_, hits_geGPU_, fast_fit_resultsGPU_,
226+
kernelBLFastFit<5><<<numberOfBlocks, blockSize, 0, stream.id()>>>(
227+
tuples_d, tupleMultiplicity_d, hh.view(),
228+
hitsGPU_.get(), hits_geGPU_.get(), fast_fit_resultsGPU_.get(),
223229
5, offset);
224230
cudaCheck(cudaGetLastError());
225231

226-
kernelBLFit<5><<<numberOfBlocks, blockSize, 0, cudaStream>>>(
232+
kernelBLFit<5><<<numberOfBlocks, blockSize, 0, stream.id()>>>(
227233
tupleMultiplicity_d, bField_, helix_fit_results_d,
228-
hitsGPU_, hits_geGPU_, fast_fit_resultsGPU_,
234+
hitsGPU_.get(), hits_geGPU_.get(), fast_fit_resultsGPU_.get(),
229235
5, offset);
230236
cudaCheck(cudaGetLastError());
231237
}

RecoPixelVertexing/PixelTriplets/plugins/BuildFile.xml

-2
Original file line numberDiff line numberDiff line change
@@ -11,8 +11,6 @@
1111
<use name="RecoPixelVertexing/PixelTriplets"/>
1212
<use name="RecoTracker/TkSeedingLayers"/>
1313
<use name="RecoTracker/TkTrackingRegions"/>
14-
<flags CXXFLAGS="-g -fno-math-errno"/>
15-
<flags CUDA_FLAGS="-g"/>
1614
<library file="*.cu *.cc" name="RecoPixelVertexingPixelTripletsPlugins">
1715
<flags EDM_PLUGIN="1"/>
1816
</library>

RecoPixelVertexing/PixelTriplets/plugins/CAConstants.h

+25-5
Original file line numberDiff line numberDiff line change
@@ -6,32 +6,52 @@
66

77
#include "HeterogeneousCore/CUDAUtilities/interface/HistoContainer.h"
88
#include "HeterogeneousCore/CUDAUtilities/interface/GPUVecArray.h"
9-
#include "RecoLocalTracker/SiPixelClusterizer/interface/PixelTrackingGPUConstants.h"
9+
#include "HeterogeneousCore/CUDAUtilities/interface/GPUSimpleVector.h"
10+
#include "CUDADataFormats/SiPixelCluster/interface/gpuClusteringConstants.h"
1011

1112
// #define ONLY_PHICUT
1213

1314
namespace CAConstants {
1415

1516
// constants
16-
17-
constexpr uint32_t maxNumberOfQuadruplets() { return 6*1024; }
17+
#ifdef GPU_SMALL_EVENTS
18+
constexpr uint32_t maxNumberOfTuples() { return 3*1024;}
19+
#else
20+
constexpr uint32_t maxNumberOfTuples() { return 6*1024;}
21+
#endif
22+
constexpr uint32_t maxNumberOfQuadruplets() { return maxNumberOfTuples(); }
1823
#ifndef ONLY_PHICUT
24+
#ifndef GPU_SMALL_EVENTS
1925
constexpr uint32_t maxNumberOfDoublets() { return 262144; }
2026
constexpr uint32_t maxCellsPerHit() { return 128; }
27+
#else
28+
constexpr uint32_t maxNumberOfDoublets() { return 262144/2; }
29+
constexpr uint32_t maxCellsPerHit() { return 128/2; }
30+
#endif
2131
#else
2232
constexpr uint32_t maxNumberOfDoublets() { return 6*262144; }
2333
constexpr uint32_t maxCellsPerHit() { return 4*128; }
2434
#endif
35+
constexpr uint32_t maxNumOfActiveDoublets() { return maxNumberOfDoublets()/4;}
36+
37+
2538
constexpr uint32_t maxNumberOfLayerPairs() { return 13; }
2639
constexpr uint32_t maxNumberOfLayers() { return 10; }
27-
constexpr uint32_t maxTuples() { return 6*1024;}
40+
constexpr uint32_t maxTuples() { return maxNumberOfTuples();}
2841

2942
// types
3043
using hindex_type = uint16_t; // FIXME from siPixelRecHitsHeterogeneousProduct
3144
using tindex_type = uint16_t; // for tuples
45+
46+
using CellNeighbors = GPU::VecArray< uint32_t, 36>;
47+
using CellTracks = GPU::VecArray< tindex_type, 42>;
48+
49+
using CellNeighborsVector = GPU::SimpleVector<CellNeighbors>;
50+
using CellTracksVector = GPU::SimpleVector<CellTracks>;
51+
3252
using OuterHitOfCell = GPU::VecArray< uint32_t, maxCellsPerHit()>;
3353
using TuplesContainer = OneToManyAssoc<hindex_type, maxTuples(), 5*maxTuples()>;
34-
using HitToTuple = OneToManyAssoc<tindex_type, PixelGPUConstants::maxNumberOfHits, 4*maxTuples()>; // 3.5 should be enough
54+
using HitToTuple = OneToManyAssoc<tindex_type, pixelGPUConstants::maxNumberOfHits, 4*maxTuples()>; // 3.5 should be enough
3555
using TupleMultiplicity = OneToManyAssoc<tindex_type,8,maxTuples()>;
3656

3757
}

0 commit comments

Comments
 (0)