Skip to content

Commit b8f87ba

Browse files
VinInnfwyzard
authored and committed
Migrate gpuPixelRecHits::getHits() kernel to use a View instead of multiple pointers (#354)
Other changes and optimisations: - take into account the case where `nclus > blockDim.x` - use a smaller block size - document why we copy (or not) to local variables
1 parent 09ec13f commit b8f87ba

File tree

1 file changed

+76
-67
lines changed

1 file changed

+76
-67
lines changed

RecoLocalTracker/SiPixelRecHits/plugins/gpuPixelRecHits.h

+76-67
Original file line numberDiff line numberDiff line change
@@ -15,19 +15,21 @@ namespace gpuPixelRecHits {
1515

1616
__global__ void getHits(pixelCPEforGPU::ParamsOnGPU const* __restrict__ cpeParams,
1717
BeamSpotCUDA::Data const* __restrict__ bs,
18-
uint16_t const* __restrict__ id,
19-
uint16_t const* __restrict__ x,
20-
uint16_t const* __restrict__ y,
21-
uint16_t const* __restrict__ adc,
22-
uint32_t const* __restrict__ digiModuleStart,
23-
uint32_t const* __restrict__ clusInModule,
24-
uint32_t const* __restrict__ moduleId,
25-
int32_t const* __restrict__ clus,
18+
SiPixelDigisCUDA::DeviceConstView const * __restrict__ pdigis,
2619
int numElements,
27-
uint32_t const* __restrict__ hitsModuleStart,
20+
SiPixelClustersCUDA::DeviceConstView const * __restrict__ pclusters,
2821
TrackingRecHit2DSOAView* phits) {
22+
23+
// FIXME
24+
// the compiler seems NOT to optimize loads from views (even in a simple test case)
25+
// The whole gymnastics here of copying or not is a pure heuristic exercise that seems to produce the fastest code with the above signature
26+
// not using views (passing a gazillion array pointers) seems to produce the fastest code (but it is harder to maintain)
27+
2928
auto& hits = *phits;
3029

30+
auto const digis = *pdigis; // the copy is intentional!
31+
auto const & clusters = *pclusters;
32+
3133
// to be moved in common namespace...
3234
constexpr uint16_t InvId = 9999; // must be > MaxNumModules
3335
constexpr uint32_t MaxHitsInModule = pixelCPEforGPU::MaxHitsInModule;
@@ -37,19 +39,19 @@ namespace gpuPixelRecHits {
3739
// as usual one block per module
3840
__shared__ ClusParams clusParams;
3941

40-
auto first = digiModuleStart[1 + blockIdx.x];
41-
auto me = moduleId[blockIdx.x];
42-
auto nclus = clusInModule[me];
42+
auto first = clusters.moduleStart(1 + blockIdx.x);
43+
auto me = clusters.moduleId(blockIdx.x);
44+
auto nclus = clusters.clusInModule(me);
4345

4446
if (0 == nclus)
4547
return;
4648

4749
#ifdef GPU_DEBUG
4850
if (threadIdx.x == 0) {
4951
auto k = first;
50-
while (id[k] == InvId)
52+
while (digis.moduleInd(k) == InvId)
5153
++k;
52-
assert(id[k] == me);
54+
assert(digis.moduleInd(k) == me);
5355
}
5456
#endif
5557

@@ -71,9 +73,7 @@ namespace gpuPixelRecHits {
7173
}
7274
nclus = std::min(nclus, MaxHitsInModule);
7375

74-
auto ic = threadIdx.x;
75-
76-
if (ic < nclus) {
76+
for (int ic = threadIdx.x; ic < nclus; ic += blockDim.x) {
7777
clusParams.minRow[ic] = std::numeric_limits<uint32_t>::max();
7878
clusParams.maxRow[ic] = 0;
7979
clusParams.minCol[ic] = std::numeric_limits<uint32_t>::max();
@@ -92,85 +92,94 @@ namespace gpuPixelRecHits {
9292
// one thead per "digi"
9393

9494
for (int i = first; i < numElements; i += blockDim.x) {
95-
if (id[i] == InvId)
95+
auto id = digis.moduleInd(i);
96+
if (id == InvId)
9697
continue; // not valid
97-
if (id[i] != me)
98+
if (id != me)
9899
break; // end of module
99-
if (clus[i] >= nclus)
100+
auto cl = digis.clus(i);
101+
if (cl >= nclus)
100102
continue;
101-
atomicMin(&clusParams.minRow[clus[i]], x[i]);
102-
atomicMax(&clusParams.maxRow[clus[i]], x[i]);
103-
atomicMin(&clusParams.minCol[clus[i]], y[i]);
104-
atomicMax(&clusParams.maxCol[clus[i]], y[i]);
103+
auto x = digis.xx(i);
104+
auto y = digis.yy(i);
105+
atomicMin(&clusParams.minRow[cl], x);
106+
atomicMax(&clusParams.maxRow[cl], x);
107+
atomicMin(&clusParams.minCol[cl], y);
108+
atomicMax(&clusParams.maxCol[cl], y);
105109
}
106110

107111
__syncthreads();
108112

109113
for (int i = first; i < numElements; i += blockDim.x) {
110-
if (id[i] == InvId)
114+
auto id = digis.moduleInd(i);
115+
if (id == InvId)
111116
continue; // not valid
112-
if (id[i] != me)
117+
if (id != me)
113118
break; // end of module
114-
if (clus[i] >= nclus)
119+
auto cl = digis.clus(i);
120+
if (cl >= nclus)
115121
continue;
116-
atomicAdd(&clusParams.charge[clus[i]], adc[i]);
117-
if (clusParams.minRow[clus[i]] == x[i])
118-
atomicAdd(&clusParams.Q_f_X[clus[i]], adc[i]);
119-
if (clusParams.maxRow[clus[i]] == x[i])
120-
atomicAdd(&clusParams.Q_l_X[clus[i]], adc[i]);
121-
if (clusParams.minCol[clus[i]] == y[i])
122-
atomicAdd(&clusParams.Q_f_Y[clus[i]], adc[i]);
123-
if (clusParams.maxCol[clus[i]] == y[i])
124-
atomicAdd(&clusParams.Q_l_Y[clus[i]], adc[i]);
122+
auto x = digis.xx(i);
123+
auto y = digis.yy(i);
124+
auto ch = digis.adc(i);
125+
atomicAdd(&clusParams.charge[cl], ch);
126+
if (clusParams.minRow[cl] == x)
127+
atomicAdd(&clusParams.Q_f_X[cl], ch);
128+
if (clusParams.maxRow[cl] == x)
129+
atomicAdd(&clusParams.Q_l_X[cl], ch);
130+
if (clusParams.minCol[cl] == y)
131+
atomicAdd(&clusParams.Q_f_Y[cl], ch);
132+
if (clusParams.maxCol[cl] == y)
133+
atomicAdd(&clusParams.Q_l_Y[cl], ch);
125134
}
126135

127136
__syncthreads();
128137

129138
// next one cluster per thread...
130139

131-
if (ic >= nclus)
132-
return;
140+
first = clusters.clusModuleStart(me);
133141

134-
first = hitsModuleStart[me];
135-
auto h = first + ic; // output index in global memory
142+
for (int ic = threadIdx.x; ic < nclus; ic += blockDim.x) {
143+
auto h = first + ic; // output index in global memory
136144

137-
if (h >= TrackingRecHit2DSOAView::maxHits())
138-
return; // overflow...
145+
if (h >= TrackingRecHit2DSOAView::maxHits())
146+
break; // overflow...
139147

140-
pixelCPEforGPU::position(cpeParams->commonParams(), cpeParams->detParams(me), clusParams, ic);
141-
pixelCPEforGPU::errorFromDB(cpeParams->commonParams(), cpeParams->detParams(me), clusParams, ic);
148+
pixelCPEforGPU::position(cpeParams->commonParams(), cpeParams->detParams(me), clusParams, ic);
149+
pixelCPEforGPU::errorFromDB(cpeParams->commonParams(), cpeParams->detParams(me), clusParams, ic);
142150

143-
// store it
151+
// store it
144152

145-
hits.charge(h) = clusParams.charge[ic];
153+
hits.charge(h) = clusParams.charge[ic];
146154

147-
hits.detectorIndex(h) = me;
155+
hits.detectorIndex(h) = me;
148156

149-
float xl, yl;
150-
hits.xLocal(h) = xl = clusParams.xpos[ic];
151-
hits.yLocal(h) = yl = clusParams.ypos[ic];
157+
float xl, yl;
158+
hits.xLocal(h) = xl = clusParams.xpos[ic];
159+
hits.yLocal(h) = yl = clusParams.ypos[ic];
152160

153-
hits.clusterSizeX(h) = clusParams.xsize[ic];
154-
hits.clusterSizeY(h) = clusParams.ysize[ic];
161+
hits.clusterSizeX(h) = clusParams.xsize[ic];
162+
hits.clusterSizeY(h) = clusParams.ysize[ic];
155163

156-
hits.xerrLocal(h) = clusParams.xerr[ic] * clusParams.xerr[ic];
157-
hits.yerrLocal(h) = clusParams.yerr[ic] * clusParams.yerr[ic];
164+
hits.xerrLocal(h) = clusParams.xerr[ic] * clusParams.xerr[ic];
165+
hits.yerrLocal(h) = clusParams.yerr[ic] * clusParams.yerr[ic];
158166

159-
// keep it local for computations
160-
float xg, yg, zg;
161-
// to global and compute phi...
162-
cpeParams->detParams(me).frame.toGlobal(xl, yl, xg, yg, zg);
163-
// here correct for the beamspot...
164-
xg -= bs->x;
165-
yg -= bs->y;
166-
zg -= bs->z;
167+
// keep it local for computations
168+
float xg, yg, zg;
169+
// to global and compute phi...
170+
cpeParams->detParams(me).frame.toGlobal(xl, yl, xg, yg, zg);
171+
// here correct for the beamspot...
172+
xg -= bs->x;
173+
yg -= bs->y;
174+
zg -= bs->z;
167175

168-
hits.xGlobal(h) = xg;
169-
hits.yGlobal(h) = yg;
170-
hits.zGlobal(h) = zg;
176+
hits.xGlobal(h) = xg;
177+
hits.yGlobal(h) = yg;
178+
hits.zGlobal(h) = zg;
171179

172-
hits.rGlobal(h) = std::sqrt(xg * xg + yg * yg);
173-
hits.iphi(h) = unsafe_atan2s<7>(yg, xg);
180+
hits.rGlobal(h) = std::sqrt(xg * xg + yg * yg);
181+
hits.iphi(h) = unsafe_atan2s<7>(yg, xg);
182+
}
174183
}
175184

176185
} // namespace gpuPixelRecHits

0 commit comments

Comments
 (0)