@@ -15,19 +15,21 @@ namespace gpuPixelRecHits {
 
   __global__ void getHits(pixelCPEforGPU::ParamsOnGPU const* __restrict__ cpeParams,
                           BeamSpotCUDA::Data const* __restrict__ bs,
-                          uint16_t const* __restrict__ id,
-                          uint16_t const* __restrict__ x,
-                          uint16_t const* __restrict__ y,
-                          uint16_t const* __restrict__ adc,
-                          uint32_t const* __restrict__ digiModuleStart,
-                          uint32_t const* __restrict__ clusInModule,
-                          uint32_t const* __restrict__ moduleId,
-                          int32_t const* __restrict__ clus,
+                          SiPixelDigisCUDA::DeviceConstView const* __restrict__ pdigis,
                           int numElements,
-                          uint32_t const* __restrict__ hitsModuleStart,
+                          SiPixelClustersCUDA::DeviceConstView const* __restrict__ pclusters,
                           TrackingRecHit2DSOAView* phits) {
+
+    // FIXME
+    // the compiler seems NOT to optimize loads from views (even in a simple test case)
+    // the whole gymnastics of copying or not copying here is a purely heuristic exercise that seems to produce the fastest code with the above signature
+    // (not using views at all, i.e. passing a gazillion array pointers, produces even faster code, but is harder to maintain)
+
     auto& hits = *phits;
 
+    auto const digis = *pdigis;  // the copy is intentional!
+    auto const& clusters = *pclusters;
+
     // to be moved in common namespace...
     constexpr uint16_t InvId = 9999;  // must be > MaxNumModules
     constexpr uint32_t MaxHitsInModule = pixelCPEforGPU::MaxHitsInModule;
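The FIXME block above records the trade-off behind this change: the SoA views keep the kernel signature maintainable, while copying the digi view by value gives the compiler a local object whose member pointers can stay in registers instead of being re-loaded through the global pointer on every access. A minimal sketch of that heuristic, using a hypothetical ToyView rather than the real SiPixelDigisCUDA::DeviceConstView:

#include <cstdint>

struct ToyView {
  uint16_t const* xx_;
  uint16_t const* yy_;
  __device__ uint16_t xx(int i) const { return xx_[i]; }
  __device__ uint16_t yy(int i) const { return yy_[i]; }
};

__global__ void sumAll(ToyView const* __restrict__ pview, unsigned int* sum, int n) {
  auto const view = *pview;  // the copy is intentional, as in getHits
  unsigned int s = 0;
  for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < n; i += gridDim.x * blockDim.x)
    s += view.xx(i) + view.yy(i);
  atomicAdd(sum, s);
}

Whether the copy beats `auto const& view = *pview;` is, as the comment says, a heuristic matter: it depends on how well the compiler hoists the member-pointer loads out of the loop.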
@@ -37,19 +39,19 @@ namespace gpuPixelRecHits {
     // as usual one block per module
     __shared__ ClusParams clusParams;
 
-    auto first = digiModuleStart[1 + blockIdx.x];
-    auto me = moduleId[blockIdx.x];
-    auto nclus = clusInModule[me];
+    auto first = clusters.moduleStart(1 + blockIdx.x);
+    auto me = clusters.moduleId(blockIdx.x);
+    auto nclus = clusters.clusInModule(me);
 
     if (0 == nclus)
       return;
 
 #ifdef GPU_DEBUG
     if (threadIdx.x == 0) {
       auto k = first;
-      while (id[k] == InvId)
+      while (digis.moduleInd(k) == InvId)
         ++k;
-      assert(id[k] == me);
+      assert(digis.moduleInd(k) == me);
     }
 #endif
 
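The accessors above only change how the indices are read; the launch layout stays one block per module. The `1 + blockIdx.x` offset suggests that slot 0 of the underlying moduleStart array is reserved (plausibly carrying the module count, as in the clusterizer), so here is a sketch of the indexing under that assumption, with illustrative names throughout:

#include <cstdint>

__global__ void perModule(uint32_t const* __restrict__ moduleStart,
                          uint32_t const* __restrict__ moduleId,
                          uint32_t* __restrict__ firstDigiOut) {
  auto nModules = moduleStart[0];  // assumption: slot 0 holds the module count
  if (blockIdx.x >= nModules)
    return;  // no module for this block
  auto first = moduleStart[1 + blockIdx.x];  // first digi of this module
  auto me = moduleId[blockIdx.x];            // its detector index
  if (threadIdx.x == 0)
    firstDigiOut[me] = first;  // record where each module's digis start
}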
@@ -71,9 +73,7 @@ namespace gpuPixelRecHits {
     }
     nclus = std::min(nclus, MaxHitsInModule);
 
-    auto ic = threadIdx.x;
-
-    if (ic < nclus) {
+    for (int ic = threadIdx.x; ic < nclus; ic += blockDim.x) {
       clusParams.minRow[ic] = std::numeric_limits<uint32_t>::max();
       clusParams.maxRow[ic] = 0;
       clusParams.minCol[ic] = std::numeric_limits<uint32_t>::max();
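The hunk above replaces the single guarded assignment, which initializes only cluster ic == threadIdx.x and therefore works only when the block has at least nclus threads, with a block-stride loop that covers any nclus. The pattern in isolation, as a hypothetical initTo kernel:

#include <cstdint>
#include <limits>

__global__ void initTo(uint32_t* v, int n) {
  // each thread handles indices threadIdx.x, threadIdx.x + blockDim.x, ...
  // so n may exceed blockDim.x without leaving entries uninitialized
  for (int i = threadIdx.x; i < n; i += blockDim.x)
    v[i] = std::numeric_limits<uint32_t>::max();
}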
@@ -92,85 +92,94 @@ namespace gpuPixelRecHits {
     // one thread per "digi"
 
     for (int i = first; i < numElements; i += blockDim.x) {
-      if (id[i] == InvId)
+      auto id = digis.moduleInd(i);
+      if (id == InvId)
         continue;  // not valid
-      if (id[i] != me)
+      if (id != me)
         break;  // end of module
-      if (clus[i] >= nclus)
+      auto cl = digis.clus(i);
+      if (cl >= nclus)
         continue;
-      atomicMin(&clusParams.minRow[clus[i]], x[i]);
-      atomicMax(&clusParams.maxRow[clus[i]], x[i]);
-      atomicMin(&clusParams.minCol[clus[i]], y[i]);
-      atomicMax(&clusParams.maxCol[clus[i]], y[i]);
+      auto x = digis.xx(i);
+      auto y = digis.yy(i);
+      atomicMin(&clusParams.minRow[cl], x);
+      atomicMax(&clusParams.maxRow[cl], x);
+      atomicMin(&clusParams.minCol[cl], y);
+      atomicMax(&clusParams.maxCol[cl], y);
     }
 
     __syncthreads();
 
     for (int i = first; i < numElements; i += blockDim.x) {
-      if (id[i] == InvId)
+      auto id = digis.moduleInd(i);
+      if (id == InvId)
         continue;  // not valid
-      if (id[i] != me)
+      if (id != me)
         break;  // end of module
-      if (clus[i] >= nclus)
+      auto cl = digis.clus(i);
+      if (cl >= nclus)
         continue;
-      atomicAdd(&clusParams.charge[clus[i]], adc[i]);
-      if (clusParams.minRow[clus[i]] == x[i])
-        atomicAdd(&clusParams.Q_f_X[clus[i]], adc[i]);
-      if (clusParams.maxRow[clus[i]] == x[i])
-        atomicAdd(&clusParams.Q_l_X[clus[i]], adc[i]);
-      if (clusParams.minCol[clus[i]] == y[i])
-        atomicAdd(&clusParams.Q_f_Y[clus[i]], adc[i]);
-      if (clusParams.maxCol[clus[i]] == y[i])
-        atomicAdd(&clusParams.Q_l_Y[clus[i]], adc[i]);
+      auto x = digis.xx(i);
+      auto y = digis.yy(i);
+      auto ch = digis.adc(i);
+      atomicAdd(&clusParams.charge[cl], ch);
+      if (clusParams.minRow[cl] == x)
+        atomicAdd(&clusParams.Q_f_X[cl], ch);
+      if (clusParams.maxRow[cl] == x)
+        atomicAdd(&clusParams.Q_l_X[cl], ch);
+      if (clusParams.minCol[cl] == y)
+        atomicAdd(&clusParams.Q_f_Y[cl], ch);
+      if (clusParams.maxCol[cl] == y)
+        atomicAdd(&clusParams.Q_l_Y[cl], ch);
     }
 
     __syncthreads();
 
     // next one cluster per thread...
 
-    if (ic >= nclus)
-      return;
+    first = clusters.clusModuleStart(me);
 
-    first = hitsModuleStart[me];
-    auto h = first + ic;  // output index in global memory
+    for (int ic = threadIdx.x; ic < nclus; ic += blockDim.x) {
+      auto h = first + ic;  // output index in global memory
 
-    if (h >= TrackingRecHit2DSOAView::maxHits())
-      return;  // overflow...
+      if (h >= TrackingRecHit2DSOAView::maxHits())
+        break;  // overflow...
 
-    pixelCPEforGPU::position(cpeParams->commonParams(), cpeParams->detParams(me), clusParams, ic);
-    pixelCPEforGPU::errorFromDB(cpeParams->commonParams(), cpeParams->detParams(me), clusParams, ic);
+      pixelCPEforGPU::position(cpeParams->commonParams(), cpeParams->detParams(me), clusParams, ic);
+      pixelCPEforGPU::errorFromDB(cpeParams->commonParams(), cpeParams->detParams(me), clusParams, ic);
 
-    // store it
+      // store it
 
-    hits.charge(h) = clusParams.charge[ic];
+      hits.charge(h) = clusParams.charge[ic];
 
-    hits.detectorIndex(h) = me;
+      hits.detectorIndex(h) = me;
 
-    float xl, yl;
-    hits.xLocal(h) = xl = clusParams.xpos[ic];
-    hits.yLocal(h) = yl = clusParams.ypos[ic];
+      float xl, yl;
+      hits.xLocal(h) = xl = clusParams.xpos[ic];
+      hits.yLocal(h) = yl = clusParams.ypos[ic];
 
-    hits.clusterSizeX(h) = clusParams.xsize[ic];
-    hits.clusterSizeY(h) = clusParams.ysize[ic];
+      hits.clusterSizeX(h) = clusParams.xsize[ic];
+      hits.clusterSizeY(h) = clusParams.ysize[ic];
 
-    hits.xerrLocal(h) = clusParams.xerr[ic] * clusParams.xerr[ic];
-    hits.yerrLocal(h) = clusParams.yerr[ic] * clusParams.yerr[ic];
+      hits.xerrLocal(h) = clusParams.xerr[ic] * clusParams.xerr[ic];
+      hits.yerrLocal(h) = clusParams.yerr[ic] * clusParams.yerr[ic];
 
-    // keep it local for computations
-    float xg, yg, zg;
-    // to global and compute phi...
-    cpeParams->detParams(me).frame.toGlobal(xl, yl, xg, yg, zg);
-    // here correct for the beamspot...
-    xg -= bs->x;
-    yg -= bs->y;
-    zg -= bs->z;
+      // keep it local for computations
+      float xg, yg, zg;
+      // to global and compute phi...
+      cpeParams->detParams(me).frame.toGlobal(xl, yl, xg, yg, zg);
+      // here correct for the beamspot...
+      xg -= bs->x;
+      yg -= bs->y;
+      zg -= bs->z;
 
-    hits.xGlobal(h) = xg;
-    hits.yGlobal(h) = yg;
-    hits.zGlobal(h) = zg;
+      hits.xGlobal(h) = xg;
+      hits.yGlobal(h) = yg;
+      hits.zGlobal(h) = zg;
 
-    hits.rGlobal(h) = std::sqrt(xg * xg + yg * yg);
-    hits.iphi(h) = unsafe_atan2s<7>(yg, xg);
+      hits.rGlobal(h) = std::sqrt(xg * xg + yg * yg);
+      hits.iphi(h) = unsafe_atan2s<7>(yg, xg);
+    }
   }
 
 }  // namespace gpuPixelRecHits
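Taken together, the digi loops follow a standard CUDA idiom: per-cluster accumulators in shared memory, a block-stride sweep over the module's digis, and shared-memory atomics to resolve collisions between threads touching the same cluster. A self-contained sketch of the first (bounding-box) pass, with an illustrative MaxClus bound standing in for MaxHitsInModule:

#include <cstdint>
#include <limits>

constexpr int MaxClus = 1024;

__global__ void boundingBox(uint16_t const* __restrict__ x,
                            uint16_t const* __restrict__ y,
                            int32_t const* __restrict__ clus,
                            int n,
                            uint32_t* __restrict__ out) {  // minRow, maxRow, minCol, maxCol per cluster
  __shared__ uint32_t minRow[MaxClus], maxRow[MaxClus], minCol[MaxClus], maxCol[MaxClus];
  for (int ic = threadIdx.x; ic < MaxClus; ic += blockDim.x) {
    minRow[ic] = minCol[ic] = std::numeric_limits<uint32_t>::max();
    maxRow[ic] = maxCol[ic] = 0;
  }
  __syncthreads();
  for (int i = threadIdx.x; i < n; i += blockDim.x) {
    auto cl = clus[i];
    if (cl < 0 || cl >= MaxClus)
      continue;  // skip digis outside the tracked clusters
    atomicMin(&minRow[cl], uint32_t(x[i]));
    atomicMax(&maxRow[cl], uint32_t(x[i]));
    atomicMin(&minCol[cl], uint32_t(y[i]));
    atomicMax(&maxCol[cl], uint32_t(y[i]));
  }
  __syncthreads();
  for (int ic = threadIdx.x; ic < MaxClus; ic += blockDim.x) {
    out[4 * ic + 0] = minRow[ic];
    out[4 * ic + 1] = maxRow[ic];
    out[4 * ic + 2] = minCol[ic];
    out[4 * ic + 3] = maxCol[ic];
  }
}

The same structure carries over to the charge sums: after the min/max pass and a __syncthreads(), equality against the stored extrema decides which digis contribute to Q_f_X, Q_l_X, Q_f_Y and Q_l_Y.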