 #include "HeterogeneousCore/CUDAUtilities/interface/cudaCheck.h"
 #include "HeterogeneousCore/CUDAUtilities/interface/cuda_assert.h"
 #include "RecoLocalTracker/SiPixelRecHits/interface/pixelCPEforGPU.h"
-#include "RecoLocalTracker/SiPixelRecHits/plugins/siPixelRecHitsHeterogeneousProduct.h"
 
+#include "FWCore/ServiceRegistry/interface/Service.h"
+#include "HeterogeneousCore/CUDAServices/interface/CUDAService.h"
+#include "CUDADataFormats/TrackingRecHit/interface/TrackingRecHit2DCUDA.h"
 
-using HitsOnCPU = siPixelRecHitsHeterogeneousProduct::HitsOnCPU;
-
-using HitsOnGPU = siPixelRecHitsHeterogeneousProduct::HitsOnGPU;
+using HitsOnGPU = TrackingRecHit2DSOAView;
 using TuplesOnGPU = pixelTuplesHeterogeneousProduct::TuplesOnGPU;
 
 using namespace Eigen;
@@ -75,14 +75,14 @@ void kernelBLFastFit(TuplesOnGPU::Container const * __restrict__ foundNtuplets,
   for (unsigned int i = 0; i < hitsInFit; ++i) {
     auto hit = hitId[i];
     float ge[6];
-    hhp->cpeParams->detParams(hhp->detInd_d[hit]).frame.toGlobal(hhp->xerr_d[hit], 0, hhp->yerr_d[hit], ge);
+    hhp->cpeParams().detParams(hhp->detectorIndex(hit)).frame.toGlobal(hhp->xerrLocal(hit), 0, hhp->yerrLocal(hit), ge);
 #ifdef BL_DUMP_HITS
     if (dump) {
-      printf("Hit global: %d: %d hits.col(%d) << %f,%f,%f\n", helix_start, hhp->detInd_d[hit], i, hhp->xg_d[hit], hhp->yg_d[hit], hhp->zg_d[hit]);
-      printf("Error: %d: %d hits_ge.col(%d) << %e,%e,%e,%e,%e,%e\n", helix_start, hhp->detInd_d[hit], i, ge[0], ge[1], ge[2], ge[3], ge[4], ge[5]);
+      printf("Hit global: %d: %d hits.col(%d) << %f,%f,%f\n", helix_start, hhp->detectorIndex(hit), i, hhp->xGlobal(hit), hhp->yGlobal(hit), hhp->zGlobal(hit));
+      printf("Error: %d: %d hits_ge.col(%d) << %e,%e,%e,%e,%e,%e\n", helix_start, hhp->detectorIndex(hit), i, ge[0], ge[1], ge[2], ge[3], ge[4], ge[5]);
     }
 #endif
-    hits.col(i) << hhp->xg_d[hit], hhp->yg_d[hit], hhp->zg_d[hit];
+    hits.col(i) << hhp->xGlobal(hit), hhp->yGlobal(hit), hhp->zGlobal(hit);
     hits_ge.col(i) << ge[0], ge[1], ge[2], ge[3], ge[4], ge[5];
   }
   BrokenLine::BL_Fast_fit(hits, fast_fit);
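
In the hunk above the fast-fit kernel stops reading the old raw SoA arrays (detInd_d, xerr_d, xg_d, ...) and instead goes through the accessor methods of TrackingRecHit2DSOAView. As a minimal sketch of that read pattern, under the same accessors shown in the diff (the helper name loadGlobalHit and its signature are hypothetical, not part of this change):

    // Illustrative sketch only: read one hit's global position and its 6-word
    // covariance through the TrackingRecHit2DSOAView accessors used above.
    __device__ void loadGlobalHit(TrackingRecHit2DSOAView const* __restrict__ hhp,
                                  unsigned int hit, float xyz[3], float ge[6]) {
      // transform the local errors to the global frame via the CPE parameters held by the view
      hhp->cpeParams().detParams(hhp->detectorIndex(hit)).frame.toGlobal(hhp->xerrLocal(hit), 0, hhp->yerrLocal(hit), ge);
      xyz[0] = hhp->xGlobal(hit);
      xyz[1] = hhp->yGlobal(hit);
      xyz[2] = hhp->zGlobal(hit);
    }
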
@@ -167,65 +167,71 @@ void kernelBLFit(
 }
 
 
-void HelixFitOnGPU::launchBrokenLineKernels(HitsOnCPU const & hh, uint32_t hitsInFit, uint32_t maxNumberOfTuples, cudaStream_t cudaStream)
+void HelixFitOnGPU::launchBrokenLineKernels(HitsOnCPU const & hh, uint32_t hitsInFit, uint32_t maxNumberOfTuples, cuda::stream_t<> & stream)
 {
-  assert(tuples_d); assert(fast_fit_resultsGPU_);
+  assert(tuples_d);
 
   auto blockSize = 64;
   auto numberOfBlocks = (maxNumberOfConcurrentFits_ + blockSize - 1) / blockSize;
 
-  for (uint32_t offset=0; offset<maxNumberOfTuples; offset+=maxNumberOfConcurrentFits_) {
+  // Fit internals
+  edm::Service<CUDAService> cs;
+  auto hitsGPU_ = cs->make_device_unique<double[]>(maxNumberOfConcurrentFits_ * sizeof(Rfit::Matrix3xNd<4>) / sizeof(double), stream);
+  auto hits_geGPU_ = cs->make_device_unique<float[]>(maxNumberOfConcurrentFits_ * sizeof(Rfit::Matrix6x4f) / sizeof(float), stream);
+  auto fast_fit_resultsGPU_ = cs->make_device_unique<double[]>(maxNumberOfConcurrentFits_ * sizeof(Rfit::Vector4d) / sizeof(double), stream);
+
+  for (uint32_t offset=0; offset<maxNumberOfTuples; offset+=maxNumberOfConcurrentFits_) {
 
     // fit triplets
-    kernelBLFastFit<3><<<numberOfBlocks, blockSize, 0, cudaStream>>>(
-        tuples_d, tupleMultiplicity_d, hh.gpu_d,
-        hitsGPU_, hits_geGPU_, fast_fit_resultsGPU_,
+    kernelBLFastFit<3><<<numberOfBlocks, blockSize, 0, stream.id()>>>(
+        tuples_d, tupleMultiplicity_d, hh.view(),
+        hitsGPU_.get(), hits_geGPU_.get(), fast_fit_resultsGPU_.get(),
         3, offset);
     cudaCheck(cudaGetLastError());
 
-    kernelBLFit<3><<<numberOfBlocks, blockSize, 0, cudaStream>>>(
+    kernelBLFit<3><<<numberOfBlocks, blockSize, 0, stream.id()>>>(
         tupleMultiplicity_d, bField_, helix_fit_results_d,
-        hitsGPU_, hits_geGPU_, fast_fit_resultsGPU_,
+        hitsGPU_.get(), hits_geGPU_.get(), fast_fit_resultsGPU_.get(),
         3, offset);
     cudaCheck(cudaGetLastError());
 
     // fit quads
-    kernelBLFastFit<4><<<numberOfBlocks, blockSize, 0, cudaStream>>>(
-        tuples_d, tupleMultiplicity_d, hh.gpu_d,
-        hitsGPU_, hits_geGPU_, fast_fit_resultsGPU_,
+    kernelBLFastFit<4><<<numberOfBlocks, blockSize, 0, stream.id()>>>(
+        tuples_d, tupleMultiplicity_d, hh.view(),
+        hitsGPU_.get(), hits_geGPU_.get(), fast_fit_resultsGPU_.get(),
         4, offset);
     cudaCheck(cudaGetLastError());
 
-    kernelBLFit<4><<<numberOfBlocks, blockSize, 0, cudaStream>>>(
+    kernelBLFit<4><<<numberOfBlocks, blockSize, 0, stream.id()>>>(
         tupleMultiplicity_d, bField_, helix_fit_results_d,
-        hitsGPU_, hits_geGPU_, fast_fit_resultsGPU_,
+        hitsGPU_.get(), hits_geGPU_.get(), fast_fit_resultsGPU_.get(),
         4, offset);
     cudaCheck(cudaGetLastError());
 
     if (fit5as4_) {
       // fit penta (only first 4)
-      kernelBLFastFit<4><<<numberOfBlocks, blockSize, 0, cudaStream>>>(
-          tuples_d, tupleMultiplicity_d, hh.gpu_d,
-          hitsGPU_, hits_geGPU_, fast_fit_resultsGPU_,
+      kernelBLFastFit<4><<<numberOfBlocks, blockSize, 0, stream.id()>>>(
+          tuples_d, tupleMultiplicity_d, hh.view(),
+          hitsGPU_.get(), hits_geGPU_.get(), fast_fit_resultsGPU_.get(),
          5, offset);
       cudaCheck(cudaGetLastError());
 
-      kernelBLFit<4><<<numberOfBlocks, blockSize, 0, cudaStream>>>(
+      kernelBLFit<4><<<numberOfBlocks, blockSize, 0, stream.id()>>>(
          tupleMultiplicity_d, bField_, helix_fit_results_d,
-         hitsGPU_, hits_geGPU_, fast_fit_resultsGPU_,
+         hitsGPU_.get(), hits_geGPU_.get(), fast_fit_resultsGPU_.get(),
          5, offset);
       cudaCheck(cudaGetLastError());
     } else {
       // fit penta (all 5)
-      kernelBLFastFit<5><<<numberOfBlocks, blockSize, 0, cudaStream>>>(
-          tuples_d, tupleMultiplicity_d, hh.gpu_d,
-          hitsGPU_, hits_geGPU_, fast_fit_resultsGPU_,
+      kernelBLFastFit<5><<<numberOfBlocks, blockSize, 0, stream.id()>>>(
+          tuples_d, tupleMultiplicity_d, hh.view(),
+          hitsGPU_.get(), hits_geGPU_.get(), fast_fit_resultsGPU_.get(),
          5, offset);
       cudaCheck(cudaGetLastError());
 
-      kernelBLFit<5><<<numberOfBlocks, blockSize, 0, cudaStream>>>(
+      kernelBLFit<5><<<numberOfBlocks, blockSize, 0, stream.id()>>>(
          tupleMultiplicity_d, bField_, helix_fit_results_d,
-         hitsGPU_, hits_geGPU_, fast_fit_resultsGPU_,
+         hitsGPU_.get(), hits_geGPU_.get(), fast_fit_resultsGPU_.get(),
          5, offset);
       cudaCheck(cudaGetLastError());
     }