From abf1a46a184edd35ad3f5791be03b9e94f9afda7 Mon Sep 17 00:00:00 2001 From: Vincenzo Innocente Date: Wed, 8 Aug 2018 19:17:46 +0200 Subject: [PATCH] Pixel doublets on GPU (#118) Pixel doublets (actually CACells) are created on GPU and fed to CA. The whole workflow up to quadruplets candidates is now fully on GPU. --- .../CUDAUtilities/interface/GPUVecArray.h | 14 +- .../SiPixelRecHits/plugins/PixelRecHits.cu | 1 + .../siPixelRecHitsHeterogeneousProduct.h | 2 + .../CAHitNtupletHeterogeneousEDProducer.cc | 54 +- .../plugins/CAHitQuadrupletGeneratorGPU.cc | 487 +++++------------ .../plugins/CAHitQuadrupletGeneratorGPU.cu | 497 ++++-------------- .../plugins/CAHitQuadrupletGeneratorGPU.h | 57 +- .../PixelTriplets/plugins/GPUCACell.h | 74 +-- .../PixelTriplets/plugins/RecHitsMap.h | 77 +++ .../PixelTriplets/plugins/gpuPixelDoublets.h | 233 ++++---- 10 files changed, 510 insertions(+), 986 deletions(-) create mode 100644 RecoPixelVertexing/PixelTriplets/plugins/RecHitsMap.h diff --git a/HeterogeneousCore/CUDAUtilities/interface/GPUVecArray.h b/HeterogeneousCore/CUDAUtilities/interface/GPUVecArray.h index 601eb6db0305a..7ea4f6422d7fa 100644 --- a/HeterogeneousCore/CUDAUtilities/interface/GPUVecArray.h +++ b/HeterogeneousCore/CUDAUtilities/interface/GPUVecArray.h @@ -5,7 +5,6 @@ #ifndef GPU_VECARRAY_H_ #define GPU_VECARRAY_H_ - #include #include @@ -24,8 +23,6 @@ template struct VecArray { } } - - template constexpr int emplace_back_unsafe(Ts &&... args) { auto previousSize = m_size; m_size++; @@ -38,9 +35,7 @@ template struct VecArray { } } - __inline__ constexpr T & back() const { - if (m_size > 0) { return m_data[m_size - 1]; } else @@ -95,10 +90,15 @@ template struct VecArray { __inline__ constexpr void resize(int size) { m_size = size; } + __inline__ constexpr bool empty() const { return 0 == m_size; } + + __inline__ constexpr bool full() const { return maxSize == m_size; } - int m_size; + int m_size = 0; T m_data[maxSize]; }; + } -#endif + +#endif // GPU_VECARRAY_H_ diff --git a/RecoLocalTracker/SiPixelRecHits/plugins/PixelRecHits.cu b/RecoLocalTracker/SiPixelRecHits/plugins/PixelRecHits.cu index 1e3d6a72d0a94..3f0c5c81b9140 100644 --- a/RecoLocalTracker/SiPixelRecHits/plugins/PixelRecHits.cu +++ b/RecoLocalTracker/SiPixelRecHits/plugins/PixelRecHits.cu @@ -129,6 +129,7 @@ namespace pixelgpudetails { #endif auto nhits = input.nClusters; cpu_ = std::make_unique(nhits); + cudaCheck(cudaMemcpyAsync(cpu_->detInd.data(), gpu_.detInd_d, nhits*sizeof(int16_t), cudaMemcpyDefault, stream.id())); cudaCheck(cudaMemcpyAsync(cpu_->charge.data(), gpu_.charge_d, nhits*sizeof(int32_t), cudaMemcpyDefault, stream.id())); cudaCheck(cudaMemcpyAsync(cpu_->xl.data(), gpu_.xl_d, nhits*sizeof(float), cudaMemcpyDefault, stream.id())); cudaCheck(cudaMemcpyAsync(cpu_->yl.data(), gpu_.yl_d, nhits*sizeof(float), cudaMemcpyDefault, stream.id())); diff --git a/RecoLocalTracker/SiPixelRecHits/plugins/siPixelRecHitsHeterogeneousProduct.h b/RecoLocalTracker/SiPixelRecHits/plugins/siPixelRecHitsHeterogeneousProduct.h index 23ca8805991bd..864f7f4f7b2aa 100644 --- a/RecoLocalTracker/SiPixelRecHits/plugins/siPixelRecHitsHeterogeneousProduct.h +++ b/RecoLocalTracker/SiPixelRecHits/plugins/siPixelRecHitsHeterogeneousProduct.h @@ -39,6 +39,7 @@ namespace siPixelRecHitsHeterogeneousProduct { HitsOnCPU() = default; explicit HitsOnCPU(uint32_t nhits) : + detInd(nhits), charge(nhits), xl(nhits), yl(nhits), @@ -50,6 +51,7 @@ namespace siPixelRecHitsHeterogeneousProduct { { } uint32_t hitsModuleStart[2001]; + std::vector> detInd; std::vector> charge; std::vector> xl, yl; std::vector> xe, ye; diff --git a/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletHeterogeneousEDProducer.cc b/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletHeterogeneousEDProducer.cc index a4183d17e070f..1d945aa1e0c44 100644 --- a/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletHeterogeneousEDProducer.cc +++ b/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletHeterogeneousEDProducer.cc @@ -98,45 +98,42 @@ void CAHitNtupletHeterogeneousEDProducer::beginStreamGPUCuda( void CAHitNtupletHeterogeneousEDProducer::acquireGPUCuda( const edm::HeterogeneousEvent &iEvent, const edm::EventSetup &iSetup, cuda::stream_t<> &cudaStream) { + + seedingHitSets_ = std::make_unique(); + + + // FIXME: move directly to region or similar... edm::Handle hdoublets; iEvent.getByToken(doubletToken_, hdoublets); const auto ®ionDoublets = *hdoublets; + assert(regionDoublets.regionSize()<=1); - const SeedingLayerSetsHits &seedingLayerHits = - regionDoublets.seedingLayerHits(); - if (seedingLayerHits.numberOfLayersInSet() < - CAHitQuadrupletGeneratorGPU::minLayers) { - throw cms::Exception("LogicError") - << "CAHitNtupletHeterogeneousEDProducer expects " - "SeedingLayerSetsHits::numberOfLayersInSet() to be >= " - << CAHitQuadrupletGeneratorGPU::minLayers << ", got " - << seedingLayerHits.numberOfLayersInSet() - << ". This is likely caused by a configuration error of this module, " - "HitPairEDProducer, or SeedingLayersEDProducer."; + if (regionDoublets.empty()) { + emptyRegionDoublets = true; + return; } - seedingHitSets_ = std::make_unique(); + const TrackingRegion ®ion = (*regionDoublets.begin()).region(); + edm::Handle gh; iEvent.getByToken(tGpuHits, gh); auto const & gHits = *gh; // auto nhits = gHits.nHits; - GPUGenerator_.buildDoublets(gHits,0.06f,cudaStream.id()); + // move inside hitNtuplets??? + GPUGenerator_.buildDoublets(gHits,cudaStream.id()); - if (regionDoublets.empty()) { - emptyRegionDoublets = true; - } else { - seedingHitSets_->reserve(regionDoublets.regionSize(), localRA_.upper()); - GPUGenerator_.initEvent(iEvent.event(), iSetup); + seedingHitSets_->reserve(regionDoublets.regionSize(), localRA_.upper()); + GPUGenerator_.initEvent(iEvent.event(), iSetup); - LogDebug("CAHitNtupletHeterogeneousEDProducer") + LogDebug("CAHitNtupletHeterogeneousEDProducer") << "Creating ntuplets for " << regionDoublets.regionSize() << " regions, and " << regionDoublets.layerPairsSize() << " layer pairs"; - GPUGenerator_.hitNtuplets(regionDoublets, iSetup, seedingLayerHits, cudaStream.id()); - } + GPUGenerator_.hitNtuplets(region, gHits, iSetup, cudaStream.id()); + } void CAHitNtupletHeterogeneousEDProducer::produceGPUCuda( @@ -147,16 +144,18 @@ void CAHitNtupletHeterogeneousEDProducer::produceGPUCuda( edm::Handle hdoublets; iEvent.getByToken(doubletToken_, hdoublets); const auto ®ionDoublets = *hdoublets; - const SeedingLayerSetsHits &seedingLayerHits = - regionDoublets.seedingLayerHits(); - int index = 0; + + edm::Handle gh; + iEvent.getByToken(tGpuHits, gh); + auto const & rechits = gh->get().getProduct(); + std::vector ntuplets(regionDoublets.regionSize()); - for (auto &ntuplet : ntuplets) - ntuplet.reserve(localRA_.upper()); + for (auto &ntuplet : ntuplets) ntuplet.reserve(localRA_.upper()); + int index = 0; for (const auto ®ionLayerPairs : regionDoublets) { const TrackingRegion ®ion = regionLayerPairs.region(); auto seedingHitSetsFiller = seedingHitSets_->beginRegion(®ion); - GPUGenerator_.fillResults(regionDoublets, ntuplets, iSetup, seedingLayerHits, cudaStream.id()); + GPUGenerator_.fillResults(region, rechits.collection, ntuplets, iSetup, cudaStream.id()); fillNtuplets(seedingHitSetsFiller, ntuplets[index]); ntuplets[index].clear(); index++; @@ -188,6 +187,7 @@ void CAHitNtupletHeterogeneousEDProducer::produceCPU( iEvent.put(std::move(seedingHitSets)); return; } + seedingHitSets->reserve(regionDoublets.regionSize(), localRA_.upper()); CPUGenerator_.initEvent(iEvent.event(), iSetup); diff --git a/RecoPixelVertexing/PixelTriplets/plugins/CAHitQuadrupletGeneratorGPU.cc b/RecoPixelVertexing/PixelTriplets/plugins/CAHitQuadrupletGeneratorGPU.cc index 1230d29f123ff..e9116fcd58eac 100644 --- a/RecoPixelVertexing/PixelTriplets/plugins/CAHitQuadrupletGeneratorGPU.cc +++ b/RecoPixelVertexing/PixelTriplets/plugins/CAHitQuadrupletGeneratorGPU.cc @@ -30,30 +30,23 @@ constexpr unsigned int CAHitQuadrupletGeneratorGPU::minLayers; CAHitQuadrupletGeneratorGPU::CAHitQuadrupletGeneratorGPU( const edm::ParameterSet &cfg, edm::ConsumesCollector &iC) - : extraHitRPhitolerance(cfg.getParameter( - "extraHitRPhitolerance")), // extra window in - // ThirdHitPredictionFromCircle range - // (divide by R to get phi) - maxChi2(cfg.getParameter("maxChi2")), - fitFastCircle(cfg.getParameter("fitFastCircle")), - fitFastCircleChi2Cut(cfg.getParameter("fitFastCircleChi2Cut")), - useBendingCorrection(cfg.getParameter("useBendingCorrection")), - caThetaCut(cfg.getParameter("CAThetaCut")), - caPhiCut(cfg.getParameter("CAPhiCut")), - caHardPtCut(cfg.getParameter("CAHardPtCut")) { - edm::ParameterSet comparitorPSet = - cfg.getParameter("SeedComparitorPSet"); - std::string comparitorName = - comparitorPSet.getParameter("ComponentName"); + : extraHitRPhitolerance(cfg.getParameter("extraHitRPhitolerance")), // extra window in ThirdHitPredictionFromCircle range (divide by R to get phi) + maxChi2(cfg.getParameter("maxChi2")), + fitFastCircle(cfg.getParameter("fitFastCircle")), + fitFastCircleChi2Cut(cfg.getParameter("fitFastCircleChi2Cut")), + useBendingCorrection(cfg.getParameter("useBendingCorrection")), + caThetaCut(cfg.getParameter("CAThetaCut")), + caPhiCut(cfg.getParameter("CAPhiCut")), + caHardPtCut(cfg.getParameter("CAHardPtCut")) +{ + edm::ParameterSet comparitorPSet = cfg.getParameter("SeedComparitorPSet"); + std::string comparitorName = comparitorPSet.getParameter("ComponentName"); if (comparitorName != "none") { - theComparitor.reset(SeedComparitorFactory::get()->create( - comparitorName, comparitorPSet, iC)); + theComparitor.reset(SeedComparitorFactory::get()->create(comparitorName, comparitorPSet, iC)); } - } -void CAHitQuadrupletGeneratorGPU::fillDescriptions( - edm::ParameterSetDescription &desc) { +void CAHitQuadrupletGeneratorGPU::fillDescriptions(edm::ParameterSetDescription &desc) { desc.add("extraHitRPhitolerance", 0.1); desc.add("fitFastCircle", false); desc.add("fitFastCircleChi2Cut", false); @@ -61,10 +54,9 @@ void CAHitQuadrupletGeneratorGPU::fillDescriptions( desc.add("CAThetaCut", 0.00125); desc.add("CAPhiCut", 10); desc.add("CAHardPtCut", 0); - desc.addOptional("CAOnlyOneLastHitPerLayerFilter") - ->setComment( - "Deprecated and has no effect. To be fully removed later when the " - "parameter is no longer used in HLT configurations."); + desc.addOptional("CAOnlyOneLastHitPerLayerFilter")->setComment( + "Deprecated and has no effect. To be fully removed later when the " + "parameter is no longer used in HLT configurations."); edm::ParameterSetDescription descMaxChi2; descMaxChi2.add("pt1", 0.2); descMaxChi2.add("pt2", 1.5); @@ -75,13 +67,11 @@ void CAHitQuadrupletGeneratorGPU::fillDescriptions( edm::ParameterSetDescription descComparitor; descComparitor.add("ComponentName", "none"); - descComparitor.setAllowAnything(); // until we have moved SeedComparitor too - // to EDProducers + descComparitor.setAllowAnything(); // until we have moved SeedComparitor to EDProducers too desc.add("SeedComparitorPSet", descComparitor); } -void CAHitQuadrupletGeneratorGPU::initEvent(const edm::Event &ev, - const edm::EventSetup &es) { +void CAHitQuadrupletGeneratorGPU::initEvent(edm::Event const& ev, edm::EventSetup const& es) { if (theComparitor) theComparitor->init(ev, es); } @@ -90,356 +80,121 @@ CAHitQuadrupletGeneratorGPU::~CAHitQuadrupletGeneratorGPU() { deallocateOnGPU(); } -namespace { -void createGraphStructure(const SeedingLayerSetsHits &layers, CAGraph &g, - GPULayerHits *h_layers_, unsigned int maxNumberOfHits_, - float *h_x_, float *h_y_, float *h_z_) { - for (unsigned int i = 0; i < layers.size(); i++) { - for (unsigned int j = 0; j < 4; ++j) { - auto vertexIndex = 0; - auto foundVertex = std::find(g.theLayers.begin(), g.theLayers.end(), - layers[i][j].name()); - if (foundVertex == g.theLayers.end()) { - g.theLayers.emplace_back(layers[i][j].name(), - layers[i][j].hits().size()); - vertexIndex = g.theLayers.size() - 1; - - } else { - vertexIndex = foundVertex - g.theLayers.begin(); - } - if (j == 0) { - - if (std::find(g.theRootLayers.begin(), g.theRootLayers.end(), - vertexIndex) == g.theRootLayers.end()) { - g.theRootLayers.emplace_back(vertexIndex); - } - } - } - } -} -void clearGraphStructure(const SeedingLayerSetsHits &layers, CAGraph &g) { - g.theLayerPairs.clear(); - for (unsigned int i = 0; i < g.theLayers.size(); i++) { - g.theLayers[i].theInnerLayers.clear(); - g.theLayers[i].theInnerLayerPairs.clear(); - g.theLayers[i].theOuterLayers.clear(); - g.theLayers[i].theOuterLayerPairs.clear(); - for (auto &v : g.theLayers[i].isOuterHitOfCell) - v.clear(); - } +void CAHitQuadrupletGeneratorGPU::hitNtuplets( + TrackingRegion const& region, + HitsOnCPU const& hh, + edm::EventSetup const& es, + cudaStream_t cudaStream) +{ + hitsOnCPU = &hh; + int index = 0; + launchKernels(region, index, hh, cudaStream); } -void fillGraph(const SeedingLayerSetsHits &layers, - const IntermediateHitDoublets::RegionLayerSets ®ionLayerPairs, - CAGraph &g, std::vector &hitDoublets) { - - for (unsigned int i = 0; i < layers.size(); i++) { - for (unsigned int j = 0; j < 4; ++j) { - auto vertexIndex = 0; - auto foundVertex = std::find(g.theLayers.begin(), g.theLayers.end(), - layers[i][j].name()); - if (foundVertex == g.theLayers.end()) { - vertexIndex = g.theLayers.size() - 1; - } else { - vertexIndex = foundVertex - g.theLayers.begin(); - } - if (j > 0) { - auto innerVertex = std::find(g.theLayers.begin(), g.theLayers.end(), - layers[i][j - 1].name()); +void CAHitQuadrupletGeneratorGPU::fillResults( + const TrackingRegion ®ion, SiPixelRecHitCollectionNew const & rechits, + std::vector &result, const edm::EventSetup &es, + cudaStream_t cudaStream) +{ + hitmap_.clear(); + auto const & rcs = rechits.data(); + for (auto const & h : rcs) hitmap_.add(h, &h); - CALayerPair tmpInnerLayerPair(innerVertex - g.theLayers.begin(), - vertexIndex); + assert(hitsOnCPU); + auto nhits = hitsOnCPU->nHits; + int index = 0; - if (std::find(g.theLayerPairs.begin(), g.theLayerPairs.end(), - tmpInnerLayerPair) == g.theLayerPairs.end()) { - auto found = std::find_if( - regionLayerPairs.begin(), regionLayerPairs.end(), - [&](const IntermediateHitDoublets::LayerPairHitDoublets &pair) { - return pair.innerLayerIndex() == layers[i][j - 1].index() && - pair.outerLayerIndex() == layers[i][j].index(); - }); - if (found != regionLayerPairs.end()) { - hitDoublets.emplace_back(&(found->doublets())); - g.theLayerPairs.push_back(tmpInnerLayerPair); - g.theLayers[vertexIndex].theInnerLayers.push_back( - innerVertex - g.theLayers.begin()); - innerVertex->theOuterLayers.push_back(vertexIndex); - g.theLayers[vertexIndex].theInnerLayerPairs.push_back( - g.theLayerPairs.size() - 1); - innerVertex->theOuterLayerPairs.push_back(g.theLayerPairs.size() - - 1); - } - } + auto const & foundQuads = fetchKernelResult(index, cudaStream); + unsigned int numberOfFoundQuadruplets = foundQuads.size(); + const QuantityDependsPtEval maxChi2Eval = maxChi2.evaluator(es); + + // re-used throughout + std::array bc_r; + std::array bc_z; + std::array bc_errZ2; + std::array gps; + std::array ges; + std::array barrels; + std::array phits; + + // loop over quadruplets + for (unsigned int quadId = 0; quadId < numberOfFoundQuadruplets; ++quadId) { + auto isBarrel = [](const unsigned id) -> bool { + return id == PixelSubdetector::PixelBarrel; + }; + bool bad = false; + for (unsigned int i = 0; i < 4; ++i) { + auto k = foundQuads[quadId][i]; + assert(k(hp); + auto const &ahit = *phits[i]; + gps[i] = ahit.globalPosition(); + ges[i] = ahit.globalPositionError(); + barrels[i] = isBarrel(ahit.geographicalId().subdetId()); -} -} // namespace - -void CAHitQuadrupletGeneratorGPU::hitNtuplets( - const IntermediateHitDoublets ®ionDoublets, - const edm::EventSetup &es, - const SeedingLayerSetsHits &layers, cudaStream_t cudaStream) { - CAGraph g; - - hitDoublets.resize(regionDoublets.regionSize()); - - for (unsigned int lpIdx = 0; lpIdx < maxNumberOfLayerPairs_; ++lpIdx) { - h_doublets_[lpIdx].size = 0; - } - numberOfRootLayerPairs_ = 0; - numberOfLayerPairs_ = 0; - numberOfLayers_ = 0; - - for (unsigned int layerIdx = 0; layerIdx < maxNumberOfLayers_; ++layerIdx) { - h_layers_[layerIdx].size = 0; - } - - int index = 0; - for (const auto ®ionLayerPairs : regionDoublets) { - const TrackingRegion ®ion = regionLayerPairs.region(); - hitDoublets[index].clear(); - if (index == 0) { - createGraphStructure(layers, g, h_layers_, maxNumberOfHits_, h_x_, h_y_, h_z_); - } else { - clearGraphStructure(layers, g); } - - fillGraph(layers, regionLayerPairs, g, hitDoublets[index]); - numberOfLayers_ = g.theLayers.size(); - numberOfLayerPairs_ = hitDoublets[index].size(); - std::vector layerAlreadyParsed(g.theLayers.size(), false); - - for (unsigned int i = 0; i < numberOfLayerPairs_; ++i) { - h_doublets_[i].size = hitDoublets[index][i]->size(); - h_doublets_[i].innerLayerId = g.theLayerPairs[i].theLayers[0]; - h_doublets_[i].outerLayerId = g.theLayerPairs[i].theLayers[1]; - - if (layerAlreadyParsed[h_doublets_[i].innerLayerId] == false) { - layerAlreadyParsed[h_doublets_[i].innerLayerId] = true; - h_layers_[h_doublets_[i].innerLayerId].size = - hitDoublets[index][i]->innerLayer().hits().size(); - h_layers_[h_doublets_[i].innerLayerId].layerId = - h_doublets_[i].innerLayerId; - - for (unsigned int l = 0; l < h_layers_[h_doublets_[i].innerLayerId].size; - ++l) { - auto hitId = - h_layers_[h_doublets_[i].innerLayerId].layerId * maxNumberOfHits_ + - l; - h_x_[hitId] = - hitDoublets[index][i]->innerLayer().hits()[l]->globalPosition().x(); - h_y_[hitId] = - hitDoublets[index][i]->innerLayer().hits()[l]->globalPosition().y(); - h_z_[hitId] = - hitDoublets[index][i]->innerLayer().hits()[l]->globalPosition().z(); - } - } - if (layerAlreadyParsed[h_doublets_[i].outerLayerId] == false) { - layerAlreadyParsed[h_doublets_[i].outerLayerId] = true; - h_layers_[h_doublets_[i].outerLayerId].size = - hitDoublets[index][i]->outerLayer().hits().size(); - h_layers_[h_doublets_[i].outerLayerId].layerId = - h_doublets_[i].outerLayerId; - for (unsigned int l = 0; l < h_layers_[h_doublets_[i].outerLayerId].size; - ++l) { - auto hitId = - h_layers_[h_doublets_[i].outerLayerId].layerId * maxNumberOfHits_ + - l; - h_x_[hitId] = - hitDoublets[index][i]->outerLayer().hits()[l]->globalPosition().x(); - h_y_[hitId] = - hitDoublets[index][i]->outerLayer().hits()[l]->globalPosition().y(); - h_z_[hitId] = - hitDoublets[index][i]->outerLayer().hits()[l]->globalPosition().z(); - } + if (bad) continue; + + // TODO: + // - if we decide to always do the circle fit for 4 hits, we don't + // need ThirdHitPredictionFromCircle for the curvature; then we + // could remove extraHitRPhitolerance configuration parameter + ThirdHitPredictionFromCircle predictionRPhi(gps[0], gps[2], + extraHitRPhitolerance); + const float curvature = predictionRPhi.curvature( + ThirdHitPredictionFromCircle::Vector2D(gps[1].x(), gps[1].y())); + const float abscurv = std::abs(curvature); + const float thisMaxChi2 = maxChi2Eval.value(abscurv); + if (theComparitor) { + SeedingHitSet tmpTriplet(phits[0], phits[1], phits[3]); + if (!theComparitor->compatible(tmpTriplet)) { + continue; } + } - for (unsigned int rl : g.theRootLayers) { - if (rl == h_doublets_[i].innerLayerId) { - auto rootlayerPairId = numberOfRootLayerPairs_; - h_rootLayerPairs_[rootlayerPairId] = i; - numberOfRootLayerPairs_++; - } - } - auto numberOfDoublets = hitDoublets[index][i]->size(); - if(numberOfDoublets > maxNumberOfDoublets_) - { - edm::LogError("CAHitQuadrupletGeneratorGPU")<<" too many doublets: " << numberOfDoublets << " max is " << maxNumberOfDoublets_; - return; - } - for (unsigned int l = 0; l < numberOfDoublets; ++l) { - auto hitId = i * maxNumberOfDoublets_ * 2 + 2 * l; - h_indices_[hitId] = hitDoublets[index][i]->innerHitId(l); - h_indices_[hitId + 1] = hitDoublets[index][i]->outerHitId(l); + float chi2 = std::numeric_limits::quiet_NaN(); + // TODO: Do we have any use case to not use bending correction? + if (useBendingCorrection) { + // Following PixelFitterByConformalMappingAndLine + const float simpleCot = (gps.back().z() - gps.front().z()) / + (gps.back().perp() - gps.front().perp()); + const float pt = 1.f / PixelRecoUtilities::inversePt(abscurv, es); + for (int i = 0; i < 4; ++i) { + const GlobalPoint &point = gps[i]; + const GlobalError &error = ges[i]; + bc_r[i] = sqrt(sqr(point.x() - region.origin().x()) + + sqr(point.y() - region.origin().y())); + bc_r[i] += pixelrecoutilities::LongitudinalBendingCorrection(pt, es)( + bc_r[i]); + bc_z[i] = point.z() - region.origin().z(); + bc_errZ2[i] = + (barrels[i]) ? error.czz() : error.rerr(point) * sqr(simpleCot); } + RZLine rzLine(bc_r, bc_z, bc_errZ2, RZLine::ErrZ2_tag()); + chi2 = rzLine.chi2(); + } else { + RZLine rzLine(gps, ges, barrels); + chi2 = rzLine.chi2(); } - - - - for (unsigned int j = 0; j < numberOfLayerPairs_; ++j) { - tmp_layerDoublets_[j] = h_doublets_[j]; - tmp_layerDoublets_[j].indices = &d_indices_[j * maxNumberOfDoublets_ * 2]; - cudaMemcpyAsync(&d_indices_[j * maxNumberOfDoublets_ * 2], - &h_indices_[j * maxNumberOfDoublets_ * 2], - tmp_layerDoublets_[j].size * 2 * sizeof(int), - cudaMemcpyHostToDevice, cudaStream); + if (edm::isNotFinite(chi2) || chi2 > thisMaxChi2) { + continue; } - - for (unsigned int j = 0; j < numberOfLayers_; ++j) { - if(h_layers_[j].size > maxNumberOfHits_) - { - edm::LogError("CAHitQuadrupletGeneratorGPU")<<" too many hits: " << h_layers_[j].size << " max is " << maxNumberOfHits_; - return; - } - tmp_layers_[j] = h_layers_[j]; - tmp_layers_[j].x = &d_x_[maxNumberOfHits_ * j]; - - cudaMemcpyAsync(&d_x_[maxNumberOfHits_ * j], &h_x_[j * maxNumberOfHits_], - tmp_layers_[j].size * sizeof(float), - cudaMemcpyHostToDevice, cudaStream); - - tmp_layers_[j].y = &d_y_[maxNumberOfHits_ * j]; - cudaMemcpyAsync(&d_y_[maxNumberOfHits_ * j], &h_y_[j * maxNumberOfHits_], - tmp_layers_[j].size * sizeof(float), - cudaMemcpyHostToDevice, cudaStream); - - tmp_layers_[j].z = &d_z_[maxNumberOfHits_ * j]; - - cudaMemcpyAsync(&d_z_[maxNumberOfHits_ * j], &h_z_[j * maxNumberOfHits_], - tmp_layers_[j].size * sizeof(float), - cudaMemcpyHostToDevice, cudaStream); + // TODO: Do we have any use case to not use circle fit? Maybe + // HLT where low-pT inefficiency is not a problem? + if (fitFastCircle) { + FastCircleFit c(gps, ges); + chi2 += c.chi2(); + if (edm::isNotFinite(chi2)) + continue; + if (fitFastCircleChi2Cut && chi2 > thisMaxChi2) + continue; } + result[index].emplace_back(phits[0], phits[1], phits[2], phits[3]); - cudaMemcpyAsync(d_rootLayerPairs_, h_rootLayerPairs_, - numberOfRootLayerPairs_ * sizeof(unsigned int), - cudaMemcpyHostToDevice, cudaStream); - cudaMemcpyAsync(d_doublets_, tmp_layerDoublets_, - numberOfLayerPairs_ * sizeof(GPULayerDoublets), - cudaMemcpyHostToDevice, cudaStream); - cudaMemcpyAsync(d_layers_, tmp_layers_, numberOfLayers_ * sizeof(GPULayerHits), - cudaMemcpyHostToDevice, cudaStream); - - launchKernels(region, index, cudaStream); - } -} - -void CAHitQuadrupletGeneratorGPU::fillResults( - const IntermediateHitDoublets ®ionDoublets, - std::vector &result, const edm::EventSetup &es, - const SeedingLayerSetsHits &layers, cudaStream_t cudaStream) -{ - int index = 0; - - for (const auto ®ionLayerPairs : regionDoublets) { - const TrackingRegion ®ion = regionLayerPairs.region(); - auto foundQuads = fetchKernelResult(index, cudaStream); - unsigned int numberOfFoundQuadruplets = foundQuads.size(); - const QuantityDependsPtEval maxChi2Eval = maxChi2.evaluator(es); - - // re-used thoughout - std::array bc_r; - std::array bc_z; - std::array bc_errZ2; - std::array gps; - std::array ges; - std::array barrels; - // Loop over quadruplets - for (unsigned int quadId = 0; quadId < numberOfFoundQuadruplets; ++quadId) { - auto isBarrel = [](const unsigned id) -> bool { - return id == PixelSubdetector::PixelBarrel; - }; - for (unsigned int i = 0; i < 3; ++i) { - auto layerPair = foundQuads[quadId][i].first; - auto doubletId = foundQuads[quadId][i].second; - - auto const &ahit = - hitDoublets[index][layerPair]->hit(doubletId, HitDoublets::inner); - gps[i] = ahit->globalPosition(); - ges[i] = ahit->globalPositionError(); - barrels[i] = isBarrel(ahit->geographicalId().subdetId()); - - } - auto layerPair = foundQuads[quadId][2].first; - auto doubletId = foundQuads[quadId][2].second; - - auto const &ahit = - hitDoublets[index][layerPair]->hit(doubletId, HitDoublets::outer); - gps[3] = ahit->globalPosition(); - ges[3] = ahit->globalPositionError(); - barrels[3] = isBarrel(ahit->geographicalId().subdetId()); - - // TODO: - // - if we decide to always do the circle fit for 4 hits, we don't - // need ThirdHitPredictionFromCircle for the curvature; then we - // could remove extraHitRPhitolerance configuration parameter - ThirdHitPredictionFromCircle predictionRPhi(gps[0], gps[2], - extraHitRPhitolerance); - const float curvature = predictionRPhi.curvature( - ThirdHitPredictionFromCircle::Vector2D(gps[1].x(), gps[1].y())); - const float abscurv = std::abs(curvature); - const float thisMaxChi2 = maxChi2Eval.value(abscurv); - if (theComparitor) { - SeedingHitSet tmpTriplet( - hitDoublets[index][foundQuads[quadId][0].first]->hit( - foundQuads[quadId][0].second, HitDoublets::inner), - hitDoublets[index][foundQuads[quadId][2].first]->hit( - foundQuads[quadId][2].second, HitDoublets::inner), - hitDoublets[index][foundQuads[quadId][2].first]->hit( - foundQuads[quadId][2].second, HitDoublets::outer)); - if (!theComparitor->compatible(tmpTriplet)) { - continue; - } - } - - float chi2 = std::numeric_limits::quiet_NaN(); - // TODO: Do we have any use case to not use bending correction? - if (useBendingCorrection) { - // Following PixelFitterByConformalMappingAndLine - const float simpleCot = (gps.back().z() - gps.front().z()) / - (gps.back().perp() - gps.front().perp()); - const float pt = 1.f / PixelRecoUtilities::inversePt(abscurv, es); - for (int i = 0; i < 4; ++i) { - const GlobalPoint &point = gps[i]; - const GlobalError &error = ges[i]; - bc_r[i] = sqrt(sqr(point.x() - region.origin().x()) + - sqr(point.y() - region.origin().y())); - bc_r[i] += pixelrecoutilities::LongitudinalBendingCorrection(pt, es)( - bc_r[i]); - bc_z[i] = point.z() - region.origin().z(); - bc_errZ2[i] = - (barrels[i]) ? error.czz() : error.rerr(point) * sqr(simpleCot); - } - RZLine rzLine(bc_r, bc_z, bc_errZ2, RZLine::ErrZ2_tag()); - chi2 = rzLine.chi2(); - } else { - RZLine rzLine(gps, ges, barrels); - chi2 = rzLine.chi2(); - } - if (edm::isNotFinite(chi2) || chi2 > thisMaxChi2) { - continue; - } - // TODO: Do we have any use case to not use circle fit? Maybe - // HLT where low-pT inefficiency is not a problem? - if (fitFastCircle) { - FastCircleFit c(gps, ges); - chi2 += c.chi2(); - if (edm::isNotFinite(chi2)) - continue; - if (fitFastCircleChi2Cut && chi2 > thisMaxChi2) - continue; - } - result[index].emplace_back( - hitDoublets[index][foundQuads[quadId][0].first]->hit( - foundQuads[quadId][0].second, HitDoublets::inner), - hitDoublets[index][foundQuads[quadId][1].first]->hit( - foundQuads[quadId][1].second, HitDoublets::inner), - hitDoublets[index][foundQuads[quadId][2].first]->hit( - foundQuads[quadId][2].second, HitDoublets::inner), - hitDoublets[index][foundQuads[quadId][2].first]->hit( - foundQuads[quadId][2].second, HitDoublets::outer)); - } - index++; - } + } // end loop over quads } diff --git a/RecoPixelVertexing/PixelTriplets/plugins/CAHitQuadrupletGeneratorGPU.cu b/RecoPixelVertexing/PixelTriplets/plugins/CAHitQuadrupletGeneratorGPU.cu index 6e367e59e320d..2f8966c1a42c4 100644 --- a/RecoPixelVertexing/PixelTriplets/plugins/CAHitQuadrupletGeneratorGPU.cu +++ b/RecoPixelVertexing/PixelTriplets/plugins/CAHitQuadrupletGeneratorGPU.cu @@ -6,361 +6,99 @@ #include "CAHitQuadrupletGeneratorGPU.h" #include "GPUCACell.h" #include "gpuPixelDoublets.h" +#include + +using HitsOnCPU = siPixelRecHitsHeterogeneousProduct::HitsOnCPU; __global__ void -kernel_debug(unsigned int numberOfLayerPairs_, unsigned int numberOfLayers_, - const GPULayerDoublets *gpuDoublets, - const GPULayerHits *gpuHitsOnLayers, GPUCACell *cells, - GPU::VecArray *isOuterHitOfCell, - GPU::SimpleVector *foundNtuplets, - float ptmin, float region_origin_x, float region_origin_y, - float region_origin_radius, const float thetaCut, - const float phiCut, const float hardPtCut, - unsigned int maxNumberOfDoublets_, unsigned int maxNumberOfHits_) { - if (threadIdx.x == 0 and blockIdx.x == 0) - foundNtuplets->reset(); - - printf("kernel_debug_create: theEvent contains numberOfLayerPairs_: %d\n", - numberOfLayerPairs_); - for (unsigned int layerPairIndex = 0; layerPairIndex < numberOfLayerPairs_; - ++layerPairIndex) { - - int outerLayerId = gpuDoublets[layerPairIndex].outerLayerId; - int innerLayerId = gpuDoublets[layerPairIndex].innerLayerId; - int numberOfDoublets = gpuDoublets[layerPairIndex].size; - printf( - "kernel_debug_create: layerPairIndex: %d inner %d outer %d size %u\n", - layerPairIndex, innerLayerId, outerLayerId, numberOfDoublets); - - auto globalFirstDoubletIdx = layerPairIndex * maxNumberOfDoublets_; - auto globalFirstHitIdx = outerLayerId * maxNumberOfHits_; - printf("kernel_debug_create: theIdOfThefirstCellInLayerPair: %d " - "globalFirstHitIdx %d\n", - globalFirstDoubletIdx, globalFirstHitIdx); - - for (unsigned int i = 0; i < gpuDoublets[layerPairIndex].size; i++) { - - auto globalCellIdx = i + globalFirstDoubletIdx; - auto &thisCell = cells[globalCellIdx]; - auto outerHitId = gpuDoublets[layerPairIndex].indices[2 * i + 1]; - thisCell.init(&gpuDoublets[layerPairIndex], gpuHitsOnLayers, - layerPairIndex, globalCellIdx, - gpuDoublets[layerPairIndex].indices[2 * i], outerHitId, - region_origin_x, region_origin_y); - - isOuterHitOfCell[globalFirstHitIdx + outerHitId].push_back( - globalCellIdx); - } - } +kernel_checkOverflows(GPU::SimpleVector *foundNtuplets, + GPUCACell *cells, uint32_t const * nCells, + GPU::VecArray< unsigned int, 2048> *isOuterHitOfCell, + uint32_t nHits) { + + auto idx = threadIdx.x + blockIdx.x * blockDim.x; + #ifdef GPU_DEBUG + if (0==idx) + printf("number of found cells %d\n",*nCells); + #endif + if (idx < (*nCells) ) { + auto &thisCell = cells[idx]; + if (thisCell.theOuterNeighbors.full()) //++tooManyNeighbors[thisCell.theLayerPairId]; + printf("OuterNeighbors overflow %d in %d\n",idx,thisCell.theLayerPairId); + } + if (idx < nHits) { + if (isOuterHitOfCell[idx].full()) // ++tooManyOuterHitOfCell; + printf("OuterHitOfCell overflow %d\n", idx); + } - // for(unsigned int layerIndex = 0; layerIndex < numberOfLayers_;++layerIndex ) - // { - // auto numberOfHitsOnLayer = gpuHitsOnLayers[layerIndex].size; - // for(unsigned hitId = 0; hitId < numberOfHitsOnLayer; hitId++) - // { - // - // if(isOuterHitOfCell[layerIndex*maxNumberOfHits_+hitId].size()>0) - // { - // printf("\nlayer %d hit %d is outer hit of %d - // cells\n",layerIndex, hitId, - // isOuterHitOfCell[layerIndex*maxNumberOfHits_+hitId].size()); - // printf("\n\t%f %f %f - // \n",gpuHitsOnLayers[layerIndex].x[hitId],gpuHitsOnLayers[layerIndex].y[hitId],gpuHitsOnLayers[layerIndex].z[hitId]); - // - // for(unsigned cell = 0; cell< - // isOuterHitOfCell[layerIndex*maxNumberOfHits_+hitId].size(); - // cell++) - // { - // printf("cell %d\n", - // isOuterHitOfCell[layerIndex*maxNumberOfHits_+hitId].m_data[cell]); - // auto& thisCell = - // cells[isOuterHitOfCell[layerIndex*maxNumberOfHits_+hitId].m_data[cell]]; - // float x1, y1, z1, x2, y2, z2; - // - // x1 = thisCell.get_inner_x(); - // y1 = thisCell.get_inner_y(); - // z1 = thisCell.get_inner_z(); - // x2 = thisCell.get_outer_x(); - // y2 = thisCell.get_outer_y(); - // z2 = thisCell.get_outer_z(); - // printf("\n\tDEBUG cellid %d innerhit outerhit (xyz) (%f %f - // %f), (%f %f - // %f)\n",isOuterHitOfCell[layerIndex*maxNumberOfHits_+hitId].m_data[cell], - // x1,y1,z1,x2,y2,z2); - // } - // } - // } - // } - - // starting connect - - for (unsigned int layerPairIndex = 0; layerPairIndex < numberOfLayerPairs_; - ++layerPairIndex) { - - int outerLayerId = gpuDoublets[layerPairIndex].outerLayerId; - int innerLayerId = gpuDoublets[layerPairIndex].innerLayerId; - int numberOfDoublets = gpuDoublets[layerPairIndex].size; - printf("kernel_debug_connect: connecting layerPairIndex: %d inner %d outer " - "%d size %u\n", - layerPairIndex, innerLayerId, outerLayerId, numberOfDoublets); - - auto globalFirstDoubletIdx = layerPairIndex * maxNumberOfDoublets_; - auto globalFirstHitIdx = innerLayerId * maxNumberOfHits_; - // printf("kernel_debug_connect: theIdOfThefirstCellInLayerPair: %d - // globalFirstHitIdx %d\n", globalFirstDoubletIdx, - // globalFirstHitIdx); - - for (unsigned int i = 0; i < numberOfDoublets; i++) { - - auto globalCellIdx = i + globalFirstDoubletIdx; - - auto &thisCell = cells[globalCellIdx]; - auto innerHitId = thisCell.get_inner_hit_id(); - auto numberOfPossibleNeighbors = - isOuterHitOfCell[globalFirstHitIdx + innerHitId].size(); - // if(numberOfPossibleNeighbors>0) - // printf("kernel_debug_connect: cell: %d has %d possible - // neighbors\n", globalCellIdx, numberOfPossibleNeighbors); - float x1, y1, z1, x2, y2, z2; - - x1 = thisCell.get_inner_x(); - y1 = thisCell.get_inner_y(); - z1 = thisCell.get_inner_z(); - x2 = thisCell.get_outer_x(); - y2 = thisCell.get_outer_y(); - z2 = thisCell.get_outer_z(); - printf("\n\n\nDEBUG cellid %d innerhit outerhit (xyz) (%f %f %f), (%f %f " - "%f)\n", - globalCellIdx, x1, y1, z1, x2, y2, z2); - - for (auto j = 0; j < numberOfPossibleNeighbors; ++j) { - unsigned int otherCell = - isOuterHitOfCell[globalFirstHitIdx + innerHitId][j]; - - float x3, y3, z3, x4, y4, z4; - x3 = cells[otherCell].get_inner_x(); - y3 = cells[otherCell].get_inner_y(); - z3 = cells[otherCell].get_inner_z(); - x4 = cells[otherCell].get_outer_x(); - y4 = cells[otherCell].get_outer_y(); - z4 = cells[otherCell].get_outer_z(); - - printf("kernel_debug_connect: checking compatibility with %d \n", - otherCell); - printf("DEBUG \tinnerhit outerhit (xyz) (%f %f %f), (%f %f %f)\n", x3, - y3, z3, x4, y4, z4); - - if (thisCell.check_alignment_and_tag( - cells, otherCell, ptmin, region_origin_x, region_origin_y, - region_origin_radius, thetaCut, phiCut, hardPtCut)) { - - printf("kernel_debug_connect: \t\tcell %d is outer neighbor of %d \n", - globalCellIdx, otherCell); - - cells[otherCell].theOuterNeighbors.push_back(globalCellIdx); - } - } - } - } } -__global__ void debug_input_data(unsigned int numberOfLayerPairs_, - const GPULayerDoublets *gpuDoublets, - const GPULayerHits *gpuHitsOnLayers, - float ptmin, float region_origin_x, - float region_origin_y, - float region_origin_radius, - unsigned int maxNumberOfHits_) { - printf("GPU: Region ptmin %f , region_origin_x %f , region_origin_y %f , " - "region_origin_radius %f \n", - ptmin, region_origin_x, region_origin_y, region_origin_radius); - printf("GPU: numberOfLayerPairs_: %d\n", numberOfLayerPairs_); - - for (unsigned int layerPairIndex = 0; layerPairIndex < numberOfLayerPairs_; - ++layerPairIndex) { - printf("\t numberOfDoublets: %d \n", gpuDoublets[layerPairIndex].size); - printf("\t innerLayer: %d outerLayer: %d \n", - gpuDoublets[layerPairIndex].innerLayerId, - gpuDoublets[layerPairIndex].outerLayerId); - - for (unsigned int cellIndexInLayerPair = 0; - cellIndexInLayerPair < gpuDoublets[layerPairIndex].size; - ++cellIndexInLayerPair) { - - if (cellIndexInLayerPair < 5) { - auto innerhit = - gpuDoublets[layerPairIndex].indices[2 * cellIndexInLayerPair]; - auto innerX = gpuHitsOnLayers[gpuDoublets[layerPairIndex].innerLayerId] - .x[innerhit]; - auto innerY = gpuHitsOnLayers[gpuDoublets[layerPairIndex].innerLayerId] - .y[innerhit]; - auto innerZ = gpuHitsOnLayers[gpuDoublets[layerPairIndex].innerLayerId] - .z[innerhit]; - - auto outerhit = - gpuDoublets[layerPairIndex].indices[2 * cellIndexInLayerPair + 1]; - auto outerX = gpuHitsOnLayers[gpuDoublets[layerPairIndex].outerLayerId] - .x[outerhit]; - auto outerY = gpuHitsOnLayers[gpuDoublets[layerPairIndex].outerLayerId] - .y[outerhit]; - auto outerZ = gpuHitsOnLayers[gpuDoublets[layerPairIndex].outerLayerId] - .z[outerhit]; - printf("\t \t %d innerHit: %d %f %f %f outerHit: %d %f %f %f\n", - cellIndexInLayerPair, innerhit, innerX, innerY, innerZ, outerhit, - outerX, outerY, outerZ); - } - } - } -} - -template -__global__ void kernel_debug_find_ntuplets( - unsigned int numberOfRootLayerPairs_, const GPULayerDoublets *gpuDoublets, - GPUCACell *cells, - GPU::VecArray *foundNtuplets, - unsigned int *rootLayerPairs, unsigned int minHitsPerNtuplet, - unsigned int maxNumberOfDoublets_) { - printf("numberOfRootLayerPairs_ = %d", numberOfRootLayerPairs_); - for (int rootLayerPair = 0; rootLayerPair < numberOfRootLayerPairs_; - ++rootLayerPair) { - unsigned int rootLayerPairIndex = rootLayerPairs[rootLayerPair]; - auto globalFirstDoubletIdx = rootLayerPairIndex * maxNumberOfDoublets_; - - GPU::VecArray stack; - for (int i = 0; i < gpuDoublets[rootLayerPairIndex].size; i++) { - auto globalCellIdx = i + globalFirstDoubletIdx; - stack.reset(); - stack.push_back(globalCellIdx); - cells[globalCellIdx].find_ntuplets(cells, foundNtuplets, stack, - minHitsPerNtuplet); - } - printf("found quadruplets: %d", foundNtuplets->size()); - } -} - -__global__ void kernel_create( - const unsigned int numberOfLayerPairs_, const GPULayerDoublets *gpuDoublets, - const GPULayerHits *gpuHitsOnLayers, GPUCACell *cells, - GPU::VecArray *isOuterHitOfCell, - GPU::SimpleVector *foundNtuplets, - const float region_origin_x, const float region_origin_y, - unsigned int maxNumberOfDoublets_, unsigned int maxNumberOfHits_) { - - unsigned int layerPairIndex = blockIdx.y; - unsigned int cellIndexInLayerPair = threadIdx.x + blockIdx.x * blockDim.x; - if (cellIndexInLayerPair == 0 && layerPairIndex == 0) { - foundNtuplets->reset(); - } - - if (layerPairIndex < numberOfLayerPairs_) { - int outerLayerId = gpuDoublets[layerPairIndex].outerLayerId; - auto globalFirstDoubletIdx = layerPairIndex * maxNumberOfDoublets_; - auto globalFirstHitIdx = outerLayerId * maxNumberOfHits_; - - for (unsigned int i = cellIndexInLayerPair; - i < gpuDoublets[layerPairIndex].size; i += gridDim.x * blockDim.x) { - auto globalCellIdx = i + globalFirstDoubletIdx; - auto &thisCell = cells[globalCellIdx]; - auto outerHitId = gpuDoublets[layerPairIndex].indices[2 * i + 1]; - thisCell.init(&gpuDoublets[layerPairIndex], gpuHitsOnLayers, - layerPairIndex, globalCellIdx, - gpuDoublets[layerPairIndex].indices[2 * i], outerHitId, - region_origin_x, region_origin_y); - - isOuterHitOfCell[globalFirstHitIdx + outerHitId].push_back( - globalCellIdx); - } - } -} __global__ void -kernel_connect(unsigned int numberOfLayerPairs_, - const GPULayerDoublets *gpuDoublets, GPUCACell *cells, - GPU::VecArray< unsigned int, 512> *isOuterHitOfCell, - float ptmin, float region_origin_x, float region_origin_y, +kernel_connect(GPU::SimpleVector *foundNtuplets, + GPUCACell *cells, uint32_t const * nCells, + GPU::VecArray< unsigned int, 2048> *isOuterHitOfCell, + float ptmin, float region_origin_radius, const float thetaCut, const float phiCut, const float hardPtCut, unsigned int maxNumberOfDoublets_, unsigned int maxNumberOfHits_) { - unsigned int layerPairIndex = blockIdx.y; - unsigned int cellIndexInLayerPair = threadIdx.x + blockIdx.x * blockDim.x; - if (layerPairIndex < numberOfLayerPairs_) { - int innerLayerId = gpuDoublets[layerPairIndex].innerLayerId; - auto globalFirstDoubletIdx = layerPairIndex * maxNumberOfDoublets_; - auto globalFirstHitIdx = innerLayerId * maxNumberOfHits_; - - for (int i = cellIndexInLayerPair; i < gpuDoublets[layerPairIndex].size; - i += gridDim.x * blockDim.x) { - auto globalCellIdx = i + globalFirstDoubletIdx; - - auto &thisCell = cells[globalCellIdx]; - auto innerHitId = thisCell.get_inner_hit_id(); - auto numberOfPossibleNeighbors = - isOuterHitOfCell[globalFirstHitIdx + innerHitId].size(); - for (auto j = 0; j < numberOfPossibleNeighbors; ++j) { - unsigned int otherCell = - isOuterHitOfCell[globalFirstHitIdx + innerHitId][j]; - - if (thisCell.check_alignment_and_tag( - cells, otherCell, ptmin, region_origin_x, region_origin_y, - region_origin_radius, thetaCut, phiCut, hardPtCut)) { - cells[otherCell].theOuterNeighbors.push_back(globalCellIdx); - } - } - } + + float region_origin_x =0.; + float region_origin_y =0.; + + auto cellIndex = threadIdx.x + blockIdx.x * blockDim.x; + + if (0==cellIndex) foundNtuplets->reset(); // ready for next kernel + + if (cellIndex >= (*nCells) ) return; + auto &thisCell = cells[cellIndex]; + auto innerHitId = thisCell.get_inner_hit_id(); + auto numberOfPossibleNeighbors = isOuterHitOfCell[innerHitId].size(); + for (auto j = 0; j < numberOfPossibleNeighbors; ++j) { + auto otherCell = isOuterHitOfCell[innerHitId][j]; + + if (thisCell.check_alignment_and_tag( + cells, otherCell, ptmin, region_origin_x, region_origin_y, + region_origin_radius, thetaCut, phiCut, hardPtCut) + ) { + cells[otherCell].theOuterNeighbors.push_back(cellIndex); + } } } __global__ void kernel_find_ntuplets( - unsigned int numberOfRootLayerPairs_, const GPULayerDoublets *gpuDoublets, - GPUCACell *cells, + GPUCACell *cells, uint32_t const * nCells, GPU::SimpleVector *foundNtuplets, - unsigned int *rootLayerPairs, unsigned int minHitsPerNtuplet, + unsigned int minHitsPerNtuplet, unsigned int maxNumberOfDoublets_) { - if (blockIdx.y < numberOfRootLayerPairs_) { - unsigned int cellIndexInRootLayerPair = threadIdx.x + blockIdx.x * blockDim.x; - unsigned int rootLayerPairIndex = rootLayerPairs[blockIdx.y]; - auto globalFirstDoubletIdx = rootLayerPairIndex * maxNumberOfDoublets_; - GPU::VecArray stack; - for (int i = cellIndexInRootLayerPair; - i < gpuDoublets[rootLayerPairIndex].size; - i += gridDim.x * blockDim.x) { - auto globalCellIdx = i + globalFirstDoubletIdx; - stack.reset(); - stack.push_back_unsafe(globalCellIdx); - cells[globalCellIdx].find_ntuplets(cells, foundNtuplets, stack, minHitsPerNtuplet); - } - } + + auto cellIndex = threadIdx.x + blockIdx.x * blockDim.x; + if (cellIndex >= (*nCells) ) return; + auto &thisCell = cells[cellIndex]; + if (thisCell.theLayerPairId!=0 && thisCell.theLayerPairId!=3 && thisCell.theLayerPairId!=8) return; // inner layer is 0 FIXME + GPU::VecArray stack; + stack.reset(); + thisCell.find_ntuplets(cells, foundNtuplets, stack, minHitsPerNtuplet); + assert(stack.size()==0); + // printf("in %d found quadruplets: %d\n", cellIndex, foundNtuplets->size()); } -template __global__ void -kernel_print_found_ntuplets(GPU::SimpleVector *foundNtuplets) { - for (int i = 0; i < foundNtuplets->size(); ++i) { - printf("\nquadruplet %d: %d %d, %d %d, %d %d\n", i, - (*foundNtuplets)[i].layerPairsAndCellId[0].x, - (*foundNtuplets)[i].layerPairsAndCellId[0].y - - maxNumberOfDoublets_ * - ((*foundNtuplets)[i].layerPairsAndCellId[0].x), - (*foundNtuplets)[i].layerPairsAndCellId[1].x, - (*foundNtuplets)[i].layerPairsAndCellId[1].y - - maxNumberOfDoublets_ * - (*foundNtuplets)[i].layerPairsAndCellId[1].x, - (*foundNtuplets)[i].layerPairsAndCellId[2].x, - (*foundNtuplets)[i].layerPairsAndCellId[2].y - - maxNumberOfDoublets_ * - ((*foundNtuplets)[i].layerPairsAndCellId[2].x)); +kernel_print_found_ntuplets(GPU::SimpleVector *foundNtuplets, int maxPrint) { + for (int i = 0; i < std::min(maxPrint,foundNtuplets->size()); ++i) { + printf("\nquadruplet %d: %d %d %d %d\n", i, + (*foundNtuplets)[i].hitId[0], + (*foundNtuplets)[i].hitId[1], + (*foundNtuplets)[i].hitId[2], + (*foundNtuplets)[i].hitId[3] + ); + } } void CAHitQuadrupletGeneratorGPU::deallocateOnGPU() { - cudaFreeHost(h_indices_); - cudaFreeHost(h_doublets_); - cudaFreeHost(h_x_); - cudaFreeHost(h_y_); - cudaFreeHost(h_z_); - cudaFreeHost(h_rootLayerPairs_); for (size_t i = 0; i < h_foundNtupletsVec_.size(); ++i) { cudaFreeHost(h_foundNtupletsVec_[i]); @@ -368,44 +106,21 @@ void CAHitQuadrupletGeneratorGPU::deallocateOnGPU() cudaFree(d_foundNtupletsVec_[i]); cudaFree(d_foundNtupletsData_[i]); } - cudaFreeHost(tmp_layers_); - cudaFreeHost(tmp_layerDoublets_); - cudaFreeHost(h_layers_); - - cudaFree(d_indices_); - cudaFree(d_doublets_); - cudaFree(d_layers_); - cudaFree(d_x_); - cudaFree(d_y_); - cudaFree(d_z_); - cudaFree(d_rootLayerPairs_); + cudaFree(device_theCells_); cudaFree(device_isOuterHitOfCell_); + cudaFree(device_nCells_); } void CAHitQuadrupletGeneratorGPU::allocateOnGPU() { - cudaCheck(cudaMallocHost(&h_doublets_, maxNumberOfLayerPairs_ * sizeof(GPULayerDoublets))); - cudaCheck(cudaMallocHost(&h_indices_, maxNumberOfLayerPairs_ * maxNumberOfDoublets_ * 2 * sizeof(int))); - cudaCheck(cudaMallocHost(&h_x_, maxNumberOfLayers_ * maxNumberOfHits_ * sizeof(float))); - cudaCheck(cudaMallocHost(&h_y_, maxNumberOfLayers_ * maxNumberOfHits_ * sizeof(float))); - cudaCheck(cudaMallocHost(&h_z_, maxNumberOfLayers_ * maxNumberOfHits_ * sizeof(float))); - cudaCheck(cudaMallocHost(&h_rootLayerPairs_, maxNumberOfRootLayerPairs_ * sizeof(int))); - - cudaCheck(cudaMalloc(&d_indices_, maxNumberOfLayerPairs_ * maxNumberOfDoublets_ * 2 * sizeof(int))); - cudaCheck(cudaMalloc(&d_doublets_, maxNumberOfLayerPairs_ * sizeof(GPULayerDoublets))); - cudaCheck(cudaMalloc(&d_layers_, maxNumberOfLayers_ * sizeof(GPULayerHits))); - cudaCheck(cudaMalloc(&d_x_, maxNumberOfLayers_ * maxNumberOfHits_ * sizeof(float))); - cudaCheck(cudaMalloc(&d_y_, maxNumberOfLayers_ * maxNumberOfHits_ * sizeof(float))); - cudaCheck(cudaMalloc(&d_z_, maxNumberOfLayers_ * maxNumberOfHits_ * sizeof(float))); - cudaCheck(cudaMalloc(&d_rootLayerPairs_, maxNumberOfRootLayerPairs_ * sizeof(unsigned int))); - ////////////////////////////////////////////////////////// // ALLOCATIONS FOR THE INTERMEDIATE RESULTS (STAYS ON WORKER) ////////////////////////////////////////////////////////// cudaCheck(cudaMalloc(&device_theCells_, maxNumberOfLayerPairs_ * maxNumberOfDoublets_ * sizeof(GPUCACell))); + cudaCheck(cudaMalloc(&device_nCells_,sizeof(uint32_t))); cudaCheck(cudaMalloc(&device_isOuterHitOfCell_, maxNumberOfLayers_ * maxNumberOfHits_ * sizeof(GPU::VecArray))); @@ -429,40 +144,47 @@ void CAHitQuadrupletGeneratorGPU::allocateOnGPU() cudaCheck(cudaMemcpy(d_foundNtupletsVec_[i], & tmp_foundNtuplets, sizeof(GPU::SimpleVector), cudaMemcpyDefault)); } - cudaCheck(cudaMallocHost(&tmp_layers_, maxNumberOfLayers_ * sizeof(GPULayerHits))); - cudaCheck(cudaMallocHost(&tmp_layerDoublets_,maxNumberOfLayerPairs_ * sizeof(GPULayerDoublets))); - cudaCheck(cudaMallocHost(&h_layers_, maxNumberOfLayers_ * sizeof(GPULayerHits))); } void CAHitQuadrupletGeneratorGPU::launchKernels(const TrackingRegion ®ion, - int regionIndex, cudaStream_t cudaStream) + int regionIndex, HitsOnCPU const & hh, + cudaStream_t cudaStream) { assert(regionIndex < maxNumberOfRegions_); - dim3 numberOfBlocks_create(64, numberOfLayerPairs_); - dim3 numberOfBlocks_connect(32, numberOfLayerPairs_); - dim3 numberOfBlocks_find(16, numberOfRootLayerPairs_); + assert(0==regionIndex); + h_foundNtupletsVec_[regionIndex]->reset(); - kernel_create<<>>( - numberOfLayerPairs_, d_doublets_, d_layers_, device_theCells_, - device_isOuterHitOfCell_, d_foundNtupletsVec_[regionIndex], - region.origin().x(), region.origin().y(), maxNumberOfDoublets_, - maxNumberOfHits_); - cudaCheck(cudaGetLastError()); - kernel_connect<<>>( - numberOfLayerPairs_, d_doublets_, device_theCells_, + auto nhits = hh.nHits; + + auto numberOfBlocks = (maxNumberOfDoublets_ + 512 - 1)/512; + kernel_connect<<>>( + d_foundNtupletsVec_[regionIndex], // needed only to be reset, ready for next kernel + device_theCells_, device_nCells_, device_isOuterHitOfCell_, - region.ptMin(), region.origin().x(), region.origin().y(), + region.ptMin(), region.originRBound(), caThetaCut, caPhiCut, caHardPtCut, - maxNumberOfDoublets_, maxNumberOfHits_); + maxNumberOfDoublets_, maxNumberOfHits_ + ); cudaCheck(cudaGetLastError()); - kernel_find_ntuplets<<>>( - numberOfRootLayerPairs_, d_doublets_, device_theCells_, + kernel_find_ntuplets<<>>( + device_theCells_, device_nCells_, d_foundNtupletsVec_[regionIndex], - d_rootLayerPairs_, 4, maxNumberOfDoublets_); + 4, maxNumberOfDoublets_); cudaCheck(cudaGetLastError()); + + numberOfBlocks = (std::max(int(nhits),maxNumberOfDoublets_) + 512 - 1)/512; + kernel_checkOverflows<<>>( + d_foundNtupletsVec_[regionIndex], + device_theCells_, device_nCells_, + device_isOuterHitOfCell_, nhits + ); + + + // kernel_print_found_ntuplets<<<1,1,0, cudaStream>>>(d_foundNtupletsVec_[regionIndex],10); + cudaCheck(cudaMemcpyAsync(h_foundNtupletsVec_[regionIndex], d_foundNtupletsVec_[regionIndex], sizeof(GPU::SimpleVector), cudaMemcpyDeviceToHost, cudaStream)); @@ -473,34 +195,29 @@ void CAHitQuadrupletGeneratorGPU::launchKernels(const TrackingRegion ®ion, } -std::vector, 3>> +std::vector> CAHitQuadrupletGeneratorGPU::fetchKernelResult(int regionIndex, cudaStream_t cudaStream) { + assert(0==regionIndex); h_foundNtupletsVec_[regionIndex]->set_data(h_foundNtupletsData_[regionIndex]); // this lazily resets temporary memory for the next event, and is not needed for reading the output cudaCheck(cudaMemsetAsync(device_isOuterHitOfCell_, 0, maxNumberOfLayers_ * maxNumberOfHits_ * sizeof(GPU::VecArray), cudaStream)); - std::vector, 3>> quadsInterface; - for (int i = 0; i < h_foundNtupletsVec_[regionIndex]->size(); ++i) { - auto const& layerPairsAndCellId = (*h_foundNtupletsVec_[regionIndex])[i].layerPairsAndCellId; - std::array, 3> tmpQuad = { - {std::make_pair(layerPairsAndCellId[0].x, layerPairsAndCellId[0].y - maxNumberOfDoublets_ * layerPairsAndCellId[0].x), - std::make_pair(layerPairsAndCellId[1].x, layerPairsAndCellId[1].y - maxNumberOfDoublets_ * layerPairsAndCellId[1].x), - std::make_pair(layerPairsAndCellId[2].x, layerPairsAndCellId[2].y - maxNumberOfDoublets_ * layerPairsAndCellId[2].x)}}; + cudaCheck(cudaMemsetAsync(device_nCells_,0,sizeof(uint32_t),cudaStream)); - quadsInterface.push_back(tmpQuad); + std::vector> quadsInterface(h_foundNtupletsVec_[regionIndex]->size()); + for (int i = 0; i < h_foundNtupletsVec_[regionIndex]->size(); ++i) { + for (int j = 0; j<4; ++j) quadsInterface[i][j] = (*h_foundNtupletsVec_[regionIndex])[i].hitId[j]; } return quadsInterface; } -void CAHitQuadrupletGeneratorGPU::buildDoublets(HitsOnCPU const & hh, float phicut, cudaStream_t stream) { +void CAHitQuadrupletGeneratorGPU::buildDoublets(HitsOnCPU const & hh, cudaStream_t stream) { auto nhits = hh.nHits; - float phiCut=0.06; int threadsPerBlock = 256; - int blocks = (nhits + threadsPerBlock - 1) / threadsPerBlock; - - gpuPixelDoublets::getDoubletsFromHisto<<>>(hh.gpu_d,phiCut); + int blocks = (3*nhits + threadsPerBlock - 1) / threadsPerBlock; + gpuPixelDoublets::getDoubletsFromHisto<<>>(device_theCells_,device_nCells_,hh.gpu_d, device_isOuterHitOfCell_); cudaCheck(cudaGetLastError()); } diff --git a/RecoPixelVertexing/PixelTriplets/plugins/CAHitQuadrupletGeneratorGPU.h b/RecoPixelVertexing/PixelTriplets/plugins/CAHitQuadrupletGeneratorGPU.h index a7f33219f66a1..07aabbe2cacbb 100644 --- a/RecoPixelVertexing/PixelTriplets/plugins/CAHitQuadrupletGeneratorGPU.h +++ b/RecoPixelVertexing/PixelTriplets/plugins/CAHitQuadrupletGeneratorGPU.h @@ -18,12 +18,12 @@ #include "RecoTracker/TkSeedGenerator/interface/FastCircleFit.h" #include "RecoTracker/TkSeedingLayers/interface/SeedComparitor.h" #include "RecoTracker/TkSeedingLayers/interface/SeedComparitorFactory.h" +#include "RecoPixelVertexing/PixelTriplets/plugins/RecHitsMap.h" + #include "GPUCACell.h" -#include "GPUHitsAndDoublets.h" class TrackingRegion; -class SeedingLayerSetsHits; namespace edm { class Event; @@ -37,8 +37,6 @@ class CAHitQuadrupletGeneratorGPU { using HitsOnGPU = siPixelRecHitsHeterogeneousProduct::HitsOnGPU; using HitsOnCPU = siPixelRecHitsHeterogeneousProduct::HitsOnCPU; - typedef LayerHitMapCache LayerCacheType; - static constexpr unsigned int minLayers = 4; typedef OrderedHitSeeds ResultType; @@ -54,21 +52,21 @@ class CAHitQuadrupletGeneratorGPU { void initEvent(const edm::Event& ev, const edm::EventSetup& es); - void buildDoublets(HitsOnCPU const & hh, float phicut, cudaStream_t stream); + void buildDoublets(HitsOnCPU const & hh, cudaStream_t stream); - void hitNtuplets(const IntermediateHitDoublets& regionDoublets, + void hitNtuplets(const TrackingRegion ®ion, HitsOnCPU const & hh, const edm::EventSetup& es, - const SeedingLayerSetsHits& layers, cudaStream_t stream); - void fillResults(const IntermediateHitDoublets& regionDoublets, + cudaStream_t stream); + void fillResults(const TrackingRegion ®ion, SiPixelRecHitCollectionNew const & rechits, std::vector& result, const edm::EventSetup& es, - const SeedingLayerSetsHits& layers, cudaStream_t stream); + cudaStream_t stream); void allocateOnGPU(); void deallocateOnGPU(); private: - LayerCacheType theLayerCache; +// LayerCacheType theLayerCache; std::unique_ptr theComparitor; @@ -136,10 +134,11 @@ class CAHitQuadrupletGeneratorGPU { const bool enabled_; }; - void launchKernels(const TrackingRegion &, int, cudaStream_t); - std::vector ,3>> fetchKernelResult(int, cudaStream_t); + void launchKernels(const TrackingRegion &, int, HitsOnCPU const & hh, cudaStream_t); + std::vector> fetchKernelResult(int, cudaStream_t); + + const float extraHitRPhitolerance; - std::vector> hitDoublets; const QuantityDependsPt maxChi2; const bool fitFastCircle; @@ -151,29 +150,14 @@ class CAHitQuadrupletGeneratorGPU { const float caHardPtCut = 0.f; static constexpr int maxNumberOfQuadruplets_ = 10000; - static constexpr int maxCellsPerHit_ = 512; + static constexpr int maxCellsPerHit_ = 2048; // 512; static constexpr int maxNumberOfLayerPairs_ = 13; - static constexpr unsigned int maxNumberOfRootLayerPairs_ = 13; static constexpr int maxNumberOfLayers_ = 10; static constexpr int maxNumberOfDoublets_ = 262144; - static constexpr int maxNumberOfHits_ = 10000; - static constexpr int maxNumberOfRegions_ = 30; - - unsigned int numberOfRootLayerPairs_ = 0; - unsigned int numberOfLayerPairs_ = 0; - unsigned int numberOfLayers_ = 0; - - GPULayerDoublets* h_doublets_ = nullptr; - GPULayerHits* h_layers_ = nullptr; - - unsigned int* h_indices_ = nullptr; - float *h_x_=nullptr, *h_y_=nullptr, *h_z_=nullptr; - float *d_x_=nullptr, *d_y_=nullptr, *d_z_=nullptr; - unsigned int* d_rootLayerPairs_ = nullptr; - GPULayerHits* d_layers_ = nullptr; - GPULayerDoublets* d_doublets_ = nullptr; - unsigned int* d_indices_ = nullptr; - unsigned int* h_rootLayerPairs_ = nullptr; + static constexpr int maxNumberOfHits_ = 20000; + static constexpr int maxNumberOfRegions_ = 2; + + std::vector*> h_foundNtupletsVec_; std::vector h_foundNtupletsData_; @@ -182,9 +166,12 @@ class CAHitQuadrupletGeneratorGPU { GPUCACell* device_theCells_ = nullptr; GPU::VecArray< unsigned int, maxCellsPerHit_>* device_isOuterHitOfCell_ = nullptr; + uint32_t* device_nCells_ = nullptr; + + HitsOnCPU const * hitsOnCPU=nullptr; + + RecHitsMap hitmap_ = RecHitsMap(nullptr); - GPULayerHits* tmp_layers_ = nullptr; - GPULayerDoublets* tmp_layerDoublets_ = nullptr; }; #endif // RecoPixelVertexing_PixelTriplets_plugins_CAHitQuadrupletGeneratorGPU_h diff --git a/RecoPixelVertexing/PixelTriplets/plugins/GPUCACell.h b/RecoPixelVertexing/PixelTriplets/plugins/GPUCACell.h index 14d8ee833ce71..e8d389f00712b 100644 --- a/RecoPixelVertexing/PixelTriplets/plugins/GPUCACell.h +++ b/RecoPixelVertexing/PixelTriplets/plugins/GPUCACell.h @@ -5,43 +5,43 @@ #define GPU_CACELL_H_ #include "GPUHitsAndDoublets.h" +#include "RecoLocalTracker/SiPixelRecHits/plugins/siPixelRecHitsHeterogeneousProduct.h" #include "HeterogeneousCore/CUDAUtilities/interface/GPUVecArray.h" #include "HeterogeneousCore/CUDAUtilities/interface/GPUSimpleVector.h" #include + struct Quadruplet { - int2 layerPairsAndCellId[3]; + int hitId[4]; }; + class GPUCACell { public: __host__ __device__ GPUCACell() {} - __host__ __device__ void init(const GPULayerDoublets *doublets, - const GPULayerHits *hitsOnLayer, - int layerPairId, int doubletId, int innerHitId, - int outerHitId, float regionX, float regionY) { +__host__ __device__ void init(siPixelRecHitsHeterogeneousProduct::HitsOnGPU const & hh, + int layerPairId, int doubletId, int innerHitId,int outerHitId) { theInnerHitId = innerHitId; theOuterHitId = outerHitId; theDoubletId = doubletId; theLayerPairId = layerPairId; - auto innerLayerId = doublets->innerLayerId; - auto outerLayerId = doublets->outerLayerId; - - theInnerX = hitsOnLayer[innerLayerId].x[innerHitId]; - theOuterX = hitsOnLayer[outerLayerId].x[outerHitId]; + theInnerX = hh.xg_d[innerHitId]; + theOuterX = hh.xg_d[outerHitId]; - theInnerY = hitsOnLayer[innerLayerId].y[innerHitId]; - theOuterY = hitsOnLayer[outerLayerId].y[outerHitId]; + theInnerY = hh.yg_d[innerHitId]; + theOuterY = hh.yg_d[outerHitId]; - theInnerZ = hitsOnLayer[innerLayerId].z[innerHitId]; - theOuterZ = hitsOnLayer[outerLayerId].z[outerHitId]; - theInnerR = hypot(theInnerX - regionX, theInnerY - regionY); - theOuterR = hypot(theOuterX - regionX, theOuterY - regionY); + theInnerZ = hh.zg_d[innerHitId]; + theOuterZ = hh.zg_d[outerHitId]; + theInnerR = hh.rg_d[innerHitId]; + theOuterR = hh.rg_d[outerHitId]; theOuterNeighbors.reset(); } + + constexpr float get_inner_x() const { return theInnerX; } constexpr float get_outer_x() const { return theOuterX; } constexpr float get_inner_y() const { return theInnerY; } @@ -201,57 +201,31 @@ class GPUCACell { // the ntuplets is then saved if the number of hits it contains is greater // than a threshold + tmpNtuplet.push_back_unsafe(theInnerHitId); + assert(tmpNtuplet.size()<=3); - if ((unsigned int)(tmpNtuplet.size()) >= minHitsPerNtuplet - 1) { + if ((unsigned int)(tmpNtuplet.size()) >= minHitsPerNtuplet-1) { Quadruplet tmpQuadruplet; - for (unsigned int i = 0; i < minHitsPerNtuplet - 1; ++i) { - tmpQuadruplet.layerPairsAndCellId[i].x = cells[tmpNtuplet[i]].theLayerPairId; - tmpQuadruplet.layerPairsAndCellId[i].y = tmpNtuplet[i]; + for (unsigned int i = 0; i < minHitsPerNtuplet-1; ++i) { + tmpQuadruplet.hitId[i] = tmpNtuplet[i]; } + tmpQuadruplet.hitId[minHitsPerNtuplet-1] = theOuterHitId; foundNtuplets->push_back(tmpQuadruplet); } else { for (int j = 0; j < theOuterNeighbors.size(); ++j) { auto otherCell = theOuterNeighbors[j]; - tmpNtuplet.push_back_unsafe(otherCell); cells[otherCell].find_ntuplets(cells, foundNtuplets, tmpNtuplet, minHitsPerNtuplet); - tmpNtuplet.pop_back(); } } + tmpNtuplet.pop_back(); + assert(tmpNtuplet.size()<3); } #endif - template - __host__ inline void find_ntuplets_host( - const GPUCACell *cells, - GPU::VecArray *foundNtuplets, - GPU::VecArray &tmpNtuplet, - const unsigned int minHitsPerNtuplet) const { - - Quadruplet tmpQuadruplet; - if (tmpNtuplet.size() >= minHitsPerNtuplet - 1) { - for (int i = 0; i < minHitsPerNtuplet - 1; ++i) { - tmpQuadruplet.layerPairsAndCellId[i].x = - cells[tmpNtuplet[i]].theLayerPairId; - tmpQuadruplet.layerPairsAndCellId[i].y = tmpNtuplet[i]; - } - foundNtuplets->push_back(tmpQuadruplet); - } - - else { - for (int j = 0; j < theOuterNeighbors.size(); ++j) { - auto otherCell = theOuterNeighbors[j]; - tmpNtuplet.push_back_unsafe(otherCell); - cells[otherCell].find_ntuplets_host(cells, foundNtuplets, tmpNtuplet, - minHitsPerNtuplet); - - tmpNtuplet.pop_back(); - } - } - } GPU::VecArray< unsigned int, 40> theOuterNeighbors; int theDoubletId; diff --git a/RecoPixelVertexing/PixelTriplets/plugins/RecHitsMap.h b/RecoPixelVertexing/PixelTriplets/plugins/RecHitsMap.h new file mode 100644 index 0000000000000..566a15591472c --- /dev/null +++ b/RecoPixelVertexing/PixelTriplets/plugins/RecHitsMap.h @@ -0,0 +1,77 @@ +#ifndef RecHitsMap_H +#define RecHitsMap_H + // store T for each cluster... + +#include "DataFormats/TrackerRecHit2D/interface/BaseTrackerRecHit.h" +#include +#include + +#include "FWCore/MessageLogger/interface/MessageLogger.h" + + +//FIXME move it to a better place... +template +class RecHitsMap { +public: + + explicit RecHitsMap(T const & d=T()) : dummy(d){} + + void clear() {m_map.clear();} + + void error(const GeomDetUnit& gd) const {edm::LogError("RecHitMap") << "hit not found in det " << gd.index(); } + void error(uint32_t ind) const {edm::LogError("RecHitMap") << "hit not found in det " << ind; } + + // does not work for matched hits... (easy to extend) + void add(TrackingRecHit const & hit, T const & v) { + auto const & thit = static_cast(hit); + auto const & clus = thit.firstClusterRef(); + + if (clus.isPixel()) + add(clus.pixelCluster(), *thit.detUnit(),v); + else + add(clus.stripCluster(), *thit.detUnit(),v); + } + + template + void add(const Cluster& cluster, const GeomDetUnit& gd, T const & v) { m_map[encode(cluster,gd)] = v; } + + template + T const & get(const Cluster& cluster, const GeomDetUnit& gd) const { + auto p = m_map.find(encode(cluster,gd)); + if (p!=m_map.end()) { return (*p).second; } + error(gd); + return dummy; + } + + T const & get(uint32_t ind, uint16_t mr, uint16_t mc) const { + auto p = m_map.find(encode(ind,mr,mc)); + if (p!=m_map.end()) { return (*p).second; } + error(ind); + return dummy; + } + + static uint64_t encode(uint32_t ind, uint16_t mr, uint16_t mc) { + uint64_t u1 = ind; + uint64_t u2 = mr; + uint64_t u3 = mc; + return (u1<<32) | (u2<<16) | u3; + } + + static uint64_t encode(const SiPixelCluster& cluster, const GeomDetUnit& det) { + uint64_t u1 = det.index(); + uint64_t u2 = cluster.minPixelRow(); + uint64_t u3 = cluster.minPixelCol(); + return (u1<<32) | (u2<<16) | u3; + } + static uint64_t encode(const SiStripCluster& cluster, const GeomDetUnit& det) { + uint64_t u1 = det.index(); + uint64_t u2 = cluster.firstStrip(); + return (u1<<32) | u2; + } + + + std::unordered_map m_map; + T dummy; + }; + +#endif diff --git a/RecoPixelVertexing/PixelTriplets/plugins/gpuPixelDoublets.h b/RecoPixelVertexing/PixelTriplets/plugins/gpuPixelDoublets.h index f09ae6aba5efb..6290c47b9e1ef 100644 --- a/RecoPixelVertexing/PixelTriplets/plugins/gpuPixelDoublets.h +++ b/RecoPixelVertexing/PixelTriplets/plugins/gpuPixelDoublets.h @@ -11,151 +11,162 @@ #include "DataFormats/Math/interface/approx_atan2.h" #include "RecoLocalTracker/SiPixelRecHits/plugins/siPixelRecHitsHeterogeneousProduct.h" +#include "GPUCACell.h" +#include "HeterogeneousCore/CUDAUtilities/interface/GPUVecArray.h" + namespace gpuPixelDoublets { + constexpr uint32_t MaxNumOfDoublets = 1024*1024*256; + + template __device__ - std::pair - findPhiLimits(int16_t phiMe, int16_t * iphi, uint16_t * index, uint16_t size, int16_t iphicut) { + void doubletsFromHisto(uint8_t const * layerPairs, uint32_t nPairs, GPUCACell * cells, uint32_t * nCells, + int16_t const * iphi, Hist const * hist, uint32_t const * offsets, + siPixelRecHitsHeterogeneousProduct::HitsOnGPU const & hh, + GPU::VecArray< unsigned int, 2048> * isOuterHitOfCell, + int16_t const * phicuts, float const * minz, float const * maxz, float const * maxr) { + + auto layerSize = [=](uint8_t li) { return offsets[li+1]-offsets[li]; }; + + // to be optimized later + uint32_t innerLayerCumulativeSize[64]; + assert(nPairs<=64); + innerLayerCumulativeSize[0] = layerSize(layerPairs[0]); + for (uint32_t i=1; i0); + auto ntot = innerLayerCumulativeSize[nPairs-1]; - // find extreemes in top - int16_t minPhi = phiMe-iphicut; - int16_t maxPhi = phiMe+iphicut; - // std::cout << "\n phi min/max " << phiMe << ' ' << minPhi << ' ' << maxPhi << std::endl; + auto idx = blockIdx.x*blockDim.x + threadIdx.x; + for(auto j=idx;j::max())); - // std::cout << "jm for " << mPhi << ' ' << jm << std::endl; - jm = std::min(size-1,std::max(0,jm)); - bool notDone=true; - while(jm>0 && mPhiiphi[index[++jm]]){} - jm = std::min(size-1,std::max(0,jm)); - return jm; - }; + uint32_t pairLayerId=0; + while(j>=innerLayerCumulativeSize[pairLayerId++]); --pairLayerId; // move to lower_bound ?? - auto jmin = findLimit(minPhi); - auto jmax = findLimit(maxPhi); + assert(pairLayerId=innerLayerCumulativeSize[pairLayerId-1]); + uint8_t inner = layerPairs[2*pairLayerId]; + uint8_t outer = layerPairs[2*pairLayerId+1]; + assert(outer>inner); - /* - std::cout << "j min/max " << jmin << ' ' << jmax << std::endl; - std::cout << "found min/max " << iphi[index[jmin]] << ' ' << iphi[index[jmax]] << std::endl; - std::cout << "found min/max +1 " << iphi[index[jmin+1]] << ' ' << iphi[index[jmax+1]] << std::endl; - std::cout << "found min/max -1 " << iphi[index[jmin-1]] << ' ' << iphi[index[jmax-1]] << std::endl; - */ + auto i = (0==pairLayerId) ? j : j-innerLayerCumulativeSize[pairLayerId-1]; + i += offsets[inner]; - return std::make_pair(jmin,jmax); - } + // printf("Hit in Layer %d %d %d %d\n", i, inner, pairLayerId, j); + assert(i>=offsets[inner]); + assert(i=offsets[9]) { - // get rid of last layer - return; - } - - assert(0==offsets[0]); - int top = (i>offsets[5]) ? 5: 0; - while (i>=offsets[++top]){}; - assert(top<10); - auto bottom = top-1; - if (bottom == 3 or bottom == 6) { - // do not have UP... (9 we got rid already) - return; - } - assert(i >= offsets[bottom]); - assert(i < offsets[top]); - - if (index[i]>= (offsets[top]-offsets[bottom])) { - printf("index problem: %d %d %d %d %d\n",i, offsets[top], offsets[bottom], offsets[top]-offsets[bottom], index[i]); - return; - } + // found hit corresponding to our cuda thread!!!!! + // do the job - assert(index[i]::max()); - - auto jLimits = findPhiLimits(phiMe, iphi+offsets[top],index+offsets[top],size,iphicut); - - auto slidingWindow = [&](uint16_t mysize, uint16_t mymin,uint16_t mymax) { - auto topPhi = iphi+offsets[top]; - uint16_t imax = std::numeric_limits::max(); - uint16_t offset = (mymin>mymax) ? imax-(mysize-1) : 0; - int n=0; - for (uint16_t i = mymin+offset; i!=mymax; i++) { - assert(i<=imax); - uint16_t k = (i>mymax) ? i-offset : i; - assert(k=mymin || k2*iphicut && int16_t(phiMe-topPhi[k])>2*iphicut) - printf("deltaPhi problem: %d %d %d %d, deltas %d:%d cut %d\n",i,k,phiMe,topPhi[k],int16_t(topPhi[k]-phiMe),int16_t(phiMe-topPhi[k]),iphicut); - n++; - } - int tot = (mymin>mymax) ? (mysize-mymin)+mymax : mymax-mymin; - assert(n==tot); + auto mep = iphi[i]; + auto mez = hh.zg_d[i]; + auto mer = hh.rg_d[i]; + auto cutoff = [&](int j) { return + abs(hh.zg_d[j]-mez) > maxz[pairLayerId] || + abs(hh.zg_d[j]-mez) < minz[pairLayerId] || + hh.rg_d[j]-mer > maxr[pairLayerId]; }; - slidingWindow(size,jLimits.first,jLimits.second); - } - - template - __device__ - void doubletsFromHisto(int16_t const * iphi, Hist const * hist, uint32_t const * offsets, float phiCut) { - auto iphicut = phi2short(phiCut); - auto i = blockIdx.x*blockDim.x + threadIdx.x; - if (i>=offsets[9]) { - // get rid of last layer - return; - } + constexpr float z0cut = 12.f; + auto z0cutoff = [&](int j) { + auto zo = hh.zg_d[j]; + auto ro = hh.rg_d[j]; + auto dr = ro-mer; + return dr > maxr[pairLayerId] || + dr<0 || std::abs((mez*ro - mer*zo)) > z0cut*dr; + }; - assert(0==offsets[0]); - int top = (i>offsets[5]) ? 5: 0; - while (i>=offsets[++top]){}; - assert(top<10); - auto bottom = top-1; - if (bottom==3 || bottom==6) { - // do not have UP... (9 we got rid already) - return; - } - assert(i>=offsets[bottom]); - assert(i iphicut) + if (kk!=kl && kk!=kh) nmin+=hist[outer].size(kk); + for (auto p=hist[outer].begin(kk); p=offsets[outer]); + assert(oi iphicut) continue; + if (z0cutoff(oi)) continue; + auto ind = atomicInc(nCells,MaxNumOfDoublets); + // int layerPairId, int doubletId, int innerHitId,int outerHitId) + cells[ind].init(hh,pairLayerId,ind,i,oi); + isOuterHitOfCell[oi].push_back(ind); + if (isOuterHitOfCell[oi].full()) ++tooMany; ++tot; } } - if (0==hist[top].nspills) assert(tot>=nmin); + if (tooMany>0) printf("OuterHitOfCell full for %d in layer %d/%d, %d:%d %d,%d\n", i, inner,outer, kl,kh,nmin,tot); + + if (hist[outer].nspills>0) + printf("spill bin to be checked in %d %d\n",outer,hist[outer].nspills); + + // if (0==hist[outer].nspills) assert(tot>=nmin); // look in spill bin as well.... - } + + } // loop in block... + } __global__ - void getDoubletsFromHisto(siPixelRecHitsHeterogeneousProduct::HitsOnGPU const * hhp, float phiCut) { + void getDoubletsFromHisto(GPUCACell * cells, uint32_t * nCells, siPixelRecHitsHeterogeneousProduct::HitsOnGPU const * hhp, + GPU::VecArray< unsigned int, 2048> *isOuterHitOfCell) { + + uint8_t const layerPairs[2*13] = {0,1 ,1,2 ,2,3 + // ,0,4 ,1,4 ,2,4 ,4,5 ,5,6 + ,0,7 ,1,7 ,2,7 ,7,8 ,8,9 + ,0,4 ,1,4 ,2,4 ,4,5 ,5,6 + }; + + const int16_t phi0p05 = phi2short(0.05); + const int16_t phi0p06 = phi2short(0.06); + const int16_t phi0p07 = phi2short(0.07); + + int16_t const phicuts[13] { phi0p05, phi0p05, phi0p06 + ,phi0p07, phi0p06, phi0p06, phi0p05, phi0p05 + ,phi0p07, phi0p06, phi0p06, phi0p05, phi0p05 + }; + + float const minz[13] = { 0., 0., 0. + ,0., 0., 0., 0., 0. + ,0., 0., 0., 0., 0. + }; + + float const maxz[13] = { 20.,15.,12. + ,30.,20.,20., 50., 50. + ,30.,20.,20., 50., 50. + }; + + float const maxr[13] = { 20., 20., 20. + ,9., 7., 6., 5., 5. + ,9., 7., 6., 5., 5. + }; + + auto const & hh = *hhp; - doubletsFromHisto(hh.iphi_d,hh.hist_d,hh.hitsLayerStart_d,phiCut); + doubletsFromHisto(layerPairs, 13, cells, nCells, + hh.iphi_d,hh.hist_d,hh.hitsLayerStart_d, + hh, isOuterHitOfCell, + phicuts, minz, maxz, maxr); } + + } // namespace end #endif // RecoLocalTracker_SiPixelRecHits_plugins_gpuPixelDouplets_h