From 0444702d2d884e853c30d9ad7e207e8dfb737ecd Mon Sep 17 00:00:00 2001 From: Andrea Bocci Date: Fri, 5 Jul 2019 11:59:12 +0200 Subject: [PATCH] Port the whole pixel workflow to new heterogeneous framework (cms-patatrack#384) - port the whole pixel workflow to new heterogeneous framework - implement a legacy cluster to SoA converter for the pixel RecHits - update the vertex producer to run on CPU as well as GPU --- CUDADataFormats/Track/BuildFile.xml | 10 + .../Track/interface/PixelTrackHeterogeneous.h | 79 ++ .../Track/interface/TrajectoryStateSoA.h | 65 ++ CUDADataFormats/Track/src/classes.h | 10 + CUDADataFormats/Track/src/classes_def.xml | 8 + CUDADataFormats/Track/test/BuildFile.xml | 13 + .../Track/test/TrajectoryStateSOA_t.cpp | 1 + .../Track/test/TrajectoryStateSOA_t.cu | 1 + .../Track/test/TrajectoryStateSOA_t.h | 77 ++ .../customizePixelTracksForProfiling.py | 55 +- .../PixelTrackFitting/interface/FitUtils.h | 37 +- .../PixelTrackFitting/plugins/BuildFile.xml | 1 + .../plugins/PixelTrackDumpCUDA.cc | 94 ++ .../plugins/PixelTrackProducerFromSoA.cc | 214 ++++ .../plugins/PixelTrackSoAFromCUDA.cc | 95 ++ .../PixelTrackFitting/plugins/storeTracks.h | 10 +- .../python/PixelTracks_cff.py | 24 +- .../test/testEigenJacobian.cpp | 2 +- .../plugins/BrokenLineFitOnGPU.cu | 69 +- .../PixelTriplets/plugins/BuildFile.xml | 1 + .../PixelTriplets/plugins/CAHitNtupletCUDA.cc | 82 ++ .../plugins/CAHitNtupletGeneratorKernels.cu | 933 ++++++++++++++++++ .../plugins/CAHitNtupletGeneratorKernels.h | 186 ++++ .../CAHitNtupletGeneratorKernelsAlloc.cu | 40 + .../plugins/CAHitNtupletGeneratorOnGPU.cc | 164 +++ .../plugins/CAHitNtupletGeneratorOnGPU.h | 73 ++ .../PixelTriplets/plugins/GPUCACell.h | 12 +- .../PixelTriplets/plugins/HelixFitOnGPU.cc | 10 +- .../PixelTriplets/plugins/HelixFitOnGPU.h | 23 +- .../PixelTriplets/plugins/RiemannFitOnGPU.cu | 67 +- .../plugins/SeedProducerFromSoA.cc | 177 ++++ 31 files changed, 2495 insertions(+), 138 deletions(-) create mode 100644 CUDADataFormats/Track/BuildFile.xml create mode 100644 CUDADataFormats/Track/interface/PixelTrackHeterogeneous.h create mode 100644 CUDADataFormats/Track/interface/TrajectoryStateSoA.h create mode 100644 CUDADataFormats/Track/src/classes.h create mode 100644 CUDADataFormats/Track/src/classes_def.xml create mode 100644 CUDADataFormats/Track/test/BuildFile.xml create mode 100644 CUDADataFormats/Track/test/TrajectoryStateSOA_t.cpp create mode 100644 CUDADataFormats/Track/test/TrajectoryStateSOA_t.cu create mode 100644 CUDADataFormats/Track/test/TrajectoryStateSOA_t.h create mode 100644 RecoPixelVertexing/PixelTrackFitting/plugins/PixelTrackDumpCUDA.cc create mode 100644 RecoPixelVertexing/PixelTrackFitting/plugins/PixelTrackProducerFromSoA.cc create mode 100644 RecoPixelVertexing/PixelTrackFitting/plugins/PixelTrackSoAFromCUDA.cc create mode 100644 RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletCUDA.cc create mode 100644 RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorKernels.cu create mode 100644 RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorKernels.h create mode 100644 RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorKernelsAlloc.cu create mode 100644 RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorOnGPU.cc create mode 100644 RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorOnGPU.h create mode 100644 RecoTracker/TkSeedGenerator/plugins/SeedProducerFromSoA.cc diff --git a/CUDADataFormats/Track/BuildFile.xml b/CUDADataFormats/Track/BuildFile.xml new file mode 100644 index 0000000000000..521ea8fe29753 --- /dev/null +++ b/CUDADataFormats/Track/BuildFile.xml @@ -0,0 +1,10 @@ + + + + + + + + + + diff --git a/CUDADataFormats/Track/interface/PixelTrackHeterogeneous.h b/CUDADataFormats/Track/interface/PixelTrackHeterogeneous.h new file mode 100644 index 0000000000000..a576604b6e935 --- /dev/null +++ b/CUDADataFormats/Track/interface/PixelTrackHeterogeneous.h @@ -0,0 +1,79 @@ +#ifndef CUDADataFormatsTrackTrackHeterogeneous_H +#define CUDADataFormatsTrackTrackHeterogeneous_H + +#include "CUDADataFormats/Track/interface/TrajectoryStateSoA.h" +#include "HeterogeneousCore/CUDAUtilities/interface/HistoContainer.h" + +#include "CUDADataFormats/Common/interface/HeterogeneousSoA.h" + +namespace trackQuality { + enum Quality : uint8_t { bad=0, dup, loose, strict, tight, highPurity }; +} + +template +class TrackSoAT { +public: + + static constexpr int32_t stride() { return S; } + + using Quality = trackQuality::Quality; + using hindex_type = uint16_t; + using HitContainer = OneToManyAssoc; + + // Always check quality is at least loose! + // CUDA does not support enums in __lgc ... + eigenSoA::ScalarSoA m_quality; + constexpr Quality quality(int32_t i) const { return (Quality)(m_quality(i));} + constexpr Quality & quality(int32_t i) { return (Quality&)(m_quality(i));} + constexpr Quality const * qualityData() const { return (Quality const *)(m_quality.data());} + constexpr Quality * qualityData() { return (Quality*)(m_quality.data());} + + + // this is chi2/ndof as not necessarely all hits are used in the fit + eigenSoA::ScalarSoA chi2; + + constexpr int nHits(int i) const { return detIndices.size(i);} + + // State at the Beam spot + // phi,tip,1/pt,cotan(theta),zip + TrajectoryStateSoA stateAtBS; + eigenSoA::ScalarSoA eta; + eigenSoA::ScalarSoA pt; + constexpr float charge(int32_t i) const { return std::copysign(1.f,stateAtBS.state(i)(2)); } + constexpr float phi(int32_t i) const { return stateAtBS.state(i)(0); } + constexpr float tip(int32_t i) const { return stateAtBS.state(i)(1); } + constexpr float zip(int32_t i) const { return stateAtBS.state(i)(4); } + + // state at the detector of the outermost hit + // representation to be decided... + // not yet filled on GPU + // TrajectoryStateSoA stateAtOuterDet; + + HitContainer hitIndices; + HitContainer detIndices; + + // total number of tracks (including those not fitted) + uint32_t m_nTracks; + +}; + +namespace pixelTrack{ + +#ifdef GPU_SMALL_EVENTS + constexpr uint32_t maxNumber() { return 2 * 1024;} +#else + constexpr uint32_t maxNumber() { return 32 * 1024;} +#endif + + using TrackSoA = TrackSoAT; + using TrajectoryState = TrajectoryStateSoA; + using HitContainer = TrackSoA::HitContainer; + using Quality = trackQuality::Quality; + +} + +using PixelTrackHeterogeneous = HeterogeneousSoA; + + +#endif // CUDADataFormatsTrackTrackSoA_H + diff --git a/CUDADataFormats/Track/interface/TrajectoryStateSoA.h b/CUDADataFormats/Track/interface/TrajectoryStateSoA.h new file mode 100644 index 0000000000000..a6553ff96cb0b --- /dev/null +++ b/CUDADataFormats/Track/interface/TrajectoryStateSoA.h @@ -0,0 +1,65 @@ +#ifndef CUDADataFormatsTrackTrajectoryStateSOA_H +#define CUDADataFormatsTrackTrajectoryStateSOA_H + +#include +#include "HeterogeneousCore/CUDAUtilities/interface/eigenSoA.h" + +template +struct TrajectoryStateSoA { + + using Vector5f = Eigen::Matrix; + using Vector15f = Eigen::Matrix; + + using Vector5d = Eigen::Matrix; + using Matrix5d = Eigen::Matrix; + + + static constexpr int32_t stride() { return S; } + + eigenSoA::MatrixSoA state; + eigenSoA::MatrixSoA covariance; + + + template + __host__ __device__ + void copyFromCircle(V3 const & cp, M3 const & ccov, V2 const & lp, M2 const & lcov, float b, int32_t i) { + state(i) << cp.template cast(), lp.template cast(); + state(i)(2) *=b; + auto cov = covariance(i); + cov(0) = ccov(0,0); + cov(1) = ccov(0,1); + cov(2) = b*float(ccov(0,2)); + cov(4) = cov(3) = 0; + cov(5) = ccov(1,1); + cov(6) = b*float(ccov(1,2)); + cov(8) = cov(7) = 0; + cov(9) = b*b*float(ccov(2,2)); + cov(11) = cov(10) = 0; + cov(12) = lcov(0,0); + cov(13) = lcov(0,1); + cov(14) = lcov(1,1); + } + + + template + __host__ __device__ + void copyFromDense(V5 const & v, M5 const & cov, int32_t i) { + state(i) = v.template cast(); + for(int j=0, ind=0; j<5; ++j) for (auto k=j;k<5;++k) covariance(i)(ind++) = cov(j,k); + } + + template + __host__ __device__ + void copyToDense(V5 & v, M5 & cov, int32_t i) const { + v = state(i).template cast(); + for(int j=0, ind=0; j<5; ++j) { + cov(j,j) = covariance(i)(ind++); + for (auto k=j+1;k<5;++k) cov(k,j)=cov(j,k) = covariance(i)(ind++); + } + } + +}; + +#endif // CUDADataFormatsTrackTrajectoryStateSOA_H + + diff --git a/CUDADataFormats/Track/src/classes.h b/CUDADataFormats/Track/src/classes.h new file mode 100644 index 0000000000000..7f89096977e64 --- /dev/null +++ b/CUDADataFormats/Track/src/classes.h @@ -0,0 +1,10 @@ +#ifndef CUDADataFormats__src_classes_h +#define CUDADataFormats__src_classes_h + +#include "CUDADataFormats/Common/interface/CUDAProduct.h" +#include "CUDADataFormats/Common/interface/HostProduct.h" +#include "CUDADataFormats/Track/interface/PixelTrackHeterogeneous.h" +#include "CUDADataFormats/Common/interface/ArrayShadow.h" +#include "DataFormats/Common/interface/Wrapper.h" + +#endif diff --git a/CUDADataFormats/Track/src/classes_def.xml b/CUDADataFormats/Track/src/classes_def.xml new file mode 100644 index 0000000000000..a4c2e766582dd --- /dev/null +++ b/CUDADataFormats/Track/src/classes_def.xml @@ -0,0 +1,8 @@ + + + + + + + + diff --git a/CUDADataFormats/Track/test/BuildFile.xml b/CUDADataFormats/Track/test/BuildFile.xml new file mode 100644 index 0000000000000..598b345d4709d --- /dev/null +++ b/CUDADataFormats/Track/test/BuildFile.xml @@ -0,0 +1,13 @@ + + + + + + + + + + + + + diff --git a/CUDADataFormats/Track/test/TrajectoryStateSOA_t.cpp b/CUDADataFormats/Track/test/TrajectoryStateSOA_t.cpp new file mode 100644 index 0000000000000..d6ff539a642b0 --- /dev/null +++ b/CUDADataFormats/Track/test/TrajectoryStateSOA_t.cpp @@ -0,0 +1 @@ +#include "TrajectoryStateSOA_t.h" diff --git a/CUDADataFormats/Track/test/TrajectoryStateSOA_t.cu b/CUDADataFormats/Track/test/TrajectoryStateSOA_t.cu new file mode 100644 index 0000000000000..d6ff539a642b0 --- /dev/null +++ b/CUDADataFormats/Track/test/TrajectoryStateSOA_t.cu @@ -0,0 +1 @@ +#include "TrajectoryStateSOA_t.h" diff --git a/CUDADataFormats/Track/test/TrajectoryStateSOA_t.h b/CUDADataFormats/Track/test/TrajectoryStateSOA_t.h new file mode 100644 index 0000000000000..adefb57d7bbe5 --- /dev/null +++ b/CUDADataFormats/Track/test/TrajectoryStateSOA_t.h @@ -0,0 +1,77 @@ +#include "CUDADataFormats/Track/interface/TrajectoryStateSoA.h" + +using Vector5d = Eigen::Matrix; +using Matrix5d = Eigen::Matrix; + +__host__ __device__ +Matrix5d loadCov(Vector5d const & e) { + Matrix5d cov; + for (int i=0; i<5; ++i) cov(i,i) = e(i)*e(i); + for (int i = 0; i < 5; ++i) { + for (int j = 0; j < i; ++j) { + double v = 0.3*std::sqrt( cov(i,i) * cov(j,j) ); // this makes the matrix pos defined + cov(i,j) = (i+j)%2 ? -0.4*v : 0.1*v; + cov(j,i) = cov(i,j); + } + } + return cov; +} + + +using TS = TrajectoryStateSoA<128>; + +__global__ void testTSSoA(TS * pts, int n) { + + assert(n<=128); + + Vector5d par0; par0 << 0.2,0.1,3.5,0.8,0.1; + Vector5d e0; e0 << 0.01,0.01,0.035,-0.03,-0.01; + auto cov0 = loadCov(e0); + + TS & ts = *pts; + + int first = threadIdx.x + blockIdx.x * blockDim.x; + + for (int i = first; i < n; i += blockDim.x * gridDim.x) { + ts.copyFromDense(par0,cov0,i); + Vector5d par1; Matrix5d cov1; + ts.copyToDense(par1,cov1,i); + Vector5d delV = par1-par0; + Matrix5d delM = cov1-cov0; + for(int j=0; j<5; ++j) { + assert(std::abs(delV(j))<1.e-5); + for (auto k=j;k<5;++k) { + assert(cov0(k,j)==cov0(j,k)); + assert(cov1(k,j)==cov1(j,k)); + assert(std::abs(delM(k,j))<1.e-5); + } + } + + } +} + +#ifdef __CUDACC__ +#include "HeterogeneousCore/CUDAUtilities/interface/exitSansCUDADevices.h" +#include "HeterogeneousCore/CUDAUtilities/interface/cudaCheck.h" +#endif + +int main() { +#ifdef __CUDACC__ + exitSansCUDADevices(); +#endif + + + TS ts; + +#ifdef __CUDACC__ + TS * ts_d; + cudaCheck(cudaMalloc(&ts_d, sizeof(TS))); + testTSSoA<<<1, 64>>>(ts_d,128); + cudaCheck(cudaGetLastError()); + cudaCheck(cudaMemcpy(&ts, ts_d, sizeof(TS), cudaMemcpyDefault)); + cudaCheck(cudaDeviceSynchronize()); +#else + testTSSoA(&ts,128); +#endif + +} diff --git a/RecoPixelVertexing/Configuration/python/customizePixelTracksForProfiling.py b/RecoPixelVertexing/Configuration/python/customizePixelTracksForProfiling.py index 1021918c0ce6c..ce97de6650244 100644 --- a/RecoPixelVertexing/Configuration/python/customizePixelTracksForProfiling.py +++ b/RecoPixelVertexing/Configuration/python/customizePixelTracksForProfiling.py @@ -1,6 +1,41 @@ import FWCore.ParameterSet.Config as cms -def customizePixelTracksForProfiling(process): +def customizePixelTracksForProfilingGPUOnly(process): + process.MessageLogger.cerr.FwkReport.reportEvery = 100 + + process.Raw2Hit = cms.Path(process.offlineBeamSpot+process.offlineBeamSpotCUDA+process.siPixelClustersCUDAPreSplitting+process.siPixelRecHitsCUDAPreSplitting) + + process.load('RecoPixelVertexing.PixelTriplets.caHitNtupletCUDA_cfi') + process.load('RecoPixelVertexing.PixelVertexFinding.pixelVertexCUDA_cfi') + process.TVreco = cms.Path(process.caHitNtupletCUDA+process.pixelVertexCUDA) + + process.schedule = cms.Schedule(process.Raw2Hit, process.TVreco) + return process + +def customizePixelTracksForProfilingSoAonCPU(process): + process = customizePixelTracksForProfilingGPUOnly(process) + + process.pixelVertexSoA = process.pixelVertexCUDA.clone() + process.pixelVertexSoA.onGPU = False + process.pixelVertexSoA.pixelTrackSrc = 'pixelTrackSoA' + process.TVSoAreco = cms.Path(process.caHitNtupletCUDA+process.pixelTrackSoA+process.pixelVertexSoA) + + process.schedule = cms.Schedule(process.Raw2Hit, process.TVSoAreco) + + return process + +def customizePixelTracksForProfilingEnableTransfer(process): + process = customizePixelTracksForProfilingGPUOnly(process) + + process.load('RecoPixelVertexing.PixelTrackFitting.pixelTrackSoA_cfi') + process.load('RecoPixelVertexing.PixelVertexFinding.pixelVertexSoA_cfi') + process.toSoA = cms.Path(process.pixelTrackSoA+process.pixelVertexSoA) + + process.schedule = cms.Schedule(process.Raw2Hit, process.TVreco, process.toSoA) + return process + +def customizePixelTracksForProfilingEnableConversion(process): + # use old trick of output path process.MessageLogger.cerr.FwkReport.reportEvery = 100 process.out = cms.OutputModule("AsciiOutputModule", @@ -17,21 +52,3 @@ def customizePixelTracksForProfiling(process): return process -def customizePixelTracksForProfilingDisableConversion(process): - process = customizePixelTracksForProfiling(process) - - # Disable conversions to legacy - process.pixelTracksHitQuadruplets.gpuEnableConversion = False - process.pixelTracks.gpuEnableConversion = False - process.pixelVertices.gpuEnableConversion = False - - return process - -def customizePixelTracksForProfilingDisableTransfer(process): - process = customizePixelTracksForProfilingDisableConversion(process) - - # Disable "unnecessary" transfers to CPU - process.pixelTracksHitQuadruplets.gpuEnableTransfer = False - process.pixelVertices.gpuEnableTransfer = False - - return process diff --git a/RecoPixelVertexing/PixelTrackFitting/interface/FitUtils.h b/RecoPixelVertexing/PixelTrackFitting/interface/FitUtils.h index e92c46f654615..e44a58f676106 100644 --- a/RecoPixelVertexing/PixelTrackFitting/interface/FitUtils.h +++ b/RecoPixelVertexing/PixelTrackFitting/interface/FitUtils.h @@ -189,18 +189,43 @@ namespace Rfit { circle.par = par_pak; } + /*! + \brief Transform circle parameter from (X0,Y0,R) to (phi,Tip,q/R) and + consequently covariance matrix. + \param circle_uvr parameter (X0,Y0,R), covariance matrix to + be transformed and particle charge. + */ + __host__ __device__ inline void fromCircleToPerigee(circle_fit& circle) { + Vector3d par_pak; + const double temp0 = circle.par.head(2).squaredNorm(); + const double temp1 = sqrt(temp0); + par_pak << atan2(circle.q * circle.par(0), -circle.q * circle.par(1)), circle.q * (temp1 - circle.par(2)), + circle.q/circle.par(2); + + const double temp2 = sqr(circle.par(0)) * 1. / temp0; + const double temp3 = 1. / temp1 * circle.q; + Matrix3d J4; + J4 << -circle.par(1) * temp2 * 1. / sqr(circle.par(0)), temp2 * 1. / circle.par(0), 0., circle.par(0) * temp3, + circle.par(1) * temp3, -circle.q, 0., 0., -circle.q/(circle.par(2)*circle.par(2)); + circle.cov = J4 * circle.cov * J4.transpose(); + + circle.par = par_pak; + } + + + // transformation between the "perigee" to cmssw localcoord frame // the plane of the latter is the perigee plane... - // from //!<(phi,Tip,pt,cotan(theta)),Zip) + // from //!<(phi,Tip,q/pt,cotan(theta)),Zip) // to q/p,dx/dz,dy/dz,x,z - template - __host__ __device__ inline void transformToPerigeePlane(V5 const & ip, M5 const & icov, V5 & op, M5 & ocov, double charge) { + template + __host__ __device__ inline void transformToPerigeePlane(VI5 const & ip, MI5 const & icov, VO5 & op, MO5 & ocov) { auto sinTheta2 = 1./(1.+ip(3)*ip(3)); auto sinTheta = std::sqrt(sinTheta2); auto cosTheta = ip(3)*sinTheta; - op(0) = charge*sinTheta/ip(2); + op(0) = sinTheta*ip(2); op(1) = 0.; op(2) = -ip(3); op(3) = ip(1); @@ -208,8 +233,8 @@ namespace Rfit { Matrix5d J = Matrix5d::Zero(); - J(0,2) = -charge*sinTheta/(ip(2)*ip(2)); - J(0,3) = -charge*sinTheta2*cosTheta/ip(2); + J(0,2) = sinTheta; + J(0,3) = -sinTheta2*cosTheta*ip(2); J(1,0) = 1.; J(2,3) = -1.; J(3,1) = 1.; diff --git a/RecoPixelVertexing/PixelTrackFitting/plugins/BuildFile.xml b/RecoPixelVertexing/PixelTrackFitting/plugins/BuildFile.xml index 62a8e8541aa64..8c0261ee0d999 100644 --- a/RecoPixelVertexing/PixelTrackFitting/plugins/BuildFile.xml +++ b/RecoPixelVertexing/PixelTrackFitting/plugins/BuildFile.xml @@ -2,6 +2,7 @@ + diff --git a/RecoPixelVertexing/PixelTrackFitting/plugins/PixelTrackDumpCUDA.cc b/RecoPixelVertexing/PixelTrackFitting/plugins/PixelTrackDumpCUDA.cc new file mode 100644 index 0000000000000..025e7abd99cf1 --- /dev/null +++ b/RecoPixelVertexing/PixelTrackFitting/plugins/PixelTrackDumpCUDA.cc @@ -0,0 +1,94 @@ +#include + +#include "CUDADataFormats/Common/interface/CUDAProduct.h" +#include "DataFormats/Common/interface/Handle.h" +#include "FWCore/Framework/interface/ESHandle.h" +#include "FWCore/Framework/interface/Event.h" +#include "FWCore/Framework/interface/EventSetup.h" +#include "FWCore/Framework/interface/MakerMacros.h" +#include "FWCore/Framework/interface/global/EDAnalyzer.h" +#include "FWCore/Framework/interface/ConsumesCollector.h" +#include "FWCore/ParameterSet/interface/ConfigurationDescriptions.h" +#include "FWCore/ParameterSet/interface/ParameterSet.h" +#include "FWCore/ParameterSet/interface/ParameterSetDescription.h" +#include "FWCore/Utilities/interface/InputTag.h" +#include "FWCore/PluginManager/interface/ModuleDef.h" +#include "FWCore/ServiceRegistry/interface/Service.h" +#include "FWCore/Utilities/interface/EDGetToken.h" +#include "FWCore/Utilities/interface/RunningAverage.h" +#include "HeterogeneousCore/CUDACore/interface/CUDAScopedContext.h" +#include "HeterogeneousCore/CUDACore/interface/GPUCuda.h" +#include "HeterogeneousCore/CUDAServices/interface/CUDAService.h" +#include "RecoTracker/TkMSParametrization/interface/PixelRecoUtilities.h" + +#include "CUDADataFormats/Vertex/interface/ZVertexHeterogeneous.h" +#include "CUDADataFormats/Track/interface/PixelTrackHeterogeneous.h" +#include "CUDADataFormats/TrackingRecHit/interface/TrackingRecHit2DCUDA.h" + + +class PixelTrackDumpCUDA : public edm::global::EDAnalyzer<> { +public: + explicit PixelTrackDumpCUDA(const edm::ParameterSet& iConfig); + ~PixelTrackDumpCUDA() override = default; + + static void fillDescriptions(edm::ConfigurationDescriptions& descriptions); + +private: + void analyze(edm::StreamID streamID, edm::Event const & iEvent, const edm::EventSetup& iSetup) const override; + const bool m_onGPU; + edm::EDGetTokenT> tokenGPUTrack_; + edm::EDGetTokenT> tokenGPUVertex_; + edm::EDGetTokenT tokenSoATrack_; + edm::EDGetTokenT tokenSoAVertex_; + + +}; + +PixelTrackDumpCUDA::PixelTrackDumpCUDA(const edm::ParameterSet& iConfig) : + m_onGPU(iConfig.getParameter("onGPU")) { + if (m_onGPU) { + tokenGPUTrack_ = consumes>(iConfig.getParameter("pixelTrackSrc")); + tokenGPUVertex_ = consumes>(iConfig.getParameter("pixelVertexSrc")); + } else { + tokenSoATrack_ = consumes(iConfig.getParameter("pixelTrackSrc")); + tokenSoAVertex_ = consumes(iConfig.getParameter("pixelVertexSrc")); + } +} + +void PixelTrackDumpCUDA::fillDescriptions(edm::ConfigurationDescriptions& descriptions) { + edm::ParameterSetDescription desc; + + desc.add("onGPU",true); + desc.add("pixelTrackSrc", edm::InputTag("caHitNtupletCUDA")); + desc.add("pixelVertexSrc", edm::InputTag("pixelVertexCUDA")); + descriptions.add("pixelTrackDumpCUDA", desc); +} + +void PixelTrackDumpCUDA::analyze(edm::StreamID streamID, edm::Event const & iEvent, const edm::EventSetup& iSetup) const { + if (m_onGPU) { + + auto const & hTracks = iEvent.get(tokenGPUTrack_); + CUDAScopedContextProduce ctx{hTracks}; + + auto const& tracks = ctx.get(hTracks); + auto const * tsoa = tracks.get(); + assert(tsoa); + + auto const& vertices = ctx.get(iEvent.get(tokenGPUVertex_)); + auto const * vsoa = vertices.get(); + assert(vsoa); + + } else { + auto const * tsoa = iEvent.get(tokenSoATrack_).get(); + assert(tsoa); + + auto const * vsoa = iEvent.get(tokenSoAVertex_).get(); + assert(vsoa); + + } + +} + + +DEFINE_FWK_MODULE(PixelTrackDumpCUDA); + diff --git a/RecoPixelVertexing/PixelTrackFitting/plugins/PixelTrackProducerFromSoA.cc b/RecoPixelVertexing/PixelTrackFitting/plugins/PixelTrackProducerFromSoA.cc new file mode 100644 index 0000000000000..284bcfc2ebb51 --- /dev/null +++ b/RecoPixelVertexing/PixelTrackFitting/plugins/PixelTrackProducerFromSoA.cc @@ -0,0 +1,214 @@ +#include "DataFormats/BeamSpot/interface/BeamSpot.h" +#include "DataFormats/Common/interface/OrphanHandle.h" +#include "DataFormats/TrackReco/interface/Track.h" +#include "DataFormats/TrackReco/interface/TrackExtra.h" +#include "DataFormats/TrackReco/interface/TrackFwd.h" +#include "DataFormats/TrackerCommon/interface/TrackerTopology.h" +#include "DataFormats/TrajectoryState/interface/LocalTrajectoryParameters.h" +#include "DataFormats/GeometrySurface/interface/Plane.h" +#include "DataFormats/TrackerRecHit2D/interface/SiPixelRecHitCollection.h" +#include "FWCore/Framework/interface/ESHandle.h" +#include "FWCore/Framework/interface/Event.h" +#include "FWCore/Framework/interface/EventSetup.h" +#include "FWCore/Framework/interface/MakerMacros.h" +#include "FWCore/Framework/interface/global/EDProducer.h" +#include "FWCore/Framework/interface/ConsumesCollector.h" +#include "FWCore/ParameterSet/interface/ConfigurationDescriptions.h" +#include "FWCore/ParameterSet/interface/ParameterSet.h" +#include "FWCore/ParameterSet/interface/ParameterSetDescription.h" +#include "FWCore/Utilities/interface/InputTag.h" +#include "FWCore/PluginManager/interface/ModuleDef.h" +#include "FWCore/ServiceRegistry/interface/Service.h" +#include "FWCore/Utilities/interface/EDGetToken.h" +#include "Geometry/Records/interface/TrackerTopologyRcd.h" +#include "MagneticField/Records/interface/IdealMagneticFieldRecord.h" +#include "HeterogeneousCore/CUDACore/interface/GPUCuda.h" +#include "HeterogeneousCore/CUDAServices/interface/CUDAService.h" + +#include "TrackingTools/AnalyticalJacobians/interface/JacobianLocalToCurvilinear.h" +#include "TrackingTools/TrajectoryParametrization/interface/GlobalTrajectoryParameters.h" +#include "TrackingTools/TrajectoryParametrization/interface/CurvilinearTrajectoryError.h" +#include "RecoPixelVertexing/PixelTrackFitting/interface/FitUtils.h" + +#include "CUDADataFormats/Common/interface/HostProduct.h" +#include "CUDADataFormats/Track/interface/PixelTrackHeterogeneous.h" +#include "CUDADataFormats/SiPixelCluster/interface/gpuClusteringConstants.h" + +#include "storeTracks.h" +#include "CUDADataFormats/Common/interface/ArrayShadow.h" + + +/** + * This class creates "leagcy" reco::Track + * objects from the output of GPU CA. + */ +class PixelTrackProducerFromSoA : public edm::global::EDProducer<> { +public: + + using IndToEdm = std::vector; + + explicit PixelTrackProducerFromSoA(const edm::ParameterSet &iConfig); + ~PixelTrackProducerFromSoA() override = default; + + static void fillDescriptions(edm::ConfigurationDescriptions &descriptions); + + using HitModuleStart = std::array; + using HMSstorage = ArrayShadow; + + +private: + void produce(edm::StreamID streamID, edm::Event& iEvent, const edm::EventSetup& iSetup) const override; + + edm::EDGetTokenT tBeamSpot_; + edm::EDGetTokenT tokenTrack_; + edm::EDGetTokenT cpuHits_; + edm::EDGetTokenT hmsToken_; + + int32_t minNumberOfHits_; +}; + +PixelTrackProducerFromSoA::PixelTrackProducerFromSoA(const edm::ParameterSet &iConfig) : + tBeamSpot_(consumes(iConfig.getParameter("beamSpot"))), + tokenTrack_(consumes(iConfig.getParameter("trackSrc"))), + cpuHits_(consumes(iConfig.getParameter("pixelRecHitLegacySrc"))), + hmsToken_(consumes(iConfig.getParameter("pixelRecHitLegacySrc"))), + minNumberOfHits_(iConfig.getParameter("minNumberOfHits")) +{ + produces(); + produces(); + produces(); + produces(); +} + +void PixelTrackProducerFromSoA::fillDescriptions(edm::ConfigurationDescriptions &descriptions) { + edm::ParameterSetDescription desc; + desc.add("beamSpot", edm::InputTag("offlineBeamSpot")); + desc.add("trackSrc", edm::InputTag("pixelTrackSoA")); + desc.add("pixelRecHitLegacySrc", edm::InputTag("siPixelRecHitsLegacyPreSplitting")); + desc.add("minNumberOfHits", 0); + + descriptions.addWithDefaultLabel(desc); +} + + +void PixelTrackProducerFromSoA::produce(edm::StreamID streamID, edm::Event& iEvent, const edm::EventSetup& iSetup) const { + // std::cout << "Converting gpu helix in reco tracks" << std::endl; + + auto indToEdmP = std::make_unique(); + auto & indToEdm = *indToEdmP; + + edm::ESHandle fieldESH; + iSetup.get().get(fieldESH); + + pixeltrackfitting::TracksWithRecHits tracks; + edm::ESHandle httopo; + iSetup.get().get(httopo); + + + edm::Handle bsHandle; + iEvent.getByToken(tBeamSpot_, bsHandle); + const auto &bsh = *bsHandle; + // std::cout << "beamspot " << bsh.x0() << ' ' << bsh.y0() << ' ' << bsh.z0() << std::endl; + GlobalPoint bs(bsh.x0(), bsh.y0(), bsh.z0()); + + edm::Handle hhms; + iEvent.getByToken(hmsToken_,hhms); + auto const & hitsModuleStart = *hhms; + + auto fc = hitsModuleStart.data; + + edm::Handle gh; + iEvent.getByToken(cpuHits_, gh); + auto const &rechits = *gh; + std::vector hitmap; + auto const &rcs = rechits.data(); + auto nhits = rcs.size(); + hitmap.resize(nhits,nullptr); + for (auto const &h : rcs) { + auto const &thit = static_cast(h); + auto detI = thit.det()->index(); + auto const &clus = thit.firstClusterRef(); + assert(clus.isPixel()); + auto i = fc[detI] + clus.pixelCluster().originalId(); + assert(i < nhits); + assert(nullptr==hitmap[i]); + hitmap[i] = &h; + } + + std::vector hits; + hits.reserve(5); + + const auto & tsoa = *iEvent.get(tokenTrack_); + + auto const * quality = tsoa.qualityData(); + auto const & fit = tsoa.stateAtBS; + auto const & hitIndices = tsoa.hitIndices; + auto maxTracks =tsoa.stride(); + + int32_t nt = 0; + + for (int32_t it = 0; it < maxTracks; ++it) { + auto nHits = tsoa.nHits(it); + if (nHits == 0) break; // this is a guard: maybe we need to move to nTracks... + indToEdm.push_back(-1); + auto q = quality[it]; + if (q != trackQuality::loose) + continue; // FIXME + if (nHits< minNumberOfHits_) continue; + indToEdm.back() = nt; + ++nt; + + hits.resize(nHits); + auto b = hitIndices.begin(it); + for (int iHit = 0; iHit < nHits; ++iHit) + hits[iHit] = hitmap[*(b+iHit)]; + + // mind: this values are respect the beamspot! + + float chi2 = tsoa.chi2(it); + float phi = tsoa.phi(it); + + Rfit::Vector5d ipar,opar; + Rfit::Matrix5d icov,ocov; + fit.copyToDense(ipar,icov,it); + Rfit::transformToPerigeePlane(ipar,icov,opar,ocov); + + LocalTrajectoryParameters lpar(opar(0),opar(1),opar(2),opar(3),opar(4),1.); + AlgebraicSymMatrix55 m; + for(int i=0; i<5; ++i) for (int j=i; j<5; ++j) m(i,j) = ocov(i,j); + + float sp = std::sin(phi); + float cp = std::cos(phi); + Surface::RotationType rot( + sp, -cp, 0, + 0, 0, -1.f, + cp, sp, 0); + + Plane impPointPlane(bs,rot); + GlobalTrajectoryParameters gp(impPointPlane.toGlobal(lpar.position()), + impPointPlane.toGlobal(lpar.momentum()),lpar.charge(),fieldESH.product()); + JacobianLocalToCurvilinear jl2c(impPointPlane,lpar,*fieldESH.product()); + + AlgebraicSymMatrix55 mo = ROOT::Math::Similarity(jl2c.jacobian(),m); + + int ndof = 2*hits.size()-5; + chi2 = chi2*ndof; // FIXME + GlobalPoint vv = gp.position(); + math::XYZPoint pos( vv.x(), vv.y(), vv.z() ); + GlobalVector pp = gp.momentum(); + math::XYZVector mom( pp.x(), pp.y(), pp.z() ); + + auto track = std::make_unique ( chi2, ndof, pos, mom, + gp.charge(), CurvilinearTrajectoryError(mo)); + // filter??? + tracks.emplace_back(track.release(), hits); + } + // std::cout << "processed " << nt << " good tuples " << tracks.size() << "out of " << indToEdm.size() << std::endl; + + // store tracks + storeTracks(iEvent, tracks, *httopo); + iEvent.put(std::move(indToEdmP)); +} + + +DEFINE_FWK_MODULE(PixelTrackProducerFromSoA); diff --git a/RecoPixelVertexing/PixelTrackFitting/plugins/PixelTrackSoAFromCUDA.cc b/RecoPixelVertexing/PixelTrackFitting/plugins/PixelTrackSoAFromCUDA.cc new file mode 100644 index 0000000000000..c8dc04633f832 --- /dev/null +++ b/RecoPixelVertexing/PixelTrackFitting/plugins/PixelTrackSoAFromCUDA.cc @@ -0,0 +1,95 @@ +#include + +#include "CUDADataFormats/Common/interface/CUDAProduct.h" +#include "CUDADataFormats/Common/interface/HostProduct.h" +#include "DataFormats/Common/interface/Handle.h" +#include "FWCore/Framework/interface/ESHandle.h" +#include "FWCore/Framework/interface/Event.h" +#include "FWCore/Framework/interface/EventSetup.h" +#include "FWCore/Framework/interface/MakerMacros.h" +#include "FWCore/PluginManager/interface/ModuleDef.h" +#include "FWCore/Framework/interface/stream/EDProducer.h" +#include "FWCore/ParameterSet/interface/ConfigurationDescriptions.h" +#include "FWCore/ParameterSet/interface/ParameterSet.h" +#include "FWCore/ParameterSet/interface/ParameterSetDescription.h" +#include "FWCore/Utilities/interface/InputTag.h" +#include "FWCore/Utilities/interface/EDGetToken.h" +#include "HeterogeneousCore/CUDACore/interface/CUDAScopedContext.h" +#include "HeterogeneousCore/CUDACore/interface/GPUCuda.h" + + +#include "CUDADataFormats/Track/interface/PixelTrackHeterogeneous.h" + +class PixelTrackSoAFromCUDA : public edm::stream::EDProducer { +public: + explicit PixelTrackSoAFromCUDA(const edm::ParameterSet& iConfig); + ~PixelTrackSoAFromCUDA() override = default; + + static void fillDescriptions(edm::ConfigurationDescriptions& descriptions); + +private: + void acquire(edm::Event const& iEvent, + edm::EventSetup const& iSetup, + edm::WaitingTaskWithArenaHolder waitingTaskHolder) override; + void produce(edm::Event& iEvent, edm::EventSetup const& iSetup) override; + + + edm::EDGetTokenT> tokenCUDA_; + edm::EDPutTokenT tokenSOA_; + + cudautils::host::unique_ptr m_soa; + +}; + +PixelTrackSoAFromCUDA::PixelTrackSoAFromCUDA(const edm::ParameterSet& iConfig) : + tokenCUDA_(consumes>(iConfig.getParameter("src"))), + tokenSOA_(produces()) +{} + + +void PixelTrackSoAFromCUDA::fillDescriptions(edm::ConfigurationDescriptions& descriptions) { + + edm::ParameterSetDescription desc; + + desc.add("src", edm::InputTag("caHitNtupletCUDA")); + descriptions.add("pixelTrackSoA", desc); + +} + + +void PixelTrackSoAFromCUDA::acquire(edm::Event const& iEvent, + edm::EventSetup const& iSetup, + edm::WaitingTaskWithArenaHolder waitingTaskHolder) { + CUDAProduct const& inputDataWrapped = iEvent.get(tokenCUDA_); + CUDAScopedContextAcquire ctx{inputDataWrapped, std::move(waitingTaskHolder)}; + auto const& inputData = ctx.get(inputDataWrapped); + + m_soa = inputData.toHostAsync(ctx.stream()); + +} + +void PixelTrackSoAFromCUDA::produce(edm::Event& iEvent, edm::EventSetup const& iSetup) { + + /* + auto const & tsoa = *m_soa; + auto maxTracks = tsoa.stride(); + std::cout << "size of SoA" << sizeof(tsoa) << " stride " << maxTracks << std::endl; + + int32_t nt = 0; + for (int32_t it = 0; it < maxTracks; ++it) { + auto nHits = tsoa.nHits(it); + assert(nHits==int(tsoa.hitIndices.size(it))); + if (nHits == 0) break; // this is a guard: maybe we need to move to nTracks... + nt++; + } + std::cout << "found " << nt << " tracks in cpu SoA at " << &tsoa << std::endl; + */ + + // DO NOT make a copy (actually TWO....) + iEvent.emplace(tokenSOA_,PixelTrackHeterogeneous(std::move(m_soa))); + + assert(!m_soa); +} + + +DEFINE_FWK_MODULE(PixelTrackSoAFromCUDA); diff --git a/RecoPixelVertexing/PixelTrackFitting/plugins/storeTracks.h b/RecoPixelVertexing/PixelTrackFitting/plugins/storeTracks.h index 48abab5237587..13bdee8164780 100644 --- a/RecoPixelVertexing/PixelTrackFitting/plugins/storeTracks.h +++ b/RecoPixelVertexing/PixelTrackFitting/plugins/storeTracks.h @@ -16,8 +16,8 @@ #include "DataFormats/TrackerCommon/interface/TrackerTopology.h" #include "Geometry/Records/interface/TrackerTopologyRcd.h" -template -void storeTracks(Ev & ev, const pixeltrackfitting::TracksWithTTRHs& tracksWithHits, const TrackerTopology& ttopo) +template +void storeTracks(Ev & ev, const TWH& tracksWithHits, const TrackerTopology& ttopo) { auto tracks = std::make_unique(); auto recHits = std::make_unique(); @@ -27,12 +27,12 @@ void storeTracks(Ev & ev, const pixeltrackfitting::TracksWithTTRHs& tracksWithHi for (int i = 0; i < nTracks; i++) { - reco::Track* track = tracksWithHits.at(i).first; - const SeedingHitSet& hits = tracksWithHits.at(i).second; + reco::Track* track = tracksWithHits[i].first; + const auto & hits = tracksWithHits[i].second; for (unsigned int k = 0; k < hits.size(); k++) { - TrackingRecHit *hit = hits[k]->hit()->clone(); + auto * hit = hits[k]->clone(); track->appendHitPattern(*hit, ttopo); recHits->push_back(hit); diff --git a/RecoPixelVertexing/PixelTrackFitting/python/PixelTracks_cff.py b/RecoPixelVertexing/PixelTrackFitting/python/PixelTracks_cff.py index ef6d5d16fb329..ab7738826b1c2 100644 --- a/RecoPixelVertexing/PixelTrackFitting/python/PixelTracks_cff.py +++ b/RecoPixelVertexing/PixelTrackFitting/python/PixelTracks_cff.py @@ -51,11 +51,6 @@ SeedComparitorPSet = dict(clusterShapeCacheSrc = 'siPixelClusterShapeCachePreSplitting') ) -from Configuration.ProcessModifiers.gpu_cff import gpu -from RecoPixelVertexing.PixelTriplets.caHitQuadrupletHeterogeneousEDProducer_cfi import caHitQuadrupletHeterogeneousEDProducer as _caHitQuadrupletHeterogeneousEDProducer -gpu.toReplaceWith(pixelTracksHitQuadruplets, _caHitQuadrupletHeterogeneousEDProducer) -gpu.toModify(pixelTracksHitQuadruplets, trackingRegions = "pixelTracksTrackingRegions") - # for trackingLowPU pixelTracksHitTriplets = _pixelTripletHLTEDProducer.clone( doublets = "pixelTracksHitDoublets", @@ -70,10 +65,6 @@ ) trackingLowPU.toModify(pixelTracks, SeedingHitSets = "pixelTracksHitTriplets") -from Configuration.ProcessModifiers.gpu_cff import gpu -from RecoPixelVertexing.PixelTrackFitting.pixelTrackProducerFromCUDA_cfi import pixelTrackProducerFromCUDA as _pixelTrackProducerFromCUDA -gpu.toReplaceWith(pixelTracks, _pixelTrackProducerFromCUDA) - pixelTracksTask = cms.Task( pixelTracksTrackingRegions, pixelFitterByHelixProjections, @@ -94,4 +85,19 @@ _pixelTracksTask_ntupleFit.replace(pixelFitterByHelixProjections, pixelNtupletsFitter) ntupleFit.toReplaceWith(pixelTracksTask, _pixelTracksTask_ntupleFit) + +from Configuration.ProcessModifiers.gpu_cff import gpu +from RecoPixelVertexing.PixelTriplets.caHitNtupletCUDA_cfi import caHitNtupletCUDA +from RecoPixelVertexing.PixelTrackFitting.pixelTrackSoA_cfi import pixelTrackSoA +from RecoPixelVertexing.PixelTrackFitting.pixelTrackProducerFromSoA_cfi import pixelTrackProducerFromSoA as _pixelTrackFromSoA +_pixelTracksGPUTask = cms.Task( + caHitNtupletCUDA, + pixelTrackSoA, + pixelTracks # FromSoA +) + +gpu.toReplaceWith(pixelTracksTask, _pixelTracksGPUTask) +gpu.toReplaceWith(pixelTracks,_pixelTrackFromSoA) + + pixelTracksSequence = cms.Sequence(pixelTracksTask) diff --git a/RecoPixelVertexing/PixelTrackFitting/test/testEigenJacobian.cpp b/RecoPixelVertexing/PixelTrackFitting/test/testEigenJacobian.cpp index dc12de88001cd..d294e4cc6c1d6 100644 --- a/RecoPixelVertexing/PixelTrackFitting/test/testEigenJacobian.cpp +++ b/RecoPixelVertexing/PixelTrackFitting/test/testEigenJacobian.cpp @@ -85,7 +85,7 @@ int main() { // Matrix5d covf = transfFast(cov0,par0); - Rfit::transformToPerigeePlane(par0,cov0,par1,cov1,charge); + Rfit::transformToPerigeePlane(par0,cov0,par1,cov1); std::cout << "cov1\n" << cov1 << std::endl; diff --git a/RecoPixelVertexing/PixelTriplets/plugins/BrokenLineFitOnGPU.cu b/RecoPixelVertexing/PixelTriplets/plugins/BrokenLineFitOnGPU.cu index cb8f151233385..bdfb835a02f33 100644 --- a/RecoPixelVertexing/PixelTriplets/plugins/BrokenLineFitOnGPU.cu +++ b/RecoPixelVertexing/PixelTriplets/plugins/BrokenLineFitOnGPU.cu @@ -17,14 +17,15 @@ #include "HelixFitOnGPU.h" using HitsOnGPU = TrackingRecHit2DSOAView; -using TuplesOnGPU = pixelTuplesHeterogeneousProduct::TuplesOnGPU; +using Tuples = pixelTrack::HitContainer; +using OutputSoA = pixelTrack::TrackSoA; using namespace Eigen; // #define BL_DUMP_HITS template -__global__ void kernelBLFastFit(TuplesOnGPU::Container const *__restrict__ foundNtuplets, +__global__ void kernelBLFastFit(Tuples const *__restrict__ foundNtuplets, CAConstants::TupleMultiplicity const *__restrict__ tupleMultiplicity, HitsOnGPU const *__restrict__ hhp, double *__restrict__ phits, @@ -55,10 +56,10 @@ __global__ void kernelBLFastFit(TuplesOnGPU::Container const *__restrict__ found return; // get it from the ntuple container (one to one to helix) - auto helix_start = *(tupleMultiplicity->begin(nHits) + tuple_start); - assert(helix_start < foundNtuplets->nbins()); + auto tkid = *(tupleMultiplicity->begin(nHits) + tuple_start); + assert(tkid < foundNtuplets->nbins()); - assert(foundNtuplets->size(helix_start) == nHits); + assert(foundNtuplets->size(tkid) == nHits); Rfit::Map3xNd hits(phits + local_start); Rfit::Map4d fast_fit(pfast_fit + local_start); @@ -68,11 +69,11 @@ __global__ void kernelBLFastFit(TuplesOnGPU::Container const *__restrict__ found __shared__ int done; done = 0; __syncthreads(); - bool dump = (foundNtuplets->size(helix_start) == 5 && 0 == atomicAdd(&done, 1)); + bool dump = (foundNtuplets->size(tkid) == 5 && 0 == atomicAdd(&done, 1)); #endif // Prepare data structure - auto const *hitId = foundNtuplets->begin(helix_start); + auto const *hitId = foundNtuplets->begin(tkid); for (unsigned int i = 0; i < hitsInFit; ++i) { auto hit = hitId[i]; float ge[6]; @@ -80,14 +81,14 @@ __global__ void kernelBLFastFit(TuplesOnGPU::Container const *__restrict__ found #ifdef BL_DUMP_HITS if (dump) { printf("Hit global: %d: %d hits.col(%d) << %f,%f,%f\n", - helix_start, + tkid, hhp->detectorIndex(hit), i, hhp->xGlobal(hit), hhp->yGlobal(hit), hhp->zGlobal(hit)); printf("Error: %d: %d hits_ge.col(%d) << %e,%e,%e,%e,%e,%e\n", - helix_start, + tkid, hhp->detetectorIndex(hit), i, ge[0], @@ -113,7 +114,7 @@ __global__ void kernelBLFastFit(TuplesOnGPU::Container const *__restrict__ found template __global__ void kernelBLFit(CAConstants::TupleMultiplicity const *__restrict__ tupleMultiplicity, double B, - Rfit::helix_fit *results, + OutputSoA *results, double *__restrict__ phits, float *__restrict__ phits_ge, double *__restrict__ pfast_fit, @@ -133,7 +134,7 @@ __global__ void kernelBLFit(CAConstants::TupleMultiplicity const *__restrict__ t return; // get it for the ntuple container (one to one to helix) - auto helix_start = *(tupleMultiplicity->begin(nHits) + tuple_start); + auto tkid = *(tupleMultiplicity->begin(nHits) + tuple_start); Rfit::Map3xNd hits(phits + local_start); Rfit::Map4d fast_fit(pfast_fit + local_start); @@ -148,41 +149,31 @@ __global__ void kernelBLFit(CAConstants::TupleMultiplicity const *__restrict__ t BrokenLine::prepareBrokenLineData(hits, fast_fit, B, data); BrokenLine::BL_Line_fit(hits_ge, fast_fit, B, data, line); BrokenLine::BL_Circle_fit(hits, hits_ge, fast_fit, B, data, circle); - Jacob << 1, 0, 0, 0, 1, 0, 0, 0, -B / std::copysign(Rfit::sqr(circle.par(2)), circle.par(2)); - circle.par(2) = B / std::abs(circle.par(2)); - circle.cov = Jacob * circle.cov * Jacob.transpose(); - // Grab helix_fit from the proper location in the output vector - auto &helix = results[helix_start]; - helix.par << circle.par, line.par; - - helix.cov = Rfit::Matrix5d::Zero(); - helix.cov.block(0, 0, 3, 3) = circle.cov; - helix.cov.block(3, 3, 2, 2) = line.cov; - - helix.q = circle.q; - helix.chi2_circle = circle.chi2; - helix.chi2_line = line.chi2; + results->stateAtBS.copyFromCircle(circle.par,circle.cov,line.par,line.cov,1.f/float(B),tkid); + results->pt(tkid) = float(B)/float(std::abs(circle.par(2))); + results->eta(tkid) = asinhf(line.par(0)); + results->chi2(tkid) = (circle.chi2+line.chi2)/(2*N-5); #ifdef BROKENLINE_DEBUG if (!(circle.chi2 >= 0) || !(line.chi2 >= 0)) - printf("kernelBLFit failed! %f/%f\n", helix.chi2_circle, helix.chi2_line); + printf("kernelBLFit failed! %f/%f\n", circle.chi2, line.chi2); printf("kernelBLFit size %d for %d hits circle.par(0,1,2): %d %f,%f,%f\n", N, nHits, - helix_start, + tkid, circle.par(0), circle.par(1), circle.par(2)); - printf("kernelBLHits line.par(0,1): %d %f,%f\n", helix_start, line.par(0), line.par(1)); + printf("kernelBLHits line.par(0,1): %d %f,%f\n", tkid, line.par(0), line.par(1)); printf("kernelBLHits chi2 cov %f/%f %e,%e,%e,%e,%e\n", - helix.chi2_circle, - helix.chi2_line, - helix.cov(0, 0), - helix.cov(1, 1), - helix.cov(2, 2), - helix.cov(3, 3), - helix.cov(4, 4)); + circle.chi2, + line.chi2, + circle.cov(0, 0), + circle.cov(1, 1), + circle.cov(2, 2), + line.cov(0, 0), + line.cov(1, 1)); #endif } @@ -218,7 +209,7 @@ void HelixFitOnGPU::launchBrokenLineKernels(HitsOnCPU const &hh, kernelBLFit<3><<>>(tupleMultiplicity_d, bField_, - helix_fit_results_d, + outputSoa_d, hitsGPU_.get(), hits_geGPU_.get(), fast_fit_resultsGPU_.get(), @@ -239,7 +230,7 @@ void HelixFitOnGPU::launchBrokenLineKernels(HitsOnCPU const &hh, kernelBLFit<4><<>>(tupleMultiplicity_d, bField_, - helix_fit_results_d, + outputSoa_d, hitsGPU_.get(), hits_geGPU_.get(), fast_fit_resultsGPU_.get(), @@ -261,7 +252,7 @@ void HelixFitOnGPU::launchBrokenLineKernels(HitsOnCPU const &hh, kernelBLFit<4><<>>(tupleMultiplicity_d, bField_, - helix_fit_results_d, + outputSoa_d, hitsGPU_.get(), hits_geGPU_.get(), fast_fit_resultsGPU_.get(), @@ -282,7 +273,7 @@ void HelixFitOnGPU::launchBrokenLineKernels(HitsOnCPU const &hh, kernelBLFit<5><<>>(tupleMultiplicity_d, bField_, - helix_fit_results_d, + outputSoa_d, hitsGPU_.get(), hits_geGPU_.get(), fast_fit_resultsGPU_.get(), diff --git a/RecoPixelVertexing/PixelTriplets/plugins/BuildFile.xml b/RecoPixelVertexing/PixelTriplets/plugins/BuildFile.xml index 341a108348337..6d15cc6883098 100644 --- a/RecoPixelVertexing/PixelTriplets/plugins/BuildFile.xml +++ b/RecoPixelVertexing/PixelTriplets/plugins/BuildFile.xml @@ -12,6 +12,7 @@ + diff --git a/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletCUDA.cc b/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletCUDA.cc new file mode 100644 index 0000000000000..ba8a3e1052e7b --- /dev/null +++ b/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletCUDA.cc @@ -0,0 +1,82 @@ +#include + +#include "CUDADataFormats/Common/interface/CUDAProduct.h" +#include "DataFormats/Common/interface/Handle.h" +#include "FWCore/Framework/interface/ESHandle.h" +#include "FWCore/Framework/interface/Event.h" +#include "FWCore/Framework/interface/EventSetup.h" +#include "FWCore/Framework/interface/MakerMacros.h" +#include "FWCore/Framework/interface/global/EDProducer.h" +#include "FWCore/Framework/interface/ConsumesCollector.h" +#include "FWCore/ParameterSet/interface/ConfigurationDescriptions.h" +#include "FWCore/ParameterSet/interface/ParameterSet.h" +#include "FWCore/ParameterSet/interface/ParameterSetDescription.h" +#include "FWCore/Utilities/interface/InputTag.h" +#include "FWCore/PluginManager/interface/ModuleDef.h" +#include "FWCore/ServiceRegistry/interface/Service.h" +#include "FWCore/Utilities/interface/EDGetToken.h" +#include "FWCore/Utilities/interface/RunningAverage.h" +#include "HeterogeneousCore/CUDACore/interface/CUDAScopedContext.h" +#include "HeterogeneousCore/CUDACore/interface/GPUCuda.h" +#include "HeterogeneousCore/CUDAServices/interface/CUDAService.h" +#include "RecoTracker/TkMSParametrization/interface/PixelRecoUtilities.h" + +#include "CAHitNtupletGeneratorOnGPU.h" +#include "CUDADataFormats/Track/interface/PixelTrackHeterogeneous.h" +#include "CUDADataFormats/TrackingRecHit/interface/TrackingRecHit2DCUDA.h" + + +class CAHitNtupletCUDA : public edm::global::EDProducer<> { +public: + explicit CAHitNtupletCUDA(const edm::ParameterSet& iConfig); + ~CAHitNtupletCUDA() override = default; + + static void fillDescriptions(edm::ConfigurationDescriptions& descriptions); + +private: + void produce(edm::StreamID streamID, edm::Event& iEvent, const edm::EventSetup& iSetup) const override; + + edm::EDGetTokenT> tokenHit_; + edm::EDPutTokenT> tokenTrack_; + + CAHitNtupletGeneratorOnGPU gpuAlgo_; + +}; + +CAHitNtupletCUDA::CAHitNtupletCUDA(const edm::ParameterSet& iConfig) : + tokenHit_(consumes>(iConfig.getParameter("pixelRecHitSrc"))), + tokenTrack_(produces>()), + gpuAlgo_(iConfig, consumesCollector()) {} + + +void CAHitNtupletCUDA::fillDescriptions(edm::ConfigurationDescriptions& descriptions) { + edm::ParameterSetDescription desc; + + desc.add("pixelRecHitSrc", edm::InputTag("siPixelRecHitsCUDAPreSplitting")); + + CAHitNtupletGeneratorOnGPU::fillDescriptions(desc); + auto label = "caHitNtupletCUDA"; + descriptions.add(label, desc); +} + +void CAHitNtupletCUDA::produce(edm::StreamID streamID, edm::Event& iEvent, const edm::EventSetup& es) const { + + edm::Handle> hHits; + iEvent.getByToken(tokenHit_, hHits); + + CUDAScopedContextProduce ctx{*hHits}; + auto const& hits = ctx.get(*hHits); + + auto bf = 1./PixelRecoUtilities::fieldInInvGev(es); + + ctx.emplace( + iEvent, + tokenTrack_, + std::move(gpuAlgo_.makeTuplesAsync(hits, bf, ctx.stream())) + ); + +} + + + +DEFINE_FWK_MODULE(CAHitNtupletCUDA); diff --git a/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorKernels.cu b/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorKernels.cu new file mode 100644 index 0000000000000..cedef59f78f91 --- /dev/null +++ b/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorKernels.cu @@ -0,0 +1,933 @@ +// +// Original Author: Felice Pantaleo, CERN +// + +// #define NTUPLE_DEBUG + +#include +#include + +#include + +#include "FWCore/ServiceRegistry/interface/Service.h" +#include "HeterogeneousCore/CUDAServices/interface/CUDAService.h" +#include "HeterogeneousCore/CUDAUtilities/interface/cudaCheck.h" +#include "HeterogeneousCore/CUDAUtilities/interface/cuda_assert.h" +#include "RecoLocalTracker/SiPixelRecHits/interface/pixelCPEforGPU.h" + +#include "CAConstants.h" +#include "CAHitNtupletGeneratorKernels.h" +#include "GPUCACell.h" +#include "gpuFishbone.h" +#include "gpuPixelDoublets.h" + +using namespace gpuPixelDoublets; + + using HitsOnGPU = TrackingRecHit2DSOAView; + using HitsOnCPU = TrackingRecHit2DCUDA; + + using HitToTuple = CAConstants::HitToTuple; + using TupleMultiplicity = CAConstants::TupleMultiplicity; + + using Quality = pixelTrack::Quality; + using TkSoA = pixelTrack::TrackSoA; + using HitContainer = pixelTrack::HitContainer; + +__global__ void kernel_checkOverflows(HitContainer const * foundNtuplets, + CAConstants::TupleMultiplicity * tupleMultiplicity, + AtomicPairCounter *apc, + GPUCACell const *__restrict__ cells, + uint32_t const *__restrict__ nCells, + CellNeighborsVector const *cellNeighbors, + CellTracksVector const *cellTracks, + GPUCACell::OuterHitOfCell const *__restrict__ isOuterHitOfCell, + uint32_t nHits, + CAHitNtupletGeneratorKernels::Counters *counters) { + auto idx = threadIdx.x + blockIdx.x * blockDim.x; + + auto &c = *counters; + // counters once per event + if (0 == idx) { + atomicAdd(&c.nEvents, 1); + atomicAdd(&c.nHits, nHits); + atomicAdd(&c.nCells, *nCells); + atomicAdd(&c.nTuples, apc->get().m); + atomicAdd(&c.nFitTracks,tupleMultiplicity->size()); + } + +#ifdef NTUPLE_DEBUG + if (0 == idx) { + printf("number of found cells %d, found tuples %d with total hits %d out of %d\n", + *nCells, + apc->get().m, + apc->get().n, + nHits); + if (apc->get().m < CAConstants::maxNumberOfQuadruplets()) { + assert(foundNtuplets->size(apc->get().m) == 0); + assert(foundNtuplets->size() == apc->get().n); + } + } + + if (idx < foundNtuplets->nbins()) { + if (foundNtuplets->size(idx) > 5) + printf("ERROR %d, %d\n", idx, foundNtuplets->size(idx)); + assert(foundNtuplets->size(idx) < 6); + for (auto ih = foundNtuplets->begin(idx); ih != foundNtuplets->end(idx); ++ih) + assert(*ih < nHits); + } +#endif + + if (0 == idx) { + if (apc->get().m >= CAConstants::maxNumberOfQuadruplets()) + printf("Tuples overflow\n"); + if (*nCells >= CAConstants::maxNumberOfDoublets()) + printf("Cells overflow\n"); + } + + if (idx < (*nCells)) { + auto &thisCell = cells[idx]; + if (thisCell.outerNeighbors().full()) //++tooManyNeighbors[thisCell.theLayerPairId]; + printf("OuterNeighbors overflow %d in %d\n", idx, thisCell.theLayerPairId); + if (thisCell.tracks().full()) //++tooManyTracks[thisCell.theLayerPairId]; + printf("Tracks overflow %d in %d\n", idx, thisCell.theLayerPairId); + if (thisCell.theDoubletId < 0) + atomicAdd(&c.nKilledCells, 1); + if (0==thisCell.theUsed) + atomicAdd(&c.nEmptyCells, 1); + if (thisCell.tracks().empty()) + atomicAdd(&c.nZeroTrackCells, 1); + } + if (idx < nHits) { + if (isOuterHitOfCell[idx].full()) // ++tooManyOuterHitOfCell; + printf("OuterHitOfCell overflow %d\n", idx); + } +} + + +__global__ void kernel_fishboneCleaner(GPUCACell const *cells, + uint32_t const *__restrict__ nCells, + Quality *quality) { + constexpr auto bad = trackQuality::bad; + + auto cellIndex = threadIdx.x + blockIdx.x * blockDim.x; + + if (cellIndex >= (*nCells)) + return; + auto const &thisCell = cells[cellIndex]; + if (thisCell.theDoubletId >= 0) + return; + + for (auto it : thisCell.tracks()) + quality[it] = bad; +} + +__global__ void kernel_earlyDuplicateRemover(GPUCACell const *cells, + uint32_t const *__restrict__ nCells, + HitContainer *foundNtuplets, + Quality *quality) { + // constexpr auto bad = trackQuality::bad; + constexpr auto dup = trackQuality::dup; + // constexpr auto loose = trackQuality::loose; + + assert(nCells); + + auto cellIndex = threadIdx.x + blockIdx.x * blockDim.x; + + if (cellIndex >= (*nCells)) + return; + auto const &thisCell = cells[cellIndex]; + if (thisCell.theDoubletId < 0) + return; + + uint32_t maxNh = 0; + + // find maxNh + for (auto it : thisCell.tracks()) { + auto nh = foundNtuplets->size(it); + maxNh = std::max(nh, maxNh); + } + + for (auto it : thisCell.tracks()) { + if (foundNtuplets->size(it) != maxNh) + quality[it] = dup; //no race: simple assignment of the same constant + } + +} + + +__global__ void kernel_fastDuplicateRemover(GPUCACell const * __restrict__ cells, + uint32_t const *__restrict__ nCells, + HitContainer const * __restrict__ foundNtuplets, + TkSoA * __restrict__ tracks) { + constexpr auto bad = trackQuality::bad; + constexpr auto dup = trackQuality::dup; + constexpr auto loose = trackQuality::loose; + + assert(nCells); + + auto cellIndex = threadIdx.x + blockIdx.x * blockDim.x; + + if (cellIndex >= (*nCells)) + return; + auto const &thisCell = cells[cellIndex]; + if (thisCell.theDoubletId < 0) + return; + + float mc = 10000.f; + uint16_t im = 60000; + + auto score = [&](auto it) { + return std::abs(tracks->tip(it)); // tip + // return tracks->chi2(it); //chi2 + }; + + // find min socre + for (auto it : thisCell.tracks()) { + if (tracks->quality(it) == loose && score(it) < mc) { + mc = score(it); + im = it; + } + } + // mark all other duplicates + for (auto it : thisCell.tracks()) { + if (tracks->quality(it) != bad && it != im) + tracks->quality(it) = dup; //no race: simple assignment of the same constant + } +} + + +__global__ void kernel_connect(AtomicPairCounter *apc1, + AtomicPairCounter *apc2, // just to zero them, + GPUCACell::Hits const *__restrict__ hhp, + GPUCACell *cells, + uint32_t const *__restrict__ nCells, + CellNeighborsVector *cellNeighbors, + GPUCACell::OuterHitOfCell const *__restrict__ isOuterHitOfCell, + float hardCurvCut, + float ptmin, + float CAThetaCutBarrel, + float CAThetaCutForward, + float dcaCutInnerTriplet, + float dcaCutOuterTriplet) { + auto const &hh = *hhp; + + auto cellIndex = threadIdx.y + blockIdx.y * blockDim.y; + auto first = threadIdx.x; + auto stride = blockDim.x; + + if (0 == (cellIndex + first)) { + (*apc1) = 0; + (*apc2) = 0; + } // ready for next kernel + + if (cellIndex >= (*nCells)) + return; + auto & thisCell = cells[cellIndex]; + //if (thisCell.theDoubletId < 0 || thisCell.theUsed>1) + // return; + auto innerHitId = thisCell.get_inner_hit_id(); + auto numberOfPossibleNeighbors = isOuterHitOfCell[innerHitId].size(); + auto vi = isOuterHitOfCell[innerHitId].data(); + + constexpr uint32_t last_bpix1_detIndex = 96; + constexpr uint32_t last_barrel_detIndex = 1184; + auto ri = thisCell.get_inner_r(hh); + auto zi = thisCell.get_inner_z(hh); + + auto ro = thisCell.get_outer_r(hh); + auto zo = thisCell.get_outer_z(hh); + auto isBarrel = thisCell.get_inner_detIndex(hh) < last_barrel_detIndex; + + for (auto j = first; j < numberOfPossibleNeighbors; j += stride) { + auto otherCell = __ldg(vi + j); + auto & oc = cells[otherCell]; + // if (cells[otherCell].theDoubletId < 0 || + // cells[otherCell].theUsed>1 ) + // continue; + auto r1 = oc.get_inner_r(hh); + auto z1 = oc.get_inner_z(hh); + // auto isBarrel = oc.get_outer_detIndex(hh) < last_barrel_detIndex; + bool aligned = GPUCACell::areAlignedRZ(r1, + z1, + ri, + zi, + ro, + zo, + ptmin, + isBarrel ? CAThetaCutBarrel : CAThetaCutForward); // 2.f*thetaCut); // FIXME tune cuts + if(aligned && + thisCell.dcaCut(hh,oc, + oc.get_inner_detIndex(hh) < last_bpix1_detIndex ? dcaCutInnerTriplet : dcaCutOuterTriplet, + hardCurvCut) + ) { // FIXME tune cuts + oc.addOuterNeighbor(cellIndex, *cellNeighbors); + thisCell.theUsed |= 1; + oc.theUsed |= 1; + } + } // loop on inner cells +} + +__global__ void kernel_find_ntuplets(GPUCACell::Hits const *__restrict__ hhp, + GPUCACell *__restrict__ cells, + uint32_t const *nCells, + CellTracksVector *cellTracks, + HitContainer *foundNtuplets, + AtomicPairCounter *apc, + Quality * __restrict__ quality, + unsigned int minHitsPerNtuplet) { + // recursive: not obvious to widen + auto const &hh = *hhp; + + auto cellIndex = threadIdx.x + blockIdx.x * blockDim.x; + if (cellIndex >= (*nCells)) + return; + auto &thisCell = cells[cellIndex]; + + if (thisCell.theDoubletId < 0) + return; + + auto pid = thisCell.theLayerPairId; + auto doit = minHitsPerNtuplet>3 ? pid<3 : pid<8 || pid >12; + if (doit) { + GPUCACell::TmpTuple stack; + stack.reset(); + thisCell.find_ntuplets(hh, + cells, + *cellTracks, + *foundNtuplets, + *apc, + quality, + stack, + minHitsPerNtuplet, + pid<3); + assert(stack.size() == 0); + // printf("in %d found quadruplets: %d\n", cellIndex, apc->get()); + } + +} + + +__global__ void kernel_mark_used(GPUCACell::Hits const *__restrict__ hhp, + GPUCACell *__restrict__ cells, + uint32_t const *nCells) { + + // auto const &hh = *hhp; + + auto cellIndex = threadIdx.x + blockIdx.x * blockDim.x; + if (cellIndex >= (*nCells)) + return; + auto &thisCell = cells[cellIndex]; + if (!thisCell.tracks().empty()) + thisCell.theUsed |= 2; + +} + + +__global__ void kernel_countMultiplicity(HitContainer const *__restrict__ foundNtuplets, + Quality const * __restrict__ quality, + CAConstants::TupleMultiplicity *tupleMultiplicity) { + auto it = blockIdx.x * blockDim.x + threadIdx.x; + + if (it >= foundNtuplets->nbins()) + return; + + auto nhits = foundNtuplets->size(it); + if (nhits < 3) + return; + if (quality[it] == trackQuality::dup) return; + assert(quality[it] == trackQuality::bad); + if (nhits>5) printf("wrong mult %d %d\n",it,nhits); + assert(nhits<8); + tupleMultiplicity->countDirect(nhits); +} + + + +__global__ void kernel_fillMultiplicity(HitContainer const *__restrict__ foundNtuplets, + Quality const * __restrict__ quality, + CAConstants::TupleMultiplicity *tupleMultiplicity) { + auto it = blockIdx.x * blockDim.x + threadIdx.x; + + if (it >= foundNtuplets->nbins()) + return; + + auto nhits = foundNtuplets->size(it); + if (nhits < 3) + return; + if (quality[it] == trackQuality::dup) return; + if (nhits>5) printf("wrong mult %d %d\n",it,nhits); + assert(nhits<8); + tupleMultiplicity->fillDirect(nhits, it); +} + + + +__global__ void kernel_classifyTracks(HitContainer const *__restrict__ tuples, + TkSoA const * __restrict__ tracks, + CAHitNtupletGeneratorKernels::QualityCuts cuts, + Quality *__restrict__ quality) { + auto idx = blockIdx.x * blockDim.x + threadIdx.x; + if (idx >= tuples->nbins()) { + return; + } + if (tuples->size(idx) == 0) { + return; + } + + // if duplicate: not even fit + if (quality[idx] == trackQuality::dup) return; + + assert(quality[idx] == trackQuality::bad); + + // mark doublets as bad + if (tuples->size(idx) < 3) { + return; + } + + // if the fit has any invalid parameters, mark it as bad + bool isNaN = false; + for (int i = 0; i < 5; ++i) { + isNaN |= isnan(tracks->stateAtBS.state(idx)(i)); + } + if (isNaN) { +#ifdef NTUPLE_DEBUG + printf("NaN in fit %d size %d chi2 %f\n", + idx, + tuples->size(idx), + tracks->chi2(idx) + ); +#endif + return; + } + + // compute a pT-dependent chi2 cut + // default parameters: + // - chi2MaxPt = 10 GeV + // - chi2Coeff = { 0.68177776, 0.74609577, -0.08035491, 0.00315399 } + // - chi2Scale = 30 for broken line fit, 45 for Riemann fit + // (see CAHitNtupletGeneratorGPU.cc) + float pt = std::min(tracks->pt(idx), cuts.chi2MaxPt); + float chi2Cut = cuts.chi2Scale * + (cuts.chi2Coeff[0] + pt * (cuts.chi2Coeff[1] + pt * (cuts.chi2Coeff[2] + pt * cuts.chi2Coeff[3]))); + // above number were for Quads not normalized so for the time being just multiple by ndof for Quads (triplets to be understood) + if (3.f*tracks->chi2(idx) >= chi2Cut) { +#ifdef NTUPLE_DEBUG + printf("Bad fit %d size %d pt %f eta %f chi2 %f\n", + idx, + tuples->size(idx), + tracks->pt(idx), + tracks->eta(idx), + 3.f*tracks->chi2(idx) + ); +#endif + return; + } + + // impose "region cuts" based on the fit results (phi, Tip, pt, cotan(theta)), Zip) + // default cuts: + // - for triplets: |Tip| < 0.3 cm, pT > 0.5 GeV, |Zip| < 12.0 cm + // - for quadruplets: |Tip| < 0.5 cm, pT > 0.3 GeV, |Zip| < 12.0 cm + // (see CAHitNtupletGeneratorGPU.cc) + auto const ®ion = (tuples->size(idx) > 3) ? cuts.quadruplet : cuts.triplet; + bool isOk = (std::abs(tracks->tip(idx)) < region.maxTip) and (tracks->pt(idx) > region.minPt) and + (std::abs(tracks->zip(idx)) < region.maxZip); + + if (isOk) { + quality[idx] = trackQuality::loose; + } +} + +__global__ void kernel_doStatsForTracks(HitContainer const *__restrict__ tuples, + Quality const *__restrict__ quality, + CAHitNtupletGeneratorKernels::Counters *counters) { + int first = blockDim.x * blockIdx.x + threadIdx.x; + for (int idx = first, ntot = tuples->nbins(); idx < ntot; idx += gridDim.x * blockDim.x) { + if (tuples->size(idx) == 0) + continue; + if (quality[idx] != trackQuality::loose) + continue; + atomicAdd(&(counters->nGoodTracks), 1); + } +} + + +__global__ void kernel_countHitInTracks(HitContainer const *__restrict__ tuples, + Quality const *__restrict__ quality, + CAHitNtupletGeneratorKernels::HitToTuple *hitToTuple) { + int first = blockDim.x * blockIdx.x + threadIdx.x; + for (int idx = first, ntot = tuples->nbins(); idx < ntot; idx += gridDim.x * blockDim.x) { + if (tuples->size(idx) == 0) + continue; + if (quality[idx] != trackQuality::loose) + continue; + for (auto h = tuples->begin(idx); h != tuples->end(idx); ++h) + hitToTuple->countDirect(*h); + } +} + +__global__ void kernel_fillHitInTracks(HitContainer const *__restrict__ tuples, + Quality const *__restrict__ quality, + CAHitNtupletGeneratorKernels::HitToTuple *hitToTuple) { + int first = blockDim.x * blockIdx.x + threadIdx.x; + for (int idx = first, ntot = tuples->nbins(); idx < ntot; idx += gridDim.x * blockDim.x) { + if (tuples->size(idx) == 0) + continue; + if (quality[idx] != trackQuality::loose) + continue; + for (auto h = tuples->begin(idx); h != tuples->end(idx); ++h) + hitToTuple->fillDirect(*h, idx); + } +} + +__global__ void kernel_fillHitDetIndices(HitContainer const *__restrict__ tuples, + TrackingRecHit2DSOAView const *__restrict__ hhp, + HitContainer *__restrict__ hitDetIndices) { + + int first = blockDim.x * blockIdx.x + threadIdx.x; + // copy offsets + for (int idx = first, ntot = tuples->totbins(); idx < ntot; idx += gridDim.x * blockDim.x) { + hitDetIndices->off[idx] = tuples->off[idx]; + } + // fill hit indices + auto const & hh = *hhp; + auto nhits = hh.nHits(); + for (int idx = first, ntot = tuples->size(); idx < ntot; idx += gridDim.x * blockDim.x) { + assert(tuples->bins[idx]bins[idx] = hh.detectorIndex(tuples->bins[idx]); + } +} + +void CAHitNtupletGeneratorKernels::fillHitDetIndices(HitsOnCPU const &hh, TkSoA * tracks_d, cudaStream_t cudaStream) { + auto blockSize=128; + auto numberOfBlocks = (HitContainer::capacity() + blockSize - 1) / blockSize; + + kernel_fillHitDetIndices<<>>(&tracks_d->hitIndices, hh.view(), &tracks_d->detIndices); + cudaCheck(cudaGetLastError()); +#ifdef GPU_DEBUG + cudaDeviceSynchronize(); + cudaCheck(cudaGetLastError()); +#endif +} + + +__global__ void kernel_doStatsForHitInTracks(CAHitNtupletGeneratorKernels::HitToTuple const *__restrict__ hitToTuple, + CAHitNtupletGeneratorKernels::Counters *counters) { + auto &c = *counters; + int first = blockDim.x * blockIdx.x + threadIdx.x; + for (int idx = first, ntot = hitToTuple->nbins(); idx < ntot; idx += gridDim.x * blockDim.x) { + if (hitToTuple->size(idx) == 0) + continue; + atomicAdd(&c.nUsedHits, 1); + if (hitToTuple->size(idx) > 1) + atomicAdd(&c.nDupHits, 1); + } +} + + +__global__ void kernel_tripletCleaner(TrackingRecHit2DSOAView const *__restrict__ hhp, + HitContainer const *__restrict__ ptuples, + TkSoA const * __restrict__ ptracks, + Quality *__restrict__ quality, + CAHitNtupletGeneratorKernels::HitToTuple const *__restrict__ phitToTuple) { + constexpr auto bad = trackQuality::bad; + constexpr auto dup = trackQuality::dup; + // constexpr auto loose = trackQuality::loose; + + auto &hitToTuple = *phitToTuple; + auto const &foundNtuplets = *ptuples; + auto const & tracks = *ptracks; + + // auto const & hh = *hhp; + // auto l1end = hh.hitsLayerStart_d[1]; + + int first = blockDim.x * blockIdx.x + threadIdx.x; + + for (int idx = first, ntot = hitToTuple.nbins(); idx < ntot; idx += gridDim.x * blockDim.x) { + if (hitToTuple.size(idx) < 2) + continue; + + float mc = 10000.f; + uint16_t im = 60000; + uint32_t maxNh = 0; + + // find maxNh + for (auto it = hitToTuple.begin(idx); it != hitToTuple.end(idx); ++it) { + uint32_t nh = foundNtuplets.size(*it); + maxNh = std::max(nh, maxNh); + } + // kill all tracks shorter than maxHn (only triplets???) + for (auto it = hitToTuple.begin(idx); it != hitToTuple.end(idx); ++it) { + uint32_t nh = foundNtuplets.size(*it); + if (maxNh != nh) + quality[*it] = dup; + } + + if (maxNh > 3) + continue; + // if (idx>=l1end) continue; // only for layer 1 + // for triplets choose best tip! + for (auto ip = hitToTuple.begin(idx); ip != hitToTuple.end(idx); ++ip) { + auto const it = *ip; + if (quality[it] != bad && std::abs(tracks.tip(it)) < mc) { + mc = std::abs(tracks.tip(it)); + im = it; + } + } + // mark duplicates + for (auto ip = hitToTuple.begin(idx); ip != hitToTuple.end(idx); ++ip) { + auto const it = *ip; + if (quality[it] != bad && it != im) + quality[it] = dup; //no race: simple assignment of the same constant + } + } // loop over hits +} + +__global__ void kernel_print_found_ntuplets(TrackingRecHit2DSOAView const *__restrict__ hhp, + HitContainer const *__restrict__ ptuples, + TkSoA const * __restrict__ ptracks, + Quality const *__restrict__ quality, + CAHitNtupletGeneratorKernels::HitToTuple const *__restrict__ phitToTuple, + uint32_t maxPrint, int iev) { + auto const & foundNtuplets = *ptuples; + auto const & tracks = *ptracks; + int first = blockDim.x * blockIdx.x + threadIdx.x; + for (int i = first; i < std::min(maxPrint, foundNtuplets.nbins()); i+=blockDim.x*gridDim.x) { + auto nh = foundNtuplets.size(i); + if (nh<3) continue; + printf("TK: %d %d %d %f %f %f %f %f %f %f %d %d %d %d %d\n", + 10000*iev+i, + int(quality[i]), + nh, + tracks.charge(i), + tracks.pt(i), + tracks.eta(i), + tracks.phi(i), + tracks.tip(i), + tracks.zip(i), +// asinhf(fit_results[i].par(3)), + tracks.chi2(i), + *foundNtuplets.begin(i), + *(foundNtuplets.begin(i) + 1), + *(foundNtuplets.begin(i) + 2), + nh>3 ? int(*(foundNtuplets.begin(i) + 3)):-1, + nh>4 ? int(*(foundNtuplets.begin(i) + 4)):-1 + ); + } +} + + +void CAHitNtupletGeneratorKernels::launchKernels( + HitsOnCPU const &hh, + TkSoA * tracks_d, + cudaStream_t cudaStream) { + + auto maxNumberOfDoublets_ = CAConstants::maxNumberOfDoublets(); + + // these are pointer on GPU! + auto * tuples_d = &tracks_d->hitIndices; + auto * quality_d = (Quality*)(&tracks_d->m_quality); + + auto nhits = hh.nHits(); + assert(nhits <= pixelGPUConstants::maxNumberOfHits); + + // std::cout << "N hits " << nhits << std::endl; + // if (nhits<2) std::cout << "too few hits " << nhits << std::endl; + + // + // applying conbinatoric cleaning such as fishbone at this stage is too expensive + // + + auto nthTot = 64; + auto stride = 4; + auto blockSize = nthTot / stride; + auto numberOfBlocks = (maxNumberOfDoublets_ + blockSize - 1) / blockSize; + auto rescale = numberOfBlocks / 65536; + blockSize *= (rescale + 1); + numberOfBlocks = (maxNumberOfDoublets_ + blockSize - 1) / blockSize; + assert(numberOfBlocks < 65536); + assert(blockSize > 0 && 0 == blockSize % 16); + dim3 blks(1, numberOfBlocks, 1); + dim3 thrs(stride, blockSize, 1); + + kernel_connect<<>>( + device_hitTuple_apc_, + device_hitToTuple_apc_, // needed only to be reset, ready for next kernel + hh.view(), + device_theCells_.get(), + device_nCells_, + device_theCellNeighbors_, + device_isOuterHitOfCell_.get(), + m_params.hardCurvCut_, + m_params.ptmin_, + m_params.CAThetaCutBarrel_, + m_params.CAThetaCutForward_, + m_params.dcaCutInnerTriplet_, + m_params.dcaCutOuterTriplet_); + cudaCheck(cudaGetLastError()); + + + if (nhits > 1 && m_params.earlyFishbone_) { + auto nthTot = 128; + auto stride = 16; + auto blockSize = nthTot / stride; + auto numberOfBlocks = (nhits + blockSize - 1) / blockSize; + dim3 blks(1, numberOfBlocks, 1); + dim3 thrs(stride, blockSize, 1); + fishbone<<>>( + hh.view(), device_theCells_.get(), device_nCells_, device_isOuterHitOfCell_.get(), nhits, false); + cudaCheck(cudaGetLastError()); + } + + + blockSize = 64; + numberOfBlocks = (maxNumberOfDoublets_ + blockSize - 1) / blockSize; + kernel_find_ntuplets<<>>(hh.view(), + device_theCells_.get(), + device_nCells_, + device_theCellTracks_, + tuples_d, + device_hitTuple_apc_, + quality_d, + m_params.minHitsPerNtuplet_); + cudaCheck(cudaGetLastError()); + + if (m_params.doStats_) + kernel_mark_used<<>>(hh.view(), + device_theCells_.get(), + device_nCells_); + cudaCheck(cudaGetLastError()); + +#ifdef GPU_DEBUG + cudaDeviceSynchronize(); + cudaCheck(cudaGetLastError()); +#endif + + + blockSize = 128; + numberOfBlocks = (HitContainer::totbins() + blockSize - 1) / blockSize; + cudautils::finalizeBulk<<>>(device_hitTuple_apc_, tuples_d); + + // remove duplicates (tracks that share a doublet) + numberOfBlocks = (CAConstants::maxNumberOfDoublets() + blockSize - 1) / blockSize; + kernel_earlyDuplicateRemover<<>>( + device_theCells_.get(), device_nCells_, tuples_d, quality_d); + cudaCheck(cudaGetLastError()); + + blockSize = 128; + numberOfBlocks = (CAConstants::maxTuples() + blockSize - 1) / blockSize; + kernel_countMultiplicity<<>>(tuples_d, quality_d, device_tupleMultiplicity_.get()); + cudautils::launchFinalize(device_tupleMultiplicity_.get(), device_tmws_, cudaStream); + kernel_fillMultiplicity<<>>(tuples_d, quality_d, device_tupleMultiplicity_.get()); + cudaCheck(cudaGetLastError()); + + if (nhits > 1 && m_params.lateFishbone_) { + auto nthTot = 128; + auto stride = 16; + auto blockSize = nthTot / stride; + auto numberOfBlocks = (nhits + blockSize - 1) / blockSize; + dim3 blks(1, numberOfBlocks, 1); + dim3 thrs(stride, blockSize, 1); + fishbone<<>>( + hh.view(), device_theCells_.get(), device_nCells_, device_isOuterHitOfCell_.get(), nhits, true); + cudaCheck(cudaGetLastError()); + } + + if (m_params.doStats_) { + numberOfBlocks = (std::max(nhits, maxNumberOfDoublets_) + blockSize - 1) / blockSize; + kernel_checkOverflows<<>>(tuples_d, + device_tupleMultiplicity_.get(), + device_hitTuple_apc_, + device_theCells_.get(), + device_nCells_, + device_theCellNeighbors_, + device_theCellTracks_, + device_isOuterHitOfCell_.get(), + nhits, + counters_); + cudaCheck(cudaGetLastError()); + } +#ifdef GPU_DEBUG + cudaDeviceSynchronize(); + cudaCheck(cudaGetLastError()); +#endif + +} + +void CAHitNtupletGeneratorKernels::buildDoublets(HitsOnCPU const &hh, cuda::stream_t<> &stream) { + auto nhits = hh.nHits(); + +#ifdef NTUPLE_DEBUG + std::cout << "building Doublets out of " << nhits << " Hits" << std::endl; +#endif + +#ifdef GPU_DEBUG + cudaDeviceSynchronize(); + cudaCheck(cudaGetLastError()); +#endif + + // in principle we can use "nhits" to heuristically dimension the workspace... + edm::Service cs; + device_isOuterHitOfCell_ = cs->make_device_unique(std::max(1U,nhits), stream); + assert(device_isOuterHitOfCell_.get()); + { + int threadsPerBlock = 128; + // at least one block! + int blocks = ( std::max(1U,nhits) + threadsPerBlock - 1) / threadsPerBlock; + gpuPixelDoublets::initDoublets<<>>(device_isOuterHitOfCell_.get(), + nhits, + device_theCellNeighbors_, + device_theCellNeighborsContainer_.get(), + device_theCellTracks_, + device_theCellTracksContainer_.get()); + cudaCheck(cudaGetLastError()); + } + + device_theCells_ = cs->make_device_unique(CAConstants::maxNumberOfDoublets(), stream); + +#ifdef GPU_DEBUG + cudaDeviceSynchronize(); + cudaCheck(cudaGetLastError()); +#endif + + if (0 == nhits) + return; // protect against empty events + + // FIXME avoid magic numbers + auto nActualPairs=gpuPixelDoublets::nPairs; + if (!m_params.includeJumpingForwardDoublets_) nActualPairs = 15; + if (m_params.minHitsPerNtuplet_>3) { + nActualPairs = 13; + } + + assert(nActualPairs<=gpuPixelDoublets::nPairs); + int stride = 1; + int threadsPerBlock = gpuPixelDoublets::getDoubletsFromHistoMaxBlockSize / stride; + int blocks = (2 * nhits + threadsPerBlock - 1) / threadsPerBlock; + dim3 blks(1, blocks, 1); + dim3 thrs(stride, threadsPerBlock, 1); + gpuPixelDoublets::getDoubletsFromHisto<<>>(device_theCells_.get(), + device_nCells_, + device_theCellNeighbors_, + device_theCellTracks_, + hh.view(), + device_isOuterHitOfCell_.get(), + nActualPairs, + m_params.idealConditions_, + m_params.doClusterCut_, + m_params.doZCut_, + m_params.doPhiCut_); + cudaCheck(cudaGetLastError()); + +#ifdef GPU_DEBUG + cudaDeviceSynchronize(); + cudaCheck(cudaGetLastError()); +#endif + +} + +void CAHitNtupletGeneratorKernels::classifyTuples(HitsOnCPU const &hh, + TkSoA * tracks_d, + cudaStream_t cudaStream) { + // these are pointer on GPU! + auto const * tuples_d = &tracks_d->hitIndices; + auto * quality_d = (Quality*)(&tracks_d->m_quality); + + auto blockSize = 64; + + // classify tracks based on kinematics + auto numberOfBlocks = (CAConstants::maxNumberOfQuadruplets() + blockSize - 1) / blockSize; + kernel_classifyTracks<<>>( + tuples_d, tracks_d, m_params.cuts_, quality_d); + cudaCheck(cudaGetLastError()); + + if (m_params.lateFishbone_) { + // apply fishbone cleaning to good tracks + numberOfBlocks = (CAConstants::maxNumberOfDoublets() + blockSize - 1) / blockSize; + kernel_fishboneCleaner<<>>( + device_theCells_.get(), device_nCells_, quality_d); + cudaCheck(cudaGetLastError()); + } + + // remove duplicates (tracks that share a doublet) + numberOfBlocks = (CAConstants::maxNumberOfDoublets() + blockSize - 1) / blockSize; + kernel_fastDuplicateRemover<<>>( + device_theCells_.get(), device_nCells_, tuples_d, tracks_d); + cudaCheck(cudaGetLastError()); + + if (m_params.minHitsPerNtuplet_<4 || m_params.doStats_) { + // fill hit->track "map" + numberOfBlocks = (CAConstants::maxNumberOfQuadruplets() + blockSize - 1) / blockSize; + kernel_countHitInTracks<<>>( + tuples_d, quality_d, device_hitToTuple_.get()); + cudaCheck(cudaGetLastError()); + cudautils::launchFinalize(device_hitToTuple_.get(), device_tmws_, cudaStream); + cudaCheck(cudaGetLastError()); + kernel_fillHitInTracks<<>>( + tuples_d, quality_d, device_hitToTuple_.get()); + cudaCheck(cudaGetLastError()); + } + if (m_params.minHitsPerNtuplet_<4) { + // remove duplicates (tracks that share a hit) + numberOfBlocks = (HitToTuple::capacity() + blockSize - 1) / blockSize; + kernel_tripletCleaner<<>>( + hh.view(), tuples_d, tracks_d, quality_d, device_hitToTuple_.get()); + cudaCheck(cudaGetLastError()); + } + if (m_params.doStats_) { + // counters (add flag???) + numberOfBlocks = (HitToTuple::capacity() + blockSize - 1) / blockSize; + kernel_doStatsForHitInTracks<<>>(device_hitToTuple_.get(), counters_); + cudaCheck(cudaGetLastError()); + numberOfBlocks = (CAConstants::maxNumberOfQuadruplets() + blockSize - 1) / blockSize; + kernel_doStatsForTracks<<>>(tuples_d, quality_d, counters_); + cudaCheck(cudaGetLastError()); + } +#ifdef GPU_DEBUG + cudaDeviceSynchronize(); + cudaCheck(cudaGetLastError()); +#endif + +#ifdef DUMP_GPU_TK_TUPLES + static std::atomic iev(0); + ++iev; + kernel_print_found_ntuplets<<<1, 32, 0, cudaStream>>>(hh.view(), tuples_d, tracks_d, quality_d, device_hitToTuple_.get(), 100,iev); +#endif + +} + +__global__ void kernel_printCounters(CAHitNtupletGeneratorKernels::Counters const *counters) { + auto const &c = *counters; + printf( + "||Counters | nEvents | nHits | nCells | nTuples | nFitTacks | nGoodTracks | nUsedHits | nDupHits | nKilledCells | " + "nEmptyCells | nZeroTrackCells ||\n"); + printf("Counters Raw %lld %lld %lld %lld %lld %lld %lld %lld %lld %lld %lld\n", + c.nEvents, + c.nHits, + c.nCells, + c.nTuples, + c.nGoodTracks, + c.nFitTracks, + c.nUsedHits, + c.nDupHits, + c.nKilledCells, + c.nEmptyCells, + c.nZeroTrackCells); + printf("Counters Norm %lld || %.1f| %.1f| %.1f| %.1f| %.1f| %.1f| %.1f| %.1f| %.3f| %.3f||\n", + c.nEvents, + c.nHits / double(c.nEvents), + c.nCells / double(c.nEvents), + c.nTuples / double(c.nEvents), + c.nFitTracks / double(c.nEvents), + c.nGoodTracks / double(c.nEvents), + c.nUsedHits / double(c.nEvents), + c.nDupHits / double(c.nEvents), + c.nKilledCells / double(c.nEvents), + c.nEmptyCells / double(c.nCells), + c.nZeroTrackCells / double(c.nCells)); +} + +void CAHitNtupletGeneratorKernels::printCounters(Counters const * counters) { + kernel_printCounters<<<1, 1>>>(counters); +} + + diff --git a/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorKernels.h b/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorKernels.h new file mode 100644 index 0000000000000..147ba98310c14 --- /dev/null +++ b/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorKernels.h @@ -0,0 +1,186 @@ +#ifndef RecoPixelVertexing_PixelTriplets_plugins_CAHitNtupletGeneratorKernels_h +#define RecoPixelVertexing_PixelTriplets_plugins_CAHitNtupletGeneratorKernels_h + +#include "HeterogeneousCore/CUDAServices/interface/CUDAService.h" +#include "HeterogeneousCore/CUDAUtilities/interface/device_unique_ptr.h" + +#include "CUDADataFormats/Track/interface/PixelTrackHeterogeneous.h" +#include "GPUCACell.h" + +// #define DUMP_GPU_TK_TUPLES + + +class CAHitNtupletGeneratorKernels { +public: + // counters + struct Counters { + unsigned long long nEvents; + unsigned long long nHits; + unsigned long long nCells; + unsigned long long nTuples; + unsigned long long nFitTracks; + unsigned long long nGoodTracks; + unsigned long long nUsedHits; + unsigned long long nDupHits; + unsigned long long nKilledCells; + unsigned long long nEmptyCells; + unsigned long long nZeroTrackCells; + }; + + using HitsOnGPU = TrackingRecHit2DSOAView; + using HitsOnCPU = TrackingRecHit2DCUDA; + + using HitToTuple = CAConstants::HitToTuple; + using TupleMultiplicity = CAConstants::TupleMultiplicity; + + using Quality = pixelTrack::Quality; + using TkSoA = pixelTrack::TrackSoA; + using HitContainer = pixelTrack::HitContainer; + + + struct QualityCuts { + // chi2 cut = chi2Scale * (chi2Coeff[0] + pT/GeV * (chi2Coeff[1] + pT/GeV * (chi2Coeff[2] + pT/GeV * chi2Coeff[3]))) + float chi2Coeff[4]; + float chi2MaxPt; // GeV + float chi2Scale; + + struct region { + float maxTip; // cm + float minPt; // GeV + float maxZip; // cm + }; + + region triplet; + region quadruplet; + }; + + + // params + struct Params { + Params(uint32_t minHitsPerNtuplet, + bool useRiemannFit, + bool fit5as4, + bool includeJumpingForwardDoublets, + bool earlyFishbone, + bool lateFishbone, + bool idealConditions, + bool doStats, + bool doClusterCut, + bool doZCut, + bool doPhiCut, + float ptmin, + float CAThetaCutBarrel, + float CAThetaCutForward, + float hardCurvCut, + float dcaCutInnerTriplet, + float dcaCutOuterTriplet, + QualityCuts const& cuts) + : minHitsPerNtuplet_(minHitsPerNtuplet), + useRiemannFit_(useRiemannFit), + fit5as4_(fit5as4), + includeJumpingForwardDoublets_(includeJumpingForwardDoublets), + earlyFishbone_(earlyFishbone), + lateFishbone_(lateFishbone), + idealConditions_(idealConditions), + doStats_(doStats), + doClusterCut_(doClusterCut), + doZCut_(doZCut), + doPhiCut_(doPhiCut), + ptmin_(ptmin), + CAThetaCutBarrel_(CAThetaCutBarrel), + CAThetaCutForward_(CAThetaCutForward), + hardCurvCut_(hardCurvCut), + dcaCutInnerTriplet_(dcaCutInnerTriplet), + dcaCutOuterTriplet_(dcaCutOuterTriplet), + cuts_(cuts) { } + + const uint32_t minHitsPerNtuplet_; + const bool useRiemannFit_; + const bool fit5as4_; + const bool includeJumpingForwardDoublets_; + const bool earlyFishbone_; + const bool lateFishbone_; + const bool idealConditions_; + const bool doStats_; + const bool doClusterCut_; + const bool doZCut_; + const bool doPhiCut_; + const float ptmin_; + const float CAThetaCutBarrel_; + const float CAThetaCutForward_; + const float hardCurvCut_; + const float dcaCutInnerTriplet_; + const float dcaCutOuterTriplet_; + + // quality cuts + QualityCuts cuts_ + { + // polynomial coefficients for the pT-dependent chi2 cut + { 0.68177776, 0.74609577, -0.08035491, 0.00315399 }, + // max pT used to determine the chi2 cut + 10., + // chi2 scale factor: 30 for broken line fit, 45 for Riemann fit + 30., + // regional cuts for triplets + { + 0.3, // |Tip| < 0.3 cm + 0.5, // pT > 0.5 GeV + 12.0 // |Zip| < 12.0 cm + }, + // regional cuts for quadruplets + { + 0.5, // |Tip| < 0.5 cm + 0.3, // pT > 0.3 GeV + 12.0 // |Zip| < 12.0 cm + } + }; + + }; // Params + + + CAHitNtupletGeneratorKernels(Params const & params) : m_params(params){} + ~CAHitNtupletGeneratorKernels() = default; + + TupleMultiplicity const* tupleMultiplicity() const { return device_tupleMultiplicity_.get(); } + + void launchKernels(HitsOnCPU const& hh, TkSoA * tuples_d, cudaStream_t cudaStream); + + void classifyTuples(HitsOnCPU const& hh, TkSoA * tuples_d, cudaStream_t cudaStream); + + void fillHitDetIndices(HitsOnCPU const &hh, TkSoA * tuples_d, cudaStream_t cudaStream); + + void buildDoublets(HitsOnCPU const& hh, cuda::stream_t<>& stream); + void allocateOnGPU(cuda::stream_t<>& stream); + void cleanup(cudaStream_t cudaStream); + + static void printCounters(Counters const * counters); + Counters* counters_ = nullptr; + + +private: + + // workspace + CAConstants::CellNeighborsVector* device_theCellNeighbors_ = nullptr; + cudautils::device::unique_ptr device_theCellNeighborsContainer_; + CAConstants::CellTracksVector* device_theCellTracks_ = nullptr; + cudautils::device::unique_ptr device_theCellTracksContainer_; + + cudautils::device::unique_ptr device_theCells_; + cudautils::device::unique_ptr device_isOuterHitOfCell_; + uint32_t* device_nCells_ = nullptr; + + cudautils::device::unique_ptr device_hitToTuple_; + AtomicPairCounter* device_hitToTuple_apc_ = nullptr; + + AtomicPairCounter* device_hitTuple_apc_ = nullptr; + + cudautils::device::unique_ptr device_tupleMultiplicity_; + + uint8_t * device_tmws_; + + cudautils::device::unique_ptr device_storage_; + // params + Params const & m_params; +}; + +#endif // RecoPixelVertexing_PixelTriplets_plugins_CAHitNtupletGeneratorKernels_h diff --git a/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorKernelsAlloc.cu b/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorKernelsAlloc.cu new file mode 100644 index 0000000000000..126b6237bd0d7 --- /dev/null +++ b/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorKernelsAlloc.cu @@ -0,0 +1,40 @@ +#include "CAHitNtupletGeneratorKernels.h" + +#include "FWCore/ServiceRegistry/interface/Service.h" +#include "HeterogeneousCore/CUDAServices/interface/CUDAService.h" +#include "HeterogeneousCore/CUDAUtilities/interface/cudaCheck.h" + +void CAHitNtupletGeneratorKernels::allocateOnGPU(cuda::stream_t<>& stream) { + ////////////////////////////////////////////////////////// + // ALLOCATIONS FOR THE INTERMEDIATE RESULTS (STAYS ON WORKER) + ////////////////////////////////////////////////////////// + + edm::Service cs; + + /* not used at the moment + cudaCheck(cudaMalloc(&device_theCellNeighbors_, sizeof(CAConstants::CellNeighborsVector))); + cudaCheck(cudaMemset(device_theCellNeighbors_, 0, sizeof(CAConstants::CellNeighborsVector))); + cudaCheck(cudaMalloc(&device_theCellTracks_, sizeof(CAConstants::CellTracksVector))); + cudaCheck(cudaMemset(device_theCellTracks_, 0, sizeof(CAConstants::CellTracksVector))); + */ + + device_hitToTuple_ = cs->make_device_unique(stream); + + device_tupleMultiplicity_ = cs->make_device_unique(stream); + + auto storageSize = 3+(std::max(TupleMultiplicity::wsSize(), HitToTuple::wsSize())+sizeof(AtomicPairCounter::c_type))/sizeof(AtomicPairCounter::c_type); + + device_storage_ = cs->make_device_unique(storageSize,stream); + + device_hitTuple_apc_ = (AtomicPairCounter*)device_storage_.get(); + device_hitToTuple_apc_ = (AtomicPairCounter*)device_storage_.get()+1; + device_nCells_ = (uint32_t *)(device_storage_.get()+2); + device_tmws_ = (uint8_t*)(device_storage_.get()+3); + + assert(device_tmws_+std::max(TupleMultiplicity::wsSize(), HitToTuple::wsSize()) <= (uint8_t*)(device_storage_.get()+storageSize)); + + cudaCheck(cudaMemsetAsync(device_nCells_, 0, sizeof(uint32_t), stream.id())); + cudautils::launchZero(device_tupleMultiplicity_.get(), stream.id()); + cudautils::launchZero(device_hitToTuple_.get(), stream.id()); // we may wish to keep it in the edm... +} + diff --git a/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorOnGPU.cc b/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorOnGPU.cc new file mode 100644 index 0000000000000..08cafc7e8fc09 --- /dev/null +++ b/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorOnGPU.cc @@ -0,0 +1,164 @@ +// +// Original Author: Felice Pantaleo, CERN +// + +#include +#include +#include +#include + +#include "DataFormats/Common/interface/Handle.h" +#include "FWCore/Framework/interface/ConsumesCollector.h" +#include "FWCore/Framework/interface/Event.h" +#include "FWCore/MessageLogger/interface/MessageLogger.h" +#include "FWCore/ParameterSet/interface/ParameterSetDescription.h" +#include "FWCore/Utilities/interface/EDMException.h" +#include "FWCore/Utilities/interface/isFinite.h" +#include "TrackingTools/DetLayers/interface/BarrelDetLayer.h" + +#include "CAHitNtupletGeneratorOnGPU.h" + +namespace { + + template + T sqr(T x) { + return x * x; + } + + CAHitNtupletGeneratorKernels::QualityCuts makeQualityCuts(edm::ParameterSet const& pset) { + auto coeff = pset.getParameter>("chi2Coeff"); + if (coeff.size() != 4) { + throw edm::Exception(edm::errors::Configuration, "CAHitNtupletGeneratorOnGPU.trackQualityCuts.chi2Coeff must have 4 elements"); + } + return CAHitNtupletGeneratorKernels::QualityCuts { + // polynomial coefficients for the pT-dependent chi2 cut + { (float) coeff[0], (float) coeff[1], (float) coeff[2], (float) coeff[3] }, + // max pT used to determine the chi2 cut + (float) pset.getParameter("chi2MaxPt"), + // chi2 scale factor: 30 for broken line fit, 45 for Riemann fit + (float) pset.getParameter("chi2Scale"), + // regional cuts for triplets + { + (float) pset.getParameter("tripletMaxTip"), + (float) pset.getParameter("tripletMinPt"), + (float) pset.getParameter("tripletMaxZip") + }, + // regional cuts for quadruplets + { + (float) pset.getParameter("quadrupletMaxTip"), + (float) pset.getParameter("quadrupletMinPt"), + (float) pset.getParameter("quadrupletMaxZip") + } + }; + } + +} // namespace + +using namespace std; + +CAHitNtupletGeneratorOnGPU::CAHitNtupletGeneratorOnGPU(const edm::ParameterSet &cfg, edm::ConsumesCollector &iC) + : m_params(cfg.getParameter("minHitsPerNtuplet"), + cfg.getParameter("useRiemannFit"), + cfg.getParameter("fit5as4"), + cfg.getParameter("includeJumpingForwardDoublets"), + cfg.getParameter("earlyFishbone"), + cfg.getParameter("lateFishbone"), + cfg.getParameter("idealConditions"), + cfg.getParameter("fillStatistics"), + cfg.getParameter("doClusterCut"), + cfg.getParameter("doZCut"), + cfg.getParameter("doPhiCut"), + cfg.getParameter("ptmin"), + cfg.getParameter("CAThetaCutBarrel"), + cfg.getParameter("CAThetaCutForward"), + cfg.getParameter("hardCurvCut"), + cfg.getParameter("dcaCutInnerTriplet"), + cfg.getParameter("dcaCutOuterTriplet"), + makeQualityCuts(cfg.getParameterSet("trackQualityCuts"))) { + +#ifdef DUMP_GPU_TK_TUPLES + printf("TK: %s %s % %s %s %s %s %s %s %s %s %s %s %s %s %s\n", + "tid", "qual", "nh","charge","pt","eta","phi","tip","zip","chi2", + "h1","h2","h3","h4","h5"); +#endif + + cudaCheck(cudaMalloc(&m_counters, sizeof(Counters))); + cudaCheck(cudaMemset(m_counters, 0, sizeof(Counters))); + +} + +CAHitNtupletGeneratorOnGPU::~CAHitNtupletGeneratorOnGPU(){ + if (m_params.doStats_) { + // crash on multi-gpu processes + CAHitNtupletGeneratorKernels::printCounters(m_counters); + } + cudaFree(m_counters); +} + + +void CAHitNtupletGeneratorOnGPU::fillDescriptions(edm::ParameterSetDescription &desc) { + // 87 cm/GeV = 1/(3.8T * 0.3) + // take less than radius given by the hardPtCut and reject everything below + // auto hardCurvCut = 1.f/(0.35 * 87.f); + desc.add("ptmin", 0.9f)->setComment("Cut on minimum pt"); + desc.add("CAThetaCutBarrel", 0.002f)->setComment("Cut on RZ alignement for Barrel"); + desc.add("CAThetaCutForward", 0.003f)->setComment("Cut on RZ alignment for Forward"); + desc.add("hardCurvCut", 1.f / (0.35 * 87.f))->setComment("Cut on minimum curvature"); + desc.add("dcaCutInnerTriplet", 0.15f)->setComment("Cut on origin radius when the inner hit is on BPix1"); + desc.add("dcaCutOuterTriplet", 0.25f)->setComment("Cut on origin radius when the outer hit is on BPix1"); + desc.add("earlyFishbone", true); + desc.add("lateFishbone", false); + desc.add("idealConditions", true); + desc.add("fillStatistics", false); + desc.add("minHitsPerNtuplet", 4); + desc.add("includeJumpingForwardDoublets", false); + desc.add("fit5as4", true); + desc.add("doClusterCut", true); + desc.add("doZCut", true); + desc.add("doPhiCut", true); + desc.add("useRiemannFit", false)->setComment("true for Riemann, false for BrokenLine"); + + edm::ParameterSetDescription trackQualityCuts; + trackQualityCuts.add("chi2MaxPt", 10.)->setComment("max pT used to determine the pT-dependent chi2 cut"); + trackQualityCuts.add>("chi2Coeff", { 0.68177776, 0.74609577, -0.08035491, 0.00315399 }) + ->setComment("Polynomial coefficients to derive the pT-dependent chi2 cut"); + trackQualityCuts.add("chi2Scale", 30.)->setComment("Factor to multiply the pT-dependent chi2 cut (currently: 30 for the broken line fit, 45 for the Riemann fit)"); + trackQualityCuts.add("tripletMinPt", 0.5)->setComment("Min pT for triplets, in GeV"); + trackQualityCuts.add("tripletMaxTip", 0.3)->setComment("Max |Tip| for triplets, in cm"); + trackQualityCuts.add("tripletMaxZip", 12.)->setComment("Max |Zip| for triplets, in cm"); + trackQualityCuts.add("quadrupletMinPt", 0.3)->setComment("Min pT for quadruplets, in GeV"); + trackQualityCuts.add("quadrupletMaxTip", 0.5)->setComment("Max |Tip| for quadruplets, in cm"); + trackQualityCuts.add("quadrupletMaxZip", 12.)->setComment("Max |Zip| for quadruplets, in cm"); + desc.add("trackQualityCuts", trackQualityCuts) + ->setComment("Quality cuts based on the results of the track fit:\n - apply a pT-dependent chi2 cut;\n - apply \"region cuts\" based on the fit results (pT, Tip, Zip)."); +} + + +PixelTrackHeterogeneous CAHitNtupletGeneratorOnGPU::makeTuplesAsync(TrackingRecHit2DCUDA const& hits_d, + float bfield, + cuda::stream_t<>& stream) const { + edm::Service cs; + PixelTrackHeterogeneous tracks(cs->make_device_unique(stream)); + + auto * soa = tracks.get(); + + CAHitNtupletGeneratorKernels kernels(m_params); + kernels.counters_ = m_counters; + HelixFitOnGPU fitter(bfield,m_params.fit5as4_); + + kernels.allocateOnGPU(stream); + fitter.allocateOnGPU(&(soa->hitIndices), kernels.tupleMultiplicity(), soa); + + kernels.buildDoublets(hits_d, stream); + kernels.launchKernels(hits_d, soa, stream.id()); + kernels.fillHitDetIndices(hits_d, soa, stream.id()); // in principle needed only if Hits not "available" + if (m_params.useRiemannFit_) { + fitter.launchRiemannKernels(hits_d, hits_d.nHits(), CAConstants::maxNumberOfQuadruplets(), stream); + } else { + fitter.launchBrokenLineKernels(hits_d, hits_d.nHits(), CAConstants::maxNumberOfQuadruplets(), stream); + } + kernels.classifyTuples(hits_d, soa, stream.id()); + + return tracks; +} + diff --git a/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorOnGPU.h b/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorOnGPU.h new file mode 100644 index 0000000000000..169f591c48e45 --- /dev/null +++ b/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorOnGPU.h @@ -0,0 +1,73 @@ +#ifndef RecoPixelVertexing_PixelTriplets_plugins_CAHitNtupletGeneratorOnGPU_h +#define RecoPixelVertexing_PixelTriplets_plugins_CAHitNtupletGeneratorOnGPU_h + +#include +#include "CUDADataFormats/TrackingRecHit/interface/TrackingRecHit2DCUDA.h" +#include "CUDADataFormats/Track/interface/PixelTrackHeterogeneous.h" + + +#include "DataFormats/SiPixelDetId/interface/PixelSubdetector.h" +#include "FWCore/ParameterSet/interface/ParameterSet.h" +#include "FWCore/Utilities/interface/EDGetToken.h" +#include "HeterogeneousCore/CUDAServices/interface/CUDAService.h" +#include "HeterogeneousCore/CUDAUtilities/interface/GPUSimpleVector.h" + +#include "CAHitNtupletGeneratorKernels.h" +#include "HelixFitOnGPU.h" + +// FIXME (split header???) +#include "GPUCACell.h" + +namespace edm { + class Event; + class EventSetup; + class ParameterSetDescription; +} // namespace edm + +class CAHitNtupletGeneratorOnGPU { +public: + using HitsOnGPU = TrackingRecHit2DSOAView; + using HitsOnCPU = TrackingRecHit2DCUDA; + using hindex_type = TrackingRecHit2DSOAView::hindex_type; + + using Quality = pixelTrack::Quality; + using OutputSoA = pixelTrack::TrackSoA; + using HitContainer = pixelTrack::HitContainer; + using Tuple = HitContainer; + +public: + CAHitNtupletGeneratorOnGPU(const edm::ParameterSet& cfg, edm::ConsumesCollector&& iC) + : CAHitNtupletGeneratorOnGPU(cfg, iC) {} + CAHitNtupletGeneratorOnGPU(const edm::ParameterSet& cfg, edm::ConsumesCollector& iC); + + ~CAHitNtupletGeneratorOnGPU(); + + static void fillDescriptions(edm::ParameterSetDescription& desc); + static const char* fillDescriptionsLabel() { return "caHitNtupletOnGPU"; } + + PixelTrackHeterogeneous makeTuplesAsync(TrackingRecHit2DCUDA const& hits_d, + float bfield, + cuda::stream_t<>& stream) const; + +private: + + void buildDoublets(HitsOnCPU const& hh, cuda::stream_t<>& stream) const; + + void hitNtuplets(HitsOnCPU const& hh, + const edm::EventSetup& es, + bool useRiemannFit, + cuda::stream_t<>& cudaStream); + + void cleanup(cudaStream_t stream); + + void launchKernels(HitsOnCPU const& hh, bool useRiemannFit, cuda::stream_t<>& cudaStream) const; + + + CAHitNtupletGeneratorKernels::Params m_params; + + using Counters = CAHitNtupletGeneratorKernels::Counters; + CAHitNtupletGeneratorKernels::Counters * m_counters = nullptr; + +}; + +#endif // RecoPixelVertexing_PixelTriplets_plugins_CAHitNtupletGeneratorOnGPU_h diff --git a/RecoPixelVertexing/PixelTriplets/plugins/GPUCACell.h b/RecoPixelVertexing/PixelTriplets/plugins/GPUCACell.h index 2beb8b889c94c..f1709f7ae7063 100644 --- a/RecoPixelVertexing/PixelTriplets/plugins/GPUCACell.h +++ b/RecoPixelVertexing/PixelTriplets/plugins/GPUCACell.h @@ -15,7 +15,8 @@ #include "HeterogeneousCore/CUDAUtilities/interface/GPUVecArray.h" #include "HeterogeneousCore/CUDAUtilities/interface/cuda_assert.h" #include "RecoPixelVertexing/PixelTriplets/interface/CircleEq.h" -#include "RecoPixelVertexing/PixelTriplets/plugins/pixelTuplesHeterogeneousProduct.h" +#include "CUDADataFormats/Track/interface/PixelTrackHeterogeneous.h" +#include "CAConstants.h" class GPUCACell { public: @@ -33,10 +34,9 @@ class GPUCACell { using TmpTuple = GPU::VecArray; - using TuplesOnGPU = pixelTuplesHeterogeneousProduct::TuplesOnGPU; - - using Quality = pixelTuplesHeterogeneousProduct::Quality; - static constexpr auto bad = pixelTuplesHeterogeneousProduct::bad; + using HitContainer = pixelTrack::HitContainer; + using Quality = trackQuality::Quality; + static constexpr auto bad = trackQuality::bad; GPUCACell() = default; #ifdef __CUDACC__ @@ -249,7 +249,7 @@ class GPUCACell { __device__ inline void find_ntuplets(Hits const& hh, GPUCACell* __restrict__ cells, CellTracksVector& cellTracks, - TuplesOnGPU::Container& foundNtuplets, + HitContainer & foundNtuplets, AtomicPairCounter& apc, Quality* __restrict__ quality, TmpTuple& tmpNtuplet, diff --git a/RecoPixelVertexing/PixelTriplets/plugins/HelixFitOnGPU.cc b/RecoPixelVertexing/PixelTriplets/plugins/HelixFitOnGPU.cc index a374c975ef4b6..c071cdd347808 100644 --- a/RecoPixelVertexing/PixelTriplets/plugins/HelixFitOnGPU.cc +++ b/RecoPixelVertexing/PixelTriplets/plugins/HelixFitOnGPU.cc @@ -1,16 +1,16 @@ #include "HeterogeneousCore/CUDAUtilities/interface/cudaCheck.h" #include "HelixFitOnGPU.h" -void HelixFitOnGPU::allocateOnGPU(TuplesOnGPU::Container const* tuples, - TupleMultiplicity const* tupleMultiplicity, - Rfit::helix_fit* helix_fit_results) { +void HelixFitOnGPU::allocateOnGPU(Tuples const *tuples, + TupleMultiplicity const *tupleMultiplicity, + OutputSoA *helix_fit_results) { tuples_d = tuples; tupleMultiplicity_d = tupleMultiplicity; - helix_fit_results_d = helix_fit_results; + outputSoa_d = helix_fit_results; assert(tuples_d); assert(tupleMultiplicity_d); - assert(helix_fit_results_d); + assert(outputSoa_d); } void HelixFitOnGPU::deallocateOnGPU() {} diff --git a/RecoPixelVertexing/PixelTriplets/plugins/HelixFitOnGPU.h b/RecoPixelVertexing/PixelTriplets/plugins/HelixFitOnGPU.h index 022a6ba5ae623..c9688fc43418c 100644 --- a/RecoPixelVertexing/PixelTriplets/plugins/HelixFitOnGPU.h +++ b/RecoPixelVertexing/PixelTriplets/plugins/HelixFitOnGPU.h @@ -4,10 +4,9 @@ #include #include "RecoPixelVertexing/PixelTrackFitting/interface/FitResult.h" -#include "RecoPixelVertexing/PixelTriplets/plugins/pixelTuplesHeterogeneousProduct.h" - -class TrackingRecHit2DSOAView; -class TrackingRecHit2DCUDA; +#include "CUDADataFormats/Track/interface/PixelTrackHeterogeneous.h" +#include "CUDADataFormats/TrackingRecHit/interface/TrackingRecHit2DCUDA.h" +#include "CAConstants.h" namespace Rfit { // in case of memory issue can be made smaller @@ -38,10 +37,12 @@ class HelixFitOnGPU { using HitsOnGPU = TrackingRecHit2DSOAView; using HitsOnCPU = TrackingRecHit2DCUDA; - using TuplesOnGPU = pixelTuplesHeterogeneousProduct::TuplesOnGPU; + using Tuples = pixelTrack::HitContainer; + using OutputSoA = pixelTrack::TrackSoA; + using TupleMultiplicity = CAConstants::TupleMultiplicity; - explicit HelixFitOnGPU(bool fit5as4) : fit5as4_(fit5as4) {} + explicit HelixFitOnGPU(float bf, bool fit5as4) : bField_(bf), fit5as4_(fit5as4) {} ~HelixFitOnGPU() { deallocateOnGPU(); } void setBField(double bField) { bField_ = bField; } @@ -54,19 +55,19 @@ class HelixFitOnGPU { uint32_t maxNumberOfTuples, cuda::stream_t<> &cudaStream); - void allocateOnGPU(TuplesOnGPU::Container const *tuples, + void allocateOnGPU(Tuples const *tuples, TupleMultiplicity const *tupleMultiplicity, - Rfit::helix_fit *helix_fit_results); + OutputSoA * outputSoA); void deallocateOnGPU(); private: static constexpr uint32_t maxNumberOfConcurrentFits_ = Rfit::maxNumberOfConcurrentFits(); // fowarded - TuplesOnGPU::Container const *tuples_d = nullptr; + Tuples const *tuples_d = nullptr; TupleMultiplicity const *tupleMultiplicity_d = nullptr; - double bField_; - Rfit::helix_fit *helix_fit_results_d = nullptr; + OutputSoA * outputSoa_d; + float bField_; const bool fit5as4_; }; diff --git a/RecoPixelVertexing/PixelTriplets/plugins/RiemannFitOnGPU.cu b/RecoPixelVertexing/PixelTriplets/plugins/RiemannFitOnGPU.cu index 4aea729e913a6..38f51e676a9ca 100644 --- a/RecoPixelVertexing/PixelTriplets/plugins/RiemannFitOnGPU.cu +++ b/RecoPixelVertexing/PixelTriplets/plugins/RiemannFitOnGPU.cu @@ -17,12 +17,13 @@ #include "HelixFitOnGPU.h" using HitsOnGPU = TrackingRecHit2DSOAView; -using TuplesOnGPU = pixelTuplesHeterogeneousProduct::TuplesOnGPU; +using Tuples = pixelTrack::HitContainer; +using OutputSoA = pixelTrack::TrackSoA; using namespace Eigen; template -__global__ void kernelFastFit(TuplesOnGPU::Container const *__restrict__ foundNtuplets, +__global__ void kernelFastFit(Tuples const *__restrict__ foundNtuplets, CAConstants::TupleMultiplicity const *__restrict__ tupleMultiplicity, uint32_t nHits, HitsOnGPU const *__restrict__ hhp, @@ -51,17 +52,17 @@ __global__ void kernelFastFit(TuplesOnGPU::Container const *__restrict__ foundNt return; // get it from the ntuple container (one to one to helix) - auto helix_start = *(tupleMultiplicity->begin(nHits) + tuple_start); - assert(helix_start < foundNtuplets->nbins()); + auto tkid = *(tupleMultiplicity->begin(nHits) + tuple_start); + assert(tkid < foundNtuplets->nbins()); - assert(foundNtuplets->size(helix_start) == nHits); + assert(foundNtuplets->size(tkid) == nHits); Rfit::Map3xNd hits(phits + local_start); Rfit::Map4d fast_fit(pfast_fit + local_start); Rfit::Map6xNf hits_ge(phits_ge + local_start); // Prepare data structure - auto const *hitId = foundNtuplets->begin(helix_start); + auto const *hitId = foundNtuplets->begin(tkid); for (unsigned int i = 0; i < hitsInFit; ++i) { auto hit = hitId[i]; // printf("Hit global: %f,%f,%f\n", hhp->xg_d[hit],hhp->yg_d[hit],hhp->zg_d[hit]); @@ -102,7 +103,7 @@ __global__ void kernelCircleFit(CAConstants::TupleMultiplicity const *__restrict return; // get it for the ntuple container (one to one to helix) - auto helix_start = *(tupleMultiplicity->begin(nHits) + tuple_start); + auto tkid = *(tupleMultiplicity->begin(nHits) + tuple_start); Rfit::Map3xNd hits(phits + local_start); Rfit::Map4d fast_fit(pfast_fit_input + local_start); @@ -116,7 +117,7 @@ __global__ void kernelCircleFit(CAConstants::TupleMultiplicity const *__restrict circle_fit[local_start] = Rfit::Circle_fit(hits.block(0, 0, 2, N), hits_cov, fast_fit, rad, B, true); #ifdef RIEMANN_DEBUG -// printf("kernelCircleFit circle.par(0,1,2): %d %f,%f,%f\n", helix_start, +// printf("kernelCircleFit circle.par(0,1,2): %d %f,%f,%f\n", tkid, // circle_fit[local_start].par(0), circle_fit[local_start].par(1), circle_fit[local_start].par(2)); #endif } @@ -125,7 +126,7 @@ template __global__ void kernelLineFit(CAConstants::TupleMultiplicity const *__restrict__ tupleMultiplicity, uint32_t nHits, double B, - Rfit::helix_fit *results, + OutputSoA *results, double *__restrict__ phits, float *__restrict__ phits_ge, double *__restrict__ pfast_fit_input, @@ -144,7 +145,7 @@ __global__ void kernelLineFit(CAConstants::TupleMultiplicity const *__restrict__ return; // get it for the ntuple container (one to one to helix) - auto helix_start = *(tupleMultiplicity->begin(nHits) + tuple_start); + auto tkid = *(tupleMultiplicity->begin(nHits) + tuple_start); Rfit::Map3xNd hits(phits + local_start); Rfit::Map4d fast_fit(pfast_fit_input + local_start); @@ -152,39 +153,31 @@ __global__ void kernelLineFit(CAConstants::TupleMultiplicity const *__restrict__ auto const &line_fit = Rfit::Line_fit(hits, hits_ge, circle_fit[local_start], fast_fit, B, true); - par_uvrtopak(circle_fit[local_start], B, true); + Rfit::fromCircleToPerigee(circle_fit[local_start]); - // Grab helix_fit from the proper location in the output vector - auto &helix = results[helix_start]; - helix.par << circle_fit[local_start].par, line_fit.par; - - // TODO: pass properly error booleans - - helix.cov = Rfit::Matrix5d::Zero(); - helix.cov.block(0, 0, 3, 3) = circle_fit[local_start].cov; - helix.cov.block(3, 3, 2, 2) = line_fit.cov; - - helix.q = circle_fit[local_start].q; - helix.chi2_circle = circle_fit[local_start].chi2; - helix.chi2_line = line_fit.chi2; + results->stateAtBS.copyFromCircle(circle_fit[local_start].par,circle_fit[local_start].cov, + line_fit.par,line_fit.cov,1.f/float(B),tkid); + results->pt(tkid) = B/std::abs(circle_fit[local_start].par(2)); + results->eta(tkid) = asinhf(line_fit.par(0)); + results->chi2(tkid) = (circle_fit[local_start].chi2+line_fit.chi2)/(2*N-5); #ifdef RIEMANN_DEBUG printf("kernelLineFit size %d for %d hits circle.par(0,1,2): %d %f,%f,%f\n", N, nHits, - helix_start, + tkid, circle_fit[local_start].par(0), circle_fit[local_start].par(1), circle_fit[local_start].par(2)); - printf("kernelLineFit line.par(0,1): %d %f,%f\n", helix_start, line_fit.par(0), line_fit.par(1)); + printf("kernelLineFit line.par(0,1): %d %f,%f\n", tkid, line_fit.par(0), line_fit.par(1)); printf("kernelLineFit chi2 cov %f/%f %e,%e,%e,%e,%e\n", - helix.chi2_circle, - helix.chi2_line, - helix.cov(0, 0), - helix.cov(1, 1), - helix.cov(2, 2), - helix.cov(3, 3), - helix.cov(4, 4)); + circle_fit[local_start].chi2, + line_fit.chi2, + circle_fit[local_start].cov(0, 0), + circle_fit[local_start].cov(1, 1), + circle_fit[local_start].cov(2, 2), + line_fit.cov(0, 0), + line_fit.cov(1, 1)); #endif } @@ -234,7 +227,7 @@ void HelixFitOnGPU::launchRiemannKernels(HitsOnCPU const &hh, kernelLineFit<3><<>>(tupleMultiplicity_d, 3, bField_, - helix_fit_results_d, + outputSoa_d, hitsGPU_.get(), hits_geGPU_.get(), fast_fit_resultsGPU_.get(), @@ -266,7 +259,7 @@ void HelixFitOnGPU::launchRiemannKernels(HitsOnCPU const &hh, kernelLineFit<4><<>>(tupleMultiplicity_d, 4, bField_, - helix_fit_results_d, + outputSoa_d, hitsGPU_.get(), hits_geGPU_.get(), fast_fit_resultsGPU_.get(), @@ -299,7 +292,7 @@ void HelixFitOnGPU::launchRiemannKernels(HitsOnCPU const &hh, kernelLineFit<4><<>>(tupleMultiplicity_d, 5, bField_, - helix_fit_results_d, + outputSoa_d, hitsGPU_.get(), hits_geGPU_.get(), fast_fit_resultsGPU_.get(), @@ -331,7 +324,7 @@ void HelixFitOnGPU::launchRiemannKernels(HitsOnCPU const &hh, kernelLineFit<5><<>>(tupleMultiplicity_d, 5, bField_, - helix_fit_results_d, + outputSoa_d, hitsGPU_.get(), hits_geGPU_.get(), fast_fit_resultsGPU_.get(), diff --git a/RecoTracker/TkSeedGenerator/plugins/SeedProducerFromSoA.cc b/RecoTracker/TkSeedGenerator/plugins/SeedProducerFromSoA.cc new file mode 100644 index 0000000000000..b10de55871185 --- /dev/null +++ b/RecoTracker/TkSeedGenerator/plugins/SeedProducerFromSoA.cc @@ -0,0 +1,177 @@ +#include "DataFormats/BeamSpot/interface/BeamSpot.h" +#include "DataFormats/TrackerCommon/interface/TrackerTopology.h" +#include "DataFormats/TrajectoryState/interface/LocalTrajectoryParameters.h" +#include "DataFormats/GeometrySurface/interface/Plane.h" +#include "DataFormats/TrajectorySeed/interface/TrajectorySeedCollection.h" +#include "DataFormats/TrackingRecHit/interface/InvalidTrackingRecHit.h" +#include "FWCore/Framework/interface/ESHandle.h" +#include "FWCore/Framework/interface/Event.h" +#include "FWCore/Framework/interface/EventSetup.h" +#include "FWCore/Framework/interface/MakerMacros.h" +#include "FWCore/Framework/interface/global/EDProducer.h" +#include "FWCore/Framework/interface/ConsumesCollector.h" +#include "FWCore/ParameterSet/interface/ConfigurationDescriptions.h" +#include "FWCore/ParameterSet/interface/ParameterSet.h" +#include "FWCore/ParameterSet/interface/ParameterSetDescription.h" +#include "FWCore/Utilities/interface/InputTag.h" +#include "FWCore/PluginManager/interface/ModuleDef.h" +#include "FWCore/ServiceRegistry/interface/Service.h" +#include "FWCore/Utilities/interface/EDGetToken.h" +#include "Geometry/Records/interface/TrackerTopologyRcd.h" +#include "MagneticField/Records/interface/IdealMagneticFieldRecord.h" +#include "HeterogeneousCore/CUDACore/interface/GPUCuda.h" +#include "HeterogeneousCore/CUDAServices/interface/CUDAService.h" +#include "Geometry/CommonDetUnit/interface/GeomDet.h" +#include "Geometry/TrackerGeometryBuilder/interface/TrackerGeometry.h" +#include "Geometry/Records/interface/TrackerDigiGeometryRecord.h" +#include "TrackingTools/MaterialEffects/interface/PropagatorWithMaterial.h" +#include "TrackingTools/Records/interface/TrackingComponentsRecord.h" +#include "TrackingTools/AnalyticalJacobians/interface/JacobianLocalToCurvilinear.h" +#include "TrackingTools/TrajectoryParametrization/interface/GlobalTrajectoryParameters.h" +#include "TrackingTools/TrajectoryParametrization/interface/CurvilinearTrajectoryError.h" +#include "TrackingTools/TrajectoryState/interface/TrajectoryStateTransform.h" + +#include "CUDADataFormats/Track/interface/PixelTrackHeterogeneous.h" +#include "RecoPixelVertexing/PixelTrackFitting/interface/FitUtils.h" + +/* + produces seeds directly from cuda produced tuples +*/ +class SeedProducerFromSoA : public edm::global::EDProducer<> { +public: + + explicit SeedProducerFromSoA(const edm::ParameterSet &iConfig); + ~SeedProducerFromSoA() override = default; + + static void fillDescriptions(edm::ConfigurationDescriptions &descriptions); + +private: + void produce(edm::StreamID streamID, edm::Event& iEvent, const edm::EventSetup& iSetup) const override; + + edm::EDGetTokenT tBeamSpot_; + edm::EDGetTokenT tokenTrack_; + + int32_t minNumberOfHits_; +}; + +SeedProducerFromSoA::SeedProducerFromSoA(const edm::ParameterSet &iConfig) : + tBeamSpot_(consumes(iConfig.getParameter("beamSpot"))), + tokenTrack_(consumes(iConfig.getParameter("src"))), + minNumberOfHits_(iConfig.getParameter("minNumberOfHits")) + +{ + produces(); +} + +void SeedProducerFromSoA::fillDescriptions(edm::ConfigurationDescriptions &descriptions) { + edm::ParameterSetDescription desc; + desc.add("beamSpot", edm::InputTag("offlineBeamSpot")); + desc.add("src", edm::InputTag("pixelTrackSoA")); + desc.add("minNumberOfHits", 0); + + descriptions.addWithDefaultLabel(desc); +} + +void SeedProducerFromSoA::produce(edm::StreamID streamID, edm::Event& iEvent, const edm::EventSetup& iSetup) const { + + // std::cout << "Converting gpu helix to trajectory seed" << std::endl; + auto result = std::make_unique(); + + + edm::ESHandle fieldESH; + iSetup.get().get(fieldESH); + + edm::ESHandle tracker; + iSetup.get().get(tracker); + auto const & dus = tracker->detUnits(); + + edm::ESHandle propagatorHandle; + iSetup.get().get("PropagatorWithMaterial",propagatorHandle); + const Propagator* propagator = &(*propagatorHandle); + + edm::ESHandle httopo; + iSetup.get().get(httopo); + + + const auto &bsh = iEvent.get(tBeamSpot_); + // std::cout << "beamspot " << bsh.x0() << ' ' << bsh.y0() << ' ' << bsh.z0() << std::endl; + GlobalPoint bs(bsh.x0(), bsh.y0(), bsh.z0()); + + const auto & tsoa = *(iEvent.get(tokenTrack_)); + + auto const * quality = tsoa.qualityData(); + auto const & fit = tsoa.stateAtBS; + auto const & detIndices = tsoa.detIndices; + auto maxTracks = tsoa.stride(); + + int32_t nt = 0; + for (int32_t it = 0; it < maxTracks; ++it) { + auto nHits = tsoa.nHits(it); + if (nHits == 0) break; // this is a guard: maybe we need to move to nTracks... + + auto q = quality[it]; + if (q != trackQuality::loose) + continue; // FIXME + if (nHits< minNumberOfHits_) continue; + ++nt; + + // fill hits with invalid just to hold the detId + auto b = detIndices.begin(it); + edm::OwnVector hits; + for (int iHit = 0; iHit < nHits; ++iHit) { + auto const * det = dus[*(b+iHit)]; + // FIXME at some point get a proper type ... + hits.push_back(new InvalidTrackingRecHit(*det,TrackingRecHit::bad)); + } + + + // mind: this values are respect the beamspot! + + float phi = tsoa.phi(it); + + Rfit::Vector5d ipar,opar; + Rfit::Matrix5d icov,ocov; + fit.copyToDense(ipar,icov,it); + Rfit::transformToPerigeePlane(ipar,icov,opar,ocov); + + LocalTrajectoryParameters lpar(opar(0),opar(1),opar(2),opar(3),opar(4),1.); + AlgebraicSymMatrix55 m; + for(int i=0; i<5; ++i) for (int j=i; j<5; ++j) m(i,j) = ocov(i,j); + + float sp = std::sin(phi); + float cp = std::cos(phi); + Surface::RotationType rot( + sp, -cp, 0, + 0, 0, -1.f, + cp, sp, 0); + + Plane impPointPlane(bs,rot); + GlobalTrajectoryParameters gp(impPointPlane.toGlobal(lpar.position()), + impPointPlane.toGlobal(lpar.momentum()),lpar.charge(),fieldESH.product()); + + JacobianLocalToCurvilinear jl2c(impPointPlane,lpar,*fieldESH.product()); + + AlgebraicSymMatrix55 mo = ROOT::Math::Similarity(jl2c.jacobian(),m); + + FreeTrajectoryState fts(gp, CurvilinearTrajectoryError(mo)); + + auto const & lastHit = hits.back(); + + TrajectoryStateOnSurface outerState = propagator->propagate(fts, *lastHit.surface()); + + if (!outerState.isValid()){ + edm::LogError("SeedFromGPU")<<" was trying to create a seed from:\n"<emplace_back(pTraj, hits, alongMomentum); + + } + + iEvent.put(std::move(result)); +} + +DEFINE_FWK_MODULE(SeedProducerFromSoA);