Port the whole pixel workflow to new heterogeneous framework (#384)

- port the whole pixel workflow to new heterogeneous framework - implement a legacy cluster to SoA converter for the pixel RecHits - update the vertex producer to run on CPU as well as GPU
cms-patatrack · Nov 27, 2020 · 0444702 · 0444702
1 parent fbe2705
commit 0444702
Show file tree

Hide file tree

Showing 31 changed files with 2,495 additions and 138 deletions.
diff --git a/CUDADataFormats/Track/BuildFile.xml b/CUDADataFormats/Track/BuildFile.xml
@@ -0,0 +1,10 @@
+<use name="cuda-api-wrappers"/>
+<use name="rootcore"/>
+<use name="FWCore/ServiceRegistry"/>
+<use name="FWCore/ParameterSetReader"/>
+<use name="HeterogeneousCore/CUDAServices"/>
+<use name="HeterogeneousCore/CUDAUtilities"/>
+<use name="eigen"/>
+<export>
+    <lib name="1"/>
+</export>
diff --git a/CUDADataFormats/Track/interface/PixelTrackHeterogeneous.h b/CUDADataFormats/Track/interface/PixelTrackHeterogeneous.h
@@ -0,0 +1,79 @@
+#ifndef CUDADataFormatsTrackTrackHeterogeneous_H
+#define CUDADataFormatsTrackTrackHeterogeneous_H
+
+#include "CUDADataFormats/Track/interface/TrajectoryStateSoA.h"
+#include "HeterogeneousCore/CUDAUtilities/interface/HistoContainer.h"
+
+#include "CUDADataFormats/Common/interface/HeterogeneousSoA.h"
+
+namespace trackQuality {
+  enum Quality : uint8_t { bad=0, dup, loose, strict, tight, highPurity };
+}
+
+template <int32_t S>
+class TrackSoAT {
+public:
+
+  static constexpr int32_t stride() { return S; }
+
+  using Quality = trackQuality::Quality;
+  using hindex_type = uint16_t;
+  using HitContainer = OneToManyAssoc<hindex_type, S, 5 * S>;
+
+  // Always check quality is at least loose!
+  // CUDA does not support enums  in __lgc ...
+  eigenSoA::ScalarSoA<uint8_t, S> m_quality;
+  constexpr Quality quality(int32_t i) const { return (Quality)(m_quality(i));}
+  constexpr Quality & quality(int32_t i) { return (Quality&)(m_quality(i));}
+  constexpr Quality const * qualityData() const { return (Quality const *)(m_quality.data());}
+  constexpr Quality * qualityData() { return (Quality*)(m_quality.data());}
+
+
+  // this is chi2/ndof as not necessarely all hits are used in the fit  
+  eigenSoA::ScalarSoA<float, S> chi2;
+
+  constexpr int nHits(int i) const { return detIndices.size(i);}
+
+  // State at the Beam spot
+  // phi,tip,1/pt,cotan(theta),zip
+  TrajectoryStateSoA<S> stateAtBS;
+  eigenSoA::ScalarSoA<float, S> eta;
+  eigenSoA::ScalarSoA<float, S> pt;
+  constexpr float charge(int32_t i) const { return std::copysign(1.f,stateAtBS.state(i)(2)); }
+  constexpr float phi(int32_t i) const { return stateAtBS.state(i)(0); }
+  constexpr float tip(int32_t i) const { return stateAtBS.state(i)(1); }
+  constexpr float zip(int32_t i) const { return stateAtBS.state(i)(4); }
+
+  // state at the detector of the outermost hit
+  // representation to be decided...
+  // not yet filled on GPU
+  // TrajectoryStateSoA<S> stateAtOuterDet;
+
+  HitContainer hitIndices;
+  HitContainer detIndices;
+
+  // total number of tracks (including those not fitted)
+  uint32_t m_nTracks;
+
+};
+
+namespace pixelTrack{
+
+#ifdef GPU_SMALL_EVENTS
+  constexpr uint32_t maxNumber() { return 2 * 1024;}
+#else
+  constexpr uint32_t maxNumber() { return 32 * 1024;}
+#endif
+
+  using TrackSoA = TrackSoAT<maxNumber()>;
+  using TrajectoryState = TrajectoryStateSoA<maxNumber()>;
+  using HitContainer = TrackSoA::HitContainer;
+  using Quality = trackQuality::Quality;
+
+}
+
+using PixelTrackHeterogeneous = HeterogeneousSoA<pixelTrack::TrackSoA>;
+
+
+#endif // CUDADataFormatsTrackTrackSoA_H
+
diff --git a/CUDADataFormats/Track/interface/TrajectoryStateSoA.h b/CUDADataFormats/Track/interface/TrajectoryStateSoA.h
@@ -0,0 +1,65 @@
+#ifndef CUDADataFormatsTrackTrajectoryStateSOA_H
+#define CUDADataFormatsTrackTrajectoryStateSOA_H
+
+#include <Eigen/Dense>
+#include "HeterogeneousCore/CUDAUtilities/interface/eigenSoA.h"
+
+template <int32_t S>
+struct TrajectoryStateSoA {
+
+  using Vector5f = Eigen::Matrix<float, 5, 1>;
+  using Vector15f = Eigen::Matrix<float, 15, 1>;
+
+  using Vector5d = Eigen::Matrix<double, 5, 1>;
+  using Matrix5d = Eigen::Matrix<double, 5, 5>;
+
+
+  static constexpr int32_t stride() { return S; }
+
+  eigenSoA::MatrixSoA<Vector5f,S> state;
+  eigenSoA::MatrixSoA<Vector15f,S> covariance;
+
+
+  template<typename V3, typename M3, typename V2, typename M2>
+  __host__ __device__
+  void copyFromCircle(V3 const & cp, M3 const & ccov, V2 const & lp, M2 const & lcov, float b, int32_t i) {
+     state(i) << cp.template cast<float>(), lp.template cast<float>();
+     state(i)(2) *=b;
+     auto cov =  covariance(i);
+     cov(0) = ccov(0,0);
+     cov(1) = ccov(0,1);
+     cov(2) = b*float(ccov(0,2));
+     cov(4) = cov(3) = 0;
+     cov(5) = ccov(1,1);
+     cov(6) = b*float(ccov(1,2));
+     cov(8) = cov(7) = 0;
+     cov(9) = b*b*float(ccov(2,2));
+     cov(11) = cov(10) = 0;
+     cov(12) = lcov(0,0);
+     cov(13) = lcov(0,1);
+     cov(14) = lcov(1,1);
+  }
+
+
+  template<typename V5, typename M5>
+  __host__ __device__
+  void copyFromDense(V5 const & v, M5 const & cov, int32_t i) {
+     state(i) = v.template cast<float>();
+     for(int j=0, ind=0; j<5; ++j) for (auto k=j;k<5;++k) covariance(i)(ind++) = cov(j,k); 
+  }
+
+  template<typename V5, typename M5>
+  __host__ __device__
+  void copyToDense(V5 & v, M5 & cov, int32_t i) const {
+     v = state(i).template cast<typename V5::Scalar>();
+     for(int j=0, ind=0; j<5; ++j) {
+        cov(j,j) = covariance(i)(ind++); 
+        for (auto k=j+1;k<5;++k) cov(k,j)=cov(j,k) = covariance(i)(ind++);
+     }
+  }
+
+};
+
+#endif // CUDADataFormatsTrackTrajectoryStateSOA_H
+
+
diff --git a/CUDADataFormats/Track/src/classes.h b/CUDADataFormats/Track/src/classes.h
@@ -0,0 +1,10 @@
+#ifndef CUDADataFormats__src_classes_h
+#define CUDADataFormats__src_classes_h
+
+#include "CUDADataFormats/Common/interface/CUDAProduct.h"
+#include "CUDADataFormats/Common/interface/HostProduct.h"
+#include "CUDADataFormats/Track/interface/PixelTrackHeterogeneous.h"
+#include "CUDADataFormats/Common/interface/ArrayShadow.h"
+#include "DataFormats/Common/interface/Wrapper.h"
+
+#endif  
diff --git a/CUDADataFormats/Track/src/classes_def.xml b/CUDADataFormats/Track/src/classes_def.xml
@@ -0,0 +1,8 @@
+<lcgdict>
+  <class name="CUDAProduct<HeterogeneousSoA<pixelTrack::TrackSoA>>" persistent="false"/>
+  <class name="edm::Wrapper<CUDAProduct<HeterogeneousSoA<pixelTrack::TrackSoA>>>" persistent="false"/>
+  <class name="HeterogeneousSoA<pixelTrack::TrackSoA>" persistent="false"/>
+  <class name="edm::Wrapper<HeterogeneousSoA<pixelTrack::TrackSoA>>" persistent="false"/>
+  <class name="ArrayShadow<std::array<unsigned int,2001>>" persistent="false"/>
+  <class name="edm::Wrapper<ArrayShadow<std::array<unsigned int,2001>>>" persistent="false"/>
+</lcgdict>
diff --git a/CUDADataFormats/Track/test/BuildFile.xml b/CUDADataFormats/Track/test/BuildFile.xml
@@ -0,0 +1,13 @@
+<use name="HeterogeneousCore/CUDAUtilities"/>
+
+<bin file="TrajectoryStateSOA_t.cpp" name="cpuTrajectoryStateSOA_t">
+  <use name="eigen"/>
+  <flags CXXFLAGS="-g -DGPU_DEBUG"/>
+</bin>
+
+<bin file="TrajectoryStateSOA_t.cu" name="gpuTrajectoryStateSOA_t">
+  <use name="eigen"/>
+  <flags CUDA_FLAGS="-g -DGPU_DEBUG"/>
+  <flags CXXFLAGS="-g -DGPU_DEBUG"/>
+</bin>
+
diff --git a/CUDADataFormats/Track/test/TrajectoryStateSOA_t.cpp b/CUDADataFormats/Track/test/TrajectoryStateSOA_t.cpp
@@ -0,0 +1 @@
+#include "TrajectoryStateSOA_t.h"
diff --git a/CUDADataFormats/Track/test/TrajectoryStateSOA_t.cu b/CUDADataFormats/Track/test/TrajectoryStateSOA_t.cu
@@ -0,0 +1 @@
+#include "TrajectoryStateSOA_t.h"
diff --git a/CUDADataFormats/Track/test/TrajectoryStateSOA_t.h b/CUDADataFormats/Track/test/TrajectoryStateSOA_t.h
@@ -0,0 +1,77 @@
+#include "CUDADataFormats/Track/interface/TrajectoryStateSoA.h"
+
+using Vector5d = Eigen::Matrix<double, 5, 1>;
+using Matrix5d = Eigen::Matrix<double, 5, 5>;
+
+__host__ __device__
+Matrix5d loadCov(Vector5d const & e) {
+  Matrix5d cov;
+  for (int i=0; i<5; ++i) cov(i,i) = e(i)*e(i);
+  for (int i = 0; i < 5; ++i) {
+    for (int j = 0; j < i; ++j) {
+      double v = 0.3*std::sqrt( cov(i,i) * cov(j,j) ); // this makes the matrix pos defined
+      cov(i,j) = (i+j)%2 ? -0.4*v  : 0.1*v;
+      cov(j,i) = cov(i,j);
+    }
+   }
+  return cov;
+}
+
+
+using TS = TrajectoryStateSoA<128>;
+
+__global__ void testTSSoA(TS * pts, int n) {
+
+  assert(n<=128);
+
+  Vector5d par0; par0 << 0.2,0.1,3.5,0.8,0.1;
+  Vector5d e0; e0 << 0.01,0.01,0.035,-0.03,-0.01;
+  auto cov0 = loadCov(e0);
+
+  TS & ts = *pts;
+
+  int first = threadIdx.x + blockIdx.x * blockDim.x;
+
+  for (int i = first; i < n; i += blockDim.x * gridDim.x) {
+    ts.copyFromDense(par0,cov0,i);
+    Vector5d par1; Matrix5d cov1;
+    ts.copyToDense(par1,cov1,i);
+    Vector5d delV = par1-par0;
+    Matrix5d delM = cov1-cov0;
+    for(int j=0; j<5; ++j) {
+      assert(std::abs(delV(j))<1.e-5);
+      for (auto k=j;k<5;++k) {
+        assert(cov0(k,j)==cov0(j,k));
+        assert(cov1(k,j)==cov1(j,k));
+        assert(std::abs(delM(k,j))<1.e-5);
+      }
+    }
+
+  }
+}
+
+#ifdef __CUDACC__
+#include "HeterogeneousCore/CUDAUtilities/interface/exitSansCUDADevices.h"
+#include "HeterogeneousCore/CUDAUtilities/interface/cudaCheck.h"
+#endif
+
+int main() {
+#ifdef __CUDACC__
+  exitSansCUDADevices();
+#endif
+
+
+  TS ts;
+
+#ifdef __CUDACC__
+  TS * ts_d;
+  cudaCheck(cudaMalloc(&ts_d, sizeof(TS)));
+  testTSSoA<<<1, 64>>>(ts_d,128);
+  cudaCheck(cudaGetLastError());
+  cudaCheck(cudaMemcpy(&ts, ts_d, sizeof(TS), cudaMemcpyDefault));
+  cudaCheck(cudaDeviceSynchronize());
+#else
+  testTSSoA(&ts,128);
+#endif
+
+}
diff --git a/RecoPixelVertexing/Configuration/python/customizePixelTracksForProfiling.py b/RecoPixelVertexing/Configuration/python/customizePixelTracksForProfiling.py
@@ -1,6 +1,41 @@
 import FWCore.ParameterSet.Config as cms
 
-def customizePixelTracksForProfiling(process):
+def customizePixelTracksForProfilingGPUOnly(process):
+    process.MessageLogger.cerr.FwkReport.reportEvery = 100
+
+    process.Raw2Hit = cms.Path(process.offlineBeamSpot+process.offlineBeamSpotCUDA+process.siPixelClustersCUDAPreSplitting+process.siPixelRecHitsCUDAPreSplitting)
+
+    process.load('RecoPixelVertexing.PixelTriplets.caHitNtupletCUDA_cfi')
+    process.load('RecoPixelVertexing.PixelVertexFinding.pixelVertexCUDA_cfi')
+    process.TVreco = cms.Path(process.caHitNtupletCUDA+process.pixelVertexCUDA)
+
+    process.schedule = cms.Schedule(process.Raw2Hit, process.TVreco)
+    return process
+
+def customizePixelTracksForProfilingSoAonCPU(process):
+    process = customizePixelTracksForProfilingGPUOnly(process)
+
+    process.pixelVertexSoA = process.pixelVertexCUDA.clone()
+    process.pixelVertexSoA.onGPU = False
+    process.pixelVertexSoA.pixelTrackSrc = 'pixelTrackSoA'
+    process.TVSoAreco = cms.Path(process.caHitNtupletCUDA+process.pixelTrackSoA+process.pixelVertexSoA)
+
+    process.schedule = cms.Schedule(process.Raw2Hit, process.TVSoAreco)
+
+    return process
+
+def customizePixelTracksForProfilingEnableTransfer(process):
+    process = customizePixelTracksForProfilingGPUOnly(process)
+
+    process.load('RecoPixelVertexing.PixelTrackFitting.pixelTrackSoA_cfi')
+    process.load('RecoPixelVertexing.PixelVertexFinding.pixelVertexSoA_cfi')
+    process.toSoA = cms.Path(process.pixelTrackSoA+process.pixelVertexSoA)
+
+    process.schedule = cms.Schedule(process.Raw2Hit, process.TVreco, process.toSoA)
+    return process
+
+def customizePixelTracksForProfilingEnableConversion(process):
+    # use old trick of output path
     process.MessageLogger.cerr.FwkReport.reportEvery = 100
 
     process.out = cms.OutputModule("AsciiOutputModule",
@@ -17,21 +52,3 @@ def customizePixelTracksForProfiling(process):
 
     return process
 
-def customizePixelTracksForProfilingDisableConversion(process):
-    process = customizePixelTracksForProfiling(process)
-
-    # Disable conversions to legacy
-    process.pixelTracksHitQuadruplets.gpuEnableConversion = False
-    process.pixelTracks.gpuEnableConversion = False
-    process.pixelVertices.gpuEnableConversion = False
-
-    return process
-
-def customizePixelTracksForProfilingDisableTransfer(process):
-    process = customizePixelTracksForProfilingDisableConversion(process)
-
-    # Disable "unnecessary" transfers to CPU
-    process.pixelTracksHitQuadruplets.gpuEnableTransfer = False
-    process.pixelVertices.gpuEnableTransfer = False
-
-    return process
diff --git a/RecoPixelVertexing/PixelTrackFitting/interface/FitUtils.h b/RecoPixelVertexing/PixelTrackFitting/interface/FitUtils.h
@@ -189,27 +189,52 @@ namespace Rfit {
     circle.par = par_pak;
   }
 
+  /*!
+    \brief Transform circle parameter from (X0,Y0,R) to (phi,Tip,q/R) and
+    consequently covariance matrix.
+    \param circle_uvr parameter (X0,Y0,R), covariance matrix to
+    be transformed and particle charge.
+  */
+  __host__ __device__ inline void fromCircleToPerigee(circle_fit& circle) {
+    Vector3d par_pak;
+    const double temp0 = circle.par.head(2).squaredNorm();
+    const double temp1 = sqrt(temp0);
+    par_pak << atan2(circle.q * circle.par(0), -circle.q * circle.par(1)), circle.q * (temp1 - circle.par(2)),
+           circle.q/circle.par(2);
+
+      const double temp2 = sqr(circle.par(0)) * 1. / temp0;
+      const double temp3 = 1. / temp1 * circle.q;
+      Matrix3d J4;
+      J4 << -circle.par(1) * temp2 * 1. / sqr(circle.par(0)), temp2 * 1. / circle.par(0), 0., circle.par(0) * temp3,
+          circle.par(1) * temp3, -circle.q, 0., 0., -circle.q/(circle.par(2)*circle.par(2));
+      circle.cov = J4 * circle.cov * J4.transpose();
+
+    circle.par = par_pak;
+  }
+
+
+
   // transformation between the "perigee" to cmssw localcoord frame
   // the plane of the latter is the perigee plane...
-  // from   //!<(phi,Tip,pt,cotan(theta)),Zip)
+  // from   //!<(phi,Tip,q/pt,cotan(theta)),Zip)
   // to q/p,dx/dz,dy/dz,x,z
-  template<typename V5, typename M5>
-  __host__ __device__ inline void transformToPerigeePlane(V5 const & ip, M5 const & icov, V5 & op, M5 & ocov, double charge) {
+  template<typename VI5, typename MI5, typename VO5, typename MO5>
+  __host__ __device__ inline void transformToPerigeePlane(VI5 const & ip, MI5 const & icov, VO5 & op, MO5 & ocov) {
 
     auto sinTheta2 = 1./(1.+ip(3)*ip(3));
     auto sinTheta = std::sqrt(sinTheta2);
     auto cosTheta = ip(3)*sinTheta;
 
-    op(0) = charge*sinTheta/ip(2);
+    op(0) = sinTheta*ip(2);
     op(1) = 0.;
     op(2) = -ip(3);
     op(3) = ip(1);
     op(4) = -ip(4);
 
     Matrix5d J = Matrix5d::Zero();
 
-    J(0,2) = -charge*sinTheta/(ip(2)*ip(2));
-    J(0,3) = -charge*sinTheta2*cosTheta/ip(2);
+    J(0,2) = sinTheta;
+    J(0,3) = -sinTheta2*cosTheta*ip(2);
     J(1,0) = 1.;
     J(2,3) = -1.;
     J(3,1) = 1.;