Skip to content

Commit

Permalink
Port the whole pixel workflow to new heterogeneous framework (#384)
Browse files Browse the repository at this point in the history
  - port the whole pixel workflow to new heterogeneous framework
  - implement a legacy cluster to SoA converter for the pixel RecHits
  - update the vertex producer to run on CPU as well as GPU
  • Loading branch information
fwyzard committed Nov 27, 2020
1 parent fbe2705 commit 0444702
Show file tree
Hide file tree
Showing 31 changed files with 2,495 additions and 138 deletions.
10 changes: 10 additions & 0 deletions CUDADataFormats/Track/BuildFile.xml
Original file line number Diff line number Diff line change
@@ -0,0 +1,10 @@
<use name="cuda-api-wrappers"/>
<use name="rootcore"/>
<use name="FWCore/ServiceRegistry"/>
<use name="FWCore/ParameterSetReader"/>
<use name="HeterogeneousCore/CUDAServices"/>
<use name="HeterogeneousCore/CUDAUtilities"/>
<use name="eigen"/>
<export>
<lib name="1"/>
</export>
79 changes: 79 additions & 0 deletions CUDADataFormats/Track/interface/PixelTrackHeterogeneous.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,79 @@
#ifndef CUDADataFormatsTrackTrackHeterogeneous_H
#define CUDADataFormatsTrackTrackHeterogeneous_H

#include "CUDADataFormats/Track/interface/TrajectoryStateSoA.h"
#include "HeterogeneousCore/CUDAUtilities/interface/HistoContainer.h"

#include "CUDADataFormats/Common/interface/HeterogeneousSoA.h"

namespace trackQuality {
enum Quality : uint8_t { bad=0, dup, loose, strict, tight, highPurity };
}

template <int32_t S>
class TrackSoAT {
public:

static constexpr int32_t stride() { return S; }

using Quality = trackQuality::Quality;
using hindex_type = uint16_t;
using HitContainer = OneToManyAssoc<hindex_type, S, 5 * S>;

// Always check quality is at least loose!
// CUDA does not support enums in __lgc ...
eigenSoA::ScalarSoA<uint8_t, S> m_quality;
constexpr Quality quality(int32_t i) const { return (Quality)(m_quality(i));}
constexpr Quality & quality(int32_t i) { return (Quality&)(m_quality(i));}
constexpr Quality const * qualityData() const { return (Quality const *)(m_quality.data());}
constexpr Quality * qualityData() { return (Quality*)(m_quality.data());}


// this is chi2/ndof as not necessarely all hits are used in the fit
eigenSoA::ScalarSoA<float, S> chi2;

constexpr int nHits(int i) const { return detIndices.size(i);}

// State at the Beam spot
// phi,tip,1/pt,cotan(theta),zip
TrajectoryStateSoA<S> stateAtBS;
eigenSoA::ScalarSoA<float, S> eta;
eigenSoA::ScalarSoA<float, S> pt;
constexpr float charge(int32_t i) const { return std::copysign(1.f,stateAtBS.state(i)(2)); }
constexpr float phi(int32_t i) const { return stateAtBS.state(i)(0); }
constexpr float tip(int32_t i) const { return stateAtBS.state(i)(1); }
constexpr float zip(int32_t i) const { return stateAtBS.state(i)(4); }

// state at the detector of the outermost hit
// representation to be decided...
// not yet filled on GPU
// TrajectoryStateSoA<S> stateAtOuterDet;

HitContainer hitIndices;
HitContainer detIndices;

// total number of tracks (including those not fitted)
uint32_t m_nTracks;

};

namespace pixelTrack{

#ifdef GPU_SMALL_EVENTS
constexpr uint32_t maxNumber() { return 2 * 1024;}
#else
constexpr uint32_t maxNumber() { return 32 * 1024;}
#endif

using TrackSoA = TrackSoAT<maxNumber()>;
using TrajectoryState = TrajectoryStateSoA<maxNumber()>;
using HitContainer = TrackSoA::HitContainer;
using Quality = trackQuality::Quality;

}

using PixelTrackHeterogeneous = HeterogeneousSoA<pixelTrack::TrackSoA>;


#endif // CUDADataFormatsTrackTrackSoA_H

65 changes: 65 additions & 0 deletions CUDADataFormats/Track/interface/TrajectoryStateSoA.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,65 @@
#ifndef CUDADataFormatsTrackTrajectoryStateSOA_H
#define CUDADataFormatsTrackTrajectoryStateSOA_H

#include <Eigen/Dense>
#include "HeterogeneousCore/CUDAUtilities/interface/eigenSoA.h"

template <int32_t S>
struct TrajectoryStateSoA {

using Vector5f = Eigen::Matrix<float, 5, 1>;
using Vector15f = Eigen::Matrix<float, 15, 1>;

using Vector5d = Eigen::Matrix<double, 5, 1>;
using Matrix5d = Eigen::Matrix<double, 5, 5>;


static constexpr int32_t stride() { return S; }

eigenSoA::MatrixSoA<Vector5f,S> state;
eigenSoA::MatrixSoA<Vector15f,S> covariance;


template<typename V3, typename M3, typename V2, typename M2>
__host__ __device__
void copyFromCircle(V3 const & cp, M3 const & ccov, V2 const & lp, M2 const & lcov, float b, int32_t i) {
state(i) << cp.template cast<float>(), lp.template cast<float>();
state(i)(2) *=b;
auto cov = covariance(i);
cov(0) = ccov(0,0);
cov(1) = ccov(0,1);
cov(2) = b*float(ccov(0,2));
cov(4) = cov(3) = 0;
cov(5) = ccov(1,1);
cov(6) = b*float(ccov(1,2));
cov(8) = cov(7) = 0;
cov(9) = b*b*float(ccov(2,2));
cov(11) = cov(10) = 0;
cov(12) = lcov(0,0);
cov(13) = lcov(0,1);
cov(14) = lcov(1,1);
}


template<typename V5, typename M5>
__host__ __device__
void copyFromDense(V5 const & v, M5 const & cov, int32_t i) {
state(i) = v.template cast<float>();
for(int j=0, ind=0; j<5; ++j) for (auto k=j;k<5;++k) covariance(i)(ind++) = cov(j,k);
}

template<typename V5, typename M5>
__host__ __device__
void copyToDense(V5 & v, M5 & cov, int32_t i) const {
v = state(i).template cast<typename V5::Scalar>();
for(int j=0, ind=0; j<5; ++j) {
cov(j,j) = covariance(i)(ind++);
for (auto k=j+1;k<5;++k) cov(k,j)=cov(j,k) = covariance(i)(ind++);
}
}

};

#endif // CUDADataFormatsTrackTrajectoryStateSOA_H


10 changes: 10 additions & 0 deletions CUDADataFormats/Track/src/classes.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,10 @@
#ifndef CUDADataFormats__src_classes_h
#define CUDADataFormats__src_classes_h

#include "CUDADataFormats/Common/interface/CUDAProduct.h"
#include "CUDADataFormats/Common/interface/HostProduct.h"
#include "CUDADataFormats/Track/interface/PixelTrackHeterogeneous.h"
#include "CUDADataFormats/Common/interface/ArrayShadow.h"
#include "DataFormats/Common/interface/Wrapper.h"

#endif
8 changes: 8 additions & 0 deletions CUDADataFormats/Track/src/classes_def.xml
Original file line number Diff line number Diff line change
@@ -0,0 +1,8 @@
<lcgdict>
<class name="CUDAProduct<HeterogeneousSoA<pixelTrack::TrackSoA>>" persistent="false"/>
<class name="edm::Wrapper<CUDAProduct<HeterogeneousSoA<pixelTrack::TrackSoA>>>" persistent="false"/>
<class name="HeterogeneousSoA<pixelTrack::TrackSoA>" persistent="false"/>
<class name="edm::Wrapper<HeterogeneousSoA<pixelTrack::TrackSoA>>" persistent="false"/>
<class name="ArrayShadow<std::array<unsigned int,2001>>" persistent="false"/>
<class name="edm::Wrapper<ArrayShadow<std::array<unsigned int,2001>>>" persistent="false"/>
</lcgdict>
13 changes: 13 additions & 0 deletions CUDADataFormats/Track/test/BuildFile.xml
Original file line number Diff line number Diff line change
@@ -0,0 +1,13 @@
<use name="HeterogeneousCore/CUDAUtilities"/>

<bin file="TrajectoryStateSOA_t.cpp" name="cpuTrajectoryStateSOA_t">
<use name="eigen"/>
<flags CXXFLAGS="-g -DGPU_DEBUG"/>
</bin>

<bin file="TrajectoryStateSOA_t.cu" name="gpuTrajectoryStateSOA_t">
<use name="eigen"/>
<flags CUDA_FLAGS="-g -DGPU_DEBUG"/>
<flags CXXFLAGS="-g -DGPU_DEBUG"/>
</bin>

1 change: 1 addition & 0 deletions CUDADataFormats/Track/test/TrajectoryStateSOA_t.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
#include "TrajectoryStateSOA_t.h"
1 change: 1 addition & 0 deletions CUDADataFormats/Track/test/TrajectoryStateSOA_t.cu
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
#include "TrajectoryStateSOA_t.h"
77 changes: 77 additions & 0 deletions CUDADataFormats/Track/test/TrajectoryStateSOA_t.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,77 @@
#include "CUDADataFormats/Track/interface/TrajectoryStateSoA.h"

using Vector5d = Eigen::Matrix<double, 5, 1>;
using Matrix5d = Eigen::Matrix<double, 5, 5>;

__host__ __device__
Matrix5d loadCov(Vector5d const & e) {
Matrix5d cov;
for (int i=0; i<5; ++i) cov(i,i) = e(i)*e(i);
for (int i = 0; i < 5; ++i) {
for (int j = 0; j < i; ++j) {
double v = 0.3*std::sqrt( cov(i,i) * cov(j,j) ); // this makes the matrix pos defined
cov(i,j) = (i+j)%2 ? -0.4*v : 0.1*v;
cov(j,i) = cov(i,j);
}
}
return cov;
}


using TS = TrajectoryStateSoA<128>;

__global__ void testTSSoA(TS * pts, int n) {

assert(n<=128);

Vector5d par0; par0 << 0.2,0.1,3.5,0.8,0.1;
Vector5d e0; e0 << 0.01,0.01,0.035,-0.03,-0.01;
auto cov0 = loadCov(e0);

TS & ts = *pts;

int first = threadIdx.x + blockIdx.x * blockDim.x;

for (int i = first; i < n; i += blockDim.x * gridDim.x) {
ts.copyFromDense(par0,cov0,i);
Vector5d par1; Matrix5d cov1;
ts.copyToDense(par1,cov1,i);
Vector5d delV = par1-par0;
Matrix5d delM = cov1-cov0;
for(int j=0; j<5; ++j) {
assert(std::abs(delV(j))<1.e-5);
for (auto k=j;k<5;++k) {
assert(cov0(k,j)==cov0(j,k));
assert(cov1(k,j)==cov1(j,k));
assert(std::abs(delM(k,j))<1.e-5);
}
}

}
}

#ifdef __CUDACC__
#include "HeterogeneousCore/CUDAUtilities/interface/exitSansCUDADevices.h"
#include "HeterogeneousCore/CUDAUtilities/interface/cudaCheck.h"
#endif

int main() {
#ifdef __CUDACC__
exitSansCUDADevices();
#endif


TS ts;

#ifdef __CUDACC__
TS * ts_d;
cudaCheck(cudaMalloc(&ts_d, sizeof(TS)));
testTSSoA<<<1, 64>>>(ts_d,128);
cudaCheck(cudaGetLastError());
cudaCheck(cudaMemcpy(&ts, ts_d, sizeof(TS), cudaMemcpyDefault));
cudaCheck(cudaDeviceSynchronize());
#else
testTSSoA(&ts,128);
#endif

}
Original file line number Diff line number Diff line change
@@ -1,6 +1,41 @@
import FWCore.ParameterSet.Config as cms

def customizePixelTracksForProfiling(process):
def customizePixelTracksForProfilingGPUOnly(process):
process.MessageLogger.cerr.FwkReport.reportEvery = 100

process.Raw2Hit = cms.Path(process.offlineBeamSpot+process.offlineBeamSpotCUDA+process.siPixelClustersCUDAPreSplitting+process.siPixelRecHitsCUDAPreSplitting)

process.load('RecoPixelVertexing.PixelTriplets.caHitNtupletCUDA_cfi')
process.load('RecoPixelVertexing.PixelVertexFinding.pixelVertexCUDA_cfi')
process.TVreco = cms.Path(process.caHitNtupletCUDA+process.pixelVertexCUDA)

process.schedule = cms.Schedule(process.Raw2Hit, process.TVreco)
return process

def customizePixelTracksForProfilingSoAonCPU(process):
process = customizePixelTracksForProfilingGPUOnly(process)

process.pixelVertexSoA = process.pixelVertexCUDA.clone()
process.pixelVertexSoA.onGPU = False
process.pixelVertexSoA.pixelTrackSrc = 'pixelTrackSoA'
process.TVSoAreco = cms.Path(process.caHitNtupletCUDA+process.pixelTrackSoA+process.pixelVertexSoA)

process.schedule = cms.Schedule(process.Raw2Hit, process.TVSoAreco)

return process

def customizePixelTracksForProfilingEnableTransfer(process):
process = customizePixelTracksForProfilingGPUOnly(process)

process.load('RecoPixelVertexing.PixelTrackFitting.pixelTrackSoA_cfi')
process.load('RecoPixelVertexing.PixelVertexFinding.pixelVertexSoA_cfi')
process.toSoA = cms.Path(process.pixelTrackSoA+process.pixelVertexSoA)

process.schedule = cms.Schedule(process.Raw2Hit, process.TVreco, process.toSoA)
return process

def customizePixelTracksForProfilingEnableConversion(process):
# use old trick of output path
process.MessageLogger.cerr.FwkReport.reportEvery = 100

process.out = cms.OutputModule("AsciiOutputModule",
Expand All @@ -17,21 +52,3 @@ def customizePixelTracksForProfiling(process):

return process

def customizePixelTracksForProfilingDisableConversion(process):
process = customizePixelTracksForProfiling(process)

# Disable conversions to legacy
process.pixelTracksHitQuadruplets.gpuEnableConversion = False
process.pixelTracks.gpuEnableConversion = False
process.pixelVertices.gpuEnableConversion = False

return process

def customizePixelTracksForProfilingDisableTransfer(process):
process = customizePixelTracksForProfilingDisableConversion(process)

# Disable "unnecessary" transfers to CPU
process.pixelTracksHitQuadruplets.gpuEnableTransfer = False
process.pixelVertices.gpuEnableTransfer = False

return process
37 changes: 31 additions & 6 deletions RecoPixelVertexing/PixelTrackFitting/interface/FitUtils.h
Original file line number Diff line number Diff line change
Expand Up @@ -189,27 +189,52 @@ namespace Rfit {
circle.par = par_pak;
}

/*!
\brief Transform circle parameter from (X0,Y0,R) to (phi,Tip,q/R) and
consequently covariance matrix.
\param circle_uvr parameter (X0,Y0,R), covariance matrix to
be transformed and particle charge.
*/
__host__ __device__ inline void fromCircleToPerigee(circle_fit& circle) {
Vector3d par_pak;
const double temp0 = circle.par.head(2).squaredNorm();
const double temp1 = sqrt(temp0);
par_pak << atan2(circle.q * circle.par(0), -circle.q * circle.par(1)), circle.q * (temp1 - circle.par(2)),
circle.q/circle.par(2);

const double temp2 = sqr(circle.par(0)) * 1. / temp0;
const double temp3 = 1. / temp1 * circle.q;
Matrix3d J4;
J4 << -circle.par(1) * temp2 * 1. / sqr(circle.par(0)), temp2 * 1. / circle.par(0), 0., circle.par(0) * temp3,
circle.par(1) * temp3, -circle.q, 0., 0., -circle.q/(circle.par(2)*circle.par(2));
circle.cov = J4 * circle.cov * J4.transpose();

circle.par = par_pak;
}



// transformation between the "perigee" to cmssw localcoord frame
// the plane of the latter is the perigee plane...
// from //!<(phi,Tip,pt,cotan(theta)),Zip)
// from //!<(phi,Tip,q/pt,cotan(theta)),Zip)
// to q/p,dx/dz,dy/dz,x,z
template<typename V5, typename M5>
__host__ __device__ inline void transformToPerigeePlane(V5 const & ip, M5 const & icov, V5 & op, M5 & ocov, double charge) {
template<typename VI5, typename MI5, typename VO5, typename MO5>
__host__ __device__ inline void transformToPerigeePlane(VI5 const & ip, MI5 const & icov, VO5 & op, MO5 & ocov) {

auto sinTheta2 = 1./(1.+ip(3)*ip(3));
auto sinTheta = std::sqrt(sinTheta2);
auto cosTheta = ip(3)*sinTheta;

op(0) = charge*sinTheta/ip(2);
op(0) = sinTheta*ip(2);
op(1) = 0.;
op(2) = -ip(3);
op(3) = ip(1);
op(4) = -ip(4);

Matrix5d J = Matrix5d::Zero();

J(0,2) = -charge*sinTheta/(ip(2)*ip(2));
J(0,3) = -charge*sinTheta2*cosTheta/ip(2);
J(0,2) = sinTheta;
J(0,3) = -sinTheta2*cosTheta*ip(2);
J(1,0) = 1.;
J(2,3) = -1.;
J(3,1) = 1.;
Expand Down
Loading

0 comments on commit 0444702

Please sign in to comment.