Skip to content

Commit

Permalink
fitter dataformat mod, PVP breaks at type conversion
Browse files Browse the repository at this point in the history
  • Loading branch information
joshshterenberg committed Feb 17, 2023
1 parent 217967f commit f0b75f9
Show file tree
Hide file tree
Showing 5 changed files with 166 additions and 86 deletions.
17 changes: 7 additions & 10 deletions RecoVertex/PrimaryVertexProducer/interface/fitterCUDA.h
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
#ifndef fitterCUDA_h
#define fitterCUDA_h
//#include "CUDADataFormats/Track/interface/TrackForPVHeterogeneous.h"
#include "CUDADataFormats/Track/interface/TrackForPVSoAT.h"
#include "TrackingTools/TransientTrack/interface/TransientTrack.h"
#include "RecoVertex/VertexPrimitives/interface/TransientVertex.h"
//#include "RecoVertex/VertexTools/interface/VertexDistanceXY.h"
Expand Down Expand Up @@ -31,15 +31,12 @@ namespace fitterCUDA {


//void wrapper(unsigned int ntracks, TrackForPV::TrackForPVSoA* tracks, cudaStream_t stream);
std::vector<TransientVertex> wrapper(
algo algorithm,
std::vector<std::vector<reco::TransientTrack> >&& clusters,
reco::BeamSpot beamSpot,
VertexState beamVertexState,
bool f4D,
bool validBS,
bool weightFit,
bool fVerbose );
void wrapper(
unsigned int ntracks,
TrackForPV::TrackForPVSoA* GPUtracksObject,
TrackForPV::VertexForPVSoA* GPUverticesObject,
algo algorithm
);
}

#endif
113 changes: 102 additions & 11 deletions RecoVertex/PrimaryVertexProducer/plugins/PrimaryVertexProducerCUDA.cc
Original file line number Diff line number Diff line change
Expand Up @@ -398,9 +398,93 @@ void PrimaryVertexProducerCUDA::produce(edm::Event& iEvent, const edm::EventSetu
clusterizerCUDA::bigKernelWrapper(ntracks, GPUtracksObject, GPUverticesObject, GPUbeta.get(), osumtkwt.get(), cParams, cudaStreamDefault);
// std::cout << "End kernel" << std::endl;




////////////////////////////////////////////////////////////////////
////////////////////// Fitting on GPU //////////////////////////////
////////////////////////////////////////////////////////////////////

//JS_EDIT: fitting moved here before copies back to the cpu
clusterizerCUDA::verticesAndClusterize(ntracks, GPUtracksObject, GPUverticesObject, cParams, cudaStreamDefault);


//change to just first algo with beamspot constraint
for (std::vector<algo>::const_iterator algorithm = algorithms.begin(); algorithm != algorithms.end(); algorithm++) {
auto result = std::make_unique<reco::VertexCollection>();
reco::VertexCollection& vColl = (*result);
std::vector<TransientVertex> pvs;
//outsource move algo to fitterCUDA::algo, less data movement
fitterCUDA::algo algorithm_for_fitter;
algorithm_for_fitter.fitter = (*algorithm).fitter;
algorithm_for_fitter.vertexSelector = (*algorithm).vertexSelector;
algorithm_for_fitter.label = (*algorithm).label;
algorithm_for_fitter.useBeamConstraint = (*algorithm).useBeamConstraint;
algorithm_for_fitter.minNdof = (*algorithm).minNdof;

fitterCUDA::wrapper(ntracks, GPUtracksObject, GPUverticesObject, algorithm_for_fitter);

//copy over back to CPU, keep conditionals below the same
//conversion happens here//

//POST_CONDITIONS

//if (fVerbose) std::cout << "PrimaryVertexProducerCUDAAlgorithm::vertices candidates =" << pvs.size() << std::endl;
//if (clusters.size() > 2 && clusters.size() > 2 * GPUverticesObject->nTrueVertex(0))
// edm::LogWarning("PrimaryVertexProducerCUDA")
// << "more than half of candidate vertices lost " << GPUverticesObject->nTrueVertex(0) << ' ' << clusters.size();

//should already be sorted
//if (GPUverticesObject->nTrueVertex(0) > 1) sort(GPUverticesObject->order(0), GPUverticesObject->order(GPUverticesObject->nTrueVertex(0)-1), VertexHigherPtSquared());
for (unsigned int i = 0; i < GPUverticesObject->nTrueVertex(0); i++) {
auto iv = GPUverticesObject->order(i);
//for (TrackForPV::VertexForPvSoA::const_iterator iv = GPUverticesObject->order(0); iv != GPUverticesObject->order(GPUverticesObject->nTrueVertex(0)-1); iv++) {
vColl.push_back(*iv);
}

if (vColl.empty()) {
GlobalError bse(beamSpot.rotatedCovariance3D());
if ((bse.cxx() <= 0.) || (bse.cyy() <= 0.) || (bse.czz() <= 0.)) {
AlgebraicSymMatrix33 we;
we(0, 0) = 10000;
we(1, 1) = 10000;
we(2, 2) = 10000;
vColl.push_back(reco::Vertex(beamSpot.position(), we, 0., 0., 0));
if (fVerbose) {
std::cout << "RecoVertex/PrimaryVertexProducerCUDA: "
<< "Beamspot with invalid errors " << bse.matrix() << std::endl;
std::cout << "Will put Vertex derived from dummy-fake BeamSpot into Event.\n";
}
} else {
vColl.push_back(reco::Vertex(beamSpot.position(), beamSpot.rotatedCovariance3D(), 0., 0., 0));
if (fVerbose) {
std::cout << "RecoVertex/PrimaryVertexProducerCUDA: "
<< " will put Vertex derived from BeamSpot into Event.\n";
}
}
}

if (fVerbose) {
int ivtx = 0;
for (reco::VertexCollection::const_iterator v = vColl.begin(); v != vColl.end(); ++v) {
std::cout << "recvtx " << ivtx++ << "#trk " << std::setw(3) << v->tracksSize() << " chi2 " << std::setw(4)
<< v->chi2() << " ndof " << std::setw(3) << v->ndof() << " x " << std::setw(6) << v->position().x()
<< " dx " << std::setw(6) << v->xError() << " y " << std::setw(6) << v->position().y() << " dy "
<< std::setw(6) << v->yError() << " z " << std::setw(6) << v->position().z() << " dz " << std::setw(6)
<< v->zError();
if (f4D) {
std::cout << " t " << std::setw(6) << v->t() << " dt " << std::setw(6) << v->tError();
}
std::cout << std::endl;
}
}
iEvent.put(std::move(result), algorithm->label);
}





///// TODO:: update this when we put the fitter into GPU as well ////

//cudaCheck(cudaFree(GPUverticesObject));
//cudaCheck(cudaFree(CPUtracksObject));
//cudaCheck(cudaFree(GPUtracksObject));
Expand All @@ -422,21 +506,27 @@ void PrimaryVertexProducerCUDA::produce(edm::Event& iEvent, const edm::EventSetu
// cudaCheck(cudaFree(GPUtracksObject));
//cudaCheck(cudaFree(d_obj_ptr));
// //std::cout << "Finished copying 2" << std::endl;
std::vector<TransientVertex> pv = clusterizerCUDA::vertices(ntracks, CPUtracksObject, CPUverticesObject, cParams, t_tks, CPUbeta.get());


//JS_EDIT: COMMENT BC CHANGED
// std::vector<TransientVertex> pv = clusterizerCUDA::vertices(ntracks, CPUtracksObject, CPUverticesObject, cParams, t_tks, CPUbeta.get());
// clusterize tracks in Z
std::vector<std::vector<reco::TransientTrack> >&& clusters = clusterizerCUDA::clusterize(pv, cParams);
//std::vector<std::vector<reco::TransientTrack> >&& clusters = clusterizerCUDA::clusterize(pv, cParams);


// cudaCheck(cudaFree(CPUverticesObject));
// cudaCheck(cudaFree(CPUtracksObject));
cudaCheck(cudaDeviceSynchronize());
////////////////////////////////////////////////////////////////////
////////////////////// Fitting on GPU //////////////////////////////
////////////////////////////////////////////////////////////////////



//std::vector<reco::TransientTrack> seltks;
// std::vector<std::vector<reco::TransientTrack> > clusters;

// std::vector<std::vector<reco::TransientTrack> > clusters;

// vertex fits
/*
for (std::vector<algo>::const_iterator algorithm = algorithms.begin(); algorithm != algorithms.end(); algorithm++) {
auto result = std::make_unique<reco::VertexCollection>();
reco::VertexCollection& vColl = (*result);
Expand All @@ -452,7 +542,7 @@ void PrimaryVertexProducerCUDA::produce(edm::Event& iEvent, const edm::EventSetu
algorithm_for_fitter.minNdof = (*algorithm).minNdof;
pvs = fitterCUDA::wrapper(algorithm_for_fitter, std::move(clusters), beamSpot, beamVertexState, f4D, validBS, weightFit, fVerbose);

*/
/*
for (std::vector<std::vector<reco::TransientTrack> >::const_iterator iclus = clusters.begin();
iclus != clusters.end();
Expand Down Expand Up @@ -540,7 +630,7 @@ void PrimaryVertexProducerCUDA::produce(edm::Event& iEvent, const edm::EventSetu
(!validBS || (*(algorithm->vertexSelector))(v, beamVertexState)))
pvs.push_back(v);
} // end of cluster loop
*/
if (fVerbose) {
std::cout << "PrimaryVertexProducerCUDAAlgorithm::vertices candidates =" << pvs.size() << std::endl;
Expand All @@ -553,7 +643,7 @@ void PrimaryVertexProducerCUDA::produce(edm::Event& iEvent, const edm::EventSetu
if (pvs.empty() && seltks.size() > 5)
edm::LogWarning("PrimaryVertexProducerCUDA")
<< "no vertex found with " << seltks.size() << " tracks and " << clusters.size() << " vertex-candidates";
*/
// sort vertices by pt**2 vertex (aka signal vertex tagging)
if (pvs.size() > 1) {
sort(pvs.begin(), pvs.end(), VertexHigherPtSquared());
Expand Down Expand Up @@ -611,9 +701,10 @@ void PrimaryVertexProducerCUDA::produce(edm::Event& iEvent, const edm::EventSetu
<< v->zError();
std::cout << std::endl;
}
*/
iEvent.put(std::move(result), algorithm->label);
}
*/
}

void PrimaryVertexProducerCUDA::fillDescriptions(edm::ConfigurationDescriptions& descriptions) {
Expand Down
122 changes: 57 additions & 65 deletions RecoVertex/PrimaryVertexProducer/plugins/fitterCUDA.cc
Original file line number Diff line number Diff line change
Expand Up @@ -22,29 +22,22 @@
namespace fitterCUDA {


/*

__global__ void fitterKernel(
TransientVertex cuda_pvs[],
algo algorithm,
reco::TransientTrack cuda_clusters[][max_size],
reco::BeamSpot beamSpot,
VertexState beamVertexState,
int clusters_size,
int max_size,
bool f4D,
bool validBS,
bool weightFit,
bool fVerbose
unsigned int ntracks,
TrackForPV::TrackForPVSoA* GPUtracksObject,
TrackForPV::VertexForPVSoA* GPUverticesObject,
algo algorithm
){
// return;
// RECOMMENT IF NEEDED FROM HERE
int idx = blockIdx.x * blockDim.x + threadIdx.x;

// for (std::vector<std::vector<reco::reco::TransientTrack> >::const_iterator iclus = clusters.begin();
// iclus != clusters.end();
// iclus++) {

/*
for (int i = 0; i < clusters_size; i++) {
double sumwt = 0.;
double sumwt2 = 0.;
Expand Down Expand Up @@ -122,92 +115,89 @@ __global__ void fitterKernel(
(!validBS || (*(algorithm.vertexSelector))(v, beamVertexState)))
cuda_pvs[idx] = v; //pvs.push_back(v);
}
}
*/
}


#ifdef __CUDACC__
std::vector<TransientVertex> wrapper(
algo algorithm,
std::vector<std::vector<reco::TransientTrack> >&& clusters,
reco::BeamSpot beamSpot,
VertexState beamVertexState,
bool f4D,
bool validBS,
bool weightFit,
bool fVerbose ){

std::cout << "\n\n\n\n\ngot to the wrapper\n";
void wrapper(
unsigned int ntracks,
TrackForPV::TrackForPVSoA* GPUtracksObject,
TrackForPV::VertexForPVSoA* GPUverticesObject,
algo algorithm
){

//defines grid
unsigned int blockSize = 1;
unsigned int gridSize = 1;
std::cout << "defined grid size\n";


/*
//create and allocate all host memory (only pvs needed)
TransientVertex *cpu_pvs;
cpu_pvs = (TransientVertex *) malloc(clusters.size() * sizeof(TransientVertex));
std::cout << "created and allocated all host memory\n";
TrackForPV::TrackForPVSoA* cpu_clusters;
cpu_pvs = (TransientVertex *) malloc(ntracks * sizeof(TransientVertex));
cpu_clusters = (reco::TransientTrack *) malloc(clusters.size() * max_size * sizeof(reco::TransientTrack)); //1D flattened
//create and allocate all device memory (pvs and clusters)
reco::TransientTrack **cuda_clusters;
TransientVertex *cuda_pvs;
std::cout << "initial defs good\n";
cudaCheck(cudaMalloc(&cuda_pvs, clusters.size() * sizeof(TransientVertex)));
cudaCheck(cudaMalloc(&cuda_clusters, clusters.size() * sizeof(reco::TransientTrack *)));
std::cout << "initial cuda mallocs good\n";
reco::TransientTrack sample_track;
long unsigned int max_size = 0;
for (long unsigned int i = 0; i < clusters.size(); i++) if (max_size < clusters[i].size()) max_size = clusters[i].size();
/////////////////////////////////////////////////////////
////// MODIFY TYPE / CONVERT TO WHAT'S NEEDED HERE //////
/////////////////////////////////////////////////////////
long unsigned int n_iter[clusters.size()];
std::cout << "size of clusters array: " << clusters.size() << "X" << max_size << "\n";
//std::cout << "size of cpu_clusters: " << end(cpu_clusters) - begin(cpu_clusters) << "\n";
for (long unsigned int i = 0; i < clusters.size(); i++) {
cudaCheck(cudaMalloc(&(cuda_clusters[i]), (max_size+1) * sizeof(reco::TransientTrack))); //this is segfaulting rn
std::cout << "cudamalloc of clusters 2d is good\n";
std::cout << "i is " << i << ". current size is " << clusters[i].size() << "\n";
n_iter[i] = clusters[i].size(); //use in kernel to keep track of bounds on 2D array
for (long unsigned int j = 0; j < clusters[i].size(); j++) {
std::cout << "everything until population good\n";
cuda_clusters[i][j] = clusters[i][j]; //populating rectangular 2D cluster array on valid vals
std::cout << "j is " << j << "\t";
reco::TransientTrack test = clusters[i][j];
std::cout << "read ok\t";
cpu_clusters[i * max_size + j] = test; //populating rectangular 2D cluster array on valid vals
std::cout << "write ok\n";
}
cuda_clusters[i][clusters[i].size()] = sample_track; //SET FOR FINAL VALUES, COMPARE IN KERNEL
}
std::cout << "created and allocated all device memory\n";
//host to device memory copy (NONE NEEDED)
//cudaCheck(cudaMemcpy(cuda_clusters, cpu_clusters, memSize1, cudaMemcpyHostToDevice));
//cudaCheck(cudaMemcpy(cuda_pvs, cpu_pvs, memSize2, cudaMemcpyHostToDevice));
//std::cout << "host memory copied to device memory\n";
std::cout << "created and allocated all host memory\n";
//create and allocate all device memory (pvs and clusters)
reco::TransientTrack *cuda_clusters;
TransientVertex *cuda_pvs;
cudaCheck(cudaMalloc(&cuda_pvs, clusters.size() * sizeof(TransientVertex)));
cudaCheck(cudaMalloc(&cuda_clusters, clusters.size() * max_size * sizeof(reco::TransientTrack)));
std::cout << "created and allocated all device memory\n";
//host to device memory copy
cudaCheck(cudaMemcpy(cuda_clusters, cpu_clusters, clusters.size() * max_size * sizeof(reco::TransientTrack), cudaMemcpyHostToDevice));
std::cout << "host memory copied to device memory\n";
*/

//action!
/*
fitterKernel<<<gridSize, blockSize>>>(
cuda_pvs,
algorithm,
cuda_clusters,
beamSpot,
beamVertexState,
clusters.size(),
max_size,
f4D,
validBS,
weightFit,
fVerbose
ntracks,
GPUtracksObject,
GPUverticesObject,
algorithm
);
std::cout << "main action complete\n";
*/

//wait for device to complete / error check
cudaDeviceSynchronize();
cudaCheck(cudaGetLastError());
std::cout << "sync / error check complete\n";


/*
//device to host memory copy
cudaCheck(cudaMemcpy(cpu_pvs, cuda_pvs, clusters.size() * sizeof(TransientVertex), cudaMemcpyDeviceToHost));
std::cout << "device memory copied to host memory\n";
//clear device memory
for (long unsigned int i = 0; i < clusters.size(); i++) {
cudaCheck(cudaFree(cuda_clusters[i]));
}
cudaCheck(cudaFree(cuda_clusters));
cudaCheck(cudaFree(cuda_pvs));
std::cout << "cleared device memory\n\n";
Expand All @@ -219,6 +209,8 @@ std::vector<TransientVertex> wrapper(
//done
return pvs;
*/
}
#endif
}
Expand Down
Binary file modified RecoVertex/PrimaryVertexProducer/test/test_dqm_gpu.root
Binary file not shown.
Binary file modified RecoVertex/PrimaryVertexProducer/test/test_gpu.root
Binary file not shown.

0 comments on commit f0b75f9

Please sign in to comment.