diff --git a/RecoVertex/PrimaryVertexProducer/interface/fitterCUDA.h b/RecoVertex/PrimaryVertexProducer/interface/fitterCUDA.h index c0cf5b1ea132b..27b03cbe45246 100644 --- a/RecoVertex/PrimaryVertexProducer/interface/fitterCUDA.h +++ b/RecoVertex/PrimaryVertexProducer/interface/fitterCUDA.h @@ -1,6 +1,6 @@ #ifndef fitterCUDA_h #define fitterCUDA_h -//#include "CUDADataFormats/Track/interface/TrackForPVHeterogeneous.h" +#include "CUDADataFormats/Track/interface/TrackForPVSoAT.h" #include "TrackingTools/TransientTrack/interface/TransientTrack.h" #include "RecoVertex/VertexPrimitives/interface/TransientVertex.h" //#include "RecoVertex/VertexTools/interface/VertexDistanceXY.h" @@ -31,15 +31,12 @@ namespace fitterCUDA { //void wrapper(unsigned int ntracks, TrackForPV::TrackForPVSoA* tracks, cudaStream_t stream); - std::vector wrapper( - algo algorithm, - std::vector >&& clusters, - reco::BeamSpot beamSpot, - VertexState beamVertexState, - bool f4D, - bool validBS, - bool weightFit, - bool fVerbose ); + void wrapper( + unsigned int ntracks, + TrackForPV::TrackForPVSoA* GPUtracksObject, + TrackForPV::VertexForPVSoA* GPUverticesObject, + algo algorithm + ); } #endif diff --git a/RecoVertex/PrimaryVertexProducer/plugins/PrimaryVertexProducerCUDA.cc b/RecoVertex/PrimaryVertexProducer/plugins/PrimaryVertexProducerCUDA.cc index 0116363d5d31b..9218e37912930 100644 --- a/RecoVertex/PrimaryVertexProducer/plugins/PrimaryVertexProducerCUDA.cc +++ b/RecoVertex/PrimaryVertexProducer/plugins/PrimaryVertexProducerCUDA.cc @@ -398,9 +398,93 @@ void PrimaryVertexProducerCUDA::produce(edm::Event& iEvent, const edm::EventSetu clusterizerCUDA::bigKernelWrapper(ntracks, GPUtracksObject, GPUverticesObject, GPUbeta.get(), osumtkwt.get(), cParams, cudaStreamDefault); // std::cout << "End kernel" << std::endl; - + + + //////////////////////////////////////////////////////////////////// + ////////////////////// Fitting on GPU ////////////////////////////// + //////////////////////////////////////////////////////////////////// + + //JS_EDIT: fitting moved here before copies back to the cpu + clusterizerCUDA::verticesAndClusterize(ntracks, GPUtracksObject, GPUverticesObject, cParams, cudaStreamDefault); + + + //change to just first algo with beamspot constraint + for (std::vector::const_iterator algorithm = algorithms.begin(); algorithm != algorithms.end(); algorithm++) { + auto result = std::make_unique(); + reco::VertexCollection& vColl = (*result); + std::vector pvs; + //outsource move algo to fitterCUDA::algo, less data movement + fitterCUDA::algo algorithm_for_fitter; + algorithm_for_fitter.fitter = (*algorithm).fitter; + algorithm_for_fitter.vertexSelector = (*algorithm).vertexSelector; + algorithm_for_fitter.label = (*algorithm).label; + algorithm_for_fitter.useBeamConstraint = (*algorithm).useBeamConstraint; + algorithm_for_fitter.minNdof = (*algorithm).minNdof; + + fitterCUDA::wrapper(ntracks, GPUtracksObject, GPUverticesObject, algorithm_for_fitter); + + //copy over back to CPU, keep conditionals below the same + //conversion happens here// + + //POST_CONDITIONS + + //if (fVerbose) std::cout << "PrimaryVertexProducerCUDAAlgorithm::vertices candidates =" << pvs.size() << std::endl; + //if (clusters.size() > 2 && clusters.size() > 2 * GPUverticesObject->nTrueVertex(0)) + // edm::LogWarning("PrimaryVertexProducerCUDA") + // << "more than half of candidate vertices lost " << GPUverticesObject->nTrueVertex(0) << ' ' << clusters.size(); + + //should already be sorted + //if (GPUverticesObject->nTrueVertex(0) > 1) sort(GPUverticesObject->order(0), GPUverticesObject->order(GPUverticesObject->nTrueVertex(0)-1), VertexHigherPtSquared()); + for (unsigned int i = 0; i < GPUverticesObject->nTrueVertex(0); i++) { + auto iv = GPUverticesObject->order(i); + //for (TrackForPV::VertexForPvSoA::const_iterator iv = GPUverticesObject->order(0); iv != GPUverticesObject->order(GPUverticesObject->nTrueVertex(0)-1); iv++) { + vColl.push_back(*iv); + } + + if (vColl.empty()) { + GlobalError bse(beamSpot.rotatedCovariance3D()); + if ((bse.cxx() <= 0.) || (bse.cyy() <= 0.) || (bse.czz() <= 0.)) { + AlgebraicSymMatrix33 we; + we(0, 0) = 10000; + we(1, 1) = 10000; + we(2, 2) = 10000; + vColl.push_back(reco::Vertex(beamSpot.position(), we, 0., 0., 0)); + if (fVerbose) { + std::cout << "RecoVertex/PrimaryVertexProducerCUDA: " + << "Beamspot with invalid errors " << bse.matrix() << std::endl; + std::cout << "Will put Vertex derived from dummy-fake BeamSpot into Event.\n"; + } + } else { + vColl.push_back(reco::Vertex(beamSpot.position(), beamSpot.rotatedCovariance3D(), 0., 0., 0)); + if (fVerbose) { + std::cout << "RecoVertex/PrimaryVertexProducerCUDA: " + << " will put Vertex derived from BeamSpot into Event.\n"; + } + } + } + + if (fVerbose) { + int ivtx = 0; + for (reco::VertexCollection::const_iterator v = vColl.begin(); v != vColl.end(); ++v) { + std::cout << "recvtx " << ivtx++ << "#trk " << std::setw(3) << v->tracksSize() << " chi2 " << std::setw(4) + << v->chi2() << " ndof " << std::setw(3) << v->ndof() << " x " << std::setw(6) << v->position().x() + << " dx " << std::setw(6) << v->xError() << " y " << std::setw(6) << v->position().y() << " dy " + << std::setw(6) << v->yError() << " z " << std::setw(6) << v->position().z() << " dz " << std::setw(6) + << v->zError(); + if (f4D) { + std::cout << " t " << std::setw(6) << v->t() << " dt " << std::setw(6) << v->tError(); + } + std::cout << std::endl; + } + } + iEvent.put(std::move(result), algorithm->label); + } + + + + + ///// TODO:: update this when we put the fitter into GPU as well //// - //cudaCheck(cudaFree(GPUverticesObject)); //cudaCheck(cudaFree(CPUtracksObject)); //cudaCheck(cudaFree(GPUtracksObject)); @@ -422,21 +506,27 @@ void PrimaryVertexProducerCUDA::produce(edm::Event& iEvent, const edm::EventSetu // cudaCheck(cudaFree(GPUtracksObject)); //cudaCheck(cudaFree(d_obj_ptr)); // //std::cout << "Finished copying 2" << std::endl; - std::vector pv = clusterizerCUDA::vertices(ntracks, CPUtracksObject, CPUverticesObject, cParams, t_tks, CPUbeta.get()); + + + //JS_EDIT: COMMENT BC CHANGED +// std::vector pv = clusterizerCUDA::vertices(ntracks, CPUtracksObject, CPUverticesObject, cParams, t_tks, CPUbeta.get()); // clusterize tracks in Z - std::vector >&& clusters = clusterizerCUDA::clusterize(pv, cParams); + //std::vector >&& clusters = clusterizerCUDA::clusterize(pv, cParams); + + // cudaCheck(cudaFree(CPUverticesObject)); // cudaCheck(cudaFree(CPUtracksObject)); cudaCheck(cudaDeviceSynchronize()); - //////////////////////////////////////////////////////////////////// - ////////////////////// Fitting on GPU ////////////////////////////// - //////////////////////////////////////////////////////////////////// + + + //std::vector seltks; // std::vector > clusters; // std::vector > clusters; // vertex fits + /* for (std::vector::const_iterator algorithm = algorithms.begin(); algorithm != algorithms.end(); algorithm++) { auto result = std::make_unique(); reco::VertexCollection& vColl = (*result); @@ -452,7 +542,7 @@ void PrimaryVertexProducerCUDA::produce(edm::Event& iEvent, const edm::EventSetu algorithm_for_fitter.minNdof = (*algorithm).minNdof; pvs = fitterCUDA::wrapper(algorithm_for_fitter, std::move(clusters), beamSpot, beamVertexState, f4D, validBS, weightFit, fVerbose); - + */ /* for (std::vector >::const_iterator iclus = clusters.begin(); iclus != clusters.end(); @@ -540,7 +630,7 @@ void PrimaryVertexProducerCUDA::produce(edm::Event& iEvent, const edm::EventSetu (!validBS || (*(algorithm->vertexSelector))(v, beamVertexState))) pvs.push_back(v); } // end of cluster loop - */ + if (fVerbose) { std::cout << "PrimaryVertexProducerCUDAAlgorithm::vertices candidates =" << pvs.size() << std::endl; @@ -553,7 +643,7 @@ void PrimaryVertexProducerCUDA::produce(edm::Event& iEvent, const edm::EventSetu if (pvs.empty() && seltks.size() > 5) edm::LogWarning("PrimaryVertexProducerCUDA") << "no vertex found with " << seltks.size() << " tracks and " << clusters.size() << " vertex-candidates"; - */ + // sort vertices by pt**2 vertex (aka signal vertex tagging) if (pvs.size() > 1) { sort(pvs.begin(), pvs.end(), VertexHigherPtSquared()); @@ -611,9 +701,10 @@ void PrimaryVertexProducerCUDA::produce(edm::Event& iEvent, const edm::EventSetu << v->zError(); std::cout << std::endl; } - */ + iEvent.put(std::move(result), algorithm->label); } + */ } void PrimaryVertexProducerCUDA::fillDescriptions(edm::ConfigurationDescriptions& descriptions) { diff --git a/RecoVertex/PrimaryVertexProducer/plugins/fitterCUDA.cc b/RecoVertex/PrimaryVertexProducer/plugins/fitterCUDA.cc index 21fe48a11af46..64a65f71a9f22 100644 --- a/RecoVertex/PrimaryVertexProducer/plugins/fitterCUDA.cc +++ b/RecoVertex/PrimaryVertexProducer/plugins/fitterCUDA.cc @@ -22,29 +22,22 @@ namespace fitterCUDA { -/* + __global__ void fitterKernel( - TransientVertex cuda_pvs[], - algo algorithm, - reco::TransientTrack cuda_clusters[][max_size], - reco::BeamSpot beamSpot, - VertexState beamVertexState, - int clusters_size, - int max_size, - bool f4D, - bool validBS, - bool weightFit, - bool fVerbose + unsigned int ntracks, + TrackForPV::TrackForPVSoA* GPUtracksObject, + TrackForPV::VertexForPVSoA* GPUverticesObject, + algo algorithm ){ - +// return; +// RECOMMENT IF NEEDED FROM HERE int idx = blockIdx.x * blockDim.x + threadIdx.x; - // for (std::vector >::const_iterator iclus = clusters.begin(); // iclus != clusters.end(); // iclus++) { - +/* for (int i = 0; i < clusters_size; i++) { double sumwt = 0.; double sumwt2 = 0.; @@ -122,92 +115,89 @@ __global__ void fitterKernel( (!validBS || (*(algorithm.vertexSelector))(v, beamVertexState))) cuda_pvs[idx] = v; //pvs.push_back(v); } -} */ +} + #ifdef __CUDACC__ -std::vector wrapper( - algo algorithm, - std::vector >&& clusters, - reco::BeamSpot beamSpot, - VertexState beamVertexState, - bool f4D, - bool validBS, - bool weightFit, - bool fVerbose ){ - - std::cout << "\n\n\n\n\ngot to the wrapper\n"; +void wrapper( + unsigned int ntracks, + TrackForPV::TrackForPVSoA* GPUtracksObject, + TrackForPV::VertexForPVSoA* GPUverticesObject, + algo algorithm +){ //defines grid unsigned int blockSize = 1; unsigned int gridSize = 1; std::cout << "defined grid size\n"; + + /* //create and allocate all host memory (only pvs needed) TransientVertex *cpu_pvs; - cpu_pvs = (TransientVertex *) malloc(clusters.size() * sizeof(TransientVertex)); - std::cout << "created and allocated all host memory\n"; + TrackForPV::TrackForPVSoA* cpu_clusters; + cpu_pvs = (TransientVertex *) malloc(ntracks * sizeof(TransientVertex)); + cpu_clusters = (reco::TransientTrack *) malloc(clusters.size() * max_size * sizeof(reco::TransientTrack)); //1D flattened - //create and allocate all device memory (pvs and clusters) - reco::TransientTrack **cuda_clusters; - TransientVertex *cuda_pvs; - std::cout << "initial defs good\n"; - cudaCheck(cudaMalloc(&cuda_pvs, clusters.size() * sizeof(TransientVertex))); - cudaCheck(cudaMalloc(&cuda_clusters, clusters.size() * sizeof(reco::TransientTrack *))); - std::cout << "initial cuda mallocs good\n"; - reco::TransientTrack sample_track; - long unsigned int max_size = 0; - for (long unsigned int i = 0; i < clusters.size(); i++) if (max_size < clusters[i].size()) max_size = clusters[i].size(); + ///////////////////////////////////////////////////////// + ////// MODIFY TYPE / CONVERT TO WHAT'S NEEDED HERE ////// + ///////////////////////////////////////////////////////// + + long unsigned int n_iter[clusters.size()]; + std::cout << "size of clusters array: " << clusters.size() << "X" << max_size << "\n"; + //std::cout << "size of cpu_clusters: " << end(cpu_clusters) - begin(cpu_clusters) << "\n"; for (long unsigned int i = 0; i < clusters.size(); i++) { - cudaCheck(cudaMalloc(&(cuda_clusters[i]), (max_size+1) * sizeof(reco::TransientTrack))); //this is segfaulting rn - std::cout << "cudamalloc of clusters 2d is good\n"; + std::cout << "i is " << i << ". current size is " << clusters[i].size() << "\n"; + n_iter[i] = clusters[i].size(); //use in kernel to keep track of bounds on 2D array for (long unsigned int j = 0; j < clusters[i].size(); j++) { - std::cout << "everything until population good\n"; - cuda_clusters[i][j] = clusters[i][j]; //populating rectangular 2D cluster array on valid vals + std::cout << "j is " << j << "\t"; + reco::TransientTrack test = clusters[i][j]; + std::cout << "read ok\t"; + cpu_clusters[i * max_size + j] = test; //populating rectangular 2D cluster array on valid vals + std::cout << "write ok\n"; } - cuda_clusters[i][clusters[i].size()] = sample_track; //SET FOR FINAL VALUES, COMPARE IN KERNEL } - std::cout << "created and allocated all device memory\n"; - //host to device memory copy (NONE NEEDED) - //cudaCheck(cudaMemcpy(cuda_clusters, cpu_clusters, memSize1, cudaMemcpyHostToDevice)); - //cudaCheck(cudaMemcpy(cuda_pvs, cpu_pvs, memSize2, cudaMemcpyHostToDevice)); - //std::cout << "host memory copied to device memory\n"; + std::cout << "created and allocated all host memory\n"; + + //create and allocate all device memory (pvs and clusters) + reco::TransientTrack *cuda_clusters; + TransientVertex *cuda_pvs; + cudaCheck(cudaMalloc(&cuda_pvs, clusters.size() * sizeof(TransientVertex))); + cudaCheck(cudaMalloc(&cuda_clusters, clusters.size() * max_size * sizeof(reco::TransientTrack))); + + std::cout << "created and allocated all device memory\n"; + + //host to device memory copy + cudaCheck(cudaMemcpy(cuda_clusters, cpu_clusters, clusters.size() * max_size * sizeof(reco::TransientTrack), cudaMemcpyHostToDevice)); + std::cout << "host memory copied to device memory\n"; + */ //action! - /* fitterKernel<<>>( - cuda_pvs, - algorithm, - cuda_clusters, - beamSpot, - beamVertexState, - clusters.size(), - max_size, - f4D, - validBS, - weightFit, - fVerbose + ntracks, + GPUtracksObject, + GPUverticesObject, + algorithm ); std::cout << "main action complete\n"; - */ //wait for device to complete / error check cudaDeviceSynchronize(); cudaCheck(cudaGetLastError()); std::cout << "sync / error check complete\n"; + + /* //device to host memory copy cudaCheck(cudaMemcpy(cpu_pvs, cuda_pvs, clusters.size() * sizeof(TransientVertex), cudaMemcpyDeviceToHost)); std::cout << "device memory copied to host memory\n"; //clear device memory - for (long unsigned int i = 0; i < clusters.size(); i++) { - cudaCheck(cudaFree(cuda_clusters[i])); - } cudaCheck(cudaFree(cuda_clusters)); cudaCheck(cudaFree(cuda_pvs)); std::cout << "cleared device memory\n\n"; @@ -219,6 +209,8 @@ std::vector wrapper( //done return pvs; + + */ } #endif } diff --git a/RecoVertex/PrimaryVertexProducer/test/test_dqm_gpu.root b/RecoVertex/PrimaryVertexProducer/test/test_dqm_gpu.root index d7b3f4ce85684..e81e8c6a294f7 100644 Binary files a/RecoVertex/PrimaryVertexProducer/test/test_dqm_gpu.root and b/RecoVertex/PrimaryVertexProducer/test/test_dqm_gpu.root differ diff --git a/RecoVertex/PrimaryVertexProducer/test/test_gpu.root b/RecoVertex/PrimaryVertexProducer/test/test_gpu.root index 46da6af46c08c..95119c4d64b9b 100644 Binary files a/RecoVertex/PrimaryVertexProducer/test/test_gpu.root and b/RecoVertex/PrimaryVertexProducer/test/test_gpu.root differ