Merge pull request #1 from cericeci/CMSSW_12_6_0_pre5_dev_forJosh

PR with updated dataformats and clusterize+vectorize in CUDA for discussion
cericeci · Feb 16, 2023 · 217967f · 217967f
2 parents 8ffa62d + 146f0da
commit 217967f
Show file tree

Hide file tree

Showing 3 changed files with 92 additions and 3 deletions.
diff --git a/CUDADataFormats/Track/interface/TrackForPVSoAT.h b/CUDADataFormats/Track/interface/TrackForPVSoAT.h
@@ -15,7 +15,7 @@ using Vector512d = Eigen::Matrix<double, 1024, 1>;
 template <int32_t S>
 class TrackForPVSoAHeterogeneousT {
 public:
-  static constexpr int32_t stride() { return S; }
+  static constexpr uint32_t stride() { return S; }
 
 public:
   // Track properties needed for the PV selection + fitting
@@ -47,6 +47,16 @@ class TrackForPVSoAHeterogeneousT {
 
   //eigenSoA::ScalarSoA<int8_t, S> nPixelHits;
   //eigenSoA::ScalarSoA<int8_t, S> nTrackerHits;
+  // For the fitter
+  eigenSoA::ScalarSoA<double, S> dx;
+  eigenSoA::ScalarSoA<double, S> dy;
+  eigenSoA::ScalarSoA<double, S> dz;
+  eigenSoA::ScalarSoA<double, S> px;
+  eigenSoA::ScalarSoA<double, S> py;
+  eigenSoA::ScalarSoA<double, S> pz;
+  eigenSoA::ScalarSoA<double, S> dxError;
+  eigenSoA::ScalarSoA<double, S> dyError;
+  eigenSoA::ScalarSoA<double, S> dzError;
 
   // The track-vertex association matrices
   eigenSoA::MatrixSoA<Vector512d, S> vert_sw;
@@ -64,7 +74,7 @@ class TrackForPVSoAHeterogeneousT {
 template <int32_t S>
 class VertexForPVSoAHeterogeneousT {
 public:
-  static constexpr int32_t stride() { return S; }
+  static constexpr uint32_t stride() { return S; }
 
 public:
   // Track properties needed for the PV selection + fitting
@@ -79,6 +89,9 @@ class VertexForPVSoAHeterogeneousT {
   eigenSoA::ScalarSoA<double, S> exparg; // Or this
   eigenSoA::ScalarSoA<int, S> order;
   eigenSoA::ScalarSoA<double, S> z;
+  eigenSoA::ScalarSoA<double, S> x;
+  eigenSoA::ScalarSoA<double, S> y;
+  eigenSoA::ScalarSoA<unsigned int, S> ntracks;
   eigenSoA::ScalarSoA<double, S> rho;
   // Auxiliar vectors
   eigenSoA::ScalarSoA<double, S> aux1;

diff --git a/RecoVertex/PrimaryVertexProducer/interface/clusterizerCUDA.h b/RecoVertex/PrimaryVertexProducer/interface/clusterizerCUDA.h
@@ -987,6 +987,6 @@ std::vector<TransientVertex> vertices(unsigned int ntracks, TrackForPV::TrackFor
   */
 void dumpTV(TrackForPV::TrackForPVSoA* tracks, TrackForPV::VertexForPVSoA* vertices, unsigned int gridSize);
 void bigKernelWrapper(unsigned int ntracks, TrackForPV::TrackForPVSoA* tracks, TrackForPV::VertexForPVSoA* vertices, double* beta, double* osumtkwt, clusterParameters params, cudaStream_t stream);
-
+void verticesAndClusterize(unsigned int ntracks, TrackForPV::TrackForPVSoA* tracks, TrackForPV::VertexForPVSoA* vertices, clusterParameters params, cudaStream_t stream);  
 }
 #endif
diff --git a/RecoVertex/PrimaryVertexProducer/plugins/clusterizerCUDA.cc b/RecoVertex/PrimaryVertexProducer/plugins/clusterizerCUDA.cc
@@ -868,6 +868,74 @@ __global__ void resortVerticesAndAssign(TrackForPV::TrackForPVSoA* tracks, Track
 //   }
 }
 
+__global__ void verticesAndClusterizeKernel(unsigned int ntracks, TrackForPV::TrackForPVSoA* tracks, TrackForPV::VertexForPVSoA* vertices, clusterParameters params){
+
+  size_t firstElement = threadIdx.x + blockIdx.x * blockDim.x; // This is going to be the vertex index
+  size_t gridSize = blockDim.x * gridDim.x;
+
+  // From here it used to be vertices
+
+  for (unsigned int k = firstElement; k < vertices->nTrueVertex(0); k+= gridSize) {
+    unsigned int ivertex = vertices->order(k);
+    vertices->ntracks(ivertex) = 0;
+    for (unsigned int itrackO = 0; itrackO < tracks->nTrueTracks; itrackO+= 1){ 
+      // Check if vertex is valid, i.e. count tracks
+      unsigned int itrack = tracks->order(itrackO);
+      unsigned int ivtxFromTk = tracks->kmin(itrack);
+      if (ivtxFromTk == k){
+	vertices->ntracks(ivertex)++;
+      }
+    }
+    if (not(vertices->ntracks(ivertex) < 1)){
+      vertices->isGood(ivertex) = false; // No longer needed
+      continue; //Skip vertex if it has no tracks
+    }
+    // If we are here, we are going to fit the vertex later, so just in case initialize this
+    vertices->x(ivertex) = 0;
+    vertices->y(ivertex) = 0;
+  }
+  __syncthreads();
+  // From here it used to be clusterize
+  // So we now check whether each vertex is further enough from the previous one
+  for (unsigned int k = firstElement; k < vertices->nTrueVertex(0); k+= gridSize) {
+    int prevVertex = k-1;
+    unsigned int thisVertex = (int) vertices->order(k);
+    if (not(vertices->isGood(thisVertex))){
+      continue;
+    }
+    while (!(vertices->isGood(vertices->order(prevVertex)) && prevVertex >= 0)){
+      // Find the previous vertex that was good
+      prevVertex--;
+    }
+    if ((prevVertex < 0)){ // If it is first, always good
+      vertices->isGood(thisVertex) = true;
+    }
+    else if (std::abs(vertices->z(thisVertex)-vertices->z(prevVertex)) > (2* params.vertexSize)){ //If it is further away enough, it is also good
+      vertices->isGood(thisVertex) = true;
+    }
+    else{
+      vertices->isGood(thisVertex) = false;
+    }
+  }
+  // This is new, basically we have to deal with the order being broken by the invalidation of vertexes and set back again the vertex multiplicity, unfortunately can't be parallelized without competing conditions
+  __syncthreads();
+  if (threadIdx.x == 0 && blockIdx.x == 0){
+    unsigned int k = 0;
+    while (k != vertices->nTrueVertex(0)){
+      unsigned int thisVertex = vertices->order(k);
+      if (vertices->isGood(k)){ // If is good just continue
+        k++;
+      }
+      else{
+        for (unsigned int l = vertices->nTrueVertex(0)-1; l >= k; l --){ //If it is bad, move one position all indexes
+	  vertices->order(l) = vertices->order(l+1);
+	}
+        vertices->nTrueVertex(0)--; // And reduce vertex number by 1
+      }
+    }
+  }
+}
+
 __global__ void bigKernel(unsigned int ntracks, TrackForPV::TrackForPVSoA* tracks, TrackForPV::VertexForPVSoA* vertices, clusterParameters params, double* osumtkwt, double* beta){
 
   extern __shared__ double rbeta[];
@@ -1141,6 +1209,14 @@ void bigKernelWrapper(unsigned int ntracks, TrackForPV::TrackForPVSoA* tracks, T
   cudaCheck(cudaGetLastError());
 */
 }
+
+void verticesAndClusterize(unsigned int ntracks, TrackForPV::TrackForPVSoA* tracks, TrackForPV::VertexForPVSoA* vertices, clusterParameters params, cudaStream_t stream){
+  unsigned int blockSize = 512;
+  unsigned int gridSize  = 1;
+  verticesAndClusterizeKernel<<<gridSize, blockSize,blockSize,stream>>>(ntracks, tracks, vertices, params);
+  cudaCheck(cudaGetLastError());
+}
+
 /*
   // Only on GPUs, of course...
 void initializeWrapper(unsigned int ntracks, TrackForPV::TrackForPVSoA* tracks, TrackForPV::VertexForPVSoA* vertices, double* beta, double* osumtkwt, clusterParameters params, cudaStream_t stream){