Skip to content

Commit 8e11466

Browse files
VinInnfwyzard
authored and committed
Migrate the pixel rechits producer and CA to the new heterogeneous framework (#338)
Use cleaned hits. Use pixel layer and ladders geometry, and use pixel triplets in the gaps. Optimise GPU memory usage: - reduce the number of memory allocations - fix the size of the cub workspace - allocate memory per event via the caching allocator - use constant memory for geometry and parameters - use shared memory where the content is the same for every thread Optimise kernel launches, and add a protection for empty events and overflows.
1 parent 9f61afb commit 8e11466

13 files changed

+455
-364
lines changed

RecoPixelVertexing/Configuration/python/customizePixelTracksForProfiling.py

-2
Original file line numberDiff line numberDiff line change
@@ -21,7 +21,6 @@ def customizePixelTracksForProfilingDisableConversion(process):
2121
process = customizePixelTracksForProfiling(process)
2222

2323
# Disable conversions to legacy
24-
process.siPixelRecHitsPreSplitting.gpuEnableConversion = False
2524
process.pixelTracksHitQuadruplets.gpuEnableConversion = False
2625
process.pixelTracks.gpuEnableConversion = False
2726
process.pixelVertices.gpuEnableConversion = False
@@ -32,7 +31,6 @@ def customizePixelTracksForProfilingDisableTransfer(process):
3231
process = customizePixelTracksForProfilingDisableConversion(process)
3332

3433
# Disable "unnecessary" transfers to CPU
35-
process.siPixelRecHitsPreSplitting.gpuEnableTransfer = False
3634
process.pixelTracksHitQuadruplets.gpuEnableTransfer = False
3735
process.pixelVertices.gpuEnableTransfer = False
3836

RecoPixelVertexing/PixelTrackFitting/interface/FitResult.h

+2-2
Original file line numberDiff line numberDiff line change
@@ -38,7 +38,7 @@ namespace Rfit
3838
|cov(X0, R)|cov(Y0, R)|cov( R, R)|
3939
*/
4040
int32_t q; //!< particle charge
41-
float chi2 = 0.0;
41+
float chi2;
4242
};
4343

4444
struct line_fit
@@ -49,7 +49,7 @@ namespace Rfit
4949
|cov(c_t,c_t)|cov(Zip,c_t)| \n
5050
|cov(c_t,Zip)|cov(Zip,Zip)|
5151
*/
52-
double chi2 = 0.0;
52+
double chi2;
5353
};
5454

5555
struct helix_fit

RecoPixelVertexing/PixelTriplets/plugins/BrokenLineFitOnGPU.cu

+37-31
Original file line numberDiff line numberDiff line change
@@ -11,12 +11,12 @@
1111
#include "HeterogeneousCore/CUDAUtilities/interface/cudaCheck.h"
1212
#include "HeterogeneousCore/CUDAUtilities/interface/cuda_assert.h"
1313
#include "RecoLocalTracker/SiPixelRecHits/interface/pixelCPEforGPU.h"
14-
#include "RecoLocalTracker/SiPixelRecHits/plugins/siPixelRecHitsHeterogeneousProduct.h"
1514

15+
#include "FWCore/ServiceRegistry/interface/Service.h"
16+
#include "HeterogeneousCore/CUDAServices/interface/CUDAService.h"
17+
#include "CUDADataFormats/TrackingRecHit/interface/TrackingRecHit2DCUDA.h"
1618

17-
using HitsOnCPU = siPixelRecHitsHeterogeneousProduct::HitsOnCPU;
18-
19-
using HitsOnGPU = siPixelRecHitsHeterogeneousProduct::HitsOnGPU;
19+
using HitsOnGPU = TrackingRecHit2DSOAView;
2020
using TuplesOnGPU = pixelTuplesHeterogeneousProduct::TuplesOnGPU;
2121

2222
using namespace Eigen;
@@ -75,14 +75,14 @@ void kernelBLFastFit(TuplesOnGPU::Container const * __restrict__ foundNtuplets,
7575
for (unsigned int i = 0; i < hitsInFit; ++i) {
7676
auto hit = hitId[i];
7777
float ge[6];
78-
hhp->cpeParams->detParams(hhp->detInd_d[hit]).frame.toGlobal(hhp->xerr_d[hit], 0, hhp->yerr_d[hit], ge);
78+
hhp->cpeParams().detParams(hhp->detectorIndex(hit)).frame.toGlobal(hhp->xerrLocal(hit), 0, hhp->yerrLocal(hit), ge);
7979
#ifdef BL_DUMP_HITS
8080
if (dump){
81-
printf("Hit global: %d: %d hits.col(%d) << %f,%f,%f\n", helix_start, hhp->detInd_d[hit],i,hhp->xg_d[hit],hhp->yg_d[hit],hhp->zg_d[hit]);
82-
printf("Error: %d: %d hits_ge.col(%d) << %e,%e,%e,%e,%e,%e\n",helix_start,hhp->detInd_d[hit],i,ge[0],ge[1],ge[2],ge[3],ge[4],ge[5]);
81+
printf("Hit global: %d: %d hits.col(%d) << %f,%f,%f\n", helix_start, hhp->detectorIndex(hit),i,hhp->xGlobal(hit),hhp->yGlobal(hit),hhp->zGlobal(hit));
82+
printf("Error: %d: %d hits_ge.col(%d) << %e,%e,%e,%e,%e,%e\n",helix_start,hhp->detetectorIndex(hit),i,ge[0],ge[1],ge[2],ge[3],ge[4],ge[5]);
8383
}
8484
#endif
85-
hits.col(i) << hhp->xg_d[hit], hhp->yg_d[hit], hhp->zg_d[hit];
85+
hits.col(i) << hhp->xGlobal(hit), hhp->yGlobal(hit), hhp->zGlobal(hit);
8686
hits_ge.col(i) << ge[0],ge[1],ge[2],ge[3],ge[4],ge[5];
8787
}
8888
BrokenLine::BL_Fast_fit(hits,fast_fit);
@@ -167,65 +167,71 @@ void kernelBLFit(
167167
}
168168

169169

170-
void HelixFitOnGPU::launchBrokenLineKernels(HitsOnCPU const & hh, uint32_t hitsInFit, uint32_t maxNumberOfTuples, cudaStream_t cudaStream)
170+
void HelixFitOnGPU::launchBrokenLineKernels(HitsOnCPU const & hh, uint32_t hitsInFit, uint32_t maxNumberOfTuples, cuda::stream_t<> & stream)
171171
{
172-
assert(tuples_d); assert(fast_fit_resultsGPU_);
172+
assert(tuples_d);
173173

174174
auto blockSize = 64;
175175
auto numberOfBlocks = (maxNumberOfConcurrentFits_ + blockSize - 1) / blockSize;
176176

177-
for (uint32_t offset=0; offset<maxNumberOfTuples; offset+=maxNumberOfConcurrentFits_) {
177+
// Fit internals
178+
edm::Service<CUDAService> cs;
179+
auto hitsGPU_ = cs->make_device_unique<double[]>(maxNumberOfConcurrentFits_ * sizeof(Rfit::Matrix3xNd<4>)/sizeof(double),stream);
180+
auto hits_geGPU_ = cs->make_device_unique<float[]>(maxNumberOfConcurrentFits_ * sizeof(Rfit::Matrix6x4f)/sizeof(float),stream);
181+
auto fast_fit_resultsGPU_ = cs->make_device_unique<double[]>(maxNumberOfConcurrentFits_ * sizeof(Rfit::Vector4d)/sizeof(double),stream);
182+
183+
for (uint32_t offset=0; offset<maxNumberOfTuples; offset+=maxNumberOfConcurrentFits_) {
178184

179185
// fit triplets
180-
kernelBLFastFit<3><<<numberOfBlocks, blockSize, 0, cudaStream>>>(
181-
tuples_d, tupleMultiplicity_d, hh.gpu_d,
182-
hitsGPU_, hits_geGPU_, fast_fit_resultsGPU_,
186+
kernelBLFastFit<3><<<numberOfBlocks, blockSize, 0, stream.id()>>>(
187+
tuples_d, tupleMultiplicity_d, hh.view(),
188+
hitsGPU_.get(), hits_geGPU_.get(), fast_fit_resultsGPU_.get(),
183189
3, offset);
184190
cudaCheck(cudaGetLastError());
185191

186-
kernelBLFit<3><<<numberOfBlocks, blockSize, 0, cudaStream>>>(
192+
kernelBLFit<3><<<numberOfBlocks, blockSize, 0, stream.id()>>>(
187193
tupleMultiplicity_d, bField_, helix_fit_results_d,
188-
hitsGPU_, hits_geGPU_, fast_fit_resultsGPU_,
194+
hitsGPU_.get(), hits_geGPU_.get(), fast_fit_resultsGPU_.get(),
189195
3, offset);
190196
cudaCheck(cudaGetLastError());
191197

192198
// fit quads
193-
kernelBLFastFit<4><<<numberOfBlocks, blockSize, 0, cudaStream>>>(
194-
tuples_d, tupleMultiplicity_d, hh.gpu_d,
195-
hitsGPU_, hits_geGPU_, fast_fit_resultsGPU_,
199+
kernelBLFastFit<4><<<numberOfBlocks, blockSize, 0, stream.id()>>>(
200+
tuples_d, tupleMultiplicity_d, hh.view(),
201+
hitsGPU_.get(), hits_geGPU_.get(), fast_fit_resultsGPU_.get(),
196202
4, offset);
197203
cudaCheck(cudaGetLastError());
198204

199-
kernelBLFit<4><<<numberOfBlocks, blockSize, 0, cudaStream>>>(
205+
kernelBLFit<4><<<numberOfBlocks, blockSize, 0, stream.id()>>>(
200206
tupleMultiplicity_d, bField_, helix_fit_results_d,
201-
hitsGPU_, hits_geGPU_, fast_fit_resultsGPU_,
207+
hitsGPU_.get(), hits_geGPU_.get(), fast_fit_resultsGPU_.get(),
202208
4, offset);
203209
cudaCheck(cudaGetLastError());
204210

205211
if (fit5as4_) {
206212
// fit penta (only first 4)
207-
kernelBLFastFit<4><<<numberOfBlocks, blockSize, 0, cudaStream>>>(
208-
tuples_d, tupleMultiplicity_d, hh.gpu_d,
209-
hitsGPU_, hits_geGPU_, fast_fit_resultsGPU_,
213+
kernelBLFastFit<4><<<numberOfBlocks, blockSize, 0, stream.id()>>>(
214+
tuples_d, tupleMultiplicity_d, hh.view(),
215+
hitsGPU_.get(), hits_geGPU_.get(), fast_fit_resultsGPU_.get(),
210216
5, offset);
211217
cudaCheck(cudaGetLastError());
212218

213-
kernelBLFit<4><<<numberOfBlocks, blockSize, 0, cudaStream>>>(
219+
kernelBLFit<4><<<numberOfBlocks, blockSize, 0, stream.id()>>>(
214220
tupleMultiplicity_d, bField_, helix_fit_results_d,
215-
hitsGPU_, hits_geGPU_, fast_fit_resultsGPU_,
221+
hitsGPU_.get(), hits_geGPU_.get(), fast_fit_resultsGPU_.get(),
216222
5, offset);
217223
cudaCheck(cudaGetLastError());
218224
} else {
219225
// fit penta (all 5)
220-
kernelBLFastFit<5><<<numberOfBlocks, blockSize, 0, cudaStream>>>(
221-
tuples_d, tupleMultiplicity_d, hh.gpu_d,
222-
hitsGPU_, hits_geGPU_, fast_fit_resultsGPU_,
226+
kernelBLFastFit<5><<<numberOfBlocks, blockSize, 0, stream.id()>>>(
227+
tuples_d, tupleMultiplicity_d, hh.view(),
228+
hitsGPU_.get(), hits_geGPU_.get(), fast_fit_resultsGPU_.get(),
223229
5, offset);
224230
cudaCheck(cudaGetLastError());
225231

226-
kernelBLFit<5><<<numberOfBlocks, blockSize, 0, cudaStream>>>(
232+
kernelBLFit<5><<<numberOfBlocks, blockSize, 0, stream.id()>>>(
227233
tupleMultiplicity_d, bField_, helix_fit_results_d,
228-
hitsGPU_, hits_geGPU_, fast_fit_resultsGPU_,
234+
hitsGPU_.get(), hits_geGPU_.get(), fast_fit_resultsGPU_.get(),
229235
5, offset);
230236
cudaCheck(cudaGetLastError());
231237
}

RecoPixelVertexing/PixelTriplets/plugins/BuildFile.xml

-2
Original file line numberDiff line numberDiff line change
@@ -11,8 +11,6 @@
1111
<use name="RecoPixelVertexing/PixelTriplets"/>
1212
<use name="RecoTracker/TkSeedingLayers"/>
1313
<use name="RecoTracker/TkTrackingRegions"/>
14-
<flags CXXFLAGS="-g -fno-math-errno"/>
15-
<flags CUDA_FLAGS="-g"/>
1614
<library file="*.cu *.cc" name="RecoPixelVertexingPixelTripletsPlugins">
1715
<flags EDM_PLUGIN="1"/>
1816
</library>

RecoPixelVertexing/PixelTriplets/plugins/CAConstants.h

+25-5
Original file line numberDiff line numberDiff line change
@@ -6,32 +6,52 @@
66

77
#include "HeterogeneousCore/CUDAUtilities/interface/HistoContainer.h"
88
#include "HeterogeneousCore/CUDAUtilities/interface/GPUVecArray.h"
9-
#include "RecoLocalTracker/SiPixelClusterizer/interface/PixelTrackingGPUConstants.h"
9+
#include "HeterogeneousCore/CUDAUtilities/interface/GPUSimpleVector.h"
10+
#include "CUDADataFormats/SiPixelCluster/interface/gpuClusteringConstants.h"
1011

1112
// #define ONLY_PHICUT
1213

1314
namespace CAConstants {
1415

1516
// constants
16-
17-
constexpr uint32_t maxNumberOfQuadruplets() { return 6*1024; }
17+
#ifdef GPU_SMALL_EVENTS
18+
constexpr uint32_t maxNumberOfTuples() { return 3*1024;}
19+
#else
20+
constexpr uint32_t maxNumberOfTuples() { return 6*1024;}
21+
#endif
22+
constexpr uint32_t maxNumberOfQuadruplets() { return maxNumberOfTuples(); }
1823
#ifndef ONLY_PHICUT
24+
#ifndef GPU_SMALL_EVENTS
1925
constexpr uint32_t maxNumberOfDoublets() { return 262144; }
2026
constexpr uint32_t maxCellsPerHit() { return 128; }
27+
#else
28+
constexpr uint32_t maxNumberOfDoublets() { return 262144/2; }
29+
constexpr uint32_t maxCellsPerHit() { return 128/2; }
30+
#endif
2131
#else
2232
constexpr uint32_t maxNumberOfDoublets() { return 6*262144; }
2333
constexpr uint32_t maxCellsPerHit() { return 4*128; }
2434
#endif
35+
constexpr uint32_t maxNumOfActiveDoublets() { return maxNumberOfDoublets()/4;}
36+
37+
2538
constexpr uint32_t maxNumberOfLayerPairs() { return 13; }
2639
constexpr uint32_t maxNumberOfLayers() { return 10; }
27-
constexpr uint32_t maxTuples() { return 6*1024;}
40+
constexpr uint32_t maxTuples() { return maxNumberOfTuples();}
2841

2942
// types
3043
using hindex_type = uint16_t; // FIXME from siPixelRecHitsHeterogeneousProduct
3144
using tindex_type = uint16_t; // for tuples
45+
46+
using CellNeighbors = GPU::VecArray< uint32_t, 36>;
47+
using CellTracks = GPU::VecArray< tindex_type, 42>;
48+
49+
using CellNeighborsVector = GPU::SimpleVector<CellNeighbors>;
50+
using CellTracksVector = GPU::SimpleVector<CellTracks>;
51+
3252
using OuterHitOfCell = GPU::VecArray< uint32_t, maxCellsPerHit()>;
3353
using TuplesContainer = OneToManyAssoc<hindex_type, maxTuples(), 5*maxTuples()>;
34-
using HitToTuple = OneToManyAssoc<tindex_type, PixelGPUConstants::maxNumberOfHits, 4*maxTuples()>; // 3.5 should be enough
54+
using HitToTuple = OneToManyAssoc<tindex_type, pixelGPUConstants::maxNumberOfHits, 4*maxTuples()>; // 3.5 should be enough
3555
using TupleMultiplicity = OneToManyAssoc<tindex_type,8,maxTuples()>;
3656

3757
}

0 commit comments

Comments
 (0)