cms-patatrack · vkhristenko · Jul 1, 2019 · Jul 1, 2019 · Jul 1, 2019 · Jul 1, 2019
diff --git a/RecoLocalCalo/EcalRecAlgos/interface/DeclsForKernels.h b/RecoLocalCalo/EcalRecAlgos/interface/DeclsForKernels.h
@@ -72,6 +72,30 @@ struct EventInputDataGPU {
     }
 };
 
+enum class KernelsVersion : uint32_t {
+    // kernels are completely splitted and launched/sync from the host
+    SplittedHostLaunch = 0,
+    // kernels are completely splitted, a grid of (1,1) is launched
+    // and a single gpu thread will do launch/sync of consequitive kernels
+    SplittedDeviceLaunch = 1,
+    // fused minimization, 10 x 10 threads per block.
+    // 1 channel per block
+    Fused = 2,
+    // Reasoning:
+    // 1) after iteration 0 and 1, there are < 10% channels left
+    // 2) after iteration 2, there are always < 1% channels left
+    // 3) instead of continueing spinning (launch+sync) the cpu, 
+    //   launch fused minimization
+    // Kernels are splitted, as in SplittedHostLaunch version
+    // after iteration 3 (TODO: should be configurable) laucnh fused version
+    // to continue minimization. there should be small number of blocks needed (resources! consumed)
+    HybridHostLaunch = 3,
+    // similar to SplittedDeviceLaunch for launching, (1, 1) gpu grid controls
+    // minimization. Hybrid - Runs fused minimization starting from iteration N
+    // TODO: right now N is fixed at 4
+    HybridDeviceLaunch = 4
+};
+
 // parameters have a fixed type
 // Can we go by with single precision
 struct ConfigurationParameters {
@@ -100,6 +124,7 @@ struct ConfigurationParameters {
     std::array<uint32_t, 3> kernelMinimizeThreads;
 
     bool shouldRunTimingComputation;
+    KernelsVersion version;
 };
 
 struct EventOutputDataGPU final : public ::ecal::UncalibratedRecHit<::ecal::Tag::ptr> 
@@ -145,11 +170,14 @@ struct EventDataForScratchGPU {
     SampleVector *samples = nullptr;
     SampleGainVector *gainsNoise = nullptr;
 
-    SampleMatrix* noisecov = nullptr;
-    PulseMatrixType *pulse_matrix = nullptr;
-    FullSampleMatrix* pulse_covariances = nullptr;
-    BXVectorType *activeBXs = nullptr;
-    char *acState = nullptr;
+    uint32_t *v2rmapping_1=nullptr;
+    uint32_t *v2rmapping_2=nullptr;
+    uint32_t *pChannelsCounter = nullptr;
+
+    SampleVector::Scalar *decompMatrixMainLoop = nullptr, *decompMatrixFnnls=nullptr;
+    SampleVector::Scalar *AtA=nullptr, *Atb=nullptr;
+    char *samplesMapping=nullptr, *npassive=nullptr;
+    ::ecal::reco::StorageScalarType *chi2_prev=nullptr;
 
     bool *hasSwitchToGain6=nullptr,
          *hasSwitchToGain1=nullptr,
@@ -173,16 +201,27 @@ struct EventDataForScratchGPU {
         cudaCheck( cudaMalloc((void**)&gainsNoise,
             size * sizeof(SampleGainVector)) );
 
-        cudaCheck( cudaMalloc((void**)&pulse_covariances,
-            size * sizeof(FullSampleMatrix)) );
-        cudaCheck( cudaMalloc((void**)&noisecov,
-            size * sizeof(SampleMatrix)) );
-        cudaCheck( cudaMalloc((void**)&pulse_matrix,
-            size * sizeof(PulseMatrixType)) );
-        cudaCheck( cudaMalloc((void**)&activeBXs,
-            size * sizeof(BXVectorType)) );
-        cudaCheck( cudaMalloc((void**)&acState,
+        cudaCheck( cudaMalloc((void**)&v2rmapping_1,
+            size * sizeof(uint32_t)) );
+        cudaCheck( cudaMalloc((void**)&v2rmapping_2,
+            size * sizeof(uint32_t)) );
+        cudaCheck( cudaMalloc((void**)&pChannelsCounter,
+            sizeof(uint32_t)) );
+        // FIXME: replace 55 with MapSymM::total
+        cudaCheck( cudaMalloc((void**)&decompMatrixMainLoop,
+            size * 55 * sizeof(SampleVector::Scalar)) );
+        cudaCheck( cudaMalloc((void**)&decompMatrixFnnls,
+            size * 55 * sizeof(SampleVector::Scalar)) );
+        cudaCheck( cudaMalloc((void**)&AtA,
+            size * 55 * sizeof(SampleVector::Scalar)) );
+        cudaCheck( cudaMalloc((void**)&Atb,
+            size * 10 * sizeof(SampleVector::Scalar)) );
+        cudaCheck( cudaMalloc((void**)&samplesMapping,
+            size * 10 * sizeof(char)) );
+        cudaCheck( cudaMalloc((void**)&npassive,
             size * sizeof(char)) );
+        cudaCheck( cudaMalloc((void**)&chi2_prev,
+            size * sizeof(::ecal::reco::StorageScalarType)) );
 
         cudaCheck( cudaMalloc((void**)&hasSwitchToGain6,
             size * sizeof(bool)) );
@@ -232,11 +271,17 @@ struct EventDataForScratchGPU {
         cudaCheck( cudaFree(samples) );
         cudaCheck( cudaFree(gainsNoise) );
 
-        cudaCheck( cudaFree(pulse_covariances) );
-        cudaCheck( cudaFree(noisecov) );
-        cudaCheck( cudaFree(pulse_matrix) );
-        cudaCheck( cudaFree(activeBXs) );
-        cudaCheck( cudaFree(acState) );
+        cudaCheck( cudaFree(v2rmapping_1) );
+        cudaCheck( cudaFree(v2rmapping_2) );
+        cudaCheck( cudaFree(pChannelsCounter) );
+
+        cudaCheck( cudaFree(decompMatrixMainLoop) );
+        cudaCheck( cudaFree(decompMatrixFnnls) );
+        cudaCheck( cudaFree(AtA) );
+        cudaCheck( cudaFree(Atb) );
+        cudaCheck( cudaFree(samplesMapping) );
+        cudaCheck( cudaFree(npassive) );
+        cudaCheck( cudaFree(chi2_prev) );
 
         cudaCheck( cudaFree(hasSwitchToGain6) );
         cudaCheck( cudaFree(hasSwitchToGain1) );

diff --git a/RecoLocalCalo/EcalRecAlgos/interface/EigenMatrixTypes_gpu.h b/RecoLocalCalo/EcalRecAlgos/interface/EigenMatrixTypes_gpu.h
@@ -9,36 +9,14 @@
 namespace ecal { namespace multifit {
 
 constexpr int SampleVectorSize = 10;
-constexpr int FullSampleVectorSize = 19;
-constexpr int PulseVectorSize = 12;
-constexpr int NGains = 3;
+constexpr uint32_t noiseCovIsZeroMask = 0xffffffff;
+constexpr uint32_t idIsInvalidMask = 0xffffffff;
 
 using data_type = ::ecal::reco::ComputationScalarType;
 
-typedef Eigen::Matrix<data_type, SampleVectorSize, SampleVectorSize> PulseMatrixType;
-typedef Eigen::Matrix<char, SampleVectorSize, 1> BXVectorType;
-using SampleMatrixD = Eigen::Matrix<double,SampleVectorSize,SampleVectorSize>;
-
 typedef Eigen::Matrix<data_type,SampleVectorSize,1> SampleVector;
-typedef Eigen::Matrix<data_type,FullSampleVectorSize,1> FullSampleVector;
-typedef Eigen::Matrix<data_type,Eigen::Dynamic,1,0,PulseVectorSize,1> PulseVector;
-typedef Eigen::Matrix<char,Eigen::Dynamic,1,0,PulseVectorSize,1> BXVector;
 typedef Eigen::Matrix<char, SampleVectorSize,1> SampleGainVector;
 typedef Eigen::Matrix<data_type,SampleVectorSize,SampleVectorSize> SampleMatrix;
-typedef Eigen::Matrix<data_type,FullSampleVectorSize,FullSampleVectorSize> FullSampleMatrix;
-typedef Eigen::Matrix<data_type,Eigen::Dynamic,Eigen::Dynamic,0,PulseVectorSize,PulseVectorSize> PulseMatrix;
-typedef Eigen::Matrix<data_type,SampleVectorSize,Eigen::Dynamic,0,SampleVectorSize,PulseVectorSize> SamplePulseMatrix;
-typedef Eigen::LLT<SampleMatrix> SampleDecompLLT;
-typedef Eigen::LLT<SampleMatrixD> SampleDecompLLTD;
-typedef Eigen::LLT<PulseMatrix> PulseDecompLLT;
-typedef Eigen::LDLT<PulseMatrix> PulseDecompLDLT;
-
-typedef Eigen::Matrix<data_type,1,1> SingleMatrix;
-typedef Eigen::Matrix<data_type,1,1> SingleVector;
-
-typedef std::array<SampleMatrixD,NGains> SampleMatrixGainArray;
-
-using PermutationMatrix = Eigen::PermutationMatrix<SampleMatrix::RowsAtCompileTime>;
 
 }}