From c15e4536e8880936e968183a9882c08882fa5f84 Mon Sep 17 00:00:00 2001
From: mariadalfonso <mariadalfonso@users.noreply.github.com>
Date: Thu, 30 Jul 2020 21:21:01 +0200
Subject: [PATCH] Refactor common ECAL and HCAL code (cms-patatrack#523)

Move duplicated Eigen code to a common file (DataFormats/Math/interface/EigenComputations.h,
namespace calo::multifit), and use it for both ECAL and HCAL.
Move HCAL general reconstruction code from the hcal::multifit to the hcal::reconstruction namespace.
---
 .../plugins/AmplitudeComputationKernels.cu    |  37 +--
 .../EcalRecProducers/plugins/KernelHelpers.h  | 272 +-----------------
 2 files changed, 23 insertions(+), 286 deletions(-)

diff --git a/RecoLocalCalo/EcalRecProducers/plugins/AmplitudeComputationKernels.cu b/RecoLocalCalo/EcalRecProducers/plugins/AmplitudeComputationKernels.cu
index 293d77720dd2b..8c4f1e1e7a2fc 100644
--- a/RecoLocalCalo/EcalRecProducers/plugins/AmplitudeComputationKernels.cu
+++ b/RecoLocalCalo/EcalRecProducers/plugins/AmplitudeComputationKernels.cu
@@ -10,9 +10,9 @@
 #include "DataFormats/Math/interface/approx_exp.h"
 #include "DataFormats/Math/interface/approx_log.h"
 
-#include "KernelHelpers.h"
-#include "AmplitudeComputationKernels.h"
 #include "AmplitudeComputationCommonKernels.h"
+#include "AmplitudeComputationKernels.h"
+#include "KernelHelpers.h"
 
 namespace ecal {
   namespace multifit {
@@ -143,9 +143,9 @@ namespace ecal {
 
       extern __shared__ char shrmem[];
       DataType* shrMatrixLForFnnlsStorage =
-          reinterpret_cast<DataType*>(shrmem) + MapSymM<DataType, NPULSES>::total * threadIdx.x;
-      DataType* shrAtAStorage =
-          reinterpret_cast<DataType*>(shrmem) + MapSymM<DataType, NPULSES>::total * (threadIdx.x + blockDim.x);
+          reinterpret_cast<DataType*>(shrmem) + calo::multifit::MapSymM<DataType, NPULSES>::total * threadIdx.x;
+      DataType* shrAtAStorage = reinterpret_cast<DataType*>(shrmem) +
+                                calo::multifit::MapSymM<DataType, NPULSES>::total * (threadIdx.x + blockDim.x);
 
       // FIXME: remove eitehr idx or ch -> they are teh same thing
       int idx = threadIdx.x + blockDim.x * blockIdx.x;
@@ -174,12 +174,12 @@ namespace ecal {
         int iter = 0;
         int npassive = 0;
 
-        ColumnVector<NPULSES, int> pulseOffsets;
+        calo::multifit::ColumnVector<NPULSES, int> pulseOffsets;
 #pragma unroll
         for (int i = 0; i < NPULSES; ++i)
           pulseOffsets(i) = i;
 
-        ColumnVector<NPULSES, DataType> resultAmplitudes;
+        calo::multifit::ColumnVector<NPULSES, DataType> resultAmplitudes;
 #pragma unroll
         for (int counter = 0; counter < NPULSES; counter++)
           resultAmplitudes(counter) = 0;
@@ -197,7 +197,7 @@ namespace ecal {
           //inverse_cov = noisecov[idx];
           //DataType covMatrixStorage[MapSymM<DataType, NSAMPLES>::total];
           DataType* covMatrixStorage = shrMatrixLForFnnlsStorage;
-          MapSymM<DataType, NSAMPLES> covMatrix{covMatrixStorage};
+          calo::multifit::MapSymM<DataType, NSAMPLES> covMatrix{covMatrixStorage};
           int counter = 0;
 #pragma unroll
           for (int col = 0; col < NSAMPLES; col++)
@@ -210,21 +210,21 @@ namespace ecal {
           // compute actual covariance decomposition
           //covariance_decomposition.compute(inverse_cov);
           //auto const& matrixL = covariance_decomposition.matrixL();
-          DataType matrixLStorage[MapSymM<DataType, NSAMPLES>::total];
-          MapSymM<DataType, NSAMPLES> matrixL{matrixLStorage};
-          compute_decomposition_unrolled(matrixL, covMatrix);
+          DataType matrixLStorage[calo::multifit::MapSymM<DataType, NSAMPLES>::total];
+          calo::multifit::MapSymM<DataType, NSAMPLES> matrixL{matrixLStorage};
+          calo::multifit::compute_decomposition_unrolled(matrixL, covMatrix);
 
           // L * A = P
-          ColMajorMatrix<NSAMPLES, NPULSES> A;
-          solve_forward_subst_matrix(A, pulse_matrix[idx], matrixL);
+          calo::multifit::ColMajorMatrix<NSAMPLES, NPULSES> A;
+          calo::multifit::solve_forward_subst_matrix(A, pulse_matrix[idx], matrixL);
 
           // L b = s
           float reg_b[NSAMPLES];
-          solve_forward_subst_vector(reg_b, samples[idx], matrixL);
+          calo::multifit::solve_forward_subst_vector(reg_b, samples[idx], matrixL);
 
           // FIXME: shared mem
           //DataType AtAStorage[MapSymM<DataType, NPULSES>::total];
-          MapSymM<DataType, NPULSES> AtA{shrAtAStorage};
+          calo::multifit::MapSymM<DataType, NPULSES> AtA{shrAtAStorage};
           //SampleMatrix AtA;
           SampleVector Atb;
 #pragma unroll
@@ -277,7 +277,7 @@ namespace ecal {
 
           // FIXME: shared mem
           //DataType matrixLForFnnlsStorage[MapSymM<DataType, NPULSES>::total];
-          MapSymM<DataType, NPULSES> matrixLForFnnls{shrMatrixLForFnnlsStorage};
+          calo::multifit::MapSymM<DataType, NPULSES> matrixLForFnnls{shrMatrixLForFnnlsStorage};
 
           fnnls(AtA,
                 Atb,
@@ -384,8 +384,9 @@ namespace ecal {
         unsigned int blocks_min = threads_min > totalChannels ? 1 : (totalChannels + threads_min - 1) / threads_min;
         uint32_t const offsetForHashes = conditions.offsetForHashes;
         uint32_t const offsetForInputs = eventInputGPU.ebDigis.size;
-        auto const nbytesShared =
-            2 * threads_min * MapSymM<DataType, SampleVector::RowsAtCompileTime>::total * sizeof(DataType);
+        auto const nbytesShared = 2 * threads_min *
+                                  calo::multifit::MapSymM<DataType, SampleVector::RowsAtCompileTime>::total *
+                                  sizeof(DataType);
         kernel_minimize<<<blocks_min, threads_min, nbytesShared, cudaStream>>>(
             eventInputGPU.ebDigis.ids.get(),
             eventInputGPU.eeDigis.ids.get(),
diff --git a/RecoLocalCalo/EcalRecProducers/plugins/KernelHelpers.h b/RecoLocalCalo/EcalRecProducers/plugins/KernelHelpers.h
index 103087c3517f9..1dc7a0de0f7f3 100644
--- a/RecoLocalCalo/EcalRecProducers/plugins/KernelHelpers.h
+++ b/RecoLocalCalo/EcalRecProducers/plugins/KernelHelpers.h
@@ -1,6 +1,8 @@
 #ifndef RecoLocalCalo_EcalRecProducers_plugins_KernelHelpers_h
 #define RecoLocalCalo_EcalRecProducers_plugins_KernelHelpers_h
 
+#include "DataFormats/Math/interface/EigenComputations.h"
+
 #include <cmath>
 #include <limits>
 #include <type_traits>
@@ -10,280 +12,14 @@
 namespace ecal {
   namespace multifit {
 
-    template <int NROWS, int NCOLS>
-    using ColMajorMatrix = Eigen::Matrix<float, NROWS, NCOLS, Eigen::ColMajor>;
-
-    template <int NROWS, int NCOLS>
-    using RowMajorMatrix = Eigen::Matrix<float, NROWS, NCOLS, Eigen::RowMajor>;
-
-    template <int SIZE, typename T = float>
-    using ColumnVector = Eigen::Matrix<T, SIZE, 1>;
-
-    template <int SIZE, typename T = float>
-    using RowVector = Eigen::Matrix<T, 1, SIZE>;
-
-    // FIXME: provide specialization for Row Major layout
-    template <typename T, int Stride, int Order = Eigen::ColMajor>
-    struct MapSymM {
-      using type = T;
-      using base_type = typename std::remove_const<type>::type;
-
-      static constexpr int total = Stride * (Stride + 1) / 2;
-      static constexpr int stride = Stride;
-      T* data;
-
-      __forceinline__ __device__ MapSymM(T* data) : data{data} {}
-
-      __forceinline__ __device__ T const& operator()(int const row, int const col) const {
-        auto const tmp = (Stride - col) * (Stride - col + 1) / 2;
-        auto const index = total - tmp + row - col;
-        return data[index];
-      }
-
-      template <typename U = T>
-      __forceinline__ __device__ typename std::enable_if<std::is_same<base_type, U>::value, base_type>::type&
-      operator()(int const row, int const col) {
-        auto const tmp = (Stride - col) * (Stride - col + 1) / 2;
-        auto const index = total - tmp + row - col;
-        return data[index];
-      }
-    };
-
-    // FIXME: either use/modify/improve eigen or make this more generic
-    // this is a map for a pulse matrix to building a 2d matrix for each channel
-    // and hide indexing
-    template <typename T>
-    struct MapMForPM {
-      using type = T;
-      using base_type = typename std::remove_cv<type>::type;
-
-      type* data;
-      __forceinline__ __device__ MapMForPM(type* data) : data{data} {}
-
-      __forceinline__ __device__ base_type operator()(int const row, int const col) const {
-        auto const index = 2 - col + row;
-        return index >= 0 ? data[index] : 0;
-      }
-    };
-
-    // simple/trivial cholesky decomposition impl
-    template <typename MatrixType1, typename MatrixType2>
-    __forceinline__ __device__ void compute_decomposition_unrolled(MatrixType1& L, MatrixType2 const& M) {
-      auto const sqrtm_0_0 = std::sqrt(M(0, 0));
-      L(0, 0) = sqrtm_0_0;
-      using T = typename MatrixType1::base_type;
-
-#pragma unroll
-      for (int i = 1; i < MatrixType1::stride; i++) {
-        T sumsq{0};
-        for (int j = 0; j < i; j++) {
-          T sumsq2{0};
-          auto const m_i_j = M(i, j);
-          for (int k = 0; k < j; ++k)
-            sumsq2 += L(i, k) * L(j, k);
-
-          auto const value_i_j = (m_i_j - sumsq2) / L(j, j);
-          L(i, j) = value_i_j;
-
-          sumsq += value_i_j * value_i_j;
-        }
-
-        auto const l_i_i = std::sqrt(M(i, i) - sumsq);
-        L(i, i) = l_i_i;
-      }
-    }
-
-    template <typename MatrixType1, typename MatrixType2>
-    __forceinline__ __device__ void compute_decomposition(MatrixType1& L, MatrixType2 const& M, int const N) {
-      auto const sqrtm_0_0 = std::sqrt(M(0, 0));
-      L(0, 0) = sqrtm_0_0;
-      using T = typename MatrixType1::base_type;
-
-      for (int i = 1; i < N; i++) {
-        T sumsq{0};
-        for (int j = 0; j < i; j++) {
-          T sumsq2{0};
-          auto const m_i_j = M(i, j);
-          for (int k = 0; k < j; ++k)
-            sumsq2 += L(i, k) * L(j, k);
-
-          auto const value_i_j = (m_i_j - sumsq2) / L(j, j);
-          L(i, j) = value_i_j;
-
-          sumsq += value_i_j * value_i_j;
-        }
-
-        auto const l_i_i = std::sqrt(M(i, i) - sumsq);
-        L(i, i) = l_i_i;
-      }
-    }
-
-    template <typename MatrixType1, typename MatrixType2, typename VectorType>
-    __forceinline__ __device__ void compute_decomposition_forwardsubst_with_offsets(
-        MatrixType1& L,
-        MatrixType2 const& M,
-        float b[MatrixType1::stride],
-        VectorType const& Atb,
-        int const N,
-        ColumnVector<MatrixType1::stride, int> const& pulseOffsets) {
-      auto const real_0 = pulseOffsets(0);
-      auto const sqrtm_0_0 = std::sqrt(M(real_0, real_0));
-      L(0, 0) = sqrtm_0_0;
-      using T = typename MatrixType1::base_type;
-      b[0] = Atb(real_0) / sqrtm_0_0;
-
-      for (int i = 1; i < N; i++) {
-        auto const i_real = pulseOffsets(i);
-        T sumsq{0};
-        T total = 0;
-        auto const atb = Atb(i_real);
-        for (int j = 0; j < i; j++) {
-          auto const j_real = pulseOffsets(j);
-          T sumsq2{0};
-          auto const m_i_j = M(std::max(i_real, j_real), std::min(i_real, j_real));
-          for (int k = 0; k < j; ++k)
-            sumsq2 += L(i, k) * L(j, k);
-
-          auto const value_i_j = (m_i_j - sumsq2) / L(j, j);
-          L(i, j) = value_i_j;
-
-          sumsq += value_i_j * value_i_j;
-          total += value_i_j * b[j];
-        }
-
-        auto const l_i_i = std::sqrt(M(i_real, i_real) - sumsq);
-        L(i, i) = l_i_i;
-        b[i] = (atb - total) / l_i_i;
-      }
-    }
-
-    template <typename MatrixType1, typename MatrixType2, typename VectorType>
-    __forceinline__ __device__ void update_decomposition_forwardsubst_with_offsets(
-        MatrixType1& L,
-        MatrixType2 const& M,
-        float b[MatrixType1::stride],
-        VectorType const& Atb,
-        int const N,
-        ColumnVector<MatrixType1::stride, int> const& pulseOffsets) {
-      using T = typename MatrixType1::base_type;
-      auto const i = N - 1;
-      auto const i_real = pulseOffsets(i);
-      T sumsq{0};
-      T total = 0;
-      for (int j = 0; j < i; j++) {
-        auto const j_real = pulseOffsets(j);
-        T sumsq2{0};
-        auto const m_i_j = M(std::max(i_real, j_real), std::min(i_real, j_real));
-        for (int k = 0; k < j; ++k)
-          sumsq2 += L(i, k) * L(j, k);
-
-        auto const value_i_j = (m_i_j - sumsq2) / L(j, j);
-        L(i, j) = value_i_j;
-        sumsq += value_i_j * value_i_j;
-
-        total += value_i_j * b[j];
-      }
-
-      auto const l_i_i = std::sqrt(M(i_real, i_real) - sumsq);
-      L(i, i) = l_i_i;
-      b[i] = (Atb(i_real) - total) / l_i_i;
-    }
-
-    template <typename MatrixType1, typename MatrixType2, typename MatrixType3>
-    __device__ void solve_forward_subst_matrix(MatrixType1& A,
-                                               MatrixType2 const& pulseMatrixView,
-                                               MatrixType3 const& matrixL) {
-      // FIXME: this assumes pulses are on columns and samples on rows
-      constexpr auto NPULSES = MatrixType2::ColsAtCompileTime;
-      constexpr auto NSAMPLES = MatrixType2::RowsAtCompileTime;
-
-#pragma unroll
-      for (int icol = 0; icol < NPULSES; icol++) {
-        float reg_b[NSAMPLES];
-        float reg_L[NSAMPLES];
-
-// preload a column and load column 0 of cholesky
-#pragma unroll
-        for (int i = 0; i < NSAMPLES; i++) {
-          reg_b[i] = __ldg(&pulseMatrixView.coeffRef(i, icol));
-          reg_L[i] = matrixL(i, 0);
-        }
-
-        // compute x0 and store it
-        auto x_prev = reg_b[0] / reg_L[0];
-        A(0, icol) = x_prev;
-
-// iterate
-#pragma unroll
-        for (int iL = 1; iL < NSAMPLES; iL++) {
-// update accum
-#pragma unroll
-          for (int counter = iL; counter < NSAMPLES; counter++)
-            reg_b[counter] -= x_prev * reg_L[counter];
-
-// load the next column of cholesky
-#pragma unroll
-          for (int counter = iL; counter < NSAMPLES; counter++)
-            reg_L[counter] = matrixL(counter, iL);
-
-          // compute the next x for M(iL, icol)
-          x_prev = reg_b[iL] / reg_L[iL];
-
-          // store the result value
-          A(iL, icol) = x_prev;
-        }
-      }
-    }
-
-    template <typename MatrixType1, typename MatrixType2>
-    __device__ void solve_forward_subst_vector(float reg_b[MatrixType1::RowsAtCompileTime],
-                                               MatrixType1 inputAmplitudesView,
-                                               MatrixType2 matrixL) {
-      constexpr auto NSAMPLES = MatrixType1::RowsAtCompileTime;
-
-      float reg_b_tmp[NSAMPLES];
-      float reg_L[NSAMPLES];
-
-// preload a column and load column 0 of cholesky
-#pragma unroll
-      for (int i = 0; i < NSAMPLES; i++) {
-        reg_b_tmp[i] = inputAmplitudesView(i);
-        reg_L[i] = matrixL(i, 0);
-      }
-
-      // compute x0 and store it
-      auto x_prev = reg_b_tmp[0] / reg_L[0];
-      reg_b[0] = x_prev;
-
-// iterate
-#pragma unroll
-      for (int iL = 1; iL < NSAMPLES; iL++) {
-// update accum
-#pragma unroll
-        for (int counter = iL; counter < NSAMPLES; counter++)
-          reg_b_tmp[counter] -= x_prev * reg_L[counter];
-
-// load the next column of cholesky
-#pragma unroll
-        for (int counter = iL; counter < NSAMPLES; counter++)
-          reg_L[counter] = matrixL(counter, iL);
-
-        // compute the next x for M(iL, icol)
-        x_prev = reg_b_tmp[iL] / reg_L[iL];
-
-        // store the result value
-        reg_b[iL] = x_prev;
-      }
-    }
-
     // TODO: add active bxs
     template <typename MatrixType, typename VectorType>
     __device__ void fnnls(MatrixType const& AtA,
                           VectorType const& Atb,
                           VectorType& solution,
                           int& npassive,
-                          ColumnVector<VectorType::RowsAtCompileTime, int>& pulseOffsets,
-                          MapSymM<float, VectorType::RowsAtCompileTime>& matrixL,
+                          calo::multifit::ColumnVector<VectorType::RowsAtCompileTime, int>& pulseOffsets,
+                          calo::multifit::MapSymM<float, VectorType::RowsAtCompileTime>& matrixL,
                           double const eps,
                           int const maxIterations) {
       // constants