From c15e4536e8880936e968183a9882c08882fa5f84 Mon Sep 17 00:00:00 2001 From: mariadalfonso Date: Thu, 30 Jul 2020 21:21:01 +0200 Subject: [PATCH] Refactor common ECAL and HCAL code (cms-patatrack#523) Move duplicated Eigen code to a common file, and use it for both ECAL and HCAL. Move HCAL general reconstruction code from the hcal::multifit to the hcal::reconstruction namespace. --- .../plugins/AmplitudeComputationKernels.cu | 37 +-- .../EcalRecProducers/plugins/KernelHelpers.h | 272 +----------------- 2 files changed, 23 insertions(+), 286 deletions(-) diff --git a/RecoLocalCalo/EcalRecProducers/plugins/AmplitudeComputationKernels.cu b/RecoLocalCalo/EcalRecProducers/plugins/AmplitudeComputationKernels.cu index 293d77720dd2b..8c4f1e1e7a2fc 100644 --- a/RecoLocalCalo/EcalRecProducers/plugins/AmplitudeComputationKernels.cu +++ b/RecoLocalCalo/EcalRecProducers/plugins/AmplitudeComputationKernels.cu @@ -10,9 +10,9 @@ #include "DataFormats/Math/interface/approx_exp.h" #include "DataFormats/Math/interface/approx_log.h" -#include "KernelHelpers.h" -#include "AmplitudeComputationKernels.h" #include "AmplitudeComputationCommonKernels.h" +#include "AmplitudeComputationKernels.h" +#include "KernelHelpers.h" namespace ecal { namespace multifit { @@ -143,9 +143,9 @@ namespace ecal { extern __shared__ char shrmem[]; DataType* shrMatrixLForFnnlsStorage = - reinterpret_cast(shrmem) + MapSymM::total * threadIdx.x; - DataType* shrAtAStorage = - reinterpret_cast(shrmem) + MapSymM::total * (threadIdx.x + blockDim.x); + reinterpret_cast(shrmem) + calo::multifit::MapSymM::total * threadIdx.x; + DataType* shrAtAStorage = reinterpret_cast(shrmem) + + calo::multifit::MapSymM::total * (threadIdx.x + blockDim.x); // FIXME: remove eitehr idx or ch -> they are teh same thing int idx = threadIdx.x + blockDim.x * blockIdx.x; @@ -174,12 +174,12 @@ namespace ecal { int iter = 0; int npassive = 0; - ColumnVector pulseOffsets; + calo::multifit::ColumnVector pulseOffsets; #pragma unroll for (int i = 0; i < NPULSES; ++i) pulseOffsets(i) = i; - ColumnVector resultAmplitudes; + calo::multifit::ColumnVector resultAmplitudes; #pragma unroll for (int counter = 0; counter < NPULSES; counter++) resultAmplitudes(counter) = 0; @@ -197,7 +197,7 @@ namespace ecal { //inverse_cov = noisecov[idx]; //DataType covMatrixStorage[MapSymM::total]; DataType* covMatrixStorage = shrMatrixLForFnnlsStorage; - MapSymM covMatrix{covMatrixStorage}; + calo::multifit::MapSymM covMatrix{covMatrixStorage}; int counter = 0; #pragma unroll for (int col = 0; col < NSAMPLES; col++) @@ -210,21 +210,21 @@ namespace ecal { // compute actual covariance decomposition //covariance_decomposition.compute(inverse_cov); //auto const& matrixL = covariance_decomposition.matrixL(); - DataType matrixLStorage[MapSymM::total]; - MapSymM matrixL{matrixLStorage}; - compute_decomposition_unrolled(matrixL, covMatrix); + DataType matrixLStorage[calo::multifit::MapSymM::total]; + calo::multifit::MapSymM matrixL{matrixLStorage}; + calo::multifit::compute_decomposition_unrolled(matrixL, covMatrix); // L * A = P - ColMajorMatrix A; - solve_forward_subst_matrix(A, pulse_matrix[idx], matrixL); + calo::multifit::ColMajorMatrix A; + calo::multifit::solve_forward_subst_matrix(A, pulse_matrix[idx], matrixL); // L b = s float reg_b[NSAMPLES]; - solve_forward_subst_vector(reg_b, samples[idx], matrixL); + calo::multifit::solve_forward_subst_vector(reg_b, samples[idx], matrixL); // FIXME: shared mem //DataType AtAStorage[MapSymM::total]; - MapSymM AtA{shrAtAStorage}; + calo::multifit::MapSymM AtA{shrAtAStorage}; //SampleMatrix AtA; SampleVector Atb; #pragma unroll @@ -277,7 +277,7 @@ namespace ecal { // FIXME: shared mem //DataType matrixLForFnnlsStorage[MapSymM::total]; - MapSymM matrixLForFnnls{shrMatrixLForFnnlsStorage}; + calo::multifit::MapSymM matrixLForFnnls{shrMatrixLForFnnlsStorage}; fnnls(AtA, Atb, @@ -384,8 +384,9 @@ namespace ecal { unsigned int blocks_min = threads_min > totalChannels ? 1 : (totalChannels + threads_min - 1) / threads_min; uint32_t const offsetForHashes = conditions.offsetForHashes; uint32_t const offsetForInputs = eventInputGPU.ebDigis.size; - auto const nbytesShared = - 2 * threads_min * MapSymM::total * sizeof(DataType); + auto const nbytesShared = 2 * threads_min * + calo::multifit::MapSymM::total * + sizeof(DataType); kernel_minimize<<>>( eventInputGPU.ebDigis.ids.get(), eventInputGPU.eeDigis.ids.get(), diff --git a/RecoLocalCalo/EcalRecProducers/plugins/KernelHelpers.h b/RecoLocalCalo/EcalRecProducers/plugins/KernelHelpers.h index 103087c3517f9..1dc7a0de0f7f3 100644 --- a/RecoLocalCalo/EcalRecProducers/plugins/KernelHelpers.h +++ b/RecoLocalCalo/EcalRecProducers/plugins/KernelHelpers.h @@ -1,6 +1,8 @@ #ifndef RecoLocalCalo_EcalRecProducers_plugins_KernelHelpers_h #define RecoLocalCalo_EcalRecProducers_plugins_KernelHelpers_h +#include "DataFormats/Math/interface/EigenComputations.h" + #include #include #include @@ -10,280 +12,14 @@ namespace ecal { namespace multifit { - template - using ColMajorMatrix = Eigen::Matrix; - - template - using RowMajorMatrix = Eigen::Matrix; - - template - using ColumnVector = Eigen::Matrix; - - template - using RowVector = Eigen::Matrix; - - // FIXME: provide specialization for Row Major layout - template - struct MapSymM { - using type = T; - using base_type = typename std::remove_const::type; - - static constexpr int total = Stride * (Stride + 1) / 2; - static constexpr int stride = Stride; - T* data; - - __forceinline__ __device__ MapSymM(T* data) : data{data} {} - - __forceinline__ __device__ T const& operator()(int const row, int const col) const { - auto const tmp = (Stride - col) * (Stride - col + 1) / 2; - auto const index = total - tmp + row - col; - return data[index]; - } - - template - __forceinline__ __device__ typename std::enable_if::value, base_type>::type& - operator()(int const row, int const col) { - auto const tmp = (Stride - col) * (Stride - col + 1) / 2; - auto const index = total - tmp + row - col; - return data[index]; - } - }; - - // FIXME: either use/modify/improve eigen or make this more generic - // this is a map for a pulse matrix to building a 2d matrix for each channel - // and hide indexing - template - struct MapMForPM { - using type = T; - using base_type = typename std::remove_cv::type; - - type* data; - __forceinline__ __device__ MapMForPM(type* data) : data{data} {} - - __forceinline__ __device__ base_type operator()(int const row, int const col) const { - auto const index = 2 - col + row; - return index >= 0 ? data[index] : 0; - } - }; - - // simple/trivial cholesky decomposition impl - template - __forceinline__ __device__ void compute_decomposition_unrolled(MatrixType1& L, MatrixType2 const& M) { - auto const sqrtm_0_0 = std::sqrt(M(0, 0)); - L(0, 0) = sqrtm_0_0; - using T = typename MatrixType1::base_type; - -#pragma unroll - for (int i = 1; i < MatrixType1::stride; i++) { - T sumsq{0}; - for (int j = 0; j < i; j++) { - T sumsq2{0}; - auto const m_i_j = M(i, j); - for (int k = 0; k < j; ++k) - sumsq2 += L(i, k) * L(j, k); - - auto const value_i_j = (m_i_j - sumsq2) / L(j, j); - L(i, j) = value_i_j; - - sumsq += value_i_j * value_i_j; - } - - auto const l_i_i = std::sqrt(M(i, i) - sumsq); - L(i, i) = l_i_i; - } - } - - template - __forceinline__ __device__ void compute_decomposition(MatrixType1& L, MatrixType2 const& M, int const N) { - auto const sqrtm_0_0 = std::sqrt(M(0, 0)); - L(0, 0) = sqrtm_0_0; - using T = typename MatrixType1::base_type; - - for (int i = 1; i < N; i++) { - T sumsq{0}; - for (int j = 0; j < i; j++) { - T sumsq2{0}; - auto const m_i_j = M(i, j); - for (int k = 0; k < j; ++k) - sumsq2 += L(i, k) * L(j, k); - - auto const value_i_j = (m_i_j - sumsq2) / L(j, j); - L(i, j) = value_i_j; - - sumsq += value_i_j * value_i_j; - } - - auto const l_i_i = std::sqrt(M(i, i) - sumsq); - L(i, i) = l_i_i; - } - } - - template - __forceinline__ __device__ void compute_decomposition_forwardsubst_with_offsets( - MatrixType1& L, - MatrixType2 const& M, - float b[MatrixType1::stride], - VectorType const& Atb, - int const N, - ColumnVector const& pulseOffsets) { - auto const real_0 = pulseOffsets(0); - auto const sqrtm_0_0 = std::sqrt(M(real_0, real_0)); - L(0, 0) = sqrtm_0_0; - using T = typename MatrixType1::base_type; - b[0] = Atb(real_0) / sqrtm_0_0; - - for (int i = 1; i < N; i++) { - auto const i_real = pulseOffsets(i); - T sumsq{0}; - T total = 0; - auto const atb = Atb(i_real); - for (int j = 0; j < i; j++) { - auto const j_real = pulseOffsets(j); - T sumsq2{0}; - auto const m_i_j = M(std::max(i_real, j_real), std::min(i_real, j_real)); - for (int k = 0; k < j; ++k) - sumsq2 += L(i, k) * L(j, k); - - auto const value_i_j = (m_i_j - sumsq2) / L(j, j); - L(i, j) = value_i_j; - - sumsq += value_i_j * value_i_j; - total += value_i_j * b[j]; - } - - auto const l_i_i = std::sqrt(M(i_real, i_real) - sumsq); - L(i, i) = l_i_i; - b[i] = (atb - total) / l_i_i; - } - } - - template - __forceinline__ __device__ void update_decomposition_forwardsubst_with_offsets( - MatrixType1& L, - MatrixType2 const& M, - float b[MatrixType1::stride], - VectorType const& Atb, - int const N, - ColumnVector const& pulseOffsets) { - using T = typename MatrixType1::base_type; - auto const i = N - 1; - auto const i_real = pulseOffsets(i); - T sumsq{0}; - T total = 0; - for (int j = 0; j < i; j++) { - auto const j_real = pulseOffsets(j); - T sumsq2{0}; - auto const m_i_j = M(std::max(i_real, j_real), std::min(i_real, j_real)); - for (int k = 0; k < j; ++k) - sumsq2 += L(i, k) * L(j, k); - - auto const value_i_j = (m_i_j - sumsq2) / L(j, j); - L(i, j) = value_i_j; - sumsq += value_i_j * value_i_j; - - total += value_i_j * b[j]; - } - - auto const l_i_i = std::sqrt(M(i_real, i_real) - sumsq); - L(i, i) = l_i_i; - b[i] = (Atb(i_real) - total) / l_i_i; - } - - template - __device__ void solve_forward_subst_matrix(MatrixType1& A, - MatrixType2 const& pulseMatrixView, - MatrixType3 const& matrixL) { - // FIXME: this assumes pulses are on columns and samples on rows - constexpr auto NPULSES = MatrixType2::ColsAtCompileTime; - constexpr auto NSAMPLES = MatrixType2::RowsAtCompileTime; - -#pragma unroll - for (int icol = 0; icol < NPULSES; icol++) { - float reg_b[NSAMPLES]; - float reg_L[NSAMPLES]; - -// preload a column and load column 0 of cholesky -#pragma unroll - for (int i = 0; i < NSAMPLES; i++) { - reg_b[i] = __ldg(&pulseMatrixView.coeffRef(i, icol)); - reg_L[i] = matrixL(i, 0); - } - - // compute x0 and store it - auto x_prev = reg_b[0] / reg_L[0]; - A(0, icol) = x_prev; - -// iterate -#pragma unroll - for (int iL = 1; iL < NSAMPLES; iL++) { -// update accum -#pragma unroll - for (int counter = iL; counter < NSAMPLES; counter++) - reg_b[counter] -= x_prev * reg_L[counter]; - -// load the next column of cholesky -#pragma unroll - for (int counter = iL; counter < NSAMPLES; counter++) - reg_L[counter] = matrixL(counter, iL); - - // compute the next x for M(iL, icol) - x_prev = reg_b[iL] / reg_L[iL]; - - // store the result value - A(iL, icol) = x_prev; - } - } - } - - template - __device__ void solve_forward_subst_vector(float reg_b[MatrixType1::RowsAtCompileTime], - MatrixType1 inputAmplitudesView, - MatrixType2 matrixL) { - constexpr auto NSAMPLES = MatrixType1::RowsAtCompileTime; - - float reg_b_tmp[NSAMPLES]; - float reg_L[NSAMPLES]; - -// preload a column and load column 0 of cholesky -#pragma unroll - for (int i = 0; i < NSAMPLES; i++) { - reg_b_tmp[i] = inputAmplitudesView(i); - reg_L[i] = matrixL(i, 0); - } - - // compute x0 and store it - auto x_prev = reg_b_tmp[0] / reg_L[0]; - reg_b[0] = x_prev; - -// iterate -#pragma unroll - for (int iL = 1; iL < NSAMPLES; iL++) { -// update accum -#pragma unroll - for (int counter = iL; counter < NSAMPLES; counter++) - reg_b_tmp[counter] -= x_prev * reg_L[counter]; - -// load the next column of cholesky -#pragma unroll - for (int counter = iL; counter < NSAMPLES; counter++) - reg_L[counter] = matrixL(counter, iL); - - // compute the next x for M(iL, icol) - x_prev = reg_b_tmp[iL] / reg_L[iL]; - - // store the result value - reg_b[iL] = x_prev; - } - } - // TODO: add active bxs template __device__ void fnnls(MatrixType const& AtA, VectorType const& Atb, VectorType& solution, int& npassive, - ColumnVector& pulseOffsets, - MapSymM& matrixL, + calo::multifit::ColumnVector& pulseOffsets, + calo::multifit::MapSymM& matrixL, double const eps, int const maxIterations) { // constants