Skip to content

Commit

Permalink
Refactor common ECAL and HCAL code (#523)
Browse files Browse the repository at this point in the history
Move duplicated Eigen code to a common file, and use it for both ECAL and HCAL.
Move HCAL general reconstruction code from the hcal::multifit to the hcal::reconstruction namespace.
  • Loading branch information
mariadalfonso authored and fwyzard committed Oct 7, 2020
1 parent a5a52b0 commit d8b98ac
Show file tree
Hide file tree
Showing 2 changed files with 23 additions and 286 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -10,9 +10,9 @@
#include "DataFormats/Math/interface/approx_exp.h"
#include "DataFormats/Math/interface/approx_log.h"

#include "KernelHelpers.h"
#include "AmplitudeComputationKernels.h"
#include "AmplitudeComputationCommonKernels.h"
#include "AmplitudeComputationKernels.h"
#include "KernelHelpers.h"

namespace ecal {
namespace multifit {
Expand Down Expand Up @@ -143,9 +143,9 @@ namespace ecal {

extern __shared__ char shrmem[];
DataType* shrMatrixLForFnnlsStorage =
reinterpret_cast<DataType*>(shrmem) + MapSymM<DataType, NPULSES>::total * threadIdx.x;
DataType* shrAtAStorage =
reinterpret_cast<DataType*>(shrmem) + MapSymM<DataType, NPULSES>::total * (threadIdx.x + blockDim.x);
reinterpret_cast<DataType*>(shrmem) + calo::multifit::MapSymM<DataType, NPULSES>::total * threadIdx.x;
DataType* shrAtAStorage = reinterpret_cast<DataType*>(shrmem) +
calo::multifit::MapSymM<DataType, NPULSES>::total * (threadIdx.x + blockDim.x);

// FIXME: remove either idx or ch -> they are the same thing
int idx = threadIdx.x + blockDim.x * blockIdx.x;
Expand Down Expand Up @@ -174,12 +174,12 @@ namespace ecal {
int iter = 0;
int npassive = 0;

ColumnVector<NPULSES, int> pulseOffsets;
calo::multifit::ColumnVector<NPULSES, int> pulseOffsets;
#pragma unroll
for (int i = 0; i < NPULSES; ++i)
pulseOffsets(i) = i;

ColumnVector<NPULSES, DataType> resultAmplitudes;
calo::multifit::ColumnVector<NPULSES, DataType> resultAmplitudes;
#pragma unroll
for (int counter = 0; counter < NPULSES; counter++)
resultAmplitudes(counter) = 0;
Expand All @@ -197,7 +197,7 @@ namespace ecal {
//inverse_cov = noisecov[idx];
//DataType covMatrixStorage[MapSymM<DataType, NSAMPLES>::total];
DataType* covMatrixStorage = shrMatrixLForFnnlsStorage;
MapSymM<DataType, NSAMPLES> covMatrix{covMatrixStorage};
calo::multifit::MapSymM<DataType, NSAMPLES> covMatrix{covMatrixStorage};
int counter = 0;
#pragma unroll
for (int col = 0; col < NSAMPLES; col++)
Expand All @@ -210,21 +210,21 @@ namespace ecal {
// compute actual covariance decomposition
//covariance_decomposition.compute(inverse_cov);
//auto const& matrixL = covariance_decomposition.matrixL();
DataType matrixLStorage[MapSymM<DataType, NSAMPLES>::total];
MapSymM<DataType, NSAMPLES> matrixL{matrixLStorage};
compute_decomposition_unrolled(matrixL, covMatrix);
DataType matrixLStorage[calo::multifit::MapSymM<DataType, NSAMPLES>::total];
calo::multifit::MapSymM<DataType, NSAMPLES> matrixL{matrixLStorage};
calo::multifit::compute_decomposition_unrolled(matrixL, covMatrix);

// L * A = P
ColMajorMatrix<NSAMPLES, NPULSES> A;
solve_forward_subst_matrix(A, pulse_matrix[idx], matrixL);
calo::multifit::ColMajorMatrix<NSAMPLES, NPULSES> A;
calo::multifit::solve_forward_subst_matrix(A, pulse_matrix[idx], matrixL);

// L b = s
float reg_b[NSAMPLES];
solve_forward_subst_vector(reg_b, samples[idx], matrixL);
calo::multifit::solve_forward_subst_vector(reg_b, samples[idx], matrixL);

// FIXME: shared mem
//DataType AtAStorage[MapSymM<DataType, NPULSES>::total];
MapSymM<DataType, NPULSES> AtA{shrAtAStorage};
calo::multifit::MapSymM<DataType, NPULSES> AtA{shrAtAStorage};
//SampleMatrix AtA;
SampleVector Atb;
#pragma unroll
Expand Down Expand Up @@ -277,7 +277,7 @@ namespace ecal {

// FIXME: shared mem
//DataType matrixLForFnnlsStorage[MapSymM<DataType, NPULSES>::total];
MapSymM<DataType, NPULSES> matrixLForFnnls{shrMatrixLForFnnlsStorage};
calo::multifit::MapSymM<DataType, NPULSES> matrixLForFnnls{shrMatrixLForFnnlsStorage};

fnnls(AtA,
Atb,
Expand Down Expand Up @@ -384,8 +384,9 @@ namespace ecal {
unsigned int blocks_min = threads_min > totalChannels ? 1 : (totalChannels + threads_min - 1) / threads_min;
uint32_t const offsetForHashes = conditions.offsetForHashes;
uint32_t const offsetForInputs = eventInputGPU.ebDigis.size;
auto const nbytesShared =
2 * threads_min * MapSymM<DataType, SampleVector::RowsAtCompileTime>::total * sizeof(DataType);
auto const nbytesShared = 2 * threads_min *
calo::multifit::MapSymM<DataType, SampleVector::RowsAtCompileTime>::total *
sizeof(DataType);
kernel_minimize<<<blocks_min, threads_min, nbytesShared, cudaStream>>>(
eventInputGPU.ebDigis.ids.get(),
eventInputGPU.eeDigis.ids.get(),
Expand Down
272 changes: 4 additions & 268 deletions RecoLocalCalo/EcalRecProducers/plugins/KernelHelpers.h
Original file line number Diff line number Diff line change
@@ -1,6 +1,8 @@
#ifndef RecoLocalCalo_EcalRecProducers_plugins_KernelHelpers_h
#define RecoLocalCalo_EcalRecProducers_plugins_KernelHelpers_h

#include "DataFormats/Math/interface/EigenComputations.h"

#include <cmath>
#include <limits>
#include <type_traits>
Expand All @@ -10,280 +12,14 @@
namespace ecal {
namespace multifit {

// Fixed-size dense float matrix with column-major storage.
template <int Rows, int Cols>
using ColMajorMatrix = Eigen::Matrix<float, Rows, Cols, Eigen::ColMajor>;

// Fixed-size dense float matrix with row-major storage.
template <int Rows, int Cols>
using RowMajorMatrix = Eigen::Matrix<float, Rows, Cols, Eigen::RowMajor>;

// Fixed-size column vector (Length x 1); the element type defaults to float.
template <int Length, typename Scalar = float>
using ColumnVector = Eigen::Matrix<Scalar, Length, 1>;

// Fixed-size row vector (1 x Length); the element type defaults to float.
template <int Length, typename Scalar = float>
using RowVector = Eigen::Matrix<Scalar, 1, Length>;

// Non-owning view over a symmetric Stride x Stride matrix kept in packed form:
// only Stride * (Stride + 1) / 2 values are stored, one lower-triangle column
// after another (column 0 fills the first Stride slots of the buffer).
// The index formula is only meaningful for row >= col; no index swap is
// performed here, so callers have to order the indices themselves.
// FIXME: provide specialization for Row Major layout (Order is not used yet)
template <typename T, int Stride, int Order = Eigen::ColMajor>
struct MapSymM {
  using type = T;
  using base_type = typename std::remove_const<type>::type;

  // number of elements held in the packed buffer
  static constexpr int total = Stride * (Stride + 1) / 2;
  // dimension of the square matrix
  static constexpr int stride = Stride;
  // buffer supplied by the caller; this view never allocates or frees it
  T* data;

  __forceinline__ __device__ MapSymM(T* storage) : data{storage} {}

  // read-only access to element (row, col)
  __forceinline__ __device__ T const& operator()(int const row, int const col) const {
    return data[packedIndex(row, col)];
  }

  // writable access to element (row, col); only enabled when T is not const
  template <typename U = T>
  __forceinline__ __device__ typename std::enable_if<std::is_same<base_type, U>::value, base_type>::type&
  operator()(int const row, int const col) {
    return data[packedIndex(row, col)];
  }

private:
  // Offset of (row, col) inside the packed buffer: columns col..Stride-1 hold
  // `tail` elements in total, so column col starts at total - tail, and the
  // diagonal element is the first entry of its column.
  __forceinline__ __device__ static int packedIndex(int const row, int const col) {
    auto const tail = (Stride - col) * (Stride - col + 1) / 2;
    return total - tail + row - col;
  }
};

// FIXME: either use/modify/improve eigen or make this more generic
// View over a per-channel pulse buffer that presents it as a 2d matrix and
// hides the indexing: element (row, col) is read from data[2 - col + row],
// and any position whose shifted index is negative reads as zero.
template <typename T>
struct MapMForPM {
  using type = T;
  using base_type = typename std::remove_cv<type>::type;

  // buffer supplied by the caller; not owned by this view
  type* data;
  __forceinline__ __device__ MapMForPM(type* storage) : data{storage} {}

  // returns the element by value, so the view is read-only
  __forceinline__ __device__ base_type operator()(int const row, int const col) const {
    auto const shifted = 2 - col + row;
    if (shifted < 0)
      return 0;
    return data[shifted];
  }
};

// Straightforward Cholesky factorisation: fills the lower-triangular factor L
// such that L * L^T reproduces M, visiting the rows of L in order.
// The dimension is taken from MatrixType1::stride, so the row loop has a
// compile-time trip count and is marked for unrolling.
// Only elements (row, col) with row >= col of M and L are accessed.
// No check is made that the argument of the square root is non-negative.
template <typename MatrixType1, typename MatrixType2>
__forceinline__ __device__ void compute_decomposition_unrolled(MatrixType1& L, MatrixType2 const& M) {
  using T = typename MatrixType1::base_type;

  // first diagonal element
  auto const diag0 = std::sqrt(M(0, 0));
  L(0, 0) = diag0;

#pragma unroll
  for (int row = 1; row < MatrixType1::stride; row++) {
    // running sum of the squared off-diagonal entries of this row
    T rowSquares{0};
    for (int col = 0; col < row; col++) {
      // dot product of the already computed parts of rows `row` and `col`
      T dot{0};
      auto const source = M(row, col);
      for (int inner = 0; inner < col; ++inner)
        dot += L(row, inner) * L(col, inner);

      auto const entry = (source - dot) / L(col, col);
      L(row, col) = entry;

      rowSquares += entry * entry;
    }

    // diagonal element of this row
    auto const diag = std::sqrt(M(row, row) - rowSquares);
    L(row, row) = diag;
  }
}

// Cholesky factorisation of the leading N x N part of M: fills the
// lower-triangular factor L such that L * L^T reproduces that part of M.
// Same algorithm as the unrolled variant, but the dimension N is a runtime
// value. Only elements (row, col) with row >= col are accessed.
// No check is made that the argument of the square root is non-negative.
template <typename MatrixType1, typename MatrixType2>
__forceinline__ __device__ void compute_decomposition(MatrixType1& L, MatrixType2 const& M, int const N) {
  using T = typename MatrixType1::base_type;

  // first diagonal element
  auto const diag0 = std::sqrt(M(0, 0));
  L(0, 0) = diag0;

  for (int row = 1; row < N; row++) {
    // running sum of the squared off-diagonal entries of this row
    T rowSquares{0};
    for (int col = 0; col < row; col++) {
      // dot product of the already computed parts of rows `row` and `col`
      T dot{0};
      auto const source = M(row, col);
      for (int inner = 0; inner < col; ++inner)
        dot += L(row, inner) * L(col, inner);

      auto const entry = (source - dot) / L(col, col);
      L(row, col) = entry;

      rowSquares += entry * entry;
    }

    // diagonal element of this row
    auto const diag = std::sqrt(M(row, row) - rowSquares);
    L(row, row) = diag;
  }
}

// Cholesky factorisation fused with a forward substitution, restricted to the
// first N entries selected through pulseOffsets.
//
// Position i of the reduced problem corresponds to row/column pulseOffsets(i)
// of M and to element pulseOffsets(i) of Atb. On return the leading N x N part
// of L holds the lower-triangular factor of the selected sub-matrix and
// b[0..N-1] holds the solution of L * b = (selected entries of Atb).
//
// M is always read as (larger index, smaller index), i.e. from its lower
// triangle. No check is made that the square-root arguments are non-negative.
template <typename MatrixType1, typename MatrixType2, typename VectorType>
__forceinline__ __device__ void compute_decomposition_forwardsubst_with_offsets(
    MatrixType1& L,
    MatrixType2 const& M,
    float b[MatrixType1::stride],
    VectorType const& Atb,
    int const N,
    ColumnVector<MatrixType1::stride, int> const& pulseOffsets) {
  using T = typename MatrixType1::base_type;

  // first row: a single diagonal element and the first solution entry
  auto const first = pulseOffsets(0);
  auto const diag0 = std::sqrt(M(first, first));
  L(0, 0) = diag0;
  b[0] = Atb(first) / diag0;

  for (int row = 1; row < N; row++) {
    auto const rowReal = pulseOffsets(row);
    // sum of the squared off-diagonal entries of this row
    T rowSquares{0};
    // sum of L(row, col) * b[col], needed for the forward substitution
    T substSum = 0;
    auto const rhs = Atb(rowReal);
    for (int col = 0; col < row; col++) {
      auto const colReal = pulseOffsets(col);
      // dot product of the already computed parts of rows `row` and `col`
      T dot{0};
      auto const source = M(std::max(rowReal, colReal), std::min(rowReal, colReal));
      for (int inner = 0; inner < col; ++inner)
        dot += L(row, inner) * L(col, inner);

      auto const entry = (source - dot) / L(col, col);
      L(row, col) = entry;

      rowSquares += entry * entry;
      substSum += entry * b[col];
    }

    // diagonal element and solution entry of this row
    auto const diag = std::sqrt(M(rowReal, rowReal) - rowSquares);
    L(row, row) = diag;
    b[row] = (rhs - substSum) / diag;
  }
}

// Incremental version of the fused factorisation / forward substitution:
// computes only the last row (index N - 1) of L and the matching entry of b,
// reading rows 0..N-2 of L and b[0..N-2] as already filled in.
//
// Position i of the reduced problem corresponds to row/column pulseOffsets(i)
// of M and to element pulseOffsets(i) of Atb. M is always read as
// (larger index, smaller index), i.e. from its lower triangle.
// No check is made that the square-root argument is non-negative.
template <typename MatrixType1, typename MatrixType2, typename VectorType>
__forceinline__ __device__ void update_decomposition_forwardsubst_with_offsets(
    MatrixType1& L,
    MatrixType2 const& M,
    float b[MatrixType1::stride],
    VectorType const& Atb,
    int const N,
    ColumnVector<MatrixType1::stride, int> const& pulseOffsets) {
  using T = typename MatrixType1::base_type;

  // the only row that gets (re)computed
  auto const row = N - 1;
  auto const rowReal = pulseOffsets(row);
  // sum of the squared off-diagonal entries of this row
  T rowSquares{0};
  // sum of L(row, col) * b[col], needed for the forward substitution
  T substSum = 0;
  for (int col = 0; col < row; col++) {
    auto const colReal = pulseOffsets(col);
    // dot product of the already computed parts of rows `row` and `col`
    T dot{0};
    auto const source = M(std::max(rowReal, colReal), std::min(rowReal, colReal));
    for (int inner = 0; inner < col; ++inner)
      dot += L(row, inner) * L(col, inner);

    auto const entry = (source - dot) / L(col, col);
    L(row, col) = entry;
    rowSquares += entry * entry;

    substSum += entry * b[col];
  }

  // diagonal element and solution entry of the new row
  auto const diag = std::sqrt(M(rowReal, rowReal) - rowSquares);
  L(row, row) = diag;
  b[row] = (Atb(rowReal) - substSum) / diag;
}

// Solves L * A = P column by column with forward substitution, where L is the
// lower-triangular matrixL and P is pulseMatrixView; the solution is written
// to A. The dimensions are taken from MatrixType2's compile-time sizes.
// Elements of the pulse matrix are loaded through __ldg.
template <typename MatrixType1, typename MatrixType2, typename MatrixType3>
__device__ void solve_forward_subst_matrix(MatrixType1& A,
                                           MatrixType2 const& pulseMatrixView,
                                           MatrixType3 const& matrixL) {
  // FIXME: this assumes pulses are on columns and samples on rows
  constexpr auto nPulses = MatrixType2::ColsAtCompileTime;
  constexpr auto nSamples = MatrixType2::RowsAtCompileTime;

#pragma unroll
  for (int pulse = 0; pulse < nPulses; pulse++) {
    // working copy of the right-hand-side column, updated in place
    float rhs[nSamples];
    // current column of the Cholesky factor
    float factorCol[nSamples];

    // fetch the right-hand-side column together with column 0 of the factor
#pragma unroll
    for (int sample = 0; sample < nSamples; sample++) {
      rhs[sample] = __ldg(&pulseMatrixView.coeffRef(sample, pulse));
      factorCol[sample] = matrixL(sample, 0);
    }

    // first unknown of this column
    auto solved = rhs[0] / factorCol[0];
    A(0, pulse) = solved;

    // remaining unknowns
#pragma unroll
    for (int step = 1; step < nSamples; step++) {
      // remove the contribution of the previously solved unknown
#pragma unroll
      for (int sample = step; sample < nSamples; sample++)
        rhs[sample] -= solved * factorCol[sample];

      // fetch the next column of the factor (rows step and below only)
#pragma unroll
      for (int sample = step; sample < nSamples; sample++)
        factorCol[sample] = matrixL(sample, step);

      // solve for A(step, pulse)
      solved = rhs[step] / factorCol[step];

      // store the result
      A(step, pulse) = solved;
    }
  }
}

// Solves L * x = s with forward substitution, where L is the lower-triangular
// matrixL and s is inputAmplitudesView; the solution x is written to reg_b.
// The length is taken from MatrixType1::RowsAtCompileTime.
// Both views are taken by value, as in the declaration.
template <typename MatrixType1, typename MatrixType2>
__device__ void solve_forward_subst_vector(float reg_b[MatrixType1::RowsAtCompileTime],
                                           MatrixType1 inputAmplitudesView,
                                           MatrixType2 matrixL) {
  constexpr auto nSamples = MatrixType1::RowsAtCompileTime;

  // working copy of the right-hand side, updated in place
  float rhs[nSamples];
  // current column of the Cholesky factor
  float factorCol[nSamples];

  // fetch the right-hand side together with column 0 of the factor
#pragma unroll
  for (int sample = 0; sample < nSamples; sample++) {
    rhs[sample] = inputAmplitudesView(sample);
    factorCol[sample] = matrixL(sample, 0);
  }

  // first unknown
  auto solved = rhs[0] / factorCol[0];
  reg_b[0] = solved;

  // remaining unknowns
#pragma unroll
  for (int step = 1; step < nSamples; step++) {
    // remove the contribution of the previously solved unknown
#pragma unroll
    for (int sample = step; sample < nSamples; sample++)
      rhs[sample] -= solved * factorCol[sample];

    // fetch the next column of the factor (rows step and below only)
#pragma unroll
    for (int sample = step; sample < nSamples; sample++)
      factorCol[sample] = matrixL(sample, step);

    // solve for entry `step` of the solution
    solved = rhs[step] / factorCol[step];

    // store the result
    reg_b[step] = solved;
  }
}

// TODO: add active bxs
template <typename MatrixType, typename VectorType>
__device__ void fnnls(MatrixType const& AtA,
VectorType const& Atb,
VectorType& solution,
int& npassive,
ColumnVector<VectorType::RowsAtCompileTime, int>& pulseOffsets,
MapSymM<float, VectorType::RowsAtCompileTime>& matrixL,
calo::multifit::ColumnVector<VectorType::RowsAtCompileTime, int>& pulseOffsets,
calo::multifit::MapSymM<float, VectorType::RowsAtCompileTime>& matrixL,
double const eps,
int const maxIterations) {
// constants
Expand Down

0 comments on commit d8b98ac

Please sign in to comment.