From 6a502124c9b2dc45800d1bbeaf91ae0962f5e121 Mon Sep 17 00:00:00 2001
From: Andrea Bocci
Date: Mon, 19 Oct 2020 19:56:20 +0200
Subject: [PATCH] Use Eigen and CUDA preprocessor symbols

Use Eigen preprocessor symbols instead of explicit CUDA keywords.
Use CUDA preprocessor symbols to protect CUDA-only functions.
---
 .../interface/MultifitComputations.h | 29 +++++++++++--------
 1 file changed, 17 insertions(+), 12 deletions(-)

diff --git a/DataFormats/CaloRecHit/interface/MultifitComputations.h b/DataFormats/CaloRecHit/interface/MultifitComputations.h
index d448d65a332e3..35234473b5faa 100644
--- a/DataFormats/CaloRecHit/interface/MultifitComputations.h
+++ b/DataFormats/CaloRecHit/interface/MultifitComputations.h
@@ -32,16 +32,16 @@ namespace calo {
       static constexpr int stride = Stride;
       T* data;
 
-      __forceinline__ __device__ MapSymM(T* data) : data{data} {}
+      EIGEN_ALWAYS_INLINE EIGEN_DEVICE_FUNC MapSymM(T* data) : data{data} {}
 
-      __forceinline__ __device__ T const& operator()(int const row, int const col) const {
+      EIGEN_ALWAYS_INLINE EIGEN_DEVICE_FUNC T const& operator()(int const row, int const col) const {
         auto const tmp = (Stride - col) * (Stride - col + 1) / 2;
         auto const index = total - tmp + row - col;
         return data[index];
       }
 
       template <typename U = T>
-      __forceinline__ __device__ typename std::enable_if<std::is_same<base_type, U>::value, base_type>::type&
+      EIGEN_ALWAYS_INLINE EIGEN_DEVICE_FUNC typename std::enable_if<std::is_same<base_type, U>::value, base_type>::type&
       operator()(int const row, int const col) {
         auto const tmp = (Stride - col) * (Stride - col + 1) / 2;
         auto const index = total - tmp + row - col;
@@ -58,9 +58,9 @@ namespace calo {
       using base_type = typename std::remove_cv<type>::type;
       type* data;
 
-      __forceinline__ __device__ MapMForPM(type* data) : data{data} {}
+      EIGEN_ALWAYS_INLINE EIGEN_DEVICE_FUNC MapMForPM(type* data) : data{data} {}
 
-      __forceinline__ __device__ base_type operator()(int const row, int const col) const {
+      EIGEN_ALWAYS_INLINE EIGEN_DEVICE_FUNC base_type operator()(int const row, int const col) const {
         auto const index = 2 - col + row;
         return index >= 0 ? data[index] : 0;
       }
@@ -68,7 +68,7 @@ namespace calo {
 
     // simple/trivial cholesky decomposition impl
     template <typename MatrixType1, typename MatrixType2>
-    __forceinline__ __device__ void compute_decomposition_unrolled(MatrixType1& L, MatrixType2 const& M) {
+    EIGEN_ALWAYS_INLINE EIGEN_DEVICE_FUNC void compute_decomposition_unrolled(MatrixType1& L, MatrixType2 const& M) {
       auto const sqrtm_0_0 = std::sqrt(M(0, 0));
       L(0, 0) = sqrtm_0_0;
       using T = typename MatrixType1::base_type;
@@ -94,7 +94,7 @@ namespace calo {
     }
 
     template <typename MatrixType1, typename MatrixType2>
-    __forceinline__ __device__ void compute_decomposition(MatrixType1& L, MatrixType2 const& M, int const N) {
+    EIGEN_ALWAYS_INLINE EIGEN_DEVICE_FUNC void compute_decomposition(MatrixType1& L, MatrixType2 const& M, int const N) {
      auto const sqrtm_0_0 = std::sqrt(M(0, 0));
      L(0, 0) = sqrtm_0_0;
      using T = typename MatrixType1::base_type;
@@ -119,7 +119,7 @@ namespace calo {
     }
 
     template <typename MatrixType1, typename MatrixType2, typename VectorType>
-    __forceinline__ __device__ void compute_decomposition_forwardsubst_with_offsets(
+    EIGEN_ALWAYS_INLINE EIGEN_DEVICE_FUNC void compute_decomposition_forwardsubst_with_offsets(
        MatrixType1& L,
        MatrixType2 const& M,
        float b[MatrixType1::stride],
@@ -158,7 +158,7 @@ namespace calo {
     }
 
     template <typename MatrixType1, typename MatrixType2, typename VectorType>
-    __forceinline__ __device__ void update_decomposition_forwardsubst_with_offsets(
+    EIGEN_ALWAYS_INLINE EIGEN_DEVICE_FUNC void update_decomposition_forwardsubst_with_offsets(
        MatrixType1& L,
        MatrixType2 const& M,
        float b[MatrixType1::stride],
@@ -190,7 +190,7 @@ namespace calo {
     }
 
     template <typename MatrixType1, typename MatrixType2, typename MatrixType3>
-    __device__ void solve_forward_subst_matrix(MatrixType1& A,
+    EIGEN_DEVICE_FUNC void solve_forward_subst_matrix(MatrixType1& A,
                                                MatrixType2 const& pulseMatrixView,
                                                MatrixType3 const& matrixL) {
       // FIXME: this assumes pulses are on columns and samples on rows
@@ -205,7 +205,12 @@ namespace calo {
         // preload a column and load column 0 of cholesky
 #pragma unroll
         for (int i = 0; i < NSAMPLES; i++) {
+#ifdef __CUDA_ARCH__
+          // load through the read-only cache
           reg_b[i] = __ldg(&pulseMatrixView.coeffRef(i, icol));
+#else
+          reg_b[i] = pulseMatrixView.coeffRef(i, icol);
+#endif  // __CUDA_ARCH__
           reg_L[i] = matrixL(i, 0);
         }
 
@@ -236,7 +241,7 @@ namespace calo {
     }
 
     template <typename MatrixType1, typename MatrixType2>
-    __device__ void solve_forward_subst_vector(float reg_b[MatrixType1::RowsAtCompileTime],
+    EIGEN_DEVICE_FUNC void solve_forward_subst_vector(float reg_b[MatrixType1::RowsAtCompileTime],
                                                MatrixType1 inputAmplitudesView,
                                                MatrixType2 matrixL) {
       constexpr auto NSAMPLES = MatrixType1::RowsAtCompileTime;
@@ -278,7 +283,7 @@ namespace calo {
 
     // TODO: add active bxs
     template <typename MatrixType, typename VectorType>
-    __device__ void fnnls(MatrixType const& AtA,
+    EIGEN_DEVICE_FUNC void fnnls(MatrixType const& AtA,
                           VectorType const& Atb,
                           VectorType& solution,
                           int& npassive,
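
Note (illustration only, not part of the patch): a minimal standalone sketch of the two patterns the patch applies. EIGEN_DEVICE_FUNC expands to "__host__ __device__" when the translation unit is compiled by a CUDA compiler and to nothing in a host-only build; EIGEN_ALWAYS_INLINE is Eigen's compiler-portable always-inline hint; and __CUDA_ARCH__ is defined only during the device compilation pass, so it can guard device-only intrinsics such as __ldg(). The function name read_coeff below is made up for this sketch and does not appear in the patched file.

#include <Eigen/Core>  // provides EIGEN_DEVICE_FUNC and EIGEN_ALWAYS_INLINE

// Callable from both host and device code when built with nvcc,
// and still compiles as an ordinary inline function on the host.
EIGEN_ALWAYS_INLINE EIGEN_DEVICE_FUNC float read_coeff(float const* buffer, int index) {
#ifdef __CUDA_ARCH__
  // device compilation pass: load through the read-only data cache
  return __ldg(buffer + index);
#else
  // host compilation pass: __ldg() is not available, fall back to a plain load
  return buffer[index];
#endif  // __CUDA_ARCH__
}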