From 6a502124c9b2dc45800d1bbeaf91ae0962f5e121 Mon Sep 17 00:00:00 2001
From: Andrea Bocci
Date: Mon, 19 Oct 2020 19:56:20 +0200
Subject: [PATCH] Use Eigen and CUDA preprocessor symbols

Use Eigen preprocessor symbols instead of explicit CUDA keywords.
Use CUDA preprocessor symbols to protect CUDA-only functions.
---
 .../interface/MultifitComputations.h | 29 +++++++++++--------
 1 file changed, 17 insertions(+), 12 deletions(-)

diff --git a/DataFormats/CaloRecHit/interface/MultifitComputations.h b/DataFormats/CaloRecHit/interface/MultifitComputations.h
index d448d65a332e3..35234473b5faa 100644
--- a/DataFormats/CaloRecHit/interface/MultifitComputations.h
+++ b/DataFormats/CaloRecHit/interface/MultifitComputations.h
@@ -32,16 +32,16 @@ namespace calo {
       static constexpr int stride = Stride;
       T* data;
 
-      __forceinline__ __device__ MapSymM(T* data) : data{data} {}
+      EIGEN_ALWAYS_INLINE EIGEN_DEVICE_FUNC MapSymM(T* data) : data{data} {}
 
-      __forceinline__ __device__ T const& operator()(int const row, int const col) const {
+      EIGEN_ALWAYS_INLINE EIGEN_DEVICE_FUNC T const& operator()(int const row, int const col) const {
         auto const tmp = (Stride - col) * (Stride - col + 1) / 2;
         auto const index = total - tmp + row - col;
         return data[index];
       }
 
       template <typename U = T>
-      __forceinline__ __device__ typename std::enable_if<std::is_same<base_type, U>::value, base_type>::type&
+      EIGEN_ALWAYS_INLINE EIGEN_DEVICE_FUNC typename std::enable_if<std::is_same<base_type, U>::value, base_type>::type&
       operator()(int const row, int const col) {
         auto const tmp = (Stride - col) * (Stride - col + 1) / 2;
         auto const index = total - tmp + row - col;
@@ -58,9 +58,9 @@ namespace calo {
       using base_type = typename std::remove_cv<type>::type;
       type* data;
 
-      __forceinline__ __device__ MapMForPM(type* data) : data{data} {}
+      EIGEN_ALWAYS_INLINE EIGEN_DEVICE_FUNC MapMForPM(type* data) : data{data} {}
 
-      __forceinline__ __device__ base_type operator()(int const row, int const col) const {
+      EIGEN_ALWAYS_INLINE EIGEN_DEVICE_FUNC base_type operator()(int const row, int const col) const {
         auto const index = 2 - col + row;
         return index >= 0 ? data[index] : 0;
       }
@@ -68,7 +68,7 @@ namespace calo {
 
     // simple/trivial cholesky decomposition impl
     template <typename MatrixType1, typename MatrixType2>
-    __forceinline__ __device__ void compute_decomposition_unrolled(MatrixType1& L, MatrixType2 const& M) {
+    EIGEN_ALWAYS_INLINE EIGEN_DEVICE_FUNC void compute_decomposition_unrolled(MatrixType1& L, MatrixType2 const& M) {
       auto const sqrtm_0_0 = std::sqrt(M(0, 0));
       L(0, 0) = sqrtm_0_0;
       using T = typename MatrixType1::base_type;
@@ -94,7 +94,7 @@ namespace calo {
     }
 
     template <typename MatrixType1, typename MatrixType2>
-    __forceinline__ __device__ void compute_decomposition(MatrixType1& L, MatrixType2 const& M, int const N) {
+    EIGEN_ALWAYS_INLINE EIGEN_DEVICE_FUNC void compute_decomposition(MatrixType1& L, MatrixType2 const& M, int const N) {
      auto const sqrtm_0_0 = std::sqrt(M(0, 0));
      L(0, 0) = sqrtm_0_0;
      using T = typename MatrixType1::base_type;
@@ -119,7 +119,7 @@ namespace calo {
     }
 
     template <typename MatrixType1, typename MatrixType2, typename VectorType>
-    __forceinline__ __device__ void compute_decomposition_forwardsubst_with_offsets(
+    EIGEN_ALWAYS_INLINE EIGEN_DEVICE_FUNC void compute_decomposition_forwardsubst_with_offsets(
        MatrixType1& L,
        MatrixType2 const& M,
        float b[MatrixType1::stride],
@@ -158,7 +158,7 @@ namespace calo {
     }
 
     template <typename MatrixType1, typename MatrixType2, typename VectorType>
-    __forceinline__ __device__ void update_decomposition_forwardsubst_with_offsets(
+    EIGEN_ALWAYS_INLINE EIGEN_DEVICE_FUNC void update_decomposition_forwardsubst_with_offsets(
        MatrixType1& L,
        MatrixType2 const& M,
        float b[MatrixType1::stride],
@@ -190,7 +190,7 @@ namespace calo {
     }
 
     template <typename MatrixType1, typename MatrixType2, typename MatrixType3>
-    __device__ void solve_forward_subst_matrix(MatrixType1& A,
+    EIGEN_DEVICE_FUNC void solve_forward_subst_matrix(MatrixType1& A,
                                                MatrixType2 const& pulseMatrixView,
                                                MatrixType3 const& matrixL) {
       // FIXME: this assumes pulses are on columns and samples on rows
@@ -205,7 +205,12 @@ namespace calo {
         // preload a column and load column 0 of cholesky
 #pragma unroll
         for (int i = 0; i < NSAMPLES; i++) {
+#ifdef __CUDA_ARCH__
+          // load through the read-only cache
           reg_b[i] = __ldg(&pulseMatrixView.coeffRef(i, icol));
+#else
+          reg_b[i] = pulseMatrixView.coeffRef(i, icol);
+#endif  // __CUDA_ARCH__
           reg_L[i] = matrixL(i, 0);
         }
 
@@ -236,7 +241,7 @@ namespace calo {
     }
 
     template <typename MatrixType1, typename MatrixType2>
-    __device__ void solve_forward_subst_vector(float reg_b[MatrixType1::RowsAtCompileTime],
+    EIGEN_DEVICE_FUNC void solve_forward_subst_vector(float reg_b[MatrixType1::RowsAtCompileTime],
                                                MatrixType1 inputAmplitudesView,
                                                MatrixType2 matrixL) {
       constexpr auto NSAMPLES = MatrixType1::RowsAtCompileTime;
@@ -278,7 +283,7 @@ namespace calo {
 
     // TODO: add active bxs
     template <typename MatrixType, typename VectorType>
-    __device__ void fnnls(MatrixType const& AtA,
+    EIGEN_DEVICE_FUNC void fnnls(MatrixType const& AtA,
                           VectorType const& Atb,
                           VectorType& solution,
                           int& npassive,
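
Note (illustration only, not part of the patch): a minimal standalone sketch of the two patterns the patch applies. EIGEN_DEVICE_FUNC expands to "__host__ __device__" when the translation unit is compiled by a CUDA compiler and to nothing in a host-only build; EIGEN_ALWAYS_INLINE is Eigen's compiler-portable always-inline hint; and __CUDA_ARCH__ is defined only during the device compilation pass, so it can guard device-only intrinsics such as __ldg(). The function name read_coeff below is made up for this sketch and does not appear in the patched file.

#include <Eigen/Core>  // provides EIGEN_DEVICE_FUNC and EIGEN_ALWAYS_INLINE

// Callable from both host and device code when built with nvcc,
// and still compiles as an ordinary inline function on the host.
EIGEN_ALWAYS_INLINE EIGEN_DEVICE_FUNC float read_coeff(float const* buffer, int index) {
#ifdef __CUDA_ARCH__
  // device compilation pass: load through the read-only data cache
  return __ldg(buffer + index);
#else
  // host compilation pass: __ldg() is not available, fall back to a plain load
  return buffer[index];
#endif  // __CUDA_ARCH__
}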