Use CMS_UNROLL_LOOP instead of #pragma unroll in ECAL code (#597)

fwyzard · web-flow · commit 219ff604a10f · 2020-12-21T17:54:22.000+01:00
diff --git a/RecoLocalCalo/EcalRecProducers/plugins/AmplitudeComputationCommonKernels.cu b/RecoLocalCalo/EcalRecProducers/plugins/AmplitudeComputationCommonKernels.cu
@@ -10,6 +10,7 @@
 #include "DataFormats/EcalRecHit/interface/EcalUncalibratedRecHit.h"
 #include "DataFormats/Math/interface/approx_exp.h"
 #include "DataFormats/Math/interface/approx_log.h"
+#include "FWCore/Utilities/interface/CMSUnrollLoop.h"
 
 #include "AmplitudeComputationCommonKernels.h"
 #include "KernelHelpers.h"
@@ -128,7 +129,7 @@ namespace ecal {
 
         // non-divergent branch (except for the last 4 threads)
         if (threadIdx.x <= blockDim.x - 5) {
-#pragma unroll
+          CMS_UNROLL_LOOP
           for (int i = 0; i < 5; i++)
             shr_counts[threadIdx.x] += shr_hasSwitchToGain0[threadIdx.x + i];
         }
@@ -263,7 +264,7 @@ namespace ecal {
 
             // check if samples before sample_max have true
             bool saturated_before_max = false;
-#pragma unroll
+            CMS_UNROLL_LOOP
             for (char ii = 0; ii < 5; ii++)
               saturated_before_max = saturated_before_max || shr_hasSwitchToGain0[chStart + ii];
 
@@ -397,7 +398,7 @@ namespace ecal {
           noise_value += rms_x12[hashedId] * rms_x12[hashedId] * pedestal * G12SamplesCorrelation[vidx];
           // non-divergent branch
           if (!dynamicPedestal && addPedestalUncertainty > 0.f) {
-            noise_value += addPedestalUncertainty * addPedestalUncertainty * pedestal; // gainratio is 1
+            noise_value += addPedestalUncertainty * addPedestalUncertainty * pedestal;  // gainratio is 1
           }
 
           //
diff --git a/RecoLocalCalo/EcalRecProducers/plugins/AmplitudeComputationKernels.cu b/RecoLocalCalo/EcalRecProducers/plugins/AmplitudeComputationKernels.cu
@@ -9,6 +9,7 @@
 #include "DataFormats/EcalDigi/interface/EcalDigiCollections.h"
 #include "DataFormats/Math/interface/approx_exp.h"
 #include "DataFormats/Math/interface/approx_log.h"
+#include "FWCore/Utilities/interface/CMSUnrollLoop.h"
 
 #include "AmplitudeComputationCommonKernels.h"
 #include "AmplitudeComputationKernels.h"
@@ -24,7 +25,7 @@ namespace ecal {
       constexpr int nsamples = SampleVector::RowsAtCompileTime;
       constexpr int npulses = BXVectorType::RowsAtCompileTime;
 
-#pragma unroll
+      CMS_UNROLL_LOOP
       for (unsigned int ipulse = 0; ipulse < npulses; ipulse++) {
         auto const amplitude = amplitudes.coeff(ipulse);
         if (amplitude == 0)
@@ -116,12 +117,12 @@ namespace ecal {
         int npassive = 0;
 
         calo::multifit::ColumnVector<NPULSES, int> pulseOffsets;
-#pragma unroll
+        CMS_UNROLL_LOOP
         for (int i = 0; i < NPULSES; ++i)
           pulseOffsets(i) = i;
 
         calo::multifit::ColumnVector<NPULSES, DataType> resultAmplitudes;
-#pragma unroll
+        CMS_UNROLL_LOOP
         for (int counter = 0; counter < NPULSES; counter++)
           resultAmplitudes(counter) = 0;
 
@@ -141,12 +142,12 @@ namespace ecal {
           DataType* covMatrixStorage = shrMatrixLForFnnlsStorage;
           calo::multifit::MapSymM<DataType, NSAMPLES> covMatrix{covMatrixStorage};
           int counter = 0;
-#pragma unroll
-          for (int col = 0; col < NSAMPLES; col++)
-#pragma unroll
+          CMS_UNROLL_LOOP
+          for (int col = 0; col < NSAMPLES; col++) {
+            CMS_UNROLL_LOOP
             for (int row = col; row < NSAMPLES; row++)
               covMatrixStorage[counter++] = __ldg(&noisecov[idx].coeffRef(row, col));
-
+          }
           update_covariance(pulse_covariance[hashedId], covMatrix, resultAmplitudes);
 
           // compute actual covariance decomposition
@@ -169,36 +170,36 @@ namespace ecal {
           calo::multifit::MapSymM<DataType, NPULSES> AtA{shrAtAStorage};
           //SampleMatrix AtA;
           SampleVector Atb;
-#pragma unroll
+          CMS_UNROLL_LOOP
           for (int icol = 0; icol < NPULSES; icol++) {
             float reg_ai[NSAMPLES];
 
-// load column icol
-#pragma unroll
+            // load column icol
+            CMS_UNROLL_LOOP
             for (int counter = 0; counter < NSAMPLES; counter++)
               reg_ai[counter] = A(counter, icol);
 
             // compute diagoanl
             float sum = 0.f;
-#pragma unroll
+            CMS_UNROLL_LOOP
             for (int counter = 0; counter < NSAMPLES; counter++)
               sum += reg_ai[counter] * reg_ai[counter];
 
             // store
             AtA(icol, icol) = sum;
 
-// go thru the other columns
-#pragma unroll
+            // go thru the other columns
+            CMS_UNROLL_LOOP
             for (int j = icol + 1; j < NPULSES; j++) {
               // load column j
               float reg_aj[NSAMPLES];
-#pragma unroll
+              CMS_UNROLL_LOOP
               for (int counter = 0; counter < NSAMPLES; counter++)
                 reg_aj[counter] = A(counter, j);
 
               // accum
               float sum = 0.f;
-#pragma unroll
+              CMS_UNROLL_LOOP
               for (int counter = 0; counter < NSAMPLES; counter++)
                 sum += reg_aj[counter] * reg_ai[counter];
 
@@ -209,7 +210,7 @@ namespace ecal {
 
             // Atb accum
             float sum_atb = 0.f;
-#pragma unroll
+            CMS_UNROLL_LOOP
             for (int counter = 0; counter < NSAMPLES; counter++)
               sum_atb += reg_ai[counter] * reg_b[counter];
 
@@ -251,7 +252,7 @@ namespace ecal {
         chi2s[inputCh] = chi2;
         energies[inputCh] = resultAmplitudes(5);
 
-#pragma unroll
+        CMS_UNROLL_LOOP
         for (int counter = 0; counter < NPULSES; counter++)
           amplitudes[inputCh](counter) = resultAmplitudes(counter);
       }
diff --git a/RecoLocalCalo/EcalRecProducers/plugins/TimeComputationKernels.cu b/RecoLocalCalo/EcalRecProducers/plugins/TimeComputationKernels.cu
@@ -7,6 +7,7 @@
 #include "DataFormats/EcalRecHit/interface/EcalUncalibratedRecHit.h"
 #include "DataFormats/Math/interface/approx_exp.h"
 #include "DataFormats/Math/interface/approx_log.h"
+#include "FWCore/Utilities/interface/CMSUnrollLoop.h"
 
 #include "Common.h"
 #include "TimeComputationKernels.h"
@@ -263,7 +264,7 @@ namespace ecal {
           if (ratio_step == 1 && ratio_value >= l_timeFitLimits_first && ratio_value <= l_timeFitLimits_second) {
             const auto time_max_i = static_cast<ScalarType>(ratio_index);
             auto u = timeFitParameters[timeFitParameters_size - 1];
-#pragma unroll
+            CMS_UNROLL_LOOP
             for (int k = timeFitParameters_size - 2; k >= 0; k--)
               u = u * ratio_value + timeFitParameters[k];
 
@@ -365,7 +366,7 @@ namespace ecal {
       // TODO validate/check
       char iter = nthreads_per_channel / 2 + nthreads_per_channel % 2;
       bool oddElements = nthreads_per_channel % 2;
-#pragma unroll
+      CMS_UNROLL_LOOP
       while (iter >= 1) {
         if (ltx < iter)
           // for odd ns, the last guy will just store itself
@@ -410,7 +411,7 @@ namespace ecal {
       // reduce to compute time_max and time_wgt
       iter = nthreads_per_channel / 2 + nthreads_per_channel % 2;
       oddElements = nthreads_per_channel % 2;
-#pragma unroll
+      CMS_UNROLL_LOOP
       while (iter >= 1) {
         if (ltx < iter) {
           shr_time_wgt[threadIdx.x] = oddElements && (ltx == iter - 1 && ltx > 0)
@@ -893,8 +894,8 @@ namespace ecal {
         sample_value = (static_cast<SampleVector::Scalar>(adc) - mean_x6[hashedId]) * gain12Over6[hashedId];
         sample_value_error = rms_x6[hashedId] * gain12Over6[hashedId];
       } else if (gainId == 3) {
-        sample_value = (static_cast<SampleVector::Scalar>(adc) - mean_x1[hashedId]) * gain6Over1[hashedId] *
-                       gain12Over6[hashedId];
+        sample_value =
+            (static_cast<SampleVector::Scalar>(adc) - mean_x1[hashedId]) * gain6Over1[hashedId] * gain12Over6[hashedId];
         sample_value_error = rms_x1[hashedId] * gain6Over1[hashedId] * gain12Over6[hashedId];
       } else {
         sample_value = 0;