R2D: use GPU::SimpleVector for the error unpacking (#14)

cms-patatrack · Feb 20, 2018 · 2f625fc · 2f625fc
1 parent 5cd293f
commit 2f625fc
Show file tree

Hide file tree

Showing 7 changed files with 86 additions and 80 deletions.
diff --git a/EventFilter/SiPixelRawToDigi/plugins/RawToDigiGPU.cu b/EventFilter/SiPixelRawToDigi/plugins/RawToDigiGPU.cu
@@ -46,6 +46,9 @@ context initDeviceMemory() {
   constexpr uint32_t MAX_WORD08_SIZE = MAX_FED * MAX_WORD  * sizeof(uint8_t);
   constexpr uint32_t MAX_WORD32_SIZE = MAX_FED * MAX_WORD  * sizeof(uint32_t);
   constexpr uint32_t MAX_WORD16_SIZE = MAX_FED * MAX_WORD  * sizeof(uint16_t);
+  constexpr uint32_t vsize = sizeof(GPU::SimpleVector<error_obj>);
+  constexpr uint32_t esize = sizeof(error_obj);
+  constexpr uint32_t MAX_ERROR_SIZE  = MAX_FED * MAX_WORD * esize;
 
   cudaCheck(cudaMalloc((void**) & c.word_d,        MAX_WORD32_SIZE));
   cudaCheck(cudaMalloc((void**) & c.fedId_d,       MAX_WORD08_SIZE));
@@ -56,11 +59,8 @@ context initDeviceMemory() {
 
   cudaCheck(cudaMalloc((void**) & c.moduleInd_d,   MAX_WORD16_SIZE));
   cudaCheck(cudaMalloc((void**) & c.rawIdArr_d,    MAX_WORD32_SIZE));
-  cudaCheck(cudaMalloc((void**) & c.errType_d,     MAX_WORD32_SIZE));
-  cudaCheck(cudaMalloc((void**) & c.errWord_d,     MAX_WORD32_SIZE));
-  cudaCheck(cudaMalloc((void**) & c.errFedID_d,    MAX_WORD32_SIZE));
-  cudaCheck(cudaMalloc((void**) & c.errRawID_d,    MAX_WORD32_SIZE));
-
+  cudaCheck(cudaMalloc((void**) & c.error_d,       vsize));
+  cudaCheck(cudaMalloc((void**) & c.data_d,        MAX_ERROR_SIZE));
 
   // for the clusterizer
   cudaCheck(cudaMalloc((void**) & c.clus_d,        MAX_WORD32_SIZE)); // cluser index in module
@@ -71,7 +71,6 @@ context initDeviceMemory() {
 
   cudaCheck(cudaMalloc((void**) & c.debug_d,        MAX_WORD32_SIZE));
 
-
   // create a CUDA stream
   cudaCheck(cudaStreamCreate(&c.stream));
 
@@ -89,10 +88,8 @@ void freeMemory(context & c) {
   cudaCheck(cudaFree(c.adc_d));
   cudaCheck(cudaFree(c.moduleInd_d));
   cudaCheck(cudaFree(c.rawIdArr_d));
-  cudaCheck(cudaFree(c.errType_d));
-  cudaCheck(cudaFree(c.errWord_d));
-  cudaCheck(cudaFree(c.errFedID_d));
-  cudaCheck(cudaFree(c.errRawID_d));
+  cudaCheck(cudaFree(c.error_d));
+  cudaCheck(cudaFree(c.data_d));
 
  // these are for the clusterizer (to be moved)
  cudaCheck(cudaFree(c.moduleStart_d));
@@ -453,11 +450,10 @@ __device__ uint32_t getErrRawID(uint32_t fedId, uint32_t errWord, uint32_t error
 __global__ void RawToDigi_kernel(const SiPixelFedCablingMapGPU *Map, const uint32_t wordCounter, const uint32_t *Word, const uint8_t *fedIds,
                                  uint16_t * XX, uint16_t * YY, uint16_t * ADC,
                                  uint32_t * pdigi, uint32_t *rawIdArr, uint16_t * moduleId,
-                                 uint32_t *errType, uint32_t *errWord, uint32_t *errFedID, uint32_t *errRawID,
+                                 GPU::SimpleVector<error_obj> *err,
                                  bool useQualityInfo, bool includeErrors, bool debug)
 {
   uint32_t blockId  = blockIdx.x;
-
   uint32_t threadId  = threadIdx.x;
 
   bool skipROC = false;
@@ -474,14 +470,7 @@ __global__ void RawToDigi_kernel(const SiPixelFedCablingMapGPU *Map, const uint3
       rawIdArr[gIndex] = 0;
       moduleId[gIndex] = 9999; 
 
-
       uint32_t ww = Word[gIndex]; // Array containing 32 bit raw data
-      if (includeErrors) {
-        errType[gIndex]  = 0;
-        errWord[gIndex]  = ww;
-        errFedID[gIndex] = fedId;
-        errRawID[gIndex] = 0;
-      }
       if (ww == 0) {
         //noise and dead channels are ignored
         XX[gIndex]    = 0;  // 0 is an indicator of a noise/dead channel
@@ -490,25 +479,19 @@ __global__ void RawToDigi_kernel(const SiPixelFedCablingMapGPU *Map, const uint3
         continue ; // 0: bad word
       }
 
-
       uint32_t link  = getLink(ww);            // Extract link
       uint32_t roc   = getRoc(ww);             // Extract Roc in link
       DetIdGPU detId = getRawId(Map, fedId, link, roc);
 
-
       uint32_t errorType = checkROC(ww, fedId, link, Map, debug);
       skipROC = (roc < maxROCIndex) ? false : (errorType != 0);
       if (includeErrors and skipROC)
       {
         uint32_t rID = getErrRawID(fedId, ww, errorType, Map, debug);
-        errType[gIndex]  = errorType;
-        errWord[gIndex]  = ww;
-        errFedID[gIndex] = fedId;
-        errRawID[gIndex] = rID;
+        err->emplace_back(rID, ww, errorType, fedId);
         continue;
       }
 
-
       uint32_t rawId  = detId.RawId;
       uint32_t rocIdInDetUnit = detId.rocInDet;
       bool barrel = isBarrel(rawId);
@@ -551,10 +534,7 @@ __global__ void RawToDigi_kernel(const SiPixelFedCablingMapGPU *Map, const uint3
         if (includeErrors) {
           if (not rocRowColIsValid(row, col)) {
             uint32_t error = conversionError(fedId, 3, debug); //use the device function and fill the arrays
-            errType[gIndex]  = error;
-            errWord[gIndex]  = ww;
-            errFedID[gIndex] = fedId;
-            errRawID[gIndex] = rawId;
+            err->emplace_back(rawId, ww, error, fedId);
             if(debug) printf("BPIX1  Error status: %i\n", error);
             continue;
           }
@@ -569,10 +549,7 @@ __global__ void RawToDigi_kernel(const SiPixelFedCablingMapGPU *Map, const uint3
         localPix.col = col;
         if (includeErrors and not dcolIsValid(dcol, pxid)) {
           uint32_t error = conversionError(fedId, 3, debug);
-          errType[gIndex] = error;
-          errWord[gIndex] = ww;
-          errFedID[gIndex] = fedId;
-          errRawID[gIndex] = rawId;
+          err->emplace_back(rawId, ww, error, fedId);
           if(debug) printf("Error status: %i %d %d %d %d\n", error, dcol, pxid, fedId, roc);
           continue;
         }
@@ -587,8 +564,6 @@ __global__ void RawToDigi_kernel(const SiPixelFedCablingMapGPU *Map, const uint3
       rawIdArr[gIndex] = rawId;
     } // end of if (gIndex < end)
    } // end fake loop
-
-
 } // end of Raw to Digi kernel
 
 
@@ -599,7 +574,7 @@ void RawToDigi_wrapper(
     const uint32_t wordCounter, uint32_t *word, const uint32_t fedCounter,  uint8_t *fedId_h,
     bool convertADCtoElectrons, 
     uint32_t * pdigi_h, uint32_t *rawIdArr_h, 
-    uint32_t *errType_h, uint32_t *errWord_h, uint32_t *errFedID_h, uint32_t *errRawID_h,
+    GPU::SimpleVector<error_obj> *error_h, GPU::SimpleVector<error_obj> *error_h_tmp, error_obj *data_h,
     bool useQualityInfo, bool includeErrors, bool debug, uint32_t & nModulesActive)
 {
   const int threadsPerBlock = 512;
@@ -610,7 +585,11 @@ void RawToDigi_wrapper(
   // wordCounter is the total no of words in each event to be trasfered on device
   cudaCheck(cudaMemcpyAsync(&c.word_d[0],     &word[0],     wordCounter*sizeof(uint32_t), cudaMemcpyHostToDevice, c.stream));
   cudaCheck(cudaMemcpyAsync(&c.fedId_d[0], &fedId_h[0], wordCounter*sizeof(uint8_t)/2, cudaMemcpyHostToDevice, c.stream));
-
+
+  constexpr uint32_t vsize = sizeof(GPU::SimpleVector<error_obj>);
+  constexpr uint32_t esize = sizeof(error_obj);
+  cudaCheck(cudaMemcpyAsync(c.error_d, error_h_tmp, vsize, cudaMemcpyHostToDevice, c.stream));
+
   // Launch rawToDigi kernel
   RawToDigi_kernel<<<blocks, threadsPerBlock, 0, c.stream>>>(
       cablingMapDevice,
@@ -621,10 +600,7 @@ void RawToDigi_wrapper(
       c.pdigi_d,
       c.rawIdArr_d,
       c.moduleInd_d,
-      c.errType_d,
-      c.errWord_d,
-      c.errFedID_d,
-      c.errRawID_d,
+      c.error_d,
       useQualityInfo,
       includeErrors,
       debug);
@@ -636,15 +612,15 @@ void RawToDigi_wrapper(
   cudaCheck(cudaMemcpyAsync(rawIdArr_h, c.rawIdArr_d, wordCounter*sizeof(uint32_t), cudaMemcpyDeviceToHost, c.stream));
 
   if (includeErrors) {
-    cudaCheck(cudaMemcpyAsync(errType_h, c.errType_d, wordCounter*sizeof(uint32_t), cudaMemcpyDeviceToHost, c.stream));
-    cudaCheck(cudaMemcpyAsync(errWord_h, c.errWord_d, wordCounter*sizeof(uint32_t), cudaMemcpyDeviceToHost, c.stream));
-    cudaCheck(cudaMemcpyAsync(errFedID_h, c.errFedID_d, wordCounter*sizeof(uint32_t), cudaMemcpyDeviceToHost, c.stream));
-    cudaCheck(cudaMemcpyAsync(errRawID_h, c.errRawID_d, wordCounter*sizeof(uint32_t), cudaMemcpyDeviceToHost, c.stream));
+      cudaCheck(cudaMemcpyAsync(error_h, c.error_d, vsize, cudaMemcpyDeviceToHost, c.stream));
+      cudaStreamSynchronize(c.stream);
+      error_h->set_data(data_h);
+      int size = error_h->size();
+      cudaCheck(cudaMemcpyAsync(data_h, c.data_d, size*esize, cudaMemcpyDeviceToHost, c.stream));
   }
-  cudaStreamSynchronize(c.stream);
   // End  of Raw2Digi and passing data for cluserisation
 
- { 
+ {
    // clusterizer ...
    using namespace gpuClustering;
   int threadsPerBlock = 256;

diff --git a/EventFilter/SiPixelRawToDigi/plugins/RawToDigiGPU.h b/EventFilter/SiPixelRawToDigi/plugins/RawToDigiGPU.h
@@ -8,6 +8,7 @@
 #include <cuda_runtime.h>
 
 #include "SiPixelFedCablingMapGPU.h"
+#include "HeterogeneousCore/CUDAUtilities/interface/GPUSimpleVector.h"
 #include<algorithm>
 
 const uint32_t layerStartBit_   = 20;
@@ -145,6 +146,14 @@ inline uint32_t pack(uint32_t row, uint32_t col, uint32_t adc) {
 
 }
 
+struct error_obj {
+  uint32_t rawId;
+  uint32_t word;
+  unsigned char errorType;
+  unsigned char fedId;
+  __host__ __device__ error_obj(uint32_t a_, uint32_t b_, unsigned char c_, unsigned char d_):
+    rawId(a_), word(b_), errorType(c_), fedId(d_) {}
+};
 
 // configuration and memory buffers alocated on the GPU
 struct context {
@@ -159,12 +168,10 @@ struct context {
 
   uint16_t * moduleInd_d;
   uint32_t * rawIdArr_d;
-  uint32_t * errType_d;
-  uint32_t * errWord_d;
-  uint32_t * errFedID_d;
-  uint32_t * errRawID_d;
-
 
+  GPU::SimpleVector<error_obj> * error_d;
+  error_obj * data_d;
+
   // these are for the clusterizer (to be moved)
   uint32_t * moduleStart_d;
   int32_t *  clus_d;
@@ -176,12 +183,15 @@ struct context {
 
 
 // wrapper function to call RawToDigi on the GPU from host side
-void RawToDigi_wrapper(context &, const SiPixelFedCablingMapGPU* cablingMapDevice, SiPixelGainForHLTonGPU * const ped, 
-                        const uint32_t wordCounter, uint32_t *word, 
-                        const uint32_t fedCounter,  uint8_t *fedId_h,
-                        bool convertADCtoElectrons, uint32_t * pdigi_h,
-                        uint32_t *rawIdArr_h, uint32_t *errType_h, uint32_t *errWord_h, uint32_t *errFedID_h, uint32_t *errRawID_h,
-                        bool useQualityInfo, bool includeErrors, bool debug, uint32_t & nModulesActive);
+void RawToDigi_wrapper(context &, const SiPixelFedCablingMapGPU* cablingMapDevice,
+                       SiPixelGainForHLTonGPU * const ped,
+                       const uint32_t wordCounter, uint32_t *word,
+                       const uint32_t fedCounter,  uint8_t *fedId_h,
+                       bool convertADCtoElectrons, uint32_t * pdigi_h,
+                       uint32_t *rawIdArr_h, GPU::SimpleVector<error_obj> *error_h,
+                       GPU::SimpleVector<error_obj> *error_h_tmp, error_obj *data_h,
+                       bool useQualityInfo, bool includeErrors, bool debug,
+                       uint32_t & nModulesActive);
 
 // void initCablingMap();
 context initDeviceMemory();

diff --git a/EventFilter/SiPixelRawToDigi/plugins/SiPixelFedCablingMapGPU.cc b/EventFilter/SiPixelRawToDigi/plugins/SiPixelFedCablingMapGPU.cc
@@ -90,9 +90,9 @@ void processCablingMap(SiPixelFedCablingMap const& cablingMap,  TrackerGeometry
       moduleId[i] = gdet->index();
     }
     LogDebug("SiPixelFedCablingMapGPU") << "----------------------------------------------------------------------------" << std::endl;
-    LogDebug("SiPixelFedCablingMapGPU") << i << std::setw(20) << fedMap[i]         << std::setw(20) << linkMap[i]         << std::setw(20) << rocMap[i]   << std::endl;
-    LogDebug("SiPixelFedCablingMapGPU") << i << std::setw(20) << RawId[i]          << std::setw(20) << rocInDet[i]        << std::setw(20) << moduleId[i] << std::endl;
-    LogDebug("SiPixelFedCablingMapGPU") << i << std::setw(20) << (bool) badRocs[i] << std::setw(20) << (bool) modToUnp[i] << std::endl;
+    LogDebug("SiPixelFedCablingMapGPU") << i << std::setw(20) << fedMap[i]  << std::setw(20) << linkMap[i]  << std::setw(20) << rocMap[i] << std::endl;
+    LogDebug("SiPixelFedCablingMapGPU") << i << std::setw(20) << RawId[i]   << std::setw(20) << rocInDet[i] << std::setw(20) << moduleId[i] << std::endl;
+    LogDebug("SiPixelFedCablingMapGPU") << i << std::setw(20) << (bool)badRocs[i] << std::setw(20) << (bool)modToUnp[i] << std::endl;
     LogDebug("SiPixelFedCablingMapGPU") << "----------------------------------------------------------------------------" << std::endl;
   }
 

diff --git a/EventFilter/SiPixelRawToDigi/plugins/SiPixelRawToDigiGPU.cc b/EventFilter/SiPixelRawToDigi/plugins/SiPixelRawToDigiGPU.cc
@@ -64,8 +64,9 @@ SiPixelRawToDigiGPU::SiPixelRawToDigiGPU( const edm::ParameterSet& conf )
     usererrorlist = config_.getParameter<std::vector<int> > ("UserErrorList");
   }
   tFEDRawDataCollection = consumes <FEDRawDataCollection> (config_.getParameter<edm::InputTag>("InputLabel"));
+  debug = config_.getParameter<bool>("enableErrorDebug");
 
-  // start counters
+  //start counters
   ndigis = 0;
   nwords = 0;
 
@@ -127,16 +128,24 @@ SiPixelRawToDigiGPU::SiPixelRawToDigiGPU( const edm::ParameterSet& conf )
   cudaMallocHost(&pdigi_h,    sizeof(uint32_t)*WSIZE);
   cudaMallocHost(&rawIdArr_h, sizeof(uint32_t)*WSIZE);
 
-  cudaMallocHost(&errType_h,  sizeof(uint32_t)*WSIZE);
-  cudaMallocHost(&errRawID_h, sizeof(uint32_t)*WSIZE);
-  cudaMallocHost(&errWord_h,  sizeof(uint32_t)*WSIZE);
-  cudaMallocHost(&errFedID_h, sizeof(uint32_t)*WSIZE);
+  uint32_t vsize = sizeof(GPU::SimpleVector<error_obj>);
+  uint32_t esize = sizeof(error_obj);
+  cudaCheck(cudaMallocHost(&error_h, vsize));
+  cudaCheck(cudaMallocHost(&error_h_tmp, vsize));
+  cudaCheck(cudaMallocHost(&data_h, MAX_FED*MAX_WORD*esize));
 
   cudaMallocHost(&mIndexStart_h, sizeof(int)*(NMODULE+1));
   cudaMallocHost(&mIndexEnd_h,   sizeof(int)*(NMODULE+1));
 
   // allocate memory for RawToDigi on GPU
   context_ = initDeviceMemory();
+
+  new (error_h) GPU::SimpleVector<error_obj>(MAX_FED*MAX_WORD, data_h);
+  new (error_h_tmp) GPU::SimpleVector<error_obj>(MAX_FED*MAX_WORD, context_.data_d);
+  assert(error_h->size() == 0);
+  assert(error_h->capacity() == static_cast<int>(MAX_FED*MAX_WORD));
+  assert(error_h_tmp->size() == 0);
+  assert(error_h_tmp->capacity() == static_cast<int>(MAX_FED*MAX_WORD));
 }
 
 // -----------------------------------------------------------------------------
@@ -154,10 +163,9 @@ SiPixelRawToDigiGPU::~SiPixelRawToDigiGPU() {
   cudaFreeHost(fedId_h);
   cudaFreeHost(pdigi_h);
   cudaFreeHost(rawIdArr_h);
-  cudaFreeHost(errType_h);
-  cudaFreeHost(errRawID_h);
-  cudaFreeHost(errWord_h);
-  cudaFreeHost(errFedID_h);
+  cudaFreeHost(error_h);
+  cudaFreeHost(error_h_tmp);
+  cudaFreeHost(data_h);
   cudaFreeHost(mIndexStart_h);
   cudaFreeHost(mIndexEnd_h);
 
@@ -209,6 +217,7 @@ SiPixelRawToDigiGPU::fillDescriptions(edm::ConfigurationDescriptions& descriptio
   desc.add<std::string>("CablingMapLabel","")->setComment("CablingMap label"); //Tav
   desc.addOptional<bool>("CheckPixelOrder");  // never used, kept for back-compatibility
   desc.add<bool>("ConvertADCtoElectrons", false)->setComment("## do the calibration ADC-> Electron and apply the threshold, requried for clustering");
+  desc.add<bool>("enableErrorDebug",false);
   descriptions.add("siPixelRawToDigiGPU",desc);
 }
 
@@ -222,7 +231,6 @@ SiPixelRawToDigiGPU::produce( edm::Event& ev, const edm::EventSetup& es)
   int theWordCounter = 0;
   int theDigiCounter = 0;
   const uint32_t dummydetid = 0xffffffff;
-  debug = edm::MessageDrop::instance()->debugEnabled;
 
   // initialize quality record or update if necessary
   if (qualityWatcher.check( es ) && useQuality) {
@@ -347,8 +355,9 @@ SiPixelRawToDigiGPU::produce( edm::Event& ev, const edm::EventSetup& es)
 
   // GPU specific: RawToDigi -> clustering
   uint32_t nModulesActive=0;
-  RawToDigi_wrapper(context_, cablingMapGPUDevice_, gainForHLTonGPU_, wordCounterGPU, word, fedCounter, fedId_h, convertADCtoElectrons, pdigi_h, 
-      rawIdArr_h, errType_h, errWord_h, errFedID_h, errRawID_h, useQuality, includeErrors, debug, nModulesActive);
+  RawToDigi_wrapper(context_, cablingMapGPUDevice_, gainForHLTonGPU_, wordCounterGPU, word, fedCounter,
+                    fedId_h, convertADCtoElectrons, pdigi_h, rawIdArr_h, error_h, error_h_tmp, data_h,
+                    useQuality, includeErrors, debug,nModulesActive);
 
   auto gpuProd = std::make_unique<std::vector<unsigned long long>>();
   gpuProd->resize(3);
@@ -377,10 +386,12 @@ SiPixelRawToDigiGPU::produce( edm::Event& ev, const edm::EventSetup& es)
     theDigiCounter++;
   }
 
-  for (uint32_t i = 0; i < wordCounterGPU; i++) {
-    if (errType_h[i] != 0) {
-      SiPixelRawDataError error(errWord_h[i], errType_h[i], errFedID_h[i]+1200);
-      errors[errRawID_h[i]].push_back(error);
+  auto size = error_h->size();
+  for (auto i = 0; i < size; i++) {
+    error_obj err = (*error_h)[i];
+    if (err.errorType != 0) {
+        SiPixelRawDataError error(err.word, err.errorType, err.fedId + 1200);
+        errors[err.rawId].push_back(error);
     }
   }
 

diff --git a/EventFilter/SiPixelRawToDigi/plugins/SiPixelRawToDigiGPU.h b/EventFilter/SiPixelRawToDigi/plugins/SiPixelRawToDigiGPU.h
@@ -74,7 +74,9 @@ class SiPixelRawToDigiGPU : public edm::stream::EDProducer<> {
 
   // to store the output
   uint32_t *pdigi_h, *rawIdArr_h;                   // host copy of output
-  uint32_t *errType_h, *errWord_h, *errFedID_h, *errRawID_h;    // host copy of output
+  error_obj *data_h = nullptr;
+  GPU::SimpleVector<error_obj> *error_h = nullptr;
+  GPU::SimpleVector<error_obj> *error_h_tmp = nullptr;
   // store the start and end index for each module (total 1856 modules-phase 1)
   int *mIndexStart_h, *mIndexEnd_h;
 

diff --git a/EventFilter/SiPixelRawToDigi/python/SiPixelRawToDigi_cfi.py b/EventFilter/SiPixelRawToDigi/python/SiPixelRawToDigi_cfi.py
@@ -35,6 +35,7 @@
 ## Empty Regions PSet means complete unpacking
 siPixelDigisGPU.Regions = cms.PSet( )
 siPixelDigisGPU.CablingMapLabel = cms.string("")
+siPixelDigisGPU.enableErrorDebug = cms.bool(False)
 
 from Configuration.Eras.Modifier_phase1Pixel_cff import phase1Pixel
 phase1Pixel.toModify(siPixelDigis, UsePhase1=True)