diff --git a/EventFilter/SiPixelRawToDigi/plugins/RawToDigiGPU.cu b/EventFilter/SiPixelRawToDigi/plugins/RawToDigiGPU.cu index 29d167899c1eb..e9634b34dcc37 100644 --- a/EventFilter/SiPixelRawToDigi/plugins/RawToDigiGPU.cu +++ b/EventFilter/SiPixelRawToDigi/plugins/RawToDigiGPU.cu @@ -46,6 +46,9 @@ context initDeviceMemory() { constexpr uint32_t MAX_WORD08_SIZE = MAX_FED * MAX_WORD * sizeof(uint8_t); constexpr uint32_t MAX_WORD32_SIZE = MAX_FED * MAX_WORD * sizeof(uint32_t); constexpr uint32_t MAX_WORD16_SIZE = MAX_FED * MAX_WORD * sizeof(uint16_t); + constexpr uint32_t vsize = sizeof(GPU::SimpleVector); + constexpr uint32_t esize = sizeof(error_obj); + constexpr uint32_t MAX_ERROR_SIZE = MAX_FED * MAX_WORD * esize; cudaCheck(cudaMalloc((void**) & c.word_d, MAX_WORD32_SIZE)); cudaCheck(cudaMalloc((void**) & c.fedId_d, MAX_WORD08_SIZE)); @@ -56,11 +59,8 @@ context initDeviceMemory() { cudaCheck(cudaMalloc((void**) & c.moduleInd_d, MAX_WORD16_SIZE)); cudaCheck(cudaMalloc((void**) & c.rawIdArr_d, MAX_WORD32_SIZE)); - cudaCheck(cudaMalloc((void**) & c.errType_d, MAX_WORD32_SIZE)); - cudaCheck(cudaMalloc((void**) & c.errWord_d, MAX_WORD32_SIZE)); - cudaCheck(cudaMalloc((void**) & c.errFedID_d, MAX_WORD32_SIZE)); - cudaCheck(cudaMalloc((void**) & c.errRawID_d, MAX_WORD32_SIZE)); - + cudaCheck(cudaMalloc((void**) & c.error_d, vsize)); + cudaCheck(cudaMalloc((void**) & c.data_d, MAX_ERROR_SIZE)); // for the clusterizer cudaCheck(cudaMalloc((void**) & c.clus_d, MAX_WORD32_SIZE)); // cluser index in module @@ -71,7 +71,6 @@ context initDeviceMemory() { cudaCheck(cudaMalloc((void**) & c.debug_d, MAX_WORD32_SIZE)); - // create a CUDA stream cudaCheck(cudaStreamCreate(&c.stream)); @@ -89,10 +88,8 @@ void freeMemory(context & c) { cudaCheck(cudaFree(c.adc_d)); cudaCheck(cudaFree(c.moduleInd_d)); cudaCheck(cudaFree(c.rawIdArr_d)); - cudaCheck(cudaFree(c.errType_d)); - cudaCheck(cudaFree(c.errWord_d)); - cudaCheck(cudaFree(c.errFedID_d)); - cudaCheck(cudaFree(c.errRawID_d)); + cudaCheck(cudaFree(c.error_d)); + cudaCheck(cudaFree(c.data_d)); // these are for the clusterizer (to be moved) cudaCheck(cudaFree(c.moduleStart_d)); @@ -453,11 +450,10 @@ __device__ uint32_t getErrRawID(uint32_t fedId, uint32_t errWord, uint32_t error __global__ void RawToDigi_kernel(const SiPixelFedCablingMapGPU *Map, const uint32_t wordCounter, const uint32_t *Word, const uint8_t *fedIds, uint16_t * XX, uint16_t * YY, uint16_t * ADC, uint32_t * pdigi, uint32_t *rawIdArr, uint16_t * moduleId, - uint32_t *errType, uint32_t *errWord, uint32_t *errFedID, uint32_t *errRawID, + GPU::SimpleVector *err, bool useQualityInfo, bool includeErrors, bool debug) { uint32_t blockId = blockIdx.x; - uint32_t threadId = threadIdx.x; bool skipROC = false; @@ -474,14 +470,7 @@ __global__ void RawToDigi_kernel(const SiPixelFedCablingMapGPU *Map, const uint3 rawIdArr[gIndex] = 0; moduleId[gIndex] = 9999; - uint32_t ww = Word[gIndex]; // Array containing 32 bit raw data - if (includeErrors) { - errType[gIndex] = 0; - errWord[gIndex] = ww; - errFedID[gIndex] = fedId; - errRawID[gIndex] = 0; - } if (ww == 0) { //noise and dead channels are ignored XX[gIndex] = 0; // 0 is an indicator of a noise/dead channel @@ -490,25 +479,19 @@ __global__ void RawToDigi_kernel(const SiPixelFedCablingMapGPU *Map, const uint3 continue ; // 0: bad word } - uint32_t link = getLink(ww); // Extract link uint32_t roc = getRoc(ww); // Extract Roc in link DetIdGPU detId = getRawId(Map, fedId, link, roc); - uint32_t errorType = checkROC(ww, fedId, link, Map, debug); skipROC = (roc < maxROCIndex) ? false : (errorType != 0); if (includeErrors and skipROC) { uint32_t rID = getErrRawID(fedId, ww, errorType, Map, debug); - errType[gIndex] = errorType; - errWord[gIndex] = ww; - errFedID[gIndex] = fedId; - errRawID[gIndex] = rID; + err->emplace_back(rID, ww, errorType, fedId); continue; } - uint32_t rawId = detId.RawId; uint32_t rocIdInDetUnit = detId.rocInDet; bool barrel = isBarrel(rawId); @@ -551,10 +534,7 @@ __global__ void RawToDigi_kernel(const SiPixelFedCablingMapGPU *Map, const uint3 if (includeErrors) { if (not rocRowColIsValid(row, col)) { uint32_t error = conversionError(fedId, 3, debug); //use the device function and fill the arrays - errType[gIndex] = error; - errWord[gIndex] = ww; - errFedID[gIndex] = fedId; - errRawID[gIndex] = rawId; + err->emplace_back(rawId, ww, error, fedId); if(debug) printf("BPIX1 Error status: %i\n", error); continue; } @@ -569,10 +549,7 @@ __global__ void RawToDigi_kernel(const SiPixelFedCablingMapGPU *Map, const uint3 localPix.col = col; if (includeErrors and not dcolIsValid(dcol, pxid)) { uint32_t error = conversionError(fedId, 3, debug); - errType[gIndex] = error; - errWord[gIndex] = ww; - errFedID[gIndex] = fedId; - errRawID[gIndex] = rawId; + err->emplace_back(rawId, ww, error, fedId); if(debug) printf("Error status: %i %d %d %d %d\n", error, dcol, pxid, fedId, roc); continue; } @@ -587,8 +564,6 @@ __global__ void RawToDigi_kernel(const SiPixelFedCablingMapGPU *Map, const uint3 rawIdArr[gIndex] = rawId; } // end of if (gIndex < end) } // end fake loop - - } // end of Raw to Digi kernel @@ -599,7 +574,7 @@ void RawToDigi_wrapper( const uint32_t wordCounter, uint32_t *word, const uint32_t fedCounter, uint8_t *fedId_h, bool convertADCtoElectrons, uint32_t * pdigi_h, uint32_t *rawIdArr_h, - uint32_t *errType_h, uint32_t *errWord_h, uint32_t *errFedID_h, uint32_t *errRawID_h, + GPU::SimpleVector *error_h, GPU::SimpleVector *error_h_tmp, error_obj *data_h, bool useQualityInfo, bool includeErrors, bool debug, uint32_t & nModulesActive) { const int threadsPerBlock = 512; @@ -610,7 +585,11 @@ void RawToDigi_wrapper( // wordCounter is the total no of words in each event to be trasfered on device cudaCheck(cudaMemcpyAsync(&c.word_d[0], &word[0], wordCounter*sizeof(uint32_t), cudaMemcpyHostToDevice, c.stream)); cudaCheck(cudaMemcpyAsync(&c.fedId_d[0], &fedId_h[0], wordCounter*sizeof(uint8_t)/2, cudaMemcpyHostToDevice, c.stream)); - + + constexpr uint32_t vsize = sizeof(GPU::SimpleVector); + constexpr uint32_t esize = sizeof(error_obj); + cudaCheck(cudaMemcpyAsync(c.error_d, error_h_tmp, vsize, cudaMemcpyHostToDevice, c.stream)); + // Launch rawToDigi kernel RawToDigi_kernel<<>>( cablingMapDevice, @@ -621,10 +600,7 @@ void RawToDigi_wrapper( c.pdigi_d, c.rawIdArr_d, c.moduleInd_d, - c.errType_d, - c.errWord_d, - c.errFedID_d, - c.errRawID_d, + c.error_d, useQualityInfo, includeErrors, debug); @@ -636,15 +612,15 @@ void RawToDigi_wrapper( cudaCheck(cudaMemcpyAsync(rawIdArr_h, c.rawIdArr_d, wordCounter*sizeof(uint32_t), cudaMemcpyDeviceToHost, c.stream)); if (includeErrors) { - cudaCheck(cudaMemcpyAsync(errType_h, c.errType_d, wordCounter*sizeof(uint32_t), cudaMemcpyDeviceToHost, c.stream)); - cudaCheck(cudaMemcpyAsync(errWord_h, c.errWord_d, wordCounter*sizeof(uint32_t), cudaMemcpyDeviceToHost, c.stream)); - cudaCheck(cudaMemcpyAsync(errFedID_h, c.errFedID_d, wordCounter*sizeof(uint32_t), cudaMemcpyDeviceToHost, c.stream)); - cudaCheck(cudaMemcpyAsync(errRawID_h, c.errRawID_d, wordCounter*sizeof(uint32_t), cudaMemcpyDeviceToHost, c.stream)); + cudaCheck(cudaMemcpyAsync(error_h, c.error_d, vsize, cudaMemcpyDeviceToHost, c.stream)); + cudaStreamSynchronize(c.stream); + error_h->set_data(data_h); + int size = error_h->size(); + cudaCheck(cudaMemcpyAsync(data_h, c.data_d, size*esize, cudaMemcpyDeviceToHost, c.stream)); } - cudaStreamSynchronize(c.stream); // End of Raw2Digi and passing data for cluserisation - { + { // clusterizer ... using namespace gpuClustering; int threadsPerBlock = 256; diff --git a/EventFilter/SiPixelRawToDigi/plugins/RawToDigiGPU.h b/EventFilter/SiPixelRawToDigi/plugins/RawToDigiGPU.h index 7eb62c051089a..e1ed06c989e00 100644 --- a/EventFilter/SiPixelRawToDigi/plugins/RawToDigiGPU.h +++ b/EventFilter/SiPixelRawToDigi/plugins/RawToDigiGPU.h @@ -8,6 +8,7 @@ #include #include "SiPixelFedCablingMapGPU.h" +#include "HeterogeneousCore/CUDAUtilities/interface/GPUSimpleVector.h" #include const uint32_t layerStartBit_ = 20; @@ -145,6 +146,14 @@ inline uint32_t pack(uint32_t row, uint32_t col, uint32_t adc) { } +struct error_obj { + uint32_t rawId; + uint32_t word; + unsigned char errorType; + unsigned char fedId; + __host__ __device__ error_obj(uint32_t a_, uint32_t b_, unsigned char c_, unsigned char d_): + rawId(a_), word(b_), errorType(c_), fedId(d_) {} +}; // configuration and memory buffers alocated on the GPU struct context { @@ -159,12 +168,10 @@ struct context { uint16_t * moduleInd_d; uint32_t * rawIdArr_d; - uint32_t * errType_d; - uint32_t * errWord_d; - uint32_t * errFedID_d; - uint32_t * errRawID_d; - + GPU::SimpleVector * error_d; + error_obj * data_d; + // these are for the clusterizer (to be moved) uint32_t * moduleStart_d; int32_t * clus_d; @@ -176,12 +183,15 @@ struct context { // wrapper function to call RawToDigi on the GPU from host side -void RawToDigi_wrapper(context &, const SiPixelFedCablingMapGPU* cablingMapDevice, SiPixelGainForHLTonGPU * const ped, - const uint32_t wordCounter, uint32_t *word, - const uint32_t fedCounter, uint8_t *fedId_h, - bool convertADCtoElectrons, uint32_t * pdigi_h, - uint32_t *rawIdArr_h, uint32_t *errType_h, uint32_t *errWord_h, uint32_t *errFedID_h, uint32_t *errRawID_h, - bool useQualityInfo, bool includeErrors, bool debug, uint32_t & nModulesActive); +void RawToDigi_wrapper(context &, const SiPixelFedCablingMapGPU* cablingMapDevice, + SiPixelGainForHLTonGPU * const ped, + const uint32_t wordCounter, uint32_t *word, + const uint32_t fedCounter, uint8_t *fedId_h, + bool convertADCtoElectrons, uint32_t * pdigi_h, + uint32_t *rawIdArr_h, GPU::SimpleVector *error_h, + GPU::SimpleVector *error_h_tmp, error_obj *data_h, + bool useQualityInfo, bool includeErrors, bool debug, + uint32_t & nModulesActive); // void initCablingMap(); context initDeviceMemory(); diff --git a/EventFilter/SiPixelRawToDigi/plugins/SiPixelFedCablingMapGPU.cc b/EventFilter/SiPixelRawToDigi/plugins/SiPixelFedCablingMapGPU.cc index 81bffcbc4b988..3a9bacb4426c5 100644 --- a/EventFilter/SiPixelRawToDigi/plugins/SiPixelFedCablingMapGPU.cc +++ b/EventFilter/SiPixelRawToDigi/plugins/SiPixelFedCablingMapGPU.cc @@ -90,9 +90,9 @@ void processCablingMap(SiPixelFedCablingMap const& cablingMap, TrackerGeometry moduleId[i] = gdet->index(); } LogDebug("SiPixelFedCablingMapGPU") << "----------------------------------------------------------------------------" << std::endl; - LogDebug("SiPixelFedCablingMapGPU") << i << std::setw(20) << fedMap[i] << std::setw(20) << linkMap[i] << std::setw(20) << rocMap[i] << std::endl; - LogDebug("SiPixelFedCablingMapGPU") << i << std::setw(20) << RawId[i] << std::setw(20) << rocInDet[i] << std::setw(20) << moduleId[i] << std::endl; - LogDebug("SiPixelFedCablingMapGPU") << i << std::setw(20) << (bool) badRocs[i] << std::setw(20) << (bool) modToUnp[i] << std::endl; + LogDebug("SiPixelFedCablingMapGPU") << i << std::setw(20) << fedMap[i] << std::setw(20) << linkMap[i] << std::setw(20) << rocMap[i] << std::endl; + LogDebug("SiPixelFedCablingMapGPU") << i << std::setw(20) << RawId[i] << std::setw(20) << rocInDet[i] << std::setw(20) << moduleId[i] << std::endl; + LogDebug("SiPixelFedCablingMapGPU") << i << std::setw(20) << (bool)badRocs[i] << std::setw(20) << (bool)modToUnp[i] << std::endl; LogDebug("SiPixelFedCablingMapGPU") << "----------------------------------------------------------------------------" << std::endl; } diff --git a/EventFilter/SiPixelRawToDigi/plugins/SiPixelRawToDigiGPU.cc b/EventFilter/SiPixelRawToDigi/plugins/SiPixelRawToDigiGPU.cc index 06e12f246ee49..6d195792aa57a 100644 --- a/EventFilter/SiPixelRawToDigi/plugins/SiPixelRawToDigiGPU.cc +++ b/EventFilter/SiPixelRawToDigi/plugins/SiPixelRawToDigiGPU.cc @@ -64,8 +64,9 @@ SiPixelRawToDigiGPU::SiPixelRawToDigiGPU( const edm::ParameterSet& conf ) usererrorlist = config_.getParameter > ("UserErrorList"); } tFEDRawDataCollection = consumes (config_.getParameter("InputLabel")); + debug = config_.getParameter("enableErrorDebug"); - // start counters + //start counters ndigis = 0; nwords = 0; @@ -127,16 +128,24 @@ SiPixelRawToDigiGPU::SiPixelRawToDigiGPU( const edm::ParameterSet& conf ) cudaMallocHost(&pdigi_h, sizeof(uint32_t)*WSIZE); cudaMallocHost(&rawIdArr_h, sizeof(uint32_t)*WSIZE); - cudaMallocHost(&errType_h, sizeof(uint32_t)*WSIZE); - cudaMallocHost(&errRawID_h, sizeof(uint32_t)*WSIZE); - cudaMallocHost(&errWord_h, sizeof(uint32_t)*WSIZE); - cudaMallocHost(&errFedID_h, sizeof(uint32_t)*WSIZE); + uint32_t vsize = sizeof(GPU::SimpleVector); + uint32_t esize = sizeof(error_obj); + cudaCheck(cudaMallocHost(&error_h, vsize)); + cudaCheck(cudaMallocHost(&error_h_tmp, vsize)); + cudaCheck(cudaMallocHost(&data_h, MAX_FED*MAX_WORD*esize)); cudaMallocHost(&mIndexStart_h, sizeof(int)*(NMODULE+1)); cudaMallocHost(&mIndexEnd_h, sizeof(int)*(NMODULE+1)); // allocate memory for RawToDigi on GPU context_ = initDeviceMemory(); + + new (error_h) GPU::SimpleVector(MAX_FED*MAX_WORD, data_h); + new (error_h_tmp) GPU::SimpleVector(MAX_FED*MAX_WORD, context_.data_d); + assert(error_h->size() == 0); + assert(error_h->capacity() == static_cast(MAX_FED*MAX_WORD)); + assert(error_h_tmp->size() == 0); + assert(error_h_tmp->capacity() == static_cast(MAX_FED*MAX_WORD)); } // ----------------------------------------------------------------------------- @@ -154,10 +163,9 @@ SiPixelRawToDigiGPU::~SiPixelRawToDigiGPU() { cudaFreeHost(fedId_h); cudaFreeHost(pdigi_h); cudaFreeHost(rawIdArr_h); - cudaFreeHost(errType_h); - cudaFreeHost(errRawID_h); - cudaFreeHost(errWord_h); - cudaFreeHost(errFedID_h); + cudaFreeHost(error_h); + cudaFreeHost(error_h_tmp); + cudaFreeHost(data_h); cudaFreeHost(mIndexStart_h); cudaFreeHost(mIndexEnd_h); @@ -209,6 +217,7 @@ SiPixelRawToDigiGPU::fillDescriptions(edm::ConfigurationDescriptions& descriptio desc.add("CablingMapLabel","")->setComment("CablingMap label"); //Tav desc.addOptional("CheckPixelOrder"); // never used, kept for back-compatibility desc.add("ConvertADCtoElectrons", false)->setComment("## do the calibration ADC-> Electron and apply the threshold, requried for clustering"); + desc.add("enableErrorDebug",false); descriptions.add("siPixelRawToDigiGPU",desc); } @@ -222,7 +231,6 @@ SiPixelRawToDigiGPU::produce( edm::Event& ev, const edm::EventSetup& es) int theWordCounter = 0; int theDigiCounter = 0; const uint32_t dummydetid = 0xffffffff; - debug = edm::MessageDrop::instance()->debugEnabled; // initialize quality record or update if necessary if (qualityWatcher.check( es ) && useQuality) { @@ -347,8 +355,9 @@ SiPixelRawToDigiGPU::produce( edm::Event& ev, const edm::EventSetup& es) // GPU specific: RawToDigi -> clustering uint32_t nModulesActive=0; - RawToDigi_wrapper(context_, cablingMapGPUDevice_, gainForHLTonGPU_, wordCounterGPU, word, fedCounter, fedId_h, convertADCtoElectrons, pdigi_h, - rawIdArr_h, errType_h, errWord_h, errFedID_h, errRawID_h, useQuality, includeErrors, debug, nModulesActive); + RawToDigi_wrapper(context_, cablingMapGPUDevice_, gainForHLTonGPU_, wordCounterGPU, word, fedCounter, + fedId_h, convertADCtoElectrons, pdigi_h, rawIdArr_h, error_h, error_h_tmp, data_h, + useQuality, includeErrors, debug,nModulesActive); auto gpuProd = std::make_unique>(); gpuProd->resize(3); @@ -377,10 +386,12 @@ SiPixelRawToDigiGPU::produce( edm::Event& ev, const edm::EventSetup& es) theDigiCounter++; } - for (uint32_t i = 0; i < wordCounterGPU; i++) { - if (errType_h[i] != 0) { - SiPixelRawDataError error(errWord_h[i], errType_h[i], errFedID_h[i]+1200); - errors[errRawID_h[i]].push_back(error); + auto size = error_h->size(); + for (auto i = 0; i < size; i++) { + error_obj err = (*error_h)[i]; + if (err.errorType != 0) { + SiPixelRawDataError error(err.word, err.errorType, err.fedId + 1200); + errors[err.rawId].push_back(error); } } diff --git a/EventFilter/SiPixelRawToDigi/plugins/SiPixelRawToDigiGPU.h b/EventFilter/SiPixelRawToDigi/plugins/SiPixelRawToDigiGPU.h index 5b56edaab9282..272c49c740ca9 100644 --- a/EventFilter/SiPixelRawToDigi/plugins/SiPixelRawToDigiGPU.h +++ b/EventFilter/SiPixelRawToDigi/plugins/SiPixelRawToDigiGPU.h @@ -74,7 +74,9 @@ class SiPixelRawToDigiGPU : public edm::stream::EDProducer<> { // to store the output uint32_t *pdigi_h, *rawIdArr_h; // host copy of output - uint32_t *errType_h, *errWord_h, *errFedID_h, *errRawID_h; // host copy of output + error_obj *data_h = nullptr; + GPU::SimpleVector *error_h = nullptr; + GPU::SimpleVector *error_h_tmp = nullptr; // store the start and end index for each module (total 1856 modules-phase 1) int *mIndexStart_h, *mIndexEnd_h; diff --git a/EventFilter/SiPixelRawToDigi/python/SiPixelRawToDigi_cfi.py b/EventFilter/SiPixelRawToDigi/python/SiPixelRawToDigi_cfi.py index aabe584943683..68ae0c1706807 100644 --- a/EventFilter/SiPixelRawToDigi/python/SiPixelRawToDigi_cfi.py +++ b/EventFilter/SiPixelRawToDigi/python/SiPixelRawToDigi_cfi.py @@ -35,6 +35,7 @@ ## Empty Regions PSet means complete unpacking siPixelDigisGPU.Regions = cms.PSet( ) siPixelDigisGPU.CablingMapLabel = cms.string("") +siPixelDigisGPU.enableErrorDebug = cms.bool(False) from Configuration.Eras.Modifier_phase1Pixel_cff import phase1Pixel phase1Pixel.toModify(siPixelDigis, UsePhase1=True) diff --git a/HeterogeneousCore/CUDAUtilities/interface/GPUSimpleVector.h b/HeterogeneousCore/CUDAUtilities/interface/GPUSimpleVector.h index 21353808bea01..94769ca7351d1 100644 --- a/HeterogeneousCore/CUDAUtilities/interface/GPUSimpleVector.h +++ b/HeterogeneousCore/CUDAUtilities/interface/GPUSimpleVector.h @@ -11,7 +11,7 @@ namespace GPU { template struct SimpleVector { // Constructors __host__ __device__ SimpleVector(int capacity, T *data) // ownership of m_data stays within the caller - : m_size(0), m_data(data), m_capacity(capacity) { + : m_size(0), m_capacity(capacity), m_data(data) { static_assert(std::is_trivially_destructible::value); } @@ -83,6 +83,11 @@ template struct SimpleVector { __inline__ __host__ __device__ int capacity() const { return m_capacity; } __inline__ __host__ __device__ T *data() const { return m_data; } + + __inline__ __host__ __device__ void resize(int size) { m_size = size; } + + __inline__ __host__ __device__ void set_data(T * data) { m_data = data; } + private: int m_size; @@ -93,3 +98,4 @@ template struct SimpleVector { } // namespace GPU #endif // HeterogeneousCore_CUDAUtilities_GPUSimpleVector_h +