Skip to content

Commit

Permalink
R2D: use GPU::SimpleVector for the error unpacking (#14)
Browse files Browse the repository at this point in the history
  • Loading branch information
Cesare Calabria authored and fwyzard committed Feb 20, 2018
1 parent 5cd293f commit 2f625fc
Show file tree
Hide file tree
Showing 7 changed files with 86 additions and 80 deletions.
72 changes: 24 additions & 48 deletions EventFilter/SiPixelRawToDigi/plugins/RawToDigiGPU.cu
Original file line number Diff line number Diff line change
Expand Up @@ -46,6 +46,9 @@ context initDeviceMemory() {
constexpr uint32_t MAX_WORD08_SIZE = MAX_FED * MAX_WORD * sizeof(uint8_t);
constexpr uint32_t MAX_WORD32_SIZE = MAX_FED * MAX_WORD * sizeof(uint32_t);
constexpr uint32_t MAX_WORD16_SIZE = MAX_FED * MAX_WORD * sizeof(uint16_t);
constexpr uint32_t vsize = sizeof(GPU::SimpleVector<error_obj>);
constexpr uint32_t esize = sizeof(error_obj);
constexpr uint32_t MAX_ERROR_SIZE = MAX_FED * MAX_WORD * esize;

cudaCheck(cudaMalloc((void**) & c.word_d, MAX_WORD32_SIZE));
cudaCheck(cudaMalloc((void**) & c.fedId_d, MAX_WORD08_SIZE));
Expand All @@ -56,11 +59,8 @@ context initDeviceMemory() {

cudaCheck(cudaMalloc((void**) & c.moduleInd_d, MAX_WORD16_SIZE));
cudaCheck(cudaMalloc((void**) & c.rawIdArr_d, MAX_WORD32_SIZE));
cudaCheck(cudaMalloc((void**) & c.errType_d, MAX_WORD32_SIZE));
cudaCheck(cudaMalloc((void**) & c.errWord_d, MAX_WORD32_SIZE));
cudaCheck(cudaMalloc((void**) & c.errFedID_d, MAX_WORD32_SIZE));
cudaCheck(cudaMalloc((void**) & c.errRawID_d, MAX_WORD32_SIZE));

cudaCheck(cudaMalloc((void**) & c.error_d, vsize));
cudaCheck(cudaMalloc((void**) & c.data_d, MAX_ERROR_SIZE));

// for the clusterizer
cudaCheck(cudaMalloc((void**) & c.clus_d, MAX_WORD32_SIZE)); // cluser index in module
Expand All @@ -71,7 +71,6 @@ context initDeviceMemory() {

cudaCheck(cudaMalloc((void**) & c.debug_d, MAX_WORD32_SIZE));


// create a CUDA stream
cudaCheck(cudaStreamCreate(&c.stream));

Expand All @@ -89,10 +88,8 @@ void freeMemory(context & c) {
cudaCheck(cudaFree(c.adc_d));
cudaCheck(cudaFree(c.moduleInd_d));
cudaCheck(cudaFree(c.rawIdArr_d));
cudaCheck(cudaFree(c.errType_d));
cudaCheck(cudaFree(c.errWord_d));
cudaCheck(cudaFree(c.errFedID_d));
cudaCheck(cudaFree(c.errRawID_d));
cudaCheck(cudaFree(c.error_d));
cudaCheck(cudaFree(c.data_d));

// these are for the clusterizer (to be moved)
cudaCheck(cudaFree(c.moduleStart_d));
Expand Down Expand Up @@ -453,11 +450,10 @@ __device__ uint32_t getErrRawID(uint32_t fedId, uint32_t errWord, uint32_t error
__global__ void RawToDigi_kernel(const SiPixelFedCablingMapGPU *Map, const uint32_t wordCounter, const uint32_t *Word, const uint8_t *fedIds,
uint16_t * XX, uint16_t * YY, uint16_t * ADC,
uint32_t * pdigi, uint32_t *rawIdArr, uint16_t * moduleId,
uint32_t *errType, uint32_t *errWord, uint32_t *errFedID, uint32_t *errRawID,
GPU::SimpleVector<error_obj> *err,
bool useQualityInfo, bool includeErrors, bool debug)
{
uint32_t blockId = blockIdx.x;

uint32_t threadId = threadIdx.x;

bool skipROC = false;
Expand All @@ -474,14 +470,7 @@ __global__ void RawToDigi_kernel(const SiPixelFedCablingMapGPU *Map, const uint3
rawIdArr[gIndex] = 0;
moduleId[gIndex] = 9999;


uint32_t ww = Word[gIndex]; // Array containing 32 bit raw data
if (includeErrors) {
errType[gIndex] = 0;
errWord[gIndex] = ww;
errFedID[gIndex] = fedId;
errRawID[gIndex] = 0;
}
if (ww == 0) {
//noise and dead channels are ignored
XX[gIndex] = 0; // 0 is an indicator of a noise/dead channel
Expand All @@ -490,25 +479,19 @@ __global__ void RawToDigi_kernel(const SiPixelFedCablingMapGPU *Map, const uint3
continue ; // 0: bad word
}


uint32_t link = getLink(ww); // Extract link
uint32_t roc = getRoc(ww); // Extract Roc in link
DetIdGPU detId = getRawId(Map, fedId, link, roc);


uint32_t errorType = checkROC(ww, fedId, link, Map, debug);
skipROC = (roc < maxROCIndex) ? false : (errorType != 0);
if (includeErrors and skipROC)
{
uint32_t rID = getErrRawID(fedId, ww, errorType, Map, debug);
errType[gIndex] = errorType;
errWord[gIndex] = ww;
errFedID[gIndex] = fedId;
errRawID[gIndex] = rID;
err->emplace_back(rID, ww, errorType, fedId);
continue;
}


uint32_t rawId = detId.RawId;
uint32_t rocIdInDetUnit = detId.rocInDet;
bool barrel = isBarrel(rawId);
Expand Down Expand Up @@ -551,10 +534,7 @@ __global__ void RawToDigi_kernel(const SiPixelFedCablingMapGPU *Map, const uint3
if (includeErrors) {
if (not rocRowColIsValid(row, col)) {
uint32_t error = conversionError(fedId, 3, debug); //use the device function and fill the arrays
errType[gIndex] = error;
errWord[gIndex] = ww;
errFedID[gIndex] = fedId;
errRawID[gIndex] = rawId;
err->emplace_back(rawId, ww, error, fedId);
if(debug) printf("BPIX1 Error status: %i\n", error);
continue;
}
Expand All @@ -569,10 +549,7 @@ __global__ void RawToDigi_kernel(const SiPixelFedCablingMapGPU *Map, const uint3
localPix.col = col;
if (includeErrors and not dcolIsValid(dcol, pxid)) {
uint32_t error = conversionError(fedId, 3, debug);
errType[gIndex] = error;
errWord[gIndex] = ww;
errFedID[gIndex] = fedId;
errRawID[gIndex] = rawId;
err->emplace_back(rawId, ww, error, fedId);
if(debug) printf("Error status: %i %d %d %d %d\n", error, dcol, pxid, fedId, roc);
continue;
}
Expand All @@ -587,8 +564,6 @@ __global__ void RawToDigi_kernel(const SiPixelFedCablingMapGPU *Map, const uint3
rawIdArr[gIndex] = rawId;
} // end of if (gIndex < end)
} // end fake loop


} // end of Raw to Digi kernel


Expand All @@ -599,7 +574,7 @@ void RawToDigi_wrapper(
const uint32_t wordCounter, uint32_t *word, const uint32_t fedCounter, uint8_t *fedId_h,
bool convertADCtoElectrons,
uint32_t * pdigi_h, uint32_t *rawIdArr_h,
uint32_t *errType_h, uint32_t *errWord_h, uint32_t *errFedID_h, uint32_t *errRawID_h,
GPU::SimpleVector<error_obj> *error_h, GPU::SimpleVector<error_obj> *error_h_tmp, error_obj *data_h,
bool useQualityInfo, bool includeErrors, bool debug, uint32_t & nModulesActive)
{
const int threadsPerBlock = 512;
Expand All @@ -610,7 +585,11 @@ void RawToDigi_wrapper(
// wordCounter is the total no of words in each event to be trasfered on device
cudaCheck(cudaMemcpyAsync(&c.word_d[0], &word[0], wordCounter*sizeof(uint32_t), cudaMemcpyHostToDevice, c.stream));
cudaCheck(cudaMemcpyAsync(&c.fedId_d[0], &fedId_h[0], wordCounter*sizeof(uint8_t)/2, cudaMemcpyHostToDevice, c.stream));


constexpr uint32_t vsize = sizeof(GPU::SimpleVector<error_obj>);
constexpr uint32_t esize = sizeof(error_obj);
cudaCheck(cudaMemcpyAsync(c.error_d, error_h_tmp, vsize, cudaMemcpyHostToDevice, c.stream));

// Launch rawToDigi kernel
RawToDigi_kernel<<<blocks, threadsPerBlock, 0, c.stream>>>(
cablingMapDevice,
Expand All @@ -621,10 +600,7 @@ void RawToDigi_wrapper(
c.pdigi_d,
c.rawIdArr_d,
c.moduleInd_d,
c.errType_d,
c.errWord_d,
c.errFedID_d,
c.errRawID_d,
c.error_d,
useQualityInfo,
includeErrors,
debug);
Expand All @@ -636,15 +612,15 @@ void RawToDigi_wrapper(
cudaCheck(cudaMemcpyAsync(rawIdArr_h, c.rawIdArr_d, wordCounter*sizeof(uint32_t), cudaMemcpyDeviceToHost, c.stream));

if (includeErrors) {
cudaCheck(cudaMemcpyAsync(errType_h, c.errType_d, wordCounter*sizeof(uint32_t), cudaMemcpyDeviceToHost, c.stream));
cudaCheck(cudaMemcpyAsync(errWord_h, c.errWord_d, wordCounter*sizeof(uint32_t), cudaMemcpyDeviceToHost, c.stream));
cudaCheck(cudaMemcpyAsync(errFedID_h, c.errFedID_d, wordCounter*sizeof(uint32_t), cudaMemcpyDeviceToHost, c.stream));
cudaCheck(cudaMemcpyAsync(errRawID_h, c.errRawID_d, wordCounter*sizeof(uint32_t), cudaMemcpyDeviceToHost, c.stream));
cudaCheck(cudaMemcpyAsync(error_h, c.error_d, vsize, cudaMemcpyDeviceToHost, c.stream));
cudaStreamSynchronize(c.stream);
error_h->set_data(data_h);
int size = error_h->size();
cudaCheck(cudaMemcpyAsync(data_h, c.data_d, size*esize, cudaMemcpyDeviceToHost, c.stream));
}
cudaStreamSynchronize(c.stream);
// End of Raw2Digi and passing data for cluserisation

{
{
// clusterizer ...
using namespace gpuClustering;
int threadsPerBlock = 256;
Expand Down
32 changes: 21 additions & 11 deletions EventFilter/SiPixelRawToDigi/plugins/RawToDigiGPU.h
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,7 @@
#include <cuda_runtime.h>

#include "SiPixelFedCablingMapGPU.h"
#include "HeterogeneousCore/CUDAUtilities/interface/GPUSimpleVector.h"
#include<algorithm>

const uint32_t layerStartBit_ = 20;
Expand Down Expand Up @@ -145,6 +146,14 @@ inline uint32_t pack(uint32_t row, uint32_t col, uint32_t adc) {

}

struct error_obj {
uint32_t rawId;
uint32_t word;
unsigned char errorType;
unsigned char fedId;
__host__ __device__ error_obj(uint32_t a_, uint32_t b_, unsigned char c_, unsigned char d_):
rawId(a_), word(b_), errorType(c_), fedId(d_) {}
};

// configuration and memory buffers alocated on the GPU
struct context {
Expand All @@ -159,12 +168,10 @@ struct context {

uint16_t * moduleInd_d;
uint32_t * rawIdArr_d;
uint32_t * errType_d;
uint32_t * errWord_d;
uint32_t * errFedID_d;
uint32_t * errRawID_d;


GPU::SimpleVector<error_obj> * error_d;
error_obj * data_d;

// these are for the clusterizer (to be moved)
uint32_t * moduleStart_d;
int32_t * clus_d;
Expand All @@ -176,12 +183,15 @@ struct context {


// wrapper function to call RawToDigi on the GPU from host side
void RawToDigi_wrapper(context &, const SiPixelFedCablingMapGPU* cablingMapDevice, SiPixelGainForHLTonGPU * const ped,
const uint32_t wordCounter, uint32_t *word,
const uint32_t fedCounter, uint8_t *fedId_h,
bool convertADCtoElectrons, uint32_t * pdigi_h,
uint32_t *rawIdArr_h, uint32_t *errType_h, uint32_t *errWord_h, uint32_t *errFedID_h, uint32_t *errRawID_h,
bool useQualityInfo, bool includeErrors, bool debug, uint32_t & nModulesActive);
void RawToDigi_wrapper(context &, const SiPixelFedCablingMapGPU* cablingMapDevice,
SiPixelGainForHLTonGPU * const ped,
const uint32_t wordCounter, uint32_t *word,
const uint32_t fedCounter, uint8_t *fedId_h,
bool convertADCtoElectrons, uint32_t * pdigi_h,
uint32_t *rawIdArr_h, GPU::SimpleVector<error_obj> *error_h,
GPU::SimpleVector<error_obj> *error_h_tmp, error_obj *data_h,
bool useQualityInfo, bool includeErrors, bool debug,
uint32_t & nModulesActive);

// void initCablingMap();
context initDeviceMemory();
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -90,9 +90,9 @@ void processCablingMap(SiPixelFedCablingMap const& cablingMap, TrackerGeometry
moduleId[i] = gdet->index();
}
LogDebug("SiPixelFedCablingMapGPU") << "----------------------------------------------------------------------------" << std::endl;
LogDebug("SiPixelFedCablingMapGPU") << i << std::setw(20) << fedMap[i] << std::setw(20) << linkMap[i] << std::setw(20) << rocMap[i] << std::endl;
LogDebug("SiPixelFedCablingMapGPU") << i << std::setw(20) << RawId[i] << std::setw(20) << rocInDet[i] << std::setw(20) << moduleId[i] << std::endl;
LogDebug("SiPixelFedCablingMapGPU") << i << std::setw(20) << (bool) badRocs[i] << std::setw(20) << (bool) modToUnp[i] << std::endl;
LogDebug("SiPixelFedCablingMapGPU") << i << std::setw(20) << fedMap[i] << std::setw(20) << linkMap[i] << std::setw(20) << rocMap[i] << std::endl;
LogDebug("SiPixelFedCablingMapGPU") << i << std::setw(20) << RawId[i] << std::setw(20) << rocInDet[i] << std::setw(20) << moduleId[i] << std::endl;
LogDebug("SiPixelFedCablingMapGPU") << i << std::setw(20) << (bool)badRocs[i] << std::setw(20) << (bool)modToUnp[i] << std::endl;
LogDebug("SiPixelFedCablingMapGPU") << "----------------------------------------------------------------------------" << std::endl;
}

Expand Down
43 changes: 27 additions & 16 deletions EventFilter/SiPixelRawToDigi/plugins/SiPixelRawToDigiGPU.cc
Original file line number Diff line number Diff line change
Expand Up @@ -64,8 +64,9 @@ SiPixelRawToDigiGPU::SiPixelRawToDigiGPU( const edm::ParameterSet& conf )
usererrorlist = config_.getParameter<std::vector<int> > ("UserErrorList");
}
tFEDRawDataCollection = consumes <FEDRawDataCollection> (config_.getParameter<edm::InputTag>("InputLabel"));
debug = config_.getParameter<bool>("enableErrorDebug");

// start counters
//start counters
ndigis = 0;
nwords = 0;

Expand Down Expand Up @@ -127,16 +128,24 @@ SiPixelRawToDigiGPU::SiPixelRawToDigiGPU( const edm::ParameterSet& conf )
cudaMallocHost(&pdigi_h, sizeof(uint32_t)*WSIZE);
cudaMallocHost(&rawIdArr_h, sizeof(uint32_t)*WSIZE);

cudaMallocHost(&errType_h, sizeof(uint32_t)*WSIZE);
cudaMallocHost(&errRawID_h, sizeof(uint32_t)*WSIZE);
cudaMallocHost(&errWord_h, sizeof(uint32_t)*WSIZE);
cudaMallocHost(&errFedID_h, sizeof(uint32_t)*WSIZE);
uint32_t vsize = sizeof(GPU::SimpleVector<error_obj>);
uint32_t esize = sizeof(error_obj);
cudaCheck(cudaMallocHost(&error_h, vsize));
cudaCheck(cudaMallocHost(&error_h_tmp, vsize));
cudaCheck(cudaMallocHost(&data_h, MAX_FED*MAX_WORD*esize));

cudaMallocHost(&mIndexStart_h, sizeof(int)*(NMODULE+1));
cudaMallocHost(&mIndexEnd_h, sizeof(int)*(NMODULE+1));

// allocate memory for RawToDigi on GPU
context_ = initDeviceMemory();

new (error_h) GPU::SimpleVector<error_obj>(MAX_FED*MAX_WORD, data_h);
new (error_h_tmp) GPU::SimpleVector<error_obj>(MAX_FED*MAX_WORD, context_.data_d);
assert(error_h->size() == 0);
assert(error_h->capacity() == static_cast<int>(MAX_FED*MAX_WORD));
assert(error_h_tmp->size() == 0);
assert(error_h_tmp->capacity() == static_cast<int>(MAX_FED*MAX_WORD));
}

// -----------------------------------------------------------------------------
Expand All @@ -154,10 +163,9 @@ SiPixelRawToDigiGPU::~SiPixelRawToDigiGPU() {
cudaFreeHost(fedId_h);
cudaFreeHost(pdigi_h);
cudaFreeHost(rawIdArr_h);
cudaFreeHost(errType_h);
cudaFreeHost(errRawID_h);
cudaFreeHost(errWord_h);
cudaFreeHost(errFedID_h);
cudaFreeHost(error_h);
cudaFreeHost(error_h_tmp);
cudaFreeHost(data_h);
cudaFreeHost(mIndexStart_h);
cudaFreeHost(mIndexEnd_h);

Expand Down Expand Up @@ -209,6 +217,7 @@ SiPixelRawToDigiGPU::fillDescriptions(edm::ConfigurationDescriptions& descriptio
desc.add<std::string>("CablingMapLabel","")->setComment("CablingMap label"); //Tav
desc.addOptional<bool>("CheckPixelOrder"); // never used, kept for back-compatibility
desc.add<bool>("ConvertADCtoElectrons", false)->setComment("## do the calibration ADC-> Electron and apply the threshold, requried for clustering");
desc.add<bool>("enableErrorDebug",false);
descriptions.add("siPixelRawToDigiGPU",desc);
}

Expand All @@ -222,7 +231,6 @@ SiPixelRawToDigiGPU::produce( edm::Event& ev, const edm::EventSetup& es)
int theWordCounter = 0;
int theDigiCounter = 0;
const uint32_t dummydetid = 0xffffffff;
debug = edm::MessageDrop::instance()->debugEnabled;

// initialize quality record or update if necessary
if (qualityWatcher.check( es ) && useQuality) {
Expand Down Expand Up @@ -347,8 +355,9 @@ SiPixelRawToDigiGPU::produce( edm::Event& ev, const edm::EventSetup& es)

// GPU specific: RawToDigi -> clustering
uint32_t nModulesActive=0;
RawToDigi_wrapper(context_, cablingMapGPUDevice_, gainForHLTonGPU_, wordCounterGPU, word, fedCounter, fedId_h, convertADCtoElectrons, pdigi_h,
rawIdArr_h, errType_h, errWord_h, errFedID_h, errRawID_h, useQuality, includeErrors, debug, nModulesActive);
RawToDigi_wrapper(context_, cablingMapGPUDevice_, gainForHLTonGPU_, wordCounterGPU, word, fedCounter,
fedId_h, convertADCtoElectrons, pdigi_h, rawIdArr_h, error_h, error_h_tmp, data_h,
useQuality, includeErrors, debug,nModulesActive);

auto gpuProd = std::make_unique<std::vector<unsigned long long>>();
gpuProd->resize(3);
Expand Down Expand Up @@ -377,10 +386,12 @@ SiPixelRawToDigiGPU::produce( edm::Event& ev, const edm::EventSetup& es)
theDigiCounter++;
}

for (uint32_t i = 0; i < wordCounterGPU; i++) {
if (errType_h[i] != 0) {
SiPixelRawDataError error(errWord_h[i], errType_h[i], errFedID_h[i]+1200);
errors[errRawID_h[i]].push_back(error);
auto size = error_h->size();
for (auto i = 0; i < size; i++) {
error_obj err = (*error_h)[i];
if (err.errorType != 0) {
SiPixelRawDataError error(err.word, err.errorType, err.fedId + 1200);
errors[err.rawId].push_back(error);
}
}

Expand Down
4 changes: 3 additions & 1 deletion EventFilter/SiPixelRawToDigi/plugins/SiPixelRawToDigiGPU.h
Original file line number Diff line number Diff line change
Expand Up @@ -74,7 +74,9 @@ class SiPixelRawToDigiGPU : public edm::stream::EDProducer<> {

// to store the output
uint32_t *pdigi_h, *rawIdArr_h; // host copy of output
uint32_t *errType_h, *errWord_h, *errFedID_h, *errRawID_h; // host copy of output
error_obj *data_h = nullptr;
GPU::SimpleVector<error_obj> *error_h = nullptr;
GPU::SimpleVector<error_obj> *error_h_tmp = nullptr;
// store the start and end index for each module (total 1856 modules-phase 1)
int *mIndexStart_h, *mIndexEnd_h;

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -35,6 +35,7 @@
## Empty Regions PSet means complete unpacking
siPixelDigisGPU.Regions = cms.PSet( )
siPixelDigisGPU.CablingMapLabel = cms.string("")
siPixelDigisGPU.enableErrorDebug = cms.bool(False)

from Configuration.Eras.Modifier_phase1Pixel_cff import phase1Pixel
phase1Pixel.toModify(siPixelDigis, UsePhase1=True)
Expand Down
Loading

0 comments on commit 2f625fc

Please sign in to comment.