Skip to content

Commit

Permalink
Migrated PixelRecHit to Heterogeneous producer (cms-sw#81)
Browse files Browse the repository at this point in the history
Migrate PixelRecHit EDProducer to HeterogeneousEDProducer, including the cpu product.
Data structures on gpu now include everything needed for Doublets, CA and fit.
Layer splitting done: phi sorting (or partial sorting) requires cms-sw#69.
Includes some cleanup and bug fixes.
  • Loading branch information
VinInn authored and fwyzard committed Jun 29, 2018
1 parent 10d59f2 commit af7cd9e
Show file tree
Hide file tree
Showing 9 changed files with 293 additions and 67 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,15 @@ namespace phase1PixelTopology {

constexpr uint32_t numPixsInModule = uint32_t(numRowsInModule)* uint32_t(numColsInModule);

constexpr uint32_t numberOfModules = 1856;

constexpr uint32_t layerStart[11] = {0,96,320,672,1184,1296,1408,1520,1632,1744,1856};
constexpr char const * layerName[10] = {"BL1","BL2","BL3","BL4",
"E+1", "E+2", "E+3",
"E-1", "E-2", "E-3"
};


// this is for the ROC n<512 (upgrade 1024)
constexpr inline
uint16_t divu52(uint16_t n) {
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -17,16 +17,23 @@
striptrackerlocalreco = cms.Sequence(siStripZeroSuppression*siStripClusters*siStripMatchedRecHits)
trackerlocalreco = cms.Sequence(pixeltrackerlocalreco*striptrackerlocalreco*clusterSummaryProducer)


from RecoLocalTracker.SiPixelClusterizer.siPixelClustersHeterogeneous_cfi import *
from RecoLocalTracker.SiPixelClusterizer.siPixelFedCablingMapGPUWrapper_cfi import *
from CalibTracker.SiPixelESProducers.siPixelGainCalibrationForHLTGPU_cfi import *

from RecoLocalTracker.SiPixelRecHits.siPixelRecHitHeterogeneous_cfi import *
from RecoLocalTracker.SiPixelRecHits.siPixelRecHitHeterogeneousConverter_cfi import siPixelRecHitHeterogeneousConverter as _siPixelRecHitHeterogeneousConverter
gpu.toReplaceWith(siPixelRecHitsPreSplitting, _siPixelRecHitHeterogeneousConverter.clone())



from Configuration.ProcessModifiers.gpu_cff import gpu
_pixeltrackerlocalreco_gpu = pixeltrackerlocalreco.copy()
_pixeltrackerlocalreco_gpu.replace(siPixelClustersPreSplitting, siPixelClustersHeterogeneous+siPixelClustersPreSplitting)
_pixeltrackerlocalreco_gpu.replace(siPixelRecHitsPreSplitting, siPixelRecHitHeterogeneous+siPixelRecHitsPreSplitting)
gpu.toReplaceWith(pixeltrackerlocalreco, _pixeltrackerlocalreco_gpu)


from RecoLocalTracker.SiPhase2Clusterizer.phase2TrackerClusterizer_cfi import *
from RecoLocalTracker.Phase2TrackerRecHits.Phase2StripCPEGeometricESProducer_cfi import *

Expand Down
65 changes: 59 additions & 6 deletions RecoLocalTracker/SiPixelRecHits/plugins/PixelRecHits.cu
Original file line number Diff line number Diff line change
Expand Up @@ -17,31 +17,60 @@
#include "gpuPixelRecHits.h"

namespace pixelgpudetails {
PixelRecHitGPUKernel::PixelRecHitGPUKernel() {
PixelRecHitGPUKernel::PixelRecHitGPUKernel(cuda::stream_t<>& cudaStream) {

cudaCheck(cudaMalloc((void**) & gpu_.bs_d,3*sizeof(float)));
cudaCheck(cudaMalloc((void**) & gpu_.hitsModuleStart_d,(gpuClustering::MaxNumModules+1)*sizeof(uint32_t)));
cudaCheck(cudaMalloc((void**) & gpu_.hitsLayerStart_d,(11)*sizeof(uint32_t)));
cudaCheck(cudaMalloc((void**) & gpu_.charge_d,(gpuClustering::MaxNumModules*256)*sizeof(float)));
cudaCheck(cudaMalloc((void**) & gpu_.detInd_d,(gpuClustering::MaxNumModules*256)*sizeof(uint16_t)));
cudaCheck(cudaMalloc((void**) & gpu_.xg_d,(gpuClustering::MaxNumModules*256)*sizeof(float)));
cudaCheck(cudaMalloc((void**) & gpu_.yg_d,(gpuClustering::MaxNumModules*256)*sizeof(float)));
cudaCheck(cudaMalloc((void**) & gpu_.zg_d,(gpuClustering::MaxNumModules*256)*sizeof(float)));
cudaCheck(cudaMalloc((void**) & gpu_.rg_d,(gpuClustering::MaxNumModules*256)*sizeof(float)));
cudaCheck(cudaMalloc((void**) & gpu_.xl_d,(gpuClustering::MaxNumModules*256)*sizeof(float)));
cudaCheck(cudaMalloc((void**) & gpu_.yl_d,(gpuClustering::MaxNumModules*256)*sizeof(float)));
cudaCheck(cudaMalloc((void**) & gpu_.xerr_d,(gpuClustering::MaxNumModules*256)*sizeof(float)));
cudaCheck(cudaMalloc((void**) & gpu_.yerr_d,(gpuClustering::MaxNumModules*256)*sizeof(float)));
cudaCheck(cudaMalloc((void**) & gpu_.iphi_d,(gpuClustering::MaxNumModules*256)*sizeof(int16_t)));
cudaCheck(cudaMalloc((void**) & gpu_.sortIndex_d,(gpuClustering::MaxNumModules*256)*sizeof(uint16_t)));
cudaCheck(cudaMalloc((void**) & gpu_.mr_d,(gpuClustering::MaxNumModules*256)*sizeof(uint16_t)));
cudaCheck(cudaMalloc((void**) & gpu_.mc_d,(gpuClustering::MaxNumModules*256)*sizeof(uint16_t)));
// cudaCheck(cudaMalloc((void**) & gpu_.hist_d, 10*sizeof(HitsOnGPU::Hist)));

cudaCheck(cudaMalloc((void**) & gpu_d, sizeof(HitsOnGPU)));
cudaCheck(cudaMemcpyAsync(gpu_d, &gpu_, sizeof(HitsOnGPU), cudaMemcpyDefault,cudaStream.id()));

}

PixelRecHitGPUKernel::~PixelRecHitGPUKernel() {
cudaCheck(cudaFree(gpu_.hitsModuleStart_d));
cudaCheck(cudaFree(gpu_.charge_d));
cudaCheck(cudaFree(gpu_.detInd_d));
cudaCheck(cudaFree(gpu_.xg_d));
cudaCheck(cudaFree(gpu_.yg_d));
cudaCheck(cudaFree(gpu_.zg_d));
cudaCheck(cudaFree(gpu_.rg_d));
cudaCheck(cudaFree(gpu_.xl_d));
cudaCheck(cudaFree(gpu_.yl_d));
cudaCheck(cudaFree(gpu_.xerr_d));
cudaCheck(cudaFree(gpu_.yerr_d));
cudaCheck(cudaFree(gpu_.iphi_d));
cudaCheck(cudaFree(gpu_.sortIndex_d));
cudaCheck(cudaFree(gpu_.mr_d));
cudaCheck(cudaFree(gpu_.mc_d));
// cudaCheck(cudaFree(gpu_.hist_d));

cudaCheck(cudaFree(gpu_d));
}

void PixelRecHitGPUKernel::makeHitsAsync(const siPixelRawToClusterHeterogeneousProduct::GPUProduct& input,
float const * bs,
pixelCPEforGPU::ParamsOnGPU const * cpeParams,
cuda::stream_t<>& stream) {

cudaCheck(cudaMemcpyAsync(gpu_.bs_d, bs, 3*sizeof(float), cudaMemcpyDefault, stream.id()));

thrust::exclusive_scan(thrust::cuda::par.on(stream.id()),
input.clusInModule_d,
input.clusInModule_d + gpuClustering::MaxNumModules + 1,
Expand All @@ -51,6 +80,7 @@ namespace pixelgpudetails {
int blocks = input.nModules; // active modules (with digis)
gpuPixelRecHits::getHits<<<blocks, threadsPerBlock, 0, stream.id()>>>(
cpeParams,
gpu_.bs_d,
input.moduleInd_d,
input.xx_d, input.yy_d, input.adc_d,
input.moduleStart_d,
Expand All @@ -59,27 +89,50 @@ namespace pixelgpudetails {
input.nDigis,
gpu_.hitsModuleStart_d,
gpu_.charge_d,
gpu_.xg_d, gpu_.yg_d, gpu_.zg_d,
gpu_.xerr_d, gpu_.yerr_d, gpu_.mr_d,
true // for the time being stay local...
gpu_.detInd_d,
gpu_.xg_d, gpu_.yg_d, gpu_.zg_d, gpu_.rg_d,
gpu_.iphi_d,
gpu_.xl_d, gpu_.yl_d,
gpu_.xerr_d, gpu_.yerr_d,
gpu_.mr_d, gpu_.mc_d
);

// needed only if hits on CPU are required...
cudaCheck(cudaMemcpyAsync(hitsModuleStart_, gpu_.hitsModuleStart_d, (gpuClustering::MaxNumModules+1) * sizeof(uint32_t), cudaMemcpyDefault, stream.id()));

// to be moved to gpu?
auto nhits = hitsModuleStart_[gpuClustering::MaxNumModules];
for (int i=0;i<10;++i) hitsLayerStart_[i]=hitsModuleStart_[phase1PixelTopology::layerStart[i]];
hitsLayerStart_[10]=nhits;

std::cout << "hit layerStart ";
for (int i=0;i<10;++i) std::cout << phase1PixelTopology::layerName[i] << ':' << hitsLayerStart_[i] << ' ';
std::cout << "end:" << hitsLayerStart_[10] << std::endl;

cudaCheck(cudaMemcpyAsync(gpu_.hitsLayerStart_d, hitsLayerStart_, (11) * sizeof(uint32_t), cudaMemcpyDefault, stream.id()));

// for timing test
// radixSortMultiWrapper<int16_t><<<10, 256, 0, c.stream>>>(gpu_.iphi_d,gpu_.sortIndex_d,gpu_.hitsLayerStart_d);

// fillManyFromVector(gpu_.hist_d,10,gpu_.iphi_d, gpu_.hitsLayerStart_d, nhits,256,c.stream);


}

HitsOnCPU PixelRecHitGPUKernel::getOutput(cuda::stream_t<>& stream) const {
// needed only if hits on CPU are required...
auto nhits = hitsModuleStart_[gpuClustering::MaxNumModules];

HitsOnCPU hoc(nhits);
hoc.gpu_d = gpu_d;
memcpy(hoc.hitsModuleStart, hitsModuleStart_, (gpuClustering::MaxNumModules+1) * sizeof(uint32_t));
cudaCheck(cudaMemcpyAsync(hoc.charge.data(), gpu_.charge_d, nhits*sizeof(uint32_t), cudaMemcpyDefault, stream.id()));
cudaCheck(cudaMemcpyAsync(hoc.xl.data(), gpu_.xg_d, nhits*sizeof(uint32_t), cudaMemcpyDefault, stream.id()));
cudaCheck(cudaMemcpyAsync(hoc.yl.data(), gpu_.yg_d, nhits*sizeof(uint32_t), cudaMemcpyDefault, stream.id()));
cudaCheck(cudaMemcpyAsync(hoc.xl.data(), gpu_.xl_d, nhits*sizeof(uint32_t), cudaMemcpyDefault, stream.id()));
cudaCheck(cudaMemcpyAsync(hoc.yl.data(), gpu_.yl_d, nhits*sizeof(uint32_t), cudaMemcpyDefault, stream.id()));
cudaCheck(cudaMemcpyAsync(hoc.xe.data(), gpu_.xerr_d, nhits*sizeof(uint32_t), cudaMemcpyDefault, stream.id()));
cudaCheck(cudaMemcpyAsync(hoc.ye.data(), gpu_.yerr_d, nhits*sizeof(uint32_t), cudaMemcpyDefault, stream.id()));
cudaCheck(cudaMemcpyAsync(hoc.mr.data(), gpu_.mr_d, nhits*sizeof(uint16_t), cudaMemcpyDefault, stream.id()));
cudaCheck(cudaMemcpyAsync(hoc.mc.data(), gpu_.mc_d, nhits*sizeof(uint16_t), cudaMemcpyDefault, stream.id()));
cudaCheck(cudaStreamSynchronize(stream.id()));
return hoc;
}
Expand Down
26 changes: 9 additions & 17 deletions RecoLocalTracker/SiPixelRecHits/plugins/PixelRecHits.h
Original file line number Diff line number Diff line change
Expand Up @@ -9,32 +9,21 @@
#include <cstdint>
#include <vector>

#include "RecoLocalTracker/SiPixelRecHits/plugins/siPixelRecHitsHeterogeneousProduct.h"


namespace pixelCPEforGPU {
struct ParamsOnGPU;
}

namespace pixelgpudetails {
struct HitsOnGPU{
uint32_t * hitsModuleStart_d;
int32_t * charge_d;
float *xg_d, *yg_d, *zg_d;
float *xerr_d, *yerr_d;
uint16_t * mr_d;
};
using HitsOnGPU = siPixelRecHitsHeterogeneousProduct::HitsOnGPU;

struct HitsOnCPU {
explicit HitsOnCPU(uint32_t nhits) :
charge(nhits),xl(nhits),yl(nhits),xe(nhits),ye(nhits), mr(nhits){}
uint32_t hitsModuleStart[2001];
std::vector<int32_t> charge;
std::vector<float> xl, yl;
std::vector<float> xe, ye;
std::vector<uint16_t> mr;
};
using HitsOnCPU = siPixelRecHitsHeterogeneousProduct::HitsOnCPU;

class PixelRecHitGPUKernel {
public:
PixelRecHitGPUKernel();
PixelRecHitGPUKernel(cuda::stream_t<>& cudaStream);
~PixelRecHitGPUKernel();

PixelRecHitGPUKernel(const PixelRecHitGPUKernel&) = delete;
Expand All @@ -43,14 +32,17 @@ namespace pixelgpudetails {
PixelRecHitGPUKernel& operator=(PixelRecHitGPUKernel&&) = delete;

void makeHitsAsync(const siPixelRawToClusterHeterogeneousProduct::GPUProduct& input,
float const * bs,
pixelCPEforGPU::ParamsOnGPU const * cpeParams,
cuda::stream_t<>& stream);

HitsOnCPU getOutput(cuda::stream_t<>& stream) const;

private:
HitsOnGPU * gpu_d; // copy of the structure on the gpu itself: this is the "Product"
HitsOnGPU gpu_;
uint32_t hitsModuleStart_[gpuClustering::MaxNumModules+1];
uint32_t hitsLayerStart_[11];
};
}

Expand Down
Loading

0 comments on commit af7cd9e

Please sign in to comment.