Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

GPU: better hits #81

Merged
merged 13 commits into from
Jun 29, 2018
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,15 @@ namespace phase1PixelTopology {

constexpr uint32_t numPixsInModule = uint32_t(numRowsInModule)* uint32_t(numColsInModule);

constexpr uint32_t numberOfModules = 1856;

constexpr uint32_t layerStart[11] = {0,96,320,672,1184,1296,1408,1520,1632,1744,1856};
constexpr char const * layerName[10] = {"BL1","BL2","BL3","BL4",
"E+1", "E+2", "E+3",
"E-1", "E-2", "E-3"
};

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Just to note that this introduces yet another convention for naming pixel layers. It seems to be only for debug prints, so I don't object.

Copy link

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@makortel what are the "usual" pixel layer names, and where are they defined ?

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

We (=CMS) have many conventions in different places

I don't think none of these is authoritative enough to suggest a change in here (since they're for printouts only, for configuration input's I'd probably suggest the seeding layers' convention). Anyway these have the nice property (on purpose?) that the length of BPix and FPix strings are the same (that none of the other conventions have).



// this is for the ROC n<512 (upgrade 1024)
constexpr inline
uint16_t divu52(uint16_t n) {
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -17,16 +17,23 @@
striptrackerlocalreco = cms.Sequence(siStripZeroSuppression*siStripClusters*siStripMatchedRecHits)
trackerlocalreco = cms.Sequence(pixeltrackerlocalreco*striptrackerlocalreco*clusterSummaryProducer)


from RecoLocalTracker.SiPixelClusterizer.siPixelClustersHeterogeneous_cfi import *
from RecoLocalTracker.SiPixelClusterizer.siPixelFedCablingMapGPUWrapper_cfi import *
from CalibTracker.SiPixelESProducers.siPixelGainCalibrationForHLTGPU_cfi import *

from RecoLocalTracker.SiPixelRecHits.siPixelRecHitHeterogeneous_cfi import *
from RecoLocalTracker.SiPixelRecHits.siPixelRecHitHeterogeneousConverter_cfi import siPixelRecHitHeterogeneousConverter as _siPixelRecHitHeterogeneousConverter
gpu.toReplaceWith(siPixelRecHitsPreSplitting, _siPixelRecHitHeterogeneousConverter.clone())



from Configuration.ProcessModifiers.gpu_cff import gpu
_pixeltrackerlocalreco_gpu = pixeltrackerlocalreco.copy()
_pixeltrackerlocalreco_gpu.replace(siPixelClustersPreSplitting, siPixelClustersHeterogeneous+siPixelClustersPreSplitting)
_pixeltrackerlocalreco_gpu.replace(siPixelRecHitsPreSplitting, siPixelRecHitHeterogeneous+siPixelRecHitsPreSplitting)
gpu.toReplaceWith(pixeltrackerlocalreco, _pixeltrackerlocalreco_gpu)


from RecoLocalTracker.SiPhase2Clusterizer.phase2TrackerClusterizer_cfi import *
from RecoLocalTracker.Phase2TrackerRecHits.Phase2StripCPEGeometricESProducer_cfi import *

Expand Down
65 changes: 59 additions & 6 deletions RecoLocalTracker/SiPixelRecHits/plugins/PixelRecHits.cu
Original file line number Diff line number Diff line change
Expand Up @@ -17,31 +17,60 @@
#include "gpuPixelRecHits.h"

namespace pixelgpudetails {
PixelRecHitGPUKernel::PixelRecHitGPUKernel() {
PixelRecHitGPUKernel::PixelRecHitGPUKernel(cuda::stream_t<>& cudaStream) {

cudaCheck(cudaMalloc((void**) & gpu_.bs_d,3*sizeof(float)));
cudaCheck(cudaMalloc((void**) & gpu_.hitsModuleStart_d,(gpuClustering::MaxNumModules+1)*sizeof(uint32_t)));
cudaCheck(cudaMalloc((void**) & gpu_.hitsLayerStart_d,(11)*sizeof(uint32_t)));
cudaCheck(cudaMalloc((void**) & gpu_.charge_d,(gpuClustering::MaxNumModules*256)*sizeof(float)));
cudaCheck(cudaMalloc((void**) & gpu_.detInd_d,(gpuClustering::MaxNumModules*256)*sizeof(uint16_t)));
cudaCheck(cudaMalloc((void**) & gpu_.xg_d,(gpuClustering::MaxNumModules*256)*sizeof(float)));
cudaCheck(cudaMalloc((void**) & gpu_.yg_d,(gpuClustering::MaxNumModules*256)*sizeof(float)));
cudaCheck(cudaMalloc((void**) & gpu_.zg_d,(gpuClustering::MaxNumModules*256)*sizeof(float)));
cudaCheck(cudaMalloc((void**) & gpu_.rg_d,(gpuClustering::MaxNumModules*256)*sizeof(float)));
cudaCheck(cudaMalloc((void**) & gpu_.xl_d,(gpuClustering::MaxNumModules*256)*sizeof(float)));
cudaCheck(cudaMalloc((void**) & gpu_.yl_d,(gpuClustering::MaxNumModules*256)*sizeof(float)));
cudaCheck(cudaMalloc((void**) & gpu_.xerr_d,(gpuClustering::MaxNumModules*256)*sizeof(float)));
cudaCheck(cudaMalloc((void**) & gpu_.yerr_d,(gpuClustering::MaxNumModules*256)*sizeof(float)));
cudaCheck(cudaMalloc((void**) & gpu_.iphi_d,(gpuClustering::MaxNumModules*256)*sizeof(int16_t)));
cudaCheck(cudaMalloc((void**) & gpu_.sortIndex_d,(gpuClustering::MaxNumModules*256)*sizeof(uint16_t)));
cudaCheck(cudaMalloc((void**) & gpu_.mr_d,(gpuClustering::MaxNumModules*256)*sizeof(uint16_t)));
cudaCheck(cudaMalloc((void**) & gpu_.mc_d,(gpuClustering::MaxNumModules*256)*sizeof(uint16_t)));
// cudaCheck(cudaMalloc((void**) & gpu_.hist_d, 10*sizeof(HitsOnGPU::Hist)));

cudaCheck(cudaMalloc((void**) & gpu_d, sizeof(HitsOnGPU)));
cudaCheck(cudaMemcpyAsync(gpu_d, &gpu_, sizeof(HitsOnGPU), cudaMemcpyDefault,cudaStream.id()));

}

PixelRecHitGPUKernel::~PixelRecHitGPUKernel() {
cudaCheck(cudaFree(gpu_.hitsModuleStart_d));
cudaCheck(cudaFree(gpu_.charge_d));
cudaCheck(cudaFree(gpu_.detInd_d));
cudaCheck(cudaFree(gpu_.xg_d));
cudaCheck(cudaFree(gpu_.yg_d));
cudaCheck(cudaFree(gpu_.zg_d));
cudaCheck(cudaFree(gpu_.rg_d));
cudaCheck(cudaFree(gpu_.xl_d));
cudaCheck(cudaFree(gpu_.yl_d));
cudaCheck(cudaFree(gpu_.xerr_d));
cudaCheck(cudaFree(gpu_.yerr_d));
cudaCheck(cudaFree(gpu_.iphi_d));
cudaCheck(cudaFree(gpu_.sortIndex_d));
cudaCheck(cudaFree(gpu_.mr_d));
cudaCheck(cudaFree(gpu_.mc_d));
// cudaCheck(cudaFree(gpu_.hist_d));

cudaCheck(cudaFree(gpu_d));
}

void PixelRecHitGPUKernel::makeHitsAsync(const siPixelRawToClusterHeterogeneousProduct::GPUProduct& input,
float const * bs,
pixelCPEforGPU::ParamsOnGPU const * cpeParams,
cuda::stream_t<>& stream) {

cudaCheck(cudaMemcpyAsync(gpu_.bs_d, bs, 3*sizeof(float), cudaMemcpyDefault, stream.id()));

thrust::exclusive_scan(thrust::cuda::par.on(stream.id()),
input.clusInModule_d,
input.clusInModule_d + gpuClustering::MaxNumModules + 1,
Expand All @@ -51,6 +80,7 @@ namespace pixelgpudetails {
int blocks = input.nModules; // active modules (with digis)
gpuPixelRecHits::getHits<<<blocks, threadsPerBlock, 0, stream.id()>>>(
cpeParams,
gpu_.bs_d,
input.moduleInd_d,
input.xx_d, input.yy_d, input.adc_d,
input.moduleStart_d,
Expand All @@ -59,27 +89,50 @@ namespace pixelgpudetails {
input.nDigis,
gpu_.hitsModuleStart_d,
gpu_.charge_d,
gpu_.xg_d, gpu_.yg_d, gpu_.zg_d,
gpu_.xerr_d, gpu_.yerr_d, gpu_.mr_d,
true // for the time being stay local...
gpu_.detInd_d,
gpu_.xg_d, gpu_.yg_d, gpu_.zg_d, gpu_.rg_d,
gpu_.iphi_d,
gpu_.xl_d, gpu_.yl_d,
gpu_.xerr_d, gpu_.yerr_d,
gpu_.mr_d, gpu_.mc_d
);

// needed only if hits on CPU are required...
cudaCheck(cudaMemcpyAsync(hitsModuleStart_, gpu_.hitsModuleStart_d, (gpuClustering::MaxNumModules+1) * sizeof(uint32_t), cudaMemcpyDefault, stream.id()));

// to be moved to gpu?
auto nhits = hitsModuleStart_[gpuClustering::MaxNumModules];
for (int i=0;i<10;++i) hitsLayerStart_[i]=hitsModuleStart_[phase1PixelTopology::layerStart[i]];
hitsLayerStart_[10]=nhits;

std::cout << "hit layerStart ";
for (int i=0;i<10;++i) std::cout << phase1PixelTopology::layerName[i] << ':' << hitsLayerStart_[i] << ' ';
std::cout << "end:" << hitsLayerStart_[10] << std::endl;

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

By the way, now we have a per-event (non-thread safe) printout from here.

Copy link
Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

will be removed at next iteration when the sorting will be implemented as well


cudaCheck(cudaMemcpyAsync(gpu_.hitsLayerStart_d, hitsLayerStart_, (11) * sizeof(uint32_t), cudaMemcpyDefault, stream.id()));

// for timing test
// radixSortMultiWrapper<int16_t><<<10, 256, 0, c.stream>>>(gpu_.iphi_d,gpu_.sortIndex_d,gpu_.hitsLayerStart_d);

// fillManyFromVector(gpu_.hist_d,10,gpu_.iphi_d, gpu_.hitsLayerStart_d, nhits,256,c.stream);


}

HitsOnCPU PixelRecHitGPUKernel::getOutput(cuda::stream_t<>& stream) const {
// needed only if hits on CPU are required...
auto nhits = hitsModuleStart_[gpuClustering::MaxNumModules];

HitsOnCPU hoc(nhits);
hoc.gpu_d = gpu_d;
memcpy(hoc.hitsModuleStart, hitsModuleStart_, (gpuClustering::MaxNumModules+1) * sizeof(uint32_t));
cudaCheck(cudaMemcpyAsync(hoc.charge.data(), gpu_.charge_d, nhits*sizeof(uint32_t), cudaMemcpyDefault, stream.id()));
cudaCheck(cudaMemcpyAsync(hoc.xl.data(), gpu_.xg_d, nhits*sizeof(uint32_t), cudaMemcpyDefault, stream.id()));
cudaCheck(cudaMemcpyAsync(hoc.yl.data(), gpu_.yg_d, nhits*sizeof(uint32_t), cudaMemcpyDefault, stream.id()));
cudaCheck(cudaMemcpyAsync(hoc.xl.data(), gpu_.xl_d, nhits*sizeof(uint32_t), cudaMemcpyDefault, stream.id()));
cudaCheck(cudaMemcpyAsync(hoc.yl.data(), gpu_.yl_d, nhits*sizeof(uint32_t), cudaMemcpyDefault, stream.id()));
cudaCheck(cudaMemcpyAsync(hoc.xe.data(), gpu_.xerr_d, nhits*sizeof(uint32_t), cudaMemcpyDefault, stream.id()));
cudaCheck(cudaMemcpyAsync(hoc.ye.data(), gpu_.yerr_d, nhits*sizeof(uint32_t), cudaMemcpyDefault, stream.id()));
cudaCheck(cudaMemcpyAsync(hoc.mr.data(), gpu_.mr_d, nhits*sizeof(uint16_t), cudaMemcpyDefault, stream.id()));
cudaCheck(cudaMemcpyAsync(hoc.mc.data(), gpu_.mc_d, nhits*sizeof(uint16_t), cudaMemcpyDefault, stream.id()));
cudaCheck(cudaStreamSynchronize(stream.id()));
return hoc;
}
Expand Down
26 changes: 9 additions & 17 deletions RecoLocalTracker/SiPixelRecHits/plugins/PixelRecHits.h
Original file line number Diff line number Diff line change
Expand Up @@ -9,32 +9,21 @@
#include <cstdint>
#include <vector>

#include "RecoLocalTracker/SiPixelRecHits/plugins/siPixelRecHitsHeterogeneousProduct.h"


namespace pixelCPEforGPU {
struct ParamsOnGPU;
}

namespace pixelgpudetails {
struct HitsOnGPU{
uint32_t * hitsModuleStart_d;
int32_t * charge_d;
float *xg_d, *yg_d, *zg_d;
float *xerr_d, *yerr_d;
uint16_t * mr_d;
};
using HitsOnGPU = siPixelRecHitsHeterogeneousProduct::HitsOnGPU;

struct HitsOnCPU {
explicit HitsOnCPU(uint32_t nhits) :
charge(nhits),xl(nhits),yl(nhits),xe(nhits),ye(nhits), mr(nhits){}
uint32_t hitsModuleStart[2001];
std::vector<int32_t> charge;
std::vector<float> xl, yl;
std::vector<float> xe, ye;
std::vector<uint16_t> mr;
};
using HitsOnCPU = siPixelRecHitsHeterogeneousProduct::HitsOnCPU;

class PixelRecHitGPUKernel {
public:
PixelRecHitGPUKernel();
PixelRecHitGPUKernel(cuda::stream_t<>& cudaStream);
~PixelRecHitGPUKernel();

PixelRecHitGPUKernel(const PixelRecHitGPUKernel&) = delete;
Expand All @@ -43,14 +32,17 @@ namespace pixelgpudetails {
PixelRecHitGPUKernel& operator=(PixelRecHitGPUKernel&&) = delete;

void makeHitsAsync(const siPixelRawToClusterHeterogeneousProduct::GPUProduct& input,
float const * bs,
pixelCPEforGPU::ParamsOnGPU const * cpeParams,
cuda::stream_t<>& stream);

HitsOnCPU getOutput(cuda::stream_t<>& stream) const;

private:
HitsOnGPU * gpu_d; // copy of the structure on the gpu itself: this is the "Product"
HitsOnGPU gpu_;
uint32_t hitsModuleStart_[gpuClustering::MaxNumModules+1];
uint32_t hitsLayerStart_[11];
};
}

Expand Down
Loading