Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[WIP] ECAL Reco on GPU v2 #366

Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
83 changes: 64 additions & 19 deletions RecoLocalCalo/EcalRecAlgos/interface/DeclsForKernels.h
Original file line number Diff line number Diff line change
Expand Up @@ -72,6 +72,30 @@ struct EventInputDataGPU {
}
};

enum class KernelsVersion : uint32_t {
// kernels are completely splitted and launched/sync from the host
SplittedHostLaunch = 0,
// kernels are completely splitted, a grid of (1,1) is launched
// and a single gpu thread will do launch/sync of consequitive kernels
SplittedDeviceLaunch = 1,
// fused minimization, 10 x 10 threads per block.
// 1 channel per block
Fused = 2,
// Reasoning:
// 1) after iteration 0 and 1, there are < 10% channels left
// 2) after iteration 2, there are always < 1% channels left
// 3) instead of continueing spinning (launch+sync) the cpu,
// launch fused minimization
// Kernels are splitted, as in SplittedHostLaunch version
// after iteration 3 (TODO: should be configurable) laucnh fused version
// to continue minimization. there should be small number of blocks needed (resources! consumed)
HybridHostLaunch = 3,
// similar to SplittedDeviceLaunch for launching, (1, 1) gpu grid controls
// minimization. Hybrid - Runs fused minimization starting from iteration N
// TODO: right now N is fixed at 4
HybridDeviceLaunch = 4
};

// parameters have a fixed type
// Can we go by with single precision
struct ConfigurationParameters {
Expand Down Expand Up @@ -100,6 +124,7 @@ struct ConfigurationParameters {
std::array<uint32_t, 3> kernelMinimizeThreads;

bool shouldRunTimingComputation;
KernelsVersion version;
};

struct EventOutputDataGPU final : public ::ecal::UncalibratedRecHit<::ecal::Tag::ptr>
Expand Down Expand Up @@ -145,11 +170,14 @@ struct EventDataForScratchGPU {
SampleVector *samples = nullptr;
SampleGainVector *gainsNoise = nullptr;

SampleMatrix* noisecov = nullptr;
PulseMatrixType *pulse_matrix = nullptr;
FullSampleMatrix* pulse_covariances = nullptr;
BXVectorType *activeBXs = nullptr;
char *acState = nullptr;
uint32_t *v2rmapping_1=nullptr;
uint32_t *v2rmapping_2=nullptr;
uint32_t *pChannelsCounter = nullptr;

SampleVector::Scalar *decompMatrixMainLoop = nullptr, *decompMatrixFnnls=nullptr;
SampleVector::Scalar *AtA=nullptr, *Atb=nullptr;
char *samplesMapping=nullptr, *npassive=nullptr;
::ecal::reco::StorageScalarType *chi2_prev=nullptr;

bool *hasSwitchToGain6=nullptr,
*hasSwitchToGain1=nullptr,
Expand All @@ -173,16 +201,27 @@ struct EventDataForScratchGPU {
cudaCheck( cudaMalloc((void**)&gainsNoise,
size * sizeof(SampleGainVector)) );

cudaCheck( cudaMalloc((void**)&pulse_covariances,
size * sizeof(FullSampleMatrix)) );
cudaCheck( cudaMalloc((void**)&noisecov,
size * sizeof(SampleMatrix)) );
cudaCheck( cudaMalloc((void**)&pulse_matrix,
size * sizeof(PulseMatrixType)) );
cudaCheck( cudaMalloc((void**)&activeBXs,
size * sizeof(BXVectorType)) );
cudaCheck( cudaMalloc((void**)&acState,
cudaCheck( cudaMalloc((void**)&v2rmapping_1,
size * sizeof(uint32_t)) );
cudaCheck( cudaMalloc((void**)&v2rmapping_2,
size * sizeof(uint32_t)) );
cudaCheck( cudaMalloc((void**)&pChannelsCounter,
sizeof(uint32_t)) );
// FIXME: replace 55 with MapSymM::total
cudaCheck( cudaMalloc((void**)&decompMatrixMainLoop,
size * 55 * sizeof(SampleVector::Scalar)) );
cudaCheck( cudaMalloc((void**)&decompMatrixFnnls,
size * 55 * sizeof(SampleVector::Scalar)) );
cudaCheck( cudaMalloc((void**)&AtA,
size * 55 * sizeof(SampleVector::Scalar)) );
cudaCheck( cudaMalloc((void**)&Atb,
size * 10 * sizeof(SampleVector::Scalar)) );
cudaCheck( cudaMalloc((void**)&samplesMapping,
size * 10 * sizeof(char)) );
cudaCheck( cudaMalloc((void**)&npassive,
size * sizeof(char)) );
cudaCheck( cudaMalloc((void**)&chi2_prev,
size * sizeof(::ecal::reco::StorageScalarType)) );

cudaCheck( cudaMalloc((void**)&hasSwitchToGain6,
size * sizeof(bool)) );
Expand Down Expand Up @@ -232,11 +271,17 @@ struct EventDataForScratchGPU {
cudaCheck( cudaFree(samples) );
cudaCheck( cudaFree(gainsNoise) );

cudaCheck( cudaFree(pulse_covariances) );
cudaCheck( cudaFree(noisecov) );
cudaCheck( cudaFree(pulse_matrix) );
cudaCheck( cudaFree(activeBXs) );
cudaCheck( cudaFree(acState) );
cudaCheck( cudaFree(v2rmapping_1) );
cudaCheck( cudaFree(v2rmapping_2) );
cudaCheck( cudaFree(pChannelsCounter) );

cudaCheck( cudaFree(decompMatrixMainLoop) );
cudaCheck( cudaFree(decompMatrixFnnls) );
cudaCheck( cudaFree(AtA) );
cudaCheck( cudaFree(Atb) );
cudaCheck( cudaFree(samplesMapping) );
cudaCheck( cudaFree(npassive) );
cudaCheck( cudaFree(chi2_prev) );

cudaCheck( cudaFree(hasSwitchToGain6) );
cudaCheck( cudaFree(hasSwitchToGain1) );
Expand Down
26 changes: 2 additions & 24 deletions RecoLocalCalo/EcalRecAlgos/interface/EigenMatrixTypes_gpu.h
Original file line number Diff line number Diff line change
Expand Up @@ -9,36 +9,14 @@
namespace ecal { namespace multifit {

constexpr int SampleVectorSize = 10;
constexpr int FullSampleVectorSize = 19;
constexpr int PulseVectorSize = 12;
constexpr int NGains = 3;
constexpr uint32_t noiseCovIsZeroMask = 0xffffffff;
constexpr uint32_t idIsInvalidMask = 0xffffffff;

using data_type = ::ecal::reco::ComputationScalarType;

typedef Eigen::Matrix<data_type, SampleVectorSize, SampleVectorSize> PulseMatrixType;
typedef Eigen::Matrix<char, SampleVectorSize, 1> BXVectorType;
using SampleMatrixD = Eigen::Matrix<double,SampleVectorSize,SampleVectorSize>;

typedef Eigen::Matrix<data_type,SampleVectorSize,1> SampleVector;
typedef Eigen::Matrix<data_type,FullSampleVectorSize,1> FullSampleVector;
typedef Eigen::Matrix<data_type,Eigen::Dynamic,1,0,PulseVectorSize,1> PulseVector;
typedef Eigen::Matrix<char,Eigen::Dynamic,1,0,PulseVectorSize,1> BXVector;
typedef Eigen::Matrix<char, SampleVectorSize,1> SampleGainVector;
typedef Eigen::Matrix<data_type,SampleVectorSize,SampleVectorSize> SampleMatrix;
typedef Eigen::Matrix<data_type,FullSampleVectorSize,FullSampleVectorSize> FullSampleMatrix;
typedef Eigen::Matrix<data_type,Eigen::Dynamic,Eigen::Dynamic,0,PulseVectorSize,PulseVectorSize> PulseMatrix;
typedef Eigen::Matrix<data_type,SampleVectorSize,Eigen::Dynamic,0,SampleVectorSize,PulseVectorSize> SamplePulseMatrix;
typedef Eigen::LLT<SampleMatrix> SampleDecompLLT;
typedef Eigen::LLT<SampleMatrixD> SampleDecompLLTD;
typedef Eigen::LLT<PulseMatrix> PulseDecompLLT;
typedef Eigen::LDLT<PulseMatrix> PulseDecompLDLT;

typedef Eigen::Matrix<data_type,1,1> SingleMatrix;
typedef Eigen::Matrix<data_type,1,1> SingleVector;

typedef std::array<SampleMatrixD,NGains> SampleMatrixGainArray;

using PermutationMatrix = Eigen::PermutationMatrix<SampleMatrix::RowsAtCompileTime>;

}}

Expand Down
Loading