From dd2b6a332a39830b48a500ccbcbb0309adddf53a Mon Sep 17 00:00:00 2001 From: Thomas Date: Thu, 12 Oct 2023 14:16:27 +0200 Subject: [PATCH] ECAL unpacker and ECAL multifit algorithm migration to alpaka. Co-authored-by: Thomas Reis Co-authored-by: Davide Valsecchi Co-authored-by: Jakub Andrzej Gajownik --- CUDADataFormats/EcalRecHitSoA/BuildFile.xml | 1 + .../EcalRecHitSoA/interface/EcalRecHit.h | 2 +- .../interface/EcalUncalibratedRecHit.h | 5 +- .../EcalRecHitSoA/interface/RecoTypes.h | 13 - .../python/EcalPhiSymRecoSequence_cff.py | 4 +- Configuration/PyReleaseValidation/README.md | 1 + .../python/upgradeWorkflowComponents.py | 21 + .../python/RawToDigi_Repacked_cff.py | 7 +- .../StandardSequences/python/RawToDigi_cff.py | 7 +- .../python/SimL1EmulatorRepack_GCTGT_cff.py | 8 +- .../clients/beam_dqm_sourceclient-live_cfg.py | 12 +- .../beamfake_dqm_sourceclient-live_cfg.py | 2 +- .../beampixel_dqm_sourceclient-live_cfg.py | 4 +- .../clients/csc_dqm_sourceclient-live_cfg.py | 4 +- .../clients/ecal_dqm_sourceclient-live_cfg.py | 57 +- .../clients/l1t_dqm_sourceclient-live_cfg.py | 4 +- .../l1temulator_dqm_sourceclient-live_cfg.py | 4 +- .../l1tstage1_dqm_sourceclient-live_cfg.py | 4 +- ...tage1emulator_dqm_sourceclient-live_cfg.py | 4 +- .../l1tstage2_dqm_sourceclient-live_cfg.py | 2 +- ...tage2emulator_dqm_sourceclient-live_cfg.py | 2 +- .../pixel_dqm_sourceclient-live_cfg.py | 14 +- .../clients/scal_dqm_sourceclient-live_cfg.py | 8 +- ...istrip_approx_dqm_sourceclient-live_cfg.py | 18 +- .../sistrip_dqm_sourceclient-live_cfg.py | 18 +- .../interface/MultifitComputations.h | 2 +- .../EcalDigi/interface/EcalDigiCollections.h | 11 +- DataFormats/EcalRecHit/interface/RecoTypes.h | 13 + .../EcalRawToDigi/plugins/BuildFile.xml | 16 +- .../plugins/EcalDigisFromPortableProducer.cc | 210 +++ .../plugins/alpaka/DeclsForKernels.h | 43 + .../EcalElectronicsMappingHostESProducer.cc | 58 + .../plugins/alpaka/EcalRawToDigiPortable.cc | 142 ++ .../plugins/alpaka/UnpackPortable.dev.cc | 441 +++++++ .../plugins/alpaka/UnpackPortable.h | 22 + .../EcalRawToDigi/python/ecalDigis_cff.py | 43 +- .../python/customizeHLTforAlpaka.py | 102 ++ RecoLocalCalo/EcalRecProducers/BuildFile.xml | 1 + .../EigenMatrixTypes_gpu.h | 9 +- .../AmplitudeComputationCommonKernels.h | 2 +- .../plugins/AmplitudeComputationKernels.h | 2 +- .../EcalRecProducers/plugins/BuildFile.xml | 14 +- .../plugins/DeclsForKernels.h | 5 +- .../plugins/EcalCPURecHitProducer.cc | 2 +- .../plugins/EcalRecHitProducerGPU.cc | 2 +- .../plugins/EcalUncalibRecHitSoAToLegacy.cc | 105 ++ .../plugins/TimeComputationKernels.h | 2 +- .../AmplitudeComputationCommonKernels.h | 488 +++++++ .../alpaka/AmplitudeComputationKernels.dev.cc | 316 +++++ .../alpaka/AmplitudeComputationKernels.h | 28 + .../plugins/alpaka/DeclsForKernels.h | 130 ++ .../EcalMultifitConditionsHostESProducer.cc | 213 +++ .../EcalMultifitParametersHostESProducer.cc | 99 ++ ...alUncalibRecHitMultiFitAlgoPortable.dev.cc | 234 ++++ .../EcalUncalibRecHitMultiFitAlgoPortable.h | 30 + .../EcalUncalibRecHitProducerPortable.cc | 222 ++++ .../plugins/alpaka/KernelHelpers.dev.cc | 275 ++++ .../plugins/alpaka/KernelHelpers.h | 19 + .../plugins/alpaka/TimeComputationKernels.h | 1162 +++++++++++++++++ .../python/ecalLocalCustom.py | 12 +- .../python/ecalMultiFitUncalibRecHit_cff.py | 41 +- Validation/Configuration/python/ECALHCAL.py | 4 +- 62 files changed, 4603 insertions(+), 143 deletions(-) delete mode 100644 CUDADataFormats/EcalRecHitSoA/interface/RecoTypes.h create mode 100644 DataFormats/EcalRecHit/interface/RecoTypes.h create mode 100644 EventFilter/EcalRawToDigi/plugins/EcalDigisFromPortableProducer.cc create mode 100644 EventFilter/EcalRawToDigi/plugins/alpaka/DeclsForKernels.h create mode 100644 EventFilter/EcalRawToDigi/plugins/alpaka/EcalElectronicsMappingHostESProducer.cc create mode 100644 EventFilter/EcalRawToDigi/plugins/alpaka/EcalRawToDigiPortable.cc create mode 100644 EventFilter/EcalRawToDigi/plugins/alpaka/UnpackPortable.dev.cc create mode 100644 EventFilter/EcalRawToDigi/plugins/alpaka/UnpackPortable.h create mode 100644 HLTrigger/Configuration/python/customizeHLTforAlpaka.py rename RecoLocalCalo/EcalRecProducers/{plugins => interface}/EigenMatrixTypes_gpu.h (87%) create mode 100644 RecoLocalCalo/EcalRecProducers/plugins/EcalUncalibRecHitSoAToLegacy.cc create mode 100644 RecoLocalCalo/EcalRecProducers/plugins/alpaka/AmplitudeComputationCommonKernels.h create mode 100644 RecoLocalCalo/EcalRecProducers/plugins/alpaka/AmplitudeComputationKernels.dev.cc create mode 100644 RecoLocalCalo/EcalRecProducers/plugins/alpaka/AmplitudeComputationKernels.h create mode 100644 RecoLocalCalo/EcalRecProducers/plugins/alpaka/DeclsForKernels.h create mode 100644 RecoLocalCalo/EcalRecProducers/plugins/alpaka/EcalMultifitConditionsHostESProducer.cc create mode 100644 RecoLocalCalo/EcalRecProducers/plugins/alpaka/EcalMultifitParametersHostESProducer.cc create mode 100644 RecoLocalCalo/EcalRecProducers/plugins/alpaka/EcalUncalibRecHitMultiFitAlgoPortable.dev.cc create mode 100644 RecoLocalCalo/EcalRecProducers/plugins/alpaka/EcalUncalibRecHitMultiFitAlgoPortable.h create mode 100644 RecoLocalCalo/EcalRecProducers/plugins/alpaka/EcalUncalibRecHitProducerPortable.cc create mode 100644 RecoLocalCalo/EcalRecProducers/plugins/alpaka/KernelHelpers.dev.cc create mode 100644 RecoLocalCalo/EcalRecProducers/plugins/alpaka/KernelHelpers.h create mode 100644 RecoLocalCalo/EcalRecProducers/plugins/alpaka/TimeComputationKernels.h diff --git a/CUDADataFormats/EcalRecHitSoA/BuildFile.xml b/CUDADataFormats/EcalRecHitSoA/BuildFile.xml index a684d9a23f1c6..6d67c5d5f6220 100644 --- a/CUDADataFormats/EcalRecHitSoA/BuildFile.xml +++ b/CUDADataFormats/EcalRecHitSoA/BuildFile.xml @@ -3,6 +3,7 @@ + diff --git a/CUDADataFormats/EcalRecHitSoA/interface/EcalRecHit.h b/CUDADataFormats/EcalRecHitSoA/interface/EcalRecHit.h index 731b8b801407f..3e312218a112f 100644 --- a/CUDADataFormats/EcalRecHitSoA/interface/EcalRecHit.h +++ b/CUDADataFormats/EcalRecHitSoA/interface/EcalRecHit.h @@ -5,7 +5,7 @@ #include #include "CUDADataFormats/CaloCommon/interface/Common.h" -#include "CUDADataFormats/EcalRecHitSoA/interface/RecoTypes.h" +#include "DataFormats/EcalRecHit/interface/RecoTypes.h" #include "HeterogeneousCore/CUDAUtilities/interface/HostAllocator.h" namespace ecal { diff --git a/CUDADataFormats/EcalRecHitSoA/interface/EcalUncalibratedRecHit.h b/CUDADataFormats/EcalRecHitSoA/interface/EcalUncalibratedRecHit.h index 7497f71269089..a48850e68858f 100644 --- a/CUDADataFormats/EcalRecHitSoA/interface/EcalUncalibratedRecHit.h +++ b/CUDADataFormats/EcalRecHitSoA/interface/EcalUncalibratedRecHit.h @@ -1,12 +1,9 @@ #ifndef CUDADataFormats_EcalRecHitSoA_interface_EcalUncalibratedRecHit_h #define CUDADataFormats_EcalRecHitSoA_interface_EcalUncalibratedRecHit_h -#include -#include - #include "CUDADataFormats/CaloCommon/interface/Common.h" -#include "CUDADataFormats/EcalRecHitSoA/interface/RecoTypes.h" #include "DataFormats/EcalDigi/interface/EcalDataFrame.h" +#include "DataFormats/EcalRecHit/interface/RecoTypes.h" namespace ecal { diff --git a/CUDADataFormats/EcalRecHitSoA/interface/RecoTypes.h b/CUDADataFormats/EcalRecHitSoA/interface/RecoTypes.h deleted file mode 100644 index 87c4252a5e949..0000000000000 --- a/CUDADataFormats/EcalRecHitSoA/interface/RecoTypes.h +++ /dev/null @@ -1,13 +0,0 @@ -#ifndef CUDADataFormats_EcalRecHitSoA_interface_RecoTypes_h -#define CUDADataFormats_EcalRecHitSoA_interface_RecoTypes_h - -namespace ecal { - namespace reco { - - using ComputationScalarType = float; - using StorageScalarType = float; - - } // namespace reco -} // namespace ecal - -#endif // CUDADataFormats_EcalRecHitSoA_interface_RecoTypes_h diff --git a/Calibration/EcalCalibAlgos/python/EcalPhiSymRecoSequence_cff.py b/Calibration/EcalCalibAlgos/python/EcalPhiSymRecoSequence_cff.py index d171115387a04..7aa1598a63789 100644 --- a/Calibration/EcalCalibAlgos/python/EcalPhiSymRecoSequence_cff.py +++ b/Calibration/EcalCalibAlgos/python/EcalPhiSymRecoSequence_cff.py @@ -155,8 +155,8 @@ def customise(process): """ # Change input collection for the /AlCaPhiSym/*/RAW stream dataformat - process.ecalMultiFitUncalibRecHit.cpu.EBdigiCollection = cms.InputTag("hltEcalPhiSymFilter", "phiSymEcalDigisEB") - process.ecalMultiFitUncalibRecHit.cpu.EEdigiCollection = cms.InputTag("hltEcalPhiSymFilter", "phiSymEcalDigisEE") + process.ecalMultiFitUncalibRecHitCPU.EBdigiCollection = "hltEcalPhiSymFilter:phiSymEcalDigisEB" + process.ecalMultiFitUncalibRecHitCPU.EEdigiCollection = "hltEcalPhiSymFilter:phiSymEcalDigisEE" process.ecalRecHit.cpu.killDeadChannels = cms.bool( False ) process.ecalRecHit.cpu.recoverEBVFE = cms.bool( False ) process.ecalRecHit.cpu.recoverEEVFE = cms.bool( False ) diff --git a/Configuration/PyReleaseValidation/README.md b/Configuration/PyReleaseValidation/README.md index 9c9cb96cd7792..be645015e9e4f 100644 --- a/Configuration/PyReleaseValidation/README.md +++ b/Configuration/PyReleaseValidation/README.md @@ -30,6 +30,7 @@ The offsets currently in use are: * 0.2: Tracking Run-2 era, `Run2_2017_trackingRun2` * 0.3: 0.1 + 0.2 * 0.4: LowPU tracking era, `Run2_2017_trackingLowPU` +* 0.411: Patatrack, ECAL only, Alpaka * 0.5: Pixel tracking only + 0.1 * 0.501: Patatrack, pixel only quadruplets, on CPU * 0.502: Patatrack, pixel only quadruplets, with automatic offload to GPU if available diff --git a/Configuration/PyReleaseValidation/python/upgradeWorkflowComponents.py b/Configuration/PyReleaseValidation/python/upgradeWorkflowComponents.py index ea6041c9d2822..01891e6cc4ef0 100644 --- a/Configuration/PyReleaseValidation/python/upgradeWorkflowComponents.py +++ b/Configuration/PyReleaseValidation/python/upgradeWorkflowComponents.py @@ -1048,6 +1048,27 @@ def setup_(self, step, stepName, stepDict, k, properties): offset = 0.508, ) +# ECAL-only workflow running on CPU or GPU with Alpaka code +# - HLT with Alpaka +# - ECAL-only reconstruction with Alpaka, with DQM and validation +# - harvesting +upgradeWFs['PatatrackECALOnlyAlpaka'] = PatatrackWorkflow( + digi = { + # customize the ECAL Local Reco part of the HLT menu for Alpaka + '--procModifiers': 'alpaka', + '--customise' : 'HLTrigger/Configuration/customizeHLTforAlpaka.customizeHLTforAlpakaEcalLocalReco' + }, + reco = { + '-s': 'RAW2DIGI:RawToDigi_ecalOnly,RECO:reconstruction_ecalOnly,VALIDATION:@ecalOnlyValidation,DQM:@ecalOnly', + '--procModifiers': 'alpaka' + }, + harvest = { + '-s': 'HARVESTING:@ecalOnlyValidation+@ecal' + }, + suffix = 'Patatrack_ECALOnlyAlpaka', + offset = 0.411, +) + # ECAL-only workflow running on CPU # - HLT on CPU # - ECAL-only reconstruction on CPU, with DQM and validation diff --git a/Configuration/StandardSequences/python/RawToDigi_Repacked_cff.py b/Configuration/StandardSequences/python/RawToDigi_Repacked_cff.py index 3d64a8c1c4912..e40eaee29c6e1 100644 --- a/Configuration/StandardSequences/python/RawToDigi_Repacked_cff.py +++ b/Configuration/StandardSequences/python/RawToDigi_Repacked_cff.py @@ -2,6 +2,9 @@ from Configuration.StandardSequences.RawToDigi_cff import * +from Configuration.ProcessModifiers.gpu_cff import gpu +from Configuration.ProcessModifiers.alpaka_cff import alpaka + scalersRawToDigi.scalersInputTag = 'rawDataRepacker' csctfDigis.producer = 'rawDataRepacker' dttfDigis.DTTF_FED_Source = 'rawDataRepacker' @@ -10,7 +13,9 @@ gtEvmDigis.EvmGtInputTag = 'rawDataRepacker' siPixelDigis.cpu.InputLabel = 'rawDataRepacker' siStripDigis.ProductLabel = 'rawDataRepacker' -ecalDigis.cpu.InputLabel = 'rawDataRepacker' +ecalDigisCPU.InputLabel = 'rawDataRepacker' +gpu.toModify(ecalDigisGPU, InputLabel = 'rawDataRepacker') +alpaka.toModify(ecalDigisPortable, InputLabel = 'rawDataRepacker') ecalPreshowerDigis.sourceTag = 'rawDataRepacker' hcalDigis.InputLabel = 'rawDataRepacker' muonCSCDigis.InputObjects = 'rawDataRepacker' diff --git a/Configuration/StandardSequences/python/RawToDigi_cff.py b/Configuration/StandardSequences/python/RawToDigi_cff.py index c245488f29ef7..321e5daa02370 100644 --- a/Configuration/StandardSequences/python/RawToDigi_cff.py +++ b/Configuration/StandardSequences/python/RawToDigi_cff.py @@ -73,9 +73,14 @@ RawToDigiTask_hcalOnly = cms.Task(hcalDigis) RawToDigi_hcalOnly = cms.Sequence(RawToDigiTask_hcalOnly) +from Configuration.ProcessModifiers.gpu_cff import gpu +from Configuration.ProcessModifiers.alpaka_cff import alpaka + scalersRawToDigi.scalersInputTag = 'rawDataCollector' siPixelDigis.cpu.InputLabel = 'rawDataCollector' -ecalDigis.cpu.InputLabel = 'rawDataCollector' +ecalDigisCPU.InputLabel = 'rawDataCollector' +gpu.toModify(ecalDigisGPU, InputLabel = 'rawDataCollector') +alpaka.toModify(ecalDigisPortable, InputLabel = 'rawDataCollector') ecalPreshowerDigis.sourceTag = 'rawDataCollector' hcalDigis.InputLabel = 'rawDataCollector' muonCSCDigis.InputObjects = 'rawDataCollector' diff --git a/Configuration/StandardSequences/python/SimL1EmulatorRepack_GCTGT_cff.py b/Configuration/StandardSequences/python/SimL1EmulatorRepack_GCTGT_cff.py index dbcc43f08352c..3a1d0505b4d8c 100644 --- a/Configuration/StandardSequences/python/SimL1EmulatorRepack_GCTGT_cff.py +++ b/Configuration/StandardSequences/python/SimL1EmulatorRepack_GCTGT_cff.py @@ -26,8 +26,14 @@ ## run the L1 emulator ## +from Configuration.ProcessModifiers.gpu_cff import gpu +from Configuration.ProcessModifiers.alpaka_cff import alpaka + from L1Trigger.L1TCalorimeter.L1TCaloStage1_PPFromRaw_cff import * -ecalDigis.cpu.InputLabel = cms.InputTag( 'rawDataCollector', processName=cms.InputTag.skipCurrentProcess()) +from EventFilter.EcalRawToDigi.ecalDigis_cff import ecalDigisCPU, ecalDigisGPU, ecalDigisPortable +ecalDigisCPU.InputLabel = cms.InputTag('rawDataCollector', processName=cms.InputTag.skipCurrentProcess()) +gpu.toModify(ecalDigisGPU, InputLabel = cms.InputTag('rawDataCollector', processName=cms.InputTag.skipCurrentProcess())) +alpaka.toModify(ecalDigisPortable, InputLabel = cms.InputTag('rawDataCollector', processName=cms.InputTag.skipCurrentProcess())) hcalDigis.InputLabel = cms.InputTag( 'rawDataCollector', processName=cms.InputTag.skipCurrentProcess()) simHcalTriggerPrimitiveDigis.InputTagFEDRaw = cms.InputTag( 'rawDataCollector', processName=cms.InputTag.skipCurrentProcess()) diff --git a/DQM/Integration/python/clients/beam_dqm_sourceclient-live_cfg.py b/DQM/Integration/python/clients/beam_dqm_sourceclient-live_cfg.py index 48131b4e4bde6..a15d331b490d1 100644 --- a/DQM/Integration/python/clients/beam_dqm_sourceclient-live_cfg.py +++ b/DQM/Integration/python/clients/beam_dqm_sourceclient-live_cfg.py @@ -17,12 +17,10 @@ from Configuration.Eras.Era_Run3_cff import Run3 process = cms.Process("BeamMonitorLegacy", Run3) -process.MessageLogger = cms.Service("MessageLogger", - debugModules = cms.untracked.vstring('*'), - cerr = cms.untracked.PSet( - threshold = cms.untracked.string('WARNING') - ), - destinations = cms.untracked.vstring('cerr') +process.load('FWCore.MessageService.MessageLogger_cfi') +process.MessageLogger.debugModules = cms.untracked.vstring('*') +process.MessageLogger.cerr = cms.untracked.PSet( + threshold = cms.untracked.string('WARNING') ) # switch @@ -309,7 +307,7 @@ process.castorDigis.InputLabel = rawDataInputTag process.csctfDigis.producer = rawDataInputTag process.dttfDigis.DTTF_FED_Source = rawDataInputTag -process.ecalDigis.cpu.InputLabel = rawDataInputTag +process.ecalDigisCPU.InputLabel = rawDataInputTag process.ecalPreshowerDigis.sourceTag = rawDataInputTag process.gctDigis.inputLabel = rawDataInputTag process.gtDigis.DaqGtInputTag = rawDataInputTag diff --git a/DQM/Integration/python/clients/beamfake_dqm_sourceclient-live_cfg.py b/DQM/Integration/python/clients/beamfake_dqm_sourceclient-live_cfg.py index 589cb0bd790f5..c1ce11e58c568 100644 --- a/DQM/Integration/python/clients/beamfake_dqm_sourceclient-live_cfg.py +++ b/DQM/Integration/python/clients/beamfake_dqm_sourceclient-live_cfg.py @@ -121,7 +121,7 @@ """ process.castorDigis.InputLabel = rawDataInputTag process.csctfDigis.producer = rawDataInputTag process.dttfDigis.DTTF_FED_Source = rawDataInputTag -process.ecalDigis.cpu.InputLabel = rawDataInputTag +process.ecalDigisCPU.InputLabel = rawDataInputTag process.ecalPreshowerDigis.sourceTag = rawDataInputTag process.gctDigis.inputLabel = rawDataInputTag process.gtDigis.DaqGtInputTag = rawDataInputTag diff --git a/DQM/Integration/python/clients/beampixel_dqm_sourceclient-live_cfg.py b/DQM/Integration/python/clients/beampixel_dqm_sourceclient-live_cfg.py index d91ba52ffc396..a20d7e6435458 100644 --- a/DQM/Integration/python/clients/beampixel_dqm_sourceclient-live_cfg.py +++ b/DQM/Integration/python/clients/beampixel_dqm_sourceclient-live_cfg.py @@ -121,7 +121,7 @@ process.castorDigis.InputLabel = "rawDataCollector" process.csctfDigis.producer = "rawDataCollector" process.dttfDigis.DTTF_FED_Source = "rawDataCollector" - process.ecalDigis.cpu.InputLabel = "rawDataCollector" + process.ecalDigisCPU.InputLabel = "rawDataCollector" process.ecalPreshowerDigis.sourceTag = "rawDataCollector" process.gctDigis.inputLabel = "rawDataCollector" process.gtDigis.DaqGtInputTag = "rawDataCollector" @@ -174,7 +174,7 @@ process.castorDigis.InputLabel = "rawDataRepacker" process.csctfDigis.producer = "rawDataRepacker" process.dttfDigis.DTTF_FED_Source = "rawDataRepacker" - process.ecalDigis.cpu.InputLabel = "rawDataRepacker" + process.ecalDigisCPU.InputLabel = "rawDataRepacker" process.ecalPreshowerDigis.sourceTag = "rawDataRepacker" process.gctDigis.inputLabel = "rawDataRepacker" process.gtDigis.DaqGtInputTag = "rawDataRepacker" diff --git a/DQM/Integration/python/clients/csc_dqm_sourceclient-live_cfg.py b/DQM/Integration/python/clients/csc_dqm_sourceclient-live_cfg.py index 6630a69535dfa..d35d5114bf361 100644 --- a/DQM/Integration/python/clients/csc_dqm_sourceclient-live_cfg.py +++ b/DQM/Integration/python/clients/csc_dqm_sourceclient-live_cfg.py @@ -180,7 +180,7 @@ process.castorDigis.InputLabel = "rawDataCollector" process.csctfDigis.producer = "rawDataCollector" process.dttfDigis.DTTF_FED_Source = "rawDataCollector" -process.ecalDigis.cpu.InputLabel = "rawDataCollector" +process.ecalDigisCPU.InputLabel = "rawDataCollector" process.ecalPreshowerDigis.sourceTag = "rawDataCollector" process.gctDigis.inputLabel = "rawDataCollector" process.gtDigis.DaqGtInputTag = "rawDataCollector" @@ -205,7 +205,7 @@ process.castorDigis.InputLabel = "rawDataRepacker" process.csctfDigis.producer = "rawDataRepacker" process.dttfDigis.DTTF_FED_Source = "rawDataRepacker" - process.ecalDigis.cpu.InputLabel = "rawDataRepacker" + process.ecalDigisCPU.InputLabel = "rawDataRepacker" process.ecalPreshowerDigis.sourceTag = "rawDataRepacker" process.gctDigis.inputLabel = "rawDataRepacker" process.gtDigis.DaqGtInputTag = "rawDataRepacker" diff --git a/DQM/Integration/python/clients/ecal_dqm_sourceclient-live_cfg.py b/DQM/Integration/python/clients/ecal_dqm_sourceclient-live_cfg.py index ef96ea65c011a..5c4dc363968ea 100644 --- a/DQM/Integration/python/clients/ecal_dqm_sourceclient-live_cfg.py +++ b/DQM/Integration/python/clients/ecal_dqm_sourceclient-live_cfg.py @@ -53,13 +53,13 @@ ### Individual module setups ### # Use the ratio timing method for the online DQM -process.ecalMultiFitUncalibRecHit.cpu.algoPSet.timealgo = "RatioMethod" -process.ecalMultiFitUncalibRecHit.cpu.algoPSet.outOfTimeThresholdGain12pEB = 5. -process.ecalMultiFitUncalibRecHit.cpu.algoPSet.outOfTimeThresholdGain12mEB = 5. -process.ecalMultiFitUncalibRecHit.cpu.algoPSet.outOfTimeThresholdGain61pEB = 5. -process.ecalMultiFitUncalibRecHit.cpu.algoPSet.outOfTimeThresholdGain61mEB = 5. -process.ecalMultiFitUncalibRecHit.cpu.algoPSet.timeCalibTag = ':' -process.ecalMultiFitUncalibRecHit.cpu.algoPSet.timeOffsetTag = ':' +process.ecalMultiFitUncalibRecHitCPU.algoPSet.timealgo = "RatioMethod" +process.ecalMultiFitUncalibRecHitCPU.algoPSet.outOfTimeThresholdGain12pEB = 5. +process.ecalMultiFitUncalibRecHitCPU.algoPSet.outOfTimeThresholdGain12mEB = 5. +process.ecalMultiFitUncalibRecHitCPU.algoPSet.outOfTimeThresholdGain61pEB = 5. +process.ecalMultiFitUncalibRecHitCPU.algoPSet.outOfTimeThresholdGain61mEB = 5. +process.ecalMultiFitUncalibRecHitCPU.algoPSet.timeCalibTag = ':' +process.ecalMultiFitUncalibRecHitCPU.algoPSet.timeOffsetTag = ':' process.ecalPhysicsFilter = cms.EDFilter("EcalMonitorPrescaler", cosmics = cms.untracked.uint32(1), @@ -67,31 +67,26 @@ EcalRawDataCollection = cms.InputTag("ecalDigis") ) -process.MessageLogger = cms.Service("MessageLogger", - cerr = cms.untracked.PSet( - default = cms.untracked.PSet( - limit = cms.untracked.int32(-1) - ), - EcalLaserDbService = cms.untracked.PSet( - limit = cms.untracked.int32(10) - ), - noTimeStamps = cms.untracked.bool(True), - threshold = cms.untracked.string('WARNING'), - noLineBreaks = cms.untracked.bool(True) +process.load('FWCore.MessageService.MessageLogger_cfi') +process.MessageLogger.cerr = cms.untracked.PSet( + default = cms.untracked.PSet( + limit = cms.untracked.int32(-1) ), - cout = cms.untracked.PSet( - default = cms.untracked.PSet( - limit = cms.untracked.int32(0) - ), - EcalDQM = cms.untracked.PSet( - limit = cms.untracked.int32(-1) - ), - threshold = cms.untracked.string('INFO') + EcalLaserDbService = cms.untracked.PSet( + limit = cms.untracked.int32(10) ), - categories = cms.untracked.vstring('EcalDQM', - 'EcalLaserDbService'), - destinations = cms.untracked.vstring('cerr', - 'cout') + noTimeStamps = cms.untracked.bool(True), + threshold = cms.untracked.string('WARNING'), + noLineBreaks = cms.untracked.bool(True) +) +process.MessageLogger.cout = cms.untracked.PSet( + default = cms.untracked.PSet( + limit = cms.untracked.int32(0) + ), + EcalDQM = cms.untracked.PSet( + limit = cms.untracked.int32(-1) + ), + threshold = cms.untracked.string('INFO') ) process.maxEvents = cms.untracked.PSet( @@ -196,7 +191,7 @@ process.ecalMonitorTask.workerParameters.PresampleTask.params.doPulseMaxCheck = False elif runTypeName == 'hi_run': process.ecalMonitorTask.collectionTags.Source = "rawDataRepacker" - process.ecalDigis.cpu.InputLabel = 'rawDataRepacker' + process.ecalDigisCPU.InputLabel = 'rawDataRepacker' elif runTypeName == 'hpu_run': if not unitTest: process.source.SelectEvents = cms.untracked.PSet(SelectEvents = cms.vstring('*')) diff --git a/DQM/Integration/python/clients/l1t_dqm_sourceclient-live_cfg.py b/DQM/Integration/python/clients/l1t_dqm_sourceclient-live_cfg.py index 84a996a8e0251..b356b80900fa9 100644 --- a/DQM/Integration/python/clients/l1t_dqm_sourceclient-live_cfg.py +++ b/DQM/Integration/python/clients/l1t_dqm_sourceclient-live_cfg.py @@ -173,7 +173,7 @@ process.castorDigis.InputLabel = "rawDataCollector" process.csctfDigis.producer = "rawDataCollector" process.dttfDigis.DTTF_FED_Source = "rawDataCollector" -process.ecalDigis.cpu.InputLabel = "rawDataCollector" +process.ecalDigisCPU.InputLabel = "rawDataCollector" process.ecalPreshowerDigis.sourceTag = "rawDataCollector" process.gctDigis.inputLabel = "rawDataCollector" process.gtDigis.DaqGtInputTag = "rawDataCollector" @@ -192,7 +192,7 @@ process.castorDigis.InputLabel = "rawDataRepacker" process.csctfDigis.producer = "rawDataRepacker" process.dttfDigis.DTTF_FED_Source = "rawDataRepacker" - process.ecalDigis.cpu.InputLabel = "rawDataRepacker" + process.ecalDigisCPU.InputLabel = "rawDataRepacker" process.ecalPreshowerDigis.sourceTag = "rawDataRepacker" process.gctDigis.inputLabel = "rawDataRepacker" process.gtDigis.DaqGtInputTag = "rawDataRepacker" diff --git a/DQM/Integration/python/clients/l1temulator_dqm_sourceclient-live_cfg.py b/DQM/Integration/python/clients/l1temulator_dqm_sourceclient-live_cfg.py index c42e7eabcb60c..6435e7e224413 100644 --- a/DQM/Integration/python/clients/l1temulator_dqm_sourceclient-live_cfg.py +++ b/DQM/Integration/python/clients/l1temulator_dqm_sourceclient-live_cfg.py @@ -186,7 +186,7 @@ process.castorDigis.InputLabel = "rawDataCollector" process.csctfDigis.producer = "rawDataCollector" process.dttfDigis.DTTF_FED_Source = "rawDataCollector" -process.ecalDigis.cpu.InputLabel = "rawDataCollector" +process.ecalDigisCPU.InputLabel = "rawDataCollector" process.ecalPreshowerDigis.sourceTag = "rawDataCollector" process.gctDigis.inputLabel = "rawDataCollector" process.gtDigis.DaqGtInputTag = "rawDataCollector" @@ -208,7 +208,7 @@ process.castorDigis.InputLabel = "rawDataRepacker" process.csctfDigis.producer = "rawDataRepacker" process.dttfDigis.DTTF_FED_Source = "rawDataRepacker" - process.ecalDigis.cpu.InputLabel = "rawDataRepacker" + process.ecalDigisCPU.InputLabel = "rawDataRepacker" process.ecalPreshowerDigis.sourceTag = "rawDataRepacker" process.gctDigis.inputLabel = "rawDataRepacker" process.gtDigis.DaqGtInputTag = "rawDataRepacker" diff --git a/DQM/Integration/python/clients/l1tstage1_dqm_sourceclient-live_cfg.py b/DQM/Integration/python/clients/l1tstage1_dqm_sourceclient-live_cfg.py index a71cea1aef341..47272fe19a1a0 100644 --- a/DQM/Integration/python/clients/l1tstage1_dqm_sourceclient-live_cfg.py +++ b/DQM/Integration/python/clients/l1tstage1_dqm_sourceclient-live_cfg.py @@ -183,7 +183,7 @@ process.castorDigis.InputLabel = "rawDataCollector" process.csctfDigis.producer = "rawDataCollector" process.dttfDigis.DTTF_FED_Source = "rawDataCollector" -process.ecalDigis.cpu.InputLabel = "rawDataCollector" +process.ecalDigisCPU.InputLabel = "rawDataCollector" process.ecalPreshowerDigis.sourceTag = "rawDataCollector" process.gctDigis.inputLabel = "rawDataCollector" process.gtDigis.DaqGtInputTag = "rawDataCollector" @@ -202,7 +202,7 @@ process.castorDigis.InputLabel = "rawDataRepacker" process.csctfDigis.producer = "rawDataRepacker" process.dttfDigis.DTTF_FED_Source = "rawDataRepacker" - process.ecalDigis.cpu.InputLabel = "rawDataRepacker" + process.ecalDigisCPU.InputLabel = "rawDataRepacker" process.ecalPreshowerDigis.sourceTag = "rawDataRepacker" process.gctDigis.inputLabel = "rawDataRepacker" process.gtDigis.DaqGtInputTag = "rawDataRepacker" diff --git a/DQM/Integration/python/clients/l1tstage1emulator_dqm_sourceclient-live_cfg.py b/DQM/Integration/python/clients/l1tstage1emulator_dqm_sourceclient-live_cfg.py index 07821ec686fdd..5b8559bc502d5 100644 --- a/DQM/Integration/python/clients/l1tstage1emulator_dqm_sourceclient-live_cfg.py +++ b/DQM/Integration/python/clients/l1tstage1emulator_dqm_sourceclient-live_cfg.py @@ -195,7 +195,7 @@ process.castorDigis.InputLabel = "rawDataCollector" process.csctfDigis.producer = "rawDataCollector" process.dttfDigis.DTTF_FED_Source = "rawDataCollector" -process.ecalDigis.cpu.InputLabel = "rawDataCollector" +process.ecalDigisCPU.InputLabel = "rawDataCollector" process.ecalPreshowerDigis.sourceTag = "rawDataCollector" process.gctDigis.inputLabel = "rawDataCollector" process.gtDigis.DaqGtInputTag = "rawDataCollector" @@ -217,7 +217,7 @@ process.castorDigis.InputLabel = "rawDataRepacker" process.csctfDigis.producer = "rawDataRepacker" process.dttfDigis.DTTF_FED_Source = "rawDataRepacker" - process.ecalDigis.cpu.InputLabel = "rawDataRepacker" + process.ecalDigisCPU.InputLabel = "rawDataRepacker" process.ecalPreshowerDigis.sourceTag = "rawDataRepacker" process.gctDigis.inputLabel = "rawDataRepacker" process.gtDigis.DaqGtInputTag = "rawDataRepacker" diff --git a/DQM/Integration/python/clients/l1tstage2_dqm_sourceclient-live_cfg.py b/DQM/Integration/python/clients/l1tstage2_dqm_sourceclient-live_cfg.py index 1e08647bed02c..0350ce6412c73 100644 --- a/DQM/Integration/python/clients/l1tstage2_dqm_sourceclient-live_cfg.py +++ b/DQM/Integration/python/clients/l1tstage2_dqm_sourceclient-live_cfg.py @@ -132,7 +132,7 @@ process.castorDigis.InputLabel = rawDataRepackerLabel process.ctppsDiamondRawToDigi.rawDataTag = rawDataRepackerLabel process.ctppsPixelDigis.inputLabel = rawDataRepackerLabel - process.ecalDigis.cpu.InputLabel = rawDataRepackerLabel + process.ecalDigisCPU.InputLabel = rawDataRepackerLabel process.ecalPreshowerDigis.sourceTag = rawDataRepackerLabel process.hcalDigis.InputLabel = rawDataRepackerLabel process.muonCSCDigis.InputObjects = rawDataRepackerLabel diff --git a/DQM/Integration/python/clients/l1tstage2emulator_dqm_sourceclient-live_cfg.py b/DQM/Integration/python/clients/l1tstage2emulator_dqm_sourceclient-live_cfg.py index 50f00b5cea742..41e11e6a4bd97 100644 --- a/DQM/Integration/python/clients/l1tstage2emulator_dqm_sourceclient-live_cfg.py +++ b/DQM/Integration/python/clients/l1tstage2emulator_dqm_sourceclient-live_cfg.py @@ -131,7 +131,7 @@ process.castorDigis.InputLabel = rawDataRepackerLabel process.ctppsDiamondRawToDigi.rawDataTag = rawDataRepackerLabel process.ctppsPixelDigis.inputLabel = rawDataRepackerLabel - process.ecalDigis.cpu.InputLabel = rawDataRepackerLabel + process.ecalDigisCPU.InputLabel = rawDataRepackerLabel process.ecalPreshowerDigis.sourceTag = rawDataRepackerLabel process.hcalDigis.InputLabel = rawDataRepackerLabel process.muonCSCDigis.InputObjects = rawDataRepackerLabel diff --git a/DQM/Integration/python/clients/pixel_dqm_sourceclient-live_cfg.py b/DQM/Integration/python/clients/pixel_dqm_sourceclient-live_cfg.py index 7df9fa22ac802..c069029538198 100644 --- a/DQM/Integration/python/clients/pixel_dqm_sourceclient-live_cfg.py +++ b/DQM/Integration/python/clients/pixel_dqm_sourceclient-live_cfg.py @@ -22,14 +22,12 @@ TAG ="PixelPhase1" -process.MessageLogger = cms.Service("MessageLogger", - debugModules = cms.untracked.vstring('siPixelDigis', - 'siStripClusters', - 'SiPixelRawDataErrorSource', - 'SiPixelDigiSource'), - cout = cms.untracked.PSet(threshold = cms.untracked.string('ERROR')), - destinations = cms.untracked.vstring('cout') -) +process.load('FWCore.MessageService.MessageLogger_cfi') +process.MessageLogger.debugModules = cms.untracked.vstring('siPixelDigis', + 'siStripClusters', + 'SiPixelRawDataErrorSource', + 'SiPixelDigiSource') +process.MessageLogger.cout = cms.untracked.PSet(threshold = cms.untracked.string('ERROR')) #---------------------------- # Event Source diff --git a/DQM/Integration/python/clients/scal_dqm_sourceclient-live_cfg.py b/DQM/Integration/python/clients/scal_dqm_sourceclient-live_cfg.py index 8388d384151ec..e677a2c05ede6 100644 --- a/DQM/Integration/python/clients/scal_dqm_sourceclient-live_cfg.py +++ b/DQM/Integration/python/clients/scal_dqm_sourceclient-live_cfg.py @@ -37,10 +37,8 @@ process.load("DQMServices.Components.DQMScalInfo_cfi") # message logger -process.MessageLogger = cms.Service("MessageLogger", - destinations = cms.untracked.vstring('cout'), - cout = cms.untracked.PSet(threshold = cms.untracked.string('WARNING')) - ) +process.load('FWCore.MessageService.MessageLogger_cfi') +process.MessageLogger.cout = cms.untracked.PSet(threshold = cms.untracked.string('WARNING')) # Global tag # Condition for P5 cluster @@ -91,7 +89,7 @@ process.castorDigis.InputLabel = "rawDataRepacker" process.csctfDigis.producer = "rawDataRepacker" process.dttfDigis.DTTF_FED_Source = "rawDataRepacker" - process.ecalDigis.cpu.InputLabel = "rawDataRepacker" + process.ecalDigisCPU.InputLabel = "rawDataRepacker" process.ecalPreshowerDigis.sourceTag = "rawDataRepacker" process.gctDigis.inputLabel = "rawDataRepacker" process.gtDigis.DaqGtInputTag = "rawDataRepacker" diff --git a/DQM/Integration/python/clients/sistrip_approx_dqm_sourceclient-live_cfg.py b/DQM/Integration/python/clients/sistrip_approx_dqm_sourceclient-live_cfg.py index 1708fc82aeae6..19f43ef65315e 100644 --- a/DQM/Integration/python/clients/sistrip_approx_dqm_sourceclient-live_cfg.py +++ b/DQM/Integration/python/clients/sistrip_approx_dqm_sourceclient-live_cfg.py @@ -9,15 +9,13 @@ from Configuration.Eras.Era_Run3_cff import Run3 process = cms.Process("SiStripApproxMonitor", Run3) -process.MessageLogger = cms.Service("MessageLogger", - debugModules = cms.untracked.vstring('siStripDigis', - 'siStripClusters', - 'siStripZeroSuppression', - 'SiStripClusterizer', - 'siStripApproximateClusterComparator'), - cout = cms.untracked.PSet(threshold = cms.untracked.string('ERROR')), - destinations = cms.untracked.vstring('cout') - ) +process.load('FWCore.MessageService.MessageLogger_cfi') +process.MessageLogger.debugModules = cms.untracked.vstring('siStripDigis', + 'siStripClusters', + 'siStripZeroSuppression', + 'SiStripClusterizer', + 'siStripApproximateClusterComparator') +process.MessageLogger.cout = cms.untracked.PSet(threshold = cms.untracked.string('ERROR')) live=True unitTest=False @@ -189,7 +187,7 @@ process.castorDigis.InputLabel = rawDataRepackerLabel process.csctfDigis.producer = rawDataRepackerLabel process.dttfDigis.DTTF_FED_Source = rawDataRepackerLabel - process.ecalDigis.cpu.InputLabel = rawDataRepackerLabel + process.ecalDigisCPU.InputLabel = rawDataRepackerLabel process.ecalPreshowerDigis.sourceTag = rawDataRepackerLabel process.gctDigis.inputLabel = rawDataRepackerLabel process.hcalDigis.InputLabel = rawDataRepackerLabel diff --git a/DQM/Integration/python/clients/sistrip_dqm_sourceclient-live_cfg.py b/DQM/Integration/python/clients/sistrip_dqm_sourceclient-live_cfg.py index e06ddfada3199..a784c2d35e345 100644 --- a/DQM/Integration/python/clients/sistrip_dqm_sourceclient-live_cfg.py +++ b/DQM/Integration/python/clients/sistrip_dqm_sourceclient-live_cfg.py @@ -9,14 +9,12 @@ from Configuration.Eras.Era_Run3_cff import Run3 process = cms.Process("SiStripMonitor", Run3) -process.MessageLogger = cms.Service("MessageLogger", - debugModules = cms.untracked.vstring('siStripDigis', - 'siStripClusters', - 'siStripZeroSuppression', - 'SiStripClusterizer'), - cout = cms.untracked.PSet(threshold = cms.untracked.string('ERROR')), - destinations = cms.untracked.vstring('cout') -) +process.load('FWCore.MessageService.MessageLogger_cfi') +process.MessageLogger.debugModules = cms.untracked.vstring('siStripDigis', + 'siStripClusters', + 'siStripZeroSuppression', + 'SiStripClusterizer') +process.MessageLogger.cout = cms.untracked.PSet(threshold = cms.untracked.string('ERROR')) live=True unitTest=False @@ -517,7 +515,7 @@ process.castorDigis.InputLabel = rawDataCollectorLabel process.csctfDigis.producer = rawDataCollectorLabel process.dttfDigis.DTTF_FED_Source = rawDataCollectorLabel -process.ecalDigis.cpu.InputLabel = rawDataCollectorLabel +process.ecalDigisCPU.InputLabel = rawDataCollectorLabel process.ecalPreshowerDigis.sourceTag = rawDataCollectorLabel process.gctDigis.inputLabel = rawDataCollectorLabel process.gtDigis.DaqGtInputTag = rawDataCollectorLabel @@ -541,7 +539,7 @@ process.castorDigis.InputLabel = rawDataRepackerLabel process.csctfDigis.producer = rawDataRepackerLabel process.dttfDigis.DTTF_FED_Source = rawDataRepackerLabel - process.ecalDigis.cpu.InputLabel = rawDataRepackerLabel + process.ecalDigisCPU.InputLabel = rawDataRepackerLabel process.ecalPreshowerDigis.sourceTag = rawDataRepackerLabel process.gctDigis.inputLabel = rawDataRepackerLabel process.hcalDigis.InputLabel = rawDataRepackerLabel diff --git a/DataFormats/CaloRecHit/interface/MultifitComputations.h b/DataFormats/CaloRecHit/interface/MultifitComputations.h index f2d57d2ddb1e7..253ba348dfaf7 100644 --- a/DataFormats/CaloRecHit/interface/MultifitComputations.h +++ b/DataFormats/CaloRecHit/interface/MultifitComputations.h @@ -413,7 +413,7 @@ namespace calo { // compute the gradient //w.tail(nactive) = Atb.tail(nactive) - (AtA * solution).tail(nactive); - Eigen::Index w_max_idx; + Eigen::Index w_max_idx = 0; float w_max = -std::numeric_limits::max(); for (int icol = npassive; icol < NPULSES; icol++) { auto const icol_real = pulseOffsets(icol); diff --git a/DataFormats/EcalDigi/interface/EcalDigiCollections.h b/DataFormats/EcalDigi/interface/EcalDigiCollections.h index 6e4a04066a1f3..5a86f3a0bf8a5 100644 --- a/DataFormats/EcalDigi/interface/EcalDigiCollections.h +++ b/DataFormats/EcalDigi/interface/EcalDigiCollections.h @@ -48,10 +48,13 @@ class EBDigiCollectionPh2 : public EcalDigiCollectionPh2 { class EcalDigiCollection : public edm::DataFrameContainer { public: typedef edm::DataFrameContainer::size_type size_type; - static const size_type MAXSAMPLES = 10; + static const size_type MAXSAMPLES = ecalPh1::sampleSize; explicit EcalDigiCollection(size_type istride = MAXSAMPLES, int isubdet = 0) : edm::DataFrameContainer(istride, isubdet) {} void swap(DataFrameContainer& other) { this->DataFrameContainer::swap(other); } + void swap(DataFrameContainer::IdContainer& otherIds, DataFrameContainer::DataContainer& otherData) { + this->DataFrameContainer::swap(otherIds, otherData); + } }; // make edm (and ecal client) happy @@ -63,6 +66,9 @@ class EBDigiCollection : public EcalDigiCollection { EBDigiCollection(size_type istride = MAXSAMPLES) : EcalDigiCollection(istride, EcalBarrel) {} void swap(EBDigiCollection& other) { this->EcalDigiCollection::swap(other); } + void swap(EBDigiCollection::IdContainer& otherIds, EBDigiCollection::DataContainer& otherData) { + this->EcalDigiCollection::swap(otherIds, otherData); + } void push_back(const Digi& digi) { DataFrameContainer::push_back(digi.id(), digi.frame().begin()); } void push_back(id_type iid) { DataFrameContainer::push_back(iid); } void push_back(id_type iid, data_type const* idata) { DataFrameContainer::push_back(iid, idata); } @@ -76,6 +82,9 @@ class EEDigiCollection : public EcalDigiCollection { EEDigiCollection(size_type istride = MAXSAMPLES) : EcalDigiCollection(istride, EcalEndcap) {} void swap(EEDigiCollection& other) { this->EcalDigiCollection::swap(other); } + void swap(EEDigiCollection::IdContainer& otherIds, EEDigiCollection::DataContainer& otherData) { + this->EcalDigiCollection::swap(otherIds, otherData); + } void push_back(const Digi& digi) { edm::DataFrameContainer::push_back(digi.id(), digi.frame().begin()); } void push_back(id_type iid) { DataFrameContainer::push_back(iid); } void push_back(id_type iid, data_type const* idata) { DataFrameContainer::push_back(iid, idata); } diff --git a/DataFormats/EcalRecHit/interface/RecoTypes.h b/DataFormats/EcalRecHit/interface/RecoTypes.h new file mode 100644 index 0000000000000..a7b1469fa57d3 --- /dev/null +++ b/DataFormats/EcalRecHit/interface/RecoTypes.h @@ -0,0 +1,13 @@ +#ifndef DataFormats_EcalRecHit_interface_RecoTypes_h +#define DataFormats_EcalRecHit_interface_RecoTypes_h + +namespace ecal { + namespace reco { + + using ComputationScalarType = float; + using StorageScalarType = float; + + } // namespace reco +} // namespace ecal + +#endif // DataFormats_EcalRecHit_interface_RecoTypes_h diff --git a/EventFilter/EcalRawToDigi/plugins/BuildFile.xml b/EventFilter/EcalRawToDigi/plugins/BuildFile.xml index 02b8be67a6522..ae261fc4de1df 100644 --- a/EventFilter/EcalRawToDigi/plugins/BuildFile.xml +++ b/EventFilter/EcalRawToDigi/plugins/BuildFile.xml @@ -1,13 +1,11 @@ - + - - @@ -16,5 +14,17 @@ + + + + + + + + + + + + diff --git a/EventFilter/EcalRawToDigi/plugins/EcalDigisFromPortableProducer.cc b/EventFilter/EcalRawToDigi/plugins/EcalDigisFromPortableProducer.cc new file mode 100644 index 0000000000000..d2c450f1ac2ed --- /dev/null +++ b/EventFilter/EcalRawToDigi/plugins/EcalDigisFromPortableProducer.cc @@ -0,0 +1,210 @@ +#include + +#include "DataFormats/EcalDetId/interface/EcalDetIdCollections.h" +#include "DataFormats/EcalDigi/interface/EcalConstants.h" +#include "DataFormats/EcalDigi/interface/EcalDigiCollections.h" +#include "DataFormats/EcalDigi/interface/EcalDigiHostCollection.h" +#include "DataFormats/EcalRawData/interface/EcalRawDataCollections.h" +#include "FWCore/Framework/interface/Event.h" +#include "FWCore/Framework/interface/EventSetup.h" +#include "FWCore/Framework/interface/MakerMacros.h" +#include "FWCore/Framework/interface/stream/EDProducer.h" +#include "FWCore/ParameterSet/interface/ParameterSet.h" +#include "FWCore/ParameterSet/interface/ParameterSetDescription.h" + +class EcalDigisFromPortableProducer : public edm::stream::EDProducer<> { +public: + explicit EcalDigisFromPortableProducer(edm::ParameterSet const& ps); + ~EcalDigisFromPortableProducer() override = default; + static void fillDescriptions(edm::ConfigurationDescriptions&); + +private: + void produce(edm::Event&, edm::EventSetup const&) override; + + template + edm::EDPutTokenT dummyProduces(ARGS&&... args) { + return (produceDummyIntegrityCollections_) ? produces(std::forward(args)...) + : edm::EDPutTokenT{}; + } + +private: + // input digi collections on host in SoA format + using InputProduct = EcalDigiHostCollection; + edm::EDGetTokenT digisInEBToken_; + edm::EDGetTokenT digisInEEToken_; + + // output digi collections in legacy format + edm::EDPutTokenT digisOutEBToken_; + edm::EDPutTokenT digisOutEEToken_; + + // whether to produce dummy integrity collections + bool produceDummyIntegrityCollections_; + + // dummy producer collections + edm::EDPutTokenT ebSrFlagToken_; + edm::EDPutTokenT eeSrFlagToken_; + + // dummy ECAL raw data collection + edm::EDPutTokenT ecalRawDataToken_; + + // dummy integrity for xtal data + edm::EDPutTokenT ebIntegrityGainErrorsToken_; + edm::EDPutTokenT ebIntegrityGainSwitchErrorsToken_; + edm::EDPutTokenT ebIntegrityChIdErrorsToken_; + + // dummy integrity for xtal data - EE specific (to be rivisited towards EB+EE common collection) + edm::EDPutTokenT eeIntegrityGainErrorsToken_; + edm::EDPutTokenT eeIntegrityGainSwitchErrorsToken_; + edm::EDPutTokenT eeIntegrityChIdErrorsToken_; + + // dummy integrity errors + edm::EDPutTokenT integrityTTIdErrorsToken_; + edm::EDPutTokenT integrityZSXtalIdErrorsToken_; + edm::EDPutTokenT integrityBlockSizeErrorsToken_; + + edm::EDPutTokenT pnDiodeDigisToken_; + + // dummy TCC collections + edm::EDPutTokenT ecalTriggerPrimitivesToken_; + edm::EDPutTokenT ecalPseudoStripInputsToken_; + + // dummy mem integrity collections + edm::EDPutTokenT ecalIntegrityMemTtIdErrorsToken_; + edm::EDPutTokenT ecalIntegrityMemBlockSizeErrorsToken_; + edm::EDPutTokenT ecalIntegrityMemChIdErrorsToken_; + edm::EDPutTokenT ecalIntegrityMemGainErrorsToken_; +}; + +void EcalDigisFromPortableProducer::fillDescriptions(edm::ConfigurationDescriptions& confDesc) { + edm::ParameterSetDescription desc; + + desc.add("digisInLabelEB", edm::InputTag{"ecalRawToDigiPortable", "ebDigis"}); + desc.add("digisInLabelEE", edm::InputTag{"ecalRawToDigiPortable", "eeDigis"}); + desc.add("digisOutLabelEB", "ebDigis"); + desc.add("digisOutLabelEE", "eeDigis"); + desc.add("produceDummyIntegrityCollections", false); + + confDesc.add("ecalDigisFromPortableProducer", desc); +} + +EcalDigisFromPortableProducer::EcalDigisFromPortableProducer(const edm::ParameterSet& ps) + : // input digi collections on host in SoA format + digisInEBToken_{consumes(ps.getParameter("digisInLabelEB"))}, + digisInEEToken_{consumes(ps.getParameter("digisInLabelEE"))}, + + // output digi collections in legacy format + digisOutEBToken_{produces(ps.getParameter("digisOutLabelEB"))}, + digisOutEEToken_{produces(ps.getParameter("digisOutLabelEE"))}, + + // whether to produce dummy integrity collections + produceDummyIntegrityCollections_{ps.getParameter("produceDummyIntegrityCollections")}, + + // dummy collections + ebSrFlagToken_{dummyProduces()}, + eeSrFlagToken_{dummyProduces()}, + + // dummy ECAL raw data collection + ecalRawDataToken_{dummyProduces()}, + + // dummy integrity for xtal data + ebIntegrityGainErrorsToken_{dummyProduces("EcalIntegrityGainErrors")}, + ebIntegrityGainSwitchErrorsToken_{dummyProduces("EcalIntegrityGainSwitchErrors")}, + ebIntegrityChIdErrorsToken_{dummyProduces("EcalIntegrityChIdErrors")}, + + // dummy integrity for xtal data - EE specific (to be rivisited towards EB+EE common collection) + eeIntegrityGainErrorsToken_{dummyProduces("EcalIntegrityGainErrors")}, + eeIntegrityGainSwitchErrorsToken_{dummyProduces("EcalIntegrityGainSwitchErrors")}, + eeIntegrityChIdErrorsToken_{dummyProduces("EcalIntegrityChIdErrors")}, + + // dummy integrity errors + integrityTTIdErrorsToken_{dummyProduces("EcalIntegrityTTIdErrors")}, + integrityZSXtalIdErrorsToken_{dummyProduces("EcalIntegrityZSXtalIdErrors")}, + integrityBlockSizeErrorsToken_{dummyProduces("EcalIntegrityBlockSizeErrors")}, + + // + pnDiodeDigisToken_{dummyProduces()}, + + // dummy TCC collections + ecalTriggerPrimitivesToken_{dummyProduces("EcalTriggerPrimitives")}, + ecalPseudoStripInputsToken_{dummyProduces("EcalPseudoStripInputs")}, + + // dummy mem integrity collections + ecalIntegrityMemTtIdErrorsToken_{dummyProduces("EcalIntegrityMemTtIdErrors")}, + ecalIntegrityMemBlockSizeErrorsToken_{ + dummyProduces("EcalIntegrityMemBlockSizeErrors")}, + ecalIntegrityMemChIdErrorsToken_{dummyProduces("EcalIntegrityMemChIdErrors")}, + ecalIntegrityMemGainErrorsToken_{dummyProduces("EcalIntegrityMemGainErrors")} {} + +void EcalDigisFromPortableProducer::produce(edm::Event& event, edm::EventSetup const& setup) { + // output collections + auto digisEB = std::make_unique(); + auto digisEE = std::make_unique(); + + auto const& digisEBSoAHostColl = event.get(digisInEBToken_); + auto const& digisEESoAHostColl = event.get(digisInEEToken_); + auto& digisEBSoAView = digisEBSoAHostColl.view(); + auto& digisEESoAView = digisEESoAHostColl.view(); + + auto const digisEBSize = digisEBSoAView.size(); + auto const digisEESize = digisEESoAView.size(); + auto const digisEBDataSize = digisEBSize * ecalPh1::sampleSize; + auto const digisEEDataSize = digisEESize * ecalPh1::sampleSize; + + // Intermediate containers because the DigiCollection containers are accessible only as const + EBDigiCollection::IdContainer digisIdsEB; + EEDigiCollection::IdContainer digisIdsEE; + EBDigiCollection::DataContainer digisDataEB; + EEDigiCollection::DataContainer digisDataEE; + + digisIdsEB.resize(digisEBSize); + digisIdsEE.resize(digisEESize); + digisDataEB.resize(digisEBDataSize); + digisDataEE.resize(digisEEDataSize); + + // copy data + std::memcpy(digisIdsEB.data(), digisEBSoAView.id(), digisEBSize * sizeof(uint32_t)); + std::memcpy(digisIdsEE.data(), digisEESoAView.id(), digisEESize * sizeof(uint32_t)); + std::memcpy(digisDataEB.data(), digisEBSoAView.data()->data(), digisEBDataSize * sizeof(uint16_t)); + std::memcpy(digisDataEE.data(), digisEESoAView.data()->data(), digisEEDataSize * sizeof(uint16_t)); + + digisEB->swap(digisIdsEB, digisDataEB); + digisEE->swap(digisIdsEE, digisDataEE); + + digisEB->sort(); + digisEE->sort(); + + event.put(digisOutEBToken_, std::move(digisEB)); + event.put(digisOutEEToken_, std::move(digisEE)); + + if (produceDummyIntegrityCollections_) { + // dummy collections + event.emplace(ebSrFlagToken_); + event.emplace(eeSrFlagToken_); + // dummy ECAL raw data collection + event.emplace(ecalRawDataToken_); + // dummy integrity for xtal data + event.emplace(ebIntegrityGainErrorsToken_); + event.emplace(ebIntegrityGainSwitchErrorsToken_); + event.emplace(ebIntegrityChIdErrorsToken_); + // dummy integrity for xtal data - EE specific (to be rivisited towards EB+EE common collection) + event.emplace(eeIntegrityGainErrorsToken_); + event.emplace(eeIntegrityGainSwitchErrorsToken_); + event.emplace(eeIntegrityChIdErrorsToken_); + // dummy integrity errors + event.emplace(integrityTTIdErrorsToken_); + event.emplace(integrityZSXtalIdErrorsToken_); + event.emplace(integrityBlockSizeErrorsToken_); + // + event.emplace(pnDiodeDigisToken_); + // dummy TCC collections + event.emplace(ecalTriggerPrimitivesToken_); + event.emplace(ecalPseudoStripInputsToken_); + // dummy mem integrity collections + event.emplace(ecalIntegrityMemTtIdErrorsToken_); + event.emplace(ecalIntegrityMemBlockSizeErrorsToken_); + event.emplace(ecalIntegrityMemChIdErrorsToken_); + event.emplace(ecalIntegrityMemGainErrorsToken_); + } +} + +DEFINE_FWK_MODULE(EcalDigisFromPortableProducer); diff --git a/EventFilter/EcalRawToDigi/plugins/alpaka/DeclsForKernels.h b/EventFilter/EcalRawToDigi/plugins/alpaka/DeclsForKernels.h new file mode 100644 index 0000000000000..c91bad61e2dce --- /dev/null +++ b/EventFilter/EcalRawToDigi/plugins/alpaka/DeclsForKernels.h @@ -0,0 +1,43 @@ +#ifndef EventFilter_EcalRawToDigi_plugins_alpaka_DeclsForKernels_h +#define EventFilter_EcalRawToDigi_plugins_alpaka_DeclsForKernels_h + +#include "HeterogeneousCore/AlpakaInterface/interface/memory.h" +#include "HeterogeneousCore/AlpakaInterface/interface/config.h" + +namespace ALPAKA_ACCELERATOR_NAMESPACE::ecal::raw { + + struct InputDataHost { + // delete the default constructor because alpaka buffers do not have a default constructor + InputDataHost() = delete; + + explicit InputDataHost(const Queue& queue, size_t size, size_t nFeds) + : data{cms::alpakatools::make_host_buffer(queue, size)}, + offsets{cms::alpakatools::make_host_buffer(queue, nFeds)}, + feds{cms::alpakatools::make_host_buffer(queue, nFeds)} {}; + + cms::alpakatools::host_buffer data; + cms::alpakatools::host_buffer offsets; + cms::alpakatools::host_buffer feds; + }; + + struct ConfigurationParameters { + uint32_t maxChannelsEE; + uint32_t maxChannelsEB; + }; + + struct InputDataDevice { + InputDataDevice() = delete; + + explicit InputDataDevice(const Queue& queue, size_t size, size_t nFeds) + : data{cms::alpakatools::make_device_buffer(queue, size)}, + offsets{cms::alpakatools::make_device_buffer(queue, nFeds)}, + feds{cms::alpakatools::make_device_buffer(queue, nFeds)} {}; + + cms::alpakatools::device_buffer data; + cms::alpakatools::device_buffer offsets; + cms::alpakatools::device_buffer feds; + }; + +} // namespace ALPAKA_ACCELERATOR_NAMESPACE::ecal::raw + +#endif // EventFilter_EcalRawToDigi_plugins_alpaka_DeclsForKernels_h diff --git a/EventFilter/EcalRawToDigi/plugins/alpaka/EcalElectronicsMappingHostESProducer.cc b/EventFilter/EcalRawToDigi/plugins/alpaka/EcalElectronicsMappingHostESProducer.cc new file mode 100644 index 0000000000000..32708b201ef2d --- /dev/null +++ b/EventFilter/EcalRawToDigi/plugins/alpaka/EcalElectronicsMappingHostESProducer.cc @@ -0,0 +1,58 @@ +#include "FWCore/Framework/interface/ESTransientHandle.h" +#include "FWCore/ParameterSet/interface/ParameterSet.h" +#include "CondFormats/DataRecord/interface/EcalMappingElectronicsRcd.h" +#include "CondFormats/EcalObjects/interface/EcalMappingElectronics.h" +#include "CondFormats/EcalObjects/interface/alpaka/EcalElectronicsMappingDevice.h" +#include "DataFormats/EcalDetId/interface/EcalElectronicsId.h" +#include "HeterogeneousCore/AlpakaCore/interface/alpaka/ESGetToken.h" +#include "HeterogeneousCore/AlpakaCore/interface/alpaka/ESProducer.h" +#include "HeterogeneousCore/AlpakaCore/interface/alpaka/ModuleFactory.h" +#include "HeterogeneousCore/AlpakaInterface/interface/config.h" +#include "HeterogeneousCore/AlpakaInterface/interface/host.h" +#include "HeterogeneousCore/AlpakaInterface/interface/memory.h" + +namespace ALPAKA_ACCELERATOR_NAMESPACE { + class EcalElectronicsMappingHostESProducer : public ESProducer { + public: + EcalElectronicsMappingHostESProducer(edm::ParameterSet const& iConfig) : ESProducer(iConfig) { + auto cc = setWhatProduced(this); + token_ = cc.consumes(); + } + + static void fillDescriptions(edm::ConfigurationDescriptions& descriptions) { + edm::ParameterSetDescription desc; + descriptions.addWithDefaultLabel(desc); + } + + std::unique_ptr produce(EcalMappingElectronicsRcd const& iRecord) { + auto const& mapping = iRecord.get(token_); + + // TODO: 0x3FFFFF * 4B ~= 16MB + // tmp solution for linear mapping of eid -> did + int const size = 0x3FFFFF; + auto product = std::make_unique(size, cms::alpakatools::host()); + + // fill in eb + auto const& barrelValues = mapping.barrelItems(); + for (unsigned int i = 0; i < barrelValues.size(); ++i) { + EcalElectronicsId eid{barrelValues[i].electronicsid}; + EBDetId did{EBDetId::unhashIndex(i)}; + product->view()[eid.linearIndex()].rawid() = did.rawId(); + } + + // fill in ee + auto const& endcapValues = mapping.endcapItems(); + for (unsigned int i = 0; i < endcapValues.size(); ++i) { + EcalElectronicsId eid{endcapValues[i].electronicsid}; + EEDetId did{EEDetId::unhashIndex(i)}; + product->view()[eid.linearIndex()].rawid() = did.rawId(); + } + return product; + } + + private: + edm::ESGetToken token_; + }; +} // namespace ALPAKA_ACCELERATOR_NAMESPACE + +DEFINE_FWK_EVENTSETUP_ALPAKA_MODULE(EcalElectronicsMappingHostESProducer); diff --git a/EventFilter/EcalRawToDigi/plugins/alpaka/EcalRawToDigiPortable.cc b/EventFilter/EcalRawToDigi/plugins/alpaka/EcalRawToDigiPortable.cc new file mode 100644 index 0000000000000..7739cf15c0ab3 --- /dev/null +++ b/EventFilter/EcalRawToDigi/plugins/alpaka/EcalRawToDigiPortable.cc @@ -0,0 +1,142 @@ +#include "CondFormats/DataRecord/interface/EcalMappingElectronicsRcd.h" +#include "CondFormats/EcalObjects/interface/alpaka/EcalElectronicsMappingDevice.h" +#include "DataFormats/EcalDigi/interface/alpaka/EcalDigiDeviceCollection.h" +#include "DataFormats/FEDRawData/interface/FEDRawDataCollection.h" +#include "EventFilter/EcalRawToDigi/interface/DCCRawDataDefinitions.h" +#include "FWCore/ParameterSet/interface/ConfigurationDescriptions.h" +#include "FWCore/ParameterSet/interface/ParameterSet.h" +#include "FWCore/ParameterSet/interface/ParameterSetDescription.h" +#include "FWCore/Utilities/interface/ESGetToken.h" +#include "HeterogeneousCore/AlpakaCore/interface/alpaka/EDGetToken.h" +#include "HeterogeneousCore/AlpakaCore/interface/alpaka/EDPutToken.h" +#include "HeterogeneousCore/AlpakaCore/interface/alpaka/Event.h" +#include "HeterogeneousCore/AlpakaCore/interface/alpaka/EventSetup.h" +#include "HeterogeneousCore/AlpakaCore/interface/alpaka/stream/EDProducer.h" + +#include + +#include "DeclsForKernels.h" +#include "UnpackPortable.h" + +namespace ALPAKA_ACCELERATOR_NAMESPACE { + + class EcalRawToDigiPortable : public stream::EDProducer<> { + public: + explicit EcalRawToDigiPortable(edm::ParameterSet const& ps); + ~EcalRawToDigiPortable() override = default; + static void fillDescriptions(edm::ConfigurationDescriptions&); + + void produce(device::Event&, device::EventSetup const&) override; + + private: + edm::EDGetTokenT rawDataToken_; + using OutputProduct = EcalDigiDeviceCollection; + device::EDPutToken digisDevEBToken_; + device::EDPutToken digisDevEEToken_; + device::ESGetToken eMappingToken_; + + std::vector fedsToUnpack_; + + ecal::raw::ConfigurationParameters config_; + }; + + void EcalRawToDigiPortable::fillDescriptions(edm::ConfigurationDescriptions& confDesc) { + edm::ParameterSetDescription desc; + + desc.add("InputLabel", edm::InputTag("rawDataCollector")); + std::vector feds(54); + for (uint32_t i = 0; i < 54; ++i) + feds[i] = i + 601; + desc.add>("FEDs", feds); + desc.add("maxChannelsEB", 61200); + desc.add("maxChannelsEE", 14648); + desc.add("digisLabelEB", "ebDigis"); + desc.add("digisLabelEE", "eeDigis"); + + confDesc.addWithDefaultLabel(desc); + } + + EcalRawToDigiPortable::EcalRawToDigiPortable(const edm::ParameterSet& ps) + : rawDataToken_{consumes(ps.getParameter("InputLabel"))}, + digisDevEBToken_{produces(ps.getParameter("digisLabelEB"))}, + digisDevEEToken_{produces(ps.getParameter("digisLabelEE"))}, + eMappingToken_{esConsumes()}, + fedsToUnpack_{ps.getParameter>("FEDs")} { + config_.maxChannelsEB = ps.getParameter("maxChannelsEB"); + config_.maxChannelsEE = ps.getParameter("maxChannelsEE"); + } + + void EcalRawToDigiPortable::produce(device::Event& event, device::EventSetup const& setup) { + // conditions + auto const& eMappingProduct = setup.getData(eMappingToken_); + + // event data + const auto rawDataHandle = event.getHandle(rawDataToken_); + + // make a first iteration over the FEDs to compute the total buffer size + uint32_t size = 0; + uint32_t feds = 0; + for (auto const& fed : fedsToUnpack_) { + auto const& data = rawDataHandle->FEDData(fed); + auto const nbytes = data.size(); + + // skip empty FEDs + if (nbytes < globalFieds::EMPTYEVENTSIZE) + continue; + + size += nbytes; + ++feds; + } + + auto& queue = event.queue(); + + // input host buffers + ecal::raw::InputDataHost inputHost(queue, size, feds); + + // output device collections + OutputProduct digisDevEB{static_cast(config_.maxChannelsEB), queue}; + OutputProduct digisDevEE{static_cast(config_.maxChannelsEE), queue}; + // reset the size scalar of the SoA + // memset takes an alpaka view that is created from the scalar in a view to the device collection + auto digiViewEB = cms::alpakatools::make_device_view(alpaka::getDev(queue), digisDevEB.view().size()); + auto digiViewEE = cms::alpakatools::make_device_view(alpaka::getDev(queue), digisDevEE.view().size()); + alpaka::memset(queue, digiViewEB, 0); + alpaka::memset(queue, digiViewEE, 0); + + // iterate over FEDs to fill the host buffer + uint32_t currentCummOffset = 0; + uint32_t fedCounter = 0; + for (auto const& fed : fedsToUnpack_) { + auto const& data = rawDataHandle->FEDData(fed); + auto const nbytes = data.size(); + + // skip empty FEDs + if (nbytes < globalFieds::EMPTYEVENTSIZE) + continue; + + // copy raw data into host buffer + std::memcpy(inputHost.data.data() + currentCummOffset, data.data(), nbytes); + // set the offset in bytes from the start + inputHost.offsets[fedCounter] = currentCummOffset; + inputHost.feds[fedCounter] = fed; + + // this is the current offset into the buffer + currentCummOffset += nbytes; + ++fedCounter; + } + assert(currentCummOffset == size); + assert(fedCounter == feds); + + // unpack if at least one FED has data + if (fedCounter > 0) { + ecal::raw::unpackRaw(queue, inputHost, digisDevEB, digisDevEE, eMappingProduct, fedCounter, currentCummOffset); + } + + event.emplace(digisDevEBToken_, std::move(digisDevEB)); + event.emplace(digisDevEEToken_, std::move(digisDevEE)); + } + +} // namespace ALPAKA_ACCELERATOR_NAMESPACE + +#include "HeterogeneousCore/AlpakaCore/interface/alpaka/MakerMacros.h" +DEFINE_FWK_ALPAKA_MODULE(EcalRawToDigiPortable); diff --git a/EventFilter/EcalRawToDigi/plugins/alpaka/UnpackPortable.dev.cc b/EventFilter/EcalRawToDigi/plugins/alpaka/UnpackPortable.dev.cc new file mode 100644 index 0000000000000..374a5a9c2c87f --- /dev/null +++ b/EventFilter/EcalRawToDigi/plugins/alpaka/UnpackPortable.dev.cc @@ -0,0 +1,441 @@ +#include + +#include "DataFormats/DetId/interface/DetId.h" +#include "DataFormats/EcalDigi/interface/EcalConstants.h" +#include "EventFilter/EcalRawToDigi/interface/ElectronicsIdGPU.h" +#include "EventFilter/EcalRawToDigi/interface/DCCRawDataDefinitions.h" +#include "HeterogeneousCore/AlpakaInterface/interface/workdivision.h" + +#include "UnpackPortable.h" + +namespace ALPAKA_ACCELERATOR_NAMESPACE::ecal::raw { + + using namespace ::ecal::raw; + using namespace cms::alpakatools; + + class Kernel_unpack { + public: + template >> + ALPAKA_FN_ACC void operator()(TAcc const& acc, + unsigned char const* __restrict__ data, + uint32_t const* __restrict__ offsets, + int const* __restrict__ feds, + EcalDigiDeviceCollection::View digisDevEB, + EcalDigiDeviceCollection::View digisDevEE, + EcalElectronicsMappingDevice::ConstView eid2did, + uint32_t const nbytesTotal) const { + constexpr auto kSampleSize = ecalPh1::sampleSize; + + // indices + auto const ifed = alpaka::getIdx(acc)[0u]; + auto const threadIdx = alpaka::getIdx(acc)[0u]; + + // offset in bytes + auto const offset = offsets[ifed]; + // fed id + auto const fed = feds[ifed]; + auto const isBarrel = is_barrel(static_cast(fed - 600)); + // size + auto const gridDim = alpaka::getWorkDiv(acc)[0u]; + auto const size = ifed == gridDim - 1 ? nbytesTotal - offset : offsets[ifed + 1] - offset; + auto* samples = isBarrel ? digisDevEB.data()->data() : digisDevEE.data()->data(); + auto* ids = isBarrel ? digisDevEB.id() : digisDevEE.id(); + auto* pChannelsCounter = isBarrel ? &digisDevEB.size() : &digisDevEE.size(); + + // offset to the right raw buffer + uint64_t const* buffer = reinterpret_cast(data + offset); + + // dump first 3 bits for each 64-bit word + //print_first3bits(buffer, size / 8); + + // + // fed header + // + auto const fed_header = buffer[0]; + uint32_t bx = (fed_header >> H_BX_B) & H_BX_MASK; + uint32_t lv1 = (fed_header >> H_L1_B) & H_L1_MASK; + uint32_t triggerType = (fed_header >> H_TTYPE_B) & H_TTYPE_MASK; + + // determine the number of FE channels from the trigger type + uint32_t numbChannels(0); + if (triggerType == PHYSICTRIGGER) { + numbChannels = NUMB_FE; + } else if (triggerType == CALIBRATIONTRIGGER) { + numbChannels = NUMB_FE + 2; // FE + 2 MEM blocks + } else { + // unsupported trigger type + return; + } + + // 9 for fed + dcc header + // 36 for 4 EE TCC blocks or 18 for 1 EB TCC block + // 6 for SR block size + + // dcc header w2 + auto const w2 = buffer[2]; + uint8_t const fov = (w2 >> H_FOV_B) & H_FOV_MASK; + + // make a list of channels with data from DCC header channels status + // this could be done for each block instead of each thread since it defined per FED + uint8_t exp_ttids[NUMB_FE + 2]; // FE + 2 MEM blocks + uint8_t ch = 1; + uint8_t nCh = 0; + for (uint8_t i = 4; i < 9; ++i) { // data words with channel status info + for (uint8_t j = 0; j < 14; ++j, ++ch) { // channel status fields in one data word + const uint8_t shift = j * 4; //each channel has 4 bits + const int chStatus = (buffer[i] >> shift) & H_CHSTATUS_MASK; + const bool regular = (chStatus == CH_DISABLED || chStatus == CH_SUPPRESS); + const bool problematic = + (chStatus == CH_TIMEOUT || chStatus == CH_HEADERERR || chStatus == CH_LINKERR || + chStatus == CH_LENGTHERR || chStatus == CH_IFIFOFULL || chStatus == CH_L1AIFIFOFULL); + if (!(regular || problematic)) { + exp_ttids[nCh] = ch; + ++nCh; + } + } + } + + // + // print Tower block headers + // + uint8_t ntccblockwords = isBarrel ? 18 : 36; + auto const* tower_blocks_start = buffer + 9 + ntccblockwords + 6; + auto const* trailer = buffer + (size / 8 - 1); + auto const* current_tower_block = tower_blocks_start; + uint8_t iCh = 0; + uint8_t next_tower_id = exp_ttids[iCh]; + while (current_tower_block < trailer && iCh < numbChannels) { + auto const w = *current_tower_block; + uint8_t ttid = w & TOWER_ID_MASK; + uint16_t bxlocal = (w >> TOWER_BX_B) & TOWER_BX_MASK; + uint16_t lv1local = (w >> TOWER_L1_B) & TOWER_L1_MASK; + uint16_t block_length = (w >> TOWER_LENGTH_B) & TOWER_LENGTH_MASK; + + // fast forward to the next good tower id (in case of recovery from an earlier header corruption) + while (exp_ttids[iCh] < next_tower_id) { + ++iCh; + } + ++iCh; + + // check if the tower id in the tower header is the one expected + // if not try to find the next good header, point the current_tower_block to it, and extract its tower id + // or break if there is none + if (ttid != next_tower_id) { + next_tower_id = find_next_tower_block(current_tower_block, trailer, bx, lv1); + if (next_tower_id < TOWER_ID_MASK) { + continue; + } else { + break; + } + } + + // prepare for the next iteration + next_tower_id = exp_ttids[iCh]; + + uint16_t const dccbx = bx & 0xfff; + uint16_t const dccl1 = lv1 & 0xfff; + // fov>=1 is required to support simulated data for which bx==bxlocal==0 + if (fov >= 1 && !is_synced_towerblock(dccbx, bxlocal, dccl1, lv1local)) { + current_tower_block += block_length; + continue; + } + + // go through all the channels + // get the next channel coordinates + uint32_t const nchannels = (block_length - 1) / 3; + + bool bad_block = false; + auto& ch_with_bad_block = alpaka::declareSharedVar(acc); + if (once_per_block(acc)) { + ch_with_bad_block = std::numeric_limits::max(); + } + // make sure the shared memory is initialised for all threads + alpaka::syncBlockThreads(acc); + + auto const threadsPerBlock = alpaka::getWorkDiv(acc)[0u]; + // 1 threads per channel in this block + // All threads enter the loop regardless if they will treat channel indices channel >= nchannels. + // The threads with excess indices perform no operations but also reach the syncBlockThreads() inside the loop. + for (uint32_t i = 0; i < nchannels; i += threadsPerBlock) { + auto const channel = i + threadIdx; + + uint64_t wdata; + uint8_t stripid; + uint8_t xtalid; + + // threads must be inside the range (no break here because of syncBlockThreads() afterwards) + if (channel < nchannels && channel < ch_with_bad_block) { + // inc the channel's counter and get the pos where to store + wdata = current_tower_block[1 + channel * 3]; + stripid = wdata & 0x7; + xtalid = (wdata >> 4) & 0x7; + + // check if the stripid and xtalid are in the allowed range and if not skip the rest of the block + if (stripid < ElectronicsIdGPU::MIN_STRIPID || stripid > ElectronicsIdGPU::MAX_STRIPID || + xtalid < ElectronicsIdGPU::MIN_XTALID || xtalid > ElectronicsIdGPU::MAX_XTALID) { + bad_block = true; + } + if (channel > 0) { + // check if the stripid has increased or that the xtalid has increased from the previous data word. If not something is wrong and the rest of the block is skipped. + auto const prev_channel = channel - 1; + auto const prevwdata = current_tower_block[1 + prev_channel * 3]; + uint8_t const laststripid = prevwdata & 0x7; + uint8_t const lastxtalid = (prevwdata >> 4) & 0x7; + if ((stripid == laststripid && xtalid <= lastxtalid) || (stripid < laststripid)) { + bad_block = true; + } + } + } + + // check if this thread has the lowest bad block + if (bad_block && channel < ch_with_bad_block) { + alpaka::atomicMin(acc, &ch_with_bad_block, channel, alpaka::hierarchy::Threads{}); + } + + // make sure that all threads that have to have set the ch_with_bad_block shared memory + alpaka::syncBlockThreads(acc); + + // threads outside of the range or bad block detected in this thread or one working on a lower block -> stop this loop iteration here + if (channel >= nchannels || channel >= ch_with_bad_block) { + continue; + } + + ElectronicsIdGPU eid{fed2dcc(fed), ttid, stripid, xtalid}; + auto const didraw = isBarrel ? compute_ebdetid(eid) : eid2did[eid.linearIndex()].rawid(); + // FIXME: what kind of channels are these guys + if (didraw == 0) + continue; + + // get samples + uint16_t sampleValues[kSampleSize]; + sampleValues[0] = (wdata >> 16) & 0x3fff; + sampleValues[1] = (wdata >> 32) & 0x3fff; + sampleValues[2] = (wdata >> 48) & 0x3fff; + auto const wdata1 = current_tower_block[2 + channel * 3]; + sampleValues[3] = wdata1 & 0x3fff; + sampleValues[4] = (wdata1 >> 16) & 0x3fff; + sampleValues[5] = (wdata1 >> 32) & 0x3fff; + sampleValues[6] = (wdata1 >> 48) & 0x3fff; + auto const wdata2 = current_tower_block[3 + channel * 3]; + sampleValues[7] = wdata2 & 0x3fff; + sampleValues[8] = (wdata2 >> 16) & 0x3fff; + sampleValues[9] = (wdata2 >> 32) & 0x3fff; + + // check gain + bool isSaturation = true; + short firstGainZeroSampID{-1}, firstGainZeroSampADC{-1}; + for (uint32_t si = 0; si < kSampleSize; ++si) { + if (gainId(sampleValues[si]) == 0) { + firstGainZeroSampID = si; + firstGainZeroSampADC = adc(sampleValues[si]); + break; + } + } + if (firstGainZeroSampID != -1) { + unsigned int plateauEnd = std::min(kSampleSize, (unsigned int)(firstGainZeroSampID + 5)); + for (unsigned int s = firstGainZeroSampID; s < plateauEnd; s++) { + if (!(gainId(sampleValues[s]) == 0 && adc(sampleValues[s]) == firstGainZeroSampADC)) { + isSaturation = false; + break; + } //it's not saturation + } + // get rid of channels which are stuck in gain0 + if (firstGainZeroSampID < 3) { + isSaturation = false; + } + if (!isSaturation) + continue; + } else { // there is no zero gainId sample + // gain switch check + short numGain = 1; + bool gainSwitchError = false; + for (unsigned int si = 1; si < kSampleSize; ++si) { + if ((gainId(sampleValues[si - 1]) > gainId(sampleValues[si])) && numGain < 5) + gainSwitchError = true; + if (gainId(sampleValues[si - 1]) == gainId(sampleValues[si])) + numGain++; + else + numGain = 1; + } + if (gainSwitchError) + continue; + } + + auto const pos = alpaka::atomicAdd(acc, pChannelsCounter, 1u, alpaka::hierarchy::Threads{}); + + // store to global + ids[pos] = didraw; + std::memcpy(&samples[pos * kSampleSize], sampleValues, kSampleSize * sizeof(uint16_t)); + } + + current_tower_block += block_length; + } + } + + private: + ALPAKA_FN_INLINE ALPAKA_FN_ACC void print_raw_buffer(uint8_t const* const buffer, + uint32_t const nbytes, + uint32_t const nbytes_per_row = 20) const { + for (uint32_t i = 0; i < nbytes; ++i) { + if (i % nbytes_per_row == 0 && i > 0) + printf("\n"); + printf("%02X ", buffer[i]); + } + } + + ALPAKA_FN_INLINE ALPAKA_FN_ACC void print_first3bits(uint64_t const* buffer, uint32_t size) const { + for (uint32_t i = 0; i < size; ++i) { + uint8_t const b61 = (buffer[i] >> 61) & 0x1; + uint8_t const b62 = (buffer[i] >> 62) & 0x1; + uint8_t const b63 = (buffer[i] >> 63) & 0x1; + printf("[word: %u] %u%u%u\n", i, b63, b62, b61); + } + } + + ALPAKA_FN_INLINE ALPAKA_FN_ACC bool is_barrel(uint8_t dccid) const { + return dccid >= ElectronicsIdGPU::MIN_DCCID_EBM && dccid <= ElectronicsIdGPU::MAX_DCCID_EBP; + } + + ALPAKA_FN_INLINE ALPAKA_FN_ACC uint8_t fed2dcc(int fed) const { return static_cast(fed - 600); } + + ALPAKA_FN_INLINE ALPAKA_FN_ACC int zside_for_eb(ElectronicsIdGPU const& eid) const { + int dcc = eid.dccId(); + return ((dcc >= ElectronicsIdGPU::MIN_DCCID_EBM && dcc <= ElectronicsIdGPU::MAX_DCCID_EBM)) ? -1 : 1; + } + + ALPAKA_FN_INLINE ALPAKA_FN_ACC uint8_t find_next_tower_block(uint64_t const*& current_tower_block, + uint64_t const* trailer, + uint32_t const bx, + uint32_t const lv1) const { + const auto* next_tower_block = current_tower_block + 1; // move forward to skip the broken header + + // expected LV1, BX, #TS + const uint64_t lv1local = ((lv1 - 1) & TOWER_L1_MASK); + const uint64_t bxlocal = (bx != 3564) ? bx : 0; + // The CPU unpacker also checks the # time samples expected in the header + // but those are currently not available here + + // construct tower header and mask + const uint64_t sign = 0xC0000000C0000000 + (lv1local << TOWER_L1_B) + (bxlocal << TOWER_BX_B); + const uint64_t mask = + 0xC0001000D0000000 + (uint64_t(TOWER_L1_MASK) << TOWER_L1_B) + (uint64_t(TOWER_BX_MASK) << TOWER_BX_B); + + while (next_tower_block < trailer) { + if ((*next_tower_block & mask) == sign) { + current_tower_block = next_tower_block; + return uint8_t(*next_tower_block & TOWER_ID_MASK); + } else { + ++next_tower_block; + } + } + return TOWER_ID_MASK; // return the maximum value + } + + ALPAKA_FN_INLINE ALPAKA_FN_ACC bool is_synced_towerblock(uint16_t const dccbx, + uint16_t const bx, + uint16_t const dccl1, + uint16_t const l1) const { + bool const bxsync = (bx == 0 && dccbx == 3564) || (bx == dccbx && dccbx != 3564); + bool const l1sync = (l1 == ((dccl1 - 1) & 0xfff)); + return bxsync && l1sync; + } + + ALPAKA_FN_INLINE ALPAKA_FN_ACC bool right_tower_for_eb(int tower) const { + // for EB, two types of tower (LVRB top/bottom) + return (tower > 12 && tower < 21) || (tower > 28 && tower < 37) || (tower > 44 && tower < 53) || + (tower > 60 && tower < 69); + } + + ALPAKA_FN_INLINE ALPAKA_FN_ACC uint32_t compute_ebdetid(ElectronicsIdGPU const& eid) const { + // as in Geometry/EcalMaping/.../EcalElectronicsMapping + auto const dcc = eid.dccId(); + auto const tower = eid.towerId(); + auto const strip = eid.stripId(); + auto const xtal = eid.xtalId(); + + int smid = 0; + int iphi = 0; + bool EBPlus = (zside_for_eb(eid) > 0); + bool EBMinus = !EBPlus; + + if (zside_for_eb(eid) < 0) { + smid = dcc + 19 - ElectronicsIdGPU::DCCID_PHI0_EBM; + iphi = (smid - 19) * ElectronicsIdGPU::kCrystalsInPhi; + iphi += 5 * ((tower - 1) % ElectronicsIdGPU::kTowersInPhi); + } else { + smid = dcc + 1 - ElectronicsIdGPU::DCCID_PHI0_EBP; + iphi = (smid - 1) * ElectronicsIdGPU::kCrystalsInPhi; + iphi += 5 * (ElectronicsIdGPU::kTowersInPhi - ((tower - 1) % ElectronicsIdGPU::kTowersInPhi) - 1); + } + + bool RightTower = right_tower_for_eb(tower); + int ieta = 5 * ((tower - 1) / ElectronicsIdGPU::kTowersInPhi) + 1; + if (RightTower) { + ieta += (strip - 1); + if (strip % 2 == 1) { + if (EBMinus) + iphi += (xtal - 1) + 1; + else + iphi += (4 - (xtal - 1)) + 1; + } else { + if (EBMinus) + iphi += (4 - (xtal - 1)) + 1; + else + iphi += (xtal - 1) + 1; + } + } else { + ieta += 4 - (strip - 1); + if (strip % 2 == 1) { + if (EBMinus) + iphi += (4 - (xtal - 1)) + 1; + else + iphi += (xtal - 1) + 1; + } else { + if (EBMinus) + iphi += (xtal - 1) + 1; + else + iphi += (4 - (xtal - 1)) + 1; + } + } + + if (zside_for_eb(eid) < 0) + ieta = -ieta; + + DetId did{DetId::Ecal, EcalBarrel}; + return did.rawId() | ((ieta > 0) ? (0x10000 | (ieta << 9)) : ((-ieta) << 9)) | (iphi & 0x1FF); + } + + ALPAKA_FN_INLINE ALPAKA_FN_ACC int adc(uint16_t sample) const { return sample & 0xfff; } + + ALPAKA_FN_INLINE ALPAKA_FN_ACC int gainId(uint16_t sample) const { return (sample >> 12) & 0x3; } + }; + + void unpackRaw(Queue& queue, + InputDataHost const& inputHost, + EcalDigiDeviceCollection& digisDevEB, + EcalDigiDeviceCollection& digisDevEE, + EcalElectronicsMappingDevice const& mapping, + uint32_t const nfedsWithData, + uint32_t const nbytesTotal) { + // input device buffers + ecal::raw::InputDataDevice inputDevice(queue, nbytesTotal, nfedsWithData); + + // transfer the raw data + alpaka::memcpy(queue, inputDevice.data, inputHost.data); + alpaka::memcpy(queue, inputDevice.offsets, inputHost.offsets); + alpaka::memcpy(queue, inputDevice.feds, inputHost.feds); + + auto workDiv = cms::alpakatools::make_workdiv(nfedsWithData, 32); // 32 channels per block + alpaka::exec(queue, + workDiv, + Kernel_unpack{}, + inputDevice.data.data(), + inputDevice.offsets.data(), + inputDevice.feds.data(), + digisDevEB.view(), + digisDevEE.view(), + mapping.const_view(), + nbytesTotal); + } + +} // namespace ALPAKA_ACCELERATOR_NAMESPACE::ecal::raw diff --git a/EventFilter/EcalRawToDigi/plugins/alpaka/UnpackPortable.h b/EventFilter/EcalRawToDigi/plugins/alpaka/UnpackPortable.h new file mode 100644 index 0000000000000..9204d2ff71965 --- /dev/null +++ b/EventFilter/EcalRawToDigi/plugins/alpaka/UnpackPortable.h @@ -0,0 +1,22 @@ +#ifndef EventFilter_EcalRawToDigi_plugins_alpaka_UnpackPortable_h +#define EventFilter_EcalRawToDigi_plugins_alpaka_UnpackPortable_h + +#include "CondFormats/EcalObjects/interface/alpaka/EcalElectronicsMappingDevice.h" +#include "DataFormats/EcalDigi/interface/alpaka/EcalDigiDeviceCollection.h" +#include "HeterogeneousCore/AlpakaInterface/interface/config.h" +#include "HeterogeneousCore/AlpakaInterface/interface/traits.h" +#include "DeclsForKernels.h" + +namespace ALPAKA_ACCELERATOR_NAMESPACE::ecal::raw { + + void unpackRaw(Queue& queue, + InputDataHost const& inputHost, + EcalDigiDeviceCollection& digisDevEB, + EcalDigiDeviceCollection& digisDevEE, + EcalElectronicsMappingDevice const& mapping, + uint32_t const nfedsWithData, + uint32_t const nbytesTotal); + +} // namespace ALPAKA_ACCELERATOR_NAMESPACE::ecal::raw + +#endif // EventFilter_EcalRawToDigi_plugins_alpaka_UnpackPortable_h diff --git a/EventFilter/EcalRawToDigi/python/ecalDigis_cff.py b/EventFilter/EcalRawToDigi/python/ecalDigis_cff.py index cd202d520303a..0710a87569343 100644 --- a/EventFilter/EcalRawToDigi/python/ecalDigis_cff.py +++ b/EventFilter/EcalRawToDigi/python/ecalDigis_cff.py @@ -3,8 +3,10 @@ # ECAL unpacker running on CPU from EventFilter.EcalRawToDigi.EcalUnpackerData_cfi import ecalEBunpacker as _ecalEBunpacker +ecalDigisCPU = _ecalEBunpacker.clone() + ecalDigis = SwitchProducerCUDA( - cpu = _ecalEBunpacker.clone() + cpu = ecalDigisCPU ) ecalDigisTask = cms.Task( @@ -12,6 +14,8 @@ ecalDigis ) +from Configuration.StandardSequences.Accelerators_cff import * + # process modifier to run on GPUs from Configuration.ProcessModifiers.gpu_cff import gpu @@ -22,10 +26,10 @@ from EventFilter.EcalRawToDigi.ecalRawToDigiGPU_cfi import ecalRawToDigiGPU as _ecalRawToDigiGPU ecalDigisGPU = _ecalRawToDigiGPU.clone() -# extend the SwitchProducer to add a case to copy the ECAL digis from GPU to CPU and covert them from SoA to legacy format +# extend the SwitchProducer to add a case to copy the ECAL digis from GPU to CPU and convert them from SoA to legacy format from EventFilter.EcalRawToDigi.ecalCPUDigisProducer_cfi import ecalCPUDigisProducer as _ecalCPUDigisProducer gpu.toModify(ecalDigis, - # copy the ECAL digis from GPU to CPU and covert them from SoA to legacy format + # copy the ECAL digis from GPU to CPU and convert them from SoA to legacy format cuda = _ecalCPUDigisProducer.clone( digisInLabelEB = ('ecalDigisGPU', 'ebDigis'), digisInLabelEE = ('ecalDigisGPU', 'eeDigis'), @@ -38,6 +42,37 @@ ecalElectronicsMappingGPUESProducer, # run the ECAL unpacker on GPU ecalDigisGPU, - # run the ECAL unpacker on CPU, or copy the ECAL digis from GPU to CPU and covert them from SoA to legacy format + # run the ECAL unpacker on CPU, or copy the ECAL digis from GPU to CPU and convert them from SoA to legacy format + ecalDigis +)) + +# process modifier to run alpaka implementation +from Configuration.ProcessModifiers.alpaka_cff import alpaka + +# ECAL conditions used by the portable unpacker +from EventFilter.EcalRawToDigi.ecalElectronicsMappingHostESProducer_cfi import ecalElectronicsMappingHostESProducer + +# alpaka ECAL unpacker +from EventFilter.EcalRawToDigi.ecalRawToDigiPortable_cfi import ecalRawToDigiPortable as _ecalRawToDigiPortable +ecalDigisPortable = _ecalRawToDigiPortable.clone() + +from EventFilter.EcalRawToDigi.ecalDigisFromPortableProducer_cfi import ecalDigisFromPortableProducer as _ecalDigisFromPortableProducer + +# replace the SwitchProducer branches with a module to copy the ECAL digis from the accelerator to CPU (if needed) and convert them from SoA to legacy format +_ecalDigisFromPortable = _ecalDigisFromPortableProducer.clone( + digisInLabelEB = 'ecalDigisPortable:ebDigis', + digisInLabelEE = 'ecalDigisPortable:eeDigis', + produceDummyIntegrityCollections = True +) +alpaka.toModify(ecalDigis, + cpu = _ecalDigisFromPortable.clone() +) + +alpaka.toReplaceWith(ecalDigisTask, cms.Task( + # ECAL conditions used by the portable unpacker + ecalElectronicsMappingHostESProducer, + # run the portable ECAL unpacker + ecalDigisPortable, + # copy the ECAL digis from GPU to CPU (if needed) and convert them from SoA to legacy format ecalDigis )) diff --git a/HLTrigger/Configuration/python/customizeHLTforAlpaka.py b/HLTrigger/Configuration/python/customizeHLTforAlpaka.py new file mode 100644 index 0000000000000..d2b8fa901461c --- /dev/null +++ b/HLTrigger/Configuration/python/customizeHLTforAlpaka.py @@ -0,0 +1,102 @@ +import FWCore.ParameterSet.Config as cms + +def customizeHLTforAlpakaEcalLocalReco(process): + process.load("HeterogeneousCore.AlpakaCore.ProcessAcceleratorAlpaka_cfi") + if hasattr(process, 'hltEcalDigisGPU'): + process.hltEcalDigisPortable = cms.EDProducer("EcalRawToDigiPortable@alpaka", + FEDs = process.hltEcalDigisGPU.FEDs, + InputLabel = process.hltEcalDigisGPU.InputLabel, + alpaka = cms.untracked.PSet( + backend = cms.untracked.string('') + ), + digisLabelEB = process.hltEcalDigisGPU.digisLabelEB, + digisLabelEE = process.hltEcalDigisGPU.digisLabelEE, + maxChannelsEB = process.hltEcalDigisGPU.maxChannelsEB, + maxChannelsEE = process.hltEcalDigisGPU.maxChannelsEE, + mightGet = cms.optional.untracked.vstring + ) + process.HLTDoFullUnpackingEgammaEcalWithoutPreshowerTask.add(process.hltEcalDigisPortable) + + process.load("EventFilter.EcalRawToDigi.ecalElectronicsMappingHostESProducer_cfi") + process.HLTDoFullUnpackingEgammaEcalWithoutPreshowerTask.add(process.ecalElectronicsMappingHostESProducer) + + delattr(process, 'hltEcalDigisGPU') + delattr(process, 'ecalElectronicsMappingGPUESProducer') + + if hasattr(process, 'hltEcalDigisFromGPU'): + process.hltEcalDigisFromGPU = cms.EDProducer( "EcalDigisFromPortableProducer", + digisInLabelEB = cms.InputTag( 'hltEcalDigisPortable','ebDigis' ), + digisInLabelEE = cms.InputTag( 'hltEcalDigisPortable','eeDigis' ), + digisOutLabelEB = cms.string( "ebDigis" ), + digisOutLabelEE = cms.string( "eeDigis" ), + produceDummyIntegrityCollections = cms.bool( False ) + ) + + if hasattr(process, 'hltEcalUncalibRecHitGPU'): + process.hltEcalUncalibRecHitPortable = cms.EDProducer("EcalUncalibRecHitProducerPortable@alpaka", + EBtimeConstantTerm = process.hltEcalUncalibRecHitGPU.EBtimeConstantTerm, + EBtimeFitLimits_Lower = process.hltEcalUncalibRecHitGPU.EBtimeFitLimits_Lower, + EBtimeFitLimits_Upper = process.hltEcalUncalibRecHitGPU.EBtimeFitLimits_Upper, + EBtimeNconst = process.hltEcalUncalibRecHitGPU.EBtimeNconst, + EEtimeConstantTerm = process.hltEcalUncalibRecHitGPU.EEtimeConstantTerm, + EEtimeFitLimits_Lower = process.hltEcalUncalibRecHitGPU.EEtimeFitLimits_Lower, + EEtimeFitLimits_Upper = process.hltEcalUncalibRecHitGPU.EEtimeFitLimits_Upper, + EEtimeNconst = process.hltEcalUncalibRecHitGPU.EEtimeNconst, + alpaka = cms.untracked.PSet( + backend = cms.untracked.string('') + ), + amplitudeThresholdEB = process.hltEcalUncalibRecHitGPU.amplitudeThresholdEB, + amplitudeThresholdEE = process.hltEcalUncalibRecHitGPU.amplitudeThresholdEE, + digisLabelEB = cms.InputTag("hltEcalDigisPortable","ebDigis"), + digisLabelEE = cms.InputTag("hltEcalDigisPortable","eeDigis"), + kernelMinimizeThreads = process.hltEcalUncalibRecHitGPU.kernelMinimizeThreads, + mightGet = cms.optional.untracked.vstring, + outOfTimeThresholdGain12mEB = process.hltEcalUncalibRecHitGPU.outOfTimeThresholdGain12mEB, + outOfTimeThresholdGain12mEE = process.hltEcalUncalibRecHitGPU.outOfTimeThresholdGain12mEE, + outOfTimeThresholdGain12pEB = process.hltEcalUncalibRecHitGPU.outOfTimeThresholdGain12pEB, + outOfTimeThresholdGain12pEE = process.hltEcalUncalibRecHitGPU.outOfTimeThresholdGain12pEE, + outOfTimeThresholdGain61mEB = process.hltEcalUncalibRecHitGPU.outOfTimeThresholdGain61mEB, + outOfTimeThresholdGain61mEE = process.hltEcalUncalibRecHitGPU.outOfTimeThresholdGain61mEE, + outOfTimeThresholdGain61pEB = process.hltEcalUncalibRecHitGPU.outOfTimeThresholdGain61pEB, + outOfTimeThresholdGain61pEE = process.hltEcalUncalibRecHitGPU.outOfTimeThresholdGain61pEE, + recHitsLabelEB = process.hltEcalUncalibRecHitGPU.recHitsLabelEB, + recHitsLabelEE = process.hltEcalUncalibRecHitGPU.recHitsLabelEE, + shouldRunTimingComputation = process.hltEcalUncalibRecHitGPU.shouldRunTimingComputation + ) + process.HLTDoFullUnpackingEgammaEcalWithoutPreshowerTask.add(process.hltEcalUncalibRecHitPortable) + + process.load("RecoLocalCalo.EcalRecProducers.ecalMultifitConditionsHostESProducer_cfi") + process.HLTDoFullUnpackingEgammaEcalWithoutPreshowerTask.add(process.ecalMultifitConditionsHostESProducer) + + process.ecalMultifitParametersSource = cms.ESSource("EmptyESSource", + firstValid = cms.vuint32(1), + iovIsRunNotTime = cms.bool(True), + recordName = cms.string('EcalMultifitParametersRcd') + ) + process.load("RecoLocalCalo.EcalRecProducers.ecalMultifitParametersHostESProducer_cfi") + process.HLTDoFullUnpackingEgammaEcalWithoutPreshowerTask.add(process.ecalMultifitParametersHostESProducer) + + delattr(process, 'hltEcalUncalibRecHitGPU') + + if hasattr(process, 'hltEcalUncalibRecHitFromSoA'): + process.hltEcalUncalibRecHitFromSoA = cms.EDProducer("EcalUncalibRecHitSoAToLegacy", + isPhase2 = process.hltEcalUncalibRecHitFromSoA.isPhase2, + mightGet = cms.optional.untracked.vstring, + recHitsLabelCPUEB = process.hltEcalUncalibRecHitFromSoA.recHitsLabelCPUEB, + recHitsLabelCPUEE = process.hltEcalUncalibRecHitFromSoA.recHitsLabelCPUEE, + uncalibRecHitsPortableEB = cms.InputTag("hltEcalUncalibRecHitPortable","EcalUncalibRecHitsEB"), + uncalibRecHitsPortableEE = cms.InputTag("hltEcalUncalibRecHitPortable","EcalUncalibRecHitsEE") + ) + + if hasattr(process, 'hltEcalUncalibRecHitSoA'): + delattr(process, 'hltEcalUncalibRecHitSoA') + + process.HLTDoFullUnpackingEgammaEcalTask = cms.ConditionalTask(process.HLTDoFullUnpackingEgammaEcalWithoutPreshowerTask, process.HLTPreshowerTask) + + return process + +def customizeHLTforAlpaka(process): + process = customizeHLTforAlpakaEcalLocalReco(process) + + return process + diff --git a/RecoLocalCalo/EcalRecProducers/BuildFile.xml b/RecoLocalCalo/EcalRecProducers/BuildFile.xml index b77b79e9c1180..4852e0b98d1f4 100644 --- a/RecoLocalCalo/EcalRecProducers/BuildFile.xml +++ b/RecoLocalCalo/EcalRecProducers/BuildFile.xml @@ -1,5 +1,6 @@ + diff --git a/RecoLocalCalo/EcalRecProducers/plugins/EigenMatrixTypes_gpu.h b/RecoLocalCalo/EcalRecProducers/interface/EigenMatrixTypes_gpu.h similarity index 87% rename from RecoLocalCalo/EcalRecProducers/plugins/EigenMatrixTypes_gpu.h rename to RecoLocalCalo/EcalRecProducers/interface/EigenMatrixTypes_gpu.h index bbf9cb0dbb5c9..dab46c4868ab3 100644 --- a/RecoLocalCalo/EcalRecProducers/plugins/EigenMatrixTypes_gpu.h +++ b/RecoLocalCalo/EcalRecProducers/interface/EigenMatrixTypes_gpu.h @@ -1,11 +1,10 @@ -#ifndef RecoLocalCalo_EcalRecProducers_plugins_EigenMatrixTypes_gpu_h -#define RecoLocalCalo_EcalRecProducers_plugins_EigenMatrixTypes_gpu_h +#ifndef RecoLocalCalo_EcalRecProducers_EigenMatrixTypes_gpu_h +#define RecoLocalCalo_EcalRecProducers_EigenMatrixTypes_gpu_h #include - #include -#include "CUDADataFormats/EcalRecHitSoA/interface/RecoTypes.h" +#include "DataFormats/EcalRecHit/interface/RecoTypes.h" namespace ecal { namespace multifit { @@ -46,4 +45,4 @@ namespace ecal { } // namespace multifit } // namespace ecal -#endif // RecoLocalCalo_EcalRecProducers_plugins_EigenMatrixTypes_gpu_h +#endif // RecoLocalCalo_EcalRecProducers_EigenMatrixTypes_gpu_h diff --git a/RecoLocalCalo/EcalRecProducers/plugins/AmplitudeComputationCommonKernels.h b/RecoLocalCalo/EcalRecProducers/plugins/AmplitudeComputationCommonKernels.h index 1797fb6d2ec88..20495ebf49be5 100644 --- a/RecoLocalCalo/EcalRecProducers/plugins/AmplitudeComputationCommonKernels.h +++ b/RecoLocalCalo/EcalRecProducers/plugins/AmplitudeComputationCommonKernels.h @@ -1,8 +1,8 @@ #ifndef RecoLocalCalo_EcalRecProducers_plugins_AmplitudeComputationCommonKernels_h #define RecoLocalCalo_EcalRecProducers_plugins_AmplitudeComputationCommonKernels_h +#include "RecoLocalCalo/EcalRecProducers/interface/EigenMatrixTypes_gpu.h" #include "DeclsForKernels.h" -#include "EigenMatrixTypes_gpu.h" class EcalPulseShape; // this flag setting is applied to all of the cases diff --git a/RecoLocalCalo/EcalRecProducers/plugins/AmplitudeComputationKernels.h b/RecoLocalCalo/EcalRecProducers/plugins/AmplitudeComputationKernels.h index 72ccf3b11a987..762de114c4a6a 100644 --- a/RecoLocalCalo/EcalRecProducers/plugins/AmplitudeComputationKernels.h +++ b/RecoLocalCalo/EcalRecProducers/plugins/AmplitudeComputationKernels.h @@ -1,8 +1,8 @@ #ifndef RecoLocalCalo_EcalRecProducers_plugins_AmplitudeComputationKernels_h #define RecoLocalCalo_EcalRecProducers_plugins_AmplitudeComputationKernels_h +#include "RecoLocalCalo/EcalRecProducers/interface/EigenMatrixTypes_gpu.h" #include "DeclsForKernels.h" -#include "EigenMatrixTypes_gpu.h" class EcalPulseShape; class EcalPulseCovariance; diff --git a/RecoLocalCalo/EcalRecProducers/plugins/BuildFile.xml b/RecoLocalCalo/EcalRecProducers/plugins/BuildFile.xml index 83b7e5f912c76..40ad5ade53326 100644 --- a/RecoLocalCalo/EcalRecProducers/plugins/BuildFile.xml +++ b/RecoLocalCalo/EcalRecProducers/plugins/BuildFile.xml @@ -1,4 +1,3 @@ - @@ -9,8 +8,6 @@ - - @@ -22,5 +19,16 @@ + + + + + + + + + + + diff --git a/RecoLocalCalo/EcalRecProducers/plugins/DeclsForKernels.h b/RecoLocalCalo/EcalRecProducers/plugins/DeclsForKernels.h index cbd28df94eb42..68bbc3400f23c 100644 --- a/RecoLocalCalo/EcalRecProducers/plugins/DeclsForKernels.h +++ b/RecoLocalCalo/EcalRecProducers/plugins/DeclsForKernels.h @@ -9,7 +9,6 @@ #include "CUDADataFormats/EcalDigi/interface/DigisCollection.h" #include "CUDADataFormats/EcalRecHitSoA/interface/EcalRecHit.h" #include "CUDADataFormats/EcalRecHitSoA/interface/EcalUncalibratedRecHit.h" -#include "CUDADataFormats/EcalRecHitSoA/interface/RecoTypes.h" #include "CondFormats/EcalObjects/interface/EcalChannelStatus.h" #include "CondFormats/EcalObjects/interface/EcalChannelStatusCode.h" #include "CondFormats/EcalObjects/interface/EcalGainRatios.h" @@ -32,9 +31,9 @@ #include "CondFormats/EcalObjects/interface/EcalTimeCalibConstantsGPU.h" #include "CondFormats/EcalObjects/interface/EcalTimeOffsetConstant.h" #include "CondFormats/EcalObjects/interface/EcalWeightSet.h" +#include "DataFormats/EcalRecHit/interface/RecoTypes.h" #include "HeterogeneousCore/CUDAUtilities/interface/cudaCheck.h" - -#include "EigenMatrixTypes_gpu.h" +#include "RecoLocalCalo/EcalRecProducers/interface/EigenMatrixTypes_gpu.h" struct EcalPulseShape; class EcalSampleMask; diff --git a/RecoLocalCalo/EcalRecProducers/plugins/EcalCPURecHitProducer.cc b/RecoLocalCalo/EcalRecProducers/plugins/EcalCPURecHitProducer.cc index 3de6b62898925..286f4cd2f413c 100644 --- a/RecoLocalCalo/EcalRecProducers/plugins/EcalCPURecHitProducer.cc +++ b/RecoLocalCalo/EcalRecProducers/plugins/EcalCPURecHitProducer.cc @@ -91,7 +91,7 @@ void EcalCPURecHitProducer::acquire(edm::Event const& event, cudaMemcpyDeviceToHost, ctx.stream())); // - // ./CUDADataFormats/EcalRecHitSoA/interface/RecoTypes.h:using StorageScalarType = float; + // ./DataFormats/EcalRecHit/interface/RecoTypes.h:using StorageScalarType = float; // cudaCheck(cudaMemcpyAsync(recHitsEB_.energy.data(), diff --git a/RecoLocalCalo/EcalRecProducers/plugins/EcalRecHitProducerGPU.cc b/RecoLocalCalo/EcalRecProducers/plugins/EcalRecHitProducerGPU.cc index 9edf3ad0087b1..86dbacbf69e3e 100644 --- a/RecoLocalCalo/EcalRecProducers/plugins/EcalRecHitProducerGPU.cc +++ b/RecoLocalCalo/EcalRecProducers/plugins/EcalRecHitProducerGPU.cc @@ -1,6 +1,5 @@ #include "CUDADataFormats/EcalRecHitSoA/interface/EcalRecHit.h" #include "CUDADataFormats/EcalRecHitSoA/interface/EcalUncalibratedRecHit.h" -#include "CUDADataFormats/EcalRecHitSoA/interface/RecoTypes.h" #include "CommonTools/Utils/interface/StringToEnumValue.h" #include "CondFormats/DataRecord/interface/EcalADCToGeVConstantRcd.h" #include "CondFormats/DataRecord/interface/EcalChannelStatusRcd.h" @@ -18,6 +17,7 @@ #include "CondFormats/EcalObjects/interface/EcalRechitADCToGeVConstantGPU.h" #include "CondFormats/EcalObjects/interface/EcalRechitChannelStatusGPU.h" #include "DataFormats/EcalRecHit/interface/EcalRecHit.h" +#include "DataFormats/EcalRecHit/interface/RecoTypes.h" #include "FWCore/Framework/interface/Event.h" #include "FWCore/Framework/interface/EventSetup.h" #include "FWCore/Framework/interface/MakerMacros.h" diff --git a/RecoLocalCalo/EcalRecProducers/plugins/EcalUncalibRecHitSoAToLegacy.cc b/RecoLocalCalo/EcalRecProducers/plugins/EcalUncalibRecHitSoAToLegacy.cc new file mode 100644 index 0000000000000..32ebbf669186f --- /dev/null +++ b/RecoLocalCalo/EcalRecProducers/plugins/EcalUncalibRecHitSoAToLegacy.cc @@ -0,0 +1,105 @@ +#include "DataFormats/EcalDigi/interface/EcalDigiCollections.h" +#include "DataFormats/EcalRecHit/interface/EcalRecHitCollections.h" +#include "DataFormats/EcalRecHit/interface/EcalUncalibratedRecHit.h" +#include "FWCore/Framework/interface/Event.h" +#include "FWCore/Framework/interface/EventSetup.h" +#include "FWCore/Framework/interface/MakerMacros.h" +#include "FWCore/Framework/interface/stream/EDProducer.h" +#include "FWCore/ParameterSet/interface/ParameterSet.h" +#include "FWCore/ParameterSet/interface/EmptyGroupDescription.h" +#include "FWCore/Utilities/interface/EDGetToken.h" +#include "FWCore/Utilities/interface/EDPutToken.h" +#include "DataFormats/EcalRecHit/interface/EcalUncalibratedRecHitHostCollection.h" + +class EcalUncalibRecHitSoAToLegacy : public edm::stream::EDProducer<> { +public: + explicit EcalUncalibRecHitSoAToLegacy(edm::ParameterSet const &ps); + ~EcalUncalibRecHitSoAToLegacy() override = default; + static void fillDescriptions(edm::ConfigurationDescriptions &); + +private: + using InputProduct = EcalUncalibratedRecHitHostCollection; + void produce(edm::Event &, edm::EventSetup const &) override; + +private: + const bool isPhase2_; + const edm::EDGetTokenT uncalibRecHitsPortableEB_; + const edm::EDGetTokenT uncalibRecHitsPortableEE_; + const edm::EDPutTokenT uncalibRecHitsCPUEBToken_; + const edm::EDPutTokenT uncalibRecHitsCPUEEToken_; +}; + +void EcalUncalibRecHitSoAToLegacy::fillDescriptions(edm::ConfigurationDescriptions &confDesc) { + edm::ParameterSetDescription desc; + + desc.add("uncalibRecHitsPortableEB", + edm::InputTag("ecalMultiFitUncalibRecHitPortable", "EcalUncalibRecHitsEB")); + desc.add("recHitsLabelCPUEB", "EcalUncalibRecHitsEB"); + desc.ifValue(edm::ParameterDescription("isPhase2", false, true), + false >> (edm::ParameterDescription( + "uncalibRecHitsPortableEE", + edm::InputTag("ecalMultiFitUncalibRecHitPortable", "EcalUncalibRecHitsEE"), + true) and + edm::ParameterDescription("recHitsLabelCPUEE", "EcalUncalibRecHitsEE", true)) or + true >> edm::EmptyGroupDescription()); + confDesc.add("ecalUncalibRecHitSoAToLegacy", desc); +} + +EcalUncalibRecHitSoAToLegacy::EcalUncalibRecHitSoAToLegacy(edm::ParameterSet const &ps) + : isPhase2_{ps.getParameter("isPhase2")}, + uncalibRecHitsPortableEB_{consumes(ps.getParameter("uncalibRecHitsPortableEB"))}, + uncalibRecHitsPortableEE_{ + isPhase2_ ? edm::EDGetTokenT{} + : consumes(ps.getParameter("uncalibRecHitsPortableEE"))}, + uncalibRecHitsCPUEBToken_{ + produces(ps.getParameter("recHitsLabelCPUEB"))}, + uncalibRecHitsCPUEEToken_{ + isPhase2_ ? edm::EDPutTokenT{} + : produces(ps.getParameter("recHitsLabelCPUEE"))} {} + +void EcalUncalibRecHitSoAToLegacy::produce(edm::Event &event, edm::EventSetup const &setup) { + auto const &uncalRecHitsEBColl = event.get(uncalibRecHitsPortableEB_); + auto const &uncalRecHitsEBCollView = uncalRecHitsEBColl.const_view(); + auto recHitsCPUEB = std::make_unique(); + recHitsCPUEB->reserve(uncalRecHitsEBCollView.size()); + + for (uint32_t i = 0; i < uncalRecHitsEBCollView.size(); ++i) { + recHitsCPUEB->emplace_back(DetId{uncalRecHitsEBCollView.id()[i]}, + uncalRecHitsEBCollView.amplitude()[i], + uncalRecHitsEBCollView.pedestal()[i], + uncalRecHitsEBCollView.jitter()[i], + uncalRecHitsEBCollView.chi2()[i], + uncalRecHitsEBCollView.flags()[i]); + if (isPhase2_) { + (*recHitsCPUEB)[i].setAmplitudeError(uncalRecHitsEBCollView.amplitudeError()[i]); + } + (*recHitsCPUEB)[i].setJitterError(uncalRecHitsEBCollView.jitterError()[i]); + for (uint32_t sample = 0; sample < EcalDataFrame::MAXSAMPLES; ++sample) { + (*recHitsCPUEB)[i].setOutOfTimeAmplitude(sample, uncalRecHitsEBCollView.outOfTimeAmplitudes()[i][sample]); + } + } + event.put(uncalibRecHitsCPUEBToken_, std::move(recHitsCPUEB)); + + if (!isPhase2_) { + auto const &uncalRecHitsEEColl = event.get(uncalibRecHitsPortableEE_); + auto const &uncalRecHitsEECollView = uncalRecHitsEEColl.const_view(); + auto recHitsCPUEE = std::make_unique(); + recHitsCPUEE->reserve(uncalRecHitsEECollView.size()); + + for (uint32_t i = 0; i < uncalRecHitsEECollView.size(); ++i) { + recHitsCPUEE->emplace_back(DetId{uncalRecHitsEECollView.id()[i]}, + uncalRecHitsEECollView.amplitude()[i], + uncalRecHitsEECollView.pedestal()[i], + uncalRecHitsEECollView.jitter()[i], + uncalRecHitsEECollView.chi2()[i], + uncalRecHitsEECollView.flags()[i]); + (*recHitsCPUEE)[i].setJitterError(uncalRecHitsEECollView.jitterError()[i]); + for (uint32_t sample = 0; sample < EcalDataFrame::MAXSAMPLES; ++sample) { + (*recHitsCPUEE)[i].setOutOfTimeAmplitude(sample, uncalRecHitsEECollView.outOfTimeAmplitudes()[i][sample]); + } + } + event.put(uncalibRecHitsCPUEEToken_, std::move(recHitsCPUEE)); + } +} + +DEFINE_FWK_MODULE(EcalUncalibRecHitSoAToLegacy); diff --git a/RecoLocalCalo/EcalRecProducers/plugins/TimeComputationKernels.h b/RecoLocalCalo/EcalRecProducers/plugins/TimeComputationKernels.h index dea6bad26fa0d..30cf742d44d10 100644 --- a/RecoLocalCalo/EcalRecProducers/plugins/TimeComputationKernels.h +++ b/RecoLocalCalo/EcalRecProducers/plugins/TimeComputationKernels.h @@ -8,9 +8,9 @@ #include "DataFormats/Math/interface/approx_exp.h" #include "DataFormats/Math/interface/approx_log.h" +#include "RecoLocalCalo/EcalRecProducers/interface/EigenMatrixTypes_gpu.h" #include "DeclsForKernels.h" -#include "EigenMatrixTypes_gpu.h" //#define DEBUG diff --git a/RecoLocalCalo/EcalRecProducers/plugins/alpaka/AmplitudeComputationCommonKernels.h b/RecoLocalCalo/EcalRecProducers/plugins/alpaka/AmplitudeComputationCommonKernels.h new file mode 100644 index 0000000000000..e590ce0d8b795 --- /dev/null +++ b/RecoLocalCalo/EcalRecProducers/plugins/alpaka/AmplitudeComputationCommonKernels.h @@ -0,0 +1,488 @@ +#ifndef RecoLocalCalo_EcalRecProducers_plugins_alpaka_AmplitudeComputationCommonKernels_h +#define RecoLocalCalo_EcalRecProducers_plugins_alpaka_AmplitudeComputationCommonKernels_h + +#include +#include +#include + +#include "CondFormats/EcalObjects/interface/alpaka/EcalMultifitConditionsDevice.h" +#include "DataFormats/EcalDigi/interface/alpaka/EcalDigiDeviceCollection.h" +#include "DataFormats/EcalRecHit/interface/alpaka/EcalUncalibratedRecHitDeviceCollection.h" +#include "CondFormats/EcalObjects/interface/EcalPulseShapes.h" +#include "DataFormats/EcalDigi/interface/EcalDataFrame.h" +#include "DataFormats/EcalDigi/interface/EcalMGPASample.h" +#include "DataFormats/EcalRecHit/interface/EcalUncalibratedRecHit.h" +#include "FWCore/Utilities/interface/CMSUnrollLoop.h" +#include "HeterogeneousCore/AlpakaInterface/interface/config.h" +#include "HeterogeneousCore/AlpakaInterface/interface/traits.h" +#include "RecoLocalCalo/EcalRecProducers/interface/EigenMatrixTypes_gpu.h" + +#include "DeclsForKernels.h" +#include "KernelHelpers.h" + +namespace ALPAKA_ACCELERATOR_NAMESPACE::ecal::multifit { + + /// + /// assume kernel launch configuration is + /// (MAXSAMPLES * nchannels, blocks) + /// TODO: is there a point to split this kernel further to separate reductions + /// + class Kernel_prep_1d_and_initialize { + public: + template >> + ALPAKA_FN_ACC void operator()(TAcc const& acc, + EcalDigiDeviceCollection::ConstView digisDevEB, + EcalDigiDeviceCollection::ConstView digisDevEE, + EcalUncalibratedRecHitDeviceCollection::View uncalibRecHitsEB, + EcalUncalibratedRecHitDeviceCollection::View uncalibRecHitsEE, + EcalMultifitConditionsDevice::ConstView conditionsDev, + ::ecal::multifit::SampleVector* amplitudes, + ::ecal::multifit::SampleGainVector* gainsNoise, + bool* hasSwitchToGain6, + bool* hasSwitchToGain1, + bool* isSaturated, + char* acState, + ::ecal::multifit::BXVectorType* bxs, + bool const gainSwitchUseMaxSampleEB, + bool const gainSwitchUseMaxSampleEE) const { + constexpr bool dynamicPedestal = false; //---- default to false, ok + constexpr auto nsamples = EcalDataFrame::MAXSAMPLES; + constexpr int sample_max = 5; + constexpr int full_pulse_max = 9; + auto const offsetForHashes = conditionsDev.offsetEE(); + + auto const nchannelsEB = digisDevEB.size(); + auto const nchannelsEE = digisDevEE.size(); + auto const nchannels = nchannelsEB + nchannelsEE; + auto const totalElements = nchannels * nsamples; + + auto const elemsPerBlock = alpaka::getWorkDiv(acc)[0u]; + + char* shared_mem = alpaka::getDynSharedMem(acc); + auto* shr_hasSwitchToGain6 = reinterpret_cast(shared_mem); + auto* shr_hasSwitchToGain1 = shr_hasSwitchToGain6 + elemsPerBlock; + auto* shr_hasSwitchToGain0 = shr_hasSwitchToGain1 + elemsPerBlock; + auto* shr_isSaturated = shr_hasSwitchToGain0 + elemsPerBlock; + auto* shr_hasSwitchToGain0_tmp = shr_isSaturated + elemsPerBlock; + auto* shr_counts = reinterpret_cast(shr_hasSwitchToGain0_tmp) + elemsPerBlock; + + for (auto block : cms::alpakatools::blocks_with_stride(acc, totalElements)) { + for (auto idx : cms::alpakatools::elements_in_block(acc, block, totalElements)) { + // set the output collection size scalars + if (idx.global == 0) { + uncalibRecHitsEB.size() = nchannelsEB; + uncalibRecHitsEE.size() = nchannelsEE; + } + + auto const ch = idx.global / nsamples; + // for accessing input arrays + int const inputTx = ch >= nchannelsEB ? idx.global - nchannelsEB * nsamples : idx.global; + // eb is first and then ee + auto const* digis_in = ch >= nchannelsEB ? digisDevEE.data()->data() : digisDevEB.data()->data(); + auto const gainId = ecalMGPA::gainId(digis_in[inputTx]); + + // store into shared mem for initialization + shr_hasSwitchToGain6[idx.local] = gainId == EcalMgpaBitwiseGain6; + shr_hasSwitchToGain1[idx.local] = gainId == EcalMgpaBitwiseGain1; + shr_hasSwitchToGain0_tmp[idx.local] = gainId == EcalMgpaBitwiseGain0; + shr_hasSwitchToGain0[idx.local] = shr_hasSwitchToGain0_tmp[idx.local]; + shr_counts[idx.local] = 0; + } + + alpaka::syncBlockThreads(acc); + + for (auto idx : cms::alpakatools::elements_in_block(acc, block, totalElements)) { + auto const sample = idx.local % nsamples; + + // non-divergent branch (except for the last 4 threads) + if (idx.local <= elemsPerBlock - 5) { + CMS_UNROLL_LOOP + for (int i = 0; i < 5; ++i) + shr_counts[idx.local] += shr_hasSwitchToGain0[idx.local + i]; + } + shr_isSaturated[idx.local] = shr_counts[idx.local] == 5; + + // + // unrolled reductions + // + if (sample < 5) { + shr_hasSwitchToGain6[idx.local] = shr_hasSwitchToGain6[idx.local] || shr_hasSwitchToGain6[idx.local + 5]; + shr_hasSwitchToGain1[idx.local] = shr_hasSwitchToGain1[idx.local] || shr_hasSwitchToGain1[idx.local + 5]; + + // duplication of hasSwitchToGain0 in order not to + // introduce another syncthreads + shr_hasSwitchToGain0_tmp[idx.local] = + shr_hasSwitchToGain0_tmp[idx.local] || shr_hasSwitchToGain0_tmp[idx.local + 5]; + } + } + + alpaka::syncBlockThreads(acc); + + for (auto idx : cms::alpakatools::elements_in_block(acc, block, totalElements)) { + auto const sample = idx.local % nsamples; + + if (sample < 2) { + // note, both threads per channel take value [3] twice to avoid another if + shr_hasSwitchToGain6[idx.local] = shr_hasSwitchToGain6[idx.local] || shr_hasSwitchToGain6[idx.local + 2] || + shr_hasSwitchToGain6[idx.local + 3]; + shr_hasSwitchToGain1[idx.local] = shr_hasSwitchToGain1[idx.local] || shr_hasSwitchToGain1[idx.local + 2] || + shr_hasSwitchToGain1[idx.local + 3]; + + shr_hasSwitchToGain0_tmp[idx.local] = shr_hasSwitchToGain0_tmp[idx.local] || + shr_hasSwitchToGain0_tmp[idx.local + 2] || + shr_hasSwitchToGain0_tmp[idx.local + 3]; + + // sample < 2 -> first 2 threads of each channel will be used here + // => 0 -> will compare 3 and 4 and put into 0 + // => 1 -> will compare 4 and 5 and put into 1 + shr_isSaturated[idx.local] = shr_isSaturated[idx.local + 3] || shr_isSaturated[idx.local + 4]; + } + } + + alpaka::syncBlockThreads(acc); + + for (auto idx : cms::alpakatools::elements_in_block(acc, block, totalElements)) { + auto const ch = idx.global / nsamples; + auto const sample = idx.local % nsamples; + + if (sample == 0) { + shr_hasSwitchToGain6[idx.local] = shr_hasSwitchToGain6[idx.local] || shr_hasSwitchToGain6[idx.local + 1]; + shr_hasSwitchToGain1[idx.local] = shr_hasSwitchToGain1[idx.local] || shr_hasSwitchToGain1[idx.local + 1]; + shr_hasSwitchToGain0_tmp[idx.local] = + shr_hasSwitchToGain0_tmp[idx.local] || shr_hasSwitchToGain0_tmp[idx.local + 1]; + + hasSwitchToGain6[ch] = shr_hasSwitchToGain6[idx.local]; + hasSwitchToGain1[ch] = shr_hasSwitchToGain1[idx.local]; + + shr_isSaturated[idx.local + 3] = shr_isSaturated[idx.local] || shr_isSaturated[idx.local + 1]; + isSaturated[ch] = shr_isSaturated[idx.local + 3]; + } + } + + // TODO: w/o this sync, there is a race + // if (idx.local == sample_max) below uses max sample thread, not for 0 sample + // check if we can remove it + alpaka::syncBlockThreads(acc); + + for (auto idx : cms::alpakatools::elements_in_block(acc, block, totalElements)) { + auto const ch = idx.global / nsamples; + auto const sample = idx.local % nsamples; + + // for accessing input arrays + int const inputCh = ch >= nchannelsEB ? ch - nchannelsEB : ch; + int const inputTx = ch >= nchannelsEB ? idx.global - nchannelsEB * nsamples : idx.global; + + auto const* dids = ch >= nchannelsEB ? digisDevEE.id() : digisDevEB.id(); + auto const did = DetId{dids[inputCh]}; + auto const isBarrel = did.subdetId() == EcalBarrel; + // TODO offset for ee, 0 for eb + auto const hashedId = isBarrel ? reconstruction::hashedIndexEB(did.rawId()) + : offsetForHashes + reconstruction::hashedIndexEE(did.rawId()); + + // eb is first and then ee + auto const* digis_in = ch >= nchannelsEB ? digisDevEE.data()->data() : digisDevEB.data()->data(); + + auto* amplitudesForMinimization = reinterpret_cast<::ecal::multifit::SampleVector*>( + ch >= nchannelsEB ? uncalibRecHitsEE.outOfTimeAmplitudes()->data() + : uncalibRecHitsEB.outOfTimeAmplitudes()->data()); + auto* energies = ch >= nchannelsEB ? uncalibRecHitsEE.amplitude() : uncalibRecHitsEB.amplitude(); + auto* chi2 = ch >= nchannelsEB ? uncalibRecHitsEE.chi2() : uncalibRecHitsEB.chi2(); + auto* g_pedestal = ch >= nchannelsEB ? uncalibRecHitsEE.pedestal() : uncalibRecHitsEB.pedestal(); + auto* dids_out = ch >= nchannelsEB ? uncalibRecHitsEE.id() : uncalibRecHitsEB.id(); + auto* flags = ch >= nchannelsEB ? uncalibRecHitsEE.flags() : uncalibRecHitsEB.flags(); + + auto const adc = ecalMGPA::adc(digis_in[inputTx]); + auto const gainId = ecalMGPA::gainId(digis_in[inputTx]); + ::ecal::multifit::SampleVector::Scalar amplitude = 0.; + ::ecal::multifit::SampleVector::Scalar pedestal = 0.; + ::ecal::multifit::SampleVector::Scalar gainratio = 0.; + + // TODO: divergent branch + if (gainId == 0 || gainId == 3) { + pedestal = conditionsDev.pedestals_mean_x1()[hashedId]; + gainratio = conditionsDev.gain6Over1()[hashedId] * conditionsDev.gain12Over6()[hashedId]; + gainsNoise[ch](sample) = 2; + } else if (gainId == 1) { + pedestal = conditionsDev.pedestals_mean_x12()[hashedId]; + gainratio = 1.; + gainsNoise[ch](sample) = 0; + } else if (gainId == 2) { + pedestal = conditionsDev.pedestals_mean_x6()[hashedId]; + gainratio = conditionsDev.gain12Over6()[hashedId]; + gainsNoise[ch](sample) = 1; + } + + // TODO: compile time constant -> branch should be non-divergent + if (dynamicPedestal) + amplitude = static_cast<::ecal::multifit::SampleVector::Scalar>(adc) * gainratio; + else + amplitude = (static_cast<::ecal::multifit::SampleVector::Scalar>(adc) - pedestal) * gainratio; + amplitudes[ch][sample] = amplitude; + +#ifdef ECAL_RECO_ALPAKA_DEBUG + printf("%d %d %d %d %f %f %f\n", idx.global, ch, sample, adc, amplitude, pedestal, gainratio); + if (adc == 0) + printf("adc is zero\n"); +#endif + + // + // initialization + // + amplitudesForMinimization[inputCh](sample) = 0; + bxs[ch](sample) = sample - 5; + + // select the thread for the max sample + //---> hardcoded above to be 5th sample, ok + if (sample == sample_max) { + // + // initialization + // + acState[ch] = static_cast(MinimizationState::NotFinished); + energies[inputCh] = 0; + chi2[inputCh] = 0; + g_pedestal[inputCh] = 0; + uint32_t flag = 0; + dids_out[inputCh] = did.rawId(); + + // start of this channel in shared mem + auto const chStart = idx.local - sample_max; + // thread for the max sample in shared mem + auto const threadMax = idx.local; + auto const gainSwitchUseMaxSample = isBarrel ? gainSwitchUseMaxSampleEB : gainSwitchUseMaxSampleEE; + + // this flag setting is applied to all of the cases + if (shr_hasSwitchToGain6[chStart]) + flag |= 0x1 << EcalUncalibratedRecHit::kHasSwitchToGain6; + if (shr_hasSwitchToGain1[chStart]) + flag |= 0x1 << EcalUncalibratedRecHit::kHasSwitchToGain1; + + // this corresponds to cpu branching on lastSampleBeforeSaturation + // likely false + // check only for the idx.local corresponding to sample==0 + if (sample == 0 && shr_hasSwitchToGain0_tmp[idx.local]) { + // assign for the case some sample having gainId == 0 + //energies[inputCh] = amplitudes[ch][sample_max]; + energies[inputCh] = amplitude; + + // check if samples before sample_max have true + bool saturated_before_max = false; + CMS_UNROLL_LOOP + for (char ii = 0; ii < 5; ++ii) + saturated_before_max = saturated_before_max || shr_hasSwitchToGain0[chStart + ii]; + + // if saturation is in the max sample and not in the first 5 + if (!saturated_before_max && shr_hasSwitchToGain0[threadMax]) + energies[inputCh] = 49140; // 4095 * 12 (maximum ADC range * MultiGainPreAmplifier (MGPA) gain) + // This is the actual maximum range that is set when we saturate. + //---- AM FIXME : no pedestal subtraction??? + //It should be "(4095. - pedestal) * gainratio" + + // set state flag to terminate further processing of this channel + acState[ch] = static_cast(MinimizationState::Precomputed); + flag |= 0x1 << EcalUncalibratedRecHit::kSaturated; + flags[inputCh] = flag; + continue; + } + + // according to cpu version + // auto max_amplitude = amplitudes[ch][sample_max]; + auto const max_amplitude = amplitude; + // pulse shape template value + auto shape_value = conditionsDev.pulseShapes()[hashedId][full_pulse_max - 7]; + // note, no syncing as the same thread will be accessing here + bool hasGainSwitch = + shr_hasSwitchToGain6[chStart] || shr_hasSwitchToGain1[chStart] || shr_isSaturated[chStart + 3]; + + // pedestal is final unconditionally + g_pedestal[inputCh] = pedestal; + if (hasGainSwitch && gainSwitchUseMaxSample) { + // thread for sample=0 will access the right guys + energies[inputCh] = max_amplitude / shape_value; + acState[ch] = static_cast(MinimizationState::Precomputed); + flags[inputCh] = flag; + continue; + } + + // will be used in the future for setting state + auto const rmsForChecking = conditionsDev.pedestals_rms_x12()[hashedId]; + + // this happens cause sometimes rms_x12 is 0... + // needs to be checkec why this is the case + // general case here is that noisecov is a Zero matrix + if (rmsForChecking == 0) { + acState[ch] = static_cast(MinimizationState::Precomputed); + flags[inputCh] = flag; + continue; + } + + // for the case when no shortcuts were taken + flags[inputCh] = flag; + } + } + } + } + }; + + /// + /// assume kernel launch configuration is + /// ([MAXSAMPLES, MAXSAMPLES], nchannels) + /// + class Kernel_prep_2d { + public: + template >> + ALPAKA_FN_ACC void operator()(TAcc const& acc, + EcalDigiDeviceCollection::ConstView digisDevEB, + EcalDigiDeviceCollection::ConstView digisDevEE, + EcalMultifitConditionsDevice::ConstView conditionsDev, + ::ecal::multifit::SampleGainVector const* gainsNoise, + ::ecal::multifit::SampleMatrix* noisecov, + ::ecal::multifit::PulseMatrixType* pulse_matrix, + bool const* hasSwitchToGain6, + bool const* hasSwitchToGain1, + bool const* isSaturated) const { + constexpr auto nsamples = EcalDataFrame::MAXSAMPLES; + auto const offsetForHashes = conditionsDev.offsetEE(); + auto const nchannelsEB = digisDevEB.size(); + constexpr float addPedestalUncertainty = 0.f; + constexpr bool dynamicPedestal = false; + constexpr bool simplifiedNoiseModelForGainSwitch = true; //---- default is true + + // pulse matrix + auto const* pulse_shapes = reinterpret_cast(conditionsDev.pulseShapes()->data()); + + auto const blockDimX = alpaka::getWorkDiv(acc)[1u]; + auto const elemsPerBlockX = alpaka::getWorkDiv(acc)[1u]; + auto const elemsPerBlockY = alpaka::getWorkDiv(acc)[0u]; + Vec2D const size_2d = {elemsPerBlockY, blockDimX * elemsPerBlockX}; // {y, x} coordinates + + for (auto ndindex : cms::alpakatools::elements_with_stride_nd(acc, size_2d)) { + auto const ch = ndindex[1] / nsamples; + auto const tx = ndindex[1] % nsamples; + auto const ty = ndindex[0]; + + // to access input arrays (ids and digis only) + int const inputCh = ch >= nchannelsEB ? ch - nchannelsEB : ch; + auto const* dids = ch >= nchannelsEB ? digisDevEE.id() : digisDevEB.id(); + + auto const did = DetId{dids[inputCh]}; + auto const isBarrel = did.subdetId() == EcalBarrel; + auto const hashedId = isBarrel ? ecal::reconstruction::hashedIndexEB(did.rawId()) + : offsetForHashes + ecal::reconstruction::hashedIndexEE(did.rawId()); + auto const* G12SamplesCorrelation = isBarrel ? conditionsDev.sampleCorrelation_EB_G12().data() + : conditionsDev.sampleCorrelation_EE_G12().data(); + auto const* G6SamplesCorrelation = + isBarrel ? conditionsDev.sampleCorrelation_EB_G6().data() : conditionsDev.sampleCorrelation_EE_G6().data(); + auto const* G1SamplesCorrelation = + isBarrel ? conditionsDev.sampleCorrelation_EB_G1().data() : conditionsDev.sampleCorrelation_EE_G1().data(); + auto const hasGainSwitch = hasSwitchToGain6[ch] || hasSwitchToGain1[ch] || isSaturated[ch]; + + auto const vidx = std::abs(static_cast(ty) - static_cast(tx)); + + // non-divergent branch for all threads per block + if (hasGainSwitch) { + // TODO: did not include simplified noise model + float noise_value = 0; + + // non-divergent branch - all threads per block + // TODO: all of these constants indicate that + // that these parts could be splitted into completely different + // kernels and run one of them only depending on the config + if (simplifiedNoiseModelForGainSwitch) { + constexpr int isample_max = 5; // according to cpu defs + auto const gainidx = gainsNoise[ch][isample_max]; + + // non-divergent branches + if (gainidx == 0) { + auto const rms_x12 = conditionsDev.pedestals_rms_x12()[hashedId]; + noise_value = rms_x12 * rms_x12 * G12SamplesCorrelation[vidx]; + } else if (gainidx == 1) { + auto const gain12Over6 = conditionsDev.gain12Over6()[hashedId]; + auto const rms_x6 = conditionsDev.pedestals_rms_x6()[hashedId]; + noise_value = gain12Over6 * gain12Over6 * rms_x6 * rms_x6 * G6SamplesCorrelation[vidx]; + } else if (gainidx == 2) { + auto const gain12Over6 = conditionsDev.gain12Over6()[hashedId]; + auto const gain6Over1 = conditionsDev.gain6Over1()[hashedId]; + auto const gain12Over1 = gain12Over6 * gain6Over1; + auto const rms_x1 = conditionsDev.pedestals_rms_x1()[hashedId]; + noise_value = gain12Over1 * gain12Over1 * rms_x1 * rms_x1 * G1SamplesCorrelation[vidx]; + } + if (!dynamicPedestal && addPedestalUncertainty > 0.f) + noise_value += addPedestalUncertainty * addPedestalUncertainty; + } else { + int gainidx = 0; + char mask = gainidx; + int pedestal = gainsNoise[ch][ty] == mask ? 1 : 0; + // NB: gainratio is 1, that is why it does not appear in the formula + auto const rms_x12 = conditionsDev.pedestals_rms_x12()[hashedId]; + noise_value += rms_x12 * rms_x12 * pedestal * G12SamplesCorrelation[vidx]; + // non-divergent branch + if (!dynamicPedestal && addPedestalUncertainty > 0.f) { + noise_value += addPedestalUncertainty * addPedestalUncertainty * pedestal; // gainratio is 1 + } + + // + gainidx = 1; + mask = gainidx; + pedestal = gainsNoise[ch][ty] == mask ? 1 : 0; + auto const gain12Over6 = conditionsDev.gain12Over6()[hashedId]; + auto const rms_x6 = conditionsDev.pedestals_rms_x6()[hashedId]; + noise_value += gain12Over6 * gain12Over6 * rms_x6 * rms_x6 * pedestal * G6SamplesCorrelation[vidx]; + // non-divergent branch + if (!dynamicPedestal && addPedestalUncertainty > 0.f) { + noise_value += gain12Over6 * gain12Over6 * addPedestalUncertainty * addPedestalUncertainty * pedestal; + } + + // + gainidx = 2; + mask = gainidx; + pedestal = gainsNoise[ch][ty] == mask ? 1 : 0; + auto const gain6Over1 = conditionsDev.gain6Over1()[hashedId]; + auto const gain12Over1 = gain12Over6 * gain6Over1; + auto const rms_x1 = conditionsDev.pedestals_rms_x1()[hashedId]; + noise_value += gain12Over1 * gain12Over1 * rms_x1 * rms_x1 * pedestal * G1SamplesCorrelation[vidx]; + // non-divergent branch + if (!dynamicPedestal && addPedestalUncertainty > 0.f) { + noise_value += gain12Over1 * gain12Over1 * addPedestalUncertainty * addPedestalUncertainty * pedestal; + } + } + + noisecov[ch](ty, tx) = noise_value; + } else { + auto const rms = conditionsDev.pedestals_rms_x12()[hashedId]; + float noise_value = rms * rms * G12SamplesCorrelation[vidx]; + if (!dynamicPedestal && addPedestalUncertainty > 0.f) { + //---- add fully correlated component to noise covariance to inflate pedestal uncertainty + noise_value += addPedestalUncertainty * addPedestalUncertainty; + } + noisecov[ch](ty, tx) = noise_value; + } + + auto const posToAccess = 9 - static_cast(tx) + static_cast(ty); // see cpu for reference + float const value = posToAccess >= 7 ? pulse_shapes[hashedId].pdfval[posToAccess - 7] : 0; + pulse_matrix[ch](ty, tx) = value; + } + } + }; + +} // namespace ALPAKA_ACCELERATOR_NAMESPACE::ecal::multifit + +namespace alpaka::trait { + using namespace ALPAKA_ACCELERATOR_NAMESPACE::ecal::multifit; + + //! The trait for getting the size of the block shared dynamic memory for Kernel_prep_1d_and_initialize. + template + struct BlockSharedMemDynSizeBytes { + //! \return The size of the shared memory allocated for a block. + template + ALPAKA_FN_HOST_ACC static auto getBlockSharedMemDynSizeBytes(Kernel_prep_1d_and_initialize const&, + TVec const& threadsPerBlock, + TVec const& elemsPerThread, + TArgs const&...) -> std::size_t { + // return the amount of dynamic shared memory needed + std::size_t bytes = threadsPerBlock[0u] * elemsPerThread[0u] * (5 * sizeof(bool) + sizeof(char)); + return bytes; + } + }; +} // namespace alpaka::trait + +#endif // RecoLocalCalo_EcalRecProducers_plugins_AmplitudeComputationCommonKernels_h diff --git a/RecoLocalCalo/EcalRecProducers/plugins/alpaka/AmplitudeComputationKernels.dev.cc b/RecoLocalCalo/EcalRecProducers/plugins/alpaka/AmplitudeComputationKernels.dev.cc new file mode 100644 index 0000000000000..fcf9e5de16f40 --- /dev/null +++ b/RecoLocalCalo/EcalRecProducers/plugins/alpaka/AmplitudeComputationKernels.dev.cc @@ -0,0 +1,316 @@ +#include +#include +#include + +#include "CondFormats/EcalObjects/interface/EcalPulseCovariances.h" +#include "DataFormats/CaloRecHit/interface/MultifitComputations.h" +#include "FWCore/Utilities/interface/CMSUnrollLoop.h" +#include "HeterogeneousCore/AlpakaInterface/interface/workdivision.h" + +#include "AmplitudeComputationKernels.h" +#include "KernelHelpers.h" +#include "EcalUncalibRecHitMultiFitAlgoPortable.h" + +namespace ALPAKA_ACCELERATOR_NAMESPACE::ecal::multifit { + + using namespace ::ecal::multifit; + + template + ALPAKA_FN_ACC ALPAKA_FN_INLINE void update_covariance(EcalPulseCovariance const& pulse_covariance, + MatrixType& inverse_cov, + SampleVector const& amplitudes) { + constexpr auto nsamples = SampleVector::RowsAtCompileTime; + constexpr auto npulses = BXVectorType::RowsAtCompileTime; + + CMS_UNROLL_LOOP + for (unsigned int ipulse = 0; ipulse < npulses; ++ipulse) { + auto const amplitude = amplitudes.coeff(ipulse); + if (amplitude == 0) + continue; + + // FIXME: ipulse - 5 -> ipulse - firstOffset + int bx = ipulse - 5; + int first_sample_t = std::max(0, bx + 3); + int offset = -3 - bx; + + auto const value_sq = amplitude * amplitude; + + for (int col = first_sample_t; col < nsamples; ++col) { + for (int row = col; row < nsamples; ++row) { + inverse_cov(row, col) += value_sq * pulse_covariance.covval[row + offset][col + offset]; + } + } + } + } + + /// + /// launch ctx parameters are (nchannels / block, blocks) + /// TODO: trivial impl for now, there must be a way to improve + /// + /// Conventions: + /// - amplitudes -> solution vector, what we are fitting for + /// - samples -> raw detector responses + /// - passive constraint - satisfied constraint + /// - active constraint - unsatisfied (yet) constraint + /// + class Kernel_minimize { + public: + template >> + ALPAKA_FN_ACC void operator()(TAcc const& acc, + InputProduct::ConstView const& digisDevEB, + InputProduct::ConstView const& digisDevEE, + OutputProduct::View uncalibRecHitsEB, + OutputProduct::View uncalibRecHitsEE, + EcalMultifitConditionsDevice::ConstView conditionsDev, + ::ecal::multifit::SampleMatrix const* noisecov, + ::ecal::multifit::PulseMatrixType const* pulse_matrix, + ::ecal::multifit::BXVectorType* bxs, + ::ecal::multifit::SampleVector const* samples, + bool* hasSwitchToGain6, + bool* hasSwitchToGain1, + bool* isSaturated, + char* acState, + int max_iterations) const { + // FIXME: ecal has 10 samples and 10 pulses.... + // but this needs to be properly treated and renamed everywhere + constexpr auto NSAMPLES = SampleMatrix::RowsAtCompileTime; + constexpr auto NPULSES = SampleMatrix::ColsAtCompileTime; + static_assert(NSAMPLES == NPULSES); + + using DataType = SampleVector::Scalar; + + auto const elemsPerBlock(alpaka::getWorkDiv(acc)[0u]); + + auto const nchannelsEB = digisDevEB.size(); + auto const nchannels = nchannelsEB + digisDevEE.size(); + auto const offsetForHashes = conditionsDev.offsetEE(); + + auto const* pulse_covariance = reinterpret_cast(conditionsDev.pulseCovariance()); + + // shared memory + DataType* shrmem = alpaka::getDynSharedMem(acc); + + // channel + for (auto idx : cms::alpakatools::elements_with_stride(acc, nchannels)) { + if (static_cast(acState[idx]) == MinimizationState::Precomputed) + continue; + + auto const elemIdx = idx % elemsPerBlock; + + // shared memory pointers + DataType* shrMatrixLForFnnlsStorage = shrmem + calo::multifit::MapSymM::total * elemIdx; + DataType* shrAtAStorage = + shrmem + calo::multifit::MapSymM::total * (elemIdx + elemsPerBlock); + + auto* amplitudes = + reinterpret_cast(idx >= nchannelsEB ? uncalibRecHitsEE.outOfTimeAmplitudes()->data() + : uncalibRecHitsEB.outOfTimeAmplitudes()->data()); + auto* energies = idx >= nchannelsEB ? uncalibRecHitsEE.amplitude() : uncalibRecHitsEB.amplitude(); + auto* chi2s = idx >= nchannelsEB ? uncalibRecHitsEE.chi2() : uncalibRecHitsEB.chi2(); + + // get the hash + int const inputCh = idx >= nchannelsEB ? idx - nchannelsEB : idx; + auto const* dids = idx >= nchannelsEB ? digisDevEE.id() : digisDevEB.id(); + auto const did = DetId{dids[inputCh]}; + auto const isBarrel = did.subdetId() == EcalBarrel; + auto const hashedId = isBarrel ? ecal::reconstruction::hashedIndexEB(did.rawId()) + : offsetForHashes + ecal::reconstruction::hashedIndexEE(did.rawId()); + + // inits + int npassive = 0; + + calo::multifit::ColumnVector pulseOffsets; + CMS_UNROLL_LOOP + for (int i = 0; i < NPULSES; ++i) + pulseOffsets(i) = i; + + calo::multifit::ColumnVector resultAmplitudes; + CMS_UNROLL_LOOP + for (int counter = 0; counter < NPULSES; ++counter) + resultAmplitudes(counter) = 0; + + // inits + //SampleDecompLLT covariance_decomposition; + //SampleMatrix inverse_cov; + // SampleVector::Scalar chi2 = 0, chi2_now = 0; + float chi2 = 0, chi2_now = 0; + + // loop for up to max_iterations + for (int iter = 0; iter < max_iterations; ++iter) { + //inverse_cov = noisecov[idx]; + //DataType covMatrixStorage[MapSymM::total]; + DataType* covMatrixStorage = shrMatrixLForFnnlsStorage; + calo::multifit::MapSymM covMatrix{covMatrixStorage}; + int counter = 0; + CMS_UNROLL_LOOP + for (int col = 0; col < NSAMPLES; ++col) { + CMS_UNROLL_LOOP + for (int row = col; row < NSAMPLES; ++row) { + covMatrixStorage[counter++] = noisecov[idx].coeffRef(row, col); + } + } + update_covariance(pulse_covariance[hashedId], covMatrix, resultAmplitudes); + + // compute actual covariance decomposition + //covariance_decomposition.compute(inverse_cov); + //auto const& matrixL = covariance_decomposition.matrixL(); + DataType matrixLStorage[calo::multifit::MapSymM::total]; + calo::multifit::MapSymM matrixL{matrixLStorage}; + calo::multifit::compute_decomposition_unrolled(matrixL, covMatrix); + + // L * A = P + calo::multifit::ColMajorMatrix A; + calo::multifit::solve_forward_subst_matrix(A, pulse_matrix[idx], matrixL); + + // L b = s + float reg_b[NSAMPLES]; + calo::multifit::solve_forward_subst_vector(reg_b, samples[idx], matrixL); + + // FIXME: shared mem + //DataType AtAStorage[MapSymM::total]; + calo::multifit::MapSymM AtA{shrAtAStorage}; + //SampleMatrix AtA; + SampleVector Atb; + CMS_UNROLL_LOOP + for (int icol = 0; icol < NPULSES; ++icol) { + float reg_ai[NSAMPLES]; + + // load column icol + CMS_UNROLL_LOOP + for (int counter = 0; counter < NSAMPLES; ++counter) + reg_ai[counter] = A(counter, icol); + + // compute diagoanl + float sum = 0.f; + CMS_UNROLL_LOOP + for (int counter = 0; counter < NSAMPLES; ++counter) + sum += reg_ai[counter] * reg_ai[counter]; + + // store + AtA(icol, icol) = sum; + + // go thru the other columns + CMS_UNROLL_LOOP + for (int j = icol + 1; j < NPULSES; ++j) { + // load column j + float reg_aj[NSAMPLES]; + CMS_UNROLL_LOOP + for (int counter = 0; counter < NSAMPLES; ++counter) + reg_aj[counter] = A(counter, j); + + // accum + float sum = 0.f; + CMS_UNROLL_LOOP + for (int counter = 0; counter < NSAMPLES; ++counter) + sum += reg_aj[counter] * reg_ai[counter]; + + // store + //AtA(icol, j) = sum; + AtA(j, icol) = sum; + } + + // Atb accum + float sum_atb = 0.f; + CMS_UNROLL_LOOP + for (int counter = 0; counter < NSAMPLES; ++counter) + sum_atb += reg_ai[counter] * reg_b[counter]; + + // store atb + Atb(icol) = sum_atb; + } + + // FIXME: shared mem + //DataType matrixLForFnnlsStorage[MapSymM::total]; + calo::multifit::MapSymM matrixLForFnnls{shrMatrixLForFnnlsStorage}; + + calo::multifit::fnnls(AtA, + Atb, + //amplitudes[idx], + resultAmplitudes, + npassive, + pulseOffsets, + matrixLForFnnls, + 1e-11, + 500, + 16, + 2); + + calo::multifit::calculateChiSq(matrixL, pulse_matrix[idx], resultAmplitudes, samples[idx], chi2_now); + + auto const deltachi2 = chi2_now - chi2; + chi2 = chi2_now; + + if (std::abs(deltachi2) < 1e-3) + break; + } + + // store to global output values + // FIXME: amplitudes are used in global directly + chi2s[inputCh] = chi2; + energies[inputCh] = resultAmplitudes(5); + + CMS_UNROLL_LOOP + for (int counter = 0; counter < NPULSES; ++counter) + amplitudes[inputCh](counter) = resultAmplitudes(counter); + } + } + }; + + void minimization_procedure(Queue& queue, + InputProduct const& digisDevEB, + InputProduct const& digisDevEE, + OutputProduct& uncalibRecHitsDevEB, + OutputProduct& uncalibRecHitsDevEE, + EventDataForScratchDevice& scratch, + EcalMultifitConditionsDevice const& conditionsDev, + ConfigurationParameters const& configParams, + uint32_t const totalChannels) { + using DataType = SampleVector::Scalar; + // TODO: configure from python + auto threads_min = configParams.kernelMinimizeThreads[0]; + auto blocks_min = cms::alpakatools::divide_up_by(totalChannels, threads_min); + + auto workDivMinimize = cms::alpakatools::make_workdiv(blocks_min, threads_min); + alpaka::exec(queue, + workDivMinimize, + Kernel_minimize{}, + digisDevEB.const_view(), + digisDevEE.const_view(), + uncalibRecHitsDevEB.view(), + uncalibRecHitsDevEE.view(), + conditionsDev.const_view(), + reinterpret_cast<::ecal::multifit::SampleMatrix*>(scratch.noisecovDevBuf.data()), + reinterpret_cast<::ecal::multifit::PulseMatrixType*>(scratch.pulse_matrixDevBuf.data()), + reinterpret_cast<::ecal::multifit::BXVectorType*>(scratch.activeBXsDevBuf.data()), + reinterpret_cast<::ecal::multifit::SampleVector*>(scratch.samplesDevBuf.data()), + scratch.hasSwitchToGain6DevBuf.data(), + scratch.hasSwitchToGain1DevBuf.data(), + scratch.isSaturatedDevBuf.data(), + scratch.acStateDevBuf.data(), + 50); // maximum number of fit iterations + } + +} // namespace ALPAKA_ACCELERATOR_NAMESPACE::ecal::multifit + +namespace alpaka::trait { + using namespace ALPAKA_ACCELERATOR_NAMESPACE::ecal::multifit; + + //! The trait for getting the size of the block shared dynamic memory for Kernel_minimize. + template + struct BlockSharedMemDynSizeBytes { + //! \return The size of the shared memory allocated for a block. + template + ALPAKA_FN_HOST_ACC static auto getBlockSharedMemDynSizeBytes(Kernel_minimize const&, + TVec const& threadsPerBlock, + TVec const& elemsPerThread, + TArgs const&...) -> std::size_t { + using ScalarType = ecal::multifit::SampleVector::Scalar; + + // return the amount of dynamic shared memory needed + std::size_t bytes = 2 * threadsPerBlock[0u] * elemsPerThread[0u] * + calo::multifit::MapSymM::total * + sizeof(ScalarType); + return bytes; + } + }; +} // namespace alpaka::trait diff --git a/RecoLocalCalo/EcalRecProducers/plugins/alpaka/AmplitudeComputationKernels.h b/RecoLocalCalo/EcalRecProducers/plugins/alpaka/AmplitudeComputationKernels.h new file mode 100644 index 0000000000000..fa8700301bc81 --- /dev/null +++ b/RecoLocalCalo/EcalRecProducers/plugins/alpaka/AmplitudeComputationKernels.h @@ -0,0 +1,28 @@ +#ifndef RecoLocalCalo_EcalRecProducers_plugins_alpaka_AmplitudeComputationKernels_h +#define RecoLocalCalo_EcalRecProducers_plugins_alpaka_AmplitudeComputationKernels_h + +#include "CondFormats/EcalObjects/interface/alpaka/EcalMultifitConditionsDevice.h" +#include "DataFormats/EcalDigi/interface/alpaka/EcalDigiDeviceCollection.h" +#include "DataFormats/EcalRecHit/interface/alpaka/EcalUncalibratedRecHitDeviceCollection.h" +#include "HeterogeneousCore/AlpakaInterface/interface/config.h" +#include "HeterogeneousCore/AlpakaInterface/interface/traits.h" +#include "DeclsForKernels.h" + +namespace ALPAKA_ACCELERATOR_NAMESPACE::ecal::multifit { + + using InputProduct = EcalDigiDeviceCollection; + using OutputProduct = EcalUncalibratedRecHitDeviceCollection; + + void minimization_procedure(Queue& queue, + InputProduct const& digisDevEB, + InputProduct const& digisDevEE, + OutputProduct& uncalibRecHitsDevEB, + OutputProduct& uncalibRecHitsDevEE, + EventDataForScratchDevice& scratch, + EcalMultifitConditionsDevice const& conditionsDev, + ConfigurationParameters const& configParams, + uint32_t const totalChannels); + +} // namespace ALPAKA_ACCELERATOR_NAMESPACE::ecal::multifit + +#endif // RecoLocalCalo_EcalRecProducers_plugins_AmplitudeComputationKernels_h diff --git a/RecoLocalCalo/EcalRecProducers/plugins/alpaka/DeclsForKernels.h b/RecoLocalCalo/EcalRecProducers/plugins/alpaka/DeclsForKernels.h new file mode 100644 index 0000000000000..6f96b26d253d1 --- /dev/null +++ b/RecoLocalCalo/EcalRecProducers/plugins/alpaka/DeclsForKernels.h @@ -0,0 +1,130 @@ +#ifndef RecoLocalCalo_EcalRecProducers_plugins_alpaka_DeclsForKernels_h +#define RecoLocalCalo_EcalRecProducers_plugins_alpaka_DeclsForKernels_h + +#include + +#include "CondFormats/EcalObjects/interface/EcalTimeOffsetConstant.h" +#include "DataFormats/EcalDigi/interface/EcalDataFrame.h" +#include "RecoLocalCalo/EcalRecProducers/interface/EigenMatrixTypes_gpu.h" + +class EcalSampleMask; + +namespace ALPAKA_ACCELERATOR_NAMESPACE::ecal::multifit { + + enum class TimeComputationState : char { NotFinished = 0, Finished = 1 }; + enum class MinimizationState : char { + NotFinished = 0, + Finished = 1, + Precomputed = 2, + }; + + // parameters have a fixed type + // Can we go by with single precision + struct ConfigurationParameters { + using type = double; + + type timeFitLimitsFirstEB, timeFitLimitsFirstEE; + type timeFitLimitsSecondEB, timeFitLimitsSecondEE; + + type timeConstantTermEB, timeConstantTermEE; + + type timeNconstEB, timeNconstEE; + + type amplitudeThreshEE, amplitudeThreshEB; + + type outOfTimeThreshG12pEB, outOfTimeThreshG12mEB; + type outOfTimeThreshG12pEE, outOfTimeThreshG12mEE; + type outOfTimeThreshG61pEE, outOfTimeThreshG61mEE; + type outOfTimeThreshG61pEB, outOfTimeThreshG61mEB; + + std::array kernelMinimizeThreads; + + bool shouldRunTimingComputation; + }; + + template + constexpr uint32_t getLength() { + return EigenM::RowsAtCompileTime * EigenM::ColsAtCompileTime; + } + + struct EventDataForScratchDevice { + using SVT = ::ecal::multifit::SampleVector::Scalar; + using SGVT = ::ecal::multifit::SampleGainVector::Scalar; + using SMT = ::ecal::multifit::SampleMatrix::Scalar; + using PMT = ::ecal::multifit::PulseMatrixType::Scalar; + using BXVT = ::ecal::multifit::BXVectorType::Scalar; + + static constexpr auto svlength = getLength<::ecal::multifit::SampleVector>(); + static constexpr auto sgvlength = getLength<::ecal::multifit::SampleGainVector>(); + static constexpr auto smlength = getLength<::ecal::multifit::SampleMatrix>(); + static constexpr auto pmlength = getLength<::ecal::multifit::PulseMatrixType>(); + static constexpr auto bxvlength = getLength<::ecal::multifit::BXVectorType>(); + + // delete the default constructor because alpaka buffers do not have a default constructor + EventDataForScratchDevice() = delete; + + explicit EventDataForScratchDevice(ConfigurationParameters const& configParameters, uint32_t size, Queue& queue) + : samplesDevBuf{cms::alpakatools::make_device_buffer(queue, size * svlength)}, + gainsNoiseDevBuf{cms::alpakatools::make_device_buffer(queue, size * sgvlength)}, + noisecovDevBuf{cms::alpakatools::make_device_buffer(queue, size * smlength)}, + pulse_matrixDevBuf{cms::alpakatools::make_device_buffer(queue, size * pmlength)}, + activeBXsDevBuf{cms::alpakatools::make_device_buffer(queue, size * bxvlength)}, + acStateDevBuf{cms::alpakatools::make_device_buffer(queue, size)}, + hasSwitchToGain6DevBuf{cms::alpakatools::make_device_buffer(queue, size)}, + hasSwitchToGain1DevBuf{cms::alpakatools::make_device_buffer(queue, size)}, + isSaturatedDevBuf{cms::alpakatools::make_device_buffer(queue, size)} { + if (configParameters.shouldRunTimingComputation) { + sample_valuesDevBuf = cms::alpakatools::make_device_buffer(queue, size * svlength); + sample_value_errorsDevBuf = cms::alpakatools::make_device_buffer(queue, size * svlength); + useless_sample_valuesDevBuf = + cms::alpakatools::make_device_buffer(queue, size * EcalDataFrame::MAXSAMPLES); + chi2sNullHypotDevBuf = cms::alpakatools::make_device_buffer(queue, size); + sum0sNullHypotDevBuf = cms::alpakatools::make_device_buffer(queue, size); + sumAAsNullHypotDevBuf = cms::alpakatools::make_device_buffer(queue, size); + pedestal_numsDevBuf = cms::alpakatools::make_device_buffer(queue, size); + + tMaxAlphaBetasDevBuf = cms::alpakatools::make_device_buffer(queue, size); + tMaxErrorAlphaBetasDevBuf = cms::alpakatools::make_device_buffer(queue, size); + accTimeMaxDevBuf = cms::alpakatools::make_device_buffer(queue, size); + accTimeWgtDevBuf = cms::alpakatools::make_device_buffer(queue, size); + ampMaxAlphaBetaDevBuf = cms::alpakatools::make_device_buffer(queue, size); + ampMaxErrorDevBuf = cms::alpakatools::make_device_buffer(queue, size); + timeMaxDevBuf = cms::alpakatools::make_device_buffer(queue, size); + timeErrorDevBuf = cms::alpakatools::make_device_buffer(queue, size); + tcStateDevBuf = cms::alpakatools::make_device_buffer(queue, size); + } + }; + + cms::alpakatools::device_buffer samplesDevBuf; + cms::alpakatools::device_buffer gainsNoiseDevBuf; + + cms::alpakatools::device_buffer noisecovDevBuf; + cms::alpakatools::device_buffer pulse_matrixDevBuf; + cms::alpakatools::device_buffer activeBXsDevBuf; + cms::alpakatools::device_buffer acStateDevBuf; + + cms::alpakatools::device_buffer hasSwitchToGain6DevBuf; + cms::alpakatools::device_buffer hasSwitchToGain1DevBuf; + cms::alpakatools::device_buffer isSaturatedDevBuf; + + std::optional> sample_valuesDevBuf; + std::optional> sample_value_errorsDevBuf; + std::optional> useless_sample_valuesDevBuf; + std::optional> chi2sNullHypotDevBuf; + std::optional> sum0sNullHypotDevBuf; + std::optional> sumAAsNullHypotDevBuf; + std::optional> pedestal_numsDevBuf; + std::optional> tMaxAlphaBetasDevBuf; + std::optional> tMaxErrorAlphaBetasDevBuf; + std::optional> accTimeMaxDevBuf; + std::optional> accTimeWgtDevBuf; + std::optional> ampMaxAlphaBetaDevBuf; + std::optional> ampMaxErrorDevBuf; + std::optional> timeMaxDevBuf; + std::optional> timeErrorDevBuf; + std::optional> tcStateDevBuf; + }; + +} // namespace ALPAKA_ACCELERATOR_NAMESPACE::ecal::multifit + +#endif // RecoLocalCalo_EcalRecProducers_plugins_alpaka_DeclsForKernels_h diff --git a/RecoLocalCalo/EcalRecProducers/plugins/alpaka/EcalMultifitConditionsHostESProducer.cc b/RecoLocalCalo/EcalRecProducers/plugins/alpaka/EcalMultifitConditionsHostESProducer.cc new file mode 100644 index 0000000000000..6db1ff58b2740 --- /dev/null +++ b/RecoLocalCalo/EcalRecProducers/plugins/alpaka/EcalMultifitConditionsHostESProducer.cc @@ -0,0 +1,213 @@ +#include "FWCore/Framework/interface/ESTransientHandle.h" +#include "FWCore/ParameterSet/interface/ParameterSet.h" + +#include +#include +#include "CondFormats/DataRecord/interface/EcalGainRatiosRcd.h" +#include "CondFormats/DataRecord/interface/EcalPedestalsRcd.h" +#include "CondFormats/DataRecord/interface/EcalPulseCovariancesRcd.h" +#include "CondFormats/DataRecord/interface/EcalPulseShapesRcd.h" +#include "CondFormats/DataRecord/interface/EcalSampleMaskRcd.h" +#include "CondFormats/DataRecord/interface/EcalSamplesCorrelationRcd.h" +#include "CondFormats/DataRecord/interface/EcalTimeBiasCorrectionsRcd.h" +#include "CondFormats/DataRecord/interface/EcalTimeCalibConstantsRcd.h" +#include "CondFormats/DataRecord/interface/EcalTimeOffsetConstantRcd.h" +#include "CondFormats/EcalObjects/interface/EcalGainRatios.h" +#include "CondFormats/EcalObjects/interface/EcalPedestals.h" +#include "CondFormats/EcalObjects/interface/EcalPulseCovariances.h" +#include "CondFormats/EcalObjects/interface/EcalPulseShapes.h" +#include "CondFormats/EcalObjects/interface/EcalSamplesCorrelation.h" +#include "CondFormats/EcalObjects/interface/EcalSampleMask.h" +#include "CondFormats/EcalObjects/interface/EcalTimeBiasCorrections.h" +#include "CondFormats/EcalObjects/interface/EcalTimeCalibConstants.h" +#include "CondFormats/EcalObjects/interface/EcalTimeOffsetConstant.h" + +#include "CondFormats/EcalObjects/interface/alpaka/EcalMultifitConditionsDevice.h" +#include "CondFormats/EcalObjects/interface/EcalMultifitConditionsSoA.h" +#include "CondFormats/DataRecord/interface/EcalMultifitConditionsRcd.h" + +#include "DataFormats/EcalDigi/interface/EcalConstants.h" +#include "CondFormats/EcalObjects/interface/EcalPulseShapes.h" + +#include "DataFormats/EcalDetId/interface/EcalElectronicsId.h" +#include "HeterogeneousCore/AlpakaCore/interface/alpaka/ESGetToken.h" +#include "HeterogeneousCore/AlpakaCore/interface/alpaka/ESProducer.h" +#include "HeterogeneousCore/AlpakaCore/interface/alpaka/ModuleFactory.h" +#include "HeterogeneousCore/AlpakaInterface/interface/config.h" +#include "HeterogeneousCore/AlpakaInterface/interface/host.h" +#include "HeterogeneousCore/AlpakaInterface/interface/memory.h" + +namespace ALPAKA_ACCELERATOR_NAMESPACE { + class EcalMultifitConditionsHostESProducer : public ESProducer { + public: + EcalMultifitConditionsHostESProducer(edm::ParameterSet const& iConfig) : ESProducer(iConfig) { + auto cc = setWhatProduced(this); + pedestalsToken_ = cc.consumes(); + gainRatiosToken_ = cc.consumes(); + pulseShapesToken_ = cc.consumes(); + pulseCovariancesToken_ = cc.consumes(); + samplesCorrelationToken_ = cc.consumes(); + timeBiasCorrectionsToken_ = cc.consumes(); + timeCalibConstantsToken_ = cc.consumes(); + sampleMaskToken_ = cc.consumes(); + timeOffsetConstantToken_ = cc.consumes(); + } + + static void fillDescriptions(edm::ConfigurationDescriptions& descriptions) { + edm::ParameterSetDescription desc; + descriptions.addWithDefaultLabel(desc); + } + + std::unique_ptr produce(EcalMultifitConditionsRcd const& iRecord) { + auto const& pedestalsData = iRecord.get(pedestalsToken_); + auto const& gainRatiosData = iRecord.get(gainRatiosToken_); + auto const& pulseShapesData = iRecord.get(pulseShapesToken_); + auto const& pulseCovariancesData = iRecord.get(pulseCovariancesToken_); + auto const& samplesCorrelationData = iRecord.get(samplesCorrelationToken_); + auto const& timeBiasCorrectionsData = iRecord.get(timeBiasCorrectionsToken_); + auto const& timeCalibConstantsData = iRecord.get(timeCalibConstantsToken_); + auto const& sampleMaskData = iRecord.get(sampleMaskToken_); + auto const& timeOffsetConstantData = iRecord.get(timeOffsetConstantToken_); + + size_t numberOfXtals = pedestalsData.size(); + + auto product = std::make_unique(numberOfXtals, cms::alpakatools::host()); + auto view = product->view(); + + // Filling pedestals + const auto barrelSize = pedestalsData.barrelItems().size(); + const auto endcapSize = pedestalsData.endcapItems().size(); + + auto const& pedestalsEB = pedestalsData.barrelItems(); + auto const& pedestalsEE = pedestalsData.endcapItems(); + auto const& gainRatiosEB = gainRatiosData.barrelItems(); + auto const& gainRatiosEE = gainRatiosData.endcapItems(); + auto const& pulseShapesEB = pulseShapesData.barrelItems(); + auto const& pulseShapesEE = pulseShapesData.endcapItems(); + auto const& pulseCovariancesEB = pulseCovariancesData.barrelItems(); + auto const& pulseCovariancesEE = pulseCovariancesData.endcapItems(); + auto const& timeCalibConstantsEB = timeCalibConstantsData.barrelItems(); + auto const& timeCalibConstantsEE = timeCalibConstantsData.endcapItems(); + + for (unsigned int i = 0; i < barrelSize; i++) { + auto vi = view[i]; + + vi.pedestals_mean_x12() = pedestalsEB[i].mean_x12; + vi.pedestals_rms_x12() = pedestalsEB[i].rms_x12; + vi.pedestals_mean_x6() = pedestalsEB[i].mean_x6; + vi.pedestals_rms_x6() = pedestalsEB[i].rms_x6; + vi.pedestals_mean_x1() = pedestalsEB[i].mean_x1; + vi.pedestals_rms_x1() = pedestalsEB[i].rms_x1; + + vi.gain12Over6() = gainRatiosEB[i].gain12Over6(); + vi.gain6Over1() = gainRatiosEB[i].gain6Over1(); + + vi.timeCalibConstants() = timeCalibConstantsEB[i]; + + std::memcpy(vi.pulseShapes().data(), pulseShapesEB[i].pdfval, sizeof(float) * EcalPulseShape::TEMPLATESAMPLES); + for (unsigned int j = 0; j < EcalPulseShape::TEMPLATESAMPLES; j++) { + for (unsigned int k = 0; k < EcalPulseShape::TEMPLATESAMPLES; k++) { + vi.pulseCovariance()(j, k) = pulseCovariancesEB[i].val(j, k); + } + } + } // end Barrel loop + for (unsigned int i = 0; i < endcapSize; i++) { + auto vi = view[barrelSize + i]; + + vi.pedestals_mean_x12() = pedestalsEE[i].mean_x12; + vi.pedestals_rms_x12() = pedestalsEE[i].rms_x12; + vi.pedestals_mean_x6() = pedestalsEE[i].mean_x6; + vi.pedestals_rms_x6() = pedestalsEE[i].rms_x6; + vi.pedestals_mean_x1() = pedestalsEE[i].mean_x1; + vi.pedestals_rms_x1() = pedestalsEE[i].rms_x1; + + vi.gain12Over6() = gainRatiosEE[i].gain12Over6(); + vi.gain6Over1() = gainRatiosEE[i].gain6Over1(); + + vi.timeCalibConstants() = timeCalibConstantsEE[i]; + + std::memcpy(vi.pulseShapes().data(), pulseShapesEE[i].pdfval, sizeof(float) * EcalPulseShape::TEMPLATESAMPLES); + + for (unsigned int j = 0; j < EcalPulseShape::TEMPLATESAMPLES; j++) { + for (unsigned int k = 0; k < EcalPulseShape::TEMPLATESAMPLES; k++) { + vi.pulseCovariance()(j, k) = pulseCovariancesEE[i].val(j, k); + } + } + } // end Endcap loop + + // === Scalar data (not by xtal) + //TimeBiasCorrection + // Assert that there are not more parameters than the EcalMultiFitConditionsSoA expects + assert(timeBiasCorrectionsData.EBTimeCorrAmplitudeBins.size() <= kMaxTimeBiasCorrectionBinsEB); + assert(timeBiasCorrectionsData.EBTimeCorrShiftBins.size() <= kMaxTimeBiasCorrectionBinsEB); + std::memcpy(view.timeBiasCorrections_amplitude_EB().data(), + timeBiasCorrectionsData.EBTimeCorrAmplitudeBins.data(), + sizeof(float) * kMaxTimeBiasCorrectionBinsEB); + std::memcpy(view.timeBiasCorrections_shift_EB().data(), + timeBiasCorrectionsData.EBTimeCorrShiftBins.data(), + sizeof(float) * kMaxTimeBiasCorrectionBinsEB); + + // Assert that there are not more parameters than the EcalMultiFitConditionsSoA expects + assert(timeBiasCorrectionsData.EETimeCorrAmplitudeBins.size() <= kMaxTimeBiasCorrectionBinsEE); + assert(timeBiasCorrectionsData.EETimeCorrShiftBins.size() <= kMaxTimeBiasCorrectionBinsEE); + std::memcpy(view.timeBiasCorrections_amplitude_EE().data(), + timeBiasCorrectionsData.EETimeCorrAmplitudeBins.data(), + sizeof(float) * kMaxTimeBiasCorrectionBinsEE); + std::memcpy(view.timeBiasCorrections_shift_EE().data(), + timeBiasCorrectionsData.EETimeCorrShiftBins.data(), + sizeof(float) * kMaxTimeBiasCorrectionBinsEE); + + view.timeBiasCorrectionSizeEB() = + std::min(timeBiasCorrectionsData.EBTimeCorrAmplitudeBins.size(), kMaxTimeBiasCorrectionBinsEB); + view.timeBiasCorrectionSizeEE() = + std::min(timeBiasCorrectionsData.EETimeCorrAmplitudeBins.size(), kMaxTimeBiasCorrectionBinsEE); + + // SampleCorrelation + std::memcpy(view.sampleCorrelation_EB_G12().data(), + samplesCorrelationData.EBG12SamplesCorrelation.data(), + sizeof(double) * ecalPh1::sampleSize); + std::memcpy(view.sampleCorrelation_EB_G6().data(), + samplesCorrelationData.EBG6SamplesCorrelation.data(), + sizeof(double) * ecalPh1::sampleSize); + std::memcpy(view.sampleCorrelation_EB_G1().data(), + samplesCorrelationData.EBG1SamplesCorrelation.data(), + sizeof(double) * ecalPh1::sampleSize); + + std::memcpy(view.sampleCorrelation_EE_G12().data(), + samplesCorrelationData.EEG12SamplesCorrelation.data(), + sizeof(double) * ecalPh1::sampleSize); + std::memcpy(view.sampleCorrelation_EE_G6().data(), + samplesCorrelationData.EBG6SamplesCorrelation.data(), + sizeof(double) * ecalPh1::sampleSize); + std::memcpy(view.sampleCorrelation_EE_G1().data(), + samplesCorrelationData.EEG1SamplesCorrelation.data(), + sizeof(double) * ecalPh1::sampleSize); + + // Sample masks + view.sampleMask_EB() = sampleMaskData.getEcalSampleMaskRecordEB(); + view.sampleMask_EE() = sampleMaskData.getEcalSampleMaskRecordEE(); + + // Time offsets + view.timeOffset_EB() = timeOffsetConstantData.getEBValue(); + view.timeOffset_EE() = timeOffsetConstantData.getEEValue(); + + // number of barrel items as offset for hashed ID access to EE items of columns + view.offsetEE() = barrelSize; + + return product; + } + + private: + edm::ESGetToken pedestalsToken_; + edm::ESGetToken gainRatiosToken_; + edm::ESGetToken pulseShapesToken_; + edm::ESGetToken pulseCovariancesToken_; + edm::ESGetToken samplesCorrelationToken_; + edm::ESGetToken timeBiasCorrectionsToken_; + edm::ESGetToken timeCalibConstantsToken_; + edm::ESGetToken sampleMaskToken_; + edm::ESGetToken timeOffsetConstantToken_; + }; +} // namespace ALPAKA_ACCELERATOR_NAMESPACE + +DEFINE_FWK_EVENTSETUP_ALPAKA_MODULE(EcalMultifitConditionsHostESProducer); diff --git a/RecoLocalCalo/EcalRecProducers/plugins/alpaka/EcalMultifitParametersHostESProducer.cc b/RecoLocalCalo/EcalRecProducers/plugins/alpaka/EcalMultifitParametersHostESProducer.cc new file mode 100644 index 0000000000000..809dacdabc43e --- /dev/null +++ b/RecoLocalCalo/EcalRecProducers/plugins/alpaka/EcalMultifitParametersHostESProducer.cc @@ -0,0 +1,99 @@ +#include + +#include "FWCore/ParameterSet/interface/ParameterSet.h" + +#include "CondFormats/EcalObjects/interface/alpaka/EcalMultifitParametersDevice.h" +#include "CondFormats/EcalObjects/interface/EcalMultifitParametersSoA.h" +#include "CondFormats/DataRecord/interface/EcalMultifitParametersRcd.h" + +#include "DataFormats/EcalDigi/interface/EcalConstants.h" + +#include "HeterogeneousCore/AlpakaCore/interface/alpaka/ESGetToken.h" +#include "HeterogeneousCore/AlpakaCore/interface/alpaka/ESProducer.h" +#include "HeterogeneousCore/AlpakaCore/interface/alpaka/ModuleFactory.h" +#include "HeterogeneousCore/AlpakaInterface/interface/config.h" +#include "HeterogeneousCore/AlpakaInterface/interface/host.h" +#include "HeterogeneousCore/AlpakaInterface/interface/memory.h" + +namespace ALPAKA_ACCELERATOR_NAMESPACE { + class EcalMultifitParametersHostESProducer : public ESProducer { + public: + EcalMultifitParametersHostESProducer(edm::ParameterSet const&); + ~EcalMultifitParametersHostESProducer() override = default; + + static void fillDescriptions(edm::ConfigurationDescriptions&); + std::unique_ptr produce(EcalMultifitParametersRcd const&); + + private: + std::vector ebTimeFitParameters_; + std::vector eeTimeFitParameters_; + std::vector ebAmplitudeFitParameters_; + std::vector eeAmplitudeFitParameters_; + }; + + EcalMultifitParametersHostESProducer::EcalMultifitParametersHostESProducer(edm::ParameterSet const& iConfig) + : ESProducer(iConfig) { + setWhatProduced(this); + + auto const ebTimeFitParamsFromPSet = iConfig.getParameter>("EBtimeFitParameters"); + auto const eeTimeFitParamsFromPSet = iConfig.getParameter>("EEtimeFitParameters"); + // Assert that there are as many parameters as the EcalMultiFitParametersSoA expects + assert(ebTimeFitParamsFromPSet.size() == kNTimeFitParams); + assert(eeTimeFitParamsFromPSet.size() == kNTimeFitParams); + ebTimeFitParameters_.assign(ebTimeFitParamsFromPSet.begin(), ebTimeFitParamsFromPSet.end()); + eeTimeFitParameters_.assign(eeTimeFitParamsFromPSet.begin(), eeTimeFitParamsFromPSet.end()); + + auto const ebAmplFitParamsFromPSet = iConfig.getParameter>("EBamplitudeFitParameters"); + auto const eeAmplFitParamsFromPSet = iConfig.getParameter>("EEamplitudeFitParameters"); + // Assert that there are as many parameters as the EcalMultiFitParametersSoA expects + assert(ebAmplFitParamsFromPSet.size() == kNAmplitudeFitParams); + assert(eeAmplFitParamsFromPSet.size() == kNAmplitudeFitParams); + ebAmplitudeFitParameters_.assign(ebAmplFitParamsFromPSet.begin(), ebAmplFitParamsFromPSet.end()); + eeAmplitudeFitParameters_.assign(eeAmplFitParamsFromPSet.begin(), eeAmplFitParamsFromPSet.end()); + } + + void EcalMultifitParametersHostESProducer::fillDescriptions(edm::ConfigurationDescriptions& descriptions) { + edm::ParameterSetDescription desc; + desc.add>("EBtimeFitParameters", + {-2.015452e+00, + 3.130702e+00, + -1.234730e+01, + 4.188921e+01, + -8.283944e+01, + 9.101147e+01, + -5.035761e+01, + 1.105621e+01}); + desc.add>("EEtimeFitParameters", + {-2.390548e+00, + 3.553628e+00, + -1.762341e+01, + 6.767538e+01, + -1.332130e+02, + 1.407432e+02, + -7.541106e+01, + 1.620277e+01}); + desc.add>("EBamplitudeFitParameters", {1.138, 1.652}); + desc.add>("EEamplitudeFitParameters", {1.890, 1.400}); + descriptions.addWithDefaultLabel(desc); + } + + std::unique_ptr EcalMultifitParametersHostESProducer::produce( + EcalMultifitParametersRcd const& iRecord) { + size_t const sizeone = 1; + auto product = std::make_unique(sizeone, cms::alpakatools::host()); + auto view = product->view(); + + std::memcpy(view.timeFitParamsEB().data(), ebTimeFitParameters_.data(), sizeof(float) * kNTimeFitParams); + std::memcpy(view.timeFitParamsEE().data(), eeTimeFitParameters_.data(), sizeof(float) * kNTimeFitParams); + + std::memcpy( + view.amplitudeFitParamsEB().data(), ebAmplitudeFitParameters_.data(), sizeof(float) * kNAmplitudeFitParams); + std::memcpy( + view.amplitudeFitParamsEE().data(), eeAmplitudeFitParameters_.data(), sizeof(float) * kNAmplitudeFitParams); + + return product; + } + +} // namespace ALPAKA_ACCELERATOR_NAMESPACE + +DEFINE_FWK_EVENTSETUP_ALPAKA_MODULE(EcalMultifitParametersHostESProducer); diff --git a/RecoLocalCalo/EcalRecProducers/plugins/alpaka/EcalUncalibRecHitMultiFitAlgoPortable.dev.cc b/RecoLocalCalo/EcalRecProducers/plugins/alpaka/EcalUncalibRecHitMultiFitAlgoPortable.dev.cc new file mode 100644 index 0000000000000..e2f56ae901903 --- /dev/null +++ b/RecoLocalCalo/EcalRecProducers/plugins/alpaka/EcalUncalibRecHitMultiFitAlgoPortable.dev.cc @@ -0,0 +1,234 @@ +#include +#include +#include + +#include "HeterogeneousCore/AlpakaInterface/interface/workdivision.h" + +#include "AmplitudeComputationCommonKernels.h" +#include "AmplitudeComputationKernels.h" +#include "EcalUncalibRecHitMultiFitAlgoPortable.h" +#include "TimeComputationKernels.h" + +//#define DEBUG +//#define ECAL_RECO_ALPAKA_DEBUG + +namespace ALPAKA_ACCELERATOR_NAMESPACE::ecal::multifit { + + using namespace cms::alpakatools; + + void launchKernels(Queue& queue, + InputProduct const& digisDevEB, + InputProduct const& digisDevEE, + OutputProduct& uncalibRecHitsDevEB, + OutputProduct& uncalibRecHitsDevEE, + EcalMultifitConditionsDevice const& conditionsDev, + EcalMultifitParametersDevice const& paramsDev, + ConfigurationParameters const& configParams) { + using digis_type = std::vector; + using dids_type = std::vector; + // according to the cpu setup //----> hardcoded + bool constexpr gainSwitchUseMaxSampleEB = true; + // according to the cpu setup //----> hardcoded + bool constexpr gainSwitchUseMaxSampleEE = false; + auto constexpr kMaxSamples = EcalDataFrame::MAXSAMPLES; + + auto const ebSize = static_cast(uncalibRecHitsDevEB.const_view().metadata().size()); + auto const totalChannels = ebSize + static_cast(uncalibRecHitsDevEE.const_view().metadata().size()); + + EventDataForScratchDevice scratch(configParams, totalChannels, queue); + + // + // 1d preparation kernel + // + uint32_t constexpr nchannels_per_block = 32; + auto constexpr threads_1d = kMaxSamples * nchannels_per_block; + auto const blocks_1d = cms::alpakatools::divide_up_by(totalChannels * kMaxSamples, threads_1d); + auto workDivPrep1D = cms::alpakatools::make_workdiv(blocks_1d, threads_1d); + // Since the ::ecal::multifit::X objects are non-dynamic Eigen::Matrix types the returned pointers from the buffers + // and the ::ecal::multifit::X* both point to the data. + alpaka::exec(queue, + workDivPrep1D, + Kernel_prep_1d_and_initialize{}, + digisDevEB.const_view(), + digisDevEE.const_view(), + uncalibRecHitsDevEB.view(), + uncalibRecHitsDevEE.view(), + conditionsDev.const_view(), + reinterpret_cast<::ecal::multifit::SampleVector*>(scratch.samplesDevBuf.data()), + reinterpret_cast<::ecal::multifit::SampleGainVector*>(scratch.gainsNoiseDevBuf.data()), + scratch.hasSwitchToGain6DevBuf.data(), + scratch.hasSwitchToGain1DevBuf.data(), + scratch.isSaturatedDevBuf.data(), + scratch.acStateDevBuf.data(), + reinterpret_cast<::ecal::multifit::BXVectorType*>(scratch.activeBXsDevBuf.data()), + gainSwitchUseMaxSampleEB, + gainSwitchUseMaxSampleEE); + + // + // 2d preparation kernel + // + Vec2D const blocks_2d{1u, totalChannels}; // {y, x} coordiantes + Vec2D const threads_2d{kMaxSamples, kMaxSamples}; + auto workDivPrep2D = cms::alpakatools::make_workdiv(blocks_2d, threads_2d); + alpaka::exec(queue, + workDivPrep2D, + Kernel_prep_2d{}, + digisDevEB.const_view(), + digisDevEE.const_view(), + conditionsDev.const_view(), + reinterpret_cast<::ecal::multifit::SampleGainVector*>(scratch.gainsNoiseDevBuf.data()), + reinterpret_cast<::ecal::multifit::SampleMatrix*>(scratch.noisecovDevBuf.data()), + reinterpret_cast<::ecal::multifit::PulseMatrixType*>(scratch.pulse_matrixDevBuf.data()), + scratch.hasSwitchToGain6DevBuf.data(), + scratch.hasSwitchToGain1DevBuf.data(), + scratch.isSaturatedDevBuf.data()); + + // run minimization kernels + minimization_procedure(queue, + digisDevEB, + digisDevEE, + uncalibRecHitsDevEB, + uncalibRecHitsDevEE, + scratch, + conditionsDev, + configParams, + totalChannels); + + if (configParams.shouldRunTimingComputation) { + // + // TODO: this guy can run concurrently with other kernels, + // there is no dependence on the order of execution + // + auto const blocks_time_init = blocks_1d; + auto const threads_time_init = threads_1d; + auto workDivTimeCompInit1D = cms::alpakatools::make_workdiv(blocks_time_init, threads_time_init); + alpaka::exec(queue, + workDivTimeCompInit1D, + Kernel_time_computation_init{}, + digisDevEB.const_view(), + digisDevEE.const_view(), + conditionsDev.const_view(), + scratch.sample_valuesDevBuf.value().data(), + scratch.sample_value_errorsDevBuf.value().data(), + scratch.ampMaxErrorDevBuf.value().data(), + scratch.useless_sample_valuesDevBuf.value().data(), + scratch.pedestal_numsDevBuf.value().data()); + + // + // TODO: small kernel only for EB. It needs to be checked if + /// fusing such small kernels is beneficial in here + // + // we are running only over EB digis + // therefore we need to create threads/blocks only for that + auto const threadsFixMGPA = threads_1d; + auto const blocksFixMGPA = cms::alpakatools::divide_up_by(kMaxSamples * ebSize, threadsFixMGPA); + auto workDivTimeFixMGPAslew1D = cms::alpakatools::make_workdiv(blocksFixMGPA, threadsFixMGPA); + alpaka::exec(queue, + workDivTimeFixMGPAslew1D, + Kernel_time_compute_fixMGPAslew{}, + digisDevEB.const_view(), + digisDevEE.const_view(), + conditionsDev.const_view(), + scratch.sample_valuesDevBuf.value().data(), + scratch.sample_value_errorsDevBuf.value().data(), + scratch.useless_sample_valuesDevBuf.value().data()); + + auto const threads_nullhypot = threads_1d; + auto const blocks_nullhypot = blocks_1d; + auto workDivTimeNullhypot1D = cms::alpakatools::make_workdiv(blocks_nullhypot, threads_nullhypot); + alpaka::exec(queue, + workDivTimeNullhypot1D, + Kernel_time_compute_nullhypot{}, + scratch.sample_valuesDevBuf.value().data(), + scratch.sample_value_errorsDevBuf.value().data(), + scratch.useless_sample_valuesDevBuf.value().data(), + scratch.chi2sNullHypotDevBuf.value().data(), + scratch.sum0sNullHypotDevBuf.value().data(), + scratch.sumAAsNullHypotDevBuf.value().data(), + totalChannels); + + constexpr uint32_t nchannels_per_block_makeratio = kMaxSamples; + constexpr auto nthreads_per_channel = + nchannels_per_block_makeratio * (nchannels_per_block_makeratio - 1) / 2; // n(n-1)/2 + constexpr auto threads_makeratio = nthreads_per_channel * nchannels_per_block_makeratio; + auto const blocks_makeratio = + cms::alpakatools::divide_up_by(nthreads_per_channel * totalChannels, threads_makeratio); + auto workDivTimeMakeRatio1D = cms::alpakatools::make_workdiv(blocks_makeratio, threads_makeratio); + alpaka::exec(queue, + workDivTimeMakeRatio1D, + Kernel_time_compute_makeratio{}, + digisDevEB.const_view(), + digisDevEE.const_view(), + scratch.sample_valuesDevBuf.value().data(), + scratch.sample_value_errorsDevBuf.value().data(), + scratch.useless_sample_valuesDevBuf.value().data(), + scratch.pedestal_numsDevBuf.value().data(), + scratch.sumAAsNullHypotDevBuf.value().data(), + scratch.sum0sNullHypotDevBuf.value().data(), + scratch.tMaxAlphaBetasDevBuf.value().data(), + scratch.tMaxErrorAlphaBetasDevBuf.value().data(), + scratch.accTimeMaxDevBuf.value().data(), + scratch.accTimeWgtDevBuf.value().data(), + scratch.tcStateDevBuf.value().data(), + paramsDev.const_view(), + configParams.timeFitLimitsFirstEB, + configParams.timeFitLimitsFirstEE, + configParams.timeFitLimitsSecondEB, + configParams.timeFitLimitsSecondEE); + + auto const threads_findamplchi2 = threads_1d; + auto const blocks_findamplchi2 = blocks_1d; + auto workDivTimeFindAmplChi21D = cms::alpakatools::make_workdiv(blocks_findamplchi2, threads_findamplchi2); + alpaka::exec(queue, + workDivTimeFindAmplChi21D, + Kernel_time_compute_findamplchi2_and_finish{}, + digisDevEB.const_view(), + digisDevEE.const_view(), + scratch.sample_valuesDevBuf.value().data(), + scratch.sample_value_errorsDevBuf.value().data(), + scratch.useless_sample_valuesDevBuf.value().data(), + scratch.tMaxAlphaBetasDevBuf.value().data(), + scratch.tMaxErrorAlphaBetasDevBuf.value().data(), + scratch.accTimeMaxDevBuf.value().data(), + scratch.accTimeWgtDevBuf.value().data(), + scratch.sumAAsNullHypotDevBuf.value().data(), + scratch.sum0sNullHypotDevBuf.value().data(), + scratch.chi2sNullHypotDevBuf.value().data(), + scratch.tcStateDevBuf.value().data(), + scratch.ampMaxAlphaBetaDevBuf.value().data(), + scratch.ampMaxErrorDevBuf.value().data(), + scratch.timeMaxDevBuf.value().data(), + scratch.timeErrorDevBuf.value().data(), + paramsDev.const_view()); + + auto const threads_timecorr = 32; + auto const blocks_timecorr = cms::alpakatools::divide_up_by(totalChannels, threads_timecorr); + auto workDivCorrFinal1D = cms::alpakatools::make_workdiv(blocks_timecorr, threads_timecorr); + alpaka::exec(queue, + workDivCorrFinal1D, + Kernel_time_correction_and_finalize{}, + digisDevEB.const_view(), + digisDevEE.const_view(), + uncalibRecHitsDevEB.view(), + uncalibRecHitsDevEE.view(), + conditionsDev.const_view(), + scratch.timeMaxDevBuf.value().data(), + scratch.timeErrorDevBuf.value().data(), + configParams.timeConstantTermEB, + configParams.timeConstantTermEE, + configParams.timeNconstEB, + configParams.timeNconstEE, + configParams.amplitudeThreshEB, + configParams.amplitudeThreshEE, + configParams.outOfTimeThreshG12pEB, + configParams.outOfTimeThreshG12pEE, + configParams.outOfTimeThreshG12mEB, + configParams.outOfTimeThreshG12mEE, + configParams.outOfTimeThreshG61pEB, + configParams.outOfTimeThreshG61pEE, + configParams.outOfTimeThreshG61mEB, + configParams.outOfTimeThreshG61mEE); + } + } + +} // namespace ALPAKA_ACCELERATOR_NAMESPACE::ecal::multifit diff --git a/RecoLocalCalo/EcalRecProducers/plugins/alpaka/EcalUncalibRecHitMultiFitAlgoPortable.h b/RecoLocalCalo/EcalRecProducers/plugins/alpaka/EcalUncalibRecHitMultiFitAlgoPortable.h new file mode 100644 index 0000000000000..c63b3f8181315 --- /dev/null +++ b/RecoLocalCalo/EcalRecProducers/plugins/alpaka/EcalUncalibRecHitMultiFitAlgoPortable.h @@ -0,0 +1,30 @@ +#ifndef RecoLocalCalo_EcalRecProducers_plugins_alpaka_EcalUncalibRecHitMultiFitAlgoPortable_h +#define RecoLocalCalo_EcalRecProducers_plugins_alpaka_EcalUncalibRecHitMultiFitAlgoPortable_h + +#include + +#include "CondFormats/EcalObjects/interface/alpaka/EcalMultifitConditionsDevice.h" +#include "CondFormats/EcalObjects/interface/alpaka/EcalMultifitParametersDevice.h" +#include "DataFormats/EcalDigi/interface/alpaka/EcalDigiDeviceCollection.h" +#include "DataFormats/EcalRecHit/interface/alpaka/EcalUncalibratedRecHitDeviceCollection.h" +#include "HeterogeneousCore/AlpakaInterface/interface/config.h" +#include "HeterogeneousCore/AlpakaInterface/interface/traits.h" +#include "DeclsForKernels.h" + +namespace ALPAKA_ACCELERATOR_NAMESPACE::ecal::multifit { + + using InputProduct = EcalDigiDeviceCollection; + using OutputProduct = EcalUncalibratedRecHitDeviceCollection; + + void launchKernels(Queue& queue, + InputProduct const& digisDevEB, + InputProduct const& digisDevEE, + OutputProduct& uncalibRecHitsDevEB, + OutputProduct& uncalibRecHitsDevEE, + EcalMultifitConditionsDevice const& conditionsDev, + EcalMultifitParametersDevice const& paramsDev, + ConfigurationParameters const& configParams); + +} // namespace ALPAKA_ACCELERATOR_NAMESPACE::ecal::multifit + +#endif // RecoLocalCalo_EcalRecProducers_plugins_alpaka_EcalUncalibRecHitMultiFitAlgoPortable_h diff --git a/RecoLocalCalo/EcalRecProducers/plugins/alpaka/EcalUncalibRecHitProducerPortable.cc b/RecoLocalCalo/EcalRecProducers/plugins/alpaka/EcalUncalibRecHitProducerPortable.cc new file mode 100644 index 0000000000000..d0f06f3caf186 --- /dev/null +++ b/RecoLocalCalo/EcalRecProducers/plugins/alpaka/EcalUncalibRecHitProducerPortable.cc @@ -0,0 +1,222 @@ +#include "CondFormats/DataRecord/interface/EcalMultifitConditionsRcd.h" +#include "CondFormats/DataRecord/interface/EcalMultifitParametersRcd.h" +#include "CondFormats/EcalObjects/interface/alpaka/EcalMultifitConditionsDevice.h" +#include "CondFormats/EcalObjects/interface/alpaka/EcalMultifitParametersDevice.h" +#include "DataFormats/EcalDigi/interface/alpaka/EcalDigiDeviceCollection.h" +#include "DataFormats/EcalRecHit/interface/alpaka/EcalUncalibratedRecHitDeviceCollection.h" +#include "FWCore/ParameterSet/interface/ConfigurationDescriptions.h" +#include "FWCore/ParameterSet/interface/ParameterSet.h" +#include "FWCore/ParameterSet/interface/ParameterSetDescription.h" +#include "HeterogeneousCore/AlpakaCore/interface/alpaka/EDGetToken.h" +#include "HeterogeneousCore/AlpakaCore/interface/alpaka/EDPutToken.h" +#include "HeterogeneousCore/AlpakaCore/interface/alpaka/Event.h" +#include "HeterogeneousCore/AlpakaCore/interface/alpaka/EventSetup.h" +#include "HeterogeneousCore/AlpakaCore/interface/alpaka/stream/SynchronizingEDProducer.h" + +#include "DeclsForKernels.h" +#include "EcalUncalibRecHitMultiFitAlgoPortable.h" + +namespace ALPAKA_ACCELERATOR_NAMESPACE { + + class EcalUncalibRecHitProducerPortable : public stream::SynchronizingEDProducer<> { + public: + explicit EcalUncalibRecHitProducerPortable(edm::ParameterSet const& ps); + ~EcalUncalibRecHitProducerPortable() override = default; + static void fillDescriptions(edm::ConfigurationDescriptions&); + + void acquire(device::Event const&, device::EventSetup const&) override; + void produce(device::Event&, device::EventSetup const&) override; + + private: + using InputProduct = EcalDigiDeviceCollection; + const device::EDGetToken digisTokenEB_; + const device::EDGetToken digisTokenEE_; + using OutputProduct = EcalUncalibratedRecHitDeviceCollection; + const device::EDPutToken uncalibRecHitsTokenEB_; + const device::EDPutToken uncalibRecHitsTokenEE_; + + // conditions tokens + const device::ESGetToken multifitConditionsToken_; + const device::ESGetToken multifitParametersToken_; + + // configuration parameters + ecal::multifit::ConfigurationParameters configParameters_; + + cms::alpakatools::host_buffer ebDigisSizeHostBuf_; + cms::alpakatools::host_buffer eeDigisSizeHostBuf_; + }; + + void EcalUncalibRecHitProducerPortable::fillDescriptions(edm::ConfigurationDescriptions& confDesc) { + edm::ParameterSetDescription desc; + + desc.add("digisLabelEB", edm::InputTag("ecalRawToDigiPortable", "ebDigis")); + desc.add("digisLabelEE", edm::InputTag("ecalRawToDigiPortable", "eeDigis")); + + desc.add("recHitsLabelEB", "EcalUncalibRecHitsEB"); + desc.add("recHitsLabelEE", "EcalUncalibRecHitsEE"); + + desc.add("EBtimeFitLimits_Lower", 0.2); + desc.add("EBtimeFitLimits_Upper", 1.4); + desc.add("EEtimeFitLimits_Lower", 0.2); + desc.add("EEtimeFitLimits_Upper", 1.4); + desc.add("EBtimeConstantTerm", .6); + desc.add("EEtimeConstantTerm", 1.0); + desc.add("EBtimeNconst", 28.5); + desc.add("EEtimeNconst", 31.8); + desc.add("outOfTimeThresholdGain12pEB", 5); + desc.add("outOfTimeThresholdGain12mEB", 5); + desc.add("outOfTimeThresholdGain61pEB", 5); + desc.add("outOfTimeThresholdGain61mEB", 5); + desc.add("outOfTimeThresholdGain12pEE", 1000); + desc.add("outOfTimeThresholdGain12mEE", 1000); + desc.add("outOfTimeThresholdGain61pEE", 1000); + desc.add("outOfTimeThresholdGain61mEE", 1000); + desc.add("amplitudeThresholdEB", 10); + desc.add("amplitudeThresholdEE", 10); + desc.addUntracked>("kernelMinimizeThreads", {32, 1, 1}); + desc.add("shouldRunTimingComputation", true); + confDesc.addWithDefaultLabel(desc); + } + + EcalUncalibRecHitProducerPortable::EcalUncalibRecHitProducerPortable(const edm::ParameterSet& ps) + : digisTokenEB_{consumes(ps.getParameter("digisLabelEB"))}, + digisTokenEE_{consumes(ps.getParameter("digisLabelEE"))}, + uncalibRecHitsTokenEB_{produces(ps.getParameter("recHitsLabelEB"))}, + uncalibRecHitsTokenEE_{produces(ps.getParameter("recHitsLabelEE"))}, + multifitConditionsToken_{esConsumes()}, + multifitParametersToken_{esConsumes()}, + ebDigisSizeHostBuf_{cms::alpakatools::make_host_buffer()}, + eeDigisSizeHostBuf_{cms::alpakatools::make_host_buffer()} { + std::pair EBtimeFitLimits, EEtimeFitLimits; + EBtimeFitLimits.first = ps.getParameter("EBtimeFitLimits_Lower"); + EBtimeFitLimits.second = ps.getParameter("EBtimeFitLimits_Upper"); + EEtimeFitLimits.first = ps.getParameter("EEtimeFitLimits_Lower"); + EEtimeFitLimits.second = ps.getParameter("EEtimeFitLimits_Upper"); + + auto EBtimeConstantTerm = ps.getParameter("EBtimeConstantTerm"); + auto EEtimeConstantTerm = ps.getParameter("EEtimeConstantTerm"); + auto EBtimeNconst = ps.getParameter("EBtimeNconst"); + auto EEtimeNconst = ps.getParameter("EEtimeNconst"); + + auto outOfTimeThreshG12pEB = ps.getParameter("outOfTimeThresholdGain12pEB"); + auto outOfTimeThreshG12mEB = ps.getParameter("outOfTimeThresholdGain12mEB"); + auto outOfTimeThreshG61pEB = ps.getParameter("outOfTimeThresholdGain61pEB"); + auto outOfTimeThreshG61mEB = ps.getParameter("outOfTimeThresholdGain61mEB"); + auto outOfTimeThreshG12pEE = ps.getParameter("outOfTimeThresholdGain12pEE"); + auto outOfTimeThreshG12mEE = ps.getParameter("outOfTimeThresholdGain12mEE"); + auto outOfTimeThreshG61pEE = ps.getParameter("outOfTimeThresholdGain61pEE"); + auto outOfTimeThreshG61mEE = ps.getParameter("outOfTimeThresholdGain61mEE"); + auto amplitudeThreshEB = ps.getParameter("amplitudeThresholdEB"); + auto amplitudeThreshEE = ps.getParameter("amplitudeThresholdEE"); + + // switch to run timing computation kernels + configParameters_.shouldRunTimingComputation = ps.getParameter("shouldRunTimingComputation"); + + // minimize kernel launch conf + auto threadsMinimize = ps.getUntrackedParameter>("kernelMinimizeThreads"); + configParameters_.kernelMinimizeThreads[0] = threadsMinimize[0]; + configParameters_.kernelMinimizeThreads[1] = threadsMinimize[1]; + configParameters_.kernelMinimizeThreads[2] = threadsMinimize[2]; + + // + // configuration and physics parameters: done once + // assume there is a single device + // use sync copying + // + + // time fit parameters and limits + configParameters_.timeFitLimitsFirstEB = EBtimeFitLimits.first; + configParameters_.timeFitLimitsSecondEB = EBtimeFitLimits.second; + configParameters_.timeFitLimitsFirstEE = EEtimeFitLimits.first; + configParameters_.timeFitLimitsSecondEE = EEtimeFitLimits.second; + + // time constant terms + configParameters_.timeConstantTermEB = EBtimeConstantTerm; + configParameters_.timeConstantTermEE = EEtimeConstantTerm; + + // time N const + configParameters_.timeNconstEB = EBtimeNconst; + configParameters_.timeNconstEE = EEtimeNconst; + + // amplitude threshold for time flags + configParameters_.amplitudeThreshEB = amplitudeThreshEB; + configParameters_.amplitudeThreshEE = amplitudeThreshEE; + + // out of time thresholds gain-dependent + configParameters_.outOfTimeThreshG12pEB = outOfTimeThreshG12pEB; + configParameters_.outOfTimeThreshG12pEE = outOfTimeThreshG12pEE; + configParameters_.outOfTimeThreshG61pEB = outOfTimeThreshG61pEB; + configParameters_.outOfTimeThreshG61pEE = outOfTimeThreshG61pEE; + configParameters_.outOfTimeThreshG12mEB = outOfTimeThreshG12mEB; + configParameters_.outOfTimeThreshG12mEE = outOfTimeThreshG12mEE; + configParameters_.outOfTimeThreshG61mEB = outOfTimeThreshG61mEB; + configParameters_.outOfTimeThreshG61mEE = outOfTimeThreshG61mEE; + } + + void EcalUncalibRecHitProducerPortable::acquire(device::Event const& event, device::EventSetup const& setup) { + auto& queue = event.queue(); + + // get device collections from event + auto const& ebDigisDev = event.get(digisTokenEB_); + auto const& eeDigisDev = event.get(digisTokenEE_); + + // copy the actual numbers of digis in the collections to host + auto ebDigisSizeDevConstView = + cms::alpakatools::make_device_view(alpaka::getDev(queue), ebDigisDev.const_view().size()); + auto eeDigisSizeDevConstView = + cms::alpakatools::make_device_view(alpaka::getDev(queue), eeDigisDev.const_view().size()); + alpaka::memcpy(queue, ebDigisSizeHostBuf_, ebDigisSizeDevConstView); + alpaka::memcpy(queue, eeDigisSizeHostBuf_, eeDigisSizeDevConstView); + } + + void EcalUncalibRecHitProducerPortable::produce(device::Event& event, device::EventSetup const& setup) { + auto& queue = event.queue(); + + // get device collections from event + auto const& ebDigisDev = event.get(digisTokenEB_); + auto const& eeDigisDev = event.get(digisTokenEE_); + + // get the actual numbers of digis in the collections + auto const ebDigisSize = static_cast(*ebDigisSizeHostBuf_.data()); + auto const eeDigisSize = static_cast(*eeDigisSizeHostBuf_.data()); + + // output device collections + OutputProduct uncalibRecHitsDevEB{ebDigisSize, queue}; + OutputProduct uncalibRecHitsDevEE{eeDigisSize, queue}; + // reset the size scalar of the SoA + // memset takes an alpaka view that is created from the scalar in a view to the portable device collection + auto uncalibRecHitSizeViewEB = + cms::alpakatools::make_device_view(alpaka::getDev(queue), uncalibRecHitsDevEB.view().size()); + auto uncalibRecHitSizeViewEE = + cms::alpakatools::make_device_view(alpaka::getDev(queue), uncalibRecHitsDevEE.view().size()); + alpaka::memset(queue, uncalibRecHitSizeViewEB, 0); + alpaka::memset(queue, uncalibRecHitSizeViewEE, 0); + + // stop here if there are no digis + if (ebDigisSize + eeDigisSize > 0) { + // conditions + auto const& multifitConditionsDev = setup.getData(multifitConditionsToken_); + auto const& multifitParametersDev = setup.getData(multifitParametersToken_); + + // + // schedule algorithms + // + ecal::multifit::launchKernels(queue, + ebDigisDev, + eeDigisDev, + uncalibRecHitsDevEB, + uncalibRecHitsDevEE, + multifitConditionsDev, + multifitParametersDev, + configParameters_); + } + + // put into the event + event.emplace(uncalibRecHitsTokenEB_, std::move(uncalibRecHitsDevEB)); + event.emplace(uncalibRecHitsTokenEE_, std::move(uncalibRecHitsDevEE)); + } + +} // namespace ALPAKA_ACCELERATOR_NAMESPACE + +#include "HeterogeneousCore/AlpakaCore/interface/alpaka/MakerMacros.h" +DEFINE_FWK_ALPAKA_MODULE(EcalUncalibRecHitProducerPortable); diff --git a/RecoLocalCalo/EcalRecProducers/plugins/alpaka/KernelHelpers.dev.cc b/RecoLocalCalo/EcalRecProducers/plugins/alpaka/KernelHelpers.dev.cc new file mode 100644 index 0000000000000..906b96fa2b6b6 --- /dev/null +++ b/RecoLocalCalo/EcalRecProducers/plugins/alpaka/KernelHelpers.dev.cc @@ -0,0 +1,275 @@ +#include "DataFormats/EcalDetId/interface/EBDetId.h" +#include "DataFormats/EcalDetId/interface/EEDetId.h" + +#include "KernelHelpers.h" + +namespace ALPAKA_ACCELERATOR_NAMESPACE::ecal::reconstruction { + + namespace internal::barrel { + + ALPAKA_FN_ACC ALPAKA_FN_INLINE bool positiveZ(uint32_t id) { return id & 0x10000; } + + ALPAKA_FN_ACC ALPAKA_FN_INLINE uint32_t ietaAbs(uint32_t id) { return (id >> 9) & 0x7F; } + + ALPAKA_FN_ACC ALPAKA_FN_INLINE uint32_t iphi(uint32_t id) { return id & 0x1FF; } + + ALPAKA_FN_ACC int dccFromSm(int ism) { + int idcc = 9 + ism; + if (ism > 18) + idcc -= 18; + else + idcc += 18; + return idcc; + } + + ALPAKA_FN_ACC int sm(int ieta, int iphi) { + if (iphi > 360) + iphi -= 360; + int ism = (iphi - 1) / 20 + 1; + if (ieta < 0) + ism += 18; + return ism; + } + + ALPAKA_FN_ACC int dcc(int ieta, int iphi) { + int const ism = sm(ieta, iphi); + return dccFromSm(ism); + } + + ALPAKA_FN_ACC int lm_channel(int iX, int iY) { + static const int idx_[] = { + // clang-format off + // 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 + 1, 2, 2, 2, 2, 4, 4, 4, 4, 6, 6, 6, 6, 8, 8, 8, 8, // 3 + 1, 2, 2, 2, 2, 4, 4, 4, 4, 6, 6, 6, 6, 8, 8, 8, 8, // 2 + 1, 3, 3, 3, 3, 5, 5, 5, 5, 7, 7, 7, 7, 9, 9, 9, 9, // 1 + 1, 3, 3, 3, 3, 5, 5, 5, 5, 7, 7, 7, 7, 9, 9, 9, 9 // 0 + // 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 + // clang-format on + }; + + constexpr int iym = 4; + constexpr int ixm = 17; + int const il = iym - iY + 1; + int const ic = iX; + int const ii = il * ixm + ic; + if (ii < 0 || ii > (int)(sizeof(idx_) / sizeof(int))) { + return -1; + }; + return idx_[ii]; + } + + ALPAKA_FN_ACC int localCoord_x(int ieta) { + int iz = 1; + if (ieta < 0) { + iz = -1; + } + ieta *= iz; + int ix = ieta - 1; + + return ix; + } + + ALPAKA_FN_ACC int localCoord_y(int ieta, int iphi) { + if (iphi > 360) { + iphi -= 360; + } + int iy = (iphi - 1) % 20; + if (ieta < 0) { + iy = 19 - iy; + } + + return iy; + } + + ALPAKA_FN_ACC int lmmod(int ieta, int iphi) { + int const ix = localCoord_x(ieta); + int const iy = localCoord_y(ieta, iphi); + + return lm_channel(ix / 5, iy / 5); + } + + ALPAKA_FN_ACC int side(int ieta, int iphi) { + int const ilmmod = lmmod(ieta, iphi); + return (ilmmod % 2 == 0) ? 1 : 0; + } + + } // namespace internal::barrel + + ALPAKA_FN_ACC uint32_t hashedIndexEB(uint32_t id) { + using namespace internal::barrel; + return (EBDetId::MAX_IETA + (positiveZ(id) ? ietaAbs(id) - 1 : -ietaAbs(id))) * EBDetId::MAX_IPHI + iphi(id) - 1; + } + + // + // https://cmssdt.cern.ch/lxr/source/CalibCalorimetry/EcalLaserAnalyzer/src/MEEBGeom.cc + // function: "lmr" + + ALPAKA_FN_ACC int32_t laserMonitoringRegionEB(uint32_t id) { + using namespace internal::barrel; + + int ieta; + if (positiveZ(id)) { + ieta = ietaAbs(id); + } else { + ieta = -ietaAbs(id); + } + + int const idcc = dcc(ieta, (int)(iphi(id))); + int const ism = idcc - 9; + + int const iside = side(ieta, (int)(iphi(id))); + + return (1 + 2 * (ism - 1) + iside); + } + + namespace internal::endcap { + + ALPAKA_FN_ACC ALPAKA_FN_INLINE uint32_t ix(uint32_t id) { return (id >> 7) & 0x7F; } + + ALPAKA_FN_ACC ALPAKA_FN_INLINE uint32_t iy(uint32_t id) { return id & 0x7F; } + + ALPAKA_FN_ACC ALPAKA_FN_INLINE bool positiveZ(uint32_t id) { return id & 0x4000; } + + // these constants come from EE Det Id + ALPAKA_STATIC_ACC_MEM_CONSTANT const unsigned short kxf[] = { + 41, 51, 41, 51, 41, 51, 36, 51, 36, 51, 26, 51, 26, 51, 26, 51, 21, 51, 21, 51, 21, 51, 21, 51, 21, + 51, 16, 51, 16, 51, 14, 51, 14, 51, 14, 51, 14, 51, 14, 51, 9, 51, 9, 51, 9, 51, 9, 51, 9, 51, + 6, 51, 6, 51, 6, 51, 6, 51, 6, 51, 6, 51, 6, 51, 6, 51, 6, 51, 6, 51, 4, 51, 4, 51, 4, + 51, 4, 51, 4, 56, 1, 58, 1, 59, 1, 60, 1, 61, 1, 61, 1, 62, 1, 62, 1, 62, 1, 62, 1, 62, + 1, 62, 1, 62, 1, 62, 1, 62, 1, 62, 1, 61, 1, 61, 1, 60, 1, 59, 1, 58, 4, 56, 4, 51, 4, + 51, 4, 51, 4, 51, 6, 51, 6, 51, 6, 51, 6, 51, 6, 51, 6, 51, 6, 51, 6, 51, 6, 51, 6, 51, + 9, 51, 9, 51, 9, 51, 9, 51, 9, 51, 14, 51, 14, 51, 14, 51, 14, 51, 14, 51, 16, 51, 16, 51, 21, + 51, 21, 51, 21, 51, 21, 51, 21, 51, 26, 51, 26, 51, 26, 51, 36, 51, 36, 51, 41, 51, 41, 51, 41, 51}; + + ALPAKA_STATIC_ACC_MEM_CONSTANT const unsigned short kdi[] = { + 0, 10, 20, 30, 40, 50, 60, 75, 90, 105, 120, 145, 170, 195, 220, 245, 270, + 300, 330, 360, 390, 420, 450, 480, 510, 540, 570, 605, 640, 675, 710, 747, 784, 821, + 858, 895, 932, 969, 1006, 1043, 1080, 1122, 1164, 1206, 1248, 1290, 1332, 1374, 1416, 1458, 1500, + 1545, 1590, 1635, 1680, 1725, 1770, 1815, 1860, 1905, 1950, 1995, 2040, 2085, 2130, 2175, 2220, 2265, + 2310, 2355, 2400, 2447, 2494, 2541, 2588, 2635, 2682, 2729, 2776, 2818, 2860, 2903, 2946, 2988, 3030, + 3071, 3112, 3152, 3192, 3232, 3272, 3311, 3350, 3389, 3428, 3467, 3506, 3545, 3584, 3623, 3662, 3701, + 3740, 3779, 3818, 3857, 3896, 3935, 3974, 4013, 4052, 4092, 4132, 4172, 4212, 4253, 4294, 4336, 4378, + 4421, 4464, 4506, 4548, 4595, 4642, 4689, 4736, 4783, 4830, 4877, 4924, 4969, 5014, 5059, 5104, 5149, + 5194, 5239, 5284, 5329, 5374, 5419, 5464, 5509, 5554, 5599, 5644, 5689, 5734, 5779, 5824, 5866, 5908, + 5950, 5992, 6034, 6076, 6118, 6160, 6202, 6244, 6281, 6318, 6355, 6392, 6429, 6466, 6503, 6540, 6577, + 6614, 6649, 6684, 6719, 6754, 6784, 6814, 6844, 6874, 6904, 6934, 6964, 6994, 7024, 7054, 7079, 7104, + 7129, 7154, 7179, 7204, 7219, 7234, 7249, 7264, 7274, 7284, 7294, 7304, 7314}; + + ALPAKA_FN_ACC int quadrant(int iX, int iY) { + bool const near = iX >= 11; + bool const far = !near; + bool const top = iY >= 11; + bool const bot = !top; + + int iquad = 0; + if (near && top) + iquad = 1; + else if (far && top) + iquad = 2; + else if (far && bot) + iquad = 3; + else + iquad = 4; + + return iquad; + } + + ALPAKA_FN_ACC int sector(int iX, int iY) { + // Y (towards the surface) + // T + // | + // | + // | + // o---------| X (towards center of LHC) + // + static const int idx_[] = { + // clang-format off + // 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 + 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 9, 9, 9, 0, 0, 0, 0, 0, 0, 0, // 20 + 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 9, 9, 9, 9, 9, 9, 0, 0, 0, 0, // 19 + 0, 0, 0, 2, 1, 1, 1, 1, 1, 1, 9, 9, 9, 9, 9, 9, 8, 0, 0, 0, // 18 + 0, 0, 2, 2, 2, 1, 1, 1, 1, 1, 9, 9, 9, 9, 9, 8, 8, 8, 0, 0, // 17 + 0, 2, 2, 2, 2, 1, 1, 1, 1, 1, 9, 9, 9, 9, 9, 8, 8, 8, 8, 0, // 16 + 0, 2, 2, 2, 2, 2, 1, 1, 1, 1, 9, 9, 9, 9, 8, 8, 8, 8, 8, 0, // 15 + 0, 2, 2, 2, 2, 2, 2, 1, 1, 1, 9, 9, 9, 8, 8, 8, 8, 8, 8, 0, // 14 + 2, 2, 2, 2, 2, 2, 2, 2, 1, 1, 9, 9, 8, 8, 8, 8, 8, 8, 8, 8, // 13 + 3, 3, 2, 2, 2, 2, 2, 2, 2, 0, 0, 8, 8, 8, 8, 8, 8, 8, 7, 7, // 12 + 3, 3, 3, 3, 3, 3, 3, 2, 0, 0, 0, 0, 8, 7, 7, 7, 7, 7, 7, 7, // 11 + 3, 3, 3, 3, 3, 3, 3, 3, 0, 0, 0, 0, 7, 7, 7, 7, 7, 7, 7, 7, // 10 + 3, 3, 3, 3, 3, 3, 3, 4, 4, 0, 0, 6, 6, 7, 7, 7, 7, 7, 7, 7, // 9 + 3, 3, 3, 3, 3, 3, 4, 4, 4, 5, 5, 6, 6, 6, 7, 7, 7, 7, 7, 7, // 8 + 0, 3, 3, 3, 4, 4, 4, 4, 4, 5, 5, 6, 6, 6, 6, 6, 7, 7, 7, 0, // 7 + 0, 3, 4, 4, 4, 4, 4, 4, 5, 5, 5, 5, 6, 6, 6, 6, 6, 6, 7, 0, // 6 + 0, 4, 4, 4, 4, 4, 4, 4, 5, 5, 5, 5, 6, 6, 6, 6, 6, 6, 6, 0, // 5 + 0, 0, 4, 4, 4, 4, 4, 4, 5, 5, 5, 5, 6, 6, 6, 6, 6, 6, 0, 0, // 4 + 0, 0, 0, 4, 4, 4, 4, 5, 5, 5, 5, 5, 5, 6, 6, 6, 6, 0, 0, 0, // 3 + 0, 0, 0, 0, 4, 4, 4, 5, 5, 5, 5, 5, 5, 6, 6, 6, 0, 0, 0, 0, // 2 + 0, 0, 0, 0, 0, 0, 0, 5, 5, 5, 5, 5, 5, 0, 0, 0, 0, 0, 0, 0 // 1 + // 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 + // clang-format on + }; + + constexpr int iym = 20; + constexpr int ixm = 20; + int const il = iym - iY; + int const ic = iX - 1; + int const ii = il * ixm + ic; + + if (ii < 0 || ii > (int)(sizeof(idx_) / sizeof(int)) || idx_[ii] == 0) { + return -1; + }; + return idx_[ii]; + } + + } // namespace internal::endcap + + ALPAKA_FN_ACC uint32_t hashedIndexEE(uint32_t id) { + using namespace internal::endcap; + + const uint32_t jx(ix(id)); + const uint32_t jd(2 * (iy(id) - 1) + (jx - 1) / 50); + return ((positiveZ(id) ? EEDetId::kEEhalf : 0) + kdi[jd] + jx - kxf[jd]); + } + + // + // https://cmssdt.cern.ch/lxr/source/CalibCalorimetry/EcalLaserAnalyzer/src/MEEEGeom.cc + // https://github.com/cms-sw/cmssw/blob/master/CalibCalorimetry/EcalLaserCorrection/src/EcalLaserDbService.cc + // + + ALPAKA_FN_ACC int32_t laserMonitoringRegionEE(uint32_t id) { + using namespace internal::endcap; + + // SuperCrysCoord + uint32_t const iX = (ix(id) - 1) / 5 + 1; + uint32_t const iY = (iy(id) - 1) / 5 + 1; + + // Correct convention + // * @param iz iz/zside index: -1 for EE-, +1 for EE+ + // https://github.com/cms-sw/cmssw/blob/master/DataFormats/EcalDetId/interface/EEDetId.h#L68-L71 + // zside in https://github.com/cms-sw/cmssw/blob/master/CalibCalorimetry/EcalLaserCorrection/src/EcalLaserDbService.cc#L63 + // + int const iz = positiveZ(id) ? 1 : -1; + + int const iquad = quadrant(iX, iY); + int const isect = sector(iX, iY); + if (isect < 0) + return -1; + + int ilmr = 0; + ilmr = isect - 6; + if (ilmr <= 0) + ilmr += 9; + if (ilmr == 9) + ilmr++; + else if (ilmr == 8 && iquad == 4) + ilmr++; + if (iz == +1) + ilmr += 72; + else + ilmr += 82; + + return ilmr; + } + +} // namespace ALPAKA_ACCELERATOR_NAMESPACE::ecal::reconstruction diff --git a/RecoLocalCalo/EcalRecProducers/plugins/alpaka/KernelHelpers.h b/RecoLocalCalo/EcalRecProducers/plugins/alpaka/KernelHelpers.h new file mode 100644 index 0000000000000..3b1772ecf2981 --- /dev/null +++ b/RecoLocalCalo/EcalRecProducers/plugins/alpaka/KernelHelpers.h @@ -0,0 +1,19 @@ +#ifndef RecoLocalCalo_EcalRecProducers_plugins_alpaka_KernelHelpers_h +#define RecoLocalCalo_EcalRecProducers_plugins_alpaka_KernelHelpers_h + +#include +#include "HeterogeneousCore/AlpakaInterface/interface/config.h" + +namespace ALPAKA_ACCELERATOR_NAMESPACE::ecal::reconstruction { + + ALPAKA_FN_ACC uint32_t hashedIndexEB(uint32_t id); + + ALPAKA_FN_ACC uint32_t hashedIndexEE(uint32_t id); + + ALPAKA_FN_ACC int32_t laserMonitoringRegionEB(uint32_t id); + + ALPAKA_FN_ACC int32_t laserMonitoringRegionEE(uint32_t id); + +} // namespace ALPAKA_ACCELERATOR_NAMESPACE::ecal::reconstruction + +#endif // RecoLocalCalo_EcalRecProducers_plugins_alpaka_KernelHelpers_h diff --git a/RecoLocalCalo/EcalRecProducers/plugins/alpaka/TimeComputationKernels.h b/RecoLocalCalo/EcalRecProducers/plugins/alpaka/TimeComputationKernels.h new file mode 100644 index 0000000000000..667e4d4687e51 --- /dev/null +++ b/RecoLocalCalo/EcalRecProducers/plugins/alpaka/TimeComputationKernels.h @@ -0,0 +1,1162 @@ +#ifndef RecoLocalCalo_EcalRecProducers_plugins_alpaka_TimeComputationKernels_h +#define RecoLocalCalo_EcalRecProducers_plugins_alpaka_TimeComputationKernels_h + +#include +#include +#include + +#include "CondFormats/EcalObjects/interface/alpaka/EcalMultifitConditionsDevice.h" +#include "CondFormats/EcalObjects/interface/alpaka/EcalMultifitParametersDevice.h" +#include "DataFormats/EcalDigi/interface/EcalDataFrame.h" +#include "DataFormats/EcalDigi/interface/EcalMGPASample.h" +#include "DataFormats/EcalRecHit/interface/EcalUncalibratedRecHit.h" +#include "DataFormats/Math/interface/approx_exp.h" +#include "DataFormats/Math/interface/approx_log.h" +#include "FWCore/Utilities/interface/CMSUnrollLoop.h" +#include "RecoLocalCalo/EcalRecProducers/interface/EigenMatrixTypes_gpu.h" + +#include "DeclsForKernels.h" +#include "KernelHelpers.h" + +//#define ECAL_RECO_ALPAKA_DEBUG + +namespace ALPAKA_ACCELERATOR_NAMESPACE::ecal::multifit { + + ALPAKA_FN_ACC ALPAKA_FN_INLINE bool use_sample(unsigned int sample_mask, unsigned int sample) { + return sample_mask & (0x1 << (EcalDataFrame::MAXSAMPLES - (sample + 1))); + } + + ALPAKA_FN_ACC constexpr float fast_expf(float x) { return unsafe_expf<6>(x); } + ALPAKA_FN_ACC constexpr float fast_logf(float x) { return unsafe_logf<7>(x); } + + class Kernel_time_compute_nullhypot { + using ScalarType = ::ecal::multifit::SampleVector::Scalar; + + public: + template >> + ALPAKA_FN_ACC void operator()(TAcc const& acc, + ScalarType* const sample_values, + ScalarType* const sample_value_errors, + bool* const useless_sample_values, + ScalarType* chi2s, + ScalarType* sum0s, + ScalarType* sumAAs, + uint32_t const nchannels) const { + constexpr auto nsamples = EcalDataFrame::MAXSAMPLES; + + // indices + auto const elemsPerBlock = alpaka::getWorkDiv(acc)[0u]; + + // shared mem inits + auto* s_sum0 = alpaka::getDynSharedMem(acc); + auto* s_sum1 = reinterpret_cast(s_sum0 + elemsPerBlock); + auto* s_sumA = s_sum1 + elemsPerBlock; + auto* s_sumAA = s_sumA + elemsPerBlock; + + for (auto txforward : cms::alpakatools::elements_with_stride(acc, nchannels * nsamples)) { + // go backwards through the loop to have valid values for shared variables when reading from higher element indices in serial execution + auto tx = nchannels * nsamples - 1 - txforward; + auto const ch = tx / nsamples; + + auto const sample = tx % nsamples; + auto const ltx = tx % elemsPerBlock; + + // TODO make sure no div by 0 + auto const inv_error = + useless_sample_values[tx] ? 0. : 1. / (sample_value_errors[tx] * sample_value_errors[tx]); + auto const sample_value = sample_values[tx]; + s_sum0[ltx] = useless_sample_values[tx] ? 0 : 1; + s_sum1[ltx] = inv_error; + s_sumA[ltx] = sample_value * inv_error; + s_sumAA[ltx] = sample_value * sample_value * inv_error; + alpaka::syncBlockThreads(acc); + + // 5 threads for [0, 4] samples + if (sample < 5) { + s_sum0[ltx] += s_sum0[ltx + 5]; + s_sum1[ltx] += s_sum1[ltx + 5]; + s_sumA[ltx] += s_sumA[ltx + 5]; + s_sumAA[ltx] += s_sumAA[ltx + 5]; + } + alpaka::syncBlockThreads(acc); + + if (sample < 2) { + // note double counting of sample 3 + s_sum0[ltx] += s_sum0[ltx + 2] + s_sum0[ltx + 3]; + s_sum1[ltx] += s_sum1[ltx + 2] + s_sum1[ltx + 3]; + s_sumA[ltx] += s_sumA[ltx + 2] + s_sumA[ltx + 3]; + s_sumAA[ltx] += s_sumAA[ltx + 2] + s_sumAA[ltx + 3]; + } + alpaka::syncBlockThreads(acc); + + if (sample == 0) { + // note, subtract to remove the double counting of sample == 3 + auto const sum0 = s_sum0[ltx] + s_sum0[ltx + 1] - s_sum0[ltx + 3]; + auto const sum1 = s_sum1[ltx] + s_sum1[ltx + 1] - s_sum1[ltx + 3]; + auto const sumA = s_sumA[ltx] + s_sumA[ltx + 1] - s_sumA[ltx + 3]; + auto const sumAA = s_sumAA[ltx] + s_sumAA[ltx + 1] - s_sumAA[ltx + 3]; + auto const chi2 = sum0 > 0 ? (sumAA - sumA * sumA / sum1) / sum0 : static_cast(0); + chi2s[ch] = chi2; + sum0s[ch] = sum0; + sumAAs[ch] = sumAA; + +#ifdef DEBUG_TC_NULLHYPOT + if (ch == 0) { + printf("chi2 = %f sum0 = %d sumAA = %f\n", chi2, static_cast(sum0), sumAA); + } +#endif + } + } + } + }; + + // + // launch ctx parameters are + // 45 threads per channel, X channels per block, Y blocks + // 45 comes from: 10 samples for i <- 0 to 9 and for j <- i+1 to 9 + // TODO: it might be much beter to use 32 threads per channel instead of 45 + // to simplify the synchronization + class Kernel_time_compute_makeratio { + using ScalarType = ::ecal::multifit::SampleVector::Scalar; + + public: + template >> + ALPAKA_FN_ACC void operator()(TAcc const& acc, + EcalDigiDeviceCollection::ConstView digisDevEB, + EcalDigiDeviceCollection::ConstView digisDevEE, + ScalarType* const sample_values, + ScalarType* const sample_value_errors, + bool* const useless_sample_values, + char* const pedestal_nums, + ScalarType* const sumAAsNullHypot, + ScalarType* const sum0sNullHypot, + ScalarType* tMaxAlphaBetas, + ScalarType* tMaxErrorAlphaBetas, + ScalarType* g_accTimeMax, + ScalarType* g_accTimeWgt, + TimeComputationState* g_state, + EcalMultifitParametersDevice::ConstView paramsDev, + ConfigurationParameters::type const timeFitLimits_firstEB, + ConfigurationParameters::type const timeFitLimits_firstEE, + ConfigurationParameters::type const timeFitLimits_secondEB, + ConfigurationParameters::type const timeFitLimits_secondEE) const { + // constants + constexpr uint32_t nchannels_per_block = 10; + constexpr auto nthreads_per_channel = nchannels_per_block * (nchannels_per_block - 1) / 2; + constexpr auto nsamples = EcalDataFrame::MAXSAMPLES; + auto const nchannels = digisDevEB.size() + digisDevEE.size(); + auto const offsetForInputs = digisDevEB.size(); + auto const totalElements = nthreads_per_channel * nchannels; + + auto const elemsPerBlock = alpaka::getWorkDiv(acc)[0u]; + assert(nthreads_per_channel * nchannels_per_block == elemsPerBlock); + + auto* shr_chi2s = alpaka::getDynSharedMem(acc); + auto* shr_time_wgt = shr_chi2s + elemsPerBlock; + auto* shr_time_max = shr_time_wgt + elemsPerBlock; + auto* shrTimeMax = shr_time_max + elemsPerBlock; + auto* shrTimeWgt = shrTimeMax + elemsPerBlock; + auto* shr_chi2 = shrTimeWgt + elemsPerBlock; + auto* shr_tmax = shr_chi2 + elemsPerBlock; + auto* shr_tmaxerr = shr_tmax + elemsPerBlock; + auto* shr_condForUselessSamples = reinterpret_cast(shr_tmaxerr + elemsPerBlock); + auto* shr_internalCondForSkipping1 = shr_condForUselessSamples + elemsPerBlock; + auto* shr_internalCondForSkipping2 = shr_internalCondForSkipping1 + elemsPerBlock; + + for (auto block : cms::alpakatools::blocks_with_stride(acc, totalElements)) { + for (auto idx : cms::alpakatools::elements_in_block(acc, block, totalElements)) { + auto const ch = idx.global / nthreads_per_channel; + auto const ltx = idx.global % nthreads_per_channel; + + auto const ch_start = ch * nsamples; + auto const inputCh = ch >= offsetForInputs ? ch - offsetForInputs : ch; + auto const* dids = ch >= offsetForInputs ? digisDevEE.id() : digisDevEB.id(); + + auto const did = DetId{dids[inputCh]}; + auto const isBarrel = did.subdetId() == EcalBarrel; + auto* const amplitudeFitParameters = + isBarrel ? paramsDev.amplitudeFitParamsEB().data() : paramsDev.amplitudeFitParamsEE().data(); + auto* const timeFitParameters = + isBarrel ? paramsDev.timeFitParamsEB().data() : paramsDev.timeFitParamsEE().data(); + auto const timeFitParameters_size = + isBarrel ? paramsDev.timeFitParamsEB().size() : paramsDev.timeFitParamsEE().size(); + auto const timeFitLimits_first = isBarrel ? timeFitLimits_firstEB : timeFitLimits_firstEE; + auto const timeFitLimits_second = isBarrel ? timeFitLimits_secondEB : timeFitLimits_secondEE; + + // map tx -> (sample_i, sample_j) + int sample_i = 0; + int sample_j = 0; + if (ltx <= 8) { + sample_i = 0; + sample_j = 1 + ltx; + } else if (ltx <= 16) { + sample_i = 1; + sample_j = 2 + ltx - 9; + } else if (ltx <= 23) { + sample_i = 2; + sample_j = 3 + ltx - 17; + } else if (ltx <= 29) { + sample_i = 3; + sample_j = 4 + ltx - 24; + } else if (ltx <= 34) { + sample_i = 4; + sample_j = 5 + ltx - 30; + } else if (ltx <= 38) { + sample_i = 5; + sample_j = 6 + ltx - 35; + } else if (ltx <= 41) { + sample_i = 6; + sample_j = 7 + ltx - 39; + } else if (ltx <= 43) { + sample_i = 7; + sample_j = 8 + ltx - 42; + } else if (ltx <= 44) { + sample_i = 8; + sample_j = 9; + } else + assert(false); + + auto const tx_i = ch_start + sample_i; + auto const tx_j = ch_start + sample_j; + + // + // note, given the way we partition the block, with 45 threads per channel + // we will end up with inactive threads which need to be dragged along + // through the synching point + // + bool const condForUselessSamples = useless_sample_values[tx_i] || useless_sample_values[tx_j] || + sample_values[tx_i] <= 1 || sample_values[tx_j] <= 1; + + // + // see cpu implementation for explanation + // + ScalarType chi2 = std::numeric_limits::max(); + ScalarType tmax = 0; + ScalarType tmaxerr = 0; + shrTimeMax[idx.local] = 0; + shrTimeWgt[idx.local] = 0; + + bool internalCondForSkipping1 = true; + bool internalCondForSkipping2 = true; + if (!condForUselessSamples) { + auto const rtmp = sample_values[tx_i] / sample_values[tx_j]; + auto const invampl_i = 1. / sample_values[tx_i]; + auto const relErr2_i = sample_value_errors[tx_i] * sample_value_errors[tx_i] * invampl_i * invampl_i; + auto const invampl_j = 1. / sample_values[tx_j]; + auto const relErr2_j = sample_value_errors[tx_j] * sample_value_errors[tx_j] * invampl_j * invampl_j; + auto const err1 = rtmp * rtmp * (relErr2_i + relErr2_j); + auto err2 = + sample_value_errors[tx_j] * (sample_values[tx_i] - sample_values[tx_j]) * (invampl_j * invampl_j); + // TODO non-divergent branch for a block if each block has 1 channel + // otherwise non-divergent for groups of 45 threads + // at this point, pedestal_nums[ch] can be either 0, 1 or 2 + if (pedestal_nums[ch] == 2) + err2 *= err2 * 0.5; + auto const err3 = (0.289 * 0.289) * (invampl_j * invampl_j); + auto const total_error = std::sqrt(err1 + err2 + err3); + + auto const alpha = amplitudeFitParameters[0]; + auto const beta = amplitudeFitParameters[1]; + auto const alphabeta = alpha * beta; + auto const invalphabeta = 1. / alphabeta; + + // variables instead of a struct + auto const ratio_index = sample_i; + auto const ratio_step = sample_j - sample_i; + auto const ratio_value = rtmp; + auto const ratio_error = total_error; + + auto const rlim_i_j = fast_expf(static_cast(sample_j - sample_i) / beta) - 0.001; + internalCondForSkipping1 = !(total_error < 1. && rtmp > 0.001 && rtmp < rlim_i_j); + if (!internalCondForSkipping1) { + // + // precompute. + // in cpu version this was done conditionally + // however easier to do it here (precompute) and then just filter out + // if not needed + // + auto const l_timeFitLimits_first = timeFitLimits_first; + auto const l_timeFitLimits_second = timeFitLimits_second; + if (ratio_step == 1 && ratio_value >= l_timeFitLimits_first && ratio_value <= l_timeFitLimits_second) { + auto const time_max_i = static_cast(ratio_index); + auto u = timeFitParameters[timeFitParameters_size - 1]; + CMS_UNROLL_LOOP + for (int k = timeFitParameters_size - 2; k >= 0; --k) + u = u * ratio_value + timeFitParameters[k]; + + auto du = (timeFitParameters_size - 1) * (timeFitParameters[timeFitParameters_size - 1]); + for (int k = timeFitParameters_size - 2; k >= 1; --k) + du = du * ratio_value + k * timeFitParameters[k]; + + auto const error2 = ratio_error * ratio_error * du * du; + auto const time_max = error2 > 0 ? (time_max_i - u) / error2 : static_cast(0); + auto const time_wgt = error2 > 0 ? 1. / error2 : static_cast(0); + + // store into shared mem + // note, this name is essentially identical to the one used + // below. + shrTimeMax[idx.local] = error2 > 0 ? time_max : 0; + shrTimeWgt[idx.local] = error2 > 0 ? time_wgt : 0; + } else { + shrTimeMax[idx.local] = 0; + shrTimeWgt[idx.local] = 0; + } + + // continue with ratios + auto const stepOverBeta = static_cast(ratio_step) / beta; + auto const offset = static_cast(ratio_index) + alphabeta; + auto const rmin = std::max(ratio_value - ratio_error, 0.001); + auto const rmax = + std::min(ratio_value + ratio_error, fast_expf(static_cast(ratio_step) / beta) - 0.001); + auto const time1 = offset - ratio_step / (fast_expf((stepOverBeta - fast_logf(rmin)) / alpha) - 1.); + auto const time2 = offset - ratio_step / (fast_expf((stepOverBeta - fast_logf(rmax)) / alpha) - 1.); + + // set these guys + tmax = 0.5 * (time1 + time2); + tmaxerr = 0.5 * std::sqrt((time1 - time2) * (time1 - time2)); +#ifdef DEBUG_TC_MAKERATIO + if (ch == 1 || ch == 0) + printf( + "ch = %d ltx = %d tmax = %f tmaxerr = %f time1 = %f time2 = %f offset = %f rmin = %f rmax = " + "%f\n", + ch, + ltx, + tmax, + tmaxerr, + time1, + time2, + offset, + rmin, + rmax); +#endif + + ScalarType sumAf = 0; + ScalarType sumff = 0; + const int itmin = std::max(-1, static_cast(std::floor(tmax - alphabeta))); + auto loffset = (static_cast(itmin) - tmax) * invalphabeta; + // TODO: data dependence + for (int it = itmin + 1; it < nsamples; ++it) { + loffset += invalphabeta; + if (useless_sample_values[ch_start + it]) + continue; + auto const inverr2 = 1. / (sample_value_errors[ch_start + it] * sample_value_errors[ch_start + it]); + auto const term1 = 1. + loffset; + auto const f = (term1 > 1e-6) ? fast_expf(alpha * (fast_logf(term1) - loffset)) : 0; + sumAf += sample_values[ch_start + it] * (f * inverr2); + sumff += f * (f * inverr2); + } + + auto const sumAA = sumAAsNullHypot[ch]; + auto const sum0 = sum0sNullHypot[ch]; + chi2 = sumAA; + // TODO: sum0 can not be 0 below, need to introduce the check upfront + if (sumff > 0) { + chi2 = sumAA - sumAf * (sumAf / sumff); + } + chi2 /= sum0; + +#ifdef DEBUG_TC_MAKERATIO + if (ch == 1 || ch == 0) + printf( + "ch = %d ltx = %d sumAf = %f sumff = %f sumAA = %f sum0 = %d tmax = %f tmaxerr = %f chi2 = " + "%f\n", + ch, + ltx, + sumAf, + sumff, + sumAA, + static_cast(sum0), + tmax, + tmaxerr, + chi2); +#endif + + if (chi2 > 0 && tmax > 0 && tmaxerr > 0) + internalCondForSkipping2 = false; + else + chi2 = std::numeric_limits::max(); + } + } + + // store into smem + shr_chi2s[idx.local] = chi2; + shr_chi2[idx.local] = chi2; + shr_tmax[idx.local] = tmax; + shr_tmaxerr[idx.local] = tmaxerr; + shr_condForUselessSamples[idx.local] = condForUselessSamples; + shr_internalCondForSkipping1[idx.local] = internalCondForSkipping1; + shr_internalCondForSkipping2[idx.local] = internalCondForSkipping2; + } + + alpaka::syncBlockThreads(acc); + + // find min chi2 - quite crude for now + // TODO validate/check + auto iter = nthreads_per_channel / 2 + nthreads_per_channel % 2; + bool oddElements = nthreads_per_channel % 2; + CMS_UNROLL_LOOP + while (iter >= 1) { + for (auto idx : cms::alpakatools::elements_in_block(acc, block, totalElements)) { + auto const ltx = idx.global % nthreads_per_channel; + + if (ltx < iter && !(oddElements && (ltx == iter - 1 && ltx > 0))) { + // for odd ns, the last guy will just store itself + // exception is for ltx == 0 and iter==1 + shr_chi2s[idx.local] = std::min(shr_chi2s[idx.local], shr_chi2s[idx.local + iter]); + } + } + alpaka::syncBlockThreads(acc); + + oddElements = iter % 2; + iter = iter == 1 ? iter / 2 : iter / 2 + iter % 2; + } + + for (auto idx : cms::alpakatools::elements_in_block(acc, block, totalElements)) { + auto const ltx = idx.global % nthreads_per_channel; + + // get precomputedflags for this element from shared memory + auto const condForUselessSamples = shr_condForUselessSamples[idx.local]; + auto const internalCondForSkipping1 = shr_internalCondForSkipping1[idx.local]; + auto const internalCondForSkipping2 = shr_internalCondForSkipping2[idx.local]; + // filter out inactive or useless samples threads + if (!condForUselessSamples && !internalCondForSkipping1 && !internalCondForSkipping2) { + // min chi2, now compute weighted average of tmax measurements + // see cpu version for more explanation + auto const chi2 = shr_chi2[idx.local]; + auto const chi2min = shr_chi2s[idx.local - ltx]; + auto const chi2Limit = chi2min + 1.; + auto const tmaxerr = shr_tmaxerr[idx.local]; + auto const inverseSigmaSquared = chi2 < chi2Limit ? 1. / (tmaxerr * tmaxerr) : 0.; + +#ifdef DEBUG_TC_MAKERATIO + if (ch == 1 || ch == 0) { + auto const ch = idx.global / nthreads_per_channel; + printf("ch = %d ltx = %d chi2min = %f chi2Limit = %f inverseSigmaSquared = %f\n", + ch, + ltx, + chi2min, + chi2Limit, + inverseSigmaSquared); + } +#endif + + // store into shared mem and run reduction + // TODO: check if cooperative groups would be better + // TODO: check if shuffling intrinsics are better + auto const tmax = shr_tmax[idx.local]; + shr_time_wgt[idx.local] = inverseSigmaSquared; + shr_time_max[idx.local] = tmax * inverseSigmaSquared; + } else { + shr_time_wgt[idx.local] = 0; + shr_time_max[idx.local] = 0; + } + } + + alpaka::syncBlockThreads(acc); + + // reduce to compute time_max and time_wgt + iter = nthreads_per_channel / 2 + nthreads_per_channel % 2; + oddElements = nthreads_per_channel % 2; + CMS_UNROLL_LOOP + while (iter >= 1) { + for (auto idx : cms::alpakatools::elements_in_block(acc, block, totalElements)) { + auto const ltx = idx.global % nthreads_per_channel; + + if (ltx < iter && !(oddElements && (ltx == iter - 1 && ltx > 0))) { + shr_time_wgt[idx.local] += shr_time_wgt[idx.local + iter]; + shr_time_max[idx.local] += shr_time_max[idx.local + iter]; + shrTimeMax[idx.local] += shrTimeMax[idx.local + iter]; + shrTimeWgt[idx.local] += shrTimeWgt[idx.local + iter]; + } + } + + alpaka::syncBlockThreads(acc); + oddElements = iter % 2; + iter = iter == 1 ? iter / 2 : iter / 2 + iter % 2; + } + + for (auto idx : cms::alpakatools::elements_in_block(acc, block, totalElements)) { + auto const ltx = idx.global % nthreads_per_channel; + + // load from shared memory the 0th guy (will contain accumulated values) + // compute + // store into global mem + if (ltx == 0) { + auto const ch = idx.global / nthreads_per_channel; + auto const tmp_time_max = shr_time_max[idx.local]; + auto const tmp_time_wgt = shr_time_wgt[idx.local]; + + // we are done if there number of time ratios is 0 + if (tmp_time_wgt == 0 && tmp_time_max == 0) { + g_state[ch] = TimeComputationState::Finished; + continue; + } + + // no div by 0 + auto const tMaxAlphaBeta = tmp_time_max / tmp_time_wgt; + auto const tMaxErrorAlphaBeta = 1. / std::sqrt(tmp_time_wgt); + + tMaxAlphaBetas[ch] = tMaxAlphaBeta; + tMaxErrorAlphaBetas[ch] = tMaxErrorAlphaBeta; + g_accTimeMax[ch] = shrTimeMax[idx.local]; + g_accTimeWgt[ch] = shrTimeWgt[idx.local]; + g_state[ch] = TimeComputationState::NotFinished; + +#ifdef DEBUG_TC_MAKERATIO + printf("ch = %d time_max = %f time_wgt = %f\n", ch, tmp_time_max, tmp_time_wgt); + printf("ch = %d tMaxAlphaBeta = %f tMaxErrorAlphaBeta = %f timeMax = %f timeWgt = %f\n", + ch, + tMaxAlphaBeta, + tMaxErrorAlphaBeta, + shrTimeMax[idx.local], + shrTimeWgt[idx.local]); +#endif + } + } + } + } + }; + + class Kernel_time_compute_findamplchi2_and_finish { + using ScalarType = ::ecal::multifit::SampleVector::Scalar; + + public: + template >> + ALPAKA_FN_ACC void operator()(TAcc const& acc, + EcalDigiDeviceCollection::ConstView digisDevEB, + EcalDigiDeviceCollection::ConstView digisDevEE, + ScalarType* const sample_values, + ScalarType* const sample_value_errors, + bool* const useless_samples, + ScalarType* const g_tMaxAlphaBeta, + ScalarType* const g_tMaxErrorAlphaBeta, + ScalarType* const g_accTimeMax, + ScalarType* const g_accTimeWgt, + ScalarType* const sumAAsNullHypot, + ScalarType* const sum0sNullHypot, + ScalarType* const chi2sNullHypot, + TimeComputationState* g_state, + ScalarType* g_ampMaxAlphaBeta, + ScalarType* g_ampMaxError, + ScalarType* g_timeMax, + ScalarType* g_timeError, + EcalMultifitParametersDevice::ConstView paramsDev) const { + /// launch ctx parameters are + /// 10 threads per channel, N channels per block, Y blocks + /// TODO: do we need to keep the state around or can be removed?! + //#define DEBUG_FINDAMPLCHI2_AND_FINISH + + // constants + constexpr auto nsamples = EcalDataFrame::MAXSAMPLES; + auto const nchannels = digisDevEB.size() + digisDevEE.size(); + auto const offsetForInputs = digisDevEB.size(); + + auto const elemsPerBlock = alpaka::getWorkDiv(acc)[0u]; + + // configure shared mem + // per block, we need #threads per block * 2 * sizeof(ScalarType) + // we run with N channels per block + auto* shr_sumAf = alpaka::getDynSharedMem(acc); + auto* shr_sumff = shr_sumAf + elemsPerBlock; + + for (auto gtxforward : cms::alpakatools::elements_with_stride(acc, nchannels * nsamples)) { + // go backwards through the loop to have valid values for shared variables when reading from higher element indices in serial execution + auto gtx = nchannels * nsamples - 1 - gtxforward; + auto const ch = gtx / nsamples; + auto const elemIdx = gtx % elemsPerBlock; + auto const sample = elemIdx % nsamples; + + auto const* dids = ch >= offsetForInputs ? digisDevEE.id() : digisDevEB.id(); + auto const inputCh = ch >= offsetForInputs ? ch - offsetForInputs : ch; + + auto state = g_state[ch]; + auto const did = DetId{dids[inputCh]}; + auto* const amplitudeFitParameters = did.subdetId() == EcalBarrel ? paramsDev.amplitudeFitParamsEB().data() + : paramsDev.amplitudeFitParamsEE().data(); + + // TODO is that better than storing into global and launching another kernel + // for the first 10 threads + if (state == TimeComputationState::NotFinished) { + auto const alpha = amplitudeFitParameters[0]; + auto const beta = amplitudeFitParameters[1]; + auto const alphabeta = alpha * beta; + auto const invalphabeta = 1. / alphabeta; + auto const tMaxAlphaBeta = g_tMaxAlphaBeta[ch]; + auto const sample_value = sample_values[gtx]; + auto const sample_value_error = sample_value_errors[gtx]; + auto const inverr2 = + useless_samples[gtx] ? static_cast(0) : 1. / (sample_value_error * sample_value_error); + auto const offset = (static_cast(sample) - tMaxAlphaBeta) * invalphabeta; + auto const term1 = 1. + offset; + auto const f = term1 > 1e-6 ? fast_expf(alpha * (fast_logf(term1) - offset)) : static_cast(0.); + auto const sumAf = sample_value * (f * inverr2); + auto const sumff = f * (f * inverr2); + + // store into shared mem + shr_sumAf[elemIdx] = sumAf; + shr_sumff[elemIdx] = sumff; + } else { + shr_sumAf[elemIdx] = 0; + shr_sumff[elemIdx] = 0; + } + + alpaka::syncBlockThreads(acc); + + // reduce + // unroll completely here (but hardcoded) + if (sample < 5) { + shr_sumAf[elemIdx] += shr_sumAf[elemIdx + 5]; + shr_sumff[elemIdx] += shr_sumff[elemIdx + 5]; + } + + alpaka::syncBlockThreads(acc); + + if (sample < 2) { + // will need to subtract for ltx = 3, we double count here + shr_sumAf[elemIdx] += shr_sumAf[elemIdx + 2] + shr_sumAf[elemIdx + 3]; + shr_sumff[elemIdx] += shr_sumff[elemIdx + 2] + shr_sumff[elemIdx + 3]; + } + + alpaka::syncBlockThreads(acc); + + if (sample == 0) { + // exit if the state is done + // note, we do not exit before all __synchtreads are finished + if (state == TimeComputationState::Finished) { + g_timeMax[ch] = 5; + g_timeError[ch] = -999; + continue; + } + + // subtract to avoid double counting + auto const sumff = shr_sumff[elemIdx] + shr_sumff[elemIdx + 1] - shr_sumff[elemIdx + 3]; + auto const sumAf = shr_sumAf[elemIdx] + shr_sumAf[elemIdx + 1] - shr_sumAf[elemIdx + 3]; + + auto const ampMaxAlphaBeta = sumff > 0 ? sumAf / sumff : 0; + auto const sumAA = sumAAsNullHypot[ch]; + auto const sum0 = sum0sNullHypot[ch]; + auto const nullChi2 = chi2sNullHypot[ch]; + if (sumff > 0) { + auto const chi2AlphaBeta = (sumAA - sumAf * sumAf / sumff) / sum0; + if (chi2AlphaBeta > nullChi2) { + // null hypothesis is better + state = TimeComputationState::Finished; +#ifdef DEBUG_FINDAMPLCHI2_AND_FINISH + printf("ch = %d chi2AlphaBeta = %f nullChi2 = %f sumAA = %f sumAf = %f sumff = %f sum0 = %f\n", + ch, + chi2AlphaBeta, + nullChi2, + sumAA, + sumAf, + sumff, + sum0); +#endif + } + + // store to global + g_ampMaxAlphaBeta[ch] = ampMaxAlphaBeta; + } else { +#ifdef DEBUG_FINDAMPLCHI2_AND_FINISH + printf("ch = %d sum0 = %f sumAA = %f sumff = %f sumAf = %f\n", ch, sum0, sumAA, sumff, sumAf); +#endif + state = TimeComputationState::Finished; + } + + // store the state to global and finish calcs + g_state[ch] = state; + if (state == TimeComputationState::Finished) { + // store default values into global + g_timeMax[ch] = 5; + g_timeError[ch] = -999; +#ifdef DEBUG_FINDAMPLCHI2_AND_FINISH + printf("ch = %d finished state\n", ch); +#endif + continue; + } + + auto const ampMaxError = g_ampMaxError[ch]; + auto const test_ratio = ampMaxAlphaBeta / ampMaxError; + auto const accTimeMax = g_accTimeMax[ch]; + auto const accTimeWgt = g_accTimeWgt[ch]; + auto const tMaxAlphaBeta = g_tMaxAlphaBeta[ch]; + auto const tMaxErrorAlphaBeta = g_tMaxErrorAlphaBeta[ch]; + // branch to separate large vs small pulses + // see cpu version for more info + if (test_ratio > 5. && accTimeWgt > 0) { + auto const tMaxRatio = accTimeWgt > 0 ? accTimeMax / accTimeWgt : static_cast(0); + auto const tMaxErrorRatio = accTimeWgt > 0 ? 1. / std::sqrt(accTimeWgt) : static_cast(0); + + if (test_ratio > 10.) { + g_timeMax[ch] = tMaxRatio; + g_timeError[ch] = tMaxErrorRatio; + +#ifdef DEBUG_FINDAMPLCHI2_AND_FINISH + printf("ch = %d tMaxRatio = %f tMaxErrorRatio = %f\n", ch, tMaxRatio, tMaxErrorRatio); +#endif + } else { + auto const timeMax = (tMaxAlphaBeta * (10. - ampMaxAlphaBeta / ampMaxError) + + tMaxRatio * (ampMaxAlphaBeta / ampMaxError - 5.)) / + 5.; + auto const timeError = (tMaxErrorAlphaBeta * (10. - ampMaxAlphaBeta / ampMaxError) + + tMaxErrorRatio * (ampMaxAlphaBeta / ampMaxError - 5.)) / + 5.; + state = TimeComputationState::Finished; + g_state[ch] = state; + g_timeMax[ch] = timeMax; + g_timeError[ch] = timeError; + +#ifdef DEBUG_FINDAMPLCHI2_AND_FINISH + printf("ch = %d timeMax = %f timeError = %f\n", ch, timeMax, timeError); +#endif + } + } else { + state = TimeComputationState::Finished; + g_state[ch] = state; + g_timeMax[ch] = tMaxAlphaBeta; + g_timeError[ch] = tMaxErrorAlphaBeta; + +#ifdef DEBUG_FINDAMPLCHI2_AND_FINISH + printf("ch = %d tMaxAlphaBeta = %f tMaxErrorAlphaBeta = %f\n", ch, tMaxAlphaBeta, tMaxErrorAlphaBeta); +#endif + } + } + } + } + }; + + class Kernel_time_compute_fixMGPAslew { + using ScalarType = ::ecal::multifit::SampleVector::Scalar; + + public: + template >> + ALPAKA_FN_ACC void operator()(TAcc const& acc, + EcalDigiDeviceCollection::ConstView digisDevEB, + EcalDigiDeviceCollection::ConstView digisDevEE, + EcalMultifitConditionsDevice::ConstView conditionsDev, + ScalarType* sample_values, + ScalarType* sample_value_errors, + bool* useless_sample_values) const { + // constants + constexpr auto nsamples = EcalDataFrame::MAXSAMPLES; + + auto const nchannelsEB = digisDevEB.size(); + auto const offsetForInputs = nchannelsEB; + + auto const elemsPerBlock = alpaka::getWorkDiv(acc)[0u]; + + for (auto gtx : cms::alpakatools::elements_with_stride(acc, nchannelsEB * nsamples)) { + auto const elemIdx = gtx % elemsPerBlock; + auto const sample = elemIdx % nsamples; + auto const ch = gtx / nsamples; + + // remove thread for sample 0, oversubscribing is easier than .... + if (sample == 0) + continue; + + if (!use_sample(conditionsDev.sampleMask_EB(), sample)) + continue; + + int const inputGtx = ch >= offsetForInputs ? gtx - offsetForInputs * nsamples : gtx; + auto const* digis = ch >= offsetForInputs ? digisDevEE.data()->data() : digisDevEB.data()->data(); + + auto const gainIdPrev = ecalMGPA::gainId(digis[inputGtx - 1]); + auto const gainIdNext = ecalMGPA::gainId(digis[inputGtx]); + if (gainIdPrev >= 1 && gainIdPrev <= 3 && gainIdNext >= 1 && gainIdNext <= 3 && gainIdPrev < gainIdNext) { + sample_values[gtx - 1] = 0; + sample_value_errors[gtx - 1] = 1e+9; + useless_sample_values[gtx - 1] = true; + } + } + } + }; + + //#define ECAL_RECO_ALPAKA_TC_INIT_DEBUG + class Kernel_time_computation_init { + using ScalarType = ::ecal::multifit::SampleVector::Scalar; + + public: + template >> + ALPAKA_FN_ACC void operator()(TAcc const& acc, + EcalDigiDeviceCollection::ConstView digisDevEB, + EcalDigiDeviceCollection::ConstView digisDevEE, + EcalMultifitConditionsDevice::ConstView conditionsDev, + ScalarType* sample_values, + ScalarType* sample_value_errors, + ScalarType* ampMaxError, + bool* useless_sample_values, + char* pedestal_nums) const { + // constants + constexpr auto nsamples = EcalDataFrame::MAXSAMPLES; + + // indices + auto const nchannelsEB = digisDevEB.size(); + auto const nchannels = nchannelsEB + digisDevEE.size(); + auto const offsetForInputs = nchannelsEB; + auto const offsetForHashes = conditionsDev.offsetEE(); + + auto const elemsPerBlock = alpaka::getWorkDiv(acc)[0u]; + + // configure shared mem + auto* shrSampleValues = alpaka::getDynSharedMem(acc); + auto* shrSampleValueErrors = shrSampleValues + elemsPerBlock; + + for (auto txforward : cms::alpakatools::elements_with_stride(acc, nchannels * nsamples)) { + // go backwards through the loop to have valid values for shared variables when reading from higher element indices in serial execution + auto tx = nchannels * nsamples - 1 - txforward; + auto const ch = tx / nsamples; + auto const elemIdx = tx % elemsPerBlock; + + int const inputTx = ch >= offsetForInputs ? tx - offsetForInputs * nsamples : tx; + int const inputCh = ch >= offsetForInputs ? ch - offsetForInputs : ch; + auto const* digis = ch >= offsetForInputs ? digisDevEE.data()->data() : digisDevEB.data()->data(); + auto const* dids = ch >= offsetForInputs ? digisDevEE.id() : digisDevEB.id(); + + // indices/inits + auto const sample = tx % nsamples; + auto const input_ch_start = inputCh * nsamples; + ScalarType pedestal = 0.; + int num = 0; + + // 0 and 1 sample values + auto const adc0 = ecalMGPA::adc(digis[input_ch_start]); + auto const gainId0 = ecalMGPA::gainId(digis[input_ch_start]); + auto const adc1 = ecalMGPA::adc(digis[input_ch_start + 1]); + auto const gainId1 = ecalMGPA::gainId(digis[input_ch_start + 1]); + auto const did = DetId{dids[inputCh]}; + auto const isBarrel = did.subdetId() == EcalBarrel; + auto const sample_mask = isBarrel ? conditionsDev.sampleMask_EB() : conditionsDev.sampleMask_EE(); + auto const hashedId = isBarrel ? ecal::reconstruction::hashedIndexEB(did.rawId()) + : offsetForHashes + ecal::reconstruction::hashedIndexEE(did.rawId()); + + // set pedestal + // TODO this branch is non-divergent for a group of 10 threads + if (gainId0 == 1 && use_sample(sample_mask, 0)) { + pedestal = static_cast(adc0); + num = 1; + + auto const diff = adc1 - adc0; + if (gainId1 == 1 && use_sample(sample_mask, 1) && + std::abs(diff) < 3 * conditionsDev.pedestals_rms_x12()[hashedId]) { + pedestal = (pedestal + static_cast(adc1)) / 2.; + num = 2; + } + } else { + pedestal = conditionsDev.pedestals_mean_x12()[ch]; + } + + // ped subtracted and gain-renormalized samples. + auto const gainId = ecalMGPA::gainId(digis[inputTx]); + auto const adc = ecalMGPA::adc(digis[inputTx]); + + bool bad = false; + ScalarType sample_value, sample_value_error; + // TODO divergent branch + // TODO: piece below is general both for amplitudes and timing + // potentially there is a way to reduce the amount of code... + if (!use_sample(sample_mask, sample)) { + bad = true; + sample_value = 0; + sample_value_error = 0; + } else if (gainId == 1) { + sample_value = static_cast(adc) - pedestal; + sample_value_error = conditionsDev.pedestals_rms_x12()[hashedId]; + } else if (gainId == 2) { + auto const mean_x6 = conditionsDev.pedestals_mean_x6()[hashedId]; + auto const rms_x6 = conditionsDev.pedestals_rms_x6()[hashedId]; + auto const gain12Over6 = conditionsDev.gain12Over6()[hashedId]; + sample_value = (static_cast(adc) - mean_x6) * gain12Over6; + sample_value_error = rms_x6 * gain12Over6; + } else if (gainId == 3) { + auto const mean_x1 = conditionsDev.pedestals_mean_x1()[hashedId]; + auto const rms_x1 = conditionsDev.pedestals_rms_x1()[hashedId]; + auto const gain12Over6 = conditionsDev.gain12Over6()[hashedId]; + auto const gain6Over1 = conditionsDev.gain6Over1()[hashedId]; + sample_value = (static_cast(adc) - mean_x1) * gain6Over1 * gain12Over6; + sample_value_error = rms_x1 * gain6Over1 * gain12Over6; + } else { + sample_value = 0; + sample_value_error = 0; + bad = true; + } + + // TODO: make sure we save things correctly when sample is useless + auto const useless_sample = (sample_value_error <= 0) | bad; + useless_sample_values[tx] = useless_sample; + sample_values[tx] = sample_value; + sample_value_errors[tx] = useless_sample ? 1e+9 : sample_value_error; + + // DEBUG +#ifdef ECAL_RECO_ALPAKA_TC_INIT_DEBUG + if (ch == 0) { + printf("sample = %d sample_value = %f sample_value_error = %f useless = %c\n", + sample, + sample_value, + sample_value_error, + useless_sample ? '1' : '0'); + } +#endif + + // store into the shared mem + shrSampleValues[elemIdx] = sample_value_error > 0 ? sample_value : std::numeric_limits::min(); + shrSampleValueErrors[elemIdx] = sample_value_error; + alpaka::syncBlockThreads(acc); + + // perform the reduction with min + if (sample < 5) { + // note, if equal -> we keep the value with lower sample as for cpu + shrSampleValueErrors[elemIdx] = shrSampleValues[elemIdx] < shrSampleValues[elemIdx + 5] + ? shrSampleValueErrors[elemIdx + 5] + : shrSampleValueErrors[elemIdx]; + shrSampleValues[elemIdx] = std::max(shrSampleValues[elemIdx], shrSampleValues[elemIdx + 5]); + } + alpaka::syncBlockThreads(acc); + + // a bit of an overkill, but easier than to compare across 3 values + if (sample < 3) { + shrSampleValueErrors[elemIdx] = shrSampleValues[elemIdx] < shrSampleValues[elemIdx + 3] + ? shrSampleValueErrors[elemIdx + 3] + : shrSampleValueErrors[elemIdx]; + shrSampleValues[elemIdx] = std::max(shrSampleValues[elemIdx], shrSampleValues[elemIdx + 3]); + } + alpaka::syncBlockThreads(acc); + + if (sample < 2) { + shrSampleValueErrors[elemIdx] = shrSampleValues[elemIdx] < shrSampleValues[elemIdx + 2] + ? shrSampleValueErrors[elemIdx + 2] + : shrSampleValueErrors[elemIdx]; + shrSampleValues[elemIdx] = std::max(shrSampleValues[elemIdx], shrSampleValues[elemIdx + 2]); + } + alpaka::syncBlockThreads(acc); + + if (sample == 0) { + // we only need the max error + auto const maxSampleValueError = shrSampleValues[elemIdx] < shrSampleValues[elemIdx + 1] + ? shrSampleValueErrors[elemIdx + 1] + : shrSampleValueErrors[elemIdx]; + + // # pedestal samples used + pedestal_nums[ch] = num; + // this is used downstream + ampMaxError[ch] = maxSampleValueError; + + // DEBUG +#ifdef ECAL_RECO_ALPAKA_TC_INIT_DEBUG + if (ch == 0) { + printf("pedestal_nums = %d ampMaxError = %f\n", num, maxSampleValueError); + } +#endif + } + } + } + }; + + /// + /// launch context parameters: 1 thread per channel + /// + //#define DEBUG_TIME_CORRECTION + class Kernel_time_correction_and_finalize { + using ScalarType = ::ecal::multifit::SampleVector::Scalar; + + public: + template >> + ALPAKA_FN_ACC void operator()(TAcc const& acc, + EcalDigiDeviceCollection::ConstView digisDevEB, + EcalDigiDeviceCollection::ConstView digisDevEE, + EcalUncalibratedRecHitDeviceCollection::View uncalibRecHitsEB, + EcalUncalibratedRecHitDeviceCollection::View uncalibRecHitsEE, + EcalMultifitConditionsDevice::ConstView conditionsDev, + ScalarType* const g_timeMax, + ScalarType* const g_timeError, + ConfigurationParameters::type const timeConstantTermEB, + ConfigurationParameters::type const timeConstantTermEE, + ConfigurationParameters::type const timeNconstEB, + ConfigurationParameters::type const timeNconstEE, + ConfigurationParameters::type const amplitudeThresholdEB, + ConfigurationParameters::type const amplitudeThresholdEE, + ConfigurationParameters::type const outOfTimeThreshG12pEB, + ConfigurationParameters::type const outOfTimeThreshG12pEE, + ConfigurationParameters::type const outOfTimeThreshG12mEB, + ConfigurationParameters::type const outOfTimeThreshG12mEE, + ConfigurationParameters::type const outOfTimeThreshG61pEB, + ConfigurationParameters::type const outOfTimeThreshG61pEE, + ConfigurationParameters::type const outOfTimeThreshG61mEB, + ConfigurationParameters::type const outOfTimeThreshG61mEE) const { + // constants + constexpr auto nsamples = EcalDataFrame::MAXSAMPLES; + auto const nchannelsEB = digisDevEB.size(); + auto const nchannels = nchannelsEB + digisDevEE.size(); + auto const offsetForInputs = nchannelsEB; + auto const offsetForHashes = conditionsDev.offsetEE(); + + for (auto gtx : cms::alpakatools::elements_with_stride(acc, nchannels)) { + const int inputGtx = gtx >= offsetForInputs ? gtx - offsetForInputs : gtx; + auto const* dids = gtx >= offsetForInputs ? digisDevEE.id() : digisDevEB.id(); + auto const* digis = gtx >= offsetForInputs ? digisDevEE.data()->data() : digisDevEB.data()->data(); + + auto* g_amplitude = gtx >= nchannelsEB ? uncalibRecHitsEE.amplitude() : uncalibRecHitsEB.amplitude(); + auto* g_jitter = gtx >= nchannelsEB ? uncalibRecHitsEE.jitter() : uncalibRecHitsEB.jitter(); + auto* g_jitterError = gtx >= nchannelsEB ? uncalibRecHitsEE.jitterError() : uncalibRecHitsEB.jitterError(); + auto* flags = gtx >= nchannelsEB ? uncalibRecHitsEE.flags() : uncalibRecHitsEB.flags(); + + auto const did = DetId{dids[inputGtx]}; + auto const isBarrel = did.subdetId() == EcalBarrel; + auto const hashedId = isBarrel ? ecal::reconstruction::hashedIndexEB(did.rawId()) + : offsetForHashes + ecal::reconstruction::hashedIndexEE(did.rawId()); + // need to access the underlying data directly here because the std::arrays have different size for EB and EE, which is not compatible with the ? operator + auto* const amplitudeBins = isBarrel ? conditionsDev.timeBiasCorrections_amplitude_EB().data() + : conditionsDev.timeBiasCorrections_amplitude_EE().data(); + auto* const shiftBins = isBarrel ? conditionsDev.timeBiasCorrections_shift_EB().data() + : conditionsDev.timeBiasCorrections_shift_EE().data(); + auto const amplitudeBinsSize = + isBarrel ? conditionsDev.timeBiasCorrectionSizeEB() : conditionsDev.timeBiasCorrectionSizeEE(); + auto const timeConstantTerm = isBarrel ? timeConstantTermEB : timeConstantTermEE; + auto const timeNconst = isBarrel ? timeNconstEB : timeNconstEE; + auto const offsetTimeValue = isBarrel ? conditionsDev.timeOffset_EB() : conditionsDev.timeOffset_EE(); + auto const amplitudeThreshold = isBarrel ? amplitudeThresholdEB : amplitudeThresholdEE; + auto const outOfTimeThreshG12p = isBarrel ? outOfTimeThreshG12pEB : outOfTimeThreshG12pEE; + auto const outOfTimeThreshG12m = isBarrel ? outOfTimeThreshG12mEB : outOfTimeThreshG12mEE; + auto const outOfTimeThreshG61p = isBarrel ? outOfTimeThreshG61pEB : outOfTimeThreshG61pEE; + auto const outOfTimeThreshG61m = isBarrel ? outOfTimeThreshG61mEB : outOfTimeThreshG61mEE; + + // load some + auto const amplitude = g_amplitude[inputGtx]; + auto const rms_x12 = conditionsDev.pedestals_rms_x12()[hashedId]; + auto const timeCalibConst = conditionsDev.timeCalibConstants()[hashedId]; + + int myBin = -1; + for (size_t bin = 0; bin < amplitudeBinsSize; ++bin) { + if (amplitude > amplitudeBins[bin]) + myBin = bin; + else + break; + } + + ScalarType correction = 0; + if (myBin == -1) { + correction = shiftBins[0]; + } else if (myBin == static_cast(amplitudeBinsSize) - 1) { + correction = shiftBins[myBin]; + } else { + correction = shiftBins[myBin + 1] - shiftBins[myBin]; + correction *= (amplitude - amplitudeBins[myBin]) / (amplitudeBins[myBin + 1] - amplitudeBins[myBin]); + correction += shiftBins[myBin]; + } + + // correction * 1./25. + correction = correction * 0.04; + auto const timeMax = g_timeMax[gtx]; + auto const timeError = g_timeError[gtx]; + auto const jitter = timeMax - 5 + correction; + auto const jitterError = + std::sqrt(timeError * timeError + timeConstantTerm * timeConstantTerm * 0.04 * 0.04); // 0.04 = 1./25. + +#ifdef DEBUG_TIME_CORRECTION + printf("ch = %d timeMax = %f timeError = %f jitter = %f correction = %f\n", + gtx, + timeMax, + timeError, + jitter, + correction); +#endif + + // store back to global + g_jitter[inputGtx] = jitter; + g_jitterError[inputGtx] = jitterError; + + // set the flag + // TODO: replace with something more efficient (if required), + // for now just to make it work + if (amplitude > amplitudeThreshold * rms_x12) { + auto threshP = outOfTimeThreshG12p; + auto threshM = outOfTimeThreshG12m; + if (amplitude > 3000.) { + for (int isample = 0; isample < nsamples; isample++) { + auto const gainid = ecalMGPA::gainId(digis[nsamples * inputGtx + isample]); + if (gainid != 1) { + threshP = outOfTimeThreshG61p; + threshM = outOfTimeThreshG61m; + break; + } + } + } + + auto const correctedTime = (timeMax - 5) * 25 + timeCalibConst + offsetTimeValue; + auto const nterm = timeNconst * rms_x12 / amplitude; + auto const sigmat = std::sqrt(nterm * nterm + timeConstantTerm * timeConstantTerm); + if (correctedTime > sigmat * threshP || correctedTime < -sigmat * threshM) + flags[inputGtx] |= 0x1 << EcalUncalibratedRecHit::kOutOfTime; + } + } + } + }; + +} // namespace ALPAKA_ACCELERATOR_NAMESPACE::ecal::multifit + +namespace alpaka::trait { + using namespace ALPAKA_ACCELERATOR_NAMESPACE::ecal::multifit; + + //! The trait for getting the size of the block shared dynamic memory for Kernel_time_compute_nullhypot. + template + struct BlockSharedMemDynSizeBytes { + //! \return The size of the shared memory allocated for a block. + template + ALPAKA_FN_HOST_ACC static auto getBlockSharedMemDynSizeBytes(Kernel_time_compute_nullhypot const&, + TVec const& threadsPerBlock, + TVec const& elemsPerThread, + TArgs const&...) -> std::size_t { + using ScalarType = ecal::multifit::SampleVector::Scalar; + + // return the amount of dynamic shared memory needed + std::size_t bytes = threadsPerBlock[0u] * elemsPerThread[0u] * 4 * sizeof(ScalarType); + return bytes; + } + }; + + //! The trait for getting the size of the block shared dynamic memory for Kernel_time_compute_makeratio. + template + struct BlockSharedMemDynSizeBytes { + template + ALPAKA_FN_HOST_ACC static auto getBlockSharedMemDynSizeBytes(Kernel_time_compute_makeratio const&, + TVec const& threadsPerBlock, + TVec const& elemsPerThread, + TArgs const&...) -> std::size_t { + using ScalarType = ecal::multifit::SampleVector::Scalar; + + std::size_t bytes = (8 * sizeof(ScalarType) + 3 * sizeof(bool)) * threadsPerBlock[0u] * elemsPerThread[0u]; + return bytes; + } + }; + + //! The trait for getting the size of the block shared dynamic memory for Kernel_time_compute_findamplchi2_and_finish. + template + struct BlockSharedMemDynSizeBytes { + template + ALPAKA_FN_HOST_ACC static auto getBlockSharedMemDynSizeBytes(Kernel_time_compute_findamplchi2_and_finish const&, + TVec const& threadsPerBlock, + TVec const& elemsPerThread, + TArgs const&...) -> std::size_t { + using ScalarType = ecal::multifit::SampleVector::Scalar; + + std::size_t bytes = 2 * threadsPerBlock[0u] * elemsPerThread[0u] * sizeof(ScalarType); + return bytes; + } + }; + + //! The trait for getting the size of the block shared dynamic memory for Kernel_time_computation_init. + template + struct BlockSharedMemDynSizeBytes { + template + ALPAKA_FN_HOST_ACC static auto getBlockSharedMemDynSizeBytes(Kernel_time_computation_init const&, + TVec const& threadsPerBlock, + TVec const& elemsPerThread, + TArgs const&...) -> std::size_t { + using ScalarType = ecal::multifit::SampleVector::Scalar; + + std::size_t bytes = 2 * threadsPerBlock[0u] * elemsPerThread[0u] * sizeof(ScalarType); + return bytes; + } + }; + +} // namespace alpaka::trait + +#endif // RecoLocalCalo_EcalRecProducers_plugins_TimeComputationKernels_h diff --git a/RecoLocalCalo/EcalRecProducers/python/ecalLocalCustom.py b/RecoLocalCalo/EcalRecProducers/python/ecalLocalCustom.py index 137c97ac7765a..12528d990a331 100644 --- a/RecoLocalCalo/EcalRecProducers/python/ecalLocalCustom.py +++ b/RecoLocalCalo/EcalRecProducers/python/ecalLocalCustom.py @@ -1,16 +1,16 @@ import FWCore.ParameterSet.Config as cms def configureEcalLocal25ns(process): - process.ecalMultiFitUncalibRecHit.cpu.activeBXs = [-5,-4,-3,-2,-1,0,1,2,3,4], - process.ecalMultiFitUncalibRecHit.cpu.useLumiInfoRunHeader = False + process.ecalMultiFitUncalibRecHitCPU.activeBXs = [-5,-4,-3,-2,-1,0,1,2,3,4], + process.ecalMultiFitUncalibRecHitCPU.useLumiInfoRunHeader = False return process def configureEcalLocal50ns(process): - process.ecalMultiFitUncalibRecHit.cpu.activeBXs = [-4,-2,0,2,4] - process.ecalMultiFitUncalibRecHit.cpu.useLumiInfoRunHeader = False + process.ecalMultiFitUncalibRecHitCPU.activeBXs = [-4,-2,0,2,4] + process.ecalMultiFitUncalibRecHitCPU.useLumiInfoRunHeader = False return process def configureEcalLocalNoOOTPU(process): - process.ecalMultiFitUncalibRecHit.cpu.activeBXs = [0] - process.ecalMultiFitUncalibRecHit.cpu.useLumiInfoRunHeader = False + process.ecalMultiFitUncalibRecHitCPU.activeBXs = [0] + process.ecalMultiFitUncalibRecHitCPU.useLumiInfoRunHeader = False return process diff --git a/RecoLocalCalo/EcalRecProducers/python/ecalMultiFitUncalibRecHit_cff.py b/RecoLocalCalo/EcalRecProducers/python/ecalMultiFitUncalibRecHit_cff.py index 4d8f415e40170..c6104c21b62db 100644 --- a/RecoLocalCalo/EcalRecProducers/python/ecalMultiFitUncalibRecHit_cff.py +++ b/RecoLocalCalo/EcalRecProducers/python/ecalMultiFitUncalibRecHit_cff.py @@ -4,8 +4,9 @@ # ECAL multifit running on CPU from RecoLocalCalo.EcalRecProducers.ecalMultiFitUncalibRecHit_cfi import ecalMultiFitUncalibRecHit as _ecalMultiFitUncalibRecHit +ecalMultiFitUncalibRecHitCPU = _ecalMultiFitUncalibRecHit.clone() ecalMultiFitUncalibRecHit = SwitchProducerCUDA( - cpu = _ecalMultiFitUncalibRecHit.clone() + cpu = ecalMultiFitUncalibRecHitCPU ) ecalMultiFitUncalibRecHitTask = cms.Task( @@ -13,6 +14,8 @@ ecalMultiFitUncalibRecHit ) +from Configuration.StandardSequences.Accelerators_cff import * + # ECAL conditions used by the multifit running on GPU from RecoLocalCalo.EcalRecProducers.ecalPedestalsGPUESProducer_cfi import ecalPedestalsGPUESProducer from RecoLocalCalo.EcalRecProducers.ecalGainRatiosGPUESProducer_cfi import ecalGainRatiosGPUESProducer @@ -64,3 +67,39 @@ # ECAL multifit running on CPU, or convert the uncalibrated rechits from SoA to legacy format ecalMultiFitUncalibRecHit, )) + +# modifications for alpaka +from Configuration.ProcessModifiers.alpaka_cff import alpaka + +# ECAL conditions used by the multifit running on the accelerator +from RecoLocalCalo.EcalRecProducers.ecalMultifitConditionsHostESProducer_cfi import ecalMultifitConditionsHostESProducer +from RecoLocalCalo.EcalRecProducers.ecalMultifitParametersHostESProducer_cfi import ecalMultifitParametersHostESProducer + +ecalMultifitParametersSource = cms.ESSource("EmptyESSource", + recordName = cms.string('EcalMultifitParametersRcd'), + iovIsRunNotTime = cms.bool(True), + firstValid = cms.vuint32(1) +) + +# ECAL multifit running on the accelerator +from RecoLocalCalo.EcalRecProducers.ecalUncalibRecHitProducerPortable_cfi import ecalUncalibRecHitProducerPortable as _ecalUncalibRecHitProducerPortable +ecalMultiFitUncalibRecHitPortable = _ecalUncalibRecHitProducerPortable.clone( + digisLabelEB = 'ecalDigisPortable:ebDigis', + digisLabelEE = 'ecalDigisPortable:eeDigis' +) + +# replace the SwitchProducerCUDA branches with the module to convert the uncalibrated rechits from SoA to legacy format +from RecoLocalCalo.EcalRecProducers.ecalUncalibRecHitSoAToLegacy_cfi import ecalUncalibRecHitSoAToLegacy as _ecalUncalibRecHitSoAToLegacy +alpaka.toModify(ecalMultiFitUncalibRecHit, + cpu = _ecalUncalibRecHitSoAToLegacy.clone() +) + +alpaka.toReplaceWith(ecalMultiFitUncalibRecHitTask, cms.Task( + # ECAL conditions used by the multifit running on the accelerator + ecalMultifitConditionsHostESProducer, + ecalMultifitParametersHostESProducer, + # ECAL multifit running on device + ecalMultiFitUncalibRecHitPortable, + # ECAL multifit running on CPU, or convert the uncalibrated rechits from SoA to legacy format + ecalMultiFitUncalibRecHit, +)) diff --git a/Validation/Configuration/python/ECALHCAL.py b/Validation/Configuration/python/ECALHCAL.py index 4f7d073fb02c0..053787aaf9d6d 100644 --- a/Validation/Configuration/python/ECALHCAL.py +++ b/Validation/Configuration/python/ECALHCAL.py @@ -37,8 +37,8 @@ def customise(process): process.schedule.append(process.generation_step) process.schedule.append(process.simulation_step) - process.ecalMultiFitUncalibRecHit.cpu.EBdigiCollection = cms.InputTag("simEcalDigis","ebDigis") - process.ecalMultiFitUncalibRecHit.cpu.EEdigiCollection = cms.InputTag("simEcalDigis","eeDigis") + process.ecalMultiFitUncalibRecHitCPU.EBdigiCollection = "simEcalDigis:ebDigis" + process.ecalMultiFitUncalibRecHitCPU.EEdigiCollection = "simEcalDigis:eeDigis" process.ecalPreshowerRecHit.ESdigiCollection = cms.InputTag("simEcalPreshowerDigis") delattr(process,"hbhereco")