diff --git a/CUDADataFormats/SiPixelCluster/BuildFile.xml b/CUDADataFormats/SiPixelCluster/BuildFile.xml index 5406d1355533f..1bf72a85ddc0a 100644 --- a/CUDADataFormats/SiPixelCluster/BuildFile.xml +++ b/CUDADataFormats/SiPixelCluster/BuildFile.xml @@ -2,6 +2,7 @@ + diff --git a/CUDADataFormats/SiPixelCluster/interface/SiPixelClustersCUDA.h b/CUDADataFormats/SiPixelCluster/interface/SiPixelClustersCUDA.h index 4ecdf14d8d33c..7f461bef6d2f9 100644 --- a/CUDADataFormats/SiPixelCluster/interface/SiPixelClustersCUDA.h +++ b/CUDADataFormats/SiPixelCluster/interface/SiPixelClustersCUDA.h @@ -5,16 +5,34 @@ #include "HeterogeneousCore/CUDAUtilities/interface/host_unique_ptr.h" #include "HeterogeneousCore/CUDAUtilities/interface/cudaCompat.h" +#include "DataFormats/SoATemplate/interface/SoALayout.h" +#include "CUDADataFormats/Common/interface/PortableDeviceCollection.h" + #include -class SiPixelClustersCUDA { +GENERATE_SOA_LAYOUT(SiPixelClustersCUDALayout, + SOA_COLUMN(uint32_t, moduleStart), + SOA_COLUMN(uint32_t, clusInModule), + SOA_COLUMN(uint32_t, moduleId), + SOA_COLUMN(uint32_t, clusModuleStart)) + +using SiPixelClustersCUDASoA = SiPixelClustersCUDALayout<>; +using SiPixelClustersCUDASOAView = SiPixelClustersCUDALayout<>::View; +using SiPixelClustersCUDASOAConstView = SiPixelClustersCUDALayout<>::ConstView; + +// TODO: The class is created via inheritance of the PortableDeviceCollection. +// This is generally discouraged, and should be done via composition, i.e., +// by adding a public class attribute like: +// cms::cuda::PortableDeviceCollection> collection; +// See: https://github.com/cms-sw/cmssw/pull/40465#discussion_r1067364306 +class SiPixelClustersCUDA : public cms::cuda::PortableDeviceCollection> { public: SiPixelClustersCUDA() = default; - explicit SiPixelClustersCUDA(size_t maxModules, cudaStream_t stream); ~SiPixelClustersCUDA() = default; - SiPixelClustersCUDA(const SiPixelClustersCUDA &) = delete; - SiPixelClustersCUDA &operator=(const SiPixelClustersCUDA &) = delete; + explicit SiPixelClustersCUDA(size_t maxModules, cudaStream_t stream) + : PortableDeviceCollection>(maxModules + 1, stream) {} + SiPixelClustersCUDA(SiPixelClustersCUDA &&) = default; SiPixelClustersCUDA &operator=(SiPixelClustersCUDA &&) = default; @@ -26,41 +44,7 @@ class SiPixelClustersCUDA { uint32_t nClusters() const { return nClusters_h; } int32_t offsetBPIX2() const { return offsetBPIX2_h; } - uint32_t *moduleStart() { return moduleStart_d.get(); } - uint32_t *clusInModule() { return clusInModule_d.get(); } - uint32_t *moduleId() { return moduleId_d.get(); } - uint32_t *clusModuleStart() { return clusModuleStart_d.get(); } - - uint32_t const *moduleStart() const { return moduleStart_d.get(); } - uint32_t const *clusInModule() const { return clusInModule_d.get(); } - uint32_t const *moduleId() const { return moduleId_d.get(); } - uint32_t const *clusModuleStart() const { return clusModuleStart_d.get(); } - - class SiPixelClustersCUDASOAView { - public: - __device__ __forceinline__ uint32_t moduleStart(int i) const { return __ldg(moduleStart_ + i); } - __device__ __forceinline__ uint32_t clusInModule(int i) const { return __ldg(clusInModule_ + i); } - __device__ __forceinline__ uint32_t moduleId(int i) const { return __ldg(moduleId_ + i); } - __device__ __forceinline__ uint32_t clusModuleStart(int i) const { return __ldg(clusModuleStart_ + i); } - - uint32_t const *moduleStart_; - uint32_t const *clusInModule_; - uint32_t const *moduleId_; - uint32_t const *clusModuleStart_; - }; - - SiPixelClustersCUDASOAView 
const *view() const { return view_d.get(); } - private: - cms::cuda::device::unique_ptr moduleStart_d; // index of the first pixel of each module - cms::cuda::device::unique_ptr clusInModule_d; // number of clusters found in each module - cms::cuda::device::unique_ptr moduleId_d; // module id of each module - - // originally from rechits - cms::cuda::device::unique_ptr clusModuleStart_d; // index of the first cluster of each module - - cms::cuda::device::unique_ptr view_d; // "me" pointer - uint32_t nClusters_h = 0; int32_t offsetBPIX2_h = 0; }; diff --git a/CUDADataFormats/SiPixelCluster/src/SiPixelClustersCUDA.cc b/CUDADataFormats/SiPixelCluster/src/SiPixelClustersCUDA.cc deleted file mode 100644 index c8a340d2162f9..0000000000000 --- a/CUDADataFormats/SiPixelCluster/src/SiPixelClustersCUDA.cc +++ /dev/null @@ -1,19 +0,0 @@ -#include "CUDADataFormats/SiPixelCluster/interface/SiPixelClustersCUDA.h" -#include "HeterogeneousCore/CUDAUtilities/interface/copyAsync.h" -#include "HeterogeneousCore/CUDAUtilities/interface/device_unique_ptr.h" -#include "HeterogeneousCore/CUDAUtilities/interface/host_unique_ptr.h" - -SiPixelClustersCUDA::SiPixelClustersCUDA(size_t maxModules, cudaStream_t stream) - : moduleStart_d(cms::cuda::make_device_unique(maxModules + 1, stream)), - clusInModule_d(cms::cuda::make_device_unique(maxModules, stream)), - moduleId_d(cms::cuda::make_device_unique(maxModules, stream)), - clusModuleStart_d(cms::cuda::make_device_unique(maxModules + 1, stream)) { - auto view = cms::cuda::make_host_unique(stream); - view->moduleStart_ = moduleStart_d.get(); - view->clusInModule_ = clusInModule_d.get(); - view->moduleId_ = moduleId_d.get(); - view->clusModuleStart_ = clusModuleStart_d.get(); - - view_d = cms::cuda::make_device_unique(stream); - cms::cuda::copyAsync(view_d, view, stream); -} diff --git a/CUDADataFormats/SiPixelDigi/BuildFile.xml b/CUDADataFormats/SiPixelDigi/BuildFile.xml index 0806768a9b657..784f42c4441a4 100644 --- a/CUDADataFormats/SiPixelDigi/BuildFile.xml +++ b/CUDADataFormats/SiPixelDigi/BuildFile.xml @@ -3,6 +3,7 @@ + diff --git a/CUDADataFormats/SiPixelDigi/interface/SiPixelDigisCUDA.h b/CUDADataFormats/SiPixelDigi/interface/SiPixelDigisCUDA.h index cf6b51687982f..5888cd04a6128 100644 --- a/CUDADataFormats/SiPixelDigi/interface/SiPixelDigisCUDA.h +++ b/CUDADataFormats/SiPixelDigi/interface/SiPixelDigisCUDA.h @@ -6,17 +6,32 @@ #include "HeterogeneousCore/CUDAUtilities/interface/device_unique_ptr.h" #include "HeterogeneousCore/CUDAUtilities/interface/host_unique_ptr.h" #include "HeterogeneousCore/CUDAUtilities/interface/cudaCompat.h" -#include "CUDADataFormats/SiPixelDigi/interface/SiPixelDigisCUDASOAView.h" - -class SiPixelDigisCUDA { +#include "CUDADataFormats/Common/interface/PortableDeviceCollection.h" +#include "DataFormats/SoATemplate/interface/SoALayout.h" + +GENERATE_SOA_LAYOUT(SiPixelDigisSoALayout, + SOA_COLUMN(int32_t, clus), + SOA_COLUMN(uint32_t, pdigi), + SOA_COLUMN(uint32_t, rawIdArr), + SOA_COLUMN(uint16_t, adc), + SOA_COLUMN(uint16_t, xx), + SOA_COLUMN(uint16_t, yy), + SOA_COLUMN(uint16_t, moduleId)) + +using SiPixelDigisCUDASOA = SiPixelDigisSoALayout<>; +using SiPixelDigisCUDASOAView = SiPixelDigisCUDASOA::View; +using SiPixelDigisCUDASOAConstView = SiPixelDigisCUDASOA::ConstView; + +// TODO: The class is created via inheritance of the PortableDeviceCollection. +// This is generally discouraged, and should be done via composition. 
+// See: https://github.com/cms-sw/cmssw/pull/40465#discussion_r1067364306 +class SiPixelDigisCUDA : public cms::cuda::PortableDeviceCollection> { public: - using StoreType = uint16_t; SiPixelDigisCUDA() = default; - explicit SiPixelDigisCUDA(size_t maxFedWords, cudaStream_t stream); + explicit SiPixelDigisCUDA(size_t maxFedWords, cudaStream_t stream) + : PortableDeviceCollection>(maxFedWords + 1, stream) {} ~SiPixelDigisCUDA() = default; - SiPixelDigisCUDA(const SiPixelDigisCUDA &) = delete; - SiPixelDigisCUDA &operator=(const SiPixelDigisCUDA &) = delete; SiPixelDigisCUDA(SiPixelDigisCUDA &&) = default; SiPixelDigisCUDA &operator=(SiPixelDigisCUDA &&) = default; @@ -28,17 +43,7 @@ class SiPixelDigisCUDA { uint32_t nModules() const { return nModules_h; } uint32_t nDigis() const { return nDigis_h; } - cms::cuda::host::unique_ptr copyAllToHostAsync(cudaStream_t stream) const; - - SiPixelDigisCUDASOAView view() { return m_view; } - SiPixelDigisCUDASOAView const view() const { return m_view; } - private: - // These are consumed by downstream device code - cms::cuda::device::unique_ptr m_store; - - SiPixelDigisCUDASOAView m_view; - uint32_t nModules_h = 0; uint32_t nDigis_h = 0; }; diff --git a/CUDADataFormats/SiPixelDigi/interface/SiPixelDigisCUDASOAView.h b/CUDADataFormats/SiPixelDigi/interface/SiPixelDigisCUDASOAView.h deleted file mode 100644 index 78406cd241473..0000000000000 --- a/CUDADataFormats/SiPixelDigi/interface/SiPixelDigisCUDASOAView.h +++ /dev/null @@ -1,112 +0,0 @@ -#ifndef CUDADataFormats_SiPixelDigi_interface_SiPixelDigisCUDASOAView_h -#define CUDADataFormats_SiPixelDigi_interface_SiPixelDigisCUDASOAView_h - -#include - -#include "HeterogeneousCore/CUDAUtilities/interface/device_unique_ptr.h" -#include "HeterogeneousCore/CUDAUtilities/interface/host_unique_ptr.h" -#include "HeterogeneousCore/CUDAUtilities/interface/cudaCompat.h" - -#include - -class SiPixelDigisCUDASOAView { -public: - friend class SiPixelDigisCUDA; - - template - friend class SiPixelRecHitSoAFromLegacyT; - - enum class StorageLocation { - kCLUS = 0, - kPDIGI = 2, - kRAWIDARR = 4, - kADC = 6, - kXX = 7, - kYY = 8, - kMODULEIND = 9, - kMAX = 10 - }; - /* - ============================================================================================================================ - | CLUS | PDIGI | RAWIDARR | ADC | XX | YY | MODULEIND | - ============================================================================================================================ - | 0: N*32 | 2: N*32 | 4: N*32 | 6: N*16 | 7: N*16 | 8: N*16 | 9: N*16 | - ============================================================================================================================ - */ - // These are for CPU output - // we don't copy local x and y coordinates and module index - enum class StorageLocationHost { kCLUS = 0, kPDIGI = 2, kRAWIDARR = 4, kADC = 6, kMAX = 7 }; - /* - ======================================================================================== - | CLUS | PDIGI | RAWIDARR | ADC | - ======================================================================================== - | 0: N*32 | 2: N*32 | 4: N*32 | 6: N*16 | - ======================================================================================== - */ - - SiPixelDigisCUDASOAView() = default; - - template - SiPixelDigisCUDASOAView(StoreType& store, int maxFedWords, StorageLocation s) { - xx_ = getColumnAddress(StorageLocation::kXX, store, maxFedWords); - yy_ = getColumnAddress(StorageLocation::kYY, store, maxFedWords); - adc_ = 
getColumnAddress(StorageLocation::kADC, store, maxFedWords); - moduleInd_ = getColumnAddress(StorageLocation::kMODULEIND, store, maxFedWords); - clus_ = getColumnAddress(StorageLocation::kCLUS, store, maxFedWords); - pdigi_ = getColumnAddress(StorageLocation::kPDIGI, store, maxFedWords); - rawIdArr_ = getColumnAddress(StorageLocation::kRAWIDARR, store, maxFedWords); - } - - template - SiPixelDigisCUDASOAView(StoreType& store, int maxFedWords, StorageLocationHost s) { - adc_ = getColumnAddress(StorageLocationHost::kADC, store, maxFedWords); - clus_ = getColumnAddress(StorageLocationHost::kCLUS, store, maxFedWords); - pdigi_ = getColumnAddress(StorageLocationHost::kPDIGI, store, maxFedWords); - rawIdArr_ = getColumnAddress(StorageLocationHost::kRAWIDARR, store, maxFedWords); - } - - __device__ __forceinline__ uint16_t xx(int i) const { return __ldg(xx_ + i); } - __device__ __forceinline__ uint16_t yy(int i) const { return __ldg(yy_ + i); } - __device__ __forceinline__ uint16_t adc(int i) const { return __ldg(adc_ + i); } - __device__ __forceinline__ uint16_t moduleInd(int i) const { return __ldg(moduleInd_ + i); } - __device__ __forceinline__ int32_t clus(int i) const { return __ldg(clus_ + i); } - __device__ __forceinline__ uint32_t pdigi(int i) const { return __ldg(pdigi_ + i); } - __device__ __forceinline__ uint32_t rawIdArr(int i) const { return __ldg(rawIdArr_ + i); } - - const uint16_t* xx() const { return xx_; } - const uint16_t* yy() const { return yy_; } - const uint16_t* adc() const { return adc_; } - const uint16_t* moduleInd() const { return moduleInd_; } - const int32_t* clus() const { return clus_; } - const uint32_t* pdigi() const { return pdigi_; } - const uint32_t* rawIdArr() const { return rawIdArr_; } - - uint16_t* xx() { return xx_; } - uint16_t* yy() { return yy_; } - uint16_t* adc() { return adc_; } - uint16_t* moduleInd() { return moduleInd_; } - int32_t* clus() { return clus_; } - uint32_t* pdigi() { return pdigi_; } - uint32_t* rawIdArr() { return rawIdArr_; } - -private: - uint16_t* xx_; // local coordinates of each pixel - uint16_t* yy_; - uint16_t* adc_; // ADC of each pixel - uint16_t* moduleInd_; // module id of each pixel - int32_t* clus_; // cluster id of each pixel - uint32_t* pdigi_; - uint32_t* rawIdArr_; - - template - ReturnType* getColumnAddress(LocationType column, StoreType& store, int size) { - return reinterpret_cast(store.get() + static_cast(column) * roundFor128ByteAlignment(size)); - } - - static int roundFor128ByteAlignment(int size) { - constexpr int mul = 128 / sizeof(uint16_t); - return ((size + mul - 1) / mul) * mul; - }; -}; - -#endif diff --git a/CUDADataFormats/SiPixelDigi/src/SiPixelDigisCUDA.cc b/CUDADataFormats/SiPixelDigi/src/SiPixelDigisCUDA.cc deleted file mode 100644 index 9a7f8ae8bdad5..0000000000000 --- a/CUDADataFormats/SiPixelDigi/src/SiPixelDigisCUDA.cc +++ /dev/null @@ -1,29 +0,0 @@ -#include - -#include "CUDADataFormats/SiPixelDigi/interface/SiPixelDigisCUDA.h" -#include "HeterogeneousCore/CUDAUtilities/interface/copyAsync.h" -#include "HeterogeneousCore/CUDAUtilities/interface/device_unique_ptr.h" -#include "HeterogeneousCore/CUDAUtilities/interface/host_unique_ptr.h" - -SiPixelDigisCUDA::SiPixelDigisCUDA(size_t maxFedWords, cudaStream_t stream) - : m_store(cms::cuda::make_device_unique( - SiPixelDigisCUDASOAView::roundFor128ByteAlignment(maxFedWords) * - static_cast(SiPixelDigisCUDASOAView::StorageLocation::kMAX), - stream)), - m_view(m_store, maxFedWords, SiPixelDigisCUDASOAView::StorageLocation::kMAX) { - 
assert(maxFedWords != 0); } - -cms::cuda::host::unique_ptr SiPixelDigisCUDA::copyAllToHostAsync( - cudaStream_t stream) const { - auto ret = cms::cuda::make_host_unique( - m_view.roundFor128ByteAlignment(nDigis()) * static_cast(SiPixelDigisCUDASOAView::StorageLocationHost::kMAX), - stream); - cudaCheck(cudaMemcpyAsync(ret.get(), - m_view.clus(), - m_view.roundFor128ByteAlignment(nDigis()) * sizeof(SiPixelDigisCUDA::StoreType) * - static_cast(SiPixelDigisCUDASOAView::StorageLocationHost::kMAX), - cudaMemcpyDeviceToHost, - stream)); - return ret; -} diff --git a/CUDADataFormats/Track/BuildFile.xml b/CUDADataFormats/Track/BuildFile.xml index e3f9a0910bbd8..cf07e3b540f24 100644 --- a/CUDADataFormats/Track/BuildFile.xml +++ b/CUDADataFormats/Track/BuildFile.xml @@ -2,6 +2,7 @@ + diff --git a/CUDADataFormats/Track/README.md b/CUDADataFormats/Track/README.md new file mode 100644 index 0000000000000..8f66d9e4c4467 --- /dev/null +++ b/CUDADataFormats/Track/README.md @@ -0,0 +1,50 @@ +# Track CUDA Data Formats + +`CUDADataFormat`s meant to be used on Host (CPU) or Device (CUDA GPU) for +storing information about `Track`s created during the Pixel-local Reconstruction +chain. It stores data in an SoA manner. It combines the data contained in the +deprecated `TrackSoAHeterogeneousT` and `TrajectoryStateSoAT` classes. + +The host format inherits from `CUDADataFormats/Common/interface/PortableHostCollection.h`, +while the device format inherits from `CUDADataFormats/Common/interface/PortableDeviceCollection.h`. + +Both formats use the same SoA Layout (`TrackSoAHeterogeneousLayout`) which is generated +via the `GENERATE_SOA_LAYOUT` macro in the `PixelTrackUtilities.h` file. + +## Notes + +- `hitIndices` and `detIndices`, instances of `HitContainer`, have been added into the +layout as `SOA_SCALAR`s, meaning that they manage their own data independently from the SoA +`Layout`. This could be improved in the future, if `HitContainer` (aka a `OneToManyAssoc` of fixed size) +is replaced, but there don't seem to be any conflicts in including it in the `Layout` like this. +- Host and Device classes should **not** be created via inheritance, as is done here, +but via composition. See [this discussion](https://github.com/cms-sw/cmssw/pull/40465#discussion_r1066039309). + +## TrackSoAHeterogeneousHost + +The version of the data format to be used for storing `Track` data on the CPU. +Instances of this class are to be used for: + +- Having a place to copy data to host from device, via `cudaMemcpy`, or +- Running host-side algorithms using data stored in an SoA manner. + +## TrackSoAHeterogeneousDevice + +The version of the data format to be used for storing `Track` data on the GPU. + +Instances of `TrackSoAHeterogeneousDevice` are to be created on the host and +used on the device only. To do so, the instance's `view()` method is to be called +to pass a `View` to any kernel launched. Accessing data from the `view()` is not +possible on the host side. + +## Utilities + +`PixelTrackUtilities.h` contains a collection of methods that were originally +defined as class methods of either `TrackSoAHeterogeneousT` or `TrajectoryStateSoAT`, +and have been adapted to operate on `View` instances so that they are callable +from within `__global__` kernels, on both CPU and GPU. + +## Use case + +See `test/TrackSoAHeterogeneous_test.cpp` for a simple example of instantiation, +processing and copying from device to host. 
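For orientation only (not part of the patch): a minimal host-side sketch of how the two Track collections introduced above are meant to be used together. It mirrors the copy pattern of `TrackSoAHeterogeneous_test.cpp` added later in this PR; the function name is hypothetical, and `pixelTopology::Phase1` is one of the traits the PR instantiates.

```cpp
// Minimal usage sketch (assumption: follows TrackSoAHeterogeneous_test.cpp from this PR).
#include "CUDADataFormats/Track/interface/TrackSoAHeterogeneousDevice.h"
#include "CUDADataFormats/Track/interface/TrackSoAHeterogeneousHost.h"
#include "HeterogeneousCore/CUDAUtilities/interface/cudaCheck.h"
#include "Geometry/CommonTopologies/interface/SimplePixelTopology.h"

void copyTracksToHost(cudaStream_t stream) {
  // Device collection: the constructor allocates the SoA buffer on the GPU.
  TrackSoAHeterogeneousDevice<pixelTopology::Phase1> tracks_d(stream);

  // Kernels take the (const) view by value, e.g.:
  //   someKernel<<<blocks, threads, 0, stream>>>(tracks_d.view());

  // Host collection with the same Layout, used as the copy destination.
  TrackSoAHeterogeneousHost<pixelTopology::Phase1> tracks_h(stream);

  // A single async memcpy moves the whole SoA buffer, since both sides share the Layout.
  cudaCheck(cudaMemcpyAsync(tracks_h.buffer().get(),
                            tracks_d.const_buffer().get(),
                            tracks_d.bufferSize(),
                            cudaMemcpyDeviceToHost,
                            stream));
  cudaCheck(cudaStreamSynchronize(stream));

  // Host-side access then goes through the host view, e.g. tracks_h.view()[0].pt().
}
```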
diff --git a/CUDADataFormats/Track/interface/PixelTrackHeterogeneous.h b/CUDADataFormats/Track/interface/PixelTrackHeterogeneous.h deleted file mode 100644 index f9e9b3a37c63f..0000000000000 --- a/CUDADataFormats/Track/interface/PixelTrackHeterogeneous.h +++ /dev/null @@ -1,11 +0,0 @@ -#ifndef CUDADataFormats_Track_PixelTrackHeterogeneous_h -#define CUDADataFormats_Track_PixelTrackHeterogeneous_h - -#include "CUDADataFormats/Common/interface/HeterogeneousSoA.h" -#include "CUDADataFormats/Track/interface/TrackSoAHeterogeneousT.h" -#include "Geometry/CommonTopologies/interface/SimplePixelTopology.h" - -template -using PixelTrackHeterogeneousT = HeterogeneousSoA>; - -#endif // #ifndef CUDADataFormats_Track_PixelTrackHeterogeneous_h diff --git a/CUDADataFormats/Track/interface/PixelTrackUtilities.h b/CUDADataFormats/Track/interface/PixelTrackUtilities.h new file mode 100644 index 0000000000000..6d7ea258be8d2 --- /dev/null +++ b/CUDADataFormats/Track/interface/PixelTrackUtilities.h @@ -0,0 +1,243 @@ +#ifndef CUDADataFormats_Track_PixelTrackUtilities_h +#define CUDADataFormats_Track_PixelTrackUtilities_h + +#include +#include +#include "Geometry/CommonTopologies/interface/SimplePixelTopology.h" +#include "HeterogeneousCore/CUDAUtilities/interface/HistoContainer.h" +#include "DataFormats/SoATemplate/interface/SoALayout.h" + +namespace pixelTrack { + + enum class Quality : uint8_t { bad = 0, edup, dup, loose, strict, tight, highPurity, notQuality }; + constexpr uint32_t qualitySize{uint8_t(Quality::notQuality)}; + const std::string qualityName[qualitySize]{"bad", "edup", "dup", "loose", "strict", "tight", "highPurity"}; + inline Quality qualityByName(std::string const &name) { + auto qp = std::find(qualityName, qualityName + qualitySize, name) - qualityName; + return static_cast(qp); + } + +} // namespace pixelTrack + +template +struct TrackSoA { + static constexpr int32_t S = TrackerTraits::maxNumberOfTuples; + static constexpr int32_t H = TrackerTraits::avgHitsPerTrack; + // Aliases in order to not confuse the GENERATE_SOA_LAYOUT + // macro with weird colons and angled brackets. + using Vector5f = Eigen::Matrix; + using Vector15f = Eigen::Matrix; + using Quality = pixelTrack::Quality; + + using hindex_type = uint32_t; + + using HitContainer = cms::cuda::OneToManyAssoc; + + GENERATE_SOA_LAYOUT(TrackSoALayout, + SOA_COLUMN(Quality, quality), + SOA_COLUMN(float, chi2), + SOA_COLUMN(int8_t, nLayers), + SOA_COLUMN(float, eta), + SOA_COLUMN(float, pt), + SOA_EIGEN_COLUMN(Vector5f, state), + SOA_EIGEN_COLUMN(Vector15f, covariance), + SOA_SCALAR(int, nTracks), + SOA_SCALAR(HitContainer, hitIndices), + SOA_SCALAR(HitContainer, detIndices)) +}; + +// Methods that operate on View and ConstView of the TrackSoA, and cannot be class methods. 
+ +template +struct TracksUtilities { + using TrackSoAView = typename TrackSoA::template TrackSoALayout<>::View; + using TrackSoAConstView = typename TrackSoA::template TrackSoALayout<>::ConstView; + using hindex_type = typename TrackSoA::hindex_type; + + // State at the Beam spot + // phi,tip,1/pt,cotan(theta),zip + static __host__ __device__ inline float charge(const TrackSoAConstView &tracks, int32_t i) { + return std::copysign(1.f, tracks[i].state()(2)); + } + + static constexpr __host__ __device__ inline float phi(const TrackSoAConstView &tracks, int32_t i) { + return tracks[i].state()(0); + } + + static constexpr __host__ __device__ inline float tip(const TrackSoAConstView &tracks, int32_t i) { + return tracks[i].state()(1); + } + + static constexpr __host__ __device__ inline float zip(const TrackSoAConstView &tracks, int32_t i) { + return tracks[i].state()(4); + } + + static constexpr __host__ __device__ inline bool isTriplet(const TrackSoAConstView &tracks, int i) { + return tracks[i].nLayers() == 3; + } + + template + static constexpr __host__ __device__ inline void copyFromCircle( + TrackSoAView &tracks, V3 const &cp, M3 const &ccov, V2 const &lp, M2 const &lcov, float b, int32_t i) { + tracks[i].state() << cp.template cast(), lp.template cast(); + + tracks[i].state()(2) = tracks[i].state()(2) * b; + auto cov = tracks[i].covariance(); + cov(0) = ccov(0, 0); + cov(1) = ccov(0, 1); + cov(2) = b * float(ccov(0, 2)); + cov(4) = cov(3) = 0; + cov(5) = ccov(1, 1); + cov(6) = b * float(ccov(1, 2)); + cov(8) = cov(7) = 0; + cov(9) = b * b * float(ccov(2, 2)); + cov(11) = cov(10) = 0; + cov(12) = lcov(0, 0); + cov(13) = lcov(0, 1); + cov(14) = lcov(1, 1); + } + + template + static constexpr __host__ __device__ inline void copyFromDense(TrackSoAView &tracks, + V5 const &v, + M5 const &cov, + int32_t i) { + tracks[i].state() = v.template cast(); + for (int j = 0, ind = 0; j < 5; ++j) + for (auto k = j; k < 5; ++k) + tracks[i].covariance()(ind++) = cov(j, k); + } + + template + static constexpr __host__ __device__ inline void copyToDense(const TrackSoAConstView &tracks, + V5 &v, + M5 &cov, + int32_t i) { + v = tracks[i].state().template cast(); + for (int j = 0, ind = 0; j < 5; ++j) { + cov(j, j) = tracks[i].covariance()(ind++); + for (auto k = j + 1; k < 5; ++k) + cov(k, j) = cov(j, k) = tracks[i].covariance()(ind++); + } + } + + static constexpr __host__ __device__ inline int computeNumberOfLayers(const TrackSoAConstView &tracks, int32_t i) { + auto pdet = tracks.detIndices().begin(i); + int nl = 1; + auto ol = pixelTopology::getLayer(*pdet); + for (; pdet < tracks.detIndices().end(i); ++pdet) { + auto il = pixelTopology::getLayer(*pdet); + if (il != ol) + ++nl; + ol = il; + } + return nl; + } + + static constexpr __host__ __device__ inline int nHits(const TrackSoAConstView &tracks, int i) { + return tracks.detIndices().size(i); + } +}; + +namespace pixelTrack { + + template + struct QualityCutsT {}; + + template + struct QualityCutsT> { + using TrackSoAView = typename TrackSoA::template TrackSoALayout<>::View; + using TrackSoAConstView = typename TrackSoA::template TrackSoALayout<>::ConstView; + using tracksHelper = TracksUtilities; + // chi2 cut = chi2Scale * (chi2Coeff[0] + pT/GeV * (chi2Coeff[1] + pT/GeV * (chi2Coeff[2] + pT/GeV * chi2Coeff[3]))) + float chi2Coeff[4]; + float chi2MaxPt; // GeV + float chi2Scale; + + struct Region { + float maxTip; // cm + float minPt; // GeV + float maxZip; // cm + }; + + Region triplet; + Region quadruplet; + + __device__ __forceinline__ bool 
isHP(const TrackSoAConstView &tracks, int nHits, int it) const { + // impose "region cuts" based on the fit results (phi, Tip, pt, cotan(theta)), Zip) + // default cuts: + // - for triplets: |Tip| < 0.3 cm, pT > 0.5 GeV, |Zip| < 12.0 cm + // - for quadruplets: |Tip| < 0.5 cm, pT > 0.3 GeV, |Zip| < 12.0 cm + // (see CAHitNtupletGeneratorGPU.cc) + auto const ®ion = (nHits > 3) ? quadruplet : triplet; + return (std::abs(tracksHelper::tip(tracks, it)) < region.maxTip) and (tracks.pt(it) > region.minPt) and + (std::abs(tracksHelper::zip(tracks, it)) < region.maxZip); + } + + __device__ __forceinline__ bool strictCut(const TrackSoAConstView &tracks, int it) const { + auto roughLog = [](float x) { + // max diff [0.5,12] at 1.25 0.16143 + // average diff 0.0662998 + union IF { + uint32_t i; + float f; + }; + IF z; + z.f = x; + uint32_t lsb = 1 < 21; + z.i += lsb; + z.i >>= 21; + auto f = z.i & 3; + int ex = int(z.i >> 2) - 127; + + // log2(1+0.25*f) + // averaged over bins + const float frac[4] = {0.160497f, 0.452172f, 0.694562f, 0.901964f}; + return float(ex) + frac[f]; + }; + + float pt = std::min(tracks.pt(it), chi2MaxPt); + float chi2Cut = chi2Scale * (chi2Coeff[0] + roughLog(pt) * chi2Coeff[1]); + if (tracks.chi2(it) >= chi2Cut) { +#ifdef NTUPLE_FIT_DEBUG + printf("Bad chi2 %d pt %f eta %f chi2 %f\n", it, tracks.pt(it), tracks.eta(it), tracks.chi2(it)); +#endif + return true; + } + return false; + } + }; + + template + struct QualityCutsT> { + using TrackSoAView = typename TrackSoA::template TrackSoALayout<>::View; + using TrackSoAConstView = typename TrackSoA::template TrackSoALayout<>::ConstView; + using tracksHelper = TracksUtilities; + + float maxChi2; + float minPt; + float maxTip; + float maxZip; + + __device__ __forceinline__ bool isHP(const TrackSoAConstView &tracks, int nHits, int it) const { + return (std::abs(tracksHelper::tip(tracks, it)) < maxTip) and (tracks.pt(it) > minPt) and + (std::abs(tracksHelper::zip(tracks, it)) < maxZip); + } + __device__ __forceinline__ bool strictCut(const TrackSoAConstView &tracks, int it) const { + return tracks.chi2(it) >= maxChi2; + } + }; + +} // namespace pixelTrack + +template +using TrackLayout = typename TrackSoA::template TrackSoALayout<>; +template +using TrackSoAView = typename TrackSoA::template TrackSoALayout<>::View; +template +using TrackSoAConstView = typename TrackSoA::template TrackSoALayout<>::ConstView; + +template struct TracksUtilities; +template struct TracksUtilities; + +#endif diff --git a/CUDADataFormats/Track/interface/TrackSoAHeterogeneousDevice.h b/CUDADataFormats/Track/interface/TrackSoAHeterogeneousDevice.h new file mode 100644 index 0000000000000..1938991e071e1 --- /dev/null +++ b/CUDADataFormats/Track/interface/TrackSoAHeterogeneousDevice.h @@ -0,0 +1,36 @@ +#ifndef CUDADataFormats_Track_TrackHeterogeneousDevice_H +#define CUDADataFormats_Track_TrackHeterogeneousDevice_H + +#include + +#include "CUDADataFormats/Track/interface/PixelTrackUtilities.h" +#include "CUDADataFormats/Common/interface/PortableDeviceCollection.h" + +#include "Geometry/CommonTopologies/interface/SimplePixelTopology.h" + +// TODO: The class is created via inheritance of the PortableDeviceCollection. +// This is generally discouraged, and should be done via composition. 
+// See: https://github.com/cms-sw/cmssw/pull/40465#discussion_r1067364306 +template +class TrackSoAHeterogeneousDevice : public cms::cuda::PortableDeviceCollection> { +public: + using cms::cuda::PortableDeviceCollection>::view; + using cms::cuda::PortableDeviceCollection>::const_view; + using cms::cuda::PortableDeviceCollection>::buffer; + using cms::cuda::PortableDeviceCollection>::bufferSize; + + TrackSoAHeterogeneousDevice() = default; // cms::cuda::Product needs this + + // Constructor which specifies the SoA size + explicit TrackSoAHeterogeneousDevice(cudaStream_t stream) + : cms::cuda::PortableDeviceCollection>(TrackerTraits::maxNumberOfTuples, stream) {} +}; + +namespace pixelTrack { + + using TrackSoADevicePhase1 = TrackSoAHeterogeneousDevice; + using TrackSoADevicePhase2 = TrackSoAHeterogeneousDevice; + +} // namespace pixelTrack + +#endif // CUDADataFormats_Track_TrackHeterogeneousT_H diff --git a/CUDADataFormats/Track/interface/TrackSoAHeterogeneousHost.h b/CUDADataFormats/Track/interface/TrackSoAHeterogeneousHost.h new file mode 100644 index 0000000000000..af8af2a40a52e --- /dev/null +++ b/CUDADataFormats/Track/interface/TrackSoAHeterogeneousHost.h @@ -0,0 +1,35 @@ +#ifndef CUDADataFormats_Track_TrackHeterogeneousHost_H +#define CUDADataFormats_Track_TrackHeterogeneousHost_H + +#include + +#include "CUDADataFormats/Track/interface/PixelTrackUtilities.h" +#include "CUDADataFormats/Common/interface/PortableHostCollection.h" + +// TODO: The class is created via inheritance of the PortableHostCollection. +// This is generally discouraged, and should be done via composition. +// See: https://github.com/cms-sw/cmssw/pull/40465#discussion_r1067364306 +template +class TrackSoAHeterogeneousHost : public cms::cuda::PortableHostCollection> { +public: + static constexpr int32_t S = TrackerTraits::maxNumberOfTuples; //TODO: this could be made configurable at runtime + explicit TrackSoAHeterogeneousHost() : cms::cuda::PortableHostCollection>(S) {} + + using cms::cuda::PortableHostCollection>::view; + using cms::cuda::PortableHostCollection>::const_view; + using cms::cuda::PortableHostCollection>::buffer; + using cms::cuda::PortableHostCollection>::bufferSize; + + // Constructor which specifies the SoA size + explicit TrackSoAHeterogeneousHost(cudaStream_t stream) + : cms::cuda::PortableHostCollection>(S, stream) {} +}; + +namespace pixelTrack { + + using TrackSoAHostPhase1 = TrackSoAHeterogeneousHost; + using TrackSoAHostPhase2 = TrackSoAHeterogeneousHost; + +} // namespace pixelTrack + +#endif // CUDADataFormats_Track_TrackHeterogeneousT_H diff --git a/CUDADataFormats/Track/interface/TrackSoAHeterogeneousT.h b/CUDADataFormats/Track/interface/TrackSoAHeterogeneousT.h deleted file mode 100644 index b5b1df0d5118a..0000000000000 --- a/CUDADataFormats/Track/interface/TrackSoAHeterogeneousT.h +++ /dev/null @@ -1,195 +0,0 @@ -#ifndef CUDADataFormats_Track_TrackHeterogeneousT_H -#define CUDADataFormats_Track_TrackHeterogeneousT_H - -#include -#include - -#include "CUDADataFormats/Track/interface/TrajectoryStateSoAT.h" -#include "HeterogeneousCore/CUDAUtilities/interface/HistoContainer.h" -#include "Geometry/CommonTopologies/interface/SimplePixelTopology.h" -#include "CUDADataFormats/Common/interface/HeterogeneousSoA.h" -#include "DataFormats/Common/interface/CMS_CLASS_VERSION.h" - -namespace pixelTrack { - - enum class Quality : uint8_t { bad = 0, edup, dup, loose, strict, tight, highPurity, notQuality }; - constexpr uint32_t qualitySize{uint8_t(Quality::notQuality)}; - const std::string 
qualityName[qualitySize]{"bad", "edup", "dup", "loose", "strict", "tight", "highPurity"}; - inline Quality qualityByName(std::string const &name) { - auto qp = std::find(qualityName, qualityName + qualitySize, name) - qualityName; - return static_cast(qp); - } - -} // namespace pixelTrack - -template -class TrackSoAHeterogeneousT { -public: - static constexpr int32_t S = TrackerTraits::maxNumberOfTuples; - static constexpr int32_t H = TrackerTraits::maxHitsOnTrack; // Average hits rather than max? - static constexpr int32_t stride() { return S; } - - using hindex_type = uint32_t; //TrackerTraits::hindex_type ? - - using Quality = pixelTrack::Quality; - using HitContainer = cms::cuda::OneToManyAssoc; - - // Always check quality is at least loose! - // CUDA does not support enums in __lgc ... -protected: - eigenSoA::ScalarSoA quality_; - -public: - constexpr Quality quality(int32_t i) const { return (Quality)(quality_(i)); } - constexpr Quality &quality(int32_t i) { return (Quality &)(quality_(i)); } - constexpr Quality const *qualityData() const { return (Quality const *)(quality_.data()); } - constexpr Quality *qualityData() { return (Quality *)(quality_.data()); } - - // this is chi2/ndof as not necessarely all hits are used in the fit - eigenSoA::ScalarSoA chi2; - - eigenSoA::ScalarSoA nLayers; - - constexpr int nTracks() const { return nTracks_; } - constexpr void setNTracks(int n) { nTracks_ = n; } - - constexpr int nHits(int i) const { return detIndices.size(i); } - - constexpr bool isTriplet(int i) const { return nLayers(i) == 3; } - - constexpr int computeNumberOfLayers(int32_t i) const { - // layers are in order and we assume tracks are either forward or backward - auto pdet = detIndices.begin(i); - int nl = 1; - auto ol = pixelTopology::getLayer(*pdet); - for (; pdet < detIndices.end(i); ++pdet) { - auto il = pixelTopology::getLayer(*pdet); - if (il != ol) - ++nl; - ol = il; - } - return nl; - } - - // State at the Beam spot - // phi,tip,1/pt,cotan(theta),zip - TrajectoryStateSoAT stateAtBS; - eigenSoA::ScalarSoA eta; - eigenSoA::ScalarSoA pt; - constexpr float charge(int32_t i) const { return std::copysign(1.f, stateAtBS.state(i)(2)); } - constexpr float phi(int32_t i) const { return stateAtBS.state(i)(0); } - constexpr float tip(int32_t i) const { return stateAtBS.state(i)(1); } - constexpr float zip(int32_t i) const { return stateAtBS.state(i)(4); } - - // state at the detector of the outermost hit - // representation to be decided... 
- // not yet filled on GPU - // TrajectoryStateSoA stateAtOuterDet; - - HitContainer hitIndices; - HitContainer detIndices; - -private: - int nTracks_; -}; - -namespace pixelTrack { - - template - using TrackSoAT = TrackSoAHeterogeneousT; - - template - using HitContainerT = typename TrackSoAHeterogeneousT::HitContainer; - - //Used only to ease classes definitions - using TrackSoAPhase1 = TrackSoAHeterogeneousT; - using TrackSoAPhase2 = TrackSoAHeterogeneousT; - - template - struct QualityCutsT {}; - - template - struct QualityCutsT> { - // chi2 cut = chi2Scale * (chi2Coeff[0] + pT/GeV * (chi2Coeff[1] + pT/GeV * (chi2Coeff[2] + pT/GeV * chi2Coeff[3]))) - float chi2Coeff[4]; - float chi2MaxPt; // GeV - float chi2Scale; - - struct Region { - float maxTip; // cm - float minPt; // GeV - float maxZip; // cm - }; - - Region triplet; - Region quadruplet; - - __device__ __forceinline__ bool isHP(TrackSoAHeterogeneousT const *__restrict__ tracks, - int nHits, - int it) const { - // impose "region cuts" based on the fit results (phi, Tip, pt, cotan(theta)), Zip) - // default cuts: - // - for triplets: |Tip| < 0.3 cm, pT > 0.5 GeV, |Zip| < 12.0 cm - // - for quadruplets: |Tip| < 0.5 cm, pT > 0.3 GeV, |Zip| < 12.0 cm - // (see CAHitNtupletGeneratorGPU.cc) - auto const ®ion = (nHits > 3) ? quadruplet : triplet; - return (std::abs(tracks->tip(it)) < region.maxTip) and (tracks->pt(it) > region.minPt) and - (std::abs(tracks->zip(it)) < region.maxZip); - } - - __device__ __forceinline__ bool strictCut(TrackSoAHeterogeneousT const *__restrict__ tracks, - int it) const { - auto roughLog = [](float x) { - // max diff [0.5,12] at 1.25 0.16143 - // average diff 0.0662998 - union IF { - uint32_t i; - float f; - }; - IF z; - z.f = x; - uint32_t lsb = 1 < 21; - z.i += lsb; - z.i >>= 21; - auto f = z.i & 3; - int ex = int(z.i >> 2) - 127; - - // log2(1+0.25*f) - // averaged over bins - const float frac[4] = {0.160497f, 0.452172f, 0.694562f, 0.901964f}; - return float(ex) + frac[f]; - }; - - float pt = std::min(tracks->pt(it), chi2MaxPt); - float chi2Cut = chi2Scale * (chi2Coeff[0] + roughLog(pt) * chi2Coeff[1]); - if (tracks->chi2(it) >= chi2Cut) { -#ifdef NTUPLE_FIT_DEBUG - printf("Bad chi2 %d pt %f eta %f chi2 %f\n", it, tracks->pt(it), tracks->eta(it), tracks->chi2(it)); -#endif - return true; - } - return false; - } - }; - - template - struct QualityCutsT> { - float maxChi2; - float minPt; - float maxTip; - float maxZip; - - __device__ __forceinline__ bool isHP(TrackSoAHeterogeneousT const *__restrict__ tracks, - int nHits, - int it) const { - return (std::abs(tracks->tip(it)) < maxTip) and (tracks->pt(it) > minPt) and (std::abs(tracks->zip(it)) < maxZip); - } - __device__ __forceinline__ bool strictCut(TrackSoAHeterogeneousT const *__restrict__ tracks, - int it) const { - return tracks->chi2(it) >= maxChi2; - } - }; - -} // namespace pixelTrack - -#endif // CUDADataFormats_Track_TrackHeterogeneousT_H diff --git a/CUDADataFormats/Track/interface/TrajectoryStateSoAT.h b/CUDADataFormats/Track/interface/TrajectoryStateSoAT.h deleted file mode 100644 index 64fcd573a6991..0000000000000 --- a/CUDADataFormats/Track/interface/TrajectoryStateSoAT.h +++ /dev/null @@ -1,59 +0,0 @@ -#ifndef CUDADataFormats_Track_TrajectoryStateSOAT_H -#define CUDADataFormats_Track_TrajectoryStateSOAT_H - -#include -#include "HeterogeneousCore/CUDAUtilities/interface/eigenSoA.h" - -template -struct TrajectoryStateSoAT { - using Vector5f = Eigen::Matrix; - using Vector15f = Eigen::Matrix; - - using Vector5d = Eigen::Matrix; - using Matrix5d 
= Eigen::Matrix; - - static constexpr int32_t stride() { return S; } - - eigenSoA::MatrixSoA state; - eigenSoA::MatrixSoA covariance; - - template - __host__ __device__ inline void copyFromCircle( - V3 const& cp, M3 const& ccov, V2 const& lp, M2 const& lcov, float b, int32_t i) { - state(i) << cp.template cast(), lp.template cast(); - state(i)(2) *= b; - auto cov = covariance(i); - cov(0) = ccov(0, 0); - cov(1) = ccov(0, 1); - cov(2) = b * float(ccov(0, 2)); - cov(4) = cov(3) = 0; - cov(5) = ccov(1, 1); - cov(6) = b * float(ccov(1, 2)); - cov(8) = cov(7) = 0; - cov(9) = b * b * float(ccov(2, 2)); - cov(11) = cov(10) = 0; - cov(12) = lcov(0, 0); - cov(13) = lcov(0, 1); - cov(14) = lcov(1, 1); - } - - template - __host__ __device__ inline void copyFromDense(V5 const& v, M5 const& cov, int32_t i) { - state(i) = v.template cast(); - for (int j = 0, ind = 0; j < 5; ++j) - for (auto k = j; k < 5; ++k) - covariance(i)(ind++) = cov(j, k); - } - - template - __host__ __device__ inline void copyToDense(V5& v, M5& cov, int32_t i) const { - v = state(i).template cast(); - for (int j = 0, ind = 0; j < 5; ++j) { - cov(j, j) = covariance(i)(ind++); - for (auto k = j + 1; k < 5; ++k) - cov(k, j) = cov(j, k) = covariance(i)(ind++); - } - } -}; - -#endif // CUDADataFormats_Track_TrajectoryStateSOAT_H diff --git a/CUDADataFormats/Track/src/classes.h b/CUDADataFormats/Track/src/classes.h index 97c116f6c88d3..2e07adddcddd0 100644 --- a/CUDADataFormats/Track/src/classes.h +++ b/CUDADataFormats/Track/src/classes.h @@ -3,7 +3,10 @@ #include "CUDADataFormats/Common/interface/Product.h" #include "CUDADataFormats/Common/interface/HostProduct.h" -#include "CUDADataFormats/Track/interface/TrackSoAHeterogeneousT.h" + +#include "CUDADataFormats/Track/interface/TrackSoAHeterogeneousDevice.h" +#include "CUDADataFormats/Track/interface/TrackSoAHeterogeneousHost.h" + #include "DataFormats/Common/interface/Wrapper.h" #endif // CUDADataFormats_Track_src_classes_h diff --git a/CUDADataFormats/Track/src/classes_def.xml b/CUDADataFormats/Track/src/classes_def.xml index 5216c19dded65..5e3116609330a 100644 --- a/CUDADataFormats/Track/src/classes_def.xml +++ b/CUDADataFormats/Track/src/classes_def.xml @@ -1,15 +1,15 @@ - - - - - + + + + + - - - - - + + + + + diff --git a/CUDADataFormats/Track/test/BuildFile.xml b/CUDADataFormats/Track/test/BuildFile.xml index fc78783db473b..32256c87ed577 100644 --- a/CUDADataFormats/Track/test/BuildFile.xml +++ b/CUDADataFormats/Track/test/BuildFile.xml @@ -1,19 +1,22 @@ - - - - + - - - - - - + + + + + + + + + + + + diff --git a/CUDADataFormats/Track/test/TrackSoAHeterogeneous_t.cpp b/CUDADataFormats/Track/test/TrackSoAHeterogeneous_t.cpp deleted file mode 100644 index 9708b689dd05b..0000000000000 --- a/CUDADataFormats/Track/test/TrackSoAHeterogeneous_t.cpp +++ /dev/null @@ -1,21 +0,0 @@ -#include "CUDADataFormats/Track/interface/TrackSoAHeterogeneousT.h" - -#include -#include - -int main() { - // test quality - - auto q = pixelTrack::qualityByName("tight"); - assert(pixelTrack::Quality::tight == q); - q = pixelTrack::qualityByName("toght"); - assert(pixelTrack::Quality::notQuality == q); - - for (uint32_t i = 0; i < pixelTrack::qualitySize; ++i) { - auto const qt = static_cast(i); - auto q = pixelTrack::qualityByName(pixelTrack::qualityName[i]); - assert(qt == q); - } - - return 0; -} diff --git a/CUDADataFormats/Track/test/TrackSoAHeterogeneous_test.cpp b/CUDADataFormats/Track/test/TrackSoAHeterogeneous_test.cpp new file mode 100644 index 0000000000000..dafa75e2e18d7 --- /dev/null +++ 
b/CUDADataFormats/Track/test/TrackSoAHeterogeneous_test.cpp @@ -0,0 +1,73 @@ +/** + Simple test for the pixelTrack::TrackSoA data structure + which inherits from PortableDeviceCollection. + + Creates an instance of the class (automatically allocates + memory on device), passes the view of the SoA data to + the CUDA kernels which: + - Fill the SoA with data. + - Verify that the data written is correct. + + Then, the SoA data are copied back to a host-side collection + (tracks_h), whose view, built on the same Layout, is used + to access and print the data on the host. + */ + +#include +#include "CUDADataFormats/Track/interface/TrackSoAHeterogeneousDevice.h" +#include "CUDADataFormats/Track/interface/TrackSoAHeterogeneousHost.h" +#include "HeterogeneousCore/CUDAUtilities/interface/requireDevices.h" +#include "HeterogeneousCore/CUDAUtilities/interface/cudaCheck.h" + +#include "Geometry/CommonTopologies/interface/SimplePixelTopology.h" + +namespace testTrackSoA { + + template + void runKernels(TrackSoAView &tracks_view, cudaStream_t stream); +} + +int main() { + cms::cudatest::requireDevices(); + + cudaStream_t stream; + cudaCheck(cudaStreamCreateWithFlags(&stream, cudaStreamNonBlocking)); + + // Inner scope to deallocate memory before destroying the stream + { + // Instantiate tracks on device. PortableDeviceCollection allocates + // SoA on device automatically. + TrackSoAHeterogeneousDevice tracks_d(stream); + testTrackSoA::runKernels(tracks_d.view(), stream); + + // Instantiate tracks on host. This is where the data will be + // copied to from the device. + TrackSoAHeterogeneousHost tracks_h(stream); + + cudaCheck(cudaMemcpyAsync( + tracks_h.buffer().get(), tracks_d.const_buffer().get(), tracks_d.bufferSize(), cudaMemcpyDeviceToHost, stream)); + cudaCheck(cudaStreamSynchronize(stream)); + + // Print results + std::cout << "pt" + << "\t" + << "eta" + << "\t" + << "chi2" + << "\t" + << "quality" + << "\t" + << "nLayers" + << "\t" + << "hitIndices off" << std::endl; + + for (int i = 0; i < 10; ++i) { + std::cout << tracks_h.view()[i].pt() << "\t" << tracks_h.view()[i].eta() << "\t" << tracks_h.view()[i].chi2() + << "\t" << (int)tracks_h.view()[i].quality() << "\t" << (int)tracks_h.view()[i].nLayers() << "\t" + << tracks_h.view().hitIndices().off[i] << std::endl; + } + } + cudaCheck(cudaStreamDestroy(stream)); + + return 0; +} diff --git a/CUDADataFormats/Track/test/TrackSoAHeterogeneous_test.cu b/CUDADataFormats/Track/test/TrackSoAHeterogeneous_test.cu new file mode 100644 index 0000000000000..8e8595eb43e94 --- /dev/null +++ b/CUDADataFormats/Track/test/TrackSoAHeterogeneous_test.cu @@ -0,0 +1,63 @@ +#include "CUDADataFormats/Track/interface/PixelTrackUtilities.h" +#include "HeterogeneousCore/CUDAUtilities/interface/OneToManyAssoc.h" +#include "HeterogeneousCore/CUDAUtilities/interface/cudaCheck.h" + +namespace testTrackSoA { + + // Kernel which fills the TrackSoAView with data + // to test writing to it + template + __global__ void fill(TrackSoAView tracks_view) { + int i = threadIdx.x; + if (i == 0) { + tracks_view.nTracks() = 420; + } + + for (int j = i; j < tracks_view.metadata().size(); j += blockDim.x) { + tracks_view[j].pt() = (float)j; + tracks_view[j].eta() = (float)j; + tracks_view[j].chi2() = (float)j; + tracks_view[j].quality() = (pixelTrack::Quality)(j % 256); + tracks_view[j].nLayers() = j % 128; + tracks_view.hitIndices().off[j] = j; + } + } + + // Kernel which reads from the TrackSoAView to verify + // that it was written correctly from the fill kernel + template + __global__ void 
verify(TrackSoAConstView tracks_view) { + int i = threadIdx.x; + + if (i == 0) { + printf("SoA size: % d, block dims: % d\n", tracks_view.metadata().size(), blockDim.x); + assert(tracks_view.nTracks() == 420); + } + for (int j = i; j < tracks_view.metadata().size(); j += blockDim.x) { + assert(abs(tracks_view[j].pt() - (float)j) < .0001); + assert(abs(tracks_view[j].eta() - (float)j) < .0001); + assert(abs(tracks_view[j].chi2() - (float)j) < .0001); + assert(tracks_view[j].quality() == (pixelTrack::Quality)(j % 256)); + assert(tracks_view[j].nLayers() == j % 128); + assert(tracks_view.hitIndices().off[j] == j); + } + } + + // Host function which invokes the two kernels above + template + void runKernels(TrackSoAView& tracks_view, cudaStream_t stream) { + fill<<<1, 1024, 0, stream>>>(tracks_view); + cudaCheck(cudaGetLastError()); + cudaCheck(cudaDeviceSynchronize()); + + verify<<<1, 1024, 0, stream>>>(tracks_view); + cudaCheck(cudaGetLastError()); + cudaCheck(cudaDeviceSynchronize()); + } + + template void runKernels(TrackSoAView& tracks_view, + cudaStream_t stream); + template void runKernels(TrackSoAView& tracks_view, + cudaStream_t stream); + +} // namespace testTrackSoA diff --git a/CUDADataFormats/Track/test/TrajectoryStateSOA_t.h b/CUDADataFormats/Track/test/TrajectoryStateSOA_t.h index 97b88873c2613..6ba0eaa5c986e 100644 --- a/CUDADataFormats/Track/test/TrajectoryStateSOA_t.h +++ b/CUDADataFormats/Track/test/TrajectoryStateSOA_t.h @@ -1,7 +1,11 @@ -#include "CUDADataFormats/Track/interface/TrajectoryStateSoAT.h" +#include "Geometry/CommonTopologies/interface/SimplePixelTopology.h" +#include "CUDADataFormats/Track/interface/PixelTrackUtilities.h" +#include "CUDADataFormats/Track/interface/TrackSoAHeterogeneousHost.h" +#include "CUDADataFormats/Track/interface/TrackSoAHeterogeneousDevice.h" using Vector5d = Eigen::Matrix; using Matrix5d = Eigen::Matrix; +using helper = TracksUtilities; __host__ __device__ Matrix5d loadCov(Vector5d const& e) { Matrix5d cov; @@ -17,26 +21,21 @@ __host__ __device__ Matrix5d loadCov(Vector5d const& e) { return cov; } -using TS = TrajectoryStateSoAT<128>; - -__global__ void testTSSoA(TS* pts, int n) { - assert(n <= 128); - +template +__global__ void testTSSoA(TrackSoAView ts) { Vector5d par0; par0 << 0.2, 0.1, 3.5, 0.8, 0.1; Vector5d e0; e0 << 0.01, 0.01, 0.035, -0.03, -0.01; auto cov0 = loadCov(e0); - TS& ts = *pts; - int first = threadIdx.x + blockIdx.x * blockDim.x; - for (int i = first; i < n; i += blockDim.x * gridDim.x) { - ts.copyFromDense(par0, cov0, i); + for (int i = first; i < ts.metadata().size(); i += blockDim.x * gridDim.x) { + helper::copyFromDense(ts, par0, cov0, i); Vector5d par1; Matrix5d cov1; - ts.copyToDense(par1, cov1, i); + helper::copyToDense(ts, par1, cov1, i); Vector5d delV = par1 - par0; Matrix5d delM = cov1 - cov0; for (int j = 0; j < 5; ++j) { @@ -58,18 +57,29 @@ __global__ void testTSSoA(TS* pts, int n) { int main() { #ifdef __CUDACC__ cms::cudatest::requireDevices(); + cudaStream_t stream; + cudaCheck(cudaStreamCreateWithFlags(&stream, cudaStreamNonBlocking)); #endif - TS ts; +#ifdef __CUDACC__ + // Since we are going to copy data from ts_d to ts_h, we + // need to initialize the Host collection with a stream. + TrackSoAHeterogeneousHost ts_h(stream); + TrackSoAHeterogeneousDevice ts_d(stream); +#else + // If CUDA is not available, Host collection must not be initialized + // with a stream. 
+ TrackSoAHeterogeneousHost ts_h; +#endif #ifdef __CUDACC__ - TS* ts_d; - cudaCheck(cudaMalloc(&ts_d, sizeof(TS))); - testTSSoA<<<1, 64>>>(ts_d, 128); + testTSSoA<<<1, 64, 0, stream>>>(ts_d.view()); + cudaCheck(cudaGetLastError()); + cudaCheck(cudaMemcpyAsync( + ts_h.buffer().get(), ts_d.const_buffer().get(), ts_d.bufferSize(), cudaMemcpyDeviceToHost, stream)); cudaCheck(cudaGetLastError()); - cudaCheck(cudaMemcpy(&ts, ts_d, sizeof(TS), cudaMemcpyDefault)); - cudaCheck(cudaDeviceSynchronize()); + cudaCheck(cudaStreamSynchronize(stream)); #else - testTSSoA(&ts, 128); + testTSSoA(ts_h.view()); #endif } diff --git a/CUDADataFormats/TrackingRecHit/BuildFile.xml b/CUDADataFormats/TrackingRecHit/BuildFile.xml index 4cda8ebd306b0..e67c2227feef9 100644 --- a/CUDADataFormats/TrackingRecHit/BuildFile.xml +++ b/CUDADataFormats/TrackingRecHit/BuildFile.xml @@ -3,6 +3,7 @@ + diff --git a/CUDADataFormats/TrackingRecHit/interface/SiPixelHitStatus.h b/CUDADataFormats/TrackingRecHit/interface/SiPixelHitStatus.h index b3bdade5ec97c..13322ce3952b7 100644 --- a/CUDADataFormats/TrackingRecHit/interface/SiPixelHitStatus.h +++ b/CUDADataFormats/TrackingRecHit/interface/SiPixelHitStatus.h @@ -12,4 +12,9 @@ struct SiPixelHitStatus { uint8_t qBin : 3; // ∈[0,1,...,7] }; +struct SiPixelHitStatusAndCharge { + SiPixelHitStatus status; + uint32_t charge : 24; +}; + #endif diff --git a/CUDADataFormats/TrackingRecHit/interface/TrackingRecHit2DHeterogeneous.h b/CUDADataFormats/TrackingRecHit/interface/TrackingRecHit2DHeterogeneous.h deleted file mode 100644 index ad78daa8354e2..0000000000000 --- a/CUDADataFormats/TrackingRecHit/interface/TrackingRecHit2DHeterogeneous.h +++ /dev/null @@ -1,384 +0,0 @@ -#ifndef CUDADataFormats_TrackingRecHit_interface_TrackingRecHit2DHeterogeneous_h -#define CUDADataFormats_TrackingRecHit_interface_TrackingRecHit2DHeterogeneous_h - -#include "CUDADataFormats/TrackingRecHit/interface/TrackingRecHit2DSOAView.h" -#include "CUDADataFormats/Common/interface/HeterogeneousSoA.h" -#include "RecoLocalTracker/SiPixelRecHits/interface/pixelCPEforGPU.h" -#include "Geometry/CommonTopologies/interface/SimplePixelTopology.h" -#include "DataFormats/Common/interface/CMS_CLASS_VERSION.h" - -namespace { - enum class Storage32 { - kXLocal = 0, - kYLocal = 1, - kXerror = 2, - kYerror = 3, - kCharge = 4, - kXGlobal = 5, - kYGlobal = 6, - kZGlobal = 7, - kRGlobal = 8, - kPhiStorage = 9, - kLayers = 10 - }; - - enum class Storage16 { - kDetId = 0, - kPhi = 1, - kXSize = 2, - kYSize = 3, - }; -} // namespace - -template -class TrackingRecHit2DHeterogeneousT { -public: - template - friend class TrackingRecHit2DHostT; - - template - using unique_ptr = typename Traits::template unique_ptr; - - using TrackingRecHit2DSOAView = TrackingRecHit2DSOAViewT; - using PhiBinner = typename TrackingRecHit2DSOAView::PhiBinner; - using AverageGeometry = typename TrackingRecHit2DSOAView::AverageGeometry; - - TrackingRecHit2DHeterogeneousT() = default; - - explicit TrackingRecHit2DHeterogeneousT(uint32_t nHits, - int32_t offsetBPIX2, - pixelCPEforGPU::ParamsOnGPUT const* cpeParams, - uint32_t const* hitsModuleStart, - cudaStream_t stream = nullptr); - - explicit TrackingRecHit2DHeterogeneousT(cms::cuda::host::unique_ptr& store32, - cms::cuda::host::unique_ptr& store16, - uint32_t* modules, - int nHits, - cudaStream_t stream = nullptr); - ~TrackingRecHit2DHeterogeneousT() = default; - - TrackingRecHit2DHeterogeneousT(const TrackingRecHit2DHeterogeneousT&) = delete; - TrackingRecHit2DHeterogeneousT& operator=(const 
TrackingRecHit2DHeterogeneousT&) = delete; - TrackingRecHit2DHeterogeneousT(TrackingRecHit2DHeterogeneousT&&) = default; - TrackingRecHit2DHeterogeneousT& operator=(TrackingRecHit2DHeterogeneousT&&) = default; - - TrackingRecHit2DSOAView* view() { return m_view.get(); } - TrackingRecHit2DSOAView const* view() const { return m_view.get(); } - - auto nHits() const { return m_nHits; } - auto offsetBPIX2() const { return m_offsetBPIX2; } - - auto hitsModuleStart() const { return m_hitsModuleStart; } - auto hitsLayerStart() { return m_hitsLayerStart; } - auto phiBinner() { return m_phiBinner; } - auto phiBinnerStorage() { return m_phiBinnerStorage; } - auto iphi() { return m_iphi; } - - cms::cuda::host::unique_ptr localCoordToHostAsync(cudaStream_t stream) const; - - cms::cuda::host::unique_ptr hitsModuleStartToHostAsync(cudaStream_t stream) const; - - cms::cuda::host::unique_ptr store16ToHostAsync(cudaStream_t stream) const; - cms::cuda::host::unique_ptr store32ToHostAsync(cudaStream_t stream) const; - -protected: - static constexpr uint32_t n16 = 4; // number of elements in m_store16 - static constexpr uint32_t n32 = 10; // number of elements in m_store32 - static_assert(sizeof(uint32_t) == sizeof(float)); // just stating the obvious - static_assert(n32 == static_cast(Storage32::kLayers)); - unique_ptr m_store16; //! - unique_ptr m_store32; //! - - unique_ptr m_PhiBinnerStore; //! - unique_ptr m_AverageGeometryStore; //! - - unique_ptr m_view; //! - - uint32_t m_nHits; - int32_t m_offsetBPIX2; - - uint32_t const* m_hitsModuleStart; // needed for legacy, this is on GPU! - - // needed as kernel params... - PhiBinner* m_phiBinner; - typename PhiBinner::index_type* m_phiBinnerStorage; - uint32_t* m_hitsLayerStart; - int16_t* m_iphi; -}; - -//Inherit and overload only what we need to overload, remember to use this-> -//GPU -template -class TrackingRecHit2DGPUT : public TrackingRecHit2DHeterogeneousT { -public: - using TrackingRecHit2DHeterogeneousT::TrackingRecHit2DHeterogeneousT; - - cms::cuda::host::unique_ptr localCoordToHostAsync(cudaStream_t stream) const; - cms::cuda::host::unique_ptr hitsModuleStartToHostAsync(cudaStream_t stream) const; - cms::cuda::host::unique_ptr store16ToHostAsync(cudaStream_t stream) const; - cms::cuda::host::unique_ptr store32ToHostAsync(cudaStream_t stream) const; -}; - -//CPU -template -class TrackingRecHit2DCPUT : public TrackingRecHit2DHeterogeneousT { -public: - using TrackingRecHit2DHeterogeneousT::TrackingRecHit2DHeterogeneousT; - - cms::cuda::host::unique_ptr hitsModuleStartToHostAsync(cudaStream_t stream) const; - cms::cuda::host::unique_ptr store16ToHostAsync(cudaStream_t stream) const; - cms::cuda::host::unique_ptr store32ToHostAsync(cudaStream_t stream) const; -}; - -//HOST -template -class TrackingRecHit2DHostT : public TrackingRecHit2DHeterogeneousT { -public: - ~TrackingRecHit2DHostT() = default; - TrackingRecHit2DHostT() = default; - - explicit TrackingRecHit2DHostT(uint32_t nHits, - int32_t offsetBPIX2, - pixelCPEforGPU::ParamsOnGPUT const* cpeParams, - uint32_t const* hitsModuleStart, - cudaStream_t stream = nullptr) - : TrackingRecHit2DHeterogeneousT( - nHits, offsetBPIX2, cpeParams, hitsModuleStart, stream) {} - - explicit TrackingRecHit2DHostT(cms::cuda::host::unique_ptr& store32, - cms::cuda::host::unique_ptr& store16, - uint32_t* modules, - int nHits, - cudaStream_t stream = nullptr) - : TrackingRecHit2DHeterogeneousT( - store32, store16, modules, nHits, stream) {} - - explicit TrackingRecHit2DHostT(uint32_t nHits, - int32_t offsetBPIX2, - 
pixelCPEforGPU::ParamsOnGPUT const* cpeParams, - uint32_t const* hitsModuleStart, - cudaStream_t stream, - TrackingRecHit2DHeterogeneousT const* input); -}; - -#include "HeterogeneousCore/CUDAUtilities/interface/copyAsync.h" -#include "HeterogeneousCore/CUDAUtilities/interface/cudaCheck.h" - -template -TrackingRecHit2DHeterogeneousT::TrackingRecHit2DHeterogeneousT( - uint32_t nHits, - int32_t offsetBPIX2, - pixelCPEforGPU::ParamsOnGPUT const* cpeParams, - uint32_t const* hitsModuleStart, - cudaStream_t stream) - : m_nHits(nHits), m_offsetBPIX2(offsetBPIX2), m_hitsModuleStart(hitsModuleStart) { - using TrackingRecHit2DSOAView = TrackingRecHit2DSOAViewT; - - auto view = Traits::template make_host_unique(stream); - - view->m_nHits = nHits; - m_view = Traits::template make_unique(stream); // leave it on host and pass it by value? - m_AverageGeometryStore = Traits::template make_unique(stream); - view->m_averageGeometry = m_AverageGeometryStore.get(); - view->m_cpeParams = cpeParams; - view->m_hitsModuleStart = hitsModuleStart; - - // if empy do not bother - if (0 == nHits) { - if constexpr (std::is_same_v) { - cms::cuda::copyAsync(m_view, view, stream); - } else { - m_view.reset(view.release()); // NOLINT: std::move() breaks CUDA version - } - return; - } - - // the single arrays are not 128 bit alligned... - // the hits are actually accessed in order only in building - // if ordering is relevant they may have to be stored phi-ordered by layer or so - // this will break 1to1 correspondence with cluster and module locality - // so unless proven VERY inefficient we keep it ordered as generated - - m_store16 = Traits::template make_unique(nHits * n16, stream); - m_store32 = Traits::template make_unique(nHits * n32 + TrackerTraits::numberOfLayers + 1, stream); - m_PhiBinnerStore = Traits::template make_unique(stream); - - static_assert(sizeof(typename TrackingRecHit2DSOAView::hindex_type) == sizeof(float)); - static_assert(sizeof(typename TrackingRecHit2DSOAView::hindex_type) == - sizeof(typename TrackingRecHit2DSOAView::PhiBinner::index_type)); - - auto get32 = [&](Storage32 i) { return m_store32.get() + static_cast(i) * nHits; }; - - // copy all the pointers - m_phiBinner = view->m_phiBinner = m_PhiBinnerStore.get(); - m_phiBinnerStorage = view->m_phiBinnerStorage = - reinterpret_cast(get32(Storage32::kPhiStorage)); - - view->m_xl = get32(Storage32::kXLocal); - view->m_yl = get32(Storage32::kYLocal); - view->m_xerr = get32(Storage32::kXerror); - view->m_yerr = get32(Storage32::kYerror); - view->m_chargeAndStatus = reinterpret_cast(get32(Storage32::kCharge)); - - view->m_xg = get32(Storage32::kXGlobal); - view->m_yg = get32(Storage32::kYGlobal); - view->m_zg = get32(Storage32::kZGlobal); - view->m_rg = get32(Storage32::kRGlobal); - - auto get16 = [&](Storage16 i) { return m_store16.get() + static_cast(i) * nHits; }; - m_iphi = view->m_iphi = reinterpret_cast(get16(Storage16::kPhi)); - - view->m_xsize = reinterpret_cast(get16(Storage16::kXSize)); - view->m_ysize = reinterpret_cast(get16(Storage16::kYSize)); - view->m_detInd = get16(Storage16::kDetId); - - m_phiBinner = view->m_phiBinner = m_PhiBinnerStore.get(); - m_hitsLayerStart = view->m_hitsLayerStart = reinterpret_cast(get32(Storage32::kLayers)); - - // transfer view - if constexpr (std::is_same_v) { - cms::cuda::copyAsync(m_view, view, stream); - } else { - m_view.reset(view.release()); // NOLINT: std::move() breaks CUDA version - } -} - -template -TrackingRecHit2DHostT::TrackingRecHit2DHostT( - uint32_t nHits, - int32_t offsetBPIX2, - 
pixelCPEforGPU::ParamsOnGPUT const* cpeParams, - uint32_t const* hitsModuleStart, - cudaStream_t stream, - TrackingRecHit2DHeterogeneousT const* input) { - using TrackingRecHit2DSOAView = TrackingRecHit2DSOAViewT; - - this->m_nHits = nHits; - this->m_offsetBPIX2 = offsetBPIX2; - this->m_hitsModuleStart = hitsModuleStart; - - auto view = cms::cuda::make_host_unique(stream); - - view->m_nHits = nHits; - this->m_view = - cms::cuda::make_host_unique(stream); // leave it on host and pass it by value? - this->m_AverageGeometryStore = cms::cuda::make_host_unique(stream); - view->m_averageGeometry = this->m_AverageGeometryStore.get(); - view->m_cpeParams = cpeParams; - view->m_hitsModuleStart = hitsModuleStart; - - // if empy do not bother - if (0 == nHits) { - this->m_view.reset(view.release()); // NOLINT: std::move() breaks CUDA version - return; - } - - this->m_store32 = cms::cuda::make_host_unique(5 * input->nHits(), stream); - cms::cuda::copyAsync(this->m_store32, input->m_store32, 5 * input->nHits(), stream); - - static_assert(sizeof(typename TrackingRecHit2DSOAView::hindex_type) == sizeof(float)); - static_assert(sizeof(typename TrackingRecHit2DSOAView::hindex_type) == - sizeof(typename TrackingRecHit2DSOAView::PhiBinner::index_type)); - - auto get32 = [&](Storage32 i) { return this->m_store32.get() + static_cast(i) * nHits; }; - - // copy all the pointers - this->m_phiBinner = view->m_phiBinner = this->m_PhiBinnerStore.get(); - this->m_phiBinnerStorage = view->m_phiBinnerStorage = - reinterpret_cast(get32(Storage32::kPhiStorage)); - - view->m_xl = get32(Storage32::kXLocal); - view->m_yl = get32(Storage32::kYLocal); - view->m_xerr = get32(Storage32::kXerror); - view->m_yerr = get32(Storage32::kYerror); - view->m_chargeAndStatus = reinterpret_cast(get32(Storage32::kCharge)); - - this->m_view = std::move(view); -} - -//this is intended to be used only for CPU SoA but doesn't hurt to have it for all cases -template -TrackingRecHit2DHeterogeneousT::TrackingRecHit2DHeterogeneousT( - cms::cuda::host::unique_ptr& store32, - cms::cuda::host::unique_ptr& store16, - uint32_t* modules, - int nHits, - cudaStream_t stream) - : m_nHits(nHits), m_hitsModuleStart(modules) { - auto view = Traits::template make_host_unique(stream); - - m_view = Traits::template make_unique(stream); - - view->m_nHits = nHits; - - if (0 == nHits) { - if constexpr (std::is_same_v) { - cms::cuda::copyAsync(m_view, view, stream); - } else { - m_view = std::move(view); - } - return; - } - - m_store16 = Traits::template make_unique(nHits * n16, stream); - m_store32 = Traits::template make_unique(nHits * n32, stream); - m_PhiBinnerStore = Traits::template make_unique(stream); - m_AverageGeometryStore = Traits::template make_unique(stream); - - view->m_averageGeometry = m_AverageGeometryStore.get(); - view->m_hitsModuleStart = m_hitsModuleStart; - - //store transfer - if constexpr (std::is_same_v) { - cms::cuda::copyAsync(m_store16, store16, static_cast(n16 * nHits), stream); - cms::cuda::copyAsync(m_store32, store32, static_cast(n32 * nHits), stream); - - } else { - std::copy(store32.get(), store32.get() + nHits * n32, m_store32.get()); // want to copy it - std::copy(store16.get(), store16.get() + nHits * n16, m_store16.get()); - } - - //getters - auto get32 = [&](Storage32 i) { return m_store32.get() + static_cast(i) * nHits; }; - auto get16 = [&](Storage16 i) { return m_store16.get() + static_cast(i) * nHits; }; - - //Store 32 - view->m_xl = get32(Storage32::kXLocal); - view->m_yl = get32(Storage32::kYLocal); - view->m_xerr = 
get32(Storage32::kXerror); - view->m_yerr = get32(Storage32::kYerror); - view->m_chargeAndStatus = reinterpret_cast(get32(Storage32::kCharge)); - view->m_xg = get32(Storage32::kXGlobal); - view->m_yg = get32(Storage32::kYGlobal); - view->m_zg = get32(Storage32::kZGlobal); - view->m_rg = get32(Storage32::kRGlobal); - - m_phiBinner = view->m_phiBinner = m_PhiBinnerStore.get(); - m_phiBinnerStorage = view->m_phiBinnerStorage = - reinterpret_cast(get32(Storage32::kPhiStorage)); - - //Store 16 - view->m_detInd = get16(Storage16::kDetId); - m_iphi = view->m_iphi = reinterpret_cast(get16(Storage16::kPhi)); - view->m_xsize = reinterpret_cast(get16(Storage16::kXSize)); - view->m_ysize = reinterpret_cast(get16(Storage16::kYSize)); - - // transfer view - if constexpr (std::is_same_v) { - cms::cuda::copyAsync(m_view, view, stream); - } else { - m_view = std::move(view); - } -} - -//Classes definition for Phase1/Phase2, to make the classes_def lighter. Not actually used in the code. -using TrackingRecHit2DGPUPhase1 = TrackingRecHit2DGPUT; -using TrackingRecHit2DCPUPhase1 = TrackingRecHit2DCPUT; -using TrackingRecHit2DHostPhase1 = TrackingRecHit2DHostT; - -using TrackingRecHit2DGPUPhase2 = TrackingRecHit2DGPUT; -using TrackingRecHit2DCPUPhase2 = TrackingRecHit2DCPUT; -using TrackingRecHit2DHostPhase2 = TrackingRecHit2DHostT; - -#endif // CUDADataFormats_TrackingRecHit_interface_TrackingRecHit2DHeterogeneousT_h diff --git a/CUDADataFormats/TrackingRecHit/interface/TrackingRecHit2DReduced.h b/CUDADataFormats/TrackingRecHit/interface/TrackingRecHit2DReduced.h deleted file mode 100644 index 8fd2bc54cfad7..0000000000000 --- a/CUDADataFormats/TrackingRecHit/interface/TrackingRecHit2DReduced.h +++ /dev/null @@ -1,59 +0,0 @@ -#ifndef CUDADataFormats_TrackingRecHit_interface_TrackingRecHit2DReducedT_h -#define CUDADataFormats_TrackingRecHit_interface_TrackingRecHit2DReducedT_h - -#include "CUDADataFormats/TrackingRecHit/interface/TrackingRecHit2DSOAView.h" -#include "CUDADataFormats/Common/interface/HostProduct.h" - -// a reduced (in content and therefore in size) version to be used on CPU for Legacy reconstruction -template -class TrackingRecHit2DReducedT { - using TrackingRecHit2DSOAView = TrackingRecHit2DSOAViewT; - -public: - using HLPstorage = HostProduct; - using HIDstorage = HostProduct; - - template - TrackingRecHit2DReducedT(UP32&& istore32, UP16&& istore16, int nhits) - : m_store32(std::move(istore32)), m_store16(std::move(istore16)), m_nHits(nhits) { - auto get32 = [&](int i) { return const_cast(m_store32.get()) + i * nhits; }; - - // copy all the pointers (better be in sync with the producer store) - - m_view.m_xl = get32(0); - m_view.m_yl = get32(1); - m_view.m_xerr = get32(2); - m_view.m_yerr = get32(3); - m_view.m_chargeAndStatus = reinterpret_cast(get32(4)); - m_view.m_detInd = const_cast(m_store16.get()); - } - - // view only! 
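// (this overload only copies the SoA view; m_store32/m_store16 stay empty, so the
//  caller must keep the original storage alive for the lifetime of the reduced object)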
- TrackingRecHit2DReducedT(TrackingRecHit2DSOAView const& iview, int nhits) : m_view(iview), m_nHits(nhits) {} - - TrackingRecHit2DReducedT() = default; - ~TrackingRecHit2DReducedT() = default; - - TrackingRecHit2DReducedT(const TrackingRecHit2DReducedT&) = delete; - TrackingRecHit2DReducedT& operator=(const TrackingRecHit2DReducedT&) = delete; - TrackingRecHit2DReducedT(TrackingRecHit2DReducedT&&) = default; - TrackingRecHit2DReducedT& operator=(TrackingRecHit2DReducedT&&) = default; - - TrackingRecHit2DSOAView& view() { return m_view; } - TrackingRecHit2DSOAView const& view() const { return m_view; } - - auto nHits() const { return m_nHits; } - -private: - TrackingRecHit2DSOAView m_view; - - HLPstorage m_store32; - HIDstorage m_store16; - - int m_nHits; -}; - -using TrackingRecHit2DReducedPhase1 = TrackingRecHit2DReducedT; -using TrackingRecHit2DReducedPhase2 = TrackingRecHit2DReducedT; - -#endif diff --git a/CUDADataFormats/TrackingRecHit/interface/TrackingRecHit2DSOAView.h b/CUDADataFormats/TrackingRecHit/interface/TrackingRecHit2DSOAView.h deleted file mode 100644 index 59b7cb1337fdf..0000000000000 --- a/CUDADataFormats/TrackingRecHit/interface/TrackingRecHit2DSOAView.h +++ /dev/null @@ -1,131 +0,0 @@ -#ifndef CUDADataFormats_TrackingRecHit_interface_TrackingRecHit2DSOAView_h -#define CUDADataFormats_TrackingRecHit_interface_TrackingRecHit2DSOAView_h - -#include - -#include "CUDADataFormats/SiPixelCluster/interface/gpuClusteringConstants.h" -#include "HeterogeneousCore/CUDAUtilities/interface/HistoContainer.h" -#include "HeterogeneousCore/CUDAUtilities/interface/cudaCompat.h" -#include "Geometry/CommonTopologies/interface/SimplePixelTopology.h" -#include "CUDADataFormats/TrackingRecHit/interface/SiPixelHitStatus.h" - -namespace pixelCPEforGPU { - template - struct ParamsOnGPUT; -} - -template -class TrackingRecHit2DSOAViewT { -public: - using Status = SiPixelHitStatus; - static_assert(sizeof(Status) == sizeof(uint8_t)); - - using hindex_type = typename TrackerTraits::hindex_type; - using PhiBinner = cms::cuda::HistoContainer; //28 for phase2 geometry - using AverageGeometry = pixelTopology::AverageGeometryT; - using ParamsOnGPU = pixelCPEforGPU::ParamsOnGPUT; - - template - friend class TrackingRecHit2DHeterogeneousT; - template - friend class TrackingRecHit2DHostT; - // template - // friend class TrackingRecHit2DReducedT; - - __device__ __forceinline__ uint32_t nHits() const { return m_nHits; } - - __device__ __forceinline__ float& xLocal(int i) { return m_xl[i]; } - __device__ __forceinline__ float xLocal(int i) const { return __ldg(m_xl + i); } - __device__ __forceinline__ float& yLocal(int i) { return m_yl[i]; } - __device__ __forceinline__ float yLocal(int i) const { return __ldg(m_yl + i); } - - __device__ __forceinline__ float& xerrLocal(int i) { return m_xerr[i]; } - __device__ __forceinline__ float xerrLocal(int i) const { return __ldg(m_xerr + i); } - __device__ __forceinline__ float& yerrLocal(int i) { return m_yerr[i]; } - __device__ __forceinline__ float yerrLocal(int i) const { return __ldg(m_yerr + i); } - - __device__ __forceinline__ float& xGlobal(int i) { return m_xg[i]; } - __device__ __forceinline__ float xGlobal(int i) const { return __ldg(m_xg + i); } - __device__ __forceinline__ float& yGlobal(int i) { return m_yg[i]; } - __device__ __forceinline__ float yGlobal(int i) const { return __ldg(m_yg + i); } - __device__ __forceinline__ float& zGlobal(int i) { return m_zg[i]; } - __device__ __forceinline__ float zGlobal(int i) const { return __ldg(m_zg + i); } - 
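// (note on the accessor pattern of this legacy view: the non-const overloads return
//  references so producer kernels can fill the SoA in place, while the const overloads
//  read through __ldg() so consumers fetch via the read-only data cache)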
__device__ __forceinline__ float& rGlobal(int i) { return m_rg[i]; } - __device__ __forceinline__ float rGlobal(int i) const { return __ldg(m_rg + i); } - - __device__ __forceinline__ int16_t& iphi(int i) { return m_iphi[i]; } - __device__ __forceinline__ int16_t iphi(int i) const { return __ldg(m_iphi + i); } - - __device__ __forceinline__ void setChargeAndStatus(int i, uint32_t ich, Status is) { - ich = std::min(ich, chargeMask()); - uint32_t w = *reinterpret_cast(&is); - ich |= (w << 24); - m_chargeAndStatus[i] = ich; - } - - __device__ __forceinline__ uint32_t charge(int i) const { return __ldg(m_chargeAndStatus + i) & 0xFFFFFF; } - - __device__ __forceinline__ Status status(int i) const { - uint8_t w = __ldg(m_chargeAndStatus + i) >> 24; - return *reinterpret_cast(&w); - } - - __device__ __forceinline__ int16_t& clusterSizeX(int i) { return m_xsize[i]; } - __device__ __forceinline__ int16_t clusterSizeX(int i) const { return __ldg(m_xsize + i); } - __device__ __forceinline__ int16_t& clusterSizeY(int i) { return m_ysize[i]; } - __device__ __forceinline__ int16_t clusterSizeY(int i) const { return __ldg(m_ysize + i); } - __device__ __forceinline__ uint16_t& detectorIndex(int i) { return m_detInd[i]; } - __device__ __forceinline__ uint16_t detectorIndex(int i) const { return __ldg(m_detInd + i); } - - __device__ __forceinline__ ParamsOnGPU const& cpeParams() const { return *m_cpeParams; } - - __device__ __forceinline__ uint32_t hitsModuleStart(int i) const { return __ldg(m_hitsModuleStart + i); } - - __device__ __forceinline__ uint32_t* hitsLayerStart() { return m_hitsLayerStart; } - __device__ __forceinline__ uint32_t const* hitsLayerStart() const { return m_hitsLayerStart; } - - __device__ __forceinline__ PhiBinner& phiBinner() { return *m_phiBinner; } - __device__ __forceinline__ PhiBinner const& phiBinner() const { return *m_phiBinner; } - - __device__ __forceinline__ AverageGeometry& averageGeometry() { return *m_averageGeometry; } - __device__ __forceinline__ AverageGeometry const& averageGeometry() const { return *m_averageGeometry; } - - __device__ __forceinline__ bool clusterCut(int i, int o, bool debug = false) const { return false; } - __device__ __forceinline__ bool zSizeCut(int i, int o, bool debug = false) const { return false; } - -private: - // local coord - float *m_xl, *m_yl; - float *m_xerr, *m_yerr; - - // global coord - float *m_xg, *m_yg, *m_zg, *m_rg; - int16_t* m_iphi; - - // cluster properties - static constexpr uint32_t chargeMask() { return (1 << 24) - 1; } - uint32_t* m_chargeAndStatus; - int16_t* m_xsize; - int16_t* m_ysize; - uint16_t* m_detInd; - - // supporting objects - // m_averageGeometry is corrected for beam spot, not sure where to host it otherwise - AverageGeometry* m_averageGeometry; // owned by TrackingRecHit2DHeterogeneous - ParamsOnGPU const* m_cpeParams; // forwarded from setup, NOT owned - uint32_t const* m_hitsModuleStart; // forwarded from clusters - - uint32_t* m_hitsLayerStart; - - PhiBinner* m_phiBinner; - typename PhiBinner::index_type* m_phiBinnerStorage; - - uint32_t m_nHits; -}; - -#endif // CUDADataFormats_TrackingRecHit_interface_TrackingRecHit2DSOAView_h diff --git a/CUDADataFormats/TrackingRecHit/interface/TrackingRecHitSoADevice.h b/CUDADataFormats/TrackingRecHit/interface/TrackingRecHitSoADevice.h new file mode 100644 index 0000000000000..a64f017876439 --- /dev/null +++ b/CUDADataFormats/TrackingRecHit/interface/TrackingRecHitSoADevice.h @@ -0,0 +1,94 @@ +#ifndef CUDADataFormats_RecHits_TrackingRecHitsDevice_h +#define 
CUDADataFormats_RecHits_TrackingRecHitsDevice_h + +#include + +#include "CUDADataFormats/TrackingRecHit/interface/TrackingRecHitsUtilities.h" +#include "CUDADataFormats/Common/interface/PortableDeviceCollection.h" +#include "HeterogeneousCore/CUDAUtilities/interface/host_unique_ptr.h" +#include "HeterogeneousCore/CUDAUtilities/interface/cudaCheck.h" + +template +class TrackingRecHitSoADevice : public cms::cuda::PortableDeviceCollection> { +public: + using hitSoA = TrackingRecHitSoA; + //Need to decorate the class with the inherited portable accessors being now a template + using cms::cuda::PortableDeviceCollection>::view; + using cms::cuda::PortableDeviceCollection>::const_view; + using cms::cuda::PortableDeviceCollection>::buffer; + using cms::cuda::PortableDeviceCollection>::bufferSize; + + TrackingRecHitSoADevice() = default; // cms::cuda::Product needs this + + using AverageGeometry = typename hitSoA::AverageGeometry; + using ParamsOnGPU = typename hitSoA::ParamsOnGPU; + using PhiBinnerStorageType = typename hitSoA::PhiBinnerStorageType; + using PhiBinner = typename hitSoA::PhiBinner; + // Constructor which specifies the SoA size + explicit TrackingRecHitSoADevice(uint32_t nHits, + int32_t offsetBPIX2, + ParamsOnGPU const* cpeParams, + uint32_t const* hitsModuleStart, + cudaStream_t stream) + : cms::cuda::PortableDeviceCollection>(nHits, stream), + nHits_(nHits), + cpeParams_(cpeParams), + hitsModuleStart_(hitsModuleStart), + offsetBPIX2_(offsetBPIX2) { + phiBinner_ = &(view().phiBinner()); + cudaCheck(cudaMemcpyAsync(&(view().nHits()), &nHits, sizeof(uint32_t), cudaMemcpyDefault, stream)); + // hitsModuleStart is on Device + cudaCheck(cudaMemcpyAsync(view().hitsModuleStart().data(), + hitsModuleStart, + sizeof(uint32_t) * int(TrackerTraits::numberOfModules + 1), + cudaMemcpyDefault, + stream)); + cudaCheck(cudaMemcpyAsync(&(view().offsetBPIX2()), &offsetBPIX2, sizeof(int32_t), cudaMemcpyDefault, stream)); + + // cpeParams argument is a pointer to device memory, copy + // its contents into the Layout. + cudaCheck(cudaMemcpyAsync(&(view().cpeParams()), cpeParams, int(sizeof(ParamsOnGPU)), cudaMemcpyDefault, stream)); + } + + uint32_t nHits() const { return nHits_; } //go to size of view + + cms::cuda::host::unique_ptr localCoordToHostAsync(cudaStream_t stream) const { + auto ret = cms::cuda::make_host_unique(4 * nHits(), stream); + size_t rowSize = sizeof(float) * nHits(); + cudaCheck(cudaMemcpyAsync(ret.get(), view().xLocal(), rowSize * 4, cudaMemcpyDefault, stream)); + + return ret; + } //move to utilities + + cms::cuda::host::unique_ptr hitsModuleStartToHostAsync(cudaStream_t stream) const { + auto ret = cms::cuda::make_host_unique(TrackerTraits::numberOfModules + 1, stream); + cudaCheck(cudaMemcpyAsync(ret.get(), + view().hitsModuleStart().data(), + sizeof(uint32_t) * (TrackerTraits::numberOfModules + 1), + cudaMemcpyDefault, + stream)); + return ret; + } + + auto phiBinnerStorage() { return phiBinnerStorage_; } + auto hitsModuleStart() const { return hitsModuleStart_; } + uint32_t offsetBPIX2() const { return offsetBPIX2_; } + auto phiBinner() { return phiBinner_; } + +private: + uint32_t nHits_; //Needed for the host SoA size + + //TODO: this is used not that much from the hits (only once in BrokenLineFit), would make sens to remove it from this class. 
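// (cpeParams_ below only caches the device pointer passed to the constructor; the
//  parameters themselves have already been copied into the cpeParams() scalar of the SoA)
//
// Minimal usage sketch, assuming device-resident cpeParams_d / hitsModuleStart_d and a
// user kernel fillHits taking a TrackingRecHitSoAView (illustration only, not part of this file):
//
//   TrackingRecHitSoADevice<pixelTopology::Phase1> hits_d(nHits, offsetBPIX2, cpeParams_d, hitsModuleStart_d, stream);
//   fillHits<<<blocks, threads, 0, stream>>>(hits_d.view());
//   auto lc_h = hits_d.localCoordToHostAsync(stream);  // 4 * nHits floats: xLocal, yLocal, xerrLocal, yerrLocal
//   cudaCheck(cudaStreamSynchronize(stream));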
+ ParamsOnGPU const* cpeParams_; + uint32_t const* hitsModuleStart_; + uint32_t offsetBPIX2_; + + PhiBinnerStorageType* phiBinnerStorage_; + PhiBinner* phiBinner_; +}; + +//Classes definition for Phase1/Phase2, to make the classes_def lighter. Not actually used in the code. +using TrackingRecHitSoADevicePhase1 = TrackingRecHitSoADevice; +using TrackingRecHitSoADevicePhase2 = TrackingRecHitSoADevice; + +#endif // CUDADataFormats_Track_TrackHeterogeneousT_H diff --git a/CUDADataFormats/TrackingRecHit/interface/TrackingRecHitSoAHost.h b/CUDADataFormats/TrackingRecHit/interface/TrackingRecHitSoAHost.h new file mode 100644 index 0000000000000..f8bbe61f4a781 --- /dev/null +++ b/CUDADataFormats/TrackingRecHit/interface/TrackingRecHitSoAHost.h @@ -0,0 +1,80 @@ +#ifndef CUDADataFormats_RecHits_TrackingRecHitsHost_h +#define CUDADataFormats_RecHits_TrackingRecHitsHost_h + +#include + +#include "CUDADataFormats/TrackingRecHit/interface/TrackingRecHitsUtilities.h" +#include "CUDADataFormats/Common/interface/PortableHostCollection.h" +#include "HeterogeneousCore/CUDAUtilities/interface/host_unique_ptr.h" +#include "HeterogeneousCore/CUDAUtilities/interface/cudaCheck.h" + +template +class TrackingRecHitSoAHost : public cms::cuda::PortableHostCollection> { +public: + using hitSoA = TrackingRecHitSoA; + //Need to decorate the class with the inherited portable accessors being now a template + using cms::cuda::PortableHostCollection>::view; + using cms::cuda::PortableHostCollection>::const_view; + using cms::cuda::PortableHostCollection>::buffer; + using cms::cuda::PortableHostCollection>::bufferSize; + + TrackingRecHitSoAHost() = default; + + using AverageGeometry = typename hitSoA::AverageGeometry; + using ParamsOnGPU = typename hitSoA::ParamsOnGPU; + using PhiBinnerStorageType = typename hitSoA::PhiBinnerStorageType; + using PhiBinner = typename hitSoA::PhiBinner; + + // This SoA Host is used basically only for DQM + // so we just need a slim constructor + explicit TrackingRecHitSoAHost(uint32_t nHits) + : cms::cuda::PortableHostCollection>(nHits) {} + + explicit TrackingRecHitSoAHost(uint32_t nHits, cudaStream_t stream) + : cms::cuda::PortableHostCollection>(nHits, stream) {} + + explicit TrackingRecHitSoAHost(uint32_t nHits, + int32_t offsetBPIX2, + ParamsOnGPU const* cpeParams, + uint32_t const* hitsModuleStart) + : cms::cuda::PortableHostCollection>(nHits), + nHits_(nHits), + cpeParams_(cpeParams), + offsetBPIX2_(offsetBPIX2) { + view().nHits() = nHits; + std::copy(hitsModuleStart, hitsModuleStart + TrackerTraits::numberOfModules + 1, view().hitsModuleStart().begin()); + memcpy(&(view().cpeParams()), cpeParams, sizeof(ParamsOnGPU)); + view().offsetBPIX2() = offsetBPIX2; + } + + explicit TrackingRecHitSoAHost(uint32_t nHits, + int32_t offsetBPIX2, + ParamsOnGPU const* cpeParams, + uint32_t const* hitsModuleStart, + cudaStream_t stream) + : cms::cuda::PortableHostCollection>(nHits, stream), + nHits_(nHits), + cpeParams_(cpeParams), + offsetBPIX2_(offsetBPIX2) { + view().nHits() = nHits; + std::copy(hitsModuleStart, hitsModuleStart + TrackerTraits::numberOfModules + 1, view().hitsModuleStart().begin()); + memcpy(&(view().cpeParams()), cpeParams, sizeof(ParamsOnGPU)); + view().offsetBPIX2() = offsetBPIX2; + } + + uint32_t nHits() const { return nHits_; } + uint32_t offsetBPIX2() const { return offsetBPIX2_; } + auto phiBinnerStorage() { return phiBinnerStorage_; } + +private: + uint32_t nHits_; //Needed for the host SoA size + ParamsOnGPU const* cpeParams_; + uint32_t offsetBPIX2_; + + 
PhiBinnerStorageType* phiBinnerStorage_; +}; + +using TrackingRecHitSoAHostPhase1 = TrackingRecHitSoAHost; +using TrackingRecHitSoAHostPhase2 = TrackingRecHitSoAHost; + +#endif // CUDADataFormats_Track_TrackHeterogeneousT_H diff --git a/CUDADataFormats/TrackingRecHit/interface/TrackingRecHitsUtilities.h b/CUDADataFormats/TrackingRecHit/interface/TrackingRecHitsUtilities.h new file mode 100644 index 0000000000000..7e28cb97becc8 --- /dev/null +++ b/CUDADataFormats/TrackingRecHit/interface/TrackingRecHitsUtilities.h @@ -0,0 +1,66 @@ +#ifndef CUDADataFormats_RecHits_TrackingRecHitsUtilities_h +#define CUDADataFormats_RecHits_TrackingRecHitsUtilities_h + +#include +#include "Geometry/CommonTopologies/interface/SimplePixelTopology.h" +#include "HeterogeneousCore/CUDAUtilities/interface/HistoContainer.h" +#include "DataFormats/SoATemplate/interface/SoALayout.h" +#include "RecoLocalTracker/SiPixelRecHits/interface/pixelCPEforGPU.h" +#include "HeterogeneousCore/CUDAUtilities/interface/host_unique_ptr.h" +#include "SiPixelHitStatus.h" + +template +struct TrackingRecHitSoA { + using hindex_type = typename TrackerTraits::hindex_type; + using PhiBinner = cms::cuda::HistoContainer; //28 for phase2 geometry + + using PhiBinnerStorageType = typename PhiBinner::index_type; + using AverageGeometry = pixelTopology::AverageGeometryT; + using ParamsOnGPU = pixelCPEforGPU::ParamsOnGPUT; + + using HitLayerStartArray = std::array; + using HitModuleStartArray = std::array; + + //Is it better to have two split? + GENERATE_SOA_LAYOUT(TrackingRecHitSoALayout, + SOA_COLUMN(float, xLocal), + SOA_COLUMN(float, yLocal), + SOA_COLUMN(float, xerrLocal), + SOA_COLUMN(float, yerrLocal), + SOA_COLUMN(float, xGlobal), + SOA_COLUMN(float, yGlobal), + SOA_COLUMN(float, zGlobal), + SOA_COLUMN(float, rGlobal), + SOA_COLUMN(int16_t, iphi), + SOA_COLUMN(SiPixelHitStatusAndCharge, chargeAndStatus), + SOA_COLUMN(int16_t, clusterSizeX), + SOA_COLUMN(int16_t, clusterSizeY), + SOA_COLUMN(uint16_t, detectorIndex), + + SOA_SCALAR(uint32_t, nHits), + SOA_SCALAR(int32_t, offsetBPIX2), + //These above could be separated in a specific + //layout since they don't depends on the template + //for the moment I'm keeping them here + SOA_COLUMN(PhiBinnerStorageType, phiBinnerStorage), + SOA_SCALAR(HitModuleStartArray, hitsModuleStart), + SOA_SCALAR(HitLayerStartArray, hitsLayerStart), + SOA_SCALAR(ParamsOnGPU, cpeParams), + SOA_SCALAR(AverageGeometry, averageGeometry), + SOA_SCALAR(PhiBinner, phiBinner)); +}; + +template +using TrackingRecHitLayout = typename TrackingRecHitSoA::template TrackingRecHitSoALayout<>; +template +using TrackingRecHitSoAView = typename TrackingRecHitSoA::template TrackingRecHitSoALayout<>::View; +template +using TrackingRecHitSoAConstView = + typename TrackingRecHitSoA::template TrackingRecHitSoALayout<>::ConstView; + +#endif diff --git a/CUDADataFormats/TrackingRecHit/src/TrackingRecHit2DHeterogeneous.cc b/CUDADataFormats/TrackingRecHit/src/TrackingRecHit2DHeterogeneous.cc deleted file mode 100644 index 05c3eba3d8bde..0000000000000 --- a/CUDADataFormats/TrackingRecHit/src/TrackingRecHit2DHeterogeneous.cc +++ /dev/null @@ -1,49 +0,0 @@ -#include "CUDADataFormats/TrackingRecHit/interface/TrackingRecHit2DHeterogeneous.h" -#include "HeterogeneousCore/CUDAUtilities/interface/copyAsync.h" -#include "HeterogeneousCore/CUDAUtilities/interface/cudaCheck.h" -#include "HeterogeneousCore/CUDAUtilities/interface/device_unique_ptr.h" -#include "HeterogeneousCore/CUDAUtilities/interface/host_unique_ptr.h" - -template 
-cms::cuda::host::unique_ptr TrackingRecHit2DGPUT::localCoordToHostAsync( - cudaStream_t stream) const { - auto ret = cms::cuda::make_host_unique(5 * this->nHits(), stream); - cms::cuda::copyAsync(ret, this->m_store32, 5 * this->nHits(), stream); - return ret; -} - -template -cms::cuda::host::unique_ptr TrackingRecHit2DGPUT::store32ToHostAsync(cudaStream_t stream) const { - auto ret = cms::cuda::make_host_unique(static_cast(this->n32) * this->nHits(), stream); - cms::cuda::copyAsync(ret, this->m_store32, static_cast(this->n32) * this->nHits(), stream); - return ret; -} - -template -cms::cuda::host::unique_ptr TrackingRecHit2DGPUT::store16ToHostAsync( - cudaStream_t stream) const { - auto ret = cms::cuda::make_host_unique(static_cast(this->n16) * this->nHits(), stream); - cms::cuda::copyAsync(ret, this->m_store16, static_cast(this->n16) * this->nHits(), stream); - return ret; -} - -template -cms::cuda::host::unique_ptr TrackingRecHit2DGPUT::hitsModuleStartToHostAsync( - cudaStream_t stream) const { - auto ret = cms::cuda::make_host_unique(TrackerTraits::numberOfModules + 1, stream); - cudaCheck(cudaMemcpyAsync(ret.get(), - this->m_hitsModuleStart, - sizeof(uint32_t) * (TrackerTraits::numberOfModules + 1), - cudaMemcpyDefault, - stream)); - return ret; -} - -template class TrackingRecHit2DGPUT; -template class TrackingRecHit2DGPUT; - -template class TrackingRecHit2DCPUT; -template class TrackingRecHit2DCPUT; - -template class TrackingRecHit2DHostT; -template class TrackingRecHit2DHostT; diff --git a/CUDADataFormats/TrackingRecHit/src/classes.h b/CUDADataFormats/TrackingRecHit/src/classes.h index b9a20695712e3..1f494d0517450 100644 --- a/CUDADataFormats/TrackingRecHit/src/classes.h +++ b/CUDADataFormats/TrackingRecHit/src/classes.h @@ -2,8 +2,8 @@ #define CUDADataFormats_TrackingRecHit_src_classes_h #include "CUDADataFormats/Common/interface/Product.h" -#include "CUDADataFormats/TrackingRecHit/interface/TrackingRecHit2DHeterogeneous.h" -#include "CUDADataFormats/TrackingRecHit/interface/TrackingRecHit2DReduced.h" +#include "CUDADataFormats/TrackingRecHit/interface/TrackingRecHitSoAHost.h" +#include "CUDADataFormats/TrackingRecHit/interface/TrackingRecHitSoADevice.h" #include "DataFormats/Common/interface/Wrapper.h" #endif // CUDADataFormats_TrackingRecHit_src_classes_h diff --git a/CUDADataFormats/TrackingRecHit/src/classes_def.xml b/CUDADataFormats/TrackingRecHit/src/classes_def.xml index 4287860ee8495..6c2389e829549 100644 --- a/CUDADataFormats/TrackingRecHit/src/classes_def.xml +++ b/CUDADataFormats/TrackingRecHit/src/classes_def.xml @@ -1,22 +1,16 @@ - - - - - - - - - - - - + + - - + + - - + + + + + + diff --git a/CUDADataFormats/TrackingRecHit/test/BuildFile.xml b/CUDADataFormats/TrackingRecHit/test/BuildFile.xml index f064563aa7051..7baacbac416a1 100644 --- a/CUDADataFormats/TrackingRecHit/test/BuildFile.xml +++ b/CUDADataFormats/TrackingRecHit/test/BuildFile.xml @@ -1,6 +1,7 @@ + - + diff --git a/CUDADataFormats/TrackingRecHit/test/TrackingRecHit2DCUDAImpl_t.h b/CUDADataFormats/TrackingRecHit/test/TrackingRecHit2DCUDAImpl_t.h deleted file mode 100644 index b2da57c2471ae..0000000000000 --- a/CUDADataFormats/TrackingRecHit/test/TrackingRecHit2DCUDAImpl_t.h +++ /dev/null @@ -1,26 +0,0 @@ -#include "CUDADataFormats/TrackingRecHit/interface/TrackingRecHit2DHeterogeneous.h" - -namespace testTrackingRecHit2D { - - template - __global__ void fill(TrackingRecHit2DSOAViewT* phits) { - assert(phits); - auto& hits = *phits; - assert(hits.nHits() == 200); - - int i = threadIdx.x; - if (i > 200) - 
return; - } - - template - __global__ void verify(TrackingRecHit2DSOAViewT const* phits) { - assert(phits); - auto const& hits = *phits; - assert(hits.nHits() == 200); - - int i = threadIdx.x; - if (i > 200) - return; - } -} // namespace testTrackingRecHit2D diff --git a/CUDADataFormats/TrackingRecHit/test/TrackingRecHit2DCUDA_t.cpp b/CUDADataFormats/TrackingRecHit/test/TrackingRecHit2DCUDA_t.cpp deleted file mode 100644 index 0d910273933dc..0000000000000 --- a/CUDADataFormats/TrackingRecHit/test/TrackingRecHit2DCUDA_t.cpp +++ /dev/null @@ -1,42 +0,0 @@ -#include "CUDADataFormats/TrackingRecHit/interface/TrackingRecHit2DHeterogeneous.h" -#include "HeterogeneousCore/CUDAUtilities/interface/copyAsync.h" -#include "HeterogeneousCore/CUDAUtilities/interface/requireDevices.h" -#include "HeterogeneousCore/CUDAUtilities/interface/cudaCheck.h" -#include "Geometry/CommonTopologies/interface/SimplePixelTopology.h" - -namespace testTrackingRecHit2D { - - template - void runKernels(TrackingRecHit2DSOAViewT* hits); -} // namespace testTrackingRecHit2D - -int main() { - cms::cudatest::requireDevices(); - - cudaStream_t stream; - cudaCheck(cudaStreamCreateWithFlags(&stream, cudaStreamNonBlocking)); - - auto nHits = 200; - // inner scope to deallocate memory before destroying the stream - { - TrackingRecHit2DGPUT tkhit(nHits, 0, nullptr, nullptr, stream); - testTrackingRecHit2D::runKernels(tkhit.view()); - - TrackingRecHit2DGPUT tkhitPhase2(nHits, 0, nullptr, nullptr, stream); - testTrackingRecHit2D::runKernels(tkhitPhase2.view()); - - TrackingRecHit2DHostT tkhitH(nHits, 0, nullptr, nullptr, stream, &tkhit); - cudaStreamSynchronize(stream); - assert(tkhitH.view()); - assert(tkhitH.view()->nHits() == unsigned(nHits)); - - TrackingRecHit2DHostT tkhitHPhase2(nHits, 0, nullptr, nullptr, stream, &tkhitPhase2); - cudaStreamSynchronize(stream); - assert(tkhitHPhase2.view()); - assert(tkhitHPhase2.view()->nHits() == unsigned(nHits)); - } - - cudaCheck(cudaStreamDestroy(stream)); - - return 0; -} diff --git a/CUDADataFormats/TrackingRecHit/test/TrackingRecHit2DCUDA_t.cu b/CUDADataFormats/TrackingRecHit/test/TrackingRecHit2DCUDA_t.cu deleted file mode 100644 index e902ea971edf3..0000000000000 --- a/CUDADataFormats/TrackingRecHit/test/TrackingRecHit2DCUDA_t.cu +++ /dev/null @@ -1,15 +0,0 @@ -#include "CUDADataFormats/TrackingRecHit/interface/TrackingRecHit2DHeterogeneous.h" -#include "TrackingRecHit2DCUDAImpl_t.h" - -namespace testTrackingRecHit2D { - - template - void runKernels(TrackingRecHit2DSOAViewT* hits) { - assert(hits); - fill<<<1, 1024>>>(hits); - verify<<<1, 1024>>>(hits); - } - - template void runKernels(TrackingRecHit2DSOAViewT* hits); - template void runKernels(TrackingRecHit2DSOAViewT* hits); -} // namespace testTrackingRecHit2D diff --git a/CUDADataFormats/TrackingRecHit/test/TrackingRecHitSoA_test.cpp b/CUDADataFormats/TrackingRecHit/test/TrackingRecHitSoA_test.cpp new file mode 100644 index 0000000000000..146bb9133d9d8 --- /dev/null +++ b/CUDADataFormats/TrackingRecHit/test/TrackingRecHitSoA_test.cpp @@ -0,0 +1,50 @@ +#include "CUDADataFormats/TrackingRecHit/interface/TrackingRecHitSoAHost.h" +#include "CUDADataFormats/TrackingRecHit/interface/TrackingRecHitSoADevice.h" + +#include "HeterogeneousCore/CUDAUtilities/interface/copyAsync.h" +#include "HeterogeneousCore/CUDAUtilities/interface/requireDevices.h" +#include "HeterogeneousCore/CUDAUtilities/interface/cudaCheck.h" +#include "HeterogeneousCore/CUDAUtilities/interface/allocate_device.h" +#include 
"Geometry/CommonTopologies/interface/SimplePixelTopology.h" + +namespace testTrackingRecHitSoA { + + template + void runKernels(TrackingRecHitSoADevice& hits, cudaStream_t stream); + +} + +int main() { + using ParamsOnGPU = TrackingRecHitSoADevice::ParamsOnGPU; + cms::cudatest::requireDevices(); + + cudaStream_t stream; + cudaCheck(cudaStreamCreateWithFlags(&stream, cudaStreamDefault)); + + // inner scope to deallocate memory before destroying the stream + { + uint32_t nHits = 2000; + int32_t offset = 100; + uint32_t moduleStart[1856]; + + for (size_t i = 0; i < 1856; i++) { + moduleStart[i] = i * 2; + } + ParamsOnGPU* cpeParams_d; + cudaCheck(cudaMalloc(&cpeParams_d, sizeof(ParamsOnGPU))); + TrackingRecHitSoADevice tkhit(nHits, offset, cpeParams_d, &moduleStart[0], stream); + + testTrackingRecHitSoA::runKernels(tkhit, stream); + printf("tkhit hits %d \n", tkhit.nHits()); + auto test = tkhit.localCoordToHostAsync(stream); + printf("test[9] %.2f\n", test[9]); + + auto ret = tkhit.hitsModuleStartToHostAsync(stream); + printf("mods[9] %d\n", ret[9]); + cudaCheck(cudaFree(cpeParams_d)); + } + + cudaCheck(cudaStreamDestroy(stream)); + + return 0; +} diff --git a/CUDADataFormats/TrackingRecHit/test/TrackingRecHitSoA_test.cu b/CUDADataFormats/TrackingRecHit/test/TrackingRecHitSoA_test.cu new file mode 100644 index 0000000000000..490f30fa6b7bd --- /dev/null +++ b/CUDADataFormats/TrackingRecHit/test/TrackingRecHitSoA_test.cu @@ -0,0 +1,64 @@ +#include "CUDADataFormats/TrackingRecHit/interface/TrackingRecHitsUtilities.h" +#include "CUDADataFormats/TrackingRecHit/interface/TrackingRecHitSoADevice.h" + +namespace testTrackingRecHitSoA { + + template + __global__ void fill(TrackingRecHitSoAView soa) { + int i = threadIdx.x; + int j = blockIdx.x; + if (i == 0 and j == 0) { + soa.offsetBPIX2() = 22; + soa[10].xLocal() = 1.11; + } + + soa[i].iphi() = i % 10; + soa.hitsLayerStart()[j] = j; + __syncthreads(); + } + + template + __global__ void show(TrackingRecHitSoAView soa) { + int i = threadIdx.x; + int j = blockIdx.x; + + if (i == 0 and j == 0) { + printf("nbins = %d \n", soa.phiBinner().nbins()); + printf("offsetBPIX %d ->%d \n", i, soa.offsetBPIX2()); + printf("nHits %d ->%d \n", i, soa.nHits()); + printf("hitsModuleStart %d ->%d \n", i, soa.hitsModuleStart().at(28)); + } + + if (i < 10) // can be increased to soa.nHits() for debugging + printf("iPhi %d ->%d \n", i, soa[i].iphi()); + + if (j * blockDim.x + i < 10) // can be increased to soa.phiBinner().nbins() for debugging + printf(">bin size %d ->%d \n", j * blockDim.x + i, soa.phiBinner().size(j * blockDim.x + i)); + __syncthreads(); + } + + template + void runKernels(TrackingRecHitSoADevice& hits, cudaStream_t stream) { + printf("> RUN!\n"); + fill<<<10, 100, 0, stream>>>(hits.view()); + + cudaCheck(cudaDeviceSynchronize()); + cms::cuda::fillManyFromVector(hits.phiBinner(), + 10, + hits.view().iphi(), + hits.view().hitsLayerStart().data(), + 2000, + 256, + hits.view().phiBinnerStorage(), + stream); + cudaCheck(cudaDeviceSynchronize()); + show<<<10, 1000, 0, stream>>>(hits.view()); + cudaCheck(cudaDeviceSynchronize()); + } + + template void runKernels(TrackingRecHitSoADevice& hits, + cudaStream_t stream); + template void runKernels(TrackingRecHitSoADevice& hits, + cudaStream_t stream); + +} // namespace testTrackingRecHitSoA diff --git a/CUDADataFormats/Vertex/BuildFile.xml b/CUDADataFormats/Vertex/BuildFile.xml index f61e4aff7403f..c6b918ec4b12b 100644 --- a/CUDADataFormats/Vertex/BuildFile.xml +++ b/CUDADataFormats/Vertex/BuildFile.xml @@ -3,6 
+3,7 @@ + diff --git a/CUDADataFormats/Vertex/README.md b/CUDADataFormats/Vertex/README.md new file mode 100644 index 0000000000000..3e495d15f776e --- /dev/null +++ b/CUDADataFormats/Vertex/README.md @@ -0,0 +1,45 @@ +# Vertex CUDA Data Formats + +`CUDADataFormat`s meant to be used on Host (CPU) or Device (CUDA GPU) for +storing information about vertices created during the Pixel-local Reconstruction +chain. It stores data in an SoA manner. It contains the data that was previously +contained in the deprecated `ZVertexSoA` class. + +The host format is inheriting from `CUDADataFormats/Common/interface/PortableHostCollection.h`, +while the device format is inheriting from `CUDADataFormats/Common/interface/PortableDeviceCollection.h` + +Both formats use the same SoA Layout (`ZVertexSoAHeterogeneousLayout`) which is generated +via the `GENERATE_SOA_LAYOUT` macro in the `ZVertexUtilities.h` file. + +## Notes + +- Initially, `ZVertexSoA` had distinct array sizes for each attribute (e.g. `zv` was `MAXVTX` elements +long, `ndof` was `MAXTRACKS` elements long). All columns are now of uniform `MAXTRACKS` size, +meaning that there will be some wasted space (appx. 190kB). +- Host and Device classes should **not** be created via inheritance, as they're done here, +but via composition. See [this discussion](https://github.com/cms-sw/cmssw/pull/40465#discussion_r1066039309). + +## ZVertexHeterogeneousHost + +The version of the data format to be used for storing vertex data on the CPU. +Instances of this class are to be used for: + +- Having a place to copy data to host from device, via `cudaMemcpy`, or +- Running host-side algorithms using data stored in an SoA manner. + +## ZVertexHeterogeneousDevice + +The version of the data format to be used for storing vertex data on the GPU. + +Instances of `ZVertexHeterogeneousDevice` are to be created on host and be +used on device only. To do so, the instance's `view()` method is to be called +to pass a `View` to any kernel launched. Accessing data from the `view()` is not +possible on the host side. + +## Utilities + +Apart from `ZVertexSoAHeterogeneousLayout`, `ZVertexUtilities.h` also contains +a collection of methods which were originally +defined as class methods inside the `ZVertexSoA` class +which have been adapted to operate on `View` instances, so that they are callable +from within `__global__` kernels, on both CPU and CPU. diff --git a/CUDADataFormats/Vertex/interface/ZVertexSoAHeterogeneousDevice.h b/CUDADataFormats/Vertex/interface/ZVertexSoAHeterogeneousDevice.h new file mode 100644 index 0000000000000..ae662d7fd5f9a --- /dev/null +++ b/CUDADataFormats/Vertex/interface/ZVertexSoAHeterogeneousDevice.h @@ -0,0 +1,22 @@ +#ifndef CUDADataFormats_Vertex_ZVertexHeterogeneousDevice_H +#define CUDADataFormats_Vertex_ZVertexHeterogeneousDevice_H + +#include "CUDADataFormats/Vertex/interface/ZVertexUtilities.h" +#include "CUDADataFormats/Common/interface/PortableDeviceCollection.h" + +// TODO: The class is created via inheritance of the PortableDeviceCollection. +// This is generally discouraged, and should be done via composition. 
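// (a composition-based alternative would instead hold the collection as a data member, e.g.
//  cms::cuda::PortableDeviceCollection<ZVertexSoAHeterogeneousLayout<>> collection_;
//  sketch only, not what this patch implements)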
+// See: https://github.com/cms-sw/cmssw/pull/40465#discussion_r1067364306 +template +class ZVertexSoAHeterogeneousDevice : public cms::cuda::PortableDeviceCollection> { +public: + ZVertexSoAHeterogeneousDevice() = default; // cms::cuda::Product needs this + + // Constructor which specifies the SoA size + explicit ZVertexSoAHeterogeneousDevice(cudaStream_t stream) + : PortableDeviceCollection>(S, stream) {} +}; + +using ZVertexSoADevice = ZVertexSoAHeterogeneousDevice; + +#endif // CUDADataFormats_Vertex_ZVertexHeterogeneousDevice_H diff --git a/CUDADataFormats/Vertex/interface/ZVertexSoAHeterogeneousHost.h b/CUDADataFormats/Vertex/interface/ZVertexSoAHeterogeneousHost.h new file mode 100644 index 0000000000000..6b62d615e1d11 --- /dev/null +++ b/CUDADataFormats/Vertex/interface/ZVertexSoAHeterogeneousHost.h @@ -0,0 +1,24 @@ +#ifndef CUDADataFormats_Vertex_ZVertexHeterogeneousHost_H +#define CUDADataFormats_Vertex_ZVertexHeterogeneousHost_H + +#include + +#include "CUDADataFormats/Vertex/interface/ZVertexUtilities.h" +#include "CUDADataFormats/Common/interface/PortableHostCollection.h" + +// TODO: The class is created via inheritance of the PortableHostCollection. +// This is generally discouraged, and should be done via composition. +// See: https://github.com/cms-sw/cmssw/pull/40465#discussion_r1067364306 +template +class ZVertexSoAHeterogeneousHost : public cms::cuda::PortableHostCollection> { +public: + explicit ZVertexSoAHeterogeneousHost() : cms::cuda::PortableHostCollection>(S) {} + + // Constructor which specifies the SoA size and CUDA stream + explicit ZVertexSoAHeterogeneousHost(cudaStream_t stream) + : PortableHostCollection>(S, stream) {} +}; + +using ZVertexSoAHost = ZVertexSoAHeterogeneousHost; + +#endif // CUDADataFormats_Vertex_ZVertexHeterogeneousHost_H diff --git a/CUDADataFormats/Vertex/interface/ZVertexUtilities.h b/CUDADataFormats/Vertex/interface/ZVertexUtilities.h new file mode 100644 index 0000000000000..2403652377971 --- /dev/null +++ b/CUDADataFormats/Vertex/interface/ZVertexUtilities.h @@ -0,0 +1,35 @@ +#ifndef CUDADataFormats_Vertex_ZVertexUtilities_h +#define CUDADataFormats_Vertex_ZVertexUtilities_h + +#include +#include "DataFormats/SoATemplate/interface/SoALayout.h" + +GENERATE_SOA_LAYOUT(ZVertexSoAHeterogeneousLayout, + SOA_COLUMN(int16_t, idv), + SOA_COLUMN(float, zv), + SOA_COLUMN(float, wv), + SOA_COLUMN(float, chi2), + SOA_COLUMN(float, ptv2), + SOA_COLUMN(int32_t, ndof), + SOA_COLUMN(uint16_t, sortInd), + SOA_SCALAR(uint32_t, nvFinal)) + +// Previous ZVertexSoA class methods. +// They operate on View and ConstView of the ZVertexSoA. 
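// End-to-end sketch of the pattern described in the Vertex README above, assuming an
// existing cudaStream_t stream and a user kernel fitVertices taking a ZVertexSoAView
// (illustration only; buffer()/bufferSize() are the PortableCollection helpers used
// elsewhere in this PR):
//
//   ZVertexSoADevice vertices_d(stream);          // device SoA, MAXTRACKS elements
//   fitVertices<<<blocks, threads, 0, stream>>>(vertices_d.view());
//   ZVertexSoAHost vertices_h(stream);            // pinned host SoA with the same layout
//   cudaCheck(cudaMemcpyAsync(vertices_h.buffer().get(), vertices_d.buffer().get(),
//                             vertices_d.bufferSize(), cudaMemcpyDeviceToHost, stream));
//   cudaCheck(cudaStreamSynchronize(stream));
//   uint32_t nv = vertices_h.view().nvFinal();    // number of reconstructed vertices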
+namespace zVertex { + // Common types for both Host and Device code + using ZVertexSoALayout = ZVertexSoAHeterogeneousLayout<>; + using ZVertexSoAView = ZVertexSoAHeterogeneousLayout<>::View; + using ZVertexSoAConstView = ZVertexSoAHeterogeneousLayout<>::ConstView; + + namespace utilities { + + static constexpr uint32_t MAXTRACKS = 128 * 1024; + static constexpr uint32_t MAXVTX = 1024; + + __host__ __device__ inline void init(ZVertexSoAView &vertices) { vertices.nvFinal() = 0; } + + } // namespace utilities +} // namespace zVertex + +#endif diff --git a/CUDADataFormats/Vertex/src/classes.h b/CUDADataFormats/Vertex/src/classes.h index 7931beaa8f4bd..0340affffa06c 100644 --- a/CUDADataFormats/Vertex/src/classes.h +++ b/CUDADataFormats/Vertex/src/classes.h @@ -1,7 +1,8 @@ #ifndef CUDADataFormats_Vertex_src_classes_h #define CUDADataFormats_Vertex_src_classes_h -#include "CUDADataFormats/Vertex/interface/ZVertexHeterogeneous.h" +#include "CUDADataFormats/Vertex/interface/ZVertexSoAHeterogeneousDevice.h" +#include "CUDADataFormats/Vertex/interface/ZVertexSoAHeterogeneousHost.h" #include "CUDADataFormats/Common/interface/Product.h" #include "DataFormats/Common/interface/Wrapper.h" diff --git a/CUDADataFormats/Vertex/src/classes_def.xml b/CUDADataFormats/Vertex/src/classes_def.xml index ea633080af9af..63bd5a1cc94a7 100644 --- a/CUDADataFormats/Vertex/src/classes_def.xml +++ b/CUDADataFormats/Vertex/src/classes_def.xml @@ -1,6 +1,7 @@ - - - - + + + + + diff --git a/DQM/SiPixelHeterogeneous/plugins/SiPixelCompareRecHitsSoA.cc b/DQM/SiPixelHeterogeneous/plugins/SiPixelCompareRecHitsSoA.cc index 9e054ecd17898..c13aa5eb47b42 100644 --- a/DQM/SiPixelHeterogeneous/plugins/SiPixelCompareRecHitsSoA.cc +++ b/DQM/SiPixelHeterogeneous/plugins/SiPixelCompareRecHitsSoA.cc @@ -18,7 +18,8 @@ #include "DQMServices/Core/interface/MonitorElement.h" #include "DQMServices/Core/interface/DQMEDAnalyzer.h" #include "DQMServices/Core/interface/DQMStore.h" -#include "CUDADataFormats/TrackingRecHit/interface/TrackingRecHit2DHeterogeneous.h" +#include "CUDADataFormats/TrackingRecHit/interface/TrackingRecHitSoAHost.h" +#include "CUDADataFormats/TrackingRecHit/interface/TrackingRecHitsUtilities.h" // Geometry #include "Geometry/TrackerGeometryBuilder/interface/TrackerGeometry.h" #include "DataFormats/TrackerCommon/interface/TrackerTopology.h" @@ -30,8 +31,8 @@ template class SiPixelCompareRecHitsSoA : public DQMEDAnalyzer { public: - using HitSoA = TrackingRecHit2DSOAViewT; - using HitsOnCPU = TrackingRecHit2DCPUT; + using HitSoA = TrackingRecHitSoAView; + using HitsOnHost = TrackingRecHitSoAHost; explicit SiPixelCompareRecHitsSoA(const edm::ParameterSet&); ~SiPixelCompareRecHitsSoA() override = default; @@ -43,8 +44,8 @@ class SiPixelCompareRecHitsSoA : public DQMEDAnalyzer { private: const edm::ESGetToken geomToken_; const edm::ESGetToken topoToken_; - const edm::EDGetTokenT tokenSoAHitsCPU_; - const edm::EDGetTokenT tokenSoAHitsGPU_; + const edm::EDGetTokenT tokenSoAHitsHost_; //these two are both on Host but originally they have been + const edm::EDGetTokenT tokenSoAHitsDevice_; //produced on Host or on Device const std::string topFolderName_; const float mind2cut_; static constexpr uint32_t invalidHit_ = std::numeric_limits::max(); @@ -82,8 +83,8 @@ template SiPixelCompareRecHitsSoA::SiPixelCompareRecHitsSoA(const edm::ParameterSet& iConfig) : geomToken_(esConsumes()), topoToken_(esConsumes()), - tokenSoAHitsCPU_(consumes(iConfig.getParameter("pixelHitsSrcCPU"))), - 
tokenSoAHitsGPU_(consumes(iConfig.getParameter("pixelHitsSrcGPU"))), + tokenSoAHitsHost_(consumes(iConfig.getParameter("pixelHitsSrcCPU"))), + tokenSoAHitsDevice_(consumes(iConfig.getParameter("pixelHitsSrcGPU"))), topFolderName_(iConfig.getParameter("topFolderName")), mind2cut_(iConfig.getParameter("minD2cut")) {} // @@ -100,39 +101,41 @@ void SiPixelCompareRecHitsSoA::dqmBeginRun(const edm::Run& iRun, const edm::E // template void SiPixelCompareRecHitsSoA::analyze(const edm::Event& iEvent, const edm::EventSetup& iSetup) { - const auto& rhsoaHandleCPU = iEvent.getHandle(tokenSoAHitsCPU_); - const auto& rhsoaHandleGPU = iEvent.getHandle(tokenSoAHitsGPU_); - if (not rhsoaHandleCPU or not rhsoaHandleGPU) { + const auto& rhsoaHandleHost = iEvent.getHandle(tokenSoAHitsHost_); + const auto& rhsoaHandleDevice = iEvent.getHandle(tokenSoAHitsDevice_); + if (not rhsoaHandleHost or not rhsoaHandleDevice) { edm::LogWarning out("SiPixelCompareRecHitSoA"); - if (not rhsoaHandleCPU) { - out << "reference (cpu) rechits not found; "; + if (not rhsoaHandleHost) { + out << "reference (Host) rechits not found; "; } - if (not rhsoaHandleGPU) { - out << "target (gpu) rechits not found; "; + if (not rhsoaHandleDevice) { + out << "target (Device) rechits not found; "; } out << "the comparison will not run."; return; } - auto const& rhsoaCPU = *rhsoaHandleCPU; - const HitSoA* soa2dCPU = rhsoaCPU.view(); - auto const& rhsoaGPU = *rhsoaHandleGPU; - const HitSoA* soa2dGPU = rhsoaGPU.view(); + auto const& rhsoaHost = *rhsoaHandleHost; + auto const& rhsoaDevice = *rhsoaHandleDevice; - uint32_t nHitsCPU = soa2dCPU->nHits(); - uint32_t nHitsGPU = soa2dGPU->nHits(); - hnHits_->Fill(nHitsCPU, nHitsGPU); + auto const& soa2dHost = rhsoaHost.const_view(); + auto const& soa2dDevice = rhsoaDevice.const_view(); + + uint32_t nHitsHost = soa2dHost.nHits(); + uint32_t nHitsDevice = soa2dDevice.nHits(); + + hnHits_->Fill(nHitsHost, nHitsDevice); auto detIds = tkGeom_->detUnitIds(); - for (uint32_t i = 0; i < nHitsCPU; i++) { + for (uint32_t i = 0; i < nHitsHost; i++) { float minD = mind2cut_; uint32_t matchedHit = invalidHit_; - uint16_t indCPU = soa2dCPU->detectorIndex(i); - float xLocalCPU = soa2dCPU->xLocal(i); - float yLocalCPU = soa2dCPU->yLocal(i); - for (uint32_t j = 0; j < nHitsGPU; j++) { - if (soa2dGPU->detectorIndex(j) == indCPU) { - float dx = xLocalCPU - soa2dGPU->xLocal(j); - float dy = yLocalCPU - soa2dGPU->yLocal(j); + uint16_t indHost = soa2dHost[i].detectorIndex(); + float xLocalHost = soa2dHost[i].xLocal(); + float yLocalHost = soa2dHost[i].yLocal(); + for (uint32_t j = 0; j < nHitsDevice; j++) { + if (soa2dDevice.detectorIndex(j) == indHost) { + float dx = xLocalHost - soa2dDevice[j].xLocal(); + float dy = yLocalHost - soa2dDevice[j].yLocal(); float distance = dx * dx + dy * dy; if (distance < minD) { minD = distance; @@ -140,46 +143,46 @@ void SiPixelCompareRecHitsSoA::analyze(const edm::Event& iEvent, const edm::E } } } - DetId id = detIds[indCPU]; - uint32_t chargeCPU = soa2dCPU->charge(i); - int16_t sizeXCPU = std::ceil(float(std::abs(soa2dCPU->clusterSizeX(i)) / 8.)); - int16_t sizeYCPU = std::ceil(float(std::abs(soa2dCPU->clusterSizeY(i)) / 8.)); - uint32_t chargeGPU = 0; - int16_t sizeXGPU = -99; - int16_t sizeYGPU = -99; - float xLocalGPU = -999.; - float yLocalGPU = -999.; + DetId id = detIds[indHost]; + uint32_t chargeHost = soa2dHost[i].chargeAndStatus().charge; + int16_t sizeXHost = std::ceil(float(std::abs(soa2dHost[i].clusterSizeX()) / 8.)); + int16_t sizeYHost = 
std::ceil(float(std::abs(soa2dHost[i].clusterSizeY()) / 8.)); + uint32_t chargeDevice = 0; + int16_t sizeXDevice = -99; + int16_t sizeYDevice = -99; + float xLocalDevice = -999.; + float yLocalDevice = -999.; if (matchedHit != invalidHit_) { - chargeGPU = soa2dGPU->charge(matchedHit); - sizeXGPU = std::ceil(float(std::abs(soa2dGPU->clusterSizeX(matchedHit)) / 8.)); - sizeYGPU = std::ceil(float(std::abs(soa2dGPU->clusterSizeY(matchedHit)) / 8.)); - xLocalGPU = soa2dGPU->xLocal(matchedHit); - yLocalGPU = soa2dGPU->yLocal(matchedHit); + chargeDevice = soa2dDevice[matchedHit].chargeAndStatus().charge; + sizeXDevice = std::ceil(float(std::abs(soa2dDevice[matchedHit].clusterSizeX()) / 8.)); + sizeYDevice = std::ceil(float(std::abs(soa2dDevice[matchedHit].clusterSizeY()) / 8.)); + xLocalDevice = soa2dDevice[matchedHit].xLocal(); + yLocalDevice = soa2dDevice[matchedHit].yLocal(); } switch (id.subdetId()) { case PixelSubdetector::PixelBarrel: - hBchargeL_[tTopo_->pxbLayer(id) - 1]->Fill(chargeCPU, chargeGPU); - hBsizexL_[tTopo_->pxbLayer(id) - 1]->Fill(sizeXCPU, sizeXGPU); - hBsizeyL_[tTopo_->pxbLayer(id) - 1]->Fill(sizeYCPU, sizeYGPU); - hBposxL_[tTopo_->pxbLayer(id) - 1]->Fill(xLocalCPU, xLocalGPU); - hBposyL_[tTopo_->pxbLayer(id) - 1]->Fill(yLocalCPU, yLocalGPU); - hBchargeDiff_->Fill(chargeCPU - chargeGPU); - hBsizeXDiff_->Fill(sizeXCPU - sizeXGPU); - hBsizeYDiff_->Fill(sizeYCPU - sizeYGPU); - hBposXDiff_->Fill(micron_ * (xLocalCPU - xLocalGPU)); - hBposYDiff_->Fill(micron_ * (yLocalCPU - yLocalGPU)); + hBchargeL_[tTopo_->pxbLayer(id) - 1]->Fill(chargeHost, chargeDevice); + hBsizexL_[tTopo_->pxbLayer(id) - 1]->Fill(sizeXHost, sizeXDevice); + hBsizeyL_[tTopo_->pxbLayer(id) - 1]->Fill(sizeYHost, sizeYDevice); + hBposxL_[tTopo_->pxbLayer(id) - 1]->Fill(xLocalHost, xLocalDevice); + hBposyL_[tTopo_->pxbLayer(id) - 1]->Fill(yLocalHost, yLocalDevice); + hBchargeDiff_->Fill(chargeHost - chargeDevice); + hBsizeXDiff_->Fill(sizeXHost - sizeXDevice); + hBsizeYDiff_->Fill(sizeYHost - sizeYDevice); + hBposXDiff_->Fill(micron_ * (xLocalHost - xLocalDevice)); + hBposYDiff_->Fill(micron_ * (yLocalHost - yLocalDevice)); break; case PixelSubdetector::PixelEndcap: - hFchargeD_[tTopo_->pxfSide(id) - 1][tTopo_->pxfDisk(id) - 1]->Fill(chargeCPU, chargeGPU); - hFsizexD_[tTopo_->pxfSide(id) - 1][tTopo_->pxfDisk(id) - 1]->Fill(sizeXCPU, sizeXGPU); - hFsizeyD_[tTopo_->pxfSide(id) - 1][tTopo_->pxfDisk(id) - 1]->Fill(sizeYCPU, sizeYGPU); - hFposxD_[tTopo_->pxfSide(id) - 1][tTopo_->pxfDisk(id) - 1]->Fill(xLocalCPU, xLocalGPU); - hFposyD_[tTopo_->pxfSide(id) - 1][tTopo_->pxfDisk(id) - 1]->Fill(yLocalCPU, yLocalGPU); - hFchargeDiff_->Fill(chargeCPU - chargeGPU); - hFsizeXDiff_->Fill(sizeXCPU - sizeXGPU); - hFsizeYDiff_->Fill(sizeYCPU - sizeYGPU); - hFposXDiff_->Fill(micron_ * (xLocalCPU - xLocalGPU)); - hFposYDiff_->Fill(micron_ * (yLocalCPU - yLocalGPU)); + hFchargeD_[tTopo_->pxfSide(id) - 1][tTopo_->pxfDisk(id) - 1]->Fill(chargeHost, chargeDevice); + hFsizexD_[tTopo_->pxfSide(id) - 1][tTopo_->pxfDisk(id) - 1]->Fill(sizeXHost, sizeXDevice); + hFsizeyD_[tTopo_->pxfSide(id) - 1][tTopo_->pxfDisk(id) - 1]->Fill(sizeYHost, sizeYDevice); + hFposxD_[tTopo_->pxfSide(id) - 1][tTopo_->pxfDisk(id) - 1]->Fill(xLocalHost, xLocalDevice); + hFposyD_[tTopo_->pxfSide(id) - 1][tTopo_->pxfDisk(id) - 1]->Fill(yLocalHost, yLocalDevice); + hFchargeDiff_->Fill(chargeHost - chargeDevice); + hFsizeXDiff_->Fill(sizeXHost - sizeXDevice); + hFsizeYDiff_->Fill(sizeYHost - sizeYDevice); + hFposXDiff_->Fill(micron_ * (xLocalHost - xLocalDevice)); + 
hFposYDiff_->Fill(micron_ * (yLocalHost - yLocalDevice)); break; } } @@ -197,47 +200,47 @@ void SiPixelCompareRecHitsSoA::bookHistograms(DQMStore::IBooker& iBook, // clang-format off //Global - hnHits_ = iBook.book2I("nHits", "CPUvsGPU RecHits per event;#CPU RecHits;#GPU RecHits", 200, 0, 5000,200, 0, 5000); + hnHits_ = iBook.book2I("nHits", "HostvsDevice RecHits per event;#Host RecHits;#Device RecHits", 200, 0, 5000,200, 0, 5000); //Barrel Layer for(unsigned int il=0;ilnumberOfLayers(PixelSubdetector::PixelBarrel);il++){ - hBchargeL_[il] = iBook.book2I(Form("recHitsBLay%dCharge",il+1), Form("CPUvsGPU RecHits Charge Barrel Layer%d;CPU Charge;GPU Charge",il+1), 250, 0, 100000, 250, 0, 100000); - hBsizexL_[il] = iBook.book2I(Form("recHitsBLay%dSizex",il+1), Form("CPUvsGPU RecHits SizeX Barrel Layer%d;CPU SizeX;GPU SizeX",il+1), 30, 0, 30, 30, 0, 30); - hBsizeyL_[il] = iBook.book2I(Form("recHitsBLay%dSizey",il+1), Form("CPUvsGPU RecHits SizeY Barrel Layer%d;CPU SizeY;GPU SizeY",il+1), 30, 0, 30, 30, 0, 30); - hBposxL_[il] = iBook.book2D(Form("recHitsBLay%dPosx",il+1), Form("CPUvsGPU RecHits x-pos in Barrel Layer%d;CPU pos x;GPU pos x",il+1), 200, -5, 5, 200,-5,5); - hBposyL_[il] = iBook.book2D(Form("recHitsBLay%dPosy",il+1), Form("CPUvsGPU RecHits y-pos in Barrel Layer%d;CPU pos y;GPU pos y",il+1), 200, -5, 5, 200,-5,5); + hBchargeL_[il] = iBook.book2I(Form("recHitsBLay%dCharge",il+1), Form("HostvsDevice RecHits Charge Barrel Layer%d;Host Charge;Device Charge",il+1), 250, 0, 100000, 250, 0, 100000); + hBsizexL_[il] = iBook.book2I(Form("recHitsBLay%dSizex",il+1), Form("HostvsDevice RecHits SizeX Barrel Layer%d;Host SizeX;Device SizeX",il+1), 30, 0, 30, 30, 0, 30); + hBsizeyL_[il] = iBook.book2I(Form("recHitsBLay%dSizey",il+1), Form("HostvsDevice RecHits SizeY Barrel Layer%d;Host SizeY;Device SizeY",il+1), 30, 0, 30, 30, 0, 30); + hBposxL_[il] = iBook.book2D(Form("recHitsBLay%dPosx",il+1), Form("HostvsDevice RecHits x-pos in Barrel Layer%d;Host pos x;Device pos x",il+1), 200, -5, 5, 200,-5,5); + hBposyL_[il] = iBook.book2D(Form("recHitsBLay%dPosy",il+1), Form("HostvsDevice RecHits y-pos in Barrel Layer%d;Host pos y;Device pos y",il+1), 200, -5, 5, 200,-5,5); } //Endcaps //Endcaps Disk for(int is=0;is<2;is++){ int sign=is==0? 
-1:1; for(unsigned int id=0;idnumberOfLayers(PixelSubdetector::PixelEndcap);id++){ - hFchargeD_[is][id] = iBook.book2I(Form("recHitsFDisk%+dCharge",id*sign+sign), Form("CPUvsGPU RecHits Charge Endcaps Disk%+d;CPU Charge;GPU Charge",id*sign+sign), 250, 0, 100000, 250, 0, 100000); - hFsizexD_[is][id] = iBook.book2I(Form("recHitsFDisk%+dSizex",id*sign+sign), Form("CPUvsGPU RecHits SizeX Endcaps Disk%+d;CPU SizeX;GPU SizeX",id*sign+sign), 30, 0, 30, 30, 0, 30); - hFsizeyD_[is][id] = iBook.book2I(Form("recHitsFDisk%+dSizey",id*sign+sign), Form("CPUvsGPU RecHits SizeY Endcaps Disk%+d;CPU SizeY;GPU SizeY",id*sign+sign), 30, 0, 30, 30, 0, 30); - hFposxD_[is][id] = iBook.book2D(Form("recHitsFDisk%+dPosx",id*sign+sign), Form("CPUvsGPU RecHits x-pos Endcaps Disk%+d;CPU pos x;GPU pos x",id*sign+sign), 200, -5, 5, 200, -5, 5); - hFposyD_[is][id] = iBook.book2D(Form("recHitsFDisk%+dPosy",id*sign+sign), Form("CPUvsGPU RecHits y-pos Endcaps Disk%+d;CPU pos y;GPU pos y",id*sign+sign), 200, -5, 5, 200, -5, 5); + hFchargeD_[is][id] = iBook.book2I(Form("recHitsFDisk%+dCharge",id*sign+sign), Form("HostvsDevice RecHits Charge Endcaps Disk%+d;Host Charge;Device Charge",id*sign+sign), 250, 0, 100000, 250, 0, 100000); + hFsizexD_[is][id] = iBook.book2I(Form("recHitsFDisk%+dSizex",id*sign+sign), Form("HostvsDevice RecHits SizeX Endcaps Disk%+d;Host SizeX;Device SizeX",id*sign+sign), 30, 0, 30, 30, 0, 30); + hFsizeyD_[is][id] = iBook.book2I(Form("recHitsFDisk%+dSizey",id*sign+sign), Form("HostvsDevice RecHits SizeY Endcaps Disk%+d;Host SizeY;Device SizeY",id*sign+sign), 30, 0, 30, 30, 0, 30); + hFposxD_[is][id] = iBook.book2D(Form("recHitsFDisk%+dPosx",id*sign+sign), Form("HostvsDevice RecHits x-pos Endcaps Disk%+d;Host pos x;Device pos x",id*sign+sign), 200, -5, 5, 200, -5, 5); + hFposyD_[is][id] = iBook.book2D(Form("recHitsFDisk%+dPosy",id*sign+sign), Form("HostvsDevice RecHits y-pos Endcaps Disk%+d;Host pos y;Device pos y",id*sign+sign), 200, -5, 5, 200, -5, 5); } } //1D differences - hBchargeDiff_ = iBook.book1D("rechitChargeDiffBpix","Charge differnce of rechits in BPix; rechit charge difference (CPU - GPU)", 101, -50.5, 50.5); - hFchargeDiff_ = iBook.book1D("rechitChargeDiffFpix","Charge differnce of rechits in FPix; rechit charge difference (CPU - GPU)", 101, -50.5, 50.5); - hBsizeXDiff_ = iBook.book1D("rechitsizeXDiffBpix","SizeX difference of rechits in BPix; rechit sizex difference (CPU - GPU)", 21, -10.5, 10.5); - hFsizeXDiff_ = iBook.book1D("rechitsizeXDiffFpix","SizeX difference of rechits in FPix; rechit sizex difference (CPU - GPU)", 21, -10.5, 10.5); - hBsizeYDiff_ = iBook.book1D("rechitsizeYDiffBpix","SizeY difference of rechits in BPix; rechit sizey difference (CPU - GPU)", 21, -10.5, 10.5); - hFsizeYDiff_ = iBook.book1D("rechitsizeYDiffFpix","SizeY difference of rechits in FPix; rechit sizey difference (CPU - GPU)", 21, -10.5, 10.5); - hBposXDiff_ = iBook.book1D("rechitsposXDiffBpix","x-position difference of rechits in BPix; rechit x-pos difference (CPU - GPU)", 1000, -10, 10); - hFposXDiff_ = iBook.book1D("rechitsposXDiffFpix","x-position difference of rechits in FPix; rechit x-pos difference (CPU - GPU)", 1000, -10, 10); - hBposYDiff_ = iBook.book1D("rechitsposYDiffBpix","y-position difference of rechits in BPix; rechit y-pos difference (CPU - GPU)", 1000, -10, 10); - hFposYDiff_ = iBook.book1D("rechitsposYDiffFpix","y-position difference of rechits in FPix; rechit y-pos difference (CPU - GPU)", 1000, -10, 10); + hBchargeDiff_ = iBook.book1D("rechitChargeDiffBpix","Charge differnce of rechits 
in BPix; rechit charge difference (Host - Device)", 101, -50.5, 50.5); + hFchargeDiff_ = iBook.book1D("rechitChargeDiffFpix","Charge differnce of rechits in FPix; rechit charge difference (Host - Device)", 101, -50.5, 50.5); + hBsizeXDiff_ = iBook.book1D("rechitsizeXDiffBpix","SizeX difference of rechits in BPix; rechit sizex difference (Host - Device)", 21, -10.5, 10.5); + hFsizeXDiff_ = iBook.book1D("rechitsizeXDiffFpix","SizeX difference of rechits in FPix; rechit sizex difference (Host - Device)", 21, -10.5, 10.5); + hBsizeYDiff_ = iBook.book1D("rechitsizeYDiffBpix","SizeY difference of rechits in BPix; rechit sizey difference (Host - Device)", 21, -10.5, 10.5); + hFsizeYDiff_ = iBook.book1D("rechitsizeYDiffFpix","SizeY difference of rechits in FPix; rechit sizey difference (Host - Device)", 21, -10.5, 10.5); + hBposXDiff_ = iBook.book1D("rechitsposXDiffBpix","x-position difference of rechits in BPix; rechit x-pos difference (Host - Device)", 1000, -10, 10); + hFposXDiff_ = iBook.book1D("rechitsposXDiffFpix","x-position difference of rechits in FPix; rechit x-pos difference (Host - Device)", 1000, -10, 10); + hBposYDiff_ = iBook.book1D("rechitsposYDiffBpix","y-position difference of rechits in BPix; rechit y-pos difference (Host - Device)", 1000, -10, 10); + hFposYDiff_ = iBook.book1D("rechitsposYDiffFpix","y-position difference of rechits in FPix; rechit y-pos difference (Host - Device)", 1000, -10, 10); } template void SiPixelCompareRecHitsSoA::fillDescriptions(edm::ConfigurationDescriptions& descriptions) { // monitorpixelRecHitsSoA edm::ParameterSetDescription desc; - desc.add("pixelHitsSrcCPU", edm::InputTag("siPixelRecHitsPreSplittingSoA@cpu")); + desc.add("pixelHitsSrcCPU", edm::InputTag("siPixelRecHitsPreSplittingSoA@Host")); desc.add("pixelHitsSrcGPU", edm::InputTag("siPixelRecHitsPreSplittingSoA@cuda")); - desc.add("topFolderName", "SiPixelHeterogeneous/PixelRecHitsCompareGPUvsCPU"); + desc.add("topFolderName", "SiPixelHeterogeneous/PixelRecHitsCompareDevicevsHost"); desc.add("minD2cut", 0.0001); descriptions.addWithDefaultLabel(desc); } diff --git a/DQM/SiPixelHeterogeneous/plugins/SiPixelCompareTrackSoA.cc b/DQM/SiPixelHeterogeneous/plugins/SiPixelCompareTrackSoA.cc index fde8e892c560c..ecac8989df441 100644 --- a/DQM/SiPixelHeterogeneous/plugins/SiPixelCompareTrackSoA.cc +++ b/DQM/SiPixelHeterogeneous/plugins/SiPixelCompareTrackSoA.cc @@ -20,7 +20,8 @@ #include "DQMServices/Core/interface/MonitorElement.h" #include "DQMServices/Core/interface/DQMEDAnalyzer.h" #include "DQMServices/Core/interface/DQMStore.h" -#include "CUDADataFormats/Track/interface/PixelTrackHeterogeneous.h" +#include "CUDADataFormats/Track/interface/TrackSoAHeterogeneousHost.h" +#include "CUDADataFormats/Track/interface/TrackSoAHeterogeneousDevice.h" // for string manipulations #include @@ -65,7 +66,7 @@ namespace { template class SiPixelCompareTrackSoA : public DQMEDAnalyzer { public: - using PixelTrackSoA = PixelTrackHeterogeneousT; + using PixelTrackSoA = TrackSoAHeterogeneousHost; explicit SiPixelCompareTrackSoA(const edm::ParameterSet&); ~SiPixelCompareTrackSoA() override = default; @@ -133,6 +134,7 @@ SiPixelCompareTrackSoA::SiPixelCompareTrackSoA(const edm::ParameterSet& iConf // template void SiPixelCompareTrackSoA::analyze(const edm::Event& iEvent, const edm::EventSetup& iSetup) { + using helper = TracksUtilities; const auto& tsoaHandleCPU = iEvent.getHandle(tokenSoATrackCPU_); const auto& tsoaHandleGPU = iEvent.getHandle(tokenSoATrackGPU_); if (not tsoaHandleCPU or not tsoaHandleGPU) { @@ 
-147,12 +149,12 @@ void SiPixelCompareTrackSoA::analyze(const edm::Event& iEvent, const edm::Eve return; } - auto const& tsoaCPU = *tsoaHandleCPU->get(); - auto const& tsoaGPU = *tsoaHandleGPU->get(); - auto maxTracksCPU = tsoaCPU.stride(); //this should be same for both? - auto maxTracksGPU = tsoaGPU.stride(); //this should be same for both? - auto const* qualityCPU = tsoaCPU.qualityData(); - auto const* qualityGPU = tsoaGPU.qualityData(); + auto const& tsoaCPU = *tsoaHandleCPU; + auto const& tsoaGPU = *tsoaHandleGPU; + auto maxTracksCPU = tsoaCPU.view().metadata().size(); //this should be same for both? + auto maxTracksGPU = tsoaGPU.view().metadata().size(); //this should be same for both? + auto const* qualityCPU = tsoaCPU.view().quality(); + auto const* qualityGPU = tsoaGPU.view().quality(); int32_t nTracksCPU = 0; int32_t nTracksGPU = 0; int32_t nLooseAndAboveTracksCPU = 0; @@ -162,9 +164,9 @@ void SiPixelCompareTrackSoA::analyze(const edm::Event& iEvent, const edm::Eve //Loop over GPU tracks and store the indices of the loose tracks. Whats happens if useQualityCut_ is false? std::vector looseTrkidxGPU; for (int32_t jt = 0; jt < maxTracksGPU; ++jt) { - if (tsoaGPU.nHits(jt) == 0) + if (helper::nHits(tsoaGPU.view(), jt) == 0) break; // this is a guard - if (!(tsoaGPU.pt(jt) > 0.)) + if (!(tsoaGPU.view()[jt].pt() > 0.)) continue; nTracksGPU++; if (useQualityCut_ && qualityGPU[jt] < minQuality_) @@ -175,9 +177,18 @@ void SiPixelCompareTrackSoA::analyze(const edm::Event& iEvent, const edm::Eve //Now loop over CPU tracks//nested loop for loose gPU tracks for (int32_t it = 0; it < maxTracksCPU; ++it) { - if (tsoaCPU.nHits(it) == 0) + int nHitsCPU = helper::nHits(tsoaCPU.view(), it); + + if (nHitsCPU == 0) break; // this is a guard - if (!(tsoaCPU.pt(it) > 0.)) + + float ptCPU = tsoaCPU.view()[it].pt(); + float etaCPU = tsoaCPU.view()[it].eta(); + float phiCPU = helper::phi(tsoaCPU.view(), it); + float zipCPU = helper::zip(tsoaCPU.view(), it); + float tipCPU = helper::tip(tsoaCPU.view(), it); + + if (!(ptCPU > 0.)) continue; nTracksCPU++; if (useQualityCut_ && qualityCPU[it] < minQuality_) @@ -187,12 +198,11 @@ void SiPixelCompareTrackSoA::analyze(const edm::Event& iEvent, const edm::Eve const int32_t notFound = -1; int32_t closestTkidx = notFound; float mindr2 = dr2cut_; - float etacpu = tsoaCPU.eta(it); - float phicpu = tsoaCPU.phi(it); + for (auto gid : looseTrkidxGPU) { - float etagpu = tsoaGPU.eta(gid); - float phigpu = tsoaGPU.phi(gid); - float dr2 = reco::deltaR2(etacpu, phicpu, etagpu, phigpu); + float etaGPU = tsoaGPU.view()[gid].eta(); + float phiGPU = helper::phi(tsoaGPU.view(), gid); + float dr2 = reco::deltaR2(etaCPU, phiCPU, etaGPU, phiGPU); if (dr2 > dr2cut_) continue; // this is arbitrary if (mindr2 > dr2) { @@ -201,31 +211,31 @@ void SiPixelCompareTrackSoA::analyze(const edm::Event& iEvent, const edm::Eve } } - hpt_eta_tkAllCPU_->Fill(etacpu, tsoaCPU.pt(it)); //all CPU tk - hphi_z_tkAllCPU_->Fill(phicpu, tsoaCPU.zip(it)); + hpt_eta_tkAllCPU_->Fill(etaCPU, ptCPU); //all CPU tk + hphi_z_tkAllCPU_->Fill(phiCPU, zipCPU); if (closestTkidx == notFound) continue; nLooseAndAboveTracksCPU_matchedGPU++; - hchi2_->Fill(tsoaCPU.chi2(it), tsoaGPU.chi2(closestTkidx)); - hCharge_->Fill(tsoaCPU.charge(it), tsoaGPU.charge(closestTkidx)); - hnHits_->Fill(tsoaCPU.nHits(it), tsoaGPU.nHits(closestTkidx)); - hnLayers_->Fill(tsoaCPU.nLayers(it), tsoaGPU.nLayers(closestTkidx)); - hpt_->Fill(tsoaCPU.pt(it), tsoaGPU.pt(closestTkidx)); - hptLogLog_->Fill(tsoaCPU.pt(it), tsoaGPU.pt(closestTkidx)); - 
heta_->Fill(etacpu, tsoaGPU.eta(closestTkidx)); - hphi_->Fill(phicpu, tsoaGPU.phi(closestTkidx)); - hz_->Fill(tsoaCPU.zip(it), tsoaGPU.zip(closestTkidx)); - htip_->Fill(tsoaCPU.tip(it), tsoaGPU.tip(closestTkidx)); - hptdiffMatched_->Fill(tsoaCPU.pt(it) - tsoaGPU.pt(closestTkidx)); - hCurvdiffMatched_->Fill((tsoaCPU.charge(it) / tsoaCPU.pt(it)) - - (tsoaGPU.charge(closestTkidx) / tsoaGPU.pt(closestTkidx))); - hetadiffMatched_->Fill(etacpu - tsoaGPU.eta(closestTkidx)); - hphidiffMatched_->Fill(reco::deltaPhi(phicpu, tsoaGPU.phi(closestTkidx))); - hzdiffMatched_->Fill(tsoaCPU.zip(it) - tsoaGPU.zip(closestTkidx)); - htipdiffMatched_->Fill(tsoaCPU.tip(it) - tsoaGPU.tip(closestTkidx)); - hpt_eta_tkAllCPUMatched_->Fill(etacpu, tsoaCPU.pt(it)); //matched to gpu - hphi_z_tkAllCPUMatched_->Fill(phicpu, tsoaCPU.zip(it)); + hchi2_->Fill(tsoaCPU.view()[it].chi2(), tsoaGPU.view()[closestTkidx].chi2()); + hCharge_->Fill(helper::charge(tsoaCPU.view(), it), helper::charge(tsoaGPU.view(), closestTkidx)); + hnHits_->Fill(helper::nHits(tsoaCPU.view(), it), helper::nHits(tsoaGPU.view(), closestTkidx)); + hnLayers_->Fill(tsoaCPU.view()[it].nLayers(), tsoaGPU.view()[closestTkidx].nLayers()); + hpt_->Fill(tsoaCPU.view()[it].pt(), tsoaGPU.view()[closestTkidx].pt()); + hptLogLog_->Fill(tsoaCPU.view()[it].pt(), tsoaGPU.view()[closestTkidx].pt()); + heta_->Fill(etaCPU, tsoaGPU.view()[closestTkidx].eta()); + hphi_->Fill(etaCPU, helper::phi(tsoaGPU.view(), closestTkidx)); + hz_->Fill(zipCPU, helper::zip(tsoaGPU.view(), closestTkidx)); + htip_->Fill(tipCPU, helper::tip(tsoaGPU.view(), closestTkidx)); + hptdiffMatched_->Fill(tsoaCPU.view()[it].pt() - tsoaGPU.view()[closestTkidx].pt()); + hCurvdiffMatched_->Fill((helper::charge(tsoaCPU.view(), it) / tsoaCPU.view()[it].pt()) - + (helper::charge(tsoaGPU.view(), closestTkidx) / tsoaGPU.view()[closestTkidx].pt())); + hetadiffMatched_->Fill(etaCPU - tsoaGPU.view()[closestTkidx].eta()); + hphidiffMatched_->Fill(reco::deltaPhi(etaCPU, helper::phi(tsoaGPU.view(), closestTkidx))); + hzdiffMatched_->Fill(zipCPU - helper::zip(tsoaGPU.view(), closestTkidx)); + htipdiffMatched_->Fill(tipCPU - helper::tip(tsoaGPU.view(), closestTkidx)); + hpt_eta_tkAllCPUMatched_->Fill(etaCPU, tsoaCPU.view()[it].pt()); //matched to gpu + hphi_z_tkAllCPUMatched_->Fill(etaCPU, zipCPU); } hnTracks_->Fill(nTracksCPU, nTracksGPU); hnLooseAndAboveTracks_->Fill(nLooseAndAboveTracksCPU, nLooseAndAboveTracksGPU); diff --git a/DQM/SiPixelHeterogeneous/plugins/SiPixelCompareVertexSoA.cc b/DQM/SiPixelHeterogeneous/plugins/SiPixelCompareVertexSoA.cc index 52e8396a49022..555542eb56995 100644 --- a/DQM/SiPixelHeterogeneous/plugins/SiPixelCompareVertexSoA.cc +++ b/DQM/SiPixelHeterogeneous/plugins/SiPixelCompareVertexSoA.cc @@ -2,7 +2,7 @@ // Package: SiPixelCompareVertexSoA // Class: SiPixelCompareVertexSoA // -/**\class SiPixelCompareVertexSoA SiPixelCompareVertexSoA.cc +/**\class SiPixelCompareVertexSoA SiPixelCompareVertexSoA.cc */ // // Author: Suvankar Roy Chowdhury @@ -18,7 +18,7 @@ #include "DQMServices/Core/interface/MonitorElement.h" #include "DQMServices/Core/interface/DQMEDAnalyzer.h" #include "DQMServices/Core/interface/DQMStore.h" -#include "CUDADataFormats/Vertex/interface/ZVertexHeterogeneous.h" +#include "CUDADataFormats/Vertex/interface/ZVertexSoAHeterogeneousHost.h" #include "DataFormats/BeamSpot/interface/BeamSpot.h" class SiPixelCompareVertexSoA : public DQMEDAnalyzer { @@ -31,8 +31,8 @@ class SiPixelCompareVertexSoA : public DQMEDAnalyzer { static void 
fillDescriptions(edm::ConfigurationDescriptions& descriptions); private: - const edm::EDGetTokenT tokenSoAVertexCPU_; - const edm::EDGetTokenT tokenSoAVertexGPU_; + const edm::EDGetTokenT tokenSoAVertexCPU_; + const edm::EDGetTokenT tokenSoAVertexGPU_; const edm::EDGetTokenT tokenBeamSpot_; const std::string topFolderName_; const float dzCut_; @@ -53,9 +53,10 @@ class SiPixelCompareVertexSoA : public DQMEDAnalyzer { // constructors // +// Note tokenSoAVertexGPU_ contains data copied from device to host, hence is a HostCollection SiPixelCompareVertexSoA::SiPixelCompareVertexSoA(const edm::ParameterSet& iConfig) - : tokenSoAVertexCPU_(consumes(iConfig.getParameter("pixelVertexSrcCPU"))), - tokenSoAVertexGPU_(consumes(iConfig.getParameter("pixelVertexSrcGPU"))), + : tokenSoAVertexCPU_(consumes(iConfig.getParameter("pixelVertexSrcCPU"))), + tokenSoAVertexGPU_(consumes(iConfig.getParameter("pixelVertexSrcGPU"))), tokenBeamSpot_(consumes(iConfig.getParameter("beamSpotSrc"))), topFolderName_(iConfig.getParameter("topFolderName")), dzCut_(iConfig.getParameter("dzCut")) {} @@ -78,10 +79,10 @@ void SiPixelCompareVertexSoA::analyze(const edm::Event& iEvent, const edm::Event return; } - auto const& vsoaCPU = *vsoaHandleCPU->get(); - int nVerticesCPU = vsoaCPU.nvFinal; - auto const& vsoaGPU = *vsoaHandleGPU->get(); - int nVerticesGPU = vsoaGPU.nvFinal; + auto const& vsoaCPU = *vsoaHandleCPU; + int nVerticesCPU = vsoaCPU.view().nvFinal(); + auto const& vsoaGPU = *vsoaHandleGPU; + int nVerticesGPU = vsoaGPU.view().nvFinal(); auto bsHandle = iEvent.getHandle(tokenBeamSpot_); float x0 = 0., y0 = 0., z0 = 0., dxdz = 0., dydz = 0.; @@ -97,22 +98,22 @@ void SiPixelCompareVertexSoA::analyze(const edm::Event& iEvent, const edm::Event } for (int ivc = 0; ivc < nVerticesCPU; ivc++) { - auto sic = vsoaCPU.sortInd[ivc]; - auto zc = vsoaCPU.zv[sic]; + auto sic = vsoaCPU.view()[ivc].sortInd(); + auto zc = vsoaCPU.view()[sic].zv(); auto xc = x0 + dxdz * zc; auto yc = y0 + dydz * zc; zc += z0; - auto ndofCPU = vsoaCPU.ndof[sic]; - auto chi2CPU = vsoaCPU.chi2[sic]; + auto ndofCPU = vsoaCPU.view()[sic].ndof(); + auto chi2CPU = vsoaCPU.view()[sic].chi2(); const int32_t notFound = -1; int32_t closestVtxidx = notFound; float mindz = dzCut_; for (int ivg = 0; ivg < nVerticesGPU; ivg++) { - auto sig = vsoaGPU.sortInd[ivg]; - auto zgc = vsoaGPU.zv[sig] + z0; + auto sig = vsoaGPU.view()[ivg].sortInd(); + auto zgc = vsoaGPU.view()[sig].zv() + z0; auto zDist = std::abs(zc - zgc); //insert some matching condition if (zDist > dzCut_) @@ -125,12 +126,12 @@ void SiPixelCompareVertexSoA::analyze(const edm::Event& iEvent, const edm::Event if (closestVtxidx == notFound) continue; - auto zg = vsoaGPU.zv[closestVtxidx]; + auto zg = vsoaGPU.view()[closestVtxidx].zv(); auto xg = x0 + dxdz * zg; auto yg = y0 + dydz * zg; zg += z0; - auto ndofGPU = vsoaGPU.ndof[closestVtxidx]; - auto chi2GPU = vsoaGPU.chi2[closestVtxidx]; + auto ndofGPU = vsoaGPU.view()[closestVtxidx].ndof(); + auto chi2GPU = vsoaGPU.view()[closestVtxidx].chi2(); hx_->Fill(xc - x0, xg - x0); hy_->Fill(yc - y0, yg - y0); @@ -140,7 +141,7 @@ void SiPixelCompareVertexSoA::analyze(const edm::Event& iEvent, const edm::Event hzdiff_->Fill(zc - zg); hchi2_->Fill(chi2CPU, chi2GPU); hchi2oNdof_->Fill(chi2CPU / ndofCPU, chi2GPU / ndofGPU); - hptv2_->Fill(vsoaCPU.ptv2[sic], vsoaGPU.ptv2[closestVtxidx]); + hptv2_->Fill(vsoaCPU.view()[sic].ptv2(), vsoaGPU.view()[closestVtxidx].ptv2()); hntrks_->Fill(ndofCPU + 1, ndofGPU + 1); } hnVertex_->Fill(nVerticesCPU, nVerticesGPU); diff --git 
a/DQM/SiPixelHeterogeneous/plugins/SiPixelMonitorRecHitsSoA.cc b/DQM/SiPixelHeterogeneous/plugins/SiPixelMonitorRecHitsSoA.cc index ba68a8182e261..0844bd865ca1f 100644 --- a/DQM/SiPixelHeterogeneous/plugins/SiPixelMonitorRecHitsSoA.cc +++ b/DQM/SiPixelHeterogeneous/plugins/SiPixelMonitorRecHitsSoA.cc @@ -19,7 +19,8 @@ #include "DQMServices/Core/interface/MonitorElement.h" #include "DQMServices/Core/interface/DQMEDAnalyzer.h" #include "DQMServices/Core/interface/DQMStore.h" -#include "CUDADataFormats/TrackingRecHit/interface/TrackingRecHit2DHeterogeneous.h" +#include "CUDADataFormats/TrackingRecHit/interface/TrackingRecHitSoAHost.h" +#include "CUDADataFormats/TrackingRecHit/interface/TrackingRecHitsUtilities.h" // Geometry #include "Geometry/TrackerGeometryBuilder/interface/TrackerGeometry.h" #include "DataFormats/TrackerCommon/interface/TrackerTopology.h" @@ -31,8 +32,8 @@ template class SiPixelMonitorRecHitsSoA : public DQMEDAnalyzer { public: - using HitSoA = TrackingRecHit2DSOAViewT; - using HitsOnCPU = TrackingRecHit2DCPUT; + using HitSoA = TrackingRecHitSoAView; + using HitsOnHost = TrackingRecHitSoAHost; explicit SiPixelMonitorRecHitsSoA(const edm::ParameterSet&); ~SiPixelMonitorRecHitsSoA() override = default; @@ -44,7 +45,7 @@ class SiPixelMonitorRecHitsSoA : public DQMEDAnalyzer { private: const edm::ESGetToken geomToken_; const edm::ESGetToken topoToken_; - const edm::EDGetTokenT tokenSoAHitsCPU_; + const edm::EDGetTokenT tokenSoAHitsCPU_; const std::string topFolderName_; const TrackerGeometry* tkGeom_ = nullptr; const TrackerTopology* tTopo_ = nullptr; @@ -101,21 +102,21 @@ void SiPixelMonitorRecHitsSoA::analyze(const edm::Event& iEvent, const edm::E return; } auto const& rhsoa = *rhsoaHandle; - const HitSoA* soa2d = rhsoa.view(); + auto const& soa2d = rhsoa.const_view(); - uint32_t nHits_ = soa2d->nHits(); + uint32_t nHits_ = soa2d.nHits(); hnHits->Fill(nHits_); auto detIds = tkGeom_->detUnitIds(); for (uint32_t i = 0; i < nHits_; i++) { - DetId id = detIds[soa2d->detectorIndex(i)]; - float xG = soa2d->xGlobal(i); - float yG = soa2d->yGlobal(i); - float zG = soa2d->zGlobal(i); - float rG = soa2d->rGlobal(i); - float fphi = short2phi(soa2d->iphi(i)); - uint32_t charge = soa2d->charge(i); - int16_t sizeX = std::ceil(float(std::abs(soa2d->clusterSizeX(i)) / 8.)); - int16_t sizeY = std::ceil(float(std::abs(soa2d->clusterSizeY(i)) / 8.)); + DetId id = detIds[soa2d[i].detectorIndex()]; + float xG = soa2d[i].xGlobal(); + float yG = soa2d[i].yGlobal(); + float zG = soa2d[i].zGlobal(); + float rG = soa2d[i].rGlobal(); + float fphi = short2phi(soa2d[i].iphi()); + uint32_t charge = soa2d[i].chargeAndStatus().charge; + int16_t sizeX = std::ceil(float(std::abs(soa2d[i].clusterSizeX()) / 8.)); + int16_t sizeY = std::ceil(float(std::abs(soa2d[i].clusterSizeY()) / 8.)); hBFposZP->Fill(zG, fphi); int16_t ysign = yG >= 0 ? 
1 : -1; hBFposZR->Fill(zG, rG * ysign); diff --git a/DQM/SiPixelHeterogeneous/plugins/SiPixelMonitorTrackSoA.cc b/DQM/SiPixelHeterogeneous/plugins/SiPixelMonitorTrackSoA.cc index 491c8f1be238a..3deb289888477 100644 --- a/DQM/SiPixelHeterogeneous/plugins/SiPixelMonitorTrackSoA.cc +++ b/DQM/SiPixelHeterogeneous/plugins/SiPixelMonitorTrackSoA.cc @@ -20,14 +20,15 @@ #include "DQMServices/Core/interface/MonitorElement.h" #include "DQMServices/Core/interface/DQMEDAnalyzer.h" #include "DQMServices/Core/interface/DQMStore.h" -#include "CUDADataFormats/Track/interface/PixelTrackHeterogeneous.h" +#include "CUDADataFormats/Track/interface/PixelTrackUtilities.h" +#include "CUDADataFormats/Track/interface/TrackSoAHeterogeneousHost.h" // for string manipulations #include template class SiPixelMonitorTrackSoA : public DQMEDAnalyzer { public: - using PixelTrackHeterogeneous = PixelTrackHeterogeneousT; + using PixelTrackHeterogeneous = TrackSoAHeterogeneousHost; explicit SiPixelMonitorTrackSoA(const edm::ParameterSet&); ~SiPixelMonitorTrackSoA() override = default; void bookHistograms(DQMStore::IBooker& ibooker, edm::Run const& iRun, edm::EventSetup const& iSetup) override; @@ -81,23 +82,24 @@ void SiPixelMonitorTrackSoA::analyze(const edm::Event& iEvent, const edm::Eve return; } - auto const& tsoa = *((tsoaHandle.product())->get()); - auto maxTracks = tsoa.stride(); - auto const* quality = tsoa.qualityData(); + using helper = TracksUtilities; + auto const& tsoa = *tsoaHandle.product(); + auto maxTracks = tsoa.view().metadata().size(); + auto const* quality = tsoa.view().quality(); int32_t nTracks = 0; int32_t nLooseAndAboveTracks = 0; for (int32_t it = 0; it < maxTracks; ++it) { - auto nHits = tsoa.nHits(it); - auto nLayers = tsoa.nLayers(it); + auto nHits = helper::nHits(tsoa.const_view(), it); + auto nLayers = tsoa.view()[it].nLayers(); if (nHits == 0) break; // this is a guard - float pt = tsoa.pt(it); + float pt = tsoa.view()[it].pt(); if (!(pt > 0.)) continue; // fill the quality for all tracks - pixelTrack::Quality qual = tsoa.quality(it); + pixelTrack::Quality qual = quality[it]; hquality->Fill(int(qual)); nTracks++; @@ -105,11 +107,11 @@ void SiPixelMonitorTrackSoA::analyze(const edm::Event& iEvent, const edm::Eve continue; // fill parameters only for quality >= loose - float chi2 = tsoa.chi2(it); - float phi = tsoa.phi(it); - float zip = tsoa.zip(it); - float eta = tsoa.eta(it); - float tip = tsoa.tip(it); + float chi2 = tsoa.view()[it].chi2(); + float phi = helper::phi(tsoa.const_view(), it); + float zip = helper::zip(tsoa.const_view(), it); + float eta = tsoa.view()[it].eta(); + float tip = helper::tip(tsoa.const_view(), it); hchi2->Fill(chi2); hChi2VsPhi->Fill(phi, chi2); diff --git a/DQM/SiPixelHeterogeneous/plugins/SiPixelMonitorVertexSoA.cc b/DQM/SiPixelHeterogeneous/plugins/SiPixelMonitorVertexSoA.cc index 13cf991b54c82..4287babcf4964 100644 --- a/DQM/SiPixelHeterogeneous/plugins/SiPixelMonitorVertexSoA.cc +++ b/DQM/SiPixelHeterogeneous/plugins/SiPixelMonitorVertexSoA.cc @@ -21,7 +21,7 @@ #include "DQMServices/Core/interface/MonitorElement.h" #include "DQMServices/Core/interface/DQMEDAnalyzer.h" #include "DQMServices/Core/interface/DQMStore.h" -#include "CUDADataFormats/Vertex/interface/ZVertexHeterogeneous.h" +#include "CUDADataFormats/Vertex/interface/ZVertexSoAHeterogeneousHost.h" #include "DataFormats/BeamSpot/interface/BeamSpot.h" class SiPixelMonitorVertexSoA : public DQMEDAnalyzer { @@ -34,7 +34,7 @@ class SiPixelMonitorVertexSoA : public DQMEDAnalyzer { static void 
fillDescriptions(edm::ConfigurationDescriptions& descriptions); private: - edm::EDGetTokenT tokenSoAVertex_; + edm::EDGetTokenT tokenSoAVertex_; edm::EDGetTokenT tokenBeamSpot_; std::string topFolderName_; MonitorElement* hnVertex; @@ -52,7 +52,7 @@ class SiPixelMonitorVertexSoA : public DQMEDAnalyzer { // SiPixelMonitorVertexSoA::SiPixelMonitorVertexSoA(const edm::ParameterSet& iConfig) { - tokenSoAVertex_ = consumes(iConfig.getParameter("pixelVertexSrc")); + tokenSoAVertex_ = consumes(iConfig.getParameter("pixelVertexSrc")); tokenBeamSpot_ = consumes(iConfig.getParameter("beamSpotSrc")); topFolderName_ = iConfig.getParameter("topFolderName"); } @@ -67,8 +67,8 @@ void SiPixelMonitorVertexSoA::analyze(const edm::Event& iEvent, const edm::Event return; } - auto const& vsoa = *((vsoaHandle.product())->get()); - int nVertices = vsoa.nvFinal; + auto const& vsoa = *vsoaHandle; + int nVertices = vsoa.view().nvFinal(); auto bsHandle = iEvent.getHandle(tokenBeamSpot_); float x0 = 0., y0 = 0., z0 = 0., dxdz = 0., dydz = 0.; if (!bsHandle.isValid()) { @@ -83,8 +83,8 @@ void SiPixelMonitorVertexSoA::analyze(const edm::Event& iEvent, const edm::Event } for (int iv = 0; iv < nVertices; iv++) { - auto si = vsoa.sortInd[iv]; - auto z = vsoa.zv[si]; + auto si = vsoa.view()[iv].sortInd(); + auto z = vsoa.view()[si].zv(); auto x = x0 + dxdz * z; auto y = y0 + dydz * z; @@ -92,10 +92,10 @@ void SiPixelMonitorVertexSoA::analyze(const edm::Event& iEvent, const edm::Event hx->Fill(x); hy->Fill(y); hz->Fill(z); - auto ndof = vsoa.ndof[si]; - hchi2->Fill(vsoa.chi2[si]); - hchi2oNdof->Fill(vsoa.chi2[si] / ndof); - hptv2->Fill(vsoa.ptv2[si]); + auto ndof = vsoa.view()[si].ndof(); + hchi2->Fill(vsoa.view()[si].chi2()); + hchi2oNdof->Fill(vsoa.view()[si].chi2() / ndof); + hptv2->Fill(vsoa.view()[si].ptv2()); hntrks->Fill(ndof + 1); } hnVertex->Fill(nVertices); diff --git a/EventFilter/SiPixelRawToDigi/plugins/SiPixelDigisSoAFromCUDA.cc b/EventFilter/SiPixelRawToDigi/plugins/SiPixelDigisSoAFromCUDA.cc index 0702bc4830c7c..5b23f2dbda104 100644 --- a/EventFilter/SiPixelRawToDigi/plugins/SiPixelDigisSoAFromCUDA.cc +++ b/EventFilter/SiPixelRawToDigi/plugins/SiPixelDigisSoAFromCUDA.cc @@ -10,6 +10,7 @@ #include "FWCore/ParameterSet/interface/ParameterSet.h" #include "HeterogeneousCore/CUDACore/interface/ScopedContext.h" #include "HeterogeneousCore/CUDAUtilities/interface/host_unique_ptr.h" +#include "CUDADataFormats/Common/interface/PortableHostCollection.h" class SiPixelDigisSoAFromCUDA : public edm::stream::EDProducer { public: @@ -27,7 +28,7 @@ class SiPixelDigisSoAFromCUDA : public edm::stream::EDProducer> digiGetToken_; edm::EDPutTokenT digiPutToken_; - cms::cuda::host::unique_ptr store_; + cms::cuda::PortableHostCollection> digis_h_; int nDigis_; }; @@ -48,29 +49,25 @@ void SiPixelDigisSoAFromCUDA::acquire(const edm::Event& iEvent, // Do the transfer in a CUDA stream parallel to the computation CUDA stream cms::cuda::ScopedContextAcquire ctx{iEvent.streamID(), std::move(waitingTaskHolder)}; - const auto& gpuDigis = ctx.get(iEvent, digiGetToken_); + const auto& digis_d = ctx.get(iEvent, digiGetToken_); - nDigis_ = gpuDigis.nDigis(); - store_ = gpuDigis.copyAllToHostAsync(ctx.stream()); + nDigis_ = digis_d.nDigis(); + nDigis_ = digis_d.nDigis(); + digis_h_ = cms::cuda::PortableHostCollection>(digis_d.view().metadata().size(), ctx.stream()); + cudaCheck(cudaMemcpyAsync(digis_h_.buffer().get(), + digis_d.const_buffer().get(), + digis_d.bufferSize(), + cudaMemcpyDeviceToHost, + ctx.stream())); } void 
SiPixelDigisSoAFromCUDA::produce(edm::Event& iEvent, const edm::EventSetup& iSetup) { - // The following line copies the data from the pinned host memory to - // regular host memory. In principle that feels unnecessary (why not - // just use the pinned host memory?). There are a few arguments for - // doing it though - // - Now can release the pinned host memory back to the (caching) allocator - // * if we'd like to keep the pinned memory, we'd need to also - // keep the CUDA stream around as long as that, or allow pinned - // host memory to be allocated without a CUDA stream - // - What if a CPU algorithm would produce the same SoA? We can't - // use cudaMallocHost without a GPU... - - auto tmp_view = SiPixelDigisCUDASOAView(store_, nDigis_, SiPixelDigisCUDASOAView::StorageLocationHost::kMAX); - - iEvent.emplace(digiPutToken_, nDigis_, tmp_view.pdigi(), tmp_view.rawIdArr(), tmp_view.adc(), tmp_view.clus()); - - store_.reset(); + iEvent.emplace(digiPutToken_, + nDigis_, + digis_h_.view().pdigi(), + digis_h_.view().rawIdArr(), + digis_h_.view().adc(), + digis_h_.view().clus()); } // define as framework plugin diff --git a/Geometry/CommonTopologies/interface/SimplePixelTopology.h b/Geometry/CommonTopologies/interface/SimplePixelTopology.h index c991d09666297..304e8a1255cce 100644 --- a/Geometry/CommonTopologies/interface/SimplePixelTopology.h +++ b/Geometry/CommonTopologies/interface/SimplePixelTopology.h @@ -28,6 +28,8 @@ namespace pixelTopology { constexpr int16_t phi0p07 = 730; // round(730.12648...) = phi2short(0.07); constexpr int16_t phi0p09 = 900; + constexpr uint16_t last_barrel_layer = 3; // this is common between all the topologies + template constexpr auto map_to_array_helper(Function f, std::index_sequence) -> std::array, sizeof...(Indices)> { @@ -292,10 +294,11 @@ namespace pixelTopology { static constexpr uint32_t maxCellTracks = 302; static constexpr uint32_t maxHitsOnTrack = 15; static constexpr uint32_t maxHitsOnTrackForFullFit = 6; - static constexpr uint32_t avgHitsPerTrack = 9; + static constexpr uint32_t avgHitsPerTrack = 7; static constexpr uint32_t maxCellsPerHit = 256; static constexpr uint32_t avgTracksPerHit = 10; static constexpr uint32_t maxNumberOfTuples = 256 * 1024; + //this is well above thanks to maxNumberOfTuples static constexpr uint32_t maxHitsForContainers = avgHitsPerTrack * maxNumberOfTuples; static constexpr uint32_t maxNumberOfDoublets = 5 * 512 * 1024; static constexpr uint32_t maxNumOfActiveDoublets = maxNumberOfDoublets / 8; @@ -308,9 +311,9 @@ namespace pixelTopology { static constexpr uint32_t getDoubletsFromHistoMaxBlockSize = 64; // for both x and y static constexpr uint32_t getDoubletsFromHistoMinBlocksPerMP = 16; - static constexpr uint32_t last_bpix1_detIndex = 108; - static constexpr uint32_t last_bpix2_detIndex = 324; - static constexpr uint32_t last_barrel_detIndex = 504; + static constexpr uint16_t last_bpix1_detIndex = 108; + static constexpr uint16_t last_bpix2_detIndex = 324; + static constexpr uint16_t last_barrel_detIndex = 504; static constexpr uint32_t maxPixInModule = 6000; @@ -383,7 +386,7 @@ namespace pixelTopology { static constexpr uint32_t maxCellTracks = 48; static constexpr uint32_t maxHitsOnTrack = 10; static constexpr uint32_t maxHitsOnTrackForFullFit = 6; - static constexpr uint32_t avgHitsPerTrack = 4; + static constexpr uint32_t avgHitsPerTrack = 5; static constexpr uint32_t maxCellsPerHit = 256; static constexpr uint32_t avgTracksPerHit = 6; static constexpr uint32_t maxNumberOfTuples = 32 * 1024; @@ -399,9 +402,9 @@ 
namespace pixelTopology { static constexpr uint32_t getDoubletsFromHistoMaxBlockSize = 64; // for both x and y static constexpr uint32_t getDoubletsFromHistoMinBlocksPerMP = 16; - static constexpr uint32_t last_bpix1_detIndex = 96; - static constexpr uint32_t last_bpix2_detIndex = 320; - static constexpr uint32_t last_barrel_detIndex = 1184; + static constexpr uint16_t last_bpix1_detIndex = 96; + static constexpr uint16_t last_bpix2_detIndex = 320; + static constexpr uint16_t last_barrel_detIndex = 1184; static constexpr uint32_t maxPixInModule = 6000; diff --git a/HLTrigger/Configuration/python/customizeHLTforCMSSW.py b/HLTrigger/Configuration/python/customizeHLTforCMSSW.py index 2f0e84337cb70..5aca943b85192 100644 --- a/HLTrigger/Configuration/python/customizeHLTforCMSSW.py +++ b/HLTrigger/Configuration/python/customizeHLTforCMSSW.py @@ -258,6 +258,12 @@ def customizeHLTfor40334(process): return process +def customizeHLTfor40465(process): + try: + process.hltSiPixelRecHitsSoA.cpu.hltSiPixelRecHitsFromLegacy[0].type = 'pixelTopologyPhase1TrackingRecHitSoAHost' + except: + pass + return process # CMSSW version specific customizations def customizeHLTforCMSSW(process, menuType="GRun"): @@ -266,9 +272,10 @@ def customizeHLTforCMSSW(process, menuType="GRun"): # add call to action function in proper order: newest last! # process = customiseFor12718(process) - + process = customizeHLTfor38761(process) process = customizeHLTfor40264(process) process = customizeHLTfor40334(process) + process = customizeHLTfor40465(process) return process diff --git a/RecoLocalTracker/SiPixelClusterizer/plugins/SiPixelRawToClusterCUDA.cc b/RecoLocalTracker/SiPixelClusterizer/plugins/SiPixelRawToClusterCUDA.cc index df168da110301..76cc641d365c5 100644 --- a/RecoLocalTracker/SiPixelClusterizer/plugins/SiPixelRawToClusterCUDA.cc +++ b/RecoLocalTracker/SiPixelClusterizer/plugins/SiPixelRawToClusterCUDA.cc @@ -32,6 +32,7 @@ #include "FWCore/ParameterSet/interface/ParameterSet.h" #include "FWCore/ParameterSet/interface/ParameterSetDescription.h" #include "FWCore/ServiceRegistry/interface/Service.h" +#include "Geometry/CommonTopologies/interface/SimplePixelTopology.h" #include "HeterogeneousCore/CUDACore/interface/ScopedContext.h" #include "HeterogeneousCore/CUDAServices/interface/CUDAService.h" #include "RecoTracker/Record/interface/CkfComponentsRecord.h" @@ -272,9 +273,15 @@ void SiPixelRawToClusterCUDA::produce(edm::Event& iEvent, const edm::EventSetup& cms::cuda::ScopedContextProduce ctx{ctxState_}; if (nDigis_ == 0) { - // default construct collections and place them in event - ctx.emplace(iEvent, digiPutToken_, SiPixelDigisCUDA{}); - ctx.emplace(iEvent, clusterPutToken_, SiPixelClustersCUDA{}); + // Cannot use the default constructor here, as it would not allocate memory. + // In the case of no digis, clusters_d are not being instantiated, but are + // still used downstream to initialize TrackingRecHitSoADevice. If there + // are no valid pointers to clusters' Collection columns, instantiation + // of TrackingRecHits fail. 
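
The comment above spells out why the empty-event path still allocates sized collections: a default-constructed portable collection owns no buffer, so its column accessors hand out null pointers, and the downstream TrackingRecHit construction would then dereference them. Below is a minimal, host-only sketch of that failure mode; ToyClusterCollection and the module count are illustrative stand-ins, not CMSSW types (the real SiPixelClustersCUDA allocates device memory on a CUDA stream via PortableDeviceCollection).

#include <cassert>
#include <cstdint>
#include <memory>

// Toy stand-in for a portable SoA collection backed by one owned buffer.
class ToyClusterCollection {
public:
  ToyClusterCollection() = default;  // owns nothing: every column is nullptr
  explicit ToyClusterCollection(std::size_t nModules)
      : size_(nModules + 1), buffer_(std::make_unique<uint32_t[]>(2 * (nModules + 1))) {}

  // column accessors, null when default-constructed
  uint32_t* clusInModule() { return buffer_ ? buffer_.get() : nullptr; }
  uint32_t* clusModuleStart() { return buffer_ ? buffer_.get() + size_ : nullptr; }

private:
  std::size_t size_ = 0;
  std::unique_ptr<uint32_t[]> buffer_;
};

int main() {
  ToyClusterCollection empty;                // analogue of a default-constructed collection
  ToyClusterCollection sizedButEmpty(1856);  // analogue of constructing with the module count

  assert(empty.clusModuleStart() == nullptr);          // consumers would crash on this
  assert(sizedButEmpty.clusModuleStart() != nullptr);  // safe to pass on, even with zero clusters
}

This is the effect the added lines just below achieve by constructing SiPixelDigisCUDA(nDigis_, ctx.stream()) and SiPixelClustersCUDA(numberOfModules, ctx.stream()) even when nDigis_ is zero: every column pointer stays valid for the downstream consumers.
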
Example: workflow 11604.0 + SiPixelDigisCUDA digis_d = SiPixelDigisCUDA(nDigis_, ctx.stream()); + SiPixelClustersCUDA clusters_d = SiPixelClustersCUDA(pixelTopology::Phase1::numberOfModules, ctx.stream()); + ctx.emplace(iEvent, digiPutToken_, std::move(digis_d)); + ctx.emplace(iEvent, clusterPutToken_, std::move(clusters_d)); if (includeErrors_) { ctx.emplace(iEvent, digiErrorPutToken_, SiPixelDigiErrorsCUDA{}); } diff --git a/RecoLocalTracker/SiPixelClusterizer/plugins/SiPixelRawToClusterGPUKernel.cu b/RecoLocalTracker/SiPixelClusterizer/plugins/SiPixelRawToClusterGPUKernel.cu index bc9be260deb20..293d4422e8458 100644 --- a/RecoLocalTracker/SiPixelClusterizer/plugins/SiPixelRawToClusterGPUKernel.cu +++ b/RecoLocalTracker/SiPixelClusterizer/plugins/SiPixelRawToClusterGPUKernel.cu @@ -318,12 +318,7 @@ namespace pixelgpudetails { const uint32_t wordCounter, const uint32_t *word, const uint8_t *fedIds, - uint16_t *xx, - uint16_t *yy, - uint16_t *adc, - uint32_t *pdigi, - uint32_t *rawIdArr, - uint16_t *moduleId, + SiPixelDigisCUDASOAView digisView, cms::cuda::SimpleVector *err, bool useQualityInfo, bool includeErrors) { @@ -332,17 +327,18 @@ namespace pixelgpudetails { int32_t first = threadIdx.x + blockIdx.x * blockDim.x; for (int32_t iloop = first, nend = wordCounter; iloop < nend; iloop += blockDim.x * gridDim.x) { auto gIndex = iloop; - xx[gIndex] = 0; - yy[gIndex] = 0; - adc[gIndex] = 0; + auto dvgi = digisView[gIndex]; + dvgi.xx() = 0; + dvgi.yy() = 0; + dvgi.adc() = 0; bool skipROC = false; uint8_t fedId = fedIds[gIndex / 2]; // +1200; // initialize (too many coninue below) - pdigi[gIndex] = 0; - rawIdArr[gIndex] = 0; - moduleId[gIndex] = gpuClustering::invalidModuleId; + dvgi.pdigi() = 0; + dvgi.rawIdArr() = 0; + dvgi.moduleId() = gpuClustering::invalidModuleId; uint32_t ww = word[gIndex]; // Array containing 32 bit raw data if (ww == 0) { @@ -433,12 +429,12 @@ namespace pixelgpudetails { } pixelgpudetails::Pixel globalPix = frameConversion(barrel, side, layer, detId.rocInDet, localPix); - xx[gIndex] = globalPix.row; // origin shifting by 1 0-159 - yy[gIndex] = globalPix.col; // origin shifting by 1 0-415 - adc[gIndex] = sipixelconstants::getADC(ww); - pdigi[gIndex] = pixelgpudetails::pack(globalPix.row, globalPix.col, adc[gIndex]); - moduleId[gIndex] = detId.moduleId; - rawIdArr[gIndex] = rawId; + dvgi.xx() = globalPix.row; // origin shifting by 1 0-159 + dvgi.yy() = globalPix.col; // origin shifting by 1 0-415 + dvgi.adc() = sipixelconstants::getADC(ww); + dvgi.pdigi() = pixelgpudetails::pack(globalPix.row, globalPix.col, dvgi.adc()); + dvgi.moduleId() = detId.moduleId; + dvgi.rawIdArr() = rawId; } // end of loop (gIndex < end) } // end of Raw to Digi kernel @@ -451,7 +447,6 @@ namespace pixelgpudetails { constexpr int nMaxModules = TrackerTraits::numberOfModules; constexpr int startBPIX2 = TrackerTraits::layerStart[1]; - assert(nMaxModules < TrackerTraits::numberOfModules); assert(startBPIX2 < nMaxModules); assert(nMaxModules < 4096); // easy to extend at least till 32*1024 assert(nMaxModules > 1024); @@ -549,7 +544,8 @@ namespace pixelgpudetails { #endif // since wordCounter != 0 we're not allocating 0 bytes, - digis_d = SiPixelDigisCUDA(wordCounter, stream); + // digis_d = SiPixelDigisCUDA(wordCounter, stream); + digis_d = SiPixelDigisCUDA(size_t(wordCounter), stream); if (includeErrors) { digiErrors_d = SiPixelDigiErrorsCUDA(wordCounter, std::move(errors), stream); } @@ -578,12 +574,7 @@ namespace pixelgpudetails { wordCounter, word_d.get(), fedId_d.get(), - digis_d.view().xx(), 
- digis_d.view().yy(), - digis_d.view().adc(), - digis_d.view().pdigi(), - digis_d.view().rawIdArr(), - digis_d.view().moduleInd(), + digis_d.view(), digiErrors_d.error(), // returns nullptr if default-constructed useQualityInfo, includeErrors); @@ -594,12 +585,7 @@ namespace pixelgpudetails { wordCounter, word_d.get(), fedId_d.get(), - digis_d.view().xx(), - digis_d.view().yy(), - digis_d.view().adc(), - digis_d.view().pdigi(), - digis_d.view().rawIdArr(), - digis_d.view().moduleInd(), + digis_d.view(), digiErrors_d.error(), // returns nullptr if default-constructed useQualityInfo, includeErrors); @@ -621,25 +607,25 @@ namespace pixelgpudetails { int blocks = (std::max(int(wordCounter), int(Phase1::numberOfModules)) + threadsPerBlock - 1) / threadsPerBlock; if (isRun2) - gpuCalibPixel::calibDigis<<>>(digis_d.view().moduleInd(), - digis_d.view().xx(), - digis_d.view().yy(), - digis_d.view().adc(), + gpuCalibPixel::calibDigis<<>>(digis_d->moduleId(), + digis_d->xx(), + digis_d->yy(), + digis_d->adc(), gains, wordCounter, - clusters_d.moduleStart(), - clusters_d.clusInModule(), - clusters_d.clusModuleStart()); + clusters_d->moduleStart(), + clusters_d->clusInModule(), + clusters_d->clusModuleStart()); else - gpuCalibPixel::calibDigis<<>>(digis_d.view().moduleInd(), - digis_d.view().xx(), - digis_d.view().yy(), - digis_d.view().adc(), + gpuCalibPixel::calibDigis<<>>(digis_d->moduleId(), + digis_d->xx(), + digis_d->yy(), + digis_d->adc(), gains, wordCounter, - clusters_d.moduleStart(), - clusters_d.clusInModule(), - clusters_d.clusModuleStart()); + clusters_d->moduleStart(), + clusters_d->clusInModule(), + clusters_d->clusModuleStart()); cudaCheck(cudaGetLastError()); #ifdef GPU_DEBUG @@ -652,7 +638,7 @@ namespace pixelgpudetails { #endif countModules<<>>( - digis_d.view().moduleInd(), clusters_d.moduleStart(), digis_d.view().clus(), wordCounter); + digis_d->moduleId(), clusters_d->moduleStart(), digis_d->clus(), wordCounter); cudaCheck(cudaGetLastError()); threadsPerBlock = 256 + 128; /// should be larger than 6000/16 aka (maxPixInModule/maxiter in the kernel) @@ -661,14 +647,14 @@ namespace pixelgpudetails { std::cout << "CUDA findClus kernel launch with " << blocks << " blocks of " << threadsPerBlock << " threads\n"; #endif - findClus<<>>(digis_d.view().rawIdArr(), - digis_d.view().moduleInd(), - digis_d.view().xx(), - digis_d.view().yy(), - clusters_d.moduleStart(), - clusters_d.clusInModule(), - clusters_d.moduleId(), - digis_d.view().clus(), + findClus<<>>(digis_d->rawIdArr(), + digis_d->moduleId(), + digis_d->xx(), + digis_d->yy(), + clusters_d->moduleStart(), + clusters_d->clusInModule(), + clusters_d->moduleId(), + digis_d->clus(), wordCounter); cudaCheck(cudaGetLastError()); @@ -678,12 +664,12 @@ namespace pixelgpudetails { // apply charge cut clusterChargeCut<<>>(clusterThresholds, - digis_d.view().moduleInd(), - digis_d.view().adc(), - clusters_d.moduleStart(), - clusters_d.clusInModule(), - clusters_d.moduleId(), - digis_d.view().clus(), + digis_d->moduleId(), + digis_d->adc(), + clusters_d->moduleStart(), + clusters_d->clusInModule(), + clusters_d->moduleId(), + digis_d->clus(), wordCounter); cudaCheck(cudaGetLastError()); @@ -694,8 +680,10 @@ namespace pixelgpudetails { // synchronization/ExternalWork auto nModules_Clusters_d = cms::cuda::make_device_unique(3, stream); // MUST be ONE block - fillHitsModuleStart<<<1, 1024, 0, stream>>>( - clusters_d.clusInModule(), clusters_d.clusModuleStart(), clusters_d.moduleStart(), nModules_Clusters_d.get()); + fillHitsModuleStart<<<1, 1024, 
0, stream>>>(clusters_d->clusInModule(), + clusters_d->clusModuleStart(), + clusters_d->moduleStart(), + nModules_Clusters_d.get()); // copy to host nModules_Clusters_h = cms::cuda::make_host_unique(3, stream); @@ -723,15 +711,12 @@ namespace pixelgpudetails { nDigis = numDigis; digis_d = SiPixelDigisCUDA(numDigis, stream); - cudaCheck( - cudaMemcpyAsync(digis_d.view().moduleInd(), moduleIds, sizeof(uint16_t) * numDigis, cudaMemcpyDefault, stream)); - cudaCheck(cudaMemcpyAsync(digis_d.view().xx(), xDigis, sizeof(uint16_t) * numDigis, cudaMemcpyDefault, stream)); - cudaCheck(cudaMemcpyAsync(digis_d.view().yy(), yDigis, sizeof(uint16_t) * numDigis, cudaMemcpyDefault, stream)); - cudaCheck(cudaMemcpyAsync(digis_d.view().adc(), adcDigis, sizeof(uint16_t) * numDigis, cudaMemcpyDefault, stream)); - cudaCheck( - cudaMemcpyAsync(digis_d.view().pdigi(), packedData, sizeof(uint32_t) * numDigis, cudaMemcpyDefault, stream)); - cudaCheck( - cudaMemcpyAsync(digis_d.view().rawIdArr(), rawIds, sizeof(uint32_t) * numDigis, cudaMemcpyDefault, stream)); + cudaCheck(cudaMemcpyAsync(digis_d->moduleId(), moduleIds, sizeof(uint16_t) * numDigis, cudaMemcpyDefault, stream)); + cudaCheck(cudaMemcpyAsync(digis_d->xx(), xDigis, sizeof(uint16_t) * numDigis, cudaMemcpyDefault, stream)); + cudaCheck(cudaMemcpyAsync(digis_d->yy(), yDigis, sizeof(uint16_t) * numDigis, cudaMemcpyDefault, stream)); + cudaCheck(cudaMemcpyAsync(digis_d->adc(), adcDigis, sizeof(uint16_t) * numDigis, cudaMemcpyDefault, stream)); + cudaCheck(cudaMemcpyAsync(digis_d->pdigi(), packedData, sizeof(uint32_t) * numDigis, cudaMemcpyDefault, stream)); + cudaCheck(cudaMemcpyAsync(digis_d->rawIdArr(), rawIds, sizeof(uint32_t) * numDigis, cudaMemcpyDefault, stream)); clusters_d = SiPixelClustersCUDA(Phase2::numberOfModules, stream); @@ -740,12 +725,12 @@ namespace pixelgpudetails { int threadsPerBlock = 512; int blocks = (int(numDigis) + threadsPerBlock - 1) / threadsPerBlock; - gpuCalibPixel::calibDigisPhase2<<>>(digis_d.view().moduleInd(), - digis_d.view().adc(), + gpuCalibPixel::calibDigisPhase2<<>>(digis_d->moduleId(), + digis_d->adc(), numDigis, - clusters_d.moduleStart(), - clusters_d.clusInModule(), - clusters_d.clusModuleStart()); + clusters_d->moduleStart(), + clusters_d->clusInModule(), + clusters_d->clusModuleStart()); cudaCheck(cudaGetLastError()); @@ -755,12 +740,12 @@ namespace pixelgpudetails { #endif countModules<<>>( - digis_d.view().moduleInd(), clusters_d.moduleStart(), digis_d.view().clus(), numDigis); + digis_d->moduleId(), clusters_d->moduleStart(), digis_d->clus(), numDigis); cudaCheck(cudaGetLastError()); // read the number of modules into a data member, used by getProduct()) cudaCheck(cudaMemcpyAsync( - &(nModules_Clusters_h[0]), clusters_d.moduleStart(), sizeof(uint32_t), cudaMemcpyDefault, stream)); + &(nModules_Clusters_h[0]), clusters_d->moduleStart(), sizeof(uint32_t), cudaMemcpyDefault, stream)); threadsPerBlock = 256; blocks = Phase2::numberOfModules; @@ -769,14 +754,14 @@ namespace pixelgpudetails { cudaCheck(cudaStreamSynchronize(stream)); std::cout << "CUDA findClus kernel launch with " << blocks << " blocks of " << threadsPerBlock << " threads\n"; #endif - findClus<<>>(digis_d.view().rawIdArr(), - digis_d.view().moduleInd(), - digis_d.view().xx(), - digis_d.view().yy(), - clusters_d.moduleStart(), - clusters_d.clusInModule(), - clusters_d.moduleId(), - digis_d.view().clus(), + findClus<<>>(digis_d->rawIdArr(), + digis_d->moduleId(), + digis_d->xx(), + digis_d->yy(), + clusters_d->moduleStart(), + 
clusters_d->clusInModule(), + clusters_d->moduleId(), + digis_d->clus(), numDigis); cudaCheck(cudaGetLastError()); @@ -788,12 +773,12 @@ namespace pixelgpudetails { // apply charge cut clusterChargeCut<<>>(clusterThresholds, - digis_d.view().moduleInd(), - digis_d.view().adc(), - clusters_d.moduleStart(), - clusters_d.clusInModule(), - clusters_d.moduleId(), - digis_d.view().clus(), + digis_d->moduleId(), + digis_d->adc(), + clusters_d->moduleStart(), + clusters_d->clusInModule(), + clusters_d->moduleId(), + digis_d->clus(), numDigis); cudaCheck(cudaGetLastError()); @@ -805,8 +790,10 @@ namespace pixelgpudetails { std::cout << "CUDA fillHitsModuleStart kernel launch \n"; #endif - fillHitsModuleStart<<<1, 1024, 0, stream>>>( - clusters_d.clusInModule(), clusters_d.clusModuleStart(), clusters_d.moduleStart(), nModules_Clusters_d.get()); + fillHitsModuleStart<<<1, 1024, 0, stream>>>(clusters_d->clusInModule(), + clusters_d->clusModuleStart(), + clusters_d->moduleStart(), + nModules_Clusters_d.get()); nModules_Clusters_h = cms::cuda::make_host_unique(3, stream); cudaCheck(cudaMemcpyAsync( diff --git a/RecoLocalTracker/SiPixelRecHits/plugins/PixelRecHitGPUKernel.cu b/RecoLocalTracker/SiPixelRecHits/plugins/PixelRecHitGPUKernel.cu index cb5b4b2f2c387..38d9ed1ad77e3 100644 --- a/RecoLocalTracker/SiPixelRecHits/plugins/PixelRecHitGPUKernel.cu +++ b/RecoLocalTracker/SiPixelRecHits/plugins/PixelRecHitGPUKernel.cu @@ -12,7 +12,7 @@ #include "PixelRecHitGPUKernel.h" #include "gpuPixelRecHits.h" -// #define GPU_DEBUG 1 +// #define GPU_DEBUG namespace { template @@ -42,7 +42,7 @@ namespace { namespace pixelgpudetails { template - TrackingRecHit2DGPUT PixelRecHitGPUKernel::makeHitsAsync( + TrackingRecHitSoADevice PixelRecHitGPUKernel::makeHitsAsync( SiPixelDigisCUDA const& digis_d, SiPixelClustersCUDA const& clusters_d, BeamSpotCUDA const& bs_d, @@ -51,8 +51,8 @@ namespace pixelgpudetails { using namespace gpuPixelRecHits; auto nHits = clusters_d.nClusters(); - TrackingRecHit2DGPUT hits_d( - nHits, clusters_d.offsetBPIX2(), cpeParams, clusters_d.clusModuleStart(), stream); + TrackingRecHitSoADevice hits_d( + nHits, clusters_d.offsetBPIX2(), cpeParams, clusters_d->clusModuleStart(), stream); int activeModulesWithDigis = digis_d.nModules(); // protect from empty events @@ -61,11 +61,10 @@ namespace pixelgpudetails { int blocks = activeModulesWithDigis; #ifdef GPU_DEBUG - std::cout << "launching getHits kernel for " << blocks << " blocks" << std::endl; #endif getHits<<>>( - cpeParams, bs_d.data(), digis_d.view(), digis_d.nDigis(), clusters_d.view(), hits_d.view()); + cpeParams, bs_d.data(), digis_d.view(), digis_d.nDigis(), clusters_d.const_view(), hits_d.view()); cudaCheck(cudaGetLastError()); #ifdef GPU_DEBUG cudaCheck(cudaDeviceSynchronize()); @@ -74,16 +73,16 @@ namespace pixelgpudetails { // assuming full warp of threads is better than a smaller number... 
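
The kernels launched just below derive the per-layer hit ranges from the per-module cluster offsets: hitsLayerStart[l] is essentially clusModuleStart evaluated at the first module index of layer l, and the phi binner is then filled layer by layer from those ranges. The following is a compact host-side sketch of that bookkeeping with toy sizes and counts, not the CUDA kernels themselves; the prefix sum stands in for what fillHitsModuleStart computes on the device.

#include <array>
#include <cstdint>
#include <cstdio>
#include <numeric>

int main() {
  // Toy geometry: 8 modules grouped into 3 layers; layerStart[l] is the first
  // module index of layer l, with a closing sentinel at the end.
  constexpr int nModules = 8;
  constexpr int nLayers = 3;
  constexpr std::array<uint32_t, nLayers + 1> layerStart{0, 3, 6, 8};

  // Clusters (future hits) per module, as produced by the clusterizer.
  std::array<uint32_t, nModules> clusInModule{4, 2, 0, 5, 1, 3, 2, 2};

  // Exclusive prefix sum -> clusModuleStart: index of the first hit of each module.
  std::array<uint32_t, nModules + 1> clusModuleStart{};
  std::partial_sum(clusInModule.begin(), clusInModule.end(), clusModuleStart.begin() + 1);

  // What setHitsLayerStart amounts to: one lookup per layer boundary.
  std::array<uint32_t, nLayers + 1> hitsLayerStart{};
  for (int l = 0; l <= nLayers; ++l)
    hitsLayerStart[l] = clusModuleStart[layerStart[l]];

  for (int l = 0; l < nLayers; ++l)
    std::printf("layer %d: hits [%u, %u)\n", l, hitsLayerStart[l], hitsLayerStart[l + 1]);
}

cms::cuda::fillManyFromVector then bins the hits in phi using exactly these per-layer ranges, reading iphi through the generated SoA view.
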
if (nHits) { setHitsLayerStart - <<<1, 32, 0, stream>>>(clusters_d.clusModuleStart(), cpeParams, hits_d.hitsLayerStart()); + <<<1, 32, 0, stream>>>(clusters_d->clusModuleStart(), cpeParams, hits_d.view().hitsLayerStart().data()); cudaCheck(cudaGetLastError()); constexpr auto nLayers = TrackerTraits::numberOfLayers; cms::cuda::fillManyFromVector(hits_d.phiBinner(), nLayers, - hits_d.iphi(), - hits_d.hitsLayerStart(), + hits_d.view().iphi(), + hits_d.view().hitsLayerStart().data(), nHits, 256, - hits_d.phiBinnerStorage(), + hits_d.view().phiBinnerStorage(), stream); cudaCheck(cudaGetLastError()); @@ -93,6 +92,11 @@ namespace pixelgpudetails { } } +#ifdef GPU_DEBUG + cudaCheck(cudaDeviceSynchronize()); + std::cout << "PixelRecHitGPUKernel -> DONE!" << std::endl; +#endif + return hits_d; } diff --git a/RecoLocalTracker/SiPixelRecHits/plugins/PixelRecHitGPUKernel.h b/RecoLocalTracker/SiPixelRecHits/plugins/PixelRecHitGPUKernel.h index 0a3c2b647f22e..25cc724cd4c4a 100644 --- a/RecoLocalTracker/SiPixelRecHits/plugins/PixelRecHitGPUKernel.h +++ b/RecoLocalTracker/SiPixelRecHits/plugins/PixelRecHitGPUKernel.h @@ -8,9 +8,9 @@ #include "CUDADataFormats/BeamSpot/interface/BeamSpotCUDA.h" #include "CUDADataFormats/SiPixelCluster/interface/SiPixelClustersCUDA.h" #include "CUDADataFormats/SiPixelDigi/interface/SiPixelDigisCUDA.h" -#include "CUDADataFormats/TrackingRecHit/interface/TrackingRecHit2DHeterogeneous.h" +#include "CUDADataFormats/TrackingRecHit/interface/TrackingRecHitSoADevice.h" #include "Geometry/CommonTopologies/interface/SimplePixelTopology.h" -//#define GPU_DEBUG 1 +//#define GPU_DEBUG namespace pixelgpudetails { template @@ -26,11 +26,11 @@ namespace pixelgpudetails { using ParamsOnGPU = pixelCPEforGPU::ParamsOnGPUT; - TrackingRecHit2DGPUT makeHitsAsync(SiPixelDigisCUDA const& digis_d, - SiPixelClustersCUDA const& clusters_d, - BeamSpotCUDA const& bs_d, - ParamsOnGPU const* cpeParams, - cudaStream_t stream) const; + TrackingRecHitSoADevice makeHitsAsync(SiPixelDigisCUDA const& digis_d, + SiPixelClustersCUDA const& clusters_d, + BeamSpotCUDA const& bs_d, + ParamsOnGPU const* cpeParams, + cudaStream_t stream) const; }; } // namespace pixelgpudetails diff --git a/RecoLocalTracker/SiPixelRecHits/plugins/SiPixelRecHitCUDA.cc b/RecoLocalTracker/SiPixelRecHits/plugins/SiPixelRecHitCUDA.cc index b23fa7dcc11ed..3bf0cf670a577 100644 --- a/RecoLocalTracker/SiPixelRecHits/plugins/SiPixelRecHitCUDA.cc +++ b/RecoLocalTracker/SiPixelRecHits/plugins/SiPixelRecHitCUDA.cc @@ -4,7 +4,7 @@ #include "CUDADataFormats/Common/interface/Product.h" #include "CUDADataFormats/SiPixelCluster/interface/SiPixelClustersCUDA.h" #include "CUDADataFormats/SiPixelDigi/interface/SiPixelDigisCUDA.h" -#include "CUDADataFormats/TrackingRecHit/interface/TrackingRecHit2DHeterogeneous.h" +#include "CUDADataFormats/TrackingRecHit/interface/TrackingRecHitSoADevice.h" #include "DataFormats/Common/interface/Handle.h" #include "FWCore/Framework/interface/Event.h" #include "FWCore/Framework/interface/EventSetup.h" @@ -39,7 +39,7 @@ class SiPixelRecHitCUDAT : public edm::global::EDProducer<> { const edm::EDGetTokenT> tBeamSpot; const edm::EDGetTokenT> token_; const edm::EDGetTokenT> tokenDigi_; - const edm::EDPutTokenT>> tokenHit_; + const edm::EDPutTokenT>> tokenHit_; const pixelgpudetails::PixelRecHitGPUKernel gpuAlgo_; }; @@ -50,7 +50,7 @@ SiPixelRecHitCUDAT::SiPixelRecHitCUDAT(const edm::ParameterSet& i tBeamSpot(consumes>(iConfig.getParameter("beamSpot"))), token_(consumes>(iConfig.getParameter("src"))), 
tokenDigi_(consumes>(iConfig.getParameter("src"))), - tokenHit_(produces>>()) {} + tokenHit_(produces>>()) {} template void SiPixelRecHitCUDAT::fillDescriptions(edm::ConfigurationDescriptions& descriptions) { diff --git a/RecoLocalTracker/SiPixelRecHits/plugins/SiPixelRecHitFromCUDA.cc b/RecoLocalTracker/SiPixelRecHits/plugins/SiPixelRecHitFromCUDA.cc index 1428efe06a1d1..c639d5cc4fefa 100644 --- a/RecoLocalTracker/SiPixelRecHits/plugins/SiPixelRecHitFromCUDA.cc +++ b/RecoLocalTracker/SiPixelRecHits/plugins/SiPixelRecHitFromCUDA.cc @@ -4,7 +4,7 @@ #include "CUDADataFormats/Common/interface/HostProduct.h" #include "CUDADataFormats/Common/interface/Product.h" -#include "CUDADataFormats/TrackingRecHit/interface/TrackingRecHit2DHeterogeneous.h" +#include "CUDADataFormats/TrackingRecHit/interface/TrackingRecHitSoADevice.h" #include "DataFormats/Common/interface/DetSetVectorNew.h" #include "DataFormats/Common/interface/Handle.h" #include "DataFormats/SiPixelCluster/interface/SiPixelCluster.h" @@ -33,7 +33,7 @@ class SiPixelRecHitFromCUDAT : public edm::stream::EDProducer static void fillDescriptions(edm::ConfigurationDescriptions& descriptions); using HMSstorage = HostProduct; - using HitsOnGPU = TrackingRecHit2DGPUT; + using HitsOnGPU = TrackingRecHitSoADevice; private: void acquire(edm::Event const& iEvent, diff --git a/RecoLocalTracker/SiPixelRecHits/plugins/SiPixelRecHitSoAFromCUDA.cc b/RecoLocalTracker/SiPixelRecHits/plugins/SiPixelRecHitSoAFromCUDA.cc index 8bcb218255548..15bc0c8df70b5 100644 --- a/RecoLocalTracker/SiPixelRecHits/plugins/SiPixelRecHitSoAFromCUDA.cc +++ b/RecoLocalTracker/SiPixelRecHits/plugins/SiPixelRecHitSoAFromCUDA.cc @@ -4,7 +4,8 @@ #include "CUDADataFormats/Common/interface/HostProduct.h" #include "CUDADataFormats/Common/interface/Product.h" -#include "CUDADataFormats/TrackingRecHit/interface/TrackingRecHit2DHeterogeneous.h" +#include "CUDADataFormats/TrackingRecHit/interface/TrackingRecHitSoAHost.h" +#include "CUDADataFormats/TrackingRecHit/interface/TrackingRecHitSoADevice.h" #include "DataFormats/Common/interface/DetSetVectorNew.h" #include "DataFormats/Common/interface/Handle.h" #include "DataFormats/SiPixelCluster/interface/SiPixelCluster.h" @@ -32,7 +33,8 @@ class SiPixelRecHitSoAFromCUDAT : public edm::stream::EDProducer; - using TrackingRecHit2DSOAView = TrackingRecHit2DSOAViewT; + using HitsOnHost = TrackingRecHitSoAHost; + using HitsOnDevice = TrackingRecHitSoADevice; private: void acquire(edm::Event const& iEvent, @@ -40,21 +42,18 @@ class SiPixelRecHitSoAFromCUDAT : public edm::stream::EDProducer>> hitsTokenGPU_; // CUDA hits - const edm::EDPutTokenT> hitsPutTokenCPU_; + const edm::EDGetTokenT> hitsTokenGPU_; // CUDA hits + const edm::EDPutTokenT hitsPutTokenCPU_; const edm::EDPutTokenT hostPutToken_; uint32_t nHits_; - - cms::cuda::host::unique_ptr store32_; - cms::cuda::host::unique_ptr store16_; - cms::cuda::host::unique_ptr hitsModuleStart_; + HitsOnHost hits_h_; }; template SiPixelRecHitSoAFromCUDAT::SiPixelRecHitSoAFromCUDAT(const edm::ParameterSet& iConfig) : hitsTokenGPU_(consumes(iConfig.getParameter("pixelRecHitSrc"))), - hitsPutTokenCPU_(produces>()), + hitsPutTokenCPU_(produces()), hostPutToken_(produces()) {} template @@ -69,18 +68,18 @@ template void SiPixelRecHitSoAFromCUDAT::acquire(edm::Event const& iEvent, edm::EventSetup const& iSetup, edm::WaitingTaskWithArenaHolder waitingTaskHolder) { - cms::cuda::Product> const& inputDataWrapped = iEvent.get(hitsTokenGPU_); + cms::cuda::Product const& inputDataWrapped = iEvent.get(hitsTokenGPU_); 
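
The acquire() body around this point replaces the old per-column host copies with a single cudaMemcpyAsync of the collection's one contiguous buffer into an equally sized host collection. A self-contained CUDA sketch of that single-buffer transfer pattern follows; the two-column payload (xLocal, iphi) and its packing are hypothetical stand-ins for the real hit SoA, which also carries alignment padding and is managed by the Portable*Collection wrappers rather than raw cudaMalloc/cudaMallocHost.

#include <cstdint>
#include <cstdio>
#include <cuda_runtime.h>

#define CUDA_CHECK(x)                                            \
  do {                                                           \
    cudaError_t err = (x);                                       \
    if (err != cudaSuccess) {                                    \
      std::printf("CUDA error: %s\n", cudaGetErrorString(err));  \
      return 1;                                                  \
    }                                                            \
  } while (0)

int main() {
  constexpr int n = 1024;
  // One contiguous buffer holding two columns back to back.
  const size_t bufferSize = n * sizeof(float) + n * sizeof(uint16_t);

  cudaStream_t stream;
  CUDA_CHECK(cudaStreamCreate(&stream));

  unsigned char *d_buf = nullptr, *h_buf = nullptr;
  CUDA_CHECK(cudaMalloc(reinterpret_cast<void**>(&d_buf), bufferSize));
  CUDA_CHECK(cudaMallocHost(reinterpret_cast<void**>(&h_buf), bufferSize));  // pinned, so the async copy overlaps

  CUDA_CHECK(cudaMemsetAsync(d_buf, 0, bufferSize, stream));  // stand-in for the GPU producer filling the SoA

  // The whole collection moves in a single device-to-host transfer,
  // regardless of how many columns it contains.
  CUDA_CHECK(cudaMemcpyAsync(h_buf, d_buf, bufferSize, cudaMemcpyDeviceToHost, stream));
  CUDA_CHECK(cudaStreamSynchronize(stream));  // wait before reading; the ExternalWork callback plays this role in the framework

  // Columns are recovered by offset into the host buffer.
  const float* xLocal = reinterpret_cast<const float*>(h_buf);
  const uint16_t* iphi = reinterpret_cast<const uint16_t*>(h_buf + n * sizeof(float));
  std::printf("first hit: xLocal=%f iphi=%hu\n", xLocal[0], iphi[0]);

  CUDA_CHECK(cudaFreeHost(h_buf));
  CUDA_CHECK(cudaFree(d_buf));
  CUDA_CHECK(cudaStreamDestroy(stream));
  return 0;
}
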
cms::cuda::ScopedContextAcquire ctx{inputDataWrapped, std::move(waitingTaskHolder)}; auto const& inputData = ctx.get(inputDataWrapped); nHits_ = inputData.nHits(); + hits_h_ = HitsOnHost(nHits_, ctx.stream()); + cudaCheck(cudaMemcpyAsync(hits_h_.buffer().get(), + inputData.const_buffer().get(), + inputData.bufferSize(), + cudaMemcpyDeviceToHost, + ctx.stream())); // Copy data from Device to Host LogDebug("SiPixelRecHitSoAFromCUDA") << "copying to cpu SoA" << inputData.nHits() << " Hits"; - - if (0 == nHits_) - return; - store32_ = inputData.store32ToHostAsync(ctx.stream()); - store16_ = inputData.store16ToHostAsync(ctx.stream()); - hitsModuleStart_ = inputData.hitsModuleStartToHostAsync(ctx.stream()); } template @@ -88,10 +87,10 @@ void SiPixelRecHitSoAFromCUDAT::produce(edm::Event& iEvent, edm:: auto hmsp = std::make_unique(TrackerTraits::numberOfModules + 1); if (nHits_ > 0) - std::copy(hitsModuleStart_.get(), hitsModuleStart_.get() + TrackerTraits::numberOfModules + 1, hmsp.get()); + std::copy(hits_h_.view().hitsModuleStart().begin(), hits_h_.view().hitsModuleStart().end(), hmsp.get()); iEvent.emplace(hostPutToken_, std::move(hmsp)); - iEvent.emplace(hitsPutTokenCPU_, store32_, store16_, hitsModuleStart_.get(), nHits_); + iEvent.emplace(hitsPutTokenCPU_, std::move(hits_h_)); } using SiPixelRecHitSoAFromCUDA = SiPixelRecHitSoAFromCUDAT; diff --git a/RecoLocalTracker/SiPixelRecHits/plugins/SiPixelRecHitSoAFromLegacy.cc b/RecoLocalTracker/SiPixelRecHits/plugins/SiPixelRecHitSoAFromLegacy.cc index 1edc7870f4800..dfc18d31154f2 100644 --- a/RecoLocalTracker/SiPixelRecHits/plugins/SiPixelRecHitSoAFromLegacy.cc +++ b/RecoLocalTracker/SiPixelRecHits/plugins/SiPixelRecHitSoAFromLegacy.cc @@ -3,7 +3,8 @@ #include "CUDADataFormats/BeamSpot/interface/BeamSpotCUDA.h" #include "CUDADataFormats/SiPixelCluster/interface/SiPixelClustersCUDA.h" #include "CUDADataFormats/SiPixelDigi/interface/SiPixelDigisCUDA.h" -#include "CUDADataFormats/TrackingRecHit/interface/TrackingRecHit2DHeterogeneous.h" +#include "CUDADataFormats/TrackingRecHit/interface/TrackingRecHitSoAHost.h" +#include "CUDADataFormats/Common/interface/PortableHostCollection.h" #include "CUDADataFormats/Common/interface/HostProduct.h" #include "DataFormats/BeamSpot/interface/BeamSpot.h" #include "DataFormats/Common/interface/DetSetVectorNew.h" @@ -35,8 +36,9 @@ class SiPixelRecHitSoAFromLegacyT : public edm::global::EDProducer<> { static void fillDescriptions(edm::ConfigurationDescriptions& descriptions); - using HitModuleStart = std::array; + using HitModuleStart = std::array; using HMSstorage = HostProduct; + using HitsOnHost = TrackingRecHitSoAHost; private: void produce(edm::StreamID streamID, edm::Event& iEvent, const edm::EventSetup& iSetup) const override; @@ -45,7 +47,7 @@ class SiPixelRecHitSoAFromLegacyT : public edm::global::EDProducer<> { const edm::ESGetToken cpeToken_; const edm::EDGetTokenT bsGetToken_; const edm::EDGetTokenT clusterToken_; // Legacy Clusters - const edm::EDPutTokenT> tokenHit_; + const edm::EDPutTokenT tokenHit_; const edm::EDPutTokenT tokenModuleStart_; const bool convert2Legacy_; }; @@ -56,7 +58,7 @@ SiPixelRecHitSoAFromLegacyT::SiPixelRecHitSoAFromLegacyT(const ed cpeToken_(esConsumes(edm::ESInputTag("", iConfig.getParameter("CPE")))), bsGetToken_{consumes(iConfig.getParameter("beamSpot"))}, clusterToken_{consumes(iConfig.getParameter("src"))}, - tokenHit_{produces>()}, + tokenHit_{produces()}, tokenModuleStart_{produces()}, convert2Legacy_(iConfig.getParameter("convertToLegacy")) { if (convert2Legacy_) @@ 
-99,12 +101,11 @@ void SiPixelRecHitSoAFromLegacyT::produce(edm::StreamID streamID, iEvent.getByToken(clusterToken_, hclusters); auto const& input = *hclusters; - constexpr int maxModules = TrackerTraits::numberOfModules; + constexpr int nModules = TrackerTraits::numberOfModules; constexpr int startBPIX2 = pixelTopology::layerStart(1); // allocate a buffer for the indices of the clusters - auto hmsp = std::make_unique(maxModules + 1); - // hitsModuleStart is a non-owning pointer to the buffer + auto hmsp = std::make_unique(nModules + 1); auto hitsModuleStart = hmsp.get(); // wrap the buffer in a HostProduct auto hms = std::make_unique(std::move(hmsp)); @@ -114,28 +115,19 @@ void SiPixelRecHitSoAFromLegacyT::produce(edm::StreamID streamID, // legacy output auto legacyOutput = std::make_unique(); - // storage - std::vector xx; - std::vector yy; - std::vector adc; - std::vector moduleInd; - std::vector clus; - std::vector, SiPixelCluster>> clusterRef; constexpr uint32_t maxHitsInModule = gpuClustering::maxHitsInModule(); - HitModuleStart moduleStart_; // index of the first pixel of each module - HitModuleStart clusInModule_; - memset(&clusInModule_, 0, sizeof(HitModuleStart)); // needed?? - memset(&moduleStart_, 0, sizeof(HitModuleStart)); - assert(gpuClustering::maxNumModules + 1 == clusInModule_.size()); - assert(0 == clusInModule_[gpuClustering::maxNumModules]); - uint32_t moduleId_; - moduleStart_[1] = 0; // we run sequentially.... + cms::cuda::PortableHostCollection> clusters_h(nModules + 1); + + memset(clusters_h.view().clusInModule(), 0, (nModules + 1) * sizeof(uint32_t)); // needed?? + memset(clusters_h.view().moduleStart(), 0, (nModules + 1) * sizeof(uint32_t)); + memset(clusters_h.view().moduleId(), 0, (nModules + 1) * sizeof(uint32_t)); + memset(clusters_h.view().clusModuleStart(), 0, (nModules + 1) * sizeof(uint32_t)); - SiPixelClustersCUDA::SiPixelClustersCUDASOAView clusterView{ - moduleStart_.data(), clusInModule_.data(), &moduleId_, hitsModuleStart}; + assert(0 == clusters_h.view()[nModules].clusInModule()); + clusters_h.view()[1].moduleStart() = 0; // fill cluster arrays int numberOfClusters = 0; @@ -144,33 +136,33 @@ void SiPixelRecHitSoAFromLegacyT::produce(edm::StreamID streamID, DetId detIdObject(detid); const GeomDetUnit* genericDet = geom_->idToDetUnit(detIdObject); auto gind = genericDet->index(); - assert(gind < maxModules); + assert(gind < nModules); auto const nclus = dsv.size(); - clusInModule_[gind] = nclus; + clusters_h.view()[gind].clusInModule() = nclus; numberOfClusters += nclus; } - hitsModuleStart[0] = 0; - - for (int i = 1, n = maxModules + 1; i < n; ++i) - hitsModuleStart[i] = hitsModuleStart[i - 1] + clusInModule_[i - 1]; + clusters_h.view()[0].clusModuleStart() = 0; - assert(numberOfClusters == int(hitsModuleStart[maxModules])); + for (int i = 1; i < nModules + 1; ++i) { + clusters_h.view()[i].clusModuleStart() = + clusters_h.view()[i - 1].clusModuleStart() + clusters_h.view()[i - 1].clusInModule(); + } + assert((uint32_t)numberOfClusters == clusters_h.view()[nModules].clusModuleStart()); // output SoA // element 96 is the start of BPIX2 (i.e. 
the number of clusters in BPIX1) - - auto output = std::make_unique>( - numberOfClusters, hitsModuleStart[startBPIX2], &cpeView, hitsModuleStart, nullptr); + HitsOnHost output( + numberOfClusters, clusters_h.view()[startBPIX2].clusModuleStart(), &cpeView, clusters_h.view().clusModuleStart()); if (0 == numberOfClusters) { - iEvent.put(std::move(output)); + iEvent.emplace(tokenHit_, std::move(output)); if (convert2Legacy_) iEvent.put(std::move(legacyOutput)); return; } if (convert2Legacy_) - legacyOutput->reserve(maxModules, numberOfClusters); + legacyOutput->reserve(nModules, numberOfClusters); int numberOfDetUnits = 0; int numberOfHits = 0; @@ -180,16 +172,17 @@ void SiPixelRecHitSoAFromLegacyT::produce(edm::StreamID streamID, DetId detIdObject(detid); const GeomDetUnit* genericDet = geom_->idToDetUnit(detIdObject); auto const gind = genericDet->index(); - assert(gind < maxModules); + assert(gind < nModules); const PixelGeomDetUnit* pixDet = dynamic_cast(genericDet); assert(pixDet); auto const nclus = dsv.size(); - assert(clusInModule_[gind] == nclus); + + assert(clusters_h.view()[gind].clusInModule() == nclus); if (0 == nclus) continue; // is this really possible? - auto const fc = hitsModuleStart[gind]; - auto const lc = hitsModuleStart[gind + 1]; + auto const fc = clusters_h.view()[gind].clusModuleStart(); + auto const lc = clusters_h.view()[gind + 1].clusModuleStart(); assert(lc > fc); LogDebug("SiPixelRecHitSoAFromLegacy") << "in det " << gind << ": conv " << nclus << " hits from " << dsv.size() << " legacy clusters" << ' ' << fc << ',' << lc; @@ -198,25 +191,30 @@ void SiPixelRecHitSoAFromLegacyT::produce(edm::StreamID streamID, printf( "WARNING: too many clusters %d in Module %d. Only first %d Hits converted\n", nclus, gind, maxHitsInModule); - // fill digis - xx.clear(); - yy.clear(); - adc.clear(); - moduleInd.clear(); - clus.clear(); + // count digis + uint32_t ndigi = 0; + for (auto const& clust : dsv) { + assert(clust.size() > 0); + ndigi += clust.size(); + } + + cms::cuda::PortableHostCollection> digis_h(ndigi); + clusterRef.clear(); - moduleId_ = gind; + clusters_h.view()[0].moduleId() = gind; + uint32_t ic = 0; - uint32_t ndigi = 0; + ndigi = 0; + //filling digis for (auto const& clust : dsv) { assert(clust.size() > 0); for (int i = 0, nd = clust.size(); i < nd; ++i) { auto px = clust.pixel(i); - xx.push_back(px.x); - yy.push_back(px.y); - adc.push_back(px.adc); - moduleInd.push_back(gind); - clus.push_back(ic); + digis_h.view()[ndigi].xx() = px.x; + digis_h.view()[ndigi].yy() = px.y; + digis_h.view()[ndigi].adc() = px.adc; + digis_h.view()[ndigi].moduleId() = gind; + digis_h.view()[ndigi].clus() = ic; ++ndigi; } @@ -225,25 +223,19 @@ void SiPixelRecHitSoAFromLegacyT::produce(edm::StreamID streamID, ic++; } assert(nclus == ic); - assert(clus.size() == ndigi); + numberOfHits += nclus; // filled creates view - SiPixelDigisCUDASOAView digiView; - digiView.xx_ = xx.data(); - digiView.yy_ = yy.data(); - digiView.adc_ = adc.data(); - digiView.moduleInd_ = moduleInd.data(); - digiView.clus_ = clus.data(); - digiView.pdigi_ = nullptr; - digiView.rawIdArr_ = nullptr; - assert(digiView.adc(0) != 0); + assert(digis_h.view()[0].adc() != 0); // we run on blockId.x==0 - gpuPixelRecHits::getHits(&cpeView, &bsHost, digiView, ndigi, &clusterView, output->view()); + + gpuPixelRecHits::getHits(&cpeView, &bsHost, digis_h.view(), ndigi, clusters_h.view(), output.view()); for (auto h = fc; h < lc; ++h) if (h - fc < maxHitsInModule) - assert(gind == output->view()->detectorIndex(h)); + assert(gind 
== output.view()[h].detectorIndex()); else - assert(gpuClustering::invalidModuleId == output->view()->detectorIndex(h)); + assert(gpuClustering::invalidModuleId == output.view()[h].detectorIndex()); + if (convert2Legacy_) { SiPixelRecHitCollectionNew::FastFiller recHitsOnDetUnit(*legacyOutput, detid); for (auto h = fc; h < lc; ++h) { @@ -253,8 +245,9 @@ void SiPixelRecHitSoAFromLegacyT::produce(edm::StreamID streamID, break; assert(ih < clusterRef.size()); - LocalPoint lp(output->view()->xLocal(h), output->view()->yLocal(h)); - LocalError le(output->view()->xerrLocal(h), 0, output->view()->yerrLocal(h)); + LocalPoint lp(output.view()[h].xLocal(), output.view()[h].yLocal()); + LocalError le(output.view()[h].xerrLocal(), 0, output.view()[h].yerrLocal()); + SiPixelRecHitQuality::QualWordType rqw = 0; SiPixelRecHit hit(lp, le, rqw, *genericDet, clusterRef[ih]); recHitsOnDetUnit.push_back(hit); @@ -267,24 +260,28 @@ void SiPixelRecHitSoAFromLegacyT::produce(edm::StreamID streamID, // fill data structure to support CA constexpr auto nLayers = TrackerTraits::numberOfLayers; for (auto i = 0U; i < nLayers + 1; ++i) { - output->hitsLayerStart()[i] = hitsModuleStart[cpeView.layerGeometry().layerStart[i]]; + output.view().hitsLayerStart()[i] = clusters_h.view()[cpeView.layerGeometry().layerStart[i]].clusModuleStart(); LogDebug("SiPixelRecHitSoAFromLegacy") << "Layer n." << i << " - starting at module: " << cpeView.layerGeometry().layerStart[i] << " - starts ad cluster: " << output->hitsLayerStart()[i] << "\n"; } - cms::cuda::fillManyFromVector(output->phiBinner(), + cms::cuda::fillManyFromVector(&(output.view().phiBinner()), nLayers, - output->iphi(), - output->hitsLayerStart(), - numberOfHits, + output.view().iphi(), + output.view().hitsLayerStart().data(), + output.view().nHits(), 256, - output->phiBinnerStorage()); + output.view().phiBinnerStorage()); LogDebug("SiPixelRecHitSoAFromLegacy") << "created HitSoa for " << numberOfClusters << " clusters in " << numberOfDetUnits << " Dets" << "\n"; - iEvent.put(std::move(output)); + + // copy pointer to data (SoA view) to allocated buffer + memcpy(hitsModuleStart, clusters_h.view().clusModuleStart(), nModules * sizeof(uint32_t)); + + iEvent.emplace(tokenHit_, std::move(output)); if (convert2Legacy_) iEvent.put(std::move(legacyOutput)); } diff --git a/RecoLocalTracker/SiPixelRecHits/plugins/gpuPixelRecHits.h b/RecoLocalTracker/SiPixelRecHits/plugins/gpuPixelRecHits.h index f0798cc74a975..09d0b55030d9c 100644 --- a/RecoLocalTracker/SiPixelRecHits/plugins/gpuPixelRecHits.h +++ b/RecoLocalTracker/SiPixelRecHits/plugins/gpuPixelRecHits.h @@ -7,11 +7,11 @@ #include "CUDADataFormats/BeamSpot/interface/BeamSpotCUDA.h" #include "CUDADataFormats/SiPixelCluster/interface/gpuClusteringConstants.h" -#include "CUDADataFormats/TrackingRecHit/interface/TrackingRecHit2DHeterogeneous.h" +#include "CUDADataFormats/TrackingRecHit/interface/TrackingRecHitsUtilities.h" #include "DataFormats/Math/interface/approx_atan2.h" #include "HeterogeneousCore/CUDAUtilities/interface/cuda_assert.h" #include "RecoLocalTracker/SiPixelRecHits/interface/pixelCPEforGPU.h" -#include "CUDADataFormats/SiPixelDigi/interface/SiPixelDigisCUDASOAView.h" +#include "CUDADataFormats/SiPixelDigi/interface/SiPixelDigisCUDA.h" //#define GPU_DEBUG 1 namespace gpuPixelRecHits { @@ -19,20 +19,17 @@ namespace gpuPixelRecHits { template __global__ void getHits(pixelCPEforGPU::ParamsOnGPUT const* __restrict__ cpeParams, BeamSpotPOD const* __restrict__ bs, - SiPixelDigisCUDASOAView const digis, + 
SiPixelDigisCUDASOAConstView digis, int numElements, - SiPixelClustersCUDA::SiPixelClustersCUDASOAView const* __restrict__ pclusters, - TrackingRecHit2DSOAViewT* phits) { + SiPixelClustersCUDASOAConstView clusters, + TrackingRecHitSoAView hits) { // FIXME // the compiler seems NOT to optimize loads from views (even in a simple test case) // The whole gimnastic here of copying or not is a pure heuristic exercise that seems to produce the fastest code with the above signature // not using views (passing a gazzilion of array pointers) seems to produce the fastest code (but it is harder to mantain) - assert(phits); assert(cpeParams); - auto& hits = *phits; - auto const& clusters = *pclusters; // copy average geometry corrected by beamspot . FIXME (move it somewhere else???) if (0 == blockIdx.x) { auto& agc = hits.averageGeometry(); @@ -51,7 +48,6 @@ namespace gpuPixelRecHits { if (0 == threadIdx.x) { agc.endCapZ[0] = ag.endCapZ[0] - bs->z; agc.endCapZ[1] = ag.endCapZ[1] - bs->z; - // printf("endcapZ %f %f\n",agc.endCapZ[0],agc.endCapZ[1]); } } @@ -64,23 +60,22 @@ namespace gpuPixelRecHits { // as usual one block per module __shared__ ClusParams clusParams; - auto me = clusters.moduleId(blockIdx.x); - int nclus = clusters.clusInModule(me); + auto me = clusters[blockIdx.x].moduleId(); + int nclus = clusters[me].clusInModule(); if (0 == nclus) return; -// #ifdef GPU_DEBUG -// if (threadIdx.x == 0) { -// auto k = clusters.moduleStart(1 + blockIdx.x); -// while (digis.moduleInd(k) == invalidModuleId) -// ++k; -// assert(digis.moduleInd(k) == me); -// } -// #endif #ifdef GPU_DEBUG + if (threadIdx.x == 0) { + auto k = clusters[1 + blockIdx.x].moduleStart(); + while (digis[k].moduleId() == invalidModuleId) + ++k; + assert(digis[k].moduleId() == me); + } + if (me % 100 == 1) if (threadIdx.x == 0) - printf("hitbuilder: %d clusters in module %d. will write at %d\n", nclus, me, clusters.clusModuleStart(me)); + printf("hitbuilder: %d clusters in module %d. 
will write at %d\n", nclus, me, clusters[me].clusModuleStart()); #endif for (int startClus = 0, endClus = nclus; startClus < endClus; startClus += MaxHitsInIter) { @@ -108,21 +103,21 @@ namespace gpuPixelRecHits { __syncthreads(); // one thread per "digi" - auto first = clusters.moduleStart(1 + blockIdx.x) + threadIdx.x; + auto first = clusters[1 + blockIdx.x].moduleStart() + threadIdx.x; for (int i = first; i < numElements; i += blockDim.x) { - auto id = digis.moduleInd(i); + auto id = digis[i].moduleId(); if (id == invalidModuleId) continue; // not valid if (id != me) break; // end of module - auto cl = digis.clus(i); + auto cl = digis[i].clus(); if (cl < startClus || cl >= lastClus) continue; cl -= startClus; assert(cl >= 0); assert(cl < MaxHitsInIter); - auto x = digis.xx(i); - auto y = digis.yy(i); + auto x = digis[i].xx(); + auto y = digis[i].yy(); atomicMin(&clusParams.minRow[cl], x); atomicMax(&clusParams.maxRow[cl], x); atomicMin(&clusParams.minCol[cl], y); @@ -133,20 +128,20 @@ namespace gpuPixelRecHits { auto pixmx = cpeParams->detParams(me).pixmx; for (int i = first; i < numElements; i += blockDim.x) { - auto id = digis.moduleInd(i); + auto id = digis[i].moduleId(); if (id == invalidModuleId) continue; // not valid if (id != me) break; // end of module - auto cl = digis.clus(i); + auto cl = digis[i].clus(); if (cl < startClus || cl >= lastClus) continue; cl -= startClus; assert(cl >= 0); assert(cl < MaxHitsInIter); - auto x = digis.xx(i); - auto y = digis.yy(i); - auto ch = digis.adc(i); + auto x = digis[i].xx(); + auto y = digis[i].yy(); + auto ch = digis[i].adc(); atomicAdd(&clusParams.charge[cl], ch); ch = std::min(ch, pixmx); if (clusParams.minRow[cl] == x) @@ -163,30 +158,31 @@ namespace gpuPixelRecHits { // next one cluster per thread... 
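Aside, not part of the patch: the digis[i].xx() and clusters[me].clusInModule() calls above are the element accessors of the generated SoA views, replacing the per-index methods of the old hand-written SOAView classes removed in this hunk. A minimal host-side sketch of the same idiom follows; sumAdcInModule is an illustrative helper, and it assumes only what is visible in this PR (operator[] yielding an element with per-column getters, metadata().size() giving the number of rows).

    #include <cstdint>

    #include "CUDADataFormats/SiPixelDigi/interface/SiPixelDigisCUDA.h"

    // Illustrative only: sum the ADC of all digis belonging to one module,
    // reading the columns through the generated const view's element accessors.
    inline uint64_t sumAdcInModule(SiPixelDigisCUDASOAConstView const& digis, uint16_t module) {
      uint64_t sum = 0;
      for (int32_t i = 0; i < digis.metadata().size(); ++i) {
        if (digis[i].moduleId() != module)
          continue;              // digi belongs to another module
        sum += digis[i].adc();   // same accessor pattern as in getHits() above
      }
      return sum;
    }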
- first = clusters.clusModuleStart(me) + startClus; + first = clusters[me].clusModuleStart() + startClus; for (int ic = threadIdx.x; ic < nClusInIter; ic += blockDim.x) { auto h = first + ic; // output index in global memory assert(h < hits.nHits()); - assert(h < clusters.clusModuleStart(me + 1)); + assert(h < clusters[me + 1].clusModuleStart()); pixelCPEforGPU::position(cpeParams->commonParams(), cpeParams->detParams(me), clusParams, ic); pixelCPEforGPU::errorFromDB(cpeParams->commonParams(), cpeParams->detParams(me), clusParams, ic); // store it - hits.setChargeAndStatus(h, clusParams.charge[ic], clusParams.status[ic]); - hits.detectorIndex(h) = me; + hits[h].chargeAndStatus().charge = clusParams.charge[ic]; + hits[h].chargeAndStatus().status = clusParams.status[ic]; + hits[h].detectorIndex() = me; float xl, yl; - hits.xLocal(h) = xl = clusParams.xpos[ic]; - hits.yLocal(h) = yl = clusParams.ypos[ic]; + hits[h].xLocal() = xl = clusParams.xpos[ic]; + hits[h].yLocal() = yl = clusParams.ypos[ic]; - hits.clusterSizeX(h) = clusParams.xsize[ic]; - hits.clusterSizeY(h) = clusParams.ysize[ic]; + hits[h].clusterSizeX() = clusParams.xsize[ic]; + hits[h].clusterSizeY() = clusParams.ysize[ic]; - hits.xerrLocal(h) = clusParams.xerr[ic] * clusParams.xerr[ic] + cpeParams->detParams(me).apeXX; - hits.yerrLocal(h) = clusParams.yerr[ic] * clusParams.yerr[ic] + cpeParams->detParams(me).apeYY; + hits[h].xerrLocal() = clusParams.xerr[ic] * clusParams.xerr[ic] + cpeParams->detParams(me).apeXX; + hits[h].yerrLocal() = clusParams.yerr[ic] * clusParams.yerr[ic] + cpeParams->detParams(me).apeYY; // keep it local for computations float xg, yg, zg; @@ -197,12 +193,12 @@ namespace gpuPixelRecHits { yg -= bs->y; zg -= bs->z; - hits.xGlobal(h) = xg; - hits.yGlobal(h) = yg; - hits.zGlobal(h) = zg; + hits[h].xGlobal() = xg; + hits[h].yGlobal() = yg; + hits[h].zGlobal() = zg; - hits.rGlobal(h) = std::sqrt(xg * xg + yg * yg); - hits.iphi(h) = unsafe_atan2s<7>(yg, xg); + hits[h].rGlobal() = std::sqrt(xg * xg + yg * yg); + hits[h].iphi() = unsafe_atan2s<7>(yg, xg); } __syncthreads(); } // end loop on batches diff --git a/RecoLocalTracker/SiPixelRecHits/python/SiPixelRecHits_cfi.py b/RecoLocalTracker/SiPixelRecHits/python/SiPixelRecHits_cfi.py index ec3e068bca422..7284dab68f05e 100644 --- a/RecoLocalTracker/SiPixelRecHits/python/SiPixelRecHits_cfi.py +++ b/RecoLocalTracker/SiPixelRecHits/python/SiPixelRecHits_cfi.py @@ -56,7 +56,7 @@ siPixelRecHitsPreSplittingSoA = SwitchProducerCUDA( cpu = cms.EDAlias( siPixelRecHitsPreSplittingCPU = cms.VPSet( - cms.PSet(type = cms.string("pixelTopologyPhase1TrackingRecHit2DCPUT")), + cms.PSet(type = cms.string("pixelTopologyPhase1TrackingRecHitSoAHost")), cms.PSet(type = cms.string("uintAsHostProduct")) )), ) @@ -64,7 +64,7 @@ phase2_tracker.toModify(siPixelRecHitsPreSplittingSoA, cpu = cms.EDAlias( siPixelRecHitsPreSplittingCPU = cms.VPSet( - cms.PSet(type = cms.string("pixelTopologyPhase2TrackingRecHit2DCPUT")), + cms.PSet(type = cms.string("pixelTopologyPhase2TrackingRecHitSoAHost")), cms.PSet(type = cms.string("uintAsHostProduct")) ))) diff --git a/RecoPixelVertexing/PixelTrackFitting/plugins/PixelTrackDumpCUDA.cc b/RecoPixelVertexing/PixelTrackFitting/plugins/PixelTrackDumpCUDA.cc index ef73c625ebfa8..ac58a494cdf58 100644 --- a/RecoPixelVertexing/PixelTrackFitting/plugins/PixelTrackDumpCUDA.cc +++ b/RecoPixelVertexing/PixelTrackFitting/plugins/PixelTrackDumpCUDA.cc @@ -1,9 +1,7 @@ #include +#include // needed here by soa layout #include "CUDADataFormats/Common/interface/Product.h" 
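Aside, not part of the patch: a minimal sketch of what consuming the new host-side track SoA looks like downstream, using only accessors that appear in this PR (buffer(), view().nTracks(), view()[i].pt(), view()[i].chi2()). dumpTracks is an illustrative function name, and TrackerTraits stands for whichever pixel topology the producer was instantiated with.

    #include <cassert>
    #include <cstdint>
    #include <cstdio>

    #include "CUDADataFormats/Track/interface/TrackSoAHeterogeneousHost.h"

    // Illustrative only: print a few per-track columns from a host-side track SoA.
    template <typename TrackerTraits>
    void dumpTracks(TrackSoAHeterogeneousHost<TrackerTraits> const& tracks_h) {
      assert(tracks_h.buffer());            // the collection owns a valid host buffer
      auto const& view = tracks_h.view();
      auto const nTracks = view.nTracks();  // scalar column of the SoA
      for (int32_t it = 0; it < int32_t(nTracks); ++it)
        printf("track %d: pt %f chi2 %f\n", it, double(view[it].pt()), double(view[it].chi2()));
    }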
-#include "CUDADataFormats/Track/interface/PixelTrackHeterogeneous.h" -#include "CUDADataFormats/TrackingRecHit/interface/TrackingRecHit2DHeterogeneous.h" -#include "CUDADataFormats/Vertex/interface/ZVertexHeterogeneous.h" #include "DataFormats/Common/interface/Handle.h" #include "FWCore/Framework/interface/ConsumesCollector.h" #include "FWCore/Framework/interface/Event.h" @@ -20,10 +18,21 @@ #include "HeterogeneousCore/CUDACore/interface/ScopedContext.h" #include "RecoTracker/TkMSParametrization/interface/PixelRecoUtilities.h" +#include "CUDADataFormats/Vertex/interface/ZVertexSoAHeterogeneousHost.h" +#include "CUDADataFormats/Vertex/interface/ZVertexSoAHeterogeneousDevice.h" + +#include "CUDADataFormats/Track/interface/TrackSoAHeterogeneousDevice.h" +#include "CUDADataFormats/Track/interface/TrackSoAHeterogeneousHost.h" + template class PixelTrackDumpCUDAT : public edm::global::EDAnalyzer<> { public: - using PixelTrackHeterogeneous = PixelTrackHeterogeneousT; + using TrackSoAHost = TrackSoAHeterogeneousHost; + using TrackSoADevice = TrackSoAHeterogeneousDevice; + + using VertexSoAHost = ZVertexSoAHost; + using VertexSoADevice = ZVertexSoADevice; + explicit PixelTrackDumpCUDAT(const edm::ParameterSet& iConfig); ~PixelTrackDumpCUDAT() override = default; @@ -32,23 +41,21 @@ class PixelTrackDumpCUDAT : public edm::global::EDAnalyzer<> { private: void analyze(edm::StreamID streamID, edm::Event const& iEvent, const edm::EventSetup& iSetup) const override; const bool m_onGPU; - edm::EDGetTokenT> tokenGPUTrack_; - edm::EDGetTokenT> tokenGPUVertex_; - edm::EDGetTokenT tokenSoATrack_; - edm::EDGetTokenT tokenSoAVertex_; + edm::EDGetTokenT> tokenGPUTrack_; + edm::EDGetTokenT> tokenGPUVertex_; + edm::EDGetTokenT tokenSoATrack_; + edm::EDGetTokenT tokenSoAVertex_; }; template PixelTrackDumpCUDAT::PixelTrackDumpCUDAT(const edm::ParameterSet& iConfig) : m_onGPU(iConfig.getParameter("onGPU")) { if (m_onGPU) { - tokenGPUTrack_ = - consumes>(iConfig.getParameter("pixelTrackSrc")); - tokenGPUVertex_ = - consumes>(iConfig.getParameter("pixelVertexSrc")); + tokenGPUTrack_ = consumes(iConfig.getParameter("pixelTrackSrc")); + tokenGPUVertex_ = consumes(iConfig.getParameter("pixelVertexSrc")); } else { tokenSoATrack_ = consumes(iConfig.getParameter("pixelTrackSrc")); - tokenSoAVertex_ = consumes(iConfig.getParameter("pixelVertexSrc")); + tokenSoAVertex_ = consumes(iConfig.getParameter("pixelVertexSrc")); } } @@ -71,19 +78,19 @@ void PixelTrackDumpCUDAT::analyze(edm::StreamID streamID, cms::cuda::ScopedContextProduce ctx{hTracks}; auto const& tracks = ctx.get(hTracks); - auto const* tsoa = tracks.get(); - assert(tsoa); + auto const* tsoa = &tracks; + assert(tsoa->buffer()); auto const& vertices = ctx.get(iEvent.get(tokenGPUVertex_)); - auto const* vsoa = vertices.get(); - assert(vsoa); + auto const* vsoa = &vertices; + assert(vsoa->buffer()); } else { - auto const* tsoa = iEvent.get(tokenSoATrack_).get(); - assert(tsoa); + auto const& tsoa = iEvent.get(tokenSoATrack_); + assert(tsoa.buffer()); - auto const* vsoa = iEvent.get(tokenSoAVertex_).get(); - assert(vsoa); + auto const& vsoa = iEvent.get(tokenSoAVertex_); + assert(vsoa.buffer()); } } diff --git a/RecoPixelVertexing/PixelTrackFitting/plugins/PixelTrackProducerFromSoA.cc b/RecoPixelVertexing/PixelTrackFitting/plugins/PixelTrackProducerFromSoA.cc index 6a0f918b0d979..358d0b7b63e0c 100644 --- a/RecoPixelVertexing/PixelTrackFitting/plugins/PixelTrackProducerFromSoA.cc +++ b/RecoPixelVertexing/PixelTrackFitting/plugins/PixelTrackProducerFromSoA.cc @@ -27,20 
+27,24 @@ #include "RecoPixelVertexing/PixelTrackFitting/interface/FitUtils.h" #include "CUDADataFormats/Common/interface/HostProduct.h" -#include "CUDADataFormats/Track/interface/PixelTrackHeterogeneous.h" #include "CUDADataFormats/SiPixelCluster/interface/gpuClusteringConstants.h" #include "Geometry/CommonTopologies/interface/SimplePixelTopology.h" #include "storeTracks.h" #include "CUDADataFormats/Common/interface/HostProduct.h" +#include "CUDADataFormats/Track/interface/TrackSoAHeterogeneousHost.h" +#include "CUDADataFormats/Track/interface/TrackSoAHeterogeneousDevice.h" +#include "CUDADataFormats/Track/interface/PixelTrackUtilities.h" + /** * This class creates "leagcy" reco::Track * objects from the output of SoA CA. */ template class PixelTrackProducerFromSoAT : public edm::global::EDProducer<> { - using PixelTrackHeterogeneous = PixelTrackHeterogeneousT; + using TrackSoAHost = TrackSoAHeterogeneousHost; + using tracksHelpers = TracksUtilities; public: using IndToEdm = std::vector; @@ -50,7 +54,6 @@ class PixelTrackProducerFromSoAT : public edm::global::EDProducer<> { static void fillDescriptions(edm::ConfigurationDescriptions &descriptions); - // using HitModuleStart = std::array; using HMSstorage = HostProduct; private: @@ -58,7 +61,7 @@ class PixelTrackProducerFromSoAT : public edm::global::EDProducer<> { // Event Data tokens const edm::EDGetTokenT tBeamSpot_; - const edm::EDGetTokenT tokenTrack_; + const edm::EDGetTokenT tokenTrack_; const edm::EDGetTokenT cpuHits_; const edm::EDGetTokenT hmsToken_; // Event Setup tokens @@ -139,6 +142,7 @@ void PixelTrackProducerFromSoAT::produce(edm::StreamID streamID, std::vector hitmap; auto const &rcs = rechits.data(); auto nhits = rcs.size(); + hitmap.resize(nhits, nullptr); auto const *hitsModuleStart = iEvent.get(hmsToken_).get(); @@ -152,6 +156,7 @@ void PixelTrackProducerFromSoAT::produce(edm::StreamID streamID, auto i = fc[detI] + clus.pixelCluster().originalId(); if (i >= hitmap.size()) hitmap.resize(i + 256, nullptr); // only in case of hit overflow in one module + assert(nullptr == hitmap[i]); hitmap[i] = &h; } @@ -159,12 +164,10 @@ void PixelTrackProducerFromSoAT::produce(edm::StreamID streamID, std::vector hits; hits.reserve(5); - const auto &tsoa = *iEvent.get(tokenTrack_); - - auto const *quality = tsoa.qualityData(); - auto const &fit = tsoa.stateAtBS; - auto const &hitIndices = tsoa.hitIndices; - auto nTracks = tsoa.nTracks(); + auto const &tsoa = iEvent.get(tokenTrack_); + auto const *quality = tsoa.view().quality(); + auto const &hitIndices = tsoa.view().hitIndices(); + auto nTracks = tsoa.view().nTracks(); tracks.reserve(nTracks); @@ -173,19 +176,20 @@ void PixelTrackProducerFromSoAT::produce(edm::StreamID streamID, //sort index by pt std::vector sortIdxs(nTracks); std::iota(sortIdxs.begin(), sortIdxs.end(), 0); - std::sort( - sortIdxs.begin(), sortIdxs.end(), [&](int32_t const i1, int32_t const i2) { return tsoa.pt(i1) > tsoa.pt(i2); }); + std::sort(sortIdxs.begin(), sortIdxs.end(), [&](int32_t const i1, int32_t const i2) { + return tsoa.view()[i1].pt() > tsoa.view()[i2].pt(); + }); //store the index of the SoA: indToEdm[index_SoAtrack] -> index_edmTrack (if it exists) indToEdm.resize(sortIdxs.size(), -1); for (const auto &it : sortIdxs) { - auto nHits = tsoa.nHits(it); + auto nHits = tracksHelpers::nHits(tsoa.view(), it); assert(nHits >= 3); auto q = quality[it]; if (q < minQuality_) continue; - if (tsoa.nLayers(it) < minNumberOfHits_) + if (nHits < minNumberOfHits_) //move to nLayers? 
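Aside, not part of the patch: the per-track helpers that used to be member functions (nHits, phi, tip, copyToDense) now live in TracksUtilities<TrackerTraits> and take the view as their first argument, as in the selection loop above. The sketch below illustrates that calling convention; countSelected, minQuality and minHits are illustrative names, the view type is left as a template parameter rather than spelling out the exact alias, and pixelTrack::Quality is assumed to be reachable through the PixelTrackUtilities.h header used elsewhere in this diff.

    #include <cstdint>

    #include "CUDADataFormats/Track/interface/PixelTrackUtilities.h"

    // Illustrative only: count tracks passing a quality and hit-multiplicity cut,
    // mixing a direct column read (quality()) with a TracksUtilities free helper (nHits).
    template <typename TrackerTraits, typename TrackSoAConstView>
    int countSelected(TrackSoAConstView const& tracks, pixelTrack::Quality minQuality, int minHits) {
      using helpers = TracksUtilities<TrackerTraits>;
      int selected = 0;
      auto const nTracks = tracks.nTracks();
      for (int32_t it = 0; it < int32_t(nTracks); ++it) {
        if (tracks[it].quality() < minQuality)
          continue;                              // same ordering-based cut as in the producer above
        if (helpers::nHits(tracks, it) < minHits)
          continue;
        ++selected;
      }
      return selected;
    }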
continue; indToEdm[it] = nt; ++nt; @@ -197,12 +201,12 @@ void PixelTrackProducerFromSoAT::produce(edm::StreamID streamID, // mind: this values are respect the beamspot! - float chi2 = tsoa.chi2(it); - float phi = tsoa.phi(it); + float chi2 = tsoa.view()[it].chi2(); + float phi = tracksHelpers::phi(tsoa.view(), it); riemannFit::Vector5d ipar, opar; riemannFit::Matrix5d icov, ocov; - fit.copyToDense(ipar, icov, it); + tracksHelpers::template copyToDense(tsoa.view(), ipar, icov, it); riemannFit::transformToPerigeePlane(ipar, icov, opar, ocov); LocalTrajectoryParameters lpar(opar(0), opar(1), opar(2), opar(3), opar(4), 1.); diff --git a/RecoPixelVertexing/PixelTrackFitting/plugins/PixelTrackSoAFromCUDA.cc b/RecoPixelVertexing/PixelTrackFitting/plugins/PixelTrackSoAFromCUDA.cc index 0675effd091e8..82c21da184ab9 100644 --- a/RecoPixelVertexing/PixelTrackFitting/plugins/PixelTrackSoAFromCUDA.cc +++ b/RecoPixelVertexing/PixelTrackFitting/plugins/PixelTrackSoAFromCUDA.cc @@ -1,8 +1,11 @@ #include +#include // needed here by soa layout #include "CUDADataFormats/Common/interface/Product.h" #include "CUDADataFormats/Common/interface/HostProduct.h" -#include "CUDADataFormats/Track/interface/PixelTrackHeterogeneous.h" +#include "CUDADataFormats/Track/interface/TrackSoAHeterogeneousHost.h" +#include "CUDADataFormats/Track/interface/TrackSoAHeterogeneousDevice.h" +#include "CUDADataFormats/Track/interface/PixelTrackUtilities.h" #include "DataFormats/Common/interface/Handle.h" #include "FWCore/Framework/interface/Event.h" #include "FWCore/Framework/interface/EventSetup.h" @@ -21,8 +24,8 @@ template class PixelTrackSoAFromCUDAT : public edm::stream::EDProducer { - using PixelTrackHeterogeneous = PixelTrackHeterogeneousT; - using TrackSoA = pixelTrack::TrackSoAT; + using TrackSoAHost = TrackSoAHeterogeneousHost; + using TrackSoADevice = TrackSoAHeterogeneousDevice; public: explicit PixelTrackSoAFromCUDAT(const edm::ParameterSet& iConfig); @@ -36,16 +39,15 @@ class PixelTrackSoAFromCUDAT : public edm::stream::EDProducer edm::WaitingTaskWithArenaHolder waitingTaskHolder) override; void produce(edm::Event& iEvent, edm::EventSetup const& iSetup) override; - edm::EDGetTokenT> tokenCUDA_; - edm::EDPutTokenT tokenSOA_; + edm::EDGetTokenT> tokenCUDA_; + edm::EDPutTokenT tokenSOA_; - cms::cuda::host::unique_ptr soa_; + TrackSoAHost tracks_h_; }; template PixelTrackSoAFromCUDAT::PixelTrackSoAFromCUDAT(const edm::ParameterSet& iConfig) - : tokenCUDA_(consumes>(iConfig.getParameter("src"))), - tokenSOA_(produces()) {} + : tokenCUDA_(consumes(iConfig.getParameter("src"))), tokenSOA_(produces()) {} template void PixelTrackSoAFromCUDAT::fillDescriptions(edm::ConfigurationDescriptions& descriptions) { @@ -59,19 +61,22 @@ template void PixelTrackSoAFromCUDAT::acquire(edm::Event const& iEvent, edm::EventSetup const& iSetup, edm::WaitingTaskWithArenaHolder waitingTaskHolder) { - cms::cuda::Product const& inputDataWrapped = iEvent.get(tokenCUDA_); + cms::cuda::Product const& inputDataWrapped = iEvent.get(tokenCUDA_); cms::cuda::ScopedContextAcquire ctx{inputDataWrapped, std::move(waitingTaskHolder)}; - auto const& inputData = ctx.get(inputDataWrapped); - - soa_ = inputData.toHostAsync(ctx.stream()); + auto const& tracks_d = ctx.get(inputDataWrapped); // Tracks on device + tracks_h_ = TrackSoAHost(ctx.stream()); // Create an instance of Tracks on Host, using the stream + cudaCheck(cudaMemcpyAsync(tracks_h_.buffer().get(), + tracks_d.const_buffer().get(), + tracks_d.bufferSize(), + cudaMemcpyDeviceToHost, + ctx.stream())); // 
Copy data from Device to Host } template void PixelTrackSoAFromCUDAT::produce(edm::Event& iEvent, edm::EventSetup const& iSetup) { - auto const& tsoa = *soa_; - auto maxTracks = tsoa.stride(); + auto maxTracks = tracks_h_.view().metadata().size(); + auto nTracks = tracks_h_.view().nTracks(); - auto nTracks = tsoa.nTracks(); assert(nTracks < maxTracks); if (nTracks == maxTracks - 1) { edm::LogWarning("PixelTracks") << "Unsorted reconstructed pixel tracks truncated to " << maxTracks - 1 @@ -84,8 +89,8 @@ void PixelTrackSoAFromCUDAT::produce(edm::Event& iEvent, edm::Eve int32_t nt = 0; for (int32_t it = 0; it < maxTracks; ++it) { - auto nHits = tsoa.nHits(it); - assert(nHits == int(tsoa.hitIndices.size(it))); + auto nHits = TracksUtilities::nHits(tracks_h_.view(), it); + assert(nHits == int(tracks_h_.view().hitIndices().size(it))); if (nHits == 0) break; // this is a guard: maybe we need to move to nTracks... nt++; @@ -94,9 +99,8 @@ void PixelTrackSoAFromCUDAT::produce(edm::Event& iEvent, edm::Eve #endif // DO NOT make a copy (actually TWO....) - iEvent.emplace(tokenSOA_, std::move(soa_)); - - assert(!soa_); + iEvent.emplace(tokenSOA_, std::move(tracks_h_)); + assert(!tracks_h_.buffer()); } using PixelTrackSoAFromCUDA = PixelTrackSoAFromCUDAT; diff --git a/RecoPixelVertexing/PixelTriplets/plugins/BrokenLineFitOnGPU.cc b/RecoPixelVertexing/PixelTriplets/plugins/BrokenLineFitOnGPU.cc index d6a9db4953be1..be92f2d5d0fa2 100644 --- a/RecoPixelVertexing/PixelTriplets/plugins/BrokenLineFitOnGPU.cc +++ b/RecoPixelVertexing/PixelTriplets/plugins/BrokenLineFitOnGPU.cc @@ -1,7 +1,7 @@ #include "BrokenLineFitOnGPU.h" template -void HelixFitOnGPU::launchBrokenLineKernelsOnCPU(HitsView const* hv, +void HelixFitOnGPU::launchBrokenLineKernelsOnCPU(const TrackingRecHitSoAConstView &hv, uint32_t hitsInFit, uint32_t maxNumberOfTuples) { assert(tuples_); diff --git a/RecoPixelVertexing/PixelTriplets/plugins/BrokenLineFitOnGPU.cu b/RecoPixelVertexing/PixelTriplets/plugins/BrokenLineFitOnGPU.cu index b1ee028b8863e..c5c9ac7fc6345 100644 --- a/RecoPixelVertexing/PixelTriplets/plugins/BrokenLineFitOnGPU.cu +++ b/RecoPixelVertexing/PixelTriplets/plugins/BrokenLineFitOnGPU.cu @@ -2,7 +2,7 @@ #include "HeterogeneousCore/CUDAUtilities/interface/device_unique_ptr.h" template -void HelixFitOnGPU::launchBrokenLineKernels(HitsView const *hv, +void HelixFitOnGPU::launchBrokenLineKernels(const TrackingRecHitSoAConstView& hv, uint32_t hitsInFit, uint32_t maxNumberOfTuples, cudaStream_t stream) { diff --git a/RecoPixelVertexing/PixelTriplets/plugins/BrokenLineFitOnGPU.h b/RecoPixelVertexing/PixelTriplets/plugins/BrokenLineFitOnGPU.h index 4d1d57c4e27a8..e347b0c000dc3 100644 --- a/RecoPixelVertexing/PixelTriplets/plugins/BrokenLineFitOnGPU.h +++ b/RecoPixelVertexing/PixelTriplets/plugins/BrokenLineFitOnGPU.h @@ -8,7 +8,7 @@ #include -#include "CUDADataFormats/TrackingRecHit/interface/TrackingRecHit2DHeterogeneous.h" +#include "CUDADataFormats/TrackingRecHit/interface/TrackingRecHitsUtilities.h" #include "HeterogeneousCore/CUDAUtilities/interface/cudaCheck.h" #include "HeterogeneousCore/CUDAUtilities/interface/cuda_assert.h" #include "RecoLocalTracker/SiPixelRecHits/interface/pixelCPEforGPU.h" @@ -17,23 +17,18 @@ #include "HelixFitOnGPU.h" template -using HitsOnGPU = TrackingRecHit2DSOAViewT; +using Tuples = typename TrackSoA::HitContainer; template -using Tuples = pixelTrack::HitContainerT; -template -using OutputSoA = pixelTrack::TrackSoAT; +using OutputSoAView = TrackSoAView; template using TupleMultiplicity = 
caStructures::TupleMultiplicityT; -// using tindex_type = typename TrackerTraits::tindex_type; -// constexpr auto invalidTkId = std::numeric_limits::max(); - // #define BL_DUMP_HITS template __global__ void kernel_BLFastFit(Tuples const *__restrict__ foundNtuplets, TupleMultiplicity const *__restrict__ tupleMultiplicity, - HitsOnGPU const *__restrict__ hhp, + TrackingRecHitSoAConstView hh, typename TrackerTraits::tindex_type *__restrict__ ptkids, double *__restrict__ phits, float *__restrict__ phits_ge, @@ -46,7 +41,6 @@ __global__ void kernel_BLFastFit(Tuples const *__restrict__ found assert(hitsInFit <= nHitsL); assert(nHitsL <= nHitsH); - assert(hhp); assert(phits); assert(pfast_fit); assert(foundNtuplets); @@ -100,9 +94,9 @@ __global__ void kernel_BLFastFit(Tuples const *__restrict__ found // #define YERR_FROM_DC #ifdef YERR_FROM_DC // try to compute more precise error in y - auto dx = hhp->xGlobal(hitId[hitsInFit - 1]) - hhp->xGlobal(hitId[0]); - auto dy = hhp->yGlobal(hitId[hitsInFit - 1]) - hhp->yGlobal(hitId[0]); - auto dz = hhp->zGlobal(hitId[hitsInFit - 1]) - hhp->zGlobal(hitId[0]); + auto dx = hh[hitId[hitsInFit - 1]].xGlobal() - hh[hitId[0]].xGlobal(); + auto dy = hh[hitId[hitsInFit - 1]].yGlobal() - hh[hitId[0]].yGlobal(); + auto dz = hh[hitId[hitsInFit - 1]].zGlobal() - hh[hitId[0]].zGlobal(); float ux, uy, uz; #endif @@ -118,8 +112,8 @@ __global__ void kernel_BLFastFit(Tuples const *__restrict__ found float ge[6]; #ifdef YERR_FROM_DC - auto const &dp = hhp->cpeParams().detParams(hhp->detectorIndex(hit)); - auto status = hhp->status(hit); + auto const &dp = hh.cpeParams().detParams(hh.detectorIndex(hit)); + auto status = hh[hit].chargeAndStatus().status; int qbin = CPEFastParametrisation::kGenErrorQBins - 1 - status.qBin; assert(qbin >= 0 && qbin < 5); bool nok = (status.isBigY | status.isOneY); @@ -136,12 +130,10 @@ __global__ void kernel_BLFastFit(Tuples const *__restrict__ found yerr *= dp.yfact[qbin]; // inflate yerr *= yerr; yerr += dp.apeYY; - yerr = nok ? hhp->yerrLocal(hit) : yerr; - dp.frame.toGlobal(hhp->xerrLocal(hit), 0, yerr, ge); + yerr = nok ? 
hh[hit].yerrLocal() : yerr; + dp.frame.toGlobal(hh[hit].xerrLocal(), 0, yerr, ge); #else - hhp->cpeParams() - .detParams(hhp->detectorIndex(hit)) - .frame.toGlobal(hhp->xerrLocal(hit), 0, hhp->yerrLocal(hit), ge); + hh.cpeParams().detParams(hh[hit].detectorIndex()).frame.toGlobal(hh[hit].xerrLocal(), 0, hh[hit].yerrLocal(), ge); #endif #ifdef BL_DUMP_HITS @@ -151,16 +143,16 @@ __global__ void kernel_BLFastFit(Tuples const *__restrict__ found local_idx, tkid, hit, - hhp->detectorIndex(hit), + hh[hit].detectorIndex(), i, - hhp->xGlobal(hit), - hhp->yGlobal(hit), - hhp->zGlobal(hit)); + hh[hit].xGlobal(), + hh[hit].yGlobal(), + hh[hit].zGlobal()); printf("Error: hits_ge.col(%d) << %e,%e,%e,%e,%e,%e\n", i, ge[0], ge[1], ge[2], ge[3], ge[4], ge[5]); } #endif - hits.col(i) << hhp->xGlobal(hit), hhp->yGlobal(hit), hhp->zGlobal(hit); + hits.col(i) << hh[hit].xGlobal(), hh[hit].yGlobal(), hh[hit].zGlobal(); hits_ge.col(i) << ge[0], ge[1], ge[2], ge[3], ge[4], ge[5]; } brokenline::fastFit(hits, fast_fit); @@ -176,12 +168,14 @@ __global__ void kernel_BLFastFit(Tuples const *__restrict__ found template __global__ void kernel_BLFit(TupleMultiplicity const *__restrict__ tupleMultiplicity, double bField, - OutputSoA *results, + OutputSoAView results_view, typename TrackerTraits::tindex_type const *__restrict__ ptkids, double *__restrict__ phits, float *__restrict__ phits_ge, double *__restrict__ pfast_fit) { - assert(results); + assert(results_view.pt()); + assert(results_view.eta()); + assert(results_view.chi2()); assert(pfast_fit); constexpr auto invalidTkId = std::numeric_limits::max(); @@ -209,10 +203,11 @@ __global__ void kernel_BLFit(TupleMultiplicity const *__restrict_ brokenline::lineFit(hits_ge, fast_fit, bField, data, line); brokenline::circleFit(hits, hits_ge, fast_fit, bField, data, circle); - results->stateAtBS.copyFromCircle(circle.par, circle.cov, line.par, line.cov, 1.f / float(bField), tkid); - results->pt(tkid) = float(bField) / float(std::abs(circle.par(2))); - results->eta(tkid) = asinhf(line.par(0)); - results->chi2(tkid) = (circle.chi2 + line.chi2) / (2 * N - 5); + TracksUtilities::copyFromCircle( + results_view, circle.par, circle.cov, line.par, line.cov, 1.f / float(bField), tkid); + results_view[tkid].pt() = float(bField) / float(std::abs(circle.par(2))); + results_view[tkid].eta() = asinhf(line.par(0)); + results_view[tkid].chi2() = (circle.chi2 + line.chi2) / (2 * N - 5); #ifdef BROKENLINE_DEBUG if (!(circle.chi2 >= 0) || !(line.chi2 >= 0)) diff --git a/RecoPixelVertexing/PixelTriplets/plugins/BuildFile.xml b/RecoPixelVertexing/PixelTriplets/plugins/BuildFile.xml index 95c443c3b51e7..de2a40fc8b0f0 100644 --- a/RecoPixelVertexing/PixelTriplets/plugins/BuildFile.xml +++ b/RecoPixelVertexing/PixelTriplets/plugins/BuildFile.xml @@ -1,5 +1,6 @@ + diff --git a/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletCUDA.cc b/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletCUDA.cc index fade739410e2f..122f4af710966 100644 --- a/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletCUDA.cc +++ b/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletCUDA.cc @@ -17,19 +17,23 @@ #include "FWCore/Utilities/interface/RunningAverage.h" #include "HeterogeneousCore/CUDACore/interface/ScopedContext.h" #include "MagneticField/Records/interface/IdealMagneticFieldRecord.h" + #include "RecoTracker/TkMSParametrization/interface/PixelRecoUtilities.h" #include "CAHitNtupletGeneratorOnGPU.h" -#include "CUDADataFormats/Track/interface/PixelTrackHeterogeneous.h" -#include 
"CUDADataFormats/TrackingRecHit/interface/TrackingRecHit2DHeterogeneous.h" + +#include "CUDADataFormats/Track/interface/TrackSoAHeterogeneousHost.h" +#include "CUDADataFormats/Track/interface/TrackSoAHeterogeneousDevice.h" template class CAHitNtupletCUDAT : public edm::global::EDProducer<> { - using PixelTrackHeterogeneous = PixelTrackHeterogeneousT; + using HitsConstView = TrackingRecHitSoAConstView; + using HitsOnGPU = TrackingRecHitSoADevice; //TODO move to OnDevice + using HitsOnCPU = TrackingRecHitSoAHost; //TODO move to OnHost + + using TrackSoAHost = TrackSoAHeterogeneousHost; + using TrackSoADevice = TrackSoAHeterogeneousDevice; - using HitsView = TrackingRecHit2DSOAViewT; - using HitsOnGPU = TrackingRecHit2DGPUT; - using HitsOnCPU = TrackingRecHit2DCPUT; using GPUAlgo = CAHitNtupletGeneratorOnGPU; public: @@ -48,9 +52,9 @@ class CAHitNtupletCUDAT : public edm::global::EDProducer<> { edm::ESGetToken tokenField_; edm::EDGetTokenT> tokenHitGPU_; - edm::EDPutTokenT> tokenTrackGPU_; + edm::EDPutTokenT> tokenTrackGPU_; edm::EDGetTokenT tokenHitCPU_; - edm::EDPutTokenT tokenTrackCPU_; + edm::EDPutTokenT tokenTrackCPU_; GPUAlgo gpuAlgo_; }; @@ -60,10 +64,10 @@ CAHitNtupletCUDAT::CAHitNtupletCUDAT(const edm::ParameterSet& iCo : onGPU_(iConfig.getParameter("onGPU")), tokenField_(esConsumes()), gpuAlgo_(iConfig, consumesCollector()) { if (onGPU_) { tokenHitGPU_ = consumes(iConfig.getParameter("pixelRecHitSrc")); - tokenTrackGPU_ = produces>(); + tokenTrackGPU_ = produces>(); } else { tokenHitCPU_ = consumes(iConfig.getParameter("pixelRecHitSrc")); - tokenTrackCPU_ = produces(); + tokenTrackCPU_ = produces(); } } @@ -95,13 +99,14 @@ void CAHitNtupletCUDAT::produce(edm::StreamID streamID, auto bf = 1. / es.getData(tokenField_).inverseBzAtOriginInGeV(); if (onGPU_) { - auto hHits = iEvent.getHandle(tokenHitGPU_); - cms::cuda::ScopedContextProduce ctx{*hHits}; - auto const& hits = ctx.get(*hHits); - ctx.emplace(iEvent, tokenTrackGPU_, gpuAlgo_.makeTuplesAsync(hits, bf, ctx.stream())); + auto const& hits = iEvent.get(tokenHitGPU_); + + cms::cuda::ScopedContextProduce ctx{hits}; + auto& hits_d = ctx.get(hits); + ctx.emplace(iEvent, tokenTrackGPU_, gpuAlgo_.makeTuplesAsync(hits_d, bf, ctx.stream())); } else { - auto const& hits = iEvent.get(tokenHitCPU_); - iEvent.emplace(tokenTrackCPU_, gpuAlgo_.makeTuples(hits, bf)); + auto& hits_h = iEvent.get(tokenHitCPU_); + iEvent.emplace(tokenTrackCPU_, gpuAlgo_.makeTuples(hits_h, bf)); } } diff --git a/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorKernels.cc b/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorKernels.cc index 75fbbffb49190..f826b1b5c89da 100644 --- a/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorKernels.cc +++ b/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorKernels.cc @@ -14,7 +14,9 @@ void CAHitNtupletGeneratorKernelsCPU::printCounters(Counters cons } template -void CAHitNtupletGeneratorKernelsCPU::buildDoublets(HitsOnCPU const &hh, cudaStream_t stream) { +void CAHitNtupletGeneratorKernelsCPU::buildDoublets(const HitsConstView &hh, + int32_t offsetBPIX2, + cudaStream_t stream) { using namespace gpuPixelDoublets; using GPUCACell = GPUCACellT; @@ -26,7 +28,7 @@ void CAHitNtupletGeneratorKernelsCPU::buildDoublets(HitsOnCPU con auto nhits = hh.nHits(); #ifdef NTUPLE_DEBUG - std::cout << "building Doublets out of " << nhits << " Hits. BPIX2 offset is " << hh.offsetBPIX2() << std::endl; + std::cout << "building Doublets out of " << nhits << " Hits. 
BPIX2 offset is " << offsetBPIX2 << std::endl; #endif // use "nhits" to heuristically dimension the workspace @@ -35,7 +37,7 @@ void CAHitNtupletGeneratorKernelsCPU::buildDoublets(HitsOnCPU con //this->device_isOuterHitOfCell_ = Traits::template make_unique(std::max(1U, nhits), stream); this->device_isOuterHitOfCell_ = std::make_unique(std::max(1U, nhits)); assert(this->device_isOuterHitOfCell_.get()); - this->isOuterHitOfCell_ = OuterHitOfCell{this->device_isOuterHitOfCell_.get(), hh.offsetBPIX2()}; + this->isOuterHitOfCell_ = OuterHitOfCell{this->device_isOuterHitOfCell_.get(), offsetBPIX2}; auto cellStorageSize = TrackerTraits::maxNumOfActiveDoublets * sizeof(CellNeighbors) + TrackerTraits::maxNumOfActiveDoublets * sizeof(CellTracks); @@ -68,28 +70,22 @@ void CAHitNtupletGeneratorKernelsCPU::buildDoublets(HitsOnCPU con this->device_nCells_, this->device_theCellNeighbors_.get(), this->device_theCellTracks_.get(), - hh.view(), + hh, this->isOuterHitOfCell_, nActualPairs, this->params_.cellCuts_); } template -void CAHitNtupletGeneratorKernelsCPU::launchKernels(HitsOnCPU const &hh, - TkSoA *tracks_d, +void CAHitNtupletGeneratorKernelsCPU::launchKernels(const HitsConstView &hh, + TkSoAView &tracks_view, cudaStream_t cudaStream) { using namespace caHitNtupletGeneratorKernels; - auto *tuples_d = &tracks_d->hitIndices; - auto *detId_d = &tracks_d->detIndices; - auto *quality_d = tracks_d->qualityData(); - - assert(tuples_d && quality_d); - // zero tuples - cms::cuda::launchZero(tuples_d, cudaStream); + cms::cuda::launchZero(&tracks_view.hitIndices(), cudaStream); - auto nhits = hh.nHits(); + uint32_t nhits = hh.metadata().size(); #ifdef NTUPLE_DEBUG std::cout << "start tuple building. N hits " << nhits << std::endl; @@ -103,7 +99,7 @@ void CAHitNtupletGeneratorKernelsCPU::launchKernels(HitsOnCPU con kernel_connect(this->device_hitTuple_apc_, this->device_hitToTuple_apc_, // needed only to be reset, ready for next kernel - hh.view(), + hh, this->device_theCells_.get(), this->device_nCells_, this->device_theCellNeighbors_.get(), @@ -112,91 +108,83 @@ void CAHitNtupletGeneratorKernelsCPU::launchKernels(HitsOnCPU con if (nhits > 1 && this->params_.earlyFishbone_) { gpuPixelDoublets::fishbone( - hh.view(), this->device_theCells_.get(), this->device_nCells_, this->isOuterHitOfCell_, nhits, false); + hh, this->device_theCells_.get(), this->device_nCells_, this->isOuterHitOfCell_, nhits, false); } - kernel_find_ntuplets(hh.view(), + kernel_find_ntuplets(hh, + tracks_view, this->device_theCells_.get(), this->device_nCells_, this->device_theCellTracks_.get(), - tuples_d, this->device_hitTuple_apc_, - quality_d, this->params_.caParams_); if (this->params_.doStats_) kernel_mark_used(this->device_theCells_.get(), this->device_nCells_); - cms::cuda::finalizeBulk(this->device_hitTuple_apc_, tuples_d); + cms::cuda::finalizeBulk(this->device_hitTuple_apc_, &tracks_view.hitIndices()); - kernel_fillHitDetIndices(tuples_d, hh.view(), detId_d); - kernel_fillNLayers(tracks_d, this->device_hitTuple_apc_); + kernel_fillHitDetIndices(tracks_view, hh); + kernel_fillNLayers(tracks_view, this->device_hitTuple_apc_); // remove duplicates (tracks that share a doublet) kernel_earlyDuplicateRemover( - this->device_theCells_.get(), this->device_nCells_, tracks_d, quality_d, this->params_.dupPassThrough_); + this->device_theCells_.get(), this->device_nCells_, tracks_view, this->params_.dupPassThrough_); - kernel_countMultiplicity(tuples_d, quality_d, this->device_tupleMultiplicity_.get()); + 
kernel_countMultiplicity(tracks_view, this->device_tupleMultiplicity_.get()); cms::cuda::launchFinalize(this->device_tupleMultiplicity_.get(), cudaStream); - kernel_fillMultiplicity(tuples_d, quality_d, this->device_tupleMultiplicity_.get()); + kernel_fillMultiplicity(tracks_view, this->device_tupleMultiplicity_.get()); if (nhits > 1 && this->params_.lateFishbone_) { gpuPixelDoublets::fishbone( - hh.view(), this->device_theCells_.get(), this->device_nCells_, this->isOuterHitOfCell_, nhits, true); + hh, this->device_theCells_.get(), this->device_nCells_, this->isOuterHitOfCell_, nhits, true); } } template -void CAHitNtupletGeneratorKernelsCPU::classifyTuples(HitsOnCPU const &hh, - TkSoA *tracks_d, +void CAHitNtupletGeneratorKernelsCPU::classifyTuples(const HitsConstView &hh, + TkSoAView &tracks_view, cudaStream_t cudaStream) { using namespace caHitNtupletGeneratorKernels; - int32_t nhits = hh.nHits(); - - auto const *tuples_d = &tracks_d->hitIndices; - auto *quality_d = tracks_d->qualityData(); + int32_t nhits = hh.metadata().size(); // classify tracks based on kinematics - kernel_classifyTracks(tuples_d, tracks_d, this->params_.qualityCuts_, quality_d); + kernel_classifyTracks(tracks_view, this->params_.qualityCuts_); if (this->params_.lateFishbone_) { // apply fishbone cleaning to good tracks - kernel_fishboneCleaner(this->device_theCells_.get(), this->device_nCells_, quality_d); + kernel_fishboneCleaner(this->device_theCells_.get(), this->device_nCells_, tracks_view); } // remove duplicates (tracks that share a doublet) kernel_fastDuplicateRemover( - this->device_theCells_.get(), this->device_nCells_, tracks_d, this->params_.dupPassThrough_); + this->device_theCells_.get(), this->device_nCells_, tracks_view, this->params_.dupPassThrough_); // fill hit->track "map" if (this->params_.doSharedHitCut_ || this->params_.doStats_) { - kernel_countHitInTracks(tuples_d, quality_d, this->device_hitToTuple_.get()); + kernel_countHitInTracks(tracks_view, this->device_hitToTuple_.get()); cms::cuda::launchFinalize(this->hitToTupleView_, cudaStream); - kernel_fillHitInTracks(tuples_d, quality_d, this->device_hitToTuple_.get()); + kernel_fillHitInTracks(tracks_view, this->device_hitToTuple_.get()); } // remove duplicates (tracks that share at least one hit) if (this->params_.doSharedHitCut_) { - kernel_rejectDuplicate(tracks_d, - quality_d, + kernel_rejectDuplicate(tracks_view, this->params_.minHitsForSharingCut_, this->params_.dupPassThrough_, this->device_hitToTuple_.get()); - kernel_sharedHitCleaner(hh.view(), - tracks_d, - quality_d, + kernel_sharedHitCleaner(hh, + tracks_view, this->params_.minHitsForSharingCut_, this->params_.dupPassThrough_, this->device_hitToTuple_.get()); if (this->params_.useSimpleTripletCleaner_) { - kernel_simpleTripletCleaner(tracks_d, - quality_d, + kernel_simpleTripletCleaner(tracks_view, this->params_.minHitsForSharingCut_, this->params_.dupPassThrough_, this->device_hitToTuple_.get()); } else { - kernel_tripletCleaner(tracks_d, - quality_d, + kernel_tripletCleaner(tracks_view, this->params_.minHitsForSharingCut_, this->params_.dupPassThrough_, this->device_hitToTuple_.get()); @@ -205,7 +193,7 @@ void CAHitNtupletGeneratorKernelsCPU::classifyTuples(HitsOnCPU co if (this->params_.doStats_) { std::lock_guard guard(lock_stat); - kernel_checkOverflows(tuples_d, + kernel_checkOverflows(tracks_view, this->device_tupleMultiplicity_.get(), this->device_hitToTuple_.get(), this->device_hitTuple_apc_, @@ -223,7 +211,7 @@ void 
CAHitNtupletGeneratorKernelsCPU::classifyTuples(HitsOnCPU co // counters (add flag???) std::lock_guard guard(lock_stat); kernel_doStatsForHitInTracks(this->device_hitToTuple_.get(), this->counters_); - kernel_doStatsForTracks(tuples_d, quality_d, this->counters_); + kernel_doStatsForTracks(tracks_view, this->counters_); } #ifdef DUMP_GPU_TK_TUPLES @@ -232,8 +220,7 @@ void CAHitNtupletGeneratorKernelsCPU::classifyTuples(HitsOnCPU co { std::lock_guard guard(lock); ++iev; - kernel_print_found_ntuplets( - hh.view(), tuples_d, tracks_d, quality_d, this->device_hitToTuple_.get(), 0, 1000000, iev); + kernel_print_found_ntuplets(hh, tracks_view, this->device_hitToTuple_.get(), 0, 1000000, iev); } #endif } diff --git a/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorKernels.cu b/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorKernels.cu index 59ae2041b44aa..cd15b96bcd5fc 100644 --- a/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorKernels.cu +++ b/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorKernels.cu @@ -5,20 +5,16 @@ // #define GPU_DEBUG template -void CAHitNtupletGeneratorKernelsGPU::launchKernels(HitsOnCPU const &hh, - TkSoA *tracks_d, +void CAHitNtupletGeneratorKernelsGPU::launchKernels(const HitsConstView &hh, + TkSoAView &tracks_view, cudaStream_t cudaStream) { using namespace gpuPixelDoublets; using namespace caHitNtupletGeneratorKernels; - // these are pointer on GPU! - auto *tuples_d = &tracks_d->hitIndices; - auto *detId_d = &tracks_d->detIndices; - auto *quality_d = tracks_d->qualityData(); // zero tuples - cms::cuda::launchZero(tuples_d, cudaStream); + cms::cuda::launchZero(&(tracks_view.hitIndices()), cudaStream); //TODO test .data() - int32_t nhits = hh.nHits(); + int32_t nhits = hh.metadata().size(); #ifdef NTUPLE_DEBUG std::cout << "start tuple building. 
N hits " << nhits << std::endl; @@ -45,7 +41,7 @@ void CAHitNtupletGeneratorKernelsGPU::launchKernels(HitsOnCPU con kernel_connect <<>>(this->device_hitTuple_apc_, this->device_hitToTuple_apc_, // needed only to be reset, ready for next kernel - hh.view(), + hh, this->device_theCells_.get(), this->device_nCells_, this->device_theCellNeighbors_.get(), @@ -63,19 +59,18 @@ void CAHitNtupletGeneratorKernelsGPU::launchKernels(HitsOnCPU con dim3 blks(1, numberOfBlocks, 1); dim3 thrs(stride, blockSize, 1); fishbone<<>>( - hh.view(), this->device_theCells_.get(), this->device_nCells_, this->isOuterHitOfCell_, nhits, false); + hh, this->device_theCells_.get(), this->device_nCells_, this->isOuterHitOfCell_, nhits, false); cudaCheck(cudaGetLastError()); } blockSize = 64; numberOfBlocks = (3 * this->params_.cellCuts_.maxNumberOfDoublets_ / 4 + blockSize - 1) / blockSize; - kernel_find_ntuplets<<>>(hh.view(), + kernel_find_ntuplets<<>>(hh, + tracks_view, this->device_theCells_.get(), this->device_nCells_, this->device_theCellTracks_.get(), - tuples_d, this->device_hitTuple_apc_, - quality_d, this->params_.caParams_); #ifdef GPU_DEBUG cudaDeviceSynchronize(); @@ -94,21 +89,23 @@ void CAHitNtupletGeneratorKernelsGPU::launchKernels(HitsOnCPU con blockSize = 128; numberOfBlocks = (HitContainer::ctNOnes() + blockSize - 1) / blockSize; - cms::cuda::finalizeBulk<<>>(this->device_hitTuple_apc_, tuples_d); + cms::cuda::finalizeBulk<<>>(this->device_hitTuple_apc_, + &tracks_view.hitIndices()); //TODO test .data() #ifdef GPU_DEBUG cudaDeviceSynchronize(); cudaCheck(cudaGetLastError()); #endif - kernel_fillHitDetIndices<<>>(tuples_d, hh.view(), detId_d); + kernel_fillHitDetIndices<<>>(tracks_view, hh); cudaCheck(cudaGetLastError()); #ifdef GPU_DEBUG cudaDeviceSynchronize(); cudaCheck(cudaGetLastError()); #endif - kernel_fillNLayers<<>>(tracks_d, this->device_hitTuple_apc_); + kernel_fillNLayers + <<>>(tracks_view, this->device_hitTuple_apc_); cudaCheck(cudaGetLastError()); #ifdef GPU_DEBUG @@ -120,7 +117,7 @@ void CAHitNtupletGeneratorKernelsGPU::launchKernels(HitsOnCPU con numberOfBlocks = this->nDoubletBlocks(blockSize); kernel_earlyDuplicateRemover<<>>( - this->device_theCells_.get(), this->device_nCells_, tracks_d, quality_d, this->params_.dupPassThrough_); + this->device_theCells_.get(), this->device_nCells_, tracks_view, this->params_.dupPassThrough_); cudaCheck(cudaGetLastError()); #ifdef GPU_DEBUG cudaDeviceSynchronize(); @@ -130,10 +127,10 @@ void CAHitNtupletGeneratorKernelsGPU::launchKernels(HitsOnCPU con blockSize = 128; numberOfBlocks = (3 * TrackerTraits::maxNumberOfTuples / 4 + blockSize - 1) / blockSize; kernel_countMultiplicity - <<>>(tuples_d, quality_d, this->device_tupleMultiplicity_.get()); + <<>>(tracks_view, this->device_tupleMultiplicity_.get()); cms::cuda::launchFinalize(this->device_tupleMultiplicity_.get(), cudaStream); kernel_fillMultiplicity - <<>>(tuples_d, quality_d, this->device_tupleMultiplicity_.get()); + <<>>(tracks_view, this->device_tupleMultiplicity_.get()); cudaCheck(cudaGetLastError()); #ifdef GPU_DEBUG cudaDeviceSynchronize(); @@ -149,7 +146,7 @@ void CAHitNtupletGeneratorKernelsGPU::launchKernels(HitsOnCPU con dim3 blks(1, numberOfBlocks, 1); dim3 thrs(stride, blockSize, 1); fishbone<<>>( - hh.view(), this->device_theCells_.get(), this->device_nCells_, this->isOuterHitOfCell_, nhits, true); + hh, this->device_theCells_.get(), this->device_nCells_, this->isOuterHitOfCell_, nhits, true); cudaCheck(cudaGetLastError()); } @@ -157,14 +154,13 @@ void 
CAHitNtupletGeneratorKernelsGPU::launchKernels(HitsOnCPU con cudaDeviceSynchronize(); cudaCheck(cudaGetLastError()); #endif - - // free space asap - // this->device_isOuterHitOfCell_.reset(); } template -void CAHitNtupletGeneratorKernelsGPU::buildDoublets(HitsOnCPU const &hh, cudaStream_t stream) { - int32_t nhits = hh.nHits(); +void CAHitNtupletGeneratorKernelsGPU::buildDoublets(const HitsConstView &hh, + int32_t offsetBPIX2, + cudaStream_t stream) { + int32_t nhits = hh.metadata().size(); using namespace gpuPixelDoublets; @@ -174,7 +170,7 @@ void CAHitNtupletGeneratorKernelsGPU::buildDoublets(HitsOnCPU con using CellTracks = typename GPUCACell::CellTracks; using OuterHitOfCellContainer = typename GPUCACell::OuterHitOfCellContainer; - this->isOuterHitOfCell_ = OuterHitOfCell{this->device_isOuterHitOfCell_.get(), hh.offsetBPIX2()}; + this->isOuterHitOfCell_ = OuterHitOfCell{this->device_isOuterHitOfCell_.get(), offsetBPIX2}; #ifdef NTUPLE_DEBUG std::cout << "building Doublets out of " << nhits << " Hits" << std::endl; @@ -187,10 +183,10 @@ void CAHitNtupletGeneratorKernelsGPU::buildDoublets(HitsOnCPU con // in principle we can use "nhits" to heuristically dimension the workspace... this->device_isOuterHitOfCell_ = - cms::cuda::make_device_unique(std::max(1, nhits - hh.offsetBPIX2()), stream); + cms::cuda::make_device_unique(std::max(1, nhits - offsetBPIX2), stream); assert(this->device_isOuterHitOfCell_.get()); - this->isOuterHitOfCell_ = OuterHitOfCell{this->device_isOuterHitOfCell_.get(), hh.offsetBPIX2()}; + this->isOuterHitOfCell_ = OuterHitOfCell{this->device_isOuterHitOfCell_.get(), offsetBPIX2}; this->cellStorage_ = cms::cuda::make_device_unique(TrackerTraits::maxNumOfActiveDoublets * sizeof(CellNeighbors) + @@ -203,7 +199,7 @@ void CAHitNtupletGeneratorKernelsGPU::buildDoublets(HitsOnCPU con { int threadsPerBlock = 128; // at least one block! - int blocks = (std::max(1, nhits - hh.offsetBPIX2()) + threadsPerBlock - 1) / threadsPerBlock; + int blocks = (std::max(1, nhits - offsetBPIX2) + threadsPerBlock - 1) / threadsPerBlock; initDoublets<<>>(this->isOuterHitOfCell_, nhits, this->device_theCellNeighbors_.get(), @@ -236,7 +232,7 @@ void CAHitNtupletGeneratorKernelsGPU::buildDoublets(HitsOnCPU con this->device_nCells_, this->device_theCellNeighbors_.get(), this->device_theCellTracks_.get(), - hh.view(), + hh, this->isOuterHitOfCell_, nActualPairs, this->params_.cellCuts_); @@ -249,36 +245,32 @@ void CAHitNtupletGeneratorKernelsGPU::buildDoublets(HitsOnCPU con } template -void CAHitNtupletGeneratorKernelsGPU::classifyTuples(HitsOnCPU const &hh, - TkSoA *tracks_d, +void CAHitNtupletGeneratorKernelsGPU::classifyTuples(const HitsConstView &hh, + TkSoAView &tracks_view, cudaStream_t cudaStream) { using namespace caHitNtupletGeneratorKernels; - // these are pointer on GPU! 
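Aside, not part of the patch: with the SoA migration the kernels above receive the view objects by value (small wrappers around the device pointers) instead of a pointer to a SoA struct that is dereferenced inside the kernel. A toy kernel with that signature style follows; copyDetIndex and the out buffer are illustrative, and only accessors that appear in this PR are used.

    #include <cstdint>

    // Illustrative only: a kernel taking the hits const view by value and copying
    // one column into a plain device array.
    template <typename HitsConstView>
    __global__ void copyDetIndex(HitsConstView hh, uint16_t* __restrict__ out) {
      int i = blockIdx.x * blockDim.x + threadIdx.x;
      if (i < hh.metadata().size())
        out[i] = hh[i].detectorIndex();   // element accessor, as in gpuPixelRecHits::getHits
    }

    // launch, e.g. with the stream used elsewhere in this file:
    //   copyDetIndex<<<(nhits + 127) / 128, 128, 0, cudaStream>>>(hh, out_d);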
- auto const *tuples_d = &tracks_d->hitIndices; - auto *quality_d = tracks_d->qualityData(); - - int32_t nhits = hh.nHits(); + int32_t nhits = hh.metadata().size(); auto blockSize = 64; // classify tracks based on kinematics auto numberOfBlocks = this->nQuadrupletBlocks(blockSize); kernel_classifyTracks - <<>>(tuples_d, tracks_d, this->params_.qualityCuts_, quality_d); + <<>>(tracks_view, this->params_.qualityCuts_); if (this->params_.lateFishbone_) { // apply fishbone cleaning to good tracks numberOfBlocks = this->nDoubletBlocks(blockSize); kernel_fishboneCleaner - <<>>(this->device_theCells_.get(), this->device_nCells_, quality_d); + <<>>(this->device_theCells_.get(), this->device_nCells_, tracks_view); cudaCheck(cudaGetLastError()); } // mark duplicates (tracks that share a doublet) numberOfBlocks = this->nDoubletBlocks(blockSize); kernel_fastDuplicateRemover<<>>( - this->device_theCells_.get(), this->device_nCells_, tracks_d, this->params_.dupPassThrough_); + this->device_theCells_.get(), this->device_nCells_, tracks_view, this->params_.dupPassThrough_); cudaCheck(cudaGetLastError()); #ifdef GPU_DEBUG cudaCheck(cudaDeviceSynchronize()); @@ -289,7 +281,7 @@ void CAHitNtupletGeneratorKernelsGPU::classifyTuples(HitsOnCPU co assert(this->hitToTupleView_.offSize > nhits); numberOfBlocks = this->nQuadrupletBlocks(blockSize); kernel_countHitInTracks - <<>>(tuples_d, quality_d, this->device_hitToTuple_.get()); + <<>>(tracks_view, this->device_hitToTuple_.get()); //CHECK cudaCheck(cudaGetLastError()); assert((this->hitToTupleView_.assoc == this->device_hitToTuple_.get()) && (this->hitToTupleView_.offStorage == this->device_hitToTupleStorage_.get()) && @@ -297,7 +289,7 @@ void CAHitNtupletGeneratorKernelsGPU::classifyTuples(HitsOnCPU co cms::cuda::launchFinalize(this->hitToTupleView_, cudaStream); cudaCheck(cudaGetLastError()); kernel_fillHitInTracks - <<>>(tuples_d, quality_d, this->device_hitToTuple_.get()); + <<>>(tracks_view, this->device_hitToTuple_.get()); cudaCheck(cudaGetLastError()); #ifdef GPU_DEBUG cudaCheck(cudaDeviceSynchronize()); @@ -308,32 +300,25 @@ void CAHitNtupletGeneratorKernelsGPU::classifyTuples(HitsOnCPU co // mark duplicates (tracks that share at least one hit) numberOfBlocks = (this->hitToTupleView_.offSize + blockSize - 1) / blockSize; - kernel_rejectDuplicate - <<>>(tracks_d, - quality_d, - this->params_.minHitsForSharingCut_, - this->params_.dupPassThrough_, - this->device_hitToTuple_.get()); + kernel_rejectDuplicate<<>>( + tracks_view, this->params_.minHitsForSharingCut_, this->params_.dupPassThrough_, this->device_hitToTuple_.get()); kernel_sharedHitCleaner - <<>>(hh.view(), - tracks_d, - quality_d, + <<>>(hh, + tracks_view, this->params_.minHitsForSharingCut_, this->params_.dupPassThrough_, this->device_hitToTuple_.get()); if (this->params_.useSimpleTripletCleaner_) { kernel_simpleTripletCleaner - <<>>(tracks_d, - quality_d, + <<>>(tracks_view, this->params_.minHitsForSharingCut_, this->params_.dupPassThrough_, this->device_hitToTuple_.get()); } else { kernel_tripletCleaner - <<>>(tracks_d, - quality_d, + <<>>(tracks_view, this->params_.minHitsForSharingCut_, this->params_.dupPassThrough_, this->device_hitToTuple_.get()); @@ -347,7 +332,7 @@ void CAHitNtupletGeneratorKernelsGPU::classifyTuples(HitsOnCPU co if (this->params_.doStats_) { numberOfBlocks = (std::max(nhits, int(this->params_.cellCuts_.maxNumberOfDoublets_)) + blockSize - 1) / blockSize; kernel_checkOverflows - <<>>(tuples_d, + <<>>(tracks_view, this->device_tupleMultiplicity_.get(), 
this->device_hitToTuple_.get(), this->device_hitTuple_apc_, @@ -370,7 +355,7 @@ void CAHitNtupletGeneratorKernelsGPU::classifyTuples(HitsOnCPU co cudaCheck(cudaGetLastError()); numberOfBlocks = (3 * TrackerTraits::maxNumberOfQuadruplets / 4 + blockSize - 1) / blockSize; kernel_doStatsForTracks - <<>>(tuples_d, quality_d, this->counters_); + <<>>(tracks_view, this->counters_); //why sometimes yes and some no? cudaCheck(cudaGetLastError()); } #ifdef GPU_DEBUG @@ -385,14 +370,13 @@ void CAHitNtupletGeneratorKernelsGPU::classifyTuples(HitsOnCPU co std::lock_guard guard(lock); ++iev; for (int k = 0; k < 20000; k += 500) { - kernel_print_found_ntuplets<<<1, 32, 0, cudaStream>>>( - hh.view(), tuples_d, tracks_d, quality_d, this->device_hitToTuple_.get(), k, k + 500, iev); - cudaDeviceSynchronize(); + kernel_print_found_ntuplets + <<<1, 32, 0, cudaStream>>>(hh, tracks_view, this->device_hitToTuple_.get(), k, k + 500, iev); + cudaCheck(cudaStreamSynchronize(cudaStream)); } - kernel_print_found_ntuplets<<<1, 32, 0, cudaStream>>>( - hh.view(), tuples_d, tracks_d, quality_d, this->device_hitToTuple_.get(), 20000, 1000000, iev); - cudaDeviceSynchronize(); - // cudaStreamSynchronize(cudaStream); + kernel_print_found_ntuplets + <<<1, 32, 0, cudaStream>>>(hh, tracks_view, this->device_hitToTuple_.get(), 20000, 1000000, iev); + cudaCheck(cudaStreamSynchronize(cudaStream)); } #endif } diff --git a/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorKernels.h b/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorKernels.h index b595106299d71..f019283b90469 100644 --- a/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorKernels.h +++ b/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorKernels.h @@ -3,9 +3,15 @@ // #define GPU_DEBUG -#include "CUDADataFormats/Track/interface/PixelTrackHeterogeneous.h" #include "GPUCACell.h" #include "gpuPixelDoublets.h" + +#include "CUDADataFormats/Track/interface/PixelTrackUtilities.h" +#include "CUDADataFormats/TrackingRecHit/interface/TrackingRecHitsUtilities.h" +#include "CUDADataFormats/Common/interface/HeterogeneousSoA.h" +#include "CUDADataFormats/TrackingRecHit/interface/TrackingRecHitSoADevice.h" +#include "CUDADataFormats/Track/interface/TrackSoAHeterogeneousHost.h" + // #define DUMP_GPU_TK_TUPLES namespace caHitNtupletGenerator { @@ -201,8 +207,9 @@ class CAHitNtupletGeneratorKernels { template using unique_ptr = typename Traits::template unique_ptr; - using HitsView = TrackingRecHit2DSOAViewT; - using HitsOnCPU = TrackingRecHit2DHeterogeneousT; + using HitsView = TrackingRecHitSoAView; + using HitsConstView = TrackingRecHitSoAConstView; + using TkSoAView = TrackSoAView; using HitToTuple = caStructures::HitToTupleT; using TupleMultiplicity = caStructures::TupleMultiplicityT; @@ -216,8 +223,7 @@ class CAHitNtupletGeneratorKernels { using CACell = GPUCACellT; using Quality = pixelTrack::Quality; - using TkSoA = pixelTrack::TrackSoAT; - using HitContainer = pixelTrack::HitContainerT; + using HitContainer = typename TrackSoA::HitContainer; CAHitNtupletGeneratorKernels(Params const& params) : params_(params), paramsMaxDoubletes3Quarters_(3 * params.cellCuts_.maxNumberOfDoublets_ / 4) {} @@ -226,11 +232,11 @@ class CAHitNtupletGeneratorKernels { TupleMultiplicity const* tupleMultiplicity() const { return device_tupleMultiplicity_.get(); } - void launchKernels(HitsOnCPU const& hh, TkSoA* tuples_d, cudaStream_t cudaStream); + void launchKernels(const HitsConstView& hh, TkSoAView& track_view, cudaStream_t cudaStream); - void 
classifyTuples(HitsOnCPU const& hh, TkSoA* tuples_d, cudaStream_t cudaStream); + void classifyTuples(const HitsConstView& hh, TkSoAView& track_view, cudaStream_t cudaStream); - void buildDoublets(HitsOnCPU const& hh, cudaStream_t stream); + void buildDoublets(const HitsConstView& hh, int32_t offsetBPIX2, cudaStream_t stream); void allocateOnGPU(int32_t nHits, cudaStream_t stream); void cleanup(cudaStream_t cudaStream); @@ -283,20 +289,24 @@ class CAHitNtupletGeneratorKernels { template class CAHitNtupletGeneratorKernelsGPU : public CAHitNtupletGeneratorKernels { using CAHitNtupletGeneratorKernels::CAHitNtupletGeneratorKernels; - using HitsOnCPU = TrackingRecHit2DHeterogeneousT; - using TkSoA = pixelTrack::TrackSoAT; + using Counters = caHitNtupletGenerator::Counters; - using HitContainer = pixelTrack::HitContainerT; + using CAParams = caHitNtupletGenerator::CAParamsT; + + using HitContainer = typename TrackSoA::HitContainer; + using CellNeighborsVector = caStructures::CellNeighborsVectorT; using HitToTuple = caStructures::HitToTupleT; using CellTracksVector = caStructures::CellTracksVectorT; using TupleMultiplicity = caStructures::TupleMultiplicityT; - using CAParams = caHitNtupletGenerator::CAParamsT; + + using HitsConstView = TrackingRecHitSoAConstView; + using TkSoAView = TrackSoAView; public: - void launchKernels(HitsOnCPU const& hh, TkSoA* tuples_d, cudaStream_t cudaStream); - void classifyTuples(HitsOnCPU const& hh, TkSoA* tuples_d, cudaStream_t cudaStream); - void buildDoublets(HitsOnCPU const& hh, cudaStream_t stream); + void launchKernels(const HitsConstView& hh, TkSoAView& track_view, cudaStream_t cudaStream); + void classifyTuples(const HitsConstView& hh, TkSoAView& track_view, cudaStream_t cudaStream); + void buildDoublets(const HitsConstView& hh, int32_t offsetBPIX2, cudaStream_t stream); void allocateOnGPU(int32_t nHits, cudaStream_t stream); static void printCounters(Counters const* counters); }; @@ -304,19 +314,24 @@ class CAHitNtupletGeneratorKernelsGPU : public CAHitNtupletGeneratorKernels class CAHitNtupletGeneratorKernelsCPU : public CAHitNtupletGeneratorKernels { using CAHitNtupletGeneratorKernels::CAHitNtupletGeneratorKernels; - using HitsOnCPU = TrackingRecHit2DHeterogeneousT; - using TkSoA = pixelTrack::TrackSoAT; + using Counters = caHitNtupletGenerator::Counters; + using CAParams = caHitNtupletGenerator::CAParamsT; + + using HitContainer = typename TrackSoA::HitContainer; + using CellNeighborsVector = caStructures::CellNeighborsVectorT; using HitToTuple = caStructures::HitToTupleT; using CellTracksVector = caStructures::CellTracksVectorT; using TupleMultiplicity = caStructures::TupleMultiplicityT; - using CAParams = caHitNtupletGenerator::CAParamsT; + + using HitsConstView = TrackingRecHitSoAConstView; + using TkSoAView = TrackSoAView; public: - void launchKernels(HitsOnCPU const& hh, TkSoA* tuples_d, cudaStream_t cudaStream); - void classifyTuples(HitsOnCPU const& hh, TkSoA* tuples_d, cudaStream_t cudaStream); - void buildDoublets(HitsOnCPU const& hh, cudaStream_t stream); + void launchKernels(const HitsConstView& hh, TkSoAView& track_view, cudaStream_t cudaStream); + void classifyTuples(const HitsConstView& hh, TkSoAView& track_view, cudaStream_t cudaStream); + void buildDoublets(const HitsConstView& hh, int32_t offsetBPIX2, cudaStream_t stream); void allocateOnGPU(int32_t nHits, cudaStream_t stream); static void printCounters(Counters const* counters); }; diff --git a/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorKernelsImpl.h 
b/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorKernelsImpl.h index 03112e0f3fc48..85386305eca6a 100644 --- a/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorKernelsImpl.h +++ b/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorKernelsImpl.h @@ -15,6 +15,9 @@ #include "HeterogeneousCore/CUDAUtilities/interface/cuda_assert.h" #include "RecoLocalTracker/SiPixelRecHits/interface/pixelCPEforGPU.h" +#include "CUDADataFormats/Track/interface/PixelTrackUtilities.h" +#include "CUDADataFormats/TrackingRecHit/interface/TrackingRecHitsUtilities.h" + #include "CAStructures.h" #include "CAHitNtupletGeneratorKernels.h" #include "GPUCACell.h" @@ -28,8 +31,6 @@ namespace caHitNtupletGeneratorKernels { constexpr float nSigma2 = 25.f; //all of these below are mostly to avoid brining around the relative namespace - template - using HitsView = TrackingRecHit2DSOAViewT; template using HitToTuple = caStructures::HitToTupleT; @@ -49,13 +50,13 @@ namespace caHitNtupletGeneratorKernels { using Quality = pixelTrack::Quality; template - using TkSoA = pixelTrack::TrackSoAT; + using TkSoAView = TrackSoAView; template - using HitContainer = pixelTrack::HitContainerT; + using HitContainer = typename TrackSoA::HitContainer; template - using Hits = typename GPUCACellT::Hits; + using HitsConstView = typename GPUCACellT::HitsConstView; template using QualityCuts = pixelTrack::QualityCutsT; @@ -66,7 +67,7 @@ namespace caHitNtupletGeneratorKernels { using Counters = caHitNtupletGenerator::Counters; template - __global__ void kernel_checkOverflows(HitContainer const *foundNtuplets, + __global__ void kernel_checkOverflows(TkSoAView tracks_view, TupleMultiplicity const *tupleMultiplicity, HitToTuple const *hitToTuple, cms::cuda::AtomicPairCounter *apc, @@ -99,16 +100,16 @@ namespace caHitNtupletGeneratorKernels { nHits, hitToTuple->totOnes()); if (apc->get().m < TrackerTraits::maxNumberOfQuadruplets) { - assert(foundNtuplets->size(apc->get().m) == 0); - assert(foundNtuplets->size() == apc->get().n); + assert(tracks_view.hitIndices().size(apc->get().m) == 0); + assert(tracks_view.hitIndices().size() == apc->get().n); } } - for (int idx = first, nt = foundNtuplets->nOnes(); idx < nt; idx += gridDim.x * blockDim.x) { - if (foundNtuplets->size(idx) > TrackerTraits::maxHitsOnTrack) // current real limit - printf("ERROR %d, %d\n", idx, foundNtuplets->size(idx)); - assert(foundNtuplets->size(idx) <= TrackerTraits::maxHitsOnTrack); - for (auto ih = foundNtuplets->begin(idx); ih != foundNtuplets->end(idx); ++ih) + for (int idx = first, nt = tracks_view.hitIndices().nOnes(); idx < nt; idx += gridDim.x * blockDim.x) { + if (tracks_view.hitIndices().size(idx) > TrackerTraits::maxHitsOnTrack) // current real limit + printf("ERROR %d, %d\n", idx, tracks_view.hitIndices().size(idx)); + assert(ftracks_view.hitIndices().size(idx) <= TrackerTraits::maxHitsOnTrack); + for (auto ih = tracks_view.hitIndices().begin(idx); ih != tracks_view.hitIndices().end(idx); ++ih) assert(int(*ih) < nHits); } #endif @@ -168,7 +169,7 @@ namespace caHitNtupletGeneratorKernels { template __global__ void kernel_fishboneCleaner(GPUCACellT const *cells, uint32_t const *__restrict__ nCells, - Quality *quality) { + TkSoAView tracks_view) { constexpr auto reject = pixelTrack::Quality::dup; auto first = threadIdx.x + blockIdx.x * blockDim.x; @@ -178,7 +179,7 @@ namespace caHitNtupletGeneratorKernels { continue; for (auto it : thisCell.tracks()) - quality[it] = reject; + tracks_view[it].quality() = reject; } } @@ -187,14 +188,11 @@ 
namespace caHitNtupletGeneratorKernels { template __global__ void kernel_earlyDuplicateRemover(GPUCACellT const *cells, uint32_t const *__restrict__ nCells, - TkSoA const *__restrict__ ptracks, - Quality *quality, + TkSoAView tracks_view, bool dupPassThrough) { // quality to mark rejected constexpr auto reject = pixelTrack::Quality::edup; /// cannot be loose - auto const &tracks = *ptracks; - assert(nCells); auto first = threadIdx.x + blockIdx.x * blockDim.x; for (int idx = first, nt = (*nCells); idx < nt; idx += gridDim.x * blockDim.x) { @@ -207,7 +205,7 @@ namespace caHitNtupletGeneratorKernels { // find maxNl for (auto it : thisCell.tracks()) { - auto nl = tracks.nLayers(it); + auto nl = tracks_view[it].nLayers(); maxNl = std::max(nl, maxNl); } @@ -216,8 +214,8 @@ namespace caHitNtupletGeneratorKernels { // maxNl = std::min(4, maxNl); for (auto it : thisCell.tracks()) { - if (tracks.nLayers(it) < maxNl) - quality[it] = reject; //no race: simple assignment of the same constant + if (tracks_view[it].nLayers() < maxNl) + tracks_view[it].quality() = reject; //no race: simple assignment of the same constant } } } @@ -226,7 +224,7 @@ namespace caHitNtupletGeneratorKernels { template __global__ void kernel_fastDuplicateRemover(GPUCACellT const *__restrict__ cells, uint32_t const *__restrict__ nCells, - TkSoA *__restrict__ tracks, + TkSoAView tracks_view, bool dupPassThrough) { // quality to mark rejected auto const reject = dupPassThrough ? pixelTrack::Quality::loose : pixelTrack::Quality::dup; @@ -243,45 +241,37 @@ namespace caHitNtupletGeneratorKernels { float mc = maxScore; uint16_t im = tkNotFound; - /* chi2 penalize higher-pt tracks (try rescale it?) - auto score = [&](auto it) { - return tracks->nLayers(it) < 4 ? - std::abs(tracks->tip(it)) : // tip for triplets - tracks->chi2(it); //chi2 for quads - }; - */ - - auto score = [&](auto it) { return std::abs(tracks->tip(it)); }; + auto score = [&](auto it) { return std::abs(TracksUtilities::tip(tracks_view, it)); }; // full crazy combinatorics // full crazy combinatorics int ntr = thisCell.tracks().size(); for (int i = 0; i < ntr - 1; ++i) { auto it = thisCell.tracks()[i]; - auto qi = tracks->quality(it); + auto qi = tracks_view[it].quality(); if (qi <= reject) continue; - auto opi = tracks->stateAtBS.state(it)(2); - auto e2opi = tracks->stateAtBS.covariance(it)(9); - auto cti = tracks->stateAtBS.state(it)(3); - auto e2cti = tracks->stateAtBS.covariance(it)(12); + auto opi = tracks_view[it].state()(2); + auto e2opi = tracks_view[it].covariance()(9); + auto cti = tracks_view[it].state()(3); + auto e2cti = tracks_view[it].covariance()(12); for (auto j = i + 1; j < ntr; ++j) { auto jt = thisCell.tracks()[j]; - auto qj = tracks->quality(jt); + auto qj = tracks_view[jt].quality(); if (qj <= reject) continue; - auto opj = tracks->stateAtBS.state(jt)(2); - auto ctj = tracks->stateAtBS.state(jt)(3); - auto dct = nSigma2 * (tracks->stateAtBS.covariance(jt)(12) + e2cti); + auto opj = tracks_view[jt].state()(2); + auto ctj = tracks_view[jt].state()(3); + auto dct = nSigma2 * (tracks_view[jt].covariance()(12) + e2cti); if ((cti - ctj) * (cti - ctj) > dct) continue; - auto dop = nSigma2 * (tracks->stateAtBS.covariance(jt)(9) + e2opi); + auto dop = nSigma2 * (tracks_view[jt].covariance()(9) + e2opi); if ((opi - opj) * (opi - opj) > dop) continue; if ((qj < qi) || (qj == qi && score(it) < score(jt))) - tracks->quality(jt) = reject; + tracks_view[jt].quality() = reject; else { - tracks->quality(it) = reject; + tracks_view[it].quality() = reject; break; } } 
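The hunks above all apply the same migration: instead of receiving a TkSoA const * plus a separate Quality * array (and dereferencing it with the now-removed auto const &tracks = *ptracks;), the kernels take a TkSoAView by value and read or write per-track fields through element accessors such as tracks_view[it].quality() and tracks_view[it].state()(2), with free helpers like TracksUtilities::tip(tracks_view, it) replacing the old member calls. A minimal, self-contained CUDA sketch of that calling convention follows; TrackViewSketch, its Element proxy and markLowQuality are hypothetical stand-ins written only for illustration, not the generated SoA classes used in this pull request.

#include <cstdint>
#include <cuda_runtime.h>

// Hand-rolled stand-in for a generated SoA view: a few raw column pointers plus a
// size, passed to the kernel by value (no extra device-pointer indirection).
struct TrackViewSketch {
  float* pt;         // transverse-momentum column
  uint8_t* quality;  // quality-flag column
  int nTracks;

  // Proxy so that view[i].pt() / view[i].quality() reads like the views above.
  struct Element {
    float& ptRef;
    uint8_t& qualityRef;
    __device__ float& pt() const { return ptRef; }
    __device__ uint8_t& quality() const { return qualityRef; }
  };
  __device__ Element operator[](int i) const { return Element{pt[i], quality[i]}; }
};

// Grid-stride kernel in the same style as the cleaners above: flag every track
// below ptMin as rejected (quality = 0). Plain store of a constant, no race.
__global__ void markLowQuality(TrackViewSketch view, float ptMin) {
  int first = blockIdx.x * blockDim.x + threadIdx.x;
  for (int i = first; i < view.nTracks; i += gridDim.x * blockDim.x) {
    if (view[i].pt() < ptMin)
      view[i].quality() = 0;
  }
}

int main() {
  constexpr int n = 1024;
  TrackViewSketch view{nullptr, nullptr, n};
  cudaMalloc(&view.pt, n * sizeof(float));
  cudaMalloc(&view.quality, n * sizeof(uint8_t));
  cudaMemset(view.pt, 0, n * sizeof(float));         // all pt = 0, so everything is rejected
  cudaMemset(view.quality, 1, n * sizeof(uint8_t));  // start as "accepted"
  markLowQuality<<<(n + 255) / 256, 256>>>(view, 0.5f);
  cudaDeviceSynchronize();
  cudaFree(view.pt);
  cudaFree(view.quality);
  return 0;
}

Passing the view by value is cheap (a handful of column pointers and a size), which is why the kernels above can drop both the pointer parameter and the local dereference without changing their launch configuration.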
@@ -290,8 +280,8 @@ namespace caHitNtupletGeneratorKernels { // find maxQual auto maxQual = reject; // no duplicate! for (auto it : thisCell.tracks()) { - if (tracks->quality(it) > maxQual) - maxQual = tracks->quality(it); + if (tracks_view[it].quality() > maxQual) + maxQual = tracks_view[it].quality(); } if (maxQual <= loose) @@ -299,7 +289,7 @@ namespace caHitNtupletGeneratorKernels { // find min score for (auto it : thisCell.tracks()) { - if (tracks->quality(it) == maxQual && score(it) < mc) { + if (tracks_view[it].quality() == maxQual && score(it) < mc) { mc = score(it); im = it; } @@ -310,8 +300,8 @@ namespace caHitNtupletGeneratorKernels { // mark all other duplicates (not yet, keep it loose) for (auto it : thisCell.tracks()) { - if (tracks->quality(it) > loose && it != im) - tracks->quality(it) = loose; //no race: simple assignment of the same constant + if (tracks_view[it].quality() > loose && it != im) + tracks_view[it].quality() = loose; //no race: simple assignment of the same constant } } } @@ -319,14 +309,13 @@ namespace caHitNtupletGeneratorKernels { template __global__ void kernel_connect(cms::cuda::AtomicPairCounter *apc1, cms::cuda::AtomicPairCounter *apc2, // just to zero them, - Hits const *__restrict__ hhp, + HitsConstView hh, GPUCACellT *cells, uint32_t const *__restrict__ nCells, CellNeighborsVector *cellNeighbors, OuterHitOfCell const isOuterHitOfCell, CAParams params) { using Cell = GPUCACellT; - auto const &hh = *hhp; auto firstCellIndex = threadIdx.y + blockIdx.y * blockDim.y; auto first = threadIdx.x; @@ -383,16 +372,14 @@ namespace caHitNtupletGeneratorKernels { } template - __global__ void kernel_find_ntuplets(Hits const *__restrict__ hhp, + __global__ void kernel_find_ntuplets(HitsConstView hh, + TkSoAView tracks_view, GPUCACellT *__restrict__ cells, uint32_t const *nCells, CellTracksVector *cellTracks, - HitContainer *foundNtuplets, cms::cuda::AtomicPairCounter *apc, - Quality *__restrict__ quality, CAParams params) { // recursive: not obvious to widen - auto const &hh = *hhp; using Cell = GPUCACellT; @@ -422,8 +409,15 @@ namespace caHitNtupletGeneratorKernels { bool bpix1Start = params.startAt0(pid); - thisCell.template find_ntuplets( - hh, cells, *cellTracks, *foundNtuplets, *apc, quality, stack, params.minHitsPerNtuplet_, bpix1Start); + thisCell.template find_ntuplets(hh, + cells, + *cellTracks, + tracks_view.hitIndices(), + *apc, + tracks_view.quality(), + stack, + params.minHitsPerNtuplet_, + bpix1Start); assert(stack.empty()); } @@ -441,17 +435,16 @@ namespace caHitNtupletGeneratorKernels { } template - __global__ void kernel_countMultiplicity(HitContainer const *__restrict__ foundNtuplets, - Quality const *__restrict__ quality, + __global__ void kernel_countMultiplicity(TkSoAView tracks_view, TupleMultiplicity *tupleMultiplicity) { auto first = blockIdx.x * blockDim.x + threadIdx.x; - for (int it = first, nt = foundNtuplets->nOnes(); it < nt; it += gridDim.x * blockDim.x) { - auto nhits = foundNtuplets->size(it); + for (int it = first, nt = tracks_view.hitIndices().nOnes(); it < nt; it += gridDim.x * blockDim.x) { + auto nhits = tracks_view.hitIndices().size(it); if (nhits < 3) continue; - if (quality[it] == pixelTrack::Quality::edup) + if (tracks_view[it].quality() == pixelTrack::Quality::edup) continue; - assert(quality[it] == pixelTrack::Quality::bad); + assert(tracks_view[it].quality() == pixelTrack::Quality::bad); if (nhits > TrackerTraits::maxHitsOnTrack) // current limit printf("wrong mult %d %d\n", it, nhits); assert(nhits <= 
TrackerTraits::maxHitsOnTrack); @@ -460,17 +453,16 @@ namespace caHitNtupletGeneratorKernels { } template - __global__ void kernel_fillMultiplicity(HitContainer const *__restrict__ foundNtuplets, - Quality const *__restrict__ quality, + __global__ void kernel_fillMultiplicity(TkSoAView tracks_view, TupleMultiplicity *tupleMultiplicity) { auto first = blockIdx.x * blockDim.x + threadIdx.x; - for (int it = first, nt = foundNtuplets->nOnes(); it < nt; it += gridDim.x * blockDim.x) { - auto nhits = foundNtuplets->size(it); + for (int it = first, nt = tracks_view.hitIndices().nOnes(); it < nt; it += gridDim.x * blockDim.x) { + auto nhits = tracks_view.hitIndices().size(it); if (nhits < 3) continue; - if (quality[it] == pixelTrack::Quality::edup) + if (tracks_view[it].quality() == pixelTrack::Quality::edup) continue; - assert(quality[it] == pixelTrack::Quality::bad); + assert(tracks_view[it].quality() == pixelTrack::Quality::bad); if (nhits > TrackerTraits::maxHitsOnTrack) printf("wrong mult %d %d\n", it, nhits); assert(nhits <= TrackerTraits::maxHitsOnTrack); @@ -478,22 +470,21 @@ namespace caHitNtupletGeneratorKernels { } } + ///TODO : why there was quality here? template - __global__ void kernel_classifyTracks(HitContainer const *__restrict__ tuples, - TkSoA const *__restrict__ tracks, - QualityCuts cuts, - Quality *__restrict__ quality) { + __global__ void kernel_classifyTracks(TkSoAView tracks_view, QualityCuts cuts) { + // Quality *__restrict__ quality) { int first = blockDim.x * blockIdx.x + threadIdx.x; - for (int it = first, nt = tuples->nOnes(); it < nt; it += gridDim.x * blockDim.x) { - auto nhits = tuples->size(it); + for (int it = first, nt = tracks_view.hitIndices().nOnes(); it < nt; it += gridDim.x * blockDim.x) { + auto nhits = tracks_view.hitIndices().size(it); if (nhits == 0) break; // guard // if duplicate: not even fit - if (quality[it] == pixelTrack::Quality::edup) + if (tracks_view[it].quality() == pixelTrack::Quality::edup) continue; - assert(quality[it] == pixelTrack::Quality::bad); + assert(tracks_view[it].quality() == pixelTrack::Quality::bad); // mark doublets as bad if (nhits < 3) @@ -502,101 +493,91 @@ namespace caHitNtupletGeneratorKernels { // if the fit has any invalid parameters, mark it as bad bool isNaN = false; for (int i = 0; i < 5; ++i) { - isNaN |= std::isnan(tracks->stateAtBS.state(it)(i)); + isNaN |= std::isnan(tracks_view[it].state()(i)); } if (isNaN) { #ifdef NTUPLE_DEBUG - printf("NaN in fit %d size %d chi2 %f\n", it, tuples->size(it), tracks->chi2(it)); + printf("NaN in fit %d size %d chi2 %f\n", it, tracks_view.hitIndices().size(it), tracks_view[it].chi2()); #endif continue; } - quality[it] = pixelTrack::Quality::strict; + tracks_view[it].quality() = pixelTrack::Quality::strict; - if (cuts.strictCut(tracks, it)) + if (cuts.strictCut(tracks_view, it)) continue; - quality[it] = pixelTrack::Quality::tight; + tracks_view[it].quality() = pixelTrack::Quality::tight; - if (cuts.isHP(tracks, nhits, it)) - quality[it] = pixelTrack::Quality::highPurity; + if (cuts.isHP(tracks_view, nhits, it)) + tracks_view[it].quality() = pixelTrack::Quality::highPurity; } } template - __global__ void kernel_doStatsForTracks(HitContainer const *__restrict__ tuples, - Quality const *__restrict__ quality, - Counters *counters) { + __global__ void kernel_doStatsForTracks(TkSoAView tracks_view, Counters *counters) { int first = blockDim.x * blockIdx.x + threadIdx.x; - for (int idx = first, ntot = tuples->nOnes(); idx < ntot; idx += gridDim.x * blockDim.x) { - if 
(tuples->size(idx) == 0) + for (int idx = first, ntot = tracks_view.hitIndices().nOnes(); idx < ntot; idx += gridDim.x * blockDim.x) { + if (tracks_view.hitIndices().size(idx) == 0) break; //guard - if (quality[idx] < pixelTrack::Quality::loose) + if (tracks_view[idx].quality() < pixelTrack::Quality::loose) continue; atomicAdd(&(counters->nLooseTracks), 1); - if (quality[idx] < pixelTrack::Quality::strict) + if (tracks_view[idx].quality() < pixelTrack::Quality::strict) continue; atomicAdd(&(counters->nGoodTracks), 1); } } template - __global__ void kernel_countHitInTracks(HitContainer const *__restrict__ tuples, - Quality const *__restrict__ quality, - HitToTuple *hitToTuple) { + __global__ void kernel_countHitInTracks(TkSoAView tracks_view, HitToTuple *hitToTuple) { int first = blockDim.x * blockIdx.x + threadIdx.x; - for (int idx = first, ntot = tuples->nOnes(); idx < ntot; idx += gridDim.x * blockDim.x) { - if (tuples->size(idx) == 0) + for (int idx = first, ntot = tracks_view.hitIndices().nOnes(); idx < ntot; idx += gridDim.x * blockDim.x) { + if (tracks_view.hitIndices().size(idx) == 0) break; // guard - for (auto h = tuples->begin(idx); h != tuples->end(idx); ++h) + for (auto h = tracks_view.hitIndices().begin(idx); h != tracks_view.hitIndices().end(idx); ++h) hitToTuple->count(*h); } } template - __global__ void kernel_fillHitInTracks(HitContainer const *__restrict__ tuples, - Quality const *__restrict__ quality, - HitToTuple *hitToTuple) { + __global__ void kernel_fillHitInTracks(TkSoAView tracks_view, HitToTuple *hitToTuple) { int first = blockDim.x * blockIdx.x + threadIdx.x; - for (int idx = first, ntot = tuples->nOnes(); idx < ntot; idx += gridDim.x * blockDim.x) { - if (tuples->size(idx) == 0) + for (int idx = first, ntot = tracks_view.hitIndices().nOnes(); idx < ntot; idx += gridDim.x * blockDim.x) { + if (tracks_view.hitIndices().size(idx) == 0) break; // guard - for (auto h = tuples->begin(idx); h != tuples->end(idx); ++h) + for (auto h = tracks_view.hitIndices().begin(idx); h != tracks_view.hitIndices().end(idx); ++h) hitToTuple->fill(*h, idx); } } template - __global__ void kernel_fillHitDetIndices(HitContainer const *__restrict__ tuples, - HitsView const *__restrict__ hhp, - HitContainer *__restrict__ hitDetIndices) { + __global__ void kernel_fillHitDetIndices(TkSoAView tracks_view, HitsConstView hh) { int first = blockDim.x * blockIdx.x + threadIdx.x; // copy offsets - for (int idx = first, ntot = tuples->totOnes(); idx < ntot; idx += gridDim.x * blockDim.x) { - hitDetIndices->off[idx] = tuples->off[idx]; + for (int idx = first, ntot = tracks_view.hitIndices().totOnes(); idx < ntot; idx += gridDim.x * blockDim.x) { + tracks_view.detIndices().off[idx] = tracks_view.hitIndices().off[idx]; } // fill hit indices - auto const &hh = *hhp; auto nhits = hh.nHits(); - for (int idx = first, ntot = tuples->size(); idx < ntot; idx += gridDim.x * blockDim.x) { - assert(tuples->content[idx] < nhits); - hitDetIndices->content[idx] = hh.detectorIndex(tuples->content[idx]); + for (int idx = first, ntot = tracks_view.hitIndices().size(); idx < ntot; idx += gridDim.x * blockDim.x) { + assert(tracks_view.hitIndices().content[idx] < nhits); + tracks_view.detIndices().content[idx] = hh[tracks_view.hitIndices().content[idx]].detectorIndex(); } } template - __global__ void kernel_fillNLayers(TkSoA *__restrict__ ptracks, cms::cuda::AtomicPairCounter *apc) { - auto &tracks = *ptracks; + __global__ void kernel_fillNLayers(TkSoAView tracks_view, cms::cuda::AtomicPairCounter *apc) { auto first = 
blockIdx.x * blockDim.x + threadIdx.x; // clamp the number of tracks to the capacity of the SoA - auto ntracks = std::min(apc->get().m, tracks.stride() - 1); + auto ntracks = std::min(apc->get().m, tracks_view.metadata().size() - 1); if (0 == first) - tracks.setNTracks(ntracks); + tracks_view.nTracks() = ntracks; for (int idx = first, nt = ntracks; idx < nt; idx += gridDim.x * blockDim.x) { - auto nHits = tracks.nHits(idx); + auto nHits = TracksUtilities::nHits(tracks_view, idx); assert(nHits >= 3); - tracks.nLayers(idx) = tracks.computeNumberOfLayers(idx); + tracks_view[idx].nLayers() = TracksUtilities::computeNumberOfLayers(tracks_view, idx); } } @@ -677,8 +658,7 @@ namespace caHitNtupletGeneratorKernels { // mostly for very forward triplets..... template - __global__ void kernel_rejectDuplicate(TkSoA const *__restrict__ ptracks, - Quality *__restrict__ quality, + __global__ void kernel_rejectDuplicate(TkSoAView tracks_view, uint16_t nmin, bool dupPassThrough, HitToTuple const *__restrict__ phitToTuple) { @@ -686,50 +666,43 @@ namespace caHitNtupletGeneratorKernels { auto const reject = dupPassThrough ? pixelTrack::Quality::loose : pixelTrack::Quality::dup; auto &hitToTuple = *phitToTuple; - auto const &tracks = *ptracks; int first = blockDim.x * blockIdx.x + threadIdx.x; for (int idx = first, ntot = hitToTuple.nOnes(); idx < ntot; idx += gridDim.x * blockDim.x) { if (hitToTuple.size(idx) < 2) continue; - /* chi2 is bad for large pt - auto score = [&](auto it, auto nl) { - return nl < 4 ? std::abs(tracks.tip(it)) : // tip for triplets - tracks.chi2(it); //chi2 - }; - */ - auto score = [&](auto it, auto nl) { return std::abs(tracks.tip(it)); }; + auto score = [&](auto it, auto nl) { return std::abs(TracksUtilities::tip(tracks_view, it)); }; // full combinatorics for (auto ip = hitToTuple.begin(idx); ip < hitToTuple.end(idx) - 1; ++ip) { auto const it = *ip; - auto qi = quality[it]; + auto qi = tracks_view[it].quality(); if (qi <= reject) continue; - auto opi = tracks.stateAtBS.state(it)(2); - auto e2opi = tracks.stateAtBS.covariance(it)(9); - auto cti = tracks.stateAtBS.state(it)(3); - auto e2cti = tracks.stateAtBS.covariance(it)(12); - auto nli = tracks.nLayers(it); + auto opi = tracks_view[it].state()(2); + auto e2opi = tracks_view[it].covariance()(9); + auto cti = tracks_view[it].state()(3); + auto e2cti = tracks_view[it].covariance()(12); + auto nli = tracks_view[it].nLayers(); for (auto jp = ip + 1; jp < hitToTuple.end(idx); ++jp) { auto const jt = *jp; - auto qj = quality[jt]; + auto qj = tracks_view[jt].quality(); if (qj <= reject) continue; - auto opj = tracks.stateAtBS.state(jt)(2); - auto ctj = tracks.stateAtBS.state(jt)(3); - auto dct = nSigma2 * (tracks.stateAtBS.covariance(jt)(12) + e2cti); + auto opj = tracks_view[jt].state()(2); + auto ctj = tracks_view[jt].state()(3); + auto dct = nSigma2 * (tracks_view[jt].covariance()(12) + e2cti); if ((cti - ctj) * (cti - ctj) > dct) continue; - auto dop = nSigma2 * (tracks.stateAtBS.covariance(jt)(9) + e2opi); + auto dop = nSigma2 * (tracks_view[jt].covariance()(9) + e2opi); if ((opi - opj) * (opi - opj) > dop) continue; - auto nlj = tracks.nLayers(jt); + auto nlj = tracks_view[jt].nLayers(); if (nlj < nli || (nlj == nli && (qj < qi || (qj == qi && score(it, nli) < score(jt, nlj))))) - quality[jt] = reject; + tracks_view[jt].quality() = reject; else { - quality[it] = reject; + tracks_view[it].quality() = reject; break; } } @@ -738,9 +711,8 @@ namespace caHitNtupletGeneratorKernels { } template - __global__ void 
kernel_sharedHitCleaner(HitsView const *__restrict__ hhp, - TkSoA const *__restrict__ ptracks, - Quality *__restrict__ quality, + __global__ void kernel_sharedHitCleaner(HitsConstView hh, + TkSoAView tracks_view, int nmin, bool dupPassThrough, HitToTuple const *__restrict__ phitToTuple) { @@ -750,9 +722,7 @@ namespace caHitNtupletGeneratorKernels { auto const longTqual = pixelTrack::Quality::highPurity; auto &hitToTuple = *phitToTuple; - auto const &tracks = *ptracks; - auto const &hh = *hhp; int l1end = hh.hitsLayerStart()[1]; int first = blockDim.x * blockIdx.x + threadIdx.x; @@ -764,10 +734,10 @@ namespace caHitNtupletGeneratorKernels { // find maxNl for (auto it = hitToTuple.begin(idx); it != hitToTuple.end(idx); ++it) { - if (quality[*it] < longTqual) + if (tracks_view[*it].quality() < longTqual) continue; - // if (tracks.nHits(*it)==3) continue; - auto nl = tracks.nLayers(*it); + // if (tracks_view[*it].nHits()==3) continue; + auto nl = tracks_view[*it].nLayers(); maxNl = std::max(nl, maxNl); } @@ -779,21 +749,20 @@ namespace caHitNtupletGeneratorKernels { // kill all tracks shorter than maxHl (only triplets??? for (auto it = hitToTuple.begin(idx); it != hitToTuple.end(idx); ++it) { - auto nl = tracks.nLayers(*it); + auto nl = tracks_view[*it].nLayers(); //checking if shared hit is on bpix1 and if the tuple is short enough if (idx < l1end and nl > nmin) continue; - if (nl < maxNl && quality[*it] > reject) - quality[*it] = reject; + if (nl < maxNl && tracks_view[*it].quality() > reject) + tracks_view[*it].quality() = reject; } } } template - __global__ void kernel_tripletCleaner(TkSoA const *__restrict__ ptracks, - Quality *__restrict__ quality, + __global__ void kernel_tripletCleaner(TkSoAView tracks_view, uint16_t nmin, bool dupPassThrough, HitToTuple const *__restrict__ phitToTuple) { @@ -803,7 +772,6 @@ namespace caHitNtupletGeneratorKernels { auto const good = pixelTrack::Quality::strict; auto &hitToTuple = *phitToTuple; - auto const &tracks = *ptracks; int first = blockDim.x * blockIdx.x + threadIdx.x; for (int idx = first, ntot = hitToTuple.nOnes(); idx < ntot; idx += gridDim.x * blockDim.x) { @@ -816,9 +784,9 @@ namespace caHitNtupletGeneratorKernels { // check if only triplets for (auto it = hitToTuple.begin(idx); it != hitToTuple.end(idx); ++it) { - if (quality[*it] <= good) + if (tracks_view[*it].quality() <= good) continue; - onlyTriplets &= tracks.isTriplet(*it); + onlyTriplets &= TracksUtilities::isTriplet(tracks_view, *it); if (!onlyTriplets) break; } @@ -830,8 +798,8 @@ namespace caHitNtupletGeneratorKernels { // for triplets choose best tip! (should we first find best quality???) 
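kernel_sharedHitCleaner, kernel_tripletCleaner and kernel_simpleTripletCleaner share one arbitration pattern: for every hit, walk the tracks associated to it through the HitToTuple histogram, pick the best surviving candidate (smallest |tip| among those at or above a quality threshold), and demote the others to the reject quality. The sketch below isolates that two-pass pattern; HitToTracksSketch, demoteAmbiguities and the CSR-style off/trackIdx layout are assumptions made for the example (a stand-in for the one-to-many associator, whose real interface is not reproduced here), and the quality codes are plain integers with reject < good.

#include <cfloat>
#include <cmath>
#include <cstdint>
#include <cuda_runtime.h>

// Stand-in for the hit-to-track association used by the cleaners above: tracks
// sharing hit h are trackIdx[off[h] .. off[h+1]).
struct HitToTracksSketch {
  const uint32_t* off;       // nHits + 1 offsets
  const uint32_t* trackIdx;  // flattened track indices
  int nHits;
};

// For every hit shared by >= 2 tracks: keep the candidate with the smallest score
// (here |tip|, as in the cleaners) among those at or above `good`, demote the rest.
__global__ void demoteAmbiguities(HitToTracksSketch assoc,
                                  const float* __restrict__ tip,
                                  uint8_t* quality,
                                  uint8_t good,
                                  uint8_t reject) {
  constexpr uint32_t notFound = 0xffffffffu;
  int first = blockIdx.x * blockDim.x + threadIdx.x;
  for (int h = first; h < assoc.nHits; h += gridDim.x * blockDim.x) {
    uint32_t begin = assoc.off[h];
    uint32_t end = assoc.off[h + 1];
    if (end - begin < 2)
      continue;  // hit used by a single candidate: nothing to arbitrate

    // pass 1: best (lowest |tip|) track among those still at or above `good`
    float best = FLT_MAX;
    uint32_t keep = notFound;
    for (uint32_t p = begin; p < end; ++p) {
      uint32_t t = assoc.trackIdx[p];
      float s = fabsf(tip[t]);
      if (quality[t] >= good && s < best) {
        best = s;
        keep = t;
      }
    }
    if (keep == notFound)
      continue;

    // pass 2: demote every other still-accepted track sharing this hit
    for (uint32_t p = begin; p < end; ++p) {
      uint32_t t = assoc.trackIdx[p];
      if (t != keep && quality[t] > reject)
        quality[t] = reject;  // only ever stores the same constant
    }
  }
}

As in the kernels above, the demotions only ever store the same constant value, which is the property the original code relies on when it annotates these writes with "no race: simple assignment of the same constant".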
for (auto ip = hitToTuple.begin(idx); ip != hitToTuple.end(idx); ++ip) { auto const it = *ip; - if (quality[it] >= good && std::abs(tracks.tip(it)) < mc) { - mc = std::abs(tracks.tip(it)); + if (tracks_view[it].quality() >= good && std::abs(TracksUtilities::tip(tracks_view, it)) < mc) { + mc = std::abs(TracksUtilities::tip(tracks_view, it)); im = it; } } @@ -842,16 +810,15 @@ namespace caHitNtupletGeneratorKernels { // mark worse ambiguities for (auto ip = hitToTuple.begin(idx); ip != hitToTuple.end(idx); ++ip) { auto const it = *ip; - if (quality[it] > reject && it != im) - quality[it] = reject; //no race: simple assignment of the same constant + if (tracks_view[it].quality() > reject && it != im) + tracks_view[it].quality() = reject; //no race: simple assignment of the same constant } } // loop over hits } template - __global__ void kernel_simpleTripletCleaner(TkSoA const *__restrict__ ptracks, - Quality *__restrict__ quality, + __global__ void kernel_simpleTripletCleaner(TkSoAView tracks_view, uint16_t nmin, bool dupPassThrough, HitToTuple const *__restrict__ phitToTuple) { @@ -861,7 +828,6 @@ namespace caHitNtupletGeneratorKernels { auto const good = pixelTrack::Quality::loose; auto &hitToTuple = *phitToTuple; - auto const &tracks = *ptracks; int first = blockDim.x * blockIdx.x + threadIdx.x; for (int idx = first, ntot = hitToTuple.nOnes(); idx < ntot; idx += gridDim.x * blockDim.x) { @@ -874,8 +840,8 @@ namespace caHitNtupletGeneratorKernels { // choose best tip! (should we first find best quality???) for (auto ip = hitToTuple.begin(idx); ip != hitToTuple.end(idx); ++ip) { auto const it = *ip; - if (quality[it] >= good && std::abs(tracks.tip(it)) < mc) { - mc = std::abs(tracks.tip(it)); + if (tracks_view[it].quality() >= good && std::abs(TracksUtilities::tip(tracks_view, it)) < mc) { + mc = std::abs(TracksUtilities::tip(tracks_view, it)); im = it; } } @@ -886,53 +852,50 @@ namespace caHitNtupletGeneratorKernels { // mark worse ambiguities for (auto ip = hitToTuple.begin(idx); ip != hitToTuple.end(idx); ++ip) { auto const it = *ip; - if (quality[it] > reject && tracks.isTriplet(it) && it != im) - quality[it] = reject; //no race: simple assignment of the same constant + if (tracks_view[it].quality() > reject && TracksUtilities::isTriplet(tracks_view, it) && + it != im) + tracks_view[it].quality() = reject; //no race: simple assignment of the same constant } } // loop over hits } template - __global__ void kernel_print_found_ntuplets(HitsView const *__restrict__ hhp, - HitContainer const *__restrict__ ptuples, - TkSoA const *__restrict__ ptracks, - Quality const *__restrict__ quality, + __global__ void kernel_print_found_ntuplets(HitsConstView hh, + TkSoAView tracks_view, HitToTuple const *__restrict__ phitToTuple, int32_t firstPrint, int32_t lastPrint, int iev) { constexpr auto loose = pixelTrack::Quality::loose; - auto const &hh = *hhp; - auto const &foundNtuplets = *ptuples; - auto const &tracks = *ptracks; + int first = firstPrint + blockDim.x * blockIdx.x + threadIdx.x; - for (int i = first, np = std::min(lastPrint, foundNtuplets.nOnes()); i < np; i += blockDim.x * gridDim.x) { - auto nh = foundNtuplets.size(i); + for (int i = first, np = std::min(lastPrint, tracks_view.hitIndices().nOnes()); i < np; + i += blockDim.x * gridDim.x) { + auto nh = tracks_view.hitIndices().size(i); if (nh < 3) continue; - if (quality[i] < loose) + if (tracks_view[i].quality() < loose) continue; printf("TK: %d %d %d %d %f %f %f %f %f %f %f %.3f %.3f %.3f %.3f %.3f %.3f %.3f\n", 10000 * iev + i, - 
int(quality[i]), + int(tracks_view[i].quality()), nh, - tracks.nLayers(i), - tracks.charge(i), - tracks.pt(i), - tracks.eta(i), - tracks.phi(i), - tracks.tip(i), - tracks.zip(i), - // asinhf(fit_results[i].par(3)), - tracks.chi2(i), - hh.zGlobal(*foundNtuplets.begin(i)), - hh.zGlobal(*(foundNtuplets.begin(i) + 1)), - hh.zGlobal(*(foundNtuplets.begin(i) + 2)), - nh > 3 ? hh.zGlobal(int(*(foundNtuplets.begin(i) + 3))) : 0, - nh > 4 ? hh.zGlobal(int(*(foundNtuplets.begin(i) + 4))) : 0, - nh > 5 ? hh.zGlobal(int(*(foundNtuplets.begin(i) + 5))) : 0, - nh > 6 ? hh.zGlobal(int(*(foundNtuplets.begin(i) + nh - 1))) : 0); + tracks_view[i].nLayers(), + TracksUtilities::charge(tracks_view, i), + tracks_view[i].pt(), + tracks_view[i].eta(), + TracksUtilities::phi(tracks_view, i), + TracksUtilities::tip(tracks_view, i), + TracksUtilities::zip(tracks_view, i), + tracks_view[i].chi2(), + hh[*tracks_view.hitIndices().begin(i)].zGlobal(), + hh[*(tracks_view.hitIndices().begin(i) + 1)].zGlobal(), + hh[*(tracks_view.hitIndices().begin(i) + 2)].zGlobal(), + nh > 3 ? hh[int(*(tracks_view.hitIndices().begin(i) + 3))].zGlobal() : 0, + nh > 4 ? hh[int(*(tracks_view.hitIndices().begin(i) + 4))].zGlobal() : 0, + nh > 5 ? hh[int(*(tracks_view.hitIndices().begin(i) + 5))].zGlobal() : 0, + nh > 6 ? hh[int(*(tracks_view.hitIndices().begin(i) + nh - 1))].zGlobal() : 0); } } diff --git a/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorOnGPU.cc b/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorOnGPU.cc index 6d9ac785155d2..f499a6c90d384 100644 --- a/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorOnGPU.cc +++ b/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorOnGPU.cc @@ -21,6 +21,12 @@ #include "HeterogeneousCore/CUDAServices/interface/CUDAService.h" #include "TrackingTools/DetLayers/interface/BarrelDetLayer.h" +#include "CUDADataFormats/Track/interface/TrackSoAHeterogeneousHost.h" +#include "CUDADataFormats/Track/interface/TrackSoAHeterogeneousDevice.h" + +#include "CUDADataFormats/TrackingRecHit/interface/TrackingRecHitSoAHost.h" +#include "CUDADataFormats/TrackingRecHit/interface/TrackingRecHitSoADevice.h" + #include "CAHitNtupletGeneratorOnGPU.h" namespace { @@ -66,25 +72,25 @@ namespace { (float)cfg.getParameter("dcaCutOuterTriplet")}}; }; - static constexpr QualityCutsT makeQualityCuts(edm::ParameterSet const& pset) { + static constexpr pixelTrack::QualityCutsT makeQualityCuts(edm::ParameterSet const& pset) { auto coeff = pset.getParameter>("chi2Coeff"); auto ptMax = pset.getParameter("chi2MaxPt"); coeff[1] = (coeff[1] - coeff[0]) / log2(ptMax); - return QualityCutsT{// polynomial coefficients for the pT-dependent chi2 cut - {(float)coeff[0], (float)coeff[1], 0.f, 0.f}, - // max pT used to determine the chi2 cut - (float)ptMax, - // chi2 scale factor: 8 for broken line fit, ?? for Riemann fit - (float)pset.getParameter("chi2Scale"), - // regional cuts for triplets - {(float)pset.getParameter("tripletMaxTip"), - (float)pset.getParameter("tripletMinPt"), - (float)pset.getParameter("tripletMaxZip")}, - // regional cuts for quadruplets - {(float)pset.getParameter("quadrupletMaxTip"), - (float)pset.getParameter("quadrupletMinPt"), - (float)pset.getParameter("quadrupletMaxZip")}}; + return pixelTrack::QualityCutsT{// polynomial coefficients for the pT-dependent chi2 cut + {(float)coeff[0], (float)coeff[1], 0.f, 0.f}, + // max pT used to determine the chi2 cut + (float)ptMax, + // chi2 scale factor: 8 for broken line fit, ?? 
for Riemann fit + (float)pset.getParameter("chi2Scale"), + // regional cuts for triplets + {(float)pset.getParameter("tripletMaxTip"), + (float)pset.getParameter("tripletMinPt"), + (float)pset.getParameter("tripletMaxZip")}, + // regional cuts for quadruplets + {(float)pset.getParameter("quadrupletMaxTip"), + (float)pset.getParameter("quadrupletMinPt"), + (float)pset.getParameter("quadrupletMaxZip")}}; } }; @@ -101,8 +107,8 @@ namespace { {(bool)cfg.getParameter("includeFarForwards")}}; } - static constexpr QualityCutsT makeQualityCuts(edm::ParameterSet const& pset) { - return QualityCutsT{ + static constexpr pixelTrack::QualityCutsT makeQualityCuts(edm::ParameterSet const& pset) { + return pixelTrack::QualityCutsT{ (float)pset.getParameter("maxChi2"), (float)pset.getParameter("minPt"), (float)pset.getParameter("maxTip"), @@ -274,37 +280,30 @@ void CAHitNtupletGeneratorOnGPU::endJob() { } template -PixelTrackHeterogeneousT CAHitNtupletGeneratorOnGPU::makeTuplesAsync( +TrackSoAHeterogeneousDevice CAHitNtupletGeneratorOnGPU::makeTuplesAsync( HitsOnGPU const& hits_d, float bfield, cudaStream_t stream) const { using HelixFitOnGPU = HelixFitOnGPU; - using PixelTrackHeterogeneous = PixelTrackHeterogeneousT; + using TrackSoA = TrackSoAHeterogeneousDevice; using GPUKernels = CAHitNtupletGeneratorKernelsGPU; - PixelTrackHeterogeneous tracks(cms::cuda::make_device_unique(stream)); - - auto* soa = tracks.get(); - assert(soa); - cudaCheck(cudaGetLastError()); + TrackSoA tracks(stream); GPUKernels kernels(m_params); kernels.setCounters(m_counters); kernels.allocateOnGPU(hits_d.nHits(), stream); - cudaCheck(cudaGetLastError()); - kernels.buildDoublets(hits_d, stream); - cudaCheck(cudaGetLastError()); + kernels.buildDoublets(hits_d.view(), hits_d.offsetBPIX2(), stream); - kernels.launchKernels(hits_d, soa, stream); - cudaCheck(cudaGetLastError()); + kernels.launchKernels(hits_d.view(), tracks.view(), stream); HelixFitOnGPU fitter(bfield, m_params.fitNas4_); - fitter.allocateOnGPU(&(soa->hitIndices), kernels.tupleMultiplicity(), soa); + fitter.allocateOnGPU(kernels.tupleMultiplicity(), tracks.view()); if (m_params.useRiemannFit_) { fitter.launchRiemannKernels(hits_d.view(), hits_d.nHits(), TrackerTraits::maxNumberOfQuadruplets, stream); } else { fitter.launchBrokenLineKernels(hits_d.view(), hits_d.nHits(), TrackerTraits::maxNumberOfQuadruplets, stream); } - kernels.classifyTuples(hits_d, soa, stream); + kernels.classifyTuples(hits_d.view(), tracks.view(), stream); #ifdef GPU_DEBUG cudaDeviceSynchronize(); cudaCheck(cudaGetLastError()); @@ -315,47 +314,43 @@ PixelTrackHeterogeneousT CAHitNtupletGeneratorOnGPU -PixelTrackHeterogeneousT CAHitNtupletGeneratorOnGPU::makeTuples(HitsOnCPU const& hits_d, - float bfield) const { +TrackSoAHeterogeneousHost CAHitNtupletGeneratorOnGPU::makeTuples(HitsOnCPU const& hits_h, + float bfield) const { using HelixFitOnGPU = HelixFitOnGPU; - using PixelTrackHeterogeneous = PixelTrackHeterogeneousT; + using TrackSoA = TrackSoAHeterogeneousHost; using CPUKernels = CAHitNtupletGeneratorKernelsCPU; - PixelTrackHeterogeneous tracks(std::make_unique()); - - auto* soa = tracks.get(); - assert(soa); + TrackSoA tracks; CPUKernels kernels(m_params); kernels.setCounters(m_counters); - kernels.allocateOnGPU(hits_d.nHits(), nullptr); + kernels.allocateOnGPU(hits_h.nHits(), nullptr); - kernels.buildDoublets(hits_d, nullptr); - kernels.launchKernels(hits_d, soa, nullptr); + kernels.buildDoublets(hits_h.view(), hits_h.offsetBPIX2(), nullptr); + kernels.launchKernels(hits_h.view(), 
tracks.view(), nullptr); - if (0 == hits_d.nHits()) + if (0 == hits_h.nHits()) return tracks; // now fit HelixFitOnGPU fitter(bfield, m_params.fitNas4_); - fitter.allocateOnGPU(&(soa->hitIndices), kernels.tupleMultiplicity(), soa); + fitter.allocateOnGPU(kernels.tupleMultiplicity(), tracks.view()); if (m_params.useRiemannFit_) { - fitter.launchRiemannKernelsOnCPU(hits_d.view(), hits_d.nHits(), TrackerTraits::maxNumberOfQuadruplets); + fitter.launchRiemannKernelsOnCPU(hits_h.view(), hits_h.nHits(), TrackerTraits::maxNumberOfQuadruplets); } else { - fitter.launchBrokenLineKernelsOnCPU(hits_d.view(), hits_d.nHits(), TrackerTraits::maxNumberOfQuadruplets); + fitter.launchBrokenLineKernelsOnCPU(hits_h.view(), hits_h.nHits(), TrackerTraits::maxNumberOfQuadruplets); } - kernels.classifyTuples(hits_d, soa, nullptr); + kernels.classifyTuples(hits_h.view(), tracks.view(), nullptr); #ifdef GPU_DEBUG std::cout << "finished building pixel tracks on CPU" << std::endl; #endif // check that the fixed-size SoA does not overflow - auto const& tsoa = *soa; - auto maxTracks = tsoa.stride(); - auto nTracks = tsoa.nTracks(); + auto maxTracks = tracks.view().metadata().size(); + auto nTracks = tracks.view().nTracks(); assert(nTracks < maxTracks); if (nTracks == maxTracks - 1) { edm::LogWarning("PixelTracks") << "Unsorted reconstructed pixel tracks truncated to " << maxTracks - 1 diff --git a/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorOnGPU.h b/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorOnGPU.h index 745579b960b76..8ee65736541f3 100644 --- a/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorOnGPU.h +++ b/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorOnGPU.h @@ -2,8 +2,14 @@ #define RecoPixelVertexing_PixelTriplets_plugins_CAHitNtupletGeneratorOnGPU_h #include -#include "CUDADataFormats/TrackingRecHit/interface/TrackingRecHit2DHeterogeneous.h" -#include "CUDADataFormats/Track/interface/PixelTrackHeterogeneous.h" +// #include "CUDADataFormats/TrackingRecHit/interface/TrackingRecHit2DHeterogeneous.h" +// #include "CUDADataFormats/Track/interface/PixelTrackHeterogeneous.h" +#include "CUDADataFormats/Track/interface/TrackSoAHeterogeneousHost.h" +#include "CUDADataFormats/Track/interface/TrackSoAHeterogeneousDevice.h" + +#include "CUDADataFormats/TrackingRecHit/interface/TrackingRecHitsUtilities.h" +#include "CUDADataFormats/TrackingRecHit/interface/TrackingRecHitSoAHost.h" +#include "CUDADataFormats/TrackingRecHit/interface/TrackingRecHitSoADevice.h" #include "DataFormats/SiPixelDetId/interface/PixelSubdetector.h" #include "FWCore/ParameterSet/interface/ParameterSet.h" @@ -24,20 +30,20 @@ namespace edm { template class CAHitNtupletGeneratorOnGPU { public: - using PixelTrackHeterogeneous = PixelTrackHeterogeneousT; - - using HitsView = TrackingRecHit2DSOAViewT; - using HitsOnGPU = TrackingRecHit2DGPUT; - using HitsOnCPU = TrackingRecHit2DCPUT; - using hindex_type = typename HitsView::hindex_type; + using HitsView = TrackingRecHitSoAView; + using HitsConstView = TrackingRecHitSoAConstView; + using HitsOnGPU = TrackingRecHitSoADevice; //TODO move to OnDevice + using HitsOnCPU = TrackingRecHitSoAHost; //TODO move to OnHost + using hindex_type = typename TrackingRecHitSoA::hindex_type; using HitToTuple = caStructures::HitToTupleT; using TupleMultiplicity = caStructures::TupleMultiplicityT; using OuterHitOfCell = caStructures::OuterHitOfCellT; using GPUCACell = GPUCACellT; - using OutputSoA = pixelTrack::TrackSoAT; - using HitContainer = typename 
OutputSoA::HitContainer; + using TrackSoAHost = TrackSoAHeterogeneousHost; + using TrackSoADevice = TrackSoAHeterogeneousDevice; + using HitContainer = typename TrackSoA::HitContainer; using Tuple = HitContainer; using CellNeighborsVector = caStructures::CellNeighborsVectorT; @@ -56,21 +62,20 @@ class CAHitNtupletGeneratorOnGPU { static void fillDescriptions(edm::ParameterSetDescription& desc); static void fillDescriptionsCommon(edm::ParameterSetDescription& desc); - //static const char* fillDescriptionsLabel() { return "caHitNtupletOnGPU"; } void beginJob(); void endJob(); - PixelTrackHeterogeneous makeTuplesAsync(HitsOnGPU const& hits_d, float bfield, cudaStream_t stream) const; + TrackSoADevice makeTuplesAsync(HitsOnGPU const& hits_d, float bfield, cudaStream_t stream) const; - PixelTrackHeterogeneous makeTuples(HitsOnCPU const& hits_d, float bfield) const; + TrackSoAHost makeTuples(HitsOnCPU const& hits_d, float bfield) const; private: - void buildDoublets(HitsOnGPU const& hh, cudaStream_t stream) const; + void buildDoublets(const HitsConstView& hh, cudaStream_t stream) const; - void hitNtuplets(HitsOnGPU const& hh, const edm::EventSetup& es, bool useRiemannFit, cudaStream_t cudaStream); + void hitNtuplets(const HitsConstView& hh, const edm::EventSetup& es, bool useRiemannFit, cudaStream_t cudaStream); - void launchKernels(HitsOnGPU const& hh, bool useRiemannFit, cudaStream_t cudaStream) const; + void launchKernels(const HitsConstView& hh, bool useRiemannFit, cudaStream_t cudaStream) const; Params m_params; diff --git a/RecoPixelVertexing/PixelTriplets/plugins/GPUCACell.h b/RecoPixelVertexing/PixelTriplets/plugins/GPUCACell.h index 965889abcb268..2f8ae9105ac55 100644 --- a/RecoPixelVertexing/PixelTriplets/plugins/GPUCACell.h +++ b/RecoPixelVertexing/PixelTriplets/plugins/GPUCACell.h @@ -9,12 +9,12 @@ #include -#include "CUDADataFormats/TrackingRecHit/interface/TrackingRecHit2DHeterogeneous.h" +#include "CUDADataFormats/TrackingRecHit/interface/TrackingRecHitsUtilities.h" #include "HeterogeneousCore/CUDAUtilities/interface/SimpleVector.h" #include "HeterogeneousCore/CUDAUtilities/interface/VecArray.h" #include "HeterogeneousCore/CUDAUtilities/interface/cuda_assert.h" #include "RecoPixelVertexing/PixelTriplets/interface/CircleEq.h" -#include "CUDADataFormats/Track/interface/PixelTrackHeterogeneous.h" +#include "CUDADataFormats/Track/interface/PixelTrackUtilities.h" #include "Geometry/CommonTopologies/interface/SimplePixelTopology.h" #include "CAStructures.h" @@ -31,14 +31,14 @@ class GPUCACellT { using CellNeighborsVector = caStructures::CellNeighborsVectorT; using CellTracksVector = caStructures::CellTracksVectorT; - using Hits = TrackingRecHit2DSOAViewT; + using HitsConstView = TrackingRecHitSoAConstView; using hindex_type = typename TrackerTraits::hindex_type; using tindex_type = typename TrackerTraits::tindex_type; static constexpr auto invalidHitId = std::numeric_limits::max(); using TmpTuple = cms::cuda::VecArray; - using HitContainer = pixelTrack::HitContainerT; + using HitContainer = typename TrackSoA::HitContainer; using Quality = pixelTrack::Quality; static constexpr auto bad = pixelTrack::Quality::bad; @@ -48,7 +48,7 @@ class GPUCACellT { __device__ __forceinline__ void init(CellNeighborsVector& cellNeighbors, CellTracksVector& cellTracks, - Hits const& hh, + const HitsConstView& hh, int layerPairId, hindex_type innerHitId, hindex_type outerHitId) { @@ -59,8 +59,8 @@ class GPUCACellT { theFishboneId = invalidHitId; // optimization that depends on access pattern - theInnerZ = 
hh.zGlobal(innerHitId); - theInnerR = hh.rGlobal(innerHitId); + theInnerZ = hh[innerHitId].zGlobal(); + theInnerR = hh[innerHitId].rGlobal(); // link to default empty theOuterNeighbors = &cellNeighbors[0]; @@ -115,22 +115,26 @@ class GPUCACellT { __device__ __forceinline__ CellTracks const& tracks() const { return *theTracks; } __device__ __forceinline__ CellNeighbors& outerNeighbors() { return *theOuterNeighbors; } __device__ __forceinline__ CellNeighbors const& outerNeighbors() const { return *theOuterNeighbors; } - __device__ __forceinline__ float inner_x(Hits const& hh) const { return hh.xGlobal(theInnerHitId); } - __device__ __forceinline__ float outer_x(Hits const& hh) const { return hh.xGlobal(theOuterHitId); } - __device__ __forceinline__ float inner_y(Hits const& hh) const { return hh.yGlobal(theInnerHitId); } - __device__ __forceinline__ float outer_y(Hits const& hh) const { return hh.yGlobal(theOuterHitId); } - __device__ __forceinline__ float inner_z(Hits const& hh) const { return theInnerZ; } + __device__ __forceinline__ float inner_x(const HitsConstView& hh) const { return hh[theInnerHitId].xGlobal(); } + __device__ __forceinline__ float outer_x(const HitsConstView& hh) const { return hh[theOuterHitId].xGlobal(); } + __device__ __forceinline__ float inner_y(const HitsConstView& hh) const { return hh[theInnerHitId].yGlobal(); } + __device__ __forceinline__ float outer_y(const HitsConstView& hh) const { return hh[theOuterHitId].yGlobal(); } + __device__ __forceinline__ float inner_z(const HitsConstView& hh) const { return theInnerZ; } // { return hh.zGlobal(theInnerHitId); } // { return theInnerZ; } - __device__ __forceinline__ float outer_z(Hits const& hh) const { return hh.zGlobal(theOuterHitId); } - __device__ __forceinline__ float inner_r(Hits const& hh) const { return theInnerR; } + __device__ __forceinline__ float outer_z(const HitsConstView& hh) const { return hh[theOuterHitId].zGlobal(); } + __device__ __forceinline__ float inner_r(const HitsConstView& hh) const { return theInnerR; } // { return hh.rGlobal(theInnerHitId); } // { return theInnerR; } - __device__ __forceinline__ float outer_r(Hits const& hh) const { return hh.rGlobal(theOuterHitId); } + __device__ __forceinline__ float outer_r(const HitsConstView& hh) const { return hh[theOuterHitId].rGlobal(); } - __device__ __forceinline__ auto inner_iphi(Hits const& hh) const { return hh.iphi(theInnerHitId); } - __device__ __forceinline__ auto outer_iphi(Hits const& hh) const { return hh.iphi(theOuterHitId); } + __device__ __forceinline__ auto inner_iphi(const HitsConstView& hh) const { return hh[theInnerHitId].iphi(); } + __device__ __forceinline__ auto outer_iphi(const HitsConstView& hh) const { return hh[theOuterHitId].iphi(); } - __device__ __forceinline__ float inner_detIndex(Hits const& hh) const { return hh.detectorIndex(theInnerHitId); } - __device__ __forceinline__ float outer_detIndex(Hits const& hh) const { return hh.detectorIndex(theOuterHitId); } + __device__ __forceinline__ float inner_detIndex(const HitsConstView& hh) const { + return hh[theInnerHitId].detectorIndex(); + } + __device__ __forceinline__ float outer_detIndex(const HitsConstView& hh) const { + return hh[theOuterHitId].detectorIndex(); + } constexpr unsigned int inner_hit_id() const { return theInnerHitId; } constexpr unsigned int outer_hit_id() const { return theOuterHitId; } @@ -142,7 +146,7 @@ class GPUCACellT { theOuterHitId); } - __device__ bool check_alignment(Hits const& hh, + __device__ bool check_alignment(const HitsConstView& hh, 
GPUCACellT const& otherCell, const float ptmin, const float hardCurvCut, @@ -189,7 +193,7 @@ class GPUCACellT { return tan_12_13_half_mul_distance_13_squared * pMin <= thetaCut * distance_13_squared * radius_diff; } - __device__ inline bool dcaCut(Hits const& hh, + __device__ inline bool dcaCut(const HitsConstView& hh, GPUCACellT const& otherCell, const float region_origin_radius_plus_tolerance, const float maxCurv) const { @@ -226,7 +230,7 @@ class GPUCACellT { return std::abs(eq.dca0()) < region_origin_radius_plus_tolerance * std::abs(eq.curvature()); } - __device__ inline bool hole0(Hits const& hh, GPUCACellT const& innerCell) const { + __device__ inline bool hole0(const HitsConstView& hh, GPUCACellT const& innerCell) const { using namespace phase1PixelTopology; int p = innerCell.inner_iphi(hh); @@ -247,7 +251,7 @@ class GPUCACellT { return gap; } - __device__ inline bool hole4(Hits const& hh, GPUCACellT const& innerCell) const { + __device__ inline bool hole4(const HitsConstView& hh, GPUCACellT const& innerCell) const { using namespace phase1PixelTopology; int p = outer_iphi(hh); @@ -274,7 +278,7 @@ class GPUCACellT { // the visit of the graph based on the neighborhood connections between cells. template - __device__ inline void find_ntuplets(Hits const& hh, + __device__ inline void find_ntuplets(const HitsConstView& hh, GPUCACellT* __restrict__ cells, CellTracksVector& cellTracks, HitContainer& foundNtuplets, @@ -356,14 +360,14 @@ class GPUCACellT { __device__ __forceinline__ bool unused() const { return 0 == (uint16_t(StatusBit::kUsed) & theStatus_); } __device__ __forceinline__ void setStatusBits(StatusBit mask) { theStatus_ |= uint16_t(mask); } - __device__ __forceinline__ void setFishbone(hindex_type id, float z, Hits const& hh) { + __device__ __forceinline__ void setFishbone(hindex_type id, float z, const HitsConstView& hh) { // make it deterministic: use the farther apart (in z) auto old = theFishboneId; - while ( - old != - atomicCAS(&theFishboneId, - old, - (invalidHitId == old || std::abs(z - theInnerZ) > std::abs(hh.zGlobal(old) - theInnerZ)) ? id : old)) + while (old != + atomicCAS( + &theFishboneId, + old, + (invalidHitId == old || std::abs(z - theInnerZ) > std::abs(hh[old].zGlobal() - theInnerZ)) ? 
id : old)) old = theFishboneId; } __device__ __forceinline__ auto fishboneId() const { return theFishboneId; } diff --git a/RecoPixelVertexing/PixelTriplets/plugins/HelixFitOnGPU.cc b/RecoPixelVertexing/PixelTriplets/plugins/HelixFitOnGPU.cc index c300329a82208..befd30ffab7b2 100644 --- a/RecoPixelVertexing/PixelTriplets/plugins/HelixFitOnGPU.cc +++ b/RecoPixelVertexing/PixelTriplets/plugins/HelixFitOnGPU.cc @@ -2,17 +2,16 @@ #include "HelixFitOnGPU.h" template -void HelixFitOnGPU::allocateOnGPU( - Tuples const *tuples, - caStructures::TupleMultiplicityT const *tupleMultiplicity, - pixelTrack::TrackSoAT *helix_fit_results) { - tuples_ = tuples; +void HelixFitOnGPU::allocateOnGPU(TupleMultiplicity const *tupleMultiplicity, + OutputSoAView &helix_fit_results) { + tuples_ = &helix_fit_results.hitIndices(); tupleMultiplicity_ = tupleMultiplicity; outputSoa_ = helix_fit_results; assert(tuples_); assert(tupleMultiplicity_); - assert(outputSoa_); + assert(outputSoa_.chi2()); + assert(outputSoa_.pt()); } template diff --git a/RecoPixelVertexing/PixelTriplets/plugins/HelixFitOnGPU.h b/RecoPixelVertexing/PixelTriplets/plugins/HelixFitOnGPU.h index 78bec6f5e2a87..88dc882ce5de9 100644 --- a/RecoPixelVertexing/PixelTriplets/plugins/HelixFitOnGPU.h +++ b/RecoPixelVertexing/PixelTriplets/plugins/HelixFitOnGPU.h @@ -1,8 +1,8 @@ #ifndef RecoPixelVertexing_PixelTriplets_plugins_HelixFitOnGPU_h #define RecoPixelVertexing_PixelTriplets_plugins_HelixFitOnGPU_h -#include "CUDADataFormats/Track/interface/PixelTrackHeterogeneous.h" -#include "CUDADataFormats/TrackingRecHit/interface/TrackingRecHit2DHeterogeneous.h" +#include "CUDADataFormats/Track/interface/PixelTrackUtilities.h" +#include "CUDADataFormats/TrackingRecHit/interface/TrackingRecHitsUtilities.h" #include "RecoPixelVertexing/PixelTrackFitting/interface/FitResult.h" #include "Geometry/CommonTopologies/interface/SimplePixelTopology.h" @@ -43,10 +43,13 @@ namespace riemannFit { template class HelixFitOnGPU { public: - using HitsView = TrackingRecHit2DSOAViewT; + using TrackingRecHitSoAs = TrackingRecHitSoA; - using Tuples = pixelTrack::HitContainerT; - using OutputSoA = pixelTrack::TrackSoAT; + using HitView = TrackingRecHitSoAView; + using HitConstView = TrackingRecHitSoAConstView; + + using Tuples = typename TrackSoA::HitContainer; + using OutputSoAView = TrackSoAView; using TupleMultiplicity = caStructures::TupleMultiplicityT; @@ -54,13 +57,16 @@ class HelixFitOnGPU { ~HelixFitOnGPU() { deallocateOnGPU(); } void setBField(double bField) { bField_ = bField; } - void launchRiemannKernels(HitsView const *hv, uint32_t nhits, uint32_t maxNumberOfTuples, cudaStream_t cudaStream); - void launchBrokenLineKernels(HitsView const *hv, uint32_t nhits, uint32_t maxNumberOfTuples, cudaStream_t cudaStream); + void launchRiemannKernels(const HitConstView &hv, uint32_t nhits, uint32_t maxNumberOfTuples, cudaStream_t cudaStream); + void launchBrokenLineKernels(const HitConstView &hv, + uint32_t nhits, + uint32_t maxNumberOfTuples, + cudaStream_t cudaStream); - void launchRiemannKernelsOnCPU(HitsView const *hv, uint32_t nhits, uint32_t maxNumberOfTuples); - void launchBrokenLineKernelsOnCPU(HitsView const *hv, uint32_t nhits, uint32_t maxNumberOfTuples); + void launchRiemannKernelsOnCPU(const HitConstView &hv, uint32_t nhits, uint32_t maxNumberOfTuples); + void launchBrokenLineKernelsOnCPU(const HitConstView &hv, uint32_t nhits, uint32_t maxNumberOfTuples); - void allocateOnGPU(Tuples const *tuples, TupleMultiplicity const *tupleMultiplicity, OutputSoA *outputSoA); + 
void allocateOnGPU(TupleMultiplicity const *tupleMultiplicity, OutputSoAView &helix_fit_results); void deallocateOnGPU(); private: @@ -69,7 +75,7 @@ class HelixFitOnGPU { // fowarded Tuples const *tuples_ = nullptr; TupleMultiplicity const *tupleMultiplicity_ = nullptr; - OutputSoA *outputSoa_; + OutputSoAView outputSoa_; float bField_; const bool fitNas4_; diff --git a/RecoPixelVertexing/PixelTriplets/plugins/RiemannFitOnGPU.cc b/RecoPixelVertexing/PixelTriplets/plugins/RiemannFitOnGPU.cc index e4a7de6adaf4c..2678f60f75b3f 100644 --- a/RecoPixelVertexing/PixelTriplets/plugins/RiemannFitOnGPU.cc +++ b/RecoPixelVertexing/PixelTriplets/plugins/RiemannFitOnGPU.cc @@ -1,7 +1,7 @@ #include "RiemannFitOnGPU.h" template -void HelixFitOnGPU::launchRiemannKernelsOnCPU(HitsView const *hv, +void HelixFitOnGPU::launchRiemannKernelsOnCPU(const TrackingRecHitSoAConstView &hv, uint32_t nhits, uint32_t maxNumberOfTuples) { assert(tuples_); diff --git a/RecoPixelVertexing/PixelTriplets/plugins/RiemannFitOnGPU.cu b/RecoPixelVertexing/PixelTriplets/plugins/RiemannFitOnGPU.cu index 3d6b2d570077e..99c55992bbf71 100644 --- a/RecoPixelVertexing/PixelTriplets/plugins/RiemannFitOnGPU.cu +++ b/RecoPixelVertexing/PixelTriplets/plugins/RiemannFitOnGPU.cu @@ -2,7 +2,7 @@ #include "HeterogeneousCore/CUDAUtilities/interface/device_unique_ptr.h" template -void HelixFitOnGPU::launchRiemannKernels(HitsView const *hv, +void HelixFitOnGPU::launchRiemannKernels(const TrackingRecHitSoAConstView &hv, uint32_t nhits, uint32_t maxNumberOfTuples, cudaStream_t stream) { diff --git a/RecoPixelVertexing/PixelTriplets/plugins/RiemannFitOnGPU.h b/RecoPixelVertexing/PixelTriplets/plugins/RiemannFitOnGPU.h index 18dd205cd13c3..96cccf0d0cc0b 100644 --- a/RecoPixelVertexing/PixelTriplets/plugins/RiemannFitOnGPU.h +++ b/RecoPixelVertexing/PixelTriplets/plugins/RiemannFitOnGPU.h @@ -6,7 +6,8 @@ #include -#include "CUDADataFormats/TrackingRecHit/interface/TrackingRecHit2DHeterogeneous.h" +#include "CUDADataFormats/TrackingRecHit/interface/TrackingRecHitsUtilities.h" +#include "CUDADataFormats/Track/interface/PixelTrackUtilities.h" #include "HeterogeneousCore/CUDAUtilities/interface/cudaCheck.h" #include "HeterogeneousCore/CUDAUtilities/interface/cuda_assert.h" #include "RecoLocalTracker/SiPixelRecHits/interface/pixelCPEforGPU.h" @@ -15,11 +16,9 @@ #include "HelixFitOnGPU.h" template -using HitsOnGPU = TrackingRecHit2DSOAViewT; +using Tuples = typename TrackSoA::HitContainer; template -using Tuples = pixelTrack::HitContainerT; -template -using OutputSoA = pixelTrack::TrackSoAT; +using OutputSoAView = TrackSoAView; template using TupleMultiplicity = caStructures::TupleMultiplicityT; @@ -27,7 +26,7 @@ template __global__ void kernel_FastFit(Tuples const *__restrict__ foundNtuplets, TupleMultiplicity const *__restrict__ tupleMultiplicity, uint32_t nHits, - HitsOnGPU const *__restrict__ hhp, + TrackingRecHitSoAConstView hh, double *__restrict__ phits, float *__restrict__ phits_ge, double *__restrict__ pfast_fit, @@ -68,14 +67,10 @@ __global__ void kernel_FastFit(Tuples const *__restrict__ foundNt auto const *hitId = foundNtuplets->begin(tkid); for (unsigned int i = 0; i < hitsInFit; ++i) { auto hit = hitId[i]; - // printf("Hit global: %f,%f,%f\n", hhp->xg_d[hit],hhp->yg_d[hit],hhp->zg_d[hit]); float ge[6]; - hhp->cpeParams() - .detParams(hhp->detectorIndex(hit)) - .frame.toGlobal(hhp->xerrLocal(hit), 0, hhp->yerrLocal(hit), ge); - // printf("Error: %d: %f,%f,%f,%f,%f,%f\n",hhp->detInd_d[hit],ge[0],ge[1],ge[2],ge[3],ge[4],ge[5]); + 
hh.cpeParams().detParams(hh[hit].detectorIndex()).frame.toGlobal(hh[hit].xerrLocal(), 0, hh[hit].yerrLocal(), ge); - hits.col(i) << hhp->xGlobal(hit), hhp->yGlobal(hit), hhp->zGlobal(hit); + hits.col(i) << hh[hit].xGlobal(), hh[hit].yGlobal(), hh[hit].zGlobal(); hits_ge.col(i) << ge[0], ge[1], ge[2], ge[3], ge[4], ge[5]; } riemannFit::fastFit(hits, fast_fit); @@ -133,13 +128,12 @@ template __global__ void kernel_LineFit(TupleMultiplicity const *__restrict__ tupleMultiplicity, uint32_t nHits, double bField, - OutputSoA *results, + OutputSoAView results_view, double *__restrict__ phits, float *__restrict__ phits_ge, double *__restrict__ pfast_fit_input, riemannFit::CircleFit *__restrict__ circle_fit, uint32_t offset) { - assert(results); assert(circle_fit); assert(N <= nHits); @@ -154,7 +148,7 @@ __global__ void kernel_LineFit(TupleMultiplicity const *__restric break; // get it for the ntuple container (one to one to helix) - auto tkid = *(tupleMultiplicity->begin(nHits) + tuple_idx); + int32_t tkid = *(tupleMultiplicity->begin(nHits) + tuple_idx); riemannFit::Map3xNd hits(phits + local_idx); riemannFit::Map4d fast_fit(pfast_fit_input + local_idx); @@ -164,11 +158,16 @@ __global__ void kernel_LineFit(TupleMultiplicity const *__restric riemannFit::fromCircleToPerigee(circle_fit[local_idx]); - results->stateAtBS.copyFromCircle( - circle_fit[local_idx].par, circle_fit[local_idx].cov, line_fit.par, line_fit.cov, 1.f / float(bField), tkid); - results->pt(tkid) = bField / std::abs(circle_fit[local_idx].par(2)); - results->eta(tkid) = asinhf(line_fit.par(0)); - results->chi2(tkid) = (circle_fit[local_idx].chi2 + line_fit.chi2) / (2 * N - 5); + TracksUtilities::copyFromCircle(results_view, + circle_fit[local_idx].par, + circle_fit[local_idx].cov, + line_fit.par, + line_fit.cov, + 1.f / float(bField), + tkid); + results_view[tkid].pt() = bField / std::abs(circle_fit[local_idx].par(2)); + results_view[tkid].eta() = asinhf(line_fit.par(0)); + results_view[tkid].chi2() = (circle_fit[local_idx].chi2 + line_fit.chi2) / (2 * N - 5); #ifdef RIEMANN_DEBUG printf("kernelLineFit size %d for %d hits circle.par(0,1,2): %d %f,%f,%f\n", diff --git a/RecoPixelVertexing/PixelTriplets/plugins/gpuFishbone.h b/RecoPixelVertexing/PixelTriplets/plugins/gpuFishbone.h index d4b3282574ec3..f32adf9f6e770 100644 --- a/RecoPixelVertexing/PixelTriplets/plugins/gpuFishbone.h +++ b/RecoPixelVertexing/PixelTriplets/plugins/gpuFishbone.h @@ -27,10 +27,10 @@ namespace gpuPixelDoublets { template using OuterHitOfCell = caStructures::OuterHitOfCellT; template - using Hits = typename GPUCACellT::Hits; + using HitsConstView = typename GPUCACellT::HitsConstView; template - __global__ void fishbone(Hits const* __restrict__ hhp, + __global__ void fishbone(HitsConstView hh, GPUCACellT* cells, uint32_t const* __restrict__ nCells, OuterHitOfCell const isOuterHitOfCellWrap, @@ -38,8 +38,6 @@ namespace gpuPixelDoublets { bool checkTrack) { constexpr auto maxCellsPerHit = GPUCACellT::maxCellsPerHit; - auto const& hh = *hhp; - auto const isOuterHitOfCell = isOuterHitOfCellWrap.container; int32_t offset = isOuterHitOfCellWrap.offset; diff --git a/RecoPixelVertexing/PixelTriplets/plugins/gpuPixelDoublets.h b/RecoPixelVertexing/PixelTriplets/plugins/gpuPixelDoublets.h index deed54ca02b5b..740b63ac774a5 100644 --- a/RecoPixelVertexing/PixelTriplets/plugins/gpuPixelDoublets.h +++ b/RecoPixelVertexing/PixelTriplets/plugins/gpuPixelDoublets.h @@ -7,22 +7,6 @@ namespace gpuPixelDoublets { - template - using CellNeighbors = caStructures::CellNeighborsT; - 
template - using CellTracks = caStructures::CellTracksT; - template - using CellNeighborsVector = caStructures::CellNeighborsVectorT; - template - using CellTracksVector = caStructures::CellTracksVectorT; - template - using OuterHitOfCell = caStructures::OuterHitOfCellT; - template - using Hits = typename GPUCACellT::Hits; - - // end constants - // clang-format on - template __global__ void initDoublets(OuterHitOfCell isOuterHitOfCell, int nHits, @@ -59,11 +43,10 @@ namespace gpuPixelDoublets { uint32_t* nCells, CellNeighborsVector* cellNeighbors, CellTracksVector* cellTracks, - TrackingRecHit2DSOAViewT const* __restrict__ hhp, + HitsConstView hh, OuterHitOfCell isOuterHitOfCell, int nActualPairs, CellCutsT cuts) { - auto const& __restrict__ hh = *hhp; doubletsFromHisto( nActualPairs, cells, nCells, cellNeighbors, cellTracks, hh, isOuterHitOfCell, cuts); diff --git a/RecoPixelVertexing/PixelTriplets/plugins/gpuPixelDoubletsAlgos.h b/RecoPixelVertexing/PixelTriplets/plugins/gpuPixelDoubletsAlgos.h index 0f3d786a8e476..eaaefb42b74ae 100644 --- a/RecoPixelVertexing/PixelTriplets/plugins/gpuPixelDoubletsAlgos.h +++ b/RecoPixelVertexing/PixelTriplets/plugins/gpuPixelDoubletsAlgos.h @@ -7,7 +7,7 @@ #include #include -#include "CUDADataFormats/TrackingRecHit/interface/TrackingRecHit2DHeterogeneous.h" +#include "CUDADataFormats/TrackingRecHit/interface/TrackingRecHitsUtilities.h" #include "DataFormats/Math/interface/approx_atan2.h" #include "HeterogeneousCore/CUDAUtilities/interface/VecArray.h" #include "HeterogeneousCore/CUDAUtilities/interface/cuda_assert.h" @@ -32,11 +32,11 @@ namespace gpuPixelDoublets { template using OuterHitOfCell = caStructures::OuterHitOfCellT; template - using Hits = typename GPUCACellT::Hits; + using HitsConstView = typename GPUCACellT::HitsConstView; template struct CellCutsT { - using H = Hits; + using H = HitsConstView; using T = TrackerTraits; const uint32_t maxNumberOfDoublets_; @@ -45,21 +45,21 @@ namespace gpuPixelDoublets { const bool doPtCut_; const bool idealConditions_; //this is actually not used by phase2 - __device__ __forceinline__ bool zSizeCut(H const& hh, int i, int o) const { - auto mi = hh.detectorIndex(i); + __device__ __forceinline__ bool zSizeCut(H hh, int i, int o) const { + const uint32_t mi = hh[i].detectorIndex(); bool innerB1 = mi < T::last_bpix1_detIndex; bool isOuterLadder = idealConditions_ ? true : 0 == (mi / 8) % 2; - auto mes = (!innerB1) || isOuterLadder ? hh.clusterSizeY(i) : -1; + auto mes = (!innerB1) || isOuterLadder ? 
hh[i].clusterSizeY() : -1; if (mes < 0) return false; - auto mo = hh.detectorIndex(o); - auto so = hh.clusterSizeY(o); + const uint32_t mo = hh[o].detectorIndex(); + auto so = hh[o].clusterSizeY(); - auto dz = hh.zGlobal(i) - hh.zGlobal(o); - auto dr = hh.rGlobal(i) - hh.rGlobal(o); + auto dz = hh[i].zGlobal() - hh[o].zGlobal(); + auto dr = hh[i].rGlobal() - hh[o].rGlobal(); auto innerBarrel = mi < T::last_barrel_detIndex; auto onlyBarrel = mo < T::last_barrel_detIndex; @@ -72,14 +72,8 @@ namespace gpuPixelDoublets { : innerBarrel && std::abs(mes - int(std::abs(dz / dr) * T::dzdrFact + 0.5f)) > T::maxDYPred; } - __device__ __forceinline__ bool clusterCut(H const& hh, int i, int o) const { - auto mo = hh.detectorIndex(o); - bool outerFwd = (mo >= T::last_barrel_detIndex); - - if (!outerFwd) - return false; - - auto mi = hh.detectorIndex(i); + __device__ __forceinline__ bool clusterCut(H hh, int i) const { + const uint32_t mi = hh[i].detectorIndex(); bool innerB1orB2 = mi < T::last_bpix2_detIndex; if (!innerB1orB2) @@ -87,13 +81,13 @@ namespace gpuPixelDoublets { bool innerB1 = mi < T::last_bpix1_detIndex; bool isOuterLadder = idealConditions_ ? true : 0 == (mi / 8) % 2; - auto mes = (!innerB1) || isOuterLadder ? hh.clusterSizeY(i) : -1; + auto mes = (!innerB1) || isOuterLadder ? hh[i].clusterSizeY() : -1; - if (innerB1 && outerFwd) // B1 and F1 + if (innerB1) // B1 if (mes > 0 && mes < T::minYsizeB1) return true; // only long cluster (5*8) bool innerB2 = (mi >= T::last_bpix1_detIndex) && (mi < T::last_bpix2_detIndex); //FIXME number - if (innerB2 && outerFwd) // B2 and F1 + if (innerB2) // B2 and F1 if (mes > 0 && mes < T::minYsizeB2) return true; @@ -101,19 +95,13 @@ namespace gpuPixelDoublets { } }; - // template - // struct CellCutsT : public CellCutsCommon {}; - // - // template <> - // struct CellCutsT : public CellCutsCommon {}; - template __device__ __forceinline__ void doubletsFromHisto(uint32_t nPairs, GPUCACellT* cells, uint32_t* nCells, CellNeighborsVector* cellNeighbors, CellTracksVector* cellTracks, - TrackingRecHit2DSOAViewT const& __restrict__ hh, + HitsConstView hh, OuterHitOfCell isOuterHitOfCell, CellCutsT const& cuts) { // ysize cuts (z in the barrel) times 8 @@ -124,10 +112,10 @@ namespace gpuPixelDoublets { const bool doPtCut = cuts.doPtCut_; const uint32_t maxNumOfDoublets = cuts.maxNumberOfDoublets_; - using PhiBinner = typename TrackingRecHit2DSOAViewT::PhiBinner; + using PhiBinner = typename TrackingRecHitSoA::PhiBinner; auto const& __restrict__ phiBinner = hh.phiBinner(); - uint32_t const* __restrict__ offsets = hh.hitsLayerStart(); + uint32_t const* __restrict__ offsets = hh.hitsLayerStart().data(); assert(offsets); auto layerSize = [=](uint8_t li) { return offsets[li + 1] - offsets[li]; }; @@ -168,18 +156,15 @@ namespace gpuPixelDoublets { assert(outer > inner); auto hoff = PhiBinner::histOff(outer); - auto fo = __ldg(phiBinner.begin(hoff)); //first hit on outer for the cluster cut auto i = (0 == pairLayerId) ? 
j : j - innerLayerCumulativeSize[pairLayerId - 1]; i += offsets[inner]; - // printf("Hit in Layer %d %d %d %d\n", i, inner, pairLayerId, j); - assert(i >= offsets[inner]); assert(i < offsets[inner + 1]); // found hit corresponding to our cuda thread, now do the job - if (hh.detectorIndex(i) > gpuClustering::maxNumModules) + if (hh[i].detectorIndex() > gpuClustering::maxNumModules) continue; // invalid /* maybe clever, not effective when zoCut is on @@ -188,16 +173,16 @@ namespace gpuPixelDoublets { if ( ((inner<3) & (outer>3)) && bpos!=fpos) continue; */ - auto mez = hh.zGlobal(i); + auto mez = hh[i].zGlobal(); if (mez < TrackerTraits::minz[pairLayerId] || mez > TrackerTraits::maxz[pairLayerId]) continue; - if (doClusterCut && cuts.clusterCut(hh, i, fo)) + if (doClusterCut && outer > pixelTopology::last_barrel_layer && cuts.clusterCut(hh, i)) continue; - auto mep = hh.iphi(i); - auto mer = hh.rGlobal(i); + auto mep = hh[i].iphi(); + auto mer = hh[i].rGlobal(); // all cuts: true if fails constexpr float z0cut = TrackerTraits::z0Cut; // cm @@ -208,13 +193,13 @@ namespace gpuPixelDoublets { auto ptcut = [&](int j, int16_t idphi) { auto r2t4 = minRadius2T4; auto ri = mer; - auto ro = hh.rGlobal(j); + auto ro = hh[j].rGlobal(); auto dphi = short2phi(idphi); return dphi * dphi * (r2t4 - ri * ro) > (ro - ri) * (ro - ri); }; auto z0cutoff = [&](int j) { - auto zo = hh.zGlobal(j); - auto ro = hh.rGlobal(j); + auto zo = hh[j].zGlobal(); + auto ro = hh[j].rGlobal(); auto dr = ro - mer; return dr > TrackerTraits::maxr[pairLayerId] || dr < 0 || std::abs((mez * ro - mer * zo)) > z0cut * dr; }; @@ -245,14 +230,14 @@ namespace gpuPixelDoublets { auto oi = __ldg(p); assert(oi >= offsets[outer]); assert(oi < offsets[outer + 1]); - auto mo = hh.detectorIndex(oi); + auto mo = hh[oi].detectorIndex(); if (mo > gpuClustering::maxNumModules) continue; // invalid if (doZ0Cut && z0cutoff(oi)) continue; - auto mop = hh.iphi(oi); + auto mop = hh[oi].iphi(); uint16_t idphi = std::min(std::abs(int16_t(mop - mep)), std::abs(int16_t(mep - mop))); if (idphi > iphicut) continue; diff --git a/RecoPixelVertexing/PixelTriplets/test/BuildFile.xml b/RecoPixelVertexing/PixelTriplets/test/BuildFile.xml index d480d7408b9e2..522b186f3351b 100644 --- a/RecoPixelVertexing/PixelTriplets/test/BuildFile.xml +++ b/RecoPixelVertexing/PixelTriplets/test/BuildFile.xml @@ -26,4 +26,5 @@ + diff --git a/RecoPixelVertexing/PixelVertexFinding/plugins/PixelVertexProducerCUDA.cc b/RecoPixelVertexing/PixelVertexFinding/plugins/PixelVertexProducerCUDA.cc index 024c95398b988..b51bd73350940 100644 --- a/RecoPixelVertexing/PixelVertexFinding/plugins/PixelVertexProducerCUDA.cc +++ b/RecoPixelVertexing/PixelVertexFinding/plugins/PixelVertexProducerCUDA.cc @@ -18,13 +18,19 @@ #include "FWCore/Utilities/interface/RunningAverage.h" #include "HeterogeneousCore/CUDACore/interface/ScopedContext.h" +#include "CUDADataFormats/Track/interface/TrackSoAHeterogeneousDevice.h" +#include "CUDADataFormats/Track/interface/TrackSoAHeterogeneousHost.h" +#include "CUDADataFormats/Vertex/interface/ZVertexSoAHeterogeneousDevice.h" +#include "CUDADataFormats/Vertex/interface/ZVertexSoAHeterogeneousHost.h" + #include "gpuVertexFinder.h" #undef PIXVERTEX_DEBUG_PRODUCE template class PixelVertexProducerCUDAT : public edm::global::EDProducer<> { - using PixelTrackHeterogeneous = PixelTrackHeterogeneousT; + using TracksSoADevice = TrackSoAHeterogeneousDevice; + using TracksSoAHost = TrackSoAHeterogeneousHost; using GPUAlgo = gpuVertexFinder::Producer; public: @@ -40,10 +46,10 @@ 
class PixelVertexProducerCUDAT : public edm::global::EDProducer<> { bool onGPU_; - edm::EDGetTokenT> tokenGPUTrack_; - edm::EDPutTokenT tokenGPUVertex_; - edm::EDGetTokenT tokenCPUTrack_; - edm::EDPutTokenT tokenCPUVertex_; + edm::EDGetTokenT> tokenGPUTrack_; + edm::EDPutTokenT> tokenGPUVertex_; + edm::EDGetTokenT tokenCPUTrack_; + edm::EDPutTokenT tokenCPUVertex_; const GPUAlgo gpuAlgo_; @@ -67,12 +73,11 @@ PixelVertexProducerCUDAT::PixelVertexProducerCUDAT(const edm::Par ptMax_(conf.getParameter("PtMax")) // 75. GeV { if (onGPU_) { - tokenGPUTrack_ = - consumes>(conf.getParameter("pixelTrackSrc")); - tokenGPUVertex_ = produces(); + tokenGPUTrack_ = consumes(conf.getParameter("pixelTrackSrc")); + tokenGPUVertex_ = produces(); } else { tokenCPUTrack_ = consumes(conf.getParameter("pixelTrackSrc")); - tokenCPUVertex_ = produces(); + tokenCPUVertex_ = produces(); } } @@ -104,23 +109,20 @@ template void PixelVertexProducerCUDAT::produceOnGPU(edm::StreamID streamID, edm::Event& iEvent, const edm::EventSetup& iSetup) const { - edm::Handle> hTracks; - iEvent.getByToken(tokenGPUTrack_, hTracks); + using TracksSoA = TrackSoAHeterogeneousDevice; + auto hTracks = iEvent.getHandle(tokenGPUTrack_); cms::cuda::ScopedContextProduce ctx{*hTracks}; - auto const* tracks = ctx.get(*hTracks).get(); - - assert(tracks); + auto& tracks = ctx.get(*hTracks); - ctx.emplace(iEvent, tokenGPUVertex_, gpuAlgo_.makeAsync(ctx.stream(), tracks, ptMin_, ptMax_)); + ctx.emplace(iEvent, tokenGPUVertex_, gpuAlgo_.makeAsync(ctx.stream(), tracks.view(), ptMin_, ptMax_)); } template void PixelVertexProducerCUDAT::produceOnCPU(edm::StreamID streamID, edm::Event& iEvent, const edm::EventSetup& iSetup) const { - auto const* tracks = iEvent.get(tokenCPUTrack_).get(); - assert(tracks); + auto& tracks = iEvent.get(tokenCPUTrack_); #ifdef PIXVERTEX_DEBUG_PRODUCE auto const& tsoa = *tracks; @@ -129,8 +131,8 @@ void PixelVertexProducerCUDAT::produceOnCPU(edm::StreamID streamI int32_t nt = 0; for (int32_t it = 0; it < maxTracks; ++it) { - auto nHits = tsoa.nHits(it); - assert(nHits == int(tsoa.hitIndices.size(it))); + auto nHits = TracksUtilities::nHits(tracks.view(), it); + assert(nHits == int(tracks.view().hitIndices().size(it))); if (nHits == 0) break; // this is a guard: maybe we need to move to nTracks... 
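      // [Editor's sketch — not part of the original patch] The pair of changed lines just above
      // shows the accessor migration applied throughout this PR: member functions of the old
      // track SoA are replaced by free helpers that take a const view. Assuming the helper is the
      // TracksUtilities specialization for this producer's TrackerTraits (template argument
      // assumed here), the before/after forms read roughly:
      //
      //   auto nHits = tsoa.nHits(it);                                            // legacy SoA member
      //   auto nHits = TracksUtilities<TrackerTraits>::nHits(tracks.view(), it);  // const view + helper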
nt++; @@ -138,7 +140,7 @@ void PixelVertexProducerCUDAT::produceOnCPU(edm::StreamID streamI std::cout << "found " << nt << " tracks in cpu SoA for Vertexing at " << tracks << std::endl; #endif // PIXVERTEX_DEBUG_PRODUCE - iEvent.emplace(tokenCPUVertex_, gpuAlgo_.make(tracks, ptMin_, ptMax_)); + iEvent.emplace(tokenCPUVertex_, gpuAlgo_.make(tracks.view(), ptMin_, ptMax_)); } template diff --git a/RecoPixelVertexing/PixelVertexFinding/plugins/PixelVertexProducerFromSoA.cc b/RecoPixelVertexing/PixelVertexFinding/plugins/PixelVertexProducerFromSoA.cc index 8cceeaa42cc10..91de2bdb6992b 100644 --- a/RecoPixelVertexing/PixelVertexFinding/plugins/PixelVertexProducerFromSoA.cc +++ b/RecoPixelVertexing/PixelVertexFinding/plugins/PixelVertexProducerFromSoA.cc @@ -1,4 +1,5 @@ -#include "CUDADataFormats/Vertex/interface/ZVertexHeterogeneous.h" +#include "CUDADataFormats/Vertex/interface/ZVertexSoAHeterogeneousHost.h" +#include "CUDADataFormats/Vertex/interface/ZVertexSoAHeterogeneousDevice.h" #include "DataFormats/BeamSpot/interface/BeamSpot.h" #include "DataFormats/Common/interface/OrphanHandle.h" #include "DataFormats/TrackReco/interface/Track.h" @@ -35,17 +36,17 @@ class PixelVertexProducerFromSoA : public edm::global::EDProducer<> { private: void produce(edm::StreamID streamID, edm::Event &iEvent, const edm::EventSetup &iSetup) const override; - edm::EDGetTokenT tokenVertex_; + edm::EDGetTokenT tokenVertex_; edm::EDGetTokenT tokenBeamSpot_; edm::EDGetTokenT tokenTracks_; edm::EDGetTokenT tokenIndToEdm_; }; PixelVertexProducerFromSoA::PixelVertexProducerFromSoA(const edm::ParameterSet &conf) - : tokenVertex_(consumes(conf.getParameter("src"))), - tokenBeamSpot_(consumes(conf.getParameter("beamSpot"))), - tokenTracks_(consumes(conf.getParameter("TrackCollection"))), - tokenIndToEdm_(consumes(conf.getParameter("TrackCollection"))) { + : tokenVertex_(consumes(conf.getParameter("src"))), + tokenBeamSpot_(consumes(conf.getParameter("beamSpot"))), + tokenTracks_(consumes(conf.getParameter("TrackCollection"))), + tokenIndToEdm_(consumes(conf.getParameter("TrackCollection"))) { produces(); } @@ -81,9 +82,9 @@ void PixelVertexProducerFromSoA::produce(edm::StreamID streamID, edm::Event &iEv dydz = bs.dydz(); } - auto const &soa = *(iEvent.get(tokenVertex_).get()); + auto const &soa = iEvent.get(tokenVertex_); - int nv = soa.nvFinal; + int nv = soa.view().nvFinal(); #ifdef PIXVERTEX_DEBUG_PRODUCE std::cout << "converting " << nv << " vertices " @@ -92,20 +93,20 @@ void PixelVertexProducerFromSoA::produce(edm::StreamID streamID, edm::Event &iEv std::set uind; // for verifing index consistency for (int j = nv - 1; j >= 0; --j) { - auto i = soa.sortInd[j]; // on gpu sorted in ascending order.... + auto i = soa.view()[j].sortInd(); // on gpu sorted in ascending order.... assert(i < nv); uind.insert(i); assert(itrk.empty()); - auto z = soa.zv[i]; + auto z = soa.view()[i].zv(); auto x = x0 + dxdz * z; auto y = y0 + dydz * z; z += z0; reco::Vertex::Error err; - err(2, 2) = 1.f / soa.wv[i]; + err(2, 2) = 1.f / soa.view()[i].wv(); err(2, 2) *= 2.; // artifically inflate error //Copy also the tracks (no intention to be efficient....) 
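      // [Editor's note — illustrative, not part of the original patch] The loop below associates
      // EDM tracks to vertex i by a full linear scan over all track indices; only the way the
      // per-track vertex id is read changes with the SoA migration:
      //
      //   soa.idv[k]           // before: bare member array of ZVertexSoA
      //   soa.view()[k].idv()  // after:  element accessor on the host collection's view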
for (auto k = 0U; k < indToEdm.size(); ++k) {
-      if (soa.idv[k] == int16_t(i))
+      if (soa.view()[k].idv() == int16_t(i))
         itrk.push_back(k);
     }
     auto nt = itrk.size();
@@ -119,7 +120,7 @@ void PixelVertexProducerFromSoA::produce(edm::StreamID streamID, edm::Event &iEv
       itrk.clear();
       continue;
     }  // remove outliers
-    (*vertexes).emplace_back(reco::Vertex::Point(x, y, z), err, soa.chi2[i], soa.ndof[i], nt);
+    (*vertexes).emplace_back(reco::Vertex::Point(x, y, z), err, soa.view()[i].chi2(), soa.view()[i].ndof(), nt);
     auto &v = (*vertexes).back();
     v.reserve(itrk.size());
     for (auto it : itrk) {
diff --git a/RecoPixelVertexing/PixelVertexFinding/plugins/PixelVertexSoAFromCUDA.cc b/RecoPixelVertexing/PixelVertexFinding/plugins/PixelVertexSoAFromCUDA.cc
index dc125878b1058..b13b6c96f0bd3 100644
--- a/RecoPixelVertexing/PixelVertexFinding/plugins/PixelVertexSoAFromCUDA.cc
+++ b/RecoPixelVertexing/PixelVertexFinding/plugins/PixelVertexSoAFromCUDA.cc
@@ -2,7 +2,8 @@
 #include "CUDADataFormats/Common/interface/Product.h"
 #include "CUDADataFormats/Common/interface/HostProduct.h"
-#include "CUDADataFormats/Vertex/interface/ZVertexHeterogeneous.h"
+#include "CUDADataFormats/Vertex/interface/ZVertexSoAHeterogeneousHost.h"
+#include "CUDADataFormats/Vertex/interface/ZVertexSoAHeterogeneousDevice.h"
 #include "DataFormats/Common/interface/Handle.h"
 #include "FWCore/Framework/interface/ESHandle.h"
 #include "FWCore/Framework/interface/Event.h"
@@ -30,15 +31,15 @@ class PixelVertexSoAFromCUDA : public edm::stream::EDProducer
                edm::WaitingTaskWithArenaHolder waitingTaskHolder) override;
   void produce(edm::Event& iEvent, edm::EventSetup const& iSetup) override;
-  edm::EDGetTokenT> tokenCUDA_;
-  edm::EDPutTokenT tokenSOA_;
+  edm::EDGetTokenT> tokenCUDA_;
+  edm::EDPutTokenT tokenSOA_;
-  cms::cuda::host::unique_ptr m_soa;
+  ZVertexSoAHost zvertex_h;
 };
 PixelVertexSoAFromCUDA::PixelVertexSoAFromCUDA(const edm::ParameterSet& iConfig)
-    : tokenCUDA_(consumes>(iConfig.getParameter("src"))),
-      tokenSOA_(produces()) {}
+    : tokenCUDA_(consumes>(iConfig.getParameter("src"))),
+      tokenSOA_(produces()) {}
 void PixelVertexSoAFromCUDA::fillDescriptions(edm::ConfigurationDescriptions& descriptions) {
   edm::ParameterSetDescription desc;
@@ -50,16 +51,20 @@ void PixelVertexSoAFromCUDA::fillDescriptions(edm::ConfigurationDescriptions& de
 void PixelVertexSoAFromCUDA::acquire(edm::Event const& iEvent,
                                      edm::EventSetup const& iSetup,
                                      edm::WaitingTaskWithArenaHolder waitingTaskHolder) {
-  auto const& inputDataWrapped = iEvent.get(tokenCUDA_);
+  cms::cuda::Product const& inputDataWrapped = iEvent.get(tokenCUDA_);
   cms::cuda::ScopedContextAcquire ctx{inputDataWrapped, std::move(waitingTaskHolder)};
-  auto const& inputData = ctx.get(inputDataWrapped);
-
-  m_soa = inputData.toHostAsync(ctx.stream());
+  auto const& zvertex_d = ctx.get(inputDataWrapped);  // Vertices on the device
+  zvertex_h = ZVertexSoAHost(ctx.stream());  // Create an instance of the vertex SoA on the host, using the stream
+  cudaCheck(cudaMemcpyAsync(zvertex_h.buffer().get(),
+                            zvertex_d.const_buffer().get(),
+                            zvertex_d.bufferSize(),
+                            cudaMemcpyDeviceToHost,
+                            ctx.stream()));  // Copy data from Device to Host
 }
 void PixelVertexSoAFromCUDA::produce(edm::Event& iEvent, edm::EventSetup const& iSetup) {
   // No copies....
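  // [Editor's note — illustrative, not part of the original patch] zvertex_h was filled by the
  // asynchronous copy issued in acquire(); with the ExternalWork pattern used by this producer,
  // the framework calls produce() only after that stream work has completed, so the host
  // collection can be moved into the event without further synchronization. A minimal sketch of
  // the same device-to-host pattern (hypothetical names, mirroring the calls in acquire() above):
  //
  //   auto host = ZVertexSoAHost(stream);
  //   cudaCheck(cudaMemcpyAsync(host.buffer().get(), device.const_buffer().get(),
  //                             device.bufferSize(), cudaMemcpyDeviceToHost, stream));
  //   // ...once the stream work is known to be done:
  //   event.emplace(token, std::move(host));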
- iEvent.emplace(tokenSOA_, ZVertexHeterogeneous(std::move(m_soa))); + iEvent.emplace(tokenSOA_, std::move(zvertex_h)); } DEFINE_FWK_MODULE(PixelVertexSoAFromCUDA); diff --git a/RecoPixelVertexing/PixelVertexFinding/plugins/PixelVertexWorkSpaceSoADevice.h b/RecoPixelVertexing/PixelVertexFinding/plugins/PixelVertexWorkSpaceSoADevice.h new file mode 100644 index 0000000000000..223c3d7e94785 --- /dev/null +++ b/RecoPixelVertexing/PixelVertexFinding/plugins/PixelVertexWorkSpaceSoADevice.h @@ -0,0 +1,23 @@ +#ifndef RecoPixelVertexing_PixelVertexFinding_PixelVertexWorkSpaceSoADevice_h +#define RecoPixelVertexing_PixelVertexFinding_PixelVertexWorkSpaceSoADevice_h + +#include "CUDADataFormats/Common/interface/PortableDeviceCollection.h" +#include "CUDADataFormats/Vertex/interface/ZVertexUtilities.h" +#include "RecoPixelVertexing/PixelVertexFinding/plugins/PixelVertexWorkSpaceUtilities.h" + +template +class PixelVertexWorkSpaceSoADevice : public cms::cuda::PortableDeviceCollection> { +public: + PixelVertexWorkSpaceSoADevice() = default; + + // Constructor which specifies the SoA size and CUDA stream + explicit PixelVertexWorkSpaceSoADevice(cudaStream_t stream) + : PortableDeviceCollection>(S, stream) {} +}; + +namespace gpuVertexFinder { + namespace workSpace { + using PixelVertexWorkSpaceSoADevice = PixelVertexWorkSpaceSoADevice; + } +} // namespace gpuVertexFinder +#endif diff --git a/RecoPixelVertexing/PixelVertexFinding/plugins/PixelVertexWorkSpaceSoAHost.h b/RecoPixelVertexing/PixelVertexFinding/plugins/PixelVertexWorkSpaceSoAHost.h new file mode 100644 index 0000000000000..6c424fcec8a30 --- /dev/null +++ b/RecoPixelVertexing/PixelVertexFinding/plugins/PixelVertexWorkSpaceSoAHost.h @@ -0,0 +1,22 @@ +#ifndef RecoPixelVertexing_PixelVertexFinding_PixelVertexWorkSpaceSoAHost_h +#define RecoPixelVertexing_PixelVertexFinding_PixelVertexWorkSpaceSoAHost_h + +#include "CUDADataFormats/Common/interface/PortableHostCollection.h" +#include "CUDADataFormats/Vertex/interface/ZVertexUtilities.h" +#include "RecoPixelVertexing/PixelVertexFinding/plugins/PixelVertexWorkSpaceUtilities.h" + +template +class PixelVertexWorkSpaceSoAHost : public cms::cuda::PortableHostCollection> { +public: + explicit PixelVertexWorkSpaceSoAHost() : PortableHostCollection>(S) {} + // Constructor which specifies the SoA size and CUDA stream + explicit PixelVertexWorkSpaceSoAHost(cudaStream_t stream) + : PortableHostCollection>(S, stream) {} +}; + +namespace gpuVertexFinder { + namespace workSpace { + using PixelVertexWorkSpaceSoAHost = PixelVertexWorkSpaceSoAHost; + } +} // namespace gpuVertexFinder +#endif diff --git a/RecoPixelVertexing/PixelVertexFinding/plugins/PixelVertexWorkSpaceUtilities.h b/RecoPixelVertexing/PixelVertexFinding/plugins/PixelVertexWorkSpaceUtilities.h new file mode 100644 index 0000000000000..f5859319c0b6b --- /dev/null +++ b/RecoPixelVertexing/PixelVertexFinding/plugins/PixelVertexWorkSpaceUtilities.h @@ -0,0 +1,35 @@ +#ifndef RecoPixelVertexing_PixelVertexFinding_PixelVertexWorkSpace_h +#define RecoPixelVertexing_PixelVertexFinding_PixelVertexWorkSpace_h + +#include +#include "DataFormats/SoATemplate/interface/SoALayout.h" + +// Intermediate data used in the vertex reco algos +// For internal use only +GENERATE_SOA_LAYOUT(PixelVertexWSSoALayout, + SOA_COLUMN(uint16_t, itrk), // index of original track + SOA_COLUMN(float, zt), // input track z at bs + SOA_COLUMN(float, ezt2), // input error^2 on the above + SOA_COLUMN(float, ptt2), // input pt^2 on the above + SOA_COLUMN(uint8_t, izt), // interized 
z-position of input tracks + SOA_COLUMN(int32_t, iv), // vertex index for each associated track + SOA_SCALAR(uint32_t, ntrks), // number of "selected tracks" + SOA_SCALAR(uint32_t, nvIntermediate)) // the number of vertices after splitting pruning etc. + +// Methods that operate on View and ConstView of the WorkSpaceSoALayout. +namespace gpuVertexFinder { + namespace workSpace { + using PixelVertexWorkSpaceSoALayout = PixelVertexWSSoALayout<>; + using PixelVertexWorkSpaceSoAView = PixelVertexWSSoALayout<>::View; + using PixelVertexWorkSpaceSoAConstView = PixelVertexWSSoALayout<>::ConstView; + + namespace utilities { + __host__ __device__ inline void init(PixelVertexWorkSpaceSoAView &workspace_view) { + workspace_view.ntrks() = 0; + workspace_view.nvIntermediate() = 0; + } + } // namespace utilities + } // namespace workSpace +} // namespace gpuVertexFinder + +#endif diff --git a/RecoPixelVertexing/PixelVertexFinding/plugins/gpuClusterTracksByDensity.h b/RecoPixelVertexing/PixelVertexFinding/plugins/gpuClusterTracksByDensity.h index f71aa56842a67..915e48e867d95 100644 --- a/RecoPixelVertexing/PixelVertexFinding/plugins/gpuClusterTracksByDensity.h +++ b/RecoPixelVertexing/PixelVertexFinding/plugins/gpuClusterTracksByDensity.h @@ -17,8 +17,8 @@ namespace gpuVertexFinder { // // based on Rodrighez&Laio algo // - __device__ __forceinline__ void clusterTracksByDensity(gpuVertexFinder::ZVertices* pdata, - gpuVertexFinder::WorkSpace* pws, + __device__ __forceinline__ void clusterTracksByDensity(VtxSoAView& pdata, + WsSoAView& pws, int minT, // min number of neighbours to be "seed" float eps, // max absolute distance to cluster float errmax, // max error to be "seed" @@ -32,21 +32,24 @@ namespace gpuVertexFinder { auto er2mx = errmax * errmax; - auto& __restrict__ data = *pdata; - auto& __restrict__ ws = *pws; - auto nt = ws.ntrks; - float const* __restrict__ zt = ws.zt; - float const* __restrict__ ezt2 = ws.ezt2; + auto& __restrict__ data = pdata; + auto& __restrict__ ws = pws; + auto nt = ws.ntrks(); + float const* __restrict__ zt = ws.zt(); + float const* __restrict__ ezt2 = ws.ezt2(); - uint32_t& nvFinal = data.nvFinal; - uint32_t& nvIntermediate = ws.nvIntermediate; + uint32_t& nvFinal = data.nvFinal(); + uint32_t& nvIntermediate = ws.nvIntermediate(); - uint8_t* __restrict__ izt = ws.izt; - int32_t* __restrict__ nn = data.ndof; - int32_t* __restrict__ iv = ws.iv; + uint8_t* __restrict__ izt = ws.izt(); + int32_t* __restrict__ nn = data.ndof(); + int32_t* __restrict__ iv = ws.iv(); - assert(pdata); assert(zt); + assert(ezt2); + assert(izt); + assert(nn); + assert(iv); using Hist = cms::cuda::HistoContainer; __shared__ Hist hist; @@ -63,7 +66,7 @@ namespace gpuVertexFinder { // fill hist (bin shall be wider than "eps") for (auto i = threadIdx.x; i < nt; i += blockDim.x) { - assert(i < ZVertices::MAXTRACKS); + assert(i < zVertex::utilities::MAXTRACKS); int iz = int(zt[i] * 10.); // valid if eps<=0.1 // iz = std::clamp(iz, INT8_MIN, INT8_MAX); // sorry c++17 only iz = std::min(std::max(iz, INT8_MIN), INT8_MAX); @@ -197,7 +200,7 @@ namespace gpuVertexFinder { } __syncthreads(); - assert(foundClusters < ZVertices::MAXVTX); + assert(foundClusters < zVertex::utilities::MAXVTX); // propagate the negative id to all the tracks in the cluster. 
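      // [Editor's sketch — not part of the original patch] The capacity constant checked just
      // above is assumed to come from ZVertexUtilities.h, which presumably replaces the old
      // ZVertices/ZVertexSoA statics with namespace-level constants, along the lines of:
      //
      //   namespace zVertex {
      //     namespace utilities {
      //       constexpr uint32_t MAXTRACKS = ...;  // same capacities as the legacy ZVertexSoA
      //       constexpr uint32_t MAXVTX = ...;
      //     }  // namespace utilities
      //   }  // namespace zVertex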
for (auto i = threadIdx.x; i < nt; i += blockDim.x) { @@ -219,8 +222,8 @@ namespace gpuVertexFinder { printf("found %d proto vertices\n", foundClusters); } - __global__ void clusterTracksByDensityKernel(gpuVertexFinder::ZVertices* pdata, - gpuVertexFinder::WorkSpace* pws, + __global__ void clusterTracksByDensityKernel(VtxSoAView pdata, + WsSoAView pws, int minT, // min number of neighbours to be "seed" float eps, // max absolute distance to cluster float errmax, // max error to be "seed" diff --git a/RecoPixelVertexing/PixelVertexFinding/plugins/gpuClusterTracksDBSCAN.h b/RecoPixelVertexing/PixelVertexFinding/plugins/gpuClusterTracksDBSCAN.h index a11283a7b2065..f92d9a1d0113d 100644 --- a/RecoPixelVertexing/PixelVertexFinding/plugins/gpuClusterTracksDBSCAN.h +++ b/RecoPixelVertexing/PixelVertexFinding/plugins/gpuClusterTracksDBSCAN.h @@ -14,8 +14,8 @@ namespace gpuVertexFinder { // this algo does not really scale as it works in a single block... // enough for <10K tracks we have - __global__ void clusterTracksDBSCAN(ZVertices* pdata, - WorkSpace* pws, + __global__ void clusterTracksDBSCAN(VtxSoAView pdata, + WsSoAView pws, int minT, // min number of neighbours to be "core" float eps, // max absolute distance to cluster float errmax, // max error to be "seed" @@ -28,21 +28,23 @@ namespace gpuVertexFinder { auto er2mx = errmax * errmax; - auto& __restrict__ data = *pdata; - auto& __restrict__ ws = *pws; - auto nt = ws.ntrks; - float const* __restrict__ zt = ws.zt; - float const* __restrict__ ezt2 = ws.ezt2; + auto& __restrict__ data = pdata; + auto& __restrict__ ws = pws; + auto nt = ws.ntrks(); + float const* __restrict__ zt = ws.zt(); + float const* __restrict__ ezt2 = ws.ezt2(); - uint32_t& nvFinal = data.nvFinal; - uint32_t& nvIntermediate = ws.nvIntermediate; + uint32_t& nvFinal = data.nvFinal(); + uint32_t& nvIntermediate = ws.nvIntermediate(); - uint8_t* __restrict__ izt = ws.izt; - int32_t* __restrict__ nn = data.ndof; - int32_t* __restrict__ iv = ws.iv; + uint8_t* __restrict__ izt = ws.izt(); + int32_t* __restrict__ nn = data.ndof(); + int32_t* __restrict__ iv = ws.iv(); - assert(pdata); assert(zt); + assert(iv); + assert(nn); + assert(ezt2); using Hist = cms::cuda::HistoContainer; __shared__ Hist hist; @@ -59,7 +61,7 @@ namespace gpuVertexFinder { // fill hist (bin shall be wider than "eps") for (auto i = threadIdx.x; i < nt; i += blockDim.x) { - assert(i < ZVertices::MAXTRACKS); + assert(i < zVertex::utilities::MAXTRACKS); int iz = int(zt[i] * 10.); // valid if eps<=0.1 iz = std::clamp(iz, INT8_MIN, INT8_MAX); izt[i] = iz - INT8_MIN; @@ -214,7 +216,7 @@ namespace gpuVertexFinder { } __syncthreads(); - assert(foundClusters < ZVertices::MAXVTX); + assert(foundClusters < zVertex::utilities::MAXVTX); // propagate the negative id to all the tracks in the cluster. for (auto i = threadIdx.x; i < nt; i += blockDim.x) { diff --git a/RecoPixelVertexing/PixelVertexFinding/plugins/gpuClusterTracksIterative.h b/RecoPixelVertexing/PixelVertexFinding/plugins/gpuClusterTracksIterative.h index 66d246fcfa4fa..21182690ec7e8 100644 --- a/RecoPixelVertexing/PixelVertexFinding/plugins/gpuClusterTracksIterative.h +++ b/RecoPixelVertexing/PixelVertexFinding/plugins/gpuClusterTracksIterative.h @@ -14,8 +14,8 @@ namespace gpuVertexFinder { // this algo does not really scale as it works in a single block... 
// enough for <10K tracks we have - __global__ void clusterTracksIterative(ZVertices* pdata, - WorkSpace* pws, + __global__ void clusterTracksIterative(VtxSoAView pdata, + WsSoAView pws, int minT, // min number of neighbours to be "core" float eps, // max absolute distance to cluster float errmax, // max error to be "seed" @@ -28,21 +28,23 @@ namespace gpuVertexFinder { auto er2mx = errmax * errmax; - auto& __restrict__ data = *pdata; - auto& __restrict__ ws = *pws; - auto nt = ws.ntrks; - float const* __restrict__ zt = ws.zt; - float const* __restrict__ ezt2 = ws.ezt2; + auto& __restrict__ data = pdata; + auto& __restrict__ ws = pws; + auto nt = ws.ntrks(); + float const* __restrict__ zt = ws.zt(); + float const* __restrict__ ezt2 = ws.ezt2(); - uint32_t& nvFinal = data.nvFinal; - uint32_t& nvIntermediate = ws.nvIntermediate; + uint32_t& nvFinal = data.nvFinal(); + uint32_t& nvIntermediate = ws.nvIntermediate(); - uint8_t* __restrict__ izt = ws.izt; - int32_t* __restrict__ nn = data.ndof; - int32_t* __restrict__ iv = ws.iv; + uint8_t* __restrict__ izt = ws.izt(); + int32_t* __restrict__ nn = data.ndof(); + int32_t* __restrict__ iv = ws.iv(); - assert(pdata); assert(zt); + assert(nn); + assert(iv); + assert(ezt2); using Hist = cms::cuda::HistoContainer; __shared__ Hist hist; @@ -59,7 +61,7 @@ namespace gpuVertexFinder { // fill hist (bin shall be wider than "eps") for (auto i = threadIdx.x; i < nt; i += blockDim.x) { - assert(i < ZVertices::MAXTRACKS); + assert(i < zVertex::utilities::MAXTRACKS); int iz = int(zt[i] * 10.); // valid if eps<=0.1 iz = std::clamp(iz, INT8_MIN, INT8_MAX); izt[i] = iz - INT8_MIN; @@ -185,7 +187,7 @@ namespace gpuVertexFinder { } __syncthreads(); - assert(foundClusters < ZVertices::MAXVTX); + assert(foundClusters < zVertex::utilities::MAXVTX); // propagate the negative id to all the tracks in the cluster. 
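      // [Editor's note — illustrative, not part of the original patch] As elsewhere in this
      // migration, the __global__ kernels now receive VtxSoAView / WsSoAView by value (a view is
      // just a small bundle of column pointers plus the size), while the __device__ helpers such
      // as fitVertices() and splitVertices() take them by reference; this is why the old
      // assert(pdata) null-pointer check is dropped in favour of asserts on the individual
      // column pointers.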
for (auto i = threadIdx.x; i < nt; i += blockDim.x) { diff --git a/RecoPixelVertexing/PixelVertexFinding/plugins/gpuFitVertices.h b/RecoPixelVertexing/PixelVertexFinding/plugins/gpuFitVertices.h index 0acf67244528a..a89064b7f2ac0 100644 --- a/RecoPixelVertexing/PixelVertexFinding/plugins/gpuFitVertices.h +++ b/RecoPixelVertexing/PixelVertexFinding/plugins/gpuFitVertices.h @@ -12,28 +12,25 @@ namespace gpuVertexFinder { - __device__ __forceinline__ void fitVertices(ZVertices* pdata, - WorkSpace* pws, + __device__ __forceinline__ void fitVertices(VtxSoAView& pdata, + WsSoAView& pws, float chi2Max // for outlier rejection ) { constexpr bool verbose = false; // in principle the compiler should optmize out if false - auto& __restrict__ data = *pdata; - auto& __restrict__ ws = *pws; - auto nt = ws.ntrks; - float const* __restrict__ zt = ws.zt; - float const* __restrict__ ezt2 = ws.ezt2; - float* __restrict__ zv = data.zv; - float* __restrict__ wv = data.wv; - float* __restrict__ chi2 = data.chi2; - uint32_t& nvFinal = data.nvFinal; - uint32_t& nvIntermediate = ws.nvIntermediate; + auto& __restrict__ data = pdata; + auto& __restrict__ ws = pws; + auto nt = ws.ntrks(); + float const* __restrict__ zt = ws.zt(); + float const* __restrict__ ezt2 = ws.ezt2(); + float* __restrict__ zv = data.zv(); + float* __restrict__ wv = data.wv(); + float* __restrict__ chi2 = data.chi2(); + uint32_t& nvFinal = data.nvFinal(); + uint32_t& nvIntermediate = ws.nvIntermediate(); - int32_t* __restrict__ nn = data.ndof; - int32_t* __restrict__ iv = ws.iv; - - assert(pdata); - assert(zt); + int32_t* __restrict__ nn = data.ndof(); + int32_t* __restrict__ iv = ws.iv(); assert(nvFinal <= nvIntermediate); nvFinal = nvIntermediate; @@ -101,8 +98,8 @@ namespace gpuVertexFinder { printf("and %d noise\n", noise); } - __global__ void fitVerticesKernel(ZVertices* pdata, - WorkSpace* pws, + __global__ void fitVerticesKernel(VtxSoAView pdata, + WsSoAView pws, float chi2Max // for outlier rejection ) { fitVertices(pdata, pws, chi2Max); diff --git a/RecoPixelVertexing/PixelVertexFinding/plugins/gpuSortByPt2.h b/RecoPixelVertexing/PixelVertexFinding/plugins/gpuSortByPt2.h index 93f78d498b26f..2e2e2353f6b30 100644 --- a/RecoPixelVertexing/PixelVertexFinding/plugins/gpuSortByPt2.h +++ b/RecoPixelVertexing/PixelVertexFinding/plugins/gpuSortByPt2.h @@ -15,29 +15,29 @@ namespace gpuVertexFinder { - __device__ __forceinline__ void sortByPt2(ZVertices* pdata, WorkSpace* pws) { - auto& __restrict__ data = *pdata; - auto& __restrict__ ws = *pws; - auto nt = ws.ntrks; - float const* __restrict__ ptt2 = ws.ptt2; - uint32_t const& nvFinal = data.nvFinal; + __device__ __forceinline__ void sortByPt2(VtxSoAView& pdata, WsSoAView& pws) { + auto& __restrict__ data = pdata; + auto& __restrict__ ws = pws; + auto nt = ws.ntrks(); + float const* __restrict__ ptt2 = ws.ptt2(); + uint32_t const& nvFinal = data.nvFinal(); - int32_t const* __restrict__ iv = ws.iv; - float* __restrict__ ptv2 = data.ptv2; - uint16_t* __restrict__ sortInd = data.sortInd; + int32_t const* __restrict__ iv = ws.iv(); + float* __restrict__ ptv2 = data.ptv2(); + uint16_t* __restrict__ sortInd = data.sortInd(); - // if (threadIdx.x == 0) - // printf("sorting %d vertices\n",nvFinal); + assert(ptv2); + assert(sortInd); if (nvFinal < 1) return; // fill indexing for (auto i = threadIdx.x; i < nt; i += blockDim.x) { - data.idv[ws.itrk[i]] = iv[i]; + data[ws[i].itrk()].idv() = iv[i]; } - // can be done asynchronoisly at the end of previous event + // can be done asynchronously at the end of 
previous event for (auto i = threadIdx.x; i < nvFinal; i += blockDim.x) { ptv2[i] = 0; } @@ -66,7 +66,7 @@ namespace gpuVertexFinder { #endif } - __global__ void sortByPt2Kernel(ZVertices* pdata, WorkSpace* pws) { sortByPt2(pdata, pws); } + __global__ void sortByPt2Kernel(VtxSoAView pdata, WsSoAView pws) { sortByPt2(pdata, pws); } } // namespace gpuVertexFinder diff --git a/RecoPixelVertexing/PixelVertexFinding/plugins/gpuSplitVertices.h b/RecoPixelVertexing/PixelVertexFinding/plugins/gpuSplitVertices.h index 0fe8bd882dcc5..7f18d58d11454 100644 --- a/RecoPixelVertexing/PixelVertexFinding/plugins/gpuSplitVertices.h +++ b/RecoPixelVertexing/PixelVertexFinding/plugins/gpuSplitVertices.h @@ -12,24 +12,26 @@ namespace gpuVertexFinder { - __device__ __forceinline__ void splitVertices(ZVertices* pdata, WorkSpace* pws, float maxChi2) { + __device__ __forceinline__ void splitVertices(VtxSoAView& pdata, WsSoAView& pws, float maxChi2) { constexpr bool verbose = false; // in principle the compiler should optmize out if false - auto& __restrict__ data = *pdata; - auto& __restrict__ ws = *pws; - auto nt = ws.ntrks; - float const* __restrict__ zt = ws.zt; - float const* __restrict__ ezt2 = ws.ezt2; - float* __restrict__ zv = data.zv; - float* __restrict__ wv = data.wv; - float const* __restrict__ chi2 = data.chi2; - uint32_t& nvFinal = data.nvFinal; + auto& __restrict__ data = pdata; + auto& __restrict__ ws = pws; + auto nt = ws.ntrks(); + float const* __restrict__ zt = ws.zt(); + float const* __restrict__ ezt2 = ws.ezt2(); + float* __restrict__ zv = data.zv(); + float* __restrict__ wv = data.wv(); + float const* __restrict__ chi2 = data.chi2(); + uint32_t& nvFinal = data.nvFinal(); - int32_t const* __restrict__ nn = data.ndof; - int32_t* __restrict__ iv = ws.iv; + int32_t const* __restrict__ nn = data.ndof(); + int32_t* __restrict__ iv = ws.iv(); - assert(pdata); assert(zt); + assert(wv); + assert(chi2); + assert(nn); // one vertex per block for (auto kv = blockIdx.x; kv < nvFinal; kv += gridDim.x) { @@ -120,7 +122,7 @@ namespace gpuVertexFinder { // get a new global vertex __shared__ uint32_t igv; if (0 == threadIdx.x) - igv = atomicAdd(&ws.nvIntermediate, 1); + igv = atomicAdd(&ws.nvIntermediate(), 1); __syncthreads(); for (auto k = threadIdx.x; k < nq; k += blockDim.x) { if (1 == newV[k]) @@ -130,7 +132,7 @@ namespace gpuVertexFinder { } // loop on vertices } - __global__ void splitVerticesKernel(ZVertices* pdata, WorkSpace* pws, float maxChi2) { + __global__ void splitVerticesKernel(VtxSoAView pdata, WsSoAView pws, float maxChi2) { splitVertices(pdata, pws, maxChi2); } diff --git a/RecoPixelVertexing/PixelVertexFinding/plugins/gpuVertexFinder.cc b/RecoPixelVertexing/PixelVertexFinding/plugins/gpuVertexFinder.cc index 74bcd26f8a79c..950a31f8ac48a 100644 --- a/RecoPixelVertexing/PixelVertexFinding/plugins/gpuVertexFinder.cc +++ b/RecoPixelVertexing/PixelVertexFinding/plugins/gpuVertexFinder.cc @@ -1,5 +1,12 @@ #include "HeterogeneousCore/CUDAUtilities/interface/cudaCheck.h" +#include "CUDADataFormats/Track/interface/PixelTrackUtilities.h" +#include "CUDADataFormats/Vertex/interface/ZVertexUtilities.h" + +#include "PixelVertexWorkSpaceUtilities.h" +#include "PixelVertexWorkSpaceSoAHost.h" +#include "PixelVertexWorkSpaceSoADevice.h" + #include "gpuClusterTracksByDensity.h" #include "gpuClusterTracksDBSCAN.h" #include "gpuClusterTracksIterative.h" @@ -20,28 +27,23 @@ namespace gpuVertexFinder { template __global__ void loadTracks( - pixelTrack::TrackSoAT const* ptracks, ZVertexSoA* soa, WorkSpace* pws, 
float ptMin, float ptMax) { - assert(ptracks); - assert(soa); - auto const& tracks = *ptracks; - auto const& fit = tracks.stateAtBS; - auto const* quality = tracks.qualityData(); - + TrackSoAConstView tracks_view, VtxSoAView soa, WsSoAView pws, float ptMin, float ptMax) { + auto const* quality = tracks_view.quality(); + using helper = TracksUtilities; auto first = blockIdx.x * blockDim.x + threadIdx.x; - - for (int idx = first, nt = tracks.nTracks(); idx < nt; idx += gridDim.x * blockDim.x) { - auto nHits = tracks.nHits(idx); + for (int idx = first, nt = tracks_view.nTracks(); idx < nt; idx += gridDim.x * blockDim.x) { + auto nHits = helper::nHits(tracks_view, idx); assert(nHits >= 3); // initialize soa... - soa->idv[idx] = -1; + soa[idx].idv() = -1; - if (tracks.isTriplet(idx)) + if (helper::isTriplet(tracks_view, idx)) continue; // no triplets if (quality[idx] < pixelTrack::Quality::highPurity) continue; - auto pt = tracks.pt(idx); + auto pt = tracks_view[idx].pt(); if (pt < ptMin) continue; @@ -49,19 +51,19 @@ namespace gpuVertexFinder { // clamp pt pt = std::min(pt, ptMax); - auto& data = *pws; - auto it = atomicAdd(&data.ntrks, 1); - data.itrk[it] = idx; - data.zt[it] = tracks.zip(idx); - data.ezt2[it] = fit.covariance(idx)(14); - data.ptt2[it] = pt * pt; + auto& data = pws; + auto it = atomicAdd(&data.ntrks(), 1); + data[it].itrk() = idx; + data[it].zt() = helper::zip(tracks_view, idx); + data[it].ezt2() = tracks_view[idx].covariance()(14); + data[it].ptt2() = pt * pt; } } // #define THREE_KERNELS #ifndef THREE_KERNELS - __global__ void vertexFinderOneKernel(gpuVertexFinder::ZVertices* pdata, - gpuVertexFinder::WorkSpace* pws, + __global__ void vertexFinderOneKernel(VtxSoAView pdata, + WsSoAView pws, int minT, // min number of neighbours to be "seed" float eps, // max absolute distance to cluster float errmax, // max error to be "seed" @@ -78,8 +80,8 @@ namespace gpuVertexFinder { sortByPt2(pdata, pws); } #else - __global__ void vertexFinderKernel1(gpuVertexFinder::ZVertices* pdata, - gpuVertexFinder::WorkSpace* pws, + __global__ void vertexFinderKernel1(VtxSoAView pdata, + WsSoAView pws, int minT, // min number of neighbours to be "seed" float eps, // max absolute distance to cluster float errmax, // max error to be "seed" @@ -90,7 +92,7 @@ namespace gpuVertexFinder { fitVertices(pdata, pws, maxChi2ForFirstFit); } - __global__ void vertexFinderKernel2(gpuVertexFinder::ZVertices* pdata, gpuVertexFinder::WorkSpace* pws) { + __global__ void vertexFinderKernel2(VtxSoAView pdata, WsSoAView pws) { fitVertices(pdata, pws, maxChi2ForFinalFit); __syncthreads(); sortByPt2(pdata, pws); @@ -99,44 +101,42 @@ namespace gpuVertexFinder { template #ifdef __CUDACC__ - ZVertexHeterogeneous Producer::makeAsync(cudaStream_t stream, - pixelTrack::TrackSoAT const* tksoa, - float ptMin, - float ptMax) const { + ZVertexSoADevice Producer::makeAsync(cudaStream_t stream, + const TrackSoAConstView& tracks_view, + float ptMin, + float ptMax) const { #ifdef PIXVERTEX_DEBUG_PRODUCE std::cout << "producing Vertices on GPU" << std::endl; #endif // PIXVERTEX_DEBUG_PRODUCE - ZVertexHeterogeneous vertices(cms::cuda::make_device_unique(stream)); + ZVertexSoADevice vertices(stream); #else - - ZVertexHeterogeneous Producer::make(pixelTrack::TrackSoAT const* tksoa, - float ptMin, - float ptMax) const { - + ZVertexSoAHost Producer::make(const TrackSoAConstView& tracks_view, + float ptMin, + float ptMax) const { #ifdef PIXVERTEX_DEBUG_PRODUCE std::cout << "producing Vertices on CPU" << std::endl; #endif // 
PIXVERTEX_DEBUG_PRODUCE - ZVertexHeterogeneous vertices(std::make_unique()); + ZVertexSoAHost vertices; #endif - assert(tksoa); - auto* soa = vertices.get(); - assert(soa); + auto soa = vertices.view(); + + assert(vertices.buffer()); #ifdef __CUDACC__ - auto ws_d = cms::cuda::make_device_unique(stream); + auto ws_d = gpuVertexFinder::workSpace::PixelVertexWorkSpaceSoADevice(stream); #else - auto ws_d = std::make_unique(); + auto ws_d = gpuVertexFinder::workSpace::PixelVertexWorkSpaceSoAHost(); #endif #ifdef __CUDACC__ - init<<<1, 1, 0, stream>>>(soa, ws_d.get()); + init<<<1, 1, 0, stream>>>(soa, ws_d.view()); auto blockSize = 128; - auto numberOfBlocks = (pixelTrack::TrackSoAT::stride() + blockSize - 1) / blockSize; - loadTracks<<>>(tksoa, soa, ws_d.get(), ptMin, ptMax); + auto numberOfBlocks = (tracks_view.metadata().size() + blockSize - 1) / blockSize; + loadTracks<<>>(tracks_view, soa, ws_d.view(), ptMin, ptMax); cudaCheck(cudaGetLastError()); #else - init(soa, ws_d.get()); - loadTracks(tksoa, soa, ws_d.get(), ptMin, ptMax); + init(soa, ws_d.view()); + loadTracks(tracks_view, soa, ws_d.view(), ptMin, ptMax); #endif #ifdef __CUDACC__ @@ -148,50 +148,51 @@ namespace gpuVertexFinder { if (oneKernel_) { // implemented only for density clustesrs #ifndef THREE_KERNELS - vertexFinderOneKernel<<<1, maxThreadsForPrint, 0, stream>>>(soa, ws_d.get(), minT, eps, errmax, chi2max); + vertexFinderOneKernel<<<1, maxThreadsForPrint, 0, stream>>>(soa, ws_d.view(), minT, eps, errmax, chi2max); #else - vertexFinderKernel1<<<1, maxThreadsForPrint, 0, stream>>>(soa, ws_d.get(), minT, eps, errmax, chi2max); + vertexFinderKernel1<<<1, maxThreadsForPrint, 0, stream>>>(soa, ws_d.view(), minT, eps, errmax, chi2max); cudaCheck(cudaGetLastError()); // one block per vertex... - splitVerticesKernel<<>>(soa, ws_d.get(), maxChi2ForSplit); + splitVerticesKernel<<>>(soa, ws_d.view(), maxChi2ForSplit); cudaCheck(cudaGetLastError()); - vertexFinderKernel2<<<1, maxThreadsForPrint, 0, stream>>>(soa, ws_d.get()); + vertexFinderKernel2<<<1, maxThreadsForPrint, 0, stream>>>(soa, ws_d.view()); #endif } else { // five kernels if (useDensity_) { - clusterTracksByDensityKernel<<<1, maxThreadsForPrint, 0, stream>>>(soa, ws_d.get(), minT, eps, errmax, chi2max); + clusterTracksByDensityKernel<<<1, maxThreadsForPrint, 0, stream>>>( + soa, ws_d.view(), minT, eps, errmax, chi2max); } else if (useDBSCAN_) { - clusterTracksDBSCAN<<<1, maxThreadsForPrint, 0, stream>>>(soa, ws_d.get(), minT, eps, errmax, chi2max); + clusterTracksDBSCAN<<<1, maxThreadsForPrint, 0, stream>>>(soa, ws_d.view(), minT, eps, errmax, chi2max); } else if (useIterative_) { - clusterTracksIterative<<<1, maxThreadsForPrint, 0, stream>>>(soa, ws_d.get(), minT, eps, errmax, chi2max); + clusterTracksIterative<<<1, maxThreadsForPrint, 0, stream>>>(soa, ws_d.view(), minT, eps, errmax, chi2max); } cudaCheck(cudaGetLastError()); - fitVerticesKernel<<<1, maxThreadsForPrint, 0, stream>>>(soa, ws_d.get(), maxChi2ForFirstFit); + fitVerticesKernel<<<1, maxThreadsForPrint, 0, stream>>>(soa, ws_d.view(), maxChi2ForFirstFit); cudaCheck(cudaGetLastError()); // one block per vertex... 
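      // [Editor's note — illustrative, not part of the original patch] "one block per vertex"
      // refers to the grid-stride loop inside gpuSplitVertices.h,
      //
      //   for (auto kv = blockIdx.x; kv < nvFinal; kv += gridDim.x) { ... }
      //
      // so each block owns one proto-vertex at a time while its threads cooperate on the tracks
      // attached to it; the launch below only needs to provide enough blocks, it does not have
      // to match the number of vertices exactly.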
- splitVerticesKernel<<>>(soa, ws_d.get(), maxChi2ForSplit); + splitVerticesKernel<<>>(soa, ws_d.view(), maxChi2ForSplit); cudaCheck(cudaGetLastError()); - fitVerticesKernel<<<1, maxThreadsForPrint, 0, stream>>>(soa, ws_d.get(), maxChi2ForFinalFit); + fitVerticesKernel<<<1, maxThreadsForPrint, 0, stream>>>(soa, ws_d.view(), maxChi2ForFinalFit); cudaCheck(cudaGetLastError()); - sortByPt2Kernel<<<1, maxThreadsForPrint, 0, stream>>>(soa, ws_d.get()); + sortByPt2Kernel<<<1, maxThreadsForPrint, 0, stream>>>(soa, ws_d.view()); } cudaCheck(cudaGetLastError()); #else // __CUDACC__ if (useDensity_) { - clusterTracksByDensity(soa, ws_d.get(), minT, eps, errmax, chi2max); + clusterTracksByDensity(soa, ws_d.view(), minT, eps, errmax, chi2max); } else if (useDBSCAN_) { - clusterTracksDBSCAN(soa, ws_d.get(), minT, eps, errmax, chi2max); + clusterTracksDBSCAN(soa, ws_d.view(), minT, eps, errmax, chi2max); } else if (useIterative_) { - clusterTracksIterative(soa, ws_d.get(), minT, eps, errmax, chi2max); + clusterTracksIterative(soa, ws_d.view(), minT, eps, errmax, chi2max); } #ifdef PIXVERTEX_DEBUG_PRODUCE - std::cout << "found " << (*ws_d).nvIntermediate << " vertices " << std::endl; + std::cout << "found " << ws_d.view().nvIntermediate() << " vertices " << std::endl; #endif // PIXVERTEX_DEBUG_PRODUCE - fitVertices(soa, ws_d.get(), maxChi2ForFirstFit); + fitVertices(soa, ws_d.view(), maxChi2ForFirstFit); // one block per vertex! - splitVertices(soa, ws_d.get(), maxChi2ForSplit); - fitVertices(soa, ws_d.get(), maxChi2ForFinalFit); - sortByPt2(soa, ws_d.get()); + splitVertices(soa, ws_d.view(), maxChi2ForSplit); + fitVertices(soa, ws_d.view(), maxChi2ForFinalFit); + sortByPt2(soa, ws_d.view()); #endif return vertices; @@ -199,5 +200,4 @@ namespace gpuVertexFinder { template class Producer; template class Producer; - } // namespace gpuVertexFinder diff --git a/RecoPixelVertexing/PixelVertexFinding/plugins/gpuVertexFinder.h b/RecoPixelVertexing/PixelVertexFinding/plugins/gpuVertexFinder.h index 6128939f6eb87..d5157fec14053 100644 --- a/RecoPixelVertexing/PixelVertexFinding/plugins/gpuVertexFinder.h +++ b/RecoPixelVertexing/PixelVertexFinding/plugins/gpuVertexFinder.h @@ -4,45 +4,29 @@ #include #include -#include "CUDADataFormats/Vertex/interface/ZVertexHeterogeneous.h" -#include "CUDADataFormats/Track/interface/PixelTrackHeterogeneous.h" +#include "CUDADataFormats/Track/interface/PixelTrackUtilities.h" +#include "CUDADataFormats/Vertex/interface/ZVertexSoAHeterogeneousHost.h" +#include "CUDADataFormats/Vertex/interface/ZVertexSoAHeterogeneousDevice.h" +#include "CUDADataFormats/Vertex/interface/ZVertexUtilities.h" +#include "PixelVertexWorkSpaceUtilities.h" +#include "PixelVertexWorkSpaceSoAHost.h" +#include "PixelVertexWorkSpaceSoADevice.h" namespace gpuVertexFinder { - using ZVertices = ZVertexSoA; - // workspace used in the vertex reco algos - struct WorkSpace { - static constexpr uint32_t MAXTRACKS = ZVertexSoA::MAXTRACKS; - static constexpr uint32_t MAXVTX = ZVertexSoA::MAXVTX; + using VtxSoAView = zVertex::ZVertexSoAView; + using WsSoAView = gpuVertexFinder::workSpace::PixelVertexWorkSpaceSoAView; - uint32_t ntrks; // number of "selected tracks" - uint32_t itrk[MAXTRACKS]; // index of original track - float zt[MAXTRACKS]; // input track z at bs - float ezt2[MAXTRACKS]; // input error^2 on the above - float ptt2[MAXTRACKS]; // input pt^2 on the above - uint8_t izt[MAXTRACKS]; // interized z-position of input tracks - int32_t iv[MAXTRACKS]; // vertex index for each associated track - - uint32_t 
nvIntermediate; // the number of vertices after splitting pruning etc. - - __host__ __device__ void init() { - ntrks = 0; - nvIntermediate = 0; - } - }; - - __global__ void init(ZVertexSoA* pdata, WorkSpace* pws) { - pdata->init(); - pws->init(); + __global__ void init(VtxSoAView pdata, WsSoAView pws) { + zVertex::utilities::init(pdata); + gpuVertexFinder::workSpace::utilities::init(pws); } template class Producer { - public: - using ZVertices = ZVertexSoA; - using WorkSpace = gpuVertexFinder::WorkSpace; - using TkSoA = pixelTrack::TrackSoAT; + using TkSoAConstView = TrackSoAConstView; + public: Producer(bool oneKernel, bool useDensity, bool useDBSCAN, @@ -63,8 +47,8 @@ namespace gpuVertexFinder { ~Producer() = default; - ZVertexHeterogeneous makeAsync(cudaStream_t stream, TkSoA const* tksoa, float ptMin, float ptMax) const; - ZVertexHeterogeneous make(TkSoA const* tksoa, float ptMin, float ptMax) const; + ZVertexSoADevice makeAsync(cudaStream_t stream, const TkSoAConstView &tracks_view, float ptMin, float ptMax) const; + ZVertexSoAHost make(const TkSoAConstView &tracks_view, float ptMin, float ptMax) const; private: const bool oneKernel_; diff --git a/RecoPixelVertexing/PixelVertexFinding/test/VertexFinder_t.h b/RecoPixelVertexing/PixelVertexFinding/test/VertexFinder_t.h index 5f8a0646c726a..ff3048c03f6a4 100644 --- a/RecoPixelVertexing/PixelVertexFinding/test/VertexFinder_t.h +++ b/RecoPixelVertexing/PixelVertexFinding/test/VertexFinder_t.h @@ -7,6 +7,17 @@ #include "HeterogeneousCore/CUDAUtilities/interface/cudaCheck.h" #include "HeterogeneousCore/CUDAUtilities/interface/requireDevices.h" #include "HeterogeneousCore/CUDAUtilities/interface/launch.h" +#include "HeterogeneousCore/CUDAUtilities/interface/allocate_device.h" +#include "HeterogeneousCore/CUDAUtilities/interface/currentDevice.h" +// PixelTrackUtilities only included in order to compile SoALayout with Eigen columns +#include "CUDADataFormats/Track/interface/PixelTrackUtilities.h" +#include "CUDADataFormats/Vertex/interface/ZVertexUtilities.h" +#include "CUDADataFormats/Vertex/interface/ZVertexSoAHeterogeneousHost.h" +#include "CUDADataFormats/Vertex/interface/ZVertexSoAHeterogeneousDevice.h" + +#include "RecoPixelVertexing/PixelVertexFinding/plugins/PixelVertexWorkSpaceUtilities.h" +#include "RecoPixelVertexing/PixelVertexFinding/plugins/PixelVertexWorkSpaceSoAHost.h" +#include "RecoPixelVertexing/PixelVertexFinding/plugins/PixelVertexWorkSpaceSoADevice.h" #ifdef USE_DBSCAN #include "RecoPixelVertexing/PixelVertexFinding/plugins/gpuClusterTracksDBSCAN.h" #define CLUSTERIZE gpuVertexFinder::clusterTracksDBSCAN @@ -23,22 +34,22 @@ #ifdef ONE_KERNEL #ifdef __CUDACC__ -__global__ void vertexFinderOneKernel(gpuVertexFinder::ZVertices* pdata, - gpuVertexFinder::WorkSpace* pws, +__global__ void vertexFinderOneKernel(gpuVertexFinder::VtxSoAView pdata, + gpuVertexFinder::WsSoAView pws, int minT, // min number of neighbours to be "seed" float eps, // max absolute distance to cluster float errmax, // max error to be "seed" float chi2max // max normalized distance to cluster, ) { - clusterTracksByDensity(pdata, pws, minT, eps, errmax, chi2max); + gpuVertexFinder::clusterTracksByDensity(pdata, pws, minT, eps, errmax, chi2max); __syncthreads(); - fitVertices(pdata, pws, 50.); + gpuVertexFinder::fitVertices(pdata, pws, 50.); __syncthreads(); - splitVertices(pdata, pws, 9.f); + gpuVertexFinder::splitVertices(pdata, pws, 9.f); __syncthreads(); - fitVertices(pdata, pws, 5000.); + gpuVertexFinder::fitVertices(pdata, pws, 5000.); __syncthreads(); 
-    sortByPt2(pdata, pws);
+    gpuVertexFinder::sortByPt2(pdata, pws);
   }
 #endif
 #endif
@@ -101,25 +112,23 @@ struct ClusterGenerator {
   std::exponential_distribution ptGen;
 };
-// a macro SORRY
-#define LOC_ONGPU(M) ((char*)(onGPU_d.get()) + offsetof(gpuVertexFinder::ZVertices, M))
-#define LOC_WS(M) ((char*)(ws_d.get()) + offsetof(gpuVertexFinder::WorkSpace, M))
-
-__global__ void print(gpuVertexFinder::ZVertices const* pdata, gpuVertexFinder::WorkSpace const* pws) {
-  auto const& __restrict__ data = *pdata;
-  auto const& __restrict__ ws = *pws;
-  printf("nt,nv %d %d,%d\n", ws.ntrks, data.nvFinal, ws.nvIntermediate);
+__global__ void print(gpuVertexFinder::VtxSoAView pdata, gpuVertexFinder::WsSoAView pws) {
+  auto& __restrict__ ws = pws;
+  printf("nt,nv %d %d,%d\n", ws.ntrks(), pdata.nvFinal(), ws.nvIntermediate());
 }
 int main() {
 #ifdef __CUDACC__
+  cudaStream_t stream;
   cms::cudatest::requireDevices();
+  cudaCheck(cudaStreamCreateWithFlags(&stream, cudaStreamNonBlocking));
-  auto onGPU_d = cms::cuda::make_device_unique(1, nullptr);
-  auto ws_d = cms::cuda::make_device_unique(1, nullptr);
+  ZVertexSoADevice onGPU_d(stream);
+  gpuVertexFinder::workSpace::PixelVertexWorkSpaceSoADevice ws_d(stream);
 #else
-  auto onGPU_d = std::make_unique();
-  auto ws_d = std::make_unique();
+
+  ZVertexSoAHost onGPU_d;
+  gpuVertexFinder::workSpace::PixelVertexWorkSpaceSoAHost ws_d;
 #endif
   Event ev;
@@ -135,24 +144,26 @@ int main() {
     gen(ev);
 #ifdef __CUDACC__
-    init<<<1, 1, 0, 0>>>(onGPU_d.get(), ws_d.get());
+    gpuVertexFinder::init<<<1, 1, 0, stream>>>(onGPU_d.view(), ws_d.view());
 #else
-    onGPU_d->init();
-    ws_d->init();
+    gpuVertexFinder::init(onGPU_d.view(), ws_d.view());
 #endif
     std::cout << "v,t size " << ev.zvert.size() << ' ' << ev.ztrack.size() << std::endl;
     auto nt = ev.ztrack.size();
 #ifdef __CUDACC__
-    cudaCheck(cudaMemcpy(LOC_WS(ntrks), &nt, sizeof(uint32_t), cudaMemcpyHostToDevice));
-    cudaCheck(cudaMemcpy(LOC_WS(zt), ev.ztrack.data(), sizeof(float) * ev.ztrack.size(), cudaMemcpyHostToDevice));
-    cudaCheck(cudaMemcpy(LOC_WS(ezt2), ev.eztrack.data(), sizeof(float) * ev.eztrack.size(), cudaMemcpyHostToDevice));
-    cudaCheck(cudaMemcpy(LOC_WS(ptt2), ev.pttrack.data(), sizeof(float) * ev.eztrack.size(), cudaMemcpyHostToDevice));
+    cudaCheck(cudaMemcpy(&ws_d.view().ntrks(), &nt, sizeof(uint32_t), cudaMemcpyHostToDevice));
+    cudaCheck(
+        cudaMemcpy(ws_d.view().zt(), ev.ztrack.data(), sizeof(float) * ev.ztrack.size(), cudaMemcpyHostToDevice));
+    cudaCheck(
+        cudaMemcpy(ws_d.view().ezt2(), ev.eztrack.data(), sizeof(float) * ev.eztrack.size(), cudaMemcpyHostToDevice));
+    cudaCheck(
+        cudaMemcpy(ws_d.view().ptt2(), ev.pttrack.data(), sizeof(float) * ev.eztrack.size(), cudaMemcpyHostToDevice));
 #else
-    ::memcpy(LOC_WS(ntrks), &nt, sizeof(uint32_t));
-    ::memcpy(LOC_WS(zt), ev.ztrack.data(), sizeof(float) * ev.ztrack.size());
-    ::memcpy(LOC_WS(ezt2), ev.eztrack.data(), sizeof(float) * ev.eztrack.size());
-    ::memcpy(LOC_WS(ptt2), ev.pttrack.data(), sizeof(float) * ev.eztrack.size());
+    ::memcpy(&ws_d.view().ntrks(), &nt, sizeof(uint32_t));
+    ::memcpy(ws_d.view().zt(), ev.ztrack.data(), sizeof(float) * ev.ztrack.size());
+    ::memcpy(ws_d.view().ezt2(), ev.eztrack.data(), sizeof(float) * ev.eztrack.size());
+    ::memcpy(ws_d.view().ptt2(), ev.pttrack.data(), sizeof(float) * ev.eztrack.size());
 #endif
     std::cout << "M eps, pset " << kk << ' ' << eps << ' ' << (i % 4) << std::endl;
@@ -168,30 +179,30 @@ int main() {
       uint32_t nv = 0;
 #ifdef __CUDACC__
-      print<<<1, 1, 0, 0>>>(onGPU_d.get(), ws_d.get());
+      print<<<1, 1, 0, stream>>>(onGPU_d.view(), ws_d.view());
       cudaCheck(cudaGetLastError());
       cudaDeviceSynchronize();
 #ifdef ONE_KERNEL
-      cms::cuda::launch(vertexFinderOneKernel, {1, 512 + 256}, onGPU_d.get(), ws_d.get(), kk, par[0], par[1], par[2]);
+      cms::cuda::launch(vertexFinderOneKernel, {1, 512 + 256}, onGPU_d.view(), ws_d.view(), kk, par[0], par[1], par[2]);
 #else
-      cms::cuda::launch(CLUSTERIZE, {1, 512 + 256}, onGPU_d.get(), ws_d.get(), kk, par[0], par[1], par[2]);
+      cms::cuda::launch(CLUSTERIZE, {1, 512 + 256}, onGPU_d.view(), ws_d.view(), kk, par[0], par[1], par[2]);
 #endif
-      print<<<1, 1, 0, 0>>>(onGPU_d.get(), ws_d.get());
+      print<<<1, 1, 0, stream>>>(onGPU_d.view(), ws_d.view());
       cudaCheck(cudaGetLastError());
       cudaDeviceSynchronize();
-      cms::cuda::launch(gpuVertexFinder::fitVerticesKernel, {1, 1024 - 256}, onGPU_d.get(), ws_d.get(), 50.f);
+      cms::cuda::launch(gpuVertexFinder::fitVerticesKernel, {1, 1024 - 256}, onGPU_d.view(), ws_d.view(), 50.f);
       cudaCheck(cudaGetLastError());
-      cudaCheck(cudaMemcpy(&nv, LOC_ONGPU(nvFinal), sizeof(uint32_t), cudaMemcpyDeviceToHost));
+      cudaCheck(cudaMemcpy(&nv, &onGPU_d.view().nvFinal(), sizeof(uint32_t), cudaMemcpyDeviceToHost));
 #else
-      print(onGPU_d.get(), ws_d.get());
-      CLUSTERIZE(onGPU_d.get(), ws_d.get(), kk, par[0], par[1], par[2]);
-      print(onGPU_d.get(), ws_d.get());
-      fitVertices(onGPU_d.get(), ws_d.get(), 50.f);
-      nv = onGPU_d->nvFinal;
+      print(onGPU_d.view(), ws_d.view());
+      CLUSTERIZE(onGPU_d.view(), ws_d.view(), kk, par[0], par[1], par[2]);
+      print(onGPU_d.view(), ws_d.view());
+      gpuVertexFinder::fitVertices(onGPU_d.view(), ws_d.view(), 50.f);
+      nv = onGPU_d.view().nvFinal();
 #endif
       if (nv == 0) {
@@ -221,18 +232,18 @@ int main() {
       nn = hnn;
       ind = hind;
 #else
-      zv = onGPU_d->zv;
-      wv = onGPU_d->wv;
-      ptv2 = onGPU_d->ptv2;
-      nn = onGPU_d->ndof;
-      ind = onGPU_d->sortInd;
+      zv = onGPU_d.view().zv();
+      wv = onGPU_d.view().wv();
+      ptv2 = onGPU_d.view().ptv2();
+      nn = onGPU_d.view().ndof();
+      ind = onGPU_d.view().sortInd();
 #endif
 #ifdef __CUDACC__
-      cudaCheck(cudaMemcpy(nn, LOC_ONGPU(ndof), nv * sizeof(int32_t), cudaMemcpyDeviceToHost));
-      cudaCheck(cudaMemcpy(chi2, LOC_ONGPU(chi2), nv * sizeof(float), cudaMemcpyDeviceToHost));
+      cudaCheck(cudaMemcpy(nn, onGPU_d.view().ndof(), nv * sizeof(int32_t), cudaMemcpyDeviceToHost));
+      cudaCheck(cudaMemcpy(chi2, onGPU_d.view().chi2(), nv * sizeof(float), cudaMemcpyDeviceToHost));
 #else
-      memcpy(chi2, LOC_ONGPU(chi2), nv * sizeof(float));
+      memcpy(chi2, onGPU_d.view().chi2(), nv * sizeof(float));
 #endif
       for (auto j = 0U; j < nv; ++j)
@@ -244,14 +255,14 @@ int main() {
       }
 #ifdef __CUDACC__
-      cms::cuda::launch(gpuVertexFinder::fitVerticesKernel, {1, 1024 - 256}, onGPU_d.get(), ws_d.get(), 50.f);
-      cudaCheck(cudaMemcpy(&nv, LOC_ONGPU(nvFinal), sizeof(uint32_t), cudaMemcpyDeviceToHost));
-      cudaCheck(cudaMemcpy(nn, LOC_ONGPU(ndof), nv * sizeof(int32_t), cudaMemcpyDeviceToHost));
-      cudaCheck(cudaMemcpy(chi2, LOC_ONGPU(chi2), nv * sizeof(float), cudaMemcpyDeviceToHost));
+      cms::cuda::launch(gpuVertexFinder::fitVerticesKernel, {1, 1024 - 256}, onGPU_d.view(), ws_d.view(), 50.f);
+      cudaCheck(cudaMemcpy(&nv, &onGPU_d.view().nvFinal(), sizeof(uint32_t), cudaMemcpyDeviceToHost));
+      cudaCheck(cudaMemcpy(nn, onGPU_d.view().ndof(), nv * sizeof(int32_t), cudaMemcpyDeviceToHost));
+      cudaCheck(cudaMemcpy(chi2, onGPU_d.view().chi2(), nv * sizeof(float), cudaMemcpyDeviceToHost));
 #else
-      fitVertices(onGPU_d.get(), ws_d.get(), 50.f);
-      nv = onGPU_d->nvFinal;
-      memcpy(chi2, LOC_ONGPU(chi2), nv * sizeof(float));
+      gpuVertexFinder::fitVertices(onGPU_d.view(), ws_d.view(), 50.f);
+      nv = onGPU_d.view().nvFinal();
+      memcpy(chi2, onGPU_d.view().chi2(), nv * sizeof(float));
 #endif
       for (auto j = 0U; j < nv; ++j)
@@ -264,26 +275,26 @@ int main() {
 #ifdef __CUDACC__
       // one vertex per block!!!
-      cms::cuda::launch(gpuVertexFinder::splitVerticesKernel, {1024, 64}, onGPU_d.get(), ws_d.get(), 9.f);
-      cudaCheck(cudaMemcpy(&nv, LOC_WS(nvIntermediate), sizeof(uint32_t), cudaMemcpyDeviceToHost));
+      cms::cuda::launch(gpuVertexFinder::splitVerticesKernel, {1024, 64}, onGPU_d.view(), ws_d.view(), 9.f);
+      cudaCheck(cudaMemcpy(&nv, &ws_d.view().nvIntermediate(), sizeof(uint32_t), cudaMemcpyDeviceToHost));
 #else
-      splitVertices(onGPU_d.get(), ws_d.get(), 9.f);
-      nv = ws_d->nvIntermediate;
+      gpuVertexFinder::splitVertices(onGPU_d.view(), ws_d.view(), 9.f);
+      nv = ws_d.view().nvIntermediate();
 #endif
       std::cout << "after split " << nv << std::endl;
 #ifdef __CUDACC__
-      cms::cuda::launch(gpuVertexFinder::fitVerticesKernel, {1, 1024 - 256}, onGPU_d.get(), ws_d.get(), 5000.f);
+      cms::cuda::launch(gpuVertexFinder::fitVerticesKernel, {1, 1024 - 256}, onGPU_d.view(), ws_d.view(), 5000.f);
       cudaCheck(cudaGetLastError());
-      cms::cuda::launch(gpuVertexFinder::sortByPt2Kernel, {1, 256}, onGPU_d.get(), ws_d.get());
+      cms::cuda::launch(gpuVertexFinder::sortByPt2Kernel, {1, 256}, onGPU_d.view(), ws_d.view());
       cudaCheck(cudaGetLastError());
-      cudaCheck(cudaMemcpy(&nv, LOC_ONGPU(nvFinal), sizeof(uint32_t), cudaMemcpyDeviceToHost));
+      cudaCheck(cudaMemcpy(&nv, &onGPU_d.view().nvFinal(), sizeof(uint32_t), cudaMemcpyDeviceToHost));
 #else
-      fitVertices(onGPU_d.get(), ws_d.get(), 5000.f);
-      sortByPt2(onGPU_d.get(), ws_d.get());
-      nv = onGPU_d->nvFinal;
-      memcpy(chi2, LOC_ONGPU(chi2), nv * sizeof(float));
+      gpuVertexFinder::fitVertices(onGPU_d.view(), ws_d.view(), 5000.f);
+      gpuVertexFinder::sortByPt2(onGPU_d.view(), ws_d.view());
+      nv = onGPU_d.view().nvFinal();
+      memcpy(chi2, onGPU_d.view().chi2(), nv * sizeof(float));
 #endif
       if (nv == 0) {
@@ -292,12 +303,12 @@ int main() {
       }
 #ifdef __CUDACC__
-      cudaCheck(cudaMemcpy(zv, LOC_ONGPU(zv), nv * sizeof(float), cudaMemcpyDeviceToHost));
-      cudaCheck(cudaMemcpy(wv, LOC_ONGPU(wv), nv * sizeof(float), cudaMemcpyDeviceToHost));
-      cudaCheck(cudaMemcpy(chi2, LOC_ONGPU(chi2), nv * sizeof(float), cudaMemcpyDeviceToHost));
-      cudaCheck(cudaMemcpy(ptv2, LOC_ONGPU(ptv2), nv * sizeof(float), cudaMemcpyDeviceToHost));
-      cudaCheck(cudaMemcpy(nn, LOC_ONGPU(ndof), nv * sizeof(int32_t), cudaMemcpyDeviceToHost));
-      cudaCheck(cudaMemcpy(ind, LOC_ONGPU(sortInd), nv * sizeof(uint16_t), cudaMemcpyDeviceToHost));
+      cudaCheck(cudaMemcpy(zv, onGPU_d.view().zv(), nv * sizeof(float), cudaMemcpyDeviceToHost));
+      cudaCheck(cudaMemcpy(wv, onGPU_d.view().wv(), nv * sizeof(float), cudaMemcpyDeviceToHost));
+      cudaCheck(cudaMemcpy(chi2, onGPU_d.view().chi2(), nv * sizeof(float), cudaMemcpyDeviceToHost));
+      cudaCheck(cudaMemcpy(ptv2, onGPU_d.view().ptv2(), nv * sizeof(float), cudaMemcpyDeviceToHost));
+      cudaCheck(cudaMemcpy(nn, onGPU_d.view().ndof(), nv * sizeof(int32_t), cudaMemcpyDeviceToHost));
+      cudaCheck(cudaMemcpy(ind, onGPU_d.view().sortInd(), nv * sizeof(uint16_t), cudaMemcpyDeviceToHost));
 #endif
       for (auto j = 0U; j < nv; ++j)
         if (nn[j] > 0)
diff --git a/RecoTauTag/HLTProducers/src/L2TauTagNNProducer.cc b/RecoTauTag/HLTProducers/src/L2TauTagNNProducer.cc
index c11b53538c5b0..4637bac6fa580 100644
--- a/RecoTauTag/HLTProducers/src/L2TauTagNNProducer.cc
+++ b/RecoTauTag/HLTProducers/src/L2TauTagNNProducer.cc
@@ -45,12 +45,13 @@
#include "DataFormats/TrajectoryState/interface/LocalTrajectoryParameters.h" #include "DataFormats/GeometrySurface/interface/Plane.h" #include "DataFormats/BeamSpot/interface/BeamSpot.h" -#include "CUDADataFormats/Track/interface/PixelTrackHeterogeneous.h" #include "MagneticField/Records/interface/IdealMagneticFieldRecord.h" #include "CUDADataFormats/SiPixelCluster/interface/gpuClusteringConstants.h" -#include "CUDADataFormats/Track/interface/TrackSoAHeterogeneousT.h" -#include "CUDADataFormats/Vertex/interface/ZVertexSoA.h" -#include "CUDADataFormats/Vertex/interface/ZVertexHeterogeneous.h" + +#include "CUDADataFormats/Track/interface/PixelTrackUtilities.h" +#include "CUDADataFormats/Track/interface/TrackSoAHeterogeneousHost.h" +#include "CUDADataFormats/Vertex/interface/ZVertexUtilities.h" +#include "CUDADataFormats/Vertex/interface/ZVertexSoAHeterogeneousHost.h" namespace L2TauTagNNv1 { constexpr int nCellEta = 5; @@ -145,10 +146,9 @@ struct L2TauNNProducerCacheData { }; class L2TauNNProducer : public edm::stream::EDProducer> { - using TrackSoA = pixelTrack::TrackSoAT; - using PixelTrackHeterogeneous = PixelTrackHeterogeneousT; - public: + using TrackSoAHost = pixelTrack::TrackSoAHostPhase1; + struct caloRecHitCollections { const HBHERecHitCollection* hbhe; const HORecHitCollection* ho; @@ -182,16 +182,17 @@ class L2TauNNProducer : public edm::stream::EDProducer& allTaus, - const TrackSoA& patatracks_tsoa, - const ZVertexSoA& patavtx_soa, + const TrackSoAHost& patatracks_tsoa, + const ZVertexSoAHost& patavtx_soa, const reco::BeamSpot& beamspot, const MagneticField* magfi); - void selectGoodTracksAndVertices(const ZVertexSoA& patavtx_soa, - const TrackSoA& patatracks_tsoa, + void selectGoodTracksAndVertices(const ZVertexSoAHost& patavtx_soa, + const TrackSoAHost& patatracks_tsoa, std::vector& trkGood, std::vector& vtxGood); + std::pair impactParameter(int it, - const TrackSoA& patatracks_tsoa, + const TrackSoAHost& patatracks_tsoa, float patatrackPhi, const reco::BeamSpot& beamspot, const MagneticField* magfi); @@ -210,8 +211,8 @@ class L2TauNNProducer : public edm::stream::EDProducer eeToken_; const edm::ESGetToken geometryToken_; const edm::ESGetToken bFieldToken_; - const edm::EDGetTokenT pataVerticesToken_; - const edm::EDGetTokenT pataTracksToken_; + const edm::EDGetTokenT pataVerticesToken_; + const edm::EDGetTokenT pataTracksToken_; const edm::EDGetTokenT beamSpotToken_; const unsigned int maxVtx_; const float fractionSumPt2_; @@ -295,7 +296,7 @@ L2TauNNProducer::L2TauNNProducer(const edm::ParameterSet& cfg, const L2TauNNProd eeToken_(consumes(cfg.getParameter("eeInput"))), geometryToken_(esConsumes()), bFieldToken_(esConsumes()), - pataVerticesToken_(consumes(cfg.getParameter("pataVertices"))), + pataVerticesToken_(consumes(cfg.getParameter("pataVertices"))), pataTracksToken_(consumes(cfg.getParameter("pataTracks"))), beamSpotToken_(consumes(cfg.getParameter("BeamSpot"))), maxVtx_(cfg.getParameter("maxVtx")), @@ -572,32 +573,33 @@ void L2TauNNProducer::fillCaloRecHits(tensorflow::Tensor& cellGridMatrix, } } -void L2TauNNProducer::selectGoodTracksAndVertices(const ZVertexSoA& patavtx_soa, - const TrackSoA& patatracks_tsoa, +void L2TauNNProducer::selectGoodTracksAndVertices(const ZVertexSoAHost& patavtx_soa, + const TrackSoAHost& patatracks_tsoa, std::vector& trkGood, std::vector& vtxGood) { - const auto maxTracks = patatracks_tsoa.stride(); - const int nv = patavtx_soa.nvFinal; + using patatrackHelpers = TracksUtilities; + const auto maxTracks = 
+  const int nv = patavtx_soa.view().nvFinal();
   trkGood.clear();
   trkGood.reserve(maxTracks);
   vtxGood.clear();
   vtxGood.reserve(nv);
-  auto const* quality = patatracks_tsoa.qualityData();
+  auto const* quality = patatracks_tsoa.view().quality();
   // No need to sort either as the algorithm is just using the max (not even the location, just the max value of pt2sum).
   std::vector pTSquaredSum(nv, 0);
   std::vector nTrkAssociated(nv, 0);
   for (int32_t trk_idx = 0; trk_idx < maxTracks; ++trk_idx) {
-    auto nHits = patatracks_tsoa.nHits(trk_idx);
+    auto nHits = patatrackHelpers::nHits(patatracks_tsoa.view(), trk_idx);
     if (nHits == 0) {
       break;
     }
-    int vtx_ass_to_track = patavtx_soa.idv[trk_idx];
+    int vtx_ass_to_track = patavtx_soa.view()[trk_idx].idv();
    if (vtx_ass_to_track >= 0 && vtx_ass_to_track < nv) {
-      auto patatrackPt = patatracks_tsoa.pt[trk_idx];
+      auto patatrackPt = patatracks_tsoa.view()[trk_idx].pt();
      ++nTrkAssociated[vtx_ass_to_track];
-      if (patatrackPt >= trackPtMin_ && patatracks_tsoa.chi2(trk_idx) <= trackChi2Max_) {
+      if (patatrackPt >= trackPtMin_ && patatracks_tsoa.const_view()[trk_idx].chi2() <= trackChi2Max_) {
        patatrackPt = std::min(patatrackPt, trackPtMax_);
        pTSquaredSum[vtx_ass_to_track] += patatrackPt * patatrackPt;
      }
@@ -609,7 +611,7 @@ void L2TauNNProducer::selectGoodTracksAndVertices(const ZVertexSoA& patavtx_soa,
   if (nv > 0) {
     const auto minFOM_fromFrac = (*std::max_element(pTSquaredSum.begin(), pTSquaredSum.end())) * fractionSumPt2_;
     for (int j = nv - 1; j >= 0 && vtxGood.size() < maxVtx_; --j) {
-      auto vtx_idx = patavtx_soa.sortInd[j];
+      auto vtx_idx = patavtx_soa.view()[j].sortInd();
      assert(vtx_idx < nv);
      if (nTrkAssociated[vtx_idx] >= 2 && pTSquaredSum[vtx_idx] >= minFOM_fromFrac &&
          pTSquaredSum[vtx_idx] > minSumPt2_) {
@@ -620,15 +622,14 @@ void L2TauNNProducer::selectGoodTracksAndVertices(const ZVertexSoA& patavtx_soa,
 }
 std::pair L2TauNNProducer::impactParameter(int it,
-                                           const TrackSoA& patatracks_tsoa,
+                                           const TrackSoAHost& patatracks_tsoa,
                                            float patatrackPhi,
                                            const reco::BeamSpot& beamspot,
                                            const MagneticField* magfi) {
-  auto const& fit = patatracks_tsoa.stateAtBS;
   /* dxy and dz */
   riemannFit::Vector5d ipar, opar;
   riemannFit::Matrix5d icov, ocov;
-  fit.copyToDense(ipar, icov, it);
+  TracksUtilities::copyToDense(patatracks_tsoa.view(), ipar, icov, it);
   riemannFit::transformToPerigeePlane(ipar, icov, opar, ocov);
   LocalTrajectoryParameters lpar(opar(0), opar(1), opar(2), opar(3), opar(4), 1.);
   float sp = std::sin(patatrackPhi);
@@ -653,11 +654,12 @@ std::pair L2TauNNProducer::impactParameter(int it,
 void L2TauNNProducer::fillPatatracks(tensorflow::Tensor& cellGridMatrix,
                                      const std::vector& allTaus,
-                                     const TrackSoA& patatracks_tsoa,
-                                     const ZVertexSoA& patavtx_soa,
+                                     const TrackSoAHost& patatracks_tsoa,
+                                     const ZVertexSoAHost& patavtx_soa,
                                      const reco::BeamSpot& beamspot,
                                      const MagneticField* magfi) {
   using NNInputs = L2TauTagNNv1::NNInputs;
+  using patatrackHelpers = TracksUtilities;
   float deta, dphi;
   int eta_idx = 0;
   int phi_idx = 0;
@@ -678,19 +680,19 @@ void L2TauNNProducer::fillPatatracks(tensorflow::Tensor& cellGridMatrix,
     const float tauPhi = allTaus[tau_idx]->phi();
     for (const auto it : trkGood) {
-      const float patatrackPt = patatracks_tsoa.pt[it];
+      const float patatrackPt = patatracks_tsoa.const_view()[it].pt();
      if (patatrackPt <= 0)
        continue;
-      const float patatrackPhi = patatracks_tsoa.phi(it);
-      const float patatrackEta = patatracks_tsoa.eta(it);
-      const float patatrackCharge = patatracks_tsoa.charge(it);
-      const float patatrackChi2OverNdof = patatracks_tsoa.chi2(it);
-      const auto nHits = patatracks_tsoa.nHits(it);
+      const float patatrackPhi = patatrackHelpers::phi(patatracks_tsoa.const_view(), it);
+      const float patatrackEta = patatracks_tsoa.const_view()[it].eta();
+      const float patatrackCharge = patatrackHelpers::charge(patatracks_tsoa.const_view(), it);
+      const float patatrackChi2OverNdof = patatracks_tsoa.view()[it].chi2();
+      const auto nHits = patatrackHelpers::nHits(patatracks_tsoa.const_view(), it);
      if (nHits <= 0)
        continue;
      const int patatrackNdof = 2 * std::min(6, nHits) - 5;
-      const int vtx_idx_assTrk = patavtx_soa.idv[it];
+      const int vtx_idx_assTrk = patavtx_soa.view()[it].idv();
      if (reco::deltaR2(patatrackEta, patatrackPhi, tauEta, tauPhi) < dR2_max) {
        std::tie(deta, dphi, eta_idx, phi_idx) =
            getEtaPhiIndices(patatrackEta, patatrackPhi, allTaus[tau_idx]->polarP4());
@@ -766,8 +768,8 @@ void L2TauNNProducer::produce(edm::Event& event, const edm::EventSetup& eventset
   const auto eeCal = event.getHandle(eeToken_);
   const auto hbhe = event.getHandle(hbheToken_);
   const auto ho = event.getHandle(hoToken_);
-  const auto& patatracks_SoA = *event.get(pataTracksToken_);
-  const auto& vertices_SoA = *event.get(pataVerticesToken_);
+  auto const& patatracks_SoA = event.get(pataTracksToken_);
+  auto const& vertices_SoA = event.get(pataVerticesToken_);
   const auto bsHandle = event.getHandle(beamSpotToken_);
   auto const fieldESH = eventsetup.getHandle(bFieldToken_);
diff --git a/RecoTracker/TkSeedGenerator/plugins/SeedProducerFromSoA.cc b/RecoTracker/TkSeedGenerator/plugins/SeedProducerFromSoA.cc
index 9023640f62d5a..8225885068cef 100644
--- a/RecoTracker/TkSeedGenerator/plugins/SeedProducerFromSoA.cc
+++ b/RecoTracker/TkSeedGenerator/plugins/SeedProducerFromSoA.cc
@@ -1,4 +1,4 @@
-#include "CUDADataFormats/Track/interface/PixelTrackHeterogeneous.h"
+#include "CUDADataFormats/Track/interface/TrackSoAHeterogeneousHost.h"
 #include "DataFormats/BeamSpot/interface/BeamSpot.h"
 #include "DataFormats/GeometrySurface/interface/Plane.h"
 #include "DataFormats/TrackerCommon/interface/TrackerTopology.h"
@@ -46,7 +46,7 @@ class SeedProducerFromSoAT : public edm::global::EDProducer<> {
   // Event data tokens
   const edm::EDGetTokenT tBeamSpot_;
-  const edm::EDGetTokenT> tokenTrack_;
+  const edm::EDGetTokenT> tokenTrack_;
   // Event setup tokens
   const edm::ESGetToken idealMagneticFieldToken_;
   const edm::ESGetToken trackerDigiGeometryToken_;
@@ -84,6 +84,8 @@ void SeedProducerFromSoAT::produce(edm::StreamID streamID,
   // std::cout << "Converting gpu helix to trajectory seed" << std::endl;
   auto result = std::make_unique();
+  using trackHelper = TracksUtilities;
+
   auto const& fieldESH = iSetup.getHandle(idealMagneticFieldToken_);
   auto const& tracker = iSetup.getHandle(trackerDigiGeometryToken_);
   auto const& dus = tracker->detUnits();
@@ -95,16 +97,15 @@ void SeedProducerFromSoAT::produce(edm::StreamID streamID,
   // std::cout << "beamspot " << bsh.x0() << ' ' << bsh.y0() << ' ' << bsh.z0() << std::endl;
   GlobalPoint bs(bsh.x0(), bsh.y0(), bsh.z0());
-  const auto& tsoa = *(iEvent.get(tokenTrack_));
+  auto const& tsoa = iEvent.get(tokenTrack_);
-  auto const* quality = tsoa.qualityData();
-  auto const& fit = tsoa.stateAtBS;
-  auto const& detIndices = tsoa.detIndices;
-  auto maxTracks = tsoa.stride();
+  auto const* quality = tsoa.view().quality();
+  auto const& detIndices = tsoa.view().detIndices();
+  auto maxTracks = tsoa.view().metadata().size();
   int32_t nt = 0;
   for (int32_t it = 0; it < maxTracks; ++it) {
-    auto nHits = tsoa.nHits(it);
+    auto nHits = trackHelper::nHits(tsoa.view(), it);
     if (nHits == 0)
       break;  // this is a guard: maybe we need to move to nTracks...
@@ -126,11 +127,11 @@ void SeedProducerFromSoAT::produce(edm::StreamID streamID,
       // mind: these values are with respect to the beamspot!
-      float phi = tsoa.phi(it);
+      float phi = trackHelper::phi(tsoa.view(), it);
       riemannFit::Vector5d ipar, opar;
       riemannFit::Matrix5d icov, ocov;
-      fit.copyToDense(ipar, icov, it);
+      trackHelper::copyToDense(tsoa.view(), ipar, icov, it);
       riemannFit::transformToPerigeePlane(ipar, icov, opar, ocov);
       LocalTrajectoryParameters lpar(opar(0), opar(1), opar(2), opar(3), opar(4), 1.);