diff --git a/CUDADataFormats/SiPixelCluster/interface/SiPixelClustersCUDA.h b/CUDADataFormats/SiPixelCluster/interface/SiPixelClustersCUDA.h index 4ecdf14d8d33c..0059234b1f6ee 100644 --- a/CUDADataFormats/SiPixelCluster/interface/SiPixelClustersCUDA.h +++ b/CUDADataFormats/SiPixelCluster/interface/SiPixelClustersCUDA.h @@ -5,16 +5,29 @@ #include "HeterogeneousCore/CUDAUtilities/interface/host_unique_ptr.h" #include "HeterogeneousCore/CUDAUtilities/interface/cudaCompat.h" +#include "DataFormats/SoATemplate/interface/SoALayout.h" +#include "CUDADataFormats/Common/interface/PortableDeviceCollection.h" + #include -class SiPixelClustersCUDA { +GENERATE_SOA_LAYOUT(SiPixelClustersCUDALayout, + SOA_COLUMN(uint32_t, moduleStart), + SOA_COLUMN(uint32_t, clusInModule), + SOA_COLUMN(uint32_t, moduleId), + SOA_COLUMN(uint32_t, clusModuleStart)) + +using SiPixelClustersCUDASoA = SiPixelClustersCUDALayout<>; +using SiPixelClustersCUDASOAView = SiPixelClustersCUDALayout<>::View; +using SiPixelClustersCUDASOAConstView = SiPixelClustersCUDALayout<>::ConstView; + +class SiPixelClustersCUDA : public cms::cuda::PortableDeviceCollection> { public: SiPixelClustersCUDA() = default; - explicit SiPixelClustersCUDA(size_t maxModules, cudaStream_t stream); ~SiPixelClustersCUDA() = default; - SiPixelClustersCUDA(const SiPixelClustersCUDA &) = delete; - SiPixelClustersCUDA &operator=(const SiPixelClustersCUDA &) = delete; + explicit SiPixelClustersCUDA(size_t maxModules, cudaStream_t stream) + : PortableDeviceCollection>(maxModules + 1, stream) {} + SiPixelClustersCUDA(SiPixelClustersCUDA &&) = default; SiPixelClustersCUDA &operator=(SiPixelClustersCUDA &&) = default; @@ -26,41 +39,7 @@ class SiPixelClustersCUDA { uint32_t nClusters() const { return nClusters_h; } int32_t offsetBPIX2() const { return offsetBPIX2_h; } - uint32_t *moduleStart() { return moduleStart_d.get(); } - uint32_t *clusInModule() { return clusInModule_d.get(); } - uint32_t *moduleId() { return moduleId_d.get(); } - uint32_t 
*clusModuleStart() { return clusModuleStart_d.get(); } - - uint32_t const *moduleStart() const { return moduleStart_d.get(); } - uint32_t const *clusInModule() const { return clusInModule_d.get(); } - uint32_t const *moduleId() const { return moduleId_d.get(); } - uint32_t const *clusModuleStart() const { return clusModuleStart_d.get(); } - - class SiPixelClustersCUDASOAView { - public: - __device__ __forceinline__ uint32_t moduleStart(int i) const { return __ldg(moduleStart_ + i); } - __device__ __forceinline__ uint32_t clusInModule(int i) const { return __ldg(clusInModule_ + i); } - __device__ __forceinline__ uint32_t moduleId(int i) const { return __ldg(moduleId_ + i); } - __device__ __forceinline__ uint32_t clusModuleStart(int i) const { return __ldg(clusModuleStart_ + i); } - - uint32_t const *moduleStart_; - uint32_t const *clusInModule_; - uint32_t const *moduleId_; - uint32_t const *clusModuleStart_; - }; - - SiPixelClustersCUDASOAView const *view() const { return view_d.get(); } - private: - cms::cuda::device::unique_ptr moduleStart_d; // index of the first pixel of each module - cms::cuda::device::unique_ptr clusInModule_d; // number of clusters found in each module - cms::cuda::device::unique_ptr moduleId_d; // module id of each module - - // originally from rechits - cms::cuda::device::unique_ptr clusModuleStart_d; // index of the first cluster of each module - - cms::cuda::device::unique_ptr view_d; // "me" pointer - uint32_t nClusters_h = 0; int32_t offsetBPIX2_h = 0; }; diff --git a/CUDADataFormats/SiPixelCluster/src/SiPixelClustersCUDA.cc b/CUDADataFormats/SiPixelCluster/src/SiPixelClustersCUDA.cc index c8a340d2162f9..ff2fb8cbe41e0 100644 --- a/CUDADataFormats/SiPixelCluster/src/SiPixelClustersCUDA.cc +++ b/CUDADataFormats/SiPixelCluster/src/SiPixelClustersCUDA.cc @@ -1,19 +1,19 @@ -#include "CUDADataFormats/SiPixelCluster/interface/SiPixelClustersCUDA.h" -#include "HeterogeneousCore/CUDAUtilities/interface/copyAsync.h" -#include 
"HeterogeneousCore/CUDAUtilities/interface/device_unique_ptr.h" -#include "HeterogeneousCore/CUDAUtilities/interface/host_unique_ptr.h" - -SiPixelClustersCUDA::SiPixelClustersCUDA(size_t maxModules, cudaStream_t stream) - : moduleStart_d(cms::cuda::make_device_unique(maxModules + 1, stream)), - clusInModule_d(cms::cuda::make_device_unique(maxModules, stream)), - moduleId_d(cms::cuda::make_device_unique(maxModules, stream)), - clusModuleStart_d(cms::cuda::make_device_unique(maxModules + 1, stream)) { - auto view = cms::cuda::make_host_unique(stream); - view->moduleStart_ = moduleStart_d.get(); - view->clusInModule_ = clusInModule_d.get(); - view->moduleId_ = moduleId_d.get(); - view->clusModuleStart_ = clusModuleStart_d.get(); - - view_d = cms::cuda::make_device_unique(stream); - cms::cuda::copyAsync(view_d, view, stream); -} +// #include "CUDADataFormats/SiPixelCluster/interface/SiPixelClustersCUDA.h" +// #include "HeterogeneousCore/CUDAUtilities/interface/copyAsync.h" +// #include "HeterogeneousCore/CUDAUtilities/interface/device_unique_ptr.h" +// #include "HeterogeneousCore/CUDAUtilities/interface/host_unique_ptr.h" +// +// SiPixelClustersCUDA::SiPixelClustersCUDA(size_t maxModules, cudaStream_t stream) +// : moduleStart_d(cms::cuda::make_device_unique(maxModules + 1, stream)), +// clusInModule_d(cms::cuda::make_device_unique(maxModules, stream)), +// moduleId_d(cms::cuda::make_device_unique(maxModules, stream)), +// clusModuleStart_d(cms::cuda::make_device_unique(maxModules + 1, stream)) { +// auto view = cms::cuda::make_host_unique(stream); +// view->moduleStart_ = moduleStart_d.get(); +// view->clusInModule_ = clusInModule_d.get(); +// view->moduleId_ = moduleId_d.get(); +// view->clusModuleStart_ = clusModuleStart_d.get(); +// +// view_d = cms::cuda::make_device_unique(stream); +// cms::cuda::copyAsync(view_d, view, stream); +// } diff --git a/CUDADataFormats/SiPixelDigi/interface/SiPixelDigisCUDA.h b/CUDADataFormats/SiPixelDigi/interface/SiPixelDigisCUDA.h 
index cf6b51687982f..29f2637b569fa 100644 --- a/CUDADataFormats/SiPixelDigi/interface/SiPixelDigisCUDA.h +++ b/CUDADataFormats/SiPixelDigi/interface/SiPixelDigisCUDA.h @@ -6,17 +6,33 @@ #include "HeterogeneousCore/CUDAUtilities/interface/device_unique_ptr.h" #include "HeterogeneousCore/CUDAUtilities/interface/host_unique_ptr.h" #include "HeterogeneousCore/CUDAUtilities/interface/cudaCompat.h" -#include "CUDADataFormats/SiPixelDigi/interface/SiPixelDigisCUDASOAView.h" +// #include "CUDADataFormats/SiPixelDigi/interface/SiPixelDigisCUDASOAView.h" +#include "CUDADataFormats/Common/interface/PortableDeviceCollection.h" +#include "DataFormats/SoATemplate/interface/SoALayout.h" -class SiPixelDigisCUDA { +GENERATE_SOA_LAYOUT(SiPixelDigisSoALayout, + SOA_COLUMN(int32_t, clus), + SOA_COLUMN(uint32_t, pdigi), + SOA_COLUMN(uint32_t, rawIdArr), + SOA_COLUMN(uint16_t, adc), + SOA_COLUMN(uint16_t, xx), + SOA_COLUMN(uint16_t, yy), + SOA_COLUMN(uint16_t, moduleId)) + +using SiPixelDigisCUDASOA = SiPixelDigisSoALayout<>; +using SiPixelDigisCUDASOAView = SiPixelDigisCUDASOA::View; +using SiPixelDigisCUDASOAConstView = SiPixelDigisCUDASOA::ConstView; + +class SiPixelDigisCUDA : public cms::cuda::PortableDeviceCollection> { public: - using StoreType = uint16_t; + // using StoreType = uint16_t; SiPixelDigisCUDA() = default; - explicit SiPixelDigisCUDA(size_t maxFedWords, cudaStream_t stream); + explicit SiPixelDigisCUDA(size_t maxFedWords, cudaStream_t stream) + : PortableDeviceCollection>(maxFedWords + 1, stream) {} ~SiPixelDigisCUDA() = default; - SiPixelDigisCUDA(const SiPixelDigisCUDA &) = delete; - SiPixelDigisCUDA &operator=(const SiPixelDigisCUDA &) = delete; + // SiPixelDigisCUDA(const SiPixelDigisCUDA &) = delete; + // SiPixelDigisCUDA &operator=(const SiPixelDigisCUDA &) = delete; SiPixelDigisCUDA(SiPixelDigisCUDA &&) = default; SiPixelDigisCUDA &operator=(SiPixelDigisCUDA &&) = default; @@ -28,16 +44,23 @@ class SiPixelDigisCUDA { uint32_t nModules() const { return 
nModules_h; } uint32_t nDigis() const { return nDigis_h; } - cms::cuda::host::unique_ptr copyAllToHostAsync(cudaStream_t stream) const; + // cms::cuda::host::unique_ptr copyAllToHostAsync(cudaStream_t stream) const; + + cms::cuda::host::unique_ptr copyAllToHostAsync(cudaStream_t stream) const { + // Copy to a host buffer the host-device shared part (m_hostDeviceLayout). + auto ret = cms::cuda::make_host_unique(bufferSize(), stream); + cudaCheck(cudaMemcpyAsync(ret.get(), buffer().get(), bufferSize(), cudaMemcpyDeviceToHost, stream)); + return ret; + } - SiPixelDigisCUDASOAView view() { return m_view; } - SiPixelDigisCUDASOAView const view() const { return m_view; } + // SiPixelDigisCUDASOAView view() { return m_view; } + // SiPixelDigisCUDASOAView const view() const { return m_view; } private: // These are consumed by downstream device code - cms::cuda::device::unique_ptr m_store; + // cms::cuda::device::unique_ptr m_store; - SiPixelDigisCUDASOAView m_view; + // SiPixelDigisCUDASOAView m_view; uint32_t nModules_h = 0; uint32_t nDigis_h = 0; diff --git a/CUDADataFormats/SiPixelDigi/interface/SiPixelDigisCUDASOAView.h b/CUDADataFormats/SiPixelDigi/interface/SiPixelDigisCUDASOAView.h index 78406cd241473..74f1fe79a7026 100644 --- a/CUDADataFormats/SiPixelDigi/interface/SiPixelDigisCUDASOAView.h +++ b/CUDADataFormats/SiPixelDigi/interface/SiPixelDigisCUDASOAView.h @@ -1,112 +1,109 @@ -#ifndef CUDADataFormats_SiPixelDigi_interface_SiPixelDigisCUDASOAView_h -#define CUDADataFormats_SiPixelDigi_interface_SiPixelDigisCUDASOAView_h - -#include - -#include "HeterogeneousCore/CUDAUtilities/interface/device_unique_ptr.h" -#include "HeterogeneousCore/CUDAUtilities/interface/host_unique_ptr.h" -#include "HeterogeneousCore/CUDAUtilities/interface/cudaCompat.h" - -#include - -class SiPixelDigisCUDASOAView { -public: - friend class SiPixelDigisCUDA; - - template - friend class SiPixelRecHitSoAFromLegacyT; - - enum class StorageLocation { - kCLUS = 0, - kPDIGI = 2, - kRAWIDARR = 4, - 
kADC = 6, - kXX = 7, - kYY = 8, - kMODULEIND = 9, - kMAX = 10 - }; - /* - ============================================================================================================================ - | CLUS | PDIGI | RAWIDARR | ADC | XX | YY | MODULEIND | - ============================================================================================================================ - | 0: N*32 | 2: N*32 | 4: N*32 | 6: N*16 | 7: N*16 | 8: N*16 | 9: N*16 | - ============================================================================================================================ - */ - // These are for CPU output - // we don't copy local x and y coordinates and module index - enum class StorageLocationHost { kCLUS = 0, kPDIGI = 2, kRAWIDARR = 4, kADC = 6, kMAX = 7 }; - /* - ======================================================================================== - | CLUS | PDIGI | RAWIDARR | ADC | - ======================================================================================== - | 0: N*32 | 2: N*32 | 4: N*32 | 6: N*16 | - ======================================================================================== - */ - - SiPixelDigisCUDASOAView() = default; - - template - SiPixelDigisCUDASOAView(StoreType& store, int maxFedWords, StorageLocation s) { - xx_ = getColumnAddress(StorageLocation::kXX, store, maxFedWords); - yy_ = getColumnAddress(StorageLocation::kYY, store, maxFedWords); - adc_ = getColumnAddress(StorageLocation::kADC, store, maxFedWords); - moduleInd_ = getColumnAddress(StorageLocation::kMODULEIND, store, maxFedWords); - clus_ = getColumnAddress(StorageLocation::kCLUS, store, maxFedWords); - pdigi_ = getColumnAddress(StorageLocation::kPDIGI, store, maxFedWords); - rawIdArr_ = getColumnAddress(StorageLocation::kRAWIDARR, store, maxFedWords); - } - - template - SiPixelDigisCUDASOAView(StoreType& store, int maxFedWords, StorageLocationHost s) { - adc_ = getColumnAddress(StorageLocationHost::kADC, store, maxFedWords); - clus_ = 
getColumnAddress(StorageLocationHost::kCLUS, store, maxFedWords); - pdigi_ = getColumnAddress(StorageLocationHost::kPDIGI, store, maxFedWords); - rawIdArr_ = getColumnAddress(StorageLocationHost::kRAWIDARR, store, maxFedWords); - } - - __device__ __forceinline__ uint16_t xx(int i) const { return __ldg(xx_ + i); } - __device__ __forceinline__ uint16_t yy(int i) const { return __ldg(yy_ + i); } - __device__ __forceinline__ uint16_t adc(int i) const { return __ldg(adc_ + i); } - __device__ __forceinline__ uint16_t moduleInd(int i) const { return __ldg(moduleInd_ + i); } - __device__ __forceinline__ int32_t clus(int i) const { return __ldg(clus_ + i); } - __device__ __forceinline__ uint32_t pdigi(int i) const { return __ldg(pdigi_ + i); } - __device__ __forceinline__ uint32_t rawIdArr(int i) const { return __ldg(rawIdArr_ + i); } - - const uint16_t* xx() const { return xx_; } - const uint16_t* yy() const { return yy_; } - const uint16_t* adc() const { return adc_; } - const uint16_t* moduleInd() const { return moduleInd_; } - const int32_t* clus() const { return clus_; } - const uint32_t* pdigi() const { return pdigi_; } - const uint32_t* rawIdArr() const { return rawIdArr_; } - - uint16_t* xx() { return xx_; } - uint16_t* yy() { return yy_; } - uint16_t* adc() { return adc_; } - uint16_t* moduleInd() { return moduleInd_; } - int32_t* clus() { return clus_; } - uint32_t* pdigi() { return pdigi_; } - uint32_t* rawIdArr() { return rawIdArr_; } - -private: - uint16_t* xx_; // local coordinates of each pixel - uint16_t* yy_; - uint16_t* adc_; // ADC of each pixel - uint16_t* moduleInd_; // module id of each pixel - int32_t* clus_; // cluster id of each pixel - uint32_t* pdigi_; - uint32_t* rawIdArr_; - - template - ReturnType* getColumnAddress(LocationType column, StoreType& store, int size) { - return reinterpret_cast(store.get() + static_cast(column) * roundFor128ByteAlignment(size)); - } - - static int roundFor128ByteAlignment(int size) { - constexpr int mul = 128 / 
sizeof(uint16_t); - return ((size + mul - 1) / mul) * mul; - }; -}; - -#endif +// #ifndef CUDADataFormats_SiPixelDigi_interface_SiPixelDigisCUDASOAView_h +// #define CUDADataFormats_SiPixelDigi_interface_SiPixelDigisCUDASOAView_h +// +// #include +// +// #include "HeterogeneousCore/CUDAUtilities/interface/device_unique_ptr.h" +// #include "HeterogeneousCore/CUDAUtilities/interface/host_unique_ptr.h" +// #include "HeterogeneousCore/CUDAUtilities/interface/cudaCompat.h" +// +// #include +// +// class SiPixelDigisCUDASOAView { +// public: +// friend class SiPixelDigisCUDA; +// friend class SiPixelRecHitSoAFromLegacy; +// enum class StorageLocation { +// kCLUS = 0, +// kPDIGI = 2, +// kRAWIDARR = 4, +// kADC = 6, +// kXX = 7, +// kYY = 8, +// kMODULEIND = 9, +// kMAX = 10 +// }; +// /* +// ============================================================================================================================ +// | CLUS | PDIGI | RAWIDARR | ADC | XX | YY | MODULEIND | +// ============================================================================================================================ +// | 0: N*32 | 2: N*32 | 4: N*32 | 6: N*16 | 7: N*16 | 8: N*16 | 9: N*16 | +// ============================================================================================================================ +// */ +// // These are for CPU output +// // we don't copy local x and y coordinates and module index +// enum class StorageLocationHost { kCLUS = 0, kPDIGI = 2, kRAWIDARR = 4, kADC = 6, kMAX = 7 }; +// /* +// ======================================================================================== +// | CLUS | PDIGI | RAWIDARR | ADC | +// ======================================================================================== +// | 0: N*32 | 2: N*32 | 4: N*32 | 6: N*16 | +// ======================================================================================== +// */ +// +// SiPixelDigisCUDASOAView() = default; +// +// template +// SiPixelDigisCUDASOAView(StoreType& 
store, int maxFedWords, StorageLocation s) { +// xx_ = getColumnAddress(StorageLocation::kXX, store, maxFedWords); +// yy_ = getColumnAddress(StorageLocation::kYY, store, maxFedWords); +// adc_ = getColumnAddress(StorageLocation::kADC, store, maxFedWords); +// moduleInd_ = getColumnAddress(StorageLocation::kMODULEIND, store, maxFedWords); +// clus_ = getColumnAddress(StorageLocation::kCLUS, store, maxFedWords); +// pdigi_ = getColumnAddress(StorageLocation::kPDIGI, store, maxFedWords); +// rawIdArr_ = getColumnAddress(StorageLocation::kRAWIDARR, store, maxFedWords); +// } +// +// template +// SiPixelDigisCUDASOAView(StoreType& store, int maxFedWords, StorageLocationHost s) { +// adc_ = getColumnAddress(StorageLocationHost::kADC, store, maxFedWords); +// clus_ = getColumnAddress(StorageLocationHost::kCLUS, store, maxFedWords); +// pdigi_ = getColumnAddress(StorageLocationHost::kPDIGI, store, maxFedWords); +// rawIdArr_ = getColumnAddress(StorageLocationHost::kRAWIDARR, store, maxFedWords); +// } +// +// __device__ __forceinline__ uint16_t xx(int i) const { return __ldg(xx_ + i); } +// __device__ __forceinline__ uint16_t yy(int i) const { return __ldg(yy_ + i); } +// __device__ __forceinline__ uint16_t adc(int i) const { return __ldg(adc_ + i); } +// __device__ __forceinline__ uint16_t moduleInd(int i) const { return __ldg(moduleInd_ + i); } +// __device__ __forceinline__ int32_t clus(int i) const { return __ldg(clus_ + i); } +// __device__ __forceinline__ uint32_t pdigi(int i) const { return __ldg(pdigi_ + i); } +// __device__ __forceinline__ uint32_t rawIdArr(int i) const { return __ldg(rawIdArr_ + i); } +// +// const uint16_t* xx() const { return xx_; } +// const uint16_t* yy() const { return yy_; } +// const uint16_t* adc() const { return adc_; } +// const uint16_t* moduleInd() const { return moduleInd_; } +// const int32_t* clus() const { return clus_; } +// const uint32_t* pdigi() const { return pdigi_; } +// const uint32_t* rawIdArr() const { return rawIdArr_; 
} +// +// uint16_t* xx() { return xx_; } +// uint16_t* yy() { return yy_; } +// uint16_t* adc() { return adc_; } +// uint16_t* moduleInd() { return moduleInd_; } +// int32_t* clus() { return clus_; } +// uint32_t* pdigi() { return pdigi_; } +// uint32_t* rawIdArr() { return rawIdArr_; } +// +// private: +// uint16_t* xx_; // local coordinates of each pixel +// uint16_t* yy_; +// uint16_t* adc_; // ADC of each pixel +// uint16_t* moduleInd_; // module id of each pixel +// int32_t* clus_; // cluster id of each pixel +// uint32_t* pdigi_; +// uint32_t* rawIdArr_; +// +// template +// ReturnType* getColumnAddress(LocationType column, StoreType& store, int size) { +// return reinterpret_cast(store.get() + static_cast(column) * roundFor128ByteAlignment(size)); +// } +// +// static int roundFor128ByteAlignment(int size) { +// constexpr int mul = 128 / sizeof(uint16_t); +// return ((size + mul - 1) / mul) * mul; +// }; +// }; +// +// #endif diff --git a/CUDADataFormats/SiPixelDigi/src/SiPixelDigisCUDA.cc b/CUDADataFormats/SiPixelDigi/src/SiPixelDigisCUDA.cc index 9a7f8ae8bdad5..a40955d3767ac 100644 --- a/CUDADataFormats/SiPixelDigi/src/SiPixelDigisCUDA.cc +++ b/CUDADataFormats/SiPixelDigi/src/SiPixelDigisCUDA.cc @@ -1,29 +1,29 @@ -#include - -#include "CUDADataFormats/SiPixelDigi/interface/SiPixelDigisCUDA.h" -#include "HeterogeneousCore/CUDAUtilities/interface/copyAsync.h" -#include "HeterogeneousCore/CUDAUtilities/interface/device_unique_ptr.h" -#include "HeterogeneousCore/CUDAUtilities/interface/host_unique_ptr.h" - -SiPixelDigisCUDA::SiPixelDigisCUDA(size_t maxFedWords, cudaStream_t stream) - : m_store(cms::cuda::make_device_unique( - SiPixelDigisCUDASOAView::roundFor128ByteAlignment(maxFedWords) * - static_cast(SiPixelDigisCUDASOAView::StorageLocation::kMAX), - stream)), - m_view(m_store, maxFedWords, SiPixelDigisCUDASOAView::StorageLocation::kMAX) { - assert(maxFedWords != 0); -} - -cms::cuda::host::unique_ptr SiPixelDigisCUDA::copyAllToHostAsync( - cudaStream_t 
stream) const { - auto ret = cms::cuda::make_host_unique( - m_view.roundFor128ByteAlignment(nDigis()) * static_cast(SiPixelDigisCUDASOAView::StorageLocationHost::kMAX), - stream); - cudaCheck(cudaMemcpyAsync(ret.get(), - m_view.clus(), - m_view.roundFor128ByteAlignment(nDigis()) * sizeof(SiPixelDigisCUDA::StoreType) * - static_cast(SiPixelDigisCUDASOAView::StorageLocationHost::kMAX), - cudaMemcpyDeviceToHost, - stream)); - return ret; -} +// #include +// +// #include "CUDADataFormats/SiPixelDigi/interface/SiPixelDigisCUDA.h" +// #include "HeterogeneousCore/CUDAUtilities/interface/copyAsync.h" +// #include "HeterogeneousCore/CUDAUtilities/interface/device_unique_ptr.h" +// #include "HeterogeneousCore/CUDAUtilities/interface/host_unique_ptr.h" +// +// SiPixelDigisCUDA::SiPixelDigisCUDA(size_t maxFedWords, cudaStream_t stream) +// : m_store(cms::cuda::make_device_unique( +// SiPixelDigisCUDASOAView::roundFor128ByteAlignment(maxFedWords) * +// static_cast(SiPixelDigisCUDASOAView::StorageLocation::kMAX), +// stream)), +// m_view(m_store, maxFedWords, SiPixelDigisCUDASOAView::StorageLocation::kMAX) { +// assert(maxFedWords != 0); +// } +// +// cms::cuda::host::unique_ptr SiPixelDigisCUDA::copyAllToHostAsync( +// cudaStream_t stream) const { +// auto ret = cms::cuda::make_host_unique( +// m_view.roundFor128ByteAlignment(nDigis()) * static_cast(SiPixelDigisCUDASOAView::StorageLocationHost::kMAX), +// stream); +// cudaCheck(cudaMemcpyAsync(ret.get(), +// m_view.clus(), +// m_view.roundFor128ByteAlignment(nDigis()) * sizeof(SiPixelDigisCUDA::StoreType) * +// static_cast(SiPixelDigisCUDASOAView::StorageLocationHost::kMAX), +// cudaMemcpyDeviceToHost, +// stream)); +// return ret; +// } diff --git a/CUDADataFormats/Track/interface/PixelTrackHeterogeneous.h b/CUDADataFormats/Track/interface/PixelTrackHeterogeneous.h index f9e9b3a37c63f..6ce0285e6eafb 100644 --- a/CUDADataFormats/Track/interface/PixelTrackHeterogeneous.h +++ 
b/CUDADataFormats/Track/interface/PixelTrackHeterogeneous.h @@ -1,11 +1,11 @@ -#ifndef CUDADataFormats_Track_PixelTrackHeterogeneous_h -#define CUDADataFormats_Track_PixelTrackHeterogeneous_h - -#include "CUDADataFormats/Common/interface/HeterogeneousSoA.h" -#include "CUDADataFormats/Track/interface/TrackSoAHeterogeneousT.h" -#include "Geometry/CommonTopologies/interface/SimplePixelTopology.h" - -template -using PixelTrackHeterogeneousT = HeterogeneousSoA>; - -#endif // #ifndef CUDADataFormats_Track_PixelTrackHeterogeneous_h +// #ifndef CUDADataFormats_Track_PixelTrackHeterogeneous_h +// #define CUDADataFormats_Track_PixelTrackHeterogeneous_h +// +// #include "CUDADataFormats/Common/interface/HeterogeneousSoA.h" +// #include "CUDADataFormats/Track/interface/TrackSoAHeterogeneousT.h" +// #include "Geometry/CommonTopologies/interface/SimplePixelTopology.h" +// +// template +// using PixelTrackHeterogeneousT = HeterogeneousSoA>; +// +// #endif // #ifndef CUDADataFormats_Track_PixelTrackHeterogeneous_h diff --git a/CUDADataFormats/Track/interface/PixelTrackUtilities.h b/CUDADataFormats/Track/interface/PixelTrackUtilities.h new file mode 100644 index 0000000000000..29839cfdbafb1 --- /dev/null +++ b/CUDADataFormats/Track/interface/PixelTrackUtilities.h @@ -0,0 +1,262 @@ +#ifndef CUDADataFormats_Track_PixelTrackUtilities_h +#define CUDADataFormats_Track_PixelTrackUtilities_h + +#include +#include +#include "Geometry/CommonTopologies/interface/SimplePixelTopology.h" +#include "HeterogeneousCore/CUDAUtilities/interface/HistoContainer.h" +#include "DataFormats/SoATemplate/interface/SoALayout.h" + + +namespace pixelTrackSoA { + + enum class Quality : uint8_t { bad = 0, edup, dup, loose, strict, tight, highPurity, notQuality }; + constexpr uint32_t qualitySize{uint8_t(Quality::notQuality)}; + const std::string qualityName[qualitySize]{"bad", "edup", "dup", "loose", "strict", "tight", "highPurity"}; + inline Quality qualityByName(std::string const &name) { + auto qp = 
std::find(qualityName, qualityName + qualitySize, name) - qualityName; + return static_cast(qp); + } + +} // namespace pixelTrackSoA + +template +struct trackSoA +{ + static constexpr int32_t S = TrackerTraits::maxNumberOfTuples; + static constexpr int32_t H = TrackerTraits::maxHitsOnTrack; + // Aliases in order to not confuse the GENERATE_SOA_LAYOUT + // macro with weird colons and angled brackets. + using Vector5f = Eigen::Matrix; + using Vector15f = Eigen::Matrix; + using Quality = pixelTrackSoA::Quality; + + using hindex_type = uint32_t; + + using HitContainer = cms::cuda::OneToManyAssoc; + + GENERATE_SOA_LAYOUT(TrackSoALayout, + SOA_COLUMN(Quality, quality), + SOA_COLUMN(float, chi2), + SOA_COLUMN(int8_t, nLayers), + SOA_COLUMN(float, eta), + SOA_COLUMN(float, pt), + SOA_EIGEN_COLUMN(Vector5f, state), + SOA_EIGEN_COLUMN(Vector15f, covariance), + SOA_SCALAR(int, nTracks), + SOA_SCALAR(HitContainer, hitIndices), + SOA_SCALAR(HitContainer, detIndices)) + + +}; + + +// Methods that operate on View and ConstView of the TrackSoA, and cannot be class methods. 
+ +template +struct tracksUtilities +{ + + using TrackSoAView = typename trackSoA::template TrackSoALayout<>::View; + using TrackSoAConstView = typename trackSoA::template TrackSoALayout<>::ConstView; + using hindex_type = typename trackSoA::hindex_type; + + // State at the Beam spot + // phi,tip,1/pt,cotan(theta),zip + static constexpr __host__ __device__ inline float charge(TrackSoAConstView tracks, int32_t i) { + return std::copysign(1.f, tracks[i].state()(2)); + } + + static constexpr __host__ __device__ inline float phi(TrackSoAConstView tracks, int32_t i) { return tracks[i].state()(0); } + + static constexpr __host__ __device__ inline float tip(TrackSoAConstView tracks, int32_t i) { return tracks[i].state()(1); } + + static constexpr __host__ __device__ inline float zip(TrackSoAConstView tracks, int32_t i) { return tracks[i].state()(4); } + + static constexpr __host__ __device__ inline bool isTriplet(TrackSoAConstView tracks, int i) { + return tracks[i].nLayers() == 3; + } + + template + static constexpr __host__ __device__ inline void copyFromCircle( + TrackSoAView &tracks, V3 const &cp, M3 const &ccov, V2 const &lp, M2 const &lcov, float b, int32_t i) { + tracks[i].state() << cp.template cast(), lp.template cast(); + + tracks[i].state()(2) = tracks[i].state()(2) * b; + auto cov = tracks[i].covariance(); + cov(0) = ccov(0, 0); + cov(1) = ccov(0, 1); + cov(2) = b * float(ccov(0, 2)); + cov(4) = cov(3) = 0; + cov(5) = ccov(1, 1); + cov(6) = b * float(ccov(1, 2)); + cov(8) = cov(7) = 0; + cov(9) = b * b * float(ccov(2, 2)); + cov(11) = cov(10) = 0; + cov(12) = lcov(0, 0); + cov(13) = lcov(0, 1); + cov(14) = lcov(1, 1); + } + + template + static constexpr __host__ __device__ inline void copyFromDense(TrackSoAView &tracks, V5 const &v, M5 const &cov, int32_t i) { + tracks[i].state() = v.template cast(); + for (int j = 0, ind = 0; j < 5; ++j) + for (auto k = j; k < 5; ++k) + tracks[i].covariance()(ind++) = cov(j, k); + } + + template + static constexpr __host__ 
__device__ inline void copyToDense(TrackSoAConstView tracks, V5 &v, M5 &cov, int32_t i) { + v = tracks[i].state().template cast(); + for (int j = 0, ind = 0; j < 5; ++j) { + cov(j, j) = tracks[i].covariance()(ind++); + for (auto k = j + 1; k < 5; ++k) + cov(k, j) = cov(j, k) = tracks[i].covariance()(ind++); + } + } + + // TODO: Not using TrackSoAConstView due to weird bugs with HitContainer + static constexpr __host__ __device__ inline int computeNumberOfLayers(TrackSoAView &tracks, int32_t i) { + auto pdet = tracks.detIndices().begin(i); + int nl = 1; + auto ol = pixelTopology::getLayer(*pdet); + for (; pdet < tracks.detIndices().end(i); ++pdet) { + auto il = pixelTopology::getLayer(*pdet); + if (il != ol) + ++nl; + ol = il; + } + return nl; + } + + static constexpr __host__ __device__ inline int nHits(TrackSoAConstView tracks, int i) { return tracks.detIndices().size(i); } + +}; + +namespace pixelTrackSoA { + + // template + // using TrackLayout = typename trackSoA::template TrackSoALayout<>; + // template + // using TrackSoAView = typename trackSoA::template TrackSoALayout<>::View; + // template + // using TrackSoAConstView = typename trackSoA::template TrackSoALayout<>::ConstView; + + template + struct QualityCutsT {}; + + template + struct QualityCutsT> { + + using TrackSoAView = typename trackSoA::template TrackSoALayout<>::View; + using TrackSoAConstView = typename trackSoA::template TrackSoALayout<>::ConstView; + using tracksHelper = tracksUtilities; + // chi2 cut = chi2Scale * (chi2Coeff[0] + pT/GeV * (chi2Coeff[1] + pT/GeV * (chi2Coeff[2] + pT/GeV * chi2Coeff[3]))) + float chi2Coeff[4]; + float chi2MaxPt; // GeV + float chi2Scale; + + struct Region { + float maxTip; // cm + float minPt; // GeV + float maxZip; // cm + }; + + Region triplet; + Region quadruplet; + + __device__ __forceinline__ bool isHP(TrackSoAConstView tracks, + int nHits, + int it) const { + // impose "region cuts" based on the fit results (phi, Tip, pt, cotan(theta)), Zip) + // default 
cuts: + // - for triplets: |Tip| < 0.3 cm, pT > 0.5 GeV, |Zip| < 12.0 cm + // - for quadruplets: |Tip| < 0.5 cm, pT > 0.3 GeV, |Zip| < 12.0 cm + // (see CAHitNtupletGeneratorGPU.cc) + auto const ®ion = (nHits > 3) ? quadruplet : triplet; + return (std::abs(tracksHelper::tip(tracks,it)) < region.maxTip) and (tracks.pt(it) > region.minPt) and + (std::abs(tracksHelper::zip(tracks,it)) < region.maxZip); + } + + __device__ __forceinline__ bool strictCut(TrackSoAConstView tracks, + int it) const { + auto roughLog = [](float x) { + // max diff [0.5,12] at 1.25 0.16143 + // average diff 0.0662998 + union IF { + uint32_t i; + float f; + }; + IF z; + z.f = x; + uint32_t lsb = 1 < 21; + z.i += lsb; + z.i >>= 21; + auto f = z.i & 3; + int ex = int(z.i >> 2) - 127; + + // log2(1+0.25*f) + // averaged over bins + const float frac[4] = {0.160497f, 0.452172f, 0.694562f, 0.901964f}; + return float(ex) + frac[f]; + }; + + float pt = std::min(tracks.pt(it), chi2MaxPt); + float chi2Cut = chi2Scale * (chi2Coeff[0] + roughLog(pt) * chi2Coeff[1]); + if (tracks.chi2(it) >= chi2Cut) { +#ifdef NTUPLE_FIT_DEBUG + printf("Bad chi2 %d pt %f eta %f chi2 %f\n", it, tracks.pt(it), tracks.eta(it), tracks.chi2(it)); +#endif + return true; + } + return false; + } + }; + + template + struct QualityCutsT> { + + using TrackSoAView = typename trackSoA::template TrackSoALayout<>::View; + using TrackSoAConstView = typename trackSoA::template TrackSoALayout<>::ConstView; + using tracksHelper = tracksUtilities; + + float maxChi2; + float minPt; + float maxTip; + float maxZip; + + __device__ __forceinline__ bool isHP(TrackSoAConstView tracks, + int nHits, + int it) const { + return (std::abs(tracksHelper::tip(tracks,it)) < maxTip) and (tracks.pt(it) > minPt) and (std::abs(tracksHelper::zip(tracks,it)) < maxZip); + } + __device__ __forceinline__ bool strictCut(TrackSoAConstView tracks, + int it) const { + return tracks.chi2(it) >= maxChi2; + } + }; + + +}// pixelTrackSoA + + +template +using TrackLayout = 
typename trackSoA::template TrackSoALayout<>; +template +using TrackSoAView = typename trackSoA::template TrackSoALayout<>::View; +template +using TrackSoAConstView = typename trackSoA::template TrackSoALayout<>::ConstView; + +template struct tracksUtilities; +template struct tracksUtilities; + +// namespace pixelTrack { +// // Common types for both Host and Device code +// using TrackSoALayout = TrackSoAHeterogeneousLayout<>; +// using TrackSoAView = TrackSoAHeterogeneousLayout<>::View; +// using TrackSoAConstView = TrackSoAHeterogeneousLayout<>::ConstView; +// +// } // namespace pixelTrack + +#endif diff --git a/CUDADataFormats/Track/interface/TrackSoAHeterogeneousDevice.h b/CUDADataFormats/Track/interface/TrackSoAHeterogeneousDevice.h new file mode 100644 index 0000000000000..ccf584b4cf995 --- /dev/null +++ b/CUDADataFormats/Track/interface/TrackSoAHeterogeneousDevice.h @@ -0,0 +1,35 @@ +#ifndef CUDADataFormats_Track_TrackHeterogeneousDevice_H +#define CUDADataFormats_Track_TrackHeterogeneousDevice_H + +#include + +#include "CUDADataFormats/Track/interface/PixelTrackUtilities.h" +#include "CUDADataFormats/Common/interface/PortableDeviceCollection.h" + +#include "Geometry/CommonTopologies/interface/SimplePixelTopology.h" + + +template +class TrackSoAHeterogeneousDevice : public cms::cuda::PortableDeviceCollection> { +public: + + using cms::cuda::PortableDeviceCollection>::view; + using cms::cuda::PortableDeviceCollection>::const_view; + using cms::cuda::PortableDeviceCollection>::buffer; + using cms::cuda::PortableDeviceCollection>::bufferSize; + + TrackSoAHeterogeneousDevice() = default; // cms::cuda::Product needs this + + // Constructor which specifies the SoA size + explicit TrackSoAHeterogeneousDevice(cudaStream_t stream) + : cms::cuda::PortableDeviceCollection>(TrackerTraits::maxNumberOfTuples, stream) {} +}; + +namespace pixelTrack { + + using TrackSoADevicePhase1 = TrackSoAHeterogeneousDevice; + using TrackSoADevicePhase2 = TrackSoAHeterogeneousDevice; + 
+} // namespace pixelTrack + +#endif // CUDADataFormats_Track_TrackHeterogeneousT_H diff --git a/CUDADataFormats/Track/interface/TrackSoAHeterogeneousHost.h b/CUDADataFormats/Track/interface/TrackSoAHeterogeneousHost.h new file mode 100644 index 0000000000000..dcf47d85f7455 --- /dev/null +++ b/CUDADataFormats/Track/interface/TrackSoAHeterogeneousHost.h @@ -0,0 +1,31 @@ +#ifndef CUDADataFormats_Track_TrackHeterogeneousHost_H +#define CUDADataFormats_Track_TrackHeterogeneousHost_H + +#include + +#include "CUDADataFormats/Track/interface/PixelTrackUtilities.h" +#include "CUDADataFormats/Common/interface/PortableHostCollection.h" + +template +class TrackSoAHeterogeneousHost : public cms::cuda::PortableHostCollection> { +public: + TrackSoAHeterogeneousHost() = default; + + using cms::cuda::PortableHostCollection>::view; + using cms::cuda::PortableHostCollection>::const_view; + using cms::cuda::PortableHostCollection>::buffer; + using cms::cuda::PortableHostCollection>::bufferSize; + + // Constructor which specifies the SoA size + explicit TrackSoAHeterogeneousHost(cudaStream_t stream) + : cms::cuda::PortableHostCollection>(TrackerTraits::maxNumberOfTuples, stream) {} +}; + +namespace pixelTrack { + + using TrackSoAHostPhase1 = TrackSoAHeterogeneousHost; + using TrackSoAHostPhase2 = TrackSoAHeterogeneousHost; + +} // namespace pixelTrack + +#endif // CUDADataFormats_Track_TrackHeterogeneousT_H diff --git a/CUDADataFormats/Track/interface/TrackSoAHeterogeneousT.h b/CUDADataFormats/Track/interface/TrackSoAHeterogeneousT.h index b5b1df0d5118a..dd1aae94f5732 100644 --- a/CUDADataFormats/Track/interface/TrackSoAHeterogeneousT.h +++ b/CUDADataFormats/Track/interface/TrackSoAHeterogeneousT.h @@ -1,195 +1,195 @@ -#ifndef CUDADataFormats_Track_TrackHeterogeneousT_H -#define CUDADataFormats_Track_TrackHeterogeneousT_H - -#include -#include - -#include "CUDADataFormats/Track/interface/TrajectoryStateSoAT.h" -#include "HeterogeneousCore/CUDAUtilities/interface/HistoContainer.h" 
-#include "Geometry/CommonTopologies/interface/SimplePixelTopology.h" -#include "CUDADataFormats/Common/interface/HeterogeneousSoA.h" -#include "DataFormats/Common/interface/CMS_CLASS_VERSION.h" - -namespace pixelTrack { - - enum class Quality : uint8_t { bad = 0, edup, dup, loose, strict, tight, highPurity, notQuality }; - constexpr uint32_t qualitySize{uint8_t(Quality::notQuality)}; - const std::string qualityName[qualitySize]{"bad", "edup", "dup", "loose", "strict", "tight", "highPurity"}; - inline Quality qualityByName(std::string const &name) { - auto qp = std::find(qualityName, qualityName + qualitySize, name) - qualityName; - return static_cast(qp); - } - -} // namespace pixelTrack - -template -class TrackSoAHeterogeneousT { -public: - static constexpr int32_t S = TrackerTraits::maxNumberOfTuples; - static constexpr int32_t H = TrackerTraits::maxHitsOnTrack; // Average hits rather than max? - static constexpr int32_t stride() { return S; } - - using hindex_type = uint32_t; //TrackerTraits::hindex_type ? - - using Quality = pixelTrack::Quality; - using HitContainer = cms::cuda::OneToManyAssoc; - - // Always check quality is at least loose! - // CUDA does not support enums in __lgc ... 
-protected: - eigenSoA::ScalarSoA quality_; - -public: - constexpr Quality quality(int32_t i) const { return (Quality)(quality_(i)); } - constexpr Quality &quality(int32_t i) { return (Quality &)(quality_(i)); } - constexpr Quality const *qualityData() const { return (Quality const *)(quality_.data()); } - constexpr Quality *qualityData() { return (Quality *)(quality_.data()); } - - // this is chi2/ndof as not necessarely all hits are used in the fit - eigenSoA::ScalarSoA chi2; - - eigenSoA::ScalarSoA nLayers; - - constexpr int nTracks() const { return nTracks_; } - constexpr void setNTracks(int n) { nTracks_ = n; } - - constexpr int nHits(int i) const { return detIndices.size(i); } - - constexpr bool isTriplet(int i) const { return nLayers(i) == 3; } - - constexpr int computeNumberOfLayers(int32_t i) const { - // layers are in order and we assume tracks are either forward or backward - auto pdet = detIndices.begin(i); - int nl = 1; - auto ol = pixelTopology::getLayer(*pdet); - for (; pdet < detIndices.end(i); ++pdet) { - auto il = pixelTopology::getLayer(*pdet); - if (il != ol) - ++nl; - ol = il; - } - return nl; - } - - // State at the Beam spot - // phi,tip,1/pt,cotan(theta),zip - TrajectoryStateSoAT stateAtBS; - eigenSoA::ScalarSoA eta; - eigenSoA::ScalarSoA pt; - constexpr float charge(int32_t i) const { return std::copysign(1.f, stateAtBS.state(i)(2)); } - constexpr float phi(int32_t i) const { return stateAtBS.state(i)(0); } - constexpr float tip(int32_t i) const { return stateAtBS.state(i)(1); } - constexpr float zip(int32_t i) const { return stateAtBS.state(i)(4); } - - // state at the detector of the outermost hit - // representation to be decided... 
- // not yet filled on GPU - // TrajectoryStateSoA stateAtOuterDet; - - HitContainer hitIndices; - HitContainer detIndices; - -private: - int nTracks_; -}; - -namespace pixelTrack { - - template - using TrackSoAT = TrackSoAHeterogeneousT; - - template - using HitContainerT = typename TrackSoAHeterogeneousT::HitContainer; - - //Used only to ease classes definitions - using TrackSoAPhase1 = TrackSoAHeterogeneousT; - using TrackSoAPhase2 = TrackSoAHeterogeneousT; - - template - struct QualityCutsT {}; - - template - struct QualityCutsT> { - // chi2 cut = chi2Scale * (chi2Coeff[0] + pT/GeV * (chi2Coeff[1] + pT/GeV * (chi2Coeff[2] + pT/GeV * chi2Coeff[3]))) - float chi2Coeff[4]; - float chi2MaxPt; // GeV - float chi2Scale; - - struct Region { - float maxTip; // cm - float minPt; // GeV - float maxZip; // cm - }; - - Region triplet; - Region quadruplet; - - __device__ __forceinline__ bool isHP(TrackSoAHeterogeneousT const *__restrict__ tracks, - int nHits, - int it) const { - // impose "region cuts" based on the fit results (phi, Tip, pt, cotan(theta)), Zip) - // default cuts: - // - for triplets: |Tip| < 0.3 cm, pT > 0.5 GeV, |Zip| < 12.0 cm - // - for quadruplets: |Tip| < 0.5 cm, pT > 0.3 GeV, |Zip| < 12.0 cm - // (see CAHitNtupletGeneratorGPU.cc) - auto const ®ion = (nHits > 3) ? 
quadruplet : triplet; - return (std::abs(tracks->tip(it)) < region.maxTip) and (tracks->pt(it) > region.minPt) and - (std::abs(tracks->zip(it)) < region.maxZip); - } - - __device__ __forceinline__ bool strictCut(TrackSoAHeterogeneousT const *__restrict__ tracks, - int it) const { - auto roughLog = [](float x) { - // max diff [0.5,12] at 1.25 0.16143 - // average diff 0.0662998 - union IF { - uint32_t i; - float f; - }; - IF z; - z.f = x; - uint32_t lsb = 1 < 21; - z.i += lsb; - z.i >>= 21; - auto f = z.i & 3; - int ex = int(z.i >> 2) - 127; - - // log2(1+0.25*f) - // averaged over bins - const float frac[4] = {0.160497f, 0.452172f, 0.694562f, 0.901964f}; - return float(ex) + frac[f]; - }; - - float pt = std::min(tracks->pt(it), chi2MaxPt); - float chi2Cut = chi2Scale * (chi2Coeff[0] + roughLog(pt) * chi2Coeff[1]); - if (tracks->chi2(it) >= chi2Cut) { -#ifdef NTUPLE_FIT_DEBUG - printf("Bad chi2 %d pt %f eta %f chi2 %f\n", it, tracks->pt(it), tracks->eta(it), tracks->chi2(it)); -#endif - return true; - } - return false; - } - }; - - template - struct QualityCutsT> { - float maxChi2; - float minPt; - float maxTip; - float maxZip; - - __device__ __forceinline__ bool isHP(TrackSoAHeterogeneousT const *__restrict__ tracks, - int nHits, - int it) const { - return (std::abs(tracks->tip(it)) < maxTip) and (tracks->pt(it) > minPt) and (std::abs(tracks->zip(it)) < maxZip); - } - __device__ __forceinline__ bool strictCut(TrackSoAHeterogeneousT const *__restrict__ tracks, - int it) const { - return tracks->chi2(it) >= maxChi2; - } - }; - -} // namespace pixelTrack - -#endif // CUDADataFormats_Track_TrackHeterogeneousT_H +// #ifndef CUDADataFormats_Track_TrackHeterogeneousT_H +// #define CUDADataFormats_Track_TrackHeterogeneousT_H +// +// #include +// #include +// +// #include "CUDADataFormats/Track/interface/TrajectoryStateSoAT.h" +// #include "HeterogeneousCore/CUDAUtilities/interface/HistoContainer.h" +// #include "Geometry/CommonTopologies/interface/SimplePixelTopology.h" 
+// #include "CUDADataFormats/Common/interface/HeterogeneousSoA.h" +// #include "DataFormats/Common/interface/CMS_CLASS_VERSION.h" +// +// namespace pixelTrack { +// +// enum class Quality : uint8_t { bad = 0, edup, dup, loose, strict, tight, highPurity, notQuality }; +// constexpr uint32_t qualitySize{uint8_t(Quality::notQuality)}; +// const std::string qualityName[qualitySize]{"bad", "edup", "dup", "loose", "strict", "tight", "highPurity"}; +// inline Quality qualityByName(std::string const &name) { +// auto qp = std::find(qualityName, qualityName + qualitySize, name) - qualityName; +// return static_cast(qp); +// } +// +// } // namespace pixelTrack +// +// template +// class TrackSoAHeterogeneousT { +// public: +// static constexpr int32_t S = TrackerTraits::maxNumberOfTuples; +// static constexpr int32_t H = TrackerTraits::maxHitsOnTrack; // Average hits rather than max? +// static constexpr int32_t stride() { return S; } +// +// using hindex_type = uint32_t; //TrackerTraits::hindex_type ? +// +// using Quality = pixelTrack::Quality; +// using HitContainer = cms::cuda::OneToManyAssoc; +// +// // Always check quality is at least loose! +// // CUDA does not support enums in __lgc ... 
+// protected: +// eigenSoA::ScalarSoA quality_; +// +// public: +// constexpr Quality quality(int32_t i) const { return (Quality)(quality_(i)); } +// constexpr Quality &quality(int32_t i) { return (Quality &)(quality_(i)); } +// constexpr Quality const *qualityData() const { return (Quality const *)(quality_.data()); } +// constexpr Quality *qualityData() { return (Quality *)(quality_.data()); } +// +// // this is chi2/ndof as not necessarely all hits are used in the fit +// eigenSoA::ScalarSoA chi2; +// +// eigenSoA::ScalarSoA nLayers; +// +// constexpr int nTracks() const { return nTracks_; } +// constexpr void setNTracks(int n) { nTracks_ = n; } +// +// constexpr int nHits(int i) const { return detIndices.size(i); } +// +// constexpr bool isTriplet(int i) const { return nLayers(i) == 3; } +// +// constexpr int computeNumberOfLayers(int32_t i) const { +// // layers are in order and we assume tracks are either forward or backward +// auto pdet = detIndices.begin(i); +// int nl = 1; +// auto ol = pixelTopology::getLayer(*pdet); +// for (; pdet < detIndices.end(i); ++pdet) { +// auto il = pixelTopology::getLayer(*pdet); +// if (il != ol) +// ++nl; +// ol = il; +// } +// return nl; +// } +// +// // State at the Beam spot +// // phi,tip,1/pt,cotan(theta),zip +// TrajectoryStateSoAT stateAtBS; +// eigenSoA::ScalarSoA eta; +// eigenSoA::ScalarSoA pt; +// constexpr float charge(int32_t i) const { return std::copysign(1.f, stateAtBS.state(i)(2)); } +// constexpr float phi(int32_t i) const { return stateAtBS.state(i)(0); } +// constexpr float tip(int32_t i) const { return stateAtBS.state(i)(1); } +// constexpr float zip(int32_t i) const { return stateAtBS.state(i)(4); } +// +// // state at the detector of the outermost hit +// // representation to be decided... 
+// // not yet filled on GPU +// // TrajectoryStateSoA stateAtOuterDet; +// +// HitContainer hitIndices; +// HitContainer detIndices; +// +// private: +// int nTracks_; +// }; +// +// namespace pixelTrack { +// +// template +// using TrackSoAT = TrackSoAHeterogeneousT; +// +// template +// using HitContainerT = typename TrackSoAHeterogeneousT::HitContainer; +// +// //Used only to ease classes definitions +// using TrackSoAPhase1 = TrackSoAHeterogeneousT; +// using TrackSoAPhase2 = TrackSoAHeterogeneousT; +// +// template +// struct QualityCutsT {}; +// +// template +// struct QualityCutsT> { +// // chi2 cut = chi2Scale * (chi2Coeff[0] + pT/GeV * (chi2Coeff[1] + pT/GeV * (chi2Coeff[2] + pT/GeV * chi2Coeff[3]))) +// float chi2Coeff[4]; +// float chi2MaxPt; // GeV +// float chi2Scale; +// +// struct Region { +// float maxTip; // cm +// float minPt; // GeV +// float maxZip; // cm +// }; +// +// Region triplet; +// Region quadruplet; +// +// __device__ __forceinline__ bool isHP(TrackSoAHeterogeneousT const *__restrict__ tracks, +// int nHits, +// int it) const { +// // impose "region cuts" based on the fit results (phi, Tip, pt, cotan(theta)), Zip) +// // default cuts: +// // - for triplets: |Tip| < 0.3 cm, pT > 0.5 GeV, |Zip| < 12.0 cm +// // - for quadruplets: |Tip| < 0.5 cm, pT > 0.3 GeV, |Zip| < 12.0 cm +// // (see CAHitNtupletGeneratorGPU.cc) +// auto const ®ion = (nHits > 3) ? 
quadruplet : triplet; +// return (std::abs(tracks->tip(it)) < region.maxTip) and (tracks->pt(it) > region.minPt) and +// (std::abs(tracks->zip(it)) < region.maxZip); +// } +// +// __device__ __forceinline__ bool strictCut(TrackSoAHeterogeneousT const *__restrict__ tracks, +// int it) const { +// auto roughLog = [](float x) { +// // max diff [0.5,12] at 1.25 0.16143 +// // average diff 0.0662998 +// union IF { +// uint32_t i; +// float f; +// }; +// IF z; +// z.f = x; +// uint32_t lsb = 1 < 21; +// z.i += lsb; +// z.i >>= 21; +// auto f = z.i & 3; +// int ex = int(z.i >> 2) - 127; +// +// // log2(1+0.25*f) +// // averaged over bins +// const float frac[4] = {0.160497f, 0.452172f, 0.694562f, 0.901964f}; +// return float(ex) + frac[f]; +// }; +// +// float pt = std::min(tracks->pt(it), chi2MaxPt); +// float chi2Cut = chi2Scale * (chi2Coeff[0] + roughLog(pt) * chi2Coeff[1]); +// if (tracks->chi2(it) >= chi2Cut) { +// #ifdef NTUPLE_FIT_DEBUG +// printf("Bad chi2 %d pt %f eta %f chi2 %f\n", it, tracks->pt(it), tracks->eta(it), tracks->chi2(it)); +// #endif +// return true; +// } +// return false; +// } +// }; +// +// template +// struct QualityCutsT> { +// float maxChi2; +// float minPt; +// float maxTip; +// float maxZip; +// +// __device__ __forceinline__ bool isHP(TrackSoAHeterogeneousT const *__restrict__ tracks, +// int nHits, +// int it) const { +// return (std::abs(tracks->tip(it)) < maxTip) and (tracks->pt(it) > minPt) and (std::abs(tracks->zip(it)) < maxZip); +// } +// __device__ __forceinline__ bool strictCut(TrackSoAHeterogeneousT const *__restrict__ tracks, +// int it) const { +// return tracks->chi2(it) >= maxChi2; +// } +// }; +// +// } // namespace pixelTrack +// +// #endif // CUDADataFormats_Track_TrackHeterogeneousT_H diff --git a/CUDADataFormats/Track/interface/TrajectoryStateSoAT.h b/CUDADataFormats/Track/interface/TrajectoryStateSoAT.h index 64fcd573a6991..73983380c5e19 100644 --- a/CUDADataFormats/Track/interface/TrajectoryStateSoAT.h +++ 
b/CUDADataFormats/Track/interface/TrajectoryStateSoAT.h @@ -1,59 +1,59 @@ -#ifndef CUDADataFormats_Track_TrajectoryStateSOAT_H -#define CUDADataFormats_Track_TrajectoryStateSOAT_H - -#include -#include "HeterogeneousCore/CUDAUtilities/interface/eigenSoA.h" - -template -struct TrajectoryStateSoAT { - using Vector5f = Eigen::Matrix; - using Vector15f = Eigen::Matrix; - - using Vector5d = Eigen::Matrix; - using Matrix5d = Eigen::Matrix; - - static constexpr int32_t stride() { return S; } - - eigenSoA::MatrixSoA state; - eigenSoA::MatrixSoA covariance; - - template - __host__ __device__ inline void copyFromCircle( - V3 const& cp, M3 const& ccov, V2 const& lp, M2 const& lcov, float b, int32_t i) { - state(i) << cp.template cast(), lp.template cast(); - state(i)(2) *= b; - auto cov = covariance(i); - cov(0) = ccov(0, 0); - cov(1) = ccov(0, 1); - cov(2) = b * float(ccov(0, 2)); - cov(4) = cov(3) = 0; - cov(5) = ccov(1, 1); - cov(6) = b * float(ccov(1, 2)); - cov(8) = cov(7) = 0; - cov(9) = b * b * float(ccov(2, 2)); - cov(11) = cov(10) = 0; - cov(12) = lcov(0, 0); - cov(13) = lcov(0, 1); - cov(14) = lcov(1, 1); - } - - template - __host__ __device__ inline void copyFromDense(V5 const& v, M5 const& cov, int32_t i) { - state(i) = v.template cast(); - for (int j = 0, ind = 0; j < 5; ++j) - for (auto k = j; k < 5; ++k) - covariance(i)(ind++) = cov(j, k); - } - - template - __host__ __device__ inline void copyToDense(V5& v, M5& cov, int32_t i) const { - v = state(i).template cast(); - for (int j = 0, ind = 0; j < 5; ++j) { - cov(j, j) = covariance(i)(ind++); - for (auto k = j + 1; k < 5; ++k) - cov(k, j) = cov(j, k) = covariance(i)(ind++); - } - } -}; - -#endif // CUDADataFormats_Track_TrajectoryStateSOAT_H +// #ifndef CUDADataFormats_Track_TrajectoryStateSOAT_H +// #define CUDADataFormats_Track_TrajectoryStateSOAT_H +// +// #include +// #include "HeterogeneousCore/CUDAUtilities/interface/eigenSoA.h" +// +// template +// struct TrajectoryStateSoAT { +// using Vector5f = 
Eigen::Matrix; +// using Vector15f = Eigen::Matrix; +// +// using Vector5d = Eigen::Matrix; +// using Matrix5d = Eigen::Matrix; +// +// static constexpr int32_t stride() { return S; } +// +// eigenSoA::MatrixSoA state; +// eigenSoA::MatrixSoA covariance; +// +// template +// __host__ __device__ inline void copyFromCircle( +// V3 const& cp, M3 const& ccov, V2 const& lp, M2 const& lcov, float b, int32_t i) { +// state(i) << cp.template cast(), lp.template cast(); +// state(i)(2) *= b; +// auto cov = covariance(i); +// cov(0) = ccov(0, 0); +// cov(1) = ccov(0, 1); +// cov(2) = b * float(ccov(0, 2)); +// cov(4) = cov(3) = 0; +// cov(5) = ccov(1, 1); +// cov(6) = b * float(ccov(1, 2)); +// cov(8) = cov(7) = 0; +// cov(9) = b * b * float(ccov(2, 2)); +// cov(11) = cov(10) = 0; +// cov(12) = lcov(0, 0); +// cov(13) = lcov(0, 1); +// cov(14) = lcov(1, 1); +// } +// +// template +// __host__ __device__ inline void copyFromDense(V5 const& v, M5 const& cov, int32_t i) { +// state(i) = v.template cast(); +// for (int j = 0, ind = 0; j < 5; ++j) +// for (auto k = j; k < 5; ++k) +// covariance(i)(ind++) = cov(j, k); +// } +// +// template +// __host__ __device__ inline void copyToDense(V5& v, M5& cov, int32_t i) const { +// v = state(i).template cast(); +// for (int j = 0, ind = 0; j < 5; ++j) { +// cov(j, j) = covariance(i)(ind++); +// for (auto k = j + 1; k < 5; ++k) +// cov(k, j) = cov(j, k) = covariance(i)(ind++); +// } +// } +// }; +// +// #endif // CUDADataFormats_Track_TrajectoryStateSOAT_H diff --git a/CUDADataFormats/Track/src/classes.h b/CUDADataFormats/Track/src/classes.h index 97c116f6c88d3..c17f2a43b9e1f 100644 --- a/CUDADataFormats/Track/src/classes.h +++ b/CUDADataFormats/Track/src/classes.h @@ -3,7 +3,11 @@ #include "CUDADataFormats/Common/interface/Product.h" #include "CUDADataFormats/Common/interface/HostProduct.h" -#include "CUDADataFormats/Track/interface/TrackSoAHeterogeneousT.h" +// #include "CUDADataFormats/Track/interface/TrackSoAHeterogeneousT.h" + 
+#include "CUDADataFormats/Track/interface/TrackSoAHeterogeneousDevice.h" +#include "CUDADataFormats/Track/interface/TrackSoAHeterogeneousHost.h" + #include "DataFormats/Common/interface/Wrapper.h" #endif // CUDADataFormats_Track_src_classes_h diff --git a/CUDADataFormats/Track/src/classes_def.xml b/CUDADataFormats/Track/src/classes_def.xml index 5216c19dded65..fed1dac54677c 100644 --- a/CUDADataFormats/Track/src/classes_def.xml +++ b/CUDADataFormats/Track/src/classes_def.xml @@ -1,6 +1,6 @@ - + + + + + + + + + + + + + + + + + + diff --git a/CUDADataFormats/Track/test/BuildFile.xml b/CUDADataFormats/Track/test/BuildFile.xml index fc78783db473b..14aa505815935 100644 --- a/CUDADataFormats/Track/test/BuildFile.xml +++ b/CUDADataFormats/Track/test/BuildFile.xml @@ -1,19 +1,8 @@ - - - - - - - - - - + + - - - - - + + diff --git a/CUDADataFormats/Track/test/TrackSoAHeterogeneousImpl_test.h b/CUDADataFormats/Track/test/TrackSoAHeterogeneousImpl_test.h new file mode 100644 index 0000000000000..61bf4283d63be --- /dev/null +++ b/CUDADataFormats/Track/test/TrackSoAHeterogeneousImpl_test.h @@ -0,0 +1,49 @@ +#include "CUDADataFormats/Track/interface/TrackSoAHeterogeneousDevice.h" +#include "CUDADataFormats/Track/interface/TrackSoAHeterogeneousHost.h" +#include "CUDADataFormats/Track/interface/PixelTrackUtilities.h" +#include "HeterogeneousCore/CUDAUtilities/interface/OneToManyAssoc.h" +#include "HeterogeneousCore/CUDAUtilities/interface/cudaCheck.h" + +namespace testTrackSoA { + + // Kernel which fills the TrackSoAView with data + // to test writing to it + template< typename TrackerTraits> + __global__ void fill(TrackSoAView tracks_view) { + int i = threadIdx.x; + if (i == 0) { + tracks_view.nTracks() = 420; + } + + for (int j = i; j < tracks_view.metadata().size(); j += blockDim.x) { + tracks_view[j].pt() = (float)j; + tracks_view[j].eta() = (float)j; + tracks_view[j].chi2() = (float)j; + tracks_view[j].quality() = (pixelTrackSoA::Quality)(j % 256); + tracks_view[j].nLayers() 
= j % 128; + tracks_view.hitIndices().off[j] = j; + } + } + + // Kernel which reads from the TrackSoAView to verify + // that it was written correctly from the fill kernel + // TODO: Use TrackSoAConstView when https://github.com/cms-sw/cmssw/pull/39919 is merged + template< typename TrackerTraits> + __global__ void verify(TrackSoAView tracks_view) { + int i = threadIdx.x; + + if (i == 0) { + printf("SoA size: % d, block dims: % d\n", tracks_view.metadata().size(), blockDim.x); + assert(tracks_view.nTracks() == 420); + } + for (int j = i; j < tracks_view.metadata().size(); j += blockDim.x) { + assert(abs(tracks_view[j].pt() - (float)j) < .0001); + assert(abs(tracks_view[j].eta() - (float)j) < .0001); + assert(abs(tracks_view[j].chi2() - (float)j) < .0001); + assert(tracks_view[j].quality() == (pixelTrackSoA::Quality)(j % 256)); + assert(tracks_view[j].nLayers() == j % 128); + assert(tracks_view.hitIndices().off[j] == j); + } + } + +} // namespace testTrackSoA diff --git a/CUDADataFormats/Track/test/TrackSoAHeterogeneous_t.cpp b/CUDADataFormats/Track/test/TrackSoAHeterogeneous_t.cpp index 9708b689dd05b..359f1e1e50cb8 100644 --- a/CUDADataFormats/Track/test/TrackSoAHeterogeneous_t.cpp +++ b/CUDADataFormats/Track/test/TrackSoAHeterogeneous_t.cpp @@ -1,4 +1,4 @@ -#include "CUDADataFormats/Track/interface/TrackSoAHeterogeneousT.h" +/*#include "CUDADataFormats/Track/interface/TrackSoAHeterogeneousT.h" #include #include @@ -6,16 +6,16 @@ int main() { // test quality - auto q = pixelTrack::qualityByName("tight"); - assert(pixelTrack::Quality::tight == q); - q = pixelTrack::qualityByName("toght"); - assert(pixelTrack::Quality::notQuality == q); + auto q = pixelTrackSoA::qualityByName("tight"); + assert(pixelTrackSoA::Quality::tight == q); + q = pixelTrackSoA::qualityByName("toght"); + assert(pixelTrackSoA::Quality::notQuality == q); - for (uint32_t i = 0; i < pixelTrack::qualitySize; ++i) { - auto const qt = static_cast(i); - auto q = 
pixelTrack::qualityByName(pixelTrack::qualityName[i]); + for (uint32_t i = 0; i < pixelTrackSoA::qualitySize; ++i) { + auto const qt = static_cast(i); + auto q = pixelTrackSoA::qualityByName(pixelTrackSoA::qualityName[i]); assert(qt == q); } return 0; -} +}*/ diff --git a/CUDADataFormats/Track/test/TrackSoAHeterogeneous_test.cpp b/CUDADataFormats/Track/test/TrackSoAHeterogeneous_test.cpp new file mode 100644 index 0000000000000..104cc4a34770f --- /dev/null +++ b/CUDADataFormats/Track/test/TrackSoAHeterogeneous_test.cpp @@ -0,0 +1,74 @@ +/** + Simple test for the pixelTrack::TrackSoA data structure + which inherits from PortableDeviceCollection. + + Creates an instance of the class (automatically allocates + memory on device), passes the view of the SoA data to + the CUDA kernels which: + - Fill the SoA with data. + - Verify that the data written is correct. + + Then, the SoA data are copied back to Host, where + a temporary host-side view (tmp_view) is created using + the same Layout to access the data on host and print it. + */ + +#include +#include "CUDADataFormats/Track/interface/TrackSoAHeterogeneousDevice.h" +#include "CUDADataFormats/Track/interface/TrackSoAHeterogeneousHost.h" +#include "HeterogeneousCore/CUDAUtilities/interface/requireDevices.h" +#include "HeterogeneousCore/CUDAUtilities/interface/cudaCheck.h" + +#include "Geometry/CommonTopologies/interface/SimplePixelTopology.h" + +namespace testTrackSoA { + + template + void runKernels(TrackSoAView tracks_view, cudaStream_t stream); +} + +int main() { + cms::cudatest::requireDevices(); + + cudaStream_t stream; + cudaCheck(cudaStreamCreateWithFlags(&stream, cudaStreamNonBlocking)); + + // Inner scope to deallocate memory before destroying the stream + { + // Instantiate tracks on device. PortableDeviceCollection allocates + // SoA on device automatically. + TrackSoAHeterogeneousDevice tracks_d(stream); + testTrackSoA::runKernels(tracks_d.view(), stream); + + // Instantate tracks on host. 
This is where the data will be + // copied to from device. + TrackSoAHeterogeneousHost tracks_h(stream); + + cudaCheck(cudaMemcpyAsync( + tracks_h.buffer().get(), tracks_d.const_buffer().get(), tracks_d.bufferSize(), cudaMemcpyDeviceToHost, stream)); + cudaCheck(cudaGetLastError()); + cudaCheck(cudaDeviceSynchronize()); + + // Print results + std::cout << "pt" + << "\t" + << "eta" + << "\t" + << "chi2" + << "\t" + << "quality" + << "\t" + << "nLayers" + << "\t" + << "hitIndices off" << std::endl; + + for (int i = 0; i < 10; ++i) { + std::cout << tracks_h.view()[i].pt() << "\t" << tracks_h.view()[i].eta() << "\t" << tracks_h.view()[i].chi2() + << "\t" << (int)tracks_h.view()[i].quality() << "\t" << (int)tracks_h.view()[i].nLayers() << "\t" + << tracks_h.view().hitIndices().off[i] << std::endl; + } + } + cudaCheck(cudaStreamDestroy(stream)); + + return 0; +} diff --git a/CUDADataFormats/Track/test/TrackSoAHeterogeneous_test.cu b/CUDADataFormats/Track/test/TrackSoAHeterogeneous_test.cu new file mode 100644 index 0000000000000..71b2c78960cb6 --- /dev/null +++ b/CUDADataFormats/Track/test/TrackSoAHeterogeneous_test.cu @@ -0,0 +1,23 @@ +#include "CUDADataFormats/Track/interface/PixelTrackUtilities.h" +#include "HeterogeneousCore/CUDAUtilities/interface/OneToManyAssoc.h" +#include "HeterogeneousCore/CUDAUtilities/interface/cudaCheck.h" +#include "TrackSoAHeterogeneousImpl_test.h" + +namespace testTrackSoA { + + // Host function which invokes the two kernels above + template< typename TrackerTraits> + void runKernels(TrackSoAView tracks_view, cudaStream_t stream) { + fill<<<1, 1024, 0, stream>>>(tracks_view); + cudaCheck(cudaGetLastError()); + cudaCheck(cudaDeviceSynchronize()); + + verify<<<1, 1024, 0, stream>>>(tracks_view); + cudaCheck(cudaGetLastError()); + cudaCheck(cudaDeviceSynchronize()); + } + + template void runKernels(TrackSoAView tracks_view, cudaStream_t stream); + template void runKernels(TrackSoAView tracks_view, cudaStream_t stream); + +} // namespace 
testTrackSoAHeterogeneousT diff --git a/CUDADataFormats/TrackingRecHit/interface/SiPixelHitStatus.h b/CUDADataFormats/TrackingRecHit/interface/SiPixelHitStatus.h index b3bdade5ec97c..13322ce3952b7 100644 --- a/CUDADataFormats/TrackingRecHit/interface/SiPixelHitStatus.h +++ b/CUDADataFormats/TrackingRecHit/interface/SiPixelHitStatus.h @@ -12,4 +12,9 @@ struct SiPixelHitStatus { uint8_t qBin : 3; // ∈[0,1,...,7] }; +struct SiPixelHitStatusAndCharge { + SiPixelHitStatus status; + uint32_t charge : 24; +}; + #endif diff --git a/CUDADataFormats/TrackingRecHit/interface/TrackingRecHit2DHeterogeneous.h b/CUDADataFormats/TrackingRecHit/interface/TrackingRecHit2DHeterogeneous.h index ad78daa8354e2..5bbc8359a58bc 100644 --- a/CUDADataFormats/TrackingRecHit/interface/TrackingRecHit2DHeterogeneous.h +++ b/CUDADataFormats/TrackingRecHit/interface/TrackingRecHit2DHeterogeneous.h @@ -1,384 +1,384 @@ -#ifndef CUDADataFormats_TrackingRecHit_interface_TrackingRecHit2DHeterogeneous_h -#define CUDADataFormats_TrackingRecHit_interface_TrackingRecHit2DHeterogeneous_h - -#include "CUDADataFormats/TrackingRecHit/interface/TrackingRecHit2DSOAView.h" -#include "CUDADataFormats/Common/interface/HeterogeneousSoA.h" -#include "RecoLocalTracker/SiPixelRecHits/interface/pixelCPEforGPU.h" -#include "Geometry/CommonTopologies/interface/SimplePixelTopology.h" -#include "DataFormats/Common/interface/CMS_CLASS_VERSION.h" - -namespace { - enum class Storage32 { - kXLocal = 0, - kYLocal = 1, - kXerror = 2, - kYerror = 3, - kCharge = 4, - kXGlobal = 5, - kYGlobal = 6, - kZGlobal = 7, - kRGlobal = 8, - kPhiStorage = 9, - kLayers = 10 - }; - - enum class Storage16 { - kDetId = 0, - kPhi = 1, - kXSize = 2, - kYSize = 3, - }; -} // namespace - -template -class TrackingRecHit2DHeterogeneousT { -public: - template - friend class TrackingRecHit2DHostT; - - template - using unique_ptr = typename Traits::template unique_ptr; - - using TrackingRecHit2DSOAView = TrackingRecHit2DSOAViewT; - using PhiBinner = 
typename TrackingRecHit2DSOAView::PhiBinner; - using AverageGeometry = typename TrackingRecHit2DSOAView::AverageGeometry; - - TrackingRecHit2DHeterogeneousT() = default; - - explicit TrackingRecHit2DHeterogeneousT(uint32_t nHits, - int32_t offsetBPIX2, - pixelCPEforGPU::ParamsOnGPUT const* cpeParams, - uint32_t const* hitsModuleStart, - cudaStream_t stream = nullptr); - - explicit TrackingRecHit2DHeterogeneousT(cms::cuda::host::unique_ptr& store32, - cms::cuda::host::unique_ptr& store16, - uint32_t* modules, - int nHits, - cudaStream_t stream = nullptr); - ~TrackingRecHit2DHeterogeneousT() = default; - - TrackingRecHit2DHeterogeneousT(const TrackingRecHit2DHeterogeneousT&) = delete; - TrackingRecHit2DHeterogeneousT& operator=(const TrackingRecHit2DHeterogeneousT&) = delete; - TrackingRecHit2DHeterogeneousT(TrackingRecHit2DHeterogeneousT&&) = default; - TrackingRecHit2DHeterogeneousT& operator=(TrackingRecHit2DHeterogeneousT&&) = default; - - TrackingRecHit2DSOAView* view() { return m_view.get(); } - TrackingRecHit2DSOAView const* view() const { return m_view.get(); } - - auto nHits() const { return m_nHits; } - auto offsetBPIX2() const { return m_offsetBPIX2; } - - auto hitsModuleStart() const { return m_hitsModuleStart; } - auto hitsLayerStart() { return m_hitsLayerStart; } - auto phiBinner() { return m_phiBinner; } - auto phiBinnerStorage() { return m_phiBinnerStorage; } - auto iphi() { return m_iphi; } - - cms::cuda::host::unique_ptr localCoordToHostAsync(cudaStream_t stream) const; - - cms::cuda::host::unique_ptr hitsModuleStartToHostAsync(cudaStream_t stream) const; - - cms::cuda::host::unique_ptr store16ToHostAsync(cudaStream_t stream) const; - cms::cuda::host::unique_ptr store32ToHostAsync(cudaStream_t stream) const; - -protected: - static constexpr uint32_t n16 = 4; // number of elements in m_store16 - static constexpr uint32_t n32 = 10; // number of elements in m_store32 - static_assert(sizeof(uint32_t) == sizeof(float)); // just stating the obvious - 
static_assert(n32 == static_cast(Storage32::kLayers)); - unique_ptr m_store16; //! - unique_ptr m_store32; //! - - unique_ptr m_PhiBinnerStore; //! - unique_ptr m_AverageGeometryStore; //! - - unique_ptr m_view; //! - - uint32_t m_nHits; - int32_t m_offsetBPIX2; - - uint32_t const* m_hitsModuleStart; // needed for legacy, this is on GPU! - - // needed as kernel params... - PhiBinner* m_phiBinner; - typename PhiBinner::index_type* m_phiBinnerStorage; - uint32_t* m_hitsLayerStart; - int16_t* m_iphi; -}; - -//Inherit and overload only what we need to overload, remember to use this-> -//GPU -template -class TrackingRecHit2DGPUT : public TrackingRecHit2DHeterogeneousT { -public: - using TrackingRecHit2DHeterogeneousT::TrackingRecHit2DHeterogeneousT; - - cms::cuda::host::unique_ptr localCoordToHostAsync(cudaStream_t stream) const; - cms::cuda::host::unique_ptr hitsModuleStartToHostAsync(cudaStream_t stream) const; - cms::cuda::host::unique_ptr store16ToHostAsync(cudaStream_t stream) const; - cms::cuda::host::unique_ptr store32ToHostAsync(cudaStream_t stream) const; -}; - -//CPU -template -class TrackingRecHit2DCPUT : public TrackingRecHit2DHeterogeneousT { -public: - using TrackingRecHit2DHeterogeneousT::TrackingRecHit2DHeterogeneousT; - - cms::cuda::host::unique_ptr hitsModuleStartToHostAsync(cudaStream_t stream) const; - cms::cuda::host::unique_ptr store16ToHostAsync(cudaStream_t stream) const; - cms::cuda::host::unique_ptr store32ToHostAsync(cudaStream_t stream) const; -}; - -//HOST -template -class TrackingRecHit2DHostT : public TrackingRecHit2DHeterogeneousT { -public: - ~TrackingRecHit2DHostT() = default; - TrackingRecHit2DHostT() = default; - - explicit TrackingRecHit2DHostT(uint32_t nHits, - int32_t offsetBPIX2, - pixelCPEforGPU::ParamsOnGPUT const* cpeParams, - uint32_t const* hitsModuleStart, - cudaStream_t stream = nullptr) - : TrackingRecHit2DHeterogeneousT( - nHits, offsetBPIX2, cpeParams, hitsModuleStart, stream) {} - - explicit 
TrackingRecHit2DHostT(cms::cuda::host::unique_ptr& store32, - cms::cuda::host::unique_ptr& store16, - uint32_t* modules, - int nHits, - cudaStream_t stream = nullptr) - : TrackingRecHit2DHeterogeneousT( - store32, store16, modules, nHits, stream) {} - - explicit TrackingRecHit2DHostT(uint32_t nHits, - int32_t offsetBPIX2, - pixelCPEforGPU::ParamsOnGPUT const* cpeParams, - uint32_t const* hitsModuleStart, - cudaStream_t stream, - TrackingRecHit2DHeterogeneousT const* input); -}; - -#include "HeterogeneousCore/CUDAUtilities/interface/copyAsync.h" -#include "HeterogeneousCore/CUDAUtilities/interface/cudaCheck.h" - -template -TrackingRecHit2DHeterogeneousT::TrackingRecHit2DHeterogeneousT( - uint32_t nHits, - int32_t offsetBPIX2, - pixelCPEforGPU::ParamsOnGPUT const* cpeParams, - uint32_t const* hitsModuleStart, - cudaStream_t stream) - : m_nHits(nHits), m_offsetBPIX2(offsetBPIX2), m_hitsModuleStart(hitsModuleStart) { - using TrackingRecHit2DSOAView = TrackingRecHit2DSOAViewT; - - auto view = Traits::template make_host_unique(stream); - - view->m_nHits = nHits; - m_view = Traits::template make_unique(stream); // leave it on host and pass it by value? - m_AverageGeometryStore = Traits::template make_unique(stream); - view->m_averageGeometry = m_AverageGeometryStore.get(); - view->m_cpeParams = cpeParams; - view->m_hitsModuleStart = hitsModuleStart; - - // if empy do not bother - if (0 == nHits) { - if constexpr (std::is_same_v) { - cms::cuda::copyAsync(m_view, view, stream); - } else { - m_view.reset(view.release()); // NOLINT: std::move() breaks CUDA version - } - return; - } - - // the single arrays are not 128 bit alligned... 
- // the hits are actually accessed in order only in building - // if ordering is relevant they may have to be stored phi-ordered by layer or so - // this will break 1to1 correspondence with cluster and module locality - // so unless proven VERY inefficient we keep it ordered as generated - - m_store16 = Traits::template make_unique(nHits * n16, stream); - m_store32 = Traits::template make_unique(nHits * n32 + TrackerTraits::numberOfLayers + 1, stream); - m_PhiBinnerStore = Traits::template make_unique(stream); - - static_assert(sizeof(typename TrackingRecHit2DSOAView::hindex_type) == sizeof(float)); - static_assert(sizeof(typename TrackingRecHit2DSOAView::hindex_type) == - sizeof(typename TrackingRecHit2DSOAView::PhiBinner::index_type)); - - auto get32 = [&](Storage32 i) { return m_store32.get() + static_cast(i) * nHits; }; - - // copy all the pointers - m_phiBinner = view->m_phiBinner = m_PhiBinnerStore.get(); - m_phiBinnerStorage = view->m_phiBinnerStorage = - reinterpret_cast(get32(Storage32::kPhiStorage)); - - view->m_xl = get32(Storage32::kXLocal); - view->m_yl = get32(Storage32::kYLocal); - view->m_xerr = get32(Storage32::kXerror); - view->m_yerr = get32(Storage32::kYerror); - view->m_chargeAndStatus = reinterpret_cast(get32(Storage32::kCharge)); - - view->m_xg = get32(Storage32::kXGlobal); - view->m_yg = get32(Storage32::kYGlobal); - view->m_zg = get32(Storage32::kZGlobal); - view->m_rg = get32(Storage32::kRGlobal); - - auto get16 = [&](Storage16 i) { return m_store16.get() + static_cast(i) * nHits; }; - m_iphi = view->m_iphi = reinterpret_cast(get16(Storage16::kPhi)); - - view->m_xsize = reinterpret_cast(get16(Storage16::kXSize)); - view->m_ysize = reinterpret_cast(get16(Storage16::kYSize)); - view->m_detInd = get16(Storage16::kDetId); - - m_phiBinner = view->m_phiBinner = m_PhiBinnerStore.get(); - m_hitsLayerStart = view->m_hitsLayerStart = reinterpret_cast(get32(Storage32::kLayers)); - - // transfer view - if constexpr (std::is_same_v) { - 
cms::cuda::copyAsync(m_view, view, stream); - } else { - m_view.reset(view.release()); // NOLINT: std::move() breaks CUDA version - } -} - -template -TrackingRecHit2DHostT::TrackingRecHit2DHostT( - uint32_t nHits, - int32_t offsetBPIX2, - pixelCPEforGPU::ParamsOnGPUT const* cpeParams, - uint32_t const* hitsModuleStart, - cudaStream_t stream, - TrackingRecHit2DHeterogeneousT const* input) { - using TrackingRecHit2DSOAView = TrackingRecHit2DSOAViewT; - - this->m_nHits = nHits; - this->m_offsetBPIX2 = offsetBPIX2; - this->m_hitsModuleStart = hitsModuleStart; - - auto view = cms::cuda::make_host_unique(stream); - - view->m_nHits = nHits; - this->m_view = - cms::cuda::make_host_unique(stream); // leave it on host and pass it by value? - this->m_AverageGeometryStore = cms::cuda::make_host_unique(stream); - view->m_averageGeometry = this->m_AverageGeometryStore.get(); - view->m_cpeParams = cpeParams; - view->m_hitsModuleStart = hitsModuleStart; - - // if empy do not bother - if (0 == nHits) { - this->m_view.reset(view.release()); // NOLINT: std::move() breaks CUDA version - return; - } - - this->m_store32 = cms::cuda::make_host_unique(5 * input->nHits(), stream); - cms::cuda::copyAsync(this->m_store32, input->m_store32, 5 * input->nHits(), stream); - - static_assert(sizeof(typename TrackingRecHit2DSOAView::hindex_type) == sizeof(float)); - static_assert(sizeof(typename TrackingRecHit2DSOAView::hindex_type) == - sizeof(typename TrackingRecHit2DSOAView::PhiBinner::index_type)); - - auto get32 = [&](Storage32 i) { return this->m_store32.get() + static_cast(i) * nHits; }; - - // copy all the pointers - this->m_phiBinner = view->m_phiBinner = this->m_PhiBinnerStore.get(); - this->m_phiBinnerStorage = view->m_phiBinnerStorage = - reinterpret_cast(get32(Storage32::kPhiStorage)); - - view->m_xl = get32(Storage32::kXLocal); - view->m_yl = get32(Storage32::kYLocal); - view->m_xerr = get32(Storage32::kXerror); - view->m_yerr = get32(Storage32::kYerror); - view->m_chargeAndStatus = 
reinterpret_cast(get32(Storage32::kCharge)); - - this->m_view = std::move(view); -} - -//this is intended to be used only for CPU SoA but doesn't hurt to have it for all cases -template -TrackingRecHit2DHeterogeneousT::TrackingRecHit2DHeterogeneousT( - cms::cuda::host::unique_ptr& store32, - cms::cuda::host::unique_ptr& store16, - uint32_t* modules, - int nHits, - cudaStream_t stream) - : m_nHits(nHits), m_hitsModuleStart(modules) { - auto view = Traits::template make_host_unique(stream); - - m_view = Traits::template make_unique(stream); - - view->m_nHits = nHits; - - if (0 == nHits) { - if constexpr (std::is_same_v) { - cms::cuda::copyAsync(m_view, view, stream); - } else { - m_view = std::move(view); - } - return; - } - - m_store16 = Traits::template make_unique(nHits * n16, stream); - m_store32 = Traits::template make_unique(nHits * n32, stream); - m_PhiBinnerStore = Traits::template make_unique(stream); - m_AverageGeometryStore = Traits::template make_unique(stream); - - view->m_averageGeometry = m_AverageGeometryStore.get(); - view->m_hitsModuleStart = m_hitsModuleStart; - - //store transfer - if constexpr (std::is_same_v) { - cms::cuda::copyAsync(m_store16, store16, static_cast(n16 * nHits), stream); - cms::cuda::copyAsync(m_store32, store32, static_cast(n32 * nHits), stream); - - } else { - std::copy(store32.get(), store32.get() + nHits * n32, m_store32.get()); // want to copy it - std::copy(store16.get(), store16.get() + nHits * n16, m_store16.get()); - } - - //getters - auto get32 = [&](Storage32 i) { return m_store32.get() + static_cast(i) * nHits; }; - auto get16 = [&](Storage16 i) { return m_store16.get() + static_cast(i) * nHits; }; - - //Store 32 - view->m_xl = get32(Storage32::kXLocal); - view->m_yl = get32(Storage32::kYLocal); - view->m_xerr = get32(Storage32::kXerror); - view->m_yerr = get32(Storage32::kYerror); - view->m_chargeAndStatus = reinterpret_cast(get32(Storage32::kCharge)); - view->m_xg = get32(Storage32::kXGlobal); - view->m_yg = 
get32(Storage32::kYGlobal); - view->m_zg = get32(Storage32::kZGlobal); - view->m_rg = get32(Storage32::kRGlobal); - - m_phiBinner = view->m_phiBinner = m_PhiBinnerStore.get(); - m_phiBinnerStorage = view->m_phiBinnerStorage = - reinterpret_cast(get32(Storage32::kPhiStorage)); - - //Store 16 - view->m_detInd = get16(Storage16::kDetId); - m_iphi = view->m_iphi = reinterpret_cast(get16(Storage16::kPhi)); - view->m_xsize = reinterpret_cast(get16(Storage16::kXSize)); - view->m_ysize = reinterpret_cast(get16(Storage16::kYSize)); - - // transfer view - if constexpr (std::is_same_v) { - cms::cuda::copyAsync(m_view, view, stream); - } else { - m_view = std::move(view); - } -} - -//Classes definition for Phase1/Phase2, to make the classes_def lighter. Not actually used in the code. -using TrackingRecHit2DGPUPhase1 = TrackingRecHit2DGPUT; -using TrackingRecHit2DCPUPhase1 = TrackingRecHit2DCPUT; -using TrackingRecHit2DHostPhase1 = TrackingRecHit2DHostT; - -using TrackingRecHit2DGPUPhase2 = TrackingRecHit2DGPUT; -using TrackingRecHit2DCPUPhase2 = TrackingRecHit2DCPUT; -using TrackingRecHit2DHostPhase2 = TrackingRecHit2DHostT; - -#endif // CUDADataFormats_TrackingRecHit_interface_TrackingRecHit2DHeterogeneousT_h +// #ifndef CUDADataFormats_TrackingRecHit_interface_TrackingRecHit2DHeterogeneous_h +// #define CUDADataFormats_TrackingRecHit_interface_TrackingRecHit2DHeterogeneous_h +// +// #include "CUDADataFormats/TrackingRecHit/interface/TrackingRecHit2DSOAView.h" +// #include "CUDADataFormats/Common/interface/HeterogeneousSoA.h" +// #include "RecoLocalTracker/SiPixelRecHits/interface/pixelCPEforGPU.h" +// #include "Geometry/CommonTopologies/interface/SimplePixelTopology.h" +// #include "DataFormats/Common/interface/CMS_CLASS_VERSION.h" +// +// namespace { +// enum class Storage32 { +// kXLocal = 0, +// kYLocal = 1, +// kXerror = 2, +// kYerror = 3, +// kCharge = 4, +// kXGlobal = 5, +// kYGlobal = 6, +// kZGlobal = 7, +// kRGlobal = 8, +// kPhiStorage = 9, +// kLayers = 10 +// 
}; +// +// enum class Storage16 { +// kDetId = 0, +// kPhi = 1, +// kXSize = 2, +// kYSize = 3, +// }; +// } // namespace +// +// template +// class TrackingRecHit2DHeterogeneousT { +// public: +// template +// friend class TrackingRecHit2DHostT; +// +// template +// using unique_ptr = typename Traits::template unique_ptr; +// +// using TrackingRecHit2DSOAView = TrackingRecHit2DSOAViewT; +// using PhiBinner = typename TrackingRecHit2DSOAView::PhiBinner; +// using AverageGeometry = typename TrackingRecHit2DSOAView::AverageGeometry; +// +// TrackingRecHit2DHeterogeneousT() = default; +// +// explicit TrackingRecHit2DHeterogeneousT(uint32_t nHits, +// int32_t offsetBPIX2, +// pixelCPEforGPU::ParamsOnGPUT const* cpeParams, +// uint32_t const* hitsModuleStart, +// cudaStream_t stream = nullptr); +// +// explicit TrackingRecHit2DHeterogeneousT(cms::cuda::host::unique_ptr& store32, +// cms::cuda::host::unique_ptr& store16, +// uint32_t* modules, +// int nHits, +// cudaStream_t stream = nullptr); +// ~TrackingRecHit2DHeterogeneousT() = default; +// +// TrackingRecHit2DHeterogeneousT(const TrackingRecHit2DHeterogeneousT&) = delete; +// TrackingRecHit2DHeterogeneousT& operator=(const TrackingRecHit2DHeterogeneousT&) = delete; +// TrackingRecHit2DHeterogeneousT(TrackingRecHit2DHeterogeneousT&&) = default; +// TrackingRecHit2DHeterogeneousT& operator=(TrackingRecHit2DHeterogeneousT&&) = default; +// +// TrackingRecHit2DSOAView* view() { return m_view.get(); } +// TrackingRecHit2DSOAView const* view() const { return m_view.get(); } +// +// auto nHits() const { return m_nHits; } +// auto offsetBPIX2() const { return m_offsetBPIX2; } +// +// auto hitsModuleStart() const { return m_hitsModuleStart; } +// auto hitsLayerStart() { return m_hitsLayerStart; } +// auto phiBinner() { return m_phiBinner; } +// auto phiBinnerStorage() { return m_phiBinnerStorage; } +// auto iphi() { return m_iphi; } +// +// cms::cuda::host::unique_ptr localCoordToHostAsync(cudaStream_t stream) const; +// 
+// cms::cuda::host::unique_ptr hitsModuleStartToHostAsync(cudaStream_t stream) const; +// +// cms::cuda::host::unique_ptr store16ToHostAsync(cudaStream_t stream) const; +// cms::cuda::host::unique_ptr store32ToHostAsync(cudaStream_t stream) const; +// +// protected: +// static constexpr uint32_t n16 = 4; // number of elements in m_store16 +// static constexpr uint32_t n32 = 10; // number of elements in m_store32 +// static_assert(sizeof(uint32_t) == sizeof(float)); // just stating the obvious +// static_assert(n32 == static_cast(Storage32::kLayers)); +// unique_ptr m_store16; //! +// unique_ptr m_store32; //! +// +// unique_ptr m_PhiBinnerStore; //! +// unique_ptr m_AverageGeometryStore; //! +// +// unique_ptr m_view; //! +// +// uint32_t m_nHits; +// int32_t m_offsetBPIX2; +// +// uint32_t const* m_hitsModuleStart; // needed for legacy, this is on GPU! +// +// // needed as kernel params... +// PhiBinner* m_phiBinner; +// typename PhiBinner::index_type* m_phiBinnerStorage; +// uint32_t* m_hitsLayerStart; +// int16_t* m_iphi; +// }; +// +// //Inherit and overload only what we need to overload, remember to use this-> +// //GPU +// template +// class TrackingRecHit2DGPUT : public TrackingRecHit2DHeterogeneousT { +// public: +// using TrackingRecHit2DHeterogeneousT::TrackingRecHit2DHeterogeneousT; +// +// cms::cuda::host::unique_ptr localCoordToHostAsync(cudaStream_t stream) const; +// cms::cuda::host::unique_ptr hitsModuleStartToHostAsync(cudaStream_t stream) const; +// cms::cuda::host::unique_ptr store16ToHostAsync(cudaStream_t stream) const; +// cms::cuda::host::unique_ptr store32ToHostAsync(cudaStream_t stream) const; +// }; +// +// //CPU +// template +// class TrackingRecHit2DCPUT : public TrackingRecHit2DHeterogeneousT { +// public: +// using TrackingRecHit2DHeterogeneousT::TrackingRecHit2DHeterogeneousT; +// +// cms::cuda::host::unique_ptr hitsModuleStartToHostAsync(cudaStream_t stream) const; +// cms::cuda::host::unique_ptr store16ToHostAsync(cudaStream_t 
stream) const; +// cms::cuda::host::unique_ptr store32ToHostAsync(cudaStream_t stream) const; +// }; +// +// //HOST +// template +// class TrackingRecHit2DHostT : public TrackingRecHit2DHeterogeneousT { +// public: +// ~TrackingRecHit2DHostT() = default; +// TrackingRecHit2DHostT() = default; +// +// explicit TrackingRecHit2DHostT(uint32_t nHits, +// int32_t offsetBPIX2, +// pixelCPEforGPU::ParamsOnGPUT const* cpeParams, +// uint32_t const* hitsModuleStart, +// cudaStream_t stream = nullptr) +// : TrackingRecHit2DHeterogeneousT( +// nHits, offsetBPIX2, cpeParams, hitsModuleStart, stream) {} +// +// explicit TrackingRecHit2DHostT(cms::cuda::host::unique_ptr& store32, +// cms::cuda::host::unique_ptr& store16, +// uint32_t* modules, +// int nHits, +// cudaStream_t stream = nullptr) +// : TrackingRecHit2DHeterogeneousT( +// store32, store16, modules, nHits, stream) {} +// +// explicit TrackingRecHit2DHostT(uint32_t nHits, +// int32_t offsetBPIX2, +// pixelCPEforGPU::ParamsOnGPUT const* cpeParams, +// uint32_t const* hitsModuleStart, +// cudaStream_t stream, +// TrackingRecHit2DHeterogeneousT const* input); +// }; +// +// #include "HeterogeneousCore/CUDAUtilities/interface/copyAsync.h" +// #include "HeterogeneousCore/CUDAUtilities/interface/cudaCheck.h" +// +// template +// TrackingRecHit2DHeterogeneousT::TrackingRecHit2DHeterogeneousT( +// uint32_t nHits, +// int32_t offsetBPIX2, +// pixelCPEforGPU::ParamsOnGPUT const* cpeParams, +// uint32_t const* hitsModuleStart, +// cudaStream_t stream) +// : m_nHits(nHits), m_offsetBPIX2(offsetBPIX2), m_hitsModuleStart(hitsModuleStart) { +// using TrackingRecHit2DSOAView = TrackingRecHit2DSOAViewT; +// +// auto view = Traits::template make_host_unique(stream); +// +// view->m_nHits = nHits; +// m_view = Traits::template make_unique(stream); // leave it on host and pass it by value? 
+// m_AverageGeometryStore = Traits::template make_unique(stream); +// view->m_averageGeometry = m_AverageGeometryStore.get(); +// view->m_cpeParams = cpeParams; +// view->m_hitsModuleStart = hitsModuleStart; +// +// // if empy do not bother +// if (0 == nHits) { +// if constexpr (std::is_same_v) { +// cms::cuda::copyAsync(m_view, view, stream); +// } else { +// m_view.reset(view.release()); // NOLINT: std::move() breaks CUDA version +// } +// return; +// } +// +// // the single arrays are not 128 bit alligned... +// // the hits are actually accessed in order only in building +// // if ordering is relevant they may have to be stored phi-ordered by layer or so +// // this will break 1to1 correspondence with cluster and module locality +// // so unless proven VERY inefficient we keep it ordered as generated +// +// m_store16 = Traits::template make_unique(nHits * n16, stream); +// m_store32 = Traits::template make_unique(nHits * n32 + TrackerTraits::numberOfLayers + 1, stream); +// m_PhiBinnerStore = Traits::template make_unique(stream); +// +// static_assert(sizeof(typename TrackingRecHit2DSOAView::hindex_type) == sizeof(float)); +// static_assert(sizeof(typename TrackingRecHit2DSOAView::hindex_type) == +// sizeof(typename TrackingRecHit2DSOAView::PhiBinner::index_type)); +// +// auto get32 = [&](Storage32 i) { return m_store32.get() + static_cast(i) * nHits; }; +// +// // copy all the pointers +// m_phiBinner = view->m_phiBinner = m_PhiBinnerStore.get(); +// m_phiBinnerStorage = view->m_phiBinnerStorage = +// reinterpret_cast(get32(Storage32::kPhiStorage)); +// +// view->m_xl = get32(Storage32::kXLocal); +// view->m_yl = get32(Storage32::kYLocal); +// view->m_xerr = get32(Storage32::kXerror); +// view->m_yerr = get32(Storage32::kYerror); +// view->m_chargeAndStatus = reinterpret_cast(get32(Storage32::kCharge)); +// +// view->m_xg = get32(Storage32::kXGlobal); +// view->m_yg = get32(Storage32::kYGlobal); +// view->m_zg = get32(Storage32::kZGlobal); +// view->m_rg = 
get32(Storage32::kRGlobal); +// +// auto get16 = [&](Storage16 i) { return m_store16.get() + static_cast(i) * nHits; }; +// m_iphi = view->m_iphi = reinterpret_cast(get16(Storage16::kPhi)); +// +// view->m_xsize = reinterpret_cast(get16(Storage16::kXSize)); +// view->m_ysize = reinterpret_cast(get16(Storage16::kYSize)); +// view->m_detInd = get16(Storage16::kDetId); +// +// m_phiBinner = view->m_phiBinner = m_PhiBinnerStore.get(); +// m_hitsLayerStart = view->m_hitsLayerStart = reinterpret_cast(get32(Storage32::kLayers)); +// +// // transfer view +// if constexpr (std::is_same_v) { +// cms::cuda::copyAsync(m_view, view, stream); +// } else { +// m_view.reset(view.release()); // NOLINT: std::move() breaks CUDA version +// } +// } +// +// template +// TrackingRecHit2DHostT::TrackingRecHit2DHostT( +// uint32_t nHits, +// int32_t offsetBPIX2, +// pixelCPEforGPU::ParamsOnGPUT const* cpeParams, +// uint32_t const* hitsModuleStart, +// cudaStream_t stream, +// TrackingRecHit2DHeterogeneousT const* input) { +// using TrackingRecHit2DSOAView = TrackingRecHit2DSOAViewT; +// +// this->m_nHits = nHits; +// this->m_offsetBPIX2 = offsetBPIX2; +// this->m_hitsModuleStart = hitsModuleStart; +// +// auto view = cms::cuda::make_host_unique(stream); +// +// view->m_nHits = nHits; +// this->m_view = +// cms::cuda::make_host_unique(stream); // leave it on host and pass it by value? 
+// this->m_AverageGeometryStore = cms::cuda::make_host_unique(stream); +// view->m_averageGeometry = this->m_AverageGeometryStore.get(); +// view->m_cpeParams = cpeParams; +// view->m_hitsModuleStart = hitsModuleStart; +// +// // if empy do not bother +// if (0 == nHits) { +// this->m_view.reset(view.release()); // NOLINT: std::move() breaks CUDA version +// return; +// } +// +// this->m_store32 = cms::cuda::make_host_unique(5 * input->nHits(), stream); +// cms::cuda::copyAsync(this->m_store32, input->m_store32, 5 * input->nHits(), stream); +// +// static_assert(sizeof(typename TrackingRecHit2DSOAView::hindex_type) == sizeof(float)); +// static_assert(sizeof(typename TrackingRecHit2DSOAView::hindex_type) == +// sizeof(typename TrackingRecHit2DSOAView::PhiBinner::index_type)); +// +// auto get32 = [&](Storage32 i) { return this->m_store32.get() + static_cast(i) * nHits; }; +// +// // copy all the pointers +// this->m_phiBinner = view->m_phiBinner = this->m_PhiBinnerStore.get(); +// this->m_phiBinnerStorage = view->m_phiBinnerStorage = +// reinterpret_cast(get32(Storage32::kPhiStorage)); +// +// view->m_xl = get32(Storage32::kXLocal); +// view->m_yl = get32(Storage32::kYLocal); +// view->m_xerr = get32(Storage32::kXerror); +// view->m_yerr = get32(Storage32::kYerror); +// view->m_chargeAndStatus = reinterpret_cast(get32(Storage32::kCharge)); +// +// this->m_view = std::move(view); +// } +// +// //this is intended to be used only for CPU SoA but doesn't hurt to have it for all cases +// template +// TrackingRecHit2DHeterogeneousT::TrackingRecHit2DHeterogeneousT( +// cms::cuda::host::unique_ptr& store32, +// cms::cuda::host::unique_ptr& store16, +// uint32_t* modules, +// int nHits, +// cudaStream_t stream) +// : m_nHits(nHits), m_hitsModuleStart(modules) { +// auto view = Traits::template make_host_unique(stream); +// +// m_view = Traits::template make_unique(stream); +// +// view->m_nHits = nHits; +// +// if (0 == nHits) { +// if constexpr (std::is_same_v) { +// 
cms::cuda::copyAsync(m_view, view, stream); +// } else { +// m_view = std::move(view); +// } +// return; +// } +// +// m_store16 = Traits::template make_unique(nHits * n16, stream); +// m_store32 = Traits::template make_unique(nHits * n32, stream); +// m_PhiBinnerStore = Traits::template make_unique(stream); +// m_AverageGeometryStore = Traits::template make_unique(stream); +// +// view->m_averageGeometry = m_AverageGeometryStore.get(); +// view->m_hitsModuleStart = m_hitsModuleStart; +// +// //store transfer +// if constexpr (std::is_same_v) { +// cms::cuda::copyAsync(m_store16, store16, static_cast(n16 * nHits), stream); +// cms::cuda::copyAsync(m_store32, store32, static_cast(n32 * nHits), stream); +// +// } else { +// std::copy(store32.get(), store32.get() + nHits * n32, m_store32.get()); // want to copy it +// std::copy(store16.get(), store16.get() + nHits * n16, m_store16.get()); +// } +// +// //getters +// auto get32 = [&](Storage32 i) { return m_store32.get() + static_cast(i) * nHits; }; +// auto get16 = [&](Storage16 i) { return m_store16.get() + static_cast(i) * nHits; }; +// +// //Store 32 +// view->m_xl = get32(Storage32::kXLocal); +// view->m_yl = get32(Storage32::kYLocal); +// view->m_xerr = get32(Storage32::kXerror); +// view->m_yerr = get32(Storage32::kYerror); +// view->m_chargeAndStatus = reinterpret_cast(get32(Storage32::kCharge)); +// view->m_xg = get32(Storage32::kXGlobal); +// view->m_yg = get32(Storage32::kYGlobal); +// view->m_zg = get32(Storage32::kZGlobal); +// view->m_rg = get32(Storage32::kRGlobal); +// +// m_phiBinner = view->m_phiBinner = m_PhiBinnerStore.get(); +// m_phiBinnerStorage = view->m_phiBinnerStorage = +// reinterpret_cast(get32(Storage32::kPhiStorage)); +// +// //Store 16 +// view->m_detInd = get16(Storage16::kDetId); +// m_iphi = view->m_iphi = reinterpret_cast(get16(Storage16::kPhi)); +// view->m_xsize = reinterpret_cast(get16(Storage16::kXSize)); +// view->m_ysize = reinterpret_cast(get16(Storage16::kYSize)); +// +// // 
transfer view +// if constexpr (std::is_same_v) { +// cms::cuda::copyAsync(m_view, view, stream); +// } else { +// m_view = std::move(view); +// } +// } +// +// //Classes definition for Phase1/Phase2, to make the classes_def lighter. Not actually used in the code. +// using TrackingRecHit2DGPUPhase1 = TrackingRecHit2DGPUT; +// using TrackingRecHit2DCPUPhase1 = TrackingRecHit2DCPUT; +// using TrackingRecHit2DHostPhase1 = TrackingRecHit2DHostT; +// +// using TrackingRecHit2DGPUPhase2 = TrackingRecHit2DGPUT; +// using TrackingRecHit2DCPUPhase2 = TrackingRecHit2DCPUT; +// using TrackingRecHit2DHostPhase2 = TrackingRecHit2DHostT; +// +// #endif // CUDADataFormats_TrackingRecHit_interface_TrackingRecHit2DHeterogeneousT_h diff --git a/CUDADataFormats/TrackingRecHit/interface/TrackingRecHit2DReduced.h b/CUDADataFormats/TrackingRecHit/interface/TrackingRecHit2DReduced.h index 8fd2bc54cfad7..d3c307099c596 100644 --- a/CUDADataFormats/TrackingRecHit/interface/TrackingRecHit2DReduced.h +++ b/CUDADataFormats/TrackingRecHit/interface/TrackingRecHit2DReduced.h @@ -1,59 +1,59 @@ -#ifndef CUDADataFormats_TrackingRecHit_interface_TrackingRecHit2DReducedT_h -#define CUDADataFormats_TrackingRecHit_interface_TrackingRecHit2DReducedT_h - -#include "CUDADataFormats/TrackingRecHit/interface/TrackingRecHit2DSOAView.h" -#include "CUDADataFormats/Common/interface/HostProduct.h" - -// a reduced (in content and therefore in size) version to be used on CPU for Legacy reconstruction -template -class TrackingRecHit2DReducedT { - using TrackingRecHit2DSOAView = TrackingRecHit2DSOAViewT; - -public: - using HLPstorage = HostProduct; - using HIDstorage = HostProduct; - - template - TrackingRecHit2DReducedT(UP32&& istore32, UP16&& istore16, int nhits) - : m_store32(std::move(istore32)), m_store16(std::move(istore16)), m_nHits(nhits) { - auto get32 = [&](int i) { return const_cast(m_store32.get()) + i * nhits; }; - - // copy all the pointers (better be in sync with the producer store) - - m_view.m_xl = 
get32(0); - m_view.m_yl = get32(1); - m_view.m_xerr = get32(2); - m_view.m_yerr = get32(3); - m_view.m_chargeAndStatus = reinterpret_cast(get32(4)); - m_view.m_detInd = const_cast(m_store16.get()); - } - - // view only! - TrackingRecHit2DReducedT(TrackingRecHit2DSOAView const& iview, int nhits) : m_view(iview), m_nHits(nhits) {} - - TrackingRecHit2DReducedT() = default; - ~TrackingRecHit2DReducedT() = default; - - TrackingRecHit2DReducedT(const TrackingRecHit2DReducedT&) = delete; - TrackingRecHit2DReducedT& operator=(const TrackingRecHit2DReducedT&) = delete; - TrackingRecHit2DReducedT(TrackingRecHit2DReducedT&&) = default; - TrackingRecHit2DReducedT& operator=(TrackingRecHit2DReducedT&&) = default; - - TrackingRecHit2DSOAView& view() { return m_view; } - TrackingRecHit2DSOAView const& view() const { return m_view; } - - auto nHits() const { return m_nHits; } - -private: - TrackingRecHit2DSOAView m_view; - - HLPstorage m_store32; - HIDstorage m_store16; - - int m_nHits; -}; - -using TrackingRecHit2DReducedPhase1 = TrackingRecHit2DReducedT; -using TrackingRecHit2DReducedPhase2 = TrackingRecHit2DReducedT; - -#endif +// #ifndef CUDADataFormats_TrackingRecHit_interface_TrackingRecHit2DReducedT_h +// #define CUDADataFormats_TrackingRecHit_interface_TrackingRecHit2DReducedT_h +// +// #include "CUDADataFormats/TrackingRecHit/interface/TrackingRecHit2DSOAView.h" +// #include "CUDADataFormats/Common/interface/HostProduct.h" +// +// // a reduced (in content and therefore in size) version to be used on CPU for Legacy reconstruction +// template +// class TrackingRecHit2DReducedT { +// using TrackingRecHit2DSOAView = TrackingRecHit2DSOAViewT; +// +// public: +// using HLPstorage = HostProduct; +// using HIDstorage = HostProduct; +// +// template +// TrackingRecHit2DReducedT(UP32&& istore32, UP16&& istore16, int nhits) +// : m_store32(std::move(istore32)), m_store16(std::move(istore16)), m_nHits(nhits) { +// auto get32 = [&](int i) { return const_cast(m_store32.get()) + i * 
nhits; }; +// +// // copy all the pointers (better be in sync with the producer store) +// +// m_view.m_xl = get32(0); +// m_view.m_yl = get32(1); +// m_view.m_xerr = get32(2); +// m_view.m_yerr = get32(3); +// m_view.m_chargeAndStatus = reinterpret_cast(get32(4)); +// m_view.m_detInd = const_cast(m_store16.get()); +// } +// +// // view only! +// TrackingRecHit2DReducedT(TrackingRecHit2DSOAView const& iview, int nhits) : m_view(iview), m_nHits(nhits) {} +// +// TrackingRecHit2DReducedT() = default; +// ~TrackingRecHit2DReducedT() = default; +// +// TrackingRecHit2DReducedT(const TrackingRecHit2DReducedT&) = delete; +// TrackingRecHit2DReducedT& operator=(const TrackingRecHit2DReducedT&) = delete; +// TrackingRecHit2DReducedT(TrackingRecHit2DReducedT&&) = default; +// TrackingRecHit2DReducedT& operator=(TrackingRecHit2DReducedT&&) = default; +// +// TrackingRecHit2DSOAView& view() { return m_view; } +// TrackingRecHit2DSOAView const& view() const { return m_view; } +// +// auto nHits() const { return m_nHits; } +// +// private: +// TrackingRecHit2DSOAView m_view; +// +// HLPstorage m_store32; +// HIDstorage m_store16; +// +// int m_nHits; +// }; +// +// using TrackingRecHit2DReducedPhase1 = TrackingRecHit2DReducedT; +// using TrackingRecHit2DReducedPhase2 = TrackingRecHit2DReducedT; +// +// #endif diff --git a/CUDADataFormats/TrackingRecHit/interface/TrackingRecHit2DSOAView.h b/CUDADataFormats/TrackingRecHit/interface/TrackingRecHit2DSOAView.h index 59b7cb1337fdf..27d74ffc7d94b 100644 --- a/CUDADataFormats/TrackingRecHit/interface/TrackingRecHit2DSOAView.h +++ b/CUDADataFormats/TrackingRecHit/interface/TrackingRecHit2DSOAView.h @@ -1,131 +1,131 @@ -#ifndef CUDADataFormats_TrackingRecHit_interface_TrackingRecHit2DSOAView_h -#define CUDADataFormats_TrackingRecHit_interface_TrackingRecHit2DSOAView_h - -#include - -#include "CUDADataFormats/SiPixelCluster/interface/gpuClusteringConstants.h" -#include "HeterogeneousCore/CUDAUtilities/interface/HistoContainer.h" -#include 
"HeterogeneousCore/CUDAUtilities/interface/cudaCompat.h" -#include "Geometry/CommonTopologies/interface/SimplePixelTopology.h" -#include "CUDADataFormats/TrackingRecHit/interface/SiPixelHitStatus.h" - -namespace pixelCPEforGPU { - template - struct ParamsOnGPUT; -} - -template -class TrackingRecHit2DSOAViewT { -public: - using Status = SiPixelHitStatus; - static_assert(sizeof(Status) == sizeof(uint8_t)); - - using hindex_type = typename TrackerTraits::hindex_type; - using PhiBinner = cms::cuda::HistoContainer; //28 for phase2 geometry - using AverageGeometry = pixelTopology::AverageGeometryT; - using ParamsOnGPU = pixelCPEforGPU::ParamsOnGPUT; - - template - friend class TrackingRecHit2DHeterogeneousT; - template - friend class TrackingRecHit2DHostT; - // template - // friend class TrackingRecHit2DReducedT; - - __device__ __forceinline__ uint32_t nHits() const { return m_nHits; } - - __device__ __forceinline__ float& xLocal(int i) { return m_xl[i]; } - __device__ __forceinline__ float xLocal(int i) const { return __ldg(m_xl + i); } - __device__ __forceinline__ float& yLocal(int i) { return m_yl[i]; } - __device__ __forceinline__ float yLocal(int i) const { return __ldg(m_yl + i); } - - __device__ __forceinline__ float& xerrLocal(int i) { return m_xerr[i]; } - __device__ __forceinline__ float xerrLocal(int i) const { return __ldg(m_xerr + i); } - __device__ __forceinline__ float& yerrLocal(int i) { return m_yerr[i]; } - __device__ __forceinline__ float yerrLocal(int i) const { return __ldg(m_yerr + i); } - - __device__ __forceinline__ float& xGlobal(int i) { return m_xg[i]; } - __device__ __forceinline__ float xGlobal(int i) const { return __ldg(m_xg + i); } - __device__ __forceinline__ float& yGlobal(int i) { return m_yg[i]; } - __device__ __forceinline__ float yGlobal(int i) const { return __ldg(m_yg + i); } - __device__ __forceinline__ float& zGlobal(int i) { return m_zg[i]; } - __device__ __forceinline__ float zGlobal(int i) const { return __ldg(m_zg + i); } - 
__device__ __forceinline__ float& rGlobal(int i) { return m_rg[i]; } - __device__ __forceinline__ float rGlobal(int i) const { return __ldg(m_rg + i); } - - __device__ __forceinline__ int16_t& iphi(int i) { return m_iphi[i]; } - __device__ __forceinline__ int16_t iphi(int i) const { return __ldg(m_iphi + i); } - - __device__ __forceinline__ void setChargeAndStatus(int i, uint32_t ich, Status is) { - ich = std::min(ich, chargeMask()); - uint32_t w = *reinterpret_cast(&is); - ich |= (w << 24); - m_chargeAndStatus[i] = ich; - } - - __device__ __forceinline__ uint32_t charge(int i) const { return __ldg(m_chargeAndStatus + i) & 0xFFFFFF; } - - __device__ __forceinline__ Status status(int i) const { - uint8_t w = __ldg(m_chargeAndStatus + i) >> 24; - return *reinterpret_cast(&w); - } - - __device__ __forceinline__ int16_t& clusterSizeX(int i) { return m_xsize[i]; } - __device__ __forceinline__ int16_t clusterSizeX(int i) const { return __ldg(m_xsize + i); } - __device__ __forceinline__ int16_t& clusterSizeY(int i) { return m_ysize[i]; } - __device__ __forceinline__ int16_t clusterSizeY(int i) const { return __ldg(m_ysize + i); } - __device__ __forceinline__ uint16_t& detectorIndex(int i) { return m_detInd[i]; } - __device__ __forceinline__ uint16_t detectorIndex(int i) const { return __ldg(m_detInd + i); } - - __device__ __forceinline__ ParamsOnGPU const& cpeParams() const { return *m_cpeParams; } - - __device__ __forceinline__ uint32_t hitsModuleStart(int i) const { return __ldg(m_hitsModuleStart + i); } - - __device__ __forceinline__ uint32_t* hitsLayerStart() { return m_hitsLayerStart; } - __device__ __forceinline__ uint32_t const* hitsLayerStart() const { return m_hitsLayerStart; } - - __device__ __forceinline__ PhiBinner& phiBinner() { return *m_phiBinner; } - __device__ __forceinline__ PhiBinner const& phiBinner() const { return *m_phiBinner; } - - __device__ __forceinline__ AverageGeometry& averageGeometry() { return *m_averageGeometry; } - __device__ 
__forceinline__ AverageGeometry const& averageGeometry() const { return *m_averageGeometry; } - - __device__ __forceinline__ bool clusterCut(int i, int o, bool debug = false) const { return false; } - __device__ __forceinline__ bool zSizeCut(int i, int o, bool debug = false) const { return false; } - -private: - // local coord - float *m_xl, *m_yl; - float *m_xerr, *m_yerr; - - // global coord - float *m_xg, *m_yg, *m_zg, *m_rg; - int16_t* m_iphi; - - // cluster properties - static constexpr uint32_t chargeMask() { return (1 << 24) - 1; } - uint32_t* m_chargeAndStatus; - int16_t* m_xsize; - int16_t* m_ysize; - uint16_t* m_detInd; - - // supporting objects - // m_averageGeometry is corrected for beam spot, not sure where to host it otherwise - AverageGeometry* m_averageGeometry; // owned by TrackingRecHit2DHeterogeneous - ParamsOnGPU const* m_cpeParams; // forwarded from setup, NOT owned - uint32_t const* m_hitsModuleStart; // forwarded from clusters - - uint32_t* m_hitsLayerStart; - - PhiBinner* m_phiBinner; - typename PhiBinner::index_type* m_phiBinnerStorage; - - uint32_t m_nHits; -}; - -#endif // CUDADataFormats_TrackingRecHit_interface_TrackingRecHit2DSOAView_h +// #ifndef CUDADataFormats_TrackingRecHit_interface_TrackingRecHit2DSOAView_h +// #define CUDADataFormats_TrackingRecHit_interface_TrackingRecHit2DSOAView_h +// +// #include +// +// #include "CUDADataFormats/SiPixelCluster/interface/gpuClusteringConstants.h" +// #include "HeterogeneousCore/CUDAUtilities/interface/HistoContainer.h" +// #include "HeterogeneousCore/CUDAUtilities/interface/cudaCompat.h" +// #include "Geometry/CommonTopologies/interface/SimplePixelTopology.h" +// #include "CUDADataFormats/TrackingRecHit/interface/SiPixelHitStatus.h" +// +// namespace pixelCPEforGPU { +// template +// struct ParamsOnGPUT; +// } +// +// template +// class TrackingRecHit2DSOAViewT { +// public: +// using Status = SiPixelHitStatus; +// static_assert(sizeof(Status) == sizeof(uint8_t)); +// +// using hindex_type = 
typename TrackerTraits::hindex_type; +// using PhiBinner = cms::cuda::HistoContainer; //28 for phase2 geometry +// using AverageGeometry = pixelTopology::AverageGeometryT; +// using ParamsOnGPU = pixelCPEforGPU::ParamsOnGPUT; +// +// template +// friend class TrackingRecHit2DHeterogeneousT; +// template +// friend class TrackingRecHit2DHostT; +// // template +// // friend class TrackingRecHit2DReducedT; +// +// __device__ __forceinline__ uint32_t nHits() const { return m_nHits; } +// +// __device__ __forceinline__ float& xLocal(int i) { return m_xl[i]; } +// __device__ __forceinline__ float xLocal(int i) const { return __ldg(m_xl + i); } +// __device__ __forceinline__ float& yLocal(int i) { return m_yl[i]; } +// __device__ __forceinline__ float yLocal(int i) const { return __ldg(m_yl + i); } +// +// __device__ __forceinline__ float& xerrLocal(int i) { return m_xerr[i]; } +// __device__ __forceinline__ float xerrLocal(int i) const { return __ldg(m_xerr + i); } +// __device__ __forceinline__ float& yerrLocal(int i) { return m_yerr[i]; } +// __device__ __forceinline__ float yerrLocal(int i) const { return __ldg(m_yerr + i); } +// +// __device__ __forceinline__ float& xGlobal(int i) { return m_xg[i]; } +// __device__ __forceinline__ float xGlobal(int i) const { return __ldg(m_xg + i); } +// __device__ __forceinline__ float& yGlobal(int i) { return m_yg[i]; } +// __device__ __forceinline__ float yGlobal(int i) const { return __ldg(m_yg + i); } +// __device__ __forceinline__ float& zGlobal(int i) { return m_zg[i]; } +// __device__ __forceinline__ float zGlobal(int i) const { return __ldg(m_zg + i); } +// __device__ __forceinline__ float& rGlobal(int i) { return m_rg[i]; } +// __device__ __forceinline__ float rGlobal(int i) const { return __ldg(m_rg + i); } +// +// __device__ __forceinline__ int16_t& iphi(int i) { return m_iphi[i]; } +// __device__ __forceinline__ int16_t iphi(int i) const { return __ldg(m_iphi + i); } +// +// __device__ __forceinline__ void 
setChargeAndStatus(int i, uint32_t ich, Status is) { +// ich = std::min(ich, chargeMask()); +// uint32_t w = *reinterpret_cast(&is); +// ich |= (w << 24); +// m_chargeAndStatus[i] = ich; +// } +// +// __device__ __forceinline__ uint32_t charge(int i) const { return __ldg(m_chargeAndStatus + i) & 0xFFFFFF; } +// +// __device__ __forceinline__ Status status(int i) const { +// uint8_t w = __ldg(m_chargeAndStatus + i) >> 24; +// return *reinterpret_cast(&w); +// } +// +// __device__ __forceinline__ int16_t& clusterSizeX(int i) { return m_xsize[i]; } +// __device__ __forceinline__ int16_t clusterSizeX(int i) const { return __ldg(m_xsize + i); } +// __device__ __forceinline__ int16_t& clusterSizeY(int i) { return m_ysize[i]; } +// __device__ __forceinline__ int16_t clusterSizeY(int i) const { return __ldg(m_ysize + i); } +// __device__ __forceinline__ uint16_t& detectorIndex(int i) { return m_detInd[i]; } +// __device__ __forceinline__ uint16_t detectorIndex(int i) const { return __ldg(m_detInd + i); } +// +// __device__ __forceinline__ ParamsOnGPU const& cpeParams() const { return *m_cpeParams; } +// +// __device__ __forceinline__ uint32_t hitsModuleStart(int i) const { return __ldg(m_hitsModuleStart + i); } +// +// __device__ __forceinline__ uint32_t* hitsLayerStart() { return m_hitsLayerStart; } +// __device__ __forceinline__ uint32_t const* hitsLayerStart() const { return m_hitsLayerStart; } +// +// __device__ __forceinline__ PhiBinner& phiBinner() { return *m_phiBinner; } +// __device__ __forceinline__ PhiBinner const& phiBinner() const { return *m_phiBinner; } +// +// __device__ __forceinline__ AverageGeometry& averageGeometry() { return *m_averageGeometry; } +// __device__ __forceinline__ AverageGeometry const& averageGeometry() const { return *m_averageGeometry; } +// +// __device__ __forceinline__ bool clusterCut(int i, int o, bool debug = false) const { return false; } +// __device__ __forceinline__ bool zSizeCut(int i, int o, bool debug = false) const { return 
false; } +// +// private: +// // local coord +// float *m_xl, *m_yl; +// float *m_xerr, *m_yerr; +// +// // global coord +// float *m_xg, *m_yg, *m_zg, *m_rg; +// int16_t* m_iphi; +// +// // cluster properties +// static constexpr uint32_t chargeMask() { return (1 << 24) - 1; } +// uint32_t* m_chargeAndStatus; +// int16_t* m_xsize; +// int16_t* m_ysize; +// uint16_t* m_detInd; +// +// // supporting objects +// // m_averageGeometry is corrected for beam spot, not sure where to host it otherwise +// AverageGeometry* m_averageGeometry; // owned by TrackingRecHit2DHeterogeneous +// ParamsOnGPU const* m_cpeParams; // forwarded from setup, NOT owned +// uint32_t const* m_hitsModuleStart; // forwarded from clusters +// +// uint32_t* m_hitsLayerStart; +// +// PhiBinner* m_phiBinner; +// typename PhiBinner::index_type* m_phiBinnerStorage; +// +// uint32_t m_nHits; +// }; +// +// #endif // CUDADataFormats_TrackingRecHit_interface_TrackingRecHit2DSOAView_h diff --git a/CUDADataFormats/TrackingRecHit/interface/TrackingRecHitSoADevice.h b/CUDADataFormats/TrackingRecHit/interface/TrackingRecHitSoADevice.h new file mode 100644 index 0000000000000..341992182ea9c --- /dev/null +++ b/CUDADataFormats/TrackingRecHit/interface/TrackingRecHitSoADevice.h @@ -0,0 +1,109 @@ +#ifndef CUDADataFormats_RecHits_TrackingRecHitsDevice_h +#define CUDADataFormats_RecHits_TrackingRecHitsDevice_h + +#include + +#include "CUDADataFormats/TrackingRecHit/interface/TrackingRecHitsUtilities.h" +#include "CUDADataFormats/Common/interface/PortableDeviceCollection.h" +#include "HeterogeneousCore/CUDAUtilities/interface/host_unique_ptr.h" +#include "HeterogeneousCore/CUDAUtilities/interface/cudaCheck.h" + +template +class TrackingRecHitSoADevice : public cms::cuda::PortableDeviceCollection> { +public: + + using hitSoA = trackingRecHitSoA; + //Need to decorate the class with the inherited portable accessors being now a template + using cms::cuda::PortableDeviceCollection>::view; + using 
cms::cuda::PortableDeviceCollection>::const_view; + using cms::cuda::PortableDeviceCollection>::buffer; + using cms::cuda::PortableDeviceCollection>::bufferSize; + + TrackingRecHitSoADevice() = default; // cms::cuda::Product needs this + + using AverageGeometry = typename hitSoA::AverageGeometry;; + using ParamsOnGPU = typename hitSoA::ParamsOnGPU;; + using PhiBinnerStorageType = typename hitSoA::PhiBinnerStorageType; + using PhiBinner = typename hitSoA::PhiBinner; + // Constructor which specifies the SoA size + explicit TrackingRecHitSoADevice(uint32_t nHits, + int32_t offsetBPIX2, + ParamsOnGPU const* cpeParams, + uint32_t const* hitsModuleStart, + cudaStream_t stream) + : cms::cuda::PortableDeviceCollection>(nHits, stream), + nHits_(nHits), + cpeParams_(cpeParams), + hitsModuleStart_(hitsModuleStart), + offsetBPIX2_(offsetBPIX2) { + cudaCheck(cudaGetLastError()); + cudaCheck(cudaDeviceSynchronize()); + + phiBinner_ = &(view().phiBinner()); + // phiBinner_ = cms::cuda::make_device_unique(stream).get(); + cudaCheck(cudaMemcpyAsync(&(view().nHits()), &nHits, sizeof(uint32_t), cudaMemcpyHostToDevice, stream)); + cudaCheck(cudaMemcpyAsync(view().hitsModuleStart().data(), + hitsModuleStart, + sizeof(uint32_t) * int(TrackerTraits::numberOfModules + 1), + cudaMemcpyHostToDevice, + stream)); + cudaCheck( + cudaMemcpyAsync(&(view().offsetBPIX2()), &offsetBPIX2, sizeof(int32_t), cudaMemcpyHostToDevice, stream)); + + // cpeParams argument is a pointer to device memory, copy + // its contents into the Layout. 
+ + cudaCheck(cudaMemcpyAsync( + &(view().cpeParams()), cpeParams, int(sizeof(ParamsOnGPU)), cudaMemcpyDeviceToDevice, stream)); + } + + uint32_t nHits() const { return nHits_; } //go to size of view + // uint32_t nModules() const { return nModules_; } + + cms::cuda::host::unique_ptr localCoordToHostAsync(cudaStream_t stream) const { + auto ret = cms::cuda::make_host_unique(4 * nHits(), stream); + size_t rowSize = sizeof(float) * nHits(); + // printf("nModules=%d \n", nModules()); + printf("nHits=%d \n", nHits()); + cudaCheck(cudaMemcpyAsync(ret.get(), view().xLocal(), rowSize * 4, cudaMemcpyDeviceToHost, stream)); + // cudaCheck(cudaMemcpyAsync(ret.get() + rowSize , view().yLocal() , rowSize, cudaMemcpyDeviceToHost, stream)); + // cudaCheck(cudaMemcpyAsync(ret.get() + size_t(rowSize * 2), view().xerrLocal() , rowSize, cudaMemcpyDeviceToHost, stream)); + // cudaCheck(cudaMemcpyAsync(ret.get() + size_t(rowSize * 3) , view().yerrLocal() , rowSize, cudaMemcpyDeviceToHost, stream)); + return ret; + } //move to utilities + + cms::cuda::host::unique_ptr hitsModuleStartToHostAsync(cudaStream_t stream) const { + // printf("%d \n",nModules()); + auto ret = cms::cuda::make_host_unique(TrackerTraits::numberOfModules + 1, stream); + cudaCheck(cudaMemcpyAsync(ret.get(), + view().hitsModuleStart().data(), + sizeof(uint32_t) * (TrackerTraits::numberOfModules + 1), + cudaMemcpyDeviceToHost, + stream)); + return ret; + } + + auto phiBinnerStorage() { return phiBinnerStorage_; } + auto hitsModuleStart() const { return hitsModuleStart_; } + uint32_t offsetBPIX2() const { return offsetBPIX2_; } + auto phiBinner() { return phiBinner_; } + + +private: + uint32_t nHits_; //Needed for the host SoA size + ParamsOnGPU const* cpeParams_; //TODO: this is used not that much from the hits (only once in BrokenLineFit), would make sens to remove it from this class. 
+ uint32_t const* hitsModuleStart_; + uint32_t offsetBPIX2_; + + PhiBinnerStorageType* phiBinnerStorage_; + PhiBinner* phiBinner_; + +}; + + +//Classes definition for Phase1/Phase2, to make the classes_def lighter. Not actually used in the code. +using TrackingRecHitSoADevicePhase1 = TrackingRecHitSoADevice; +using TrackingRecHitSoADevicePhase2 = TrackingRecHitSoADevice; + + +#endif // CUDADataFormats_Track_TrackHeterogeneousT_H diff --git a/CUDADataFormats/TrackingRecHit/interface/TrackingRecHitSoAHost.h b/CUDADataFormats/TrackingRecHit/interface/TrackingRecHitSoAHost.h new file mode 100644 index 0000000000000..e939dc9cdba97 --- /dev/null +++ b/CUDADataFormats/TrackingRecHit/interface/TrackingRecHitSoAHost.h @@ -0,0 +1,69 @@ +#ifndef CUDADataFormats_RecHits_TrackingRecHitsHost_h +#define CUDADataFormats_RecHits_TrackingRecHitsHost_h + +#include + +#include "CUDADataFormats/TrackingRecHit/interface/TrackingRecHitsUtilities.h" +#include "CUDADataFormats/Common/interface/PortableHostCollection.h" +#include "HeterogeneousCore/CUDAUtilities/interface/host_unique_ptr.h" +#include "HeterogeneousCore/CUDAUtilities/interface/cudaCheck.h" + + +template +class TrackingRecHitSoAHost : public cms::cuda::PortableHostCollection> { +public: + + + using hitSoA = trackingRecHitSoA; + //Need to decorate the class with the inherited portable accessors being now a template + using cms::cuda::PortableHostCollection>::view; + using cms::cuda::PortableHostCollection>::const_view; + using cms::cuda::PortableHostCollection>::buffer; + using cms::cuda::PortableHostCollection>::bufferSize; + + TrackingRecHitSoAHost() = default; + + using AverageGeometry = typename hitSoA::AverageGeometry;; + using ParamsOnGPU = typename hitSoA::ParamsOnGPU;; + using PhiBinnerStorageType = typename hitSoA::PhiBinnerStorageType; + using PhiBinner = typename hitSoA::PhiBinner; + + // This SoA Host is used basically only for DQM + // so we just need a slim constructor + explicit TrackingRecHitSoAHost(uint32_t 
nHits, cudaStream_t stream) + : cms::cuda::PortableHostCollection>(nHits, stream) {} + + explicit TrackingRecHitSoAHost(uint32_t nHits, + int32_t offsetBPIX2, + ParamsOnGPU const* cpeParams, + uint32_t const* hitsModuleStart, + cudaStream_t stream) + : cms::cuda::PortableHostCollection>(nHits, stream), + nHits_(nHits), + cpeParams_(cpeParams), + offsetBPIX2_(offsetBPIX2) { + std::cout << "PORCA MADONNA!!!!!!!!!!!!!!!!!" << std::endl; + view().nHits() = nHits; + std::copy(hitsModuleStart, hitsModuleStart + TrackerTraits::numberOfModules + 1, view().hitsModuleStart().begin()); + memcpy(&(view().cpeParams()), cpeParams, sizeof(ParamsOnGPU)); + view().offsetBPIX2() = offsetBPIX2; + } + + uint32_t nHits() const { return nHits_; } + uint32_t offsetBPIX2() const { return offsetBPIX2_; } + auto phiBinnerStorage() { return phiBinnerStorage_; } + +private: + uint32_t nHits_; //Needed for the host SoA size + ParamsOnGPU const* cpeParams_; + uint32_t offsetBPIX2_; + + PhiBinnerStorageType* phiBinnerStorage_; +}; + + +using TrackingRecHitSoAHostPhase1 = TrackingRecHitSoAHost; +using TrackingRecHitSoAHostPhase2 = TrackingRecHitSoAHost; + + +#endif // CUDADataFormats_Track_TrackHeterogeneousT_H diff --git a/CUDADataFormats/TrackingRecHit/interface/TrackingRecHitsUtilities.h b/CUDADataFormats/TrackingRecHit/interface/TrackingRecHitsUtilities.h new file mode 100644 index 0000000000000..02c1c44e3cb89 --- /dev/null +++ b/CUDADataFormats/TrackingRecHit/interface/TrackingRecHitsUtilities.h @@ -0,0 +1,73 @@ +#ifndef CUDADataFormats_RecHits_TrackingRecHitsUtilities_h +#define CUDADataFormats_RecHits_TrackingRecHitsUtilities_h + +#include +#include "Geometry/CommonTopologies/interface/SimplePixelTopology.h" +#include "HeterogeneousCore/CUDAUtilities/interface/HistoContainer.h" +#include "DataFormats/SoATemplate/interface/SoALayout.h" +#include "RecoLocalTracker/SiPixelRecHits/interface/pixelCPEforGPU.h" +#include "HeterogeneousCore/CUDAUtilities/interface/host_unique_ptr.h" +#include 
"SiPixelHitStatus.h" + + +// more information on bit fields : https://en.cppreference.com/w/cpp/language/bit_field +template +struct trackingRecHitSoA +{ + + using hindex_type = typename TrackerTraits::hindex_type; + using PhiBinner = cms::cuda::HistoContainer; //28 for phase2 geometry + + using PhiBinnerStorageType = typename PhiBinner::index_type; + using AverageGeometry = pixelTopology::AverageGeometryT; + using ParamsOnGPU = pixelCPEforGPU::ParamsOnGPUT; + + using HitLayerStartArray = std::array; + using HitModuleStartArray = std::array; + + //Is it better to have two split? + GENERATE_SOA_LAYOUT(TrackingRecHitSoALayout, + SOA_COLUMN(float, xLocal), + SOA_COLUMN(float, yLocal), + SOA_COLUMN(float, xerrLocal), + SOA_COLUMN(float, yerrLocal), + SOA_COLUMN(float, xGlobal), + SOA_COLUMN(float, yGlobal), + SOA_COLUMN(float, zGlobal), + SOA_COLUMN(float, rGlobal), + SOA_COLUMN(int16_t, iphi), + SOA_COLUMN(SiPixelHitStatusAndCharge, chargeAndStatus), + SOA_COLUMN(int16_t, clusterSizeX), + SOA_COLUMN(int16_t, clusterSizeY), + SOA_COLUMN(int16_t, detectorIndex), + + SOA_SCALAR(uint32_t, nHits), + SOA_SCALAR(int32_t, offsetBPIX2), + //These above could be separated in a specific + //layout since they don't depends on the template + //for the moment I'm keeping them here + SOA_COLUMN(PhiBinnerStorageType, phiBinnerStorage), + SOA_SCALAR(HitModuleStartArray, hitsModuleStart), + SOA_SCALAR(HitLayerStartArray, hitsLayerStart), + SOA_SCALAR(ParamsOnGPU, cpeParams), + SOA_SCALAR(AverageGeometry, averageGeometry), + SOA_SCALAR(PhiBinner, phiBinner)); + + + + +}; + + template + using HitLayout = typename trackingRecHitSoA::template TrackingRecHitSoALayout<>; + template + using HitSoAView = typename trackingRecHitSoA::template TrackingRecHitSoALayout<>::View; + template + using HitSoAConstView = typename trackingRecHitSoA::template TrackingRecHitSoALayout<>::ConstView; + +#endif diff --git a/CUDADataFormats/TrackingRecHit/src/TrackingRecHit2DHeterogeneous.cc 
b/CUDADataFormats/TrackingRecHit/src/TrackingRecHit2DHeterogeneous.cc index 05c3eba3d8bde..e97b762f39d79 100644 --- a/CUDADataFormats/TrackingRecHit/src/TrackingRecHit2DHeterogeneous.cc +++ b/CUDADataFormats/TrackingRecHit/src/TrackingRecHit2DHeterogeneous.cc @@ -1,49 +1,49 @@ -#include "CUDADataFormats/TrackingRecHit/interface/TrackingRecHit2DHeterogeneous.h" -#include "HeterogeneousCore/CUDAUtilities/interface/copyAsync.h" -#include "HeterogeneousCore/CUDAUtilities/interface/cudaCheck.h" -#include "HeterogeneousCore/CUDAUtilities/interface/device_unique_ptr.h" -#include "HeterogeneousCore/CUDAUtilities/interface/host_unique_ptr.h" - -template -cms::cuda::host::unique_ptr TrackingRecHit2DGPUT::localCoordToHostAsync( - cudaStream_t stream) const { - auto ret = cms::cuda::make_host_unique(5 * this->nHits(), stream); - cms::cuda::copyAsync(ret, this->m_store32, 5 * this->nHits(), stream); - return ret; -} - -template -cms::cuda::host::unique_ptr TrackingRecHit2DGPUT::store32ToHostAsync(cudaStream_t stream) const { - auto ret = cms::cuda::make_host_unique(static_cast(this->n32) * this->nHits(), stream); - cms::cuda::copyAsync(ret, this->m_store32, static_cast(this->n32) * this->nHits(), stream); - return ret; -} - -template -cms::cuda::host::unique_ptr TrackingRecHit2DGPUT::store16ToHostAsync( - cudaStream_t stream) const { - auto ret = cms::cuda::make_host_unique(static_cast(this->n16) * this->nHits(), stream); - cms::cuda::copyAsync(ret, this->m_store16, static_cast(this->n16) * this->nHits(), stream); - return ret; -} - -template -cms::cuda::host::unique_ptr TrackingRecHit2DGPUT::hitsModuleStartToHostAsync( - cudaStream_t stream) const { - auto ret = cms::cuda::make_host_unique(TrackerTraits::numberOfModules + 1, stream); - cudaCheck(cudaMemcpyAsync(ret.get(), - this->m_hitsModuleStart, - sizeof(uint32_t) * (TrackerTraits::numberOfModules + 1), - cudaMemcpyDefault, - stream)); - return ret; -} - -template class TrackingRecHit2DGPUT; -template class 
TrackingRecHit2DGPUT; - -template class TrackingRecHit2DCPUT; -template class TrackingRecHit2DCPUT; - -template class TrackingRecHit2DHostT; -template class TrackingRecHit2DHostT; +// #include "CUDADataFormats/TrackingRecHit/interface/TrackingRecHit2DHeterogeneous.h" +// #include "HeterogeneousCore/CUDAUtilities/interface/copyAsync.h" +// #include "HeterogeneousCore/CUDAUtilities/interface/cudaCheck.h" +// #include "HeterogeneousCore/CUDAUtilities/interface/device_unique_ptr.h" +// #include "HeterogeneousCore/CUDAUtilities/interface/host_unique_ptr.h" +// +// template +// cms::cuda::host::unique_ptr TrackingRecHit2DGPUT::localCoordToHostAsync( +// cudaStream_t stream) const { +// auto ret = cms::cuda::make_host_unique(5 * this->nHits(), stream); +// cms::cuda::copyAsync(ret, this->m_store32, 5 * this->nHits(), stream); +// return ret; +// } +// +// template +// cms::cuda::host::unique_ptr TrackingRecHit2DGPUT::store32ToHostAsync(cudaStream_t stream) const { +// auto ret = cms::cuda::make_host_unique(static_cast(this->n32) * this->nHits(), stream); +// cms::cuda::copyAsync(ret, this->m_store32, static_cast(this->n32) * this->nHits(), stream); +// return ret; +// } +// +// template +// cms::cuda::host::unique_ptr TrackingRecHit2DGPUT::store16ToHostAsync( +// cudaStream_t stream) const { +// auto ret = cms::cuda::make_host_unique(static_cast(this->n16) * this->nHits(), stream); +// cms::cuda::copyAsync(ret, this->m_store16, static_cast(this->n16) * this->nHits(), stream); +// return ret; +// } +// +// template +// cms::cuda::host::unique_ptr TrackingRecHit2DGPUT::hitsModuleStartToHostAsync( +// cudaStream_t stream) const { +// auto ret = cms::cuda::make_host_unique(TrackerTraits::numberOfModules + 1, stream); +// cudaCheck(cudaMemcpyAsync(ret.get(), +// this->m_hitsModuleStart, +// sizeof(uint32_t) * (TrackerTraits::numberOfModules + 1), +// cudaMemcpyDefault, +// stream)); +// return ret; +// } +// +// template class TrackingRecHit2DGPUT; +// template class 
TrackingRecHit2DGPUT; +// +// template class TrackingRecHit2DCPUT; +// template class TrackingRecHit2DCPUT; +// +// template class TrackingRecHit2DHostT; +// template class TrackingRecHit2DHostT; diff --git a/CUDADataFormats/TrackingRecHit/src/classes.h b/CUDADataFormats/TrackingRecHit/src/classes.h index b9a20695712e3..fb22e78358db3 100644 --- a/CUDADataFormats/TrackingRecHit/src/classes.h +++ b/CUDADataFormats/TrackingRecHit/src/classes.h @@ -3,6 +3,8 @@ #include "CUDADataFormats/Common/interface/Product.h" #include "CUDADataFormats/TrackingRecHit/interface/TrackingRecHit2DHeterogeneous.h" +#include "CUDADataFormats/TrackingRecHit/interface/TrackingRecHitSoAHost.h" +#include "CUDADataFormats/TrackingRecHit/interface/TrackingRecHitSoADevice.h" #include "CUDADataFormats/TrackingRecHit/interface/TrackingRecHit2DReduced.h" #include "DataFormats/Common/interface/Wrapper.h" diff --git a/CUDADataFormats/TrackingRecHit/src/classes_def.xml b/CUDADataFormats/TrackingRecHit/src/classes_def.xml index 4287860ee8495..c0ea7bb1af5cb 100644 --- a/CUDADataFormats/TrackingRecHit/src/classes_def.xml +++ b/CUDADataFormats/TrackingRecHit/src/classes_def.xml @@ -1,5 +1,5 @@ - + + + + + + + + + + + + + + + + + diff --git a/CUDADataFormats/TrackingRecHit/test/BuildFile.xml b/CUDADataFormats/TrackingRecHit/test/BuildFile.xml index f064563aa7051..0d961264a0fd5 100644 --- a/CUDADataFormats/TrackingRecHit/test/BuildFile.xml +++ b/CUDADataFormats/TrackingRecHit/test/BuildFile.xml @@ -1,6 +1,8 @@ + - + + diff --git a/CUDADataFormats/TrackingRecHit/test/TrackingRecHit2DCUDAImpl_t.h b/CUDADataFormats/TrackingRecHit/test/TrackingRecHit2DCUDAImpl_t.h index b2da57c2471ae..c8c930e3e2cb0 100644 --- a/CUDADataFormats/TrackingRecHit/test/TrackingRecHit2DCUDAImpl_t.h +++ b/CUDADataFormats/TrackingRecHit/test/TrackingRecHit2DCUDAImpl_t.h @@ -1,26 +1,26 @@ -#include "CUDADataFormats/TrackingRecHit/interface/TrackingRecHit2DHeterogeneous.h" - -namespace testTrackingRecHit2D { - - template - __global__ void 
fill(TrackingRecHit2DSOAViewT* phits) { - assert(phits); - auto& hits = *phits; - assert(hits.nHits() == 200); - - int i = threadIdx.x; - if (i > 200) - return; - } - - template - __global__ void verify(TrackingRecHit2DSOAViewT const* phits) { - assert(phits); - auto const& hits = *phits; - assert(hits.nHits() == 200); - - int i = threadIdx.x; - if (i > 200) - return; - } -} // namespace testTrackingRecHit2D +// #include "CUDADataFormats/TrackingRecHit/interface/TrackingRecHit2DHeterogeneous.h" +// +// namespace testTrackingRecHit2D { +// +// template +// __global__ void fill(TrackingRecHit2DSOAViewT* phits) { +// assert(phits); +// auto& hits = *phits; +// assert(hits.nHits() == 200); +// +// int i = threadIdx.x; +// if (i > 200) +// return; +// } +// +// template +// __global__ void verify(TrackingRecHit2DSOAViewT const* phits) { +// assert(phits); +// auto const& hits = *phits; +// assert(hits.nHits() == 200); +// +// int i = threadIdx.x; +// if (i > 200) +// return; +// } +// } // namespace testTrackingRecHit2D diff --git a/CUDADataFormats/TrackingRecHit/test/TrackingRecHit2DCUDA_t.cpp b/CUDADataFormats/TrackingRecHit/test/TrackingRecHit2DCUDA_t.cpp index 0d910273933dc..8e28dbad50194 100644 --- a/CUDADataFormats/TrackingRecHit/test/TrackingRecHit2DCUDA_t.cpp +++ b/CUDADataFormats/TrackingRecHit/test/TrackingRecHit2DCUDA_t.cpp @@ -1,42 +1,42 @@ -#include "CUDADataFormats/TrackingRecHit/interface/TrackingRecHit2DHeterogeneous.h" -#include "HeterogeneousCore/CUDAUtilities/interface/copyAsync.h" -#include "HeterogeneousCore/CUDAUtilities/interface/requireDevices.h" -#include "HeterogeneousCore/CUDAUtilities/interface/cudaCheck.h" -#include "Geometry/CommonTopologies/interface/SimplePixelTopology.h" - -namespace testTrackingRecHit2D { - - template - void runKernels(TrackingRecHit2DSOAViewT* hits); -} // namespace testTrackingRecHit2D - -int main() { - cms::cudatest::requireDevices(); - - cudaStream_t stream; - cudaCheck(cudaStreamCreateWithFlags(&stream, 
cudaStreamNonBlocking)); - - auto nHits = 200; - // inner scope to deallocate memory before destroying the stream - { - TrackingRecHit2DGPUT tkhit(nHits, 0, nullptr, nullptr, stream); - testTrackingRecHit2D::runKernels(tkhit.view()); - - TrackingRecHit2DGPUT tkhitPhase2(nHits, 0, nullptr, nullptr, stream); - testTrackingRecHit2D::runKernels(tkhitPhase2.view()); - - TrackingRecHit2DHostT tkhitH(nHits, 0, nullptr, nullptr, stream, &tkhit); - cudaStreamSynchronize(stream); - assert(tkhitH.view()); - assert(tkhitH.view()->nHits() == unsigned(nHits)); - - TrackingRecHit2DHostT tkhitHPhase2(nHits, 0, nullptr, nullptr, stream, &tkhitPhase2); - cudaStreamSynchronize(stream); - assert(tkhitHPhase2.view()); - assert(tkhitHPhase2.view()->nHits() == unsigned(nHits)); - } - - cudaCheck(cudaStreamDestroy(stream)); - - return 0; -} +// #include "CUDADataFormats/TrackingRecHit/interface/TrackingRecHit2DHeterogeneous.h" +// #include "HeterogeneousCore/CUDAUtilities/interface/copyAsync.h" +// #include "HeterogeneousCore/CUDAUtilities/interface/requireDevices.h" +// #include "HeterogeneousCore/CUDAUtilities/interface/cudaCheck.h" +// #include "Geometry/CommonTopologies/interface/SimplePixelTopology.h" +// +// namespace testTrackingRecHit2D { +// +// template +// void runKernels(TrackingRecHit2DSOAViewT* hits); +// } // namespace testTrackingRecHit2D +// +// int main() { +// cms::cudatest::requireDevices(); +// +// cudaStream_t stream; +// cudaCheck(cudaStreamCreateWithFlags(&stream, cudaStreamNonBlocking)); +// +// auto nHits = 200; +// // inner scope to deallocate memory before destroying the stream +// { +// TrackingRecHit2DGPUT tkhit(nHits, 0, nullptr, nullptr, stream); +// testTrackingRecHit2D::runKernels(tkhit.view()); +// +// TrackingRecHit2DGPUT tkhitPhase2(nHits, 0, nullptr, nullptr, stream); +// testTrackingRecHit2D::runKernels(tkhitPhase2.view()); +// +// TrackingRecHit2DHostT tkhitH(nHits, 0, nullptr, nullptr, stream, &tkhit); +// cudaStreamSynchronize(stream); +// 
assert(tkhitH.view()); +// assert(tkhitH.view()->nHits() == unsigned(nHits)); +// +// TrackingRecHit2DHostT tkhitHPhase2(nHits, 0, nullptr, nullptr, stream, &tkhitPhase2); +// cudaStreamSynchronize(stream); +// assert(tkhitHPhase2.view()); +// assert(tkhitHPhase2.view()->nHits() == unsigned(nHits)); +// } +// +// cudaCheck(cudaStreamDestroy(stream)); +// +// return 0; +// } diff --git a/CUDADataFormats/TrackingRecHit/test/TrackingRecHit2DCUDA_t.cu b/CUDADataFormats/TrackingRecHit/test/TrackingRecHit2DCUDA_t.cu index e902ea971edf3..36ec8009bb864 100644 --- a/CUDADataFormats/TrackingRecHit/test/TrackingRecHit2DCUDA_t.cu +++ b/CUDADataFormats/TrackingRecHit/test/TrackingRecHit2DCUDA_t.cu @@ -1,15 +1,15 @@ -#include "CUDADataFormats/TrackingRecHit/interface/TrackingRecHit2DHeterogeneous.h" -#include "TrackingRecHit2DCUDAImpl_t.h" - -namespace testTrackingRecHit2D { - - template - void runKernels(TrackingRecHit2DSOAViewT* hits) { - assert(hits); - fill<<<1, 1024>>>(hits); - verify<<<1, 1024>>>(hits); - } - - template void runKernels(TrackingRecHit2DSOAViewT* hits); - template void runKernels(TrackingRecHit2DSOAViewT* hits); -} // namespace testTrackingRecHit2D +// #include "CUDADataFormats/TrackingRecHit/interface/TrackingRecHit2DHeterogeneous.h" +// #include "TrackingRecHit2DCUDAImpl_t.h" +// +// namespace testTrackingRecHit2D { +// +// template +// void runKernels(TrackingRecHit2DSOAViewT* hits) { +// assert(hits); +// fill<<<1, 1024>>>(hits); +// verify<<<1, 1024>>>(hits); +// } +// +// template void runKernels(TrackingRecHit2DSOAViewT* hits); +// template void runKernels(TrackingRecHit2DSOAViewT* hits); +// } // namespace testTrackingRecHit2D diff --git a/CUDADataFormats/TrackingRecHit/test/TrackingRecHitSoAImpl_t.h b/CUDADataFormats/TrackingRecHit/test/TrackingRecHitSoAImpl_t.h new file mode 100644 index 0000000000000..1544ea759eff9 --- /dev/null +++ b/CUDADataFormats/TrackingRecHit/test/TrackingRecHitSoAImpl_t.h @@ -0,0 +1,42 @@ +#include 
"CUDADataFormats/TrackingRecHit/interface/TrackingRecHitsUtilities.h" + +namespace testTrackingRecHitSoA { + + + template + __global__ void fill(HitSoAView soa) { + + int i = threadIdx.x; + int j = blockIdx.x; + if (i == 0 and j == 0) { + soa.offsetBPIX2() = 22; + soa[10].xLocal() = 1.11; + } + + soa[i].iphi() = i % 10; + soa.hitsLayerStart()[j] = j; + __syncthreads(); + } + + template + __global__ void show(HitSoAView soa) { + + int i = threadIdx.x; + int j = blockIdx.x; + + if (i == 0 and j == 0) { + printf("nbins = %d \n", soa.phiBinner().nbins()); + printf("offsetBPIX %d ->%d \n", i, soa.offsetBPIX2()); + printf("nHits %d ->%d \n", i, soa.nHits()); + printf("hitsModuleStart %d ->%d \n", i, soa.hitsModuleStart().at(28)); + } + + if (i < soa.nHits()) + printf("iPhi %d ->%d \n", i, soa[i].iphi()); + + if (j * blockDim.x + i < soa.phiBinner().nbins()) + printf(">bin size %d ->%d \n", j * blockDim.x + i, soa.phiBinner().size(j * blockDim.x + i)); + __syncthreads(); + } + +} // namespace testTrackingRecHit2D diff --git a/CUDADataFormats/TrackingRecHit/test/TrackingRecHitSoA_test.cpp b/CUDADataFormats/TrackingRecHit/test/TrackingRecHitSoA_test.cpp new file mode 100644 index 0000000000000..39a6639db6c89 --- /dev/null +++ b/CUDADataFormats/TrackingRecHit/test/TrackingRecHitSoA_test.cpp @@ -0,0 +1,47 @@ +#include "CUDADataFormats/TrackingRecHit/interface/TrackingRecHitSoAHost.h" +#include "CUDADataFormats/TrackingRecHit/interface/TrackingRecHitSoADevice.h" + +#include "HeterogeneousCore/CUDAUtilities/interface/copyAsync.h" +#include "HeterogeneousCore/CUDAUtilities/interface/requireDevices.h" +#include "HeterogeneousCore/CUDAUtilities/interface/cudaCheck.h" +#include "HeterogeneousCore/CUDAUtilities/interface/allocate_device.h" +#include "Geometry/CommonTopologies/interface/SimplePixelTopology.h" + +namespace testTrackingRecHitSoA { + + template + void runKernels(TrackingRecHitSoADevice& hits, cudaStream_t stream); + +} + +int main() { + cms::cudatest::requireDevices(); 
+ + cudaStream_t stream; + cudaCheck(cudaStreamCreateWithFlags(&stream, cudaStreamDefault)); + + // inner scope to deallocate memory before destroying the stream + { + uint32_t nHits = 2000; + int32_t offset = 100; + uint32_t moduleStart[1856]; + + for (size_t i = 0; i < 1856; i++) { + moduleStart[i] = i * 2; + } + + TrackingRecHitSoADevice tkhit(nHits, offset, nullptr, &moduleStart[0], stream); + + testTrackingRecHitSoA::runKernels(tkhit, stream); + printf("tkhit hits %d \n", tkhit.nHits()); + auto test = tkhit.localCoordToHostAsync(stream); + printf("test[9] %.2f\n", test[9]); + + auto ret = tkhit.hitsModuleStartToHostAsync(stream); + printf("mods[9] %d\n", ret[9]); + } + + cudaCheck(cudaStreamDestroy(stream)); + + return 0; +} diff --git a/CUDADataFormats/TrackingRecHit/test/TrackingRecHitSoA_test.cu b/CUDADataFormats/TrackingRecHit/test/TrackingRecHitSoA_test.cu new file mode 100644 index 0000000000000..3c994bc0e90ed --- /dev/null +++ b/CUDADataFormats/TrackingRecHit/test/TrackingRecHitSoA_test.cu @@ -0,0 +1,30 @@ +#include "CUDADataFormats/TrackingRecHit/interface/TrackingRecHitsUtilities.h" +#include "CUDADataFormats/TrackingRecHit/interface/TrackingRecHitSoADevice.h" +#include "TrackingRecHitSoAImpl_t.h" + +namespace testTrackingRecHitSoA { + + template + void runKernels(TrackingRecHitSoADevice& hits, cudaStream_t stream) { + // assert(soa); + printf("> RUN!\n"); + fill<<<10, 100, 0, stream>>>(hits.view()); + + cudaCheck(cudaDeviceSynchronize()); + cms::cuda::fillManyFromVector(hits.phiBinner(), + 10, + hits.view().iphi(), + hits.view().hitsLayerStart().data(), + 2000, + 256, + hits.view().phiBinnerStorage(), + stream); + cudaCheck(cudaDeviceSynchronize()); + show<<<10, 1000, 0, stream>>>(hits.view()); + cudaCheck(cudaDeviceSynchronize()); + } + + template void runKernels(TrackingRecHitSoADevice& hits, cudaStream_t stream); + template void runKernels(TrackingRecHitSoADevice& hits, cudaStream_t stream); + +} // namespace testTrackingRecHit2DNew diff --git 
a/CUDADataFormats/Vertex/interface/ZVertexSoAHeterogeneousDevice.h b/CUDADataFormats/Vertex/interface/ZVertexSoAHeterogeneousDevice.h new file mode 100644 index 0000000000000..b57406c70de80 --- /dev/null +++ b/CUDADataFormats/Vertex/interface/ZVertexSoAHeterogeneousDevice.h @@ -0,0 +1,23 @@ +#ifndef CUDADataFormats_Vertex_ZVertexHeterogeneousDevice_H +#define CUDADataFormats_Vertex_ZVertexHeterogeneousDevice_H + +#include "CUDADataFormats/Vertex/interface/ZVertexUtilities.h" +#include "CUDADataFormats/Common/interface/PortableDeviceCollection.h" + +template +class ZVertexSoAHeterogeneousDevice : public cms::cuda::PortableDeviceCollection> { +public: + ZVertexSoAHeterogeneousDevice() = default; // cms::cuda::Product needs this + + // Constructor which specifies the SoA size + explicit ZVertexSoAHeterogeneousDevice(cudaStream_t stream) + : PortableDeviceCollection>(S, stream) {} +}; + +namespace zVertex { + + using ZVertexSoADevice = ZVertexSoAHeterogeneousDevice; + +} // namespace zVertex + +#endif // CUDADataFormats_Vertex_ZVertexHeterogeneousDevice_H diff --git a/CUDADataFormats/Vertex/interface/ZVertexSoAHeterogeneousHost.h b/CUDADataFormats/Vertex/interface/ZVertexSoAHeterogeneousHost.h new file mode 100644 index 0000000000000..24ce798491473 --- /dev/null +++ b/CUDADataFormats/Vertex/interface/ZVertexSoAHeterogeneousHost.h @@ -0,0 +1,25 @@ +#ifndef CUDADataFormats_Vertex_ZVertexHeterogeneousHost_H +#define CUDADataFormats_Vertex_ZVertexHeterogeneousHost_H + +#include + +#include "CUDADataFormats/Vertex/interface/ZVertexUtilities.h" +#include "CUDADataFormats/Common/interface/PortableHostCollection.h" + +template +class ZVertexSoAHeterogeneousHost : public cms::cuda::PortableHostCollection> { +public: + ZVertexSoAHeterogeneousHost() = default; // Required for cms::cuda::Product + + // Constructor which specifies the SoA size and CUDA stream + explicit ZVertexSoAHeterogeneousHost(cudaStream_t stream) + : PortableHostCollection>(S, stream) {} +}; + +namespace 
zVertex { + + using ZVertexSoAHost = ZVertexSoAHeterogeneousHost; + +} // namespace zVertex + +#endif // CUDADataFormats_Vertex_ZVertexHeterogeneousHost_H diff --git a/CUDADataFormats/Vertex/interface/ZVertexUtilities.h b/CUDADataFormats/Vertex/interface/ZVertexUtilities.h new file mode 100644 index 0000000000000..2403652377971 --- /dev/null +++ b/CUDADataFormats/Vertex/interface/ZVertexUtilities.h @@ -0,0 +1,35 @@ +#ifndef CUDADataFormats_Vertex_ZVertexUtilities_h +#define CUDADataFormats_Vertex_ZVertexUtilities_h + +#include +#include "DataFormats/SoATemplate/interface/SoALayout.h" + +GENERATE_SOA_LAYOUT(ZVertexSoAHeterogeneousLayout, + SOA_COLUMN(int16_t, idv), + SOA_COLUMN(float, zv), + SOA_COLUMN(float, wv), + SOA_COLUMN(float, chi2), + SOA_COLUMN(float, ptv2), + SOA_COLUMN(int32_t, ndof), + SOA_COLUMN(uint16_t, sortInd), + SOA_SCALAR(uint32_t, nvFinal)) + +// Previous ZVertexSoA class methods. +// They operate on View and ConstView of the ZVertexSoA. +namespace zVertex { + // Common types for both Host and Device code + using ZVertexSoALayout = ZVertexSoAHeterogeneousLayout<>; + using ZVertexSoAView = ZVertexSoAHeterogeneousLayout<>::View; + using ZVertexSoAConstView = ZVertexSoAHeterogeneousLayout<>::ConstView; + + namespace utilities { + + static constexpr uint32_t MAXTRACKS = 128 * 1024; + static constexpr uint32_t MAXVTX = 1024; + + __host__ __device__ inline void init(ZVertexSoAView &vertices) { vertices.nvFinal() = 0; } + + } // namespace utilities +} // namespace zVertex + +#endif diff --git a/CUDADataFormats/Vertex/src/classes.h b/CUDADataFormats/Vertex/src/classes.h index 7931beaa8f4bd..0340affffa06c 100644 --- a/CUDADataFormats/Vertex/src/classes.h +++ b/CUDADataFormats/Vertex/src/classes.h @@ -1,7 +1,8 @@ #ifndef CUDADataFormats_Vertex_src_classes_h #define CUDADataFormats_Vertex_src_classes_h -#include "CUDADataFormats/Vertex/interface/ZVertexHeterogeneous.h" +#include "CUDADataFormats/Vertex/interface/ZVertexSoAHeterogeneousDevice.h" +#include 
"CUDADataFormats/Vertex/interface/ZVertexSoAHeterogeneousHost.h" #include "CUDADataFormats/Common/interface/Product.h" #include "DataFormats/Common/interface/Wrapper.h" diff --git a/CUDADataFormats/Vertex/src/classes_def.xml b/CUDADataFormats/Vertex/src/classes_def.xml index ea633080af9af..404cc826fe73d 100644 --- a/CUDADataFormats/Vertex/src/classes_def.xml +++ b/CUDADataFormats/Vertex/src/classes_def.xml @@ -1,6 +1,8 @@ - - - - + + + + + + diff --git a/DQM/SiPixelPhase1Heterogeneous/plugins/SiPixelPhase1CompareRecHitsSoA.cc b/DQM/SiPixelPhase1Heterogeneous/plugins/SiPixelPhase1CompareRecHitsSoA.cc index 71abb95dbb4d1..b1bf1c010f096 100644 --- a/DQM/SiPixelPhase1Heterogeneous/plugins/SiPixelPhase1CompareRecHitsSoA.cc +++ b/DQM/SiPixelPhase1Heterogeneous/plugins/SiPixelPhase1CompareRecHitsSoA.cc @@ -18,7 +18,8 @@ #include "DQMServices/Core/interface/MonitorElement.h" #include "DQMServices/Core/interface/DQMEDAnalyzer.h" #include "DQMServices/Core/interface/DQMStore.h" -#include "CUDADataFormats/TrackingRecHit/interface/TrackingRecHit2DHeterogeneous.h" +#include "CUDADataFormats/TrackingRecHit/interface/TrackingRecHitSoAHost.h" +#include "CUDADataFormats/TrackingRecHit/interface/TrackingRecHitsUtilities.h" // Geometry #include "Geometry/TrackerGeometryBuilder/interface/TrackerGeometry.h" #include "DataFormats/TrackerCommon/interface/TrackerTopology.h" @@ -29,8 +30,8 @@ class SiPixelPhase1CompareRecHitsSoA : public DQMEDAnalyzer { public: - using HitSoA = TrackingRecHit2DSOAViewT; - using HitsOnCPU = TrackingRecHit2DCPUT; + using HitSoA = HitSoAView; + using HitsOnHost = TrackingRecHitSoAHost; explicit SiPixelPhase1CompareRecHitsSoA(const edm::ParameterSet&); ~SiPixelPhase1CompareRecHitsSoA() override = default; @@ -42,8 +43,8 @@ class SiPixelPhase1CompareRecHitsSoA : public DQMEDAnalyzer { private: const edm::ESGetToken geomToken_; const edm::ESGetToken topoToken_; - const edm::EDGetTokenT tokenSoAHitsCPU_; - const edm::EDGetTokenT tokenSoAHitsGPU_; + const 
edm::EDGetTokenT tokenSoAHitsHost_; //these two are both on Host but originally they have been + const edm::EDGetTokenT tokenSoAHitsDevice_; //produced on Host or on Device const std::string topFolderName_; const float mind2cut_; static constexpr uint32_t invalidHit_ = std::numeric_limits::max(); @@ -80,8 +81,8 @@ class SiPixelPhase1CompareRecHitsSoA : public DQMEDAnalyzer { SiPixelPhase1CompareRecHitsSoA::SiPixelPhase1CompareRecHitsSoA(const edm::ParameterSet& iConfig) : geomToken_(esConsumes()), topoToken_(esConsumes()), - tokenSoAHitsCPU_(consumes(iConfig.getParameter("pixelHitsSrcCPU"))), - tokenSoAHitsGPU_(consumes(iConfig.getParameter("pixelHitsSrcGPU"))), + tokenSoAHitsHost_(consumes(iConfig.getParameter("pixelHitsSrcCPU"))), + tokenSoAHitsDevice_(consumes(iConfig.getParameter("pixelHitsSrcGPU"))), topFolderName_(iConfig.getParameter("topFolderName")), mind2cut_(iConfig.getParameter("minD2cut")) {} // @@ -96,39 +97,41 @@ void SiPixelPhase1CompareRecHitsSoA::dqmBeginRun(const edm::Run& iRun, const edm // -- Analyze // void SiPixelPhase1CompareRecHitsSoA::analyze(const edm::Event& iEvent, const edm::EventSetup& iSetup) { - const auto& rhsoaHandleCPU = iEvent.getHandle(tokenSoAHitsCPU_); - const auto& rhsoaHandleGPU = iEvent.getHandle(tokenSoAHitsGPU_); - if (not rhsoaHandleCPU or not rhsoaHandleGPU) { + const auto& rhsoaHandleHost = iEvent.getHandle(tokenSoAHitsHost_); + const auto& rhsoaHandleDevice = iEvent.getHandle(tokenSoAHitsDevice_); + if (not rhsoaHandleHost or not rhsoaHandleDevice) { edm::LogWarning out("SiPixelPhase1CompareRecHitSoA"); - if (not rhsoaHandleCPU) { - out << "reference (cpu) rechits not found; "; + if (not rhsoaHandleHost) { + out << "reference (Host) rechits not found; "; } - if (not rhsoaHandleGPU) { - out << "target (gpu) rechits not found; "; + if (not rhsoaHandleDevice) { + out << "target (Device) rechits not found; "; } out << "the comparison will not run."; return; } - auto const& rhsoaCPU = *rhsoaHandleCPU; - const HitSoA* 
soa2dCPU = rhsoaCPU.view(); - auto const& rhsoaGPU = *rhsoaHandleGPU; - const HitSoA* soa2dGPU = rhsoaGPU.view(); + auto const& rhsoaHost = *rhsoaHandleHost; + auto const& rhsoaDevice = *rhsoaHandleDevice; - uint32_t nHitsCPU = soa2dCPU->nHits(); - uint32_t nHitsGPU = soa2dGPU->nHits(); - hnHits_->Fill(nHitsCPU, nHitsGPU); + auto const& soa2dHost = rhsoaHost.const_view(); + auto const& soa2dDevice = rhsoaDevice.const_view(); + + uint32_t nHitsHost = soa2dHost.nHits(); + uint32_t nHitsDevice = soa2dDevice.nHits(); + + hnHits_->Fill(nHitsHost, nHitsDevice); auto detIds = tkGeom_->detUnitIds(); - for (uint32_t i = 0; i < nHitsCPU; i++) { + for (uint32_t i = 0; i < nHitsHost; i++) { float minD = mind2cut_; uint32_t matchedHit = invalidHit_; - uint16_t indCPU = soa2dCPU->detectorIndex(i); - float xLocalCPU = soa2dCPU->xLocal(i); - float yLocalCPU = soa2dCPU->yLocal(i); - for (uint32_t j = 0; j < nHitsGPU; j++) { - if (soa2dGPU->detectorIndex(j) == indCPU) { - float dx = xLocalCPU - soa2dGPU->xLocal(j); - float dy = yLocalCPU - soa2dGPU->yLocal(j); + uint16_t indHost = soa2dHost[i].detectorIndex(); + float xLocalHost = soa2dHost[i].xLocal(); + float yLocalHost = soa2dHost[i].yLocal(); + for (uint32_t j = 0; j < nHitsDevice; j++) { + if (soa2dDevice.detectorIndex(j) == indHost) { + float dx = xLocalHost - soa2dDevice[j].xLocal(); + float dy = yLocalHost - soa2dDevice[j].yLocal(); float distance = dx * dx + dy * dy; if (distance < minD) { minD = distance; @@ -136,46 +139,46 @@ void SiPixelPhase1CompareRecHitsSoA::analyze(const edm::Event& iEvent, const edm } } } - DetId id = detIds[indCPU]; - uint32_t chargeCPU = soa2dCPU->charge(i); - int16_t sizeXCPU = std::ceil(float(std::abs(soa2dCPU->clusterSizeX(i)) / 8.)); - int16_t sizeYCPU = std::ceil(float(std::abs(soa2dCPU->clusterSizeY(i)) / 8.)); - uint32_t chargeGPU = 0; - int16_t sizeXGPU = -99; - int16_t sizeYGPU = -99; - float xLocalGPU = -999.; - float yLocalGPU = -999.; + DetId id = detIds[indHost]; + uint32_t chargeHost 
= soa2dHost[i].chargeAndStatus().charge; + int16_t sizeXHost = std::ceil(float(std::abs(soa2dHost[i].clusterSizeX()) / 8.)); + int16_t sizeYHost = std::ceil(float(std::abs(soa2dHost[i].clusterSizeY()) / 8.)); + uint32_t chargeDevice = 0; + int16_t sizeXDevice = -99; + int16_t sizeYDevice = -99; + float xLocalDevice = -999.; + float yLocalDevice = -999.; if (matchedHit != invalidHit_) { - chargeGPU = soa2dGPU->charge(matchedHit); - sizeXGPU = std::ceil(float(std::abs(soa2dGPU->clusterSizeX(matchedHit)) / 8.)); - sizeYGPU = std::ceil(float(std::abs(soa2dGPU->clusterSizeY(matchedHit)) / 8.)); - xLocalGPU = soa2dGPU->xLocal(matchedHit); - yLocalGPU = soa2dGPU->yLocal(matchedHit); + chargeDevice = soa2dDevice[matchedHit].chargeAndStatus().charge; + sizeXDevice = std::ceil(float(std::abs(soa2dDevice[matchedHit].clusterSizeX()) / 8.)); + sizeYDevice = std::ceil(float(std::abs(soa2dDevice[matchedHit].clusterSizeY()) / 8.)); + xLocalDevice = soa2dDevice[matchedHit].xLocal(); + yLocalDevice = soa2dDevice[matchedHit].yLocal(); } switch (id.subdetId()) { case PixelSubdetector::PixelBarrel: - hBchargeL_[tTopo_->pxbLayer(id) - 1]->Fill(chargeCPU, chargeGPU); - hBsizexL_[tTopo_->pxbLayer(id) - 1]->Fill(sizeXCPU, sizeXGPU); - hBsizeyL_[tTopo_->pxbLayer(id) - 1]->Fill(sizeYCPU, sizeYGPU); - hBposxL_[tTopo_->pxbLayer(id) - 1]->Fill(xLocalCPU, xLocalGPU); - hBposyL_[tTopo_->pxbLayer(id) - 1]->Fill(yLocalCPU, yLocalGPU); - hBchargeDiff_->Fill(chargeCPU - chargeGPU); - hBsizeXDiff_->Fill(sizeXCPU - sizeXGPU); - hBsizeYDiff_->Fill(sizeYCPU - sizeYGPU); - hBposXDiff_->Fill(micron_ * (xLocalCPU - xLocalGPU)); - hBposYDiff_->Fill(micron_ * (yLocalCPU - yLocalGPU)); + hBchargeL_[tTopo_->pxbLayer(id) - 1]->Fill(chargeHost, chargeDevice); + hBsizexL_[tTopo_->pxbLayer(id) - 1]->Fill(sizeXHost, sizeXDevice); + hBsizeyL_[tTopo_->pxbLayer(id) - 1]->Fill(sizeYHost, sizeYDevice); + hBposxL_[tTopo_->pxbLayer(id) - 1]->Fill(xLocalHost, xLocalDevice); + hBposyL_[tTopo_->pxbLayer(id) - 
1]->Fill(yLocalHost, yLocalDevice); + hBchargeDiff_->Fill(chargeHost - chargeDevice); + hBsizeXDiff_->Fill(sizeXHost - sizeXDevice); + hBsizeYDiff_->Fill(sizeYHost - sizeYDevice); + hBposXDiff_->Fill(micron_ * (xLocalHost - xLocalDevice)); + hBposYDiff_->Fill(micron_ * (yLocalHost - yLocalDevice)); break; case PixelSubdetector::PixelEndcap: - hFchargeD_[tTopo_->pxfSide(id) - 1][tTopo_->pxfDisk(id) - 1]->Fill(chargeCPU, chargeGPU); - hFsizexD_[tTopo_->pxfSide(id) - 1][tTopo_->pxfDisk(id) - 1]->Fill(sizeXCPU, sizeXGPU); - hFsizeyD_[tTopo_->pxfSide(id) - 1][tTopo_->pxfDisk(id) - 1]->Fill(sizeYCPU, sizeYGPU); - hFposxD_[tTopo_->pxfSide(id) - 1][tTopo_->pxfDisk(id) - 1]->Fill(xLocalCPU, xLocalGPU); - hFposyD_[tTopo_->pxfSide(id) - 1][tTopo_->pxfDisk(id) - 1]->Fill(yLocalCPU, yLocalGPU); - hFchargeDiff_->Fill(chargeCPU - chargeGPU); - hFsizeXDiff_->Fill(sizeXCPU - sizeXGPU); - hFsizeYDiff_->Fill(sizeYCPU - sizeYGPU); - hFposXDiff_->Fill(micron_ * (xLocalCPU - xLocalGPU)); - hFposYDiff_->Fill(micron_ * (yLocalCPU - yLocalGPU)); + hFchargeD_[tTopo_->pxfSide(id) - 1][tTopo_->pxfDisk(id) - 1]->Fill(chargeHost, chargeDevice); + hFsizexD_[tTopo_->pxfSide(id) - 1][tTopo_->pxfDisk(id) - 1]->Fill(sizeXHost, sizeXDevice); + hFsizeyD_[tTopo_->pxfSide(id) - 1][tTopo_->pxfDisk(id) - 1]->Fill(sizeYHost, sizeYDevice); + hFposxD_[tTopo_->pxfSide(id) - 1][tTopo_->pxfDisk(id) - 1]->Fill(xLocalHost, xLocalDevice); + hFposyD_[tTopo_->pxfSide(id) - 1][tTopo_->pxfDisk(id) - 1]->Fill(yLocalHost, yLocalDevice); + hFchargeDiff_->Fill(chargeHost - chargeDevice); + hFsizeXDiff_->Fill(sizeXHost - sizeXDevice); + hFsizeYDiff_->Fill(sizeYHost - sizeYDevice); + hFposXDiff_->Fill(micron_ * (xLocalHost - xLocalDevice)); + hFposYDiff_->Fill(micron_ * (yLocalHost - yLocalDevice)); break; } } @@ -192,46 +195,46 @@ void SiPixelPhase1CompareRecHitsSoA::bookHistograms(DQMStore::IBooker& iBook, // clang-format off //Global - hnHits_ = iBook.book2I("nHits", "CPUvsGPU RecHits per event;#CPU RecHits;#GPU 
RecHits", 200, 0, 5000,200, 0, 5000); + hnHits_ = iBook.book2I("nHits", "HostvsDevice RecHits per event;#Host RecHits;#Device RecHits", 200, 0, 5000,200, 0, 5000); //Barrel Layer for(unsigned int il=0;ilnumberOfLayers(PixelSubdetector::PixelBarrel);il++){ - hBchargeL_[il] = iBook.book2I(Form("recHitsBLay%dCharge",il+1), Form("CPUvsGPU RecHits Charge Barrel Layer%d;CPU Charge;GPU Charge",il+1), 250, 0, 100000, 250, 0, 100000); - hBsizexL_[il] = iBook.book2I(Form("recHitsBLay%dSizex",il+1), Form("CPUvsGPU RecHits SizeX Barrel Layer%d;CPU SizeX;GPU SizeX",il+1), 30, 0, 30, 30, 0, 30); - hBsizeyL_[il] = iBook.book2I(Form("recHitsBLay%dSizey",il+1), Form("CPUvsGPU RecHits SizeY Barrel Layer%d;CPU SizeY;GPU SizeY",il+1), 30, 0, 30, 30, 0, 30); - hBposxL_[il] = iBook.book2D(Form("recHitsBLay%dPosx",il+1), Form("CPUvsGPU RecHits x-pos in Barrel Layer%d;CPU pos x;GPU pos x",il+1), 200, -5, 5, 200,-5,5); - hBposyL_[il] = iBook.book2D(Form("recHitsBLay%dPosy",il+1), Form("CPUvsGPU RecHits y-pos in Barrel Layer%d;CPU pos y;GPU pos y",il+1), 200, -5, 5, 200,-5,5); + hBchargeL_[il] = iBook.book2I(Form("recHitsBLay%dCharge",il+1), Form("HostvsDevice RecHits Charge Barrel Layer%d;Host Charge;Device Charge",il+1), 250, 0, 100000, 250, 0, 100000); + hBsizexL_[il] = iBook.book2I(Form("recHitsBLay%dSizex",il+1), Form("HostvsDevice RecHits SizeX Barrel Layer%d;Host SizeX;Device SizeX",il+1), 30, 0, 30, 30, 0, 30); + hBsizeyL_[il] = iBook.book2I(Form("recHitsBLay%dSizey",il+1), Form("HostvsDevice RecHits SizeY Barrel Layer%d;Host SizeY;Device SizeY",il+1), 30, 0, 30, 30, 0, 30); + hBposxL_[il] = iBook.book2D(Form("recHitsBLay%dPosx",il+1), Form("HostvsDevice RecHits x-pos in Barrel Layer%d;Host pos x;Device pos x",il+1), 200, -5, 5, 200,-5,5); + hBposyL_[il] = iBook.book2D(Form("recHitsBLay%dPosy",il+1), Form("HostvsDevice RecHits y-pos in Barrel Layer%d;Host pos y;Device pos y",il+1), 200, -5, 5, 200,-5,5); } //Endcaps //Endcaps Disk for(int is=0;is<2;is++){ int sign=is==0? 
-1:1; for(unsigned int id=0;idnumberOfLayers(PixelSubdetector::PixelEndcap);id++){ - hFchargeD_[is][id] = iBook.book2I(Form("recHitsFDisk%+dCharge",id*sign+sign), Form("CPUvsGPU RecHits Charge Endcaps Disk%+d;CPU Charge;GPU Charge",id*sign+sign), 250, 0, 100000, 250, 0, 100000); - hFsizexD_[is][id] = iBook.book2I(Form("recHitsFDisk%+dSizex",id*sign+sign), Form("CPUvsGPU RecHits SizeX Endcaps Disk%+d;CPU SizeX;GPU SizeX",id*sign+sign), 30, 0, 30, 30, 0, 30); - hFsizeyD_[is][id] = iBook.book2I(Form("recHitsFDisk%+dSizey",id*sign+sign), Form("CPUvsGPU RecHits SizeY Endcaps Disk%+d;CPU SizeY;GPU SizeY",id*sign+sign), 30, 0, 30, 30, 0, 30); - hFposxD_[is][id] = iBook.book2D(Form("recHitsFDisk%+dPosx",id*sign+sign), Form("CPUvsGPU RecHits x-pos Endcaps Disk%+d;CPU pos x;GPU pos x",id*sign+sign), 200, -5, 5, 200, -5, 5); - hFposyD_[is][id] = iBook.book2D(Form("recHitsFDisk%+dPosy",id*sign+sign), Form("CPUvsGPU RecHits y-pos Endcaps Disk%+d;CPU pos y;GPU pos y",id*sign+sign), 200, -5, 5, 200, -5, 5); + hFchargeD_[is][id] = iBook.book2I(Form("recHitsFDisk%+dCharge",id*sign+sign), Form("HostvsDevice RecHits Charge Endcaps Disk%+d;Host Charge;Device Charge",id*sign+sign), 250, 0, 100000, 250, 0, 100000); + hFsizexD_[is][id] = iBook.book2I(Form("recHitsFDisk%+dSizex",id*sign+sign), Form("HostvsDevice RecHits SizeX Endcaps Disk%+d;Host SizeX;Device SizeX",id*sign+sign), 30, 0, 30, 30, 0, 30); + hFsizeyD_[is][id] = iBook.book2I(Form("recHitsFDisk%+dSizey",id*sign+sign), Form("HostvsDevice RecHits SizeY Endcaps Disk%+d;Host SizeY;Device SizeY",id*sign+sign), 30, 0, 30, 30, 0, 30); + hFposxD_[is][id] = iBook.book2D(Form("recHitsFDisk%+dPosx",id*sign+sign), Form("HostvsDevice RecHits x-pos Endcaps Disk%+d;Host pos x;Device pos x",id*sign+sign), 200, -5, 5, 200, -5, 5); + hFposyD_[is][id] = iBook.book2D(Form("recHitsFDisk%+dPosy",id*sign+sign), Form("HostvsDevice RecHits y-pos Endcaps Disk%+d;Host pos y;Device pos y",id*sign+sign), 200, -5, 5, 200, -5, 5); } } //1D differences - 
hBchargeDiff_ = iBook.book1D("rechitChargeDiffBpix","Charge differnce of rechits in BPix; rechit charge difference (CPU - GPU)", 101, -50.5, 50.5); - hFchargeDiff_ = iBook.book1D("rechitChargeDiffFpix","Charge differnce of rechits in FPix; rechit charge difference (CPU - GPU)", 101, -50.5, 50.5); - hBsizeXDiff_ = iBook.book1D("rechitsizeXDiffBpix","SizeX difference of rechits in BPix; rechit sizex difference (CPU - GPU)", 21, -10.5, 10.5); - hFsizeXDiff_ = iBook.book1D("rechitsizeXDiffFpix","SizeX difference of rechits in FPix; rechit sizex difference (CPU - GPU)", 21, -10.5, 10.5); - hBsizeYDiff_ = iBook.book1D("rechitsizeYDiffBpix","SizeY difference of rechits in BPix; rechit sizey difference (CPU - GPU)", 21, -10.5, 10.5); - hFsizeYDiff_ = iBook.book1D("rechitsizeYDiffFpix","SizeY difference of rechits in FPix; rechit sizey difference (CPU - GPU)", 21, -10.5, 10.5); - hBposXDiff_ = iBook.book1D("rechitsposXDiffBpix","x-position difference of rechits in BPix; rechit x-pos difference (CPU - GPU)", 1000, -10, 10); - hFposXDiff_ = iBook.book1D("rechitsposXDiffFpix","x-position difference of rechits in FPix; rechit x-pos difference (CPU - GPU)", 1000, -10, 10); - hBposYDiff_ = iBook.book1D("rechitsposYDiffBpix","y-position difference of rechits in BPix; rechit y-pos difference (CPU - GPU)", 1000, -10, 10); - hFposYDiff_ = iBook.book1D("rechitsposYDiffFpix","y-position difference of rechits in FPix; rechit y-pos difference (CPU - GPU)", 1000, -10, 10); + hBchargeDiff_ = iBook.book1D("rechitChargeDiffBpix","Charge differnce of rechits in BPix; rechit charge difference (Host - Device)", 101, -50.5, 50.5); + hFchargeDiff_ = iBook.book1D("rechitChargeDiffFpix","Charge differnce of rechits in FPix; rechit charge difference (Host - Device)", 101, -50.5, 50.5); + hBsizeXDiff_ = iBook.book1D("rechitsizeXDiffBpix","SizeX difference of rechits in BPix; rechit sizex difference (Host - Device)", 21, -10.5, 10.5); + hFsizeXDiff_ = iBook.book1D("rechitsizeXDiffFpix","SizeX 
difference of rechits in FPix; rechit sizex difference (Host - Device)", 21, -10.5, 10.5); + hBsizeYDiff_ = iBook.book1D("rechitsizeYDiffBpix","SizeY difference of rechits in BPix; rechit sizey difference (Host - Device)", 21, -10.5, 10.5); + hFsizeYDiff_ = iBook.book1D("rechitsizeYDiffFpix","SizeY difference of rechits in FPix; rechit sizey difference (Host - Device)", 21, -10.5, 10.5); + hBposXDiff_ = iBook.book1D("rechitsposXDiffBpix","x-position difference of rechits in BPix; rechit x-pos difference (Host - Device)", 1000, -10, 10); + hFposXDiff_ = iBook.book1D("rechitsposXDiffFpix","x-position difference of rechits in FPix; rechit x-pos difference (Host - Device)", 1000, -10, 10); + hBposYDiff_ = iBook.book1D("rechitsposYDiffBpix","y-position difference of rechits in BPix; rechit y-pos difference (Host - Device)", 1000, -10, 10); + hFposYDiff_ = iBook.book1D("rechitsposYDiffFpix","y-position difference of rechits in FPix; rechit y-pos difference (Host - Device)", 1000, -10, 10); } void SiPixelPhase1CompareRecHitsSoA::fillDescriptions(edm::ConfigurationDescriptions& descriptions) { // monitorpixelRecHitsSoA edm::ParameterSetDescription desc; - desc.add("pixelHitsSrcCPU", edm::InputTag("siPixelRecHitsPreSplittingSoA@cpu")); + desc.add("pixelHitsSrcCPU", edm::InputTag("siPixelRecHitsPreSplittingSoA@Host")); desc.add("pixelHitsSrcGPU", edm::InputTag("siPixelRecHitsPreSplittingSoA@cuda")); - desc.add("topFolderName", "SiPixelHeterogeneous/PixelRecHitsCompareGPUvsCPU"); + desc.add("topFolderName", "SiPixelHeterogeneous/PixelRecHitsCompareDevicevsHost"); desc.add("minD2cut", 0.0001); descriptions.addWithDefaultLabel(desc); } diff --git a/DQM/SiPixelPhase1Heterogeneous/plugins/SiPixelPhase1CompareTrackSoA.cc b/DQM/SiPixelPhase1Heterogeneous/plugins/SiPixelPhase1CompareTrackSoA.cc index 915c2ac1399f5..966a3b310b3f4 100644 --- a/DQM/SiPixelPhase1Heterogeneous/plugins/SiPixelPhase1CompareTrackSoA.cc +++ 
b/DQM/SiPixelPhase1Heterogeneous/plugins/SiPixelPhase1CompareTrackSoA.cc @@ -20,7 +20,8 @@ #include "DQMServices/Core/interface/MonitorElement.h" #include "DQMServices/Core/interface/DQMEDAnalyzer.h" #include "DQMServices/Core/interface/DQMStore.h" -#include "CUDADataFormats/Track/interface/PixelTrackHeterogeneous.h" +#include "CUDADataFormats/Track/interface/TrackSoAHeterogeneousHost.h" +#include "CUDADataFormats/Track/interface/TrackSoAHeterogeneousDevice.h" // for string manipulations #include @@ -64,7 +65,7 @@ namespace { class SiPixelPhase1CompareTrackSoA : public DQMEDAnalyzer { public: - using PixelTrackSoAPhase1 = PixelTrackHeterogeneousT; + using TrackSoAPhase1 = TrackSoAHeterogeneousHost; explicit SiPixelPhase1CompareTrackSoA(const edm::ParameterSet&); ~SiPixelPhase1CompareTrackSoA() override = default; @@ -73,11 +74,11 @@ class SiPixelPhase1CompareTrackSoA : public DQMEDAnalyzer { static void fillDescriptions(edm::ConfigurationDescriptions& descriptions); private: - const edm::EDGetTokenT tokenSoATrackCPU_; - const edm::EDGetTokenT tokenSoATrackGPU_; + const edm::EDGetTokenT tokenSoATrackCPU_; + const edm::EDGetTokenT tokenSoATrackGPU_; const std::string topFolderName_; const bool useQualityCut_; - const pixelTrack::Quality minQuality_; + const pixelTrackSoA::Quality minQuality_; const float dr2cut_; MonitorElement* hnTracks_; MonitorElement* hnLooseAndAboveTracks_; @@ -115,17 +116,18 @@ class SiPixelPhase1CompareTrackSoA : public DQMEDAnalyzer { // SiPixelPhase1CompareTrackSoA::SiPixelPhase1CompareTrackSoA(const edm::ParameterSet& iConfig) - : tokenSoATrackCPU_(consumes(iConfig.getParameter("pixelTrackSrcCPU"))), - tokenSoATrackGPU_(consumes(iConfig.getParameter("pixelTrackSrcGPU"))), + : tokenSoATrackCPU_(consumes(iConfig.getParameter("pixelTrackSrcCPU"))), + tokenSoATrackGPU_(consumes(iConfig.getParameter("pixelTrackSrcGPU"))), topFolderName_(iConfig.getParameter("topFolderName")), useQualityCut_(iConfig.getParameter("useQualityCut")), - 
minQuality_(pixelTrack::qualityByName(iConfig.getParameter("minQuality"))), + minQuality_(pixelTrackSoA::qualityByName(iConfig.getParameter("minQuality"))), dr2cut_(iConfig.getParameter("deltaR2cut")) {} // // -- Analyze // void SiPixelPhase1CompareTrackSoA::analyze(const edm::Event& iEvent, const edm::EventSetup& iSetup) { + using helper = tracksUtilities; const auto& tsoaHandleCPU = iEvent.getHandle(tokenSoATrackCPU_); const auto& tsoaHandleGPU = iEvent.getHandle(tokenSoATrackGPU_); if (not tsoaHandleCPU or not tsoaHandleGPU) { @@ -140,12 +142,12 @@ void SiPixelPhase1CompareTrackSoA::analyze(const edm::Event& iEvent, const edm:: return; } - auto const& tsoaCPU = *tsoaHandleCPU->get(); - auto const& tsoaGPU = *tsoaHandleGPU->get(); - auto maxTracksCPU = tsoaCPU.stride(); //this should be same for both? - auto maxTracksGPU = tsoaGPU.stride(); //this should be same for both? - auto const* qualityCPU = tsoaCPU.qualityData(); - auto const* qualityGPU = tsoaGPU.qualityData(); + auto& tsoaCPU = *tsoaHandleCPU.product(); + auto& tsoaGPU = *tsoaHandleGPU.product(); + auto maxTracksCPU = tsoaCPU.view().metadata().size(); //this should be same for both? + auto maxTracksGPU = tsoaGPU.view().metadata().size(); //this should be same for both? + auto const* qualityCPU = tsoaCPU.view().quality(); + auto const* qualityGPU = tsoaGPU.view().quality(); int32_t nTracksCPU = 0; int32_t nTracksGPU = 0; int32_t nLooseAndAboveTracksCPU = 0; @@ -155,9 +157,9 @@ void SiPixelPhase1CompareTrackSoA::analyze(const edm::Event& iEvent, const edm:: //Loop over GPU tracks and store the indices of the loose tracks. Whats happens if useQualityCut_ is false? 
std::vector looseTrkidxGPU; for (int32_t jt = 0; jt < maxTracksGPU; ++jt) { - if (tsoaGPU.nHits(jt) == 0) + if (helper::nHits(tsoaGPU.view(), jt) == 0) break; // this is a guard - if (!(tsoaGPU.pt(jt) > 0.)) + if (!(tsoaGPU.view()[jt].pt() > 0.)) continue; nTracksGPU++; if (useQualityCut_ && qualityGPU[jt] < minQuality_) @@ -168,9 +170,21 @@ void SiPixelPhase1CompareTrackSoA::analyze(const edm::Event& iEvent, const edm:: //Now loop over CPU tracks//nested loop for loose gPU tracks for (int32_t it = 0; it < maxTracksCPU; ++it) { - if (tsoaCPU.nHits(it) == 0) + + float chi2CPU = tsoaCPU.view()[it].chi2(); + int nHitsCPU = helper::nHits(tsoaCPU.view(), it); + + if (nHitsCPU == 0) break; // this is a guard - if (!(tsoaCPU.pt(it) > 0.)) + + int8_t nLayersCPU = tsoaCPU.view()[it].nLayers(); + float ptCPU = tsoaCPU.view()[it].pt(); + float etaCPU = tsoaCPU.view()[it].eta(); + float phiCPU = helper::phi(tsoaCPU.view(), it); + float zipCPU = helper::zip(tsoaCPU.view(), it); + float tipCPU = helper::tip(tsoaCPU.view(), it); + + if (!(ptCPU > 0.)) continue; nTracksCPU++; if (useQualityCut_ && qualityCPU[it] < minQuality_) @@ -180,12 +194,11 @@ void SiPixelPhase1CompareTrackSoA::analyze(const edm::Event& iEvent, const edm:: const int32_t notFound = -1; int32_t closestTkidx = notFound; float mindr2 = dr2cut_; - float etacpu = tsoaCPU.eta(it); - float phicpu = tsoaCPU.phi(it); + for (auto gid : looseTrkidxGPU) { - float etagpu = tsoaGPU.eta(gid); - float phigpu = tsoaGPU.phi(gid); - float dr2 = reco::deltaR2(etacpu, phicpu, etagpu, phigpu); + float etaGPU = tsoaGPU.view()[gid].eta(); + float phiGPU = helper::phi(tsoaGPU.view(), gid); + float dr2 = reco::deltaR2(etaCPU, phiCPU, etaGPU, phiGPU); if (dr2 > dr2cut_) continue; // this is arbitrary if (mindr2 > dr2) { @@ -194,27 +207,36 @@ void SiPixelPhase1CompareTrackSoA::analyze(const edm::Event& iEvent, const edm:: } } - hpt_eta_tkAllCPU_->Fill(etacpu, tsoaCPU.pt(it)); //all CPU tk - hphi_z_tkAllCPU_->Fill(phicpu, 
tsoaCPU.zip(it)); + hpt_eta_tkAllCPU_->Fill(etaCPU, ptCPU); //all CPU tk + hphi_z_tkAllCPU_->Fill(phiCPU, zipCPU); if (closestTkidx == notFound) continue; nLooseAndAboveTracksCPU_matchedGPU++; - hchi2_->Fill(tsoaCPU.chi2(it), tsoaGPU.chi2(closestTkidx)); - hnHits_->Fill(tsoaCPU.nHits(it), tsoaGPU.nHits(closestTkidx)); - hnLayers_->Fill(tsoaCPU.nLayers(it), tsoaGPU.nLayers(closestTkidx)); - hpt_->Fill(tsoaCPU.pt(it), tsoaGPU.pt(closestTkidx)); - hptLogLog_->Fill(tsoaCPU.pt(it), tsoaGPU.pt(closestTkidx)); - heta_->Fill(etacpu, tsoaGPU.eta(closestTkidx)); - hphi_->Fill(phicpu, tsoaGPU.phi(closestTkidx)); - hz_->Fill(tsoaCPU.zip(it), tsoaGPU.zip(closestTkidx)); - htip_->Fill(tsoaCPU.tip(it), tsoaGPU.tip(closestTkidx)); - hptdiffMatched_->Fill(tsoaCPU.pt(it) - tsoaGPU.pt(closestTkidx)); - hetadiffMatched_->Fill(etacpu - tsoaGPU.eta(closestTkidx)); - hphidiffMatched_->Fill(reco::deltaPhi(phicpu, tsoaGPU.phi(closestTkidx))); - hzdiffMatched_->Fill(tsoaCPU.zip(it) - tsoaGPU.zip(closestTkidx)); - hpt_eta_tkAllCPUMatched_->Fill(etacpu, tsoaCPU.pt(it)); //matched to gpu - hphi_z_tkAllCPUMatched_->Fill(phicpu, tsoaCPU.zip(it)); + float chi2GPU = tsoaGPU.view()[closestTkidx].chi2(); + int nHitsGPU = helper::nHits(tsoaGPU.view(), closestTkidx); + int8_t nLayersGPU = tsoaGPU.view()[closestTkidx].nLayers(); + float ptGPU = tsoaGPU.view()[closestTkidx].pt(); + float etaGPU = tsoaGPU.view()[closestTkidx].eta(); + float phiGPU = helper::phi(tsoaGPU.view(), closestTkidx); + float zipGPU = helper::zip(tsoaGPU.view(), closestTkidx); + float tipGPU = helper::tip(tsoaGPU.view(), closestTkidx); + + hchi2_->Fill(chi2CPU, chi2GPU); + hnHits_->Fill(nHitsCPU, nHitsGPU); + hnLayers_->Fill(nLayersCPU, nLayersGPU); + hpt_->Fill(ptCPU, ptCPU); + hptLogLog_->Fill(ptCPU, ptGPU); + heta_->Fill(etaCPU, etaGPU); + hphi_->Fill(phiCPU, phiGPU); + hz_->Fill(zipCPU, zipGPU); + htip_->Fill(tipCPU, tipGPU); + hptdiffMatched_->Fill(ptCPU - ptGPU); + hetadiffMatched_->Fill(etaCPU - etaGPU); + 
hphidiffMatched_->Fill(reco::deltaPhi(phiCPU, phiGPU)); + hzdiffMatched_->Fill(zipCPU - zipGPU); + hpt_eta_tkAllCPUMatched_->Fill(etaCPU, ptCPU); //matched to gpu + hphi_z_tkAllCPUMatched_->Fill(phiCPU, zipCPU); } hnTracks_->Fill(nTracksCPU, nTracksGPU); hnLooseAndAboveTracks_->Fill(nLooseAndAboveTracksCPU, nLooseAndAboveTracksGPU); diff --git a/DQM/SiPixelPhase1Heterogeneous/plugins/SiPixelPhase1CompareVertexSoA.cc b/DQM/SiPixelPhase1Heterogeneous/plugins/SiPixelPhase1CompareVertexSoA.cc index 0113ea50973d8..c91d8dcbb593e 100644 --- a/DQM/SiPixelPhase1Heterogeneous/plugins/SiPixelPhase1CompareVertexSoA.cc +++ b/DQM/SiPixelPhase1Heterogeneous/plugins/SiPixelPhase1CompareVertexSoA.cc @@ -2,7 +2,7 @@ // Package: SiPixelPhase1CompareVertexSoA // Class: SiPixelPhase1CompareVertexSoA // -/**\class SiPixelPhase1CompareVertexSoA SiPixelPhase1CompareVertexSoA.cc +/**\class SiPixelPhase1CompareVertexSoA SiPixelPhase1CompareVertexSoA.cc */ // // Author: Suvankar Roy Chowdhury @@ -18,7 +18,7 @@ #include "DQMServices/Core/interface/MonitorElement.h" #include "DQMServices/Core/interface/DQMEDAnalyzer.h" #include "DQMServices/Core/interface/DQMStore.h" -#include "CUDADataFormats/Vertex/interface/ZVertexHeterogeneous.h" +#include "CUDADataFormats/Vertex/interface/ZVertexSoAHeterogeneousHost.h" #include "DataFormats/BeamSpot/interface/BeamSpot.h" class SiPixelPhase1CompareVertexSoA : public DQMEDAnalyzer { @@ -31,8 +31,8 @@ class SiPixelPhase1CompareVertexSoA : public DQMEDAnalyzer { static void fillDescriptions(edm::ConfigurationDescriptions& descriptions); private: - const edm::EDGetTokenT tokenSoAVertexCPU_; - const edm::EDGetTokenT tokenSoAVertexGPU_; + const edm::EDGetTokenT tokenSoAVertexCPU_; + const edm::EDGetTokenT tokenSoAVertexGPU_; const edm::EDGetTokenT tokenBeamSpot_; const std::string topFolderName_; const float dzCut_; @@ -53,9 +53,10 @@ class SiPixelPhase1CompareVertexSoA : public DQMEDAnalyzer { // constructors // +// Note tokenSoAVertexGPU_ contains data copied 
from device to host, hence is a HostCollection SiPixelPhase1CompareVertexSoA::SiPixelPhase1CompareVertexSoA(const edm::ParameterSet& iConfig) - : tokenSoAVertexCPU_(consumes(iConfig.getParameter("pixelVertexSrcCPU"))), - tokenSoAVertexGPU_(consumes(iConfig.getParameter("pixelVertexSrcGPU"))), + : tokenSoAVertexCPU_(consumes(iConfig.getParameter("pixelVertexSrcCPU"))), + tokenSoAVertexGPU_(consumes(iConfig.getParameter("pixelVertexSrcGPU"))), tokenBeamSpot_(consumes(iConfig.getParameter("beamSpotSrc"))), topFolderName_(iConfig.getParameter("topFolderName")), dzCut_(iConfig.getParameter("dzCut")) {} @@ -78,10 +79,10 @@ void SiPixelPhase1CompareVertexSoA::analyze(const edm::Event& iEvent, const edm: return; } - auto const& vsoaCPU = *vsoaHandleCPU->get(); - int nVerticesCPU = vsoaCPU.nvFinal; - auto const& vsoaGPU = *vsoaHandleGPU->get(); - int nVerticesGPU = vsoaGPU.nvFinal; + auto& vsoaCPU = *vsoaHandleCPU; + int nVerticesCPU = vsoaCPU.view().nvFinal(); + auto& vsoaGPU = *vsoaHandleGPU; + int nVerticesGPU = vsoaGPU.view().nvFinal(); auto bsHandle = iEvent.getHandle(tokenBeamSpot_); float x0 = 0., y0 = 0., z0 = 0., dxdz = 0., dydz = 0.; @@ -97,22 +98,22 @@ void SiPixelPhase1CompareVertexSoA::analyze(const edm::Event& iEvent, const edm: } for (int ivc = 0; ivc < nVerticesCPU; ivc++) { - auto sic = vsoaCPU.sortInd[ivc]; - auto zc = vsoaCPU.zv[sic]; + auto sic = vsoaCPU.view()[ivc].sortInd(); + auto zc = vsoaCPU.view()[sic].zv(); auto xc = x0 + dxdz * zc; auto yc = y0 + dydz * zc; zc += z0; - auto ndofCPU = vsoaCPU.ndof[sic]; - auto chi2CPU = vsoaCPU.chi2[sic]; + auto ndofCPU = vsoaCPU.view()[sic].ndof(); + auto chi2CPU = vsoaCPU.view()[sic].chi2(); const int32_t notFound = -1; int32_t closestVtxidx = notFound; float mindz = dzCut_; for (int ivg = 0; ivg < nVerticesGPU; ivg++) { - auto sig = vsoaGPU.sortInd[ivg]; - auto zgc = vsoaGPU.zv[sig] + z0; + auto sig = vsoaGPU.view()[ivg].sortInd(); + auto zgc = vsoaGPU.view()[sig].zv() + z0; auto zDist = std::abs(zc - zgc); 
//insert some matching condition if (zDist > dzCut_) @@ -125,12 +126,12 @@ void SiPixelPhase1CompareVertexSoA::analyze(const edm::Event& iEvent, const edm: if (closestVtxidx == notFound) continue; - auto zg = vsoaGPU.zv[closestVtxidx]; + auto zg = vsoaGPU.view()[closestVtxidx].zv(); auto xg = x0 + dxdz * zg; auto yg = y0 + dydz * zg; zg += z0; - auto ndofGPU = vsoaGPU.ndof[closestVtxidx]; - auto chi2GPU = vsoaGPU.chi2[closestVtxidx]; + auto ndofGPU = vsoaGPU.view()[closestVtxidx].ndof(); + auto chi2GPU = vsoaGPU.view()[closestVtxidx].chi2(); hx_->Fill(xc - x0, xg - x0); hy_->Fill(yc - y0, yg - y0); @@ -140,7 +141,7 @@ void SiPixelPhase1CompareVertexSoA::analyze(const edm::Event& iEvent, const edm: hzdiff_->Fill(zc - zg); hchi2_->Fill(chi2CPU, chi2GPU); hchi2oNdof_->Fill(chi2CPU / ndofCPU, chi2GPU / ndofGPU); - hptv2_->Fill(vsoaCPU.ptv2[sic], vsoaGPU.ptv2[closestVtxidx]); + hptv2_->Fill(vsoaCPU.view()[sic].ptv2(), vsoaGPU.view()[closestVtxidx].ptv2()); hntrks_->Fill(ndofCPU + 1, ndofGPU + 1); } hnVertex_->Fill(nVerticesCPU, nVerticesGPU); diff --git a/DQM/SiPixelPhase1Heterogeneous/plugins/SiPixelPhase1MonitorRecHitsSoA.cc b/DQM/SiPixelPhase1Heterogeneous/plugins/SiPixelPhase1MonitorRecHitsSoA.cc index 231186f88e53f..e5d9a1adfaed5 100644 --- a/DQM/SiPixelPhase1Heterogeneous/plugins/SiPixelPhase1MonitorRecHitsSoA.cc +++ b/DQM/SiPixelPhase1Heterogeneous/plugins/SiPixelPhase1MonitorRecHitsSoA.cc @@ -19,7 +19,8 @@ #include "DQMServices/Core/interface/MonitorElement.h" #include "DQMServices/Core/interface/DQMEDAnalyzer.h" #include "DQMServices/Core/interface/DQMStore.h" -#include "CUDADataFormats/TrackingRecHit/interface/TrackingRecHit2DHeterogeneous.h" +#include "CUDADataFormats/TrackingRecHit/interface/TrackingRecHitSoAHost.h" +#include "CUDADataFormats/TrackingRecHit/interface/TrackingRecHitsUtilities.h" // Geometry #include "Geometry/TrackerGeometryBuilder/interface/TrackerGeometry.h" #include "DataFormats/TrackerCommon/interface/TrackerTopology.h" @@ -30,8 +31,8 @@ 
class SiPixelPhase1MonitorRecHitsSoA : public DQMEDAnalyzer { public: - using HitSoA = TrackingRecHit2DSOAViewT; - using HitsOnCPU = TrackingRecHit2DCPUT; + using HitSoA = HitSoAView; + using HitsOnHost = TrackingRecHitSoAHost; explicit SiPixelPhase1MonitorRecHitsSoA(const edm::ParameterSet&); ~SiPixelPhase1MonitorRecHitsSoA() override = default; @@ -43,7 +44,7 @@ class SiPixelPhase1MonitorRecHitsSoA : public DQMEDAnalyzer { private: const edm::ESGetToken geomToken_; const edm::ESGetToken topoToken_; - const edm::EDGetTokenT tokenSoAHitsCPU_; + const edm::EDGetTokenT tokenSoAHitsCPU_; const std::string topFolderName_; const TrackerGeometry* tkGeom_ = nullptr; const TrackerTopology* tTopo_ = nullptr; @@ -97,21 +98,21 @@ void SiPixelPhase1MonitorRecHitsSoA::analyze(const edm::Event& iEvent, const edm return; } auto const& rhsoa = *rhsoaHandle; - const HitSoA* soa2d = rhsoa.view(); + auto const& soa2d = rhsoa.const_view(); - uint32_t nHits_ = soa2d->nHits(); + uint32_t nHits_ = soa2d.nHits(); hnHits->Fill(nHits_); auto detIds = tkGeom_->detUnitIds(); for (uint32_t i = 0; i < nHits_; i++) { - DetId id = detIds[soa2d->detectorIndex(i)]; - float xG = soa2d->xGlobal(i); - float yG = soa2d->yGlobal(i); - float zG = soa2d->zGlobal(i); - float rG = soa2d->rGlobal(i); - float fphi = short2phi(soa2d->iphi(i)); - uint32_t charge = soa2d->charge(i); - int16_t sizeX = std::ceil(float(std::abs(soa2d->clusterSizeX(i)) / 8.)); - int16_t sizeY = std::ceil(float(std::abs(soa2d->clusterSizeY(i)) / 8.)); + DetId id = detIds[soa2d[i].detectorIndex()]; + float xG = soa2d[i].xGlobal(); + float yG = soa2d[i].yGlobal(); + float zG = soa2d[i].zGlobal(); + float rG = soa2d[i].rGlobal(); + float fphi = short2phi(soa2d[i].iphi()); + uint32_t charge = soa2d[i].chargeAndStatus().charge; + int16_t sizeX = std::ceil(float(std::abs(soa2d[i].clusterSizeX()) / 8.)); + int16_t sizeY = std::ceil(float(std::abs(soa2d[i].clusterSizeY()) / 8.)); hBFposZP->Fill(zG, fphi); int16_t ysign = yG >= 0 ? 
1 : -1; hBFposZR->Fill(zG, rG * ysign); diff --git a/DQM/SiPixelPhase1Heterogeneous/plugins/SiPixelPhase1MonitorTrackSoA.cc b/DQM/SiPixelPhase1Heterogeneous/plugins/SiPixelPhase1MonitorTrackSoA.cc index 5d2545b6cdc9f..3522df7e3cdfe 100644 --- a/DQM/SiPixelPhase1Heterogeneous/plugins/SiPixelPhase1MonitorTrackSoA.cc +++ b/DQM/SiPixelPhase1Heterogeneous/plugins/SiPixelPhase1MonitorTrackSoA.cc @@ -21,13 +21,14 @@ #include "DQMServices/Core/interface/MonitorElement.h" #include "DQMServices/Core/interface/DQMEDAnalyzer.h" #include "DQMServices/Core/interface/DQMStore.h" -#include "CUDADataFormats/Track/interface/PixelTrackHeterogeneous.h" +#include "CUDADataFormats/Track/interface/PixelTrackUtilities.h" +#include "CUDADataFormats/Track/interface/TrackSoAHeterogeneousHost.h" // for string manipulations #include class SiPixelPhase1MonitorTrackSoA : public DQMEDAnalyzer { public: - using PixelTrackHeterogeneousPhase1 = PixelTrackHeterogeneousT; + using TrackSoAPhase1 = TrackSoAHeterogeneousHost; explicit SiPixelPhase1MonitorTrackSoA(const edm::ParameterSet&); ~SiPixelPhase1MonitorTrackSoA() override = default; void bookHistograms(DQMStore::IBooker& ibooker, edm::Run const& iRun, edm::EventSetup const& iSetup) override; @@ -35,10 +36,10 @@ class SiPixelPhase1MonitorTrackSoA : public DQMEDAnalyzer { static void fillDescriptions(edm::ConfigurationDescriptions& descriptions); private: - edm::EDGetTokenT tokenSoATrack_; + edm::EDGetTokenT tokenSoATrack_; std::string topFolderName_; bool useQualityCut_; - pixelTrack::Quality minQuality_; + pixelTrackSoA::Quality minQuality_; MonitorElement* hnTracks; MonitorElement* hnLooseAndAboveTracks; MonitorElement* hnHits; @@ -63,10 +64,10 @@ class SiPixelPhase1MonitorTrackSoA : public DQMEDAnalyzer { // SiPixelPhase1MonitorTrackSoA::SiPixelPhase1MonitorTrackSoA(const edm::ParameterSet& iConfig) { - tokenSoATrack_ = consumes(iConfig.getParameter("pixelTrackSrc")); + tokenSoATrack_ = consumes(iConfig.getParameter("pixelTrackSrc")); 
topFolderName_ = iConfig.getParameter("topFolderName"); //"SiPixelHeterogeneous/PixelTrackSoA"; useQualityCut_ = iConfig.getParameter("useQualityCut"); - minQuality_ = pixelTrack::qualityByName(iConfig.getParameter("minQuality")); + minQuality_ = pixelTrackSoA::qualityByName(iConfig.getParameter("minQuality")); } // @@ -79,23 +80,24 @@ void SiPixelPhase1MonitorTrackSoA::analyze(const edm::Event& iEvent, const edm:: return; } - auto const& tsoa = *((tsoaHandle.product())->get()); - auto maxTracks = tsoa.stride(); - auto const* quality = tsoa.qualityData(); + using helper = tracksUtilities; + auto& tsoa = *tsoaHandle.product(); + auto maxTracks = tsoa.view().metadata().size(); + auto const* quality = tsoa.view().quality(); int32_t nTracks = 0; int32_t nLooseAndAboveTracks = 0; for (int32_t it = 0; it < maxTracks; ++it) { - auto nHits = tsoa.nHits(it); - auto nLayers = tsoa.nLayers(it); + auto nHits = helper::nHits(tsoa.view(), it); + auto nLayers = tsoa.view()[it].nLayers(); if (nHits == 0) break; // this is a guard - float pt = tsoa.pt(it); + float pt = tsoa.view()[it].pt(); if (!(pt > 0.)) continue; // fill the quality for all tracks - pixelTrack::Quality qual = tsoa.quality(it); + pixelTrackSoA::Quality qual = quality[it]; hquality->Fill(int(qual)); nTracks++; @@ -103,11 +105,11 @@ void SiPixelPhase1MonitorTrackSoA::analyze(const edm::Event& iEvent, const edm:: continue; // fill parameters only for quality >= loose - float chi2 = tsoa.chi2(it); - float phi = tsoa.phi(it); - float zip = tsoa.zip(it); - float eta = tsoa.eta(it); - float tip = tsoa.tip(it); + float chi2 = tsoa.view()[it].chi2(); + float phi = helper::phi(tsoa.view(), it); + float zip = helper::zip(tsoa.view(), it); + float eta = tsoa.view()[it].eta(); + float tip = helper::tip(tsoa.view(), it); hchi2->Fill(chi2); hChi2VsPhi->Fill(phi, chi2); @@ -166,7 +168,7 @@ void SiPixelPhase1MonitorTrackSoA::bookHistograms(DQMStore::IBooker& iBook, htip = iBook.book1D("tip", ";Track (quality #geq loose) TIP 
[cm];#tracks", 100, -0.5, 0.5); hquality = iBook.book1D("quality", ";Track Quality;#tracks", 7, -0.5, 6.5); uint i = 1; - for (const auto& q : pixelTrack::qualityName) { + for (const auto& q : pixelTrackSoA::qualityName) { hquality->setBinLabel(i, q, 1); i++; } diff --git a/DQM/SiPixelPhase1Heterogeneous/plugins/SiPixelPhase1MonitorVertexSoA.cc b/DQM/SiPixelPhase1Heterogeneous/plugins/SiPixelPhase1MonitorVertexSoA.cc index 6324cee4372d8..859b1d4577b02 100644 --- a/DQM/SiPixelPhase1Heterogeneous/plugins/SiPixelPhase1MonitorVertexSoA.cc +++ b/DQM/SiPixelPhase1Heterogeneous/plugins/SiPixelPhase1MonitorVertexSoA.cc @@ -21,7 +21,7 @@ #include "DQMServices/Core/interface/MonitorElement.h" #include "DQMServices/Core/interface/DQMEDAnalyzer.h" #include "DQMServices/Core/interface/DQMStore.h" -#include "CUDADataFormats/Vertex/interface/ZVertexHeterogeneous.h" +#include "CUDADataFormats/Vertex/interface/ZVertexSoAHeterogeneousHost.h" #include "DataFormats/BeamSpot/interface/BeamSpot.h" class SiPixelPhase1MonitorVertexSoA : public DQMEDAnalyzer { @@ -34,7 +34,7 @@ class SiPixelPhase1MonitorVertexSoA : public DQMEDAnalyzer { static void fillDescriptions(edm::ConfigurationDescriptions& descriptions); private: - edm::EDGetTokenT tokenSoAVertex_; + edm::EDGetTokenT tokenSoAVertex_; edm::EDGetTokenT tokenBeamSpot_; std::string topFolderName_; MonitorElement* hnVertex; @@ -52,7 +52,7 @@ class SiPixelPhase1MonitorVertexSoA : public DQMEDAnalyzer { // SiPixelPhase1MonitorVertexSoA::SiPixelPhase1MonitorVertexSoA(const edm::ParameterSet& iConfig) { - tokenSoAVertex_ = consumes(iConfig.getParameter("pixelVertexSrc")); + tokenSoAVertex_ = consumes(iConfig.getParameter("pixelVertexSrc")); tokenBeamSpot_ = consumes(iConfig.getParameter("beamSpotSrc")); topFolderName_ = iConfig.getParameter("topFolderName"); } @@ -67,8 +67,8 @@ void SiPixelPhase1MonitorVertexSoA::analyze(const edm::Event& iEvent, const edm: return; } - auto const& vsoa = *((vsoaHandle.product())->get()); - int nVertices = 
vsoa.nvFinal; + auto& vsoa = *vsoaHandle.product(); + int nVertices = vsoa.view().nvFinal(); auto bsHandle = iEvent.getHandle(tokenBeamSpot_); float x0 = 0., y0 = 0., z0 = 0., dxdz = 0., dydz = 0.; if (!bsHandle.isValid()) { @@ -83,8 +83,8 @@ void SiPixelPhase1MonitorVertexSoA::analyze(const edm::Event& iEvent, const edm: } for (int iv = 0; iv < nVertices; iv++) { - auto si = vsoa.sortInd[iv]; - auto z = vsoa.zv[si]; + auto si = vsoa.view()[iv].sortInd(); + auto z = vsoa.view()[si].zv(); auto x = x0 + dxdz * z; auto y = y0 + dydz * z; @@ -92,10 +92,10 @@ void SiPixelPhase1MonitorVertexSoA::analyze(const edm::Event& iEvent, const edm: hx->Fill(x); hy->Fill(y); hz->Fill(z); - auto ndof = vsoa.ndof[si]; - hchi2->Fill(vsoa.chi2[si]); - hchi2oNdof->Fill(vsoa.chi2[si] / ndof); - hptv2->Fill(vsoa.ptv2[si]); + auto ndof = vsoa.view()[si].ndof(); + hchi2->Fill(vsoa.view()[si].chi2()); + hchi2oNdof->Fill(vsoa.view()[si].chi2() / ndof); + hptv2->Fill(vsoa.view()[si].ptv2()); hntrks->Fill(ndof + 1); } hnVertex->Fill(nVertices); diff --git a/EventFilter/SiPixelRawToDigi/plugins/SiPixelDigisSoAFromCUDA.cc b/EventFilter/SiPixelRawToDigi/plugins/SiPixelDigisSoAFromCUDA.cc index 0702bc4830c7c..db349710aa124 100644 --- a/EventFilter/SiPixelRawToDigi/plugins/SiPixelDigisSoAFromCUDA.cc +++ b/EventFilter/SiPixelRawToDigi/plugins/SiPixelDigisSoAFromCUDA.cc @@ -10,6 +10,7 @@ #include "FWCore/ParameterSet/interface/ParameterSet.h" #include "HeterogeneousCore/CUDACore/interface/ScopedContext.h" #include "HeterogeneousCore/CUDAUtilities/interface/host_unique_ptr.h" +#include "CUDADataFormats/Common/interface/PortableHostCollection.h" class SiPixelDigisSoAFromCUDA : public edm::stream::EDProducer { public: @@ -27,7 +28,8 @@ class SiPixelDigisSoAFromCUDA : public edm::stream::EDProducer> digiGetToken_; edm::EDPutTokenT digiPutToken_; - cms::cuda::host::unique_ptr store_; + // cms::cuda::host::unique_ptr store_; + cms::cuda::PortableHostCollection> digis_h_; int nDigis_; }; @@ -48,29 +50,29 @@ 
void SiPixelDigisSoAFromCUDA::acquire(const edm::Event& iEvent, // Do the transfer in a CUDA stream parallel to the computation CUDA stream cms::cuda::ScopedContextAcquire ctx{iEvent.streamID(), std::move(waitingTaskHolder)}; - const auto& gpuDigis = ctx.get(iEvent, digiGetToken_); + // const auto& gpuDigis = ctx.get(iEvent, digiGetToken_); + const auto& digis_d = ctx.get(iEvent, digiGetToken_); - nDigis_ = gpuDigis.nDigis(); - store_ = gpuDigis.copyAllToHostAsync(ctx.stream()); + nDigis_ = digis_d.nDigis(); + // transfer the digis SoA from device to host below + digis_h_ = cms::cuda::PortableHostCollection>(digis_d.view().metadata().size(), ctx.stream()); + cudaCheck(cudaMemcpyAsync(digis_h_.buffer().get(), digis_d.const_buffer().get(), digis_d.bufferSize(), cudaMemcpyDeviceToHost, ctx.stream())); + cudaCheck(cudaGetLastError()); + // store_ = gpuDigis.copyAllToHostAsync(ctx.stream()); } void SiPixelDigisSoAFromCUDA::produce(edm::Event& iEvent, const edm::EventSetup& iSetup) { - // The following line copies the data from the pinned host memory to - // regular host memory. In principle that feels unnecessary (why not - // just use the pinned host memory?). There are a few arguments for - // doing it though - // - Now can release the pinned host memory back to the (caching) allocator - // * if we'd like to keep the pinned memory, we'd need to also - // keep the CUDA stream around as long as that, or allow pinned - // host memory to be allocated without a CUDA stream - // - What if a CPU algorithm would produce the same SoA? We can't - // use cudaMallocHost without a GPU... 
- - auto tmp_view = SiPixelDigisCUDASOAView(store_, nDigis_, SiPixelDigisCUDASOAView::StorageLocationHost::kMAX); - - iEvent.emplace(digiPutToken_, nDigis_, tmp_view.pdigi(), tmp_view.rawIdArr(), tmp_view.adc(), tmp_view.clus()); - - store_.reset(); + iEvent.emplace(digiPutToken_, + nDigis_, + digis_h_.view().pdigi(), + digis_h_.view().rawIdArr(), + digis_h_.view().adc(), + digis_h_.view().clus()); + // auto tmp_view = SiPixelDigisCUDASOAView(store_, nDigis_, SiPixelDigisCUDASOAView::StorageLocationHost::kMAX); + // + // iEvent.emplace(digiPutToken_, nDigis_, tmp_view.pdigi(), tmp_view.rawIdArr(), tmp_view.adc(), tmp_view.clus()); + // + // store_.reset(); } // define as framework plugin diff --git a/Geometry/CommonTopologies/interface/SimplePixelTopology.h b/Geometry/CommonTopologies/interface/SimplePixelTopology.h index c991d09666297..2a3ebee9fbd41 100644 --- a/Geometry/CommonTopologies/interface/SimplePixelTopology.h +++ b/Geometry/CommonTopologies/interface/SimplePixelTopology.h @@ -308,9 +308,9 @@ namespace pixelTopology { static constexpr uint32_t getDoubletsFromHistoMaxBlockSize = 64; // for both x and y static constexpr uint32_t getDoubletsFromHistoMinBlocksPerMP = 16; - static constexpr uint32_t last_bpix1_detIndex = 108; - static constexpr uint32_t last_bpix2_detIndex = 324; - static constexpr uint32_t last_barrel_detIndex = 504; + static constexpr int16_t last_bpix1_detIndex = 108; + static constexpr int16_t last_bpix2_detIndex = 324; + static constexpr int16_t last_barrel_detIndex = 504; static constexpr uint32_t maxPixInModule = 6000; @@ -399,9 +399,9 @@ namespace pixelTopology { static constexpr uint32_t getDoubletsFromHistoMaxBlockSize = 64; // for both x and y static constexpr uint32_t getDoubletsFromHistoMinBlocksPerMP = 16; - static constexpr uint32_t last_bpix1_detIndex = 96; - static constexpr uint32_t last_bpix2_detIndex = 320; - static constexpr uint32_t last_barrel_detIndex = 1184; + static constexpr int16_t last_bpix1_detIndex = 96; + static 
constexpr int16_t last_bpix2_detIndex = 320; + static constexpr int16_t last_barrel_detIndex = 1184; static constexpr uint32_t maxPixInModule = 6000; diff --git a/HLTrigger/Configuration/python/customizeHLTforCMSSW.py b/HLTrigger/Configuration/python/customizeHLTforCMSSW.py index b778daa63677f..5af514529af63 100644 --- a/HLTrigger/Configuration/python/customizeHLTforCMSSW.py +++ b/HLTrigger/Configuration/python/customizeHLTforCMSSW.py @@ -223,7 +223,7 @@ def customizeHLTfor38761(process): if 'hltSiPixelRecHitsSoA' in process.__dict__: process.hltSiPixelRecHitsSoA.cpu = cms.EDAlias(hltSiPixelRecHitsFromLegacy = cms.VPSet( cms.PSet( - type = cms.string('pixelTopologyPhase1TrackingRecHit2DCPUT') + type = cms.string('pixelTopologyPhase1TrackingRecHitSoAHost') ), cms.PSet( type = cms.string('uintAsHostProduct') diff --git a/RecoLocalTracker/SiPixelClusterizer/plugins/SiPixelRawToClusterGPUKernel.cu b/RecoLocalTracker/SiPixelClusterizer/plugins/SiPixelRawToClusterGPUKernel.cu index bc9be260deb20..75bebbfb1a467 100644 --- a/RecoLocalTracker/SiPixelClusterizer/plugins/SiPixelRawToClusterGPUKernel.cu +++ b/RecoLocalTracker/SiPixelClusterizer/plugins/SiPixelRawToClusterGPUKernel.cu @@ -318,12 +318,13 @@ namespace pixelgpudetails { const uint32_t wordCounter, const uint32_t *word, const uint8_t *fedIds, - uint16_t *xx, - uint16_t *yy, - uint16_t *adc, - uint32_t *pdigi, - uint32_t *rawIdArr, - uint16_t *moduleId, + SiPixelDigisCUDASOAView digisView, + // uint16_t *xx, + // uint16_t *yy, + // uint16_t *adc, + // uint32_t *pdigi, + // uint32_t *rawIdArr, + // uint16_t *moduleId, cms::cuda::SimpleVector *err, bool useQualityInfo, bool includeErrors) { @@ -332,17 +333,24 @@ namespace pixelgpudetails { int32_t first = threadIdx.x + blockIdx.x * blockDim.x; for (int32_t iloop = first, nend = wordCounter; iloop < nend; iloop += blockDim.x * gridDim.x) { auto gIndex = iloop; - xx[gIndex] = 0; - yy[gIndex] = 0; - adc[gIndex] = 0; + auto dvgi = digisView[gIndex]; + dvgi.xx() = 0; + 
dvgi.yy() = 0; + dvgi.adc() = 0; + // xx[gIndex] = 0; + // yy[gIndex] = 0; + // adc[gIndex] = 0; bool skipROC = false; uint8_t fedId = fedIds[gIndex / 2]; // +1200; // initialize (too many coninue below) - pdigi[gIndex] = 0; - rawIdArr[gIndex] = 0; - moduleId[gIndex] = gpuClustering::invalidModuleId; + dvgi.pdigi() = 0; + dvgi.rawIdArr() = 0; + dvgi.moduleId() = gpuClustering::invalidModuleId; + // pdigi[gIndex] = 0; + // rawIdArr[gIndex] = 0; + // moduleId[gIndex] = gpuClustering::invalidModuleId; uint32_t ww = word[gIndex]; // Array containing 32 bit raw data if (ww == 0) { @@ -433,12 +441,18 @@ namespace pixelgpudetails { } pixelgpudetails::Pixel globalPix = frameConversion(barrel, side, layer, detId.rocInDet, localPix); - xx[gIndex] = globalPix.row; // origin shifting by 1 0-159 - yy[gIndex] = globalPix.col; // origin shifting by 1 0-415 - adc[gIndex] = sipixelconstants::getADC(ww); - pdigi[gIndex] = pixelgpudetails::pack(globalPix.row, globalPix.col, adc[gIndex]); - moduleId[gIndex] = detId.moduleId; - rawIdArr[gIndex] = rawId; + // xx[gIndex] = globalPix.row; // origin shifting by 1 0-159 + // yy[gIndex] = globalPix.col; // origin shifting by 1 0-415 + // adc[gIndex] = sipixelconstants::getADC(ww); + // pdigi[gIndex] = pixelgpudetails::pack(globalPix.row, globalPix.col, adc[gIndex]); + // moduleId[gIndex] = detId.moduleId; + // rawIdArr[gIndex] = rawId; + dvgi.xx() = globalPix.row; // origin shifting by 1 0-159 + dvgi.yy() = globalPix.col; // origin shifting by 1 0-415 + dvgi.adc() = sipixelconstants::getADC(ww); + dvgi.pdigi() = pixelgpudetails::pack(globalPix.row, globalPix.col, dvgi.adc()); + dvgi.moduleId() = detId.moduleId; + dvgi.rawIdArr() = rawId; } // end of loop (gIndex < end) } // end of Raw to Digi kernel @@ -549,7 +563,8 @@ namespace pixelgpudetails { #endif // since wordCounter != 0 we're not allocating 0 bytes, - digis_d = SiPixelDigisCUDA(wordCounter, stream); + // digis_d = SiPixelDigisCUDA(wordCounter, stream); + digis_d = 
SiPixelDigisCUDA(size_t(wordCounter), stream); if (includeErrors) { digiErrors_d = SiPixelDigiErrorsCUDA(wordCounter, std::move(errors), stream); } @@ -578,12 +593,13 @@ namespace pixelgpudetails { wordCounter, word_d.get(), fedId_d.get(), - digis_d.view().xx(), - digis_d.view().yy(), - digis_d.view().adc(), - digis_d.view().pdigi(), - digis_d.view().rawIdArr(), - digis_d.view().moduleInd(), + digis_d.view(), + // digis_d.view().xx(), + // digis_d.view().yy(), + // digis_d.view().adc(), + // digis_d.view().pdigi(), + // digis_d.view().rawIdArr(), + // digis_d.view().moduleInd(), digiErrors_d.error(), // returns nullptr if default-constructed useQualityInfo, includeErrors); @@ -594,12 +610,13 @@ namespace pixelgpudetails { wordCounter, word_d.get(), fedId_d.get(), - digis_d.view().xx(), - digis_d.view().yy(), - digis_d.view().adc(), - digis_d.view().pdigi(), - digis_d.view().rawIdArr(), - digis_d.view().moduleInd(), + digis_d.view(), + // digis_d.view().xx(), + // digis_d.view().yy(), + // digis_d.view().adc(), + // digis_d.view().pdigi(), + // digis_d.view().rawIdArr(), + // digis_d.view().moduleInd(), digiErrors_d.error(), // returns nullptr if default-constructed useQualityInfo, includeErrors); @@ -621,25 +638,25 @@ namespace pixelgpudetails { int blocks = (std::max(int(wordCounter), int(Phase1::numberOfModules)) + threadsPerBlock - 1) / threadsPerBlock; if (isRun2) - gpuCalibPixel::calibDigis<<>>(digis_d.view().moduleInd(), - digis_d.view().xx(), - digis_d.view().yy(), - digis_d.view().adc(), + gpuCalibPixel::calibDigis<<>>(digis_d->moduleId(), + digis_d->xx(), + digis_d->yy(), + digis_d->adc(), gains, wordCounter, - clusters_d.moduleStart(), - clusters_d.clusInModule(), - clusters_d.clusModuleStart()); + clusters_d->moduleStart(), + clusters_d->clusInModule(), + clusters_d->clusModuleStart()); else - gpuCalibPixel::calibDigis<<>>(digis_d.view().moduleInd(), - digis_d.view().xx(), - digis_d.view().yy(), - digis_d.view().adc(), + 
gpuCalibPixel::calibDigis<<>>(digis_d->moduleId(), + digis_d->xx(), + digis_d->yy(), + digis_d->adc(), gains, wordCounter, - clusters_d.moduleStart(), - clusters_d.clusInModule(), - clusters_d.clusModuleStart()); + clusters_d->moduleStart(), + clusters_d->clusInModule(), + clusters_d->clusModuleStart()); cudaCheck(cudaGetLastError()); #ifdef GPU_DEBUG @@ -652,7 +669,7 @@ namespace pixelgpudetails { #endif countModules<<>>( - digis_d.view().moduleInd(), clusters_d.moduleStart(), digis_d.view().clus(), wordCounter); + digis_d->moduleId(), clusters_d->moduleStart(), digis_d->clus(), wordCounter); cudaCheck(cudaGetLastError()); threadsPerBlock = 256 + 128; /// should be larger than 6000/16 aka (maxPixInModule/maxiter in the kernel) @@ -661,14 +678,14 @@ namespace pixelgpudetails { std::cout << "CUDA findClus kernel launch with " << blocks << " blocks of " << threadsPerBlock << " threads\n"; #endif - findClus<<>>(digis_d.view().rawIdArr(), - digis_d.view().moduleInd(), - digis_d.view().xx(), - digis_d.view().yy(), - clusters_d.moduleStart(), - clusters_d.clusInModule(), - clusters_d.moduleId(), - digis_d.view().clus(), + findClus<<>>(digis_d->rawIdArr(), + digis_d->moduleId(), + digis_d->xx(), + digis_d->yy(), + clusters_d->moduleStart(), + clusters_d->clusInModule(), + clusters_d->moduleId(), + digis_d->clus(), wordCounter); cudaCheck(cudaGetLastError()); @@ -678,12 +695,12 @@ namespace pixelgpudetails { // apply charge cut clusterChargeCut<<>>(clusterThresholds, - digis_d.view().moduleInd(), - digis_d.view().adc(), - clusters_d.moduleStart(), - clusters_d.clusInModule(), - clusters_d.moduleId(), - digis_d.view().clus(), + digis_d->moduleId(), + digis_d->adc(), + clusters_d->moduleStart(), + clusters_d->clusInModule(), + clusters_d->moduleId(), + digis_d->clus(), wordCounter); cudaCheck(cudaGetLastError()); @@ -695,7 +712,7 @@ namespace pixelgpudetails { auto nModules_Clusters_d = cms::cuda::make_device_unique(3, stream); // MUST be ONE block fillHitsModuleStart<<<1, 
1024, 0, stream>>>( - clusters_d.clusInModule(), clusters_d.clusModuleStart(), clusters_d.moduleStart(), nModules_Clusters_d.get()); + clusters_d->clusInModule(), clusters_d->clusModuleStart(), clusters_d->moduleStart(), nModules_Clusters_d.get()); // copy to host nModules_Clusters_h = cms::cuda::make_host_unique(3, stream); @@ -724,14 +741,14 @@ namespace pixelgpudetails { digis_d = SiPixelDigisCUDA(numDigis, stream); cudaCheck( - cudaMemcpyAsync(digis_d.view().moduleInd(), moduleIds, sizeof(uint16_t) * numDigis, cudaMemcpyDefault, stream)); - cudaCheck(cudaMemcpyAsync(digis_d.view().xx(), xDigis, sizeof(uint16_t) * numDigis, cudaMemcpyDefault, stream)); - cudaCheck(cudaMemcpyAsync(digis_d.view().yy(), yDigis, sizeof(uint16_t) * numDigis, cudaMemcpyDefault, stream)); - cudaCheck(cudaMemcpyAsync(digis_d.view().adc(), adcDigis, sizeof(uint16_t) * numDigis, cudaMemcpyDefault, stream)); + cudaMemcpyAsync(digis_d->moduleId(), moduleIds, sizeof(uint16_t) * numDigis, cudaMemcpyDefault, stream)); + cudaCheck(cudaMemcpyAsync(digis_d->xx(), xDigis, sizeof(uint16_t) * numDigis, cudaMemcpyDefault, stream)); + cudaCheck(cudaMemcpyAsync(digis_d->yy(), yDigis, sizeof(uint16_t) * numDigis, cudaMemcpyDefault, stream)); + cudaCheck(cudaMemcpyAsync(digis_d->adc(), adcDigis, sizeof(uint16_t) * numDigis, cudaMemcpyDefault, stream)); cudaCheck( - cudaMemcpyAsync(digis_d.view().pdigi(), packedData, sizeof(uint32_t) * numDigis, cudaMemcpyDefault, stream)); + cudaMemcpyAsync(digis_d->pdigi(), packedData, sizeof(uint32_t) * numDigis, cudaMemcpyDefault, stream)); cudaCheck( - cudaMemcpyAsync(digis_d.view().rawIdArr(), rawIds, sizeof(uint32_t) * numDigis, cudaMemcpyDefault, stream)); + cudaMemcpyAsync(digis_d->rawIdArr(), rawIds, sizeof(uint32_t) * numDigis, cudaMemcpyDefault, stream)); clusters_d = SiPixelClustersCUDA(Phase2::numberOfModules, stream); @@ -740,12 +757,12 @@ namespace pixelgpudetails { int threadsPerBlock = 512; int blocks = (int(numDigis) + threadsPerBlock - 1) / 
threadsPerBlock; - gpuCalibPixel::calibDigisPhase2<<>>(digis_d.view().moduleInd(), - digis_d.view().adc(), + gpuCalibPixel::calibDigisPhase2<<>>(digis_d->moduleId(), + digis_d->adc(), numDigis, - clusters_d.moduleStart(), - clusters_d.clusInModule(), - clusters_d.clusModuleStart()); + clusters_d->moduleStart(), + clusters_d->clusInModule(), + clusters_d->clusModuleStart()); cudaCheck(cudaGetLastError()); @@ -755,12 +772,12 @@ namespace pixelgpudetails { #endif countModules<<>>( - digis_d.view().moduleInd(), clusters_d.moduleStart(), digis_d.view().clus(), numDigis); + digis_d->moduleId(), clusters_d->moduleStart(), digis_d->clus(), numDigis); cudaCheck(cudaGetLastError()); // read the number of modules into a data member, used by getProduct()) cudaCheck(cudaMemcpyAsync( - &(nModules_Clusters_h[0]), clusters_d.moduleStart(), sizeof(uint32_t), cudaMemcpyDefault, stream)); + &(nModules_Clusters_h[0]), clusters_d->moduleStart(), sizeof(uint32_t), cudaMemcpyDefault, stream)); threadsPerBlock = 256; blocks = Phase2::numberOfModules; @@ -769,14 +786,14 @@ namespace pixelgpudetails { cudaCheck(cudaStreamSynchronize(stream)); std::cout << "CUDA findClus kernel launch with " << blocks << " blocks of " << threadsPerBlock << " threads\n"; #endif - findClus<<>>(digis_d.view().rawIdArr(), - digis_d.view().moduleInd(), - digis_d.view().xx(), - digis_d.view().yy(), - clusters_d.moduleStart(), - clusters_d.clusInModule(), - clusters_d.moduleId(), - digis_d.view().clus(), + findClus<<>>(digis_d->rawIdArr(), + digis_d->moduleId(), + digis_d->xx(), + digis_d->yy(), + clusters_d->moduleStart(), + clusters_d->clusInModule(), + clusters_d->moduleId(), + digis_d->clus(), numDigis); cudaCheck(cudaGetLastError()); @@ -788,12 +805,12 @@ namespace pixelgpudetails { // apply charge cut clusterChargeCut<<>>(clusterThresholds, - digis_d.view().moduleInd(), - digis_d.view().adc(), - clusters_d.moduleStart(), - clusters_d.clusInModule(), - clusters_d.moduleId(), - digis_d.view().clus(), + 
digis_d->moduleId(), + digis_d->adc(), + clusters_d->moduleStart(), + clusters_d->clusInModule(), + clusters_d->moduleId(), + digis_d->clus(), numDigis); cudaCheck(cudaGetLastError()); @@ -806,7 +823,7 @@ namespace pixelgpudetails { #endif fillHitsModuleStart<<<1, 1024, 0, stream>>>( - clusters_d.clusInModule(), clusters_d.clusModuleStart(), clusters_d.moduleStart(), nModules_Clusters_d.get()); + clusters_d->clusInModule(), clusters_d->clusModuleStart(), clusters_d->moduleStart(), nModules_Clusters_d.get()); nModules_Clusters_h = cms::cuda::make_host_unique(3, stream); cudaCheck(cudaMemcpyAsync( diff --git a/RecoLocalTracker/SiPixelClusterizer/plugins/gpuClustering.h b/RecoLocalTracker/SiPixelClusterizer/plugins/gpuClustering.h index 675eae8938236..d538ad7896f9c 100644 --- a/RecoLocalTracker/SiPixelClusterizer/plugins/gpuClustering.h +++ b/RecoLocalTracker/SiPixelClusterizer/plugins/gpuClustering.h @@ -10,7 +10,7 @@ #include "HeterogeneousCore/CUDAUtilities/interface/cudaCompat.h" #include "HeterogeneousCore/CUDAUtilities/interface/cuda_assert.h" -//#define GPU_DEBUG +//#define GPU_DEBUG namespace gpuClustering { @@ -396,7 +396,7 @@ namespace gpuClustering { } #endif #ifdef GPU_DEBUG - if (thisModuleId % 100 == 1) + if (thisModuleId % 100 == 1) printf("%d clusters in module %d\n", foundClusters, thisModuleId); #endif } diff --git a/RecoLocalTracker/SiPixelRecHits/plugins/PixelRecHitGPUKernel.cu b/RecoLocalTracker/SiPixelRecHits/plugins/PixelRecHitGPUKernel.cu index cb5b4b2f2c387..a1d932070949f 100644 --- a/RecoLocalTracker/SiPixelRecHits/plugins/PixelRecHitGPUKernel.cu +++ b/RecoLocalTracker/SiPixelRecHits/plugins/PixelRecHitGPUKernel.cu @@ -12,7 +12,7 @@ #include "PixelRecHitGPUKernel.h" #include "gpuPixelRecHits.h" -// #define GPU_DEBUG 1 +// #define GPU_DEBUG 1 namespace { template @@ -42,7 +42,7 @@ namespace { template - TrackingRecHit2DGPUT 
PixelRecHitGPUKernel::makeHitsAsync( SiPixelDigisCUDA const& digis_d, SiPixelClustersCUDA const& clusters_d, BeamSpotCUDA const& bs_d, @@ -51,8 +51,15 @@ namespace pixelgpudetails { using namespace gpuPixelRecHits; auto nHits = clusters_d.nClusters(); - TrackingRecHit2DGPUT hits_d( - nHits, clusters_d.offsetBPIX2(), cpeParams, clusters_d.clusModuleStart(), stream); + // cudaCheck(cudaGetLastError()); // debug only + // cudaCheck(cudaDeviceSynchronize()); // debug only + // TrackingRecHit2DGPUT hits_d( + // nHits, clusters_d.offsetBPIX2(), cpeParams, clusters_d.clusModuleStart(), stream); + + TrackingRecHitSoADevice hits_d(nHits, clusters_d.offsetBPIX2(), cpeParams, clusters_d->clusModuleStart(), stream); + + // cudaCheck(cudaGetLastError()); // debug only + // cudaCheck(cudaDeviceSynchronize()); // debug only int activeModulesWithDigis = digis_d.nModules(); // protect from empty events @@ -65,7 +72,7 @@ std::cout << "launching getHits kernel for " << blocks << " blocks" << std::endl; #endif getHits<<>>( - cpeParams, bs_d.data(), digis_d.view(), digis_d.nDigis(), clusters_d.view(), hits_d.view()); + cpeParams, bs_d.data(), digis_d.view(), digis_d.nDigis(), clusters_d.const_view(), hits_d.view()); cudaCheck(cudaGetLastError()); #ifdef GPU_DEBUG cudaCheck(cudaDeviceSynchronize()); @@ -74,16 +81,16 @@ // assuming full warp of threads is better than a smaller number... 
if (nHits) { setHitsLayerStart - <<<1, 32, 0, stream>>>(clusters_d.clusModuleStart(), cpeParams, hits_d.hitsLayerStart()); + <<<1, 32, 0, stream>>>(clusters_d->clusModuleStart(), cpeParams, hits_d.view().hitsLayerStart().data()); cudaCheck(cudaGetLastError()); constexpr auto nLayers = TrackerTraits::numberOfLayers; cms::cuda::fillManyFromVector(hits_d.phiBinner(), nLayers, - hits_d.iphi(), - hits_d.hitsLayerStart(), + hits_d.view().iphi(), + hits_d.view().hitsLayerStart().data(), nHits, 256, - hits_d.phiBinnerStorage(), + hits_d.view().phiBinnerStorage(), stream); cudaCheck(cudaGetLastError()); @@ -93,6 +100,11 @@ namespace pixelgpudetails { } } + #ifdef GPU_DEBUG + cudaCheck(cudaDeviceSynchronize()); + std::cout << "PixelRecHitGPUKernel -> DONE!" << std::endl; + #endif + return hits_d; } diff --git a/RecoLocalTracker/SiPixelRecHits/plugins/PixelRecHitGPUKernel.h b/RecoLocalTracker/SiPixelRecHits/plugins/PixelRecHitGPUKernel.h index 0a3c2b647f22e..0b9713402a90a 100644 --- a/RecoLocalTracker/SiPixelRecHits/plugins/PixelRecHitGPUKernel.h +++ b/RecoLocalTracker/SiPixelRecHits/plugins/PixelRecHitGPUKernel.h @@ -8,7 +8,8 @@ #include "CUDADataFormats/BeamSpot/interface/BeamSpotCUDA.h" #include "CUDADataFormats/SiPixelCluster/interface/SiPixelClustersCUDA.h" #include "CUDADataFormats/SiPixelDigi/interface/SiPixelDigisCUDA.h" -#include "CUDADataFormats/TrackingRecHit/interface/TrackingRecHit2DHeterogeneous.h" +// #include "CUDADataFormats/TrackingRecHit/interface/TrackingRecHit2DHeterogeneous.h" +#include "CUDADataFormats/TrackingRecHit/interface/TrackingRecHitSoADevice.h" #include "Geometry/CommonTopologies/interface/SimplePixelTopology.h" //#define GPU_DEBUG 1 namespace pixelgpudetails { @@ -26,7 +27,7 @@ namespace pixelgpudetails { using ParamsOnGPU = pixelCPEforGPU::ParamsOnGPUT; - TrackingRecHit2DGPUT makeHitsAsync(SiPixelDigisCUDA const& digis_d, + TrackingRecHitSoADevice makeHitsAsync(SiPixelDigisCUDA const& digis_d, SiPixelClustersCUDA const& clusters_d, 
BeamSpotCUDA const& bs_d, ParamsOnGPU const* cpeParams, diff --git a/RecoLocalTracker/SiPixelRecHits/plugins/SiPixelRecHitCUDA.cc b/RecoLocalTracker/SiPixelRecHits/plugins/SiPixelRecHitCUDA.cc index b23fa7dcc11ed..91038f7f80377 100644 --- a/RecoLocalTracker/SiPixelRecHits/plugins/SiPixelRecHitCUDA.cc +++ b/RecoLocalTracker/SiPixelRecHits/plugins/SiPixelRecHitCUDA.cc @@ -4,7 +4,8 @@ #include "CUDADataFormats/Common/interface/Product.h" #include "CUDADataFormats/SiPixelCluster/interface/SiPixelClustersCUDA.h" #include "CUDADataFormats/SiPixelDigi/interface/SiPixelDigisCUDA.h" -#include "CUDADataFormats/TrackingRecHit/interface/TrackingRecHit2DHeterogeneous.h" +// #include "CUDADataFormats/TrackingRecHit/interface/TrackingRecHit2DHeterogeneous.h" +#include "CUDADataFormats/TrackingRecHit/interface/TrackingRecHitSoADevice.h" #include "DataFormats/Common/interface/Handle.h" #include "FWCore/Framework/interface/Event.h" #include "FWCore/Framework/interface/EventSetup.h" @@ -39,7 +40,7 @@ class SiPixelRecHitCUDAT : public edm::global::EDProducer<> { const edm::EDGetTokenT> tBeamSpot; const edm::EDGetTokenT> token_; const edm::EDGetTokenT> tokenDigi_; - const edm::EDPutTokenT>> tokenHit_; + const edm::EDPutTokenT>> tokenHit_; const pixelgpudetails::PixelRecHitGPUKernel gpuAlgo_; }; @@ -50,7 +51,7 @@ SiPixelRecHitCUDAT::SiPixelRecHitCUDAT(const edm::ParameterSet& i tBeamSpot(consumes>(iConfig.getParameter("beamSpot"))), token_(consumes>(iConfig.getParameter("src"))), tokenDigi_(consumes>(iConfig.getParameter("src"))), - tokenHit_(produces>>()) {} + tokenHit_(produces>>()) {} template void SiPixelRecHitCUDAT::fillDescriptions(edm::ConfigurationDescriptions& descriptions) { diff --git a/RecoLocalTracker/SiPixelRecHits/plugins/SiPixelRecHitFromCUDA.cc b/RecoLocalTracker/SiPixelRecHits/plugins/SiPixelRecHitFromCUDA.cc index 1428efe06a1d1..017fb4ef05576 100644 --- a/RecoLocalTracker/SiPixelRecHits/plugins/SiPixelRecHitFromCUDA.cc +++ 
b/RecoLocalTracker/SiPixelRecHits/plugins/SiPixelRecHitFromCUDA.cc @@ -4,7 +4,8 @@ #include "CUDADataFormats/Common/interface/HostProduct.h" #include "CUDADataFormats/Common/interface/Product.h" -#include "CUDADataFormats/TrackingRecHit/interface/TrackingRecHit2DHeterogeneous.h" +// #include "CUDADataFormats/TrackingRecHit/interface/TrackingRecHit2DHeterogeneous.h" +#include "CUDADataFormats/TrackingRecHit/interface/TrackingRecHitSoADevice.h" #include "DataFormats/Common/interface/DetSetVectorNew.h" #include "DataFormats/Common/interface/Handle.h" #include "DataFormats/SiPixelCluster/interface/SiPixelCluster.h" @@ -33,7 +34,7 @@ class SiPixelRecHitFromCUDAT : public edm::stream::EDProducer static void fillDescriptions(edm::ConfigurationDescriptions& descriptions); using HMSstorage = HostProduct; - using HitsOnGPU = TrackingRecHit2DGPUT; + using HitsOnGPU = TrackingRecHitSoADevice; private: void acquire(edm::Event const& iEvent, diff --git a/RecoLocalTracker/SiPixelRecHits/plugins/SiPixelRecHitSoAFromCUDA.cc b/RecoLocalTracker/SiPixelRecHits/plugins/SiPixelRecHitSoAFromCUDA.cc index 8bcb218255548..1b2983b319d87 100644 --- a/RecoLocalTracker/SiPixelRecHits/plugins/SiPixelRecHitSoAFromCUDA.cc +++ b/RecoLocalTracker/SiPixelRecHits/plugins/SiPixelRecHitSoAFromCUDA.cc @@ -4,7 +4,9 @@ #include "CUDADataFormats/Common/interface/HostProduct.h" #include "CUDADataFormats/Common/interface/Product.h" -#include "CUDADataFormats/TrackingRecHit/interface/TrackingRecHit2DHeterogeneous.h" +// #include "CUDADataFormats/TrackingRecHit/interface/TrackingRecHit2DHeterogeneous.h" +#include "CUDADataFormats/TrackingRecHit/interface/TrackingRecHitSoAHost.h" +#include "CUDADataFormats/TrackingRecHit/interface/TrackingRecHitSoADevice.h" #include "DataFormats/Common/interface/DetSetVectorNew.h" #include "DataFormats/Common/interface/Handle.h" #include "DataFormats/SiPixelCluster/interface/SiPixelCluster.h" @@ -32,7 +34,9 @@ class SiPixelRecHitSoAFromCUDAT : public edm::stream::EDProducer; - 
using TrackingRecHit2DSOAView = TrackingRecHit2DSOAViewT; + using HitsOnHost = TrackingRecHitSoAHost; + using HitsOnDevice = TrackingRecHitSoADevice; + // using TrackingRecHit2DSOAView = TrackingRecHit2DSOAViewT; private: void acquire(edm::Event const& iEvent, @@ -40,21 +44,21 @@ class SiPixelRecHitSoAFromCUDAT : public edm::stream::EDProducer>> hitsTokenGPU_; // CUDA hits - const edm::EDPutTokenT> hitsPutTokenCPU_; + const edm::EDGetTokenT> hitsTokenGPU_; // CUDA hits + const edm::EDPutTokenT hitsPutTokenCPU_; const edm::EDPutTokenT hostPutToken_; uint32_t nHits_; - - cms::cuda::host::unique_ptr store32_; - cms::cuda::host::unique_ptr store16_; - cms::cuda::host::unique_ptr hitsModuleStart_; + HitsOnHost hits_h_; + // cms::cuda::host::unique_ptr store32_; + // cms::cuda::host::unique_ptr store16_; + // cms::cuda::host::unique_ptr hitsModuleStart_; }; template SiPixelRecHitSoAFromCUDAT::SiPixelRecHitSoAFromCUDAT(const edm::ParameterSet& iConfig) : hitsTokenGPU_(consumes(iConfig.getParameter("pixelRecHitSrc"))), - hitsPutTokenCPU_(produces>()), + hitsPutTokenCPU_(produces()), hostPutToken_(produces()) {} template @@ -69,18 +73,24 @@ template void SiPixelRecHitSoAFromCUDAT::acquire(edm::Event const& iEvent, edm::EventSetup const& iSetup, edm::WaitingTaskWithArenaHolder waitingTaskHolder) { - cms::cuda::Product> const& inputDataWrapped = iEvent.get(hitsTokenGPU_); + cms::cuda::Product const& inputDataWrapped = iEvent.get(hitsTokenGPU_); cms::cuda::ScopedContextAcquire ctx{inputDataWrapped, std::move(waitingTaskHolder)}; auto const& inputData = ctx.get(inputDataWrapped); nHits_ = inputData.nHits(); + hits_h_ = HitsOnHost(nHits_, ctx.stream()); + cudaCheck(cudaMemcpyAsync(hits_h_.buffer().get(), + inputData.const_buffer().get(), + inputData.bufferSize(), + cudaMemcpyDeviceToHost, + ctx.stream())); // Copy data from Device to Host LogDebug("SiPixelRecHitSoAFromCUDA") << "copying to cpu SoA" << inputData.nHits() << " Hits"; if (0 == nHits_) return; - store32_ = 
inputData.store32ToHostAsync(ctx.stream()); - store16_ = inputData.store16ToHostAsync(ctx.stream()); - hitsModuleStart_ = inputData.hitsModuleStartToHostAsync(ctx.stream()); + // store32_ = inputData.store32ToHostAsync(ctx.stream()); + // store16_ = inputData.store16ToHostAsync(ctx.stream()); + // hitsModuleStart_ = inputData.hitsModuleStartToHostAsync(ctx.stream()); } template @@ -88,10 +98,12 @@ void SiPixelRecHitSoAFromCUDAT::produce(edm::Event& iEvent, edm:: auto hmsp = std::make_unique(TrackerTraits::numberOfModules + 1); if (nHits_ > 0) - std::copy(hitsModuleStart_.get(), hitsModuleStart_.get() + TrackerTraits::numberOfModules + 1, hmsp.get()); + std::copy(hits_h_.view().hitsModuleStart().begin(), hits_h_.view().hitsModuleStart().end(), hmsp.get()); + // std::copy(hitsModuleStart_.get(), hitsModuleStart_.get() + TrackerTraits::numberOfModules + 1, hmsp.get()); iEvent.emplace(hostPutToken_, std::move(hmsp)); - iEvent.emplace(hitsPutTokenCPU_, store32_, store16_, hitsModuleStart_.get(), nHits_); + iEvent.emplace(hitsPutTokenCPU_, std::move(hits_h_)); + // iEvent.emplace(hitsPutTokenCPU_, store32_, store16_, hitsModuleStart_.get(), nHits_); } using SiPixelRecHitSoAFromCUDA = SiPixelRecHitSoAFromCUDAT; diff --git a/RecoLocalTracker/SiPixelRecHits/plugins/SiPixelRecHitSoAFromLegacy.cc b/RecoLocalTracker/SiPixelRecHits/plugins/SiPixelRecHitSoAFromLegacy.cc index 1edc7870f4800..1f5745bc276e7 100644 --- a/RecoLocalTracker/SiPixelRecHits/plugins/SiPixelRecHitSoAFromLegacy.cc +++ b/RecoLocalTracker/SiPixelRecHits/plugins/SiPixelRecHitSoAFromLegacy.cc @@ -3,7 +3,9 @@ #include "CUDADataFormats/BeamSpot/interface/BeamSpotCUDA.h" #include "CUDADataFormats/SiPixelCluster/interface/SiPixelClustersCUDA.h" #include "CUDADataFormats/SiPixelDigi/interface/SiPixelDigisCUDA.h" -#include "CUDADataFormats/TrackingRecHit/interface/TrackingRecHit2DHeterogeneous.h" +// #include "CUDADataFormats/TrackingRecHit/interface/TrackingRecHit2DHeterogeneous.h" +#include 
"CUDADataFormats/TrackingRecHit/interface/TrackingRecHitSoAHost.h" +#include "CUDADataFormats/Common/interface/PortableHostCollection.h" #include "CUDADataFormats/Common/interface/HostProduct.h" #include "DataFormats/BeamSpot/interface/BeamSpot.h" #include "DataFormats/Common/interface/DetSetVectorNew.h" @@ -35,8 +37,9 @@ class SiPixelRecHitSoAFromLegacyT : public edm::global::EDProducer<> { static void fillDescriptions(edm::ConfigurationDescriptions& descriptions); - using HitModuleStart = std::array; + using HitModuleStart = std::array; using HMSstorage = HostProduct; + using HitsOnHost = TrackingRecHitSoAHost; private: void produce(edm::StreamID streamID, edm::Event& iEvent, const edm::EventSetup& iSetup) const override; @@ -45,7 +48,7 @@ class SiPixelRecHitSoAFromLegacyT : public edm::global::EDProducer<> { const edm::ESGetToken cpeToken_; const edm::EDGetTokenT bsGetToken_; const edm::EDGetTokenT clusterToken_; // Legacy Clusters - const edm::EDPutTokenT> tokenHit_; + const edm::EDPutTokenT tokenHit_; const edm::EDPutTokenT tokenModuleStart_; const bool convert2Legacy_; }; @@ -56,7 +59,7 @@ SiPixelRecHitSoAFromLegacyT::SiPixelRecHitSoAFromLegacyT(const ed cpeToken_(esConsumes(edm::ESInputTag("", iConfig.getParameter("CPE")))), bsGetToken_{consumes(iConfig.getParameter("beamSpot"))}, clusterToken_{consumes(iConfig.getParameter("src"))}, - tokenHit_{produces>()}, + tokenHit_{produces()}, tokenModuleStart_{produces()}, convert2Legacy_(iConfig.getParameter("convertToLegacy")) { if (convert2Legacy_) @@ -99,43 +102,53 @@ void SiPixelRecHitSoAFromLegacyT::produce(edm::StreamID streamID, iEvent.getByToken(clusterToken_, hclusters); auto const& input = *hclusters; - constexpr int maxModules = TrackerTraits::numberOfModules; + constexpr int nModules = TrackerTraits::numberOfModules; constexpr int startBPIX2 = pixelTopology::layerStart(1); - // allocate a buffer for the indices of the clusters - auto hmsp = std::make_unique(maxModules + 1); - // hitsModuleStart is a 
non-owning pointer to the buffer - auto hitsModuleStart = hmsp.get(); - // wrap the buffer in a HostProduct - auto hms = std::make_unique(std::move(hmsp)); - // move the HostProduct to the Event, without reallocating the buffer or affecting hitsModuleStart - iEvent.put(tokenModuleStart_, std::move(hms)); + // // allocate a buffer for the indices of the clusters + // auto hmsp = std::make_unique(nModules + 1); + // // hitsModuleStart is a non-owning pointer to the buffer + // // auto hitsModuleStart = hmsp.get(); + // // wrap the buffer in a HostProduct + // auto hms = std::make_unique(std::move(hmsp)); + // // move the HostProduct to the Event, without reallocating the buffer or affecting hitsModuleStart + // iEvent.put(tokenModuleStart_, std::move(hms)); // legacy output auto legacyOutput = std::make_unique(); // storage - std::vector xx; - std::vector yy; - std::vector adc; - std::vector moduleInd; - std::vector clus; + // std::vector xx; + // std::vector yy; + // std::vector adc; + // std::vector moduleInd; + // std::vector clus; std::vector, SiPixelCluster>> clusterRef; constexpr uint32_t maxHitsInModule = gpuClustering::maxHitsInModule(); - HitModuleStart moduleStart_; // index of the first pixel of each module - HitModuleStart clusInModule_; - memset(&clusInModule_, 0, sizeof(HitModuleStart)); // needed?? - memset(&moduleStart_, 0, sizeof(HitModuleStart)); - assert(gpuClustering::maxNumModules + 1 == clusInModule_.size()); - assert(0 == clusInModule_[gpuClustering::maxNumModules]); - uint32_t moduleId_; - moduleStart_[1] = 0; // we run sequentially.... + // HitModuleStart moduleStart_; // index of the first pixel of each module + // HitModuleStart clusInModule_; + // memset(&clusInModule_, 0, sizeof(HitModuleStart)); // needed?? 
+ // memset(&moduleStart_, 0, sizeof(HitModuleStart)); + // assert(gpuClustering::maxNumModules + 1 == clusInModule_.size()); + // assert(0 == clusInModule_[gpuClustering::maxNumModules]); + // uint32_t moduleId_; + // moduleStart_[1] = 0; // we run sequentially.... + + cms::cuda::PortableHostCollection> clusters_h(nModules + 1, nullptr); - SiPixelClustersCUDA::SiPixelClustersCUDASOAView clusterView{ - moduleStart_.data(), clusInModule_.data(), &moduleId_, hitsModuleStart}; + memset(clusters_h.view().clusInModule(), 0, (nModules + 1) * sizeof(uint32_t)); // needed?? + memset(clusters_h.view().moduleStart(), 0, (nModules + 1) * sizeof(uint32_t)); + memset(clusters_h.view().moduleId(), 0, (nModules + 1) * sizeof(uint32_t)); + memset(clusters_h.view().clusModuleStart(), 0, (nModules + 1) * sizeof(uint32_t)); + + assert(0 == clusters_h.view()[nModules].clusInModule()); + clusters_h.view()[1].moduleStart() = 0; + + // SiPixelClustersCUDA::SiPixelClustersCUDASOAView clusterView{ + // moduleStart_.data(), clusInModule_.data(), &moduleId_, hitsModuleStart}; // fill cluster arrays int numberOfClusters = 0; @@ -144,33 +157,44 @@ void SiPixelRecHitSoAFromLegacyT::produce(edm::StreamID streamID, DetId detIdObject(detid); const GeomDetUnit* genericDet = geom_->idToDetUnit(detIdObject); auto gind = genericDet->index(); - assert(gind < maxModules); + assert(gind < nModules); auto const nclus = dsv.size(); - clusInModule_[gind] = nclus; + // clusInModule_[gind] = nclus; + clusters_h.view()[gind].clusInModule() = nclus; numberOfClusters += nclus; } - hitsModuleStart[0] = 0; - - for (int i = 1, n = maxModules + 1; i < n; ++i) - hitsModuleStart[i] = hitsModuleStart[i - 1] + clusInModule_[i - 1]; + clusters_h.view()[0].clusModuleStart() = 0; + // hitsModuleStart[0] = 0; - assert(numberOfClusters == int(hitsModuleStart[maxModules])); + // for (int i = 1, n = nModules + 1; i < n; ++i) + // hitsModuleStart[i] = hitsModuleStart[i - 1] + clusInModule_[i - 1]; + for (int i = 1; i < nModules 
+ 1; ++i) { + clusters_h.view()[i].clusModuleStart() = clusters_h.view()[i - 1].clusModuleStart() + clusters_h.view()[i - 1].clusInModule(); + } + // assert(numberOfClusters == int(hitsModuleStart[nModules])); + assert((uint32_t)numberOfClusters == clusters_h.view()[nModules].clusModuleStart()); // output SoA // element 96 is the start of BPIX2 (i.e. the number of clusters in BPIX1) + HitsOnHost output(numberOfClusters, + clusters_h.view()[startBPIX2].clusModuleStart(), + &cpeView, + clusters_h.view().clusModuleStart(), + nullptr); - auto output = std::make_unique>( - numberOfClusters, hitsModuleStart[startBPIX2], &cpeView, hitsModuleStart, nullptr); + // auto output = std::make_unique>( + // numberOfClusters, hitsModuleStart[startBPIX2], &cpeView, hitsModuleStart, nullptr); if (0 == numberOfClusters) { - iEvent.put(std::move(output)); + iEvent.emplace(tokenHit_, std::move(output)); + // iEvent.put(std::move(output)); if (convert2Legacy_) iEvent.put(std::move(legacyOutput)); return; } if (convert2Legacy_) - legacyOutput->reserve(maxModules, numberOfClusters); + legacyOutput->reserve(nModules, numberOfClusters); int numberOfDetUnits = 0; int numberOfHits = 0; @@ -180,16 +204,19 @@ void SiPixelRecHitSoAFromLegacyT::produce(edm::StreamID streamID, DetId detIdObject(detid); const GeomDetUnit* genericDet = geom_->idToDetUnit(detIdObject); auto const gind = genericDet->index(); - assert(gind < maxModules); + assert(gind < nModules); const PixelGeomDetUnit* pixDet = dynamic_cast(genericDet); assert(pixDet); auto const nclus = dsv.size(); - assert(clusInModule_[gind] == nclus); + // assert(clusInModule_[gind] == nclus);Non + assert(clusters_h.view()[gind].clusInModule() == nclus); if (0 == nclus) continue; // is this really possible? 
- auto const fc = hitsModuleStart[gind]; - auto const lc = hitsModuleStart[gind + 1]; + // auto const fc = hitsModuleStart[gind]; + auto const fc = clusters_h.view()[gind].clusModuleStart(); + // auto const lc = hitsModuleStart[gind + 1]; + auto const lc = clusters_h.view()[gind + 1].clusModuleStart(); assert(lc > fc); LogDebug("SiPixelRecHitSoAFromLegacy") << "in det " << gind << ": conv " << nclus << " hits from " << dsv.size() << " legacy clusters" << ' ' << fc << ',' << lc; @@ -198,25 +225,42 @@ void SiPixelRecHitSoAFromLegacyT::produce(edm::StreamID streamID, printf( "WARNING: too many clusters %d in Module %d. Only first %d Hits converted\n", nclus, gind, maxHitsInModule); - // fill digis - xx.clear(); - yy.clear(); - adc.clear(); - moduleInd.clear(); - clus.clear(); + // count digis + uint32_t ndigi = 0; + for (auto const& clust : dsv) { + assert(clust.size() > 0); + for (int i = 0, nd = clust.size(); i < nd; ++i) { + ndigi++; + } + } + std::cout << "ndigi=" << ndigi << std::endl; + cms::cuda::PortableHostCollection> digis_h(ndigi, nullptr); + + // xx.clear(); + // yy.clear(); + // adc.clear(); + // moduleInd.clear(); + // clus.clear(); clusterRef.clear(); - moduleId_ = gind; + clusters_h.view()[0].moduleId() = gind; + // moduleId_ = gind; uint32_t ic = 0; - uint32_t ndigi = 0; + ndigi = 0; + //filling digis for (auto const& clust : dsv) { assert(clust.size() > 0); for (int i = 0, nd = clust.size(); i < nd; ++i) { auto px = clust.pixel(i); - xx.push_back(px.x); - yy.push_back(px.y); - adc.push_back(px.adc); - moduleInd.push_back(gind); - clus.push_back(ic); + digis_h.view()[ndigi].xx() = px.x; + digis_h.view()[ndigi].yy() = px.y; + digis_h.view()[ndigi].adc() = px.adc; + digis_h.view()[ndigi].moduleId() = gind; + digis_h.view()[ndigi].clus() = ic; + // xx.push_back(px.x); + // yy.push_back(px.y); + // adc.push_back(px.adc); + // moduleInd.push_back(gind); + // clus.push_back(ic); ++ndigi; } @@ -225,25 +269,29 @@ void 
SiPixelRecHitSoAFromLegacyT::produce(edm::StreamID streamID, ic++; } assert(nclus == ic); - assert(clus.size() == ndigi); + // assert(clus.size() == ndigi); numberOfHits += nclus; // filled creates view - SiPixelDigisCUDASOAView digiView; - digiView.xx_ = xx.data(); - digiView.yy_ = yy.data(); - digiView.adc_ = adc.data(); - digiView.moduleInd_ = moduleInd.data(); - digiView.clus_ = clus.data(); - digiView.pdigi_ = nullptr; - digiView.rawIdArr_ = nullptr; - assert(digiView.adc(0) != 0); + // SiPixelDigisCUDASOAView digiView; + // digiView.xx_ = xx.data(); + // digiView.yy_ = yy.data(); + // digiView.adc_ = adc.data(); + // digiView.moduleInd_ = moduleInd.data(); + // digiView.clus_ = clus.data(); + // digiView.pdigi_ = nullptr; + // digiView.rawIdArr_ = nullptr; + // assert(digiView.adc(0) != 0); + assert(digis_h.view()[0].adc() != 0); // we run on blockId.x==0 - gpuPixelRecHits::getHits(&cpeView, &bsHost, digiView, ndigi, &clusterView, output->view()); + // gpuPixelRecHits::getHits(&cpeView, &bsHost, digiView, ndigi, &clusterView, output->view()); + gpuPixelRecHits::getHits(&cpeView, &bsHost, digis_h.view(), ndigi, clusters_h.view(), output.view()); for (auto h = fc; h < lc; ++h) if (h - fc < maxHitsInModule) - assert(gind == output->view()->detectorIndex(h)); + // assert(gind == output->view()->detectorIndex(h)); + assert(gind == output.view()[h].detectorIndex()); else - assert(gpuClustering::invalidModuleId == output->view()->detectorIndex(h)); + assert(gpuClustering::invalidModuleId == output.view()[h].detectorIndex()); + // assert(gpuClustering::invalidModuleId == output->view()->detectorIndex(h)); if (convert2Legacy_) { SiPixelRecHitCollectionNew::FastFiller recHitsOnDetUnit(*legacyOutput, detid); for (auto h = fc; h < lc; ++h) { @@ -253,8 +301,10 @@ void SiPixelRecHitSoAFromLegacyT::produce(edm::StreamID streamID, break; assert(ih < clusterRef.size()); - LocalPoint lp(output->view()->xLocal(h), output->view()->yLocal(h)); - LocalError 
le(output->view()->xerrLocal(h), 0, output->view()->yerrLocal(h)); + // LocalPoint lp(output->view()->xLocal(h), output->view()->yLocal(h)); + // LocalError le(output->view()->xerrLocal(h), 0, output->view()->yerrLocal(h)); + LocalPoint lp(output.view()[h].xLocal(), output.view()[h].yLocal()); + LocalError le(output.view()[h].xerrLocal(), 0, output.view()[h].yerrLocal()); SiPixelRecHitQuality::QualWordType rqw = 0; SiPixelRecHit hit(lp, le, rqw, *genericDet, clusterRef[ih]); recHitsOnDetUnit.push_back(hit); @@ -267,24 +317,43 @@ void SiPixelRecHitSoAFromLegacyT::produce(edm::StreamID streamID, // fill data structure to support CA constexpr auto nLayers = TrackerTraits::numberOfLayers; for (auto i = 0U; i < nLayers + 1; ++i) { - output->hitsLayerStart()[i] = hitsModuleStart[cpeView.layerGeometry().layerStart[i]]; + // output->hitsLayerStart()[i] = hitsModuleStart[cpeView.layerGeometry().layerStart[i]]; + output.view().hitsLayerStart()[i] = clusters_h.view()[cpeView.layerGeometry().layerStart[i]].clusModuleStart(); LogDebug("SiPixelRecHitSoAFromLegacy") << "Layer n." 
<< i << " - starting at module: " << cpeView.layerGeometry().layerStart[i] << " - starts ad cluster: " << output->hitsLayerStart()[i] << "\n"; } - cms::cuda::fillManyFromVector(output->phiBinner(), + cms::cuda::fillManyFromVector(&(output.view().phiBinner()), nLayers, - output->iphi(), - output->hitsLayerStart(), - numberOfHits, + output.view().iphi(), + output.view().hitsLayerStart().data(), + output.view().nHits(), 256, - output->phiBinnerStorage()); + output.view().phiBinnerStorage()); + + // cms::cuda::fillManyFromVector(output->phiBinner(), + // nLayers, + // output->iphi(), + // output->hitsLayerStart(), + // numberOfHits, + // 256, + // output->phiBinnerStorage()); LogDebug("SiPixelRecHitSoAFromLegacy") << "created HitSoa for " << numberOfClusters << " clusters in " << numberOfDetUnits << " Dets" << "\n"; - iEvent.put(std::move(output)); + // iEvent.put(std::move(output)); + + // allocate a buffer for the indices of the clusters + auto hmsp = std::make_unique(nModules + 1); + // copy the clusModuleStart column (all nModules + 1 entries, last one is the total cluster count) into the allocated buffer + memcpy(hmsp.get(), clusters_h.view().clusModuleStart(), (nModules + 1) * sizeof(uint32_t)); + // wrap the buffer in a HostProduct + auto hms = std::make_unique(std::move(hmsp)); + // move the HostProduct to the Event, without reallocating the buffer or affecting hitsModuleStart + iEvent.put(tokenModuleStart_, std::move(hms)); + iEvent.emplace(tokenHit_, std::move(output)); if (convert2Legacy_) iEvent.put(std::move(legacyOutput)); } diff --git a/RecoLocalTracker/SiPixelRecHits/plugins/gpuPixelRecHits.h b/RecoLocalTracker/SiPixelRecHits/plugins/gpuPixelRecHits.h index f0798cc74a975..69b5ab62c7539 100644 --- a/RecoLocalTracker/SiPixelRecHits/plugins/gpuPixelRecHits.h +++ b/RecoLocalTracker/SiPixelRecHits/plugins/gpuPixelRecHits.h @@ -7,11 +7,12 @@ #include "CUDADataFormats/BeamSpot/interface/BeamSpotCUDA.h" #include "CUDADataFormats/SiPixelCluster/interface/gpuClusteringConstants.h" -#include
"CUDADataFormats/TrackingRecHit/interface/TrackingRecHit2DHeterogeneous.h" +// #include "CUDADataFormats/TrackingRecHit/interface/TrackingRecHit2DHeterogeneous.h" +#include "CUDADataFormats/TrackingRecHit/interface/TrackingRecHitsUtilities.h" #include "DataFormats/Math/interface/approx_atan2.h" #include "HeterogeneousCore/CUDAUtilities/interface/cuda_assert.h" #include "RecoLocalTracker/SiPixelRecHits/interface/pixelCPEforGPU.h" -#include "CUDADataFormats/SiPixelDigi/interface/SiPixelDigisCUDASOAView.h" +#include "CUDADataFormats/SiPixelDigi/interface/SiPixelDigisCUDA.h" //#define GPU_DEBUG 1 namespace gpuPixelRecHits { @@ -19,20 +20,20 @@ namespace gpuPixelRecHits { template __global__ void getHits(pixelCPEforGPU::ParamsOnGPUT const* __restrict__ cpeParams, BeamSpotPOD const* __restrict__ bs, - SiPixelDigisCUDASOAView const digis, + SiPixelDigisCUDASOAConstView digis, int numElements, - SiPixelClustersCUDA::SiPixelClustersCUDASOAView const* __restrict__ pclusters, - TrackingRecHit2DSOAViewT* phits) { + SiPixelClustersCUDASOAConstView clusters, + HitSoAView hits) { // FIXME // the compiler seems NOT to optimize loads from views (even in a simple test case) // The whole gimnastic here of copying or not is a pure heuristic exercise that seems to produce the fastest code with the above signature // not using views (passing a gazzilion of array pointers) seems to produce the fastest code (but it is harder to mantain) - assert(phits); + // assert(phits); assert(cpeParams); - auto& hits = *phits; + // auto& hits = *phits; - auto const& clusters = *pclusters; + // auto const& clusters = *pclusters; // copy average geometry corrected by beamspot . FIXME (move it somewhere else???) 
if (0 == blockIdx.x) { auto& agc = hits.averageGeometry(); @@ -64,23 +65,23 @@ namespace gpuPixelRecHits { // as usual one block per module __shared__ ClusParams clusParams; - auto me = clusters.moduleId(blockIdx.x); - int nclus = clusters.clusInModule(me); + auto me = clusters[blockIdx.x].moduleId(); + int nclus = clusters[me].clusInModule(); if (0 == nclus) return; -// #ifdef GPU_DEBUG -// if (threadIdx.x == 0) { -// auto k = clusters.moduleStart(1 + blockIdx.x); -// while (digis.moduleInd(k) == invalidModuleId) -// ++k; -// assert(digis.moduleInd(k) == me); -// } -// #endif #ifdef GPU_DEBUG - if (me % 100 == 1) + if (threadIdx.x == 0) { + auto k = clusters[1 + blockIdx.x].moduleStart(); + while (digis[k].moduleId() == invalidModuleId) + ++k; + assert(digis[k].moduleId() == me); + } +#endif +#ifdef GPU_DEBUG + if (true and me % 100 == 1) if (threadIdx.x == 0) - printf("hitbuilder: %d clusters in module %d. will write at %d\n", nclus, me, clusters.clusModuleStart(me)); + printf("hitbuilder: %d clusters in module %d. 
will write at %d\n", nclus, me, clusters[me].clusModuleStart()); #endif for (int startClus = 0, endClus = nclus; startClus < endClus; startClus += MaxHitsInIter) { @@ -108,21 +109,21 @@ namespace gpuPixelRecHits { __syncthreads(); // one thread per "digi" - auto first = clusters.moduleStart(1 + blockIdx.x) + threadIdx.x; + auto first = clusters[1 + blockIdx.x].moduleStart() + threadIdx.x; for (int i = first; i < numElements; i += blockDim.x) { - auto id = digis.moduleInd(i); + auto id = digis[i].moduleId(); if (id == invalidModuleId) continue; // not valid if (id != me) break; // end of module - auto cl = digis.clus(i); + auto cl = digis[i].clus(); if (cl < startClus || cl >= lastClus) continue; cl -= startClus; assert(cl >= 0); assert(cl < MaxHitsInIter); - auto x = digis.xx(i); - auto y = digis.yy(i); + auto x = digis[i].xx(); + auto y = digis[i].yy(); atomicMin(&clusParams.minRow[cl], x); atomicMax(&clusParams.maxRow[cl], x); atomicMin(&clusParams.minCol[cl], y); @@ -133,20 +134,20 @@ namespace gpuPixelRecHits { auto pixmx = cpeParams->detParams(me).pixmx; for (int i = first; i < numElements; i += blockDim.x) { - auto id = digis.moduleInd(i); + auto id = digis[i].moduleId(); if (id == invalidModuleId) continue; // not valid if (id != me) break; // end of module - auto cl = digis.clus(i); + auto cl = digis[i].clus(); if (cl < startClus || cl >= lastClus) continue; cl -= startClus; assert(cl >= 0); assert(cl < MaxHitsInIter); - auto x = digis.xx(i); - auto y = digis.yy(i); - auto ch = digis.adc(i); + auto x = digis[i].xx(); + auto y = digis[i].yy(); + auto ch = digis[i].adc(); atomicAdd(&clusParams.charge[cl], ch); ch = std::min(ch, pixmx); if (clusParams.minRow[cl] == x) @@ -163,30 +164,31 @@ namespace gpuPixelRecHits { // next one cluster per thread... 
- first = clusters.clusModuleStart(me) + startClus; + first = clusters[me].clusModuleStart() + startClus; for (int ic = threadIdx.x; ic < nClusInIter; ic += blockDim.x) { auto h = first + ic; // output index in global memory assert(h < hits.nHits()); - assert(h < clusters.clusModuleStart(me + 1)); + assert(h < clusters[me+1].clusModuleStart()); pixelCPEforGPU::position(cpeParams->commonParams(), cpeParams->detParams(me), clusParams, ic); pixelCPEforGPU::errorFromDB(cpeParams->commonParams(), cpeParams->detParams(me), clusParams, ic); // store it - hits.setChargeAndStatus(h, clusParams.charge[ic], clusParams.status[ic]); - hits.detectorIndex(h) = me; + hits[h].chargeAndStatus().charge = clusParams.charge[ic]; + hits[h].chargeAndStatus().status = clusParams.status[ic]; + hits[h].detectorIndex() = me; float xl, yl; - hits.xLocal(h) = xl = clusParams.xpos[ic]; - hits.yLocal(h) = yl = clusParams.ypos[ic]; + hits[h].xLocal() = xl = clusParams.xpos[ic]; + hits[h].yLocal() = yl = clusParams.ypos[ic]; - hits.clusterSizeX(h) = clusParams.xsize[ic]; - hits.clusterSizeY(h) = clusParams.ysize[ic]; + hits[h].clusterSizeX() = clusParams.xsize[ic]; + hits[h].clusterSizeY() = clusParams.ysize[ic]; - hits.xerrLocal(h) = clusParams.xerr[ic] * clusParams.xerr[ic] + cpeParams->detParams(me).apeXX; - hits.yerrLocal(h) = clusParams.yerr[ic] * clusParams.yerr[ic] + cpeParams->detParams(me).apeYY; + hits[h].xerrLocal() = clusParams.xerr[ic] * clusParams.xerr[ic] + cpeParams->detParams(me).apeXX; + hits[h].yerrLocal() = clusParams.yerr[ic] * clusParams.yerr[ic] + cpeParams->detParams(me).apeYY; // keep it local for computations float xg, yg, zg; @@ -197,12 +199,14 @@ namespace gpuPixelRecHits { yg -= bs->y; zg -= bs->z; - hits.xGlobal(h) = xg; - hits.yGlobal(h) = yg; - hits.zGlobal(h) = zg; + hits[h].xGlobal() = xg; + hits[h].yGlobal() = yg; + hits[h].zGlobal() = zg; + + hits[h].rGlobal() = std::sqrt(xg * xg + yg * yg); + hits[h].iphi() = unsafe_atan2s<7>(yg, xg); - hits.rGlobal(h) = 
std::sqrt(xg * xg + yg * yg); - hits.iphi(h) = unsafe_atan2s<7>(yg, xg); + // printf("xl = %.2f yl = %.2f xg = %.2f yg = %.2f zg = %.2f q = %d \n",xl,yl,xg,yg,zg,clusParams.charge[ic]); } __syncthreads(); } // end loop on batches diff --git a/RecoLocalTracker/SiPixelRecHits/python/SiPixelRecHits_cfi.py b/RecoLocalTracker/SiPixelRecHits/python/SiPixelRecHits_cfi.py index ec3e068bca422..7284dab68f05e 100644 --- a/RecoLocalTracker/SiPixelRecHits/python/SiPixelRecHits_cfi.py +++ b/RecoLocalTracker/SiPixelRecHits/python/SiPixelRecHits_cfi.py @@ -56,7 +56,7 @@ siPixelRecHitsPreSplittingSoA = SwitchProducerCUDA( cpu = cms.EDAlias( siPixelRecHitsPreSplittingCPU = cms.VPSet( - cms.PSet(type = cms.string("pixelTopologyPhase1TrackingRecHit2DCPUT")), + cms.PSet(type = cms.string("pixelTopologyPhase1TrackingRecHitSoAHost")), cms.PSet(type = cms.string("uintAsHostProduct")) )), ) @@ -64,7 +64,7 @@ phase2_tracker.toModify(siPixelRecHitsPreSplittingSoA, cpu = cms.EDAlias( siPixelRecHitsPreSplittingCPU = cms.VPSet( - cms.PSet(type = cms.string("pixelTopologyPhase2TrackingRecHit2DCPUT")), + cms.PSet(type = cms.string("pixelTopologyPhase2TrackingRecHitSoAHost")), cms.PSet(type = cms.string("uintAsHostProduct")) ))) diff --git a/RecoPixelVertexing/PixelTrackFitting/plugins/PixelTrackDumpCUDA.cc b/RecoPixelVertexing/PixelTrackFitting/plugins/PixelTrackDumpCUDA.cc index ef73c625ebfa8..7e7e11e49e0c8 100644 --- a/RecoPixelVertexing/PixelTrackFitting/plugins/PixelTrackDumpCUDA.cc +++ b/RecoPixelVertexing/PixelTrackFitting/plugins/PixelTrackDumpCUDA.cc @@ -1,9 +1,10 @@ #include +#include // needed here by soa layout #include "CUDADataFormats/Common/interface/Product.h" -#include "CUDADataFormats/Track/interface/PixelTrackHeterogeneous.h" -#include "CUDADataFormats/TrackingRecHit/interface/TrackingRecHit2DHeterogeneous.h" -#include "CUDADataFormats/Vertex/interface/ZVertexHeterogeneous.h" +// #include "CUDADataFormats/Track/interface/PixelTrackHeterogeneous.h" +// #include 
"CUDADataFormats/TrackingRecHit/interface/TrackingRecHit2DHeterogeneous.h" +// #include "CUDADataFormats/Vertex/interface/ZVertexHeterogeneous.h" #include "DataFormats/Common/interface/Handle.h" #include "FWCore/Framework/interface/ConsumesCollector.h" #include "FWCore/Framework/interface/Event.h" @@ -20,10 +21,23 @@ #include "HeterogeneousCore/CUDACore/interface/ScopedContext.h" #include "RecoTracker/TkMSParametrization/interface/PixelRecoUtilities.h" +#include "CUDADataFormats/Vertex/interface/ZVertexSoAHeterogeneousHost.h" +#include "CUDADataFormats/Vertex/interface/ZVertexSoAHeterogeneousDevice.h" + +#include "CUDADataFormats/Track/interface/TrackSoAHeterogeneousDevice.h" +#include "CUDADataFormats/Track/interface/TrackSoAHeterogeneousHost.h" + + + template class PixelTrackDumpCUDAT : public edm::global::EDAnalyzer<> { public: - using PixelTrackHeterogeneous = PixelTrackHeterogeneousT; + using TrackSoAHost = TrackSoAHeterogeneousHost; + using TrackSoADevice = TrackSoAHeterogeneousDevice; + + using VertexSoAHost = zVertex::ZVertexSoAHost;//; + using VertexSoADevice = zVertex::ZVertexSoADevice;//; + explicit PixelTrackDumpCUDAT(const edm::ParameterSet& iConfig); ~PixelTrackDumpCUDAT() override = default; @@ -32,10 +46,10 @@ class PixelTrackDumpCUDAT : public edm::global::EDAnalyzer<> { private: void analyze(edm::StreamID streamID, edm::Event const& iEvent, const edm::EventSetup& iSetup) const override; const bool m_onGPU; - edm::EDGetTokenT> tokenGPUTrack_; - edm::EDGetTokenT> tokenGPUVertex_; - edm::EDGetTokenT tokenSoATrack_; - edm::EDGetTokenT tokenSoAVertex_; + edm::EDGetTokenT> tokenGPUTrack_; + edm::EDGetTokenT> tokenGPUVertex_; + edm::EDGetTokenT tokenSoATrack_; + edm::EDGetTokenT tokenSoAVertex_; }; template @@ -43,12 +57,12 @@ PixelTrackDumpCUDAT::PixelTrackDumpCUDAT(const edm::ParameterSet& : m_onGPU(iConfig.getParameter("onGPU")) { if (m_onGPU) { tokenGPUTrack_ = - consumes>(iConfig.getParameter("pixelTrackSrc")); + 
consumes(iConfig.getParameter("pixelTrackSrc")); tokenGPUVertex_ = - consumes>(iConfig.getParameter("pixelVertexSrc")); + consumes(iConfig.getParameter("pixelVertexSrc")); } else { tokenSoATrack_ = consumes(iConfig.getParameter("pixelTrackSrc")); - tokenSoAVertex_ = consumes(iConfig.getParameter("pixelVertexSrc")); + tokenSoAVertex_ = consumes(iConfig.getParameter("pixelVertexSrc")); } } @@ -71,19 +85,19 @@ void PixelTrackDumpCUDAT::analyze(edm::StreamID streamID, cms::cuda::ScopedContextProduce ctx{hTracks}; auto const& tracks = ctx.get(hTracks); - auto const* tsoa = tracks.get(); + auto const* tsoa = &tracks; assert(tsoa); auto const& vertices = ctx.get(iEvent.get(tokenGPUVertex_)); - auto const* vsoa = vertices.get(); + auto const* vsoa = &vertices; assert(vsoa); } else { - auto const* tsoa = iEvent.get(tokenSoATrack_).get(); - assert(tsoa); + auto const& tsoa = iEvent.get(tokenSoATrack_); + assert(tsoa.buffer()); - auto const* vsoa = iEvent.get(tokenSoAVertex_).get(); - assert(vsoa); + auto const& vsoa = iEvent.get(tokenSoAVertex_); + assert(vsoa.buffer()); } } diff --git a/RecoPixelVertexing/PixelTrackFitting/plugins/PixelTrackProducerFromSoA.cc b/RecoPixelVertexing/PixelTrackFitting/plugins/PixelTrackProducerFromSoA.cc index 6a0f918b0d979..3367abf4c217f 100644 --- a/RecoPixelVertexing/PixelTrackFitting/plugins/PixelTrackProducerFromSoA.cc +++ b/RecoPixelVertexing/PixelTrackFitting/plugins/PixelTrackProducerFromSoA.cc @@ -27,21 +27,25 @@ #include "RecoPixelVertexing/PixelTrackFitting/interface/FitUtils.h" #include "CUDADataFormats/Common/interface/HostProduct.h" -#include "CUDADataFormats/Track/interface/PixelTrackHeterogeneous.h" +// #include "CUDADataFormats/Track/interface/PixelTrackHeterogeneous.h" #include "CUDADataFormats/SiPixelCluster/interface/gpuClusteringConstants.h" #include "Geometry/CommonTopologies/interface/SimplePixelTopology.h" #include "storeTracks.h" #include "CUDADataFormats/Common/interface/HostProduct.h" +#include 
"CUDADataFormats/Track/interface/TrackSoAHeterogeneousHost.h" +#include "CUDADataFormats/Track/interface/TrackSoAHeterogeneousDevice.h" +#include "CUDADataFormats/Track/interface/PixelTrackUtilities.h" + /** * This class creates "leagcy" reco::Track * objects from the output of SoA CA. */ template class PixelTrackProducerFromSoAT : public edm::global::EDProducer<> { - using PixelTrackHeterogeneous = PixelTrackHeterogeneousT; - + using TrackSoAHost = TrackSoAHeterogeneousHost; + using tracksHelpers = tracksUtilities; public: using IndToEdm = std::vector; @@ -58,7 +62,7 @@ class PixelTrackProducerFromSoAT : public edm::global::EDProducer<> { // Event Data tokens const edm::EDGetTokenT tBeamSpot_; - const edm::EDGetTokenT tokenTrack_; + const edm::EDGetTokenT tokenTrack_; const edm::EDGetTokenT cpuHits_; const edm::EDGetTokenT hmsToken_; // Event Setup tokens @@ -66,7 +70,7 @@ class PixelTrackProducerFromSoAT : public edm::global::EDProducer<> { const edm::ESGetToken ttTopoToken_; int32_t const minNumberOfHits_; - pixelTrack::Quality const minQuality_; + pixelTrackSoA::Quality const minQuality_; }; template @@ -78,12 +82,12 @@ PixelTrackProducerFromSoAT::PixelTrackProducerFromSoAT(const edm: idealMagneticFieldToken_(esConsumes()), ttTopoToken_(esConsumes()), minNumberOfHits_(iConfig.getParameter("minNumberOfHits")), - minQuality_(pixelTrack::qualityByName(iConfig.getParameter("minQuality"))) { - if (minQuality_ == pixelTrack::Quality::notQuality) { + minQuality_(pixelTrackSoA::qualityByName(iConfig.getParameter("minQuality"))) { + if (minQuality_ == pixelTrackSoA::Quality::notQuality) { throw cms::Exception("PixelTrackConfiguration") - << iConfig.getParameter("minQuality") + " is not a pixelTrack::Quality"; + << iConfig.getParameter("minQuality") + " is not a pixelTrackSoA::Quality"; } - if (minQuality_ < pixelTrack::Quality::dup) { + if (minQuality_ < pixelTrackSoA::Quality::dup) { throw cms::Exception("PixelTrackConfiguration") << iConfig.getParameter("minQuality") 
+ " not supported"; } @@ -119,7 +123,7 @@ void PixelTrackProducerFromSoAT::produce(edm::StreamID streamID, reco::TrackBase::tight, reco::TrackBase::tight, reco::TrackBase::highPurity}; - assert(reco::TrackBase::highPurity == recoQuality[int(pixelTrack::Quality::highPurity)]); + assert(reco::TrackBase::highPurity == recoQuality[int(pixelTrackSoA::Quality::highPurity)]); // std::cout << "Converting gpu helix in reco tracks" << std::endl; @@ -139,6 +143,7 @@ void PixelTrackProducerFromSoAT::produce(edm::StreamID streamID, std::vector hitmap; auto const &rcs = rechits.data(); auto nhits = rcs.size(); + std::cout << "nhits : " << nhits << std::endl; hitmap.resize(nhits, nullptr); auto const *hitsModuleStart = iEvent.get(hmsToken_).get(); @@ -152,6 +157,7 @@ void PixelTrackProducerFromSoAT::produce(edm::StreamID streamID, auto i = fc[detI] + clus.pixelCluster().originalId(); if (i >= hitmap.size()) hitmap.resize(i + 256, nullptr); // only in case of hit overflow in one module + std::cout << "hitmap "<< i << " detI " << detI << "fc[detI]" << fc[detI] << std::endl; assert(nullptr == hitmap[i]); hitmap[i] = &h; } @@ -159,12 +165,10 @@ void PixelTrackProducerFromSoAT::produce(edm::StreamID streamID, std::vector hits; hits.reserve(5); - const auto &tsoa = *iEvent.get(tokenTrack_); - - auto const *quality = tsoa.qualityData(); - auto const &fit = tsoa.stateAtBS; - auto const &hitIndices = tsoa.hitIndices; - auto nTracks = tsoa.nTracks(); + auto &tsoa = iEvent.get(tokenTrack_); + auto const *quality = tsoa.view().quality(); + auto const hitIndices = tsoa.view().hitIndices(); + auto nTracks = tsoa.view().nTracks(); tracks.reserve(nTracks); @@ -173,19 +177,21 @@ void PixelTrackProducerFromSoAT::produce(edm::StreamID streamID, //sort index by pt std::vector sortIdxs(nTracks); std::iota(sortIdxs.begin(), sortIdxs.end(), 0); - std::sort( - sortIdxs.begin(), sortIdxs.end(), [&](int32_t const i1, int32_t const i2) { return tsoa.pt(i1) > tsoa.pt(i2); }); + std::sort(sortIdxs.begin(), 
sortIdxs.end(), [&](int32_t const i1, int32_t const i2) { + return tsoa.view()[i1].pt() > tsoa.view()[i2].pt(); + }); //store the index of the SoA: indToEdm[index_SoAtrack] -> index_edmTrack (if it exists) indToEdm.resize(sortIdxs.size(), -1); for (const auto &it : sortIdxs) { - auto nHits = tsoa.nHits(it); + // auto nHits = tsoa.nHits(it); + auto nHits = tracksHelpers::nHits(tsoa.view(), it); assert(nHits >= 3); auto q = quality[it]; if (q < minQuality_) continue; - if (tsoa.nLayers(it) < minNumberOfHits_) + if (nHits < minNumberOfHits_) //move to nLayers? continue; indToEdm[it] = nt; ++nt; @@ -197,20 +203,27 @@ void PixelTrackProducerFromSoAT::produce(edm::StreamID streamID, // mind: this values are respect the beamspot! - float chi2 = tsoa.chi2(it); - float phi = tsoa.phi(it); + float chi2 = tsoa.view()[it].chi2(); + float phi = tracksHelpers::phi(tsoa.view(), it); riemannFit::Vector5d ipar, opar; riemannFit::Matrix5d icov, ocov; - fit.copyToDense(ipar, icov, it); + // fit.copyToDense(ipar, icov, it); + tracksHelpers::template copyToDense(tsoa.view(), ipar, icov, it); riemannFit::transformToPerigeePlane(ipar, icov, opar, ocov); LocalTrajectoryParameters lpar(opar(0), opar(1), opar(2), opar(3), opar(4), 1.); AlgebraicSymMatrix55 m; for (int i = 0; i < 5; ++i) + { for (int j = i; j < 5; ++j) - m(i, j) = ocov(i, j); - + { + std::cout << ocov(i, j) << " "; + m(i, j) = ocov(i, j); + } + std::cout << std::endl; + } + std::cout << std::endl; float sp = std::sin(phi); float cp = std::cos(phi); Surface::RotationType rot(sp, -cp, 0, 0, 0, -1.f, cp, sp, 0); diff --git a/RecoPixelVertexing/PixelTrackFitting/plugins/PixelTrackSoAFromCUDA.cc b/RecoPixelVertexing/PixelTrackFitting/plugins/PixelTrackSoAFromCUDA.cc index 0675effd091e8..be7c0a6f0c240 100644 --- a/RecoPixelVertexing/PixelTrackFitting/plugins/PixelTrackSoAFromCUDA.cc +++ b/RecoPixelVertexing/PixelTrackFitting/plugins/PixelTrackSoAFromCUDA.cc @@ -1,8 +1,12 @@ #include +#include // needed here by soa layout #include 
"CUDADataFormats/Common/interface/Product.h" #include "CUDADataFormats/Common/interface/HostProduct.h" -#include "CUDADataFormats/Track/interface/PixelTrackHeterogeneous.h" +// #include "CUDADataFormats/Track/interface/PixelTrackHeterogeneous.h" +#include "CUDADataFormats/Track/interface/TrackSoAHeterogeneousHost.h" +#include "CUDADataFormats/Track/interface/TrackSoAHeterogeneousDevice.h" +#include "CUDADataFormats/Track/interface/PixelTrackUtilities.h" #include "DataFormats/Common/interface/Handle.h" #include "FWCore/Framework/interface/Event.h" #include "FWCore/Framework/interface/EventSetup.h" @@ -16,13 +20,16 @@ #include "FWCore/Utilities/interface/InputTag.h" #include "HeterogeneousCore/CUDACore/interface/ScopedContext.h" + + // Switch on to enable checks and printout for found tracks // #define PIXEL_DEBUG_PRODUCE template class PixelTrackSoAFromCUDAT : public edm::stream::EDProducer { - using PixelTrackHeterogeneous = PixelTrackHeterogeneousT; - using TrackSoA = pixelTrack::TrackSoAT; + + using TrackSoAHost = TrackSoAHeterogeneousHost; + using TrackSoADevice = TrackSoAHeterogeneousDevice; public: explicit PixelTrackSoAFromCUDAT(const edm::ParameterSet& iConfig); @@ -36,16 +43,16 @@ class PixelTrackSoAFromCUDAT : public edm::stream::EDProducer edm::WaitingTaskWithArenaHolder waitingTaskHolder) override; void produce(edm::Event& iEvent, edm::EventSetup const& iSetup) override; - edm::EDGetTokenT> tokenCUDA_; - edm::EDPutTokenT tokenSOA_; + edm::EDGetTokenT> tokenCUDA_; + edm::EDPutTokenT tokenSOA_; - cms::cuda::host::unique_ptr soa_; + TrackSoAHost tracks_h_; }; template PixelTrackSoAFromCUDAT::PixelTrackSoAFromCUDAT(const edm::ParameterSet& iConfig) - : tokenCUDA_(consumes>(iConfig.getParameter("src"))), - tokenSOA_(produces()) {} + : tokenCUDA_(consumes(iConfig.getParameter("src"))), + tokenSOA_(produces()) {} template void PixelTrackSoAFromCUDAT::fillDescriptions(edm::ConfigurationDescriptions& descriptions) { @@ -59,19 +66,23 @@ template void 
PixelTrackSoAFromCUDAT::acquire(edm::Event const& iEvent, edm::EventSetup const& iSetup, edm::WaitingTaskWithArenaHolder waitingTaskHolder) { - cms::cuda::Product const& inputDataWrapped = iEvent.get(tokenCUDA_); + cms::cuda::Product const& inputDataWrapped = iEvent.get(tokenCUDA_); cms::cuda::ScopedContextAcquire ctx{inputDataWrapped, std::move(waitingTaskHolder)}; - auto const& inputData = ctx.get(inputDataWrapped); - - soa_ = inputData.toHostAsync(ctx.stream()); + auto const& tracks_d = ctx.get(inputDataWrapped); // Tracks on device + tracks_h_ = TrackSoAHost(ctx.stream()); // Create an instance of Tracks on Host, using the stream + cudaCheck(cudaMemcpyAsync(tracks_h_.buffer().get(), + tracks_d.const_buffer().get(), + tracks_d.bufferSize(), + cudaMemcpyDeviceToHost, + ctx.stream())); // Copy data from Device to Host + cudaCheck(cudaGetLastError()); } template void PixelTrackSoAFromCUDAT::produce(edm::Event& iEvent, edm::EventSetup const& iSetup) { - auto const& tsoa = *soa_; - auto maxTracks = tsoa.stride(); + auto maxTracks = tracks_h_.view().metadata().size(); + auto nTracks = tracks_h_.view().nTracks(); - auto nTracks = tsoa.nTracks(); assert(nTracks < maxTracks); if (nTracks == maxTracks - 1) { edm::LogWarning("PixelTracks") << "Unsorted reconstructed pixel tracks truncated to " << maxTracks - 1 @@ -84,8 +95,8 @@ void PixelTrackSoAFromCUDAT::produce(edm::Event& iEvent, edm::Eve int32_t nt = 0; for (int32_t it = 0; it < maxTracks; ++it) { - auto nHits = tsoa.nHits(it); - assert(nHits == int(tsoa.hitIndices.size(it))); + auto nHits = tracksUtilities::nHits(tracks_h_.view(), it); + assert(nHits == int(tracks_h_.view().hitIndices().size(it))); if (nHits == 0) break; // this is a guard: maybe we need to move to nTracks... nt++; @@ -94,9 +105,9 @@ void PixelTrackSoAFromCUDAT::produce(edm::Event& iEvent, edm::Eve #endif // DO NOT make a copy (actually TWO....) 
- iEvent.emplace(tokenSOA_, std::move(soa_)); + iEvent.emplace(tokenSOA_, std::move(tracks_h_)); + assert(!tracks_h_.buffer()); - assert(!soa_); } using PixelTrackSoAFromCUDA = PixelTrackSoAFromCUDAT; diff --git a/RecoPixelVertexing/PixelTriplets/plugins/BrokenLineFitOnGPU.cc b/RecoPixelVertexing/PixelTriplets/plugins/BrokenLineFitOnGPU.cc index d6a9db4953be1..19cbf42062683 100644 --- a/RecoPixelVertexing/PixelTriplets/plugins/BrokenLineFitOnGPU.cc +++ b/RecoPixelVertexing/PixelTriplets/plugins/BrokenLineFitOnGPU.cc @@ -1,7 +1,7 @@ #include "BrokenLineFitOnGPU.h" template -void HelixFitOnGPU::launchBrokenLineKernelsOnCPU(HitsView const* hv, +void HelixFitOnGPU::launchBrokenLineKernelsOnCPU(HitSoAConstView hv, uint32_t hitsInFit, uint32_t maxNumberOfTuples) { assert(tuples_); diff --git a/RecoPixelVertexing/PixelTriplets/plugins/BrokenLineFitOnGPU.cu b/RecoPixelVertexing/PixelTriplets/plugins/BrokenLineFitOnGPU.cu index b1ee028b8863e..5af3889808941 100644 --- a/RecoPixelVertexing/PixelTriplets/plugins/BrokenLineFitOnGPU.cu +++ b/RecoPixelVertexing/PixelTriplets/plugins/BrokenLineFitOnGPU.cu @@ -2,7 +2,7 @@ #include "HeterogeneousCore/CUDAUtilities/interface/device_unique_ptr.h" template -void HelixFitOnGPU::launchBrokenLineKernels(HitsView const *hv, +void HelixFitOnGPU::launchBrokenLineKernels(HitSoAConstView hv, uint32_t hitsInFit, uint32_t maxNumberOfTuples, cudaStream_t stream) { diff --git a/RecoPixelVertexing/PixelTriplets/plugins/BrokenLineFitOnGPU.h b/RecoPixelVertexing/PixelTriplets/plugins/BrokenLineFitOnGPU.h index 4d1d57c4e27a8..a11c6ea2c71d2 100644 --- a/RecoPixelVertexing/PixelTriplets/plugins/BrokenLineFitOnGPU.h +++ b/RecoPixelVertexing/PixelTriplets/plugins/BrokenLineFitOnGPU.h @@ -8,7 +8,7 @@ #include -#include "CUDADataFormats/TrackingRecHit/interface/TrackingRecHit2DHeterogeneous.h" +#include "CUDADataFormats/TrackingRecHit/interface/TrackingRecHitsUtilities.h" #include "HeterogeneousCore/CUDAUtilities/interface/cudaCheck.h" #include 
"HeterogeneousCore/CUDAUtilities/interface/cuda_assert.h" #include "RecoLocalTracker/SiPixelRecHits/interface/pixelCPEforGPU.h" @@ -17,11 +17,9 @@ #include "HelixFitOnGPU.h" template -using HitsOnGPU = TrackingRecHit2DSOAViewT; +using Tuples = typename trackSoA::HitContainer; template -using Tuples = pixelTrack::HitContainerT; -template -using OutputSoA = pixelTrack::TrackSoAT; +using OutputSoAView = TrackSoAView; template using TupleMultiplicity = caStructures::TupleMultiplicityT; @@ -33,7 +31,7 @@ using TupleMultiplicity = caStructures::TupleMultiplicityT; template __global__ void kernel_BLFastFit(Tuples const *__restrict__ foundNtuplets, TupleMultiplicity const *__restrict__ tupleMultiplicity, - HitsOnGPU const *__restrict__ hhp, + HitSoAConstView hh, typename TrackerTraits::tindex_type *__restrict__ ptkids, double *__restrict__ phits, float *__restrict__ phits_ge, @@ -46,7 +44,6 @@ __global__ void kernel_BLFastFit(Tuples const *__restrict__ found assert(hitsInFit <= nHitsL); assert(nHitsL <= nHitsH); - assert(hhp); assert(phits); assert(pfast_fit); assert(foundNtuplets); @@ -100,9 +97,9 @@ __global__ void kernel_BLFastFit(Tuples const *__restrict__ found // #define YERR_FROM_DC #ifdef YERR_FROM_DC // try to compute more precise error in y - auto dx = hhp->xGlobal(hitId[hitsInFit - 1]) - hhp->xGlobal(hitId[0]); - auto dy = hhp->yGlobal(hitId[hitsInFit - 1]) - hhp->yGlobal(hitId[0]); - auto dz = hhp->zGlobal(hitId[hitsInFit - 1]) - hhp->zGlobal(hitId[0]); + auto dx = hh[hitId[hitsInFit - 1]].xGlobal() - hh[hitId[0]].xGlobal(); + auto dy = hh[hitId[hitsInFit - 1]].yGlobal() - hh[hitId[0]].yGlobal(); + auto dz = hh[hitId[hitsInFit - 1]].zGlobal() - hh[hitId[0]].zGlobal(); float ux, uy, uz; #endif @@ -118,8 +115,8 @@ __global__ void kernel_BLFastFit(Tuples const *__restrict__ found float ge[6]; #ifdef YERR_FROM_DC - auto const &dp = hhp->cpeParams().detParams(hhp->detectorIndex(hit)); - auto status = hhp->status(hit); + auto const &dp = 
hh.cpeParams().detParams(hh.detectorIndex(hit)); + auto status = hh[hit].chargeAndStatus().status; int qbin = CPEFastParametrisation::kGenErrorQBins - 1 - status.qBin; assert(qbin >= 0 && qbin < 5); bool nok = (status.isBigY | status.isOneY); @@ -136,12 +133,10 @@ __global__ void kernel_BLFastFit(Tuples const *__restrict__ found yerr *= dp.yfact[qbin]; // inflate yerr *= yerr; yerr += dp.apeYY; - yerr = nok ? hhp->yerrLocal(hit) : yerr; - dp.frame.toGlobal(hhp->xerrLocal(hit), 0, yerr, ge); + yerr = nok ? hh[hit].yerrLocal() : yerr; + dp.frame.toGlobal(hh[hit].xerrLocal(), 0, yerr, ge); #else - hhp->cpeParams() - .detParams(hhp->detectorIndex(hit)) - .frame.toGlobal(hhp->xerrLocal(hit), 0, hhp->yerrLocal(hit), ge); + hh.cpeParams().detParams(hh[hit].detectorIndex()).frame.toGlobal(hh[hit].xerrLocal(), 0, hh[hit].yerrLocal(), ge); #endif #ifdef BL_DUMP_HITS @@ -151,16 +146,16 @@ __global__ void kernel_BLFastFit(Tuples const *__restrict__ found local_idx, tkid, hit, - hhp->detectorIndex(hit), + hh[hit].detectorIndex(), i, - hhp->xGlobal(hit), - hhp->yGlobal(hit), - hhp->zGlobal(hit)); + hh[hit].xGlobal(), + hh[hit].yGlobal(), + hh[hit].zGlobal()); printf("Error: hits_ge.col(%d) << %e,%e,%e,%e,%e,%e\n", i, ge[0], ge[1], ge[2], ge[3], ge[4], ge[5]); } #endif - hits.col(i) << hhp->xGlobal(hit), hhp->yGlobal(hit), hhp->zGlobal(hit); + hits.col(i) << hh[hit].xGlobal(), hh[hit].yGlobal(), hh[hit].zGlobal(); hits_ge.col(i) << ge[0], ge[1], ge[2], ge[3], ge[4], ge[5]; } brokenline::fastFit(hits, fast_fit); @@ -176,12 +171,14 @@ __global__ void kernel_BLFastFit(Tuples const *__restrict__ found template __global__ void kernel_BLFit(TupleMultiplicity const *__restrict__ tupleMultiplicity, double bField, - OutputSoA *results, + OutputSoAView results_view, typename TrackerTraits::tindex_type const *__restrict__ ptkids, double *__restrict__ phits, float *__restrict__ phits_ge, double *__restrict__ pfast_fit) { - assert(results); + assert(results_view.pt()); + 
assert(results_view.eta()); + assert(results_view.chi2()); assert(pfast_fit); constexpr auto invalidTkId = std::numeric_limits::max(); @@ -209,10 +206,11 @@ __global__ void kernel_BLFit(TupleMultiplicity const *__restrict_ brokenline::lineFit(hits_ge, fast_fit, bField, data, line); brokenline::circleFit(hits, hits_ge, fast_fit, bField, data, circle); - results->stateAtBS.copyFromCircle(circle.par, circle.cov, line.par, line.cov, 1.f / float(bField), tkid); - results->pt(tkid) = float(bField) / float(std::abs(circle.par(2))); - results->eta(tkid) = asinhf(line.par(0)); - results->chi2(tkid) = (circle.chi2 + line.chi2) / (2 * N - 5); + tracksUtilities::copyFromCircle( + results_view, circle.par, circle.cov, line.par, line.cov, 1.f / float(bField), tkid); + results_view[tkid].pt() = float(bField) / float(std::abs(circle.par(2))); + results_view[tkid].eta() = asinhf(line.par(0)); + results_view[tkid].chi2() = (circle.chi2 + line.chi2) / (2 * N - 5); #ifdef BROKENLINE_DEBUG if (!(circle.chi2 >= 0) || !(line.chi2 >= 0)) diff --git a/RecoPixelVertexing/PixelTriplets/plugins/BuildFile.xml b/RecoPixelVertexing/PixelTriplets/plugins/BuildFile.xml index 95c443c3b51e7..de2a40fc8b0f0 100644 --- a/RecoPixelVertexing/PixelTriplets/plugins/BuildFile.xml +++ b/RecoPixelVertexing/PixelTriplets/plugins/BuildFile.xml @@ -1,5 +1,6 @@ + diff --git a/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletCUDA.cc b/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletCUDA.cc index fade739410e2f..b0a85c7ac1313 100644 --- a/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletCUDA.cc +++ b/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletCUDA.cc @@ -17,19 +17,24 @@ #include "FWCore/Utilities/interface/RunningAverage.h" #include "HeterogeneousCore/CUDACore/interface/ScopedContext.h" #include "MagneticField/Records/interface/IdealMagneticFieldRecord.h" + #include "RecoTracker/TkMSParametrization/interface/PixelRecoUtilities.h" #include "CAHitNtupletGeneratorOnGPU.h" -#include 
"CUDADataFormats/Track/interface/PixelTrackHeterogeneous.h" -#include "CUDADataFormats/TrackingRecHit/interface/TrackingRecHit2DHeterogeneous.h" + +#include "CUDADataFormats/Track/interface/TrackSoAHeterogeneousHost.h" +#include "CUDADataFormats/Track/interface/TrackSoAHeterogeneousDevice.h" template class CAHitNtupletCUDAT : public edm::global::EDProducer<> { - using PixelTrackHeterogeneous = PixelTrackHeterogeneousT; - using HitsView = TrackingRecHit2DSOAViewT; - using HitsOnGPU = TrackingRecHit2DGPUT; - using HitsOnCPU = TrackingRecHit2DCPUT; + using HitsConstView = HitSoAConstView; + using HitsOnGPU = TrackingRecHitSoADevice; //TODO move to OnDevice + using HitsOnCPU = TrackingRecHitSoAHost; //TODO move to OnHost + + using TrackSoAHost = TrackSoAHeterogeneousHost; + using TrackSoADevice = TrackSoAHeterogeneousDevice; + using GPUAlgo = CAHitNtupletGeneratorOnGPU; public: @@ -48,9 +53,9 @@ class CAHitNtupletCUDAT : public edm::global::EDProducer<> { edm::ESGetToken tokenField_; edm::EDGetTokenT> tokenHitGPU_; - edm::EDPutTokenT> tokenTrackGPU_; + edm::EDPutTokenT> tokenTrackGPU_; edm::EDGetTokenT tokenHitCPU_; - edm::EDPutTokenT tokenTrackCPU_; + edm::EDPutTokenT tokenTrackCPU_; GPUAlgo gpuAlgo_; }; @@ -60,10 +65,10 @@ CAHitNtupletCUDAT::CAHitNtupletCUDAT(const edm::ParameterSet& iCo : onGPU_(iConfig.getParameter("onGPU")), tokenField_(esConsumes()), gpuAlgo_(iConfig, consumesCollector()) { if (onGPU_) { tokenHitGPU_ = consumes(iConfig.getParameter("pixelRecHitSrc")); - tokenTrackGPU_ = produces>(); + tokenTrackGPU_ = produces>(); } else { tokenHitCPU_ = consumes(iConfig.getParameter("pixelRecHitSrc")); - tokenTrackCPU_ = produces(); + tokenTrackCPU_ = produces(); } } @@ -95,13 +100,17 @@ void CAHitNtupletCUDAT::produce(edm::StreamID streamID, auto bf = 1. / es.getData(tokenField_).inverseBzAtOriginInGeV(); if (onGPU_) { - auto hHits = iEvent.getHandle(tokenHitGPU_); + // auto hHits = iEvent.getHandle(tokenHitGPU_); //Why? 
+ edm::Handle> hHits; + iEvent.getByToken(tokenHitGPU_, hHits); + cms::cuda::ScopedContextProduce ctx{*hHits}; - auto const& hits = ctx.get(*hHits); - ctx.emplace(iEvent, tokenTrackGPU_, gpuAlgo_.makeTuplesAsync(hits, bf, ctx.stream())); + auto& hits_d = ctx.get(*hHits); + ctx.emplace(iEvent, tokenTrackGPU_, gpuAlgo_.makeTuplesAsync(hits_d, bf, ctx.stream())); } else { - auto const& hits = iEvent.get(tokenHitCPU_); - iEvent.emplace(tokenTrackCPU_, gpuAlgo_.makeTuples(hits, bf)); + // auto const& hits = iEvent.get(tokenHitCPU_); + auto& hits_h = iEvent.get(tokenHitCPU_); + iEvent.emplace(tokenTrackCPU_, gpuAlgo_.makeTuples(hits_h, bf)); } } diff --git a/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorKernels.cc b/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorKernels.cc index 75fbbffb49190..6d43af2966e03 100644 --- a/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorKernels.cc +++ b/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorKernels.cc @@ -14,7 +14,7 @@ void CAHitNtupletGeneratorKernelsCPU::printCounters(Counters cons } template -void CAHitNtupletGeneratorKernelsCPU::buildDoublets(HitsOnCPU const &hh, cudaStream_t stream) { +void CAHitNtupletGeneratorKernelsCPU::buildDoublets(HitsConstView hh, int32_t offsetBPIX2, cudaStream_t stream) { using namespace gpuPixelDoublets; using GPUCACell = GPUCACellT; @@ -26,7 +26,7 @@ void CAHitNtupletGeneratorKernelsCPU::buildDoublets(HitsOnCPU con auto nhits = hh.nHits(); #ifdef NTUPLE_DEBUG - std::cout << "building Doublets out of " << nhits << " Hits. BPIX2 offset is " << hh.offsetBPIX2() << std::endl; + std::cout << "building Doublets out of " << nhits << " Hits. 
BPIX2 offset is " << offsetBPIX2 << std::endl; #endif // use "nhits" to heuristically dimension the workspace @@ -35,7 +35,7 @@ void CAHitNtupletGeneratorKernelsCPU::buildDoublets(HitsOnCPU con //this->device_isOuterHitOfCell_ = Traits::template make_unique(std::max(1U, nhits), stream); this->device_isOuterHitOfCell_ = std::make_unique(std::max(1U, nhits)); assert(this->device_isOuterHitOfCell_.get()); - this->isOuterHitOfCell_ = OuterHitOfCell{this->device_isOuterHitOfCell_.get(), hh.offsetBPIX2()}; + this->isOuterHitOfCell_ = OuterHitOfCell{this->device_isOuterHitOfCell_.get(), offsetBPIX2}; auto cellStorageSize = TrackerTraits::maxNumOfActiveDoublets * sizeof(CellNeighbors) + TrackerTraits::maxNumOfActiveDoublets * sizeof(CellTracks); @@ -68,28 +68,20 @@ void CAHitNtupletGeneratorKernelsCPU::buildDoublets(HitsOnCPU con this->device_nCells_, this->device_theCellNeighbors_.get(), this->device_theCellTracks_.get(), - hh.view(), + hh, this->isOuterHitOfCell_, nActualPairs, this->params_.cellCuts_); } template -void CAHitNtupletGeneratorKernelsCPU::launchKernels(HitsOnCPU const &hh, - TkSoA *tracks_d, - cudaStream_t cudaStream) { +void CAHitNtupletGeneratorKernelsCPU::launchKernels(HitsConstView hh, TkSoAView tracks_view, cudaStream_t cudaStream) { using namespace caHitNtupletGeneratorKernels; - auto *tuples_d = &tracks_d->hitIndices; - auto *detId_d = &tracks_d->detIndices; - auto *quality_d = tracks_d->qualityData(); - - assert(tuples_d && quality_d); - // zero tuples - cms::cuda::launchZero(tuples_d, cudaStream); + cms::cuda::launchZero(&tracks_view.hitIndices(), cudaStream); - auto nhits = hh.nHits(); + uint32_t nhits = hh.metadata().size(); #ifdef NTUPLE_DEBUG std::cout << "start tuple building. 
N hits " << nhits << std::endl; @@ -103,7 +95,7 @@ void CAHitNtupletGeneratorKernelsCPU::launchKernels(HitsOnCPU con kernel_connect(this->device_hitTuple_apc_, this->device_hitToTuple_apc_, // needed only to be reset, ready for next kernel - hh.view(), + hh, this->device_theCells_.get(), this->device_nCells_, this->device_theCellNeighbors_.get(), @@ -112,91 +104,84 @@ void CAHitNtupletGeneratorKernelsCPU::launchKernels(HitsOnCPU con if (nhits > 1 && this->params_.earlyFishbone_) { gpuPixelDoublets::fishbone( - hh.view(), this->device_theCells_.get(), this->device_nCells_, this->isOuterHitOfCell_, nhits, false); + hh, this->device_theCells_.get(), this->device_nCells_, this->isOuterHitOfCell_, nhits, false); } - kernel_find_ntuplets(hh.view(), + kernel_find_ntuplets(hh, + tracks_view, this->device_theCells_.get(), this->device_nCells_, this->device_theCellTracks_.get(), - tuples_d, this->device_hitTuple_apc_, - quality_d, this->params_.caParams_); if (this->params_.doStats_) kernel_mark_used(this->device_theCells_.get(), this->device_nCells_); - cms::cuda::finalizeBulk(this->device_hitTuple_apc_, tuples_d); + cms::cuda::finalizeBulk(this->device_hitTuple_apc_, &tracks_view.hitIndices()); - kernel_fillHitDetIndices(tuples_d, hh.view(), detId_d); - kernel_fillNLayers(tracks_d, this->device_hitTuple_apc_); + kernel_fillHitDetIndices(tracks_view, hh); + kernel_fillNLayers(tracks_view, this->device_hitTuple_apc_); // remove duplicates (tracks that share a doublet) kernel_earlyDuplicateRemover( - this->device_theCells_.get(), this->device_nCells_, tracks_d, quality_d, this->params_.dupPassThrough_); + this->device_theCells_.get(), this->device_nCells_, tracks_view, this->params_.dupPassThrough_); - kernel_countMultiplicity(tuples_d, quality_d, this->device_tupleMultiplicity_.get()); + kernel_countMultiplicity(tracks_view, this->device_tupleMultiplicity_.get()); cms::cuda::launchFinalize(this->device_tupleMultiplicity_.get(), cudaStream); - 
kernel_fillMultiplicity(tuples_d, quality_d, this->device_tupleMultiplicity_.get()); + kernel_fillMultiplicity(tracks_view, this->device_tupleMultiplicity_.get()); if (nhits > 1 && this->params_.lateFishbone_) { gpuPixelDoublets::fishbone( - hh.view(), this->device_theCells_.get(), this->device_nCells_, this->isOuterHitOfCell_, nhits, true); + hh, this->device_theCells_.get(), this->device_nCells_, this->isOuterHitOfCell_, nhits, true); } } template -void CAHitNtupletGeneratorKernelsCPU::classifyTuples(HitsOnCPU const &hh, - TkSoA *tracks_d, - cudaStream_t cudaStream) { +void CAHitNtupletGeneratorKernelsCPU::classifyTuples(HitsConstView hh, TkSoAView tracks_view, cudaStream_t cudaStream) { using namespace caHitNtupletGeneratorKernels; - int32_t nhits = hh.nHits(); + int32_t nhits = hh.metadata().size(); - auto const *tuples_d = &tracks_d->hitIndices; - auto *quality_d = tracks_d->qualityData(); + // auto const *tracks_view = &tracks_view->hitIndices; + // auto *quality_d = tracks_view->qualityData(); // classify tracks based on kinematics - kernel_classifyTracks(tuples_d, tracks_d, this->params_.qualityCuts_, quality_d); + kernel_classifyTracks(tracks_view, this->params_.qualityCuts_); if (this->params_.lateFishbone_) { // apply fishbone cleaning to good tracks - kernel_fishboneCleaner(this->device_theCells_.get(), this->device_nCells_, quality_d); + kernel_fishboneCleaner(this->device_theCells_.get(), this->device_nCells_, tracks_view); } // remove duplicates (tracks that share a doublet) kernel_fastDuplicateRemover( - this->device_theCells_.get(), this->device_nCells_, tracks_d, this->params_.dupPassThrough_); + this->device_theCells_.get(), this->device_nCells_, tracks_view, this->params_.dupPassThrough_); // fill hit->track "map" if (this->params_.doSharedHitCut_ || this->params_.doStats_) { - kernel_countHitInTracks(tuples_d, quality_d, this->device_hitToTuple_.get()); + kernel_countHitInTracks(tracks_view, this->device_hitToTuple_.get()); 
cms::cuda::launchFinalize(this->hitToTupleView_, cudaStream); - kernel_fillHitInTracks(tuples_d, quality_d, this->device_hitToTuple_.get()); + kernel_fillHitInTracks(tracks_view, this->device_hitToTuple_.get()); } // remove duplicates (tracks that share at least one hit) if (this->params_.doSharedHitCut_) { - kernel_rejectDuplicate(tracks_d, - quality_d, + kernel_rejectDuplicate(tracks_view, this->params_.minHitsForSharingCut_, this->params_.dupPassThrough_, this->device_hitToTuple_.get()); - kernel_sharedHitCleaner(hh.view(), - tracks_d, - quality_d, + kernel_sharedHitCleaner(hh, + tracks_view, this->params_.minHitsForSharingCut_, this->params_.dupPassThrough_, this->device_hitToTuple_.get()); if (this->params_.useSimpleTripletCleaner_) { - kernel_simpleTripletCleaner(tracks_d, - quality_d, + kernel_simpleTripletCleaner(tracks_view, this->params_.minHitsForSharingCut_, this->params_.dupPassThrough_, this->device_hitToTuple_.get()); } else { - kernel_tripletCleaner(tracks_d, - quality_d, + kernel_tripletCleaner(tracks_view, this->params_.minHitsForSharingCut_, this->params_.dupPassThrough_, this->device_hitToTuple_.get()); @@ -205,7 +190,7 @@ void CAHitNtupletGeneratorKernelsCPU::classifyTuples(HitsOnCPU co if (this->params_.doStats_) { std::lock_guard guard(lock_stat); - kernel_checkOverflows(tuples_d, + kernel_checkOverflows(tracks_view, this->device_tupleMultiplicity_.get(), this->device_hitToTuple_.get(), this->device_hitTuple_apc_, @@ -223,7 +208,7 @@ void CAHitNtupletGeneratorKernelsCPU::classifyTuples(HitsOnCPU co // counters (add flag???) 
std::lock_guard guard(lock_stat); kernel_doStatsForHitInTracks(this->device_hitToTuple_.get(), this->counters_); - kernel_doStatsForTracks(tuples_d, quality_d, this->counters_); + kernel_doStatsForTracks(tracks_view, this->counters_); } #ifdef DUMP_GPU_TK_TUPLES @@ -233,7 +218,7 @@ void CAHitNtupletGeneratorKernelsCPU::classifyTuples(HitsOnCPU co std::lock_guard guard(lock); ++iev; kernel_print_found_ntuplets( - hh.view(), tuples_d, tracks_d, quality_d, this->device_hitToTuple_.get(), 0, 1000000, iev); + hh, tracks_view, this->device_hitToTuple_.get(), 0, 1000000, iev); } #endif } diff --git a/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorKernels.cu b/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorKernels.cu index 59ae2041b44aa..acc692b341928 100644 --- a/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorKernels.cu +++ b/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorKernels.cu @@ -5,20 +5,17 @@ // #define GPU_DEBUG template -void CAHitNtupletGeneratorKernelsGPU::launchKernels(HitsOnCPU const &hh, - TkSoA *tracks_d, +void CAHitNtupletGeneratorKernelsGPU::launchKernels(HitsConstView hh, + TkSoAView tracks_view, cudaStream_t cudaStream) { using namespace gpuPixelDoublets; using namespace caHitNtupletGeneratorKernels; - // these are pointer on GPU! - auto *tuples_d = &tracks_d->hitIndices; - auto *detId_d = &tracks_d->detIndices; - auto *quality_d = tracks_d->qualityData(); + // auto *quality_d = tracks_view.quality(); // zero tuples - cms::cuda::launchZero(tuples_d, cudaStream); + cms::cuda::launchZero(&(tracks_view.hitIndices()), cudaStream); //TODO test .data() - int32_t nhits = hh.nHits(); + int32_t nhits = hh.metadata().size(); #ifdef NTUPLE_DEBUG std::cout << "start tuple building. 
N hits " << nhits << std::endl; @@ -45,7 +42,7 @@ void CAHitNtupletGeneratorKernelsGPU::launchKernels(HitsOnCPU con kernel_connect <<>>(this->device_hitTuple_apc_, this->device_hitToTuple_apc_, // needed only to be reset, ready for next kernel - hh.view(), + hh, this->device_theCells_.get(), this->device_nCells_, this->device_theCellNeighbors_.get(), @@ -63,19 +60,18 @@ void CAHitNtupletGeneratorKernelsGPU::launchKernels(HitsOnCPU con dim3 blks(1, numberOfBlocks, 1); dim3 thrs(stride, blockSize, 1); fishbone<<>>( - hh.view(), this->device_theCells_.get(), this->device_nCells_, this->isOuterHitOfCell_, nhits, false); + hh, this->device_theCells_.get(), this->device_nCells_, this->isOuterHitOfCell_, nhits, false); cudaCheck(cudaGetLastError()); } blockSize = 64; numberOfBlocks = (3 * this->params_.cellCuts_.maxNumberOfDoublets_ / 4 + blockSize - 1) / blockSize; - kernel_find_ntuplets<<>>(hh.view(), + kernel_find_ntuplets<<>>(hh, + tracks_view, this->device_theCells_.get(), this->device_nCells_, this->device_theCellTracks_.get(), - tuples_d, this->device_hitTuple_apc_, - quality_d, this->params_.caParams_); #ifdef GPU_DEBUG cudaDeviceSynchronize(); @@ -94,21 +90,21 @@ void CAHitNtupletGeneratorKernelsGPU::launchKernels(HitsOnCPU con blockSize = 128; numberOfBlocks = (HitContainer::ctNOnes() + blockSize - 1) / blockSize; - cms::cuda::finalizeBulk<<>>(this->device_hitTuple_apc_, tuples_d); + cms::cuda::finalizeBulk<<>>(this->device_hitTuple_apc_, &tracks_view.hitIndices()); //TODO test .data() #ifdef GPU_DEBUG cudaDeviceSynchronize(); cudaCheck(cudaGetLastError()); #endif - kernel_fillHitDetIndices<<>>(tuples_d, hh.view(), detId_d); + kernel_fillHitDetIndices<<>>(tracks_view, hh); cudaCheck(cudaGetLastError()); #ifdef GPU_DEBUG cudaDeviceSynchronize(); cudaCheck(cudaGetLastError()); #endif - kernel_fillNLayers<<>>(tracks_d, this->device_hitTuple_apc_); + kernel_fillNLayers<<>>(tracks_view, this->device_hitTuple_apc_); cudaCheck(cudaGetLastError()); #ifdef GPU_DEBUG @@ 
-120,7 +116,7 @@ void CAHitNtupletGeneratorKernelsGPU::launchKernels(HitsOnCPU con numberOfBlocks = this->nDoubletBlocks(blockSize); kernel_earlyDuplicateRemover<<>>( - this->device_theCells_.get(), this->device_nCells_, tracks_d, quality_d, this->params_.dupPassThrough_); + this->device_theCells_.get(), this->device_nCells_, tracks_view, this->params_.dupPassThrough_); cudaCheck(cudaGetLastError()); #ifdef GPU_DEBUG cudaDeviceSynchronize(); @@ -130,10 +126,10 @@ void CAHitNtupletGeneratorKernelsGPU::launchKernels(HitsOnCPU con blockSize = 128; numberOfBlocks = (3 * TrackerTraits::maxNumberOfTuples / 4 + blockSize - 1) / blockSize; kernel_countMultiplicity - <<>>(tuples_d, quality_d, this->device_tupleMultiplicity_.get()); + <<>>(tracks_view, this->device_tupleMultiplicity_.get()); cms::cuda::launchFinalize(this->device_tupleMultiplicity_.get(), cudaStream); kernel_fillMultiplicity - <<>>(tuples_d, quality_d, this->device_tupleMultiplicity_.get()); + <<>>(tracks_view, this->device_tupleMultiplicity_.get()); cudaCheck(cudaGetLastError()); #ifdef GPU_DEBUG cudaDeviceSynchronize(); @@ -149,7 +145,7 @@ void CAHitNtupletGeneratorKernelsGPU::launchKernels(HitsOnCPU con dim3 blks(1, numberOfBlocks, 1); dim3 thrs(stride, blockSize, 1); fishbone<<>>( - hh.view(), this->device_theCells_.get(), this->device_nCells_, this->isOuterHitOfCell_, nhits, true); + hh, this->device_theCells_.get(), this->device_nCells_, this->isOuterHitOfCell_, nhits, true); cudaCheck(cudaGetLastError()); } @@ -163,8 +159,8 @@ void CAHitNtupletGeneratorKernelsGPU::launchKernels(HitsOnCPU con } template -void CAHitNtupletGeneratorKernelsGPU::buildDoublets(HitsOnCPU const &hh, cudaStream_t stream) { - int32_t nhits = hh.nHits(); +void CAHitNtupletGeneratorKernelsGPU::buildDoublets(HitsConstView hh, int32_t offsetBPIX2, cudaStream_t stream) { + int32_t nhits = hh.metadata().size(); using namespace gpuPixelDoublets; @@ -174,7 +170,7 @@ void CAHitNtupletGeneratorKernelsGPU::buildDoublets(HitsOnCPU con 
using CellTracks = typename GPUCACell::CellTracks; using OuterHitOfCellContainer = typename GPUCACell::OuterHitOfCellContainer; - this->isOuterHitOfCell_ = OuterHitOfCell{this->device_isOuterHitOfCell_.get(), hh.offsetBPIX2()}; + this->isOuterHitOfCell_ = OuterHitOfCell{this->device_isOuterHitOfCell_.get(), offsetBPIX2}; #ifdef NTUPLE_DEBUG std::cout << "building Doublets out of " << nhits << " Hits" << std::endl; @@ -187,10 +183,10 @@ void CAHitNtupletGeneratorKernelsGPU::buildDoublets(HitsOnCPU con // in principle we can use "nhits" to heuristically dimension the workspace... this->device_isOuterHitOfCell_ = - cms::cuda::make_device_unique(std::max(1, nhits - hh.offsetBPIX2()), stream); + cms::cuda::make_device_unique(std::max(1, nhits - offsetBPIX2), stream); assert(this->device_isOuterHitOfCell_.get()); - this->isOuterHitOfCell_ = OuterHitOfCell{this->device_isOuterHitOfCell_.get(), hh.offsetBPIX2()}; + this->isOuterHitOfCell_ = OuterHitOfCell{this->device_isOuterHitOfCell_.get(), offsetBPIX2}; this->cellStorage_ = cms::cuda::make_device_unique(TrackerTraits::maxNumOfActiveDoublets * sizeof(CellNeighbors) + @@ -203,7 +199,7 @@ void CAHitNtupletGeneratorKernelsGPU::buildDoublets(HitsOnCPU con { int threadsPerBlock = 128; // at least one block! 
- int blocks = (std::max(1, nhits - hh.offsetBPIX2()) + threadsPerBlock - 1) / threadsPerBlock; + int blocks = (std::max(1, nhits - offsetBPIX2) + threadsPerBlock - 1) / threadsPerBlock; initDoublets<<>>(this->isOuterHitOfCell_, nhits, this->device_theCellNeighbors_.get(), @@ -236,7 +232,7 @@ void CAHitNtupletGeneratorKernelsGPU::buildDoublets(HitsOnCPU con this->device_nCells_, this->device_theCellNeighbors_.get(), this->device_theCellTracks_.get(), - hh.view(), + hh, this->isOuterHitOfCell_, nActualPairs, this->params_.cellCuts_); @@ -249,36 +245,34 @@ void CAHitNtupletGeneratorKernelsGPU::buildDoublets(HitsOnCPU con } template -void CAHitNtupletGeneratorKernelsGPU::classifyTuples(HitsOnCPU const &hh, - TkSoA *tracks_d, +void CAHitNtupletGeneratorKernelsGPU::classifyTuples(HitsConstView hh, TkSoAView tracks_view, cudaStream_t cudaStream) { using namespace caHitNtupletGeneratorKernels; // these are pointer on GPU! - auto const *tuples_d = &tracks_d->hitIndices; - auto *quality_d = tracks_d->qualityData(); + // auto *quality_d = tracks_view.quality(); - int32_t nhits = hh.nHits(); + int32_t nhits = hh.metadata().size(); auto blockSize = 64; // classify tracks based on kinematics auto numberOfBlocks = this->nQuadrupletBlocks(blockSize); kernel_classifyTracks - <<>>(tuples_d, tracks_d, this->params_.qualityCuts_, quality_d); + <<>>(tracks_view, this->params_.qualityCuts_); if (this->params_.lateFishbone_) { // apply fishbone cleaning to good tracks numberOfBlocks = this->nDoubletBlocks(blockSize); kernel_fishboneCleaner - <<>>(this->device_theCells_.get(), this->device_nCells_, quality_d); + <<>>(this->device_theCells_.get(), this->device_nCells_, tracks_view); cudaCheck(cudaGetLastError()); } // mark duplicates (tracks that share a doublet) numberOfBlocks = this->nDoubletBlocks(blockSize); kernel_fastDuplicateRemover<<>>( - this->device_theCells_.get(), this->device_nCells_, tracks_d, this->params_.dupPassThrough_); + this->device_theCells_.get(), 
this->device_nCells_, tracks_view, this->params_.dupPassThrough_); cudaCheck(cudaGetLastError()); #ifdef GPU_DEBUG cudaCheck(cudaDeviceSynchronize()); @@ -289,7 +283,7 @@ void CAHitNtupletGeneratorKernelsGPU::classifyTuples(HitsOnCPU co assert(this->hitToTupleView_.offSize > nhits); numberOfBlocks = this->nQuadrupletBlocks(blockSize); kernel_countHitInTracks - <<>>(tuples_d, quality_d, this->device_hitToTuple_.get()); + <<>>(tracks_view, this->device_hitToTuple_.get()); //CHECK cudaCheck(cudaGetLastError()); assert((this->hitToTupleView_.assoc == this->device_hitToTuple_.get()) && (this->hitToTupleView_.offStorage == this->device_hitToTupleStorage_.get()) && @@ -297,7 +291,7 @@ void CAHitNtupletGeneratorKernelsGPU::classifyTuples(HitsOnCPU co cms::cuda::launchFinalize(this->hitToTupleView_, cudaStream); cudaCheck(cudaGetLastError()); kernel_fillHitInTracks - <<>>(tuples_d, quality_d, this->device_hitToTuple_.get()); + <<>>(tracks_view, this->device_hitToTuple_.get()); cudaCheck(cudaGetLastError()); #ifdef GPU_DEBUG cudaCheck(cudaDeviceSynchronize()); @@ -309,31 +303,27 @@ void CAHitNtupletGeneratorKernelsGPU::classifyTuples(HitsOnCPU co numberOfBlocks = (this->hitToTupleView_.offSize + blockSize - 1) / blockSize; kernel_rejectDuplicate - <<>>(tracks_d, - quality_d, + <<>>(tracks_view, this->params_.minHitsForSharingCut_, this->params_.dupPassThrough_, this->device_hitToTuple_.get()); kernel_sharedHitCleaner - <<>>(hh.view(), - tracks_d, - quality_d, + <<>>(hh, + tracks_view, this->params_.minHitsForSharingCut_, this->params_.dupPassThrough_, this->device_hitToTuple_.get()); if (this->params_.useSimpleTripletCleaner_) { kernel_simpleTripletCleaner - <<>>(tracks_d, - quality_d, + <<>>(tracks_view, this->params_.minHitsForSharingCut_, this->params_.dupPassThrough_, this->device_hitToTuple_.get()); } else { kernel_tripletCleaner - <<>>(tracks_d, - quality_d, + <<>>(tracks_view, this->params_.minHitsForSharingCut_, this->params_.dupPassThrough_, 
this->device_hitToTuple_.get()); @@ -347,7 +337,7 @@ void CAHitNtupletGeneratorKernelsGPU::classifyTuples(HitsOnCPU co if (this->params_.doStats_) { numberOfBlocks = (std::max(nhits, int(this->params_.cellCuts_.maxNumberOfDoublets_)) + blockSize - 1) / blockSize; kernel_checkOverflows - <<>>(tuples_d, + <<>>(tracks_view, this->device_tupleMultiplicity_.get(), this->device_hitToTuple_.get(), this->device_hitTuple_apc_, @@ -370,7 +360,7 @@ void CAHitNtupletGeneratorKernelsGPU::classifyTuples(HitsOnCPU co cudaCheck(cudaGetLastError()); numberOfBlocks = (3 * TrackerTraits::maxNumberOfQuadruplets / 4 + blockSize - 1) / blockSize; kernel_doStatsForTracks - <<>>(tuples_d, quality_d, this->counters_); + <<>>(tracks_view, this->counters_); //why sometimes yes and some no? cudaCheck(cudaGetLastError()); } #ifdef GPU_DEBUG @@ -386,11 +376,11 @@ void CAHitNtupletGeneratorKernelsGPU::classifyTuples(HitsOnCPU co ++iev; for (int k = 0; k < 20000; k += 500) { kernel_print_found_ntuplets<<<1, 32, 0, cudaStream>>>( - hh.view(), tuples_d, tracks_d, quality_d, this->device_hitToTuple_.get(), k, k + 500, iev); + hh, tracks_view, this->device_hitToTuple_.get(), k, k + 500, iev); cudaDeviceSynchronize(); } kernel_print_found_ntuplets<<<1, 32, 0, cudaStream>>>( - hh.view(), tuples_d, tracks_d, quality_d, this->device_hitToTuple_.get(), 20000, 1000000, iev); + hh, tracks_view, this->device_hitToTuple_.get(), 20000, 1000000, iev); cudaDeviceSynchronize(); // cudaStreamSynchronize(cudaStream); } diff --git a/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorKernels.h b/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorKernels.h index b595106299d71..f00f7a40827e7 100644 --- a/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorKernels.h +++ b/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorKernels.h @@ -3,9 +3,15 @@ // #define GPU_DEBUG -#include "CUDADataFormats/Track/interface/PixelTrackHeterogeneous.h" #include "GPUCACell.h" #include 
"gpuPixelDoublets.h" + +#include "CUDADataFormats/Track/interface/PixelTrackUtilities.h" +#include "CUDADataFormats/TrackingRecHit/interface/TrackingRecHitsUtilities.h" +#include "CUDADataFormats/Common/interface/HeterogeneousSoA.h" +#include "CUDADataFormats/TrackingRecHit/interface/TrackingRecHitSoADevice.h" +#include "CUDADataFormats/Track/interface/TrackSoAHeterogeneousHost.h" + // #define DUMP_GPU_TK_TUPLES namespace caHitNtupletGenerator { @@ -84,7 +90,7 @@ namespace caHitNtupletGenerator { template struct ParamsT> : public AlgoParams { using TT = TrackerTraits; - using QualityCuts = pixelTrack::QualityCutsT; //track quality cuts + using QualityCuts = pixelTrackSoA::QualityCutsT; //track quality cuts using CellCuts = gpuPixelDoublets::CellCutsT; //cell building cuts using CAParams = CAParamsT; //params to be used on device @@ -135,7 +141,7 @@ namespace caHitNtupletGenerator { template struct ParamsT> : public AlgoParams { using TT = TrackerTraits; - using QualityCuts = pixelTrack::QualityCutsT; + using QualityCuts = pixelTrackSoA::QualityCutsT; using CellCuts = gpuPixelDoublets::CellCutsT; using CAParams = CAParamsT; @@ -184,7 +190,7 @@ namespace caHitNtupletGenerator { unsigned long long nZeroTrackCells; }; - using Quality = pixelTrack::Quality; + using Quality = pixelTrackSoA::Quality; } // namespace caHitNtupletGenerator @@ -193,7 +199,7 @@ class CAHitNtupletGeneratorKernels { public: using Traits = TTraits; using TrackerTraits = TTTraits; - using QualityCuts = pixelTrack::QualityCutsT; + using QualityCuts = pixelTrackSoA::QualityCutsT; using Params = caHitNtupletGenerator::ParamsT; using CAParams = caHitNtupletGenerator::CAParamsT; using Counters = caHitNtupletGenerator::Counters; @@ -201,8 +207,9 @@ class CAHitNtupletGeneratorKernels { template using unique_ptr = typename Traits::template unique_ptr; - using HitsView = TrackingRecHit2DSOAViewT; - using HitsOnCPU = TrackingRecHit2DHeterogeneousT; + using HitsView = HitSoAView; + using HitsConstView = 
HitSoAConstView; + using TkSoAView = TrackSoAView; using HitToTuple = caStructures::HitToTupleT; using TupleMultiplicity = caStructures::TupleMultiplicityT; @@ -215,9 +222,8 @@ class CAHitNtupletGeneratorKernels { using CACell = GPUCACellT; - using Quality = pixelTrack::Quality; - using TkSoA = pixelTrack::TrackSoAT; - using HitContainer = pixelTrack::HitContainerT; + using Quality = pixelTrackSoA::Quality; + using HitContainer = typename trackSoA::HitContainer; CAHitNtupletGeneratorKernels(Params const& params) : params_(params), paramsMaxDoubletes3Quarters_(3 * params.cellCuts_.maxNumberOfDoublets_ / 4) {} @@ -226,11 +232,11 @@ class CAHitNtupletGeneratorKernels { TupleMultiplicity const* tupleMultiplicity() const { return device_tupleMultiplicity_.get(); } - void launchKernels(HitsOnCPU const& hh, TkSoA* tuples_d, cudaStream_t cudaStream); + void launchKernels(HitsConstView hh, TkSoAView track_view, cudaStream_t cudaStream); - void classifyTuples(HitsOnCPU const& hh, TkSoA* tuples_d, cudaStream_t cudaStream); + void classifyTuples(HitsConstView hh, TkSoAView track_view, cudaStream_t cudaStream); - void buildDoublets(HitsOnCPU const& hh, cudaStream_t stream); + void buildDoublets(HitsConstView hh, int32_t offsetBPIX2, cudaStream_t stream); void allocateOnGPU(int32_t nHits, cudaStream_t stream); void cleanup(cudaStream_t cudaStream); @@ -283,20 +289,24 @@ class CAHitNtupletGeneratorKernels { template class CAHitNtupletGeneratorKernelsGPU : public CAHitNtupletGeneratorKernels { using CAHitNtupletGeneratorKernels::CAHitNtupletGeneratorKernels; - using HitsOnCPU = TrackingRecHit2DHeterogeneousT; - using TkSoA = pixelTrack::TrackSoAT; + using Counters = caHitNtupletGenerator::Counters; - using HitContainer = pixelTrack::HitContainerT; + using CAParams = caHitNtupletGenerator::CAParamsT; + + using HitContainer = typename trackSoA::HitContainer; + using CellNeighborsVector = caStructures::CellNeighborsVectorT; using HitToTuple = caStructures::HitToTupleT; using 
CellTracksVector = caStructures::CellTracksVectorT; using TupleMultiplicity = caStructures::TupleMultiplicityT; - using CAParams = caHitNtupletGenerator::CAParamsT; + + using HitsConstView = HitSoAConstView; + using TkSoAView = TrackSoAView; public: - void launchKernels(HitsOnCPU const& hh, TkSoA* tuples_d, cudaStream_t cudaStream); - void classifyTuples(HitsOnCPU const& hh, TkSoA* tuples_d, cudaStream_t cudaStream); - void buildDoublets(HitsOnCPU const& hh, cudaStream_t stream); + void launchKernels(HitsConstView hh, TkSoAView track_view, cudaStream_t cudaStream); + void classifyTuples(HitsConstView hh, TkSoAView track_view, cudaStream_t cudaStream); + void buildDoublets(HitsConstView hh, int32_t offsetBPIX2, cudaStream_t stream); void allocateOnGPU(int32_t nHits, cudaStream_t stream); static void printCounters(Counters const* counters); }; @@ -304,19 +314,24 @@ class CAHitNtupletGeneratorKernelsGPU : public CAHitNtupletGeneratorKernels class CAHitNtupletGeneratorKernelsCPU : public CAHitNtupletGeneratorKernels { using CAHitNtupletGeneratorKernels::CAHitNtupletGeneratorKernels; - using HitsOnCPU = TrackingRecHit2DHeterogeneousT; - using TkSoA = pixelTrack::TrackSoAT; + using Counters = caHitNtupletGenerator::Counters; + using CAParams = caHitNtupletGenerator::CAParamsT; + + using HitContainer = typename trackSoA::HitContainer; + using CellNeighborsVector = caStructures::CellNeighborsVectorT; using HitToTuple = caStructures::HitToTupleT; using CellTracksVector = caStructures::CellTracksVectorT; using TupleMultiplicity = caStructures::TupleMultiplicityT; - using CAParams = caHitNtupletGenerator::CAParamsT; + + using HitsConstView = HitSoAConstView; + using TkSoAView = TrackSoAView; public: - void launchKernels(HitsOnCPU const& hh, TkSoA* tuples_d, cudaStream_t cudaStream); - void classifyTuples(HitsOnCPU const& hh, TkSoA* tuples_d, cudaStream_t cudaStream); - void buildDoublets(HitsOnCPU const& hh, cudaStream_t stream); + void launchKernels(HitsConstView hh, 
TkSoAView track_view, cudaStream_t cudaStream); + void classifyTuples(HitsConstView hh, TkSoAView track_view, cudaStream_t cudaStream); + void buildDoublets(HitsConstView hh, int32_t offsetBPIX2, cudaStream_t stream); void allocateOnGPU(int32_t nHits, cudaStream_t stream); static void printCounters(Counters const* counters); }; diff --git a/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorKernelsImpl.h b/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorKernelsImpl.h index 03112e0f3fc48..9eff89c59a138 100644 --- a/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorKernelsImpl.h +++ b/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorKernelsImpl.h @@ -15,6 +15,9 @@ #include "HeterogeneousCore/CUDAUtilities/interface/cuda_assert.h" #include "RecoLocalTracker/SiPixelRecHits/interface/pixelCPEforGPU.h" +#include "CUDADataFormats/Track/interface/PixelTrackUtilities.h" +#include "CUDADataFormats/TrackingRecHit/interface/TrackingRecHitsUtilities.h" + #include "CAStructures.h" #include "CAHitNtupletGeneratorKernels.h" #include "GPUCACell.h" @@ -28,8 +31,6 @@ namespace caHitNtupletGeneratorKernels { constexpr float nSigma2 = 25.f; //all of these below are mostly to avoid brining around the relative namespace - template - using HitsView = TrackingRecHit2DSOAViewT; template using HitToTuple = caStructures::HitToTupleT; @@ -46,19 +47,19 @@ namespace caHitNtupletGeneratorKernels { template using OuterHitOfCell = caStructures::OuterHitOfCellT; - using Quality = pixelTrack::Quality; + using Quality = pixelTrackSoA::Quality; template - using TkSoA = pixelTrack::TrackSoAT; + using TkSoAView = TrackSoAView; template - using HitContainer = pixelTrack::HitContainerT; + using HitContainer = typename trackSoA::HitContainer; template - using Hits = typename GPUCACellT::Hits; + using HitsConstView = typename GPUCACellT::HitsConstView; template - using QualityCuts = pixelTrack::QualityCutsT; + using QualityCuts = pixelTrackSoA::QualityCutsT; 
template using CAParams = caHitNtupletGenerator::CAParamsT; @@ -66,7 +67,7 @@ namespace caHitNtupletGeneratorKernels { using Counters = caHitNtupletGenerator::Counters; template - __global__ void kernel_checkOverflows(HitContainer const *foundNtuplets, + __global__ void kernel_checkOverflows(TkSoAView tracks_view, TupleMultiplicity const *tupleMultiplicity, HitToTuple const *hitToTuple, cms::cuda::AtomicPairCounter *apc, @@ -99,16 +100,16 @@ namespace caHitNtupletGeneratorKernels { nHits, hitToTuple->totOnes()); if (apc->get().m < TrackerTraits::maxNumberOfQuadruplets) { - assert(foundNtuplets->size(apc->get().m) == 0); - assert(foundNtuplets->size() == apc->get().n); + assert(tracks_view.hitIndices().size(apc->get().m) == 0); + assert(tracks_view.hitIndices().size() == apc->get().n); } } - for (int idx = first, nt = foundNtuplets->nOnes(); idx < nt; idx += gridDim.x * blockDim.x) { - if (foundNtuplets->size(idx) > TrackerTraits::maxHitsOnTrack) // current real limit - printf("ERROR %d, %d\n", idx, foundNtuplets->size(idx)); - assert(foundNtuplets->size(idx) <= TrackerTraits::maxHitsOnTrack); - for (auto ih = foundNtuplets->begin(idx); ih != foundNtuplets->end(idx); ++ih) + for (int idx = first, nt = tracks_view.hitIndices().nOnes(); idx < nt; idx += gridDim.x * blockDim.x) { + if (tracks_view.hitIndices().size(idx) > TrackerTraits::maxHitsOnTrack) // current real limit + printf("ERROR %d, %d\n", idx, tracks_view.hitIndices().size(idx)); + assert(tracks_view.hitIndices().size(idx) <= TrackerTraits::maxHitsOnTrack); + for (auto ih = tracks_view.hitIndices().begin(idx); ih != tracks_view.hitIndices().end(idx); ++ih) assert(int(*ih) < nHits); } #endif @@ -168,8 +169,8 @@ namespace caHitNtupletGeneratorKernels { template __global__ void kernel_fishboneCleaner(GPUCACellT const *cells, uint32_t const *__restrict__ nCells, - Quality *quality) { - constexpr auto reject =
pixelTrackSoA::Quality::dup; auto first = threadIdx.x + blockIdx.x * blockDim.x; for (int idx = first, nt = (*nCells); idx < nt; idx += gridDim.x * blockDim.x) { @@ -178,7 +179,7 @@ namespace caHitNtupletGeneratorKernels { continue; for (auto it : thisCell.tracks()) - quality[it] = reject; + tracks_view[it].quality() = reject; } } @@ -187,13 +188,10 @@ namespace caHitNtupletGeneratorKernels { template __global__ void kernel_earlyDuplicateRemover(GPUCACellT const *cells, uint32_t const *__restrict__ nCells, - TkSoA const *__restrict__ ptracks, - Quality *quality, + TkSoAView tracks_view, bool dupPassThrough) { // quality to mark rejected - constexpr auto reject = pixelTrack::Quality::edup; /// cannot be loose - - auto const &tracks = *ptracks; + constexpr auto reject = pixelTrackSoA::Quality::edup; /// cannot be loose assert(nCells); auto first = threadIdx.x + blockIdx.x * blockDim.x; @@ -207,7 +205,7 @@ namespace caHitNtupletGeneratorKernels { // find maxNl for (auto it : thisCell.tracks()) { - auto nl = tracks.nLayers(it); + auto nl = tracks_view[it].nLayers(); maxNl = std::max(nl, maxNl); } @@ -216,8 +214,8 @@ namespace caHitNtupletGeneratorKernels { // maxNl = std::min(4, maxNl); for (auto it : thisCell.tracks()) { - if (tracks.nLayers(it) < maxNl) - quality[it] = reject; //no race: simple assignment of the same constant + if (tracks_view[it].nLayers() < maxNl) + tracks_view[it].quality() = reject; //no race: simple assignment of the same constant } } } @@ -226,11 +224,11 @@ namespace caHitNtupletGeneratorKernels { template __global__ void kernel_fastDuplicateRemover(GPUCACellT const *__restrict__ cells, uint32_t const *__restrict__ nCells, - TkSoA *__restrict__ tracks, + TkSoAView tracks_view, bool dupPassThrough) { // quality to mark rejected - auto const reject = dupPassThrough ? pixelTrack::Quality::loose : pixelTrack::Quality::dup; - constexpr auto loose = pixelTrack::Quality::loose; + auto const reject = dupPassThrough ? 
pixelTrackSoA::Quality::loose : pixelTrackSoA::Quality::dup; + constexpr auto loose = pixelTrackSoA::Quality::loose; assert(nCells); @@ -243,45 +241,37 @@ namespace caHitNtupletGeneratorKernels { float mc = maxScore; uint16_t im = tkNotFound; - /* chi2 penalize higher-pt tracks (try rescale it?) - auto score = [&](auto it) { - return tracks->nLayers(it) < 4 ? - std::abs(tracks->tip(it)) : // tip for triplets - tracks->chi2(it); //chi2 for quads - }; - */ - - auto score = [&](auto it) { return std::abs(tracks->tip(it)); }; + auto score = [&](auto it) { return std::abs(tracksUtilities::tip(tracks_view, it)); }; // full crazy combinatorics // full crazy combinatorics int ntr = thisCell.tracks().size(); for (int i = 0; i < ntr - 1; ++i) { auto it = thisCell.tracks()[i]; - auto qi = tracks->quality(it); + auto qi = tracks_view[it].quality(); if (qi <= reject) continue; - auto opi = tracks->stateAtBS.state(it)(2); - auto e2opi = tracks->stateAtBS.covariance(it)(9); - auto cti = tracks->stateAtBS.state(it)(3); - auto e2cti = tracks->stateAtBS.covariance(it)(12); + auto opi = tracks_view[it].state()(2); + auto e2opi = tracks_view[it].covariance()(9); + auto cti = tracks_view[it].state()(3); + auto e2cti = tracks_view[it].covariance()(12); for (auto j = i + 1; j < ntr; ++j) { auto jt = thisCell.tracks()[j]; - auto qj = tracks->quality(jt); + auto qj = tracks_view[jt].quality(); if (qj <= reject) continue; - auto opj = tracks->stateAtBS.state(jt)(2); - auto ctj = tracks->stateAtBS.state(jt)(3); - auto dct = nSigma2 * (tracks->stateAtBS.covariance(jt)(12) + e2cti); + auto opj = tracks_view[jt].state()(2); + auto ctj = tracks_view[jt].state()(3); + auto dct = nSigma2 * (tracks_view[jt].covariance()(12) + e2cti); if ((cti - ctj) * (cti - ctj) > dct) continue; - auto dop = nSigma2 * (tracks->stateAtBS.covariance(jt)(9) + e2opi); + auto dop = nSigma2 * (tracks_view[jt].covariance()(9) + e2opi); if ((opi - opj) * (opi - opj) > dop) continue; if ((qj < qi) || (qj == qi && score(it) 
< score(jt))) - tracks->quality(jt) = reject; + tracks_view[jt].quality() = reject; else { - tracks->quality(it) = reject; + tracks_view[it].quality() = reject; break; } } @@ -290,8 +280,8 @@ namespace caHitNtupletGeneratorKernels { // find maxQual auto maxQual = reject; // no duplicate! for (auto it : thisCell.tracks()) { - if (tracks->quality(it) > maxQual) - maxQual = tracks->quality(it); + if (tracks_view[it].quality() > maxQual) + maxQual = tracks_view[it].quality(); } if (maxQual <= loose) @@ -299,7 +289,7 @@ namespace caHitNtupletGeneratorKernels { // find min score for (auto it : thisCell.tracks()) { - if (tracks->quality(it) == maxQual && score(it) < mc) { + if (tracks_view[it].quality() == maxQual && score(it) < mc) { mc = score(it); im = it; } @@ -310,8 +300,8 @@ namespace caHitNtupletGeneratorKernels { // mark all other duplicates (not yet, keep it loose) for (auto it : thisCell.tracks()) { - if (tracks->quality(it) > loose && it != im) - tracks->quality(it) = loose; //no race: simple assignment of the same constant + if (tracks_view[it].quality() > loose && it != im) + tracks_view[it].quality() = loose; //no race: simple assignment of the same constant } } } @@ -319,14 +309,13 @@ namespace caHitNtupletGeneratorKernels { template __global__ void kernel_connect(cms::cuda::AtomicPairCounter *apc1, cms::cuda::AtomicPairCounter *apc2, // just to zero them, - Hits const *__restrict__ hhp, + HitsConstView hh, GPUCACellT *cells, uint32_t const *__restrict__ nCells, CellNeighborsVector *cellNeighbors, OuterHitOfCell const isOuterHitOfCell, CAParams params) { using Cell = GPUCACellT; - auto const &hh = *hhp; auto firstCellIndex = threadIdx.y + blockIdx.y * blockDim.y; auto first = threadIdx.x; @@ -383,16 +372,14 @@ namespace caHitNtupletGeneratorKernels { } template - __global__ void kernel_find_ntuplets(Hits const *__restrict__ hhp, + __global__ void kernel_find_ntuplets(HitsConstView hh, + TkSoAView tracks_view, GPUCACellT *__restrict__ cells, uint32_t const 
*nCells, CellTracksVector *cellTracks, - HitContainer *foundNtuplets, cms::cuda::AtomicPairCounter *apc, - Quality *__restrict__ quality, CAParams params) { // recursive: not obvious to widen - auto const &hh = *hhp; using Cell = GPUCACellT; @@ -423,7 +410,7 @@ namespace caHitNtupletGeneratorKernels { bool bpix1Start = params.startAt0(pid); thisCell.template find_ntuplets( - hh, cells, *cellTracks, *foundNtuplets, *apc, quality, stack, params.minHitsPerNtuplet_, bpix1Start); + hh, cells, *cellTracks, tracks_view.hitIndices(), *apc, tracks_view.quality(), stack, params.minHitsPerNtuplet_, bpix1Start); assert(stack.empty()); } @@ -441,17 +428,16 @@ namespace caHitNtupletGeneratorKernels { } template - __global__ void kernel_countMultiplicity(HitContainer const *__restrict__ foundNtuplets, - Quality const *__restrict__ quality, + __global__ void kernel_countMultiplicity(TkSoAView tracks_view, TupleMultiplicity *tupleMultiplicity) { auto first = blockIdx.x * blockDim.x + threadIdx.x; - for (int it = first, nt = foundNtuplets->nOnes(); it < nt; it += gridDim.x * blockDim.x) { - auto nhits = foundNtuplets->size(it); + for (int it = first, nt = tracks_view.hitIndices().nOnes(); it < nt; it += gridDim.x * blockDim.x) { + auto nhits = tracks_view.hitIndices().size(it); if (nhits < 3) continue; - if (quality[it] == pixelTrack::Quality::edup) + if (tracks_view[it].quality() == pixelTrackSoA::Quality::edup) continue; - assert(quality[it] == pixelTrack::Quality::bad); + assert(tracks_view[it].quality() == pixelTrackSoA::Quality::bad); if (nhits > TrackerTraits::maxHitsOnTrack) // current limit printf("wrong mult %d %d\n", it, nhits); assert(nhits <= TrackerTraits::maxHitsOnTrack); @@ -460,17 +446,16 @@ namespace caHitNtupletGeneratorKernels { } template - __global__ void kernel_fillMultiplicity(HitContainer const *__restrict__ foundNtuplets, - Quality const *__restrict__ quality, + __global__ void kernel_fillMultiplicity(TkSoAView tracks_view, TupleMultiplicity 
*tupleMultiplicity) { auto first = blockIdx.x * blockDim.x + threadIdx.x; - for (int it = first, nt = foundNtuplets->nOnes(); it < nt; it += gridDim.x * blockDim.x) { - auto nhits = foundNtuplets->size(it); + for (int it = first, nt = tracks_view.hitIndices().nOnes(); it < nt; it += gridDim.x * blockDim.x) { + auto nhits = tracks_view.hitIndices().size(it); if (nhits < 3) continue; - if (quality[it] == pixelTrack::Quality::edup) + if (tracks_view[it].quality() == pixelTrackSoA::Quality::edup) continue; - assert(quality[it] == pixelTrack::Quality::bad); + assert(tracks_view[it].quality() == pixelTrackSoA::Quality::bad); if (nhits > TrackerTraits::maxHitsOnTrack) printf("wrong mult %d %d\n", it, nhits); assert(nhits <= TrackerTraits::maxHitsOnTrack); @@ -478,22 +463,22 @@ namespace caHitNtupletGeneratorKernels { } } + ///TODO : why there was quality here? template - __global__ void kernel_classifyTracks(HitContainer const *__restrict__ tuples, - TkSoA const *__restrict__ tracks, - QualityCuts cuts, - Quality *__restrict__ quality) { + __global__ void kernel_classifyTracks(TkSoAView tracks_view, + QualityCuts cuts) { + // Quality *__restrict__ quality) { int first = blockDim.x * blockIdx.x + threadIdx.x; - for (int it = first, nt = tuples->nOnes(); it < nt; it += gridDim.x * blockDim.x) { - auto nhits = tuples->size(it); + for (int it = first, nt = tracks_view.hitIndices().nOnes(); it < nt; it += gridDim.x * blockDim.x) { + auto nhits = tracks_view.hitIndices().size(it); if (nhits == 0) break; // guard // if duplicate: not even fit - if (quality[it] == pixelTrack::Quality::edup) + if (tracks_view[it].quality() == pixelTrackSoA::Quality::edup) continue; - assert(quality[it] == pixelTrack::Quality::bad); + assert(tracks_view[it].quality() == pixelTrackSoA::Quality::bad); // mark doublets as bad if (nhits < 3) @@ -502,101 +487,102 @@ namespace caHitNtupletGeneratorKernels { // if the fit has any invalid parameters, mark it as bad bool isNaN = false; for (int i = 0; i < 
5; ++i) { - isNaN |= std::isnan(tracks->stateAtBS.state(it)(i)); + isNaN |= std::isnan(tracks_view[it].state()(i)); } if (isNaN) { #ifdef NTUPLE_DEBUG - printf("NaN in fit %d size %d chi2 %f\n", it, tuples->size(it), tracks->chi2(it)); + printf("NaN in fit %d size %d chi2 %f\n", it, tracks_view.hitIndices().size(it), tracks_view[it].chi2()); #endif continue; } - quality[it] = pixelTrack::Quality::strict; + tracks_view[it].quality() = pixelTrackSoA::Quality::strict; - if (cuts.strictCut(tracks, it)) + if (cuts.strictCut(tracks_view, it)) continue; - quality[it] = pixelTrack::Quality::tight; + tracks_view[it].quality() = pixelTrackSoA::Quality::tight; - if (cuts.isHP(tracks, nhits, it)) - quality[it] = pixelTrack::Quality::highPurity; + if (cuts.isHP(tracks_view, nhits, it)) + tracks_view[it].quality() = pixelTrackSoA::Quality::highPurity; } } template - __global__ void kernel_doStatsForTracks(HitContainer const *__restrict__ tuples, - Quality const *__restrict__ quality, + __global__ void kernel_doStatsForTracks(TkSoAView tracks_view, Counters *counters) { int first = blockDim.x * blockIdx.x + threadIdx.x; - for (int idx = first, ntot = tuples->nOnes(); idx < ntot; idx += gridDim.x * blockDim.x) { - if (tuples->size(idx) == 0) + for (int idx = first, ntot = tracks_view.hitIndices().nOnes(); idx < ntot; idx += gridDim.x * blockDim.x) { + if (tracks_view.hitIndices().size(idx) == 0) break; //guard - if (quality[idx] < pixelTrack::Quality::loose) + if (tracks_view[idx].quality() < pixelTrackSoA::Quality::loose) continue; atomicAdd(&(counters->nLooseTracks), 1); - if (quality[idx] < pixelTrack::Quality::strict) + if (tracks_view[idx].quality() < pixelTrackSoA::Quality::strict) continue; atomicAdd(&(counters->nGoodTracks), 1); } } template - __global__ void kernel_countHitInTracks(HitContainer const *__restrict__ tuples, - Quality const *__restrict__ quality, + __global__ void kernel_countHitInTracks(TkSoAView tracks_view, HitToTuple *hitToTuple) { int first = blockDim.x 
* blockIdx.x + threadIdx.x; - for (int idx = first, ntot = tuples->nOnes(); idx < ntot; idx += gridDim.x * blockDim.x) { - if (tuples->size(idx) == 0) + for (int idx = first, ntot = tracks_view.hitIndices().nOnes(); idx < ntot; idx += gridDim.x * blockDim.x) { + if (tracks_view.hitIndices().size(idx) == 0) break; // guard - for (auto h = tuples->begin(idx); h != tuples->end(idx); ++h) + for (auto h = tracks_view.hitIndices().begin(idx); h != tracks_view.hitIndices().end(idx); ++h) hitToTuple->count(*h); } } template - __global__ void kernel_fillHitInTracks(HitContainer const *__restrict__ tuples, - Quality const *__restrict__ quality, + __global__ void kernel_fillHitInTracks(TkSoAView tracks_view, HitToTuple *hitToTuple) { int first = blockDim.x * blockIdx.x + threadIdx.x; - for (int idx = first, ntot = tuples->nOnes(); idx < ntot; idx += gridDim.x * blockDim.x) { - if (tuples->size(idx) == 0) + for (int idx = first, ntot = tracks_view.hitIndices().nOnes(); idx < ntot; idx += gridDim.x * blockDim.x) { + if (tracks_view.hitIndices().size(idx) == 0) break; // guard - for (auto h = tuples->begin(idx); h != tuples->end(idx); ++h) + for (auto h = tracks_view.hitIndices().begin(idx); h != tracks_view.hitIndices().end(idx); ++h) hitToTuple->fill(*h, idx); } } template - __global__ void kernel_fillHitDetIndices(HitContainer const *__restrict__ tuples, - HitsView const *__restrict__ hhp, - HitContainer *__restrict__ hitDetIndices) { + __global__ void kernel_fillHitDetIndices(TkSoAView tracks_view, + HitsConstView hh) { int first = blockDim.x * blockIdx.x + threadIdx.x; // copy offsets - for (int idx = first, ntot = tuples->totOnes(); idx < ntot; idx += gridDim.x * blockDim.x) { - hitDetIndices->off[idx] = tuples->off[idx]; + for (int idx = first, ntot = tracks_view.hitIndices().totOnes(); idx < ntot; idx += gridDim.x * blockDim.x) { + tracks_view.detIndices().off[idx] = tracks_view.hitIndices().off[idx]; } // fill hit indices - auto const &hh = *hhp; auto nhits = 
hh.nHits(); - for (int idx = first, ntot = tuples->size(); idx < ntot; idx += gridDim.x * blockDim.x) { - assert(tuples->content[idx] < nhits); - hitDetIndices->content[idx] = hh.detectorIndex(tuples->content[idx]); + for (int idx = first, ntot = tracks_view.hitIndices().size(); idx < ntot; idx += gridDim.x * blockDim.x) { + assert(tracks_view.hitIndices().content[idx] < nhits); + tracks_view.detIndices().content[idx] = hh[tracks_view.hitIndices().content[idx]].detectorIndex(); } } + /* + Needs both TkSoA and TkSoAView for accessing SoA, computeNumberOfLayers(), nHits(), stride() + + TODO(review): confirm whether the owning TkSoA is still required here, or whether the View alone suffices + */ + template - __global__ void kernel_fillNLayers(TkSoA *__restrict__ ptracks, cms::cuda::AtomicPairCounter *apc) { - auto &tracks = *ptracks; + __global__ void kernel_fillNLayers(TkSoAView tracks_view, cms::cuda::AtomicPairCounter *apc) { + auto first = blockIdx.x * blockDim.x + threadIdx.x; // clamp the number of tracks to the capacity of the SoA - auto ntracks = std::min(apc->get().m, tracks.stride() - 1); + auto ntracks = std::min(apc->get().m, tracks_view.metadata().size() - 1); if (0 == first) - tracks.setNTracks(ntracks); + tracks_view.nTracks() = ntracks; for (int idx = first, nt = ntracks; idx < nt; idx += gridDim.x * blockDim.x) { - auto nHits = tracks.nHits(idx); + auto nHits = tracksUtilities::nHits(tracks_view, idx); assert(nHits >= 3); - tracks.nLayers(idx) = tracks.computeNumberOfLayers(idx); + tracks_view[idx].nLayers() = tracksUtilities::computeNumberOfLayers(tracks_view, idx); } } @@ -619,7 +605,7 @@ namespace caHitNtupletGeneratorKernels { HitContainer const *__restrict__ ptuples, Quality const *__restrict__ quality, HitToTuple const *__restrict__ phitToTuple) { - constexpr auto loose = pixelTrack::Quality::loose; + constexpr auto loose = pixelTrackSoA::Quality::loose; auto &hitToTuple = *phitToTuple; auto const &foundNtuplets = *ptuples; @@ -656,10 +642,10 @@ namespace caHitNtupletGeneratorKernels { HitContainer const *__restrict__ tuples, Quality 
*__restrict__ quality, bool dupPassThrough) { - // constexpr auto bad = pixelTrack::Quality::bad; - constexpr auto dup = pixelTrack::Quality::dup; - constexpr auto loose = pixelTrack::Quality::loose; - // constexpr auto strict = pixelTrack::Quality::strict; + // constexpr auto bad = pixelTrackSoA::Quality::bad; + constexpr auto dup = pixelTrackSoA::Quality::dup; + constexpr auto loose = pixelTrackSoA::Quality::loose; + // constexpr auto strict = pixelTrackSoA::Quality::strict; // quality to mark rejected auto const reject = dupPassThrough ? loose : dup; @@ -677,59 +663,51 @@ namespace caHitNtupletGeneratorKernels { // mostly for very forward triplets..... template - __global__ void kernel_rejectDuplicate(TkSoA const *__restrict__ ptracks, - Quality *__restrict__ quality, + __global__ void kernel_rejectDuplicate(TkSoAView tracks_view, uint16_t nmin, bool dupPassThrough, HitToTuple const *__restrict__ phitToTuple) { // quality to mark rejected - auto const reject = dupPassThrough ? pixelTrack::Quality::loose : pixelTrack::Quality::dup; + auto const reject = dupPassThrough ? pixelTrackSoA::Quality::loose : pixelTrackSoA::Quality::dup; auto &hitToTuple = *phitToTuple; - auto const &tracks = *ptracks; int first = blockDim.x * blockIdx.x + threadIdx.x; for (int idx = first, ntot = hitToTuple.nOnes(); idx < ntot; idx += gridDim.x * blockDim.x) { if (hitToTuple.size(idx) < 2) continue; - /* chi2 is bad for large pt - auto score = [&](auto it, auto nl) { - return nl < 4 ? 
std::abs(tracks.tip(it)) : // tip for triplets - tracks.chi2(it); //chi2 - }; - */ - auto score = [&](auto it, auto nl) { return std::abs(tracks.tip(it)); }; + auto score = [&](auto it, auto nl) { return std::abs(tracksUtilities::tip(tracks_view, it)); }; // full combinatorics for (auto ip = hitToTuple.begin(idx); ip < hitToTuple.end(idx) - 1; ++ip) { auto const it = *ip; - auto qi = quality[it]; + auto qi = tracks_view[it].quality(); if (qi <= reject) continue; - auto opi = tracks.stateAtBS.state(it)(2); - auto e2opi = tracks.stateAtBS.covariance(it)(9); - auto cti = tracks.stateAtBS.state(it)(3); - auto e2cti = tracks.stateAtBS.covariance(it)(12); - auto nli = tracks.nLayers(it); + auto opi = tracks_view[it].state()(2); + auto e2opi = tracks_view[it].covariance()(9); + auto cti = tracks_view[it].state()(3); + auto e2cti = tracks_view[it].covariance()(12); + auto nli = tracks_view[it].nLayers(); for (auto jp = ip + 1; jp < hitToTuple.end(idx); ++jp) { auto const jt = *jp; - auto qj = quality[jt]; + auto qj = tracks_view[jt].quality(); if (qj <= reject) continue; - auto opj = tracks.stateAtBS.state(jt)(2); - auto ctj = tracks.stateAtBS.state(jt)(3); - auto dct = nSigma2 * (tracks.stateAtBS.covariance(jt)(12) + e2cti); + auto opj = tracks_view[jt].state()(2); + auto ctj = tracks_view[jt].state()(3); + auto dct = nSigma2 * (tracks_view[jt].covariance()(12) + e2cti); if ((cti - ctj) * (cti - ctj) > dct) continue; - auto dop = nSigma2 * (tracks.stateAtBS.covariance(jt)(9) + e2opi); + auto dop = nSigma2 * (tracks_view[jt].covariance()(9) + e2opi); if ((opi - opj) * (opi - opj) > dop) continue; - auto nlj = tracks.nLayers(jt); + auto nlj = tracks_view[jt].nLayers(); if (nlj < nli || (nlj == nli && (qj < qi || (qj == qi && score(it, nli) < score(jt, nlj))))) - quality[jt] = reject; + tracks_view[jt].quality() = reject; else { - quality[it] = reject; + tracks_view[it].quality() = reject; break; } } @@ -738,21 +716,18 @@ namespace caHitNtupletGeneratorKernels { } template - 
__global__ void kernel_sharedHitCleaner(HitsView const *__restrict__ hhp, - TkSoA const *__restrict__ ptracks, - Quality *__restrict__ quality, + __global__ void kernel_sharedHitCleaner(HitsConstView hh, + TkSoAView tracks_view, int nmin, bool dupPassThrough, HitToTuple const *__restrict__ phitToTuple) { // quality to mark rejected - auto const reject = dupPassThrough ? pixelTrack::Quality::loose : pixelTrack::Quality::dup; + auto const reject = dupPassThrough ? pixelTrackSoA::Quality::loose : pixelTrackSoA::Quality::dup; // quality of longest track - auto const longTqual = pixelTrack::Quality::highPurity; + auto const longTqual = pixelTrackSoA::Quality::highPurity; auto &hitToTuple = *phitToTuple; - auto const &tracks = *ptracks; - auto const &hh = *hhp; int l1end = hh.hitsLayerStart()[1]; int first = blockDim.x * blockIdx.x + threadIdx.x; @@ -764,10 +739,10 @@ namespace caHitNtupletGeneratorKernels { // find maxNl for (auto it = hitToTuple.begin(idx); it != hitToTuple.end(idx); ++it) { - if (quality[*it] < longTqual) + if (tracks_view[*it].quality() < longTqual) continue; - // if (tracks.nHits(*it)==3) continue; - auto nl = tracks.nLayers(*it); + // if (tracks_view[*it].nHits()==3) continue; + auto nl = tracks_view[*it].nLayers(); maxNl = std::max(nl, maxNl); } @@ -779,31 +754,29 @@ namespace caHitNtupletGeneratorKernels { // kill all tracks shorter than maxHl (only triplets??? 
for (auto it = hitToTuple.begin(idx); it != hitToTuple.end(idx); ++it) { - auto nl = tracks.nLayers(*it); + auto nl = tracks_view[*it].nLayers(); //checking if shared hit is on bpix1 and if the tuple is short enough if (idx < l1end and nl > nmin) continue; - if (nl < maxNl && quality[*it] > reject) - quality[*it] = reject; + if (nl < maxNl && tracks_view[*it].quality() > reject) + tracks_view[*it].quality() = reject; } } } template - __global__ void kernel_tripletCleaner(TkSoA const *__restrict__ ptracks, - Quality *__restrict__ quality, + __global__ void kernel_tripletCleaner(TkSoAView tracks_view, uint16_t nmin, bool dupPassThrough, HitToTuple const *__restrict__ phitToTuple) { // quality to mark rejected - auto const reject = pixelTrack::Quality::loose; + auto const reject = pixelTrackSoA::Quality::loose; /// min quality of good - auto const good = pixelTrack::Quality::strict; + auto const good = pixelTrackSoA::Quality::strict; auto &hitToTuple = *phitToTuple; - auto const &tracks = *ptracks; int first = blockDim.x * blockIdx.x + threadIdx.x; for (int idx = first, ntot = hitToTuple.nOnes(); idx < ntot; idx += gridDim.x * blockDim.x) { @@ -816,9 +789,9 @@ namespace caHitNtupletGeneratorKernels { // check if only triplets for (auto it = hitToTuple.begin(idx); it != hitToTuple.end(idx); ++it) { - if (quality[*it] <= good) + if (tracks_view[*it].quality() <= good) continue; - onlyTriplets &= tracks.isTriplet(*it); + onlyTriplets &= tracksUtilities::isTriplet(tracks_view, *it); if (!onlyTriplets) break; } @@ -830,8 +803,8 @@ namespace caHitNtupletGeneratorKernels { // for triplets choose best tip! (should we first find best quality???) 
for (auto ip = hitToTuple.begin(idx); ip != hitToTuple.end(idx); ++ip) { auto const it = *ip; - if (quality[it] >= good && std::abs(tracks.tip(it)) < mc) { - mc = std::abs(tracks.tip(it)); + if (tracks_view[it].quality() >= good && std::abs(tracksUtilities::tip(tracks_view, it)) < mc) { + mc = std::abs(tracksUtilities::tip(tracks_view, it)); im = it; } } @@ -842,26 +815,24 @@ namespace caHitNtupletGeneratorKernels { // mark worse ambiguities for (auto ip = hitToTuple.begin(idx); ip != hitToTuple.end(idx); ++ip) { auto const it = *ip; - if (quality[it] > reject && it != im) - quality[it] = reject; //no race: simple assignment of the same constant + if (tracks_view[it].quality() > reject && it != im) + tracks_view[it].quality() = reject; //no race: simple assignment of the same constant } } // loop over hits } template - __global__ void kernel_simpleTripletCleaner(TkSoA const *__restrict__ ptracks, - Quality *__restrict__ quality, + __global__ void kernel_simpleTripletCleaner(TkSoAView tracks_view, uint16_t nmin, bool dupPassThrough, HitToTuple const *__restrict__ phitToTuple) { // quality to mark rejected - auto const reject = pixelTrack::Quality::loose; + auto const reject = pixelTrackSoA::Quality::loose; /// min quality of good - auto const good = pixelTrack::Quality::loose; + auto const good = pixelTrackSoA::Quality::loose; auto &hitToTuple = *phitToTuple; - auto const &tracks = *ptracks; int first = blockDim.x * blockIdx.x + threadIdx.x; for (int idx = first, ntot = hitToTuple.nOnes(); idx < ntot; idx += gridDim.x * blockDim.x) { @@ -874,8 +845,8 @@ namespace caHitNtupletGeneratorKernels { // choose best tip! (should we first find best quality???) 
for (auto ip = hitToTuple.begin(idx); ip != hitToTuple.end(idx); ++ip) { auto const it = *ip; - if (quality[it] >= good && std::abs(tracks.tip(it)) < mc) { - mc = std::abs(tracks.tip(it)); + if (tracks_view[it].quality() >= good && std::abs(tracksUtilities::tip(tracks_view, it)) < mc) { + mc = std::abs(tracksUtilities::tip(tracks_view, it)); im = it; } } @@ -886,53 +857,48 @@ namespace caHitNtupletGeneratorKernels { // mark worse ambiguities for (auto ip = hitToTuple.begin(idx); ip != hitToTuple.end(idx); ++ip) { auto const it = *ip; - if (quality[it] > reject && tracks.isTriplet(it) && it != im) - quality[it] = reject; //no race: simple assignment of the same constant + if (tracks_view[it].quality() > reject && tracksUtilities::isTriplet(tracks_view, it) && it != im) + tracks_view[it].quality() = reject; //no race: simple assignment of the same constant } } // loop over hits } template - __global__ void kernel_print_found_ntuplets(HitsView const *__restrict__ hhp, - HitContainer const *__restrict__ ptuples, - TkSoA const *__restrict__ ptracks, - Quality const *__restrict__ quality, + __global__ void kernel_print_found_ntuplets(HitsConstView hh, + TkSoAView tracks_view, HitToTuple const *__restrict__ phitToTuple, int32_t firstPrint, int32_t lastPrint, int iev) { - constexpr auto loose = pixelTrack::Quality::loose; - auto const &hh = *hhp; - auto const &foundNtuplets = *ptuples; - auto const &tracks = *ptracks; + constexpr auto loose = pixelTrackSoA::Quality::loose; + int first = firstPrint + blockDim.x * blockIdx.x + threadIdx.x; - for (int i = first, np = std::min(lastPrint, foundNtuplets.nOnes()); i < np; i += blockDim.x * gridDim.x) { - auto nh = foundNtuplets.size(i); + for (int i = first, np = std::min(lastPrint, tracks_view.hitIndices().nOnes()); i < np; i += blockDim.x * gridDim.x) { + auto nh = tracks_view.hitIndices().size(i); if (nh < 3) continue; - if (quality[i] < loose) + if (tracks_view[i].quality() < loose) continue; printf("TK: %d %d %d %d %f %f %f 
%f %f %f %f %.3f %.3f %.3f %.3f %.3f %.3f %.3f\n", 10000 * iev + i, - int(quality[i]), + int(tracks_view[i].quality()), nh, - tracks.nLayers(i), - tracks.charge(i), - tracks.pt(i), - tracks.eta(i), - tracks.phi(i), - tracks.tip(i), - tracks.zip(i), - // asinhf(fit_results[i].par(3)), - tracks.chi2(i), - hh.zGlobal(*foundNtuplets.begin(i)), - hh.zGlobal(*(foundNtuplets.begin(i) + 1)), - hh.zGlobal(*(foundNtuplets.begin(i) + 2)), - nh > 3 ? hh.zGlobal(int(*(foundNtuplets.begin(i) + 3))) : 0, - nh > 4 ? hh.zGlobal(int(*(foundNtuplets.begin(i) + 4))) : 0, - nh > 5 ? hh.zGlobal(int(*(foundNtuplets.begin(i) + 5))) : 0, - nh > 6 ? hh.zGlobal(int(*(foundNtuplets.begin(i) + nh - 1))) : 0); + tracks_view[i].nLayers(), + tracksUtilities::charge(tracks_view, i), + tracks_view[i].pt(), + tracks_view[i].eta(), + tracksUtilities::phi(tracks_view, i), + tracksUtilities::tip(tracks_view, i), + tracksUtilities::zip(tracks_view, i), + tracks_view[i].chi2(), + hh[*tracks_view.hitIndices().begin(i)].zGlobal(), + hh[*(tracks_view.hitIndices().begin(i) + 1)].zGlobal(), + hh[*(tracks_view.hitIndices().begin(i) + 2)].zGlobal(), + nh > 3 ? hh[int(*(tracks_view.hitIndices().begin(i) + 3))].zGlobal() : 0, + nh > 4 ? hh[int(*(tracks_view.hitIndices().begin(i) + 4))].zGlobal() : 0, + nh > 5 ? hh[int(*(tracks_view.hitIndices().begin(i) + 5))].zGlobal() : 0, + nh > 6 ? 
hh[int(*(tracks_view.hitIndices().begin(i) + nh - 1))].zGlobal() : 0); } } diff --git a/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorOnGPU.cc b/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorOnGPU.cc index 6d9ac785155d2..44c9be436bf23 100644 --- a/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorOnGPU.cc +++ b/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorOnGPU.cc @@ -21,6 +21,12 @@ #include "HeterogeneousCore/CUDAServices/interface/CUDAService.h" #include "TrackingTools/DetLayers/interface/BarrelDetLayer.h" +#include "CUDADataFormats/Track/interface/TrackSoAHeterogeneousHost.h" +#include "CUDADataFormats/Track/interface/TrackSoAHeterogeneousDevice.h" + +#include "CUDADataFormats/TrackingRecHit/interface/TrackingRecHitSoAHost.h" +#include "CUDADataFormats/TrackingRecHit/interface/TrackingRecHitSoADevice.h" + #include "CAHitNtupletGeneratorOnGPU.h" namespace { @@ -66,12 +72,12 @@ namespace { (float)cfg.getParameter("dcaCutOuterTriplet")}}; }; - static constexpr QualityCutsT makeQualityCuts(edm::ParameterSet const& pset) { + static constexpr pixelTrackSoA::QualityCutsT makeQualityCuts(edm::ParameterSet const& pset) { auto coeff = pset.getParameter>("chi2Coeff"); auto ptMax = pset.getParameter("chi2MaxPt"); coeff[1] = (coeff[1] - coeff[0]) / log2(ptMax); - return QualityCutsT{// polynomial coefficients for the pT-dependent chi2 cut + return pixelTrackSoA::QualityCutsT{// polynomial coefficients for the pT-dependent chi2 cut {(float)coeff[0], (float)coeff[1], 0.f, 0.f}, // max pT used to determine the chi2 cut (float)ptMax, @@ -101,8 +107,8 @@ namespace { {(bool)cfg.getParameter("includeFarForwards")}}; } - static constexpr QualityCutsT makeQualityCuts(edm::ParameterSet const& pset) { - return QualityCutsT{ + static constexpr pixelTrackSoA::QualityCutsT makeQualityCuts(edm::ParameterSet const& pset) { + return pixelTrackSoA::QualityCutsT{ (float)pset.getParameter("maxChi2"), (float)pset.getParameter("minPt"), 
(float)pset.getParameter("maxTip"), @@ -274,16 +280,13 @@ void CAHitNtupletGeneratorOnGPU::endJob() { } template -PixelTrackHeterogeneousT CAHitNtupletGeneratorOnGPU::makeTuplesAsync( +TrackSoAHeterogeneousDevice CAHitNtupletGeneratorOnGPU::makeTuplesAsync( HitsOnGPU const& hits_d, float bfield, cudaStream_t stream) const { using HelixFitOnGPU = HelixFitOnGPU; - using PixelTrackHeterogeneous = PixelTrackHeterogeneousT; + using TrackSoA = TrackSoAHeterogeneousDevice; using GPUKernels = CAHitNtupletGeneratorKernelsGPU; - PixelTrackHeterogeneous tracks(cms::cuda::make_device_unique(stream)); - - auto* soa = tracks.get(); - assert(soa); + TrackSoA tracks(stream); cudaCheck(cudaGetLastError()); GPUKernels kernels(m_params); @@ -291,20 +294,21 @@ PixelTrackHeterogeneousT CAHitNtupletGeneratorOnGPUhitIndices), kernels.tupleMultiplicity(), soa); + fitter.allocateOnGPU(kernels.tupleMultiplicity(), tracks.view()); if (m_params.useRiemannFit_) { fitter.launchRiemannKernels(hits_d.view(), hits_d.nHits(), TrackerTraits::maxNumberOfQuadruplets, stream); } else { fitter.launchBrokenLineKernels(hits_d.view(), hits_d.nHits(), TrackerTraits::maxNumberOfQuadruplets, stream); } - kernels.classifyTuples(hits_d, soa, stream); + kernels.classifyTuples(hits_d.view(), tracks.view(), stream); #ifdef GPU_DEBUG cudaDeviceSynchronize(); cudaCheck(cudaGetLastError()); @@ -315,47 +319,43 @@ PixelTrackHeterogeneousT CAHitNtupletGeneratorOnGPU -PixelTrackHeterogeneousT CAHitNtupletGeneratorOnGPU::makeTuples(HitsOnCPU const& hits_d, +TrackSoAHeterogeneousHost CAHitNtupletGeneratorOnGPU::makeTuples(HitsOnCPU const& hits_h, float bfield) const { using HelixFitOnGPU = HelixFitOnGPU; - using PixelTrackHeterogeneous = PixelTrackHeterogeneousT; + using TrackSoA = TrackSoAHeterogeneousHost; using CPUKernels = CAHitNtupletGeneratorKernelsCPU; - PixelTrackHeterogeneous tracks(std::make_unique()); - - auto* soa = tracks.get(); - assert(soa); + TrackSoA tracks(nullptr); CPUKernels kernels(m_params); 
kernels.setCounters(m_counters); - kernels.allocateOnGPU(hits_d.nHits(), nullptr); + kernels.allocateOnGPU(hits_h.nHits(), nullptr); - kernels.buildDoublets(hits_d, nullptr); - kernels.launchKernels(hits_d, soa, nullptr); + kernels.buildDoublets(hits_h.view(), hits_h.offsetBPIX2(), nullptr); + kernels.launchKernels(hits_h.view(), tracks.view(), nullptr); - if (0 == hits_d.nHits()) + if (0 == hits_h.nHits()) return tracks; // now fit HelixFitOnGPU fitter(bfield, m_params.fitNas4_); - fitter.allocateOnGPU(&(soa->hitIndices), kernels.tupleMultiplicity(), soa); + fitter.allocateOnGPU(kernels.tupleMultiplicity(), tracks.view()); if (m_params.useRiemannFit_) { - fitter.launchRiemannKernelsOnCPU(hits_d.view(), hits_d.nHits(), TrackerTraits::maxNumberOfQuadruplets); + fitter.launchRiemannKernelsOnCPU(hits_h.view(), hits_h.nHits(), TrackerTraits::maxNumberOfQuadruplets); } else { - fitter.launchBrokenLineKernelsOnCPU(hits_d.view(), hits_d.nHits(), TrackerTraits::maxNumberOfQuadruplets); + fitter.launchBrokenLineKernelsOnCPU(hits_h.view(), hits_h.nHits(), TrackerTraits::maxNumberOfQuadruplets); } - kernels.classifyTuples(hits_d, soa, nullptr); + kernels.classifyTuples(hits_h.view(), tracks.view(), nullptr); #ifdef GPU_DEBUG std::cout << "finished building pixel tracks on CPU" << std::endl; #endif // check that the fixed-size SoA does not overflow - auto const& tsoa = *soa; - auto maxTracks = tsoa.stride(); - auto nTracks = tsoa.nTracks(); + auto maxTracks = tracks.view().metadata().size(); + auto nTracks = tracks.view().nTracks(); assert(nTracks < maxTracks); if (nTracks == maxTracks - 1) { edm::LogWarning("PixelTracks") << "Unsorted reconstructed pixel tracks truncated to " << maxTracks - 1 diff --git a/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorOnGPU.h b/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorOnGPU.h index 745579b960b76..0310b51d06e35 100644 --- a/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorOnGPU.h +++ 
b/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorOnGPU.h @@ -2,8 +2,14 @@ #define RecoPixelVertexing_PixelTriplets_plugins_CAHitNtupletGeneratorOnGPU_h #include -#include "CUDADataFormats/TrackingRecHit/interface/TrackingRecHit2DHeterogeneous.h" -#include "CUDADataFormats/Track/interface/PixelTrackHeterogeneous.h" +// #include "CUDADataFormats/TrackingRecHit/interface/TrackingRecHit2DHeterogeneous.h" +// #include "CUDADataFormats/Track/interface/PixelTrackHeterogeneous.h" +#include "CUDADataFormats/Track/interface/TrackSoAHeterogeneousHost.h" +#include "CUDADataFormats/Track/interface/TrackSoAHeterogeneousDevice.h" + +#include "CUDADataFormats/TrackingRecHit/interface/TrackingRecHitsUtilities.h" +#include "CUDADataFormats/TrackingRecHit/interface/TrackingRecHitSoAHost.h" +#include "CUDADataFormats/TrackingRecHit/interface/TrackingRecHitSoADevice.h" #include "DataFormats/SiPixelDetId/interface/PixelSubdetector.h" #include "FWCore/ParameterSet/interface/ParameterSet.h" @@ -24,28 +30,30 @@ namespace edm { template class CAHitNtupletGeneratorOnGPU { public: - using PixelTrackHeterogeneous = PixelTrackHeterogeneousT; - using HitsView = TrackingRecHit2DSOAViewT; - using HitsOnGPU = TrackingRecHit2DGPUT; - using HitsOnCPU = TrackingRecHit2DCPUT; - using hindex_type = typename HitsView::hindex_type; + using HitsView = HitSoAView; + using HitsConstView = HitSoAConstView; + using HitsOnGPU = TrackingRecHitSoADevice; //TODO move to OnDevice + using HitsOnCPU = TrackingRecHitSoAHost; //TODO move to OnHost + using hindex_type = typename trackingRecHitSoA::hindex_type; using HitToTuple = caStructures::HitToTupleT; using TupleMultiplicity = caStructures::TupleMultiplicityT; using OuterHitOfCell = caStructures::OuterHitOfCellT; using GPUCACell = GPUCACellT; - using OutputSoA = pixelTrack::TrackSoAT; - using HitContainer = typename OutputSoA::HitContainer; + // using OutputSoA = pixelTrack::TrackSoAT; + using TrackSoAHost = TrackSoAHeterogeneousHost; + using 
TrackSoADevice = TrackSoAHeterogeneousDevice; + using HitContainer = typename trackSoA::HitContainer; using Tuple = HitContainer; using CellNeighborsVector = caStructures::CellNeighborsVectorT; using CellTracksVector = caStructures::CellTracksVectorT; - using Quality = pixelTrack::Quality; + using Quality = pixelTrackSoA::Quality; - using QualityCuts = pixelTrack::QualityCutsT; + using QualityCuts = pixelTrackSoA::QualityCutsT; using Params = caHitNtupletGenerator::ParamsT; using Counters = caHitNtupletGenerator::Counters; @@ -61,16 +69,16 @@ class CAHitNtupletGeneratorOnGPU { void beginJob(); void endJob(); - PixelTrackHeterogeneous makeTuplesAsync(HitsOnGPU const& hits_d, float bfield, cudaStream_t stream) const; + TrackSoADevice makeTuplesAsync(HitsOnGPU const& hits_d, float bfield, cudaStream_t stream) const; - PixelTrackHeterogeneous makeTuples(HitsOnCPU const& hits_d, float bfield) const; + TrackSoAHost makeTuples(HitsOnCPU const& hits_d, float bfield) const; private: - void buildDoublets(HitsOnGPU const& hh, cudaStream_t stream) const; + void buildDoublets(HitsConstView hh, cudaStream_t stream) const; - void hitNtuplets(HitsOnGPU const& hh, const edm::EventSetup& es, bool useRiemannFit, cudaStream_t cudaStream); + void hitNtuplets(HitsConstView hh, const edm::EventSetup& es, bool useRiemannFit, cudaStream_t cudaStream); - void launchKernels(HitsOnGPU const& hh, bool useRiemannFit, cudaStream_t cudaStream) const; + void launchKernels(HitsConstView hh, bool useRiemannFit, cudaStream_t cudaStream) const; Params m_params; diff --git a/RecoPixelVertexing/PixelTriplets/plugins/GPUCACell.h b/RecoPixelVertexing/PixelTriplets/plugins/GPUCACell.h index 965889abcb268..e14af7482146e 100644 --- a/RecoPixelVertexing/PixelTriplets/plugins/GPUCACell.h +++ b/RecoPixelVertexing/PixelTriplets/plugins/GPUCACell.h @@ -9,12 +9,12 @@ #include -#include "CUDADataFormats/TrackingRecHit/interface/TrackingRecHit2DHeterogeneous.h" +#include 
"CUDADataFormats/TrackingRecHit/interface/TrackingRecHitsUtilities.h" #include "HeterogeneousCore/CUDAUtilities/interface/SimpleVector.h" #include "HeterogeneousCore/CUDAUtilities/interface/VecArray.h" #include "HeterogeneousCore/CUDAUtilities/interface/cuda_assert.h" #include "RecoPixelVertexing/PixelTriplets/interface/CircleEq.h" -#include "CUDADataFormats/Track/interface/PixelTrackHeterogeneous.h" +#include "CUDADataFormats/Track/interface/PixelTrackUtilities.h" #include "Geometry/CommonTopologies/interface/SimplePixelTopology.h" #include "CAStructures.h" @@ -31,16 +31,16 @@ class GPUCACellT { using CellNeighborsVector = caStructures::CellNeighborsVectorT; using CellTracksVector = caStructures::CellTracksVectorT; - using Hits = TrackingRecHit2DSOAViewT; + using HitsConstView = HitSoAConstView; using hindex_type = typename TrackerTraits::hindex_type; using tindex_type = typename TrackerTraits::tindex_type; static constexpr auto invalidHitId = std::numeric_limits::max(); using TmpTuple = cms::cuda::VecArray; - using HitContainer = pixelTrack::HitContainerT; - using Quality = pixelTrack::Quality; - static constexpr auto bad = pixelTrack::Quality::bad; + using HitContainer = typename trackSoA::HitContainer; + using Quality = pixelTrackSoA::Quality; + static constexpr auto bad = pixelTrackSoA::Quality::bad; enum class StatusBit : uint16_t { kUsed = 1, kInTrack = 2, kKilled = 1 << 15 }; @@ -48,7 +48,7 @@ class GPUCACellT { __device__ __forceinline__ void init(CellNeighborsVector& cellNeighbors, CellTracksVector& cellTracks, - Hits const& hh, + HitsConstView hh, int layerPairId, hindex_type innerHitId, hindex_type outerHitId) { @@ -59,8 +59,8 @@ class GPUCACellT { theFishboneId = invalidHitId; // optimization that depends on access pattern - theInnerZ = hh.zGlobal(innerHitId); - theInnerR = hh.rGlobal(innerHitId); + theInnerZ = hh[innerHitId].zGlobal(); + theInnerR = hh[innerHitId].rGlobal(); // link to default empty theOuterNeighbors = &cellNeighbors[0]; @@ -115,22 
+115,22 @@ class GPUCACellT { __device__ __forceinline__ CellTracks const& tracks() const { return *theTracks; } __device__ __forceinline__ CellNeighbors& outerNeighbors() { return *theOuterNeighbors; } __device__ __forceinline__ CellNeighbors const& outerNeighbors() const { return *theOuterNeighbors; } - __device__ __forceinline__ float inner_x(Hits const& hh) const { return hh.xGlobal(theInnerHitId); } - __device__ __forceinline__ float outer_x(Hits const& hh) const { return hh.xGlobal(theOuterHitId); } - __device__ __forceinline__ float inner_y(Hits const& hh) const { return hh.yGlobal(theInnerHitId); } - __device__ __forceinline__ float outer_y(Hits const& hh) const { return hh.yGlobal(theOuterHitId); } - __device__ __forceinline__ float inner_z(Hits const& hh) const { return theInnerZ; } + __device__ __forceinline__ float inner_x(HitsConstView hh) const { return hh[theInnerHitId].xGlobal(); } + __device__ __forceinline__ float outer_x(HitsConstView hh) const { return hh[theOuterHitId].xGlobal(); } + __device__ __forceinline__ float inner_y(HitsConstView hh) const { return hh[theInnerHitId].yGlobal(); } + __device__ __forceinline__ float outer_y(HitsConstView hh) const { return hh[theOuterHitId].yGlobal(); } + __device__ __forceinline__ float inner_z(HitsConstView hh) const { return theInnerZ; } // { return hh.zGlobal(theInnerHitId); } // { return theInnerZ; } - __device__ __forceinline__ float outer_z(Hits const& hh) const { return hh.zGlobal(theOuterHitId); } - __device__ __forceinline__ float inner_r(Hits const& hh) const { return theInnerR; } + __device__ __forceinline__ float outer_z(HitsConstView hh) const { return hh[theOuterHitId].zGlobal(); } + __device__ __forceinline__ float inner_r(HitsConstView hh) const { return theInnerR; } // { return hh.rGlobal(theInnerHitId); } // { return theInnerR; } - __device__ __forceinline__ float outer_r(Hits const& hh) const { return hh.rGlobal(theOuterHitId); } + __device__ __forceinline__ float outer_r(HitsConstView 
hh) const { return hh[theOuterHitId].rGlobal(); } - __device__ __forceinline__ auto inner_iphi(Hits const& hh) const { return hh.iphi(theInnerHitId); } - __device__ __forceinline__ auto outer_iphi(Hits const& hh) const { return hh.iphi(theOuterHitId); } + __device__ __forceinline__ auto inner_iphi(HitsConstView hh) const { return hh[theInnerHitId].iphi(); } + __device__ __forceinline__ auto outer_iphi(HitsConstView hh) const { return hh[theOuterHitId].iphi(); } - __device__ __forceinline__ float inner_detIndex(Hits const& hh) const { return hh.detectorIndex(theInnerHitId); } - __device__ __forceinline__ float outer_detIndex(Hits const& hh) const { return hh.detectorIndex(theOuterHitId); } + __device__ __forceinline__ float inner_detIndex(HitsConstView hh) const { return hh[theInnerHitId].detectorIndex(); } + __device__ __forceinline__ float outer_detIndex(HitsConstView hh) const { return hh[theOuterHitId].detectorIndex(); } constexpr unsigned int inner_hit_id() const { return theInnerHitId; } constexpr unsigned int outer_hit_id() const { return theOuterHitId; } @@ -142,7 +142,7 @@ class GPUCACellT { theOuterHitId); } - __device__ bool check_alignment(Hits const& hh, + __device__ bool check_alignment(HitsConstView hh, GPUCACellT const& otherCell, const float ptmin, const float hardCurvCut, @@ -189,7 +189,7 @@ class GPUCACellT { return tan_12_13_half_mul_distance_13_squared * pMin <= thetaCut * distance_13_squared * radius_diff; } - __device__ inline bool dcaCut(Hits const& hh, + __device__ inline bool dcaCut(HitsConstView hh, GPUCACellT const& otherCell, const float region_origin_radius_plus_tolerance, const float maxCurv) const { @@ -226,7 +226,7 @@ class GPUCACellT { return std::abs(eq.dca0()) < region_origin_radius_plus_tolerance * std::abs(eq.curvature()); } - __device__ inline bool hole0(Hits const& hh, GPUCACellT const& innerCell) const { + __device__ inline bool hole0(HitsConstView hh, GPUCACellT const& innerCell) const { using namespace phase1PixelTopology; 
int p = innerCell.inner_iphi(hh); @@ -247,7 +247,7 @@ class GPUCACellT { return gap; } - __device__ inline bool hole4(Hits const& hh, GPUCACellT const& innerCell) const { + __device__ inline bool hole4(HitsConstView hh, GPUCACellT const& innerCell) const { using namespace phase1PixelTopology; int p = outer_iphi(hh); @@ -274,7 +274,7 @@ class GPUCACellT { // the visit of the graph based on the neighborhood connections between cells. template - __device__ inline void find_ntuplets(Hits const& hh, + __device__ inline void find_ntuplets(HitsConstView hh, GPUCACellT* __restrict__ cells, CellTracksVector& cellTracks, HitContainer& foundNtuplets, @@ -356,14 +356,14 @@ class GPUCACellT { __device__ __forceinline__ bool unused() const { return 0 == (uint16_t(StatusBit::kUsed) & theStatus_); } __device__ __forceinline__ void setStatusBits(StatusBit mask) { theStatus_ |= uint16_t(mask); } - __device__ __forceinline__ void setFishbone(hindex_type id, float z, Hits const& hh) { + __device__ __forceinline__ void setFishbone(hindex_type id, float z, HitsConstView hh) { // make it deterministic: use the farther apart (in z) auto old = theFishboneId; while ( old != atomicCAS(&theFishboneId, old, - (invalidHitId == old || std::abs(z - theInnerZ) > std::abs(hh.zGlobal(old) - theInnerZ)) ? id : old)) + (invalidHitId == old || std::abs(z - theInnerZ) > std::abs(hh[old].zGlobal() - theInnerZ)) ? 
id : old)) old = theFishboneId; } __device__ __forceinline__ auto fishboneId() const { return theFishboneId; } diff --git a/RecoPixelVertexing/PixelTriplets/plugins/HelixFitOnGPU.cc b/RecoPixelVertexing/PixelTriplets/plugins/HelixFitOnGPU.cc index c300329a82208..bcadf5497f911 100644 --- a/RecoPixelVertexing/PixelTriplets/plugins/HelixFitOnGPU.cc +++ b/RecoPixelVertexing/PixelTriplets/plugins/HelixFitOnGPU.cc @@ -2,17 +2,18 @@ #include "HelixFitOnGPU.h" template -void HelixFitOnGPU::allocateOnGPU( - Tuples const *tuples, - caStructures::TupleMultiplicityT const *tupleMultiplicity, - pixelTrack::TrackSoAT *helix_fit_results) { - tuples_ = tuples; +void HelixFitOnGPU::allocateOnGPU(TupleMultiplicity const *tupleMultiplicity, OutputSoAView helix_fit_results) +{ + + // tuples_ = tuples; + tuples_ = &helix_fit_results.hitIndices(); tupleMultiplicity_ = tupleMultiplicity; outputSoa_ = helix_fit_results; assert(tuples_); assert(tupleMultiplicity_); - assert(outputSoa_); + assert(outputSoa_.chi2()); + assert(outputSoa_.pt()); } template diff --git a/RecoPixelVertexing/PixelTriplets/plugins/HelixFitOnGPU.h b/RecoPixelVertexing/PixelTriplets/plugins/HelixFitOnGPU.h index 78bec6f5e2a87..d82228312b690 100644 --- a/RecoPixelVertexing/PixelTriplets/plugins/HelixFitOnGPU.h +++ b/RecoPixelVertexing/PixelTriplets/plugins/HelixFitOnGPU.h @@ -2,7 +2,9 @@ #define RecoPixelVertexing_PixelTriplets_plugins_HelixFitOnGPU_h #include "CUDADataFormats/Track/interface/PixelTrackHeterogeneous.h" -#include "CUDADataFormats/TrackingRecHit/interface/TrackingRecHit2DHeterogeneous.h" +// #include "CUDADataFormats/TrackingRecHit/interface/TrackingRecHit2DHeterogeneous.h" +#include "CUDADataFormats/Track/interface/PixelTrackUtilities.h" +#include "CUDADataFormats/TrackingRecHit/interface/TrackingRecHitsUtilities.h" #include "RecoPixelVertexing/PixelTrackFitting/interface/FitResult.h" #include "Geometry/CommonTopologies/interface/SimplePixelTopology.h" @@ -43,10 +45,19 @@ namespace riemannFit {
template class HelixFitOnGPU { public: - using HitsView = TrackingRecHit2DSOAViewT; - using Tuples = pixelTrack::HitContainerT; - using OutputSoA = pixelTrack::TrackSoAT; + using trackingRecHitSoAs = trackingRecHitSoA; + using trackSoAs = trackSoA; + + using HitView = HitSoAView; + using HitConstView = HitSoAConstView; + + using Tuples = typename trackSoAs::HitContainer; + using OutputSoAView = TrackSoAView; + + + // using Tuples = pixelTrack::HitContainerT; + // using OutputSoA = pixelTrack::TrackSoAT; using TupleMultiplicity = caStructures::TupleMultiplicityT; @@ -54,13 +65,13 @@ class HelixFitOnGPU { ~HelixFitOnGPU() { deallocateOnGPU(); } void setBField(double bField) { bField_ = bField; } - void launchRiemannKernels(HitsView const *hv, uint32_t nhits, uint32_t maxNumberOfTuples, cudaStream_t cudaStream); - void launchBrokenLineKernels(HitsView const *hv, uint32_t nhits, uint32_t maxNumberOfTuples, cudaStream_t cudaStream); + void launchRiemannKernels(HitConstView hv, uint32_t nhits, uint32_t maxNumberOfTuples, cudaStream_t cudaStream); + void launchBrokenLineKernels(HitConstView hv, uint32_t nhits, uint32_t maxNumberOfTuples, cudaStream_t cudaStream); - void launchRiemannKernelsOnCPU(HitsView const *hv, uint32_t nhits, uint32_t maxNumberOfTuples); - void launchBrokenLineKernelsOnCPU(HitsView const *hv, uint32_t nhits, uint32_t maxNumberOfTuples); + void launchRiemannKernelsOnCPU(HitConstView hv, uint32_t nhits, uint32_t maxNumberOfTuples); + void launchBrokenLineKernelsOnCPU(HitConstView hv, uint32_t nhits, uint32_t maxNumberOfTuples); - void allocateOnGPU(Tuples const *tuples, TupleMultiplicity const *tupleMultiplicity, OutputSoA *outputSoA); + void allocateOnGPU(TupleMultiplicity const *tupleMultiplicity, OutputSoAView helix_fit_results); void deallocateOnGPU(); private: @@ -69,7 +80,7 @@ class HelixFitOnGPU { // fowarded Tuples const *tuples_ = nullptr; TupleMultiplicity const *tupleMultiplicity_ = nullptr; - OutputSoA *outputSoa_; + OutputSoAView 
outputSoa_; float bField_; const bool fitNas4_; diff --git a/RecoPixelVertexing/PixelTriplets/plugins/RiemannFitOnGPU.cc b/RecoPixelVertexing/PixelTriplets/plugins/RiemannFitOnGPU.cc index e4a7de6adaf4c..0132fa1238de7 100644 --- a/RecoPixelVertexing/PixelTriplets/plugins/RiemannFitOnGPU.cc +++ b/RecoPixelVertexing/PixelTriplets/plugins/RiemannFitOnGPU.cc @@ -1,9 +1,7 @@ #include "RiemannFitOnGPU.h" template -void HelixFitOnGPU::launchRiemannKernelsOnCPU(HitsView const *hv, - uint32_t nhits, - uint32_t maxNumberOfTuples) { +void HelixFitOnGPU::launchRiemannKernelsOnCPU(HitSoAConstView hv, uint32_t nhits, uint32_t maxNumberOfTuples) { assert(tuples_); // Fit internals diff --git a/RecoPixelVertexing/PixelTriplets/plugins/RiemannFitOnGPU.cu b/RecoPixelVertexing/PixelTriplets/plugins/RiemannFitOnGPU.cu index 3d6b2d570077e..3e3afef8190c1 100644 --- a/RecoPixelVertexing/PixelTriplets/plugins/RiemannFitOnGPU.cu +++ b/RecoPixelVertexing/PixelTriplets/plugins/RiemannFitOnGPU.cu @@ -2,7 +2,7 @@ #include "HeterogeneousCore/CUDAUtilities/interface/device_unique_ptr.h" template -void HelixFitOnGPU::launchRiemannKernels(HitsView const *hv, +void HelixFitOnGPU::launchRiemannKernels(HitSoAConstView hv, uint32_t nhits, uint32_t maxNumberOfTuples, cudaStream_t stream) { diff --git a/RecoPixelVertexing/PixelTriplets/plugins/RiemannFitOnGPU.h b/RecoPixelVertexing/PixelTriplets/plugins/RiemannFitOnGPU.h index 18dd205cd13c3..febbf30a54441 100644 --- a/RecoPixelVertexing/PixelTriplets/plugins/RiemannFitOnGPU.h +++ b/RecoPixelVertexing/PixelTriplets/plugins/RiemannFitOnGPU.h @@ -6,7 +6,9 @@ #include -#include "CUDADataFormats/TrackingRecHit/interface/TrackingRecHit2DHeterogeneous.h" +// #include "CUDADataFormats/TrackingRecHit/interface/TrackingRecHit2DHeterogeneous.h" +#include "CUDADataFormats/TrackingRecHit/interface/TrackingRecHitsUtilities.h" +#include "CUDADataFormats/Track/interface/PixelTrackUtilities.h" #include "HeterogeneousCore/CUDAUtilities/interface/cudaCheck.h" #include 
"HeterogeneousCore/CUDAUtilities/interface/cuda_assert.h" #include "RecoLocalTracker/SiPixelRecHits/interface/pixelCPEforGPU.h" @@ -14,12 +16,11 @@ #include "HelixFitOnGPU.h" + template -using HitsOnGPU = TrackingRecHit2DSOAViewT; -template -using Tuples = pixelTrack::HitContainerT; +using Tuples = typename trackSoA::HitContainer; template -using OutputSoA = pixelTrack::TrackSoAT; +using OutputSoAView = TrackSoAView; template using TupleMultiplicity = caStructures::TupleMultiplicityT; @@ -27,7 +28,7 @@ template __global__ void kernel_FastFit(Tuples const *__restrict__ foundNtuplets, TupleMultiplicity const *__restrict__ tupleMultiplicity, uint32_t nHits, - HitsOnGPU const *__restrict__ hhp, + HitSoAConstView hh, double *__restrict__ phits, float *__restrict__ phits_ge, double *__restrict__ pfast_fit, @@ -68,14 +69,12 @@ __global__ void kernel_FastFit(Tuples const *__restrict__ foundNt auto const *hitId = foundNtuplets->begin(tkid); for (unsigned int i = 0; i < hitsInFit; ++i) { auto hit = hitId[i]; - // printf("Hit global: %f,%f,%f\n", hhp->xg_d[hit],hhp->yg_d[hit],hhp->zg_d[hit]); + // printf("Hit global: %f,%f,%f\n", hh.xg_d[hit],hh.yg_d[hit],hh.zg_d[hit]); float ge[6]; - hhp->cpeParams() - .detParams(hhp->detectorIndex(hit)) - .frame.toGlobal(hhp->xerrLocal(hit), 0, hhp->yerrLocal(hit), ge); - // printf("Error: %d: %f,%f,%f,%f,%f,%f\n",hhp->detInd_d[hit],ge[0],ge[1],ge[2],ge[3],ge[4],ge[5]); + hh.cpeParams().detParams(hh[hit].detectorIndex()).frame.toGlobal(hh[hit].xerrLocal(), 0, hh[hit].yerrLocal(), ge); + // printf("Error: %d: %f,%f,%f,%f,%f,%f\n",hh.detInd_d[hit],ge[0],ge[1],ge[2],ge[3],ge[4],ge[5]); - hits.col(i) << hhp->xGlobal(hit), hhp->yGlobal(hit), hhp->zGlobal(hit); + hits.col(i) << hh[hit].xGlobal(), hh[hit].yGlobal(), hh[hit].zGlobal(); hits_ge.col(i) << ge[0], ge[1], ge[2], ge[3], ge[4], ge[5]; } riemannFit::fastFit(hits, fast_fit); @@ -133,13 +132,15 @@ template __global__ void kernel_LineFit(TupleMultiplicity const *__restrict__ 
tupleMultiplicity, uint32_t nHits, double bField, - OutputSoA *results, + OutputSoAView results_view, double *__restrict__ phits, float *__restrict__ phits_ge, double *__restrict__ pfast_fit_input, riemannFit::CircleFit *__restrict__ circle_fit, uint32_t offset) { - assert(results); + assert(results_view.pt()); + assert(results_view.eta()); + assert(results_view.chi2()); assert(circle_fit); assert(N <= nHits); @@ -154,7 +155,7 @@ __global__ void kernel_LineFit(TupleMultiplicity const *__restric break; // get it for the ntuple container (one to one to helix) - auto tkid = *(tupleMultiplicity->begin(nHits) + tuple_idx); + int32_t tkid = *(tupleMultiplicity->begin(nHits) + tuple_idx); riemannFit::Map3xNd hits(phits + local_idx); riemannFit::Map4d fast_fit(pfast_fit_input + local_idx); @@ -164,11 +165,16 @@ __global__ void kernel_LineFit(TupleMultiplicity const *__restric riemannFit::fromCircleToPerigee(circle_fit[local_idx]); - results->stateAtBS.copyFromCircle( - circle_fit[local_idx].par, circle_fit[local_idx].cov, line_fit.par, line_fit.cov, 1.f / float(bField), tkid); - results->pt(tkid) = bField / std::abs(circle_fit[local_idx].par(2)); - results->eta(tkid) = asinhf(line_fit.par(0)); - results->chi2(tkid) = (circle_fit[local_idx].chi2 + line_fit.chi2) / (2 * N - 5); + tracksUtilities::copyFromCircle(results_view, + circle_fit[local_idx].par, + circle_fit[local_idx].cov, + line_fit.par, + line_fit.cov, + 1.f / float(bField), + tkid); + results_view[tkid].pt() = bField / std::abs(circle_fit[local_idx].par(2)); + results_view[tkid].eta() = asinhf(line_fit.par(0)); + results_view[tkid].chi2() = (circle_fit[local_idx].chi2 + line_fit.chi2) / (2 * N - 5); #ifdef RIEMANN_DEBUG printf("kernelLineFit size %d for %d hits circle.par(0,1,2): %d %f,%f,%f\n", diff --git a/RecoPixelVertexing/PixelTriplets/plugins/gpuFishbone.h b/RecoPixelVertexing/PixelTriplets/plugins/gpuFishbone.h index d4b3282574ec3..f32adf9f6e770 100644 --- 
a/RecoPixelVertexing/PixelTriplets/plugins/gpuFishbone.h +++ b/RecoPixelVertexing/PixelTriplets/plugins/gpuFishbone.h @@ -27,10 +27,10 @@ namespace gpuPixelDoublets { template using OuterHitOfCell = caStructures::OuterHitOfCellT; template - using Hits = typename GPUCACellT::Hits; + using HitsConstView = typename GPUCACellT::HitsConstView; template - __global__ void fishbone(Hits const* __restrict__ hhp, + __global__ void fishbone(HitsConstView hh, GPUCACellT* cells, uint32_t const* __restrict__ nCells, OuterHitOfCell const isOuterHitOfCellWrap, @@ -38,8 +38,6 @@ namespace gpuPixelDoublets { bool checkTrack) { constexpr auto maxCellsPerHit = GPUCACellT::maxCellsPerHit; - auto const& hh = *hhp; - auto const isOuterHitOfCell = isOuterHitOfCellWrap.container; int32_t offset = isOuterHitOfCellWrap.offset; diff --git a/RecoPixelVertexing/PixelTriplets/plugins/gpuPixelDoublets.h b/RecoPixelVertexing/PixelTriplets/plugins/gpuPixelDoublets.h index deed54ca02b5b..24299af89daba 100644 --- a/RecoPixelVertexing/PixelTriplets/plugins/gpuPixelDoublets.h +++ b/RecoPixelVertexing/PixelTriplets/plugins/gpuPixelDoublets.h @@ -7,18 +7,18 @@ namespace gpuPixelDoublets { - template - using CellNeighbors = caStructures::CellNeighborsT; - template - using CellTracks = caStructures::CellTracksT; - template - using CellNeighborsVector = caStructures::CellNeighborsVectorT; - template - using CellTracksVector = caStructures::CellTracksVectorT; - template - using OuterHitOfCell = caStructures::OuterHitOfCellT; - template - using Hits = typename GPUCACellT::Hits; + // template + // using CellNeighbors = caStructures::CellNeighborsT; + // template + // using CellTracks = caStructures::CellTracksT; + // template + // using CellNeighborsVector = caStructures::CellNeighborsVectorT; + // template + // using CellTracksVector = caStructures::CellTracksVectorT; + // template + // using OuterHitOfCell = caStructures::OuterHitOfCellT; + // template + // using Hits = typename GPUCACellT::Hits; // end 
constants // clang-format on @@ -59,11 +59,10 @@ namespace gpuPixelDoublets { uint32_t* nCells, CellNeighborsVector* cellNeighbors, CellTracksVector* cellTracks, - TrackingRecHit2DSOAViewT const* __restrict__ hhp, + HitsConstView hh, OuterHitOfCell isOuterHitOfCell, int nActualPairs, CellCutsT cuts) { - auto const& __restrict__ hh = *hhp; doubletsFromHisto( nActualPairs, cells, nCells, cellNeighbors, cellTracks, hh, isOuterHitOfCell, cuts); diff --git a/RecoPixelVertexing/PixelTriplets/plugins/gpuPixelDoubletsAlgos.h b/RecoPixelVertexing/PixelTriplets/plugins/gpuPixelDoubletsAlgos.h index 0f3d786a8e476..dd0d2daad5124 100644 --- a/RecoPixelVertexing/PixelTriplets/plugins/gpuPixelDoubletsAlgos.h +++ b/RecoPixelVertexing/PixelTriplets/plugins/gpuPixelDoubletsAlgos.h @@ -7,7 +7,7 @@ #include #include -#include "CUDADataFormats/TrackingRecHit/interface/TrackingRecHit2DHeterogeneous.h" +#include "CUDADataFormats/TrackingRecHit/interface/TrackingRecHitsUtilities.h" #include "DataFormats/Math/interface/approx_atan2.h" #include "HeterogeneousCore/CUDAUtilities/interface/VecArray.h" #include "HeterogeneousCore/CUDAUtilities/interface/cuda_assert.h" @@ -32,11 +32,11 @@ namespace gpuPixelDoublets { template using OuterHitOfCell = caStructures::OuterHitOfCellT; template - using Hits = typename GPUCACellT::Hits; + using HitsConstView = typename GPUCACellT::HitsConstView; template struct CellCutsT { - using H = Hits; + using H = HitsConstView; using T = TrackerTraits; const uint32_t maxNumberOfDoublets_; @@ -45,21 +45,21 @@ namespace gpuPixelDoublets { const bool doPtCut_; const bool idealConditions_; //this is actually not used by phase2 - __device__ __forceinline__ bool zSizeCut(H const& hh, int i, int o) const { - auto mi = hh.detectorIndex(i); + __device__ __forceinline__ bool zSizeCut(H hh, int i, int o) const { + auto mi = hh[i].detectorIndex(); bool innerB1 = mi < T::last_bpix1_detIndex; bool isOuterLadder = idealConditions_ ? 
true : 0 == (mi / 8) % 2; - auto mes = (!innerB1) || isOuterLadder ? hh.clusterSizeY(i) : -1; + auto mes = (!innerB1) || isOuterLadder ? hh[i].clusterSizeY() : -1; if (mes < 0) return false; - auto mo = hh.detectorIndex(o); - auto so = hh.clusterSizeY(o); + auto mo = hh[o].detectorIndex(); + auto so = hh[o].clusterSizeY(); - auto dz = hh.zGlobal(i) - hh.zGlobal(o); - auto dr = hh.rGlobal(i) - hh.rGlobal(o); + auto dz = hh[i].zGlobal() - hh[o].zGlobal(); + auto dr = hh[i].rGlobal() - hh[o].rGlobal(); auto innerBarrel = mi < T::last_barrel_detIndex; auto onlyBarrel = mo < T::last_barrel_detIndex; @@ -72,14 +72,14 @@ namespace gpuPixelDoublets { : innerBarrel && std::abs(mes - int(std::abs(dz / dr) * T::dzdrFact + 0.5f)) > T::maxDYPred; } - __device__ __forceinline__ bool clusterCut(H const& hh, int i, int o) const { - auto mo = hh.detectorIndex(o); + __device__ __forceinline__ bool clusterCut(H hh, int i, int o) const { + auto mo = hh[o].detectorIndex(); bool outerFwd = (mo >= T::last_barrel_detIndex); if (!outerFwd) return false; - auto mi = hh.detectorIndex(i); + auto mi = hh[i].detectorIndex(); bool innerB1orB2 = mi < T::last_bpix2_detIndex; if (!innerB1orB2) @@ -87,7 +87,7 @@ namespace gpuPixelDoublets { bool innerB1 = mi < T::last_bpix1_detIndex; bool isOuterLadder = idealConditions_ ? true : 0 == (mi / 8) % 2; - auto mes = (!innerB1) || isOuterLadder ? hh.clusterSizeY(i) : -1; + auto mes = (!innerB1) || isOuterLadder ? 
hh[i].clusterSizeY() : -1; if (innerB1 && outerFwd) // B1 and F1 if (mes > 0 && mes < T::minYsizeB1) @@ -113,7 +113,7 @@ namespace gpuPixelDoublets { uint32_t* nCells, CellNeighborsVector* cellNeighbors, CellTracksVector* cellTracks, - TrackingRecHit2DSOAViewT const& __restrict__ hh, + HitsConstView hh, OuterHitOfCell isOuterHitOfCell, CellCutsT const& cuts) { // ysize cuts (z in the barrel) times 8 @@ -124,10 +124,10 @@ namespace gpuPixelDoublets { const bool doPtCut = cuts.doPtCut_; const uint32_t maxNumOfDoublets = cuts.maxNumberOfDoublets_; - using PhiBinner = typename TrackingRecHit2DSOAViewT::PhiBinner; + using PhiBinner = typename trackingRecHitSoA::PhiBinner; auto const& __restrict__ phiBinner = hh.phiBinner(); - uint32_t const* __restrict__ offsets = hh.hitsLayerStart(); + uint32_t const* __restrict__ offsets = hh.hitsLayerStart().data(); assert(offsets); auto layerSize = [=](uint8_t li) { return offsets[li + 1] - offsets[li]; }; @@ -179,7 +179,7 @@ namespace gpuPixelDoublets { // found hit corresponding to our cuda thread, now do the job - if (hh.detectorIndex(i) > gpuClustering::maxNumModules) + if (hh[i].detectorIndex() > gpuClustering::maxNumModules) continue; // invalid /* maybe clever, not effective when zoCut is on @@ -188,7 +188,7 @@ namespace gpuPixelDoublets { if ( ((inner<3) & (outer>3)) && bpos!=fpos) continue; */ - auto mez = hh.zGlobal(i); + auto mez = hh[i].zGlobal(); if (mez < TrackerTraits::minz[pairLayerId] || mez > TrackerTraits::maxz[pairLayerId]) continue; @@ -196,8 +196,8 @@ namespace gpuPixelDoublets { if (doClusterCut && cuts.clusterCut(hh, i, fo)) continue; - auto mep = hh.iphi(i); - auto mer = hh.rGlobal(i); + auto mep = hh[i].iphi(); + auto mer = hh[i].rGlobal(); // all cuts: true if fails constexpr float z0cut = TrackerTraits::z0Cut; // cm @@ -208,13 +208,13 @@ namespace gpuPixelDoublets { auto ptcut = [&](int j, int16_t idphi) { auto r2t4 = minRadius2T4; auto ri = mer; - auto ro = hh.rGlobal(j); + auto ro = hh[j].rGlobal(); 
auto dphi = short2phi(idphi); return dphi * dphi * (r2t4 - ri * ro) > (ro - ri) * (ro - ri); }; auto z0cutoff = [&](int j) { - auto zo = hh.zGlobal(j); - auto ro = hh.rGlobal(j); + auto zo = hh[j].zGlobal(); + auto ro = hh[j].rGlobal(); auto dr = ro - mer; return dr > TrackerTraits::maxr[pairLayerId] || dr < 0 || std::abs((mez * ro - mer * zo)) > z0cut * dr; }; @@ -245,14 +245,14 @@ namespace gpuPixelDoublets { auto oi = __ldg(p); assert(oi >= offsets[outer]); assert(oi < offsets[outer + 1]); - auto mo = hh.detectorIndex(oi); + auto mo = hh[oi].detectorIndex(); if (mo > gpuClustering::maxNumModules) continue; // invalid if (doZ0Cut && z0cutoff(oi)) continue; - auto mop = hh.iphi(oi); + auto mop = hh[oi].iphi(); uint16_t idphi = std::min(std::abs(int16_t(mop - mep)), std::abs(int16_t(mep - mop))); if (idphi > iphicut) continue; diff --git a/RecoPixelVertexing/PixelTriplets/test/BuildFile.xml b/RecoPixelVertexing/PixelTriplets/test/BuildFile.xml index d480d7408b9e2..522b186f3351b 100644 --- a/RecoPixelVertexing/PixelTriplets/test/BuildFile.xml +++ b/RecoPixelVertexing/PixelTriplets/test/BuildFile.xml @@ -26,4 +26,5 @@ + diff --git a/RecoPixelVertexing/PixelVertexFinding/plugins/PixelVertexProducerCUDA.cc b/RecoPixelVertexing/PixelVertexFinding/plugins/PixelVertexProducerCUDA.cc index 024c95398b988..5b7e7e6aa1a1c 100644 --- a/RecoPixelVertexing/PixelVertexFinding/plugins/PixelVertexProducerCUDA.cc +++ b/RecoPixelVertexing/PixelVertexFinding/plugins/PixelVertexProducerCUDA.cc @@ -18,13 +18,19 @@ #include "FWCore/Utilities/interface/RunningAverage.h" #include "HeterogeneousCore/CUDACore/interface/ScopedContext.h" +#include "CUDADataFormats/Track/interface/TrackSoAHeterogeneousDevice.h" +#include "CUDADataFormats/Track/interface/TrackSoAHeterogeneousHost.h" +#include "CUDADataFormats/Vertex/interface/ZVertexSoAHeterogeneousDevice.h" +#include "CUDADataFormats/Vertex/interface/ZVertexSoAHeterogeneousHost.h" + #include "gpuVertexFinder.h" #undef PIXVERTEX_DEBUG_PRODUCE 
template class PixelVertexProducerCUDAT : public edm::global::EDProducer<> { - using PixelTrackHeterogeneous = PixelTrackHeterogeneousT; + using TracksSoADevice = TrackSoAHeterogeneousDevice; + using TracksSoAHost = TrackSoAHeterogeneousHost; using GPUAlgo = gpuVertexFinder::Producer; public: @@ -40,10 +46,10 @@ class PixelVertexProducerCUDAT : public edm::global::EDProducer<> { bool onGPU_; - edm::EDGetTokenT> tokenGPUTrack_; - edm::EDPutTokenT tokenGPUVertex_; - edm::EDGetTokenT tokenCPUTrack_; - edm::EDPutTokenT tokenCPUVertex_; + edm::EDGetTokenT> tokenGPUTrack_; + edm::EDPutTokenT> tokenGPUVertex_; + edm::EDGetTokenT tokenCPUTrack_; + edm::EDPutTokenT tokenCPUVertex_; const GPUAlgo gpuAlgo_; @@ -68,11 +74,11 @@ PixelVertexProducerCUDAT::PixelVertexProducerCUDAT(const edm::Par { if (onGPU_) { tokenGPUTrack_ = - consumes>(conf.getParameter("pixelTrackSrc")); - tokenGPUVertex_ = produces(); + consumes(conf.getParameter("pixelTrackSrc")); + tokenGPUVertex_ = produces>(); } else { tokenCPUTrack_ = consumes(conf.getParameter("pixelTrackSrc")); - tokenCPUVertex_ = produces(); + tokenCPUVertex_ = produces(); } } @@ -104,23 +110,22 @@ template void PixelVertexProducerCUDAT::produceOnGPU(edm::StreamID streamID, edm::Event& iEvent, const edm::EventSetup& iSetup) const { - edm::Handle> hTracks; + using TracksSoA = TrackSoAHeterogeneousDevice; + edm::Handle> hTracks; iEvent.getByToken(tokenGPUTrack_, hTracks); cms::cuda::ScopedContextProduce ctx{*hTracks}; - auto const* tracks = ctx.get(*hTracks).get(); + auto& tracks = ctx.get(*hTracks); - assert(tracks); - ctx.emplace(iEvent, tokenGPUVertex_, gpuAlgo_.makeAsync(ctx.stream(), tracks, ptMin_, ptMax_)); + ctx.emplace(iEvent, tokenGPUVertex_, gpuAlgo_.makeAsync(ctx.stream(), tracks.view(), ptMin_, ptMax_)); } template void PixelVertexProducerCUDAT::produceOnCPU(edm::StreamID streamID, edm::Event& iEvent, const edm::EventSetup& iSetup) const { - auto const* tracks = iEvent.get(tokenCPUTrack_).get(); - assert(tracks); + auto& 
tracks = iEvent.get(tokenCPUTrack_); #ifdef PIXVERTEX_DEBUG_PRODUCE auto const& tsoa = *tracks; @@ -129,8 +134,8 @@ void PixelVertexProducerCUDAT::produceOnCPU(edm::StreamID streamI int32_t nt = 0; for (int32_t it = 0; it < maxTracks; ++it) { - auto nHits = tsoa.nHits(it); - assert(nHits == int(tsoa.hitIndices.size(it))); + auto nHits = tracksUtilities::nHits(tracks.view(),it); + assert(nHits == int(tracks.view().hitIndices().size(it))); if (nHits == 0) break; // this is a guard: maybe we need to move to nTracks... nt++; @@ -138,7 +143,7 @@ void PixelVertexProducerCUDAT::produceOnCPU(edm::StreamID streamI std::cout << "found " << nt << " tracks in cpu SoA for Vertexing at " << tracks << std::endl; #endif // PIXVERTEX_DEBUG_PRODUCE - iEvent.emplace(tokenCPUVertex_, gpuAlgo_.make(tracks, ptMin_, ptMax_)); + iEvent.emplace(tokenCPUVertex_, gpuAlgo_.make(tracks.view(), ptMin_, ptMax_)); } template diff --git a/RecoPixelVertexing/PixelVertexFinding/plugins/PixelVertexProducerFromSoA.cc b/RecoPixelVertexing/PixelVertexFinding/plugins/PixelVertexProducerFromSoA.cc index 8cceeaa42cc10..fe43685fba3ff 100644 --- a/RecoPixelVertexing/PixelVertexFinding/plugins/PixelVertexProducerFromSoA.cc +++ b/RecoPixelVertexing/PixelVertexFinding/plugins/PixelVertexProducerFromSoA.cc @@ -1,4 +1,5 @@ -#include "CUDADataFormats/Vertex/interface/ZVertexHeterogeneous.h" +#include "CUDADataFormats/Vertex/interface/ZVertexSoAHeterogeneousHost.h" +#include "CUDADataFormats/Vertex/interface/ZVertexSoAHeterogeneousDevice.h" #include "DataFormats/BeamSpot/interface/BeamSpot.h" #include "DataFormats/Common/interface/OrphanHandle.h" #include "DataFormats/TrackReco/interface/Track.h" @@ -35,17 +36,17 @@ class PixelVertexProducerFromSoA : public edm::global::EDProducer<> { private: void produce(edm::StreamID streamID, edm::Event &iEvent, const edm::EventSetup &iSetup) const override; - edm::EDGetTokenT tokenVertex_; + edm::EDGetTokenT tokenVertex_; edm::EDGetTokenT tokenBeamSpot_; edm::EDGetTokenT 
tokenTracks_; edm::EDGetTokenT tokenIndToEdm_; }; PixelVertexProducerFromSoA::PixelVertexProducerFromSoA(const edm::ParameterSet &conf) - : tokenVertex_(consumes(conf.getParameter("src"))), - tokenBeamSpot_(consumes(conf.getParameter("beamSpot"))), - tokenTracks_(consumes(conf.getParameter("TrackCollection"))), - tokenIndToEdm_(consumes(conf.getParameter("TrackCollection"))) { + : tokenVertex_(consumes(conf.getParameter("src"))), + tokenBeamSpot_(consumes(conf.getParameter("beamSpot"))), + tokenTracks_(consumes(conf.getParameter("TrackCollection"))), + tokenIndToEdm_(consumes(conf.getParameter("TrackCollection"))) { produces(); } @@ -81,9 +82,9 @@ void PixelVertexProducerFromSoA::produce(edm::StreamID streamID, edm::Event &iEv dydz = bs.dydz(); } - auto const &soa = *(iEvent.get(tokenVertex_).get()); + auto const &soa = iEvent.get(tokenVertex_); - int nv = soa.nvFinal; + int nv = soa.view().nvFinal(); #ifdef PIXVERTEX_DEBUG_PRODUCE std::cout << "converting " << nv << " vertices " @@ -92,20 +93,20 @@ void PixelVertexProducerFromSoA::produce(edm::StreamID streamID, edm::Event &iEv std::set uind; // for verifing index consistency for (int j = nv - 1; j >= 0; --j) { - auto i = soa.sortInd[j]; // on gpu sorted in ascending order.... + auto i = soa.view()[j].sortInd(); // on gpu sorted in ascending order.... assert(i < nv); uind.insert(i); assert(itrk.empty()); - auto z = soa.zv[i]; + auto z = soa.view()[i].zv(); auto x = x0 + dxdz * z; auto y = y0 + dydz * z; z += z0; reco::Vertex::Error err; - err(2, 2) = 1.f / soa.wv[i]; + err(2, 2) = 1.f / soa.view()[i].wv(); err(2, 2) *= 2.; // artifically inflate error //Copy also the tracks (no intention to be efficient....) 
for (auto k = 0U; k < indToEdm.size(); ++k) { - if (soa.idv[k] == int16_t(i)) + if (soa.view()[k].idv() == int16_t(i)) itrk.push_back(k); } auto nt = itrk.size(); @@ -119,7 +120,7 @@ void PixelVertexProducerFromSoA::produce(edm::StreamID streamID, edm::Event &iEv itrk.clear(); continue; } // remove outliers - (*vertexes).emplace_back(reco::Vertex::Point(x, y, z), err, soa.chi2[i], soa.ndof[i], nt); + (*vertexes).emplace_back(reco::Vertex::Point(x, y, z), err, soa.view()[i].chi2(), soa.view()[i].ndof(), nt); auto &v = (*vertexes).back(); v.reserve(itrk.size()); for (auto it : itrk) { diff --git a/RecoPixelVertexing/PixelVertexFinding/plugins/PixelVertexSoAFromCUDA.cc b/RecoPixelVertexing/PixelVertexFinding/plugins/PixelVertexSoAFromCUDA.cc index dc125878b1058..ab0e9320ea4fb 100644 --- a/RecoPixelVertexing/PixelVertexFinding/plugins/PixelVertexSoAFromCUDA.cc +++ b/RecoPixelVertexing/PixelVertexFinding/plugins/PixelVertexSoAFromCUDA.cc @@ -2,7 +2,8 @@ #include "CUDADataFormats/Common/interface/Product.h" #include "CUDADataFormats/Common/interface/HostProduct.h" -#include "CUDADataFormats/Vertex/interface/ZVertexHeterogeneous.h" +#include "CUDADataFormats/Vertex/interface/ZVertexSoAHeterogeneousHost.h" +#include "CUDADataFormats/Vertex/interface/ZVertexSoAHeterogeneousDevice.h" #include "DataFormats/Common/interface/Handle.h" #include "FWCore/Framework/interface/ESHandle.h" #include "FWCore/Framework/interface/Event.h" @@ -30,15 +31,15 @@ class PixelVertexSoAFromCUDA : public edm::stream::EDProducer edm::WaitingTaskWithArenaHolder waitingTaskHolder) override; void produce(edm::Event& iEvent, edm::EventSetup const& iSetup) override; - edm::EDGetTokenT> tokenCUDA_; - edm::EDPutTokenT tokenSOA_; + edm::EDGetTokenT> tokenCUDA_; + edm::EDPutTokenT tokenSOA_; - cms::cuda::host::unique_ptr m_soa; + zVertex::ZVertexSoAHost zvertex_h; }; PixelVertexSoAFromCUDA::PixelVertexSoAFromCUDA(const edm::ParameterSet& iConfig) - : tokenCUDA_(consumes>(iConfig.getParameter("src"))), - 
tokenSOA_(produces()) {} + : tokenCUDA_(consumes>(iConfig.getParameter("src"))), + tokenSOA_(produces()) {} void PixelVertexSoAFromCUDA::fillDescriptions(edm::ConfigurationDescriptions& descriptions) { edm::ParameterSetDescription desc; @@ -50,16 +51,21 @@ void PixelVertexSoAFromCUDA::fillDescriptions(edm::ConfigurationDescriptions& de void PixelVertexSoAFromCUDA::acquire(edm::Event const& iEvent, edm::EventSetup const& iSetup, edm::WaitingTaskWithArenaHolder waitingTaskHolder) { - auto const& inputDataWrapped = iEvent.get(tokenCUDA_); + cms::cuda::Product const& inputDataWrapped = iEvent.get(tokenCUDA_); cms::cuda::ScopedContextAcquire ctx{inputDataWrapped, std::move(waitingTaskHolder)}; - auto const& inputData = ctx.get(inputDataWrapped); - - m_soa = inputData.toHostAsync(ctx.stream()); + auto const& zvertex_d = ctx.get(inputDataWrapped); // Tracks on device + zvertex_h = zVertex::ZVertexSoAHost(ctx.stream()); // Create an instance of Tracks on Host, using the stream + cudaCheck(cudaMemcpyAsync(zvertex_h.buffer().get(), + zvertex_d.const_buffer().get(), + zvertex_d.bufferSize(), + cudaMemcpyDeviceToHost, + ctx.stream())); // Copy data from Device to Host + cudaCheck(cudaGetLastError()); } void PixelVertexSoAFromCUDA::produce(edm::Event& iEvent, edm::EventSetup const& iSetup) { // No copies.... 
- iEvent.emplace(tokenSOA_, ZVertexHeterogeneous(std::move(m_soa))); + iEvent.emplace(tokenSOA_, std::move(zvertex_h)); } DEFINE_FWK_MODULE(PixelVertexSoAFromCUDA); diff --git a/RecoPixelVertexing/PixelVertexFinding/plugins/WorkSpaceSoAHeterogeneousDevice.h b/RecoPixelVertexing/PixelVertexFinding/plugins/WorkSpaceSoAHeterogeneousDevice.h new file mode 100644 index 0000000000000..4679dea96bc7b --- /dev/null +++ b/RecoPixelVertexing/PixelVertexFinding/plugins/WorkSpaceSoAHeterogeneousDevice.h @@ -0,0 +1,23 @@ +#ifndef RecoPixelVertexing_PixelVertexFinding_WorkSpaceSoAHeterogeneousDevice_h +#define RecoPixelVertexing_PixelVertexFinding_WorkSpaceSoAHeterogeneousDevice_h + +#include "CUDADataFormats/Common/interface/PortableDeviceCollection.h" +#include "CUDADataFormats/Vertex/interface/ZVertexUtilities.h" +#include "RecoPixelVertexing/PixelVertexFinding/plugins/WorkSpaceUtilities.h" + +template +class WorkSpaceSoAHeterogeneousDevice : public cms::cuda::PortableDeviceCollection> { +public: + WorkSpaceSoAHeterogeneousDevice() = default; + + // Constructor which specifies the SoA size and CUDA stream + explicit WorkSpaceSoAHeterogeneousDevice(cudaStream_t stream) + : PortableDeviceCollection>(S, stream) {} +}; + +namespace gpuVertexFinder { + namespace workSpace { + using WorkSpaceSoADevice = WorkSpaceSoAHeterogeneousDevice; + } +} // namespace gpuVertexFinder +#endif diff --git a/RecoPixelVertexing/PixelVertexFinding/plugins/WorkSpaceSoAHeterogeneousHost.h b/RecoPixelVertexing/PixelVertexFinding/plugins/WorkSpaceSoAHeterogeneousHost.h new file mode 100644 index 0000000000000..1efba9a8930b2 --- /dev/null +++ b/RecoPixelVertexing/PixelVertexFinding/plugins/WorkSpaceSoAHeterogeneousHost.h @@ -0,0 +1,23 @@ +#ifndef RecoPixelVertexing_PixelVertexFinding_WorkSpaceSoAHeterogeneousHost_h +#define RecoPixelVertexing_PixelVertexFinding_WorkSpaceSoAHeterogeneousHost_h + +#include "CUDADataFormats/Common/interface/PortableHostCollection.h" +#include 
"CUDADataFormats/Vertex/interface/ZVertexUtilities.h" +#include "RecoPixelVertexing/PixelVertexFinding/plugins/WorkSpaceUtilities.h" + +template +class WorkSpaceSoAHeterogeneousHost : public cms::cuda::PortableHostCollection> { +public: + WorkSpaceSoAHeterogeneousHost() = default; + + // Constructor which specifies the SoA size and CUDA stream + explicit WorkSpaceSoAHeterogeneousHost(cudaStream_t stream) + : PortableHostCollection>(S, stream) {} +}; + +namespace gpuVertexFinder { + namespace workSpace { + using WorkSpaceSoAHost = WorkSpaceSoAHeterogeneousHost; + } +} // namespace gpuVertexFinder +#endif diff --git a/RecoPixelVertexing/PixelVertexFinding/plugins/WorkSpaceUtilities.h b/RecoPixelVertexing/PixelVertexFinding/plugins/WorkSpaceUtilities.h new file mode 100644 index 0000000000000..94493fbeb8b24 --- /dev/null +++ b/RecoPixelVertexing/PixelVertexFinding/plugins/WorkSpaceUtilities.h @@ -0,0 +1,35 @@ +#ifndef RecoPixelVertexing_PixelVertexFinding_WorkSpace_h +#define RecoPixelVertexing_PixelVertexFinding_WorkSpace_h + +#include +#include "DataFormats/SoATemplate/interface/SoALayout.h" + +// Intermediate data used in the vertex reco algos +// For internal use only +GENERATE_SOA_LAYOUT(WorkSpaceSoAHeterogeneousLayout, + SOA_COLUMN(uint16_t, itrk), // index of original track + SOA_COLUMN(float, zt), // input track z at bs + SOA_COLUMN(float, ezt2), // input error^2 on the above + SOA_COLUMN(float, ptt2), // input pt^2 on the above + SOA_COLUMN(uint8_t, izt), // interized z-position of input tracks + SOA_COLUMN(int32_t, iv), // vertex index for each associated track + SOA_SCALAR(uint32_t, ntrks), // number of "selected tracks" + SOA_SCALAR(uint32_t, nvIntermediate)) // the number of vertices after splitting pruning etc. + +// Methods that operate on View and ConstView of the WorkSpaceSoALayout. 
+namespace gpuVertexFinder { + namespace workSpace { + using WorkSpaceSoALayout = WorkSpaceSoAHeterogeneousLayout<>; + using WorkSpaceSoAView = WorkSpaceSoAHeterogeneousLayout<>::View; + using WorkSpaceSoAConstView = WorkSpaceSoAHeterogeneousLayout<>::ConstView; + + namespace utilities { + __host__ __device__ inline void init(WorkSpaceSoAView &workspace_view) { + workspace_view.ntrks() = 0; + workspace_view.nvIntermediate() = 0; + } + } // namespace utilities + } // namespace workSpace +} // namespace gpuVertexFinder + +#endif diff --git a/RecoPixelVertexing/PixelVertexFinding/plugins/gpuClusterTracksByDensity.h b/RecoPixelVertexing/PixelVertexFinding/plugins/gpuClusterTracksByDensity.h index f71aa56842a67..655aacd32a96c 100644 --- a/RecoPixelVertexing/PixelVertexFinding/plugins/gpuClusterTracksByDensity.h +++ b/RecoPixelVertexing/PixelVertexFinding/plugins/gpuClusterTracksByDensity.h @@ -17,8 +17,8 @@ namespace gpuVertexFinder { // // based on Rodrighez&Laio algo // - __device__ __forceinline__ void clusterTracksByDensity(gpuVertexFinder::ZVertices* pdata, - gpuVertexFinder::WorkSpace* pws, + __device__ __forceinline__ void clusterTracksByDensity(VtxSoAView pdata, + WsSoAView pws, int minT, // min number of neighbours to be "seed" float eps, // max absolute distance to cluster float errmax, // max error to be "seed" @@ -32,21 +32,24 @@ namespace gpuVertexFinder { auto er2mx = errmax * errmax; - auto& __restrict__ data = *pdata; - auto& __restrict__ ws = *pws; - auto nt = ws.ntrks; - float const* __restrict__ zt = ws.zt; - float const* __restrict__ ezt2 = ws.ezt2; + auto& __restrict__ data = pdata; + auto& __restrict__ ws = pws; + auto nt = ws.ntrks(); + float const* __restrict__ zt = ws.zt(); + float const* __restrict__ ezt2 = ws.ezt2(); - uint32_t& nvFinal = data.nvFinal; - uint32_t& nvIntermediate = ws.nvIntermediate; + uint32_t& nvFinal = data.nvFinal(); + uint32_t& nvIntermediate = ws.nvIntermediate(); - uint8_t* __restrict__ izt = ws.izt; - int32_t* 
__restrict__ nn = data.ndof; - int32_t* __restrict__ iv = ws.iv; + uint8_t* __restrict__ izt = ws.izt(); + int32_t* __restrict__ nn = data.ndof(); + int32_t* __restrict__ iv = ws.iv(); - assert(pdata); assert(zt); + assert(ezt2); + assert(izt); + assert(nn); + assert(iv); using Hist = cms::cuda::HistoContainer; __shared__ Hist hist; @@ -63,7 +66,7 @@ namespace gpuVertexFinder { // fill hist (bin shall be wider than "eps") for (auto i = threadIdx.x; i < nt; i += blockDim.x) { - assert(i < ZVertices::MAXTRACKS); + assert(i < zVertex::utilities::MAXTRACKS); int iz = int(zt[i] * 10.); // valid if eps<=0.1 // iz = std::clamp(iz, INT8_MIN, INT8_MAX); // sorry c++17 only iz = std::min(std::max(iz, INT8_MIN), INT8_MAX); @@ -197,7 +200,7 @@ namespace gpuVertexFinder { } __syncthreads(); - assert(foundClusters < ZVertices::MAXVTX); + assert(foundClusters < zVertex::utilities::MAXVTX); // propagate the negative id to all the tracks in the cluster. for (auto i = threadIdx.x; i < nt; i += blockDim.x) { @@ -219,8 +222,8 @@ namespace gpuVertexFinder { printf("found %d proto vertices\n", foundClusters); } - __global__ void clusterTracksByDensityKernel(gpuVertexFinder::ZVertices* pdata, - gpuVertexFinder::WorkSpace* pws, + __global__ void clusterTracksByDensityKernel(VtxSoAView pdata, + WsSoAView pws, int minT, // min number of neighbours to be "seed" float eps, // max absolute distance to cluster float errmax, // max error to be "seed" diff --git a/RecoPixelVertexing/PixelVertexFinding/plugins/gpuClusterTracksDBSCAN.h b/RecoPixelVertexing/PixelVertexFinding/plugins/gpuClusterTracksDBSCAN.h index a11283a7b2065..f92d9a1d0113d 100644 --- a/RecoPixelVertexing/PixelVertexFinding/plugins/gpuClusterTracksDBSCAN.h +++ b/RecoPixelVertexing/PixelVertexFinding/plugins/gpuClusterTracksDBSCAN.h @@ -14,8 +14,8 @@ namespace gpuVertexFinder { // this algo does not really scale as it works in a single block... 
// enough for <10K tracks we have - __global__ void clusterTracksDBSCAN(ZVertices* pdata, - WorkSpace* pws, + __global__ void clusterTracksDBSCAN(VtxSoAView pdata, + WsSoAView pws, int minT, // min number of neighbours to be "core" float eps, // max absolute distance to cluster float errmax, // max error to be "seed" @@ -28,21 +28,23 @@ namespace gpuVertexFinder { auto er2mx = errmax * errmax; - auto& __restrict__ data = *pdata; - auto& __restrict__ ws = *pws; - auto nt = ws.ntrks; - float const* __restrict__ zt = ws.zt; - float const* __restrict__ ezt2 = ws.ezt2; + auto& __restrict__ data = pdata; + auto& __restrict__ ws = pws; + auto nt = ws.ntrks(); + float const* __restrict__ zt = ws.zt(); + float const* __restrict__ ezt2 = ws.ezt2(); - uint32_t& nvFinal = data.nvFinal; - uint32_t& nvIntermediate = ws.nvIntermediate; + uint32_t& nvFinal = data.nvFinal(); + uint32_t& nvIntermediate = ws.nvIntermediate(); - uint8_t* __restrict__ izt = ws.izt; - int32_t* __restrict__ nn = data.ndof; - int32_t* __restrict__ iv = ws.iv; + uint8_t* __restrict__ izt = ws.izt(); + int32_t* __restrict__ nn = data.ndof(); + int32_t* __restrict__ iv = ws.iv(); - assert(pdata); assert(zt); + assert(iv); + assert(nn); + assert(ezt2); using Hist = cms::cuda::HistoContainer; __shared__ Hist hist; @@ -59,7 +61,7 @@ namespace gpuVertexFinder { // fill hist (bin shall be wider than "eps") for (auto i = threadIdx.x; i < nt; i += blockDim.x) { - assert(i < ZVertices::MAXTRACKS); + assert(i < zVertex::utilities::MAXTRACKS); int iz = int(zt[i] * 10.); // valid if eps<=0.1 iz = std::clamp(iz, INT8_MIN, INT8_MAX); izt[i] = iz - INT8_MIN; @@ -214,7 +216,7 @@ namespace gpuVertexFinder { } __syncthreads(); - assert(foundClusters < ZVertices::MAXVTX); + assert(foundClusters < zVertex::utilities::MAXVTX); // propagate the negative id to all the tracks in the cluster. 
for (auto i = threadIdx.x; i < nt; i += blockDim.x) { diff --git a/RecoPixelVertexing/PixelVertexFinding/plugins/gpuClusterTracksIterative.h b/RecoPixelVertexing/PixelVertexFinding/plugins/gpuClusterTracksIterative.h index 66d246fcfa4fa..21182690ec7e8 100644 --- a/RecoPixelVertexing/PixelVertexFinding/plugins/gpuClusterTracksIterative.h +++ b/RecoPixelVertexing/PixelVertexFinding/plugins/gpuClusterTracksIterative.h @@ -14,8 +14,8 @@ namespace gpuVertexFinder { // this algo does not really scale as it works in a single block... // enough for <10K tracks we have - __global__ void clusterTracksIterative(ZVertices* pdata, - WorkSpace* pws, + __global__ void clusterTracksIterative(VtxSoAView pdata, + WsSoAView pws, int minT, // min number of neighbours to be "core" float eps, // max absolute distance to cluster float errmax, // max error to be "seed" @@ -28,21 +28,23 @@ namespace gpuVertexFinder { auto er2mx = errmax * errmax; - auto& __restrict__ data = *pdata; - auto& __restrict__ ws = *pws; - auto nt = ws.ntrks; - float const* __restrict__ zt = ws.zt; - float const* __restrict__ ezt2 = ws.ezt2; + auto& __restrict__ data = pdata; + auto& __restrict__ ws = pws; + auto nt = ws.ntrks(); + float const* __restrict__ zt = ws.zt(); + float const* __restrict__ ezt2 = ws.ezt2(); - uint32_t& nvFinal = data.nvFinal; - uint32_t& nvIntermediate = ws.nvIntermediate; + uint32_t& nvFinal = data.nvFinal(); + uint32_t& nvIntermediate = ws.nvIntermediate(); - uint8_t* __restrict__ izt = ws.izt; - int32_t* __restrict__ nn = data.ndof; - int32_t* __restrict__ iv = ws.iv; + uint8_t* __restrict__ izt = ws.izt(); + int32_t* __restrict__ nn = data.ndof(); + int32_t* __restrict__ iv = ws.iv(); - assert(pdata); assert(zt); + assert(nn); + assert(iv); + assert(ezt2); using Hist = cms::cuda::HistoContainer; __shared__ Hist hist; @@ -59,7 +61,7 @@ namespace gpuVertexFinder { // fill hist (bin shall be wider than "eps") for (auto i = threadIdx.x; i < nt; i += blockDim.x) { - assert(i < 
ZVertices::MAXTRACKS); + assert(i < zVertex::utilities::MAXTRACKS); int iz = int(zt[i] * 10.); // valid if eps<=0.1 iz = std::clamp(iz, INT8_MIN, INT8_MAX); izt[i] = iz - INT8_MIN; @@ -185,7 +187,7 @@ namespace gpuVertexFinder { } __syncthreads(); - assert(foundClusters < ZVertices::MAXVTX); + assert(foundClusters < zVertex::utilities::MAXVTX); // propagate the negative id to all the tracks in the cluster. for (auto i = threadIdx.x; i < nt; i += blockDim.x) { diff --git a/RecoPixelVertexing/PixelVertexFinding/plugins/gpuFitVertices.h b/RecoPixelVertexing/PixelVertexFinding/plugins/gpuFitVertices.h index 0acf67244528a..888c326a28cd9 100644 --- a/RecoPixelVertexing/PixelVertexFinding/plugins/gpuFitVertices.h +++ b/RecoPixelVertexing/PixelVertexFinding/plugins/gpuFitVertices.h @@ -12,28 +12,32 @@ namespace gpuVertexFinder { - __device__ __forceinline__ void fitVertices(ZVertices* pdata, - WorkSpace* pws, + __device__ __forceinline__ void fitVertices(VtxSoAView pdata, + WsSoAView pws, float chi2Max // for outlier rejection ) { constexpr bool verbose = false; // in principle the compiler should optmize out if false - auto& __restrict__ data = *pdata; - auto& __restrict__ ws = *pws; - auto nt = ws.ntrks; - float const* __restrict__ zt = ws.zt; - float const* __restrict__ ezt2 = ws.ezt2; - float* __restrict__ zv = data.zv; - float* __restrict__ wv = data.wv; - float* __restrict__ chi2 = data.chi2; - uint32_t& nvFinal = data.nvFinal; - uint32_t& nvIntermediate = ws.nvIntermediate; - - int32_t* __restrict__ nn = data.ndof; - int32_t* __restrict__ iv = ws.iv; - - assert(pdata); + auto& __restrict__ data = pdata; + auto& __restrict__ ws = pws; + auto nt = ws.ntrks(); + float const* __restrict__ zt = ws.zt(); + float const* __restrict__ ezt2 = ws.ezt2(); + float* __restrict__ zv = data.zv(); + float* __restrict__ wv = data.wv(); + float* __restrict__ chi2 = data.chi2(); + uint32_t& nvFinal = data.nvFinal(); + uint32_t& nvIntermediate = ws.nvIntermediate(); + + int32_t* 
__restrict__ nn = data.ndof(); + int32_t* __restrict__ iv = ws.iv(); + assert(zt); + assert(ezt2); + assert(zv); + assert(wv); + assert(chi2); + assert(nn); assert(nvFinal <= nvIntermediate); nvFinal = nvIntermediate; @@ -101,8 +105,8 @@ namespace gpuVertexFinder { printf("and %d noise\n", noise); } - __global__ void fitVerticesKernel(ZVertices* pdata, - WorkSpace* pws, + __global__ void fitVerticesKernel(VtxSoAView pdata, + WsSoAView pws, float chi2Max // for outlier rejection ) { fitVertices(pdata, pws, chi2Max); diff --git a/RecoPixelVertexing/PixelVertexFinding/plugins/gpuSortByPt2.h b/RecoPixelVertexing/PixelVertexFinding/plugins/gpuSortByPt2.h index 93f78d498b26f..434dc84c5c6a4 100644 --- a/RecoPixelVertexing/PixelVertexFinding/plugins/gpuSortByPt2.h +++ b/RecoPixelVertexing/PixelVertexFinding/plugins/gpuSortByPt2.h @@ -15,29 +15,29 @@ namespace gpuVertexFinder { - __device__ __forceinline__ void sortByPt2(ZVertices* pdata, WorkSpace* pws) { - auto& __restrict__ data = *pdata; - auto& __restrict__ ws = *pws; - auto nt = ws.ntrks; - float const* __restrict__ ptt2 = ws.ptt2; - uint32_t const& nvFinal = data.nvFinal; + __device__ __forceinline__ void sortByPt2(VtxSoAView pdata, WsSoAView pws) { + auto& __restrict__ data = pdata; + auto& __restrict__ ws = pws; + auto nt = ws.ntrks(); + float const* __restrict__ ptt2 = ws.ptt2(); + uint32_t const& nvFinal = data.nvFinal(); - int32_t const* __restrict__ iv = ws.iv; - float* __restrict__ ptv2 = data.ptv2; - uint16_t* __restrict__ sortInd = data.sortInd; + int32_t const* __restrict__ iv = ws.iv(); + float* __restrict__ ptv2 = data.ptv2(); + uint16_t* __restrict__ sortInd = data.sortInd(); - // if (threadIdx.x == 0) - // printf("sorting %d vertices\n",nvFinal); + assert(ptv2); + assert(sortInd); if (nvFinal < 1) return; // fill indexing for (auto i = threadIdx.x; i < nt; i += blockDim.x) { - data.idv[ws.itrk[i]] = iv[i]; + data[ws[i].itrk()].idv() = iv[i]; } - // can be done asynchronoisly at the end of previous event 
+ // can be done asynchronously at the end of previous event for (auto i = threadIdx.x; i < nvFinal; i += blockDim.x) { ptv2[i] = 0; } @@ -66,7 +66,7 @@ namespace gpuVertexFinder { #endif } - __global__ void sortByPt2Kernel(ZVertices* pdata, WorkSpace* pws) { sortByPt2(pdata, pws); } + __global__ void sortByPt2Kernel(VtxSoAView pdata, WsSoAView pws) { sortByPt2(pdata, pws); } } // namespace gpuVertexFinder diff --git a/RecoPixelVertexing/PixelVertexFinding/plugins/gpuSplitVertices.h b/RecoPixelVertexing/PixelVertexFinding/plugins/gpuSplitVertices.h index 0fe8bd882dcc5..7dce9c25b62a7 100644 --- a/RecoPixelVertexing/PixelVertexFinding/plugins/gpuSplitVertices.h +++ b/RecoPixelVertexing/PixelVertexFinding/plugins/gpuSplitVertices.h @@ -12,24 +12,26 @@ namespace gpuVertexFinder { - __device__ __forceinline__ void splitVertices(ZVertices* pdata, WorkSpace* pws, float maxChi2) { + __device__ __forceinline__ void splitVertices(VtxSoAView pdata, WsSoAView pws, float maxChi2) { constexpr bool verbose = false; // in principle the compiler should optmize out if false - auto& __restrict__ data = *pdata; - auto& __restrict__ ws = *pws; - auto nt = ws.ntrks; - float const* __restrict__ zt = ws.zt; - float const* __restrict__ ezt2 = ws.ezt2; - float* __restrict__ zv = data.zv; - float* __restrict__ wv = data.wv; - float const* __restrict__ chi2 = data.chi2; - uint32_t& nvFinal = data.nvFinal; + auto& __restrict__ data = pdata; + auto& __restrict__ ws = pws; + auto nt = ws.ntrks(); + float const* __restrict__ zt = ws.zt(); + float const* __restrict__ ezt2 = ws.ezt2(); + float* __restrict__ zv = data.zv(); + float* __restrict__ wv = data.wv(); + float const* __restrict__ chi2 = data.chi2(); + uint32_t& nvFinal = data.nvFinal(); - int32_t const* __restrict__ nn = data.ndof; - int32_t* __restrict__ iv = ws.iv; + int32_t const* __restrict__ nn = data.ndof(); + int32_t* __restrict__ iv = ws.iv(); - assert(pdata); assert(zt); + assert(wv); + assert(chi2); + assert(nn); // one vertex per 
block for (auto kv = blockIdx.x; kv < nvFinal; kv += gridDim.x) { @@ -120,7 +122,7 @@ namespace gpuVertexFinder { // get a new global vertex __shared__ uint32_t igv; if (0 == threadIdx.x) - igv = atomicAdd(&ws.nvIntermediate, 1); + igv = atomicAdd(&ws.nvIntermediate(), 1); __syncthreads(); for (auto k = threadIdx.x; k < nq; k += blockDim.x) { if (1 == newV[k]) @@ -130,7 +132,7 @@ namespace gpuVertexFinder { } // loop on vertices } - __global__ void splitVerticesKernel(ZVertices* pdata, WorkSpace* pws, float maxChi2) { + __global__ void splitVerticesKernel(VtxSoAView pdata, WsSoAView pws, float maxChi2) { splitVertices(pdata, pws, maxChi2); } diff --git a/RecoPixelVertexing/PixelVertexFinding/plugins/gpuVertexFinder.cc b/RecoPixelVertexing/PixelVertexFinding/plugins/gpuVertexFinder.cc index 74bcd26f8a79c..369c64aaaf7a9 100644 --- a/RecoPixelVertexing/PixelVertexFinding/plugins/gpuVertexFinder.cc +++ b/RecoPixelVertexing/PixelVertexFinding/plugins/gpuVertexFinder.cc @@ -1,5 +1,8 @@ #include "HeterogeneousCore/CUDAUtilities/interface/cudaCheck.h" +#include "CUDADataFormats/Track/interface/PixelTrackUtilities.h" +#include "CUDADataFormats/Vertex/interface/ZVertexUtilities.h" + #include "gpuClusterTracksByDensity.h" #include "gpuClusterTracksDBSCAN.h" #include "gpuClusterTracksIterative.h" @@ -19,29 +22,24 @@ namespace gpuVertexFinder { constexpr float maxChi2ForSplit = 9.f; template - __global__ void loadTracks( - pixelTrack::TrackSoAT const* ptracks, ZVertexSoA* soa, WorkSpace* pws, float ptMin, float ptMax) { - assert(ptracks); - assert(soa); - auto const& tracks = *ptracks; - auto const& fit = tracks.stateAtBS; - auto const* quality = tracks.qualityData(); - + __global__ void loadTracks(TrackSoAConstView tracks_view, VtxSoAView soa, WsSoAView pws, float ptMin, float ptMax) { + assert(soa.idv()); + auto const* quality = tracks_view.quality(); + using helper = tracksUtilities; auto first = blockIdx.x * blockDim.x + threadIdx.x; - - for (int idx = first, nt = 
tracks.nTracks(); idx < nt; idx += gridDim.x * blockDim.x) { - auto nHits = tracks.nHits(idx); + for (int idx = first, nt = tracks_view.nTracks(); idx < nt; idx += gridDim.x * blockDim.x) { + auto nHits = helper::nHits(tracks_view, idx); assert(nHits >= 3); // initialize soa... - soa->idv[idx] = -1; + soa[idx].idv() = -1; - if (tracks.isTriplet(idx)) + if (helper::isTriplet(tracks_view, idx)) continue; // no triplets - if (quality[idx] < pixelTrack::Quality::highPurity) + if (quality[idx] < pixelTrackSoA::Quality::highPurity) continue; - auto pt = tracks.pt(idx); + auto pt = tracks_view[idx].pt(); if (pt < ptMin) continue; @@ -49,19 +47,19 @@ namespace gpuVertexFinder { // clamp pt pt = std::min(pt, ptMax); - auto& data = *pws; - auto it = atomicAdd(&data.ntrks, 1); - data.itrk[it] = idx; - data.zt[it] = tracks.zip(idx); - data.ezt2[it] = fit.covariance(idx)(14); - data.ptt2[it] = pt * pt; + auto& data = pws; + auto it = atomicAdd(&data.ntrks(), 1); + data[it].itrk() = idx; + data[it].zt() = helper::zip(tracks_view, idx); + data[it].ezt2() = tracks_view[idx].covariance()(14); + data[it].ptt2() = pt * pt; } } // #define THREE_KERNELS #ifndef THREE_KERNELS - __global__ void vertexFinderOneKernel(gpuVertexFinder::ZVertices* pdata, - gpuVertexFinder::WorkSpace* pws, + __global__ void vertexFinderOneKernel(VtxSoAView pdata, + WsSoAView pws, int minT, // min number of neighbours to be "seed" float eps, // max absolute distance to cluster float errmax, // max error to be "seed" @@ -78,8 +76,8 @@ namespace gpuVertexFinder { sortByPt2(pdata, pws); } #else - __global__ void vertexFinderKernel1(gpuVertexFinder::ZVertices* pdata, - gpuVertexFinder::WorkSpace* pws, + __global__ void vertexFinderKernel1(VtxSoAView pdata, + WsSoAView pws, int minT, // min number of neighbours to be "seed" float eps, // max absolute distance to cluster float errmax, // max error to be "seed" @@ -90,53 +88,55 @@ namespace gpuVertexFinder { fitVertices(pdata, pws, maxChi2ForFirstFit); } - __global__ 
void vertexFinderKernel2(gpuVertexFinder::ZVertices* pdata, gpuVertexFinder::WorkSpace* pws) { + __global__ void vertexFinderKernel2(VtxSoAView pdata, WsSoAView pws) { fitVertices(pdata, pws, maxChi2ForFinalFit); __syncthreads(); sortByPt2(pdata, pws); } #endif - template +template #ifdef __CUDACC__ - ZVertexHeterogeneous Producer::makeAsync(cudaStream_t stream, - pixelTrack::TrackSoAT const* tksoa, - float ptMin, - float ptMax) const { + zVertex::ZVertexSoADevice Producer::makeAsync(cudaStream_t stream, + TrackSoAConstView tracks_view, + float ptMin, + float ptMax) const { #ifdef PIXVERTEX_DEBUG_PRODUCE std::cout << "producing Vertices on GPU" << std::endl; #endif // PIXVERTEX_DEBUG_PRODUCE - ZVertexHeterogeneous vertices(cms::cuda::make_device_unique(stream)); + zVertex::ZVertexSoADevice vertices(stream); #else - - ZVertexHeterogeneous Producer::make(pixelTrack::TrackSoAT const* tksoa, - float ptMin, - float ptMax) const { - + zVertex::ZVertexSoAHost Producer::make(TrackSoAConstView tracks_view, float ptMin, float ptMax) const { #ifdef PIXVERTEX_DEBUG_PRODUCE std::cout << "producing Vertices on CPU" << std::endl; #endif // PIXVERTEX_DEBUG_PRODUCE - ZVertexHeterogeneous vertices(std::make_unique()); + zVertex::ZVertexSoAHost vertices(nullptr); #endif - assert(tksoa); - auto* soa = vertices.get(); - assert(soa); + auto soa = vertices.view(); + + assert(soa.idv()); + assert(soa.zv()); + assert(soa.wv()); + assert(soa.chi2()); + assert(soa.ptv2()); + assert(soa.ndof()); + assert(soa.sortInd()); #ifdef __CUDACC__ - auto ws_d = cms::cuda::make_device_unique(stream); + auto ws_d = gpuVertexFinder::workSpace::WorkSpaceSoADevice(stream); #else - auto ws_d = std::make_unique(); + auto ws_d = gpuVertexFinder::workSpace::WorkSpaceSoAHost(nullptr); #endif #ifdef __CUDACC__ - init<<<1, 1, 0, stream>>>(soa, ws_d.get()); + init<<<1, 1, 0, stream>>>(soa, ws_d.view()); auto blockSize = 128; - auto numberOfBlocks = (pixelTrack::TrackSoAT::stride() + blockSize - 1) / blockSize; - 
loadTracks<<>>(tksoa, soa, ws_d.get(), ptMin, ptMax); + auto numberOfBlocks = (tracks_view.metadata().size() + blockSize - 1) / blockSize; + loadTracks<<>>(tracks_view, soa, ws_d.view(), ptMin, ptMax); cudaCheck(cudaGetLastError()); #else - init(soa, ws_d.get()); - loadTracks(tksoa, soa, ws_d.get(), ptMin, ptMax); + init(soa, ws_d.view()); + loadTracks(tracks_view, soa, ws_d.view(), ptMin, ptMax); #endif #ifdef __CUDACC__ @@ -148,50 +148,51 @@ namespace gpuVertexFinder { if (oneKernel_) { // implemented only for density clustesrs #ifndef THREE_KERNELS - vertexFinderOneKernel<<<1, maxThreadsForPrint, 0, stream>>>(soa, ws_d.get(), minT, eps, errmax, chi2max); + vertexFinderOneKernel<<<1, maxThreadsForPrint, 0, stream>>>(soa, ws_d.view(), minT, eps, errmax, chi2max); #else - vertexFinderKernel1<<<1, maxThreadsForPrint, 0, stream>>>(soa, ws_d.get(), minT, eps, errmax, chi2max); + vertexFinderKernel1<<<1, maxThreadsForPrint, 0, stream>>>(soa, ws_d.view(), minT, eps, errmax, chi2max); cudaCheck(cudaGetLastError()); // one block per vertex... 
- splitVerticesKernel<<>>(soa, ws_d.get(), maxChi2ForSplit); + splitVerticesKernel<<>>(soa, ws_d.view(), maxChi2ForSplit); cudaCheck(cudaGetLastError()); - vertexFinderKernel2<<<1, maxThreadsForPrint, 0, stream>>>(soa, ws_d.get()); + vertexFinderKernel2<<<1, maxThreadsForPrint, 0, stream>>>(soa, ws_d.view()); #endif } else { // five kernels if (useDensity_) { - clusterTracksByDensityKernel<<<1, maxThreadsForPrint, 0, stream>>>(soa, ws_d.get(), minT, eps, errmax, chi2max); + clusterTracksByDensityKernel<<<1, maxThreadsForPrint, 0, stream>>>( + soa, ws_d.view(), minT, eps, errmax, chi2max); } else if (useDBSCAN_) { - clusterTracksDBSCAN<<<1, maxThreadsForPrint, 0, stream>>>(soa, ws_d.get(), minT, eps, errmax, chi2max); + clusterTracksDBSCAN<<<1, maxThreadsForPrint, 0, stream>>>(soa, ws_d.view(), minT, eps, errmax, chi2max); } else if (useIterative_) { - clusterTracksIterative<<<1, maxThreadsForPrint, 0, stream>>>(soa, ws_d.get(), minT, eps, errmax, chi2max); + clusterTracksIterative<<<1, maxThreadsForPrint, 0, stream>>>(soa, ws_d.view(), minT, eps, errmax, chi2max); } cudaCheck(cudaGetLastError()); - fitVerticesKernel<<<1, maxThreadsForPrint, 0, stream>>>(soa, ws_d.get(), maxChi2ForFirstFit); + fitVerticesKernel<<<1, maxThreadsForPrint, 0, stream>>>(soa, ws_d.view(), maxChi2ForFirstFit); cudaCheck(cudaGetLastError()); // one block per vertex... 
- splitVerticesKernel<<>>(soa, ws_d.get(), maxChi2ForSplit); + splitVerticesKernel<<>>(soa, ws_d.view(), maxChi2ForSplit); cudaCheck(cudaGetLastError()); - fitVerticesKernel<<<1, maxThreadsForPrint, 0, stream>>>(soa, ws_d.get(), maxChi2ForFinalFit); + fitVerticesKernel<<<1, maxThreadsForPrint, 0, stream>>>(soa, ws_d.view(), maxChi2ForFinalFit); cudaCheck(cudaGetLastError()); - sortByPt2Kernel<<<1, maxThreadsForPrint, 0, stream>>>(soa, ws_d.get()); + sortByPt2Kernel<<<1, maxThreadsForPrint, 0, stream>>>(soa, ws_d.view()); } cudaCheck(cudaGetLastError()); #else // __CUDACC__ if (useDensity_) { - clusterTracksByDensity(soa, ws_d.get(), minT, eps, errmax, chi2max); + clusterTracksByDensity(soa, ws_d.view(), minT, eps, errmax, chi2max); } else if (useDBSCAN_) { - clusterTracksDBSCAN(soa, ws_d.get(), minT, eps, errmax, chi2max); + clusterTracksDBSCAN(soa, ws_d.view(), minT, eps, errmax, chi2max); } else if (useIterative_) { - clusterTracksIterative(soa, ws_d.get(), minT, eps, errmax, chi2max); + clusterTracksIterative(soa, ws_d.view(), minT, eps, errmax, chi2max); } #ifdef PIXVERTEX_DEBUG_PRODUCE - std::cout << "found " << (*ws_d).nvIntermediate << " vertices " << std::endl; + std::cout << "found " << ws_d.view().nvIntermediate() << " vertices " << std::endl; #endif // PIXVERTEX_DEBUG_PRODUCE - fitVertices(soa, ws_d.get(), maxChi2ForFirstFit); + fitVertices(soa, ws_d.view(), maxChi2ForFirstFit); // one block per vertex! 
- splitVertices(soa, ws_d.get(), maxChi2ForSplit); - fitVertices(soa, ws_d.get(), maxChi2ForFinalFit); - sortByPt2(soa, ws_d.get()); + splitVertices(soa, ws_d.view(), maxChi2ForSplit); + fitVertices(soa, ws_d.view(), maxChi2ForFinalFit); + sortByPt2(soa, ws_d.view()); #endif return vertices; @@ -199,5 +200,4 @@ namespace gpuVertexFinder { template class Producer; template class Producer; - } // namespace gpuVertexFinder diff --git a/RecoPixelVertexing/PixelVertexFinding/plugins/gpuVertexFinder.h b/RecoPixelVertexing/PixelVertexFinding/plugins/gpuVertexFinder.h index 6128939f6eb87..f0528e6d7366c 100644 --- a/RecoPixelVertexing/PixelVertexFinding/plugins/gpuVertexFinder.h +++ b/RecoPixelVertexing/PixelVertexFinding/plugins/gpuVertexFinder.h @@ -4,45 +4,29 @@ #include #include -#include "CUDADataFormats/Vertex/interface/ZVertexHeterogeneous.h" -#include "CUDADataFormats/Track/interface/PixelTrackHeterogeneous.h" +#include "CUDADataFormats/Track/interface/PixelTrackUtilities.h" +#include "CUDADataFormats/Vertex/interface/ZVertexSoAHeterogeneousHost.h" +#include "CUDADataFormats/Vertex/interface/ZVertexSoAHeterogeneousDevice.h" +#include "CUDADataFormats/Vertex/interface/ZVertexUtilities.h" +#include "WorkSpaceUtilities.h" +#include "WorkSpaceSoAHeterogeneousHost.h" +#include "WorkSpaceSoAHeterogeneousDevice.h" namespace gpuVertexFinder { - using ZVertices = ZVertexSoA; - // workspace used in the vertex reco algos - struct WorkSpace { - static constexpr uint32_t MAXTRACKS = ZVertexSoA::MAXTRACKS; - static constexpr uint32_t MAXVTX = ZVertexSoA::MAXVTX; + using VtxSoAView = zVertex::ZVertexSoAView; + using WsSoAView = gpuVertexFinder::workSpace::WorkSpaceSoAView; - uint32_t ntrks; // number of "selected tracks" - uint32_t itrk[MAXTRACKS]; // index of original track - float zt[MAXTRACKS]; // input track z at bs - float ezt2[MAXTRACKS]; // input error^2 on the above - float ptt2[MAXTRACKS]; // input pt^2 on the above - uint8_t izt[MAXTRACKS]; // interized z-position of 
input tracks - int32_t iv[MAXTRACKS]; // vertex index for each associated track - - uint32_t nvIntermediate; // the number of vertices after splitting pruning etc. - - __host__ __device__ void init() { - ntrks = 0; - nvIntermediate = 0; - } - }; - - __global__ void init(ZVertexSoA* pdata, WorkSpace* pws) { - pdata->init(); - pws->init(); + __global__ void init(VtxSoAView pdata, WsSoAView pws) { + zVertex::utilities::init(pdata); + gpuVertexFinder::workSpace::utilities::init(pws); } template class Producer { - public: - using ZVertices = ZVertexSoA; - using WorkSpace = gpuVertexFinder::WorkSpace; - using TkSoA = pixelTrack::TrackSoAT; + using TkSoAConstView = TrackSoAConstView; + public: Producer(bool oneKernel, bool useDensity, bool useDBSCAN, @@ -63,8 +47,8 @@ namespace gpuVertexFinder { ~Producer() = default; - ZVertexHeterogeneous makeAsync(cudaStream_t stream, TkSoA const* tksoa, float ptMin, float ptMax) const; - ZVertexHeterogeneous make(TkSoA const* tksoa, float ptMin, float ptMax) const; + zVertex::ZVertexSoADevice makeAsync(cudaStream_t stream, TkSoAConstView tracks_view, float ptMin, float ptMax) const; + zVertex::ZVertexSoAHost make(TkSoAConstView tracks_view, float ptMin, float ptMax) const; private: const bool oneKernel_; diff --git a/RecoPixelVertexing/PixelVertexFinding/test/VertexFinder_t.h b/RecoPixelVertexing/PixelVertexFinding/test/VertexFinder_t.h index 5f8a0646c726a..ab7afdbfd26a6 100644 --- a/RecoPixelVertexing/PixelVertexFinding/test/VertexFinder_t.h +++ b/RecoPixelVertexing/PixelVertexFinding/test/VertexFinder_t.h @@ -7,6 +7,17 @@ #include "HeterogeneousCore/CUDAUtilities/interface/cudaCheck.h" #include "HeterogeneousCore/CUDAUtilities/interface/requireDevices.h" #include "HeterogeneousCore/CUDAUtilities/interface/launch.h" +#include "HeterogeneousCore/CUDAUtilities/interface/allocate_device.h" +#include "HeterogeneousCore/CUDAUtilities/interface/currentDevice.h" +// PixelTrackUtilities only included in order to compile SoALayout with Eigen 
columns +#include "CUDADataFormats/Track/interface/PixelTrackUtilities.h" +#include "CUDADataFormats/Vertex/interface/ZVertexUtilities.h" +#include "CUDADataFormats/Vertex/interface/ZVertexSoAHeterogeneousHost.h" +#include "CUDADataFormats/Vertex/interface/ZVertexSoAHeterogeneousDevice.h" + +#include "RecoPixelVertexing/PixelVertexFinding/plugins/WorkSpaceUtilities.h" +#include "RecoPixelVertexing/PixelVertexFinding/plugins/WorkSpaceSoAHeterogeneousHost.h" +#include "RecoPixelVertexing/PixelVertexFinding/plugins/WorkSpaceSoAHeterogeneousDevice.h" #ifdef USE_DBSCAN #include "RecoPixelVertexing/PixelVertexFinding/plugins/gpuClusterTracksDBSCAN.h" #define CLUSTERIZE gpuVertexFinder::clusterTracksDBSCAN @@ -23,22 +34,22 @@ #ifdef ONE_KERNEL #ifdef __CUDACC__ -__global__ void vertexFinderOneKernel(gpuVertexFinder::ZVertices* pdata, - gpuVertexFinder::WorkSpace* pws, +__global__ void vertexFinderOneKernel(gpuVertexFinder::VtxSoAView pdata, + gpuVertexFinder::WsSoAView pws, int minT, // min number of neighbours to be "seed" float eps, // max absolute distance to cluster float errmax, // max error to be "seed" float chi2max // max normalized distance to cluster, ) { - clusterTracksByDensity(pdata, pws, minT, eps, errmax, chi2max); + gpuVertexFinder::clusterTracksByDensity(pdata, pws, minT, eps, errmax, chi2max); __syncthreads(); - fitVertices(pdata, pws, 50.); + gpuVertexFinder::fitVertices(pdata, pws, 50.); __syncthreads(); - splitVertices(pdata, pws, 9.f); + gpuVertexFinder::splitVertices(pdata, pws, 9.f); __syncthreads(); - fitVertices(pdata, pws, 5000.); + gpuVertexFinder::fitVertices(pdata, pws, 5000.); __syncthreads(); - sortByPt2(pdata, pws); + gpuVertexFinder::sortByPt2(pdata, pws); } #endif #endif @@ -101,25 +112,24 @@ struct ClusterGenerator { std::exponential_distribution ptGen; }; -// a macro SORRY -#define LOC_ONGPU(M) ((char*)(onGPU_d.get()) + offsetof(gpuVertexFinder::ZVertices, M)) -#define LOC_WS(M) ((char*)(ws_d.get()) + 
offsetof(gpuVertexFinder::WorkSpace, M)) - -__global__ void print(gpuVertexFinder::ZVertices const* pdata, gpuVertexFinder::WorkSpace const* pws) { - auto const& __restrict__ data = *pdata; - auto const& __restrict__ ws = *pws; - printf("nt,nv %d %d,%d\n", ws.ntrks, data.nvFinal, ws.nvIntermediate); +__global__ void print(gpuVertexFinder::VtxSoAView pdata, gpuVertexFinder::WsSoAView pws) { + auto& __restrict__ ws = pws; + printf("nt,nv %d %d,%d\n", ws.ntrks(), pdata.nvFinal(), ws.nvIntermediate()); } int main() { + cudaStream_t stream; #ifdef __CUDACC__ cms::cudatest::requireDevices(); + cudaCheck(cudaStreamCreateWithFlags(&stream, cudaStreamNonBlocking)); - auto onGPU_d = cms::cuda::make_device_unique(1, nullptr); - auto ws_d = cms::cuda::make_device_unique(1, nullptr); + zVertex::ZVertexSoADevice onGPU_d(stream); + gpuVertexFinder::workSpace::WorkSpaceSoADevice ws_d(stream); #else - auto onGPU_d = std::make_unique(); - auto ws_d = std::make_unique(); + stream = nullptr; + + zVertex::ZVertexSoAHost onGPU_d(stream); + gpuVertexFinder::workSpace::WorkSpaceSoAHost ws_d(stream); #endif Event ev; @@ -135,24 +145,26 @@ int main() { gen(ev); #ifdef __CUDACC__ - init<<<1, 1, 0, 0>>>(onGPU_d.get(), ws_d.get()); + gpuVertexFinder::init<<<1, 1, 0, stream>>>(onGPU_d.view(), ws_d.view()); #else - onGPU_d->init(); - ws_d->init(); + gpuVertexFinder::init(onGPU_d.view(), ws_d.view()); #endif std::cout << "v,t size " << ev.zvert.size() << ' ' << ev.ztrack.size() << std::endl; auto nt = ev.ztrack.size(); #ifdef __CUDACC__ - cudaCheck(cudaMemcpy(LOC_WS(ntrks), &nt, sizeof(uint32_t), cudaMemcpyHostToDevice)); - cudaCheck(cudaMemcpy(LOC_WS(zt), ev.ztrack.data(), sizeof(float) * ev.ztrack.size(), cudaMemcpyHostToDevice)); - cudaCheck(cudaMemcpy(LOC_WS(ezt2), ev.eztrack.data(), sizeof(float) * ev.eztrack.size(), cudaMemcpyHostToDevice)); - cudaCheck(cudaMemcpy(LOC_WS(ptt2), ev.pttrack.data(), sizeof(float) * ev.eztrack.size(), cudaMemcpyHostToDevice)); + 
cudaCheck(cudaMemcpy(&ws_d.view().ntrks(), &nt, sizeof(uint32_t), cudaMemcpyHostToDevice)); + cudaCheck( + cudaMemcpy(ws_d.view().zt(), ev.ztrack.data(), sizeof(float) * ev.ztrack.size(), cudaMemcpyHostToDevice)); + cudaCheck( + cudaMemcpy(ws_d.view().ezt2(), ev.eztrack.data(), sizeof(float) * ev.eztrack.size(), cudaMemcpyHostToDevice)); + cudaCheck( + cudaMemcpy(ws_d.view().ptt2(), ev.pttrack.data(), sizeof(float) * ev.eztrack.size(), cudaMemcpyHostToDevice)); #else - ::memcpy(LOC_WS(ntrks), &nt, sizeof(uint32_t)); - ::memcpy(LOC_WS(zt), ev.ztrack.data(), sizeof(float) * ev.ztrack.size()); - ::memcpy(LOC_WS(ezt2), ev.eztrack.data(), sizeof(float) * ev.eztrack.size()); - ::memcpy(LOC_WS(ptt2), ev.pttrack.data(), sizeof(float) * ev.eztrack.size()); + ::memcpy(&ws_d.view().ntrks(), &nt, sizeof(uint32_t)); + ::memcpy(ws_d.view().zt(), ev.ztrack.data(), sizeof(float) * ev.ztrack.size()); + ::memcpy(ws_d.view().ezt2(), ev.eztrack.data(), sizeof(float) * ev.eztrack.size()); + ::memcpy(ws_d.view().ptt2(), ev.pttrack.data(), sizeof(float) * ev.eztrack.size()); #endif std::cout << "M eps, pset " << kk << ' ' << eps << ' ' << (i % 4) << std::endl; @@ -168,30 +180,30 @@ int main() { uint32_t nv = 0; #ifdef __CUDACC__ - print<<<1, 1, 0, 0>>>(onGPU_d.get(), ws_d.get()); + print<<<1, 1, 0, stream>>>(onGPU_d.view(), ws_d.view()); cudaCheck(cudaGetLastError()); cudaDeviceSynchronize(); #ifdef ONE_KERNEL - cms::cuda::launch(vertexFinderOneKernel, {1, 512 + 256}, onGPU_d.get(), ws_d.get(), kk, par[0], par[1], par[2]); + cms::cuda::launch(vertexFinderOneKernel, {1, 512 + 256}, onGPU_d.view(), ws_d.view(), kk, par[0], par[1], par[2]); #else - cms::cuda::launch(CLUSTERIZE, {1, 512 + 256}, onGPU_d.get(), ws_d.get(), kk, par[0], par[1], par[2]); + cms::cuda::launch(CLUSTERIZE, {1, 512 + 256}, onGPU_d.view(), ws_d.view(), kk, par[0], par[1], par[2]); #endif - print<<<1, 1, 0, 0>>>(onGPU_d.get(), ws_d.get()); + print<<<1, 1, 0, stream>>>(onGPU_d.view(), ws_d.view()); 
cudaCheck(cudaGetLastError()); cudaDeviceSynchronize(); - cms::cuda::launch(gpuVertexFinder::fitVerticesKernel, {1, 1024 - 256}, onGPU_d.get(), ws_d.get(), 50.f); + cms::cuda::launch(gpuVertexFinder::fitVerticesKernel, {1, 1024 - 256}, onGPU_d.view(), ws_d.view(), 50.f); cudaCheck(cudaGetLastError()); - cudaCheck(cudaMemcpy(&nv, LOC_ONGPU(nvFinal), sizeof(uint32_t), cudaMemcpyDeviceToHost)); + cudaCheck(cudaMemcpy(&nv, &onGPU_d.view().nvFinal(), sizeof(uint32_t), cudaMemcpyDeviceToHost)); #else - print(onGPU_d.get(), ws_d.get()); - CLUSTERIZE(onGPU_d.get(), ws_d.get(), kk, par[0], par[1], par[2]); - print(onGPU_d.get(), ws_d.get()); - fitVertices(onGPU_d.get(), ws_d.get(), 50.f); - nv = onGPU_d->nvFinal; + print(onGPU_d.view(), ws_d.view()); + CLUSTERIZE(onGPU_d.view(), ws_d.view(), kk, par[0], par[1], par[2]); + print(onGPU_d.view(), ws_d.view()); + gpuVertexFinder::fitVertices(onGPU_d.view(), ws_d.view(), 50.f); + nv = onGPU_d.view().nvFinal(); #endif if (nv == 0) { @@ -221,18 +233,18 @@ int main() { nn = hnn; ind = hind; #else - zv = onGPU_d->zv; - wv = onGPU_d->wv; - ptv2 = onGPU_d->ptv2; - nn = onGPU_d->ndof; - ind = onGPU_d->sortInd; + zv = onGPU_d.view().zv(); + wv = onGPU_d.view().wv(); + ptv2 = onGPU_d.view().ptv2(); + nn = onGPU_d.view().ndof(); + ind = onGPU_d.view().sortInd(); #endif #ifdef __CUDACC__ - cudaCheck(cudaMemcpy(nn, LOC_ONGPU(ndof), nv * sizeof(int32_t), cudaMemcpyDeviceToHost)); - cudaCheck(cudaMemcpy(chi2, LOC_ONGPU(chi2), nv * sizeof(float), cudaMemcpyDeviceToHost)); + cudaCheck(cudaMemcpy(nn, onGPU_d.view().ndof(), nv * sizeof(int32_t), cudaMemcpyDeviceToHost)); + cudaCheck(cudaMemcpy(chi2, onGPU_d.view().chi2(), nv * sizeof(float), cudaMemcpyDeviceToHost)); #else - memcpy(chi2, LOC_ONGPU(chi2), nv * sizeof(float)); + memcpy(chi2, onGPU_d.view().chi2(), nv * sizeof(float)); #endif for (auto j = 0U; j < nv; ++j) @@ -244,14 +256,14 @@ int main() { } #ifdef __CUDACC__ - cms::cuda::launch(gpuVertexFinder::fitVerticesKernel, {1, 1024 - 256}, 
onGPU_d.get(), ws_d.get(), 50.f); - cudaCheck(cudaMemcpy(&nv, LOC_ONGPU(nvFinal), sizeof(uint32_t), cudaMemcpyDeviceToHost)); - cudaCheck(cudaMemcpy(nn, LOC_ONGPU(ndof), nv * sizeof(int32_t), cudaMemcpyDeviceToHost)); - cudaCheck(cudaMemcpy(chi2, LOC_ONGPU(chi2), nv * sizeof(float), cudaMemcpyDeviceToHost)); + cms::cuda::launch(gpuVertexFinder::fitVerticesKernel, {1, 1024 - 256}, onGPU_d.view(), ws_d.view(), 50.f); + cudaCheck(cudaMemcpy(&nv, &onGPU_d.view().nvFinal(), sizeof(uint32_t), cudaMemcpyDeviceToHost)); + cudaCheck(cudaMemcpy(nn, onGPU_d.view().ndof(), nv * sizeof(int32_t), cudaMemcpyDeviceToHost)); + cudaCheck(cudaMemcpy(chi2, onGPU_d.view().chi2(), nv * sizeof(float), cudaMemcpyDeviceToHost)); #else - fitVertices(onGPU_d.get(), ws_d.get(), 50.f); - nv = onGPU_d->nvFinal; - memcpy(chi2, LOC_ONGPU(chi2), nv * sizeof(float)); + gpuVertexFinder::fitVertices(onGPU_d.view(), ws_d.view(), 50.f); + nv = onGPU_d.view().nvFinal(); + memcpy(chi2, onGPU_d.view().chi2(), nv * sizeof(float)); #endif for (auto j = 0U; j < nv; ++j) @@ -264,26 +276,26 @@ int main() { #ifdef __CUDACC__ // one vertex per block!!! 
- cms::cuda::launch(gpuVertexFinder::splitVerticesKernel, {1024, 64}, onGPU_d.get(), ws_d.get(), 9.f); - cudaCheck(cudaMemcpy(&nv, LOC_WS(nvIntermediate), sizeof(uint32_t), cudaMemcpyDeviceToHost)); + cms::cuda::launch(gpuVertexFinder::splitVerticesKernel, {1024, 64}, onGPU_d.view(), ws_d.view(), 9.f); + cudaCheck(cudaMemcpy(&nv, &ws_d.view().nvIntermediate(), sizeof(uint32_t), cudaMemcpyDeviceToHost)); #else - splitVertices(onGPU_d.get(), ws_d.get(), 9.f); - nv = ws_d->nvIntermediate; + gpuVertexFinder::splitVertices(onGPU_d.view(), ws_d.view(), 9.f); + nv = ws_d.view().nvIntermediate(); #endif std::cout << "after split " << nv << std::endl; #ifdef __CUDACC__ - cms::cuda::launch(gpuVertexFinder::fitVerticesKernel, {1, 1024 - 256}, onGPU_d.get(), ws_d.get(), 5000.f); + cms::cuda::launch(gpuVertexFinder::fitVerticesKernel, {1, 1024 - 256}, onGPU_d.view(), ws_d.view(), 5000.f); cudaCheck(cudaGetLastError()); - cms::cuda::launch(gpuVertexFinder::sortByPt2Kernel, {1, 256}, onGPU_d.get(), ws_d.get()); + cms::cuda::launch(gpuVertexFinder::sortByPt2Kernel, {1, 256}, onGPU_d.view(), ws_d.view()); cudaCheck(cudaGetLastError()); - cudaCheck(cudaMemcpy(&nv, LOC_ONGPU(nvFinal), sizeof(uint32_t), cudaMemcpyDeviceToHost)); + cudaCheck(cudaMemcpy(&nv, &onGPU_d.view().nvFinal(), sizeof(uint32_t), cudaMemcpyDeviceToHost)); #else - fitVertices(onGPU_d.get(), ws_d.get(), 5000.f); - sortByPt2(onGPU_d.get(), ws_d.get()); - nv = onGPU_d->nvFinal; - memcpy(chi2, LOC_ONGPU(chi2), nv * sizeof(float)); + gpuVertexFinder::fitVertices(onGPU_d.view(), ws_d.view(), 5000.f); + gpuVertexFinder::sortByPt2(onGPU_d.view(), ws_d.view()); + nv = onGPU_d.view().nvFinal(); + memcpy(chi2, onGPU_d.view().chi2(), nv * sizeof(float)); #endif if (nv == 0) { @@ -292,12 +304,12 @@ int main() { } #ifdef __CUDACC__ - cudaCheck(cudaMemcpy(zv, LOC_ONGPU(zv), nv * sizeof(float), cudaMemcpyDeviceToHost)); - cudaCheck(cudaMemcpy(wv, LOC_ONGPU(wv), nv * sizeof(float), cudaMemcpyDeviceToHost)); - 
cudaCheck(cudaMemcpy(chi2, LOC_ONGPU(chi2), nv * sizeof(float), cudaMemcpyDeviceToHost)); - cudaCheck(cudaMemcpy(ptv2, LOC_ONGPU(ptv2), nv * sizeof(float), cudaMemcpyDeviceToHost)); - cudaCheck(cudaMemcpy(nn, LOC_ONGPU(ndof), nv * sizeof(int32_t), cudaMemcpyDeviceToHost)); - cudaCheck(cudaMemcpy(ind, LOC_ONGPU(sortInd), nv * sizeof(uint16_t), cudaMemcpyDeviceToHost)); + cudaCheck(cudaMemcpy(zv, onGPU_d.view().zv(), nv * sizeof(float), cudaMemcpyDeviceToHost)); + cudaCheck(cudaMemcpy(wv, onGPU_d.view().wv(), nv * sizeof(float), cudaMemcpyDeviceToHost)); + cudaCheck(cudaMemcpy(chi2, onGPU_d.view().chi2(), nv * sizeof(float), cudaMemcpyDeviceToHost)); + cudaCheck(cudaMemcpy(ptv2, onGPU_d.view().ptv2(), nv * sizeof(float), cudaMemcpyDeviceToHost)); + cudaCheck(cudaMemcpy(nn, onGPU_d.view().ndof(), nv * sizeof(int32_t), cudaMemcpyDeviceToHost)); + cudaCheck(cudaMemcpy(ind, onGPU_d.view().sortInd(), nv * sizeof(uint16_t), cudaMemcpyDeviceToHost)); #endif for (auto j = 0U; j < nv; ++j) if (nn[j] > 0) diff --git a/RecoTauTag/HLTProducers/src/L2TauTagNNProducer.cc b/RecoTauTag/HLTProducers/src/L2TauTagNNProducer.cc index c11b53538c5b0..6b20de6fddb2d 100644 --- a/RecoTauTag/HLTProducers/src/L2TauTagNNProducer.cc +++ b/RecoTauTag/HLTProducers/src/L2TauTagNNProducer.cc @@ -45,12 +45,13 @@ #include "DataFormats/TrajectoryState/interface/LocalTrajectoryParameters.h" #include "DataFormats/GeometrySurface/interface/Plane.h" #include "DataFormats/BeamSpot/interface/BeamSpot.h" -#include "CUDADataFormats/Track/interface/PixelTrackHeterogeneous.h" #include "MagneticField/Records/interface/IdealMagneticFieldRecord.h" #include "CUDADataFormats/SiPixelCluster/interface/gpuClusteringConstants.h" -#include "CUDADataFormats/Track/interface/TrackSoAHeterogeneousT.h" -#include "CUDADataFormats/Vertex/interface/ZVertexSoA.h" -#include "CUDADataFormats/Vertex/interface/ZVertexHeterogeneous.h" + +#include "CUDADataFormats/Track/interface/PixelTrackUtilities.h" +#include 
"CUDADataFormats/Track/interface/TrackSoAHeterogeneousHost.h" +#include "CUDADataFormats/Vertex/interface/ZVertexUtilities.h" +#include "CUDADataFormats/Vertex/interface/ZVertexSoAHeterogeneousHost.h" namespace L2TauTagNNv1 { constexpr int nCellEta = 5; @@ -145,10 +146,11 @@ struct L2TauNNProducerCacheData { }; class L2TauNNProducer : public edm::stream::EDProducer> { - using TrackSoA = pixelTrack::TrackSoAT; - using PixelTrackHeterogeneous = PixelTrackHeterogeneousT; public: + + using TrackSoAHost = pixelTrack::TrackSoAHostPhase1; + struct caloRecHitCollections { const HBHERecHitCollection* hbhe; const HORecHitCollection* ho; @@ -182,16 +184,17 @@ class L2TauNNProducer : public edm::stream::EDProducer& allTaus, - const TrackSoA& patatracks_tsoa, - const ZVertexSoA& patavtx_soa, + const TrackSoAHost& patatracks_tsoa, + const zVertex::ZVertexSoAHost& patavtx_soa, const reco::BeamSpot& beamspot, const MagneticField* magfi); - void selectGoodTracksAndVertices(const ZVertexSoA& patavtx_soa, - const TrackSoA& patatracks_tsoa, + void selectGoodTracksAndVertices(const zVertex::ZVertexSoAHost& patavtx_soa, + const TrackSoAHost& patatracks_tsoa, std::vector& trkGood, std::vector& vtxGood); + std::pair impactParameter(int it, - const TrackSoA& patatracks_tsoa, + const TrackSoAHost& patatracks_tsoa, float patatrackPhi, const reco::BeamSpot& beamspot, const MagneticField* magfi); @@ -210,8 +213,8 @@ class L2TauNNProducer : public edm::stream::EDProducer eeToken_; const edm::ESGetToken geometryToken_; const edm::ESGetToken bFieldToken_; - const edm::EDGetTokenT pataVerticesToken_; - const edm::EDGetTokenT pataTracksToken_; + const edm::EDGetTokenT pataVerticesToken_; + const edm::EDGetTokenT pataTracksToken_; const edm::EDGetTokenT beamSpotToken_; const unsigned int maxVtx_; const float fractionSumPt2_; @@ -295,7 +298,7 @@ L2TauNNProducer::L2TauNNProducer(const edm::ParameterSet& cfg, const L2TauNNProd eeToken_(consumes(cfg.getParameter("eeInput"))), 
geometryToken_(esConsumes()), bFieldToken_(esConsumes()), - pataVerticesToken_(consumes(cfg.getParameter("pataVertices"))), + pataVerticesToken_(consumes(cfg.getParameter("pataVertices"))), pataTracksToken_(consumes(cfg.getParameter("pataTracks"))), beamSpotToken_(consumes(cfg.getParameter("BeamSpot"))), maxVtx_(cfg.getParameter("maxVtx")), @@ -572,44 +575,45 @@ void L2TauNNProducer::fillCaloRecHits(tensorflow::Tensor& cellGridMatrix, } } -void L2TauNNProducer::selectGoodTracksAndVertices(const ZVertexSoA& patavtx_soa, - const TrackSoA& patatracks_tsoa, +void L2TauNNProducer::selectGoodTracksAndVertices(const zVertex::ZVertexSoAHost& patavtx_soa, + const TrackSoAHost& patatracks_tsoa, std::vector& trkGood, std::vector& vtxGood) { - const auto maxTracks = patatracks_tsoa.stride(); - const int nv = patavtx_soa.nvFinal; + using patatrackHelpers = tracksUtilities; + const auto maxTracks = patatracks_tsoa.view().metadata().size(); + const int nv = patavtx_soa.view().nvFinal(); trkGood.clear(); trkGood.reserve(maxTracks); vtxGood.clear(); vtxGood.reserve(nv); - auto const* quality = patatracks_tsoa.qualityData(); + auto const* quality = patatracks_tsoa.view().quality(); // No need to sort either as the algorithms is just using the max (not even the location, just the max value of pt2sum). 
std::vector pTSquaredSum(nv, 0); std::vector nTrkAssociated(nv, 0); for (int32_t trk_idx = 0; trk_idx < maxTracks; ++trk_idx) { - auto nHits = patatracks_tsoa.nHits(trk_idx); + auto nHits = patatrackHelpers::nHits(patatracks_tsoa.view(), trk_idx); if (nHits == 0) { break; } - int vtx_ass_to_track = patavtx_soa.idv[trk_idx]; + int vtx_ass_to_track = patavtx_soa.view()[trk_idx].idv(); if (vtx_ass_to_track >= 0 && vtx_ass_to_track < nv) { - auto patatrackPt = patatracks_tsoa.pt[trk_idx]; + auto patatrackPt = patatracks_tsoa.view()[trk_idx].pt(); ++nTrkAssociated[vtx_ass_to_track]; - if (patatrackPt >= trackPtMin_ && patatracks_tsoa.chi2(trk_idx) <= trackChi2Max_) { + if (patatrackPt >= trackPtMin_ && patatracks_tsoa.const_view()[trk_idx].chi2() <= trackChi2Max_) { patatrackPt = std::min(patatrackPt, trackPtMax_); pTSquaredSum[vtx_ass_to_track] += patatrackPt * patatrackPt; } } - if (nHits > 0 and quality[trk_idx] >= pixelTrack::Quality::loose) { + if (nHits > 0 and quality[trk_idx] >= pixelTrackSoA::Quality::loose) { trkGood.push_back(trk_idx); } } if (nv > 0) { const auto minFOM_fromFrac = (*std::max_element(pTSquaredSum.begin(), pTSquaredSum.end())) * fractionSumPt2_; for (int j = nv - 1; j >= 0 && vtxGood.size() < maxVtx_; --j) { - auto vtx_idx = patavtx_soa.sortInd[j]; + auto vtx_idx = patavtx_soa.view()[j].sortInd(); assert(vtx_idx < nv); if (nTrkAssociated[vtx_idx] >= 2 && pTSquaredSum[vtx_idx] >= minFOM_fromFrac && pTSquaredSum[vtx_idx] > minSumPt2_) { @@ -620,15 +624,14 @@ void L2TauNNProducer::selectGoodTracksAndVertices(const ZVertexSoA& patavtx_soa, } std::pair L2TauNNProducer::impactParameter(int it, - const TrackSoA& patatracks_tsoa, + const TrackSoAHost& patatracks_tsoa, float patatrackPhi, const reco::BeamSpot& beamspot, const MagneticField* magfi) { - auto const& fit = patatracks_tsoa.stateAtBS; /* dxy and dz */ riemannFit::Vector5d ipar, opar; riemannFit::Matrix5d icov, ocov; - fit.copyToDense(ipar, icov, it); + 
tracksUtilities::copyToDense(patatracks_tsoa.view(), ipar, icov, it); riemannFit::transformToPerigeePlane(ipar, icov, opar, ocov); LocalTrajectoryParameters lpar(opar(0), opar(1), opar(2), opar(3), opar(4), 1.); float sp = std::sin(patatrackPhi); @@ -653,11 +656,12 @@ std::pair L2TauNNProducer::impactParameter(int it, void L2TauNNProducer::fillPatatracks(tensorflow::Tensor& cellGridMatrix, const std::vector& allTaus, - const TrackSoA& patatracks_tsoa, - const ZVertexSoA& patavtx_soa, + const TrackSoAHost& patatracks_tsoa, + const zVertex::ZVertexSoAHost& patavtx_soa, const reco::BeamSpot& beamspot, const MagneticField* magfi) { using NNInputs = L2TauTagNNv1::NNInputs; + using patatrackHelpers = tracksUtilities; float deta, dphi; int eta_idx = 0; int phi_idx = 0; @@ -678,19 +682,19 @@ void L2TauNNProducer::fillPatatracks(tensorflow::Tensor& cellGridMatrix, const float tauPhi = allTaus[tau_idx]->phi(); for (const auto it : trkGood) { - const float patatrackPt = patatracks_tsoa.pt[it]; + const float patatrackPt = patatracks_tsoa.const_view()[it].pt(); if (patatrackPt <= 0) continue; - const float patatrackPhi = patatracks_tsoa.phi(it); - const float patatrackEta = patatracks_tsoa.eta(it); - const float patatrackCharge = patatracks_tsoa.charge(it); - const float patatrackChi2OverNdof = patatracks_tsoa.chi2(it); - const auto nHits = patatracks_tsoa.nHits(it); + const float patatrackPhi = patatrackHelpers::phi(patatracks_tsoa.const_view(), it); + const float patatrackEta = patatracks_tsoa.const_view()[it].eta(); + const float patatrackCharge = patatrackHelpers::charge(patatracks_tsoa.const_view(), it); + const float patatrackChi2OverNdof = patatracks_tsoa.view()[it].chi2(); + const auto nHits = patatrackHelpers::nHits(patatracks_tsoa.const_view(), it); if (nHits <= 0) continue; const int patatrackNdof = 2 * std::min(6, nHits) - 5; - const int vtx_idx_assTrk = patavtx_soa.idv[it]; + const int vtx_idx_assTrk = patavtx_soa.view()[it].idv(); if (reco::deltaR2(patatrackEta, 
patatrackPhi, tauEta, tauPhi) < dR2_max) { std::tie(deta, dphi, eta_idx, phi_idx) = getEtaPhiIndices(patatrackEta, patatrackPhi, allTaus[tau_idx]->polarP4()); @@ -766,8 +770,8 @@ void L2TauNNProducer::produce(edm::Event& event, const edm::EventSetup& eventset const auto eeCal = event.getHandle(eeToken_); const auto hbhe = event.getHandle(hbheToken_); const auto ho = event.getHandle(hoToken_); - const auto& patatracks_SoA = *event.get(pataTracksToken_); - const auto& vertices_SoA = *event.get(pataVerticesToken_); + auto& patatracks_SoA = event.get(pataTracksToken_); + auto& vertices_SoA = event.get(pataVerticesToken_); const auto bsHandle = event.getHandle(beamSpotToken_); auto const fieldESH = eventsetup.getHandle(bFieldToken_); diff --git a/RecoTracker/TkSeedGenerator/plugins/SeedProducerFromSoA.cc b/RecoTracker/TkSeedGenerator/plugins/SeedProducerFromSoA.cc index 9023640f62d5a..2724f54a5267b 100644 --- a/RecoTracker/TkSeedGenerator/plugins/SeedProducerFromSoA.cc +++ b/RecoTracker/TkSeedGenerator/plugins/SeedProducerFromSoA.cc @@ -1,4 +1,4 @@ -#include "CUDADataFormats/Track/interface/PixelTrackHeterogeneous.h" +#include "CUDADataFormats/Track/interface/TrackSoAHeterogeneousHost.h" #include "DataFormats/BeamSpot/interface/BeamSpot.h" #include "DataFormats/GeometrySurface/interface/Plane.h" #include "DataFormats/TrackerCommon/interface/TrackerTopology.h" @@ -46,7 +46,7 @@ class SeedProducerFromSoAT : public edm::global::EDProducer<> { // Event data tokens const edm::EDGetTokenT tBeamSpot_; - const edm::EDGetTokenT> tokenTrack_; + const edm::EDGetTokenT> tokenTrack_; // Event setup tokens const edm::ESGetToken idealMagneticFieldToken_; const edm::ESGetToken trackerDigiGeometryToken_; @@ -84,6 +84,8 @@ void SeedProducerFromSoAT::produce(edm::StreamID streamID, // std::cout << "Converting gpu helix to trajectory seed" << std::endl; auto result = std::make_unique(); + using trackHelper = tracksUtilities; + auto const& fieldESH = 
iSetup.getHandle(idealMagneticFieldToken_); auto const& tracker = iSetup.getHandle(trackerDigiGeometryToken_); auto const& dus = tracker->detUnits(); @@ -95,21 +97,20 @@ void SeedProducerFromSoAT::produce(edm::StreamID streamID, // std::cout << "beamspot " << bsh.x0() << ' ' << bsh.y0() << ' ' << bsh.z0() << std::endl; GlobalPoint bs(bsh.x0(), bsh.y0(), bsh.z0()); - const auto& tsoa = *(iEvent.get(tokenTrack_)); + auto& tsoa = iEvent.get(tokenTrack_); - auto const* quality = tsoa.qualityData(); - auto const& fit = tsoa.stateAtBS; - auto const& detIndices = tsoa.detIndices; - auto maxTracks = tsoa.stride(); + auto const* quality = tsoa.view().quality(); + auto const& detIndices = tsoa.view().detIndices(); + auto maxTracks = tsoa.view().metadata().size(); int32_t nt = 0; for (int32_t it = 0; it < maxTracks; ++it) { - auto nHits = tsoa.nHits(it); + auto nHits = trackHelper::nHits(tsoa.view(), it); if (nHits == 0) break; // this is a guard: maybe we need to move to nTracks... auto q = quality[it]; - if (q != pixelTrack::Quality::loose) + if (q != pixelTrackSoA::Quality::loose) continue; // FIXME if (nHits < minNumberOfHits_) continue; @@ -126,11 +127,11 @@ void SeedProducerFromSoAT::produce(edm::StreamID streamID, // mind: this values are respect the beamspot! - float phi = tsoa.phi(it); + float phi = trackHelper::phi(tsoa.view(), it); riemannFit::Vector5d ipar, opar; riemannFit::Matrix5d icov, ocov; - fit.copyToDense(ipar, icov, it); + trackHelper::copyToDense(tsoa.view(), ipar, icov, it); riemannFit::transformToPerigeePlane(ipar, icov, opar, ocov); LocalTrajectoryParameters lpar(opar(0), opar(1), opar(2), opar(3), opar(4), 1.);