Skip to content

Commit b44af67

Browse files
authored
Merge pull request #34286 from CMSTrackerDPG/cpefast_wo_track_angle_30_06_2021
make CPEFast to better reproduce Generic (w/o track angle)
2 parents 232ab28 + a80a100 commit b44af67

14 files changed

+360
-116
lines changed
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,15 @@
1+
#ifndef CUDADataFormats_TrackingRecHit_interface_SiPixelHitStatus_H
2+
#define CUDADataFormats_TrackingRecHit_interface_SiPixelHitStatus_H
3+
4+
#include <cstdint>
5+
6+
// more information on bit fields : https://en.cppreference.com/w/cpp/language/bit_field
7+
struct SiPixelHitStatus {
8+
bool isBigX : 1; // ∈[0,1]
9+
bool isOneX : 1; // ∈[0,1]
10+
bool isBigY : 1; // ∈[0,1]
11+
bool isOneY : 1; // ∈[0,1]
12+
uint8_t qBin : 3; // ∈[0,1,...,7]
13+
};
14+
15+
#endif

CUDADataFormats/TrackingRecHit/interface/TrackingRecHit2DHeterogeneous.h

+48-25
Original file line numberDiff line numberDiff line change
@@ -14,10 +14,12 @@ class TrackingRecHit2DHeterogeneous {
1414

1515
TrackingRecHit2DHeterogeneous() = default;
1616

17-
explicit TrackingRecHit2DHeterogeneous(uint32_t nHits,
18-
pixelCPEforGPU::ParamsOnGPU const* cpeParams,
19-
uint32_t const* hitsModuleStart,
20-
cudaStream_t stream);
17+
explicit TrackingRecHit2DHeterogeneous(
18+
uint32_t nHits,
19+
pixelCPEforGPU::ParamsOnGPU const* cpeParams,
20+
uint32_t const* hitsModuleStart,
21+
cudaStream_t stream,
22+
TrackingRecHit2DHeterogeneous<cms::cudacompat::GPUTraits> const* input = nullptr);
2123

2224
~TrackingRecHit2DHeterogeneous() = default;
2325

@@ -41,6 +43,9 @@ class TrackingRecHit2DHeterogeneous {
4143
cms::cuda::host::unique_ptr<float[]> localCoordToHostAsync(cudaStream_t stream) const;
4244
cms::cuda::host::unique_ptr<uint32_t[]> hitsModuleStartToHostAsync(cudaStream_t stream) const;
4345

46+
// needs specialization for Host
47+
void copyFromGPU(TrackingRecHit2DHeterogeneous<cms::cudacompat::GPUTraits> const* input, cudaStream_t stream);
48+
4449
private:
4550
static constexpr uint32_t n16 = 4; // number of elements in m_store16
4651
static constexpr uint32_t n32 = 10; // number of elements in m_store32
@@ -65,20 +70,27 @@ class TrackingRecHit2DHeterogeneous {
6570
int16_t* m_iphi;
6671
};
6772

73+
using TrackingRecHit2DGPU = TrackingRecHit2DHeterogeneous<cms::cudacompat::GPUTraits>;
74+
using TrackingRecHit2DCUDA = TrackingRecHit2DHeterogeneous<cms::cudacompat::GPUTraits>;
75+
using TrackingRecHit2DCPU = TrackingRecHit2DHeterogeneous<cms::cudacompat::CPUTraits>;
76+
using TrackingRecHit2DHost = TrackingRecHit2DHeterogeneous<cms::cudacompat::HostTraits>;
77+
6878
#include "HeterogeneousCore/CUDAUtilities/interface/copyAsync.h"
6979
#include "HeterogeneousCore/CUDAUtilities/interface/cudaCheck.h"
7080

7181
template <typename Traits>
72-
TrackingRecHit2DHeterogeneous<Traits>::TrackingRecHit2DHeterogeneous(uint32_t nHits,
73-
pixelCPEforGPU::ParamsOnGPU const* cpeParams,
74-
uint32_t const* hitsModuleStart,
75-
cudaStream_t stream)
82+
TrackingRecHit2DHeterogeneous<Traits>::TrackingRecHit2DHeterogeneous(
83+
uint32_t nHits,
84+
pixelCPEforGPU::ParamsOnGPU const* cpeParams,
85+
uint32_t const* hitsModuleStart,
86+
cudaStream_t stream,
87+
TrackingRecHit2DHeterogeneous<cms::cudacompat::GPUTraits> const* input)
7688
: m_nHits(nHits), m_hitsModuleStart(hitsModuleStart) {
7789
auto view = Traits::template make_host_unique<TrackingRecHit2DSOAView>(stream);
7890

7991
view->m_nHits = nHits;
80-
m_view = Traits::template make_device_unique<TrackingRecHit2DSOAView>(stream);
81-
m_AverageGeometryStore = Traits::template make_device_unique<TrackingRecHit2DSOAView::AverageGeometry>(stream);
92+
m_view = Traits::template make_unique<TrackingRecHit2DSOAView>(stream); // leave it on host and pass it by value?
93+
m_AverageGeometryStore = Traits::template make_unique<TrackingRecHit2DSOAView::AverageGeometry>(stream);
8294
view->m_averageGeometry = m_AverageGeometryStore.get();
8395
view->m_cpeParams = cpeParams;
8496
view->m_hitsModuleStart = hitsModuleStart;
@@ -98,15 +110,21 @@ TrackingRecHit2DHeterogeneous<Traits>::TrackingRecHit2DHeterogeneous(uint32_t nH
98110
// if ordering is relevant they may have to be stored phi-ordered by layer or so
99111
// this will break 1to1 correspondence with cluster and module locality
100112
// so unless proven VERY inefficient we keep it ordered as generated
101-
m_store16 = Traits::template make_device_unique<uint16_t[]>(nHits * n16, stream);
102-
m_store32 =
103-
Traits::template make_device_unique<float[]>(nHits * n32 + phase1PixelTopology::numberOfLayers + 1, stream);
104-
m_PhiBinnerStore = Traits::template make_device_unique<TrackingRecHit2DSOAView::PhiBinner>(stream);
113+
114+
// host copy is "reduced" (to be reviewed at some point)
115+
if constexpr (std::is_same<Traits, cms::cudacompat::HostTraits>::value) {
116+
// it has to compile for ALL cases
117+
copyFromGPU(input, stream);
118+
} else {
119+
assert(input == nullptr);
120+
m_store16 = Traits::template make_unique<uint16_t[]>(nHits * n16, stream);
121+
m_store32 = Traits::template make_unique<float[]>(nHits * n32 + phase1PixelTopology::numberOfLayers + 1, stream);
122+
m_PhiBinnerStore = Traits::template make_unique<TrackingRecHit2DSOAView::PhiBinner>(stream);
123+
}
105124

106125
static_assert(sizeof(TrackingRecHit2DSOAView::hindex_type) == sizeof(float));
107126
static_assert(sizeof(TrackingRecHit2DSOAView::hindex_type) == sizeof(TrackingRecHit2DSOAView::PhiBinner::index_type));
108127

109-
auto get16 = [&](int i) { return m_store16.get() + i * nHits; };
110128
auto get32 = [&](int i) { return m_store32.get() + i * nHits; };
111129

112130
// copy all the pointers
@@ -118,20 +136,25 @@ TrackingRecHit2DHeterogeneous<Traits>::TrackingRecHit2DHeterogeneous(uint32_t nH
118136
view->m_yl = get32(1);
119137
view->m_xerr = get32(2);
120138
view->m_yerr = get32(3);
139+
view->m_chargeAndStatus = reinterpret_cast<uint32_t*>(get32(4));
121140

122-
view->m_xg = get32(4);
123-
view->m_yg = get32(5);
124-
view->m_zg = get32(6);
125-
view->m_rg = get32(7);
141+
if constexpr (!std::is_same<Traits, cms::cudacompat::HostTraits>::value) {
142+
assert(input == nullptr);
143+
view->m_xg = get32(5);
144+
view->m_yg = get32(6);
145+
view->m_zg = get32(7);
146+
view->m_rg = get32(8);
126147

127-
m_iphi = view->m_iphi = reinterpret_cast<int16_t*>(get16(0));
148+
auto get16 = [&](int i) { return m_store16.get() + i * nHits; };
149+
m_iphi = view->m_iphi = reinterpret_cast<int16_t*>(get16(1));
128150

129-
view->m_charge = reinterpret_cast<int32_t*>(get32(8));
130-
view->m_xsize = reinterpret_cast<int16_t*>(get16(2));
131-
view->m_ysize = reinterpret_cast<int16_t*>(get16(3));
132-
view->m_detInd = get16(1);
151+
view->m_xsize = reinterpret_cast<int16_t*>(get16(2));
152+
view->m_ysize = reinterpret_cast<int16_t*>(get16(3));
153+
view->m_detInd = get16(0);
133154

134-
m_hitsLayerStart = view->m_hitsLayerStart = reinterpret_cast<uint32_t*>(get32(n32));
155+
m_phiBinner = view->m_phiBinner = m_PhiBinnerStore.get();
156+
m_hitsLayerStart = view->m_hitsLayerStart = reinterpret_cast<uint32_t*>(get32(n32));
157+
}
135158

136159
// transfer view
137160
if constexpr (std::is_same<Traits, cms::cudacompat::GPUTraits>::value) {
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,53 @@
1+
#ifndef CUDADataFormats_TrackingRecHit_interface_TrackingRecHit2DReduced_h
2+
#define CUDADataFormats_TrackingRecHit_interface_TrackingRecHit2DReduced_h
3+
4+
#include "CUDADataFormats/TrackingRecHit/interface/TrackingRecHit2DSOAView.h"
5+
#include "CUDADataFormats/Common/interface/HostProduct.h"
6+
7+
// a reduced (in content and therefore in size) version to be used on CPU for Legacy reconstruction
8+
class TrackingRecHit2DReduced {
9+
public:
10+
using HLPstorage = HostProduct<float[]>;
11+
using HIDstorage = HostProduct<uint16_t[]>;
12+
13+
template <typename UP32, typename UP16>
14+
TrackingRecHit2DReduced(UP32&& istore32, UP16&& istore16, int nhits)
15+
: m_store32(std::move(istore32)), m_store16(std::move(istore16)), m_nHits(nhits) {
16+
auto get32 = [&](int i) { return const_cast<float*>(m_store32.get()) + i * nhits; };
17+
18+
// copy all the pointers (better be in sync with the producer store)
19+
20+
m_view.m_xl = get32(0);
21+
m_view.m_yl = get32(1);
22+
m_view.m_xerr = get32(2);
23+
m_view.m_yerr = get32(3);
24+
m_view.m_chargeAndStatus = reinterpret_cast<uint32_t*>(get32(4));
25+
m_view.m_detInd = const_cast<uint16_t*>(m_store16.get());
26+
}
27+
28+
// view only!
29+
TrackingRecHit2DReduced(TrackingRecHit2DSOAView const& iview, int nhits) : m_view(iview), m_nHits(nhits) {}
30+
31+
TrackingRecHit2DReduced() = default;
32+
~TrackingRecHit2DReduced() = default;
33+
34+
TrackingRecHit2DReduced(const TrackingRecHit2DReduced&) = delete;
35+
TrackingRecHit2DReduced& operator=(const TrackingRecHit2DReduced&) = delete;
36+
TrackingRecHit2DReduced(TrackingRecHit2DReduced&&) = default;
37+
TrackingRecHit2DReduced& operator=(TrackingRecHit2DReduced&&) = default;
38+
39+
TrackingRecHit2DSOAView& view() { return m_view; }
40+
TrackingRecHit2DSOAView const& view() const { return m_view; }
41+
42+
auto nHits() const { return m_nHits; }
43+
44+
private:
45+
TrackingRecHit2DSOAView m_view;
46+
47+
HLPstorage m_store32;
48+
HIDstorage m_store16;
49+
50+
int m_nHits;
51+
};
52+
53+
#endif

CUDADataFormats/TrackingRecHit/interface/TrackingRecHit2DSOAView.h

+19-3
Original file line numberDiff line numberDiff line change
@@ -7,13 +7,17 @@
77
#include "HeterogeneousCore/CUDAUtilities/interface/HistoContainer.h"
88
#include "HeterogeneousCore/CUDAUtilities/interface/cudaCompat.h"
99
#include "Geometry/TrackerGeometryBuilder/interface/phase1PixelTopology.h"
10+
#include "CUDADataFormats/TrackingRecHit/interface/SiPixelHitStatus.h"
1011

1112
namespace pixelCPEforGPU {
1213
struct ParamsOnGPU;
1314
}
1415

1516
class TrackingRecHit2DSOAView {
1617
public:
18+
using Status = SiPixelHitStatus;
19+
static_assert(sizeof(Status) == sizeof(uint8_t));
20+
1721
using hindex_type = uint32_t; // if above is <=2^32
1822

1923
using PhiBinner = cms::cuda::HistoContainer<int16_t, 128, -1, 8 * sizeof(int16_t), hindex_type, 10>;
@@ -22,6 +26,7 @@ class TrackingRecHit2DSOAView {
2226

2327
template <typename>
2428
friend class TrackingRecHit2DHeterogeneous;
29+
friend class TrackingRecHit2DReduced;
2530

2631
__device__ __forceinline__ uint32_t nHits() const { return m_nHits; }
2732

@@ -47,8 +52,18 @@ class TrackingRecHit2DSOAView {
4752
__device__ __forceinline__ int16_t& iphi(int i) { return m_iphi[i]; }
4853
__device__ __forceinline__ int16_t iphi(int i) const { return __ldg(m_iphi + i); }
4954

50-
__device__ __forceinline__ int32_t& charge(int i) { return m_charge[i]; }
51-
__device__ __forceinline__ int32_t charge(int i) const { return __ldg(m_charge + i); }
55+
__device__ __forceinline__ void setChargeAndStatus(int i, uint32_t ich, Status is) {
56+
ich = std::min(ich, chargeMask());
57+
uint32_t w = *reinterpret_cast<uint8_t*>(&is);
58+
ich |= (w << 24);
59+
m_chargeAndStatus[i] = ich;
60+
}
61+
62+
__device__ __forceinline__ Status status(int i) const {
63+
uint8_t w = __ldg(m_chargeAndStatus + i) >> 24;
64+
return *reinterpret_cast<Status*>(&w);
65+
}
66+
5267
__device__ __forceinline__ int16_t& clusterSizeX(int i) { return m_xsize[i]; }
5368
__device__ __forceinline__ int16_t clusterSizeX(int i) const { return __ldg(m_xsize + i); }
5469
__device__ __forceinline__ int16_t& clusterSizeY(int i) { return m_ysize[i]; }
@@ -79,7 +94,8 @@ class TrackingRecHit2DSOAView {
7994
int16_t* m_iphi;
8095

8196
// cluster properties
82-
int32_t* m_charge;
97+
static constexpr uint32_t chargeMask() { return (1 << 24) - 1; }
98+
uint32_t* m_chargeAndStatus;
8399
int16_t* m_xsize;
84100
int16_t* m_ysize;
85101
uint16_t* m_detInd;

CUDADataFormats/TrackingRecHit/src/TrackingRecHit2DHeterogeneous.cc

+10-3
Original file line numberDiff line numberDiff line change
@@ -6,15 +6,22 @@
66

77
template <>
88
cms::cuda::host::unique_ptr<float[]> TrackingRecHit2DCUDA::localCoordToHostAsync(cudaStream_t stream) const {
9-
auto ret = cms::cuda::make_host_unique<float[]>(4 * nHits(), stream);
10-
cms::cuda::copyAsync(ret, m_store32, 4 * nHits(), stream);
9+
auto ret = cms::cuda::make_host_unique<float[]>(5 * nHits(), stream);
10+
cms::cuda::copyAsync(ret, m_store32, 5 * nHits(), stream);
1111
return ret;
1212
}
1313

1414
template <>
15-
cms::cuda::host::unique_ptr<uint32_t[]> TrackingRecHit2DCUDA::hitsModuleStartToHostAsync(cudaStream_t stream) const {
15+
cms::cuda::host::unique_ptr<uint32_t[]> TrackingRecHit2DGPU::hitsModuleStartToHostAsync(cudaStream_t stream) const {
1616
auto ret = cms::cuda::make_host_unique<uint32_t[]>(gpuClustering::maxNumModules + 1, stream);
1717
cudaCheck(cudaMemcpyAsync(
1818
ret.get(), m_hitsModuleStart, sizeof(uint32_t) * (gpuClustering::maxNumModules + 1), cudaMemcpyDefault, stream));
1919
return ret;
2020
}
21+
22+
// the only specialization needed
23+
template <>
24+
void TrackingRecHit2DHost::copyFromGPU(TrackingRecHit2DGPU const* input, cudaStream_t stream) {
25+
assert(input);
26+
m_store32 = input->localCoordToHostAsync(stream);
27+
}

CUDADataFormats/TrackingRecHit/src/classes.h

+1
Original file line numberDiff line numberDiff line change
@@ -3,6 +3,7 @@
33

44
#include "CUDADataFormats/Common/interface/Product.h"
55
#include "CUDADataFormats/TrackingRecHit/interface/TrackingRecHit2DHeterogeneous.h"
6+
#include "CUDADataFormats/TrackingRecHit/interface/TrackingRecHit2DReduced.h"
67
#include "DataFormats/Common/interface/Wrapper.h"
78

89
#endif // CUDADataFormats_SiPixelCluster_src_classes_h
Original file line numberDiff line numberDiff line change
@@ -1,8 +1,10 @@
11
<lcgdict>
22
<class name="TrackingRecHit2DCPU" persistent="false"/>
3-
<class name="TrackingRecHit2DHost" persistent="false"/>
4-
<class name="cms::cuda::Product<TrackingRecHit2DGPU>" persistent="false"/>
53
<class name="edm::Wrapper<TrackingRecHit2DCPU>" persistent="false"/>
4+
<class name="TrackingRecHit2DHost" persistent="false"/>
65
<class name="edm::Wrapper<TrackingRecHit2DHost>" persistent="false"/>
6+
<class name="cms::cuda::Product<TrackingRecHit2DGPU>" persistent="false"/>
77
<class name="edm::Wrapper<cms::cuda::Product<TrackingRecHit2DGPU>>" persistent="false"/>
8+
<class name="TrackingRecHit2DReduced" persistent="false"/>
9+
<class name="edm::Wrapper<TrackingRecHit2DReduced>" persistent="false"/>
810
</lcgdict>

CUDADataFormats/TrackingRecHit/test/TrackingRecHit2DCUDA_t.cpp

+6-1
Original file line numberDiff line numberDiff line change
@@ -15,12 +15,17 @@ int main() {
1515
cudaStream_t stream;
1616
cudaCheck(cudaStreamCreateWithFlags(&stream, cudaStreamNonBlocking));
1717

18+
auto nHits = 200;
1819
// inner scope to deallocate memory before destroying the stream
1920
{
20-
auto nHits = 200;
2121
TrackingRecHit2DCUDA tkhit(nHits, nullptr, nullptr, stream);
2222

2323
testTrackingRecHit2D::runKernels(tkhit.view());
24+
25+
TrackingRecHit2DHost tkhitH(nHits, nullptr, nullptr, stream, &tkhit);
26+
cudaStreamSynchronize(stream);
27+
assert(tkhitH.view());
28+
assert(tkhitH.view()->nHits() == unsigned(nHits));
2429
}
2530

2631
cudaCheck(cudaStreamDestroy(stream));

0 commit comments

Comments
 (0)