Skip to content

Commit

Permalink
Introduce Cluster Charge Cut, optimize Histogram (bucket sorting) (#171)
Browse files Browse the repository at this point in the history
  • Loading branch information
VinInn authored and fwyzard committed Jan 15, 2021
1 parent 3c43f0a commit fd748ed
Show file tree
Hide file tree
Showing 2 changed files with 80 additions and 82 deletions.
124 changes: 61 additions & 63 deletions RecoPixelVertexing/PixelTriplets/plugins/GPUCACell.h
Original file line number Diff line number Diff line change
Expand Up @@ -6,87 +6,92 @@

#include <cuda_runtime.h>

#include "GPUHitsAndDoublets.h"
#include "RecoLocalTracker/SiPixelRecHits/plugins/siPixelRecHitsHeterogeneousProduct.h"
#include "HeterogeneousCore/CUDAUtilities/interface/GPUVecArray.h"
#include "HeterogeneousCore/CUDAUtilities/interface/GPUSimpleVector.h"

struct Quadruplet {
int hitId[4];
using hindex_type = siPixelRecHitsHeterogeneousProduct::hindex_type;
hindex_type hitId[4];
};


class GPUCACell {
public:

using Hits = siPixelRecHitsHeterogeneousProduct::HitsOnGPU;
using hindex_type = siPixelRecHitsHeterogeneousProduct::hindex_type;

GPUCACell() = default;
#ifdef __CUDACC__

__host__ __device__
void init(siPixelRecHitsHeterogeneousProduct::HitsOnGPU const & hh,
int layerPairId, int doubletId, int innerHitId,int outerHitId)
__device__ __forceinline__
void init(Hits const & hh,
int layerPairId, int doubletId,
hindex_type innerHitId, hindex_type outerHitId)
{
theInnerHitId = innerHitId;
theOuterHitId = outerHitId;
theDoubletId = doubletId;
theLayerPairId = layerPairId;

theInnerX = hh.xg_d[innerHitId];
theOuterX = hh.xg_d[outerHitId];

theInnerY = hh.yg_d[innerHitId];
theOuterY = hh.yg_d[outerHitId];

theInnerZ = hh.zg_d[innerHitId];
theOuterZ = hh.zg_d[outerHitId];
theInnerR = hh.rg_d[innerHitId];
theOuterR = hh.rg_d[outerHitId];
theInnerZ = __ldg(hh.zg_d+innerHitId);
theInnerR = __ldg(hh.rg_d+innerHitId);
theOuterNeighbors.reset();
}

constexpr float get_inner_x() const { return theInnerX; }
constexpr float get_outer_x() const { return theOuterX; }
constexpr float get_inner_y() const { return theInnerY; }
constexpr float get_outer_y() const { return theOuterY; }
constexpr float get_inner_z() const { return theInnerZ; }
constexpr float get_outer_z() const { return theOuterZ; }
constexpr float get_inner_r() const { return theInnerR; }
constexpr float get_outer_r() const { return theOuterR; }
__device__ __forceinline__ float get_inner_x(Hits const & hh) const { return __ldg(hh.xg_d+theInnerHitId); }
__device__ __forceinline__ float get_outer_x(Hits const & hh) const { return __ldg(hh.xg_d+theOuterHitId); }
__device__ __forceinline__ float get_inner_y(Hits const & hh) const { return __ldg(hh.yg_d+theInnerHitId); }
__device__ __forceinline__ float get_outer_y(Hits const & hh) const { return __ldg(hh.yg_d+theOuterHitId); }
__device__ __forceinline__ float get_inner_z(Hits const & hh) const { return theInnerZ; } // { return __ldg(hh.zg_d+theInnerHitId); } // { return theInnerZ; }
__device__ __forceinline__ float get_outer_z(Hits const & hh) const { return __ldg(hh.zg_d+theOuterHitId); }
__device__ __forceinline__ float get_inner_r(Hits const & hh) const { return theInnerR; } // { return __ldg(hh.rg_d+theInnerHitId); } // { return theInnerR; }
__device__ __forceinline__ float get_outer_r(Hits const & hh) const { return __ldg(hh.rg_d+theOuterHitId); }

constexpr unsigned int get_inner_hit_id() const {
return theInnerHitId;
}
constexpr unsigned int get_outer_hit_id() const {
return theOuterHitId;
}

constexpr void print_cell() const {

__device__
void print_cell() const {
printf("printing cell: %d, on layerPair: %d, innerHitId: %d, outerHitId: "
"%d, innerradius %f, outerRadius %f \n",
theDoubletId, theLayerPairId, theInnerHitId, theOuterHitId,
theInnerR, theOuterR);
theDoubletId, theLayerPairId, theInnerHitId, theOuterHitId
);
}

__host__ __device__
bool check_alignment_and_tag(
const GPUCACell *cells, unsigned int innerCellId, const float ptmin,

__device__
bool check_alignment(Hits const & hh,
GPUCACell const & otherCell, const float ptmin,
const float region_origin_x, const float region_origin_y,
const float region_origin_radius, const float thetaCut,
const float phiCut, const float hardPtCut)
const float phiCut, const float hardPtCut) const
{
auto ro = get_outer_r();
auto zo = get_outer_z();
const auto &otherCell = cells[innerCellId];
auto ri = get_inner_r(hh);
auto zi = get_inner_z(hh);

auto ro = get_outer_r(hh);
auto zo = get_outer_z(hh);

auto r1 = otherCell.get_inner_r();
auto z1 = otherCell.get_inner_z();
bool aligned = areAlignedRZ(r1, z1, ro, zo, ptmin, thetaCut);
auto r1 = otherCell.get_inner_r(hh);
auto z1 = otherCell.get_inner_z(hh);
bool aligned = areAlignedRZ(r1, z1, ri, zi, ro, zo, ptmin, thetaCut);
return (aligned &&
haveSimilarCurvature(cells, innerCellId, ptmin, region_origin_x,
haveSimilarCurvature(hh, otherCell, ptmin, region_origin_x,
region_origin_y, region_origin_radius, phiCut,
hardPtCut));
}


constexpr bool areAlignedRZ(float r1, float z1, float ro, float zo,
__device__ __forceinline__
static bool areAlignedRZ(float r1, float z1, float ri, float zi, float ro, float zo,
const float ptmin,
const float thetaCut) const {
const float thetaCut) {
float radius_diff = std::abs(r1 - ro);
float distance_13_squared =
radius_diff * radius_diff + (z1 - zo) * (z1 - zo);
Expand All @@ -96,27 +101,26 @@ class GPUCACell {
// radius_diff later

float tan_12_13_half_mul_distance_13_squared =
fabs(z1 * (get_inner_r() - ro) + get_inner_z() * (ro - r1) + zo * (r1 - get_inner_r()));
fabs(z1 * (ri - ro) + zi * (ro - r1) + zo * (r1 - ri));
return tan_12_13_half_mul_distance_13_squared * pMin <= thetaCut * distance_13_squared * radius_diff;
}

constexpr bool
haveSimilarCurvature(const GPUCACell *cells, unsigned int innerCellId,
__device__
bool
haveSimilarCurvature(Hits const & hh, GPUCACell const & otherCell,
const float ptmin, const float region_origin_x,
const float region_origin_y,
const float region_origin_radius, const float phiCut,
const float hardPtCut) const {

const auto &otherCell = cells[innerCellId];

auto x1 = otherCell.get_inner_x();
auto y1 = otherCell.get_inner_y();
auto x1 = otherCell.get_inner_x(hh);
auto y1 = otherCell.get_inner_y(hh);

auto x2 = get_inner_x();
auto y2 = get_inner_y();
auto x2 = get_inner_x(hh);
auto y2 = get_inner_y(hh);

auto x3 = get_outer_x();
auto y3 = get_outer_y();
auto x3 = get_outer_x(hh);
auto y3 = get_outer_y(hh);

float distance_13_squared = (x1 - x3) * (x1 - x3) + (y1 - y3) * (y1 - y3);
float tan_12_13_half_mul_distance_13_squared =
Expand All @@ -139,7 +143,7 @@ class GPUCACell {

return distance_13_beamspot_squared <
(region_origin_radius + phiCut) * (region_origin_radius + phiCut);
}
}

// 87 cm/GeV = 1/(3.8T * 0.3)

Expand Down Expand Up @@ -186,13 +190,13 @@ class GPUCACell {
// trying to free the track building process from hardcoded layers, leaving
// the visit of the graph based on the neighborhood connections between cells.

#ifdef __CUDACC__
// #ifdef __CUDACC__

__device__
inline void find_ntuplets(
const GPUCACell *cells,
GPUCACell const * __restrict__ cells,
GPU::SimpleVector<Quadruplet> *foundNtuplets,
GPU::VecArray<unsigned int,3> &tmpNtuplet,
GPU::VecArray<hindex_type,3> &tmpNtuplet,
const unsigned int minHitsPerNtuplet) const
{
// the building process for a track ends if:
Expand Down Expand Up @@ -231,16 +235,10 @@ class GPUCACell {
int theLayerPairId;

private:
unsigned int theInnerHitId;
unsigned int theOuterHitId;
float theInnerX;
float theOuterX;
float theInnerY;
float theOuterY;
float theInnerZ;
float theOuterZ;
float theInnerR;
float theOuterR;
hindex_type theInnerHitId;
hindex_type theOuterHitId;
};

#endif // RecoPixelVertexing_PixelTriplets_plugins_GPUCACell_h
38 changes: 19 additions & 19 deletions RecoPixelVertexing/PixelTriplets/plugins/gpuPixelDoublets.h
Original file line number Diff line number Diff line change
Expand Up @@ -20,12 +20,13 @@ namespace gpuPixelDoublets {

template<typename Hist>
__device__
__forceinline__
void doubletsFromHisto(uint8_t const * __restrict__ layerPairs,
uint32_t nPairs,
GPUCACell * cells,
uint32_t * nCells,
int16_t const * __restrict__ iphi,
Hist const * __restrict__ hist,
Hist const & __restrict__ hist,
uint32_t const * __restrict__ offsets,
siPixelRecHitsHeterogeneousProduct::HitsOnGPU const & __restrict__ hh,
GPU::VecArray< unsigned int, 256> * isOuterHitOfCell,
Expand Down Expand Up @@ -63,6 +64,8 @@ namespace gpuPixelDoublets {
uint8_t outer = layerPairs[2*pairLayerId+1];
assert(outer > inner);

auto hoff = Hist::histOff(outer);

auto i = (0 == pairLayerId) ? j : j-innerLayerCumulativeSize[pairLayerId-1];
i += offsets[inner];

Expand All @@ -73,8 +76,8 @@ namespace gpuPixelDoublets {

// found hit corresponding to our cuda thread, now do the job
auto mep = iphi[i];
auto mez = hh.zg_d[i];
auto mer = hh.rg_d[i];
auto mez = __ldg(hh.zg_d+i);
auto mer = __ldg(hh.rg_d+i);

constexpr float z0cut = 12.f; // cm
constexpr float hardPtCut = 0.5f; // GeV
Expand All @@ -83,22 +86,22 @@ namespace gpuPixelDoublets {
auto ptcut = [&](int j) {
auto r2t4 = minRadius2T4;
auto ri = mer;
auto ro = hh.rg_d[j];
auto ro = __ldg(hh.rg_d+j);
auto dphi = short2phi( min( abs(int16_t(mep-iphi[j])), abs(int16_t(iphi[j]-mep)) ) );
return dphi*dphi * (r2t4 - ri*ro) > (ro-ri)*(ro-ri);
};
auto z0cutoff = [&](int j) {
auto zo = hh.zg_d[j];
auto ro = hh.rg_d[j];
auto zo = __ldg(hh.zg_d+j);
auto ro = __ldg(hh.rg_d+j);
auto dr = ro-mer;
return dr > maxr[pairLayerId] ||
dr<0 || std::abs((mez*ro - mer*zo)) > z0cut*dr;
};

auto iphicut = phicuts[pairLayerId];

auto kl = hist[outer].bin(int16_t(mep-iphicut));
auto kh = hist[outer].bin(int16_t(mep+iphicut));
auto kl = Hist::bin(int16_t(mep-iphicut));
auto kh = Hist::bin(int16_t(mep+iphicut));
auto incr = [](auto & k) { return k = (k+1) % Hist::nbins();};
int tot = 0;
int nmin = 0;
Expand All @@ -108,9 +111,11 @@ namespace gpuPixelDoublets {
int tooMany=0;
for (auto kk = kl; kk != khh; incr(kk)) {
if (kk != kl && kk != kh)
nmin += hist[outer].size(kk);
for (auto p = hist[outer].begin(kk); p < hist[outer].end(kk); ++p) {
auto oi=*p;
nmin += hist.size(kk+hoff);
auto const * __restrict__ p = hist.begin(kk+hoff);
auto const * __restrict__ e = hist.end(kk+hoff);
for (;p < e; ++p) {
auto oi=__ldg(p);
assert(oi>=offsets[outer]);
assert(oi<offsets[outer+1]);

Expand All @@ -128,19 +133,14 @@ namespace gpuPixelDoublets {
if (tooMany > 0)
printf("OuterHitOfCell full for %d in layer %d/%d, %d:%d %d,%d\n", i, inner, outer, kl, kh, nmin, tot);

if (hist[outer].nspills > 0)
printf("spill bin to be checked in %d %d\n", outer, hist[outer].nspills);

// if (0==hist[outer].nspills) assert(tot>=nmin);
// look in spill bin as well....

} // loop in block...
}

constexpr auto getDoubletsFromHistoMaxBlockSize = 64;
constexpr auto getDoubletsFromHistoMinBlocksPerMP = 16;

__global__
__launch_bounds__(getDoubletsFromHistoMaxBlockSize)
__launch_bounds__(getDoubletsFromHistoMaxBlockSize,getDoubletsFromHistoMinBlocksPerMP)
void getDoubletsFromHisto(GPUCACell * cells,
uint32_t * nCells,
siPixelRecHitsHeterogeneousProduct::HitsOnGPU const * __restrict__ hhp,
Expand Down Expand Up @@ -184,7 +184,7 @@ namespace gpuPixelDoublets {

auto const & __restrict__ hh = *hhp;
doubletsFromHisto(layerPairs, nPairs, cells, nCells,
hh.iphi_d, hh.hist_d, hh.hitsLayerStart_d,
hh.iphi_d, *hh.hist_d, hh.hitsLayerStart_d,
hh, isOuterHitOfCell,
phicuts, minz, maxz, maxr);
}
Expand Down

0 comments on commit fd748ed

Please sign in to comment.