Merge pull request cms-sw#16 from jsamudio/devFactorAndShared
fixed device functions and shared memory implementation
jsamudio authored Mar 20, 2023
2 parents 338163d + eb77819 commit d3eb59b
Showing 1 changed file with 86 additions and 95 deletions.
181 changes: 86 additions & 95 deletions RecoParticleFlow/PFClusterProducer/plugins/PFClusterCudaHCAL.cu
@@ -487,6 +487,50 @@ namespace PFClusterCudaHCAL {
}
}

__device__ auto dev_getSeedRhIdx(int* seeds, int seedNum) { return seeds[seedNum]; }

__device__ auto dev_getRhFracIdx(int* rechits, int rhNum) {
if (rhNum <= 0) {
printf("Invalid rhNum (%d) for get RhFracIdx!\n", rhNum);
}
return rechits[rhNum - 1];
}

__device__ auto dev_getRhFrac(
int* topoSeedList, int topoSeedBegin, float* pcrhfrac, int* seedFracOffsets, int seedNum, int rhNum) {
int seedIdx = topoSeedList[topoSeedBegin + seedNum];
return pcrhfrac[seedFracOffsets[seedIdx] + rhNum];
}
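// Layout sketch for the lookups above (illustrative, inferred from the callers
// below): pcrhfrac is a single flattened array holding one block of fractions
// per seed, and seedFracOffsets[seedIdx] marks where that seed's block starts.
// Slot 0 of a block appears to belong to the seed itself, which is why callers
// pass rhNum = r + 1 for the r-th non-seed rechit:
//
//   pcrhfrac: [ seed0 | rh1 | rh2 | ... ][ seed1 | rh1 | ... ]
//               ^ seedFracOffsets[s0]      ^ seedFracOffsets[s1]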

__device__ auto dev_computeClusterPos(PFClusteringParamsGPU::DeviceProduct::ConstView pfClusParams,
float4& pos4,
float frac,
int rhInd,
bool isDebug,
const float* __restrict__ pfrh_x,
const float* __restrict__ pfrh_y,
const float* __restrict__ pfrh_z,
const float* __restrict__ pfrh_energy,
float rhENormInv) {
float4 rechitPos = make_float4(pfrh_x[rhInd], pfrh_y[rhInd], pfrh_z[rhInd], 1.0);
const auto rh_energy = pfrh_energy[rhInd] * frac;
const auto norm = (frac < pfClusParams.minFracInCalc() ? 0.0f : max(0.0f, logf(rh_energy * rhENormInv)));
if (isDebug)
printf("\t\t\trechit %d: norm = %f\tfrac = %f\trh_energy = %f\tpos = (%f, %f, %f)\n",
rhInd,
norm,
frac,
rh_energy,
rechitPos.x,
rechitPos.y,
rechitPos.z);

pos4.x += rechitPos.x * norm;
pos4.y += rechitPos.y * norm;
pos4.z += rechitPos.z * norm;
pos4.w += norm; // position_norm
}
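// dev_computeClusterPos accumulates one term of a log-energy-weighted centroid:
// a rechit enters with weight w = max(0, ln(E_rh * frac / E_norm)), and the
// weight is dropped to zero when frac is below minFracInCalc. pos4.w collects
// the total weight (position_norm) that the caller uses to normalize (x, y, z).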

__device__ void dev_hcalFastCluster_optimizedSimple(PFClusteringParamsGPU::DeviceProduct::ConstView pfClusParams,
int topoId,
int nRHTopo,
@@ -697,27 +741,14 @@
}
__syncthreads();

auto getSeedRhIdx = [&](int seedNum) { return seeds[seedNum]; };

auto getRhFracIdx = [&](int seedNum, int rhNum) {
if (rhNum <= 0)
printf("Invalid rhNum (%d) for getRhFracIdx!\n", rhNum);
return rechits[rhNum - 1];
};

auto getRhFrac = [&](int seedNum, int rhNum) {
int seedIdx = topoSeedList[topoSeedBegin + seedNum];
return pcrhfrac[seedFracOffsets[seedIdx] + rhNum];
};

if (debug) {
if (threadIdx.x == 0) {
printf("\n===========================================================================================\n");
printf("Processing topo cluster %d with nSeeds = %d nRHTopo = %d and seeds (", topoId, nSeeds, nRHTopo);
for (int s = 0; s < nSeeds; s++) {
if (s != 0)
printf(", ");
printf("%d", getSeedRhIdx(s));
printf("%d", dev_getSeedRhIdx(seeds, s));
}
if (nRHTopo == nSeeds) {
printf(")\n\n");
@@ -726,41 +757,17 @@
for (int r = 1; r < nRHNotSeed; r++) {
if (r != 1)
printf(", ");
printf("%d", getRhFracIdx(0, r));
printf("%d", dev_getRhFracIdx(rechits, r));
}
printf(")\n\n");
}
}
__syncthreads();
}

auto computeClusterPos = [&](PFClusteringParamsGPU::DeviceProduct::ConstView pfClusParams,
float4& pos4,
float frac,
int rhInd,
bool isDebug) {
float4 rechitPos = make_float4(pfrh_x[rhInd], pfrh_y[rhInd], pfrh_z[rhInd], 1.0);
const auto rh_energy = pfrh_energy[rhInd] * frac;
const auto norm = (frac < pfClusParams.minFracInCalc() ? 0.0f : max(0.0f, logf(rh_energy * rhENormInv)));
if (isDebug)
printf("\t\t\trechit %d: norm = %f\tfrac = %f\trh_energy = %f\tpos = (%f, %f, %f)\n",
rhInd,
norm,
frac,
rh_energy,
rechitPos.x,
rechitPos.y,
rechitPos.z);

pos4.x += rechitPos.x * norm;
pos4.y += rechitPos.y * norm;
pos4.z += rechitPos.z * norm;
pos4.w += norm; // position_norm
};

// Set initial cluster position (energy) to seed rechit position (energy)
for (int s = threadIdx.x; s < nSeeds; s += gridStride) {
int i = getSeedRhIdx(s);
int i = dev_getSeedRhIdx(seeds, s);
clusterPos[s] = make_float4(pfrh_x[i], pfrh_y[i], pfrh_z[i], 1.0);
prevClusterPos[s] = clusterPos[s];
clusterEnergy[s] = pfrh_energy[i];
@@ -784,15 +791,16 @@
float seedEnergy = -1.;
float4 seedInitClusterPos = make_float4(0., 0., 0., 0.);
if (tid < nSeeds) {
seedThreadIdx = getSeedRhIdx(tid);
seedThreadIdx = dev_getSeedRhIdx(seeds, tid);
seedNeighbors = make_int4(pfrh_neighbours[8 * seedThreadIdx],
pfrh_neighbours[8 * seedThreadIdx + 1],
pfrh_neighbours[8 * seedThreadIdx + 2],
pfrh_neighbours[8 * seedThreadIdx + 3]);
seedEnergy = pfrh_energy[seedThreadIdx];

// Compute initial cluster position shift for seed
computeClusterPos(pfClusParams, seedInitClusterPos, 1., seedThreadIdx, debug);
dev_computeClusterPos(
pfClusParams, seedInitClusterPos, 1., seedThreadIdx, debug, pfrh_x, pfrh_y, pfrh_z, pfrh_energy, rhENormInv);
}

do {
@@ -869,14 +877,15 @@
if (tid < nSeeds) {
for (int r = 0; r < nRHNotSeed - 1; r++) {
int j = rechits[r];
float frac = getRhFrac(tid, r + 1);
float frac = dev_getRhFrac(topoSeedList, topoSeedBegin, pcrhfrac, seedFracOffsets, tid, r + 1);

if (frac > -0.5) {
clusterEnergy[tid] += frac * pfrh_energy[j];

if (nSeeds == 1 || j == seedNeighbors.x || j == seedNeighbors.y || j == seedNeighbors.z ||
j == seedNeighbors.w)
computeClusterPos(pfClusParams, clusterPos[tid], frac, j, debug);
dev_computeClusterPos(
pfClusParams, clusterPos[tid], frac, j, debug, pfrh_x, pfrh_y, pfrh_z, pfrh_energy, rhENormInv);
}
}
}
@@ -959,9 +968,16 @@
__shared__ int nRHNotSeed, topoSeedBegin, gridStride, iter;
__shared__ float tol, diff2, rhENormInv;
__shared__ bool notDone, debug;
__shared__ float4 clusterPos[400], prevClusterPos[400];
__shared__ float clusterEnergy[400], rhFracSum[1500];
__shared__ int seeds[400], rechits[1500];
//__shared__ float4 clusterPos[400], prevClusterPos[400];
//__shared__ float clusterEnergy[400], rhFracSum[1500];
//__shared__ int seeds[400], rechits[1500];
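// The fixed-size arrays above capped each block at 400 seeds and 1500 rechits
// per topo cluster; the dynamically sized buffer below is instead sized by the
// host from the largest topo cluster (see the sharedMem computation at the
// launch site).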
extern __shared__ float4 sharedArr[];
float4* clusterPos = sharedArr; //nSeeds
float4* prevClusterPos = (float4*)&clusterPos[nSeeds]; //nSeeds
float* clusterEnergy = (float*)&prevClusterPos[nSeeds]; //nSeeds
float* rhFracSum = (float*)&clusterEnergy[nSeeds]; //nRHTopo - nSeeds
int* seeds = (int*)&rhFracSum[nRHTopo - nSeeds]; //nSeeds
int* rechits = (int*)&seeds[nSeeds]; //nRHTopo - nSeeds

if (threadIdx.x == 0) {
nRHNotSeed = nRHTopo - nSeeds + 1; // 1 + (# rechits per topoId that are NOT seeds)
@@ -991,27 +1007,14 @@
}
__syncthreads();

auto getSeedRhIdx = [&](int seedNum) { return seeds[seedNum]; };

auto getRhFracIdx = [&](int seedNum, int rhNum) {
if (rhNum <= 0)
printf("Invalid rhNum (%d) for getRhFracIdx!\n", rhNum);
return rechits[rhNum - 1];
};

auto getRhFrac = [&](int seedNum, int rhNum) {
int seedIdx = topoSeedList[topoSeedBegin + seedNum];
return pcrhfrac[seedFracOffsets[seedIdx] + rhNum];
};

if (debug) {
if (threadIdx.x == 0) {
printf("\n===========================================================================================\n");
printf("Processing topo cluster %d with nSeeds = %d nRHTopo = %d and seeds (", topoId, nSeeds, nRHTopo);
for (int s = 0; s < nSeeds; s++) {
if (s != 0)
printf(", ");
printf("%d", getSeedRhIdx(s));
printf("%d", dev_getSeedRhIdx(seeds, s));
}
if (nRHTopo == nSeeds) {
printf(")\n\n");
@@ -1020,38 +1023,14 @@
for (int r = 1; r < nRHNotSeed; r++) {
if (r != 1)
printf(", ");
printf("%d", getRhFracIdx(0, r));
printf("%d", dev_getRhFracIdx(rechits, r));
}
printf(")\n\n");
}
}
__syncthreads();
}

auto computeClusterPos = [&](PFClusteringParamsGPU::DeviceProduct::ConstView pfClusParams,
float4& pos4,
float frac,
int rhInd,
bool isDebug) {
float4 rechitPos = make_float4(pfrh_x[rhInd], pfrh_y[rhInd], pfrh_z[rhInd], 1.0);
const auto rh_energy = pfrh_energy[rhInd] * frac;
const auto norm = (frac < pfClusParams.minFracInCalc() ? 0.0f : max(0.0f, logf(rh_energy * rhENormInv)));
if (isDebug)
printf("\t\trechit %d: norm = %f\tfrac = %f\trh_energy = %f\tpos = (%f, %f, %f)\n",
rhInd,
norm,
frac,
rh_energy,
rechitPos.x,
rechitPos.y,
rechitPos.z);

pos4.x += rechitPos.x * norm;
pos4.y += rechitPos.y * norm;
pos4.z += rechitPos.z * norm;
pos4.w += norm; // position_norm
};

// Set initial cluster position (energy) to seed rechit position (energy)
for (int s = threadIdx.x; s < nSeeds; s += gridStride) {
int i = seeds[s];
@@ -1122,7 +1101,7 @@

// Reset cluster position and energy
for (int s = threadIdx.x; s < nSeeds; s += gridStride) {
int seedRhIdx = getSeedRhIdx(s);
int seedRhIdx = dev_getSeedRhIdx(seeds, s);
float norm = logf(pfrh_energy[seedRhIdx] * rhENormInv);
clusterPos[s] = make_float4(pfrh_x[seedRhIdx] * norm, pfrh_y[seedRhIdx] * norm, pfrh_z[seedRhIdx] * norm, norm);
clusterEnergy[s] = pfrh_energy[seedRhIdx];
@@ -1141,17 +1120,18 @@

// Recalculate position
for (int s = threadIdx.x; s < nSeeds; s += gridStride) {
int seedRhIdx = getSeedRhIdx(s);
int seedRhIdx = dev_getSeedRhIdx(seeds, s);
for (int r = 0; r < nRHNotSeed - 1; r++) {
int j = rechits[r];
float frac = getRhFrac(s, r + 1);
float frac = dev_getRhFrac(topoSeedList, topoSeedBegin, pcrhfrac, seedFracOffsets, s, r + 1);

if (frac > -0.5) {
clusterEnergy[s] += frac * pfrh_energy[j];

if (nSeeds == 1 || j == pfrh_neighbours[8 * seedRhIdx] || j == pfrh_neighbours[8 * seedRhIdx + 1] ||
j == pfrh_neighbours[8 * seedRhIdx + 2] || j == pfrh_neighbours[8 * seedRhIdx + 3])
computeClusterPos(pfClusParams, clusterPos[s], frac, j, debug);
dev_computeClusterPos(
pfClusParams, clusterPos[s], frac, j, debug, pfrh_x, pfrh_y, pfrh_z, pfrh_energy, rhENormInv);
}
}
}
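// Note on the loop above: every accepted fraction (frac > -0.5) adds to
// clusterEnergy, but only rechits matching the seed's first four stored
// neighbours (or all rechits when nSeeds == 1) enter the position update.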
@@ -1743,10 +1723,13 @@
dim3 grid((nRH + 31) / 32, (nRH + 31) / 32);
dim3 block(32, 32);

typeof(nTopo) h_nTopo;

cudaCheck(cudaMemcpyFromSymbolAsync(&h_nTopo, nTopo, sizeof(int), 0, cudaMemcpyDeviceToHost, cudaStream));

// std::vector instead of variable-length arrays, which are non-standard C++
// (assumes <vector> is included; <algorithm> is already needed for max_element).
std::vector<int> nRHTopo_h(nRH);
std::vector<int> nSeedsTopo_h(nRH);
cudaCheck(cudaMemcpyAsync(nRHTopo_h.data(), outputGPU.topoRHCount.get(), nRH * sizeof(int), cudaMemcpyDeviceToHost, cudaStream));
cudaCheck(cudaMemcpyAsync(nSeedsTopo_h.data(), outputGPU.topoSeedCount.get(), nRH * sizeof(int), cudaMemcpyDeviceToHost, cudaStream));
// Ensure both copies have landed before the host reads the buffers.
cudaCheck(cudaStreamSynchronize(cudaStream));
// Largest topo-cluster size and seed count across all nRH entries; these
// bound the dynamic shared memory each hcalFastCluster_selection block needs.
int nRHTopoMax = *std::max_element(nRHTopo_h.begin(), nRHTopo_h.end());
int nSeedsTopoMax = *std::max_element(nSeedsTopo_h.begin(), nSeedsTopo_h.end());

fillRhfIndex<<<grid, block, 0, cudaStream>>>(nRH,
outputGPU.pfrh_topoId.get(),
outputGPU.pfrh_isSeed.get(),
Expand All @@ -1755,8 +1738,16 @@ namespace PFClusterCudaHCAL {
outputGPU.seedFracOffsets.get(),
scratchGPU.rhcount.get(),
outputGPU.pcrh_fracInd.get());
int sharedMem =
nSeedsTopoMax * sizeof(float4) +                // clusterPos
nSeedsTopoMax * sizeof(float4) +                // prevClusterPos
nSeedsTopoMax * sizeof(float) +                 // clusterEnergy
(nRHTopoMax - nSeedsTopoMax) * sizeof(float) +  // rhFracSum
nSeedsTopoMax * sizeof(int) +                   // seeds
(nRHTopoMax - nSeedsTopoMax) * sizeof(int);     // rechits


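// Sketch (assumption, not part of this commit): sharedMem grows with the
// largest topo cluster, and a request beyond the default 48 KB per block
// must be opted into once before the launch, e.g.:
//
//   cudaCheck(cudaFuncSetAttribute(hcalFastCluster_selection,
//                                  cudaFuncAttributeMaxDynamicSharedMemorySize,
//                                  sharedMem));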
hcalFastCluster_selection<<<nRH, threadsPerBlock, 0, cudaStream>>>(pfClusParams.const_view(),
hcalFastCluster_selection<<<nRH, threadsPerBlock, sharedMem, cudaStream>>>(pfClusParams.const_view(),
nRH,
inputPFRecHits.pfrh_x.get(),
inputPFRecHits.pfrh_y.get(),
