Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Consolidate buffer packing functions with less atomics #1199

Merged
merged 5 commits into from
Nov 1, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
30 changes: 16 additions & 14 deletions example/particles/particles.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -288,17 +288,18 @@ TaskStatus CreateSomeParticles(MeshBlock *pmb, const double t0) {
} while (r > 0.5);

// Randomly sample direction perpendicular to origin
Real theta = acos(2. * rng_gen.drand() - 1.);
Real phi = 2. * M_PI * rng_gen.drand();
v(0, n) = sin(theta) * cos(phi);
v(1, n) = sin(theta) * sin(phi);
v(2, n) = cos(theta);
const Real mu = 2.0 * rng_gen.drand() - 1.0;
const Real phi = 2. * M_PI * rng_gen.drand();
const Real stheta = std::sqrt(1.0 - mu * mu);
v(0, n) = stheta * cos(phi);
v(1, n) = stheta * sin(phi);
v(2, n) = mu;
// Project v onto plane normal to sphere
Real vdN = v(0, n) * x(n) + v(1, n) * y(n) + v(2, n) * z(n);
Real NdN = r * r;
v(0, n) = v(0, n) - vdN / NdN * x(n);
v(1, n) = v(1, n) - vdN / NdN * y(n);
v(2, n) = v(2, n) - vdN / NdN * z(n);
Real inverse_NdN = 1. / (r * r);
v(0, n) = v(0, n) - vdN * inverse_NdN * x(n);
v(1, n) = v(1, n) - vdN * inverse_NdN * y(n);
v(2, n) = v(2, n) - vdN * inverse_NdN * z(n);

// Normalize
Real v_tmp = sqrt(v(0, n) * v(0, n) + v(1, n) * v(1, n) + v(2, n) * v(2, n));
Expand Down Expand Up @@ -335,11 +336,12 @@ TaskStatus CreateSomeParticles(MeshBlock *pmb, const double t0) {
z(n) = minx_k + nx_k * dx_k * rng_gen.drand();

// Randomly sample direction on the unit sphere, fixing speed
Real theta = acos(2. * rng_gen.drand() - 1.);
Real phi = 2. * M_PI * rng_gen.drand();
v(0, n) = vel * sin(theta) * cos(phi);
v(1, n) = vel * sin(theta) * sin(phi);
v(2, n) = vel * cos(theta);
const Real mu = 2.0 * rng_gen.drand() - 1.0;
const Real phi = 2. * M_PI * rng_gen.drand();
const Real stheta = std::sqrt(1.0 - mu * mu);
v(0, n) = vel * stheta * cos(phi);
v(1, n) = vel * stheta * sin(phi);
v(2, n) = vel * mu;

// Create particles at the beginning of the timestep
t(n) = t0;
Expand Down
23 changes: 14 additions & 9 deletions src/interface/swarm.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -33,6 +33,7 @@ SwarmDeviceContext Swarm::GetDeviceContext() const {
context.block_index_ = block_index_;
context.neighbor_indices_ = neighbor_indices_;
context.cell_sorted_ = cell_sorted_;
context.buffer_sorted_ = buffer_sorted_;
context.cell_sorted_begin_ = cell_sorted_begin_;
context.cell_sorted_number_ = cell_sorted_number_;

Expand Down Expand Up @@ -73,9 +74,10 @@ Swarm::Swarm(const std::string &label, const Metadata &metadata, const int nmax_
new_indices_("new_indices_", nmax_pool_), scratch_a_("scratch_a_", nmax_pool_),
scratch_b_("scratch_b_", nmax_pool_),
num_particles_to_send_("num_particles_to_send_", NMAX_NEIGHBORS),
buffer_counters_("buffer_counters_", NMAX_NEIGHBORS),
buffer_start_("buffer_start_", NMAX_NEIGHBORS),
neighbor_received_particles_("neighbor_received_particles_", NMAX_NEIGHBORS),
cell_sorted_("cell_sorted_", nmax_pool_), mpiStatus(true) {
cell_sorted_("cell_sorted_", nmax_pool_),
buffer_sorted_("buffer_sorted_", nmax_pool_), mpiStatus(true) {
PARTHENON_REQUIRE_THROWS(typeid(Coordinates_t) == typeid(UniformCartesian),
"SwarmDeviceContext only supports a uniform Cartesian mesh!");

Expand Down Expand Up @@ -209,6 +211,9 @@ void Swarm::SetPoolMax(const std::int64_t nmax_pool) {
Kokkos::resize(cell_sorted_, nmax_pool);
pmb->LogMemUsage(n_new * sizeof(SwarmKey));

Kokkos::resize(buffer_sorted_, nmax_pool);
pmb->LogMemUsage(n_new * sizeof(SwarmKey));

block_index_.Resize(nmax_pool);
pmb->LogMemUsage(n_new * sizeof(int));

Expand Down Expand Up @@ -490,35 +495,35 @@ void Swarm::SortParticlesByCell() {
break;
}

if (cell_sorted(start_index).cell_idx_1d_ == cell_idx_1d) {
if (cell_sorted(start_index).sort_idx_ == cell_idx_1d) {
if (start_index == 0) {
break;
} else if (cell_sorted(start_index - 1).cell_idx_1d_ != cell_idx_1d) {
} else if (cell_sorted(start_index - 1).sort_idx_ != cell_idx_1d) {
break;
} else {
start_index--;
continue;
}
}
if (cell_sorted(start_index).cell_idx_1d_ >= cell_idx_1d) {
if (cell_sorted(start_index).sort_idx_ >= cell_idx_1d) {
start_index--;
if (start_index < 0) {
start_index = -1;
break;
}
if (cell_sorted(start_index).cell_idx_1d_ < cell_idx_1d) {
if (cell_sorted(start_index).sort_idx_ < cell_idx_1d) {
start_index = -1;
break;
}
continue;
}
if (cell_sorted(start_index).cell_idx_1d_ < cell_idx_1d) {
if (cell_sorted(start_index).sort_idx_ < cell_idx_1d) {
start_index++;
if (start_index > max_active_index) {
start_index = -1;
break;
}
if (cell_sorted(start_index).cell_idx_1d_ > cell_idx_1d) {
if (cell_sorted(start_index).sort_idx_ > cell_idx_1d) {
start_index = -1;
break;
}
Expand All @@ -532,7 +537,7 @@ void Swarm::SortParticlesByCell() {
int number = 0;
int current_index = start_index;
while (current_index <= max_active_index &&
cell_sorted(current_index).cell_idx_1d_ == cell_idx_1d) {
cell_sorted(current_index).sort_idx_ == cell_idx_1d) {
current_index++;
number++;
cell_sorted_number(k, j, i) = number;
Expand Down
5 changes: 4 additions & 1 deletion src/interface/swarm.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -289,7 +289,7 @@ class Swarm {
constexpr static int unset_index_ = -1;

ParArray1D<int> num_particles_to_send_;
ParArray1D<int> buffer_counters_;
ParArray1D<int> buffer_start_;
ParArray1D<int> neighbor_received_particles_;
int total_received_particles_;

Expand All @@ -298,6 +298,9 @@ class Swarm {
ParArray1D<SwarmKey>
cell_sorted_; // 1D per-cell sorted array of key-value swarm memory indices

ParArray1D<SwarmKey>
buffer_sorted_; // 1D per-buffer sorted array of key-value swarm memory indices

ParArrayND<int>
cell_sorted_begin_; // Per-cell array of starting indices in cell_sorted_

Expand Down
190 changes: 93 additions & 97 deletions src/interface/swarm_comms.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -156,72 +156,14 @@ void Swarm::SetupPersistentMPI() {
}
}

void Swarm::CountParticlesToSend_() {
auto mask_h = Kokkos::create_mirror_view_and_copy(HostMemSpace(), mask_);
auto swarm_d = GetDeviceContext();
auto pmb = GetBlockPointer();
const int nbmax = vbswarm->bd_var_.nbmax;

// Fence to make sure particles aren't currently being transported locally
// TODO(BRR) do this operation on device.
pmb->exec_space.fence();
const int particle_size = GetParticleDataSize();
vbswarm->particle_size = particle_size;

// TODO(BRR) This kernel launch should be folded into the subsequent logic once we
// convert that to kernel-based reductions
auto &x = Get<Real>(swarm_position::x::name()).Get();
auto &y = Get<Real>(swarm_position::y::name()).Get();
auto &z = Get<Real>(swarm_position::z::name()).Get();
const int max_active_index = GetMaxActiveIndex();
pmb->par_for(
PARTHENON_AUTO_LABEL, 0, max_active_index, KOKKOS_LAMBDA(const int n) {
if (swarm_d.IsActive(n)) {
bool on_current_mesh_block = true;
swarm_d.GetNeighborBlockIndex(n, x(n), y(n), z(n), on_current_mesh_block);
}
});

// Facilitate lambda captures
auto &block_index = block_index_;
auto &num_particles_to_send = num_particles_to_send_;

// Zero out number of particles to send before accumulating
pmb->par_for(
PARTHENON_AUTO_LABEL, 0, NMAX_NEIGHBORS - 1,
KOKKOS_LAMBDA(const int n) { num_particles_to_send[n] = 0; });

parthenon::par_for(
PARTHENON_AUTO_LABEL, 0, max_active_index, KOKKOS_LAMBDA(const int n) {
if (swarm_d.IsActive(n)) {
bool on_current_mesh_block = true;
swarm_d.GetNeighborBlockIndex(n, x(n), y(n), z(n), on_current_mesh_block);

if (block_index(n) >= 0) {
Kokkos::atomic_add(&num_particles_to_send(block_index(n)), 1);
}
}
});

auto num_particles_to_send_h = num_particles_to_send_.GetHostMirrorAndCopy();

// Resize send buffers if too small
for (int n = 0; n < pmb->neighbors.size(); n++) {
const int bufid = pmb->neighbors[n].bufid;
auto sendbuf = vbswarm->bd_var_.send[bufid];
if (sendbuf.extent(0) < num_particles_to_send_h(n) * particle_size) {
sendbuf = BufArray1D<Real>("Buffer", num_particles_to_send_h(n) * particle_size);
vbswarm->bd_var_.send[bufid] = sendbuf;
}
vbswarm->send_size[bufid] = num_particles_to_send_h(n) * particle_size;
}
}

void Swarm::LoadBuffers_() {
auto swarm_d = GetDeviceContext();
auto pmb = GetBlockPointer();
const int particle_size = GetParticleDataSize();
vbswarm->particle_size = particle_size;
const int nneighbor = pmb->neighbors.size();
// Fence to make sure particles aren't currently being transported locally
pmb->exec_space.fence();

auto &int_vector = std::get<getType<int>()>(vectors_);
auto &real_vector = std::get<getType<Real>()>(vectors_);
Expand All @@ -236,53 +178,107 @@ void Swarm::LoadBuffers_() {
auto &y = Get<Real>(swarm_position::y::name()).Get();
auto &z = Get<Real>(swarm_position::z::name()).Get();

// Zero buffer index counters
auto &buffer_counters = buffer_counters_;
pmb->par_for(
PARTHENON_AUTO_LABEL, 0, NMAX_NEIGHBORS - 1,
KOKKOS_LAMBDA(const int n) { buffer_counters[n] = 0; });
if (max_active_index_ >= 0) {
auto &buffer_sorted = buffer_sorted_;
auto &buffer_start = buffer_start_;

auto &bdvar = vbswarm->bd_var_;
auto neighbor_buffer_index = neighbor_buffer_index_;
// Loop over active particles and use atomic operations to find indices into buffers if
// this particle will be sent.
pmb->par_for(
PARTHENON_AUTO_LABEL, 0, max_active_index_, KOKKOS_LAMBDA(const int n) {
if (swarm_d.IsActive(n)) {
bool on_current_mesh_block = true;
const int m =
swarm_d.GetNeighborBlockIndex(n, x(n), y(n), z(n), on_current_mesh_block);
const int bufid = neighbor_buffer_index(m);

if (m >= 0) {
const int bid = Kokkos::atomic_fetch_add(&buffer_counters(m), 1);
int buffer_index = bid * particle_size;
swarm_d.MarkParticleForRemoval(n);
for (int i = 0; i < realPackDim; i++) {
bdvar.send[bufid](buffer_index) = vreal(i, n);
buffer_index++;
}
for (int i = 0; i < intPackDim; i++) {
bdvar.send[bufid](buffer_index) = static_cast<Real>(vint(i, n));
buffer_index++;
pmb->par_for(
PARTHENON_AUTO_LABEL, 0, max_active_index_, KOKKOS_LAMBDA(const int n) {
if (swarm_d.IsActive(n)) {
bool on_current_mesh_block = true;
const int m =
swarm_d.GetNeighborBlockIndex(n, x(n), y(n), z(n), on_current_mesh_block);
buffer_sorted(n) = SwarmKey(m, n);
} else {
buffer_sorted(n) = SwarmKey(this_block_, n);
}
});

// sort by buffer index
sort(buffer_sorted, SwarmKeyComparator(), 0, max_active_index_);

// use discontinuity check to determine start of a buffer in buffer_sorted array
auto &num_particles_to_send = num_particles_to_send_;
auto max_active_index = max_active_index_;

// Zero out number of particles to send before accumulating
pmb->par_for(
PARTHENON_AUTO_LABEL, 0, NMAX_NEIGHBORS - 1, KOKKOS_LAMBDA(const int n) {
num_particles_to_send[n] = 0;
buffer_start[n] = 0;
});

pmb->par_for(
PARTHENON_AUTO_LABEL, 0, max_active_index_, KOKKOS_LAMBDA(const int n) {
auto m = buffer_sorted(n).sort_idx_;
// start checks (used for index of particle in buffer)
if (m >= 0 && n == 0) {
buffer_start(m) = 0;
} else if (m >= 0 && m != buffer_sorted(n - 1).sort_idx_) {
buffer_start(m) = n;
}
// end checks (used to to size particle buffers)
if (m >= 0 && n == max_active_index) {
num_particles_to_send(m) = n + 1;
} else if (m >= 0 && m != buffer_sorted(n + 1).sort_idx_) {
num_particles_to_send(m) = n + 1;
}
});

// copy values back to host for buffer sizing
auto num_particles_to_send_h = num_particles_to_send_.GetHostMirrorAndCopy();
auto buffer_start_h = buffer_start.GetHostMirrorAndCopy();

// Resize send buffers if too small
for (int n = 0; n < pmb->neighbors.size(); n++) {
num_particles_to_send_h(n) -= buffer_start_h(n);
const int bufid = pmb->neighbors[n].bufid;
auto sendbuf = vbswarm->bd_var_.send[bufid];
if (sendbuf.extent(0) < num_particles_to_send_h(n) * particle_size) {
sendbuf = BufArray1D<Real>("Buffer", num_particles_to_send_h(n) * particle_size);
vbswarm->bd_var_.send[bufid] = sendbuf;
}
vbswarm->send_size[bufid] = num_particles_to_send_h(n) * particle_size;
}

auto &bdvar = vbswarm->bd_var_;
auto neighbor_buffer_index = neighbor_buffer_index_;
// Loop over active particles buffer_sorted, use index n and buffer_start to
/// get index in buffer m, pack that particle in buffer
pmb->par_for(
PARTHENON_AUTO_LABEL, 0, max_active_index_, KOKKOS_LAMBDA(const int n) {
auto p_index = buffer_sorted(n).swarm_idx_;
if (swarm_d.IsActive(p_index)) {
const int m = buffer_sorted(n).sort_idx_;
const int bufid = neighbor_buffer_index(m);
if (m >= 0) {
const int bid = n - buffer_start[m];
int buffer_index = bid * particle_size;
swarm_d.MarkParticleForRemoval(p_index);
for (int i = 0; i < realPackDim; i++) {
bdvar.send[bufid](buffer_index) = vreal(i, p_index);
buffer_index++;
}
for (int i = 0; i < intPackDim; i++) {
bdvar.send[bufid](buffer_index) = static_cast<Real>(vint(i, p_index));
buffer_index++;
}
}
}
}
});
});

// Remove particles that were loaded to send to another block from this block
RemoveMarkedParticles();
// Remove particles that were loaded to send to another block from this block
RemoveMarkedParticles();
}
}

void Swarm::Send(BoundaryCommSubset phase) {
auto pmb = GetBlockPointer();
const int nneighbor = pmb->neighbors.size();
auto swarm_d = GetDeviceContext();

// Query particles for those to be sent
CountParticlesToSend_();

// Prepare buffers for send operations
// Potentially resize buffer, get consistent index from particle array, get ready to
// send
LoadBuffers_();

// Send buffer data
Expand Down
7 changes: 4 additions & 3 deletions src/interface/swarm_device_context.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -25,16 +25,16 @@ struct SwarmKey {
SwarmKey() {}
KOKKOS_INLINE_FUNCTION
SwarmKey(const int cell_idx_1d, const int swarm_idx_1d)
: cell_idx_1d_(cell_idx_1d), swarm_idx_(swarm_idx_1d) {}
: sort_idx_(cell_idx_1d), swarm_idx_(swarm_idx_1d) {}

int cell_idx_1d_;
int sort_idx_;
int swarm_idx_;
};

struct SwarmKeyComparator {
KOKKOS_INLINE_FUNCTION
bool operator()(const SwarmKey &s1, const SwarmKey &s2) {
return s1.cell_idx_1d_ < s2.cell_idx_1d_;
return s1.sort_idx_ < s2.sort_idx_;
}
};

Expand Down Expand Up @@ -139,6 +139,7 @@ class SwarmDeviceContext {
ParArrayND<int> block_index_;
ParArrayND<int> neighbor_indices_; // 4x4x4 array of possible block AMR regions
ParArray1D<SwarmKey> cell_sorted_;
ParArray1D<SwarmKey> buffer_sorted_;
ParArrayND<int> cell_sorted_begin_;
ParArrayND<int> cell_sorted_number_;
int ndim_;
Expand Down
Loading