Skip to content

Commit

Permalink
Merge pull request #841 from parthenon-hpc-lab/bprather/fix-mpi-comms
Browse files Browse the repository at this point in the history
Fix MPI communicator leak
  • Loading branch information
bprather authored Mar 28, 2023
2 parents ed12873 + 2926095 commit c2f1ee1
Show file tree
Hide file tree
Showing 2 changed files with 21 additions and 12 deletions.
1 change: 1 addition & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -27,6 +27,7 @@
- [[PR 663]](https://github.com/lanl/parthenon/pull/663) Change bvals_in_one to use sparse boundary buffers and add flux_correction in one.

### Fixed (not changing behavior/API/variables/...)
- [[PR 841]](https://github.com/parthenon-hpc-lab/parthenon/pull/841) De-allocate MPI communicators when freeing `Reduction` objects
- [[PR 851]](https://github.com/parthenon-hpc-lab/parthenon/pull/851) Fix xdmf hyperslab definition for vectors
- [[PR 843]](https://github.com/parthenon-hpc-lab/parthenon/pull/843) Add guard rails to prolongation/restriction infrastructure
- [[PR 832]](https://github.com/parthenon-hpc-lab/parthenon/pull/833) Fix movie2d script after it broke due to change in HDF5 format
Expand Down
32 changes: 20 additions & 12 deletions src/utils/reductions.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,7 @@
#ifndef UTILS_REDUCTIONS_HPP_
#define UTILS_REDUCTIONS_HPP_

#include <memory>
#include <vector>

#include <config.hpp>
Expand Down Expand Up @@ -59,20 +60,24 @@ struct ReductionBase {
T val;
#ifdef MPI_PARALLEL
MPI_Request req;
MPI_Comm comm;
std::shared_ptr<MPI_Comm> pcomm;
#endif
bool active = false;
ReductionBase() {
#ifdef MPI_PARALLEL
MPI_Comm_dup(MPI_COMM_WORLD, &comm);
// Store the communicator in a shared_ptr, so that
// MPI_Comm_free is called, but only when the last
// copy of this ReductionBase is destroyed.
pcomm = std::shared_ptr<MPI_Comm>(new MPI_Comm, MPI_Comm_disconnect);
PARTHENON_MPI_CHECK(MPI_Comm_dup(MPI_COMM_WORLD, pcomm.get()));
#endif
}

TaskStatus CheckReduce() {
if (!active) return TaskStatus::complete;
int check = 1;
#ifdef MPI_PARALLEL
MPI_Test(&req, &check, MPI_STATUS_IGNORE);
PARTHENON_MPI_CHECK(MPI_Test(&req, &check, MPI_STATUS_IGNORE));
#endif
if (check) {
active = false;
Expand All @@ -87,9 +92,10 @@ struct AllReduce : public ReductionBase<T> {
TaskStatus StartReduce(MPI_Op op) {
if (this->active) return TaskStatus::complete;
#ifdef MPI_PARALLEL
MPI_Iallreduce(MPI_IN_PLACE, contiguous_container::data(this->val),
contiguous_container::size(this->val), GetContainerMPIType(this->val),
op, this->comm, &(this->req));
PARTHENON_MPI_CHECK(
MPI_Iallreduce(MPI_IN_PLACE, contiguous_container::data(this->val),
contiguous_container::size(this->val),
GetContainerMPIType(this->val), op, *(this->pcomm), &(this->req)));
#endif
this->active = true;
return TaskStatus::complete;
Expand All @@ -102,14 +108,16 @@ struct Reduce : public ReductionBase<T> {
if (this->active) return TaskStatus::complete;
#ifdef MPI_PARALLEL
if (Globals::my_rank == n) {
MPI_Ireduce(MPI_IN_PLACE, contiguous_container::data(this->val),
contiguous_container::size(this->val), GetContainerMPIType(this->val),
op, n, this->comm, &(this->req));
PARTHENON_MPI_CHECK(MPI_Ireduce(MPI_IN_PLACE, contiguous_container::data(this->val),
contiguous_container::size(this->val),
GetContainerMPIType(this->val), op, n,
*(this->pcomm), &(this->req)));

} else {
MPI_Ireduce(contiguous_container::data(this->val), nullptr,
contiguous_container::size(this->val), GetContainerMPIType(this->val),
op, n, this->comm, &(this->req));
PARTHENON_MPI_CHECK(MPI_Ireduce(contiguous_container::data(this->val), nullptr,
contiguous_container::size(this->val),
GetContainerMPIType(this->val), op, n,
*(this->pcomm), &(this->req)));
}
#endif
this->active = true;
Expand Down

0 comments on commit c2f1ee1

Please sign in to comment.