Fix MPI communicator leak #841

Merged 9 commits on Mar 28, 2023
1 change: 1 addition & 0 deletions CHANGELOG.md
@@ -27,6 +27,7 @@
 - [[PR 663]](https://github.com/lanl/parthenon/pull/663) Change bvals_in_one to use sparse boundary buffers and add flux_correction in one.
 
 ### Fixed (not changing behavior/API/variables/...)
+- [[PR 841]](https://github.com/parthenon-hpc-lab/parthenon/pull/841) De-allocate MPI communicators when freeing `Reduction` objects
 - [[PR 843]](https://github.com/parthenon-hpc-lab/parthenon/pull/843) Add guard rails to prolongation/restriction infrastructure
 - [[PR 832]](https://github.com/parthenon-hpc-lab/parthenon/pull/833) Fix movie2d script after it broke due to change in HDF5 format
 - [[PR 820]](https://github.com/parthenon-hpc-lab/parthenon/pull/820) Fix XDMF spec to conform to standard and handle scalar and vector variables
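For context on the PR 841 entry above: each MPI_Comm_dup consumes one of a finite, implementation-defined number of communicator contexts, so duplicating a communicator per `Reduction` object without ever calling MPI_Comm_free eventually exhausts them. The following is a hypothetical sketch of that failure mode, not code from the repository; the iteration count and the reliance on MPI_ERRORS_RETURN are assumptions made so the failure can be reported rather than aborting.

```cpp
// Hypothetical illustration of the communicator leak (not repository code).
#include <cstdio>
#include <vector>

#include <mpi.h>

int main(int argc, char **argv) {
  MPI_Init(&argc, &argv);
  // Return errors instead of aborting, so the exhaustion can be reported.
  MPI_Comm_set_errhandler(MPI_COMM_WORLD, MPI_ERRORS_RETURN);

  std::vector<MPI_Comm> leaked;
  for (int i = 0; i < 100000; ++i) {
    MPI_Comm comm;
    if (MPI_Comm_dup(MPI_COMM_WORLD, &comm) != MPI_SUCCESS) {
      std::printf("MPI_Comm_dup failed after %d leaked communicators\n", i);
      break;
    }
    leaked.push_back(comm);  // never freed: this is the leak
  }

  for (auto &c : leaked) MPI_Comm_free(&c);  // clean up before finalizing
  MPI_Finalize();
  return 0;
}
```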
32 changes: 20 additions & 12 deletions src/utils/reductions.hpp
@@ -13,6 +13,7 @@
 #ifndef UTILS_REDUCTIONS_HPP_
 #define UTILS_REDUCTIONS_HPP_
 
+#include <memory>
 #include <vector>
 
 #include <config.hpp>
@@ -59,20 +60,24 @@ struct ReductionBase {
   T val;
 #ifdef MPI_PARALLEL
   MPI_Request req;
-  MPI_Comm comm;
+  std::shared_ptr<MPI_Comm> pcomm;
 #endif
   bool active = false;
   ReductionBase() {
 #ifdef MPI_PARALLEL
-    MPI_Comm_dup(MPI_COMM_WORLD, &comm);
+    // Store the communicator in a shared_ptr, so that
+    // MPI_Comm_free is called, but only when the last
+    // copy of this ReductionBase is destroyed.
+    pcomm = std::shared_ptr<MPI_Comm>(new MPI_Comm, MPI_Comm_free);
+    PARTHENON_MPI_CHECK(MPI_Comm_dup(MPI_COMM_WORLD, pcomm.get()));
 #endif
   }
 
   TaskStatus CheckReduce() {
     if (!active) return TaskStatus::complete;
     int check = 1;
 #ifdef MPI_PARALLEL
-    MPI_Test(&req, &check, MPI_STATUS_IGNORE);
+    PARTHENON_MPI_CHECK(MPI_Test(&req, &check, MPI_STATUS_IGNORE));
 #endif
     if (check) {
       active = false;
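For readers unfamiliar with the custom-deleter idiom in the hunk above, this standalone sketch reproduces it outside of Parthenon: the duplicated communicator is owned by a std::shared_ptr whose deleter is MPI_Comm_free, so the underlying MPI communicator is released exactly once, when the last copy of the owning object is destroyed. This is illustrative only, not code from the PR; error handling is a plain abort rather than PARTHENON_MPI_CHECK.

```cpp
// Standalone sketch of the ref-counted communicator pattern (not PR code).
#include <cstdlib>
#include <memory>

#include <mpi.h>

struct CommHolder {
  std::shared_ptr<MPI_Comm> pcomm;
  CommHolder() {
    // MPI_Comm_free is the custom deleter: the duplicated communicator is
    // released exactly once, when the last shared_ptr copy is destroyed
    // (the small heap-allocated handle itself is not deleted, mirroring
    // the approach taken in the PR).
    pcomm = std::shared_ptr<MPI_Comm>(new MPI_Comm, MPI_Comm_free);
    if (MPI_Comm_dup(MPI_COMM_WORLD, pcomm.get()) != MPI_SUCCESS) std::abort();
  }
};

int main(int argc, char **argv) {
  MPI_Init(&argc, &argv);
  {
    CommHolder a;      // duplicates MPI_COMM_WORLD once
    CommHolder b = a;  // the copy shares the same communicator handle
  }                    // last copy destroyed here -> MPI_Comm_free is called
  MPI_Finalize();
  return 0;
}
```

Copies of the holder share one communicator, which matches the comment in the diff: MPI_Comm_free runs only when the last copy of the ReductionBase is destroyed.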
@@ -87,9 +92,10 @@ struct AllReduce : public ReductionBase<T> {
   TaskStatus StartReduce(MPI_Op op) {
     if (this->active) return TaskStatus::complete;
 #ifdef MPI_PARALLEL
-    MPI_Iallreduce(MPI_IN_PLACE, contiguous_container::data(this->val),
-                   contiguous_container::size(this->val), GetContainerMPIType(this->val),
-                   op, this->comm, &(this->req));
+    PARTHENON_MPI_CHECK(
+        MPI_Iallreduce(MPI_IN_PLACE, contiguous_container::data(this->val),
+                       contiguous_container::size(this->val),
+                       GetContainerMPIType(this->val), op, *(this->pcomm), &(this->req)));
 #endif
     this->active = true;
     return TaskStatus::complete;
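The AllReduce hunk above pairs a non-blocking MPI_Iallreduce in StartReduce with the MPI_Test poll in CheckReduce. Below is a minimal, self-contained version of that flow, with a plain int in place of Parthenon's contiguous_container helpers and error codes left unchecked for brevity; it is illustrative only.

```cpp
// Illustrative StartReduce/CheckReduce flow with a plain int payload.
#include <cstdio>

#include <mpi.h>

int main(int argc, char **argv) {
  MPI_Init(&argc, &argv);

  MPI_Comm comm;
  MPI_Comm_dup(MPI_COMM_WORLD, &comm);

  int val = 1;
  MPI_Request req;
  // StartReduce: kick off the non-blocking, in-place reduction.
  MPI_Iallreduce(MPI_IN_PLACE, &val, 1, MPI_INT, MPI_SUM, comm, &req);

  // CheckReduce: poll until the request completes instead of blocking.
  int done = 0;
  while (!done) {
    MPI_Test(&req, &done, MPI_STATUS_IGNORE);
  }

  int rank;
  MPI_Comm_rank(comm, &rank);
  if (rank == 0) std::printf("sum over ranks = %d\n", val);

  MPI_Comm_free(&comm);
  MPI_Finalize();
  return 0;
}
```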
@@ -102,14 +108,16 @@ struct Reduce : public ReductionBase<T> {
     if (this->active) return TaskStatus::complete;
 #ifdef MPI_PARALLEL
     if (Globals::my_rank == n) {
-      MPI_Ireduce(MPI_IN_PLACE, contiguous_container::data(this->val),
-                  contiguous_container::size(this->val), GetContainerMPIType(this->val),
-                  op, n, this->comm, &(this->req));
+      PARTHENON_MPI_CHECK(MPI_Ireduce(MPI_IN_PLACE, contiguous_container::data(this->val),
+                                      contiguous_container::size(this->val),
+                                      GetContainerMPIType(this->val), op, n,
+                                      *(this->pcomm), &(this->req)));
 
     } else {
-      MPI_Ireduce(contiguous_container::data(this->val), nullptr,
-                  contiguous_container::size(this->val), GetContainerMPIType(this->val),
-                  op, n, this->comm, &(this->req));
+      PARTHENON_MPI_CHECK(MPI_Ireduce(contiguous_container::data(this->val), nullptr,
+                                      contiguous_container::size(this->val),
+                                      GetContainerMPIType(this->val), op, n,
+                                      *(this->pcomm), &(this->req)));
     }
 #endif
     this->active = true;
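Finally, the Reduce hunk keeps the original root/non-root split: the root rank reduces in place with MPI_IN_PLACE, while every other rank passes its data as the send buffer and the receive buffer is ignored. A hypothetical standalone sketch of that split follows, with a fixed root of 0, MPI_Wait instead of task-based polling, and error codes unchecked.

```cpp
// Hypothetical sketch of the root/non-root split in Reduce::StartReduce.
#include <cstdio>

#include <mpi.h>

int main(int argc, char **argv) {
  MPI_Init(&argc, &argv);

  const int root = 0;
  int rank;
  MPI_Comm_rank(MPI_COMM_WORLD, &rank);

  int val = rank + 1;
  MPI_Request req;
  if (rank == root) {
    // Root rank: reduce into val in place.
    MPI_Ireduce(MPI_IN_PLACE, &val, 1, MPI_INT, MPI_SUM, root, MPI_COMM_WORLD, &req);
  } else {
    // Non-root ranks: val is the send buffer; the receive buffer is ignored.
    MPI_Ireduce(&val, nullptr, 1, MPI_INT, MPI_SUM, root, MPI_COMM_WORLD, &req);
  }
  MPI_Wait(&req, MPI_STATUS_IGNORE);

  // Only the root holds the reduced value.
  if (rank == root) std::printf("reduced sum = %d\n", val);

  MPI_Finalize();
  return 0;
}
```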