Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Fix intercommunicator overflow with big payload collectives #9942

Merged
merged 1 commit into from
Feb 7, 2022
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
36 changes: 27 additions & 9 deletions ompi/mca/coll/inter/coll_inter_allgather.c
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,7 @@
* Copyright (c) 2006-2010 University of Houston. All rights reserved.
* Copyright (c) 2015-2017 Research Organization for Information Science
* and Technology (RIST). All rights reserved.
* Copyright (c) 2022 IBM Corporation. All rights reserved.
* $COPYRIGHT$
*
* Additional copyrights may follow
Expand Down Expand Up @@ -48,9 +49,10 @@ mca_coll_inter_allgather_inter(const void *sbuf, int scount,
struct ompi_communicator_t *comm,
mca_coll_base_module_t *module)
{
int rank, root = 0, size, rsize, err = OMPI_SUCCESS;
int rank, root = 0, size, rsize, err = OMPI_SUCCESS, i;
char *ptmp_free = NULL, *ptmp = NULL;
ptrdiff_t gap, span;
void *rbuf_ptr;

rank = ompi_comm_rank(comm);
size = ompi_comm_size(comm->c_local_comm);
Expand All @@ -76,9 +78,9 @@ mca_coll_inter_allgather_inter(const void *sbuf, int scount,

if (rank == root) {
/* Do a send-recv between the two root processes to avoid deadlock */
err = ompi_coll_base_sendrecv_actual(ptmp, scount*size, sdtype, 0,
err = ompi_coll_base_sendrecv_actual(ptmp, scount*(size_t)size, sdtype, 0,
MCA_COLL_BASE_TAG_ALLGATHER,
rbuf, rcount*rsize, rdtype, 0,
rbuf, rcount*(size_t)rsize, rdtype, 0,
MCA_COLL_BASE_TAG_ALLGATHER,
comm, MPI_STATUS_IGNORE);
if (OMPI_SUCCESS != err) {
Expand All @@ -87,12 +89,28 @@ mca_coll_inter_allgather_inter(const void *sbuf, int scount,
}
/* bcast the message to all the local processes */
if ( rcount > 0 ) {
err = comm->c_local_comm->c_coll->coll_bcast(rbuf, rcount*rsize, rdtype,
root, comm->c_local_comm,
comm->c_local_comm->c_coll->coll_bcast_module);
if (OMPI_SUCCESS != err) {
goto exit;
}
if ( OPAL_UNLIKELY(rcount*(size_t)rsize > INT_MAX) ) {
// Passing "rcount*rsize" as the count to coll_bcast() would overflow
// its 'int count' parameter. Instead, broadcast the result to the local
// group in rsize chunks of "rcount" elements each.
span = opal_datatype_span(&rdtype->super, rcount, &gap);
for( i = 0; i < rsize; ++i) {
rbuf_ptr = (char*)rbuf + span * (size_t)i;
err = comm->c_local_comm->c_coll->coll_bcast(rbuf_ptr, rcount, rdtype,
root, comm->c_local_comm,
comm->c_local_comm->c_coll->coll_bcast_module);
if (OMPI_SUCCESS != err) {
goto exit;
}
}
} else {
err = comm->c_local_comm->c_coll->coll_bcast(rbuf, rcount*rsize, rdtype,
root, comm->c_local_comm,
comm->c_local_comm->c_coll->coll_bcast_module);
if (OMPI_SUCCESS != err) {
goto exit;
}
}
}

exit:
Expand Down
4 changes: 3 additions & 1 deletion ompi/mca/coll/inter/coll_inter_allgatherv.c
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,7 @@
* Copyright (c) 2006-2010 University of Houston. All rights reserved.
* Copyright (c) 2015-2017 Research Organization for Information Science
* and Technology (RIST). All rights reserved.
* Copyright (c) 2022 IBM Corporation. All rights reserved.
* $COPYRIGHT$
*
* Additional copyrights may follow
Expand Down Expand Up @@ -47,7 +48,8 @@ mca_coll_inter_allgatherv_inter(const void *sbuf, int scount,
struct ompi_communicator_t *comm,
mca_coll_base_module_t *module)
{
int i, rank, size, size_local, total=0, err;
int i, rank, size, size_local, err;
size_t total = 0;
int *count=NULL,*displace=NULL;
char *ptmp_free=NULL, *ptmp=NULL;
ompi_datatype_t *ndtype = NULL;
Expand Down
5 changes: 3 additions & 2 deletions ompi/mca/coll/inter/coll_inter_gather.c
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,7 @@
* Copyright (c) 2006-2007 University of Houston. All rights reserved.
* Copyright (c) 2015-2016 Research Organization for Information Science
* and Technology (RIST). All rights reserved.
* Copyright (c) 2022 IBM Corporation. All rights reserved.
* $COPYRIGHT$
*
* Additional copyrights may follow
Expand Down Expand Up @@ -76,7 +77,7 @@ mca_coll_inter_gather_inter(const void *sbuf, int scount,
comm->c_local_comm->c_coll->coll_gather_module);
if (0 == rank) {
/* First process sends data to the root */
err = MCA_PML_CALL(send(ptmp, scount*size_local, sdtype, root,
err = MCA_PML_CALL(send(ptmp, scount*(size_t)size_local, sdtype, root,
MCA_COLL_BASE_TAG_GATHER,
MCA_PML_BASE_SEND_STANDARD, comm));
if (OMPI_SUCCESS != err) {
Expand All @@ -86,7 +87,7 @@ mca_coll_inter_gather_inter(const void *sbuf, int scount,
free(ptmp_free);
} else {
/* I am the root, loop receiving the data. */
err = MCA_PML_CALL(recv(rbuf, rcount*size, rdtype, 0,
err = MCA_PML_CALL(recv(rbuf, rcount*(size_t)size, rdtype, 0,
MCA_COLL_BASE_TAG_GATHER,
comm, MPI_STATUS_IGNORE));
if (OMPI_SUCCESS != err) {
Expand Down
4 changes: 3 additions & 1 deletion ompi/mca/coll/inter/coll_inter_gatherv.c
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,7 @@
* Copyright (c) 2006-2010 University of Houston. All rights reserved.
* Copyright (c) 2015-2016 Research Organization for Information Science
* and Technology (RIST). All rights reserved.
* Copyright (c) 2022 IBM Corporation. All rights reserved.
* $COPYRIGHT$
*
* Additional copyrights may follow
Expand Down Expand Up @@ -44,7 +45,8 @@ mca_coll_inter_gatherv_inter(const void *sbuf, int scount,
struct ompi_communicator_t *comm,
mca_coll_base_module_t *module)
{
int i, rank, size, size_local, total=0, err;
int i, rank, size, size_local, err;
size_t total = 0;
int *count=NULL, *displace=NULL;
char *ptmp_free=NULL, *ptmp=NULL;
ompi_datatype_t *ndtype;
Expand Down
5 changes: 3 additions & 2 deletions ompi/mca/coll/inter/coll_inter_scatter.c
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,7 @@
* Copyright (c) 2006-2008 University of Houston. All rights reserved.
* Copyright (c) 2015-2016 Research Organization for Information Science
* and Technology (RIST). All rights reserved.
* Copyright (c) 2022 IBM Corporation. All rights reserved.
* $COPYRIGHT$
*
* Additional copyrights may follow
Expand Down Expand Up @@ -69,7 +70,7 @@ mca_coll_inter_scatter_inter(const void *sbuf, int scount,
}
ptmp = ptmp_free - gap;

err = MCA_PML_CALL(recv(ptmp, rcount*size_local, rdtype,
err = MCA_PML_CALL(recv(ptmp, rcount*(size_t)size_local, rdtype,
root, MCA_COLL_BASE_TAG_SCATTER,
comm, MPI_STATUS_IGNORE));
if (OMPI_SUCCESS != err) {
Expand All @@ -86,7 +87,7 @@ mca_coll_inter_scatter_inter(const void *sbuf, int scount,
}
} else {
/* Root sends data to the first process in the remote group */
err = MCA_PML_CALL(send(sbuf, scount*size, sdtype, 0,
err = MCA_PML_CALL(send(sbuf, scount*(size_t)size, sdtype, 0,
MCA_COLL_BASE_TAG_SCATTER,
MCA_PML_BASE_SEND_STANDARD, comm));
if (OMPI_SUCCESS != err) {
Expand Down
4 changes: 3 additions & 1 deletion ompi/mca/coll/inter/coll_inter_scatterv.c
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,7 @@
* Copyright (c) 2006-2010 University of Houston. All rights reserved.
* Copyright (c) 2015-2016 Research Organization for Information Science
* and Technology (RIST). All rights reserved.
* Copyright (c) 2022 IBM Corporation. All rights reserved.
* $COPYRIGHT$
*
* Additional copyrights may follow
Expand Down Expand Up @@ -45,7 +46,8 @@ mca_coll_inter_scatterv_inter(const void *sbuf, const int *scounts,
struct ompi_communicator_t *comm,
mca_coll_base_module_t *module)
{
int i, rank, size, err, total=0, size_local;
int i, rank, size, err, size_local;
size_t total = 0;
int *counts=NULL,*displace=NULL;
char *ptmp_free=NULL, *ptmp=NULL;
ompi_datatype_t *ndtype;
Expand Down