Skip to content

Commit

Permalink
Merge pull request #9942 from jjhursey/big-payload-inter-coll
Browse files Browse the repository at this point in the history
Fix intercommunicator overflow with big payload collectives
  • Loading branch information
jjhursey committed Feb 7, 2022
2 parents 2d2d0a7 + fe07940 commit acbe7b0
Show file tree
Hide file tree
Showing 6 changed files with 42 additions and 16 deletions.
36 changes: 27 additions & 9 deletions ompi/mca/coll/inter/coll_inter_allgather.c
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,7 @@
* Copyright (c) 2006-2010 University of Houston. All rights reserved.
* Copyright (c) 2015-2017 Research Organization for Information Science
* and Technology (RIST). All rights reserved.
* Copyright (c) 2022 IBM Corporation. All rights reserved.
* $COPYRIGHT$
*
* Additional copyrights may follow
Expand Down Expand Up @@ -48,9 +49,10 @@ mca_coll_inter_allgather_inter(const void *sbuf, int scount,
struct ompi_communicator_t *comm,
mca_coll_base_module_t *module)
{
int rank, root = 0, size, rsize, err = OMPI_SUCCESS;
int rank, root = 0, size, rsize, err = OMPI_SUCCESS, i;
char *ptmp_free = NULL, *ptmp = NULL;
ptrdiff_t gap, span;
void *rbuf_ptr;

rank = ompi_comm_rank(comm);
size = ompi_comm_size(comm->c_local_comm);
Expand All @@ -76,9 +78,9 @@ mca_coll_inter_allgather_inter(const void *sbuf, int scount,

if (rank == root) {
/* Do a send-recv between the two root procs. to avoid deadlock */
err = ompi_coll_base_sendrecv_actual(ptmp, scount*size, sdtype, 0,
err = ompi_coll_base_sendrecv_actual(ptmp, scount*(size_t)size, sdtype, 0,
MCA_COLL_BASE_TAG_ALLGATHER,
rbuf, rcount*rsize, rdtype, 0,
rbuf, rcount*(size_t)rsize, rdtype, 0,
MCA_COLL_BASE_TAG_ALLGATHER,
comm, MPI_STATUS_IGNORE);
if (OMPI_SUCCESS != err) {
Expand All @@ -87,12 +89,28 @@ mca_coll_inter_allgather_inter(const void *sbuf, int scount,
}
/* bcast the message to all the local processes */
if ( rcount > 0 ) {
err = comm->c_local_comm->c_coll->coll_bcast(rbuf, rcount*rsize, rdtype,
root, comm->c_local_comm,
comm->c_local_comm->c_coll->coll_bcast_module);
if (OMPI_SUCCESS != err) {
goto exit;
}
if ( OPAL_UNLIKELY(rcount*(size_t)rsize > INT_MAX) ) {
// Sending the message in the coll_bcast as "rcount*rsize" would exceed
// the 'int count' parameter in the coll_bcast() function. Instead broadcast
// the result in "rcount" chunks to the local group.
span = opal_datatype_span(&rdtype->super, rcount, &gap);
for( i = 0; i < rsize; ++i) {
rbuf_ptr = (char*)rbuf + span * (size_t)i;
err = comm->c_local_comm->c_coll->coll_bcast(rbuf_ptr, rcount, rdtype,
root, comm->c_local_comm,
comm->c_local_comm->c_coll->coll_bcast_module);
if (OMPI_SUCCESS != err) {
goto exit;
}
}
} else {
err = comm->c_local_comm->c_coll->coll_bcast(rbuf, rcount*rsize, rdtype,
root, comm->c_local_comm,
comm->c_local_comm->c_coll->coll_bcast_module);
if (OMPI_SUCCESS != err) {
goto exit;
}
}
}

exit:
Expand Down
4 changes: 3 additions & 1 deletion ompi/mca/coll/inter/coll_inter_allgatherv.c
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,7 @@
* Copyright (c) 2006-2010 University of Houston. All rights reserved.
* Copyright (c) 2015-2017 Research Organization for Information Science
* and Technology (RIST). All rights reserved.
* Copyright (c) 2022 IBM Corporation. All rights reserved.
* $COPYRIGHT$
*
* Additional copyrights may follow
Expand Down Expand Up @@ -47,7 +48,8 @@ mca_coll_inter_allgatherv_inter(const void *sbuf, int scount,
struct ompi_communicator_t *comm,
mca_coll_base_module_t *module)
{
int i, rank, size, size_local, total=0, err;
int i, rank, size, size_local, err;
size_t total = 0;
int *count=NULL,*displace=NULL;
char *ptmp_free=NULL, *ptmp=NULL;
ompi_datatype_t *ndtype = NULL;
Expand Down
5 changes: 3 additions & 2 deletions ompi/mca/coll/inter/coll_inter_gather.c
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,7 @@
* Copyright (c) 2006-2007 University of Houston. All rights reserved.
* Copyright (c) 2015-2016 Research Organization for Information Science
* and Technology (RIST). All rights reserved.
* Copyright (c) 2022 IBM Corporation. All rights reserved.
* $COPYRIGHT$
*
* Additional copyrights may follow
Expand Down Expand Up @@ -76,7 +77,7 @@ mca_coll_inter_gather_inter(const void *sbuf, int scount,
comm->c_local_comm->c_coll->coll_gather_module);
if (0 == rank) {
/* First process sends data to the root */
err = MCA_PML_CALL(send(ptmp, scount*size_local, sdtype, root,
err = MCA_PML_CALL(send(ptmp, scount*(size_t)size_local, sdtype, root,
MCA_COLL_BASE_TAG_GATHER,
MCA_PML_BASE_SEND_STANDARD, comm));
if (OMPI_SUCCESS != err) {
Expand All @@ -86,7 +87,7 @@ mca_coll_inter_gather_inter(const void *sbuf, int scount,
free(ptmp_free);
} else {
/* I am the root, loop receiving the data. */
err = MCA_PML_CALL(recv(rbuf, rcount*size, rdtype, 0,
err = MCA_PML_CALL(recv(rbuf, rcount*(size_t)size, rdtype, 0,
MCA_COLL_BASE_TAG_GATHER,
comm, MPI_STATUS_IGNORE));
if (OMPI_SUCCESS != err) {
Expand Down
4 changes: 3 additions & 1 deletion ompi/mca/coll/inter/coll_inter_gatherv.c
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,7 @@
* Copyright (c) 2006-2010 University of Houston. All rights reserved.
* Copyright (c) 2015-2016 Research Organization for Information Science
* and Technology (RIST). All rights reserved.
* Copyright (c) 2022 IBM Corporation. All rights reserved.
* $COPYRIGHT$
*
* Additional copyrights may follow
Expand Down Expand Up @@ -44,7 +45,8 @@ mca_coll_inter_gatherv_inter(const void *sbuf, int scount,
struct ompi_communicator_t *comm,
mca_coll_base_module_t *module)
{
int i, rank, size, size_local, total=0, err;
int i, rank, size, size_local, err;
size_t total = 0;
int *count=NULL, *displace=NULL;
char *ptmp_free=NULL, *ptmp=NULL;
ompi_datatype_t *ndtype;
Expand Down
5 changes: 3 additions & 2 deletions ompi/mca/coll/inter/coll_inter_scatter.c
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,7 @@
* Copyright (c) 2006-2008 University of Houston. All rights reserved.
* Copyright (c) 2015-2016 Research Organization for Information Science
* and Technology (RIST). All rights reserved.
* Copyright (c) 2022 IBM Corporation. All rights reserved.
* $COPYRIGHT$
*
* Additional copyrights may follow
Expand Down Expand Up @@ -69,7 +70,7 @@ mca_coll_inter_scatter_inter(const void *sbuf, int scount,
}
ptmp = ptmp_free - gap;

err = MCA_PML_CALL(recv(ptmp, rcount*size_local, rdtype,
err = MCA_PML_CALL(recv(ptmp, rcount*(size_t)size_local, rdtype,
root, MCA_COLL_BASE_TAG_SCATTER,
comm, MPI_STATUS_IGNORE));
if (OMPI_SUCCESS != err) {
Expand All @@ -86,7 +87,7 @@ mca_coll_inter_scatter_inter(const void *sbuf, int scount,
}
} else {
/* Root sends data to the first process in the remote group */
err = MCA_PML_CALL(send(sbuf, scount*size, sdtype, 0,
err = MCA_PML_CALL(send(sbuf, scount*(size_t)size, sdtype, 0,
MCA_COLL_BASE_TAG_SCATTER,
MCA_PML_BASE_SEND_STANDARD, comm));
if (OMPI_SUCCESS != err) {
Expand Down
4 changes: 3 additions & 1 deletion ompi/mca/coll/inter/coll_inter_scatterv.c
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,7 @@
* Copyright (c) 2006-2010 University of Houston. All rights reserved.
* Copyright (c) 2015-2016 Research Organization for Information Science
* and Technology (RIST). All rights reserved.
* Copyright (c) 2022 IBM Corporation. All rights reserved.
* $COPYRIGHT$
*
* Additional copyrights may follow
Expand Down Expand Up @@ -45,7 +46,8 @@ mca_coll_inter_scatterv_inter(const void *sbuf, const int *scounts,
struct ompi_communicator_t *comm,
mca_coll_base_module_t *module)
{
int i, rank, size, err, total=0, size_local;
int i, rank, size, err, size_local;
size_t total = 0;
int *counts=NULL,*displace=NULL;
char *ptmp_free=NULL, *ptmp=NULL;
ompi_datatype_t *ndtype;
Expand Down

0 comments on commit acbe7b0

Please sign in to comment.