diff --git a/ompi/mca/coll/base/coll_base_alltoall.c b/ompi/mca/coll/base/coll_base_alltoall.c index 9446b8a414d..b3c642c4398 100644 --- a/ompi/mca/coll/base/coll_base_alltoall.c +++ b/ompi/mca/coll/base/coll_base_alltoall.c @@ -3,7 +3,7 @@ * Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana * University Research and Technology * Corporation. All rights reserved. - * Copyright (c) 2004-2017 The University of Tennessee and The University + * Copyright (c) 2004-2021 The University of Tennessee and The University * of Tennessee Research Foundation. All rights * reserved. * Copyright (c) 2004-2005 High Performance Computing Center Stuttgart, @@ -35,88 +35,112 @@ #include "coll_base_topo.h" #include "coll_base_util.h" -/* MPI_IN_PLACE all to all algorithm. TODO: implement a better one. */ +/* + * We want to minimize the amount of temporary memory needed while allowing as many ranks + * to exchange data simultaneously. We use a variation of the ring algorithm, where in a + * single step a process echange the data with both neighbors at distance k (on the left + * and the right on a logical ring topology). With this approach we need to pack the data + * for a single of the two neighbors, as we can then use the original buffer (and datatype + * and count) to send the data to the other. + */ int mca_coll_base_alltoall_intra_basic_inplace(const void *rbuf, int rcount, struct ompi_datatype_t *rdtype, struct ompi_communicator_t *comm, mca_coll_base_module_t *module) { - int i, j, size, rank, err = MPI_SUCCESS, line; - ptrdiff_t ext, gap = 0; + int i, size, rank, left, right, err = MPI_SUCCESS, line; + ptrdiff_t extent; ompi_request_t *req; - char *allocated_buffer = NULL, *tmp_buffer; - size_t max_size; + char *tmp_buffer; + size_t packed_size = 0, max_size; + opal_convertor_t convertor; /* Initialize. */ size = ompi_comm_size(comm); rank = ompi_comm_rank(comm); - /* If only one process, we're done. */ - if (1 == size) { + ompi_datatype_type_size(rdtype, &max_size); + + /* Easy way out */ + if ((1 == size) || (0 == rcount) || (0 == max_size) ) { return MPI_SUCCESS; } - /* Find the largest receive amount */ - ompi_datatype_type_extent (rdtype, &ext); - max_size = opal_datatype_span(&rdtype->super, rcount, &gap); + /* Find the largest amount of packed send/recv data among all peers where + * we need to pack before the send. + */ +#if OPAL_ENABLE_HETEROGENEOUS_SUPPORT + for (i = 1 ; i <= (size >> 1) ; ++i) { + right = (rank + i) % size; + ompi_proc_t *ompi_proc = ompi_comm_peer_lookup(comm, right); + + if( OPAL_UNLIKELY(opal_local_arch != ompi_proc->super.proc_convertor->master->remote_arch)) { + packed_size = opal_datatype_compute_remote_size(&rdtype->super, + ompi_proc->super.proc_convertor->master->remote_sizes); + max_size = packed_size > max_size ? packed_size : max_size; + } + } +#endif /* OPAL_ENABLE_HETEROGENEOUS_SUPPORT */ + max_size *= rcount; - /* Initiate all send/recv to/from others. 
*/ + ompi_datatype_type_extent(rdtype, &extent); /* Allocate a temporary buffer */ - allocated_buffer = calloc (max_size, 1); - if( NULL == allocated_buffer) { err = OMPI_ERR_OUT_OF_RESOURCE; line = __LINE__; goto error_hndl; } - tmp_buffer = allocated_buffer - gap; - max_size = ext * rcount; - - /* in-place alltoall slow algorithm (but works) */ - for (i = 0 ; i < size ; ++i) { - for (j = i+1 ; j < size ; ++j) { - if (i == rank) { - /* Copy the data into the temporary buffer */ - err = ompi_datatype_copy_content_same_ddt (rdtype, rcount, tmp_buffer, - (char *) rbuf + j * max_size); - if (MPI_SUCCESS != err) { line = __LINE__; goto error_hndl; } - - /* Exchange data with the peer */ - err = MCA_PML_CALL(irecv ((char *) rbuf + max_size * j, rcount, rdtype, - j, MCA_COLL_BASE_TAG_ALLTOALL, comm, &req)); - if (MPI_SUCCESS != err) { line = __LINE__; goto error_hndl; } - - err = MCA_PML_CALL(send ((char *) tmp_buffer, rcount, rdtype, - j, MCA_COLL_BASE_TAG_ALLTOALL, MCA_PML_BASE_SEND_STANDARD, - comm)); - if (MPI_SUCCESS != err) { line = __LINE__; goto error_hndl; } - } else if (j == rank) { - /* Copy the data into the temporary buffer */ - err = ompi_datatype_copy_content_same_ddt (rdtype, rcount, tmp_buffer, - (char *) rbuf + i * max_size); - if (MPI_SUCCESS != err) { line = __LINE__; goto error_hndl; } - - /* Exchange data with the peer */ - err = MCA_PML_CALL(irecv ((char *) rbuf + max_size * i, rcount, rdtype, - i, MCA_COLL_BASE_TAG_ALLTOALL, comm, &req)); - if (MPI_SUCCESS != err) { line = __LINE__; goto error_hndl; } - - err = MCA_PML_CALL(send ((char *) tmp_buffer, rcount, rdtype, - i, MCA_COLL_BASE_TAG_ALLTOALL, MCA_PML_BASE_SEND_STANDARD, - comm)); - if (MPI_SUCCESS != err) { line = __LINE__; goto error_hndl; } - } else { - continue; - } - - /* Wait for the requests to complete */ - err = ompi_request_wait ( &req, MPI_STATUSES_IGNORE); - if (MPI_SUCCESS != err) { line = __LINE__; goto error_hndl; } + tmp_buffer = calloc (max_size, 1); + if( NULL == tmp_buffer) { err = OMPI_ERR_OUT_OF_RESOURCE; line = __LINE__; goto error_hndl; } + + for (i = 1 ; i <= (size >> 1) ; ++i) { + struct iovec iov = {.iov_base = tmp_buffer, .iov_len = max_size}; + uint32_t iov_count = 1; + + right = (rank + i) % size; + left = (rank + size - i) % size; + + ompi_proc_t *right_proc = ompi_comm_peer_lookup(comm, right); + opal_convertor_clone(right_proc->super.proc_convertor, &convertor, 0); + opal_convertor_prepare_for_send(&convertor, &rdtype->super, rcount, + (char *) rbuf + right * extent); + packed_size = max_size; + err = opal_convertor_pack(&convertor, &iov, &iov_count, &packed_size); + if (1 != err) { goto error_hndl; } + + /* Receive data from the right */ + err = MCA_PML_CALL(irecv ((char *) rbuf + right * extent, rcount, rdtype, + right, MCA_COLL_BASE_TAG_ALLTOALLW, comm, &req)); + if (MPI_SUCCESS != err) { goto error_hndl; } + + if( left != right ) { + /* Send data to the left */ + err = MCA_PML_CALL(send ((char *) rbuf + left * extent, rcount, rdtype, + left, MCA_COLL_BASE_TAG_ALLTOALLW, MCA_PML_BASE_SEND_STANDARD, + comm)); + if (MPI_SUCCESS != err) { goto error_hndl; } + + err = ompi_request_wait (&req, MPI_STATUSES_IGNORE); + if (MPI_SUCCESS != err) { goto error_hndl; } + + /* Receive data from the left */ + err = MCA_PML_CALL(irecv ((char *) rbuf + left * extent, rcount, rdtype, + left, MCA_COLL_BASE_TAG_ALLTOALLW, comm, &req)); + if (MPI_SUCCESS != err) { goto error_hndl; } } + + /* Send data to the right */ + err = MCA_PML_CALL(send ((char *) tmp_buffer, packed_size, MPI_PACKED, + right, 
MCA_COLL_BASE_TAG_ALLTOALLW, MCA_PML_BASE_SEND_STANDARD, + comm)); + if (MPI_SUCCESS != err) { goto error_hndl; } + + err = ompi_request_wait (&req, MPI_STATUSES_IGNORE); + if (MPI_SUCCESS != err) { goto error_hndl; } } error_hndl: /* Free the temporary buffer */ - if( NULL != allocated_buffer ) - free (allocated_buffer); + if( NULL != tmp_buffer ) + free (tmp_buffer); if( MPI_SUCCESS != err ) { OPAL_OUTPUT((ompi_coll_base_framework.framework_output, diff --git a/ompi/mca/coll/base/coll_base_alltoallv.c b/ompi/mca/coll/base/coll_base_alltoallv.c index 5274de89a42..f51bd577fb7 100644 --- a/ompi/mca/coll/base/coll_base_alltoallv.c +++ b/ompi/mca/coll/base/coll_base_alltoallv.c @@ -3,7 +3,7 @@ * Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana * University Research and Technology * Corporation. All rights reserved. - * Copyright (c) 2004-2017 The University of Tennessee and The University + * Copyright (c) 2004-2021 The University of Tennessee and The University * of Tennessee Research Foundation. All rights * reserved. * Copyright (c) 2004-2005 High Performance Computing Center Stuttgart, @@ -37,85 +37,121 @@ #include "coll_base_topo.h" #include "coll_base_util.h" +/* + * We want to minimize the amount of temporary memory needed while allowing as many ranks + * to exchange data simultaneously. We use a variation of the ring algorithm, where in a + * single step a process echange the data with both neighbors at distance k (on the left + * and the right on a logical ring topology). With this approach we need to pack the data + * for a single of the two neighbors, as we can then use the original buffer (and datatype + * and count) to send the data to the other. + */ int mca_coll_base_alltoallv_intra_basic_inplace(const void *rbuf, const int *rcounts, const int *rdisps, struct ompi_datatype_t *rdtype, struct ompi_communicator_t *comm, mca_coll_base_module_t *module) { - int i, j, size, rank, err=MPI_SUCCESS; - char *allocated_buffer, *tmp_buffer; - size_t max_size; - ptrdiff_t ext, gap = 0; + int i, size, rank, left, right, err = MPI_SUCCESS, line; + ompi_request_t *req; + char *tmp_buffer; + size_t packed_size = 0, max_size; + opal_convertor_t convertor; /* Initialize. */ size = ompi_comm_size(comm); rank = ompi_comm_rank(comm); - /* If only one process, we're done. */ - if (1 == size) { + ompi_datatype_type_size(rdtype, &max_size); + max_size *= rcounts[rank]; + + /* Easy way out */ + if ((1 == size) || (0 == max_size) ) { return MPI_SUCCESS; } - /* Find the largest receive amount */ - ompi_datatype_type_extent (rdtype, &ext); - for (i = 0, max_size = 0 ; i < size ; ++i) { - if (i == rank) { - continue; - } - size_t cur_size = opal_datatype_span(&rdtype->super, rcounts[i], &gap); - max_size = cur_size > max_size ? cur_size : max_size; - } - /* The gap will always be the same as we are working on the same datatype */ - if (OPAL_UNLIKELY(0 == max_size)) { - return MPI_SUCCESS; + /* Find the largest amount of packed send/recv data among all peers where + * we need to pack before the send. + */ +#if OPAL_ENABLE_HETEROGENEOUS_SUPPORT + for (i = 1 ; i <= (size >> 1) ; ++i) { + right = (rank + i) % size; + ompi_proc_t *ompi_proc = ompi_comm_peer_lookup(comm, right); + + if( OPAL_UNLIKELY(opal_local_arch != ompi_proc->super.proc_convertor->master->remote_arch)) { + packed_size = opal_datatype_compute_remote_size(&rdtype->super, + ompi_proc->super.proc_convertor->master->remote_sizes); + packed_size *= rcounts[right]; + max_size = packed_size > max_size ? 
packed_size : max_size; + } } +#endif /* OPAL_ENABLE_HETEROGENEOUS_SUPPORT */ /* Allocate a temporary buffer */ - allocated_buffer = calloc (max_size, 1); - if (NULL == allocated_buffer) { - return OMPI_ERR_OUT_OF_RESOURCE; - } - tmp_buffer = allocated_buffer - gap; - - /* Initiate all send/recv to/from others. */ - /* in-place alltoallv slow algorithm (but works) */ - for (i = 0 ; i < size ; ++i) { - for (j = i+1 ; j < size ; ++j) { - if (i == rank && 0 != rcounts[j]) { - /* Copy the data into the temporary buffer */ - err = ompi_datatype_copy_content_same_ddt (rdtype, rcounts[j], - tmp_buffer, (char *) rbuf + rdisps[j] * ext); - if (MPI_SUCCESS != err) { goto error_hndl; } - - /* Exchange data with the peer */ - err = ompi_coll_base_sendrecv_actual((void *) tmp_buffer, rcounts[j], rdtype, - j, MCA_COLL_BASE_TAG_ALLTOALLV, - (char *)rbuf + rdisps[j] * ext, rcounts[j], rdtype, - j, MCA_COLL_BASE_TAG_ALLTOALLV, - comm, MPI_STATUS_IGNORE); - if (MPI_SUCCESS != err) { goto error_hndl; } - } else if (j == rank && 0 != rcounts[i]) { - /* Copy the data into the temporary buffer */ - err = ompi_datatype_copy_content_same_ddt (rdtype, rcounts[i], - tmp_buffer, (char *) rbuf + rdisps[i] * ext); - if (MPI_SUCCESS != err) { goto error_hndl; } - - /* Exchange data with the peer */ - err = ompi_coll_base_sendrecv_actual((void *) tmp_buffer, rcounts[i], rdtype, - i, MCA_COLL_BASE_TAG_ALLTOALLV, - (char *) rbuf + rdisps[i] * ext, rcounts[i], rdtype, - i, MCA_COLL_BASE_TAG_ALLTOALLV, - comm, MPI_STATUS_IGNORE); - if (MPI_SUCCESS != err) { goto error_hndl; } - } + tmp_buffer = calloc (max_size, 1); + if( NULL == tmp_buffer) { err = OMPI_ERR_OUT_OF_RESOURCE; line = __LINE__; goto error_hndl; } + + for (i = 1 ; i <= (size >> 1) ; ++i) { + struct iovec iov = {.iov_base = tmp_buffer, .iov_len = max_size}; + uint32_t iov_count = 1; + + right = (rank + i) % size; + left = (rank + size - i) % size; + + if( 0 != rcounts[right] ) { /* nothing to exchange with the peer on the right */ + ompi_proc_t *right_proc = ompi_comm_peer_lookup(comm, right); + opal_convertor_clone(right_proc->super.proc_convertor, &convertor, 0); + opal_convertor_prepare_for_send(&convertor, &rdtype->super, rcounts[right], + (char *) rbuf + rdisps[right]); + packed_size = max_size; + err = opal_convertor_pack(&convertor, &iov, &iov_count, &packed_size); + if (1 != err) { goto error_hndl; } + + /* Receive data from the right */ + err = MCA_PML_CALL(irecv ((char *) rbuf + rdisps[right], rcounts[right], rdtype, + right, MCA_COLL_BASE_TAG_ALLTOALLW, comm, &req)); + if (MPI_SUCCESS != err) { goto error_hndl; } + } + + if( (left != right) && (0 != rcounts[left]) ) { + /* Send data to the left */ + err = MCA_PML_CALL(send ((char *) rbuf + rdisps[left], rcounts[left], rdtype, + left, MCA_COLL_BASE_TAG_ALLTOALLW, MCA_PML_BASE_SEND_STANDARD, + comm)); + if (MPI_SUCCESS != err) { goto error_hndl; } + + err = ompi_request_wait (&req, MPI_STATUSES_IGNORE); + if (MPI_SUCCESS != err) { goto error_hndl; } + + /* Receive data from the left */ + err = MCA_PML_CALL(irecv ((char *) rbuf + rdisps[left], rcounts[left], rdtype, + left, MCA_COLL_BASE_TAG_ALLTOALLW, comm, &req)); + if (MPI_SUCCESS != err) { goto error_hndl; } + } + + if( 0 != rcounts[right] ) { /* nothing to exchange with the peer on the right */ + /* Send data to the right */ + err = MCA_PML_CALL(send ((char *) tmp_buffer, packed_size, MPI_PACKED, + right, MCA_COLL_BASE_TAG_ALLTOALLW, MCA_PML_BASE_SEND_STANDARD, + comm)); + if (MPI_SUCCESS != err) { goto error_hndl; } + + err = ompi_request_wait 
(&req, MPI_STATUSES_IGNORE); + if (MPI_SUCCESS != err) { goto error_hndl; } } } error_hndl: /* Free the temporary buffer */ - free (allocated_buffer); + if( NULL != tmp_buffer ) + free (tmp_buffer); + + if( MPI_SUCCESS != err ) { + OPAL_OUTPUT((ompi_coll_base_framework.framework_output, + "%s:%4d\tError occurred %d, rank %2d", __FILE__, line, err, + rank)); + (void)line; // silence compiler warning + } /* All done */ return err; diff --git a/ompi/mca/coll/basic/coll_basic_alltoallw.c b/ompi/mca/coll/basic/coll_basic_alltoallw.c index 93fa880fc2d..50ec9c9774b 100644 --- a/ompi/mca/coll/basic/coll_basic_alltoallw.c +++ b/ompi/mca/coll/basic/coll_basic_alltoallw.c @@ -3,7 +3,7 @@ * Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana * University Research and Technology * Corporation. All rights reserved. - * Copyright (c) 2004-2016 The University of Tennessee and The University + * Copyright (c) 2004-2021 The University of Tennessee and The University * of Tennessee Research Foundation. All rights * reserved. * Copyright (c) 2004-2005 High Performance Computing Center Stuttgart, @@ -14,8 +14,8 @@ * Copyright (c) 2013 Los Alamos National Security, LLC. All rights * reserved. * Copyright (c) 2013 FUJITSU LIMITED. All rights reserved. - * Copyright (c) 2014-2016 Research Organization for Information Science - * and Technology (RIST). All rights reserved. + * Copyright (c) 2014-2021 Research Organization for Information Science + * and Technology (RIST). All rights reserved. * Copyright (c) 2014 Cisco Systems, Inc. All rights reserved. * Copyright (c) 2017 IBM Corporation. All rights reserved. * $COPYRIGHT$ @@ -31,92 +31,115 @@ #include "mpi.h" #include "ompi/constants.h" #include "ompi/datatype/ompi_datatype.h" +#include "opal/datatype/opal_convertor_internal.h" #include "ompi/mca/coll/coll.h" #include "ompi/mca/coll/base/coll_tags.h" #include "ompi/mca/pml/pml.h" - +/* + * We want to minimize the amount of temporary memory needed while allowing as many ranks + * to exchange data simultaneously. We use a variation of the ring algorithm, where in a + * single step a process echange the data with both neighbors at distance k (on the left + * and the right on a logical ring topology). With this approach we need to pack the data + * for a single of the two neighbors, as we can then use the original buffer (and datatype + * and count) to send the data to the other. + */ static int mca_coll_basic_alltoallw_intra_inplace(const void *rbuf, const int *rcounts, const int *rdisps, struct ompi_datatype_t * const *rdtypes, struct ompi_communicator_t *comm, mca_coll_base_module_t *module) { - int i, j, size, rank, err = MPI_SUCCESS, max_size; - ompi_request_t *req; - char *tmp_buffer, *save_buffer = NULL; - ptrdiff_t ext, gap = 0; - - /* Initialize. */ + int i, size, rank, left, right, err = MPI_SUCCESS; + ompi_request_t *req = MPI_REQUEST_NULL; + char *tmp_buffer = NULL; + size_t max_size = 0, packed_size, msg_size_left, msg_size_right; + opal_convertor_t convertor; size = ompi_comm_size(comm); - rank = ompi_comm_rank(comm); - - /* If only one process, we're done. */ - if (1 == size) { + if (1 == size) { /* If only one process, we're done. */ return MPI_SUCCESS; } + rank = ompi_comm_rank(comm); - /* Find the largest receive amount */ - for (i = 0, max_size = 0 ; i < size ; ++i) { - ext = opal_datatype_span(&rdtypes[i]->super, rcounts[i], &gap); - - max_size = ext > max_size ? 
ext : max_size; + /* Find the largest amount of packed send/recv data among all peers where + * we need to pack before the send. + */ + for (i = 1 ; i <= (size >> 1) ; ++i) { + right = (rank + i) % size; +#if OPAL_ENABLE_HETEROGENEOUS_SUPPORT + ompi_proc_t *ompi_proc = ompi_comm_peer_lookup(comm, right); + + if( OPAL_LIKELY(opal_local_arch == ompi_proc->super.proc_convertor->master->remote_arch)) { + opal_datatype_type_size(&rdtypes[right]->super, &packed_size); + } else { + packed_size = opal_datatype_compute_remote_size(&rdtypes[right]->super, + ompi_proc->super.proc_convertor->master->remote_sizes); + } +#else + opal_datatype_type_size(&rdtypes[right]->super, &packed_size); +#endif /* OPAL_ENABLE_HETEROGENEOUS_SUPPORT */ + packed_size *= rcounts[right]; + max_size = packed_size > max_size ? packed_size : max_size; } /* Allocate a temporary buffer */ - tmp_buffer = save_buffer = calloc (max_size, 1); + tmp_buffer = calloc (max_size, 1); if (NULL == tmp_buffer) { return OMPI_ERR_OUT_OF_RESOURCE; } - tmp_buffer -= gap; - - /* in-place alltoallw slow algorithm (but works) */ - for (i = 0 ; i < size ; ++i) { - size_t msg_size_i; - ompi_datatype_type_size(rdtypes[i], &msg_size_i); - msg_size_i *= rcounts[i]; - for (j = i+1 ; j < size ; ++j) { - size_t msg_size_j; - ompi_datatype_type_size(rdtypes[j], &msg_size_j); - msg_size_j *= rcounts[j]; - - /* Initiate all send/recv to/from others. */ - if (i == rank && msg_size_j != 0) { - /* Copy the data into the temporary buffer */ - err = ompi_datatype_copy_content_same_ddt (rdtypes[j], rcounts[j], - tmp_buffer, (char *) rbuf + rdisps[j]); - if (MPI_SUCCESS != err) { goto error_hndl; } - - /* Exchange data with the peer */ - err = MCA_PML_CALL(irecv ((char *) rbuf + rdisps[j], rcounts[j], rdtypes[j], - j, MCA_COLL_BASE_TAG_ALLTOALLW, comm, &req)); - if (MPI_SUCCESS != err) { goto error_hndl; } - - err = MCA_PML_CALL(send ((void *) tmp_buffer, rcounts[j], rdtypes[j], - j, MCA_COLL_BASE_TAG_ALLTOALLW, MCA_PML_BASE_SEND_STANDARD, - comm)); - if (MPI_SUCCESS != err) { goto error_hndl; } - } else if (j == rank && msg_size_i != 0) { - /* Copy the data into the temporary buffer */ - err = ompi_datatype_copy_content_same_ddt (rdtypes[i], rcounts[i], - tmp_buffer, (char *) rbuf + rdisps[i]); - if (MPI_SUCCESS != err) { goto error_hndl; } - - /* Exchange data with the peer */ - err = MCA_PML_CALL(irecv ((char *) rbuf + rdisps[i], rcounts[i], rdtypes[i], - i, MCA_COLL_BASE_TAG_ALLTOALLW, comm, &req)); - if (MPI_SUCCESS != err) { goto error_hndl; } - - err = MCA_PML_CALL(send ((void *) tmp_buffer, rcounts[i], rdtypes[i], - i, MCA_COLL_BASE_TAG_ALLTOALLW, MCA_PML_BASE_SEND_STANDARD, - comm)); - if (MPI_SUCCESS != err) { goto error_hndl; } - } else { - continue; - } - - /* Wait for the requests to complete */ + + for (i = 1 ; i <= (size >> 1) ; ++i) { + struct iovec iov = {.iov_base = tmp_buffer, .iov_len = max_size}; + uint32_t iov_count = 1; + + right = (rank + i) % size; + left = (rank + size - i) % size; + + ompi_datatype_type_size(rdtypes[right], &msg_size_right); + msg_size_right *= rcounts[right]; + + ompi_datatype_type_size(rdtypes[left], &msg_size_left); + msg_size_left *= rcounts[left]; + + if( 0 != msg_size_right ) { /* nothing to exchange with the peer on the right */ + ompi_proc_t *right_proc = ompi_comm_peer_lookup(comm, right); + opal_convertor_clone(right_proc->super.proc_convertor, &convertor, 0); + opal_convertor_prepare_for_send(&convertor, &rdtypes[right]->super, rcounts[right], + (char *) rbuf + rdisps[right]); + packed_size = max_size; + err 
= opal_convertor_pack(&convertor, &iov, &iov_count, &packed_size); + if (1 != err) { goto error_hndl; } + + /* Receive data from the right */ + err = MCA_PML_CALL(irecv ((char *) rbuf + rdisps[right], rcounts[right], rdtypes[right], + right, MCA_COLL_BASE_TAG_ALLTOALLW, comm, &req)); + if (MPI_SUCCESS != err) { goto error_hndl; } + } + + if( (left != right) && (0 != msg_size_left) ) { + /* Send data to the left */ + err = MCA_PML_CALL(send ((char *) rbuf + rdisps[left], rcounts[left], rdtypes[left], + left, MCA_COLL_BASE_TAG_ALLTOALLW, MCA_PML_BASE_SEND_STANDARD, + comm)); + if (MPI_SUCCESS != err) { goto error_hndl; } + + err = ompi_request_wait (&req, MPI_STATUSES_IGNORE); + if (MPI_SUCCESS != err) { goto error_hndl; } + + /* Receive data from the left */ + err = MCA_PML_CALL(irecv ((char *) rbuf + rdisps[left], rcounts[left], rdtypes[left], + left, MCA_COLL_BASE_TAG_ALLTOALLW, comm, &req)); + if (MPI_SUCCESS != err) { goto error_hndl; } + } + + if( 0 != msg_size_right ) { /* nothing to exchange with the peer on the right */ + /* Send data to the right */ + err = MCA_PML_CALL(send ((char *) tmp_buffer, packed_size, MPI_PACKED, + right, MCA_COLL_BASE_TAG_ALLTOALLW, MCA_PML_BASE_SEND_STANDARD, + comm)); + if (MPI_SUCCESS != err) { goto error_hndl; } + err = ompi_request_wait (&req, MPI_STATUSES_IGNORE); if (MPI_SUCCESS != err) { goto error_hndl; } } @@ -124,7 +147,7 @@ mca_coll_basic_alltoallw_intra_inplace(const void *rbuf, const int *rcounts, con error_hndl: /* Free the temporary buffer */ - free (save_buffer); + free (tmp_buffer); /* All done */ diff --git a/opal/datatype/opal_convertor.c b/opal/datatype/opal_convertor.c index e08265b42bc..365ddc45a1e 100644 --- a/opal/datatype/opal_convertor.c +++ b/opal/datatype/opal_convertor.c @@ -455,29 +455,6 @@ int32_t opal_convertor_set_position_nocheck(opal_convertor_t *convertor, size_t return rc; } -static size_t opal_datatype_compute_remote_size(const opal_datatype_t *pData, const size_t *sizes) -{ - uint32_t typeMask = pData->bdt_used; - size_t length = 0; - - if (opal_datatype_is_predefined(pData)) { - return sizes[pData->desc.desc->elem.common.type]; - } - - if (OPAL_UNLIKELY(NULL == pData->ptypes)) { - /* Allocate and fill the array of types used in the datatype description */ - opal_datatype_compute_ptypes((opal_datatype_t *) pData); - } - - for (int i = OPAL_DATATYPE_FIRST_TYPE; typeMask && (i < OPAL_DATATYPE_MAX_PREDEFINED); i++) { - if (typeMask & ((uint32_t) 1 << i)) { - length += (pData->ptypes[i] * sizes[i]); - typeMask ^= ((uint32_t) 1 << i); - } - } - return length; -} - /** * Compute the remote size. 
If necessary remove the homogeneous flag * and redirect the convertor description toward the non-optimized @@ -496,9 +473,9 @@ size_t opal_convertor_compute_remote_size(opal_convertor_t *pConvertor) } if (0 == (pConvertor->flags & CONVERTOR_HAS_REMOTE_SIZE)) { /* This is for a single datatype, we must update it with the count */ - pConvertor->remote_size = opal_datatype_compute_remote_size(datatype, - pConvertor->master - ->remote_sizes); + pConvertor->remote_size = + opal_datatype_compute_remote_size(datatype, + pConvertor->master->remote_sizes); pConvertor->remote_size *= pConvertor->count; } } diff --git a/opal/datatype/opal_datatype.h b/opal/datatype/opal_datatype.h index 7dabd1742c0..5f7fc53fa7d 100644 --- a/opal/datatype/opal_datatype.h +++ b/opal/datatype/opal_datatype.h @@ -311,6 +311,15 @@ OPAL_DECLSPEC int32_t opal_datatype_copy_content_same_ddt(const opal_datatype_t OPAL_DECLSPEC int opal_datatype_compute_ptypes(opal_datatype_t *datatype); +/* + * Compute the size of the datatype using a specific set of predefined type sizes. + * This function allows to compute the size of a packed buffer without creating + * a fully fledged specialized convertor for the remote peer. + */ +OPAL_DECLSPEC size_t +opal_datatype_compute_remote_size(const opal_datatype_t *pData, + const size_t *sizes); + /* Compute the span in memory of count datatypes. This function help with temporary * memory allocations for receiving already typed data (such as those used for reduce * operations). This span is the distance between the minimum and the maximum byte diff --git a/opal/datatype/opal_datatype_get_count.c b/opal/datatype/opal_datatype_get_count.c index 202601d97a2..fed344d1bbd 100644 --- a/opal/datatype/opal_datatype_get_count.c +++ b/opal/datatype/opal_datatype_get_count.c @@ -223,3 +223,27 @@ int opal_datatype_compute_ptypes(opal_datatype_t *datatype) } } } + +size_t opal_datatype_compute_remote_size(const opal_datatype_t *pData, const size_t *sizes) +{ + uint32_t typeMask = pData->bdt_used; + size_t length = 0; + + if (opal_datatype_is_predefined(pData)) { + return sizes[pData->desc.desc->elem.common.type]; + } + + if (OPAL_UNLIKELY(NULL == pData->ptypes)) { + /* Allocate and fill the array of types used in the datatype description */ + opal_datatype_compute_ptypes((opal_datatype_t *) pData); + } + + for (int i = OPAL_DATATYPE_FIRST_TYPE; typeMask && (i < OPAL_DATATYPE_MAX_PREDEFINED); i++) { + if (typeMask & ((uint32_t) 1 << i)) { + length += (pData->ptypes[i] * sizes[i]); + typeMask ^= ((uint32_t) 1 << i); + } + } + return length; +} +
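The three in-place implementations patched above all follow the same pairwise ring schedule, expressed through OMPI's internal PML and convertor interfaces. Below is a minimal standalone sketch of that schedule using only the public MPI API, to make the pairing and the single-pack-per-step idea easier to follow; the function name, tag value, and error handling are illustrative assumptions and are not part of this patch.

/*
 * Sketch only: the pairwise ring schedule used by the in-place paths above,
 * restated with the public MPI API instead of OMPI's internal PML and
 * convertor calls.  At step k every rank exchanges blocks with the peers at
 * distance k on the logical ring: the block owned by the "right" peer is
 * packed into a temporary buffer so its slot can receive first, while the
 * block for the "left" peer is sent straight from the user buffer before the
 * matching receive overwrites it.  TAG and the function name are made up.
 */
#include <mpi.h>
#include <stdlib.h>

#define TAG 0x4a11  /* illustrative tag, not OMPI's internal collective tag */

static int alltoall_inplace_ring_sketch(void *rbuf, int count,
                                        MPI_Datatype dtype, MPI_Comm comm)
{
    int size, rank, packed_size;
    MPI_Aint lb, extent;

    MPI_Comm_size(comm, &size);
    MPI_Comm_rank(comm, &rank);
    MPI_Type_get_extent(dtype, &lb, &extent);
    MPI_Pack_size(count, dtype, comm, &packed_size);

    if (1 == size || 0 == count || 0 == packed_size) {  /* easy way out */
        return MPI_SUCCESS;
    }

    char *tmp = malloc((size_t)packed_size);  /* room for a single packed block */
    if (NULL == tmp) {
        return MPI_ERR_NO_MEM;
    }

    for (int k = 1; k <= size / 2; k++) {
        int right = (rank + k) % size;
        int left  = (rank + size - k) % size;
        char *right_block = (char *)rbuf + (MPI_Aint)right * count * extent;
        char *left_block  = (char *)rbuf + (MPI_Aint)left  * count * extent;
        MPI_Request req;
        int pos = 0;

        /* Pack the block that belongs to the right peer, freeing its slot. */
        MPI_Pack(right_block, count, dtype, tmp, packed_size, &pos, comm);

        /* Receive the right peer's contribution into that slot. */
        MPI_Irecv(right_block, count, dtype, right, TAG, comm, &req);

        if (left != right) {  /* when size is even, the last step has left == right */
            /* Send to the left directly from the user buffer ... */
            MPI_Send(left_block, count, dtype, left, TAG, comm);
            MPI_Wait(&req, MPI_STATUS_IGNORE);
            /* ... then let the left peer's contribution land in place. */
            MPI_Irecv(left_block, count, dtype, left, TAG, comm, &req);
        }

        /* Ship the packed copy to the right peer and finish the step. */
        MPI_Send(tmp, pos, MPI_PACKED, right, TAG, comm);
        MPI_Wait(&req, MPI_STATUS_IGNORE);
    }

    free(tmp);
    return MPI_SUCCESS;
}

Each peer is visited exactly once because step k covers both distance k and distance size-k, which is also why the temporary buffer only ever needs to hold one packed block.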
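Where the heterogeneous branches above size that pack buffer, they rely on opal_datatype_compute_remote_size(), whose body is moved into opal_datatype_get_count.c in the last hunk. The following is a hypothetical toy model of that computation, with made-up type tables standing in for pData->ptypes[] and master->remote_sizes[]; it is not the OPAL API.

/*
 * Toy model (not the OPAL API): for every predefined type used by the
 * datatype, multiply the number of occurrences by the peer's size for that
 * type.  The callers in this patch then multiply the result by the count.
 */
#include <stddef.h>
#include <stdio.h>

enum { TYPE_INT, TYPE_DOUBLE, TYPE_MAX };  /* stand-ins for OPAL_DATATYPE_* ids */

static size_t remote_size(const size_t occurrences[TYPE_MAX],
                          const size_t remote_sizes[TYPE_MAX])
{
    size_t length = 0;
    for (int i = 0; i < TYPE_MAX; i++) {
        length += occurrences[i] * remote_sizes[i];
    }
    return length;
}

int main(void)
{
    /* A datatype made of 3 ints and 2 doubles ... */
    const size_t occurrences[TYPE_MAX]  = { [TYPE_INT] = 3, [TYPE_DOUBLE] = 2 };
    /* ... packed for a peer where ints are 8 bytes and doubles 16 bytes. */
    const size_t remote_sizes[TYPE_MAX] = { [TYPE_INT] = 8, [TYPE_DOUBLE] = 16 };

    /* 3*8 + 2*16 = 56 bytes per element of the datatype. */
    printf("packed size for one element: %zu bytes\n",
           remote_size(occurrences, remote_sizes));
    return 0;
}

The point is that the pack-buffer size for a heterogeneous peer reduces to a per-type weighted sum, so the collectives can size their temporary buffer without instantiating a full convertor for that peer.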