Skip to content

Commit

Permalink
Manage errors in NBC collective ops
Browse files (browse the repository at this point in the history)
Signed-off-by: Aurelien Bouteiller <bouteill@icl.utk.edu>

Correctly bubble up errors in NBC collective operations

Signed-off-by: Aurelien Bouteiller <bouteill@icl.utk.edu>

The error field of a request needs to be re-armed (reset to OMPI_SUCCESS) when the request is started, not when it is created

Signed-off-by: Aurelien Bouteiller <bouteill@icl.utk.edu>
  • Loading branch information
abouteiller committed Nov 15, 2018
1 parent 37954b5 commit 65660e5
Showing 1 changed file with 29 additions and 10 deletions.
39 changes: 29 additions & 10 deletions ompi/mca/coll/libnbc/nbc.c
Original file line number | Diff line number | Diff line change
Expand Up @@ -3,7 +3,7 @@
* Copyright (c) 2006 The Trustees of Indiana University and Indiana
* University Research and Technology
* Corporation. All rights reserved.
* Copyright (c) 2013 The University of Tennessee and The University
* Copyright (c) 2013-2018 The University of Tennessee and The University
* of Tennessee Research Foundation. All rights
* reserved.
* Copyright (c) 2006 The Technical University of Chemnitz. All
Expand Down Expand Up @@ -335,8 +335,14 @@ int NBC_Progress(NBC_Handle *handle) {
while (handle->req_count) {
ompi_request_t *subreq = handle->req_array[handle->req_count - 1];
if (REQUEST_COMPLETE(subreq)) {
ompi_request_free(&subreq);
if(OPAL_UNLIKELY( OMPI_SUCCESS != subreq->req_status.MPI_ERROR )) {
NBC_Error ("MPI Error in NBC subrequest %p : %d", subreq, subreq->req_status.MPI_ERROR);
/* copy the error code from the underlying request and let the
* round finish */
handle->super.req_status.MPI_ERROR = subreq->req_status.MPI_ERROR;
}
handle->req_count--;
ompi_request_free(&subreq);
} else {
flag = false;
break;
Expand All @@ -349,6 +355,26 @@ int NBC_Progress(NBC_Handle *handle) {

/* a round is finished */
if (flag) {
/* reset handle for next round */
if (NULL != handle->req_array) {
/* free request array */
free (handle->req_array);
handle->req_array = NULL;
}

handle->req_count = 0;

/* previous round had an error */
if (OPAL_UNLIKELY(OMPI_SUCCESS != handle->super.req_status.MPI_ERROR)) {
res = handle->super.req_status.MPI_ERROR;
NBC_Error("NBC_Progress: an error %d was found during schedule %p at row-offset %li - aborting the schedule\n", res, handle->schedule, handle->row_offset);
handle->nbc_complete = true;
if (!handle->super.req_persistent) {
NBC_Free(handle);
}
return res;
}

/* adjust delim to start of current round */
NBC_DEBUG(5, "NBC_Progress: going in schedule %p to row-offset: %li\n", handle->schedule, handle->row_offset);
delim = handle->schedule->data + handle->row_offset;
Expand All @@ -358,14 +384,6 @@ int NBC_Progress(NBC_Handle *handle) {
/* adjust delim to end of current round -> delimiter */
delim = delim + size;

if (NULL != handle->req_array) {
/* free request array */
free (handle->req_array);
handle->req_array = NULL;
}

handle->req_count = 0;

if (*delim == 0) {
/* this was the last round - we're done */
NBC_DEBUG(5, "NBC_Progress last round finished - we're done\n");
Expand Down Expand Up @@ -638,6 +656,7 @@ int NBC_Start(NBC_Handle *handle) {

/* kick off first round */
handle->super.req_state = OMPI_REQUEST_ACTIVE;
handle->super.req_status.MPI_ERROR = OMPI_SUCCESS;
res = NBC_Start_round(handle);
if (OPAL_UNLIKELY(OMPI_SUCCESS != res)) {
return res;
Expand Down

0 comments on commit 65660e5

Please sign in to comment.