Skip to content

Commit

Permalink
Add Fenix_Process_detect_failures
Browse files Browse the repository at this point in the history
  • Loading branch information
Matthew-Whitlock committed Aug 27, 2024
1 parent 09f2885 commit e2cc897
Show file tree
Hide file tree
Showing 5 changed files with 31 additions and 0 deletions.
2 changes: 2 additions & 0 deletions include/fenix.h
Original file line number Diff line number Diff line change
Expand Up @@ -231,6 +231,8 @@ int Fenix_Process_fail_list(int** fail_list);

int Fenix_check_cancelled(MPI_Request *request, MPI_Status *status);

//Polls for failures by testing an internal receive request that Fenix keeps
//outstanding on its resilient communicator. Forwards to __fenix_detect_failures.
//  do_recovery: for the duration of the check, Fenix's ignore_errs flag is set
//               to !do_recovery and then restored. Presumably nonzero lets
//               Fenix's normal recovery run on a detected failure -- confirm
//               against the error handler.
//  Returns FENIX_ERROR_UNINITIALIZED if the resilient communicator does not
//  exist, FENIX_ERROR_INTERN if the internal receive unexpectedly completed,
//  and otherwise the return code of the underlying MPI_Test call.
int Fenix_Process_detect_failures(int do_recovery);

#if defined(c_plusplus) || defined(__cplusplus)
}
#endif
Expand Down
3 changes: 3 additions & 0 deletions include/fenix_ext.h
Original file line number Diff line number Diff line change
Expand Up @@ -96,6 +96,9 @@ typedef struct {
//Manage state of the comms. Necessary when failures happen rapidly, mussing up state
int new_world_exists, user_world_exists;

int dummy_recv_buffer;
MPI_Request check_failures_req;


MPI_Op agree_op; // This is reserved for the global agreement call for Fenix data recovery API

Expand Down
2 changes: 2 additions & 0 deletions include/fenix_process_recovery.h
Original file line number Diff line number Diff line change
Expand Up @@ -118,6 +118,8 @@ void __fenix_set_rank_role(int FenixRankRole);

void __fenix_postinit(int *);

//Internal implementation behind Fenix_Process_detect_failures: tests the
//outstanding fenix.check_failures_req request, with fenix.ignore_errs
//temporarily set to !do_recovery. Returns FENIX_ERROR_UNINITIALIZED when
//fenix.new_world_exists is false, FENIX_ERROR_INTERN if the request
//completed, and otherwise the MPI_Test return code.
int __fenix_detect_failures(int do_recovery);

void __fenix_finalize();

void __fenix_finalize_spare();
Expand Down
4 changes: 4 additions & 0 deletions src/fenix.c
Original file line number Diff line number Diff line change
Expand Up @@ -209,3 +209,7 @@ int Fenix_check_cancelled(MPI_Request *request, MPI_Status *status){
//Request was (potentially) cancelled if ret is MPI_ERR_PROC_FAILED
return ret == MPI_ERR_PROC_FAILED || ret == MPI_ERR_REVOKED;
}

//Public entry point for on-demand failure detection. All of the work is done
//by the internal __fenix_detect_failures routine; its status code is handed
//back to the caller unmodified.
int Fenix_Process_detect_failures(int do_recovery){
    const int detect_status = __fenix_detect_failures(do_recovery);
    return detect_status;
}
20 changes: 20 additions & 0 deletions src/fenix_process_recovery.c
Original file line number Diff line number Diff line change
Expand Up @@ -686,6 +686,11 @@ void __fenix_postinit(int *error)
// fenix.role);
//}

if(fenix.new_world_exists){
//Set up dummy irecv to use for checking for failures.
MPI_Irecv(&fenix.dummy_recv_buffer, 1, MPI_INT, MPI_ANY_SOURCE,
34095347, fenix.new_world, &fenix.check_failures_req);
}

if (fenix.repair_result != 0) {
*error = fenix.repair_result;
Expand All @@ -707,6 +712,21 @@ void __fenix_postinit(int *error)
}
}

//Polls for failures by testing the internal receive request
//fenix.check_failures_req on the resilient communicator.
//
//  do_recovery: while the test runs, fenix.ignore_errs is set to !do_recovery
//               and restored afterwards. Presumably the Fenix error handler
//               consults this flag to decide whether to start recovery --
//               confirm against the handler.
//
//  Returns FENIX_ERROR_UNINITIALIZED if the resilient communicator does not
//  exist, FENIX_ERROR_INTERN if the internal receive completed (nothing is
//  expected to ever match it), and otherwise the MPI_Test return code.
int __fenix_detect_failures(int do_recovery){
    if(!fenix.new_world_exists) return FENIX_ERROR_UNINITIALIZED;

    //Override the error-ignoring policy only for the duration of this check.
    int old_ignore_errs = fenix.ignore_errs;
    fenix.ignore_errs = !do_recovery;

    //Must start at 0: MPI does not promise to write the flag when MPI_Test
    //returns an error, which is exactly the case this function looks for.
    //Without the initializer, an indeterminate flag could replace the MPI
    //error code with FENIX_ERROR_INTERN below.
    int req_completed = 0;
    int ret = MPI_Test(&fenix.check_failures_req, &req_completed, MPI_STATUS_IGNORE);

    //The dummy receive should never be matched; completion means something
    //went wrong internally.
    if(req_completed) ret = FENIX_ERROR_INTERN;

    fenix.ignore_errs = old_ignore_errs;
    return ret;
}

void __fenix_finalize()
{
int location = FENIX_FINALIZE_LOC;
Expand Down

0 comments on commit e2cc897

Please sign in to comment.