Skip to content

Commit

Permalink
Travis fixes (sandialabs#55)
Browse files Browse the repository at this point in the history
Fix some travis/testing issues.

Travis now pulls from ULFM master branch when it needs to rebuild ULFM.
Travis now sets an environment variable (OMPI_MCA_rmaps_base_oversubscribe) that enables oversubscription during the tests, instead of every test passing --oversubscribe to mpirun on all platforms when running make test
Tests that involve process failure each set the failure-detector timeout (mpirun -mca mpi_ft_detector_timeout) to 1, so they no longer take 10+ seconds each with the default timeout of 10s
Simplified travis scripts (no more .travis_helpers directory)
  • Loading branch information
Matthew-Whitlock committed Jun 29, 2020
1 parent 5fb81d7 commit a41fd3b
Show file tree
Hide file tree
Showing 21 changed files with 636 additions and 119 deletions.
62 changes: 46 additions & 16 deletions .travis.yml
Original file line number Diff line number Diff line change
Expand Up @@ -9,24 +9,54 @@ addons:
- valgrind
cache:
directories:
- .travis_helpers/ulfm-install
- ulfm-install
before_install:
- cd .travis_helpers
- source ./fetchULFMmpi.sh
- cd ../ #Always end back at the root directory
- echo "Configuring ULFM"
- if [ -f ulfm-install/lib/libmpi.so ]; then
echo "libmpi.so found -- nothing to build.";
cd ulfm-install;
else
ROOT=`pwd`;
mkdir ulfm-install;
echo "Downloading ULFM from repo";
git clone --recursive https://bitbucket.org/icldistcomp/ulfm2.git ulfm-src/;
echo " - Configuring and building ULFM.";
cd ulfm-src;
echo " - Running autogen.pl";
./autogen.pl >../ulfm-install/ulfm_build_output.txt 2>&1;
echo " - Running configure";
./configure --prefix=$ROOT/ulfm-install >>../ulfm-install/ulfm_build_output.txt 2>&1;
echo " - Running make";
make -j4 >>../ulfm-install/ulfm_build_output.txt 2>&1;
echo " - Running make install";
make install >>../ulfm-install/ulfm_build_output.txt 2>&1;
echo " - Finished installing ULFM";
cd ../ulfm-install/;
fi

#Expect that any changes to the above still puts me in the install's home dir
- export MPI_HOME=`pwd`
- export PATH=$MPI_HOME/bin/:$PATH
- export LD_LIBRARY_PATH=$MPI_HOME/lib:$LD_LIBRARY_PATH
- export DYLD_LIBRARY_PATH=$MPI_HOME/lib:$DYLD_LIBRARY_PATH
- export MANPATH=$MPI_HOME/share/man:$MANPATH

- export MPICC="`which mpicc`"
- export MPICXX="`which mpic++`"

#Allow oversubscription for tests, since we're potentially single core
- export OMPI_MCA_rmaps_base_oversubscribe=1

- tail -n50 ./ulfm_build_output.txt
- cd ../ #End back at root
install:
- mkdir build && cd build
- cmake ../ -DBUILD_TESTING=ON && make -j4 VERBOSE=1
script:
- cd .travis_helpers
- source fetchULFMmpi.sh #Just updates path if ULFM was built properly in before_install
- cd ../
- mkdir build
- cd build
- cmake ../ -DBUILD_TESTING=ON
- make -j4 VERBOSE=1
- make test
- cd ../ #Always end back at the root directory.
after_success:
- echo "Success, printing run logs:"
- cat Testing/Temporary/LastTest.log
after_failure:
- echo "Failure occured, printing run logs:"
- pwd
- cat build/Testing/Temporary/LastTest.log
- echo "Printing ULFM build log tail. If no output, ULFM was built before this test run"
- tail -n100 .travis_helpers/build_output.txt
- cat Testing/Temporary/LastTest.log
37 changes: 0 additions & 37 deletions .travis_helpers/fetchULFMmpi.sh

This file was deleted.

6 changes: 4 additions & 2 deletions CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -25,8 +25,8 @@ set(CMAKE_SHARED_LIBRARY_LINK_C_FLAGS)
set(CMAKE_ARCHIVE_OUTPUT_DIRECTORY ${CMAKE_BINARY_DIR}/lib)
set(CMAKE_LIBRARY_OUTPUT_DIRECTORY ${CMAKE_BINARY_DIR}/lib)

# Default to a Debug build, but only when no build type has been chosen and
# the generator is single-config. An explicit -DCMAKE_BUILD_TYPE=... given by
# the user is respected, and multi-config generators (which ignore
# CMAKE_BUILD_TYPE) are left alone.
if(NOT CMAKE_BUILD_TYPE AND NOT CMAKE_CONFIGURATION_TYPES)
  # FORCE is needed because project() may already have created an empty
  # CMAKE_BUILD_TYPE cache entry; the surrounding guard keeps it from
  # clobbering a user-provided value.
  set(CMAKE_BUILD_TYPE Debug CACHE STRING
      "Build type (Debug, Release, RelWithDebInfo, MinSizeRel)" FORCE)
endif()
#set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -Wall -O0 -ggdb")

#ENABLE_TESTING
Expand Down Expand Up @@ -109,4 +109,6 @@ if(BUILD_TESTING)
add_subdirectory(test/subset_internal)
add_subdirectory(test/subset_merging)
add_subdirectory(test/request_tracking)
add_subdirectory(test/request_cancelled)
add_subdirectory(test/no_jump)
endif()
2 changes: 1 addition & 1 deletion examples/01_hello_world/fenix/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -16,5 +16,5 @@ if(BUILD_TESTING)
add_executable(fenix_hello_world-debug fenix_hello_world.c)
target_link_libraries(fenix_hello_world-debug fenix ${MPI_C_LIBRARIES})
add_test(NAME hello_world
COMMAND mpirun --oversubscribe -np 3 fenix_hello_world-debug "1")
COMMAND mpirun -mca mpi_ft_detector_timeout 1 -np 3 fenix_hello_world-debug "1")
endif()
13 changes: 13 additions & 0 deletions examples/01_hello_world/fenix/fenix_hello_world.c
Original file line number Diff line number Diff line change
Expand Up @@ -108,6 +108,19 @@ int main(int argc, char **argv) {

printf("hello world: %s, old rank (MPI_COMM_WORLD): %d, new rank: %d, active ranks: %d, ranks before process failure: %d\n",
processor_name, old_rank, new_rank, new_world_size, old_world_size);

int *fails, num_fails;
num_fails = Fenix_Process_fail_list(&fails);

char fails_str[100];
sprintf(fails_str, "Rank %d sees failed processes [", new_rank);
for(int i = 0; i < num_fails; i++){
sprintf(fails_str, "%s%s%d", fails_str, (i==0 ? "" : ", "), fails[i]);
}
sprintf(fails_str, "%s]\n", fails_str);
printf(fails_str);



Fenix_Finalize();
MPI_Finalize();
Expand Down
2 changes: 1 addition & 1 deletion examples/02_send_recv/fenix/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -16,7 +16,7 @@ if(BUILD_TESTING)
add_executable(fenix_ring-debug fenix_ring.c)
target_link_libraries(fenix_ring-debug fenix ${MPI_C_LIBRARIES})
add_test(NAME ring
COMMAND mpirun --oversubscribe -np 5 fenix_ring-debug 1 2)
COMMAND mpirun -mca mpi_ft_detector_timeout 1 -np 5 fenix_ring-debug 1 2)
set_tests_properties(ring PROPERTIES
FAIL_REGULAR_EXPRESSION "FAILURE")
endif()
2 changes: 1 addition & 1 deletion examples/05_subset_create/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -16,7 +16,7 @@ if(BUILD_TESTING)
add_executable(fenix_subset_create-debug subset_create.c)
target_link_libraries(fenix_subset_create-debug fenix ${MPI_C_LIBRARIES})
add_test(NAME subset_create
COMMAND mpirun -np 5 --oversubscribe fenix_subset_create-debug 1)
COMMAND mpirun -mca mpi_ft_detector_timeout 1 -np 5 fenix_subset_create-debug 1)
set_tests_properties(subset_create PROPERTIES
FAIL_REGULAR_EXPRESSION "FAILURE")
endif()
2 changes: 1 addition & 1 deletion examples/06_subset_createv/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -16,7 +16,7 @@ if(BUILD_TESTING)
add_executable(fenix_subset_createv-debug subset_createv.c)
target_link_libraries(fenix_subset_createv-debug fenix ${MPI_C_LIBRARIES})
add_test(NAME subset_createv
COMMAND mpirun -np 5 --oversubscribe fenix_subset_createv-debug 1)
COMMAND mpirun -mca mpi_ft_detector_timeout 1 -np 5 fenix_subset_createv-debug 1)
set_tests_properties(subset_createv PROPERTIES
FAIL_REGULAR_EXPRESSION "FAILURE")
endif()
3 changes: 3 additions & 0 deletions include/fenix.h
Original file line number Diff line number Diff line change
Expand Up @@ -88,6 +88,7 @@ extern "C" {
#define FENIX_ERROR_SUBSET_STRIDE -25
#define FENIX_ERROR_NODATA_FOUND -30
#define FENIX_ERROR_INTERN -40
#define FENIX_ERROR_CANCELLED -50
#define FENIX_WARNING_SPARE_RANKS_DEPLETED 100
#define FENIX_WARNING_PARTIAL_RESTORE 101

Expand Down Expand Up @@ -216,6 +217,8 @@ int Fenix_Data_group_delete(int group_id);

int Fenix_Data_member_delete(int group_id, int member_id);

int Fenix_Process_fail_list(int** fail_list);

#if defined(c_plusplus) || defined(__cplusplus)
}
#endif
Expand Down
7 changes: 7 additions & 0 deletions include/fenix_ext.h
Original file line number Diff line number Diff line change
Expand Up @@ -81,6 +81,13 @@ typedef struct {
int role; // Role of rank: initial, survivor or repair
int fenix_init_flag;

int fail_world_size;
int* fail_world;

//Save the pointer to role and error of Fenix_Init
int *ret_role;
int *ret_error;

fenix_request_store_t request_store;

fenix_callback_list_t* callback_list; // singly linked list for user-defined Fenix callback functions
Expand Down
5 changes: 5 additions & 0 deletions src/fenix.c
Original file line number Diff line number Diff line change
Expand Up @@ -181,3 +181,8 @@ int Fenix_Data_group_delete(int group_id) {
/*
 * Public API wrapper: forwards the group and member ids unchanged to
 * __fenix_member_delete() and returns that call's result.
 */
int Fenix_Data_member_delete(int group_id, int member_id) {
    const int status = __fenix_member_delete(group_id, member_id);
    return status;
}

/*
 * Exposes the failed-process list recorded in the global fenix state.
 *
 * fail_list: out-parameter; receives fenix.fail_world itself (the pointer is
 *            handed out directly, no copy is made here).
 * Returns:   fenix.fail_world_size, i.e. the number of entries in that list.
 */
int Fenix_Process_fail_list(int** fail_list){
    const int num_failed = fenix.fail_world_size;

    *fail_list = fenix.fail_world;
    return num_failed;
}
4 changes: 2 additions & 2 deletions src/fenix_data_recovery.c
Original file line number Diff line number Diff line change
Expand Up @@ -103,7 +103,7 @@ int __fenix_group_create( int groupid, MPI_Comm comm, int timestart, int depth,
/* If so, recover the data and set the recovery */
/* for member recovery. */

int i, group_position;
int i;
int remote_need_recovery;
fenix_group_t *group;
MPI_Status status;
Expand Down Expand Up @@ -149,7 +149,7 @@ int __fenix_group_create( int groupid, MPI_Comm comm, int timestart, int depth,

} else { /* Already created. Renew the MPI communicator */

group = ( data_recovery->group[group_position] );
group = ( data_recovery->group[group_index] );
group->comm = comm; /* Renew communicator */
MPI_Comm_rank(comm, &(group->current_rank));

Expand Down
101 changes: 88 additions & 13 deletions src/fenix_mpi_override.c
Original file line number Diff line number Diff line change
Expand Up @@ -238,7 +238,9 @@ int MPI_Sendrecv(MPI_CONST_TYPE void* sendbuf, int sendcount,
static inline
void __fenix_override_request(int ret, MPI_Request *request)
{
if(ret != MPI_SUCCESS) return;
if(ret != MPI_SUCCESS) {
return;
}

assert(*request != MPI_REQUEST_NULL);

Expand All @@ -265,28 +267,55 @@ int MPI_Irecv(void *buf, int count, MPI_Datatype datatype,
int ret;
ret = PMPI_Irecv(buf, count, datatype, source, tag,
__fenix_replace_comm(comm), request);

__fenix_override_request(ret, request);
__fenix_test_MPI_inline(ret, "MPI_Irecv");
return ret;
}

/*
 * Fenix override of MPI_Wait.
 *
 * The handle the user holds is a Fenix handle: either MPI_REQUEST_NULL, the
 * FENIX_REQUEST_CANCELLED sentinel, or an id into fenix.request_store that
 * maps to the real MPI request.
 *
 * fenix_request: in/out Fenix request handle. Set to MPI_REQUEST_NULL when
 *                the request finishes successfully, or to
 *                FENIX_REQUEST_CANCELLED when it is (or becomes) cancelled.
 * status:        forwarded to PMPI_Wait; may be MPI_STATUS_IGNORE.
 *
 * Returns MPI_SUCCESS when the store reports the request as already
 * completed, FENIX_ERROR_CANCELLED when the request is cancelled, and
 * otherwise the return code of PMPI_Wait.
 */
int MPI_Wait(MPI_Request *fenix_request, MPI_Status *status)
{
    int ret;
    /* Must start at 0: only the checks below may flag the request as
     * cancelled, otherwise every wait would report FENIX_ERROR_CANCELLED. */
    int is_cancelled = 0;
    MPI_Request request = MPI_REQUEST_NULL;

    if(*fenix_request != MPI_REQUEST_NULL){
        if(*fenix_request == FENIX_REQUEST_CANCELLED){
            is_cancelled = 1;
        } else {
            /* Translate the Fenix handle into the real MPI request. */
            int retval =
                __fenix_request_store_get(&fenix.request_store, *((int*)fenix_request), &request);

            if(retval == FENIX_ERROR_CANCELLED) {
                is_cancelled = 1;
            }

            if(retval == FENIX_REQUEST_COMPLETED){
                /* The store already holds the outcome: hand back the saved
                 * status (if wanted) without calling into MPI again. */
                if(status != MPI_STATUS_IGNORE)
                    __fenix_request_store_get_status(&fenix.request_store, *((int*)fenix_request), status);
                *fenix_request = MPI_REQUEST_NULL;
                return MPI_SUCCESS;
            }
        }
    }

    /* For null/cancelled handles `request` is still MPI_REQUEST_NULL here. */
    ret = PMPI_Wait(&request, status);

    if(ret == MPI_SUCCESS && (*fenix_request != MPI_REQUEST_NULL) && (*fenix_request != FENIX_REQUEST_CANCELLED)) {
        /* Finished normally: drop the store entry and null the user's handle. */
        __fenix_request_store_remove(&fenix.request_store,
                                     *((int *) fenix_request));
        assert(request == MPI_REQUEST_NULL);
        *fenix_request = MPI_REQUEST_NULL;
    }
    if(ret == MPI_ERR_PROC_FAILED || ret == MPI_ERR_REVOKED){
        /* Mark the request cancelled in the store and in the user's handle
         * before the return code is passed to Fenix's error check below. */
        __fenix_request_store_cancel(&fenix.request_store, *((int*)fenix_request), status);
        *fenix_request = FENIX_REQUEST_CANCELLED;
    }
    __fenix_test_MPI_inline(ret, "MPI_Wait");

    if(is_cancelled){
        *fenix_request = FENIX_REQUEST_CANCELLED;
        return FENIX_ERROR_CANCELLED;
    }
    return ret;
}

Expand All @@ -297,11 +326,13 @@ int MPI_Waitall(int count, MPI_Request array_of_fenix_requests[],
{
// The list (array_of_requests) may contain null or inactive handles.
int ret, i;
for(i=0 ; i<count ; i++)
if(array_of_fenix_requests[i] != MPI_REQUEST_NULL)
__fenix_request_store_getremove(&fenix.request_store,
for(i=0 ; i<count ; i++){
if(array_of_fenix_requests[i] != MPI_REQUEST_NULL){
__fenix_request_store_getremove(&fenix.request_store,
*((int *)&(array_of_fenix_requests[i])),
&(array_of_fenix_requests[i]));
}
}

ret = PMPI_Waitall(count, array_of_fenix_requests, array_of_statuses);
__fenix_test_MPI_inline(ret, "MPI_Waitall");
Expand Down Expand Up @@ -333,8 +364,52 @@ int MPI_Waitall(int count, MPI_Request array_of_fenix_requests[],

/*
 * Fenix override of MPI_Test.
 *
 * The handle the user holds is a Fenix handle: either MPI_REQUEST_NULL, the
 * FENIX_REQUEST_CANCELLED sentinel, or an id into fenix.request_store that
 * maps to the real MPI request.
 *
 * request: in/out Fenix request handle. Set to MPI_REQUEST_NULL once the
 *          request has completed successfully, or to FENIX_REQUEST_CANCELLED
 *          when it is (or becomes) cancelled.
 * flag:    out; set to 1 when the store reports the request as already
 *          completed, otherwise written by PMPI_Test.
 * status:  forwarded to PMPI_Test; may be MPI_STATUS_IGNORE.
 *
 * Returns MPI_SUCCESS when the store reports the request as already
 * completed, FENIX_ERROR_CANCELLED when the request is cancelled, and
 * otherwise the return code of PMPI_Test.
 */
int MPI_Test(MPI_Request *request, int *flag, MPI_Status *status)
{
    int ret;
    int is_cancelled = 0;
    MPI_Request real_req = MPI_REQUEST_NULL;

    if(*request != MPI_REQUEST_NULL){
        if(*request == FENIX_REQUEST_CANCELLED){
            is_cancelled = 1;
        } else {
            /* Translate the Fenix handle into the real MPI request. */
            int retval =
                __fenix_request_store_get(&fenix.request_store, *((int*)request), &real_req);

            if(retval == FENIX_ERROR_CANCELLED) {
                is_cancelled = 1;
            }

            if(retval == FENIX_REQUEST_COMPLETED){
                /* The store already holds the outcome: report completion and
                 * the saved status (if wanted) without calling into MPI. */
                *flag = 1;
                if(status != MPI_STATUS_IGNORE)
                    __fenix_request_store_get_status(&fenix.request_store, *((int*)request), status);
                *request = MPI_REQUEST_NULL;
                return MPI_SUCCESS;
            }
        }
    }

    /* For null/cancelled handles `real_req` is still MPI_REQUEST_NULL here. */
    ret = PMPI_Test(&real_req, flag, status);
    if(ret == MPI_ERR_PROC_FAILED || ret == MPI_ERR_REVOKED){
        /* Mark the request cancelled in the store and in the user's handle
         * before the return code is passed to Fenix's error check below. */
        __fenix_request_store_cancel(&fenix.request_store, *((int*)request), status);
        *request = FENIX_REQUEST_CANCELLED;
    }

    __fenix_test_MPI_inline(ret, "MPI_Test");

    if(*flag && *request != MPI_REQUEST_NULL && *request != FENIX_REQUEST_CANCELLED && ret == MPI_SUCCESS){
        /* This request is done, it can be removed from the store. */
        __fenix_request_store_remove(&fenix.request_store, *((int*)request));
        *request = MPI_REQUEST_NULL;
    }

    if(is_cancelled){
        *request = FENIX_REQUEST_CANCELLED;
        return FENIX_ERROR_CANCELLED;
    }

    return ret;
}

int MPI_Cancel(MPI_Request *request)
Expand Down
Loading

0 comments on commit a41fd3b

Please sign in to comment.