Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

bp4: integrate SCR calls #3294

Open
wants to merge 4 commits into
base: master
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 2 additions & 1 deletion CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -156,6 +156,7 @@ adios_option(BP5 "Enable support for BP5" AUTO)
adios_option(ZeroMQ "Enable support for ZeroMQ" AUTO)
adios_option(HDF5 "Enable support for the HDF5 engine" AUTO)
adios_option(IME "Enable support for DDN IME transport" AUTO)
adios_option(SCR "Enable support for SCR" AUTO)
adios_option(Python "Enable support for Python bindings" AUTO)
adios_option(Fortran "Enable support for Fortran bindings" AUTO)
adios_option(SysVShMem "Enable support for SysV Shared Memory IPC on *NIX" AUTO)
Expand Down Expand Up @@ -222,7 +223,7 @@ endif()


set(ADIOS2_CONFIG_OPTS
BP5 DataMan DataSpaces HDF5 HDF5_VOL MHS SST CUDA Fortran MPI Python Blosc BZip2 LIBPRESSIO MGARD PNG SZ ZFP DAOS IME O_DIRECT Sodium SysVShMem ZeroMQ Profiling Endian_Reverse
BP5 DataMan DataSpaces HDF5 HDF5_VOL MHS SST CUDA Fortran MPI Python Blosc BZip2 LIBPRESSIO MGARD PNG SZ ZFP DAOS IME SCR O_DIRECT Sodium SysVShMem ZeroMQ Profiling Endian_Reverse
)

GenerateADIOSHeaderConfig(${ADIOS2_CONFIG_OPTS})
Expand Down
58 changes: 58 additions & 0 deletions buildme
Original file line number Diff line number Diff line change
@@ -0,0 +1,58 @@
#!/bin/bash

# Optional environment setup, currently disabled. Uncomment as needed.
# NOTE(review): reviewers of this script reported having to load gcc to
# build ADIOS2 (but not to build SCR) - confirm for your system.
#source /etc/profile.d/z00_lmod.sh
#module load cuda/10.2
#module load gcc/8.3.1
module load cmake/3.23.1 # to avoid cmake FindMPI problem

# Echo every command as it is executed.
set -x

# ADIOS2 install prefix: an "install" directory under the directory the
# script is started from. Passed to cmake as CMAKE_INSTALL_PREFIX below.
installdir=`pwd`/install
#rm -rf $installdir

# download SCR-v3.0.1 release (skipped when the tarball is already present)
if [ ! -f scr-v3.0.1.tgz ] ; then
    # Remove a partial download so the next run does not mistake it for a
    # complete tarball, then stop: nothing below can work without it.
    wget https://github.com/LLNL/scr/releases/download/v3.0.1/scr-v3.0.1.tgz \
        || { rm -f scr-v3.0.1.tgz ; exit 1 ; }
fi

# unpack the release (skipped when the source tree already exists)
# This check is independent of the download check, so a tarball that is
# already on disk still gets extracted when the source tree is missing.
if [ ! -d scr-v3.0.1 ] ; then
    tar -xzf scr-v3.0.1.tgz || exit 1
fi

# build SCR-v3.0.1 in debug mode
# Stop if the source tree is missing: the commands below remove and create
# directories relative to the current directory, so running them from the
# wrong place would delete the ADIOS2 install/build directories instead.
pushd scr-v3.0.1 || exit 1
installdir_scr="$(pwd)/install"
rm -rf "$installdir_scr"

# Always configure from a clean build directory.
rm -rf build
mkdir build
pushd build || exit 1
cmake \
    -DCMAKE_INSTALL_PREFIX="$installdir_scr" \
    -DCMAKE_BUILD_TYPE=Debug \
    -DSCR_RESOURCE_MANAGER=LSF \
    -DENABLE_PDSH=OFF \
    .. || exit 1
make -j || exit 1
make install || exit 1
popd
popd

# build ADIOS2 and point to the above SCR installation
# Start from a clean build directory. Stop if it cannot be entered, because
# the following "cmake .." would otherwise configure the parent of the
# repository instead of the repository itself.
rm -rf build
mkdir build
cd build || exit 1
cmake \
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I had to load gcc to be able to build ADIOS2 (I did not load gcc to be able to build SCR)

Copy link
Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Yes, I think I had to do the same.

-DCMAKE_INSTALL_PREFIX=$installdir \
-DCMAKE_BUILD_TYPE=Debug \
-DCMAKE_VERBOSE_MAKEFILE=ON \
-DADIOS2_USE_MPI=ON \
-DADIOS2_USE_HDF5=OFF \
-DADIOS2_USE_CUDA=OFF \
-DADIOS2_USE_Fortran=OFF \
-DADIOS2_USE_Python=OFF \
-DSCR_ROOT=$installdir_scr \
..

make -j

make install
9 changes: 9 additions & 0 deletions cmake/DetectOptions.cmake
Original file line number Diff line number Diff line change
Expand Up @@ -290,6 +290,15 @@ if(IME_FOUND)
set(ADIOS2_HAVE_IME TRUE)
endif()

# SCR (Scalable Checkpoint/Restart Library)
# ADIOS2_USE_SCR selects how SCR is searched for:
#   AUTO              - look for SCR, carry on without it when not found
#   other true value  - SCR is mandatory (find_package ... REQUIRED)
#   false value       - SCR is not searched for at all
if(ADIOS2_USE_SCR STREQUAL AUTO)
find_package(SCR)
elseif(ADIOS2_USE_SCR)
find_package(SCR REQUIRED)
endif()
# Record a successful search in ADIOS2_HAVE_SCR.
if(SCR_FOUND)
set(ADIOS2_HAVE_SCR TRUE)
endif()

# Python

Expand Down
60 changes: 60 additions & 0 deletions cmake/FindSCR.cmake
Original file line number Diff line number Diff line change
@@ -0,0 +1,60 @@
#------------------------------------------------------------------------------#
# Distributed under the OSI-approved Apache License, Version 2.0. See
# accompanying file Copyright.txt for details.
#------------------------------------------------------------------------------#
#
# FindSCR
# -----------
#
# Try to find the SCR library
#
# This module defines the following variables:
#
# SCR_FOUND - System has SCR
# SCR_INCLUDE_DIRS - The SCR include directory
# SCR_LIBRARIES - Link these to use SCR
#
# and the following imported targets:
# SCR::SCR - The SCR library target
#
# You can also set the following variable to help guide the search:
# SCR_ROOT - The install prefix for SCR containing the
# include and lib folders
# Note: this can be set as a CMake variable or an
# environment variable. If specified as a CMake
# variable, it will override any setting specified
# as an environment variable.

# Locate the SCR header and library, then publish SCR_FOUND,
# SCR_INCLUDE_DIRS, SCR_LIBRARIES and the SCR::SCR imported target.
# The whole search is skipped when SCR_FOUND is already set.
if(NOT SCR_FOUND)
  # Fall back to the SCR_ROOT environment variable only when no CMake
  # variable of that name is set. The environment value must be read as
  # "$ENV{SCR_ROOT}": a bare ENV{SCR_ROOT} inside if() is an ordinary
  # string and never compares equal to "".
  if((NOT SCR_ROOT) AND (NOT ("$ENV{SCR_ROOT}" STREQUAL "")))
    set(SCR_ROOT "$ENV{SCR_ROOT}")
  endif()

  # With an explicit prefix, search only below that prefix so that a
  # different SCR installation on the default paths is not picked up.
  if(SCR_ROOT)
    set(SCR_INCLUDE_OPTS HINTS ${SCR_ROOT}/include NO_DEFAULT_PATHS)
    set(SCR_LIBRARY_OPTS
      HINTS ${SCR_ROOT}/lib ${SCR_ROOT}/lib64
      NO_DEFAULT_PATHS
    )
  endif()

  find_path(SCR_INCLUDE_DIR scr.h ${SCR_INCLUDE_OPTS})
  find_library(SCR_LIBRARY NAMES scr ${SCR_LIBRARY_OPTS})
  # Keep the raw search results out of the default cache view.
  mark_as_advanced(SCR_INCLUDE_DIR SCR_LIBRARY)

  # Sets SCR_FOUND and reports an error for REQUIRED lookups when either
  # the library or the header is missing.
  include(FindPackageHandleStandardArgs)
  find_package_handle_standard_args(SCR
    FOUND_VAR SCR_FOUND
    REQUIRED_VARS SCR_LIBRARY SCR_INCLUDE_DIR
  )

  if(SCR_FOUND)
    set(SCR_INCLUDE_DIRS ${SCR_INCLUDE_DIR})
    set(SCR_LIBRARIES ${SCR_LIBRARY})
    # Create the imported target only once; the module may be included
    # again from another directory scope.
    if(NOT TARGET SCR::SCR)
      add_library(SCR::SCR UNKNOWN IMPORTED)
      # IMPORTED_LOCATION already puts the library on the link line, so it
      # is not repeated in INTERFACE_LINK_LIBRARIES.
      set_target_properties(SCR::SCR PROPERTIES
        IMPORTED_LOCATION "${SCR_LIBRARY}"
        INTERFACE_INCLUDE_DIRECTORIES "${SCR_INCLUDE_DIR}"
      )
    endif()
  endif()
endif()
3 changes: 3 additions & 0 deletions examples/hello/bpReader/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,9 @@ endif()
if(ADIOS2_HAVE_MPI)
add_executable(hello_bpReader_mpi helloBPReader.cpp)
target_link_libraries(hello_bpReader_mpi adios2::cxx11_mpi MPI::MPI_C)
if(ADIOS2_HAVE_SCR)
target_link_libraries(hello_bpReader_mpi SCR::SCR)
endif()

add_executable(hello_bpReaderHeatMap2D helloBPReaderHeatMap2D.cpp)
target_link_libraries(hello_bpReaderHeatMap2D adios2::cxx11_mpi MPI::MPI_C)
Expand Down
44 changes: 44 additions & 0 deletions examples/hello/bpReader/helloBPReader.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,8 @@

#include <adios2.h>

#include "scr.h"

int main(int argc, char *argv[])
{
int provided;
Expand All @@ -27,6 +29,40 @@ int main(int argc, char *argv[])
MPI_Comm_rank(MPI_COMM_WORLD, &rank);
MPI_Comm_size(MPI_COMM_WORLD, &size);
std::string filename = "myVector_cpp.bp";

// Since ADIOS2 engine introspects the directory structure to determine the file format,
// flush any cached dataset to restart from the parallel file system.
// Collective over MPI_COMM_WORLD.
SCR_Config("SCR_GLOBAL_RESTART=1");

// SCR attempts to load an application's most recent checkpoint by default.
// The user can specify a particular checkpoint by name by setting SCR_CURRENT.
// Collective over MPI_COMM_WORLD.
SCR_Configf("SCR_CURRENT=%s", filename.c_str());

SCR_Init();

// Query whether SCR successfully loaded a restart, and if so, its name.
// Collective over MPI_COMM_WORLD.
// Both parameters are output.
// The first parameter is set to 1 if SCR loaded a checkpoint, 0 otherwise.
// The second parameter will hold the name of the checkpoint if one was loaded.
// The name returned is the same string provided in SCR_Start_output when the checkpoint was created.
int have_restart;
char scr_dset[SCR_MAX_FILENAME];
SCR_Have_restart(&have_restart, scr_dset);

// Start a restart phase.
// Collective over MPI_COMM_WORLD.
// Should only be called if SCR_Have_restart indicates that SCR loaded a checkpoint.
// For convenience, SCR_Start_restart returns the name of the checkpoint again.
// This will match the name returned by SCR_Have_restart.
//SCR_Start_restart(scr_dset);

// Each process should track whether it reads its data successfully.
// Set to 0 if calling process fails to read its data.
int scr_valid = 1;

try
{
/** ADIOS class factory of IO class objects */
Expand Down Expand Up @@ -138,6 +174,14 @@ int main(int argc, char *argv[])
MPI_Abort(MPI_COMM_WORLD, 1);
}

// Complete restart phase.
// Collective over MPI_COMM_WORLD.
// Each process should indicate whether it successfully read its data.
// An allreduce determines whether all ranks succeeded.
// If any failed, SCR will attempt to load up the next most recent checkpoint, if any.
//SCR_Complete_restart(scr_valid);

SCR_Finalize();
MPI_Finalize();

return 0;
Expand Down
12 changes: 12 additions & 0 deletions examples/hello/bpWriter/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -5,9 +5,15 @@

add_executable(hello_bpWriter helloBPWriter.cpp)
target_link_libraries(hello_bpWriter adios2::cxx11)
if(ADIOS2_HAVE_SCR)
target_link_libraries(hello_bpWriter SCR::SCR)
endif()

add_executable(hello_bpWriter_c helloBPWriter.c)
target_link_libraries(hello_bpWriter_c adios2::c)
if(ADIOS2_HAVE_SCR)
target_link_libraries(hello_bpWriter_c SCR::SCR)
endif()

add_executable(hello_bpPutDeferred helloBPPutDeferred.cpp)
target_link_libraries(hello_bpPutDeferred adios2::cxx11)
Expand All @@ -29,9 +35,15 @@ endif()
if(ADIOS2_HAVE_MPI)
add_executable(hello_bpWriter_mpi helloBPWriter.cpp)
target_link_libraries(hello_bpWriter_mpi adios2::cxx11_mpi MPI::MPI_C)
if(ADIOS2_HAVE_SCR)
target_link_libraries(hello_bpWriter_mpi SCR::SCR)
endif()

add_executable(hello_bpWriter_c_mpi helloBPWriter.c)
target_link_libraries(hello_bpWriter_c_mpi adios2::c_mpi MPI::MPI_C)
if(ADIOS2_HAVE_SCR)
target_link_libraries(hello_bpWriter_c_mpi SCR::SCR)
endif()

add_executable(hello_bpPutDeferred_mpi helloBPPutDeferred.cpp)
target_link_libraries(hello_bpPutDeferred_mpi adios2::cxx11_mpi MPI::MPI_C)
Expand Down
43 changes: 43 additions & 0 deletions examples/hello/bpWriter/helloBPWriter.c
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,8 @@
#include <mpi.h>
#endif

#include "scr.h"

void check_error(const int error)
{
if (error)
Expand Down Expand Up @@ -48,6 +50,10 @@ int main(int argc, char *argv[])
size = 1;
#endif

// Collective over MPI_COMM_WORLD.
// Rebuild any cached dataset, if possible.
SCR_Init();

adios2_error errio;
// application input, data in heap
const size_t Nx = 10;
Expand Down Expand Up @@ -86,6 +92,30 @@ int main(int argc, char *argv[])
start, count, adios2_constant_dims_true);
check_handler(variable, "variable");

// Set this to 0 on any write error to indicate that the calling process failed to write.
// Passed to SCR_Complete_output where an allreduce will check whether any process had an error.
int scr_valid = 1;

// Start a write phase (checkpoint and/or output).
// Collective over MPI_COMM_WORLD
//
// This holds processes in a barrier to ensure there is space if necessary.
// This is useful in case an async flush has not yet finished.
// It's also a simple way to avoid deleting files from a previous checkpoint
// if a failure has happened on some node but has not yet been detected.
//
// The caller should provide a name for the dataset.
// The name is returned during a restart, and it can be used to identify the checkpoint.
// The name is also the way a user can request SCR to restart from a specific checkpoint.
// It is common to encode timestamp info in this name.
//
// And the caller should specify dataset flags:
// SCR_FLAG_CHECKPOINT => dataset can be used to restart the application
// SCR_FLAG_OUTPUT => dataset must be written to parallel file system
// These flags can be OR'd together:
// SCR_FLAG_CHECKPOINT | SCR_FLAG_OUTPUT
SCR_Start_output("myVector", SCR_FLAG_CHECKPOINT);

adios2_engine *engine = adios2_open(io, "myVector_c.bp", adios2_mode_write);
check_handler(engine, "engine");

Expand All @@ -95,12 +125,25 @@ int main(int argc, char *argv[])
errio = adios2_close(engine);
check_error(errio);

// Complete write phase.
// Collective over MPI_COMM_WORLD.
// Each process should pass 1 if it wrote its portion successfully and 0 if not.
// The library executes an allreduce to determine whether all ranks succeeded.
// If any rank failed, the dataset is considered to be invalid.
// This is the point where the SCR library applies any redundancy schemes,
// and where SCR initiates a flush to the parallel file system if needed.
SCR_Complete_output(scr_valid);

// deallocate adios
errio = adios2_finalize(adios);
check_error(errio);

free(myFloats);

// Flush any datasets from cache, if needed, or wait for those flushes to complete.
// Collective over MPI_COMM_WORLD
SCR_Finalize();

#if ADIOS2_USE_MPI
MPI_Finalize();
#endif
Expand Down
10 changes: 10 additions & 0 deletions examples/hello/bpWriter/helloBPWriter.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,8 @@
#include <mpi.h>
#endif

#include "scr.h"

int main(int argc, char *argv[])
{
int rank, size;
Expand All @@ -27,6 +29,7 @@ int main(int argc, char *argv[])
MPI_Init_thread(&argc, &argv, MPI_THREAD_MULTIPLE, &provided);
MPI_Comm_rank(MPI_COMM_WORLD, &rank);
MPI_Comm_size(MPI_COMM_WORLD, &size);
SCR_Init();
#else
rank = 0;
size = 1;
Expand Down Expand Up @@ -68,6 +71,10 @@ int main(int argc, char *argv[])
// variable

std::string filename = "myVector_cpp.bp";

int scr_valid = 1;
SCR_Start_output(filename.c_str(), SCR_FLAG_CHECKPOINT);

/** Engine derived class, spawned to start IO operations */
adios2::Engine bpFileWriter = bpIO.Open(filename, adios2::Mode::Write);

Expand All @@ -84,6 +91,8 @@ int main(int argc, char *argv[])
<< " to disk. It can now be read by running "
"./bin/hello_bpReader.\n";
}

SCR_Complete_output(scr_valid);
}
catch (std::invalid_argument &e)
{
Expand Down Expand Up @@ -111,6 +120,7 @@ int main(int argc, char *argv[])
}

#if ADIOS2_USE_MPI
SCR_Finalize();
MPI_Finalize();
#endif

Expand Down
4 changes: 4 additions & 0 deletions source/adios2/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -348,6 +348,10 @@ if(ADIOS2_HAVE_IME)
target_link_libraries(adios2_core PRIVATE IME::IME)
endif()

# Link SCR into adios2_core when it is available. PRIVATE keeps SCR::SCR out
# of the link interface seen by targets that link adios2_core.
if(ADIOS2_HAVE_SCR)
target_link_libraries(adios2_core PRIVATE SCR::SCR)
endif()

if(ADIOS2_HAVE_MPI)
set(maybe_adios2_c_mpi adios2_c_mpi)
set(maybe_adios2_cxx11_mpi adios2_cxx11_mpi)
Expand Down
Loading