diff --git a/CMakeLists.txt b/CMakeLists.txt index 2ab43e6528f4..0e105bb12213 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -100,6 +100,11 @@ ENDIF () # really Trilinos should support bulding with every newer C++ standard so we # should just be doing that instead. +# Force off support for Makefile.export.* files while TriBITS is refactored to +# remove this (#8498) +SET(${PROJECT_NAME}_ENABLE_EXPORT_MAKEFILES OFF CACHE BOOL + "Support being removed from TriBITS (see trilinos/Trilinos#8498)" FORCE) + # Do all of the processing for this Tribits project TRIBITS_PROJECT() diff --git a/cmake/ctest/drivers/atdm/ats1/local-driver.sh b/cmake/ctest/drivers/atdm/ats1/local-driver.sh index 4a3eb08ffe43..6efdf59bfd74 100755 --- a/cmake/ctest/drivers/atdm/ats1/local-driver.sh +++ b/cmake/ctest/drivers/atdm/ats1/local-driver.sh @@ -8,13 +8,6 @@ if [ "${SBATCH_TEST_TIME_LIMIT_MINUTES}" == "" ] ; then export SBATCH_TEST_TIME_LIMIT_MINUTES=780 # Default 13 hour time limit fi -if [[ "${Trilinos_ENABLE_BUILD_STATS}" == "" ]] && \ - [[ ! $JOB_NAME == *"intel"* ]] \ - ; then - export Trilinos_ENABLE_BUILD_STATS=ON -fi -echo "Trilinos_ENABLE_BUILD_STATS='${Trilinos_ENABLE_BUILD_STATS}'" - # Load environment on the login node source $WORKSPACE/Trilinos/cmake/std/atdm/load-env.sh $JOB_NAME diff --git a/cmake/ctest/drivers/atdm/ats2/local-driver.sh b/cmake/ctest/drivers/atdm/ats2/local-driver.sh index eb9ffd6ab5aa..15cde236883a 100755 --- a/cmake/ctest/drivers/atdm/ats2/local-driver.sh +++ b/cmake/ctest/drivers/atdm/ats2/local-driver.sh @@ -2,10 +2,6 @@ set +x -if [[ "${Trilinos_ENABLE_BUILD_STATS}" == "" ]] ; then - export Trilinos_ENABLE_BUILD_STATS=ON -fi - # Need to load env so we define some vars source $WORKSPACE/Trilinos/cmake/std/atdm/load-env.sh $JOB_NAME diff --git a/cmake/ctest/drivers/atdm/cee-rhel7/local-driver.sh b/cmake/ctest/drivers/atdm/cee-rhel7/local-driver.sh index 276a47b6e916..719b4a870ca2 100755 --- a/cmake/ctest/drivers/atdm/cee-rhel7/local-driver.sh +++ b/cmake/ctest/drivers/atdm/cee-rhel7/local-driver.sh @@ -5,13 +5,6 @@ set +x # Need to load env so we define some vars source $WORKSPACE/Trilinos/cmake/std/atdm/load-env.sh $JOB_NAME -if [[ "${Trilinos_ENABLE_BUILD_STATS}" == "" ]] && \ - [[ ! $JOB_NAME == *"intel"* ]] \ - ; then - export Trilinos_ENABLE_BUILD_STATS=ON -fi -echo "Trilinos_ENABLE_BUILD_STATS='${Trilinos_ENABLE_BUILD_STATS}'" - # Ensure that we don't set both Trilinos_PACKAGES and Trilinos_PACKAGE_ENABLES_FILE if [ -z $Trilinos_PACKAGES ]; then # Make adjustments for mini build of Trilinos for SPARC diff --git a/cmake/ctest/drivers/atdm/cts1/local-driver.sh b/cmake/ctest/drivers/atdm/cts1/local-driver.sh index a4852cc57296..a7b7cb5b0b3b 100755 --- a/cmake/ctest/drivers/atdm/cts1/local-driver.sh +++ b/cmake/ctest/drivers/atdm/cts1/local-driver.sh @@ -7,16 +7,6 @@ if [ "${SLURM_CTEST_TIMEOUT}" == "" ] ; then # This is just running tests, not the entire build! fi -if [[ "${Trilinos_ENABLE_BUILD_STATS}" == "" ]] && \ - [[ ! $JOB_NAME == *"intel"* ]] \ - ; then - export Trilinos_ENABLE_BUILD_STATS=ON -fi -echo "Trilinos_ENABLE_BUILD_STATS='${Trilinos_ENABLE_BUILD_STATS}'" -# NOTE: That above matching is a bit fragile but it avoids needing to load a -# full env and it is good enough for driving nightly builds. (I would never -# do this with a build name coming from a user.) 
- set -x source $WORKSPACE/Trilinos/cmake/ctest/drivers/atdm/ctest-s-driver-config-build.sh diff --git a/cmake/ctest/drivers/atdm/cts1empire/local-driver.sh b/cmake/ctest/drivers/atdm/cts1empire/local-driver.sh index 2a877912eecc..aef2e19228d6 100755 --- a/cmake/ctest/drivers/atdm/cts1empire/local-driver.sh +++ b/cmake/ctest/drivers/atdm/cts1empire/local-driver.sh @@ -5,10 +5,6 @@ if [ "${SLURM_CTEST_TIMEOUT}" == "" ] ; then # This is just running tests, not the entire build! fi -if [ "${Trilinos_CTEST_DO_ALL_AT_ONCE}" == "" ] ; then - export Trilinos_CTEST_DO_ALL_AT_ONCE=TRUE -fi - set -x source $WORKSPACE/Trilinos/cmake/ctest/drivers/atdm/ctest-s-driver-config-build.sh diff --git a/cmake/ctest/drivers/atdm/ride/local-driver.sh b/cmake/ctest/drivers/atdm/ride/local-driver.sh index 926984fc1c3c..0af106ac960c 100755 --- a/cmake/ctest/drivers/atdm/ride/local-driver.sh +++ b/cmake/ctest/drivers/atdm/ride/local-driver.sh @@ -12,11 +12,6 @@ if [ "${EXCLUDE_NODES_FROM_BSUB}" == "" ] ; then fi fi -if [[ "${Trilinos_ENABLE_BUILD_STATS}" == "" ]] ; then - export Trilinos_ENABLE_BUILD_STATS=ON -fi -echo "Trilinos_ENABLE_BUILD_STATS='${Trilinos_ENABLE_BUILD_STATS}'" - source $WORKSPACE/Trilinos/cmake/std/atdm/load-env.sh $JOB_NAME set -x diff --git a/cmake/ctest/drivers/atdm/sems-rhel7/local-driver.sh b/cmake/ctest/drivers/atdm/sems-rhel7/local-driver.sh index 2bd435558894..10f0a23fe932 100755 --- a/cmake/ctest/drivers/atdm/sems-rhel7/local-driver.sh +++ b/cmake/ctest/drivers/atdm/sems-rhel7/local-driver.sh @@ -2,10 +2,6 @@ set +x -if [[ "${Trilinos_ENABLE_BUILD_STATS}" == "" ]] ; then - export Trilinos_ENABLE_BUILD_STATS=ON -fi - if [[ "${Trilinos_REPOSITORY_LOCATION}" == "" ]] ; then export Trilinos_REPOSITORY_LOCATION=git@github.com:trilinos/Trilinos.git fi diff --git a/cmake/ctest/drivers/atdm/tlcc2/local-driver.sh b/cmake/ctest/drivers/atdm/tlcc2/local-driver.sh index 6d9dd791b20d..a013e491bec4 100755 --- a/cmake/ctest/drivers/atdm/tlcc2/local-driver.sh +++ b/cmake/ctest/drivers/atdm/tlcc2/local-driver.sh @@ -5,10 +5,6 @@ if [ "${SALLOC_CTEST_TIME_LIMIT_MINUTES}" == "" ] ; then # This is just running tests, not the entire build! fi -if [ "${Trilinos_CTEST_DO_ALL_AT_ONCE}" == "" ] ; then - export Trilinos_CTEST_DO_ALL_AT_ONCE=TRUE -fi - set -x source $WORKSPACE/Trilinos/cmake/ctest/drivers/atdm/ctest-s-driver-config-build.sh diff --git a/cmake/ctest/drivers/atdm/utils/setup_env.sh b/cmake/ctest/drivers/atdm/utils/setup_env.sh index 7b090582286b..5557a46c9c17 100644 --- a/cmake/ctest/drivers/atdm/utils/setup_env.sh +++ b/cmake/ctest/drivers/atdm/utils/setup_env.sh @@ -4,6 +4,11 @@ set +x # A) Load the env # +if [[ "${Trilinos_ENABLE_BUILD_STATS}" == "" ]] ; then + export Trilinos_ENABLE_BUILD_STATS=ON +fi +echo "Trilinos_ENABLE_BUILD_STATS='${Trilinos_ENABLE_BUILD_STATS}'" + source ${WORKSPACE}/Trilinos/cmake/std/atdm/load-env.sh $JOB_NAME echo module list diff --git a/cmake/ctest/drivers/atdm/van1-tx2/local-driver.sh b/cmake/ctest/drivers/atdm/van1-tx2/local-driver.sh index e924eb67701c..7a7e8fc6c121 100755 --- a/cmake/ctest/drivers/atdm/van1-tx2/local-driver.sh +++ b/cmake/ctest/drivers/atdm/van1-tx2/local-driver.sh @@ -9,10 +9,6 @@ if [[ "${SALLOC_CTEST_LIMIT_MINUTES}" == "" ]] ; then # do everything. 
fi -if [[ "${Trilinos_ENABLE_BUILD_STATS}" == "" ]] ; then - export Trilinos_ENABLE_BUILD_STATS=OFF -fi - source $WORKSPACE/Trilinos/cmake/std/atdm/load-env.sh $JOB_NAME echo diff --git a/cmake/std/PullRequestLinuxCommonTestingSettings.cmake b/cmake/std/PullRequestLinuxCommonTestingSettings.cmake index 24f177e5b105..88e3dd3dd88d 100644 --- a/cmake/std/PullRequestLinuxCommonTestingSettings.cmake +++ b/cmake/std/PullRequestLinuxCommonTestingSettings.cmake @@ -109,5 +109,5 @@ set (TPL_Scotch_INCLUDE_DIRS "$ENV{SEMS_SCOTCH_INCLUDE_PATH}" CACHE PATH "Set by set (Scotch_LIBRARY_DIRS "$ENV{SEMS_SCOTCH_LIBRARY_PATH}" CACHE PATH "Set by default for PR testing") # Build stats compiler wrappers (#7376) -set(Trilinos_ENABLE_BUILD_STATS OFF CACHE BOOL "Set in PullRequestLinuxCommonTestingSettings.cmake") -# Turn them off for now in all PR builds until more review can be done. +set(Trilinos_ENABLE_BUILD_STATS ON CACHE BOOL "Set in PullRequestLinuxCommonTestingSettings.cmake") +set(Trilinos_REMOVE_BUILD_STATS_TIMING_FILES_ON_FRESH_CONFIGURE OFF CACHE BOOL "Set in PullRequestLinuxCommonTestingSettings.cmake") diff --git a/cmake/std/PullRequestLinuxCommonTestingSettingsSERIAL.cmake b/cmake/std/PullRequestLinuxCommonTestingSettingsSERIAL.cmake index b1ea16ecbe9c..f56833b533c2 100644 --- a/cmake/std/PullRequestLinuxCommonTestingSettingsSERIAL.cmake +++ b/cmake/std/PullRequestLinuxCommonTestingSettingsSERIAL.cmake @@ -100,5 +100,6 @@ SET(SuperLU_LIBRARY_DIRS "$ENV{SEMS_SUPERLU_LIBRARY_PATH}" CACHE PATH "Set by de # set (TPL_Scotch_INCLUDE_DIRS "$ENV{SEMS_SCOTCH_INCLUDE_PATH}" CACHE PATH "Set by default for PR testing") # set (Scotch_LIBRARY_DIRS "$ENV{SEMS_SCOTCH_LIBRARY_PATH}" CACHE PATH "Set by default for PR testing") - - +# Build stats compiler wrappers (#7376) +set(Trilinos_ENABLE_BUILD_STATS ON CACHE BOOL "Set in PullRequestLinuxCommonTestingSettingsSERIAL.cmake") +set(Trilinos_REMOVE_BUILD_STATS_TIMING_FILES_ON_FRESH_CONFIGURE OFF CACHE BOOL "Set in PullRequestLinuxCommonTestingSettingsSERIAL.cmake") diff --git a/cmake/std/PullRequestLinuxCuda10.1.105uvmOffTestingSettings.cmake b/cmake/std/PullRequestLinuxCuda10.1.105uvmOffTestingSettings.cmake index 282402c9d8a6..4a0ed0ab9b04 100644 --- a/cmake/std/PullRequestLinuxCuda10.1.105uvmOffTestingSettings.cmake +++ b/cmake/std/PullRequestLinuxCuda10.1.105uvmOffTestingSettings.cmake @@ -143,11 +143,8 @@ set (Trilinos_ENABLE_TrilinosCouplings OFF CACHE BOOL "Turn off packages for non # Turn off tests currently failing with UVM = OFF # Packages with >5 failing tests -set (Amesos2_ENABLE_TESTS OFF CACHE BOOL "Turn off tests for non-UVM build") set (Anasazi_ENABLE_TESTS OFF CACHE BOOL "Turn off tests for non-UVM build") -set (Belos_ENABLE_TESTS OFF CACHE BOOL "Turn off tests for non-UVM build") set (Domi_ENABLE_TESTS OFF CACHE BOOL "Turn off tests for non-UVM build") -set (Ifpack2_ENABLE_TESTS OFF CACHE BOOL "Turn off tests for non-UVM build") set (Kokkos_ENABLE_TESTS OFF CACHE BOOL "Turn off tests for non-UVM build") set (KokkosKernels_ENABLE_TESTS OFF CACHE BOOL "Turn off tests for non-UVM build") set (MueLu_ENABLE_TESTS OFF CACHE BOOL "Turn off tests for non-UVM build") @@ -161,6 +158,7 @@ set (Teko_ENABLE_TESTS OFF CACHE BOOL "Turn off tests for non-UVM build") set (Xpetra_ENABLE_TESTS OFF CACHE BOOL "Turn off tests for non-UVM build") set (Zoltan2_ENABLE_TESTS OFF CACHE BOOL "Turn off tests for non-UVM build") + # ShyLU_DD UVM = OFF tests set (ShyLU_DDFROSch_test_thyra_xpetra_laplace_one_rank_TLP_IPOU_DIM3_TPETRA_MPI_1_DISABLE ON CACHE BOOL "Turn off tests 
for non-UVM build") set (ShyLU_DDFROSch_test_thyra_xpetra_laplace_one_rank_TLP_GDSW_DIM2_TPETRA_MPI_1_DISABLE ON CACHE BOOL "Turn off tests for non-UVM build") diff --git a/cmake/std/atdm/ats2/all_supported_builds.sh b/cmake/std/atdm/ats2/all_supported_builds.sh index 2b5b18e0560d..5141f5bae0a0 100644 --- a/cmake/std/atdm/ats2/all_supported_builds.sh +++ b/cmake/std/atdm/ats2/all_supported_builds.sh @@ -7,8 +7,8 @@ export ATDM_CONFIG_ALL_SUPPORTED_BUILDS=( ats2-cuda-10.1.243-gnu-7.3.1-spmpi-rolling_static_opt ats2-cuda-10.1.243-gnu-7.3.1-spmpi-rolling_static_dbg ats2-cuda-10.1.243-gnu-7.3.1-spmpi-rolling_complex_static_opt - ats2-xl-2020.03.18_spmpi-rolling_serial_static_opt - ats2-xl-2020.03.18_spmpi-rolling_serial_static_dbg - ats2-cuda-10.1.243-xl-2020.03.18_spmpi-rolling_static_opt - ats2-cuda-10.1.243-xl-2020.03.18_spmpi-rolling_static_dbg + #ats2-xl-2020.03.18_spmpi-rolling_serial_static_opt + #ats2-xl-2020.03.18_spmpi-rolling_serial_static_dbg + #ats2-cuda-10.1.243-xl-2020.03.18_spmpi-rolling_static_opt + #ats2-cuda-10.1.243-xl-2020.03.18_spmpi-rolling_static_dbg ) diff --git a/cmake/std/atdm/sems-rhel7/all_supported_builds.sh b/cmake/std/atdm/sems-rhel7/all_supported_builds.sh index 03a5bbaec6b6..7c2f2a817728 100644 --- a/cmake/std/atdm/sems-rhel7/all_supported_builds.sh +++ b/cmake/std/atdm/sems-rhel7/all_supported_builds.sh @@ -3,8 +3,9 @@ export ATDM_CONFIG_CTEST_S_BUILD_NAME_PREFIX=Trilinos-atdm- export ATDM_CONFIG_ALL_SUPPORTED_BUILDS=( - sems-rhel7-clang-10.0.0-openmp-shared-release - sems-rhel7-clang-10.0.0-openmp-shared-release-debug + #sems-rhel7-clang-10.0.0-openmp-shared-release + #sems-rhel7-clang-10.0.0-openmp-shared-release-debug + sems-rhel7-clang-7.0.1-openmp-shared-release-debug sems-rhel7-cuda-10.1-Volta70-complex-shared-release-debug sems-rhel7-gnu-7.2.0-openmp-complex-shared-release-debug sems-rhel7-intel-18.0.5-openmp-complex-shared-release-debug diff --git a/commonTools/build_stats/BuildStatsData.py b/commonTools/build_stats/BuildStatsData.py new file mode 100644 index 000000000000..51ba1467ff6a --- /dev/null +++ b/commonTools/build_stats/BuildStatsData.py @@ -0,0 +1,81 @@ +from FindTribitsCiSupportDir import * +import GeneralScriptSupport as GSS + + +# Standard set of build stats fields we want to read in +# +def getStdBuildStatsColsAndTypesList(): + return [ + ColNameAndType('max_resident_size_Kb', 'float'), + ColNameAndType('elapsed_real_time_sec', 'float'), + ColNameAndType('FileName', 'string'), + ColNameAndType('FileSize', 'float'), + ] +# NOTE: Above, we use type 'float' instead of 'int' for fields that are ints +# because we want to allow a very large size. 
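For reference, each *.timing file written by the wrapper is a two-row CSV (header line plus one data line), and only the four columns listed above are required; any extra columns are carried along as-is. Below is a standalone sketch of the conversion implied by that required-columns list (it deliberately does not import BuildStatsData, which needs the TriBITS support directories on sys.path; the sample row is copied from the unit-test fixtures added later in this diff):

```python
import csv, io

# Required build-stats columns and their types, mirroring
# getStdBuildStatsColsAndTypesList() above ('float' is used even for
# integer-valued fields so very large sizes are accepted).
requiredColsAndTypes = {
    'max_resident_size_Kb': float,
    'elapsed_real_time_sec': float,
    'FileName': str,
    'FileSize': float,
}

# One *.timing file as the wrapper writes it (sample taken from the
# unit-test data in this diff); extra columns are allowed.
timingFileText = (
    "max_resident_size_Kb,elapsed_real_time_sec,num_involuntary_context_switch,"
    "FileName,FileSize,num_filesystem_outputs\n"
    "240000,3.5,46,./some/base/dir/target1.o,3300000,20368\n")

row = next(csv.DictReader(io.StringIO(timingFileText)))
typedRow = {name: conv(row[name]) for name, conv in requiredColsAndTypes.items()}
print(typedRow)
# {'max_resident_size_Kb': 240000.0, 'elapsed_real_time_sec': 3.5,
#  'FileName': './some/base/dir/target1.o', 'FileSize': 3300000.0}
```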
+ + +def getColNameTypeIdxListGivenColNameAndTypeList(csvFileName, columnHeadersList, + colNameAndTypesToGetList, + ): + colNameTypeIdxList = [] + for colNameAndTypeToGet in colNameAndTypesToGetList: + colIdx = GSS.findInSequence(columnHeadersList, colNameAndTypeToGet.colName()) + if colIdx != -1: + colNameTypeIdxList.append(ColNameTypeIdx(colNameAndTypeToGet, colIdx)) + else: + raise Exception( + "Error, the CSV file column header '"+colNameAndTypeToGet.colName()+"'"+\ + " does not exist in the list of column headers "+str(columnHeadersList)+\ + " from the CSV file '"+csvFileName+"'!") + return colNameTypeIdxList + + +class ColNameAndType(object): + def __init__(self, colName, colType): + self.__colName = colName + self.__colType = colType + self.assertType() + def colName(self): + return self.__colName + def colType(self): + return self.__colType + def __repr__(self): + myStr = "ColNameAndType{"+self.__colName+","+str(self.__colType)+"}" + return myStr + def convertFromStr(self, strIn): + if self.__colType == "string": + return strIn + elif self.__colType == "int": + return int(strIn) + elif self.__colType == "float": + return float(strIn) + def assertType(self): + supportedTypes = [ "string", "int", "float" ] + if -1 == GSS.findInSequence(supportedTypes, self.__colType): + raise Exception( + "Error, type '"+str(self.__colType)+"' is not supported! Supported types"+\ + " include "+str(supportedTypes)+"!") + def __eq__(self, other): + return((self.__colName,self.__colType)==(other.__colName,other.__colType)) + def __ne__(self, other): + return((self.__colName,self.__colType)!=(other.__colName,other.__colType)) + + +class ColNameTypeIdx(object): + def __init__(self, colNameAndType, colIdx): + self.__colNameAndType = colNameAndType + self.__colIdx = colIdx + def colName(self): + return self.__colNameAndType.colName() + def getIdx(self): + return self.__colIdx + def convertFromStr(self, strIn): + return self.__colNameAndType.convertFromStr(strIn) + def __repr__(self): + myStr = "ColNameTypeIdx{"+str(self.__colNameAndType)+","+str(self.__colIdx)+"}" + return myStr + def __eq__(self, other): + return ((self.__colNameAndType,self.__colIdx)==(other.__colNameAndType,other.__colIdx)) + def __ne__(self, other): + return ((self.__colNameAndType,self.__colIdx)!=(other.__colNameAndType,other.__colIdx)) diff --git a/commonTools/build_stats/BuildStatsGatherTarget.cmake b/commonTools/build_stats/BuildStatsGatherTarget.cmake new file mode 100644 index 000000000000..c2070abefea5 --- /dev/null +++ b/commonTools/build_stats/BuildStatsGatherTarget.cmake @@ -0,0 +1,90 @@ +################################################################################ +# +# Add target for gathering up build stats +# +################################################################################ + + +include("${CMAKE_CURRENT_LIST_DIR}/BuildStatsSharedVars.cmake") + + +# Create custom 'gather-build-stats' target that will run last +# +# NOTE: This function must be called at the very end of all of the build +# targets that get created for a project! 
+# +function(add_target_gather_build_stats) + + if (${PROJECT_NAME}_ENABLE_BUILD_STATS) + + add_custom_command( + OUTPUT "${BUILD_STATS_CSV_FILE}" + COMMAND "${BUILD_STATS_SRC_DIR}/gather_build_stats.py" + WORKING_DIRECTORY "${${PROJECT_NAME}_BINARY_DIR}" ) + + add_custom_target(gather-build-stats ALL + DEPENDS "${BUILD_STATS_CSV_FILE}") + + get_all_build_targets_including_in_subdirs("${${PROJECT_NAME}_SOURCE_DIR}" + projectBuildTargetsList) + + if (projectBuildTargetsList) + add_dependencies(gather-build-stats ${projectBuildTargetsList}) + endif() + + endif() + +endfunction() + + +# Get a list all of the lib and exec build targets starting in a subdir and in +# below subdirs. +# +function(get_all_build_targets_including_in_subdirs srcdir targetsListVarOut) + + set(targetsList "") + + # Recurse into subdirectories. + get_property(dirs DIRECTORY ${srcdir} PROPERTY SUBDIRECTORIES) + foreach(d IN LISTS dirs) + get_all_build_targets_including_in_subdirs(${d} targetsSubdirList) + list(APPEND targetsList ${targetsSubdirList}) + endforeach() + + # Get the targets from this directory. + get_property(allTargetsThisDir DIRECTORY ${srcdir} PROPERTY BUILDSYSTEM_TARGETS) + filter_only_build_targets(allTargetsThisDir buildTargetsThisDir) + list(APPEND targetsList ${buildTargetsThisDir}) + + # Return + set(${targetsListVarOut} ${targetsList} PARENT_SCOPE) + +endfunction() + + +function(filter_only_build_targets targetListInVar targetListOutVar) + + #print_var(targetListInVar) + #print_var(${targetListInVar}) + + set(targetListOut "") + + foreach (target IN LISTS ${targetListInVar}) + #print_var(target) + get_property(targetType TARGET ${target} PROPERTY TYPE) + #print_var(targetType) + if ( + targetType STREQUAL "STATIC_LIBRARY" OR + targetType STREQUAL "SHARED_LIBRARY" OR + targetType STREQUAL "EXECUTABLE" + ) + #message("-- " "${target} is a regular build target!") + list(APPEND targetListOut ${target}) + else() + #message("-- " "${target} is **NOT** a regular build target!") + endif() + endforeach() + + set(${targetListOutVar} ${targetListOut} PARENT_SCOPE) + +endfunction() diff --git a/commonTools/build_stats/BuildStatsSharedVars.cmake b/commonTools/build_stats/BuildStatsSharedVars.cmake new file mode 100644 index 000000000000..fa89221ea280 --- /dev/null +++ b/commonTools/build_stats/BuildStatsSharedVars.cmake @@ -0,0 +1,2 @@ +set(BUILD_STATS_SRC_DIR "${CMAKE_CURRENT_LIST_DIR}") +set(BUILD_STATS_CSV_FILE "${${PROJECT_NAME}_BINARY_DIR}/build_stats.csv") diff --git a/commonTools/build_stats/BuildStatsWrappers.cmake b/commonTools/build_stats/BuildStatsWrappers.cmake index 517382159398..5dfcdf03b107 100644 --- a/commonTools/build_stats/BuildStatsWrappers.cmake +++ b/commonTools/build_stats/BuildStatsWrappers.cmake @@ -4,15 +4,51 @@ # ################################################################################ -set(BUILD_STATS_SRC_DIR "${CMAKE_CURRENT_LIST_DIR}") -set(BUILD_STATS_CSV_FILE "${${PROJECT_NAME}_BINARY_DIR}/build_stats.csv") +include("${CMAKE_CURRENT_LIST_DIR}/BuildStatsSharedVars.cmake") + # Generate the build stats compiler wrappers if asked to do so. 
# function(generate_build_stats_wrappers) - # Set default for cache var ${PROJECT_NAME}_ENABLE_BUILD_STATS + set_project_enable_build_stats_var() + + if (${PROJECT_NAME}_ENABLE_BUILD_STATS) + build_stats_find_and_check_time() # Sets cache var BUILD_STATS_TIME_CMD + if(NOT BUILD_STATS_TIME_CMD) + message("-- ${PROJECT_NAME}_ENABLE_BUILD_STATS=ON, but valid GNU Time was not found") + message("-- NOTE: Force setting ${PROJECT_NAME}_ENABLE_BUILD_STATS=OFF!") + set(${PROJECT_NAME}_ENABLE_BUILD_STATS OFF CACHE BOOL + "Forced to 'OFF' since valid 'time' command not found" FORCE) + return() + endif() + + get_base_build_dir_for_python() + + generate_build_stats_wrapper_for_op(C WRAP CMAKE_C_COMPILER) + generate_build_stats_wrapper_for_op(CXX WRAP CMAKE_CXX_COMPILER) + if (${PROJECT_NAME}_ENABLE_Fortran) + generate_build_stats_wrapper_for_op(Fortran WRAP CMAKE_Fortran_COMPILER) + endif() + + generate_build_stats_wrapper_for_op(LD WRAP CMAKE_LD ALLOW_FIND) + generate_build_stats_wrapper_for_op(AR WRAP CMAKE_AR ALLOW_FIND) + generate_build_stats_wrapper_for_op(RANLIB WRAP CMAKE_RANLIB ALLOW_FIND) + # NOTE: LD, AR, and RANDLIB can be used even in builds where + # BUILD_SHARED_LIBS=ON because individual add_librariy() commands can + # request static libraries be built. + + set(BUILD_STATS_COMPLETED_FIRST_CONFIG TRUE CACHE INTERNAL "") + endif() + +endfunction() + + +# Macro that sets the cache var ${PROJECT_NAME}_ENABLE_BUILD_STATS +# +macro(set_project_enable_build_stats_var) + if (NOT "$ENV{${PROJECT_NAME}_ENABLE_BUILD_STATS}" STREQUAL "") # Use the default set in the env (overrides any local default set) set(${PROJECT_NAME}_ENABLE_BUILD_STATS_DEFAULT @@ -25,30 +61,103 @@ function(generate_build_stats_wrappers) set(${PROJECT_NAME}_ENABLE_BUILD_STATS_DEFAULT OFF) endif() - # Set cache var ${PROJECT_NAME}_ENABLE_BUILD_STATS advanced_set(${PROJECT_NAME}_ENABLE_BUILD_STATS ${${PROJECT_NAME}_ENABLE_BUILD_STATS_DEFAULT} CACHE BOOL "If set to 'ON', then compiler wrappers will be created and used to gather build stats." ) - # Generate the build-stats compiler wrappers - get_base_build_dir_for_python() - if (${PROJECT_NAME}_ENABLE_BUILD_STATS) - generate_build_stats_wrapper_for_lang(C) - generate_build_stats_wrapper_for_lang(CXX) - if (${PROJECT_NAME}_ENABLE_Fortran) - generate_build_stats_wrapper_for_lang(Fortran) - endif() +endmacro() - set(gather_build_status "${${PROJECT_NAME}_BINARY_DIR}/gather_build_stats.sh") - configure_file("${BUILD_STATS_SRC_DIR}/gather_build_stats.sh" - "${gather_build_status}" COPYONLY) + +# Find the GNU 'time' command that is used by magic_wrapper.py to extract the +# info out of the command that it runs. +# +# If this finds the GNU 'time' command and it behaves correctly, then it sets +# the cache var BUILD_STATS_TIME_CMD on output. If BUILD_STATS_TIME_CMD is +# already set by the user in the cache and it is found to not behave +# correctly, then BUILD_STATS_TIME_CMD will be removed from the cache. 
+# +function(build_stats_find_and_check_time) + + # let the user provide BUILD_STATS_TIME_CMD + if (BUILD_STATS_TIME_CMD) + message("-- BUILD_STATS_TIME_CMD=${BUILD_STATS_TIME_CMD}") + set(GNU_TIME_EXE "${BUILD_STATS_TIME_CMD}") + else() + find_program(GNU_TIME_EXE "time" HINTS "/usr/bin") + if(GNU_TIME_EXE) + message("-- Found time at ${GNU_TIME_EXE}") + else() + message("-- GNU time NOT found") + message("-- Install GNU time and/or set BUILD_STATS_TIME_CMD=/path/to/time") + return() + endif() endif() + # This should ideally call the python script and request the fields to test, + # add 'badflag' or some other nonsense. + SET(GNU_TIME_POSSIBLE_FIELDS "e;M;K;D;X;F;R;W;w;c;S;U;P;I;O;r;s;k;x") + SET(GNU_TIME_SUPPORTED_FIELDS "") + + # Should ideally ask for the dtypes or suitable regexes to vet them + foreach(flag ${GNU_TIME_POSSIBLE_FIELDS}) + message(DEBUG "----------------------") + message(DEBUG "Time: Testing field ${flag}") + # The output from time goes to stderr, the programs output to stdout + execute_process(COMMAND "${GNU_TIME_EXE}" + "--format=%${flag}" "true" + # this is useless - we run a noop command + RESULT_VARIABLE GNU_TIME_RC + # capture stderr + ERROR_VARIABLE GNU_TIME_OUTPUT + ) + # If this fails, then something is broken on the system. The checks after + # will likely fail, because they expect a predefined format for stderr + # text. + if(NOT GNU_TIME_RC EQUAL 0) + message(DEBUG "Time invocation error returned `${GNU_TIME_RC}` but expected `0`") + message("-- GNU_TIME_EXE=${GNU_TIME_EXE} does not work") + message("-- Unset BUILD_STATS_TIME_CMD since '${GNU_TIME_EXE}' is invalid!") + unset(BUILD_STATS_TIME_CMD CACHE) + return() + endif() + + # For now, just assert that all expected fields are supported (see + # discussion after function of other possible options). + if("${GNU_TIME_OUTPUT}" MATCHES "^?${flag}.*") + message("-- Time does not support Field: ${flag}") + message("-- GNU_TIME_EXE=${GNU_TIME_EXE} does not work") + message("-- Unset BUILD_STATS_TIME_CMD since '${GNU_TIME_EXE}' is invalid!") + unset(BUILD_STATS_TIME_CMD CACHE) + return() + else() + message(DEBUG "-- Time supports Field: ${flag}") + list(APPEND GNU_TIME_SUPPORTED_FIELDS "${flag}") + endif() + endforeach() + + # If we get here, we should have a list of supported fields from TIME. + set(BUILD_STATS_TIME_CMD ${GNU_TIME_EXE} + CACHE FILEPATH "The GNU time binary required by build_stats" FORCE ) endfunction() +# +# NOTE: Above, the GNU_TIME_SUPPORTED_FIELDS list var is currently not being +# used for anything but in the future, it could be exported to the env as +# TRILINOS_BUILD_STATS_OUTPUT_FIELDS for the magic_wapper.py to use to pass in +# to the 'time' command for fields that are known to be supported. This would +# override the default fields specified there. Note that `time` will actually +# silently accept bad fields, and give `?field` back. If we were to set +# TRILINOS_BUILD_STATS_OUTPUT_FIELDS the GNU_TIME_SUPPORTED_FIELDS then bad +# fields will simply not be written to a file. +# +# One unimplemented feature in the wrapper is +# `TRILINOS_BUILD_STATS_PARSE_NM` which we could control if NM is used. Like +# 'time', we expect it to work and we could `find_program()` it as well. -# Get the var BASE_BUILD_DIR_FOR_PYTHON +# Get the non-cache var BASE_BUILD_DIR_FOR_PYTHON +# +# This var gets picked up in the configure of build_stat_wrapper.sh.in. 
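The field-probing loop in build_stats_find_and_check_time() above is easy to reproduce outside of CMake when debugging a machine where the wrappers refuse to turn on. A rough Python equivalent is sketched below; the /usr/bin/time path is an assumption, and on a system without GNU time the run either fails outright or every field reports as unsupported:

```python
import subprocess

TIME_CMD = "/usr/bin/time"   # assumption: GNU time installed at this path
POSSIBLE_FIELDS = ["e", "M", "K", "D", "X", "F", "R", "W", "w",
                   "c", "S", "U", "P", "I", "O", "r", "s", "k", "x"]

supported = []
for flag in POSSIBLE_FIELDS:
    # Run a no-op command; GNU time writes the formatted stats to stderr.
    proc = subprocess.run([TIME_CMD, "--format=%" + flag, "true"],
                          capture_output=True, text=True)
    stderr = proc.stderr.strip()
    # A nonzero return code means this 'time' does not work at all;
    # output of the form "?<flag>" means the field is not supported.
    if proc.returncode == 0 and not stderr.startswith("?" + flag):
        supported.append(flag)

print("supported GNU time fields:", supported)
```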
# macro(get_base_build_dir_for_python) set(get_cwd_for_python ${BUILD_STATS_SRC_DIR}/get_cwd_for_python.py) @@ -58,6 +167,7 @@ macro(get_base_build_dir_for_python) OUTPUT_VARIABLE BASE_BUILD_DIR_FOR_PYTHON OUTPUT_STRIP_TRAILING_WHITESPACE) endmacro() +# # NOTE: We need this function to get the value of os.getcwd() from Python so # that it matches the value returned inside of magic_wapper.py. The issue is # that some platforms, CMake determines a different absolute base build dir @@ -67,48 +177,98 @@ endmacro() # needed when computing relative paths. -# Generate the build stats compiler wrapper for a single language. +# Generate the build stats compiler wrapper for a given CMake variable. # -function(generate_build_stats_wrapper_for_lang lang) - - string(TOLOWER "${lang}" lang_lc) - set(compiler_wrapper - "${${PROJECT_NAME}_BINARY_DIR}/build_stat_${lang_lc}_wrapper.sh") - - # Override the compiler with the wrapper but remember the original compiler - if ("${CMAKE_${lang}_COMPILER_ORIG}" STREQUAL "") - set(CMAKE_${lang}_COMPILER_ORIG "${CMAKE_${lang}_COMPILER}" - CACHE FILEPATH "Original non-wrappeed ${lang} compiler" FORCE ) - set(CMAKE_${lang}_COMPILER "${compiler_wrapper}" - CACHE FILEPATH "Overwritten build stats ${lang} compiler wrapper" FORCE ) - endif() +# Usage: +# +# generate_build_stats_wrapper_for_op( WRAP [ALLOW_FIND]) +# +# The intent of this function is pass in arbitrary cmake variables +# that map to commands and generate suitable wrappers. +# +# is the short name, like C, CXX, Fortran, LD, AR, RANLIB. +# +function(generate_build_stats_wrapper_for_op op_name) + cmake_parse_arguments( + PARSE_ARGV 1 + BUILD_STATS # prefix + "ALLOW_FIND" # options + "WRAP" # one_value_keywords + "" # multi_value_keywords + ) + set(variable_to_set "${BUILD_STATS_WRAP}") - message("-- " "Generate build stats compiler wrapper for ${lang}") - set(BUILD_STAT_COMPILER_WRAPPER_INNER_COMPILER "${CMAKE_${lang}_COMPILER_ORIG}") - configure_file("${BUILD_STATS_SRC_DIR}/build_stat_lang_wrapper.sh.in" - "${compiler_wrapper}" @ONLY) + string(TOLOWER "${op_name}" op_lc) + set(op_wrapper "${${PROJECT_NAME}_BINARY_DIR}/build_stat_${op_lc}_wrapper.sh") - # Use the orginal compiler for the installed Config.cmake files - set(CMAKE_${lang}_COMPILER_FOR_CONFIG_FILE_INSTALL_DIR - "${CMAKE_${lang}_COMPILER_ORIG}" CACHE INTERNAL "") - #print_var(CMAKE_${lang}_COMPILER_FOR_CONFIG_FILE_INSTALL_DIR) + generate_build_stats_wrapper_for_op_find_op_lc() # Sets ${variable_to_set} -endfunction() + # Override the op with the wrapper but remember the original command + if (NOT BUILD_STATS_COMPLETED_FIRST_CONFIG) + if (${variable_to_set}) # True if was set on input or was found above + set(${variable_to_set}_ORIG ${${variable_to_set}} + CACHE FILEPATH "Original non-wrapped ${op_name}" FORCE ) + set(${variable_to_set} "${op_wrapper}" + CACHE FILEPATH "Overwritten build stats ${op_name} wrapper" FORCE ) + + message("-- " "Generating build stats wrapper for ${op_name}") + set(BUILD_STATS_WRAPPER_INNER_OP "${${variable_to_set}_ORIG}") + configure_file("${BUILD_STATS_SRC_DIR}/build_stat_wrapper.sh.in" + "${op_wrapper}" @ONLY) -# NOTE: The above implementation will make sure the compiler wrapper will get -# updated if the *.sh.in template file changes and just reconfiguring. -# Actaully, you should be able to fix the wrapper and just type 'make' and it -# should reconfigure and update automatically. 
+ set(${variable_to_set}_OP_FOR_CONFIG_FILE_INSTALL_DIR + "${${variable_to_set}_ORIG}" CACHE INTERNAL "") + else() + message("-- Not wrapping ${op_name} because " + "${variable_to_set}=`${variable_to_set}` is not set." + " To enable statistics set ${variable_to_set}.") + endif() + endif() +endfunction() +# +# NOTE: Above, if this is not the first configure (and +# BUILD_STATS_COMPLETED_FIRST_CONFIG is unset) then we don't want to do +# anything different with the build stats wrappers. For example, we don't +# want CMAKE_CXX_FLAGS to be empty on the first configure when this function +# is called and have CMake to find the C++ compiler later in the first +# configure and then on the reconfigure have a build stats wrapper generated +# for the C++ compiler. If this happened, then the C++ code would build with +# the raw C++ compiler after the first configure but after the second and +# subsequent (re)configures would (re)build the code with the build-stats +# wrapped compiler. It seems like a bad idea to have the code build +# differently on a reconfigure even if the user does not do anything other +# than trigger a reconfigure (e.g. by touching a CMakeLists.txt file or adding +# a new source file). + + +# Helper macro to shorten above function some +# +# Sets ${variable_to_set} if ${op_lc} is found. +# +macro(generate_build_stats_wrapper_for_op_find_op_lc) + # there's an issue here - if CMAKE_FOO is unset (whatever `variable_to_set` is) + # we need a to know the command - but CMake hasn't chosen one yet... + if( BUILD_STATS_ALLOW_FIND + AND ( + ("${${variable_to_set}}" STREQUAL "") + OR + (NOT ${variable_to_set}) + ) + ) + message("-- " "${variable_to_set} is not set, but a wrapper has been requested. Asking CMake to find ${op_lc}") + find_program(${variable_to_set} "${op_lc}") + print_var(${variable_to_set}) + endif() +endmacro() -# Remove the build stats file on configure if asked to do so. +# Remove the build stats file on each configure if asked to do so. # function(remove_build_stats_file_on_configure) advanced_set(${PROJECT_NAME}_REMOVE_BUILD_STATS_ON_CONFIGURE OFF ${${PROJECT_NAME}_REMOVE_BUILD_STATS_ON_CONFIGURE_DEFAULT} CACHE BOOL - "If set to 'ON', then the build_stats.csv file will be removed on each configure." - ) + "If set to 'ON', then the build_stats.csv file will be removed on each configure." ) if ( (${PROJECT_NAME}_REMOVE_BUILD_STATS_ON_CONFIGURE) @@ -156,18 +316,19 @@ endfunction() # function(install_build_stats_scripts) - if (${PROJECT_NAME}_ENABLE_BUILD_STATS) - - set(gather_build_status "${${PROJECT_NAME}_BINARY_DIR}/gather_build_stats.sh") - install(PROGRAMS "${gather_build_status}" - DESTINATION "${${PROJECT_NAME}_INSTALL_RUNTIME_DIR}") + # disable this for now... + return() + if (${PROJECT_NAME}_ENABLE_BUILD_STATS) install_build_stats_wrapper_for_lang(C) install_build_stats_wrapper_for_lang(CXX) if (${PROJECT_NAME}_ENABLE_Fortran) install_build_stats_wrapper_for_lang(Fortran) endif() + install_build_stats_wrapper_for_lang(AR) + install_build_stats_wrapper_for_lang(LD) + install_build_stats_wrapper_for_lang(RANLIB) endif() endfunction() @@ -176,93 +337,12 @@ endfunction() # Install the build stats compiler wrapper for a single language. 
# function(install_build_stats_wrapper_for_lang lang) - string(TOLOWER "${lang}" lang_lc) - set(compiler_wrapper - "${${PROJECT_NAME}_BINARY_DIR}/build_stat_${lang_lc}_wrapper.sh") - install(PROGRAMS "${compiler_wrapper}" - DESTINATION "${${PROJECT_NAME}_INSTALL_RUNTIME_DIR}") -endfunction() - - -# Create custom 'gather-build-stats' target that will run last -# -# NOTE: This function must be called at the very end of all of the build -# targets that get created for a project! -# -function(add_target_gather_build_stats) - - if (${PROJECT_NAME}_ENABLE_BUILD_STATS) - - set(buildStatsCsvFile "${${PROJECT_NAME}_BINARY_DIR}/build_stats.csv") - - add_custom_command( - OUTPUT "${buildStatsCsvFile}" - COMMAND "${${PROJECT_NAME}_BINARY_DIR}/gather_build_stats.sh" - WORKING_DIRECTORY "${${PROJECT_NAME}_BINARY_DIR}" ) - - add_custom_target(gather-build-stats ALL - DEPENDS "${buildStatsCsvFile}") - - get_all_build_targets_including_in_subdirs("${${PROJECT_NAME}_SOURCE_DIR}" - projectBuildTargetsList) - - if (projectBuildTargetsList) - add_dependencies(gather-build-stats ${projectBuildTargetsList}) - endif() - - endif() - -endfunction() - - -# Get a list all of the lib and exec build targets starting in a a subdir and -# in below subdirs. -# -function(get_all_build_targets_including_in_subdirs srcdir targetsListVarOut) - - set(targetsList "") - - # Recurse into subdirectories. - get_property(dirs DIRECTORY ${srcdir} PROPERTY SUBDIRECTORIES) - foreach(d IN LISTS dirs) - get_all_build_targets_including_in_subdirs(${d} targetsSubdirList) - list(APPEND targetsList ${targetsSubdirList}) - endforeach() - - # Get the targets from this directory. - get_property(allTargetsThisDir DIRECTORY ${srcdir} PROPERTY BUILDSYSTEM_TARGETS) - filter_only_build_targets(allTargetsThisDir buildTargetsThisDir) - list(APPEND targetsList ${buildTargetsThisDir}) - - # Return - set(${targetsListVarOut} ${targetsList} PARENT_SCOPE) + string(TOLOWER "${op_name}" op_lc) + set(op_wrapper + "${${PROJECT_NAME}_BINARY_DIR}/build_stat_${op_lc}_wrapper.sh") + install(PROGRAMS "${op_wrapper}" + DESTINATION "${${PROJECT_NAME}_INSTALL_RUNTIME_DIR}") endfunction() -function(filter_only_build_targets targetListInVar targetListOutVar) - - #print_var(targetListInVar) - #print_var(${targetListInVar}) - - set(targetListOut "") - - foreach (target IN LISTS ${targetListInVar}) - #print_var(target) - get_property(targetType TARGET ${target} PROPERTY TYPE) - #print_var(targetType) - if ( - targetType STREQUAL "STATIC_LIBRARY" OR - targetType STREQUAL "SHARED_LIBRARY" OR - targetType STREQUAL "EXECUTABLE" - ) - #message("-- " "${target} is a regular build target!") - list(APPEND targetListOut ${target}) - else() - #message("-- " "${target} is **NOT** a regular build target!") - endif() - endforeach() - - set(${targetListOutVar} ${targetListOut} PARENT_SCOPE) - -endfunction() diff --git a/commonTools/build_stats/CMakeLists.txt b/commonTools/build_stats/CMakeLists.txt index bd5efbc45a33..158048338f50 100644 --- a/commonTools/build_stats/CMakeLists.txt +++ b/commonTools/build_stats/CMakeLists.txt @@ -1,5 +1,6 @@ tribits_package(TrilinosBuildStats) +include("${CMAKE_CURRENT_LIST_DIR}/BuildStatsGatherTarget.cmake") add_target_gather_build_stats() # NOTE: We define this build target here after all of the other packages are # done getting defined so that it will have a dependency on all defined @@ -17,18 +18,21 @@ tribits_add_advanced_test( Results OVERALL_NUM_MPI_PROCS 1 TEST_0 MESSAGE "Gather up the build stats in case the build failed" - CMND 
"${${PROJECT_NAME}_BINARY_DIR}/gather_build_stats.sh" + CMND "${CMAKE_CURRENT_LIST_DIR}/gather_build_stats.py" + ARGS -v WORKING_DIRECTORY "${${PROJECT_NAME}_BINARY_DIR}" SKIP_CLEAN_WORKING_DIRECTORY # Critical or you delete the entire working dir! ALWAYS_FAIL_ON_NONZERO_RETURN TEST_1 - MESSAGE "Sumarize the build stats from the already created build_stats.csv file" + MESSAGE "Sumarize the build stats from the already created build_stats.csv file (CTEST_FULL_OUTPUT)" CMND "${${PROJECT_NAME}_SOURCE_DIR}/commonTools/build_stats/summarize_build_stats.py" - ARGS --build-stats-csv-file="${${PROJECT_NAME}_BINARY_DIR}/build_stats.csv" - --bin-by-subdirs-under-dirs=commonTools,packages + ARGS --bin-by-subdirs-under-dirs=commonTools,packages + "${${PROJECT_NAME}_BINARY_DIR}/build_stats.csv" ALWAYS_FAIL_ON_NONZERO_RETURN ADDED_TEST_NAME_OUT Results_TEST_NAME ) + # Note, above CTEST_FULL_OUTPUT is in the MESSAGE to get ctest to keep the + # full test output and send and display on CDash. if (Results_TEST_NAME) set_tests_properties( ${Results_TEST_NAME} PROPERTIES diff --git a/commonTools/build_stats/build_stat_lang_wrapper.sh.in b/commonTools/build_stats/build_stat_lang_wrapper.sh.in deleted file mode 100755 index c516503ad387..000000000000 --- a/commonTools/build_stats/build_stat_lang_wrapper.sh.in +++ /dev/null @@ -1,9 +0,0 @@ -#!/bin/bash -cmd="@BUILD_STAT_COMPILER_WRAPPER_INNER_COMPILER@" -base_build_dir=@BASE_BUILD_DIR_FOR_PYTHON@ -if [ "${CMAKE_IS_IN_CONFIGURE_MODE}" == "1" ]; then - ${cmd} "$@" -else - "@BUILD_STATS_SRC_DIR@/wrapper/magic_wrapper.py" \ - "----base-build-dir=${base_build_dir}" "----op=${cmd}" "$@" -fi diff --git a/commonTools/build_stats/build_stat_wrapper.sh.in b/commonTools/build_stats/build_stat_wrapper.sh.in new file mode 100755 index 000000000000..28d18f611c65 --- /dev/null +++ b/commonTools/build_stats/build_stat_wrapper.sh.in @@ -0,0 +1,18 @@ +#!/bin/bash +# Consumed by the magic_wrapper + +export TRILINOS_BUILD_STATS_PARSE_NM="false" +# BUILD_STATS_INNER_OP is the command we are wrapping +export TRILINOS_BUILD_STATS_INNER_OP="@BUILD_STATS_WRAPPER_INNER_OP@" +# BUILD_STATS_TIME_CMD points to a valid GNU Time executable +export TRILINOS_BUILD_STATS_TIME_CMD="@BUILD_STATS_TIME_CMD@" +# We need to know the `root` of the build tree so we annotate +# paths correctly (see github PR 8638 for issue with Makefile builds) +export TRILINOS_BUILD_STATS_BASE_DIR="@BASE_BUILD_DIR_FOR_PYTHON@" + +if [ "${CMAKE_IS_IN_CONFIGURE_MODE}" == "1" ]; then + ${TRILINOS_BUILD_STATS_INNER_OP} "$@" +else + "@PYTHON_EXECUTABLE@" \ + "@BUILD_STATS_SRC_DIR@/wrapper/magic_wrapper.py" "$@" +fi diff --git a/commonTools/build_stats/gather_build_stats.py b/commonTools/build_stats/gather_build_stats.py new file mode 100755 index 000000000000..0cda143c9d91 --- /dev/null +++ b/commonTools/build_stats/gather_build_stats.py @@ -0,0 +1,316 @@ +#!/usr/bin/env python +# -*- coding: utf-8 -*- + +import csv + +from FindTribitsCiSupportDir import * +import GeneralScriptSupport as GSS +import CDashQueryAnalyzeReport as CDQAR + +from BuildStatsData import * + + +# +# Helper functions +# + + +# Robustly read all CSV build stats *.timing files under a base dir and return +# as a list of dicts (LOD). +# +# This robustly deals with *.timing files and discards any *.timing files that +# have any problems. 
+# +def readAllValidTimingFiles(baseDir, printErrMsg=True, printStats=False): + listOfAllTimingFiles = getListOfAllTimingFiles(baseDir) + if printStats: + print("Number of *.timing files found = "+str(len(listOfAllTimingFiles))) + allValidTimingFilesLOD = [] + for timingFile in listOfAllTimingFiles: + timingFileFullPath = baseDir+"/"+timingFile + (buildStatsTimingDict, errMsg) = \ + readBuildStatsTimingFileIntoDict(timingFileFullPath) + if errMsg != "" and printErrMsg: + print(errMsg) + if not buildStatsTimingDict == None: + allValidTimingFilesLOD.append(buildStatsTimingDict) + if printStats: + print("Number of valid *.timing files found = "+str(len(allValidTimingFilesLOD))) + return allValidTimingFilesLOD + + +# Robustly read a CSV build stats timing file created by magic_wrapper.py and +# return as dict. +# +# Returns tuple: +# +# (timingBuildStatsDict, errorMsg) +# +# If the timing build stats file 'buildStatsTimingFile' exists and has valid +# data, then 'timingBuildStatsDict' will be a simple dict with the contents of +# the CSV file. Otherwise, 'timingBuildStatsDict' will be 'None' and 'errMsg' +# will contain the error message. +# +# The provides for a very robust reading of these timing build stats files in +# case there are problems with the running of the magic_wrapper.py tool. +# +def readBuildStatsTimingFileIntoDict(buildStatsTimingFile): + + # Output data initialization + buildStatsTimingDict = None + errMsg = "" + + (listOfDicts, errMsg) = robustReadCsvFileIntoListOfDicts(buildStatsTimingFile) + + if errMsg == "" and not len(listOfDicts) == 1: + errMsg = buildStatsTimingFile+": ERROR: Contains "+\ + str(len(listOfDicts))+" != 1 data rows!" + + if listOfDicts != None and errMsg == "": + # No errors found to this point, so grab the first row as the build stats dict + buildStatsTimingDict = listOfDicts[0] + + if buildStatsTimingDict != None and errMsg == "": + errMsgBody = checkBuildStatsTimingDictHasError(buildStatsTimingDict) + if errMsgBody != "": + errMsg = buildStatsTimingFile+": "+errMsgBody + + if buildStatsTimingDict != None and errMsg == "": + normalizeFileNameFieldInDict(buildStatsTimingDict) + + if errMsg != "": + buildStatsTimingDict = None + + return (buildStatsTimingDict, errMsg) + + +# Call readCsvFileIntoListOfDicts() but make robust to basic read errors. +# +# Returns: +# +# (listOfDicts, errMsg) +# +# Returns a valid list of dicts listOfDicts!=None unless some error occurs. +# If some error occured, then errMsg will be sets to a string descrdibing what +# the problem was. +# +# No exception should ever be thrown from this function. This is useful to +# use in cases where the existance or basic structure of a CSV file may be +# broken and we want to ignore or gracefully deal with invalid files. +# +def robustReadCsvFileIntoListOfDicts(csvFile): + listOfDicts = None + errMsg = "" + if os.path.exists(csvFile): + try: + listOfDicts = CDQAR.readCsvFileIntoListOfDicts(csvFile) + except Exception as exceptObj: + if str(exceptObj).find("is empty which is not allowed") != -1: + errMsg = csvFile+": ERROR: File is empty!" + else: + errMsg = csvFile+": ERROR: "+str(exceptObj) + # NOTE: The above check is tied pretty tighlty to the implementation of + # readCsvFileIntoListOfDicts() in looking for a specific substring in + # the error message but it will still capture any other error as well + # and report it through errMsg. + else: + errMsg = csvFile+": ERROR: File does not exist!" + return (listOfDicts, errMsg) +# ToDo: Move the above function to CsvFileUtils.py! 
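The (listOfDicts, errMsg) contract above is what keeps one malformed *.timing file from aborting the whole gather step. A simplified stand-in that uses csv.DictReader directly instead of CDashQueryAnalyzeReport.readCsvFileIntoListOfDicts() is sketched below to show the shape of that contract (the real helper raises on an empty file, which the function above converts into the same style of message):

```python
import csv
import os

def robustReadCsvFileIntoListOfDictsSketch(csvFile):
    # Returns (listOfDicts, errMsg); exactly one of the two is meaningful
    # and no exception escapes this function.
    if not os.path.exists(csvFile):
        return (None, csvFile + ": ERROR: File does not exist!")
    try:
        with open(csvFile, "r") as f:
            rows = list(csv.DictReader(f))
        if not rows:
            return (None, csvFile + ": ERROR: File is empty!")
        return (rows, "")
    except Exception as exceptObj:
        return (None, csvFile + ": ERROR: " + str(exceptObj))

# Example: a missing file is reported through errMsg, not raised.
print(robustReadCsvFileIntoListOfDictsSketch("no_such_file.timing"))
```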
+ + +# Assert that a build stats timing dict contains the required fields and has +# valid data in those required field. +# +# Returns: +# +# errMsg +# +# Returns errMsg=="" if there is no error. Otherwise, errMsg describes the +# nature of the error. +# +def checkBuildStatsTimingDictHasError(buildStatsTimingDict): + errMsg = "" + for stdBuildStatColAndType in getStdBuildStatsColsAndTypesList(): + requiredFieldName = stdBuildStatColAndType.colName() + requiredFieldType = stdBuildStatColAndType.colType() + strVal = buildStatsTimingDict.get(requiredFieldName, None) + if strVal == None: + errMsg = "ERROR: The required field '"+requiredFieldName+"' is missing!" + break + try: + convertedVal = stdBuildStatColAndType.convertFromStr(strVal) + except Exception as exceptObj: + errMsg = "ERROR: For field '"+requiredFieldName+"' the string value '"+strVal+"'"+\ + " could not be converted to the expected type '"+requiredFieldType+"'!" + return errMsg + + +# Normalize the 'FileName' field value +# +def normalizeFileNameFieldInDict(aDict): + fileName = aDict.get('FileName') + if fileName.startswith("./"): + aDict.update({'FileName':fileName[2:]}) + + +# Get list of all *.timing files under baseDir and return paths relative to +# baseDir. +# +# This does not read the contents of any of the timing files, it just returns +# a list of all of them. +# +def getListOfAllTimingFiles(baseDir): + listOfAllTimingFiles = [] + for root, subdirs, files in os.walk(baseDir): + if root == baseDir: relRoot = "" + else: relRoot = root.replace(baseDir+"/","") + for aFile in files: + if aFile.endswith(".timing"): + aFileRelPath = os.path.join(relRoot, aFile) + listOfAllTimingFiles.append(aFileRelPath) + return listOfAllTimingFiles + + +# Fill in dict of lists for combined info from a list of dicts +# +# The output dict of lists will have the superset of keys from all of the +# input dicts in the listOfDicts and any non-existent values will be given the +# empty string "" instead of `None`. +# +def getDictOfListsFromListOfDicts(listOfDicts, printStats=False): + numTotalRows = len(listOfDicts) + supersetOfFieldNamesList = getSupersetOfFieldNamesList(listOfDicts) + if printStats: + print( + "Combined build-stats keys sorted:\n"+\ + " "+str(supersetOfFieldNamesList) ) + dictOfLists = {} + # Create dict of lists with all empty values + for keyName in supersetOfFieldNamesList: + dictOfLists.update( { keyName : [""] * numTotalRows } ) + # Fill in the values from the dicts in the list + for i in range(numTotalRows): + aDict = listOfDicts[i] + for key, value in aDict.items(): + dictOfLists.get(key)[i] = value + # Return the completed data-structure + return dictOfLists + + +# Get superset of all of the field names for a list of dicts +# +def getSupersetOfFieldNamesList(listOfDicts): + supersetOfFieldNames = set() + for aDict in listOfDicts: + supersetOfFieldNames = supersetOfFieldNames.union(aDict.keys()) + return sorted(list(supersetOfFieldNames)) + + +# Write a dict of lists to a CSV File +# +# Note, this writes the column names (keys) in sorted order. 
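Concretely, the superset-of-keys merge described above turns rows with different column sets into one rectangular table, padding missing cells with the empty string. A tiny self-contained rendering of that behavior (not importing gather_build_stats.py itself, since it needs the TriBITS support directories; the sample values echo the dummy *.timing fixtures added below):

```python
rows = [
    {"FileName": "target4.o", "FileSize": "260000"},
    {"FileName": "target2.lib", "FileSize": "870000", "cpu_sec_user_mode": "1.38"},
]

# Superset of all keys across the rows, sorted, as getSupersetOfFieldNamesList() does.
allKeys = sorted({key for row in rows for key in row})

# Dict of lists with "" for any field a given row does not have.
dictOfLists = {key: [row.get(key, "") for row in rows] for key in allKeys}
print(dictOfLists)
# {'FileName': ['target4.o', 'target2.lib'],
#  'FileSize': ['260000', '870000'],
#  'cpu_sec_user_mode': ['', '1.38']}
```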
+# +def writeDictOfListsToCsvFile(dictOfLists, csvFile): + keysList = sorted(dictOfLists.keys()) + if len(keysList) > 0: + numTotalRows = len(dictOfLists.get(keysList[0])) # All lists are same length + else: + numTotalRows = 0 + numTotalKeys = len(keysList) + with open(csvFile, "w") as csvFileHandle: + csvWriter = csv.writer(csvFileHandle, delimiter=",", lineterminator="\n") + csvWriter.writerow(keysList) + for i in range(numTotalRows): + rowList = [] + for aKey in keysList: + rowList.append(dictOfLists.get(aKey)[i]) + csvWriter.writerow(rowList) +# ToDo: Move the above function to CsvFileUtils.py! + + +# +# Helper functions for main() +# + + +# +# Help message +# + + +def getRequiredFieldsAndTypesDocStr(): + docStr = "" + for stdBuildStatColAndType in getStdBuildStatsColsAndTypesList(): + requiredFieldName = stdBuildStatColAndType.colName() + requiredFieldType = stdBuildStatColAndType.colType() + docStr += " "+requiredFieldName+" : "+requiredFieldType+"\n" + return docStr + + +usageHelp = r""" + +Gather up build stats from *.timing CSV files under the given base directory +created by the magic_wrapper.py tool as a byproduct of building the various +targets in a project. + +This will discard the data from any *.timing file that does not have valid +values for the required minimum column headers/fields with types: + +"""+getRequiredFieldsAndTypesDocStr()+r""" + +or if any other problems are found with a *.timing file. + +The column headers in all of the *.timing files are combined into one superset +in the generated 'buildStatsCsvFile' file and the columns are listed in sorted +order. (The values for any fields missing in a *.timing file are given the null +string ''.) +""" + + +def injectCmndLineOptionsInParser(clp): + + clp.add_argument( + "--base-dir", "-d", dest="baseDir", default="", + help="Base directory for project build dir containing the *.timing files."+\ + " [default is current working directory]" ) + + clp.add_argument( + "--verbose", "-v", dest="verbose", action="store_true", default=False, + help="Provide verbose output." 
) + + clp.add_argument("buildStatsCsvFile", nargs='?', default="build_stats.csv", + help="The build status CSV file to created on output."+\ + " [default is 'build_stats.csv' in current working directory]" ) + + +def getCmndLineOptions(): + from argparse import ArgumentParser, RawDescriptionHelpFormatter + clp = ArgumentParser(description=usageHelp, + formatter_class=RawDescriptionHelpFormatter) + injectCmndLineOptionsInParser(clp) + options = clp.parse_args(sys.argv[1:]) + if options.baseDir == "": + options.baseDir = os.getcwd() + elif not os.path.exists(options.baseDir): + print("Error, the base dir '"+options.baseDir+"' does not exist!") + return options + + +# +# Main() +# + +if __name__ == '__main__': + inOptions = getCmndLineOptions() + if inOptions.verbose: + print("Reading all *.timing files from under '"+inOptions.baseDir+"' ...") + allValidTimingFilesListOfDicts = readAllValidTimingFiles(inOptions.baseDir, + printStats=inOptions.verbose) + allValidTimingFilesDictOfLists = \ + getDictOfListsFromListOfDicts(allValidTimingFilesListOfDicts, + printStats=inOptions.verbose) + writeDictOfListsToCsvFile(allValidTimingFilesDictOfLists, + inOptions.buildStatsCsvFile) + if inOptions.verbose: + print("Wrote file '"+inOptions.buildStatsCsvFile+"'") diff --git a/commonTools/build_stats/gather_build_stats.sh b/commonTools/build_stats/gather_build_stats.sh deleted file mode 100755 index 5a92f41610ed..000000000000 --- a/commonTools/build_stats/gather_build_stats.sh +++ /dev/null @@ -1,7 +0,0 @@ -#!/bin/bash -output_file=$1 ; shift -if [[ "${output_file}" == "" ]] ; then - output_file=build_stats.csv -fi -find -name "*timing" -print -quit | xargs head -n1 > "${output_file}" -find -name "*timing" -print0 | xargs -0 tail -q -n1 >> "${output_file}" diff --git a/commonTools/build_stats/summarize_build_stats.py b/commonTools/build_stats/summarize_build_stats.py index 14c4ff619585..f62d3e82904c 100755 --- a/commonTools/build_stats/summarize_build_stats.py +++ b/commonTools/build_stats/summarize_build_stats.py @@ -8,6 +8,8 @@ import GeneralScriptSupport as GSS import CDashQueryAnalyzeReport as CDQAR +from BuildStatsData import * + # # Helper functions @@ -38,19 +40,6 @@ def readBuildStatsCsvFileIntoDictOfLists(buildStatusCsvFileName, return buildStatsDOL -# Standard set of build stats fields we want to read in -# -def getStdBuildStatsColsAndTypesList(): - return [ - ColNameAndType('max_resident_size_Kb', 'float'), - ColNameAndType('elapsed_real_time_sec', 'float'), - ColNameAndType('FileName', 'string'), - ColNameAndType('FileSize', 'float'), - ] -# NOTE: Above, we use type 'float' instead of 'int' for fields that are ints -# because we want to allow a very large size. - - # Read in a CSV file as a dict of lists. # def readCsvFileIntoDictOfLists(csvFileName, colNameAndTypeList): @@ -81,6 +70,7 @@ def readCsvFileIntoDictOfLists(csvFileName, colNameAndTypeList): dataRowIdx += 1 # Return completed dict of lists return dictOfLists +# ToDo: Move above function into CsvFileUtils.py def assertNumExpectedCsvFileLineEntries(csvFileName, columnHeadersList, @@ -93,72 +83,6 @@ def assertNumExpectedCsvFileLineEntries(csvFileName, columnHeadersList, str(len(csvLineList))+" entries!" 
) -def getColNameTypeIdxListGivenColNameAndTypeList(csvFileName, columnHeadersList, - colNameAndTypesToGetList, - ): - colNameTypeIdxList = [] - for colNameAndTypeToGet in colNameAndTypesToGetList: - colIdx = GSS.findInSequence(columnHeadersList, colNameAndTypeToGet.colName()) - if colIdx != -1: - colNameTypeIdxList.append(ColNameTypeIdx(colNameAndTypeToGet, colIdx)) - else: - raise Exception( - "Error, the CSV file column header '"+colNameAndTypeToGet.colName()+"'"+\ - " does not exist in the list of column headers "+str(columnHeadersList)+\ - " from the CSV file '"+csvFileName+"'!") - return colNameTypeIdxList - - -class ColNameAndType(object): - def __init__(self, colName, colType): - self.__colName = colName - self.__colType = colType - self.assertType() - def colName(self): - return self.__colName - def colType(self): - return self.__colType - def __repr__(self): - myStr = "ColNameAndType{"+self.__colName+","+str(self.__colType)+"}" - return myStr - def convertFromStr(self, strIn): - if self.__colType == "string": - return strIn - elif self.__colType == "int": - return int(strIn) - elif self.__colType == "float": - return float(strIn) - def assertType(self): - supportedTypes = [ "string", "int", "float" ] - if -1 == GSS.findInSequence(supportedTypes, self.__colType): - raise Exception( - "Error, type '"+str(self.__colType)+"' is not supported! Supported types"+\ - " include "+str(supportedTypes)+"!") - def __eq__(self, other): - return((self.__colName,self.__colType)==(other.__colName,other.__colType)) - def __ne__(self, other): - return((self.__colName,self.__colType)!=(other.__colName,other.__colType)) - - -class ColNameTypeIdx(object): - def __init__(self, colNameAndType, colIdx): - self.__colNameAndType = colNameAndType - self.__colIdx = colIdx - def colName(self): - return self.__colNameAndType.colName() - def getIdx(self): - return self.__colIdx - def convertFromStr(self, strIn): - return self.__colNameAndType.convertFromStr(strIn) - def __repr__(self): - myStr = "ColNameTypeIdx{"+str(self.__colNameAndType)+","+str(self.__colIdx)+"}" - return myStr - def __eq__(self, other): - return ((self.__colNameAndType,self.__colIdx)==(other.__colNameAndType,other.__colIdx)) - def __ne__(self, other): - return ((self.__colNameAndType,self.__colIdx)!=(other.__colNameAndType,other.__colIdx)) - - # Add standard scaled fields to read-in build stats dict of lists # def addStdScaledBuildStatsFields(buildStatsDOL): @@ -343,7 +267,6 @@ def createAsciiReportOfOneBuildStatsSummary(buildStatsSummary, buildStatsSetName return asciiReportStr - # Create an ASCII text report block for a list of build stats summaries for a # single list of stats. # @@ -362,12 +285,15 @@ def createAsciiReportOfBuildStatsSummaries(buildStatsSummariesBinnedBySubdirs): # -# Help message +# Helper functions for main() # -usageHelp = r"""summarize_build_stats.py --build-stats-csv-file= - --bin-by-subdirs-under-dirs=,,... +# +# Help message +# + +usageHelp = r""" Summarize gathered build stats from the the build stats CSV file and print the report as ASCII text to STDOUT. This prints a report like: @@ -388,33 +314,23 @@ def createAsciiReportOfBuildStatsSummaries(buildStatsSummariesBinnedBySubdirs): ... """ - -# -# Helper functions for main() -# - - -def injectCmndLineOptionsInParser(clp, gitoliteRootDefault=""): - - clp.add_option( - "--build-stats-csv-file", dest="buildStatsCsvFile", type="string", default="", - help="The build status CSV file created by build wappers and gathered up." 
) +def injectCmndLineOptionsInParser(clp): - clp.add_option( - "--bin-by-subdirs-under-dirs", dest="binBySubdirsUnderDirsStr", type="string", - default="", + clp.add_argument( + "--bin-by-subdirs-under-dirs", dest="binBySubdirsUnderDirsStr", default="", help="List of base dirs to group results by subdir under."+\ " Format ',,..." ) + clp.add_argument("buildStatsCsvFile", + help="The build status CSV file created by build wappers and gathered up." ) + def getCmndLineOptions(): - from optparse import OptionParser - clp = OptionParser(usage=usageHelp) + from argparse import ArgumentParser, RawDescriptionHelpFormatter + clp = ArgumentParser(description=usageHelp, + formatter_class=RawDescriptionHelpFormatter) injectCmndLineOptionsInParser(clp) - (options, args) = clp.parse_args() - if options.buildStatsCsvFile == "": - raise Exception( - "Error, input argument --build-stats-csv-file must be set!") + options = clp.parse_args(sys.argv[1:]) if not os.path.exists(options.buildStatsCsvFile): raise Exception( "Error, file '"+options.buildStatsCsvFile+"' does not exist!") diff --git a/commonTools/build_stats/unit_tests/CMakeLists.txt b/commonTools/build_stats/unit_tests/CMakeLists.txt index 5fb76c2122c5..07aed0dabe54 100644 --- a/commonTools/build_stats/unit_tests/CMakeLists.txt +++ b/commonTools/build_stats/unit_tests/CMakeLists.txt @@ -6,3 +6,12 @@ TRIBITS_ADD_ADVANCED_TEST( summarize_build_stats_UnitTests PASS_REGULAR_EXPRESSION "OK" ALWAYS_FAIL_ON_NONZERO_RETURN ) + +TRIBITS_ADD_ADVANCED_TEST( gather_build_stats_UnitTests + OVERALL_WORKING_DIRECTORY TEST_NAME + OVERALL_NUM_MPI_PROCS 1 + TEST_0 CMND ${PYTHON_EXECUTABLE} + ARGS ${CMAKE_CURRENT_SOURCE_DIR}/gather_build_stats_UnitTests.py -v + PASS_REGULAR_EXPRESSION "OK" + ALWAYS_FAIL_ON_NONZERO_RETURN + ) diff --git a/commonTools/build_stats/unit_tests/bad_timing_build_stats_files/target1.timing.bad_type_filesize b/commonTools/build_stats/unit_tests/bad_timing_build_stats_files/target1.timing.bad_type_filesize new file mode 100755 index 000000000000..b9d9dabe1294 --- /dev/null +++ b/commonTools/build_stats/unit_tests/bad_timing_build_stats_files/target1.timing.bad_type_filesize @@ -0,0 +1,2 @@ +max_resident_size_Kb,elapsed_real_time_sec,num_involuntary_context_switch,FileName,FileSize,num_filesystem_outputs +240000,3.5,46,./some/base/dir/target1.o,bad size type,20368 diff --git a/commonTools/build_stats/unit_tests/bad_timing_build_stats_files/target1.timing.empty b/commonTools/build_stats/unit_tests/bad_timing_build_stats_files/target1.timing.empty new file mode 100755 index 000000000000..e69de29bb2d1 diff --git a/commonTools/build_stats/unit_tests/bad_timing_build_stats_files/target1.timing.junk b/commonTools/build_stats/unit_tests/bad_timing_build_stats_files/target1.timing.junk new file mode 100755 index 000000000000..b0f800bdd0dd --- /dev/null +++ b/commonTools/build_stats/unit_tests/bad_timing_build_stats_files/target1.timing.junk @@ -0,0 +1,3 @@ +blab junk, %$$, *$% +for this garbage +what? 
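Each of the bad *.timing fixtures above is rejected by a different check in readBuildStatsTimingFileIntoDict(): the empty and junk files fail the CSV read itself, while target1.timing.bad_type_filesize fails the type conversion for FileSize. A minimal standalone illustration of that last check, with the required names and types copied from getStdBuildStatsColsAndTypesList():

```python
requiredColsAndTypes = [
    ("max_resident_size_Kb", float),
    ("elapsed_real_time_sec", float),
    ("FileName", str),
    ("FileSize", float),
]

# The data row from target1.timing.bad_type_filesize above, as a dict.
badRow = {
    "max_resident_size_Kb": "240000",
    "elapsed_real_time_sec": "3.5",
    "num_involuntary_context_switch": "46",
    "FileName": "./some/base/dir/target1.o",
    "FileSize": "bad size type",
    "num_filesystem_outputs": "20368",
}

for name, conv in requiredColsAndTypes:
    if name not in badRow:
        print("ERROR: The required field '" + name + "' is missing!")
        break
    try:
        conv(badRow[name])
    except ValueError:
        print("ERROR: For field '" + name + "' the string value '"
              + badRow[name] + "' could not be converted!")
        break
# Prints: ERROR: For field 'FileSize' the string value 'bad size type'
# could not be converted!
```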
diff --git a/commonTools/build_stats/unit_tests/bad_timing_build_stats_files/target1.timing.missing_col_filename b/commonTools/build_stats/unit_tests/bad_timing_build_stats_files/target1.timing.missing_col_filename new file mode 100755 index 000000000000..34ca3ffd8051 --- /dev/null +++ b/commonTools/build_stats/unit_tests/bad_timing_build_stats_files/target1.timing.missing_col_filename @@ -0,0 +1,2 @@ +max_resident_size_Kb,elapsed_real_time_sec,num_involuntary_context_switch,FileSize,num_filesystem_outputs +240000,3.5,46,3300000,20368 diff --git a/commonTools/build_stats/unit_tests/bad_timing_build_stats_files/target1.timing.two_data_rows b/commonTools/build_stats/unit_tests/bad_timing_build_stats_files/target1.timing.two_data_rows new file mode 100755 index 000000000000..c7f68936ef64 --- /dev/null +++ b/commonTools/build_stats/unit_tests/bad_timing_build_stats_files/target1.timing.two_data_rows @@ -0,0 +1,3 @@ +max_resident_size_Kb,elapsed_real_time_sec,num_involuntary_context_switch,FileName,FileSize,num_filesystem_outputs +240000,3.5,46,./some/base/dir/target1.o,3300000,20368 +240000,3.5,46,./some/base/dir/target1.o,3300000,20368 diff --git a/commonTools/build_stats/unit_tests/dummy_build_dir/README.md b/commonTools/build_stats/unit_tests/dummy_build_dir/README.md new file mode 100644 index 000000000000..da30e241e514 --- /dev/null +++ b/commonTools/build_stats/unit_tests/dummy_build_dir/README.md @@ -0,0 +1,3 @@ +This directory contains some *.timing files to use to test +gather_build_stats.py. The *.timing files are in subdirs to be able to test +recursing into subdirs to find these files. diff --git a/commonTools/build_stats/unit_tests/dummy_build_dir/packages/pkga/src/target2.timing b/commonTools/build_stats/unit_tests/dummy_build_dir/packages/pkga/src/target2.timing new file mode 100755 index 000000000000..c2f19be5486b --- /dev/null +++ b/commonTools/build_stats/unit_tests/dummy_build_dir/packages/pkga/src/target2.timing @@ -0,0 +1,2 @@ +max_resident_size_Kb,cpu_sec_user_mode,elapsed_real_time_sec,FileName,FileSize +180000,1.38,1.5,packages/pkga/src/target2.lib,870000 diff --git a/commonTools/build_stats/unit_tests/dummy_build_dir/some/base/dir/target1.timing b/commonTools/build_stats/unit_tests/dummy_build_dir/some/base/dir/target1.timing new file mode 100755 index 000000000000..a00fbf4dc7dc --- /dev/null +++ b/commonTools/build_stats/unit_tests/dummy_build_dir/some/base/dir/target1.timing @@ -0,0 +1,2 @@ +max_resident_size_Kb,elapsed_real_time_sec,num_involuntary_context_switch,FileName,FileSize,num_filesystem_outputs +240000,3.5,46,./some/base/dir/target1.o,3300000,20368 diff --git a/commonTools/build_stats/unit_tests/dummy_build_dir/some/base/target3.timing b/commonTools/build_stats/unit_tests/dummy_build_dir/some/base/target3.timing new file mode 100755 index 000000000000..d9a2eaa22d0e --- /dev/null +++ b/commonTools/build_stats/unit_tests/dummy_build_dir/some/base/target3.timing @@ -0,0 +1 @@ +This file contains junk and will be discarded by gather_build_stats.py. 
\ No newline at end of file diff --git a/commonTools/build_stats/unit_tests/dummy_build_dir/target4.timing b/commonTools/build_stats/unit_tests/dummy_build_dir/target4.timing new file mode 100755 index 000000000000..9f224dd2d638 --- /dev/null +++ b/commonTools/build_stats/unit_tests/dummy_build_dir/target4.timing @@ -0,0 +1,2 @@ +max_resident_size_Kb,elapsed_real_time_sec,FileName,FileSize +2000,1.9,target4.o,260000 diff --git a/commonTools/build_stats/unit_tests/gather_build_stats_UnitTests.py b/commonTools/build_stats/unit_tests/gather_build_stats_UnitTests.py new file mode 100644 index 000000000000..1b8e95b02aa4 --- /dev/null +++ b/commonTools/build_stats/unit_tests/gather_build_stats_UnitTests.py @@ -0,0 +1,361 @@ +# @HEADER +# ************************************************************************ +# +# TriBITS: Tribal Build, Integrate, and Test System +# Copyright 2013 Sandia Corporation +# +# Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation, +# the U.S. Government retains certain rights in this software. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are +# met: +# +# 1. Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# +# 2. Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution. +# +# 3. Neither the name of the Corporation nor the names of the +# contributors may be used to endorse or promote products derived from +# this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY +# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE +# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +# LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +# NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +# SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+# +# ************************************************************************ +# @HEADER + +import os +import sys +import copy +import shutil +import unittest +import pprint + +thisScriptsDir = os.path.dirname(os.path.abspath(__file__)) +g_testBaseDir = thisScriptsDir +sys.path = [thisScriptsDir+"/.."] + sys.path +import gather_build_stats as GBS +import FindTribitsCiSupportDir +import GeneralScriptSupport as GSS + +g_pp = pprint.PrettyPrinter(indent=2) + +# Shared test data + +g_listOfDicts = [ + {'field1':'11', 'field2':'12', 'field4':'14'}, + {'field1':'21', 'field2':'22', 'field3':'23', 'field5':"25"}, + ] + + +############################################################################# +# +# Test gather_build_stats.readAllValidTimingFiles() +# +############################################################################# + + +class test_readAllValidTimingFiles(unittest.TestCase): + + def test_1(self): + baseDir = g_testBaseDir+"/dummy_build_dir" + allValidTimingFiles = GBS.readAllValidTimingFiles(baseDir, printErrMsg=False) + allValidTimingFiles_expected = [ + {'FileName': 'target4.o', + 'FileSize': '260000', + 'elapsed_real_time_sec': '1.9', + 'max_resident_size_Kb': '2000'}, + {'FileName': 'packages/pkga/src/target2.lib', + 'FileSize': '870000', + 'cpu_sec_user_mode': '1.38', + 'elapsed_real_time_sec': '1.5', + 'max_resident_size_Kb': '180000'}, + {'FileName': 'some/base/dir/target1.o', + 'FileSize': '3300000', + 'elapsed_real_time_sec': '3.5', + 'max_resident_size_Kb': '240000', + 'num_filesystem_outputs': '20368', + 'num_involuntary_context_switch': '46'}] + # NOTE: The bad timign file 'some/base/target3.timing' was gracefully + # skipped! + allValidTimingFiles.sort(key=lambda item: item.get('FileName')) # Avoid system-dependent behavior + allValidTimingFiles_expected.sort(key=lambda item: item.get('FileName')) + self.assertEqual(allValidTimingFiles, allValidTimingFiles_expected) + + +############################################################################# +# +# Test gather_build_stats.readBuildStatsTimingFileIntoDict() +# +############################################################################# + + +def readBuildStatsTimingFileIntoDictTest(testObj, buildStatsTimingFile, + numKeys_expected, buildStatsTimingDict_expected, errMsg_expected, + ): + (buildStatsTimingDict, errMsg) = GBS.readBuildStatsTimingFileIntoDict( + buildStatsTimingFile) + testObj.assertEqual(errMsg, errMsg_expected) + if numKeys_expected > 0: + testObj.assertEqual(len(buildStatsTimingDict.keys()), numKeys_expected) + testObj.assertEqual(buildStatsTimingDict, buildStatsTimingDict_expected) + + +class test_readBuildStatsTimingFileIntoDict(unittest.TestCase): + + def test_correct(self): + buildStatsTimingFile = \ + g_testBaseDir+"/dummy_build_dir/some/base/dir/target1.timing" + numKeys_expected = 6 + buildStatsTimingDict_expected = { + 'FileName': 'some/base/dir/target1.o', + 'FileSize': '3300000', + 'elapsed_real_time_sec': '3.5', + 'max_resident_size_Kb': '240000', + 'num_filesystem_outputs': '20368', + 'num_involuntary_context_switch': '46', + } + errMsg_expected = "" + readBuildStatsTimingFileIntoDictTest(self, buildStatsTimingFile, + numKeys_expected, buildStatsTimingDict_expected, errMsg_expected) + + def test_missing_fail(self): + buildStatsTimingFile = \ + g_testBaseDir+"/file_does_not_exist.timing" + numKeys_expected = 0 + buildStatsTimingDict_expected = None + errMsg_expected = buildStatsTimingFile+": ERROR: File does not exist!" 
+ readBuildStatsTimingFileIntoDictTest(self, buildStatsTimingFile, + numKeys_expected, buildStatsTimingDict_expected, errMsg_expected) + + def test_two_data_rows_fail(self): + buildStatsTimingFile = \ + g_testBaseDir+"/bad_timing_build_stats_files/target1.timing.two_data_rows" + numKeys_expected = 0 + buildStatsTimingDict_expected = None + errMsg_expected = buildStatsTimingFile+": ERROR: Contains 2 != 1 data rows!" + readBuildStatsTimingFileIntoDictTest(self, buildStatsTimingFile, + numKeys_expected, buildStatsTimingDict_expected, errMsg_expected) + + def test_empty_fail(self): + buildStatsTimingFile = \ + g_testBaseDir+"/bad_timing_build_stats_files/target1.timing.empty" + numKeys_expected = 0 + buildStatsTimingDict_expected = None + errMsg_expected = buildStatsTimingFile+": ERROR: File is empty!" + readBuildStatsTimingFileIntoDictTest(self, buildStatsTimingFile, + numKeys_expected, buildStatsTimingDict_expected, errMsg_expected) + + def test_junk_fail(self): + buildStatsTimingFile = \ + g_testBaseDir+"/bad_timing_build_stats_files/target1.timing.junk" + numKeys_expected = 0 + buildStatsTimingDict_expected = None + errMsg_expected = buildStatsTimingFile+": ERROR: Error, for CSV file"+\ + " '"+buildStatsTimingFile+"' the data row 0 ['for this garbage'] has 1 entries"+\ + " which does not macth the number of column headers 3!" + readBuildStatsTimingFileIntoDictTest(self, buildStatsTimingFile, + numKeys_expected, buildStatsTimingDict_expected, errMsg_expected) + # NOTE: The above test is very much tied to the implementation of + # readCsvFileIntoListOfDicts() for the error message it puts out. That is + # very + + def test_missing_col_filename_fail(self): + buildStatsTimingFile = \ + g_testBaseDir+"/bad_timing_build_stats_files/target1.timing.missing_col_filename" + numKeys_expected = 0 + buildStatsTimingDict_expected = None + errMsg_expected = \ + buildStatsTimingFile+": ERROR: The required field 'FileName' is missing!" + readBuildStatsTimingFileIntoDictTest(self, buildStatsTimingFile, + numKeys_expected, buildStatsTimingDict_expected, errMsg_expected) + + def test_bad_type_filesize_fail(self): + buildStatsTimingFile = \ + g_testBaseDir+"/bad_timing_build_stats_files/target1.timing.bad_type_filesize" + numKeys_expected = 0 + buildStatsTimingDict_expected = None + errMsg_expected = \ + buildStatsTimingFile+": ERROR: For field 'FileSize' the string value"+\ + " 'bad size type' could not be converted to the expected type 'float'!" 
+ readBuildStatsTimingFileIntoDictTest(self, buildStatsTimingFile, + numKeys_expected, buildStatsTimingDict_expected, errMsg_expected) + + +############################################################################# +# +# Test gather_build_stats.writeDictOfListsToCsvFile() +# +############################################################################# + +class test_writeDictOfListsToCsvFile(unittest.TestCase): + + def test_1(self): + dictOfLists = GBS.getDictOfListsFromListOfDicts(g_listOfDicts) + csvFile = "test_writeDictOfListsToCsvFile_build_stats.csv" + csvFileText_expected = \ + "field1,field2,field3,field4,field5\n11,12,,14,\n21,22,23,,25\n" + GBS.writeDictOfListsToCsvFile(dictOfLists, csvFile) + with open(csvFile, 'r') as csvFileHandle: + csvFileText = csvFileHandle.read() + self.assertEqual(csvFileText, csvFileText_expected) + + +############################################################################# +# +# Test gather_build_stats.getListOfAllTimingFiles() +# +############################################################################# + + +class test_getListOfAllTimingFiles(unittest.TestCase): + + def test_1(self): + baseDir = g_testBaseDir+"/dummy_build_dir" + listOfAllTimingFiles = GBS.getListOfAllTimingFiles(baseDir) + listOfAllTimingFiles_expected = [ + 'packages/pkga/src/target2.timing', + 'some/base/dir/target1.timing', + 'some/base/target3.timing', + 'target4.timing', + ] + listOfAllTimingFiles.sort() # Avoid system-dependent behavior + listOfAllTimingFiles_expected.sort() + self.assertEqual(listOfAllTimingFiles, listOfAllTimingFiles_expected) + + +############################################################################# +# +# Test gather_build_stats.getDictOfListsFromListOfDicts() +# +############################################################################# + +class test_getDictOfListsFromListOfDicts(unittest.TestCase): + + def test_1(self): + dictOfLists = GBS.getDictOfListsFromListOfDicts(g_listOfDicts) + dictOfLists_expected = { + 'field1': ['11', '21'], + 'field2': ['12', '22'], + 'field3': ['', '23'], + 'field4': ['14', ''], + 'field5': ['', '25'], + } + self.assertEqual(dictOfLists, dictOfLists_expected) + + +############################################################################# +# +# Test gather_build_stats.getSupersetOfFieldNamesList() +# +############################################################################# + +class test_getSupersetOfFieldNamesList(unittest.TestCase): + + def test_1(self): + supersetOfFieldNamesList = GBS.getSupersetOfFieldNamesList(g_listOfDicts) + supersetOfFieldNamesList_expected = \ + ['field1', 'field2', 'field3', 'field4', 'field5'] + supersetOfFieldNamesList.sort() # Make system independent + supersetOfFieldNamesList_expected.sort() + self.assertEqual(supersetOfFieldNamesList, supersetOfFieldNamesList_expected) + + + +############################################################################# +# +# Test gather_build_stats.py +# +############################################################################# + + +csvFileText_expected = \ + "FileName,FileSize,cpu_sec_user_mode,elapsed_real_time_sec,max_resident_size_Kb,num_filesystem_outputs,num_involuntary_context_switch\n"+\ + "target4.o,260000,,1.9,2000,,\n"+\ + "some/base/dir/target1.o,3300000,,3.5,240000,20368,46\n"+\ + "packages/pkga/src/target2.lib,870000,1.38,1.5,180000,,\n" + + +def gather_build_stats_py_expected_output(csvFile): + return \ + "Reading all *.timing files from under '"+g_testBaseDir+"/dummy_build_dir' ...\n"+\ + "Number of *.timing 
files found = 4\n"+\ + g_testBaseDir+"/dummy_build_dir/some/base/target3.timing: ERROR: Contains 0 != 1 data rows!\n"+\ + "Number of valid *.timing files found = 3\n"+\ + "Combined build-stats keys sorted:\n"+\ + " ['FileName', 'FileSize', 'cpu_sec_user_mode', 'elapsed_real_time_sec', 'max_resident_size_Kb', 'num_filesystem_outputs', 'num_involuntary_context_switch']\n"+\ + "Wrote file '"+csvFile+"'\n" + + +def sortCsvFileTextList(csvFileText): + csvFileTextList_orig = csvFileText.split('\n') + csvFileTextList = [] + csvFileTextList.append(csvFileTextList_orig[0]) # Headers + csvFileTextList.extend(sorted(csvFileTextList_orig[1:])) # Rows + return csvFileTextList + + +def test_gather_build_stats_py_body(testObj, csvFile, cmnd, silentStdout=False): + output = GSS.getCmndOutput(cmnd) + #print("output:\n"+output) + with open(csvFile, 'r') as csvFileHandle: + csvFileText = csvFileHandle.read() + testObj.assertEqual( + sortCsvFileTextList(csvFileText), + sortCsvFileTextList(csvFileText_expected)) + if not silentStdout: + testObj.assertEqual( + output.split('\n'), + gather_build_stats_py_expected_output(csvFile).split('\n')) + + +class test_gather_build_stats_py(unittest.TestCase): + + def test_help(self): + cmnd = thisScriptsDir+"/../gather_build_stats.py --help" + output = GSS.getCmndOutput(cmnd) + #print("output:\n"+output+"\n") + self.assertTrue(output.find("Gather up build stats from *.timing CSV files")!=-1) + self.assertTrue(output.find("max_resident_size_Kb : float")!=-1) + self.assertTrue(output.find("FileName : string")!=-1) + self.assertTrue(output.find("The column headers in all of the *.timing files are combined")!=-1) + + def test_default_out_file(self): + csvFile = "build_stats.csv" + cmnd = thisScriptsDir+"/../gather_build_stats.py"+\ + " -d "+g_testBaseDir+"/dummy_build_dir" + test_gather_build_stats_py_body(self, csvFile, cmnd, silentStdout=True) + + def test_default_out_file_verbose(self): + csvFile = "build_stats.csv" + cmnd = thisScriptsDir+"/../gather_build_stats.py -v"+\ + " -d "+g_testBaseDir+"/dummy_build_dir" + test_gather_build_stats_py_body(self, csvFile, cmnd) + + def test_explicit_out_file_verbose(self): + csvFile = "test_gather_build_stats_py_build_stats.csv" + cmnd = thisScriptsDir+"/../gather_build_stats.py -v"+\ + " -d "+g_testBaseDir+"/dummy_build_dir "+csvFile + test_gather_build_stats_py_body(self, csvFile, cmnd) + + +# +# Run the unit tests! 
+# + +if __name__ == '__main__': + + unittest.main() diff --git a/commonTools/build_stats/unit_tests/summarize_build_stats_UnitTests.py b/commonTools/build_stats/unit_tests/summarize_build_stats_UnitTests.py index 619cd40871e4..b2bc6314326b 100644 --- a/commonTools/build_stats/unit_tests/summarize_build_stats_UnitTests.py +++ b/commonTools/build_stats/unit_tests/summarize_build_stats_UnitTests.py @@ -697,14 +697,14 @@ class test_summarize_build_stats_py(unittest.TestCase): def test_big_small_full_project(self): cmnd = thisScriptsDir+"/../summarize_build_stats.py"+\ - " --build-stats-csv-file="+g_testBaseDir+"/build_stats.big.small.csv" + " "+g_testBaseDir+"/build_stats.big.small.csv" output = GSS.getCmndOutput(cmnd) self.assertEqual(GSS.s(output), GSS.s(big_small_summary_full_project_ascii+"\n")) def test_big_small_by_subdir(self): cmnd = thisScriptsDir+"/../summarize_build_stats.py"+\ - " --build-stats-csv-file="+g_testBaseDir+"/build_stats.big.small.csv"+\ - " --bin-by-subdirs-under-dirs=commonTools,packages" + " --bin-by-subdirs-under-dirs=commonTools,packages"+\ + " "+g_testBaseDir+"/build_stats.big.small.csv" output = GSS.getCmndOutput(cmnd) self.assertEqual(GSS.s(output), GSS.s(big_small_summary_ascii+"\n")) diff --git a/commonTools/build_stats/wrapper/NMParser.py b/commonTools/build_stats/wrapper/NMParser.py index 7e00c33549e4..c178abfd3e30 100644 --- a/commonTools/build_stats/wrapper/NMParser.py +++ b/commonTools/build_stats/wrapper/NMParser.py @@ -6,6 +6,7 @@ import subprocess # spawning nm import re # re matching import os # line seperator +import sys from Python2and3 import b, s @@ -54,10 +55,19 @@ def parse_object(filename): The keys are obtained from nm_option_desc_map and enforced inside the regex used See nm_re_type_expr, nm_re_str, and nm_re in the static fields of this class """ + FNULL = None + if sys.version_info < (3,): + FNULL = open(os.devnull, 'w') + local_devnull = FNULL + else: + local_devnull = subprocess.DEVNULL p = subprocess.Popen(['nm', '-aS', filename], - stdout=subprocess.PIPE) + stdout=subprocess.PIPE, + stderr=local_devnull) output = p.communicate()[0] + if FNULL: FNULL.close() + nm_counts = dict() for line in output.split(b(os.linesep)): diff --git a/commonTools/build_stats/wrapper/WrapperCommandLineParser.py b/commonTools/build_stats/wrapper/WrapperCommandLineParser.py index a8c0a1ae1119..08551dd2f030 100644 --- a/commonTools/build_stats/wrapper/WrapperCommandLineParser.py +++ b/commonTools/build_stats/wrapper/WrapperCommandLineParser.py @@ -14,18 +14,112 @@ def __init__(self, cmdline_args): self.op_output_file = '' # if we perform an operation this is it self.op = '' + self.short_op = '' # whether to gather and print a csv_banner self.print_csv_banner = False # whatever the op's args should be self.op_args = [] + # a list of lists of commands to evaluate + self.commands = [] + # whether we have the output arg + self.have_output_arg = False + # ENV control variables + self.parse_nm = True + self.output_fields = None + + self.time_cmd = 'not_set' + # we must parse envs first, because they contain required parameters + self.parse_env_controls() + # finally parse the args self.parse_cmdline_args(cmdline_args) + def parse_env_controls(self): + """Parse control variables from the ENV (rather than command line) + + # REQUIRED + TRILINOS_BUILD_STATS_TIME_CMD points to a valid GNU Time executable + + # REQUIRED + TRILINOS_BUILD_STATS_INNER_OP: is the command we are wrapping + + # REQUIRED + TRILINOS_BUILD_STATS_BASE_DIR : We need to know the `root` of the build 
+ tree so we annotate paths correctly (see github + PR 8638 for an issue with Makefile builds) + # OPTIONAL + TRILINOS_BUILD_STATS_OUTPUT_FIELDS : control what gets written to timing + files Can enable only some fields + e.g., + FileName,FileSize,op + + + """ + # optional, control which fields we write to a file + # This does not promise we will not parse all possible fields + # (That is to say, this does not promise any performance benefits) + self.output_fields = os.environ.get('TRILINOS_BUILD_STATS_OUTPUT_FIELDS') + + err_msg='' + # required : TRILINOS_BUILD_STATS_TIME_CMD + # TRILINOS_BUILD_STATS_INNER_OP + # TRILINOS_BUILD_STATS_BASE_DIR + if 'TRILINOS_BUILD_STATS_TIME_CMD' not in os.environ: + err_msg+=os.linesep + err_msg+=('TRILINOS_BUILD_STATS_TIME_CMD (ENV) is required. CMake should ' + '`find` and set this, if using the build tools manually, locate ' + 'GNU Time (typically /usr/bin/time) verify it supports `--format` ' + 'and `--output`. Then set TRILINOS_BUILD_STATS_TIME_CMD=/path/to/time') + + if 'TRILINOS_BUILD_STATS_INNER_OP' not in os.environ: + err_msg+=os.linesep + err_msg+=('TRILINOS_BUILD_STATS_INNER_OP (ENV) is required. CMake should ' + 'set this to a specific operations, e.g., ${CMAKE_C_COMPILER}. ' + 'If you are using the tools independently, please see the docs ' + 'for examples of how to write the wrapper scripts. E.g., ' + 'export TRILINOS_BUILD_STATS_INNER_OP=mpicc') + + if 'TRILINOS_BUILD_STATS_BASE_DIR' not in os.environ: + err_msg+=os.linesep + err_msg+=('TRILINOS_BUILD_STATS_BASE_DIR (ENV) is required. CMake should ' + 'set this to the build directory (top level). If using this script ' + 'manually, set this to your build directory (full path). E.g., ' + 'export TRILINOS_BUILD_STATS_BASE_DIR=/path/to/build') + + if err_msg: + sys.stderr.write(err_msg) + exit(-1) + + # set the required parameters - the dict will throw if these aren't defined + # but we should have exit() if any errors. + self.time_cmd = os.environ['TRILINOS_BUILD_STATS_TIME_CMD'] + self.op = os.environ['TRILINOS_BUILD_STATS_INNER_OP'] + self.base_build_dir = os.environ['TRILINOS_BUILD_STATS_BASE_DIR'] + + # we name the output as: blah.o.op.timing + # this will result in blah.ar.timing, blah.mpicc.timing blah.ld.timing... + self.short_op = os.path.basename(self.op) + self.output_stats_file = self.short_op + '.timing' + + parse_nm = os.environ.get('TRILINOS_BUILD_STATS_PARSE_NM', "True") + if parse_nm.lower() == 'true': + self.parse_nm = True + elif parse_nm.lower() == 'false': + self.parse_nm = False + else: + msg='ERROR: TRILINOS_BUILD_STATS_PARSE_NM is set to [{}]'.format(parse_nm) + msg+=', but valid values are True or False. 
Defaulting to True{}'.format(os.linesep) + sys.stderr.write(msg); + self.parse_nm = True + def __repr__(self): return self.lcl_print() def __str__(self): return self.lcl_print() + def generate_stats(self): + return self.have_output_arg + def lcl_print(self): fmt_string = [ 'output_stats_file : {output_stats_file}', @@ -40,46 +134,92 @@ def lcl_print(self): op=self.op, print_csv_banner=self.print_csv_banner) + def get_output_fields(self,csv_map): + if self.output_fields: + # this assumes it is a string of comma separated labels + fields = self.output_fields.split(',') + else: + # apply sort here, so the output will be deterministic + fields = sorted([ k for k in csv_map ]) + + return fields + + def generate_commandlets(self, cmdline_args): + + # it seems we need to handle compound commands e.g., && (maybe ||) + cmdlet = [] + for arg in cmdline_args: + if arg.strip() == "&&": + # add the command + self.commands.append(cmdlet) + # start a new command + cmdlet = [] + elif arg.strip() != '': + cmdlet.append(arg) + + if cmdlet: + self.commands.append(cmdlet) + # post - should have all commands broken up into lists of lists (of options) + return + + def parse_cmdline_arg_helper(self, cmdline_args): + + self.have_output_arg = False + # we want to do something different for ar, ranlib, or ld.* + # these commands do not necessarily have a good 'output' arg denoted by -o + # first try to find -o, if that passes then use it. + # if not, then do something special based on ar/ranlib/ld.* + + # find the output arg (will raise an exception if not found) + # we use -o blah.o or -o /path/to/blah.o or none at all + try: + output_idx = cmdline_args.index('-o') + self.op_output_file = cmdline_args[output_idx+1] + self.output_stats_file = self.op_output_file + '.' + self.output_stats_file + + self.have_output_arg = True + return + + except: + pass + + # we failed -o, so try op specific stuff + if self.short_op.endswith('ar') or self.short_op.endswith('ranlib'): + for arg in cmdline_args: + if arg.endswith('.a'): + self.op_output_file = arg + self.output_stats_file = arg + '.' + self.output_stats_file + self.have_output_arg = True + return + # we hit this if we can't find a .a + return + def parse_cmdline_args(self, cmdline_args): - base_build_dir_arg_prefix = '----base-build-dir=' wrapper_header_arg = '----get_header' - wrapper_op_arg_prefix = '----op=' print_csv_header=False - have_op=False # require that any wrapper arg be the first try: wrapper_arg_idx = 1 wrapper_arg = cmdline_args[wrapper_arg_idx] - if wrapper_arg.startswith(base_build_dir_arg_prefix): - self.base_build_dir = wrapper_arg.split('=', 1)[1] - wrapper_arg_idx += 1 - wrapper_arg = cmdline_args[wrapper_arg_idx] if wrapper_arg == wrapper_header_arg: self.print_csv_banner=True - elif wrapper_arg.startswith(wrapper_op_arg_prefix): - self.op = wrapper_arg.split('=', 1)[1] - # find the output arg (will raise an exception if not found) - # we use -o blah.o or -o /path/to/blah.o or none at all - # we name the output as: blah.o.op.timing - # this will result in blah.ar.timing, blah.mpicc.timing blah.ld.timing... - short_op = os.path.basename(self.op) - output_stats_file_suffix = short_op + '.timing' - try: - output_idx = cmdline_args.index('-o') - self.op_output_file = cmdline_args[output_idx+1] - self.output_stats_file = self.op_output_file + '.' + output_stats_file_suffix - # ToDo: The above needs to be made to work for ar as well! 
- except: - pass - - else: - raise Exception('unparseable arguments') - - # Remove the first wrapper_arg_idx+1 args (script name + wrapper args) - self.op_args = cmdline_args[wrapper_arg_idx+1:] + # this isn't implemented.... + sys.stderr.write('----get_header was requested, but is not implemented' + '. Doing nothing.') + exit(0) - except: + self.parse_cmdline_arg_helper(cmdline_args) + + # Remove the script name + self.op_args = cmdline_args[1:] + # we could clean this whole thing up some.. + self.generate_commandlets([self.op] + self.op_args) + + except Exception as e: + print("Got an error parsing the command line in the compiler wrapper python script") + print(e) + raise # any error and we give up help_msg = ["Compiler wrapper:", " Usage: wrapper [---base-build-dir=] ----op= [args] | ----get_header", @@ -95,5 +235,6 @@ def parse_cmdline_args(self, cmdline_args): " statistics will be written to .timing", ] print('\n'.join(help_msg)) + #raise sys.exit(0) diff --git a/commonTools/build_stats/wrapper/WrapperOpTimer.py b/commonTools/build_stats/wrapper/WrapperOpTimer.py index 86ba025dc405..a1f5502ffd53 100644 --- a/commonTools/build_stats/wrapper/WrapperOpTimer.py +++ b/commonTools/build_stats/wrapper/WrapperOpTimer.py @@ -1,11 +1,13 @@ import subprocess import csv import os +from WrapperCommandLineParser import WrapperCommandLineParser -# -# Data for this WrapperOpTimer module -# +def get_full_header(fields_list,full_header_map): + return ','.join([ full_header_map[f] for f in fields_list ]) + +# the values are usr_bin_time_csv_map = { "E": "elapsed_real_time_fmt", @@ -126,47 +128,48 @@ "x", ] -field_header_full = \ - ','.join([ usr_bin_time_csv_map[f] \ - for f in default_fields ]) +field_header_full = get_full_header(default_fields, usr_bin_time_csv_map) #','.join([ WrapperOpTimer.usr_bin_time_csv_map[f] for f in default_fields ]) field_header_short = ','.join(default_fields) -field_arg = '--format=' + field_header_full + '\n' + \ - ','.join([ '%{}'.format(f) for f in default_fields] ) - - -# -# Class WrapperOpTimer -# +field_arg = '--format=' + field_header_full + '\n' + ','.join([ '%{}'.format(f) for f in default_fields] ) class WrapperOpTimer: @staticmethod - def time_op(op, - op_output_file, - output_stats_file, - op_args, - base_build_dir=None): - """ - evaluate 'op' with 'op_args', and gather stats into output_stats_file - """ - cmd = [ - '/usr/bin/time', - # '--append', - '--output=' + output_stats_file, - field_arg, - op ] + op_args - - # print(' '.join(cmd)) + def run_cmd(cmd): p = subprocess.Popen(cmd) p.communicate() returncode = p.returncode + return returncode + + @staticmethod + def time_op(wcp): + """ + evaluate 'op' with 'op_args', and gather stats into output_stats_file + """ + # if os.path.exists(output_stats_file) and os.path.getsize(output_stats_file) > 0: + # print("WARNING: File '"+output_stats_file+"' exists and will be overwritten") + # print("op='"+op+"'") + # print("op_args='"+str(op_args)+"'") + # print("op_output_file='"+op_output_file+"'") # initializing the titles and rows list fields = [] csv_row = {} + cmdcount = 0 + returncode = 0 + for cmd in wcp.commands: + if cmdcount == 0: + cmd = [ wcp.time_cmd, + # '--append', + '--output=' + wcp.output_stats_file, + field_arg, + ] + cmd + cmdcount += 1 + returncode |= WrapperOpTimer.run_cmd(cmd) + # reading csv file - with open(output_stats_file, 'r') as csvfile: + with open(wcp.output_stats_file, 'r') as csvfile: # creating a csv reader object csvreader = csv.reader(csvfile) @@ -174,28 +177,39 @@ def time_op(op, 
fields = next(csvreader) # extracting each data row one by one - # we effectively retain on the last row. + # we effectively retain only the last row. # it isn't clear if we should expect multiple rows per file + # + # In the bash version of this I was able to handle multiple rows per file + # We could do that here, but it would require returning a list of csv maps + # On the system side of things, it is very murky. We would need to ensure + # file integrity (concurrent reads/writes). For now, it's + # best to enforce 1 file per operation performed. (which should happen if we + # name things correctly) - That is invalid is there is a cycle in the Build graph, + # but that is a larger problem. for row in csvreader: csv_row = dict(zip(fields, row)) # FileSize - csv_row['FileSize'] = WrapperOpTimer.get_file_size(op_output_file) + csv_row['FileSize'] = WrapperOpTimer.get_file_size(wcp.op_output_file) + + # add a field with the short op + csv_row['op'] = os.path.basename(wcp.op) # FileName - if base_build_dir: - abs_base_build_dir = os.path.abspath(base_build_dir) + if wcp.base_build_dir: + abs_base_build_dir = os.path.abspath(wcp.base_build_dir) current_working_dir = os.path.abspath(os.getcwd()) rel_path_to_base_build_dir = os.path.relpath( current_working_dir, start=abs_base_build_dir) - rel_op_output_file = os.path.join(rel_path_to_base_build_dir, op_output_file) + rel_op_output_file = os.path.join(rel_path_to_base_build_dir, wcp.op_output_file) else: - rel_op_output_file = op_output_file + rel_op_output_file = wcp.op_output_file csv_row['FileName'] = rel_op_output_file # Remove the build stats output file if the build failed - if returncode != 0 and os.path.exists(output_stats_file): - os.remove(output_stats_file) + if returncode != 0 and os.path.exists(wcp.output_stats_file): + os.remove(wcp.output_stats_file) return (csv_row, returncode) diff --git a/commonTools/build_stats/wrapper/magic_wrapper.py b/commonTools/build_stats/wrapper/magic_wrapper.py index fec04c14c355..803ec3208b62 100755 --- a/commonTools/build_stats/wrapper/magic_wrapper.py +++ b/commonTools/build_stats/wrapper/magic_wrapper.py @@ -1,4 +1,4 @@ -#!/usr/bin/env python +#!/usr/bin/env python3 ''' Note: @@ -17,11 +17,14 @@ from WrapperOpTimer import WrapperOpTimer # given a dict of key/val pairs, write them as a CSV line -def write_csv_map(filename,csv_map): +def write_csv_map(filename,csv_map,csv_fields): try: with open(filename, 'w') as csvfile: writer = csv.DictWriter(csvfile, - fieldnames=[ k for k in csv_map ]) + fieldnames=csv_fields, + # ignore fields in the csv_map that aren't + # in fieldnames + extrasaction='ignore') writer.writeheader() writer.writerow(csv_map) except IOError: @@ -38,28 +41,38 @@ def main(cmdline_args): # keep a dict of field : value # first do the operation # this must be first, as it generates the output file - (csv_map, returncode) = \ - WrapperOpTimer.time_op( - base_build_dir=wcp.base_build_dir, - op=wcp.op, - op_output_file=wcp.op_output_file, - output_stats_file=wcp.output_stats_file, - op_args=wcp.op_args) + # + # WARNING: Be very careful with stdout before these commands. If the wrapped command + # has shell redirection it can slurp up Python's output... best to require all messages + # go after the compiler commnand has completed. 
+ if wcp.generate_stats(): + (csv_map, returncode) = WrapperOpTimer.time_op(wcp) + #print("======> Gathering stats...", file=sys.stdout) + else: + # only run the command and return the return code + returncode = 0 + for cmd in wcp.commands: + returncode |= WrapperOpTimer.run_cmd(cmd) + #print("##======> NO stats {}".format(wcp.op_output_file), file=sys.stdout) + return returncode if returncode == 0: - # test nm - # we probably need some to handle the case the .o isn't created - # as-us, the following will return empty dicts (we parse/validate) the output - # from NM, so we won't return garbage - nmp = NMParser.parse_object(wcp.op_output_file) - # NMParser.print_counts(nmp) - # add NM's output to our csv data - # we could move this into the NM parser - csv_map.update(NMParser.get_csv_map(nmp)) + if wcp.parse_nm: + # test nm + # we probably need some way to handle the case the .o isn't created + # as-is, the following will return empty dicts (we parse/validate) the output + # from NM, so we won't return garbage + nmp = NMParser.parse_object(wcp.op_output_file) + # NMParser.print_counts(nmp) + # add NM's output to our csv data + # we could move this into the NM parser + csv_map.update(NMParser.get_csv_map(nmp)) # ultimately, print the csv data to a file # make sure to quote csv columns - write_csv_map(wcp.output_stats_file, csv_map) + write_csv_map(wcp.output_stats_file, + csv_map, + csv_fields=wcp.get_output_fields(csv_map)) # NOTE: Above, we don't write the *.timing file if the build failed because # the output target file may not exist! And we don't want a CSV file entry diff --git a/doc/build_ref/TrilinosBuildReferenceTemplate.rst b/doc/build_ref/TrilinosBuildReferenceTemplate.rst index 064ac46baf55..8592acdd4000 100644 --- a/doc/build_ref/TrilinosBuildReferenceTemplate.rst +++ b/doc/build_ref/TrilinosBuildReferenceTemplate.rst @@ -272,23 +272,25 @@ needs to be rebuilt). The solution is to switch from the default ``Unix Makefiles`` generator to the ``Ninja`` generator (see `Enabling support for Ninja`_). + Enabling and viewing build statistics ------------------------------------- The Trilinos project has portable built-in support for generating and -reporting build statistics such the high-watermark for RAM, the wall clock -time, and many other statistics used to build each and every object file, -library, and executable target. To enable support for this, configure with:: +reporting build statistics such as the high-watermark for RAM, wall clock time, file +size, and many other statistics used to build each and every object file, +library, and executable target in the project (and report that information to +CDash). To enable support for these build statistics, configure with:: -D Trilinos_ENABLE_BUILD_STATS=ON \ This will do the following: -* Generate compiler wrappers ``build_stats__wrapper.sh`` for C, C++, and - Fortran in the build tree that will compute statics as a byproduct for every - usage of each compiler. (The compiler wrappers create a file - ``.timing`` for every generated object, library and executable - ```` file.) +* Generate wrappers ``build_stats__wrapper.sh`` for C, C++, and Fortran + (and for static builds also ``ar``, ``ranlib`` and ``ld``) in the build + tree that will compute statistics as a byproduct of every invocation of these + commands. (The wrappers create a file ``.timing`` for every + generated object, library and executable ```` file.)
* Define a build target called ``generate-build-stats`` that when run will gather up all of the generated build statistics into a single CSV file @@ -296,19 +298,14 @@ This will do the following: the end of the ``ALL`` target so a raw ``make`` will automatically create an up-to-date ``build_stats.csv`` file.) -* Enable the package ``TrilinosBuildStats`` (and when +* By default, enable the package ``TrilinosBuildStats`` (and when ``-DTrilinos_ENABLE_TESTS=ON`` or ``-DTrilinosBuildStats_ENABLE_TESTS=ON`` are also set) will define the test ``TrilinosBuildStats_Results`` to summarize and report the build statistics. When run, this test calls the - tool ``summarize_build_stats.py`` in to report summary build stats to STDOUT - the test and will also upload the file ``build_stats.csv`` to CDash as using - the CTest property ``ATTACHED_FILES`` when submitting test results to CDash. - -* Set up to install the generated build-stats compiler wrappers and a helper - script ``gather_build_stats.sh`` into the ``/bin`` directory as part - of the default ``install`` target. (This allows customers to also use these - compiler wrappers to generate and gather up build stats for their own - project as well that depends on Trilinos.) + tools ``gather_build_stats.py`` and ``summarize_build_stats.py`` to gather + and report summary build stats to STDOUT and will also upload the file + ``build_stats.csv`` to CDash as using the CTest property ``ATTACHED_FILES`` + when submitting test results to CDash. The default for the cache variable ``Trilinos_ENABLE_BUILD_STATS`` is determined as follows: @@ -324,6 +321,9 @@ determined as follows: * Else, the default value is set to ``OFF``. +Otherwise, if ``Trilinos_ENABLE_BUILD_STATS`` is explicitly set in the cache +with ``-DTrilinos_ENABLE_BUILD_STATS=ON|OFF``, then that value will be used. + When the test ``TrilinosBuildStats_Results`` is run, it produces summary statistics to STDOUT like shown below:: @@ -356,8 +356,8 @@ where: given target measured in MB. * ``elapsed_real_time_sec`` is the wall clock time used to build a given target measured in seconds. -* ``file_size_mb`` is the produced file size of a given build target - (i.e. object file, library, or executable) measured in MB. +* ``file_size_mb`` is the file size of a given build target (i.e. object file, + library, or executable) measured in MB. * ``Full Project`` are the stats for all of the enabled Trilinos packages. * ```` are the build stats for the ```` subdirectory under the base directories ``commonTools`` and ``packages``. (These map to Trilinos @@ -370,7 +370,7 @@ compilers, platforms, and build configurations and even across the same builds over days, weeks, and months.) The generated ``build_stats.csv`` file contains many other types of useful -build stats as well but the above three are some of the more impact-full build +build stats as well but the above three are some of the more significant build statistics. To avoid situations where a full rebuild does not occur (e.g. any build target @@ -384,42 +384,47 @@ process and therefore will usually remove the file even of later configure operations fail. 
Finally, to make rebuilds more robust and to restrict build stats to only new -targets getting (re)built after an initial configure, configure with:: +targets getting (re)built after an initial configure, then configure with:: -D Trilinos_REMOVE_BUILD_STATS_TIMING_FILES_ON_FRESH_CONFIGURE=ON -The will remove **all** of the ``*.timing`` files under the base build +This will remove **all** of the ``*.timing`` files under the base build directory during a fresh configure (i.e. where the ``CMakeCache.txt`` file does not exist). But this will not remove ``*.timing`` files on reconfigures (i..e where a ``CMakeCache.txt`` file is preserved). Timing stats for targets that are already built and don't need to be rebuilt after the last fresh configure will not get reported. (But this can be useful for CI builds where one only wants to see build stats for the files updated in the last PR -iteration. Also, this will avoid problems with corrupted `.timing` -files that may already exist in the base repo.) +iteration. NOTES: +* The underlying compilers must already be specified in the cache variables + ``CMAKE_C_COMPILER``, ``CMAKE_CXX_COMPILER``, and ``CMAKE_Fortran_COMPILER`` + and not left up to CMake to determine. The best way to do that is, for + example ``-DCMAKE_C_COMPILER=$(which mpicc)`` on the ``cmake`` command-line. + +* The tool ``gather_build_stats.py`` is very robust and will discard data from + any invalid or corrupted ``*.timing`` files and can deal with ``*.timing`` + files with different sets and ordering of the data fields from different + versions of the build stats wrapper tool. (Therefore, one can keep + rebuilding in an existing build directory with old ``*.timing`` files + hanging around and never have to worry about being able to create an updated + ``build_stats.csv`` file.) + * The installed ``TrilinosConfig.cmake`` and ``Config.cmake`` files list the original underlying C, C++, and Fortran compilers, **not** the build stats compiler wrappers. -* If a customer wants to use the build stats compiler wrappers, then they can - just directly set ``CMAKE__COMPILER`` to the path of the installed - compiler wrappers ``/bin/build_stats__wrapper.sh`` for each - language ```` = C, CXX, and Fortran and also add - ``set(ENV{CMAKE_IS_IN_CONFIGURE_MODE} 1)`` to the beginning of their - project's top-level ``CMakeLists.txt`` file. - * The ``generate-build-stats`` target has dependencies on every object, library, and executable build target in the project so it will always only run after all of those targets are up to date. -* The file ``build_stats.csv`` can be downloaded off of CDash from the - ``TrilinosBuildStats_Results`` test results summary page. (The file is - downloaded as a compressed ``build_stats.csv.tgz`` file which will then need - to be uncompressed using ``tar -xzvf build_stats.csv.tgz`` before being - viewed.) +* After uploading the test results to CDash, the file ``build_stats.csv`` can + be downloaded off CDash from the ``TrilinosBuildStats_Results`` test results + details page. (The file is downloaded as a compressed + ``build_stats.csv.tgz`` file which will then need to be uncompressed using + ``tar -xzvf build_stats.csv.tgz`` before viewing.) * Any ``build_stats.csv`` file can be viewed and queried by uploading it to the site ``https://jjellio.github.io/build_stats/index.html``. 
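To put the documentation changes above in context, the end-to-end build-stats workflow they describe amounts to roughly the following shell session. The mpicxx/mpif90 compiler names, the ``-j`` level, and the ``$TRILINOS_SRC_DIR`` variable are illustrative assumptions, not something this patch prescribes::

  # Configure with build stats enabled (compilers given explicitly):
  cmake \
    -D Trilinos_ENABLE_BUILD_STATS=ON \
    -D CMAKE_C_COMPILER=$(which mpicc) \
    -D CMAKE_CXX_COMPILER=$(which mpicxx) \
    -D CMAKE_Fortran_COMPILER=$(which mpif90) \
    $TRILINOS_SRC_DIR

  # Build; each wrapped compile/archive/link writes a *.timing file, and the
  # generate-build-stats target (run at the end of ALL) gathers them into
  # build_stats.csv in the build tree.
  make -j16

  # Summarize the gathered stats (the CSV file is a positional argument):
  $TRILINOS_SRC_DIR/commonTools/build_stats/summarize_build_stats.py \
    --bin-by-subdirs-under-dirs=commonTools,packages \
    build_stats.csv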
diff --git a/packages/ifpack2/src/Ifpack2_Details_Amesos2Wrapper_def.hpp b/packages/ifpack2/src/Ifpack2_Details_Amesos2Wrapper_def.hpp index 963f61f3a46e..f0db0cb5f7cc 100644 --- a/packages/ifpack2/src/Ifpack2_Details_Amesos2Wrapper_def.hpp +++ b/packages/ifpack2/src/Ifpack2_Details_Amesos2Wrapper_def.hpp @@ -327,14 +327,14 @@ void Amesos2Wrapper::initialize () A_local->getColMap (), entriesPerRow())); // copy entries into A_local_crs - Teuchos::Array indices(A_local->getNodeMaxNumRowEntries()); - Teuchos::Array values(A_local->getNodeMaxNumRowEntries()); + typename crs_matrix_type::nonconst_local_inds_host_view_type indices("Indices",A_local->getNodeMaxNumRowEntries() ); + typename crs_matrix_type::nonconst_values_host_view_type values("Values", A_local->getNodeMaxNumRowEntries()); for(local_ordinal_type i = 0; i < numRows; i++) { size_t numEntries = 0; - A_local->getLocalRowCopy(i, indices(), values(), numEntries); + A_local->getLocalRowCopy(i, indices, values, numEntries); ArrayView indicesInsert(indices.data(), numEntries); - ArrayView valuesInsert(values.data(), numEntries); + ArrayView valuesInsert((const scalar_type*)values.data(), numEntries); A_local_crs_nc->insertLocalValues(i, indicesInsert, valuesInsert); } A_local_crs_nc->fillComplete (A_local->getDomainMap (), A_local->getRangeMap ()); diff --git a/packages/ifpack2/test/CMakeLists.txt b/packages/ifpack2/test/CMakeLists.txt index a455b67aaff6..7460f7d7669c 100644 --- a/packages/ifpack2/test/CMakeLists.txt +++ b/packages/ifpack2/test/CMakeLists.txt @@ -1,3 +1,5 @@ TRIBITS_ADD_TEST_DIRECTORIES(belos) -TRIBITS_ADD_TEST_DIRECTORIES(unit_tests) +IF ((NOT Tpetra_INST_LONG_DOUBLE) AND (NOT Tpetra_ENABLE_quadmath)) + TRIBITS_ADD_TEST_DIRECTORIES(unit_tests) +ENDIF() TRIBITS_ADD_TEST_DIRECTORIES(vanka) diff --git a/packages/ifpack2/test/belos/CMakeLists.txt b/packages/ifpack2/test/belos/CMakeLists.txt index 826abcb84912..815aacfcd02f 100644 --- a/packages/ifpack2/test/belos/CMakeLists.txt +++ b/packages/ifpack2/test/belos/CMakeLists.txt @@ -18,6 +18,20 @@ IF(Ifpack2_ENABLE_QD) ) ENDIF() +IF (Tpetra_INST_LONG_DOUBLE) + TRIBITS_ADD_EXECUTABLE( + tif_belos_longdouble + SOURCES belos_solve_longdouble.cpp + ) +ENDIF() + +IF (Tpetra_ENABLE_quadmath) + TRIBITS_ADD_EXECUTABLE( + tif_belos_float128 + SOURCES belos_solve_float128.cpp + ) +ENDIF() + IF (Tpetra_INST_DOUBLE) TRIBITS_ADD_EXECUTABLE( tif_tpetra_native @@ -108,6 +122,28 @@ TRIBITS_ADD_TEST( STANDARD_PASS_OUTPUT ) +IF (Tpetra_INST_LONG_DOUBLE) +TRIBITS_ADD_TEST( + tif_belos_longdouble + NAME gmres_belos_longdouble + ARGS "" + COMM serial mpi + NUM_MPI_PROCS 1 + STANDARD_PASS_OUTPUT +) +ENDIF() + +IF (Tpetra_ENABLE_quadmath) +TRIBITS_ADD_TEST( + tif_belos_float128 + NAME gmres_belos_float128 + ARGS "" + COMM serial mpi + NUM_MPI_PROCS 1 + STANDARD_PASS_OUTPUT +) +ENDIF() + TRIBITS_ADD_TEST( tif_belos NAME small_pseudoblkcg_belos diff --git a/packages/ifpack2/test/belos/belos_solve_float128.cpp b/packages/ifpack2/test/belos/belos_solve_float128.cpp new file mode 100644 index 000000000000..34c04e4764d3 --- /dev/null +++ b/packages/ifpack2/test/belos/belos_solve_float128.cpp @@ -0,0 +1,179 @@ +//@HEADER +// ************************************************************************ +// +// Ifpack2: Templated Object-Oriented Algebraic Preconditioner Package +// Copyright (2009) Sandia Corporation +// +// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation, +// the U.S. Government retains certain rights in this software. 
+// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// 1. Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// +// 2. Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. +// +// 3. Neither the name of the Corporation nor the names of the +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY +// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +// LIABILITY, WHETHER IN CONTRACT, RICT LIABILITY, OR TORT (INCLUDING +// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// +// Questions? Contact Michael A. Heroux (maherou@sandia.gov) +// +// ************************************************************************ +//@HEADER +// +// This driver constructs a simple tridiagonal matrix and constant RHS, +// and solves this system using the Belos Block GMRES method with a '__float128' +// ScalarType. +// +// NOTE: No preconditioner is used in this case. +// +// +#include "Teuchos_CommandLineProcessor.hpp" +#include "Teuchos_ParameterList.hpp" +#include "Teuchos_StandardCatchMacros.hpp" +#include "Teuchos_StandardCatchMacros.hpp" + +#include +#include "BelosLinearProblem.hpp" +#include "BelosBlockGmresSolMgr.hpp" +#include "BelosTypes.hpp" +#include "BelosConfigDefs.hpp" +#include "BelosTpetraAdapter.hpp" + +#include +#include + +#include "Tpetra_Map.hpp" +#include "Tpetra_CrsMatrix.hpp" +#include "Tpetra_Operator.hpp" +#include "Tpetra_MultiVector.hpp" +#include "Tpetra_Vector.hpp" +#include "Tpetra_Core.hpp" +#include "Tpetra_Details_DefaultTypes.hpp" + +#include "Teuchos_VerboseObject.hpp" + +#include + +using namespace Teuchos; + +int main(int argc, char *argv[]) { + // + typedef __float128 scalar_type; + typedef int LO; + typedef Tpetra::Map Tpetra_Map; + typedef Tpetra::CrsMatrix Tpetra_CrsMatrix; + typedef Tpetra::Vector Tpetra_Vector; + typedef Tpetra::Operator OP; + typedef Tpetra::MultiVector MV; + typedef Belos::LinearProblem problem_type; + + Teuchos::GlobalMPISession session(&argc, &argv, NULL); + + bool success = false; + + auto comm = Tpetra::getDefaultComm (); + + Teuchos::RCP + out = Teuchos::VerboseObjectBase::getDefaultOStream(); + + try { + // The number of rows and columns in the matrix. + const Tpetra::global_size_t numGblIndices = 50; + + // Construct a Map that puts approximately the same number of + // equations on each processor. + Teuchos::RCP map = Teuchos::rcp (new Tpetra_Map (numGblIndices, 0, comm)); + auto numMyElements = map->getNodeNumElements (); + + // Create a Tpetra sparse matrix whose rows have distribution + // given by the Map. 
We expect at most three entries per row. + Teuchos::RCP A (new Tpetra_CrsMatrix (map, 3)); + // Fill the sparse matrix, one row at a time. + const scalar_type two = static_cast (2.0); + const scalar_type negOne = static_cast (-1.0); + for (LO lclRow = 0; lclRow < static_cast (numMyElements); ++lclRow) { + const Tpetra::Details::DefaultTypes::global_ordinal_type gblRow = map->getGlobalElement (lclRow); + // A(0, 0:1) = [2, -1] + if (gblRow == 0) { + A->insertGlobalValues (gblRow, tuple (gblRow, gblRow + 1), + tuple<> (two, negOne)); + } + // A(N-1, N-2:N-1) = [-1, 2] + else if (static_cast (gblRow) == numGblIndices - 1) { + A->insertGlobalValues (gblRow, tuple (gblRow - 1, gblRow), + tuple<> (negOne, two)); + } + // A(i, i-1:i+1) = [-1, 2, -1] + else { + A->insertGlobalValues (gblRow, tuple (gblRow - 1, gblRow, gblRow + 1), + tuple<> (negOne, two, negOne)); + } + } + // Tell the sparse matrix that we are done adding entries to it. + A->fillComplete(); + + // Create b, the RHS vector + Teuchos::RCP b (new Tpetra_Vector (map, true)); + b->putScalar(1.0); + + //Allocate solution vector x + Teuchos::RCP x (new Tpetra_Vector (map, true)); + + //Create linear problem + Teuchos::RCP my_problem (new problem_type (A, x, b)); + my_problem->setProblem(); + + //Create BlockGmres solver + Belos::BlockGmresSolMgr my_solver; + Teuchos::RCP solverParams = Teuchos::parameterList (); + my_solver.setParameters(solverParams); + my_solver.setProblem (my_problem); + + //Perform solve + my_solver.solve (); + + //Compute norm of solution vector + scalar_type norm_x = x->norm2(); + *out << "mantissa length of __float128 ST = " << FLT128_MANT_DIG << "\n"; + out->precision(FLT128_DIG); + *out << "precision = " << FLT128_DIG << "\n"; + *out << "||x|| = " << norm_x << "\n"; + scalar_type norm_x_gold = 1.695644420272127690523095201779e+03; + scalar_type diff = fabsq(norm_x-norm_x_gold); + *out << "diff = " << diff << "\n"; + if (diff < 3.0e-14) { + success = true; + } + } + + TEUCHOS_STANDARD_CATCH_STATEMENTS(true, std::cerr, success) + + if (success) { + *out << "End Result: TEST PASSED\n"; + } + else { + *out << "End Result: TEST FAILED\n"; + } + + return ( success ? 0 : 1 ); + +} // end test_bl_gmres_float128.cpp diff --git a/packages/ifpack2/test/belos/belos_solve_longdouble.cpp b/packages/ifpack2/test/belos/belos_solve_longdouble.cpp new file mode 100644 index 000000000000..8cdeaca0c7ff --- /dev/null +++ b/packages/ifpack2/test/belos/belos_solve_longdouble.cpp @@ -0,0 +1,178 @@ +//@HEADER +// ************************************************************************ +// +// Ifpack2: Templated Object-Oriented Algebraic Preconditioner Package +// Copyright (2009) Sandia Corporation +// +// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation, +// the U.S. Government retains certain rights in this software. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// 1. Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// +// 2. Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. +// +// 3. 
Neither the name of the Corporation nor the names of the +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY +// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +// LIABILITY, WHETHER IN CONTRACT, RICT LIABILITY, OR TORT (INCLUDING +// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// +// Questions? Contact Michael A. Heroux (maherou@sandia.gov) +// +// ************************************************************************ +//@HEADER +// +// This driver constructs a simple tridiagonal matrix and constant RHS, +// and solves this system using the Belos Block GMRES method with a 'long double' +// ScalarType. +// +// NOTE: No preconditioner is used in this case. +// +// +#include "Teuchos_CommandLineProcessor.hpp" +#include "Teuchos_ParameterList.hpp" +#include "Teuchos_StandardCatchMacros.hpp" +#include "Teuchos_StandardCatchMacros.hpp" + +#include +#include "BelosLinearProblem.hpp" +#include "BelosBlockGmresSolMgr.hpp" +#include "BelosTypes.hpp" +#include "BelosConfigDefs.hpp" +#include "BelosTpetraAdapter.hpp" + +#include +#include + +#include "Tpetra_Map.hpp" +#include "Tpetra_CrsMatrix.hpp" +#include "Tpetra_Operator.hpp" +#include "Tpetra_MultiVector.hpp" +#include "Tpetra_Vector.hpp" +#include "Tpetra_Core.hpp" +#include "Tpetra_Details_DefaultTypes.hpp" + +#include "Teuchos_VerboseObject.hpp" + +using namespace Teuchos; + +int main(int argc, char *argv[]) { + // + typedef long double scalar_type; + typedef int LO; + typedef Tpetra::Map Tpetra_Map; + typedef Tpetra::CrsMatrix Tpetra_CrsMatrix; + typedef Tpetra::Vector Tpetra_Vector; + typedef Tpetra::Operator OP; + typedef Tpetra::MultiVector MV; + typedef Belos::LinearProblem problem_type; + + Teuchos::GlobalMPISession session(&argc, &argv, NULL); + + bool success = false; + + auto comm = Tpetra::getDefaultComm (); + + Teuchos::RCP + out = Teuchos::VerboseObjectBase::getDefaultOStream(); + + try { + // The number of rows and columns in the matrix. + const Tpetra::global_size_t numGblIndices = 50; + + // Construct a Map that puts approximately the same number of + // equations on each processor. + Teuchos::RCP map = Teuchos::rcp (new Tpetra_Map (numGblIndices, 0, comm)); + auto numMyElements = map->getNodeNumElements (); + + // Create a Tpetra sparse matrix whose rows have distribution + // given by the Map. We expect at most three entries per row. + Teuchos::RCP A (new Tpetra_CrsMatrix (map, 3)); + // Fill the sparse matrix, one row at a time. 
+ const scalar_type two = static_cast (2.0); + const scalar_type negOne = static_cast (-1.0); + for (LO lclRow = 0; lclRow < static_cast (numMyElements); ++lclRow) { + const Tpetra::Details::DefaultTypes::global_ordinal_type gblRow = map->getGlobalElement (lclRow); + // A(0, 0:1) = [2, -1] + if (gblRow == 0) { + A->insertGlobalValues (gblRow, tuple (gblRow, gblRow + 1), + tuple<> (two, negOne)); + } + // A(N-1, N-2:N-1) = [-1, 2] + else if (static_cast (gblRow) == numGblIndices - 1) { + A->insertGlobalValues (gblRow, tuple (gblRow - 1, gblRow), + tuple<> (negOne, two)); + } + // A(i, i-1:i+1) = [-1, 2, -1] + else { + A->insertGlobalValues (gblRow, tuple (gblRow - 1, gblRow, gblRow + 1), + tuple<> (negOne, two, negOne)); + } + } + // Tell the sparse matrix that we are done adding entries to it. + A->fillComplete(); + + // Create b, the RHS vector + Teuchos::RCP b (new Tpetra_Vector (map, true)); + b->putScalar(1.0); + + //Allocate solution vector x + Teuchos::RCP x (new Tpetra_Vector (map, true)); + + //Create linear problem + Teuchos::RCP my_problem (new problem_type (A, x, b)); + my_problem->setProblem(); + + //Create BlockGmres solver + Belos::BlockGmresSolMgr my_solver; + Teuchos::RCP solverParams = Teuchos::parameterList (); + my_solver.setParameters(solverParams); + my_solver.setProblem (my_problem); + + //Perform solve + my_solver.solve (); + + //Compute norm of solution vector + scalar_type norm_x = x->norm2(); + *out << "mantissa length of long double ST = " << std::numeric_limits::digits << "\n"; + typedef std::numeric_limits ldbl; + out->precision(ldbl::max_digits10); + *out << "cout precision = " << ldbl::max_digits10 << "\n"; + *out << "||x|| = " << norm_x << "\n"; + scalar_type norm_x_gold = 1695.64442027212771791; + scalar_type diff = std::abs(norm_x-norm_x_gold); + *out << "diff = " << diff << "\n"; + if (diff < 1.0e-15) { + success = true; + } + } + + TEUCHOS_STANDARD_CATCH_STATEMENTS(true, std::cerr, success) + + if (success) { + *out << "End Result: TEST PASSED\n"; + } + else { + *out << "End Result: TEST FAILED\n"; + } + + return ( success ? 
0 : 1 ); + +} // end test_bl_gmres_longdouble.cpp diff --git a/packages/kokkos-kernels/src/sparse/impl/KokkosSparse_gauss_seidel_impl.hpp b/packages/kokkos-kernels/src/sparse/impl/KokkosSparse_gauss_seidel_impl.hpp index a4d74614d741..6b85e135537a 100644 --- a/packages/kokkos-kernels/src/sparse/impl/KokkosSparse_gauss_seidel_impl.hpp +++ b/packages/kokkos-kernels/src/sparse/impl/KokkosSparse_gauss_seidel_impl.hpp @@ -879,8 +879,15 @@ namespace KokkosSparse{ nnz_lno_persistent_work_view_t long_rows_per_color(Kokkos::ViewAllocateWithoutInitializing("long_rows_per_color"), numColors); nnz_lno_persistent_work_view_t max_row_length_per_color(Kokkos::ViewAllocateWithoutInitializing("max_row_length_per_color"), numColors); nnz_lno_t mostLongRowsInColor = 0; - Kokkos::parallel_reduce(team_policy_t(numColors, Kokkos::AUTO()), - SortIntoLongRowsFunctor(xadj, longRowThreshold, color_xadj, color_adj, long_rows_per_color, max_row_length_per_color), + SortIntoLongRowsFunctor sortIntoLongRowsFunctor(xadj, longRowThreshold, + color_xadj, color_adj, long_rows_per_color, max_row_length_per_color); + int sortLongRowsTeamSize = 1; + { + team_policy_t temp(1, 1); + sortLongRowsTeamSize = temp.team_size_recommended(sortIntoLongRowsFunctor, Kokkos::ParallelReduceTag()); + } + Kokkos::parallel_reduce(team_policy_t(numColors, sortLongRowsTeamSize), + sortIntoLongRowsFunctor, Kokkos::Max(mostLongRowsInColor)); auto host_long_rows_per_color = Kokkos::create_mirror_view(long_rows_per_color); Kokkos::deep_copy(host_long_rows_per_color, long_rows_per_color); diff --git a/packages/muelu/adapters/stratimikos/Thyra_MueLuPreconditionerFactory_decl.hpp b/packages/muelu/adapters/stratimikos/Thyra_MueLuPreconditionerFactory_decl.hpp index 2ac74705d79a..5a2764b9f75c 100644 --- a/packages/muelu/adapters/stratimikos/Thyra_MueLuPreconditionerFactory_decl.hpp +++ b/packages/muelu/adapters/stratimikos/Thyra_MueLuPreconditionerFactory_decl.hpp @@ -54,6 +54,7 @@ // Stratimikos needs Thyra, so we don't need special guards for Thyra here #include "Thyra_DefaultPreconditioner.hpp" #include "Thyra_BlockedLinearOpBase.hpp" +#include "Thyra_DiagonalLinearOpBase.hpp" #include "Thyra_XpetraLinearOp.hpp" #ifdef HAVE_MUELU_TPETRA #include "Thyra_TpetraLinearOp.hpp" @@ -84,6 +85,7 @@ #include #ifdef HAVE_MUELU_TPETRA #include +#include #endif #ifdef HAVE_MUELU_EPETRA #include @@ -93,16 +95,20 @@ #include "Kokkos_DefaultNode.hpp" +#include namespace Thyra { + using Teuchos::RCP; + using Teuchos::rcp; + + template + static bool replaceWithXpetra(ParameterList& paramList, std::string parameterName); + /** @brief Concrete preconditioner factory subclass for Thyra based on MueLu. @ingroup MueLuAdapters Add support for MueLu preconditioners in Thyra. This class provides an interface both for Epetra and Tpetra. - - The general implementation only handles Tpetra. For Epetra there is a specialization - on SC=double, LO=int, GO=int and NO=EpetraNode. */ template class MueLuPreconditionerFactory : public PreconditionerFactoryBase { @@ -168,390 +174,6 @@ namespace Thyra { }; -#ifdef HAVE_MUELU_EPETRA - /** @brief Concrete preconditioner factory subclass for Thyra based on MueLu. - @ingroup MueLuAdapters - Add support for MueLu preconditioners in Thyra. This class provides an interface both - for Epetra and Tpetra. 
- - Specialization for Epetra - */ - template <> - class MueLuPreconditionerFactory : public PreconditionerFactoryBase { - public: - typedef double Scalar; - typedef int LocalOrdinal; - typedef int GlobalOrdinal; - typedef Xpetra::EpetraNode Node; - - /** @name Constructors/initializers/accessors */ - //@{ - - /** \brief . */ - MueLuPreconditionerFactory() : paramList_(rcp(new ParameterList())) { } - - //@} - - /** @name Overridden from PreconditionerFactoryBase */ - //@{ - - /** \brief . */ - bool isCompatible(const LinearOpSourceBase& fwdOpSrc) const { - const RCP > fwdOp = fwdOpSrc.getOp(); - -#ifdef HAVE_MUELU_TPETRA - if (Xpetra::ThyraUtils::isTpetra(fwdOp)) return true; -#endif - -#ifdef HAVE_MUELU_EPETRA - if (Xpetra::ThyraUtils::isEpetra(fwdOp)) return true; -#endif - - if (Xpetra::ThyraUtils::isBlockedOperator(fwdOp)) return true; - - return false; - } - - /** \brief . */ - Teuchos::RCP > createPrec() const { - return Teuchos::rcp(new DefaultPreconditioner); - } - - /** \brief . */ - void initializePrec(const Teuchos::RCP >& fwdOpSrc, - PreconditionerBase* prec, - const ESupportSolveUse /* supportSolveUse */ - ) const { - using Teuchos::rcp_dynamic_cast; - - // we are using typedefs here, since we are using objects from different packages (Xpetra, Thyra,...) - typedef Xpetra::Map XpMap; - typedef Xpetra::Operator XpOp; - typedef Xpetra::ThyraUtils XpThyUtils; - typedef Xpetra::CrsMatrix XpCrsMat; - typedef Xpetra::BlockedCrsMatrix XpBlockedCrsMat; - typedef Xpetra::Matrix XpMat; - typedef Xpetra::MultiVector XpMultVec; - typedef Xpetra::MultiVector::magnitudeType,LocalOrdinal,GlobalOrdinal,Node> XpMultVecDouble; - typedef Thyra::LinearOpBase ThyLinOpBase; -#ifdef HAVE_MUELU_TPETRA - // TAW 1/26/2016: We deal with Tpetra objects -#if ((defined(EPETRA_HAVE_OMP) && (defined(HAVE_TPETRA_INST_OPENMP) && defined(HAVE_TPETRA_INST_INT_INT))) || \ - (!defined(EPETRA_HAVE_OMP) && (defined(HAVE_TPETRA_INST_SERIAL) && defined(HAVE_TPETRA_INST_INT_INT)))) - typedef MueLu::TpetraOperator MueTpOp; - typedef Tpetra::Operator TpOp; - typedef Thyra::TpetraLinearOp ThyTpLinOp; -#endif -#endif -#if defined(HAVE_MUELU_EPETRA) - typedef MueLu::EpetraOperator MueEpOp; - typedef Thyra::EpetraLinearOp ThyEpLinOp; -#endif - - //std::cout << "-======---------------------------------" << std::endl; - //std::cout << *paramList_ << std::endl; - //std::cout << "-======---------------------------------" << std::endl; - - // Check precondition - TEUCHOS_ASSERT(Teuchos::nonnull(fwdOpSrc)); - TEUCHOS_ASSERT(this->isCompatible(*fwdOpSrc)); - TEUCHOS_ASSERT(prec); - - // Create a copy, as we may remove some things from the list - ParameterList paramList = *paramList_; - - // Retrieve wrapped concrete Xpetra matrix from FwdOp - const RCP fwdOp = fwdOpSrc->getOp(); - TEUCHOS_TEST_FOR_EXCEPT(Teuchos::is_null(fwdOp)); - - // Check whether it is Epetra/Tpetra - bool bIsEpetra = XpThyUtils::isEpetra(fwdOp); - bool bIsTpetra = XpThyUtils::isTpetra(fwdOp); - bool bIsBlocked = XpThyUtils::isBlockedOperator(fwdOp); - TEUCHOS_TEST_FOR_EXCEPT((bIsEpetra == true && bIsTpetra == true)); - TEUCHOS_TEST_FOR_EXCEPT((bIsEpetra == bIsTpetra) && bIsBlocked == false); - TEUCHOS_TEST_FOR_EXCEPT((bIsEpetra != bIsTpetra) && bIsBlocked == true); - - RCP A = Teuchos::null; - if(bIsBlocked) { - Teuchos::RCP > ThyBlockedOp = - Teuchos::rcp_dynamic_cast >(fwdOp); - TEUCHOS_TEST_FOR_EXCEPT(Teuchos::is_null(ThyBlockedOp)); - - TEUCHOS_TEST_FOR_EXCEPT(ThyBlockedOp->blockExists(0,0)==false); - - Teuchos::RCP > b00 = ThyBlockedOp->getBlock(0,0); - 
TEUCHOS_TEST_FOR_EXCEPT(Teuchos::is_null(b00)); - - RCP xpetraFwdCrsMat00 = XpThyUtils::toXpetra(b00); - TEUCHOS_TEST_FOR_EXCEPT(Teuchos::is_null(xpetraFwdCrsMat00)); - - // MueLu needs a non-const object as input - RCP xpetraFwdCrsMatNonConst00 = Teuchos::rcp_const_cast(xpetraFwdCrsMat00); - TEUCHOS_TEST_FOR_EXCEPT(Teuchos::is_null(xpetraFwdCrsMatNonConst00)); - - // wrap the forward operator as an Xpetra::Matrix that MueLu can work with - RCP A00 = rcp(new Xpetra::CrsMatrixWrap(xpetraFwdCrsMatNonConst00)); - TEUCHOS_TEST_FOR_EXCEPT(Teuchos::is_null(A00)); - - RCP rowmap00 = A00->getRowMap(); - RCP< const Teuchos::Comm< int > > comm = rowmap00->getComm(); - - // create a Xpetra::BlockedCrsMatrix which derives from Xpetra::Matrix that MueLu can work with - RCP bMat = Teuchos::rcp(new XpBlockedCrsMat(ThyBlockedOp, comm)); - TEUCHOS_TEST_FOR_EXCEPT(Teuchos::is_null(bMat)); - - // save blocked matrix - A = bMat; - } else { - RCP xpetraFwdCrsMat = XpThyUtils::toXpetra(fwdOp); - TEUCHOS_TEST_FOR_EXCEPT(Teuchos::is_null(xpetraFwdCrsMat)); - - // MueLu needs a non-const object as input - RCP xpetraFwdCrsMatNonConst = Teuchos::rcp_const_cast(xpetraFwdCrsMat); - TEUCHOS_TEST_FOR_EXCEPT(Teuchos::is_null(xpetraFwdCrsMatNonConst)); - - // wrap the forward operator as an Xpetra::Matrix that MueLu can work with - A = rcp(new Xpetra::CrsMatrixWrap(xpetraFwdCrsMatNonConst)); - } - TEUCHOS_TEST_FOR_EXCEPT(Teuchos::is_null(A)); - - // Retrieve concrete preconditioner object - const Teuchos::Ptr > defaultPrec = Teuchos::ptr(dynamic_cast *>(prec)); - TEUCHOS_TEST_FOR_EXCEPT(Teuchos::is_null(defaultPrec)); - - // extract preconditioner operator - RCP thyra_precOp = Teuchos::null; - thyra_precOp = rcp_dynamic_cast >(defaultPrec->getNonconstUnspecifiedPrecOp(), true); - - // Variable for multigrid hierarchy: either build a new one or reuse the existing hierarchy - RCP > H = Teuchos::null; - - // make a decision whether to (re)build the multigrid preconditioner or reuse the old one - // rebuild preconditioner if startingOver == true - // reuse preconditioner if startingOver == false - const bool startingOver = (thyra_precOp.is_null() || !paramList.isParameter("reuse: type") || paramList.get("reuse: type") == "none"); - - if (startingOver == true) { - // extract coordinates from parameter list - Teuchos::RCP coordinates = Teuchos::null; - coordinates = MueLu::Utilities::ExtractCoordinatesFromParameterList(paramList); - - // TODO check for Xpetra or Thyra vectors? 
- RCP nullspace = Teuchos::null; -#ifdef HAVE_MUELU_TPETRA - if (bIsTpetra) { -#if ((defined(EPETRA_HAVE_OMP) && (defined(HAVE_TPETRA_INST_OPENMP) && defined(HAVE_TPETRA_INST_INT_INT))) || \ - (!defined(EPETRA_HAVE_OMP) && (defined(HAVE_TPETRA_INST_SERIAL) && defined(HAVE_TPETRA_INST_INT_INT)))) - typedef Tpetra::MultiVector tMV; - RCP tpetra_nullspace = Teuchos::null; - if (paramList.isType >("Nullspace")) { - tpetra_nullspace = paramList.get >("Nullspace"); - paramList.remove("Nullspace"); - nullspace = MueLu::TpetraMultiVector_To_XpetraMultiVector(tpetra_nullspace); - TEUCHOS_TEST_FOR_EXCEPT(Teuchos::is_null(nullspace)); - } -#else - TEUCHOS_TEST_FOR_EXCEPTION(true, MueLu::Exceptions::RuntimeError, - "Thyra::MueLuPreconditionerFactory: Tpetra does not support GO=int and or EpetraNode."); -#endif - } -#endif -#ifdef HAVE_MUELU_EPETRA - if (bIsEpetra) { - RCP epetra_nullspace = Teuchos::null; - if (paramList.isType >("Nullspace")) { - epetra_nullspace = paramList.get >("Nullspace"); - paramList.remove("Nullspace"); - RCP > xpEpNullspace = Teuchos::rcp(new Xpetra::EpetraMultiVectorT(epetra_nullspace)); - RCP::magnitudeType,int,int,Node> > xpEpNullspaceMult = rcp_dynamic_cast::magnitudeType,int,int,Node> >(xpEpNullspace); - nullspace = rcp_dynamic_cast(xpEpNullspaceMult); - TEUCHOS_TEST_FOR_EXCEPT(Teuchos::is_null(nullspace)); - } - } -#endif - // build a new MueLu hierarchy - const std::string userName = "user data"; - Teuchos::ParameterList& userParamList = paramList.sublist(userName); - if(Teuchos::nonnull(coordinates)) { - userParamList.set >("Coordinates", coordinates); - } - if(Teuchos::nonnull(nullspace)) { - userParamList.set >("Nullspace", nullspace); - } - H = MueLu::CreateXpetraPreconditioner(A, paramList); - - } else { - // reuse old MueLu hierarchy stored in MueLu Tpetra/Epetra operator and put in new matrix - - // get old MueLu hierarchy -#if defined(HAVE_MUELU_TPETRA) - if (bIsTpetra) { -#if ((defined(EPETRA_HAVE_OMP) && (defined(HAVE_TPETRA_INST_OPENMP) && defined(HAVE_TPETRA_INST_INT_INT))) || \ - (!defined(EPETRA_HAVE_OMP) && (defined(HAVE_TPETRA_INST_SERIAL) && defined(HAVE_TPETRA_INST_INT_INT)))) - RCP tpetr_precOp = rcp_dynamic_cast(thyra_precOp); - RCP muelu_precOp = rcp_dynamic_cast(tpetr_precOp->getTpetraOperator(),true); - - H = muelu_precOp->GetHierarchy(); -#else - TEUCHOS_TEST_FOR_EXCEPTION(true, MueLu::Exceptions::RuntimeError, - "Thyra::MueLuPreconditionerFactory: Tpetra does not support GO=int and or EpetraNode."); -#endif - } -#endif -#if defined(HAVE_MUELU_EPETRA)// && defined(HAVE_MUELU_SERIAL) - if (bIsEpetra) { - RCP epetr_precOp = rcp_dynamic_cast(thyra_precOp); - RCP muelu_precOp = rcp_dynamic_cast(epetr_precOp->epetra_op(),true); - - H = rcp_dynamic_cast >(muelu_precOp->GetHierarchy()); - } -#endif - // TODO add the blocked matrix case here... - - TEUCHOS_TEST_FOR_EXCEPTION(!H->GetNumLevels(), MueLu::Exceptions::RuntimeError, - "Thyra::MueLuPreconditionerFactory: Hierarchy has no levels in it"); - TEUCHOS_TEST_FOR_EXCEPTION(!H->GetLevel(0)->IsAvailable("A"), MueLu::Exceptions::RuntimeError, - "Thyra::MueLuPreconditionerFactory: Hierarchy has no fine level operator"); - RCP level0 = H->GetLevel(0); - RCP O0 = level0->Get >("A"); - RCP A0 = rcp_dynamic_cast(O0); - - if (!A0.is_null()) { - // If a user provided a "number of equations" argument in a parameter list - // during the initial setup, we must honor that settings and reuse it for - // all consequent setups. 
- A->SetFixedBlockSize(A0->GetFixedBlockSize()); - } - - // set new matrix - level0->Set("A", A); - - H->SetupRe(); - } - - // wrap hierarchy H in thyraPrecOp - RCP thyraPrecOp = Teuchos::null; -#if defined(HAVE_MUELU_TPETRA) - if (bIsTpetra) { -#if ((defined(EPETRA_HAVE_OMP) && (defined(HAVE_TPETRA_INST_OPENMP) && defined(HAVE_TPETRA_INST_INT_INT))) || \ - (!defined(EPETRA_HAVE_OMP) && (defined(HAVE_TPETRA_INST_SERIAL) && defined(HAVE_TPETRA_INST_INT_INT)))) - RCP muelu_tpetraOp = rcp(new MueTpOp(H)); - TEUCHOS_TEST_FOR_EXCEPT(Teuchos::is_null(muelu_tpetraOp)); - RCP tpOp = Teuchos::rcp_dynamic_cast(muelu_tpetraOp); - thyraPrecOp = Thyra::createLinearOp(tpOp); -#else - TEUCHOS_TEST_FOR_EXCEPTION(true, MueLu::Exceptions::RuntimeError, - "Thyra::MueLuPreconditionerFactory: Tpetra does not support GO=int and or EpetraNode."); -#endif - } -#endif - -#if defined(HAVE_MUELU_EPETRA) - if (bIsEpetra) { - RCP > epetraH = - rcp_dynamic_cast >(H); - TEUCHOS_TEST_FOR_EXCEPTION(Teuchos::is_null(epetraH), MueLu::Exceptions::RuntimeError, - "Thyra::MueLuPreconditionerFactory: Failed to cast Hierarchy to Hierarchy. Epetra runs only on the Serial node."); - RCP muelu_epetraOp = rcp(new MueEpOp(epetraH)); - TEUCHOS_TEST_FOR_EXCEPT(Teuchos::is_null(muelu_epetraOp)); - // attach fwdOp to muelu_epetraOp to guarantee that it will not go away - set_extra_data(fwdOp,"IFPF::fwdOp", Teuchos::inOutArg(muelu_epetraOp), Teuchos::POST_DESTROY,false); - RCP thyra_epetraOp = Thyra::nonconstEpetraLinearOp(muelu_epetraOp, NOTRANS, EPETRA_OP_APPLY_APPLY_INVERSE, EPETRA_OP_ADJOINT_UNSUPPORTED); - TEUCHOS_TEST_FOR_EXCEPT(Teuchos::is_null(thyra_epetraOp)); - thyraPrecOp = rcp_dynamic_cast(thyra_epetraOp); - } -#endif - - if(bIsBlocked) { - TEUCHOS_TEST_FOR_EXCEPT(Teuchos::nonnull(thyraPrecOp)); - - typedef MueLu::XpetraOperator MueXpOp; - const RCP muelu_xpetraOp = rcp(new MueXpOp(H)); - - RCP > thyraRangeSpace = Xpetra::ThyraUtils::toThyra(muelu_xpetraOp->getRangeMap()); - RCP > thyraDomainSpace = Xpetra::ThyraUtils::toThyra(muelu_xpetraOp->getDomainMap()); - - RCP > xpOp = Teuchos::rcp_dynamic_cast >(muelu_xpetraOp); - thyraPrecOp = Thyra::xpetraLinearOp(thyraRangeSpace, thyraDomainSpace,xpOp); - } - - TEUCHOS_TEST_FOR_EXCEPT(Teuchos::is_null(thyraPrecOp)); - - defaultPrec->initializeUnspecified(thyraPrecOp); - } - - /** \brief . */ - void uninitializePrec(PreconditionerBase* prec, - Teuchos::RCP >* fwdOp, - ESupportSolveUse* supportSolveUse - ) const { - TEUCHOS_ASSERT(prec); - - // Retrieve concrete preconditioner object - const Teuchos::Ptr > defaultPrec = Teuchos::ptr(dynamic_cast *>(prec)); - TEUCHOS_TEST_FOR_EXCEPT(Teuchos::is_null(defaultPrec)); - - if (fwdOp) { - // TODO: Implement properly instead of returning default value - *fwdOp = Teuchos::null; - } - - if (supportSolveUse) { - // TODO: Implement properly instead of returning default value - *supportSolveUse = Thyra::SUPPORT_SOLVE_UNSPECIFIED; - } - - defaultPrec->uninitialize(); - } - - //@} - - /** @name Overridden from Teuchos::ParameterListAcceptor */ - //@{ - - /** \brief . */ - void setParameterList(const Teuchos::RCP& paramList) { - TEUCHOS_TEST_FOR_EXCEPT(Teuchos::is_null(paramList)); - paramList_ = paramList; - } - /** \brief . */ - Teuchos::RCP unsetParameterList() { - RCP savedParamList = paramList_; - paramList_ = Teuchos::null; - return savedParamList; - } - /** \brief . */ - Teuchos::RCP getNonconstParameterList() { return paramList_; } - /** \brief . */ - Teuchos::RCP getParameterList() const { return paramList_; } - /** \brief . 
*/ - Teuchos::RCP getValidParameters() const { - static RCP validPL; - - if (Teuchos::is_null(validPL)) - validPL = rcp(new ParameterList()); - - return validPL; - } - //@} - - /** \name Public functions overridden from Describable. */ - //@{ - - /** \brief . */ - std::string description() const { return "Thyra::MueLuPreconditionerFactory"; } - - // ToDo: Add an override of describe(...) to give more detail! - - //@} - - private: - Teuchos::RCP paramList_; - }; // end specialization for Epetra - -#endif // HAVE_MUELU_EPETRA - } // namespace Thyra #endif // #ifdef HAVE_MUELU_STRATIMIKOS diff --git a/packages/muelu/adapters/stratimikos/Thyra_MueLuPreconditionerFactory_def.hpp b/packages/muelu/adapters/stratimikos/Thyra_MueLuPreconditionerFactory_def.hpp index f2db6fa2695b..0b484635f452 100644 --- a/packages/muelu/adapters/stratimikos/Thyra_MueLuPreconditionerFactory_def.hpp +++ b/packages/muelu/adapters/stratimikos/Thyra_MueLuPreconditionerFactory_def.hpp @@ -55,8 +55,176 @@ namespace Thyra { using Teuchos::RCP; using Teuchos::rcp; using Teuchos::ParameterList; + using Teuchos::rcp_dynamic_cast; + using Teuchos::rcp_const_cast; + template + static bool replaceWithXpetra(ParameterList& paramList, std::string parameterName) { + typedef typename Teuchos::ScalarTraits::magnitudeType Magnitude; + typedef Xpetra::Operator XpOp; + typedef Xpetra::ThyraUtils XpThyUtils; + typedef Xpetra::CrsMatrixWrap XpCrsMatWrap; + typedef Xpetra::CrsMatrix XpCrsMat; + typedef Xpetra::Matrix XpMat; + typedef Xpetra::MultiVector XpMultVec; + typedef Xpetra::MultiVector XpMagMultVec; + typedef Xpetra::Vector XpVec; + + typedef Thyra::LinearOpBase ThyLinOpBase; + typedef Thyra::DiagonalLinearOpBase ThyDiagLinOpBase; + typedef Thyra::XpetraLinearOp ThyXpOp; + typedef Thyra::SpmdVectorSpaceBase ThyVSBase; + +#ifdef HAVE_MUELU_TPETRA + typedef Tpetra::CrsMatrix TpCrsMat; + typedef Tpetra::Vector tV; + typedef Thyra::TpetraVector thyTpV; + typedef Tpetra::MultiVector tMV; + typedef Tpetra::MultiVector tMagMV; +# if defined(HAVE_TPETRA_INST_DOUBLE) && defined(HAVE_TPETRA_INST_FLOAT) + typedef typename Teuchos::ScalarTraits::halfPrecision HalfMagnitude; + typedef Tpetra::MultiVector tHalfMagMV; +# endif +#endif +#if defined(HAVE_MUELU_EPETRA) + typedef Xpetra::EpetraCrsMatrixT XpEpCrsMat; +#endif + + if (paramList.isParameter(parameterName)) { + if (paramList.isType >(parameterName)) + return true; + else if (paramList.isType >(parameterName)) { + RCP constM = paramList.get >(parameterName); + paramList.remove(parameterName); + RCP M = rcp_const_cast(constM); + paramList.set >(parameterName, M); + return true; + } + else if (paramList.isType >(parameterName)) + return true; + else if (paramList.isType >(parameterName)) { + RCP constX = paramList.get >(parameterName); + paramList.remove(parameterName); + RCP X = rcp_const_cast(constX); + paramList.set >(parameterName, X); + return true; + } + else if (paramList.isType >(parameterName)) + return true; + else if (paramList.isType >(parameterName)) { + RCP constX = paramList.get >(parameterName); + paramList.remove(parameterName); + RCP X = rcp_const_cast(constX); + paramList.set >(parameterName, X); + return true; + } +#ifdef HAVE_MUELU_TPETRA + else if (paramList.isType >(parameterName)) { + RCP tM = paramList.get >(parameterName); + paramList.remove(parameterName); + RCP xM = rcp_dynamic_cast(tM, true); + paramList.set >(parameterName, xM); + return true; + } else if (paramList.isType >(parameterName)) { + RCP tpetra_X = paramList.get >(parameterName); + 
paramList.remove(parameterName); + RCP X = MueLu::TpetraMultiVector_To_XpetraMultiVector(tpetra_X); + paramList.set >(parameterName, X); + TEUCHOS_TEST_FOR_EXCEPT(Teuchos::is_null(X)); + return true; + } else if (paramList.isType >(parameterName)) { + RCP tpetra_X = paramList.get >(parameterName); + paramList.remove(parameterName); + RCP X = MueLu::TpetraMultiVector_To_XpetraMultiVector(tpetra_X); + paramList.set >(parameterName, X); + TEUCHOS_TEST_FOR_EXCEPT(Teuchos::is_null(X)); + return true; + } +# if defined(HAVE_TPETRA_INST_DOUBLE) && defined(HAVE_TPETRA_INST_FLOAT) + else if (paramList.isType >(parameterName)) { + RCP tpetra_hX = paramList.get >(parameterName); + paramList.remove(parameterName); + RCP tpetra_X = rcp(new tMagMV(tpetra_hX->getMap(),tpetra_hX->getNumVectors())); + Tpetra::deep_copy(*tpetra_X,*tpetra_hX); + RCP X = MueLu::TpetraMultiVector_To_XpetraMultiVector(tpetra_X); + paramList.set >(parameterName, X); + TEUCHOS_TEST_FOR_EXCEPT(Teuchos::is_null(X)); + return true; + } +# endif +#endif +#ifdef HAVE_MUELU_EPETRA + else if (paramList.isType >(parameterName)) { + RCP eM = paramList.get >(parameterName); + paramList.remove(parameterName); + RCP xeM = rcp(new XpEpCrsMat(eM)); + RCP xCrsM = rcp_dynamic_cast(xeM, true); + RCP xwM = rcp(new XpCrsMatWrap(xCrsM)); + RCP xM = rcp_dynamic_cast(xwM); + paramList.set >(parameterName, xM); + return true; + } else if (paramList.isType >(parameterName)) { + RCP epetra_X = Teuchos::null; + epetra_X = paramList.get >(parameterName); + paramList.remove(parameterName); + RCP > xpEpX = rcp(new Xpetra::EpetraMultiVectorT(epetra_X)); + RCP > xpEpXMult = rcp_dynamic_cast >(xpEpX, true); + RCP X = rcp_dynamic_cast(xpEpXMult, true); + paramList.set >(parameterName, X); + return true; + } +#endif + else if (paramList.isType >(parameterName)) { + RCP thyM = paramList.get >(parameterName); + paramList.remove(parameterName); + RCP crsM = XpThyUtils::toXpetra(thyM); + TEUCHOS_TEST_FOR_EXCEPT(Teuchos::is_null(crsM)); + // MueLu needs a non-const object as input + RCP crsMNonConst = rcp_const_cast(crsM); + TEUCHOS_TEST_FOR_EXCEPT(Teuchos::is_null(crsMNonConst)); + // wrap as an Xpetra::Matrix that MueLu can work with + RCP M = rcp(new Xpetra::CrsMatrixWrap(crsMNonConst)); + paramList.set >(parameterName, M); + return true; + } else if (paramList.isType >(parameterName)) { + RCP thyM = paramList.get >(parameterName); + paramList.remove(parameterName); + RCP > diag = thyM->getDiag(); + + RCP xpDiag; +#ifdef HAVE_MUELU_TPETRA + if (!rcp_dynamic_cast(diag).is_null()) { + RCP tDiag = Thyra::TpetraOperatorVectorExtraction::getConstTpetraVector(diag); + if (!tDiag.is_null()) + xpDiag = Xpetra::toXpetra(tDiag); + } +#endif +#ifdef HAVE_MUELU_EPETRA + if (xpDiag.is_null()) { + RCP comm = Thyra::get_Epetra_Comm(*rcp_dynamic_cast(thyM->range())->getComm()); + RCP map = Thyra::get_Epetra_Map(*(thyM->range()), comm); + if (!map.is_null()) { + RCP eDiag = Thyra::get_Epetra_Vector(*map, diag); + RCP nceDiag = rcp_const_cast(eDiag); + RCP > xpEpDiag = rcp(new Xpetra::EpetraVectorT(nceDiag)); + xpDiag = rcp_dynamic_cast(xpEpDiag, true); + } + } +#endif + TEUCHOS_ASSERT(!xpDiag.is_null()); + RCP M = Xpetra::MatrixFactory::Build(xpDiag); + paramList.set >(parameterName, M); + return true; + } + else { + TEUCHOS_TEST_FOR_EXCEPTION(true, MueLu::Exceptions::RuntimeError, "Parameter " << parameterName << " has wrong type."); + return false; + } + } else + return false; + } + // Constructors/initializers/accessors template @@ -74,6 +242,11 @@ namespace Thyra { if 
(Xpetra::ThyraUtils::isTpetra(fwdOp)) return true; #endif +#ifdef HAVE_MUELU_EPETRA + if (Xpetra::ThyraUtils::isEpetra(fwdOp)) return true; +#endif + + if (Xpetra::ThyraUtils::isBlockedOperator(fwdOp)) return true; return false; @@ -93,6 +266,7 @@ namespace Thyra { // we are using typedefs here, since we are using objects from different packages (Xpetra, Thyra,...) typedef Xpetra::Map XpMap; typedef Xpetra::Operator XpOp; + typedef MueLu::XpetraOperator MueLuXpOp; typedef Xpetra::ThyraUtils XpThyUtils; typedef Xpetra::CrsMatrix XpCrsMat; typedef Xpetra::BlockedCrsMatrix XpBlockedCrsMat; @@ -100,12 +274,22 @@ namespace Thyra { typedef Xpetra::MultiVector XpMultVec; typedef Xpetra::MultiVector::coordinateType,LocalOrdinal,GlobalOrdinal,Node> XpMultVecDouble; typedef Thyra::LinearOpBase ThyLinOpBase; -#ifdef HAVE_MUELU_TPETRA - typedef MueLu::TpetraOperator MueTpOp; - typedef Tpetra::Operator TpOp; - typedef Thyra::TpetraLinearOp ThyTpLinOp; + typedef Thyra::XpetraLinearOp ThyXpOp; + typedef Xpetra::MultiVector XpMV; + typedef typename Teuchos::ScalarTraits::magnitudeType Magnitude; + typedef Xpetra::MultiVector XpmMV; +#if defined(HAVE_MUELU_TPETRA) && defined(HAVE_TPETRA_INST_DOUBLE) && defined(HAVE_TPETRA_INST_FLOAT) + typedef Xpetra::TpetraHalfPrecisionOperator XpHalfPrecOp; + typedef typename XpHalfPrecOp::HalfScalar HalfScalar; + typedef Xpetra::Operator XpHalfOp; + typedef MueLu::XpetraOperator MueLuHalfXpOp; + typedef typename Teuchos::ScalarTraits::halfPrecision HalfMagnitude; + typedef Xpetra::MultiVector XphMV; + typedef Xpetra::MultiVector XphmMV; + typedef Xpetra::Matrix XphMat; #endif + // Check precondition TEUCHOS_ASSERT(Teuchos::nonnull(fwdOpSrc)); TEUCHOS_ASSERT(this->isCompatible(*fwdOpSrc)); @@ -178,104 +362,148 @@ namespace Thyra { RCP thyra_precOp = Teuchos::null; thyra_precOp = rcp_dynamic_cast >(defaultPrec->getNonconstUnspecifiedPrecOp(), true); - // Variable for multigrid hierarchy: either build a new one or reuse the existing hierarchy - RCP > H = Teuchos::null; - // make a decision whether to (re)build the multigrid preconditioner or reuse the old one // rebuild preconditioner if startingOver == true // reuse preconditioner if startingOver == false const bool startingOver = (thyra_precOp.is_null() || !paramList.isParameter("reuse: type") || paramList.get("reuse: type") == "none"); - + bool useHalfPrecision = false; + if (paramList.isParameter("half precision")) + useHalfPrecision = paramList.get("half precision"); + else if (paramList.isSublist("Hierarchy") && paramList.sublist("Hierarchy").isParameter("half precision")) + useHalfPrecision = paramList.sublist("Hierarchy").get("half precision"); + if (useHalfPrecision) + TEUCHOS_TEST_FOR_EXCEPTION(!bIsTpetra, MueLu::Exceptions::RuntimeError, "The only scalar type Epetra knows is double, so a half precision preconditioner cannot be constructed."); + + RCP xpPrecOp; if (startingOver == true) { - // extract coordinates from parameter list - RCP coordinates = Teuchos::null; - coordinates = MueLu::Utilities::ExtractCoordinatesFromParameterList(paramList); - - // TODO check for Xpetra or Thyra vectors? 
- RCP nullspace = Teuchos::null; -#ifdef HAVE_MUELU_TPETRA - if (bIsTpetra) { - typedef Tpetra::MultiVector tMV; - RCP tpetra_nullspace = Teuchos::null; - if (paramList.isType >("Nullspace")) { - tpetra_nullspace = paramList.get >("Nullspace"); + // Convert to Xpetra + std::list convertXpetra = {"Coordinates", "Nullspace"}; + for (auto it = convertXpetra.begin(); it != convertXpetra.end(); ++it) + replaceWithXpetra(paramList,*it); + + if (useHalfPrecision) { +#if defined(HAVE_MUELU_TPETRA) && defined(HAVE_TPETRA_INST_DOUBLE) && defined(HAVE_TPETRA_INST_FLOAT) + + // convert to half precision + RCP halfA = Xpetra::convertToHalfPrecision(A); + const std::string userName = "user data"; + Teuchos::ParameterList& userParamList = paramList.sublist(userName); + if (userParamList.isType >("Coordinates")) { + RCP coords = userParamList.get >("Coordinates"); + userParamList.remove("Coordinates"); + RCP halfCoords = Xpetra::convertToHalfPrecision(coords); + userParamList.set("Coordinates",halfCoords); + } + if (userParamList.isType >("Nullspace")) { + RCP nullspace = userParamList.get >("Nullspace"); + userParamList.remove("Nullspace"); + RCP halfNullspace = Xpetra::convertToHalfPrecision(nullspace); + userParamList.set("Nullspace",halfNullspace); + } + if (paramList.isType >("Coordinates")) { + RCP coords = paramList.get >("Coordinates"); + paramList.remove("Coordinates"); + RCP halfCoords = Xpetra::convertToHalfPrecision(coords); + userParamList.set("Coordinates",halfCoords); + } + if (paramList.isType >("Nullspace")) { + RCP nullspace = paramList.get >("Nullspace"); paramList.remove("Nullspace"); - nullspace = MueLu::TpetraMultiVector_To_XpetraMultiVector(tpetra_nullspace); - TEUCHOS_TEST_FOR_EXCEPT(Teuchos::is_null(nullspace)); + RCP halfNullspace = Xpetra::convertToHalfPrecision(nullspace); + userParamList.set("Nullspace",halfNullspace); } - } -#endif - // build a new MueLu hierarchy - ParameterList& userParamList = paramList.sublist("user data"); - if(Teuchos::nonnull(coordinates)) { - userParamList.set >("Coordinates", coordinates); - } - if(Teuchos::nonnull(nullspace)) { - userParamList.set >("Nullspace", nullspace); - } - H = MueLu::CreateXpetraPreconditioner(A, paramList); - } else { - // reuse old MueLu hierarchy stored in MueLu Tpetra/Epetra operator and put in new matrix - // get old MueLu hierarchy -#if defined(HAVE_MUELU_TPETRA) - if (bIsTpetra) { + // build a new half-precision MueLu preconditioner - RCP tpetr_precOp = rcp_dynamic_cast(thyra_precOp); - RCP muelu_precOp = rcp_dynamic_cast(tpetr_precOp->getTpetraOperator(),true); - - H = muelu_precOp->GetHierarchy(); - } + RCP > H = MueLu::CreateXpetraPreconditioner(halfA, paramList); + RCP xpOp = rcp(new MueLuHalfXpOp(H)); + xpPrecOp = rcp(new XpHalfPrecOp(xpOp)); +#else + TEUCHOS_TEST_FOR_EXCEPT(true); #endif - // TODO add the blocked matrix case here... - - TEUCHOS_TEST_FOR_EXCEPTION(!H->GetNumLevels(), MueLu::Exceptions::RuntimeError, - "Thyra::MueLuPreconditionerFactory: Hierarchy has no levels in it"); - TEUCHOS_TEST_FOR_EXCEPTION(!H->GetLevel(0)->IsAvailable("A"), MueLu::Exceptions::RuntimeError, - "Thyra::MueLuPreconditionerFactory: Hierarchy has no fine level operator"); - RCP level0 = H->GetLevel(0); - RCP O0 = level0->Get >("A"); - RCP A0 = rcp_dynamic_cast(O0); - - if (!A0.is_null()) { - // If a user provided a "number of equations" argument in a parameter list - // during the initial setup, we must honor that settings and reuse it for - // all consequent setups. 
- A->SetFixedBlockSize(A0->GetFixedBlockSize()); - } - - // set new matrix - level0->Set("A", A); + } else + { + const std::string userName = "user data"; + Teuchos::ParameterList& userParamList = paramList.sublist(userName); + if (paramList.isType >("Coordinates")) { + RCP coords = paramList.get >("Coordinates"); + paramList.remove("Coordinates"); + userParamList.set("Coordinates",coords); + } + if (paramList.isType >("Nullspace")) { + RCP nullspace = paramList.get >("Nullspace"); + paramList.remove("Nullspace"); + userParamList.set("Nullspace",nullspace); + } + + // build a new MueLu RefMaxwell preconditioner + RCP > H = MueLu::CreateXpetraPreconditioner(A, paramList); + xpPrecOp = rcp(new MueLuXpOp(H)); + } + } else { + // reuse old MueLu hierarchy stored in MueLu Xpetra operator and put in new matrix + RCP thyXpOp = rcp_dynamic_cast(thyra_precOp, true); + xpPrecOp = rcp_dynamic_cast(thyXpOp->getXpetraOperator(), true); +#if defined(HAVE_MUELU_TPETRA) && defined(HAVE_TPETRA_INST_DOUBLE) && defined(HAVE_TPETRA_INST_FLOAT) + RCP xpHalfPrecOp = rcp_dynamic_cast(xpPrecOp); + if (!xpHalfPrecOp.is_null()) { + RCP > H = rcp_dynamic_cast(xpHalfPrecOp->GetHalfPrecisionOperator(), true)->GetHierarchy(); + RCP halfA = Xpetra::convertToHalfPrecision(A); + + TEUCHOS_TEST_FOR_EXCEPTION(!H->GetNumLevels(), MueLu::Exceptions::RuntimeError, + "Thyra::MueLuPreconditionerFactory: Hierarchy has no levels in it"); + TEUCHOS_TEST_FOR_EXCEPTION(!H->GetLevel(0)->IsAvailable("A"), MueLu::Exceptions::RuntimeError, + "Thyra::MueLuPreconditionerFactory: Hierarchy has no fine level operator"); + RCP level0 = H->GetLevel(0); + RCP O0 = level0->Get >("A"); + RCP A0 = rcp_dynamic_cast(O0, true); + + if (!A0.is_null()) { + // If a user provided a "number of equations" argument in a parameter list + // during the initial setup, we must honor that settings and reuse it for + // all consequent setups. + halfA->SetFixedBlockSize(A0->GetFixedBlockSize()); + } - H->SetupRe(); - } + // set new matrix + level0->Set("A", halfA); - // wrap hierarchy H in thyraPrecOp - RCP thyraPrecOp = Teuchos::null; -#if defined(HAVE_MUELU_TPETRA) - if (bIsTpetra) { - RCP muelu_tpetraOp = rcp(new MueTpOp(H)); - TEUCHOS_TEST_FOR_EXCEPT(Teuchos::is_null(muelu_tpetraOp)); - RCP tpOp = Teuchos::rcp_dynamic_cast(muelu_tpetraOp); - thyraPrecOp = Thyra::createLinearOp(tpOp); - } + H->SetupRe(); + } else #endif + { + // get old MueLu hierarchy + RCP xpOp = rcp_dynamic_cast(thyXpOp->getXpetraOperator(), true); + RCP > H = xpOp->GetHierarchy();; + + TEUCHOS_TEST_FOR_EXCEPTION(!H->GetNumLevels(), MueLu::Exceptions::RuntimeError, + "Thyra::MueLuPreconditionerFactory: Hierarchy has no levels in it"); + TEUCHOS_TEST_FOR_EXCEPTION(!H->GetLevel(0)->IsAvailable("A"), MueLu::Exceptions::RuntimeError, + "Thyra::MueLuPreconditionerFactory: Hierarchy has no fine level operator"); + RCP level0 = H->GetLevel(0); + RCP O0 = level0->Get >("A"); + RCP A0 = rcp_dynamic_cast(O0); + + if (!A0.is_null()) { + // If a user provided a "number of equations" argument in a parameter list + // during the initial setup, we must honor that settings and reuse it for + // all consequent setups. 
+ A->SetFixedBlockSize(A0->GetFixedBlockSize()); + } - if(bIsBlocked) { - TEUCHOS_TEST_FOR_EXCEPT(Teuchos::nonnull(thyraPrecOp)); - - typedef MueLu::XpetraOperator MueXpOp; - //typedef Thyra::XpetraLinearOp ThyXpLinOp; // unused - const RCP muelu_xpetraOp = rcp(new MueXpOp(H)); - - RCP > thyraRangeSpace = Xpetra::ThyraUtils::toThyra(muelu_xpetraOp->getRangeMap()); - RCP > thyraDomainSpace = Xpetra::ThyraUtils::toThyra(muelu_xpetraOp->getDomainMap()); + // set new matrix + level0->Set("A", A); - RCP > xpOp = Teuchos::rcp_dynamic_cast >(muelu_xpetraOp); - thyraPrecOp = Thyra::xpetraLinearOp(thyraRangeSpace, thyraDomainSpace,xpOp); + H->SetupRe(); + } } + // wrap preconditioner in thyraPrecOp + RCP > thyraRangeSpace = Xpetra::ThyraUtils::toThyra(xpPrecOp->getRangeMap()); + RCP > thyraDomainSpace = Xpetra::ThyraUtils::toThyra(xpPrecOp->getDomainMap()); + + RCP thyraPrecOp = Thyra::xpetraLinearOp(thyraRangeSpace, thyraDomainSpace, xpPrecOp); TEUCHOS_TEST_FOR_EXCEPT(Teuchos::is_null(thyraPrecOp)); defaultPrec->initializeUnspecified(thyraPrecOp); diff --git a/packages/muelu/adapters/stratimikos/Thyra_MueLuRefMaxwellPreconditionerFactory_decl.hpp b/packages/muelu/adapters/stratimikos/Thyra_MueLuRefMaxwellPreconditionerFactory_decl.hpp index aaf801cbb003..3af40c908e52 100644 --- a/packages/muelu/adapters/stratimikos/Thyra_MueLuRefMaxwellPreconditionerFactory_decl.hpp +++ b/packages/muelu/adapters/stratimikos/Thyra_MueLuRefMaxwellPreconditionerFactory_decl.hpp @@ -56,6 +56,7 @@ #include "Thyra_BlockedLinearOpBase.hpp" #include "Thyra_DiagonalLinearOpBase.hpp" #include "Thyra_XpetraLinearOp.hpp" +#include #ifdef HAVE_MUELU_TPETRA #include "Thyra_TpetraLinearOp.hpp" #include "Thyra_TpetraThyraWrappers.hpp" @@ -95,9 +96,6 @@ namespace Thyra { - template - bool replaceWithXpetra(ParameterList& paramList, std::string parameterName); - /** @brief Concrete preconditioner factory subclass for Thyra based on MueLu. @ingroup MueLuAdapters Add support for MueLu preconditioners in Thyra. 
This class provides an interface both @@ -283,7 +281,7 @@ namespace Thyra { // rebuild preconditioner if startingOver == true // reuse preconditioner if startingOver == false const bool startingOver = (thyra_precOp.is_null() || !paramList.isParameter("refmaxwell: enable reuse") || !paramList.get("refmaxwell: enable reuse")); - const bool useHalfPrecision = paramList.get("refmaxwell: half precision", false) && bIsTpetra; + const bool useHalfPrecision = paramList.get("half precision", false) && bIsTpetra; RCP xpPrecOp; if (startingOver == true) { diff --git a/packages/muelu/adapters/stratimikos/Thyra_MueLuRefMaxwellPreconditionerFactory_def.hpp b/packages/muelu/adapters/stratimikos/Thyra_MueLuRefMaxwellPreconditionerFactory_def.hpp index df6aba25f482..a098d8065b99 100644 --- a/packages/muelu/adapters/stratimikos/Thyra_MueLuRefMaxwellPreconditionerFactory_def.hpp +++ b/packages/muelu/adapters/stratimikos/Thyra_MueLuRefMaxwellPreconditionerFactory_def.hpp @@ -59,173 +59,6 @@ namespace Thyra { using Teuchos::rcp_dynamic_cast; using Teuchos::rcp_const_cast; - template - bool replaceWithXpetra(ParameterList& paramList, std::string parameterName) { - typedef typename Teuchos::ScalarTraits::magnitudeType Magnitude; - typedef Xpetra::Operator XpOp; - typedef Xpetra::ThyraUtils XpThyUtils; - typedef Xpetra::CrsMatrixWrap XpCrsMatWrap; - typedef Xpetra::CrsMatrix XpCrsMat; - typedef Xpetra::Matrix XpMat; - typedef Xpetra::MultiVector XpMultVec; - typedef Xpetra::MultiVector XpMagMultVec; - typedef Xpetra::Vector XpVec; - - typedef Thyra::LinearOpBase ThyLinOpBase; - typedef Thyra::DiagonalLinearOpBase ThyDiagLinOpBase; - typedef Thyra::XpetraLinearOp ThyXpOp; - typedef Thyra::SpmdVectorSpaceBase ThyVSBase; - -#ifdef HAVE_MUELU_TPETRA - typedef Tpetra::CrsMatrix TpCrsMat; - typedef Tpetra::Vector tV; - typedef Thyra::TpetraVector thyTpV; - typedef Tpetra::MultiVector tMV; - typedef Tpetra::MultiVector tMagMV; -# if defined(HAVE_TPETRA_INST_DOUBLE) && defined(HAVE_TPETRA_INST_FLOAT) - typedef typename Teuchos::ScalarTraits::halfPrecision HalfMagnitude; - typedef Tpetra::MultiVector tHalfMagMV; -# endif -#endif -#if defined(HAVE_MUELU_EPETRA) - typedef Xpetra::EpetraCrsMatrixT XpEpCrsMat; -#endif - - if (paramList.isParameter(parameterName)) { - if (paramList.isType >(parameterName)) - return true; - else if (paramList.isType >(parameterName)) { - RCP constM = paramList.get >(parameterName); - paramList.remove(parameterName); - RCP M = rcp_const_cast(constM); - paramList.set >(parameterName, M); - return true; - } - else if (paramList.isType >(parameterName)) - return true; - else if (paramList.isType >(parameterName)) { - RCP constX = paramList.get >(parameterName); - paramList.remove(parameterName); - RCP X = rcp_const_cast(constX); - paramList.set >(parameterName, X); - return true; - } - else if (paramList.isType >(parameterName)) - return true; - else if (paramList.isType >(parameterName)) { - RCP constX = paramList.get >(parameterName); - paramList.remove(parameterName); - RCP X = rcp_const_cast(constX); - paramList.set >(parameterName, X); - return true; - } -#ifdef HAVE_MUELU_TPETRA - else if (paramList.isType >(parameterName)) { - RCP tM = paramList.get >(parameterName); - paramList.remove(parameterName); - RCP xM = rcp_dynamic_cast(tM, true); - paramList.set >(parameterName, xM); - return true; - } else if (paramList.isType >(parameterName)) { - RCP tpetra_X = paramList.get >(parameterName); - paramList.remove(parameterName); - RCP X = MueLu::TpetraMultiVector_To_XpetraMultiVector(tpetra_X); - 
paramList.set >(parameterName, X); - TEUCHOS_TEST_FOR_EXCEPT(Teuchos::is_null(X)); - return true; - } else if (paramList.isType >(parameterName)) { - RCP tpetra_X = paramList.get >(parameterName); - paramList.remove(parameterName); - RCP X = MueLu::TpetraMultiVector_To_XpetraMultiVector(tpetra_X); - paramList.set >(parameterName, X); - TEUCHOS_TEST_FOR_EXCEPT(Teuchos::is_null(X)); - return true; - } -# if defined(HAVE_TPETRA_INST_DOUBLE) && defined(HAVE_TPETRA_INST_FLOAT) - else if (paramList.isType >(parameterName)) { - RCP tpetra_hX = paramList.get >(parameterName); - paramList.remove(parameterName); - RCP tpetra_X = rcp(new tMagMV(tpetra_hX->getMap(),tpetra_hX->getNumVectors())); - Tpetra::deep_copy(*tpetra_X,*tpetra_hX); - RCP X = MueLu::TpetraMultiVector_To_XpetraMultiVector(tpetra_X); - paramList.set >(parameterName, X); - TEUCHOS_TEST_FOR_EXCEPT(Teuchos::is_null(X)); - return true; - } -# endif -#endif -#ifdef HAVE_MUELU_EPETRA - else if (paramList.isType >(parameterName)) { - RCP eM = paramList.get >(parameterName); - paramList.remove(parameterName); - RCP xeM = rcp(new XpEpCrsMat(eM)); - RCP xCrsM = rcp_dynamic_cast(xeM, true); - RCP xwM = rcp(new XpCrsMatWrap(xCrsM)); - RCP xM = rcp_dynamic_cast(xwM); - paramList.set >(parameterName, xM); - return true; - } else if (paramList.isType >(parameterName)) { - RCP epetra_X = Teuchos::null; - epetra_X = paramList.get >(parameterName); - paramList.remove(parameterName); - RCP > xpEpX = rcp(new Xpetra::EpetraMultiVectorT(epetra_X)); - RCP > xpEpXMult = rcp_dynamic_cast >(xpEpX, true); - RCP X = rcp_dynamic_cast(xpEpXMult, true); - paramList.set >(parameterName, X); - return true; - } -#endif - else if (paramList.isType >(parameterName)) { - RCP thyM = paramList.get >(parameterName); - paramList.remove(parameterName); - RCP crsM = XpThyUtils::toXpetra(thyM); - TEUCHOS_TEST_FOR_EXCEPT(Teuchos::is_null(crsM)); - // MueLu needs a non-const object as input - RCP crsMNonConst = rcp_const_cast(crsM); - TEUCHOS_TEST_FOR_EXCEPT(Teuchos::is_null(crsMNonConst)); - // wrap as an Xpetra::Matrix that MueLu can work with - RCP M = rcp(new Xpetra::CrsMatrixWrap(crsMNonConst)); - paramList.set >(parameterName, M); - return true; - } else if (paramList.isType >(parameterName)) { - RCP thyM = paramList.get >(parameterName); - paramList.remove(parameterName); - RCP > diag = thyM->getDiag(); - - RCP xpDiag; -#ifdef HAVE_MUELU_TPETRA - if (!rcp_dynamic_cast(diag).is_null()) { - RCP tDiag = Thyra::TpetraOperatorVectorExtraction::getConstTpetraVector(diag); - if (!tDiag.is_null()) - xpDiag = Xpetra::toXpetra(tDiag); - } -#endif -#ifdef HAVE_MUELU_EPETRA - if (xpDiag.is_null()) { - RCP comm = Thyra::get_Epetra_Comm(*rcp_dynamic_cast(thyM->range())->getComm()); - RCP map = Thyra::get_Epetra_Map(*(thyM->range()), comm); - if (!map.is_null()) { - RCP eDiag = Thyra::get_Epetra_Vector(*map, diag); - RCP nceDiag = rcp_const_cast(eDiag); - RCP > xpEpDiag = rcp(new Xpetra::EpetraVectorT(nceDiag)); - xpDiag = rcp_dynamic_cast(xpEpDiag, true); - } - } -#endif - TEUCHOS_ASSERT(!xpDiag.is_null()); - RCP M = Xpetra::MatrixFactory::Build(xpDiag); - paramList.set >(parameterName, M); - return true; - } - else { - TEUCHOS_TEST_FOR_EXCEPTION(true, MueLu::Exceptions::RuntimeError, "Parameter " << parameterName << " has wrong type."); - return false; - } - } else - return false; - } - - // Constructors/initializers/accessors template @@ -317,7 +150,7 @@ namespace Thyra { // rebuild preconditioner if startingOver == true // reuse preconditioner if startingOver == false const bool 
startingOver = (thyra_precOp.is_null() || !paramList.isParameter("refmaxwell: enable reuse") || !paramList.get("refmaxwell: enable reuse")); - const bool useHalfPrecision = paramList.get("refmaxwell: half precision", false) && bIsTpetra; + const bool useHalfPrecision = paramList.get("half precision", false) && bIsTpetra; RCP xpPrecOp; if (startingOver == true) { diff --git a/packages/muelu/doc/UsersGuide/masterList.xml b/packages/muelu/doc/UsersGuide/masterList.xml index efdcf0a2371e..0084a47ec00d 100644 --- a/packages/muelu/doc/UsersGuide/masterList.xml +++ b/packages/muelu/doc/UsersGuide/masterList.xml @@ -146,6 +146,15 @@ Pass parameters to the underlying linear algebra not supported by ML false + + + + half precision + bool + Build half precision preconditioner + not supported by ML + false + false @@ -942,7 +951,7 @@ both (``both''). not supported by ML - + diff --git a/packages/muelu/doc/UsersGuide/paramlist_hidden.tex b/packages/muelu/doc/UsersGuide/paramlist_hidden.tex index 5b5c5c8be0bb..03e394f9e8e7 100644 --- a/packages/muelu/doc/UsersGuide/paramlist_hidden.tex +++ b/packages/muelu/doc/UsersGuide/paramlist_hidden.tex @@ -32,6 +32,8 @@ \cba{matvec params}{\parameterlist}{Pass parameters to the underlying linear algebra} +\cbb{half precision}{bool}{false}{Build half precision preconditioner} + \cbb{smoother: pre or post}{string}{"both"}{Pre- and post-smoother combination. Possible values: "pre" (only pre-smoother), "post" (only post-smoother), "both" (both pre-and post-smoothers), "none" (no smoothing).} \cbb{smoother: type}{string}{"RELAXATION"}{Smoother type. Possible values: see Table~\ref{tab:smoothers}.} diff --git a/packages/muelu/example/basic/Stratimikos.cpp b/packages/muelu/example/basic/Stratimikos.cpp index a3b02f798ac0..9e0d54ed97d3 100644 --- a/packages/muelu/example/basic/Stratimikos.cpp +++ b/packages/muelu/example/basic/Stratimikos.cpp @@ -223,11 +223,12 @@ int MainWrappers::main_(Teuchos::Command linearSolverBuilder.setPreconditioningStrategyFactory(Teuchos::abstractFactoryStd(), "Ifpack2"); #endif - // add coordinates to parameter list + // add coordinates and nullspace to parameter list if (paramList->isSublist("Preconditioner Types") && paramList->sublist("Preconditioner Types").isSublist("MueLu")) { ParameterList& userParamList = paramList->sublist("Preconditioner Types").sublist("MueLu").sublist("user data"); userParamList.set >("Coordinates", coordinates); + userParamList.set >("Nullspace", nullspace); } // Setup solver parameters using a Stratimikos parameter list. 
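For context on the two MueLu-facing changes above: the preconditioner option is now spelled "half precision" (rather than "refmaxwell: half precision"), and the Stratimikos example now passes both Coordinates and Nullspace through the MueLu "user data" sublist. The following sketch illustrates how a caller might wire this up in a Stratimikos parameter list. It is illustrative only: the helper name setMueLuUserData and the assumption that coordinates and nullspace arrive as already-built Xpetra multivectors of the shown template types are not part of this change set.

// Illustrative sketch (not part of the diff): populate the MueLu sublist of a
// Stratimikos parameter list with user data and the new "half precision" flag.
#include "Teuchos_ParameterList.hpp"
#include "Teuchos_RCP.hpp"
#include "Teuchos_ScalarTraits.hpp"
#include "Xpetra_MultiVector.hpp"

template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
void setMueLuUserData(Teuchos::ParameterList& stratimikosParams,
                      const Teuchos::RCP<Xpetra::MultiVector<typename Teuchos::ScalarTraits<Scalar>::magnitudeType,
                                                             LocalOrdinal, GlobalOrdinal, Node> >& coordinates,
                      const Teuchos::RCP<Xpetra::MultiVector<Scalar, LocalOrdinal, GlobalOrdinal, Node> >& nullspace)
{
  // MueLu's settings live under "Preconditioner Types" -> "MueLu", as in the
  // Stratimikos.cpp example above.
  Teuchos::ParameterList& mueluParams =
      stratimikosParams.sublist("Preconditioner Types").sublist("MueLu");

  // New option documented in masterList.xml: build the hierarchy in half precision.
  mueluParams.set("half precision", true);

  // User data picked up when the Xpetra preconditioner is created.
  Teuchos::ParameterList& userData = mueluParams.sublist("user data");
  userData.set("Coordinates", coordinates);
  userData.set("Nullspace", nullspace);
}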
diff --git a/packages/muelu/src/MueCentral/MueLu_Hierarchy_def.hpp b/packages/muelu/src/MueCentral/MueLu_Hierarchy_def.hpp index e9211c47efae..5599bc480fe2 100644 --- a/packages/muelu/src/MueCentral/MueLu_Hierarchy_def.hpp +++ b/packages/muelu/src/MueCentral/MueLu_Hierarchy_def.hpp @@ -1301,6 +1301,8 @@ namespace MueLu { oss << "\n--------------------------------------------------------------------------------\n"; oss << "--- Multigrid Summary " << std::setw(28) << std::left << label << "---\n"; oss << "--------------------------------------------------------------------------------" << std::endl; + if (verbLevel & Parameters1) + oss << "Scalar = " << Teuchos::ScalarTraits::name() << std::endl; oss << "Number of levels = " << numLevels << std::endl; oss << "Operator complexity = " << std::setprecision(2) << std::setiosflags(std::ios::fixed) << GetOperatorComplexity() << std::endl; diff --git a/packages/muelu/src/MueCentral/MueLu_MasterList.cpp b/packages/muelu/src/MueCentral/MueLu_MasterList.cpp index f1bdcba59603..61ba5bdf1062 100644 --- a/packages/muelu/src/MueCentral/MueLu_MasterList.cpp +++ b/packages/muelu/src/MueCentral/MueLu_MasterList.cpp @@ -184,6 +184,7 @@ namespace MueLu { "" "" "" + "" "" "" "" @@ -570,6 +571,8 @@ namespace MueLu { ("matvec params","matvec params") + ("half precision","half precision") + ("smoother: pre or post","smoother: pre or post") ("smoother: type","smoother: type") diff --git a/packages/piro/src/Piro_NOXSolver_Def.hpp b/packages/piro/src/Piro_NOXSolver_Def.hpp index 967f6f220974..c5916bf95bc2 100644 --- a/packages/piro/src/Piro_NOXSolver_Def.hpp +++ b/packages/piro/src/Piro_NOXSolver_Def.hpp @@ -148,8 +148,6 @@ void Piro::NOXSolver::evalModelImpl( Thyra::SolveStatus solve_status; const Thyra::SolveCriteria solve_criteria; - Teuchos::ParameterList analysisParams; - if(solveState) { const auto timer = Teuchos::rcp(new Teuchos::TimeMonitor(*Teuchos::TimeMonitor::getNewTimer("Piro::NOXSolver::evalModelImpl::solve"))); @@ -201,7 +199,7 @@ void Piro::NOXSolver::evalModelImpl( const RCP > finalSolution = solver->get_current_x(); modelInArgs.set_x(finalSolution); - this->evalConvergedModelResponsesAndSensitivities(modelInArgs, outArgs, analysisParams); + this->evalConvergedModelResponsesAndSensitivities(modelInArgs, outArgs, *appParams); bool computeReducedHessian = false; for (int g_index=0; g_indexnum_g(); ++g_index) { diff --git a/packages/piro/src/Piro_ThyraProductME_Constraint_SimOpt.hpp b/packages/piro/src/Piro_ThyraProductME_Constraint_SimOpt.hpp index a4a4a6b3612f..0303f19986cc 100644 --- a/packages/piro/src/Piro_ThyraProductME_Constraint_SimOpt.hpp +++ b/packages/piro/src/Piro_ThyraProductME_Constraint_SimOpt.hpp @@ -742,6 +742,10 @@ class ThyraProductME_Constraint_SimOpt : public ROL::Constraint_SimOpt { } } + void solve_update(const ROL::Vector &u, const ROL::Vector &z, ROL::UpdateType type, int iter = -1) { + this->update(u, z, type, iter); + } + void solve(ROL::Vector &c, ROL::Vector &u, const ROL::Vector &z, @@ -1160,6 +1164,11 @@ class ThyraProductME_Constraint_SimOpt : public ROL::Constraint_SimOpt { params->set("Optimizer Iteration Number", iter); } + void update_1( const ROL::Vector &u, ROL::UpdateType /*type*/, int iter = -1 ) { + //temporary implementation using update_1 function + this->update_1(u, true, iter); + } + /** \brief Update constraint functions with respect to Opt variable. 
x is the optimization variable, flag = ??, @@ -1190,6 +1199,11 @@ class ThyraProductME_Constraint_SimOpt : public ROL::Constraint_SimOpt { params->set("Optimizer Iteration Number", iter); } + void update_2( const ROL::Vector &z, ROL::UpdateType /*type*/, int iter = -1 ) { + //temporary implementation using update_1 function + this->update_2(z, true, iter); + } + bool z_hasChanged(const ROL::Vector &rol_z) const { bool changed = true; if (Teuchos::nonnull(rol_z_ptr)) { diff --git a/packages/piro/src/Piro_ThyraProductME_Objective_SimOpt.hpp b/packages/piro/src/Piro_ThyraProductME_Objective_SimOpt.hpp index 9dedb95a0a11..6d0055bc51b6 100644 --- a/packages/piro/src/Piro_ThyraProductME_Objective_SimOpt.hpp +++ b/packages/piro/src/Piro_ThyraProductME_Objective_SimOpt.hpp @@ -678,6 +678,11 @@ class ThyraProductME_Objective_SimOpt : public ROL::Objective_SimOpt { params->set("Optimizer Iteration Number", iter); } + void update( const ROL::Vector &u, const ROL::Vector &z, ROL::UpdateType /*type*/, int iter = -1) { + //temporary implementation using old update function + this->update( u, z, true, iter); + } + bool z_hasChanged(const ROL::Vector &rol_z) const { bool changed = true; if (Teuchos::nonnull(rol_z_ptr)) { diff --git a/packages/rol/adapters/thyra/src/vector/ROL_ThyraVector.hpp b/packages/rol/adapters/thyra/src/vector/ROL_ThyraVector.hpp index 53d7b0714874..44144f6f815b 100644 --- a/packages/rol/adapters/thyra/src/vector/ROL_ThyraVector.hpp +++ b/packages/rol/adapters/thyra/src/vector/ROL_ThyraVector.hpp @@ -274,6 +274,14 @@ class ThyraVector : public Vector { return ::Thyra::dot(*thyra_vec_, *ex.thyra_vec_); } + /** \brief Apply \f$\mathtt{*this}\f$ to a dual vector. This is equivalent + to the call \f$\mathtt{this->dot(x.dual())}\f$. + */ + Real apply( const Vector &x ) const { + const ThyraVector &ex = dynamic_cast(x); + return ::Thyra::dot(*thyra_vec_, *ex.thyra_vec_); + } + /** \brief Returns \f$ \| y \| \f$ where \f$y = \mbox{*this}\f$. */ Real norm() const { diff --git a/packages/stk/stk_doc_tests/stk_mesh/howToUseNgpFieldAsyncCopy.cpp b/packages/stk/stk_doc_tests/stk_mesh/howToUseNgpFieldAsyncCopy.cpp new file mode 100644 index 000000000000..fc8ef13f0d77 --- /dev/null +++ b/packages/stk/stk_doc_tests/stk_mesh/howToUseNgpFieldAsyncCopy.cpp @@ -0,0 +1,114 @@ +// Copyright 2002 - 2008, 2010, 2011 National Technology Engineering +// Solutions of Sandia, LLC (NTESS). Under the terms of Contract +// DE-NA0003525 with NTESS, the U.S. Government retains certain rights +// in this software. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// * Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// +// * Redistributions in binary form must reproduce the above +// copyright notice, this list of conditions and the following +// disclaimer in the documentation and/or other materials provided +// with the distribution. +// +// * Neither the name of NTESS nor the names of its contributors +// may be used to endorse or promote products derived from this +// software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +// A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT +// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +namespace { + +void check_field_data_on_device(stk::mesh::BulkData& bulk, stk::mesh::NgpField& ngpDoubleField, + stk::mesh::NgpField& ngpIntField, double expectedDoubleValue, int expectedIntValue) +{ + stk::mesh::NgpMesh& ngpMesh = stk::mesh::get_updated_ngp_mesh(bulk); + stk::mesh::Selector selector = bulk.mesh_meta_data().universal_part(); + + stk::mesh::for_each_entity_run(ngpMesh, stk::topology::ELEM_RANK, selector, + KOKKOS_LAMBDA(const stk::mesh::FastMeshIndex& elem) + { + double doubleFieldValue = ngpDoubleField(elem, 0); + NGP_EXPECT_EQ(expectedDoubleValue, doubleFieldValue); + + int intFieldValue = ngpIntField(elem, 0); + NGP_EXPECT_EQ(expectedIntValue, intFieldValue); + }); +} + +TEST(stkMeshHowTo, ngpFieldAsyncCopy) +{ + MPI_Comm communicator = MPI_COMM_WORLD; + if(stk::parallel_machine_size(communicator) > 1) { GTEST_SKIP();} + + using DoubleField = stk::mesh::Field; + using IntField = stk::mesh::Field; + + const unsigned spatialDimension = 3; + stk::mesh::MetaData meta(spatialDimension); + stk::mesh::BulkData bulk(meta, MPI_COMM_WORLD); + + unsigned numStates = 1; + DoubleField& doubleField = meta.declare_field(stk::topology::ELEM_RANK, "doubleField", numStates); + IntField& intField = meta.declare_field(stk::topology::ELEM_RANK, "intField", numStates); + + double initialDoubleFieldValue = 1.0; + double modifiedDoubleFieldValue = initialDoubleFieldValue*2; + int initialIntFieldValue = 2; + int modifiedIntFieldValue = initialIntFieldValue*2; + + stk::mesh::put_field_on_entire_mesh_with_initial_value(doubleField, &initialDoubleFieldValue); + stk::mesh::put_field_on_entire_mesh_with_initial_value(intField, &initialIntFieldValue); + stk::io::fill_mesh("generated:1x1x1", bulk); + + stk::mesh::NgpField& ngpDoubleField = stk::mesh::get_updated_ngp_field(doubleField); + stk::mesh::NgpField& ngpIntField = stk::mesh::get_updated_ngp_field(intField); + + stk::mesh::ExecSpaceWrapper<> execSpaceWithStream1 = stk::mesh::get_execution_space_with_stream(); + stk::mesh::ExecSpaceWrapper<> execSpaceWithStream2 = stk::mesh::get_execution_space_with_stream(); + + stk::mesh::Entity elem = bulk.get_entity(stk::topology::ELEM_RANK, 1u); + double* doubleData = reinterpret_cast(stk::mesh::field_data(doubleField, elem)); + *doubleData = initialDoubleFieldValue*2; + int* intData = reinterpret_cast(stk::mesh::field_data(intField, elem)); + *intData = initialIntFieldValue*2; + + ngpDoubleField.modify_on_host(); + ngpDoubleField.sync_to_device(execSpaceWithStream1); + ngpIntField.modify_on_host(); + ngpIntField.sync_to_device(execSpaceWithStream2.get_execution_space()); + + stk::mesh::ngp_field_fence(meta); + + check_field_data_on_device(bulk, ngpDoubleField, ngpIntField, modifiedDoubleFieldValue, modifiedIntFieldValue); +} + + +} \ No newline at end of file diff --git a/packages/stk/stk_emend/Jamfile b/packages/stk/stk_emend/Jamfile index 
23abc5dac028..250569099b40 100644 --- a/packages/stk/stk_emend/Jamfile +++ b/packages/stk/stk_emend/Jamfile @@ -60,7 +60,7 @@ import os ; local stk_emend-root-inc ; stk_emend-root-inc = [ ifuserbuild $(sierra-root)/stk/stk_emend/include ] - [ ifdevbuild $(stk_emend-root)/independent_set ] ; + [ ifdevbuild $(stk_emend-root) ] ; project votd : requirements @@ -92,7 +92,7 @@ install install-user-jamfile explicit install-user-include ; install install-user-include - : [ path.glob-tree $(stk_emend-root)/stk_emend/independent_set : *.h *.hpp ] + : [ path.glob-tree $(stk_emend-root) : *.h *.hpp ] : $(install-root)/stk/stk_emend/include ; @@ -130,5 +130,4 @@ alias independent_set_lib : # No build requirements : # No default build : - $(stk_emend-root-inc)/.. ; diff --git a/packages/stk/stk_emend/stk_emend/independent_set/CMakeLists.txt b/packages/stk/stk_emend/stk_emend/independent_set/CMakeLists.txt index 99f9282d5cde..65456ef38091 100644 --- a/packages/stk/stk_emend/stk_emend/independent_set/CMakeLists.txt +++ b/packages/stk/stk_emend/stk_emend/independent_set/CMakeLists.txt @@ -50,5 +50,5 @@ SET(HEADERS "") FILE(GLOB HEADERS *.hpp) INSTALL(FILES ${HEADERS} DESTINATION - ${CMAKE_INSTALL_PREFIX}/${${PROJECT_NAME}_INSTALL_INCLUDE_DIR}/stk_emend/) + ${CMAKE_INSTALL_PREFIX}/${${PROJECT_NAME}_INSTALL_INCLUDE_DIR}/stk_emend/independent_set) diff --git a/packages/stk/stk_integration_tests/cmake_install_test/build_stk_no_stk_io_using_cmake b/packages/stk/stk_integration_tests/cmake_install_test/build_stk_no_stk_io_using_cmake index 67af4268af35..2dbfba78fe47 100755 --- a/packages/stk/stk_integration_tests/cmake_install_test/build_stk_no_stk_io_using_cmake +++ b/packages/stk/stk_integration_tests/cmake_install_test/build_stk_no_stk_io_using_cmake @@ -6,7 +6,7 @@ exe() { } sierra_proj=${SIERRA_PROJ:-${PWD}} -output_dir=${OUTPUT_DIR:-${PWD}/../stk-no-stk-io-cmake-testing} +output_dir=${OUTPUT_DIR:-${PWD}/../stk-cmake-testing} trilinos_dir=${output_dir}/Trilinos cuda_on_or_off=${CUDA:-OFF} build_type=${CMAKE_BUILD_TYPE:-release} @@ -25,18 +25,19 @@ if [ ! -d ${trilinos_dir} ] ; then exe git clone -b develop https://github.com/trilinos/Trilinos.git ${trilinos_dir} else exe cd ${trilinos_dir} - exe git reset --hard develop + exe git checkout develop + exe git reset --hard origin/develop exe git pull fi trilinos_install_dir=${output_dir}/trilinos_install_${build_type}_${SIERRA_COMPILER} exe rm -rf $trilinos_install_dir -stk_build_dir=${output_dir}/stk_build_${build_type}_${SIERRA_COMPILER} +stk_build_dir=${output_dir}/stk_build_no_stk_io_${build_type}_${SIERRA_COMPILER} exe rm -rf $stk_build_dir exe mkdir -p $stk_build_dir -stk_cmake_testing=${sierra_proj}/stk/stk_integration_tests/cmake_install_test +stk_cmake_testing_source_dir=${sierra_proj}/stk/stk_integration_tests/cmake_install_test printf "\nUsing sierra project: ${sierra_proj}\n"; printf "Using build-type: ${build_type}\n"; @@ -58,7 +59,7 @@ if [ ! 
-L ${trilinos_dir}/packages/stk ] ; then exe ln -s ${sierra_proj}/stk ${trilinos_dir}/packages fi -exe cp ${stk_cmake_testing}/run_cmake_stk_no_stk_io ${stk_build_dir} +exe cp ${stk_cmake_testing_source_dir}/run_cmake_stk_no_stk_io ${stk_build_dir} exe cd ${stk_build_dir} printf "Configuring trilinos/stk (running cmake)...\n"; exe 'TRILINOS_DIR=${trilinos_dir} \ diff --git a/packages/stk/stk_integration_tests/cmake_install_test/build_stk_no_stk_mesh_using_cmake b/packages/stk/stk_integration_tests/cmake_install_test/build_stk_no_stk_mesh_using_cmake index 522973c31312..824b8eb3a79b 100755 --- a/packages/stk/stk_integration_tests/cmake_install_test/build_stk_no_stk_mesh_using_cmake +++ b/packages/stk/stk_integration_tests/cmake_install_test/build_stk_no_stk_mesh_using_cmake @@ -6,7 +6,7 @@ exe() { } sierra_proj=${SIERRA_PROJ:-${PWD}} -output_dir=${OUTPUT_DIR:-${PWD}/../stk-no-stk-mesh-cmake-testing} +output_dir=${OUTPUT_DIR:-${PWD}/../stk-cmake-testing} trilinos_dir=${output_dir}/Trilinos build_type=${CMAKE_BUILD_TYPE:-release} date_suffix=`date +%F_%H-%M-%S` @@ -24,18 +24,19 @@ if [ ! -d ${trilinos_dir} ] ; then exe git clone -b develop https://github.com/trilinos/Trilinos.git ${trilinos_dir} else exe cd ${trilinos_dir} - exe git reset --hard develop + exe git checkout develop + exe git reset --hard origin/develop exe git pull fi trilinos_install_dir=${output_dir}/trilinos_install_${build_type}_${SIERRA_COMPILER} exe rm -rf $trilinos_install_dir -stk_build_dir=${output_dir}/stk_build_${build_type}_${SIERRA_COMPILER} +stk_build_dir=${output_dir}/stk_build_no_stk_mesh_${build_type}_${SIERRA_COMPILER} exe rm -rf $stk_build_dir exe mkdir -p $stk_build_dir -stk_cmake_testing=${sierra_proj}/stk/stk_integration_tests/cmake_install_test +stk_cmake_testing_source_dir=${sierra_proj}/stk/stk_integration_tests/cmake_install_test printf "\nUsing sierra project: ${sierra_proj}\n"; printf "Using build-type: ${build_type}\n"; @@ -56,7 +57,7 @@ if [ ! -L ${trilinos_dir}/packages/stk ] ; then exe ln -s ${sierra_proj}/stk ${trilinos_dir}/packages fi -exe cp ${stk_cmake_testing}/run_cmake_stk_no_stk_mesh ${stk_build_dir} +exe cp ${stk_cmake_testing_source_dir}/run_cmake_stk_no_stk_mesh ${stk_build_dir} exe cd ${stk_build_dir} printf "Configuring trilinos/stk (running cmake)...\n"; exe 'TRILINOS_DIR=${trilinos_dir} TRILINOS_INSTALL_DIR=${trilinos_install_dir} CMAKE_BUILD_TYPE=${build_type} ./run_cmake_stk_no_stk_mesh >& ${stk_config_log}' diff --git a/packages/stk/stk_integration_tests/cmake_install_test/build_stk_serial_using_cmake b/packages/stk/stk_integration_tests/cmake_install_test/build_stk_serial_using_cmake index 9c68eb1a5169..5e816da0bc17 100755 --- a/packages/stk/stk_integration_tests/cmake_install_test/build_stk_serial_using_cmake +++ b/packages/stk/stk_integration_tests/cmake_install_test/build_stk_serial_using_cmake @@ -6,7 +6,7 @@ exe() { } sierra_proj=${SIERRA_PROJ:-${PWD}} -output_dir=${OUTPUT_DIR:-${PWD}/../stk-serial-cmake-testing} +output_dir=${OUTPUT_DIR:-${PWD}/../stk-cmake-testing} trilinos_dir=${output_dir}/Trilinos build_type=${CMAKE_BUILD_TYPE:-release} date_suffix=`date +%F_%H-%M-%S` @@ -24,18 +24,19 @@ if [ ! 
-d ${trilinos_dir} ] ; then exe git clone -b develop https://github.com/trilinos/Trilinos.git ${trilinos_dir} else exe cd ${trilinos_dir} - exe git reset --hard develop + exe git checkout develop + exe git reset --hard origin/develop exe git pull fi trilinos_install_dir=${output_dir}/trilinos_install_${build_type}_${SIERRA_COMPILER} exe rm -rf $trilinos_install_dir -stk_build_dir=${output_dir}/stk_build_${build_type}_${SIERRA_COMPILER} +stk_build_dir=${output_dir}/stk_build_serial_${build_type}_${SIERRA_COMPILER} exe rm -rf $stk_build_dir exe mkdir -p $stk_build_dir -stk_cmake_testing=${sierra_proj}/stk/stk_integration_tests/cmake_install_test +stk_cmake_testing_source_dir=${sierra_proj}/stk/stk_integration_tests/cmake_install_test printf "\nUsing sierra project: ${sierra_proj}\n"; printf "Using build-type: ${build_type}\n"; @@ -56,7 +57,7 @@ if [ ! -L ${trilinos_dir}/packages/stk ] ; then exe ln -s ${sierra_proj}/stk ${trilinos_dir}/packages fi -exe cp ${stk_cmake_testing}/run_cmake_stk_serial ${stk_build_dir} +exe cp ${stk_cmake_testing_source_dir}/run_cmake_stk_serial ${stk_build_dir} exe cd ${stk_build_dir} printf "Configuring trilinos/stk (running cmake)...\n"; exe 'TRILINOS_DIR=${trilinos_dir} TRILINOS_INSTALL_DIR=${trilinos_install_dir} CMAKE_BUILD_TYPE=${build_type} CMAKE_CXX_FLAGS="-Wno-pragmas -Wno-unknown-pragmas -Werror" ./run_cmake_stk_serial >& ${stk_config_log}' diff --git a/packages/stk/stk_integration_tests/cmake_install_test/build_stk_user_facing b/packages/stk/stk_integration_tests/cmake_install_test/build_stk_user_facing index 1b8207656b18..22dfe1cd6c33 100755 --- a/packages/stk/stk_integration_tests/cmake_install_test/build_stk_user_facing +++ b/packages/stk/stk_integration_tests/cmake_install_test/build_stk_user_facing @@ -6,7 +6,7 @@ exe() { } sierra_proj=${SIERRA_PROJ:-${PWD}} -output_dir=${OUTPUT_DIR:-${PWD}/../stk-user-facing-cmake-testing} +output_dir=${OUTPUT_DIR:-${PWD}/../stk-cmake-testing} trilinos_dir=${output_dir}/Trilinos build_type=${CMAKE_BUILD_TYPE:-release} date_suffix=`date +%F_%H-%M-%S` @@ -24,18 +24,19 @@ if [ ! -d ${trilinos_dir} ] ; then exe git clone -b develop https://github.com/trilinos/Trilinos.git ${trilinos_dir} else exe cd ${trilinos_dir} - exe git reset --hard develop + exe git checkout develop + exe git reset --hard origin/develop exe git pull fi trilinos_install_dir=${output_dir}/trilinos_install_${build_type}_${SIERRA_COMPILER} exe rm -rf $trilinos_install_dir -stk_build_dir=${output_dir}/stk_build_${build_type}_${SIERRA_COMPILER} +stk_build_dir=${output_dir}/stk_build_user_facing_${build_type}_${SIERRA_COMPILER} exe rm -rf $stk_build_dir exe mkdir -p $stk_build_dir -stk_cmake_testing=${sierra_proj}/stk/stk_integration_tests/cmake_install_test +stk_cmake_testing_source_dir=${sierra_proj}/stk/stk_integration_tests/cmake_install_test stk_app_build_dir=${output_dir}/stk_test_app_build exe rm -rf ${stk_app_build_dir} @@ -48,9 +49,6 @@ printf "Putting output and logs here: ${output_dir}\n"; exe cd $sierra_proj -exe rm -rf objs/tpls/trilinos_tpls/* - -bake_log=${output_dir}/bake-stk_balance.out.$date_suffix stk_config_log=${output_dir}/stk-user-facing-config.out.$date_suffix stk_make_log=${output_dir}/stk-user-facing-make.out.$date_suffix stk_install_log=${output_dir}/stk-user-facing-install.out.$date_suffix @@ -72,12 +70,13 @@ if [ ! 
-L ${trilinos_dir}/packages/stk ] ; then exe ln -s ${sierra_proj}/stk ${trilinos_dir}/packages fi -exe cp ${stk_cmake_testing}/run_cmake_stk_user_facing ${stk_build_dir} +exe cp ${stk_cmake_testing_source_dir}/run_cmake_stk_user_facing ${stk_build_dir} exe cd ${stk_build_dir} -printf "\nGenerating pre-requisite TPL paths in ${stk_build_dir}/tpl_paths\n"; -exe ${stk_cmake_testing}/generate_tpl_paths_from_sierra_proj ${sierra_proj} ${output_dir} ${stk_build_dir}/tpl_paths + +exe source ${stk_cmake_testing_source_dir}/load_gcc_modules + printf "Configuring trilinos/stk (running cmake)...\n"; -exe 'TRILINOS_DIR=${trilinos_dir} TRILINOS_INSTALL_DIR=${trilinos_install_dir} CMAKE_BUILD_TYPE=${build_type} ./run_cmake_stk_user_facing ${stk_build_dir}/tpl_paths >& ${stk_config_log}' +exe 'TRILINOS_DIR=${trilinos_dir} TRILINOS_INSTALL_DIR=${trilinos_install_dir} CMAKE_BUILD_TYPE=${build_type} ./run_cmake_stk_user_facing >& ${stk_config_log}' if [ $? -ne 0 ] ; then echo "!! error in stk/trilinos config, check output in ${stk_config_log} !!"; exit 1; @@ -106,10 +105,10 @@ printf "\nNow building stk test-app...\n"; exe rm -rf ${stk_app_build_dir} exe mkdir ${stk_app_build_dir} -exe cp ${stk_cmake_testing}/stk_test_app/run_cmake_stk_test_app ${stk_app_build_dir} +exe cp ${stk_cmake_testing_source_dir}/stk_test_app/run_cmake_stk_test_app ${stk_app_build_dir} exe cd ${stk_app_build_dir}; -exe './run_cmake_stk_test_app ${trilinos_install_dir} ${stk_cmake_testing}/stk_test_app >& ${stk_app_config_log}'; +exe './run_cmake_stk_test_app ${trilinos_install_dir} ${stk_cmake_testing_source_dir}/stk_test_app >& ${stk_app_config_log}'; if [ $? -ne 0 ] ; then printf "!! error running cmake for stk test-app, check output in ${stk_app_config_log}\n"; exit 1; diff --git a/packages/stk/stk_integration_tests/cmake_install_test/build_stk_using_cmake b/packages/stk/stk_integration_tests/cmake_install_test/build_stk_using_cmake index 7596c412f7a3..0bf7c7d5284d 100755 --- a/packages/stk/stk_integration_tests/cmake_install_test/build_stk_using_cmake +++ b/packages/stk/stk_integration_tests/cmake_install_test/build_stk_using_cmake @@ -29,7 +29,8 @@ if [ ! -d ${trilinos_dir} ] ; then exe git clone -b develop https://github.com/trilinos/Trilinos.git ${trilinos_dir} else exe cd "${trilinos_dir}" - exe git reset --hard develop + exe git checkout develop + exe git reset --hard origin/develop exe git pull fi @@ -40,7 +41,7 @@ stk_build_dir=${output_dir}/stk_build_${build_type}_${SIERRA_COMPILER} exe rm -rf $stk_build_dir exe mkdir -p $stk_build_dir -stk_cmake_testing=${sierra_proj}/stk/stk_integration_tests/cmake_install_test +stk_cmake_testing_source_dir=${sierra_proj}/stk/stk_integration_tests/cmake_install_test printf "\nUsing sierra project: ${sierra_proj}\n"; printf "Using build-type: ${build_type}\n"; @@ -72,10 +73,11 @@ if [ ! 
-L ${trilinos_dir}/packages/stk ] ; then exe ln -s ${sierra_proj}/stk ${trilinos_dir}/packages fi -exe cp ${stk_cmake_testing}/run_cmake_stk ${stk_build_dir} +exe cp ${stk_cmake_testing_source_dir}/run_cmake_stk ${stk_build_dir} exe cd ${stk_build_dir} -printf "\nGenerating pre-requisite TPL paths in ${stk_build_dir}/tpl_paths\n"; -exe ${stk_cmake_testing}/generate_tpl_paths_from_sierra_proj ${sierra_proj} ${output_dir} ${stk_build_dir}/tpl_paths + +exe source ${stk_cmake_testing_source_dir}/load_gcc_modules + printf "Configuring trilinos/stk (running cmake)...\n"; exe "TRILINOS_DIR=${trilinos_dir} \ TRILINOS_INSTALL_DIR=${trilinos_install_dir} \ diff --git a/packages/stk/stk_integration_tests/cmake_install_test/load_gcc_modules b/packages/stk/stk_integration_tests/cmake_install_test/load_gcc_modules new file mode 100644 index 000000000000..723ff2582004 --- /dev/null +++ b/packages/stk/stk_integration_tests/cmake_install_test/load_gcc_modules @@ -0,0 +1,11 @@ +#!/bin/bash + +module load sierra-devel +module load cde/v2/gcc/7.2.0/netlib-lapack/3.8.0 +module load cde/v2/gcc/7.2.0/boost/1.73.0 +module load cde/v2/gcc/7.2.0/hdf5/1.10.6 +module load cde/v2/gcc/7.2.0/netcdf-c/4.7.3 +module load cde/v2/gcc/7.2.0/parallel-netcdf/1.12.1 +module load cde/v2/gcc/7.2.0/metis/5.1.0 +module load cde/v2/gcc/7.2.0/parmetis/4.0.3 + diff --git a/packages/stk/stk_integration_tests/cmake_install_test/run_cmake_stk b/packages/stk/stk_integration_tests/cmake_install_test/run_cmake_stk index 1b7530b06fde..aa3c96b34b28 100755 --- a/packages/stk/stk_integration_tests/cmake_install_test/run_cmake_stk +++ b/packages/stk/stk_integration_tests/cmake_install_test/run_cmake_stk @@ -8,6 +8,7 @@ fortran_macro=${FORTRAN_MACRO:-FORTRAN_ONE_UNDERSCORE} cmake_cxx_flags=${CMAKE_CXX_FLAGS} cmake_exe_linker_flags=${CMAKE_EXE_LINKER_FLAGS} cuda_on_or_off=${CUDA:-OFF} +CUDA=$cuda_on_or_off printf "\nTRILINOS_DIR=${trilinos_src_dir}\n"; printf "BUILD_DIR=${build_dir}\n"; @@ -30,11 +31,6 @@ if [ "${CUDA}" == "ON" ] ; then fi printf "not_cuda: ${not_cuda}\n"; -if [ $# -ne 1 ] ; then - echo "Usage: run_cmake_stk tpl_paths_file"; - exit 1; -fi - if [ ! -d ${trilinos_src_dir}/packages/seacas ] && [ ! -L ${trilinos_src_dir}/packages/seacas ] ; then echo "Trilinos dir (${trilinos_src_dir}) doesn't have packages/seacas directory. If using a Sierra project, make a soft-link to Sierra's seacas directory."; exit 1; @@ -44,15 +40,6 @@ if [ ! -d ${trilinos_src_dir}/packages/stk ] && [ ! -L ${trilinos_src_dir}/packa exit 1; fi -tpl_paths=$1 - -if [ ! 
-f ${tpl_paths} ] ; then - echo "Error, tpl_paths file ${tpl_paths} not found."; - exit 1; -fi - -source ${tpl_paths} - mkdir -p $trilinos_install_dir mkdir -p $build_dir @@ -83,7 +70,7 @@ cmake \ -DMPI_BASE_DIR:PATH=$mpi_base_dir \ -DTrilinos_ENABLE_Tpetra:BOOL=ON \ -DTpetraCore_ENABLE_TESTS:BOOL=OFF \ --DTpetra_ENABLE_DEPRECATED_CODE:BOOL=OFF \ +-DTpetra_ENABLE_DEPRECATED_CODE:BOOL=ON \ -DTrilinos_ENABLE_Zoltan2:BOOL=ON \ -DZoltan2_ENABLE_ParMETIS:BOOL=ON \ -DTrilinos_ENABLE_Pamgen:BOOL=ON \ @@ -113,19 +100,9 @@ cmake \ -DTPL_Netcdf_Enables_Pnetcdf:BOOL=ON \ -DTPL_ENABLE_Zlib:BOOL=ON \ -DTPL_ENABLE_ParMETIS:BOOL=ON \ --DParMETIS_INCLUDE_DIRS:PATH="${PARMETIS_INC_DIR};${METIS_INC_DIR}" \ --DParMETIS_LIBRARY_DIRS:PATH="${PARMETIS_LIB_DIR};${METIS_LIB_DIR}" \ -DTPL_ENABLE_Pnetcdf:BOOL=ON \ --DNetCDF_INCLUDE_DIRS:PATH=${NETCDF_INC_DIR} \ --DNetCDF_LIBRARIES="-L${NETCDF_LIB_DIR} -lnetcdf -L${PNETCDF_LIB_DIR} -lpnetcdf -L${HDF_LIB_DIR} -lhdf5_hl -lhdf5 -ldl" \ +-DNetCDF_LIBRARIES="-lnetcdf -lpnetcdf -lhdf5_hl -lhdf5 -ldl" \ -DTPL_ENABLE_HDF5:BOOL=ON \ --DHDF5_INCLUDE_DIRS:PATH=${HDF_INC_DIR} \ --DHDF5_LIBRARY_DIRS:PATH=${HDF_LIB_DIR} \ --DPnetcdf_INCLUDE_DIRS:PATH=${PNETCDF_INC_DIR} \ --DPnetcdf_LIBRARY_DIRS:PATH=${PNETCDF_LIB_DIR} \ --DBoost_INCLUDE_DIRS:PATH="$BOOST_INC_DIR" \ --DTPL_BLAS_LIBRARIES="${BLAS_LIBRARIES}" \ --DTPL_LAPACK_LIBRARIES="${LAPACK_LIBRARIES}" \ -DCMAKE_EXE_LINKER_FLAGS="${cmake_exe_linker_flags}" \ ${trilinos_src_dir}/ diff --git a/packages/stk/stk_integration_tests/cmake_install_test/run_cmake_stk_user_facing b/packages/stk/stk_integration_tests/cmake_install_test/run_cmake_stk_user_facing index af9605295e35..0bb4847a8faa 100755 --- a/packages/stk/stk_integration_tests/cmake_install_test/run_cmake_stk_user_facing +++ b/packages/stk/stk_integration_tests/cmake_install_test/run_cmake_stk_user_facing @@ -19,18 +19,6 @@ printf "TRILINOS_INSTALL_DIR=${trilinos_install_dir}\n"; printf "FORTRAN_MACRO=${fortran_macro}\n"; printf "\nTo change these vars, set as env vars or pass to this script like 'VAR=value run_cmake_stk_user_facing'\n\n"; -if [ $# -ne 1 ] ; then - echo "Usage: run_cmake_stk_user_facing tpl_paths_file"; - exit 1; -fi -tpl_paths=$1 -if [ ! 
-f ${tpl_paths} ] ; then - echo "Error, tpl_paths file ${tpl_paths} not found."; - exit 1; -fi - -source ${tpl_paths} - mkdir -p $trilinos_install_dir mkdir -p $build_dir @@ -61,7 +49,7 @@ cmake \ -DMPI_BASE_DIR:PATH=$mpi_base_dir \ -DTrilinos_ENABLE_Tpetra:BOOL=ON \ -DTpetraCore_ENABLE_TESTS:BOOL=OFF \ --DTpetra_ENABLE_DEPRECATED_CODE:BOOL=OFF \ +-DTpetra_ENABLE_DEPRECATED_CODE:BOOL=ON \ -DTrilinos_ENABLE_Zoltan2:BOOL=ON \ -DZoltan2_ENABLE_ParMETIS:BOOL=ON \ -DTPL_ENABLE_CUDA:BOOL=${cuda_on_or_off} \ @@ -83,20 +71,10 @@ cmake \ -DTPL_Netcdf_Enables_Netcdf4:BOOL=ON \ -DTPL_Netcdf_Enables_Pnetcdf:BOOL=ON \ -DTPL_ENABLE_Zlib:BOOL=ON \ +-DHAVE_PARMETIS_VERSION_4_0_3=ON \ -DTPL_ENABLE_ParMETIS:BOOL=ON \ --DParMETIS_INCLUDE_DIRS:PATH="${PARMETIS_INC_DIR};${METIS_INC_DIR}" \ --DParMETIS_LIBRARY_DIRS:PATH="${PARMETIS_LIB_DIR};${METIS_LIB_DIR}" \ -DTPL_ENABLE_Pnetcdf:BOOL=ON \ --DNetCDF_INCLUDE_DIRS:PATH=${NETCDF_INC_DIR} \ --DNetCDF_LIBRARIES="-L${NETCDF_LIB_DIR} -lnetcdf -L${PNETCDF_LIB_DIR} -lpnetcdf -L${HDF_LIB_DIR} -lhdf5_hl -lhdf5 -ldl" \ -DTPL_ENABLE_HDF5:BOOL=ON \ --DHDF5_INCLUDE_DIRS:PATH=${HDF_INC_DIR} \ --DHDF5_LIBRARY_DIRS:PATH=${HDF_LIB_DIR} \ --DPnetcdf_INCLUDE_DIRS:PATH=${PNETCDF_INC_DIR} \ --DPnetcdf_LIBRARY_DIRS:PATH=${PNETCDF_LIB_DIR} \ --DBoost_INCLUDE_DIRS:PATH="$BOOST_INC_DIR" \ --DTPL_BLAS_LIBRARIES="${BLAS_LIBRARIES}" \ --DTPL_LAPACK_LIBRARIES="${LAPACK_LIBRARIES}" \ -DCMAKE_EXE_LINKER_FLAGS="${cmake_exe_linker_flags}" \ ${trilinos_src_dir}/ diff --git a/packages/stk/stk_integration_tests/cmake_install_test/stk_test_app/run_cmake_stk_test_app b/packages/stk/stk_integration_tests/cmake_install_test/stk_test_app/run_cmake_stk_test_app index 8b0cce8dc760..44586dc9d1dd 100755 --- a/packages/stk/stk_integration_tests/cmake_install_test/stk_test_app/run_cmake_stk_test_app +++ b/packages/stk/stk_integration_tests/cmake_install_test/stk_test_app/run_cmake_stk_test_app @@ -1,6 +1,9 @@ TRILINOS_ROOT_DIR=$1 TEST_STK_APP_SOURCE_DIR=$2 +echo "Using Trilinos dir: $TRILINOS_ROOT_DIR" +echo "Using STK test-app dir: $TEST_STK_APP_SOURCE_DIR" + cmake \ -DCMAKE_BUILD_TYPE=${BUILD_TYPE:-RELEASE} \ -DENABLE_OPENMP=${ENABLE_OPENMP:-OFF} \ diff --git a/packages/stk/stk_mesh/stk_mesh/base/BulkData.cpp b/packages/stk/stk_mesh/stk_mesh/base/BulkData.cpp index 748f32c2475b..c9e3a89119d5 100644 --- a/packages/stk/stk_mesh/stk_mesh/base/BulkData.cpp +++ b/packages/stk/stk_mesh/stk_mesh/base/BulkData.cpp @@ -3639,7 +3639,7 @@ void BulkData::internal_change_ghosting( OrdinalVector removeParts(1, m_ghost_parts[ghosting.ordinal()]->mesh_meta_data_ordinal()); OrdinalVector scratchOrdinalVec, scratchSpace; - std::sort(removeRecvGhosts.begin(), removeRecvGhosts.end(), EntityLess(*this)); + stk::util::sort_and_unique(removeRecvGhosts, EntityLess(*this)); for (unsigned i=0; i void make_field_sync_debugger() const { diff --git a/packages/stk/stk_mesh/stk_mesh/base/GetNgpExecutionSpace.hpp b/packages/stk/stk_mesh/stk_mesh/base/GetNgpExecutionSpace.hpp new file mode 100644 index 000000000000..a9984a5eb97c --- /dev/null +++ b/packages/stk/stk_mesh/stk_mesh/base/GetNgpExecutionSpace.hpp @@ -0,0 +1,125 @@ +// Copyright 2002 - 2008, 2010, 2011 National Technology Engineering +// Solutions of Sandia, LLC (NTESS). Under the terms of Contract +// DE-NA0003525 with NTESS, the U.S. Government retains certain rights +// in this software. 
+// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// * Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// +// * Redistributions in binary form must reproduce the above +// copyright notice, this list of conditions and the following +// disclaimer in the documentation and/or other materials provided +// with the distribution. +// +// * Neither the name of NTESS nor the names of its contributors +// may be used to endorse or promote products derived from this +// software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// + +#ifndef NGP_EXECUTION_SPACE_HPP +#define NGP_EXECUTION_SPACE_HPP + +#include +#include +#include + +namespace stk { +namespace mesh { + +template +class ExecSpaceWrapper { + public: + operator const ExecSpaceType&() { return space; } + const char* name() const { return space.name(); } + void fence() const { space.fence(); } + const ExecSpaceType& get_execution_space() const { return space; } + + private: + ExecSpaceType space; +}; + +#ifdef KOKKOS_ENABLE_CUDA +namespace impl { +struct CudaStreamDeleter { + void operator()(cudaStream_t* stream) const { + cudaStreamDestroy(*stream); + delete stream; + } +}; + +struct ExecSpaceAndCudaStreamDeleter { + ExecSpaceAndCudaStreamDeleter() {} + ExecSpaceAndCudaStreamDeleter(cudaStream_t* streamPtr_) + : streamPtr(streamPtr_) + {} + ExecSpaceAndCudaStreamDeleter(const ExecSpaceAndCudaStreamDeleter& deleter) + { + streamPtr = deleter.streamPtr; + } + + void operator()(ExecSpace* e) const { + cudaStreamDestroy(*streamPtr); + delete streamPtr; + delete e; + } + + cudaStream_t* streamPtr; +}; +} + +template<> +class ExecSpaceWrapper { + public: + using ExecSpaceType = Kokkos::Cuda; + + ExecSpaceWrapper() + : stream(new cudaStream_t) { + cudaStreamCreate(stream); + space = std::shared_ptr(new ExecSpaceType(*stream), impl::ExecSpaceAndCudaStreamDeleter(stream)); + } + + operator const ExecSpaceType&() { return *space; } + const char* name() const { return space->name(); } + void fence() const { space->fence(); } + const ExecSpaceType& get_execution_space() const { return *space; } + + private: + std::shared_ptr space; + cudaStream_t* stream; +}; +#endif + +inline ExecSpaceWrapper<> get_execution_space_with_stream() +{ + bool launchBlockingIsOn = get_env_var_as_bool("CUDA_LAUNCH_BLOCKING", false); + static bool printedOnce = false; + + if(launchBlockingIsOn && !printedOnce) { + sierra::Env::outputP0() << "CUDA_LAUNCH_BLOCKING is ON. Asynchronous operations will block." 
<< std::endl; + printedOnce = true; + } + + auto execSpace = ExecSpaceWrapper<>(); + return execSpace; +} + +} +} + +#endif \ No newline at end of file diff --git a/packages/stk/stk_mesh/stk_mesh/base/GetNgpField.hpp b/packages/stk/stk_mesh/stk_mesh/base/GetNgpField.hpp index 369f56c1cefe..6004cba4e1f2 100644 --- a/packages/stk/stk_mesh/stk_mesh/base/GetNgpField.hpp +++ b/packages/stk/stk_mesh/stk_mesh/base/GetNgpField.hpp @@ -42,22 +42,32 @@ namespace stk { namespace mesh { template class NgpDebugger = DefaultNgpFieldSyncDebugger> -NgpField & get_updated_ngp_field(const FieldBase & stkField) +NgpField & get_updated_ngp_field_async(const FieldBase & stkField, const ExecSpace& execSpace) { NgpFieldBase * ngpField = impl::get_ngp_field(stkField); if (ngpField == nullptr) { - ngpField = new NgpField(stkField.get_mesh(), stkField, true); + ngpField = new NgpField(stkField.get_mesh(), stkField, execSpace, true); impl::set_ngp_field(stkField, ngpField); } else { if (stkField.get_mesh().synchronized_count() != ngpField->synchronized_count()) { + ngpField->set_execution_space(execSpace); ngpField->update_field(); } } + return dynamic_cast< NgpField& >(*ngpField); } +template class NgpDebugger = DefaultNgpFieldSyncDebugger> +NgpField & get_updated_ngp_field(const FieldBase & stkField) +{ + auto& ngpFieldRef = get_updated_ngp_field_async(stkField, Kokkos::DefaultExecutionSpace()); + impl::internal_fence_no_sync_to_host(ngpFieldRef); + return ngpFieldRef; +} + }} #endif // GETNGPFIELD_HPP diff --git a/packages/stk/stk_mesh/stk_mesh/base/NgpField.hpp b/packages/stk/stk_mesh/stk_mesh/base/NgpField.hpp index a6b9b1c5e30b..21961738922e 100644 --- a/packages/stk/stk_mesh/stk_mesh/base/NgpField.hpp +++ b/packages/stk/stk_mesh/stk_mesh/base/NgpField.hpp @@ -72,6 +72,57 @@ class EntityFieldData { unsigned fieldComponentStride; }; +enum NgpFieldSyncMode { + INVALID = 0, + HOST_TO_DEVICE = 1, + HOST_TO_DEVICE_ASYNC = 2, + DEVICE_TO_HOST = 3, + DEVICE_TO_HOST_ASYNC = 4 +}; + +struct AsyncCopyState { + + AsyncCopyState() + : execSpace(Kokkos::DefaultExecutionSpace()), + syncMode(INVALID) + {} + + KOKKOS_FUNCTION + AsyncCopyState(const AsyncCopyState& state) + : execSpace(state.execSpace), + syncMode(state.syncMode) + {} + + void set_state(const ExecSpace& space, NgpFieldSyncMode mode) + { + execSpace = space; + syncMode = mode; + } + + void set_execution_space(const ExecSpace& space) + { + execSpace = space; + } + + void reset_state() + { + execSpace = Kokkos::DefaultExecutionSpace(); + syncMode = INVALID; + } + + ExecSpace execSpace; + NgpFieldSyncMode syncMode; +}; + +namespace impl { + +template class NgpDebugger = DefaultNgpFieldSyncDebugger> +inline void internal_fence_no_sync_to_host(NgpField& ngpField) +{ + ngpField.asyncCopyState.execSpace.fence(); +} +} + template using UnmanagedHostInnerView = Kokkos::View>; template using UnmanagedDevInnerView = Kokkos::View>; @@ -84,7 +135,8 @@ class HostField : public NgpFieldBase HostField() : NgpFieldBase(), - field(nullptr) + field(nullptr), + asyncCopyState(AsyncCopyState()) { needSyncToHost = std::make_shared(false); needSyncToDevice = std::make_shared(false); @@ -92,13 +144,21 @@ class HostField : public NgpFieldBase HostField(const stk::mesh::BulkData& b, const stk::mesh::FieldBase& f, bool isFromGetUpdatedNgpField = false) : NgpFieldBase(), - field(&f) + field(&f), + asyncCopyState(AsyncCopyState()) { field->template make_field_sync_debugger(); needSyncToHost = std::make_shared(false); needSyncToDevice = std::make_shared(false); } + HostField(const 
stk::mesh::BulkData& b, const stk::mesh::FieldBase& f, + const ExecSpace& execSpace, bool isFromGetUpdatedNgpField = false) + : HostField(b, f, isFromGetUpdatedNgpField) + { + asyncCopyState.set_state(execSpace, INVALID); + } + HostField(const HostField&) = default; HostField(HostField&&) = default; HostField& operator=(const HostField&) = default; @@ -110,8 +170,11 @@ class HostField : public NgpFieldBase void set_field_states(HostField* fields[]) {} + void set_execution_space(const ExecSpace& execSpace) { asyncCopyState.set_execution_space(execSpace); } + size_t num_syncs_to_host() const override { return field->num_syncs_to_host(); } size_t num_syncs_to_device() const override { return field->num_syncs_to_device(); } + void fence() override {} unsigned get_component_stride() const { return 1; } @@ -123,8 +186,6 @@ class HostField : public NgpFieldBase return bucketOrdinal; } - - T& get(const HostMesh& ngpMesh, stk::mesh::Entity entity, int component, const char * fileName = HOST_DEBUG_FILE_NAME, int lineNumber = HOST_DEBUG_LINE_NUMBER) const { @@ -210,6 +271,12 @@ class HostField : public NgpFieldBase } void sync_to_host() override + { + sync_to_host(Kokkos::DefaultExecutionSpace()); + Kokkos::fence(); + } + + void sync_to_host(const ExecSpace& execSpace) override { if (need_sync_to_host()) { field->increment_num_syncs_to_host(); @@ -218,6 +285,12 @@ class HostField : public NgpFieldBase } void sync_to_device() override + { + sync_to_device(Kokkos::DefaultExecutionSpace()); + Kokkos::fence(); + } + + void sync_to_device(const ExecSpace& execSpace) override { if (need_sync_to_device()) { field->increment_num_syncs_to_device(); @@ -274,8 +347,11 @@ class HostField : public NgpFieldBase std::shared_ptr needSyncToHost; std::shared_ptr needSyncToDevice; -}; + template class NgpDebuggerType> + friend void impl::internal_fence_no_sync_to_host(NgpField& ngpField); + AsyncCopyState asyncCopyState; +}; template class NgpDebugger> class DeviceField : public NgpFieldBase @@ -300,6 +376,7 @@ class DeviceField : public NgpFieldBase synchronizedCount(0), userSpecifiedSelector(false), syncSelector(nullptr), + asyncCopyState(AsyncCopyState()), fieldSyncDebugger(nullptr) { const int maxStates = static_cast(stk::mesh::MaximumFieldStates); @@ -321,7 +398,42 @@ class DeviceField : public NgpFieldBase copyCounter("copy_counter"), userSpecifiedSelector(false), syncSelector(new Selector()), + asyncCopyState(AsyncCopyState()), + fieldSyncDebugger(nullptr) + { + ThrowRequireMsg(isFromGetUpdatedNgpField, "NgpField must be obtained from get_updated_ngp_field()"); + initialize(); + update_field(); + + fieldSyncDebugger.initialize_debug_views(this); + } + + DeviceField(const stk::mesh::BulkData& bulk, const stk::mesh::FieldBase &stkField, + const ExecSpace& execSpace, bool isFromGetUpdatedNgpField = false) + : NgpFieldBase(), + rank(stkField.entity_rank()), + ordinal(stkField.mesh_meta_data_ordinal()), + hostBulk(&bulk), + hostField(&stkField), + bucketCapacity(0), + numBucketsForField(0), + maxNumScalarsPerEntity(0), + synchronizedCount(bulk.synchronized_count()), + copyCounter("copy_counter"), + userSpecifiedSelector(false), + syncSelector(new Selector()), + asyncCopyState(AsyncCopyState()), fieldSyncDebugger(nullptr) + { + ThrowRequireMsg(isFromGetUpdatedNgpField, "NgpField must be obtained from get_updated_ngp_field()"); + asyncCopyState.set_state(execSpace, INVALID); + initialize(); + update_field(); + + fieldSyncDebugger.initialize_debug_views(this); + } + + void initialize() { hostField->template 
make_field_sync_debugger(); fieldSyncDebugger = NgpDebugger(&hostField->get_field_sync_debugger()); @@ -333,11 +445,6 @@ class DeviceField : public NgpFieldBase for (int s=0; s* fields[]) @@ -348,6 +455,11 @@ class DeviceField : public NgpFieldBase } } + void set_execution_space(const ExecSpace& execSpace) + { + asyncCopyState.set_execution_space(execSpace); + } + void update_field(bool needToSyncAllDataToDevice = false) override { ProfilingBlock prof("update_field for " + hostField->name()); @@ -398,10 +510,21 @@ class DeviceField : public NgpFieldBase reset_sync_selector(); } - size_t num_syncs_to_host() const override { return hostField->num_syncs_to_host(); } size_t num_syncs_to_device() const override { return hostField->num_syncs_to_device(); } + void fence() override { + asyncCopyState.execSpace.fence(); + + if(asyncCopyState.syncMode == DEVICE_TO_HOST_ASYNC) { + Selector selector = selectField(*hostField); + copy_host_data_to_stk_field_data(selector); + reset_sync_selector(); + } + + asyncCopyState.reset_state(); + } + void modify_on_host() override { set_modify_on_host(); @@ -457,28 +580,30 @@ class DeviceField : public NgpFieldBase void sync_to_host() override { - if (need_sync_to_host()) { - ProfilingBlock prof("copy_to_host for " + hostField->name()); - copy_device_to_host(); - fieldSyncDebugger.sync_to_host(this); - reset_sync_selector(); - } + asyncCopyState.set_state(Kokkos::DefaultExecutionSpace(), DEVICE_TO_HOST); + internal_sync_to_host(); + Kokkos::fence(); + asyncCopyState.reset_state(); + } + + void sync_to_host(const ExecSpace& newExecSpace) override + { + asyncCopyState.set_state(newExecSpace, DEVICE_TO_HOST_ASYNC); + internal_sync_to_host(); } void sync_to_device() override { - bool needToSyncToDevice = need_sync_to_device(); - if (needToSyncToDevice) { - ProfilingBlock prof("copy_to_device for " + hostField->name()); - if (hostBulk->synchronized_count() == synchronizedCount) { - copy_host_to_device(); - fieldSyncDebugger.sync_to_device(this); - } - else { - update_field(needToSyncToDevice); - } - reset_sync_selector(); - } + asyncCopyState.set_state(Kokkos::DefaultExecutionSpace(), HOST_TO_DEVICE); + internal_sync_to_device(); + Kokkos::fence(); + asyncCopyState.reset_state(); + } + + void sync_to_device(const ExecSpace& newExecSpace) override + { + asyncCopyState.set_state(newExecSpace, HOST_TO_DEVICE_ASYNC); + internal_sync_to_device(); } size_t synchronized_count() const override { return synchronizedCount; } @@ -780,7 +905,7 @@ class DeviceField : public NgpFieldBase UnmanagedView unInnerSrcView(srcPtr, bucketCapacity, numPerEntity); UnmanagedView unInnerDestView(destPtr, bucketCapacity, numPerEntity); - Kokkos::deep_copy(unInnerDestView, unInnerSrcView); + Kokkos::deep_copy(asyncCopyState.execSpace, unInnerDestView, unInnerSrcView); } template @@ -794,7 +919,7 @@ class DeviceField : public NgpFieldBase UnmanagedView unInnerSrcView(srcPtr, ORDER_INDICES(bucketCapacity, numPerEntity)); UnmanagedView unInnerDestView(destPtr, ORDER_INDICES(bucketCapacity, numPerEntity)); - Kokkos::deep_copy(unInnerDestView, unInnerSrcView); + Kokkos::deep_copy(asyncCopyState.execSpace, unInnerDestView, unInnerSrcView); } void copy_bucket_from_device_to_host(Bucket* bucket, unsigned numPerEntity, unsigned numContiguousBuckets = 1) @@ -807,12 +932,11 @@ class DeviceField : public NgpFieldBase T* bufferPtr = bufferData.data() + selectedBucketOffset * bucketCapacity * numPerEntity; UnmanagedDevInnerView unBufferInnerView(bufferPtr, bucketCapacity*numContiguousBuckets, numPerEntity); - 
transpose_contiguous_device_data_into_buffer(bucketCapacity*numContiguousBuckets, numPerEntity, unDeviceInnerView, unBufferInnerView); - Kokkos::fence(); + transpose_contiguous_device_data_into_buffer(asyncCopyState.execSpace, bucketCapacity*numContiguousBuckets, numPerEntity, unDeviceInnerView, unBufferInnerView); UnmanagedHostInnerView unHostInnerView(&hostData(selectedBucketOffset,0,0), bucketCapacity*numContiguousBuckets, numPerEntity); - Kokkos::deep_copy(unHostInnerView, unBufferInnerView); + Kokkos::deep_copy(asyncCopyState.execSpace, unHostInnerView, unBufferInnerView); } void copy_bucket_from_host_to_device(Bucket* bucket, unsigned maxNumPerEntity, unsigned numContiguousBuckets = 1) @@ -827,13 +951,12 @@ class DeviceField : public NgpFieldBase T* bufferPtr = bufferData.data() + selectedBucketOffset * bucketCapacity * maxNumPerEntity; UnmanagedDevInnerView unBufferInnerView(bufferPtr, bucketCapacity*numContiguousBuckets, maxNumPerEntity); - Kokkos::deep_copy(unBufferInnerView, unHostInnerView); + Kokkos::deep_copy(asyncCopyState.execSpace, unBufferInnerView, unHostInnerView); T* devicePtr = deviceData.data() + selectedBucketOffset * bucketCapacity * maxNumPerEntity; UnmanagedDevInnerView unDeviceInnerView(devicePtr, ORDER_INDICES(bucketCapacity*numContiguousBuckets, maxNumPerEntity)); - transpose_buffer_into_contiguous_device_data(bucketCapacity*numContiguousBuckets, maxNumPerEntity, unBufferInnerView, unDeviceInnerView); - Kokkos::fence(); + transpose_buffer_into_contiguous_device_data(asyncCopyState.execSpace, bucketCapacity*numContiguousBuckets, maxNumPerEntity, unBufferInnerView, unDeviceInnerView); } void copy_new_and_modified_buckets_from_host(const BucketVector& buckets, unsigned numPerEntity) @@ -883,7 +1006,7 @@ class DeviceField : public NgpFieldBase } } - Kokkos::deep_copy(newDeviceSelectedBucketOffset, newHostSelectedBucketOffset); + Kokkos::deep_copy(asyncCopyState.execSpace, newDeviceSelectedBucketOffset, newHostSelectedBucketOffset); } void construct_field_exist_view(const BucketVector& allBuckets, const Selector& selector) @@ -891,13 +1014,13 @@ class DeviceField : public NgpFieldBase deviceFieldExistsOnBucket = BoolViewType(Kokkos::view_alloc(Kokkos::WithoutInitializing, hostField->name() + "_exists_on_bucket"), allBuckets.size()); hostFieldExistsOnBucket = Kokkos::create_mirror_view(Kokkos::HostSpace(), deviceFieldExistsOnBucket, Kokkos::WithoutInitializing); - Kokkos::deep_copy(hostFieldExistsOnBucket, false); + Kokkos::deep_copy(asyncCopyState.execSpace, hostFieldExistsOnBucket, false); for (size_t i = 0; i < allBuckets.size(); ++i) { if(selector(*allBuckets[i])) { hostFieldExistsOnBucket(allBuckets[i]->bucket_id()) = true; } } - Kokkos::deep_copy(deviceFieldExistsOnBucket, hostFieldExistsOnBucket); + Kokkos::deep_copy(asyncCopyState.execSpace, deviceFieldExistsOnBucket, hostFieldExistsOnBucket); } void construct_unsigned_bucket_views(const BucketVector & buckets, const std::string& suffix, @@ -916,7 +1039,7 @@ class DeviceField : public NgpFieldBase hostAllFieldsBucketsNumComponentsPerEntity[bucket->bucket_id()] = stk::mesh::field_scalars_per_entity(*hostField, *bucket); } - Kokkos::deep_copy(deviceAllFieldsBucketsNumComponentsPerEntity, hostAllFieldsBucketsNumComponentsPerEntity); + Kokkos::deep_copy(asyncCopyState.execSpace, deviceAllFieldsBucketsNumComponentsPerEntity, hostAllFieldsBucketsNumComponentsPerEntity); } void construct_field_buckets_num_components_per_entity_view(const BucketVector & buckets) { @@ -926,7 +1049,7 @@ class DeviceField : public 
NgpFieldBase hostFieldBucketsNumComponentsPerEntity[i] = stk::mesh::field_scalars_per_entity(*hostField, *buckets[i]); } - Kokkos::deep_copy(deviceFieldBucketsNumComponentsPerEntity, hostFieldBucketsNumComponentsPerEntity); + Kokkos::deep_copy(asyncCopyState.execSpace, deviceFieldBucketsNumComponentsPerEntity, hostFieldBucketsNumComponentsPerEntity); } void construct_bucket_sizes_view(const BucketVector & buckets) { @@ -936,7 +1059,7 @@ class DeviceField : public NgpFieldBase hostBucketSizes[i] = buckets[i]->size(); } - Kokkos::deep_copy(deviceBucketSizes, hostBucketSizes); + Kokkos::deep_copy(asyncCopyState.execSpace, deviceBucketSizes, hostBucketSizes); } void copy_contiguous_buckets_from_device_to_host(const BucketVector& buckets, unsigned numPerEntity, unsigned& i) @@ -959,10 +1082,10 @@ class DeviceField : public NgpFieldBase void sync_to_host_using_selector() { if (!userSpecifiedSelector) { - transpose_all_device_data_into_buffer(*hostField, deviceData, bufferData, deviceBucketSizes, deviceFieldBucketsNumComponentsPerEntity); - Kokkos::fence(); + auto& execSpace = asyncCopyState.execSpace; + transpose_all_device_data_into_buffer(execSpace, *hostField, deviceData, bufferData, deviceBucketSizes, deviceFieldBucketsNumComponentsPerEntity); - Kokkos::deep_copy(hostData, bufferData); + Kokkos::deep_copy(execSpace, hostData, bufferData); } else { copy_selected_buckets_to_host(); @@ -970,11 +1093,19 @@ class DeviceField : public NgpFieldBase clear_sync_state_flags(); } + void copy_host_data_to_stk_field_data(Selector& selector) + { + const stk::mesh::BucketVector& buckets = hostBulk->get_buckets(hostField->entity_rank(), selector); + hostField->increment_num_syncs_to_host(); + copy_data(buckets, [](T &hostFieldData, T &stkFieldData){ + stkFieldData = hostFieldData; + }, selector); + } + void copy_device_to_host() { if (hostField) { stk::mesh::Selector selector = stk::mesh::selectField(*hostField); - const stk::mesh::BucketVector& buckets = hostBulk->get_buckets(hostField->entity_rank(), selector); if(userSpecifiedSelector) { selector &= *syncSelector; @@ -982,10 +1113,20 @@ class DeviceField : public NgpFieldBase sync_to_host_using_selector(); - hostField->increment_num_syncs_to_host(); - copy_data(buckets, [](T &hostFieldData, T &stkFieldData){ - stkFieldData = hostFieldData; - }, selector); + if(asyncCopyState.syncMode == DEVICE_TO_HOST) { + fence(); + copy_host_data_to_stk_field_data(selector); + reset_sync_selector(); + } + } + } + + void internal_sync_to_host() + { + if (need_sync_to_host()) { + ProfilingBlock prof("copy_to_host for " + hostField->name()); + copy_device_to_host(); + fieldSyncDebugger.sync_to_host(this); } } @@ -1009,10 +1150,10 @@ class DeviceField : public NgpFieldBase void sync_to_device_using_selector() { if (!userSpecifiedSelector) { - Kokkos::deep_copy(bufferData, hostData); + auto& execSpace = asyncCopyState.execSpace; + Kokkos::deep_copy(execSpace, bufferData, hostData); - transpose_buffer_into_all_device_data(*hostField, bufferData, deviceData, deviceBucketSizes, deviceFieldBucketsNumComponentsPerEntity); - Kokkos::fence(); + transpose_buffer_into_all_device_data(execSpace, *hostField, bufferData, deviceData, deviceBucketSizes, deviceFieldBucketsNumComponentsPerEntity); } else { copy_selected_buckets_to_device(); @@ -1038,6 +1179,23 @@ class DeviceField : public NgpFieldBase } } + void internal_sync_to_device() + { + bool needToSyncToDevice = need_sync_to_device(); + + if (needToSyncToDevice) { + ProfilingBlock prof("copy_to_device for " + hostField->name()); + 
if (hostBulk->synchronized_count() == synchronizedCount) { + copy_host_to_device(); + fieldSyncDebugger.sync_to_device(this); + } + else { + update_field(needToSyncToDevice); + } + reset_sync_selector(); + } + } + template KOKKOS_FUNCTION void swap_views(ViewType & view1, ViewType & view2) @@ -1080,6 +1238,9 @@ class DeviceField : public NgpFieldBase friend NgpDebugger; + template class NgpDebuggerType> + friend void impl::internal_fence_no_sync_to_host(NgpField& ngpField); + FieldDataHostViewType hostData; FieldDataDeviceViewType deviceData; FieldDataDeviceViewType bufferData; @@ -1118,6 +1279,8 @@ class DeviceField : public NgpFieldBase typename UnsignedViewType::HostMirror hostFieldBucketsNumComponentsPerEntity; UnsignedViewType deviceFieldBucketsNumComponentsPerEntity; + AsyncCopyState asyncCopyState; + NgpDebugger fieldSyncDebugger; }; diff --git a/packages/stk/stk_mesh/stk_mesh/base/NgpFieldBase.hpp b/packages/stk/stk_mesh/stk_mesh/base/NgpFieldBase.hpp index a45fff572fb5..b2b7f6454913 100644 --- a/packages/stk/stk_mesh/stk_mesh/base/NgpFieldBase.hpp +++ b/packages/stk/stk_mesh/stk_mesh/base/NgpFieldBase.hpp @@ -29,10 +29,14 @@ class NgpFieldBase virtual void clear_host_sync_state() = 0; virtual void clear_device_sync_state() = 0; virtual void sync_to_host() = 0; + virtual void sync_to_host(const ExecSpace& execSpace) = 0; virtual void sync_to_device() = 0; + virtual void sync_to_device(const ExecSpace& execSpace) = 0; virtual size_t synchronized_count() const = 0; virtual size_t num_syncs_to_host() const = 0; virtual size_t num_syncs_to_device() const = 0; + virtual void set_execution_space(const ExecSpace& execSpace) = 0; + virtual void fence() = 0; virtual void debug_modification_begin() = 0; virtual void debug_modification_end(size_t synchronizationCount) = 0; diff --git a/packages/stk/stk_mesh/stk_mesh/base/NgpFieldSyncDebugger.hpp b/packages/stk/stk_mesh/stk_mesh/base/NgpFieldSyncDebugger.hpp index 420f3646a065..9c798111558b 100644 --- a/packages/stk/stk_mesh/stk_mesh/base/NgpFieldSyncDebugger.hpp +++ b/packages/stk/stk_mesh/stk_mesh/base/NgpFieldSyncDebugger.hpp @@ -233,7 +233,6 @@ class NgpFieldSyncDebugger if (data_is_stale_on_device(index, component)) { print_stale_data_warning(ngpField, index.bucket_id, index.bucket_ord, component, fileName, lineNumber); } - } template diff --git a/packages/stk/stk_mesh/stk_mesh/base/NgpUtils.hpp b/packages/stk/stk_mesh/stk_mesh/base/NgpUtils.hpp index c624fd0a6adc..c17e0d9a8f64 100644 --- a/packages/stk/stk_mesh/stk_mesh/base/NgpUtils.hpp +++ b/packages/stk/stk_mesh/stk_mesh/base/NgpUtils.hpp @@ -46,6 +46,17 @@ namespace stk { namespace mesh { +inline void ngp_field_fence(MetaData& meta) +{ + auto fields = meta.get_fields(); + + for(auto field : fields) { + if(field->has_ngp_field()) { + field->fence(); + } + } +} + inline void require_ngp_mesh_rank_limit(const stk::mesh::MetaData& meta) { const size_t maxNumRanks = stk::topology::NUM_RANKS; @@ -68,10 +79,12 @@ inline stk::NgpVector get_bucket_ids(const stk::mesh::BulkData &bulk, } template -void transpose_contiguous_device_data_into_buffer(unsigned numEntitiesInBlock, unsigned numPerEntity, +void transpose_contiguous_device_data_into_buffer(stk::mesh::ExecSpace & execSpace, unsigned numEntitiesInBlock, unsigned numPerEntity, ViewType & deviceView, ViewType & bufferView) { - Kokkos::parallel_for("transpose_contiguous_device_data_into_buffer", numEntitiesInBlock, + const auto& rangePolicy = Kokkos::RangePolicy(execSpace, 0, numEntitiesInBlock); + + 
Kokkos::parallel_for("transpose_contiguous_device_data_into_buffer", rangePolicy, KOKKOS_LAMBDA(const int& entityIdx) { for (unsigned i = 0; i < numPerEntity; i++) { bufferView(entityIdx, i) = deviceView(ORDER_INDICES(entityIdx, i)); @@ -81,10 +94,12 @@ void transpose_contiguous_device_data_into_buffer(unsigned numEntitiesInBlock, u } template -void transpose_buffer_into_contiguous_device_data(unsigned numEntitiesInBlock, unsigned numPerEntity, +void transpose_buffer_into_contiguous_device_data(stk::mesh::ExecSpace & execSpace, unsigned numEntitiesInBlock, unsigned numPerEntity, ViewType & bufferView, ViewType & deviceView) { - Kokkos::parallel_for("transpose_buffer_into_contiguous_device_data", numEntitiesInBlock, + const auto& rangePolicy = Kokkos::RangePolicy(execSpace, 0, numEntitiesInBlock); + + Kokkos::parallel_for("transpose_buffer_into_contiguous_device_data", rangePolicy, KOKKOS_LAMBDA(const int& entityIdx) { for (unsigned i = 0; i < numPerEntity; i++) { deviceView(ORDER_INDICES(entityIdx, i)) = bufferView(entityIdx, i); @@ -93,8 +108,9 @@ void transpose_buffer_into_contiguous_device_data(unsigned numEntitiesInBlock, u ); } -template -void transpose_all_device_data_into_buffer(const stk::mesh::FieldBase & stkField, +template +void transpose_all_device_data_into_buffer(ExecSpaceType & execSpace, + const stk::mesh::FieldBase & stkField, DeviceViewType & deviceView, BufferViewType & bufferView, DeviceUnsignedViewType & bucketSizes, @@ -103,8 +119,8 @@ void transpose_all_device_data_into_buffer(const stk::mesh::FieldBase & stkField stk::mesh::Selector selector = stk::mesh::selectField(stkField); size_t numBuckets = bucketSizes.extent(0); - typedef Kokkos::TeamPolicy::member_type TeamHandleType; - const auto& teamPolicy = Kokkos::TeamPolicy(numBuckets, Kokkos::AUTO); + typedef typename Kokkos::TeamPolicy::member_type TeamHandleType; + const auto& teamPolicy = Kokkos::TeamPolicy(execSpace, numBuckets, Kokkos::AUTO); Kokkos::parallel_for("transpose_all_device_data_into_buffer", teamPolicy, KOKKOS_LAMBDA(const TeamHandleType & team) { const unsigned bucketIndex = team.league_rank(); @@ -120,8 +136,9 @@ void transpose_all_device_data_into_buffer(const stk::mesh::FieldBase & stkField }); } -template -void transpose_buffer_into_all_device_data(const stk::mesh::FieldBase & stkField, +template +void transpose_buffer_into_all_device_data(ExecSpaceType & execSpace, + const stk::mesh::FieldBase & stkField, BufferViewType & bufferView, DeviceViewType & deviceView, DeviceUnsignedViewType & bucketSizes, @@ -130,8 +147,8 @@ void transpose_buffer_into_all_device_data(const stk::mesh::FieldBase & stkField stk::mesh::Selector selector = stk::mesh::selectField(stkField); size_t numBuckets = bucketSizes.extent(0); - typedef Kokkos::TeamPolicy::member_type TeamHandleType; - const auto& teamPolicy = Kokkos::TeamPolicy(numBuckets, Kokkos::AUTO); + typedef typename Kokkos::TeamPolicy::member_type TeamHandleType; + const auto& teamPolicy = Kokkos::TeamPolicy(execSpace, numBuckets, Kokkos::AUTO); Kokkos::parallel_for("transpose_buffer_into_all_device_data", teamPolicy, KOKKOS_LAMBDA(const TeamHandleType & team) { const unsigned bucketIndex = team.league_rank(); diff --git a/packages/stk/stk_mesh/stk_mesh/baseImpl/FieldBaseImpl.cpp b/packages/stk/stk_mesh/stk_mesh/baseImpl/FieldBaseImpl.cpp index 7037de6d98b7..f4ccb5448912 100644 --- a/packages/stk/stk_mesh/stk_mesh/baseImpl/FieldBaseImpl.cpp +++ b/packages/stk/stk_mesh/stk_mesh/baseImpl/FieldBaseImpl.cpp @@ -456,6 +456,14 @@ 
FieldBaseImpl::set_ngp_field(NgpFieldBase * ngpField) const m_ngpField = ngpField; } +void +FieldBaseImpl::fence() const +{ + if(m_ngpField != nullptr) { + m_ngpField->fence(); + } +} + size_t FieldBaseImpl::num_syncs_to_host() const { diff --git a/packages/stk/stk_mesh/stk_mesh/baseImpl/FieldBaseImpl.hpp b/packages/stk/stk_mesh/stk_mesh/baseImpl/FieldBaseImpl.hpp index 069619dc9809..6bc657578a06 100644 --- a/packages/stk/stk_mesh/stk_mesh/baseImpl/FieldBaseImpl.hpp +++ b/packages/stk/stk_mesh/stk_mesh/baseImpl/FieldBaseImpl.hpp @@ -179,6 +179,7 @@ class FieldBaseImpl { NgpFieldBase * get_ngp_field() const; void set_ngp_field(NgpFieldBase * ngpField) const; + void fence() const; size_t num_syncs_to_host() const; size_t num_syncs_to_device() const; diff --git a/packages/stk/stk_mesh/stk_mesh/baseImpl/MeshImplUtils.cpp b/packages/stk/stk_mesh/stk_mesh/baseImpl/MeshImplUtils.cpp index 047bf469d0ff..15ccb7f0e178 100644 --- a/packages/stk/stk_mesh/stk_mesh/baseImpl/MeshImplUtils.cpp +++ b/packages/stk/stk_mesh/stk_mesh/baseImpl/MeshImplUtils.cpp @@ -60,12 +60,6 @@ bool is_in_list(Entity entity, const Entity* begin, const Entity* end) return std::find(begin, end, entity) != end; } -void remove_index_from_list(size_t index, std::vector& elementsInCommon) -{ - std::swap(elementsInCommon[index], elementsInCommon.back()); - elementsInCommon.resize(elementsInCommon.size() - 1); -} - void remove_entities_not_in_list(const Entity* begin, const Entity* end, std::vector& elementsInCommon) { int numElemsFound=0; @@ -83,10 +77,7 @@ void remove_entities_not_in_list(const Entity* begin, const Entity* end, std::ve void remove_entities_not_connected_to_other_nodes(const BulkData& mesh, stk::mesh::EntityRank rank, unsigned numNodes, const Entity* nodes, std::vector& elementsInCommon) { for(unsigned i = 1; i < numNodes; ++i) { - const MeshIndex& meshIndex = mesh.mesh_index(nodes[i]); - const Bucket& bucket = *meshIndex.bucket; - const unsigned ord = meshIndex.bucket_ordinal; - remove_entities_not_in_list(bucket.begin(ord, rank), bucket.end(ord, rank), elementsInCommon); + remove_entities_not_in_list(mesh.begin(nodes[i], rank), mesh.end(nodes[i], rank), elementsInCommon); } } @@ -95,10 +86,7 @@ void find_entities_these_nodes_have_in_common(const BulkData& mesh, stk::mesh::E elementsInCommon.clear(); if(numNodes > 0) { - const MeshIndex& meshIndex = mesh.mesh_index(nodes[0]); - const Bucket& bucket = *meshIndex.bucket; - const unsigned ord = meshIndex.bucket_ordinal; - elementsInCommon.assign(bucket.begin(ord, rank), bucket.end(ord, rank)); + elementsInCommon.assign(mesh.begin(nodes[0], rank), mesh.end(nodes[0], rank)); remove_entities_not_connected_to_other_nodes(mesh, rank, numNodes, nodes, elementsInCommon); } } @@ -1308,6 +1296,16 @@ void insert_upward_relations(const BulkData& bulk_data, Entity rel_entity, } } +EntityRank get_highest_upward_connected_rank(const BulkData& mesh, Entity entity) +{ + const EntityRank entityRank = mesh.entity_rank(entity); + EntityRank highestRank = static_cast(mesh.mesh_meta_data().entity_rank_count()-1); + while(highestRank > entityRank && mesh.num_connectivity(entity, highestRank) == 0) { + highestRank = static_cast(highestRank-1); + } + return highestRank; +} + void insert_upward_relations(const BulkData& bulk_data, const EntityProcMapping& entitySharing, Entity rel_entity, @@ -1323,22 +1321,16 @@ void insert_upward_relations(const BulkData& bulk_data, send.addEntityProc(rel_entity,share_proc); - // There may be even higher-ranking entities that need to be ghosted, so we must 
recurse - EntityRank rel_entity_rank = bucket.entity_rank(); - ThrowAssert(rel_entity_rank > rank_of_orig_entity); - const unsigned bucketOrd = idx.bucket_ordinal; - const EntityRank end_rank = static_cast(bulk_data.mesh_meta_data().entity_rank_count()); - for (EntityRank irank = static_cast(rel_entity_rank + 1); irank < end_rank; ++irank) - { - const int num_rels = bucket.num_connectivity(bucketOrd, irank); - Entity const* rels = bucket.begin(bucketOrd, irank); - - for (int r = 0; r < num_rels; ++r) - { - Entity const rel_of_rel_entity = rels[r]; - if (bulk_data.is_valid(rel_of_rel_entity)) { - insert_upward_relations(bulk_data, entitySharing, rel_of_rel_entity, rel_entity_rank, share_proc, send); + const EntityRank upwardRank = get_highest_upward_connected_rank(bulk_data, rel_entity); + const int numRels = bucket.num_connectivity(bucketOrd, upwardRank); + Entity const* rels = bucket.begin(bucketOrd, upwardRank); + + for (int r = 0; r < numRels; ++r) { + Entity const upwardEntity = rels[r]; + if (bulk_data.is_valid(upwardEntity) && bulk_data.bucket(upwardEntity).owned()) { + if (!entitySharing.find(upwardEntity, share_proc)) { + send.addEntityProc(upwardEntity, share_proc); } } } diff --git a/packages/stk/stk_mesh/stk_mesh/baseImpl/Visitors.hpp b/packages/stk/stk_mesh/stk_mesh/baseImpl/Visitors.hpp index ba33e1dcf325..f2e0127ed63d 100644 --- a/packages/stk/stk_mesh/stk_mesh/baseImpl/Visitors.hpp +++ b/packages/stk/stk_mesh/stk_mesh/baseImpl/Visitors.hpp @@ -51,35 +51,62 @@ namespace mesh { namespace impl { template -void VisitClosureGeneral( +void VisitClosureNoRecurse( const BulkData & mesh, Entity inputEntity, DO_THIS_FOR_ENTITY_IN_CLOSURE & do_this, DESIRED_ENTITY & desired_entity) { - if (desired_entity(inputEntity)) { - do_this(inputEntity); - const EntityRank inputEntityRank = mesh.entity_rank(inputEntity); - for (EntityRank rank = stk::topology::NODE_RANK ; rank < inputEntityRank ; ++rank) { - unsigned num_entities_of_rank = mesh.num_connectivity(inputEntity,rank); - if (num_entities_of_rank > 0) { - const bool dontRecurse = rank == stk::topology::NODE_RANK || - inputEntityRank <= stk::topology::ELEM_RANK; - const Entity * entities = mesh.begin(inputEntity,rank); - - for (unsigned i=0 ; i 0) { + const Entity * entities = mesh.begin(inputEntity,rank); + + for (unsigned i=0 ; i(mesh.entity_rank(entity) - 1); + while (mesh.num_connectivity(entity, nextLowerRank) == 0 && nextLowerRank > stk::topology::NODE_RANK) { + nextLowerRank = static_cast(nextLowerRank-1); + } + return nextLowerRank; +} + +template +void VisitClosureGeneral( + const BulkData & mesh, + Entity inputEntity, + DO_THIS_FOR_ENTITY_IN_CLOSURE & do_this, + DESIRED_ENTITY & desired_entity) +{ + const EntityRank inputEntityRank = mesh.entity_rank(inputEntity); + if (inputEntityRank <= stk::topology::ELEM_RANK) { + VisitClosureNoRecurse(mesh, inputEntity, do_this, desired_entity); + } + else if (desired_entity(inputEntity)) { + do_this(inputEntity); + const EntityRank nextLowerRank = get_highest_downward_connected_rank(mesh, inputEntity); + const unsigned num_entities_of_rank = mesh.num_connectivity(inputEntity,nextLowerRank); + if (num_entities_of_rank > 0) { + const Entity * entities = mesh.begin(inputEntity,nextLowerRank); + for (unsigned i=0 ; i ord_and_perm = -// stk::mesh::get_ordinal_and_permutation(bulkData, element_with_perm_4, side_rank, side_nodes_vec); -// -// report_error_with_invalid_ordinal(ord_and_perm, bulkData, side_nodes_vec, element_with_perm_0, element_with_perm_4); -// -// 
bulkData.declare_relation(element_with_perm_4, side, ord_and_perm.first, ord_and_perm.second); } } else @@ -902,16 +887,6 @@ bool process_killed_elements(stk::mesh::BulkData& bulkData, return remote_death_boundary.get_topology_modification_status(); } -stk::mesh::SideConnector ElemElemGraph::get_side_connector() -{ - return stk::mesh::SideConnector(m_bulk_data, m_graph, m_coincidentGraph, m_idMapper); -} - -stk::mesh::SideNodeConnector ElemElemGraph::get_side_node_connector() -{ - return stk::mesh::SideNodeConnector(m_bulk_data, m_graph, m_coincidentGraph, m_parallelInfoForGraphEdges, m_idMapper); -} - stk::mesh::SideIdChooser ElemElemGraph::get_side_id_chooser() { return stk::mesh::SideIdChooser(m_bulk_data, m_idMapper, m_graph, m_coincidentGraph); diff --git a/packages/stk/stk_mesh/stk_mesh/baseImpl/elementGraph/ElemElemGraph.hpp b/packages/stk/stk_mesh/stk_mesh/baseImpl/elementGraph/ElemElemGraph.hpp index a26ffa85cb07..0635fbfbb8f7 100644 --- a/packages/stk/stk_mesh/stk_mesh/baseImpl/elementGraph/ElemElemGraph.hpp +++ b/packages/stk/stk_mesh/stk_mesh/baseImpl/elementGraph/ElemElemGraph.hpp @@ -148,8 +148,8 @@ class ElemElemGraph void fill_from_mesh(); - stk::mesh::SideConnector get_side_connector(); - stk::mesh::SideNodeConnector get_side_node_connector(); + stk::mesh::SideConnector& get_side_connector() { return m_sideConnector; } + stk::mesh::SideNodeConnector& get_side_node_connector() { return m_sideNodeConnector; } stk::mesh::SideIdChooser get_side_id_chooser(); const stk::mesh::BulkData& get_mesh() const; @@ -312,6 +312,8 @@ class ElemElemGraph std::vector m_deleted_elem_pool; impl::SparseGraph m_coincidentGraph; impl::ElementLocalIdMapper m_idMapper; + SideConnector m_sideConnector; + SideNodeConnector m_sideNodeConnector; private: void add_side_for_remote_edge(const GraphEdge & graphEdge, int elemSide, diff --git a/packages/stk/stk_mesh/stk_mesh/baseImpl/elementGraph/SideConnector.cpp b/packages/stk/stk_mesh/stk_mesh/baseImpl/elementGraph/SideConnector.cpp index 96e9d5eb9392..7ac1ba46c247 100644 --- a/packages/stk/stk_mesh/stk_mesh/baseImpl/elementGraph/SideConnector.cpp +++ b/packages/stk/stk_mesh/stk_mesh/baseImpl/elementGraph/SideConnector.cpp @@ -58,13 +58,12 @@ void SideNodeConnector::connect_side_to_nodes(stk::mesh::Entity sideEntity, stk: connect_side_to_other_elements_nodes(graphEdgeForElementToCreateSideOn , sideEntity, elemEntity, elemSide); } -void declare_relations_to_nodes(stk::mesh::BulkData &bulk, stk::mesh::Entity sideEntity, const stk::mesh::EntityVector &sideNodes) +void SideNodeConnector::declare_relations_to_nodes(stk::mesh::Entity sideEntity, const stk::mesh::EntityVector &sideNodes) { - stk::mesh::OrdinalVector scratch1, scratch2, scratch3; stk::mesh::Permutation perm = stk::mesh::Permutation::INVALID_PERMUTATION; for(size_t i = 0; i < sideNodes.size(); i++) { bulk.declare_relation(sideEntity, sideNodes[i], i, perm, - scratch1, scratch2, scratch3); + m_scratchOrdinals1, m_scratchOrdinals2, m_scratchOrdinals3); } } @@ -72,7 +71,7 @@ void SideNodeConnector::connect_side_to_elements_nodes(stk::mesh::Entity sideEnt { stk::mesh::EntityVector sideNodes; stk::mesh::impl::fill_element_side_nodes_from_topology(bulk, elemEntity, elemSide, sideNodes); - declare_relations_to_nodes(bulk, sideEntity, sideNodes); + declare_relations_to_nodes(sideEntity, sideNodes); } stk::mesh::EntityVector SideNodeConnector::get_permuted_side_nodes(stk::mesh::Entity elemEntity, int elemSide, const stk::mesh::EntityVector &sideNodes, int permutation) @@ -97,7 +96,7 @@ void 
SideNodeConnector::connect_side_to_other_elements_nodes(const GraphEdge &ed const stk::mesh::impl::ParallelInfo &parInfo = parallelGraph.get_parallel_info_for_graph_edge(edgeWithMinId); stk::mesh::EntityVector permutedSideNodes = get_permuted_side_nodes(elemEntity, elemSide, sideNodes, parInfo.m_permutation); - declare_relations_to_nodes(bulk, sideEntity, permutedSideNodes); + declare_relations_to_nodes(sideEntity, permutedSideNodes); } } @@ -123,7 +122,7 @@ void SideConnector::connect_side_to_elem(stk::mesh::Entity sideEntity, int sideOrd) { stk::mesh::Permutation perm = get_permutation_for_side(sideEntity, element, sideOrd); - m_bulk_data.declare_relation(element, sideEntity, sideOrd, perm); + m_bulk_data.declare_relation(element, sideEntity, sideOrd, perm, m_scratchOrdinals1, m_scratchOrdinals2, m_scratchOrdinals3); } void SideConnector::connect_side_to_adjacent_elements(stk::mesh::Entity sideEntity, diff --git a/packages/stk/stk_mesh/stk_mesh/baseImpl/elementGraph/SideConnector.hpp b/packages/stk/stk_mesh/stk_mesh/baseImpl/elementGraph/SideConnector.hpp index 813e4d545193..c09e5b27e4cc 100644 --- a/packages/stk/stk_mesh/stk_mesh/baseImpl/elementGraph/SideConnector.hpp +++ b/packages/stk/stk_mesh/stk_mesh/baseImpl/elementGraph/SideConnector.hpp @@ -88,19 +88,22 @@ class SideNodeConnector const stk::mesh::impl::SparseGraph &cg, const stk::mesh::ParallelInfoForGraphEdges &p, const stk::mesh::impl::ElementLocalIdMapper & lm) - : bulk(b), graph(g), coincidentGraph(cg), parallelGraph(p), localMapper(lm) + : bulk(b), graph(g), coincidentGraph(cg), parallelGraph(p), localMapper(lm), + m_scratchOrdinals1(), m_scratchOrdinals2(), m_scratchOrdinals3() { } void connect_side_to_nodes(stk::mesh::Entity sideEntity, stk::mesh::Entity elemEntity, int elemSide); private: void connect_side_to_elements_nodes(stk::mesh::Entity sideEntity, stk::mesh::Entity elemEntity, int elemSide); void connect_side_to_other_elements_nodes(const GraphEdge &edgeWithMinId, stk::mesh::Entity sideEntity, stk::mesh::Entity elemEntity, int elemSide); stk::mesh::EntityVector get_permuted_side_nodes(stk::mesh::Entity elemEntity, int elemSide, const stk::mesh::EntityVector &sideNodes, int permutation); + void declare_relations_to_nodes(stk::mesh::Entity sideEntity, const stk::mesh::EntityVector &sideNodes); private: stk::mesh::BulkData &bulk; const stk::mesh::Graph &graph; const stk::mesh::impl::SparseGraph &coincidentGraph; const stk::mesh::ParallelInfoForGraphEdges &parallelGraph; const stk::mesh::impl::ElementLocalIdMapper &localMapper; + OrdinalVector m_scratchOrdinals1, m_scratchOrdinals2, m_scratchOrdinals3; }; class SideConnector @@ -113,7 +116,8 @@ class SideConnector m_bulk_data(b), m_graph(g), m_coincidentGraph(cg), - m_localMapper(localMapper) + m_localMapper(localMapper), + m_scratchOrdinals1(), m_scratchOrdinals2(), m_scratchOrdinals3() { } @@ -143,6 +147,7 @@ class SideConnector const stk::mesh::Graph &m_graph; const stk::mesh::impl::SparseGraph &m_coincidentGraph; const stk::mesh::impl::ElementLocalIdMapper & m_localMapper; + OrdinalVector m_scratchOrdinals1, m_scratchOrdinals2, m_scratchOrdinals3; }; } diff --git a/packages/stk/stk_performance_tests/stk_mesh/NgpFieldAsync.cpp b/packages/stk/stk_performance_tests/stk_mesh/NgpFieldAsync.cpp new file mode 100644 index 000000000000..0810ba70066c --- /dev/null +++ b/packages/stk/stk_performance_tests/stk_mesh/NgpFieldAsync.cpp @@ -0,0 +1,826 @@ +// Copyright 2002 - 2008, 2010, 2011 National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). 
Under the terms of Contract +// DE-NA0003525 with NTESS, the U.S. Government retains certain rights +// in this software. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// * Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// +// * Redistributions in binary form must reproduce the above +// copyright notice, this list of conditions and the following +// disclaimer in the documentation and/or other materials provided +// with the distribution. +// +// * Neither the name of NTESS nor the names of its contributors +// may be used to endorse or promote products derived from this +// software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#define SPEEDUP_DELTA 1.0 + +class NgpFieldAsyncTest : public stk::unit_test_util::MeshFixture +{ +public: + NgpFieldAsyncTest() + : stk::unit_test_util::MeshFixture(), + m_numBlocks(1), + m_numElemsPerDim(100), + m_numElements(std::pow(m_numElemsPerDim, 3)), + m_numComponents(3), + m_increment(10), + m_defaultLaunchBlockingEnvVarSet(false), + m_defaultLaunchBlockingEnvVarValue(0) + { + set_launch_blocking_env_var(); + } + + ~NgpFieldAsyncTest() + { + revert_launch_blocking_env_var(); + } + + void setup_simple_mesh_with_fields(unsigned numElemsPerDim) + { + m_numElemsPerDim = numElemsPerDim; + setup_fields(); + setup_mesh_with_many_blocks_many_elements(); + } + + void setup_multi_block_mesh_with_field_per_block(unsigned numElemsPerDim, unsigned numBlocks, unsigned numFields) + { + m_numElemsPerDim = numElemsPerDim; + m_numBlocks = numBlocks; + + setup_fields_on_all_blocks(numFields); + setup_mesh_with_many_blocks_many_elements(); + } + + void setup_fields() + { + std::vector init; + setup_field_component_data(init); + + auto field1 = &get_meta().declare_field>(stk::topology::ELEMENT_RANK, "intField1", 1); + auto field2 = &get_meta().declare_field>(stk::topology::ELEMENT_RANK, "intField2", 1); + auto field3 = &get_meta().declare_field>(stk::topology::ELEMENT_RANK, "intField3", 1); + + stk::mesh::put_field_on_mesh(*field1, get_meta().universal_part(), m_numComponents, init.data()); + stk::mesh::put_field_on_mesh(*field2, get_meta().universal_part(), m_numComponents, init.data()); + stk::mesh::put_field_on_mesh(*field3, get_meta().universal_part(), m_numComponents, init.data()); + } + + void setup_fields_on_all_blocks(unsigned 
numFields) + { + unsigned numStates = 1; + std::vector init; + setup_field_component_data(init); + + for(unsigned i = 1; i <= m_numBlocks; i++) { + std::string blockName = "block_" + std::to_string(i); + + stk::mesh::Part& part = get_meta().declare_part_with_topology(blockName, stk::topology::HEX_8); + get_meta().set_part_id(part, i); + EXPECT_NE(&part, nullptr); + + for(unsigned j = 1; j <= numFields; j++) { + std::string fieldName = "intField" + std::to_string(j); + stk::mesh::Field& field = get_meta().declare_field>(stk::topology::ELEM_RANK, fieldName, numStates); + stk::mesh::put_field_on_mesh(field, part, m_numComponents, init.data()); + } + } + } + + void setup_mesh_with_many_blocks_many_elements(stk::mesh::BulkData::AutomaticAuraOption auraOption = stk::mesh::BulkData::NO_AUTO_AURA) + { + std::string meshDesc = "generated:" + std::to_string(m_numElemsPerDim) + "x" + + std::to_string(m_numElemsPerDim) + "x" + + std::to_string(m_numElemsPerDim); + stk::performance_tests::setup_multiple_blocks(get_meta(), m_numBlocks); + setup_mesh(meshDesc, auraOption); + stk::performance_tests::move_elements_to_other_blocks(get_bulk(), m_numElemsPerDim); + } + + void pass_time_on_device(const stk::mesh::ExecSpace& space, unsigned iterationSpent = 100) + { + typedef typename Kokkos::TeamPolicy::member_type TeamHandleType; + const auto& teamPolicy = Kokkos::TeamPolicy(space, 10, Kokkos::AUTO); + + Kokkos::parallel_for("run_with_team_policy", teamPolicy, + KOKKOS_LAMBDA(const TeamHandleType & team) { + for(unsigned j = 0; j < iterationSpent; ++j) { + clock_t start = clock(); + clock_t now; + for (;;) { + now = clock(); + clock_t cycles = now > start ? now - start : now + (0xffffffff - start); + if (cycles >= 1e6) { + break; + } + } + } + } + ); + } + + void pass_time_on_host(unsigned sleepMilliseconds) + { + std::this_thread::sleep_for(std::chrono::milliseconds(sleepMilliseconds)); + } + + template + void set_fields_values_on_host(stk::mesh::FieldVector& fields, Func&& setValue) + { + for(auto field : fields) { + stk::mesh::EntityVector elems; + stk::mesh::get_selected_entities(stk::mesh::Selector(*field), get_bulk().buckets(stk::topology::ELEM_RANK), elems); + + for(auto elem : elems) { + int* data = reinterpret_cast(stk::mesh::field_data(*field, elem)); + unsigned numComponents = stk::mesh::field_scalars_per_entity(*field, elem); + for(unsigned j = 0; j < numComponents; j++) { + setValue(data, j); + } + } + } + } + + void update_fields_values_on_host(stk::mesh::FieldVector& fields) + { + auto updateValueFunc = [this](int* data, unsigned component) + { + data[component] += m_increment; + }; + + set_fields_values_on_host(fields, updateValueFunc); + } + + void reset_fields_values_on_host(stk::mesh::FieldVector& fields) + { + auto updateValueFunc = [this](int* data, unsigned component) + { + data[component] = component; + }; + + set_fields_values_on_host(fields, updateValueFunc); + } + + void set_fields_values_on_device(stk::mesh::FieldVector& fields, unsigned value) + { + stk::mesh::NgpMesh& ngpMesh = stk::mesh::get_updated_ngp_mesh(get_bulk()); + + for(auto field : fields) { + stk::mesh::NgpField ngpField = stk::mesh::get_updated_ngp_field(*field); + + stk::mesh::for_each_entity_run(ngpMesh, stk::topology::ELEM_RANK, stk::mesh::Selector(*field), + KOKKOS_LAMBDA(const stk::mesh::FastMeshIndex& entityIndex) { + const int numScalarsPerEntity = ngpField.get_num_components_per_entity(entityIndex); + + for (int component = 0; component < numScalarsPerEntity; component++) { + ngpField(entityIndex, component) = 
value; + } + }); + } + } + + void verify_values_on_device(stk::mesh::FieldVector& fields) + { + stk::mesh::NgpMesh& ngpMesh = stk::mesh::get_updated_ngp_mesh(get_bulk()); + unsigned numComponents = m_numComponents; + unsigned increments = m_increment; + + for(auto field : fields) { + auto ngpField = stk::mesh::get_updated_ngp_field(*field); + + stk::mesh::for_each_entity_run(ngpMesh, stk::topology::ELEM_RANK, stk::mesh::Selector(*field), + KOKKOS_LAMBDA(const stk::mesh::FastMeshIndex& elem) + { + for(unsigned i = 0; i < numComponents; i++) { + int expected = i + increments; + int fieldValue = ngpField(elem, i); + NGP_EXPECT_EQ(expected, fieldValue); + } + }); + } + + Kokkos::fence(); + } + + void verify_values_on_host(stk::mesh::FieldVector& fields, unsigned expectedValue) + { + for(auto field : fields) { + auto ngpField = stk::mesh::get_updated_ngp_field(*field); + + stk::mesh::EntityVector elems; + stk::mesh::get_selected_entities(stk::mesh::Selector(*field), get_bulk().buckets(stk::topology::ELEM_RANK), elems); + + for(auto elem : elems) { + int* data = reinterpret_cast(stk::mesh::field_data(*field, elem)); + unsigned numComponents = stk::mesh::field_scalars_per_entity(*field, elem); + for(unsigned j = 0; j < numComponents; j++) { + EXPECT_EQ((int)expectedValue, data[j]); + } + } + } + } + +private: + void setup_field_component_data(std::vector& init) + { + for(unsigned i = 0; i < m_numComponents; i++) { + init.push_back(i); + } + } + + void revert_launch_blocking_env_var() + { + if(m_defaultLaunchBlockingEnvVarSet) { + setenv(m_launchBlockingEnvVar.c_str(), std::to_string(m_defaultLaunchBlockingEnvVarValue).c_str(), 1); + } else { + unsetenv(m_launchBlockingEnvVar.c_str()); + } + } + + void set_launch_blocking_env_var() + { + char* varValue = std::getenv(m_launchBlockingEnvVar.c_str()); + m_defaultLaunchBlockingEnvVarSet = (varValue != nullptr); + + if(m_defaultLaunchBlockingEnvVarSet) { + m_defaultLaunchBlockingEnvVarValue = stk::get_env_var_as_int(m_launchBlockingEnvVar, 0); + setenv(m_launchBlockingEnvVar.c_str(), "0", 1); + } + } + + unsigned m_numBlocks; + unsigned m_numElemsPerDim; + unsigned m_numElements; + unsigned m_numComponents; + unsigned m_increment; + + bool m_defaultLaunchBlockingEnvVarSet; + int m_defaultLaunchBlockingEnvVarValue; + const std::string m_launchBlockingEnvVar = "CUDA_LAUNCH_BLOCKING"; +}; + +TEST_F(NgpFieldAsyncTest, SyncToDeviceAsyncTiming) +{ + if(get_parallel_size() != 1) return; + + unsigned NUM_RUNS = stk::unit_test_util::get_command_line_option("-r", 1); + unsigned numStreams = stk::unit_test_util::get_command_line_option("-s", 3); + unsigned numElemsPerDim = stk::unit_test_util::get_command_line_option("-e", 50); + unsigned waitIteration = stk::unit_test_util::get_command_line_option("-p", 100); + stk::performance_tests::Timer timer(MPI_COMM_WORLD); + stk::performance_tests::Timer timer2(MPI_COMM_WORLD); + + setup_simple_mesh_with_fields(numElemsPerDim); + + stk::mesh::FieldBase* intField1 = get_meta().get_field(stk::topology::ELEMENT_RANK, "intField1"); + stk::mesh::FieldBase* intField2 = get_meta().get_field(stk::topology::ELEMENT_RANK, "intField2"); + stk::mesh::FieldBase* intField3 = get_meta().get_field(stk::topology::ELEMENT_RANK, "intField3"); + stk::mesh::NgpField& ngpIntField1 = stk::mesh::get_updated_ngp_field(*intField1); + stk::mesh::NgpField& ngpIntField2 = stk::mesh::get_updated_ngp_field(*intField2); + stk::mesh::NgpField& ngpIntField3 = stk::mesh::get_updated_ngp_field(*intField3); + stk::mesh::FieldVector fields{intField1, 
intField2, intField3}; + std::vector*> ngpFields = {&ngpIntField1, &ngpIntField2, &ngpIntField3}; + + for(unsigned run = 0; run < NUM_RUNS; run++) { + + { + auto defaultExecSpace = Kokkos::DefaultExecutionSpace(); + reset_fields_values_on_host(fields); + update_fields_values_on_host(fields); + timer.start_timing(); + + for(auto ngpField : ngpFields) { + ngpField->modify_on_host(); + ngpField->sync_to_device(); + pass_time_on_device(defaultExecSpace, waitIteration); + ngpField->fence(); + } + + timer.update_timing(); + verify_values_on_device(fields); + timer.print_timing(NUM_RUNS); + } + + { + reset_fields_values_on_host(fields); + update_fields_values_on_host(fields); + + std::vector> spaces; + for(unsigned i = 0; i < numStreams; i++) { + auto space = stk::mesh::get_execution_space_with_stream(); + spaces.push_back(space); + } + + timer2.start_timing(); + + for(unsigned i = 0; i < ngpFields.size(); i++) { + auto ngpField = ngpFields[i]; + auto space = spaces[i % spaces.size()]; + ngpField->modify_on_host(); + ngpField->sync_to_device(space); + pass_time_on_device(space, waitIteration); + } + + stk::mesh::ngp_field_fence(get_meta()); + timer2.update_timing(); + verify_values_on_device(fields); + } + + double blockingSyncTime = timer.get_timing(); + double nonBlockingSyncTime = timer2.get_timing(); + double speedup = blockingSyncTime / nonBlockingSyncTime; + + EXPECT_GE(speedup, 1.0); + EXPECT_LE(speedup, numStreams + SPEEDUP_DELTA); + } + + timer2.print_timing(NUM_RUNS); +} + +TEST_F(NgpFieldAsyncTest, SyncToHostAsyncTiming) +{ + if(get_parallel_size() != 1) return; + + unsigned NUM_RUNS = stk::unit_test_util::get_command_line_option("-r", 1); + unsigned numStreams = stk::unit_test_util::get_command_line_option("-s", 3); + unsigned numElemsPerDim = stk::unit_test_util::get_command_line_option("-e", 50); + unsigned waitIteration = stk::unit_test_util::get_command_line_option("-p", 100); + stk::performance_tests::Timer timer(MPI_COMM_WORLD); + stk::performance_tests::Timer timer2(MPI_COMM_WORLD); + + setup_simple_mesh_with_fields(numElemsPerDim); + + stk::mesh::FieldBase* intField1 = get_meta().get_field(stk::topology::ELEMENT_RANK, "intField1"); + stk::mesh::FieldBase* intField2 = get_meta().get_field(stk::topology::ELEMENT_RANK, "intField2"); + stk::mesh::FieldBase* intField3 = get_meta().get_field(stk::topology::ELEMENT_RANK, "intField3"); + stk::mesh::NgpField& ngpIntField1 = stk::mesh::get_updated_ngp_field(*intField1); + stk::mesh::NgpField& ngpIntField2 = stk::mesh::get_updated_ngp_field(*intField2); + stk::mesh::NgpField& ngpIntField3 = stk::mesh::get_updated_ngp_field(*intField3); + stk::mesh::FieldVector fields{intField1, intField2, intField3}; + std::vector*> ngpFields = {&ngpIntField1, &ngpIntField2, &ngpIntField3}; + + for(unsigned run = 0; run < NUM_RUNS; run++) { + + unsigned initialValue = 0; + unsigned setValue = run+1; + + { + auto defaultExecSpace = Kokkos::DefaultExecutionSpace(); + set_fields_values_on_device(fields, initialValue); + set_fields_values_on_device(fields, setValue); + timer.start_timing(); + + for(auto ngpField : ngpFields) { + ngpField->modify_on_device(); + ngpField->sync_to_host(); + pass_time_on_device(defaultExecSpace, waitIteration); + ngpField->fence(); + } + + timer.update_timing(); + verify_values_on_host(fields, setValue); + timer.print_timing(NUM_RUNS); + } + + { + set_fields_values_on_device(fields, initialValue); + set_fields_values_on_device(fields, setValue); + + std::vector> spaces; + for(unsigned i = 0; i < numStreams; i++) { + auto space = 
stk::mesh::get_execution_space_with_stream(); + spaces.push_back(space); + } + + timer2.start_timing(); + + for(unsigned i = 0; i < ngpFields.size(); i++) { + auto ngpField = ngpFields[i]; + auto space = spaces[i % spaces.size()]; + ngpField->modify_on_device(); + ngpField->sync_to_host(space); + pass_time_on_device(space, waitIteration); + } + + stk::mesh::ngp_field_fence(get_meta()); + timer2.update_timing(); + verify_values_on_host(fields, setValue); + } + + double blockingSyncTime = timer.get_timing(); + double nonBlockingSyncTime = timer2.get_timing(); + double speedup = blockingSyncTime / nonBlockingSyncTime; + + EXPECT_GE(speedup, 1.0); + EXPECT_LE(speedup, numStreams + SPEEDUP_DELTA); + } + + timer2.print_timing(NUM_RUNS); +} + +TEST_F(NgpFieldAsyncTest, SyncAsyncTiming) +{ + if(get_parallel_size() != 1) return; + + unsigned NUM_RUNS = stk::unit_test_util::get_command_line_option("-r", 1); + unsigned numStreams = stk::unit_test_util::get_command_line_option("-s", 3); + unsigned numElemsPerDim = stk::unit_test_util::get_command_line_option("-e", 50); + unsigned waitIteration = stk::unit_test_util::get_command_line_option("-p", 100); + stk::performance_tests::Timer timer(MPI_COMM_WORLD); + stk::performance_tests::Timer timer2(MPI_COMM_WORLD); + + setup_simple_mesh_with_fields(numElemsPerDim); + + stk::mesh::FieldBase* intField1 = get_meta().get_field(stk::topology::ELEMENT_RANK, "intField1"); + stk::mesh::FieldBase* intField2 = get_meta().get_field(stk::topology::ELEMENT_RANK, "intField2"); + stk::mesh::FieldBase* intField3 = get_meta().get_field(stk::topology::ELEMENT_RANK, "intField3"); + stk::mesh::NgpField& ngpIntField1 = stk::mesh::get_updated_ngp_field(*intField1); + stk::mesh::NgpField& ngpIntField2 = stk::mesh::get_updated_ngp_field(*intField2); + stk::mesh::NgpField& ngpIntField3 = stk::mesh::get_updated_ngp_field(*intField3); + stk::mesh::FieldVector fields{intField1, intField2, intField3}; + std::vector*> ngpFields = {&ngpIntField1, &ngpIntField2, &ngpIntField3}; + + for(unsigned i = 0; i < NUM_RUNS; i++) { + + { + auto defaultExecSpace = Kokkos::DefaultExecutionSpace(); + reset_fields_values_on_host(fields); + update_fields_values_on_host(fields); + timer.start_timing(); + + for(auto ngpField : ngpFields) { + ngpField->modify_on_host(); + ngpField->sync_to_device(); + pass_time_on_device(defaultExecSpace, waitIteration); + ngpField->modify_on_device(); + ngpField->sync_to_host(); + ngpField->fence(); + } + + timer.update_timing(); + timer.print_timing(NUM_RUNS); + } + + { + reset_fields_values_on_host(fields); + update_fields_values_on_host(fields); + + std::vector> spaces; + for(unsigned i = 0; i < numStreams; i++) { + auto space = stk::mesh::get_execution_space_with_stream(); + spaces.push_back(space); + } + + timer2.start_timing(); + + for(unsigned i = 0; i < ngpFields.size(); i++) { + auto ngpField = ngpFields[i]; + auto space = spaces[i % spaces.size()]; + + ngpField->modify_on_host(); + ngpField->sync_to_device(space); + pass_time_on_device(space, waitIteration); + ngpField->modify_on_device(); + ngpField->sync_to_host(space); + } + + stk::mesh::ngp_field_fence(get_meta()); + timer2.update_timing(); + } + + double blockingSyncTime = timer.get_timing(); + double nonBlockingSyncTime = timer2.get_timing(); + double speedup = blockingSyncTime / nonBlockingSyncTime; + + EXPECT_GE(speedup, 1.0); + EXPECT_LE(speedup, numStreams + SPEEDUP_DELTA); + } + + timer2.print_timing(NUM_RUNS); +} + +TEST_F(NgpFieldAsyncTest, PartialSyncToDeviceAsyncTiming) +{ + 
if(get_parallel_size() != 1) return; + + unsigned NUM_RUNS = stk::unit_test_util::get_command_line_option("-r", 1); + unsigned numStreams = stk::unit_test_util::get_command_line_option("-s", 3); + unsigned numFields = stk::unit_test_util::get_command_line_option("-f", 3); + unsigned numBlocks = stk::unit_test_util::get_command_line_option("-b", 3); + unsigned numBlocksToSync = stk::unit_test_util::get_command_line_option("-c", 1); + EXPECT_TRUE(numBlocksToSync <= numBlocks && numBlocksToSync >= 1); + unsigned numElemsPerDim = stk::unit_test_util::get_command_line_option("-e", 50); + unsigned waitIteration = stk::unit_test_util::get_command_line_option("-p", 100); + stk::performance_tests::Timer timer(MPI_COMM_WORLD); + stk::performance_tests::Timer timer2(MPI_COMM_WORLD); + + setup_multi_block_mesh_with_field_per_block(numElemsPerDim, numBlocks, numFields); + + stk::mesh::FieldVector fields; + std::vector*> ngpFields; + + for(unsigned i = 1; i <= numFields; i++) { + stk::mesh::FieldBase* intField = get_meta().get_field(stk::topology::ELEMENT_RANK, "intField" + std::to_string(i)); + EXPECT_NE(nullptr, intField); + fields.push_back(intField); + + stk::mesh::NgpField& ngpIntField = stk::mesh::get_updated_ngp_field(*intField); + ngpFields.push_back(&ngpIntField); + } + + stk::mesh::Selector selector; + for(unsigned i = 1; i <= numBlocksToSync; i++) { + selector = selector | stk::mesh::Selector(*get_meta().get_part("block_" + std::to_string(i))); + } + + for(unsigned run = 0; run < NUM_RUNS; run++) { + + { + auto defaultExecSpace = Kokkos::DefaultExecutionSpace(); + reset_fields_values_on_host(fields); + update_fields_values_on_host(fields); + + timer.start_timing(); + + for(auto ngpField : ngpFields) { + ngpField->modify_on_host(selector); + ngpField->sync_to_device(); + pass_time_on_device(defaultExecSpace, waitIteration); + ngpField->fence(); + } + + timer.update_timing(); + timer.print_timing(NUM_RUNS); + } + + { + reset_fields_values_on_host(fields); + update_fields_values_on_host(fields); + + std::vector> spaces; + for(unsigned i = 0; i < numStreams; i++) { + auto space = stk::mesh::get_execution_space_with_stream(); + spaces.push_back(space); + } + + timer2.start_timing(); + + for(unsigned i = 0; i < ngpFields.size(); i++) { + auto ngpField = ngpFields[i]; + auto space = spaces[i % spaces.size()]; + + ngpField->modify_on_host(selector); + ngpField->sync_to_device(space); + pass_time_on_device(space, waitIteration); + } + + stk::mesh::ngp_field_fence(get_meta()); + timer2.update_timing(); + } + + double blockingSyncTime = timer.get_timing(); + double nonBlockingSyncTime = timer2.get_timing(); + double speedup = blockingSyncTime / nonBlockingSyncTime; + + EXPECT_GE(speedup, 1.0); + EXPECT_LE(speedup, numStreams + SPEEDUP_DELTA); + } + + timer2.print_timing(NUM_RUNS); +} + +TEST_F(NgpFieldAsyncTest, PartialSyncToHostAsyncTiming) +{ + if(get_parallel_size() != 1) return; + + unsigned NUM_RUNS = stk::unit_test_util::get_command_line_option("-r", 1); + unsigned numStreams = stk::unit_test_util::get_command_line_option("-s", 3); + unsigned numFields = stk::unit_test_util::get_command_line_option("-f", 3); + unsigned numBlocks = stk::unit_test_util::get_command_line_option("-b", 3); + unsigned numBlocksToSync = stk::unit_test_util::get_command_line_option("-c", 1); + EXPECT_TRUE(numBlocksToSync <= numBlocks && numBlocksToSync >= 1); + unsigned numElemsPerDim = stk::unit_test_util::get_command_line_option("-e", 50); + unsigned waitIteration = stk::unit_test_util::get_command_line_option("-p", 
100); + stk::performance_tests::Timer timer(MPI_COMM_WORLD); + stk::performance_tests::Timer timer2(MPI_COMM_WORLD); + + setup_multi_block_mesh_with_field_per_block(numElemsPerDim, numBlocks, numFields); + + stk::mesh::FieldVector fields; + std::vector*> ngpFields; + + for(unsigned i = 1; i <= numFields; i++) { + stk::mesh::FieldBase* intField = get_meta().get_field(stk::topology::ELEMENT_RANK, "intField" + std::to_string(i)); + EXPECT_NE(nullptr, intField); + fields.push_back(intField); + + stk::mesh::NgpField& ngpIntField = stk::mesh::get_updated_ngp_field(*intField); + ngpFields.push_back(&ngpIntField); + } + + stk::mesh::Selector selector; + for(unsigned i = 1; i <= numBlocksToSync; i++) { + selector = selector | stk::mesh::Selector(*get_meta().get_part("block_" + std::to_string(i))); + } + + for(unsigned run = 0; run < NUM_RUNS; run++) { + + unsigned initialValue = 0; + unsigned setValue = run+1; + + { + auto defaultExecSpace = Kokkos::DefaultExecutionSpace(); + set_fields_values_on_device(fields, initialValue); + set_fields_values_on_device(fields, setValue); + + timer.start_timing(); + + for(auto ngpField : ngpFields) { + ngpField->modify_on_device(selector); + ngpField->sync_to_host(); + pass_time_on_device(defaultExecSpace, waitIteration); + ngpField->fence(); + } + + timer.update_timing(); + timer.print_timing(NUM_RUNS); + } + + { + set_fields_values_on_device(fields, initialValue); + set_fields_values_on_device(fields, setValue); + + std::vector> spaces; + for(unsigned i = 0; i < numStreams; i++) { + auto space = stk::mesh::get_execution_space_with_stream(); + spaces.push_back(space); + } + + timer2.start_timing(); + + for(unsigned i = 0; i < ngpFields.size(); i++) { + auto ngpField = ngpFields[i]; + auto space = spaces[i % spaces.size()]; + + ngpField->modify_on_device(selector); + ngpField->sync_to_host(space); + pass_time_on_device(space, waitIteration); + } + + stk::mesh::ngp_field_fence(get_meta()); + timer2.update_timing(); + } + + double blockingSyncTime = timer.get_timing(); + double nonBlockingSyncTime = timer2.get_timing(); + double speedup = blockingSyncTime / nonBlockingSyncTime; + + EXPECT_GE(speedup, 1.0); + EXPECT_LE(speedup, numStreams + SPEEDUP_DELTA); + } + + timer2.print_timing(NUM_RUNS); +} + +TEST_F(NgpFieldAsyncTest, AsyncDeepCopyTiming) +{ + if(get_parallel_size() != 1) return; + + unsigned NUM_RUNS = stk::unit_test_util::get_command_line_option("-r", 1); + unsigned numStreams = stk::unit_test_util::get_command_line_option("-s", 10); + unsigned numFields = stk::unit_test_util::get_command_line_option("-f", 10); + unsigned numBlocks = stk::unit_test_util::get_command_line_option("-b", 1); + unsigned numElemsPerDim = stk::unit_test_util::get_command_line_option("-e", 100); + unsigned sleepTime = stk::unit_test_util::get_command_line_option("-m", 50); + unsigned waitIteration = stk::unit_test_util::get_command_line_option("-p", 20); + stk::performance_tests::Timer timer(MPI_COMM_WORLD); + stk::performance_tests::Timer timer2(MPI_COMM_WORLD); + stk::performance_tests::Timer timer3(MPI_COMM_WORLD); + + setup_multi_block_mesh_with_field_per_block(numElemsPerDim, numBlocks, numFields); + + stk::mesh::FieldVector fields; + std::vector*> ngpFields; + + for(unsigned i = 1; i <= numFields; i++) { + stk::mesh::FieldBase* intField = get_meta().get_field(stk::topology::ELEMENT_RANK, "intField" + std::to_string(i)); + EXPECT_NE(nullptr, intField); + fields.push_back(intField); + + stk::mesh::NgpField& ngpIntField = stk::mesh::get_updated_ngp_field(*intField); + 
ngpFields.push_back(&ngpIntField); + } + + std::vector> spaces; + for(unsigned i = 0; i < numStreams; i++) { + auto space = stk::mesh::get_execution_space_with_stream(); + spaces.push_back(space); + } + + for(unsigned run = 0; run < NUM_RUNS; run++) { + + Kokkos::fence(); + + auto defaultExecSpace = Kokkos::DefaultExecutionSpace(); + { + reset_fields_values_on_host(fields); + update_fields_values_on_host(fields); + + timer.start_timing(); + + for(auto ngpField : ngpFields) { + ngpField->modify_on_host(); + ngpField->sync_to_device(); + pass_time_on_device(defaultExecSpace, waitIteration); + } + pass_time_on_host(sleepTime); + + timer.update_timing(); + } + + { + reset_fields_values_on_host(fields); + update_fields_values_on_host(fields); + + timer2.start_timing(); + + for(auto ngpField : ngpFields) { + ngpField->modify_on_host(); + ngpField->sync_to_device(defaultExecSpace); + pass_time_on_device(defaultExecSpace, waitIteration); + } + pass_time_on_host(sleepTime); + + stk::mesh::ngp_field_fence(get_meta()); + timer2.update_timing(); + } + + { + reset_fields_values_on_host(fields); + update_fields_values_on_host(fields); + + timer3.start_timing(); + + for(unsigned i = 0; i < ngpFields.size(); i++) { + auto ngpField = ngpFields[i]; + auto space = spaces[i % spaces.size()]; + ngpField->modify_on_host(); + ngpField->sync_to_device(space); + pass_time_on_device(space, waitIteration); + } + pass_time_on_host(sleepTime); + + stk::mesh::ngp_field_fence(get_meta()); + timer3.update_timing(); + } + } + + timer.print_timing(NUM_RUNS); + timer2.print_timing(NUM_RUNS); + timer3.print_timing(NUM_RUNS); +} \ No newline at end of file diff --git a/packages/stk/stk_performance_tests/stk_mesh/timer.hpp b/packages/stk/stk_performance_tests/stk_mesh/timer.hpp index 136fbf1cb192..f68dfc61020a 100644 --- a/packages/stk/stk_performance_tests/stk_mesh/timer.hpp +++ b/packages/stk/stk_performance_tests/stk_mesh/timer.hpp @@ -75,6 +75,8 @@ class Timer stk::print_stats_for_performance_compare(std::cout, timeAll, meshOperationHwm, iterationCount, communicator); } + double get_timing() { return cumulativeTime; } + private: MPI_Comm communicator; double iterationStartTime; diff --git a/packages/stk/stk_unit_tests/stk_mesh/UnitTestCommInfoObserver.cpp b/packages/stk/stk_unit_tests/stk_mesh/UnitTestCommInfoObserver.cpp index ff11b751abc5..da5e1074441a 100644 --- a/packages/stk/stk_unit_tests/stk_mesh/UnitTestCommInfoObserver.cpp +++ b/packages/stk/stk_unit_tests/stk_mesh/UnitTestCommInfoObserver.cpp @@ -82,9 +82,8 @@ class CommInfoObserverTest : public ::testing::Test EXPECT_TRUE(!observer->was_comm_info_changed(stk::topology::CONSTRAINT_RANK)); } - void stop_ghosting_of_element1_to_proc1() + void add_element1_and_nodes_to_remove_from_ghosting(std::vector<stk::mesh::EntityKey>& entityKeysToStopGhosting) { - std::vector<stk::mesh::EntityKey> entityKeysToStopGhosting; if(bulk.parallel_rank() == 1) { entityKeysToStopGhosting.push_back(stk::mesh::EntityKey(stk::topology::ELEM_RANK, 1)); @@ -93,6 +92,16 @@ entityKeysToStopGhosting.push_back(stk::mesh::EntityKey(stk::topology::NODE_RANK, nodeId)); } } + } + + void stop_ghosting_of_element1_to_proc1(bool testDuplicateRemoveGhostKeys) + { + std::vector<stk::mesh::EntityKey> entityKeysToStopGhosting; + add_element1_and_nodes_to_remove_from_ghosting(entityKeysToStopGhosting); + if (testDuplicateRemoveGhostKeys) { + add_element1_and_nodes_to_remove_from_ghosting(entityKeysToStopGhosting); + } + + bulk.modification_begin(); + bulk.change_ghosting(*ghost, {}, entityKeysToStopGhosting); 
bulk.modification_end(); @@ -150,7 +159,23 @@ TEST_F(CommInfoObserverTest, addAndRemoveFromGhosting) reset_comm_info_status(); - stop_ghosting_of_element1_to_proc1(); + bool testDuplicateRemoveGhostKeys = false; + stop_ghosting_of_element1_to_proc1(testDuplicateRemoveGhostKeys); + expect_comm_info_changed_for_nodes_and_elements_only(); + } +} + +TEST_F(CommInfoObserverTest, addAndRemoveFromGhosting_testDuplicateKeys) +{ + MPI_Comm comm = MPI_COMM_WORLD; + if(stk::parallel_machine_size(comm) == 2) + { + ghost_element1_to_proc1_and_check_for_comm_info_change(); + + reset_comm_info_status(); + + bool testDuplicateRemoveGhostKeys = true; + stop_ghosting_of_element1_to_proc1(testDuplicateRemoveGhostKeys); expect_comm_info_changed_for_nodes_and_elements_only(); } } diff --git a/packages/stk/stk_unit_tests/stk_mesh/UnitTestElemGraphCoincidentElements.cpp b/packages/stk/stk_unit_tests/stk_mesh/UnitTestElemGraphCoincidentElements.cpp index 546a5ac9a86e..a1e6eb717f7c 100644 --- a/packages/stk/stk_unit_tests/stk_mesh/UnitTestElemGraphCoincidentElements.cpp +++ b/packages/stk/stk_unit_tests/stk_mesh/UnitTestElemGraphCoincidentElements.cpp @@ -234,7 +234,7 @@ TEST_F(HexShellShell, SideConnections) setup_hex_shell_shell_on_procs({0, 0, 0}); stk::mesh::ElemElemGraph elemElemGraph(get_bulk()); - stk::mesh::SideConnector sideConnector = elemElemGraph.get_side_connector(); + stk::mesh::SideConnector& sideConnector = elemElemGraph.get_side_connector(); get_bulk().modification_begin(); stk::mesh::Entity shell2 = get_bulk().get_entity(stk::topology::ELEM_RANK, 2); diff --git a/packages/stk/stk_unit_tests/stk_mesh/UnitTestField.cpp b/packages/stk/stk_unit_tests/stk_mesh/UnitTestField.cpp index bf3f472e64c5..27edc0491ce9 100644 --- a/packages/stk/stk_unit_tests/stk_mesh/UnitTestField.cpp +++ b/packages/stk/stk_unit_tests/stk_mesh/UnitTestField.cpp @@ -746,6 +746,13 @@ TEST_F(FieldFixture, DISABLED_writingDifferentElementFieldsPerSolutionCase) test_solution_case_with_rank(stk::topology::ELEM_RANK); } +TEST_F(FieldFixture, fenceWithoutNgpField) +{ + stk::mesh::Field &field = get_meta().declare_field>(stk::topology::ELEM_RANK, "doubleField"); + + EXPECT_NO_THROW(field.fence()); +} + class LateFieldFixtureNoTest : public stk::unit_test_util::MeshFixtureNoTest { protected: diff --git a/packages/stk/stk_unit_tests/stk_mesh/ngp/NgpFieldAsyncTest.cpp b/packages/stk/stk_unit_tests/stk_mesh/ngp/NgpFieldAsyncTest.cpp new file mode 100644 index 000000000000..bd698e7ed462 --- /dev/null +++ b/packages/stk/stk_unit_tests/stk_mesh/ngp/NgpFieldAsyncTest.cpp @@ -0,0 +1,998 @@ +// Copyright 2002 - 2008, 2010, 2011 National Technology Engineering +// Solutions of Sandia, LLC (NTESS). Under the terms of Contract +// DE-NA0003525 with NTESS, the U.S. Government retains certain rights +// in this software. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// * Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// +// * Redistributions in binary form must reproduce the above +// copyright notice, this list of conditions and the following +// disclaimer in the documentation and/or other materials provided +// with the distribution. +// +// * Neither the name of NTESS nor the names of its contributors +// may be used to endorse or promote products derived from this +// software without specific prior written permission. 
+// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include "NgpUnitTestUtils.hpp" +#include +#include +#include + +#ifndef KOKKOS_ENABLE_CUDA +#define TEST_ONLY_ON_CUDA(testname) DISABLED_##testname +#else +#define TEST_ONLY_ON_CUDA(testname) testname +#endif + +class NgpAsyncDeepCopyFixture : public stk::unit_test_util::MeshFixture +{ +public: + NgpAsyncDeepCopyFixture() + : m_numComponents(3), + m_bucketCapacity(5), + m_numBlocks(3), + m_numFields(m_numBlocks), + m_numStreams(3), + m_multiplier(5), + m_defaultLaunchBlockingEnvVarSet(false), + m_defaultLaunchBlockingEnvVarValue(0) + { + set_launch_blocking_env_var(); + setup_empty_mesh(stk::mesh::BulkData::NO_AUTO_AURA, m_bucketCapacity); + } + + ~NgpAsyncDeepCopyFixture() + { + revert_launch_blocking_env_var(); + } + + std::vector> get_execution_spaces_with_streams(unsigned numStreams) + { + std::vector> execSpaces; + + for(unsigned i = 0; i < numStreams; i++) { + execSpaces.push_back(stk::mesh::get_execution_space_with_stream()); + } + + return execSpaces; + } + + void setup_multi_block_mesh_with_field_per_block() + { + std::string meshDesc = stk::unit_test_util::get_many_block_mesh_desc(m_numBlocks); + std::vector coordinates = stk::unit_test_util::get_many_block_coordinates(m_numBlocks); + + setup_field_per_block(); + stk::unit_test_util::setup_text_mesh(get_bulk(), meshDesc, coordinates); + construct_ngp_fields(); + } + + void setup_multi_block_mesh_with_fields_on_all_blocks() + { + std::string meshDesc = stk::unit_test_util::get_many_block_mesh_desc(m_numBlocks); + std::vector coordinates = stk::unit_test_util::get_many_block_coordinates(m_numBlocks); + + setup_fields_on_all_blocks(); + stk::unit_test_util::setup_text_mesh(get_bulk(), meshDesc, coordinates); + construct_ngp_fields(); + } + + void construct_ngp_fields() + { + for(auto field : m_fields) { + stk::mesh::get_updated_ngp_field(*field); + } + } + + void setup_field_per_block() + { + unsigned numStates = 1; + std::vector init; + setup_field_component_data(init); + + for(unsigned i = 1; i <= m_numBlocks; i++) { + std::string blockName = "block_" + std::to_string(i); + std::string fieldName = "field_" + std::to_string(i); + + stk::mesh::Part& part = get_meta().declare_part_with_topology(blockName, stk::topology::HEX_8); + get_meta().set_part_id(part, i); + EXPECT_NE(&part, nullptr); + + stk::mesh::Field& field = get_meta().declare_field>(stk::topology::ELEM_RANK, fieldName, numStates); + m_fields.push_back(&field); + stk::mesh::put_field_on_mesh(field, part, m_numComponents, init.data()); + } + } + + void 
setup_fields_on_all_blocks() + { + unsigned numStates = 1; + std::vector init; + setup_field_component_data(init); + + for(unsigned i = 1; i <= m_numBlocks; i++) { + std::string blockName = "block_" + std::to_string(i); + + stk::mesh::Part& part = get_meta().declare_part_with_topology(blockName, stk::topology::HEX_8); + get_meta().set_part_id(part, i); + EXPECT_NE(&part, nullptr); + + for(unsigned j = 1; j <= m_numFields; j++) { + std::string fieldName = "field_on_all_blocks_" + std::to_string(j); + stk::mesh::Field& field = get_meta().declare_field>(stk::topology::ELEM_RANK, fieldName, numStates); + stk::mesh::put_field_on_mesh(field, part, m_numComponents, init.data()); + m_fields.push_back(&field); + } + } + } + + std::vector*> get_fields() + { + return m_fields; + } + + stk::mesh::PartVector get_parts() + { + auto allParts = get_meta().get_parts(); + stk::mesh::PartVector elemParts; + + for(auto part : allParts) { + if(part->primary_entity_rank() == stk::topology::ELEM_RANK && part->id() != stk::mesh::Part::INVALID_ID) { + elemParts.push_back(part); + } + } + + return elemParts; + } + + void setup_test(unsigned numBlocks, unsigned numStreams, unsigned multiplier) + { + m_numBlocks = numBlocks; + m_numStreams = numStreams; + m_multiplier = multiplier; + m_numFields = m_numBlocks; + } + + void setup_field_data_on_host() + { + for(auto field : m_fields) { + set_element_field_data(*field, stk::mesh::Selector(*field), m_multiplier); + } + } + + void setup_selected_field_data_on_host(stk::mesh::Selector& selector) + { + for(auto field : m_fields) { + set_element_field_data(*field, selector, m_multiplier); + } + } + + stk::mesh::Selector get_block_selector_for_partial_sync() + { + unsigned numPartialSyncBlocks = stk::unit_test_util::get_command_line_option("-b", m_numBlocks/2); + + numPartialSyncBlocks = std::min(numPartialSyncBlocks, m_numBlocks); + + stk::mesh::PartVector syncParts; + for(unsigned i = 1; i <= numPartialSyncBlocks; i++) { + std::string partName = "block_" + std::to_string(i); + + stk::mesh::Part* part = get_meta().get_part(partName); + EXPECT_NE(part, nullptr); + + syncParts.push_back(part); + } + + stk::mesh::Selector selector = stk::mesh::selectUnion(syncParts); + return selector; + } + + void setup_field_data_on_device() + { + for(auto field : m_fields) { + auto& ngpMesh = stk::mesh::get_updated_ngp_mesh(get_bulk()); + set_element_field_data_on_device(ngpMesh, *field, stk::mesh::Selector(*field), m_multiplier); + } + } + + void setup_selected_field_data_on_device(stk::mesh::Selector& selector) + { + for(auto field : m_fields) { + auto& ngpMesh = stk::mesh::get_updated_ngp_mesh(get_bulk()); + set_element_field_data_on_device(ngpMesh, *field, selector, m_multiplier); + } + } + + void add_parts_to_all_blocks(stk::mesh::PartVector& addParts) + { + stk::mesh::EntityVector entities; + stk::mesh::get_entities(get_bulk(), stk::topology::ELEM_RANK, entities); + + get_bulk().batch_change_entity_parts(entities, addParts, {}); + } + + void change_parts_on_selected_blocks(stk::mesh::PartVector& addParts, stk::mesh::PartVector& removeParts, + stk::mesh::Selector& blockSelector) + { + stk::mesh::EntityVector entities; + stk::mesh::get_selected_entities(blockSelector, get_bulk().buckets(stk::topology::ELEM_RANK), entities); + + get_bulk().batch_change_entity_parts(entities, addParts, removeParts); + } + + void sync_fields_to_host() + { + for(unsigned i = 0; i < m_fields.size(); i++) { + auto field = m_fields[i]; + auto& ngpField = stk::mesh::get_updated_ngp_field(*field); + 
ngpField.modify_on_device(); + ngpField.sync_to_host(); + } + } + + void sync_fields_to_host_async(std::vector>& execSpaces) + { + for(unsigned i = 0; i < m_fields.size(); i++) { + auto field = m_fields[i]; + auto& ngpField = stk::mesh::get_updated_ngp_field(*field); + ngpField.modify_on_device(); + ngpField.sync_to_host(execSpaces[i % execSpaces.size()]); + } + } + + void sync_fields_to_host_async(std::vector>& execSpaces, + stk::mesh::Selector& selector) + { + for(unsigned i = 0; i < m_fields.size(); i++) { + auto field = m_fields[i]; + auto& ngpField = stk::mesh::get_updated_ngp_field(*field); + ngpField.modify_on_device(selector); + ngpField.sync_to_host(execSpaces[i % execSpaces.size()]); + } + } + + void sync_fields_to_device_async(std::vector>& execSpaces) + { + for(unsigned i = 0; i < m_fields.size(); i++) { + auto field = m_fields[i]; + auto& ngpField = stk::mesh::get_updated_ngp_field(*field); + ngpField.modify_on_host(); + ngpField.sync_to_device(execSpaces[i % execSpaces.size()]); + } + } + + void sync_fields_to_device_async(std::vector>& execSpaces, + stk::mesh::Selector& selector) + { + for(unsigned i = 0; i < m_fields.size(); i++) { + auto field = m_fields[i]; + auto& ngpField = stk::mesh::get_updated_ngp_field(*field); + ngpField.modify_on_host(selector); + ngpField.sync_to_device(execSpaces[i % execSpaces.size()]); + } + } + + template + void compare_device_data_to_init_data(FieldType& field) + { + unsigned numComponents = m_numComponents; + unsigned multiplier = m_multiplier; + auto& ngpMesh = stk::mesh::get_updated_ngp_mesh(get_bulk()); + auto& ngpField = stk::mesh::get_updated_ngp_field(*field); + + stk::mesh::for_each_entity_run(ngpMesh, stk::topology::ELEM_RANK, stk::mesh::Selector(*field), + KOKKOS_LAMBDA(const stk::mesh::FastMeshIndex& elem) + { + auto entity = ngpMesh.get_entity(stk::topology::ELEM_RANK, elem); + + for(unsigned i = 0; i < numComponents; i++) { + int expected = ngpMesh.identifier(entity) * multiplier + i; + int fieldValue = ngpField(elem, i); + NGP_EXPECT_EQ(expected, fieldValue); + } + }); + Kokkos::fence(); + } + + template + void test_partial_copy_to_device_result(FieldType field, stk::mesh::Selector& selector) + { + unsigned numComponents = m_numComponents; + unsigned multiplier = m_multiplier; + auto& ngpMesh = stk::mesh::get_updated_ngp_mesh(get_bulk()); + auto& ngpField = stk::mesh::get_updated_ngp_field(*field); + + stk::mesh::for_each_entity_run(ngpMesh, stk::topology::ELEM_RANK, selector, + KOKKOS_LAMBDA(const stk::mesh::FastMeshIndex& elem) + { + auto entity = ngpMesh.get_entity(stk::topology::ELEM_RANK, elem); + + for(unsigned i = 0; i < numComponents; i++) { + int expected = ngpMesh.identifier(entity) * multiplier + i; + int fieldValue = ngpField(elem, i); + NGP_EXPECT_EQ(expected, fieldValue); + } + }); + Kokkos::fence(); + + stk::NgpVector ngpVector; + std::vector init; + setup_field_component_data(init); + + for(auto val : init) { + ngpVector.push_back(val); + } + ngpVector.copy_host_to_device(); + + stk::mesh::for_each_entity_run(ngpMesh, stk::topology::ELEM_RANK, !selector, + KOKKOS_LAMBDA(const stk::mesh::FastMeshIndex& elem) + { + for(unsigned i = 0; i < numComponents; i++) { + int expected = ngpVector.device_get(i); + int fieldValue = ngpField(elem, i); + NGP_EXPECT_EQ(expected, fieldValue); + } + }); + Kokkos::fence(); + } + + template + void compare_host_data_to_modified_data(FieldType field, unsigned scale = 1) + { + stk::mesh::EntityVector elems; + stk::mesh::get_selected_entities(stk::mesh::Selector(*field), 
get_bulk().buckets(stk::topology::ELEM_RANK), elems); + unsigned multiplier = m_multiplier * scale; + + for(auto elem : elems) { + int* data = reinterpret_cast(stk::mesh::field_data(*field, elem)); + unsigned numComponents = stk::mesh::field_scalars_per_entity(*field, elem); + for(unsigned j = 0; j < numComponents; j++) { + unsigned expectedValue = get_bulk().identifier(elem) * multiplier + j; + EXPECT_EQ((int)expectedValue, data[j]); + } + } + } + + template + void test_partial_copy_to_host_result(FieldType field, stk::mesh::Selector& selector) + { + stk::mesh::EntityVector elems; + stk::mesh::get_selected_entities(selector, get_bulk().buckets(stk::topology::ELEM_RANK), elems); + + check_result_on_host_expect_multiplied_data(field, elems, m_multiplier); + + stk::mesh::Selector otherSelector = stk::mesh::Selector(*field) - selector; + + stk::mesh::EntityVector notSelectedElems; + stk::mesh::get_selected_entities(otherSelector, get_bulk().buckets(stk::topology::ELEM_RANK), notSelectedElems); + + check_result_on_host_expect_init_data(field, notSelectedElems); + } + + void set_element_field_data(stk::mesh::Field& stkIntField, stk::mesh::Selector selector, unsigned multiplier) + { + stk::mesh::EntityVector elements; + stk::mesh::get_selected_entities(selector, get_bulk().buckets(stk::topology::ELEM_RANK), elements); + + for(stk::mesh::Entity elem : elements) { + int* data = reinterpret_cast(stk::mesh::field_data(stkIntField, elem)); + unsigned numComponents = stk::mesh::field_scalars_per_entity(stkIntField, elem); + for(unsigned j = 0; j < numComponents; j++) { + data[j] = get_bulk().identifier(elem) * multiplier + j; + } + } + } + + void set_element_field_data_on_device(stk::mesh::NgpMesh& ngpMesh, stk::mesh::Field& stkIntField, + const stk::mesh::Selector& selector, unsigned multiplier) + { + stk::mesh::NgpField& ngpField = stk::mesh::get_updated_ngp_field(stkIntField); + + stk::mesh::for_each_entity_run(ngpMesh, stk::topology::ELEM_RANK, selector, + KOKKOS_LAMBDA(const stk::mesh::FastMeshIndex& entityIndex) { + const int numScalarsPerEntity = ngpField.get_num_components_per_entity(entityIndex); + for (int component=0; component + void check_result_on_host(FieldType field, stk::mesh::EntityVector elems, Func&& testValues) + { + for(auto elem : elems) { + int* data = reinterpret_cast(stk::mesh::field_data(*field, elem)); + unsigned numComponents = stk::mesh::field_scalars_per_entity(*field, elem); + for(unsigned j = 0; j < numComponents; j++) { + testValues(data, elem, j); + } + } + } + + template + void check_result_on_host_expect_init_data(FieldType field, stk::mesh::EntityVector& elems) + { + auto expectInitData = [this](int* data, stk::mesh::Entity entity, unsigned component) + { + int expectedValue = component; + EXPECT_EQ(data[component], expectedValue); + }; + + check_result_on_host(field, elems, expectInitData); + } + + template + void check_result_on_host_expect_multiplied_data(FieldType field, stk::mesh::EntityVector& elems, unsigned multiplier) + { + auto expectInitData = [this, multiplier](int* data, stk::mesh::Entity elem, unsigned component) + { + int expectedValue = get_bulk().identifier(elem) * multiplier + component; + EXPECT_EQ(data[component], expectedValue); + }; + + check_result_on_host(field, elems, expectInitData); + } + + void setup_field_component_data(std::vector& init) + { + for(unsigned i = 0; i < m_numComponents; i++) { + init.push_back(i); + } + } + + void revert_launch_blocking_env_var() + { + if(m_defaultLaunchBlockingEnvVarSet) { + 
setenv(m_launchBlockingEnvVar.c_str(), std::to_string(m_defaultLaunchBlockingEnvVarValue).c_str(), 1); + } else { + unsetenv(m_launchBlockingEnvVar.c_str()); + } + } + + void set_launch_blocking_env_var() + { + char* varValue = std::getenv(m_launchBlockingEnvVar.c_str()); + m_defaultLaunchBlockingEnvVarSet = (varValue != nullptr); + + if(m_defaultLaunchBlockingEnvVarSet) { + m_defaultLaunchBlockingEnvVarValue = stk::get_env_var_as_int(m_launchBlockingEnvVar, 0); + setenv(m_launchBlockingEnvVar.c_str(), "0", 1); + } + } + + unsigned m_numComponents; + unsigned m_bucketCapacity; + unsigned m_numBlocks; + unsigned m_numFields; + unsigned m_numStreams; + unsigned m_multiplier; + + bool m_defaultLaunchBlockingEnvVarSet; + int m_defaultLaunchBlockingEnvVarValue; + + std::vector*> m_fields; + const std::string m_launchBlockingEnvVar = "CUDA_LAUNCH_BLOCKING"; +}; + +TEST_F(NgpAsyncDeepCopyFixture, TwoStreamsAsyncSyncToDevice) +{ + if (get_parallel_size() != 1) GTEST_SKIP(); + + unsigned numBlocks = 2; + unsigned numStreams = 2; + unsigned multiplier = 5; + + setup_test(numBlocks, numStreams, multiplier); + setup_multi_block_mesh_with_field_per_block(); + std::vector> execSpaces = get_execution_spaces_with_streams(numStreams); + + setup_field_data_on_host(); + + sync_fields_to_device_async(execSpaces); + + stk::mesh::ngp_field_fence(get_meta()); + + for(auto field : get_fields()) { + compare_device_data_to_init_data(field); + } +} + +TEST_F(NgpAsyncDeepCopyFixture, TwoStreamsAsyncSyncToHostFenceAll) +{ + if (get_parallel_size() != 1) GTEST_SKIP(); + + unsigned numBlocks = 2; + unsigned numStreams = 2; + unsigned multiplier = 5; + + setup_test(numBlocks, numStreams, multiplier); + + setup_multi_block_mesh_with_field_per_block(); + std::vector> execSpaces = get_execution_spaces_with_streams(numStreams); + + setup_field_data_on_device(); + + sync_fields_to_host_async(execSpaces); + + stk::mesh::ngp_field_fence(get_meta()); + + for(auto field : get_fields()) { + compare_host_data_to_modified_data(field); + } +} + +TEST_F(NgpAsyncDeepCopyFixture, TwoStreamsAsyncSyncToHostFenceEach) +{ + if (get_parallel_size() != 1) GTEST_SKIP(); + + unsigned numBlocks = 2; + unsigned numStreams = 2; + unsigned multiplier = 5; + + setup_test(numBlocks, numStreams, multiplier); + + setup_multi_block_mesh_with_field_per_block(); + std::vector> execSpaces = get_execution_spaces_with_streams(numStreams); + + setup_field_data_on_device(); + + sync_fields_to_host_async(execSpaces); + + for(auto field : get_fields()) { + auto& ngpField = stk::mesh::get_updated_ngp_field(*field); + ngpField.fence(); + compare_host_data_to_modified_data(field); + } +} + +TEST_F(NgpAsyncDeepCopyFixture, TwoStreamsAsyncSync) +{ + if (get_parallel_size() != 1) GTEST_SKIP(); + + unsigned numBlocks = 3; + unsigned numStreams = 3; + unsigned multiplier = 5; + + setup_test(numBlocks, numStreams, multiplier); + + setup_multi_block_mesh_with_field_per_block(); + std::vector> execSpaces = get_execution_spaces_with_streams(numStreams); + + setup_field_data_on_host(); + + sync_fields_to_device_async(execSpaces); + sync_fields_to_host_async(execSpaces); + + stk::mesh::ngp_field_fence(get_meta()); + + for(auto field : get_fields()) { + compare_host_data_to_modified_data(field); + } +} + +TEST_F(NgpAsyncDeepCopyFixture, AsyncSyncUsingSameStreams) +{ + if (get_parallel_size() != 1) GTEST_SKIP(); + + unsigned numBlocks = 3; + unsigned numStreams = 1; + unsigned multiplier = 5; + + setup_test(numBlocks, numStreams, multiplier); + + 
setup_multi_block_mesh_with_field_per_block(); + std::vector> execSpaces = get_execution_spaces_with_streams(numStreams); + + setup_field_data_on_host(); + + sync_fields_to_device_async(execSpaces); + sync_fields_to_host_async(execSpaces); + + stk::mesh::ngp_field_fence(get_meta()); + + for(auto field : get_fields()) { + compare_device_data_to_init_data(field); + } +} + +TEST_F(NgpAsyncDeepCopyFixture, AsyncSyncToDeviceThenMeshMod) +{ + if (get_parallel_size() != 1) GTEST_SKIP(); + + unsigned numBlocks = 3; + unsigned numStreams = 2; + unsigned multiplier = 5; + + setup_test(numBlocks, numStreams, multiplier); + + setup_multi_block_mesh_with_field_per_block(); + std::vector> execSpaces = get_execution_spaces_with_streams(numStreams); + + setup_field_data_on_host(); + + sync_fields_to_device_async(execSpaces); + + stk::mesh::ngp_field_fence(get_meta()); + + stk::mesh::Part& newPart = get_meta().declare_part("testPart", stk::topology::ELEMENT_RANK); + stk::mesh::PartVector addParts(1, &newPart); + + add_parts_to_all_blocks(addParts); + + for(auto field : get_fields()) { + compare_device_data_to_init_data(field); + } +} + +TEST_F(NgpAsyncDeepCopyFixture, AsyncCopyFollowedBySyncCopy) +{ + if (get_parallel_size() != 1) GTEST_SKIP(); + + unsigned numBlocks = 3; + unsigned numStreams = 3; + unsigned multiplier = 5; + + setup_test(numBlocks, numStreams, multiplier); + + setup_multi_block_mesh_with_field_per_block(); + std::vector> execSpaces = get_execution_spaces_with_streams(numStreams); + + setup_field_data_on_host(); + + sync_fields_to_device_async(execSpaces); + + stk::mesh::ngp_field_fence(get_meta()); + + sync_fields_to_host(); + + for(auto field : get_fields()) { + compare_host_data_to_modified_data(field); + } +} + +TEST_F(NgpAsyncDeepCopyFixture, AsyncCopyFollowedByGetUpdatedNgpField) +{ + if (get_parallel_size() != 1) GTEST_SKIP(); + + unsigned numBlocks = 3; + unsigned numStreams = 3; + unsigned multiplier = 5; + + setup_test(numBlocks, numStreams, multiplier); + + setup_multi_block_mesh_with_field_per_block(); + std::vector> execSpaces = get_execution_spaces_with_streams(numStreams); + + setup_field_data_on_host(); + + sync_fields_to_device_async(execSpaces); + + for(auto field : get_fields()) { + stk::mesh::get_updated_ngp_field(*field); + } + sync_fields_to_host(); + + for(auto field : get_fields()) { + compare_host_data_to_modified_data(field); + } +} + +TEST_F(NgpAsyncDeepCopyFixture, AsyncSyncToHostFollowedByMeshMod) +{ + if (get_parallel_size() != 1) GTEST_SKIP(); + + unsigned numBlocks = 3; + unsigned numStreams = 3; + unsigned multiplier = 5; + + setup_test(numBlocks, numStreams, multiplier); + + setup_multi_block_mesh_with_field_per_block(); + std::vector> execSpaces = get_execution_spaces_with_streams(numStreams); + + setup_field_data_on_device(); + + sync_fields_to_host_async(execSpaces); + + stk::mesh::ngp_field_fence(get_meta()); + + stk::mesh::Part& newPart = get_meta().declare_part("testPart", stk::topology::ELEMENT_RANK); + stk::mesh::PartVector addParts(1, &newPart); + + add_parts_to_all_blocks(addParts); + + for(auto field : get_fields()) { + compare_host_data_to_modified_data(field); + } +} + +TEST_F(NgpAsyncDeepCopyFixture, AsyncSyncToHostFollowedByDataModOnHostThenGetUpdatedNgpField) +{ + if (get_parallel_size() != 1) GTEST_SKIP(); + + unsigned numBlocks = 1; + unsigned numStreams = 1; + unsigned multiplier = 5; + + setup_test(numBlocks, numStreams, multiplier); + + setup_multi_block_mesh_with_field_per_block(); + std::vector> execSpaces = 
get_execution_spaces_with_streams(numStreams); + + setup_field_data_on_device(); + + sync_fields_to_host_async(execSpaces); + + stk::mesh::ngp_field_fence(get_meta()); + + unsigned scale = 3; + + for(auto field : get_fields()) { + set_element_field_data(*field, stk::mesh::Selector(*field), multiplier*scale); + } + + for(auto field : get_fields()) { + compare_host_data_to_modified_data(field, scale); + } + + stk::mesh::ngp_field_fence(get_meta()); + + for(auto field : get_fields()) { + compare_host_data_to_modified_data(field, scale); + } +} + +TEST_F(NgpAsyncDeepCopyFixture, TEST_ONLY_ON_CUDA(ThreeStreamsAsyncPartialSyncToDevice)) +{ + if (get_parallel_size() != 1) GTEST_SKIP(); + + unsigned numBlocks = 3; + unsigned numStreams = 3; + unsigned multiplier = 5; + + setup_test(numBlocks, numStreams, multiplier); + setup_multi_block_mesh_with_fields_on_all_blocks(); + std::vector> execSpaces = get_execution_spaces_with_streams(numStreams); + auto selector = get_block_selector_for_partial_sync(); + + stk::mesh::Selector allBlockSelector = get_meta().universal_part(); + setup_selected_field_data_on_host(allBlockSelector); + + sync_fields_to_device_async(execSpaces, selector); + + stk::mesh::ngp_field_fence(get_meta()); + + for(auto field : get_fields()) { + test_partial_copy_to_device_result(field, selector); + } +} + +TEST_F(NgpAsyncDeepCopyFixture, TEST_ONLY_ON_CUDA(FourStreamsAsyncPartialSyncToDevice_MeshModAfterPartialSync)) +{ + if (get_parallel_size() != 1) GTEST_SKIP(); + + unsigned numBlocks = 4; + unsigned numStreams = 4; + unsigned multiplier = 5; + + setup_test(numBlocks, numStreams, multiplier); + setup_multi_block_mesh_with_fields_on_all_blocks(); + std::vector> execSpaces = get_execution_spaces_with_streams(numStreams); + const stk::mesh::PartVector& parts = get_parts(); + auto block1Selector = stk::mesh::Selector(*parts[0]); + auto block2Selector = stk::mesh::Selector(*parts[1]); + auto block3Selector = stk::mesh::Selector(*parts[2]); + auto block4Selector = stk::mesh::Selector(*parts[3]); + auto allBlockSelector = stk::mesh::Selector(get_meta().universal_part()); + + setup_selected_field_data_on_host(allBlockSelector); + + sync_fields_to_device_async(execSpaces, block1Selector); + + stk::mesh::ngp_field_fence(get_meta()); + + for(auto field : get_fields()) { + test_partial_copy_to_device_result(field, block1Selector); + } + + stk::mesh::PartVector addParts(1, parts[1]); + stk::mesh::PartVector removeParts(1, parts[3]); + + stk::mesh::Selector block1And2Selector = block1Selector | block2Selector; + change_parts_on_selected_blocks(addParts, removeParts, block4Selector); + + for(auto field : get_fields()) { + stk::mesh::get_updated_ngp_field(*field); + } + + for(auto field : get_fields()) { + test_partial_copy_to_device_result(field, block1And2Selector); + } +} + +TEST_F(NgpAsyncDeepCopyFixture, TEST_ONLY_ON_CUDA(ThreeStreamsAsyncPartialSyncToHost)) +{ + if (get_parallel_size() != 1) GTEST_SKIP(); + + unsigned numBlocks = 3; + unsigned numStreams = 3; + unsigned multiplier = 5; + + setup_test(numBlocks, numStreams, multiplier); + setup_multi_block_mesh_with_fields_on_all_blocks(); + std::vector> execSpaces = get_execution_spaces_with_streams(numStreams); + auto selector = get_block_selector_for_partial_sync(); + + stk::mesh::Selector allBlockSelector = get_meta().universal_part(); + setup_selected_field_data_on_device(allBlockSelector); + + sync_fields_to_host_async(execSpaces, selector); + + stk::mesh::ngp_field_fence(get_meta()); + + for(auto field : get_fields()) { + 
test_partial_copy_to_host_result(field, selector); + } +} + +TEST_F(NgpAsyncDeepCopyFixture, TEST_ONLY_ON_CUDA(FourStreamsAsyncPartialSyncToHost_MeshModAfterPartialSync)) +{ + if (get_parallel_size() != 1) GTEST_SKIP(); + + unsigned numBlocks = 4; + unsigned numStreams = 4; + unsigned multiplier = 5; + + setup_test(numBlocks, numStreams, multiplier); + setup_multi_block_mesh_with_fields_on_all_blocks(); + std::vector> execSpaces = get_execution_spaces_with_streams(numStreams); + const stk::mesh::PartVector& parts = get_parts(); + auto block1Selector = stk::mesh::Selector(*parts[0]); + auto block2Selector = stk::mesh::Selector(*parts[1]); + auto block3Selector = stk::mesh::Selector(*parts[2]); + auto block4Selector = stk::mesh::Selector(*parts[3]); + auto allBlockSelector = stk::mesh::Selector(get_meta().universal_part()); + + setup_selected_field_data_on_device(allBlockSelector); + + sync_fields_to_host_async(execSpaces, block1Selector); + + stk::mesh::ngp_field_fence(get_meta()); + + for(auto field : get_fields()) { + test_partial_copy_to_host_result(field, block1Selector); + } + + stk::mesh::PartVector addParts(1, parts[1]); + stk::mesh::PartVector removeParts(1, parts[3]); + + stk::mesh::Selector block1And2Selector = block1Selector | block2Selector; + change_parts_on_selected_blocks(addParts, removeParts, block4Selector); + + for(auto field : get_fields()) { + stk::mesh::get_updated_ngp_field(*field); + } + + for(auto field : get_fields()) { + test_partial_copy_to_host_result(field, block1Selector); + } +} + +TEST_F(NgpAsyncDeepCopyFixture, TEST_ONLY_ON_CUDA(FourStreamsAsyncPartialSyncToDeviceThenPartialSyncToHost)) +{ + if (get_parallel_size() != 1) GTEST_SKIP(); + + unsigned numBlocks = 4; + unsigned numStreams = 4; + unsigned multiplier = 5; + + setup_test(numBlocks, numStreams, multiplier); + setup_multi_block_mesh_with_fields_on_all_blocks(); + std::vector> execSpaces = get_execution_spaces_with_streams(numStreams); + const stk::mesh::PartVector& parts = get_parts(); + auto block1Selector = stk::mesh::Selector(*parts[0]); + auto block2Selector = stk::mesh::Selector(*parts[1]); + auto block3Selector = stk::mesh::Selector(*parts[2]); + auto block4Selector = stk::mesh::Selector(*parts[3]); + auto allBlockSelector = stk::mesh::Selector(get_meta().universal_part()); + + setup_selected_field_data_on_host(allBlockSelector); + + auto block1And3Selector = block1Selector | block3Selector; + sync_fields_to_device_async(execSpaces, block1And3Selector); + + stk::mesh::ngp_field_fence(get_meta()); + + for(auto field : get_fields()) { + test_partial_copy_to_device_result(field, block1And3Selector); + } + + auto block2And4Selector = block2Selector | block4Selector; + sync_fields_to_host_async(execSpaces, block2And4Selector); + + stk::mesh::ngp_field_fence(get_meta()); + + for(auto field : get_fields()) { + test_partial_copy_to_host_result(field, block1And3Selector); + } +} + +TEST_F(NgpAsyncDeepCopyFixture, AsyncGetUpdatedNgpField) +{ + if (get_parallel_size() != 1) GTEST_SKIP(); + + unsigned numBlocks = 2; + unsigned numStreams = 2; + unsigned multiplier = 5; + + setup_test(numBlocks, numStreams, multiplier); + setup_multi_block_mesh_with_fields_on_all_blocks(); + std::vector> execSpaces = get_execution_spaces_with_streams(numStreams); + const stk::mesh::PartVector& parts = get_parts(); + auto block1Selector = stk::mesh::Selector(*parts[0]); + auto block2Selector = stk::mesh::Selector(*parts[1]); + auto allBlockSelector = stk::mesh::Selector(get_meta().universal_part()); + + stk::mesh::PartVector 
addParts(1, parts[0]); + stk::mesh::PartVector removeParts(1, parts[1]); + + change_parts_on_selected_blocks(addParts, removeParts, block2Selector); + + setup_field_data_on_host(); + + auto fields = get_fields(); + for(unsigned i = 0; i < fields.size(); i++) { + auto& ngpField = stk::mesh::get_updated_ngp_field_async(*fields[i], execSpaces[i % execSpaces.size()]); + ngpField.modify_on_host(); + ngpField.sync_to_device(execSpaces[i % execSpaces.size()]); + } + + stk::mesh::ngp_field_fence(get_meta()); + + for(auto field : fields) { + test_partial_copy_to_device_result(field, allBlockSelector); + } +} \ No newline at end of file diff --git a/packages/stk/stk_unit_tests/stk_mesh/ngp/NgpUnitTestUtils.hpp b/packages/stk/stk_unit_tests/stk_mesh/ngp/NgpUnitTestUtils.hpp index cdce8581f61c..baf3b1404e4a 100644 --- a/packages/stk/stk_unit_tests/stk_mesh/ngp/NgpUnitTestUtils.hpp +++ b/packages/stk/stk_unit_tests/stk_mesh/ngp/NgpUnitTestUtils.hpp @@ -9,6 +9,7 @@ #include #include #include +#include #include #include @@ -126,7 +127,6 @@ inline void check_bucket_layout(const stk::mesh::BulkData& bulk, const std::vect }); } - } // ngp_unit_test_utils #endif diff --git a/packages/stk/stk_unit_tests/stk_mesh/ngp/UnitTestNgpDebugFieldSync_PartialSync.cpp b/packages/stk/stk_unit_tests/stk_mesh/ngp/UnitTestNgpDebugFieldSync_PartialSync.cpp index 58899ae027cc..e484bb8d2708 100644 --- a/packages/stk/stk_unit_tests/stk_mesh/ngp/UnitTestNgpDebugFieldSync_PartialSync.cpp +++ b/packages/stk/stk_unit_tests/stk_mesh/ngp/UnitTestNgpDebugFieldSync_PartialSync.cpp @@ -496,7 +496,7 @@ TEST_F(NgpDebugFieldSync_PartialSync, HostToDevice_Scalar_WriteAll_SyncOverlappi } -TEST_F(NgpDebugFieldSync_PartialSync, DeviceToHost_Scalar_WriteAll_SyncSelector_ReadAll_WarnOutsideSelector) +TEST_F(NgpDebugFieldSync_PartialSync, DISABLED_DeviceToHost_Scalar_WriteAll_SyncSelector_ReadAll_WarnOutsideSelector) { if (stk::parallel_machine_size(MPI_COMM_WORLD) != 1) return; create_parts({"Part1", "Part2"}); @@ -518,7 +518,7 @@ TEST_F(NgpDebugFieldSync_PartialSync, DeviceToHost_Scalar_WriteAll_SyncSelector_ check_no_warnings(stdoutString); } -TEST_F(NgpDebugFieldSync_PartialSync, DeviceToHost_Scalar_WriteAll_SyncTwoSelectors_ReadAll_WarnOutsideSelectors) +TEST_F(NgpDebugFieldSync_PartialSync, DISABLED_DeviceToHost_Scalar_WriteAll_SyncTwoSelectors_ReadAll_WarnOutsideSelectors) { if (stk::parallel_machine_size(MPI_COMM_WORLD) != 1) return; create_parts({"Part1", "Part2", "Part3"}); @@ -687,7 +687,7 @@ TEST_F(NgpDebugFieldSync_PartialSync, DeviceToHost_Scalar_WriteSelector_SyncNoth check_no_warnings(stdoutString); } -TEST_F(NgpDebugFieldSync_PartialSync, DeviceToHost_Scalar_WriteOutsideSelector_SyncSelector_ReadAll_WarnOutsideSelector) +TEST_F(NgpDebugFieldSync_PartialSync, DISABLED_DeviceToHost_Scalar_WriteOutsideSelector_SyncSelector_ReadAll_WarnOutsideSelector) { if (stk::parallel_machine_size(MPI_COMM_WORLD) != 1) return; create_parts({"Part1", "Part2"}); @@ -709,7 +709,7 @@ TEST_F(NgpDebugFieldSync_PartialSync, DeviceToHost_Scalar_WriteOutsideSelector_S check_no_warnings(stdoutString); } -TEST_F(NgpDebugFieldSync_PartialSync, DeviceToHost_Vector_WriteOutsideSelector_SyncSelector_ReadAll_WarnOutsideSelector) +TEST_F(NgpDebugFieldSync_PartialSync, DISABLED_DeviceToHost_Vector_WriteOutsideSelector_SyncSelector_ReadAll_WarnOutsideSelector) { if (stk::parallel_machine_size(MPI_COMM_WORLD) != 1) return; create_parts({"Part1", "Part2"}); @@ -736,7 +736,7 @@ TEST_F(NgpDebugFieldSync_PartialSync, DeviceToHost_Vector_WriteOutsideSelector_S 
check_no_warnings(stdoutString); } -TEST_F(NgpDebugFieldSync_PartialSync, DeviceToHost_ScalarPartial_WriteOutsideSelector_SyncSelector_ReadAll_WarnOutsideSelector) +TEST_F(NgpDebugFieldSync_PartialSync, DISABLED_DeviceToHost_ScalarPartial_WriteOutsideSelector_SyncSelector_ReadAll_WarnOutsideSelector) { if (stk::parallel_machine_size(MPI_COMM_WORLD) != 1) return; create_parts({"Part1", "Part2", "Part3"}); @@ -839,7 +839,7 @@ TEST_F(NgpDebugFieldSync_PartialSync, DeviceToHost_Scalar_WriteAll_SyncNothing_R check_no_warnings(stdoutString); } -TEST_F(NgpDebugFieldSync_PartialSync, DeviceToHost_Scalar_WriteAll_SyncSelector_ReadOutsideSelector_WarnOutsideSelector) +TEST_F(NgpDebugFieldSync_PartialSync, DISABLED_DeviceToHost_Scalar_WriteAll_SyncSelector_ReadOutsideSelector_WarnOutsideSelector) { if (stk::parallel_machine_size(MPI_COMM_WORLD) != 1) return; create_parts({"Part1", "Part2"}); @@ -861,7 +861,7 @@ TEST_F(NgpDebugFieldSync_PartialSync, DeviceToHost_Scalar_WriteAll_SyncSelector_ check_no_warnings(stdoutString); } -TEST_F(NgpDebugFieldSync_PartialSync, DeviceToHost_ScalarPartial_WriteAll_SyncSelector_ReadOutsideSelector_WarnOutsideSelector) +TEST_F(NgpDebugFieldSync_PartialSync, DISABLED_DeviceToHost_ScalarPartial_WriteAll_SyncSelector_ReadOutsideSelector_WarnOutsideSelector) { if (stk::parallel_machine_size(MPI_COMM_WORLD) != 1) return; create_parts({"Part1", "Part2", "Part3"}); @@ -906,7 +906,7 @@ TEST_F(NgpDebugFieldSync_PartialSync, HostToDevice_ScalarPartial_SyncOutsideSele check_no_warnings(stdoutString); } -TEST_F(NgpDebugFieldSync_PartialSync, DeviceToHost_ScalarPartial_SyncOutsideSelector_WarnOutsideSelector) +TEST_F(NgpDebugFieldSync_PartialSync, DISABLED_DeviceToHost_ScalarPartial_SyncOutsideSelector_WarnOutsideSelector) { if (stk::parallel_machine_size(MPI_COMM_WORLD) != 1) return; create_parts({"Part1", "Part2", "Part3"}); @@ -929,7 +929,7 @@ TEST_F(NgpDebugFieldSync_PartialSync, DeviceToHost_ScalarPartial_SyncOutsideSele check_no_warnings(stdoutString); } -TEST_F(NgpDebugFieldSync_PartialSync, DeviceToHost_Vector_WriteAll_SyncSelector_ReadOutsideSelector_WarnOutsideSelector) +TEST_F(NgpDebugFieldSync_PartialSync, DISABLED_DeviceToHost_Vector_WriteAll_SyncSelector_ReadOutsideSelector_WarnOutsideSelector) { if (stk::parallel_machine_size(MPI_COMM_WORLD) != 1) return; create_parts({"Part1", "Part2"}); @@ -956,7 +956,7 @@ TEST_F(NgpDebugFieldSync_PartialSync, DeviceToHost_Vector_WriteAll_SyncSelector_ check_no_warnings(stdoutString); } -TEST_F(NgpDebugFieldSync_PartialSync, DeviceToHost_Scalar_WriteAll_SyncMultipleSelectors_ReadAll_WarnOutsideSelectors) +TEST_F(NgpDebugFieldSync_PartialSync, DISABLED_DeviceToHost_Scalar_WriteAll_SyncMultipleSelectors_ReadAll_WarnOutsideSelectors) { if (stk::parallel_machine_size(MPI_COMM_WORLD) != 1) return; create_parts({"Part1", "Part2", "Part3"}); @@ -980,7 +980,7 @@ TEST_F(NgpDebugFieldSync_PartialSync, DeviceToHost_Scalar_WriteAll_SyncMultipleS check_no_warnings(stdoutString); } -TEST_F(NgpDebugFieldSync_PartialSync, DeviceToHost_Scalar_WriteAll_SyncOverlappingSelectors_ReadAll_WarnOutsideSelectors) +TEST_F(NgpDebugFieldSync_PartialSync, DISABLED_DeviceToHost_Scalar_WriteAll_SyncOverlappingSelectors_ReadAll_WarnOutsideSelectors) { if (stk::parallel_machine_size(MPI_COMM_WORLD) != 1) return; create_parts({"Part1", "Part2", "Part3"}); diff --git a/packages/stk/stk_unit_tests/stk_mesh/ngp/ngpFieldTest.cpp b/packages/stk/stk_unit_tests/stk_mesh/ngp/ngpFieldTest.cpp index a781d784d34d..fddad1825844 100644 --- 
a/packages/stk/stk_unit_tests/stk_mesh/ngp/ngpFieldTest.cpp +++ b/packages/stk/stk_unit_tests/stk_mesh/ngp/ngpFieldTest.cpp @@ -1,3 +1,37 @@ +// Copyright 2002 - 2008, 2010, 2011 National Technology Engineering +// Solutions of Sandia, LLC (NTESS). Under the terms of Contract +// DE-NA0003525 with NTESS, the U.S. Government retains certain rights +// in this software. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// * Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// +// * Redistributions in binary form must reproduce the above +// copyright notice, this list of conditions and the following +// disclaimer in the documentation and/or other materials provided +// with the distribution. +// +// * Neither the name of NTESS nor the names of its contributors +// may be used to endorse or promote products derived from this +// software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// + #include #include #include @@ -17,9 +51,11 @@ #include #include #include +#include #include "NgpUnitTestUtils.hpp" #include #include +#include namespace ngp_field_test { diff --git a/packages/stk/stk_util/stk_util/registry/ProductRegistry.cpp b/packages/stk/stk_util/stk_util/registry/ProductRegistry.cpp index 80b50f1276a4..451f1167dd72 100644 --- a/packages/stk/stk_util/stk_util/registry/ProductRegistry.cpp +++ b/packages/stk/stk_util/stk_util/registry/ProductRegistry.cpp @@ -42,7 +42,7 @@ //In Sierra, STK_VERSION_STRING is provided on the compile line by bake. //For Trilinos stk snapshots, the following macro definition gets populated with //the real version string by the trilinos_snapshot.sh script. 
-#define STK_VERSION_STRING "5.1.3-550-g5afc1675" +#define STK_VERSION_STRING "5.1.4-66-g8a819459" #endif namespace stk { diff --git a/packages/tpetra/classic/NodeAPI/KokkosCompat_ClassicNodeAPI_Wrapper.cpp b/packages/tpetra/classic/NodeAPI/KokkosCompat_ClassicNodeAPI_Wrapper.cpp index 3e0d7732d19c..bf4e3ff6dbb2 100644 --- a/packages/tpetra/classic/NodeAPI/KokkosCompat_ClassicNodeAPI_Wrapper.cpp +++ b/packages/tpetra/classic/NodeAPI/KokkosCompat_ClassicNodeAPI_Wrapper.cpp @@ -35,7 +35,7 @@ namespace Kokkos { #ifdef KOKKOS_ENABLE_HIP template<> - std::string KokkosDeviceWrapperNode::name() { + std::string KokkosDeviceWrapperNode::name() { return std::string("HIP/Wrapper"); } #endif // KOKKOS_ENABLE_HIP diff --git a/packages/tpetra/classic/NodeAPI/KokkosCompat_ClassicNodeAPI_Wrapper.hpp b/packages/tpetra/classic/NodeAPI/KokkosCompat_ClassicNodeAPI_Wrapper.hpp index 811c0217b1ca..cf6ca5f6f8af 100644 --- a/packages/tpetra/classic/NodeAPI/KokkosCompat_ClassicNodeAPI_Wrapper.hpp +++ b/packages/tpetra/classic/NodeAPI/KokkosCompat_ClassicNodeAPI_Wrapper.hpp @@ -54,7 +54,7 @@ class KokkosDeviceWrapperNode { #endif #ifdef KOKKOS_ENABLE_HIP - typedef KokkosDeviceWrapperNode KokkosHIPWrapperNode; + typedef KokkosDeviceWrapperNode KokkosHIPWrapperNode; #endif #ifdef KOKKOS_ENABLE_CUDA diff --git a/packages/tpetra/core/ext/TpetraExt_MatrixMatrix_HIP.hpp b/packages/tpetra/core/ext/TpetraExt_MatrixMatrix_HIP.hpp index a6e8df0b8f56..4447328af238 100644 --- a/packages/tpetra/core/ext/TpetraExt_MatrixMatrix_HIP.hpp +++ b/packages/tpetra/core/ext/TpetraExt_MatrixMatrix_HIP.hpp @@ -179,20 +179,24 @@ void KernelWrappers KernelHandle; // Grab the Kokkos::SparseCrsMatrices - const KCRS & Amat = Aview.origMatrix->getLocalMatrix(); - const KCRS & Bmat = Bview.origMatrix->getLocalMatrix(); + const KCRS & Amat = Aview.origMatrix->getLocalMatrixDevice(); + const KCRS & Bmat = Bview.origMatrix->getLocalMatrixDevice(); - c_lno_view_t Arowptr = Amat.graph.row_map, Browptr = Bmat.graph.row_map; - const lno_nnz_view_t Acolind = Amat.graph.entries, Bcolind = Bmat.graph.entries; - const scalar_view_t Avals = Amat.values, Bvals = Bmat.values; + c_lno_view_t Arowptr = Amat.graph.row_map, + Browptr = Bmat.graph.row_map; + const lno_nnz_view_t Acolind = Amat.graph.entries, + Bcolind = Bmat.graph.entries; + const scalar_view_t Avals = Amat.values, + Bvals = Bmat.values; c_lno_view_t Irowptr; lno_nnz_view_t Icolind; scalar_view_t Ivals; if(!Bview.importMatrix.is_null()) { - Irowptr = Bview.importMatrix->getLocalMatrix().graph.row_map; - Icolind = Bview.importMatrix->getLocalMatrix().graph.entries; - Ivals = Bview.importMatrix->getLocalMatrix().values; + auto lclB = Bview.importMatrix->getLocalMatrixDevice(); + Irowptr = lclB.graph.row_map; + Icolind = lclB.graph.entries; + Ivals = lclB.values; } @@ -260,10 +264,10 @@ template void KernelWrappers::mult_A_B_reuse_kernel_wrapper(CrsMatrixStruct& Aview, CrsMatrixStruct& Bview, - const LocalOrdinalViewType & targetMapToOrigRow, - const LocalOrdinalViewType & targetMapToImportRow, - const LocalOrdinalViewType & Bcol2Ccol, - const LocalOrdinalViewType & Icol2Ccol, + const LocalOrdinalViewType & targetMapToOrigRow_dev, + const LocalOrdinalViewType & targetMapToImportRow_dev, + const LocalOrdinalViewType & Bcol2Ccol_dev, + const LocalOrdinalViewType & Icol2Ccol_dev, CrsMatrix& C, Teuchos::RCP > Cimport, const std::string& label, @@ -283,7 +287,7 @@ void KernelWrappers::local_matrix_type KCRS; + typedef typename Tpetra::CrsMatrix::local_matrix_host_type KCRS; typedef typename 
KCRS::StaticCrsGraphType graph_t; typedef typename graph_t::row_map_type::const_type c_lno_view_t; typedef typename graph_t::entries_type::non_const_type lno_nnz_view_t; @@ -298,8 +302,21 @@ void KernelWrappers::invalid(); const SC SC_ZERO = Teuchos::ScalarTraits::zero(); - // Since this is being run on HIP, we need to fence because the below host code will use UVM - typename graph_t::execution_space().fence(); + // KDDKDD UVM Without UVM, need to copy targetMap arrays to host. + // KDDKDD UVM Ideally, this function would run on device and use + // KDDKDD UVM KokkosKernels instead of this host implementation. + auto targetMapToOrigRow = + Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(), + targetMapToOrigRow_dev); + auto targetMapToImportRow = + Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(), + targetMapToImportRow_dev); + auto Bcol2Ccol = + Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(), + Bcol2Ccol_dev); + auto Icol2Ccol = + Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(), + Icol2Ccol_dev); // Sizes RCP Ccolmap = C.getColMap(); @@ -307,12 +324,16 @@ void KernelWrappersgetNodeNumElements(); // Grab the Kokkos::SparseCrsMatrices & inner stuff - const KCRS & Amat = Aview.origMatrix->getLocalMatrix(); - const KCRS & Bmat = Bview.origMatrix->getLocalMatrix(); - const KCRS & Cmat = C.getLocalMatrix(); - - c_lno_view_t Arowptr = Amat.graph.row_map, Browptr = Bmat.graph.row_map, Crowptr = Cmat.graph.row_map; - const lno_nnz_view_t Acolind = Amat.graph.entries, Bcolind = Bmat.graph.entries, Ccolind = Cmat.graph.entries; + const KCRS & Amat = Aview.origMatrix->getLocalMatrixHost(); + const KCRS & Bmat = Bview.origMatrix->getLocalMatrixHost(); + const KCRS & Cmat = C.getLocalMatrixHost(); + + c_lno_view_t Arowptr = Amat.graph.row_map, + Browptr = Bmat.graph.row_map, + Crowptr = Cmat.graph.row_map; + const lno_nnz_view_t Acolind = Amat.graph.entries, + Bcolind = Bmat.graph.entries, + Ccolind = Cmat.graph.entries; const scalar_view_t Avals = Amat.values, Bvals = Bmat.values; scalar_view_t Cvals = Cmat.values; @@ -320,9 +341,10 @@ void KernelWrappersgetLocalMatrix().graph.row_map; - Icolind = Bview.importMatrix->getLocalMatrix().graph.entries; - Ivals = Bview.importMatrix->getLocalMatrix().values; + auto lclB = Bview.importMatrix->getLocalMatrixHost(); + Irowptr = lclB.graph.row_map; + Icolind = lclB.graph.entries; + Ivals = lclB.values; } #ifdef HAVE_TPETRA_MMM_TIMINGS @@ -468,10 +490,10 @@ void KernelWrappers2 & Dinv, CrsMatrixStruct& Aview, CrsMatrixStruct& Bview, - const LocalOrdinalViewType & targetMapToOrigRow, - const LocalOrdinalViewType & targetMapToImportRow, - const LocalOrdinalViewType & Bcol2Ccol, - const LocalOrdinalViewType & Icol2Ccol, + const LocalOrdinalViewType & targetMapToOrigRow_dev, + const LocalOrdinalViewType & targetMapToImportRow_dev, + const LocalOrdinalViewType & Bcol2Ccol_dev, + const LocalOrdinalViewType & Icol2Ccol_dev, CrsMatrix& C, Teuchos::RCP > Cimport, const std::string& label, @@ -490,7 +512,7 @@ void KernelWrappers2::local_matrix_type KCRS; + typedef typename Tpetra::CrsMatrix::local_matrix_host_type KCRS; typedef typename KCRS::StaticCrsGraphType graph_t; typedef typename graph_t::row_map_type::const_type c_lno_view_t; typedef typename graph_t::entries_type::non_const_type lno_nnz_view_t; @@ -506,18 +528,32 @@ void KernelWrappers2::invalid(); const SC SC_ZERO = Teuchos::ScalarTraits::zero(); - // Since this is being run on HIP, we need to fence because the below host code will use UVM - typename 
graph_t::execution_space().fence(); - + // KDDKDD UVM Without UVM, need to copy targetMap arrays to host. + // KDDKDD UVM Ideally, this function would run on device and use + // KDDKDD UVM KokkosKernels instead of this host implementation. + auto targetMapToOrigRow = + Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(), + targetMapToOrigRow_dev); + auto targetMapToImportRow = + Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(), + targetMapToImportRow_dev); + auto Bcol2Ccol = + Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(), + Bcol2Ccol_dev); + auto Icol2Ccol = + Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(), + Icol2Ccol_dev); + + // Sizes RCP Ccolmap = C.getColMap(); size_t m = Aview.origMatrix->getNodeNumRows(); size_t n = Ccolmap->getNodeNumElements(); // Grab the Kokkos::SparseCrsMatrices & inner stuff - const KCRS & Amat = Aview.origMatrix->getLocalMatrix(); - const KCRS & Bmat = Bview.origMatrix->getLocalMatrix(); - const KCRS & Cmat = C.getLocalMatrix(); + const KCRS & Amat = Aview.origMatrix->getLocalMatrixHost(); + const KCRS & Bmat = Bview.origMatrix->getLocalMatrixHost(); + const KCRS & Cmat = C.getLocalMatrixHost(); c_lno_view_t Arowptr = Amat.graph.row_map, Browptr = Bmat.graph.row_map, Crowptr = Cmat.graph.row_map; const lno_nnz_view_t Acolind = Amat.graph.entries, Bcolind = Bmat.graph.entries, Ccolind = Cmat.graph.entries; @@ -528,9 +564,10 @@ void KernelWrappers2getLocalMatrix().graph.row_map; - Icolind = Bview.importMatrix->getLocalMatrix().graph.entries; - Ivals = Bview.importMatrix->getLocalMatrix().values; + auto lclB = Bview.importMatrix->getLocalMatrixHost(); + Irowptr = lclB.graph.row_map; + Icolind = lclB.graph.entries; + Ivals = lclB.values; } // Jacobi-specific inner stuff @@ -661,8 +698,8 @@ void KernelWrappers2::zero(), - std::runtime_error, + TEUCHOS_TEST_FOR_EXCEPTION(diagonal[i] == Teuchos::ScalarTraits::zero(), + std::runtime_error, "Matrix A has a zero/missing diagonal: " << diagonal[i] << std::endl << "KokkosKernels Jacobi-fused SpGEMM requires nonzero diagonal entries in A" << std::endl); } @@ -670,14 +707,14 @@ void KernelWrappers2::local_matrix_type; + using matrix_t = typename Tpetra::CrsMatrix::local_matrix_device_type; using graph_t = typename matrix_t::StaticCrsGraphType; using lno_view_t = typename graph_t::row_map_type::non_const_type; using c_lno_view_t = typename graph_t::row_map_type::const_type; using lno_nnz_view_t = typename graph_t::entries_type::non_const_type; using scalar_view_t = typename matrix_t::values_type::non_const_type; - // KokkosKernels handle + // KokkosKernels handle using handle_t = typename KokkosKernels::Experimental::KokkosKernelsHandle< typename lno_view_t::const_value_type,typename lno_nnz_view_t::const_value_type, typename scalar_view_t::const_value_type, typename device_t::execution_space, typename device_t::memory_space,typename device_t::memory_space >; @@ -687,17 +724,18 @@ void KernelWrappers2getLocalMatrix().graph.row_map; - Icolind = Bview.importMatrix->getLocalMatrix().graph.entries; - Ivals = Bview.importMatrix->getLocalMatrix().values; + auto lclB = Bview.importMatrix->getLocalMatrixDevice(); + Irowptr = lclB.graph.row_map; + Icolind = lclB.graph.entries; + Ivals = lclB.values; } // Merge the B and Bimport matrices const matrix_t Bmerged = Tpetra::MMdetails::merge_matrices(Aview,Bview,Acol2Brow,Acol2Irow,Bcol2Ccol,Icol2Ccol,C.getColMap()->getNodeNumElements()); // Get the properties and arrays of input matrices - const matrix_t & Amat = Aview.origMatrix->getLocalMatrix(); - const 
matrix_t & Bmat = Bview.origMatrix->getLocalMatrix();
+    const matrix_t & Amat = Aview.origMatrix->getLocalMatrixDevice();
+    const matrix_t & Bmat = Bview.origMatrix->getLocalMatrixDevice();
     typename handle_t::nnz_lno_t AnumRows = Amat.numRows();
     typename handle_t::nnz_lno_t BnumRows = Bmerged.numRows();
@@ -713,7 +751,7 @@ void KernelWrappers2isParameter("hip: algorithm"))
diff --git a/packages/tpetra/core/src/Tpetra_CrsMatrix_decl.hpp b/packages/tpetra/core/src/Tpetra_CrsMatrix_decl.hpp
index fca7491e1fce..4dfb6aaa2aaf 100644
--- a/packages/tpetra/core/src/Tpetra_CrsMatrix_decl.hpp
+++ b/packages/tpetra/core/src/Tpetra_CrsMatrix_decl.hpp
@@ -44,7 +44,7 @@
 /// \brief Declaration of the Tpetra::CrsMatrix class
 #include "Tpetra_CrsMatrix_fwd.hpp"
-#include "Tpetra_LocalCrsMatrixOperator_fwd.hpp"
+#include "Tpetra_LocalCrsMatrixOperator.hpp"
 #include "Tpetra_RowMatrix_decl.hpp"
 #include "Tpetra_Exceptions.hpp"
 #include "Tpetra_DistObject.hpp"
@@ -2457,6 +2457,16 @@ namespace Tpetra {
     values_wdv_type valuesUnpacked_wdv;
     mutable values_wdv_type valuesPacked_wdv;
+    using ordinal_rowptrs_type = typename local_multiply_op_type::ordinal_view_type;
+    /// \brief local_ordinal typed version of local matrix's rowptrs.
+    ///   This allows the LocalCrsMatrixOperator to have rowptrs and entries be the same type,
+    ///   so cuSPARSE SpMV (including merge-path) can be used for apply.
+    ///   This is allocated and populated lazily in getLocalMultiplyOperator(), only if all 4 conditions are met:
+    ///    - node_type is KokkosCudaWrapperNode
+    ///    - the cuSPARSE TPL is enabled
+    ///    - local_ordinal_type can represent getNodeNumEntries()
+    mutable ordinal_rowptrs_type ordinalRowptrs;
+
   public:
     using row_ptrs_device_view_type =
diff --git a/packages/tpetra/core/src/Tpetra_CrsMatrix_def.hpp b/packages/tpetra/core/src/Tpetra_CrsMatrix_def.hpp
index fb742c8bef7f..214c06e43378 100644
--- a/packages/tpetra/core/src/Tpetra_CrsMatrix_def.hpp
+++ b/packages/tpetra/core/src/Tpetra_CrsMatrix_def.hpp
@@ -1082,6 +1082,33 @@ namespace Tpetra {
   CrsMatrix::
   getLocalMultiplyOperator () const
   {
+    auto localMatrix = getLocalMatrixDevice();
+#ifdef HAVE_TPETRACORE_CUDA
+#ifdef KOKKOSKERNELS_ENABLE_TPL_CUSPARSE
+    if(this->getNodeNumEntries() <= size_t(Teuchos::OrdinalTraits::max()) &&
+       std::is_same::value)
+    {
+      if(this->ordinalRowptrs.data() == nullptr)
+      {
+        auto originalRowptrs = localMatrix.graph.row_map;
+        //create LocalOrdinal-typed copy of the local graph's rowptrs.
+        //This enables the LocalCrsMatrixOperator to use cuSPARSE SpMV.
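+        //(Descriptive note: the copy below is a one-time, device-side conversion of
+        //the size_type rowptrs into LocalOrdinal storage; later calls to
+        //getLocalMultiplyOperator() reuse this->ordinalRowptrs instead of rebuilding it.)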
+ this->ordinalRowptrs = ordinal_rowptrs_type( + Kokkos::ViewAllocateWithoutInitializing("CrsMatrix::ordinalRowptrs"), originalRowptrs.extent(0)); + auto ordinalRowptrs_ = this->ordinalRowptrs; //don't want to capture 'this' + Kokkos::parallel_for("CrsMatrix::getLocalMultiplyOperator::convertRowptrs", + Kokkos::RangePolicy(0, originalRowptrs.extent(0)), + KOKKOS_LAMBDA(LocalOrdinal i) + { + ordinalRowptrs_(i) = originalRowptrs(i); + }); + } + //return local operator using ordinalRowptrs + return std::make_shared( + std::make_shared(localMatrix), this->ordinalRowptrs); + } +#endif +#endif // KDDKDD NOT SURE WHY THIS MUST RETURN A SHARED_PTR return std::make_shared( std::make_shared( diff --git a/packages/tpetra/core/src/Tpetra_DistObject_decl.hpp b/packages/tpetra/core/src/Tpetra_DistObject_decl.hpp index 300e4ff6e49e..5bd91b8dbd00 100644 --- a/packages/tpetra/core/src/Tpetra_DistObject_decl.hpp +++ b/packages/tpetra/core/src/Tpetra_DistObject_decl.hpp @@ -515,6 +515,54 @@ namespace Tpetra { const CombineMode CM, const bool restrictedMode = false); + void + beginImport(const SrcDistObject& source, + const Import& importer, + const CombineMode CM, + const bool restrictedMode = false); + + void + beginExport(const SrcDistObject& source, + const Export& exporter, + const CombineMode CM, + const bool restrictedMode = false); + + void + beginImport(const SrcDistObject& source, + const Export& exporter, + const CombineMode CM, + const bool restrictedMode = false); + + void + beginExport(const SrcDistObject& source, + const Import& importer, + const CombineMode CM, + const bool restrictedMode = false); + + void + endImport(const SrcDistObject& source, + const Import& importer, + const CombineMode CM, + const bool restrictedMode = false); + + void + endExport(const SrcDistObject& source, + const Export& exporter, + const CombineMode CM, + const bool restrictedMode = false); + + void + endImport(const SrcDistObject& source, + const Export& exporter, + const CombineMode CM, + const bool restrictedMode = false); + + void + endExport(const SrcDistObject& source, + const Import& importer, + const CombineMode CM, + const bool restrictedMode = false); + //@} //! @name Attribute accessor methods //@{ @@ -721,22 +769,38 @@ namespace Tpetra { /// LID DualViews come from the Transfer object given to /// doTransfer. They are always sync'd on both host and /// device. Users must never attempt to modify or sync them. 
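
For orientation, a minimal usage sketch of the split transfer interface declared above (not part of the patch): it assumes two Tpetra::MultiVector objects x (source) and y (target) and an Import object importer whose Maps match them; x, y, importer, and do_unrelated_local_work are illustrative names only. The split lets a caller post the communication, overlap it with local work that does not touch y, and then block on completion; doImport() itself is now implemented as beginImport() followed immediately by endImport().

    // Sketch only: overlap an Import with unrelated local work.
    y.beginImport (x, importer, Tpetra::INSERT);   // post sends/receives
    do_unrelated_local_work ();                    // hypothetical local computation
    y.endImport (x, importer, Tpetra::INSERT);     // wait, then unpack and combine
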
- virtual void - doTransferNew (const SrcDistObject& src, - const CombineMode CM, - const size_t numSameIDs, - const Kokkos::DualView& permuteToLIDs, - const Kokkos::DualView& permuteFromLIDs, - const Kokkos::DualView& remoteLIDs, - const Kokkos::DualView& exportLIDs, - Distributor& distor, - const ReverseOption revOp, - const bool commOnHost, - const bool restrictedMode); + void beginTransfer(const SrcDistObject& src, + const ::Tpetra::Details::Transfer& transfer, + const char modeString[], + const ReverseOption revOp, + const CombineMode CM, + const bool restrictedMode); + + void endTransfer(const SrcDistObject& src, + const ::Tpetra::Details::Transfer& transfer, + const char modeString[], + const ReverseOption revOp, + const CombineMode CM, + const bool restrictedMode); + + void doPosts(Distributor& distor, + size_t constantNumPackets, + bool commOnHost, + ReverseOption revOp, + std::shared_ptr prefix); + + void doWaits(Distributor& distor, + ReverseOption revOp); + + void doPackAndPrepare(const SrcDistObject& src, + const Kokkos::DualView& exportLIDs, + size_t& constantNumPackets, + Distributor& distor); + + void doUnpackAndCombine(const Kokkos::DualView& remoteLIDs, + size_t constantNumPackets, + Distributor& distor, + CombineMode CM); /// \name Methods implemented by subclasses and used by doTransfer(). /// diff --git a/packages/tpetra/core/src/Tpetra_DistObject_def.hpp b/packages/tpetra/core/src/Tpetra_DistObject_def.hpp index 8a3ab66c8fb2..fca77903864f 100644 --- a/packages/tpetra/core/src/Tpetra_DistObject_def.hpp +++ b/packages/tpetra/core/src/Tpetra_DistObject_def.hpp @@ -293,8 +293,8 @@ namespace Tpetra { os << *prefix << "Start" << endl; std::cerr << os.str (); } - this->doTransfer (source, importer, modeString, DoForward, CM, - restrictedMode); + this->beginImport(source, importer, CM, restrictedMode); + this->endImport(source, importer, CM, restrictedMode); if (verbose) { std::ostringstream os; os << *prefix << "Done" << endl; @@ -325,8 +325,8 @@ namespace Tpetra { os << *prefix << "Start" << endl; std::cerr << os.str (); } - this->doTransfer (source, exporter, modeString, DoForward, CM, - restrictedMode); + this->beginExport(source, exporter, CM, restrictedMode); + this->endExport(source, exporter, CM, restrictedMode); if (verbose) { std::ostringstream os; os << *prefix << "Done" << endl; @@ -357,8 +357,8 @@ namespace Tpetra { os << *prefix << "Start" << endl; std::cerr << os.str (); } - this->doTransfer (source, exporter, modeString, DoReverse, CM, - restrictedMode); + this->beginImport(source, exporter, CM, restrictedMode); + this->endImport(source, exporter, CM, restrictedMode); if (verbose) { std::ostringstream os; os << *prefix << "Done" << endl; @@ -389,8 +389,256 @@ namespace Tpetra { os << *prefix << "Start" << endl; std::cerr << os.str (); } - this->doTransfer (source, importer, modeString, DoReverse, CM, - restrictedMode); + this->beginExport(source, importer, CM, restrictedMode); + this->endExport(source, importer, CM, restrictedMode); + if (verbose) { + std::ostringstream os; + os << *prefix << "Done" << endl; + std::cerr << os.str (); + } + } + + template + void + DistObject:: + beginImport(const SrcDistObject& source, + const Import& importer, + const CombineMode CM, + const bool restrictedMode) + { + using Details::Behavior; + using std::endl; + const char modeString[] = "doImport (forward mode)"; + + // mfh 18 Oct 2017: Set TPETRA_VERBOSE to true for copious debug + // output to std::cerr on every MPI process. 
This is unwise for + // runs with large numbers of MPI processes. + const bool verbose = Behavior::verbose("DistObject"); + std::unique_ptr prefix; + if (verbose) { + prefix = this->createPrefix("DistObject", modeString); + std::ostringstream os; + os << *prefix << "Start" << endl; + std::cerr << os.str (); + } + this->beginTransfer(source, importer, modeString, DoForward, CM, restrictedMode); + if (verbose) { + std::ostringstream os; + os << *prefix << "Done" << endl; + std::cerr << os.str (); + } + } + + template + void + DistObject:: + beginExport(const SrcDistObject& source, + const Export& exporter, + const CombineMode CM, + const bool restrictedMode) + { + using Details::Behavior; + using std::endl; + const char modeString[] = "doExport (forward mode)"; + + // mfh 18 Oct 2017: Set TPETRA_VERBOSE to true for copious debug + // output to std::cerr on every MPI process. This is unwise for + // runs with large numbers of MPI processes. + const bool verbose = Behavior::verbose("DistObject"); + std::unique_ptr prefix; + if (verbose) { + prefix = this->createPrefix("DistObject", modeString); + std::ostringstream os; + os << *prefix << "Start" << endl; + std::cerr << os.str (); + } + this->beginTransfer(source, exporter, modeString, DoForward, CM, restrictedMode); + if (verbose) { + std::ostringstream os; + os << *prefix << "Done" << endl; + std::cerr << os.str (); + } + } + + template + void + DistObject:: + beginImport(const SrcDistObject& source, + const Export& exporter, + const CombineMode CM, + const bool restrictedMode) + { + using Details::Behavior; + using std::endl; + const char modeString[] = "doImport (reverse mode)"; + + // mfh 18 Oct 2017: Set TPETRA_VERBOSE to true for copious debug + // output to std::cerr on every MPI process. This is unwise for + // runs with large numbers of MPI processes. + const bool verbose = Behavior::verbose("DistObject"); + std::unique_ptr prefix; + if (verbose) { + prefix = this->createPrefix("DistObject", modeString); + std::ostringstream os; + os << *prefix << "Start" << endl; + std::cerr << os.str (); + } + this->beginTransfer(source, exporter, modeString, DoReverse, CM, restrictedMode); + if (verbose) { + std::ostringstream os; + os << *prefix << "Done" << endl; + std::cerr << os.str (); + } + } + + template + void + DistObject:: + beginExport(const SrcDistObject& source, + const Import & importer, + const CombineMode CM, + const bool restrictedMode) + { + using Details::Behavior; + using std::endl; + const char modeString[] = "doExport (reverse mode)"; + + // mfh 18 Oct 2017: Set TPETRA_VERBOSE to true for copious debug + // output to std::cerr on every MPI process. This is unwise for + // runs with large numbers of MPI processes. + const bool verbose = Behavior::verbose("DistObject"); + std::unique_ptr prefix; + if (verbose) { + prefix = this->createPrefix("DistObject", modeString); + std::ostringstream os; + os << *prefix << "Start" << endl; + std::cerr << os.str (); + } + this->beginTransfer(source, importer, modeString, DoReverse, CM, restrictedMode); + if (verbose) { + std::ostringstream os; + os << *prefix << "Done" << endl; + std::cerr << os.str (); + } + } + + template + void + DistObject:: + endImport(const SrcDistObject& source, + const Import& importer, + const CombineMode CM, + const bool restrictedMode) + { + using Details::Behavior; + using std::endl; + const char modeString[] = "doImport (forward mode)"; + + // mfh 18 Oct 2017: Set TPETRA_VERBOSE to true for copious debug + // output to std::cerr on every MPI process. 
This is unwise for + // runs with large numbers of MPI processes. + const bool verbose = Behavior::verbose("DistObject"); + std::unique_ptr prefix; + if (verbose) { + prefix = this->createPrefix("DistObject", modeString); + std::ostringstream os; + os << *prefix << "Start" << endl; + std::cerr << os.str (); + } + this->endTransfer(source, importer, modeString, DoForward, CM, restrictedMode); + if (verbose) { + std::ostringstream os; + os << *prefix << "Done" << endl; + std::cerr << os.str (); + } + } + + template + void + DistObject:: + endExport(const SrcDistObject& source, + const Export& exporter, + const CombineMode CM, + const bool restrictedMode) + { + using Details::Behavior; + using std::endl; + const char modeString[] = "doExport (forward mode)"; + + // mfh 18 Oct 2017: Set TPETRA_VERBOSE to true for copious debug + // output to std::cerr on every MPI process. This is unwise for + // runs with large numbers of MPI processes. + const bool verbose = Behavior::verbose("DistObject"); + std::unique_ptr prefix; + if (verbose) { + prefix = this->createPrefix("DistObject", modeString); + std::ostringstream os; + os << *prefix << "Start" << endl; + std::cerr << os.str (); + } + this->endTransfer(source, exporter, modeString, DoForward, CM, restrictedMode); + if (verbose) { + std::ostringstream os; + os << *prefix << "Done" << endl; + std::cerr << os.str (); + } + } + + template + void + DistObject:: + endImport(const SrcDistObject& source, + const Export& exporter, + const CombineMode CM, + const bool restrictedMode) + { + using Details::Behavior; + using std::endl; + const char modeString[] = "doImport (reverse mode)"; + + // mfh 18 Oct 2017: Set TPETRA_VERBOSE to true for copious debug + // output to std::cerr on every MPI process. This is unwise for + // runs with large numbers of MPI processes. + const bool verbose = Behavior::verbose("DistObject"); + std::unique_ptr prefix; + if (verbose) { + prefix = this->createPrefix("DistObject", modeString); + std::ostringstream os; + os << *prefix << "Start" << endl; + std::cerr << os.str (); + } + this->endTransfer(source, exporter, modeString, DoReverse, CM, restrictedMode); + if (verbose) { + std::ostringstream os; + os << *prefix << "Done" << endl; + std::cerr << os.str (); + } + } + + template + void + DistObject:: + endExport(const SrcDistObject& source, + const Import & importer, + const CombineMode CM, + const bool restrictedMode) + { + using Details::Behavior; + using std::endl; + const char modeString[] = "doExport (reverse mode)"; + + // mfh 18 Oct 2017: Set TPETRA_VERBOSE to true for copious debug + // output to std::cerr on every MPI process. This is unwise for + // runs with large numbers of MPI processes. 
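+    // (TPETRA_VERBOSE is read through Tpetra::Details::Behavior; see the
+    // Behavior::verbose("DistObject") call below.)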
+ const bool verbose = Behavior::verbose("DistObject"); + std::unique_ptr prefix; + if (verbose) { + prefix = this->createPrefix("DistObject", modeString); + std::ostringstream os; + os << *prefix << "Start" << endl; + std::cerr << os.str (); + } + this->endTransfer(source, importer, modeString, DoReverse, CM, restrictedMode); if (verbose) { std::ostringstream os; os << *prefix << "Done" << endl; @@ -415,7 +663,468 @@ namespace Tpetra { template void DistObject:: - doTransfer (const SrcDistObject& src, + doTransfer (const SrcDistObject& src, + const ::Tpetra::Details::Transfer& transfer, + const char modeString[], + const ReverseOption revOp, + const CombineMode CM, + bool restrictedMode) + { + beginTransfer(src, transfer, modeString, revOp, CM, restrictedMode); + endTransfer(src, transfer, modeString, revOp, CM, restrictedMode); + } + + template + bool + DistObject:: + reallocImportsIfNeeded (const size_t newSize, + const bool verbose, + const std::string* prefix) + { + if (verbose) { + std::ostringstream os; + os << *prefix << "Realloc (if needed) imports_ from " + << imports_.extent (0) << " to " << newSize << std::endl; + std::cerr << os.str (); + } + using ::Tpetra::Details::reallocDualViewIfNeeded; + const bool reallocated = + reallocDualViewIfNeeded (this->imports_, newSize, "imports"); + if (verbose) { + std::ostringstream os; + os << *prefix << "Finished realloc'ing imports_" << std::endl; + std::cerr << os.str (); + } + return reallocated; + } + + template + bool + DistObject:: + reallocArraysForNumPacketsPerLid (const size_t numExportLIDs, + const size_t numImportLIDs) + { + using Details::Behavior; + using ::Tpetra::Details::dualViewStatusToString; + using ::Tpetra::Details::reallocDualViewIfNeeded; + using std::endl; + // If an array is already allocated, and if is at least + // tooBigFactor times bigger than it needs to be, free it and + // reallocate to the size we need, in order to save space. + // Otherwise, take subviews to reduce allocation size. + constexpr size_t tooBigFactor = 10; + + const bool verbose = Behavior::verbose("DistObject"); + std::unique_ptr prefix; + if (verbose) { + prefix = this->createPrefix("DistObject", + "reallocArraysForNumPacketsPerLid"); + std::ostringstream os; + os << *prefix + << "numExportLIDs: " << numExportLIDs + << ", numImportLIDs: " << numImportLIDs + << endl; + os << *prefix << "DualView status before:" << endl + << *prefix + << dualViewStatusToString (this->numExportPacketsPerLID_, + "numExportPacketsPerLID_") + << endl + << *prefix + << dualViewStatusToString (this->numImportPacketsPerLID_, + "numImportPacketsPerLID_") + << endl; + std::cerr << os.str (); + } + + // Reallocate numExportPacketsPerLID_ if needed. + const bool firstReallocated = + reallocDualViewIfNeeded (this->numExportPacketsPerLID_, + numExportLIDs, + "numExportPacketsPerLID", + tooBigFactor, + true); // need fence before, if realloc'ing + + // If we reallocated above, then we fenced after that + // reallocation. This means that we don't need to fence again, + // before the next reallocation. + const bool needFenceBeforeNextAlloc = ! 
firstReallocated; + const bool secondReallocated = + reallocDualViewIfNeeded (this->numImportPacketsPerLID_, + numImportLIDs, + "numImportPacketsPerLID", + tooBigFactor, + needFenceBeforeNextAlloc); + + if (verbose) { + std::ostringstream os; + os << *prefix << "DualView status after:" << endl + << *prefix << dualViewStatusToString (this->numExportPacketsPerLID_, + "numExportPacketsPerLID_") + << endl + << *prefix << dualViewStatusToString (this->numImportPacketsPerLID_, + "numImportPacketsPerLID_") + << endl; + std::cerr << os.str (); + } + + return firstReallocated || secondReallocated; + } + + template + void + DistObject:: + beginTransfer(const SrcDistObject& src, + const ::Tpetra::Details::Transfer& transfer, + const char modeString[], + const ReverseOption revOp, + const CombineMode CM, + bool restrictedMode) + { + using Details::Behavior; + using ::Tpetra::Details::dualViewStatusToString; + using ::Tpetra::Details::getArrayViewFromDualView; + using Details::ProfilingRegion; + using Kokkos::Compat::getArrayView; + using Kokkos::Compat::getConstArrayView; + using Kokkos::Compat::getKokkosViewDeepCopy; + using Kokkos::Compat::create_const_view; + using std::endl; + using Details::getDualViewCopyFromArrayView; + using Details::ProfilingRegion; + const char funcName[] = "Tpetra::DistObject::doTransfer"; + + ProfilingRegion region_doTransfer(funcName); + const bool verbose = Behavior::verbose("DistObject"); + std::shared_ptr prefix; + if (verbose) { + std::ostringstream os; + prefix = this->createPrefix("DistObject", "doTransfer"); + os << *prefix << "Source type: " << Teuchos::typeName(src) + << ", Target type: " << Teuchos::typeName(*this) << endl; + std::cerr << os.str(); + } + + // "Restricted Mode" does two things: + // 1) Skips copyAndPermute + // 2) Allows the "target" Map of the transfer to be a subset of + // the Map of *this, in a "locallyFitted" sense. + // + // This cannot be used if #2 is not true, OR there are permutes. + // Source Maps still need to match + + // mfh 18 Oct 2017: Set TPETRA_DEBUG to true to enable extra debug + // checks. These may communicate more. + const bool debug = Behavior::debug("DistObject"); + if (debug) { + if (! restrictedMode && revOp == DoForward) { + const bool myMapSameAsTransferTgtMap = + this->getMap ()->isSameAs (* (transfer.getTargetMap ())); + TEUCHOS_TEST_FOR_EXCEPTION + (! myMapSameAsTransferTgtMap, std::invalid_argument, + "Tpetra::DistObject::" << modeString << ": For forward-mode " + "communication, the target DistObject's Map must be the same " + "(in the sense of Tpetra::Map::isSameAs) as the input " + "Export/Import object's target Map."); + } + else if (! restrictedMode && revOp == DoReverse) { + const bool myMapSameAsTransferSrcMap = + this->getMap ()->isSameAs (* (transfer.getSourceMap ())); + TEUCHOS_TEST_FOR_EXCEPTION + (! myMapSameAsTransferSrcMap, std::invalid_argument, + "Tpetra::DistObject::" << modeString << ": For reverse-mode " + "communication, the target DistObject's Map must be the same " + "(in the sense of Tpetra::Map::isSameAs) as the input " + "Export/Import object's source Map."); + } + else if (restrictedMode && revOp == DoForward) { + const bool myMapLocallyFittedTransferTgtMap = + this->getMap ()->isLocallyFitted (* (transfer.getTargetMap ())); + TEUCHOS_TEST_FOR_EXCEPTION + (! 
myMapLocallyFittedTransferTgtMap , std::invalid_argument, + "Tpetra::DistObject::" << modeString << ": For forward-mode " + "communication using restricted mode, Export/Import object's " + "target Map must be locally fitted (in the sense of " + "Tpetra::Map::isLocallyFitted) to target DistObject's Map."); + } + else { // if (restrictedMode && revOp == DoReverse) + const bool myMapLocallyFittedTransferSrcMap = + this->getMap ()->isLocallyFitted (* (transfer.getSourceMap ())); + TEUCHOS_TEST_FOR_EXCEPTION + (! myMapLocallyFittedTransferSrcMap, std::invalid_argument, + "Tpetra::DistObject::" << modeString << ": For reverse-mode " + "communication using restricted mode, Export/Import object's " + "source Map must be locally fitted (in the sense of " + "Tpetra::Map::isLocallyFitted) to target DistObject's Map."); + } + + // SrcDistObject need not even _have_ Maps. However, if the + // source object is a DistObject, it has a Map, and we may + // compare that Map with the Transfer's Maps. + const this_type* srcDistObj = dynamic_cast (&src); + if (srcDistObj != nullptr) { + if (revOp == DoForward) { + const bool srcMapSameAsImportSrcMap = + srcDistObj->getMap ()->isSameAs (* (transfer.getSourceMap ())); + TEUCHOS_TEST_FOR_EXCEPTION + (! srcMapSameAsImportSrcMap, std::invalid_argument, + "Tpetra::DistObject::" << modeString << ": For forward-mode " + "communication, the source DistObject's Map must be the same " + "as the input Export/Import object's source Map."); + } + else { // revOp == DoReverse + const bool srcMapSameAsImportTgtMap = + srcDistObj->getMap ()->isSameAs (* (transfer.getTargetMap ())); + TEUCHOS_TEST_FOR_EXCEPTION + (! srcMapSameAsImportTgtMap, std::invalid_argument, + "Tpetra::DistObject::" << modeString << ": For reverse-mode " + "communication, the source DistObject's Map must be the same " + "as the input Export/Import object's target Map."); + } + } + } + + const size_t numSameIDs = transfer.getNumSameIDs (); + Distributor& distor = transfer.getDistributor (); + + TEUCHOS_TEST_FOR_EXCEPTION + (debug && restrictedMode && + (transfer.getPermuteToLIDs_dv().extent(0) != 0 || + transfer.getPermuteFromLIDs_dv().extent(0) != 0), + std::invalid_argument, + "Tpetra::DistObject::" << modeString << ": Transfer object " + "cannot have permutes in restricted mode."); + + // Do we need all communication buffers to live on host? + const bool commOnHost = ! Behavior::assumeMpiIsCudaAware (); + if (verbose) { + std::ostringstream os; + os << *prefix << "doTransfer: Use new interface; " + "commOnHost=" << (commOnHost ? "true" : "false") << endl; + std::cerr << os.str (); + } + + using const_lo_dv_type = + Kokkos::DualView; + const_lo_dv_type permuteToLIDs = (revOp == DoForward) ? + transfer.getPermuteToLIDs_dv () : + transfer.getPermuteFromLIDs_dv (); + const_lo_dv_type permuteFromLIDs = (revOp == DoForward) ? + transfer.getPermuteFromLIDs_dv () : + transfer.getPermuteToLIDs_dv (); + const_lo_dv_type remoteLIDs = (revOp == DoForward) ? + transfer.getRemoteLIDs_dv () : + transfer.getExportLIDs_dv (); + const_lo_dv_type exportLIDs = (revOp == DoForward) ? + transfer.getExportLIDs_dv () : + transfer.getRemoteLIDs_dv (); + + ProfilingRegion region_dTN(funcName); +#ifdef HAVE_TPETRA_TRANSFER_TIMERS + // FIXME (mfh 04 Feb 2019) Deprecate Teuchos::TimeMonitor in favor + // of Kokkos profiling. 
+ Teuchos::TimeMonitor doXferMon (*doXferTimer_); +#endif // HAVE_TPETRA_TRANSFER_TIMERS + + if (verbose) { + std::ostringstream os; + os << *prefix << "Input arguments:" << endl + << *prefix << " combineMode: " << combineModeToString (CM) << endl + << *prefix << " numSameIDs: " << numSameIDs << endl + << *prefix << " " + << dualViewStatusToString (permuteToLIDs, "permuteToLIDs") << endl + << *prefix << " " + << dualViewStatusToString (permuteFromLIDs, "permuteFromLIDs") << endl + << *prefix << " " + << dualViewStatusToString (remoteLIDs, "remoteLIDs") << endl + << *prefix << " " + << dualViewStatusToString (exportLIDs, "exportLIDs") << endl + << *prefix << " revOp: Do" << (revOp == DoReverse ? "Reverse" : "Forward") << endl + << *prefix << " commOnHost: " << (commOnHost ? "true" : "false") << endl; + std::cerr << os.str (); + } + + { + ProfilingRegion region_cs ("Tpetra::DistObject::doTransferNew::checkSizes"); + if (verbose) { + std::ostringstream os; + os << *prefix << "1. checkSizes" << endl; + std::cerr << os.str (); + } + const bool checkSizesResult = this->checkSizes (src); + TEUCHOS_TEST_FOR_EXCEPTION + (! checkSizesResult, std::invalid_argument, + "Tpetra::DistObject::doTransfer: checkSizes() indicates that the " + "destination object is not a legal target for redistribution from the " + "source object. This probably means that they do not have the same " + "dimensions. For example, MultiVectors must have the same number of " + "rows and columns."); + } + + // NOTE (mfh 26 Apr 2016) Chris Baker's implementation understood + // that if CM == INSERT || CM == REPLACE, the target object could + // be write only. We don't optimize for that here. + + if (!restrictedMode && numSameIDs + permuteToLIDs.extent (0) != 0) { + // There is at least one GID to copy or permute. + if (verbose) { + std::ostringstream os; + os << *prefix << "2. copyAndPermute" << endl; + std::cerr << os.str (); + } + ProfilingRegion region_cp + ("Tpetra::DistObject::doTransferNew::copyAndPermute"); +#ifdef HAVE_TPETRA_TRANSFER_TIMERS + // FIXME (mfh 04 Feb 2019) Deprecate Teuchos::TimeMonitor in favor + // of Kokkos profiling. + Teuchos::TimeMonitor copyAndPermuteMon (*copyAndPermuteTimer_); +#endif // HAVE_TPETRA_TRANSFER_TIMERS + + if (numSameIDs + permuteToLIDs.extent (0) != 0) { + // There is at least one GID to copy or permute. + if (verbose) { + std::ostringstream os; + os << *prefix << "2. copyAndPermute" << endl; + std::cerr << os.str (); + } + this->copyAndPermute (src, numSameIDs, permuteToLIDs, + permuteFromLIDs, CM); + if (verbose) { + std::ostringstream os; + os << *prefix << "After copyAndPermute:" << endl + << *prefix << " " + << dualViewStatusToString (permuteToLIDs, "permuteToLIDs") + << endl + << *prefix << " " + << dualViewStatusToString (permuteFromLIDs, "permuteFromLIDs") + << endl; + std::cerr << os.str (); + } + } + } + + // The method may return zero even if the implementation actually + // does have a constant number of packets per LID. However, if it + // returns nonzero, we may use this information to avoid + // (re)allocating num{Ex,Im}portPacketsPerLID_. packAndPrepare() + // will set this to its final value. + // + // We only need this if CM != ZERO, but it has to be lifted out of + // that scope because there are multiple tests for CM != ZERO. 
+ size_t constantNumPackets = this->constantNumberOfPackets (); + if (verbose) { + std::ostringstream os; + os << *prefix << "constantNumPackets=" << constantNumPackets << endl; + std::cerr << os.str (); + } + + // We only need to pack communication buffers if the combine mode + // is not ZERO. A "ZERO combine mode" means that the results are + // the same as if we had received all zeros, and added them to the + // existing values. That means we don't need to communicate. + if (CM != ZERO) { + if (constantNumPackets == 0) { + if (verbose) { + std::ostringstream os; + os << *prefix << "3. (Re)allocate num{Ex,Im}portPacketsPerLID" + << endl; + std::cerr << os.str (); + } + // This only reallocates if necessary, that is, if the sizes + // don't match. + this->reallocArraysForNumPacketsPerLid (exportLIDs.extent (0), + remoteLIDs.extent (0)); + } + + if (verbose) { + std::ostringstream os; + os << *prefix << "4. packAndPrepare: before, " + << dualViewStatusToString (this->exports_, "exports_") + << endl; + std::cerr << os.str (); + } + + doPackAndPrepare(src, exportLIDs, constantNumPackets, distor); + if (commOnHost) { + this->exports_.sync_host(); + } + else { + this->exports_.sync_device(); + } + + if (verbose) { + std::ostringstream os; + os << *prefix << "5.1. After packAndPrepare, " + << dualViewStatusToString (this->exports_, "exports_") + << endl; + std::cerr << os.str (); + } + } // if (CM != ZERO) + + // We only need to send data if the combine mode is not ZERO. + if (CM != ZERO) { + if (constantNumPackets != 0) { + // There are a constant number of packets per element. We + // already know (from the number of "remote" (incoming) + // elements) how many incoming elements we expect, so we can + // resize the buffer accordingly. + const size_t rbufLen = remoteLIDs.extent (0) * constantNumPackets; + reallocImportsIfNeeded (rbufLen, verbose, prefix.get ()); + } + + // Do we need to do communication (via doPostsAndWaits)? + bool needCommunication = true; + + // This may be NULL. It will be used below. + const this_type* srcDistObj = dynamic_cast (&src); + + if (revOp == DoReverse && ! this->isDistributed ()) { + needCommunication = false; + } + // FIXME (mfh 30 Jun 2013): Checking whether the source object + // is distributed requires a cast to DistObject. If it's not a + // DistObject, then I'm not quite sure what to do. Perhaps it + // would be more appropriate for SrcDistObject to have an + // isDistributed() method. For now, I'll just assume that we + // need to do communication unless the cast succeeds and the + // source is not distributed. + else if (revOp == DoForward && srcDistObj != NULL && + ! srcDistObj->isDistributed ()) { + needCommunication = false; + } + + if (! needCommunication) { + if (verbose) { + std::ostringstream os; + os << *prefix << "Comm not needed; skipping" << endl; + std::cerr << os.str (); + } + } + else { + ProfilingRegion region_dpw + ("Tpetra::DistObject::doTransferNew::doPostsAndWaits"); +#ifdef HAVE_TPETRA_TRANSFER_TIMERS + // FIXME (mfh 04 Feb 2019) Deprecate Teuchos::TimeMonitor in + // favor of Kokkos profiling. + Teuchos::TimeMonitor doPostsAndWaitsMon (*doPostsAndWaitsTimer_); +#endif // HAVE_TPETRA_TRANSFER_TIMERS + + if (verbose) { + std::ostringstream os; + os << *prefix << "7.0. " + << (revOp == DoReverse ? 
"Reverse" : "Forward") + << " mode" << endl; + std::cerr << os.str (); + } + + doPosts(distor, constantNumPackets, commOnHost, revOp, prefix); + } // if (needCommunication) + } // if (CM != ZERO) + } + + template + void + DistObject:: + endTransfer(const SrcDistObject& src, const ::Tpetra::Details::Transfer& transfer, const char modeString[], const ReverseOption revOp, @@ -423,14 +1132,21 @@ namespace Tpetra { bool restrictedMode) { using Details::Behavior; - using Details::getDualViewCopyFromArrayView; + using ::Tpetra::Details::dualViewStatusToString; + using ::Tpetra::Details::getArrayViewFromDualView; using Details::ProfilingRegion; + using Kokkos::Compat::getArrayView; + using Kokkos::Compat::getConstArrayView; + using Kokkos::Compat::getKokkosViewDeepCopy; + using Kokkos::Compat::create_const_view; using std::endl; + using Details::getDualViewCopyFromArrayView; + using Details::ProfilingRegion; const char funcName[] = "Tpetra::DistObject::doTransfer"; ProfilingRegion region_doTransfer(funcName); const bool verbose = Behavior::verbose("DistObject"); - std::unique_ptr prefix; + std::shared_ptr prefix; if (verbose) { std::ostringstream os; prefix = this->createPrefix("DistObject", "doTransfer"); @@ -511,391 +1227,48 @@ namespace Tpetra { srcDistObj->getMap ()->isSameAs (* (transfer.getTargetMap ())); TEUCHOS_TEST_FOR_EXCEPTION (! srcMapSameAsImportTgtMap, std::invalid_argument, - "Tpetra::DistObject::" << modeString << ": For reverse-mode " - "communication, the source DistObject's Map must be the same " - "as the input Export/Import object's target Map."); - } - } - } - - const size_t numSameIDs = transfer.getNumSameIDs (); - Distributor& distor = transfer.getDistributor (); - - TEUCHOS_TEST_FOR_EXCEPTION - (debug && restrictedMode && - (transfer.getPermuteToLIDs_dv().extent(0) != 0 || - transfer.getPermuteFromLIDs_dv().extent(0) != 0), - std::invalid_argument, - "Tpetra::DistObject::" << modeString << ": Transfer object " - "cannot have permutes in restricted mode."); - - // Do we need all communication buffers to live on host? - const bool commOnHost = ! Behavior::assumeMpiIsCudaAware (); - if (verbose) { - std::ostringstream os; - os << *prefix << "doTransfer: Use new interface; " - "commOnHost=" << (commOnHost ? "true" : "false") << endl; - std::cerr << os.str (); - } - - using const_lo_dv_type = - Kokkos::DualView; - const_lo_dv_type permToLIDs = (revOp == DoForward) ? - transfer.getPermuteToLIDs_dv () : - transfer.getPermuteFromLIDs_dv (); - const_lo_dv_type permFromLIDs = (revOp == DoForward) ? - transfer.getPermuteFromLIDs_dv () : - transfer.getPermuteToLIDs_dv (); - const_lo_dv_type remoteLIDs = (revOp == DoForward) ? - transfer.getRemoteLIDs_dv () : - transfer.getExportLIDs_dv (); - const_lo_dv_type exportLIDs = (revOp == DoForward) ? - transfer.getExportLIDs_dv () : - transfer.getRemoteLIDs_dv (); - doTransferNew (src, CM, numSameIDs, permToLIDs, permFromLIDs, - remoteLIDs, exportLIDs, distor, revOp, commOnHost, - restrictedMode); - - if (verbose) { - std::ostringstream os; - os << *prefix << "Tpetra::DistObject::doTransfer: Done!" 
<< endl; - std::cerr << os.str (); - } - } - - template - bool - DistObject:: - reallocImportsIfNeeded (const size_t newSize, - const bool verbose, - const std::string* prefix) - { - if (verbose) { - std::ostringstream os; - os << *prefix << "Realloc (if needed) imports_ from " - << imports_.extent (0) << " to " << newSize << std::endl; - std::cerr << os.str (); - } - using ::Tpetra::Details::reallocDualViewIfNeeded; - const bool reallocated = - reallocDualViewIfNeeded (this->imports_, newSize, "imports"); - if (verbose) { - std::ostringstream os; - os << *prefix << "Finished realloc'ing imports_" << std::endl; - std::cerr << os.str (); - } - return reallocated; - } - - template - bool - DistObject:: - reallocArraysForNumPacketsPerLid (const size_t numExportLIDs, - const size_t numImportLIDs) - { - using Details::Behavior; - using ::Tpetra::Details::dualViewStatusToString; - using ::Tpetra::Details::reallocDualViewIfNeeded; - using std::endl; - // If an array is already allocated, and if is at least - // tooBigFactor times bigger than it needs to be, free it and - // reallocate to the size we need, in order to save space. - // Otherwise, take subviews to reduce allocation size. - constexpr size_t tooBigFactor = 10; - - const bool verbose = Behavior::verbose("DistObject"); - std::unique_ptr prefix; - if (verbose) { - prefix = this->createPrefix("DistObject", - "reallocArraysForNumPacketsPerLid"); - std::ostringstream os; - os << *prefix - << "numExportLIDs: " << numExportLIDs - << ", numImportLIDs: " << numImportLIDs - << endl; - os << *prefix << "DualView status before:" << endl - << *prefix - << dualViewStatusToString (this->numExportPacketsPerLID_, - "numExportPacketsPerLID_") - << endl - << *prefix - << dualViewStatusToString (this->numImportPacketsPerLID_, - "numImportPacketsPerLID_") - << endl; - std::cerr << os.str (); - } - - // Reallocate numExportPacketsPerLID_ if needed. - const bool firstReallocated = - reallocDualViewIfNeeded (this->numExportPacketsPerLID_, - numExportLIDs, - "numExportPacketsPerLID", - tooBigFactor, - true); // need fence before, if realloc'ing - - // If we reallocated above, then we fenced after that - // reallocation. This means that we don't need to fence again, - // before the next reallocation. - const bool needFenceBeforeNextAlloc = ! 
firstReallocated; - const bool secondReallocated = - reallocDualViewIfNeeded (this->numImportPacketsPerLID_, - numImportLIDs, - "numImportPacketsPerLID", - tooBigFactor, - needFenceBeforeNextAlloc); - - if (verbose) { - std::ostringstream os; - os << *prefix << "DualView status after:" << endl - << *prefix << dualViewStatusToString (this->numExportPacketsPerLID_, - "numExportPacketsPerLID_") - << endl - << *prefix << dualViewStatusToString (this->numImportPacketsPerLID_, - "numImportPacketsPerLID_") - << endl; - std::cerr << os.str (); - } - - return firstReallocated || secondReallocated; - } - - template - void - DistObject:: - doTransferNew (const SrcDistObject& src, - const CombineMode CM, - const size_t numSameIDs, - const Kokkos::DualView& permuteToLIDs, - const Kokkos::DualView& permuteFromLIDs, - const Kokkos::DualView& remoteLIDs, - const Kokkos::DualView& exportLIDs, - Distributor& distor, - const ReverseOption revOp, - const bool commOnHost, - const bool restrictedMode) - { - using Details::Behavior; - using ::Tpetra::Details::dualViewStatusToString; - using ::Tpetra::Details::getArrayViewFromDualView; - using Details::ProfilingRegion; - using Kokkos::Compat::getArrayView; - using Kokkos::Compat::getConstArrayView; - using Kokkos::Compat::getKokkosViewDeepCopy; - using Kokkos::Compat::create_const_view; - using std::endl; - const char funcName[] = "Tpetra::DistObject::doTransferNew"; - - ProfilingRegion region_dTN(funcName); -#ifdef HAVE_TPETRA_TRANSFER_TIMERS - // FIXME (mfh 04 Feb 2019) Deprecate Teuchos::TimeMonitor in favor - // of Kokkos profiling. - Teuchos::TimeMonitor doXferMon (*doXferTimer_); -#endif // HAVE_TPETRA_TRANSFER_TIMERS - - const bool debug = Behavior::debug("DistObject"); - const bool verbose = Behavior::verbose("DistObject"); - // Prefix for verbose output. Use a pointer, so we don't pay for - // string construction unless needed. We set this below. - std::unique_ptr prefix; - if (verbose) { - prefix = this->createPrefix("DistObject", "doTransferNew"); - } - - if (verbose) { - std::ostringstream os; - os << *prefix << "Input arguments:" << endl - << *prefix << " combineMode: " << combineModeToString (CM) << endl - << *prefix << " numSameIDs: " << numSameIDs << endl - << *prefix << " " - << dualViewStatusToString (permuteToLIDs, "permuteToLIDs") << endl - << *prefix << " " - << dualViewStatusToString (permuteFromLIDs, "permuteFromLIDs") << endl - << *prefix << " " - << dualViewStatusToString (remoteLIDs, "remoteLIDs") << endl - << *prefix << " " - << dualViewStatusToString (exportLIDs, "exportLIDs") << endl - << *prefix << " revOp: Do" << (revOp == DoReverse ? "Reverse" : "Forward") << endl - << *prefix << " commOnHost: " << (commOnHost ? "true" : "false") << endl; - std::cerr << os.str (); - } - - { - ProfilingRegion region_cs ("Tpetra::DistObject::doTransferNew::checkSizes"); - if (verbose) { - std::ostringstream os; - os << *prefix << "1. checkSizes" << endl; - std::cerr << os.str (); - } - const bool checkSizesResult = this->checkSizes (src); - TEUCHOS_TEST_FOR_EXCEPTION - (! checkSizesResult, std::invalid_argument, - "Tpetra::DistObject::doTransfer: checkSizes() indicates that the " - "destination object is not a legal target for redistribution from the " - "source object. This probably means that they do not have the same " - "dimensions. 
For example, MultiVectors must have the same number of " - "rows and columns."); - } - - // NOTE (mfh 26 Apr 2016) Chris Baker's implementation understood - // that if CM == INSERT || CM == REPLACE, the target object could - // be write only. We don't optimize for that here. - - if (!restrictedMode && numSameIDs + permuteToLIDs.extent (0) != 0) { - // There is at least one GID to copy or permute. - if (verbose) { - std::ostringstream os; - os << *prefix << "2. copyAndPermute" << endl; - std::cerr << os.str (); - } - ProfilingRegion region_cp - ("Tpetra::DistObject::doTransferNew::copyAndPermute"); -#ifdef HAVE_TPETRA_TRANSFER_TIMERS - // FIXME (mfh 04 Feb 2019) Deprecate Teuchos::TimeMonitor in favor - // of Kokkos profiling. - Teuchos::TimeMonitor copyAndPermuteMon (*copyAndPermuteTimer_); -#endif // HAVE_TPETRA_TRANSFER_TIMERS - - if (numSameIDs + permuteToLIDs.extent (0) != 0) { - // There is at least one GID to copy or permute. - if (verbose) { - std::ostringstream os; - os << *prefix << "2. copyAndPermute" << endl; - std::cerr << os.str (); - } - this->copyAndPermute (src, numSameIDs, permuteToLIDs, - permuteFromLIDs, CM); - if (verbose) { - std::ostringstream os; - os << *prefix << "After copyAndPermute:" << endl - << *prefix << " " - << dualViewStatusToString (permuteToLIDs, "permuteToLIDs") - << endl - << *prefix << " " - << dualViewStatusToString (permuteFromLIDs, "permuteFromLIDs") - << endl; - std::cerr << os.str (); + "Tpetra::DistObject::" << modeString << ": For reverse-mode " + "communication, the source DistObject's Map must be the same " + "as the input Export/Import object's target Map."); } } } - // The method may return zero even if the implementation actually - // does have a constant number of packets per LID. However, if it - // returns nonzero, we may use this information to avoid - // (re)allocating num{Ex,Im}portPacketsPerLID_. packAndPrepare() - // will set this to its final value. - // - // We only need this if CM != ZERO, but it has to be lifted out of - // that scope because there are multiple tests for CM != ZERO. - size_t constantNumPackets = this->constantNumberOfPackets (); + Distributor& distor = transfer.getDistributor (); + + TEUCHOS_TEST_FOR_EXCEPTION + (debug && restrictedMode && + (transfer.getPermuteToLIDs_dv().extent(0) != 0 || + transfer.getPermuteFromLIDs_dv().extent(0) != 0), + std::invalid_argument, + "Tpetra::DistObject::" << modeString << ": Transfer object " + "cannot have permutes in restricted mode."); + + // Do we need all communication buffers to live on host? + const bool commOnHost = ! Behavior::assumeMpiIsCudaAware (); if (verbose) { std::ostringstream os; - os << *prefix << "constantNumPackets=" << constantNumPackets << endl; + os << *prefix << "doTransfer: Use new interface; " + "commOnHost=" << (commOnHost ? "true" : "false") << endl; std::cerr << os.str (); } - // We only need to pack communication buffers if the combine mode - // is not ZERO. A "ZERO combine mode" means that the results are - // the same as if we had received all zeros, and added them to the - // existing values. That means we don't need to communicate. - if (CM != ZERO) { - if (constantNumPackets == 0) { - if (verbose) { - std::ostringstream os; - os << *prefix << "3. (Re)allocate num{Ex,Im}portPacketsPerLID" - << endl; - std::cerr << os.str (); - } - // This only reallocates if necessary, that is, if the sizes - // don't match. 
- this->reallocArraysForNumPacketsPerLid (exportLIDs.extent (0), - remoteLIDs.extent (0)); - } - - if (verbose) { - std::ostringstream os; - os << *prefix << "4. packAndPrepare: before, " - << dualViewStatusToString (this->exports_, "exports_") - << endl; - std::cerr << os.str (); - } - { - ProfilingRegion region_pp - ("Tpetra::DistObject::doTransferNew::packAndPrepare"); -#ifdef HAVE_TPETRA_TRANSFER_TIMERS - // FIXME (mfh 04 Feb 2019) Deprecate Teuchos::TimeMonitor in - // favor of Kokkos profiling. - Teuchos::TimeMonitor packAndPrepareMon (*packAndPrepareTimer_); -#endif // HAVE_TPETRA_TRANSFER_TIMERS + using const_lo_dv_type = + Kokkos::DualView; + const_lo_dv_type permuteToLIDs = (revOp == DoForward) ? + transfer.getPermuteToLIDs_dv () : + transfer.getPermuteFromLIDs_dv (); + const_lo_dv_type permuteFromLIDs = (revOp == DoForward) ? + transfer.getPermuteFromLIDs_dv () : + transfer.getPermuteToLIDs_dv (); + const_lo_dv_type remoteLIDs = (revOp == DoForward) ? + transfer.getRemoteLIDs_dv () : + transfer.getExportLIDs_dv (); + const_lo_dv_type exportLIDs = (revOp == DoForward) ? + transfer.getExportLIDs_dv () : + transfer.getRemoteLIDs_dv (); - // Ask the source to pack data. Also ask it whether there are - // a constant number of packets per element - // (constantNumPackets is an output argument). If there are, - // constantNumPackets will come back nonzero. Otherwise, the - // source will fill the numExportPacketsPerLID_ array. - - // FIXME (mfh 18 Oct 2017) if (! commOnHost), sync to device? - // Alternately, make packAndPrepare take a "commOnHost" - // argument to tell it where to leave the data? - // - // NOTE (mfh 04 Feb 2019) Subclasses of DistObject should have - // the freedom to pack and unpack either on host or device. - // We should prefer sync'ing only on demand. Thus, we can - // answer the above question: packAndPrepare should not - // take a commOnHost argument, and doTransferNew should sync - // where needed, if needed. - if (debug) { - std::ostringstream lclErrStrm; - bool lclSuccess = false; - try { - this->packAndPrepare (src, exportLIDs, this->exports_, - this->numExportPacketsPerLID_, - constantNumPackets, distor); - lclSuccess = true; - } - catch (std::exception& e) { - lclErrStrm << "packAndPrepare threw an exception: " - << endl << e.what(); - } - catch (...) { - lclErrStrm << "packAndPrepare threw an exception " - "not a subclass of std::exception."; - } - const char gblErrMsgHeader[] = "Tpetra::DistObject::" - "doTransferNew threw an exception in packAndPrepare on " - "one or more processes in the DistObject's communicator."; - auto comm = getMap()->getComm(); - Details::checkGlobalError(std::cerr, lclSuccess, - lclErrStrm.str().c_str(), - gblErrMsgHeader, *comm); - } - else { - this->packAndPrepare (src, exportLIDs, this->exports_, - this->numExportPacketsPerLID_, - constantNumPackets, distor); - } - if (commOnHost) { - if (this->exports_.need_sync_host ()) { - this->exports_.sync_host (); - } - } - else { // ! commOnHost - if (this->exports_.need_sync_device ()) { - this->exports_.sync_device (); - } - } - } - if (verbose) { - std::ostringstream os; - os << *prefix << "5.1. After packAndPrepare, " - << dualViewStatusToString (this->exports_, "exports_") - << endl; - std::cerr << os.str (); - } - } // if (CM != ZERO) + size_t constantNumPackets = this->constantNumberOfPackets (); // We only need to send data if the combine mode is not ZERO. 
if (CM != ZERO) { @@ -937,302 +1310,399 @@ namespace Tpetra { } } else { - ProfilingRegion region_dpw - ("Tpetra::DistObject::doTransferNew::doPostsAndWaits"); -#ifdef HAVE_TPETRA_TRANSFER_TIMERS - // FIXME (mfh 04 Feb 2019) Deprecate Teuchos::TimeMonitor in - // favor of Kokkos profiling. - Teuchos::TimeMonitor doPostsAndWaitsMon (*doPostsAndWaitsTimer_); -#endif // HAVE_TPETRA_TRANSFER_TIMERS + doWaits(distor, revOp); if (verbose) { std::ostringstream os; - os << *prefix << "7.0. " - << (revOp == DoReverse ? "Reverse" : "Forward") - << " mode" << endl; + os << *prefix << "8. unpackAndCombine" << endl; std::cerr << os.str (); } + doUnpackAndCombine(remoteLIDs, constantNumPackets, distor, CM); + } // if (needCommunication) + } // if (CM != ZERO) - if (constantNumPackets == 0) { // variable num packets per LID - if (verbose) { - std::ostringstream os; - os << *prefix << "7.1. Variable # packets / LID: first comm " - << "(commOnHost = " << (commOnHost ? "true" : "false") << ")" - << endl; - std::cerr << os.str (); - } - size_t totalImportPackets = 0; - if (commOnHost) { - if (this->numExportPacketsPerLID_.need_sync_host ()) { - this->numExportPacketsPerLID_.sync_host (); - } - if (this->numImportPacketsPerLID_.need_sync_host ()) { - this->numImportPacketsPerLID_.sync_host (); - } - this->numImportPacketsPerLID_.modify_host (); // out arg - auto numExp_h = - create_const_view (this->numExportPacketsPerLID_.view_host ()); - auto numImp_h = this->numImportPacketsPerLID_.view_host (); - - // MPI communication happens here. - if (verbose) { - std::ostringstream os; - os << *prefix << "Call do" - << (revOp == DoReverse ? "Reverse" : "") << "PostsAndWaits" - << endl; - std::cerr << os.str (); - } - if (revOp == DoReverse) { - distor.doReversePostsAndWaits (numExp_h, 1, numImp_h); - } - else { - distor.doPostsAndWaits (numExp_h, 1, numImp_h); - } - - if (verbose) { - std::ostringstream os; - os << *prefix << "Count totalImportPackets" << std::endl; - std::cerr << os.str (); - } - using the_dev_type = typename decltype (numImp_h)::device_type; - totalImportPackets = countTotalImportPackets (numImp_h); - } - else { // ! commOnHost - if (this->numExportPacketsPerLID_.need_sync_device ()) { - this->numExportPacketsPerLID_.sync_device (); - } - if (this->numImportPacketsPerLID_.need_sync_device ()) { - this->numImportPacketsPerLID_.sync_device (); - } - this->numImportPacketsPerLID_.modify_device (); // out arg - auto numExp_d = create_const_view - (this->numExportPacketsPerLID_.view_device ()); - auto numImp_d = this->numImportPacketsPerLID_.view_device (); - - // MPI communication happens here. - if (verbose) { - std::ostringstream os; - os << *prefix << "Call do" - << (revOp == DoReverse ? "Reverse" : "") << "PostsAndWaits" - << endl; - std::cerr << os.str (); - } - if (revOp == DoReverse) { - distor.doReversePostsAndWaits (numExp_d, 1, numImp_d); - } - else { - distor.doPostsAndWaits (numExp_d, 1, numImp_d); - } - - if (verbose) { - std::ostringstream os; - os << *prefix << "Count totalImportPackets" << std::endl; - std::cerr << os.str (); - } - using the_dev_type = typename decltype (numImp_d)::device_type; - totalImportPackets = countTotalImportPackets (numImp_d); - } + if (verbose) { + std::ostringstream os; + os << *prefix << "9. Done!" 
<< endl; + std::cerr << os.str (); + } - if (verbose) { - std::ostringstream os; - os << *prefix << "totalImportPackets=" << totalImportPackets << endl; - std::cerr << os.str (); - } - this->reallocImportsIfNeeded (totalImportPackets, verbose, - prefix.get ()); - if (verbose) { - std::ostringstream os; - os << *prefix << "7.3. Second comm" << std::endl; - std::cerr << os.str (); - } + if (verbose) { + std::ostringstream os; + os << *prefix << "Tpetra::DistObject::doTransfer: Done!" << endl; + std::cerr << os.str (); + } + } - // mfh 04 Feb 2019: Distributor expects the "num packets per - // LID" arrays on host, so that it can issue MPI sends and - // receives correctly. - if (this->numExportPacketsPerLID_.need_sync_host ()) { - this->numExportPacketsPerLID_.sync_host (); - } - if (this->numImportPacketsPerLID_.need_sync_host ()) { - this->numImportPacketsPerLID_.sync_host (); - } + template + void + DistObject:: + doPosts(Distributor& distor, + size_t constantNumPackets, + bool commOnHost, + ReverseOption revOp, + std::shared_ptr prefix) + { + using ::Tpetra::Details::dualViewStatusToString; + using ::Tpetra::Details::getArrayViewFromDualView; + using Kokkos::Compat::create_const_view; + using std::endl; - // NOTE (mfh 25 Apr 2016, 01 Aug 2017) doPostsAndWaits and - // doReversePostsAndWaits currently want - // numExportPacketsPerLID and numImportPacketsPerLID as - // Teuchos::ArrayView, rather than as Kokkos::View. - // - // NOTE (mfh 04 Feb 2019) This does NOT copy from host to - // device. The above syncs might. - auto numExportPacketsPerLID_av = - getArrayViewFromDualView (this->numExportPacketsPerLID_); - auto numImportPacketsPerLID_av = - getArrayViewFromDualView (this->numImportPacketsPerLID_); - - // imports_ is for output only, so we don't need to sync it - // before marking it as modified. However, in order to - // prevent spurious debug-mode errors (e.g., "modified on - // both device and host"), we first need to clear its - // "modified" flags. - this->imports_.clear_sync_state (); - - if (verbose) { - std::ostringstream os; - os << *prefix << "Comm on " - << (commOnHost ? "host" : "device") - << "; call do" << (revOp == DoReverse ? "Reverse" : "") - << "PostsAndWaits" << endl; - std::cerr << os.str (); - } + const bool verbose = Details::Behavior::verbose("DistObject"); - if (commOnHost) { - this->imports_.modify_host (); - if (revOp == DoReverse) { - distor.doReversePostsAndWaits - (create_const_view (this->exports_.view_host ()), - numExportPacketsPerLID_av, - this->imports_.view_host (), - numImportPacketsPerLID_av); - } - else { - distor.doPostsAndWaits - (create_const_view (this->exports_.view_host ()), - numExportPacketsPerLID_av, - this->imports_.view_host (), - numImportPacketsPerLID_av); - } - } - else { // comm on device - Kokkos::fence(); // for UVM - this->imports_.modify_device (); - if (revOp == DoReverse) { - distor.doReversePostsAndWaits - (create_const_view (this->exports_.view_device ()), - numExportPacketsPerLID_av, - this->imports_.view_device (), - numImportPacketsPerLID_av); - } - else { - distor.doPostsAndWaits - (create_const_view (this->exports_.view_device ()), - numExportPacketsPerLID_av, - this->imports_.view_device (), - numImportPacketsPerLID_av); - } - } + if (constantNumPackets == 0) { // variable num packets per LID + if (verbose) { + std::ostringstream os; + os << *prefix << "7.1. Variable # packets / LID: first comm " + << "(commOnHost = " << (commOnHost ? 
"true" : "false") << ")" + << endl; + std::cerr << os.str (); + } + size_t totalImportPackets = 0; + if (commOnHost) { + if (this->numExportPacketsPerLID_.need_sync_host ()) { + this->numExportPacketsPerLID_.sync_host (); } - else { // constant number of packets per LID - if (verbose) { - std::ostringstream os; - os << *prefix << "7.1. Const # packets per LID: " << endl - << *prefix << " " - << dualViewStatusToString (this->exports_, "exports_") - << endl - << *prefix << " " - << dualViewStatusToString (this->exports_, "imports_") - << endl; - std::cerr << os.str (); - } - // imports_ is for output only, so we don't need to sync it - // before marking it as modified. However, in order to - // prevent spurious debug-mode errors (e.g., "modified on - // both device and host"), we first need to clear its - // "modified" flags. - this->imports_.clear_sync_state (); - - if (verbose) { - std::ostringstream os; - os << *prefix << "7.2. Comm on " - << (commOnHost ? "host" : "device") - << "; call do" << (revOp == DoReverse ? "Reverse" : "") - << "PostsAndWaits" << endl; - std::cerr << os.str (); - } - if (commOnHost) { - this->imports_.modify_host (); - if (revOp == DoReverse) { - distor.doReversePostsAndWaits - (create_const_view (this->exports_.view_host ()), - constantNumPackets, - this->imports_.view_host ()); - } - else { - distor.doPostsAndWaits - (create_const_view (this->exports_.view_host ()), - constantNumPackets, - this->imports_.view_host ()); - } - } - else { // pack on device - Kokkos::fence(); // for UVM - this->imports_.modify_device (); - if (revOp == DoReverse) { - distor.doReversePostsAndWaits - (create_const_view (this->exports_.view_device ()), - constantNumPackets, - this->imports_.view_device ()); - } - else { - distor.doPostsAndWaits - (create_const_view (this->exports_.view_device ()), - constantNumPackets, - this->imports_.view_device ()); - } - } // commOnHost - } // constant or variable num packets per LID + if (this->numImportPacketsPerLID_.need_sync_host ()) { + this->numImportPacketsPerLID_.sync_host (); + } + this->numImportPacketsPerLID_.modify_host (); // out arg + auto numExp_h = + create_const_view (this->numExportPacketsPerLID_.view_host ()); + auto numImp_h = this->numImportPacketsPerLID_.view_host (); + // MPI communication happens here. if (verbose) { std::ostringstream os; - os << *prefix << "8. unpackAndCombine" << endl; + os << *prefix << "Call do" + << (revOp == DoReverse ? "Reverse" : "") << "PostsAndWaits" + << endl; std::cerr << os.str (); } - ProfilingRegion region_uc - ("Tpetra::DistObject::doTransferNew::unpackAndCombine"); -#ifdef HAVE_TPETRA_TRANSFER_TIMERS - // FIXME (mfh 04 Feb 2019) Deprecate Teuchos::TimeMonitor in - // favor of Kokkos profiling. - Teuchos::TimeMonitor unpackAndCombineMon (*unpackAndCombineTimer_); -#endif // HAVE_TPETRA_TRANSFER_TIMERS + if (revOp == DoReverse) { + distor.doReversePostsAndWaits (numExp_h, 1, numImp_h); + } + else { + distor.doPostsAndWaits (numExp_h, 1, numImp_h); + } - if (debug) { - std::ostringstream lclErrStrm; - bool lclSuccess = false; - try { - this->unpackAndCombine (remoteLIDs, this->imports_, - this->numImportPacketsPerLID_, - constantNumPackets, distor, CM); - lclSuccess = true; - } - catch (std::exception& e) { - lclErrStrm << "unpackAndCombine threw an exception: " - << endl << e.what(); - } - catch (...) 
{ - lclErrStrm << "unpackAndCombine threw an exception " - "not a subclass of std::exception."; - } - const char gblErrMsgHeader[] = "Tpetra::DistObject::" - "doTransferNew threw an exception in unpackAndCombine on " - "one or more processes in the DistObject's communicator."; - auto comm = getMap()->getComm(); - Details::checkGlobalError(std::cerr, lclSuccess, - lclErrStrm.str().c_str(), - gblErrMsgHeader, *comm); + if (verbose) { + std::ostringstream os; + os << *prefix << "Count totalImportPackets" << std::endl; + std::cerr << os.str (); + } + using the_dev_type = typename decltype (numImp_h)::device_type; + totalImportPackets = countTotalImportPackets (numImp_h); + } + else { // ! commOnHost + this->numExportPacketsPerLID_.sync_device (); + this->numImportPacketsPerLID_.sync_device (); + this->numImportPacketsPerLID_.modify_device (); // out arg + auto numExp_d = create_const_view + (this->numExportPacketsPerLID_.view_device ()); + auto numImp_d = this->numImportPacketsPerLID_.view_device (); + + // MPI communication happens here. + if (verbose) { + std::ostringstream os; + os << *prefix << "Call do" + << (revOp == DoReverse ? "Reverse" : "") << "PostsAndWaits" + << endl; + std::cerr << os.str (); + } + if (revOp == DoReverse) { + distor.doReversePostsAndWaits (numExp_d, 1, numImp_d); } else { - this->unpackAndCombine (remoteLIDs, this->imports_, - this->numImportPacketsPerLID_, - constantNumPackets, distor, CM); + distor.doPostsAndWaits (numExp_d, 1, numImp_d); } - } // if (needCommunication) - } // if (CM != ZERO) - if (verbose) { - std::ostringstream os; - os << *prefix << "9. Done!" << endl; - std::cerr << os.str (); + if (verbose) { + std::ostringstream os; + os << *prefix << "Count totalImportPackets" << std::endl; + std::cerr << os.str (); + } + using the_dev_type = typename decltype (numImp_d)::device_type; + totalImportPackets = countTotalImportPackets (numImp_d); + } + + if (verbose) { + std::ostringstream os; + os << *prefix << "totalImportPackets=" << totalImportPackets << endl; + std::cerr << os.str (); + } + this->reallocImportsIfNeeded (totalImportPackets, verbose, + prefix.get ()); + if (verbose) { + std::ostringstream os; + os << *prefix << "7.3. Second comm" << std::endl; + std::cerr << os.str (); + } + + // mfh 04 Feb 2019: Distributor expects the "num packets per + // LID" arrays on host, so that it can issue MPI sends and + // receives correctly. + this->numExportPacketsPerLID_.sync_host (); + this->numImportPacketsPerLID_.sync_host (); + + // NOTE (mfh 25 Apr 2016, 01 Aug 2017) doPostsAndWaits and + // doReversePostsAndWaits currently want + // numExportPacketsPerLID and numImportPacketsPerLID as + // Teuchos::ArrayView, rather than as Kokkos::View. + // + // NOTE (mfh 04 Feb 2019) This does NOT copy from host to + // device. The above syncs might. + auto numExportPacketsPerLID_av = + getArrayViewFromDualView (this->numExportPacketsPerLID_); + auto numImportPacketsPerLID_av = + getArrayViewFromDualView (this->numImportPacketsPerLID_); + + // imports_ is for output only, so we don't need to sync it + // before marking it as modified. However, in order to + // prevent spurious debug-mode errors (e.g., "modified on + // both device and host"), we first need to clear its + // "modified" flags. + this->imports_.clear_sync_state (); + + if (verbose) { + std::ostringstream os; + os << *prefix << "Comm on " + << (commOnHost ? "host" : "device") + << "; call do" << (revOp == DoReverse ? 
"Reverse" : "") + << "PostsAndWaits" << endl; + std::cerr << os.str (); + } + + if (commOnHost) { + this->imports_.modify_host (); + if (revOp == DoReverse) { + distor.doReversePosts + (create_const_view (this->exports_.view_host ()), + numExportPacketsPerLID_av, + this->imports_.view_host (), + numImportPacketsPerLID_av); + } + else { + distor.doPosts + (create_const_view (this->exports_.view_host ()), + numExportPacketsPerLID_av, + this->imports_.view_host (), + numImportPacketsPerLID_av); + } + } + else { // pack on device + Kokkos::fence(); // for UVM + this->imports_.modify_device (); + if (revOp == DoReverse) { + distor.doReversePosts + (create_const_view (this->exports_.view_device ()), + numExportPacketsPerLID_av, + this->imports_.view_device (), + numImportPacketsPerLID_av); + } + else { + distor.doPosts + (create_const_view (this->exports_.view_device ()), + numExportPacketsPerLID_av, + this->imports_.view_device (), + numImportPacketsPerLID_av); + } + } + } + else { // constant number of packets per LID + if (verbose) { + std::ostringstream os; + os << *prefix << "7.1. Const # packets per LID: " << endl + << *prefix << " " + << dualViewStatusToString (this->exports_, "exports_") + << endl + << *prefix << " " + << dualViewStatusToString (this->exports_, "imports_") + << endl; + std::cerr << os.str (); + } + // imports_ is for output only, so we don't need to sync it + // before marking it as modified. However, in order to + // prevent spurious debug-mode errors (e.g., "modified on + // both device and host"), we first need to clear its + // "modified" flags. + this->imports_.clear_sync_state (); + + if (verbose) { + std::ostringstream os; + os << *prefix << "7.2. Comm on " + << (commOnHost ? "host" : "device") + << "; call do" << (revOp == DoReverse ? "Reverse" : "") + << "PostsAndWaits" << endl; + std::cerr << os.str (); + } + if (commOnHost) { + this->imports_.modify_host (); + if (revOp == DoReverse) { + distor.doReversePosts + (create_const_view (this->exports_.view_host ()), + constantNumPackets, + this->imports_.view_host ()); + } + else { + distor.doPosts + (create_const_view (this->exports_.view_host ()), + constantNumPackets, + this->imports_.view_host ()); + } + } + else { // pack on device + Kokkos::fence(); // for UVM + this->imports_.modify_device (); + if (revOp == DoReverse) { + distor.doReversePosts + (create_const_view (this->exports_.view_device ()), + constantNumPackets, + this->imports_.view_device ()); + } + else { + distor.doPosts + (create_const_view (this->exports_.view_device ()), + constantNumPackets, + this->imports_.view_device ()); + } + } // commOnHost + } // constant or variable num packets per LID + } + + template + void + DistObject:: + doWaits(Distributor& distor, + ReverseOption revOp) + { + if (revOp == DoReverse) { + distor.doReverseWaits(); + } + else { + distor.doWaits(); + } + } + + template + void + DistObject:: + doPackAndPrepare(const SrcDistObject& src, + const Kokkos::DualView& exportLIDs, + size_t& constantNumPackets, + Distributor& distor) + { + using Details::ProfilingRegion; + using std::endl; + const bool debug = Details::Behavior::debug("DistObject"); + + ProfilingRegion region_pp + ("Tpetra::DistObject::doTransferNew::packAndPrepare"); +#ifdef HAVE_TPETRA_TRANSFER_TIMERS + // FIXME (mfh 04 Feb 2019) Deprecate Teuchos::TimeMonitor in + // favor of Kokkos profiling. + Teuchos::TimeMonitor packAndPrepareMon (*packAndPrepareTimer_); +#endif // HAVE_TPETRA_TRANSFER_TIMERS + + // Ask the source to pack data. 
Also ask it whether there are + // a constant number of packets per element + // (constantNumPackets is an output argument). If there are, + // constantNumPackets will come back nonzero. Otherwise, the + // source will fill the numExportPacketsPerLID_ array. + + // FIXME (mfh 18 Oct 2017) if (! commOnHost), sync to device? + // Alternately, make packAndPrepare take a "commOnHost" + // argument to tell it where to leave the data? + // + // NOTE (mfh 04 Feb 2019) Subclasses of DistObject should have + // the freedom to pack and unpack either on host or device. + // We should prefer sync'ing only on demand. Thus, we can + // answer the above question: packAndPrepare should not + // take a commOnHost argument, and doTransferNew should sync + // where needed, if needed. + if (debug) { + std::ostringstream lclErrStrm; + bool lclSuccess = false; + try { + this->packAndPrepare (src, exportLIDs, this->exports_, + this->numExportPacketsPerLID_, + constantNumPackets, distor); + lclSuccess = true; + } + catch (std::exception& e) { + lclErrStrm << "packAndPrepare threw an exception: " + << endl << e.what(); + } + catch (...) { + lclErrStrm << "packAndPrepare threw an exception " + "not a subclass of std::exception."; + } + const char gblErrMsgHeader[] = "Tpetra::DistObject " + "threw an exception in packAndPrepare on " + "one or more processes in the DistObject's communicator."; + auto comm = getMap()->getComm(); + Details::checkGlobalError(std::cerr, lclSuccess, + lclErrStrm.str().c_str(), + gblErrMsgHeader, *comm); + } + else { + this->packAndPrepare (src, exportLIDs, this->exports_, + this->numExportPacketsPerLID_, + constantNumPackets, distor); } } + template + void + DistObject:: + doUnpackAndCombine(const Kokkos::DualView& remoteLIDs, + size_t constantNumPackets, + Distributor& distor, + CombineMode CM) + { + using Details::ProfilingRegion; + using std::endl; + const bool debug = Details::Behavior::debug("DistObject"); + + ProfilingRegion region_uc + ("Tpetra::DistObject::doTransferNew::unpackAndCombine"); +#ifdef HAVE_TPETRA_TRANSFER_TIMERS + // FIXME (mfh 04 Feb 2019) Deprecate Teuchos::TimeMonitor in + // favor of Kokkos profiling. + Teuchos::TimeMonitor unpackAndCombineMon (*unpackAndCombineTimer_); +#endif // HAVE_TPETRA_TRANSFER_TIMERS + + if (debug) { + std::ostringstream lclErrStrm; + bool lclSuccess = false; + try { + this->unpackAndCombine (remoteLIDs, this->imports_, + this->numImportPacketsPerLID_, + constantNumPackets, distor, CM); + lclSuccess = true; + } + catch (std::exception& e) { + lclErrStrm << "unpackAndCombine threw an exception: " + << endl << e.what(); + } + catch (...) 
{ + lclErrStrm << "unpackAndCombine threw an exception " + "not a subclass of std::exception."; + } + const char gblErrMsgHeader[] = "Tpetra::DistObject " + "threw an exception in unpackAndCombine on " + "one or more processes in the DistObject's communicator."; + auto comm = getMap()->getComm(); + Details::checkGlobalError(std::cerr, lclSuccess, + lclErrStrm.str().c_str(), + gblErrMsgHeader, *comm); + } + else { + this->unpackAndCombine (remoteLIDs, this->imports_, + this->numImportPacketsPerLID_, + constantNumPackets, distor, CM); + } + } template void diff --git a/packages/tpetra/core/src/Tpetra_EpetraRowMatrix.hpp b/packages/tpetra/core/src/Tpetra_EpetraRowMatrix.hpp index 7e1d69526f5a..f4d03b52d81a 100644 --- a/packages/tpetra/core/src/Tpetra_EpetraRowMatrix.hpp +++ b/packages/tpetra/core/src/Tpetra_EpetraRowMatrix.hpp @@ -178,22 +178,22 @@ EpetraRowMatrix::EpetraRowMatrix( this->SetMaps (epetraRowMap, epetraColMap); } -#ifdef TPETRA_ENABLE_DEPRECATED_CODE template int EpetraRowMatrix::ExtractMyRowCopy(int MyRow, int Length, int & NumEntries, double *Values, int * Indices) const { + using inds_view = typename TpetraMatrixType::nonconst_local_inds_host_view_type; + using vals_view = typename TpetraMatrixType::nonconst_values_host_view_type; static_assert (std::is_same::value, "This code assumes that Tpetra::CrsMatrix's scalar_type is int."); static_assert (std::is_same::value, "This code assumes that Tpetra::CrsMatrix's local_ordinal_type is int."); - Teuchos::ArrayView inds(Indices, Length); - Teuchos::ArrayView vals(Values, Length); + inds_view IndicesView(Indices, Length); + vals_view ValuesView(Values, Length); size_t num_entries = NumEntries; - tpetra_matrix_->getLocalRowCopy(MyRow, inds, vals, num_entries); + tpetra_matrix_->getLocalRowCopy(MyRow, IndicesView, ValuesView, num_entries); NumEntries = num_entries; return 0; } -#endif template int EpetraRowMatrix::ExtractMyEntryView(int CurEntry, double * & Value, int & RowIndex, int & ColIndex) diff --git a/packages/tpetra/core/src/Tpetra_LocalCrsMatrixOperator_decl.hpp b/packages/tpetra/core/src/Tpetra_LocalCrsMatrixOperator_decl.hpp index 26cc6850c617..795e8ef4ca84 100644 --- a/packages/tpetra/core/src/Tpetra_LocalCrsMatrixOperator_decl.hpp +++ b/packages/tpetra/core/src/Tpetra_LocalCrsMatrixOperator_decl.hpp @@ -87,10 +87,12 @@ namespace Tpetra { void, local_ordinal_type>; using local_graph_device_type = typename local_matrix_device_type::StaticCrsGraphType; - using ordinal_view_type = typename local_graph_device_type::entries_type::non_const_type; public: + using ordinal_view_type = typename local_graph_device_type::entries_type::non_const_type; + LocalCrsMatrixOperator (const std::shared_ptr& A); + LocalCrsMatrixOperator (const std::shared_ptr& A, const ordinal_view_type& A_ordinal_rowptrs); ~LocalCrsMatrixOperator () override = default; void @@ -118,12 +120,8 @@ namespace Tpetra { private: std::shared_ptr A_; - //If the number of entries in A_ can be represented as ordinal, - //make a copy of the rowptrs as ordinal. This allows the use of cuSPARSE spmv. - //If cusparse is not enabled or there would be no benefit from using these, - //they are not allocated/initialized. 
- ordinal_view_type A_ordinal_rowptrs; local_cusparse_matrix_type A_cusparse; + const bool have_A_cusparse; }; } // namespace Tpetra diff --git a/packages/tpetra/core/src/Tpetra_LocalCrsMatrixOperator_def.hpp b/packages/tpetra/core/src/Tpetra_LocalCrsMatrixOperator_def.hpp index d7b4587ae67c..9ad516eb4657 100644 --- a/packages/tpetra/core/src/Tpetra_LocalCrsMatrixOperator_def.hpp +++ b/packages/tpetra/core/src/Tpetra_LocalCrsMatrixOperator_def.hpp @@ -51,32 +51,26 @@ namespace Tpetra { template LocalCrsMatrixOperator:: LocalCrsMatrixOperator (const std::shared_ptr& A) - : A_ (A) + : A_ (A), have_A_cusparse(false) +{ + const char tfecfFuncName[] = "LocalCrsMatrixOperator: "; + TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC + (A_.get () == nullptr, std::invalid_argument, + "Input matrix A is null."); +} + +template +LocalCrsMatrixOperator:: +LocalCrsMatrixOperator (const std::shared_ptr& A, const ordinal_view_type& A_ordinal_rowptrs) : + A_ (A), + A_cusparse("LocalCrsMatrixOperator_cuSPARSE", A->numRows(), A->numCols(), A->nnz(), + A->values, A_ordinal_rowptrs, A->graph.entries), + have_A_cusparse(true) { const char tfecfFuncName[] = "LocalCrsMatrixOperator: "; TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC (A_.get () == nullptr, std::invalid_argument, "Input matrix A is null."); -#if defined(KOKKOS_ENABLE_CUDA) && defined(KOKKOSKERNELS_ENABLE_TPL_CUSPARSE) - //Only create A_ordinal_rowptrs if: - // - KokkosKernels cuSPARSE support is enabled (otherwise, no benefit) - // - The execution space is CUDA - // - The local matrix offset and ordinal types are different (otherwise, no reason to enable) - // - The number of entries can be represented by the ordinal type. - using kk_offset_t = typename std::remove_const::type; - using kk_ordinal_t = typename std::remove_const::type; - using exec_space = typename Device::execution_space; - if(std::is_same::value && - !std::is_same::value && - A_->nnz() < static_cast(Teuchos::OrdinalTraits::max())) - { - A_ordinal_rowptrs = ordinal_view_type(Kokkos::ViewAllocateWithoutInitializing("A_ordinal_rowptrs"), A_->numRows() + 1); - //This is just like a deep copy, but it implicitly converts each element - KokkosKernels::Impl::copy_view - (A_ordinal_rowptrs.extent(0), A_->graph.row_map, A_ordinal_rowptrs); - A_cusparse = local_cusparse_matrix_type("A(cusparse)", A_->numRows(), A_->numCols(), A_->nnz(), A_->values, A_ordinal_rowptrs, A_->graph.entries); - } -#endif } template @@ -120,7 +114,7 @@ apply (Kokkos::View& params) using values_type = typename local_matrix_device_type::values_type::non_const_type; using execution_space = typename local_matrix_device_type::execution_space; - local_matrix_device_type lclMatrix = crsMatrix->getLocalMatrix (); + local_matrix_device_type lclMatrix = crsMatrix->getLocalMatrixDevice (); local_matrix_device_type lclTransposeMatrix = KokkosKernels::Impl::transpose_matrix(lclMatrix); if (sort) KokkosKernels::Impl::sort_crs_matrix(lclTransposeMatrix); diff --git a/packages/tpetra/core/test/MultiVector/Bug7758.cpp b/packages/tpetra/core/test/MultiVector/Bug7758.cpp index d92ffe3f3297..1e9b62d3e5e2 100644 --- a/packages/tpetra/core/test/MultiVector/Bug7758.cpp +++ b/packages/tpetra/core/test/MultiVector/Bug7758.cpp @@ -82,9 +82,6 @@ TEUCHOS_UNIT_TEST_TEMPLATE_4_DECL(Bug7758, DefaultToDefault, Scalar,LO,GO,Node) Teuchos::RCP defaultMap = rcp(new map_t(nGlobalEntries, 0, comm)); - std::cout << me << " DEFAULT MAP" << std::endl; - defaultMap->describe(foo, Teuchos::VERB_EXTREME); - // Create vectors; see what the result is with CombineMode=ADD vector_t 
defaultVecTgt(defaultMap); @@ -98,9 +95,6 @@ TEUCHOS_UNIT_TEST_TEMPLATE_4_DECL(Bug7758, DefaultToDefault, Scalar,LO,GO,Node) Tpetra::Export defaultToDefault(defaultMap, defaultMap); defaultVecTgt.doExport(defaultVecSrc, defaultToDefault, Tpetra::ADD); - std::cout << me << " DEFAULT TO DEFAULT " << std::endl; - defaultVecTgt.describe(foo, Teuchos::VERB_EXTREME); - // Check result; all vector entries should be srcScalar auto data = defaultVecTgt.getLocalViewHost(Tpetra::Access::ReadOnly); @@ -144,9 +138,6 @@ TEUCHOS_UNIT_TEST_TEMPLATE_4_DECL(Bug7758, CyclicToDefault, Scalar,LO,GO,Node) Teuchos::RCP defaultMap = rcp(new map_t(nGlobalEntries, 0, comm)); - std::cout << me << " DEFAULT MAP" << std::endl; - defaultMap->describe(foo, Teuchos::VERB_EXTREME); - // One-to-one cyclic map: deal out entries like cards int nMyEntries = 0; @@ -161,9 +152,6 @@ TEUCHOS_UNIT_TEST_TEMPLATE_4_DECL(Bug7758, CyclicToDefault, Scalar,LO,GO,Node) Teuchos::RCP cyclicMap = rcp(new map_t(dummy, myEntries(0,nMyEntries), 0, comm)); - std::cout << me << " CYCLIC MAP" << std::endl; - cyclicMap->describe(foo, Teuchos::VERB_EXTREME); - // Create vectors; see what the result is with CombineMode=ADD vector_t defaultVecTgt(defaultMap); @@ -177,9 +165,6 @@ TEUCHOS_UNIT_TEST_TEMPLATE_4_DECL(Bug7758, CyclicToDefault, Scalar,LO,GO,Node) Tpetra::Export cyclicToDefault(cyclicMap, defaultMap); defaultVecTgt.doExport(cyclicVecSrc, cyclicToDefault, Tpetra::ADD); - std::cout << me << " CYCLIC TO DEFAULT " << std::endl; - defaultVecTgt.describe(foo, Teuchos::VERB_EXTREME); - // Check result auto invalid = Teuchos::OrdinalTraits::invalid(); @@ -234,9 +219,6 @@ TEUCHOS_UNIT_TEST_TEMPLATE_4_DECL(Bug7758, OverlapToDefault, Scalar,LO,GO,Node) Teuchos::RCP defaultMap = rcp(new map_t(nGlobalEntries, 0, comm)); - std::cout << me << " DEFAULT MAP" << std::endl; - defaultMap->describe(foo, Teuchos::VERB_EXTREME); - // Overlap map; some entries are stored on two procs int nMyEntries = 0; for (size_t i = 0; i < defaultMap->getNodeNumElements()/2; i++) { @@ -252,9 +234,6 @@ TEUCHOS_UNIT_TEST_TEMPLATE_4_DECL(Bug7758, OverlapToDefault, Scalar,LO,GO,Node) Teuchos::RCP overlapMap = rcp(new map_t(dummy, myEntries(0,nMyEntries), 0, comm)); - std::cout << me << " OVERLAP MAP" << std::endl; - overlapMap->describe(foo, Teuchos::VERB_EXTREME); - // Create vectors; see what the result is with CombineMode=ADD vector_t defaultVecTgt(defaultMap); @@ -268,9 +247,6 @@ TEUCHOS_UNIT_TEST_TEMPLATE_4_DECL(Bug7758, OverlapToDefault, Scalar,LO,GO,Node) Tpetra::Export overlapToDefault(overlapMap, defaultMap); defaultVecTgt.doExport(overlapVecSrc, overlapToDefault, Tpetra::ADD); - std::cout << me << " OVERLAP TO DEFAULT " << std::endl; - defaultVecTgt.describe(foo, Teuchos::VERB_EXTREME); - auto data = defaultVecTgt.getLocalViewHost(Tpetra::Access::ReadOnly); for (size_t i = 0; i < defaultVecTgt.getLocalLength()/2; i++) { // overlapped; initial target values were overwritten @@ -331,9 +307,6 @@ TEUCHOS_UNIT_TEST_TEMPLATE_4_DECL(Bug7758, OddEvenToSerial, Scalar,LO,GO,Node) Teuchos::RCP oddEvenMap = rcp(new map_t(dummy, myEntries(0,nMyEntries), 0, comm)); - std::cout << me << " ODDEVEN MAP" << std::endl; - oddEvenMap->describe(foo, Teuchos::VERB_EXTREME); - // Map with all entries on one processor dummy = Teuchos::OrdinalTraits::invalid(); @@ -341,9 +314,6 @@ TEUCHOS_UNIT_TEST_TEMPLATE_4_DECL(Bug7758, OddEvenToSerial, Scalar,LO,GO,Node) Teuchos::RCP serialMap = rcp(new map_t(dummy, nSerialEntries, 0, comm)); - std::cout << me << " SERIAL MAP" << std::endl; - 
serialMap->describe(foo, Teuchos::VERB_EXTREME); - // Create vectors; see what the result is with CombineMode=ADD vector_t oddEvenVecSrc(oddEvenMap); @@ -357,9 +327,6 @@ TEUCHOS_UNIT_TEST_TEMPLATE_4_DECL(Bug7758, OddEvenToSerial, Scalar,LO,GO,Node) Tpetra::Export oddEvenToSerial(oddEvenMap, serialMap); serialVecTgt.doExport(oddEvenVecSrc, oddEvenToSerial, Tpetra::ADD); - std::cout << me << " ODDEVEN TO SERIAL " << std::endl; - serialVecTgt.describe(foo, Teuchos::VERB_EXTREME); - // Check result auto data = serialVecTgt.getLocalViewHost(Tpetra::Access::ReadOnly); @@ -408,9 +375,6 @@ TEUCHOS_UNIT_TEST_TEMPLATE_4_DECL(Bug7758, SupersetToDefault, Scalar,LO,GO,Node) Teuchos::RCP defaultMap = rcp(new map_t(nGlobalEntries, 0, comm)); - std::cout << me << " DEFAULT MAP" << std::endl; - defaultMap->describe(foo, Teuchos::VERB_EXTREME); - // Superset map; some entries are stored on two procs int nMyEntries = 0; for (size_t i = 0; i < defaultMap->getNodeNumElements(); i++) { @@ -426,9 +390,6 @@ TEUCHOS_UNIT_TEST_TEMPLATE_4_DECL(Bug7758, SupersetToDefault, Scalar,LO,GO,Node) Teuchos::RCP supersetMap = rcp(new map_t(dummy, myEntries(0,nMyEntries), 0, comm)); - std::cout << me << " SUPERSET MAP" << std::endl; - supersetMap->describe(foo, Teuchos::VERB_EXTREME); - // Create vectors; see what the result is with CombineMode=ADD vector_t defaultVecTgt(defaultMap); @@ -442,9 +403,6 @@ TEUCHOS_UNIT_TEST_TEMPLATE_4_DECL(Bug7758, SupersetToDefault, Scalar,LO,GO,Node) Tpetra::Export supersetToDefault(supersetMap, defaultMap); defaultVecTgt.doExport(supersetVecSrc, supersetToDefault, Tpetra::ADD); - std::cout << me << " SUPERSET TO DEFAULT " << std::endl; - defaultVecTgt.describe(foo, Teuchos::VERB_EXTREME); - auto data = defaultVecTgt.getLocalViewHost(Tpetra::Access::ReadOnly); for (size_t i = 0; i < defaultVecTgt.getLocalLength()/2; i++) if (data(i,0) != srcScalar+srcScalar) ierr++; // overlapped @@ -490,9 +448,6 @@ TEUCHOS_UNIT_TEST_TEMPLATE_4_DECL(Bug7758, NoSamesToDefault, Scalar,LO,GO,Node) Teuchos::RCP defaultMap = rcp(new map_t(nGlobalEntries, 0, comm)); - std::cout << me << " DEFAULT MAP" << std::endl; - defaultMap->describe(foo, Teuchos::VERB_EXTREME); - // Map with no sames or permutes int nMyEntries = 0; for (size_t i = 0; i < defaultMap->getNodeNumElements(); i++) { @@ -505,9 +460,6 @@ TEUCHOS_UNIT_TEST_TEMPLATE_4_DECL(Bug7758, NoSamesToDefault, Scalar,LO,GO,Node) Teuchos::RCP noSamesMap = rcp(new map_t(dummy, myEntries(0,nMyEntries), 0, comm)); - std::cout << me << " NOSAMES MAP" << std::endl; - noSamesMap->describe(foo, Teuchos::VERB_EXTREME); - // Create vectors; see what the result is with CombineMode=ADD vector_t defaultVecTgt(defaultMap); @@ -521,9 +473,6 @@ TEUCHOS_UNIT_TEST_TEMPLATE_4_DECL(Bug7758, NoSamesToDefault, Scalar,LO,GO,Node) Tpetra::Export noSamesToDefault(noSamesMap, defaultMap); defaultVecTgt.doExport(noSamesVecSrc, noSamesToDefault, Tpetra::ADD); - std::cout << me << " NOSAMES TO DEFAULT " << std::endl; - defaultVecTgt.describe(foo, Teuchos::VERB_EXTREME); - auto data = defaultVecTgt.getLocalViewHost(Tpetra::Access::ReadOnly); for (size_t i = 0; i < defaultVecTgt.getLocalLength(); i++) if (data(i,0) != tgtScalar + srcScalar) ierr++; diff --git a/packages/xpetra/sup/Utils/Xpetra_ThyraUtils.hpp b/packages/xpetra/sup/Utils/Xpetra_ThyraUtils.hpp index 76aba0bc7597..ae25f862fdd3 100644 --- a/packages/xpetra/sup/Utils/Xpetra_ThyraUtils.hpp +++ b/packages/xpetra/sup/Utils/Xpetra_ThyraUtils.hpp @@ -86,6 +86,7 @@ #ifdef HAVE_XPETRA_TPETRA #include #include +#include #include #include 
#include @@ -437,67 +438,48 @@ class ThyraUtils { static Teuchos::RCP > toThyraMultiVector(Teuchos::RCP > vec) { - // create Thyra vector space out of Xpetra Map - Teuchos::RCP > thMap = Xpetra::ThyraUtils::toThyra(vec->getMap()); - TEUCHOS_TEST_FOR_EXCEPTION(Teuchos::as(vec->getMap()->getGlobalNumElements())!=thMap->dim(), std::logic_error, "Global dimension of Xpetra map and Thyra VectorSpaceBase are different."); - Teuchos::RCP > thSpmdMap = Teuchos::rcp_dynamic_cast >(thMap); - TEUCHOS_TEST_FOR_EXCEPTION(thSpmdMap == Teuchos::null, std::logic_error, "Cannot cast VectorSpaceBase to SpmdVectorSpaceBase."); - TEUCHOS_TEST_FOR_EXCEPTION(Teuchos::as(vec->getMap()->getNodeNumElements())!=thSpmdMap->localSubDim(), std::logic_error, "Local dimension of Xpetra map and Thyra VectorSpaceBase on one (or more) processor(s) are different."); - // create Thyra MultiVector - Teuchos::RCP< Thyra::MultiVectorBase > thMVec = Thyra::createMembers(thMap, vec->getNumVectors()); - Teuchos::RCP< Thyra::SpmdMultiVectorBase > thSpmdMVec = Teuchos::rcp_dynamic_cast >(thMVec); - TEUCHOS_TEST_FOR_EXCEPTION(thSpmdMVec == Teuchos::null, std::logic_error, "Cannot cast MultiVectorBase to SpmdMultiVectorBase."); - - // fill multivector with some data - const LocalOrdinal localOffset = ( thSpmdMap != Teuchos::null ? thSpmdMap->localOffset() : 0 ); - const LocalOrdinal localSubDim = ( thSpmdMap != Teuchos::null ? thSpmdMap->localSubDim() : thMap->dim() ); - Teuchos::RCP > thyData = - Teuchos::rcp(new Thyra::DetachedMultiVectorView(*thSpmdMVec,Teuchos::Range1D(localOffset,localOffset+localSubDim-1))); - - // loop over all vectors in multivector - for(size_t j = 0; j < Teuchos::as(thSpmdMVec->domain()->dim()); ++j) { - Teuchos::ArrayRCP< const Scalar > vecData = vec->getData(j); - // loop over all local rows - for(LocalOrdinal i = 0; i < localSubDim; ++i) { - (*thyData)(i,j) = vecData[i]; - } +#ifdef HAVE_XPETRA_TPETRA + if (vec->getMap()->lib() == Xpetra::UseTpetra) { + auto thyTpMap = Thyra::tpetraVectorSpace(Teuchos::rcp_dynamic_cast(vec->getMap())->getTpetra_Map()); + RCP> tpMV = Teuchos::rcp_dynamic_cast(vec)->getTpetra_MultiVector(); + auto thyDomMap = Thyra::tpetraVectorSpace(Tpetra::createLocalMapWithNode(vec->getNumVectors(), vec->getMap()->getComm())); + auto thyMV = rcp(new Thyra::TpetraMultiVector()); + thyMV->initialize(thyTpMap, thyDomMap, tpMV); + return thyMV; + } +#endif + +#ifdef HAVE_XPETRA_EPETRA + if (vec->getMap()->lib() == Xpetra::UseEpetra) { + TEUCHOS_TEST_FOR_EXCEPTION(true, Xpetra::Exceptions::RuntimeError, "Epetra needs SC=double, LO=int, and GO=int or GO=long long"); } +#endif - return thMVec; + TEUCHOS_TEST_FOR_EXCEPTION(true, Xpetra::Exceptions::RuntimeError, "MultiVector cannot be converted to Thyra."); } static Teuchos::RCP > toThyraVector(Teuchos::RCP > vec) { - // create Thyra vector space out of Xpetra Map - Teuchos::RCP > thMap = Xpetra::ThyraUtils::toThyra(vec->getMap()); - TEUCHOS_TEST_FOR_EXCEPTION(Teuchos::as(vec->getMap()->getGlobalNumElements())!=thMap->dim(), std::logic_error, "Global dimension of Xpetra map and Thyra VectorSpaceBase are different."); - Teuchos::RCP > thSpmdMap = Teuchos::rcp_dynamic_cast >(thMap); - TEUCHOS_TEST_FOR_EXCEPTION(thSpmdMap == Teuchos::null, std::logic_error, "Cannot cast VectorSpaceBase to SpmdVectorSpaceBase."); - TEUCHOS_TEST_FOR_EXCEPTION(Teuchos::as(vec->getMap()->getNodeNumElements())!=thSpmdMap->localSubDim(), std::logic_error, "Local dimension of Xpetra map and Thyra VectorSpaceBase on one (or more) processor(s) are different."); + // create 
Thyra Vector +#ifdef HAVE_XPETRA_TPETRA + if (vec->getMap()->lib() == Xpetra::UseTpetra) { + auto thyTpMap = Thyra::tpetraVectorSpace(Teuchos::rcp_dynamic_cast(vec->getMap())->getTpetra_Map()); + RCP> tpVec = Teuchos::rcp_dynamic_cast(vec)->getTpetra_Vector(); + auto thyVec = rcp(new Thyra::TpetraVector()); + thyVec->initialize(thyTpMap, tpVec); + return thyVec; + } +#endif - // create Thyra MultiVector - Teuchos::RCP< Thyra::VectorBase > thMVec = Thyra::createMember(thMap); - Teuchos::RCP< Thyra::SpmdVectorBase > thSpmdMVec = Teuchos::rcp_dynamic_cast >(thMVec); - TEUCHOS_TEST_FOR_EXCEPTION(thSpmdMVec == Teuchos::null, std::logic_error, "Cannot cast VectorBase to SpmdVectorBase."); - - // fill multivector with some data - const LocalOrdinal localOffset = ( thSpmdMap != Teuchos::null ? thSpmdMap->localOffset() : 0 ); - const LocalOrdinal localSubDim = ( thSpmdMap != Teuchos::null ? thSpmdMap->localSubDim() : thMap->dim() ); - Teuchos::RCP > thyData = - Teuchos::rcp(new Thyra::DetachedMultiVectorView(*thSpmdMVec,Teuchos::Range1D(localOffset,localOffset+localSubDim-1))); - - // loop over all vectors in multivector - for(size_t j = 0; j < Teuchos::as(thSpmdMVec->domain()->dim()); ++j) { - Teuchos::ArrayRCP< const Scalar > vecData = vec->getData(j); - // loop over all local rows - for(LocalOrdinal i = 0; i < localSubDim; ++i) { - (*thyData)(i,j) = vecData[i]; - } +#ifdef HAVE_XPETRA_EPETRA + if (vec->getMap()->lib() == Xpetra::UseEpetra) { + TEUCHOS_TEST_FOR_EXCEPTION(true, Xpetra::Exceptions::RuntimeError, "Epetra needs SC=double, LO=int, and GO=int or GO=long long"); } +#endif - return thMVec; + TEUCHOS_TEST_FOR_EXCEPTION(true, Xpetra::Exceptions::RuntimeError, "Vector cannot be converted to Thyra."); } // update Thyra multi vector with data from Xpetra multi vector @@ -1211,67 +1193,55 @@ class ThyraUtils { static Teuchos::RCP > toThyraMultiVector(Teuchos::RCP > vec) { - // create Thyra vector space out of Xpetra Map - Teuchos::RCP > thMap = Xpetra::ThyraUtils::toThyra(vec->getMap()); - TEUCHOS_TEST_FOR_EXCEPTION(Teuchos::as(vec->getMap()->getGlobalNumElements())!=thMap->dim(), std::logic_error, "Global dimension of Xpetra map and Thyra VectorSpaceBase are different."); - Teuchos::RCP > thSpmdMap = Teuchos::rcp_dynamic_cast >(thMap); - TEUCHOS_TEST_FOR_EXCEPTION(thSpmdMap == Teuchos::null, std::logic_error, "Cannot cast VectorSpaceBase to SpmdVectorSpaceBase."); - TEUCHOS_TEST_FOR_EXCEPTION(Teuchos::as(vec->getMap()->getNodeNumElements())!=thSpmdMap->localSubDim(), std::logic_error, "Local dimension of Xpetra map and Thyra VectorSpaceBase on one (or more) processor(s) are different."); - // create Thyra MultiVector - Teuchos::RCP< Thyra::MultiVectorBase > thMVec = Thyra::createMembers(thMap, vec->getNumVectors()); - Teuchos::RCP< Thyra::SpmdMultiVectorBase > thSpmdMVec = Teuchos::rcp_dynamic_cast >(thMVec); - TEUCHOS_TEST_FOR_EXCEPTION(thSpmdMVec == Teuchos::null, std::logic_error, "Cannot cast MultiVectorBase to SpmdMultiVectorBase."); - - // fill multivector with some data - const LocalOrdinal localOffset = ( thSpmdMap != Teuchos::null ? thSpmdMap->localOffset() : 0 ); - const LocalOrdinal localSubDim = ( thSpmdMap != Teuchos::null ? 
-        thSpmdMap->localSubDim() : thMap->dim() );
-    Teuchos::RCP<Thyra::DetachedMultiVectorView<Scalar> > thyData =
-        Teuchos::rcp(new Thyra::DetachedMultiVectorView<Scalar>(*thSpmdMVec,Teuchos::Range1D(localOffset,localOffset+localSubDim-1)));
-
-    // loop over all vectors in multivector
-    for(size_t j = 0; j < Teuchos::as<size_t>(thSpmdMVec->domain()->dim()); ++j) {
-      Teuchos::ArrayRCP< const Scalar > vecData = vec->getData(j);
-      // loop over all local rows
-      for(LocalOrdinal i = 0; i < localSubDim; ++i) {
-        (*thyData)(i,j) = vecData[i];
-      }
+#ifdef HAVE_XPETRA_TPETRA
+    if (vec->getMap()->lib() == Xpetra::UseTpetra) {
+      auto thyTpMap = Thyra::tpetraVectorSpace<Scalar,LocalOrdinal,GlobalOrdinal,Node>(Teuchos::rcp_dynamic_cast<const TpetraMap>(vec->getMap())->getTpetra_Map());
+      RCP<Tpetra::MultiVector<Scalar,LocalOrdinal,GlobalOrdinal,Node>> tpMV = Teuchos::rcp_dynamic_cast<TpetraMultiVector>(vec)->getTpetra_MultiVector();
+      auto thyDomMap = Thyra::tpetraVectorSpace<Scalar,LocalOrdinal,GlobalOrdinal,Node>(Tpetra::createLocalMapWithNode<LocalOrdinal,GlobalOrdinal,Node>(vec->getNumVectors(), vec->getMap()->getComm()));
+      auto thyMV = rcp(new Thyra::TpetraMultiVector<Scalar,LocalOrdinal,GlobalOrdinal,Node>());
+      thyMV->initialize(thyTpMap, thyDomMap, tpMV);
+      return thyMV;
+    }
+#endif
+
+#ifdef HAVE_XPETRA_EPETRA
+    if (vec->getMap()->lib() == Xpetra::UseEpetra) {
+      auto thyEpMap = Thyra::create_VectorSpace(Teuchos::rcp_dynamic_cast<const Xpetra::EpetraMapT<GlobalOrdinal,Node> >(vec->getMap())->getEpetra_MapRCP());
+      auto epMV = Teuchos::rcp_dynamic_cast<const Xpetra::EpetraMultiVectorT<GlobalOrdinal,Node> >(vec)->getEpetra_MultiVector();
+      auto thyMV = Thyra::create_MultiVector(epMV, thyEpMap);
+      return thyMV;
     }
+#endif
+
+    TEUCHOS_TEST_FOR_EXCEPTION(true, Xpetra::Exceptions::RuntimeError, "MultiVector cannot be converted to Thyra.");
-    return thMVec;
   }
 
   static Teuchos::RCP<Thyra::VectorBase<Scalar> > toThyraVector(Teuchos::RCP<Xpetra::Vector<Scalar, LocalOrdinal, GlobalOrdinal, Node> > vec) {
-    // create Thyra vector space out of Xpetra Map
-    Teuchos::RCP<const Thyra::VectorSpaceBase<Scalar> > thMap = Xpetra::ThyraUtils<Scalar,LocalOrdinal,GlobalOrdinal,Node>::toThyra(vec->getMap());
-    TEUCHOS_TEST_FOR_EXCEPTION(Teuchos::as<Thyra::Ordinal>(vec->getMap()->getGlobalNumElements())!=thMap->dim(), std::logic_error, "Global dimension of Xpetra map and Thyra VectorSpaceBase are different.");
-    Teuchos::RCP<const Thyra::SpmdVectorSpaceBase<Scalar> > thSpmdMap = Teuchos::rcp_dynamic_cast<const Thyra::SpmdVectorSpaceBase<Scalar> >(thMap);
-    TEUCHOS_TEST_FOR_EXCEPTION(thSpmdMap == Teuchos::null, std::logic_error, "Cannot cast VectorSpaceBase to SpmdVectorSpaceBase.");
-    TEUCHOS_TEST_FOR_EXCEPTION(Teuchos::as<Thyra::Ordinal>(vec->getMap()->getNodeNumElements())!=thSpmdMap->localSubDim(), std::logic_error, "Local dimension of Xpetra map and Thyra VectorSpaceBase on one (or more) processor(s) are different.");
+    // create Thyra Vector
+#ifdef HAVE_XPETRA_TPETRA
+    if (vec->getMap()->lib() == Xpetra::UseTpetra) {
+      auto thyTpMap = Thyra::tpetraVectorSpace<Scalar,LocalOrdinal,GlobalOrdinal,Node>(Teuchos::rcp_dynamic_cast<const TpetraMap>(vec->getMap())->getTpetra_Map());
+      RCP<Tpetra::Vector<Scalar,LocalOrdinal,GlobalOrdinal,Node>> tpVec = Teuchos::rcp_dynamic_cast<TpetraVector>(vec)->getTpetra_Vector();
+      auto thyVec = rcp(new Thyra::TpetraVector<Scalar,LocalOrdinal,GlobalOrdinal,Node>());
+      thyVec->initialize(thyTpMap, tpVec);
+      return thyVec;
+    }
+#endif
-    // create Thyra MultiVector
-    Teuchos::RCP< Thyra::VectorBase<Scalar> > thMVec = Thyra::createMember(thMap);
-    Teuchos::RCP< Thyra::SpmdVectorBase<Scalar> > thSpmdMVec = Teuchos::rcp_dynamic_cast<Thyra::SpmdVectorBase<Scalar> >(thMVec);
-    TEUCHOS_TEST_FOR_EXCEPTION(thSpmdMVec == Teuchos::null, std::logic_error, "Cannot cast VectorBase to SpmdVectorBase.");
-
-    // fill multivector with some data
-    const LocalOrdinal localOffset = ( thSpmdMap != Teuchos::null ? thSpmdMap->localOffset() : 0 );
-    const LocalOrdinal localSubDim = ( thSpmdMap != Teuchos::null ?
-        thSpmdMap->localSubDim() : thMap->dim() );
-    Teuchos::RCP<Thyra::DetachedMultiVectorView<Scalar> > thyData =
-        Teuchos::rcp(new Thyra::DetachedMultiVectorView<Scalar>(*thSpmdMVec,Teuchos::Range1D(localOffset,localOffset+localSubDim-1)));
-
-    // loop over all vectors in multivector
-    for(size_t j = 0; j < Teuchos::as<size_t>(thSpmdMVec->domain()->dim()); ++j) {
-      Teuchos::ArrayRCP< const Scalar > vecData = vec->getData(j);
-      // loop over all local rows
-      for(LocalOrdinal i = 0; i < localSubDim; ++i) {
-        (*thyData)(i,j) = vecData[i];
-      }
+#ifdef HAVE_XPETRA_EPETRA
+    if (vec->getMap()->lib() == Xpetra::UseEpetra) {
+      auto thyEpMap = Thyra::create_VectorSpace(Teuchos::rcp_dynamic_cast<const Xpetra::EpetraMapT<GlobalOrdinal,Node> >(vec->getMap())->getEpetra_MapRCP());
+      auto epVec = rcp(Teuchos::rcp_dynamic_cast<Xpetra::EpetraVectorT<GlobalOrdinal,Node> >(vec)->getEpetra_Vector(), false);
+      auto thyVec = Thyra::create_Vector(epVec, thyEpMap);
+      return thyVec;
     }
+#endif
-    return thMVec;
+    TEUCHOS_TEST_FOR_EXCEPTION(true, Xpetra::Exceptions::RuntimeError, "Vector cannot be converted to Thyra.");
   }
 
   static void updateThyra(Teuchos::RCP<const Xpetra::MultiVector<Scalar, LocalOrdinal, GlobalOrdinal, Node> > source, Teuchos::RCP<const Xpetra::MapExtractor<Scalar, LocalOrdinal, GlobalOrdinal, Node> > mapExtractor, const Teuchos::RCP<Thyra::MultiVectorBase<Scalar> > & target) {
@@ -1927,67 +1897,54 @@ class ThyraUtils {
   static Teuchos::RCP<Thyra::MultiVectorBase<Scalar> > toThyraMultiVector(Teuchos::RCP<Xpetra::MultiVector<Scalar, LocalOrdinal, GlobalOrdinal, Node> > vec) {
-    // create Thyra vector space out of Xpetra Map
-    Teuchos::RCP<const Thyra::VectorSpaceBase<Scalar> > thMap = Xpetra::ThyraUtils<Scalar,LocalOrdinal,GlobalOrdinal,Node>::toThyra(vec->getMap());
-    TEUCHOS_TEST_FOR_EXCEPTION(Teuchos::as<Thyra::Ordinal>(vec->getMap()->getGlobalNumElements())!=thMap->dim(), std::logic_error, "Global dimension of Xpetra map and Thyra VectorSpaceBase are different.");
-    Teuchos::RCP<const Thyra::SpmdVectorSpaceBase<Scalar> > thSpmdMap = Teuchos::rcp_dynamic_cast<const Thyra::SpmdVectorSpaceBase<Scalar> >(thMap);
-    TEUCHOS_TEST_FOR_EXCEPTION(thSpmdMap == Teuchos::null, std::logic_error, "Cannot cast VectorSpaceBase to SpmdVectorSpaceBase.");
-    TEUCHOS_TEST_FOR_EXCEPTION(Teuchos::as<Thyra::Ordinal>(vec->getMap()->getNodeNumElements())!=thSpmdMap->localSubDim(), std::logic_error, "Local dimension of Xpetra map and Thyra VectorSpaceBase on one (or more) processor(s) are different.");
-    // create Thyra MultiVector
-    Teuchos::RCP< Thyra::MultiVectorBase<Scalar> > thMVec = Thyra::createMembers(thMap, vec->getNumVectors());
-    Teuchos::RCP< Thyra::SpmdMultiVectorBase<Scalar> > thSpmdMVec = Teuchos::rcp_dynamic_cast<Thyra::SpmdMultiVectorBase<Scalar> >(thMVec);
-    TEUCHOS_TEST_FOR_EXCEPTION(thSpmdMVec == Teuchos::null, std::logic_error, "Cannot cast MultiVectorBase to SpmdMultiVectorBase.");
-
-    // fill multivector with some data
-    const LocalOrdinal localOffset = ( thSpmdMap != Teuchos::null ? thSpmdMap->localOffset() : 0 );
-    const LocalOrdinal localSubDim = ( thSpmdMap != Teuchos::null ?
-        thSpmdMap->localSubDim() : thMap->dim() );
-    Teuchos::RCP<Thyra::DetachedMultiVectorView<Scalar> > thyData =
-        Teuchos::rcp(new Thyra::DetachedMultiVectorView<Scalar>(*thSpmdMVec,Teuchos::Range1D(localOffset,localOffset+localSubDim-1)));
-
-    // loop over all vectors in multivector
-    for(size_t j = 0; j < Teuchos::as<size_t>(thSpmdMVec->domain()->dim()); ++j) {
-      Teuchos::ArrayRCP< const Scalar > vecData = vec->getData(j);
-      // loop over all local rows
-      for(LocalOrdinal i = 0; i < localSubDim; ++i) {
-        (*thyData)(i,j) = vecData[i];
-      }
+#ifdef HAVE_XPETRA_TPETRA
+    if (vec->getMap()->lib() == Xpetra::UseTpetra) {
+      auto thyTpMap = Thyra::tpetraVectorSpace<Scalar,LocalOrdinal,GlobalOrdinal,Node>(Teuchos::rcp_dynamic_cast<const TpetraMap>(vec->getMap())->getTpetra_Map());
+      RCP<Tpetra::MultiVector<Scalar,LocalOrdinal,GlobalOrdinal,Node>> tpMV = Teuchos::rcp_dynamic_cast<TpetraMultiVector>(vec)->getTpetra_MultiVector();
+      auto thyDomMap = Thyra::tpetraVectorSpace<Scalar,LocalOrdinal,GlobalOrdinal,Node>(Tpetra::createLocalMapWithNode<LocalOrdinal,GlobalOrdinal,Node>(vec->getNumVectors(), vec->getMap()->getComm()));
+      auto thyMV = rcp(new Thyra::TpetraMultiVector<Scalar,LocalOrdinal,GlobalOrdinal,Node>());
+      thyMV->initialize(thyTpMap, thyDomMap, tpMV);
+      return thyMV;
     }
+#endif
+
+#ifdef HAVE_XPETRA_EPETRA
+    if (vec->getMap()->lib() == Xpetra::UseEpetra) {
+      auto thyEpMap = Thyra::create_VectorSpace(Teuchos::rcp_dynamic_cast<const Xpetra::EpetraMapT<GlobalOrdinal,Node> >(vec->getMap())->getEpetra_MapRCP());
+      auto epMV = Teuchos::rcp_dynamic_cast<const Xpetra::EpetraMultiVectorT<GlobalOrdinal,Node> >(vec)->getEpetra_MultiVector();
+      auto thyMV = Thyra::create_MultiVector(epMV, thyEpMap);
+      return thyMV;
+    }
+#endif
-    return thMVec;
+    TEUCHOS_TEST_FOR_EXCEPTION(true, Xpetra::Exceptions::RuntimeError, "MultiVector cannot be converted to Thyra.");
   }
 
   static Teuchos::RCP<Thyra::VectorBase<Scalar> > toThyraVector(Teuchos::RCP<Xpetra::Vector<Scalar, LocalOrdinal, GlobalOrdinal, Node> > vec) {
-    // create Thyra vector space out of Xpetra Map
-    Teuchos::RCP<const Thyra::VectorSpaceBase<Scalar> > thMap = Xpetra::ThyraUtils<Scalar,LocalOrdinal,GlobalOrdinal,Node>::toThyra(vec->getMap());
-    TEUCHOS_TEST_FOR_EXCEPTION(Teuchos::as<Thyra::Ordinal>(vec->getMap()->getGlobalNumElements())!=thMap->dim(), std::logic_error, "Global dimension of Xpetra map and Thyra VectorSpaceBase are different.");
-    Teuchos::RCP<const Thyra::SpmdVectorSpaceBase<Scalar> > thSpmdMap = Teuchos::rcp_dynamic_cast<const Thyra::SpmdVectorSpaceBase<Scalar> >(thMap);
-    TEUCHOS_TEST_FOR_EXCEPTION(thSpmdMap == Teuchos::null, std::logic_error, "Cannot cast VectorSpaceBase to SpmdVectorSpaceBase.");
-    TEUCHOS_TEST_FOR_EXCEPTION(Teuchos::as<Thyra::Ordinal>(vec->getMap()->getNodeNumElements())!=thSpmdMap->localSubDim(), std::logic_error, "Local dimension of Xpetra map and Thyra VectorSpaceBase on one (or more) processor(s) are different.");
+    // create Thyra Vector
+#ifdef HAVE_XPETRA_TPETRA
+    if (vec->getMap()->lib() == Xpetra::UseTpetra) {
+      auto thyTpMap = Thyra::tpetraVectorSpace<Scalar,LocalOrdinal,GlobalOrdinal,Node>(Teuchos::rcp_dynamic_cast<const TpetraMap>(vec->getMap())->getTpetra_Map());
+      RCP<Tpetra::Vector<Scalar,LocalOrdinal,GlobalOrdinal,Node>> tpVec = Teuchos::rcp_dynamic_cast<TpetraVector>(vec)->getTpetra_Vector();
+      auto thyVec = rcp(new Thyra::TpetraVector<Scalar,LocalOrdinal,GlobalOrdinal,Node>());
+      thyVec->initialize(thyTpMap, tpVec);
+      return thyVec;
+    }
+#endif
-    // create Thyra MultiVector
-    Teuchos::RCP< Thyra::VectorBase<Scalar> > thMVec = Thyra::createMember(thMap);
-    Teuchos::RCP< Thyra::SpmdVectorBase<Scalar> > thSpmdMVec = Teuchos::rcp_dynamic_cast<Thyra::SpmdVectorBase<Scalar> >(thMVec);
-    TEUCHOS_TEST_FOR_EXCEPTION(thSpmdMVec == Teuchos::null, std::logic_error, "Cannot cast VectorBase to SpmdVectorBase.");
-
-    // fill multivector with some data
-    const LocalOrdinal localOffset = ( thSpmdMap != Teuchos::null ? thSpmdMap->localOffset() : 0 );
-    const LocalOrdinal localSubDim = ( thSpmdMap != Teuchos::null ?
-        thSpmdMap->localSubDim() : thMap->dim() );
-    Teuchos::RCP<Thyra::DetachedMultiVectorView<Scalar> > thyData =
-        Teuchos::rcp(new Thyra::DetachedMultiVectorView<Scalar>(*thSpmdMVec,Teuchos::Range1D(localOffset,localOffset+localSubDim-1)));
-
-    // loop over all vectors in multivector
-    for(size_t j = 0; j < Teuchos::as<size_t>(thSpmdMVec->domain()->dim()); ++j) {
-      Teuchos::ArrayRCP< const Scalar > vecData = vec->getData(j);
-      // loop over all local rows
-      for(LocalOrdinal i = 0; i < localSubDim; ++i) {
-        (*thyData)(i,j) = vecData[i];
-      }
+#ifdef HAVE_XPETRA_EPETRA
+    if (vec->getMap()->lib() == Xpetra::UseEpetra) {
+      auto thyEpMap = Thyra::create_VectorSpace(Teuchos::rcp_dynamic_cast<const Xpetra::EpetraMapT<GlobalOrdinal,Node> >(vec->getMap())->getEpetra_MapRCP());
+      auto epVec = rcp(Teuchos::rcp_dynamic_cast<Xpetra::EpetraVectorT<GlobalOrdinal,Node> >(vec)->getEpetra_Vector(), false);
+      auto thyVec = Thyra::create_Vector(epVec, thyEpMap);
+      return thyVec;
     }
+#endif
-    return thMVec;
+    TEUCHOS_TEST_FOR_EXCEPTION(true, Xpetra::Exceptions::RuntimeError, "Vector cannot be converted to Thyra.");
   }
 
   static void updateThyra(Teuchos::RCP<const Xpetra::MultiVector<Scalar, LocalOrdinal, GlobalOrdinal, Node> > source, Teuchos::RCP<const Xpetra::MapExtractor<Scalar, LocalOrdinal, GlobalOrdinal, Node> > mapExtractor, const Teuchos::RCP<Thyra::MultiVectorBase<Scalar> > & target) {
diff --git a/packages/zoltan2/core/src/algorithms/partition/Zoltan2_AlgScotch.hpp b/packages/zoltan2/core/src/algorithms/partition/Zoltan2_AlgScotch.hpp
index 62a78e4e2f8c..223ab8cc7a7c 100644
--- a/packages/zoltan2/core/src/algorithms/partition/Zoltan2_AlgScotch.hpp
+++ b/packages/zoltan2/core/src/algorithms/partition/Zoltan2_AlgScotch.hpp
@@ -542,7 +542,7 @@ void AlgPTScotch<Adapter>::partition(
   env->memory("Zoltan2-Scotch: After creating solution");
 
   // Clean up copies made due to differing data sizes.
-  TPL_Traits::DELETE_ARRAY(&vertloctab);
+  TPL_Traits::DELETE_ARRAY(&vertloctab);
   TPL_Traits::DELETE_ARRAY(&edgeloctab);
 
   if (nVwgts) delete [] velotab;
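Context for the Xpetra::ThyraUtils hunks above: the old code deep-copied every column of the Xpetra MultiVector into a freshly allocated Thyra member through a Thyra::DetachedMultiVectorView, while the new code simply wraps the underlying Tpetra (or Epetra) object in the corresponding Thyra adapter, so no data is copied. The sketch below is only an illustration of that wrapping pattern using the standard Tpetra-Thyra wrapper layer; it is not part of the patch, the map size and column count are arbitrary, and the header names and helper calls shown (Thyra::tpetraVectorSpace, Thyra::createMultiVector) are my assumptions about the usual adapter API rather than the Xpetra-specific code paths touched above.

// Illustrative sketch only (not part of this patch): wrap an existing
// Tpetra::MultiVector as a Thyra::MultiVectorBase without copying data.
#include <iostream>

#include <Teuchos_RCP.hpp>
#include <Tpetra_Core.hpp>
#include <Tpetra_Map.hpp>
#include <Tpetra_MultiVector.hpp>
#include <Thyra_TpetraThyraWrappers.hpp>

int main(int argc, char* argv[]) {
  Tpetra::ScopeGuard tpetraScope(&argc, &argv);
  {
    using Scalar   = double;
    using LO       = Tpetra::Map<>::local_ordinal_type;
    using GO       = Tpetra::Map<>::global_ordinal_type;
    using Node     = Tpetra::Map<>::node_type;
    using map_type = Tpetra::Map<LO, GO, Node>;
    using mv_type  = Tpetra::MultiVector<Scalar, LO, GO, Node>;

    auto comm = Tpetra::getDefaultComm();

    // A small distributed map (100 global rows, index base 0) and a
    // multivector with 3 columns; the sizes are arbitrary for the example.
    auto map  = Teuchos::rcp(new map_type(100, 0, comm));
    auto tpMV = Teuchos::rcp(new mv_type(map, 3));
    tpMV->putScalar(1.0);

    // Wrap the Tpetra map as a Thyra vector space, then wrap the
    // multivector itself; the Thyra object views the same data.
    auto thySpace = Thyra::tpetraVectorSpace<Scalar, LO, GO, Node>(map);
    auto thyMV    = Thyra::createMultiVector<Scalar, LO, GO, Node>(tpMV, thySpace);

    // Changes made through the Thyra view are visible in the Tpetra object
    // (and vice versa), which is what the refactored conversion relies on.
    std::cout << "Thyra range dim  = " << thyMV->range()->dim() << "\n"
              << "Thyra domain dim = " << thyMV->domain()->dim() << std::endl;
  }
  return 0;
}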