diff --git a/build.sh b/build.sh index c1cfe43d8d..3f82a882da 100755 --- a/build.sh +++ b/build.sh @@ -1,17 +1,10 @@ #!/bin/bash set -eu -uname_s=$(uname -s) -if [[ ${uname_s} == Darwin ]]; then - UFS_MODEL_DIR=$(greadlink -f -n "${BASH_SOURCE[0]}") - UFS_MODEL_DIR=$(dirname "${UFS_MODEL_DIR}") - UFS_MODEL_DIR=$(cd "${UFS_MODEL_DIR}" && pwd -P) -else - UFS_MODEL_DIR=$(readlink -f -n "${BASH_SOURCE[0]}") - UFS_MODEL_DIR=$(dirname "${UFS_MODEL_DIR}") - UFS_MODEL_DIR=$(cd "${UFS_MODEL_DIR}" && pwd -P) -fi -echo "UFS MODEL DIR: ${UFS_MODEL_DIR}" + +SCRIPT_REALPATH=$(realpath "${BASH_SOURCE[0]}") +UFS_MODEL_DIR=$(dirname "${SCRIPT_REALPATH}") readonly UFS_MODEL_DIR +echo "UFS MODEL DIR: ${UFS_MODEL_DIR}" export CC=${CC:-mpicc} export CXX=${CXX:-mpicxx} @@ -26,4 +19,4 @@ for i in ${CMAKE_FLAGS}; do ARR_CMAKE_FLAGS+=("${i}") ; done cmake "${UFS_MODEL_DIR}" "${ARR_CMAKE_FLAGS[@]}" # Turn off OpenMP threading for parallel builds # to avoid exhausting the number of user processes -OMP_NUM_THREADS=1 make -j "${BUILD_JOBS:-4}" "VERBOSE=${BUILD_VERBOSE:-}" \ No newline at end of file +OMP_NUM_THREADS=1 make -j "${BUILD_JOBS:-4}" "VERBOSE=${BUILD_VERBOSE:-}" diff --git a/tests/compile.sh b/tests/compile.sh index 458d985a88..8ab0f60b82 100755 --- a/tests/compile.sh +++ b/tests/compile.sh @@ -12,14 +12,8 @@ function trim { SECONDS=0 -uname_s=$(uname -s) -if [[ ${uname_s} == Darwin ]]; then - greadlnk=$(greadlink -f -n "${BASH_SOURCE[0]}" ) - MYDIR=$(cd "$(dirname "${greadlnk}" )" && pwd -P) -else - readlnk=$(readlink -f -n "${BASH_SOURCE[0]}" ) - MYDIR=$(cd "$(dirname "${readlnk}" )" && pwd -P) -fi +SCRIPT_REALPATH=$(realpath "${BASH_SOURCE[0]}") +MYDIR=$(dirname "${SCRIPT_REALPATH}") readonly MYDIR # ---------------------------------------------------------------------- diff --git a/tests/error-test.conf b/tests/error-test.conf index 2382c59a9e..3e931e67dc 100644 --- a/tests/error-test.conf +++ b/tests/error-test.conf @@ -11,6 +11,9 @@ COMPILE | atm_dyn32 | intel | -DAPP=ATM -DCCPP_SUITES=FV3_GFS_v16,FV3_GFS_v16_fl # This should succeed RUN | control_c48.v2.sfc | | baseline | +# This should fail due to wall clock timeout +RUN | control_c48.v2.sfc_timeout | | baseline | + # These tests should always fail, and prevent the workflow from completing. RUN | fail_to_copy | | baseline | RUN | fail_to_run | | baseline | diff --git a/tests/rt.sh b/tests/rt.sh index ff3808ac0e..5d3b6d96e3 100755 --- a/tests/rt.sh +++ b/tests/rt.sh @@ -1041,6 +1041,7 @@ if [[ ${skip_check_results} == true ]]; then else REGRESSIONTEST_LOG=${PATHRT}/logs/RegressionTests_${MACHINE_ID}.log fi +rm -f "${REGRESSIONTEST_LOG}" TEST_START_TIME="$(date '+%Y%m%d %T')" export TEST_START_TIME diff --git a/tests/rt_utils.sh b/tests/rt_utils.sh index 6f049cf161..3fb1070547 100755 --- a/tests/rt_utils.sh +++ b/tests/rt_utils.sh @@ -124,10 +124,6 @@ submit_and_wait() { local -r job_card=$1 - ROCOTO=${ROCOTO:-false} - ECFLOW=${ECFLOW:-false} - - local test_status='PASS' case ${SCHEDULER} in pbs) qsubout=$( qsub "${job_card}" ) @@ -187,26 +183,38 @@ submit_and_wait() { set +e job_info=$( qstat "${jobid}" ) set -e + if grep -q "${jobid}" <<< "${job_info}"; then + job_running=true + # Getting the status letter from scheduler info + status=$( grep "${jobid}" <<< "${job_info}" ) + status=$( awk '{print $5}' <<< "${status}" ) + else + job_running=false + status='COMPLETED' + set +e + exit_status=$( qstat "${jobid}" -x -f | grep Exit_status | awk '{print $3}') + set -e + if [[ ${exit_status} != 0 ]]; then + status='FAILED' + fi + fi ;; slurm) - job_info=$( squeue -u "${USER}" -j "${jobid}" ) + job_info=$( squeue -u "${USER}" -j "${jobid}" -o '%i %T' ) + if grep -q "${jobid}" <<< "${job_info}"; then + job_running=true + else + job_running=false + job_info=$( sacct -n -j "${jobid}" --format=JobID,state%20,Jobname%64 | grep "^${jobid}" | grep "${JBNME}" ) + fi + # Getting the status letter from scheduler info + status=$( grep "${jobid}" <<< "${job_info}" ) + status=$( awk '{print $2}' <<< "${status}" ) ;; *) ;; esac - - if grep -q "${jobid}" <<< "${job_info}"; then - job_running=true - else - job_running=false - continue - fi - - # Getting the status letter from scheduler info - status=$( grep "${jobid}" <<< "${job_info}" ) - status=$( awk '{print $5}' <<< "${status}" ) - case ${status} in #waiting cases #pbs: Q @@ -217,7 +225,7 @@ submit_and_wait() { #running cases #pbs: R #slurm: (old: R, new: RUNNING) - R|RUNNING) + R|RUNNING|COMPLETING) status_label='Job running' ;; #held cases @@ -229,14 +237,15 @@ submit_and_wait() { #fail/completed cases #slurm: F/FAILED TO/TIMEOUT CA/CANCELLED F|TO|CA|FAILED|TIMEOUT|CANCELLED) - echo "rt_utils.sh: !!!!!!!!!!JOB TERMINATED!!!!!!!!!!" + echo "rt_utils.sh: !!!!!!!!!!JOB TERMINATED!!!!!!!!!! status=${status}" job_running=false #Trip the loop to end with these status flags interrupt_job exit 1 ;; #completed - #pbs only: C-Complete E-Exiting - C|E) + #pbs: C-Complete E-Exiting + #slurm: CD/COMPLETED + C|E|CD|COMPLETED) status_label='Completed' ;; *) @@ -253,140 +262,6 @@ submit_and_wait() { done } -check_results() { - echo "rt_utils.sh: Checking results of the regression test: ${TEST_ID}" - - ROCOTO=${ROCOTO:-false} - ECFLOW=${ECFLOW:-false} - - local test_status='PASS' - - # Give one minute for data to show up on file system - #sleep 60 - - { - echo - echo "baseline dir = ${RTPWD}/${CNTL_DIR}_${RT_COMPILER}" - echo "working dir = ${RUNDIR}" - echo "Checking test ${TEST_ID} results ...." - } > "${RT_LOG}" - echo - echo "baseline dir = ${RTPWD}/${CNTL_DIR}_${RT_COMPILER}" - echo "working dir = ${RUNDIR}" - echo "Checking test ${TEST_ID} results ...." - - if [[ ${CREATE_BASELINE} = false ]]; then - # - # --- regression test comparison - # - for i in ${LIST_FILES} ; do - printf %s " Comparing ${i} ....." >> "${RT_LOG}" - printf %s " Comparing ${i} ....." - - if [[ ! -f ${RUNDIR}/${i} ]] ; then - - echo ".......MISSING file" >> "${RT_LOG}" - echo ".......MISSING file" - test_status='FAIL' - - elif [[ ! -f ${RTPWD}/${CNTL_DIR}_${RT_COMPILER}/${i} ]] ; then - - echo ".......MISSING baseline" >> "${RT_LOG}" - echo ".......MISSING baseline" - test_status='FAIL' - - else - if [[ ${i##*.} == nc* ]] ; then - if [[ " orion hercules hera wcoss2 acorn derecho gaea jet s4 noaacloud " =~ ${MACHINE_ID} ]]; then - printf "USING NCCMP.." >> "${RT_LOG}" - printf "USING NCCMP.." - if [[ ${CMP_DATAONLY} == false ]]; then - nccmp -d -S -q -f -g -B --Attribute=checksum --warn=format "${RTPWD}/${CNTL_DIR}_${RT_COMPILER}/${i}" "${RUNDIR}/${i}" > "${i}_nccmp.log" 2>&1 && d=$? || d=$? - else - nccmp -d -S -q -f -B --Attribute=checksum --warn=format "${RTPWD}/${CNTL_DIR}_${RT_COMPILER}/${i}" "${RUNDIR}/${i}" > "${i}_nccmp.log" 2>&1 && d=$? || d=$? - fi - if [[ ${d} -ne 0 && ${d} -ne 1 ]]; then - printf "....ERROR" >> "${RT_LOG}" - printf "....ERROR" - test_status='FAIL' - fi - fi - else - printf "USING CMP.." >> "${RT_LOG}" - printf "USING CMP.." - cmp "${RTPWD}/${CNTL_DIR}_${RT_COMPILER}/${i}" "${RUNDIR}/${i}" >/dev/null 2>&1 && d=$? || d=$? - if [[ ${d} -eq 2 ]]; then - printf "....ERROR" >> "${RT_LOG}" - printf "....ERROR" - test_status='FAIL' - fi - - fi - - if [[ ${d} -ne 0 ]]; then - echo "....NOT IDENTICAL" >> "${RT_LOG}" - echo "....NOT IDENTICAL" - test_status='FAIL' - else - echo "....OK" >> "${RT_LOG}" - echo "....OK" - fi - - fi - - done - - else - # - # --- create baselines - # - echo;echo "Moving baseline ${TEST_ID} files ...." - echo;echo "Moving baseline ${TEST_ID} files ...." >> "${RT_LOG}" - - for i in ${LIST_FILES} ; do - printf %s " Moving ${i} ....." - printf %s " Moving ${i} ....." >> "${RT_LOG}" - if [[ -f ${RUNDIR}/${i} ]] ; then - mkdir -p "${NEW_BASELINE}/${CNTL_DIR}_${RT_COMPILER}/$(dirname "${i}")" - cp "${RUNDIR}/${i}" "${NEW_BASELINE}/${CNTL_DIR}_${RT_COMPILER}/${i}" - echo "....OK" >> "${RT_LOG}" - echo "....OK" - else - echo "....NOT OK. Missing ${RUNDIR}/${i}" >> "${RT_LOG}" - echo "....NOT OK. Missing ${RUNDIR}/${i}" - test_status='FAIL' - fi - done - - fi - - { - echo - grep "The total amount of wall time" "${RUNDIR}/out" - grep "The maximum resident set size" "${RUNDIR}/out" - echo - } >> "${RT_LOG}" - - TRIES='' - if [[ ${ECFLOW} == true ]]; then - if [[ ${ECF_TRYNO} -gt 1 ]]; then - TRIES=" Tries: ${ECF_TRYNO}" - fi - fi - echo "Test ${TEST_ID} ${test_status}${TRIES}" >> "${RT_LOG}" - echo >> "${RT_LOG}" - echo "Test ${TEST_ID} ${test_status}${TRIES}" - echo - - if [[ ${test_status} = 'FAIL' ]]; then - echo "${TEST_ID} failed in check_result" >> "${PATHRT}/fail_test_${TEST_ID}" - return 1 - else - return 0 - fi -} - - kill_job() { echo "rt_utils.sh: Killing job: ${jobid} on ${SCHEDULER}..." [[ -z $1 ]] && exit 1 @@ -580,14 +455,16 @@ ecflow_create_compile_task() { cat << EOF > "${ECFLOW_RUN}/${ECFLOW_SUITE}/compile_${COMPILE_ID}.ecf" %include -${PATHRT}/run_compile.sh "${PATHRT}" "${RUNDIR_ROOT}" "${MAKE_OPT}" "${COMPILE_ID}" > "${LOG_DIR}/compile_${COMPILE_ID}.log" 2>&1 & +( +cd "${LOG_DIR}" +ln -sf "compile_${COMPILE_ID}.log.\${ECF_TRYNO}" "compile_${COMPILE_ID}.log" +) +${PATHRT}/run_compile.sh "${PATHRT}" "${RUNDIR_ROOT}" "${MAKE_OPT}" "${COMPILE_ID}" > "${LOG_DIR}/compile_${COMPILE_ID}.log.\${ECF_TRYNO}" 2>&1 & %include EOF { echo " task compile_${COMPILE_ID}" echo " label build_options '${MAKE_OPT}'" - echo " label job_id ''" - echo " label job_status ''" echo " inlimit max_builds" } >> "${ECFLOW_RUN}/${ECFLOW_SUITE}.def" } @@ -596,13 +473,15 @@ ecflow_create_run_task() { echo "rt_utils.sh: ${TEST_ID}: Creating ECFLOW run task" cat << EOF > "${ECFLOW_RUN}/${ECFLOW_SUITE}/${TEST_ID}${RT_SUFFIX}.ecf" %include -${PATHRT}/run_test.sh "${PATHRT}" "${RUNDIR_ROOT}" "${TEST_NAME}" "${TEST_ID}" "${COMPILE_ID}" > "${LOG_DIR}/run_${TEST_ID}${RT_SUFFIX}.log" 2>&1 & +( +cd "${LOG_DIR}" +ln -sf "run_${TEST_ID}${RT_SUFFIX}.log.\${ECF_TRYNO}" "${LOG_DIR}/run_${TEST_ID}${RT_SUFFIX}.log" +) +${PATHRT}/run_test.sh "${PATHRT}" "${RUNDIR_ROOT}" "${TEST_NAME}" "${TEST_ID}" "${COMPILE_ID}" > "${LOG_DIR}/run_${TEST_ID}${RT_SUFFIX}.log.\${ECF_TRYNO}" 2>&1 & %include EOF { echo " task ${TEST_ID}${RT_SUFFIX}" - echo " label job_id ''" - echo " label job_status ''" echo " inlimit max_jobs" } >> "${ECFLOW_RUN}/${ECFLOW_SUITE}.def" if [[ ${DEP_RUN} != '' ]]; then diff --git a/tests/run_compile.sh b/tests/run_compile.sh index 1685f89653..6eeb72b13e 100755 --- a/tests/run_compile.sh +++ b/tests/run_compile.sh @@ -17,7 +17,16 @@ cleanup() { write_fail_test() { echo "${JBNME} failed in run_compile" >> "${PATHRT}/fail_${JBNME}" - exit 1 + if [[ ${ROCOTO:-false} == true ]] || [[ ${ECFLOW:-false} == true ]]; then + # if this script has been submitted by a workflow return non-zero exit status + # so that workflow can resubmit it + exit 1 + else + # if this script has been executed interactively, return zero exit status + # so that rt.sh can continue running, and hope that rt.sh's generate_log + # will catch failed tests + exit 0 + fi } remove_fail_test() { diff --git a/tests/run_test.sh b/tests/run_test.sh index ace4fd0cf1..88e10210d6 100755 --- a/tests/run_test.sh +++ b/tests/run_test.sh @@ -17,12 +17,16 @@ cleanup() { write_fail_test() { echo "${TEST_ID} failed in run_test" >> "${PATHRT}/fail_test_${TEST_ID}" - exit 1 -} - -remove_fail_test() { - echo "Removing test failure flag file for ${TEST_ID}" - rm -f "${PATHRT}/fail_test_${TEST_ID}" + if [[ ${ROCOTO:-false} == true ]] || [[ ${ECFLOW:-false} == true ]]; then + # if this script has been submitted by a workflow return non-zero exit status + # so that workflow can resubmit it + exit 1 + else + # if this script has been executed interactively, return zero exit status + # so that rt.sh can continue running, and hope that rt.sh's generate_log + # will catch failed tests + exit 0 + fi } if [[ $# != 5 ]]; then @@ -53,7 +57,7 @@ source default_vars.sh [[ -e ${RUNDIR_ROOT}/run_test_${TEST_ID}.env ]] && source "${RUNDIR_ROOT}/run_test_${TEST_ID}.env" source "tests/${TEST_NAME}" -remove_fail_test +rm -f "${PATHRT}/fail_test_${TEST_ID}" # Save original CNTL_DIR name as INPUT_DIR for regression # tests that try to copy input data from CNTL_DIR @@ -396,11 +400,123 @@ else fi skip_check_results=${skip_check_results:-false} -results_okay=YES -if [[ ${skip_check_results} = false ]]; then - if ( ! check_results ) ; then - results_okay=NO +if [[ ${skip_check_results} == false ]]; then + + test_status='PASS' + + { + echo + echo "baseline dir = ${RTPWD}/${CNTL_DIR}_${RT_COMPILER}" + echo "working dir = ${RUNDIR}" + echo "Checking test ${TEST_ID} results ...." + } > "${RT_LOG}" + echo + echo "baseline dir = ${RTPWD}/${CNTL_DIR}_${RT_COMPILER}" + echo "working dir = ${RUNDIR}" + echo "Checking test ${TEST_ID} results ...." + + if [[ ${CREATE_BASELINE} = false ]]; then + # + # --- regression test comparison + # + for i in ${LIST_FILES} ; do + printf %s " Comparing ${i} ....." >> "${RT_LOG}" + printf %s " Comparing ${i} ....." + + if [[ ! -f ${RUNDIR}/${i} ]] ; then + + echo ".......MISSING file" >> "${RT_LOG}" + echo ".......MISSING file" + test_status='FAIL' + + elif [[ ! -f ${RTPWD}/${CNTL_DIR}_${RT_COMPILER}/${i} ]] ; then + + echo ".......MISSING baseline" >> "${RT_LOG}" + echo ".......MISSING baseline" + test_status='FAIL' + + else + if [[ ${i##*.} == nc* ]] ; then + if [[ " orion hercules hera wcoss2 acorn derecho gaea jet s4 noaacloud " =~ ${MACHINE_ID} ]]; then + printf "USING NCCMP.." >> "${RT_LOG}" + printf "USING NCCMP.." + if [[ ${CMP_DATAONLY} == false ]]; then + nccmp -d -S -q -f -g -B --Attribute=checksum --warn=format "${RTPWD}/${CNTL_DIR}_${RT_COMPILER}/${i}" "${RUNDIR}/${i}" > "${i}_nccmp.log" 2>&1 && d=$? || d=$? + else + nccmp -d -S -q -f -B --Attribute=checksum --warn=format "${RTPWD}/${CNTL_DIR}_${RT_COMPILER}/${i}" "${RUNDIR}/${i}" > "${i}_nccmp.log" 2>&1 && d=$? || d=$? + fi + if [[ ${d} -ne 0 && ${d} -ne 1 ]]; then + printf "....ERROR" >> "${RT_LOG}" + printf "....ERROR" + test_status='FAIL' + fi + fi + else + printf "USING CMP.." >> "${RT_LOG}" + printf "USING CMP.." + cmp "${RTPWD}/${CNTL_DIR}_${RT_COMPILER}/${i}" "${RUNDIR}/${i}" >/dev/null 2>&1 && d=$? || d=$? + if [[ ${d} -eq 2 ]]; then + printf "....ERROR" >> "${RT_LOG}" + printf "....ERROR" + test_status='FAIL' + fi + + fi + + if [[ ${d} -ne 0 ]]; then + echo "....NOT IDENTICAL" >> "${RT_LOG}" + echo "....NOT IDENTICAL" + test_status='FAIL' + else + echo "....OK" >> "${RT_LOG}" + echo "....OK" + fi + + fi + + done + + else + # + # --- create baselines + # + echo;echo "Moving baseline ${TEST_ID} files ...." + echo;echo "Moving baseline ${TEST_ID} files ...." >> "${RT_LOG}" + + for i in ${LIST_FILES} ; do + printf %s " Moving ${i} ....." + printf %s " Moving ${i} ....." >> "${RT_LOG}" + if [[ -f ${RUNDIR}/${i} ]] ; then + mkdir -p "${NEW_BASELINE}/${CNTL_DIR}_${RT_COMPILER}/$(dirname "${i}")" + cp "${RUNDIR}/${i}" "${NEW_BASELINE}/${CNTL_DIR}_${RT_COMPILER}/${i}" + echo "....OK" >> "${RT_LOG}" + echo "....OK" + else + echo "....NOT OK. Missing ${RUNDIR}/${i}" >> "${RT_LOG}" + echo "....NOT OK. Missing ${RUNDIR}/${i}" + test_status='FAIL' + fi + done + fi + + { + echo + grep "The total amount of wall time" "${RUNDIR}/out" + grep "The maximum resident set size" "${RUNDIR}/out" + echo + echo "Test ${TEST_ID} ${test_status}" + echo + } >> "${RT_LOG}" + + echo "Test ${TEST_ID} ${test_status}" + echo + + if [[ ${test_status} = 'FAIL' ]]; then + echo "${TEST_ID} failed in check_result" >> "${PATHRT}/fail_test_${TEST_ID}" + write_fail_test + fi + else { echo @@ -408,7 +524,7 @@ else grep "The maximum resident set size" "${RUNDIR}/out" echo echo "Test ${TEST_ID} RUN_SUCCESS" - echo;echo;echo + echo;echo;echo } >> "${RT_LOG}" fi @@ -416,10 +532,6 @@ if [[ ${SCHEDULER} != 'none' ]]; then cat "${RUNDIR}/job_timestamp.txt" >> "${LOG_DIR}/${JBNME}_timestamp.txt" fi -if [[ ${results_okay} == YES ]]; then - remove_fail_test -fi - ################################################################################ # End test ################################################################################ diff --git a/tests/tests/control_c48.v2.sfc_timeout b/tests/tests/control_c48.v2.sfc_timeout new file mode 100644 index 0000000000..ec78b47892 --- /dev/null +++ b/tests/tests/control_c48.v2.sfc_timeout @@ -0,0 +1,4 @@ +source tests/control_c48.v2.sfc + +# Intentionally make this test fail due to wall clock timeout. Used by error-test.conf +export WLCLK=2