Skip to content

Commit

Permalink
SRE-574 Build: Hardware failure check fixes (#13249)
Browse files Browse the repository at this point in the history
The testing script must not abort before collecting the hardware
failure results.

Pass build information (BUILD_URL and STAGE_NAME) to the hardware check.

Signed-off-by: John E. Malmberg <john.e.malmberg@intel.com>
  • Loading branch information
JohnMalmberg committed Dec 15, 2023
1 parent 363a4ec commit 40dd690
Show file tree
Hide file tree
Showing 2 changed files with 34 additions and 17 deletions.
44 changes: 29 additions & 15 deletions ci/functional/test_main.sh
Original file line number Diff line number Diff line change
Expand Up @@ -16,14 +16,17 @@ test_tag="$TEST_TAG"
tnodes=$(echo "$NODELIST" | cut -d ',' -f 1-"$NODE_COUNT")
first_node=${NODELIST%%,*}

hardware_ok=false

cluster_reboot () {
# shellcheck disable=SC2029,SC2089
clush -B -S -o '-i ci_key' -l root -w "${tnodes}" reboot || true

# shellcheck disable=SC2029,SC2089
poll_cmd=( clush -B -S -o "-i ci_key" -l root -w "${tnodes}" )
poll_cmd+=( '"cat /etc/os-release"' )
reboot_timeout=900 # 15 minutes
poll_cmd+=( cat /etc/os-release )
# 20 minutes, HPE systems may take more than 15 minutes.
reboot_timeout=1200
retry_wait=10 # seconds
timeout=$((SECONDS + reboot_timeout))
while [ "$SECONDS" -lt "$timeout" ]; do
Expand All @@ -42,6 +45,8 @@ test_cluster() {
FIRST_NODE=${first_node} \
TEST_RPMS=${TEST_RPMS} \
NODELIST=${tnodes} \
BUILD_URL=\"$BUILD_URL\" \
STAGE_NAME=\"$STAGE_NAME\" \
$(cat ci/functional/test_main_prep_node.sh)"
}

Expand All @@ -50,8 +55,13 @@ clush -B -S -o '-i ci_key' -l root -w "${first_node}" \

if ! test_cluster; then
# Sometimes a cluster reboot will fix the issue so try it once.
cluster_reboot
test_cluster
if cluster_reboot; then
if test_cluster; then
hardware_ok=true
fi
fi
else
hardware_ok=true
fi

# collect the _results.xml files from test_main_prep_nodes before they
Expand Down Expand Up @@ -79,17 +89,20 @@ export DAOS_TARGET_OVERSUBSCRIBE=1
rm -rf install/lib/daos/TESTING/ftest/avocado ./*_results.xml

mkdir -p install/lib/daos/TESTING/ftest/avocado/job-results
if $TEST_RPMS; then
# shellcheck disable=SC2029
ssh -i ci_key -l jenkins "${first_node}" \
"TEST_TAG=\"$test_tag\" \
TNODES=\"$tnodes\" \
FTEST_ARG=\"${FTEST_ARG:-}\" \
WITH_VALGRIND=\"${WITH_VALGRIND:-}\" \
STAGE_NAME=\"$STAGE_NAME\" \
$(cat ci/functional/test_main_node.sh)"
else
./ftest.sh "$test_tag" "$tnodes" "$FTEST_ARG"

if "$hardware_ok"; then
if $TEST_RPMS; then
# shellcheck disable=SC2029
ssh -i ci_key -l jenkins "${first_node}" \
"TEST_TAG=\"$test_tag\" \
TNODES=\"$tnodes\" \
FTEST_ARG=\"${FTEST_ARG:-}\" \
WITH_VALGRIND=\"${WITH_VALGRIND:-}\" \
STAGE_NAME=\"$STAGE_NAME\" \
$(cat ci/functional/test_main_node.sh)"
else
./ftest.sh "$test_tag" "$tnodes" "$FTEST_ARG"
fi
fi

# Now rename the previously collected hardware test data for Jenkins
Expand All @@ -104,3 +117,4 @@ for node in ${tnodes//,/ }; do
mv "$old_name" "$new_name"
fi
done
"$hardware_ok"
7 changes: 5 additions & 2 deletions ci/functional/test_main_prep_node.sh
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,8 @@ set -eux

: "${FIRST_NODE:=}"
: "${OPERATIONS_EMAIL:=}"
: "${STAGE_NAME:=Unknown}"
: "${BUILD_URL:=Unknown}"

result=0
mail_message=''
Expand Down Expand Up @@ -38,9 +40,10 @@ function do_mail {
return
fi
# shellcheck disable=SC2059
build_info="BUILD_URL = $BUILD_URL$nl STAGE = $STAGE_NAME$nl$nl"
mail -s "Hardware check failed after reboot!" \
-r "$HOSTNAME"@intel.com "$OPERATIONS_EMAIL" \
<<< "$mail_message"
<<< "$build_info$mail_message"
set -x
}

Expand Down Expand Up @@ -242,7 +245,7 @@ if [ -e /sys/class/net/ib1 ]; then
testcases+=" </testcase>$nl"

((testruns++)) || true
testcases+=" <testcase name=\"NVMe lsblk Count Node $mynodenum\">${nl}"
testcases+=" <testcase name=\"PMEM lsblk Count Node $mynodenum\">${nl}"
if [ "$lsblk_pmem" -ne "$dimm_rcount" ]; then
lsblk_pmem_msg="Only $lsblk_pmem of $dimm_rcount PMEM devices seen."
mail_message+="$nl$lsblk_pmem_msg$nl$(lsblk)$nl"
Expand Down

0 comments on commit 40dd690

Please sign in to comment.