From 7680c37b66fb79fb505afebe91cbb3c8dddb9b86 Mon Sep 17 00:00:00 2001 From: Michael Williams Date: Mon, 17 Jul 2023 15:21:35 -0700 Subject: [PATCH 01/58] add onetouch_nhc.sh --- onetouch_nhc.sh | 170 ++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 170 insertions(+) create mode 100644 onetouch_nhc.sh diff --git a/onetouch_nhc.sh b/onetouch_nhc.sh new file mode 100644 index 0000000..fee8a1d --- /dev/null +++ b/onetouch_nhc.sh @@ -0,0 +1,170 @@ +#!/bin/bash + + +#Setup tempdir along with cleanup trap +#TEMPDIR=`mktemp -d` +#echo "Temp directory setup at '$TEMPDIR'" + +cleanup() { + exitcode=$? + #rm -rf $TEMPDIR + echo + if [ $exitcode != 0 ]; then + echo -e "\tFAILED\n" + else + echo -e "\tSUCCESS\n" + fi +} +trap "cleanup" EXIT + +print_help() { +cat << EOF +Usage: ./onetouch_nhc [-h] [-v|--version ] [-c|--config ] [-w|--working ] [-o|--output ] [-f|--force] +Runs OneTouch Azure NHC which downloads a specific version of Azure NHC, installs pre-requisites, and executes a health check. + +-h, -help, --help Display this help + +-v, -version, --version Optional version of Az NHC to download from git, defaults to latest from "main" + Can be a branch name like "main" for the latest or a full commit hash for a specific version. + +-c, -config, --config Optional path to a custom NHC config file. + If not specified the current VM SKU will be detected and the appropriate conf file will be used. + +-w, -working, --working Optional path to specify as the working directory. This is where all content will be downloaded and executed from. + If not specified it will default to the path "~/onetouch_nhc/" + +-o, -output, --output Optional path to output the health check logs to. + If not specified will be output to a file under the working directory with a name following the format "\$(hostname)-\$(date +"%Y-%m-%d_%H-%M-%S")-health.log". 
+ +-f, -force, --force If set, forces the script the redownload and reinstall everything +EOF +} + +VERSION="main" +WORKING_DIR=$(realpath -m "$HOME/onetouch_nhc/working") +CUSTOM_CONF="" +FORCE=false + +# Parse out arguments +options=$(getopt -l "help,version:,config:,working:" -o "hv:c:w:" -a -- "$@") +eval set -- "$options" +while true +do +case "$1" in +-h|--help) + print_help + exit 0 + ;; +-v|--version) + shift + VERSION="$1" + ;; +-c|--config) + shift + CUSTOM_CONF="$(realpath -e ${1//\~/$HOME})" + ;; +-w|--working) + shift + WORKING_DIR="$(realpath -m ${1//\~/$HOME})" + ;; +-f|--force) + FORCE=true + ;; +--) + shift + break;; +esac +shift +done + +# Define expected paths +AZ_NHC_DIR=$(realpath -m "$WORKING_DIR/az-nhc-$VERSION") +INSTALL_SCRIPT_PATH="$AZ_NHC_DIR/install-nhc.sh" +RUN_HEALTH_CHECKS_SCRIPT_PATH="$AZ_NHC_DIR/run-health-checks.sh" +HEALTH_LOG_FILE_PATH="$AZ_NHC_DIR/$(hostname)-$(date +"%Y-%m-%d_%H-%M-%S")-health.log" + +download_az_nhc() { + version="$1" + output_dir="$2" + + if [ -z $version ]; then + echo "A version must be provided" + exit 1 + fi + + if [ -z $output_dir ]; then + echo "An output directory must be provided" + exit 1 + fi + + if [ ! $FORCE ] && [ -d $output_dir ]; then + if [ -f "$output_dir/install-nhc.sh" ] && [ -f "$output_dir/run-health-checks.sh" ]; then + echo "Version $version of AZ NHC is already downloaded at $output_dir" + return 0 + fi + fi + + archive_url="https://github.com/Azure/azurehpc-health-checks/archive/$version.tar.gz" + mkdir -p $output_dir + wget -O - $archive_url | tar -xz --strip=1 -C $output_dir +} + +install_nhc() { + + # attempt to see if NHC is installed with all custom tests + NHC_INSTALLED=$(! 
$FORCE) + if $NHC_INSTALLED && [ -z $(which nhc) ]; then + echo "nhc is missing, reinstalling" + NHC_INSTALLED=false + fi + + if $NHC_INSTALLED && [[ $( diff --brief ./customTests /etc/nhc/scripts --exclude=lbnl_*.nhc --exclude=common.nhc | grep ".nhc" ) ]]; then + echo "Custom tests differ, reinstalling" + NHC_INSTALLED=false + fi + + if $NHC_INSTALLED; then + echo "NHC is installed with all custom tests" + else + sudo $INSTALL_SCRIPT_PATH + echo "Would install NHC" + fi +} + +run_health_checks() { + log_file_path="$1" + custom_conf="$2" + + if [ -z $log_file_path ]; then + echo "A log file path must be provided" + exit 1 + fi + + log_file_path=$(realpath -m "$log_file_path") + + if [ -z $custom_conf ]; then + # if no custom config is provided, let run-health-checks.sh auto-detect + sudo $RUN_HEALTH_CHECKS_SCRIPT_PATH $log_file_path + else + # otherwise, run it ourselves + custom_conf=$(realpath "$custom_conf") + echo "Running health checks using $custom_conf" + sudo nhc CONFFILE=$custom_conf LOGFILE=$log_file_path TIMEOUT=500 + fi + +} + + +# Download AZ NHC +download_az_nhc $VERSION $AZ_NHC_DIR 1 +echo "Finished Downloading AZ NHC" + +pushd $AZ_NHC_DIR + +# Install NHC +install_nhc +echo "Finished NHC Install" + +# Execute Health Checks +run_health_checks $HEALTH_LOG_FILE_PATH $CUSTOM_CONF +echo "Finished Running Health Checks" From 2f5f42cf66eb1b5c111f161cfd37856ed31e1b58 Mon Sep 17 00:00:00 2001 From: Michael Williams Date: Mon, 17 Jul 2023 16:27:23 -0700 Subject: [PATCH 02/58] begin adding in error detection --- onetouch_nhc.sh | 103 ++++++++++++++++++++++++------------------------ 1 file changed, 52 insertions(+), 51 deletions(-) diff --git a/onetouch_nhc.sh b/onetouch_nhc.sh index fee8a1d..82c1c10 100644 --- a/onetouch_nhc.sh +++ b/onetouch_nhc.sh @@ -1,25 +1,7 @@ #!/bin/bash - - -#Setup tempdir along with cleanup trap -#TEMPDIR=`mktemp -d` -#echo "Temp directory setup at '$TEMPDIR'" - -cleanup() { - exitcode=$? 
- #rm -rf $TEMPDIR - echo - if [ $exitcode != 0 ]; then - echo -e "\tFAILED\n" - else - echo -e "\tSUCCESS\n" - fi -} -trap "cleanup" EXIT - print_help() { cat << EOF -Usage: ./onetouch_nhc [-h] [-v|--version ] [-c|--config ] [-w|--working ] [-o|--output ] [-f|--force] +Usage: ./onetouch_nhc [-h|--help] [-v|--version ] [-c|--config ] [-w|--working ] [-o|--output ] [-f|--force] Runs OneTouch Azure NHC which downloads a specific version of Azure NHC, installs pre-requisites, and executes a health check. -h, -help, --help Display this help @@ -83,7 +65,30 @@ INSTALL_SCRIPT_PATH="$AZ_NHC_DIR/install-nhc.sh" RUN_HEALTH_CHECKS_SCRIPT_PATH="$AZ_NHC_DIR/run-health-checks.sh" HEALTH_LOG_FILE_PATH="$AZ_NHC_DIR/$(hostname)-$(date +"%Y-%m-%d_%H-%M-%S")-health.log" -download_az_nhc() { +install_nhc() { + force="$1" + + # attempt to see if NHC is installed with all custom tests + NHC_INSTALLED=$(! $force) + if $NHC_INSTALLED && [ -z $(which nhc) ]; then + echo "nhc is missing, reinstalling" + NHC_INSTALLED=false + fi + + if $NHC_INSTALLED && [[ $( diff --brief ./customTests /etc/nhc/scripts --exclude=lbnl_*.nhc --exclude=common.nhc | grep ".nhc" ) ]]; then + echo "Custom tests differ, reinstalling" + NHC_INSTALLED=false + fi + + if $NHC_INSTALLED; then + echo "NHC is installed with all custom tests" + else + echo "Installing NHC" + sudo $INSTALL_SCRIPT_PATH + fi +} + +setup_nhc() { version="$1" output_dir="$2" @@ -97,9 +102,11 @@ download_az_nhc() { exit 1 fi - if [ ! 
$FORCE ] && [ -d $output_dir ]; then + if [ $FORCE ] && [ -d $output_dir ]; then if [ -f "$output_dir/install-nhc.sh" ] && [ -f "$output_dir/run-health-checks.sh" ]; then echo "Version $version of AZ NHC is already downloaded at $output_dir" + pushd $output_dir > /dev/null + install_nhc $FORCE return 0 fi fi @@ -107,30 +114,13 @@ download_az_nhc() { archive_url="https://github.com/Azure/azurehpc-health-checks/archive/$version.tar.gz" mkdir -p $output_dir wget -O - $archive_url | tar -xz --strip=1 -C $output_dir -} - -install_nhc() { - - # attempt to see if NHC is installed with all custom tests - NHC_INSTALLED=$(! $FORCE) - if $NHC_INSTALLED && [ -z $(which nhc) ]; then - echo "nhc is missing, reinstalling" - NHC_INSTALLED=false - fi - - if $NHC_INSTALLED && [[ $( diff --brief ./customTests /etc/nhc/scripts --exclude=lbnl_*.nhc --exclude=common.nhc | grep ".nhc" ) ]]; then - echo "Custom tests differ, reinstalling" - NHC_INSTALLED=false - fi - if $NHC_INSTALLED; then - echo "NHC is installed with all custom tests" - else - sudo $INSTALL_SCRIPT_PATH - echo "Would install NHC" - fi + # If we had to download, force re-install + pushd $output_dir > /dev/null + install_nhc true } + run_health_checks() { log_file_path="$1" custom_conf="$2" @@ -156,15 +146,26 @@ run_health_checks() { # Download AZ NHC -download_az_nhc $VERSION $AZ_NHC_DIR 1 -echo "Finished Downloading AZ NHC" - -pushd $AZ_NHC_DIR - -# Install NHC -install_nhc -echo "Finished NHC Install" +echo +setup_nhc $VERSION $AZ_NHC_DIR 1 +echo "=== Finished Setting up AZ NHC ===" # Execute Health Checks +echo run_health_checks $HEALTH_LOG_FILE_PATH $CUSTOM_CONF -echo "Finished Running Health Checks" +echo "=== Finished Running Health Checks ===" +results=$(cat $HEALTH_LOG_FILE_PATH) + +echo +echo "=== Overall Results ($HEALTH_LOG_FILE_PATH) ===" +echo "$results" + +echo +echo "=== Detected Errors (if any) ===" +if grep "ERROR" $HEALTH_LOG_FILE_PATH; then + echo "Errors found!" 
+ exit 1 +else + echo "No errors found!" + exit 0 +fi \ No newline at end of file From 4a2f7392f57a767ac55b2af3ecdba639ac37117e Mon Sep 17 00:00:00 2001 From: Michael Williams Date: Mon, 17 Jul 2023 17:01:07 -0700 Subject: [PATCH 03/58] logging --- onetouch_nhc.sh | 103 +++++++++++++++++++++++++++++++++++++----------- 1 file changed, 79 insertions(+), 24 deletions(-) diff --git a/onetouch_nhc.sh b/onetouch_nhc.sh index 82c1c10..574669f 100644 --- a/onetouch_nhc.sh +++ b/onetouch_nhc.sh @@ -19,13 +19,24 @@ Runs OneTouch Azure NHC which downloads a specific version of Azure NHC, install If not specified will be output to a file under the working directory with a name following the format "\$(hostname)-\$(date +"%Y-%m-%d_%H-%M-%S")-health.log". -f, -force, --force If set, forces the script the redownload and reinstall everything + +-e, -errors, --errors If set, forces the script to only print out errors + +-V, -verbose, --verbose If set, forces the script to only print out errors EOF } +# Logging +LOG_VERBOSE=0 +LOG_INFO=1 +LOG_ERROR=2 + +# Arguments VERSION="main" WORKING_DIR=$(realpath -m "$HOME/onetouch_nhc/working") CUSTOM_CONF="" FORCE=false +LOG_LEVEL=$LOG_INFO # Parse out arguments options=$(getopt -l "help,version:,config:,working:" -o "hv:c:w:" -a -- "$@") @@ -52,6 +63,12 @@ case "$1" in -f|--force) FORCE=true ;; +-e|--errors) + ERRORS_ONLY=$LOG_ERROR + ;; +-V|--verbose) + ERRORS_ONLY=$LOG_VERBOSE + ;; --) shift break;; @@ -65,26 +82,65 @@ INSTALL_SCRIPT_PATH="$AZ_NHC_DIR/install-nhc.sh" RUN_HEALTH_CHECKS_SCRIPT_PATH="$AZ_NHC_DIR/run-health-checks.sh" HEALTH_LOG_FILE_PATH="$AZ_NHC_DIR/$(hostname)-$(date +"%Y-%m-%d_%H-%M-%S")-health.log" +log() { + host=$(hostname) + ts=$(date +"%Y-%m-%d %H:%M:%S") + level=$1 + + if [ -z $level ]; then + echo "A log level must be defined" + exit 1 + fi + + msg="$2" + + if [ $level -ge $LOG_LEVEL ]; then + echo "[$ts][$host] $msg" + fi +} + +log_verbose() { + if [ $# == 0 ]; then + log $LOG_VERBOSE < /dev/stdin + else + log 
$LOG_VERBOSE "$1" + fi +} +log_info() { + if [ $# == 0 ]; then + log $LOG_INFO < /dev/stdin + else + log $LOG_INFO "$1" + fi +} +log_error() { + if [ $# == 0 ]; then + log $LOG_ERROR < /dev/stdin + else + log $LOG_ERROR "$1" + fi +} + install_nhc() { force="$1" # attempt to see if NHC is installed with all custom tests NHC_INSTALLED=$(! $force) if $NHC_INSTALLED && [ -z $(which nhc) ]; then - echo "nhc is missing, reinstalling" + log_info "nhc is missing, reinstalling" NHC_INSTALLED=false fi if $NHC_INSTALLED && [[ $( diff --brief ./customTests /etc/nhc/scripts --exclude=lbnl_*.nhc --exclude=common.nhc | grep ".nhc" ) ]]; then - echo "Custom tests differ, reinstalling" + log_info "Custom tests differ, reinstalling" NHC_INSTALLED=false fi if $NHC_INSTALLED; then - echo "NHC is installed with all custom tests" + log_info "NHC is installed with all custom tests" else - echo "Installing NHC" - sudo $INSTALL_SCRIPT_PATH + log_info "Installing NHC" + log_verbose < sudo $INSTALL_SCRIPT_PATH fi } @@ -93,18 +149,18 @@ setup_nhc() { output_dir="$2" if [ -z $version ]; then - echo "A version must be provided" + log_error "A version must be provided" exit 1 fi if [ -z $output_dir ]; then - echo "An output directory must be provided" + log_info "An output directory must be provided" exit 1 fi if [ $FORCE ] && [ -d $output_dir ]; then if [ -f "$output_dir/install-nhc.sh" ] && [ -f "$output_dir/run-health-checks.sh" ]; then - echo "Version $version of AZ NHC is already downloaded at $output_dir" + log_info "Version $version of AZ NHC is already downloaded at $output_dir" pushd $output_dir > /dev/null install_nhc $FORCE return 0 @@ -112,8 +168,8 @@ setup_nhc() { fi archive_url="https://github.com/Azure/azurehpc-health-checks/archive/$version.tar.gz" - mkdir -p $output_dir - wget -O - $archive_url | tar -xz --strip=1 -C $output_dir + log_verbose $(mkdir -p $output_dir) + log_verbose $(wget -O - $archive_url | tar -xz --strip=1 -C $output_dir) # If we had to download, force 
re-install pushd $output_dir > /dev/null @@ -126,7 +182,7 @@ run_health_checks() { custom_conf="$2" if [ -z $log_file_path ]; then - echo "A log file path must be provided" + log_error "A log file path must be provided" exit 1 fi @@ -134,38 +190,37 @@ run_health_checks() { if [ -z $custom_conf ]; then # if no custom config is provided, let run-health-checks.sh auto-detect - sudo $RUN_HEALTH_CHECKS_SCRIPT_PATH $log_file_path + log_info $(sudo $RUN_HEALTH_CHECKS_SCRIPT_PATH $log_file_path) else # otherwise, run it ourselves custom_conf=$(realpath "$custom_conf") - echo "Running health checks using $custom_conf" - sudo nhc CONFFILE=$custom_conf LOGFILE=$log_file_path TIMEOUT=500 + log_info "Running health checks using $custom_conf" + log_info $(sudo nhc CONFFILE=$custom_conf LOGFILE=$log_file_path TIMEOUT=500) fi } - # Download AZ NHC -echo +log_info "Test info" + setup_nhc $VERSION $AZ_NHC_DIR 1 -echo "=== Finished Setting up AZ NHC ===" +log_info "=== Finished Setting up AZ NHC ===" # Execute Health Checks -echo run_health_checks $HEALTH_LOG_FILE_PATH $CUSTOM_CONF -echo "=== Finished Running Health Checks ===" +log_info "=== Finished Running Health Checks ===" results=$(cat $HEALTH_LOG_FILE_PATH) -echo -echo "=== Overall Results ($HEALTH_LOG_FILE_PATH) ===" -echo "$results" +log_info "=== Overall Results ($HEALTH_LOG_FILE_PATH) ===" +log_info "$results" echo echo "=== Detected Errors (if any) ===" if grep "ERROR" $HEALTH_LOG_FILE_PATH; then - echo "Errors found!" + log_error $(grep "ERROR" $HEALTH_LOG_FILE_PATH) + log_error "Errors found!" exit 1 else - echo "No errors found!" + log_error "No errors found!" 
exit 0 fi \ No newline at end of file From fd4e0cb9221ef873f0fdce703baa1f4e33d02a32 Mon Sep 17 00:00:00 2001 From: Michael Williams Date: Tue, 18 Jul 2023 15:31:35 -0700 Subject: [PATCH 04/58] simplify --- onetouch_nhc.sh | 127 +++++++++++++++++------------------------------- 1 file changed, 44 insertions(+), 83 deletions(-) diff --git a/onetouch_nhc.sh b/onetouch_nhc.sh index 574669f..72551f4 100644 --- a/onetouch_nhc.sh +++ b/onetouch_nhc.sh @@ -19,27 +19,23 @@ Runs OneTouch Azure NHC which downloads a specific version of Azure NHC, install If not specified will be output to a file under the working directory with a name following the format "\$(hostname)-\$(date +"%Y-%m-%d_%H-%M-%S")-health.log". -f, -force, --force If set, forces the script the redownload and reinstall everything - --e, -errors, --errors If set, forces the script to only print out errors - --V, -verbose, --verbose If set, forces the script to only print out errors EOF } -# Logging -LOG_VERBOSE=0 -LOG_INFO=1 -LOG_ERROR=2 - # Arguments VERSION="main" WORKING_DIR=$(realpath -m "$HOME/onetouch_nhc/working") CUSTOM_CONF="" FORCE=false -LOG_LEVEL=$LOG_INFO # Parse out arguments -options=$(getopt -l "help,version:,config:,working:" -o "hv:c:w:" -a -- "$@") +options=$(getopt -l "help,version:,config:,working:force" -o "hv:c:w:f" -a -- "$@") + +if [ $? 
-ne 0 ]; then + print_help + exit 1 +fi + eval set -- "$options" while true do @@ -63,12 +59,6 @@ case "$1" in -f|--force) FORCE=true ;; --e|--errors) - ERRORS_ONLY=$LOG_ERROR - ;; --V|--verbose) - ERRORS_ONLY=$LOG_VERBOSE - ;; --) shift break;; @@ -82,65 +72,30 @@ INSTALL_SCRIPT_PATH="$AZ_NHC_DIR/install-nhc.sh" RUN_HEALTH_CHECKS_SCRIPT_PATH="$AZ_NHC_DIR/run-health-checks.sh" HEALTH_LOG_FILE_PATH="$AZ_NHC_DIR/$(hostname)-$(date +"%Y-%m-%d_%H-%M-%S")-health.log" -log() { - host=$(hostname) - ts=$(date +"%Y-%m-%d %H:%M:%S") - level=$1 - - if [ -z $level ]; then - echo "A log level must be defined" - exit 1 - fi - - msg="$2" - - if [ $level -ge $LOG_LEVEL ]; then - echo "[$ts][$host] $msg" - fi -} - -log_verbose() { - if [ $# == 0 ]; then - log $LOG_VERBOSE < /dev/stdin - else - log $LOG_VERBOSE "$1" - fi -} -log_info() { - if [ $# == 0 ]; then - log $LOG_INFO < /dev/stdin - else - log $LOG_INFO "$1" - fi -} -log_error() { - if [ $# == 0 ]; then - log $LOG_ERROR < /dev/stdin - else - log $LOG_ERROR "$1" - fi -} - install_nhc() { - force="$1" + force=$1 # attempt to see if NHC is installed with all custom tests - NHC_INSTALLED=$(! 
$force) + NHC_INSTALLED=true + if $force; then + NHC_INSTALLED=false + fi + if $NHC_INSTALLED && [ -z $(which nhc) ]; then - log_info "nhc is missing, reinstalling" + echo "nhc is missing, reinstalling" NHC_INSTALLED=false fi if $NHC_INSTALLED && [[ $( diff --brief ./customTests /etc/nhc/scripts --exclude=lbnl_*.nhc --exclude=common.nhc | grep ".nhc" ) ]]; then - log_info "Custom tests differ, reinstalling" + echo "Custom tests differ, reinstalling" NHC_INSTALLED=false fi if $NHC_INSTALLED; then - log_info "NHC is installed with all custom tests" + echo "NHC is installed with all custom tests" else - log_info "Installing NHC" - log_verbose < sudo $INSTALL_SCRIPT_PATH + echo "Installing NHC" + sudo $INSTALL_SCRIPT_PATH fi } @@ -149,18 +104,18 @@ setup_nhc() { output_dir="$2" if [ -z $version ]; then - log_error "A version must be provided" + echo "A version must be provided" exit 1 fi if [ -z $output_dir ]; then - log_info "An output directory must be provided" + echo "An output directory must be provided" exit 1 fi - - if [ $FORCE ] && [ -d $output_dir ]; then + + if ! $FORCE && [ -d $output_dir ]; then if [ -f "$output_dir/install-nhc.sh" ] && [ -f "$output_dir/run-health-checks.sh" ]; then - log_info "Version $version of AZ NHC is already downloaded at $output_dir" + echo "Version $version of AZ NHC is already downloaded at $output_dir" pushd $output_dir > /dev/null install_nhc $FORCE return 0 @@ -168,8 +123,14 @@ setup_nhc() { fi archive_url="https://github.com/Azure/azurehpc-health-checks/archive/$version.tar.gz" - log_verbose $(mkdir -p $output_dir) - log_verbose $(wget -O - $archive_url | tar -xz --strip=1 -C $output_dir) + + mkdir -p $output_dir > /dev/null + wget -q -O - $archive_url | tar -xz --strip=1 -C $output_dir + + if [ $? 
-ne 0 ]; then + echo "Failed to download and unpack archive from $archive_url" + exit 1 + fi # If we had to download, force re-install pushd $output_dir > /dev/null @@ -182,7 +143,7 @@ run_health_checks() { custom_conf="$2" if [ -z $log_file_path ]; then - log_error "A log file path must be provided" + echo "A log file path must be provided" exit 1 fi @@ -190,37 +151,37 @@ run_health_checks() { if [ -z $custom_conf ]; then # if no custom config is provided, let run-health-checks.sh auto-detect - log_info $(sudo $RUN_HEALTH_CHECKS_SCRIPT_PATH $log_file_path) + sudo $RUN_HEALTH_CHECKS_SCRIPT_PATH $log_file_path else # otherwise, run it ourselves custom_conf=$(realpath "$custom_conf") - log_info "Running health checks using $custom_conf" - log_info $(sudo nhc CONFFILE=$custom_conf LOGFILE=$log_file_path TIMEOUT=500) + echo "Running health checks using $custom_conf" + sudo nhc CONFFILE=$custom_conf LOGFILE=$log_file_path TIMEOUT=500 fi } # Download AZ NHC -log_info "Test info" - setup_nhc $VERSION $AZ_NHC_DIR 1 -log_info "=== Finished Setting up AZ NHC ===" +echo "=== Finished Setting up AZ NHC ===" # Execute Health Checks +echo run_health_checks $HEALTH_LOG_FILE_PATH $CUSTOM_CONF -log_info "=== Finished Running Health Checks ===" +echo "=== Finished Running Health Checks ===" results=$(cat $HEALTH_LOG_FILE_PATH) -log_info "=== Overall Results ($HEALTH_LOG_FILE_PATH) ===" -log_info "$results" +echo +echo "=== Overall Results ($HEALTH_LOG_FILE_PATH) ===" +echo "$results" echo echo "=== Detected Errors (if any) ===" if grep "ERROR" $HEALTH_LOG_FILE_PATH; then - log_error $(grep "ERROR" $HEALTH_LOG_FILE_PATH) - log_error "Errors found!" + echo $(grep "ERROR" $HEALTH_LOG_FILE_PATH) + echo "Errors found!" exit 1 else - log_error "No errors found!" + echo "No errors found!" 
exit 0 fi \ No newline at end of file From 8dbeabf3262a31e773d4be2faefe103d06abf655 Mon Sep 17 00:00:00 2001 From: Michael Williams Date: Tue, 18 Jul 2023 17:32:35 -0700 Subject: [PATCH 05/58] add distributed_nhc slurm and pssh --- distributed_nhc.sb.sh | 62 ++++++++++++++++++++++++++++++++++++++++++ distributed_nhc.sbatch | 8 ++++++ 2 files changed, 70 insertions(+) create mode 100644 distributed_nhc.sb.sh create mode 100644 distributed_nhc.sbatch diff --git a/distributed_nhc.sb.sh b/distributed_nhc.sb.sh new file mode 100644 index 0000000..fe1163f --- /dev/null +++ b/distributed_nhc.sb.sh @@ -0,0 +1,62 @@ +#!/bin/bash +#SBATCH --nodes=1 +#SBATCH --exclusive +#SBATCH --job-name distributed_nhc +#SBATCH --error="logs/%x-%j.err" +#SBATCH --output="logs/%x-%j.out" +#SBATCH --time 00:15:00 + +# Running with SLURM +if [ -n "$SLURM_JOB_NAME" ] && [ "$SLURM_JOB_NAME" != "interactive" ]; then + srun ./onetouch_nhc.sh | grep "NHC-RESULT" | sed 's/NHC-RESULT //g' | sort + exit 0 +fi + +# Running with Parallel SSH +print_help() { +cat << EOF +Usage: ./distributed_nhc.sb.sh [-h|--help] [--nodefile ] +Run Azure NHC distributed onto the specified set of nodes and collects the results. Script can also be ran directly with sbatch. Running it as a shell script will use parallel-ssh + +-h, -help, --help Display this help + +-F --nodefile File contains a list of hostnames to connect to and run NHC on. Similar to slurm's sbatch -F/--nodefile argument +EOF +} + +# Arguments +NODEFILE="" + +# Parse out arguments +options=$(getopt -l "help,nodefile:" -o "hF:" -a -- "$@") +if [ $? 
-ne 0 ]; then + print_help + exit 1 +fi + +eval set -- "$options" +while true +do +case "$1" in +-h|--help) + print_help + exit 0 + ;; +-F|--nodefile) + shift + NODEFILE="$1" + ;; +--) + shift + break;; +esac +shift +done + +output_path="logs/distributed_nhc-pssh-$(date +"%Y-%m-%d_%H-%M-%S").out" + +timeout=900 # 15 minute timeout +onetouch_nhc_path=$(realpath "./onetouch_nhc.sh") + +output=$(parallel-ssh -P -t $timeout -h $NODEFILE $onetouch_nhc_path) +echo "$output" | grep "NHC-RESULT" | sed 's/NHC-RESULT //g' | sed "s/.*: //g" | sort > $output_path \ No newline at end of file diff --git a/distributed_nhc.sbatch b/distributed_nhc.sbatch new file mode 100644 index 0000000..4f26e93 --- /dev/null +++ b/distributed_nhc.sbatch @@ -0,0 +1,8 @@ +#!/bin/bash +#SBATCH --nodes=2 +#SBATCH --exclusive +#SBATCH --job-name distributed_nhc +#SBATCH --error="logs/%x-%j.err" +#SBATCH --output="logs/%x-%j.out" +#SBATCH --time 00:15:00 +srun ./onetouch_nhc.sh | grep "NHC-RESULT" | sed 's/NHC-RESULT //g' | sort \ No newline at end of file From 336f9d7c976e17828f5b847c06f4ba153ce6fa56 Mon Sep 17 00:00:00 2001 From: Michael Williams Date: Tue, 18 Jul 2023 17:32:50 -0700 Subject: [PATCH 06/58] updates --- onetouch_nhc.sh | 11 ++--------- 1 file changed, 2 insertions(+), 9 deletions(-) diff --git a/onetouch_nhc.sh b/onetouch_nhc.sh index 72551f4..04cf28f 100644 --- a/onetouch_nhc.sh +++ b/onetouch_nhc.sh @@ -171,17 +171,10 @@ run_health_checks $HEALTH_LOG_FILE_PATH $CUSTOM_CONF echo "=== Finished Running Health Checks ===" results=$(cat $HEALTH_LOG_FILE_PATH) -echo -echo "=== Overall Results ($HEALTH_LOG_FILE_PATH) ===" -echo "$results" - echo echo "=== Detected Errors (if any) ===" if grep "ERROR" $HEALTH_LOG_FILE_PATH; then - echo $(grep "ERROR" $HEALTH_LOG_FILE_PATH) - echo "Errors found!" - exit 1 + grep "ERROR" $HEALTH_LOG_FILE_PATH | while read line; do echo "[NHC-RESULT][$(hostname)] $line"; done else - echo "No errors found!" 
- exit 0 + echo "[NHC-RESULT][$(hostname)] Healthy" fi \ No newline at end of file From f018606621622f0e70ac0e8d21720a584caff874 Mon Sep 17 00:00:00 2001 From: Michael Williams Date: Wed, 19 Jul 2023 15:01:43 -0700 Subject: [PATCH 07/58] leave logs on node --- onetouch_nhc.sh | 44 +++++++++++++++++++++++++++++++++++--------- 1 file changed, 35 insertions(+), 9 deletions(-) diff --git a/onetouch_nhc.sh b/onetouch_nhc.sh index 04cf28f..a4317d6 100644 --- a/onetouch_nhc.sh +++ b/onetouch_nhc.sh @@ -1,7 +1,8 @@ #!/bin/bash + print_help() { cat << EOF -Usage: ./onetouch_nhc [-h|--help] [-v|--version ] [-c|--config ] [-w|--working ] [-o|--output ] [-f|--force] +Usage: ./onetouch_nhc [-h|--help] [-v|--version ] [-c|--config ] [-w|--working ] [-o|--output ] [-n|--name ] [-f|--force] Runs OneTouch Azure NHC which downloads a specific version of Azure NHC, installs pre-requisites, and executes a health check. -h, -help, --help Display this help @@ -15,8 +16,11 @@ Runs OneTouch Azure NHC which downloads a specific version of Azure NHC, install -w, -working, --working Optional path to specify as the working directory. This is where all content will be downloaded and executed from. If not specified it will default to the path "~/onetouch_nhc/" --o, -output, --output Optional path to output the health check logs to. - If not specified will be output to a file under the working directory with a name following the format "\$(hostname)-\$(date +"%Y-%m-%d_%H-%M-%S")-health.log". +-o, -output, --output Optional directory path to output the health check, stdout, and stderr logs to. + If not specified it will use the same as the working directory". + +-n, -name, --name Optional name to provide for a given execution run. This impacts the names of the log files generated. + If not specified the job name will be generated with "\$(hostname)-\$(date +"%Y-%m-%d_%H-%M-%S")". 
-f, -force, --force If set, forces the script the redownload and reinstall everything EOF @@ -25,11 +29,13 @@ EOF # Arguments VERSION="main" WORKING_DIR=$(realpath -m "$HOME/onetouch_nhc/working") +OUTPUT_DIR=$WORKING_DIR +JOB_NAME="$(hostname)-$(date +"%Y-%m-%d_%H-%M-%S")" CUSTOM_CONF="" FORCE=false # Parse out arguments -options=$(getopt -l "help,version:,config:,working:force" -o "hv:c:w:f" -a -- "$@") +options=$(getopt -l "help,version:,config:,working:,output:,name:force" -o "hv:c:w:o:n:f" -a -- "$@") if [ $? -ne 0 ]; then print_help @@ -56,6 +62,14 @@ case "$1" in shift WORKING_DIR="$(realpath -m ${1//\~/$HOME})" ;; +-o|--output) + shift + OUTPUT_DIR="$(realpath -m ${1//\~/$HOME})" + ;; +-n|--name) + shift + JOB_NAME="$1" + ;; -f|--force) FORCE=true ;; @@ -70,7 +84,14 @@ done AZ_NHC_DIR=$(realpath -m "$WORKING_DIR/az-nhc-$VERSION") INSTALL_SCRIPT_PATH="$AZ_NHC_DIR/install-nhc.sh" RUN_HEALTH_CHECKS_SCRIPT_PATH="$AZ_NHC_DIR/run-health-checks.sh" -HEALTH_LOG_FILE_PATH="$AZ_NHC_DIR/$(hostname)-$(date +"%Y-%m-%d_%H-%M-%S")-health.log" + +OUT_LOG_FILE_PATH="$OUTPUT_DIR/$JOB_NAME.out" +ERR_LOG_FILE_PATH="$OUTPUT_DIR/$JOB_NAME.err" +HEALTH_LOG_FILE_PATH="$OUTPUT_DIR/$JOB_NAME-health.log" + +# Setup redirection for the rest of the script +mkdir -p $OUTPUT_DIR +exec > >(tee $OUT_LOG_FILE_PATH) 2> >(tee $ERR_LOG_FILE_PATH >&2) install_nhc() { force=$1 @@ -162,6 +183,7 @@ run_health_checks() { } # Download AZ NHC +echo "Running OneTouch NHC with Job Name $JOB_NAME" setup_nhc $VERSION $AZ_NHC_DIR 1 echo "=== Finished Setting up AZ NHC ===" @@ -169,12 +191,16 @@ echo "=== Finished Setting up AZ NHC ===" echo run_health_checks $HEALTH_LOG_FILE_PATH $CUSTOM_CONF echo "=== Finished Running Health Checks ===" -results=$(cat $HEALTH_LOG_FILE_PATH) + +echo +echo "=== Overall Results ($HEALTH_LOG_FILE_PATH) ===" +cat $HEALTH_LOG_FILE_PATH echo echo "=== Detected Errors (if any) ===" -if grep "ERROR" $HEALTH_LOG_FILE_PATH; then - grep "ERROR" $HEALTH_LOG_FILE_PATH | while read 
line; do echo "[NHC-RESULT][$(hostname)] $line"; done +errors=$(grep "ERROR" $HEALTH_LOG_FILE_PATH) +if [ -n "$errors" ]; then + echo $errors | while read line; do echo "NHC-RESULT $(hostname) | $line"; done else - echo "[NHC-RESULT][$(hostname)] Healthy" + echo "NHC-RESULT $(hostname) | Healthy" fi \ No newline at end of file From 690141bb82ab786c6da592543b153b7b0e52ff94 Mon Sep 17 00:00:00 2001 From: Michael Williams Date: Wed, 19 Jul 2023 17:24:29 -0700 Subject: [PATCH 08/58] leave full execution logs on nodes, adding argument supoprt, expanding slurm node list, working on detecting missing node results --- distributed_nhc.sb.sh | 203 ++++++++++++++++++++++++++++++++++-------- onetouch_nhc.sh | 4 +- 2 files changed, 167 insertions(+), 40 deletions(-) diff --git a/distributed_nhc.sb.sh b/distributed_nhc.sb.sh index fe1163f..6c55bce 100644 --- a/distributed_nhc.sb.sh +++ b/distributed_nhc.sb.sh @@ -6,57 +6,184 @@ #SBATCH --output="logs/%x-%j.out" #SBATCH --time 00:15:00 -# Running with SLURM -if [ -n "$SLURM_JOB_NAME" ] && [ "$SLURM_JOB_NAME" != "interactive" ]; then - srun ./onetouch_nhc.sh | grep "NHC-RESULT" | sed 's/NHC-RESULT //g' | sort - exit 0 -fi - -# Running with Parallel SSH print_help() { cat << EOF -Usage: ./distributed_nhc.sb.sh [-h|--help] [--nodefile ] +Usage: ./distributed_nhc.sb.sh [-h|--help] [-F|--nodefile ] [-F|--nodefile ] Run Azure NHC distributed onto the specified set of nodes and collects the results. Script can also be ran directly with sbatch. Running it as a shell script will use parallel-ssh +Example Usage: + sbatch -N4 ./distributed_nhc.sb.sh + ./distributed_nhc.sb.sh -F ./my_node_file + ./distributed_nhc.sb.sh -w node1,node2,node3 + ./distributed_nhc.sb.sh -F ./my_node_file -w additonal_node1,additional_node2 + -h, -help, --help Display this help -F --nodefile File contains a list of hostnames to connect to and run NHC on. 
Similar to slurm's sbatch -F/--nodefile argument + +-w --nodelist Comma seperate list of hostnames to connect to and run NHC on. Similar to slurm's sbatch -w/--nodelist argument but does not support ranges of hosts (eg host[1-5,7,...]). + If -F/--nodefile is provided, any nodes specified with -w/--nodelist will be added to the list of hostnames to run NHC on. This does not modify the provided -F/--nodefile file. + +-v, -version, --version Optional version of Az NHC to download from git, defaults to latest from "main" + Can be a branch name like "main" for the latest or a full commit hash for a specific version. + +-c, -config, --config Optional path to a custom NHC config file. + If not specified the current VM SKU will be detected and the appropriate conf file will be used. + +-f, -force, --force If set, forces the NHC script the redownload and reinstall everything EOF } -# Arguments -NODEFILE="" +expand_nodelist() { + nodelist="$1" + # make nodelist bash "friendly" for expansion + # ie turn "aice-ndv5-iad21-[000170,000201-000203,000218-000220]" + # into "aice-ndv5-iad21-{000170,{000201..000203},{000218..000220}}" + # which bash can easily expand into + # aice-ndv5-iad21-000170 aice-ndv5-iad21-000201 aice-ndv5-iad21-000202 aice-ndv5-iad21-000203 aice-ndv5-iad21-000218 aice-ndv5-iad21-000219 aice-ndv5-iad21-000220 -# Parse out arguments -options=$(getopt -l "help,nodefile:" -o "hF:" -a -- "$@") -if [ $? -ne 0 ]; then - print_help - exit 1 -fi + # converts "aice-ndv5-iad21-[000170,000201-000203,000218-000220]" + # into "aice-ndv5-iad21- [000170,000201-000203,000218-000220]" + # which we can then stick into an array. 
If we have 1 element, there were no ranges + # otherwise, expand the ranges and rebuild the node names + host_num_split=( $( echo $nodelist | sed -r "s/(.*)(\[.*\]).*/\1 \2/" ) ) -eval set -- "$options" -while true -do -case "$1" in --h|--help) - print_help - exit 0 - ;; --F|--nodefile) - shift - NODEFILE="$1" - ;; ---) + if [ ${#host_num_split[@]} -eq 1 ]; then + echo ${host_num_split[0]} + return + fi + + nodenumbers=${host_num_split[1]} + bash_friendly_ranges=$(echo $nodenumbers | sed -r -e 's:[[](.*)[]]:{\1}:' -e 's:([0-9]+)[-]([0-9]+):{\1..\2}:g') + bash_friendly_node_range="${host_num_split[0]}$bash_friendly_ranges" + eval echo $bash_friendly_node_range | tr -d '{}' +} + +RAW_OUTPUT="" +HEALTH_LOG_FILE_PATH="" + +# Running with SLURM +if [ -n "$SLURM_JOB_NAME" ] && [ "$SLURM_JOB_NAME" != "interactive" ]; then + #srun ./onetouch_nhc.sh | grep "NHC-RESULT" | sed 's/\s*NHC-RESULT\s*//g' | sort + #exit 0 + NHC_JOB_NAME="$SLURM_JOB_NAME-$SLURM_JOB_ID" + HEALTH_LOG_FILE_PATH="logs/$NHC_JOB_NAME.health.log" + NODELIST_ARR=( $(expand_nodelist $SLURM_NODELIST) ) + { RAW_OUTPUT=$(srun ./onetouch_nhc.sh -n $NHC_JOB_NAME $@ | tee /dev/fd/3 ); } 3>&1 +else + # Running with Parallel SSH + # Arguments + NODEFILE="" + NODELIST="" + GIT_VERSION="" + CUSTOM_CONF="" + FORCE=false + + # Parse out arguments + options=$(getopt -l "help,nodefile:,nodelist:,version:,config:,force" -o "hF:w:v:c:f" -a -- "$@") + if [ $? 
-ne 0 ]; then + print_help + exit 1 + fi + + eval set -- "$options" + while true + do + case "$1" in + -h|--help) + print_help + exit 0 + ;; + -F|--nodefile) + shift + NODEFILE="$1" + ;; + -w|--nodelist) + shift + NODELIST="$1" + ;; + -v|--version) + shift + GIT_VERSION="$1" + ;; + -c|--config) + shift + CUSTOM_CONF="$(realpath -e ${1//\~/$HOME})" + ;; + -f|--force) + FORCE=true + ;; + --) + shift + break;; + esac shift - break;; -esac -shift -done + done + + # Parse out nodes + NODELIST_ARR=() + + if [ -f "$NODEFILE" ]; then + mapfile -t NODELIST_ARR < $NODEFILE + fi + + if [ -n "$NODELIST" ]; then + NODELIST_ARR+=( $(echo $NODELIST | sed "s/,/ /g") ) + fi + + if [ ${#NODELIST_ARR[@]} -eq 0 ]; then + echo "No nodes provided, must provide at least one node either from a file with -F/--nodefile or as a comma seperate list with -w/--nodelist" + echo + print_help + exit 1 + fi + + # Make unique and sorted + NODELIST_ARR=( $(echo "${NODELIST_ARR[@]}" | tr ' ' '\n' | sort -u | tr '\n' ' ') ) + + # Log file paths + jobname="distributed_nhc-pssh-$(date +'%Y-%m-%d_%H-%M-%S')" + HEALTH_LOG_FILE_PATH="logs/$jobname.health.log" + output_path="logs/$jobname.out" + error_path="logs/$jobname.err" + + # Pssh args + timeout=900 # 15 minute timeout + onetouch_nhc_path=$(realpath "./onetouch_nhc.sh") + + pssh_host_args=() + for node in "${NODELIST_ARR[@]}"; do + pssh_host_args+="-H $node " + done + + nhc_args=() + if [ -n "$GIT_VERSION" ]; then + nhc_args+=("-v" "$GIT_VERSION") + fi + + if [ -n "$CUSTOM_CONF" ]; then + nhc_args+=("-c" "$CUSTOM_CONF") + fi + + if $FORCE ; then + nhc_args+=("-f") + fi + + + #output=$(parallel-ssh -P -t $timeout -h $NODEFILE $onetouch_nhc_path) + #echo "$output" | grep "NHC-RESULT" | sed 's/\s*NHC-RESULT\s*//g' | sed "s/.*:\s+//g" | sort > $output_path + + echo "Running Parallel SSH Distributed NHC on:" + echo "${NODELIST_ARR[@]}" | tr ' ' '\n' + echo "======================" + parallel-ssh -P -t $timeout ${pssh_host_args[@]} $onetouch_nhc_path 
${nhc_args[@]} 2> $error_path | tee $output_path + RAW_OUTPUT=$(cat $output_path) +fi -output_path="logs/distributed_nhc-pssh-$(date +"%Y-%m-%d_%H-%M-%S").out" +# Filter down to NHC-RESULTS +NHC_RESULTS=$(echo "$RAW_OUTPUT" | grep "NHC-RESULT" | sed 's/.*NHC-RESULT\s*//g' | sort) -timeout=900 # 15 minute timeout -onetouch_nhc_path=$(realpath "./onetouch_nhc.sh") +nodes_with_results_arr=( $( echo "$NHC_RESULTS" | sed 's/\s*|.*//g' | tr '\n' ' ' ) ) -output=$(parallel-ssh -P -t $timeout -h $NODEFILE $onetouch_nhc_path) -echo "$output" | grep "NHC-RESULT" | sed 's/NHC-RESULT //g' | sed "s/.*: //g" | sort > $output_path \ No newline at end of file +echo "$NHC_RESULTS" >> $HEALTH_LOG_FILE_PATH +echo "$nodes_with_results" >> $HEALTH_LOG_FILE_PATH \ No newline at end of file diff --git a/onetouch_nhc.sh b/onetouch_nhc.sh index a4317d6..88c89ff 100644 --- a/onetouch_nhc.sh +++ b/onetouch_nhc.sh @@ -87,7 +87,7 @@ RUN_HEALTH_CHECKS_SCRIPT_PATH="$AZ_NHC_DIR/run-health-checks.sh" OUT_LOG_FILE_PATH="$OUTPUT_DIR/$JOB_NAME.out" ERR_LOG_FILE_PATH="$OUTPUT_DIR/$JOB_NAME.err" -HEALTH_LOG_FILE_PATH="$OUTPUT_DIR/$JOB_NAME-health.log" +HEALTH_LOG_FILE_PATH="$OUTPUT_DIR/$JOB_NAME.health.log" # Setup redirection for the rest of the script mkdir -p $OUTPUT_DIR @@ -183,7 +183,7 @@ run_health_checks() { } # Download AZ NHC -echo "Running OneTouch NHC with Job Name $JOB_NAME" +echo "Running OneTouch NHC with Job Name $JOB_NAME on host $(hostname)" setup_nhc $VERSION $AZ_NHC_DIR 1 echo "=== Finished Setting up AZ NHC ===" From 64b14a7e5bfa255b8d4a63efb41cc2d883c6d716 Mon Sep 17 00:00:00 2001 From: Michael Williams Date: Wed, 19 Jul 2023 17:34:46 -0700 Subject: [PATCH 09/58] report nodes that didn't report results --- distributed_nhc.sb.sh | 18 +++++++++++------- 1 file changed, 11 insertions(+), 7 deletions(-) diff --git a/distributed_nhc.sb.sh b/distributed_nhc.sb.sh index 6c55bce..a7bd87b 100644 --- a/distributed_nhc.sb.sh +++ b/distributed_nhc.sb.sh @@ -43,10 +43,7 @@ expand_nodelist() { # 
aice-ndv5-iad21-000170 aice-ndv5-iad21-000201 aice-ndv5-iad21-000202 aice-ndv5-iad21-000203 aice-ndv5-iad21-000218 aice-ndv5-iad21-000219 aice-ndv5-iad21-000220 # converts "aice-ndv5-iad21-[000170,000201-000203,000218-000220]" - # into "aice-ndv5-iad21- [000170,000201-000203,000218-000220]" - # which we can then stick into an array. If we have 1 element, there were no ranges - # otherwise, expand the ranges and rebuild the node names - host_num_split=( $( echo $nodelist | sed -r "s/(.*)(\[.*\]).*/\1 \2/" ) ) + # into "aice-ndv5-iad21- [000170,000201-000203,000218-000220]" # which we can then stick into an array. If we have 1 element, there were no ranges # otherwise, expand the ranges and rebuild the node names host_num_split=( $( echo $nodelist | sed -r "s/(.*)(\[.*\]).*/\1 \2/" ) ) if [ ${#host_num_split[@]} -eq 1 ]; then echo ${host_num_split[0]} @@ -61,6 +58,7 @@ expand_nodelist() { RAW_OUTPUT="" HEALTH_LOG_FILE_PATH="" +NODELIST_ARR=() # Running with SLURM if [ -n "$SLURM_JOB_NAME" ] && [ "$SLURM_JOB_NAME" != "interactive" ]; then @@ -181,9 +179,15 @@ else fi # Filter down to NHC-RESULTS -NHC_RESULTS=$(echo "$RAW_OUTPUT" | grep "NHC-RESULT" | sed 's/.*NHC-RESULT\s*//g' | sort) +NHC_RESULTS=$(echo "$RAW_OUTPUT" | grep "NHC-RESULT" | sed 's/.*NHC-RESULT\s*//g') +# Identify nodes who should have reported results but didn't, these failed for some unknown reason nodes_with_results_arr=( $( echo "$NHC_RESULTS" | sed 's/\s*|.*//g' | tr '\n' ' ' ) ) +nodes_missing_results=(`echo ${NODELIST_ARR[@]} ${nodes_with_results_arr[@]} | tr ' ' '\n' | sort | uniq -u`) -echo "$NHC_RESULTS" >> $HEALTH_LOG_FILE_PATH -echo "$nodes_with_results" >> $HEALTH_LOG_FILE_PATH \ No newline at end of file +newline=$'\n' +for missing_node in "${nodes_missing_results[@]}"; do + NHC_RESULTS+="$newline$missing_node | ERROR: No results reported" +done + +echo "$NHC_RESULTS" | sort >> $HEALTH_LOG_FILE_PATH \ No newline at end of file From 1c329e92d26c91c7e03d953ad0df68189c991347 Mon Sep 17 
00:00:00 2001 From: Michael Williams Date: Wed, 19 Jul 2023 18:25:21 -0700 Subject: [PATCH 10/58] fix bug expanding nodelist --- distributed_nhc.sb.sh | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) diff --git a/distributed_nhc.sb.sh b/distributed_nhc.sb.sh index a7bd87b..4a89567 100644 --- a/distributed_nhc.sb.sh +++ b/distributed_nhc.sb.sh @@ -13,6 +13,7 @@ Run Azure NHC distributed onto the specified set of nodes and collects the resul Example Usage: sbatch -N4 ./distributed_nhc.sb.sh + sbatch -N4 ./distributed_nhc.sb.sh -c ./distributed_nhc.sb.sh -F ./my_node_file ./distributed_nhc.sb.sh -w node1,node2,node3 ./distributed_nhc.sb.sh -F ./my_node_file -w additonal_node1,additional_node2 @@ -43,8 +44,10 @@ expand_nodelist() { # aice-ndv5-iad21-000170 aice-ndv5-iad21-000201 aice-ndv5-iad21-000202 aice-ndv5-iad21-000203 aice-ndv5-iad21-000218 aice-ndv5-iad21-000219 aice-ndv5-iad21-000220 # converts "aice-ndv5-iad21-[000170,000201-000203,000218-000220]" - # into "aice-ndv5-iad21- [000170,000201-000203,000218-000220]" # which we can then stick into an array. If we have 1 element, there were no ranges # otherwise, expand the ranges and rebuild the node names host_num_split=( $( echo $nodelist | sed -r "s/(.*)(\[.*\]).*/\1 \2/" ) ) - + # into "aice-ndv5-iad21- [000170,000201-000203,000218-000220]" + # which we can then stick into an array. 
If we have 1 element, there were no ranges + # otherwise, expand the ranges and rebuild the node names + host_num_split=( $( echo $nodelist | sed -r "s/(.*)(\[.*\]).*/\1 \2/" ) ) if [ ${#host_num_split[@]} -eq 1 ]; then echo ${host_num_split[0]} return @@ -62,11 +65,9 @@ NODELIST_ARR=() # Running with SLURM if [ -n "$SLURM_JOB_NAME" ] && [ "$SLURM_JOB_NAME" != "interactive" ]; then - #srun ./onetouch_nhc.sh | grep "NHC-RESULT" | sed 's/\s*NHC-RESULT\s*//g' | sort - #exit 0 NHC_JOB_NAME="$SLURM_JOB_NAME-$SLURM_JOB_ID" HEALTH_LOG_FILE_PATH="logs/$NHC_JOB_NAME.health.log" - NODELIST_ARR=( $(expand_nodelist $SLURM_NODELIST) ) + NODELIST_ARR=( $(expand_nodelist $SLURM_JOB_NODELIST) ) { RAW_OUTPUT=$(srun ./onetouch_nhc.sh -n $NHC_JOB_NAME $@ | tee /dev/fd/3 ); } 3>&1 else # Running with Parallel SSH From 9e87e71b4cf8336ef3e62dac221671e327895b4d Mon Sep 17 00:00:00 2001 From: Michael Williams Date: Mon, 24 Jul 2023 09:31:45 -0700 Subject: [PATCH 11/58] add timestamp to slurm health output, have pssh output just the health result --- distributed_nhc.sb.sh | 8 +++--- h100_fail.conf | 57 +++++++++++++++++++++++++++++++++++++++++++ h100_pass.conf | 57 +++++++++++++++++++++++++++++++++++++++++++ 3 files changed, 118 insertions(+), 4 deletions(-) create mode 100644 h100_fail.conf create mode 100644 h100_pass.conf diff --git a/distributed_nhc.sb.sh b/distributed_nhc.sb.sh index 4a89567..c8f0502 100644 --- a/distributed_nhc.sb.sh +++ b/distributed_nhc.sb.sh @@ -65,7 +65,7 @@ NODELIST_ARR=() # Running with SLURM if [ -n "$SLURM_JOB_NAME" ] && [ "$SLURM_JOB_NAME" != "interactive" ]; then - NHC_JOB_NAME="$SLURM_JOB_NAME-$SLURM_JOB_ID" + NHC_JOB_NAME="$SLURM_JOB_NAME-$SLURM_JOB_ID-$(date +'%Y-%m-%d_%H-%M-%S')" HEALTH_LOG_FILE_PATH="logs/$NHC_JOB_NAME.health.log" NODELIST_ARR=( $(expand_nodelist $SLURM_JOB_NODELIST) ) { RAW_OUTPUT=$(srun ./onetouch_nhc.sh -n $NHC_JOB_NAME $@ | tee /dev/fd/3 ); } 3>&1 @@ -175,8 +175,7 @@ else echo "Running Parallel SSH Distributed NHC on:" echo 
"${NODELIST_ARR[@]}" | tr ' ' '\n' echo "======================" - parallel-ssh -P -t $timeout ${pssh_host_args[@]} $onetouch_nhc_path ${nhc_args[@]} 2> $error_path | tee $output_path - RAW_OUTPUT=$(cat $output_path) + RAW_OUTPUT=$(parallel-ssh -P -t $timeout ${pssh_host_args[@]} $onetouch_nhc_path ${nhc_args[@]} 3> $error_path | tee $output_path) fi # Filter down to NHC-RESULTS @@ -191,4 +190,5 @@ for missing_node in "${nodes_missing_results[@]}"; do NHC_RESULTS+="$newline$missing_node | ERROR: No results reported" done -echo "$NHC_RESULTS" | sort >> $HEALTH_LOG_FILE_PATH \ No newline at end of file +echo "$NHC_RESULTS" | sort >> $HEALTH_LOG_FILE_PATH +cat $HEALTH_LOG_FILE_PATH \ No newline at end of file diff --git a/h100_fail.conf b/h100_fail.conf new file mode 100644 index 0000000..8aff28b --- /dev/null +++ b/h100_fail.conf @@ -0,0 +1,57 @@ +# NHC Configuration File +# +# Lines are in the form "||" +# Hostmask is a glob, /regexp/, or {noderange} +# Comments begin with '#' +# +# This file was automatically generated by nhc-genconf +# Sat May 6 00:05:37 UTC 2023 +# + + +####################################################################### +### +### Hardware checks +### + * || check_hw_cpuinfo 2 96 97 +# * || check_hw_physmem 1915071MB 1915071MB 5% +# * || check_hw_swap 0kB 0kB 3% +# * || check_hw_ib 400 mlx5_ib0:1 +# * || check_hw_ib 400 mlx5_ib1:1 +# * || check_hw_ib 400 mlx5_ib2:1 +# * || check_hw_ib 400 mlx5_ib3:1 +# * || check_hw_ib 400 mlx5_ib4:1 +# * || check_hw_ib 400 mlx5_ib5:1 +# * || check_hw_ib 400 mlx5_ib6:1 +# * || check_hw_ib 400 mlx5_ib7:1 +# * || check_hw_eth lo +# * || check_hw_eth eth0 +# * || check_hw_eth ib0 +# * || check_hw_eth ib1 +# * || check_hw_eth ib2 +# * || check_hw_eth ib3 +# * || check_hw_eth ib4 +# * || check_hw_eth ib5 +# * || check_hw_eth ib6 +# * || check_hw_eth ib7 +# * || check_hw_eth docker0 +# +######################################################################## +##### +##### GPU checks +##### +# * || 
check_nvsmi_healthmon +# * || check_gpu_xid +# * || check_cuda_bw 52 +# * || check_gpu_ecc 20000000 10000 +# * || check_gpu_clock_throttling +# * || check_nccl_allreduce 460.0 1 /opt/microsoft/ndv5-topo.xml 16G +# +# +######################################################################## +##### +##### Additional IB checks +##### +# * || check_ib_bw_gdr 380 nd96isr_v5 +# * || check_nccl_allreduce_ib_loopback 40.0 1 /opt/microsoft/ndv5-topo.xml 16G +# * || check_ib_link_flapping 6 diff --git a/h100_pass.conf b/h100_pass.conf new file mode 100644 index 0000000..ec594f5 --- /dev/null +++ b/h100_pass.conf @@ -0,0 +1,57 @@ +# NHC Configuration File +# +# Lines are in the form "||" +# Hostmask is a glob, /regexp/, or {noderange} +# Comments begin with '#' +# +# This file was automatically generated by nhc-genconf +# Sat May 6 00:05:37 UTC 2023 +# + + +####################################################################### +### +### Hardware checks +### + * || check_hw_cpuinfo 2 96 96 +# * || check_hw_physmem 1915071MB 1915071MB 5% +# * || check_hw_swap 0kB 0kB 3% +# * || check_hw_ib 400 mlx5_ib0:1 +# * || check_hw_ib 400 mlx5_ib1:1 +# * || check_hw_ib 400 mlx5_ib2:1 +# * || check_hw_ib 400 mlx5_ib3:1 +# * || check_hw_ib 400 mlx5_ib4:1 +# * || check_hw_ib 400 mlx5_ib5:1 +# * || check_hw_ib 400 mlx5_ib6:1 +# * || check_hw_ib 400 mlx5_ib7:1 +# * || check_hw_eth lo +# * || check_hw_eth eth0 +# * || check_hw_eth ib0 +# * || check_hw_eth ib1 +# * || check_hw_eth ib2 +# * || check_hw_eth ib3 +# * || check_hw_eth ib4 +# * || check_hw_eth ib5 +# * || check_hw_eth ib6 +# * || check_hw_eth ib7 +# * || check_hw_eth docker0 +# +######################################################################## +##### +##### GPU checks +##### +# * || check_nvsmi_healthmon +# * || check_gpu_xid +# * || check_cuda_bw 52 +# * || check_gpu_ecc 20000000 10000 +# * || check_gpu_clock_throttling +# * || check_nccl_allreduce 460.0 1 /opt/microsoft/ndv5-topo.xml 16G +# +# 
+######################################################################## +##### +##### Additional IB checks +##### +# * || check_ib_bw_gdr 380 nd96isr_v5 +# * || check_nccl_allreduce_ib_loopback 40.0 1 /opt/microsoft/ndv5-topo.xml 16G +# * || check_ib_link_flapping 6 From 98c4d2d8243a5a8889ea465b7f7a5beccf4c5c22 Mon Sep 17 00:00:00 2001 From: Michael Williams Date: Mon, 24 Jul 2023 09:40:54 -0700 Subject: [PATCH 12/58] fix git ignore --- .gitignore | 1 + 1 file changed, 1 insertion(+) diff --git a/.gitignore b/.gitignore index 8f67b55..331dbed 100644 --- a/.gitignore +++ b/.gitignore @@ -3,3 +3,4 @@ lbnl-nhc-1.4.3/ *.deb *stream.c *health.log +.vscode \ No newline at end of file From 2745e099fb45f9003618d9d6cc4ed26d36026aa8 Mon Sep 17 00:00:00 2001 From: Michael Williams Date: Tue, 25 Jul 2023 17:20:10 -0700 Subject: [PATCH 13/58] add gpu count test --- conf/nd96isr_h100_v5.conf | 3 ++- customTests/azure_gpu_count.nhc | 8 ++++++++ 2 files changed, 10 insertions(+), 1 deletion(-) create mode 100644 customTests/azure_gpu_count.nhc diff --git a/conf/nd96isr_h100_v5.conf b/conf/nd96isr_h100_v5.conf index c021734..63eeef0 100644 --- a/conf/nd96isr_h100_v5.conf +++ b/conf/nd96isr_h100_v5.conf @@ -40,6 +40,7 @@ #### #### GPU checks #### + * || check_gpu_count 8 * || check_nvsmi_healthmon * || check_gpu_xid * || check_cuda_bw 52 @@ -52,6 +53,6 @@ #### #### Additional IB checks #### - * || check_ib_bw_gdr 380 nd96isr_v5 + * || check_ib_bw_gdr 375 nd96isr_v5 * || check_nccl_allreduce_ib_loopback 40.0 1 /opt/microsoft/ndv5-topo.xml 16G * || check_ib_link_flapping 6 diff --git a/customTests/azure_gpu_count.nhc b/customTests/azure_gpu_count.nhc new file mode 100644 index 0000000..ba05ef3 --- /dev/null +++ b/customTests/azure_gpu_count.nhc @@ -0,0 +1,8 @@ +#!/bin/bash +function check_gpu_count() { + EXPECTED_NUM_GPU="$1" + gpu_count=$(nvidia-smi --list-gpus | wc -l) + if [ "$gpu_count" -ne "$1" ]; + die 1 "$FUNCNAME: Expected to see $EXPECTED_NUM_GPU but found $gpu_count" + fi 
+} \ No newline at end of file From 8fe16359a4742dac6de322501cefdf12279b73ed Mon Sep 17 00:00:00 2001 From: Michael Williams Date: Wed, 26 Jul 2023 13:38:47 -0700 Subject: [PATCH 14/58] remove old .sbatch --- distributed_nhc.sbatch | 8 -------- 1 file changed, 8 deletions(-) delete mode 100644 distributed_nhc.sbatch diff --git a/distributed_nhc.sbatch b/distributed_nhc.sbatch deleted file mode 100644 index 4f26e93..0000000 --- a/distributed_nhc.sbatch +++ /dev/null @@ -1,8 +0,0 @@ -#!/bin/bash -#SBATCH --nodes=2 -#SBATCH --exclusive -#SBATCH --job-name distributed_nhc -#SBATCH --error="logs/%x-%j.err" -#SBATCH --output="logs/%x-%j.out" -#SBATCH --time 00:15:00 -srun ./onetouch_nhc.sh | grep "NHC-RESULT" | sed 's/NHC-RESULT //g' | sort \ No newline at end of file From 8dc4975e8fe0c900248574b934b77941d17a47ae Mon Sep 17 00:00:00 2001 From: Michael Williams Date: Wed, 26 Jul 2023 16:19:47 -0700 Subject: [PATCH 15/58] add git option --- distributed_nhc.sb.sh | 27 ++++++++++++++++++--------- onetouch_nhc.sh | 17 ++++++++++++++--- 2 files changed, 32 insertions(+), 12 deletions(-) diff --git a/distributed_nhc.sb.sh b/distributed_nhc.sb.sh index c8f0502..1e8f203 100644 --- a/distributed_nhc.sb.sh +++ b/distributed_nhc.sb.sh @@ -4,7 +4,7 @@ #SBATCH --job-name distributed_nhc #SBATCH --error="logs/%x-%j.err" #SBATCH --output="logs/%x-%j.out" -#SBATCH --time 00:15:00 +#SBATCH --time 00:30:00 print_help() { cat << EOF @@ -13,7 +13,7 @@ Run Azure NHC distributed onto the specified set of nodes and collects the resul Example Usage: sbatch -N4 ./distributed_nhc.sb.sh - sbatch -N4 ./distributed_nhc.sb.sh -c + sbatch -N4 ./distributed_nhc.sb.sh -v ./distributed_nhc.sb.sh -F ./my_node_file ./distributed_nhc.sb.sh -w node1,node2,node3 ./distributed_nhc.sb.sh -F ./my_node_file -w additonal_node1,additional_node2 @@ -28,6 +28,8 @@ Example Usage: -v, -version, --version Optional version of Az NHC to download from git, defaults to latest from "main" Can be a branch name like "main" for 
the latest or a full commit hash for a specific version. +-g, -git, --git Optional git url to download az nhc from. Defaults to "https://github.com/Azure/azurehpc-health-checks" + -c, -config, --config Optional path to a custom NHC config file. If not specified the current VM SKU will be detected and the appropriate conf file will be used. @@ -62,24 +64,28 @@ expand_nodelist() { RAW_OUTPUT="" HEALTH_LOG_FILE_PATH="" NODELIST_ARR=() +onetouch_nhc_path=$(realpath -e "./onetouch_nhc.sh") # Running with SLURM if [ -n "$SLURM_JOB_NAME" ] && [ "$SLURM_JOB_NAME" != "interactive" ]; then NHC_JOB_NAME="$SLURM_JOB_NAME-$SLURM_JOB_ID-$(date +'%Y-%m-%d_%H-%M-%S')" HEALTH_LOG_FILE_PATH="logs/$NHC_JOB_NAME.health.log" NODELIST_ARR=( $(expand_nodelist $SLURM_JOB_NODELIST) ) - { RAW_OUTPUT=$(srun ./onetouch_nhc.sh -n $NHC_JOB_NAME $@ | tee /dev/fd/3 ); } 3>&1 + + # verify file presence on all nodes + { RAW_OUTPUT=$(srun --gpus-per-node=8 $onetouch_nhc_path -n $NHC_JOB_NAME $@ | tee /dev/fd/3 ); } 3>&1 else # Running with Parallel SSH # Arguments NODEFILE="" NODELIST="" GIT_VERSION="" + GIT_URL="" CUSTOM_CONF="" FORCE=false # Parse out arguments - options=$(getopt -l "help,nodefile:,nodelist:,version:,config:,force" -o "hF:w:v:c:f" -a -- "$@") + options=$(getopt -l "help,nodefile:,nodelist:,version:,config:,force,git" -o "hF:w:v:c:fg:" -a -- "$@") if [ $? 
-ne 0 ]; then print_help exit 1 @@ -105,6 +111,10 @@ else shift GIT_VERSION="$1" ;; + -g|--git) + shift + GIT_URL="$1" + ;; -c|--config) shift CUSTOM_CONF="$(realpath -e ${1//\~/$HOME})" @@ -148,7 +158,6 @@ else # Pssh args timeout=900 # 15 minute timeout - onetouch_nhc_path=$(realpath "./onetouch_nhc.sh") pssh_host_args=() for node in "${NODELIST_ARR[@]}"; do @@ -160,6 +169,10 @@ else nhc_args+=("-v" "$GIT_VERSION") fi + if [ -n "$GIT_URL" ]; then + nhc_args+=("-g" "$GIT_URL") + fi + if [ -n "$CUSTOM_CONF" ]; then nhc_args+=("-c" "$CUSTOM_CONF") fi @@ -168,10 +181,6 @@ else nhc_args+=("-f") fi - - #output=$(parallel-ssh -P -t $timeout -h $NODEFILE $onetouch_nhc_path) - #echo "$output" | grep "NHC-RESULT" | sed 's/\s*NHC-RESULT\s*//g' | sed "s/.*:\s+//g" | sort > $output_path - echo "Running Parallel SSH Distributed NHC on:" echo "${NODELIST_ARR[@]}" | tr ' ' '\n' echo "======================" diff --git a/onetouch_nhc.sh b/onetouch_nhc.sh index 88c89ff..2d694f4 100644 --- a/onetouch_nhc.sh +++ b/onetouch_nhc.sh @@ -10,6 +10,8 @@ Runs OneTouch Azure NHC which downloads a specific version of Azure NHC, install -v, -version, --version Optional version of Az NHC to download from git, defaults to latest from "main" Can be a branch name like "main" for the latest or a full commit hash for a specific version. +-g, -git, --git Optional git url to download Az NHC from. Defaults to "https://github.com/Azure/azurehpc-health-checks" + -c, -config, --config Optional path to a custom NHC config file. If not specified the current VM SKU will be detected and the appropriate conf file will be used. 
@@ -28,6 +30,7 @@ EOF # Arguments VERSION="main" +GIT_URL="https://github.com/Azure/azurehpc-health-checks" WORKING_DIR=$(realpath -m "$HOME/onetouch_nhc/working") OUTPUT_DIR=$WORKING_DIR JOB_NAME="$(hostname)-$(date +"%Y-%m-%d_%H-%M-%S")" @@ -35,7 +38,7 @@ CUSTOM_CONF="" FORCE=false # Parse out arguments -options=$(getopt -l "help,version:,config:,working:,output:,name:force" -o "hv:c:w:o:n:f" -a -- "$@") +options=$(getopt -l "help,version:,config:,working:,output:,name:,force,git:" -o "hv:c:w:o:n:fg:" -a -- "$@") if [ $? -ne 0 ]; then print_help @@ -54,6 +57,10 @@ case "$1" in shift VERSION="$1" ;; +-g|--git) + shift + GIT_URL="$1" + ;; -c|--config) shift CUSTOM_CONF="$(realpath -e ${1//\~/$HOME})" @@ -80,8 +87,12 @@ esac shift done +# extract git info from url +git_url_parts=($(echo "$GIT_URL" | tr '/' ' ')) +user_repo=$(echo "${git_url_parts[@]: -2:2}" | tr ' ' '_') + # Define expected paths -AZ_NHC_DIR=$(realpath -m "$WORKING_DIR/az-nhc-$VERSION") +AZ_NHC_DIR=$(realpath -m "$WORKING_DIR/$user_repo-$VERSION") INSTALL_SCRIPT_PATH="$AZ_NHC_DIR/install-nhc.sh" RUN_HEALTH_CHECKS_SCRIPT_PATH="$AZ_NHC_DIR/run-health-checks.sh" @@ -143,7 +154,7 @@ setup_nhc() { fi fi - archive_url="https://github.com/Azure/azurehpc-health-checks/archive/$version.tar.gz" + archive_url="$GIT_URL/archive/$version.tar.gz" mkdir -p $output_dir > /dev/null wget -q -O - $archive_url | tar -xz --strip=1 -C $output_dir From 7c00ce6b96b5a5114018c644039e1f27894cbd68 Mon Sep 17 00:00:00 2001 From: Michael Williams Date: Wed, 26 Jul 2023 16:22:39 -0700 Subject: [PATCH 16/58] spaces --- conf/nd96isr_h100_v5.conf | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/conf/nd96isr_h100_v5.conf b/conf/nd96isr_h100_v5.conf index 63eeef0..8fd236c 100644 --- a/conf/nd96isr_h100_v5.conf +++ b/conf/nd96isr_h100_v5.conf @@ -40,7 +40,7 @@ #### #### GPU checks #### - * || check_gpu_count 8 + * || check_gpu_count 8 * || check_nvsmi_healthmon * || check_gpu_xid * || check_cuda_bw 52 From 
77dcbb826b95b0a7734809d9da8cae2803d6b706 Mon Sep 17 00:00:00 2001 From: Michael Williams Date: Wed, 26 Jul 2023 16:26:48 -0700 Subject: [PATCH 17/58] bruh syntax error --- customTests/azure_gpu_count.nhc | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/customTests/azure_gpu_count.nhc b/customTests/azure_gpu_count.nhc index ba05ef3..26cdf31 100644 --- a/customTests/azure_gpu_count.nhc +++ b/customTests/azure_gpu_count.nhc @@ -2,7 +2,7 @@ function check_gpu_count() { EXPECTED_NUM_GPU="$1" gpu_count=$(nvidia-smi --list-gpus | wc -l) - if [ "$gpu_count" -ne "$1" ]; + if [ "$gpu_count" -ne "$1" ]; then die 1 "$FUNCNAME: Expected to see $EXPECTED_NUM_GPU but found $gpu_count" fi } \ No newline at end of file From 55a8be087f0c88ccab8b112c4f8aa64de166e854 Mon Sep 17 00:00:00 2001 From: Michael Williams Date: Wed, 26 Jul 2023 18:52:04 -0700 Subject: [PATCH 18/58] add topology check --- conf/nd96isr_h100_v5.conf | 1 + customTests/azure_hw_topology_check.nhc | 76 +++++++++++++++++++++++++ 2 files changed, 77 insertions(+) create mode 100644 customTests/azure_hw_topology_check.nhc diff --git a/conf/nd96isr_h100_v5.conf b/conf/nd96isr_h100_v5.conf index 8fd236c..9037ca3 100644 --- a/conf/nd96isr_h100_v5.conf +++ b/conf/nd96isr_h100_v5.conf @@ -35,6 +35,7 @@ * || check_hw_eth ib6 * || check_hw_eth ib7 * || check_hw_eth docker0 + * || check_hw_topology /opt/microsoft/ndv5-topo.xml ####################################################################### #### diff --git a/customTests/azure_hw_topology_check.nhc b/customTests/azure_hw_topology_check.nhc new file mode 100644 index 0000000..7c633a6 --- /dev/null +++ b/customTests/azure_hw_topology_check.nhc @@ -0,0 +1,76 @@ +#!/bin/bash + +function load_expected_topology() { + TOPO_FILE="$1" + + # Filter down to just numaids and sections + filtered_topo=$(cat $TOPO_FILE | grep -o -E -e 'numaid=\"[[:digit:]]{1,}\"' -e '' ) + + # Set the initial value of numaid + numaid="0" + + # Loop through each line of the input file 
+ echo "$filtered_topo" | while read line; do + # Check if the line starts with "numaid" + if [[ $line == numaid=* ]]; then + # If it does, update the value of numaid + numaid="${line#numaid=}" + else + # If it doesn't, extract the busid and print the line with the current value of numaid + busid="$(echo $line | awk -F'"' '{print $2}')" + result="numaid=$numaid-busid=$busid" + result="${result//\"/}" + echo $result + fi + done +} + +function get_actual_topology() { + raw_topo=$(lstopo) + + numaid="0" + echo "$raw_topo" | while read line; do + if [[ $line == *"NUMANode"* ]]; then + numaid=$( echo "${line#*L#}" | cut -d' ' -f1) + elif [[ $line == *"PCI"* ]]; then + busid=$( echo $line | awk '{print $2}') + echo "numaid=$numaid-busid=$busid" + fi + done +} + +function check_hw_topology() { + TOPO_FILE="$1" + + if [ ! -f "$TOPO_FILE" ]; then + #die 1 "$FUNCNAME: Specified topology file does not exist!" + fi + log "Checking topology against $TOPO_FILE" + expected_topo=$(load_expected_topology $TOPO_FILE) + expected_topo_arr=($( echo "$expected_topo" | tr '\n' ' ')) + + actual_topo=$(get_actual_topology) + + missing_items=() + for item in "${expected_topo_arr[@]}" + do + if ! echo "$actual_topo" | grep -q -x "$item"; then + missing_items+=("$item") + fi + done + + if [ ${#missing_items[@]} -ne 0 ]; then + + formatted_missing=() + for item in "${missing_items[@]}" + do + item=$(echo $item | tr '-' '=') # change to = for awk + numaid=$(echo $item | awk -F'=' '{print $2}') + busid=$(echo $item | awk -F'=' '{print $4}') + formatted_missing+=(" PCI Bus $busid on NUMANode $numaid") + done + + missing_str=$(IFS=',' ; echo "${formatted_missing[*]}") + die 1 "$FUNCNAME: Topology mismatch! 
Expected to find$missing_str" # awkward spacing to get formatting right + fi +} \ No newline at end of file From 263981cca6509e7cc67818acb1660416f9df1fa8 Mon Sep 17 00:00:00 2001 From: Michael Williams Date: Wed, 26 Jul 2023 18:57:52 -0700 Subject: [PATCH 19/58] fix nodelist --- distributed_nhc.sb.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/distributed_nhc.sb.sh b/distributed_nhc.sb.sh index 1e8f203..1b3346b 100644 --- a/distributed_nhc.sb.sh +++ b/distributed_nhc.sb.sh @@ -137,7 +137,7 @@ else fi if [ -n "$NODELIST" ]; then - NODELIST_ARR+=( $(echo $NODELIST | sed "s/,/ /g") ) + NODELIST_ARR+=( $(expand_nodelist $NODELIST ) ) fi if [ ${#NODELIST_ARR[@]} -eq 0 ]; then From a036e0dd6c174f01b7be7f434779366cdf9b6e6b Mon Sep 17 00:00:00 2001 From: Michael Williams Date: Wed, 26 Jul 2023 19:01:16 -0700 Subject: [PATCH 20/58] try fix --- customTests/azure_hw_topology_check.nhc | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/customTests/azure_hw_topology_check.nhc b/customTests/azure_hw_topology_check.nhc index 7c633a6..be380d3 100644 --- a/customTests/azure_hw_topology_check.nhc +++ b/customTests/azure_hw_topology_check.nhc @@ -43,10 +43,10 @@ function check_hw_topology() { TOPO_FILE="$1" if [ ! -f "$TOPO_FILE" ]; then - #die 1 "$FUNCNAME: Specified topology file does not exist!" + die 1 "$FUNCNAME: Specified topology file does not exist!" 
fi log "Checking topology against $TOPO_FILE" - expected_topo=$(load_expected_topology $TOPO_FILE) + expected_topo=$(load_expected_topology "$TOPO_FILE") expected_topo_arr=($( echo "$expected_topo" | tr '\n' ' ')) actual_topo=$(get_actual_topology) From c2f5485b37bab29da977df7b450b11d11d257ee2 Mon Sep 17 00:00:00 2001 From: Michael Williams Date: Wed, 26 Jul 2023 19:15:12 -0700 Subject: [PATCH 21/58] chmod +x --- distributed_nhc.sb.sh | 0 onetouch_nhc.sh | 0 2 files changed, 0 insertions(+), 0 deletions(-) mode change 100644 => 100755 distributed_nhc.sb.sh mode change 100644 => 100755 onetouch_nhc.sh diff --git a/distributed_nhc.sb.sh b/distributed_nhc.sb.sh old mode 100644 new mode 100755 diff --git a/onetouch_nhc.sh b/onetouch_nhc.sh old mode 100644 new mode 100755 From f2485f25c9712ba3df266a19f1ed992962e0dada Mon Sep 17 00:00:00 2001 From: Michael Williams Date: Fri, 28 Jul 2023 10:13:05 -0700 Subject: [PATCH 22/58] remind people that it will take a few minutes to complete --- distributed_nhc.sb.sh | 1 + onetouch_nhc.sh | 2 ++ 2 files changed, 3 insertions(+) diff --git a/distributed_nhc.sb.sh b/distributed_nhc.sb.sh index 1b3346b..88eed81 100755 --- a/distributed_nhc.sb.sh +++ b/distributed_nhc.sb.sh @@ -184,6 +184,7 @@ else echo "Running Parallel SSH Distributed NHC on:" echo "${NODELIST_ARR[@]}" | tr ' ' '\n' echo "======================" + echo "The health check is running, it will take a few minutes to complete." 
RAW_OUTPUT=$(parallel-ssh -P -t $timeout ${pssh_host_args[@]} $onetouch_nhc_path ${nhc_args[@]} 3> $error_path | tee $output_path) fi diff --git a/onetouch_nhc.sh b/onetouch_nhc.sh index 2d694f4..db58a90 100755 --- a/onetouch_nhc.sh +++ b/onetouch_nhc.sh @@ -183,11 +183,13 @@ run_health_checks() { if [ -z $custom_conf ]; then # if no custom config is provided, let run-health-checks.sh auto-detect + echo "The health check has been started, it will typically take a few minutes to complete" sudo $RUN_HEALTH_CHECKS_SCRIPT_PATH $log_file_path else # otherwise, run it ourselves custom_conf=$(realpath "$custom_conf") echo "Running health checks using $custom_conf" + echo "The health check has been started, it will typically take a few minutes to complete" sudo nhc CONFFILE=$custom_conf LOGFILE=$log_file_path TIMEOUT=500 fi From f3585b1df91cc55a332e4cf1284140b1e98a7a47 Mon Sep 17 00:00:00 2001 From: Michael Williams Date: Fri, 28 Jul 2023 12:59:54 -0700 Subject: [PATCH 23/58] updating stdout logs, export to kusto --- distributed_nhc.sb.sh | 17 +++++++++-- export_health_log_to_kusto.py | 53 +++++++++++++++++++++++++++++++++++ onetouch_nhc.sh | 4 +-- 3 files changed, 70 insertions(+), 4 deletions(-) create mode 100644 export_health_log_to_kusto.py diff --git a/distributed_nhc.sb.sh b/distributed_nhc.sb.sh index 88eed81..2f5afd9 100755 --- a/distributed_nhc.sb.sh +++ b/distributed_nhc.sb.sh @@ -65,6 +65,7 @@ RAW_OUTPUT="" HEALTH_LOG_FILE_PATH="" NODELIST_ARR=() onetouch_nhc_path=$(realpath -e "./onetouch_nhc.sh") +nhc_start_time=$(date +%s.%N) # Running with SLURM if [ -n "$SLURM_JOB_NAME" ] && [ "$SLURM_JOB_NAME" != "interactive" ]; then @@ -151,7 +152,7 @@ else NODELIST_ARR=( $(echo "${NODELIST_ARR[@]}" | tr ' ' '\n' | sort -u | tr '\n' ' ') ) # Log file paths - jobname="distributed_nhc-pssh-$(date +'%Y-%m-%d_%H-%M-%S')" + jobname="distributed_nhc-pssh-$(date --utc +'%Y-%m-%d_%H-%M-%S')" HEALTH_LOG_FILE_PATH="logs/$jobname.health.log" output_path="logs/$jobname.out" 
error_path="logs/$jobname.err" @@ -188,6 +189,9 @@ else RAW_OUTPUT=$(parallel-ssh -P -t $timeout ${pssh_host_args[@]} $onetouch_nhc_path ${nhc_args[@]} 3> $error_path | tee $output_path) fi +nhc_end_time=$(date +%s.%N) +nhc_duration=$(printf "%.2f" $(echo "($nhc_end_time - $nhc_start_time) / 60" | bc -l)) + # Filter down to NHC-RESULTS NHC_RESULTS=$(echo "$RAW_OUTPUT" | grep "NHC-RESULT" | sed 's/.*NHC-RESULT\s*//g') @@ -201,4 +205,13 @@ for missing_node in "${nodes_missing_results[@]}"; do done echo "$NHC_RESULTS" | sort >> $HEALTH_LOG_FILE_PATH -cat $HEALTH_LOG_FILE_PATH \ No newline at end of file +echo "======================" +cat $HEALTH_LOG_FILE_PATH + +echo "======================" +echo "NHC took $nhc_duration minutes to finish" +echo +echo "Exporting results to Kusto" +kusto_export_script=$(realpath -e "./export_health_log_to_kusto.py") +python3 $kusto_export_script $HEALTH_LOG_FILE_PATH +echo "Ingestion queued, results take ~5 minutes to appear in Kusto" \ No newline at end of file diff --git a/export_health_log_to_kusto.py b/export_health_log_to_kusto.py new file mode 100644 index 0000000..ce327b8 --- /dev/null +++ b/export_health_log_to_kusto.py @@ -0,0 +1,53 @@ +#!/usr/bin/python3 +import sys +import os +from datetime import datetime +from csv import DictReader +from argparse import ArgumentParser +from azure.identity import ManagedIdentityCredential +from azure.kusto.data import KustoConnectionStringBuilder +from azure.kusto.ingest import QueuedIngestClient, IngestionProperties +import pandas as pd + +health_files = sys.argv[1:] + +for health_file in health_files: + try: + if not os.path.exists(health_file): + raise FileNotFoundError(f"Cannot find file '{health_file}'") + + # distributed_nhc-pssh-2023-07-27_02-04-52.health.log + filename_parts = os.path.basename(health_file).split("-", maxsplit=2) + ts_str = filename_parts[2].split(".")[0] + ts = datetime.strptime(ts_str, "%Y-%m-%d_%H-%M-%S") + + job_name = filename_parts[1] + + if job_name == 
"pssh": + job_name = f"{job_name}-{ts_str}" + + with open(health_file, 'r') as f: + lines = f.readlines() + reader = DictReader(lines, fieldnames = ["Hostname", "RawResult"], delimiter='|', restkey="extra") + + df = pd.DataFrame(reader) + df['Timestamp'] = ts + df['JobName'] = job_name + df['NodeName'] = df.apply(lambda x: x['Hostname'].strip(), axis=1) + df['RawResult'] = df.apply(lambda x: x['RawResult'].strip(), axis=1) + df['Healthy'] = df.apply(lambda x: x['RawResult'] == "Healthy", axis=1) + df = df[['Timestamp', 'JobName', 'Hostname', 'Healthy', 'RawResult']] + + creds = ManagedIdentityCredential( + client_id = "16b52144-5ca5-4c25-aac5-0d3b7a4cb36d" + ) + + ingest_url = "https://ingest-aistresstests.centralus.kusto.windows.net" + database = "sat13c04_stress_testdb" + table_name = "NodeHealthCheck" + + ingest_client = QueuedIngestClient(KustoConnectionStringBuilder.with_azure_token_credential(ingest_url, creds)) + print(f"Ingesting health results from {os.path.basename(health_file)} into {ingest_url} at {database}/{table_name}") + ingest_client.ingest_from_dataframe(df, IngestionProperties(database, table_name)) + except FileNotFoundError: + print("Cannot find file '{health_file}', skipping...") \ No newline at end of file diff --git a/onetouch_nhc.sh b/onetouch_nhc.sh index db58a90..7d64bfc 100755 --- a/onetouch_nhc.sh +++ b/onetouch_nhc.sh @@ -30,10 +30,10 @@ EOF # Arguments VERSION="main" -GIT_URL="https://github.com/Azure/azurehpc-health-checks" +GIT_URL="https://github.com/mpwillia/azurehpc-health-checks" WORKING_DIR=$(realpath -m "$HOME/onetouch_nhc/working") OUTPUT_DIR=$WORKING_DIR -JOB_NAME="$(hostname)-$(date +"%Y-%m-%d_%H-%M-%S")" +JOB_NAME="$(hostname)-$(date --utc +"%Y-%m-%d_%H-%M-%S")" CUSTOM_CONF="" FORCE=false From dcf33842de06c95a8d01a069a081ff9b704a360b Mon Sep 17 00:00:00 2001 From: Michael Williams Date: Fri, 28 Jul 2023 13:46:48 -0700 Subject: [PATCH 24/58] errors --- export_health_log_to_kusto.py | 2 ++ 1 file changed, 2 insertions(+) 
diff --git a/export_health_log_to_kusto.py b/export_health_log_to_kusto.py index ce327b8..70cd756 100644 --- a/export_health_log_to_kusto.py +++ b/export_health_log_to_kusto.py @@ -50,4 +50,6 @@ print(f"Ingesting health results from {os.path.basename(health_file)} into {ingest_url} at {database}/{table_name}") ingest_client.ingest_from_dataframe(df, IngestionProperties(database, table_name)) except FileNotFoundError: + if len(health_files) == 1: + raise print("Cannot find file '{health_file}', skipping...") \ No newline at end of file From 68d54e04ff063cec7329b81dcaeb74f8c200d550 Mon Sep 17 00:00:00 2001 From: Michael Williams Date: Fri, 28 Jul 2023 14:02:49 -0700 Subject: [PATCH 25/58] remove old confs --- h100_fail.conf | 57 -------------------------------------------------- h100_pass.conf | 57 -------------------------------------------------- 2 files changed, 114 deletions(-) delete mode 100644 h100_fail.conf delete mode 100644 h100_pass.conf diff --git a/h100_fail.conf b/h100_fail.conf deleted file mode 100644 index 8aff28b..0000000 --- a/h100_fail.conf +++ /dev/null @@ -1,57 +0,0 @@ -# NHC Configuration File -# -# Lines are in the form "||" -# Hostmask is a glob, /regexp/, or {noderange} -# Comments begin with '#' -# -# This file was automatically generated by nhc-genconf -# Sat May 6 00:05:37 UTC 2023 -# - - -####################################################################### -### -### Hardware checks -### - * || check_hw_cpuinfo 2 96 97 -# * || check_hw_physmem 1915071MB 1915071MB 5% -# * || check_hw_swap 0kB 0kB 3% -# * || check_hw_ib 400 mlx5_ib0:1 -# * || check_hw_ib 400 mlx5_ib1:1 -# * || check_hw_ib 400 mlx5_ib2:1 -# * || check_hw_ib 400 mlx5_ib3:1 -# * || check_hw_ib 400 mlx5_ib4:1 -# * || check_hw_ib 400 mlx5_ib5:1 -# * || check_hw_ib 400 mlx5_ib6:1 -# * || check_hw_ib 400 mlx5_ib7:1 -# * || check_hw_eth lo -# * || check_hw_eth eth0 -# * || check_hw_eth ib0 -# * || check_hw_eth ib1 -# * || check_hw_eth ib2 -# * || check_hw_eth ib3 -# * || 
check_hw_eth ib4 -# * || check_hw_eth ib5 -# * || check_hw_eth ib6 -# * || check_hw_eth ib7 -# * || check_hw_eth docker0 -# -######################################################################## -##### -##### GPU checks -##### -# * || check_nvsmi_healthmon -# * || check_gpu_xid -# * || check_cuda_bw 52 -# * || check_gpu_ecc 20000000 10000 -# * || check_gpu_clock_throttling -# * || check_nccl_allreduce 460.0 1 /opt/microsoft/ndv5-topo.xml 16G -# -# -######################################################################## -##### -##### Additional IB checks -##### -# * || check_ib_bw_gdr 380 nd96isr_v5 -# * || check_nccl_allreduce_ib_loopback 40.0 1 /opt/microsoft/ndv5-topo.xml 16G -# * || check_ib_link_flapping 6 diff --git a/h100_pass.conf b/h100_pass.conf deleted file mode 100644 index ec594f5..0000000 --- a/h100_pass.conf +++ /dev/null @@ -1,57 +0,0 @@ -# NHC Configuration File -# -# Lines are in the form "||" -# Hostmask is a glob, /regexp/, or {noderange} -# Comments begin with '#' -# -# This file was automatically generated by nhc-genconf -# Sat May 6 00:05:37 UTC 2023 -# - - -####################################################################### -### -### Hardware checks -### - * || check_hw_cpuinfo 2 96 96 -# * || check_hw_physmem 1915071MB 1915071MB 5% -# * || check_hw_swap 0kB 0kB 3% -# * || check_hw_ib 400 mlx5_ib0:1 -# * || check_hw_ib 400 mlx5_ib1:1 -# * || check_hw_ib 400 mlx5_ib2:1 -# * || check_hw_ib 400 mlx5_ib3:1 -# * || check_hw_ib 400 mlx5_ib4:1 -# * || check_hw_ib 400 mlx5_ib5:1 -# * || check_hw_ib 400 mlx5_ib6:1 -# * || check_hw_ib 400 mlx5_ib7:1 -# * || check_hw_eth lo -# * || check_hw_eth eth0 -# * || check_hw_eth ib0 -# * || check_hw_eth ib1 -# * || check_hw_eth ib2 -# * || check_hw_eth ib3 -# * || check_hw_eth ib4 -# * || check_hw_eth ib5 -# * || check_hw_eth ib6 -# * || check_hw_eth ib7 -# * || check_hw_eth docker0 -# -######################################################################## -##### -##### GPU checks -##### -# * || 
check_nvsmi_healthmon -# * || check_gpu_xid -# * || check_cuda_bw 52 -# * || check_gpu_ecc 20000000 10000 -# * || check_gpu_clock_throttling -# * || check_nccl_allreduce 460.0 1 /opt/microsoft/ndv5-topo.xml 16G -# -# -######################################################################## -##### -##### Additional IB checks -##### -# * || check_ib_bw_gdr 380 nd96isr_v5 -# * || check_nccl_allreduce_ib_loopback 40.0 1 /opt/microsoft/ndv5-topo.xml 16G -# * || check_ib_link_flapping 6 From 74aabe9247c51daeb99241022501f9ce0ec97700 Mon Sep 17 00:00:00 2001 From: Michael Williams Date: Fri, 28 Jul 2023 14:10:59 -0700 Subject: [PATCH 26/58] add requirements.txt --- requirements.txt | 4 ++++ 1 file changed, 4 insertions(+) create mode 100644 requirements.txt diff --git a/requirements.txt b/requirements.txt new file mode 100644 index 0000000..8b67417 --- /dev/null +++ b/requirements.txt @@ -0,0 +1,4 @@ +azure-identity +azure-kusto-data +azure-kusto-ingest +pandas \ No newline at end of file From e2486c9b58c7b8d1378aa7c65d4c06e249156fdf Mon Sep 17 00:00:00 2001 From: Michael Williams Date: Fri, 28 Jul 2023 15:53:19 -0700 Subject: [PATCH 27/58] make nhc very verbose --- onetouch_nhc.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/onetouch_nhc.sh b/onetouch_nhc.sh index 7d64bfc..b215923 100755 --- a/onetouch_nhc.sh +++ b/onetouch_nhc.sh @@ -190,7 +190,7 @@ run_health_checks() { custom_conf=$(realpath "$custom_conf") echo "Running health checks using $custom_conf" echo "The health check has been started, it will typically take a few minutes to complete" - sudo nhc CONFFILE=$custom_conf LOGFILE=$log_file_path TIMEOUT=500 + sudo nhc -d -v CONFFILE=$custom_conf LOGFILE=$log_file_path TIMEOUT=500 fi } From 83e39cce58e76341ce6a7ce55e4a711f61410372 Mon Sep 17 00:00:00 2001 From: Michael Williams Date: Fri, 28 Jul 2023 15:58:00 -0700 Subject: [PATCH 28/58] very verbose nhc --- run-health-checks.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git 
a/run-health-checks.sh b/run-health-checks.sh index dfad5d7..eefa961 100755 --- a/run-health-checks.sh +++ b/run-health-checks.sh @@ -42,4 +42,4 @@ fi log_path="${1:-./health.log}" log_path=$(realpath "$log_path") -nhc CONFFILE=$(dirname "${BASH_SOURCE[0]}")/conf/$conf_name.conf LOGFILE=$log_path TIMEOUT=500 +nhc -d -v CONFFILE=$(dirname "${BASH_SOURCE[0]}")/conf/$conf_name.conf LOGFILE=$log_path TIMEOUT=500 From 049efacf68b4ad3b1a3c7b522e353b6fbb40a218 Mon Sep 17 00:00:00 2001 From: Michael Williams Date: Fri, 28 Jul 2023 16:37:56 -0700 Subject: [PATCH 29/58] update tests with more info --- customTests/azure_cuda_bandwidth.nhc | 260 +++++++++--------- customTests/azure_hw_topology_check.nhc | 2 +- .../azure_nccl_allreduce_ib_loopback.nhc | 7 +- 3 files changed, 142 insertions(+), 127 deletions(-) diff --git a/customTests/azure_cuda_bandwidth.nhc b/customTests/azure_cuda_bandwidth.nhc index a4fbfe2..f049c7a 100755 --- a/customTests/azure_cuda_bandwidth.nhc +++ b/customTests/azure_cuda_bandwidth.nhc @@ -10,21 +10,21 @@ #Catch error codes that may be thrown by the executable passed as the first #input, and if an error code is tripped throw the second input as a message catch_error() { - declare -g output - output=$($1) - err_code=$? - if [ $err_code -ne 0 ]; then - die 1 "\t $2 $err_code" >&2 - return 1 - fi - return 0 + declare -g output + output=$($1) + err_code=$? + if [ $err_code -ne 0 ]; then + die 1 "\t $2 $err_code" >&2 + return 1 + fi + return 0 } function cleanup { - dbg "Unlocking graphics clock before exit..." - sudo timeout 3m nvidia-smi -rgc > /dev/null 2>&1 + dbg "Unlocking graphics clock before exit..." 
+ sudo timeout 3m nvidia-smi -rgc > /dev/null 2>&1 } @@ -32,117 +32,131 @@ function cleanup { function check_cuda_bw() { - #set expected BW set to default value if argument empty - EXP_CUDA_BW=$1 - if [[ -z "$EXP_CUDA_BW" ]]; then - EXP_CUDA_BW=24 - fi - - # location of executables, must match setup location - EXE_DIR=$2 - if [[ -z "$EXE_DIR" ]]; then - EXE_DIR=/opt/azurehpc/test/nhc - fi - #Count the number of gpu-name nvidia-smi outputs. - error_smi="**Fail** nvidia-smi failed with error code" - #Lock graphics clocks to max freq to eliminate any time for the GPUs to boost. - #This likely isn't important for performance here, but we will do it anyway - #to be safe. - SKU=$( curl -H Metadata:true --max-time 10 -s "http://169.254.169.254/metadata/instance/compute/vmSize?api-version=2021-01-01&format=text") - SKU="${SKU,,}" - lock_clocks= - if echo "$SKU" | grep -q "nd96asr_v4"; then - lock_clocks="sudo nvidia-smi -lgc 1400" - elif echo "$SKU" | grep -q "nd96amsr_a100_v4"; then - lock_clocks="sudo nvidia-smi -lgc 1400" - elif echo "$SKU" | grep -q "nd96isr_h100_v5"; then - lock_clocks="sudo nvidia-smi -lgc 2619" - fi - - if [[ -n "$lock_clocks" ]]; then - if ! catch_error "$lock_clocks" "$error_smi"; then - return 0 - fi - fi - - #exit function to unlock clocks on exit - trap cleanup EXIT - - #Count the GPUs. - gpu_list="timeout 3m nvidia-smi --query-gpu=name --format=csv,noheader" - if ! catch_error "$gpu_list" "$error_smi"; then - return 0 - fi - ngpus=$(echo "$output" | wc -l) - - #Run device to host bandwidth test. - exec_htod="timeout 3m $EXE_DIR/gpu-copy --size 134217728 --htod" - error_htod="**Fail** The htod gpu_copy test failed to execute." - error_htod+="It exited with error code" - if ! catch_error "$exec_htod" "$error_htod"; then - return 0 - fi - x_htod=$(echo "$output") - - #Run host to device bandwidth test. - exec_dtoh="timeout 3m $EXE_DIR/gpu-copy --size 134217728 --dtoh" - error_dtoh="**Fail** The dtoh gpu_copy test failed to execute." 
- error_dtoh+="It exited with error code" - if ! catch_error "$exec_dtoh" "$error_dtoh"; then - return 0 - fi - x_dtoh=$(echo "$output") - pass=1 - - #Loop over all of the detected GPUs. - for i in $(seq 0 $((ngpus-1))); do - #Collect host to device bandwidths computed in each numa zone. - bw_htod=$(echo "$x_htod" | grep "gpu$i" | cut -d' ' -f2 | cut -d. -f1) - max_htodbw=0 - min_bw=100 - #Loop over the bandwidths observed in each numa zone and find max. - for bw in $bw_htod; do - if [ $max_htodbw -lt $bw ]; then - max_htodbw=$bw - fi - done - - #Collect device to host bandwidths computed in each numa zone. - bw_dtoh=$(echo "$x_dtoh" | grep "gpu$i" | cut -d' ' -f2 | cut -d. -f1) - max_dtohbw=0 - #Loop over bandwidths observed in each numa zone and find max. - for bw in $bw_dtoh; do - if [ $max_dtohbw -lt $bw ]; then - max_dtohbw=$bw - fi - done - #Find minimum of the htod and dtoh bandwidths. - if [ $max_htodbw -lt $max_dtohbw ]; then - min_bw=$max_htodbw - else - min_bw=$max_dtohbw - fi - - #If the min bandwidth is too low the test has failed. - if [ $min_bw -lt $EXP_CUDA_BW ]; then - die 1 "Bandwidth is low on device $i. Reported bandwidth is"\ - "$min_bw GB/s." - pass=0 - return 0 - fi - done - #Unlock the graphics clock. - unlock_clocks="sudo timeout 3m nvidia-smi -rgc" - - if ! catch_error "$unlock_clocks" "$error_smi"; then - return 0 - fi - - if [ $pass -ne 1 ]; then - die 1 -e "\t **Fail** At least one device reported low htod or dtoh"\ - "bandwidth." - return 0 - else - return 0 - fi + #set expected BW set to default value if argument empty + EXP_CUDA_BW=$1 + if [[ -z "$EXP_CUDA_BW" ]]; then + EXP_CUDA_BW=24 + fi + + # location of executables, must match setup location + EXE_DIR=$2 + if [[ -z "$EXE_DIR" ]]; then + EXE_DIR=/opt/azurehpc/test/nhc + fi + #Count the number of gpu-name nvidia-smi outputs. + error_smi="**Fail** nvidia-smi failed with error code" + #Lock graphics clocks to max freq to eliminate any time for the GPUs to boost. 
+ #This likely isn't important for performance here, but we will do it anyway + #to be safe. + SKU=$( curl -H Metadata:true --max-time 10 -s "http://169.254.169.254/metadata/instance/compute/vmSize?api-version=2021-01-01&format=text") + SKU="${SKU,,}" + lock_clocks= + if echo "$SKU" | grep -q "nd96asr_v4"; then + lock_clocks="sudo nvidia-smi -lgc 1400" + elif echo "$SKU" | grep -q "nd96amsr_a100_v4"; then + lock_clocks="sudo nvidia-smi -lgc 1400" + elif echo "$SKU" | grep -q "nd96isr_h100_v5"; then + lock_clocks="sudo nvidia-smi -lgc 2619" + fi + + if [[ -n "$lock_clocks" ]]; then + if ! catch_error "$lock_clocks" "$error_smi"; then + return 0 + fi + fi + + #exit function to unlock clocks on exit + trap cleanup EXIT + + #Count the GPUs. + gpu_list="timeout 3m nvidia-smi --query-gpu=name --format=csv,noheader" + if ! catch_error "$gpu_list" "$error_smi"; then + return 0 + fi + ngpus=$(echo "$output" | wc -l) + + #Run device to host bandwidth test. + exec_htod="timeout 3m $EXE_DIR/gpu-copy --size 134217728 --htod" + error_htod="**Fail** The htod gpu_copy test failed to execute." + error_htod+="It exited with error code" + if ! catch_error "$exec_htod" "$error_htod"; then + return 0 + fi + x_htod=$(echo "$output") + + #Run host to device bandwidth test. + exec_dtoh="timeout 3m $EXE_DIR/gpu-copy --size 134217728 --dtoh" + error_dtoh="**Fail** The dtoh gpu_copy test failed to execute." + error_dtoh+="It exited with error code" + if ! catch_error "$exec_dtoh" "$error_dtoh"; then + return 0 + fi + x_dtoh=$(echo "$output") + pass=1 + + #Loop over all of the detected GPUs. + + low_bw_devices=() + for i in $(seq 0 $((ngpus-1))); do + #Collect host to device bandwidths computed in each numa zone. + bw_htod=$(echo "$x_htod" | grep "gpu$i" | cut -d' ' -f2 | cut -d. -f1) + max_htodbw=0 + min_bw=100 + #Loop over the bandwidths observed in each numa zone and find max. 
+ for bw in $bw_htod; do + if [ $max_htodbw -lt $bw ]; then + max_htodbw=$bw + fi + done + + dbg "Device $i Host to Device reported bandwidth is $max_htodbw GB/s" + + #Collect device to host bandwidths computed in each numa zone. + bw_dtoh=$(echo "$x_dtoh" | grep "gpu$i" | cut -d' ' -f2 | cut -d. -f1) + max_dtohbw=0 + #Loop over bandwidths observed in each numa zone and find max. + for bw in $bw_dtoh; do + if [ $max_dtohbw -lt $bw ]; then + max_dtohbw=$bw + fi + done + + dbg "Device $i Device to Host reported bandwidth is $max_dtohbw GB/s" + + #Find minimum of the htod and dtoh bandwidths. + if [ $max_htodbw -lt $max_dtohbw ]; then + min_bw=$max_htodbw + else + min_bw=$max_dtohbw + fi + + #If the min bandwidth is too low the test has failed. + if [ $min_bw -lt $EXP_CUDA_BW ]; then + low_bw_devices+=("$i-$min_bw") + pass=0 + fi + done + #Unlock the graphics clock. + unlock_clocks="sudo timeout 3m nvidia-smi -rgc" + + if ! catch_error "$unlock_clocks" "$error_smi"; then + return 0 + fi + + if [ $pass -ne 1 ]; then + + formatted_low_bw=() + for item in "${low_bw_devices[@]}" + do + deviceid=$(echo $item | awk -F'-' '{print $1}') + bw=$(echo $item | awk -F'-' '{print $2}') + formatted_low_bw+=(" Device $deviceid reports low bandwidth of $bw GB/s") + done + + low_bw_str=$(IFS=',' ; echo "${formatted_low_bw[*]}") + die 1 "$FUNCNAME: Low bandwidth reported on one or more devices!$low_bw_str" + return 0 + else + return 0 + fi } diff --git a/customTests/azure_hw_topology_check.nhc b/customTests/azure_hw_topology_check.nhc index be380d3..02ea8fe 100644 --- a/customTests/azure_hw_topology_check.nhc +++ b/customTests/azure_hw_topology_check.nhc @@ -45,7 +45,7 @@ function check_hw_topology() { if [ ! -f "$TOPO_FILE" ]; then die 1 "$FUNCNAME: Specified topology file does not exist!" 
fi - log "Checking topology against $TOPO_FILE" + dbg "Checking topology against $TOPO_FILE" expected_topo=$(load_expected_topology "$TOPO_FILE") expected_topo_arr=($( echo "$expected_topo" | tr '\n' ' ')) diff --git a/customTests/azure_nccl_allreduce_ib_loopback.nhc b/customTests/azure_nccl_allreduce_ib_loopback.nhc index a722457..ec041b1 100644 --- a/customTests/azure_nccl_allreduce_ib_loopback.nhc +++ b/customTests/azure_nccl_allreduce_ib_loopback.nhc @@ -52,10 +52,11 @@ function check_nccl_allreduce_ib_loopback() { dbg "$nccl_allreduce_ib_loopback_out" log "Iteration ${iter} of ${REPEATS} failed: NCCL allreduce IB loopback bandwidth $avg_bus_bw GB/s < $EXP_NCCL_ALLREDUCE_IB_LOOPBACK_BW GB/s" else + dbg "NCCL allreduce IB loopback bandwidth $avg_bus_bw GB/s" return 0 fi - done + done - die 1 "$FUNCNAME: NCCL allreduce, BUS BW (expected >=$EXP_NCCL_ALLREDUCE_IB_LOOPBACK_BW GB/s, but measured $avg_bus_bw GB/s)" - return 0 + die 1 "$FUNCNAME: NCCL allreduce, BUS BW (expected >=$EXP_NCCL_ALLREDUCE_IB_LOOPBACK_BW GB/s, but measured $avg_bus_bw GB/s)" + return 0 } From 7be8e3eb7020f38db1faf09eb7708cbf13ad5b42 Mon Sep 17 00:00:00 2001 From: Michael Williams Date: Fri, 28 Jul 2023 18:48:26 -0700 Subject: [PATCH 30/58] debug logs and debug log export --- distributed_nhc.sb.sh | 18 +++++-- export_health_log_to_kusto.py | 55 -------------------- export_nhc_result_to_kusto.py | 96 +++++++++++++++++++++++++++++++++++ onetouch_nhc.sh | 16 +++++- 4 files changed, 125 insertions(+), 60 deletions(-) delete mode 100644 export_health_log_to_kusto.py create mode 100644 export_nhc_result_to_kusto.py diff --git a/distributed_nhc.sb.sh b/distributed_nhc.sb.sh index 2f5afd9..858160e 100755 --- a/distributed_nhc.sb.sh +++ b/distributed_nhc.sb.sh @@ -63,6 +63,7 @@ expand_nodelist() { RAW_OUTPUT="" HEALTH_LOG_FILE_PATH="" +DEBUG_LOG_FILE_PATH="" NODELIST_ARR=() onetouch_nhc_path=$(realpath -e "./onetouch_nhc.sh") nhc_start_time=$(date +%s.%N) @@ -70,7 +71,8 @@ nhc_start_time=$(date 
+%s.%N) # Running with SLURM if [ -n "$SLURM_JOB_NAME" ] && [ "$SLURM_JOB_NAME" != "interactive" ]; then NHC_JOB_NAME="$SLURM_JOB_NAME-$SLURM_JOB_ID-$(date +'%Y-%m-%d_%H-%M-%S')" - HEALTH_LOG_FILE_PATH="logs/$NHC_JOB_NAME.health.log" + HEALTH_LOG_FILE_PATH=$(realpath -m "./logs/$NHC_JOB_NAME.health.log") + DEBUG_LOG_FILE_PATH=$(realpath -m "./logs/$NHC_JOB_NAME.debug.log") NODELIST_ARR=( $(expand_nodelist $SLURM_JOB_NODELIST) ) # verify file presence on all nodes @@ -153,7 +155,8 @@ else # Log file paths jobname="distributed_nhc-pssh-$(date --utc +'%Y-%m-%d_%H-%M-%S')" - HEALTH_LOG_FILE_PATH="logs/$jobname.health.log" + HEALTH_LOG_FILE_PATH=$(realpath -m "./logs/$jobname.health.log") + DEBUG_LOG_FILE_PATH=$(realpath -m "./logs/$jobname.debug.log") output_path="logs/$jobname.out" error_path="logs/$jobname.err" @@ -194,6 +197,9 @@ nhc_duration=$(printf "%.2f" $(echo "($nhc_end_time - $nhc_start_time) / 60" | b # Filter down to NHC-RESULTS NHC_RESULTS=$(echo "$RAW_OUTPUT" | grep "NHC-RESULT" | sed 's/.*NHC-RESULT\s*//g') +NHC_DEBUG=$(echo "$RAW_OUTPUT" | grep "NHC-DEBUG" | sed 's/.*NHC-DEBUG\s*//g') +echo "Dumping NHC Debug into $DEBUG_LOG_FILE_PATH" +echo "$NHC_DEBUG" | sort >> $DEBUG_LOG_FILE_PATH # Identify nodes who should have reported results but didn't, these failed for some unknown reason nodes_with_results_arr=( $( echo "$NHC_RESULTS" | sed 's/\s*|.*//g' | tr '\n' ' ' ) ) @@ -204,6 +210,7 @@ for missing_node in "${nodes_missing_results[@]}"; do NHC_RESULTS+="$newline$missing_node | ERROR: No results reported" done +echo "Health report can be found into $HEALTH_LOG_FILE_PATH" echo "$NHC_RESULTS" | sort >> $HEALTH_LOG_FILE_PATH echo "======================" cat $HEALTH_LOG_FILE_PATH @@ -212,6 +219,9 @@ echo "======================" echo "NHC took $nhc_duration minutes to finish" echo echo "Exporting results to Kusto" -kusto_export_script=$(realpath -e "./export_health_log_to_kusto.py") -python3 $kusto_export_script $HEALTH_LOG_FILE_PATH 
+requirements_file=$(realpath -e "./requirements.txt") +$(pip install -r $requirements_file) +kusto_export_script=$(realpath -e "./export_nhc_result_to_kusto.py") +echo "Using export script $kusto_export_script" +python3 $kusto_export_script $HEALTH_LOG_FILE_PATH $DEBUG_LOG_FILE_PATH echo "Ingestion queued, results take ~5 minutes to appear in Kusto" \ No newline at end of file diff --git a/export_health_log_to_kusto.py b/export_health_log_to_kusto.py deleted file mode 100644 index 70cd756..0000000 --- a/export_health_log_to_kusto.py +++ /dev/null @@ -1,55 +0,0 @@ -#!/usr/bin/python3 -import sys -import os -from datetime import datetime -from csv import DictReader -from argparse import ArgumentParser -from azure.identity import ManagedIdentityCredential -from azure.kusto.data import KustoConnectionStringBuilder -from azure.kusto.ingest import QueuedIngestClient, IngestionProperties -import pandas as pd - -health_files = sys.argv[1:] - -for health_file in health_files: - try: - if not os.path.exists(health_file): - raise FileNotFoundError(f"Cannot find file '{health_file}'") - - # distributed_nhc-pssh-2023-07-27_02-04-52.health.log - filename_parts = os.path.basename(health_file).split("-", maxsplit=2) - ts_str = filename_parts[2].split(".")[0] - ts = datetime.strptime(ts_str, "%Y-%m-%d_%H-%M-%S") - - job_name = filename_parts[1] - - if job_name == "pssh": - job_name = f"{job_name}-{ts_str}" - - with open(health_file, 'r') as f: - lines = f.readlines() - reader = DictReader(lines, fieldnames = ["Hostname", "RawResult"], delimiter='|', restkey="extra") - - df = pd.DataFrame(reader) - df['Timestamp'] = ts - df['JobName'] = job_name - df['NodeName'] = df.apply(lambda x: x['Hostname'].strip(), axis=1) - df['RawResult'] = df.apply(lambda x: x['RawResult'].strip(), axis=1) - df['Healthy'] = df.apply(lambda x: x['RawResult'] == "Healthy", axis=1) - df = df[['Timestamp', 'JobName', 'Hostname', 'Healthy', 'RawResult']] - - creds = ManagedIdentityCredential( - client_id = 
"16b52144-5ca5-4c25-aac5-0d3b7a4cb36d" - ) - - ingest_url = "https://ingest-aistresstests.centralus.kusto.windows.net" - database = "sat13c04_stress_testdb" - table_name = "NodeHealthCheck" - - ingest_client = QueuedIngestClient(KustoConnectionStringBuilder.with_azure_token_credential(ingest_url, creds)) - print(f"Ingesting health results from {os.path.basename(health_file)} into {ingest_url} at {database}/{table_name}") - ingest_client.ingest_from_dataframe(df, IngestionProperties(database, table_name)) - except FileNotFoundError: - if len(health_files) == 1: - raise - print("Cannot find file '{health_file}', skipping...") \ No newline at end of file diff --git a/export_nhc_result_to_kusto.py b/export_nhc_result_to_kusto.py new file mode 100644 index 0000000..db94757 --- /dev/null +++ b/export_nhc_result_to_kusto.py @@ -0,0 +1,96 @@ +#!/usr/bin/python3 +import sys +import os +from datetime import datetime +from csv import DictReader +from argparse import ArgumentParser +from azure.identity import ManagedIdentityCredential +from azure.kusto.data import KustoConnectionStringBuilder +from azure.kusto.ingest import QueuedIngestClient, IngestionProperties +import pandas as pd + +ingest_url = "https://ingest-aistresstests.centralus.kusto.windows.net" +database = "sat13c04_stress_testdb" +health_table_name = "NodeHealthCheck" +debug_table_name = "NodeHealthCheck_Debug" + +def ingest_health_log(health_file): + filename_parts = os.path.basename(health_file).split("-", maxsplit=2) + ts_str = filename_parts[2].split(".")[0] + ts = datetime.strptime(ts_str, "%Y-%m-%d_%H-%M-%S") + + job_name = filename_parts[1] + + if job_name == "pssh": + job_name = f"{job_name}-{ts_str}" + + with open(health_file, 'r') as f: + lines = f.readlines() + reader = DictReader(lines, fieldnames = ["Hostname", "RawResult"], delimiter='|', restkey="extra") + + df = pd.DataFrame(reader) + df['Timestamp'] = ts + df['JobName'] = job_name + df['NodeName'] = df.apply(lambda x: x['Hostname'].strip(), 
axis=1) + df['RawResult'] = df.apply(lambda x: x['RawResult'].strip(), axis=1) + df['Healthy'] = df.apply(lambda x: x['RawResult'] == "Healthy", axis=1) + df = df[['Timestamp', 'JobName', 'Hostname', 'Healthy', 'RawResult']] + + creds = ManagedIdentityCredential( + client_id = "16b52144-5ca5-4c25-aac5-0d3b7a4cb36d" + ) + + ingest_client = QueuedIngestClient(KustoConnectionStringBuilder.with_azure_token_credential(ingest_url, creds)) + print(f"Ingesting health results from {os.path.basename(health_file)} into {ingest_url} at {database}/{health_table_name}") + ingest_client.ingest_from_dataframe(df, IngestionProperties(database, health_table_name)) + +def ingest_debug_log(debug_file): + filename_parts = os.path.basename(debug_file).split("-", maxsplit=2) + ts_str = filename_parts[2].split(".")[0] + ts = datetime.strptime(ts_str, "%Y-%m-%d_%H-%M-%S") + + job_name = filename_parts[1] + + if job_name == "pssh": + job_name = f"{job_name}-{ts_str}" + + with open(health_file, 'r') as f: + lines = f.readlines() + reader = DictReader(lines, fieldnames = ["Hostname", "DebugLog"], delimiter='|', restkey="extra") + + df = pd.DataFrame(reader) + df['Timestamp'] = ts + df['JobName'] = job_name + df['NodeName'] = df.apply(lambda x: x['Hostname'].strip(), axis=1) + df['DebugLog'] = df.apply(lambda x: x['DebugLog'].strip(), axis=1) + df = df[['Timestamp', 'JobName', 'Hostname', 'DebugLog']] + + creds = ManagedIdentityCredential( + client_id = "16b52144-5ca5-4c25-aac5-0d3b7a4cb36d" + ) + + ingest_client = QueuedIngestClient(KustoConnectionStringBuilder.with_azure_token_credential(ingest_url, creds)) + print(f"Ingesting health results from {os.path.basename(debug_file)} into {ingest_url} at {database}/{debug_table_name}") + ingest_client.ingest_from_dataframe(df, IngestionProperties(database, debug_table_name)) + +health_files = sys.argv[1:] + +print(f"Attempting to ingest: {','.join(health_files)}") + +for health_file in health_files: + try: + if not os.path.exists(health_file): + 
raise FileNotFoundError(f"Cannot find file '{health_file}'") + + if health_file.endswith(".health.log"): + ingest_health_log(health_file) + elif health_file.endswith(".debug.log"): + ingest_debug_log(health_file) + else: + raise Exception("Unsuported file, must be .health.log or .debug.log produced by ./distributed_nhc.sb.sh") + + except FileNotFoundError: + if len(health_files) == 1: + print("Cannot find file '{health_file}'") + raise + print("Cannot find file '{health_file}', skipping...") \ No newline at end of file diff --git a/onetouch_nhc.sh b/onetouch_nhc.sh index b215923..490c318 100755 --- a/onetouch_nhc.sh +++ b/onetouch_nhc.sh @@ -205,6 +205,15 @@ echo run_health_checks $HEALTH_LOG_FILE_PATH $CUSTOM_CONF echo "=== Finished Running Health Checks ===" +echo +echo "=== Debug Dump ===" +debug=$(grep " DEBUG:" $HEALTH_LOG_FILE_PATH) +echo "$debug" | while read line +do + cleaned_line=$(echo "$line" | sed 's/^\[[0-9]*\] - DEBUG: //') + echo "NHC-DEBUG $(hostname) | $cleaned_line"; +done + echo echo "=== Overall Results ($HEALTH_LOG_FILE_PATH) ===" cat $HEALTH_LOG_FILE_PATH @@ -212,8 +221,13 @@ cat $HEALTH_LOG_FILE_PATH echo echo "=== Detected Errors (if any) ===" errors=$(grep "ERROR" $HEALTH_LOG_FILE_PATH) + if [ -n "$errors" ]; then - echo $errors | while read line; do echo "NHC-RESULT $(hostname) | $line"; done + echo "$errors" | while read line + do + cleaned_line=$(echo "$line" | sed 's/^\[[0-9]*\] - //') + echo "NHC-RESULT $(hostname) | $cleaned_line"; + done else echo "NHC-RESULT $(hostname) | Healthy" fi \ No newline at end of file From 9f39581284a4aebadf49249d357c24cbc1087488 Mon Sep 17 00:00:00 2001 From: Michael Williams Date: Fri, 28 Jul 2023 19:18:57 -0700 Subject: [PATCH 31/58] fix pip install --- distributed_nhc.sb.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/distributed_nhc.sb.sh b/distributed_nhc.sb.sh index 858160e..78c1bc2 100755 --- a/distributed_nhc.sb.sh +++ b/distributed_nhc.sb.sh @@ -220,7 +220,7 @@ echo "NHC 
took $nhc_duration minutes to finish" echo echo "Exporting results to Kusto" requirements_file=$(realpath -e "./requirements.txt") -$(pip install -r $requirements_file) +pip install -r $requirements_file kusto_export_script=$(realpath -e "./export_nhc_result_to_kusto.py") echo "Using export script $kusto_export_script" python3 $kusto_export_script $HEALTH_LOG_FILE_PATH $DEBUG_LOG_FILE_PATH From 761399e0b0d89730d8c391c8b7213f26280eee3c Mon Sep 17 00:00:00 2001 From: Michael Williams Date: Wed, 2 Aug 2023 17:45:51 -0700 Subject: [PATCH 32/58] add pinned vbios check --- conf/nd96isr_h100_v5.conf | 1 + customTests/azure_gpu_vbios.nhc | 14 ++++++++++++++ 2 files changed, 15 insertions(+) create mode 100644 customTests/azure_gpu_vbios.nhc diff --git a/conf/nd96isr_h100_v5.conf b/conf/nd96isr_h100_v5.conf index 9037ca3..745d16b 100644 --- a/conf/nd96isr_h100_v5.conf +++ b/conf/nd96isr_h100_v5.conf @@ -35,6 +35,7 @@ * || check_hw_eth ib6 * || check_hw_eth ib7 * || check_hw_eth docker0 + * || check_vbios_version 96.00.74.00.01 * || check_hw_topology /opt/microsoft/ndv5-topo.xml ####################################################################### diff --git a/customTests/azure_gpu_vbios.nhc b/customTests/azure_gpu_vbios.nhc new file mode 100644 index 0000000..b0d9e17 --- /dev/null +++ b/customTests/azure_gpu_vbios.nhc @@ -0,0 +1,14 @@ +#!/bin/bash + +function check_vbios_version() { + expected_version="$1" + uniq_vbios_versions=$(nvidia-smi -q | grep "VBIOS Version" | cut -d ':' -f 2 | sed 's/ //g' | uniq) + + if [ ${#uniq_vbios_versions[@]} -ne 1 ]; then + die 1 "$FUNCNAME: More than 1 VBIOS version found on GPUs! Found '${uniq_vbios_versions[@]}' but expected just '$expected_version'" + elif ! 
echo "${uniq_vbios_versions[@]}" | grep -qw "$expected_version"; then + die 1 "$FUNCNAME: GPU VBIOS version does not match the expected '$expected_version', instead got '${uniq_vbios_versions[@]}'" + fi +} + +check_vbios_version $1 \ No newline at end of file From e0b0db9a6e75a200f7fcf453161a2d0956f66e73 Mon Sep 17 00:00:00 2001 From: Michael Williams Date: Wed, 2 Aug 2023 17:56:54 -0700 Subject: [PATCH 33/58] fixes --- customTests/azure_gpu_vbios.nhc | 2 +- distributed_nhc.sb.sh | 2 +- export_nhc_result_to_kusto.py | 4 ++-- 3 files changed, 4 insertions(+), 4 deletions(-) diff --git a/customTests/azure_gpu_vbios.nhc b/customTests/azure_gpu_vbios.nhc index b0d9e17..d20e41e 100644 --- a/customTests/azure_gpu_vbios.nhc +++ b/customTests/azure_gpu_vbios.nhc @@ -11,4 +11,4 @@ function check_vbios_version() { fi } -check_vbios_version $1 \ No newline at end of file +check_vbios_version "$1" \ No newline at end of file diff --git a/distributed_nhc.sb.sh b/distributed_nhc.sb.sh index 78c1bc2..ac11df0 100755 --- a/distributed_nhc.sb.sh +++ b/distributed_nhc.sb.sh @@ -220,7 +220,7 @@ echo "NHC took $nhc_duration minutes to finish" echo echo "Exporting results to Kusto" requirements_file=$(realpath -e "./requirements.txt") -pip install -r $requirements_file +pip install -r $requirements_file > /dev/null 2>&1 kusto_export_script=$(realpath -e "./export_nhc_result_to_kusto.py") echo "Using export script $kusto_export_script" python3 $kusto_export_script $HEALTH_LOG_FILE_PATH $DEBUG_LOG_FILE_PATH diff --git a/export_nhc_result_to_kusto.py b/export_nhc_result_to_kusto.py index db94757..f168813 100644 --- a/export_nhc_result_to_kusto.py +++ b/export_nhc_result_to_kusto.py @@ -91,6 +91,6 @@ def ingest_debug_log(debug_file): except FileNotFoundError: if len(health_files) == 1: - print("Cannot find file '{health_file}'") + print(f"Cannot find file '{health_file}'") raise - print("Cannot find file '{health_file}', skipping...") \ No newline at end of file + print(f"Cannot find file 
'{health_file}', skipping...") \ No newline at end of file From e6638878a50a7586fdaf2629d29a701418762274 Mon Sep 17 00:00:00 2001 From: Michael Williams Date: Wed, 2 Aug 2023 18:02:35 -0700 Subject: [PATCH 34/58] try fix, don't call on sourcing :) --- customTests/azure_gpu_vbios.nhc | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/customTests/azure_gpu_vbios.nhc b/customTests/azure_gpu_vbios.nhc index d20e41e..e871c23 100644 --- a/customTests/azure_gpu_vbios.nhc +++ b/customTests/azure_gpu_vbios.nhc @@ -1,5 +1,4 @@ #!/bin/bash - function check_vbios_version() { expected_version="$1" uniq_vbios_versions=$(nvidia-smi -q | grep "VBIOS Version" | cut -d ':' -f 2 | sed 's/ //g' | uniq) @@ -9,6 +8,4 @@ function check_vbios_version() { elif ! echo "${uniq_vbios_versions[@]}" | grep -qw "$expected_version"; then die 1 "$FUNCNAME: GPU VBIOS version does not match the expected '$expected_version', instead got '${uniq_vbios_versions[@]}'" fi -} - -check_vbios_version "$1" \ No newline at end of file +} \ No newline at end of file From a8e5ca82c212e709a18f23d9135f261bce55b799 Mon Sep 17 00:00:00 2001 From: Michael Williams Date: Wed, 9 Aug 2023 16:57:25 -0700 Subject: [PATCH 35/58] adjust directory structure --- distributed_nhc/README.md | 115 ++++++++++++++++++ .../distributed_nhc.sb.sh | 0 .../export_nhc_result_to_kusto.py | 0 .../onetouch_nhc.sh | 2 +- .../requirements.txt | 0 5 files changed, 116 insertions(+), 1 deletion(-) create mode 100644 distributed_nhc/README.md rename distributed_nhc.sb.sh => distributed_nhc/distributed_nhc.sb.sh (100%) mode change 100755 => 100644 rename export_nhc_result_to_kusto.py => distributed_nhc/export_nhc_result_to_kusto.py (100%) rename onetouch_nhc.sh => distributed_nhc/onetouch_nhc.sh (97%) mode change 100755 => 100644 rename requirements.txt => distributed_nhc/requirements.txt (100%) diff --git a/distributed_nhc/README.md b/distributed_nhc/README.md new file mode 100644 index 0000000..6d3a777 --- /dev/null +++ 
b/distributed_nhc/README.md @@ -0,0 +1,115 @@ +# Running Distributed Node Health Check +We’ll be using a version of Azure NHC called Distributed NHC throughout the thermal test. +The Node Health Check runs a variety of single node validation tests to validate the health of a given node. +These tests validate the presence and status of hardware on the node as well as a variety of GPU and IB performance tests to ensure expected bandwidths and throughputs are met. + +## Acquiring Distributed NHC +Distributed NHC is built on a fork of Azure NHC and can be found at +mpwillia/azurehpc-health-checks: Health checks for Azure N- and H-series VMs. (github.com) +Setup +1. Ensure a shared volume is mounted named /work +2. Clone the repository + git clone https://github.com/mpwillia/azurehpc-health-checks /work/distributed_nhc +3. Create the logs directory + mkdir /work/distributed_nhc/logs + +## Running Distributed NHC +Distributed NHC is run entirely with the distributed_nhc.sb.sh script. +This script supports two modes of execution, as a slurm sbatch or invoked directly which uses parallel-ssh. +Please note that NHC takes a few minutes (~5-8 minutes) to run and NHC itself produces very limited output until the results of the health check. +## Known Issues +Distributed NHC can fail when running on large sets of nodes (>300 nodes) as a single slurm job. +The failure will manifest as an abnormally quick completion of NHC and every host failing the health check with the error “No results reported” +These failures have only been observed empirically on node sets of ~330 nodes while successfully being run on node sets of ~270 nodes. + +A mitigation for this issue is to simply enqueue multiple distributed NHC jobs on smaller subsets of nodes. +For example, to test on a set of 330 nodes, running 3 jobs with 110 each is a suitable workaround. +If you enqueue multiple distributed NHC jobs, do note that each job will produce its own .health.log report at the end.
+ +## Slurm +Running with slurm is the typical approach to running distributed NHC. +When doing so, all sbatch arguments to target specific sets of nodes are supported. +Running with slurm is the suggested approach when running on many nodes. + +### Slurm Execution Examples +sbatch -w monument-[001,003-007] ./distributed_nhc.sb.sh +sbatch -F mynodelist.txt ./distributed_nhc.sb.sh +sbatch -N35 --partition=gspe ./distributed_nhc.sb.sh + +### Slurm Logs +All logs will end up in the logs directory at /work/distributed_nhc/logs provided the setup instructions have been followed. +There are three log files produced. +1. distributed_nhc-{jobid}.out is the active standard output. You can tail -f this file to see activity. +2. distributed_nhc-{jobid}.err is the active standard error. +3. distributed_nhc-{jobid}-{timestamp}.health.log is the results of the health check. This file will only appear upon completion of the health check. + See the Interpreting NHC Results section below for details on the health.log files. + +## Parallel-SSH +Running with parallel-ssh by invoking distributed_nhc.sb.sh directly is the preferred approach to test drained nodes or nodes otherwise unreachable by slurm, as well as spot checking smaller sets of nodes. +When doing so, the node lists can be specified like sbatch using either -w or -F. +The script itself also supports the --help argument. + +### Parallel-SSH Execution Examples +./distributed_nhc.sb.sh -w monument-[001,003-007,12,018-023] +./distributed_nhc.sb.sh -F mynodelist.txt + +### Parallel-SSH Logs +All logs will end up in the logs directory at /work/distributed_nhc/logs provided the setup instructions have been followed. +There are three log files produced. +1. distributed_nhc-pssh-{timestamp}.out is the active standard output. You can tail -f this file to see activity. +2. distributed_nhc-pssh-{timestamp}.err is the active standard error. +3. distributed_nhc-pssh-{timestamp}.health.log is the results of the health check.
This file will only appear upon completion of the health check. + See the Interpreting NHC Results section below. + +## Customizing Test Set +You may want to rerun a specific test or subset of tests on problematic nodes rather than rerunning the entire health check. +To do so, make a copy of /work/distributed_nhc/conf/nd96isr_h100_v5.conf +cp /work/distributed_nhc/conf/nd96isr_h100_v5.conf /work/distributed_nhc/mytests.conf +In this file you can comment out any test with # at the beginning of the line. +To execute the customized conf file you must specify it as an argument to distributed_nhc.sb.sh with the -c argument. +Modifying and saving the conf file /work/distributed_nhc/conf/nd96isr_h100_v5.conf will not work unless you explicitly pass it as an argument with -c + +For example: +sbatch -w monument-[001,003-007] ./distributed_nhc.sb.sh -c ./mytests.conf +./distributed_nhc.sb.sh -w monument-[001,003-007,12,018-023] -c ./mytests.conf +./distributed_nhc.sb.sh -w monument-[001,003-007,12,018-023] -c /work/distributed_nhc/conf/nd96isr_h100_v5.conf + +Please note, if you modify the .conf file to run a limited set of tests, a node that reports Healthy only means it has passed that specific subset of tests. +The node could still be unhealthy and fail a test you have not run. Always verify a node is healthy by running the entire suite of tests (simply by not specifying -c) + +## Interpreting NHC Results +The resulting health log file shows the health report for every node tested against.
+The health logs appear in the /work/distributed_nhc/logs directory with the extension .health.log + +### Health Results +The health results per node found in the .health.log files are always formatted as {hostname} | {health result} + * A healthy node reports + {hostname} | Healthy + + * A node that fails NHC reports + {hostname} | ERROR: nhc: Health check failed: {details of the failure} + + * A node that failed for any reason other than NHC reports + {hostname} | ERROR: No results reported + + The “No results reported” error is the catch-all for non-nhc errors meaning that no NHC results were reported by the node. + This could mean a variety of things, but some common reasons may be: + * The node is unreachable, check to see if you can ssh to the node. + * The script onetouch_nhc.sh is unreachable by the node + * Check to see if the shared volume /work is mounted. + * Every node should have visibility into the /work/distributed_nhc directory. + * Transient file handle errors, retry on that node. + * Az NHC failed to download or install on the node. + * Every node leaves a report of their own execution of NHC in the ~/onetouch_nhc/working directory. + * Check the most recent .out and .err file to debug issues like this. + +### Example Logs from Real Tests +Below is a sample of real outputs from .health.log files showing a variety of results. +monument-042 | Healthy +monument-066 | ERROR: nhc: Health check failed: check_gpu_xid: GPU Xid errors detected: [ 3606.832215] NVRM: Xid (PCI:0002:00:00): 119, pid=67769, name=nvidia-smi, Timeout waiting for RPC from GSP1! Expected function 76 (GSP_RM_CONTROL) (0x2080014b 0x5). +monument-079 | ERROR: nhc: Health check failed: check_nccl_allreduce_ib_loopback: NCCL allreduce, BUS BW (expected >=40.0 GB/s, but measured 20.7377 GB/s) +monument-084 | ERROR: nhc: Health check failed: Bandwidth is low on device 1. Reported bandwidth is 7 GB/s. 
+monument-108 | ERROR: No results reported +monument-123 | ERROR: nhc: Health check failed: Bandwidth is low on device 3. Reported bandwidth is 27 GB/s. +monument-273 | ERROR: nhc: Health check failed: check_hw_ib: No IB port mlx5_ib7:1 is ACTIVE (LinkUp 400 Gb/sec). +monument-510 | ERROR: nhc: Health check failed: check_gpu_ecc: GPU id 3: SRAM Uncorrectable ECC error count detected, (0,1) diff --git a/distributed_nhc.sb.sh b/distributed_nhc/distributed_nhc.sb.sh old mode 100755 new mode 100644 similarity index 100% rename from distributed_nhc.sb.sh rename to distributed_nhc/distributed_nhc.sb.sh diff --git a/export_nhc_result_to_kusto.py b/distributed_nhc/export_nhc_result_to_kusto.py similarity index 100% rename from export_nhc_result_to_kusto.py rename to distributed_nhc/export_nhc_result_to_kusto.py diff --git a/onetouch_nhc.sh b/distributed_nhc/onetouch_nhc.sh old mode 100755 new mode 100644 similarity index 97% rename from onetouch_nhc.sh rename to distributed_nhc/onetouch_nhc.sh index 490c318..52522b1 --- a/onetouch_nhc.sh +++ b/distributed_nhc/onetouch_nhc.sh @@ -118,7 +118,7 @@ install_nhc() { NHC_INSTALLED=false fi - if $NHC_INSTALLED && [[ $( diff --brief ./customTests /etc/nhc/scripts --exclude=lbnl_*.nhc --exclude=common.nhc | grep ".nhc" ) ]]; then + if $NHC_INSTALLED && [[ $( diff --brief ../customTests /etc/nhc/scripts --exclude=lbnl_*.nhc --exclude=common.nhc | grep ".nhc" ) ]]; then echo "Custom tests differ, reinstalling" NHC_INSTALLED=false fi diff --git a/requirements.txt b/distributed_nhc/requirements.txt similarity index 100% rename from requirements.txt rename to distributed_nhc/requirements.txt From 66e38b844ce0fb189762fa9e7f873b50985d016f Mon Sep 17 00:00:00 2001 From: Michael Williams Date: Fri, 11 Aug 2023 13:59:11 -0700 Subject: [PATCH 36/58] make executable again --- distributed_nhc/distributed_nhc.sb.sh | 0 distributed_nhc/onetouch_nhc.sh | 0 2 files changed, 0 insertions(+), 0 deletions(-) mode change 100644 => 100755 
distributed_nhc/distributed_nhc.sb.sh mode change 100644 => 100755 distributed_nhc/onetouch_nhc.sh diff --git a/distributed_nhc/distributed_nhc.sb.sh b/distributed_nhc/distributed_nhc.sb.sh old mode 100644 new mode 100755 diff --git a/distributed_nhc/onetouch_nhc.sh b/distributed_nhc/onetouch_nhc.sh old mode 100644 new mode 100755 From 97d24e14cce7f352a0d457141213a12e5e051ea6 Mon Sep 17 00:00:00 2001 From: Michael Williams Date: Fri, 11 Aug 2023 14:09:39 -0700 Subject: [PATCH 37/58] update confs, verified on nd96amsr_a100_v4 --- conf/nd96amsr_a100_v4.conf | 2 ++ conf/nd96asr_v4.conf | 2 ++ 2 files changed, 4 insertions(+) diff --git a/conf/nd96amsr_a100_v4.conf b/conf/nd96amsr_a100_v4.conf index 679b951..8354847 100644 --- a/conf/nd96amsr_a100_v4.conf +++ b/conf/nd96amsr_a100_v4.conf @@ -43,12 +43,14 @@ * || check_hw_eth ib6 * || check_hw_eth docker0 * || check_hw_eth ib0 + * || check_hw_topology /opt/microsoft/ndv4-topo.xml ####################################################################### ##### ##### GPU checks ##### + * || check_gpu_count 8 * || check_gpu_xid * || check_nvsmi_healthmon * || check_cuda_bw 24 diff --git a/conf/nd96asr_v4.conf b/conf/nd96asr_v4.conf index b786510..1441533 100644 --- a/conf/nd96asr_v4.conf +++ b/conf/nd96asr_v4.conf @@ -43,12 +43,14 @@ * || check_hw_eth ib2 * || check_hw_eth eth0 * || check_hw_eth ib1 + * || check_hw_topology /opt/microsoft/ndv4-topo.xml ######################################################## #### #### GPU checks #### + * || check_gpu_count 8 * || check_gpu_xid * || check_nvsmi_healthmon * || check_cuda_bw 24 From 4016d24b524d0f9e187f5d41c2a7281ee4f52d51 Mon Sep 17 00:00:00 2001 From: Michael Williams Date: Fri, 11 Aug 2023 14:56:39 -0700 Subject: [PATCH 38/58] add proper options to run-health-checks.sh --- run-health-checks.sh | 153 ++++++++++++++++++++++++++++++++----------- 1 file changed, 114 insertions(+), 39 deletions(-) diff --git a/run-health-checks.sh b/run-health-checks.sh index eefa961..522fd0b 
100755 --- a/run-health-checks.sh +++ b/run-health-checks.sh @@ -1,45 +1,120 @@ #!/bin/bash -SKU=$( curl -H Metadata:true --max-time 10 -s "http://169.254.169.254/metadata/instance/compute/vmSize?api-version=2021-01-01&format=text") -echo "Running health checks for $SKU SKU..." - -SKU="${SKU,,}" -if echo "$SKU" | grep -q "nd96asr_v4"; then - conf_name="nd96asr_v4" -elif echo "$SKU" | grep -q "nd96amsr_a100_v4"; then - conf_name="nd96amsr_a100_v4" -elif echo "$SKU" | grep -q "nd96isr_h100_v5"; then - conf_name="nd96isr_h100_v5" -elif echo "$SKU" | grep -q "hb120rs_v2"; then - conf_name="hb120rs_v2" -elif echo "$SKU" | grep -q "hb120rs_v3"; then - conf_name="hb120rs_v3" -elif echo "$SKU" | grep -q "hb176rs_v4"; then - conf_name="hb176rs_v4" -elif echo "$SKU" | grep -q "hb176-144rs_v4"; then - conf_name="hb176-144rs_v4" -elif echo "$SKU" | grep -q "hb176-96rs_v4"; then - conf_name="hb176-96rs_v4" -elif echo "$SKU" | grep -q "hb176-48rs_v4"; then - conf_name="hb176-48rs_v4" -elif echo "$SKU" | grep -q "hb176-24rs_v4"; then - conf_name="hb176-24rs_v4" -elif echo "$SKU" | grep -q "hx176rs"; then - conf_name="hx176rs" -elif echo "$SKU" | grep -q "hx176-144rs"; then - conf_name="hx176-144rs" -elif echo "$SKU" | grep -q "hx176-96rs"; then - conf_name="hx176-96rs" -elif echo "$SKU" | grep -q "hx176-48rs"; then - conf_name="hx176-48rs" -elif echo "$SKU" | grep -q "hx176-24rs"; then - conf_name="hx176-24rs" -else - echo "SKU health check currently not implemented" +print_help() { +cat << EOF + +Usage: ./run-health-checks.sh [-h|--help] [-c|--config ] [-o|--output ] [-v|--verbose] +Run health checks on the current VM. + +-h, -help, --help Display this help +-c, -config, --config Optional path to a custom NHC config file. + If not specified the current VM SKU will be detected and the appropriate conf file will be used. + +-o, -output, --output Optional path to output the health check logs to. All directories in the path must exist. 
+ If not specified it will use output to ./health.log + +-t, -timeout, --timeout Optional timeout in seconds for each health check. If not specified it will default to 500 seconds. + +-v, -verbose, --verbose If set, enables verbose and debug outputs. + +EOF +} + +CUSTOM_CONF="" +OUTPUT_PATH="./health.log" +TIMEOUT=500 +VERBOSE=false + +options=$(getopt -l "help,config:,output:,timeout:,verbose" -o "hc:o:t:v" -a -- "$@") + +if [ $? -ne 0 ]; then + print_help exit 1 fi -log_path="${1:-./health.log}" -log_path=$(realpath "$log_path") +eval set -- "$options" +while true +do +case "$1" in +-h|--help) + print_help + exit 0 + ;; +-c|--config) + shift + CUSTOM_CONF="$(realpath -e ${1//\~/$HOME})" + ;; +-o|--output) + shift + OUTPUT_PATH="$(realpath -m ${1//\~/$HOME})" + ;; +-t|--timeout) + shift + TIMEOUT="$1" + ;; +-v|--verbose) + VERBOSE=true + ;; +--) + shift + break;; +esac +shift +done + +# If a custom configuration isn't specified, detect the VM SKU and use the appropriate conf file +if [ -z "$CUSTOM_CONF" ]; then + echo "No custom conf file specified, detecting VM SKU..." + + SKU=$( curl -H Metadata:true --max-time 10 -s "http://169.254.169.254/metadata/instance/compute/vmSize?api-version=2021-01-01&format=text") + echo "Running health checks for $SKU SKU..." 
+ + SKU="${SKU,,}" + if echo "$SKU" | grep -q "nd96asr_v4"; then + conf_name="nd96asr_v4" + elif echo "$SKU" | grep -q "nd96amsr_a100_v4"; then + conf_name="nd96amsr_a100_v4" + elif echo "$SKU" | grep -q "nd96isr_h100_v5"; then + conf_name="nd96isr_h100_v5" + elif echo "$SKU" | grep -q "hb120rs_v2"; then + conf_name="hb120rs_v2" + elif echo "$SKU" | grep -q "hb120rs_v3"; then + conf_name="hb120rs_v3" + elif echo "$SKU" | grep -q "hb176rs_v4"; then + conf_name="hb176rs_v4" + elif echo "$SKU" | grep -q "hb176-144rs_v4"; then + conf_name="hb176-144rs_v4" + elif echo "$SKU" | grep -q "hb176-96rs_v4"; then + conf_name="hb176-96rs_v4" + elif echo "$SKU" | grep -q "hb176-48rs_v4"; then + conf_name="hb176-48rs_v4" + elif echo "$SKU" | grep -q "hb176-24rs_v4"; then + conf_name="hb176-24rs_v4" + elif echo "$SKU" | grep -q "hx176rs"; then + conf_name="hx176rs" + elif echo "$SKU" | grep -q "hx176-144rs"; then + conf_name="hx176-144rs" + elif echo "$SKU" | grep -q "hx176-96rs"; then + conf_name="hx176-96rs" + elif echo "$SKU" | grep -q "hx176-48rs"; then + conf_name="hx176-48rs" + elif echo "$SKU" | grep -q "hx176-24rs"; then + conf_name="hx176-24rs" + else + echo "Health checks not supported for SKU $SKU" + exit 1 + fi + + CUSTOM_CONF="$(dirname "${BASH_SOURCE[0]}")/conf/$conf_name.conf" +fi + +OUTPUT_PATH=$(realpath -m "$OUTPUT_PATH") + +nhc_args=() +if [ "$VERBOSE" = true ]; then + nhc_args+=("-v") + nhc_args+=("-d") +fi -nhc -d -v CONFFILE=$(dirname "${BASH_SOURCE[0]}")/conf/$conf_name.conf LOGFILE=$log_path TIMEOUT=500 +echo "Running health checks using $CUSTOM_CONF and outputting to $OUTPUT_PATH" +nhc ${nhc_args[@]} CONFFILE=$CUSTOM_CONF LOGFILE=$OUTPUT_PATH TIMEOUT=$TIMEOUT From 48c9378179ce9aa0150eb708f69c1f0d6e554e97 Mon Sep 17 00:00:00 2001 From: Michael Williams Date: Fri, 11 Aug 2023 15:09:59 -0700 Subject: [PATCH 39/58] debugging bad custom test copying --- customTests/custom-test-setup.sh | 1 + 1 file changed, 1 insertion(+) diff --git 
a/customTests/custom-test-setup.sh b/customTests/custom-test-setup.sh index e1c80a5..5a4ece2 100755 --- a/customTests/custom-test-setup.sh +++ b/customTests/custom-test-setup.sh @@ -124,6 +124,7 @@ else fi # copy all custom test to the nhc scripts dir +echo "Copying *.nhc from $SRC_DIR to /etc/nhc/scripts" cp $SRC_DIR/*.nhc /etc/nhc/scripts exit 0 From 82700d37068b441915957d0e11fd73b92ff526f3 Mon Sep 17 00:00:00 2001 From: Michael Williams Date: Fri, 11 Aug 2023 15:28:18 -0700 Subject: [PATCH 40/58] pipe wget straight into tar so we don't have the .tar.gz on the filesystem and run into all kinds of idempotency issues --- customTests/custom-test-setup.sh | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) diff --git a/customTests/custom-test-setup.sh b/customTests/custom-test-setup.sh index 5a4ece2..124589f 100755 --- a/customTests/custom-test-setup.sh +++ b/customTests/custom-test-setup.sh @@ -40,9 +40,12 @@ function install_perf_test(){ fi pushd ${EXE_DIR} - wget https://github.com/linux-rdma/perftest/releases/download/v${VERSION}/perftest-${VERSION}.${VERSION_HASH}.tar.gz - tar xvf perftest-${VERSION}.${VERSION_HASH}.tar.gz - pushd perftest-4.5 + perftest_dir="perftest-${VERSION}" + mkdir -p ${EXE_DIR}/${perftest_dir} + archive_url="https://github.com/linux-rdma/perftest/releases/download/v${VERSION}/perftest-${VERSION}.${VERSION_HASH}.tar.gz" + wget -q -O - $archive_url | tar -xz --strip=1 -C ${EXE_DIR}/${perftest_dir} + + pushd ${perftest_dir} if [[ "$type" == "cuda" ]]; then ./configure CUDA_H_PATH=/usr/local/cuda/include/cuda.h else @@ -51,10 +54,8 @@ function install_perf_test(){ fi make - rm ${EXE_DIR}/perftest-${VERSION}.${VERSION_HASH}.tar.gz popd popd - } From 04bd05b3d623aed4bb23a718c059ceb404f16eeb Mon Sep 17 00:00:00 2001 From: Michael Williams Date: Fri, 11 Aug 2023 15:46:21 -0700 Subject: [PATCH 41/58] try fix --- run-health-checks.sh | 1 + 1 file changed, 1 insertion(+) diff --git a/run-health-checks.sh b/run-health-checks.sh index 
522fd0b..f4ef1ec 100755 --- a/run-health-checks.sh +++ b/run-health-checks.sh @@ -108,6 +108,7 @@ if [ -z "$CUSTOM_CONF" ]; then CUSTOM_CONF="$(dirname "${BASH_SOURCE[0]}")/conf/$conf_name.conf" fi +CUSTOM_CONF=$(realpath -e "$CUSTOM_CONF") OUTPUT_PATH=$(realpath -m "$OUTPUT_PATH") nhc_args=() From 6521b160edde2c59bbffa4602afa43921b0900c7 Mon Sep 17 00:00:00 2001 From: Michael Williams Date: Fri, 11 Aug 2023 15:48:32 -0700 Subject: [PATCH 42/58] standardize nhc args --- distributed_nhc/onetouch_nhc.sh | 26 +++++++++++++++----------- 1 file changed, 15 insertions(+), 11 deletions(-) diff --git a/distributed_nhc/onetouch_nhc.sh b/distributed_nhc/onetouch_nhc.sh index 52522b1..e739f28 100755 --- a/distributed_nhc/onetouch_nhc.sh +++ b/distributed_nhc/onetouch_nhc.sh @@ -118,7 +118,7 @@ install_nhc() { NHC_INSTALLED=false fi - if $NHC_INSTALLED && [[ $( diff --brief ../customTests /etc/nhc/scripts --exclude=lbnl_*.nhc --exclude=common.nhc | grep ".nhc" ) ]]; then + if $NHC_INSTALLED && [[ $( diff --brief ./customTests /etc/nhc/scripts --exclude=lbnl_*.nhc --exclude=common.nhc | grep ".nhc" ) ]]; then echo "Custom tests differ, reinstalling" NHC_INSTALLED=false fi @@ -181,18 +181,22 @@ run_health_checks() { log_file_path=$(realpath -m "$log_file_path") - if [ -z $custom_conf ]; then - # if no custom config is provided, let run-health-checks.sh auto-detect - echo "The health check has been started, it will typically take a few minutes to complete" - sudo $RUN_HEALTH_CHECKS_SCRIPT_PATH $log_file_path - else - # otherwise, run it ourselves - custom_conf=$(realpath "$custom_conf") - echo "Running health checks using $custom_conf" - echo "The health check has been started, it will typically take a few minutes to complete" - sudo nhc -d -v CONFFILE=$custom_conf LOGFILE=$log_file_path TIMEOUT=500 + nhc_args=() + if [ -n "$custom_conf" ]; then + echo "Custom config provided, using $custom_conf" + custom_conf=$(realpath -e "$custom_conf") + nhc_args+=("-c" "$custom_conf") + fi + 
+ if [ "$VERBOSE" = true ]; then + echo "Verbose mode enabled" + nhc_args+=("-v") fi + # if no custom config is provided, let run-health-checks.sh auto-detect + echo sudo $RUN_HEALTH_CHECKS_SCRIPT_PATH ${nhc_args[@]} -o $log_file_path + echo "The health check has been started, it will typically take a few minutes to complete" + sudo $RUN_HEALTH_CHECKS_SCRIPT_PATH ${nhc_args[@]} -o $log_file_path } # Download AZ NHC From 163d9024921457bececbd1a084f74f25d62b334a Mon Sep 17 00:00:00 2001 From: Michael Williams Date: Fri, 11 Aug 2023 15:59:19 -0700 Subject: [PATCH 43/58] fix -V verbose flag here --- distributed_nhc/onetouch_nhc.sh | 39 +++++++++++++++++++++------------ 1 file changed, 25 insertions(+), 14 deletions(-) diff --git a/distributed_nhc/onetouch_nhc.sh b/distributed_nhc/onetouch_nhc.sh index e739f28..4d9b1ee 100755 --- a/distributed_nhc/onetouch_nhc.sh +++ b/distributed_nhc/onetouch_nhc.sh @@ -25,6 +25,8 @@ Runs OneTouch Azure NHC which downloads a specific version of Azure NHC, install If not specified the job name will be generated with "\$(hostname)-\$(date +"%Y-%m-%d_%H-%M-%S")". -f, -force, --force If set, forces the script the redownload and reinstall everything + +-V, -verbose, --verbose If set, enables verbose mode which will output all debug logs to stdout and the health file EOF } @@ -36,9 +38,10 @@ OUTPUT_DIR=$WORKING_DIR JOB_NAME="$(hostname)-$(date --utc +"%Y-%m-%d_%H-%M-%S")" CUSTOM_CONF="" FORCE=false +VERBOSE=false # Parse out arguments -options=$(getopt -l "help,version:,config:,working:,output:,name:,force,git:" -o "hv:c:w:o:n:fg:" -a -- "$@") +options=$(getopt -l "help,version:,config:,working:,output:,name:,force,git:,verbose:" -o "hv:c:w:o:n:fg:V" -a -- "$@") if [ $? 
-ne 0 ]; then print_help @@ -80,6 +83,9 @@ case "$1" in -f|--force) FORCE=true ;; +-V|--verbose) + VERBOSE=true + ;; --) shift break;; @@ -207,23 +213,28 @@ echo "=== Finished Setting up AZ NHC ===" # Execute Health Checks echo run_health_checks $HEALTH_LOG_FILE_PATH $CUSTOM_CONF -echo "=== Finished Running Health Checks ===" -echo -echo "=== Debug Dump ===" -debug=$(grep " DEBUG:" $HEALTH_LOG_FILE_PATH) -echo "$debug" | while read line -do - cleaned_line=$(echo "$line" | sed 's/^\[[0-9]*\] - DEBUG: //') - echo "NHC-DEBUG $(hostname) | $cleaned_line"; -done +if [ ! -f $HEALTH_LOG_FILE_PATH ]; then + echo "Failed to run health checks, no log file was generated at $HEALTH_LOG_FILE_PATH" + echo "NHC-RESULT $(hostname) | Failed to run health checks, no log file was generated at $HEALTH_LOG_FILE_PATH"; + exit 1 +fi -echo -echo "=== Overall Results ($HEALTH_LOG_FILE_PATH) ===" -cat $HEALTH_LOG_FILE_PATH +echo "=== Finished Running Health Checks ===" + +if [ "$VERBOSE" = true ]; then + echo + echo "=== Debug Dump ===" + debug=$(grep " DEBUG:" $HEALTH_LOG_FILE_PATH) + echo "$debug" | while read line + do + cleaned_line=$(echo "$line" | sed 's/^\[[0-9]*\] - DEBUG: //') + echo "NHC-DEBUG $(hostname) | $cleaned_line"; + done +fi echo -echo "=== Detected Errors (if any) ===" +echo "=== Health Report ===" errors=$(grep "ERROR" $HEALTH_LOG_FILE_PATH) if [ -n "$errors" ]; then From 07a093d60161df467522efd22959fbee371fe21b Mon Sep 17 00:00:00 2001 From: Michael Williams Date: Fri, 11 Aug 2023 16:02:41 -0700 Subject: [PATCH 44/58] -V verbose for dnhc --- distributed_nhc/distributed_nhc.sb.sh | 12 +++++++++++- 1 file changed, 11 insertions(+), 1 deletion(-) diff --git a/distributed_nhc/distributed_nhc.sb.sh b/distributed_nhc/distributed_nhc.sb.sh index ac11df0..f3b6fd7 100755 --- a/distributed_nhc/distributed_nhc.sb.sh +++ b/distributed_nhc/distributed_nhc.sb.sh @@ -34,6 +34,8 @@ Example Usage: If not specified the current VM SKU will be detected and the appropriate conf file will 
be used. -f, -force, --force If set, forces the NHC script the redownload and reinstall everything + +-V, -verbose, --verbose If set, enables verbose mode which will output all detailed debug file to stdout and a .debug.log file next to the .health.log file EOF } @@ -86,9 +88,10 @@ else GIT_URL="" CUSTOM_CONF="" FORCE=false + VERBOSE=false # Parse out arguments - options=$(getopt -l "help,nodefile:,nodelist:,version:,config:,force,git" -o "hF:w:v:c:fg:" -a -- "$@") + options=$(getopt -l "help,nodefile:,nodelist:,version:,config:,force,git:,verbose" -o "hF:w:v:c:fg:V" -a -- "$@") if [ $? -ne 0 ]; then print_help exit 1 @@ -125,6 +128,9 @@ else -f|--force) FORCE=true ;; + -V|--verbose) + VERBOSE=true + ;; --) shift break;; @@ -185,6 +191,10 @@ else nhc_args+=("-f") fi + if $VERBOSE ; then + nhc_args+=("-V") + fi + echo "Running Parallel SSH Distributed NHC on:" echo "${NODELIST_ARR[@]}" | tr ' ' '\n' echo "======================" From deb097a64655248c8df0743f16ef50407c7a3540 Mon Sep 17 00:00:00 2001 From: Michael Williams Date: Fri, 11 Aug 2023 16:15:12 -0700 Subject: [PATCH 45/58] comment out vbios check --- conf/nd96isr_h100_v5.conf | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/conf/nd96isr_h100_v5.conf b/conf/nd96isr_h100_v5.conf index 745d16b..3d34a2a 100644 --- a/conf/nd96isr_h100_v5.conf +++ b/conf/nd96isr_h100_v5.conf @@ -35,7 +35,7 @@ * || check_hw_eth ib6 * || check_hw_eth ib7 * || check_hw_eth docker0 - * || check_vbios_version 96.00.74.00.01 +# * || check_vbios_version 96.00.74.00.01 * || check_hw_topology /opt/microsoft/ndv5-topo.xml ####################################################################### From b75fc75c44d368cd86ea913badf0872d460acd09 Mon Sep 17 00:00:00 2001 From: Michael Williams Date: Fri, 11 Aug 2023 16:38:54 -0700 Subject: [PATCH 46/58] don't create debug file if we don't have debug output --- distributed_nhc.sb.sh | 240 ++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 240 insertions(+) create mode 100644 
distributed_nhc.sb.sh diff --git a/distributed_nhc.sb.sh b/distributed_nhc.sb.sh new file mode 100644 index 0000000..677fa79 --- /dev/null +++ b/distributed_nhc.sb.sh @@ -0,0 +1,240 @@ +#!/bin/bash +#SBATCH --nodes=1 +#SBATCH --exclusive +#SBATCH --job-name distributed_nhc +#SBATCH --error="logs/%x-%j.err" +#SBATCH --output="logs/%x-%j.out" +#SBATCH --time 00:30:00 + +print_help() { +cat << EOF +Usage: ./distributed_nhc.sb.sh [-h|--help] [-F|--nodefile ] [-F|--nodefile ] +Run Azure NHC distributed onto the specified set of nodes and collects the results. Script can also be ran directly with sbatch. Running it as a shell script will use parallel-ssh + +Example Usage: + sbatch -N4 ./distributed_nhc.sb.sh + sbatch -N4 ./distributed_nhc.sb.sh -v + ./distributed_nhc.sb.sh -F ./my_node_file + ./distributed_nhc.sb.sh -w node1,node2,node3 + ./distributed_nhc.sb.sh -F ./my_node_file -w additonal_node1,additional_node2 + +-h, -help, --help Display this help + +-F --nodefile File contains a list of hostnames to connect to and run NHC on. Similar to slurm's sbatch -F/--nodefile argument + +-w --nodelist Comma seperate list of hostnames to connect to and run NHC on. Similar to slurm's sbatch -w/--nodelist argument but does not support ranges of hosts (eg host[1-5,7,...]). + If -F/--nodefile is provided, any nodes specified with -w/--nodelist will be added to the list of hostnames to run NHC on. This does not modify the provided -F/--nodefile file. + +-v, -version, --version Optional version of Az NHC to download from git, defaults to latest from "main" + Can be a branch name like "main" for the latest or a full commit hash for a specific version. + +-g, -git, --git Optional git url to download az nhc from. Defaults to "https://github.com/Azure/azurehpc-health-checks" + +-c, -config, --config Optional path to a custom NHC config file. + If not specified the current VM SKU will be detected and the appropriate conf file will be used. 
+ +-f, -force, --force If set, forces the NHC script the redownload and reinstall everything + +-V, -verbose, --verbose If set, enables verbose mode which will output all detailed debug file to stdout and a .debug.log file next to the .health.log file +EOF +} + +expand_nodelist() { + nodelist="$1" + # make nodelist bash "friendly" for expansion + # ie turn "aice-ndv5-iad21-[000170,000201-000203,000218-000220]" + # into "aice-ndv5-iad21-{000170,{000201..000203},{000218..000220}}" + # which bash can easily expand into + # aice-ndv5-iad21-000170 aice-ndv5-iad21-000201 aice-ndv5-iad21-000202 aice-ndv5-iad21-000203 aice-ndv5-iad21-000218 aice-ndv5-iad21-000219 aice-ndv5-iad21-000220 + + # converts "aice-ndv5-iad21-[000170,000201-000203,000218-000220]" + # into "aice-ndv5-iad21- [000170,000201-000203,000218-000220]" + # which we can then stick into an array. If we have 1 element, there were no ranges + # otherwise, expand the ranges and rebuild the node names + host_num_split=( $( echo $nodelist | sed -r "s/(.*)(\[.*\]).*/\1 \2/" ) ) + if [ ${#host_num_split[@]} -eq 1 ]; then + echo ${host_num_split[0]} + return + fi + + nodenumbers=${host_num_split[1]} + bash_friendly_ranges=$(echo $nodenumbers | sed -r -e 's:[[](.*)[]]:{\1}:' -e 's:([0-9]+)[-]([0-9]+):{\1..\2}:g') + bash_friendly_node_range="${host_num_split[0]}$bash_friendly_ranges" + eval echo $bash_friendly_node_range | tr -d '{}' +} + +RAW_OUTPUT="" +HEALTH_LOG_FILE_PATH="" +DEBUG_LOG_FILE_PATH="" +NODELIST_ARR=() +onetouch_nhc_path=$(realpath -e "./onetouch_nhc.sh") +nhc_start_time=$(date +%s.%N) + +# Running with SLURM +if [ -n "$SLURM_JOB_NAME" ] && [ "$SLURM_JOB_NAME" != "interactive" ]; then + NHC_JOB_NAME="$SLURM_JOB_NAME-$SLURM_JOB_ID-$(date +'%Y-%m-%d_%H-%M-%S')" + HEALTH_LOG_FILE_PATH=$(realpath -m "./logs/$NHC_JOB_NAME.health.log") + DEBUG_LOG_FILE_PATH=$(realpath -m "./logs/$NHC_JOB_NAME.debug.log") + NODELIST_ARR=( $(expand_nodelist $SLURM_JOB_NODELIST) ) + + # verify file presence on all nodes + { 
RAW_OUTPUT=$(srun --gpus-per-node=8 $onetouch_nhc_path -n $NHC_JOB_NAME $@ | tee /dev/fd/3 ); } 3>&1 +else + # Running with Parallel SSH + # Arguments + NODEFILE="" + NODELIST="" + GIT_VERSION="" + GIT_URL="" + CUSTOM_CONF="" + FORCE=false + VERBOSE=false + + # Parse out arguments + options=$(getopt -l "help,nodefile:,nodelist:,version:,config:,force,git:,verbose" -o "hF:w:v:c:fg:V" -a -- "$@") + if [ $? -ne 0 ]; then + print_help + exit 1 + fi + + eval set -- "$options" + while true + do + case "$1" in + -h|--help) + print_help + exit 0 + ;; + -F|--nodefile) + shift + NODEFILE="$1" + ;; + -w|--nodelist) + shift + NODELIST="$1" + ;; + -v|--version) + shift + GIT_VERSION="$1" + ;; + -g|--git) + shift + GIT_URL="$1" + ;; + -c|--config) + shift + CUSTOM_CONF="$(realpath -e ${1//\~/$HOME})" + ;; + -f|--force) + FORCE=true + ;; + -V|--verbose) + VERBOSE=true + ;; + --) + shift + break;; + esac + shift + done + + # Parse out nodes + NODELIST_ARR=() + + if [ -f "$NODEFILE" ]; then + mapfile -t NODELIST_ARR < $NODEFILE + fi + + if [ -n "$NODELIST" ]; then + NODELIST_ARR+=( $(expand_nodelist $NODELIST ) ) + fi + + if [ ${#NODELIST_ARR[@]} -eq 0 ]; then + echo "No nodes provided, must provide at least one node either from a file with -F/--nodefile or as a comma seperate list with -w/--nodelist" + echo + print_help + exit 1 + fi + + # Make unique and sorted + NODELIST_ARR=( $(echo "${NODELIST_ARR[@]}" | tr ' ' '\n' | sort -u | tr '\n' ' ') ) + + # Log file paths + jobname="distributed_nhc-pssh-$(date --utc +'%Y-%m-%d_%H-%M-%S')" + HEALTH_LOG_FILE_PATH=$(realpath -m "./logs/$jobname.health.log") + DEBUG_LOG_FILE_PATH=$(realpath -m "./logs/$jobname.debug.log") + output_path="logs/$jobname.out" + error_path="logs/$jobname.err" + + # Pssh args + timeout=900 # 15 minute timeout + + pssh_host_args=() + for node in "${NODELIST_ARR[@]}"; do + pssh_host_args+="-H $node " + done + + nhc_args=() + if [ -n "$GIT_VERSION" ]; then + nhc_args+=("-v" "$GIT_VERSION") + fi + + if [ -n 
"$GIT_URL" ]; then + nhc_args+=("-g" "$GIT_URL") + fi + + if [ -n "$CUSTOM_CONF" ]; then + nhc_args+=("-c" "$CUSTOM_CONF") + fi + + if $FORCE ; then + nhc_args+=("-f") + fi + + if $VERBOSE ; then + nhc_args+=("-V") + fi + + echo "Running Parallel SSH Distributed NHC on:" + echo "${NODELIST_ARR[@]}" | tr ' ' '\n' + echo "======================" + echo "The health check is running, it will take a few minutes to complete." + RAW_OUTPUT=$(parallel-ssh -P -t $timeout ${pssh_host_args[@]} $onetouch_nhc_path ${nhc_args[@]} 3> $error_path | tee $output_path) +fi + +nhc_end_time=$(date +%s.%N) +nhc_duration=$(printf "%.2f" $(echo "($nhc_end_time - $nhc_start_time) / 60" | bc -l)) + +# Filter down to NHC-RESULTS +NHC_RESULTS=$(echo "$RAW_OUTPUT" | grep "NHC-RESULT" | sed 's/.*NHC-RESULT\s*//g') +NHC_DEBUG=$(echo "$RAW_OUTPUT" | grep "NHC-DEBUG" | sed 's/.*NHC-DEBUG\s*//g') + +if [ -n "$NHC_DEBUG" ]; then + echo "Dumping NHC Debug into $DEBUG_LOG_FILE_PATH" + echo "$NHC_DEBUG" | sort >> $DEBUG_LOG_FILE_PATH +fi + +# Identify nodes who should have reported results but didn't, these failed for some unknown reason +nodes_with_results_arr=( $( echo "$NHC_RESULTS" | sed 's/\s*|.*//g' | tr '\n' ' ' ) ) +nodes_missing_results=(`echo ${NODELIST_ARR[@]} ${nodes_with_results_arr[@]} | tr ' ' '\n' | sort | uniq -u`) + +newline=$'\n' +for missing_node in "${nodes_missing_results[@]}"; do + NHC_RESULTS+="$newline$missing_node | ERROR: No results reported" +done + +echo "Health report can be found into $HEALTH_LOG_FILE_PATH" +echo "$NHC_RESULTS" | sort >> $HEALTH_LOG_FILE_PATH +echo "======================" +cat $HEALTH_LOG_FILE_PATH + +echo "======================" +echo "NHC took $nhc_duration minutes to finish" +echo +echo "Exporting results to Kusto" +requirements_file=$(realpath -e "./requirements.txt") +pip install -r $requirements_file > /dev/null 2>&1 +kusto_export_script=$(realpath -e "./export_nhc_result_to_kusto.py") +echo "Using export script $kusto_export_script" +python3 
$kusto_export_script $HEALTH_LOG_FILE_PATH $DEBUG_LOG_FILE_PATH +echo "Ingestion queued, results take ~5 minutes to appear in Kusto" \ No newline at end of file From 43e6d055eea2e9cc8da79890171900452d5f9413 Mon Sep 17 00:00:00 2001 From: Michael Williams Date: Fri, 11 Aug 2023 17:04:42 -0700 Subject: [PATCH 47/58] export py supports arguments --- distributed_nhc/export_nhc_result_to_kusto.py | 36 ++++++++++++++----- 1 file changed, 28 insertions(+), 8 deletions(-) diff --git a/distributed_nhc/export_nhc_result_to_kusto.py b/distributed_nhc/export_nhc_result_to_kusto.py index f168813..c0adeb0 100644 --- a/distributed_nhc/export_nhc_result_to_kusto.py +++ b/distributed_nhc/export_nhc_result_to_kusto.py @@ -4,7 +4,7 @@ from datetime import datetime from csv import DictReader from argparse import ArgumentParser -from azure.identity import ManagedIdentityCredential +from azure.identity import ManagedIdentityCredential, DefaultAzureCredential from azure.kusto.data import KustoConnectionStringBuilder from azure.kusto.ingest import QueuedIngestClient, IngestionProperties import pandas as pd @@ -14,7 +14,7 @@ health_table_name = "NodeHealthCheck" debug_table_name = "NodeHealthCheck_Debug" -def ingest_health_log(health_file): +def ingest_health_log(health_file, creds, ingest_url, database, health_table_name): filename_parts = os.path.basename(health_file).split("-", maxsplit=2) ts_str = filename_parts[2].split(".")[0] ts = datetime.strptime(ts_str, "%Y-%m-%d_%H-%M-%S") @@ -44,7 +44,7 @@ def ingest_health_log(health_file): print(f"Ingesting health results from {os.path.basename(health_file)} into {ingest_url} at {database}/{health_table_name}") ingest_client.ingest_from_dataframe(df, IngestionProperties(database, health_table_name)) -def ingest_debug_log(debug_file): +def ingest_debug_log(debug_file, creds, ingest_url, database, debug_table_name): filename_parts = os.path.basename(debug_file).split("-", maxsplit=2) ts_str = filename_parts[2].split(".")[0] ts = 
datetime.strptime(ts_str, "%Y-%m-%d_%H-%M-%S") @@ -75,17 +75,37 @@ def ingest_debug_log(debug_file): health_files = sys.argv[1:] -print(f"Attempting to ingest: {','.join(health_files)}") - -for health_file in health_files: +def parse_args(): + parser = ArgumentParser(description="Ingest NHC results into Kusto") + parser.add_argument("health_files", nargs="+", help="List of .health.log or .debug.log files to ingest") + parser.add_argument("--ingest_url", help="Kusto ingest URL", required=True) + parser.add_argument("--database", help="Kusto database", required=True) + parser.add_argument("--health_table_name", default="NodeHealthCheck" help="Kusto table name for health results") + parser.add_argument("--debug_table_name", default="NodeHealthCheck_Debug" help="Kusto table name for debug results") + parser.add_argument("--identity", nargs="?", const=True, default=False, help="Managed Identity to use for authentication, if a client ID is provided it will be used, otherwise the system-assigned identity will be used. 
If --identity is not provided DefaultAzureCredentials will be used.") + return parser.parse_args() + +def get_creds(identity): + if identity is True: + return ManagedIdentityCredential() + elif identity: + return ManagedIdentityCredential(client_id=identity) + else: + return DefaultAzureCredential() + +args = parse_args() +creds = get_creds(args.identity) + +print(f"Attempting to ingest: {','.join(args.health_files)}") +for health_file in args.health_files: try: if not os.path.exists(health_file): raise FileNotFoundError(f"Cannot find file '{health_file}'") if health_file.endswith(".health.log"): - ingest_health_log(health_file) + ingest_health_log(health_file, creds, args.ingest_url, args.database, args.health_table_name) elif health_file.endswith(".debug.log"): - ingest_debug_log(health_file) + ingest_debug_log(health_file, creds, args.ingest_url, args.database, args.debug_table_name) else: raise Exception("Unsuported file, must be .health.log or .debug.log produced by ./distributed_nhc.sb.sh") From e2337ba16a44ae561e344d4bdd0d47a3c07d8c2b Mon Sep 17 00:00:00 2001 From: Michael Williams Date: Mon, 14 Aug 2023 14:08:48 -0700 Subject: [PATCH 48/58] fix table name missing comma --- distributed_nhc/export_nhc_result_to_kusto.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/distributed_nhc/export_nhc_result_to_kusto.py b/distributed_nhc/export_nhc_result_to_kusto.py index c0adeb0..eab5cdf 100644 --- a/distributed_nhc/export_nhc_result_to_kusto.py +++ b/distributed_nhc/export_nhc_result_to_kusto.py @@ -80,8 +80,8 @@ def parse_args(): parser.add_argument("health_files", nargs="+", help="List of .health.log or .debug.log files to ingest") parser.add_argument("--ingest_url", help="Kusto ingest URL", required=True) parser.add_argument("--database", help="Kusto database", required=True) - parser.add_argument("--health_table_name", default="NodeHealthCheck" help="Kusto table name for health results") - parser.add_argument("--debug_table_name", 
default="NodeHealthCheck_Debug" help="Kusto table name for debug results") + parser.add_argument("--health_table_name", default="NodeHealthCheck", help="Kusto table name for health results") + parser.add_argument("--debug_table_name", default="NodeHealthCheck_Debug", help="Kusto table name for debug results") parser.add_argument("--identity", nargs="?", const=True, default=False, help="Managed Identity to use for authentication, if a client ID is provided it will be used, otherwise the system-assigned identity will be used. If --identity is not provided DefaultAzureCredentials will be used.") return parser.parse_args() From 6bd8459ffb7d62b8daf640e8f3e9bd7f3f546fce Mon Sep 17 00:00:00 2001 From: Michael Williams Date: Mon, 14 Aug 2023 17:14:54 -0700 Subject: [PATCH 49/58] rework argument parsing --- distributed_nhc/distributed_nhc.sb.sh | 338 +++++++++++++++++--------- 1 file changed, 222 insertions(+), 116 deletions(-) diff --git a/distributed_nhc/distributed_nhc.sb.sh b/distributed_nhc/distributed_nhc.sb.sh index f3b6fd7..e333f9a 100755 --- a/distributed_nhc/distributed_nhc.sb.sh +++ b/distributed_nhc/distributed_nhc.sb.sh @@ -8,7 +8,7 @@ print_help() { cat << EOF -Usage: ./distributed_nhc.sb.sh [-h|--help] [-F|--nodefile ] [-F|--nodefile ] +Usage: ./distributed_nhc.sb.sh [-h|--help] [-F|--nodefile ] [-w|--nodelist ] Run Azure NHC distributed onto the specified set of nodes and collects the results. Script can also be ran directly with sbatch. Running it as a shell script will use parallel-ssh Example Usage: @@ -18,24 +18,41 @@ Example Usage: ./distributed_nhc.sb.sh -w node1,node2,node3 ./distributed_nhc.sb.sh -F ./my_node_file -w additonal_node1,additional_node2 --h, -help, --help Display this help +-h --help Display this help --F --nodefile File contains a list of hostnames to connect to and run NHC on. 
Similar to slurm's sbatch -F/--nodefile argument +Node Selection - Applies to direct shell script usage only, pass these arguments to sbatch to run with SLURM --w --nodelist Comma seperate list of hostnames to connect to and run NHC on. Similar to slurm's sbatch -w/--nodelist argument but does not support ranges of hosts (eg host[1-5,7,...]). +-F --nodefile File contains a list of hostnames to connect to and run NHC on. Similar to slurm's sbatch -F/--nodefile argument + +-w --nodelist Comma seperate list of hostnames to connect to and run NHC on. Similar to slurm's sbatch -w/--nodelist argument but does not support ranges of hosts (eg host[1-5,7,...]). If -F/--nodefile is provided, any nodes specified with -w/--nodelist will be added to the list of hostnames to run NHC on. This does not modify the provided -F/--nodefile file. --v, -version, --version Optional version of Az NHC to download from git, defaults to latest from "main" +NHC Behavior - Applies to both SLURM and direct shell script usage + +-v --version Optional version of Az NHC to download from git, defaults to latest from "main" Can be a branch name like "main" for the latest or a full commit hash for a specific version. --g, -git, --git Optional git url to download az nhc from. Defaults to "https://github.com/Azure/azurehpc-health-checks" +-g --git Optional git url to download az nhc from. Defaults to "https://github.com/Azure/azurehpc-health-checks" --c, -config, --config Optional path to a custom NHC config file. +-c --config Optional path to a custom NHC config file. If not specified the current VM SKU will be detected and the appropriate conf file will be used. 
--f, -force, --force If set, forces the NHC script the redownload and reinstall everything +-f --force If set, forces the NHC script the redownload and reinstall everything + +-V --verbose If set, enables verbose mode which will output all detailed debug file to stdout and a .debug.log file next to the .health.log file + +Kusto Exporting - Applies to both SLURM and direct shell script usage + + --kusto-export-url Optional Kusto Ingest URL to export results to. If not specified, results will not be exported to Kusto + --kusto-database If kusto-export-url is specified, this is required and is the database to export results to. --V, -verbose, --verbose If set, enables verbose mode which will output all detailed debug file to stdout and a .debug.log file next to the .health.log file + --kusto-identity If kusto-export-url is specified, this is optional and is the identity to use to authenticate to Kusto. + If not provided, will use DefaultAzureCredential to authenticate. + If provided but with no client ID, will use System Assigned Identity to authenticate. For example by just specifying '--kusto-identity' with no value. + If provided with a client ID, will use User Assigned Identity to authenticate. For example by specifying '--kusto-identity my_client_id'. + + --kusto-health-table If kusto-export-url is specified, this is optional and is the table to export health results to. Defaults to "NodeHealthCheck" + --kusto-debug-table If kusto-export-url is specified, this is optional and is the table to export health results to. 
Defaults to "NodeHealthCheck_Debug" EOF } @@ -63,108 +80,203 @@ expand_nodelist() { eval echo $bash_friendly_node_range | tr -d '{}' } +# Shared Variables RAW_OUTPUT="" HEALTH_LOG_FILE_PATH="" DEBUG_LOG_FILE_PATH="" NODELIST_ARR=() -onetouch_nhc_path=$(realpath -e "./onetouch_nhc.sh") +ONETOUCH_NHC_PATH=$(realpath -e "./onetouch_nhc.sh") +ONETOUCH_NHC_ARGS=() + +KUSTO_EXPORT_ENABLED="False" +KUSTO_EXPORT_ARGS=() +KUSTO_IDENTITY="False" # hold onto this seperately to help with the passing arguments to the export script + nhc_start_time=$(date +%s.%N) -# Running with SLURM + if [ -n "$SLURM_JOB_NAME" ] && [ "$SLURM_JOB_NAME" != "interactive" ]; then + EXECUTION_MODE="SLURM" + + # Setup variables for SLURM + NODELIST_ARR=( $(expand_nodelist $SLURM_JOB_NODELIST) ) NHC_JOB_NAME="$SLURM_JOB_NAME-$SLURM_JOB_ID-$(date +'%Y-%m-%d_%H-%M-%S')" HEALTH_LOG_FILE_PATH=$(realpath -m "./logs/$NHC_JOB_NAME.health.log") DEBUG_LOG_FILE_PATH=$(realpath -m "./logs/$NHC_JOB_NAME.debug.log") - NODELIST_ARR=( $(expand_nodelist $SLURM_JOB_NODELIST) ) - # verify file presence on all nodes - { RAW_OUTPUT=$(srun --gpus-per-node=8 $onetouch_nhc_path -n $NHC_JOB_NAME $@ | tee /dev/fd/3 ); } 3>&1 else - # Running with Parallel SSH - # Arguments - NODEFILE="" - NODELIST="" - GIT_VERSION="" - GIT_URL="" - CUSTOM_CONF="" - FORCE=false - VERBOSE=false - - # Parse out arguments - options=$(getopt -l "help,nodefile:,nodelist:,version:,config:,force,git:,verbose" -o "hF:w:v:c:fg:V" -a -- "$@") - if [ $? 
-ne 0 ]; then - print_help - exit 1 - fi + EXECUTION_MODE="PSSH" - eval set -- "$options" - while true - do - case "$1" in - -h|--help) - print_help - exit 0 - ;; - -F|--nodefile) - shift - NODEFILE="$1" - ;; - -w|--nodelist) - shift - NODELIST="$1" - ;; - -v|--version) - shift - GIT_VERSION="$1" - ;; - -g|--git) - shift - GIT_URL="$1" - ;; - -c|--config) - shift - CUSTOM_CONF="$(realpath -e ${1//\~/$HOME})" - ;; - -f|--force) - FORCE=true - ;; - -V|--verbose) - VERBOSE=true - ;; - --) - shift - break;; - esac - shift - done + # Setup variables for PSSH, Nodefile and Nodelist are handled in the argument parsing + NHC_JOB_NAME="distributed_nhc-pssh-$(date --utc +'%Y-%m-%d_%H-%M-%S')" + HEALTH_LOG_FILE_PATH=$(realpath -m "./logs/$NHC_JOB_NAME.health.log") + DEBUG_LOG_FILE_PATH=$(realpath -m "./logs/$NHC_JOB_NAME.debug.log") +fi + +echo "Running in $EXECUTION_MODE mode" + + +# Argument Variables +# Kusto Export Arguments +KUSTO_EXPORT_URL="" +KUSTO_DATABASE="" +KUSTO_IDENTITY="False" +KUSTO_HEALTH_TABLE="NodeHealthCheck" +KUSTO_DEBUG_TABLE="NodeHealthCheck_Debug" + + +# These options are shared by both SLURM and PSSH +SHARED_SHORT_OPTS="hv:c:fg:V" +SHARED_LONG_OPTS="help,version:,git:,config:,force,verbose,kusto-export-url:,kusto-database:,kusto-identity::,kusto-health-table:,kusto-debug-table" + +# These options are only needed by PSSH +PSSH_SHORT_OPTS="F:w:" +PSSH_LONG_OPTS="nodefile:,nodelist:" - # Parse out nodes - NODELIST_ARR=() +# Select options based on execution mode +if [ "$EXECUTION_MODE" == "SLURM" ]; then + # SLURM + options=$(getopt -l "$SHARED_LONG_OPTS" -o "$SHARED_SHORT_OPTS" -- "$@") +else + # PSSH + options=$(getopt -l "$SHARED_LONG_OPTS,$PSSH_LONG_OPTS" -o "$SHARED_SHORT_OPTS,$PSSH_SHORT_OPTS" -- "$@") +fi - if [ -f "$NODEFILE" ]; then - mapfile -t NODELIST_ARR < $NODEFILE +if [ $? 
-ne 0 ]; then + print_help + exit 1 +fi + +echo "Options: $options" + +eval set -- "$options" +while true; do +case "$1" in +-h|--help) + print_help + exit 0 + ;; +# PSSH Options +-F|--nodefile) + shift + if [ "$EXECUTION_MODE" == "SLURM" ]; then + echo "Cannot specify -F/--nodefile when running with SLURM, please pass node file to sbatch instead" + exit 1 fi - if [ -n "$NODELIST" ]; then - NODELIST_ARR+=( $(expand_nodelist $NODELIST ) ) + if [ ! -f "$1" ]; then + echo "Nodefile $1 does not exist" + exit 1 fi - if [ ${#NODELIST_ARR[@]} -eq 0 ]; then - echo "No nodes provided, must provide at least one node either from a file with -F/--nodefile or as a comma seperate list with -w/--nodelist" - echo - print_help + if [ -f "$1" ]; then + mapfile -t NODELIST_ARR < $1 + fi + ;; +-w|--nodelist) + shift + if [ "$EXECUTION_MODE" == "SLURM" ]; then + echo "Cannot specify -w/--nodelist when running with SLURM, please pass node list to sbatch instead" exit 1 fi - # Make unique and sorted - NODELIST_ARR=( $(echo "${NODELIST_ARR[@]}" | tr ' ' '\n' | sort -u | tr '\n' ' ') ) + echo "Adding nodes from nodelist $1" + NODELIST_ARR+=( $(expand_nodelist $1 ) ) + ;; +# Shared Onetouch NHC Args +-v|--version) + shift + ONETOUCH_NHC_ARGS+=("-v" "$1") + ;; +-g|--git) + shift + ONETOUCH_NHC_ARGS+=("-g" "$1") + ;; +-c|--config) + shift + CUSTOM_CONF="$(realpath -e ${1//\~/$HOME})" + ONETOUCH_NHC_ARGS+=("-c" "$CUSTOM_CONF") + ;; +-f|--force) + ONETOUCH_NHC_ARGS+=("-f") + ;; +-V|--verbose) + ONETOUCH_NHC_ARGS+=("-V") + ;; +# Shared Kusto Export Args +--kusto-export-url) + shift + echo "Setting Kusto export url to $1" + KUSTO_EXPORT_ENABLED="True" + KUSTO_EXPORT_ARGS+=("--ingest_url" "$1") + ;; +--kusto-database) + shift + echo "Setting Kusto database to $1" + KUSTO_EXPORT_ARGS+=("--database" "$1") + ;; +--kusto-identity) + shift + echo "Setting Kusto identity to $1" + KUSTO_IDENTITY="$1" + # Handle case of no client id provided + if [ -z "$KUSTO_IDENTITY" ]; then + echo "No client id 
provided, using system assigned identity" + KUSTO_IDENTITY="True" + fi + ;; +--kusto-health-table) + shift + echo "Setting Kusto health table to $1" + KUSTO_EXPORT_ARGS+=("--health_table_name" "$1") + ;; +--kusto-debug-table) + shift + echo "Setting Kusto debug table to $1" + KUSTO_EXPORT_ARGS+=("--debug_table_name" "$1") + ;; +--) + shift + break;; +esac +shift +done + +echo "Running with the following arguments:" +echo "OneTouch NHC Args: ${ONETOUCH_NHC_ARGS[@]}" +echo +echo "Node list: ${NODELIST_ARR[@]}" +echo +echo "Kusto export enabled: $KUSTO_EXPORT_ENABLED" +echo "Kusto Args: ${KUSTO_EXPORT_ARGS[@]}" +echo "Kusto identity: $KUSTO_IDENTITY" +echo +echo "The rest of the arguments are: $@" +echo +echo "Early exit for testing" +exit + +if [ ${#NODELIST_ARR[@]} -eq 0 ]; then + echo "No nodes provided, must provide at least one node either from a file with -F/--nodefile or as a slurm node list with -w/--nodelist" + echo + print_help + exit 1 +fi + +NODELIST_ARR=( $(echo "${NODELIST_ARR[@]}" | tr ' ' '\n' | sort -u | tr '\n' ' ') ) + +exit + +# Running with SLURM +if [ $EXECUTION_MODE == "SLURM" ] + # verify file presence on all nodes + { RAW_OUTPUT=$(srun --gpus-per-node=8 $ONETOUCH_NHC_PATH -n $NHC_JOB_NAME $@ | tee /dev/fd/3 ); } 3>&1 +else + # Running with Parallel SSH # Log file paths - jobname="distributed_nhc-pssh-$(date --utc +'%Y-%m-%d_%H-%M-%S')" - HEALTH_LOG_FILE_PATH=$(realpath -m "./logs/$jobname.health.log") - DEBUG_LOG_FILE_PATH=$(realpath -m "./logs/$jobname.debug.log") - output_path="logs/$jobname.out" - error_path="logs/$jobname.err" + output_path="logs/$NHC_JOB_NAME.out" + error_path="logs/$NHC_JOB_NAME.err" # Pssh args timeout=900 # 15 minute timeout @@ -174,32 +286,11 @@ else pssh_host_args+="-H $node " done - nhc_args=() - if [ -n "$GIT_VERSION" ]; then - nhc_args+=("-v" "$GIT_VERSION") - fi - - if [ -n "$GIT_URL" ]; then - nhc_args+=("-g" "$GIT_URL") - fi - - if [ -n "$CUSTOM_CONF" ]; then - nhc_args+=("-c" "$CUSTOM_CONF") - fi - - if $FORCE 
; then - nhc_args+=("-f") - fi - - if $VERBOSE ; then - nhc_args+=("-V") - fi - echo "Running Parallel SSH Distributed NHC on:" echo "${NODELIST_ARR[@]}" | tr ' ' '\n' echo "======================" echo "The health check is running, it will take a few minutes to complete." - RAW_OUTPUT=$(parallel-ssh -P -t $timeout ${pssh_host_args[@]} $onetouch_nhc_path ${nhc_args[@]} 3> $error_path | tee $output_path) + RAW_OUTPUT=$(parallel-ssh -P -t $timeout ${pssh_host_args[@]} $onetouch_nhc_path ${ONETOUCH_NHC_ARGS[@]} 3> $error_path | tee $output_path) fi nhc_end_time=$(date +%s.%N) @@ -228,10 +319,25 @@ cat $HEALTH_LOG_FILE_PATH echo "======================" echo "NHC took $nhc_duration minutes to finish" echo -echo "Exporting results to Kusto" -requirements_file=$(realpath -e "./requirements.txt") -pip install -r $requirements_file > /dev/null 2>&1 -kusto_export_script=$(realpath -e "./export_nhc_result_to_kusto.py") -echo "Using export script $kusto_export_script" -python3 $kusto_export_script $HEALTH_LOG_FILE_PATH $DEBUG_LOG_FILE_PATH -echo "Ingestion queued, results take ~5 minutes to appear in Kusto" \ No newline at end of file + +# Export to Kusto if enabled +if [ "$KUSTO_ENABLED_ENABLED" == "True" ]; then + # Place identity arg at the end (if specified) + if [ "$KUSTO_IDENTITY" == "True" ]; then + KUSTO_EXPORT_ARGS+=("--identity") + elif [ -n "$KUSTO_IDENTITY" ]; then + KUSTO_EXPORT_ARGS+=("--identity" "$KUSTO_IDENTITY") + fi + + echo "Exporting results to Kusto" + + # Ensure prerequisites are installed + requirements_file=$(realpath -e "./requirements.txt") + pip install -r $requirements_file > /dev/null 2>&1 + + # Run export script + kusto_export_script=$(realpath -e "./export_nhc_result_to_kusto.py") + echo "Using export script $kusto_export_script" + python3 $kusto_export_script ${KUSTO_EXPORT_ARGS[@]} $KUSTO_IDENTITY -- $HEALTH_LOG_FILE_PATH $DEBUG_LOG_FILE_PATH + echo "Ingestion queued, results take ~5 minutes to appear in Kusto" +fi \ No newline at end of file 
From 64f5c5bca2f24f51dd5bc4b724992fdade2a3599 Mon Sep 17 00:00:00 2001 From: Michael Williams Date: Mon, 14 Aug 2023 17:18:22 -0700 Subject: [PATCH 50/58] cleanup --- distributed_nhc/distributed_nhc.sb.sh | 14 +------------- 1 file changed, 1 insertion(+), 13 deletions(-) diff --git a/distributed_nhc/distributed_nhc.sb.sh b/distributed_nhc/distributed_nhc.sb.sh index e333f9a..241883d 100755 --- a/distributed_nhc/distributed_nhc.sb.sh +++ b/distributed_nhc/distributed_nhc.sb.sh @@ -115,16 +115,6 @@ fi echo "Running in $EXECUTION_MODE mode" - -# Argument Variables -# Kusto Export Arguments -KUSTO_EXPORT_URL="" -KUSTO_DATABASE="" -KUSTO_IDENTITY="False" -KUSTO_HEALTH_TABLE="NodeHealthCheck" -KUSTO_DEBUG_TABLE="NodeHealthCheck_Debug" - - # These options are shared by both SLURM and PSSH SHARED_SHORT_OPTS="hv:c:fg:V" SHARED_LONG_OPTS="help,version:,git:,config:,force,verbose,kusto-export-url:,kusto-database:,kusto-identity::,kusto-health-table:,kusto-debug-table" @@ -147,8 +137,6 @@ if [ $? -ne 0 ]; then exit 1 fi -echo "Options: $options" - eval set -- "$options" while true; do case "$1" in @@ -290,7 +278,7 @@ else echo "${NODELIST_ARR[@]}" | tr ' ' '\n' echo "======================" echo "The health check is running, it will take a few minutes to complete." 
- RAW_OUTPUT=$(parallel-ssh -P -t $timeout ${pssh_host_args[@]} $onetouch_nhc_path ${ONETOUCH_NHC_ARGS[@]} 3> $error_path | tee $output_path) + RAW_OUTPUT=$(parallel-ssh -P -t $timeout ${pssh_host_args[@]} $ONETOUCH_NHC_PATH ${ONETOUCH_NHC_ARGS[@]} 3> $error_path | tee $output_path) fi nhc_end_time=$(date +%s.%N) From e1c465ad12d6a137927b90b09c97898db9580e34 Mon Sep 17 00:00:00 2001 From: Michael Williams Date: Tue, 15 Aug 2023 11:34:40 -0700 Subject: [PATCH 51/58] verified that kusto export works and is optional --- distributed_nhc/distributed_nhc.sb.sh | 27 ++++++++++++------- distributed_nhc/export_nhc_result_to_kusto.py | 2 +- 2 files changed, 19 insertions(+), 10 deletions(-) diff --git a/distributed_nhc/distributed_nhc.sb.sh b/distributed_nhc/distributed_nhc.sb.sh index 241883d..ecd4cd4 100755 --- a/distributed_nhc/distributed_nhc.sb.sh +++ b/distributed_nhc/distributed_nhc.sb.sh @@ -87,6 +87,7 @@ DEBUG_LOG_FILE_PATH="" NODELIST_ARR=() ONETOUCH_NHC_PATH=$(realpath -e "./onetouch_nhc.sh") ONETOUCH_NHC_ARGS=() +VERBOSE="False" KUSTO_EXPORT_ENABLED="False" KUSTO_EXPORT_ARGS=() @@ -190,6 +191,7 @@ case "$1" in ;; -V|--verbose) ONETOUCH_NHC_ARGS+=("-V") + VERBOSE="True" ;; # Shared Kusto Export Args --kusto-export-url) @@ -242,7 +244,7 @@ echo echo "The rest of the arguments are: $@" echo echo "Early exit for testing" -exit +echo if [ ${#NODELIST_ARR[@]} -eq 0 ]; then echo "No nodes provided, must provide at least one node either from a file with -F/--nodefile or as a slurm node list with -w/--nodelist" @@ -253,10 +255,8 @@ fi NODELIST_ARR=( $(echo "${NODELIST_ARR[@]}" | tr ' ' '\n' | sort -u | tr '\n' ' ') ) -exit - # Running with SLURM -if [ $EXECUTION_MODE == "SLURM" ] +if [ $EXECUTION_MODE == "SLURM" ]; then # verify file presence on all nodes { RAW_OUTPUT=$(srun --gpus-per-node=8 $ONETOUCH_NHC_PATH -n $NHC_JOB_NAME $@ | tee /dev/fd/3 ); } 3>&1 else @@ -286,9 +286,13 @@ nhc_duration=$(printf "%.2f" $(echo "($nhc_end_time - $nhc_start_time) / 60" | b # Filter 
down to NHC-RESULTS NHC_RESULTS=$(echo "$RAW_OUTPUT" | grep "NHC-RESULT" | sed 's/.*NHC-RESULT\s*//g') -NHC_DEBUG=$(echo "$RAW_OUTPUT" | grep "NHC-DEBUG" | sed 's/.*NHC-DEBUG\s*//g') -echo "Dumping NHC Debug into $DEBUG_LOG_FILE_PATH" -echo "$NHC_DEBUG" | sort >> $DEBUG_LOG_FILE_PATH + +if [ "$VERBOSE" == "True" ]; then + # If Verbose was set, we expect NHC-DEBUG to be present + NHC_DEBUG=$(echo "$RAW_OUTPUT" | grep "NHC-DEBUG" | sed 's/.*NHC-DEBUG\s*//g') + echo "Dumping NHC Debug into $DEBUG_LOG_FILE_PATH" + echo "$NHC_DEBUG" | sort >> $DEBUG_LOG_FILE_PATH +fi # Identify nodes who should have reported results but didn't, these failed for some unknown reason nodes_with_results_arr=( $( echo "$NHC_RESULTS" | sed 's/\s*|.*//g' | tr '\n' ' ' ) ) @@ -309,7 +313,7 @@ echo "NHC took $nhc_duration minutes to finish" echo # Export to Kusto if enabled -if [ "$KUSTO_ENABLED_ENABLED" == "True" ]; then +if [ "$KUSTO_EXPORT_ENABLED" == "True" ]; then # Place identity arg at the end (if specified) if [ "$KUSTO_IDENTITY" == "True" ]; then KUSTO_EXPORT_ARGS+=("--identity") @@ -317,6 +321,11 @@ if [ "$KUSTO_ENABLED_ENABLED" == "True" ]; then KUSTO_EXPORT_ARGS+=("--identity" "$KUSTO_IDENTITY") fi + export_files=( "$HEALTH_LOG_FILE_PATH") + if [ "$VERBOSE" == "True" ]; then + export_files+=( "$DEBUG_LOG_FILE_PATH" ) + fi + echo "Exporting results to Kusto" # Ensure prerequisites are installed @@ -326,6 +335,6 @@ if [ "$KUSTO_ENABLED_ENABLED" == "True" ]; then # Run export script kusto_export_script=$(realpath -e "./export_nhc_result_to_kusto.py") echo "Using export script $kusto_export_script" - python3 $kusto_export_script ${KUSTO_EXPORT_ARGS[@]} $KUSTO_IDENTITY -- $HEALTH_LOG_FILE_PATH $DEBUG_LOG_FILE_PATH + python3 $kusto_export_script ${KUSTO_EXPORT_ARGS[@]} $KUSTO_IDENTITY -- ${export_files[@]} echo "Ingestion queued, results take ~5 minutes to appear in Kusto" fi \ No newline at end of file diff --git a/distributed_nhc/export_nhc_result_to_kusto.py 
b/distributed_nhc/export_nhc_result_to_kusto.py index eab5cdf..fb04520 100644 --- a/distributed_nhc/export_nhc_result_to_kusto.py +++ b/distributed_nhc/export_nhc_result_to_kusto.py @@ -96,7 +96,7 @@ def get_creds(identity): args = parse_args() creds = get_creds(args.identity) -print(f"Attempting to ingest: {','.join(args.health_files)}") +print(f"Attempting to ingest: {', '.join(args.health_files)}") for health_file in args.health_files: try: if not os.path.exists(health_file): From e74d08461f492b115ff4a9378126a2460a4e10d1 Mon Sep 17 00:00:00 2001 From: Michael Williams Date: Tue, 15 Aug 2023 11:37:41 -0700 Subject: [PATCH 52/58] cleanup --- distributed_nhc/distributed_nhc.sb.sh | 19 ++----------------- 1 file changed, 2 insertions(+), 17 deletions(-) diff --git a/distributed_nhc/distributed_nhc.sb.sh b/distributed_nhc/distributed_nhc.sb.sh index ecd4cd4..1271d74 100755 --- a/distributed_nhc/distributed_nhc.sb.sh +++ b/distributed_nhc/distributed_nhc.sb.sh @@ -232,20 +232,6 @@ esac shift done -echo "Running with the following arguments:" -echo "OneTouch NHC Args: ${ONETOUCH_NHC_ARGS[@]}" -echo -echo "Node list: ${NODELIST_ARR[@]}" -echo -echo "Kusto export enabled: $KUSTO_EXPORT_ENABLED" -echo "Kusto Args: ${KUSTO_EXPORT_ARGS[@]}" -echo "Kusto identity: $KUSTO_IDENTITY" -echo -echo "The rest of the arguments are: $@" -echo -echo "Early exit for testing" -echo - if [ ${#NODELIST_ARR[@]} -eq 0 ]; then echo "No nodes provided, must provide at least one node either from a file with -F/--nodefile or as a slurm node list with -w/--nodelist" echo @@ -255,9 +241,8 @@ fi NODELIST_ARR=( $(echo "${NODELIST_ARR[@]}" | tr ' ' '\n' | sort -u | tr '\n' ' ') ) -# Running with SLURM if [ $EXECUTION_MODE == "SLURM" ]; then - # verify file presence on all nodes + # Running with SLURM { RAW_OUTPUT=$(srun --gpus-per-node=8 $ONETOUCH_NHC_PATH -n $NHC_JOB_NAME $@ | tee /dev/fd/3 ); } 3>&1 else # Running with Parallel SSH @@ -303,7 +288,7 @@ for missing_node in 
"${nodes_missing_results[@]}"; do NHC_RESULTS+="$newline$missing_node | ERROR: No results reported" done -echo "Health report can be found into $HEALTH_LOG_FILE_PATH" +echo "Health report can be found in $HEALTH_LOG_FILE_PATH" echo "$NHC_RESULTS" | sort >> $HEALTH_LOG_FILE_PATH echo "======================" cat $HEALTH_LOG_FILE_PATH From 1460e7ae0d3b1f0d1dfd13e25c2dfddd91f95787 Mon Sep 17 00:00:00 2001 From: Michael Williams Date: Tue, 15 Aug 2023 12:07:39 -0700 Subject: [PATCH 53/58] cleanup comments --- distributed_nhc/distributed_nhc.sb.sh | 13 +++++++------ distributed_nhc/export_nhc_result_to_kusto.py | 15 --------------- distributed_nhc/onetouch_nhc.sh | 5 ++--- 3 files changed, 9 insertions(+), 24 deletions(-) diff --git a/distributed_nhc/distributed_nhc.sb.sh b/distributed_nhc/distributed_nhc.sb.sh index 1271d74..3ae690e 100755 --- a/distributed_nhc/distributed_nhc.sb.sh +++ b/distributed_nhc/distributed_nhc.sb.sh @@ -17,6 +17,8 @@ Example Usage: ./distributed_nhc.sb.sh -F ./my_node_file ./distributed_nhc.sb.sh -w node1,node2,node3 ./distributed_nhc.sb.sh -F ./my_node_file -w additonal_node1,additional_node2 + ./distributed_nhc.sb.sh -F ./my_node_file --kusto-export-url https://ingest-mycluster.eastus.kusto.windows.net --kusto-database mydatabase --kusto-identity + ./distributed_nhc.sb.sh -F ./my_node_file --kusto-export-url https://ingest-mycluster.eastus.kusto.windows.net --kusto-database mydatabase --kusto-identity my_client_id -h --help Display this help @@ -59,13 +61,13 @@ EOF expand_nodelist() { nodelist="$1" # make nodelist bash "friendly" for expansion - # ie turn "aice-ndv5-iad21-[000170,000201-000203,000218-000220]" - # into "aice-ndv5-iad21-{000170,{000201..000203},{000218..000220}}" + # ie turn "mycluster-[000170,000201-000203,000218-000220]" + # into "mycluster-{000170,{000201..000203},{000218..000220}}" # which bash can easily expand into - # aice-ndv5-iad21-000170 aice-ndv5-iad21-000201 aice-ndv5-iad21-000202 aice-ndv5-iad21-000203 
aice-ndv5-iad21-000218 aice-ndv5-iad21-000219 aice-ndv5-iad21-000220 + # mycluster-000170 mycluster-000201 mycluster-000202 mycluster-000203 mycluster-000218 mycluster-000219 mycluster-000220 - # converts "aice-ndv5-iad21-[000170,000201-000203,000218-000220]" - # into "aice-ndv5-iad21- [000170,000201-000203,000218-000220]" + # converts "mycluster-[000170,000201-000203,000218-000220]" + # into "mycluster- [000170,000201-000203,000218-000220]" # which we can then stick into an array. If we have 1 element, there were no ranges # otherwise, expand the ranges and rebuild the node names host_num_split=( $( echo $nodelist | sed -r "s/(.*)(\[.*\]).*/\1 \2/" ) ) @@ -297,7 +299,6 @@ echo "======================" echo "NHC took $nhc_duration minutes to finish" echo -# Export to Kusto if enabled if [ "$KUSTO_EXPORT_ENABLED" == "True" ]; then # Place identity arg at the end (if specified) if [ "$KUSTO_IDENTITY" == "True" ]; then diff --git a/distributed_nhc/export_nhc_result_to_kusto.py b/distributed_nhc/export_nhc_result_to_kusto.py index fb04520..ce8d75d 100644 --- a/distributed_nhc/export_nhc_result_to_kusto.py +++ b/distributed_nhc/export_nhc_result_to_kusto.py @@ -9,11 +9,6 @@ from azure.kusto.ingest import QueuedIngestClient, IngestionProperties import pandas as pd -ingest_url = "https://ingest-aistresstests.centralus.kusto.windows.net" -database = "sat13c04_stress_testdb" -health_table_name = "NodeHealthCheck" -debug_table_name = "NodeHealthCheck_Debug" - def ingest_health_log(health_file, creds, ingest_url, database, health_table_name): filename_parts = os.path.basename(health_file).split("-", maxsplit=2) ts_str = filename_parts[2].split(".")[0] @@ -36,10 +31,6 @@ def ingest_health_log(health_file, creds, ingest_url, database, health_table_nam df['Healthy'] = df.apply(lambda x: x['RawResult'] == "Healthy", axis=1) df = df[['Timestamp', 'JobName', 'Hostname', 'Healthy', 'RawResult']] - creds = ManagedIdentityCredential( - client_id = 
"16b52144-5ca5-4c25-aac5-0d3b7a4cb36d" - ) - ingest_client = QueuedIngestClient(KustoConnectionStringBuilder.with_azure_token_credential(ingest_url, creds)) print(f"Ingesting health results from {os.path.basename(health_file)} into {ingest_url} at {database}/{health_table_name}") ingest_client.ingest_from_dataframe(df, IngestionProperties(database, health_table_name)) @@ -65,16 +56,10 @@ def ingest_debug_log(debug_file, creds, ingest_url, database, debug_table_name): df['DebugLog'] = df.apply(lambda x: x['DebugLog'].strip(), axis=1) df = df[['Timestamp', 'JobName', 'Hostname', 'DebugLog']] - creds = ManagedIdentityCredential( - client_id = "16b52144-5ca5-4c25-aac5-0d3b7a4cb36d" - ) - ingest_client = QueuedIngestClient(KustoConnectionStringBuilder.with_azure_token_credential(ingest_url, creds)) print(f"Ingesting health results from {os.path.basename(debug_file)} into {ingest_url} at {database}/{debug_table_name}") ingest_client.ingest_from_dataframe(df, IngestionProperties(database, debug_table_name)) -health_files = sys.argv[1:] - def parse_args(): parser = ArgumentParser(description="Ingest NHC results into Kusto") parser.add_argument("health_files", nargs="+", help="List of .health.log or .debug.log files to ingest") diff --git a/distributed_nhc/onetouch_nhc.sh b/distributed_nhc/onetouch_nhc.sh index 4d9b1ee..293aef8 100755 --- a/distributed_nhc/onetouch_nhc.sh +++ b/distributed_nhc/onetouch_nhc.sh @@ -32,7 +32,7 @@ EOF # Arguments VERSION="main" -GIT_URL="https://github.com/mpwillia/azurehpc-health-checks" +GIT_URL="https://github.com/Azure/azurehpc-health-checks" WORKING_DIR=$(realpath -m "$HOME/onetouch_nhc/working") OUTPUT_DIR=$WORKING_DIR JOB_NAME="$(hostname)-$(date --utc +"%Y-%m-%d_%H-%M-%S")" @@ -175,7 +175,6 @@ setup_nhc() { install_nhc true } - run_health_checks() { log_file_path="$1" custom_conf="$2" @@ -207,7 +206,7 @@ run_health_checks() { # Download AZ NHC echo "Running OneTouch NHC with Job Name $JOB_NAME on host $(hostname)" -setup_nhc $VERSION 
$AZ_NHC_DIR 1 +setup_nhc $VERSION $AZ_NHC_DIR echo "=== Finished Setting up AZ NHC ===" # Execute Health Checks From afdad45198311e201bb25473a0b1eeffa8a99424 Mon Sep 17 00:00:00 2001 From: Michael Williams Date: Tue, 15 Aug 2023 12:52:06 -0700 Subject: [PATCH 54/58] update readmes --- README.md | 6 ++ distributed_nhc/README.md | 156 ++++++++++++++++++++++++++++++-------- 2 files changed, 131 insertions(+), 31 deletions(-) diff --git a/README.md b/README.md index 3c48779..c68ab82 100644 --- a/README.md +++ b/README.md @@ -37,6 +37,12 @@ Usage ### _References_ ### - [LBNL Node Health Checks](https://github.com/mej/nhc) - [Azure HPC Images](https://github.com/Azure/azhpc-images) + +## Distributed NHC +AzureHPC Node Health Check also comes bundled with a distributed version of NHC. This version of NHC is designed to run on a cluster of machines and report back to a central location. This is useful for running health checks on a large cluster with dozens or hundreds of nodes. + +See [Distributed NHC](./distributed-nhc/README.md) for more information. + ## Contributing This project welcomes contributions and suggestions. Most contributions require you to agree to a diff --git a/distributed_nhc/README.md b/distributed_nhc/README.md index 6d3a777..4b84ff8 100644 --- a/distributed_nhc/README.md +++ b/distributed_nhc/README.md @@ -1,17 +1,13 @@ -# Running Distributed Node Health Check -We’ll be using a version of Azure NHC called Distributed NHC throughout the thermal test. +# Distributed Node Health Check +Distributed NHC is an extension of Azure NHC which can easily dispatch, execute, and aggregate Azure NHC on an arbitrary collection of nodes. The Node Health Check runs a variety of single node validation tests to validate the health of a given node. These tests validate the presence and status of hardware on the node as well as a variety of GPU and IB performance tests to ensure expected bandwidths and throughputs are met. 
-## Acquiring Distributed NHC -Distributed NHC is built on a fork of Azure NHC and can be found at -mpwillia/azurehpc-health-checks: Health checks for Azure N- and H-series VMs. (github.com) -Setup -1. Ensure a shared volume is mounted named /work -2. Clone the repository - git clone https://github.com/mpwillia/azurehpc-health-checks /work/distributed_nhc -3. Create the logs directory - mkdir /work/distributed_nhc/logs +## Setting up Distributed NHC +It is necessary to run Distributed NHC on a cluster in which all nodes have access to the same shared volume. +In all examples following, it is assumed that a shared volume named "/work" is mounted and accessible by all nodes. The name of the shared volume is not required to be /work + +Provided this, you can simply clone the repository onto the /work volume and invoke ./distributed_nhc.sb.sh as described below. ## Running Distributed NHC Distributed NHC is ran entirely with the distributed_nhc.sb.sh script. @@ -32,9 +28,11 @@ When doing so, all sbatch arguments to target specific sets of nodes are support Running with slurm is the suggested approach when running on many nodes. ### Slurm Execution Examples -sbatch -w monument-[001,003-007] ./distributed_nhc.sb.sh +``` +sbatch -w mynode-[001,003-007] ./distributed_nhc.sb.sh sbatch -F mynodelist.txt ./distributed_nhc.sb.sh -sbatch -N35 --partition=gspe ./distributed_nhc.sb.sh +sbatch -N35 --partition=p1 ./distributed_nhc.sb.sh +``` ### Slurm Logs All logs will end up in the logs directory at /work/distributed_nhc/logs provided the setup instructions have been followed. @@ -43,6 +41,8 @@ There are three log files produced. 2. distributed_nhc-{jobid}.err is the active standard error. 3. distributed_nhc-{jobid}-{timestamp}.health.log is the results of the health check. This file will only appear upon completion of the health check. See the Interpreting NHC Results section below for details on the health.log files. +4. 
distributed_nhc-{jobid}-{timestamp}.debug.log is the very verbose debug logs. This file will only appear upon completion of the health check provided that -V/--verbose was set. + See the Interpreting NHC Results section below. ## Parallel-SSH Running with parallel-ssh by invoking distributed_nhc.sb.sh directly is the preferred approach to test drained nodes or nodes otherwise unreachable by slurm, as well as spot checking smaller sets of nodes. @@ -50,36 +50,43 @@ When doing so, the node lists can be specified like sbatch using either -w or -F The script itself also supports the –help argument. ### Parallel-SSH Execution Examples -./distributed_nhc.sb.sh -w monument-[001,003-007,12,018-023] +``` +./distributed_nhc.sb.sh -w mynode-[001,003-007,12,018-023] ./distributed_nhc.sb.sh -F mynodelist.txt +``` ### Parallel-SSH Logs -All logs will end up in the logs directory at /work/distributed_nhc/logs provided the setup instructions have been followed. +All logs will end up in the logs directory at ./distributed_nhc/logs provided the setup instructions have been followed. There are three log files produced. 1. distributed_nhc-pssh-{timestamp}.out is the active standard output. You can tail -f this file to see activity. 2. distributed_nhc-pssh-{timestamp}.err is the active standard error. 3. distributed_nhc-pssh-{timestamp}.health.log is the results of the health check. This file will only appear upon completion of the health check. See the Interpreting NHC Results section below. +4. distributed_nhc-pssh-{timestamp}.debug.log is the very verbose debug logs. This file will only appear upon completion of the health check provided that -V/--verbose was set. + See the Interpreting NHC Results section below. ## Customizing Test Set You may want to rerun a specific test or subset of tests on problematic nodes rather than rerunning the entire health check. 
-To do so, make a copy of /work/distributed_nhc/conf/nd96isr_h100_v5.conf -cp /work/distributed_nhc/conf/nd96isr_h100_v5.conf /work/distributed_nhc/mytests.conf +To do so, make a copy of ../distributed_nhc/conf/nd96isr_h100_v5.conf +cp ./distributed_nhc/conf/nd96isr_h100_v5.conf ./distributed_nhc/mytests.conf In this file you can comment out any test with # at the beginning of the line. The execute the customized conf file you must specify it as an argument to distributed_nhc.sb.sh with the -c argument. -Modifying and saving the conf file /work/distributed_nhc/conf/nd96isr_h100_v5.conf will not work unless you explicitly pass it as an argument with -c +Modifying and saving the conf file ./distributed_nhc/conf/nd96isr_h100_v5.conf will not work unless you explicitly pass it as an argument with -c For example: -sbatch -w monument-[001,003-007] ./distributed_nhc.sb.sh -c ./mytests.conf -./distributed_nhc.sb.sh -w monument-[001,003-007,12,018-023] -c ./mytests.conf -./distributed_nhc.sb.sh -w monument-[001,003-007,12,018-023] -c /work/distributed_nhc/conf/nd96isr_h100_v5.conf +``` +sbatch -w mynode-[001,003-007] ./distributed_nhc.sb.sh -c ./mytests.conf +./distributed_nhc.sb.sh -w mynode-[001,003-007,12,018-023] -c ./mytests.conf +./distributed_nhc.sb.sh -w mynode-[001,003-007,12,018-023] -c ../conf/nd96isr_h100_v5.conf +``` Please note, if you modify the .conf file to run a limited set of tests, a node that reports Healthy only means it has passed that specific subset of tests. The node could still be unhealthy and fail a test you have not run. Always verify a node is healthy by running the entire suite of tests (simply by not specifying -c) ## Interpreting NHC Results The resulting health log file shows the health report for every node tested against. 
-The health logs appear in the /work/distributed_nhc/logs directory with the extension .health.log +The health logs appear in the ./distributed_nhc/logs directory with the extension .health.log +The debug logs appear in the ./distributed_nhc/logs directory with the extension .debug.log ### Health Results The health results per node found in the .health.log files are always formatted as {hostname} | {health result} @@ -103,13 +110,100 @@ The health results per node found in the .health.log files are always formatted * Every node leaves a report of their own execution of NHC in the ~/onetouch_nhc/working directory. * Check the most recent .out and .err file to debug issues like this. -### Example Logs from Real Tests +#### Example Logs from Real Tests Below is a sample of real outputs from .health.log files showing a variety of results. -monument-042 | Healthy -monument-066 | ERROR: nhc: Health check failed: check_gpu_xid: GPU Xid errors detected: [ 3606.832215] NVRM: Xid (PCI:0002:00:00): 119, pid=67769, name=nvidia-smi, Timeout waiting for RPC from GSP1! Expected function 76 (GSP_RM_CONTROL) (0x2080014b 0x5). -monument-079 | ERROR: nhc: Health check failed: check_nccl_allreduce_ib_loopback: NCCL allreduce, BUS BW (expected >=40.0 GB/s, but measured 20.7377 GB/s) -monument-084 | ERROR: nhc: Health check failed: Bandwidth is low on device 1. Reported bandwidth is 7 GB/s. -monument-108 | ERROR: No results reported -monument-123 | ERROR: nhc: Health check failed: Bandwidth is low on device 3. Reported bandwidth is 27 GB/s. -monument-273 | ERROR: nhc: Health check failed: check_hw_ib: No IB port mlx5_ib7:1 is ACTIVE (LinkUp 400 Gb/sec). 
-monument-510 | ERROR: nhc: Health check failed: check_gpu_ecc: GPU id 3: SRAM Uncorrectable ECC error count detected, (0,1) +``` +mynode-042 | Healthy +mynode-066 | ERROR: nhc: Health check failed: check_gpu_xid: GPU Xid errors detected: [ 3606.832215] NVRM: Xid (PCI:0002:00:00): 119, pid=67769, name=nvidia-smi, Timeout waiting for RPC from GSP1! Expected function 76 (GSP_RM_CONTROL) (0x2080014b 0x5). +mynode-079 | ERROR: nhc: Health check failed: check_nccl_allreduce_ib_loopback: NCCL allreduce, BUS BW (expected >=40.0 GB/s, but measured 20.7377 GB/s) +mynode-084 | ERROR: nhc: Health check failed: Bandwidth is low on device 1. Reported bandwidth is 7 GB/s. +mynode-108 | ERROR: No results reported +mynode-123 | ERROR: nhc: Health check failed: Bandwidth is low on device 3. Reported bandwidth is 27 GB/s. +mynode-273 | ERROR: nhc: Health check failed: check_hw_ib: No IB port mlx5_ib7:1 is ACTIVE (LinkUp 400 Gb/sec). +mynode-510 | ERROR: nhc: Health check failed: check_gpu_ecc: GPU id 3: SRAM Uncorrectable ECC error count detected, (0,1) +``` + +### Debug Results +If ./distributed_nhc.sb.sh was ran with -V/--verbose then a .debug.log file will be produced. +This file contains dense, detailed logs of every NHC execution, including for nodes which report Healthy. +This can be useful as it also contains measured bandwidths and extra information about the NHC run. + +#### Example Logs from Real Tests +Below is a sample of potentially useful information from a .debug.log file. 
+``` +mynode-411 | Device 0 Device to Host reported bandwidth is 55 GB/s +mynode-411 | Device 0 Host to Device reported bandwidth is 55 GB/s +mynode-411 | Device 1 Device to Host reported bandwidth is 55 GB/s +mynode-411 | Device 1 Host to Device reported bandwidth is 55 GB/s +mynode-411 | Device 2 Device to Host reported bandwidth is 55 GB/s +mynode-411 | Device 2 Host to Device reported bandwidth is 55 GB/s +mynode-411 | Device 3 Device to Host reported bandwidth is 55 GB/s +mynode-411 | Device 3 Host to Device reported bandwidth is 55 GB/s +mynode-411 | Device 4 Device to Host reported bandwidth is 55 GB/s +mynode-411 | Device 4 Host to Device reported bandwidth is 55 GB/s +mynode-411 | Device 5 Device to Host reported bandwidth is 55 GB/s +mynode-411 | Device 5 Host to Device reported bandwidth is 55 GB/s +mynode-411 | Device 6 Device to Host reported bandwidth is 55 GB/s +mynode-411 | Device 6 Host to Device reported bandwidth is 55 GB/s +mynode-411 | Device 7 Device to Host reported bandwidth is 55 GB/s +mynode-411 | Device 7 Host to Device reported bandwidth is 55 GB/s +mynode-411 | IB devices=mlx5_ib0, mlx5_ib4: numa domains=0,1, Measured IB BW 390.98 Gbps +mynode-411 | IB devices=mlx5_ib1, mlx5_ib5: numa domains=0,1, Measured IB BW 390.98 Gbps +mynode-411 | IB devices=mlx5_ib2, mlx5_ib6: numa domains=0,1, Measured IB BW 390.99 Gbps +mynode-411 | IB devices=mlx5_ib3, mlx5_ib7: numa domains=0,1, Measured IB BW 390.98 Gbps +mynode-411 | Measured Avg NCCL allreduce bus BW 477.873 GB/s (expected >=460.0 GB/s) +mynode-411 | NCCL allreduce IB loopback bandwidth 46.3965 GB/s +``` + +## Kusto Export +There are two methods to export health and debug logs to Kusto. + +### Manual Export +The script ./distributed_nhc/export_nhc_result_to_kusto.py can be used to manually export health and debug logs to Kusto. 
+ +Its requirements can be installed with +``` +pip3 install -r ./distributed_nhc/requirements.txt +``` + +#### Example Usage +``` +User Assigned Managed Identity +python3 ./export_nhc_result_to_kusto.py --ingest_url https://ingest-.kusto.windows.net --database mydatabase --identity client_id -- my.health.log my.debug.log + +System Assigned Managed Identity +python3 ./export_nhc_result_to_kusto.py --ingest_url https://ingest-.kusto.windows.net --database mydatabase --identity -- my.health.log my.debug.log + +Default Azure Credentials +python3 ./export_nhc_result_to_kusto.py --ingest_url https://ingest-.kusto.windows.net --database mydatabase -- my.health.log my.debug.log + +Specifying Custom Table Names +python3 ./export_nhc_result_to_kusto.py --ingest_url https://ingest-.kusto.windows.net --database mydatabase --health_table_name MyHealthTable --debug_table_name MyDebugTable -- my.health.log my.debug.log +``` + +### Automated Export +./distributed_nhc.sb.sh supports similar arguments as ./export_nhc_result_to_kusto.py to automatically export health and debug logs to Kusto. + +If an ingest_url is specified, the health and debug logs will be automatically exported to Kusto upon completion of the health check. Additionally the prerequisites for ./export_nhc_result_to_kusto.py will be installed. 
+ +#### Example Usage +``` +User Assigned Managed Identity +./distributed_nhc.sb.sh -w mynode --kusto-export-url https://ingest-.kusto.windows.net --kusto-database mydatabase --kusto-identity client_id + +System Assigned Managed Identity +./distributed_nhc.sb.sh -w mynode --kusto-export-url https://ingest-.kusto.windows.net --kusto-database mydatabase --kusto-identity + +Default Azure Credentials +./distributed_nhc.sb.sh -w mynode --kusto-export-url https://ingest-.kusto.windows.net --kusto-database mydatabase + +Specifying Custom Table Names +./distributed_nhc.sb.sh -w mynode --kusto-export-url https://ingest-.kusto.windows.net --kusto-database mydatabase --kusto-health-table MyHealthTable --kusto-debug-table MyDebugTable +``` + +### Table Schema +The default table names and their CSL schemas for the health and debug tables are as follows +``` +NodeHealthCheck: Timestamp:datetime,JobName:string,Hostname:string,Healthy:bool,RawResult:string +NodeHealthCheck_Debug: Timestamp:datetime,JobName:string,Hostname:string,DebugLog:string +``` \ No newline at end of file From 06114325109659ce3c260b56d46aeab436fcfc15 Mon Sep 17 00:00:00 2001 From: Michael Williams Date: Wed, 6 Sep 2023 13:54:34 -0700 Subject: [PATCH 55/58] pr feedback --- README.md | 2 +- conf/nd96isr_h100_v5.conf | 1 - 2 files changed, 1 insertion(+), 2 deletions(-) diff --git a/README.md b/README.md index c68ab82..85092a7 100644 --- a/README.md +++ b/README.md @@ -39,7 +39,7 @@ Usage - [Azure HPC Images](https://github.com/Azure/azhpc-images) ## Distributed NHC -AzureHPC Node Health Check also comes bundled with a distributed version of NHC. This version of NHC is designed to run on a cluster of machines and report back to a central location. This is useful for running health checks on a large cluster with dozens or hundreds of nodes. +AzureHPC Node Health Checks also comes bundled with a distributed version of NHC, which is designed to run on a cluster of machines and report back to a central location. 
This is useful for running health checks on a large cluster with dozens or hundreds of nodes. See [Distributed NHC](./distributed-nhc/README.md) for more information. diff --git a/conf/nd96isr_h100_v5.conf b/conf/nd96isr_h100_v5.conf index 3d34a2a..9037ca3 100644 --- a/conf/nd96isr_h100_v5.conf +++ b/conf/nd96isr_h100_v5.conf @@ -35,7 +35,6 @@ * || check_hw_eth ib6 * || check_hw_eth ib7 * || check_hw_eth docker0 -# * || check_vbios_version 96.00.74.00.01 * || check_hw_topology /opt/microsoft/ndv5-topo.xml ####################################################################### From 208e316fda44fecca16fa0edba95006f639bf4d3 Mon Sep 17 00:00:00 2001 From: Michael Williams Date: Thu, 7 Sep 2023 14:27:10 -0700 Subject: [PATCH 56/58] use lstopo-no-graphics, add installation --- customTests/azure_hw_topology_check.nhc | 2 +- customTests/custom-test-setup.sh | 23 ++++++++++++++++------- 2 files changed, 17 insertions(+), 8 deletions(-) diff --git a/customTests/azure_hw_topology_check.nhc b/customTests/azure_hw_topology_check.nhc index 02ea8fe..7158c20 100644 --- a/customTests/azure_hw_topology_check.nhc +++ b/customTests/azure_hw_topology_check.nhc @@ -26,7 +26,7 @@ function load_expected_topology() { } function get_actual_topology() { - raw_topo=$(lstopo) + raw_topo=$(lstopo-no-graphics) numaid="0" echo "$raw_topo" | while read line; do diff --git a/customTests/custom-test-setup.sh b/customTests/custom-test-setup.sh index 124589f..7d8b5a5 100755 --- a/customTests/custom-test-setup.sh +++ b/customTests/custom-test-setup.sh @@ -40,12 +40,9 @@ function install_perf_test(){ fi pushd ${EXE_DIR} - perftest_dir="perftest-${VERSION}" - mkdir -p ${EXE_DIR}/${perftest_dir} - archive_url="https://github.com/linux-rdma/perftest/releases/download/v${VERSION}/perftest-${VERSION}.${VERSION_HASH}.tar.gz" - wget -q -O - $archive_url | tar -xz --strip=1 -C ${EXE_DIR}/${perftest_dir} - - pushd ${perftest_dir} + wget 
https://github.com/linux-rdma/perftest/releases/download/v${VERSION}/perftest-${VERSION}.${VERSION_HASH}.tar.gz + tar xvf perftest-${VERSION}.${VERSION_HASH}.tar.gz + pushd perftest-4.5 if [[ "$type" == "cuda" ]]; then ./configure CUDA_H_PATH=/usr/local/cuda/include/cuda.h else @@ -54,8 +51,10 @@ function install_perf_test(){ fi make + rm ${EXE_DIR}/perftest-${VERSION}.${VERSION_HASH}.tar.gz popd popd + } @@ -124,8 +123,18 @@ else fi +# Ensure lstopo-no-graphics is installed for the azure_hw_topology_check.nhc +distro=`awk -F= '/^NAME/{print $2}' /etc/os-release` +if [[ $distro =~ "Ubuntu" ]]; then + apt-get install -y hwloc +elif [[ $distro =~ "AlmaLinux" ]]; then + dnf install -y hwloc +else + echo "OS version is not supported, azure_hw_topology_check will not work." + return 1 +fi + # copy all custom test to the nhc scripts dir -echo "Copying *.nhc from $SRC_DIR to /etc/nhc/scripts" cp $SRC_DIR/*.nhc /etc/nhc/scripts exit 0 From 0cc5b831b011b1d9c40aaa74d8aa4ddc38683c00 Mon Sep 17 00:00:00 2001 From: Michael Williams Date: Fri, 8 Sep 2023 12:59:35 -0700 Subject: [PATCH 57/58] fix bad merge --- customTests/custom-test-setup.sh | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) diff --git a/customTests/custom-test-setup.sh b/customTests/custom-test-setup.sh index 7d8b5a5..1e6088f 100755 --- a/customTests/custom-test-setup.sh +++ b/customTests/custom-test-setup.sh @@ -40,9 +40,12 @@ function install_perf_test(){ fi pushd ${EXE_DIR} - wget https://github.com/linux-rdma/perftest/releases/download/v${VERSION}/perftest-${VERSION}.${VERSION_HASH}.tar.gz - tar xvf perftest-${VERSION}.${VERSION_HASH}.tar.gz - pushd perftest-4.5 + perftest_dir="perftest-${VERSION}" + mkdir -p ${EXE_DIR}/${perftest_dir} + archive_url="https://github.com/linux-rdma/perftest/releases/download/v${VERSION}/perftest-${VERSION}.${VERSION_HASH}.tar.gz" + wget -q -O - $archive_url | tar -xz --strip=1 -C ${EXE_DIR}/${perftest_dir} + + pushd ${perftest_dir} if [[ "$type" == "cuda" ]]; 
then ./configure CUDA_H_PATH=/usr/local/cuda/include/cuda.h else @@ -51,10 +54,8 @@ function install_perf_test(){ fi make - rm ${EXE_DIR}/perftest-${VERSION}.${VERSION_HASH}.tar.gz popd popd - } From 836583f92d7c9f463f01a94f3c69f1e0f7a8c08f Mon Sep 17 00:00:00 2001 From: Michael Williams Date: Thu, 14 Sep 2023 14:32:10 -0700 Subject: [PATCH 58/58] adding extra empty lines --- .gitignore | 2 +- customTests/azure_gpu_count.nhc | 2 +- customTests/azure_gpu_vbios.nhc | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/.gitignore b/.gitignore index 331dbed..ec70902 100644 --- a/.gitignore +++ b/.gitignore @@ -3,4 +3,4 @@ lbnl-nhc-1.4.3/ *.deb *stream.c *health.log -.vscode \ No newline at end of file +.vscode diff --git a/customTests/azure_gpu_count.nhc b/customTests/azure_gpu_count.nhc index 26cdf31..f5796e7 100644 --- a/customTests/azure_gpu_count.nhc +++ b/customTests/azure_gpu_count.nhc @@ -5,4 +5,4 @@ function check_gpu_count() { if [ "$gpu_count" -ne "$1" ]; then die 1 "$FUNCNAME: Expected to see $EXPECTED_NUM_GPU but found $gpu_count" fi -} \ No newline at end of file +} diff --git a/customTests/azure_gpu_vbios.nhc b/customTests/azure_gpu_vbios.nhc index e871c23..d77c9a2 100644 --- a/customTests/azure_gpu_vbios.nhc +++ b/customTests/azure_gpu_vbios.nhc @@ -8,4 +8,4 @@ function check_vbios_version() { elif ! echo "${uniq_vbios_versions[@]}" | grep -qw "$expected_version"; then die 1 "$FUNCNAME: GPU VBIOS version does not match the expected '$expected_version', instead got '${uniq_vbios_versions[@]}'" fi -} \ No newline at end of file +}