diff --git a/conf/nc24rs_v3.conf b/conf/nc24rs_v3.conf
index 1ac34969..65426f2a 100644
--- a/conf/nc24rs_v3.conf
+++ b/conf/nc24rs_v3.conf
@@ -24,8 +24,6 @@
  * || check_hw_swap 0kB 0kB 3%
  * || check_hw_eth eth0
  * || check_hw_eth lo
- * || check_hw_ib 40 mlx4_0:1
- * || check_hw_eth ib0
 
 ########################################################
diff --git a/customTests/azure_gpu_bandwidth.nhc b/customTests/azure_gpu_bandwidth.nhc
index 38475d94..d3461405 100644
--- a/customTests/azure_gpu_bandwidth.nhc
+++ b/customTests/azure_gpu_bandwidth.nhc
@@ -86,11 +86,17 @@ function evaluate_nvBW_result(){
             continue
         fi
         if (( $(echo "$gpubw >= $EXP_BW" | bc -l) )); then
-            dbg "${tests_map[$test]} test on GPU $gpu_device passed. Bandwidth $gpubw is greater than $EXP_BW"
+            if [ "$test" = "$P2P" ]; then
+                dbg "${tests_map[$test]}_GPU_${gpu_device}_${peer_device}: $gpubw GB/s"
+            else
+                dbg "${tests_map[$test]}_GPU_${gpu_device}: $gpubw GB/s"
+            fi
         else
             if [ "$test" = "$P2P" ]; then
+                dbg "${tests_map[$test]}_GPU_${gpu_device}_${peer_device}: $gpubw GB/s"
                 die 1 "check_gpu_bw: ${tests_map[$test]} test on GPU $gpu_device to GPU $peer_device failed. Bandwidth $gpubw is less than $EXP_BW"
             else
+                dbg "${tests_map[$test]}_GPU_${gpu_device}: $gpubw GB/s"
                 die 1 "check_gpu_bw: ${tests_map[$test]} test on GPU $gpu_device failed. Bandwidth $gpubw is less than $EXP_BW"
             fi
         fi
diff --git a/customTests/azure_ib_write_bw_gdr.nhc b/customTests/azure_ib_write_bw_gdr.nhc
index 91123471..50722cdd 100644
--- a/customTests/azure_ib_write_bw_gdr.nhc
+++ b/customTests/azure_ib_write_bw_gdr.nhc
@@ -57,7 +57,8 @@ function run_ib_bw_gdr(){
             break
         fi
     done
-    if [[ $ib_bandwidth < $EXP_IB_BW ]]; then
+    dbg "ib_write_lb_${device}: $ib_bandwidth Gbps"
+    if (( $(echo "$ib_bandwidth < $EXP_IB_BW" | bc -l) )); then
         log "$IB_WRITE_BW_OUT2"
         die 1 "$FUNCNAME: $IB_WRITE_BW, IB=$device, $device_peer, IB BW (expected > $EXP_IB_BW Gbps, but measured $ib_bandwidth Gbps"
         return 1
diff --git a/customTests/azure_ib_write_bw_non_gdr.nhc b/customTests/azure_ib_write_bw_non_gdr.nhc
index f41a612d..85f4a10d 100644
--- a/customTests/azure_ib_write_bw_non_gdr.nhc
+++ b/customTests/azure_ib_write_bw_non_gdr.nhc
@@ -76,7 +76,7 @@ function ib_write(){
         die 1 "check_ib_bw_non_gdr: IB=$device, IB BW (expected > $EXP_IB_BW Gbps, but measured $ib_bandwidth Gbps)"
         return 1
     fi
-    dbg "$FUNCNAME: IB device=$device: Measured IB BW $ib_bandwidth Gbps"
+    dbg "ib_write_lb_${device}: $ib_bandwidth Gbps"
     return 0
 }
diff --git a/customTests/azure_nccl_allreduce.nhc b/customTests/azure_nccl_allreduce.nhc
index 47b13b80..3386c231 100644
--- a/customTests/azure_nccl_allreduce.nhc
+++ b/customTests/azure_nccl_allreduce.nhc
@@ -56,9 +56,8 @@ function check_nccl_allreduce() {
             break
         fi
     done
-
-    if [[ $avg_bus_bw < $EXP_NCCL_ALLREDUCE_BW ]]
-    then
+    dbg "nccl_all_red: $avg_bus_bw GB/s"
+    if (( $(echo "$avg_bus_bw < $EXP_NCCL_ALLREDUCE_BW" | bc -l) )); then
         dbg "$nccl_allreduce_out"
         dbg "Iteration ${iter} of ${REPEATS} failed: NCCL allreduce bandwidth $avg_bus_bw GB/s < $EXP_NCCL_ALLREDUCE_BW GB/s"
     else
diff --git a/customTests/azure_nccl_allreduce_ib_loopback.nhc b/customTests/azure_nccl_allreduce_ib_loopback.nhc
index 9c143e04..b78219c5 100644
--- a/customTests/azure_nccl_allreduce_ib_loopback.nhc
+++ b/customTests/azure_nccl_allreduce_ib_loopback.nhc
@@ -50,8 +50,8 @@ function check_nccl_allreduce_ib_loopback() {
             break
         fi
     done
-
-    if [[ $avg_bus_bw < $EXP_NCCL_ALLREDUCE_IB_LOOPBACK_BW ]]
+    dbg "nccl_all_red_lb: $avg_bus_bw GB/s"
+    if (( $(echo "$avg_bus_bw < $EXP_NCCL_ALLREDUCE_IB_LOOPBACK_BW" | bc -l) )); then
         dbg "$nccl_allreduce_ib_loopback_out"
         dbg "Iteration ${iter} of ${REPEATS} failed: NCCL allreduce IB loopback bandwidth $avg_bus_bw GB/s < $EXP_NCCL_ALLREDUCE_IB_LOOPBACK_BW GB/s"
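A note on the recurring change in the shell checks above: bash's [[ $a < $b ]] compares strings lexicographically, so a measured "9.5" sorts above an expected "100.2" and the threshold test gives the wrong answer for many float pairs. The new form pipes the expression through bc -l, which prints 1 or 0, and (( ... )) then branches on that number. A standalone sketch of the difference, not part of the patch:

    # String comparison: lexicographic, so "100.2" < "9.5" is (wrongly) true
    [[ "100.2" < "9.5" ]] && echo "string compare says 100.2 < 9.5"

    # Numeric comparison: bc -l prints 1 (true) or 0 (false), and (( ... ))
    # evaluates that digit arithmetically, so 100.2 < 9.5 is correctly false
    measured="100.2"; threshold="9.5"
    if (( $(echo "$measured < $threshold" | bc -l) )); then
        echo "below threshold"
    else
        echo "at or above threshold"    # this branch is taken
    fi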
$EXP_NCCL_ALLREDUCE_IB_LOOPBACK_BW" | bc -l) )); then dbg "$nccl_allreduce_ib_loopback_out" dbg "Iteration ${iter} of ${REPEATS} failed: NCCL allreduce IB loopback bandwidth $avg_bus_bw GB/s < $EXP_NCCL_ALLREDUCE_IB_LOOPBACK_BW GB/s" diff --git a/distributed_nhc/export_nhc_result_to_kusto.py b/distributed_nhc/export_nhc_result_to_kusto.py index ce8d75d5..10b4c30a 100644 --- a/distributed_nhc/export_nhc_result_to_kusto.py +++ b/distributed_nhc/export_nhc_result_to_kusto.py @@ -1,6 +1,9 @@ #!/usr/bin/python3 import sys import os +import json +import re +import subprocess from datetime import datetime from csv import DictReader from argparse import ArgumentParser @@ -45,7 +48,7 @@ def ingest_debug_log(debug_file, creds, ingest_url, database, debug_table_name): if job_name == "pssh": job_name = f"{job_name}-{ts_str}" - with open(health_file, 'r') as f: + with open(debug_file, 'r') as f: lines = f.readlines() reader = DictReader(lines, fieldnames = ["Hostname", "DebugLog"], delimiter='|', restkey="extra") @@ -60,6 +63,162 @@ def ingest_debug_log(debug_file, creds, ingest_url, database, debug_table_name): print(f"Ingesting health results from {os.path.basename(debug_file)} into {ingest_url} at {database}/{debug_table_name}") ingest_client.ingest_from_dataframe(df, IngestionProperties(database, debug_table_name)) +def run_command(cmd): + result = subprocess.run(cmd, capture_output=True, shell=True, text=True) + return result.stdout.strip() + +def get_nhc_json_formatted_result(results_file): + def natural_sort_key(s): + return [int(text) if text.isdigit() else text.lower() for text in re.split(r'(\d+)', s)] + + # check if GPU or CPU + processor_cmd = f"lspci | grep -iq NVIDIA" # if not empty, then GPU + processor_str = run_command(processor_cmd) + + processor = "GPU" if processor_str else "CPU" + + if processor == "GPU": + ib_write_lb_mlx5_ib_cmd = f"cat {results_file} | grep -o 'ib_write_lb_mlx5_ib[0-7]: .*'" + ib_write_lb_mlx5_ib_str = run_command(ib_write_lb_mlx5_ib_cmd) + ib_write_lb_mlx5_ib_str = sorted(ib_write_lb_mlx5_ib_str.strip().split("\n"), key=natural_sort_key) + ib_write_lb_mlx5_ib_str = '\n'.join(ib_write_lb_mlx5_ib_str) # convert to string + + H2D_GPU_cmd = f"cat {results_file} | grep -o 'H2D_GPU_[0-7]: .*'" + H2D_GPU_str = run_command(H2D_GPU_cmd) + + D2H_GPU_cmd = f"cat {results_file} | grep -o 'D2H_GPU_[0-7]: .*'" + D2H_GPU_str = run_command(D2H_GPU_cmd) + + P2P_GPU_cmd = f"cat {results_file} | grep -o 'P2P_GPU_[0-7]_[0-7]: .*'" + P2P_GPU_str = run_command(P2P_GPU_cmd) + + nccl_all_red_cmd = f"cat {results_file} | grep -o 'nccl_all_red: .*'" + nccl_all_red_str = run_command(nccl_all_red_cmd) + + nccl_all_red_lb_cmd = f"cat {results_file} | grep -o 'nccl_all_red_lb: .*'" + nccl_all_red_lb_str = run_command(nccl_all_red_lb_cmd) + + data_string = "\n".join([ib_write_lb_mlx5_ib_str, H2D_GPU_str, D2H_GPU_str, P2P_GPU_str, nccl_all_red_str, nccl_all_red_lb_str]) + data_string = os.linesep.join([s for s in data_string.splitlines() if s]) # remove empty lines + result = {"IB_WRITE_GDR": {}, "GPU_BW_HTD": {}, "GPU_BW_DTH": {}, "GPU_BW_P2P": {}, "NCCL_ALL_REDUCE": {}, "NCCL_ALL_REDUCE_LOOP_BACK": {}} + + # Split the string by lines and create key-value pairs + for line in data_string.strip().split("\n"): + if not line or line.isspace(): + continue + key, value = line.split(":") + if key.startswith("ib_write_lb_mlx5_ib"): + result["IB_WRITE_GDR"][key] = str(value.strip()) + elif key.startswith("H2D"): + result["GPU_BW_HTD"][key] = str(value.strip()) + elif key.startswith("D2H"): + 
result["GPU_BW_DTH"][key] = str(value.strip()) + elif key.startswith("P2P"): + result["GPU_BW_P2P"][key] = str(value.strip()) + elif key.startswith("nccl_all_red_lb"): + result["NCCL_ALL_REDUCE_LOOP_BACK"] = str(value.strip()) + elif key.startswith("nccl_all_red"): + result["NCCL_ALL_REDUCE"] = str(value.strip()) + + else: # processor == "CPU" + ib_write_lb_mlx5_ib_cmd = f"cat {results_file} | grep -o 'ib_write_lb_mlx5_ib[0-7]: .*'" + ib_write_lb_mlx5_ib_str = run_command(ib_write_lb_mlx5_ib_cmd) + ib_write_lb_mlx5_ib_str = sorted(ib_write_lb_mlx5_ib_str.strip().split("\n"), key=natural_sort_key) + ib_write_lb_mlx5_ib_str = '\n'.join(ib_write_lb_mlx5_ib_str) # convert to string + + stream_Copy_cmd = f"cat {results_file} | grep -o 'stream_Copy: .*'" + stream_Copy_str = run_command(stream_Copy_cmd) + + stream_Add_cmd = f"cat {results_file} | grep -o 'stream_Add: .*'" + stream_Add_str = run_command(stream_Add_cmd) + + stream_Scale_cmd = f"cat {results_file} | grep -o 'stream_Scale: .*'" + stream_Scale_str = run_command(stream_Scale_cmd) + + stream_Triad_cmd = f"cat {results_file} | grep -o 'stream_Triad: .*'" + stream_Triad_str = run_command(stream_Triad_cmd) + + data_string = "\n".join([ib_write_lb_mlx5_ib_str, stream_Copy_str, stream_Add_str, stream_Scale_str, stream_Triad_str]) + data_string = os.linesep.join([s for s in data_string.splitlines() if s]) # remove empty lines + result = {"IB_WRITE_NON_GDR": {}, "stream_Copy": {}, "stream_Add": {}, "stream_Scale": {}, "stream_Triad": {}} + + # Split the string by lines and create key-value pairs + for line in data_string.strip().split("\n"): + if not line or line.isspace(): + continue + key, value = line.split(":") + if key.startswith("ib_write_lb_mlx5_ib"): + result["IB_WRITE_NON_GDR"][key] = str(value.strip()) + elif key.startswith("stream_Copy"): + result["stream_Copy"]= str(value.strip()) + elif key.startswith("stream_Add"): + result["stream_Add"]= str(value.strip()) + elif key.startswith("stream_Scale"): + result["stream_Scale"]= str(value.strip()) + elif key.startswith("stream_Triad"): + result["stream_Triad"]= str(value.strip()) + + return result + +def ingest_results(results_file, creds, ingest_url, database, results_table_name, nhc_run_uuid="None"): + ts = datetime.now().strftime("%Y-%m-%d_%H-%M-%S") + + job_name = results_file.replace("\\", "/").split(".")[0].split("/")[-1] # account for \ or / in path + uuid = job_name if nhc_run_uuid == "None" else f"{nhc_run_uuid}-{job_name}" + if uuid == "health": + uuid = "" + else : + uuid = "-" + uuid # add the dash here instead of below; this way if 'uuid' is empty, we don't have a trailing dash + full_uuid = f"nhc-{ts}{uuid}" + + vmSize_bash_cmd = "echo $( curl -H Metadata:true --max-time 10 -s \"http://169.254.169.254/metadata/instance/compute/vmSize?api-version=2021-01-01&format=text\") | tr '[:upper:]' '[:lower:]' " + vmSize = run_command(vmSize_bash_cmd) + + vmId_bash_cmd = "curl -H Metadata:true --max-time 10 -s \"http://169.254.169.254/metadata/instance/compute/vmId?api-version=2021-02-01&format=text\"" + vmId = run_command(vmId_bash_cmd) + + vmName_bash_cmd = "hostname" + vmName = run_command(vmName_bash_cmd) + + physhost = run_command("echo $(hostname) \"$(/opt/azurehpc/tools/kvp_client | grep Fully)\" | cut -d ':' -f 3 | cut -d ' ' -f 2 | sed 's/\"//g'") + if not physhost: + physhost = "not Mapped" + + with open(results_file, 'r') as f: + full_results = f.read() + jsonResultDict = get_nhc_json_formatted_result(results_file) + jsonResult = json.dumps(jsonResultDict) + + record = { + 
diff --git a/dockerfile/azure-nvrt-nhc.dockerfile b/dockerfile/azure-nvrt-nhc.dockerfile
index 556e0a29..6c43aafa 100644
--- a/dockerfile/azure-nvrt-nhc.dockerfile
+++ b/dockerfile/azure-nvrt-nhc.dockerfile
@@ -51,12 +51,6 @@ RUN cd /tmp && \
     MLNX_OFED_LINUX-${OFED_VERSION}-ubuntu22.04-x86_64/mlnxofedinstall --user-space-only --without-fw-update --without-ucx-cuda --force --all && \
     rm -rf /tmp/MLNX_OFED_LINUX*
 
-# install clang dependency
-RUN cd /tmp && \
-    wget https://download.amd.com/developer/eula/aocc-compiler/aocc-compiler-${AOCC_VERSION}_amd64.deb && \
-    apt install -y ./aocc-compiler-${AOCC_VERSION}_amd64.deb && \
-    rm aocc-compiler-${AOCC_VERSION}_amd64.deb
-
 # Install HPCx
 RUN cd /tmp && \
     TARBALL="hpcx-${HPCX_VERSION}-gcc-mlnx_ofed-ubuntu22.04-cuda12-gdrcopy2-nccl2.18-x86_64.tbz" && \
@@ -104,6 +98,12 @@ COPY customTests/*.nhc /etc/nhc/scripts/
 COPY customTests/topofiles ${AZ_NHC_ROOT}/topofiles
 COPY conf ${AZ_NHC_ROOT}/default/conf
 
+# install clang dependency needed for stream
+RUN cd /tmp && \
+    wget https://download.amd.com/developer/eula/aocc-compiler/aocc-compiler-${AOCC_VERSION}_amd64.deb && \
+    apt install -y ./aocc-compiler-${AOCC_VERSION}_amd64.deb && \
+    rm aocc-compiler-${AOCC_VERSION}_amd64.deb
+
 # Install stream
 RUN mkdir -p /tmp/stream
 COPY customTests/stream/Makefile /tmp/stream/
@@ -112,6 +112,10 @@
 wget https://www.cs.virginia.edu/stream/FTP/Code/stream.c && \
 make all CC=/opt/AMD/aocc-compiler-4.0.0/bin/clang EXEC_DIR=${AZ_NHC_ROOT}/bin && \
 rm -rf /tmp/stream
+
+# Remove AOCC after STREAM build
+RUN version=$(echo "$AOCC_VERSION" | sed 's/_1$//') && \
+apt remove aocc-compiler-"${version}" -y
 
 # Copy necessary files
 COPY customTests/*.nhc /etc/nhc/scripts/
 COPY customTests/topofiles ${AZ_NHC_ROOT}/topofiles
diff --git a/run-health-checks.sh b/run-health-checks.sh
index 862ca625..2ad2a8a9 100755
--- a/run-health-checks.sh
+++ b/run-health-checks.sh
@@ -169,7 +169,7 @@ DOCKER_RUN_ARGS="--name=$DOCK_CONT_NAME --net=host -e TIMEOUT=$TIMEOUT \
 -v /sys:/hostsys/ \
 -v $CONF_FILE:"$DOCK_CONF_PATH/aznhc.conf" \
 -v $OUTPUT_PATH:$WORKING_DIR/output/aznhc.log \
--v ${kernel_log}:$WORKING_DIR/syslog
+-v ${kernel_log}:$WORKING_DIR/syslog \
 -v ${AZ_NHC_ROOT}/customTests:$WORKING_DIR/customTests"
 
 sudo docker run ${DOCKER_RUN_ARGS} -e NHC_ARGS="${NHC_ARGS}" "${DOCK_IMG_NAME}" bash -c "$WORKING_DIR/aznhc-entrypoint.sh"
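Two things worth noting in the hunks above. The run-health-checks.sh change restores a trailing backslash, without which DOCKER_RUN_ARGS ended at the syslog mount and the customTests mount line was parsed as a separate (broken) command. And because each Dockerfile RUN creates its own layer, removing AOCC in a later RUN hides the files from the final filesystem but does not shrink the image; folding install, build, and removal into the STREAM build layer would. A sketch of that single-layer variant, assuming the same Makefile and paths already set up earlier in the file (this is not what the patch does):

    # hypothetical single-layer build: AOCC never lands in a persisted layer
    RUN cd /tmp/stream && \
        wget https://download.amd.com/developer/eula/aocc-compiler/aocc-compiler-${AOCC_VERSION}_amd64.deb && \
        apt install -y ./aocc-compiler-${AOCC_VERSION}_amd64.deb && \
        wget https://www.cs.virginia.edu/stream/FTP/Code/stream.c && \
        make all CC=/opt/AMD/aocc-compiler-4.0.0/bin/clang EXEC_DIR=${AZ_NHC_ROOT}/bin && \
        apt remove -y aocc-compiler-"$(echo "$AOCC_VERSION" | sed 's/_1$//')" && \
        rm -rf /tmp/stream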
diff --git a/test/unit-tests/basic-unit-test.sh b/test/unit-tests/basic-unit-test.sh
new file mode 100755
index 00000000..3a04b1d0
--- /dev/null
+++ b/test/unit-tests/basic-unit-test.sh
@@ -0,0 +1,64 @@
+#!/usr/bin/env bats
+
+
+logpath=/tmp/nhclogfile.log
+NHC_PATH=$NHC_DIR/run-health-checks.sh
+
+get_sad_path_conf(){
+    SKU=$( curl -H Metadata:true --max-time 10 -s "http://169.254.169.254/metadata/instance/compute/vmSize?api-version=2021-01-01&format=text")
+    SKU="${SKU,,}"
+    if echo "$SKU" | grep -q "nd96asr_v4"; then
+        conf_name="nd96asr_v4"
+    elif echo "$SKU" | grep -q "nd96amsr_a100_v4"; then
+        conf_name="nd96amsr_a100_v4"
+    elif echo "$SKU" | grep -q "nd96isr_h100_v5"; then
+        conf_name="nd96isr_h100_v5"
+    elif echo "$SKU" | grep -q "hb176rs_v4"; then
+        conf_name="hb176rs_v4"
+    elif echo "$SKU" | grep -q "hx176rs"; then
+        conf_name="hx176rs"
+    else
+        echo "Unit-test for this SKU $SKU is not supported"
+        return 1
+    fi
+    relative_path="$NHC_DIR/test/bad_test_confs/$conf_name.conf"
+    echo "$(realpath -m $relative_path)"
+    return 0
+}
+
+@test "Docker image pull check" {
+    set +e
+    sudo $NHC_DIR/dockerfile/pull-image-acr.sh cuda
+    result=$?
+    set -e
+    [ "$result" -eq 0 ]
+}
+
+@test "Docker image ls check" {
+    set +e
+    sudo $NHC_DIR/dockerfile/pull-image-acr.sh cuda
+    image_name="aznhc.azurecr.io/nvrt"
+    result=$(sudo docker images | grep $image_name)
+    set -e
+    [ -n "$result" ]
+}
+
+
+@test "Default checks Pass (Happy Path)" {
+    sudo $NHC_PATH -o $logpath
+    result=$?
+    [ "$result" -eq 0 ]
+}
+
+@test "Checks adjusted to fail (Sad Path)" {
+    bad_conf_file=$(get_sad_path_conf)
+    if [[ "$bad_conf_file" == *"not supported"* ]]; then
+        skip "unit test not supported on this SKU"
+    fi
+    set +e
+    out=$(sudo $NHC_PATH -c $bad_conf_file -o $logpath)
+    set -e
+    echo "$out" | grep -q "ERROR"
+    result=$?
+    [ "$result" -eq 0 ]
+}
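These bats files rely on NHC_DIR pointing at the repository root; run_tests.sh (further below) exports it before invoking bats. To run a single suite by hand, something along these lines should work, assuming bats is installed (the checkout path is a placeholder):

    export NHC_DIR=/opt/azurehpc-health-checks    # placeholder checkout path
    bats --pretty "$NHC_DIR/test/unit-tests/basic-unit-test.sh"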
diff --git a/test/unit-tests/nhc-cpu-test.sh b/test/unit-tests/nhc-cpu-test.sh
new file mode 100755
index 00000000..269a3ffd
--- /dev/null
+++ b/test/unit-tests/nhc-cpu-test.sh
@@ -0,0 +1,57 @@
+#!/usr/bin/env bats
+
+source $NHC_DIR/customTests/azure_common.nhc
+export AZ_NHC_ROOT=$NHC_DIR
+source /opt/hpcx-v2.16-gcc-mlnx_ofed-ubuntu22.04-cuda12-gdrcopy2-nccl2.18-x86_64/hpcx-init.sh
+hpcx_load
+
+function die() {
+    log "ERROR: $NAME: Health check failed: $*"
+}
+
+function dbg() {
+    log "dbg: $*"
+}
+
+function log() {
+    echo $*
+}
+
+cpu_test=( "azure_cpu_stream.nhc" "azure_ib_write_bw_non_gdr.nhc" )
+
+for check in "${cpu_test[@]}" ; do
+    source $NHC_DIR/customTests/$check
+done
+
+@test "Pass case: check_cpu_stream" {
+    set +e
+    result=$(check_cpu_stream 1.0)
+    set -e
+    [[ "$result" != *"ERROR"* ]]
+}
+
+@test "Fail case: check_cpu_stream" {
+    set +e
+    result=$(check_cpu_stream 900000.0)
+    set -e
+    [[ "$result" == *"ERROR"* ]]
+}
+
+@test "Pass case: check_ib_bw_non_gdr" {
+    set +e
+    result=$(check_ib_bw_non_gdr 1.0)
+    set -e
+    [[ "$result" != *"ERROR"* ]]
+}
+
+@test "Fail case: check_ib_bw_non_gdr" {
+    set +e
+    result=$(check_ib_bw_non_gdr 1000.0)
+    set -e
+    [[ "$result" == *"ERROR"* ]]
+}
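Both check-level test files stub out die, dbg, and log so that a failing check prints an ERROR: line instead of aborting the shell, which is why every test captures the check's output and pattern-matches it rather than inspecting an exit code. The pattern in isolation, with an invented check function:

    # stub: die logs instead of exiting, so failures become grep-able text
    function die() { echo "ERROR: health check failed: $*"; }

    check_example() { die 1 "bandwidth below threshold"; }

    result=$(check_example)
    [[ "$result" == *"ERROR"* ]] && echo "failure detected as expected"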
*"ERROR"* ]] +} + +@test "Pass case: check_nccl_allreduce" { + set +e + + result=$(check_nccl_allreduce 1.0 1 $AZ_NHC_ROOT/topofiles/ndv4-topo.xml 8G) + set -e + [[ "$result" != *"ERROR"* ]] +} + +@test "Fail case: check_nccl_allreduce" { + set +e + result=$(check_nccl_allreduce 600.0 1 $AZ_NHC_ROOT/topofiles/ndv4-topo.xml 8G) + set -e + [[ "$result" == *"ERROR"* ]] +} + +@test "Pass case: check_ib_bw_gdr" { + set +e + + result=$(check_ib_bw_gdr 1.0) + set -e + [[ "$result" != *"ERROR"* ]] +} + +@test "Fail case: check_ib_bw_gdr" { + set +e + result=$(check_ib_bw_gdr 1000.0) + set -e + [[ "$result" == *"ERROR"* ]] +} + diff --git a/test/unit-tests/run_tests.sh b/test/unit-tests/run_tests.sh index 8b8a7ab7..2169ffd7 100755 --- a/test/unit-tests/run_tests.sh +++ b/test/unit-tests/run_tests.sh @@ -23,6 +23,17 @@ fi export NHC_DIR -bats --pretty ${parent_dir}/bats-unit-tests.sh +echo "Running basic tests" +bats --pretty ${parent_dir}/basic-unit-test.sh + +echo "Running nhc custom checks tests" +if lspci | grep -iq NVIDIA ; then + bats --pretty ${parent_dir}/nhc-gpu-test.sh +elif lspci | grep -iq AMD ; then + # AMD installs + echo "No unit tests for AMD GPU SKUs" +else + bats --pretty ${parent_dir}/nhc-cpu-test.sh +fi exit 0