Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Extend reboot script for rebooting SmartSwitch #3566

Draft
wants to merge 2 commits into
base: master
Choose a base branch
from
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
145 changes: 144 additions & 1 deletion scripts/reboot
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,8 @@ PLATFORM_UPDATE_REBOOT_CAUSE="platform_update_reboot_cause"
# Path of the reboot-cause text file.
REBOOT_CAUSE_FILE="/host/reboot-cause/reboot-cause.txt"
# File name of the optional per-platform pre-check hook; reboot_pre_check runs
# it from ${DEVPATH}/${PLATFORM}/.
PLATFORM_REBOOT_PRE_CHECK="platform_reboot_pre_check"
# Timestamp captured once, when the script starts.
REBOOT_TIME=$(date)
# platform.json is looked up in the platform's device directory.
# NOTE(review): DEVPATH and PLATFORM are not assigned in the visible part of
# this script; they must already be set when this line runs -- confirm.
PLATFORM_JSON_FILE="platform.json"
PLATFORM_JSON_PATH="${DEVPATH}/${PLATFORM}/${PLATFORM_JSON_FILE}"

# Reboot immediately if we run the kdump capture kernel
VMCORE_FILE=/proc/vmcore
Expand All @@ -33,6 +35,7 @@ ASIC_TYPE=$(sonic-cfggen -y /etc/sonic/sonic_version.yml -v asic_type)
# Device subtype read from DEVICE_METADATA.localhost.subtype via sonic-cfggen.
SUBTYPE=$(sonic-cfggen -d -v DEVICE_METADATA.localhost.subtype)
# Value of the "asan" key in /etc/sonic/sonic_version.yml.
ASAN=$(sonic-cfggen -y /etc/sonic/sonic_version.yml -v asan)
# Verbosity switch, off by default.
# NOTE(review): presumably consulted by debug(); its body is not visible here.
VERBOSE=no
# Exit codes used by this script.
# NOTE(review): the DPU reboot functions below exit with ${EXIT_ERROR}, which
# is not defined in the visible part of the script -- confirm it is set.
EXIT_SUCCESS=0
EXIT_NEXT_IMAGE_NOT_EXISTS=4
EXIT_SONIC_INSTALLER_VERIFY_REBOOT=21
EXIT_PLATFORM_FW_AU_FAILURE=22
Expand All @@ -41,6 +44,11 @@ REBOOT_SCRIPT_NAME=$(basename $0)
# The reboot type defaults to the name this script was invoked as.
REBOOT_TYPE="${REBOOT_SCRIPT_NAME}"
TAG_LATEST=no
# Extra flags accumulated by parse_options (the -f option appends " -f").
REBOOT_FLAGS=""
# NOTE(review): FORCE_REBOOT is not referenced in the visible part of the script.
FORCE_REBOOT="no"
# Set to "yes" later when platform.json reports at least one DPU.
SMART_SWITCH="no"
# Name of the DPU module given with -d; empty when -d was not used.
DPU_MODULE_NAME=""
# "yes" when -d was given, i.e. a single DPU reboot was requested.
REBOOT_DPU="no"
# "yes" when -p was given; the script then exits before the platform reboot.
PRE_SHUTDOWN="no"

function debug()
{
Expand Down Expand Up @@ -154,7 +162,7 @@ function reboot_pre_check()
${DEVPATH}/${PLATFORM}/${PLATFORM_REBOOT_PRE_CHECK}
[[ $? -ne 0 ]] && exit $?
fi

# Verify the next image by sonic-installer
local message=$(sonic-installer verify-next-image 2>&1)
if [ $? -ne 0 ]; then
Expand All @@ -176,6 +184,108 @@ function check_conflict_boot_in_fw_update()
fi
}

#######################################
# Query the gNOI RebootStatus RPC of a DPU and print the raw response.
# Globals:
#   dpu_ip, port - gNOI target address (read; set by the caller)
#   DPU_NAME     - DPU name used in the error message (read)
#   EXIT_ERROR   - exit status used on failure (read; 1 when unset)
# Outputs:
#   The RebootStatus response on stdout; the error message on stderr, so a
#   caller capturing stdout never mistakes the error text for a response.
# Returns:
#   Exits with a non-zero status when the RPC fails or returns nothing.
#   When called inside $(...), only that subshell exits.
#######################################
function get_reboot_status()
{
    local reboot_status

    reboot_status=$(gnoi_client -target "${dpu_ip}:${port}" -logtostderr -insecure -rpc RebootStatus)
    if [ $? -ne 0 ] || [ -z "$reboot_status" ]; then
        echo "Error: Failed to send reboot status command to DPU ${DPU_NAME}" >&2
        # A bare "exit" would reuse the status of the echo above (0), so
        # always pass an explicit non-zero code.
        exit "${EXIT_ERROR:-1}"
    fi
    echo "$reboot_status"
}

#######################################
# Gracefully reboot one DPU module of a SmartSwitch.
#
# Steps: validate the DPU against platform.json, ask the DPU to halt its
# services via gNOI Reboot, poll gNOI RebootStatus until it reports inactive,
# record the PCIe detach in STATE_DB, remove the PCIe device, reboot the
# module through the platform API, rescan PCIe and clear the STATE_DB entry.
#
# Globals:
#   PLATFORM_JSON_PATH - platform.json to read DPU data from (read)
#   EXIT_ERROR         - exit status used on failure (read; 1 when unset)
# Arguments:
#   $1 - DPU module name (e.g. "dpu0"); its digits are used as the DPU index
# Outputs:
#   Error messages on stderr.
# Returns:
#   Exits the current (sub)shell with a non-zero status on any failure.
#######################################
function reboot_dpu_module()
{
    local DPU_NAME=$1
    local DPU_INDEX=${DPU_NAME//[!0-9]/}
    local exit_error=${EXIT_ERROR:-1}
    # NOTE(review): fallback used only when platform.json has no usable
    # dpu_halt_services_timeout; confirm the value suits the platform.
    local default_halt_timeout=60
    # dpu_ip and port stay visible to get_reboot_status through bash's
    # dynamic scoping even though they are local here.
    local dpu_ip port reboot_status is_reboot_active
    local dpu_halt_services_timeout poll_interval waited_time
    local DPU_EXISTS DPU_BUS_INFO pcie_detach_info

    debug "User requested rebooting device ${DPU_NAME} ..."

    # Validate everything that comes from platform.json BEFORE asking the DPU
    # to halt, so a configuration error cannot leave a half-rebooted DPU.
    DPU_EXISTS=$(jq -r --arg DPU_NAME "${DPU_NAME}" '.DPUS[] | select(has($DPU_NAME)) | .[$DPU_NAME]' "$PLATFORM_JSON_PATH" 2>/dev/null)
    if [ -z "$DPU_EXISTS" ]; then
        echo "Error: DPU ${DPU_NAME} not found in platform.json" >&2
        exit "${exit_error}"
    fi

    # "// empty" turns a missing/null bus_info into an empty string instead of
    # the literal text "null".
    DPU_BUS_INFO=$(jq -r --arg DPU_NAME "${DPU_NAME}" '.DPUS[] | select(has($DPU_NAME)) | .[$DPU_NAME].bus_info // empty' "$PLATFORM_JSON_PATH" 2>/dev/null)
    if [ -z "$DPU_BUS_INFO" ]; then
        echo "Error: bus_info not found for DPU ${DPU_NAME}" >&2
        exit "${exit_error}"
    fi
    debug "DPU: ${DPU_NAME}, Bus Info: ${DPU_BUS_INFO}"

    # A missing key makes jq print "null"; anything that is not a plain
    # integer would break the numeric timeout comparison below.
    dpu_halt_services_timeout=$(jq -r '.dpu_halt_services_timeout' "$PLATFORM_JSON_PATH" 2>/dev/null)
    if ! [[ "$dpu_halt_services_timeout" =~ ^[0-9]+$ ]]; then
        debug "dpu_halt_services_timeout not set in platform.json, using ${default_halt_timeout}s"
        dpu_halt_services_timeout=${default_halt_timeout}
    fi

    # Retrieve DPU IP from CONFIG_DB
    dpu_ip=$(sonic-db-cli CONFIG_DB HGET "DHCP_SERVER_IPV4_PORT|bridge-midplane|${DPU_NAME}" "ips")
    if [ $? -ne 0 ] || [ -z "$dpu_ip" ]; then
        echo "Error: Failed to retrieve DPU IP address for ${DPU_NAME}" >&2
        exit "${exit_error}"
    fi

    # Retrieve GNMI port from CONFIG_DB. The status check must directly follow
    # the command; logging in between would overwrite $?.
    port=$(sonic-db-cli CONFIG_DB HGET "GNMI|gnmi" "port")
    if [ $? -ne 0 ] || [ -z "$port" ]; then
        echo "Error: Failed to retrieve GNMI port" >&2
        exit "${exit_error}"
    fi
    debug "GNMI port ${port}"

    # Issue GNOI client command to reboot the DPU
    if ! gnoi_client -target "${dpu_ip}:${port}" -logtostderr -insecure -rpc Reboot -jsonin '{"method":3}'; then
        echo "Error: Failed to send reboot command to DPU ${DPU_NAME}" >&2
        exit "${exit_error}"
    fi

    # Poll on reboot status response with a timeout mechanism
    poll_interval=5
    waited_time=0

    while true; do
        if reboot_status=$(get_reboot_status); then
            debug "GNOI RebootStatus response ${reboot_status}"
            is_reboot_active=$(awk '/active/ {print $2}' <<< "$reboot_status")

            if [ "$is_reboot_active" == "false" ]; then
                break
            fi
        else
            # A failed query is retried until the timeout rather than treated
            # as fatal, matching the polling behaviour on unusable responses.
            debug "GNOI RebootStatus query for ${DPU_NAME} failed, retrying"
        fi

        sleep "$poll_interval"
        waited_time=$((waited_time + poll_interval))

        if (( waited_time >= dpu_halt_services_timeout )); then
            echo "Error: Timeout waiting for DPU ${DPU_NAME} to finish rebooting" >&2
            exit "${exit_error}"
        fi
    done

    # Update STATE_DB with DPU PCIe key. The payload is built with printf so
    # the DPU index and bus info are actually substituted into the JSON.
    # NOTE(review): confirm the sonic-db-cli database name / command form.
    pcie_detach_info=$(printf '{"dpu_id": "%s", "dpu_state": "detaching", "bus_info": "%s"}' "${DPU_INDEX}" "${DPU_BUS_INFO}")
    sonic-db-cli state_db set "PCIE_DETACH_INFO|${DPU_NAME}" "${pcie_detach_info}"

    # Detach the DPU module PCIe
    echo 1 > "/sys/bus/pci/devices/${DPU_BUS_INFO}/remove"

    # Reboot the DPU via platform vendor API. reboot_module() returns a bool
    # and only prints on errors, so success is derived from the interpreter's
    # exit status, not from captured output. The name is passed as an argument
    # so it is never interpolated into Python source.
    if ! python3 -c 'import sys, reboot_helper; sys.exit(0 if reboot_helper.reboot_module(sys.argv[1]) else 1)' "${DPU_NAME}"; then
        echo "Error: Failed to reboot the platform" >&2
        exit "${exit_error}"
    fi

    # Rescan the PCIe
    echo 1 > /sys/bus/pci/rescan

    # Update STATE_DB to delete DPU PCIe key
    sonic-db-cli state_db del "PCIE_DETACH_INFO|${DPU_NAME}"
}

function parse_options()
{
# -d takes the DPU module name as its argument; -p requests pre-shutdown only.
# Both must be listed in the optstring or their case arms are never reached.
while getopts "h?vfd:p" opt; do
Expand All @@ -192,6 +302,13 @@ function parse_options()
f )
REBOOT_FLAGS+=" -f"
;;
d )
    # Reboot a single DPU module. getopts stores the option argument in
    # OPTARG (upper case); a lower-case "optarg" is never set.
    REBOOT_DPU="yes"
    DPU_MODULE_NAME="$OPTARG"
    ;;
p )
    # Stop after the pre-shutdown steps instead of rebooting the platform.
    PRE_SHUTDOWN="yes"
    ;;
esac
done
}
Expand Down Expand Up @@ -225,6 +342,27 @@ fi

debug "User requested rebooting device ..."

# Check for smart switch by parsing platform.json file: the device is treated
# as a SmartSwitch when the DPUS entry lists at least one DPU.
NUM_DPU=0
if [ -f "$PLATFORM_JSON_PATH" ]; then
    NUM_DPU=$(jq -r '.DPUS | length' "$PLATFORM_JSON_PATH" 2>/dev/null)

    # jq prints nothing when the file cannot be parsed; anything that is not a
    # plain integer counts as "no DPUs" instead of breaking the comparison.
    if ! [[ "$NUM_DPU" =~ ^[0-9]+$ ]]; then
        NUM_DPU=0
    fi

    if (( NUM_DPU > 0 )); then
        SMART_SWITCH="yes"
    fi
fi

if [[ "$REBOOT_DPU" == "yes" ]]; then
    # A DPU reboot request on a device without DPUs must not silently fall
    # through to a reboot of the whole device.
    if [[ "$SMART_SWITCH" != "yes" ]]; then
        echo "Error: cannot reboot ${DPU_MODULE_NAME}: this device is not a SmartSwitch" >&2
        exit "${EXIT_ERROR:-1}"
    fi
    echo "User requested to reboot the device ${DPU_MODULE_NAME}"
    # NOTE(review): the script continues with the regular reboot flow after
    # the DPU reboot -- confirm this is intended for a single-DPU request.
    reboot_dpu_module "$DPU_MODULE_NAME"
elif [[ "$SMART_SWITCH" == "yes" ]]; then
    # Loop to iterate over DPUs and invoke reboot_dpu_module in parallel
    dpu_pids=()
    for (( i = 0; i < NUM_DPU; i++ )); do
        echo "Rebooting DPU module $i"
        reboot_dpu_module "dpu$i" &
        dpu_pids+=("$!")
    done

    # Wait for each job individually so a failed DPU reboot is reported; a
    # bare "wait" would discard every exit status. The switch reboot still
    # proceeds, as before.
    for i in "${!dpu_pids[@]}"; do
        if ! wait "${dpu_pids[$i]}"; then
            echo "Warning: reboot of DPU module dpu${i} did not complete successfully" >&2
        fi
    done
fi

check_conflict_boot_in_fw_update

setup_reboot_variables
Expand Down Expand Up @@ -287,6 +425,11 @@ if [ -x ${WATCHDOG_UTIL} ]; then
${WATCHDOG_UTIL} arm
fi

# When only the pre-shutdown phase was requested, report completion and exit
# successfully here; nothing after this point runs.
case "${PRE_SHUTDOWN}" in
    yes)
        echo "${DPU_MODULE_NAME} pre-shutdown steps are completed"
        exit ${EXIT_SUCCESS}
        ;;
esac

if [ -x ${DEVPATH}/${PLATFORM}/${PLAT_REBOOT} ]; then
VERBOSE=yes debug "Rebooting with platform ${PLATFORM} specific tool ..."
${DEVPATH}/${PLATFORM}/${PLAT_REBOOT} $@
Expand Down
104 changes: 104 additions & 0 deletions scripts/reboot_helper.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,104 @@
#!/usr/bin/env python3
#
# reboot_helper.py
#
# Utility helper for reboot within SONiC

import sonic_platform
import sys
import syslog

# Messages whose syslog priority value is numerically greater than this
# threshold are dropped by _log_msg.
chk_log_level = syslog.LOG_ERR


def _log_msg(lvl, pfx, msg):
    """Print and syslog `msg` if `lvl` passes the chk_log_level threshold.

    Args:
        lvl: syslog priority of the message (e.g. syslog.LOG_ERR).
        pfx: short prefix printed before the message on stdout.
        msg: message text.
    """
    if lvl > chk_log_level:
        return
    print(f"{pfx}: {msg}")
    syslog.syslog(lvl, msg)


def log_err(m):
    """Log `m` at error priority."""
    _log_msg(syslog.LOG_ERR, "Err", m)


def log_info(m):
    """Log `m` at info priority."""
    _log_msg(syslog.LOG_INFO, "Info", m)


def log_debug(m):
    """Log `m` at debug priority."""
    _log_msg(syslog.LOG_DEBUG, "Debug", m)


# Chassis object shared by the helpers in this module; stays None until
# load_platform_chassis() succeeds.
platform_chassis = None


def load_platform_chassis():
    """Instantiate the platform chassis and store it in `platform_chassis`.

    Returns:
        True when a chassis object was obtained, False when instantiation
        raised or produced a falsy object (both cases are logged).
    """
    global platform_chassis

    # Load new platform API class
    try:
        chassis = sonic_platform.platform.Platform().get_chassis()
    except Exception as err:
        log_err("Failed to instantiate Chassis due to {}".format(repr(err)))
        return False

    platform_chassis = chassis
    if chassis:
        return True

    log_err("Platform chassis is not loaded")
    return False

def reboot_module(module_name):
    """Reboot the specified module by invoking the platform API.

    Args:
        module_name: name compared against each chassis module's get_name().

    Returns:
        True when the matching module's reboot() call completed without
        raising. False when the chassis cannot be loaded, no module matches,
        or any platform call raises (each case is logged).
    """

    # Load the platform chassis if not already loaded
    if not platform_chassis and not load_platform_chassis():
        log_err("Failed to load platform chassis")
        return False

    try:
        # Use get_all_modules to retrieve all modules on the chassis
        modules = platform_chassis.get_all_modules()

        # Iterate over the modules to find the one with the specified name
        for module in modules:
            if module and module.get_name() == module_name:
                log_info(f"Rebooting module {module_name}...")
                try:
                    # NOTE(review): the platform base API may expect a
                    # reboot_type argument and returns a status; confirm the
                    # vendor implementation accepts a bare call.
                    module.reboot()
                    log_info(f"Reboot command sent for module {module_name}")
                    return True
                except NotImplementedError:
                    # log_err is the helper defined in this file; the
                    # previously used name log_error does not exist and
                    # raised NameError inside these handlers.
                    log_err(f"Reboot not implemented for module {module_name}.")
                    return False
                except Exception as e:
                    log_err(f"An error occurred while rebooting module {module_name}: {e}")
                    return False

        # If the module with the given name is not found
        log_err(f"Module {module_name} not found")
        return False

    except Exception as e:
        log_err(f"Error occurred while rebooting module {module_name}: {repr(e)}")
        return False

# Command-line entry point: reboot_helper.py <command> <module_name>.
# Only the "reboot" command is supported; any failure exits with status 1.
if __name__ == "__main__":
    if len(sys.argv) < 3:
        print("Usage: reboot_helper.py <command> <module_name>")
        sys.exit(1)

    command, module_name = sys.argv[1], sys.argv[2]

    if command != "reboot":
        print(f"Unknown command: {command}")
        sys.exit(1)

    success = reboot_module(module_name)
    if not success:
        sys.exit(1)
    print(f"Reboot command sent for module {module_name}")
Loading