From 41f5961263bb2f353a50ed233154af9acc23a465 Mon Sep 17 00:00:00 2001 From: Ying Xie Date: Mon, 9 Dec 2019 20:48:03 -0800 Subject: [PATCH] [fast/warm reboot] ignore errors after shutting down critical service(s) (#761) Once any critical service is shut down (radv/swss/syncd), we have to commit to the reboot. Failing in the middle will leave the system in a bad state. Signed-off-by: Ying Xie --- scripts/fast-reboot | 17 ++++++++++------- 1 file changed, 10 insertions(+), 7 deletions(-) diff --git a/scripts/fast-reboot b/scripts/fast-reboot index f35c3126d3c0..c33140771780 100755 --- a/scripts/fast-reboot +++ b/scripts/fast-reboot @@ -144,7 +144,6 @@ function request_pre_shutdown() debug "Requesting pre-shutdown ..." /usr/bin/docker exec -i syncd /usr/bin/syncd_request_shutdown --pre &> /dev/null || { error "Failed to request pre-shutdown" - exit "${EXIT_SYNCD_SHUTDOWN}" } } @@ -180,9 +179,9 @@ function wait_for_pre_shutdown_complete_or_fail() if [[ x"${STATE}" != x"pre-shutdown-succeeded" ]]; then debug "Syncd pre-shutdown failed: ${STATE} ..." - exit "${EXIT_SYNCD_SHUTDOWN}" + else + debug "Pre-shutdown succeeded ..." fi - debug "Pre-shutdown succeeded ..." } function backup_database() @@ -402,6 +401,10 @@ if [[ "$REBOOT_TYPE" = "warm-reboot" || "$REBOOT_TYPE" = "fastfast-reboot" ]]; t fi fi +# We are fully committed to reboot from this point on because critical +# service will go down and we cannot recover from it. +set +e + # Kill radv before stopping BGP service to prevent annoucing our departure. debug "Stopping radv ..." docker kill radv &>/dev/null || [ $? == 1 ] @@ -474,7 +477,7 @@ if [[ "$REBOOT_TYPE" = "warm-reboot" || "$REBOOT_TYPE" = "fastfast-reboot" ]]; t fi debug "Stopping syncd ..." -systemctl stop syncd +systemctl stop syncd || debug "Ignore stopping syncd service error $?" debug "Stopped syncd ..." # Kill other containers to make the reboot faster @@ -485,7 +488,7 @@ debug "Stopping all remaining containers ..." 
for CONTAINER_NAME in $(docker ps --format '{{.Names}}'); do CONTAINER_STOP_RC=0 docker kill $CONTAINER_NAME &> /dev/null || CONTAINER_STOP_RC=$? - systemctl stop $CONTAINER_NAME + systemctl stop $CONTAINER_NAME || debug "Ignore stopping $CONTAINER_NAME error $?" if [[ CONTAINER_STOP_RC -ne 0 ]]; then debug "Failed killing container $CONTAINER_NAME RC $CONTAINER_STOP_RC ." fi @@ -493,12 +496,12 @@ done debug "Stopped all remaining containers ..." # Stop the docker container engine. Otherwise we will have a broken docker storage -systemctl stop docker.service +systemctl stop docker.service || debug "Ignore stopping docker service error $?" # Stop kernel modules for Nephos platform if [[ "$sonic_asic_type" = 'nephos' ]]; then - systemctl stop nps-modules-`uname -r`.service + systemctl stop nps-modules-`uname -r`.service || debug "Ignore stopping nps service error $?" fi # Update the reboot cause file to reflect that user issued this script