From 0e6a55ef5eac306ef61d6f0241625a6baee42ab8 Mon Sep 17 00:00:00 2001 From: Stepan Blyshchak <38952541+stepanblyschak@users.noreply.github.com> Date: Mon, 24 Jun 2024 09:48:14 +0300 Subject: [PATCH] [fast-reboot] Backup database after syncd/swss stopped (#3342) - What I did Back up the DB after syncd and swss are stopped. I observed an issue with fast-reboot where, in rare circumstances, a queued FDB event might be written to ASIC_DB by a thread inside syncd after the call to FLUSHDB ASIC_DB was made. That left ASIC_DB with only a single record for that FDB entry and caused syncd to crash at start: Mar 15 13:28:42.765108 sonic NOTICE syncd#SAI: :- Syncd: syncd started Mar 15 13:28:42.765268 sonic NOTICE syncd#SAI: :- onSyncdStart: performing hard reinit since COLD start was performed Mar 15 13:28:42.765451 sonic NOTICE syncd#SAI: :- readAsicState: loaded 1 switches Mar 15 13:28:42.765465 sonic NOTICE syncd#SAI: :- readAsicState: switch VID: oid:0x21000000000000 Mar 15 13:28:42.765465 sonic NOTICE syncd#SAI: :- readAsicState: read asic state took 0.000205 sec Mar 15 13:28:42.766364 sonic NOTICE syncd#SAI: :- onSyncdStart: on syncd start took 0.001097 sec Mar 15 13:28:42.766376 sonic ERR syncd#SAI: :- run: Runtime error during syncd init: map::at Mar 15 13:28:42.766376 sonic NOTICE syncd#SAI: :- sendShutdownRequest: sending switch_shutdown_request notification to OA for switch: oid:0x0 Mar 15 13:28:42.766518 sonic NOTICE syncd#SAI: :- sendShutdownRequestAfterException: notification send successfully - How I did it Back up the DB after syncd/swss have stopped: the ASIC_DB, COUNTERS_DB and FLEX_COUNTER_DB flushes (plus the RESTAPI_DB flush for fast-reboot) are moved into backup_database(), which is now called after the service-stop loop completes. - How to verify it Run fast-reboot. 
Signed-off-by: Stepan Blyschak --- scripts/fast-reboot | 29 +++++++++++++++-------------- 1 file changed, 15 insertions(+), 14 deletions(-) diff --git a/scripts/fast-reboot b/scripts/fast-reboot index 53dcffd7d2..2eeca11112 100755 --- a/scripts/fast-reboot +++ b/scripts/fast-reboot @@ -244,6 +244,19 @@ function wait_for_pre_shutdown_complete_or_fail() function backup_database() { debug "Backing up database ..." + + if [[ "$REBOOT_TYPE" = "fastfast-reboot" || "$REBOOT_TYPE" = "fast-reboot" ]]; then + # Advanced reboot: dump state to host disk + sonic-db-cli ASIC_DB FLUSHDB > /dev/null + sonic-db-cli COUNTERS_DB FLUSHDB > /dev/null + sonic-db-cli FLEX_COUNTER_DB FLUSHDB > /dev/null + fi + + if [[ "$REBOOT_TYPE" = "fast-reboot" ]]; then + # Flush RESTAP_DB in fast-reboot to avoid stale status + sonic-db-cli RESTAPI_DB FLUSHDB > /dev/null + fi + # Dump redis content to a file 'dump.rdb' in warmboot directory mkdir -p $WARM_DIR # Delete keys in stateDB except FDB_TABLE|*, MIRROR_SESSION_TABLE|*, WARM_RESTART_ENABLE_TABLE|*, FG_ROUTE_TABLE|* @@ -806,23 +819,11 @@ for service in ${SERVICES_TO_STOP}; do wait_for_pre_shutdown_complete_or_fail fi - if [[ "$REBOOT_TYPE" = "fastfast-reboot" || "$REBOOT_TYPE" = "fast-reboot" ]]; then - # Advanced reboot: dump state to host disk - sonic-db-cli ASIC_DB FLUSHDB > /dev/null - sonic-db-cli COUNTERS_DB FLUSHDB > /dev/null - sonic-db-cli FLEX_COUNTER_DB FLUSHDB > /dev/null - fi - - if [[ "$REBOOT_TYPE" = "fast-reboot" ]]; then - # Flush RESTAP_DB in fast-reboot to avoid stale status - sonic-db-cli RESTAPI_DB FLUSHDB > /dev/null - fi - - backup_database - fi done +backup_database + # Stop the docker container engine. Otherwise we will have a broken docker storage systemctl stop docker.service || debug "Ignore stopping docker service error $?"