Skip to content

Commit 4818360

Browse files
[sonic-package-manager] support warm/fast reboot for extension packages (#1554)
- What I did Implemented functionality for SONiC package manager allowing it to support packages which require special handling for fast and warm reboots. For more details refer to HLD - https://github.com/stepanblyschak/SONiC/blob/sonic-app-ext-3/doc/sonic-application-extention/sonic-application-extention-hld.md#warmboot-and-fastboot-design-impact. - How I did it I extended manifest with warm/fast shutdown fields and added logic that accounts for special requirements on fast/warm reboot for a package. Fast/Warm reboot scripts are enhanced to read the ordered list of services from a file on the filesystem instead of having the list of services hardcoded in the script. This file is regenerated when a package is installed/uninstalled/upgraded and also this file will be generated once during build time. Similarly, a warmboot-finalizer service is enhanced by making it read the file on the filesystem with processes that perform reconciliation. - How to verify it There is an open example extension I pushed to Docker Hub stepanblischak/cpu-report:warm. It can be installed on the switch: admin@sonic:~$ sudo sonic-package-manager show package manifest --from-repository stepanblischak/cpu-report:warm | grep warm -A 6 "warm-shutdown": { "after": [ "swss" ], "before": [ "syncd" ] admin@sonic:~$ sudo sonic-package-manager install --from-repository stepanblischak/cpu-report:warm -y -v DEBUG Then perform warm-reboot and observe that cpu-report is stopped at the right place in shutdown sequence: admin@sonic:~$ sudo warm-reboot -v sudo warm-reboot -v Wed 31 Mar 2021 12:54:10 PM UTC Saving counters folder before warmboot... Wed 31 Mar 2021 12:54:13 PM UTC Prepare MLNX ASIC to fastfast-reboot: install new FW if required Wed 31 Mar 2021 12:54:15 PM UTC Pausing orchagent ... Wed 31 Mar 2021 12:54:15 PM UTC Collecting logs to check ssd health before fastfast-reboot... Wed 31 Mar 2021 12:54:15 PM UTC Stopping lldp ... 
Wed 31 Mar 2021 12:54:17 PM UTC Stopped lldp Wed 31 Mar 2021 12:54:17 PM UTC Stopping nat ... Dumping conntrack entries failed Wed 31 Mar 2021 12:54:18 PM UTC Stopped nat Wed 31 Mar 2021 12:54:18 PM UTC Stopping radv ... Wed 31 Mar 2021 12:54:18 PM UTC Stopped radv Wed 31 Mar 2021 12:54:18 PM UTC Stopping sflow ... Wed 31 Mar 2021 12:54:18 PM UTC Stopped sflow Wed 31 Mar 2021 12:54:18 PM UTC Stopping bgp ... Wed 31 Mar 2021 12:54:22 PM UTC Stopped bgp Wed 31 Mar 2021 12:54:22 PM UTC Stopping swss ... Wed 31 Mar 2021 12:54:31 PM UTC Stopped swss Wed 31 Mar 2021 12:54:31 PM UTC Initialize pre-shutdown ... Wed 31 Mar 2021 12:54:31 PM UTC Requesting pre-shutdown ... Wed 31 Mar 2021 12:54:32 PM UTC Waiting for pre-shutdown ... Wed 31 Mar 2021 12:54:41 PM UTC Pre-shutdown succeeded, state: pre-shutdown-succeeded ... Wed 31 Mar 2021 12:54:41 PM UTC Backing up database ... Wed 31 Mar 2021 12:54:41 PM UTC Stopping cpu-report... Wed 31 Mar 2021 12:54:41 PM UTC Stopped cpu-report Wed 31 Mar 2021 12:54:41 PM UTC Stopping teamd ... Wed 31 Mar 2021 12:54:48 PM UTC Stopped teamd Wed 31 Mar 2021 12:54:48 PM UTC Stopping syncd ... Wed 31 Mar 2021 12:54:51 PM UTC Stopped syncd Wed 31 Mar 2021 12:54:51 PM UTC Stopping all remaining containers ... Wed 31 Mar 2021 12:54:53 PM UTC Stopped all remaining containers ... Wed 31 Mar 2021 12:54:55 PM UTC Enabling Watchdog before fastfast-reboot Watchdog armed for 180 seconds Wed 31 Mar 2021 12:54:56 PM UTC Rebooting with /sbin/kexec -e to SONiC-OS-master.0-ae9ccf39 ...
1 parent 793b847 commit 4818360

File tree

11 files changed

+465
-116
lines changed

11 files changed

+465
-116
lines changed

config/main.py

+10-2
Original file line numberDiff line numberDiff line change
@@ -2087,20 +2087,28 @@ def warm_restart(ctx, redis_unix_socket_path):
20872087
ctx.obj = {'db': config_db, 'state_db': state_db, 'prefix': prefix}
20882088

20892089
@warm_restart.command('enable')
2090-
@click.argument('module', metavar='<module>', default='system', required=False, type=click.Choice(["system", "swss", "bgp", "teamd"]))
2090+
@click.argument('module', metavar='<module>', default='system', required=False)
20912091
@click.pass_context
20922092
def warm_restart_enable(ctx, module):
20932093
state_db = ctx.obj['state_db']
2094+
config_db = ctx.obj['db']
2095+
feature_table = config_db.get_table('FEATURE')
2096+
if module != 'system' and module not in feature_table:
2097+
exit('Feature {} is unknown'.format(module))
20942098
prefix = ctx.obj['prefix']
20952099
_hash = '{}{}'.format(prefix, module)
20962100
state_db.set(state_db.STATE_DB, _hash, 'enable', 'true')
20972101
state_db.close(state_db.STATE_DB)
20982102

20992103
@warm_restart.command('disable')
2100-
@click.argument('module', metavar='<module>', default='system', required=False, type=click.Choice(["system", "swss", "bgp", "teamd"]))
2104+
@click.argument('module', metavar='<module>', default='system', required=False)
21012105
@click.pass_context
21022106
def warm_restart_enable(ctx, module):
21032107
state_db = ctx.obj['state_db']
2108+
config_db = ctx.obj['db']
2109+
feature_table = config_db.get_table('FEATURE')
2110+
if module != 'system' and module not in feature_table:
2111+
exit('Feature {} is unknown'.format(module))
21042112
prefix = ctx.obj['prefix']
21052113
_hash = '{}{}'.format(prefix, module)
21062114
state_db.set(state_db.STATE_DB, _hash, 'enable', 'false')

scripts/fast-reboot

+55-64
Original file line numberDiff line numberDiff line change
@@ -7,6 +7,7 @@ WARM_DIR=/host/warmboot
77
REDIS_FILE=dump.rdb
88
REBOOT_SCRIPT_NAME=$(basename $0)
99
REBOOT_TYPE="${REBOOT_SCRIPT_NAME}"
10+
SHUTDOWN_ORDER_FILE="/etc/sonic/${REBOOT_TYPE}_order"
1011
VERBOSE=no
1112
FORCE=no
1213
IGNORE_ASIC=no
@@ -567,82 +568,72 @@ if [ -x ${LOG_SSD_HEALTH} ]; then
567568
fi
568569
569570
570-
# Kill nat docker after saving the conntrack table
571-
debug "Stopping nat ..."
572-
/usr/local/bin/dump_nat_entries.py
573-
docker kill nat > /dev/null || true
574-
systemctl stop nat
575-
debug "Stopped nat ..."
576-
577-
# Kill radv before stopping BGP service to prevent announcing our departure.
578-
debug "Stopping radv service..."
579-
systemctl stop radv
580-
debug "Stopped radv service..."
581-
582-
# Kill bgpd to start the bgp graceful restart procedure
583-
debug "Stopping bgp ..."
584-
systemctl stop bgp
585-
debug "Stopped bgp ..."
586-
587-
# Kill sflow docker
588-
debug "Stopping sflow ..."
589-
container kill sflow &> /dev/null || debug "Docker sflow is not running ($?) ..."
590-
systemctl stop sflow
591-
debug "Stopped sflow ..."
592-
593-
# Kill lldp, otherwise it sends informotion about reboot.
594-
# We call `docker kill lldp` to ensure the container stops as quickly as possible,
595-
# then immediately call `systemctl stop lldp` to prevent the service from
596-
# restarting the container automatically.
597-
container kill lldp &> /dev/null || debug "Docker lldp is not running ($?) ..."
598-
systemctl stop lldp
599-
600-
if [[ "$REBOOT_TYPE" = "fast-reboot" ]]; then
601-
debug "Stopping teamd ..."
602-
systemctl stop teamd
603-
debug "Stopped teamd ..."
571+
if [[ -f ${SHUTDOWN_ORDER_FILE} ]]; then
572+
SERVICES_TO_STOP="$(cat ${SHUTDOWN_ORDER_FILE})"
573+
else
574+
# TODO: to be removed once sonic-buildimage change is in
575+
if [[ "${REBOOT_TYPE}" == "fast-reboot" ]]; then
576+
SERVICES_TO_STOP="nat radv bgp sflow lldp swss teamd syncd"
577+
elif [[ "${REBOOT_TYPE}" == "fastfast-reboot" || "${REBOOT_TYPE}" == "warm-reboot" ]]; then
578+
SERVICES_TO_STOP="nat radv bgp sflow lldp teamd swss syncd"
579+
else
580+
error "Unexpected reboot type ${REBOOT_TYPE}"
581+
exit $EXIT_FAILURE
582+
fi
604583
fi
605584
606-
debug "Stopping swss service ..."
607-
systemctl stop swss
608-
debug "Stopped swss service ..."
585+
for service in ${SERVICES_TO_STOP}; do
586+
debug "Stopping ${service} ..."
609587
610-
if [[ "$REBOOT_TYPE" = "warm-reboot" || "$REBOOT_TYPE" = "fastfast-reboot" ]]; then
611-
# Pre-shutdown syncd
612-
initialize_pre_shutdown
588+
# TODO: These exceptions for nat, sflow, lldp
589+
# have to be coded in corresponding service scripts
613590
614-
if [[ "x$sonic_asic_type" == x"mellanox" ]]; then
615-
check_issu_bank_file
591+
if [[ "${service}" = "nat" ]]; then
592+
/usr/local/bin/dump_nat_entries.py
616593
fi
617594
618-
request_pre_shutdown
619-
620-
wait_for_pre_shutdown_complete_or_fail
621-
622-
if [[ "x$sonic_asic_type" == x"mellanox" ]]; then
623-
check_issu_bank_file
595+
if [[ "${service}" = "nat" || "${service}" = "sflow" || "${service}" = "lldp" ]]; then
596+
container kill "${service}" &> /dev/null || debug "Docker ${service} is not running ($?) ..."
624597
fi
625598
626-
# Warm reboot: dump state to host disk
627-
if [[ "$REBOOT_TYPE" = "fastfast-reboot" ]]; then
628-
sonic-db-cli ASIC_DB FLUSHDB > /dev/null
629-
sonic-db-cli COUNTERS_DB FLUSHDB > /dev/null
630-
sonic-db-cli FLEX_COUNTER_DB FLUSHDB > /dev/null
599+
if [[ "${service}" = "syncd" ]]; then
600+
systemctl stop ${service} || debug "Ignore stopping ${service} service error $?"
601+
else
602+
systemctl stop ${service}
631603
fi
632604
633-
# TODO: backup_database preserves FDB_TABLE
634-
# need to cleanup as well for fastfast boot case
635-
backup_database
605+
debug "Stopped ${service}"
636606
637-
# Stop teamd gracefully
638-
debug "Stopping teamd ..."
639-
systemctl stop teamd
640-
debug "Stopped teamd ..."
641-
fi
607+
if [[ "${service}" = "swss" ]]; then
608+
if [[ "$REBOOT_TYPE" = "warm-reboot" || "$REBOOT_TYPE" = "fastfast-reboot" ]]; then
609+
# Pre-shutdown syncd
610+
initialize_pre_shutdown
611+
612+
if [[ "x$sonic_asic_type" == x"mellanox" ]]; then
613+
check_issu_bank_file
614+
fi
642615
643-
debug "Stopping syncd ..."
644-
systemctl stop syncd || debug "Ignore stopping syncd service error $?"
645-
debug "Stopped syncd ..."
616+
request_pre_shutdown
617+
618+
wait_for_pre_shutdown_complete_or_fail
619+
620+
if [[ "x$sonic_asic_type" == x"mellanox" ]]; then
621+
check_issu_bank_file
622+
fi
623+
624+
# Warm reboot: dump state to host disk
625+
if [[ "$REBOOT_TYPE" = "fastfast-reboot" ]]; then
626+
sonic-db-cli ASIC_DB FLUSHDB > /dev/null
627+
sonic-db-cli COUNTERS_DB FLUSHDB > /dev/null
628+
sonic-db-cli FLEX_COUNTER_DB FLUSHDB > /dev/null
629+
fi
630+
631+
# TODO: backup_database preserves FDB_TABLE
632+
# need to cleanup as well for fastfast boot case
633+
backup_database
634+
fi
635+
fi
636+
done
646637
647638
# Kill other containers to make the reboot faster
648639
# We call `docker kill ...` to ensure the container stops as quickly as possible,

scripts/generate_shutdown_order.py

+15
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,15 @@
1+
#!/usr/bin/python3
2+
3+
''' This script is used to generate initial warm/fast shutdown order file '''
4+
5+
from sonic_package_manager import PackageManager
6+
7+
def main():
8+
manager = PackageManager.get_manager()
9+
installed_packages = manager.get_installed_packages()
10+
print('installed packages {}'.format(installed_packages))
11+
manager.service_creator.generate_shutdown_sequence_files(installed_packages)
12+
print('Done.')
13+
14+
if __name__ == '__main__':
15+
main()

setup.py

+5-3
Original file line numberDiff line numberDiff line change
@@ -95,6 +95,7 @@
9595
'scripts/fdbshow',
9696
'scripts/gearboxutil',
9797
'scripts/generate_dump',
98+
'scripts/generate_shutdown_order.py',
9899
'scripts/intfutil',
99100
'scripts/intfstat',
100101
'scripts/ipintutil',
@@ -187,9 +188,10 @@
187188
'sonic-py-common',
188189
'sonic-yang-mgmt',
189190
'swsssdk>=2.0.1',
190-
'tabulate>=0.8.2',
191-
'www-authenticate>=0.9.2',
192-
'xmltodict>=0.12.0',
191+
'tabulate==0.8.2',
192+
'toposort==1.6',
193+
'www-authenticate==0.9.2',
194+
'xmltodict==0.12.0',
193195
],
194196
setup_requires= [
195197
'pytest-runner',

sonic_package_manager/manager.py

+33-5
Original file line numberDiff line numberDiff line change
@@ -6,7 +6,7 @@
66
import pkgutil
77
import tempfile
88
from inspect import signature
9-
from typing import Any, Iterable, Callable, Dict, Optional
9+
from typing import Any, Iterable, List, Callable, Dict, Optional
1010

1111
import docker
1212
import filelock
@@ -375,6 +375,14 @@ def install_from_source(self,
375375
self.service_creator.create(package, state=feature_state, owner=default_owner)
376376
exits.callback(rollback(self.service_creator.remove, package))
377377

378+
self.service_creator.generate_shutdown_sequence_files(
379+
self._get_installed_packages_and(package)
380+
)
381+
exits.callback(rollback(
382+
self.service_creator.generate_shutdown_sequence_files,
383+
self.get_installed_packages())
384+
)
385+
378386
if not skip_host_plugins:
379387
self._install_cli_plugins(package)
380388
exits.callback(rollback(self._uninstall_cli_plugins, package))
@@ -429,6 +437,9 @@ def uninstall(self, name: str, force=False):
429437
try:
430438
self._uninstall_cli_plugins(package)
431439
self.service_creator.remove(package)
440+
self.service_creator.generate_shutdown_sequence_files(
441+
self._get_installed_packages_except(package)
442+
)
432443

433444
# Clean containers based on this image
434445
containers = self.docker.ps(filters={'ancestor': package.image_id},
@@ -525,8 +536,8 @@ def upgrade_from_source(self,
525536
old_package, 'start'))
526537

527538
self.service_creator.remove(old_package, deregister_feature=False)
528-
exits.callback(rollback(self.service_creator.create,
529-
old_package, register_feature=False))
539+
exits.callback(rollback(self.service_creator.create, old_package,
540+
register_feature=False))
530541

531542
# Clean containers based on the old image
532543
containers = self.docker.ps(filters={'ancestor': old_package.image_id},
@@ -538,6 +549,14 @@ def upgrade_from_source(self,
538549
exits.callback(rollback(self.service_creator.remove, new_package,
539550
register_feature=False))
540551

552+
self.service_creator.generate_shutdown_sequence_files(
553+
self._get_installed_packages_and(new_package)
554+
)
555+
exits.callback(rollback(
556+
self.service_creator.generate_shutdown_sequence_files,
557+
self._get_installed_packages_and(old_package))
558+
)
559+
541560
if self.feature_registry.is_feature_enabled(new_feature):
542561
self._systemctl_action(new_package, 'start')
543562
exits.callback(rollback(self._systemctl_action,
@@ -818,10 +837,19 @@ def get_installed_packages(self) -> Dict[str, Package]:
818837
"""
819838

820839
return {
821-
entry.name: self.get_installed_package(entry.name)
822-
for entry in self.database if entry.installed
840+
entry.name: entry for entry in self.get_installed_packages_list()
823841
}
824842

843+
def get_installed_packages_list(self) -> List[Package]:
844+
""" Returns a list of installed packages.
845+
846+
Returns:
847+
Installed packages dictionary.
848+
"""
849+
850+
return [self.get_installed_package(entry.name)
851+
for entry in self.database if entry.installed]
852+
825853
def _migrate_package_database(self, old_package_database: PackageDatabase):
826854
""" Performs part of package migration process.
827855
For every package in old_package_database that is not listed in current

sonic_package_manager/manifest.py

+19-6
Original file line numberDiff line numberDiff line change
@@ -92,8 +92,10 @@ class ManifestRoot(ManifestNode):
9292

9393
def marshal(self, value: Optional[dict]):
9494
result = {}
95-
if value is None:
96-
value = {}
95+
value = value or {}
96+
97+
if not isinstance(value, dict):
98+
raise ManifestError(f'"{self.key}" field has to be a dictionary')
9799

98100
for item in self.items:
99101
next_value = value.get(item.key)
@@ -115,7 +117,7 @@ def marshal(self, value):
115117
if value is None:
116118
if self.default is not None:
117119
return self.default
118-
raise ManifestError(f'{self.key} is a required field but it is missing')
120+
raise ManifestError(f'"{self.key}" is a required field but it is missing')
119121
try:
120122
return_value = self.type.marshal(value)
121123
except Exception as err:
@@ -130,10 +132,12 @@ class ManifestArray(ManifestNode):
130132
type: Any
131133

132134
def marshal(self, value):
133-
if value is None:
134-
return []
135-
136135
return_value = []
136+
value = value or []
137+
138+
if not isinstance(value, list):
139+
raise ManifestError(f'"{self.key}" has to be of type list')
140+
137141
try:
138142
for item in value:
139143
return_value.append(self.type.marshal(item))
@@ -173,6 +177,14 @@ def unmarshal(self, value):
173177
ManifestField('asic-service', DefaultMarshaller(bool), False),
174178
ManifestField('host-service', DefaultMarshaller(bool), True),
175179
ManifestField('delayed', DefaultMarshaller(bool), False),
180+
ManifestRoot('warm-shutdown', [
181+
ManifestArray('after', DefaultMarshaller(str)),
182+
ManifestArray('before', DefaultMarshaller(str)),
183+
]),
184+
ManifestRoot('fast-shutdown', [
185+
ManifestArray('after', DefaultMarshaller(str)),
186+
ManifestArray('before', DefaultMarshaller(str)),
187+
]),
176188
]),
177189
ManifestRoot('container', [
178190
ManifestField('privileged', DefaultMarshaller(bool), False),
@@ -187,6 +199,7 @@ def unmarshal(self, value):
187199
]),
188200
ManifestArray('processes', ManifestRoot('processes', [
189201
ManifestField('name', DefaultMarshaller(str)),
202+
ManifestField('reconciles', DefaultMarshaller(bool), False),
190203
])),
191204
ManifestRoot('cli', [
192205
ManifestField('mandatory', DefaultMarshaller(bool), False),

0 commit comments

Comments
 (0)