Skip to content

Commit

Permalink
[Fix] Fix multi-asic flaky lldp not ready issue (sonic-net#9453)
Browse files Browse the repository at this point in the history
Description of PR
In the recover of sanity check, for config_reload, we simply sleep 120s.
In some topology, like multi-asic scenario, it's not enough.
lldp services get a big chance that not ready after 120s, then sanity failed.

Use config_reload with safe_reload to better handle this flaky issue.

Approach
What is the motivation for this PR?
Fix multi-asic lldp flaky issue.

How did you do it?
Use config_reload with safe_reload to better handle this flaky issue.

co-authorized by: jianquanye@microsoft.com
  • Loading branch information
yejianquan authored and mssonicbld committed Aug 16, 2023
1 parent 80a5ab1 commit f40f325
Show file tree
Hide file tree
Showing 3 changed files with 42 additions and 23 deletions.
14 changes: 7 additions & 7 deletions tests/common/plugins/sanity_check/constants.py
Original file line number Diff line number Diff line change
Expand Up @@ -22,43 +22,43 @@
# Recover related definitions
RECOVER_METHODS = {
"config_reload": {
"cmd": "bash -c 'config reload -y &>/dev/null'",
"reboot": False,
"adaptive": False,
'recover_wait': 120
},
"config_reload_f": {
"cmd": "bash -c 'config reload -f -y &>/dev/null'",
"cmd": "false",
"reload": True,
"reboot": False,
"adaptive": False,
'recover_wait': 120
},
"load_minigraph": {
"cmd": "bash -c 'config load_minigraph -y &>/dev/null'",
"reload": False,
"reboot": False,
"adaptive": False,
'recover_wait': 60
},
"reboot": {
"cmd": "reboot",
"reload": False,
"reboot": True,
"adaptive": False,
'recover_wait': 120
},
"warm_reboot": {
"cmd": "warm-reboot",
"reload": False,
"reboot": True,
"adaptive": False,
'recover_wait': 120
},
"fast_reboot": {
"cmd": "fast_reboot",
"reload": False,
"reboot": True,
"adaptive": False,
'recover_wait': 120
},
"adaptive": {
"cmd": None,
"reload": False,
"reboot": False,
"adaptive": True,
'recover_wait': 30
Expand Down
27 changes: 13 additions & 14 deletions tests/common/plugins/sanity_check/recover.py
Original file line number Diff line number Diff line change
@@ -1,16 +1,14 @@
import json
import logging

from . import constants

from tests.common.utilities import wait
from tests.common.platform.device_utils import fanout_switch_port_lookup
from tests.common.config_reload import config_force_option_supported
from tests.common.reboot import reboot
from tests.common.reboot import REBOOT_TYPE_WARM, REBOOT_TYPE_FAST, REBOOT_TYPE_COLD
from tests.common.helpers.parallel import parallel_run, reset_ansible_local_tmp
from tests.common import config_reload
from tests.common.devices.sonic import SonicHost
from tests.common.helpers.parallel import parallel_run, reset_ansible_local_tmp
from tests.common.platform.device_utils import fanout_switch_port_lookup
from tests.common.reboot import REBOOT_TYPE_WARM, REBOOT_TYPE_FAST, REBOOT_TYPE_COLD
from tests.common.reboot import reboot
from tests.common.utilities import wait
from . import constants

logger = logging.getLogger(__name__)

Expand All @@ -25,7 +23,7 @@ def reboot_dut(dut, localhost, cmd):
else:
reboot_type = REBOOT_TYPE_COLD

reboot(dut, localhost, reboot_type=reboot_type)
reboot(dut, localhost, reboot_type=reboot_type, safe_reboot=True, check_intf_up_ports=True)


def _recover_interfaces(dut, fanouthosts, result, wait_time):
Expand Down Expand Up @@ -160,24 +158,25 @@ def adaptive_recover(dut, localhost, fanouthosts, nbrhosts, tbinfo, check_result
.format(result, action, outstanding_action))

if outstanding_action:
if outstanding_action == "config_reload" and config_force_option_supported(dut):
outstanding_action = "config_reload_f"
method = constants.RECOVER_METHODS[outstanding_action]
wait_time = method['recover_wait']
if method["reboot"]:
if method["reload"]:
config_reload(dut, safe_reload=True, check_intf_up_ports=True)
elif method["reboot"]:
reboot_dut(dut, localhost, method["cmd"])
else:
_recover_with_command(dut, method['cmd'], wait_time)


def recover(dut, localhost, fanouthosts, nbrhosts, tbinfo, check_results, recover_method):
logger.warning("Try to recover %s using method %s" % (dut.hostname, recover_method))
if recover_method == "config_reload" and config_force_option_supported(dut):
recover_method = "config_reload_f"

method = constants.RECOVER_METHODS[recover_method]
wait_time = method['recover_wait']
if method["adaptive"]:
adaptive_recover(dut, localhost, fanouthosts, nbrhosts, tbinfo, check_results, wait_time)
elif method["reload"]:
config_reload(dut, safe_reload=True, check_intf_up_ports=True)
elif method["reboot"]:
reboot_dut(dut, localhost, method["cmd"])
else:
Expand Down
24 changes: 22 additions & 2 deletions tests/common/reboot.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,10 @@
import os
from multiprocessing.pool import ThreadPool
from collections import deque

from .helpers.assertions import pytest_assert
from .platform.interface_utils import check_interface_status_of_up_ports
from .platform.processes_utils import wait_critical_processes
from .utilities import wait_until, get_plt_reboot_ctrl
from tests.common.helpers.dut_utils import ignore_t2_syslog_msgs

Expand Down Expand Up @@ -195,7 +199,8 @@ def execute_reboot_helper():

def reboot(duthost, localhost, reboot_type='cold', delay=10,
timeout=0, wait=0, wait_for_ssh=True, wait_warmboot_finalizer=False, warmboot_finalizer_timeout=0,
reboot_helper=None, reboot_kwargs=None, plt_reboot_ctrl_overwrite=True):
reboot_helper=None, reboot_kwargs=None, plt_reboot_ctrl_overwrite=True,
safe_reboot=False, check_intf_up_ports=False):
"""
reboots DUT
:param duthost: DUT host object
Expand All @@ -208,6 +213,8 @@ def reboot(duthost, localhost, reboot_type='cold', delay=10,
:param wait_warmboot_finalizer=True: Wait for WARMBOOT_FINALIZER done
:param reboot_helper: helper function to execute the power toggling
:param reboot_kwargs: arguments to pass to the reboot_helper
:param safe_reboot: arguments to wait DUT ready after reboot
:param check_intf_up_ports: arguments to check interface after reboot
:return:
"""
pool = ThreadPool()
Expand Down Expand Up @@ -240,7 +247,20 @@ def reboot(duthost, localhost, reboot_type='cold', delay=10,

logger.info('waiting for switch {} to initialize'.format(hostname))

time.sleep(wait)
if safe_reboot:
# The wait time passed in might not be guaranteed to cover the actual
# time it takes for containers to come back up. Therefore, add 5
# minutes to the maximum wait time. If it's ready sooner, then the
# function will return sooner.
pytest_assert(wait_until(wait + 300, 20, 0, duthost.critical_services_fully_started),
"All critical services should be fully started!")
wait_critical_processes(duthost)

if check_intf_up_ports:
pytest_assert(wait_until(300, 20, 0, check_interface_status_of_up_ports, duthost),
"Not all ports that are admin up on are operationally up")
else:
time.sleep(wait)

# Wait warmboot-finalizer service
if reboot_type == REBOOT_TYPE_WARM and wait_warmboot_finalizer:
Expand Down

0 comments on commit f40f325

Please sign in to comment.