From fc81909dbc5104e327929175b844217c36666ff8 Mon Sep 17 00:00:00 2001 From: liuh-80 Date: Mon, 15 May 2023 02:05:52 +0000 Subject: [PATCH 1/6] Add UT for orchagent watchdog --- .azure-pipelines/pr_test_scripts.yaml | 1 + tests/system_health/test_watchdog.py | 54 +++++++++++++++++++++++++++ 2 files changed, 55 insertions(+) create mode 100644 tests/system_health/test_watchdog.py diff --git a/.azure-pipelines/pr_test_scripts.yaml b/.azure-pipelines/pr_test_scripts.yaml index 49cf62d13f..96b033bcbd 100644 --- a/.azure-pipelines/pr_test_scripts.yaml +++ b/.azure-pipelines/pr_test_scripts.yaml @@ -67,6 +67,7 @@ t0: - test_interfaces.py - test_procdockerstatsd.py - database/test_db_scripts.py + - system_health/test_watchdog.py t0-2vlans: - dhcp_relay/test_dhcp_relay.py diff --git a/tests/system_health/test_watchdog.py b/tests/system_health/test_watchdog.py new file mode 100644 index 0000000000..8ce2c32180 --- /dev/null +++ b/tests/system_health/test_watchdog.py @@ -0,0 +1,54 @@ +import pytest +import time +from tests.common.helpers.assertions import pytest_assert + +pytestmark = [ + pytest.mark.disable_loganalyzer, + pytest.mark.topology('any') +] + +logger = logging.getLogger(__name__) + +SLEEP_TIME = 10 + +@pytest.fixture +def pause_orchagent(duthost): + # find orchagent pid + pid = duthost.shell(r"ps -ef | grep orchagent | grep -v grep | awk '{print $2}'", module_ignore_errors=True)['stdout'] + logger.info('Get orchagent pid: {}'.format(pid)) + + # pause orchagent and clear syslog + duthost.shell(r"sudo kill -STOP {}".format(pid), module_ignore_errors=True) + duthost.shell(r"sudo truncate -s 0 /var/log/syslog", module_ignore_errors=True) + yield + + # resume orchagent and clear syslog + duthost.shell(r"sudo kill -CONT {}".format(pid), module_ignore_errors=True) + duthost.shell(r"sudo truncate -s 0 /var/log/syslog", module_ignore_errors=True) + + +def test_orchagent_watchdog(duthosts, enum_rand_one_per_hwsku_hostname, pause_orchagent): + duthost = duthosts[enum_rand_one_per_hwsku_hostname] + + result = duthost.shell( + r"docker exec -i swss sh -c 'test -f /usr/bin/supervisor-proc-watchdog-listener && echo exist'" + , module_ignore_errors=True)['stdout'] + logger.info('Check watchdog exist: {}'.format(result)) + if result != 'exist': + pytest.skip("Skip orchagent watchdog test.") + + # wait watchdog emit alert + WATCHDOG_TIMEOUT = 120 + current_attempt = 0 + while (True): + time.sleep(SLEEP_TIME) + alert = duthost.shell(r"sudo cat /var/log/syslog | grep 'is stuck in namespace'", module_ignore_errors=True)['stdout'] + logger.info('Get alert from host: {}'.format(alert)) + if "orchagent" in str(alert): + return + else: + # orchagent watchdog timeout is 60 seconds + if current_attempt >= WATCHDOG_TIMEOUT/SLEEP_TIME: + pytest_assert(False, "orchagent watchdog did not been trigger after {} seconds".format(WATCHDOG_TIMEOUT)) + else: + current_attempt += 1 From 5220eb2964637ece1ae17a86599e73c73a436cb6 Mon Sep 17 00:00:00 2001 From: liuh-80 Date: Mon, 15 May 2023 02:22:00 +0000 Subject: [PATCH 2/6] Fix code format --- tests/system_health/test_watchdog.py | 20 ++++++++++++++------ 1 file changed, 14 insertions(+), 6 deletions(-) diff --git a/tests/system_health/test_watchdog.py b/tests/system_health/test_watchdog.py index 8ce2c32180..f2bbd5f8f8 100644 --- a/tests/system_health/test_watchdog.py +++ b/tests/system_health/test_watchdog.py @@ -1,3 +1,4 @@ +import logging import pytest import time from tests.common.helpers.assertions import pytest_assert @@ -11,10 +12,13 @@ SLEEP_TIME = 10 + @pytest.fixture def pause_orchagent(duthost): # find orchagent pid - pid = duthost.shell(r"ps -ef | grep orchagent | grep -v grep | awk '{print $2}'", module_ignore_errors=True)['stdout'] + pid = duthost.shell( + r"ps -ef | grep orchagent | grep -v grep | awk '{print $2}'", + module_ignore_errors=True)['stdout'] logger.info('Get orchagent pid: {}'.format(pid)) # pause orchagent and clear syslog @@ -31,24 +35,28 @@ def test_orchagent_watchdog(duthosts, enum_rand_one_per_hwsku_hostname, pause_or duthost = duthosts[enum_rand_one_per_hwsku_hostname] result = duthost.shell( - r"docker exec -i swss sh -c 'test -f /usr/bin/supervisor-proc-watchdog-listener && echo exist'" - , module_ignore_errors=True)['stdout'] + r"docker exec -i swss sh -c 'test -f /usr/bin/supervisor-proc-watchdog-listener && echo exist'", + module_ignore_errors=True)['stdout'] logger.info('Check watchdog exist: {}'.format(result)) if result != 'exist': pytest.skip("Skip orchagent watchdog test.") - + # wait watchdog emit alert WATCHDOG_TIMEOUT = 120 current_attempt = 0 while (True): time.sleep(SLEEP_TIME) - alert = duthost.shell(r"sudo cat /var/log/syslog | grep 'is stuck in namespace'", module_ignore_errors=True)['stdout'] + alert = duthost.shell( + r"sudo cat /var/log/syslog | grep 'is stuck in namespace'", + module_ignore_errors=True)['stdout'] logger.info('Get alert from host: {}'.format(alert)) if "orchagent" in str(alert): return else: # orchagent watchdog timeout is 60 seconds if current_attempt >= WATCHDOG_TIMEOUT/SLEEP_TIME: - pytest_assert(False, "orchagent watchdog did not been trigger after {} seconds".format(WATCHDOG_TIMEOUT)) + pytest_assert( + False, + "orchagent watchdog did not been trigger after {} seconds".format(WATCHDOG_TIMEOUT)) else: current_attempt += 1 From 42d7c83951b1251dcc3bb56873573064721c2722 Mon Sep 17 00:00:00 2001 From: liuh-80 Date: Mon, 15 May 2023 02:24:40 +0000 Subject: [PATCH 3/6] Fix code format --- tests/system_health/test_watchdog.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/tests/system_health/test_watchdog.py b/tests/system_health/test_watchdog.py index f2bbd5f8f8..e737a806ba 100644 --- a/tests/system_health/test_watchdog.py +++ b/tests/system_health/test_watchdog.py @@ -20,10 +20,11 @@ def pause_orchagent(duthost): r"ps -ef | grep orchagent | grep -v grep | awk '{print $2}'", module_ignore_errors=True)['stdout'] logger.info('Get orchagent pid: {}'.format(pid)) - + # pause orchagent and clear syslog duthost.shell(r"sudo kill -STOP {}".format(pid), module_ignore_errors=True) duthost.shell(r"sudo truncate -s 0 /var/log/syslog", module_ignore_errors=True) + yield # resume orchagent and clear syslog @@ -59,4 +60,4 @@ def test_orchagent_watchdog(duthosts, enum_rand_one_per_hwsku_hostname, pause_or False, "orchagent watchdog did not been trigger after {} seconds".format(WATCHDOG_TIMEOUT)) else: - current_attempt += 1 + current_attempt += 1 \ No newline at end of file From b537019bd0c1b539255ec1ba983e1ad0f21dd50f Mon Sep 17 00:00:00 2001 From: liuh-80 Date: Mon, 15 May 2023 02:26:56 +0000 Subject: [PATCH 4/6] Fix code format --- tests/system_health/test_watchdog.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/system_health/test_watchdog.py b/tests/system_health/test_watchdog.py index e737a806ba..40d7816de9 100644 --- a/tests/system_health/test_watchdog.py +++ b/tests/system_health/test_watchdog.py @@ -60,4 +60,4 @@ def test_orchagent_watchdog(duthosts, enum_rand_one_per_hwsku_hostname, pause_or False, "orchagent watchdog did not been trigger after {} seconds".format(WATCHDOG_TIMEOUT)) else: - current_attempt += 1 \ No newline at end of file + current_attempt += 1 From a7e11a88572f3ecda38e831abf94aaa744185e05 Mon Sep 17 00:00:00 2001 From: liuh-80 Date: Tue, 23 May 2023 05:15:23 +0000 Subject: [PATCH 5/6] Improve UT --- tests/system_health/test_watchdog.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/system_health/test_watchdog.py b/tests/system_health/test_watchdog.py index 40d7816de9..8dab6ce611 100644 --- a/tests/system_health/test_watchdog.py +++ b/tests/system_health/test_watchdog.py @@ -36,7 +36,7 @@ def test_orchagent_watchdog(duthosts, enum_rand_one_per_hwsku_hostname, pause_or duthost = duthosts[enum_rand_one_per_hwsku_hostname] result = duthost.shell( - r"docker exec -i swss sh -c 'test -f /usr/bin/supervisor-proc-watchdog-listener && echo exist'", + r"docker exec -i swss sh -c 'test -f /etc/supervisor/watchdog_processes && echo exist'", module_ignore_errors=True)['stdout'] logger.info('Check watchdog exist: {}'.format(result)) if result != 'exist': From 6d214e3d54f0a78abd894e673b93dce893433aea Mon Sep 17 00:00:00 2001 From: liuh-80 Date: Thu, 8 Jun 2023 05:58:59 +0000 Subject: [PATCH 6/6] Improve UT --- tests/system_health/test_watchdog.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tests/system_health/test_watchdog.py b/tests/system_health/test_watchdog.py index 8dab6ce611..2e9361ae72 100644 --- a/tests/system_health/test_watchdog.py +++ b/tests/system_health/test_watchdog.py @@ -17,7 +17,7 @@ def pause_orchagent(duthost): # find orchagent pid pid = duthost.shell( - r"ps -ef | grep orchagent | grep -v grep | awk '{print $2}'", + r"pgrep orchagent", module_ignore_errors=True)['stdout'] logger.info('Get orchagent pid: {}'.format(pid)) @@ -42,7 +42,7 @@ def test_orchagent_watchdog(duthosts, enum_rand_one_per_hwsku_hostname, pause_or if result != 'exist': pytest.skip("Skip orchagent watchdog test.") - # wait watchdog emit alert + # wait watchdog emit alert, orchagent watchdog timeout is 60 seconds WATCHDOG_TIMEOUT = 120 current_attempt = 0 while (True):