From 5825165607b9b74b2d156dd240c4cdf7dd45cded Mon Sep 17 00:00:00 2001 From: Vivek Date: Thu, 27 Apr 2023 11:02:13 -0500 Subject: [PATCH] [Sys Mon] Fix the service entry delete in state_db because of timer job (#14702) Why I did it systemd stop event on service with timers can sometime delete the state_db entry for the corresponding service. Note: This won't be observed on the latest master label since the dependency on timer was removed with the recent config reload enhancement. However, it is better to have the fix since there might be some systemd services added to system health daemon in the future which may contain timers root@qa-eth-vt01-4-3700c:/home/admin# systemctl stop snmp root@qa-eth-vt01-4-3700c:/home/admin# show system-health sysready-status System is not ready - one or more services are not up Service-Name Service-Status App-Ready-Status Down-Reason ---------------------- ---------------- ------------------ ------------- ssh OK OK - swss OK OK - syncd OK OK - sysstat OK OK - teamd OK OK - telemetry OK OK - what-just-happened OK OK - ztp OK OK - Expected Should see a Down entry for SNMP instead of the entry being deleted from the STATE_DB root@qa-eth-vt01-4-3700c:/home/admin# show system-health sysready-status System is not ready - one or more services are not up Service-Name Service-Status App-Ready-Status Down-Reason ---------------------- ---------------- ------------------ ------------- snmp Down Down Inactive ssh OK OK - swss OK OK - syncd OK OK - sysstat OK OK - teamd OK OK - telemetry OK OK - what-just-happened OK OK - ztp OK OK - How I did it Happens because the timer is usually a PartOf service and thus a stop on service is propagated to timer. Fixed the logic to handle this Apr 18 02:06:47.711252 r-lionfish-16 DEBUG healthd: Main process- received event:snmp.service from source:sysbus time:2023-04-17 23:06:47 Apr 18 02:06:47.711347 r-lionfish-16 INFO healthd: check_unit_status for [ snmp.service ] Apr 18 02:06:47.722363 r-lionfish-16 INFO healthd: snmp.service service state changed to [inactive/dead] Apr 18 02:06:47.723230 r-lionfish-16 DEBUG healthd: Main process- received event:snmp.timer from source:sysbus time:2023-04-17 23:06:47 Apr 18 02:06:47.723328 r-lionfish-16 INFO healthd: check_unit_status for [ snmp.timer ] Signed-off-by: Vivek Reddy Karri --- src/system-health/health_checker/sysmonitor.py | 11 +++++++---- src/system-health/tests/test_system_health.py | 9 +++++++++ 2 files changed, 16 insertions(+), 4 deletions(-) diff --git a/src/system-health/health_checker/sysmonitor.py b/src/system-health/health_checker/sysmonitor.py index b3f43e447f7b..1357693c108a 100755 --- a/src/system-health/health_checker/sysmonitor.py +++ b/src/system-health/health_checker/sysmonitor.py @@ -407,10 +407,13 @@ def check_unit_status(self, event): self.dnsrvs_name.remove(event) srv_name,last = event.split('.') - key = 'ALL_SERVICE_STATUS|{}'.format(srv_name) - key_exists = self.state_db.exists(self.state_db.STATE_DB, key) - if key_exists == 1: - self.state_db.delete(self.state_db.STATE_DB, key) + # stop on service maybe propagated to timers and in that case, + # the state_db entry for the service should not be deleted + if last == "service": + key = 'ALL_SERVICE_STATUS|{}'.format(srv_name) + key_exists = self.state_db.exists(self.state_db.STATE_DB, key) + if key_exists == 1: + self.state_db.delete(self.state_db.STATE_DB, key) return 0 diff --git a/src/system-health/tests/test_system_health.py b/src/system-health/tests/test_system_health.py index cea80b48a349..29f2e14704cc 100644 --- a/src/system-health/tests/test_system_health.py +++ b/src/system-health/tests/test_system_health.py @@ -622,6 +622,15 @@ def test_check_unit_status(): assert 'mock_bgp.service' in sysmon.dnsrvs_name +@patch('health_checker.sysmonitor.Sysmonitor.get_all_service_list', MagicMock(return_value=['mock_snmp.service'])) +def test_check_unit_status_timer(): + sysmon = Sysmonitor() + sysmon.state_db = MagicMock() + sysmon.state_db.exists = MagicMock(return_value=1) + sysmon.state_db.delete = MagicMock() + sysmon.check_unit_status('mock_snmp.timer') + assert not sysmon.state_db.delete.called + @patch('health_checker.sysmonitor.Sysmonitor.run_systemctl_show', MagicMock(return_value=mock_srv_props['mock_radv.service'])) @patch('health_checker.sysmonitor.Sysmonitor.get_app_ready_status', MagicMock(return_value=('Up','-','-')))