Add watchdog mechanism to swss service and generate alert when swss has issues. (#14686)

This PR depends on sonic-net/sonic-swss#2737 being merged first.

**What I did**
Add an orchagent watchdog to monitor the orchagent process and raise an alert when it gets stuck.

**Why I did it**
Currently, the SONiC monit system only checks whether the orchagent process exists. If orchagent gets stuck and stops processing, the current monit setup cannot detect or report it.

**How I verified it**
Passed all UTs.
Added a new UT, sonic-net/sonic-mgmt#8306, to check that the watchdog works correctly.
Manual test: after pausing orchagent with 'kill -STOP <pid>', verified that a warning message appears in the log:

Apr 28 23:36:41.504923 vlab-01 ERR swss#supervisor-proc-watchdog-listener: Process 'orchagent' is stuck in namespace 'host' (1.0 minutes).

**Details if related**
Heartbeat message PR: sonic-net/sonic-swss#2737
UT PR: sonic-net/sonic-mgmt#8306
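
For context on the mechanism: supervisord only turns process output into PROCESS_COMMUNICATION_STDOUT events when the program's stdout_capture_maxbytes is non-zero and the process wraps its message in supervisor's capture tokens. The real heartbeat emitter lives in orchagent (sonic-net/sonic-swss#2737); the Python sketch below only illustrates the mechanism, and the message text and interval are illustrative assumptions:

import sys
import time

# Supervisor's capture-mode tokens: output written between them is delivered
# to event listeners as a PROCESS_COMMUNICATION_STDOUT event, provided the
# program's stdout_capture_maxbytes setting is non-zero.
BEGIN_TOKEN = '<!--XSUPERVISOR:BEGIN-->'
END_TOKEN = '<!--XSUPERVISOR:END-->'

def emit_heartbeat(message='heartbeat'):
    sys.stdout.write(BEGIN_TOKEN + message + END_TOKEN)
    sys.stdout.flush()

while True:
    emit_heartbeat()
    time.sleep(30)  # illustrative interval; the real period is set by orchagent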
liuh-80 authored Jun 6, 2023
1 parent 381cfe4 commit 44427a2
Showing 4 changed files with 44 additions and 16 deletions.
1 change: 1 addition & 0 deletions dockers/docker-orchagent/docker-init.j2
@@ -18,6 +18,7 @@ CFGGEN_PARAMS=" \
 -t /usr/share/sonic/templates/vlan_vars.j2 \
 -t /usr/share/sonic/templates/ndppd.conf.j2,/etc/ndppd.conf \
 -t /usr/share/sonic/templates/critical_processes.j2,/etc/supervisor/critical_processes \
+-t /usr/share/sonic/templates/watchdog_processes.j2,/etc/supervisor/watchdog_processes \
 -t /usr/share/sonic/templates/supervisord.conf.j2,/etc/supervisor/conf.d/supervisord.conf \
 -t /usr/share/sonic/templates/wait_for_link.sh.j2,/usr/bin/wait_for_link.sh \
 "
3 changes: 2 additions & 1 deletion dockers/docker-orchagent/supervisord.conf.j2
@@ -14,7 +14,7 @@ buffer_size=1024

 [eventlistener:supervisor-proc-exit-listener]
 command=/usr/bin/supervisor-proc-exit-listener --container-name swss
-events=PROCESS_STATE_EXITED,PROCESS_STATE_RUNNING
+events=PROCESS_STATE_EXITED,PROCESS_STATE_RUNNING,PROCESS_COMMUNICATION_STDOUT
 autostart=true
 autorestart=unexpected
 buffer_size=1024
@@ -75,6 +75,7 @@ command=/usr/bin/orchagent.sh
 priority=4
 autostart=false
 autorestart=false
+stdout_capture_maxbytes=1MB
 stdout_logfile=syslog
 stderr_logfile=syslog
 dependent_startup=true
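
The two changes above work as a pair: stdout_capture_maxbytes=1MB enables supervisor's capture mode for orchagent, and the extra PROCESS_COMMUNICATION_STDOUT entry subscribes the listener to the resulting events. A minimal sketch of how such an event's payload gets decoded; the payload text and pid here are illustrative:

from supervisor import childutils

# Illustrative PROCESS_COMMUNICATION_STDOUT payload: a header line followed by
# the captured stdout data. The exact heartbeat text comes from orchagent.
payload = 'processname:orchagent groupname:orchagent pid:42\nheartbeat'

# childutils.eventdata() splits the payload at the first newline into a header
# dict and the captured data, which is why the listener appends '\n' before
# parsing payloads that may lack one.
payload_headers, payload_data = childutils.eventdata(payload + '\n')
print(payload_headers['processname'])  # orchagent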
1 change: 1 addition & 0 deletions dockers/docker-orchagent/watchdog_processes.j2
@@ -0,0 +1 @@
+program:orchagent
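
Once this template is rendered to /etc/supervisor/watchdog_processes, the listener's reworked helper (next diff) parses each line into a key/value pair. A self-contained stand-in for that parsing step, assuming the rendered file contains only the line shown above:

# Minimal stand-in for the listener's parsing of /etc/supervisor/watchdog_processes.
line = 'program:orchagent'
identifier_key, identifier_value = (part.strip() for part in line.strip(' \n').split(':'))
print(identifier_key, identifier_value)  # program orchagent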
55 changes: 40 additions & 15 deletions files/scripts/supervisor-proc-exit-listener
@@ -14,6 +14,12 @@ from swsscommon import swsscommon

 from supervisor import childutils

+# Each line of this file should specify one process, (as defined in supervisord.conf file), in the
+# following format:
+#
+# program:<process_name>
+WATCH_PROCESSES_FILE = '/etc/supervisor/watchdog_processes'
+
 # Each line of this file should specify either one critical process or one
 # critical process group, (as defined in supervisord.conf file), in the
 # following format:
@@ -34,40 +40,40 @@ ALERTING_INTERVAL_SECS = 60
 EVENTS_PUBLISHER_SOURCE = "sonic-events-host"
 EVENTS_PUBLISHER_TAG = "process-exited-unexpectedly"

-def get_critical_group_and_process_list():
+def get_group_and_process_list(process_file):
     """
-    @summary: Read the critical processes/group names from CRITICAL_PROCESSES_FILE.
+    @summary: Read the critical processes/group names.
     @return: Two lists which contain critical processes and group names respectively.
     """
-    critical_group_list = []
-    critical_process_list = []
+    group_list = []
+    process_list = []

-    with open(CRITICAL_PROCESSES_FILE, 'r') as file:
+    with open(process_file, 'r') as file:
         for line in file:
             # ignore blank lines
             if re.match(r"^\s*$", line):
                 continue
             line_info = line.strip(' \n').split(':')
             if len(line_info) != 2:
                 syslog.syslog(syslog.LOG_ERR,
-                              "Syntax of the line {} in critical_processes file is incorrect. Exiting...".format(line))
+                              "Syntax of the line {} in processes file is incorrect. Exiting...".format(line))
                 sys.exit(5)

             identifier_key = line_info[0].strip()
             identifier_value = line_info[1].strip()
             if identifier_key == "group" and identifier_value:
-                critical_group_list.append(identifier_value)
+                group_list.append(identifier_value)
             elif identifier_key == "program" and identifier_value:
-                critical_process_list.append(identifier_value)
+                process_list.append(identifier_value)
             else:
                 syslog.syslog(syslog.LOG_ERR,
-                              "Syntax of the line {} in critical_processes file is incorrect. Exiting...".format(line))
+                              "Syntax of the line {} in processes file is incorrect. Exiting...".format(line))
                 sys.exit(6)

-    return critical_group_list, critical_process_list
+    return group_list, process_list
@@ -80,8 +86,8 @@ def generate_alerting_message(process_name, dead_minutes):
     else:
         namespace = namespace_prefix + namespace_id

-    syslog.syslog(syslog.LOG_ERR, "Process '{}' is not running in namespace '{}' ({} minutes)."
-                  .format(process_name, namespace, dead_minutes))
+    syslog.syslog(syslog.LOG_ERR, "Process '{}' is {} in namespace '{}' ({} minutes)."
+                  .format(process_name, status, namespace, dead_minutes))


 def get_autorestart_state(container_name):
@@ -125,9 +131,11 @@ def main(argv):
         syslog.syslog(syslog.LOG_ERR, "Container name not specified. Exiting...")
         sys.exit(1)

-    critical_group_list, critical_process_list = get_critical_group_and_process_list()
+    critical_group_list, critical_process_list = get_group_and_process_list(CRITICAL_PROCESSES_FILE)
+    _, watch_process_list = get_group_and_process_list(WATCH_PROCESSES_FILE)

     process_under_alerting = defaultdict(dict)
+    process_heart_beat_info = defaultdict(dict)
     # Transition from ACKNOWLEDGED to READY
     childutils.listener.ready()
     events_handle = swsscommon.events_init_publisher(EVENTS_PUBLISHER_SOURCE)
@@ -167,6 +175,15 @@
             if process_name in process_under_alerting:
                 process_under_alerting.pop(process_name)

+        # Handle the PROCESS_COMMUNICATION_STDOUT event
+        elif headers['eventname'] == 'PROCESS_COMMUNICATION_STDOUT':
+            payload_headers, payload_data = childutils.eventdata(payload + '\n')
+            process_name = payload_headers['processname']
+
+            # update process heart beat time
+            if (process_name in watch_process_list):
+                process_heart_beat_info[process_name]["last_heart_beat"] = time.time()
+
         # Transition from BUSY to ACKNOWLEDGED
         childutils.listener.ok()
@@ -181,7 +198,15 @@
                 elapsed_mins = elapsed_secs // 60
                 process_under_alerting[process_name]["last_alerted"] = epoch_time
                 process_under_alerting[process_name]["dead_minutes"] += elapsed_mins
-                generate_alerting_message(process_name, process_under_alerting[process_name]["dead_minutes"])
+                generate_alerting_message(process_name, "not running", process_under_alerting[process_name]["dead_minutes"])
+
+        # Check whether we need write alerting messages into syslog
+        for process in process_heart_beat_info.keys():
+            epoch_time = time.time()
+            elapsed_secs = epoch_time - process_heart_beat_info[process]["last_heart_beat"]
+            if elapsed_secs >= ALERTING_INTERVAL_SECS:
+                elapsed_mins = elapsed_secs // 60
+                generate_alerting_message(process, "stuck", elapsed_mins)

 if __name__ == "__main__":
     main(sys.argv[1:])
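
For readers unfamiliar with the supervisor eventlistener protocol behind the comments "Transition from ACKNOWLEDGED to READY" and "Transition from BUSY to ACKNOWLEDGED", here is a minimal, simplified skeleton of the event loop; it is not the SONiC implementation itself:

import sys

from supervisor import childutils

def listener_loop():
    while True:
        # wait() writes 'READY' to stdout and blocks until supervisord sends
        # the next event (READY -> BUSY), returning its headers and payload.
        headers, payload = childutils.listener.wait(sys.stdin, sys.stdout)

        if headers['eventname'] == 'PROCESS_COMMUNICATION_STDOUT':
            payload_headers, payload_data = childutils.eventdata(payload + '\n')
            # ... record a heartbeat for payload_headers['processname'] ...

        # ok() reports the result back to supervisord (BUSY -> ACKNOWLEDGED);
        # the next wait() call then advances the listener to READY again.
        childutils.listener.ok(sys.stdout)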
