Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[services] Restart SwSS service upon unexpected critical process exit #2845

Merged
merged 11 commits into from
May 1, 2019
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
38 changes: 18 additions & 20 deletions dockers/docker-dhcp-relay/wait_for_intf.sh.j2
Original file line number Diff line number Diff line change
@@ -1,42 +1,40 @@
#!/usr/bin/env bash

function wait_until_iface_ready
{
IFACE=$1
STATE_DB_IDX="6"

echo "Waiting until interface $IFACE is up..."

# Wait for the interface to come up (i.e., 'ip link show' returns 0)
until ip link show dev $IFACE up > /dev/null 2>&1; do
sleep 1
done
PORT_TABLE_PREFIX="PORT_TABLE"
VLAN_TABLE_PREFIX="VLAN_TABLE"
LAG_TABLE_PREFIX="LAG_TABLE"

echo "Interface $IFACE is up"
function wait_until_iface_ready
{
TABLE_PREFIX=$1
IFACE=$2

echo "Waiting until interface $IFACE has an IPv4 address..."
echo "Waiting until interface $IFACE is ready..."

# Wait until the interface gets assigned an IPv4 address
# Wait for the interface to come up
# (i.e., interface is present in STATE_DB and state is "ok")
while true; do
IP=$(ip -4 addr show dev $IFACE | grep "inet " | awk '{ print $2 }' | cut -d '/' -f1)

if [ -n "$IP" ]; then
RESULT=$(redis-cli -n ${STATE_DB_IDX} HGET "${TABLE_PREFIX}|${IFACE}" "state" 2> /dev/null)
if [ x"$RESULT" == x"ok" ]; then
break
fi

sleep 1
done

echo "Interface $IFACE is configured with IP $IP"
echo "Interface ${IFACE} is ready!"
}


# Wait for all interfaces to come up and have IPv4 addresses assigned
# Wait for all interfaces to be up and ready
{% for (name, prefix) in INTERFACE %}
wait_until_iface_ready {{ name }}
wait_until_iface_ready ${PORT_TABLE_PREFIX} {{ name }}
{% endfor %}
{% for (name, prefix) in VLAN_INTERFACE %}
wait_until_iface_ready {{ name }}
wait_until_iface_ready ${VLAN_TABLE_PREFIX} {{ name }}
{% endfor %}
{% for (name, prefix) in PORTCHANNEL_INTERFACE %}
wait_until_iface_ready {{ name }}
wait_until_iface_ready ${LAG_TABLE_PREFIX} {{ name }}
{% endfor %}
2 changes: 2 additions & 0 deletions dockers/docker-orchagent/Dockerfile.j2
Original file line number Diff line number Diff line change
Expand Up @@ -41,6 +41,8 @@ COPY ["files/arp_update", "/usr/bin"]
COPY ["enable_counters.py", "/usr/bin"]
COPY ["start.sh", "orchagent.sh", "swssconfig.sh", "/usr/bin/"]
COPY ["supervisord.conf", "/etc/supervisor/conf.d/"]
COPY ["files/supervisor-proc-exit-listener", "/usr/bin"]
COPY ["critical_processes", "/etc/supervisor/"]

## Copy all Jinja2 template files into the templates folder
COPY ["*.j2", "/usr/share/sonic/templates/"]
Expand Down
11 changes: 11 additions & 0 deletions dockers/docker-orchagent/critical_processes
Original file line number Diff line number Diff line change
@@ -0,0 +1,11 @@
orchagent
portsyncd
neighsyncd
vlanmgrd
intfmgrd
portmgrd
buffermgrd
vrfmgrd
nbrmgrd
vxlanmgrd
intfsyncd
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

intfsyncd is no longer present.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Thanks! I meant to delete that line, but forgot. It won't cause any issues, but I'll open a new PR to remove it soon.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

PR here: #2850

8 changes: 7 additions & 1 deletion dockers/docker-orchagent/supervisord.conf
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,12 @@ logfile_maxbytes=1MB
logfile_backups=2
nodaemon=true

[eventlistener:supervisor-proc-exit-listener]
command=/usr/bin/supervisor-proc-exit-listener
events=PROCESS_STATE_EXITED
autostart=true
autorestart=unexpected

[program:start.sh]
command=/usr/bin/start.sh
priority=1
Expand All @@ -15,7 +21,7 @@ stderr_logfile=syslog
command=/usr/sbin/rsyslogd -n
priority=2
autostart=false
autorestart=false
autorestart=unexpected
stdout_logfile=syslog
stderr_logfile=syslog

Expand Down
2 changes: 1 addition & 1 deletion files/build_templates/dhcp_relay.service.j2
Original file line number Diff line number Diff line change
Expand Up @@ -11,4 +11,4 @@ ExecStart=/usr/bin/{{ docker_container_name }}.sh wait
ExecStop=/usr/bin/{{ docker_container_name }}.sh stop

[Install]
WantedBy=multi-user.target teamd.service
WantedBy=multi-user.target swss.service teamd.service
2 changes: 1 addition & 1 deletion files/build_templates/radv.service.j2
Original file line number Diff line number Diff line change
Expand Up @@ -11,4 +11,4 @@ ExecStart=/usr/bin/{{ docker_container_name }}.sh wait
ExecStop=/usr/bin/{{ docker_container_name }}.sh stop

[Install]
WantedBy=multi-user.target
WantedBy=multi-user.target swss.service
3 changes: 3 additions & 0 deletions files/build_templates/snmp.service.j2
Original file line number Diff line number Diff line change
Expand Up @@ -9,3 +9,6 @@ Before=ntp-config.service
ExecStartPre=/usr/bin/{{docker_container_name}}.sh start
ExecStart=/usr/bin/{{docker_container_name}}.sh wait
ExecStop=/usr/bin/{{docker_container_name}}.sh stop

[Install]
WantedBy=multi-user.target swss.service
4 changes: 4 additions & 0 deletions files/build_templates/swss.service.j2
Original file line number Diff line number Diff line change
Expand Up @@ -9,13 +9,17 @@ Requires=nps-modules-4.9.0-8-2-amd64.service
After=database.service updategraph.service
After=interfaces-config.service
Before=ntp-config.service
StartLimitIntervalSec=1200
StartLimitBurst=3

[Service]
User=root
Environment=sonic_asic_platform={{ sonic_asic_platform }}
ExecStartPre=/usr/local/bin/swss.sh start
ExecStart=/usr/local/bin/swss.sh wait
ExecStop=/usr/local/bin/swss.sh stop
Restart=always
RestartSec=30

[Install]
WantedBy=multi-user.target
4 changes: 2 additions & 2 deletions files/build_templates/teamd.service.j2
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
[Unit]
Description=TEAMD container
Requires=updategraph.service
Requires=updategraph.service swss.service
After=updategraph.service swss.service
Before=ntp-config.service

Expand All @@ -11,4 +11,4 @@ ExecStart=/usr/bin/{{docker_container_name}}.sh wait
ExecStop=/usr/bin/{{docker_container_name}}.sh stop

[Install]
WantedBy=multi-user.target
WantedBy=multi-user.target swss.service
2 changes: 1 addition & 1 deletion files/image_config/systemd/journald.conf
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,7 @@
#Seal=yes
#SplitMode=uid
#SyncIntervalSec=5m
#RateLimitInterval=30s
#RateLimitIntervalSec=30s
#RateLimitBurst=1000
SystemMaxUse=50M
#SystemKeepFree=
Expand Down
45 changes: 45 additions & 0 deletions files/scripts/supervisor-proc-exit-listener
Original file line number Diff line number Diff line change
@@ -0,0 +1,45 @@
#!/usr/bin/env python

import os
import signal
import sys
import syslog

from supervisor import childutils

# Contents of file should be the names of critical processes (as defined in
# supervisor.conf file), one per line
CRITICAL_PROCESSES_FILE = '/etc/supervisor/critical_processes'

def main():
# Read the list of critical processes from a file
with open(CRITICAL_PROCESSES_FILE, 'r') as f:
critical_processes = [line.rstrip('\n') for line in f]

while True:
# Transition from ACKNOWLEDGED to READY
childutils.listener.ready()

line = sys.stdin.readline()
headers = childutils.get_headers(line)
payload = sys.stdin.read(int(headers['len']))

# Transition from READY to ACKNOWLEDGED
childutils.listener.ok()

# We only care about PROCESS_STATE_EXITED events
if headers['eventname'] == 'PROCESS_STATE_EXITED':
payload_headers, payload_data = childutils.eventdata(payload + '\n')

expected = int(payload_headers['expected'])
processname = payload_headers['processname']

# If a critical process exited unexpectedly, terminate supervisor
if expected == 0 and processname in critical_processes:
MSG_FORMAT_STR = "Process {} exited unxepectedly. Terminating supervisor..."
msg = MSG_FORMAT_STR.format(payload_headers['processname'])
syslog.syslog(syslog.LOG_INFO, msg)
os.kill(os.getppid(), signal.SIGTERM)

if __name__ == "__main__":
main()
2 changes: 1 addition & 1 deletion rules/docker-dhcp-relay.mk
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,7 @@ DOCKER_DHCP_RELAY_DBG = $(DOCKER_DHCP_RELAY_STEM)-$(DBG_IMAGE_MARK).gz

$(DOCKER_DHCP_RELAY)_PATH = $(DOCKERS_PATH)/$(DOCKER_DHCP_RELAY_STEM)

$(DOCKER_DHCP_RELAY)_DEPENDS += $(ISC_DHCP_COMMON) $(ISC_DHCP_RELAY)
$(DOCKER_DHCP_RELAY)_DEPENDS += $(ISC_DHCP_COMMON) $(ISC_DHCP_RELAY) $(REDIS_TOOLS)
$(DOCKER_DHCP_RELAY)_DBG_DEPENDS = $($(DOCKER_CONFIG_ENGINE_STRETCH)_DBG_DEPENDS)
$(DOCKER_DHCP_RELAY)_DBG_IMAGE_PACKAGES = $($(DOCKER_CONFIG_ENGINE_STRETCH)_DBG_IMAGE_PACKAGES)

Expand Down
2 changes: 1 addition & 1 deletion rules/docker-orchagent.mk
Original file line number Diff line number Diff line change
Expand Up @@ -34,4 +34,4 @@ $(DOCKER_ORCHAGENT)_RUN_OPT += -v /etc/sonic:/etc/sonic:ro
$(DOCKER_ORCHAGENT)_RUN_OPT += -v /var/log/swss:/var/log/swss:rw

$(DOCKER_ORCHAGENT)_BASE_IMAGE_FILES += swssloglevel:/usr/bin/swssloglevel
$(DOCKER_ORCHAGENT)_FILES += $(ARP_UPDATE_SCRIPT)
$(DOCKER_ORCHAGENT)_FILES += $(ARP_UPDATE_SCRIPT) $(SUPERVISOR_PROC_EXIT_LISTENER_SCRIPT)
6 changes: 5 additions & 1 deletion rules/scripts.mk
Original file line number Diff line number Diff line change
Expand Up @@ -11,9 +11,13 @@ $(BUFFERS_CONFIG_TEMPLATE)_PATH = files/build_templates
QOS_CONFIG_TEMPLATE = qos_config.j2
$(QOS_CONFIG_TEMPLATE)_PATH = files/build_templates

SUPERVISOR_PROC_EXIT_LISTENER_SCRIPT = supervisor-proc-exit-listener
$(SUPERVISOR_PROC_EXIT_LISTENER_SCRIPT)_PATH = files/scripts

SONIC_COPY_FILES += $(CONFIGDB_LOAD_SCRIPT) \
$(ARP_UPDATE_SCRIPT) \
$(BUFFERS_CONFIG_TEMPLATE) \
$(QOS_CONFIG_TEMPLATE)
$(QOS_CONFIG_TEMPLATE) \
$(SUPERVISOR_PROC_EXIT_LISTENER_SCRIPT)


50 changes: 24 additions & 26 deletions src/sonic-config-engine/tests/sample_output/wait_for_intf.sh
Original file line number Diff line number Diff line change
@@ -1,43 +1,41 @@
#!/usr/bin/env bash

function wait_until_iface_ready
{
IFACE=$1
STATE_DB_IDX="6"

echo "Waiting until interface $IFACE is up..."

# Wait for the interface to come up (i.e., 'ip link show' returns 0)
until ip link show dev $IFACE up > /dev/null 2>&1; do
sleep 1
done
PORT_TABLE_PREFIX="PORT_TABLE"
VLAN_TABLE_PREFIX="VLAN_TABLE"
LAG_TABLE_PREFIX="LAG_TABLE"

echo "Interface $IFACE is up"
function wait_until_iface_ready
{
TABLE_PREFIX=$1
IFACE=$2

echo "Waiting until interface $IFACE has an IPv4 address..."
echo "Waiting until interface $IFACE is ready..."

# Wait until the interface gets assigned an IPv4 address
# Wait for the interface to come up
# (i.e., interface is present in STATE_DB and state is "ok")
while true; do
IP=$(ip -4 addr show dev $IFACE | grep "inet " | awk '{ print $2 }' | cut -d '/' -f1)

if [ -n "$IP" ]; then
RESULT=$(redis-cli -n ${STATE_DB_IDX} HGET "${TABLE_PREFIX}|${IFACE}" "state" 2> /dev/null)
if [ x"$RESULT" == x"ok" ]; then
break
fi

sleep 1
done

echo "Interface $IFACE is configured with IP $IP"
echo "Interface ${IFACE} is ready!"
}


# Wait for all interfaces to come up and have IPv4 addresses assigned
wait_until_iface_ready Vlan1000
wait_until_iface_ready PortChannel01
wait_until_iface_ready PortChannel01
wait_until_iface_ready PortChannel02
wait_until_iface_ready PortChannel02
wait_until_iface_ready PortChannel03
wait_until_iface_ready PortChannel03
wait_until_iface_ready PortChannel04
wait_until_iface_ready PortChannel04
# Wait for all interfaces to be up and ready
wait_until_iface_ready ${VLAN_TABLE_PREFIX} Vlan1000
wait_until_iface_ready ${LAG_TABLE_PREFIX} PortChannel01
wait_until_iface_ready ${LAG_TABLE_PREFIX} PortChannel01
wait_until_iface_ready ${LAG_TABLE_PREFIX} PortChannel02
wait_until_iface_ready ${LAG_TABLE_PREFIX} PortChannel02
wait_until_iface_ready ${LAG_TABLE_PREFIX} PortChannel03
wait_until_iface_ready ${LAG_TABLE_PREFIX} PortChannel03
wait_until_iface_ready ${LAG_TABLE_PREFIX} PortChannel04
wait_until_iface_ready ${LAG_TABLE_PREFIX} PortChannel04