From cfcbfd21b47aeeb686fae8b0fadfa83613d152ab Mon Sep 17 00:00:00 2001 From: Joe LeVeque Date: Tue, 11 Dec 2018 22:49:47 +0000 Subject: [PATCH 01/11] [service] Restart SwSS Docker container if orchagent exits unexpectedly --- dockers/docker-orchagent/Dockerfile.j2 | 1 + .../supervisor-proc-exit-listener | 40 +++++++++++++++++++ dockers/docker-orchagent/supervisord.conf | 6 +++ files/build_templates/dhcp_relay.service.j2 | 1 + files/build_templates/radv.service.j2 | 1 + files/build_templates/swss.service.j2 | 2 + files/build_templates/teamd.service.j2 | 2 +- 7 files changed, 52 insertions(+), 1 deletion(-) create mode 100755 dockers/docker-orchagent/supervisor-proc-exit-listener diff --git a/dockers/docker-orchagent/Dockerfile.j2 b/dockers/docker-orchagent/Dockerfile.j2 index d29ea6a8905a..fab61d160f14 100755 --- a/dockers/docker-orchagent/Dockerfile.j2 +++ b/dockers/docker-orchagent/Dockerfile.j2 @@ -39,6 +39,7 @@ RUN rm -rf /debs COPY ["files/arp_update", "/usr/bin"] COPY ["enable_counters.py", "/usr/bin"] +COPY ["supervisor-proc-exit-listener", "/usr/bin"] COPY ["start.sh", "orchagent.sh", "swssconfig.sh", "/usr/bin/"] COPY ["supervisord.conf", "/etc/supervisor/conf.d/"] diff --git a/dockers/docker-orchagent/supervisor-proc-exit-listener b/dockers/docker-orchagent/supervisor-proc-exit-listener new file mode 100755 index 000000000000..8e55942321d9 --- /dev/null +++ b/dockers/docker-orchagent/supervisor-proc-exit-listener @@ -0,0 +1,40 @@ +#!/usr/bin/env python + +import os +import signal +import sys +import syslog + +from supervisor import childutils + +# Process names as defined in supervisor.conf file +critical_processes = ['orchagent'] + +def main(): + while True: + # Transition from ACKNOWLEDGED to READY + childutils.listener.ready() + + line = sys.stdin.readline() + headers = childutils.get_headers(line) + payload = sys.stdin.read(int(headers['len'])) + + # Transition from READY to ACKNOWLEDGED + childutils.listener.ok() + + # We only care about PROCESS_STATE_EXITED events + if headers['eventname'] == 'PROCESS_STATE_EXITED': + payload_headers, payload_data = childutils.eventdata(payload + '\n') + + expected = int(payload_headers['expected']) + processname = payload_headers['processname'] + + # If a critical process exited unexpectedly, terminate supervisor + if expected == 0 and processname in critical_processes: + MSG_FORMAT_STR = "Process {} exited unxepectedly. Terminating supervisor..." + msg = MSG_FORMAT_STR.format(payload_headers['processname']) + syslog.syslog(syslog.LOG_INFO, msg) + os.kill(os.getppid(), signal.SIGTERM) + +if __name__ == "__main__": + main() diff --git a/dockers/docker-orchagent/supervisord.conf b/dockers/docker-orchagent/supervisord.conf index 4479a27f2a7b..10c7af1a79a4 100644 --- a/dockers/docker-orchagent/supervisord.conf +++ b/dockers/docker-orchagent/supervisord.conf @@ -3,6 +3,12 @@ logfile_maxbytes=1MB logfile_backups=2 nodaemon=true +[eventlistener:supervisor-proc-exit-listener] +command=/usr/bin/supervisor-proc-exit-listener +events=PROCESS_STATE_EXITED +autostart=true +autorestart=unexpected + [program:start.sh] command=/usr/bin/start.sh priority=1 diff --git a/files/build_templates/dhcp_relay.service.j2 b/files/build_templates/dhcp_relay.service.j2 index 5a462e53973d..e1d2779c2c71 100644 --- a/files/build_templates/dhcp_relay.service.j2 +++ b/files/build_templates/dhcp_relay.service.j2 @@ -1,6 +1,7 @@ [Unit] Description=DHCP relay container Requires=updategraph.service swss.service teamd.service +PartOf=swss.service After=updategraph.service swss.service teamd.service Before=ntp-config.service diff --git a/files/build_templates/radv.service.j2 b/files/build_templates/radv.service.j2 index 8cda2fdd0afb..989b009e0321 100644 --- a/files/build_templates/radv.service.j2 +++ b/files/build_templates/radv.service.j2 @@ -1,6 +1,7 @@ [Unit] Description=Router advertiser container Requires=updategraph.service swss.service +PartOf=swss.service After=updategraph.service swss.service Before=ntp-config.service diff --git a/files/build_templates/swss.service.j2 b/files/build_templates/swss.service.j2 index b391f95de52d..b9de98322c26 100644 --- a/files/build_templates/swss.service.j2 +++ b/files/build_templates/swss.service.j2 @@ -16,6 +16,8 @@ Environment=sonic_asic_platform={{ sonic_asic_platform }} ExecStartPre=/usr/local/bin/swss.sh start ExecStart=/usr/local/bin/swss.sh wait ExecStop=/usr/local/bin/swss.sh stop +Restart=always +RestartSec=30 [Install] WantedBy=multi-user.target diff --git a/files/build_templates/teamd.service.j2 b/files/build_templates/teamd.service.j2 index 8034698ecc07..1bea7b7d48be 100644 --- a/files/build_templates/teamd.service.j2 +++ b/files/build_templates/teamd.service.j2 @@ -1,6 +1,6 @@ [Unit] Description=TEAMD container -Requires=updategraph.service +Requires=updategraph.service swss.service After=updategraph.service swss.service Before=ntp-config.service From 3877850b67015a980509e90eeb6677bf01d16473 Mon Sep 17 00:00:00 2001 From: Joe LeVeque Date: Fri, 5 Apr 2019 20:58:29 +0000 Subject: [PATCH 02/11] Move supervisor-proc-exit-listener script --- dockers/docker-orchagent/Dockerfile.j2 | 3 ++- dockers/docker-orchagent/critical_processes | 7 +++++++ dockers/docker-orchagent/supervisord.conf | 2 +- .../scripts}/supervisor-proc-exit-listener | 0 rules/scripts.mk | 6 +++++- 5 files changed, 15 insertions(+), 3 deletions(-) create mode 100644 dockers/docker-orchagent/critical_processes rename {dockers/docker-orchagent => files/scripts}/supervisor-proc-exit-listener (100%) diff --git a/dockers/docker-orchagent/Dockerfile.j2 b/dockers/docker-orchagent/Dockerfile.j2 index fab61d160f14..8f480a53287e 100755 --- a/dockers/docker-orchagent/Dockerfile.j2 +++ b/dockers/docker-orchagent/Dockerfile.j2 @@ -39,9 +39,10 @@ RUN rm -rf /debs COPY ["files/arp_update", "/usr/bin"] COPY ["enable_counters.py", "/usr/bin"] -COPY ["supervisor-proc-exit-listener", "/usr/bin"] COPY ["start.sh", "orchagent.sh", "swssconfig.sh", "/usr/bin/"] COPY ["supervisord.conf", "/etc/supervisor/conf.d/"] +COPY ["files/supervisor-proc-exit-listener", "/usr/bin"] +COPY ["critical_processes", "/etc/supervisor/"] ## Copy all Jinja2 template files into the templates folder COPY ["*.j2", "/usr/share/sonic/templates/"] diff --git a/dockers/docker-orchagent/critical_processes b/dockers/docker-orchagent/critical_processes new file mode 100644 index 000000000000..d48eb66cda1b --- /dev/null +++ b/dockers/docker-orchagent/critical_processes @@ -0,0 +1,7 @@ +orchagent +portsyncd +intfsyncd +neighsyncd +vlanmgrd +intfmgrd +buffermgrd diff --git a/dockers/docker-orchagent/supervisord.conf b/dockers/docker-orchagent/supervisord.conf index 10c7af1a79a4..46b37e77c0e4 100644 --- a/dockers/docker-orchagent/supervisord.conf +++ b/dockers/docker-orchagent/supervisord.conf @@ -21,7 +21,7 @@ stderr_logfile=syslog command=/usr/sbin/rsyslogd -n priority=2 autostart=false -autorestart=false +autorestart=unexpected stdout_logfile=syslog stderr_logfile=syslog diff --git a/dockers/docker-orchagent/supervisor-proc-exit-listener b/files/scripts/supervisor-proc-exit-listener similarity index 100% rename from dockers/docker-orchagent/supervisor-proc-exit-listener rename to files/scripts/supervisor-proc-exit-listener diff --git a/rules/scripts.mk b/rules/scripts.mk index 6010b10b832b..8c6d0324fc0e 100644 --- a/rules/scripts.mk +++ b/rules/scripts.mk @@ -11,9 +11,13 @@ $(BUFFERS_CONFIG_TEMPLATE)_PATH = files/build_templates QOS_CONFIG_TEMPLATE = qos_config.j2 $(QOS_CONFIG_TEMPLATE)_PATH = files/build_templates +SUPERVISOR_PROC_EXIT_LISTENER_SCRIPT = supervisor-proc-exit-listener +$(SUPERVISOR_PROC_EXIT_LISTENER_SCRIPT)_PATH = files/scripts + SONIC_COPY_FILES += $(CONFIGDB_LOAD_SCRIPT) \ $(ARP_UPDATE_SCRIPT) \ $(BUFFERS_CONFIG_TEMPLATE) \ - $(QOS_CONFIG_TEMPLATE) + $(QOS_CONFIG_TEMPLATE) \ + $(SUPERVISOR_PROC_EXIT_LISTENER_SCRIPT) From c847c985fed2274f4bb97f3677a90fb0f3ecc5bc Mon Sep 17 00:00:00 2001 From: Joe LeVeque Date: Thu, 7 Feb 2019 22:38:49 +0000 Subject: [PATCH 03/11] Configure systemd to stop restarting swss if it attempts to restart more than 3 times in 20 minutes --- files/build_templates/swss.service.j2 | 2 ++ 1 file changed, 2 insertions(+) diff --git a/files/build_templates/swss.service.j2 b/files/build_templates/swss.service.j2 index b9de98322c26..45adbe698a13 100644 --- a/files/build_templates/swss.service.j2 +++ b/files/build_templates/swss.service.j2 @@ -9,6 +9,8 @@ Requires=nps-modules-4.9.0-8-2-amd64.service After=database.service updategraph.service After=interfaces-config.service Before=ntp-config.service +StartLimitInterval=1200 +StartLimitBurst=3 [Service] User=root From 0db6c24b0f221ba0709cadc78fb2c82bf143531d Mon Sep 17 00:00:00 2001 From: Joe LeVeque Date: Fri, 5 Apr 2019 20:41:56 +0000 Subject: [PATCH 04/11] [docker-dhcp-relay] Enhance wait_for_intf.sh.j2 to utilize STATEDB --- dockers/docker-dhcp-relay/wait_for_intf.sh.j2 | 38 +++++++------- rules/docker-dhcp-relay.mk | 2 +- .../tests/sample_output/wait_for_intf.sh | 50 +++++++++---------- 3 files changed, 43 insertions(+), 47 deletions(-) diff --git a/dockers/docker-dhcp-relay/wait_for_intf.sh.j2 b/dockers/docker-dhcp-relay/wait_for_intf.sh.j2 index 037dc66ead63..23133706cb6c 100644 --- a/dockers/docker-dhcp-relay/wait_for_intf.sh.j2 +++ b/dockers/docker-dhcp-relay/wait_for_intf.sh.j2 @@ -1,42 +1,40 @@ #!/usr/bin/env bash -function wait_until_iface_ready -{ - IFACE=$1 +STATE_DB_IDX="6" - echo "Waiting until interface $IFACE is up..." - - # Wait for the interface to come up (i.e., 'ip link show' returns 0) - until ip link show dev $IFACE up > /dev/null 2>&1; do - sleep 1 - done +PORT_TABLE_PREFIX="PORT_TABLE" +VLAN_TABLE_PREFIX="VLAN_TABLE" +LAG_TABLE_PREFIX="LAG_TABLE" - echo "Interface $IFACE is up" +function wait_until_iface_ready +{ + TABLE_PREFIX=$1 + IFACE=$2 - echo "Waiting until interface $IFACE has an IPv4 address..." + echo "Waiting until interface $IFACE is ready..." - # Wait until the interface gets assigned an IPv4 address + # Wait for the interface to come up + # (i.e., interface is present in STATE_DB and state is "ok") while true; do - IP=$(ip -4 addr show dev $IFACE | grep "inet " | awk '{ print $2 }' | cut -d '/' -f1) - - if [ -n "$IP" ]; then + RESULT=$(redis-cli -n ${STATE_DB_IDX} HGET "${TABLE_PREFIX}|${IFACE}" "state" 2> /dev/null) + if [ x"$RESULT" == x"ok" ]; then break fi sleep 1 done - echo "Interface $IFACE is configured with IP $IP" + echo "Interface ${IFACE} is ready!" } -# Wait for all interfaces to come up and have IPv4 addresses assigned +# Wait for all interfaces to be up and ready {% for (name, prefix) in INTERFACE %} -wait_until_iface_ready {{ name }} +wait_until_iface_ready ${PORT_TABLE_PREFIX} {{ name }} {% endfor %} {% for (name, prefix) in VLAN_INTERFACE %} -wait_until_iface_ready {{ name }} +wait_until_iface_ready ${VLAN_TABLE_PREFIX} {{ name }} {% endfor %} {% for (name, prefix) in PORTCHANNEL_INTERFACE %} -wait_until_iface_ready {{ name }} +wait_until_iface_ready ${LAG_TABLE_PREFIX} {{ name }} {% endfor %} diff --git a/rules/docker-dhcp-relay.mk b/rules/docker-dhcp-relay.mk index f1f19d974a8e..d15bdbf84dc4 100644 --- a/rules/docker-dhcp-relay.mk +++ b/rules/docker-dhcp-relay.mk @@ -6,7 +6,7 @@ DOCKER_DHCP_RELAY_DBG = $(DOCKER_DHCP_RELAY_STEM)-$(DBG_IMAGE_MARK).gz $(DOCKER_DHCP_RELAY)_PATH = $(DOCKERS_PATH)/$(DOCKER_DHCP_RELAY_STEM) -$(DOCKER_DHCP_RELAY)_DEPENDS += $(ISC_DHCP_COMMON) $(ISC_DHCP_RELAY) +$(DOCKER_DHCP_RELAY)_DEPENDS += $(ISC_DHCP_COMMON) $(ISC_DHCP_RELAY) $(REDIS_TOOLS) $(DOCKER_DHCP_RELAY)_DBG_DEPENDS = $($(DOCKER_CONFIG_ENGINE_STRETCH)_DBG_DEPENDS) $(DOCKER_DHCP_RELAY)_DBG_IMAGE_PACKAGES = $($(DOCKER_CONFIG_ENGINE_STRETCH)_DBG_IMAGE_PACKAGES) diff --git a/src/sonic-config-engine/tests/sample_output/wait_for_intf.sh b/src/sonic-config-engine/tests/sample_output/wait_for_intf.sh index 0f06235eb0dc..383f7cb389e9 100644 --- a/src/sonic-config-engine/tests/sample_output/wait_for_intf.sh +++ b/src/sonic-config-engine/tests/sample_output/wait_for_intf.sh @@ -1,43 +1,41 @@ #!/usr/bin/env bash -function wait_until_iface_ready -{ - IFACE=$1 +STATE_DB_IDX="6" - echo "Waiting until interface $IFACE is up..." - - # Wait for the interface to come up (i.e., 'ip link show' returns 0) - until ip link show dev $IFACE up > /dev/null 2>&1; do - sleep 1 - done +PORT_TABLE_PREFIX="PORT_TABLE" +VLAN_TABLE_PREFIX="VLAN_TABLE" +LAG_TABLE_PREFIX="LAG_TABLE" - echo "Interface $IFACE is up" +function wait_until_iface_ready +{ + TABLE_PREFIX=$1 + IFACE=$2 - echo "Waiting until interface $IFACE has an IPv4 address..." + echo "Waiting until interface $IFACE is ready..." - # Wait until the interface gets assigned an IPv4 address + # Wait for the interface to come up + # (i.e., interface is present in STATE_DB and state is "ok") while true; do - IP=$(ip -4 addr show dev $IFACE | grep "inet " | awk '{ print $2 }' | cut -d '/' -f1) - - if [ -n "$IP" ]; then + RESULT=$(redis-cli -n ${STATE_DB_IDX} HGET "${TABLE_PREFIX}|${IFACE}" "state" 2> /dev/null) + if [ x"$RESULT" == x"ok" ]; then break fi sleep 1 done - echo "Interface $IFACE is configured with IP $IP" + echo "Interface ${IFACE} is ready!" } -# Wait for all interfaces to come up and have IPv4 addresses assigned -wait_until_iface_ready Vlan1000 -wait_until_iface_ready PortChannel01 -wait_until_iface_ready PortChannel01 -wait_until_iface_ready PortChannel02 -wait_until_iface_ready PortChannel02 -wait_until_iface_ready PortChannel03 -wait_until_iface_ready PortChannel03 -wait_until_iface_ready PortChannel04 -wait_until_iface_ready PortChannel04 +# Wait for all interfaces to be up and ready +wait_until_iface_ready ${VLAN_TABLE_PREFIX} Vlan1000 +wait_until_iface_ready ${LAG_TABLE_PREFIX} PortChannel01 +wait_until_iface_ready ${LAG_TABLE_PREFIX} PortChannel01 +wait_until_iface_ready ${LAG_TABLE_PREFIX} PortChannel02 +wait_until_iface_ready ${LAG_TABLE_PREFIX} PortChannel02 +wait_until_iface_ready ${LAG_TABLE_PREFIX} PortChannel03 +wait_until_iface_ready ${LAG_TABLE_PREFIX} PortChannel03 +wait_until_iface_ready ${LAG_TABLE_PREFIX} PortChannel04 +wait_until_iface_ready ${LAG_TABLE_PREFIX} PortChannel04 From c61151be51fe58b3219066b1f77487b4a3b72217 Mon Sep 17 00:00:00 2001 From: Joe LeVeque Date: Fri, 5 Apr 2019 21:42:12 +0000 Subject: [PATCH 05/11] Ensure dependent services stop/start/restart with SwSS --- files/build_templates/dhcp_relay.service.j2 | 2 +- files/build_templates/radv.service.j2 | 2 +- files/build_templates/snmp.service.j2 | 3 +++ files/build_templates/teamd.service.j2 | 2 +- 4 files changed, 6 insertions(+), 3 deletions(-) diff --git a/files/build_templates/dhcp_relay.service.j2 b/files/build_templates/dhcp_relay.service.j2 index e1d2779c2c71..2b4270ac5206 100644 --- a/files/build_templates/dhcp_relay.service.j2 +++ b/files/build_templates/dhcp_relay.service.j2 @@ -12,4 +12,4 @@ ExecStart=/usr/bin/{{ docker_container_name }}.sh wait ExecStop=/usr/bin/{{ docker_container_name }}.sh stop [Install] -WantedBy=multi-user.target teamd.service +WantedBy=multi-user.target swss.service teamd.service diff --git a/files/build_templates/radv.service.j2 b/files/build_templates/radv.service.j2 index 989b009e0321..4cf1b4e2b665 100644 --- a/files/build_templates/radv.service.j2 +++ b/files/build_templates/radv.service.j2 @@ -12,4 +12,4 @@ ExecStart=/usr/bin/{{ docker_container_name }}.sh wait ExecStop=/usr/bin/{{ docker_container_name }}.sh stop [Install] -WantedBy=multi-user.target +WantedBy=multi-user.target swss.service diff --git a/files/build_templates/snmp.service.j2 b/files/build_templates/snmp.service.j2 index 416156d5a891..43f46bd2b9c0 100644 --- a/files/build_templates/snmp.service.j2 +++ b/files/build_templates/snmp.service.j2 @@ -9,3 +9,6 @@ Before=ntp-config.service ExecStartPre=/usr/bin/{{docker_container_name}}.sh start ExecStart=/usr/bin/{{docker_container_name}}.sh wait ExecStop=/usr/bin/{{docker_container_name}}.sh stop + +[Install] +WantedBy=multi-user.target swss.service diff --git a/files/build_templates/teamd.service.j2 b/files/build_templates/teamd.service.j2 index 1bea7b7d48be..58c858effb36 100644 --- a/files/build_templates/teamd.service.j2 +++ b/files/build_templates/teamd.service.j2 @@ -11,4 +11,4 @@ ExecStart=/usr/bin/{{docker_container_name}}.sh wait ExecStop=/usr/bin/{{docker_container_name}}.sh stop [Install] -WantedBy=multi-user.target +WantedBy=multi-user.target swss.service From afb1fc6ab6a27f2f0d2dd1e8d6bad8fae0122401 Mon Sep 17 00:00:00 2001 From: Joe LeVeque Date: Mon, 8 Apr 2019 17:43:00 +0000 Subject: [PATCH 06/11] Change 'StartLimitInterval' to 'StartLimitIntervalSec', as Stretch installs systemd 232 (>= v230) --- files/build_templates/swss.service.j2 | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/files/build_templates/swss.service.j2 b/files/build_templates/swss.service.j2 index 45adbe698a13..048c9c34704b 100644 --- a/files/build_templates/swss.service.j2 +++ b/files/build_templates/swss.service.j2 @@ -9,7 +9,7 @@ Requires=nps-modules-4.9.0-8-2-amd64.service After=database.service updategraph.service After=interfaces-config.service Before=ntp-config.service -StartLimitInterval=1200 +StartLimitIntervalSec=1200 StartLimitBurst=3 [Service] From 0e312e2c1c4a35c01cf06b208309bc5c8f3e9e91 Mon Sep 17 00:00:00 2001 From: Joe LeVeque Date: Mon, 8 Apr 2019 18:38:32 +0000 Subject: [PATCH 07/11] Also update journald.conf options --- files/image_config/systemd/journald.conf | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/files/image_config/systemd/journald.conf b/files/image_config/systemd/journald.conf index 0f3d1b5ec25b..214bd099a285 100644 --- a/files/image_config/systemd/journald.conf +++ b/files/image_config/systemd/journald.conf @@ -13,7 +13,7 @@ #Seal=yes #SplitMode=uid #SyncIntervalSec=5m -#RateLimitInterval=30s +#RateLimitIntervalSec=30s #RateLimitBurst=1000 SystemMaxUse=50M #SystemKeepFree= From 7c361a634e07d85666207a1ea97c330cae5bbff2 Mon Sep 17 00:00:00 2001 From: Joe LeVeque Date: Mon, 29 Apr 2019 18:28:09 +0000 Subject: [PATCH 08/11] Remove 'PartOf' option from unit files --- files/build_templates/dhcp_relay.service.j2 | 1 - files/build_templates/radv.service.j2 | 1 - 2 files changed, 2 deletions(-) diff --git a/files/build_templates/dhcp_relay.service.j2 b/files/build_templates/dhcp_relay.service.j2 index 2b4270ac5206..7ec133c87af7 100644 --- a/files/build_templates/dhcp_relay.service.j2 +++ b/files/build_templates/dhcp_relay.service.j2 @@ -1,7 +1,6 @@ [Unit] Description=DHCP relay container Requires=updategraph.service swss.service teamd.service -PartOf=swss.service After=updategraph.service swss.service teamd.service Before=ntp-config.service diff --git a/files/build_templates/radv.service.j2 b/files/build_templates/radv.service.j2 index 4cf1b4e2b665..4f1c67e661b0 100644 --- a/files/build_templates/radv.service.j2 +++ b/files/build_templates/radv.service.j2 @@ -1,7 +1,6 @@ [Unit] Description=Router advertiser container Requires=updategraph.service swss.service -PartOf=swss.service After=updategraph.service swss.service Before=ntp-config.service From b4cf4b42878c5ae37eec808026e5c149511877d6 Mon Sep 17 00:00:00 2001 From: Joe LeVeque Date: Mon, 29 Apr 2019 18:34:03 +0000 Subject: [PATCH 09/11] Add '$(SUPERVISOR_PROC_EXIT_LISTENER_SCRIPT)' to new shared docker-orchagent makefile --- rules/docker-orchagent.mk | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/rules/docker-orchagent.mk b/rules/docker-orchagent.mk index ea637308179a..20536adc2520 100644 --- a/rules/docker-orchagent.mk +++ b/rules/docker-orchagent.mk @@ -34,4 +34,4 @@ $(DOCKER_ORCHAGENT)_RUN_OPT += -v /etc/sonic:/etc/sonic:ro $(DOCKER_ORCHAGENT)_RUN_OPT += -v /var/log/swss:/var/log/swss:rw $(DOCKER_ORCHAGENT)_BASE_IMAGE_FILES += swssloglevel:/usr/bin/swssloglevel -$(DOCKER_ORCHAGENT)_FILES += $(ARP_UPDATE_SCRIPT) +$(DOCKER_ORCHAGENT)_FILES += $(ARP_UPDATE_SCRIPT) $(SUPERVISOR_PROC_EXIT_LISTENER_SCRIPT) From cd690d84fa6e8090feb426dc9824b5563c42d9e1 Mon Sep 17 00:00:00 2001 From: Joe LeVeque Date: Tue, 30 Apr 2019 00:45:08 +0000 Subject: [PATCH 10/11] Make supervisor-proc-exit-listener script read from 'critical_processes' file inside container --- files/scripts/supervisor-proc-exit-listener | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/files/scripts/supervisor-proc-exit-listener b/files/scripts/supervisor-proc-exit-listener index 8e55942321d9..6bc62fc400c8 100755 --- a/files/scripts/supervisor-proc-exit-listener +++ b/files/scripts/supervisor-proc-exit-listener @@ -7,10 +7,15 @@ import syslog from supervisor import childutils -# Process names as defined in supervisor.conf file -critical_processes = ['orchagent'] +# Contents of file should be the names of critical processes (as defined in +# supervisor.conf file), one per line +CRITICAL_PROCESSES_FILE = '/etc/supervisor/critical_processes' def main(): + # Read the list of critical processes from a file + with open(CRITICAL_PROCESSES_FILE, 'r') as f: + critical_processes = [line.rstrip('\n') for line in f] + while True: # Transition from ACKNOWLEDGED to READY childutils.listener.ready() From f3dcce77ffff74a95f16b35cf85948af90e4603b Mon Sep 17 00:00:00 2001 From: Joe LeVeque Date: Tue, 30 Apr 2019 19:51:32 +0000 Subject: [PATCH 11/11] Update critical_processes file for swss container --- dockers/docker-orchagent/critical_processes | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/dockers/docker-orchagent/critical_processes b/dockers/docker-orchagent/critical_processes index d48eb66cda1b..c0f441ecc0ea 100644 --- a/dockers/docker-orchagent/critical_processes +++ b/dockers/docker-orchagent/critical_processes @@ -1,7 +1,11 @@ orchagent portsyncd -intfsyncd neighsyncd vlanmgrd intfmgrd +portmgrd buffermgrd +vrfmgrd +nbrmgrd +vxlanmgrd +intfsyncd