From 172f0c055bd9b24ff9b46bf3aa793313aecdb5c7 Mon Sep 17 00:00:00 2001
From: Vaibhav Hemant Dixit
Date: Thu, 20 Apr 2023 22:33:06 +0000
Subject: [PATCH 1/3] LAG keepalive script to reduce lacp session wait during warm upgrade

---
Review notes (commentary only; placed after the "---" marker and before the
diffstat, so it is not part of the commit message or of any hunk):

 * scripts/fast-reboot starts lag_keepalive.py in the background under
   "timeout 300" and then sleeps 60 seconds before continuing.
 * The script's shebang is python3, but fast-reboot invokes it with "python".
   NOTE(review): confirm "python" resolves to Python 3 on the target image.
 * get_lacpdu_per_lag_member() starts one sniffer thread for each LAG member
   whose LAG has oper_status "up". Each thread calls sniff() with count=1 and
   timeout=30 and stores the result under the member name.
 * NOTE(review): sniff_lacpdu() stores the sniff() result unconditionally.
   If sniff() returns an empty capture on timeout (presumably it does; confirm
   against scapy), the member still gets a dictionary entry, so comparing
   lengths in main() would not notice the missed capture.
 * In this patch main() compares len(active_lag_members) with itself, so the
   "Failed to capture LACPDU packets" branch cannot run. The message also
   calls .keys() on active_lag_members, which is created with list().
   Both lines are changed by patch 2/3.
 * hwsku is read from CONFIG_DB but is not used afterwards in this file.
 * When get_lacpdu_per_lag_member() raises, main() logs the traceback and
   retries immediately; there is no delay between attempts.
 * lag_keepalive() does not return: it sends every stored capture, logs one
   info message, sleeps one second, and repeats.
 * ConfigDBConnector is imported from swsssdk here; patch 3/3 changes that.
 * Indentation in this series was reconstructed after whitespace was lost.
   NOTE(review): verify context lines against the target tree before applying.

 scripts/fast-reboot      |   8 +++
 scripts/lag_keepalive.py | 102 +++++++++++++++++++++++++++++++++++++++
 setup.py                 |   1 +
 3 files changed, 111 insertions(+)
 create mode 100644 scripts/lag_keepalive.py

diff --git a/scripts/fast-reboot b/scripts/fast-reboot
index eea97e792b..99a631046f 100755
--- a/scripts/fast-reboot
+++ b/scripts/fast-reboot
@@ -17,6 +17,7 @@ STRICT=no
 REBOOT_METHOD="/sbin/kexec -e"
 ASSISTANT_IP_LIST=""
 ASSISTANT_SCRIPT="/usr/local/bin/neighbor_advertiser"
+LAG_KEEPALIVE_SCRIPT="/usr/local/bin/lag_keepalive.py"
 WATCHDOG_UTIL="/usr/local/bin/watchdogutil"
 DEVPATH="/usr/share/sonic/device"
 PLATFORM=$(sonic-cfggen -H -v DEVICE_METADATA.localhost.platform)
@@ -682,6 +683,13 @@ fi
 # disable trap-handlers which were set before
 trap '' EXIT HUP INT QUIT TERM KILL ABRT ALRM
 
+# start sending LACPDUs to keep the LAGs refreshed
+# this is a non-blocking call, and the process will die in 300s
+debug "Starting lag_keepalive to send LACPDUs ..."
+timeout 300 python ${LAG_KEEPALIVE_SCRIPT} &
+# give the lag_keepalive script a chance to get ready (30s) and collect one lacpdu before going down (30s)
+sleep 60
+
 if [ -x ${LOG_SSD_HEALTH} ]; then
     debug "Collecting logs to check ssd health before ${REBOOT_TYPE}..."
     ${LOG_SSD_HEALTH}
diff --git a/scripts/lag_keepalive.py b/scripts/lag_keepalive.py
new file mode 100644
index 0000000000..d33cb3580f
--- /dev/null
+++ b/scripts/lag_keepalive.py
@@ -0,0 +1,102 @@
+#!/usr/bin/env python3
+
+from scapy.config import conf
+conf.ipv6_enabled = False
+from scapy.all import sendp, sniff
+from swsssdk import ConfigDBConnector
+import time, threading, traceback
+import syslog
+
+SYSLOG_ID = 'lag_keepalive'
+
+
+def log_info(msg):
+    syslog.openlog(SYSLOG_ID)
+    syslog.syslog(syslog.LOG_INFO, msg)
+    syslog.closelog()
+
+
+def log_error(msg):
+    syslog.openlog(SYSLOG_ID)
+    syslog.syslog(syslog.LOG_ERR, msg)
+    syslog.closelog()
+
+
+def sniff_lacpdu(device_mac, lag_member, lag_member_to_packet):
+    sniffed_packet = sniff(iface=lag_member,
+        filter="ether proto 0x8809 and ether src {}".format(device_mac),
+        count=1, timeout=30)
+    lag_member_to_packet[lag_member] = sniffed_packet
+
+
+def get_lacpdu_per_lag_member():
+    appDB = ConfigDBConnector()
+    appDB.db_connect('APPL_DB')
+    appDB_lag_info = appDB.get_keys('LAG_MEMBER_TABLE')
+    configDB = ConfigDBConnector()
+    configDB.db_connect('CONFIG_DB')
+    device_mac = configDB.get(configDB.CONFIG_DB, "DEVICE_METADATA|localhost", "mac")
+    hwsku = configDB.get(configDB.CONFIG_DB, "DEVICE_METADATA|localhost", "hwsku")
+    active_lag_members = list()
+    lag_member_to_packet = dict()
+    sniffer_threads = list()
+    for lag_entry in appDB_lag_info:
+        lag_name = str(lag_entry[0])
+        oper_status = appDB.get(appDB.APPL_DB,"LAG_TABLE:{}".format(lag_name), "oper_status")
+        if oper_status == "up":
+            # only apply the workaround for active lags
+            lag_member = str(lag_entry[1])
+            active_lag_members.append(lag_member)
+            # use threading to capture lacpdus from several lag members simultaneously
+            sniffer_thread = threading.Thread(target=sniff_lacpdu,
+                args=(device_mac, lag_member, lag_member_to_packet))
+            sniffer_thread.start()
+            sniffer_threads.append(sniffer_thread)
+
+    # sniff for lacpdu should finish in <= 30s. sniff timeout is also set to 30s
+    for sniffer in sniffer_threads:
+        sniffer.join(timeout=30)
+
+    return active_lag_members, lag_member_to_packet
+
+
+def lag_keepalive(lag_member_to_packet):
+    while True:
+        for lag_member, packet in lag_member_to_packet.items():
+            try:
+                sendp(packet, iface=lag_member, verbose=False)
+            except Exception:
+                # log failure and continue to send lacpdu
+                traceback_msg = traceback.format_exc()
+                log_error("Failed to send LACPDU packet from interface {} with error: {}".format(
+                    lag_member, traceback_msg))
+                continue
+        log_info("sent LACPDU packets via {}".format(lag_member_to_packet.keys()))
+        time.sleep(1)
+
+
+def main():
+    while True:
+        try:
+            active_lag_members, lag_member_to_packet = get_lacpdu_per_lag_member()
+            if len(active_lag_members) != len(active_lag_members):
+                log_error("Failed to capture LACPDU packets for some lag members. " +\
+                "Active lag members: {}. LACPDUs captured for: {}".format(
+                    active_lag_members, active_lag_members.keys()))
+
+            log_info("ready to send LACPDU packets via {}".format(lag_member_to_packet.keys()))
+        except Exception:
+            traceback_msg = traceback.format_exc()
+            log_error("Failed to get LAG members and LACPDUs with error: {}".format(
+                traceback_msg))
+            # keep attempting until sniffed packets are ready
+            continue
+        # if no exceptions are thrown, break from loop as LACPDUs are ready to be sent
+        break
+
+    if lag_member_to_packet:
+        # start an infinite loop to keep sending lacpdus from lag member ports
+        lag_keepalive(lag_member_to_packet)
+
+if __name__ == "__main__":
+    main()
diff --git a/setup.py b/setup.py
index a2c851998f..d369e4947c 100644
--- a/setup.py
+++ b/setup.py
@@ -139,6 +139,7 @@
         'scripts/intfutil',
         'scripts/intfstat',
         'scripts/ipintutil',
+        'scripts/lag_keepalive.py',
         'scripts/lldpshow',
         'scripts/log_ssd_health',
         'scripts/mellanox_buffer_migrator.py',

From 882cb1225731c99bf66cf85b4e04445787214339 Mon Sep 17 00:00:00 2001
From: Vaibhav Hemant Dixit
Date: Wed, 26 Apr 2023 23:09:42 +0000
Subject: [PATCH 2/3] Fix the check for missed lags and corresponding error

---
Review notes (commentary only, not part of the commit message):

 * After this patch main() compares the number of active LAG members with
   the number of keys in lag_member_to_packet, and the error message reports
   lag_member_to_packet.keys() instead of calling .keys() on the list.
 * NOTE(review): the comparison counts dictionary entries only. Whether an
   entry holds a usable capture is not checked here; see the sniff_lacpdu()
   note on patch 1/3.

 scripts/lag_keepalive.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/scripts/lag_keepalive.py b/scripts/lag_keepalive.py
index d33cb3580f..80575d6e6a 100644
--- a/scripts/lag_keepalive.py
+++ b/scripts/lag_keepalive.py
@@ -79,10 +79,10 @@ def main():
     while True:
         try:
             active_lag_members, lag_member_to_packet = get_lacpdu_per_lag_member()
-            if len(active_lag_members) != len(active_lag_members):
+            if len(active_lag_members) != len(lag_member_to_packet.keys()):
                 log_error("Failed to capture LACPDU packets for some lag members. " +\
                 "Active lag members: {}. LACPDUs captured for: {}".format(
-                    active_lag_members, active_lag_members.keys()))
+                    active_lag_members, lag_member_to_packet.keys()))
 
             log_info("ready to send LACPDU packets via {}".format(lag_member_to_packet.keys()))
         except Exception:

From eee54155779765b9ed7a778a0aa5058cc30d2d88 Mon Sep 17 00:00:00 2001
From: Vaibhav Hemant Dixit
Date: Thu, 4 May 2023 05:55:56 +0000
Subject: [PATCH 3/3] Fix incorrect import

---
Review notes (commentary only, not part of the commit message):

 * Changes the ConfigDBConnector import from swsssdk to
   swsscommon.swsscommon. No other line of the script is touched.
 * NOTE(review): get_lacpdu_per_lag_member() calls db_connect(), get_keys()
   and get() and reads the APPL_DB / CONFIG_DB attributes on the connector;
   confirm the swsscommon class offers the same calls.

 scripts/lag_keepalive.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/scripts/lag_keepalive.py b/scripts/lag_keepalive.py
index 80575d6e6a..04ac01a074 100644
--- a/scripts/lag_keepalive.py
+++ b/scripts/lag_keepalive.py
@@ -3,7 +3,7 @@
 from scapy.config import conf
 conf.ipv6_enabled = False
 from scapy.all import sendp, sniff
-from swsssdk import ConfigDBConnector
+from swsscommon.swsscommon import ConfigDBConnector
 import time, threading, traceback
 import syslog
 