Skip to content

Commit

Permalink
[chassis][midplane] Modify the chassisd to log expected/unexpected mi…
Browse files Browse the repository at this point in the history
…dplane connectivity messages (sonic-net#480)

* [chassis][midplane] Modify the chassisd to log expected/unexpected midplane connectivity messages

Add a mechanism to read the linecard_reboot_timeout value from the platform_env.conf file.
This gives each platform the capability to use its own timeout value.

* Add a unit test for linecard reboot

---------

Signed-off-by: mlok <marty.lok@nokia.com>
  • Loading branch information
mlok-nokia authored May 15, 2024
1 parent 9d0c550 commit 88bf8ec
Show file tree
Hide file tree
Showing 2 changed files with 176 additions and 2 deletions.
54 changes: 52 additions & 2 deletions sonic-chassisd/scripts/chassisd
Original file line number Diff line number Diff line change
Expand Up @@ -69,6 +69,12 @@ CHASSIS_MIDPLANE_INFO_ACCESS_FIELD = 'access'
CHASSIS_MODULE_HOSTNAME_TABLE = 'CHASSIS_MODULE_TABLE'
CHASSIS_MODULE_INFO_HOSTNAME_FIELD = 'hostname'

# Table in chassis_state_db, keyed by module key, that tracks linecard reboots
CHASSIS_MODULE_REBOOT_INFO_TABLE = 'CHASSIS_MODULE_REBOOT_INFO_TABLE'
# Field holding the time.time() value written by module_reboot_set_time()
CHASSIS_MODULE_REBOOT_TIMESTAMP_FIELD = 'timestamp'
# Field whose value "expected" marks a module reboot as expected
CHASSIS_MODULE_REBOOT_REBOOT_FIELD = 'reboot'
# Seconds to wait for midplane connectivity to return; used unless
# platform_env.conf supplies a linecard_reboot_timeout value
DEFAULT_LINECARD_REBOOT_TIMEOUT = 180
# Platform file that may carry a "linecard_reboot_timeout=<seconds>" line
PLATFORM_ENV_CONF_FILE = "/usr/share/sonic/platform/platform_env.conf"

CHASSIS_INFO_UPDATE_PERIOD_SECS = 10
CHASSIS_DB_CLEANUP_MODULE_DOWN_PERIOD = 30 # Minutes

Expand Down Expand Up @@ -198,9 +204,18 @@ class ModuleUpdater(logger.Logger):
CHASSIS_ASIC_INFO_TABLE)

self.hostname_table = swsscommon.Table(self.chassis_state_db, CHASSIS_MODULE_HOSTNAME_TABLE)
self.module_reboot_table = swsscommon.Table(self.chassis_state_db, CHASSIS_MODULE_REBOOT_INFO_TABLE)
self.down_modules = {}
self.chassis_app_db_clean_sha = None

self.linecard_reboot_timeout = DEFAULT_LINECARD_REBOOT_TIMEOUT
if os.path.isfile(PLATFORM_ENV_CONF_FILE):
with open(PLATFORM_ENV_CONF_FILE, 'r') as file:
for line in file:
field = line.split('=')[0].strip()
if field == "linecard_reboot_timeout":
self.linecard_reboot_timeout = int(line.split('=')[1].strip())

self.midplane_initialized = try_get(chassis.init_midplane_switch, default=False)
if not self.midplane_initialized:
self.log_error("Chassisd midplane intialization failed")
Expand Down Expand Up @@ -362,6 +377,31 @@ class ModuleUpdater(logger.Logger):
else:
return False

def is_module_reboot_expected(self, key):
    """Tell whether the reboot-info entry for ``key`` marks the reboot as expected.

    Args:
        key: Key of the module in the module reboot-info table.

    Returns:
        True when an entry exists for ``key`` and its reboot field equals
        "expected"; False otherwise, including when the entry exists but
        carries no reboot field.
    """
    fvs = self.module_reboot_table.get(key)
    # Only a list whose first element is True is treated as an existing entry.
    if not (isinstance(fvs, list) and fvs[0] is True):
        return False
    fields = dict(fvs[-1])
    # Use .get(): an entry may hold only a timestamp, and a missing reboot
    # field must read as "not expected" rather than raise KeyError.
    return fields.get(CHASSIS_MODULE_REBOOT_REBOOT_FIELD) == "expected"

def module_reboot_set_time(self, key):
    """Write the current time.time() value into the reboot-info entry for ``key``.

    Args:
        key: Key of the module in the module reboot-info table.
    """
    stamp = str(time.time())
    timestamp_fvs = swsscommon.FieldValuePairs(
        [(CHASSIS_MODULE_REBOOT_TIMESTAMP_FIELD, stamp)]
    )
    self.module_reboot_table.set(key, timestamp_fvs)

def is_module_reboot_system_up_expired(self, key):
    """Check whether the recorded reboot timestamp for ``key`` has timed out.

    Args:
        key: Key of the module in the module reboot-info table.

    Returns:
        True when an entry with a timestamp exists and at least
        ``self.linecard_reboot_timeout`` seconds have passed since it; the
        entry is deleted in that case. False in every other situation.
    """
    entry = self.module_reboot_table.get(key)
    # Only a list whose first element is True is treated as an existing entry.
    if not isinstance(entry, list) or entry[0] is not True:
        return False

    fields = dict(entry[-1])
    if CHASSIS_MODULE_REBOOT_TIMESTAMP_FIELD not in fields:
        return False

    recorded_at = float(fields[CHASSIS_MODULE_REBOOT_TIMESTAMP_FIELD])
    elapsed = time.time() - recorded_at
    if elapsed >= self.linecard_reboot_timeout:
        # Drop the entry so the same timeout is not reported again.
        self.module_reboot_table._del(key)
        return True
    return False

def check_midplane_reachability(self):
if not self.midplane_initialized:
return
Expand Down Expand Up @@ -395,10 +435,20 @@ class ModuleUpdater(logger.Logger):
current_midplane_state = fvs[CHASSIS_MIDPLANE_INFO_ACCESS_FIELD]

if midplane_access is False and current_midplane_state == 'True':
self.log_warning("Module {} lost midplane connectivity".format(module_key))
if self.is_module_reboot_expected(module_key):
self.module_reboot_set_time(module_key)
self.log_warning("Expected: Module {} lost midplane connectivity".format(module_key))
else:
self.log_warning("Unexpected: Module {} lost midplane connectivity".format(module_key))
elif midplane_access is True and current_midplane_state == 'False':
self.log_notice("Module {} midplane connectivity is up".format(module_key))

# clean up the reboot_info_table
if self.module_reboot_table.get(module_key) is not None:
self.module_reboot_table._del(module_key)
elif midplane_access is False and current_midplane_state == 'False':
if self.is_module_reboot_system_up_expired(module_key):
self.log_warning("Unexpected: Module {} midplane connectivity is not restored in {} seconds".format(module_key, self.linecard_reboot_timeout))

# Update db with midplane information
fvs = swsscommon.FieldValuePairs([(CHASSIS_MIDPLANE_INFO_IP_FIELD, midplane_ip),
(CHASSIS_MIDPLANE_INFO_ACCESS_FIELD, str(midplane_access))])
Expand Down
124 changes: 124 additions & 0 deletions sonic-chassisd/tests/test_chassisd.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
import os
import sys
import mock
from imp import load_source

from mock import Mock, MagicMock, patch
Expand Down Expand Up @@ -40,6 +41,10 @@
CHASSIS_ASIC_PCI_ADDRESS_FIELD = 'asic_pci_address'
CHASSIS_ASIC_ID_IN_MODULE_FIELD = 'asic_id_in_module'

# Same values as the identically named constants in the chassisd script:
# the reboot-info field names and the platform env file path.
CHASSIS_MODULE_REBOOT_TIMESTAMP_FIELD = 'timestamp'
CHASSIS_MODULE_REBOOT_REBOOT_FIELD = 'reboot'
PLATFORM_ENV_CONF_FILE = "/usr/share/sonic/platform/platform_env.conf"

def setup_function():
ModuleUpdater.log_notice = MagicMock()
ModuleUpdater.log_warning = MagicMock()
Expand Down Expand Up @@ -357,6 +362,125 @@ def test_midplane_presence_modules():
fvs = midplane_table.get(name)
assert fvs == None

builtin_open = open  # handle on the real open(), captured before any patching


def mock_open(*args, **kwargs):
    """Stand-in for ``open`` that fakes only the platform env conf file.

    Opening PLATFORM_ENV_CONF_FILE yields a mock file whose content sets
    ``linecard_reboot_timeout`` to 240; every other path is passed through
    to the real ``open``.
    """
    requested_path = args[0]
    if requested_path != PLATFORM_ENV_CONF_FILE:
        # Not the conf file: delegate to the unpatched open().
        return builtin_open(*args, **kwargs)
    fake_conf_opener = mock.mock_open(read_data="dummy=1\nlinecard_reboot_timeout=240\n")
    return fake_conf_opener(*args, **kwargs)

@patch("builtins.open", mock_open)
@patch('os.path.isfile', MagicMock(return_value=True))
def test_midplane_presence_modules_linecard_reboot():
    """Drive midplane reachability checks through a linecard reboot cycle.

    Builds a chassis with a supervisor, one linecard and one fabric card,
    toggles the linecard's midplane reachability while marking its reboot as
    expected, and after every check verifies the linecard's midplane table
    entry. Finally confirms that the timeout came from the mocked conf file.
    """
    chassis = MockChassis()

    # Supervisor
    supervisor = MockModule(0, "SUPERVISOR0", "Supervisor card",
                            ModuleBase.MODULE_TYPE_SUPERVISOR, 16, "RP1000101")
    supervisor.set_midplane_ip()
    chassis.module_list.append(supervisor)

    # Linecard
    name = "LINE-CARD0"
    module = MockModule(1, name, "36 port 400G card",
                        ModuleBase.MODULE_TYPE_LINE, 1, "LC1000101")
    module.set_midplane_ip()
    chassis.module_list.append(module)

    # Fabric card (no midplane IP is set for it)
    fabric_slot = 17
    fabric = MockModule(1, "FABRIC-CARD0", "Switch fabric card",
                        ModuleBase.MODULE_TYPE_FABRIC, fabric_slot, "FC1000101")
    chassis.module_list.append(fabric)

    # Run on supervisor: the slot attributes are overridden right after
    # construction so the updater uses the supervisor's slot.
    module_updater = ModuleUpdater(SYSLOG_IDENTIFIER, chassis, fabric_slot,
                                   module.supervisor_slot)
    module_updater.supervisor_slot = supervisor.get_slot()
    module_updater.my_slot = supervisor.get_slot()
    module_updater.modules_num_update()
    module_updater.module_db_update()
    module_updater.check_midplane_reachability()

    midplane_table = module_updater.midplane_table
    # Check only one entry in database
    assert 1 == midplane_table.size()

    def verify_linecard_entry():
        # The linecard's midplane entry must exist and match the mock module.
        entry = midplane_table.get(name)
        assert entry is not None
        if isinstance(entry, list):
            entry = dict(entry[-1])
        assert module.get_midplane_ip() == entry[CHASSIS_MIDPLANE_INFO_IP_FIELD]
        assert str(module.is_midplane_reachable()) == entry[CHASSIS_MIDPLANE_INFO_ACCESS_FIELD]

    # Check fields in database
    verify_linecard_entry()

    # Set access of line-card to Up (midplane connectivity is down initially)
    module.set_midplane_reachable(True)
    module_updater.check_midplane_reachability()
    verify_linecard_entry()

    # Set access of line-card to Down with the reboot marked as expected
    module.set_midplane_reachable(False)
    module_reboot_table = module_updater.module_reboot_table
    module_reboot_table.set(name, swsscommon.FieldValuePairs([("reboot", "expected")]))
    module_updater.check_midplane_reachability()
    verify_linecard_entry()

    # Set access of line-card back to Up before the timeout
    module.set_midplane_reachable(True)
    module_updater.check_midplane_reachability()
    verify_linecard_entry()

    # Timeout case: go Down again with the reboot marked as expected ...
    module.set_midplane_reachable(False)
    module_reboot_table.set(name, swsscommon.FieldValuePairs([("reboot", "expected")]))
    module_updater.check_midplane_reachability()
    # ... then backdate the timestamp by the full timeout before re-checking.
    backdated = time.time() - module_updater.linecard_reboot_timeout
    module_reboot_table.set(
        name,
        swsscommon.FieldValuePairs([(CHASSIS_MODULE_REBOOT_TIMESTAMP_FIELD, str(backdated))]),
    )
    module_updater.check_midplane_reachability()
    verify_linecard_entry()

    # The value must come from the mocked platform_env.conf content.
    assert module_updater.linecard_reboot_timeout == 240

def test_midplane_presence_supervisor():
chassis = MockChassis()

Expand Down

0 comments on commit 88bf8ec

Please sign in to comment.