diff --git a/sonic-thermalctld/scripts/thermalctld b/sonic-thermalctld/scripts/thermalctld index fa5da2654880..69d188abd28a 100644 --- a/sonic-thermalctld/scripts/thermalctld +++ b/sonic-thermalctld/scripts/thermalctld @@ -64,27 +64,61 @@ def log_on_status_changed(normal_status, normal_log, abnormal_log): class FanStatus(object): - def __init__(self): + absence_fan_count = 0 + fault_fan_count = 0 + update_led_color = True + + def __init__(self, fan=None, is_psu_fan=False): """ Constructor of FanStatus """ + self.fan = fan + self.is_psu_fan = is_psu_fan self.presence = True + self.status = True self.under_speed = False self.over_speed = False self.invalid_direction = False + @classmethod + def get_bad_fan_count(cls): + return cls.absence_fan_count + cls.fault_fan_count + + @classmethod + def reset_fan_counter(cls): + cls.absence_fan_count = 0 + cls.fault_fan_count = 0 + def set_presence(self, presence): """ Set and cache Fan presence status :param presence: Fan presence status :return: True if status changed else False """ + if not presence and not self.is_psu_fan: + FanStatus.absence_fan_count += 1 + if presence == self.presence: return False self.presence = presence return True + def set_fault_status(self, status): + """ + Set and cache Fan fault status + :param status: Fan fault status, False indicate Fault + :return: True if status changed else False + """ + if not status: + FanStatus.fault_fan_count += 1 + + if status == self.status: + return False + + self.status = status + return True + def _check_speed_value_available(self, speed, target_speed, tolerance, current_status): if speed == NOT_AVAILABLE or target_speed == NOT_AVAILABLE or tolerance == NOT_AVAILABLE: if tolerance > 100 or tolerance < 0: @@ -142,7 +176,11 @@ class FanStatus(object): Indicate the Fan works as expect :return: True if Fan works normal else False """ - return self.presence and not self.under_speed and not self.over_speed and not self.invalid_direction + return self.presence and \ + self.status and \ + not self.under_speed and \ + not self.over_speed and \ + not self.invalid_direction # @@ -176,33 +214,51 @@ class FanUpdater(object): :return: """ logger.log_debug("Start fan updating") - for index, fan in enumerate(self.chassis.get_all_fans()): - try: - self._refresh_fan_status(fan, index) - except Exception as e: - logger.log_warning('Failed to update FAN status - {}'.format(e)) + old_bad_fan_count = FanStatus.get_bad_fan_count() + FanStatus.reset_fan_counter() + + fan_index = 0 + for drawer in self.chassis.get_all_fan_drawers(): + for fan in drawer.get_all_fans(): + try: + self._refresh_fan_status(drawer, fan, fan_index) + except Exception as e: + logger.log_warning('Failed to update FAN status - {}'.format(e)) + fan_index += 1 for psu_index, psu in enumerate(self.chassis.get_all_psus()): psu_name = try_get(psu.get_name, 'PSU {}'.format(psu_index)) for fan_index, fan in enumerate(psu.get_all_fans()): try: - self._refresh_fan_status(fan, fan_index, '{} FAN'.format(psu_name)) + self._refresh_fan_status(None, fan, fan_index, '{} FAN'.format(psu_name), True) except Exception as e: logger.log_warning('Failed to update PSU FAN status - {}'.format(e)) + self._update_led_color() + + bad_fan_count = FanStatus.get_bad_fan_count() + if bad_fan_count > 0 and old_bad_fan_count != bad_fan_count: + logger.log_warning("Insufficient number of working fans warning: {} fans are not working.".format( + bad_fan_count + )) + elif old_bad_fan_count > 0 and bad_fan_count == 0: + logger.log_notice("Insufficient number of working fans warning cleared: all fans are back to normal.") + logger.log_debug("End fan updating") - def _refresh_fan_status(self, fan, index, name_prefix='FAN'): + def _refresh_fan_status(self, fan_drawer, fan, index, name_prefix='FAN', is_psu_fan=False): """ Get Fan status by platform API and write to database for a given Fan + :param fan_drawer: Object representing a platform Fan drawer :param fan: Object representing a platform Fan :param index: Index of the Fan object in the platform :param name_prefix: name prefix of Fan object if Fan.get_name not presented :return: """ + drawer_name = NOT_AVAILABLE if is_psu_fan else str(try_get(fan_drawer.get_name)) fan_name = try_get(fan.get_name, '{} {}'.format(name_prefix, index + 1)) if fan_name not in self.fan_status_dict: - self.fan_status_dict[fan_name] = FanStatus() + self.fan_status_dict[fan_name] = FanStatus(fan, is_psu_fan) fan_status = self.fan_status_dict[fan_name] @@ -228,29 +284,39 @@ class FanUpdater(object): 'the system, potential overheat hazard'.format(fan_name) ) + if presence and fan_status.set_fault_status(fan_fault_status): + set_led = True + log_on_status_changed(fan_status.status, + 'Fan fault warning cleared: {} is back to normal.'.format(fan_name), + 'Fan fault warning: {} is broken.'.format(fan_name) + ) + if presence and fan_status.set_under_speed(speed, speed_target, speed_tolerance): set_led = True log_on_status_changed(not fan_status.under_speed, - 'Fan under speed warning cleared: {} speed back to normal.'.format(fan_name), - 'Fan under speed warning: {} current speed={}, target speed={}, tolerance={}.'. + 'Fan low speed warning cleared: {} speed is back to normal.'.format(fan_name), + 'Fan low speed warning: {} current speed={}, target speed={}, tolerance={}.'. format(fan_name, speed, speed_target, speed_tolerance) ) if presence and fan_status.set_over_speed(speed, speed_target, speed_tolerance): set_led = True log_on_status_changed(not fan_status.over_speed, - 'Fan over speed warning cleared: {} speed back to normal.'.format(fan_name), - 'Fan over speed warning: {} target speed={}, current speed={}, tolerance={}.'. + 'Fan high speed warning cleared: {} speed is back to normal.'.format(fan_name), + 'Fan high speed warning: {} target speed={}, current speed={}, tolerance={}.'. format(fan_name, speed_target, speed, speed_tolerance) ) # TODO: handle invalid fan direction - if set_led: - self._set_fan_led(fan, fan_name, fan_status) + # We don't set PSU led here, PSU led will be handled in psud + if set_led and not is_psu_fan: + self._set_fan_led(fan_drawer, fan, fan_name, fan_status) + FanStatus.update_led_color = True fvs = swsscommon.FieldValuePairs( [('presence', str(presence)), + ('drawer_name', drawer_name), ('model', str(try_get(fan.get_model))), ('serial', str(try_get(fan.get_serial))), ('status', str(fan_fault_status)), @@ -258,15 +324,15 @@ class FanUpdater(object): ('speed', str(speed)), ('speed_tolerance', str(speed_tolerance)), ('speed_target', str(speed_target)), - ('led_status', str(try_get(fan.get_status_led))), ('timestamp', datetime.now().strftime('%Y%m%d %H:%M:%S')) ]) self.table.set(fan_name, fvs) - def _set_fan_led(self, fan, fan_name, fan_status): + def _set_fan_led(self, fan_drawer, fan, fan_name, fan_status): """ Set fan led according to current status + :param fan_drawer: Object representing a platform Fan drawer or PSU :param fan: Object representing a platform Fan :param fan_name: Name of the Fan object in case any vendor not implement Fan.get_name :param fan_status: Object representing the FanStatus @@ -275,13 +341,30 @@ class FanUpdater(object): try: if fan_status.is_ok(): fan.set_status_led(fan.STATUS_LED_COLOR_GREEN) + fan_drawer.set_status_led(fan.STATUS_LED_COLOR_GREEN) else: # TODO: wait for Kebo to define the mapping of fan status to led color, # just set it to red so far fan.set_status_led(fan.STATUS_LED_COLOR_RED) + fan_drawer.set_status_led(fan.STATUS_LED_COLOR_RED) except NotImplementedError as e: logger.log_warning('Failed to set led to fan, set_status_led not implemented') + def _update_led_color(self): + if FanStatus.update_led_color: + for fan_name, fan_status in self.fan_status_dict.items(): + try: + fvs = swsscommon.FieldValuePairs([ + ('led_status', str(try_get(fan_status.fan.get_status_led))) + ]) + except Exception as e: + logger.log_warning('Failed to get led status for fan') + fvs = swsscommon.FieldValuePairs([ + ('led_status', NOT_AVAILABLE) + ]) + self.table.set(fan_name, fvs) + FanStatus.update_led_color = False + class TemperatureStatus(object): TEMPERATURE_DIFF_THRESHOLD = 10 diff --git a/sonic-thermalctld/tests/mock_platform.py b/sonic-thermalctld/tests/mock_platform.py index df2c6518105c..b6fae1eaf0d4 100644 --- a/sonic-thermalctld/tests/mock_platform.py +++ b/sonic-thermalctld/tests/mock_platform.py @@ -81,6 +81,25 @@ def get_all_fans(self): return self.fan_list +class MockFanDrawer(MockDevice): + def __init__(self): + self.name = 'FanDrawer' + self.fan_list = [] + self.led_status = 'red' + + def get_name(self): + return self.name + + def get_all_fans(self): + return self.fan_list + + def get_status_led(self): + return self.led_status + + def set_status_led(self, value): + self.led_status = value + + class MockThermal: def __init__(self): self.name = None @@ -134,6 +153,7 @@ def __init__(self): self.fan_list = [] self.psu_list = [] self.thermal_list = [] + self.fan_drawer_list = [] def get_all_fans(self): return self.fan_list @@ -144,24 +164,47 @@ def get_all_psus(self): def get_all_thermals(self): return self.thermal_list + def get_all_fan_drawers(self): + return self.fan_drawer_list + def make_absence_fan(self): fan = MockFan() fan.presence = False + fan_drawer = MockFanDrawer() + fan_drawer.fan_list.append(fan) + self.fan_list.append(fan) + self.fan_drawer_list.append(fan_drawer) + + def make_fault_fan(self): + fan = MockFan() + fan.status = False + fan_drawer = MockFanDrawer() + fan_drawer.fan_list.append(fan) self.fan_list.append(fan) + self.fan_drawer_list.append(fan_drawer) def make_under_speed_fan(self): fan = MockFan() fan.make_under_speed() + fan_drawer = MockFanDrawer() + fan_drawer.fan_list.append(fan) self.fan_list.append(fan) + self.fan_drawer_list.append(fan_drawer) def make_over_speed_fan(self): fan = MockFan() fan.make_over_speed() + fan_drawer = MockFanDrawer() + fan_drawer.fan_list.append(fan) self.fan_list.append(fan) + self.fan_drawer_list.append(fan_drawer) def make_error_fan(self): fan = MockErrorFan() + fan_drawer = MockFanDrawer() + fan_drawer.fan_list.append(fan) self.fan_list.append(fan) + self.fan_drawer_list.append(fan_drawer) def make_over_temper_thermal(self): thermal = MockThermal() diff --git a/sonic-thermalctld/tests/test_thermalctld.py b/sonic-thermalctld/tests/test_thermalctld.py index f2f0b05bd24f..9a3db6ba39eb 100644 --- a/sonic-thermalctld/tests/test_thermalctld.py +++ b/sonic-thermalctld/tests/test_thermalctld.py @@ -95,12 +95,27 @@ def test_fanupdater_fan_absence(): fan_updater.update() fan_list = chassis.get_all_fans() assert fan_list[0].get_status_led() == MockFan.STATUS_LED_COLOR_RED - logger.log_warning.assert_called_once() + logger.log_warning.assert_called() fan_list[0].presence = True fan_updater.update() assert fan_list[0].get_status_led() == MockFan.STATUS_LED_COLOR_GREEN - logger.log_notice.assert_called_once() + logger.log_notice.assert_called() + + +def test_fanupdater_fan_fault(): + chassis = MockChassis() + chassis.make_fault_fan() + fan_updater = FanUpdater(chassis) + fan_updater.update() + fan_list = chassis.get_all_fans() + assert fan_list[0].get_status_led() == MockFan.STATUS_LED_COLOR_RED + logger.log_warning.assert_called() + + fan_list[0].status = True + fan_updater.update() + assert fan_list[0].get_status_led() == MockFan.STATUS_LED_COLOR_GREEN + logger.log_notice.assert_called() def test_fanupdater_fan_under_speed(): @@ -133,6 +148,35 @@ def test_fanupdater_fan_over_speed(): logger.log_notice.assert_called_once() +def test_insufficient_fan_number(): + fan_status1 = FanStatus() + fan_status2 = FanStatus() + fan_status1.set_presence(False) + fan_status2.set_fault_status(False) + assert FanStatus.get_bad_fan_count() == 2 + FanStatus.reset_fan_counter() + assert FanStatus.get_bad_fan_count() == 0 + + chassis = MockChassis() + chassis.make_absence_fan() + chassis.make_fault_fan() + fan_updater = FanUpdater(chassis) + fan_updater.update() + assert logger.log_warning.call_count == 3 + logger.log_warning.assert_called_with('Insufficient number of working fans warning: 2 fans are not working.') + + fan_list = chassis.get_all_fans() + fan_list[0].presence = True + fan_updater.update() + assert logger.log_notice.call_count == 1 + logger.log_warning.assert_called_with('Insufficient number of working fans warning: 1 fans are not working.') + + fan_list[1].status = True + fan_updater.update() + assert logger.log_notice.call_count == 3 + logger.log_notice.assert_called_with('Insufficient number of working fans warning cleared: all fans are back to normal.') + + def test_temperature_status_set_over_temper(): temperatue_status = TemperatureStatus() ret = temperatue_status.set_over_temperature(NOT_AVAILABLE, NOT_AVAILABLE)