From 9933dcd72252f44b3f8422a71350469720a27e57 Mon Sep 17 00:00:00 2001
From: Stephen Sun <5379172+stephenxs@users.noreply.github.com>
Date: Thu, 8 Sep 2022 13:47:27 +0800
Subject: [PATCH] [Mellanox] Support PSU power threshold checking (#6288)

What is the motivation for this PR?
Implement a regression test case for the PSU power threshold exceeding check.
1. It can only be done on a per vendor/platform basis, because the way to trigger PSU power threshold exceeding varies among vendors and platforms.
2. Support parsing the WARNING state in the output of "show platform psustatus".

How did you do it?
We implement the test as a mock test because:
* It is not possible to meet the conditions that make the PSU power exceed its threshold on any regression testbed (doing so requires 100% throughput utilization and high power-consumption xSFP modules).
* PSU power threshold exceeding checking is not supported on all testbeds. To run the test on a testbed which does not physically support it, we also need to mock it.

How did you verify/test it?
Manually tested and ran the regression test.

Any platform specific information?
Mellanox platforms only.

Signed-off-by: Stephen Sun --- tests/common/platform/device_utils.py | 4 +- tests/platform_tests/mellanox/conftest.py | 17 + .../mellanox_thermal_control_test_helper.py | 95 +++++- .../mellanox/test_psu_power_threshold.py | 302 ++++++++++++++++++ 4 files changed, 409 insertions(+), 9 deletions(-) create mode 100644 tests/platform_tests/mellanox/test_psu_power_threshold.py diff --git a/tests/common/platform/device_utils.py b/tests/common/platform/device_utils.py index f9c01a3f1d..85ab5c4967 100644 --- a/tests/common/platform/device_utils.py +++ b/tests/common/platform/device_utils.py @@ -31,7 +31,7 @@ def get_dut_psu_line_pattern(dut): if "201811" in dut.os_version or "201911" in dut.os_version: psu_line_pattern = re.compile(r"PSU\s+(\d)+\s+(OK|NOT OK|NOT PRESENT)") elif dut.facts['platform'] == "x86_64-dellemc_z9332f_d1508-r0" or dut.facts['asic_type'] == "cisco-8000": - psu_line_pattern = re.compile(r"PSU\s+(\d+).*?(OK|NOT OK|NOT PRESENT)\s+(N/A)") + psu_line_pattern = re.compile(r"PSU\s+(\d+).*?(OK|NOT OK|NOT PRESENT|WARNING)\s+(N/A)") else: """ Changed the pattern to match space (s+) and non-space (S+) only. 
@@ -45,7 +45,7 @@ def get_dut_psu_line_pattern(dut): PSU 2 N/A N/A 12.01 4.12 49.50 OK green """ - psu_line_pattern = re.compile(r"PSU\s+(\d+).*?(OK|NOT OK|NOT PRESENT)\s+(green|amber|red|off)") + psu_line_pattern = re.compile(r"PSU\s+(\d+).*?(OK|NOT OK|NOT PRESENT|WARNING)\s+(green|amber|red|off)") return psu_line_pattern diff --git a/tests/platform_tests/mellanox/conftest.py b/tests/platform_tests/mellanox/conftest.py index e69de29bb2..1b7d46186c 100644 --- a/tests/platform_tests/mellanox/conftest.py +++ b/tests/platform_tests/mellanox/conftest.py @@ -0,0 +1,17 @@ +def pytest_addoption(parser): + ''' + Adds option to Mellanox specific pytest + + Args: + parser: pytest parser object + + Returns: + None + ''' + mellanox_group = parser.getgroup("Mellanox test suite options") + + mellanox_group.addoption( + "--mock_any_testbed", + action="store_true", + help="Mock on testbeds which do not support PSU power thresholds", + ) diff --git a/tests/platform_tests/mellanox/mellanox_thermal_control_test_helper.py b/tests/platform_tests/mellanox/mellanox_thermal_control_test_helper.py index 993f49318b..ed2d79abc3 100644 --- a/tests/platform_tests/mellanox/mellanox_thermal_control_test_helper.py +++ b/tests/platform_tests/mellanox/mellanox_thermal_control_test_helper.py @@ -114,6 +114,12 @@ class MockerHelper: # LED related sys fs folder path. LED_PATH = '/var/run/hw-management/led/' + # Config path + CONFIG_PATH = '/var/run/hw-management/config/' + + # Power path + POWER_PATH = '/var/run/hw-management/power/' + # FAN number of DUT. FAN_NUM = 0 @@ -190,21 +196,26 @@ def mock_led_value(self, file_path, value): file_path = os.path.join(MockerHelper.LED_PATH, file_path) self.mock_value(file_path, value) - def mock_value(self, file_path, value): + def mock_value(self, file_path, value, force=False): """ Unlink existing sys fs file and replace it with a new one. Write given value to the new file. :param file_path: Sys fs file path. :param value: Value to write to sys fs file. 
+ :param force: Force mock even if the file does not exist. :return: """ if file_path not in self.regular_file_list and file_path not in self.unlink_file_list: out = self.dut.stat(path=file_path) + exist = True if not out['stat']['exists']: - raise SysfsNotExistError('{} not exist'.format(file_path)) - if out['stat']['islnk']: + if force: + exist = False + else: + raise SysfsNotExistError('{} not exist'.format(file_path)) + if exist and out['stat']['islnk']: self._unlink(file_path) else: - self._cache_file_value(file_path) + self._cache_file_value(file_path, force) self.dut.shell('echo \'{}\' > {}'.format(value, file_path)) def read_thermal_value(self, file_path): @@ -241,7 +252,7 @@ def read_value(self, file_path): except Exception as e: assert 0, "Get content from %s failed, exception: %s" % (file_path, repr(e)) - def _cache_file_value(self, file_path): + def _cache_file_value(self, file_path, may_nexist=False): """ Cache file value for regular file. :param file_path: Regular file path. 
@@ -252,7 +263,10 @@ def _cache_file_value(self, file_path): value = output["stdout"] self.regular_file_list[file_path] = value.strip() except Exception as e: - assert 0, "Get content from %s failed, exception: %s" % (file_path, repr(e)) + if may_nexist: + self.regular_file_list[file_path] = None + else: + assert 0, "Get content from %s failed, exception: %s" % (file_path, repr(e)) def _unlink(self, file_path): """ @@ -282,7 +296,10 @@ def deinit(self): failed_recover_files = {} for file_path, value in self.regular_file_list.items(): try: - self.dut.shell('echo \'{}\' > {}'.format(value, file_path)) + if value is None: + self.dut.shell('rm -f {}'.format(file_path)) + else: + self.dut.shell('echo \'{}\' > {}'.format(value, file_path)) except Exception as e: # Catch any exception for later retry failed_recover_files[file_path] = value @@ -1192,3 +1209,67 @@ def mock_cpu_pack_temperature(self, temperature): def get_cpu_cooling_state(self): return int(self.mock_helper.read_value(self.CPU_COOLING_STATE_FILE)) + + +@mocker('PsuPowerThresholdMocker') +class PsuPowerThresholdMocker(object): + PORT_AMBIENT_TEMP = '/var/run/hw-management/thermal/port_amb' + FAN_AMBIENT_TEMP = '/var/run/hw-management/thermal/fan_amb' + AMBIENT_TEMP_CRITICAL_THRESHOLD = '/var/run/hw-management/config/amb_tmp_crit_limit' + AMBIENT_TEMP_WARNING_THRESHOLD = '/var/run/hw-management/config/amb_tmp_warn_limit' + PSU_POWER_SLOPE = '/var/run/hw-management/config/psu_power_slope' + PSU_POWER_CAPACITY = '/var/run/hw-management/config/psu{}_power_capacity' + PSU_POWER = '/var/run/hw-management/power/psu{}_power' + + def __init__(self, dut): + self.mock_helper = MockerHelper(dut) + + def deinit(self): + self.mock_helper.deinit() + + def mock_power_threshold(self, number_psus): + self.mock_helper.mock_value(self.AMBIENT_TEMP_WARNING_THRESHOLD, 65000, True) + self.mock_helper.mock_value(self.AMBIENT_TEMP_CRITICAL_THRESHOLD, 75000, True) + self.mock_helper.mock_value(self.PSU_POWER_SLOPE, 2000, True) + + 
max_power = None + for i in range(number_psus): + if not max_power: + power = int(self.mock_helper.read_value(self.PSU_POWER.format(i + 1))) + # Round up to 100 watt and then double it to avoid noise when power fluctuate + max_power = int(round(power/100000000.0)) * 100000000 * 2 + self.mock_helper.mock_value(self.PSU_POWER_CAPACITY.format(i + 1), max_power, True) + + # Also mock ambient temperatures + self.mock_helper.mock_value(self.PORT_AMBIENT_TEMP, self.read_port_ambient_thermal()) + self.mock_helper.mock_value(self.FAN_AMBIENT_TEMP, self.read_fan_ambient_thermal()) + + def mock_psu_power(self, psu, power): + self.mock_helper.mock_value(self.PSU_POWER.format(psu), int(power)) + + def mock_fan_ambient_thermal(self, temperature): + self.mock_helper.mock_value(self.FAN_AMBIENT_TEMP, int(temperature)) + + def mock_port_ambient_thermal(self, temperature): + self.mock_helper.mock_value(self.PORT_AMBIENT_TEMP, int(temperature)) + + def read_psu_power_threshold(self, psu): + return int(self.mock_helper.read_value(self.PSU_POWER_CAPACITY.format(psu))) + + def read_psu_power_slope(self): + return int(self.mock_helper.read_value(self.PSU_POWER_SLOPE)) + + def read_psu_power(self, psu): + return int(self.mock_helper.read_value(self.PSU_POWER.format(psu))) + + def read_ambient_temp_critical_threshold(self): + return int(self.mock_helper.read_value(self.AMBIENT_TEMP_CRITICAL_THRESHOLD)) + + def read_ambient_temp_warning_threshold(self): + return int(self.mock_helper.read_value(self.AMBIENT_TEMP_WARNING_THRESHOLD)) + + def read_port_ambient_thermal(self): + return int(self.mock_helper.read_value(self.PORT_AMBIENT_TEMP)) + + def read_fan_ambient_thermal(self): + return int(self.mock_helper.read_value(self.FAN_AMBIENT_TEMP)) diff --git a/tests/platform_tests/mellanox/test_psu_power_threshold.py b/tests/platform_tests/mellanox/test_psu_power_threshold.py new file mode 100644 index 0000000000..4d17dd468b --- /dev/null +++ 
b/tests/platform_tests/mellanox/test_psu_power_threshold.py
@@ -0,0 +1,338 @@
+"""Mock test of PSU power threshold checking on Mellanox platforms.
+
+PSU power and ambient temperatures are mocked via PsuPowerThresholdMocker, then
+STATE_DB, SYSTEM_HEALTH_INFO and syslog are checked for the expected state.
+"""
+import allure
+import logging
+import time
+import pytest
+from tests.common.plugins.loganalyzer.loganalyzer import LogAnalyzer
+from tests.common.helpers.assertions import pytest_assert
+from tests.common.mellanox_data import get_platform_data
+from tests.common.utilities import wait_until
+from tests.platform_tests.thermal_control_test_helper import mocker_factory
+from mellanox_thermal_control_test_helper import MockerHelper, PsuPowerThresholdMocker
+
+pytestmark = [
+    pytest.mark.asic('mellanox'),
+    pytest.mark.topology('any')
+]
+
+logger = logging.getLogger(__name__)
+
+# Set by the mock_power_threshold fixture and read by the test case
+mocker = None
+
+MAX_PSUS = None
+
+
+# We can not set it as module because mocker_factory is function scope
+@pytest.fixture
+def mock_power_threshold(request, duthosts, rand_one_dut_hostname, mocker_factory):
+    """Prepare the DUT for the PSU power threshold test and clean up afterwards.
+
+    The test is skipped when the thresholds cannot be read for every PSU, unless
+    "--mock_any_testbed" is given; in that case the thresholds are mocked and
+    psud is restarted. Teardown removes the mock files and, if psud was
+    restarted during setup, restarts it again.
+    """
+    global mocker
+    global MAX_PSUS
+
+    psudaemon_restarted = False
+
+    duthost = duthosts[rand_one_dut_hostname]
+    platform_data = get_platform_data(duthost)
+    MAX_PSUS = platform_data['psus']['number']
+
+    mocker = mocker_factory(duthost, 'PsuPowerThresholdMocker')
+
+    all_psus_supporting_thresholds = True
+
+    try:
+        for psu_index in range(MAX_PSUS):
+            mocker.read_psu_power_threshold(psu_index + 1)
+    except Exception:
+        # Any read failure is treated as the thresholds not being supported
+        all_psus_supporting_thresholds = False
+
+    if all_psus_supporting_thresholds:
+        slope = None
+        ambient_critical_threshold = None
+        ambient_warning_threshold = None
+        try:
+            slope = mocker.read_psu_power_slope()
+            ambient_critical_threshold = mocker.read_ambient_temp_critical_threshold()
+            ambient_warning_threshold = mocker.read_ambient_temp_warning_threshold()
+        except Exception:
+            pytest.fail('Some required information does not exist (slope {}, ambient thresholds critical {} warning {})'.format(
+                slope,
+                ambient_critical_threshold,
+                ambient_warning_threshold))
+
+    MockPlatform = request.config.getoption("--mock_any_testbed")
+    if MockPlatform:
+        if all_psus_supporting_thresholds:
+            logger.info('CLI option "--mock_any_testbed" is provided while power thresholds are supported on both PSUs')
+
+        logger.info('Mocking the system to support PSU power threshold')
+        mocker.mock_power_threshold(MAX_PSUS)
+
+        # Restart PSU daemon to take the mock stuff
+        logger.info('Restart PSU daemon to take mock PSU power threshold')
+        duthost.shell('docker exec -ti pmon supervisorctl restart psud')
+        psudaemon_restarted = True
+        time.sleep(2)
+    elif not all_psus_supporting_thresholds:
+        pytest.skip('PSU power threshold is not supported')
+
+    yield
+
+    logger.info('Clean all mock files')
+    mocker.deinit()
+
+    if psudaemon_restarted:
+        logger.info('Restore PSU daemon')
+        duthost.shell('docker exec -ti pmon supervisorctl restart psud')
+        time.sleep(2)
+
+
+def init_log_analyzer(duthost, marker, expected, ignored=None):
+    """Start a log analyzer session and return (loganalyzer, start marker).
+
+    :param duthost: DUT host object passed to LogAnalyzer as ansible_host.
+    :param marker: Marker prefix passed to LogAnalyzer.
+    :param expected: List of regexes assigned as the expected messages.
+    :param ignored: Optional list of regexes appended to the ignore list.
+    """
+    loganalyzer = LogAnalyzer(ansible_host=duthost, marker_prefix=marker)
+    marker = loganalyzer.init()
+
+    loganalyzer.load_common_config()
+    loganalyzer.expect_regex = expected
+    if ignored:
+        loganalyzer.ignore_regex.extend(ignored)
+
+    return loganalyzer, marker
+
+
+def check_log_analyzer(loganalyzer, marker):
+    """Analyze the log starting from marker and return the analyzer."""
+    loganalyzer.analyze(marker)
+    return loganalyzer
+
+
+@pytest.mark.disable_loganalyzer
+def test_psu_power_threshold(request, duthosts, rand_one_dut_hostname, mock_power_threshold):
+    """Mock PSU power and ambient temperatures and verify threshold handling.
+
+    For each PSU the power is moved through the (warning, critical),
+    (critical, infinity) and (low value, warning) ranges; after each step
+    STATE_DB, the system health info and the expected log are checked.
+    """
+    def _check_psu_info_in_db(psu_index, power, power_warning_threshold, power_critical_threshold, power_overload):
+        """Return True if STATE_DB and system health match the expected state."""
+        psuname = 'PSU {}'.format(psu_index)
+        command_check_psu_db = 'sonic-db-cli STATE_DB hmget "PSU_INFO|{}" power power_warning_threshold power_critical_threshold power_overload'.format(psuname)
+        output = duthost.shell(command_check_psu_db)['stdout'].split()
+        if len(output) != 4:
+            pytest.fail('Got wrong information ({}) from STATE_DB PSU_INFO|{}'.format(output, psuname))
+
+        # Floor division keeps the expected values integral on both Python 2 and 3
+        if int(float(output[0])) != power // 1000000 \
+           or int(float(output[1])) != power_warning_threshold // 1000000 \
+           or int(float(output[2])) != power_critical_threshold // 1000000 \
+           or output[3] != str(power_overload):
+            return False
+
+        command_check_system_health_db = 'sonic-db-cli STATE_DB hget SYSTEM_HEALTH_INFO "{}"'
+        summary = duthost.shell(command_check_system_health_db.format('summary'))['stdout'].strip()
+        if power_overload:
+            if 'Not OK' in summary:
+                detail = duthost.shell(command_check_system_health_db.format(psuname))['stdout'].strip()
+                if 'exceeds threshold' in detail:
+                    return True
+        elif summary == 'OK':
+            return True
+        else:
+            detail = duthost.shell(command_check_system_health_db.format(psuname))['stdout'].strip()
+            if not detail:
+                return True
+            else:
+                logger.info('SYSTEM_HEALTH_INFO: {} is not OK due to {}'.format(psuname, detail))
+
+        return False
+
+    def _calculate_psu_power_threshold(ambient_threshold, port_ambient, fan_ambient):
+        """Return the power threshold expected for the given ambient temperatures."""
+        ambient_temperature = min(port_ambient, fan_ambient)
+        if ambient_temperature <= ambient_threshold:
+            return power_capacity
+
+        return power_capacity - slope * (ambient_temperature - ambient_threshold)
+
+    def _update_ambient_sensors_and_check_db(psu_index, port_ambient_mock, fan_ambient_mock, power, was_power_exceeded):
+        """Mock both ambient sensors, wait for the DB to match, return the new thresholds."""
+        power_critical_threshold = _calculate_psu_power_threshold(ambient_critical_threshold, port_ambient_mock, fan_ambient_mock)
+        power_warning_threshold = _calculate_psu_power_threshold(ambient_warning_threshold, port_ambient_mock, fan_ambient_mock)
+
+        logger.info('Mock ambient temperature sensors (fan {} port {}) and check the thresholds'.format(
+            fan_ambient_mock // 1000,
+            port_ambient_mock // 1000))
+        mocker.mock_port_ambient_thermal(port_ambient_mock)
+        mocker.mock_fan_ambient_thermal(fan_ambient_mock)
+        # Check whether thresholds are updated
+        pytest_assert(wait_until(10,
+                                 2,
+                                 0,
+                                 _check_psu_info_in_db,
+                                 psu_index,
+                                 power,
+                                 power_warning_threshold,
+                                 power_critical_threshold,
+                                 was_power_exceeded))
+
+        return power_warning_threshold, power_critical_threshold
+
+    def _update_power_and_check_db(psu_index, power_warning_threshold, power_critical_threshold, power, was_power_exceeded):
+        """Mock the PSU power and wait for the DB to reflect the expected overload state."""
+        logger.info('Mock PSU {} power to {} (the warning threshold {}, the critical threshold {})'.format(
+            psu_index,
+            power // 1000000,
+            power_warning_threshold // 1000000,
+            power_critical_threshold // 1000000))
+
+        mocker.mock_psu_power(psu_index, power)
+        # The overload state is expected to toggle only when the power drops below
+        # the warning threshold or reaches the critical one; wait longer in that case.
+        if (was_power_exceeded and power < power_warning_threshold) \
+           or (not was_power_exceeded and power >= power_critical_threshold):
+            timeout = 80
+            interval = 10
+            is_power_exceeded = not was_power_exceeded
+        else:
+            timeout = 10
+            interval = 2
+            is_power_exceeded = was_power_exceeded
+
+        pytest_assert(wait_until(timeout,
+                                 interval,
+                                 0,
+                                 _check_psu_info_in_db,
+                                 psu_index,
+                                 power,
+                                 power_warning_threshold,
+                                 power_critical_threshold,
+                                 is_power_exceeded))
+
+    duthost = duthosts[rand_one_dut_hostname]
+
+    slope = mocker.read_psu_power_slope()
+    ambient_critical_threshold = mocker.read_ambient_temp_critical_threshold()
+    ambient_warning_threshold = mocker.read_ambient_temp_warning_threshold()
+    fan_ambient = mocker.read_fan_ambient_thermal()
+    port_ambient = mocker.read_port_ambient_thermal()
+
+    if fan_ambient > ambient_warning_threshold or port_ambient > ambient_warning_threshold:
+        pytest.fail('Fan ambient temperature {} or port ambient temperature {} exceeds the ambient warning threshold'.format(fan_ambient, port_ambient))
+
+    for i in range(MAX_PSUS):
+        psu_index = i + 1
+
+        logger.info('Starting mock test on PSU {}'.format(psu_index))
+
+        power_capacity = mocker.read_psu_power_threshold(psu_index)
+        power = mocker.read_psu_power(psu_index)
+
+        if power > power_capacity:
+            pytest.fail('Current power {} exceeds maximum power capacity {}'.format(power, power_capacity))
+
+        # Ignore some possible errors
+        loganalyzer, marker = init_log_analyzer(duthost,
+                                                'PSU power exceeding test',
+                                                [],
+                                                [])
+
+        # Mock the power as well.
+        # This is to make sure the power will be a fixed value because it can fluctuate if it was read from a sensor.
+        logger.info('Mock PSU power to {} which is in normal range'.format(power // 1000000))
+        mocker.mock_psu_power(psu_index, power)
+
+        power_warning_threshold = None
+        power_critical_threshold = None
+
+        with allure.step('Mock power to range (warning, critical)'):
+            with allure.step('Mock ambient temperature sensors'):
+                power_warning_threshold, power_critical_threshold = \
+                    _update_ambient_sensors_and_check_db(psu_index,
+                                                         ambient_warning_threshold + (ambient_critical_threshold - ambient_warning_threshold) // 2,
+                                                         ambient_critical_threshold,
+                                                         power,
+                                                         False)
+
+            with allure.step('Mock the power'):
+                power = power_warning_threshold + 1000000
+                _update_power_and_check_db(psu_index,
+                                           power_warning_threshold,
+                                           power_critical_threshold,
+                                           power,
+                                           False)
+
+        with allure.step('Mock power to range (critical, infinity)'):
+            with allure.step('Mock ambient temperature sensors'):
+                power_warning_threshold, power_critical_threshold = \
+                    _update_ambient_sensors_and_check_db(psu_index,
+                                                         ambient_critical_threshold + 5000,
+                                                         ambient_critical_threshold + 1000,
+                                                         power,
+                                                         False)
+            # Prepare for log analyzer
+            check_log_analyzer(loganalyzer, marker)
+            loganalyzer, marker = init_log_analyzer(duthost,
+                                                    'PSU power exceeds threshold',
+                                                    ['PSU power warning: PSU {} power .* exceeds critical threshold'.format(psu_index)])
+
+            with allure.step('Mock the power'):
+                power = power_critical_threshold + 1000000
+                _update_power_and_check_db(psu_index,
+                                           power_warning_threshold,
+                                           power_critical_threshold,
+                                           power,
+                                           False)
+
+            # Check whether the expected message is found
+            check_log_analyzer(loganalyzer, marker)
+            loganalyzer, marker = init_log_analyzer(duthost, 'PSU power exceeding threshold', [])
+
+        with allure.step('Mock power to range (warning, critical)'):
+            power = power_critical_threshold - 1000000
+            _update_power_and_check_db(psu_index,
+                                       power_warning_threshold,
+                                       power_critical_threshold,
+                                       power,
+                                       True)
+
+        with allure.step('Mock power to range (a low value, warning)'):
+            with allure.step('Mock ambient temperature sensors'):
+                power_warning_threshold, power_critical_threshold = \
+                    _update_ambient_sensors_and_check_db(psu_index,
+                                                         ambient_critical_threshold + 1000,
+                                                         ambient_warning_threshold + (ambient_critical_threshold - ambient_warning_threshold) // 2,
+                                                         power,
+                                                         True)
+
+            # Prepare log analyzer
+            check_log_analyzer(loganalyzer, marker)
+            loganalyzer, marker = init_log_analyzer(duthost,
+                                                    'PSU power become back to normal',
+                                                    ['PSU power warning cleared: PSU {} power .* is back to normal'.format(psu_index)])
+
+            with allure.step('Mock power'):
+                _update_power_and_check_db(psu_index,
+                                           power_warning_threshold,
+                                           power_critical_threshold,
+                                           power_warning_threshold - 1000000,
+                                           True)
+
+            check_log_analyzer(loganalyzer, marker)