Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[Mellanox] Fix thermal control bugs #7

Closed
wants to merge 10 commits into from
10 changes: 6 additions & 4 deletions platform/mellanox/mlnx-platform-api/sonic_platform/fan.py
Original file line number Diff line number Diff line change
Expand Up @@ -47,7 +47,7 @@ def __init__(self, has_fan_dir, fan_index, drawer_index = 1, psu_fan = False):
else:
self.fan_speed_get_path = "psu{}_fan1_speed_get".format(self.index)
self.fan_presence_path = "psu{}_fan1_speed_get".format(self.index)
self._name = 'psu_{}_fan_{}'.format(self.index, fan_index)
self._name = 'psu_{}_fan_{}'.format(self.index, 1)
self.fan_max_speed_path = None
self.fan_status_path = "fan{}_fault".format(self.index)
self.fan_green_led_path = "led_fan{}_green".format(self.drawer_index)
Expand Down Expand Up @@ -107,15 +107,15 @@ def get_status(self):
"""
status = 0
if self.is_psu_fan:
status = 1
status = 0
else:
try:
with open(os.path.join(FAN_PATH, self.fan_status_path), 'r') as fault_status:
status = int(fault_status.read())
except (ValueError, IOError):
status = 0
status = 1

return status == 1
return status == 0


def get_presence(self):
Expand Down Expand Up @@ -183,6 +183,8 @@ def get_speed(self):

max_speed_in_rpm = self._get_max_speed_in_rpm()
speed = 100*speed_in_rpm/max_speed_in_rpm
if speed > 100:
speed = 100

return speed

Expand Down
28 changes: 23 additions & 5 deletions platform/mellanox/mlnx-platform-api/sonic_platform/psu.py
Original file line number Diff line number Diff line change
Expand Up @@ -100,18 +100,18 @@ def __init__(self, psu_index, sku):
self.psu_presence = psu_presence

fan = Fan(sku, psu_index, psu_index, True)
if fan.get_presence():
self._fan_list.append(fan)

def get_name(self):
return self._name
self._fan_list.append(fan)

self.psu_green_led_path = "led_psu_green"
self.psu_red_led_path = "led_psu_red"
self.psu_orange_led_path = "led_psu_orange"
self.psu_led_cap_path = "led_psu_capability"


def get_name(self):
return self._name


def _read_generic_file(self, filename, len):
"""
Read a generic file, returns the contents of the file
Expand Down Expand Up @@ -287,3 +287,21 @@ def get_status_led(self):
raise RuntimeError("Failed to read led status for psu due to {}".format(repr(e)))

return self.STATUS_LED_COLOR_OFF


def get_power_available_status(self):
"""
Gets the power available status

Returns:
True if power is present and power on.
False and "power not present" if power is not present.
False and "power off" if power is present but not power on.
"""
if not self.get_presence():
return False, "power not present"
elif not self.get_powergood_status():
return False, "power off"
else:
return True, ""

Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

suggest using "get_power_available_status"

30 changes: 19 additions & 11 deletions platform/mellanox/mlnx-platform-api/sonic_platform/thermal.py
Original file line number Diff line number Diff line change
Expand Up @@ -64,7 +64,8 @@
}
thermal_api_handler_gearbox = {
THERMAL_API_GET_TEMPERATURE:"gearbox{}_temp_input",
THERMAL_API_GET_HIGH_THRESHOLD:None
THERMAL_API_GET_HIGH_THRESHOLD:None,
THERMAL_API_GET_HIGH_CRITICAL_THRESHOLD:None
}
thermal_ambient_apis = {
THERMAL_DEV_ASIC_AMBIENT : "asic",
Expand Down Expand Up @@ -263,7 +264,7 @@ def initialize_thermals(sku, thermal_list, psu_list):
else:
if category == THERMAL_DEV_CATEGORY_PSU:
for index in range(count):
thermal = Thermal(category, start + index, True, psu_list[index].get_powergood_status, "power off")
thermal = Thermal(category, start + index, True, psu_list[index].get_power_available_status)
thermal_list.append(thermal)
else:
for index in range(count):
Expand All @@ -273,7 +274,7 @@ def initialize_thermals(sku, thermal_list, psu_list):


class Thermal(ThermalBase):
def __init__(self, category, index, has_index, dependency = None, hint = None):
def __init__(self, category, index, has_index, dependency = None):
"""
index should be a string for category ambient and int for other categories
"""
Expand All @@ -292,7 +293,6 @@ def __init__(self, category, index, has_index, dependency = None, hint = None):
self.high_threshold = self._get_file_from_api(THERMAL_API_GET_HIGH_THRESHOLD)
self.high_critical_threshold = self._get_file_from_api(THERMAL_API_GET_HIGH_CRITICAL_THRESHOLD)
self.dependency = dependency
self.dependent_hint = hint


def get_name(self):
Expand Down Expand Up @@ -344,13 +344,11 @@ def get_temperature(self):
A float number of current temperature in Celsius up to nearest thousandth
of one degree Celsius, e.g. 30.125
"""
if self.dependency and not self.dependency():
if self.dependent_hint:
hint = self.dependent_hint
else:
hint = "unknown reason"
logger.log_info("get_temperature for {} failed due to {}".format(self.name, hint))
return None
if self.dependency:
status, hint = self.dependency()
if not status:
logger.log_debug("get_temperature for {} failed due to {}".format(self.name, hint))
return None
value_str = self._read_generic_file(self.temperature, 0)
if value_str is None:
return None
Expand All @@ -370,6 +368,11 @@ def get_high_threshold(self):
"""
if self.high_threshold is None:
return None
if self.dependency:
status, hint = self.dependency()
if not status:
logger.log_debug("get_high_threshold for {} failed due to {}".format(self.name, hint))
return None
value_str = self._read_generic_file(self.high_threshold, 0)
if value_str is None:
return None
Expand All @@ -389,6 +392,11 @@ def get_high_critical_threshold(self):
"""
if self.high_critical_threshold is None:
return None
if self.dependency:
status, hint = self.dependency()
if not status:
logger.log_debug("get_high_critical_threshold for {} failed due to {}".format(self.name, hint))
return None
value_str = self._read_generic_file(self.high_critical_threshold, 0)
if value_str is None:
return None
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -77,12 +77,12 @@ def collect(self, chassis):
"""
self._status_changed = False
for psu in chassis.get_all_psus():
if psu.get_presence() and psu not in self._presence_psus:
if psu.get_presence() and psu.get_powergood_status() and psu not in self._presence_psus:
self._presence_psus.add(psu)
self._status_changed = True
if psu in self._absence_psus:
self._absence_psus.remove(psu)
elif not psu.get_presence() and psu not in self._absence_psus:
elif (not psu.get_presence() or not psu.get_powergood_status()) and psu not in self._absence_psus:
self._absence_psus.add(psu)
self._status_changed = True
if psu in self._presence_psus:
Expand Down
18 changes: 18 additions & 0 deletions platform/mellanox/mlnx-platform-api/tests/duplicate_action.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,18 @@
{
"name": "any fan absence",
"conditions": [
{
"type": "fan.any.absence"
}
],
"actions": [
{
"type": "fan.all.set_speed",
"speed": "100"
},
{
"type": "fan.all.set_speed",
"speed": "100"
}
]
}
17 changes: 17 additions & 0 deletions platform/mellanox/mlnx-platform-api/tests/duplicate_condition.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,17 @@
{
"name": "any fan absence",
"conditions": [
{
"type": "fan.any.absence"
},
{
"type": "fan.any.absence"
}
],
"actions": [
{
"type": "fan.all.set_speed",
"speed": "100"
}
]
}
10 changes: 10 additions & 0 deletions platform/mellanox/mlnx-platform-api/tests/empty_action.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,10 @@
{
"name": "any fan absence",
"conditions": [
{
"type": "fan.any.absence"
}
],
"actions": [
]
}
11 changes: 11 additions & 0 deletions platform/mellanox/mlnx-platform-api/tests/empty_condition.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,11 @@
{
"name": "any fan absence",
"conditions": [
],
"actions": [
{
"type": "fan.all.set_speed",
"speed": "100"
}
]
}
4 changes: 4 additions & 0 deletions platform/mellanox/mlnx-platform-api/tests/mock_platform.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,10 +13,14 @@ def set_speed(self, speed):
class MockPsu:
def __init__(self):
self.presence = True
self.powergood = True

def get_presence(self):
return self.presence

def get_powergood_status(self):
return self.powergood


class MockChassis:
def __init__(self):
Expand Down
38 changes: 38 additions & 0 deletions platform/mellanox/mlnx-platform-api/tests/test_thermal_policy.py
Original file line number Diff line number Diff line change
Expand Up @@ -66,6 +66,12 @@ def test_psu_info():
assert len(psu_info.get_presence_psus()) == 1
assert psu_info.is_status_changed()

psu_list[0].powergood = False
psu_info.collect(chassis)
assert len(psu_info.get_absence_psus()) == 1
assert len(psu_info.get_presence_psus()) == 0
assert psu_info.is_status_changed()


def test_fan_policy(thermal_manager):
chassis = MockChassis()
Expand Down Expand Up @@ -269,4 +275,36 @@ def test_load_control_thermal_algo_action():
with pytest.raises(ValueError):
action.load_from_json(json_obj)

def test_load_duplicate_condition():
from sonic_platform_base.sonic_thermal_control.thermal_policy import ThermalPolicy
with open(os.path.join(test_path, 'duplicate_condition.json')) as f:
json_obj = json.load(f)
policy = ThermalPolicy()
with pytest.raises(Exception):
policy.load_from_json(json_obj)

def test_load_duplicate_action():
from sonic_platform_base.sonic_thermal_control.thermal_policy import ThermalPolicy
with open(os.path.join(test_path, 'duplicate_action.json')) as f:
json_obj = json.load(f)
policy = ThermalPolicy()
with pytest.raises(Exception):
policy.load_from_json(json_obj)

def test_load_empty_condition():
from sonic_platform_base.sonic_thermal_control.thermal_policy import ThermalPolicy
with open(os.path.join(test_path, 'empty_condition.json')) as f:
json_obj = json.load(f)
policy = ThermalPolicy()
with pytest.raises(Exception):
policy.load_from_json(json_obj)

def test_load_empty_action():
from sonic_platform_base.sonic_thermal_control.thermal_policy import ThermalPolicy
with open(os.path.join(test_path, 'empty_action.json')) as f:
json_obj = json.load(f)
policy = ThermalPolicy()
with pytest.raises(Exception):
policy.load_from_json(json_obj)