Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[Mellanox] Fix thermal control bugs #7

Closed
wants to merge 10 commits into from
4 changes: 2 additions & 2 deletions platform/mellanox/mlnx-platform-api/sonic_platform/chassis.py
Original file line number Diff line number Diff line change
Expand Up @@ -104,9 +104,9 @@ def initialize_fan(self):

for index in range(num_of_fan):
if multi_rotor_in_drawer:
fan = Fan(has_fan_dir, index, index/2)
fan = Fan(has_fan_dir, index, index/2, False, self.sku_name)
else:
fan = Fan(has_fan_dir, index, index)
fan = Fan(has_fan_dir, index, index, False, self.sku_name)
self._fan_list.append(fan)


Expand Down
34 changes: 22 additions & 12 deletions platform/mellanox/mlnx-platform-api/sonic_platform/fan.py
Original file line number Diff line number Diff line change
Expand Up @@ -25,17 +25,22 @@
# fan_dir isn't supported on Spectrum 1. It is supported on Spectrum 2 and later switches
FAN_DIR = "/var/run/hw-management/system/fan_dir"

# SKUs with unplugable FANs:
# 1. don't have fanX_status and should be treated as always present
hwsku_dict_with_unplugable_fan = ['ACS-MSN2010', 'ACS-MSN2100']

class Fan(FanBase):
"""Platform-specific Fan class"""

STATUS_LED_COLOR_ORANGE = "orange"

def __init__(self, has_fan_dir, fan_index, drawer_index = 1, psu_fan = False):
def __init__(self, has_fan_dir, fan_index, drawer_index = 1, psu_fan = False, sku = None):
# API index is starting from 0, Mellanox platform index is starting from 1
self.index = fan_index + 1
self.drawer_index = drawer_index + 1

self.is_psu_fan = psu_fan
self.always_presence = False if sku not in hwsku_dict_with_unplugable_fan else True

self.fan_min_speed_path = "fan{}_min".format(self.index)
if not self.is_psu_fan:
Expand All @@ -47,7 +52,7 @@ def __init__(self, has_fan_dir, fan_index, drawer_index = 1, psu_fan = False):
else:
self.fan_speed_get_path = "psu{}_fan1_speed_get".format(self.index)
self.fan_presence_path = "psu{}_fan1_speed_get".format(self.index)
self._name = 'psu_{}_fan_{}'.format(self.index, fan_index)
self._name = 'psu_{}_fan_{}'.format(self.index, 1)
self.fan_max_speed_path = None
self.fan_status_path = "fan{}_fault".format(self.index)
self.fan_green_led_path = "led_fan{}_green".format(self.drawer_index)
Expand Down Expand Up @@ -80,13 +85,13 @@ def get_direction(self):
1 stands for forward, in other words intake
0 stands for reverse, in other words exhaust
"""
if not self.fan_dir or self.is_psu_fan:
if not self.fan_dir or self.is_psu_fan or not self.get_presence():
return self.FAN_DIRECTION_NOT_APPLICABLE

try:
with open(os.path.join(self.fan_dir), 'r') as fan_dir:
fan_dir_bits = int(fan_dir.read())
fan_mask = 1 << self.index - 1
fan_mask = 1 << self.drawer_index - 1
if fan_dir_bits & fan_mask:
return self.FAN_DIRECTION_INTAKE
else:
Expand All @@ -107,15 +112,15 @@ def get_status(self):
"""
status = 0
if self.is_psu_fan:
status = 1
status = 0
else:
try:
with open(os.path.join(FAN_PATH, self.fan_status_path), 'r') as fault_status:
status = int(fault_status.read())
except (ValueError, IOError):
status = 0
status = 1

return status == 1
return status == 0


def get_presence(self):
Expand All @@ -132,11 +137,14 @@ def get_presence(self):
else:
status = 0
else:
try:
with open(os.path.join(FAN_PATH, self.fan_presence_path), 'r') as presence_status:
status = int(presence_status.read())
except (ValueError, IOError):
status = 0
if self.always_presence:
status = 1
else:
try:
with open(os.path.join(FAN_PATH, self.fan_presence_path), 'r') as presence_status:
status = int(presence_status.read())
except (ValueError, IOError):
status = 0

return status == 1

Expand Down Expand Up @@ -183,6 +191,8 @@ def get_speed(self):

max_speed_in_rpm = self._get_max_speed_in_rpm()
speed = 100*speed_in_rpm/max_speed_in_rpm
if speed > 100:
speed = 100

return speed

Expand Down
28 changes: 23 additions & 5 deletions platform/mellanox/mlnx-platform-api/sonic_platform/psu.py
Original file line number Diff line number Diff line change
Expand Up @@ -100,18 +100,18 @@ def __init__(self, psu_index, sku):
self.psu_presence = psu_presence

fan = Fan(sku, psu_index, psu_index, True)
if fan.get_presence():
self._fan_list.append(fan)

def get_name(self):
return self._name
self._fan_list.append(fan)

self.psu_green_led_path = "led_psu_green"
self.psu_red_led_path = "led_psu_red"
self.psu_orange_led_path = "led_psu_orange"
self.psu_led_cap_path = "led_psu_capability"


def get_name(self):
return self._name


def _read_generic_file(self, filename, len):
"""
Read a generic file, returns the contents of the file
Expand Down Expand Up @@ -287,3 +287,21 @@ def get_status_led(self):
raise RuntimeError("Failed to read led status for psu due to {}".format(repr(e)))

return self.STATUS_LED_COLOR_OFF


def get_power_available_status(self):
"""
Gets the power available status

Returns:
True if power is present and power on.
False and "absence of PSU" if power is not present.
False and "absence of power" if power is present but not power on.
"""
if not self.get_presence():
return False, "absence of PSU"
elif not self.get_powergood_status():
return False, "absence of power"
else:
return True, ""

Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

suggest using "get_power_available_status"

30 changes: 19 additions & 11 deletions platform/mellanox/mlnx-platform-api/sonic_platform/thermal.py
Original file line number Diff line number Diff line change
Expand Up @@ -64,7 +64,8 @@
}
thermal_api_handler_gearbox = {
THERMAL_API_GET_TEMPERATURE:"gearbox{}_temp_input",
THERMAL_API_GET_HIGH_THRESHOLD:None
THERMAL_API_GET_HIGH_THRESHOLD:None,
THERMAL_API_GET_HIGH_CRITICAL_THRESHOLD:None
}
thermal_ambient_apis = {
THERMAL_DEV_ASIC_AMBIENT : "asic",
Expand Down Expand Up @@ -263,7 +264,7 @@ def initialize_thermals(sku, thermal_list, psu_list):
else:
if category == THERMAL_DEV_CATEGORY_PSU:
for index in range(count):
thermal = Thermal(category, start + index, True, psu_list[index].get_powergood_status, "power off")
thermal = Thermal(category, start + index, True, psu_list[index].get_power_available_status)
thermal_list.append(thermal)
else:
for index in range(count):
Expand All @@ -273,7 +274,7 @@ def initialize_thermals(sku, thermal_list, psu_list):


class Thermal(ThermalBase):
def __init__(self, category, index, has_index, dependency = None, hint = None):
def __init__(self, category, index, has_index, dependency = None):
"""
index should be a string for category ambient and int for other categories
"""
Expand All @@ -292,7 +293,6 @@ def __init__(self, category, index, has_index, dependency = None, hint = None):
self.high_threshold = self._get_file_from_api(THERMAL_API_GET_HIGH_THRESHOLD)
self.high_critical_threshold = self._get_file_from_api(THERMAL_API_GET_HIGH_CRITICAL_THRESHOLD)
self.dependency = dependency
self.dependent_hint = hint


def get_name(self):
Expand Down Expand Up @@ -344,13 +344,11 @@ def get_temperature(self):
A float number of current temperature in Celsius up to nearest thousandth
of one degree Celsius, e.g. 30.125
"""
if self.dependency and not self.dependency():
if self.dependent_hint:
hint = self.dependent_hint
else:
hint = "unknown reason"
logger.log_info("get_temperature for {} failed due to {}".format(self.name, hint))
return None
if self.dependency:
status, hint = self.dependency()
if not status:
logger.log_debug("get_temperature for {} failed due to {}".format(self.name, hint))
return None
value_str = self._read_generic_file(self.temperature, 0)
if value_str is None:
return None
Expand All @@ -370,6 +368,11 @@ def get_high_threshold(self):
"""
if self.high_threshold is None:
return None
if self.dependency:
status, hint = self.dependency()
if not status:
logger.log_debug("get_high_threshold for {} failed due to {}".format(self.name, hint))
return None
value_str = self._read_generic_file(self.high_threshold, 0)
if value_str is None:
return None
Expand All @@ -389,6 +392,11 @@ def get_high_critical_threshold(self):
"""
if self.high_critical_threshold is None:
return None
if self.dependency:
status, hint = self.dependency()
if not status:
logger.log_debug("get_high_critical_threshold for {} failed due to {}".format(self.name, hint))
return None
value_str = self._read_generic_file(self.high_critical_threshold, 0)
if value_str is None:
return None
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -77,12 +77,12 @@ def collect(self, chassis):
"""
self._status_changed = False
for psu in chassis.get_all_psus():
if psu.get_presence() and psu not in self._presence_psus:
if psu.get_presence() and psu.get_powergood_status() and psu not in self._presence_psus:
self._presence_psus.add(psu)
self._status_changed = True
if psu in self._absence_psus:
self._absence_psus.remove(psu)
elif not psu.get_presence() and psu not in self._absence_psus:
elif (not psu.get_presence() or not psu.get_powergood_status()) and psu not in self._absence_psus:
self._absence_psus.add(psu)
self._status_changed = True
if psu in self._presence_psus:
Expand Down
18 changes: 18 additions & 0 deletions platform/mellanox/mlnx-platform-api/tests/duplicate_action.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,18 @@
{
"name": "any fan absence",
"conditions": [
{
"type": "fan.any.absence"
}
],
"actions": [
{
"type": "fan.all.set_speed",
"speed": "100"
},
{
"type": "fan.all.set_speed",
"speed": "100"
}
]
}
17 changes: 17 additions & 0 deletions platform/mellanox/mlnx-platform-api/tests/duplicate_condition.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,17 @@
{
"name": "any fan absence",
"conditions": [
{
"type": "fan.any.absence"
},
{
"type": "fan.any.absence"
}
],
"actions": [
{
"type": "fan.all.set_speed",
"speed": "100"
}
]
}
10 changes: 10 additions & 0 deletions platform/mellanox/mlnx-platform-api/tests/empty_action.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,10 @@
{
"name": "any fan absence",
"conditions": [
{
"type": "fan.any.absence"
}
],
"actions": [
]
}
11 changes: 11 additions & 0 deletions platform/mellanox/mlnx-platform-api/tests/empty_condition.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,11 @@
{
"name": "any fan absence",
"conditions": [
],
"actions": [
{
"type": "fan.all.set_speed",
"speed": "100"
}
]
}
4 changes: 4 additions & 0 deletions platform/mellanox/mlnx-platform-api/tests/mock_platform.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,10 +13,14 @@ def set_speed(self, speed):
class MockPsu:
def __init__(self):
self.presence = True
self.powergood = True

def get_presence(self):
return self.presence

def get_powergood_status(self):
return self.powergood


class MockChassis:
def __init__(self):
Expand Down
Loading