Skip to content

Commit

Permalink
[Mellanox] Add CPU thermal control for SN4800
Browse files Browse the repository at this point in the history
  • Loading branch information
Junchao-Mellanox committed Mar 10, 2022
1 parent cf1bc8d commit f65991b
Show file tree
Hide file tree
Showing 5 changed files with 159 additions and 2 deletions.
Original file line number Diff line number Diff line change
@@ -0,0 +1,46 @@
from sonic_py_common.task_base import ThreadTaskBase

from . import utils
from .device_data import DeviceDataManager


class CPUThermalControl(ThreadTaskBase):
    """Periodic task that adjusts the CPU cooling state from the CPU temperature.

    On every cycle the CPU pack temperature is read and compared with the
    platform thresholds returned by
    ``DeviceDataManager.get_cpu_thermal_threshold()``:

    * below the low threshold  -> cooling state is set to ``MIN_COOLING_STATE``
    * above the high threshold -> cooling state is set to ``MAX_COOLING_STATE``
    * in between               -> cooling state moves one step up or down
      depending on whether the temperature rose or fell since the last cycle
    """

    # File holding the current cooling state (read and written by this task).
    CPU_COOLING_STATE = '/var/run/hw-management/thermal/cooling2_cur_state'
    # File holding the CPU temperature reading.
    CPU_TEMP_FILE = '/var/run/hw-management/thermal/cpu_pack'
    # Bounds applied to every cooling state this task writes.
    MAX_COOLING_STATE = 10
    MIN_COOLING_STATE = 2
    # Timeout passed to ``task_stopping_event.wait`` between two cycles
    # (presumably seconds -- confirm against ThreadTaskBase).
    INTERVAL = 3

    def __init__(self):
        super(CPUThermalControl, self).__init__()
        thresholds = DeviceDataManager.get_cpu_thermal_threshold()
        self.temp_low, self.temp_high = thresholds

    def task_worker(self):
        """Run one control cycle per interval until the stop event is set."""
        previous_temp = 0
        while True:
            # wait() doubles as the sleep between cycles; a truthy result
            # means the task has been asked to stop.
            if self.task_stopping_event.wait(self.INTERVAL):
                break
            previous_temp = self.run(previous_temp)

    def run(self, last_temp):
        """Execute a single control cycle.

        Args:
            last_temp: Temperature observed by the previous cycle.

        Returns:
            The temperature read during this cycle, to be passed back in as
            ``last_temp`` on the next call.
        """
        temp = self.read_cpu_temp()

        if temp < self.temp_low:
            self.set_cooling_state(self.MIN_COOLING_STATE)
            return temp

        if temp > self.temp_high:
            self.set_cooling_state(self.MAX_COOLING_STATE)
            return temp

        # Inside the [low, high] band: follow the temperature trend one step
        # at a time, clamped to the allowed range. Nothing is written when
        # the temperature is unchanged.
        state = self.get_cooling_state()
        if temp > last_temp:
            self.set_cooling_state(min(state + 1, self.MAX_COOLING_STATE))
        elif temp < last_temp:
            self.set_cooling_state(max(state - 1, self.MIN_COOLING_STATE))
        return temp

    def set_cooling_state(self, state):
        """Write ``state`` to the cooling state file."""
        utils.write_file(self.CPU_COOLING_STATE, state, log_func=None)

    def get_cooling_state(self):
        """Read the current cooling state, defaulting to ``MAX_COOLING_STATE``."""
        return utils.read_int_from_file(
            self.CPU_COOLING_STATE,
            default=self.MAX_COOLING_STATE,
            log_func=None,
        )

    def read_cpu_temp(self):
        """Read the CPU temperature, defaulting to the high threshold.

        Readings above 1000 are divided by 1000 and truncated to an int
        (presumably the file may report millidegrees -- confirm).
        """
        raw = utils.read_int_from_file(self.CPU_TEMP_FILE, default=self.temp_high, log_func=None)
        if raw > 1000:
            return int(raw / 1000)
        return raw

Original file line number Diff line number Diff line change
Expand Up @@ -148,7 +148,8 @@
'thermal': {
"capability": {
"comex_amb": False
}
},
'cpu_threshold': (80, 95) # min=80, max=95
},
'sfp': {
'max_port_per_line_card': 16
Expand Down Expand Up @@ -263,3 +264,20 @@ def get_linecard_max_port_count(cls):
if not sfp_data:
return 0
return sfp_data.get('max_port_per_line_card', 0)

@classmethod
def is_cpu_thermal_control_supported(cls):
    """Tell whether the current platform defines CPU thermal thresholds.

    Returns:
        bool: True unless ``get_cpu_thermal_threshold()`` yields
        ``(None, None)``.
    """
    no_threshold = (None, None)
    threshold = cls.get_cpu_thermal_threshold()
    return threshold != no_threshold

@classmethod
@utils.read_only_cache()
def get_cpu_thermal_threshold(cls):
    """Look up the CPU thermal thresholds of the current platform.

    Returns:
        The ``'cpu_threshold'`` entry found under the platform's ``'thermal'``
        data in ``DEVICE_DATA``, or ``(None, None)`` when the platform, its
        thermal data, or the threshold entry is missing.
    """
    no_threshold = (None, None)

    platform_data = DEVICE_DATA.get(cls.get_platform_name())
    thermal_data = platform_data.get('thermal') if platform_data else None
    if thermal_data:
        return thermal_data.get('cpu_threshold', no_threshold)
    return no_threshold
9 changes: 9 additions & 0 deletions platform/mellanox/mlnx-platform-api/sonic_platform/thermal.py
Original file line number Diff line number Diff line change
Expand Up @@ -519,6 +519,15 @@ def monitor_asic_themal_zone(cls):
else:
cls.expect_cooling_state = None

@classmethod
def start_cpu_thermal_control(cls, chassis):
    """Platform guard for CPU thermal control.

    Returns immediately unless the platform name is
    'x86_64-nvidia_sn4800-r0'. No further action is visible in this
    method; ``chassis`` is not used here.
    """
    is_sn4800 = DeviceDataManager.get_platform_name() == 'x86_64-nvidia_sn4800-r0'
    if not is_sn4800:
        return





class RemovableThermal(Thermal):
def __init__(self, name, temp_file, high_th_file, high_crit_th_file, position, presence_cb):
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -15,13 +15,17 @@
# limitations under the License.
#
from sonic_platform_base.sonic_thermal_control.thermal_manager_base import ThermalManagerBase
from .cpu_thermal_control import CPUThermalControl
from .device_data import DeviceDataManager
from .thermal_actions import *
from .thermal_conditions import *
from .thermal_infos import *
from .thermal import logger, MAX_COOLING_LEVEL, Thermal


class ThermalManager(ThermalManagerBase):
cpu_thermal_control = None

@classmethod
def start_thermal_control_algorithm(cls):
"""
Expand All @@ -42,8 +46,30 @@ def stop_thermal_control_algorithm(cls):
"""
Thermal.set_thermal_algorithm_status(False)

@classmethod
def start_cpu_thermal_control_algoritm(cls):
    """Create and start the CPU thermal control task if needed.

    Does nothing when a task is already held in ``cls.cpu_thermal_control``
    or when ``DeviceDataManager`` reports that CPU thermal control is not
    supported.
    """
    # Short-circuit keeps the support check from running once a task exists.
    if cls.cpu_thermal_control or not DeviceDataManager.is_cpu_thermal_control_supported():
        return

    controller = CPUThermalControl()
    cls.cpu_thermal_control = controller
    controller.task_run()

@classmethod
def stop_cpu_thermal_control_algoritm(cls):
    """Stop the CPU thermal control task, if one is held, and forget it."""
    controller = cls.cpu_thermal_control
    if not controller:
        return

    controller.task_stop()
    cls.cpu_thermal_control = None

@classmethod
def run_policy(cls, chassis):
if cls._running:
cls.start_cpu_thermal_control_algoritm()
else:
cls.stop_cpu_thermal_control_algoritm()

if not cls._policy_dict:
return

Expand All @@ -59,7 +85,6 @@ def run_policy(cls, chassis):
if not cls._running:
return
try:
print(policy.name)
if policy.is_match(cls._thermal_info_dict):
policy.do_action(cls._thermal_info_dict)
except Exception as e:
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,59 @@
import glob
import os
import pytest
import sys
if sys.version_info.major == 3:
from unittest import mock
else:
import mock

test_path = os.path.dirname(os.path.abspath(__file__))
modules_path = os.path.dirname(test_path)
sys.path.insert(0, modules_path)

from sonic_platform.cpu_thermal_control import CPUThermalControl


class TestCPUThermalControl:
    """Unit tests for CPUThermalControl.run with file access mocked out."""

    @mock.patch('sonic_platform.device_data.DeviceDataManager.get_cpu_thermal_threshold', mock.MagicMock(return_value=(85, 95)))
    @mock.patch('sonic_platform.utils.read_int_from_file')
    @mock.patch('sonic_platform.utils.write_file')
    def test_run(self, mock_write_file, mock_read_file):
        """Walk run() through each temperature band and check the state written."""
        controller = CPUThermalControl()
        state_path = CPUThermalControl.CPU_COOLING_STATE
        temp_path = CPUThermalControl.CPU_TEMP_FILE
        max_state = CPUThermalControl.MAX_COOLING_STATE
        min_state = CPUThermalControl.MIN_COOLING_STATE

        # Fake file contents served by the mocked read_int_from_file.
        fake_files = {
            state_path: 5,
            temp_path: controller.temp_high + 1,
        }
        mock_read_file.side_effect = lambda file_path, **kwargs: fake_files[file_path]

        def assert_state_written(expected):
            mock_write_file.assert_called_with(state_path, expected, log_func=None)

        # Temperature above the high threshold -> maximum cooling state.
        controller.run(0)
        assert_state_written(max_state)

        # Temperature below the low threshold -> minimum cooling state.
        fake_files[temp_path] = controller.temp_low - 1
        controller.run(0)
        assert_state_written(min_state)

        # Temperature inside the band and rising -> one step up from 5.
        fake_files[temp_path] = controller.temp_low
        controller.run(0)
        assert_state_written(6)

        # Temperature inside the band and falling -> one step down from 5.
        controller.run(controller.temp_low + 1)
        assert_state_written(4)

        # Rising temperature while the cooling state is already the maximum
        # -> stays clamped at the maximum.
        fake_files[temp_path] = 85
        fake_files[state_path] = max_state
        controller.run(84)
        assert_state_written(max_state)

        # Falling temperature while the cooling state is already the minimum
        # -> stays clamped at the minimum.
        fake_files[state_path] = min_state
        controller.run(86)
        assert_state_written(min_state)

0 comments on commit f65991b

Please sign in to comment.