From 415b8c457625c514aff0f8ecbdbbb655414d8067 Mon Sep 17 00:00:00 2001 From: Junchao-Mellanox <57339448+Junchao-Mellanox@users.noreply.github.com> Date: Thu, 20 Aug 2020 02:42:57 +0800 Subject: [PATCH] [thermalctld] Optimize the thermal policy loop to make it execute every 60 seconds (#77) In regression testing, we found that the thermal policy loop sometimes takes 8 to 15 seconds, which stretches the loop interval to 68 to 75 seconds. This change makes the loop interval more accurate. Record the elapsed time of the thermal policy run, and start the next iteration after (60 - elapsed) seconds. --- sonic-thermalctld/scripts/thermalctld | 16 +++++++++++++++- 1 file changed, 15 insertions(+), 1 deletion(-) diff --git a/sonic-thermalctld/scripts/thermalctld b/sonic-thermalctld/scripts/thermalctld index 8ad0dd5c91ae..d70c9c69d80f 100644 --- a/sonic-thermalctld/scripts/thermalctld +++ b/sonic-thermalctld/scripts/thermalctld @@ -607,6 +607,9 @@ class ThermalMonitor(ProcessTaskBase): class ThermalControlDaemon(daemon_base.DaemonBase): # Interval to run thermal control logic INTERVAL = 60 + RUN_POLICY_WARN_THRESHOLD_SECS = 30 + FAST_START_INTERVAL = 15 + POLICY_FILE = '/usr/share/sonic/platform/thermal_policy.json' def __init__(self, log_identifier): @@ -658,12 +661,23 @@ class ThermalControlDaemon(daemon_base.DaemonBase): except Exception as e: self.log_error('Caught exception while initializing thermal manager - {}'.format(e)) - while not self.stop_event.wait(ThermalControlDaemon.INTERVAL): + wait_time = ThermalControlDaemon.INTERVAL + while not self.stop_event.wait(wait_time): + begin = time.time() try: if thermal_manager: thermal_manager.run_policy(chassis) except Exception as e: self.log_error('Caught exception while running thermal policy - {}'.format(e)) + elapsed = time.time() - begin + if elapsed < ThermalControlDaemon.INTERVAL: + wait_time = ThermalControlDaemon.INTERVAL - elapsed + else: + wait_time = ThermalControlDaemon.FAST_START_INTERVAL + + if elapsed > 
ThermalControlDaemon.RUN_POLICY_WARN_THRESHOLD_SECS: + self.log_warning('Thermal policy execution takes {} seconds, ' + 'there might be performance risk'.format(elapsed)) try: if thermal_manager: