From 604c3080434014aef2e10046ee9b35be98c3d169 Mon Sep 17 00:00:00 2001 From: Nageswara Nandigam Date: Mon, 11 Apr 2022 12:34:29 -0700 Subject: [PATCH 1/5] check agent cg after goal state processed --- azurelinuxagent/common/cgroupconfigurator.py | 5 +++-- azurelinuxagent/ga/exthandlers.py | 5 ++--- azurelinuxagent/ga/update.py | 4 ++++ 3 files changed, 9 insertions(+), 5 deletions(-) diff --git a/azurelinuxagent/common/cgroupconfigurator.py b/azurelinuxagent/common/cgroupconfigurator.py index f37f08d938..e2767b6707 100644 --- a/azurelinuxagent/common/cgroupconfigurator.py +++ b/azurelinuxagent/common/cgroupconfigurator.py @@ -540,7 +540,7 @@ def __try_set_cpu_quota(quota): return False return True - def check_cgroups(self, cgroup_metrics): + def check_cgroups(self, cgroup_metrics=None): if not self.enabled(): return @@ -555,7 +555,8 @@ def check_cgroups(self, cgroup_metrics): quota_check_success = False try: - self._check_agent_throttled_time(cgroup_metrics) + if cgroup_metrics is not None: + self._check_agent_throttled_time(cgroup_metrics) quota_check_success = True except CGroupsException as exception: errors.append(exception) diff --git a/azurelinuxagent/ga/exthandlers.py b/azurelinuxagent/ga/exthandlers.py index cc9d0afc32..13d0ca3e09 100644 --- a/azurelinuxagent/ga/exthandlers.py +++ b/azurelinuxagent/ga/exthandlers.py @@ -1411,9 +1411,8 @@ def _enable_extension(self, extension, uninstall_exit_code): env = { ExtCommandEnvVariable.UninstallReturnCode: uninstall_exit_code } - # This check to call the setup if AzureMonitorLinuxAgent extension already installed and not called setup before - if self.is_azuremonitorlinuxagent(self.get_full_name()) and \ - not CGroupConfigurator.get_instance().is_extension_resource_limits_setup_completed(self.get_full_name()): + # This check to call the setup if extension already installed and not called setup before + if not CGroupConfigurator.get_instance().is_extension_resource_limits_setup_completed(self.get_full_name()): self.set_extension_resource_limits() self.set_operation(WALAEventOperation.Enable) diff --git a/azurelinuxagent/ga/update.py b/azurelinuxagent/ga/update.py index 2118ca6831..b02a4d09ac 100644 --- a/azurelinuxagent/ga/update.py +++ b/azurelinuxagent/ga/update.py @@ -614,6 +614,10 @@ def _process_goal_state(self, exthandlers_handler, remote_access_handler): self._extensions_summary = ExtensionsSummary() exthandlers_handler.run() + # check cgroup and disable if any extension started in agent cgroup after goal state processed. + # Note: Monitor thread periodically checks this in addition to here. + CGroupConfigurator.get_instance().check_cgroups() + # always report status, even if the goal state did not change # do it before processing the remote access, since that operation can take a long time self._report_status(exthandlers_handler) From 716ddbbefc46efd4bc7175ed7c7e0c0273d1a390 Mon Sep 17 00:00:00 2001 From: Nageswara Nandigam Date: Wed, 13 Apr 2022 17:45:13 -0700 Subject: [PATCH 2/5] fix PR comments --- azurelinuxagent/common/cgroupconfigurator.py | 50 +++++++++++--------- 1 file changed, 28 insertions(+), 22 deletions(-) diff --git a/azurelinuxagent/common/cgroupconfigurator.py b/azurelinuxagent/common/cgroupconfigurator.py index e2767b6707..eb32d4b6b3 100644 --- a/azurelinuxagent/common/cgroupconfigurator.py +++ b/azurelinuxagent/common/cgroupconfigurator.py @@ -17,6 +17,7 @@ import os import re import subprocess +import threading from azurelinuxagent.common import conf from azurelinuxagent.common import logger @@ -131,6 +132,7 @@ def __init__(self): self._cgroups_api = None self._agent_cpu_cgroup_path = None self._agent_memory_cgroup_path = None + self._lock = threading.RLock() def initialize(self): try: @@ -540,34 +542,38 @@ def __try_set_cpu_quota(quota): return False return True - def check_cgroups(self, cgroup_metrics=None): - if not self.enabled(): - return + def check_cgroups(self, cgroup_metrics=[]): + self._lock.acquire() + try: + if not self.enabled(): + return - errors = [] + errors = [] - process_check_success = False - try: - self._check_processes_in_agent_cgroup() - process_check_success = True - except CGroupsException as exception: - errors.append(exception) + process_check_success = False + try: + self._check_processes_in_agent_cgroup() + process_check_success = True + except CGroupsException as exception: + errors.append(exception) - quota_check_success = False - try: - if cgroup_metrics is not None: - self._check_agent_throttled_time(cgroup_metrics) - quota_check_success = True - except CGroupsException as exception: - errors.append(exception) + quota_check_success = False + try: + if cgroup_metrics: + self._check_agent_throttled_time(cgroup_metrics) + quota_check_success = True + except CGroupsException as exception: + errors.append(exception) - reason = "Check on cgroups failed:\n{0}".format("\n".join([ustr(e) for e in errors])) + reason = "Check on cgroups failed:\n{0}".format("\n".join([ustr(e) for e in errors])) - if not process_check_success and conf.get_cgroup_disable_on_process_check_failure(): - self.disable(reason, DisableCgroups.ALL) + if not process_check_success and conf.get_cgroup_disable_on_process_check_failure(): + self.disable(reason, DisableCgroups.ALL) - if not quota_check_success and conf.get_cgroup_disable_on_quota_check_failure(): - self.disable(reason, DisableCgroups.AGENT) + if not quota_check_success and conf.get_cgroup_disable_on_quota_check_failure(): + self.disable(reason, DisableCgroups.AGENT) + finally: + self._lock.release() def _check_processes_in_agent_cgroup(self): """ From 321a322b8387a534f715392706cf00888291bbbb Mon Sep 17 00:00:00 2001 From: nnandigam Date: Wed, 13 Apr 2022 18:03:51 -0700 Subject: [PATCH 3/5] fix UT --- tests/common/test_cgroupconfigurator.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/tests/common/test_cgroupconfigurator.py b/tests/common/test_cgroupconfigurator.py index fcae952bd2..ae856f25c1 100644 --- a/tests/common/test_cgroupconfigurator.py +++ b/tests/common/test_cgroupconfigurator.py @@ -859,7 +859,10 @@ def test_check_cgroups_should_disable_cgroups_when_a_check_fails(self): with patch("azurelinuxagent.common.cgroupconfigurator.add_event") as add_event: configurator.enable() - configurator.check_cgroups([]) + tracked_metrics = [ + MetricValue(MetricsCategory.CPU_CATEGORY, MetricsCounter.PROCESSOR_PERCENT_TIME, "test", + 10)] + configurator.check_cgroups(tracked_metrics) if method_to_fail == "_check_processes_in_agent_cgroup": self.assertFalse(configurator.enabled(), "An error in {0} should have disabled cgroups".format(method_to_fail)) else: From c50fd0a20b33aa7aaa366831dcafe724e2b0cbbf Mon Sep 17 00:00:00 2001 From: nnandigam Date: Wed, 13 Apr 2022 18:25:58 -0700 Subject: [PATCH 4/5] fix default value --- azurelinuxagent/common/cgroupconfigurator.py | 2 +- azurelinuxagent/ga/update.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/azurelinuxagent/common/cgroupconfigurator.py b/azurelinuxagent/common/cgroupconfigurator.py index eb32d4b6b3..7c4e5bc636 100644 --- a/azurelinuxagent/common/cgroupconfigurator.py +++ b/azurelinuxagent/common/cgroupconfigurator.py @@ -542,7 +542,7 @@ def __try_set_cpu_quota(quota): return False return True - def check_cgroups(self, cgroup_metrics=[]): + def check_cgroups(self, cgroup_metrics): self._lock.acquire() try: if not self.enabled(): diff --git a/azurelinuxagent/ga/update.py b/azurelinuxagent/ga/update.py index b02a4d09ac..1bdda2fc44 100644 --- a/azurelinuxagent/ga/update.py +++ b/azurelinuxagent/ga/update.py @@ -616,7 +616,7 @@ def _process_goal_state(self, exthandlers_handler, remote_access_handler): # check cgroup and disable if any extension started in agent cgroup after goal state processed. # Note: Monitor thread periodically checks this in addition to here. - CGroupConfigurator.get_instance().check_cgroups() + CGroupConfigurator.get_instance().check_cgroups(cgroup_metrics=[]) # always report status, even if the goal state did not change # do it before processing the remote access, since that operation can take a long time From 9c490c445783de8619f0bc66120bfd3677d34186 Mon Sep 17 00:00:00 2001 From: Nageswara Nandigam Date: Fri, 15 Apr 2022 13:35:48 -0700 Subject: [PATCH 5/5] address comment --- azurelinuxagent/common/cgroupconfigurator.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/azurelinuxagent/common/cgroupconfigurator.py b/azurelinuxagent/common/cgroupconfigurator.py index 7c4e5bc636..866e4f866f 100644 --- a/azurelinuxagent/common/cgroupconfigurator.py +++ b/azurelinuxagent/common/cgroupconfigurator.py @@ -132,7 +132,7 @@ def __init__(self): self._cgroups_api = None self._agent_cpu_cgroup_path = None self._agent_memory_cgroup_path = None - self._lock = threading.RLock() + self._check_cgroups_lock = threading.RLock() # Protect the check_cgroups which is called from Monitor thread and main loop. def initialize(self): try: @@ -543,7 +543,7 @@ def __try_set_cpu_quota(quota): return True def check_cgroups(self, cgroup_metrics): - self._lock.acquire() + self._check_cgroups_lock.acquire() try: if not self.enabled(): return @@ -573,7 +573,7 @@ def check_cgroups(self, cgroup_metrics): if not quota_check_success and conf.get_cgroup_disable_on_quota_check_failure(): self.disable(reason, DisableCgroups.AGENT) finally: - self._lock.release() + self._check_cgroups_lock.release() def _check_processes_in_agent_cgroup(self): """