Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Check the agent cgroup after the goal state is processed, and make sure extensions start in the extension slice #2546

Merged
merged 8 commits into from
Apr 19, 2022
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
47 changes: 27 additions & 20 deletions azurelinuxagent/common/cgroupconfigurator.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,7 @@
import os
import re
import subprocess
import threading

from azurelinuxagent.common import conf
from azurelinuxagent.common import logger
Expand Down Expand Up @@ -131,6 +132,7 @@ def __init__(self):
self._cgroups_api = None
self._agent_cpu_cgroup_path = None
self._agent_memory_cgroup_path = None
self._check_cgroups_lock = threading.RLock() # Protect the check_cgroups which is called from Monitor thread and main loop.

def initialize(self):
try:
Expand Down Expand Up @@ -541,32 +543,37 @@ def __try_set_cpu_quota(quota):
return True

def check_cgroups(self, cgroup_metrics):
if not self.enabled():
return
self._check_cgroups_lock.acquire()
try:
if not self.enabled():
return

errors = []
errors = []

process_check_success = False
try:
self._check_processes_in_agent_cgroup()
process_check_success = True
except CGroupsException as exception:
errors.append(exception)
process_check_success = False
try:
self._check_processes_in_agent_cgroup()
process_check_success = True
except CGroupsException as exception:
errors.append(exception)

quota_check_success = False
try:
self._check_agent_throttled_time(cgroup_metrics)
quota_check_success = True
except CGroupsException as exception:
errors.append(exception)
quota_check_success = False
try:
if cgroup_metrics:
self._check_agent_throttled_time(cgroup_metrics)
quota_check_success = True
except CGroupsException as exception:
errors.append(exception)

reason = "Check on cgroups failed:\n{0}".format("\n".join([ustr(e) for e in errors]))
reason = "Check on cgroups failed:\n{0}".format("\n".join([ustr(e) for e in errors]))

if not process_check_success and conf.get_cgroup_disable_on_process_check_failure():
self.disable(reason, DisableCgroups.ALL)
if not process_check_success and conf.get_cgroup_disable_on_process_check_failure():
self.disable(reason, DisableCgroups.ALL)

if not quota_check_success and conf.get_cgroup_disable_on_quota_check_failure():
self.disable(reason, DisableCgroups.AGENT)
if not quota_check_success and conf.get_cgroup_disable_on_quota_check_failure():
self.disable(reason, DisableCgroups.AGENT)
finally:
self._check_cgroups_lock.release()

def _check_processes_in_agent_cgroup(self):
"""
Expand Down
5 changes: 2 additions & 3 deletions azurelinuxagent/ga/exthandlers.py
Original file line number Diff line number Diff line change
Expand Up @@ -1411,9 +1411,8 @@ def _enable_extension(self, extension, uninstall_exit_code):
env = {
ExtCommandEnvVariable.UninstallReturnCode: uninstall_exit_code
}
# This check to call the setup if AzureMonitorLinuxAgent extension already installed and not called setup before
if self.is_azuremonitorlinuxagent(self.get_full_name()) and \
not CGroupConfigurator.get_instance().is_extension_resource_limits_setup_completed(self.get_full_name()):
# Call the resource-limits setup if the extension is already installed and the setup has not been completed before
if not CGroupConfigurator.get_instance().is_extension_resource_limits_setup_completed(self.get_full_name()):
Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

We used to do this only for AMA, but this setup is needed for every extension so that it starts in the VMExtension slice.

self.set_extension_resource_limits()

self.set_operation(WALAEventOperation.Enable)
Expand Down
4 changes: 4 additions & 0 deletions azurelinuxagent/ga/update.py
Original file line number Diff line number Diff line change
Expand Up @@ -614,6 +614,10 @@ def _process_goal_state(self, exthandlers_handler, remote_access_handler):
self._extensions_summary = ExtensionsSummary()
exthandlers_handler.run()

# Check the cgroups, and disable them if any extension was started in the agent cgroup after the goal state was processed.
# Note: the Monitor thread also performs this check periodically; this call is in addition to that.
CGroupConfigurator.get_instance().check_cgroups(cgroup_metrics=[])

# always report status, even if the goal state did not change
# do it before processing the remote access, since that operation can take a long time
self._report_status(exthandlers_handler)
Expand Down
5 changes: 4 additions & 1 deletion tests/common/test_cgroupconfigurator.py
Original file line number Diff line number Diff line change
Expand Up @@ -859,7 +859,10 @@ def test_check_cgroups_should_disable_cgroups_when_a_check_fails(self):
with patch("azurelinuxagent.common.cgroupconfigurator.add_event") as add_event:
configurator.enable()

configurator.check_cgroups([])
tracked_metrics = [
MetricValue(MetricsCategory.CPU_CATEGORY, MetricsCounter.PROCESSOR_PERCENT_TIME, "test",
10)]
configurator.check_cgroups(tracked_metrics)
if method_to_fail == "_check_processes_in_agent_cgroup":
self.assertFalse(configurator.enabled(), "An error in {0} should have disabled cgroups".format(method_to_fail))
else:
Expand Down