Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Verify that the CPU and Memory cgroups for the agent are properly initialized; disabled cgroups if they are not active. #2015

Merged
merged 5 commits into from
Sep 25, 2020
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
81 changes: 45 additions & 36 deletions azurelinuxagent/common/cgroupconfigurator.py
Original file line number Diff line number Diff line change
Expand Up @@ -35,7 +35,9 @@ class CGroupConfigurator(object):

NOTE: with the exception of start_extension_command, none of the methods in this class raise exceptions (cgroup operations should not block extensions)
"""
class __impl(object): # pylint: disable=R0902,C0103
# too-many-instance-attributes<R0902> Disabled: class complexity is OK
# invalid-name<C0103> Disabled: class is private, so name starts with __
class __Impl(object): # pylint: disable=R0902,C0103
def __init__(self):
self._initialized = False
self._cgroups_supported = False
Expand All @@ -46,7 +48,8 @@ def __init__(self):
self._get_processes_in_agent_cgroup_last_error = None
self._get_processes_in_agent_cgroup_error_count = 0

def initialize(self): # pylint: disable=R0912
# too-many-branches<R0912> Disabled: branches are sequential, not nested
def initialize(self): # pylint: disable=R0912
# pylint: disable=too-many-locals
try:
if self._initialized:
Expand Down Expand Up @@ -76,12 +79,7 @@ def log_cgroup_info(format_string, *args):
logger.info(message)
add_event(op=WALAEventOperation.CGroupsInfo, message=message)

def log_cgroup_warn(format_string, *args):
Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

converted warnings into infos

message = format_string.format(*args)
logger.warn(message)
add_event(op=WALAEventOperation.CGroupsInfo, message=message, is_success=False, log_event=False)

log_cgroup_info("systemd version: {0}", self._cgroups_api.get_systemd_version()) # pylint: disable=E1101
log_cgroup_info("systemd version: {0}", self._cgroups_api.get_systemd_version()) # pylint: disable=E1101

#
# Older versions of the daemon (2.2.31-2.2.40) wrote their PID to /sys/fs/cgroup/{cpu,memory}/WALinuxAgent/WALinuxAgent. When running
Expand All @@ -90,69 +88,79 @@ def log_cgroup_warn(format_string, *args):
legacy_cgroups = self._cgroups_api.cleanup_legacy_cgroups()

if legacy_cgroups > 0:
log_cgroup_warn("The daemon's PID was added to a legacy cgroup; will not monitor resource usage.")
log_cgroup_info("The daemon's PID was added to a legacy cgroup; will not monitor resource usage.")
return

#
# check v1 controllers
#
cpu_controller_root, memory_controller_root = self._cgroups_api.get_cgroup_mount_points() # pylint: disable=E1101
cpu_controller_root, memory_controller_root = self._cgroups_api.get_cgroup_mount_points() # pylint: disable=E1101

if cpu_controller_root is not None:
logger.info("The CPU cgroup controller is mounted at {0}", cpu_controller_root)
else:
log_cgroup_warn("The CPU cgroup controller is not mounted")
log_cgroup_info("The CPU cgroup controller is not mounted")

if memory_controller_root is not None:
logger.info("The memory cgroup controller is mounted at {0}", memory_controller_root)
else:
log_cgroup_warn("The memory cgroup controller is not mounted")
log_cgroup_info("The memory cgroup controller is not mounted")

#
# check v2 controllers
#
cgroup2_mountpoint, cgroup2_controllers = self._cgroups_api.get_cgroup2_controllers() # pylint: disable=E1101
if cgroup2_mountpoint is not None:
log_cgroup_warn("cgroups v2 mounted at {0}. Controllers: [{1}]", cgroup2_mountpoint, cgroup2_controllers)
cgroup2_mount_point, cgroup2_controllers = self._cgroups_api.get_cgroup2_controllers() # pylint: disable=E1101
if cgroup2_mount_point is not None:
log_cgroup_info("cgroups v2 mounted at {0}. Controllers: [{1}]", cgroup2_mount_point, cgroup2_controllers)

#
# check the cgroups for the agent
#
agent_unit_name = self._cgroups_api.get_agent_unit_name() # pylint: disable=E1101
cpu_cgroup_relative_path, memory_cgroup_relative_path = self._cgroups_api.get_process_cgroup_relative_paths("self") # pylint: disable=E1101
agent_unit_name = self._cgroups_api.get_agent_unit_name() # pylint: disable=E1101
cpu_cgroup_relative_path, memory_cgroup_relative_path = self._cgroups_api.get_process_cgroup_relative_paths("self") # pylint: disable=E1101
expected_relative_path = os.path.join('system.slice', agent_unit_name)
if cpu_cgroup_relative_path is None:
log_cgroup_warn("The agent's process is not within a CPU cgroup")
log_cgroup_info("The agent's process is not within a CPU cgroup")
else:
cpu_accounting = self._cgroups_api.get_unit_property(agent_unit_name, "CPUAccounting") # pylint: disable=E1101
if cpu_cgroup_relative_path != expected_relative_path:
Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Do not enable cgroups if agent is not in system.slice/walinuxagent.service

log_cgroup_info("The Agent is not in the expected cgroup; will not enable cgroup monitoring. CPU relative path:[{0}] Expected:[{1}]", cpu_cgroup_relative_path, expected_relative_path)
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Shouldn't it be a warning here?

return
cpu_accounting = self._cgroups_api.get_unit_property(agent_unit_name, "CPUAccounting") # pylint: disable=E1101
log_cgroup_info('CPUAccounting: {0}', cpu_accounting)

if memory_cgroup_relative_path is None:
log_cgroup_warn("The agent's process is not within a memory cgroup")
log_cgroup_info("The agent's process is not within a memory cgroup")
else:
memory_accounting = self._cgroups_api.get_unit_property(agent_unit_name, "MemoryAccounting") # pylint: disable=E1101
if memory_cgroup_relative_path != expected_relative_path:
log_cgroup_info("The Agent is not in the expected cgroup; will not enable cgroup monitoring. Memory relative path:[{0}] Expected:[{1}]", memory_cgroup_relative_path, expected_relative_path)
return
memory_accounting = self._cgroups_api.get_unit_property(agent_unit_name, "MemoryAccounting") # pylint: disable=E1101
log_cgroup_info('MemoryAccounting: {0}', memory_accounting)

#
# All good, enable cgroups and start monitoring the agent
#
self._cgroups_enabled = True

if cpu_controller_root is None or cpu_cgroup_relative_path is None:
logger.info("Will not track CPU for the agent's cgroup")
else:
self._agent_cpu_cgroup_path = os.path.join(cpu_controller_root, cpu_cgroup_relative_path)
log_cgroup_info("Agent CPU cgroup: {0}", self._agent_cpu_cgroup_path)
CGroupsTelemetry.track_cgroup(CpuCgroup(agent_unit_name, self._agent_cpu_cgroup_path))

if memory_controller_root is None or memory_cgroup_relative_path is None:
logger.info("Will not track memory for the agent's cgroup")
else:
self._agent_memory_cgroup_path = os.path.join(memory_controller_root, memory_cgroup_relative_path)
log_cgroup_info("Agent Memory cgroup: {0}", self._agent_memory_cgroup_path)
CGroupsTelemetry.track_cgroup(MemoryCgroup(agent_unit_name, self._agent_memory_cgroup_path))

log_cgroup_info("Agent cgroups: CPU: {0} -- MEMORY: {1}", self._agent_cpu_cgroup_path, self._agent_memory_cgroup_path)
if self._agent_cpu_cgroup_path is not None or self._agent_memory_cgroup_path is not None:
Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Do not enable cgroups if both cpu and memory are None

self._cgroups_enabled = True

log_cgroup_info('Cgroups enabled: {0}', self._cgroups_enabled)

except Exception as e: # pylint: disable=C0103
message = "Error initializing cgroups: {0}".format(ustr(e))
except Exception as exception:
message = "Error initializing cgroups: {0}".format(ustr(exception))
logger.warn(message)
add_event(op=WALAEventOperation.CGroupsInitialize, is_success=False, message=message, log_event=False)
finally:
Expand Down Expand Up @@ -183,13 +191,13 @@ def _invoke_cgroup_operation(self, operation, error_message, on_error=None):

try:
return operation()
except Exception as e: # pylint: disable=C0103
logger.warn("{0} Error: {1}".format(error_message, ustr(e)))
except Exception as exception:
logger.warn("{0} Error: {1}".format(error_message, ustr(exception)))
if on_error is not None:
try:
on_error(e)
except Exception as ex: # pylint: disable=W0612
logger.warn("CGroupConfigurator._invoke_cgroup_operation: {0}".format(ustr(e)))
Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

BTW on this one the linter caught me logging the wrong exception

on_error(exception)
except Exception as exception:
logger.warn("CGroupConfigurator._invoke_cgroup_operation: {0}".format(ustr(exception)))

def create_extension_cgroups_root(self):
"""
Expand Down Expand Up @@ -227,7 +235,7 @@ def get_processes_in_agent_cgroup(self):
def __impl():
if self._agent_cpu_cgroup_path is None:
return []
return self._cgroups_api.get_processes_in_cgroup(self._agent_cpu_cgroup_path) # pylint: disable=E1101
return self._cgroups_api.get_processes_in_cgroup(self._agent_cpu_cgroup_path) # pylint: disable=E1101

def __on_error(exception):
#
Expand All @@ -242,8 +250,8 @@ def __on_error(exception):

return self._invoke_cgroup_operation(__impl, "Failed to list the processes in the agent's cgroup.", on_error=__on_error)

def start_extension_command(self, extension_name, command, timeout, shell, cwd, env, stdout, stderr, # pylint: disable=R0913
error_code=ExtensionErrorCodes.PluginUnknownFailure):
# too-many-arguments<R0913> Disabled: argument list mimics Popen's
def start_extension_command(self, extension_name, command, timeout, shell, cwd, env, stdout, stderr, error_code=ExtensionErrorCodes.PluginUnknownFailure): # pylint: disable=R0913
"""
Starts a command (install/enable/etc) for an extension and adds the command's PID to the extension's cgroup
:param extension_name: The extension executing the command
Expand All @@ -257,7 +265,8 @@ def start_extension_command(self, extension_name, command, timeout, shell, cwd,
:param error_code: Extension error code to raise in case of error
"""
if not self.enabled():
process = subprocess.Popen(command, # pylint: disable=W1509
# subprocess-popen-preexec-fn<W1509> Disabled: code is not multi-threaded
process = subprocess.Popen(command, # pylint: disable=W1509
shell=shell,
cwd=cwd,
env=env,
Expand Down Expand Up @@ -290,7 +299,7 @@ def start_extension_command(self, extension_name, command, timeout, shell, cwd,
@staticmethod
def get_instance():
if CGroupConfigurator._instance is None:
CGroupConfigurator._instance = CGroupConfigurator.__impl()
CGroupConfigurator._instance = CGroupConfigurator.__Impl()
return CGroupConfigurator._instance

@staticmethod
Expand Down
10 changes: 6 additions & 4 deletions azurelinuxagent/ga/monitor.py
Original file line number Diff line number Diff line change
Expand Up @@ -70,11 +70,11 @@ def _operation_impl(self):
if not CGroupConfigurator.is_agent_process(command_line):
unexpected_processes.append(command_line)

if len(unexpected_processes) > 0: # pylint: disable=len-as-condition
if unexpected_processes:
unexpected_processes.sort()
processes_check_error = "The agent's cgroup includes unexpected processes: {0}".format(ustr(unexpected_processes))
except Exception as e: # pylint: disable=C0103
processes_check_error = "Failed to check the processes in the agent's cgroup: {0}".format(ustr(e))
except Exception as exception:
processes_check_error = "Failed to check the processes in the agent's cgroup: {0}".format(ustr(exception))

# Report a small sample of errors
if processes_check_error != self._last_error and self._error_count < 5:
Expand Down Expand Up @@ -181,11 +181,13 @@ def __init__(self):
ResetPeriodicLogMessagesOperation(),
PeriodicOperation("collect_and_send_events", self.collect_and_send_events, self.EVENT_COLLECTION_PERIOD),
ReportNetworkErrorsOperation(),
PollResourceUsageOperation(),
PeriodicOperation("send_host_plugin_heartbeat", self.send_host_plugin_heartbeat, self.HOST_PLUGIN_HEARTBEAT_PERIOD),
PeriodicOperation("send_imds_heartbeat", self.send_imds_heartbeat, self.IMDS_HEARTBEAT_PERIOD),
ReportNetworkConfigurationChangesOperation(),
]
if CGroupConfigurator.get_instance().enabled():
Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

don't check for cgroups stuff if they are disabled

self._periodic_operations.append(PollResourceUsageOperation())

self.protocol = None
self.protocol_util = None
self.health_service = None
Expand Down
36 changes: 20 additions & 16 deletions azurelinuxagent/ga/update.py
Original file line number Diff line number Diff line change
Expand Up @@ -243,7 +243,7 @@ def run_latest(self, child_args=None): # pylint: disable=R0912,R1711
self.child_process = None
return

def run(self, debug=False): # pylint: disable=R0912,R0914
def run(self, debug=False): # pylint: disable=R0912,R0914
"""
This is the main loop which watches for agent and extension updates.
"""
Expand All @@ -258,6 +258,7 @@ def run(self, debug=False): # pylint: disable=R0912,R0914
protocol = self.protocol_util.get_protocol()
protocol.update_goal_state()

# Initialize the common parameters for telemetry events
initialize_event_logger_vminfo_common_parameters(protocol)

# Log OS-specific info.
Expand All @@ -267,6 +268,24 @@ def run(self, debug=False): # pylint: disable=R0912,R0914
logger.info(os_info_msg)
add_event(AGENT_NAME, op=WALAEventOperation.OSInfo, message=os_info_msg)

#
Copy link
Member Author

@narrieta narrieta Sep 22, 2020

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

initialization of these threads was racing against initialization of the main thread

# Perform initialization tasks
#
from azurelinuxagent.ga.exthandlers import get_exthandlers_handler, migrate_handler_state
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Should this import be up top of the file?

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

ya; i tried to fix that and it ends up breaking up the tests... we have several of those problematic imports across the code base... yet another cleanup task to do

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Hm. What were the errors the tests were having? Should we document this?

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

you can try moving it in the current code and check the errors... i don't think we need to document it, all of us are aware of this (now you are :))

exthandlers_handler = get_exthandlers_handler(protocol)
migrate_handler_state()

from azurelinuxagent.ga.remoteaccess import get_remote_access_handler
remote_access_handler = get_remote_access_handler(protocol)

self._ensure_no_orphans()
self._emit_restart_event()
self._emit_changes_in_default_configuration()
self._ensure_partition_assigned()
self._ensure_readonly_files()
self._ensure_cgroups_initialized()
self._ensure_extension_telemetry_state_configured_properly(protocol)

# Get all thread handlers
all_thread_handlers = [
get_monitor_handler(),
Expand All @@ -284,21 +303,6 @@ def run(self, debug=False): # pylint: disable=R0912,R0914
for thread_handler in all_thread_handlers:
thread_handler.run()

from azurelinuxagent.ga.exthandlers import get_exthandlers_handler, migrate_handler_state
exthandlers_handler = get_exthandlers_handler(protocol)
migrate_handler_state()

from azurelinuxagent.ga.remoteaccess import get_remote_access_handler
remote_access_handler = get_remote_access_handler(protocol)

self._ensure_no_orphans()
self._emit_restart_event()
self._emit_changes_in_default_configuration()
self._ensure_partition_assigned()
self._ensure_readonly_files()
self._ensure_cgroups_initialized()
self._ensure_extension_telemetry_state_configured_properly(protocol)

goal_state_interval = conf.get_goal_state_period() if conf.get_extensions_enabled() else GOAL_STATE_INTERVAL_DISABLED

while self.running:
Expand Down
22 changes: 13 additions & 9 deletions tests/common/mock_cgroup_commands.py
Original file line number Diff line number Diff line change
Expand Up @@ -28,7 +28,7 @@
#
# The output comes from an Ubuntu 18 system
#
_default_commands = [ # pylint: disable=invalid-name
__DEFAULT_COMMANDS = [
(r"^systemctl --version$",
'''systemd 237
+PAM +AUDIT +SELINUX +IMA +APPARMOR +SMACK +SYSVINIT +UTMP +LIBCRYPTSETUP +GCRYPT +GNUTLS +ACL +XZ +LZ4 +SECCOMP +BLKID +ELFUTILS +KMOD -IDN2 +IDN -PCRE2 default-hierarchy=hybrid
Expand Down Expand Up @@ -74,7 +74,7 @@
'''),
]

_default_files = [ # pylint: disable=invalid-name
__DEFAULT_FILES = [
(r"^/proc/self/cgroup$", os.path.join(data_dir, 'cgroups', 'proc_self_cgroup')),
(r"^/proc/[0-9]+/cgroup$", os.path.join(data_dir, 'cgroups', 'proc_pid_cgroup')),
(r"^/sys/fs/cgroup/unified/cgroup.controllers$", os.path.join(data_dir, 'cgroups', 'sys_fs_cgroup_unified_cgroup.controllers')),
Expand All @@ -86,6 +86,9 @@ def mock_cgroup_commands():
original_read_file = fileutil.read_file
original_path_exists = os.path.exists

def add_file(pattern, file_path):
patcher.files.insert(0, (pattern, file_path))

def add_command(pattern, output):
patcher.commands.insert(0, (pattern, output))

Expand All @@ -104,15 +107,15 @@ def mock_popen(command, *args, **kwargs):
return original_popen(command, *args, **kwargs)

def mock_read_file(filepath, **kwargs):
for file in patcher.files: # pylint: disable=redefined-builtin
match = re.match(file[0], filepath)
for item in patcher.files:
match = re.match(item[0], filepath)
if match is not None:
filepath = file[1]
filepath = item[1]
return original_read_file(filepath, **kwargs)

def mock_path_exists(path):
for file in patcher.files: # pylint: disable=redefined-builtin
match = re.match(file[0], path)
for item in patcher.files:
match = re.match(item[0], path)
if match is not None:
return True
return original_path_exists(path)
Expand All @@ -122,8 +125,9 @@ def mock_path_exists(path):
with patch("azurelinuxagent.common.cgroupapi.fileutil.read_file", side_effect=mock_read_file):
with patch('azurelinuxagent.common.cgroupapi.CGroupsApi.cgroups_supported', return_value=True):
with patch('azurelinuxagent.common.cgroupapi.CGroupsApi.is_systemd', return_value=True):
patcher.commands = _default_commands[:]
patcher.files = _default_files[:]
patcher.commands = __DEFAULT_COMMANDS[:]
patcher.files = __DEFAULT_FILES[:]
patcher.add_file = add_file
patcher.add_command = add_command
yield patcher

Loading