-
Notifications
You must be signed in to change notification settings - Fork 372
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Verify that the CPU and Memory cgroups for the agent are properly initialized; disabled cgroups if they are not active. #2015
Changes from all commits
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -35,7 +35,9 @@ class CGroupConfigurator(object): | |
|
||
NOTE: with the exception of start_extension_command, none of the methods in this class raise exceptions (cgroup operations should not block extensions) | ||
""" | ||
class __impl(object): # pylint: disable=R0902,C0103 | ||
# too-many-instance-attributes<R0902> Disabled: class complexity is OK | ||
# invalid-name<C0103> Disabled: class is private, so name starts with __ | ||
class __Impl(object): # pylint: disable=R0902,C0103 | ||
def __init__(self): | ||
self._initialized = False | ||
self._cgroups_supported = False | ||
|
@@ -46,7 +48,8 @@ def __init__(self): | |
self._get_processes_in_agent_cgroup_last_error = None | ||
self._get_processes_in_agent_cgroup_error_count = 0 | ||
|
||
def initialize(self): # pylint: disable=R0912 | ||
# too-many-branches<R0912> Disabled: branches are sequential, not nested | ||
def initialize(self): # pylint: disable=R0912 | ||
# pylint: disable=too-many-locals | ||
try: | ||
if self._initialized: | ||
|
@@ -76,12 +79,7 @@ def log_cgroup_info(format_string, *args): | |
logger.info(message) | ||
add_event(op=WALAEventOperation.CGroupsInfo, message=message) | ||
|
||
def log_cgroup_warn(format_string, *args): | ||
message = format_string.format(*args) | ||
logger.warn(message) | ||
add_event(op=WALAEventOperation.CGroupsInfo, message=message, is_success=False, log_event=False) | ||
|
||
log_cgroup_info("systemd version: {0}", self._cgroups_api.get_systemd_version()) # pylint: disable=E1101 | ||
log_cgroup_info("systemd version: {0}", self._cgroups_api.get_systemd_version()) # pylint: disable=E1101 | ||
|
||
# | ||
# Older versions of the daemon (2.2.31-2.2.40) wrote their PID to /sys/fs/cgroup/{cpu,memory}/WALinuxAgent/WALinuxAgent. When running | ||
|
@@ -90,69 +88,79 @@ def log_cgroup_warn(format_string, *args): | |
legacy_cgroups = self._cgroups_api.cleanup_legacy_cgroups() | ||
|
||
if legacy_cgroups > 0: | ||
log_cgroup_warn("The daemon's PID was added to a legacy cgroup; will not monitor resource usage.") | ||
log_cgroup_info("The daemon's PID was added to a legacy cgroup; will not monitor resource usage.") | ||
return | ||
|
||
# | ||
# check v1 controllers | ||
# | ||
cpu_controller_root, memory_controller_root = self._cgroups_api.get_cgroup_mount_points() # pylint: disable=E1101 | ||
cpu_controller_root, memory_controller_root = self._cgroups_api.get_cgroup_mount_points() # pylint: disable=E1101 | ||
|
||
if cpu_controller_root is not None: | ||
logger.info("The CPU cgroup controller is mounted at {0}", cpu_controller_root) | ||
else: | ||
log_cgroup_warn("The CPU cgroup controller is not mounted") | ||
log_cgroup_info("The CPU cgroup controller is not mounted") | ||
|
||
if memory_controller_root is not None: | ||
logger.info("The memory cgroup controller is mounted at {0}", memory_controller_root) | ||
else: | ||
log_cgroup_warn("The memory cgroup controller is not mounted") | ||
log_cgroup_info("The memory cgroup controller is not mounted") | ||
|
||
# | ||
# check v2 controllers | ||
# | ||
cgroup2_mountpoint, cgroup2_controllers = self._cgroups_api.get_cgroup2_controllers() # pylint: disable=E1101 | ||
if cgroup2_mountpoint is not None: | ||
log_cgroup_warn("cgroups v2 mounted at {0}. Controllers: [{1}]", cgroup2_mountpoint, cgroup2_controllers) | ||
cgroup2_mount_point, cgroup2_controllers = self._cgroups_api.get_cgroup2_controllers() # pylint: disable=E1101 | ||
if cgroup2_mount_point is not None: | ||
log_cgroup_info("cgroups v2 mounted at {0}. Controllers: [{1}]", cgroup2_mount_point, cgroup2_controllers) | ||
|
||
# | ||
# check the cgroups for the agent | ||
# | ||
agent_unit_name = self._cgroups_api.get_agent_unit_name() # pylint: disable=E1101 | ||
cpu_cgroup_relative_path, memory_cgroup_relative_path = self._cgroups_api.get_process_cgroup_relative_paths("self") # pylint: disable=E1101 | ||
agent_unit_name = self._cgroups_api.get_agent_unit_name() # pylint: disable=E1101 | ||
cpu_cgroup_relative_path, memory_cgroup_relative_path = self._cgroups_api.get_process_cgroup_relative_paths("self") # pylint: disable=E1101 | ||
expected_relative_path = os.path.join('system.slice', agent_unit_name) | ||
if cpu_cgroup_relative_path is None: | ||
log_cgroup_warn("The agent's process is not within a CPU cgroup") | ||
log_cgroup_info("The agent's process is not within a CPU cgroup") | ||
else: | ||
cpu_accounting = self._cgroups_api.get_unit_property(agent_unit_name, "CPUAccounting") # pylint: disable=E1101 | ||
if cpu_cgroup_relative_path != expected_relative_path: | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Do not enable cgroups if agent is not in system.slice/walinuxagent.service |
||
log_cgroup_info("The Agent is not in the expected cgroup; will not enable cgroup monitoring. CPU relative path:[{0}] Expected:[{1}]", cpu_cgroup_relative_path, expected_relative_path) | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Shouldn't it be a warning here? |
||
return | ||
cpu_accounting = self._cgroups_api.get_unit_property(agent_unit_name, "CPUAccounting") # pylint: disable=E1101 | ||
log_cgroup_info('CPUAccounting: {0}', cpu_accounting) | ||
|
||
if memory_cgroup_relative_path is None: | ||
log_cgroup_warn("The agent's process is not within a memory cgroup") | ||
log_cgroup_info("The agent's process is not within a memory cgroup") | ||
else: | ||
memory_accounting = self._cgroups_api.get_unit_property(agent_unit_name, "MemoryAccounting") # pylint: disable=E1101 | ||
if memory_cgroup_relative_path != expected_relative_path: | ||
log_cgroup_info("The Agent is not in the expected cgroup; will not enable cgroup monitoring. Memory relative path:[{0}] Expected:[{1}]", memory_cgroup_relative_path, expected_relative_path) | ||
return | ||
memory_accounting = self._cgroups_api.get_unit_property(agent_unit_name, "MemoryAccounting") # pylint: disable=E1101 | ||
log_cgroup_info('MemoryAccounting: {0}', memory_accounting) | ||
|
||
# | ||
# All good, enable cgroups and start monitoring the agent | ||
# | ||
self._cgroups_enabled = True | ||
|
||
if cpu_controller_root is None or cpu_cgroup_relative_path is None: | ||
logger.info("Will not track CPU for the agent's cgroup") | ||
else: | ||
self._agent_cpu_cgroup_path = os.path.join(cpu_controller_root, cpu_cgroup_relative_path) | ||
log_cgroup_info("Agent CPU cgroup: {0}", self._agent_cpu_cgroup_path) | ||
CGroupsTelemetry.track_cgroup(CpuCgroup(agent_unit_name, self._agent_cpu_cgroup_path)) | ||
|
||
if memory_controller_root is None or memory_cgroup_relative_path is None: | ||
logger.info("Will not track memory for the agent's cgroup") | ||
else: | ||
self._agent_memory_cgroup_path = os.path.join(memory_controller_root, memory_cgroup_relative_path) | ||
log_cgroup_info("Agent Memory cgroup: {0}", self._agent_memory_cgroup_path) | ||
CGroupsTelemetry.track_cgroup(MemoryCgroup(agent_unit_name, self._agent_memory_cgroup_path)) | ||
|
||
log_cgroup_info("Agent cgroups: CPU: {0} -- MEMORY: {1}", self._agent_cpu_cgroup_path, self._agent_memory_cgroup_path) | ||
if self._agent_cpu_cgroup_path is not None or self._agent_memory_cgroup_path is not None: | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Do not enable cgroups if both cpu and memory are None |
||
self._cgroups_enabled = True | ||
|
||
log_cgroup_info('Cgroups enabled: {0}', self._cgroups_enabled) | ||
|
||
except Exception as e: # pylint: disable=C0103 | ||
message = "Error initializing cgroups: {0}".format(ustr(e)) | ||
except Exception as exception: | ||
message = "Error initializing cgroups: {0}".format(ustr(exception)) | ||
logger.warn(message) | ||
add_event(op=WALAEventOperation.CGroupsInitialize, is_success=False, message=message, log_event=False) | ||
finally: | ||
|
@@ -183,13 +191,13 @@ def _invoke_cgroup_operation(self, operation, error_message, on_error=None): | |
|
||
try: | ||
return operation() | ||
except Exception as e: # pylint: disable=C0103 | ||
logger.warn("{0} Error: {1}".format(error_message, ustr(e))) | ||
except Exception as exception: | ||
logger.warn("{0} Error: {1}".format(error_message, ustr(exception))) | ||
if on_error is not None: | ||
try: | ||
on_error(e) | ||
except Exception as ex: # pylint: disable=W0612 | ||
logger.warn("CGroupConfigurator._invoke_cgroup_operation: {0}".format(ustr(e))) | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. BTW on this one the linter caught me logging the wrong exception |
||
on_error(exception) | ||
except Exception as exception: | ||
logger.warn("CGroupConfigurator._invoke_cgroup_operation: {0}".format(ustr(exception))) | ||
|
||
def create_extension_cgroups_root(self): | ||
""" | ||
|
@@ -227,7 +235,7 @@ def get_processes_in_agent_cgroup(self): | |
def __impl(): | ||
if self._agent_cpu_cgroup_path is None: | ||
return [] | ||
return self._cgroups_api.get_processes_in_cgroup(self._agent_cpu_cgroup_path) # pylint: disable=E1101 | ||
return self._cgroups_api.get_processes_in_cgroup(self._agent_cpu_cgroup_path) # pylint: disable=E1101 | ||
|
||
def __on_error(exception): | ||
# | ||
|
@@ -242,8 +250,8 @@ def __on_error(exception): | |
|
||
return self._invoke_cgroup_operation(__impl, "Failed to list the processes in the agent's cgroup.", on_error=__on_error) | ||
|
||
def start_extension_command(self, extension_name, command, timeout, shell, cwd, env, stdout, stderr, # pylint: disable=R0913 | ||
error_code=ExtensionErrorCodes.PluginUnknownFailure): | ||
# too-many-arguments<R0913> Disabled: argument list mimics Popen's | ||
def start_extension_command(self, extension_name, command, timeout, shell, cwd, env, stdout, stderr, error_code=ExtensionErrorCodes.PluginUnknownFailure): # pylint: disable=R0913 | ||
""" | ||
Starts a command (install/enable/etc) for an extension and adds the command's PID to the extension's cgroup | ||
:param extension_name: The extension executing the command | ||
|
@@ -257,7 +265,8 @@ def start_extension_command(self, extension_name, command, timeout, shell, cwd, | |
:param error_code: Extension error code to raise in case of error | ||
""" | ||
if not self.enabled(): | ||
process = subprocess.Popen(command, # pylint: disable=W1509 | ||
# subprocess-popen-preexec-fn<W1509> Disabled: code is not multi-threaded | ||
process = subprocess.Popen(command, # pylint: disable=W1509 | ||
shell=shell, | ||
cwd=cwd, | ||
env=env, | ||
|
@@ -290,7 +299,7 @@ def start_extension_command(self, extension_name, command, timeout, shell, cwd, | |
@staticmethod | ||
def get_instance(): | ||
if CGroupConfigurator._instance is None: | ||
CGroupConfigurator._instance = CGroupConfigurator.__impl() | ||
CGroupConfigurator._instance = CGroupConfigurator.__Impl() | ||
return CGroupConfigurator._instance | ||
|
||
@staticmethod | ||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -70,11 +70,11 @@ def _operation_impl(self): | |
if not CGroupConfigurator.is_agent_process(command_line): | ||
unexpected_processes.append(command_line) | ||
|
||
if len(unexpected_processes) > 0: # pylint: disable=len-as-condition | ||
if unexpected_processes: | ||
unexpected_processes.sort() | ||
processes_check_error = "The agent's cgroup includes unexpected processes: {0}".format(ustr(unexpected_processes)) | ||
except Exception as e: # pylint: disable=C0103 | ||
processes_check_error = "Failed to check the processes in the agent's cgroup: {0}".format(ustr(e)) | ||
except Exception as exception: | ||
processes_check_error = "Failed to check the processes in the agent's cgroup: {0}".format(ustr(exception)) | ||
|
||
# Report a small sample of errors | ||
if processes_check_error != self._last_error and self._error_count < 5: | ||
|
@@ -181,11 +181,13 @@ def __init__(self): | |
ResetPeriodicLogMessagesOperation(), | ||
PeriodicOperation("collect_and_send_events", self.collect_and_send_events, self.EVENT_COLLECTION_PERIOD), | ||
ReportNetworkErrorsOperation(), | ||
PollResourceUsageOperation(), | ||
PeriodicOperation("send_host_plugin_heartbeat", self.send_host_plugin_heartbeat, self.HOST_PLUGIN_HEARTBEAT_PERIOD), | ||
PeriodicOperation("send_imds_heartbeat", self.send_imds_heartbeat, self.IMDS_HEARTBEAT_PERIOD), | ||
ReportNetworkConfigurationChangesOperation(), | ||
] | ||
if CGroupConfigurator.get_instance().enabled(): | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. don't check for cgroups stuff if they are disabled |
||
self._periodic_operations.append(PollResourceUsageOperation()) | ||
|
||
self.protocol = None | ||
self.protocol_util = None | ||
self.health_service = None | ||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -243,7 +243,7 @@ def run_latest(self, child_args=None): # pylint: disable=R0912,R1711 | |
self.child_process = None | ||
return | ||
|
||
def run(self, debug=False): # pylint: disable=R0912,R0914 | ||
def run(self, debug=False): # pylint: disable=R0912,R0914 | ||
""" | ||
This is the main loop which watches for agent and extension updates. | ||
""" | ||
|
@@ -258,6 +258,7 @@ def run(self, debug=False): # pylint: disable=R0912,R0914 | |
protocol = self.protocol_util.get_protocol() | ||
protocol.update_goal_state() | ||
|
||
# Initialize the common parameters for telemetry events | ||
initialize_event_logger_vminfo_common_parameters(protocol) | ||
|
||
# Log OS-specific info. | ||
|
@@ -267,6 +268,24 @@ def run(self, debug=False): # pylint: disable=R0912,R0914 | |
logger.info(os_info_msg) | ||
add_event(AGENT_NAME, op=WALAEventOperation.OSInfo, message=os_info_msg) | ||
|
||
# | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. initialization of these threads was racing against initialization of the main thread |
||
# Perform initialization tasks | ||
# | ||
from azurelinuxagent.ga.exthandlers import get_exthandlers_handler, migrate_handler_state | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Should this import be up top of the file? There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. ya; i tried to fix that and it ends up breaking up the tests... we have several of those problematic imports across the code base... yet another cleanup task to do There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Hm. What were the errors the tests were having? Should we document this? There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. you can try moving it in the current code and check the errors... i don't think we need to document it, all of us are aware of this (now you are :)) |
||
exthandlers_handler = get_exthandlers_handler(protocol) | ||
migrate_handler_state() | ||
|
||
from azurelinuxagent.ga.remoteaccess import get_remote_access_handler | ||
remote_access_handler = get_remote_access_handler(protocol) | ||
|
||
self._ensure_no_orphans() | ||
self._emit_restart_event() | ||
self._emit_changes_in_default_configuration() | ||
self._ensure_partition_assigned() | ||
self._ensure_readonly_files() | ||
self._ensure_cgroups_initialized() | ||
self._ensure_extension_telemetry_state_configured_properly(protocol) | ||
|
||
# Get all thread handlers | ||
all_thread_handlers = [ | ||
get_monitor_handler(), | ||
|
@@ -284,21 +303,6 @@ def run(self, debug=False): # pylint: disable=R0912,R0914 | |
for thread_handler in all_thread_handlers: | ||
thread_handler.run() | ||
|
||
from azurelinuxagent.ga.exthandlers import get_exthandlers_handler, migrate_handler_state | ||
exthandlers_handler = get_exthandlers_handler(protocol) | ||
migrate_handler_state() | ||
|
||
from azurelinuxagent.ga.remoteaccess import get_remote_access_handler | ||
remote_access_handler = get_remote_access_handler(protocol) | ||
|
||
self._ensure_no_orphans() | ||
self._emit_restart_event() | ||
self._emit_changes_in_default_configuration() | ||
self._ensure_partition_assigned() | ||
self._ensure_readonly_files() | ||
self._ensure_cgroups_initialized() | ||
self._ensure_extension_telemetry_state_configured_properly(protocol) | ||
|
||
goal_state_interval = conf.get_goal_state_period() if conf.get_extensions_enabled() else GOAL_STATE_INTERVAL_DISABLED | ||
|
||
while self.running: | ||
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
converted warnings into infos