Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Disable cgroups when daemon is setup incorrectly #1688

Merged
merged 3 commits into from
Nov 4, 2019
Merged
Show file tree
Hide file tree
Changes from 2 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
116 changes: 70 additions & 46 deletions azurelinuxagent/common/cgroupapi.py
Original file line number Diff line number Diff line change
Expand Up @@ -61,7 +61,7 @@ def get_extension_cgroups(self, extension_name):
def start_extension_command(self, extension_name, command, timeout, shell, cwd, env, stdout, stderr, error_code):
raise NotImplementedError()

def cleanup_old_cgroups(self):
def cleanup_legacy_cgroups(self):
raise NotImplementedError()

@staticmethod
Expand Down Expand Up @@ -100,20 +100,52 @@ def _foreach_controller(operation, message):
is not mounted or if an error occurs in the operation
:return: Returns a list of error messages or an empty list if no errors occurred
"""
errors = []
mounted_controllers = os.listdir(CGROUPS_FILE_SYSTEM_ROOT)

for controller in CGROUP_CONTROLLERS:
try:
if controller not in mounted_controllers:
logger.warn('Cgroup controller "{0}" is not mounted. {1}'.format(controller, message))
logger.warn('Cgroup controller "{0}" is not mounted. {1}', controller, message)
else:
operation(controller)
except Exception as e:
msg = 'Error in cgroup controller "{0}": {1}. {2}'.format(controller, ustr(e), message)
logger.warn(msg)
errors.append(msg)
return errors
Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

now there is no need for this function to report errors to the caller

logger.warn('Error in cgroup controller "{0}": {1}. {2}', controller, ustr(e), message)

@staticmethod
def _foreach_legacy_cgroup(operation):
"""
Previous versions of the daemon (2.2.31-2.2.40) wrote their PID to /sys/fs/cgroup/{cpu,memory}/WALinuxAgent/WALinuxAgent;
starting from version 2.2.41 we track the agent service in walinuxagent.service instead of WALinuxAgent/WALinuxAgent. Also,
when running under systemd, the PIDs should not be explicitly moved to the cgroup filesystem. The older daemons would
incorrectly do that under certain conditions.

This method checks for the existence of the legacy cgroups and, if the daemon's PID has been added to them, executes the
given operation on the cgroups. After this check, the method attempts to remove the legacy cgroups.

:param operation:
The function to execute on each legacy cgroup. It must take 2 arguments: the controller and the daemon's PID
"""
legacy_cgroups = []
for controller in ['cpu', 'memory']:
cgroup = os.path.join(CGROUPS_FILE_SYSTEM_ROOT, controller, "WALinuxAgent", "WALinuxAgent")
if os.path.exists(cgroup):
logger.info('Found legacy cgroup {0}', cgroup)
legacy_cgroups.append((controller, cgroup))

try:
for controller, cgroup in legacy_cgroups:
procs_file = os.path.join(cgroup, "cgroup.procs")

if os.path.exists(procs_file):
procs_file_contents = fileutil.read_file(procs_file).strip()
daemon_pid = fileutil.read_file(get_agent_pid_file_path()).strip()

if daemon_pid in procs_file_contents:
operation(controller, daemon_pid)
finally:
for _, cgroup in legacy_cgroups:
logger.info('Removing {0}', cgroup)
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

nit: Suggestion to make the message Removing legacy cgroup {0} instead to make it even clearer what's getting removed when just looking at the logs.

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

the whole path is in the message. a few lines up in the log there would be a message identifying the path as a legacy group (logged at line 132)

shutil.rmtree(cgroup, ignore_errors=True)


class FileSystemCgroupsApi(CGroupsApi):
Expand Down Expand Up @@ -143,6 +175,10 @@ def _try_mkdir(path):
else:
raise

@staticmethod
def _get_agent_cgroup_path(controller):
return os.path.join(CGROUPS_FILE_SYSTEM_ROOT, controller, VM_AGENT_CGROUP_NAME)

@staticmethod
def _get_extension_cgroups_root_path(controller):
return os.path.join(CGROUPS_FILE_SYSTEM_ROOT, controller, EXTENSIONS_ROOT_CGROUP_NAME)
Expand All @@ -158,49 +194,28 @@ def _get_extension_cgroup_path(self, controller, extension_name):
return os.path.join(extensions_root, cgroup_name)

def _create_extension_cgroup(self, controller, extension_name):
return CGroup.create(self._get_extension_cgroup_path(controller, extension_name),
controller, extension_name)
return CGroup.create(self._get_extension_cgroup_path(controller, extension_name), controller, extension_name)

@staticmethod
def _add_process_to_cgroup(pid, cgroup_path):
tasks_file = os.path.join(cgroup_path, 'cgroup.procs')
fileutil.append_file(tasks_file, "{0}\n".format(pid))
logger.info("Added PID {0} to cgroup {1}".format(pid, cgroup_path))

def cleanup_old_cgroups(self):
# Old daemon versions (2.2.31-2.2.40) wrote their PID to the WALinuxAgent/WALinuxAgent cgroup.
# Starting from version 2.2.41, we track the agent service in walinuxagent.service. This method
# cleans up the old behavior by moving the daemon's PID to the new cgroup and deleting the old cgroup.
daemon_pid_file = get_agent_pid_file_path()
daemon_pid = fileutil.read_file(daemon_pid_file)

def cleanup_old_controller(controller):
old_path = os.path.join(CGROUPS_FILE_SYSTEM_ROOT, controller, "WALinuxAgent", "WALinuxAgent")
new_path = os.path.join(CGROUPS_FILE_SYSTEM_ROOT, controller, VM_AGENT_CGROUP_NAME)

if not os.path.isdir(old_path):
return

contents = fileutil.read_file(os.path.join(old_path, "cgroup.procs"))

if daemon_pid in contents:
fileutil.append_file(os.path.join(new_path, "cgroup.procs"), daemon_pid)
shutil.rmtree(old_path, ignore_errors=True)

errors = self._foreach_controller(cleanup_old_controller,
"Failed to update the tracking of the daemon; resource usage of the agent "
"will not include the daemon process.")

if len(errors) == 0:
msg = 'Successfully cleaned up old cgroups in WALinuxAgent/WALinuxAgent.'
else:
msg = 'Failed to clean up old cgroups in WALinuxAgent/WALinuxAgent. Errors: {0}'.format(errors)
def cleanup_legacy_cgroups(self):
"""
Previous versions of the daemon (2.2.31-2.2.40) wrote their PID to /sys/fs/cgroup/{cpu,memory}/WALinuxAgent/WALinuxAgent;
starting from version 2.2.41 we track the agent service in walinuxagent.service instead of WALinuxAgent/WALinuxAgent. This
method moves the daemon's PID from the legacy cgroups to the newer cgroups.
"""
def move_daemon_pid(controller, daemon_pid):
new_path = FileSystemCgroupsApi._get_agent_cgroup_path(controller)
logger.info("Writing daemon's PID ({0}) to {1}", daemon_pid, new_path)
fileutil.append_file(os.path.join(new_path, "cgroup.procs"), daemon_pid)
msg = "Moved daemon's PID from legacy cgroup to {0}".format(new_path)
add_event(AGENT_NAME, version=CURRENT_VERSION, op=WALAEventOperation.CGroupsCleanUp, is_success=True, message=msg)
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Can we explicitly say that the collection of resource usage data is still enabled? I'm looking at it from the perspective of querying for this event and parsing what the different event messages mean.

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

we can talk offline... not sure I understand how that would improve parsing

vrdmr marked this conversation as resolved.
Show resolved Hide resolved

add_event(AGENT_NAME,
version=CURRENT_VERSION,
op=WALAEventOperation.CGroupsCleanUp,
is_success=len(errors) == 0,
message=msg)
CGroupsApi._foreach_legacy_cgroup(move_daemon_pid)

def create_agent_cgroups(self):
"""
Expand All @@ -211,7 +226,7 @@ def create_agent_cgroups(self):
pid = int(os.getpid())

def create_cgroup(controller):
path = os.path.join(CGROUPS_FILE_SYSTEM_ROOT, controller, VM_AGENT_CGROUP_NAME)
path = FileSystemCgroupsApi._get_agent_cgroup_path(controller)

if not os.path.isdir(path):
FileSystemCgroupsApi._try_mkdir(path)
Expand Down Expand Up @@ -532,6 +547,15 @@ def create_cgroup(controller):
# The process terminated in time and successfully
return extension_cgroups, process_output

def cleanup_old_cgroups(self):
# No cleanup needed from the old daemon in the systemd case.
pass
def cleanup_legacy_cgroups(self):
"""
Previous versions of the daemon (2.2.31-2.2.40) wrote their PID to /sys/fs/cgroup/{cpu,memory}/WALinuxAgent/WALinuxAgent;
starting from version 2.2.41 we track the agent service in walinuxagent.service instead of WALinuxAgent/WALinuxAgent. If
we find that any of the legacy groups include the PID of the daemon then we disable data collection for this instance
(under systemd, moving PIDs across the cgroup file system can produce unpredictable results)
"""
def report_error(_, daemon_pid):
raise CGroupsException(
"The daemon's PID ({0}) was already added to the legacy cgroup; this invalidates resource usage data.".format(daemon_pid))

CGroupsApi._foreach_legacy_cgroup(report_error)
35 changes: 26 additions & 9 deletions azurelinuxagent/common/cgroupconfigurator.py
Original file line number Diff line number Diff line change
Expand Up @@ -79,7 +79,7 @@ def enable(self):
def disable(self):
self._enabled = False

def _invoke_cgroup_operation(self, operation, error_message):
def _invoke_cgroup_operation(self, operation, error_message, on_error=None):
"""
Ensures the given operation is invoked only if cgroups are enabled and traps any errors on the operation.
"""
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Can you update the comment explaining what on_error does and why it's there?

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

doesn't the name imply a callback to execute when an error occurs? :)

Expand All @@ -89,7 +89,12 @@ def _invoke_cgroup_operation(self, operation, error_message):
try:
return operation()
except Exception as e:
logger.warn("{0}. Error: {1}".format(error_message, ustr(e)))
logger.warn("{0} Error: {1}".format(error_message, ustr(e)))
if on_error is not None:
try:
on_error(e)
except Exception as ex:
logger.warn("CGroupConfigurator._invoke_cgroup_operation: {0}".format(ustr(e)))

def create_agent_cgroups(self, track_cgroups):
"""
Expand All @@ -104,14 +109,26 @@ def __impl():

return cgroups

self._invoke_cgroup_operation(__impl, "Failed to create a cgroup for the VM Agent; resource usage for the Agent will not be tracked")
self._invoke_cgroup_operation(__impl, "Failed to create a cgroup for the VM Agent; resource usage for the Agent will not be tracked.")

def cleanup_old_cgroups(self):
def cleanup_legacy_cgroups(self):
def __impl():
self._cgroups_api.cleanup_old_cgroups()
self._cgroups_api.cleanup_legacy_cgroups()

self._invoke_cgroup_operation(__impl, "Failed to update the tracking of the daemon; resource usage "
"of the agent will not include the daemon process.")
message = 'Failed to process legacy cgroups. Collection of resource usage data will be disabled.'

def disable_cgroups(exception):
self.disable()

add_event(
AGENT_NAME,
version=CURRENT_VERSION,
op=WALAEventOperation.CGroupsCleanUp,
is_success=False,
log_event=False,
message='{0} {1}'.format(message, ustr(exception)))

self._invoke_cgroup_operation(__impl, message, on_error=disable_cgroups)

def create_extension_cgroups_root(self):
"""
Expand All @@ -120,7 +137,7 @@ def create_extension_cgroups_root(self):
def __impl():
self._cgroups_api.create_extension_cgroups_root()

self._invoke_cgroup_operation(__impl, "Failed to create a root cgroup for extensions; resource usage for extensions will not be tracked")
self._invoke_cgroup_operation(__impl, "Failed to create a root cgroup for extensions; resource usage for extensions will not be tracked.")

def create_extension_cgroups(self, name):
"""
Expand All @@ -129,7 +146,7 @@ def create_extension_cgroups(self, name):
def __impl():
return self._cgroups_api.create_extension_cgroups(name)

return self._invoke_cgroup_operation(__impl, "Failed to create a cgroup for extension '{0}'; resource usage will not be tracked".format(name))
return self._invoke_cgroup_operation(__impl, "Failed to create a cgroup for extension '{0}'; resource usage will not be tracked.".format(name))

def remove_extension_cgroups(self, name):
"""
Expand Down
4 changes: 4 additions & 0 deletions azurelinuxagent/common/exception.py
Original file line number Diff line number Diff line change
Expand Up @@ -55,7 +55,11 @@ class CGroupsException(AgentError):

def __init__(self, msg, inner=None):
super(AgentError, self).__init__(msg, inner)
# TODO: AgentError should set the message - investigate whether doing it there would break anything
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

So... would it? :)

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Dunno; no time to check :)

Doing it in AgentError would change the formatting of all exceptions. Need to spend some time checking what would happen then.

self.message = msg

def __str__(self):
return self.message

class ExtensionError(AgentError):
"""
Expand Down
1 change: 0 additions & 1 deletion azurelinuxagent/daemon/main.py
Original file line number Diff line number Diff line change
Expand Up @@ -69,7 +69,6 @@ def run(self, child_args=None):
self.initialize_environment()

CGroupConfigurator.get_instance().create_agent_cgroups(track_cgroups=False)
CGroupConfigurator.get_instance().cleanup_old_cgroups()
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Why not have the daemon clean up?

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

It is not needed. Or did you have an scenario in mind when it was added?


# If FIPS is enabled, set the OpenSSL environment variable
# Note:
Expand Down
2 changes: 1 addition & 1 deletion azurelinuxagent/ga/update.py
Original file line number Diff line number Diff line change
Expand Up @@ -457,7 +457,7 @@ def _ensure_readonly_files(self):
def _ensure_cgroups_initialized(self):
configurator = CGroupConfigurator.get_instance()
configurator.create_agent_cgroups(track_cgroups=True)
configurator.cleanup_old_cgroups()
configurator.cleanup_legacy_cgroups()
vrdmr marked this conversation as resolved.
Show resolved Hide resolved
configurator.create_extension_cgroups_root()

def _evaluate_agent_health(self, latest_agent):
Expand Down
Loading