-
Notifications
You must be signed in to change notification settings - Fork 372
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Disable cgroups when daemon is setup incorrectly #1688
Changes from 2 commits
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -61,7 +61,7 @@ def get_extension_cgroups(self, extension_name): | |
def start_extension_command(self, extension_name, command, timeout, shell, cwd, env, stdout, stderr, error_code): | ||
raise NotImplementedError() | ||
|
||
def cleanup_old_cgroups(self): | ||
def cleanup_legacy_cgroups(self): | ||
raise NotImplementedError() | ||
|
||
@staticmethod | ||
|
@@ -100,20 +100,52 @@ def _foreach_controller(operation, message): | |
is not mounted or if an error occurs in the operation | ||
:return: Returns a list of error messages or an empty list if no errors occurred | ||
""" | ||
errors = [] | ||
mounted_controllers = os.listdir(CGROUPS_FILE_SYSTEM_ROOT) | ||
|
||
for controller in CGROUP_CONTROLLERS: | ||
try: | ||
if controller not in mounted_controllers: | ||
logger.warn('Cgroup controller "{0}" is not mounted. {1}'.format(controller, message)) | ||
logger.warn('Cgroup controller "{0}" is not mounted. {1}', controller, message) | ||
else: | ||
operation(controller) | ||
except Exception as e: | ||
msg = 'Error in cgroup controller "{0}": {1}. {2}'.format(controller, ustr(e), message) | ||
logger.warn(msg) | ||
errors.append(msg) | ||
return errors | ||
logger.warn('Error in cgroup controller "{0}": {1}. {2}', controller, ustr(e), message) | ||
|
||
@staticmethod | ||
def _foreach_legacy_cgroup(operation): | ||
""" | ||
Previous versions of the daemon (2.2.31-2.2.40) wrote their PID to /sys/fs/cgroup/{cpu,memory}/WALinuxAgent/WALinuxAgent; | ||
starting from version 2.2.41 we track the agent service in walinuxagent.service instead of WALinuxAgent/WALinuxAgent. Also, | ||
when running under systemd, the PIDs should not be explicitly moved to the cgroup filesystem. The older daemons would | ||
incorrectly do that under certain conditions. | ||
|
||
This method checks for the existence of the legacy cgroups and, if the daemon's PID has been added to them, executes the | ||
given operation on the cgroups. After this check, the method attempts to remove the legacy cgroups. | ||
|
||
:param operation: | ||
The function to execute on each legacy cgroup. It must take 2 arguments: the controller and the daemon's PID | ||
""" | ||
legacy_cgroups = [] | ||
for controller in ['cpu', 'memory']: | ||
cgroup = os.path.join(CGROUPS_FILE_SYSTEM_ROOT, controller, "WALinuxAgent", "WALinuxAgent") | ||
if os.path.exists(cgroup): | ||
logger.info('Found legacy cgroup {0}', cgroup) | ||
legacy_cgroups.append((controller, cgroup)) | ||
|
||
try: | ||
for controller, cgroup in legacy_cgroups: | ||
procs_file = os.path.join(cgroup, "cgroup.procs") | ||
|
||
if os.path.exists(procs_file): | ||
procs_file_contents = fileutil.read_file(procs_file).strip() | ||
daemon_pid = fileutil.read_file(get_agent_pid_file_path()).strip() | ||
|
||
if daemon_pid in procs_file_contents: | ||
operation(controller, daemon_pid) | ||
finally: | ||
for _, cgroup in legacy_cgroups: | ||
logger.info('Removing {0}', cgroup) | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. nit: Suggestion to make the message There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. the whole path is in the message. a few lines up in the log there would be a message identifying the path as a legacy group (logged at line 132) |
||
shutil.rmtree(cgroup, ignore_errors=True) | ||
|
||
|
||
class FileSystemCgroupsApi(CGroupsApi): | ||
|
@@ -143,6 +175,10 @@ def _try_mkdir(path): | |
else: | ||
raise | ||
|
||
@staticmethod | ||
def _get_agent_cgroup_path(controller): | ||
return os.path.join(CGROUPS_FILE_SYSTEM_ROOT, controller, VM_AGENT_CGROUP_NAME) | ||
|
||
@staticmethod | ||
def _get_extension_cgroups_root_path(controller): | ||
return os.path.join(CGROUPS_FILE_SYSTEM_ROOT, controller, EXTENSIONS_ROOT_CGROUP_NAME) | ||
|
@@ -158,49 +194,28 @@ def _get_extension_cgroup_path(self, controller, extension_name): | |
return os.path.join(extensions_root, cgroup_name) | ||
|
||
def _create_extension_cgroup(self, controller, extension_name): | ||
return CGroup.create(self._get_extension_cgroup_path(controller, extension_name), | ||
controller, extension_name) | ||
return CGroup.create(self._get_extension_cgroup_path(controller, extension_name), controller, extension_name) | ||
|
||
@staticmethod | ||
def _add_process_to_cgroup(pid, cgroup_path): | ||
tasks_file = os.path.join(cgroup_path, 'cgroup.procs') | ||
fileutil.append_file(tasks_file, "{0}\n".format(pid)) | ||
logger.info("Added PID {0} to cgroup {1}".format(pid, cgroup_path)) | ||
|
||
def cleanup_old_cgroups(self): | ||
# Old daemon versions (2.2.31-2.2.40) wrote their PID to the WALinuxAgent/WALinuxAgent cgroup. | ||
# Starting from version 2.2.41, we track the agent service in walinuxagent.service. This method | ||
# cleans up the old behavior by moving the daemon's PID to the new cgroup and deleting the old cgroup. | ||
daemon_pid_file = get_agent_pid_file_path() | ||
daemon_pid = fileutil.read_file(daemon_pid_file) | ||
|
||
def cleanup_old_controller(controller): | ||
old_path = os.path.join(CGROUPS_FILE_SYSTEM_ROOT, controller, "WALinuxAgent", "WALinuxAgent") | ||
new_path = os.path.join(CGROUPS_FILE_SYSTEM_ROOT, controller, VM_AGENT_CGROUP_NAME) | ||
|
||
if not os.path.isdir(old_path): | ||
return | ||
|
||
contents = fileutil.read_file(os.path.join(old_path, "cgroup.procs")) | ||
|
||
if daemon_pid in contents: | ||
fileutil.append_file(os.path.join(new_path, "cgroup.procs"), daemon_pid) | ||
shutil.rmtree(old_path, ignore_errors=True) | ||
|
||
errors = self._foreach_controller(cleanup_old_controller, | ||
"Failed to update the tracking of the daemon; resource usage of the agent " | ||
"will not include the daemon process.") | ||
|
||
if len(errors) == 0: | ||
msg = 'Successfully cleaned up old cgroups in WALinuxAgent/WALinuxAgent.' | ||
else: | ||
msg = 'Failed to clean up old cgroups in WALinuxAgent/WALinuxAgent. Errors: {0}'.format(errors) | ||
def cleanup_legacy_cgroups(self): | ||
""" | ||
Previous versions of the daemon (2.2.31-2.2.40) wrote their PID to /sys/fs/cgroup/{cpu,memory}/WALinuxAgent/WALinuxAgent; | ||
starting from version 2.2.41 we track the agent service in walinuxagent.service instead of WALinuxAgent/WALinuxAgent. This | ||
method moves the daemon's PID from the legacy cgroups to the newer cgroups. | ||
""" | ||
def move_daemon_pid(controller, daemon_pid): | ||
new_path = FileSystemCgroupsApi._get_agent_cgroup_path(controller) | ||
logger.info("Writing daemon's PID ({0}) to {1}", daemon_pid, new_path) | ||
fileutil.append_file(os.path.join(new_path, "cgroup.procs"), daemon_pid) | ||
msg = "Moved daemon's PID from legacy cgroup to {0}".format(new_path) | ||
add_event(AGENT_NAME, version=CURRENT_VERSION, op=WALAEventOperation.CGroupsCleanUp, is_success=True, message=msg) | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Can we explicitly say that the collection of resource usage data is still enabled? I'm looking at it from the perspective of querying for this event and parsing what the different event messages mean. There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. we can talk offline... not sure I understand how that would improve parsing
vrdmr marked this conversation as resolved.
Show resolved
Hide resolved
|
||
|
||
add_event(AGENT_NAME, | ||
version=CURRENT_VERSION, | ||
op=WALAEventOperation.CGroupsCleanUp, | ||
is_success=len(errors) == 0, | ||
message=msg) | ||
CGroupsApi._foreach_legacy_cgroup(move_daemon_pid) | ||
|
||
def create_agent_cgroups(self): | ||
""" | ||
|
@@ -211,7 +226,7 @@ def create_agent_cgroups(self): | |
pid = int(os.getpid()) | ||
|
||
def create_cgroup(controller): | ||
path = os.path.join(CGROUPS_FILE_SYSTEM_ROOT, controller, VM_AGENT_CGROUP_NAME) | ||
path = FileSystemCgroupsApi._get_agent_cgroup_path(controller) | ||
|
||
if not os.path.isdir(path): | ||
FileSystemCgroupsApi._try_mkdir(path) | ||
|
@@ -532,6 +547,15 @@ def create_cgroup(controller): | |
# The process terminated in time and successfully | ||
return extension_cgroups, process_output | ||
|
||
def cleanup_old_cgroups(self): | ||
# No cleanup needed from the old daemon in the systemd case. | ||
pass | ||
def cleanup_legacy_cgroups(self): | ||
""" | ||
Previous versions of the daemon (2.2.31-2.2.40) wrote their PID to /sys/fs/cgroup/{cpu,memory}/WALinuxAgent/WALinuxAgent; | ||
starting from version 2.2.41 we track the agent service in walinuxagent.service instead of WALinuxAgent/WALinuxAgent. If | ||
we find that any of the legacy groups include the PID of the daemon then we disable data collection for this instance | ||
(under systemd, moving PIDs across the cgroup file system can produce unpredictable results) | ||
""" | ||
def report_error(_, daemon_pid): | ||
raise CGroupsException( | ||
"The daemon's PID ({0}) was already added to the legacy cgroup; this invalidates resource usage data.".format(daemon_pid)) | ||
|
||
CGroupsApi._foreach_legacy_cgroup(report_error) |
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -79,7 +79,7 @@ def enable(self): | |
def disable(self): | ||
self._enabled = False | ||
|
||
def _invoke_cgroup_operation(self, operation, error_message): | ||
def _invoke_cgroup_operation(self, operation, error_message, on_error=None): | ||
""" | ||
Ensures the given operation is invoked only if cgroups are enabled and traps any errors on the operation. | ||
""" | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Can you update the comment explaining what There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. doesn't the name imply a callback to execute when an error occurs? :) |
||
|
@@ -89,7 +89,12 @@ def _invoke_cgroup_operation(self, operation, error_message): | |
try: | ||
return operation() | ||
except Exception as e: | ||
logger.warn("{0}. Error: {1}".format(error_message, ustr(e))) | ||
logger.warn("{0} Error: {1}".format(error_message, ustr(e))) | ||
if on_error is not None: | ||
try: | ||
on_error(e) | ||
except Exception as ex: | ||
logger.warn("CGroupConfigurator._invoke_cgroup_operation: {0}".format(ustr(e))) | ||
|
||
def create_agent_cgroups(self, track_cgroups): | ||
""" | ||
|
@@ -104,14 +109,26 @@ def __impl(): | |
|
||
return cgroups | ||
|
||
self._invoke_cgroup_operation(__impl, "Failed to create a cgroup for the VM Agent; resource usage for the Agent will not be tracked") | ||
self._invoke_cgroup_operation(__impl, "Failed to create a cgroup for the VM Agent; resource usage for the Agent will not be tracked.") | ||
|
||
def cleanup_old_cgroups(self): | ||
def cleanup_legacy_cgroups(self): | ||
def __impl(): | ||
self._cgroups_api.cleanup_old_cgroups() | ||
self._cgroups_api.cleanup_legacy_cgroups() | ||
|
||
self._invoke_cgroup_operation(__impl, "Failed to update the tracking of the daemon; resource usage " | ||
"of the agent will not include the daemon process.") | ||
message = 'Failed to process legacy cgroups. Collection of resource usage data will be disabled.' | ||
|
||
def disable_cgroups(exception): | ||
self.disable() | ||
|
||
add_event( | ||
AGENT_NAME, | ||
version=CURRENT_VERSION, | ||
op=WALAEventOperation.CGroupsCleanUp, | ||
is_success=False, | ||
log_event=False, | ||
message='{0} {1}'.format(message, ustr(exception))) | ||
|
||
self._invoke_cgroup_operation(__impl, message, on_error=disable_cgroups) | ||
|
||
def create_extension_cgroups_root(self): | ||
""" | ||
|
@@ -120,7 +137,7 @@ def create_extension_cgroups_root(self): | |
def __impl(): | ||
self._cgroups_api.create_extension_cgroups_root() | ||
|
||
self._invoke_cgroup_operation(__impl, "Failed to create a root cgroup for extensions; resource usage for extensions will not be tracked") | ||
self._invoke_cgroup_operation(__impl, "Failed to create a root cgroup for extensions; resource usage for extensions will not be tracked.") | ||
|
||
def create_extension_cgroups(self, name): | ||
""" | ||
|
@@ -129,7 +146,7 @@ def create_extension_cgroups(self, name): | |
def __impl(): | ||
return self._cgroups_api.create_extension_cgroups(name) | ||
|
||
return self._invoke_cgroup_operation(__impl, "Failed to create a cgroup for extension '{0}'; resource usage will not be tracked".format(name)) | ||
return self._invoke_cgroup_operation(__impl, "Failed to create a cgroup for extension '{0}'; resource usage will not be tracked.".format(name)) | ||
|
||
def remove_extension_cgroups(self, name): | ||
""" | ||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -55,7 +55,11 @@ class CGroupsException(AgentError): | |
|
||
def __init__(self, msg, inner=None): | ||
super(AgentError, self).__init__(msg, inner) | ||
# TODO: AgentError should set the message - investigate whether doing it there would break anything | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. So... would it? :) There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Dunno; no time to check :) Doing it in AgentError would change the formatting of all exceptions. Need to spend some time checking what would happen then. |
||
self.message = msg | ||
|
||
def __str__(self): | ||
return self.message | ||
|
||
class ExtensionError(AgentError): | ||
""" | ||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -69,7 +69,6 @@ def run(self, child_args=None): | |
self.initialize_environment() | ||
|
||
CGroupConfigurator.get_instance().create_agent_cgroups(track_cgroups=False) | ||
CGroupConfigurator.get_instance().cleanup_old_cgroups() | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Why not have the daemon clean up? There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. It is not needed. Or did you have an scenario in mind when it was added? |
||
|
||
# If FIPS is enabled, set the OpenSSL environment variable | ||
# Note: | ||
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
now there is no need for this function to report errors to the caller