diff --git a/azurelinuxagent/agent.py b/azurelinuxagent/agent.py index c0ebdbb42e..babd67e1ef 100644 --- a/azurelinuxagent/agent.py +++ b/azurelinuxagent/agent.py @@ -31,7 +31,7 @@ from azurelinuxagent.common.exception import CGroupsException from azurelinuxagent.ga import logcollector, cgroupconfigurator -from azurelinuxagent.ga.cgroup import AGENT_LOG_COLLECTOR, CpuCgroup, MemoryCgroup +from azurelinuxagent.ga.controllermetrics import AGENT_LOG_COLLECTOR, CpuMetrics from azurelinuxagent.ga.cgroupapi import get_cgroup_api, log_cgroup_warning, InvalidCgroupMountpointException import azurelinuxagent.common.conf as conf @@ -208,8 +208,7 @@ def collect_logs(self, is_full_mode): # Check the cgroups unit log_collector_monitor = None - cpu_cgroup_path = None - memory_cgroup_path = None + tracked_metrics = [] if CollectLogsHandler.is_enabled_monitor_cgroups_check(): try: cgroup_api = get_cgroup_api() @@ -220,40 +219,27 @@ def collect_logs(self, is_full_mode): log_cgroup_warning("Unable to determine which cgroup version to use: {0}".format(ustr(e)), send_event=True) sys.exit(logcollector.INVALID_CGROUPS_ERRCODE) - cpu_cgroup_path, memory_cgroup_path = cgroup_api.get_process_cgroup_paths("self") - cpu_slice_matches = False - memory_slice_matches = False - if cpu_cgroup_path is not None: - cpu_slice_matches = (cgroupconfigurator.LOGCOLLECTOR_SLICE in cpu_cgroup_path) - if memory_cgroup_path is not None: - memory_slice_matches = (cgroupconfigurator.LOGCOLLECTOR_SLICE in memory_cgroup_path) - - if not cpu_slice_matches or not memory_slice_matches: - log_cgroup_warning("The Log Collector process is not in the proper cgroups:", send_event=False) - if not cpu_slice_matches: - log_cgroup_warning("\tunexpected cpu slice: {0}".format(cpu_cgroup_path), send_event=False) - if not memory_slice_matches: - log_cgroup_warning("\tunexpected memory slice: {0}".format(memory_cgroup_path), send_event=False) + log_collector_cgroup = cgroup_api.get_process_cgroup(process_id="self", cgroup_name=AGENT_LOG_COLLECTOR) + tracked_metrics = log_collector_cgroup.get_controller_metrics() + if len(tracked_metrics) != len(log_collector_cgroup.get_supported_controllers()): + log_cgroup_warning("At least one required controller is missing. The following controllers are required for the log collector to run: {0}".format(log_collector_cgroup.get_supported_controllers())) sys.exit(logcollector.INVALID_CGROUPS_ERRCODE) - def initialize_cgroups_tracking(cpu_cgroup_path, memory_cgroup_path): - cpu_cgroup = CpuCgroup(AGENT_LOG_COLLECTOR, cpu_cgroup_path) - msg = "Started tracking cpu cgroup {0}".format(cpu_cgroup) - logger.info(msg) - cpu_cgroup.initialize_cpu_usage() - memory_cgroup = MemoryCgroup(AGENT_LOG_COLLECTOR, memory_cgroup_path) - msg = "Started tracking memory cgroup {0}".format(memory_cgroup) - logger.info(msg) - return [cpu_cgroup, memory_cgroup] + if not log_collector_cgroup.check_in_expected_slice(cgroupconfigurator.LOGCOLLECTOR_SLICE): + log_cgroup_warning("The Log Collector process is not in the proper cgroups", send_event=False) + sys.exit(logcollector.INVALID_CGROUPS_ERRCODE) try: log_collector = LogCollector(is_full_mode) - # Running log collector resource(CPU, Memory) monitoring only if agent starts the log collector. + # Running log collector resource monitoring only if agent starts the log collector. # If Log collector start by any other means, then it will not be monitored. if CollectLogsHandler.is_enabled_monitor_cgroups_check(): - tracked_cgroups = initialize_cgroups_tracking(cpu_cgroup_path, memory_cgroup_path) - log_collector_monitor = get_log_collector_monitor_handler(tracked_cgroups) + for metric in tracked_metrics: + if isinstance(metric, CpuMetrics): + metric.initialize_cpu_usage() + break + log_collector_monitor = get_log_collector_monitor_handler(tracked_metrics) log_collector_monitor.run() archive = log_collector.collect_logs_and_get_archive() logger.info("Log collection successfully completed. Archive can be found at {0} " diff --git a/azurelinuxagent/ga/cgroupapi.py b/azurelinuxagent/ga/cgroupapi.py index 3bce053502..3483527620 100644 --- a/azurelinuxagent/ga/cgroupapi.py +++ b/azurelinuxagent/ga/cgroupapi.py @@ -24,7 +24,7 @@ from azurelinuxagent.common import logger from azurelinuxagent.common.event import WALAEventOperation, add_event -from azurelinuxagent.ga.cgroup import CpuCgroup, MemoryCgroup +from azurelinuxagent.ga.controllermetrics import CpuMetrics, MemoryMetrics from azurelinuxagent.ga.cgroupstelemetry import CGroupsTelemetry from azurelinuxagent.common.conf import get_agent_pid_file_path from azurelinuxagent.common.exception import CGroupsException, ExtensionErrorCodes, ExtensionError, \ @@ -185,14 +185,14 @@ def get_cgroup_api(): if available_unified_controllers != "": raise CGroupsException("Detected hybrid cgroup mode, but there are controllers available to be enabled in unified hierarchy: {0}".format(available_unified_controllers)) - cgroup_api = SystemdCgroupApiv1() + cgroup_api_v1 = SystemdCgroupApiv1() # Previously the agent supported users mounting cgroup v1 controllers in locations other than the systemd - # default ('/sys/fs/cgroup'). The agent no longer supports this scenario. If either the cpu or memory - # controller is mounted in a location other than the systemd default, raise Exception. - if not cgroup_api.are_mountpoints_systemd_created(): - raise InvalidCgroupMountpointException("Expected cgroup controllers to be mounted at '{0}', but at least one is not. v1 mount points: \n{1}".format(CGROUP_FILE_SYSTEM_ROOT, json.dumps(cgroup_api.get_controller_root_paths()))) + # default ('/sys/fs/cgroup'). The agent no longer supports this scenario. If any agent supported controller is + # mounted in a location other than the systemd default, raise Exception. + if not cgroup_api_v1.are_mountpoints_systemd_created(): + raise InvalidCgroupMountpointException("Expected cgroup controllers to be mounted at '{0}', but at least one is not. v1 mount points: \n{1}".format(CGROUP_FILE_SYSTEM_ROOT, json.dumps(cgroup_api_v1.get_controller_mountpoints()))) log_cgroup_info("Using cgroup v1 for resource enforcement and monitoring") - return cgroup_api + return cgroup_api_v1 raise CGroupsException("{0} has an unexpected file type: {1}".format(CGROUP_FILE_SYSTEM_ROOT, root_hierarchy_mode)) @@ -202,7 +202,6 @@ class _SystemdCgroupApi(object): Cgroup interface via systemd. Contains common api implementations between cgroup v1 and v2. """ def __init__(self): - self._agent_unit_name = None self._systemd_run_commands = [] self._systemd_run_commands_lock = threading.RLock() @@ -213,55 +212,36 @@ def get_systemd_run_commands(self): with self._systemd_run_commands_lock: return self._systemd_run_commands[:] - def get_controller_root_paths(self): + def get_unit_cgroup(self, unit_name, cgroup_name): """ - Cgroup version specific. Returns a tuple with the root paths for the cpu and memory controllers; the values can - be None if the corresponding controller is not mounted or enabled at the root cgroup. + Cgroup version specific. Returns a representation of the unit cgroup. + + :param unit_name: The unit to return the cgroup of. + :param cgroup_name: A name to represent the cgroup. Used for logging/tracking purposes. """ raise NotImplementedError() - def get_unit_cgroup_paths(self, unit_name): - """ - Returns a tuple with the path of the cpu and memory cgroups for the given unit. - The values returned can be None if the controller is not mounted or enabled. + def get_cgroup_from_relative_path(self, relative_path, cgroup_name): """ - # Ex: ControlGroup=/azure.slice/walinuxagent.service - # controlgroup_path[1:] = azure.slice/walinuxagent.service - controlgroup_path = systemd.get_unit_property(unit_name, "ControlGroup") - cpu_root_path, memory_root_path = self.get_controller_root_paths() - - cpu_cgroup_path = os.path.join(cpu_root_path, controlgroup_path[1:]) \ - if cpu_root_path is not None else None - - memory_cgroup_path = os.path.join(memory_root_path, controlgroup_path[1:]) \ - if memory_root_path is not None else None + Cgroup version specific. Returns a representation of the cgroup at the provided relative path. - return cpu_cgroup_path, memory_cgroup_path - - def get_process_cgroup_paths(self, process_id): - """ - Returns a tuple with the path of the cpu and memory cgroups for the given process. - The 'process_id' can be a numeric PID or the string "self" for the current process. - The values returned can be None if the controller is not mounted or enabled. + :param relative_path: The relative path to return the cgroup of. + :param cgroup_name: A name to represent the cgroup. Used for logging/tracking purposes. """ - cpu_cgroup_relative_path, memory_cgroup_relative_path = self.get_process_cgroup_relative_paths(process_id) - - cpu_root_path, memory_root_path = self.get_controller_root_paths() - - cpu_cgroup_path = os.path.join(cpu_root_path, cpu_cgroup_relative_path) \ - if cpu_root_path is not None and cpu_cgroup_relative_path is not None else None + raise NotImplementedError() - memory_cgroup_path = os.path.join(memory_root_path, memory_cgroup_relative_path) \ - if memory_root_path is not None and memory_cgroup_relative_path is not None else None + def get_process_cgroup(self, process_id, cgroup_name): + """ + Cgroup version specific. Returns a representation of the process' cgroup. - return cpu_cgroup_path, memory_cgroup_path + :param process_id: A numeric PID to return the cgroup of, or the string "self" to return the cgroup of the current process. + :param cgroup_name: A name to represent the cgroup. Used for logging/tracking purposes. + """ + raise NotImplementedError() - def get_process_cgroup_relative_paths(self, process_id): + def log_root_paths(self): """ - Cgroup version specific. Returns a tuple with the path of the cpu and memory cgroups for the given process - (relative to the root path of the corresponding controller). - The 'process_id' can be a numeric PID or the string "self" for the current process. - The values returned can be None if the controller is not mounted or enabled. + Cgroup version specific. Logs the root paths of the cgroup filesystem/controllers. """ raise NotImplementedError() @@ -279,11 +259,6 @@ def _is_systemd_failure(scope_name, stderr): unit_not_found = "Unit {0} not found.".format(scope_name) return unit_not_found in stderr or scope_name not in stderr - @staticmethod - def get_processes_in_cgroup(cgroup_path): - with open(os.path.join(cgroup_path, "cgroup.procs"), "r") as cgroup_procs: - return [int(pid) for pid in cgroup_procs.read().split()] - class SystemdCgroupApiv1(_SystemdCgroupApi): """ @@ -293,7 +268,8 @@ def __init__(self): super(SystemdCgroupApiv1, self).__init__() self._cgroup_mountpoints = self._get_controller_mountpoints() - def _get_controller_mountpoints(self): + @staticmethod + def _get_controller_mountpoints(): """ In v1, each controller is mounted at a different path. Use findmnt to get each path. @@ -304,7 +280,8 @@ def _get_controller_mountpoints(self): /sys/fs/cgroup/cpu,cpuacct cgroup cgroup rw,nosuid,nodev,noexec,relatime,cpu,cpuacct etc - Returns a dictionary of the controller-path mappings. + Returns a dictionary of the controller-path mappings. The dictionary only includes the controllers which are + supported by the agent. """ mount_points = {} for line in shellutil.run_command(['findmnt', '-t', 'cgroup', '--noheadings']).splitlines(): @@ -315,51 +292,91 @@ def _get_controller_mountpoints(self): if match is not None: path = match.group('path') controller = match.group('controller') - if controller is not None and path is not None: + if controller is not None and path is not None and controller in CgroupV1.get_supported_controllers(): mount_points[controller] = path return mount_points + def get_controller_mountpoints(self): + """ + Returns a dictionary of controller-mountpoint mappings. + """ + return self._cgroup_mountpoints + def are_mountpoints_systemd_created(self): """ - Systemd mounts each controller at '/sys/fs/cgroup/'. Returns True if both cpu and memory - mountpoints match this pattern, False otherwise. + Systemd mounts each controller at '/sys/fs/cgroup/'. Returns True if all mounted controllers which + are supported by the agent have mountpoints which match this pattern, False otherwise. The agent does not support cgroup usage if the default root systemd mountpoint (/sys/fs/cgroup) is not used. This method is used to check if any users are using non-systemd mountpoints. If they are, the agent drop-in files will be cleaned up in cgroupconfigurator. """ - cpu_mountpoint = self._cgroup_mountpoints.get('cpu,cpuacct') - memory_mountpoint = self._cgroup_mountpoints.get('memory') - if cpu_mountpoint is not None and cpu_mountpoint != os.path.join(CGROUP_FILE_SYSTEM_ROOT, 'cpu,cpuacct'): - return False - if memory_mountpoint is not None and memory_mountpoint != os.path.join(CGROUP_FILE_SYSTEM_ROOT, 'memory'): - return False + for controller, mount_point in self._cgroup_mountpoints.items(): + if mount_point != os.path.join(CGROUP_FILE_SYSTEM_ROOT, controller): + return False return True - def get_controller_root_paths(self): - # Return a tuple representing the mountpoints for cpu and memory. Either should be None if the corresponding - # controller is not mounted. - return self._cgroup_mountpoints.get('cpu,cpuacct'), self._cgroup_mountpoints.get('memory') - - def get_process_cgroup_relative_paths(self, process_id): - # The contents of the file are similar to - # # cat /proc/1218/cgroup - # 10:memory:/system.slice/walinuxagent.service - # 3:cpu,cpuacct:/system.slice/walinuxagent.service - # etc - cpu_path = None - memory_path = None + @staticmethod + def _get_process_relative_controller_paths(process_id): + """ + Returns the relative paths of the cgroup for the given process as a dict of controller-path mappings. The result + only includes controllers which are supported. + The contents of the /proc/{process_id}/cgroup file are similar to + # cat /proc/1218/cgroup + 10:memory:/system.slice/walinuxagent.service + 3:cpu,cpuacct:/system.slice/walinuxagent.service + etc + + :param process_id: A numeric PID to return the relative paths of, or the string "self" to return the relative paths of the current process. + """ + conroller_relative_paths = {} for line in fileutil.read_file("/proc/{0}/cgroup".format(process_id)).splitlines(): - match = re.match(r'\d+:(?P(memory|.*cpuacct.*)):(?P.+)', line) + match = re.match(r'\d+:(?P.+):(?P.+)', line) if match is not None: controller = match.group('controller') path = match.group('path').lstrip('/') if match.group('path') != '/' else None - if controller == 'memory': - memory_path = path - else: - cpu_path = path + if path is not None and controller in CgroupV1.get_supported_controllers(): + conroller_relative_paths[controller] = path + + return conroller_relative_paths + + def get_unit_cgroup(self, unit_name, cgroup_name): + unit_cgroup_relative_path = systemd.get_unit_property(unit_name, "ControlGroup") + unit_controller_paths = {} - return cpu_path, memory_path + for controller, mountpoint in self._cgroup_mountpoints.items(): + unit_controller_paths[controller] = os.path.join(mountpoint, unit_cgroup_relative_path[1:]) + + return CgroupV1(cgroup_name=cgroup_name, controller_mountpoints=self._cgroup_mountpoints, + controller_paths=unit_controller_paths) + + def get_cgroup_from_relative_path(self, relative_path, cgroup_name): + controller_paths = {} + for controller, mountpoint in self._cgroup_mountpoints.items(): + controller_paths[controller] = os.path.join(mountpoint, relative_path) + + return CgroupV1(cgroup_name=cgroup_name, controller_mountpoints=self._cgroup_mountpoints, + controller_paths=controller_paths) + + def get_process_cgroup(self, process_id, cgroup_name): + relative_controller_paths = self._get_process_relative_controller_paths(process_id) + process_controller_paths = {} + + for controller, mountpoint in self._cgroup_mountpoints.items(): + relative_controller_path = relative_controller_paths.get(controller) + if relative_controller_path is not None: + process_controller_paths[controller] = os.path.join(mountpoint, relative_controller_path) + + return CgroupV1(cgroup_name=cgroup_name, controller_mountpoints=self._cgroup_mountpoints, + controller_paths=process_controller_paths) + + def log_root_paths(self): + for controller in CgroupV1.get_supported_controllers(): + mount_point = self._cgroup_mountpoints.get(controller) + if mount_point is None: + log_cgroup_info("The {0} controller is not mounted".format(controller), send_event=False) + else: + log_cgroup_info("The {0} controller is mounted at {1}".format(controller, mount_point), send_event=False) def start_extension_command(self, extension_name, command, cmd_name, timeout, shell, cwd, env, stdout, stderr, error_code=ExtensionErrorCodes.PluginUnknownFailure): @@ -385,25 +402,14 @@ def start_extension_command(self, extension_name, command, cmd_name, timeout, sh log_cgroup_info("Started extension in unit '{0}'".format(scope_name), send_event=False) - cpu_cgroup = None + cpu_metrics = None try: cgroup_relative_path = os.path.join('azure.slice/azure-vmextensions.slice', extension_slice_name) - - cpu_cgroup_mountpoint, memory_cgroup_mountpoint = self.get_controller_root_paths() - - if cpu_cgroup_mountpoint is None: - log_cgroup_info("The CPU controller is not mounted; will not track resource usage", send_event=False) - else: - cpu_cgroup_path = os.path.join(cpu_cgroup_mountpoint, cgroup_relative_path) - cpu_cgroup = CpuCgroup(extension_name, cpu_cgroup_path) - CGroupsTelemetry.track_cgroup(cpu_cgroup) - - if memory_cgroup_mountpoint is None: - log_cgroup_info("The Memory controller is not mounted; will not track resource usage", send_event=False) - else: - memory_cgroup_path = os.path.join(memory_cgroup_mountpoint, cgroup_relative_path) - memory_cgroup = MemoryCgroup(extension_name, memory_cgroup_path) - CGroupsTelemetry.track_cgroup(memory_cgroup) + cgroup = self.get_cgroup_from_relative_path(cgroup_relative_path, extension_name) + for metrics in cgroup.get_controller_metrics(): + if isinstance(metrics, CpuMetrics): + cpu_metrics = metrics + CGroupsTelemetry.track_cgroup(metrics) except IOError as e: if e.errno == 2: # 'No such file or directory' @@ -415,7 +421,7 @@ def start_extension_command(self, extension_name, command, cmd_name, timeout, sh # Wait for process completion or timeout try: return handle_process_completion(process=process, command=command, timeout=timeout, stdout=stdout, - stderr=stderr, error_code=error_code, cpu_cgroup=cpu_cgroup) + stderr=stderr, error_code=error_code, cpu_metrics=cpu_metrics) except ExtensionError as e: # The extension didn't terminate successfully. Determine whether it was due to systemd errors or # extension errors. @@ -448,7 +454,7 @@ class SystemdCgroupApiv2(_SystemdCgroupApi): def __init__(self): super(SystemdCgroupApiv2, self).__init__() self._root_cgroup_path = self._get_root_cgroup_path() - self._controllers_enabled_at_root = self._get_controllers_enabled_at_root(self._root_cgroup_path) if self._root_cgroup_path is not None else [] + self._controllers_enabled_at_root = self._get_controllers_enabled_at_root(self._root_cgroup_path) if self._root_cgroup_path != "" else [] @staticmethod def _get_root_cgroup_path(): @@ -459,7 +465,7 @@ def _get_root_cgroup_path(): $ findmnt -t cgroup2 --noheadings /sys/fs/cgroup cgroup2 cgroup2 rw,nosuid,nodev,noexec,relatime,nsdelegate,memory_recursiveprot - Returns None if the root cgroup cannot be determined from the output above. + Returns empty string if the root cgroup cannot be determined from the output above. """ # for line in shellutil.run_command(['findmnt', '-t', 'cgroup2', '--noheadings']).splitlines(): @@ -470,7 +476,13 @@ def _get_root_cgroup_path(): root_cgroup_path = match.group('path') if root_cgroup_path is not None: return root_cgroup_path - return None + return "" + + def get_root_cgroup_path(self): + """ + Returns the unified cgroup mountpoint. + """ + return self._root_cgroup_path @staticmethod def _get_controllers_enabled_at_root(root_cgroup_path): @@ -478,47 +490,229 @@ def _get_controllers_enabled_at_root(root_cgroup_path): Returns a list of the controllers enabled at the root cgroup. The cgroup.subtree_control file at the root shows a space separated list of the controllers which are enabled to control resource distribution from the root cgroup to its children. If a controller is listed here, then that controller is available to enable in children - cgroups. + cgroups. Returns only the enabled controllers which are supported by the agent. $ cat /sys/fs/cgroup/cgroup.subtree_control cpuset cpu io memory hugetlb pids rdma misc """ - controllers_enabled_at_root = [] enabled_controllers_file = os.path.join(root_cgroup_path, 'cgroup.subtree_control') if os.path.exists(enabled_controllers_file): controllers_enabled_at_root = fileutil.read_file(enabled_controllers_file).rstrip().split() - return controllers_enabled_at_root - - def get_controller_root_paths(self): - # Return a tuple representing the root cgroups for cpu and memory. Either should be None if the corresponding - # controller is not enabled at the root. This check is necessary because all non-root "cgroup.subtree_control" - # files can only contain controllers which are enabled in the parent's "cgroup.subtree_control" file. - - root_cpu_path = None - root_memory_path = None - if self._root_cgroup_path is not None: - if 'cpu' in self._controllers_enabled_at_root: - root_cpu_path = self._root_cgroup_path - if 'memory' in self._controllers_enabled_at_root: - root_memory_path = self._root_cgroup_path - - return root_cpu_path, root_memory_path - - def get_process_cgroup_relative_paths(self, process_id): - # The contents of the file are similar to - # # cat /proc/1218/cgroup - # 0::/azure.slice/walinuxagent.service - cpu_path = None - memory_path = None + return list(set(controllers_enabled_at_root) & set(CgroupV2.get_supported_controllers())) + return [] + + @staticmethod + def _get_process_relative_cgroup_path(process_id): + """ + Returns the relative path of the cgroup for the given process. + The contents of the /proc/{process_id}/cgroup file are similar to + # cat /proc/1218/cgroup + 0::/azure.slice/walinuxagent.service + + :param process_id: A numeric PID to return the relative path of, or the string "self" to return the relative path of the current process. + """ + relative_path = "" for line in fileutil.read_file("/proc/{0}/cgroup".format(process_id)).splitlines(): match = re.match(r'0::(?P\S+)', line) if match is not None: - path = match.group('path').lstrip('/') if match.group('path') != '/' else None - memory_path = path - cpu_path = path + relative_path = match.group('path').lstrip('/') if match.group('path') != '/' else "" + + return relative_path + + def get_unit_cgroup(self, unit_name, cgroup_name): + unit_cgroup_relative_path = systemd.get_unit_property(unit_name, "ControlGroup") + unit_cgroup_path = "" + + if self._root_cgroup_path != "": + unit_cgroup_path = os.path.join(self._root_cgroup_path, unit_cgroup_relative_path[1:]) - return cpu_path, memory_path + return CgroupV2(cgroup_name=cgroup_name, root_cgroup_path=self._root_cgroup_path, cgroup_path=unit_cgroup_path, enabled_controllers=self._controllers_enabled_at_root) + + def get_cgroup_from_relative_path(self, relative_path, cgroup_name): + cgroup_path = "" + if self._root_cgroup_path != "": + cgroup_path = os.path.join(self._root_cgroup_path, relative_path) + + return CgroupV2(cgroup_name=cgroup_name, root_cgroup_path=self._root_cgroup_path, cgroup_path=cgroup_path, enabled_controllers=self._controllers_enabled_at_root) + + def get_process_cgroup(self, process_id, cgroup_name): + relative_path = self._get_process_relative_cgroup_path(process_id) + cgroup_path = "" + + if self._root_cgroup_path != "": + cgroup_path = os.path.join(self._root_cgroup_path, relative_path) + + return CgroupV2(cgroup_name=cgroup_name, root_cgroup_path=self._root_cgroup_path, cgroup_path=cgroup_path, enabled_controllers=self._controllers_enabled_at_root) + + def log_root_paths(self): + log_cgroup_info("The root cgroup path is {0}".format(self._root_cgroup_path), send_event=False) + for controller in CgroupV2.get_supported_controllers(): + if controller in self._controllers_enabled_at_root: + log_cgroup_info("The {0} controller is enabled at the root cgroup".format(controller), send_event=False) + else: + log_cgroup_info("The {0} controller is not enabled at the root cgroup".format(controller), send_event=False) def start_extension_command(self, extension_name, command, cmd_name, timeout, shell, cwd, env, stdout, stderr, error_code=ExtensionErrorCodes.PluginUnknownFailure): raise NotImplementedError() + + +class Cgroup(object): + MEMORY_CONTROLLER = "memory" + + def __init__(self, cgroup_name): + self._cgroup_name = cgroup_name + + @staticmethod + def get_supported_controllers(): + """ + Cgroup version specific. Returns a list of the controllers which the agent supports. + """ + raise NotImplementedError() + + def check_in_expected_slice(self, expected_slice): + """ + Cgroup version specific. Returns True if the cgroup is in the expected slice, False otherwise. + + :param expected_slice: The slice the cgroup is expected to be in. + """ + raise NotImplementedError() + + def get_controller_metrics(self, expected_relative_path=None): + """ + Cgroup version specific. Returns a list of the metrics for the agent supported controllers which are + mounted/enabled for the cgroup. + + :param expected_relative_path: The expected relative path of the cgroup. If provided, only metrics for controllers at this expected path will be returned. + """ + raise NotImplementedError() + + def get_processes(self): + """ + Cgroup version specific. Returns a list of all the process ids in the cgroup. + """ + raise NotImplementedError() + + +class CgroupV1(Cgroup): + CPU_CONTROLLER = "cpu,cpuacct" + + def __init__(self, cgroup_name, controller_mountpoints, controller_paths): + """ + :param cgroup_name: The name of the cgroup. Used for logging/tracking purposes. + :param controller_mountpoints: A dictionary of controller-mountpoint mappings for each agent supported controller which is mounted. + :param controller_paths: A dictionary of controller-path mappings for each agent supported controller which is mounted. The path represents the absolute path of the controller. + """ + super(CgroupV1, self).__init__(cgroup_name=cgroup_name) + self._controller_mountpoints = controller_mountpoints + self._controller_paths = controller_paths + + @staticmethod + def get_supported_controllers(): + return [CgroupV1.CPU_CONTROLLER, CgroupV1.MEMORY_CONTROLLER] + + def check_in_expected_slice(self, expected_slice): + in_expected_slice = True + for controller, path in self._controller_paths.items(): + if expected_slice not in path: + log_cgroup_warning("The {0} controller for the {1} cgroup is not mounted in the expected slice. Expected slice: {2}. Actual controller path: {3}".format(controller, self._cgroup_name, expected_slice, path), send_event=False) + in_expected_slice = False + + return in_expected_slice + + def get_controller_metrics(self, expected_relative_path=None): + metrics = [] + + for controller in self.get_supported_controllers(): + controller_metrics = None + controller_path = self._controller_paths.get(controller) + controller_mountpoint = self._controller_mountpoints.get(controller) + + if controller_mountpoint is None: + log_cgroup_warning("{0} controller is not mounted; will not track metrics".format(controller), send_event=False) + continue + + if controller_path is None: + log_cgroup_warning("{0} is not mounted for the {1} cgroup; will not track metrics".format(controller, self._cgroup_name), send_event=False) + continue + + if expected_relative_path is not None: + expected_path = os.path.join(controller_mountpoint, expected_relative_path) + if controller_path != expected_path: + log_cgroup_warning("The {0} controller is not mounted at the expected path for the {1} cgroup; will not track metrics. Actual cgroup path:[{2}] Expected:[{3}]".format(controller, self._cgroup_name, controller_path, expected_path), send_event=False) + continue + + if controller == self.CPU_CONTROLLER: + controller_metrics = CpuMetrics(self._cgroup_name, controller_path) + elif controller == self.MEMORY_CONTROLLER: + controller_metrics = MemoryMetrics(self._cgroup_name, controller_path) + + if controller_metrics is not None: + msg = "{0} metrics for cgroup: {1}".format(controller, controller_metrics) + log_cgroup_info(msg, send_event=False) + metrics.append(controller_metrics) + + return metrics + + def get_controller_procs_path(self, controller): + controller_path = self._controller_paths.get(controller) + if controller_path is not None and controller_path != "": + return os.path.join(controller_path, "cgroup.procs") + return "" + + def get_processes(self): + pids = set() + for controller in self._controller_paths.keys(): + procs_path = self.get_controller_procs_path(controller) + if os.path.exists(procs_path): + with open(procs_path, "r") as cgroup_procs: + for pid in cgroup_procs.read().split(): + pids.add(int(pid)) + return list(pids) + + +class CgroupV2(Cgroup): + CPU_CONTROLLER = "cpu" + + def __init__(self, cgroup_name, root_cgroup_path, cgroup_path, enabled_controllers): + """ + :param cgroup_name: The name of the cgroup. Used for logging/tracking purposes. + :param root_cgroup_path: A string representing the root cgroup path. String can be empty. + :param cgroup_path: A string representing the absolute cgroup path. String can be empty. + :param enabled_controllers: A list of strings representing the agent supported controllers enabled at the root cgroup. + """ + super(CgroupV2, self).__init__(cgroup_name) + self._root_cgroup_path = root_cgroup_path + self._cgroup_path = cgroup_path + self._enabled_controllers = enabled_controllers + + @staticmethod + def get_supported_controllers(): + return [CgroupV2.CPU_CONTROLLER, CgroupV2.MEMORY_CONTROLLER] + + def check_in_expected_slice(self, expected_slice): + if expected_slice not in self._cgroup_path: + log_cgroup_warning("The {0} cgroup is not in the expected slice. Expected slice: {1}. Actual cgroup path: {2}".format(self._cgroup_name, expected_slice, self._cgroup_path), send_event=False) + return False + + return True + + def get_controller_metrics(self, expected_relative_path=None): + # TODO - Implement controller metrics for cgroup v2 + raise NotImplementedError() + + def get_procs_path(self): + if self._cgroup_path != "": + return os.path.join(self._cgroup_path, "cgroup.procs") + return "" + + def get_processes(self): + pids = set() + procs_path = self.get_procs_path() + if os.path.exists(procs_path): + with open(procs_path, "r") as cgroup_procs: + for pid in cgroup_procs.read().split(): + pids.add(int(pid)) + return list(pids) + + diff --git a/azurelinuxagent/ga/cgroupconfigurator.py b/azurelinuxagent/ga/cgroupconfigurator.py index 72d5329f92..a36b9dae10 100644 --- a/azurelinuxagent/ga/cgroupconfigurator.py +++ b/azurelinuxagent/ga/cgroupconfigurator.py @@ -23,7 +23,7 @@ from azurelinuxagent.common import conf from azurelinuxagent.common import logger -from azurelinuxagent.ga.cgroup import CpuCgroup, AGENT_NAME_TELEMETRY, MetricsCounter, MemoryCgroup +from azurelinuxagent.ga.controllermetrics import CpuMetrics, AGENT_NAME_TELEMETRY, MetricsCounter, MemoryMetrics from azurelinuxagent.ga.cgroupapi import SystemdRunError, EXTENSION_SLICE_PREFIX, CGroupUtil, SystemdCgroupApiv2, \ log_cgroup_info, log_cgroup_warning, get_cgroup_api, InvalidCgroupMountpointException from azurelinuxagent.ga.cgroupstelemetry import CGroupsTelemetry @@ -130,9 +130,8 @@ def __init__(self): self._agent_cgroups_enabled = False self._extensions_cgroups_enabled = False self._cgroups_api = None - self._agent_cpu_cgroup_path = None - self._agent_memory_cgroup_path = None - self._agent_memory_cgroup = None + self._agent_cgroup = None + self._agent_memory_metrics = None self._check_cgroups_lock = threading.RLock() # Protect the check_cgroups which is called from Monitor thread and main loop. def initialize(self): @@ -189,28 +188,30 @@ def initialize(self): self.__setup_azure_slice() - cpu_controller_root, memory_controller_root = self.__get_cgroup_controller_roots() - self._agent_cpu_cgroup_path, self._agent_memory_cgroup_path = self.__get_agent_cgroup_paths(agent_slice, - cpu_controller_root, - memory_controller_root) + # Log mount points/root paths for cgroup controllers + self._cgroups_api.log_root_paths() + + # Get agent cgroup + self._agent_cgroup = self._cgroups_api.get_process_cgroup(process_id="self", cgroup_name=AGENT_NAME_TELEMETRY) if conf.get_cgroup_disable_on_process_check_failure() and self._check_fails_if_processes_found_in_agent_cgroup_before_enable(agent_slice): reason = "Found unexpected processes in the agent cgroup before agent enable cgroups." self.disable(reason, DisableCgroups.ALL) return - if self._agent_cpu_cgroup_path is not None or self._agent_memory_cgroup_path is not None: + # Get metrics to track + metrics = self._agent_cgroup.get_controller_metrics(expected_relative_path=os.path.join(agent_slice, systemd.get_agent_unit_name())) + if len(metrics) > 0: self.enable() - if self._agent_cpu_cgroup_path is not None: - log_cgroup_info("Agent CPU cgroup: {0}".format(self._agent_cpu_cgroup_path)) - self.__set_cpu_quota(conf.get_agent_cpu_quota()) - CGroupsTelemetry.track_cgroup(CpuCgroup(AGENT_NAME_TELEMETRY, self._agent_cpu_cgroup_path)) - - if self._agent_memory_cgroup_path is not None: - log_cgroup_info("Agent Memory cgroup: {0}".format(self._agent_memory_cgroup_path)) - self._agent_memory_cgroup = MemoryCgroup(AGENT_NAME_TELEMETRY, self._agent_memory_cgroup_path) - CGroupsTelemetry.track_cgroup(self._agent_memory_cgroup) + for metric in metrics: + for prop in metric.get_unit_properties(): + log_cgroup_info('{0}: {1}'.format(prop, systemd.get_unit_property(systemd.get_agent_unit_name(), prop))) + if isinstance(metric, CpuMetrics): + self.__set_cpu_quota(conf.get_agent_cpu_quota()) + elif isinstance(metric, MemoryMetrics): + self._agent_memory_metrics = metric + CGroupsTelemetry.track_cgroup(metric) except Exception as exception: log_cgroup_warning("Error initializing cgroups: {0}".format(ustr(exception))) @@ -229,21 +230,6 @@ def __check_no_legacy_cgroups(self): return False return True - def __get_cgroup_controller_roots(self): - cpu_controller_root, memory_controller_root = self._cgroups_api.get_controller_root_paths() - - if cpu_controller_root is not None: - log_cgroup_info("The CPU cgroup controller root path is {0}".format(cpu_controller_root), send_event=False) - else: - log_cgroup_warning("The CPU cgroup controller is not mounted or enabled") - - if memory_controller_root is not None: - log_cgroup_info("The memory cgroup controller root path is {0}".format(memory_controller_root), send_event=False) - else: - log_cgroup_warning("The memory cgroup controller is not mounted or enabled") - - return cpu_controller_root, memory_controller_root - @staticmethod def __setup_azure_slice(): """ @@ -416,47 +402,6 @@ def is_extension_resource_limits_setup_completed(self, extension_name, cpu_quota return True return False - def __get_agent_cgroup_paths(self, agent_slice, cpu_controller_root, memory_controller_root): - agent_unit_name = systemd.get_agent_unit_name() - - expected_relative_path = os.path.join(agent_slice, agent_unit_name) - cpu_cgroup_relative_path, memory_cgroup_relative_path = self._cgroups_api.get_process_cgroup_relative_paths( - "self") - - if cpu_cgroup_relative_path is None: - log_cgroup_warning("The agent's process is not within a CPU cgroup") - else: - if cpu_cgroup_relative_path == expected_relative_path: - log_cgroup_info('CPUAccounting: {0}'.format(systemd.get_unit_property(agent_unit_name, "CPUAccounting"))) - log_cgroup_info('CPUQuota: {0}'.format(systemd.get_unit_property(agent_unit_name, "CPUQuotaPerSecUSec"))) - else: - log_cgroup_warning( - "The Agent is not in the expected CPU cgroup; will not enable monitoring. Cgroup:[{0}] Expected:[{1}]".format(cpu_cgroup_relative_path, expected_relative_path)) - cpu_cgroup_relative_path = None # Set the path to None to prevent monitoring - - if memory_cgroup_relative_path is None: - log_cgroup_warning("The agent's process is not within a memory cgroup") - else: - if memory_cgroup_relative_path == expected_relative_path: - memory_accounting = systemd.get_unit_property(agent_unit_name, "MemoryAccounting") - log_cgroup_info('MemoryAccounting: {0}'.format(memory_accounting)) - else: - log_cgroup_warning( - "The Agent is not in the expected memory cgroup; will not enable monitoring. CGroup:[{0}] Expected:[{1}]".format(memory_cgroup_relative_path, expected_relative_path)) - memory_cgroup_relative_path = None # Set the path to None to prevent monitoring - - if cpu_controller_root is not None and cpu_cgroup_relative_path is not None: - agent_cpu_cgroup_path = os.path.join(cpu_controller_root, cpu_cgroup_relative_path) - else: - agent_cpu_cgroup_path = None - - if memory_controller_root is not None and memory_cgroup_relative_path is not None: - agent_memory_cgroup_path = os.path.join(memory_controller_root, memory_cgroup_relative_path) - else: - agent_memory_cgroup_path = None - - return agent_cpu_cgroup_path, agent_memory_cgroup_path - def supported(self): return self._cgroups_supported @@ -496,7 +441,11 @@ def disable(self, reason, disable_cgroups): elif disable_cgroups == DisableCgroups.AGENT: # disable agent self._agent_cgroups_enabled = False self.__reset_agent_cpu_quota() - CGroupsTelemetry.stop_tracking(CpuCgroup(AGENT_NAME_TELEMETRY, self._agent_cpu_cgroup_path)) + agent_metrics = self._agent_cgroup.get_controller_metrics() + for metric in agent_metrics: + if isinstance(metric, CpuMetrics): + CGroupsTelemetry.stop_tracking(metric) + break log_cgroup_warning("Disabling resource usage monitoring. Reason: {0}".format(reason), op=WALAEventOperation.CGroupsDisabled) @@ -612,11 +561,7 @@ def _check_processes_in_agent_cgroup(self): """ unexpected = [] agent_cgroup_proc_names = [] - # Now we call _check_processes_in_agent_cgroup before we enable the cgroups or any one of the controller is not mounted, agent cgroup paths can be None. - # so we need to check both. - cgroup_path = self._agent_cpu_cgroup_path if self._agent_cpu_cgroup_path is not None else self._agent_memory_cgroup_path - if cgroup_path is None: - return + try: daemon = os.getppid() extension_handler = os.getpid() @@ -624,12 +569,12 @@ def _check_processes_in_agent_cgroup(self): agent_commands.update(shellutil.get_running_commands()) systemd_run_commands = set() systemd_run_commands.update(self._cgroups_api.get_systemd_run_commands()) - agent_cgroup = self._cgroups_api.get_processes_in_cgroup(cgroup_path) + agent_cgroup_proccesses = self._agent_cgroup.get_processes() # get the running commands again in case new commands started or completed while we were fetching the processes in the cgroup; agent_commands.update(shellutil.get_running_commands()) systemd_run_commands.update(self._cgroups_api.get_systemd_run_commands()) - for process in agent_cgroup: + for process in agent_cgroup_proccesses: agent_cgroup_proc_names.append(self.__format_process(process)) # Note that the agent uses systemd-run to start extensions; systemd-run belongs to the agent cgroup, though the extensions don't. if process in (daemon, extension_handler) or process in systemd_run_commands: @@ -753,8 +698,8 @@ def _check_agent_throttled_time(cgroup_metrics): raise CGroupsException("The agent has been throttled for {0} seconds".format(metric.value)) def check_agent_memory_usage(self): - if self.enabled() and self._agent_memory_cgroup: - metrics = self._agent_memory_cgroup.get_tracked_metrics() + if self.enabled() and self._agent_memory_metrics is not None: + metrics = self._agent_memory_metrics.get_tracked_metrics() current_usage = 0 for metric in metrics: if metric.counter == MetricsCounter.TOTAL_MEM_USAGE: @@ -780,59 +725,37 @@ def _get_parent(pid): return 0 def start_tracking_unit_cgroups(self, unit_name): - """ - TODO: Start tracking Memory Cgroups - """ try: - cpu_cgroup_path, memory_cgroup_path = self._cgroups_api.get_unit_cgroup_paths(unit_name) - - if cpu_cgroup_path is None: - log_cgroup_info("The CPU controller is not mounted or enabled; will not track resource usage", send_event=False) - else: - CGroupsTelemetry.track_cgroup(CpuCgroup(unit_name, cpu_cgroup_path)) + cgroup = self._cgroups_api.get_unit_cgroup(unit_name, unit_name) + metrics = cgroup.get_controller_metrics() - if memory_cgroup_path is None: - log_cgroup_info("The Memory controller is not mounted or enabled; will not track resource usage", send_event=False) - else: - CGroupsTelemetry.track_cgroup(MemoryCgroup(unit_name, memory_cgroup_path)) + for metric in metrics: + CGroupsTelemetry.track_cgroup(metric) except Exception as exception: log_cgroup_info("Failed to start tracking resource usage for the extension: {0}".format(ustr(exception)), send_event=False) def stop_tracking_unit_cgroups(self, unit_name): - """ - TODO: remove Memory cgroups from tracked list. - """ try: - cpu_cgroup_path, memory_cgroup_path = self._cgroups_api.get_unit_cgroup_paths(unit_name) - - if cpu_cgroup_path is not None: - CGroupsTelemetry.stop_tracking(CpuCgroup(unit_name, cpu_cgroup_path)) + cgroup = self._cgroups_api.get_unit_cgroup(unit_name, unit_name) + metrics = cgroup.get_controller_metrics() - if memory_cgroup_path is not None: - CGroupsTelemetry.stop_tracking(MemoryCgroup(unit_name, memory_cgroup_path)) + for metric in metrics: + CGroupsTelemetry.stop_tracking(metric) except Exception as exception: log_cgroup_info("Failed to stop tracking resource usage for the extension service: {0}".format(ustr(exception)), send_event=False) def stop_tracking_extension_cgroups(self, extension_name): - """ - TODO: remove extension Memory cgroups from tracked list - """ try: extension_slice_name = CGroupUtil.get_extension_slice_name(extension_name) - cgroup_relative_path = os.path.join(_AZURE_VMEXTENSIONS_SLICE, - extension_slice_name) - - cpu_root_path, memory_root_path = self._cgroups_api.get_controller_root_paths() - cpu_cgroup_path = os.path.join(cpu_root_path, cgroup_relative_path) - memory_cgroup_path = os.path.join(memory_root_path, cgroup_relative_path) + cgroup_relative_path = os.path.join(_AZURE_VMEXTENSIONS_SLICE, extension_slice_name) - if cpu_cgroup_path is not None: - CGroupsTelemetry.stop_tracking(CpuCgroup(extension_name, cpu_cgroup_path)) - - if memory_cgroup_path is not None: - CGroupsTelemetry.stop_tracking(MemoryCgroup(extension_name, memory_cgroup_path)) + cgroup = self._cgroups_api.get_cgroup_from_relative_path(relative_path=cgroup_relative_path, + cgroup_name=extension_name) + metrics = cgroup.get_controller_metrics() + for metric in metrics: + CGroupsTelemetry.stop_tracking(metric) except Exception as exception: log_cgroup_info("Failed to stop tracking resource usage for the extension service: {0}".format(ustr(exception)), send_event=False) diff --git a/azurelinuxagent/ga/cgroupstelemetry.py b/azurelinuxagent/ga/cgroupstelemetry.py index 5943b45ade..e8efad0382 100644 --- a/azurelinuxagent/ga/cgroupstelemetry.py +++ b/azurelinuxagent/ga/cgroupstelemetry.py @@ -17,7 +17,7 @@ import threading from azurelinuxagent.common import logger -from azurelinuxagent.ga.cgroup import CpuCgroup +from azurelinuxagent.ga.controllermetrics import CpuMetrics from azurelinuxagent.common.future import ustr @@ -41,7 +41,7 @@ def track_cgroup(cgroup): """ Adds the given item to the dictionary of tracked cgroups """ - if isinstance(cgroup, CpuCgroup): + if isinstance(cgroup, CpuMetrics): # set the current cpu usage cgroup.initialize_cpu_usage() diff --git a/azurelinuxagent/ga/collect_logs.py b/azurelinuxagent/ga/collect_logs.py index d82933e963..d8ea3dba3d 100644 --- a/azurelinuxagent/ga/collect_logs.py +++ b/azurelinuxagent/ga/collect_logs.py @@ -25,7 +25,7 @@ import azurelinuxagent.common.conf as conf from azurelinuxagent.common import logger -from azurelinuxagent.ga.cgroup import MetricsCounter +from azurelinuxagent.ga.controllermetrics import MetricsCounter from azurelinuxagent.common.event import elapsed_milliseconds, add_event, WALAEventOperation, report_metric from azurelinuxagent.common.future import ustr from azurelinuxagent.ga.interfaces import ThreadHandlerInterface diff --git a/azurelinuxagent/ga/cgroup.py b/azurelinuxagent/ga/controllermetrics.py similarity index 96% rename from azurelinuxagent/ga/cgroup.py rename to azurelinuxagent/ga/controllermetrics.py index b2bf32fbc1..3aaeab3193 100644 --- a/azurelinuxagent/ga/cgroup.py +++ b/azurelinuxagent/ga/controllermetrics.py @@ -88,7 +88,7 @@ class MetricsCounter(object): re_user_system_times = re.compile(r'user (\d+)\nsystem (\d+)\n') -class CGroup(object): +class ControllerMetrics(object): def __init__(self, name, cgroup_path): """ Initialize _data collection for the Memory controller @@ -169,10 +169,16 @@ def get_tracked_metrics(self, **_): """ raise NotImplementedError() + def get_unit_properties(self): + """ + Returns a list of the unit properties to collect for the controller. + """ + raise NotImplementedError() -class CpuCgroup(CGroup): + +class CpuMetrics(ControllerMetrics): def __init__(self, name, cgroup_path): - super(CpuCgroup, self).__init__(name, cgroup_path) + super(CpuMetrics, self).__init__(name, cgroup_path) self._osutil = get_osutil() self._previous_cgroup_cpu = None @@ -306,10 +312,13 @@ def get_tracked_metrics(self, **kwargs): return tracked + def get_unit_properties(self): + return ["CPUAccounting", "CPUQuotaPerSecUSec"] + -class MemoryCgroup(CGroup): +class MemoryMetrics(ControllerMetrics): def __init__(self, name, cgroup_path): - super(MemoryCgroup, self).__init__(name, cgroup_path) + super(MemoryMetrics, self).__init__(name, cgroup_path) self._counter_not_found_error_count = 0 @@ -390,3 +399,6 @@ def get_tracked_metrics(self, **_): MetricValue(MetricsCategory.MEMORY_CATEGORY, MetricsCounter.SWAP_MEM_USAGE, self.name, self.try_swap_memory_usage(), _REPORT_EVERY_HOUR) ] + + def get_unit_properties(self): + return["MemoryAccounting"] diff --git a/azurelinuxagent/ga/extensionprocessutil.py b/azurelinuxagent/ga/extensionprocessutil.py index d2b37551ba..9061fd3fff 100644 --- a/azurelinuxagent/ga/extensionprocessutil.py +++ b/azurelinuxagent/ga/extensionprocessutil.py @@ -31,7 +31,7 @@ TELEMETRY_MESSAGE_MAX_LEN = 3200 -def wait_for_process_completion_or_timeout(process, timeout, cpu_cgroup): +def wait_for_process_completion_or_timeout(process, timeout, cpu_metrics): """ Utility function that waits for the process to complete within the given time frame. This function will terminate the process if when the given time frame elapses. @@ -47,7 +47,7 @@ def wait_for_process_completion_or_timeout(process, timeout, cpu_cgroup): throttled_time = 0 if timeout == 0: - throttled_time = get_cpu_throttled_time(cpu_cgroup) + throttled_time = get_cpu_throttled_time(cpu_metrics) os.killpg(os.getpgid(process.pid), signal.SIGKILL) else: # process completed or forked; sleep 1 sec to give the child process (if any) a chance to start @@ -57,7 +57,7 @@ def wait_for_process_completion_or_timeout(process, timeout, cpu_cgroup): return timeout == 0, return_code, throttled_time -def handle_process_completion(process, command, timeout, stdout, stderr, error_code, cpu_cgroup=None): +def handle_process_completion(process, command, timeout, stdout, stderr, error_code, cpu_metrics=None): """ Utility function that waits for process completion and retrieves its output (stdout and stderr) if it completed before the timeout period. Otherwise, the process will get killed and an ExtensionError will be raised. @@ -68,15 +68,15 @@ def handle_process_completion(process, command, timeout, stdout, stderr, error_c :param stdout: Must be a file since we seek on it when parsing the subprocess output :param stderr: Must be a file since we seek on it when parsing the subprocess outputs :param error_code: The error code to set if we raise an ExtensionError - :param cpu_cgroup: Reference the cpu cgroup name and path + :param cpu_metrics: References the cpu metrics for the cgroup :return: """ # Wait for process completion or timeout - timed_out, return_code, throttled_time = wait_for_process_completion_or_timeout(process, timeout, cpu_cgroup) + timed_out, return_code, throttled_time = wait_for_process_completion_or_timeout(process, timeout, cpu_metrics) process_output = read_output(stdout, stderr) if timed_out: - if cpu_cgroup is not None: # Report CPUThrottledTime when timeout happens + if cpu_metrics is not None: # Report CPUThrottledTime when timeout happens raise ExtensionError("Timeout({0});CPUThrottledTime({1}secs): {2}\n{3}".format(timeout, throttled_time, command, process_output), code=ExtensionErrorCodes.PluginHandlerScriptTimedout) @@ -211,14 +211,14 @@ def to_s(captured_stdout, stdout_offset, captured_stderr, stderr_offset): return to_s(stdout, -1*max_len_each, stderr, -1*max_len_each) -def get_cpu_throttled_time(cpu_cgroup): +def get_cpu_throttled_time(cpu_metrics): """ return the throttled time for the given cgroup. """ throttled_time = 0 - if cpu_cgroup is not None: + if cpu_metrics is not None: try: - throttled_time = cpu_cgroup.get_cpu_throttled_time(read_previous_throttled_time=False) + throttled_time = cpu_metrics.get_cpu_throttled_time(read_previous_throttled_time=False) except Exception as e: logger.warn("Failed to get cpu throttled time for the extension: {0}", ustr(e)) diff --git a/azurelinuxagent/ga/monitor.py b/azurelinuxagent/ga/monitor.py index a5ff29aa01..f34192be72 100644 --- a/azurelinuxagent/ga/monitor.py +++ b/azurelinuxagent/ga/monitor.py @@ -22,7 +22,7 @@ import azurelinuxagent.common.conf as conf import azurelinuxagent.common.logger as logger import azurelinuxagent.common.utils.networkutil as networkutil -from azurelinuxagent.ga.cgroup import MetricValue, MetricsCategory, MetricsCounter +from azurelinuxagent.ga.controllermetrics import MetricValue, MetricsCategory, MetricsCounter from azurelinuxagent.ga.cgroupconfigurator import CGroupConfigurator from azurelinuxagent.ga.cgroupstelemetry import CGroupsTelemetry from azurelinuxagent.common.errorstate import ErrorState diff --git a/tests/common/utils/test_extension_process_util.py b/tests/common/utils/test_extension_process_util.py index 316bad6a37..7771de4fd2 100644 --- a/tests/common/utils/test_extension_process_util.py +++ b/tests/common/utils/test_extension_process_util.py @@ -19,7 +19,7 @@ import subprocess import tempfile -from azurelinuxagent.ga.cgroup import CpuCgroup +from azurelinuxagent.ga.controllermetrics import CpuMetrics from azurelinuxagent.common.exception import ExtensionError, ExtensionErrorCodes from azurelinuxagent.common.future import ustr from azurelinuxagent.ga.extensionprocessutil import format_stdout_stderr, read_output, \ @@ -52,7 +52,7 @@ def test_wait_for_process_completion_or_timeout_should_terminate_cleanly(self): stdout=subprocess.PIPE, stderr=subprocess.PIPE) - timed_out, ret, _ = wait_for_process_completion_or_timeout(process=process, timeout=5, cpu_cgroup=None) + timed_out, ret, _ = wait_for_process_completion_or_timeout(process=process, timeout=5, cpu_metrics=None) self.assertEqual(timed_out, False) self.assertEqual(ret, 0) @@ -70,7 +70,8 @@ def test_wait_for_process_completion_or_timeout_should_kill_process_on_timeout(s # We don't actually mock the kill, just wrap it so we can assert its call count with patch('azurelinuxagent.ga.extensionprocessutil.os.killpg', wraps=os.killpg) as patch_kill: with patch('time.sleep') as mock_sleep: - timed_out, ret, _ = wait_for_process_completion_or_timeout(process=process, timeout=timeout, cpu_cgroup=None) + timed_out, ret, _ = wait_for_process_completion_or_timeout(process=process, timeout=timeout, + cpu_metrics=None) # We're mocking sleep to avoid prolonging the test execution time, but we still want to make sure # we're "waiting" the correct amount of time before killing the process @@ -89,7 +90,7 @@ def test_handle_process_completion_should_return_nonzero_when_process_fails(self stdout=subprocess.PIPE, stderr=subprocess.PIPE) - timed_out, ret, _ = wait_for_process_completion_or_timeout(process=process, timeout=5, cpu_cgroup=None) + timed_out, ret, _ = wait_for_process_completion_or_timeout(process=process, timeout=5, cpu_metrics=None) self.assertEqual(timed_out, False) self.assertEqual(ret, 2) @@ -105,12 +106,8 @@ def test_handle_process_completion_should_return_process_output(self): stderr=stderr, preexec_fn=os.setsid) - process_output = handle_process_completion(process=process, - command=command, - timeout=5, - stdout=stdout, - stderr=stderr, - error_code=42) + process_output = handle_process_completion(process=process, command=command, timeout=5, stdout=stdout, + stderr=stderr, error_code=42) expected_output = "[stdout]\ndummy stdout\n\n\n[stderr]\ndummy stderr\n" self.assertEqual(process_output, expected_output) @@ -130,12 +127,8 @@ def test_handle_process_completion_should_raise_on_timeout(self): stderr=stderr, preexec_fn=os.setsid) - handle_process_completion(process=process, - command=command, - timeout=timeout, - stdout=stdout, - stderr=stderr, - error_code=42) + handle_process_completion(process=process, command=command, timeout=timeout, stdout=stdout, + stderr=stderr, error_code=42) # We're mocking sleep to avoid prolonging the test execution time, but we still want to make sure # we're "waiting" the correct amount of time before killing the process and raising an exception @@ -158,7 +151,7 @@ def test_handle_process_completion_should_log_throttled_time_on_timeout(self): test_file = os.path.join(self.tmp_dir, "cpu.stat") shutil.copyfile(os.path.join(data_dir, "cgroups", "cpu.stat_t0"), test_file) # throttled_time = 50 - cgroup = CpuCgroup("test", self.tmp_dir) + cgroup = CpuMetrics("test", self.tmp_dir) process = subprocess.Popen(command, # pylint: disable=subprocess-popen-preexec-fn shell=True, cwd=self.tmp_dir, @@ -167,13 +160,8 @@ def test_handle_process_completion_should_log_throttled_time_on_timeout(self): stderr=stderr, preexec_fn=os.setsid) - handle_process_completion(process=process, - command=command, - timeout=timeout, - stdout=stdout, - stderr=stderr, - error_code=42, - cpu_cgroup=cgroup) + handle_process_completion(process=process, command=command, timeout=timeout, stdout=stdout, + stderr=stderr, error_code=42, cpu_metrics=cgroup) # We're mocking sleep to avoid prolonging the test execution time, but we still want to make sure # we're "waiting" the correct amount of time before killing the process and raising an exception @@ -200,11 +188,7 @@ def test_handle_process_completion_should_raise_on_nonzero_exit_code(self): stderr=stderr, preexec_fn=os.setsid) - handle_process_completion(process=process, - command=command, - timeout=4, - stdout=stdout, - stderr=stderr, + handle_process_completion(process=process, command=command, timeout=4, stdout=stdout, stderr=stderr, error_code=error_code) self.assertEqual(context_manager.exception.code, error_code) diff --git a/tests/data/cgroups/cgroup.procs b/tests/data/cgroups/cgroup.procs new file mode 100644 index 0000000000..93c25c16df --- /dev/null +++ b/tests/data/cgroups/cgroup.procs @@ -0,0 +1,3 @@ +123 +234 +345 \ No newline at end of file diff --git a/tests/ga/test_cgroupapi.py b/tests/ga/test_cgroupapi.py index ec077c90a0..717adbb6f0 100644 --- a/tests/ga/test_cgroupapi.py +++ b/tests/ga/test_cgroupapi.py @@ -24,10 +24,11 @@ from azurelinuxagent.common.exception import CGroupsException from azurelinuxagent.ga.cgroupapi import SystemdCgroupApiv1, SystemdCgroupApiv2, CGroupUtil, get_cgroup_api, \ - InvalidCgroupMountpointException + InvalidCgroupMountpointException, CgroupV1, CgroupV2 from azurelinuxagent.ga.cgroupstelemetry import CGroupsTelemetry from azurelinuxagent.common.osutil import systemd from azurelinuxagent.common.utils import fileutil +from azurelinuxagent.ga.controllermetrics import CpuMetrics, MemoryMetrics from tests.lib.mock_cgroup_environment import mock_cgroup_v1_environment, mock_cgroup_v2_environment, \ mock_cgroup_hybrid_environment from tests.lib.mock_environment import MockCommand @@ -85,7 +86,7 @@ def test_cgroups_should_be_supported_only_on_ubuntu16_centos7dot4_redhat7dot4_an class SystemdCgroupsApiTestCase(AgentTestCase): - def test_get_cgroup_api_raises_exception_when_systemd_mount_point_does_not_exist(self): + def test_get_cgroup_api_raises_exception_when_systemd_mountpoint_does_not_exist(self): with mock_cgroup_v1_environment(self.tmp_dir): # Mock os.path.exists to return False for the os.path.exists(CGROUP_FILE_SYSTEM_ROOT) check with patch("os.path.exists", return_value=False): @@ -151,106 +152,16 @@ def test_get_unit_property_should_return_the_value_of_the_given_property(self): class SystemdCgroupsApiv1TestCase(AgentTestCase): - def test_get_unit_cgroup_paths_should_return_the_cgroup_v1_mount_points(self): - with mock_cgroup_v1_environment(self.tmp_dir): - cpu, memory = get_cgroup_api().get_unit_cgroup_paths("extension.service") - self.assertIn(cpu, '/sys/fs/cgroup/cpu,cpuacct/system.slice/extension.service', - "The mount point for the CPU controller is incorrect") - self.assertIn(memory, '/sys/fs/cgroup/memory/system.slice/extension.service', - "The mount point for the memory controller is incorrect") - - def test_get_unit_cgroup_path_should_return_None_if_either_cgroup_v1_controller_not_mounted(self): - with mock_cgroup_v1_environment(self.tmp_dir): - with patch('azurelinuxagent.ga.cgroupapi.SystemdCgroupApiv1.get_controller_root_paths', return_value=('/sys/fs/cgroup/cpu,cpuacct', None)): - cpu, memory = get_cgroup_api().get_unit_cgroup_paths("extension.service") - self.assertIn(cpu, '/sys/fs/cgroup/cpu,cpuacct/system.slice/extension.service', - "The mount point for the CPU controller is incorrect") - self.assertIsNone(memory, - "The mount point for the memory controller is None so unit cgroup should be None") - - with patch('azurelinuxagent.ga.cgroupapi.SystemdCgroupApiv1.get_controller_root_paths', return_value=(None, '/sys/fs/cgroup/memory')): - cpu, memory = get_cgroup_api().get_unit_cgroup_paths("extension.service") - self.assertIsNone(cpu, "The mount point for the cpu controller is None so unit cgroup should be None") - self.assertIn(memory, '/sys/fs/cgroup/memory/system.slice/extension.service', - "The mount point for the memory controller is incorrect") - - def test_get_process_cgroup_paths_should_return_the_cgroup_v1_mount_points(self): - with mock_cgroup_v1_environment(self.tmp_dir): - cpu, memory = get_cgroup_api().get_process_cgroup_paths("self") - self.assertIn(cpu, '/sys/fs/cgroup/cpu,cpuacct/system.slice/walinuxagent.service', - "The mount point for the CPU controller is incorrect") - self.assertIn(memory, '/sys/fs/cgroup/memory/system.slice/walinuxagent.service', - "The mount point for the memory controller is incorrect") - - def test_get_process_cgroup_path_should_return_None_if_either_cgroup_v1_controller_not_mounted(self): - with mock_cgroup_v1_environment(self.tmp_dir): - with patch('azurelinuxagent.ga.cgroupapi.SystemdCgroupApiv1.get_controller_root_paths', return_value=('/sys/fs/cgroup/cpu,cpuacct', None)): - cpu, memory = get_cgroup_api().get_process_cgroup_paths("self") - self.assertIn(cpu, '/sys/fs/cgroup/cpu,cpuacct/system.slice/walinuxagent.service', - "The mount point for the CPU controller is incorrect") - self.assertIsNone(memory, - "The mount point for the memory controller is None so unit cgroup should be None") - - with patch('azurelinuxagent.ga.cgroupapi.SystemdCgroupApiv1.get_controller_root_paths', return_value=(None, '/sys/fs/cgroup/memory')): - cpu, memory = get_cgroup_api().get_process_cgroup_paths("self") - self.assertIsNone(cpu, "The mount point for the CPU controller is None so unit cgroup should be None") - self.assertIn(memory, '/sys/fs/cgroup/memory/system.slice/walinuxagent.service', - "The mount point for the memory controller is incorrect") - - def test_get_process_cgroup_v1_path_should_return_None_if_either_relative_path_is_None(self): - with mock_cgroup_v1_environment(self.tmp_dir): - with patch('azurelinuxagent.ga.cgroupapi.SystemdCgroupApiv1.get_process_cgroup_relative_paths', return_value=('system.slice/walinuxagent.service', None)): - cpu, memory = get_cgroup_api().get_process_cgroup_paths("self") - self.assertIn(cpu, '/sys/fs/cgroup/cpu,cpuacct/system.slice/walinuxagent.service', - "The mount point for the CPU controller is incorrect") - self.assertIsNone(memory, - "The relative cgroup path for the memory controller is None so unit cgroup should be None") - - with patch('azurelinuxagent.ga.cgroupapi.SystemdCgroupApiv1.get_process_cgroup_relative_paths', return_value=(None, 'system.slice/walinuxagent.service')): - cpu, memory = get_cgroup_api().get_process_cgroup_paths("self") - self.assertIsNone(cpu, "The relative cgroup path for the cpu controller is None so unit cgroup should be None") - self.assertIn(memory, '/sys/fs/cgroup/memory/system.slice/walinuxagent.service', - "The mount point for the memory controller is incorrect") - - def test_get_controller_root_paths_should_return_the_cgroup_v1_controller_mount_points(self): - with mock_cgroup_v1_environment(self.tmp_dir): - cpu, memory = get_cgroup_api().get_controller_root_paths() - self.assertEqual(cpu, '/sys/fs/cgroup/cpu,cpuacct', "The root cgroup for the CPU controller is incorrect") - self.assertEqual(memory, '/sys/fs/cgroup/memory', "The root cgroup for the memory controller is incorrect") - - def test_get_controller_root_paths_should_return_None_if_either_controller_not_mounted(self): - with mock_cgroup_v1_environment(self.tmp_dir): - with patch('azurelinuxagent.ga.cgroupapi.SystemdCgroupApiv1._get_controller_mountpoints', return_value={'memory': '/sys/fs/cgroup/memory', 'io': '/sys/fs/cgroup/io'}): - cpu, memory = get_cgroup_api().get_controller_root_paths() - self.assertIsNone(cpu, "The CPU controller is mot mounted, so the cpu controller path should be None") - self.assertEqual(memory, '/sys/fs/cgroup/memory', "The root cgroup for the memory controller is incorrect") - - with patch('azurelinuxagent.ga.cgroupapi.SystemdCgroupApiv1._get_controller_mountpoints', return_value={'cpu,cpuacct': '/sys/fs/cgroup/cpu,cpuacct', 'io': '/sys/fs/cgroup/io'}): - cpu, memory = get_cgroup_api().get_controller_root_paths() - self.assertIsNone(memory, "The memory controller is mot mounted, so the memory controller path should be None") - self.assertEqual(cpu, '/sys/fs/cgroup/cpu,cpuacct', "The root cgroup for the cpu controller is incorrect") - - def test_get_controller_mountpoints_should_return_all_controller_mount_points(self): + def test_get_controller_mountpoints_should_return_only_supported_controllers(self): with mock_cgroup_v1_environment(self.tmp_dir): cgroup_api = get_cgroup_api() # Expected value comes from findmnt output in the mocked environment self.assertEqual(cgroup_api._get_controller_mountpoints(), { - 'systemd': '/sys/fs/cgroup/systemd', - 'devices': '/sys/fs/cgroup/devices', - 'rdma': '/sys/fs/cgroup/rdma', - 'perf_event': '/sys/fs/cgroup/perf_event', - 'net_cls,net_prio': '/sys/fs/cgroup/net_cls,net_prio', - 'blkio': '/sys/fs/cgroup/blkio', - 'cpuset': '/sys/fs/cgroup/cpuset', - 'misc': '/sys/fs/cgroup/misc', 'cpu,cpuacct': '/sys/fs/cgroup/cpu,cpuacct', - 'memory': '/sys/fs/cgroup/memory', - 'freezer': '/sys/fs/cgroup/freezer', - 'hugetlb': '/sys/fs/cgroup/hugetlb', - 'pids': '/sys/fs/cgroup/pids', + 'memory': '/sys/fs/cgroup/memory' }, "The controller mountpoints are not correct") - def test_are_mountpoints_systemd_created_should_return_False_if_cpu_or_memory_are_not_systemd_mountpoints(self): + def test_are_mountpoints_systemd_created_should_return_False_if_mountpoints_are_not_systemd(self): with mock_cgroup_v1_environment(self.tmp_dir): with patch('azurelinuxagent.ga.cgroupapi.SystemdCgroupApiv1._get_controller_mountpoints', return_value={'cpu,cpuacct': '/custom/mountpoint/path', 'memory': '/custom/mountpoint/path'}): self.assertFalse(SystemdCgroupApiv1().are_mountpoints_systemd_created()) @@ -261,23 +172,123 @@ def test_are_mountpoints_systemd_created_should_return_False_if_cpu_or_memory_ar with patch('azurelinuxagent.ga.cgroupapi.SystemdCgroupApiv1._get_controller_mountpoints', return_value={'memory': '/custom/mountpoint/path'}): self.assertFalse(SystemdCgroupApiv1().are_mountpoints_systemd_created()) - def test_are_mountpoints_systemd_created_should_return_True_if_cpu_and_memory_are_systemd_mountpoints(self): + def test_are_mountpoints_systemd_created_should_return_True_if_mountpoints_are_systemd(self): with mock_cgroup_v1_environment(self.tmp_dir): - with patch('azurelinuxagent.ga.cgroupapi.SystemdCgroupApiv1._get_controller_mountpoints', return_value={'cpu,cpuacct': '/sys/fs/cgroup', 'memory': '/sys/fs/cgroup'}): - self.assertFalse(SystemdCgroupApiv1().are_mountpoints_systemd_created()) + with patch('azurelinuxagent.ga.cgroupapi.SystemdCgroupApiv1._get_controller_mountpoints', return_value={'cpu,cpuacct': '/sys/fs/cgroup/cpu,cpuacct', 'memory': '/sys/fs/cgroup/memory'}): + self.assertTrue(SystemdCgroupApiv1().are_mountpoints_systemd_created()) # are_mountpoints_systemd_created should only check controllers which are mounted - with patch('azurelinuxagent.ga.cgroupapi.SystemdCgroupApiv1._get_controller_mountpoints', return_value={'cpu,cpuacct': '/sys/fs/cgroup'}): - self.assertFalse(SystemdCgroupApiv1().are_mountpoints_systemd_created()) + with patch('azurelinuxagent.ga.cgroupapi.SystemdCgroupApiv1._get_controller_mountpoints', return_value={'cpu,cpuacct': '/sys/fs/cgroup/cpu,cpuacct'}): + self.assertTrue(SystemdCgroupApiv1().are_mountpoints_systemd_created()) - with patch('azurelinuxagent.ga.cgroupapi.SystemdCgroupApiv1._get_controller_mountpoints', return_value={'memory': '/sys/fs/cgroup'}): - self.assertFalse(SystemdCgroupApiv1().are_mountpoints_systemd_created()) + with patch('azurelinuxagent.ga.cgroupapi.SystemdCgroupApiv1._get_controller_mountpoints', return_value={'memory': '/sys/fs/cgroup/memory'}): + self.assertTrue(SystemdCgroupApiv1().are_mountpoints_systemd_created()) + + with patch('azurelinuxagent.ga.cgroupapi.SystemdCgroupApiv1._get_controller_mountpoints', return_value={}): + self.assertTrue(SystemdCgroupApiv1().are_mountpoints_systemd_created()) + + def test_get_relative_paths_for_process_should_return_the_cgroup_v1_relative_paths(self): + with mock_cgroup_v1_environment(self.tmp_dir): + relative_paths = get_cgroup_api()._get_process_relative_controller_paths('self') + self.assertEqual(len(relative_paths), 2) + self.assertEqual(relative_paths.get('cpu,cpuacct'), "system.slice/walinuxagent.service", "The relative path for the CPU cgroup is incorrect") + self.assertEqual(relative_paths.get('memory'), "system.slice/walinuxagent.service", "The relative memory for the memory cgroup is incorrect") + + def test_get_unit_cgroup_should_return_correct_paths_for_cgroup_v1(self): + with mock_cgroup_v1_environment(self.tmp_dir): + cgroup = get_cgroup_api().get_unit_cgroup(unit_name="extension.service", cgroup_name="extension") + self.assertIsInstance(cgroup, CgroupV1) + self.assertEqual(cgroup._cgroup_name, "extension") + self.assertEqual(cgroup._controller_mountpoints, {'cpu,cpuacct':'/sys/fs/cgroup/cpu,cpuacct', 'memory':'/sys/fs/cgroup/memory'}) + self.assertEqual(cgroup._controller_paths, {'cpu,cpuacct': '/sys/fs/cgroup/cpu,cpuacct/system.slice/extension.service', 'memory': '/sys/fs/cgroup/memory/system.slice/extension.service'}) - def test_get_cpu_and_memory_cgroup_relative_paths_for_process_should_return_the_cgroup_v1_relative_paths(self): + def test_get_unit_cgroup_should_return_only_mounted_controllers_v1(self): with mock_cgroup_v1_environment(self.tmp_dir): - cpu, memory = get_cgroup_api().get_process_cgroup_relative_paths('self') - self.assertEqual(cpu, "system.slice/walinuxagent.service", "The relative path for the CPU cgroup is incorrect") - self.assertEqual(memory, "system.slice/walinuxagent.service", "The relative memory for the CPU cgroup is incorrect") + with patch('azurelinuxagent.ga.cgroupapi.SystemdCgroupApiv1._get_controller_mountpoints', return_value={'cpu,cpuacct': '/sys/fs/cgroup/cpu,cpuacct'}): + cgroup = get_cgroup_api().get_unit_cgroup(unit_name="extension.service", cgroup_name="extension") + self.assertIsInstance(cgroup, CgroupV1) + self.assertEqual(cgroup._cgroup_name, "extension") + self.assertEqual(cgroup._controller_mountpoints, {'cpu,cpuacct':'/sys/fs/cgroup/cpu,cpuacct'}) + self.assertEqual(cgroup._controller_paths, {'cpu,cpuacct': '/sys/fs/cgroup/cpu,cpuacct/system.slice/extension.service'}) + + with patch('azurelinuxagent.ga.cgroupapi.SystemdCgroupApiv1._get_controller_mountpoints', return_value={}): + cgroup = get_cgroup_api().get_unit_cgroup(unit_name="extension.service", cgroup_name="extension") + self.assertIsInstance(cgroup, CgroupV1) + self.assertEqual(cgroup._cgroup_name, "extension") + self.assertEqual(cgroup._controller_mountpoints, {}) + self.assertEqual(cgroup._controller_paths, {}) + + def test_get_cgroup_from_relative_path_should_return_the_correct_paths_for_cgroup_v1(self): + with mock_cgroup_v1_environment(self.tmp_dir): + cgroup = get_cgroup_api().get_cgroup_from_relative_path(relative_path="some/relative/path", cgroup_name="test_cgroup") + self.assertIsInstance(cgroup, CgroupV1) + self.assertEqual(cgroup._cgroup_name, "test_cgroup") + self.assertEqual(cgroup._controller_mountpoints, + {'cpu,cpuacct': '/sys/fs/cgroup/cpu,cpuacct', 'memory': '/sys/fs/cgroup/memory'}) + self.assertEqual(cgroup._controller_paths, + {'cpu,cpuacct': '/sys/fs/cgroup/cpu,cpuacct/some/relative/path', + 'memory': '/sys/fs/cgroup/memory/some/relative/path'}) + + def test_get_cgroup_from_relative_path_should_return_only_mounted_controllers_v1(self): + with mock_cgroup_v1_environment(self.tmp_dir): + with patch('azurelinuxagent.ga.cgroupapi.SystemdCgroupApiv1._get_controller_mountpoints', return_value={'cpu,cpuacct': '/sys/fs/cgroup/cpu,cpuacct'}): + cgroup = get_cgroup_api().get_cgroup_from_relative_path(relative_path="some/relative/path", cgroup_name="test_cgroup") + self.assertIsInstance(cgroup, CgroupV1) + self.assertEqual(cgroup._cgroup_name, "test_cgroup") + self.assertEqual(cgroup._controller_mountpoints, + {'cpu,cpuacct': '/sys/fs/cgroup/cpu,cpuacct'}) + self.assertEqual(cgroup._controller_paths, + {'cpu,cpuacct': '/sys/fs/cgroup/cpu,cpuacct/some/relative/path'}) + + with patch('azurelinuxagent.ga.cgroupapi.SystemdCgroupApiv1._get_controller_mountpoints', return_value={}): + cgroup = get_cgroup_api().get_cgroup_from_relative_path(relative_path="some/relative/path", cgroup_name="test_cgroup") + self.assertIsInstance(cgroup, CgroupV1) + self.assertEqual(cgroup._cgroup_name, "test_cgroup") + self.assertEqual(cgroup._controller_mountpoints, {}) + self.assertEqual(cgroup._controller_paths, {}) + + def test_get_process_cgroup_should_return_the_correct_paths_for_cgroup_v1(self): + with mock_cgroup_v1_environment(self.tmp_dir): + cgroup = get_cgroup_api().get_process_cgroup(process_id="self", cgroup_name="walinuxagent") + self.assertIsInstance(cgroup, CgroupV1) + self.assertEqual(cgroup._cgroup_name, "walinuxagent") + self.assertEqual(cgroup._controller_mountpoints, + {'cpu,cpuacct': '/sys/fs/cgroup/cpu,cpuacct', 'memory': '/sys/fs/cgroup/memory'}) + self.assertEqual(cgroup._controller_paths, + {'cpu,cpuacct': '/sys/fs/cgroup/cpu,cpuacct/system.slice/walinuxagent.service', + 'memory': '/sys/fs/cgroup/memory/system.slice/walinuxagent.service'}) + + def test_get_process_cgroup_should_return_only_mounted_controllers_v1(self): + with mock_cgroup_v1_environment(self.tmp_dir): + with patch('azurelinuxagent.ga.cgroupapi.SystemdCgroupApiv1._get_controller_mountpoints', return_value={'cpu,cpuacct': '/sys/fs/cgroup/cpu,cpuacct'}): + cgroup = get_cgroup_api().get_process_cgroup(process_id="self", cgroup_name="walinuxagent") + self.assertIsInstance(cgroup, CgroupV1) + self.assertEqual(cgroup._cgroup_name, "walinuxagent") + self.assertEqual(cgroup._controller_mountpoints, {'cpu,cpuacct': '/sys/fs/cgroup/cpu,cpuacct'}) + self.assertEqual(cgroup._controller_paths, {'cpu,cpuacct': '/sys/fs/cgroup/cpu,cpuacct/system.slice/walinuxagent.service'}) + + with patch('azurelinuxagent.ga.cgroupapi.SystemdCgroupApiv1._get_controller_mountpoints', return_value={}): + cgroup = get_cgroup_api().get_process_cgroup(process_id="self", cgroup_name="walinuxagent") + self.assertIsInstance(cgroup, CgroupV1) + self.assertEqual(cgroup._cgroup_name, "walinuxagent") + self.assertEqual(cgroup._controller_mountpoints, {}) + self.assertEqual(cgroup._controller_paths, {}) + + def test_get_process_cgroup_should_return_only_mounted_process_controllers_v1(self): + with mock_cgroup_v1_environment(self.tmp_dir): + with patch('azurelinuxagent.ga.cgroupapi.SystemdCgroupApiv1._get_process_relative_controller_paths', return_value={'cpu,cpuacct': 'relative/path'}): + cgroup = get_cgroup_api().get_process_cgroup(process_id="self", cgroup_name="walinuxagent") + self.assertIsInstance(cgroup, CgroupV1) + self.assertEqual(cgroup._cgroup_name, "walinuxagent") + self.assertEqual(cgroup._controller_mountpoints, {'cpu,cpuacct': '/sys/fs/cgroup/cpu,cpuacct', 'memory':'/sys/fs/cgroup/memory'}) + self.assertEqual(cgroup._controller_paths, {'cpu,cpuacct': '/sys/fs/cgroup/cpu,cpuacct/relative/path'}) + + with patch('azurelinuxagent.ga.cgroupapi.SystemdCgroupApiv1._get_process_relative_controller_paths', return_value={}): + cgroup = get_cgroup_api().get_process_cgroup(process_id="self", cgroup_name="walinuxagent") + self.assertIsInstance(cgroup, CgroupV1) + self.assertEqual(cgroup._cgroup_name, "walinuxagent") + self.assertEqual(cgroup._controller_mountpoints, {'cpu,cpuacct': '/sys/fs/cgroup/cpu,cpuacct', 'memory':'/sys/fs/cgroup/memory'}) + self.assertEqual(cgroup._controller_paths, {}) @patch('time.sleep', side_effect=lambda _: mock_sleep()) def test_start_extension_cgroups_v1_command_should_return_the_command_output(self, _): @@ -354,17 +365,6 @@ def test_start_extension_cgroups_v1_command_should_use_systemd_to_execute_the_co class SystemdCgroupsApiv2TestCase(AgentTestCase): - def test_get_controllers_enabled_at_root_should_return_list_of_enabled_controllers(self): - with mock_cgroup_v2_environment(self.tmp_dir): - cgroup_api = get_cgroup_api() - self.assertEqual(cgroup_api._get_controllers_enabled_at_root('/sys/fs/cgroup'), ['cpuset', 'cpu', 'io', 'memory', 'pids']) - - def test_get_controllers_enabled_at_root_should_return_empty_list_if_root_cgroup_path_is_None(self): - with mock_cgroup_v2_environment(self.tmp_dir): - with patch('azurelinuxagent.ga.cgroupapi.SystemdCgroupApiv2._get_root_cgroup_path', return_value=None): - cgroup_api = get_cgroup_api() - self.assertEqual(cgroup_api._controllers_enabled_at_root, []) - def test_get_root_cgroup_path_should_return_v2_cgroup_root(self): with mock_cgroup_v2_environment(self.tmp_dir): cgroup_api = get_cgroup_api() @@ -374,97 +374,113 @@ def test_get_root_cgroup_path_should_only_match_systemd_mountpoint(self): with mock_cgroup_v2_environment(self.tmp_dir) as env: # Mock an environment which has multiple v2 mountpoints env.add_command(MockCommand(r"^findmnt -t cgroup2 --noheadings$", -'''/custom/mountpoint/path1 cgroup2 cgroup2 rw,relatime -/sys/fs/cgroup cgroup2 cgroup2 rw,nosuid,nodev,noexec,relatime -/custom/mountpoint/path2 none cgroup2 rw,relatime -''')) + '''/custom/mountpoint/path1 cgroup2 cgroup2 rw,relatime + /sys/fs/cgroup cgroup2 cgroup2 rw,nosuid,nodev,noexec,relatime + /custom/mountpoint/path2 none cgroup2 rw,relatime + ''')) cgroup_api = get_cgroup_api() self.assertEqual(cgroup_api._get_root_cgroup_path(), '/sys/fs/cgroup') - def test_get_unit_cgroup_paths_should_return_the_cgroup_v2_cgroup_paths(self): + def test_get_controllers_enabled_at_root_should_return_list_of_agent_supported_and_enabled_controllers(self): with mock_cgroup_v2_environment(self.tmp_dir): - cpu, memory = get_cgroup_api().get_unit_cgroup_paths("extension.service") - self.assertEqual(cpu, '/sys/fs/cgroup/system.slice/extension.service', - "The cgroup path for the CPU controller is incorrect") - self.assertEqual(memory, '/sys/fs/cgroup/system.slice/extension.service', - "The cgroup path for the memory controller is incorrect") + cgroup_api = get_cgroup_api() + enabled_controllers = cgroup_api._get_controllers_enabled_at_root('/sys/fs/cgroup') + self.assertEqual(len(enabled_controllers), 2) + self.assertIn('cpu', enabled_controllers) + self.assertIn('memory', enabled_controllers) - def test_get_unit_cgroup_path_should_return_None_if_either_cgroup_v2_controller_not_enabled(self): + def test_get_controllers_enabled_at_root_should_return_empty_list_if_root_cgroup_path_is_empty(self): with mock_cgroup_v2_environment(self.tmp_dir): - with patch('azurelinuxagent.ga.cgroupapi.SystemdCgroupApiv2.get_controller_root_paths', return_value=('/sys/fs/cgroup', None)): - cpu, memory = get_cgroup_api().get_unit_cgroup_paths("extension.service") - self.assertIn(cpu, '/sys/fs/cgroup/system.slice/extension.service', - "The cgroup path for the CPU controller is incorrect") - self.assertIsNone(memory, - "The cgroup path for the memory controller is None so unit cgroup should be None") - - with patch('azurelinuxagent.ga.cgroupapi.SystemdCgroupApiv2.get_controller_root_paths', return_value=(None, '/sys/fs/cgroup')): - cpu, memory = get_cgroup_api().get_unit_cgroup_paths("extension.service") - self.assertIsNone(cpu, "The cgroup path for the cpu controller is None so unit cgroup should be None") - self.assertIn(memory, '/sys/fs/cgroup/system.slice/extension.service', - "The cgroup path for the memory controller is incorrect") - - def test_get_process_cgroup_paths_should_return_the_cgroup_v2_cgroup_paths(self): + with patch('azurelinuxagent.ga.cgroupapi.SystemdCgroupApiv2._get_root_cgroup_path', return_value=""): + cgroup_api = get_cgroup_api() + self.assertEqual(cgroup_api._controllers_enabled_at_root, []) + + def test_get_process_relative_cgroup_path_should_return_relative_path(self): with mock_cgroup_v2_environment(self.tmp_dir): - cpu, memory = get_cgroup_api().get_process_cgroup_paths("self") - self.assertIn(cpu, '/sys/fs/cgroup/system.slice/walinuxagent.service', - "The cgroup path for the CPU controller is incorrect") - self.assertIn(memory, '/sys/fs/cgroup/system.slice/walinuxagent.service', - "The cgroup path for the memory controller is incorrect") + cgroup_api = get_cgroup_api() + self.assertEqual(cgroup_api._get_process_relative_cgroup_path(process_id="self"), "system.slice/walinuxagent.service") - def test_get_process_cgroup_path_should_return_None_if_either_cgroup_v2_controller_not_enabled(self): + def test_get_unit_cgroup_should_return_correct_paths_for_cgroup_v2(self): with mock_cgroup_v2_environment(self.tmp_dir): - with patch('azurelinuxagent.ga.cgroupapi.SystemdCgroupApiv2.get_controller_root_paths', return_value=('/sys/fs/cgroup', None)): - cpu, memory = get_cgroup_api().get_process_cgroup_paths("self") - self.assertIn(cpu, '/sys/fs/cgroup/system.slice/walinuxagent.service', - "The cgroup path for the CPU controller is incorrect") - self.assertIsNone(memory, - "The cgroup path for the memory controller is None so unit cgroup should be None") - - with patch('azurelinuxagent.ga.cgroupapi.SystemdCgroupApiv2.get_controller_root_paths', return_value=(None, '/sys/fs/cgroup')): - cpu, memory = get_cgroup_api().get_process_cgroup_paths("self") - self.assertIsNone(cpu, "The cgroup path for the CPU controller is None so unit cgroup should be None") - self.assertIn(memory, '/sys/fs/cgroup/system.slice/walinuxagent.service', - "The cgroup path for the memory controller is incorrect") - - def test_get_process_cgroup_v2_path_should_return_None_if_relative_path_is_None(self): + cgroup = get_cgroup_api().get_unit_cgroup(unit_name="extension.service", cgroup_name="extension") + self.assertIsInstance(cgroup, CgroupV2) + self.assertEqual(cgroup._cgroup_name, "extension") + self.assertEqual(cgroup._root_cgroup_path, "/sys/fs/cgroup") + self.assertEqual(cgroup._cgroup_path, "/sys/fs/cgroup/system.slice/extension.service") + self.assertEqual(len(cgroup._enabled_controllers), 2) + self.assertIn('cpu', cgroup._enabled_controllers) + self.assertIn('memory', cgroup._enabled_controllers) + + def test_get_unit_cgroup_should_return_empty_paths_if_root_path_empty_v2(self): with mock_cgroup_v2_environment(self.tmp_dir): - with patch('azurelinuxagent.ga.cgroupapi.SystemdCgroupApiv2.get_process_cgroup_relative_paths', return_value=(None, None)): - cpu, memory = get_cgroup_api().get_process_cgroup_paths("self") - self.assertIsNone(cpu, "The relative cgroup path for the cpu controller is None so unit cgroup should be None") - self.assertIsNone(memory, - "The relative cgroup path for the memory controller is None so unit cgroup should be None") - - def test_get_controller_root_paths_should_return_the_cgroup_v2_root_cgroup_path(self): + with patch('azurelinuxagent.ga.cgroupapi.SystemdCgroupApiv2._get_root_cgroup_path', return_value=""): + cgroup = get_cgroup_api().get_unit_cgroup(unit_name="extension.service", cgroup_name="extension") + self.assertIsInstance(cgroup, CgroupV2) + self.assertEqual(cgroup._cgroup_name, "extension") + self.assertEqual(cgroup._root_cgroup_path, "") + self.assertEqual(cgroup._cgroup_path, "") + self.assertEqual(len(cgroup._enabled_controllers), 0) + + def test_get_unit_cgroup_should_return_only_enabled_controllers_v2(self): with mock_cgroup_v2_environment(self.tmp_dir): - cpu, memory = get_cgroup_api().get_controller_root_paths() - self.assertEqual(cpu, '/sys/fs/cgroup', "The root cgroup for the CPU controller is incorrect") - self.assertEqual(memory, '/sys/fs/cgroup', "The root cgroup for the memory controller is incorrect") - - def test_get_controller_root_paths_should_return_None_if_root_cgroup_path_is_None(self): + with patch('azurelinuxagent.ga.cgroupapi.SystemdCgroupApiv2._get_controllers_enabled_at_root', return_value=['cpu']): + cgroup = get_cgroup_api().get_unit_cgroup(unit_name="extension.service", cgroup_name="extension") + self.assertIsInstance(cgroup, CgroupV2) + self.assertEqual(cgroup._cgroup_name, "extension") + self.assertEqual(cgroup._root_cgroup_path, "/sys/fs/cgroup") + self.assertEqual(cgroup._cgroup_path, "/sys/fs/cgroup/system.slice/extension.service") + self.assertEqual(len(cgroup._enabled_controllers), 1) + self.assertIn('cpu', cgroup._enabled_controllers) + + with patch('azurelinuxagent.ga.cgroupapi.SystemdCgroupApiv2._get_controllers_enabled_at_root', return_value=[]): + cgroup = get_cgroup_api().get_unit_cgroup(unit_name="extension.service", cgroup_name="extension") + self.assertIsInstance(cgroup, CgroupV2) + self.assertEqual(cgroup._cgroup_name, "extension") + self.assertEqual(cgroup._root_cgroup_path, "/sys/fs/cgroup") + self.assertEqual(cgroup._cgroup_path, "/sys/fs/cgroup/system.slice/extension.service") + self.assertEqual(len(cgroup._enabled_controllers), 0) + + def test_get_cgroup_from_relative_path_should_return_the_correct_paths_for_cgroup_v2(self): with mock_cgroup_v2_environment(self.tmp_dir): - with patch('azurelinuxagent.ga.cgroupapi.SystemdCgroupApiv2._get_root_cgroup_path', return_value=None): - cpu, memory = get_cgroup_api().get_controller_root_paths() - self.assertIsNone(cpu, "The root cgroup path is None, so the CPU controller path should be None") - self.assertIsNone(memory, "The root cgroup path is None, so the memory controller path should be None") - - def test_get_controller_root_paths_should_return_None_if_either_controller_not_enabled(self): + cgroup = get_cgroup_api().get_cgroup_from_relative_path(relative_path="some/relative/path", cgroup_name="test_cgroup") + self.assertIsInstance(cgroup, CgroupV2) + self.assertEqual(cgroup._cgroup_name, "test_cgroup") + self.assertEqual(cgroup._root_cgroup_path, "/sys/fs/cgroup") + self.assertEqual(cgroup._cgroup_path, "/sys/fs/cgroup/some/relative/path") + self.assertEqual(len(cgroup._enabled_controllers), 2) + self.assertIn('cpu', cgroup._enabled_controllers) + self.assertIn('memory', cgroup._enabled_controllers) + + def test_get_cgroup_from_relative_path_should_return_empty_paths_if_root_path_empty_v2(self): with mock_cgroup_v2_environment(self.tmp_dir): - with patch('azurelinuxagent.ga.cgroupapi.SystemdCgroupApiv2._get_controllers_enabled_at_root', return_value=['io', 'memory']): - cpu, memory = get_cgroup_api().get_controller_root_paths() - self.assertIsNone(cpu, "The CPU controller is not enabled, so the CPU controller path should be None") - self.assertEqual(memory, '/sys/fs/cgroup', "The root cgroup for the memory controller is incorrect") - - with patch('azurelinuxagent.ga.cgroupapi.SystemdCgroupApiv2._get_controllers_enabled_at_root', return_value=['cpu', 'io']): - cpu, memory = get_cgroup_api().get_controller_root_paths() - self.assertEqual(cpu, '/sys/fs/cgroup', "The root cgroup for the CPU controller is incorrect") - self.assertIsNone(memory, "The memory controller is not enabled, so the memory controller path should be None") - - def test_get_cpu_and_memory_cgroup_relative_paths_for_process_should_return_the_cgroup_v2_relative_paths(self): + with patch('azurelinuxagent.ga.cgroupapi.SystemdCgroupApiv2._get_root_cgroup_path', return_value=""): + cgroup = get_cgroup_api().get_cgroup_from_relative_path(relative_path="some/relative/path", cgroup_name="test_cgroup") + self.assertIsInstance(cgroup, CgroupV2) + self.assertEqual(cgroup._cgroup_name, "test_cgroup") + self.assertEqual(cgroup._root_cgroup_path, "") + self.assertEqual(cgroup._cgroup_path, "") + self.assertEqual(len(cgroup._enabled_controllers), 0) + + def test_get_process_cgroup_should_return_the_correct_paths_for_cgroup_v2(self): + with mock_cgroup_v2_environment(self.tmp_dir): + cgroup = get_cgroup_api().get_process_cgroup(process_id="self", cgroup_name="walinuxagent") + self.assertIsInstance(cgroup, CgroupV2) + self.assertEqual(cgroup._cgroup_name, "walinuxagent") + self.assertEqual(cgroup._root_cgroup_path, "/sys/fs/cgroup") + self.assertEqual(cgroup._cgroup_path, "/sys/fs/cgroup/system.slice/walinuxagent.service") + self.assertEqual(len(cgroup._enabled_controllers), 2) + self.assertIn('cpu', cgroup._enabled_controllers) + self.assertIn('memory', cgroup._enabled_controllers) + + def test_get_process_cgroup_should_return_empty_paths_if_root_path_empty_v2(self): with mock_cgroup_v2_environment(self.tmp_dir): - cpu, memory = get_cgroup_api().get_process_cgroup_relative_paths('self') - self.assertEqual(cpu, "system.slice/walinuxagent.service", "The relative path for the CPU cgroup is incorrect") - self.assertEqual(memory, "system.slice/walinuxagent.service", "The relative memory for the CPU cgroup is incorrect") + with patch('azurelinuxagent.ga.cgroupapi.SystemdCgroupApiv2._get_root_cgroup_path', return_value=""): + cgroup = get_cgroup_api().get_process_cgroup(process_id="self", cgroup_name="walinuxagent") + self.assertIsInstance(cgroup, CgroupV2) + self.assertEqual(cgroup._cgroup_name, "walinuxagent") + self.assertEqual(cgroup._root_cgroup_path, "") + self.assertEqual(cgroup._cgroup_path, "") + self.assertEqual(len(cgroup._enabled_controllers), 0) class SystemdCgroupsApiMockedFileSystemTestCase(_MockedFileSystemTestCase): @@ -483,3 +499,176 @@ def test_cleanup_legacy_cgroups_should_remove_legacy_cgroups(self): self.assertEqual(legacy_cgroups, 2, "cleanup_legacy_cgroups() did not find all the expected cgroups") self.assertFalse(os.path.exists(legacy_cpu_cgroup), "cleanup_legacy_cgroups() did not remove the CPU legacy cgroup") self.assertFalse(os.path.exists(legacy_memory_cgroup), "cleanup_legacy_cgroups() did not remove the memory legacy cgroup") + + +class CgroupsApiv1TestCase(AgentTestCase): + def test_get_supported_controllers_returns_v1_controllers(self): + with mock_cgroup_v1_environment(self.tmp_dir): + cgroup = get_cgroup_api().get_process_cgroup(process_id="self", cgroup_name="walinuxagent") + controllers = cgroup.get_supported_controllers() + self.assertEqual(len(controllers), 2) + self.assertIn('cpu,cpuacct', controllers) + self.assertIn('memory', controllers) + + def test_check_in_expected_slice_returns_True_if_all_paths_in_expected_slice(self): + with mock_cgroup_v1_environment(self.tmp_dir): + cgroup = get_cgroup_api().get_process_cgroup(process_id="self", cgroup_name="walinuxagent") + self.assertTrue(cgroup.check_in_expected_slice(expected_slice='system.slice')) + + with patch('azurelinuxagent.ga.cgroupapi.SystemdCgroupApiv1._get_process_relative_controller_paths', return_value={'cpu,cpuacct': 'system.slice/walinuxagent.service'}): + cgroup = get_cgroup_api().get_process_cgroup(process_id="self", cgroup_name="walinuxagent") + self.assertTrue(cgroup.check_in_expected_slice(expected_slice='system.slice')) + + with patch('azurelinuxagent.ga.cgroupapi.SystemdCgroupApiv1._get_process_relative_controller_paths', return_value={}): + cgroup = get_cgroup_api().get_process_cgroup(process_id="self", cgroup_name="walinuxagent") + self.assertTrue(cgroup.check_in_expected_slice(expected_slice='system.slice')) + + def test_check_in_expected_slice_returns_False_if_any_paths_not_in_expected_slice(self): + with mock_cgroup_v1_environment(self.tmp_dir): + cgroup = get_cgroup_api().get_process_cgroup(process_id="self", cgroup_name="walinuxagent") + self.assertFalse(cgroup.check_in_expected_slice(expected_slice='user.slice')) + + with patch('azurelinuxagent.ga.cgroupapi.SystemdCgroupApiv1._get_process_relative_controller_paths', return_value={'cpu,cpuacct': 'system.slice/walinuxagent.service', 'memory': 'user.slice/walinuxagent.service'}): + cgroup = get_cgroup_api().get_process_cgroup(process_id="self", cgroup_name="walinuxagent") + self.assertFalse(cgroup.check_in_expected_slice(expected_slice='user.slice')) + + with patch('azurelinuxagent.ga.cgroupapi.SystemdCgroupApiv1._get_process_relative_controller_paths', return_value={'cpu,cpuacct': '', 'memory': ''}): + cgroup = get_cgroup_api().get_process_cgroup(process_id="self", cgroup_name="walinuxagent") + self.assertFalse(cgroup.check_in_expected_slice(expected_slice='system.slice')) + + def test_get_controller_metrics_returns_all_supported_controllers_v1(self): + with mock_cgroup_v1_environment(self.tmp_dir): + cgroup = get_cgroup_api().get_process_cgroup(process_id="self", cgroup_name="walinuxagent") + metrics = cgroup.get_controller_metrics() + self.assertEqual(len(metrics), 2) + self.assertIsInstance(metrics[0], CpuMetrics) + self.assertEqual(metrics[0].name, "walinuxagent") + self.assertEqual(metrics[0].path, "/sys/fs/cgroup/cpu,cpuacct/system.slice/walinuxagent.service") + self.assertIsInstance(metrics[1], MemoryMetrics) + self.assertEqual(metrics[1].name, "walinuxagent") + self.assertEqual(metrics[1].path, "/sys/fs/cgroup/memory/system.slice/walinuxagent.service") + + def test_get_controller_metrics_returns_only_mounted_controllers_v1(self): + with mock_cgroup_v1_environment(self.tmp_dir): + with patch('azurelinuxagent.ga.cgroupapi.SystemdCgroupApiv1._get_controller_mountpoints', return_value={'cpu,cpuacct': '/sys/fs/cgroup/cpu,cpuacct'}): + cgroup = get_cgroup_api().get_process_cgroup(process_id="self", cgroup_name="walinuxagent") + metrics = cgroup.get_controller_metrics() + self.assertEqual(len(metrics), 1) + self.assertIsInstance(metrics[0], CpuMetrics) + self.assertEqual(metrics[0].name, "walinuxagent") + self.assertEqual(metrics[0].path, "/sys/fs/cgroup/cpu,cpuacct/system.slice/walinuxagent.service") + + with patch('azurelinuxagent.ga.cgroupapi.SystemdCgroupApiv1._get_controller_mountpoints', return_value={'memory': '/sys/fs/cgroup/memory'}): + cgroup = get_cgroup_api().get_process_cgroup(process_id="self", cgroup_name="walinuxagent") + metrics = cgroup.get_controller_metrics() + self.assertEqual(len(metrics), 1) + self.assertIsInstance(metrics[0], MemoryMetrics) + self.assertEqual(metrics[0].name, "walinuxagent") + self.assertEqual(metrics[0].path, "/sys/fs/cgroup/memory/system.slice/walinuxagent.service") + + with patch('azurelinuxagent.ga.cgroupapi.SystemdCgroupApiv1._get_controller_mountpoints', return_value={}): + cgroup = get_cgroup_api().get_process_cgroup(process_id="self", cgroup_name="walinuxagent") + metrics = cgroup.get_controller_metrics() + self.assertEqual(len(metrics), 0) + + def test_get_controller_metrics_returns_only_controllers_at_expected_path_v1(self): + with mock_cgroup_v1_environment(self.tmp_dir): + with patch('azurelinuxagent.ga.cgroupapi.SystemdCgroupApiv1._get_process_relative_controller_paths', return_value={'cpu,cpuacct': 'system.slice/walinuxagent.service', 'memory': 'unexpected/path'}): + cgroup = get_cgroup_api().get_process_cgroup(process_id="self", cgroup_name="walinuxagent") + metrics = cgroup.get_controller_metrics(expected_relative_path="system.slice/walinuxagent.service") + self.assertEqual(len(metrics), 1) + self.assertIsInstance(metrics[0], CpuMetrics) + self.assertEqual(metrics[0].name, "walinuxagent") + self.assertEqual(metrics[0].path, "/sys/fs/cgroup/cpu,cpuacct/system.slice/walinuxagent.service") + + with patch('azurelinuxagent.ga.cgroupapi.SystemdCgroupApiv1._get_process_relative_controller_paths', return_value={'cpu,cpuacct': 'unexpected/path', 'memory': 'unexpected/path'}): + cgroup = get_cgroup_api().get_process_cgroup(process_id="self", cgroup_name="walinuxagent") + metrics = cgroup.get_controller_metrics(expected_relative_path="system.slice/walinuxagent.service") + self.assertEqual(len(metrics), 0) + + def test_get_procs_path_returns_correct_path_v1(self): + with mock_cgroup_v1_environment(self.tmp_dir): + cgroup = get_cgroup_api().get_process_cgroup(process_id="self", cgroup_name="walinuxagent") + procs_path = cgroup.get_controller_procs_path(controller='cpu,cpuacct') + self.assertEqual(procs_path, "/sys/fs/cgroup/cpu,cpuacct/system.slice/walinuxagent.service/cgroup.procs") + + procs_path = cgroup.get_controller_procs_path(controller='memory') + self.assertEqual(procs_path, "/sys/fs/cgroup/memory/system.slice/walinuxagent.service/cgroup.procs") + + def test_get_processes_returns_processes_at_all_controller_paths_v1(self): + with mock_cgroup_v1_environment(self.tmp_dir): + cgroup = get_cgroup_api().get_process_cgroup(process_id="self", cgroup_name="walinuxagent") + procs = cgroup.get_processes() + self.assertEqual(len(procs), 3) + self.assertIn(int(123), procs) + self.assertIn(int(234), procs) + self.assertIn(int(345), procs) + + def test_get_processes_returns_empty_list_if_no_controllers_mounted_v1(self): + with mock_cgroup_v1_environment(self.tmp_dir): + with patch('azurelinuxagent.ga.cgroupapi.SystemdCgroupApiv1._get_controller_mountpoints', return_value={}): + cgroup = get_cgroup_api().get_process_cgroup(process_id="self", cgroup_name="walinuxagent") + procs = cgroup.get_processes() + self.assertIsInstance(procs, list) + self.assertEqual(len(procs), 0) + + def test_get_processes_returns_empty_list_if_procs_path_empty_v1(self): + with mock_cgroup_v1_environment(self.tmp_dir): + with patch('azurelinuxagent.ga.cgroupapi.CgroupV1.get_controller_procs_path', return_value=""): + cgroup = get_cgroup_api().get_process_cgroup(process_id="self", cgroup_name="walinuxagent") + procs = cgroup.get_processes() + self.assertIsInstance(procs, list) + self.assertEqual(len(procs), 0) + + +class CgroupsApiv2TestCase(AgentTestCase): + def test_get_supported_controllers_returns_v2_controllers(self): + with mock_cgroup_v2_environment(self.tmp_dir): + cgroup = get_cgroup_api().get_process_cgroup(process_id="self", cgroup_name="walinuxagent") + controllers = cgroup.get_supported_controllers() + self.assertEqual(len(controllers), 2) + self.assertIn('cpu', controllers) + self.assertIn('memory', controllers) + + def test_check_in_expected_slice_returns_True_if_cgroup_path_in_expected_slice(self): + with mock_cgroup_v2_environment(self.tmp_dir): + cgroup = get_cgroup_api().get_process_cgroup(process_id="self", cgroup_name="walinuxagent") + self.assertTrue(cgroup.check_in_expected_slice(expected_slice='system.slice')) + + def test_check_in_expected_slice_returns_False_if_cgroup_path_not_in_expected_slice(self): + with mock_cgroup_v2_environment(self.tmp_dir): + cgroup = get_cgroup_api().get_process_cgroup(process_id="self", cgroup_name="walinuxagent") + self.assertFalse(cgroup.check_in_expected_slice(expected_slice='user.slice')) + + with patch('azurelinuxagent.ga.cgroupapi.SystemdCgroupApiv2._get_process_relative_cgroup_path', return_value=""): + cgroup = get_cgroup_api().get_process_cgroup(process_id="self", cgroup_name="walinuxagent") + self.assertFalse(cgroup.check_in_expected_slice(expected_slice='system.slice')) + + def test_get_procs_path_returns_empty_if_root_cgroup_empty_v2(self): + with mock_cgroup_v2_environment(self.tmp_dir): + with patch('azurelinuxagent.ga.cgroupapi.SystemdCgroupApiv2._get_root_cgroup_path', return_value=""): + cgroup = get_cgroup_api().get_process_cgroup(process_id="self", cgroup_name="walinuxagent") + procs_path = cgroup.get_procs_path() + self.assertEqual(procs_path, "") + + def test_get_procs_path_returns_correct_path_v2(self): + with mock_cgroup_v2_environment(self.tmp_dir): + cgroup = get_cgroup_api().get_process_cgroup(process_id="self", cgroup_name="walinuxagent") + procs_path = cgroup.get_procs_path() + self.assertEqual(procs_path, "/sys/fs/cgroup/system.slice/walinuxagent.service/cgroup.procs") + + def test_get_processes_returns_processes_at_all_controller_paths_v2(self): + with mock_cgroup_v2_environment(self.tmp_dir): + cgroup = get_cgroup_api().get_process_cgroup(process_id="self", cgroup_name="walinuxagent") + procs = cgroup.get_processes() + self.assertEqual(len(procs), 3) + self.assertIn(int(123), procs) + self.assertIn(int(234), procs) + self.assertIn(int(345), procs) + + def test_get_processes_returns_empty_list_if_root_cgroup_empty_v2(self): + with mock_cgroup_v2_environment(self.tmp_dir): + with patch('azurelinuxagent.ga.cgroupapi.SystemdCgroupApiv2._get_root_cgroup_path', return_value=""): + cgroup = get_cgroup_api().get_process_cgroup(process_id="self", cgroup_name="walinuxagent") + procs = cgroup.get_processes() + self.assertEqual(len(procs), 0) diff --git a/tests/ga/test_cgroupconfigurator.py b/tests/ga/test_cgroupconfigurator.py index 5b4b0976e9..1d1465a47d 100644 --- a/tests/ga/test_cgroupconfigurator.py +++ b/tests/ga/test_cgroupconfigurator.py @@ -27,7 +27,7 @@ import threading from azurelinuxagent.common import conf -from azurelinuxagent.ga.cgroup import AGENT_NAME_TELEMETRY, MetricsCounter, MetricValue, MetricsCategory, CpuCgroup +from azurelinuxagent.ga.controllermetrics import AGENT_NAME_TELEMETRY, MetricsCounter, MetricValue, MetricsCategory, CpuMetrics from azurelinuxagent.ga.cgroupconfigurator import CGroupConfigurator, DisableCgroups from azurelinuxagent.ga.cgroupstelemetry import CGroupsTelemetry from azurelinuxagent.common.event import WALAEventOperation @@ -272,7 +272,7 @@ def test_remove_extension_slice_should_remove_unit_files(self): CGroupsTelemetry._tracked['/sys/fs/cgroup/cpu,cpuacct/azure.slice/azure-vmextensions.slice/' \ 'azure-vmextensions-Microsoft.CPlat.Extension.slice'] = \ - CpuCgroup('Microsoft.CPlat.Extension', + CpuMetrics('Microsoft.CPlat.Extension', '/sys/fs/cgroup/cpu,cpuacct/azure.slice/azure-vmextensions.slice/azure-vmextensions-Microsoft.CPlat.Extension.slice') configurator.remove_extension_slice(extension_name="Microsoft.CPlat.Extension") @@ -369,10 +369,10 @@ def test_disable_should_reset_cpu_quota_for_all_cgroups(self): configurator.setup_extension_slice(extension_name=extension_name, cpu_quota=5) configurator.set_extension_services_cpu_memory_quota(service_list) CGroupsTelemetry._tracked['/sys/fs/cgroup/cpu,cpuacct/system.slice/extension.service'] = \ - CpuCgroup('extension.service', '/sys/fs/cgroup/cpu,cpuacct/system.slice/extension.service') + CpuMetrics('extension.service', '/sys/fs/cgroup/cpu,cpuacct/system.slice/extension.service') CGroupsTelemetry._tracked['/sys/fs/cgroup/cpu,cpuacct/azure.slice/azure-vmextensions.slice/' \ 'azure-vmextensions-Microsoft.CPlat.Extension.slice'] = \ - CpuCgroup('Microsoft.CPlat.Extension', + CpuMetrics('Microsoft.CPlat.Extension', '/sys/fs/cgroup/cpu,cpuacct/azure.slice/azure-vmextensions.slice/azure-vmextensions-Microsoft.CPlat.Extension.slice') configurator.disable("UNIT TEST", DisableCgroups.ALL) @@ -717,7 +717,7 @@ def test_it_should_stop_tracking_extension_services_cgroups(self): with self._get_cgroup_configurator() as configurator: with patch("os.path.exists") as mock_path: mock_path.return_value = True - CGroupsTelemetry.track_cgroup(CpuCgroup('extension.service', '/sys/fs/cgroup/cpu,cpuacct/system.slice/extension.service')) + CGroupsTelemetry.track_cgroup(CpuMetrics('extension.service', '/sys/fs/cgroup/cpu,cpuacct/system.slice/extension.service')) configurator.stop_tracking_extension_services_cgroups(service_list) tracked = CGroupsTelemetry._tracked @@ -776,7 +776,7 @@ def side_effect(path): with patch("os.path.exists") as mock_path: mock_path.side_effect = side_effect CGroupsTelemetry._tracked['/sys/fs/cgroup/cpu,cpuacct/system.slice/extension.service'] = \ - CpuCgroup('extension.service', '/sys/fs/cgroup/cpu,cpuacct/system.slice/extension.service') + CpuMetrics('extension.service', '/sys/fs/cgroup/cpu,cpuacct/system.slice/extension.service') configurator.stop_tracking_unit_cgroups("extension.service") tracked = CGroupsTelemetry._tracked @@ -911,7 +911,7 @@ def get_completed_process(): agent_processes = [os.getppid(), os.getpid()] + agent_command_processes + [start_extension.systemd_run_pid] other_processes = [1, get_completed_process()] + extension_processes - with patch("azurelinuxagent.ga.cgroupapi._SystemdCgroupApi.get_processes_in_cgroup", return_value=agent_processes + other_processes): + with patch("azurelinuxagent.ga.cgroupapi.CgroupV1.get_processes", return_value=agent_processes + other_processes): with self.assertRaises(CGroupsException) as context_manager: configurator._check_processes_in_agent_cgroup() @@ -1012,7 +1012,7 @@ def test_check_agent_memory_usage_should_raise_a_cgroups_exception_when_the_limi with self.assertRaises(AgentMemoryExceededException) as context_manager: with self._get_cgroup_configurator() as configurator: - with patch("azurelinuxagent.ga.cgroup.MemoryCgroup.get_tracked_metrics") as tracked_metrics: + with patch("azurelinuxagent.ga.controllermetrics.MemoryMetrics.get_tracked_metrics") as tracked_metrics: tracked_metrics.return_value = metrics configurator.check_agent_memory_usage() diff --git a/tests/ga/test_cgroupstelemetry.py b/tests/ga/test_cgroupstelemetry.py index 26fcecbf65..457b20e473 100644 --- a/tests/ga/test_cgroupstelemetry.py +++ b/tests/ga/test_cgroupstelemetry.py @@ -19,7 +19,7 @@ import random import time -from azurelinuxagent.ga.cgroup import CpuCgroup, MemoryCgroup +from azurelinuxagent.ga.controllermetrics import CpuMetrics, MemoryMetrics from azurelinuxagent.ga.cgroupstelemetry import CGroupsTelemetry from azurelinuxagent.common.utils import fileutil from tests.lib.tools import AgentTestCase, data_dir, patch @@ -105,10 +105,10 @@ def tearDown(self): @staticmethod def _track_new_extension_cgroups(num_extensions): for i in range(num_extensions): - dummy_cpu_cgroup = CpuCgroup("dummy_extension_{0}".format(i), "dummy_cpu_path_{0}".format(i)) + dummy_cpu_cgroup = CpuMetrics("dummy_extension_{0}".format(i), "dummy_cpu_path_{0}".format(i)) CGroupsTelemetry.track_cgroup(dummy_cpu_cgroup) - dummy_memory_cgroup = MemoryCgroup("dummy_extension_{0}".format(i), "dummy_memory_path_{0}".format(i)) + dummy_memory_cgroup = MemoryMetrics("dummy_extension_{0}".format(i), "dummy_memory_path_{0}".format(i)) CGroupsTelemetry.track_cgroup(dummy_memory_cgroup) def _assert_cgroups_are_tracked(self, num_extensions): @@ -136,12 +136,12 @@ def test_telemetry_polling_with_active_cgroups(self, *args): # pylint: disable= self._track_new_extension_cgroups(num_extensions) - with patch("azurelinuxagent.ga.cgroup.MemoryCgroup.get_max_memory_usage") as patch_get_memory_max_usage: - with patch("azurelinuxagent.ga.cgroup.MemoryCgroup.get_memory_usage") as patch_get_memory_usage: - with patch("azurelinuxagent.ga.cgroup.MemoryCgroup.get_memory_usage") as patch_get_memory_usage: - with patch("azurelinuxagent.ga.cgroup.MemoryCgroup.try_swap_memory_usage") as patch_try_swap_memory_usage: - with patch("azurelinuxagent.ga.cgroup.CpuCgroup.get_cpu_usage") as patch_get_cpu_usage: - with patch("azurelinuxagent.ga.cgroup.CGroup.is_active") as patch_is_active: + with patch("azurelinuxagent.ga.controllermetrics.MemoryMetrics.get_max_memory_usage") as patch_get_memory_max_usage: + with patch("azurelinuxagent.ga.controllermetrics.MemoryMetrics.get_memory_usage") as patch_get_memory_usage: + with patch("azurelinuxagent.ga.controllermetrics.MemoryMetrics.get_memory_usage") as patch_get_memory_usage: + with patch("azurelinuxagent.ga.controllermetrics.MemoryMetrics.try_swap_memory_usage") as patch_try_swap_memory_usage: + with patch("azurelinuxagent.ga.controllermetrics.CpuMetrics.get_cpu_usage") as patch_get_cpu_usage: + with patch("azurelinuxagent.ga.controllermetrics.ControllerMetrics.is_active") as patch_is_active: patch_is_active.return_value = True current_cpu = 30 @@ -163,10 +163,10 @@ def test_telemetry_polling_with_active_cgroups(self, *args): # pylint: disable= self.assertEqual(len(metrics), num_extensions * num_of_metrics_per_extn_expected) self._assert_polled_metrics_equal(metrics, current_cpu, current_memory, current_max_memory, current_swap_memory) - @patch("azurelinuxagent.ga.cgroup.MemoryCgroup.get_max_memory_usage", side_effect=raise_ioerror) - @patch("azurelinuxagent.ga.cgroup.MemoryCgroup.get_memory_usage", side_effect=raise_ioerror) - @patch("azurelinuxagent.ga.cgroup.CpuCgroup.get_cpu_usage", side_effect=raise_ioerror) - @patch("azurelinuxagent.ga.cgroup.CGroup.is_active", return_value=False) + @patch("azurelinuxagent.ga.controllermetrics.MemoryMetrics.get_max_memory_usage", side_effect=raise_ioerror) + @patch("azurelinuxagent.ga.controllermetrics.MemoryMetrics.get_memory_usage", side_effect=raise_ioerror) + @patch("azurelinuxagent.ga.controllermetrics.CpuMetrics.get_cpu_usage", side_effect=raise_ioerror) + @patch("azurelinuxagent.ga.controllermetrics.ControllerMetrics.is_active", return_value=False) def test_telemetry_polling_with_inactive_cgroups(self, *_): num_extensions = 5 no_extensions_expected = 0 # pylint: disable=unused-variable @@ -182,10 +182,10 @@ def test_telemetry_polling_with_inactive_cgroups(self, *_): self.assertEqual(len(metrics), 0) - @patch("azurelinuxagent.ga.cgroup.MemoryCgroup.get_max_memory_usage") - @patch("azurelinuxagent.ga.cgroup.MemoryCgroup.get_memory_usage") - @patch("azurelinuxagent.ga.cgroup.CpuCgroup.get_cpu_usage") - @patch("azurelinuxagent.ga.cgroup.CGroup.is_active") + @patch("azurelinuxagent.ga.controllermetrics.MemoryMetrics.get_max_memory_usage") + @patch("azurelinuxagent.ga.controllermetrics.MemoryMetrics.get_memory_usage") + @patch("azurelinuxagent.ga.controllermetrics.CpuMetrics.get_cpu_usage") + @patch("azurelinuxagent.ga.controllermetrics.ControllerMetrics.is_active") def test_telemetry_polling_with_changing_cgroups_state(self, patch_is_active, patch_get_cpu_usage, # pylint: disable=unused-argument patch_get_mem, patch_get_max_mem, *args): num_extensions = 5 @@ -274,11 +274,11 @@ def test_telemetry_polling_to_generate_transient_logs_index_error(self): CGroupsTelemetry.poll_all_tracked() self.assertEqual(expected_call_count, patch_periodic_warn.call_count) - @patch("azurelinuxagent.ga.cgroup.MemoryCgroup.try_swap_memory_usage") - @patch("azurelinuxagent.ga.cgroup.MemoryCgroup.get_max_memory_usage") - @patch("azurelinuxagent.ga.cgroup.MemoryCgroup.get_memory_usage") - @patch("azurelinuxagent.ga.cgroup.CpuCgroup.get_cpu_usage") - @patch("azurelinuxagent.ga.cgroup.CGroup.is_active") + @patch("azurelinuxagent.ga.controllermetrics.MemoryMetrics.try_swap_memory_usage") + @patch("azurelinuxagent.ga.controllermetrics.MemoryMetrics.get_max_memory_usage") + @patch("azurelinuxagent.ga.controllermetrics.MemoryMetrics.get_memory_usage") + @patch("azurelinuxagent.ga.controllermetrics.CpuMetrics.get_cpu_usage") + @patch("azurelinuxagent.ga.controllermetrics.ControllerMetrics.is_active") def test_telemetry_calculations(self, patch_is_active, patch_get_cpu_usage, patch_get_memory_usage, patch_get_memory_max_usage, patch_try_memory_swap_usage, *args): # pylint: disable=unused-argument num_polls = 10 @@ -321,13 +321,13 @@ def test_cgroup_is_tracked(self, *args): # pylint: disable=unused-argument self.assertFalse(CGroupsTelemetry.is_tracked("not_present_cpu_dummy_path")) self.assertFalse(CGroupsTelemetry.is_tracked("not_present_memory_dummy_path")) - @patch("azurelinuxagent.ga.cgroup.MemoryCgroup.get_memory_usage", side_effect=raise_ioerror) + @patch("azurelinuxagent.ga.controllermetrics.MemoryMetrics.get_memory_usage", side_effect=raise_ioerror) def test_process_cgroup_metric_with_no_memory_cgroup_mounted(self, *args): # pylint: disable=unused-argument num_extensions = 5 self._track_new_extension_cgroups(num_extensions) - with patch("azurelinuxagent.ga.cgroup.CpuCgroup.get_cpu_usage") as patch_get_cpu_usage: - with patch("azurelinuxagent.ga.cgroup.CGroup.is_active") as patch_is_active: + with patch("azurelinuxagent.ga.controllermetrics.CpuMetrics.get_cpu_usage") as patch_get_cpu_usage: + with patch("azurelinuxagent.ga.controllermetrics.ControllerMetrics.is_active") as patch_is_active: patch_is_active.return_value = True current_cpu = 30 @@ -341,16 +341,16 @@ def test_process_cgroup_metric_with_no_memory_cgroup_mounted(self, *args): # py self.assertEqual(len(metrics), num_extensions * 1) # Only CPU populated self._assert_polled_metrics_equal(metrics, current_cpu, 0, 0, 0) - @patch("azurelinuxagent.ga.cgroup.CpuCgroup.get_cpu_usage", side_effect=raise_ioerror) + @patch("azurelinuxagent.ga.controllermetrics.CpuMetrics.get_cpu_usage", side_effect=raise_ioerror) def test_process_cgroup_metric_with_no_cpu_cgroup_mounted(self, *args): # pylint: disable=unused-argument num_extensions = 5 self._track_new_extension_cgroups(num_extensions) - with patch("azurelinuxagent.ga.cgroup.MemoryCgroup.get_max_memory_usage") as patch_get_memory_max_usage: - with patch("azurelinuxagent.ga.cgroup.MemoryCgroup.get_memory_usage") as patch_get_memory_usage: - with patch("azurelinuxagent.ga.cgroup.MemoryCgroup.try_swap_memory_usage") as patch_try_swap_memory_usage: - with patch("azurelinuxagent.ga.cgroup.CGroup.is_active") as patch_is_active: + with patch("azurelinuxagent.ga.controllermetrics.MemoryMetrics.get_max_memory_usage") as patch_get_memory_max_usage: + with patch("azurelinuxagent.ga.controllermetrics.MemoryMetrics.get_memory_usage") as patch_get_memory_usage: + with patch("azurelinuxagent.ga.controllermetrics.MemoryMetrics.try_swap_memory_usage") as patch_try_swap_memory_usage: + with patch("azurelinuxagent.ga.controllermetrics.ControllerMetrics.is_active") as patch_is_active: patch_is_active.return_value = True current_memory = 209715200 @@ -367,14 +367,14 @@ def test_process_cgroup_metric_with_no_cpu_cgroup_mounted(self, *args): # pylin self.assertEqual(len(metrics), num_extensions * 3) self._assert_polled_metrics_equal(metrics, 0, current_memory, current_max_memory, current_swap_memory) - @patch("azurelinuxagent.ga.cgroup.MemoryCgroup.get_memory_usage", side_effect=raise_ioerror) - @patch("azurelinuxagent.ga.cgroup.MemoryCgroup.get_max_memory_usage", side_effect=raise_ioerror) - @patch("azurelinuxagent.ga.cgroup.CpuCgroup.get_cpu_usage", side_effect=raise_ioerror) + @patch("azurelinuxagent.ga.controllermetrics.MemoryMetrics.get_memory_usage", side_effect=raise_ioerror) + @patch("azurelinuxagent.ga.controllermetrics.MemoryMetrics.get_max_memory_usage", side_effect=raise_ioerror) + @patch("azurelinuxagent.ga.controllermetrics.CpuMetrics.get_cpu_usage", side_effect=raise_ioerror) def test_extension_telemetry_not_sent_for_empty_perf_metrics(self, *args): # pylint: disable=unused-argument num_extensions = 5 self._track_new_extension_cgroups(num_extensions) - with patch("azurelinuxagent.ga.cgroup.CGroup.is_active") as patch_is_active: + with patch("azurelinuxagent.ga.controllermetrics.ControllerMetrics.is_active") as patch_is_active: patch_is_active.return_value = False poll_count = 1 @@ -383,9 +383,9 @@ def test_extension_telemetry_not_sent_for_empty_perf_metrics(self, *args): # py metrics = CGroupsTelemetry.poll_all_tracked() self.assertEqual(0, len(metrics)) - @patch("azurelinuxagent.ga.cgroup.CpuCgroup.get_cpu_usage") - @patch("azurelinuxagent.ga.cgroup.CpuCgroup.get_cpu_throttled_time") - @patch("azurelinuxagent.ga.cgroup.CGroup.is_active") + @patch("azurelinuxagent.ga.controllermetrics.CpuMetrics.get_cpu_usage") + @patch("azurelinuxagent.ga.controllermetrics.CpuMetrics.get_cpu_throttled_time") + @patch("azurelinuxagent.ga.controllermetrics.ControllerMetrics.is_active") def test_cgroup_telemetry_should_not_report_cpu_negative_value(self, patch_is_active, path_get_throttled_time, patch_get_cpu_usage): num_polls = 5 @@ -396,7 +396,7 @@ def test_cgroup_telemetry_should_not_report_cpu_negative_value(self, patch_is_ac cpu_percent_values.append(-1) cpu_throttled_values = [random.randint(0, 60 * 60) for _ in range(num_polls)] - dummy_cpu_cgroup = CpuCgroup("dummy_extension_name", "dummy_cpu_path") + dummy_cpu_cgroup = CpuMetrics("dummy_extension_name", "dummy_cpu_path") CGroupsTelemetry.track_cgroup(dummy_cpu_cgroup) self.assertEqual(1, len(CGroupsTelemetry._tracked)) diff --git a/tests/ga/test_collect_logs.py b/tests/ga/test_collect_logs.py index 4ac3f03fb4..2b8c4f412e 100644 --- a/tests/ga/test_collect_logs.py +++ b/tests/ga/test_collect_logs.py @@ -18,7 +18,7 @@ import os from azurelinuxagent.common import logger, conf -from azurelinuxagent.ga.cgroup import CpuCgroup, MemoryCgroup, MetricValue +from azurelinuxagent.ga.controllermetrics import CpuMetrics, MemoryMetrics, MetricValue from azurelinuxagent.ga.cgroupconfigurator import CGroupConfigurator from azurelinuxagent.common.logger import Logger from azurelinuxagent.common.protocol.util import ProtocolUtil @@ -197,8 +197,8 @@ def run_and_wait(): monitor_log_collector.join() cgroups = [ - CpuCgroup("test", "dummy_cpu_path"), - MemoryCgroup("test", "dummy_memory_path") + CpuMetrics("test", "dummy_cpu_path"), + MemoryMetrics("test", "dummy_memory_path") ] monitor_log_collector = get_log_collector_monitor_handler(cgroups) monitor_log_collector.run_and_wait = run_and_wait diff --git a/tests/ga/test_cgroups.py b/tests/ga/test_controllermetrics.py similarity index 59% rename from tests/ga/test_cgroups.py rename to tests/ga/test_controllermetrics.py index 0ffcfed1bd..cdd31395f1 100644 --- a/tests/ga/test_cgroups.py +++ b/tests/ga/test_controllermetrics.py @@ -22,7 +22,7 @@ import random import shutil -from azurelinuxagent.ga.cgroup import CpuCgroup, MemoryCgroup, MetricsCounter, CounterNotFound +from azurelinuxagent.ga.controllermetrics import CpuMetrics, MemoryMetrics, MetricsCounter, CounterNotFound from azurelinuxagent.common.exception import CGroupsException from azurelinuxagent.common.osutil import get_osutil from azurelinuxagent.common.utils import fileutil @@ -36,35 +36,35 @@ def consume_cpu_time(): return waste -class TestCGroup(AgentTestCase): +class TestControllerMetrics(AgentTestCase): def test_is_active(self): - test_cgroup = CpuCgroup("test_extension", self.tmp_dir) - self.assertEqual(False, test_cgroup.is_active()) + test_metrics = CpuMetrics("test_extension", self.tmp_dir) + self.assertEqual(False, test_metrics.is_active()) with open(os.path.join(self.tmp_dir, "tasks"), mode="wb") as tasks: tasks.write(str(1000).encode()) - self.assertEqual(True, test_cgroup.is_active()) + self.assertEqual(True, test_metrics.is_active()) @patch("azurelinuxagent.common.logger.periodic_warn") def test_is_active_file_not_present(self, patch_periodic_warn): - test_cgroup = CpuCgroup("test_extension", self.tmp_dir) - self.assertEqual(False, test_cgroup.is_active()) + test_metrics = CpuMetrics("test_extension", self.tmp_dir) + self.assertEqual(False, test_metrics.is_active()) - test_cgroup = MemoryCgroup("test_extension", os.path.join(self.tmp_dir, "this_cgroup_does_not_exist")) - self.assertEqual(False, test_cgroup.is_active()) + test_metrics = MemoryMetrics("test_extension", os.path.join(self.tmp_dir, "this_cgroup_does_not_exist")) + self.assertEqual(False, test_metrics.is_active()) self.assertEqual(0, patch_periodic_warn.call_count) @patch("azurelinuxagent.common.logger.periodic_warn") def test_is_active_incorrect_file(self, patch_periodic_warn): open(os.path.join(self.tmp_dir, "tasks"), mode="wb").close() - test_cgroup = CpuCgroup("test_extension", os.path.join(self.tmp_dir, "tasks")) - self.assertEqual(False, test_cgroup.is_active()) + test_metrics = CpuMetrics("test_extension", os.path.join(self.tmp_dir, "tasks")) + self.assertEqual(False, test_metrics.is_active()) self.assertEqual(1, patch_periodic_warn.call_count) -class TestCpuCgroup(AgentTestCase): +class TestCpuMetrics(AgentTestCase): @classmethod def setUpClass(cls): AgentTestCase.setUpClass() @@ -96,147 +96,147 @@ def tearDownClass(cls): def setUp(self): AgentTestCase.setUp(self) - TestCpuCgroup.mock_read_file_map.clear() + TestCpuMetrics.mock_read_file_map.clear() def test_initialize_cpu_usage_should_set_current_cpu_usage(self): - cgroup = CpuCgroup("test", "/sys/fs/cgroup/cpu/system.slice/test") + metrics = CpuMetrics("test", "/sys/fs/cgroup/cpu/system.slice/test") - TestCpuCgroup.mock_read_file_map = { + TestCpuMetrics.mock_read_file_map = { "/proc/stat": os.path.join(data_dir, "cgroups", "proc_stat_t0"), - os.path.join(cgroup.path, "cpuacct.stat"): os.path.join(data_dir, "cgroups", "cpuacct.stat_t0") + os.path.join(metrics.path, "cpuacct.stat"): os.path.join(data_dir, "cgroups", "cpuacct.stat_t0") } - cgroup.initialize_cpu_usage() + metrics.initialize_cpu_usage() - self.assertEqual(cgroup._current_cgroup_cpu, 63763) - self.assertEqual(cgroup._current_system_cpu, 5496872) + self.assertEqual(metrics._current_cgroup_cpu, 63763) + self.assertEqual(metrics._current_system_cpu, 5496872) def test_get_cpu_usage_should_return_the_cpu_usage_since_its_last_invocation(self): osutil = get_osutil() - cgroup = CpuCgroup("test", "/sys/fs/cgroup/cpu/system.slice/test") + metrics = CpuMetrics("test", "/sys/fs/cgroup/cpu/system.slice/test") - TestCpuCgroup.mock_read_file_map = { + TestCpuMetrics.mock_read_file_map = { "/proc/stat": os.path.join(data_dir, "cgroups", "proc_stat_t0"), - os.path.join(cgroup.path, "cpuacct.stat"): os.path.join(data_dir, "cgroups", "cpuacct.stat_t0") + os.path.join(metrics.path, "cpuacct.stat"): os.path.join(data_dir, "cgroups", "cpuacct.stat_t0") } - cgroup.initialize_cpu_usage() + metrics.initialize_cpu_usage() - TestCpuCgroup.mock_read_file_map = { + TestCpuMetrics.mock_read_file_map = { "/proc/stat": os.path.join(data_dir, "cgroups", "proc_stat_t1"), - os.path.join(cgroup.path, "cpuacct.stat"): os.path.join(data_dir, "cgroups", "cpuacct.stat_t1") + os.path.join(metrics.path, "cpuacct.stat"): os.path.join(data_dir, "cgroups", "cpuacct.stat_t1") } - cpu_usage = cgroup.get_cpu_usage() + cpu_usage = metrics.get_cpu_usage() self.assertEqual(cpu_usage, round(100.0 * 0.000307697876885 * osutil.get_processor_cores(), 3)) - TestCpuCgroup.mock_read_file_map = { + TestCpuMetrics.mock_read_file_map = { "/proc/stat": os.path.join(data_dir, "cgroups", "proc_stat_t2"), - os.path.join(cgroup.path, "cpuacct.stat"): os.path.join(data_dir, "cgroups", "cpuacct.stat_t2") + os.path.join(metrics.path, "cpuacct.stat"): os.path.join(data_dir, "cgroups", "cpuacct.stat_t2") } - cpu_usage = cgroup.get_cpu_usage() + cpu_usage = metrics.get_cpu_usage() self.assertEqual(cpu_usage, round(100.0 * 0.000445181085968 * osutil.get_processor_cores(), 3)) def test_initialize_cpu_usage_should_set_the_cgroup_usage_to_0_when_the_cgroup_does_not_exist(self): - cgroup = CpuCgroup("test", "/sys/fs/cgroup/cpu/system.slice/test") + metrics = CpuMetrics("test", "/sys/fs/cgroup/cpu/system.slice/test") io_error_2 = IOError() io_error_2.errno = errno.ENOENT # "No such directory" - TestCpuCgroup.mock_read_file_map = { + TestCpuMetrics.mock_read_file_map = { "/proc/stat": os.path.join(data_dir, "cgroups", "proc_stat_t0"), - os.path.join(cgroup.path, "cpuacct.stat"): io_error_2 + os.path.join(metrics.path, "cpuacct.stat"): io_error_2 } - cgroup.initialize_cpu_usage() + metrics.initialize_cpu_usage() - self.assertEqual(cgroup._current_cgroup_cpu, 0) - self.assertEqual(cgroup._current_system_cpu, 5496872) # check the system usage just for test sanity + self.assertEqual(metrics._current_cgroup_cpu, 0) + self.assertEqual(metrics._current_system_cpu, 5496872) # check the system usage just for test sanity def test_initialize_cpu_usage_should_raise_an_exception_when_called_more_than_once(self): - cgroup = CpuCgroup("test", "/sys/fs/cgroup/cpu/system.slice/test") + metrics = CpuMetrics("test", "/sys/fs/cgroup/cpu/system.slice/test") - TestCpuCgroup.mock_read_file_map = { + TestCpuMetrics.mock_read_file_map = { "/proc/stat": os.path.join(data_dir, "cgroups", "proc_stat_t0"), - os.path.join(cgroup.path, "cpuacct.stat"): os.path.join(data_dir, "cgroups", "cpuacct.stat_t0") + os.path.join(metrics.path, "cpuacct.stat"): os.path.join(data_dir, "cgroups", "cpuacct.stat_t0") } - cgroup.initialize_cpu_usage() + metrics.initialize_cpu_usage() with self.assertRaises(CGroupsException): - cgroup.initialize_cpu_usage() + metrics.initialize_cpu_usage() def test_get_cpu_usage_should_raise_an_exception_when_initialize_cpu_usage_has_not_been_invoked(self): - cgroup = CpuCgroup("test", "/sys/fs/cgroup/cpu/system.slice/test") + metrics = CpuMetrics("test", "/sys/fs/cgroup/cpu/system.slice/test") with self.assertRaises(CGroupsException): - cpu_usage = cgroup.get_cpu_usage() # pylint: disable=unused-variable + cpu_usage = metrics.get_cpu_usage() # pylint: disable=unused-variable def test_get_throttled_time_should_return_the_value_since_its_last_invocation(self): test_file = os.path.join(self.tmp_dir, "cpu.stat") shutil.copyfile(os.path.join(data_dir, "cgroups", "cpu.stat_t0"), test_file) # throttled_time = 50 - cgroup = CpuCgroup("test", self.tmp_dir) - cgroup.initialize_cpu_usage() + metrics = CpuMetrics("test", self.tmp_dir) + metrics.initialize_cpu_usage() shutil.copyfile(os.path.join(data_dir, "cgroups", "cpu.stat_t1"), test_file) # throttled_time = 2075541442327 - throttled_time = cgroup.get_cpu_throttled_time() + throttled_time = metrics.get_cpu_throttled_time() self.assertEqual(throttled_time, float(2075541442327 - 50) / 1E9, "The value of throttled_time is incorrect") def test_get_tracked_metrics_should_return_the_throttled_time(self): - cgroup = CpuCgroup("test", os.path.join(data_dir, "cgroups")) - cgroup.initialize_cpu_usage() + metrics = CpuMetrics("test", os.path.join(data_dir, "cgroups")) + metrics.initialize_cpu_usage() def find_throttled_time(metrics): return [m for m in metrics if m.counter == MetricsCounter.THROTTLED_TIME] - found = find_throttled_time(cgroup.get_tracked_metrics()) + found = find_throttled_time(metrics.get_tracked_metrics()) self.assertTrue(len(found) == 0, "get_tracked_metrics should not fetch the throttled time by default. Found: {0}".format(found)) - found = find_throttled_time(cgroup.get_tracked_metrics(track_throttled_time=True)) + found = find_throttled_time(metrics.get_tracked_metrics(track_throttled_time=True)) self.assertTrue(len(found) == 1, "get_tracked_metrics should have fetched the throttled time by default. Found: {0}".format(found)) -class TestMemoryCgroup(AgentTestCase): +class TestMemoryMetrics(AgentTestCase): def test_get_metrics(self): - test_mem_cg = MemoryCgroup("test_extension", os.path.join(data_dir, "cgroups", "memory_mount")) + test_mem_metrics = MemoryMetrics("test_extension", os.path.join(data_dir, "cgroups", "memory_mount")) - memory_usage = test_mem_cg.get_memory_usage() + memory_usage = test_mem_metrics.get_memory_usage() self.assertEqual(150000, memory_usage) - max_memory_usage = test_mem_cg.get_max_memory_usage() + max_memory_usage = test_mem_metrics.get_max_memory_usage() self.assertEqual(1000000, max_memory_usage) - swap_memory_usage = test_mem_cg.try_swap_memory_usage() + swap_memory_usage = test_mem_metrics.try_swap_memory_usage() self.assertEqual(20000, swap_memory_usage) def test_get_metrics_when_files_not_present(self): - test_mem_cg = MemoryCgroup("test_extension", os.path.join(data_dir, "cgroups")) + test_mem_metrics = MemoryMetrics("test_extension", os.path.join(data_dir, "cgroups")) with self.assertRaises(IOError) as e: - test_mem_cg.get_memory_usage() + test_mem_metrics.get_memory_usage() self.assertEqual(e.exception.errno, errno.ENOENT) with self.assertRaises(IOError) as e: - test_mem_cg.get_max_memory_usage() + test_mem_metrics.get_max_memory_usage() self.assertEqual(e.exception.errno, errno.ENOENT) with self.assertRaises(IOError) as e: - test_mem_cg.try_swap_memory_usage() + test_mem_metrics.try_swap_memory_usage() self.assertEqual(e.exception.errno, errno.ENOENT) def test_get_memory_usage_counters_not_found(self): - test_mem_cg = MemoryCgroup("test_extension", os.path.join(data_dir, "cgroups", "missing_memory_counters")) + test_mem_metrics = MemoryMetrics("test_extension", os.path.join(data_dir, "cgroups", "missing_memory_counters")) with self.assertRaises(CounterNotFound): - test_mem_cg.get_memory_usage() + test_mem_metrics.get_memory_usage() - swap_memory_usage = test_mem_cg.try_swap_memory_usage() + swap_memory_usage = test_mem_metrics.try_swap_memory_usage() self.assertEqual(0, swap_memory_usage) diff --git a/tests/ga/test_monitor.py b/tests/ga/test_monitor.py index 1dbec27c39..420645fe0e 100644 --- a/tests/ga/test_monitor.py +++ b/tests/ga/test_monitor.py @@ -21,7 +21,7 @@ import string from azurelinuxagent.common import event, logger -from azurelinuxagent.ga.cgroup import CpuCgroup, MemoryCgroup, MetricValue, _REPORT_EVERY_HOUR +from azurelinuxagent.ga.controllermetrics import CpuMetrics, MemoryMetrics, MetricValue, _REPORT_EVERY_HOUR from azurelinuxagent.ga.cgroupstelemetry import CGroupsTelemetry from azurelinuxagent.common.event import EVENTS_DIRECTORY from azurelinuxagent.common.protocol.healthservice import HealthService @@ -222,7 +222,7 @@ def test_send_extension_metrics_telemetry_for_empty_cgroup(self, patch_poll_all_ self.assertEqual(0, patch_add_metric.call_count) @patch('azurelinuxagent.common.event.EventLogger.add_metric') - @patch("azurelinuxagent.ga.cgroup.MemoryCgroup.get_memory_usage") + @patch("azurelinuxagent.ga.controllermetrics.MemoryMetrics.get_memory_usage") @patch('azurelinuxagent.common.logger.Logger.periodic_warn') def test_send_extension_metrics_telemetry_handling_memory_cgroup_exceptions_errno2(self, patch_periodic_warn, # pylint: disable=unused-argument patch_get_memory_usage, @@ -231,14 +231,14 @@ def test_send_extension_metrics_telemetry_handling_memory_cgroup_exceptions_errn ioerror.errno = 2 patch_get_memory_usage.side_effect = ioerror - CGroupsTelemetry._tracked["/test/path"] = MemoryCgroup("cgroup_name", "/test/path") + CGroupsTelemetry._tracked["/test/path"] = MemoryMetrics("_cgroup_name", "/test/path") PollResourceUsage().run() self.assertEqual(0, patch_periodic_warn.call_count) self.assertEqual(0, patch_add_metric.call_count) # No metrics should be sent. @patch('azurelinuxagent.common.event.EventLogger.add_metric') - @patch("azurelinuxagent.ga.cgroup.CpuCgroup.get_cpu_usage") + @patch("azurelinuxagent.ga.controllermetrics.CpuMetrics.get_cpu_usage") @patch('azurelinuxagent.common.logger.Logger.periodic_warn') def test_send_extension_metrics_telemetry_handling_cpu_cgroup_exceptions_errno2(self, patch_periodic_warn, # pylint: disable=unused-argument patch_cpu_usage, patch_add_metric, @@ -247,7 +247,7 @@ def test_send_extension_metrics_telemetry_handling_cpu_cgroup_exceptions_errno2( ioerror.errno = 2 patch_cpu_usage.side_effect = ioerror - CGroupsTelemetry._tracked["/test/path"] = CpuCgroup("cgroup_name", "/test/path") + CGroupsTelemetry._tracked["/test/path"] = CpuMetrics("_cgroup_name", "/test/path") PollResourceUsage().run() self.assertEqual(0, patch_periodic_warn.call_count) diff --git a/tests/lib/mock_cgroup_environment.py b/tests/lib/mock_cgroup_environment.py index d9f79cb6a1..a8f5fa9a3a 100644 --- a/tests/lib/mock_cgroup_environment.py +++ b/tests/lib/mock_cgroup_environment.py @@ -122,7 +122,9 @@ _MOCKED_FILES_V1 = [ ("/proc/self/cgroup", os.path.join(data_dir, 'cgroups', 'v1', 'proc_self_cgroup')), - (r"/proc/[0-9]+/cgroup", os.path.join(data_dir, 'cgroups', 'v1', 'proc_pid_cgroup')) + (r"/proc/[0-9]+/cgroup", os.path.join(data_dir, 'cgroups', 'v1', 'proc_pid_cgroup')), + (r"/sys/fs/cgroup/cpu,cpuacct/system.slice/walinuxagent.service/cgroup.procs", os.path.join(data_dir, 'cgroups', 'cgroup.procs')), + (r"/sys/fs/cgroup/memory/system.slice/walinuxagent.service/cgroup.procs", os.path.join(data_dir, 'cgroups', 'cgroup.procs')) ] _MOCKED_FILES_V2 = [ @@ -130,7 +132,8 @@ (r"/proc/[0-9]+/cgroup", os.path.join(data_dir, 'cgroups', 'v2', 'proc_pid_cgroup')), ("/sys/fs/cgroup/cgroup.subtree_control", os.path.join(data_dir, 'cgroups', 'v2', 'sys_fs_cgroup_cgroup.subtree_control')), ("/sys/fs/cgroup/azure.slice/cgroup.subtree_control", os.path.join(data_dir, 'cgroups', 'v2', 'sys_fs_cgroup_cgroup.subtree_control')), - ("/sys/fs/cgroup/azure.slice/walinuxagent.service/cgroup.subtree_control", os.path.join(data_dir, 'cgroups', 'v2', 'sys_fs_cgroup_cgroup.subtree_control_empty')) + ("/sys/fs/cgroup/azure.slice/walinuxagent.service/cgroup.subtree_control", os.path.join(data_dir, 'cgroups', 'v2', 'sys_fs_cgroup_cgroup.subtree_control_empty')), + (r"/sys/fs/cgroup/system.slice/walinuxagent.service/cgroup.procs", os.path.join(data_dir, 'cgroups', 'cgroup.procs')) ] _MOCKED_FILES_HYBRID = [ diff --git a/tests/test_agent.py b/tests/test_agent.py index 4b643ca36f..df1a7ca131 100644 --- a/tests/test_agent.py +++ b/tests/test_agent.py @@ -24,8 +24,9 @@ from azurelinuxagent.common.exception import CGroupsException from azurelinuxagent.ga import logcollector, cgroupconfigurator from azurelinuxagent.common.utils import fileutil -from azurelinuxagent.ga.cgroupapi import get_cgroup_api, InvalidCgroupMountpointException +from azurelinuxagent.ga.cgroupapi import InvalidCgroupMountpointException, CgroupV1 from azurelinuxagent.ga.collect_logs import CollectLogsHandler +from azurelinuxagent.ga.controllermetrics import AGENT_LOG_COLLECTOR from tests.lib.mock_cgroup_environment import mock_cgroup_v1_environment from tests.lib.tools import AgentTestCase, data_dir, Mock, patch @@ -247,16 +248,24 @@ def test_calls_collect_logs_on_valid_cgroups_v1(self, mock_log_collector): CollectLogsHandler.enable_monitor_cgroups_check() mock_log_collector.run = Mock() - # Mock cgroup paths so process is in the log collector slice - def mock_cgroup_paths(*args, **kwargs): - if args and args[0] == "self": - relative_path = "{0}/{1}".format(cgroupconfigurator.LOGCOLLECTOR_SLICE, logcollector.CGROUPS_UNIT) - return (relative_path, relative_path) - return get_cgroup_api().get_process_cgroup_relative_paths(*args, **kwargs) + # Mock cgroup so process is in the log collector slice + def mock_cgroup(*args, **kwargs): # pylint: disable=W0613 + relative_path = "{0}/{1}".format(cgroupconfigurator.LOGCOLLECTOR_SLICE, logcollector.CGROUPS_UNIT) + return CgroupV1( + cgroup_name=AGENT_LOG_COLLECTOR, + controller_mountpoints={ + 'cpu,cpuacct':"/sys/fs/cgroup/cpu,cpuacct", + 'memory':"/sys/fs/cgroup/memory" + }, + controller_paths={ + 'cpu,cpuacct':"/sys/fs/cgroup/cpu,cpuacct/{0}".format(relative_path), + 'memory':"/sys/fs/cgroup/memory/{0}".format(relative_path) + } + ) with mock_cgroup_v1_environment(self.tmp_dir): - with patch("azurelinuxagent.ga.cgroupapi.SystemdCgroupApiv1.get_process_cgroup_paths", - side_effect=mock_cgroup_paths): + with patch("azurelinuxagent.ga.cgroupapi.SystemdCgroupApiv1.get_process_cgroup", + side_effect=mock_cgroup): agent = Agent(False, conf_file_path=os.path.join(data_dir, "test_waagent.conf")) agent.collect_logs(is_full_mode=True) @@ -296,17 +305,26 @@ def test_doesnt_call_collect_logs_on_invalid_cgroups_v1(self, mock_log_collector CollectLogsHandler.enable_monitor_cgroups_check() mock_log_collector.run = Mock() - # Mock cgroup paths so process is in incorrect slice - def mock_cgroup_paths(*args, **kwargs): - if args and args[0] == "self": - return ("NOT_THE_CORRECT_PATH", "NOT_THE_CORRECT_PATH") - return get_cgroup_api().get_process_cgroup_relative_paths(*args, **kwargs) + # Mock cgroup so process is in incorrect slice + def mock_cgroup(*args, **kwargs): # pylint: disable=W0613 + relative_path = "NOT_THE_CORRECT_PATH" + return CgroupV1( + cgroup_name=AGENT_LOG_COLLECTOR, + controller_mountpoints={ + 'cpu,cpuacct': "/sys/fs/cgroup/cpu,cpuacct", + 'memory': "/sys/fs/cgroup/memory" + }, + controller_paths={ + 'cpu,cpuacct': "/sys/fs/cgroup/cpu,cpuacct/{0}".format(relative_path), + 'memory': "/sys/fs/cgroup/memory/{0}".format(relative_path) + } + ) def raise_on_sys_exit(*args): raise RuntimeError(args[0] if args else "Exiting") with mock_cgroup_v1_environment(self.tmp_dir): - with patch("azurelinuxagent.ga.cgroupapi.SystemdCgroupApiv1.get_process_cgroup_paths", side_effect=mock_cgroup_paths): + with patch("azurelinuxagent.ga.cgroupapi.SystemdCgroupApiv1.get_process_cgroup", side_effect=mock_cgroup): agent = Agent(False, conf_file_path=os.path.join(data_dir, "test_waagent.conf")) with patch("sys.exit", side_effect=raise_on_sys_exit) as mock_exit: @@ -346,19 +364,25 @@ def test_doesnt_call_collect_logs_if_either_controller_not_mounted(self, mock_lo CollectLogsHandler.enable_monitor_cgroups_check() mock_log_collector.run = Mock() - # Mock cgroup paths so process is in the log collector slice and cpu is not mounted - def mock_cgroup_paths(*args, **kwargs): - if args and args[0] == "self": - relative_path = "{0}/{1}".format(cgroupconfigurator.LOGCOLLECTOR_SLICE, logcollector.CGROUPS_UNIT) - return (None, relative_path) - return get_cgroup_api().get_process_cgroup_relative_paths(*args, **kwargs) + # Mock cgroup so process is in the log collector slice and cpu is not mounted + def mock_cgroup(*args, **kwargs): # pylint: disable=W0613 + relative_path = "{0}/{1}".format(cgroupconfigurator.LOGCOLLECTOR_SLICE, logcollector.CGROUPS_UNIT) + return CgroupV1( + cgroup_name=AGENT_LOG_COLLECTOR, + controller_mountpoints={ + 'memory': "/sys/fs/cgroup/memory" + }, + controller_paths={ + 'memory': "/sys/fs/cgroup/memory/{0}".format(relative_path) + } + ) def raise_on_sys_exit(*args): raise RuntimeError(args[0] if args else "Exiting") with mock_cgroup_v1_environment(self.tmp_dir): - with patch("azurelinuxagent.ga.cgroupapi.SystemdCgroupApiv1.get_process_cgroup_paths", - side_effect=mock_cgroup_paths): + with patch("azurelinuxagent.ga.cgroupapi.SystemdCgroupApiv1.get_process_cgroup", + side_effect=mock_cgroup): agent = Agent(False, conf_file_path=os.path.join(data_dir, "test_waagent.conf")) with patch("sys.exit", side_effect=raise_on_sys_exit) as mock_exit: diff --git a/tests_e2e/tests/lib/cgroup_helpers.py b/tests_e2e/tests/lib/cgroup_helpers.py index 1fe21c329a..c3bb468b02 100644 --- a/tests_e2e/tests/lib/cgroup_helpers.py +++ b/tests_e2e/tests/lib/cgroup_helpers.py @@ -7,7 +7,7 @@ from azurelinuxagent.common.osutil import systemd from azurelinuxagent.common.utils import shellutil from azurelinuxagent.common.version import DISTRO_NAME, DISTRO_VERSION -from azurelinuxagent.ga.cgroupapi import get_cgroup_api +from azurelinuxagent.ga.cgroupapi import get_cgroup_api, SystemdCgroupApiv1 from tests_e2e.tests.lib.agent_log import AgentLog from tests_e2e.tests.lib.logging import log from tests_e2e.tests.lib.retry import retry_if_false @@ -164,9 +164,14 @@ def check_log_message(message, after_timestamp=datetime.datetime.min): return False -def get_unit_cgroup_paths(unit_name): +def get_unit_cgroup_proc_path(unit_name, controller): """ - Returns the cgroup paths for the given unit + Returns the cgroup.procs path for the given unit and controller. """ cgroups_api = get_cgroup_api() - return cgroups_api.get_unit_cgroup_paths(unit_name) + unit_cgroup = cgroups_api.get_unit_cgroup(unit_name=unit_name, cgroup_name="test cgroup") + if isinstance(cgroups_api, SystemdCgroupApiv1): + return unit_cgroup.get_controller_procs_path(controller=controller) + else: + return unit_cgroup.get_procs_path() + diff --git a/tests_e2e/tests/scripts/agent_cgroups_process_check-unknown_process_check.py b/tests_e2e/tests/scripts/agent_cgroups_process_check-unknown_process_check.py index d1b3014a03..fff5746cce 100755 --- a/tests_e2e/tests/scripts/agent_cgroups_process_check-unknown_process_check.py +++ b/tests_e2e/tests/scripts/agent_cgroups_process_check-unknown_process_check.py @@ -18,14 +18,13 @@ # This script forces the process check by putting unknown process in the agent's cgroup -import os import subprocess import datetime from assertpy import fail from azurelinuxagent.common.utils import shellutil -from tests_e2e.tests.lib.cgroup_helpers import check_agent_quota_disabled, check_log_message, get_unit_cgroup_paths, AGENT_SERVICE_NAME +from tests_e2e.tests.lib.cgroup_helpers import check_agent_quota_disabled, check_log_message, get_unit_cgroup_proc_path, AGENT_SERVICE_NAME from tests_e2e.tests.lib.logging import log from tests_e2e.tests.lib.retry import retry_if_false @@ -62,8 +61,8 @@ def disable_agent_cgroups_with_unknown_process(pid): Note: System may kick the added process out of the cgroups, keeps adding until agent detect that process """ - def unknown_process_found(cpu_cgroup): - cgroup_procs_path = os.path.join(cpu_cgroup, "cgroup.procs") + def unknown_process_found(): + cgroup_procs_path = get_unit_cgroup_proc_path(AGENT_SERVICE_NAME, 'cpu,cpuacct') log.info("Adding dummy process %s to cgroup.procs file %s", pid, cgroup_procs_path) try: with open(cgroup_procs_path, 'a') as f: @@ -81,9 +80,7 @@ def unknown_process_found(cpu_cgroup): pid)), attempts=3) return found and retry_if_false(check_agent_quota_disabled, attempts=3) - cpu_cgroup, _ = get_unit_cgroup_paths(AGENT_SERVICE_NAME) - - found: bool = retry_if_false(lambda: unknown_process_found(cpu_cgroup), attempts=3) + found: bool = retry_if_false(unknown_process_found, attempts=3) if not found: fail("The agent did not detect unknown process: {0}".format(pid))