diff --git a/azurelinuxagent/common/cgroup.py b/azurelinuxagent/common/cgroup.py index 2ad70c1b9d..912423c0c2 100644 --- a/azurelinuxagent/common/cgroup.py +++ b/azurelinuxagent/common/cgroup.py @@ -13,6 +13,8 @@ # limitations under the License. # # Requires Python 2.6+ and Openssl 1.0+ +from collections import namedtuple + import errno import os import re @@ -23,6 +25,21 @@ from azurelinuxagent.common.osutil import get_osutil from azurelinuxagent.common.utils import fileutil + +MetricValue = namedtuple('Metric', ['category', 'counter', 'instance', 'value']) + + +class MetricsCategory(object): + MEMORY_CATEGORY = "Memory" + CPU_CATEGORY = "CPU" + + +class MetricsCounter(object): + PROCESSOR_PERCENT_TIME = "% Processor Time" + TOTAL_MEM_USAGE = "Total Memory Usage" + MAX_MEM_USAGE = "Max Memory Usage" + + re_user_system_times = re.compile(r'user (\d+)\nsystem (\d+)\n') @@ -135,6 +152,12 @@ def get_tracked_processes(self): ' Internal error: {1}'.format(self.path, ustr(e))) return procs + def get_tracked_metrics(self): + """ + Retrieves the current value of the metrics tracked for this cgroup and returns them as an array + """ + raise NotImplementedError() + class CpuCgroup(CGroup): def __init__(self, name, cgroup_path): @@ -211,6 +234,11 @@ def get_cpu_usage(self): return round(100.0 * float(cgroup_delta) / float(system_delta), 3) + def get_tracked_metrics(self): + return [ + MetricValue(MetricsCategory.CPU_CATEGORY, MetricsCounter.PROCESSOR_PERCENT_TIME, self.name, self.get_cpu_usage()), + ] + class MemoryCgroup(CGroup): def __init__(self, name, cgroup_path): @@ -259,3 +287,9 @@ def get_max_memory_usage(self): raise CGroupsException("Exception while attempting to read {0}".format("memory.usage_in_bytes"), e) return int(usage) + + def get_tracked_metrics(self): + return [ + MetricValue(MetricsCategory.MEMORY_CATEGORY, MetricsCounter.TOTAL_MEM_USAGE, self.name, self.get_memory_usage()), + MetricValue(MetricsCategory.MEMORY_CATEGORY, MetricsCounter.MAX_MEM_USAGE, self.name, self.get_max_memory_usage()), + ] diff --git a/azurelinuxagent/common/cgroupapi.py b/azurelinuxagent/common/cgroupapi.py index a1f620c1b6..41d8e3fc07 100644 --- a/azurelinuxagent/common/cgroupapi.py +++ b/azurelinuxagent/common/cgroupapi.py @@ -1,3 +1,4 @@ +# -*- coding: utf-8 -*- # Copyright 2018 Microsoft Corporation # # Licensed under the Apache License, Version 2.0 (the "License"); @@ -432,6 +433,7 @@ class SystemdCgroupsApi(CGroupsApi): """ def __init__(self): self._cgroup_mountpoints = None + self._agent_unit_name = None @staticmethod def get_systemd_version(): @@ -620,6 +622,35 @@ def create_cgroup(controller): return cgroups + def get_agent_unit_name(self): + if self._agent_unit_name is None: + self._agent_unit_name = get_osutil().get_service_name() + ".service" + return self._agent_unit_name + + @staticmethod + def get_processes_in_cgroup(unit_name): + """ + Returns an array of tuples with the PID and command line of the processes that are currently + within the cgroup for the given unit. + """ + # + # The output of the command is similar to + # + # Unit walinuxagent.service (/system.slice/walinuxagent.service): + # ├─27519 /usr/bin/python3 -u /usr/sbin/waagent -daemon + # └─27547 python3 -u bin/WALinuxAgent-2.2.48.1-py2.7.egg -run-exthandlers + # + output = shellutil.run_command(['systemd-cgls', '--unit', unit_name]) + + processes = [] + + for line in output.splitlines(): + match = re.match('[^\d]*(?P\d+)\s+(?P.+)', line) + if match is not None: + processes.append((match.group('pid'), match.group('command'))) + + return processes + @staticmethod def _is_systemd_failure(scope_name, process_output): unit_not_found = "Unit {0} not found.".format(scope_name) diff --git a/azurelinuxagent/common/cgroupconfigurator.py b/azurelinuxagent/common/cgroupconfigurator.py index 2079251c7c..42ba7f3c20 100644 --- a/azurelinuxagent/common/cgroupconfigurator.py +++ b/azurelinuxagent/common/cgroupconfigurator.py @@ -15,6 +15,7 @@ # Requires Python 2.6+ and Openssl 1.0+ import os +import re import subprocess from azurelinuxagent.common import logger @@ -23,7 +24,6 @@ from azurelinuxagent.common.cgroupstelemetry import CGroupsTelemetry from azurelinuxagent.common.exception import ExtensionErrorCodes, CGroupsException from azurelinuxagent.common.future import ustr -from azurelinuxagent.common.osutil import get_osutil from azurelinuxagent.common.version import get_distro from azurelinuxagent.common.utils.extensionprocessutil import handle_process_completion from azurelinuxagent.common.event import add_event, WALAEventOperation @@ -41,6 +41,8 @@ def __init__(self): self._cgroups_supported = False self._cgroups_enabled = False self._cgroups_api = None + self._get_processes_in_agent_cgroup_last_error = None + self._get_processes_in_agent_cgroup_error_count = 0 def initialize(self): try: @@ -113,7 +115,7 @@ def log_cgroup_warn(format_string, *args): # # check the cgroups for the agent # - agent_unit_name = get_osutil().get_service_name() + ".service" + agent_unit_name = self._cgroups_api.get_agent_unit_name() cpu_cgroup_relative_path, memory_cgroup_relative_path = self._cgroups_api.get_process_cgroup_relative_paths("self") if cpu_cgroup_relative_path is None: log_cgroup_warn("The agent's process is not within a CPU cgroup") @@ -156,6 +158,9 @@ def log_cgroup_warn(format_string, *args): def enabled(self): return self._cgroups_enabled + def resource_limits_enforced(self): + return False + def enable(self): if not self._cgroups_supported: raise CGroupsException("Attempted to enable cgroups, but they are not supported on the current platform") @@ -171,7 +176,7 @@ def _invoke_cgroup_operation(self, operation, error_message, on_error=None): Ensures the given operation is invoked only if cgroups are enabled and traps any errors on the operation. """ if not self.enabled(): - return + return None try: return operation() @@ -211,6 +216,29 @@ def __impl(): self._invoke_cgroup_operation(__impl, "Failed to delete cgroups for extension '{0}'.".format(name)) + def get_processes_in_agent_cgroup(self): + """ + Returns an array of tuples with the PID and command line of the processes that are currently within the cgroup for the given unit. + + The return value can be None if cgroups are not enabled or if an error occurs during the operation. + """ + def __impl(): + agent_unit = self._cgroups_api.get_agent_unit_name() + return self._cgroups_api.get_processes_in_cgroup(agent_unit) + + def __on_error(exception): + # + # Send telemetry for a small sample of errors (if any) + # + self._get_processes_in_agent_cgroup_error_count = self._get_processes_in_agent_cgroup_error_count + 1 + if self._get_processes_in_agent_cgroup_error_count <= 5: + message = "Failed to list the processes in the agent's cgroup: {0}", ustr(exception) + if message != self._get_processes_in_agent_cgroup_last_error: + add_event(op=WALAEventOperation.CGroupsDebug, message=message) + self._get_processes_in_agent_cgroup_last_error = message + + return self._invoke_cgroup_operation(__impl, "Failed to list the processes in the agent's cgroup.", on_error=__on_error) + def start_extension_command(self, extension_name, command, timeout, shell, cwd, env, stdout, stderr, error_code=ExtensionErrorCodes.PluginUnknownFailure): """ @@ -261,3 +289,41 @@ def get_instance(): if CGroupConfigurator._instance is None: CGroupConfigurator._instance = CGroupConfigurator.__impl() return CGroupConfigurator._instance + + @staticmethod + def is_agent_process(command_line): + """ + Returns true if the given command line corresponds to a process started by the agent. + + NOTE: The function uses pattern matching to determine whether the process was spawned by the agent; this is more of a heuristic + than an exact check. + """ + patterns = [ + r".*waagent -daemon.*", + r".*(WALinuxAgent-.+\.egg|waagent) -run-exthandlers", + # The processes in the agent's cgroup are listed using systemd-cgls + r"^systemd-cgls --unit walinuxagent.service$", + # Extensions are started using systemd-run + r"^systemd-run --unit=.+ --scope ", + # + # The rest of the commands are started by the environment thread; many of them are distro-specific so this list may need + # additions as we add support for more distros. + # + # *** Monitor DHCP client restart + # + r"^pidof (dhclient|dhclient3|systemd-networkd)", + r"^ip route (show|add)", + # + # *** Enable firewall + # + r"^iptables --version$", + r"^iptables .+ -t security", + # + # *** Monitor host name changes + # + r"^ifdown .+ && ifup .+", + ] + for p in patterns: + if re.match(p, command_line) is not None: + return True + return False diff --git a/azurelinuxagent/common/cgroupstelemetry.py b/azurelinuxagent/common/cgroupstelemetry.py index 3010449c2e..eab38b78c5 100644 --- a/azurelinuxagent/common/cgroupstelemetry.py +++ b/azurelinuxagent/common/cgroupstelemetry.py @@ -15,34 +15,18 @@ # Requires Python 2.6+ and Openssl 1.0+ import errno import threading -from collections import namedtuple from azurelinuxagent.common import logger -from azurelinuxagent.common.cgroup import CpuCgroup, CGroupContollers -from azurelinuxagent.common.exception import CGroupsException +from azurelinuxagent.common.cgroup import CpuCgroup from azurelinuxagent.common.future import ustr from azurelinuxagent.common.logger import EVERY_SIX_HOURS from azurelinuxagent.common.resourceusage import ProcessInfo -MetricValue = namedtuple('Metric', ['category', 'counter', 'instance', 'value']) -StatmMetricValue = namedtuple('StatmMetricValue', ['pid_name_cmdline', 'resource_metric']) - DELIM = " | " DEFAULT_PROCESS_NAME = "NO_PROCESS_FOUND" DEFAULT_PROCESS_COMMANDLINE = "NO_CMDLINE_FOUND" -class MetricsCategory(object): - MEMORY_CATEGORY = "Memory" - PROCESS_CATEGORY = "Process" - - -class MetricsCounter(object): - PROCESSOR_PERCENT_TIME = "% Processor Time" - TOTAL_MEM_USAGE = "Total Memory Usage" - MAX_MEM_USAGE = "Max Memory Usage" - - class CGroupsTelemetry(object): """ """ @@ -110,24 +94,9 @@ def poll_all_tracked(): metrics = [] with CGroupsTelemetry._rlock: - def new_cpu_metric(name, value): - return MetricValue(MetricsCategory.PROCESS_CATEGORY, MetricsCounter.PROCESSOR_PERCENT_TIME, name, value) - - def new_memory_metric(name, value): - return MetricValue(MetricsCategory.MEMORY_CATEGORY, MetricsCounter.TOTAL_MEM_USAGE, name, value) - - def new_max_memory_metric(name, value): - return MetricValue(MetricsCategory.MEMORY_CATEGORY, MetricsCounter.MAX_MEM_USAGE, name, value) - for cgroup in CGroupsTelemetry._tracked[:]: try: - if cgroup.controller == CGroupContollers.CPU: - metrics.append(new_cpu_metric(cgroup.name, cgroup.get_cpu_usage())) - elif cgroup.controller == CGroupContollers.MEMORY: - metrics.append(new_memory_metric(cgroup.name, cgroup.get_memory_usage())) - metrics.append(new_max_memory_metric(cgroup.name, cgroup.get_max_memory_usage())) - else: - raise CGroupsException('CGroup controller {0} is not supported for cgroup {1}'.format(cgroup.controller, cgroup.name)) + metrics.extend(cgroup.get_tracked_metrics()) except Exception as e: # There can be scenarios when the CGroup has been deleted by the time we are fetching the values # from it. This would raise IOError with file entry not found (ERRNO: 2). We do not want to log diff --git a/azurelinuxagent/common/event.py b/azurelinuxagent/common/event.py index e1eaa8151c..e5860d9055 100644 --- a/azurelinuxagent/common/event.py +++ b/azurelinuxagent/common/event.py @@ -72,6 +72,7 @@ class WALAEventOperation: ArtifactsProfileBlob = "ArtifactsProfileBlob" AutoUpdate = "AutoUpdate" CGroupsCleanUp = "CGroupsCleanUp" + CGroupsDebug = "CGroupsDebug" CGroupsInfo = "CGroupsInfo" CGroupsInitialize = "CGroupsInitialize" CGroupsLimitsCrossed = "CGroupsLimitsCrossed" diff --git a/azurelinuxagent/ga/monitor.py b/azurelinuxagent/ga/monitor.py index e230063c6a..d4c31fa197 100644 --- a/azurelinuxagent/ga/monitor.py +++ b/azurelinuxagent/ga/monitor.py @@ -17,11 +17,11 @@ import datetime import threading -import time import uuid import azurelinuxagent.common.logger as logger import azurelinuxagent.common.utils.networkutil as networkutil +from azurelinuxagent.common.cgroupconfigurator import CGroupConfigurator from azurelinuxagent.common.cgroupstelemetry import CGroupsTelemetry from azurelinuxagent.common.errorstate import ErrorState from azurelinuxagent.common.event import add_event, WALAEventOperation, report_metric, collect_events @@ -36,38 +36,130 @@ from azurelinuxagent.ga.periodic_operation import PeriodicOperation -def generate_extension_metrics_telemetry_dictionary(schema_version=1.0, - performance_metrics=None): - if schema_version == 1.0: - telemetry_dict = {"SchemaVersion": 1.0} - if performance_metrics: - telemetry_dict["PerfMetrics"] = performance_metrics - return telemetry_dict - else: - return None - - def get_monitor_handler(): return MonitorHandler() +class PollResourceUsageOperation(PeriodicOperation): + """ + Periodic operation to poll the tracked cgroups for resource usage data. + + It also checks whether there are processes in the agent's cgroup that should not be there. + """ + def __init__(self): + super(PollResourceUsageOperation, self).__init__( + name="poll resource usage", + operation=self._operation_impl, + period=datetime.timedelta(minutes=5)) + self._last_error = None + self._error_count = 0 + + def _operation_impl(self): + # + # Check the processes in the agent cgroup + # + processes = CGroupConfigurator.get_instance().get_processes_in_agent_cgroup() + + if processes is not None: + unexpected_processes = [] + + for (_, command_line) in processes: + if not CGroupConfigurator.is_agent_process(command_line): + unexpected_processes.append(command_line) + + # Report a small sample of unexpected processes + if len(unexpected_processes) > 0 and self._error_count < 5: + unexpected_processes.sort() + error = ustr(unexpected_processes) + if error != self._last_error: + self._error_count += 1 + self._last_error = error + message = "The agent's cgroup includes unexpected processes: {0}".format(error) + logger.info(message) + add_event(op=WALAEventOperation.CGroupsDebug, message=message) + + # + # Report metrics + # + metrics = CGroupsTelemetry.poll_all_tracked() + + for metric in metrics: + report_metric(metric.category, metric.counter, metric.instance, metric.value) + + +class ResetPeriodicLogMessagesOperation(PeriodicOperation): + """ + Periodic operation to clean up the hash-tables maintained by the loggers. For reference, please check + azurelinuxagent.common.logger.Logger and azurelinuxagent.common.event.EventLogger classes + """ + def __init__(self): + super(ResetPeriodicLogMessagesOperation, self).__init__( + name="reset periodic log messages", + operation=ResetPeriodicLogMessagesOperation._operation_impl, + period=datetime.timedelta(hours=12)) + + @staticmethod + def _operation_impl(): + logger.reset_periodic() + + +class ReportNetworkErrorsOperation(PeriodicOperation): + def __init__(self): + super(ReportNetworkErrorsOperation, self).__init__( + name="report network errors", + operation=ReportNetworkErrorsOperation._operation_impl, + period=datetime.timedelta(minutes=30)) + + @staticmethod + def _operation_impl(): + io_errors = IOErrorCounter.get_and_reset() + hostplugin_errors = io_errors.get("hostplugin") + protocol_errors = io_errors.get("protocol") + other_errors = io_errors.get("other") + + if hostplugin_errors > 0 or protocol_errors > 0 or other_errors > 0: + msg = "hostplugin:{0};protocol:{1};other:{2}".format(hostplugin_errors, protocol_errors, other_errors) + add_event(op=WALAEventOperation.HttpErrors, message=msg) + + +class ReportNetworkConfigurationChangesOperation(PeriodicOperation): + """ + Periodic operation to check and log changes in network configuration. + """ + + def __init__(self): + super(ReportNetworkConfigurationChangesOperation, self).__init__( + name="report network configuration changes", + operation=self._operation_impl, + period=datetime.timedelta(minutes=1)) + self.osutil = get_osutil() + self.last_route_table_hash = b'' + self.last_nic_state = {} + + def _operation_impl(self): + raw_route_list = self.osutil.read_route_table() + digest = hash_strings(raw_route_list) + if digest != self.last_route_table_hash: + self.last_route_table_hash = digest + route_list = self.osutil.get_list_of_routes(raw_route_list) + logger.info("Route table: [{0}]".format(",".join(map(networkutil.RouteEntry.to_json, route_list)))) + + nic_state = self.osutil.get_nic_state() + if nic_state != self.last_nic_state: + description = "Initial" if self.last_nic_state == {} else "Updated" + logger.info("{0} NIC state: [{1}]".format(description, ", ".join(map(str, nic_state.values())))) + self.last_nic_state = nic_state + + class MonitorHandler(object): # telemetry EVENT_COLLECTION_PERIOD = datetime.timedelta(minutes=1) - # host health - TELEMETRY_HEARTBEAT_PERIOD = datetime.timedelta(minutes=30) - # cgroup data period - CGROUP_TELEMETRY_POLLING_PERIOD = datetime.timedelta(minutes=5) # host plugin HOST_PLUGIN_HEARTBEAT_PERIOD = datetime.timedelta(minutes=1) HOST_PLUGIN_HEALTH_PERIOD = datetime.timedelta(minutes=5) # imds IMDS_HEARTBEAT_PERIOD = datetime.timedelta(minutes=1) IMDS_HEALTH_PERIOD = datetime.timedelta(minutes=3) - # log network configuration - LOG_NETWORK_CONFIGURATION_PERIOD = datetime.timedelta(minutes=1) - # Resetting loggers period - RESET_LOGGERS_PERIOD = datetime.timedelta(hours=12) def __init__(self): self.osutil = get_osutil() @@ -75,19 +167,17 @@ def __init__(self): self.event_thread = None self._periodic_operations = [ - PeriodicOperation("reset_loggers", self.reset_loggers, self.RESET_LOGGERS_PERIOD), + ResetPeriodicLogMessagesOperation(), PeriodicOperation("collect_and_send_events", self.collect_and_send_events, self.EVENT_COLLECTION_PERIOD), - PeriodicOperation("send_telemetry_heartbeat", self.send_telemetry_heartbeat, self.TELEMETRY_HEARTBEAT_PERIOD), - PeriodicOperation("poll_telemetry_metrics usage", self.poll_telemetry_metrics, self.CGROUP_TELEMETRY_POLLING_PERIOD), + ReportNetworkErrorsOperation(), + PollResourceUsageOperation(), PeriodicOperation("send_host_plugin_heartbeat", self.send_host_plugin_heartbeat, self.HOST_PLUGIN_HEARTBEAT_PERIOD), PeriodicOperation("send_imds_heartbeat", self.send_imds_heartbeat, self.IMDS_HEARTBEAT_PERIOD), - PeriodicOperation("log_altered_network_configuration", self.log_altered_network_configuration, self.LOG_NETWORK_CONFIGURATION_PERIOD), + ReportNetworkConfigurationChangesOperation(), ] self.protocol = None self.protocol_util = None self.health_service = None - self.last_route_table_hash = b'' - self.last_nic_state = {} self.should_run = True self.heartbeat_id = str(uuid.uuid4()).upper() @@ -129,15 +219,6 @@ def start(self, init_data=False): self.event_thread.setName("MonitorHandler") self.event_thread.start() - def collect_and_send_events(self): - """ - Periodically send any events located in the events folder - """ - event_list = collect_events() - - if len(event_list.events) > 0: - self.protocol.report_event(event_list) - def daemon(self, init_data=False): try: if init_data: @@ -158,13 +239,14 @@ def daemon(self, init_data=False): except Exception as e: logger.error("An error occurred in the monitor thread; will exit the thread.\n{0}", ustr(e)) - def reset_loggers(self): + def collect_and_send_events(self): """ - The loggers maintain hash-tables in memory and they need to be cleaned up from time to time. - For reference, please check azurelinuxagent.common.logger.Logger and - azurelinuxagent.common.event.EventLogger classes + Periodically send any events located in the events folder """ - logger.reset_periodic() + event_list = collect_events() + + if len(event_list.events) > 0: + self.protocol.report_event(event_list) def send_imds_heartbeat(self): """ @@ -233,47 +315,3 @@ def send_host_plugin_heartbeat(self): message=msg, log_event=False) - def send_telemetry_heartbeat(self): - io_errors = IOErrorCounter.get_and_reset() - hostplugin_errors = io_errors.get("hostplugin") - protocol_errors = io_errors.get("protocol") - other_errors = io_errors.get("other") - - if hostplugin_errors > 0 or protocol_errors > 0 or other_errors > 0: - msg = "hostplugin:{0};protocol:{1};other:{2}".format(hostplugin_errors, protocol_errors, - other_errors) - add_event( - name=AGENT_NAME, - version=CURRENT_VERSION, - op=WALAEventOperation.HttpErrors, - is_success=True, - message=msg, - log_event=False) - - def poll_telemetry_metrics(self): - """ - This method polls the tracked cgroups to get data from the cgroups filesystem and send it to the performance counters database. - - :return: List of Metrics (which would be sent to PerfCounterMetrics directly. - """ - metrics = CGroupsTelemetry.poll_all_tracked() - - for metric in metrics: - report_metric(metric.category, metric.counter, metric.instance, metric.value) - - def log_altered_network_configuration(self): - """ - Check various pieces of network configuration and, if altered since the last check, log the new state. - """ - raw_route_list = self.osutil.read_route_table() - digest = hash_strings(raw_route_list) - if digest != self.last_route_table_hash: - self.last_route_table_hash = digest - route_list = self.osutil.get_list_of_routes(raw_route_list) - logger.info("Route table: [{0}]".format(",".join(map(networkutil.RouteEntry.to_json, route_list)))) - - nic_state = self.osutil.get_nic_state() - if nic_state != self.last_nic_state: - description = "Initial" if self.last_nic_state == {} else "Updated" - logger.info("{0} NIC state: [{1}]".format(description, ", ".join(map(str, nic_state.values())))) - self.last_nic_state = nic_state diff --git a/tests/common/mock_cgroup_commands.py b/tests/common/mock_cgroup_commands.py index 7bc353399a..223ab0f25d 100644 --- a/tests/common/mock_cgroup_commands.py +++ b/tests/common/mock_cgroup_commands.py @@ -1,3 +1,4 @@ +# -*- coding: utf-8 -*- # Copyright 2020 Microsoft Corporation # # Licensed under the Apache License, Version 2.0 (the "License"); @@ -28,12 +29,12 @@ # The output comes from an Ubuntu 18 system # _default_commands = [ - (r"systemctl --version", + (r"^systemctl --version$", '''systemd 237 +PAM +AUDIT +SELINUX +IMA +APPARMOR +SMACK +SYSVINIT +UTMP +LIBCRYPTSETUP +GCRYPT +GNUTLS +ACL +XZ +LZ4 +SECCOMP +BLKID +ELFUTILS +KMOD -IDN2 +IDN -PCRE2 default-hierarchy=hybrid '''), - (r"mount -t cgroup", + (r"^mount -t cgroup$", '''cgroup on /sys/fs/cgroup/systemd type cgroup (rw,nosuid,nodev,noexec,relatime,xattr,name=systemd) cgroup on /sys/fs/cgroup/rdma type cgroup (rw,nosuid,nodev,noexec,relatime,rdma) cgroup on /sys/fs/cgroup/cpuset type cgroup (rw,nosuid,nodev,noexec,relatime,cpuset) @@ -48,30 +49,37 @@ cgroup on /sys/fs/cgroup/blkio type cgroup (rw,nosuid,nodev,noexec,relatime,blkio) '''), - (r"mount -t cgroup2", + (r"^mount -t cgroup2$", '''cgroup on /sys/fs/cgroup/unified type cgroup2 (rw,nosuid,nodev,noexec,relatime) '''), - (r"systemctl show walinuxagent\.service --property CPUAccounting", + (r"^systemctl show walinuxagent\.service --property CPUAccounting$", '''CPUAccounting=no '''), - (r"systemctl show walinuxagent\.service --property MemoryAccounting", + (r"^systemctl show walinuxagent\.service --property MemoryAccounting$", '''MemoryAccounting=no '''), - (r"systemd-run --unit=([^\s]+) --scope ([^\s]+)", + (r"^systemd-run --unit=([^\s]+) --scope ([^\s]+)", ''' Running scope as unit: TEST_UNIT.scope Thu 28 May 2020 07:25:55 AM PDT +'''), + + (r"^systemd-cgls --unit walinuxagent.service$", +''' +Unit walinuxagent.service (/system.slice/walinuxagent.service): +├─27519 /usr/bin/python3 -u /usr/sbin/waagent -daemon +└─27547 python3 -u bin/WALinuxAgent-2.2.48.1-py2.7.egg -run-exthandlers '''), ] -_default_files = ( - (r"/proc/self/cgroup", os.path.join(data_dir, 'cgroups', 'proc_self_cgroup')), - (r"/proc/[0-9]+/cgroup", os.path.join(data_dir, 'cgroups', 'proc_pid_cgroup')), - (r"/sys/fs/cgroup/unified/cgroup.controllers", os.path.join(data_dir, 'cgroups', 'sys_fs_cgroup_unified_cgroup.controllers')), -) +_default_files = [ + (r"^/proc/self/cgroup$", os.path.join(data_dir, 'cgroups', 'proc_self_cgroup')), + (r"^/proc/[0-9]+/cgroup$", os.path.join(data_dir, 'cgroups', 'proc_pid_cgroup')), + (r"^/sys/fs/cgroup/unified/cgroup.controllers$", os.path.join(data_dir, 'cgroups', 'sys_fs_cgroup_unified_cgroup.controllers')), +] @contextlib.contextmanager def mock_cgroup_commands(): @@ -79,28 +87,32 @@ def mock_cgroup_commands(): original_read_file = fileutil.read_file original_path_exists = os.path.exists + def add_command(pattern, output): + patcher.commands.insert(0, (pattern, output)) + def mock_popen(command, *args, **kwargs): if isinstance(command, list): command_string = " ".join(command) else: command_string = command - for cmd in _default_commands: + for cmd in patcher.commands: match = re.match(cmd[0], command_string) if match is not None: command = ["echo", cmd[1]] + break return original_popen(command, *args, **kwargs) def mock_read_file(filepath, **kwargs): - for file in _default_files: + for file in patcher.files: match = re.match(file[0], filepath) if match is not None: filepath = file[1] return original_read_file(filepath, **kwargs) def mock_path_exists(path): - for file in _default_files: + for file in patcher.files: match = re.match(file[0], path) if match is not None: return True @@ -109,5 +121,10 @@ def mock_path_exists(path): with patch("azurelinuxagent.common.cgroupapi.subprocess.Popen", side_effect=mock_popen) as patcher: with patch("azurelinuxagent.common.cgroupapi.os.path.exists", side_effect=mock_path_exists): with patch("azurelinuxagent.common.cgroupapi.fileutil.read_file", side_effect=mock_read_file): - yield patcher + with patch('azurelinuxagent.common.cgroupapi.CGroupsApi.cgroups_supported', return_value=True): + with patch('azurelinuxagent.common.cgroupapi.CGroupsApi._is_systemd', return_value=True): + patcher.commands = _default_commands[:] + patcher.files = _default_files[:] + patcher.add_command = add_command + yield patcher diff --git a/tests/common/test_cgroupapi.py b/tests/common/test_cgroupapi.py index e4baea11c2..00a8ad7faf 100644 --- a/tests/common/test_cgroupapi.py +++ b/tests/common/test_cgroupapi.py @@ -524,6 +524,19 @@ def test_create_extension_cgroups_root_should_create_extensions_root_slice(self) os.remove("/etc/systemd/system/{0}".format(unit_name)) shellutil.run_get_output("systemctl daemon-reload") + def test_get_processes_in_cgroup_should_return_the_processes_within_the_cgroup(self): + with mock_cgroup_commands(): + processes = SystemdCgroupsApi.get_processes_in_cgroup("walinuxagent.service") + + self.assertTrue(len(processes) >= 2, + "The cgroup should contain at least 2 procceses (daemon and extension handler): [{0}]".format(processes)) + + daemon_present = any("waagent -daemon" in command for (pid, command) in processes) + self.assertTrue(daemon_present, "Could not find the daemon in the cgroup: [{0}]".format(processes)) + + extension_handler_present = any(re.search("(WALinuxAgent-.+\.egg|waagent) -run-exthandlers", command) for (pid, command) in processes) + self.assertTrue(extension_handler_present, "Could not find the extension handler in the cgroup: [{0}]".format(processes)) + @attr('requires_sudo') def test_create_extension_cgroups_should_create_extension_slice(self): self.assertTrue(i_am_root(), "Test does not run when non-root") diff --git a/tests/common/test_cgroupconfigurator.py b/tests/common/test_cgroupconfigurator.py index 895e9d33b9..5e33c53c49 100644 --- a/tests/common/test_cgroupconfigurator.py +++ b/tests/common/test_cgroupconfigurator.py @@ -17,8 +17,7 @@ from __future__ import print_function -import contextlib -import os +import re import subprocess from azurelinuxagent.common.cgroup import CGroup @@ -40,10 +39,8 @@ def _get_new_cgroup_configurator_instance(initialize=True): CGroupConfigurator._instance = None configurator = CGroupConfigurator.get_instance() if initialize: - with patch('azurelinuxagent.common.cgroupapi.CGroupsApi.cgroups_supported', return_value=True): - with patch('azurelinuxagent.common.cgroupapi.CGroupsApi._is_systemd', return_value=True): - with mock_cgroup_commands(): - configurator.initialize() + with mock_cgroup_commands(): + configurator.initialize() return configurator def test_initialize_should_start_tracking_the_agent_cgroups(self): @@ -130,6 +127,21 @@ def raise_exception(*_): message = args[0] self.assertIn("A TEST EXCEPTION", message) + def test_get_processes_in_agent_cgroup_should_return_the_processes_within_the_agent_cgroup(self): + with mock_cgroup_commands(): + configurator = CGroupConfiguratorSystemdTestCase._get_new_cgroup_configurator_instance() + + processes = configurator.get_processes_in_agent_cgroup() + + self.assertTrue(len(processes) >= 2, + "The cgroup should contain at least 2 procceses (daemon and extension handler): [{0}]".format(processes)) + + daemon_present = any("waagent -daemon" in command for (pid, command) in processes) + self.assertTrue(daemon_present, "Could not find the daemon in the cgroup: [{0}]".format(processes)) + + extension_handler_present = any(re.search("(WALinuxAgent-.+\.egg|waagent) -run-exthandlers", command) for (pid, command) in processes) + self.assertTrue(extension_handler_present, "Could not find the extension handler in the cgroup: [{0}]".format(processes)) + @patch('time.sleep', side_effect=lambda _: mock_sleep()) def test_start_extension_command_should_not_use_systemd_when_cgroups_are_not_enabled(self, _): configurator = CGroupConfiguratorSystemdTestCase._get_new_cgroup_configurator_instance() diff --git a/tests/common/test_cgroupstelemetry.py b/tests/common/test_cgroupstelemetry.py index d04c37e216..d130eb3630 100644 --- a/tests/common/test_cgroupstelemetry.py +++ b/tests/common/test_cgroupstelemetry.py @@ -124,8 +124,8 @@ def _assert_cgroups_are_tracked(self, num_extensions): def _assert_polled_metrics_equal(self, metrics, cpu_metric_value, memory_metric_value, max_memory_metric_value): for metric in metrics: - self.assertIn(metric.category, ["Process", "Memory"]) - if metric.category == "Process": + self.assertIn(metric.category, ["CPU", "Memory"]) + if metric.category == "CPU": self.assertEqual(metric.counter, "% Processor Time") self.assertEqual(metric.value, cpu_metric_value) if metric.category == "Memory": diff --git a/tests/ga/test_monitor.py b/tests/ga/test_monitor.py index b06b83bb2e..e349e2f148 100644 --- a/tests/ga/test_monitor.py +++ b/tests/ga/test_monitor.py @@ -1,3 +1,4 @@ +# -*- coding: utf-8 -*- # Copyright 2018 Microsoft Corporation # # Licensed under the Apache License, Version 2.0 (the "License"); @@ -20,6 +21,7 @@ import os import platform import random +import re import string import tempfile import time @@ -28,8 +30,9 @@ from azurelinuxagent.common.protocol.util import ProtocolUtil from azurelinuxagent.common import event, logger -from azurelinuxagent.common.cgroup import CGroup, CpuCgroup, MemoryCgroup -from azurelinuxagent.common.cgroupstelemetry import CGroupsTelemetry, MetricValue +from azurelinuxagent.common.cgroup import CGroup, CpuCgroup, MemoryCgroup, MetricValue +from azurelinuxagent.common.cgroupconfigurator import CGroupConfigurator +from azurelinuxagent.common.cgroupstelemetry import CGroupsTelemetry from azurelinuxagent.common.datacontract import get_properties from azurelinuxagent.common.event import add_event, WALAEventOperation, EVENTS_DIRECTORY from azurelinuxagent.common.exception import HttpError @@ -39,7 +42,8 @@ from azurelinuxagent.common.telemetryevent import TelemetryEvent, TelemetryEventParam from azurelinuxagent.common.utils import fileutil, restutil from azurelinuxagent.common.version import AGENT_VERSION, CURRENT_VERSION, CURRENT_AGENT, DISTRO_NAME, DISTRO_VERSION, DISTRO_CODE_NAME -from azurelinuxagent.ga.monitor import get_monitor_handler, MonitorHandler, PeriodicOperation +from azurelinuxagent.ga.monitor import get_monitor_handler, MonitorHandler, PeriodicOperation, ResetPeriodicLogMessagesOperation, PollResourceUsageOperation +from tests.common.mock_cgroup_commands import mock_cgroup_commands from tests.protocol.mockwiredata import DATA_FILE from tests.protocol.mocks import mock_wire_protocol, HttpRequestPredicates, MockHttpResponse from tests.tools import Mock, MagicMock, patch, AgentTestCase, clear_singleton_instances, PropertyMock @@ -157,10 +161,9 @@ def test_it_should_clear_periodic_log_messages(self): if len(logger.DEFAULT_LOGGER.periodic_messages) != 100: raise Exception('Test setup error: the periodic messages were not added') - with _create_monitor_handler(enabled_operations=["reset_loggers"]) as monitor_handler: - monitor_handler.run_and_wait() + ResetPeriodicLogMessagesOperation().run() - self.assertEqual(len(logger.DEFAULT_LOGGER.periodic_messages), 0, "The monitor thread did not reset the periodic log messages") + self.assertEqual(0, len(logger.DEFAULT_LOGGER.periodic_messages), "The monitor thread did not reset the periodic log messages") class TestEventMonitoring(AgentTestCase, HttpRequestPredicates): @@ -409,14 +412,9 @@ def test_send_extension_metrics_telemetry(self, patch_poll_all_tracked, patch_ad MetricValue("Memory", "Total Memory Usage", 1, 1), MetricValue("Memory", "Max Memory Usage", 1, 1)] - monitor_handler = get_monitor_handler() - monitor_handler.init_protocols() - monitor_handler.last_cgroup_polling_telemetry = datetime.datetime.utcnow() - timedelta(hours=1) - monitor_handler.last_cgroup_report_telemetry = datetime.datetime.utcnow() - timedelta(hours=1) - monitor_handler.poll_telemetry_metrics() + PollResourceUsageOperation().run() self.assertEqual(1, patch_poll_all_tracked.call_count) self.assertEqual(3, patch_add_metric.call_count) # Three metrics being sent. - monitor_handler.stop() @patch('azurelinuxagent.common.event.EventLogger.add_metric') @patch('azurelinuxagent.common.event.EventLogger.add_event') @@ -425,15 +423,10 @@ def test_send_extension_metrics_telemetry_for_empty_cgroup(self, patch_poll_all_ patch_add_event, patch_add_metric,*args): patch_poll_all_tracked.return_value = [] - monitor_handler = get_monitor_handler() - monitor_handler.init_protocols() - monitor_handler.last_cgroup_polling_telemetry = datetime.datetime.utcnow() - timedelta(hours=1) - monitor_handler.last_cgroup_report_telemetry = datetime.datetime.utcnow() - timedelta(hours=1) - monitor_handler.poll_telemetry_metrics() + PollResourceUsageOperation().run() self.assertEqual(1, patch_poll_all_tracked.call_count) self.assertEqual(0, patch_add_event.call_count) self.assertEqual(0, patch_add_metric.call_count) - monitor_handler.stop() @patch('azurelinuxagent.common.event.EventLogger.add_metric') @patch("azurelinuxagent.common.cgroup.MemoryCgroup.get_memory_usage") @@ -447,14 +440,9 @@ def test_send_extension_metrics_telemetry_handling_memory_cgroup_exceptions_errn CGroupsTelemetry._tracked.append(MemoryCgroup("cgroup_name", "/test/path")) - monitor_handler = get_monitor_handler() - monitor_handler.init_protocols() - monitor_handler.last_cgroup_polling_telemetry = datetime.datetime.utcnow() - timedelta(hours=1) - monitor_handler.last_cgroup_report_telemetry = datetime.datetime.utcnow() - timedelta(hours=1) - monitor_handler.poll_telemetry_metrics() + PollResourceUsageOperation().run() self.assertEqual(0, patch_periodic_warn.call_count) self.assertEqual(0, patch_add_metric.call_count) # No metrics should be sent. - monitor_handler.stop() @patch('azurelinuxagent.common.event.EventLogger.add_metric') @patch("azurelinuxagent.common.cgroup.CpuCgroup.get_cpu_usage") @@ -468,34 +456,22 @@ def test_send_extension_metrics_telemetry_handling_cpu_cgroup_exceptions_errno2( CGroupsTelemetry._tracked.append(CpuCgroup("cgroup_name", "/test/path")) - monitor_handler = get_monitor_handler() - monitor_handler.init_protocols() - monitor_handler.last_cgroup_polling_telemetry = datetime.datetime.utcnow() - timedelta(hours=1) - monitor_handler.last_cgroup_report_telemetry = datetime.datetime.utcnow() - timedelta(hours=1) - monitor_handler.poll_telemetry_metrics() + PollResourceUsageOperation().run() self.assertEqual(0, patch_periodic_warn.call_count) self.assertEqual(0, patch_add_metric.call_count) # No metrics should be sent. - monitor_handler.stop() @patch('azurelinuxagent.common.event.EventLogger.add_metric') @patch('azurelinuxagent.common.logger.Logger.periodic_warn') def test_send_extension_metrics_telemetry_for_unsupported_cgroup(self, patch_periodic_warn, patch_add_metric, *args): CGroupsTelemetry._tracked.append(CGroup("cgroup_name", "/test/path", "io")) - monitor_handler = get_monitor_handler() - monitor_handler.init_protocols() - monitor_handler.last_cgroup_polling_telemetry = datetime.datetime.utcnow() - timedelta(hours=1) - monitor_handler.last_cgroup_report_telemetry = datetime.datetime.utcnow() - timedelta(hours=1) - monitor_handler.poll_telemetry_metrics() + PollResourceUsageOperation().run() self.assertEqual(1, patch_periodic_warn.call_count) self.assertEqual(0, patch_add_metric.call_count) # No metrics should be sent. - monitor_handler.stop() - def test_generate_extension_metrics_telemetry_dictionary(self, *args): num_polls = 10 num_extensions = 1 - num_summarization_values = 7 cpu_percent_values = [random.randint(0, 100) for _ in range(num_polls)] @@ -527,6 +503,64 @@ def test_generate_extension_metrics_telemetry_dictionary(self, *args): CGroupsTelemetry.poll_all_tracked() +class PollResourceUsageOperationTestCase(AgentTestCase): + @classmethod + def setUpClass(cls): + AgentTestCase.setUpClass() + # ensure cgroups are enabled by forcing a new instance + CGroupConfigurator._instance = None + with mock_cgroup_commands(): + CGroupConfigurator.get_instance().initialize() + + @classmethod + def tearDownClass(cls): + CGroupConfigurator._instance = None + AgentTestCase.tearDownClass() + + def test_it_should_report_processes_that_do_not_belong_to_the_agent_cgroup(self): + with mock_cgroup_commands() as mock_commands: + mock_commands.add_command(r'^systemd-cgls --unit walinuxagent.service$', +''' +Unit walinuxagent.service (/system.slice/walinuxagent.service): +├─27519 /usr/bin/python3 -u /usr/sbin/waagent -daemon +├─27547 python3 -u bin/WALinuxAgent-2.2.48.1-py2.7.egg -run-exthandlers +├─5821 pidof systemd-networkd +├─5822 iptables --version +├─5823 iptables -w -t security -D OUTPUT -d 168.63.129.16 -p tcp -m conntrack --ctstate INVALID,NEW -j ACCEPT +├─5824 iptables -w -t security -D OUTPUT -d 168.63.129.16 -p tcp -m owner --uid-owner 0 -j ACCEPT +├─5825 ip route show +├─5826 ifdown eth0 && ifup eth0 +├─5699 bash /var/lib/waagent/Microsoft.CPlat.Core.RunCommandLinux-1.0.1/bin/run-command-shim enable +├─5701 tee -ia /var/log/azure/run-command/handler.log +├─5719 /var/lib/waagent/Microsoft.CPlat.Core.RunCommandLinux-1.0.1/bin/run-command-extension enable +├─5727 /bin/sh -c /var/lib/waagent/run-command/download/1/script.sh +└─5728 /bin/sh /var/lib/waagent/run-command/download/1/script.sh +''') + with patch("azurelinuxagent.ga.monitor.add_event") as add_event_patcher: + PollResourceUsageOperation().run() + + messages = [kwargs["message"] for (_, kwargs) in add_event_patcher.call_args_list if "The agent's cgroup includes unexpected processes" in kwargs["message"]] + + self.assertEqual(1, len(messages), "Exactly 1 telemetry event should have been reported. Events: {0}".format(messages)) + + unexpected_processes = [ + 'bash /var/lib/waagent/Microsoft.CPlat.Core.RunCommandLinux-1.0.1/bin/run-command-shim enable', + 'tee -ia /var/log/azure/run-command/handler.log', + '/var/lib/waagent/Microsoft.CPlat.Core.RunCommandLinux-1.0.1/bin/run-command-extension enable', + '/bin/sh -c /var/lib/waagent/run-command/download/1/script.sh', + '/bin/sh /var/lib/waagent/run-command/download/1/script.sh', + ] + + for fp in unexpected_processes: + self.assertIn(fp, messages[0], "[{0}] was not reported as an unexpected process. Events: {1}".format(fp, messages)) + + # The list of processes in the message is an array of strings: "['foo', ..., 'bar']" + search = re.search(r'\[(?P.+)\]', messages[0]) + self.assertIsNotNone(search, "The event message is not in the expected format: {0}".format(messages[0])) + processes = search.group('processes') + self.assertEquals(5, len(processes.split(',')), 'Extra processes were reported as unexpected: {0}'.format(processes)) + + @patch("azurelinuxagent.common.utils.restutil.http_post") @patch('azurelinuxagent.common.protocol.wire.WireClient.get_goal_state') @patch('azurelinuxagent.common.event.EventLogger.add_event') diff --git a/tests/ga/test_remoteaccess_handler.py b/tests/ga/test_remoteaccess_handler.py index ea2f0d0cde..e65a7f806e 100644 --- a/tests/ga/test_remoteaccess_handler.py +++ b/tests/ga/test_remoteaccess_handler.py @@ -349,7 +349,7 @@ def test_handle_remote_access_bad_data_and_good_data(self, _): for user in remote_access.user_list.users: count += 1 user.name = "tstuser{0}".format(count) - if count is 2: + if count == 2: user.name = "" expiration_date = datetime.utcnow() + timedelta(days=count) user.expiration = expiration_date.strftime("%a, %d %b %Y %H:%M:%S ") + "UTC"