From 8194bbc6a8c54af1fd3f7a6ab5ca59094deb399e Mon Sep 17 00:00:00 2001 From: Varad Meru Date: Mon, 16 Dec 2019 15:52:07 -0800 Subject: [PATCH] Start sending additional memory information per process (#1729) * Adding ResourceUsage Class and Adding ProcessIds in Telemetry * Adding process tests to fetch the processes in cgroup * Sending Resource Metrics for Memory Usage telemetry. * Sending Memory Data for Tracked Processes * Fix proc_statm collection and dictionary formatting * Updating the process_name pattern when sending memory telemetry * Adding the default process_name, process_commandline * Handling inactive cgroups; do not send empty values in ExtensionMetricsData - Making it more explicit for Memory cases as well if an IOError occurs. - Adding a filter to make sure entries are not added to Metrics which are marked_for_delete - Tests changes * Fixing the missed test - Changing the memory exceptions. * Review Comments addressed, and clearer exception handling. - cgroups.py - cleaner returns and raises - cgroupstelemetry.py - exception handling when proc_statm for PID fails. - exception.py - cgroup comments fixed - resourceusage.py - raising an exception when failing to get memory info. - monitor.py - reverted the reset_logger changes - test_cgroupstelemetry.py - stray print and mock removed. - test_resourceusage.py - newline and end - test_monitor.py - removed reset_logger changes, and count of metrics fixed. * Making IOError explicit for get_memory_usage_proc_statm & test fixes - resourceusage.py - Propagate IOError(Errno2) above, and for other exceptions, raise ProcessInfoException. - test_resourceusage.py - Add more asserts to check the raising of IOError, ProcessInfoException. * Addressing review comments. - cgroupstelemetry.py - handling of exception thrown by get_proc_*. Also, some variable name refactoring. - resourceusage.py - Bubbling up exceptions in get_proc_* - test_cgroupstelemetry.py - Refactoring variable name. 
- test_resourceusage.py - Changes in test to test exception bubbling up. * Initializing a new logger for each test here to not conflict with others * Review comments addressed and some refactoring. - test_cgroups.py - Simple refactoring of class setup and asserts fixed for get_tracked_process - resourceusage.py - Comments fixed. - cgroupstelemetry.py - Refactored the strings into class for easy usage. - cgroup.py - Refactoring the controller names strings into class. Also changed get_tracked_processes' return behavior. --- azurelinuxagent/common/cgroup.py | 62 +- azurelinuxagent/common/cgroupstelemetry.py | 117 +++- azurelinuxagent/common/exception.py | 7 +- azurelinuxagent/common/resourceusage.py | 140 +++++ azurelinuxagent/ga/monitor.py | 65 +- tests/common/test_cgroups.py | 50 +- tests/common/test_cgroupstelemetry.py | 606 +++++++++---------- tests/common/test_resourceusage.py | 86 +++ tests/data/cgroups/cpu_mount/cgroup.procs | 0 tests/data/cgroups/dummy_proc_cmdline | Bin 0 -> 61 bytes tests/data/cgroups/dummy_proc_comm | 1 + tests/data/cgroups/dummy_proc_statm | 1 + tests/data/cgroups/memory_mount/cgroup.procs | 0 tests/ga/test_monitor.py | 39 +- 14 files changed, 773 insertions(+), 401 deletions(-) create mode 100644 azurelinuxagent/common/resourceusage.py create mode 100644 tests/common/test_resourceusage.py create mode 100644 tests/data/cgroups/cpu_mount/cgroup.procs create mode 100644 tests/data/cgroups/dummy_proc_cmdline create mode 100644 tests/data/cgroups/dummy_proc_comm create mode 100644 tests/data/cgroups/dummy_proc_statm create mode 100644 tests/data/cgroups/memory_mount/cgroup.procs diff --git a/azurelinuxagent/common/cgroup.py b/azurelinuxagent/common/cgroup.py index 1f940b5f08..2ad70c1b9d 100644 --- a/azurelinuxagent/common/cgroup.py +++ b/azurelinuxagent/common/cgroup.py @@ -26,15 +26,20 @@ re_user_system_times = re.compile(r'user (\d+)\nsystem (\d+)\n') +class CGroupContollers(object): + CPU = "cpu" + MEMORY = "memory" + + class 
CGroup(object): @staticmethod def create(cgroup_path, controller, extension_name): """ Factory method to create the correct CGroup. """ - if controller == "cpu": + if controller == CGroupContollers.CPU: return CpuCgroup(extension_name, cgroup_path) - if controller == "memory": + if controller == CGroupContollers.MEMORY: return MemoryCgroup(extension_name, cgroup_path) raise CGroupsException('CGroup controller {0} is not supported'.format(controller)) @@ -107,14 +112,33 @@ def is_active(self): logger.periodic_warn(logger.EVERY_HALF_HOUR, 'Could not get list of tasks from "tasks" file in the cgroup: {0}.' ' Internal error: {1}'.format(self.path, ustr(e))) - return False - return False + def get_tracked_processes(self): + """ + :return: List of Str (Pids). Will return an empty string if we couldn't fetch any tracked processes. + """ + procs = [] + try: + procs = self._get_parameters("cgroup.procs") + except (IOError, OSError) as e: + if e.errno == errno.ENOENT: + # only suppressing file not found exceptions. + pass + else: + logger.periodic_warn(logger.EVERY_HALF_HOUR, + 'Could not get list of procs from "cgroup.procs" file in the cgroup: {0}.' + ' Internal error: {1}'.format(self.path, ustr(e))) + except CGroupsException as e: + logger.periodic_warn(logger.EVERY_HALF_HOUR, + 'Could not get list of tasks from "cgroup.procs" file in the cgroup: {0}.' 
+ ' Internal error: {1}'.format(self.path, ustr(e))) + return procs + class CpuCgroup(CGroup): def __init__(self, name, cgroup_path): - super(CpuCgroup, self).__init__(name, cgroup_path, "cpu") + super(CpuCgroup, self).__init__(name, cgroup_path, CGroupContollers.CPU) self._osutil = get_osutil() self._previous_cgroup_cpu = None @@ -195,7 +219,7 @@ def __init__(self, name, cgroup_path): :return: MemoryCgroup """ - super(MemoryCgroup, self).__init__(name, cgroup_path, "memory") + super(MemoryCgroup, self).__init__(name, cgroup_path, CGroupContollers.MEMORY) def __str__(self): return "cgroup: Name: {0}, cgroup_path: {1}; Controller: {2}".format( @@ -210,18 +234,13 @@ def get_memory_usage(self): :rtype: int """ usage = None - try: usage = self._get_parameters('memory.usage_in_bytes', first_line_only=True) - except (IOError, OSError) as e: - if e.errno == errno.ENOENT: - # only suppressing file not found exceptions. - pass - else: - raise e + except Exception as e: + if isinstance(e, (IOError, OSError)) and e.errno == errno.ENOENT: + raise + raise CGroupsException("Exception while attempting to read {0}".format("memory.usage_in_bytes"), e) - if not usage: - usage = "0" return int(usage) def get_max_memory_usage(self): @@ -234,12 +253,9 @@ def get_max_memory_usage(self): usage = None try: usage = self._get_parameters('memory.max_usage_in_bytes', first_line_only=True) - except (IOError, OSError) as e: - if e.errno == errno.ENOENT: - # only suppressing file not found exceptions. 
- pass - else: - raise e - if not usage: - usage = "0" + except Exception as e: + if isinstance(e, (IOError, OSError)) and e.errno == errno.ENOENT: + raise + raise CGroupsException("Exception while attempting to read {0}".format("memory.usage_in_bytes"), e) + return int(usage) diff --git a/azurelinuxagent/common/cgroupstelemetry.py b/azurelinuxagent/common/cgroupstelemetry.py index 3d13652087..4bbcba1ebf 100644 --- a/azurelinuxagent/common/cgroupstelemetry.py +++ b/azurelinuxagent/common/cgroupstelemetry.py @@ -19,12 +19,30 @@ from datetime import datetime as dt from azurelinuxagent.common import logger -from azurelinuxagent.common.cgroup import CpuCgroup +from azurelinuxagent.common.cgroup import CpuCgroup, CGroupContollers from azurelinuxagent.common.exception import CGroupsException from azurelinuxagent.common.future import ustr - +from azurelinuxagent.common.logger import EVERY_SIX_HOURS +from azurelinuxagent.common.resourceusage import MemoryResourceUsage, ProcessInfo MetricValue = namedtuple('Metric', ['category', 'counter', 'instance', 'value']) +StatmMetricValue = namedtuple('StatmMetricValue', ['pid_name_cmdline', 'resource_metric']) + +DELIM = " | " +DEFAULT_PROCESS_NAME = "NO_PROCESS_FOUND" +DEFAULT_PROCESS_COMMANDLINE = "NO_CMDLINE_FOUND" + + +class MetricsCategory(object): + MEMORY_CATEGORY = "Memory" + PROCESS_CATEGORY = "Process" + + +class MetricsCounter(object): + PROCESSOR_PERCENT_TIME = "% Processor Time" + TOTAL_MEM_USAGE = "Total Memory Usage" + MAX_MEM_USAGE = "Max Memory Usage" + MEM_USED_BY_PROCESS = "Memory Used by Process" class CGroupsTelemetry(object): @@ -34,6 +52,26 @@ class CGroupsTelemetry(object): _cgroup_metrics = {} _rlock = threading.RLock() + @staticmethod + def get_process_info_summary(process_id): + process_cmdline = DEFAULT_PROCESS_COMMANDLINE + process_name = DEFAULT_PROCESS_NAME + + # The ProcessName and ProcessCommandLine can generate Exception if the file /proc//{comm,cmdline} cease to + # exist; eg: the process can die, 
or finish. Which is why we need Default Names, in case we fail to fetch the + # details from those files. + try: + process_cmdline = ProcessInfo.get_proc_cmdline(process_id) if not None else DEFAULT_PROCESS_COMMANDLINE + except Exception as e: + logger.periodic_info(EVERY_SIX_HOURS, "[PERIODIC] {0}", ustr(e)) + + try: + process_name = ProcessInfo.get_proc_name(process_id) if not None else DEFAULT_PROCESS_NAME + except Exception as e: + logger.periodic_info(EVERY_SIX_HOURS, "[PERIODIC] {0}", ustr(e)) + + return process_id + DELIM + process_name + DELIM + process_cmdline + @staticmethod def _get_metrics_list(metric): return [metric.average(), metric.min(), metric.max(), metric.median(), metric.count(), @@ -41,9 +79,10 @@ def _get_metrics_list(metric): @staticmethod def _process_cgroup_metric(cgroup_metrics): - memory_usage = cgroup_metrics.get_memory_usage() - max_memory_usage = cgroup_metrics.get_max_memory_usage() - cpu_usage = cgroup_metrics.get_cpu_usage() + memory_usage = cgroup_metrics.get_memory_metrics() + max_memory_usage = cgroup_metrics.get_max_memory_metrics() + cpu_usage = cgroup_metrics.get_cpu_metrics() + memory_usage_per_process = cgroup_metrics.get_proc_statm_memory_metrics() processed_extension = {} @@ -62,6 +101,14 @@ def _process_cgroup_metric(cgroup_metrics): else: processed_extension["memory"] = {"max_mem": CGroupsTelemetry._get_metrics_list(max_memory_usage)} + for pid_process_memory in memory_usage_per_process: + if "proc_statm_memory" in processed_extension: + processed_extension["proc_statm_memory"][pid_process_memory.pid_name_cmdline] = \ + CGroupsTelemetry._get_metrics_list(pid_process_memory.resource_metric) + else: + processed_extension["proc_statm_memory"] = {pid_process_memory.pid_name_cmdline: + CGroupsTelemetry._get_metrics_list(pid_process_memory.resource_metric)} + return processed_extension @staticmethod @@ -135,25 +182,41 @@ def poll_all_tracked(): with CGroupsTelemetry._rlock: for cgroup in CGroupsTelemetry._tracked[:]: - # 
noinspection PyBroadException if cgroup.name not in CGroupsTelemetry._cgroup_metrics: CGroupsTelemetry._cgroup_metrics[cgroup.name] = CgroupMetrics() try: - if cgroup.controller == "cpu": + if cgroup.controller == CGroupContollers.CPU: current_cpu_usage = cgroup.get_cpu_usage() CGroupsTelemetry._cgroup_metrics[cgroup.name].add_cpu_usage(current_cpu_usage) - metrics.append(MetricValue("Process", "% Processor Time", cgroup.name, current_cpu_usage)) - elif cgroup.controller == "memory": + metrics.append(MetricValue(MetricsCategory.PROCESS_CATEGORY, MetricsCounter. + PROCESSOR_PERCENT_TIME, cgroup.name, current_cpu_usage)) + elif cgroup.controller == CGroupContollers.MEMORY: current_memory_usage = cgroup.get_memory_usage() CGroupsTelemetry._cgroup_metrics[cgroup.name].add_memory_usage(current_memory_usage) - metrics.append(MetricValue("Memory", "Total Memory Usage", cgroup.name, current_memory_usage)) + metrics.append(MetricValue(MetricsCategory.MEMORY_CATEGORY, MetricsCounter. + TOTAL_MEM_USAGE, cgroup.name, current_memory_usage)) max_memory_usage = cgroup.get_max_memory_usage() CGroupsTelemetry._cgroup_metrics[cgroup.name].add_max_memory_usage(max_memory_usage) - metrics.append(MetricValue("Memory", "Max Memory Usage", cgroup.name, max_memory_usage)) + metrics.append(MetricValue(MetricsCategory.MEMORY_CATEGORY, MetricsCounter.MAX_MEM_USAGE, + cgroup.name, max_memory_usage)) + + pids = cgroup.get_tracked_processes() + for pid in pids: + try: + mem_usage_from_procstatm = MemoryResourceUsage.get_memory_usage_from_proc_statm(pid) + metrics.append(MetricValue(MetricsCategory.MEMORY_CATEGORY, MetricsCounter. 
+ MEM_USED_BY_PROCESS, CGroupsTelemetry.get_process_info_summary(pid), + mem_usage_from_procstatm)) + CGroupsTelemetry._cgroup_metrics[cgroup.name].add_proc_statm_memory( + CGroupsTelemetry.get_process_info_summary(pid), mem_usage_from_procstatm) + except Exception as e: + if not isinstance(e, (IOError, OSError)) or e.errno != errno.ENOENT: + logger.periodic_warn(logger.EVERY_HOUR, "[PERIODIC] Could not collect proc_statm " + "for pid {0}. Error : {1}", pid, ustr(e)) else: raise CGroupsException('CGroup controller {0} is not supported for cgroup {1}'.format( - cgroup.controller, cgroup.name)) + cgroup.controller, cgroup.name)) except Exception as e: # There can be scenarios when the CGroup has been deleted by the time we are fetching the values # from it. This would raise IOError with file entry not found (ERRNO: 2). We do not want to log @@ -187,30 +250,48 @@ def __init__(self): self._memory_usage = Metric() self._max_memory_usage = Metric() self._cpu_usage = Metric() + self._proc_statm_mem = {} + self.marked_for_delete = False def add_memory_usage(self, usage): - self._memory_usage.append(usage) + if not self.marked_for_delete: + self._memory_usage.append(usage) def add_max_memory_usage(self, usage): - self._max_memory_usage.append(usage) + if not self.marked_for_delete: + self._max_memory_usage.append(usage) def add_cpu_usage(self, usage): - self._cpu_usage.append(usage) + if not self.marked_for_delete: + self._cpu_usage.append(usage) - def get_memory_usage(self): + def add_proc_statm_memory(self, pid, usage): + if not self.marked_for_delete: + if pid not in self._proc_statm_mem: + self._proc_statm_mem[pid] = Metric() + self._proc_statm_mem[pid].append(usage) + + def get_memory_metrics(self): return self._memory_usage - def get_max_memory_usage(self): + def get_max_memory_metrics(self): return self._max_memory_usage - def get_cpu_usage(self): + def get_cpu_metrics(self): return self._cpu_usage + def get_proc_statm_memory_metrics(self): + """ + :return: 
StatmMetricValue tuples of pid and metric + """ + return [StatmMetricValue(pid_name_cmdline, metric) for pid_name_cmdline, metric in self._proc_statm_mem.items()] + def clear(self): self._memory_usage.clear() self._max_memory_usage.clear() self._cpu_usage.clear() + self._proc_statm_mem.clear() class Metric(object): diff --git a/azurelinuxagent/common/exception.py b/azurelinuxagent/common/exception.py index b2cd30c583..4456573b06 100644 --- a/azurelinuxagent/common/exception.py +++ b/azurelinuxagent/common/exception.py @@ -16,6 +16,7 @@ # # Requires Python 2.6+ and Openssl 1.0+ # + """ Defines all exceptions """ @@ -44,7 +45,7 @@ def __init__(self, msg=None, inner=None): class AgentNetworkError(AgentError): """ - When network is not available\. + When network is not available. """ def __init__(self, msg=None, inner=None): @@ -52,6 +53,10 @@ def __init__(self, msg=None, inner=None): class CGroupsException(AgentError): + """ + Exception to classify any cgroups related issue. + """ + def __init__(self, msg=None, inner=None): super(CGroupsException, self).__init__(msg, inner) diff --git a/azurelinuxagent/common/resourceusage.py b/azurelinuxagent/common/resourceusage.py new file mode 100644 index 0000000000..841df3ea01 --- /dev/null +++ b/azurelinuxagent/common/resourceusage.py @@ -0,0 +1,140 @@ +# Copyright 2019 Microsoft Corporation +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+# +# Requires Python 2.6+ and Openssl 1.0+ + +import os + +from azurelinuxagent.common import logger +from azurelinuxagent.common.exception import AgentError +from azurelinuxagent.common.future import ustr +from azurelinuxagent.common.logger import EVERY_SIX_HOURS +from azurelinuxagent.common.utils import fileutil + + +PAGE_SIZE = os.sysconf('SC_PAGE_SIZE') +PROC_STATM_FILENAME_FORMAT = "/proc/{0}/statm" +PROC_CMDLINE_FILENAME_FORMAT = "/proc/{0}/cmdline" +PROC_COMM_FILENAME_FORMAT = "/proc/{0}/comm" +PROC_STATUS_FILENAME_FORMAT = "/proc/{0}/status" + + +class ResourceUsage(object): + pass + + +class MemoryResourceUsage(ResourceUsage): + @staticmethod + def get_memory_usage_from_proc_statm(process_id): + proc_pid_rss = 0 + try: + proc_pid_rss = MemoryResourceUsage._get_proc_rss(process_id) + except Exception as e: + if isinstance(e, (IOError, OSError)): + raise + logger.periodic_info(EVERY_SIX_HOURS, "[PERIODIC] Could not get the /prod/{0}/statm data due to {1}", process_id, ustr(e)) + raise ProcessInfoException("Could not get the /proc/{0}/statm due to {1}".format(process_id, ustr(e))) + return proc_pid_rss + + @staticmethod + def _get_proc_rss(process_id): + """ + /proc//statm fields: columns are (in pages): + + total program size| + resident set size| + shared pages| + text (code) | + data/stack | + library | + dirty pages | + + Here an example: + root@vm:/# cat /proc/1392/statm + 17637 5316 2125 938 0 3332 0 + + :return: resident set size in bytes. + """ + pid_statm = fileutil.read_file(PROC_STATM_FILENAME_FORMAT.format(process_id)).split() + pid_rss = int(pid_statm[1]) # Index 1 is RSS. 
+ + return pid_rss * PAGE_SIZE + + +class ProcessInfo(object): + @staticmethod + def get_proc_name(process_id): + proc_pid_rss = ProcessInfo._get_proc_comm(process_id) + return proc_pid_rss + + @staticmethod + def get_proc_cmdline(process_id): + proc_pid_rss = ProcessInfo._get_proc_cmdline(process_id) + return proc_pid_rss + + @classmethod + def _get_proc_cmdline(cls, process_id): + """ + /proc//cmdline returns cmdline arguments passed to the Linux kernel. The returned string is delimited with + the \0 character and needs to be replaced with some other character to make it readable. + + Here an example: + root@vm:/# cat /proc/1392/cmdline + python--targettest_resourceusage.py + root@vm:/# cat /proc/1392/cmdline | tr "\0" " " + python --target test_resourceusage.py + + :return: command line passed to the process string. + """ + cmdline_file_name = PROC_CMDLINE_FILENAME_FORMAT.format(process_id) + try: + pid_cmdline = fileutil.read_file(cmdline_file_name).replace("\0", " ").strip() + except Exception as e: + if isinstance(e, (IOError, OSError)): + raise + raise ProcessInfoException("Could not get contents from {0}".format(cmdline_file_name), e) + + return pid_cmdline + + @classmethod + def _get_proc_comm(cls, process_id): + """ + /proc//comm This file exposes the process's comm value-that is, the command name associated with the + process. Strings longer than TASK_COMM_LEN (16) characters are silently truncated. 
+ + Here an example: + root@vm:/# cat /proc/1392/comm + python + + :return: process name + """ + comm_file_name = PROC_COMM_FILENAME_FORMAT.format(process_id) + try: + pid_comm = fileutil.read_file(comm_file_name).strip() + pid_comm_str = str(pid_comm) + except Exception as e: + if isinstance(e, (IOError, OSError)): + raise + raise ProcessInfoException("Could not get contents from {0}".format(comm_file_name), e) + + return pid_comm_str + + +class ProcessInfoException(AgentError): + """ + Exception to classify any issues when we get any issues related to fetching ProcessInfo (cmdline, comm, etc.). + """ + + def __init__(self, msg=None, inner=None): + super(ProcessInfoException, self).__init__(msg, inner) diff --git a/azurelinuxagent/ga/monitor.py b/azurelinuxagent/ga/monitor.py index 102018e65f..fdf314d1fe 100644 --- a/azurelinuxagent/ga/monitor.py +++ b/azurelinuxagent/ga/monitor.py @@ -96,7 +96,6 @@ def generate_extension_metrics_telemetry_dictionary(schema_version=1.0, else: return None - def get_monitor_handler(): return MonitorHandler() @@ -458,20 +457,23 @@ def poll_telemetry_metrics(self): """ This method polls the tracked cgroups to get data from the cgroups filesystem and send the data directly. - :return: + :return: List of Metrics (which would be sent to PerfCounterMetrics directly. """ - time_now = datetime.datetime.utcnow() - if not self.last_cgroup_polling_telemetry: - self.last_cgroup_polling_telemetry = time_now - - if time_now >= (self.last_cgroup_polling_telemetry + - MonitorHandler.CGROUP_TELEMETRY_POLLING_PERIOD): - metrics = CGroupsTelemetry.poll_all_tracked() - self.last_cgroup_polling_telemetry = time_now - - if metrics: - for metric in metrics: - report_metric(metric.category, metric.counter, metric.instance, metric.value) + try: # If there is an issue in reporting, it should not take down whole monitor thread. 
+ time_now = datetime.datetime.utcnow() + if not self.last_cgroup_polling_telemetry: + self.last_cgroup_polling_telemetry = time_now + + if time_now >= (self.last_cgroup_polling_telemetry + + MonitorHandler.CGROUP_TELEMETRY_POLLING_PERIOD): + metrics = CGroupsTelemetry.poll_all_tracked() + self.last_cgroup_polling_telemetry = time_now + + if metrics: + for metric in metrics: + report_metric(metric.category, metric.counter, metric.instance, metric.value) + except Exception as e: + logger.warn("Could not poll all the tracked telemetry due to {0}", ustr(e)) def send_telemetry_metrics(self): """ @@ -481,22 +483,25 @@ def send_telemetry_metrics(self): """ time_now = datetime.datetime.utcnow() - if not self.last_cgroup_report_telemetry: - self.last_cgroup_report_telemetry = time_now - - if time_now >= (self.last_cgroup_report_telemetry + MonitorHandler.CGROUP_TELEMETRY_REPORTING_PERIOD): - performance_metrics = CGroupsTelemetry.report_all_tracked() - self.last_cgroup_report_telemetry = time_now - - if performance_metrics: - message = generate_extension_metrics_telemetry_dictionary(schema_version=1.0, - performance_metrics=performance_metrics) - add_event(name=AGENT_NAME, - version=CURRENT_VERSION, - op=WALAEventOperation.ExtensionMetricsData, - is_success=True, - message=ustr(message), - log_event=False) + try: # If there is an issue in reporting, it should not take down whole monitor thread. 
+ if not self.last_cgroup_report_telemetry: + self.last_cgroup_report_telemetry = time_now + + if time_now >= (self.last_cgroup_report_telemetry + MonitorHandler.CGROUP_TELEMETRY_REPORTING_PERIOD): + performance_metrics = CGroupsTelemetry.report_all_tracked() + self.last_cgroup_report_telemetry = time_now + + if performance_metrics: + message = generate_extension_metrics_telemetry_dictionary(schema_version=1.0, + performance_metrics=performance_metrics) + add_event(name=AGENT_NAME, + version=CURRENT_VERSION, + op=WALAEventOperation.ExtensionMetricsData, + is_success=True, + message=ustr(message), + log_event=False) + except Exception as e: + logger.warn("Could not report all the tracked telemetry due to {0}", ustr(e)) def log_altered_network_configuration(self): """ diff --git a/tests/common/test_cgroups.py b/tests/common/test_cgroups.py index 6827fb8bff..219b25ad1b 100644 --- a/tests/common/test_cgroups.py +++ b/tests/common/test_cgroups.py @@ -35,17 +35,24 @@ def consume_cpu_time(): class TestCGroup(AgentTestCase): + @staticmethod + def _clean_up_test_files(): + with open(os.path.join(data_dir, "cgroups", "cpu_mount", "tasks"), mode="wb") as tasks: + tasks.truncate(0) + with open(os.path.join(data_dir, "cgroups", "memory_mount", "tasks"), mode="wb") as tasks: + tasks.truncate(0) + with open(os.path.join(data_dir, "cgroups", "cpu_mount", "cgroup.procs"), mode="wb") as procs: + procs.truncate(0) + with open(os.path.join(data_dir, "cgroups", "memory_mount", "cgroup.procs"), mode="wb") as procs: + procs.truncate(0) def setUp(self): AgentTestCase.setUp(self) + TestCGroup._clean_up_test_files() def tearDown(self): AgentTestCase.tearDown(self) - - with open(os.path.join(data_dir, "cgroups", "cpu_mount", "tasks"), mode="wb") as tasks: - tasks.truncate(0) - with open(os.path.join(data_dir, "cgroups", "memory_mount", "tasks"), mode="wb") as tasks: - tasks.truncate(0) + TestCGroup._clean_up_test_files() def test_correct_creation(self): test_cgroup = 
CGroup.create("dummy_path", "cpu", "test_extension") @@ -77,6 +84,23 @@ def test_is_active(self): self.assertEqual(True, test_cgroup.is_active()) + def test_get_tracked_processes(self): + test_cgroup = CGroup.create(os.path.join(data_dir, "cgroups", "cpu_mount"), "cpu", "test_extension") + self.assertListEqual(test_cgroup.get_tracked_processes(), []) + + with open(os.path.join(data_dir, "cgroups", "cpu_mount", "cgroup.procs"), mode="wb") as tasks: + tasks.write(str(1000).encode()) + + self.assertEqual(['1000'], test_cgroup.get_tracked_processes()) + + test_cgroup = CGroup.create(os.path.join(data_dir, "cgroups", "memory_mount"), "memory", "test_extension") + self.assertListEqual(test_cgroup.get_tracked_processes(), []) + + with open(os.path.join(data_dir, "cgroups", "memory_mount", "cgroup.procs"), mode="wb") as tasks: + tasks.write(str(1000).encode()) + + self.assertEqual(['1000'], test_cgroup.get_tracked_processes()) + @patch("azurelinuxagent.common.logger.periodic_warn") def test_is_active_file_not_present(self, patch_periodic_warn): test_cgroup = CGroup.create(os.path.join(data_dir, "cgroups", "not_cpu_mount"), "cpu", "test_extension") @@ -93,8 +117,8 @@ def test_is_active_incorrect_file(self, patch_periodic_warn): self.assertEqual(False, test_cgroup.is_active()) self.assertEqual(1, patch_periodic_warn.call_count) - test_cgroup = CGroup.create(os.path.join(data_dir, "cgroups", "memory_mount", "tasks"), "memory", "test_extension") - self.assertEqual(False, test_cgroup.is_active()) + test_cgp = CGroup.create(os.path.join(data_dir, "cgroups", "memory_mount", "tasks"), "memory", "test_extension") + self.assertEqual(False, test_cgp.is_active()) self.assertEqual(2, patch_periodic_warn.call_count) @@ -227,8 +251,12 @@ def test_get_metrics(self): def test_get_metrics_when_files_not_present(self): test_mem_cg = MemoryCgroup("test_extension", os.path.join(data_dir, "cgroups")) - memory_usage = test_mem_cg.get_memory_usage() - self.assertEqual(0, memory_usage) + with 
self.assertRaises(IOError) as e: + test_mem_cg.get_memory_usage() - max_memory_usage = test_mem_cg.get_max_memory_usage() - self.assertEqual(0, max_memory_usage) + self.assertEqual(e.exception.errno, errno.ENOENT) + + with self.assertRaises(IOError) as e: + test_mem_cg.get_max_memory_usage() + + self.assertEqual(e.exception.errno, errno.ENOENT) diff --git a/tests/common/test_cgroupstelemetry.py b/tests/common/test_cgroupstelemetry.py index c6d14d48ef..38c2558df5 100644 --- a/tests/common/test_cgroupstelemetry.py +++ b/tests/common/test_cgroupstelemetry.py @@ -23,7 +23,7 @@ from azurelinuxagent.common.cgroupconfigurator import CGroupConfigurator from azurelinuxagent.common.cgroupstelemetry import CGroupsTelemetry, Metric from azurelinuxagent.common.osutil.default import BASE_CGROUPS, DefaultOSUtil -from azurelinuxagent.common.protocol.restapi import ExtHandlerProperties, ExtHandler +from azurelinuxagent.common.protocol.restapi import ExtHandler, ExtHandlerProperties from azurelinuxagent.common.utils import fileutil from azurelinuxagent.ga.exthandlers import ExtHandlerInstance from nose.plugins.attrib import attr @@ -78,11 +78,36 @@ def make_new_cgroup(name="test-cgroup"): class TestCGroupsTelemetry(AgentTestCase): + TestProcessIds = ["1000", "1001", "1002"] + TestProcStatmMemoryUsed = 1234 + TestProcComm = "python" + TestProcCommandLine = "python -u bin/WALinuxAgent-2.2.45-py2.7.egg -run-exthandlers" + NumSummarizationValues = 7 + @classmethod def setUpClass(cls): AgentTestCase.setUpClass() - # CPU Cgroups compute usage based on /proc/stat and /sys/fs/cgroup/.../cpuacct.stat; use mock data for those files + # Use the default value for memory used from proc_statm + cls.mock_get_memory_usage_from_proc_statm = patch("azurelinuxagent.common.resourceusage.MemoryResourceUsage." 
+ "get_memory_usage_from_proc_statm", return_value=TestCGroupsTelemetry.TestProcStatmMemoryUsed) + cls.mock_get_memory_usage_from_proc_statm.start() + + # Use the default value for memory used from proc_statm + cls.mock_get_tracked_processes = patch("azurelinuxagent.common.cgroup.CGroup.get_tracked_processes", + return_value=TestCGroupsTelemetry.TestProcessIds) + cls.mock_get_tracked_processes.start() + + cls.mock_get_proc_name = patch("azurelinuxagent.common.resourceusage.ProcessInfo.get_proc_name", + return_value=TestCGroupsTelemetry.TestProcComm) + cls.mock_get_proc_name.start() + + cls.mock_get_proc_cmdline = patch("azurelinuxagent.common.resourceusage.ProcessInfo.get_proc_cmdline", + return_value=TestCGroupsTelemetry.TestProcCommandLine) + cls.mock_get_proc_cmdline.start() + + # CPU Cgroups compute usage based on /proc/stat and /sys/fs/cgroup/.../cpuacct.stat; use mock data for those + # files original_read_file = fileutil.read_file def mock_read_file(filepath, **args): @@ -92,12 +117,18 @@ def mock_read_file(filepath, **args): filepath = os.path.join(data_dir, "cgroups", "cpuacct.stat_t0") return original_read_file(filepath, **args) - cls._mock_read_cpu_cgroup_file = patch("azurelinuxagent.common.utils.fileutil.read_file", side_effect=mock_read_file) + cls._mock_read_cpu_cgroup_file = patch("azurelinuxagent.common.utils.fileutil.read_file", + side_effect=mock_read_file) cls._mock_read_cpu_cgroup_file.start() @classmethod def tearDownClass(cls): + cls.mock_get_memory_usage_from_proc_statm.stop() + cls.mock_get_tracked_processes.stop() + cls.mock_get_proc_name.stop() + cls.mock_get_proc_cmdline.stop() cls._mock_read_cpu_cgroup_file.stop() + AgentTestCase.tearDownClass() def setUp(self): @@ -108,34 +139,102 @@ def tearDown(self): AgentTestCase.tearDown(self) CGroupsTelemetry.reset() - def _assert_cgroup_metrics_equal(self, cpu_usage, memory_usage, max_memory_usage): - for _, cgroup_metric in CGroupsTelemetry._cgroup_metrics.items(): - 
self.assertListEqual(cgroup_metric.get_memory_usage()._data, memory_usage) - self.assertListEqual(cgroup_metric.get_max_memory_usage()._data, max_memory_usage) - self.assertListEqual(cgroup_metric.get_cpu_usage()._data, cpu_usage) + @staticmethod + def _track_new_extension_cgroups(num_extensions): + for i in range(num_extensions): + dummy_cpu_cgroup = CGroup.create("dummy_cpu_path_{0}".format(i), "cpu", "dummy_extension_{0}".format(i)) + CGroupsTelemetry.track_cgroup(dummy_cpu_cgroup) + + dummy_memory_cgroup = CGroup.create("dummy_memory_path_{0}".format(i), "memory", + "dummy_extension_{0}".format(i)) + CGroupsTelemetry.track_cgroup(dummy_memory_cgroup) - def _assert_cgroup_polling_metrics_equal(self, metrics, cpu_metric_value, memory_metric_value, max_memory_metric_value): + def _assert_cgroups_are_tracked(self, num_extensions): + for i in range(num_extensions): + self.assertTrue(CGroupsTelemetry.is_tracked("dummy_cpu_path_{0}".format(i))) + self.assertTrue(CGroupsTelemetry.is_tracked("dummy_memory_path_{0}".format(i))) + + def _assert_calculated_resource_metrics_equal(self, cpu_usage, memory_usage, max_memory_usage, + memory_statm_memory_usage, proc_ids=None): + if not proc_ids: + proc_ids = TestCGroupsTelemetry.TestProcessIds + + processes_instances = [CGroupsTelemetry.get_process_info_summary(pid) for pid in proc_ids] + for _, cgroup_metric in CGroupsTelemetry._cgroup_metrics.items(): + self.assertListEqual(cgroup_metric.get_memory_metrics()._data, memory_usage) + self.assertListEqual(cgroup_metric.get_max_memory_metrics()._data, max_memory_usage) + self.assertListEqual(cgroup_metric.get_cpu_metrics()._data, cpu_usage) + for kv_pair in cgroup_metric.get_proc_statm_memory_metrics(): + self.assertIn(kv_pair.pid_name_cmdline, processes_instances) + self.assertListEqual(kv_pair.resource_metric._data, memory_statm_memory_usage) + + def _assert_polled_metrics_equal(self, metrics, cpu_metric_value, memory_metric_value, + max_memory_metric_value, 
proc_stat_memory_usage_value, pids=None): for metric in metrics: self.assertIn(metric.category, ["Process", "Memory"]) if metric.category == "Process": self.assertEqual(metric.counter, "% Processor Time") self.assertEqual(metric.value, cpu_metric_value) if metric.category == "Memory": - self.assertIn(metric.counter, ["Total Memory Usage", "Max Memory Usage"]) + self.assertIn(metric.counter, ["Total Memory Usage", "Max Memory Usage", "Memory Used by Process"]) if metric.counter == "Total Memory Usage": self.assertEqual(metric.value, memory_metric_value) elif metric.counter == "Max Memory Usage": self.assertEqual(metric.value, max_memory_metric_value) + elif metric.counter == "Memory Used by Process": + if pids: + processes_instances = [CGroupsTelemetry.get_process_info_summary(pid) for pid in + pids] + else: + processes_instances = [CGroupsTelemetry.get_process_info_summary(pid) for pid in + TestCGroupsTelemetry.TestProcessIds] + self.assertIn(metric.instance, processes_instances) + self.assertEqual(metric.value, proc_stat_memory_usage_value) + + def _assert_extension_metrics_data(self, collected_metrics, num_extensions, cpu_percent_values, + proc_stat_memory_usage_values, memory_usage_values, max_memory_usage_values, + is_cpu_present=True, is_memory_present=True): + num_summarization_values = TestCGroupsTelemetry.NumSummarizationValues + + if not (is_cpu_present or is_memory_present): + self.assertEquals(collected_metrics, {}) + return + else: + for i in range(num_extensions): + name = "dummy_extension_{0}".format(i) + + if is_memory_present: + self.assertIn(name, collected_metrics) + self.assertIn("memory", collected_metrics[name]) + self.assertIn("cur_mem", collected_metrics[name]["memory"]) + self.assertIn("max_mem", collected_metrics[name]["memory"]) + self.assertEqual(num_summarization_values, len(collected_metrics[name]["memory"]["cur_mem"])) + self.assertEqual(num_summarization_values, len(collected_metrics[name]["memory"]["max_mem"])) + + 
self.assertIn("proc_statm_memory", collected_metrics[name]) + self.assertEqual(3, len(collected_metrics[name]["proc_statm_memory"])) # number of processes added + for tracked_process in collected_metrics[name]["proc_statm_memory"]: + self.assertEqual(num_summarization_values, + len(collected_metrics[name]["proc_statm_memory"][tracked_process])) + self.assertListEqual(generate_metric_list(proc_stat_memory_usage_values), + collected_metrics[name]["proc_statm_memory"][tracked_process][0:5]) + + self.assertListEqual(generate_metric_list(memory_usage_values), + collected_metrics[name]["memory"]["cur_mem"][0:5]) + self.assertListEqual(generate_metric_list(max_memory_usage_values), + collected_metrics[name]["memory"]["max_mem"][0:5]) + + if is_cpu_present: + self.assertIn("cpu", collected_metrics[name]) + self.assertIn("cur_cpu", collected_metrics[name]["cpu"]) + self.assertEqual(num_summarization_values, len(collected_metrics[name]["cpu"]["cur_cpu"])) + self.assertListEqual(generate_metric_list(cpu_percent_values), + collected_metrics[name]["cpu"]["cur_cpu"][0:5]) def test_telemetry_polling_with_active_cgroups(self, *args): - num_extensions = 5 - for i in range(num_extensions): - dummy_cpu_cgroup = CGroup.create("dummy_cpu_path_{0}".format(i), "cpu", "dummy_extension_{0}".format(i)) - CGroupsTelemetry.track_cgroup(dummy_cpu_cgroup) + num_extensions = 3 - dummy_memory_cgroup = CGroup.create("dummy_memory_path_{0}".format(i), "memory", - "dummy_extension_{0}".format(i)) - CGroupsTelemetry.track_cgroup(dummy_memory_cgroup) + self._track_new_extension_cgroups(num_extensions) with patch("azurelinuxagent.common.cgroup.MemoryCgroup.get_max_memory_usage") as patch_get_memory_max_usage: with patch("azurelinuxagent.common.cgroup.MemoryCgroup.get_memory_usage") as patch_get_memory_usage: @@ -146,145 +245,129 @@ def test_telemetry_polling_with_active_cgroups(self, *args): current_cpu = 30 current_memory = 209715200 current_max_memory = 471859200 + current_proc_statm = 
TestCGroupsTelemetry.TestProcStatmMemoryUsed + # 1 CPU metric + 1 Current Memory + 1 Max memory + num_processes * memory from statm + num_of_metrics_per_extn_expected = 1 + 1 + 1 + 3 * 1 patch_get_cpu_usage.return_value = current_cpu patch_get_memory_usage.return_value = current_memory # example 200 MB patch_get_memory_max_usage.return_value = current_max_memory # example 450 MB + num_polls = 10 - poll_count = 1 - - for data_count in range(poll_count, 10): + for data_count in range(1, num_polls + 1): metrics = CGroupsTelemetry.poll_all_tracked() - self.assertEqual(len(CGroupsTelemetry._cgroup_metrics), num_extensions) - self._assert_cgroup_metrics_equal( - cpu_usage=[current_cpu] * data_count, - memory_usage=[current_memory] * data_count, - max_memory_usage=[current_max_memory] * data_count) - self.assertEqual(len(metrics), num_extensions * 3) - self._assert_cgroup_polling_metrics_equal(metrics, current_cpu, current_memory, current_max_memory) - - CGroupsTelemetry.report_all_tracked() - - self.assertEqual(CGroupsTelemetry._cgroup_metrics.__len__(), num_extensions) - self._assert_cgroup_metrics_equal([], [], []) - - def test_telemetry_polling_with_inactive_cgroups(self, *args): - num_extensions = 5 - for i in range(num_extensions): - dummy_cpu_cgroup = CGroup.create("dummy_cpu_path_{0}".format(i), "cpu", "dummy_extension_{0}".format(i)) - CGroupsTelemetry.track_cgroup(dummy_cpu_cgroup) - dummy_memory_cgroup = CGroup.create("dummy_memory_path_{0}".format(i), "memory", - "dummy_extension_{0}".format(i)) - CGroupsTelemetry.track_cgroup(dummy_memory_cgroup) - - with patch("azurelinuxagent.common.cgroup.MemoryCgroup.get_max_memory_usage") as patch_get_memory_max_usage: - with patch("azurelinuxagent.common.cgroup.MemoryCgroup.get_memory_usage") as patch_get_memory_usage: - with patch("azurelinuxagent.common.cgroup.CpuCgroup.get_cpu_usage") as patch_get_cpu_usage: - with patch("azurelinuxagent.common.cgroup.CGroup.is_active") as patch_is_active: -
patch_is_active.return_value = False - - no_extensions_expected = 0 - data_count = 1 - current_cpu = 30 - current_memory = 209715200 - current_max_memory = 471859200 + self.assertEqual(len(CGroupsTelemetry._cgroup_metrics), num_extensions) + self._assert_calculated_resource_metrics_equal(cpu_usage=[current_cpu] * data_count, + memory_usage=[current_memory] * data_count, + max_memory_usage=[current_max_memory] * data_count, + proc_ids=TestCGroupsTelemetry.TestProcessIds, + memory_statm_memory_usage=[current_proc_statm] * data_count) + self.assertEqual(len(metrics), num_extensions * num_of_metrics_per_extn_expected) + self._assert_polled_metrics_equal(metrics, current_cpu, current_memory, current_max_memory, + current_proc_statm) - patch_get_cpu_usage.return_value = current_cpu - patch_get_memory_usage.return_value = current_memory # example 200 MB - patch_get_memory_max_usage.return_value = current_max_memory # example 450 MB + collected_metrics = CGroupsTelemetry.report_all_tracked() - for i in range(num_extensions): - self.assertTrue(CGroupsTelemetry.is_tracked("dummy_cpu_path_{0}".format(i))) - self.assertTrue(CGroupsTelemetry.is_tracked("dummy_memory_path_{0}".format(i))) + self._assert_extension_metrics_data(collected_metrics, num_extensions, + [current_cpu] * num_polls, + [TestCGroupsTelemetry.TestProcStatmMemoryUsed] * num_polls, + [current_memory] * num_polls, + [current_max_memory] * num_polls, + is_cpu_present=False) - metrics = CGroupsTelemetry.poll_all_tracked() + self.assertEqual(CGroupsTelemetry._cgroup_metrics.__len__(), num_extensions) + self._assert_calculated_resource_metrics_equal([], [], [], [], []) - for i in range(num_extensions): - self.assertFalse(CGroupsTelemetry.is_tracked("dummy_cpu_path_{0}".format(i))) - self.assertFalse(CGroupsTelemetry.is_tracked("dummy_memory_path_{0}".format(i))) - - self.assertEqual(CGroupsTelemetry._cgroup_metrics.__len__(), num_extensions) - self._assert_cgroup_metrics_equal( - cpu_usage=[current_cpu] * 
data_count, - memory_usage=[current_memory] * data_count, - max_memory_usage=[current_max_memory] * data_count) - self.assertEqual(len(metrics), num_extensions * 3) - self._assert_cgroup_polling_metrics_equal(metrics, current_cpu, current_memory, current_max_memory) + @patch("azurelinuxagent.common.cgroup.MemoryCgroup.get_max_memory_usage", side_effect=raise_ioerror) + @patch("azurelinuxagent.common.cgroup.MemoryCgroup.get_memory_usage", side_effect=raise_ioerror) + @patch("azurelinuxagent.common.cgroup.CpuCgroup.get_cpu_usage", side_effect=raise_ioerror) + @patch("azurelinuxagent.common.cgroup.CGroup.is_active", return_value=False) + def test_telemetry_polling_with_inactive_cgroups(self, *_): + num_extensions = 5 + no_extensions_expected = 0 - CGroupsTelemetry.report_all_tracked() + self._track_new_extension_cgroups(num_extensions) + self._assert_cgroups_are_tracked(num_extensions) - self.assertEqual(CGroupsTelemetry._cgroup_metrics.__len__(), no_extensions_expected) - self._assert_cgroup_metrics_equal([], [], []) + metrics = CGroupsTelemetry.poll_all_tracked() - def test_telemetry_polling_with_changing_cgroups_state(self, *args): - num_extensions = 5 for i in range(num_extensions): - dummy_cpu_cgroup = CGroup.create("dummy_cpu_path_{0}".format(i), "cpu", "dummy_extension_{0}".format(i)) - CGroupsTelemetry.track_cgroup(dummy_cpu_cgroup) + self.assertFalse(CGroupsTelemetry.is_tracked("dummy_cpu_path_{0}".format(i))) + self.assertFalse(CGroupsTelemetry.is_tracked("dummy_memory_path_{0}".format(i))) - dummy_memory_cgroup = CGroup.create("dummy_memory_path_{0}".format(i), "memory", - "dummy_extension_{0}".format(i)) - CGroupsTelemetry.track_cgroup(dummy_memory_cgroup) + self.assertEqual(CGroupsTelemetry._cgroup_metrics.__len__(), num_extensions) + self._assert_calculated_resource_metrics_equal([], [], [], [], proc_ids=None) + self.assertEqual(len(metrics), 0) - with patch("azurelinuxagent.common.cgroup.MemoryCgroup.get_max_memory_usage") as patch_get_memory_max_usage: 
- with patch("azurelinuxagent.common.cgroup.MemoryCgroup.get_memory_usage") as patch_get_memory_usage: - with patch("azurelinuxagent.common.cgroup.CpuCgroup.get_cpu_usage") as patch_get_cpu_usage: - with patch("azurelinuxagent.common.cgroup.CGroup.is_active") as patch_is_active: - patch_is_active.return_value = True + collected_metrics = CGroupsTelemetry.report_all_tracked() + self._assert_extension_metrics_data(collected_metrics, num_extensions, [], [], [], [], is_cpu_present=False, + is_memory_present=False) + self.assertEqual(CGroupsTelemetry._cgroup_metrics.__len__(), no_extensions_expected) + self._assert_calculated_resource_metrics_equal([], [], [], [], []) + + @patch("azurelinuxagent.common.cgroup.MemoryCgroup.get_max_memory_usage") + @patch("azurelinuxagent.common.cgroup.MemoryCgroup.get_memory_usage") + @patch("azurelinuxagent.common.cgroup.CpuCgroup.get_cpu_usage") + @patch("azurelinuxagent.common.cgroup.CGroup.is_active") + @patch("azurelinuxagent.common.resourceusage.MemoryResourceUsage.get_memory_usage_from_proc_statm") + def test_telemetry_polling_with_changing_cgroups_state(self, patch_get_statm, patch_is_active, patch_get_cpu_usage, + patch_get_mem, patch_get_max_mem, *args): + num_extensions = 5 + self._track_new_extension_cgroups(num_extensions) - no_extensions_expected = 0 - expected_data_count = 2 + patch_is_active.return_value = True - current_cpu = 30 - current_memory = 209715200 - current_max_memory = 471859200 + no_extensions_expected = 0 + expected_data_count = 1 - patch_get_cpu_usage.return_value = current_cpu - patch_get_memory_usage.return_value = current_memory # example 200 MB - patch_get_memory_max_usage.return_value = current_max_memory # example 450 MB + current_cpu = 30 + current_memory = 209715200 + current_max_memory = 471859200 + current_proc_statm = 20000000 - for i in range(num_extensions): - self.assertTrue(CGroupsTelemetry.is_tracked("dummy_cpu_path_{0}".format(i))) - 
self.assertTrue(CGroupsTelemetry.is_tracked("dummy_memory_path_{0}".format(i))) + patch_get_cpu_usage.return_value = current_cpu + patch_get_mem.return_value = current_memory # example 200 MB + patch_get_max_mem.return_value = current_max_memory # example 450 MB + patch_get_statm.return_value = current_proc_statm - CGroupsTelemetry.poll_all_tracked() + self._assert_cgroups_are_tracked(num_extensions) + CGroupsTelemetry.poll_all_tracked() - for i in range(num_extensions): - self.assertTrue(CGroupsTelemetry.is_tracked("dummy_cpu_path_{0}".format(i))) - self.assertTrue(CGroupsTelemetry.is_tracked("dummy_memory_path_{0}".format(i))) + self._assert_cgroups_are_tracked(num_extensions) - self.assertEqual(CGroupsTelemetry._cgroup_metrics.__len__(), num_extensions) + patch_is_active.return_value = False + patch_get_cpu_usage.side_effect = raise_ioerror + patch_get_mem.side_effect = raise_ioerror + patch_get_max_mem.side_effect = raise_ioerror + patch_get_statm.side_effect = raise_ioerror - patch_is_active.return_value = False - CGroupsTelemetry.poll_all_tracked() + CGroupsTelemetry.poll_all_tracked() - for i in range(num_extensions): - self.assertFalse(CGroupsTelemetry.is_tracked("dummy_cpu_path_{0}".format(i))) - self.assertFalse(CGroupsTelemetry.is_tracked("dummy_memory_path_{0}".format(i))) + for i in range(num_extensions): + self.assertFalse(CGroupsTelemetry.is_tracked("dummy_cpu_path_{0}".format(i))) + self.assertFalse(CGroupsTelemetry.is_tracked("dummy_memory_path_{0}".format(i))) - self.assertEqual(CGroupsTelemetry._cgroup_metrics.__len__(), num_extensions) - self._assert_cgroup_metrics_equal( - cpu_usage=[current_cpu] * expected_data_count, - memory_usage=[current_memory] * expected_data_count, - max_memory_usage=[current_max_memory] * expected_data_count) + self.assertEqual(CGroupsTelemetry._cgroup_metrics.__len__(), num_extensions) + self._assert_calculated_resource_metrics_equal( + cpu_usage=[current_cpu] * expected_data_count, + memory_usage=[current_memory] * 
expected_data_count, + max_memory_usage=[current_max_memory] * expected_data_count, + proc_ids=TestCGroupsTelemetry.TestProcessIds, + memory_statm_memory_usage=[current_proc_statm] * expected_data_count + ) - CGroupsTelemetry.report_all_tracked() + CGroupsTelemetry.report_all_tracked() - self.assertEqual(CGroupsTelemetry._cgroup_metrics.__len__(), no_extensions_expected) - self._assert_cgroup_metrics_equal([], [], []) + self.assertEqual(CGroupsTelemetry._cgroup_metrics.__len__(), no_extensions_expected) + self._assert_calculated_resource_metrics_equal([], [], [], [], []) + # mocking get_proc_stat to make it run on Mac and other systems. This test does not need to read the values of the + # /proc/stat file on the filesystem. @patch("azurelinuxagent.common.logger.periodic_warn") def test_telemetry_polling_to_not_generate_transient_logs_ioerror_file_not_found(self, patch_periodic_warn): num_extensions = 1 - for i in range(num_extensions): - dummy_cpu_cgroup = CGroup.create("dummy_cpu_path_{0}".format(i), "cpu", "dummy_extension_{0}".format(i)) - CGroupsTelemetry.track_cgroup(dummy_cpu_cgroup) - - dummy_memory_cgroup = CGroup.create("dummy_memory_path_{0}".format(i), "memory", - "dummy_extension_{0}".format(i)) - CGroupsTelemetry.track_cgroup(dummy_memory_cgroup) - + self._track_new_extension_cgroups(num_extensions) self.assertEqual(0, patch_periodic_warn.call_count) # Not expecting logs present for io_error with errno=errno.ENOENT @@ -302,14 +385,7 @@ def test_telemetry_polling_to_generate_transient_logs_ioerror_permission_denied( num_extensions = 1 num_controllers = 2 is_active_check_per_controller = 2 - - for i in range(num_extensions): - dummy_cpu_cgroup = CGroup.create("dummy_cpu_path_{0}".format(i), "cpu", "dummy_extension_{0}".format(i)) - CGroupsTelemetry.track_cgroup(dummy_cpu_cgroup) - - dummy_memory_cgroup = CGroup.create("dummy_memory_path_{0}".format(i), "memory", - "dummy_extension_{0}".format(i)) - CGroupsTelemetry.track_cgroup(dummy_memory_cgroup) + 
self._track_new_extension_cgroups(num_extensions) self.assertEqual(0, patch_periodic_warn.call_count) @@ -329,13 +405,7 @@ def test_telemetry_polling_to_generate_transient_logs_ioerror_permission_denied( def test_telemetry_polling_to_generate_transient_logs_index_error(self): num_extensions = 1 - for i in range(num_extensions): - dummy_cpu_cgroup = CGroup.create("dummy_cpu_path_{0}".format(i), "cpu", "dummy_extension_{0}".format(i)) - CGroupsTelemetry.track_cgroup(dummy_cpu_cgroup) - - dummy_memory_cgroup = CGroup.create("dummy_memory_path_{0}".format(i), "memory", - "dummy_extension_{0}".format(i)) - CGroupsTelemetry.track_cgroup(dummy_memory_cgroup) + self._track_new_extension_cgroups(num_extensions) # Generating a different kind of error (non-IOError) to check the logging. # Trying to invoke IndexError during the getParameter call @@ -346,101 +416,61 @@ def test_telemetry_polling_to_generate_transient_logs_index_error(self): CGroupsTelemetry.poll_all_tracked() self.assertEqual(expected_call_count, patch_periodic_warn.call_count) - def test_telemetry_calculations(self, *args): + @patch("azurelinuxagent.common.cgroup.MemoryCgroup.get_max_memory_usage") + @patch("azurelinuxagent.common.cgroup.MemoryCgroup.get_memory_usage") + @patch("azurelinuxagent.common.cgroup.CpuCgroup.get_cpu_usage") + @patch("azurelinuxagent.common.cgroup.CGroup.is_active") + @patch("azurelinuxagent.common.resourceusage.MemoryResourceUsage.get_memory_usage_from_proc_statm") + def test_telemetry_calculations(self, patch_get_statm, patch_is_active, patch_get_cpu_usage, + patch_get_memory_usage, patch_get_memory_max_usage, *args): num_polls = 10 num_extensions = 1 - num_summarization_values = 7 cpu_percent_values = [random.randint(0, 100) for _ in range(num_polls)] # only verifying calculations and not validity of the values. 
memory_usage_values = [random.randint(0, 8 * 1024 ** 3) for _ in range(num_polls)] max_memory_usage_values = [random.randint(0, 8 * 1024 ** 3) for _ in range(num_polls)] + proc_stat_memory_usage_values = [random.randint(0, 8 * 1024 ** 3) for _ in range(num_polls)] - for i in range(num_extensions): - dummy_cpu_cgroup = CGroup.create("dummy_cpu_path_{0}".format(i), "cpu", "dummy_extension_{0}".format(i)) - CGroupsTelemetry.track_cgroup(dummy_cpu_cgroup) + self._track_new_extension_cgroups(num_extensions) + self.assertEqual(2 * num_extensions, len(CGroupsTelemetry._tracked)) - dummy_memory_cgroup = CGroup.create("dummy_memory_path_{0}".format(i), "memory", - "dummy_extension_{0}".format(i)) - CGroupsTelemetry.track_cgroup(dummy_memory_cgroup) + for i in range(num_polls): + patch_is_active.return_value = True + patch_get_cpu_usage.return_value = cpu_percent_values[i] + patch_get_memory_usage.return_value = memory_usage_values[i] # example 200 MB + patch_get_memory_max_usage.return_value = max_memory_usage_values[i] # example 450 MB + patch_get_statm.return_value = proc_stat_memory_usage_values[i] - self.assertEqual(2 * num_extensions, len(CGroupsTelemetry._tracked)) + metrics = CGroupsTelemetry.poll_all_tracked() - with patch("azurelinuxagent.common.cgroup.MemoryCgroup.get_max_memory_usage") as patch_get_memory_max_usage: - with patch("azurelinuxagent.common.cgroup.MemoryCgroup.get_memory_usage") as patch_get_memory_usage: - with patch("azurelinuxagent.common.cgroup.CpuCgroup.get_cpu_usage") as patch_get_cpu_usage: - with patch("azurelinuxagent.common.cgroup.CGroup.is_active") as patch_is_active: - for i in range(num_polls): - patch_is_active.return_value = True - patch_get_cpu_usage.return_value = cpu_percent_values[i] - patch_get_memory_usage.return_value = memory_usage_values[i] # example 200 MB - patch_get_memory_max_usage.return_value = max_memory_usage_values[i] # example 450 MB - metrics = CGroupsTelemetry.poll_all_tracked() - self.assertEqual(len(metrics), 3 * 
num_extensions) - self._assert_cgroup_polling_metrics_equal(metrics, cpu_percent_values[i], memory_usage_values[i], - max_memory_usage_values[i]) + # 1 CPU metric + 1 Current Memory + 1 Max memory + num_processes (3) * memory from statm + self.assertEqual(len(metrics), 6 * num_extensions) + self._assert_polled_metrics_equal(metrics, cpu_percent_values[i], memory_usage_values[i], + max_memory_usage_values[i], + proc_stat_memory_usage_values[i]) collected_metrics = CGroupsTelemetry.report_all_tracked() - for i in range(num_extensions): - name = "dummy_extension_{0}".format(i) - - self.assertIn(name, collected_metrics) - self.assertIn("memory", collected_metrics[name]) - self.assertIn("cur_mem", collected_metrics[name]["memory"]) - self.assertIn("max_mem", collected_metrics[name]["memory"]) - self.assertEqual(num_summarization_values, len(collected_metrics[name]["memory"]["cur_mem"])) - self.assertEqual(num_summarization_values, len(collected_metrics[name]["memory"]["max_mem"])) - - self.assertListEqual(generate_metric_list(memory_usage_values), - collected_metrics[name]["memory"]["cur_mem"][0:5]) - self.assertListEqual(generate_metric_list(max_memory_usage_values), - collected_metrics[name]["memory"]["max_mem"][0:5]) - - self.assertIn("cpu", collected_metrics[name]) - self.assertIn("cur_cpu", collected_metrics[name]["cpu"]) - self.assertEqual(num_summarization_values, len(collected_metrics[name]["cpu"]["cur_cpu"])) - self.assertListEqual(generate_metric_list(cpu_percent_values), - collected_metrics[name]["cpu"]["cur_cpu"][0:5]) + self._assert_extension_metrics_data(collected_metrics, num_extensions, + cpu_percent_values, proc_stat_memory_usage_values, memory_usage_values, + max_memory_usage_values) def test_cgroup_tracking(self, *args): - with patch("azurelinuxagent.common.cgroup.CpuCgroup.initialize_cpu_usage") as patch_initialize_cpu_usage: - num_extensions = 5 - num_controllers = 2 - for i in range(num_extensions): - dummy_cpu_cgroup = 
CGroup.create("dummy_cpu_path_{0}".format(i), "cpu", "dummy_extension_{0}".format(i)) - CGroupsTelemetry.track_cgroup(dummy_cpu_cgroup) - - dummy_memory_cgroup = CGroup.create("dummy_memory_path_{0}".format(i), "memory", - "dummy_extension_{0}".format(i)) - CGroupsTelemetry.track_cgroup(dummy_memory_cgroup) - - for i in range(num_extensions): - self.assertTrue(CGroupsTelemetry.is_tracked("dummy_cpu_path_{0}".format(i))) - self.assertTrue(CGroupsTelemetry.is_tracked("dummy_memory_path_{0}".format(i))) - - self.assertEqual(num_extensions * num_controllers, len(CGroupsTelemetry._tracked)) - self.assertEqual(num_extensions, patch_initialize_cpu_usage.call_count) + num_extensions = 5 + num_controllers = 2 + self._track_new_extension_cgroups(num_extensions) + self._assert_cgroups_are_tracked(num_extensions) + self.assertEqual(num_extensions * num_controllers, len(CGroupsTelemetry._tracked)) def test_cgroup_pruning(self, *args): num_extensions = 5 num_controllers = 2 - for i in range(num_extensions): - dummy_cpu_cgroup = CGroup.create("dummy_cpu_path_{0}".format(i), "cpu", "dummy_extension_{0}".format(i)) - CGroupsTelemetry.track_cgroup(dummy_cpu_cgroup) - - dummy_memory_cgroup = CGroup.create("dummy_memory_path_{0}".format(i), "memory", - "dummy_extension_{0}".format(i)) - CGroupsTelemetry.track_cgroup(dummy_memory_cgroup) - - for i in range(num_extensions): - self.assertTrue(CGroupsTelemetry.is_tracked("dummy_cpu_path_{0}".format(i))) - self.assertTrue(CGroupsTelemetry.is_tracked("dummy_memory_path_{0}".format(i))) - + self._track_new_extension_cgroups(num_extensions) + self._assert_cgroups_are_tracked(num_extensions) self.assertEqual(num_extensions * num_controllers, len(CGroupsTelemetry._tracked)) CGroupsTelemetry.prune_all_tracked() - for i in range(num_extensions): self.assertFalse(CGroupsTelemetry.is_tracked("dummy_cpu_path_{0}".format(i))) self.assertFalse(CGroupsTelemetry.is_tracked("dummy_memory_path_{0}".format(i))) @@ -449,136 +479,102 @@ def 
test_cgroup_pruning(self, *args): def test_cgroup_is_tracked(self, *args): num_extensions = 5 - for i in range(num_extensions): - dummy_cpu_cgroup = CGroup.create("dummy_cpu_path_{0}".format(i), "cpu", "dummy_extension_{0}".format(i)) - CGroupsTelemetry.track_cgroup(dummy_cpu_cgroup) - - dummy_memory_cgroup = CGroup.create("dummy_memory_path_{0}".format(i), "memory", "dummy_extension_{0}". - format(i)) - CGroupsTelemetry.track_cgroup(dummy_memory_cgroup) - - for i in range(num_extensions): - self.assertTrue(CGroupsTelemetry.is_tracked("dummy_cpu_path_{0}".format(i))) - self.assertTrue(CGroupsTelemetry.is_tracked("dummy_memory_path_{0}".format(i))) - + self._track_new_extension_cgroups(num_extensions) + self._assert_cgroups_are_tracked(num_extensions) self.assertFalse(CGroupsTelemetry.is_tracked("not_present_cpu_dummy_path")) self.assertFalse(CGroupsTelemetry.is_tracked("not_present_memory_dummy_path")) + @patch("azurelinuxagent.common.cgroup.CpuCgroup.get_cpu_usage", side_effect=raise_ioerror) + @patch("azurelinuxagent.common.cgroup.MemoryCgroup.get_memory_usage", side_effect=raise_ioerror) def test_process_cgroup_metric_with_incorrect_cgroups_mounted(self, *args): num_extensions = 5 - for i in range(num_extensions): - dummy_cpu_cgroup = CGroup.create("dummy_cpu_path_{0}".format(i), "cpu", "dummy_extension_{0}".format(i)) - CGroupsTelemetry.track_cgroup(dummy_cpu_cgroup) - - dummy_memory_cgroup = CGroup.create("dummy_memory_path_{0}".format(i), "memory", - "dummy_extension_{0}".format(i)) - CGroupsTelemetry.track_cgroup(dummy_memory_cgroup) - - with patch("azurelinuxagent.common.cgroup.CpuCgroup.get_cpu_usage") as patch_get_cpu_usage: - with patch("azurelinuxagent.common.cgroup.MemoryCgroup.get_memory_usage") as patch_get_memory_usage: - patch_get_cpu_usage.side_effect = Exception("File not found") - patch_get_memory_usage.side_effect = Exception("File not found") + self._track_new_extension_cgroups(num_extensions) - for data_count in range(1, 10): - metrics = 
CGroupsTelemetry.poll_all_tracked() - self.assertEqual(len(metrics), 0) + for data_count in range(1, 10): + metrics = CGroupsTelemetry.poll_all_tracked() + self.assertEqual(len(metrics), 0) - self.assertEqual(CGroupsTelemetry._cgroup_metrics.__len__(), num_extensions) + self.assertEqual(CGroupsTelemetry._cgroup_metrics.__len__(), num_extensions) - collected_metrics = {} - for name, cgroup_metrics in CGroupsTelemetry._cgroup_metrics.items(): - collected_metrics[name] = CGroupsTelemetry._process_cgroup_metric(cgroup_metrics) - self.assertEqual(collected_metrics[name], {}) # empty + collected_metrics = {} + for name, cgroup_metrics in CGroupsTelemetry._cgroup_metrics.items(): + collected_metrics[name] = CGroupsTelemetry._process_cgroup_metric(cgroup_metrics) + self.assertEqual(collected_metrics[name], {}) # empty + @patch("azurelinuxagent.common.cgroup.MemoryCgroup.get_memory_usage", side_effect=raise_ioerror) def test_process_cgroup_metric_with_no_memory_cgroup_mounted(self, *args): num_extensions = 5 - - for i in range(num_extensions): - dummy_cpu_cgroup = CGroup.create("dummy_cpu_path_{0}".format(i), "cpu", "dummy_extension_{0}".format(i)) - CGroupsTelemetry.track_cgroup(dummy_cpu_cgroup) - - dummy_memory_cgroup = CGroup.create("dummy_memory_path_{0}".format(i), "memory", - "dummy_extension_{0}".format(i)) - CGroupsTelemetry.track_cgroup(dummy_memory_cgroup) + self._track_new_extension_cgroups(num_extensions) with patch("azurelinuxagent.common.cgroup.CpuCgroup.get_cpu_usage") as patch_get_cpu_usage: - with patch("azurelinuxagent.common.cgroup.MemoryCgroup.get_memory_usage") as patch_get_memory_usage: - with patch("azurelinuxagent.common.cgroup.CGroup.is_active") as patch_is_active: - patch_is_active.return_value = True - patch_get_memory_usage.side_effect = Exception("File not found") + with patch("azurelinuxagent.common.cgroup.CGroup.is_active") as patch_is_active: + patch_is_active.return_value = True - current_cpu = 30 - patch_get_cpu_usage.return_value = 
current_cpu + current_cpu = 30 + patch_get_cpu_usage.return_value = current_cpu - poll_count = 1 + poll_count = 1 - for data_count in range(poll_count, 10): - metrics = CGroupsTelemetry.poll_all_tracked() + for data_count in range(poll_count, 10): + metrics = CGroupsTelemetry.poll_all_tracked() - self.assertEqual(CGroupsTelemetry._cgroup_metrics.__len__(), num_extensions) - self._assert_cgroup_metrics_equal(cpu_usage=[current_cpu] * data_count, memory_usage=[], max_memory_usage=[]) - self.assertEqual(len(metrics), num_extensions * 1) # Only CPU populated - self._assert_cgroup_polling_metrics_equal(metrics, current_cpu, 0, 0) + self.assertEqual(CGroupsTelemetry._cgroup_metrics.__len__(), num_extensions) + self._assert_calculated_resource_metrics_equal(cpu_usage=[current_cpu] * data_count, memory_usage=[] + , max_memory_usage=[], proc_ids=[], + memory_statm_memory_usage=[]) + self.assertEqual(len(metrics), num_extensions * 1) # Only CPU populated + self._assert_polled_metrics_equal(metrics, current_cpu, 0, 0, 0) - CGroupsTelemetry.report_all_tracked() + CGroupsTelemetry.report_all_tracked() - self.assertEqual(CGroupsTelemetry._cgroup_metrics.__len__(), num_extensions) - self._assert_cgroup_metrics_equal([], [], []) + self.assertEqual(CGroupsTelemetry._cgroup_metrics.__len__(), num_extensions) + self._assert_calculated_resource_metrics_equal([], [], [], [], []) + @patch("azurelinuxagent.common.cgroup.CpuCgroup.get_cpu_usage", side_effect=raise_ioerror) def test_process_cgroup_metric_with_no_cpu_cgroup_mounted(self, *args): num_extensions = 5 - for i in range(num_extensions): - dummy_cpu_cgroup = CGroup.create("dummy_cpu_path_{0}".format(i), "cpu", "dummy_extension_{0}".format(i)) - CGroupsTelemetry.track_cgroup(dummy_cpu_cgroup) - dummy_memory_cgroup = CGroup.create("dummy_memory_path_{0}".format(i), "memory", - "dummy_extension_{0}".format(i)) - CGroupsTelemetry.track_cgroup(dummy_memory_cgroup) + self._track_new_extension_cgroups(num_extensions) with 
patch("azurelinuxagent.common.cgroup.MemoryCgroup.get_max_memory_usage") as patch_get_memory_max_usage: with patch("azurelinuxagent.common.cgroup.MemoryCgroup.get_memory_usage") as patch_get_memory_usage: - with patch("azurelinuxagent.common.cgroup.CpuCgroup.get_cpu_usage") as patch_get_cpu_usage: - with patch("azurelinuxagent.common.cgroup.CGroup.is_active") as patch_is_active: - patch_is_active.return_value = True - - patch_get_cpu_usage.side_effect = Exception("File not found") - - current_memory = 209715200 - current_max_memory = 471859200 - - patch_get_memory_usage.return_value = current_memory # example 200 MB - patch_get_memory_max_usage.return_value = current_max_memory # example 450 MB - - poll_count = 1 - - for data_count in range(poll_count, 10): - metrics = CGroupsTelemetry.poll_all_tracked() - self.assertEqual(len(CGroupsTelemetry._cgroup_metrics), num_extensions) - self._assert_cgroup_metrics_equal( - cpu_usage=[], - memory_usage=[current_memory] * data_count, - max_memory_usage=[current_max_memory] * data_count) - # Memory is only populated, CPU is not. Thus 2 metrics per cgroup. 
- self.assertEqual(len(metrics), num_extensions * 2) - self._assert_cgroup_polling_metrics_equal(metrics, 0, current_memory, current_max_memory) + with patch("azurelinuxagent.common.cgroup.CGroup.is_active") as patch_is_active: + patch_is_active.return_value = True - CGroupsTelemetry.report_all_tracked() + current_memory = 209715200 + current_max_memory = 471859200 + patch_get_memory_usage.return_value = current_memory # example 200 MB + patch_get_memory_max_usage.return_value = current_max_memory # example 450 MB + num_polls = 10 + for data_count in range(1, num_polls + 1): + metrics = CGroupsTelemetry.poll_all_tracked() self.assertEqual(len(CGroupsTelemetry._cgroup_metrics), num_extensions) - self._assert_cgroup_metrics_equal([], [], []) + self._assert_calculated_resource_metrics_equal(cpu_usage=[], memory_usage=[current_memory] * data_count, + max_memory_usage=[current_max_memory] * data_count, + memory_statm_memory_usage=[TestCGroupsTelemetry.TestProcStatmMemoryUsed] * data_count, + proc_ids=TestCGroupsTelemetry.TestProcessIds) + # Memory is only populated, CPU is not. Thus 5 metrics per cgroup. 
+ self.assertEqual(len(metrics), num_extensions * 5) + self._assert_polled_metrics_equal(metrics, 0, current_memory, current_max_memory, + TestCGroupsTelemetry.TestProcStatmMemoryUsed) + + collected_metrics = CGroupsTelemetry.report_all_tracked() + self._assert_extension_metrics_data(collected_metrics, num_extensions, + [], [TestCGroupsTelemetry.TestProcStatmMemoryUsed] * num_polls, + [current_memory] * num_polls, + [current_max_memory] * num_polls, + is_cpu_present=False) + + self.assertEqual(len(CGroupsTelemetry._cgroup_metrics), num_extensions) + self._assert_calculated_resource_metrics_equal([], [], [], [], []) @patch("azurelinuxagent.common.cgroup.MemoryCgroup.get_memory_usage", side_effect=raise_ioerror) @patch("azurelinuxagent.common.cgroup.MemoryCgroup.get_max_memory_usage", side_effect=raise_ioerror) @patch("azurelinuxagent.common.cgroup.CpuCgroup.get_cpu_usage", side_effect=raise_ioerror) def test_extension_telemetry_not_sent_for_empty_perf_metrics(self, *args): num_extensions = 5 - for i in range(num_extensions): - dummy_cpu_cgroup = CGroup.create("dummy_cpu_path_{0}".format(i), "cpu", "dummy_extension_{0}".format(i)) - CGroupsTelemetry.track_cgroup(dummy_cpu_cgroup) - - dummy_memory_cgroup = CGroup.create("dummy_memory_path_{0}".format(i), - "memory", "dummy_extension_{0}".format(i)) - CGroupsTelemetry.track_cgroup(dummy_memory_cgroup) + self._track_new_extension_cgroups(num_extensions) with patch("azurelinuxagent.common.cgroupstelemetry.CGroupsTelemetry._process_cgroup_metric") as \ patch_process_cgroup_metric: diff --git a/tests/common/test_resourceusage.py b/tests/common/test_resourceusage.py new file mode 100644 index 0000000000..7d38799322 --- /dev/null +++ b/tests/common/test_resourceusage.py @@ -0,0 +1,86 @@ +# Copyright 2016 Microsoft Corporation +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# +# Requires Python 2.6+ and Openssl 1.0+ +# + +import os + +from azurelinuxagent.common.resourceusage import MemoryResourceUsage, ProcessInfo, ProcessInfoException +from azurelinuxagent.common.utils import fileutil +from tests.tools import AgentTestCase, data_dir, patch + + +def raise_ioerror(*_): + e = IOError() + from errno import ENOENT + e.errno = ENOENT + raise e + + +def raise_exception(*_): + raise Exception() + + +class TestMemoryResourceUsage(AgentTestCase): + @patch("azurelinuxagent.common.resourceusage.fileutil") + def test_get_memory_usage_from_proc_statm(self, patch_read_file): + patch_read_file.read_file.return_value = fileutil.read_file(os.path.join( + data_dir, "cgroups", "dummy_proc_statm")) + mem_usage = MemoryResourceUsage.get_memory_usage_from_proc_statm(1000) + self.assertEqual(mem_usage, 331866112) + + # No such file exists. Throw IOError (similar to the IOError we throw for Cgroups). + patch_read_file.read_file.side_effect = raise_ioerror + with self.assertRaises(IOError): + MemoryResourceUsage.get_memory_usage_from_proc_statm(1000) + + # Some other exception occurred. Throw ProcessInfoException.
+ patch_read_file.read_file.side_effect = raise_exception + with self.assertRaises(ProcessInfoException): + MemoryResourceUsage.get_memory_usage_from_proc_statm(1000) + + +class TestProcessInfo(AgentTestCase): + @patch("azurelinuxagent.common.resourceusage.fileutil") + def test_get_proc_cmdline(self, patch_read_file): + patch_read_file.read_file.return_value = fileutil.read_file(os.path.join(data_dir, "cgroups", "dummy_proc_cmdline")) + cmdline = ProcessInfo.get_proc_cmdline(1000) + self.assertEqual("python -u bin/WALinuxAgent-2.2.45-py2.7.egg -run-exthandlers", cmdline) + + patch_read_file.read_file.side_effect = raise_ioerror + # No such file exists; _get_proc_cmdline throws exception. + with self.assertRaises(IOError): + ProcessInfo._get_proc_cmdline(1000) + + patch_read_file.read_file.side_effect = raise_exception + # Other exception; _get_proc_cmdline throws exception. + with self.assertRaises(ProcessInfoException): + ProcessInfo._get_proc_cmdline(1000) + + @patch("azurelinuxagent.common.resourceusage.fileutil") + def test_get_proc_comm(self, patch_read_file): + patch_read_file.read_file.return_value = fileutil.read_file(os.path.join(data_dir, "cgroups", "dummy_proc_comm")) + proc_name = ProcessInfo.get_proc_name(1000) + self.assertEqual("python", proc_name) + + patch_read_file.read_file.side_effect = raise_ioerror + # No such file exists; get_proc_name raises IOError. + with self.assertRaises(IOError): + ProcessInfo.get_proc_name(1000) + + patch_read_file.read_file.side_effect = raise_exception + # Other exception; _get_proc_comm throws exception.
+ with self.assertRaises(ProcessInfoException): + ProcessInfo._get_proc_comm(1000) diff --git a/tests/data/cgroups/cpu_mount/cgroup.procs b/tests/data/cgroups/cpu_mount/cgroup.procs new file mode 100644 index 0000000000..e69de29bb2 diff --git a/tests/data/cgroups/dummy_proc_cmdline b/tests/data/cgroups/dummy_proc_cmdline new file mode 100644 index 0000000000000000000000000000000000000000..125c505195a2425e6008896e7c0451022963ef9e GIT binary patch literal 61 zcmXS@EXl~vW6&*SNXpF94|nv*%qy*MOi#@#(KXUD(lasDEvPioGuKN^PiN3AD$UbP QttiPz%uC5hEh=UJ0BU&?`2YX_ literal 0 HcmV?d00001 diff --git a/tests/data/cgroups/dummy_proc_comm b/tests/data/cgroups/dummy_proc_comm new file mode 100644 index 0000000000..d8654aa0e2 --- /dev/null +++ b/tests/data/cgroups/dummy_proc_comm @@ -0,0 +1 @@ +python \ No newline at end of file diff --git a/tests/data/cgroups/dummy_proc_statm b/tests/data/cgroups/dummy_proc_statm new file mode 100644 index 0000000000..7f78f852b5 --- /dev/null +++ b/tests/data/cgroups/dummy_proc_statm @@ -0,0 +1 @@ +980608 81022 30304 4 0 93606 0 \ No newline at end of file diff --git a/tests/data/cgroups/memory_mount/cgroup.procs b/tests/data/cgroups/memory_mount/cgroup.procs new file mode 100644 index 0000000000..e69de29bb2 diff --git a/tests/ga/test_monitor.py b/tests/ga/test_monitor.py index 8b3a21442b..bcc2a78f4a 100644 --- a/tests/ga/test_monitor.py +++ b/tests/ga/test_monitor.py @@ -37,6 +37,7 @@ from azurelinuxagent.common.event import CONTAINER_ID_ENV_VARIABLE, EventLogger, WALAEventOperation from azurelinuxagent.common.exception import HttpError from azurelinuxagent.common.future import ustr +from azurelinuxagent.common.logger import Logger from azurelinuxagent.common.osutil.default import BASE_CGROUPS, DefaultOSUtil from azurelinuxagent.common.protocol.imds import ComputeInfo from azurelinuxagent.common.protocol.restapi import VMInfo @@ -112,6 +113,13 @@ def get_event_message(duration, evt_type, is_internal, is_success, message, name 
@patch("azurelinuxagent.common.protocol.healthservice.HealthService._report") @patch("azurelinuxagent.common.utils.restutil.http_get") class TestMonitor(AgentTestCase): + def setUp(self): + AgentTestCase.setUp(self) + prefix = "UnitTest" + logger.DEFAULT_LOGGER = Logger(prefix=prefix) + + def tearDown(self): + AgentTestCase.tearDown(self) def test_parse_xml_event(self, *args): data_str = load_data('ext/event_from_extension.xml') @@ -1070,6 +1078,7 @@ def test_report_event_metrics_sent_for_actual_cgroup(self, patch_report_event, p max_num_polls = 5 time_to_wait = 1 extn_name = "foobar-1.0.0" + extn_folder_name = extn_name.replace("-", "_") cgs = make_new_cgroup(extn_name) self.assertEqual(len(cgs), 2) @@ -1088,30 +1097,34 @@ def test_report_event_metrics_sent_for_actual_cgroup(self, patch_report_event, p command = self.create_script("keep_cpu_busy_and_consume_memory_for_{0}_seconds".format(time_to_wait), ''' nohup python -c "import time +import subprocess for i in range(3): x = [1, 2, 3, 4, 5] * (i * 1000) time.sleep({0}) x *= 0 - print('Test loop')" & + print('Test loop') + +" & '''.format(time_to_wait)) self.log_dir = os.path.join(self.tmp_dir, "log") - with patch("azurelinuxagent.ga.exthandlers.ExtHandlerInstance.get_base_dir", lambda *_: self.tmp_dir) as \ - patch_get_base_dir: - with patch("azurelinuxagent.ga.exthandlers.ExtHandlerInstance.get_log_dir", lambda *_: self.log_dir) as \ - patch_get_log_dir: + with patch("azurelinuxagent.ga.exthandlers.ExtHandlerInstance.get_base_dir", lambda *_: self.tmp_dir): + with patch("azurelinuxagent.ga.exthandlers.ExtHandlerInstance.get_log_dir", lambda *_: self.log_dir): ext_handler_instance.launch_command(command) - self.assertTrue(CGroupsTelemetry.is_tracked(os.path.join( - BASE_CGROUPS, "cpu", "walinuxagent.extensions", "foobar_1.0.0"))) - self.assertTrue(CGroupsTelemetry.is_tracked(os.path.join( - BASE_CGROUPS, "memory", "walinuxagent.extensions", "foobar_1.0.0"))) + 
self.assertTrue(CGroupsTelemetry.is_tracked(os.path.join(BASE_CGROUPS, "cpu", "walinuxagent.extensions", extn_folder_name))) + self.assertTrue(CGroupsTelemetry.is_tracked(os.path.join(BASE_CGROUPS, "memory", "walinuxagent.extensions", extn_folder_name))) for i in range(max_num_polls): metrics = CGroupsTelemetry.poll_all_tracked() - self.assertEqual(len(metrics), 3) + # Currently there are 3 types of memory related metrics and 1 CPU related metric. + # % Processor Time + # Total Memory Usage + # Max Memory Usage + # Memory Used by Process - This can have multiple entries (for each process that gets created). + self.assertEqual(len(metrics), 4) monitor_handler.poll_telemetry_metrics() monitor_handler.send_telemetry_metrics() @@ -1120,6 +1133,7 @@ def test_report_event_metrics_sent_for_actual_cgroup(self, patch_report_event, p telemetry_event_list = patch_report_event.call_args_list[0][0][0] for e in telemetry_event_list.events: + print([(i.name, i.value) for i in e.parameters]) details_of_event = [x for x in e.parameters if x.name in ["Category", "Counter", "Instance", "Value"]] @@ -1127,9 +1141,8 @@ def test_report_event_metrics_sent_for_actual_cgroup(self, patch_report_event, p if i.name == "Category": self.assertIn(i.value, ["Memory", "Process"]) if i.name == "Counter": - self.assertIn(i.value, ["Max Memory Usage", "Total Memory Usage", "% Processor Time"]) - if i.name == "Instance": - self.assertEqual(i.value, extn_name) + self.assertIn(i.value, ["Max Memory Usage", "Total Memory Usage", "% Processor Time", + "Memory Used by Process"]) if i.name == "Value": self.assertTrue(isinstance(i.value, int) or isinstance(i.value, float))