Skip to content

Commit

Permalink
Start sending additional memory information per process (#1729)
Browse files Browse the repository at this point in the history
* Adding ResourceUsage Class and Adding ProcessIds in Telemetry

* Adding process tests to fetch the processes in cgroup

* Sending Resource Metrics for Memory Usage telemetry.

* Sending Memory Data for Tracked Processes

* Fix proc_statm collection and dictionary formatting

* Updating the process_name pattern when sending memory telemetry

* Adding the default process_name, process_commandline

* Handling inactive cgroups; not sending empty values in ExtensionMetricsData

- Making the handling more explicit for the Memory cases as well when an IOError is raised.
- Adding a filter to make sure entries are not added to Metrics that are marked_for_delete
- Tests changes

* Fixing the missed test - Changing the memory exceptions.

* Review Comments addressed, and clearer exception handling.

- cgroups.py - cleaner returns and raises
- cgroupstelemetry.py - exception handling when proc_statm for PID fails.
- exception.py - cgroup comments fixed
- resourceusage.py - raising an exception when failing to get memory info.
- monitor.py - reverted the reset_logger changes
- test_cgroupstelemetry.py - stray print and mock removed.
- test_resourceusage.py - newline and end
- test_monitor.py - removed reset_logger changes, and count of metrics fixed.

* Making IOError explicit for get_memory_usage_proc_statm & test fixes

- resourceusage.py - Propagate IOError (errno 2) to the caller, and for other exceptions, raise ProcessInfoException.
- test_resourceusage.py - Add more asserts to check the raising of IOError, ProcessInfoException.

* Addressing review comments.

- cgroupstelemetry.py - handling of exception thrown by get_proc_*. Also, some variable name refactoring.
- resourceusage.py - Bubbling up exceptions in get_proc_*
- test_cgroupstelemetry.py - Refactoring variable name.
- test_resourceusage.py - Changes in test to test exception bubbling up.

* Initializing a new logger for each test here to not conflict with others

* Review comments addressed and some refactoring.

- test_cgroups.py - Simple refactoring of class setup and asserts fixed for get_tracked_process
- resourceusage.py - Comments fixed.
- cgroupstelemetry.py - Refactored the strings into class for easy usage.
- cgroup.py - Refactoring the controller names strings into class. Also changed get_tracked_processes' return behavior.
  • Loading branch information
vrdmr authored Dec 16, 2019
1 parent 435dd44 commit 8194bbc
Show file tree
Hide file tree
Showing 14 changed files with 773 additions and 401 deletions.
62 changes: 39 additions & 23 deletions azurelinuxagent/common/cgroup.py
Original file line number Diff line number Diff line change
Expand Up @@ -26,15 +26,20 @@
re_user_system_times = re.compile(r'user (\d+)\nsystem (\d+)\n')


class CGroupContollers(object):
    """
    Names of the cgroup controllers that CGroup.create knows how to handle.

    NOTE: the class name is misspelled ("Contollers"), but it is imported under this name by other
    modules, so the spelling must not be changed here.
    """
    # Selects CpuCgroup in CGroup.create.
    CPU = "cpu"
    # Selects MemoryCgroup in CGroup.create.
    MEMORY = "memory"


class CGroup(object):
@staticmethod
def create(cgroup_path, controller, extension_name):
"""
Factory method to create the correct CGroup.
"""
if controller == "cpu":
if controller == CGroupContollers.CPU:
return CpuCgroup(extension_name, cgroup_path)
if controller == "memory":
if controller == CGroupContollers.MEMORY:
return MemoryCgroup(extension_name, cgroup_path)
raise CGroupsException('CGroup controller {0} is not supported'.format(controller))

Expand Down Expand Up @@ -107,14 +112,33 @@ def is_active(self):
logger.periodic_warn(logger.EVERY_HALF_HOUR,
'Could not get list of tasks from "tasks" file in the cgroup: {0}.'
' Internal error: {1}'.format(self.path, ustr(e)))
return False

return False

def get_tracked_processes(self):
    """
    Read the process ids listed in the "cgroup.procs" file of this cgroup.

    Failures never propagate to the caller: a missing file (ENOENT) is ignored silently, any other
    IOError/OSError or CGroupsException is reported through a periodic warning.

    :return: List of Str (Pids). Will return an empty list if we couldn't fetch any tracked processes.
    """
    procs = []
    try:
        procs = self._get_parameters("cgroup.procs")
    except (IOError, OSError) as e:
        # only suppressing file not found exceptions.
        if e.errno != errno.ENOENT:
            logger.periodic_warn(logger.EVERY_HALF_HOUR,
                                 'Could not get list of procs from "cgroup.procs" file in the cgroup: {0}.'
                                 ' Internal error: {1}'.format(self.path, ustr(e)))
    except CGroupsException as e:
        # The file being read is "cgroup.procs", so report "procs" (not "tasks") in the warning.
        logger.periodic_warn(logger.EVERY_HALF_HOUR,
                             'Could not get list of procs from "cgroup.procs" file in the cgroup: {0}.'
                             ' Internal error: {1}'.format(self.path, ustr(e)))
    return procs


class CpuCgroup(CGroup):
def __init__(self, name, cgroup_path):
super(CpuCgroup, self).__init__(name, cgroup_path, "cpu")
super(CpuCgroup, self).__init__(name, cgroup_path, CGroupContollers.CPU)

self._osutil = get_osutil()
self._previous_cgroup_cpu = None
Expand Down Expand Up @@ -195,7 +219,7 @@ def __init__(self, name, cgroup_path):
:return: MemoryCgroup
"""
super(MemoryCgroup, self).__init__(name, cgroup_path, "memory")
super(MemoryCgroup, self).__init__(name, cgroup_path, CGroupContollers.MEMORY)

def __str__(self):
return "cgroup: Name: {0}, cgroup_path: {1}; Controller: {2}".format(
Expand All @@ -210,18 +234,13 @@ def get_memory_usage(self):
:rtype: int
"""
usage = None

try:
usage = self._get_parameters('memory.usage_in_bytes', first_line_only=True)
except (IOError, OSError) as e:
if e.errno == errno.ENOENT:
# only suppressing file not found exceptions.
pass
else:
raise e
except Exception as e:
if isinstance(e, (IOError, OSError)) and e.errno == errno.ENOENT:
raise
raise CGroupsException("Exception while attempting to read {0}".format("memory.usage_in_bytes"), e)

if not usage:
usage = "0"
return int(usage)

def get_max_memory_usage(self):
Expand All @@ -234,12 +253,9 @@ def get_max_memory_usage(self):
usage = None
try:
usage = self._get_parameters('memory.max_usage_in_bytes', first_line_only=True)
except (IOError, OSError) as e:
if e.errno == errno.ENOENT:
# only suppressing file not found exceptions.
pass
else:
raise e
if not usage:
usage = "0"
except Exception as e:
if isinstance(e, (IOError, OSError)) and e.errno == errno.ENOENT:
raise
raise CGroupsException("Exception while attempting to read {0}".format("memory.usage_in_bytes"), e)

return int(usage)
117 changes: 99 additions & 18 deletions azurelinuxagent/common/cgroupstelemetry.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,12 +19,30 @@
from datetime import datetime as dt

from azurelinuxagent.common import logger
from azurelinuxagent.common.cgroup import CpuCgroup
from azurelinuxagent.common.cgroup import CpuCgroup, CGroupContollers
from azurelinuxagent.common.exception import CGroupsException
from azurelinuxagent.common.future import ustr

from azurelinuxagent.common.logger import EVERY_SIX_HOURS
from azurelinuxagent.common.resourceusage import MemoryResourceUsage, ProcessInfo

MetricValue = namedtuple('Metric', ['category', 'counter', 'instance', 'value'])
StatmMetricValue = namedtuple('StatmMetricValue', ['pid_name_cmdline', 'resource_metric'])

DELIM = " | "
DEFAULT_PROCESS_NAME = "NO_PROCESS_FOUND"
DEFAULT_PROCESS_COMMANDLINE = "NO_CMDLINE_FOUND"


class MetricsCategory(object):
    """
    Values used for the 'category' field of the MetricValue tuples reported by CGroupsTelemetry.
    """
    # Category of the total, max and per-process memory counters.
    MEMORY_CATEGORY = "Memory"
    # Category of the processor time counter.
    PROCESS_CATEGORY = "Process"


class MetricsCounter(object):
    """
    Values used for the 'counter' field of the MetricValue tuples reported by CGroupsTelemetry.
    """
    # CPU usage of a cgroup.
    PROCESSOR_PERCENT_TIME = "% Processor Time"
    # Current memory usage of a cgroup.
    TOTAL_MEM_USAGE = "Total Memory Usage"
    # Maximum memory usage of a cgroup.
    MAX_MEM_USAGE = "Max Memory Usage"
    # Memory usage of a single tracked process, collected from proc statm.
    MEM_USED_BY_PROCESS = "Memory Used by Process"


class CGroupsTelemetry(object):
Expand All @@ -34,16 +52,37 @@ class CGroupsTelemetry(object):
_cgroup_metrics = {}
_rlock = threading.RLock()

@staticmethod
def get_process_info_summary(process_id):
    """
    Build the label identifying a process in the per-process memory telemetry.

    The label is the process id, the process name and the process command line joined with DELIM.
    DEFAULT_PROCESS_NAME / DEFAULT_PROCESS_COMMANDLINE are used for any part that could not be fetched.

    :param process_id: Str (Pid), as returned by CGroup.get_tracked_processes
    :return: Str of the form "<pid> | <process name> | <process command line>"
    """
    process_cmdline = DEFAULT_PROCESS_COMMANDLINE
    process_name = DEFAULT_PROCESS_NAME

    # The ProcessName and ProcessCommandLine can generate Exception if the file /proc/<pid>/{comm,cmdline} cease to
    # exist; eg: the process can die, or finish. Which is why we need Default Names, in case we fail to fetch the
    # details from those files.
    try:
        # Test the fetched value itself: the previous "<value> if not None else <default>" was always true, so a
        # None result replaced the default and broke the string concatenation below.
        fetched_cmdline = ProcessInfo.get_proc_cmdline(process_id)
        if fetched_cmdline is not None:
            process_cmdline = fetched_cmdline
    except Exception as e:
        logger.periodic_info(EVERY_SIX_HOURS, "[PERIODIC] {0}", ustr(e))

    try:
        fetched_name = ProcessInfo.get_proc_name(process_id)
        if fetched_name is not None:
            process_name = fetched_name
    except Exception as e:
        logger.periodic_info(EVERY_SIX_HOURS, "[PERIODIC] {0}", ustr(e))

    return process_id + DELIM + process_name + DELIM + process_cmdline

@staticmethod
def _get_metrics_list(metric):
return [metric.average(), metric.min(), metric.max(), metric.median(), metric.count(),
metric.first_poll_time(), metric.last_poll_time()]

@staticmethod
def _process_cgroup_metric(cgroup_metrics):
memory_usage = cgroup_metrics.get_memory_usage()
max_memory_usage = cgroup_metrics.get_max_memory_usage()
cpu_usage = cgroup_metrics.get_cpu_usage()
memory_usage = cgroup_metrics.get_memory_metrics()
max_memory_usage = cgroup_metrics.get_max_memory_metrics()
cpu_usage = cgroup_metrics.get_cpu_metrics()
memory_usage_per_process = cgroup_metrics.get_proc_statm_memory_metrics()

processed_extension = {}

Expand All @@ -62,6 +101,14 @@ def _process_cgroup_metric(cgroup_metrics):
else:
processed_extension["memory"] = {"max_mem": CGroupsTelemetry._get_metrics_list(max_memory_usage)}

for pid_process_memory in memory_usage_per_process:
if "proc_statm_memory" in processed_extension:
processed_extension["proc_statm_memory"][pid_process_memory.pid_name_cmdline] = \
CGroupsTelemetry._get_metrics_list(pid_process_memory.resource_metric)
else:
processed_extension["proc_statm_memory"] = {pid_process_memory.pid_name_cmdline:
CGroupsTelemetry._get_metrics_list(pid_process_memory.resource_metric)}

return processed_extension

@staticmethod
Expand Down Expand Up @@ -135,25 +182,41 @@ def poll_all_tracked():

with CGroupsTelemetry._rlock:
for cgroup in CGroupsTelemetry._tracked[:]:
# noinspection PyBroadException
if cgroup.name not in CGroupsTelemetry._cgroup_metrics:
CGroupsTelemetry._cgroup_metrics[cgroup.name] = CgroupMetrics()
try:
if cgroup.controller == "cpu":
if cgroup.controller == CGroupContollers.CPU:
current_cpu_usage = cgroup.get_cpu_usage()
CGroupsTelemetry._cgroup_metrics[cgroup.name].add_cpu_usage(current_cpu_usage)
metrics.append(MetricValue("Process", "% Processor Time", cgroup.name, current_cpu_usage))
elif cgroup.controller == "memory":
metrics.append(MetricValue(MetricsCategory.PROCESS_CATEGORY, MetricsCounter.
PROCESSOR_PERCENT_TIME, cgroup.name, current_cpu_usage))
elif cgroup.controller == CGroupContollers.MEMORY:
current_memory_usage = cgroup.get_memory_usage()
CGroupsTelemetry._cgroup_metrics[cgroup.name].add_memory_usage(current_memory_usage)
metrics.append(MetricValue("Memory", "Total Memory Usage", cgroup.name, current_memory_usage))
metrics.append(MetricValue(MetricsCategory.MEMORY_CATEGORY, MetricsCounter.
TOTAL_MEM_USAGE, cgroup.name, current_memory_usage))

max_memory_usage = cgroup.get_max_memory_usage()
CGroupsTelemetry._cgroup_metrics[cgroup.name].add_max_memory_usage(max_memory_usage)
metrics.append(MetricValue("Memory", "Max Memory Usage", cgroup.name, max_memory_usage))
metrics.append(MetricValue(MetricsCategory.MEMORY_CATEGORY, MetricsCounter.MAX_MEM_USAGE,
cgroup.name, max_memory_usage))

pids = cgroup.get_tracked_processes()
for pid in pids:
try:
mem_usage_from_procstatm = MemoryResourceUsage.get_memory_usage_from_proc_statm(pid)
metrics.append(MetricValue(MetricsCategory.MEMORY_CATEGORY, MetricsCounter.
MEM_USED_BY_PROCESS, CGroupsTelemetry.get_process_info_summary(pid),
mem_usage_from_procstatm))
CGroupsTelemetry._cgroup_metrics[cgroup.name].add_proc_statm_memory(
CGroupsTelemetry.get_process_info_summary(pid), mem_usage_from_procstatm)
except Exception as e:
if not isinstance(e, (IOError, OSError)) or e.errno != errno.ENOENT:
logger.periodic_warn(logger.EVERY_HOUR, "[PERIODIC] Could not collect proc_statm "
"for pid {0}. Error : {1}", pid, ustr(e))
else:
raise CGroupsException('CGroup controller {0} is not supported for cgroup {1}'.format(
cgroup.controller, cgroup.name))
cgroup.controller, cgroup.name))
except Exception as e:
# There can be scenarios when the CGroup has been deleted by the time we are fetching the values
# from it. This would raise IOError with file entry not found (ERRNO: 2). We do not want to log
Expand Down Expand Up @@ -187,30 +250,48 @@ def __init__(self):
self._memory_usage = Metric()
self._max_memory_usage = Metric()
self._cpu_usage = Metric()
self._proc_statm_mem = {}

self.marked_for_delete = False

def add_memory_usage(self, usage):
self._memory_usage.append(usage)
if not self.marked_for_delete:
self._memory_usage.append(usage)

def add_max_memory_usage(self, usage):
self._max_memory_usage.append(usage)
if not self.marked_for_delete:
self._max_memory_usage.append(usage)

def add_cpu_usage(self, usage):
self._cpu_usage.append(usage)
if not self.marked_for_delete:
self._cpu_usage.append(usage)

def get_memory_usage(self):
def add_proc_statm_memory(self, pid, usage):
    """
    Record a memory usage sample for a process, creating the Metric for that process on first use.
    Nothing is recorded once the metrics have been marked for delete.

    :param pid: key identifying the process
    :param usage: memory usage sample to append
    """
    if self.marked_for_delete:
        return

    per_process = self._proc_statm_mem
    if pid not in per_process:
        per_process[pid] = Metric()
    per_process[pid].append(usage)

def get_memory_metrics(self):
return self._memory_usage

def get_max_memory_usage(self):
def get_max_memory_metrics(self):
return self._max_memory_usage

def get_cpu_usage(self):
def get_cpu_metrics(self):
return self._cpu_usage

def get_proc_statm_memory_metrics(self):
    """
    :return: List of StatmMetricValue tuples, one per entry recorded through add_proc_statm_memory,
             pairing the process key with its metric
    """
    per_process_metrics = []
    for process_key, process_metric in self._proc_statm_mem.items():
        per_process_metrics.append(StatmMetricValue(process_key, process_metric))
    return per_process_metrics

def clear(self):
    """
    Discard everything collected so far: the memory, max memory and cpu metrics are cleared, and the
    per-process memory metrics are dropped.
    """
    collected = (self._memory_usage, self._max_memory_usage, self._cpu_usage, self._proc_statm_mem)
    for container in collected:
        container.clear()


class Metric(object):
Expand Down
7 changes: 6 additions & 1 deletion azurelinuxagent/common/exception.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,7 @@
#
# Requires Python 2.6+ and Openssl 1.0+
#

"""
Defines all exceptions
"""
Expand Down Expand Up @@ -44,14 +45,18 @@ def __init__(self, msg=None, inner=None):

class AgentNetworkError(AgentError):
"""
When network is not available\.
When network is not available.
"""

def __init__(self, msg=None, inner=None):
super(AgentNetworkError, self).__init__(msg, inner)


class CGroupsException(AgentError):
    """
    Raised for any problem related to cgroups.
    """

    def __init__(self, msg=None, inner=None):
        # Nothing specific to cgroups is stored; both arguments are forwarded unchanged to AgentError.
        super(CGroupsException, self).__init__(msg, inner)

Expand Down
Loading

0 comments on commit 8194bbc

Please sign in to comment.