Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Start sending additional memory information per process #1729

Merged
merged 22 commits into from
Dec 16, 2019
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
22 commits
Select commit Hold shift + click to select a range
68e9558
Adding ResourceUsage Class and Adding ProcessIds in Telemetry
vrdmr Nov 5, 2019
ecabd64
Merge branch 'develop' into vameru/send-addl-memory-information
vrdmr Nov 15, 2019
dd0d010
Adding process tests to fetch the processes in cgroup
vrdmr Nov 15, 2019
80c37fb
Sending Resource Metrics for Memory Usage telem.
vrdmr Nov 15, 2019
63cf6d3
Sending Memory Data for Tracked Processes
vrdmr Nov 16, 2019
b98de77
Fixing the Unittest for Reset Loggers - Test with new Logger.
vrdmr Nov 16, 2019
eb87fff
Fix proc_statm collection and dictionary formatting
vrdmr Nov 19, 2019
2c66e0e
Updating the process_name pattern when sending memory telemetry
vrdmr Dec 3, 2019
f2bfc03
Fixing the extension metrics data model
vrdmr Dec 4, 2019
e3ec514
Fixing and simplifying the tests
vrdmr Dec 6, 2019
c22de7f
Merge branch 'develop' into vameru/send-addl-memory-information
vrdmr Dec 6, 2019
06f5413
Fixing the Non-ASCII character '\xe2' in the comments.
vrdmr Dec 6, 2019
070d9d0
Changing the process statm structure.
vrdmr Dec 10, 2019
4f8d449
Handling inactive cgroups; not send empty values in ExtensionMetricsData
vrdmr Dec 12, 2019
dc7b2fc
Merge branch 'develop' into vameru/send-addl-memory-information
vrdmr Dec 12, 2019
ee67c0f
Fixing the missed test - Changing the memory exceptions.
vrdmr Dec 12, 2019
65a52a3
Review Comments addressed, and clearer exception handling.
vrdmr Dec 13, 2019
8a5329f
Making IOError explicit for get_memory_usage_proc_statm & test fixes
vrdmr Dec 13, 2019
ccc2366
Addressing review comments.
vrdmr Dec 13, 2019
2386fb4
Initializing a new logger for each test here to not conflict with others
vrdmr Dec 13, 2019
3254191
Review comments addressed and some refactoring.
vrdmr Dec 14, 2019
5e35ec8
nit; missed it earlier.
vrdmr Dec 14, 2019
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
62 changes: 39 additions & 23 deletions azurelinuxagent/common/cgroup.py
Original file line number Diff line number Diff line change
Expand Up @@ -26,15 +26,20 @@
re_user_system_times = re.compile(r'user (\d+)\nsystem (\d+)\n')


# Names of the cgroup controllers that CGroup.create() knows how to build.
# NOTE(review): the class name is misspelled ("Contollers" instead of "Controllers"); other modules
# import it under this spelling, so a rename has to update those imports as well.
class CGroupContollers(object):
CPU = "cpu"
MEMORY = "memory"


class CGroup(object):
@staticmethod
def create(cgroup_path, controller, extension_name):
"""
Factory method to create the correct CGroup.
"""
if controller == "cpu":
if controller == CGroupContollers.CPU:
return CpuCgroup(extension_name, cgroup_path)
if controller == "memory":
if controller == CGroupContollers.MEMORY:
return MemoryCgroup(extension_name, cgroup_path)
raise CGroupsException('CGroup controller {0} is not supported'.format(controller))

Expand Down Expand Up @@ -107,14 +112,33 @@ def is_active(self):
logger.periodic_warn(logger.EVERY_HALF_HOUR,
'Could not get list of tasks from "tasks" file in the cgroup: {0}.'
' Internal error: {1}'.format(self.path, ustr(e)))
return False

return False

def get_tracked_processes(self):
    """
    Read the process ids listed in the "cgroup.procs" file of this cgroup.

    Errors are never propagated to the caller: a missing file (ENOENT) is ignored silently, any other
    IOError/OSError or CGroupsException is reported through a rate-limited warning.

    :return: List of Str (Pids). Will return an empty list if we couldn't fetch any tracked processes.
    """
    procs = []
    try:
        procs = self._get_parameters("cgroup.procs")
    except (IOError, OSError) as e:
        # Only file-not-found is suppressed without logging; every other I/O failure is reported.
        if e.errno != errno.ENOENT:
            logger.periodic_warn(logger.EVERY_HALF_HOUR,
                                 'Could not get list of procs from "cgroup.procs" file in the cgroup: {0}.'
                                 ' Internal error: {1}'.format(self.path, ustr(e)))
    except CGroupsException as e:
        # The message names "procs" (not "tasks") so it matches the file that is actually being read.
        logger.periodic_warn(logger.EVERY_HALF_HOUR,
                             'Could not get list of procs from "cgroup.procs" file in the cgroup: {0}.'
                             ' Internal error: {1}'.format(self.path, ustr(e)))
    return procs


class CpuCgroup(CGroup):
def __init__(self, name, cgroup_path):
super(CpuCgroup, self).__init__(name, cgroup_path, "cpu")
super(CpuCgroup, self).__init__(name, cgroup_path, CGroupContollers.CPU)

self._osutil = get_osutil()
self._previous_cgroup_cpu = None
Expand Down Expand Up @@ -195,7 +219,7 @@ def __init__(self, name, cgroup_path):

:return: MemoryCgroup
"""
super(MemoryCgroup, self).__init__(name, cgroup_path, "memory")
super(MemoryCgroup, self).__init__(name, cgroup_path, CGroupContollers.MEMORY)

def __str__(self):
return "cgroup: Name: {0}, cgroup_path: {1}; Controller: {2}".format(
Expand All @@ -210,18 +234,13 @@ def get_memory_usage(self):
:rtype: int
"""
usage = None

try:
usage = self._get_parameters('memory.usage_in_bytes', first_line_only=True)
except (IOError, OSError) as e:
if e.errno == errno.ENOENT:
# only suppressing file not found exceptions.
pass
else:
raise e
except Exception as e:
if isinstance(e, (IOError, OSError)) and e.errno == errno.ENOENT:
raise
raise CGroupsException("Exception while attempting to read {0}".format("memory.usage_in_bytes"), e)
vrdmr marked this conversation as resolved.
Show resolved Hide resolved

if not usage:
usage = "0"
vrdmr marked this conversation as resolved.
Show resolved Hide resolved
return int(usage)

def get_max_memory_usage(self):
Expand All @@ -234,12 +253,9 @@ def get_max_memory_usage(self):
usage = None
try:
usage = self._get_parameters('memory.max_usage_in_bytes', first_line_only=True)
except (IOError, OSError) as e:
if e.errno == errno.ENOENT:
# only suppressing file not found exceptions.
pass
else:
raise e
if not usage:
usage = "0"
vrdmr marked this conversation as resolved.
Show resolved Hide resolved
except Exception as e:
if isinstance(e, (IOError, OSError)) and e.errno == errno.ENOENT:
raise
raise CGroupsException("Exception while attempting to read {0}".format("memory.usage_in_bytes"), e)

return int(usage)
117 changes: 99 additions & 18 deletions azurelinuxagent/common/cgroupstelemetry.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,12 +19,30 @@
from datetime import datetime as dt

from azurelinuxagent.common import logger
from azurelinuxagent.common.cgroup import CpuCgroup
from azurelinuxagent.common.cgroup import CpuCgroup, CGroupContollers
from azurelinuxagent.common.exception import CGroupsException
from azurelinuxagent.common.future import ustr

from azurelinuxagent.common.logger import EVERY_SIX_HOURS
from azurelinuxagent.common.resourceusage import MemoryResourceUsage, ProcessInfo

MetricValue = namedtuple('Metric', ['category', 'counter', 'instance', 'value'])
StatmMetricValue = namedtuple('StatmMetricValue', ['pid_name_cmdline', 'resource_metric'])

DELIM = " | "
DEFAULT_PROCESS_NAME = "NO_PROCESS_FOUND"
DEFAULT_PROCESS_COMMANDLINE = "NO_CMDLINE_FOUND"


# Category strings passed as the first field ("category") of MetricValue.
class MetricsCategory(object):
MEMORY_CATEGORY = "Memory"
PROCESS_CATEGORY = "Process"


# Counter-name strings passed as the second field ("counter") of MetricValue.
class MetricsCounter(object):
PROCESSOR_PERCENT_TIME = "% Processor Time"
TOTAL_MEM_USAGE = "Total Memory Usage"
MAX_MEM_USAGE = "Max Memory Usage"
MEM_USED_BY_PROCESS = "Memory Used by Process"


class CGroupsTelemetry(object):
Expand All @@ -34,16 +52,37 @@ class CGroupsTelemetry(object):
_cgroup_metrics = {}
_rlock = threading.RLock()

@staticmethod
def get_process_info_summary(process_id):
    """
    Build the "<pid> | <name> | <cmdline>" summary string used to identify a process in telemetry.

    :param process_id: pid as a string; it is concatenated directly with the delimiter.
    :return: process_id, process name and process command line joined by DELIM. The name and command
             line fall back to DEFAULT_PROCESS_NAME / DEFAULT_PROCESS_COMMANDLINE when they cannot
             be fetched or when ProcessInfo returns None.
    """
    process_cmdline = DEFAULT_PROCESS_COMMANDLINE
    process_name = DEFAULT_PROCESS_NAME

    # Fetching the name or command line can raise if /proc/<pid>/{comm,cmdline} ceases to exist, e.g. the
    # process died or finished. That is why the defaults above are kept whenever a lookup fails.
    try:
        cmdline = ProcessInfo.get_proc_cmdline(process_id)
        # Explicit None check: a bare "x if not None else d" always picks x because "not None" is True.
        if cmdline is not None:
            process_cmdline = cmdline
    except Exception as e:
        logger.periodic_info(EVERY_SIX_HOURS, "[PERIODIC] {0}", ustr(e))

    try:
        name = ProcessInfo.get_proc_name(process_id)
        if name is not None:
            process_name = name
    except Exception as e:
        logger.periodic_info(EVERY_SIX_HOURS, "[PERIODIC] {0}", ustr(e))

    return process_id + DELIM + process_name + DELIM + process_cmdline
vrdmr marked this conversation as resolved.
Show resolved Hide resolved

@staticmethod
def _get_metrics_list(metric):
"""
Flatten a metric into a list: [average, min, max, median, count, first_poll_time, last_poll_time].

:param metric: object exposing the seven accessor methods called below (presumably a Metric - confirm)
:return: list with the seven values, in the order shown above
"""
return [metric.average(), metric.min(), metric.max(), metric.median(), metric.count(),
metric.first_poll_time(), metric.last_poll_time()]

@staticmethod
def _process_cgroup_metric(cgroup_metrics):
memory_usage = cgroup_metrics.get_memory_usage()
vrdmr marked this conversation as resolved.
Show resolved Hide resolved
max_memory_usage = cgroup_metrics.get_max_memory_usage()
cpu_usage = cgroup_metrics.get_cpu_usage()
memory_usage = cgroup_metrics.get_memory_metrics()
max_memory_usage = cgroup_metrics.get_max_memory_metrics()
cpu_usage = cgroup_metrics.get_cpu_metrics()
memory_usage_per_process = cgroup_metrics.get_proc_statm_memory_metrics()

processed_extension = {}

Expand All @@ -62,6 +101,14 @@ def _process_cgroup_metric(cgroup_metrics):
else:
processed_extension["memory"] = {"max_mem": CGroupsTelemetry._get_metrics_list(max_memory_usage)}

for pid_process_memory in memory_usage_per_process:
if "proc_statm_memory" in processed_extension:
processed_extension["proc_statm_memory"][pid_process_memory.pid_name_cmdline] = \
CGroupsTelemetry._get_metrics_list(pid_process_memory.resource_metric)
else:
processed_extension["proc_statm_memory"] = {pid_process_memory.pid_name_cmdline:
CGroupsTelemetry._get_metrics_list(pid_process_memory.resource_metric)}

return processed_extension

@staticmethod
Expand Down Expand Up @@ -135,25 +182,41 @@ def poll_all_tracked():

with CGroupsTelemetry._rlock:
for cgroup in CGroupsTelemetry._tracked[:]:
# noinspection PyBroadException
if cgroup.name not in CGroupsTelemetry._cgroup_metrics:
CGroupsTelemetry._cgroup_metrics[cgroup.name] = CgroupMetrics()
try:
if cgroup.controller == "cpu":
if cgroup.controller == CGroupContollers.CPU:
current_cpu_usage = cgroup.get_cpu_usage()
CGroupsTelemetry._cgroup_metrics[cgroup.name].add_cpu_usage(current_cpu_usage)
metrics.append(MetricValue("Process", "% Processor Time", cgroup.name, current_cpu_usage))
elif cgroup.controller == "memory":
metrics.append(MetricValue(MetricsCategory.PROCESS_CATEGORY, MetricsCounter.
PROCESSOR_PERCENT_TIME, cgroup.name, current_cpu_usage))
elif cgroup.controller == CGroupContollers.MEMORY:
current_memory_usage = cgroup.get_memory_usage()
CGroupsTelemetry._cgroup_metrics[cgroup.name].add_memory_usage(current_memory_usage)
metrics.append(MetricValue("Memory", "Total Memory Usage", cgroup.name, current_memory_usage))
metrics.append(MetricValue(MetricsCategory.MEMORY_CATEGORY, MetricsCounter.
TOTAL_MEM_USAGE, cgroup.name, current_memory_usage))

max_memory_usage = cgroup.get_max_memory_usage()
CGroupsTelemetry._cgroup_metrics[cgroup.name].add_max_memory_usage(max_memory_usage)
metrics.append(MetricValue("Memory", "Max Memory Usage", cgroup.name, max_memory_usage))
metrics.append(MetricValue(MetricsCategory.MEMORY_CATEGORY, MetricsCounter.MAX_MEM_USAGE,
cgroup.name, max_memory_usage))

pids = cgroup.get_tracked_processes()
for pid in pids:
try:
mem_usage_from_procstatm = MemoryResourceUsage.get_memory_usage_from_proc_statm(pid)
vrdmr marked this conversation as resolved.
Show resolved Hide resolved
metrics.append(MetricValue(MetricsCategory.MEMORY_CATEGORY, MetricsCounter.
MEM_USED_BY_PROCESS, CGroupsTelemetry.get_process_info_summary(pid),
mem_usage_from_procstatm))
CGroupsTelemetry._cgroup_metrics[cgroup.name].add_proc_statm_memory(
CGroupsTelemetry.get_process_info_summary(pid), mem_usage_from_procstatm)
except Exception as e:
if not isinstance(e, (IOError, OSError)) or e.errno != errno.ENOENT:
logger.periodic_warn(logger.EVERY_HOUR, "[PERIODIC] Could not collect proc_statm "
"for pid {0}. Error : {1}", pid, ustr(e))
else:
raise CGroupsException('CGroup controller {0} is not supported for cgroup {1}'.format(
cgroup.controller, cgroup.name))
cgroup.controller, cgroup.name))
except Exception as e:
# There can be scenarios when the CGroup has been deleted by the time we are fetching the values
# from it. This would raise IOError with file entry not found (ERRNO: 2). We do not want to log
Expand Down Expand Up @@ -187,30 +250,48 @@ def __init__(self):
self._memory_usage = Metric()
self._max_memory_usage = Metric()
self._cpu_usage = Metric()
self._proc_statm_mem = {}

self.marked_for_delete = False

def add_memory_usage(self, usage):
self._memory_usage.append(usage)
if not self.marked_for_delete:
vrdmr marked this conversation as resolved.
Show resolved Hide resolved
self._memory_usage.append(usage)

def add_max_memory_usage(self, usage):
self._max_memory_usage.append(usage)
if not self.marked_for_delete:
self._max_memory_usage.append(usage)

def add_cpu_usage(self, usage):
self._cpu_usage.append(usage)
if not self.marked_for_delete:
self._cpu_usage.append(usage)

def get_memory_usage(self):
def add_proc_statm_memory(self, pid, usage):
"""
Record one memory sample for a process, creating the Metric for that key on first use.

Nothing is recorded once marked_for_delete is set.

:param pid: key identifying the process; poll_all_tracked passes the "pid | name | cmdline" summary here
:param usage: sample appended to the Metric stored under that key
"""
if not self.marked_for_delete:
if pid not in self._proc_statm_mem:
self._proc_statm_mem[pid] = Metric()
self._proc_statm_mem[pid].append(usage)

def get_memory_metrics(self):
return self._memory_usage

def get_max_memory_usage(self):
def get_max_memory_metrics(self):
return self._max_memory_usage

def get_cpu_usage(self):
def get_cpu_metrics(self):
return self._cpu_usage

def get_proc_statm_memory_metrics(self):
"""
:return: list of StatmMetricValue(pid_name_cmdline, resource_metric) tuples, one per key stored in
         _proc_statm_mem; empty when nothing has been recorded
"""
return [StatmMetricValue(pid_name_cmdline, metric) for pid_name_cmdline, metric in self._proc_statm_mem.items()]

def clear(self):
"""
Call clear() on the memory, max-memory and cpu metrics and empty the per-process metrics dictionary.
"""
self._memory_usage.clear()
self._max_memory_usage.clear()
self._cpu_usage.clear()
self._proc_statm_mem.clear()


class Metric(object):
Expand Down
7 changes: 6 additions & 1 deletion azurelinuxagent/common/exception.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,7 @@
#
# Requires Python 2.6+ and Openssl 1.0+
#

"""
Defines all exceptions
"""
Expand Down Expand Up @@ -44,14 +45,18 @@ def __init__(self, msg=None, inner=None):

class AgentNetworkError(AgentError):
"""
When network is not available\.
When network is not available.
"""

def __init__(self, msg=None, inner=None):
super(AgentNetworkError, self).__init__(msg, inner)


class CGroupsException(AgentError):
"""
Exception to classify any cgroups related issue.

:param msg: forwarded unchanged to AgentError.__init__
:param inner: forwarded unchanged to AgentError.__init__ (presumably the underlying exception - confirm)
"""

def __init__(self, msg=None, inner=None):
super(CGroupsException, self).__init__(msg, inner)

Expand Down
Loading