Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Start sending additional memory information per process #1729

Merged
merged 22 commits into from
Dec 16, 2019
Merged
Show file tree
Hide file tree
Changes from 17 commits
Commits
Show all changes
22 commits
Select commit Hold shift + click to select a range
68e9558
Adding ResourceUsage Class and Adding ProcessIds in Telemetry
vrdmr Nov 5, 2019
ecabd64
Merge branch 'develop' into vameru/send-addl-memory-information
vrdmr Nov 15, 2019
dd0d010
Adding process tests to fetch the processes in cgroup
vrdmr Nov 15, 2019
80c37fb
Sending Resource Metrics for Memory Usage telem.
vrdmr Nov 15, 2019
63cf6d3
Sending Memory Data for Tracked Processes
vrdmr Nov 16, 2019
b98de77
Fixing the Unittest for Reset Loggers - Test with new Logger.
vrdmr Nov 16, 2019
eb87fff
Fix proc_statm collection and dictionary formatting
vrdmr Nov 19, 2019
2c66e0e
Updating the process_name pattern when sending memory telemetry
vrdmr Dec 3, 2019
f2bfc03
Fixing the extension metrics data model
vrdmr Dec 4, 2019
e3ec514
Fixing and simplifying the tests
vrdmr Dec 6, 2019
c22de7f
Merge branch 'develop' into vameru/send-addl-memory-information
vrdmr Dec 6, 2019
06f5413
Fixing the Non-ASCII character '\xe2' in the comments.
vrdmr Dec 6, 2019
070d9d0
Changing the process statm structure.
vrdmr Dec 10, 2019
4f8d449
Handling inactive cgroups; not send empty values in ExtensionMetricsData
vrdmr Dec 12, 2019
dc7b2fc
Merge branch 'develop' into vameru/send-addl-memory-information
vrdmr Dec 12, 2019
ee67c0f
Fixing the missed test - Changing the memory exceptions.
vrdmr Dec 12, 2019
65a52a3
Review Comments addressed, and clearer exception handling.
vrdmr Dec 13, 2019
8a5329f
Making IOError excplicit for get_memory_usage_proc_statm & test fixes
vrdmr Dec 13, 2019
ccc2366
Addressing review comments.
vrdmr Dec 13, 2019
2386fb4
Initializing a new logger for each test here to not conflict with others
vrdmr Dec 13, 2019
3254191
Review comments addressed and some refactoring.
vrdmr Dec 14, 2019
5e35ec8
nit; missed it earlier.
vrdmr Dec 14, 2019
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
48 changes: 32 additions & 16 deletions azurelinuxagent/common/cgroup.py
Original file line number Diff line number Diff line change
Expand Up @@ -111,6 +111,29 @@ def is_active(self):

return False

def get_tracked_processes(self):
    """
    Read the process ids listed in this cgroup's "cgroup.procs" file.

    Failures are never propagated to the caller: a missing file (ENOENT) is ignored silently,
    while any other IOError/OSError or a CGroupsException is reported through a periodic warning.

    :return: the non-empty value produced by self._get_parameters("cgroup.procs")
             (expected to be a List of Str, one entry per pid); None when the value is
             empty or the file could not be read.
    """
    warning = 'Could not get list of procs from "cgroup.procs" file in the cgroup: {0}.' \
              ' Internal error: {1}'
    try:
        procs = self._get_parameters("cgroup.procs")
        if procs:
            return procs
    except (IOError, OSError) as e:
        # only suppressing file not found exceptions.
        if e.errno != errno.ENOENT:
            logger.periodic_warn(logger.EVERY_HALF_HOUR, warning.format(self.path, ustr(e)))
    except CGroupsException as e:
        logger.periodic_warn(logger.EVERY_HALF_HOUR, warning.format(self.path, ustr(e)))
    return None


class CpuCgroup(CGroup):
def __init__(self, name, cgroup_path):
Expand Down Expand Up @@ -213,15 +236,11 @@ def get_memory_usage(self):

try:
usage = self._get_parameters('memory.usage_in_bytes', first_line_only=True)
except (IOError, OSError) as e:
if e.errno == errno.ENOENT:
# only suppressing file not found exceptions.
pass
else:
raise e
except Exception as e:
if isinstance(e, (IOError, OSError)) and e.errno == errno.ENOENT:
raise
raise CGroupsException("Exception while attempting to read {0}".format("memory.usage_in_bytes"), e)
vrdmr marked this conversation as resolved.
Show resolved Hide resolved

if not usage:
usage = "0"
vrdmr marked this conversation as resolved.
Show resolved Hide resolved
return int(usage)

def get_max_memory_usage(self):
Expand All @@ -234,12 +253,9 @@ def get_max_memory_usage(self):
usage = None
try:
usage = self._get_parameters('memory.max_usage_in_bytes', first_line_only=True)
except (IOError, OSError) as e:
if e.errno == errno.ENOENT:
# only suppressing file not found exceptions.
pass
else:
raise e
if not usage:
usage = "0"
vrdmr marked this conversation as resolved.
Show resolved Hide resolved
except Exception as e:
if isinstance(e, (IOError, OSError)) and e.errno == errno.ENOENT:
raise
raise CGroupsException("Exception while attempting to read {0}".format("memory.usage_in_bytes"), e)

return int(usage)
81 changes: 69 additions & 12 deletions azurelinuxagent/common/cgroupstelemetry.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,7 @@
#
# Requires Python 2.6+ and Openssl 1.0+
import errno
import os
import threading
from collections import namedtuple
from datetime import datetime as dt
Expand All @@ -22,9 +23,14 @@
from azurelinuxagent.common.cgroup import CpuCgroup
from azurelinuxagent.common.exception import CGroupsException
from azurelinuxagent.common.future import ustr

from azurelinuxagent.common.resourceusage import MemoryResourceUsage, ProcessInfo

# One sampled telemetry value, identified by its category, counter and instance.
# NOTE(review): the namedtuple typename is 'Metric' while the module-level name is MetricValue.
MetricValue = namedtuple('Metric', ['category', 'counter', 'instance', 'value'])
# Pairs a per-process key ("pid") with the metric object accumulated for that process.
StatmMetricValue = namedtuple('StatmMetricValue', ['pid', 'resource_metric'])

# Separator placed between the fields of a process summary string.
DELIM = " | "
# Placeholders intended to stand in for a process name / command line that could not be read.
DEFAULT_PROCESS_NAME = "NO_PROCESS_FOUND"
DEFAULT_PROCESS_COMMANDLINE = "NO_CMDLINE_FOUND"


class CGroupsTelemetry(object):
Expand All @@ -34,16 +40,28 @@ class CGroupsTelemetry(object):
_cgroup_metrics = {}
_rlock = threading.RLock()

@staticmethod
def get_process_info_summary(process_id):
    """
    Build the "<pid> | <name> | <cmdline>" label that identifies a process in telemetry.

    :param process_id: pid as a string (it is concatenated directly into the label).
    :return: process_id, process name and command line joined with DELIM; the name and the
             command line fall back to DEFAULT_PROCESS_NAME / DEFAULT_PROCESS_COMMANDLINE
             when they could not be fetched.
    """
    # The ProcessName and ProcessCommandLine can be None if the file /proc/<pid>/{comm,cmdline} cease to exist;
    # eg: the process can die, or finish. Which is why we need Default Names, in case we fail to fetch the details
    # from those files.
    process_cmdline = ProcessInfo.get_proc_cmdline(process_id)
    if process_cmdline is None:
        process_cmdline = DEFAULT_PROCESS_COMMANDLINE

    process_name = ProcessInfo.get_proc_name(process_id)
    if process_name is None:
        process_name = DEFAULT_PROCESS_NAME

    return process_id + DELIM + process_name + DELIM + process_cmdline

@staticmethod
def _get_metrics_list(metric):
return [metric.average(), metric.min(), metric.max(), metric.median(), metric.count(),
metric.first_poll_time(), metric.last_poll_time()]

@staticmethod
def _process_cgroup_metric(cgroup_metrics):
memory_usage = cgroup_metrics.get_memory_usage()
vrdmr marked this conversation as resolved.
Show resolved Hide resolved
max_memory_usage = cgroup_metrics.get_max_memory_usage()
cpu_usage = cgroup_metrics.get_cpu_usage()
memory_usage = cgroup_metrics.get_memory_metrics()
max_memory_usage = cgroup_metrics.get_max_memory_metrics()
cpu_usage = cgroup_metrics.get_cpu_metrics()
memory_usage_per_process = cgroup_metrics.get_proc_statm_memory_metrics()

processed_extension = {}

Expand All @@ -62,6 +80,14 @@ def _process_cgroup_metric(cgroup_metrics):
else:
processed_extension["memory"] = {"max_mem": CGroupsTelemetry._get_metrics_list(max_memory_usage)}

if memory_usage_per_process:
vrdmr marked this conversation as resolved.
Show resolved Hide resolved
for pid_process_memory in memory_usage_per_process:
if "proc_statm_memory" in processed_extension:
processed_extension["proc_statm_memory"][pid_process_memory.pid] = \
CGroupsTelemetry._get_metrics_list(pid_process_memory.resource_metric)
else:
processed_extension["proc_statm_memory"] = {pid_process_memory.pid:
CGroupsTelemetry._get_metrics_list(pid_process_memory.resource_metric)}
return processed_extension

@staticmethod
Expand Down Expand Up @@ -135,7 +161,6 @@ def poll_all_tracked():

with CGroupsTelemetry._rlock:
for cgroup in CGroupsTelemetry._tracked[:]:
# noinspection PyBroadException
if cgroup.name not in CGroupsTelemetry._cgroup_metrics:
CGroupsTelemetry._cgroup_metrics[cgroup.name] = CgroupMetrics()
try:
Expand All @@ -151,9 +176,26 @@ def poll_all_tracked():
max_memory_usage = cgroup.get_max_memory_usage()
CGroupsTelemetry._cgroup_metrics[cgroup.name].add_max_memory_usage(max_memory_usage)
metrics.append(MetricValue("Memory", "Max Memory Usage", cgroup.name, max_memory_usage))

pids = cgroup.get_tracked_processes()

if pids:
vrdmr marked this conversation as resolved.
Show resolved Hide resolved
for pid in pids:
try:
mem_usage_from_procstatm = MemoryResourceUsage.get_memory_usage_from_proc_statm(pid)
metrics.append(MetricValue("Memory", "Memory Used by Process",
CGroupsTelemetry.get_process_info_summary(pid),
mem_usage_from_procstatm))
CGroupsTelemetry._cgroup_metrics[cgroup.name].add_proc_statm_memory(
CGroupsTelemetry.get_process_info_summary(pid), mem_usage_from_procstatm)
except Exception as e:
if not isinstance(e, (IOError, OSError)) or e.errno != errno.ENOENT:
logger.periodic_warn(
logger.EVERY_HOUR, "[PERIODIC] Could not collect proc_statm for pid {0}. "
"Error : {1}", pid, ustr(e))
else:
raise CGroupsException('CGroup controller {0} is not supported for cgroup {1}'.format(
cgroup.controller, cgroup.name))
cgroup.controller, cgroup.name))
except Exception as e:
# There can be scenarios when the CGroup has been deleted by the time we are fetching the values
# from it. This would raise IOError with file entry not found (ERRNO: 2). We do not want to log
Expand Down Expand Up @@ -187,30 +229,45 @@ def __init__(self):
self._memory_usage = Metric()
self._max_memory_usage = Metric()
self._cpu_usage = Metric()
self._proc_statm_mem = {}

self.marked_for_delete = False

def add_memory_usage(self, usage):
self._memory_usage.append(usage)
if not self.marked_for_delete:
vrdmr marked this conversation as resolved.
Show resolved Hide resolved
self._memory_usage.append(usage)

def add_max_memory_usage(self, usage):
self._max_memory_usage.append(usage)
if not self.marked_for_delete:
self._max_memory_usage.append(usage)

def add_cpu_usage(self, usage):
self._cpu_usage.append(usage)
if not self.marked_for_delete:
self._cpu_usage.append(usage)

def get_memory_usage(self):
def add_proc_statm_memory(self, pid, usage):
    """
    Record one memory sample for a process, creating its Metric on first use.

    Nothing is recorded once this object has been marked for delete.

    :param pid: key under which the samples of the process are accumulated.
    :param usage: sample appended to the Metric kept for that key.
    """
    if self.marked_for_delete:
        return

    per_process = self._proc_statm_mem
    if pid not in per_process:
        per_process[pid] = Metric()
    per_process[pid].append(usage)

def get_memory_metrics(self):
return self._memory_usage

def get_max_memory_usage(self):
def get_max_memory_metrics(self):
return self._max_memory_usage

def get_cpu_usage(self):
def get_cpu_metrics(self):
return self._cpu_usage

def get_proc_statm_memory_metrics(self):
    """
    :return: list with one StatmMetricValue(pid, metric) per entry currently held in
             self._proc_statm_mem; an empty list when nothing has been recorded.
    """
    collected = []
    for pid, metric in self._proc_statm_mem.items():
        collected.append(StatmMetricValue(pid, metric))
    return collected

def clear(self):
    """Empty every metric store held by this object, in declaration order."""
    for store_name in ("_memory_usage", "_max_memory_usage", "_cpu_usage", "_proc_statm_mem"):
        getattr(self, store_name).clear()


class Metric(object):
Expand Down
7 changes: 6 additions & 1 deletion azurelinuxagent/common/exception.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,7 @@
#
# Requires Python 2.6+ and Openssl 1.0+
#

"""
Defines all exceptions
"""
Expand Down Expand Up @@ -44,14 +45,18 @@ def __init__(self, msg=None, inner=None):

class AgentNetworkError(AgentError):
"""
When network is not available\.
When network is not available.
"""

def __init__(self, msg=None, inner=None):
super(AgentNetworkError, self).__init__(msg, inner)


class CGroupsException(AgentError):
    """
    Exception to classify any cgroups related issue.

    :param msg: forwarded unchanged to AgentError.
    :param inner: forwarded unchanged to AgentError; presumably the underlying exception - TODO confirm.
    """

    def __init__(self, msg=None, inner=None):
        super(CGroupsException, self).__init__(msg, inner)

Expand Down
143 changes: 143 additions & 0 deletions azurelinuxagent/common/resourceusage.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,143 @@
# Copyright 2019 Microsoft Corporation
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
# Requires Python 2.6+ and Openssl 1.0+

import os

from azurelinuxagent.common import logger
from azurelinuxagent.common.exception import AgentError
from azurelinuxagent.common.future import ustr
from azurelinuxagent.common.logger import EVERY_SIX_HOURS
from azurelinuxagent.common.utils import fileutil


# Value of the SC_PAGE_SIZE system configuration, queried once at import time; multiplied with
# page counts read from /proc/<pid>/statm to obtain bytes.
PAGE_SIZE = os.sysconf('SC_PAGE_SIZE')
# Per-process /proc file paths; {0} is replaced with the process id.
PROC_STATM_FILENAME_FORMAT = "/proc/{0}/statm"
PROC_CMDLINE_FILENAME_FORMAT = "/proc/{0}/cmdline"
PROC_COMM_FILENAME_FORMAT = "/proc/{0}/comm"
PROC_STATUS_FILENAME_FORMAT = "/proc/{0}/status"


class ResourceUsage(object):
    """Common base type for the resource usage readers defined in this module; adds no behavior."""


class MemoryResourceUsage(ResourceUsage):
    """Reads per-process memory figures from /proc/<pid>/statm."""

    @staticmethod
    def get_memory_usage_from_proc_statm(process_id):
        """
        Return the resident set size of a process in bytes, as reported by /proc/<pid>/statm.

        :param process_id: pid used to build the /proc/<pid>/statm path.
        :return: resident set size in bytes.
        :raises IOError, OSError: propagated unchanged so that callers can inspect errno
                 (e.g. to ignore ENOENT for a process that has already exited).
        :raises ProcessInfoException: for any other failure, e.g. unparsable file contents.
        """
        try:
            return MemoryResourceUsage._get_proc_rss(process_id)
        except (IOError, OSError):
            # Keep I/O failures intact: wrapping them would hide errno from the caller.
            # NOTE(review): assumes fileutil.read_file surfaces a missing file as IOError/OSError - confirm.
            raise
        except Exception as e:
            logger.periodic_info(EVERY_SIX_HOURS, "[PERIODIC] Could not get the /proc/{0}/statm data due to {1}",
                                 process_id, ustr(e))
            raise ProcessInfoException("Could not get the /proc/{0}/statm due to {1}".format(process_id, ustr(e)), e)

    @staticmethod
    def _get_proc_rss(process_id):
        """
        /proc/<pid>/statm fields: columns are (in pages):

            total program size|
            resident set size|
            shared pages|
            text (code) |
            data/stack |
            library |
            dirty pages |

        Here an example:
            root@vm:/# cat /proc/1392/statm
            17637 5316 2125 938 0 3332 0

        :return: resident set size in bytes.
        """
        pid_statm = fileutil.read_file(PROC_STATM_FILENAME_FORMAT.format(process_id)).split()
        pid_rss = int(pid_statm[1])  # Index 1 is RSS.

        return pid_rss * PAGE_SIZE


class ProcessInfo(object):
    """Best-effort readers for descriptive per-process data kept under /proc/<pid>."""

    @staticmethod
    def get_proc_name(process_id):
        """
        :param process_id: pid of the process to describe.
        :return: contents of /proc/<pid>/comm, or None when they could not be read
                 (the failure is reported through a periodic info log).
        """
        try:
            return ProcessInfo._get_proc_comm(process_id)
        except ProcessInfoException as e:
            logger.periodic_info(EVERY_SIX_HOURS, "[PERIODIC] {0}", ustr(e))
        return None

    @staticmethod
    def get_proc_cmdline(process_id):
        """
        :param process_id: pid of the process to describe.
        :return: contents of /proc/<pid>/cmdline with NUL separators replaced by spaces, or None
                 when they could not be read (the failure is reported through a periodic info log).
        """
        try:
            return ProcessInfo._get_proc_cmdline(process_id)
        except ProcessInfoException as e:
            logger.periodic_info(EVERY_SIX_HOURS, "[PERIODIC] {0}", ustr(e))
        return None

    @classmethod
    def _get_proc_cmdline(cls, process_id):
        r"""
        /proc/<pid>/cmdline holds the command line of the process. Its arguments are delimited with
        the \0 character, which is replaced with a space here to make the string readable.

        Here an example:
            root@vm:/# cat /proc/1392/cmdline
            python--targettest_resourceusage.py
            root@vm:/# cat /proc/1392/cmdline | tr "\0" " "
            python --target test_resourceusage.py

        :return: command line of the process as a single space separated string.
        :raises ProcessInfoException: when the file cannot be read or processed.
        """
        cmdline_path = PROC_CMDLINE_FILENAME_FORMAT.format(process_id)
        try:
            raw_cmdline = fileutil.read_file(cmdline_path)
            return raw_cmdline.replace("\0", " ").strip()
        except Exception as e:
            raise ProcessInfoException("Could not get contents from {0}".format(cmdline_path), e)

    @classmethod
    def _get_proc_comm(cls, process_id):
        """
        /proc/<pid>/comm exposes the process's comm value - that is, the command name associated with the
        process. Strings longer than TASK_COMM_LEN (16) characters are silently truncated.

        Here an example:
            root@vm:/# cat /proc/1392/comm
            python

        :return: process name
        :raises ProcessInfoException: when the file cannot be read or converted to str.
        """
        comm_path = PROC_COMM_FILENAME_FORMAT.format(process_id)
        try:
            return str(fileutil.read_file(comm_path).strip())
        except Exception as e:
            raise ProcessInfoException("Could not get contents from {0}".format(comm_path), e)


class ProcessInfoException(AgentError):
    """
    Raised when information about a process cannot be obtained from its /proc/<pid> files.

    :param msg: forwarded unchanged to AgentError.
    :param inner: forwarded unchanged to AgentError; presumably the underlying exception - TODO confirm.
    """

    def __init__(self, msg=None, inner=None):
        super(ProcessInfoException, self).__init__(msg, inner)
Loading