Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Minor refactoring of cgroup telemetry #1607

Merged
merged 2 commits into from
Aug 8, 2019
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
34 changes: 6 additions & 28 deletions azurelinuxagent/common/cgroup.py
Original file line number Diff line number Diff line change
Expand Up @@ -95,9 +95,6 @@ def _get_parameters(self, parameter_name, first_line_only=False):
raise CGroupsException("Exception while attempting to read {0}".format(parameter_filename), e)
return result

def collect(self):
raise NotImplementedError()

def is_active(self):
try:
tasks = self._get_parameters("tasks")
Expand Down Expand Up @@ -184,15 +181,14 @@ def _get_cpu_percent(self):

return round(float(cpu_delta * self._osutil.get_processor_cores() * 100) / float(system_delta), 3)

def collect(self):
def get_cpu_usage(self):
Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The implementation of collect returned an array of different length with different items for each of the controllers. This makes it difficult to use and error prone. I removed it and instead added explicit methods to get each metric.

"""
Collect and return a list of all cpu metrics. If no metrics are collected, return an empty list.
Collects and return the cpu usage.

:rtype: [(str, str, float)]
:rtype: float
"""
self._update_cpu_data()
usage = self._get_cpu_percent()
return [CollectedMetrics("cpu", "% Processor Time", usage)]
return self._get_cpu_percent()


class MemoryCgroup(CGroup):
Expand All @@ -209,7 +205,7 @@ def __str__(self):
self.name, self.path, self.controller
)

def _get_memory_usage(self):
def get_memory_usage(self):
Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

made it public

"""
Collect memory.usage_in_bytes from the cgroup.

Expand All @@ -231,7 +227,7 @@ def _get_memory_usage(self):
usage = "0"
return int(usage)

def _get_memory_max_usage(self):
def get_max_memory_usage(self):
Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

made it public + rename

"""
Collect memory.usage_in_bytes from the cgroup.

Expand All @@ -250,21 +246,3 @@ def _get_memory_max_usage(self):
if not usage:
usage = "0"
return int(usage)

def collect(self):
"""
Collect and return a list of all memory metrics

:rtype: [(str, str, float)]
"""
usage = self._get_memory_usage()
max_usage = self._get_memory_max_usage()
return [CollectedMetrics("memory", "Total Memory Usage", usage),
CollectedMetrics("memory", "Max Memory Usage", max_usage)]


class CollectedMetrics(object):
Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

this was not needed

def __init__(self, controller, metric_name, value):
self.controller = controller
self.metric_name = metric_name
self.value = value
87 changes: 40 additions & 47 deletions azurelinuxagent/common/cgroupstelemetry.py
Original file line number Diff line number Diff line change
Expand Up @@ -36,24 +36,26 @@ def _get_metrics_list(metric):

@staticmethod
def _process_cgroup_metric(cgroup_metrics):
current_memory_usage, max_memory_levels, current_cpu_usage = cgroup_metrics.get_metrics()
memory_usage = cgroup_metrics.get_memory_usage()
max_memory_usage = cgroup_metrics.get_max_memory_usage()
cpu_usage = cgroup_metrics.get_cpu_usage()

processed_extension = {}

if current_cpu_usage.count() > 0:
processed_extension["cpu"] = {"cur_cpu": CGroupsTelemetry._get_metrics_list(current_cpu_usage)}
if cpu_usage.count() > 0:
processed_extension["cpu"] = {"cur_cpu": CGroupsTelemetry._get_metrics_list(cpu_usage)}

if current_memory_usage.count() > 0:
if memory_usage.count() > 0:
if "memory" in processed_extension:
processed_extension["memory"]["cur_mem"] = CGroupsTelemetry._get_metrics_list(current_memory_usage)
processed_extension["memory"]["cur_mem"] = CGroupsTelemetry._get_metrics_list(memory_usage)
else:
processed_extension["memory"] = {"cur_mem": CGroupsTelemetry._get_metrics_list(current_memory_usage)}
processed_extension["memory"] = {"cur_mem": CGroupsTelemetry._get_metrics_list(memory_usage)}

if max_memory_levels.count() > 0:
if max_memory_usage.count() > 0:
if "memory" in processed_extension:
processed_extension["memory"]["max_mem"] = CGroupsTelemetry._get_metrics_list(max_memory_levels)
processed_extension["memory"]["max_mem"] = CGroupsTelemetry._get_metrics_list(max_memory_usage)
else:
processed_extension["memory"] = {"max_mem": CGroupsTelemetry._get_metrics_list(max_memory_levels)}
processed_extension["memory"] = {"max_mem": CGroupsTelemetry._get_metrics_list(max_memory_usage)}

return processed_extension

Expand Down Expand Up @@ -118,19 +120,7 @@ def poll_all_tracked():
if cgroup.name not in CGroupsTelemetry._cgroup_metrics:
CGroupsTelemetry._cgroup_metrics[cgroup.name] = CgroupMetrics()

metric = None
Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

moved the exception handling to collect_data

# noinspection PyBroadException
try:
metric = cgroup.collect()
except Exception as e:
if isinstance(e, (IOError, OSError)) and e.errno == errno.ENOENT:
pass
else:
logger.periodic_warn(logger.EVERY_HALF_HOUR,
'Could not collect the cgroup metrics for cgroup path {0}. '
'Internal error : {1}'.format(cgroup.path, ustr(e)))
if metric:
CGroupsTelemetry._cgroup_metrics[cgroup.name].add_new_data(cgroup.controller, metric)
CGroupsTelemetry._cgroup_metrics[cgroup.name].collect_data(cgroup)

if not cgroup.is_active():
CGroupsTelemetry.stop_tracking(cgroup)
Expand All @@ -152,34 +142,38 @@ def cleanup():

class CgroupMetrics(object):
def __init__(self):
self._current_memory_usage = Metric()
self._max_memory_levels = Metric()
self._current_cpu_usage = Metric()
self._memory_usage = Metric()
Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

renamed those members to have consistent names with the rest of the code

self._max_memory_usage = Metric()
self._cpu_usage = Metric()
self.marked_for_delete = False

def _add_memory_usage(self, metric):
self._current_memory_usage.append(metric[0].value)
self._max_memory_levels.append(metric[1].value)

def _add_cpu_usage(self, metric):
self._current_cpu_usage.append(metric[0].value)

def add_new_data(self, controller, metric):
if metric:
if controller == "cpu":
self._add_cpu_usage(metric)
elif controller == "memory":
self._add_memory_usage(metric)
def collect_data(self, cgroup):
# noinspection PyBroadException
try:
if cgroup.controller == "cpu":
self._cpu_usage.append(cgroup.get_cpu_usage())
elif cgroup.controller == "memory":
self._memory_usage.append(cgroup.get_memory_usage())
self._max_memory_usage.append(cgroup.get_max_memory_usage())
else:
raise CGroupsException('CGroup controller {0} is not supported'.format(controller))
except Exception as e:
if not isinstance(e, (IOError, OSError)) or e.errno != errno.ENOENT:
logger.periodic_warn(logger.EVERY_HALF_HOUR, 'Could not collect metrics for cgroup {0}. Error : {1}'.format(cgroup.path, ustr(e)))

def get_memory_usage(self):
return self._memory_usage

def get_max_memory_usage(self):
return self._max_memory_usage

def get_metrics(self):
return self._current_memory_usage, self._max_memory_levels, self._current_cpu_usage
def get_cpu_usage(self):
Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

get_metrics was returning a list of values, replaced it with explicit methods for each value

return self._cpu_usage

def clear(self):
self._current_memory_usage.clear()
self._max_memory_levels.clear()
self._current_cpu_usage.clear()
self._memory_usage.clear()
Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

renamed for consistency

self._max_memory_usage.clear()
self._cpu_usage.clear()


class Metric(object):
Expand All @@ -188,12 +182,12 @@ def __init__(self):
self._first_poll_time = None
self._last_poll_time = None

def append(self, metric):
def append(self, data):
Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

'metric' was confusing since we are not adding a Metric, but a number

if not self._first_poll_time:
# We only want to do it first time.
self._first_poll_time = dt.utcnow()

self._data.append(metric)
self._data.append(data)
self._last_poll_time = dt.utcnow()

def clear(self):
Expand All @@ -202,8 +196,7 @@ def clear(self):
self._data *= 0

def average(self):
return float(sum(self._data)
) / float(len(self._data)) if self._data else None
return float(sum(self._data)) / float(len(self._data)) if self._data else None

def max(self):
return max(self._data) if self._data else None
Expand Down
42 changes: 13 additions & 29 deletions tests/common/test_cgroups.py
Original file line number Diff line number Diff line change
Expand Up @@ -113,19 +113,17 @@ def test_cpu_cgroup_create(self, patch_get_proc_stat):

@patch("azurelinuxagent.common.osutil.default.DefaultOSUtil.get_processor_cores", return_value=1)
@patch("azurelinuxagent.common.osutil.default.DefaultOSUtil._get_proc_stat")
def test_collect(self, patch_get_proc_stat, *args):
def test_get_cpu_usage(self, patch_get_proc_stat, *args):
patch_get_proc_stat.return_value = fileutil.read_file(os.path.join(data_dir, "cgroups", "dummy_proc_stat"))
test_cpu_cg = CpuCgroup("test_extension", os.path.join(data_dir, "cgroups", "cpu_mount"))

# Mocking CPU consumption
patch_get_proc_stat.return_value = fileutil.read_file(os.path.join(data_dir, "cgroups",
"dummy_proc_stat_updated"))

collected_metric = test_cpu_cg.collect()[0]
cpu_usage = test_cpu_cg.get_cpu_usage()

self.assertEqual("cpu", collected_metric.controller)
self.assertEqual("% Processor Time", collected_metric.metric_name)
self.assertEqual(5.114, collected_metric.value)
self.assertEqual(5.114, cpu_usage)

def test_get_current_cpu_total_exception_handling(self):
test_cpu_cg = CpuCgroup("test_extension", "dummy_path")
Expand All @@ -141,34 +139,20 @@ def test_memory_cgroup_create(self):
test_mem_cg = MemoryCgroup("test_extension", os.path.join(data_dir, "cgroups", "memory_mount"))
self.assertEqual("memory", test_mem_cg.controller)

def test_collect(self):
def test_get_metrics(self):
test_mem_cg = MemoryCgroup("test_extension", os.path.join(data_dir, "cgroups", "memory_mount"))
metrics = test_mem_cg.collect()

current_mem_collected_metric = metrics[0]
memory_usage = test_mem_cg.get_memory_usage()
self.assertEqual(100000, memory_usage)

self.assertEqual("memory", current_mem_collected_metric.controller)
self.assertEqual("Total Memory Usage", current_mem_collected_metric.metric_name)
self.assertEqual(100000, current_mem_collected_metric.value)
max_memory_usage = test_mem_cg.get_max_memory_usage()
self.assertEqual(1000000, max_memory_usage)

max_mem_collected_metric = metrics[1]

self.assertEqual("memory", max_mem_collected_metric.controller)
self.assertEqual("Max Memory Usage", max_mem_collected_metric.metric_name)
self.assertEqual(1000000, max_mem_collected_metric.value)

def test_collect_when_files_not_present(self):
def test_get_metrics_when_files_not_present(self):
test_mem_cg = MemoryCgroup("test_extension", os.path.join(data_dir, "cgroups"))
metrics = test_mem_cg.collect()

current_mem_collected_metric = metrics[0]

self.assertEqual("memory", current_mem_collected_metric.controller)
self.assertEqual("Total Memory Usage", current_mem_collected_metric.metric_name)
self.assertEqual(0, current_mem_collected_metric.value)

max_mem_collected_metric = metrics[1]
memory_usage = test_mem_cg.get_memory_usage()
self.assertEqual(0, memory_usage)

self.assertEqual("memory", max_mem_collected_metric.controller)
self.assertEqual("Max Memory Usage", max_mem_collected_metric.metric_name)
self.assertEqual(0, max_mem_collected_metric.value)
max_memory_usage = test_mem_cg.get_max_memory_usage()
self.assertEqual(0, max_memory_usage)
Loading