Skip to content

Commit

Permalink
Fix the threshold telemetry issue (#1394)
Browse files Browse the repository at this point in the history
* Adding the get_limits call within the collect_all_tracked call
* Added tests for threshold
  • Loading branch information
vrdmr authored Nov 16, 2018
1 parent 52cb1bb commit 575c399
Show file tree
Hide file tree
Showing 3 changed files with 97 additions and 36 deletions.
72 changes: 43 additions & 29 deletions azurelinuxagent/common/cgroups.py
Original file line number Diff line number Diff line change
Expand Up @@ -285,6 +285,7 @@ def collect_all_tracked():
cgroup_name = cgroup_name if cgroup_name else WRAPPER_CGROUP_NAME
results[cgroup_name] = collector.collect()
limits[cgroup_name] = collector.cgroup.threshold

return results, limits

@staticmethod
Expand Down Expand Up @@ -370,12 +371,12 @@ def _construct_systemd_path_for_hierarchy(hierarchy, cgroup_name):
return os.path.join(BASE_CGROUPS, hierarchy, 'system.slice', cgroup_name).rstrip(os.path.sep)

@staticmethod
def for_extension(name):
return CGroups(name, CGroups._construct_custom_path_for_hierarchy)
def for_extension(name, limits=None):
return CGroups(name, CGroups._construct_custom_path_for_hierarchy, limits)

@staticmethod
def for_systemd_service(name):
return CGroups(name.lower(), CGroups._construct_systemd_path_for_hierarchy)
def for_systemd_service(name, limits=None):
return CGroups(name.lower(), CGroups._construct_systemd_path_for_hierarchy, limits)

@staticmethod
def enabled():
Expand All @@ -389,22 +390,23 @@ def disable():
def enable():
CGroups._enabled = True

def __init__(self, name, path_maker):
def __init__(self, name, path_maker, limits=None):
"""
Construct CGroups object. Create appropriately-named directory for each hierarchy of interest.
:param str name: Name for the cgroup (usually the full name of the extension)
:param path_maker: Function which constructs the root path for a given hierarchy where this cgroup lives
"""
if name == "":
if not name or name == "":
self.name = "Agents+Extensions"
self.is_wrapper_cgroup = True
else:
self.name = name
self.is_wrapper_cgroup = False

self.cgroups = {}
self.threshold = None

self.threshold = CGroupsLimits(self.name, limits)

if not self.enabled():
return
Expand Down Expand Up @@ -481,22 +483,6 @@ def add(self, pid):
tasks_file = self._get_cgroup_file(hierarchy, 'cgroup.procs')
fileutil.append_file(tasks_file, "{0}\n".format(pid))

def get_cpu_limits(self):
# default values
cpu_limit = DEFAULT_CPU_LIMIT_AGENT if AGENT_NAME.lower() in self.name.lower() else DEFAULT_CPU_LIMIT_EXT

return cpu_limit

def get_memory_limits(self):
# default values
mem_limit = max(DEFAULT_MEM_LIMIT_MIN_MB, round(self._osutil.get_total_mem() * DEFAULT_MEM_LIMIT_PCT / 100, 0))

# agent values
if AGENT_NAME.lower() in self.name.lower():
mem_limit = min(DEFAULT_MEM_LIMIT_MAX_MB, mem_limit)

return mem_limit

def set_limits(self):
"""
Set per-hierarchy limits based on the cgroup name (agent or particular extension)
Expand All @@ -513,9 +499,8 @@ def set_limits(self):
logger.info('No cgroups limits for {0}'.format(self.name))
return

# default values
cpu_limit = self.get_cpu_limits()
mem_limit = self.get_memory_limits()
cpu_limit = self.threshold.cpu_limit
mem_limit = self.threshold.memory_limit

msg = '{0}: {1}% {2}mb'.format(self.name, cpu_limit, mem_limit)
logger.info("Setting cgroups limits for {0}".format(msg))
Expand All @@ -538,9 +523,6 @@ def set_limits(self):
message=msg,
log_event=False)

# Returning the limits -
self.threshold = {"cpu": cpu_limit, "memory": mem_limit}

@staticmethod
def _apply_wrapper_limits(path, hierarchy):
"""
Expand Down Expand Up @@ -796,3 +778,35 @@ def set_memory_limit(self, limit=None, unit='megabytes'):
fileutil.write_file(memory_limit_file, '{0}\n'.format(value))
else:
raise CGroupsException("Memory hierarchy not available in this cgroup")


class CGroupsLimits(object):
@staticmethod
def _get_value_or_default(name, threshold, limit, compute_default):
return threshold[limit] if threshold and limit in threshold else compute_default(name)

def __init__(self, cgroup_name, threshold=None):
if not cgroup_name or cgroup_name == "":
cgroup_name = "Agents+Extensions"

self.cpu_limit = self._get_value_or_default(cgroup_name, threshold, "cpu", CGroupsLimits.get_default_cpu_limits)
self.memory_limit = self._get_value_or_default(cgroup_name, threshold, "memory",
CGroupsLimits.get_default_memory_limits)

@staticmethod
def get_default_cpu_limits(cgroup_name):
# default values
cpu_limit = DEFAULT_CPU_LIMIT_AGENT if AGENT_NAME.lower() in cgroup_name.lower() else DEFAULT_CPU_LIMIT_EXT
return cpu_limit

@staticmethod
def get_default_memory_limits(cgroup_name):
os_util = get_osutil()

# default values
mem_limit = max(DEFAULT_MEM_LIMIT_MIN_MB, round(os_util.get_total_mem() * DEFAULT_MEM_LIMIT_PCT / 100, 0))

# agent values
if AGENT_NAME.lower() in cgroup_name.lower():
mem_limit = min(DEFAULT_MEM_LIMIT_MAX_MB, mem_limit)
return mem_limit
12 changes: 7 additions & 5 deletions azurelinuxagent/ga/monitor.py
Original file line number Diff line number Diff line change
Expand Up @@ -427,9 +427,11 @@ def send_cgroup_telemetry(self):
report_metric(metric_group, metric_name, cgroup_name, value)

if metric_group == "Memory":
if value >= thresholds["memory"]:
msg = "CGroup {0}: Crossed the Memory Threshold. Current Value:{1}, Threshold:{2}.".format(
cgroup_name, value, thresholds["memory"])
# Memory is collected in bytes, and limit is set in megabytes.
if value >= CGroups._format_memory_value('megabytes', thresholds.memory_limit):
msg = "CGroup {0}: Crossed the Memory Threshold. " \
"Current Value:{1} bytes, Threshold:{2} megabytes.".format(cgroup_name, value,
thresholds.memory_limit)
add_event(name=AGENT_NAME,
version=CURRENT_VERSION,
op=WALAEventOperation.CGroupsLimitsCrossed,
Expand All @@ -438,9 +440,9 @@ def send_cgroup_telemetry(self):
log_event=True)

if metric_group == "Process":
if value >= thresholds["cpu"]:
if value >= thresholds.cpu_limit:
msg = "CGroup {0}: Crossed the Processor Threshold. Current Value:{1}, Threshold:{2}.".format(
cgroup_name, value, thresholds["cpu"])
cgroup_name, value, thresholds.cpu_limit)
add_event(name=AGENT_NAME,
version=CURRENT_VERSION,
op=WALAEventOperation.CGroupsLimitsCrossed,
Expand Down
49 changes: 47 additions & 2 deletions tests/common/test_cgroups.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,8 +17,8 @@

from __future__ import print_function

from azurelinuxagent.common.cgroups import CGroupsTelemetry, CGroups, CGroupsException, BASE_CGROUPS, Cpu, Memory, \
DEFAULT_MEM_LIMIT_MIN_MB
from azurelinuxagent.common.cgroups import CGroupsTelemetry, CGroups, CGroupsLimits, \
CGroupsException, CGroupsLimits, BASE_CGROUPS, Cpu, Memory, DEFAULT_MEM_LIMIT_MIN_MB
from azurelinuxagent.common.version import AGENT_NAME
from tests.tools import *

Expand Down Expand Up @@ -144,6 +144,11 @@ def exercise_telemetry_instantiation(self, test_cgroup):
else:
self.fail("Unknown metric {0}/{1} value {2}".format(metric_family, metric_name, metric_value))

my_limits = limits[test_extension_name]
self.assertIsInstance(my_limits, CGroupsLimits, msg="is not the correct instance")
self.assertGreater(my_limits.cpu_limit, 0.0)
self.assertGreater(my_limits.memory_limit, 0.0)

@skip_if_predicate_false(i_am_root, "Test does not run when non-root")
def test_telemetry_instantiation_as_superuser(self):
"""
Expand Down Expand Up @@ -286,3 +291,43 @@ def test_limits(self):
self.assert_limits(ext_name="normal_extension", expected_cpu_limit=-1, limits_enforced=False)
self.assert_limits(ext_name=AGENT_NAME, expected_cpu_limit=-1, limits_enforced=False)
self.assert_limits(ext_name="normal_extension", expected_cpu_limit=40, exception_raised=True)


class TestCGroupsLimits(AgentTestCase):
@patch("azurelinuxagent.common.osutil.default.DefaultOSUtil.get_total_mem", return_value=1024)
def test_no_limits_passed(self, patched_get_total_mem):
cgroup_name = "test_cgroup"
limits = CGroupsLimits(cgroup_name)
self.assertEqual(limits.cpu_limit, CGroupsLimits.get_default_cpu_limits(cgroup_name ))
self.assertEqual(limits.memory_limit, CGroupsLimits.get_default_memory_limits(cgroup_name ))

limits = CGroupsLimits(None)
self.assertEqual(limits.cpu_limit, CGroupsLimits.get_default_cpu_limits(cgroup_name))
self.assertEqual(limits.memory_limit, CGroupsLimits.get_default_memory_limits(cgroup_name))

@patch("azurelinuxagent.common.osutil.default.DefaultOSUtil.get_total_mem", return_value=1024)
def test_with_limits_passed(self, patched_get_total_mem):
cpu_limit = 50
memory_limit = 300
cgroup_name = "test_cgroup"

threshold = {"cpu": cpu_limit}
limits = CGroupsLimits(cgroup_name, threshold=threshold)
self.assertEqual(limits.cpu_limit, cpu_limit)
self.assertEqual(limits.memory_limit, CGroupsLimits.get_default_memory_limits(cgroup_name))

threshold = {"memory": memory_limit}
limits = CGroupsLimits(cgroup_name, threshold=threshold)
self.assertEqual(limits.cpu_limit, CGroupsLimits.get_default_cpu_limits(cgroup_name))
self.assertEqual(limits.memory_limit, memory_limit)

threshold = {"cpu": cpu_limit, "memory": memory_limit}
limits = CGroupsLimits(cgroup_name, threshold=threshold)
self.assertEqual(limits.cpu_limit, cpu_limit)
self.assertEqual(limits.memory_limit, memory_limit)

# Incorrect key
threshold = {"cpux": cpu_limit}
limits = CGroupsLimits(cgroup_name, threshold=threshold)
self.assertEqual(limits.cpu_limit, CGroupsLimits.get_default_cpu_limits(cgroup_name))
self.assertEqual(limits.memory_limit, CGroupsLimits.get_default_memory_limits(cgroup_name))

0 comments on commit 575c399

Please sign in to comment.