Skip to content

Commit

Permalink
Report processes that do not belong to the agent's cgroup (#1908)
Browse files Browse the repository at this point in the history
* Detect processes that should not be in the agent's cgroup

* Add items to process list

* Add all processes from env thread

* Rename Process category

Co-authored-by: narrieta <narrieta>
  • Loading branch information
narrieta authored Jun 15, 2020
1 parent 7f8b071 commit 29f5b67
Show file tree
Hide file tree
Showing 12 changed files with 397 additions and 182 deletions.
34 changes: 34 additions & 0 deletions azurelinuxagent/common/cgroup.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,8 @@
# limitations under the License.
#
# Requires Python 2.6+ and Openssl 1.0+
from collections import namedtuple

import errno
import os
import re
Expand All @@ -23,6 +25,21 @@
from azurelinuxagent.common.osutil import get_osutil
from azurelinuxagent.common.utils import fileutil


# Record for a single metric sample: the category and counter names, the instance the sample
# belongs to (the cgroup name in the callers visible in this module) and the sampled value.
# NOTE(review): the namedtuple's type name is 'Metric', so repr() shows 'Metric(...)' rather
# than 'MetricValue(...)'.
MetricValue = namedtuple('Metric', ['category', 'counter', 'instance', 'value'])


class MetricsCategory(object):
    """
    Names used for the 'category' field of MetricValue.
    """
    MEMORY_CATEGORY = "Memory"
    CPU_CATEGORY = "CPU"


class MetricsCounter(object):
    """
    Names used for the 'counter' field of MetricValue.
    """
    PROCESSOR_PERCENT_TIME = "% Processor Time"
    TOTAL_MEM_USAGE = "Total Memory Usage"
    MAX_MEM_USAGE = "Max Memory Usage"


re_user_system_times = re.compile(r'user (\d+)\nsystem (\d+)\n')


Expand Down Expand Up @@ -135,6 +152,12 @@ def get_tracked_processes(self):
' Internal error: {1}'.format(self.path, ustr(e)))
return procs

def get_tracked_metrics(self):
    """
    Returns an array with the current values of the metrics tracked for this cgroup.

    This implementation always raises NotImplementedError; the controller-specific
    classes are expected to provide their own version.
    """
    raise NotImplementedError()


class CpuCgroup(CGroup):
def __init__(self, name, cgroup_path):
Expand Down Expand Up @@ -211,6 +234,11 @@ def get_cpu_usage(self):

return round(100.0 * float(cgroup_delta) / float(system_delta), 3)

def get_tracked_metrics(self):
    """
    Returns a one-element array with the current CPU usage of this cgroup (as reported by
    get_cpu_usage()), using the cgroup's name as the metric instance.
    """
    cpu_usage = MetricValue(
        MetricsCategory.CPU_CATEGORY,
        MetricsCounter.PROCESSOR_PERCENT_TIME,
        self.name,
        self.get_cpu_usage())
    return [cpu_usage]


class MemoryCgroup(CGroup):
def __init__(self, name, cgroup_path):
Expand Down Expand Up @@ -259,3 +287,9 @@ def get_max_memory_usage(self):
raise CGroupsException("Exception while attempting to read {0}".format("memory.usage_in_bytes"), e)

return int(usage)

def get_tracked_metrics(self):
    """
    Returns a two-element array with the current and the maximum memory usage of this cgroup
    (in that order), using the cgroup's name as the metric instance.
    """
    # The current usage is read before the maximum usage, in the same order in which the metrics are returned
    total_usage = MetricValue(
        MetricsCategory.MEMORY_CATEGORY,
        MetricsCounter.TOTAL_MEM_USAGE,
        self.name,
        self.get_memory_usage())
    max_usage = MetricValue(
        MetricsCategory.MEMORY_CATEGORY,
        MetricsCounter.MAX_MEM_USAGE,
        self.name,
        self.get_max_memory_usage())
    return [total_usage, max_usage]
31 changes: 31 additions & 0 deletions azurelinuxagent/common/cgroupapi.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,4 @@
# -*- coding: utf-8 -*-
# Copyright 2018 Microsoft Corporation
#
# Licensed under the Apache License, Version 2.0 (the "License");
Expand Down Expand Up @@ -432,6 +433,7 @@ class SystemdCgroupsApi(CGroupsApi):
"""
def __init__(self):
    # Not set at construction time; presumably populated lazily elsewhere in the class (not visible here)
    self._cgroup_mountpoints = None
    # Cache for the name of the agent's unit; filled in by the first call to get_agent_unit_name()
    self._agent_unit_name = None

@staticmethod
def get_systemd_version():
Expand Down Expand Up @@ -620,6 +622,35 @@ def create_cgroup(controller):

return cgroups

def get_agent_unit_name(self):
    """
    Returns the name of the agent's unit: the service name reported by the OS utilities followed
    by ".service". The name is computed on the first call and cached for subsequent calls.
    """
    cached_name = self._agent_unit_name
    if cached_name is not None:
        return cached_name

    self._agent_unit_name = get_osutil().get_service_name() + ".service"
    return self._agent_unit_name

@staticmethod
def get_processes_in_cgroup(unit_name):
    """
    Returns an array of tuples with the PID and command line of the processes that are currently
    within the cgroup for the given unit.

    :param unit_name: name of the unit to inspect (e.g. "walinuxagent.service")
    :return: list of (pid, command) tuples; both items are strings, taken verbatim from the output
             of systemd-cgls. Lines that do not describe a process (e.g. the header) are ignored.

    Errors raised by shellutil.run_command are not handled here and propagate to the caller.
    """
    #
    # The output of the command is similar to
    #
    #   Unit walinuxagent.service (/system.slice/walinuxagent.service):
    #   ├─27519 /usr/bin/python3 -u /usr/sbin/waagent -daemon
    #   └─27547 python3 -u bin/WALinuxAgent-2.2.48.1-py2.7.egg -run-exthandlers
    #
    # Each process line is: tree-drawing characters (no digits), the PID, whitespace, the command line.
    # The pattern must be a raw string: '\d' and '\s' are not valid escapes in a regular string literal.
    #
    process_line = re.compile(r'[^\d]*(?P<pid>\d+)\s+(?P<command>.+)')

    output = shellutil.run_command(['systemd-cgls', '--unit', unit_name])

    matches = (process_line.match(line) for line in output.splitlines())
    return [(match.group('pid'), match.group('command')) for match in matches if match is not None]

@staticmethod
def _is_systemd_failure(scope_name, process_output):
unit_not_found = "Unit {0} not found.".format(scope_name)
Expand Down
72 changes: 69 additions & 3 deletions azurelinuxagent/common/cgroupconfigurator.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,7 @@
# Requires Python 2.6+ and Openssl 1.0+

import os
import re
import subprocess

from azurelinuxagent.common import logger
Expand All @@ -23,7 +24,6 @@
from azurelinuxagent.common.cgroupstelemetry import CGroupsTelemetry
from azurelinuxagent.common.exception import ExtensionErrorCodes, CGroupsException
from azurelinuxagent.common.future import ustr
from azurelinuxagent.common.osutil import get_osutil
from azurelinuxagent.common.version import get_distro
from azurelinuxagent.common.utils.extensionprocessutil import handle_process_completion
from azurelinuxagent.common.event import add_event, WALAEventOperation
Expand All @@ -41,6 +41,8 @@ def __init__(self):
self._cgroups_supported = False
self._cgroups_enabled = False
self._cgroups_api = None
self._get_processes_in_agent_cgroup_last_error = None
self._get_processes_in_agent_cgroup_error_count = 0

def initialize(self):
try:
Expand Down Expand Up @@ -113,7 +115,7 @@ def log_cgroup_warn(format_string, *args):
#
# check the cgroups for the agent
#
agent_unit_name = get_osutil().get_service_name() + ".service"
agent_unit_name = self._cgroups_api.get_agent_unit_name()
cpu_cgroup_relative_path, memory_cgroup_relative_path = self._cgroups_api.get_process_cgroup_relative_paths("self")
if cpu_cgroup_relative_path is None:
log_cgroup_warn("The agent's process is not within a CPU cgroup")
Expand Down Expand Up @@ -156,6 +158,9 @@ def log_cgroup_warn(format_string, *args):
def enabled(self):
return self._cgroups_enabled

def resource_limits_enforced(self):
    """
    Indicates whether resource limits are being enforced; this implementation always returns False.
    """
    return False

def enable(self):
if not self._cgroups_supported:
raise CGroupsException("Attempted to enable cgroups, but they are not supported on the current platform")
Expand All @@ -171,7 +176,7 @@ def _invoke_cgroup_operation(self, operation, error_message, on_error=None):
Ensures the given operation is invoked only if cgroups are enabled and traps any errors on the operation.
"""
if not self.enabled():
return
return None

try:
return operation()
Expand Down Expand Up @@ -211,6 +216,29 @@ def __impl():

self._invoke_cgroup_operation(__impl, "Failed to delete cgroups for extension '{0}'.".format(name))

def get_processes_in_agent_cgroup(self):
    """
    Returns an array of tuples with the PID and command line of the processes that are currently within the
    cgroup of the agent's unit.
    The return value can be None if cgroups are not enabled or if an error occurs during the operation.
    """
    def __impl():
        agent_unit = self._cgroups_api.get_agent_unit_name()
        return self._cgroups_api.get_processes_in_cgroup(agent_unit)

    def __on_error(exception):
        #
        # Send telemetry for a small sample of errors (if any): only the first 5 errors are considered,
        # and an error is reported only if its message differs from the last one that was reported.
        #
        self._get_processes_in_agent_cgroup_error_count = self._get_processes_in_agent_cgroup_error_count + 1
        if self._get_processes_in_agent_cgroup_error_count <= 5:
            # The message must be formatted into a single string (a trailing ", ustr(exception)" would
            # create a tuple and the telemetry event would carry the unformatted template)
            message = "Failed to list the processes in the agent's cgroup: {0}".format(ustr(exception))
            if message != self._get_processes_in_agent_cgroup_last_error:
                add_event(op=WALAEventOperation.CGroupsDebug, message=message)
                self._get_processes_in_agent_cgroup_last_error = message

    return self._invoke_cgroup_operation(__impl, "Failed to list the processes in the agent's cgroup.", on_error=__on_error)

def start_extension_command(self, extension_name, command, timeout, shell, cwd, env, stdout, stderr,
error_code=ExtensionErrorCodes.PluginUnknownFailure):
"""
Expand Down Expand Up @@ -261,3 +289,41 @@ def get_instance():
if CGroupConfigurator._instance is None:
CGroupConfigurator._instance = CGroupConfigurator.__impl()
return CGroupConfigurator._instance

@staticmethod
def is_agent_process(command_line):
    """
    Returns true if the given command line corresponds to a process started by the agent.
    NOTE: The function uses pattern matching to determine whether the process was spawned by the agent; this is more of a heuristic
    than an exact check.

    :param command_line: command line of the process, as a string
    :return: True if the command line matches any of the known patterns, False otherwise
    """
    patterns = [
        r".*waagent -daemon.*",
        r".*(WALinuxAgent-.+\.egg|waagent) -run-exthandlers",
        # The processes in the agent's cgroup are listed using systemd-cgls. The name of the agent's unit depends
        # on the distro's service name, so any *.service unit is accepted instead of a hard-coded name.
        r"^systemd-cgls --unit \S+\.service$",
        # Extensions are started using systemd-run
        r"^systemd-run --unit=.+ --scope ",
        #
        # The rest of the commands are started by the environment thread; many of them are distro-specific so this list may need
        # additions as we add support for more distros.
        #
        # *** Monitor DHCP client restart
        #
        r"^pidof (dhclient|dhclient3|systemd-networkd)",
        r"^ip route (show|add)",
        #
        # *** Enable firewall
        #
        r"^iptables --version$",
        r"^iptables .+ -t security",
        #
        # *** Monitor host name changes
        #
        r"^ifdown .+ && ifup .+",
    ]
    # re.match anchors at the start of the command line; patterns without '$' match on a prefix
    return any(re.match(pattern, command_line) is not None for pattern in patterns)
35 changes: 2 additions & 33 deletions azurelinuxagent/common/cgroupstelemetry.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,34 +15,18 @@
# Requires Python 2.6+ and Openssl 1.0+
import errno
import threading
from collections import namedtuple

from azurelinuxagent.common import logger
from azurelinuxagent.common.cgroup import CpuCgroup, CGroupContollers
from azurelinuxagent.common.exception import CGroupsException
from azurelinuxagent.common.cgroup import CpuCgroup
from azurelinuxagent.common.future import ustr
from azurelinuxagent.common.logger import EVERY_SIX_HOURS
from azurelinuxagent.common.resourceusage import ProcessInfo

MetricValue = namedtuple('Metric', ['category', 'counter', 'instance', 'value'])
StatmMetricValue = namedtuple('StatmMetricValue', ['pid_name_cmdline', 'resource_metric'])

DELIM = " | "
DEFAULT_PROCESS_NAME = "NO_PROCESS_FOUND"
DEFAULT_PROCESS_COMMANDLINE = "NO_CMDLINE_FOUND"


class MetricsCategory(object):
MEMORY_CATEGORY = "Memory"
PROCESS_CATEGORY = "Process"


class MetricsCounter(object):
PROCESSOR_PERCENT_TIME = "% Processor Time"
TOTAL_MEM_USAGE = "Total Memory Usage"
MAX_MEM_USAGE = "Max Memory Usage"


class CGroupsTelemetry(object):
"""
"""
Expand Down Expand Up @@ -110,24 +94,9 @@ def poll_all_tracked():
metrics = []

with CGroupsTelemetry._rlock:
def new_cpu_metric(name, value):
return MetricValue(MetricsCategory.PROCESS_CATEGORY, MetricsCounter.PROCESSOR_PERCENT_TIME, name, value)

def new_memory_metric(name, value):
return MetricValue(MetricsCategory.MEMORY_CATEGORY, MetricsCounter.TOTAL_MEM_USAGE, name, value)

def new_max_memory_metric(name, value):
return MetricValue(MetricsCategory.MEMORY_CATEGORY, MetricsCounter.MAX_MEM_USAGE, name, value)

for cgroup in CGroupsTelemetry._tracked[:]:
try:
if cgroup.controller == CGroupContollers.CPU:
metrics.append(new_cpu_metric(cgroup.name, cgroup.get_cpu_usage()))
elif cgroup.controller == CGroupContollers.MEMORY:
metrics.append(new_memory_metric(cgroup.name, cgroup.get_memory_usage()))
metrics.append(new_max_memory_metric(cgroup.name, cgroup.get_max_memory_usage()))
else:
raise CGroupsException('CGroup controller {0} is not supported for cgroup {1}'.format(cgroup.controller, cgroup.name))
metrics.extend(cgroup.get_tracked_metrics())
except Exception as e:
# There can be scenarios when the CGroup has been deleted by the time we are fetching the values
# from it. This would raise IOError with file entry not found (ERRNO: 2). We do not want to log
Expand Down
1 change: 1 addition & 0 deletions azurelinuxagent/common/event.py
Original file line number Diff line number Diff line change
Expand Up @@ -72,6 +72,7 @@ class WALAEventOperation:
ArtifactsProfileBlob = "ArtifactsProfileBlob"
AutoUpdate = "AutoUpdate"
CGroupsCleanUp = "CGroupsCleanUp"
CGroupsDebug = "CGroupsDebug"
CGroupsInfo = "CGroupsInfo"
CGroupsInitialize = "CGroupsInitialize"
CGroupsLimitsCrossed = "CGroupsLimitsCrossed"
Expand Down
Loading

0 comments on commit 29f5b67

Please sign in to comment.