Skip to content

Commit

Permalink
Merge branch 'develop' into fedora_disto
Browse files Browse the repository at this point in the history
  • Loading branch information
narrieta authored Oct 28, 2022
2 parents 2cdcead + a1f3049 commit e011623
Show file tree
Hide file tree
Showing 104 changed files with 2,407 additions and 1,531 deletions.
3 changes: 2 additions & 1 deletion CODEOWNERS
Original file line number Diff line number Diff line change
@@ -1,3 +1,4 @@
1
# See https://help.github.com/articles/about-codeowners/
# for more info about CODEOWNERS file

Expand All @@ -20,4 +21,4 @@
#
# Linux Agent team
#
* @narrieta @kevinclark19a @ZhidongPeng @dhivyaganesan @nagworld9
* @narrieta @ZhidongPeng @nagworld9 @maddieford
31 changes: 20 additions & 11 deletions azurelinuxagent/agent.py
Original file line number Diff line number Diff line change
Expand Up @@ -45,7 +45,7 @@
PY_VERSION_MAJOR, PY_VERSION_MINOR, \
PY_VERSION_MICRO, GOAL_STATE_AGENT_VERSION, \
get_daemon_version, set_daemon_version
from azurelinuxagent.ga.collect_logs import CollectLogsHandler
from azurelinuxagent.ga.collect_logs import CollectLogsHandler, get_log_collector_monitor_handler
from azurelinuxagent.pa.provision.default import ProvisionHandler


Expand Down Expand Up @@ -196,36 +196,45 @@ def show_configuration(self):
print("{0} = {1}".format(k, configuration[k]))

def collect_logs(self, is_full_mode):
logger.set_prefix("LogCollector")

if is_full_mode:
print("Running log collector mode full")
logger.info("Running log collector mode full")
else:
print("Running log collector mode normal")
logger.info("Running log collector mode normal")

# Check the cgroups unit
cpu_cgroup_path, memory_cgroup_path, log_collector_monitor = None, None, None
if CollectLogsHandler.should_validate_cgroups():
cpu_cgroup_path, memory_cgroup_path = SystemdCgroupsApi.get_process_cgroup_relative_paths("self")
cgroups_api = SystemdCgroupsApi()
cpu_cgroup_path, memory_cgroup_path = cgroups_api.get_process_cgroup_paths("self")

cpu_slice_matches = (cgroupconfigurator.LOGCOLLECTOR_SLICE in cpu_cgroup_path)
memory_slice_matches = (cgroupconfigurator.LOGCOLLECTOR_SLICE in memory_cgroup_path)

if not cpu_slice_matches or not memory_slice_matches:
print("The Log Collector process is not in the proper cgroups:")
logger.info("The Log Collector process is not in the proper cgroups:")
if not cpu_slice_matches:
print("\tunexpected cpu slice")
logger.info("\tunexpected cpu slice")
if not memory_slice_matches:
print("\tunexpected memory slice")
logger.info("\tunexpected memory slice")

sys.exit(logcollector.INVALID_CGROUPS_ERRCODE)

try:
log_collector = LogCollector(is_full_mode)
log_collector = LogCollector(is_full_mode, cpu_cgroup_path, memory_cgroup_path)
log_collector_monitor = get_log_collector_monitor_handler(log_collector.cgroups)
log_collector_monitor.run()
archive = log_collector.collect_logs_and_get_archive()
print("Log collection successfully completed. Archive can be found at {0} "
logger.info("Log collection successfully completed. Archive can be found at {0} "
"and detailed log output can be found at {1}".format(archive, OUTPUT_RESULTS_FILE_PATH))
except Exception as e:
print("Log collection completed unsuccessfully. Error: {0}".format(ustr(e)))
print("Detailed log output can be found at {0}".format(OUTPUT_RESULTS_FILE_PATH))
logger.error("Log collection completed unsuccessfully. Error: {0}".format(ustr(e)))
logger.info("Detailed log output can be found at {0}".format(OUTPUT_RESULTS_FILE_PATH))
sys.exit(1)
finally:
if log_collector_monitor is not None:
log_collector_monitor.stop()

@staticmethod
def setup_firewall(firewall_metadata):
Expand Down
1 change: 1 addition & 0 deletions azurelinuxagent/common/cgroup.py
Original file line number Diff line number Diff line change
Expand Up @@ -29,6 +29,7 @@
_DEFAULT_REPORT_PERIOD = timedelta(seconds=conf.get_cgroup_check_period())

AGENT_NAME_TELEMETRY = "walinuxagent.service" # Name used for telemetry; it needs to be consistent even if the name of the service changes
AGENT_LOG_COLLECTOR = "azure-walinuxagent-logcollector"


class CounterNotFound(Exception):
Expand Down
47 changes: 39 additions & 8 deletions azurelinuxagent/common/cgroupconfigurator.py
Original file line number Diff line number Diff line change
Expand Up @@ -26,7 +26,7 @@
from azurelinuxagent.common.cgroup import CpuCgroup, AGENT_NAME_TELEMETRY, MetricsCounter, MemoryCgroup
from azurelinuxagent.common.cgroupapi import CGroupsApi, SystemdCgroupsApi, SystemdRunError, EXTENSION_SLICE_PREFIX
from azurelinuxagent.common.cgroupstelemetry import CGroupsTelemetry
from azurelinuxagent.common.exception import ExtensionErrorCodes, CGroupsException
from azurelinuxagent.common.exception import ExtensionErrorCodes, CGroupsException, AgentMemoryExceededException
from azurelinuxagent.common.future import ustr
from azurelinuxagent.common.osutil import get_osutil, systemd
from azurelinuxagent.common.version import get_distro
Expand Down Expand Up @@ -74,10 +74,9 @@
CPUAccounting=yes
CPUQuota={cpu_quota}
MemoryAccounting=yes
MemoryLimit={memory_limit}
"""
_LOGCOLLECTOR_CPU_QUOTA = "5%"
_LOGCOLLECTOR_MEMORY_LIMIT = "30M" # K for kb, M for mb
LOGCOLLECTOR_MEMORY_LIMIT = 30 * 1024 ** 2 # 30Mb

_AGENT_DROP_IN_FILE_SLICE = "10-Slice.conf"
_AGENT_DROP_IN_FILE_SLICE_CONTENTS = """
Expand Down Expand Up @@ -144,6 +143,7 @@ def __init__(self):
self._cgroups_api = None
self._agent_cpu_cgroup_path = None
self._agent_memory_cgroup_path = None
self._agent_memory_cgroup = None
self._check_cgroups_lock = threading.RLock() # Protect the check_cgroups which is called from Monitor thread and main loop.

def initialize(self):
Expand Down Expand Up @@ -195,7 +195,8 @@ def initialize(self):

if self._agent_memory_cgroup_path is not None:
_log_cgroup_info("Agent Memory cgroup: {0}", self._agent_memory_cgroup_path)
CGroupsTelemetry.track_cgroup(MemoryCgroup(AGENT_NAME_TELEMETRY, self._agent_memory_cgroup_path))
self._agent_memory_cgroup = MemoryCgroup(AGENT_NAME_TELEMETRY, self._agent_memory_cgroup_path)
CGroupsTelemetry.track_cgroup(self._agent_memory_cgroup)

_log_cgroup_info('Agent cgroups enabled: {0}', self._agent_cgroups_enabled)

Expand Down Expand Up @@ -349,8 +350,7 @@ def __setup_azure_slice():
files_to_create.append((vmextensions_slice, _VMEXTENSIONS_SLICE_CONTENTS))

if not os.path.exists(logcollector_slice):
slice_contents = _LOGCOLLECTOR_SLICE_CONTENTS_FMT.format(cpu_quota=_LOGCOLLECTOR_CPU_QUOTA,
memory_limit=_LOGCOLLECTOR_MEMORY_LIMIT)
slice_contents = _LOGCOLLECTOR_SLICE_CONTENTS_FMT.format(cpu_quota=_LOGCOLLECTOR_CPU_QUOTA)

files_to_create.append((logcollector_slice, slice_contents))

Expand Down Expand Up @@ -652,8 +652,9 @@ def _check_processes_in_agent_cgroup(self):
current = process
while current != 0 and current not in agent_commands:
current = self._get_parent(current)
# Process started by agent will have a marker and check if that marker found in process environment.
if current == 0 and not self.__is_process_descendant_of_the_agent(process):
# Treat the process as a valid member of the agent cgroup if it was started by the agent
# (detected via the marker in its environment) or if it is in the Zombie state.
if current == 0 and not (self.__is_process_descendant_of_the_agent(process) or self.__is_zombie_process(process)):
unexpected.append(self.__format_process(process))
if len(unexpected) >= 5: # collect just a small sample
break
Expand Down Expand Up @@ -706,13 +707,43 @@ def __is_process_descendant_of_the_agent(pid):
pass
return False

@staticmethod
def __is_zombie_process(pid):
    """
    Returns True if the process is in Zombie state, otherwise False.

    The state is read from /proc/<pid>/stat, e.g.:
        cat /proc/18171/stat
        18171 (python3) S 18103 18103 18103 0 -1 4194624 57736 64902 0 3

    The second field (the command name) is wrapped in parentheses and may itself
    contain spaces or parentheses, so the state is taken as the first field after
    the *last* ')' rather than by a plain whitespace split of the whole line.

    Any error while reading the file (for example, the process already exited)
    is deliberately treated as "not a zombie".
    """
    try:
        with open('/proc/{0}/stat'.format(pid), "r") as stat_file:
            contents = stat_file.read()
    except Exception:  # best effort: the process may be gone or unreadable
        return False

    # Everything after the last ')' starts with the state field.
    fields_after_comm = contents.rpartition(')')[2].split()
    return len(fields_after_comm) > 0 and fields_after_comm[0] == 'Z'

@staticmethod
def _check_agent_throttled_time(cgroup_metrics):
    """
    Raises CGroupsException if any metric reported for AGENT_NAME_TELEMETRY with the
    THROTTLED_TIME counter has a value above conf.get_agent_cpu_throttled_time_threshold().
    Metrics for other instances or counters are ignored.
    """
    for entry in cgroup_metrics:
        is_agent_throttled_time = entry.instance == AGENT_NAME_TELEMETRY and entry.counter == MetricsCounter.THROTTLED_TIME
        if not is_agent_throttled_time:
            continue
        if entry.value > conf.get_agent_cpu_throttled_time_threshold():
            raise CGroupsException("The agent has been throttled for {0} seconds".format(entry.value))

def check_agent_memory_usage(self):
    """
    Adds up the TOTAL_MEM_USAGE and SWAP_MEM_USAGE values from the agent memory cgroup's
    tracked metrics and raises AgentMemoryExceededException when that total is greater
    than conf.get_agent_memory_quota().

    Does nothing when self.enabled() is false or no agent memory cgroup has been set.
    """
    if not (self.enabled() and self._agent_memory_cgroup):
        return

    memory_counters = (MetricsCounter.TOTAL_MEM_USAGE, MetricsCounter.SWAP_MEM_USAGE)
    current_usage = sum(
        reading.value
        for reading in self._agent_memory_cgroup.get_tracked_metrics()
        if reading.counter in memory_counters
    )

    if current_usage > conf.get_agent_memory_quota():
        raise AgentMemoryExceededException("The agent memory limit {0} bytes exceeded. The current reported usage is {1} bytes.".format(conf.get_agent_memory_quota(), current_usage))

@staticmethod
def _get_parent(pid):
"""
Expand Down
20 changes: 20 additions & 0 deletions azurelinuxagent/common/conf.py
Original file line number Diff line number Diff line change
Expand Up @@ -136,6 +136,7 @@ def load_conf_from_file(conf_file_path, conf=__conf__):
"Debug.CgroupLogMetrics": False,
"Debug.CgroupDisableOnProcessCheckFailure": True,
"Debug.CgroupDisableOnQuotaCheckFailure": True,
"Debug.EnableAgentMemoryUsageCheck": False,
"Debug.EnableFastTrack": True,
"Debug.EnableGAVersioning": False
}
Expand Down Expand Up @@ -186,6 +187,7 @@ def load_conf_from_file(conf_file_path, conf=__conf__):
"Debug.CgroupCheckPeriod": 300,
"Debug.AgentCpuQuota": 50,
"Debug.AgentCpuThrottledTimeThreshold": 120,
"Debug.AgentMemoryQuota": 30 * 1024 ** 2,
"Debug.EtpCollectionPeriod": 300,
"Debug.AutoUpdateHotfixFrequency": 14400,
"Debug.AutoUpdateNormalFrequency": 86400,
Expand Down Expand Up @@ -555,6 +557,24 @@ def get_agent_cpu_throttled_time_threshold(conf=__conf__):
return conf.get_int("Debug.AgentCpuThrottledTimeThreshold", 120)


def get_agent_memory_quota(conf=__conf__):
    """
    Returns the memory quota for the agent, in bytes, read from "Debug.AgentMemoryQuota".

    NOTE: This option is experimental and may be removed in later versions of the Agent.
    """
    default_quota_bytes = 30 * 1024 ** 2  # 30 MiB
    return conf.get_int("Debug.AgentMemoryQuota", default_quota_bytes)


def get_enable_agent_memory_usage_check(conf=__conf__):
    """
    Returns the "Debug.EnableAgentMemoryUsageCheck" switch; if True, the Agent checks its
    own memory usage. Defaults to False when the option is not set.

    NOTE: This option is experimental and may be removed in later versions of the Agent.
    """
    option_name = "Debug.EnableAgentMemoryUsageCheck"
    enabled_by_default = False
    return conf.get_switch(option_name, enabled_by_default)


def get_cgroup_monitor_expiry_time(conf=__conf__):
"""
cgroups monitoring for pilot extensions disabled after expiry time
Expand Down
1 change: 1 addition & 0 deletions azurelinuxagent/common/event.py
Original file line number Diff line number Diff line change
Expand Up @@ -69,6 +69,7 @@ class WALAEventOperation:
ActivateResourceDisk = "ActivateResourceDisk"
AgentBlacklisted = "AgentBlacklisted"
AgentEnabled = "AgentEnabled"
AgentMemory = "AgentMemory"
AgentUpgrade = "AgentUpgrade"
ArtifactsProfileBlob = "ArtifactsProfileBlob"
CGroupsCleanUp = "CGroupsCleanUp"
Expand Down
8 changes: 8 additions & 0 deletions azurelinuxagent/common/exception.py
Original file line number Diff line number Diff line change
Expand Up @@ -58,6 +58,14 @@ def __init__(self, msg=None, inner=None):
super(AgentConfigError, self).__init__(msg, inner)


class AgentMemoryExceededException(AgentError):
    """
    Raised when the agent's memory usage exceeds its configured limit.
    """
    def __init__(self, msg=None, inner=None):
        # Delegate to the base AgentError with the same message and inner exception.
        base = super(AgentMemoryExceededException, self)
        base.__init__(msg, inner)


class AgentNetworkError(AgentError):
"""
When network is not available.
Expand Down
27 changes: 25 additions & 2 deletions azurelinuxagent/common/logcollector.py
Original file line number Diff line number Diff line change
Expand Up @@ -26,12 +26,15 @@
from datetime import datetime
from heapq import heappush, heappop

from azurelinuxagent.common.cgroup import CpuCgroup, AGENT_LOG_COLLECTOR, MemoryCgroup
from azurelinuxagent.common.conf import get_lib_dir, get_ext_log_dir, get_agent_log_file
from azurelinuxagent.common.event import initialize_event_logger_vminfo_common_parameters
from azurelinuxagent.common.future import ustr
from azurelinuxagent.common.logcollector_manifests import MANIFEST_NORMAL, MANIFEST_FULL

# Please note: be careful when adding agent dependencies in this module.
# This module uses its own logger and logs to its own file, not to the agent log.
from azurelinuxagent.common.protocol.util import get_protocol_util

_EXTENSION_LOG_DIR = get_ext_log_dir()
_AGENT_LIB_DIR = get_lib_dir()
Expand All @@ -45,7 +48,7 @@

CGROUPS_UNIT = "collect-logs.scope"

FORCE_KILLED_ERRCODE = -9
GRACEFUL_KILL_ERRCODE = 3
INVALID_CGROUPS_ERRCODE = 2

_MUST_COLLECT_FILES = [
Expand All @@ -67,12 +70,14 @@ class LogCollector(object):

_TRUNCATED_FILE_PREFIX = "truncated_"

def __init__(self, is_full_mode=False):
def __init__(self, is_full_mode=False, cpu_cgroup_path=None, memory_cgroup_path=None):
self._is_full_mode = is_full_mode
self._manifest = MANIFEST_FULL if is_full_mode else MANIFEST_NORMAL
self._must_collect_files = self._expand_must_collect_files()
self._create_base_dirs()
self._set_logger()
self._initialize_telemetry()
self.cgroups = self._set_resource_usage_cgroups(cpu_cgroup_path, memory_cgroup_path)

@staticmethod
def _mkdir(dirname):
Expand All @@ -99,6 +104,24 @@ def _set_logger():
_LOGGER.addHandler(_f_handler)
_LOGGER.setLevel(logging.INFO)

@staticmethod
def _set_resource_usage_cgroups(cpu_cgroup_path, memory_cgroup_path):
    """
    Creates the CPU and memory cgroup trackers for the log collector (both named
    AGENT_LOG_COLLECTOR), logs each one, initializes CPU usage tracking, and returns
    them as a two-element list: [cpu_cgroup, memory_cgroup].
    """
    cpu_tracker = CpuCgroup(AGENT_LOG_COLLECTOR, cpu_cgroup_path)
    _LOGGER.info("Started tracking cpu cgroup {0}".format(cpu_tracker))
    cpu_tracker.initialize_cpu_usage()

    memory_tracker = MemoryCgroup(AGENT_LOG_COLLECTOR, memory_cgroup_path)
    _LOGGER.info("Started tracking memory cgroup {0}".format(memory_tracker))

    return [cpu_tracker, memory_tracker]

@staticmethod
def _initialize_telemetry():
    """
    Obtains the protocol, forces a goal state update on its client, and then initializes
    the common parameters used for telemetry events from that protocol.
    """
    protocol_util = get_protocol_util()
    protocol = protocol_util.get_protocol()

    wire_client = protocol.client
    wire_client.update_goal_state(force_update=True)

    # Initialize the common parameters for telemetry events
    initialize_event_logger_vminfo_common_parameters(protocol)

@staticmethod
def _run_shell_command(command, stdout=subprocess.PIPE, log_output=False):
"""
Expand Down
3 changes: 2 additions & 1 deletion azurelinuxagent/common/osutil/suse.py
Original file line number Diff line number Diff line change
Expand Up @@ -110,7 +110,8 @@ def set_dhcp_hostname(self, hostname):
)
if hostname_send_setting:
value = hostname_send_setting.split('=')[-1]
if value == '"AUTO"' or value == '"{0}"'.format(hostname):
# wicked's source accepts values with double quotes, single quotes, and no quotes at all.
if value in ('"AUTO"', "'AUTO'", 'AUTO') or value == '"{0}"'.format(hostname):
# Return if auto send host-name is configured or the current
# hostname is already set up to be sent
return
Expand Down
6 changes: 3 additions & 3 deletions azurelinuxagent/common/protocol/extensions_goal_state.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,7 +17,7 @@
# Requires Python 2.6+ and Openssl 1.0+
import datetime

import azurelinuxagent.common.logger as logger
from azurelinuxagent.common import logger
from azurelinuxagent.common.AgentGlobals import AgentGlobals
from azurelinuxagent.common.exception import AgentError
from azurelinuxagent.common.utils import textutil
Expand Down Expand Up @@ -126,7 +126,7 @@ def on_hold(self):
raise NotImplementedError()

@property
def agent_manifests(self):
def agent_families(self):
raise NotImplementedError()

@property
Expand Down Expand Up @@ -233,7 +233,7 @@ def on_hold(self):
return False

@property
def agent_manifests(self):
def agent_families(self):
return []

@property
Expand Down
Loading

0 comments on commit e011623

Please sign in to comment.