Skip to content

Commit

Permalink
Enable Periodic Log Collection in ubuntu systemd distros (#2295)
Browse files Browse the repository at this point in the history
  • Loading branch information
kevinclark19a authored Jul 20, 2021
1 parent f6699cd commit 6492ebd
Show file tree
Hide file tree
Showing 10 changed files with 68 additions and 88 deletions.
26 changes: 3 additions & 23 deletions azurelinuxagent/agent.py
Original file line number Diff line number Diff line change
Expand Up @@ -203,37 +203,17 @@ def collect_logs(self, is_full_mode):

# Check the cgroups unit
if CollectLogsHandler.should_validate_cgroups():

def validate_cgroup_path(path):
if path is None:
return False, False

regex_match = re.match(r'^(?P<slice>.*)/(?P<unit>.*)$', path)
if regex_match is None:
return False, False

slice_group, unit_group = regex_match.group("slice", "unit")

slice_matches = (slice_group == cgroupconfigurator.AZURE_SLICE)
unit_matches = (unit_group == logcollector.CGROUPS_UNIT)

return slice_matches, unit_matches

cpu_cgroup_path, memory_cgroup_path = SystemdCgroupsApi.get_process_cgroup_relative_paths("self")

cpu_slice_matches, cpu_unit_matches = validate_cgroup_path(cpu_cgroup_path)
memory_slice_matches, memory_unit_matches = validate_cgroup_path(memory_cgroup_path)
cpu_slice_matches = (cgroupconfigurator.LOGCOLLECTOR_SLICE in cpu_cgroup_path)
memory_slice_matches = (cgroupconfigurator.LOGCOLLECTOR_SLICE in memory_cgroup_path)

if not all([cpu_slice_matches, cpu_unit_matches, memory_slice_matches, memory_unit_matches]):
if not cpu_slice_matches or not memory_slice_matches:
print("The Log Collector process is not in the proper cgroups:")
if not cpu_slice_matches:
print("\tunexpected cpu slice")
if not cpu_unit_matches:
print("\tunexpected cpu unit")
if not memory_slice_matches:
print("\tunexpected memory slice")
if not memory_unit_matches:
print("\tunexpected memory unit")

sys.exit(logcollector.INVALID_CGROUPS_ERRCODE)

Expand Down
24 changes: 24 additions & 0 deletions azurelinuxagent/common/cgroupconfigurator.py
Original file line number Diff line number Diff line change
Expand Up @@ -48,6 +48,23 @@
[Slice]
CPUAccounting=yes
"""
LOGCOLLECTOR_SLICE = "azure-walinuxagent-logcollector.slice"
# More info on resource limits properties in systemd here:
# https://access.redhat.com/documentation/en-us/red_hat_enterprise_linux/7/html/resource_management_guide/sec-modifying_control_groups
_LOGCOLLECTOR_SLICE_CONTENTS_FMT = """
[Unit]
Description=Slice for Azure VM Agent Periodic Log Collector
DefaultDependencies=no
Before=slices.target
[Slice]
CPUAccounting=yes
CPUQuota={cpu_quota}
MemoryAccounting=yes
MemoryLimit={memory_limit}
"""
_LOGCOLLECTOR_CPU_QUOTA = "5%"
_LOGCOLLECTOR_MEMORY_LIMIT = "30M" # K for kb, M for mb

_AGENT_DROP_IN_FILE_SLICE = "10-Slice.conf"
_AGENT_DROP_IN_FILE_SLICE_CONTENTS = """
# This drop-in unit file was created by the Azure VM Agent.
Expand Down Expand Up @@ -277,6 +294,7 @@ def __setup_azure_slice():
unit_file_install_path = systemd.get_unit_file_install_path()
azure_slice = os.path.join(unit_file_install_path, AZURE_SLICE)
vmextensions_slice = os.path.join(unit_file_install_path, _VMEXTENSIONS_SLICE)
logcollector_slice = os.path.join(unit_file_install_path, LOGCOLLECTOR_SLICE)
agent_unit_file = systemd.get_agent_unit_file()
agent_drop_in_path = systemd.get_agent_drop_in_path()
agent_drop_in_file_slice = os.path.join(agent_drop_in_path, _AGENT_DROP_IN_FILE_SLICE)
Expand All @@ -290,6 +308,12 @@ def __setup_azure_slice():
if not os.path.exists(vmextensions_slice):
files_to_create.append((vmextensions_slice, _VMEXTENSIONS_SLICE_CONTENTS))

if not os.path.exists(logcollector_slice):
slice_contents = _LOGCOLLECTOR_SLICE_CONTENTS_FMT.format(cpu_quota=_LOGCOLLECTOR_CPU_QUOTA,
memory_limit=_LOGCOLLECTOR_MEMORY_LIMIT)

files_to_create.append((logcollector_slice, slice_contents))

if fileutil.findre_in_file(agent_unit_file, r"Slice=") is not None:
CGroupConfigurator._Impl.__cleanup_unit_file(agent_drop_in_file_slice)
else:
Expand Down
39 changes: 13 additions & 26 deletions azurelinuxagent/ga/collect_logs.py
Original file line number Diff line number Diff line change
Expand Up @@ -30,7 +30,7 @@
from azurelinuxagent.common.future import subprocess_dev_null, ustr
from azurelinuxagent.common.interfaces import ThreadHandlerInterface
from azurelinuxagent.common.logcollector import COMPRESSED_ARCHIVE_PATH
from azurelinuxagent.common.osutil import systemd
from azurelinuxagent.common.cgroupconfigurator import CGroupConfigurator
from azurelinuxagent.common.protocol.util import get_protocol_util
from azurelinuxagent.common.utils import shellutil
from azurelinuxagent.common.utils.shellutil import CommandError
Expand All @@ -43,17 +43,17 @@ def get_collect_logs_handler():
def is_log_collection_allowed():
# There are three conditions that need to be met in order to allow periodic log collection:
# 1) It should be enabled in the configuration.
# 2) The system must be using systemd to manage services. Needed for resource limiting of the log collection.
# 2) The system must be using cgroups to manage services. Needed for resource limiting of the log collection.
# 3) The python version must be greater than 2.6 in order to support the ZipFile library used when collecting.
conf_enabled = conf.get_collect_logs()
systemd_present = systemd.is_systemd()
cgroups_enabled = CGroupConfigurator.get_instance().enabled()
supported_python = PY_VERSION_MINOR >= 7 if PY_VERSION_MAJOR == 2 else PY_VERSION_MAJOR == 3
is_allowed = conf_enabled and systemd_present and supported_python
is_allowed = conf_enabled and cgroups_enabled and supported_python

msg = "Checking if log collection is allowed at this time [{0}]. All three conditions must be met: " \
"configuration enabled [{1}], systemd present [{2}], python supported: [{3}]".format(is_allowed,
"configuration enabled [{1}], cgroups enabled [{2}], python supported: [{3}]".format(is_allowed,
conf_enabled,
systemd_present,
cgroups_enabled,
supported_python)
logger.info(msg)
add_event(
Expand Down Expand Up @@ -155,41 +155,26 @@ def collect_and_send_logs(self):
if self._collect_logs():
self._send_logs()

@staticmethod
def _get_resource_limits():
# Define CPU limit (as percentage of CPU time) and memory limit (absolute value in megabytes).
cpu_limit = "5%"
memory_limit = "30M" # K for kb, M for mb
return cpu_limit, memory_limit

def _collect_logs(self):
logger.info("Starting log collection...")

# Invoke the command line tool in the agent to collect logs, with resource limits on CPU and memory (RAM).

systemd_cmd = [
"systemd-run", "--unit={0}".format(logcollector.CGROUPS_UNIT),
"--slice={0}".format(cgroupconfigurator.AZURE_SLICE)
"--slice={0}".format(cgroupconfigurator.LOGCOLLECTOR_SLICE), "--scope"
]

# More info on resource limits properties in systemd here:
# https://access.redhat.com/documentation/en-us/red_hat_enterprise_linux/7/html/resource_management_guide/sec-modifying_control_groups
cpu_limit, memory_limit = CollectLogsHandler._get_resource_limits()
resource_limits = ["--property=CPUAccounting=1", "--property=CPUQuota={0}".format(cpu_limit),
"--property=MemoryAccounting=1", "--property=MemoryLimit={0}".format(memory_limit)]

# The log tool is invoked from the current agent's egg with the command line option
collect_logs_cmd = [sys.executable, "-u", sys.argv[0], "-collect-logs"]
final_command = systemd_cmd + resource_limits + collect_logs_cmd
final_command = systemd_cmd + collect_logs_cmd

def exec_command(output_file):
start_time = datetime.datetime.utcnow()
success = False
msg = None
try:
# TODO: Remove track_process (and its implementation) when the log collector is moved to the agent's cgroup
shellutil.run_command(final_command, log_error=False, track_process=False,
stdout=output_file, stderr=output_file)
shellutil.run_command(final_command, log_error=False, stdout=output_file, stderr=output_file)
duration = elapsed_milliseconds(start_time)
archive_size = os.path.getsize(COMPRESSED_ARCHIVE_PATH)

Expand All @@ -207,11 +192,13 @@ def exec_command(output_file):
# pylint has limited (i.e. no) awareness of control flow w.r.t. typing. we disable=no-member
# here because we know e must be a CommandError but pylint still considers the case where
# e is a different type of exception.
err_msg = ustr("[stderr] %s", e.stderr) # pylint: disable=no-member
err_msg = ustr("Log Collector exited with code {0}").format(e.returncode) # pylint: disable=no-member

if e.returncode == 2: # pylint: disable=no-member
if e.returncode == logcollector.INVALID_CGROUPS_ERRCODE: # pylint: disable=no-member
logger.info("Disabling periodic log collection until service restart due to process error.")
self.stop()
else:
logger.info(err_msg)

msg = "Failed to collect logs. Elapsed time: {0} ms. Error: {1}".format(duration, err_msg)
# No need to log to the local log since we logged stdout, stderr from the process.
Expand Down
2 changes: 1 addition & 1 deletion config/debian/waagent.conf
Original file line number Diff line number Diff line change
Expand Up @@ -64,7 +64,7 @@ Logs.Verbose=n
# Logs.Console=y

# Enable periodic log collection, default is n
Logs.Collect=n
Logs.Collect=y

# How frequently to collect logs, default is each hour
Logs.CollectPeriod=3600
Expand Down
2 changes: 1 addition & 1 deletion config/suse/waagent.conf
Original file line number Diff line number Diff line change
Expand Up @@ -67,7 +67,7 @@ Logs.Verbose=n
# Logs.Console=y

# Enable periodic log collection, default is n
Logs.Collect=n
Logs.Collect=y

# How frequently to collect logs, default is each hour
Logs.CollectPeriod=3600
Expand Down
2 changes: 1 addition & 1 deletion config/ubuntu/waagent.conf
Original file line number Diff line number Diff line change
Expand Up @@ -67,7 +67,7 @@ Logs.Verbose=n
# Logs.Console=y

# Enable periodic log collection, default is n
Logs.Collect=n
Logs.Collect=y

# How frequently to collect logs, default is each hour
Logs.CollectPeriod=3600
Expand Down
2 changes: 1 addition & 1 deletion config/waagent.conf
Original file line number Diff line number Diff line change
Expand Up @@ -73,7 +73,7 @@ Logs.Verbose=n
# Logs.Console=y

# Enable periodic log collection, default is n
Logs.Collect=n
Logs.Collect=y

# How frequently to collect logs, default is each hour
Logs.CollectPeriod=3600
Expand Down
2 changes: 1 addition & 1 deletion tests/data/test_waagent.conf
Original file line number Diff line number Diff line change
Expand Up @@ -68,7 +68,7 @@ ResourceDisk.MountOptions=None
Logs.Verbose=n

# Enable periodic log collection, default is n
Logs.Collect=n
Logs.Collect=y

# How frequently to collect logs, default is each hour
Logs.CollectPeriod=3600
Expand Down
49 changes: 19 additions & 30 deletions tests/ga/test_collect_logs.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,7 @@
import os

from azurelinuxagent.common import logger, conf
from azurelinuxagent.common.cgroupconfigurator import CGroupConfigurator
from azurelinuxagent.common.logger import Logger
from azurelinuxagent.common.protocol.util import ProtocolUtil
from azurelinuxagent.ga.collect_logs import get_collect_logs_handler, is_log_collection_allowed
Expand All @@ -28,7 +29,7 @@


@contextlib.contextmanager
def _create_collect_logs_handler(iterations=1, systemd_present=True):
def _create_collect_logs_handler(iterations=1, cgroups_enabled=True, collect_logs_conf=True):
"""
Creates an instance of CollectLogsHandler that
* Uses a mock_wire_protocol for network requests,
Expand All @@ -46,8 +47,11 @@ def _create_collect_logs_handler(iterations=1, systemd_present=True):
with patch("azurelinuxagent.ga.collect_logs.get_protocol_util", return_value=protocol_util):
with patch("azurelinuxagent.ga.collect_logs.CollectLogsHandler.stopped", side_effect=[False] * iterations + [True]):
with patch("time.sleep"):
with patch("azurelinuxagent.common.osutil.systemd.is_systemd", return_value=systemd_present):
with patch("azurelinuxagent.ga.collect_logs.conf.get_collect_logs", return_value=True):

# Grab the singleton to patch it
cgroups_configurator_singleton = CGroupConfigurator.get_instance()
with patch.object(cgroups_configurator_singleton, "enabled", return_value=cgroups_enabled):
with patch("azurelinuxagent.ga.collect_logs.conf.get_collect_logs", return_value=collect_logs_conf):
def run_and_wait():
collect_logs_handler.run()
collect_logs_handler.join()
Expand Down Expand Up @@ -93,40 +97,25 @@ def _create_dummy_archive(self, size=1024):
def test_it_should_only_collect_logs_if_conditions_are_met(self):
# In order to collect logs, three conditions have to be met:
# 1) the flag must be set to true in the conf file
# 2) systemd must be managing services
# 2) cgroups must be managing services
# 3) python version 2.7+ which is automatically true for these tests since they are disabled on py2.6

# systemd not present, config flag false
with _create_collect_logs_handler(systemd_present=False):
with patch("azurelinuxagent.ga.collect_logs.conf.get_collect_logs", return_value=False):
self.assertEqual(False, is_log_collection_allowed(), "Log collection should not have been enabled")
# cgroups not enabled, config flag false
with _create_collect_logs_handler(cgroups_enabled=False, collect_logs_conf=False):
self.assertEqual(False, is_log_collection_allowed(), "Log collection should not have been enabled")

# systemd present, config flag false
with _create_collect_logs_handler():
with patch("azurelinuxagent.ga.collect_logs.conf.get_collect_logs", return_value=False):
self.assertEqual(False, is_log_collection_allowed(), "Log collection should not have been enabled")
# cgroups enabled, config flag false
with _create_collect_logs_handler(cgroups_enabled=True, collect_logs_conf=False):
self.assertEqual(False, is_log_collection_allowed(), "Log collection should not have been enabled")

# systemd not present, config flag true
with _create_collect_logs_handler(systemd_present=False):
with patch("azurelinuxagent.ga.collect_logs.conf.get_collect_logs", return_value=True):
self.assertEqual(False, is_log_collection_allowed(), "Log collection should not have been enabled")
# cgroups not enabled, config flag true
with _create_collect_logs_handler(cgroups_enabled=False, collect_logs_conf=True):
self.assertEqual(False, is_log_collection_allowed(), "Log collection should not have been enabled")

# systemd present, config flag true
with _create_collect_logs_handler():
# cgroups enabled, config flag true
with _create_collect_logs_handler(cgroups_enabled=True, collect_logs_conf=True):
self.assertEqual(True, is_log_collection_allowed(), "Log collection should have been enabled")

def test_log_collection_is_invoked_with_resource_limits(self):
with _create_collect_logs_handler() as collect_logs_handler:
with patch("azurelinuxagent.ga.collect_logs.shellutil.run_command") as patch_run_command:
collect_logs_handler.run_and_wait()

args, _ = patch_run_command.call_args
self.assertIn("systemd-run", args[0], "The log collector should have been invoked with systemd-run")
self.assertIn("--property=CPUAccounting=1", args[0], "The log collector should have been invoked with CPUAccounting turned on")
self.assertIn("--property=MemoryAccounting=1", args[0], "The log collector should have been invoked with MemoryAccounting turned on")
self.assertIn("--property=CPUQuota=5%", args[0], "The log collector should have been invoked with a CPU limit")
self.assertIn("--property=MemoryLimit=30M", args[0], "The log collector should have been invoked with a memory limit")

def test_it_uploads_logs_when_collection_is_successful(self):
archive_size = 42

Expand Down
8 changes: 4 additions & 4 deletions tests/test_agent.py
Original file line number Diff line number Diff line change
Expand Up @@ -42,7 +42,7 @@
HttpProxy.Host = None
HttpProxy.Port = None
Lib.Dir = /var/lib/waagent
Logs.Collect = False
Logs.Collect = True
Logs.CollectPeriod = 3600
Logs.Console = True
Logs.Verbose = False
Expand Down Expand Up @@ -223,14 +223,14 @@ def test_calls_collect_logs_on_valid_cgroups(self, mock_log_collector):
@staticmethod
def mock_cgroup_paths(*args, **kwargs):
if args and args[0] == "self":
relative_path = "{0}/{1}".format(cgroupconfigurator.AZURE_SLICE, logcollector.CGROUPS_UNIT)
return (relative_path, relative_path)
relative_path = "{0}/{1}".format(cgroupconfigurator.LOGCOLLECTOR_SLICE, logcollector.CGROUPS_UNIT)
return (cgroupconfigurator.LOGCOLLECTOR_SLICE, relative_path)
return SystemdCgroupsApi.get_process_cgroup_relative_paths(*args, **kwargs)

with patch.object(SystemdCgroupsApi, "get_process_cgroup_relative_paths", mock_cgroup_paths):
agent = Agent(False, conf_file_path=os.path.join(data_dir, "test_waagent.conf"))
agent.collect_logs(is_full_mode=True)

mock_log_collector.assert_called_once()
finally:
CollectLogsHandler.disable_cgroups_validation()
Expand Down

0 comments on commit 6492ebd

Please sign in to comment.