Skip to content

Commit

Permalink
Fix check for systemd-run failure when invoking extensions (#1943)
Browse files Browse the repository at this point in the history
  • Loading branch information
pgombar authored Jul 16, 2020
1 parent 0edea53 commit 7e3eac1
Show file tree
Hide file tree
Showing 4 changed files with 96 additions and 50 deletions.
11 changes: 7 additions & 4 deletions azurelinuxagent/common/cgroupapi.py
Original file line number Diff line number Diff line change
Expand Up @@ -31,7 +31,8 @@
from azurelinuxagent.common.future import ustr
from azurelinuxagent.common.osutil import get_osutil
from azurelinuxagent.common.utils import fileutil, shellutil
from azurelinuxagent.common.utils.extensionprocessutil import handle_process_completion, read_output
from azurelinuxagent.common.utils.extensionprocessutil import handle_process_completion, read_output, \
TELEMETRY_MESSAGE_MAX_LEN
from azurelinuxagent.common.utils.flexible_version import FlexibleVersion
from azurelinuxagent.common.version import get_distro

Expand Down Expand Up @@ -652,9 +653,11 @@ def get_processes_in_cgroup(cgroup_path):
return processes

@staticmethod
def _is_systemd_failure(scope_name, process_output):
def _is_systemd_failure(scope_name, stderr):
stderr.seek(0)
stderr = ustr(stderr.read(TELEMETRY_MESSAGE_MAX_LEN), encoding='utf-8', errors='backslashreplace')
unit_not_found = "Unit {0} not found.".format(scope_name)
return unit_not_found in process_output or scope_name not in process_output
return unit_not_found in stderr or scope_name not in stderr

def start_extension_command(self, extension_name, command, timeout, shell, cwd, env, stdout, stderr,
error_code=ExtensionErrorCodes.PluginUnknownFailure):
Expand Down Expand Up @@ -709,8 +712,8 @@ def start_extension_command(self, extension_name, command, timeout, shell, cwd,
except ExtensionError as e:
# The extension didn't terminate successfully. Determine whether it was due to systemd errors or
# extension errors.
systemd_failure = self._is_systemd_failure(scope, stderr)
process_output = read_output(stdout, stderr)
systemd_failure = self._is_systemd_failure(scope, process_output)

if not systemd_failure:
# There was an extension error; it either timed out or returned a non-zero exit code. Re-raise the error
Expand Down
3 changes: 2 additions & 1 deletion azurelinuxagent/common/utils/extensionprocessutil.py
Original file line number Diff line number Diff line change
Expand Up @@ -99,7 +99,7 @@ def read_output(stdout, stderr):
return format_stdout_stderr("", "Cannot read stdout/stderr: {0}".format(ustr(e)))


def format_stdout_stderr(stdout, stderr, max_len=TELEMETRY_MESSAGE_MAX_LEN):
def format_stdout_stderr(stdout, stderr):
"""
Format stdout and stderr's output to make it suitable in telemetry.
The goal is to maximize the amount of output given the constraints
Expand All @@ -118,6 +118,7 @@ def format_stdout_stderr(stdout, stderr, max_len=TELEMETRY_MESSAGE_MAX_LEN):
"""
template = "[stdout]\n{0}\n\n[stderr]\n{1}"
# +6 == len("{0}") + len("{1}")
max_len = TELEMETRY_MESSAGE_MAX_LEN
max_len_each = int((max_len - len(template) + 6) / 2)

if max_len_each <= 0:
Expand Down
75 changes: 57 additions & 18 deletions tests/common/test_cgroupapi.py
Original file line number Diff line number Diff line change
Expand Up @@ -21,15 +21,17 @@
import re
import subprocess
import tempfile

from nose.plugins.attrib import attr

from azurelinuxagent.common.cgroupapi import CGroupsApi, FileSystemCgroupsApi, SystemdCgroupsApi, VM_AGENT_CGROUP_NAME
from azurelinuxagent.common.cgroupstelemetry import CGroupsTelemetry
from azurelinuxagent.common.exception import ExtensionError, ExtensionErrorCodes
from azurelinuxagent.common.future import ustr
from azurelinuxagent.common.utils import shellutil, fileutil
from nose.plugins.attrib import attr
from tests.common.mock_cgroup_commands import mock_cgroup_commands
from tests.utils.cgroups_tools import CGroupsTools
from tests.tools import AgentTestCase, patch, skip_if_predicate_false, is_systemd_present, i_am_root, mock_sleep
from tests.utils.cgroups_tools import CGroupsTools


class _MockedFileSystemTestCase(AgentTestCase):
Expand Down Expand Up @@ -61,7 +63,7 @@ def test_cgroups_should_be_supported_only_on_ubuntu_16_and_later(self):
(['centos', '7.5', 'Source'], False),
(['redhat', '7.7', 'Maipo'], False),
(['redhat', '7.7.1908', 'Core'], False),
(['bigip', '15.0.1', 'Final'], False),
(['bigip', '15.0.1', 'Final'], False),
(['gaia', '273.562', 'R80.30'], False),
(['debian' '9.1', ''], False),
]
Expand Down Expand Up @@ -529,7 +531,7 @@ def test_get_processes_in_cgroup_should_return_the_processes_within_the_cgroup(s
processes = SystemdCgroupsApi.get_processes_in_cgroup("/sys/fs/cgroup/cpu/system.slice/walinuxagent.service")

self.assertTrue(len(processes) >= 2,
"The cgroup should contain at least 2 procceses (daemon and extension handler): [{0}]".format(processes))
"The cgroup should contain at least 2 procceses (daemon and extension handler): [{0}]".format(processes))

daemon_present = any("waagent -daemon" in command for (pid, command) in processes)
self.assertTrue(daemon_present, "Could not find the daemon in the cgroup: [{0}]".format(processes))
Expand Down Expand Up @@ -684,15 +686,14 @@ def mock_popen(command, *args, **kwargs):

self.assertEquals(2, len(extension_calls), "The extension should have been invoked exactly twice")
self.assertIn("systemd-run --unit=Microsoft.Compute.TestExtension_1.2.3", extension_calls[0],
"The first call to the extension should have used systemd")
"The first call to the extension should have used systemd")
self.assertEquals(command, extension_calls[1],
"The second call to the extension should not have used systemd")
"The second call to the extension should not have used systemd")

self.assertEquals(len(CGroupsTelemetry._tracked), 0, "No cgroups should have been created")

self.assertIn("TEST_OUTPUT\n", command_output, "The test output was not captured")


@patch('time.sleep', side_effect=lambda _: mock_sleep())
def test_start_extension_command_should_invoke_the_command_directly_if_systemd_times_out(self, _):
# Systemd has its own internal timeout which is shorter than what we define for extension operation timeout.
Expand All @@ -710,8 +711,6 @@ def mock_popen(*args, **kwargs):

return original_popen(new_args, **kwargs)

expected_output = "[stdout]\n{0}\n\n\n[stderr]\n"

with tempfile.TemporaryFile(dir=self.tmp_dir, mode="w+b") as stdout:
with tempfile.TemporaryFile(dir=self.tmp_dir, mode="w+b") as stderr:
with patch("azurelinuxagent.common.cgroupapi.subprocess.Popen", side_effect=mock_popen) as popen_patch:
Expand Down Expand Up @@ -740,29 +739,69 @@ def mock_popen(*args, **kwargs):
@patch('time.sleep', side_effect=lambda _: mock_sleep())
def test_start_extension_command_should_not_use_fallback_option_if_extension_fails(self, *args):
self.assertTrue(i_am_root(), "Test does not run when non-root")
command = "ls folder_does_not_exist"

with tempfile.TemporaryFile(dir=self.tmp_dir, mode="w+b") as stdout:
with tempfile.TemporaryFile(dir=self.tmp_dir, mode="w+b") as stderr:
with patch("azurelinuxagent.common.cgroupapi.subprocess.Popen", wraps=subprocess.Popen) as popen_patch:
with self.assertRaises(ExtensionError) as context_manager:
SystemdCgroupsApi().start_extension_command(
extension_name="Microsoft.Compute.TestExtension-1.2.3",
command=command,
timeout=300,
shell=True,
cwd=self.tmp_dir,
env={},
stdout=stdout,
stderr=stderr)

extension_calls = [args[0] for (args, _) in popen_patch.call_args_list if command in args[0]]

self.assertEquals(1, len(extension_calls), "The extension should have been invoked exactly once")
self.assertIn("systemd-run --unit=Microsoft.Compute.TestExtension_1.2.3", extension_calls[0],
"The first call to the extension should have used systemd")

self.assertEquals(context_manager.exception.code, ExtensionErrorCodes.PluginUnknownFailure)
self.assertIn("Non-zero exit code", ustr(context_manager.exception))
# The scope name should appear in the process output since systemd-run was invoked and stderr
# wasn't truncated.
self.assertIn("Microsoft.Compute.TestExtension_1.2.3", ustr(context_manager.exception))

@attr('requires_sudo')
@patch("azurelinuxagent.common.cgroupapi.add_event")
@patch('time.sleep', side_effect=lambda _: mock_sleep())
@patch("azurelinuxagent.common.utils.extensionprocessutil.TELEMETRY_MESSAGE_MAX_LEN", 5)
def test_start_extension_command_should_not_use_fallback_option_if_extension_fails_with_long_output(self, *args):
self.assertTrue(i_am_root(), "Test does not run when non-root")

long_output = "a"*20 # large enough to ensure both stdout and stderr are truncated
long_stdout_stderr_command = "echo {0} && echo {0} >&2 && ls folder_does_not_exist".format(long_output)

with tempfile.TemporaryFile(dir=self.tmp_dir, mode="w+b") as stdout:
with tempfile.TemporaryFile(dir=self.tmp_dir, mode="w+b") as stderr:
with patch("azurelinuxagent.common.cgroupapi.subprocess.Popen", wraps=subprocess.Popen) \
as patch_mock_popen:
with patch("azurelinuxagent.common.cgroupapi.subprocess.Popen", wraps=subprocess.Popen) as popen_patch:
with self.assertRaises(ExtensionError) as context_manager:
SystemdCgroupsApi().start_extension_command(
extension_name="Microsoft.Compute.TestExtension-1.2.3",
command="ls folder_does_not_exist",
command=long_stdout_stderr_command,
timeout=300,
shell=True,
cwd=self.tmp_dir,
env={},
stdout=stdout,
stderr=stderr)

# We should have invoked the extension command only once, in the systemd-run case
self.assertEquals(1, patch_mock_popen.call_count)
args = patch_mock_popen.call_args[0][0]
self.assertIn("systemd-run --unit", args)
extension_calls = [args[0] for (args, _) in popen_patch.call_args_list if long_stdout_stderr_command in args[0]]

self.assertEquals(1, len(extension_calls), "The extension should have been invoked exactly once")
self.assertIn("systemd-run --unit=Microsoft.Compute.TestExtension_1.2.3", extension_calls[0],
"The first call to the extension should have used systemd")

self.assertEquals(context_manager.exception.code, ExtensionErrorCodes.PluginUnknownFailure)
self.assertIn("Non-zero exit code", ustr(context_manager.exception))
self.assertEquals(context_manager.exception.code, ExtensionErrorCodes.PluginUnknownFailure)
self.assertIn("Non-zero exit code", ustr(context_manager.exception))
# stdout and stderr should have been truncated, so the scope name doesn't appear in stderr
# even though systemd-run ran
self.assertNotIn("Microsoft.Compute.TestExtension_1.2.3", ustr(context_manager.exception))

@attr('requires_sudo')
@patch("azurelinuxagent.common.cgroupapi.add_event")
Expand Down
57 changes: 30 additions & 27 deletions tests/utils/test_extension_process_util.py
Original file line number Diff line number Diff line change
Expand Up @@ -164,29 +164,26 @@ def test_handle_process_completion_should_raise_on_nonzero_exit_code(self):
self.assertEquals(context_manager.exception.code, error_code)
self.assertIn("Non-zero exit code:", ustr(context_manager.exception))

def test_read_output_it_should_return_no_content(self):
def test_read_output_should_return_no_content(self):
with patch('azurelinuxagent.common.utils.extensionprocessutil.TELEMETRY_MESSAGE_MAX_LEN', 0):
expected = "[stdout]\n\n\n[stderr]\n"
expected = ""
actual = read_output(self.stdout, self.stderr)
self.assertEqual(expected, actual)

def test_read_output_it_should_truncate_the_content(self):
with patch('azurelinuxagent.common.utils.extensionprocessutil.TELEMETRY_MESSAGE_MAX_LEN', 10):
expected = "[stdout]\nThe quick \n\n[stderr]\nThe five b"
def test_read_output_should_truncate_the_content(self):
with patch('azurelinuxagent.common.utils.extensionprocessutil.TELEMETRY_MESSAGE_MAX_LEN', 50):
expected = "[stdout]\nr the lazy dog.\n\n" \
"[stderr]\ns jump quickly."
actual = read_output(self.stdout, self.stderr)
self.assertEqual(expected, actual)

def test_read_output_it_should_return_all_content(self):
with patch('azurelinuxagent.common.utils.extensionprocessutil.TELEMETRY_MESSAGE_MAX_LEN', 50):
def test_read_output_should_not_truncate_the_content(self):
with patch('azurelinuxagent.common.utils.extensionprocessutil.TELEMETRY_MESSAGE_MAX_LEN', 90):
expected = "[stdout]\nThe quick brown fox jumps over the lazy dog.\n\n" \
"[stderr]\nThe five boxing wizards jump quickly."
actual = read_output(self.stdout, self.stderr)
self.assertEqual(expected, actual)

def test_read_output_it_should_handle_exceptions(self):
with patch('azurelinuxagent.common.utils.extensionprocessutil.TELEMETRY_MESSAGE_MAX_LEN', "type error"):
actual = read_output(self.stdout, self.stderr)
self.assertIn("Cannot read stdout/stderr", actual)

def test_format_stdout_stderr00(self):
"""
Expand All @@ -197,8 +194,9 @@ def test_format_stdout_stderr00(self):
stderr = "The five boxing wizards jump quickly."

expected = "[stdout]\n{0}\n\n[stderr]\n{1}".format(stdout, stderr)
actual = format_stdout_stderr(stdout, stderr, 1000)
self.assertEqual(expected, actual)
with patch('azurelinuxagent.common.utils.extensionprocessutil.TELEMETRY_MESSAGE_MAX_LEN', 1000):
actual = format_stdout_stderr(stdout, stderr)
self.assertEqual(expected, actual)

def test_format_stdout_stderr01(self):
"""
Expand All @@ -210,9 +208,10 @@ def test_format_stdout_stderr01(self):

# noinspection SpellCheckingInspection
expected = '[stdout]\ns over the lazy dog.\n\n[stderr]\nizards jump quickly.'
actual = format_stdout_stderr(stdout, stderr, 60)
self.assertEqual(expected, actual)
self.assertEqual(60, len(actual))
with patch('azurelinuxagent.common.utils.extensionprocessutil.TELEMETRY_MESSAGE_MAX_LEN', 60):
actual = format_stdout_stderr(stdout, stderr)
self.assertEqual(expected, actual)
self.assertEqual(60, len(actual))

def test_format_stdout_stderr02(self):
"""
Expand All @@ -223,9 +222,10 @@ def test_format_stdout_stderr02(self):
stderr = "The five boxing wizards jump quickly."

expected = '[stdout]\nempty\n\n[stderr]\ns jump quickly.'
actual = format_stdout_stderr(stdout, stderr, 40)
self.assertEqual(expected, actual)
self.assertEqual(40, len(actual))
with patch('azurelinuxagent.common.utils.extensionprocessutil.TELEMETRY_MESSAGE_MAX_LEN', 40):
actual = format_stdout_stderr(stdout, stderr)
self.assertEqual(expected, actual)
self.assertEqual(40, len(actual))

def test_format_stdout_stderr03(self):
"""
Expand All @@ -236,9 +236,10 @@ def test_format_stdout_stderr03(self):
stderr = "empty"

expected = '[stdout]\nr the lazy dog.\n\n[stderr]\nempty'
actual = format_stdout_stderr(stdout, stderr, 40)
self.assertEqual(expected, actual)
self.assertEqual(40, len(actual))
with patch('azurelinuxagent.common.utils.extensionprocessutil.TELEMETRY_MESSAGE_MAX_LEN', 40):
actual = format_stdout_stderr(stdout, stderr)
self.assertEqual(expected, actual)
self.assertEqual(40, len(actual))

def test_format_stdout_stderr04(self):
"""
Expand All @@ -249,15 +250,17 @@ def test_format_stdout_stderr04(self):
stderr = "The five boxing wizards jump quickly."

expected = ''
actual = format_stdout_stderr(stdout, stderr, 4)
self.assertEqual(expected, actual)
self.assertEqual(0, len(actual))
with patch('azurelinuxagent.common.utils.extensionprocessutil.TELEMETRY_MESSAGE_MAX_LEN', 4):
actual = format_stdout_stderr(stdout, stderr)
self.assertEqual(expected, actual)
self.assertEqual(0, len(actual))

def test_format_stdout_stderr05(self):
"""
If stdout and stderr are empty, an empty template is returned.
"""

expected = '[stdout]\n\n\n[stderr]\n'
actual = format_stdout_stderr('', '', 1000)
self.assertEqual(expected, actual)
with patch('azurelinuxagent.common.utils.extensionprocessutil.TELEMETRY_MESSAGE_MAX_LEN', 1000):
actual = format_stdout_stderr('', '')
self.assertEqual(expected, actual)

0 comments on commit 7e3eac1

Please sign in to comment.