Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Fix check for systemd-run failure when invoking extensions #1943

Merged
merged 9 commits into from
Jul 16, 2020
11 changes: 7 additions & 4 deletions azurelinuxagent/common/cgroupapi.py
Original file line number Diff line number Diff line change
Expand Up @@ -31,7 +31,8 @@
from azurelinuxagent.common.future import ustr
from azurelinuxagent.common.osutil import get_osutil
from azurelinuxagent.common.utils import fileutil, shellutil
from azurelinuxagent.common.utils.extensionprocessutil import handle_process_completion, read_output
from azurelinuxagent.common.utils.extensionprocessutil import handle_process_completion, read_output, \
TELEMETRY_MESSAGE_MAX_LEN
from azurelinuxagent.common.utils.flexible_version import FlexibleVersion
from azurelinuxagent.common.version import get_distro

Expand Down Expand Up @@ -652,9 +653,11 @@ def get_processes_in_cgroup(cgroup_path):
return processes

@staticmethod
def _is_systemd_failure(scope_name, process_output):
def _is_systemd_failure(scope_name, stderr):
stderr.seek(0)
stderr = ustr(stderr.read(TELEMETRY_MESSAGE_MAX_LEN), encoding='utf-8', errors='backslashreplace')
unit_not_found = "Unit {0} not found.".format(scope_name)
return unit_not_found in process_output or scope_name not in process_output
return unit_not_found in stderr or scope_name not in stderr

def start_extension_command(self, extension_name, command, timeout, shell, cwd, env, stdout, stderr,
error_code=ExtensionErrorCodes.PluginUnknownFailure):
Expand Down Expand Up @@ -709,8 +712,8 @@ def start_extension_command(self, extension_name, command, timeout, shell, cwd,
except ExtensionError as e:
# The extension didn't terminate successfully. Determine whether it was due to systemd errors or
# extension errors.
systemd_failure = self._is_systemd_failure(scope, stderr)
process_output = read_output(stdout, stderr)
systemd_failure = self._is_systemd_failure(scope, process_output)

if not systemd_failure:
# There was an extension error; it either timed out or returned a non-zero exit code. Re-raise the error
Expand Down
3 changes: 2 additions & 1 deletion azurelinuxagent/common/utils/extensionprocessutil.py
Original file line number Diff line number Diff line change
Expand Up @@ -99,7 +99,7 @@ def read_output(stdout, stderr):
return format_stdout_stderr("", "Cannot read stdout/stderr: {0}".format(ustr(e)))


def format_stdout_stderr(stdout, stderr, max_len=TELEMETRY_MESSAGE_MAX_LEN):
def format_stdout_stderr(stdout, stderr):
Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Lesson to learn: don't assign constants in the function signature since this is loaded before the function body, so patching the constant will not work.

"""
Format stdout and stderr's output to make it suitable in telemetry.
The goal is to maximize the amount of output given the constraints
Expand All @@ -118,6 +118,7 @@ def format_stdout_stderr(stdout, stderr, max_len=TELEMETRY_MESSAGE_MAX_LEN):
"""
template = "[stdout]\n{0}\n\n[stderr]\n{1}"
# +6 == len("{0}") + len("{1}")
max_len = TELEMETRY_MESSAGE_MAX_LEN
max_len_each = int((max_len - len(template) + 6) / 2)

if max_len_each <= 0:
Expand Down
75 changes: 57 additions & 18 deletions tests/common/test_cgroupapi.py
Original file line number Diff line number Diff line change
Expand Up @@ -21,15 +21,17 @@
import re
import subprocess
import tempfile

from nose.plugins.attrib import attr

from azurelinuxagent.common.cgroupapi import CGroupsApi, FileSystemCgroupsApi, SystemdCgroupsApi, VM_AGENT_CGROUP_NAME
from azurelinuxagent.common.cgroupstelemetry import CGroupsTelemetry
from azurelinuxagent.common.exception import ExtensionError, ExtensionErrorCodes
from azurelinuxagent.common.future import ustr
from azurelinuxagent.common.utils import shellutil, fileutil
from nose.plugins.attrib import attr
from tests.common.mock_cgroup_commands import mock_cgroup_commands
from tests.utils.cgroups_tools import CGroupsTools
from tests.tools import AgentTestCase, patch, skip_if_predicate_false, is_systemd_present, i_am_root, mock_sleep
from tests.utils.cgroups_tools import CGroupsTools


class _MockedFileSystemTestCase(AgentTestCase):
Expand Down Expand Up @@ -61,7 +63,7 @@ def test_cgroups_should_be_supported_only_on_ubuntu_16_and_later(self):
(['centos', '7.5', 'Source'], False),
(['redhat', '7.7', 'Maipo'], False),
(['redhat', '7.7.1908', 'Core'], False),
(['bigip', '15.0.1', 'Final'], False),
(['bigip', '15.0.1', 'Final'], False),
(['gaia', '273.562', 'R80.30'], False),
(['debian' '9.1', ''], False),
]
Expand Down Expand Up @@ -529,7 +531,7 @@ def test_get_processes_in_cgroup_should_return_the_processes_within_the_cgroup(s
processes = SystemdCgroupsApi.get_processes_in_cgroup("/sys/fs/cgroup/cpu/system.slice/walinuxagent.service")

self.assertTrue(len(processes) >= 2,
"The cgroup should contain at least 2 procceses (daemon and extension handler): [{0}]".format(processes))
"The cgroup should contain at least 2 procceses (daemon and extension handler): [{0}]".format(processes))

daemon_present = any("waagent -daemon" in command for (pid, command) in processes)
self.assertTrue(daemon_present, "Could not find the daemon in the cgroup: [{0}]".format(processes))
Expand Down Expand Up @@ -684,15 +686,14 @@ def mock_popen(command, *args, **kwargs):

self.assertEquals(2, len(extension_calls), "The extension should have been invoked exactly twice")
self.assertIn("systemd-run --unit=Microsoft.Compute.TestExtension_1.2.3", extension_calls[0],
"The first call to the extension should have used systemd")
"The first call to the extension should have used systemd")
self.assertEquals(command, extension_calls[1],
"The second call to the extension should not have used systemd")
"The second call to the extension should not have used systemd")

self.assertEquals(len(CGroupsTelemetry._tracked), 0, "No cgroups should have been created")

self.assertIn("TEST_OUTPUT\n", command_output, "The test output was not captured")


@patch('time.sleep', side_effect=lambda _: mock_sleep())
def test_start_extension_command_should_invoke_the_command_directly_if_systemd_times_out(self, _):
# Systemd has its own internal timeout which is shorter than what we define for extension operation timeout.
Expand All @@ -710,8 +711,6 @@ def mock_popen(*args, **kwargs):

return original_popen(new_args, **kwargs)

expected_output = "[stdout]\n{0}\n\n\n[stderr]\n"

with tempfile.TemporaryFile(dir=self.tmp_dir, mode="w+b") as stdout:
with tempfile.TemporaryFile(dir=self.tmp_dir, mode="w+b") as stderr:
with patch("azurelinuxagent.common.cgroupapi.subprocess.Popen", side_effect=mock_popen) as popen_patch:
Expand Down Expand Up @@ -740,29 +739,69 @@ def mock_popen(*args, **kwargs):
@patch('time.sleep', side_effect=lambda _: mock_sleep())
def test_start_extension_command_should_not_use_fallback_option_if_extension_fails(self, *args):
self.assertTrue(i_am_root(), "Test does not run when non-root")
command = "ls folder_does_not_exist"

with tempfile.TemporaryFile(dir=self.tmp_dir, mode="w+b") as stdout:
with tempfile.TemporaryFile(dir=self.tmp_dir, mode="w+b") as stderr:
with patch("azurelinuxagent.common.cgroupapi.subprocess.Popen", wraps=subprocess.Popen) as popen_patch:
with self.assertRaises(ExtensionError) as context_manager:
SystemdCgroupsApi().start_extension_command(
extension_name="Microsoft.Compute.TestExtension-1.2.3",
command=command,
timeout=300,
shell=True,
cwd=self.tmp_dir,
env={},
stdout=stdout,
stderr=stderr)

extension_calls = [args[0] for (args, _) in popen_patch.call_args_list if command in args[0]]
Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This is where the indentation was wrong. It was at the same level with the context manager, instead of outside, like now.

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

But this new code... you mentioned it was in existing code

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

ok, i see it now

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

the assert for root was hiding the issue

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

but it should've failed in travis then right?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

It didn't fail because these asserts were never executed. I ran the test both from sudo shell and from sudo PyCharm and could never hit the asserts. I'm guessing it has something to do with the context manager which just exits the current scope when the object is released?

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

ohh dang. Good catch though! Hopefully we can catch these moving on

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I'm guessing it has something to do with the context manager which just exits the current scope when the object is released?

The test case raises, so the call to start_extension_command will exit the with statement.

I seem to remember at some point I formatted a few tests in this file so that may have moved the assert to the with statement.


self.assertEquals(1, len(extension_calls), "The extension should have been invoked exactly once")
self.assertIn("systemd-run --unit=Microsoft.Compute.TestExtension_1.2.3", extension_calls[0],
"The first call to the extension should have used systemd")

self.assertEquals(context_manager.exception.code, ExtensionErrorCodes.PluginUnknownFailure)
self.assertIn("Non-zero exit code", ustr(context_manager.exception))
# The scope name should appear in the process output since systemd-run was invoked and stderr
# wasn't truncated.
self.assertIn("Microsoft.Compute.TestExtension_1.2.3", ustr(context_manager.exception))

@attr('requires_sudo')
@patch("azurelinuxagent.common.cgroupapi.add_event")
@patch('time.sleep', side_effect=lambda _: mock_sleep())
@patch("azurelinuxagent.common.utils.extensionprocessutil.TELEMETRY_MESSAGE_MAX_LEN", 5)
def test_start_extension_command_should_not_use_fallback_option_if_extension_fails_with_long_output(self, *args):
Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

New test to cover long extension outputs. This would fail before the changes in this PR.

self.assertTrue(i_am_root(), "Test does not run when non-root")

long_output = "a"*20 # large enough to ensure both stdout and stderr are truncated
long_stdout_stderr_command = "echo {0} && echo {0} >&2 && ls folder_does_not_exist".format(long_output)

with tempfile.TemporaryFile(dir=self.tmp_dir, mode="w+b") as stdout:
with tempfile.TemporaryFile(dir=self.tmp_dir, mode="w+b") as stderr:
with patch("azurelinuxagent.common.cgroupapi.subprocess.Popen", wraps=subprocess.Popen) \
as patch_mock_popen:
with patch("azurelinuxagent.common.cgroupapi.subprocess.Popen", wraps=subprocess.Popen) as popen_patch:
with self.assertRaises(ExtensionError) as context_manager:
SystemdCgroupsApi().start_extension_command(
extension_name="Microsoft.Compute.TestExtension-1.2.3",
command="ls folder_does_not_exist",
command=long_stdout_stderr_command,
timeout=300,
shell=True,
cwd=self.tmp_dir,
env={},
stdout=stdout,
stderr=stderr)

# We should have invoked the extension command only once, in the systemd-run case
self.assertEquals(1, patch_mock_popen.call_count)
args = patch_mock_popen.call_args[0][0]
self.assertIn("systemd-run --unit", args)
extension_calls = [args[0] for (args, _) in popen_patch.call_args_list if long_stdout_stderr_command in args[0]]

self.assertEquals(1, len(extension_calls), "The extension should have been invoked exactly once")
self.assertIn("systemd-run --unit=Microsoft.Compute.TestExtension_1.2.3", extension_calls[0],
"The first call to the extension should have used systemd")

self.assertEquals(context_manager.exception.code, ExtensionErrorCodes.PluginUnknownFailure)
self.assertIn("Non-zero exit code", ustr(context_manager.exception))
self.assertEquals(context_manager.exception.code, ExtensionErrorCodes.PluginUnknownFailure)
self.assertIn("Non-zero exit code", ustr(context_manager.exception))
# stdout and stderr should have been truncated, so the scope name doesn't appear in stderr
# even though systemd-run ran
self.assertNotIn("Microsoft.Compute.TestExtension_1.2.3", ustr(context_manager.exception))

@attr('requires_sudo')
@patch("azurelinuxagent.common.cgroupapi.add_event")
Expand Down
57 changes: 30 additions & 27 deletions tests/utils/test_extension_process_util.py
Original file line number Diff line number Diff line change
Expand Up @@ -164,29 +164,26 @@ def test_handle_process_completion_should_raise_on_nonzero_exit_code(self):
self.assertEquals(context_manager.exception.code, error_code)
self.assertIn("Non-zero exit code:", ustr(context_manager.exception))

def test_read_output_it_should_return_no_content(self):
def test_read_output_should_return_no_content(self):
Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Once I fixed the patching for TELEMETRY_MESSAGE_MAX_LEN, these tests produced different (correct) results :)

with patch('azurelinuxagent.common.utils.extensionprocessutil.TELEMETRY_MESSAGE_MAX_LEN', 0):
expected = "[stdout]\n\n\n[stderr]\n"
expected = ""
actual = read_output(self.stdout, self.stderr)
self.assertEqual(expected, actual)

def test_read_output_it_should_truncate_the_content(self):
with patch('azurelinuxagent.common.utils.extensionprocessutil.TELEMETRY_MESSAGE_MAX_LEN', 10):
expected = "[stdout]\nThe quick \n\n[stderr]\nThe five b"
def test_read_output_should_truncate_the_content(self):
with patch('azurelinuxagent.common.utils.extensionprocessutil.TELEMETRY_MESSAGE_MAX_LEN', 50):
expected = "[stdout]\nr the lazy dog.\n\n" \
"[stderr]\ns jump quickly."
actual = read_output(self.stdout, self.stderr)
self.assertEqual(expected, actual)

def test_read_output_it_should_return_all_content(self):
with patch('azurelinuxagent.common.utils.extensionprocessutil.TELEMETRY_MESSAGE_MAX_LEN', 50):
def test_read_output_should_not_truncate_the_content(self):
with patch('azurelinuxagent.common.utils.extensionprocessutil.TELEMETRY_MESSAGE_MAX_LEN', 90):
expected = "[stdout]\nThe quick brown fox jumps over the lazy dog.\n\n" \
"[stderr]\nThe five boxing wizards jump quickly."
actual = read_output(self.stdout, self.stderr)
self.assertEqual(expected, actual)

def test_read_output_it_should_handle_exceptions(self):
with patch('azurelinuxagent.common.utils.extensionprocessutil.TELEMETRY_MESSAGE_MAX_LEN', "type error"):
actual = read_output(self.stdout, self.stderr)
self.assertIn("Cannot read stdout/stderr", actual)

def test_format_stdout_stderr00(self):
"""
Expand All @@ -197,8 +194,9 @@ def test_format_stdout_stderr00(self):
stderr = "The five boxing wizards jump quickly."

expected = "[stdout]\n{0}\n\n[stderr]\n{1}".format(stdout, stderr)
actual = format_stdout_stderr(stdout, stderr, 1000)
self.assertEqual(expected, actual)
with patch('azurelinuxagent.common.utils.extensionprocessutil.TELEMETRY_MESSAGE_MAX_LEN', 1000):
actual = format_stdout_stderr(stdout, stderr)
self.assertEqual(expected, actual)

def test_format_stdout_stderr01(self):
"""
Expand All @@ -210,9 +208,10 @@ def test_format_stdout_stderr01(self):

# noinspection SpellCheckingInspection
expected = '[stdout]\ns over the lazy dog.\n\n[stderr]\nizards jump quickly.'
actual = format_stdout_stderr(stdout, stderr, 60)
self.assertEqual(expected, actual)
self.assertEqual(60, len(actual))
with patch('azurelinuxagent.common.utils.extensionprocessutil.TELEMETRY_MESSAGE_MAX_LEN', 60):
actual = format_stdout_stderr(stdout, stderr)
self.assertEqual(expected, actual)
self.assertEqual(60, len(actual))

def test_format_stdout_stderr02(self):
"""
Expand All @@ -223,9 +222,10 @@ def test_format_stdout_stderr02(self):
stderr = "The five boxing wizards jump quickly."

expected = '[stdout]\nempty\n\n[stderr]\ns jump quickly.'
actual = format_stdout_stderr(stdout, stderr, 40)
self.assertEqual(expected, actual)
self.assertEqual(40, len(actual))
with patch('azurelinuxagent.common.utils.extensionprocessutil.TELEMETRY_MESSAGE_MAX_LEN', 40):
actual = format_stdout_stderr(stdout, stderr)
self.assertEqual(expected, actual)
self.assertEqual(40, len(actual))

def test_format_stdout_stderr03(self):
"""
Expand All @@ -236,9 +236,10 @@ def test_format_stdout_stderr03(self):
stderr = "empty"

expected = '[stdout]\nr the lazy dog.\n\n[stderr]\nempty'
actual = format_stdout_stderr(stdout, stderr, 40)
self.assertEqual(expected, actual)
self.assertEqual(40, len(actual))
with patch('azurelinuxagent.common.utils.extensionprocessutil.TELEMETRY_MESSAGE_MAX_LEN', 40):
actual = format_stdout_stderr(stdout, stderr)
self.assertEqual(expected, actual)
self.assertEqual(40, len(actual))

def test_format_stdout_stderr04(self):
"""
Expand All @@ -249,15 +250,17 @@ def test_format_stdout_stderr04(self):
stderr = "The five boxing wizards jump quickly."

expected = ''
actual = format_stdout_stderr(stdout, stderr, 4)
self.assertEqual(expected, actual)
self.assertEqual(0, len(actual))
with patch('azurelinuxagent.common.utils.extensionprocessutil.TELEMETRY_MESSAGE_MAX_LEN', 4):
actual = format_stdout_stderr(stdout, stderr)
self.assertEqual(expected, actual)
self.assertEqual(0, len(actual))

def test_format_stdout_stderr05(self):
"""
If stdout and stderr are empty, an empty template is returned.
"""

expected = '[stdout]\n\n\n[stderr]\n'
actual = format_stdout_stderr('', '', 1000)
self.assertEqual(expected, actual)
with patch('azurelinuxagent.common.utils.extensionprocessutil.TELEMETRY_MESSAGE_MAX_LEN', 1000):
actual = format_stdout_stderr('', '')
self.assertEqual(expected, actual)