From 7e3eac100eb40baffb39a2510c36b66238948488 Mon Sep 17 00:00:00 2001 From: Paula Gombar Date: Thu, 16 Jul 2020 10:29:16 -0700 Subject: [PATCH] Fix check for systemd-run failure when invoking extensions (#1943) --- azurelinuxagent/common/cgroupapi.py | 11 ++- .../common/utils/extensionprocessutil.py | 3 +- tests/common/test_cgroupapi.py | 75 ++++++++++++++----- tests/utils/test_extension_process_util.py | 57 +++++++------- 4 files changed, 96 insertions(+), 50 deletions(-) diff --git a/azurelinuxagent/common/cgroupapi.py b/azurelinuxagent/common/cgroupapi.py index 3cda3b9e72..576e3af126 100644 --- a/azurelinuxagent/common/cgroupapi.py +++ b/azurelinuxagent/common/cgroupapi.py @@ -31,7 +31,8 @@ from azurelinuxagent.common.future import ustr from azurelinuxagent.common.osutil import get_osutil from azurelinuxagent.common.utils import fileutil, shellutil -from azurelinuxagent.common.utils.extensionprocessutil import handle_process_completion, read_output +from azurelinuxagent.common.utils.extensionprocessutil import handle_process_completion, read_output, \ + TELEMETRY_MESSAGE_MAX_LEN from azurelinuxagent.common.utils.flexible_version import FlexibleVersion from azurelinuxagent.common.version import get_distro @@ -652,9 +653,11 @@ def get_processes_in_cgroup(cgroup_path): return processes @staticmethod - def _is_systemd_failure(scope_name, process_output): + def _is_systemd_failure(scope_name, stderr): + stderr.seek(0) + stderr = ustr(stderr.read(TELEMETRY_MESSAGE_MAX_LEN), encoding='utf-8', errors='backslashreplace') unit_not_found = "Unit {0} not found.".format(scope_name) - return unit_not_found in process_output or scope_name not in process_output + return unit_not_found in stderr or scope_name not in stderr def start_extension_command(self, extension_name, command, timeout, shell, cwd, env, stdout, stderr, error_code=ExtensionErrorCodes.PluginUnknownFailure): @@ -709,8 +712,8 @@ def start_extension_command(self, extension_name, command, timeout, shell, cwd, except ExtensionError as e: # The extension didn't terminate successfully. Determine whether it was due to systemd errors or # extension errors. + systemd_failure = self._is_systemd_failure(scope, stderr) process_output = read_output(stdout, stderr) - systemd_failure = self._is_systemd_failure(scope, process_output) if not systemd_failure: # There was an extension error; it either timed out or returned a non-zero exit code. Re-raise the error diff --git a/azurelinuxagent/common/utils/extensionprocessutil.py b/azurelinuxagent/common/utils/extensionprocessutil.py index 9197bb4bfe..7c3f5b62d5 100644 --- a/azurelinuxagent/common/utils/extensionprocessutil.py +++ b/azurelinuxagent/common/utils/extensionprocessutil.py @@ -99,7 +99,7 @@ def read_output(stdout, stderr): return format_stdout_stderr("", "Cannot read stdout/stderr: {0}".format(ustr(e))) -def format_stdout_stderr(stdout, stderr, max_len=TELEMETRY_MESSAGE_MAX_LEN): +def format_stdout_stderr(stdout, stderr): """ Format stdout and stderr's output to make it suitable in telemetry. The goal is to maximize the amount of output given the constraints @@ -118,6 +118,7 @@ def format_stdout_stderr(stdout, stderr, max_len=TELEMETRY_MESSAGE_MAX_LEN): """ template = "[stdout]\n{0}\n\n[stderr]\n{1}" # +6 == len("{0}") + len("{1}") + max_len = TELEMETRY_MESSAGE_MAX_LEN max_len_each = int((max_len - len(template) + 6) / 2) if max_len_each <= 0: diff --git a/tests/common/test_cgroupapi.py b/tests/common/test_cgroupapi.py index 3ceb972d7c..8104dc6001 100644 --- a/tests/common/test_cgroupapi.py +++ b/tests/common/test_cgroupapi.py @@ -21,15 +21,17 @@ import re import subprocess import tempfile + +from nose.plugins.attrib import attr + from azurelinuxagent.common.cgroupapi import CGroupsApi, FileSystemCgroupsApi, SystemdCgroupsApi, VM_AGENT_CGROUP_NAME from azurelinuxagent.common.cgroupstelemetry import CGroupsTelemetry from azurelinuxagent.common.exception import ExtensionError, ExtensionErrorCodes from azurelinuxagent.common.future import ustr from azurelinuxagent.common.utils import shellutil, fileutil -from nose.plugins.attrib import attr from tests.common.mock_cgroup_commands import mock_cgroup_commands -from tests.utils.cgroups_tools import CGroupsTools from tests.tools import AgentTestCase, patch, skip_if_predicate_false, is_systemd_present, i_am_root, mock_sleep +from tests.utils.cgroups_tools import CGroupsTools class _MockedFileSystemTestCase(AgentTestCase): @@ -61,7 +63,7 @@ def test_cgroups_should_be_supported_only_on_ubuntu_16_and_later(self): (['centos', '7.5', 'Source'], False), (['redhat', '7.7', 'Maipo'], False), (['redhat', '7.7.1908', 'Core'], False), - (['bigip', '15.0.1', 'Final'], False), + (['bigip', '15.0.1', 'Final'], False), (['gaia', '273.562', 'R80.30'], False), (['debian' '9.1', ''], False), ] @@ -529,7 +531,7 @@ def test_get_processes_in_cgroup_should_return_the_processes_within_the_cgroup(s processes = SystemdCgroupsApi.get_processes_in_cgroup("/sys/fs/cgroup/cpu/system.slice/walinuxagent.service") self.assertTrue(len(processes) >= 2, - "The cgroup should contain at least 2 procceses (daemon and extension handler): [{0}]".format(processes)) + "The cgroup should contain at least 2 procceses (daemon and extension handler): [{0}]".format(processes)) daemon_present = any("waagent -daemon" in command for (pid, command) in processes) self.assertTrue(daemon_present, "Could not find the daemon in the cgroup: [{0}]".format(processes)) @@ -684,15 +686,14 @@ def mock_popen(command, *args, **kwargs): self.assertEquals(2, len(extension_calls), "The extension should have been invoked exactly twice") self.assertIn("systemd-run --unit=Microsoft.Compute.TestExtension_1.2.3", extension_calls[0], - "The first call to the extension should have used systemd") + "The first call to the extension should have used systemd") self.assertEquals(command, extension_calls[1], - "The second call to the extension should not have used systemd") + "The second call to the extension should not have used systemd") self.assertEquals(len(CGroupsTelemetry._tracked), 0, "No cgroups should have been created") self.assertIn("TEST_OUTPUT\n", command_output, "The test output was not captured") - @patch('time.sleep', side_effect=lambda _: mock_sleep()) def test_start_extension_command_should_invoke_the_command_directly_if_systemd_times_out(self, _): # Systemd has its own internal timeout which is shorter than what we define for extension operation timeout. @@ -710,8 +711,6 @@ def mock_popen(*args, **kwargs): return original_popen(new_args, **kwargs) - expected_output = "[stdout]\n{0}\n\n\n[stderr]\n" - with tempfile.TemporaryFile(dir=self.tmp_dir, mode="w+b") as stdout: with tempfile.TemporaryFile(dir=self.tmp_dir, mode="w+b") as stderr: with patch("azurelinuxagent.common.cgroupapi.subprocess.Popen", side_effect=mock_popen) as popen_patch: @@ -740,15 +739,51 @@ def mock_popen(*args, **kwargs): @patch('time.sleep', side_effect=lambda _: mock_sleep()) def test_start_extension_command_should_not_use_fallback_option_if_extension_fails(self, *args): self.assertTrue(i_am_root(), "Test does not run when non-root") + command = "ls folder_does_not_exist" + + with tempfile.TemporaryFile(dir=self.tmp_dir, mode="w+b") as stdout: + with tempfile.TemporaryFile(dir=self.tmp_dir, mode="w+b") as stderr: + with patch("azurelinuxagent.common.cgroupapi.subprocess.Popen", wraps=subprocess.Popen) as popen_patch: + with self.assertRaises(ExtensionError) as context_manager: + SystemdCgroupsApi().start_extension_command( + extension_name="Microsoft.Compute.TestExtension-1.2.3", + command=command, + timeout=300, + shell=True, + cwd=self.tmp_dir, + env={}, + stdout=stdout, + stderr=stderr) + + extension_calls = [args[0] for (args, _) in popen_patch.call_args_list if command in args[0]] + + self.assertEquals(1, len(extension_calls), "The extension should have been invoked exactly once") + self.assertIn("systemd-run --unit=Microsoft.Compute.TestExtension_1.2.3", extension_calls[0], + "The first call to the extension should have used systemd") + + self.assertEquals(context_manager.exception.code, ExtensionErrorCodes.PluginUnknownFailure) + self.assertIn("Non-zero exit code", ustr(context_manager.exception)) + # The scope name should appear in the process output since systemd-run was invoked and stderr + # wasn't truncated. + self.assertIn("Microsoft.Compute.TestExtension_1.2.3", ustr(context_manager.exception)) + + @attr('requires_sudo') + @patch("azurelinuxagent.common.cgroupapi.add_event") + @patch('time.sleep', side_effect=lambda _: mock_sleep()) + @patch("azurelinuxagent.common.utils.extensionprocessutil.TELEMETRY_MESSAGE_MAX_LEN", 5) + def test_start_extension_command_should_not_use_fallback_option_if_extension_fails_with_long_output(self, *args): + self.assertTrue(i_am_root(), "Test does not run when non-root") + + long_output = "a"*20 # large enough to ensure both stdout and stderr are truncated + long_stdout_stderr_command = "echo {0} && echo {0} >&2 && ls folder_does_not_exist".format(long_output) with tempfile.TemporaryFile(dir=self.tmp_dir, mode="w+b") as stdout: with tempfile.TemporaryFile(dir=self.tmp_dir, mode="w+b") as stderr: - with patch("azurelinuxagent.common.cgroupapi.subprocess.Popen", wraps=subprocess.Popen) \ - as patch_mock_popen: + with patch("azurelinuxagent.common.cgroupapi.subprocess.Popen", wraps=subprocess.Popen) as popen_patch: with self.assertRaises(ExtensionError) as context_manager: SystemdCgroupsApi().start_extension_command( extension_name="Microsoft.Compute.TestExtension-1.2.3", - command="ls folder_does_not_exist", + command=long_stdout_stderr_command, timeout=300, shell=True, cwd=self.tmp_dir, @@ -756,13 +791,17 @@ def test_start_extension_command_should_not_use_fallback_option_if_extension_fai stdout=stdout, stderr=stderr) - # We should have invoked the extension command only once, in the systemd-run case - self.assertEquals(1, patch_mock_popen.call_count) - args = patch_mock_popen.call_args[0][0] - self.assertIn("systemd-run --unit", args) + extension_calls = [args[0] for (args, _) in popen_patch.call_args_list if long_stdout_stderr_command in args[0]] + + self.assertEquals(1, len(extension_calls), "The extension should have been invoked exactly once") + self.assertIn("systemd-run --unit=Microsoft.Compute.TestExtension_1.2.3", extension_calls[0], + "The first call to the extension should have used systemd") - self.assertEquals(context_manager.exception.code, ExtensionErrorCodes.PluginUnknownFailure) - self.assertIn("Non-zero exit code", ustr(context_manager.exception)) + self.assertEquals(context_manager.exception.code, ExtensionErrorCodes.PluginUnknownFailure) + self.assertIn("Non-zero exit code", ustr(context_manager.exception)) + # stdout and stderr should have been truncated, so the scope name doesn't appear in stderr + # even though systemd-run ran + self.assertNotIn("Microsoft.Compute.TestExtension_1.2.3", ustr(context_manager.exception)) @attr('requires_sudo') @patch("azurelinuxagent.common.cgroupapi.add_event") diff --git a/tests/utils/test_extension_process_util.py b/tests/utils/test_extension_process_util.py index c51f78afce..0ef7803513 100644 --- a/tests/utils/test_extension_process_util.py +++ b/tests/utils/test_extension_process_util.py @@ -164,29 +164,26 @@ def test_handle_process_completion_should_raise_on_nonzero_exit_code(self): self.assertEquals(context_manager.exception.code, error_code) self.assertIn("Non-zero exit code:", ustr(context_manager.exception)) - def test_read_output_it_should_return_no_content(self): + def test_read_output_should_return_no_content(self): with patch('azurelinuxagent.common.utils.extensionprocessutil.TELEMETRY_MESSAGE_MAX_LEN', 0): - expected = "[stdout]\n\n\n[stderr]\n" + expected = "" actual = read_output(self.stdout, self.stderr) self.assertEqual(expected, actual) - def test_read_output_it_should_truncate_the_content(self): - with patch('azurelinuxagent.common.utils.extensionprocessutil.TELEMETRY_MESSAGE_MAX_LEN', 10): - expected = "[stdout]\nThe quick \n\n[stderr]\nThe five b" + def test_read_output_should_truncate_the_content(self): + with patch('azurelinuxagent.common.utils.extensionprocessutil.TELEMETRY_MESSAGE_MAX_LEN', 50): + expected = "[stdout]\nr the lazy dog.\n\n" \ + "[stderr]\ns jump quickly." actual = read_output(self.stdout, self.stderr) self.assertEqual(expected, actual) - def test_read_output_it_should_return_all_content(self): - with patch('azurelinuxagent.common.utils.extensionprocessutil.TELEMETRY_MESSAGE_MAX_LEN', 50): + def test_read_output_should_not_truncate_the_content(self): + with patch('azurelinuxagent.common.utils.extensionprocessutil.TELEMETRY_MESSAGE_MAX_LEN', 90): expected = "[stdout]\nThe quick brown fox jumps over the lazy dog.\n\n" \ "[stderr]\nThe five boxing wizards jump quickly." actual = read_output(self.stdout, self.stderr) self.assertEqual(expected, actual) - def test_read_output_it_should_handle_exceptions(self): - with patch('azurelinuxagent.common.utils.extensionprocessutil.TELEMETRY_MESSAGE_MAX_LEN', "type error"): - actual = read_output(self.stdout, self.stderr) - self.assertIn("Cannot read stdout/stderr", actual) def test_format_stdout_stderr00(self): """ @@ -197,8 +194,9 @@ def test_format_stdout_stderr00(self): stderr = "The five boxing wizards jump quickly." expected = "[stdout]\n{0}\n\n[stderr]\n{1}".format(stdout, stderr) - actual = format_stdout_stderr(stdout, stderr, 1000) - self.assertEqual(expected, actual) + with patch('azurelinuxagent.common.utils.extensionprocessutil.TELEMETRY_MESSAGE_MAX_LEN', 1000): + actual = format_stdout_stderr(stdout, stderr) + self.assertEqual(expected, actual) def test_format_stdout_stderr01(self): """ @@ -210,9 +208,10 @@ def test_format_stdout_stderr01(self): # noinspection SpellCheckingInspection expected = '[stdout]\ns over the lazy dog.\n\n[stderr]\nizards jump quickly.' - actual = format_stdout_stderr(stdout, stderr, 60) - self.assertEqual(expected, actual) - self.assertEqual(60, len(actual)) + with patch('azurelinuxagent.common.utils.extensionprocessutil.TELEMETRY_MESSAGE_MAX_LEN', 60): + actual = format_stdout_stderr(stdout, stderr) + self.assertEqual(expected, actual) + self.assertEqual(60, len(actual)) def test_format_stdout_stderr02(self): """ @@ -223,9 +222,10 @@ def test_format_stdout_stderr02(self): stderr = "The five boxing wizards jump quickly." expected = '[stdout]\nempty\n\n[stderr]\ns jump quickly.' - actual = format_stdout_stderr(stdout, stderr, 40) - self.assertEqual(expected, actual) - self.assertEqual(40, len(actual)) + with patch('azurelinuxagent.common.utils.extensionprocessutil.TELEMETRY_MESSAGE_MAX_LEN', 40): + actual = format_stdout_stderr(stdout, stderr) + self.assertEqual(expected, actual) + self.assertEqual(40, len(actual)) def test_format_stdout_stderr03(self): """ @@ -236,9 +236,10 @@ def test_format_stdout_stderr03(self): stderr = "empty" expected = '[stdout]\nr the lazy dog.\n\n[stderr]\nempty' - actual = format_stdout_stderr(stdout, stderr, 40) - self.assertEqual(expected, actual) - self.assertEqual(40, len(actual)) + with patch('azurelinuxagent.common.utils.extensionprocessutil.TELEMETRY_MESSAGE_MAX_LEN', 40): + actual = format_stdout_stderr(stdout, stderr) + self.assertEqual(expected, actual) + self.assertEqual(40, len(actual)) def test_format_stdout_stderr04(self): """ @@ -249,9 +250,10 @@ def test_format_stdout_stderr04(self): stderr = "The five boxing wizards jump quickly." expected = '' - actual = format_stdout_stderr(stdout, stderr, 4) - self.assertEqual(expected, actual) - self.assertEqual(0, len(actual)) + with patch('azurelinuxagent.common.utils.extensionprocessutil.TELEMETRY_MESSAGE_MAX_LEN', 4): + actual = format_stdout_stderr(stdout, stderr) + self.assertEqual(expected, actual) + self.assertEqual(0, len(actual)) def test_format_stdout_stderr05(self): """ @@ -259,5 +261,6 @@ def test_format_stdout_stderr05(self): """ expected = '[stdout]\n\n\n[stderr]\n' - actual = format_stdout_stderr('', '', 1000) - self.assertEqual(expected, actual) + with patch('azurelinuxagent.common.utils.extensionprocessutil.TELEMETRY_MESSAGE_MAX_LEN', 1000): + actual = format_stdout_stderr('', '') + self.assertEqual(expected, actual)