Fix check for systemd-run failure when invoking extensions (#1943)

Azure · Jul 16, 2020 · 7e3eac1 · 7e3eac1
1 parent 0edea53
commit 7e3eac1
Show file tree

Hide file tree

Showing 4 changed files with 96 additions and 50 deletions.
diff --git a/azurelinuxagent/common/cgroupapi.py b/azurelinuxagent/common/cgroupapi.py
@@ -31,7 +31,8 @@
 from azurelinuxagent.common.future import ustr
 from azurelinuxagent.common.osutil import get_osutil
 from azurelinuxagent.common.utils import fileutil, shellutil
-from azurelinuxagent.common.utils.extensionprocessutil import handle_process_completion, read_output
+from azurelinuxagent.common.utils.extensionprocessutil import handle_process_completion, read_output, \
+    TELEMETRY_MESSAGE_MAX_LEN
 from azurelinuxagent.common.utils.flexible_version import FlexibleVersion
 from azurelinuxagent.common.version import get_distro
 
@@ -652,9 +653,11 @@ def get_processes_in_cgroup(cgroup_path):
         return processes
 
     @staticmethod
-    def _is_systemd_failure(scope_name, process_output):
+    def _is_systemd_failure(scope_name, stderr):
+        stderr.seek(0)
+        stderr = ustr(stderr.read(TELEMETRY_MESSAGE_MAX_LEN), encoding='utf-8', errors='backslashreplace')
         unit_not_found = "Unit {0} not found.".format(scope_name)
-        return unit_not_found in process_output or scope_name not in process_output
+        return unit_not_found in stderr or scope_name not in stderr
 
     def start_extension_command(self, extension_name, command, timeout, shell, cwd, env, stdout, stderr,
                                 error_code=ExtensionErrorCodes.PluginUnknownFailure):
@@ -709,8 +712,8 @@ def start_extension_command(self, extension_name, command, timeout, shell, cwd,
         except ExtensionError as e:
             # The extension didn't terminate successfully. Determine whether it was due to systemd errors or
             # extension errors.
+            systemd_failure = self._is_systemd_failure(scope, stderr)
             process_output = read_output(stdout, stderr)
-            systemd_failure = self._is_systemd_failure(scope, process_output)
 
             if not systemd_failure:
                 # There was an extension error; it either timed out or returned a non-zero exit code. Re-raise the error

diff --git a/azurelinuxagent/common/utils/extensionprocessutil.py b/azurelinuxagent/common/utils/extensionprocessutil.py
@@ -99,7 +99,7 @@ def read_output(stdout, stderr):
         return format_stdout_stderr("", "Cannot read stdout/stderr: {0}".format(ustr(e)))
 
 
-def format_stdout_stderr(stdout, stderr, max_len=TELEMETRY_MESSAGE_MAX_LEN):
+def format_stdout_stderr(stdout, stderr):
     """
     Format stdout and stderr's output to make it suitable in telemetry.
     The goal is to maximize the amount of output given the constraints
@@ -118,6 +118,7 @@ def format_stdout_stderr(stdout, stderr, max_len=TELEMETRY_MESSAGE_MAX_LEN):
     """
     template = "[stdout]\n{0}\n\n[stderr]\n{1}"
     # +6 == len("{0}") + len("{1}")
+    max_len = TELEMETRY_MESSAGE_MAX_LEN
     max_len_each = int((max_len - len(template) + 6) / 2)
 
     if max_len_each <= 0:

diff --git a/tests/common/test_cgroupapi.py b/tests/common/test_cgroupapi.py
@@ -21,15 +21,17 @@
 import re
 import subprocess
 import tempfile
+
+from nose.plugins.attrib import attr
+
 from azurelinuxagent.common.cgroupapi import CGroupsApi, FileSystemCgroupsApi, SystemdCgroupsApi, VM_AGENT_CGROUP_NAME
 from azurelinuxagent.common.cgroupstelemetry import CGroupsTelemetry
 from azurelinuxagent.common.exception import ExtensionError, ExtensionErrorCodes
 from azurelinuxagent.common.future import ustr
 from azurelinuxagent.common.utils import shellutil, fileutil
-from nose.plugins.attrib import attr
 from tests.common.mock_cgroup_commands import mock_cgroup_commands
-from tests.utils.cgroups_tools import CGroupsTools
 from tests.tools import AgentTestCase, patch, skip_if_predicate_false, is_systemd_present, i_am_root, mock_sleep
+from tests.utils.cgroups_tools import CGroupsTools
 
 
 class _MockedFileSystemTestCase(AgentTestCase):
@@ -61,7 +63,7 @@ def test_cgroups_should_be_supported_only_on_ubuntu_16_and_later(self):
             (['centos', '7.5', 'Source'], False),
             (['redhat', '7.7', 'Maipo'], False),
             (['redhat', '7.7.1908', 'Core'], False),
-            (['bigip',  '15.0.1', 'Final'], False),
+            (['bigip', '15.0.1', 'Final'], False),
             (['gaia', '273.562', 'R80.30'], False),
             (['debian' '9.1', ''], False),
         ]
@@ -529,7 +531,7 @@ def test_get_processes_in_cgroup_should_return_the_processes_within_the_cgroup(s
             processes = SystemdCgroupsApi.get_processes_in_cgroup("/sys/fs/cgroup/cpu/system.slice/walinuxagent.service")
 
             self.assertTrue(len(processes) >= 2,
-                "The cgroup should contain at least 2 procceses (daemon and extension handler): [{0}]".format(processes))
+                            "The cgroup should contain at least 2 procceses (daemon and extension handler): [{0}]".format(processes))
 
             daemon_present = any("waagent -daemon" in command for (pid, command) in processes)
             self.assertTrue(daemon_present, "Could not find the daemon in the cgroup: [{0}]".format(processes))
@@ -684,15 +686,14 @@ def mock_popen(command, *args, **kwargs):
 
                     self.assertEquals(2, len(extension_calls), "The extension should have been invoked exactly twice")
                     self.assertIn("systemd-run --unit=Microsoft.Compute.TestExtension_1.2.3", extension_calls[0],
-                        "The first call to the extension should have used systemd")
+                                  "The first call to the extension should have used systemd")
                     self.assertEquals(command, extension_calls[1],
-                        "The second call to the extension should not have used systemd")
+                                      "The second call to the extension should not have used systemd")
 
                     self.assertEquals(len(CGroupsTelemetry._tracked), 0, "No cgroups should have been created")
 
                     self.assertIn("TEST_OUTPUT\n", command_output, "The test output was not captured")
 
-
     @patch('time.sleep', side_effect=lambda _: mock_sleep())
     def test_start_extension_command_should_invoke_the_command_directly_if_systemd_times_out(self, _):
         # Systemd has its own internal timeout which is shorter than what we define for extension operation timeout.
@@ -710,8 +711,6 @@ def mock_popen(*args, **kwargs):
 
             return original_popen(new_args, **kwargs)
 
-        expected_output = "[stdout]\n{0}\n\n\n[stderr]\n"
-
         with tempfile.TemporaryFile(dir=self.tmp_dir, mode="w+b") as stdout:
             with tempfile.TemporaryFile(dir=self.tmp_dir, mode="w+b") as stderr:
                 with patch("azurelinuxagent.common.cgroupapi.subprocess.Popen", side_effect=mock_popen) as popen_patch:
@@ -740,29 +739,69 @@ def mock_popen(*args, **kwargs):
     @patch('time.sleep', side_effect=lambda _: mock_sleep())
     def test_start_extension_command_should_not_use_fallback_option_if_extension_fails(self, *args):
         self.assertTrue(i_am_root(), "Test does not run when non-root")
+        command = "ls folder_does_not_exist"
+
+        with tempfile.TemporaryFile(dir=self.tmp_dir, mode="w+b") as stdout:
+            with tempfile.TemporaryFile(dir=self.tmp_dir, mode="w+b") as stderr:
+                with patch("azurelinuxagent.common.cgroupapi.subprocess.Popen", wraps=subprocess.Popen) as popen_patch:
+                    with self.assertRaises(ExtensionError) as context_manager:
+                        SystemdCgroupsApi().start_extension_command(
+                            extension_name="Microsoft.Compute.TestExtension-1.2.3",
+                            command=command,
+                            timeout=300,
+                            shell=True,
+                            cwd=self.tmp_dir,
+                            env={},
+                            stdout=stdout,
+                            stderr=stderr)
+
+                    extension_calls = [args[0] for (args, _) in popen_patch.call_args_list if command in args[0]]
+
+                    self.assertEquals(1, len(extension_calls), "The extension should have been invoked exactly once")
+                    self.assertIn("systemd-run --unit=Microsoft.Compute.TestExtension_1.2.3", extension_calls[0],
+                                  "The first call to the extension should have used systemd")
+
+                    self.assertEquals(context_manager.exception.code, ExtensionErrorCodes.PluginUnknownFailure)
+                    self.assertIn("Non-zero exit code", ustr(context_manager.exception))
+                    # The scope name should appear in the process output since systemd-run was invoked and stderr
+                    # wasn't truncated.
+                    self.assertIn("Microsoft.Compute.TestExtension_1.2.3", ustr(context_manager.exception))
+
+    @attr('requires_sudo')
+    @patch("azurelinuxagent.common.cgroupapi.add_event")
+    @patch('time.sleep', side_effect=lambda _: mock_sleep())
+    @patch("azurelinuxagent.common.utils.extensionprocessutil.TELEMETRY_MESSAGE_MAX_LEN", 5)
+    def test_start_extension_command_should_not_use_fallback_option_if_extension_fails_with_long_output(self, *args):
+        self.assertTrue(i_am_root(), "Test does not run when non-root")
+
+        long_output = "a"*20  # large enough to ensure both stdout and stderr are truncated
+        long_stdout_stderr_command = "echo {0} && echo {0} >&2 && ls folder_does_not_exist".format(long_output)
 
         with tempfile.TemporaryFile(dir=self.tmp_dir, mode="w+b") as stdout:
             with tempfile.TemporaryFile(dir=self.tmp_dir, mode="w+b") as stderr:
-                with patch("azurelinuxagent.common.cgroupapi.subprocess.Popen", wraps=subprocess.Popen) \
-                        as patch_mock_popen:
+                with patch("azurelinuxagent.common.cgroupapi.subprocess.Popen", wraps=subprocess.Popen) as popen_patch:
                     with self.assertRaises(ExtensionError) as context_manager:
                         SystemdCgroupsApi().start_extension_command(
                             extension_name="Microsoft.Compute.TestExtension-1.2.3",
-                            command="ls folder_does_not_exist",
+                            command=long_stdout_stderr_command,
                             timeout=300,
                             shell=True,
                             cwd=self.tmp_dir,
                             env={},
                             stdout=stdout,
                             stderr=stderr)
 
-                        # We should have invoked the extension command only once, in the systemd-run case
-                        self.assertEquals(1, patch_mock_popen.call_count)
-                        args = patch_mock_popen.call_args[0][0]
-                        self.assertIn("systemd-run --unit", args)
+                    extension_calls = [args[0] for (args, _) in popen_patch.call_args_list if long_stdout_stderr_command in args[0]]
+
+                    self.assertEquals(1, len(extension_calls), "The extension should have been invoked exactly once")
+                    self.assertIn("systemd-run --unit=Microsoft.Compute.TestExtension_1.2.3", extension_calls[0],
+                                  "The first call to the extension should have used systemd")
 
-                        self.assertEquals(context_manager.exception.code, ExtensionErrorCodes.PluginUnknownFailure)
-                        self.assertIn("Non-zero exit code", ustr(context_manager.exception))
+                    self.assertEquals(context_manager.exception.code, ExtensionErrorCodes.PluginUnknownFailure)
+                    self.assertIn("Non-zero exit code", ustr(context_manager.exception))
+                    # stdout and stderr should have been truncated, so the scope name doesn't appear in stderr
+                    # even though systemd-run ran
+                    self.assertNotIn("Microsoft.Compute.TestExtension_1.2.3", ustr(context_manager.exception))
 
     @attr('requires_sudo')
     @patch("azurelinuxagent.common.cgroupapi.add_event")

diff --git a/tests/utils/test_extension_process_util.py b/tests/utils/test_extension_process_util.py
@@ -164,29 +164,26 @@ def test_handle_process_completion_should_raise_on_nonzero_exit_code(self):
                 self.assertEquals(context_manager.exception.code, error_code)
                 self.assertIn("Non-zero exit code:", ustr(context_manager.exception))
 
-    def test_read_output_it_should_return_no_content(self):
+    def test_read_output_should_return_no_content(self):
         with patch('azurelinuxagent.common.utils.extensionprocessutil.TELEMETRY_MESSAGE_MAX_LEN', 0):
-            expected = "[stdout]\n\n\n[stderr]\n"
+            expected = ""
             actual = read_output(self.stdout, self.stderr)
             self.assertEqual(expected, actual)
 
-    def test_read_output_it_should_truncate_the_content(self):
-        with patch('azurelinuxagent.common.utils.extensionprocessutil.TELEMETRY_MESSAGE_MAX_LEN', 10):
-            expected = "[stdout]\nThe quick \n\n[stderr]\nThe five b"
+    def test_read_output_should_truncate_the_content(self):
+        with patch('azurelinuxagent.common.utils.extensionprocessutil.TELEMETRY_MESSAGE_MAX_LEN', 50):
+            expected = "[stdout]\nr the lazy dog.\n\n" \
+                       "[stderr]\ns jump quickly."
             actual = read_output(self.stdout, self.stderr)
             self.assertEqual(expected, actual)
 
-    def test_read_output_it_should_return_all_content(self):
-        with patch('azurelinuxagent.common.utils.extensionprocessutil.TELEMETRY_MESSAGE_MAX_LEN', 50):
+    def test_read_output_should_not_truncate_the_content(self):
+        with patch('azurelinuxagent.common.utils.extensionprocessutil.TELEMETRY_MESSAGE_MAX_LEN', 90):
             expected = "[stdout]\nThe quick brown fox jumps over the lazy dog.\n\n" \
                        "[stderr]\nThe five boxing wizards jump quickly."
             actual = read_output(self.stdout, self.stderr)
             self.assertEqual(expected, actual)
 
-    def test_read_output_it_should_handle_exceptions(self):
-        with patch('azurelinuxagent.common.utils.extensionprocessutil.TELEMETRY_MESSAGE_MAX_LEN', "type error"):
-            actual = read_output(self.stdout, self.stderr)
-            self.assertIn("Cannot read stdout/stderr", actual)
 
     def test_format_stdout_stderr00(self):
         """
@@ -197,8 +194,9 @@ def test_format_stdout_stderr00(self):
         stderr = "The five boxing wizards jump quickly."
 
         expected = "[stdout]\n{0}\n\n[stderr]\n{1}".format(stdout, stderr)
-        actual = format_stdout_stderr(stdout, stderr, 1000)
-        self.assertEqual(expected, actual)
+        with patch('azurelinuxagent.common.utils.extensionprocessutil.TELEMETRY_MESSAGE_MAX_LEN', 1000):
+            actual = format_stdout_stderr(stdout, stderr)
+            self.assertEqual(expected, actual)
 
     def test_format_stdout_stderr01(self):
         """
@@ -210,9 +208,10 @@ def test_format_stdout_stderr01(self):
 
         # noinspection SpellCheckingInspection
         expected = '[stdout]\ns over the lazy dog.\n\n[stderr]\nizards jump quickly.'
-        actual = format_stdout_stderr(stdout, stderr, 60)
-        self.assertEqual(expected, actual)
-        self.assertEqual(60, len(actual))
+        with patch('azurelinuxagent.common.utils.extensionprocessutil.TELEMETRY_MESSAGE_MAX_LEN', 60):
+            actual = format_stdout_stderr(stdout, stderr)
+            self.assertEqual(expected, actual)
+            self.assertEqual(60, len(actual))
 
     def test_format_stdout_stderr02(self):
         """
@@ -223,9 +222,10 @@ def test_format_stdout_stderr02(self):
         stderr = "The five boxing wizards jump quickly."
 
         expected = '[stdout]\nempty\n\n[stderr]\ns jump quickly.'
-        actual = format_stdout_stderr(stdout, stderr, 40)
-        self.assertEqual(expected, actual)
-        self.assertEqual(40, len(actual))
+        with patch('azurelinuxagent.common.utils.extensionprocessutil.TELEMETRY_MESSAGE_MAX_LEN', 40):
+            actual = format_stdout_stderr(stdout, stderr)
+            self.assertEqual(expected, actual)
+            self.assertEqual(40, len(actual))
 
     def test_format_stdout_stderr03(self):
         """
@@ -236,9 +236,10 @@ def test_format_stdout_stderr03(self):
         stderr = "empty"
 
         expected = '[stdout]\nr the lazy dog.\n\n[stderr]\nempty'
-        actual = format_stdout_stderr(stdout, stderr, 40)
-        self.assertEqual(expected, actual)
-        self.assertEqual(40, len(actual))
+        with patch('azurelinuxagent.common.utils.extensionprocessutil.TELEMETRY_MESSAGE_MAX_LEN', 40):
+            actual = format_stdout_stderr(stdout, stderr)
+            self.assertEqual(expected, actual)
+            self.assertEqual(40, len(actual))
 
     def test_format_stdout_stderr04(self):
         """
@@ -249,15 +250,17 @@ def test_format_stdout_stderr04(self):
         stderr = "The five boxing wizards jump quickly."
 
         expected = ''
-        actual = format_stdout_stderr(stdout, stderr, 4)
-        self.assertEqual(expected, actual)
-        self.assertEqual(0, len(actual))
+        with patch('azurelinuxagent.common.utils.extensionprocessutil.TELEMETRY_MESSAGE_MAX_LEN', 4):
+            actual = format_stdout_stderr(stdout, stderr)
+            self.assertEqual(expected, actual)
+            self.assertEqual(0, len(actual))
 
     def test_format_stdout_stderr05(self):
         """
         If stdout and stderr are empty, an empty template is returned.
         """
 
         expected = '[stdout]\n\n\n[stderr]\n'
-        actual = format_stdout_stderr('', '', 1000)
-        self.assertEqual(expected, actual)
+        with patch('azurelinuxagent.common.utils.extensionprocessutil.TELEMETRY_MESSAGE_MAX_LEN', 1000):
+            actual = format_stdout_stderr('', '')
+            self.assertEqual(expected, actual)