diff --git a/cloudinit/cmd/status.py b/cloudinit/cmd/status.py index f5ee9c115f8..d0237076121 100644 --- a/cloudinit/cmd/status.py +++ b/cloudinit/cmd/status.py @@ -11,6 +11,7 @@ import json import os import sys +import time from copy import deepcopy from time import gmtime, sleep, strftime from typing import Any, Dict, List, NamedTuple, Optional, Tuple, Union @@ -272,12 +273,15 @@ def get_bootstatus(disable_file, paths) -> Tuple[UXAppBootStatusCode, str]: return (bootstatus_code, reason) -def _get_systemd_status() -> Optional[UXAppStatus]: - """Get status from systemd. +def _get_error_or_running_from_systemd() -> Optional[UXAppStatus]: + """Get if systemd is in error or running state. Using systemd, we can get more fine-grained status of the individual unit. Determine if we're still - running or if there's an error we haven't otherwise detected + running or if there's an error we haven't otherwise detected. + + If we don't detect error or running, return None as we don't want to + report any other particular status based on systemd. """ for service in [ "cloud-final.service", @@ -324,6 +328,35 @@ def _get_systemd_status() -> Optional[UXAppStatus]: return None +def _get_error_or_running_from_systemd_with_retry( + existing_status, max_wait=5 +) -> Optional[UXAppStatus]: + """Get systemd status and retry if dbus isn't ready. + + If cloud-init has determined that we're still running, then we can + ignore the status from systemd. However, if cloud-init has detected error, + then we should retry on systemd status so we don't incorrectly report + error state while cloud-init is still running. + """ + start_time = time.time() + while time.time() - start_time < max_wait: + try: + return _get_error_or_running_from_systemd() + except subp.ProcessExecutionError: + if existing_status in ( + UXAppStatus.DEGRADED_RUNNING, + UXAppStatus.RUNNING, + ): + return None + sleep(0.25) + print( + f"Failed to get status from systemd after {max_wait} seconds. " + "Cloud-init may still be running.", + file=sys.stderr, + ) + return None + + def get_status_details(paths: Optional[Paths] = None) -> StatusDetails: """Return a dict with status, details and errors. @@ -395,7 +428,7 @@ def get_status_details(paths: Optional[Paths] = None) -> StatusDetails: UXAppStatus.NOT_RUN, UXAppStatus.DISABLED, ): - systemd_status = _get_systemd_status() + systemd_status = _get_error_or_running_from_systemd_with_retry(status) if systemd_status: status = systemd_status diff --git a/tests/unittests/cmd/test_status.py b/tests/unittests/cmd/test_status.py index 6c85a59ab61..abb3b7e3838 100644 --- a/tests/unittests/cmd/test_status.py +++ b/tests/unittests/cmd/test_status.py @@ -9,9 +9,14 @@ import pytest +from cloudinit import subp from cloudinit.atomic_helper import write_json from cloudinit.cmd import status -from cloudinit.cmd.status import UXAppStatus, _get_systemd_status +from cloudinit.cmd.status import ( + UXAppStatus, + _get_error_or_running_from_systemd, + _get_error_or_running_from_systemd_with_retry, +) from cloudinit.subp import SubpResult from cloudinit.util import ensure_file from tests.unittests.helpers import wrap_and_call @@ -59,7 +64,10 @@ class TestStatus: "Cloud-init enabled by systemd cloud-init-generator", ), ) - @mock.patch(f"{M_PATH}_get_systemd_status", return_value=None) + @mock.patch( + f"{M_PATH}_get_error_or_running_from_systemd_with_retry", + return_value=None, + ) def test_get_status_details_ds_none( self, m_get_systemd_status, @@ -704,7 +712,10 @@ def fakeexists(filepath): ], ) @mock.patch(M_PATH + "read_cfg_paths") - @mock.patch(f"{M_PATH}_get_systemd_status", return_value=None) + @mock.patch( + f"{M_PATH}_get_error_or_running_from_systemd_with_retry", + return_value=None, + ) def test_status_output( self, m_get_systemd_status, @@ -745,7 +756,10 @@ def test_status_output( assert out == expected_status @mock.patch(M_PATH + "read_cfg_paths") - @mock.patch(f"{M_PATH}_get_systemd_status", return_value=None) + @mock.patch( + f"{M_PATH}_get_error_or_running_from_systemd_with_retry", + return_value=None, + ) def test_status_wait_blocks_until_done( self, m_get_systemd_status, m_read_cfg_paths, config: Config, capsys ): @@ -796,7 +810,10 @@ def fake_sleep(interval): assert out == "....\nstatus: done\n" @mock.patch(M_PATH + "read_cfg_paths") - @mock.patch(f"{M_PATH}_get_systemd_status", return_value=None) + @mock.patch( + f"{M_PATH}_get_error_or_running_from_systemd_with_retry", + return_value=None, + ) def test_status_wait_blocks_until_error( self, m_get_systemd_status, m_read_cfg_paths, config: Config, capsys ): @@ -849,7 +866,10 @@ def fake_sleep(interval): assert out == "....\nstatus: error\n" @mock.patch(M_PATH + "read_cfg_paths") - @mock.patch(f"{M_PATH}_get_systemd_status", return_value=None) + @mock.patch( + f"{M_PATH}_get_error_or_running_from_systemd_with_retry", + return_value=None, + ) def test_status_main( self, m_get_systemd_status, m_read_cfg_paths, config: Config, capsys ): @@ -873,7 +893,12 @@ def test_status_main( assert out == "status: running\n" -class TestSystemdStatusDetails: +class TestGetErrorOrRunningFromSystemd: + @pytest.fixture(autouse=True) + def common_mocks(self, mocker): + mocker.patch("cloudinit.cmd.status.sleep") + yield + @pytest.mark.parametrize( ["active_state", "unit_file_state", "sub_state", "main_pid", "status"], [ @@ -881,7 +906,7 @@ class TestSystemdStatusDetails: # enabled, enabled-runtime, and static into an "enabled" state # and everything else functionally disabled. # Additionally, SubStates are undocumented and may mean something - # different depending on the ActiveState they are mapped too. + # different depending on the ActiveState they are mapped to. # Because of this I'm only testing SubState combinations seen # in real-world testing (or using "any" string if we dont care). ("activating", "enabled", "start", "123", UXAppStatus.RUNNING), @@ -910,7 +935,7 @@ class TestSystemdStatusDetails: ("failed", "invalid", "failed", "0", UXAppStatus.ERROR), ], ) - def test_get_systemd_status( + def test_get_error_or_running_from_systemd( self, active_state, unit_file_state, sub_state, main_pid, status ): with mock.patch( @@ -923,4 +948,70 @@ def test_get_systemd_status( stderr=None, ), ): - assert _get_systemd_status() == status + assert _get_error_or_running_from_systemd() == status + + def test_exception_while_running(self, mocker, capsys): + m_subp = mocker.patch( + f"{M_PATH}subp.subp", + side_effect=subp.ProcessExecutionError( + "Message recipient disconnected from message bus without" + " replying" + ), + ) + assert ( + _get_error_or_running_from_systemd_with_retry( + UXAppStatus.RUNNING, max_wait=1000 + ) + is None + ) + assert 1 == m_subp.call_count + assert "Failed to get status" not in capsys.readouterr().err + + def test_retry(self, mocker, capsys): + m_subp = mocker.patch( + f"{M_PATH}subp.subp", + side_effect=[ + subp.ProcessExecutionError( + "Message recipient disconnected from message bus without" + " replying" + ), + subp.ProcessExecutionError( + "Message recipient disconnected from message bus without" + " replying" + ), + SubpResult( + "ActiveState=activating\nUnitFileState=enabled\n" + "SubState=start\nMainPID=123\n", + stderr=None, + ), + ], + ) + assert ( + _get_error_or_running_from_systemd_with_retry( + UXAppStatus.ERROR, max_wait=1000 + ) + is UXAppStatus.RUNNING + ) + assert 3 == m_subp.call_count + assert "Failed to get status" not in capsys.readouterr().err + + def test_retry_timeout(self, mocker, capsys): + m_subp = mocker.patch( + f"{M_PATH}subp.subp", + side_effect=subp.ProcessExecutionError( + "Message recipient disconnected from message bus without" + " replying" + ), + ) + mocker.patch("time.time", side_effect=[1, 2, 50]) + assert ( + _get_error_or_running_from_systemd_with_retry( + UXAppStatus.ERROR, max_wait=5 + ) + is None + ) + assert 1 == m_subp.call_count + assert ( + "Failed to get status from systemd after 5 seconds. " + "Cloud-init may still be running." + ) in capsys.readouterr().err