Skip to content

Commit

Permalink
Fix race condition in service.running on systemd
Browse files Browse the repository at this point in the history
This state will check if the service is running after it runs a
`service.start` on it. However, systemd has more potential states than
just up or down. It also has the concept of `activating` and
`deactivating` states. An example of this are services which use
`Type=notify`, and rely on the service to tell systemd that it is up.
For these services, `systemctl start` will exit (ceding control back to
Salt) and systemd will await its notification. In that interim, the
service will be in either the `activating` or `deactivating` state.

Importantly, Salt uses `systemctl is-active service_name` to check if
the service is up, and any state other than `active` results in a
nonzero exit code, which Salt interprets as the service being down.

So, if the notification doesn't come quick enough, then when Salt checks
on the service's status post-start, it will appear to Salt to be down
when it is actually in the `activating` state.

This commit modifies the `systemd_service` module such that, when the
status is `activating` or `deactivating`, the `systemctl is-active` will
be periodically retried, up to a tunable amount of time (by default 3
seconds), until some result other than `activating` or `deactivating` is
returned (or the timeout is reached, at which time the service will be
assumed to be down). This will keep services from being misinterpreted
as being dead when it just took a little longer than normal to start.

I realize that there is already an `init_delay` argument for this state,
but this _always_ sleeps for that period of time, and also applies to
all `service` modules. The idea behind making changes to the
`systemd_service` module is to catch many issues like this _before_ you
have to start troubleshooting why it's being identified as dead when
it's not actually dead. I'm open to suggestions.
  • Loading branch information
Erik Johnson committed Jan 2, 2020
1 parent be5811e commit 4136ca7
Show file tree
Hide file tree
Showing 4 changed files with 155 additions and 36 deletions.
55 changes: 39 additions & 16 deletions salt/modules/systemd_service.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,7 @@
import fnmatch
import re
import shlex
import time

# Import Salt libs
import salt.utils.files
Expand Down Expand Up @@ -1062,44 +1063,66 @@ def force_reload(name, no_block=True, unmask=False, unmask_runtime=False):

# The unused sig argument is required to maintain consistency with the API
# established by Salt's service management states.
def status(name, sig=None): # pylint: disable=unused-argument
def status(name, sig=None, wait=3): # pylint: disable=unused-argument
'''
Return the status for a service via systemd.
If the name contains globbing, a dict mapping service name to True/False
Check whether or not a service is active.
If the name contains globbing, a dict mapping service names to True/False
values is returned.
.. versionchanged:: 2018.3.0
The service name can now be a glob (e.g. ``salt*``)
Args:
name (str): The name of the service to check
sig (str): Not implemented
name
The name of the service to check
Returns:
bool: True if running, False otherwise
dict: Maps service name to True if running, False otherwise
sig
Not implemented, but required to be accepted as it is passed by service
states
wait : 3
If the service is in the process of changing states (i.e. it is in
either the ``activating`` or ``deactivating`` state), wait up to this
amount of seconds (checking again periodically) before determining
whether the service is active.
.. versionadded:: 2019.2.3
CLI Example:
.. code-block:: bash
salt '*' service.status <service name> [service signature]
'''
def _get_status(service):
ret = __salt__['cmd.run_all'](_systemctl_cmd('is-active', service),
python_shell=False,
ignore_retcode=True,
redirect_stderr=True)
return ret['retcode'] == 0, ret['stdout']

contains_globbing = bool(re.search(r'\*|\?|\[.+\]', name))
if contains_globbing:
services = fnmatch.filter(get_all(), name)
else:
services = [name]
results = {}
ret = {}
for service in services:
_check_for_unit_changes(service)
results[service] = __salt__['cmd.retcode'](
_systemctl_cmd('is-active', service),
python_shell=False,
ignore_retcode=True) == 0
ret[service], _message = _get_status(service)
if not ret[service]:
# Check if the service is in the process of activating/deactivating
start_time = time.time()
# match both 'activating' and 'deactivating'
while 'activating' in _message \
and (time.time() - start_time <= wait):
time.sleep(0.5)
ret[service], _message = _get_status(service)
if ret[service]:
break

if contains_globbing:
return results
return results[name]
return ret
return ret[name]


# **kwargs is required to maintain consistency with the API established by
Expand Down
58 changes: 39 additions & 19 deletions salt/states/service.py
Original file line number Diff line number Diff line change
Expand Up @@ -95,10 +95,9 @@ def _get_systemd_only(func, kwargs):
warnings = []
valid_args = _argspec(func).args
for systemd_arg in SYSTEMD_ONLY:
arg_val = kwargs.get(systemd_arg, False)
if arg_val:
if systemd_arg in kwargs:
if systemd_arg in valid_args:
ret[systemd_arg] = arg_val
ret[systemd_arg] = kwargs[systemd_arg]
else:
warnings.append(
'The \'{0}\' argument is not supported by this '
Expand All @@ -107,6 +106,11 @@ def _get_systemd_only(func, kwargs):
return ret, warnings


def _add_warnings(ret, warnings):
current_warnings = ret.setdefault('warnings', [])
current_warnings.extend([x for x in warnings if x not in current_warnings])


def _enabled_used_error(ret):
'''
Warn of potential typo.
Expand Down Expand Up @@ -336,9 +340,6 @@ def running(name,
enable=None,
sig=None,
init_delay=None,
no_block=False,
unmask=False,
unmask_runtime=False,
**kwargs):
'''
Ensure that the service is running
Expand Down Expand Up @@ -383,6 +384,13 @@ def running(name,
In previous releases, Salt would simply unmask a service before
making any changes. This behavior is no longer the default.
wait : 3
**For systemd minions only.** Passed through when using
:py:func:`service.status <salt.modules.systemd_service.status>` to
determine whether the service is running or not.
.. versionadded:: 2019.2.3
.. note::
``watch`` can be used with service.running to restart a service when
another state changes ( example: a file.managed state that creates the
Expand Down Expand Up @@ -414,17 +422,21 @@ def running(name,
ret['comment'] = exc.strerror
return ret

status_kwargs, warnings = _get_systemd_only(__salt__['service.status'], kwargs)
if warnings:
_add_warnings(ret, warnings)

# lot of custom init script won't or mis implement the status
# command, so it is just an indicator but can not be fully trusted
before_toggle_status = __salt__['service.status'](name, sig)
before_toggle_status = __salt__['service.status'](name, sig, **status_kwargs)
if 'service.enabled' in __salt__:
before_toggle_enable_status = __salt__['service.enabled'](name)
else:
before_toggle_enable_status = True

unmask_ret = {'comment': ''}
if unmask:
unmask_ret = unmasked(name, unmask_runtime)
if kwargs.get('unmask', False):
unmask_ret = unmasked(name, kwargs.get('unmask_runtime', False))

# See if the service is already running
if before_toggle_status:
Expand All @@ -448,9 +460,9 @@ def running(name,

# Conditionally add systemd-specific args to call to service.start
start_kwargs, warnings = \
_get_systemd_only(__salt__['service.start'], locals())
_get_systemd_only(__salt__['service.start'], kwargs)
if warnings:
ret.setdefault('warnings', []).extend(warnings)
_add_warnings(ret, warnings)

if salt.utils.platform.is_windows() and kwargs.get('timeout', False):
start_kwargs.update({'timeout': kwargs.get('timeout')})
Expand All @@ -475,7 +487,7 @@ def running(name,
time.sleep(init_delay)

# only force a change state if we have explicitly detected them
after_toggle_status = __salt__['service.status'](name, sig)
after_toggle_status = __salt__['service.status'](name, sig, **kwargs)
if 'service.enabled' in __salt__:
after_toggle_enable_status = __salt__['service.enabled'](name)
else:
Expand Down Expand Up @@ -503,7 +515,7 @@ def running(name,
.format(ret['comment'], init_delay)
)

if unmask:
if kwargs.get('unmask', False):
ret['comment'] = '\n'.join([ret['comment'], unmask_ret['comment']])

return ret
Expand Down Expand Up @@ -568,9 +580,13 @@ def dead(name,
ret['comment'] = exc.strerror
return ret

status_kwargs, warnings = _get_systemd_only(__salt__['service.status'], kwargs)
if warnings:
_add_warnings(ret, warnings)

# lot of custom init script won't or mis implement the status
# command, so it is just an indicator but can not be fully trusted
before_toggle_status = __salt__['service.status'](name, sig)
before_toggle_status = __salt__['service.status'](name, sig, **status_kwargs)
if 'service.enabled' in __salt__:
if salt.utils.platform.is_windows():
# service.enabled in Windows returns True for services that are set
Expand Down Expand Up @@ -599,7 +615,7 @@ def dead(name,
# Conditionally add systemd-specific args to call to service.start
stop_kwargs, warnings = _get_systemd_only(__salt__['service.stop'], kwargs)
if warnings:
ret.setdefault('warnings', []).extend(warnings)
_add_warnings(ret, warnings)

if salt.utils.platform.is_windows() and kwargs.get('timeout', False):
stop_kwargs.update({'timeout': kwargs.get('timeout')})
Expand All @@ -618,7 +634,7 @@ def dead(name,
time.sleep(init_delay)

# only force a change state if we have explicitly detected them
after_toggle_status = __salt__['service.status'](name)
after_toggle_status = __salt__['service.status'](name, **status_kwargs)
if 'service.enabled' in __salt__:
after_toggle_enable_status = __salt__['service.enabled'](name)
else:
Expand Down Expand Up @@ -884,17 +900,21 @@ def mod_watch(name,
'comment': ''}
past_participle = None

status_kwargs, warnings = _get_systemd_only(__salt__['service.status'], kwargs)
if warnings:
_add_warnings(ret, warnings)

if sfun == 'dead':
verb = 'stop'
past_participle = verb + 'ped'
if __salt__['service.status'](name, sig):
if __salt__['service.status'](name, sig, **status_kwargs):
func = __salt__['service.stop']
else:
ret['result'] = True
ret['comment'] = 'Service is already {0}'.format(past_participle)
return ret
elif sfun == 'running':
if __salt__['service.status'](name, sig):
if __salt__['service.status'](name, sig, **status_kwargs):
if 'service.reload' in __salt__ and reload:
if 'service.force_reload' in __salt__ and force:
func = __salt__['service.force_reload']
Expand Down Expand Up @@ -929,7 +949,7 @@ def mod_watch(name,

func_kwargs, warnings = _get_systemd_only(func, kwargs)
if warnings:
ret.setdefault('warnings', []).extend(warnings)
_add_warnings(ret, warnings)

try:
result = func(name, **func_kwargs)
Expand Down
76 changes: 76 additions & 0 deletions tests/unit/modules/test_systemd_service.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,7 @@
from tests.support.mixins import LoaderModuleMockMixin
from tests.support.unit import TestCase
from tests.support.mock import (
Mock,
MagicMock,
patch,
)
Expand Down Expand Up @@ -257,6 +258,81 @@ def test_execs(self):
with patch.object(systemd, 'show', mock):
self.assertDictEqual(systemd.execs(), {'a': 'c', 'b': 'c'})

def test_status(self):
'''
Test to confirm that the function retries when the service is in the
activating/deactivating state.
'''
active = {
'stdout': 'active',
'stderr': '',
'retcode': 0,
'pid': 12345}
inactive = {
'stdout': 'inactive',
'stderr': '',
'retcode': 3,
'pid': 12345}
activating = {
'stdout': 'activating',
'stderr': '',
'retcode': 3,
'pid': 12345}
deactivating = {
'stdout': 'deactivating',
'stderr': '',
'retcode': 3,
'pid': 12345}

check_mock = Mock()

cmd_mock = MagicMock(side_effect=[activating, activating, active, inactive])
with patch.dict(systemd.__salt__, {'cmd.run_all': cmd_mock}), \
patch.object(systemd, '_check_for_unit_changes', check_mock):
ret = systemd.status('foo')
assert ret is True
# We should only have had three calls, since the third was not
# either in the activating or deactivating state and we should not
# have retried after.
assert cmd_mock.call_count == 3

cmd_mock = MagicMock(side_effect=[deactivating, deactivating, inactive, active])
with patch.dict(systemd.__salt__, {'cmd.run_all': cmd_mock}), \
patch.object(systemd, '_check_for_unit_changes', check_mock):
ret = systemd.status('foo')
assert ret is False
# We should only have had three calls, since the third was not
# either in the activating or deactivating state and we should not
# have retried after.
assert cmd_mock.call_count == 3

cmd_mock = MagicMock(side_effect=[activating, activating, active, inactive])
with patch.dict(systemd.__salt__, {'cmd.run_all': cmd_mock}), \
patch.object(systemd, '_check_for_unit_changes', check_mock):
ret = systemd.status('foo', wait=0.25)
assert ret is False
# We should only have had two calls, because "wait" was set too low
# to allow for more than one retry.
assert cmd_mock.call_count == 2

cmd_mock = MagicMock(side_effect=[active, inactive])
with patch.dict(systemd.__salt__, {'cmd.run_all': cmd_mock}), \
patch.object(systemd, '_check_for_unit_changes', check_mock):
ret = systemd.status('foo')
assert ret is True
# We should only have a single call, because the first call was in
# the active state.
assert cmd_mock.call_count == 1

cmd_mock = MagicMock(side_effect=[inactive, active])
with patch.dict(systemd.__salt__, {'cmd.run_all': cmd_mock}), \
patch.object(systemd, '_check_for_unit_changes', check_mock):
ret = systemd.status('foo')
assert ret is False
# We should only have a single call, because the first call was in
# the inactive state.
assert cmd_mock.call_count == 1


class SystemdScopeTestCase(TestCase, LoaderModuleMockMixin):
'''
Expand Down
2 changes: 1 addition & 1 deletion tests/unit/states/test_service.py
Original file line number Diff line number Diff line change
Expand Up @@ -250,4 +250,4 @@ def test_mod_watch(self):
self.assertDictEqual(service.mod_watch("salt", "running"),
ret[3])

self.assertDictEqual(service.mod_watch("salt", "stack"), ret[1])
self.assertDictEqual(service.mod_watch("salt", "stack"), ret[1])

0 comments on commit 4136ca7

Please sign in to comment.