fix: retry AWS hotplug for async IMDS
Make tests more robust to temporary network failure.
Document hotplug limitations.

Fixes GH-5373
holmanb committed Jan 31, 2025
1 parent 5f290f5 commit 5322b93
Showing 6 changed files with 97 additions and 10 deletions.
19 changes: 18 additions & 1 deletion cloudinit/cmd/devel/hotplug_hook.py
@@ -204,7 +204,7 @@ def initialize_datasource(hotplug_init: Init, subsystem: str):
return datasource


def handle_hotplug(hotplug_init: Init, devpath, subsystem, udevaction):
def handle_hotplug(hotplug_init: Init, devpath, subsystem, udevaction) -> None:
datasource = initialize_datasource(hotplug_init, subsystem)
if not datasource:
return
@@ -216,6 +216,23 @@ def handle_hotplug(hotplug_init: Init, devpath, subsystem, udevaction):
action=udevaction,
success_fn=hotplug_init._write_to_cache,
)
start = time.time()
elapsed = 0.0
if not datasource.hotplug_retry_settings.force_retry:
try_hotplug(subsystem, event_handler, datasource)
LOG.info("Not forcing retry")
return
LOG.info("%s < %s", elapsed, datasource.hotplug_retry_settings.sleep_total)
while elapsed < datasource.hotplug_retry_settings.sleep_total:
try_hotplug(subsystem, event_handler, datasource)
LOG.info(
"Gathering network configuration again due to IMDS limitations."
)
elapsed = time.time() - start
time.sleep(datasource.hotplug_retry_settings.sleep_period)


def try_hotplug(subsystem, event_handler, datasource) -> None:
wait_times = [1, 3, 5, 10, 30]
last_exception = Exception("Bug while processing hotplug event.")
for attempt, wait in enumerate(wait_times):
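For orientation, here is a minimal self-contained sketch of the retry pattern that handle_hotplug() now follows. HotplugRetrySettings, handle_with_retries, and the do_hotplug callable are stand-ins for the real datasource attribute and event-handler plumbing above, and the values in the final call are shortened so the demo finishes quickly.

import time
from typing import Callable, NamedTuple


class HotplugRetrySettings(NamedTuple):
    """Retry knobs; sleep values are in seconds."""

    force_retry: bool
    sleep_period: int
    sleep_total: int


def handle_with_retries(
    settings: HotplugRetrySettings, do_hotplug: Callable[[], None]
) -> None:
    """Run the hotplug handler once, or keep re-running it until the deadline."""
    if not settings.force_retry:
        do_hotplug()
        return
    start = time.time()
    elapsed = 0.0
    while elapsed < settings.sleep_total:
        do_hotplug()
        time.sleep(settings.sleep_period)
        elapsed = time.time() - start


# DataSourceEc2 (below) opts in with HotplugRetrySettings(True, 5, 30), i.e.
# re-apply network configuration roughly every 5 seconds for up to 30 seconds
# so the asynchronously updated IMDS has time to publish the new interface.
handle_with_retries(HotplugRetrySettings(True, 1, 3), lambda: None)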
4 changes: 3 additions & 1 deletion cloudinit/sources/DataSourceEc2.py
@@ -24,7 +24,7 @@
from cloudinit.net import netplan
from cloudinit.net.dhcp import NoDHCPLeaseError
from cloudinit.net.ephemeral import EphemeralIPNetwork
from cloudinit.sources import NicOrder
from cloudinit.sources import HotplugRetrySettings, NicOrder
from cloudinit.sources.helpers import ec2

LOG = logging.getLogger(__name__)
@@ -114,6 +114,7 @@ class DataSourceEc2(sources.DataSource):
}

extra_hotplug_udev_rules = _EXTRA_HOTPLUG_UDEV_RULES
hotplug_retry_settings = HotplugRetrySettings(True, 5, 30)

def __init__(self, sys_cfg, distro, paths):
super(DataSourceEc2, self).__init__(sys_cfg, distro, paths)
@@ -125,6 +126,7 @@ def _unpickle(self, ci_pkl_version: int) -> None:
super()._unpickle(ci_pkl_version)
self.extra_hotplug_udev_rules = _EXTRA_HOTPLUG_UDEV_RULES
self._fallback_nic_order = NicOrder.MAC
self.hotplug_retry_settings = HotplugRetrySettings(True, 5, 30)

def _get_cloud_name(self):
"""Return the cloud name as identified during _get_data."""
17 changes: 17 additions & 0 deletions cloudinit/sources/__init__.py
@@ -188,6 +188,14 @@ class DataSourceHostname(NamedTuple):
is_default: bool


class HotplugRetrySettings(NamedTuple):
"""in seconds"""

force_retry: bool
sleep_period: int
sleep_total: int


class DataSource(CloudInitPickleMixin, metaclass=abc.ABCMeta):

dsmode = DSMODE_NETWORK
@@ -311,6 +319,14 @@ class DataSource(CloudInitPickleMixin, metaclass=abc.ABCMeta):
# in the updated metadata
skip_hotplug_detect = False

    # AWS interface data propagates to the IMDS without a synchronization
    # method. Since no better alternative exists, use a datasource-specific
    # mechanism which retries periodically for a set amount of time, applying
    # configuration as needed. Do not force retry on other datasources.
#
# https://github.com/amazonlinux/amazon-ec2-net-utils/blob/601bc3513fa7b8a6ab46d9496b233b079e55f2e9/lib/lib.sh#L483
hotplug_retry_settings = HotplugRetrySettings(False, 0, 0)

# Extra udev rules for cc_install_hotplug
extra_hotplug_udev_rules: Optional[str] = None

@@ -355,6 +371,7 @@ def _unpickle(self, ci_pkl_version: int) -> None:
"skip_hotplug_detect": False,
"vendordata2": None,
"vendordata2_raw": None,
"hotplug_retry_settings": HotplugRetrySettings(False, 0, 0),
}
for key, value in expected_attrs.items():
if not hasattr(self, key):
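A hedged sketch of how a datasource is expected to opt in to the new attribute. The class names below are hypothetical; the values mirror the base-class default above and the DataSourceEc2 override earlier in this commit.

from cloudinit.sources import HotplugRetrySettings


class HypotheticalDataSource:
    # Mirrors the DataSource base-class default: run the hotplug handler
    # once, with no forced retries.
    hotplug_retry_settings = HotplugRetrySettings(False, 0, 0)


class HypotheticalAsyncMetadataDataSource(HypotheticalDataSource):
    # For a metadata service that updates asynchronously, retry every
    # 5 seconds for up to 30 seconds (the DataSourceEc2 values above).
    hotplug_retry_settings = HotplugRetrySettings(
        force_retry=True, sleep_period=5, sleep_total=30
    )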
5 changes: 5 additions & 0 deletions doc/module-docs/cc_install_hotplug/data.yaml
@@ -9,6 +9,11 @@ cc_install_hotplug:
refresh the instance metadata from the datasource, detect the device in
the updated metadata, then apply the updated network configuration.
Udev rules are installed while cloud-init is running, which means that
    devices added during boot might not be configured. To work
around this limitation, one can wait until cloud-init has completed
before hotplugging devices.
    Currently supported datasources: OpenStack, EC2
examples:
- comment: |
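The workaround described in the documentation text above (wait for cloud-init to finish before hot-adding devices) can be scripted. A hedged sketch: the status check runs on the instance itself (in practice you might drive it over SSH), and attach_secondary_nic is a hypothetical placeholder for whatever cloud API or CLI actually adds the device.

import subprocess


def attach_secondary_nic() -> None:
    """Placeholder: call your cloud provider's API/CLI to hot-add the NIC."""
    print("attach the device via the provider's API here")


# Block until cloud-init reports completion; the command exits non-zero if
# cloud-init finished in an error state.
subprocess.run(["cloud-init", "status", "--wait"], check=True)

# Only hot-add the device afterwards, so the udev rules and hotplug hook
# installed by cc_install_hotplug are already in place.
attach_secondary_nic()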
57 changes: 49 additions & 8 deletions tests/integration_tests/modules/test_hotplug.py
@@ -1,3 +1,4 @@
import logging
import time
from collections import namedtuple

@@ -36,6 +37,7 @@
when: ['boot-new-instance']
"""

LOG = logging.getLogger()
ip_addr = namedtuple("ip_addr", "interface state ip4 ip6")


@@ -308,17 +310,36 @@ def test_multi_nic_hotplug(client: IntegrationInstance):


@pytest.mark.skipif(PLATFORM != "ec2", reason="test is ec2 specific")
@pytest.mark.skip(reason="IMDS race, see GH-5373. Unskip when fixed.")
def test_multi_nic_hotplug_vpc(session_cloud: IntegrationCloud):
@pytest.mark.parametrize("test_early_plug", [False, True])
def test_multi_nic_hotplug_vpc(
session_cloud: IntegrationCloud, test_early_plug: bool
):
"""Tests that additional secondary NICs are routable from local
networks after the hotplug hook is executed when network updates
are configured on the HOTPLUG event."""
if test_early_plug:
# This test usually passes without wait_for_cloud_init(),
# but sometimes the hotplug event races with the end of cloud-init
# so occasionally fails. For now, skip it. Keep it around, because
# this variation of the test would be valuable to test early hotplug,
# if it is ever supported.
pytest.skip(
"Skipping test because cloud-init doesn't support hotplug "
"until after cloud-init (yet)"
)
with session_cloud.launch(
user_data=USER_DATA
) as client, session_cloud.launch() as bastion:
ips_before = _get_ip_addr(client)
primary_priv_ip4 = ips_before[1].ip4
primary_priv_ip6 = ips_before[1].ip6
if not test_early_plug:
            # cloud-init is incapable of handling hotplugged devices until after
# completion (cloud-init.target / cloud-init status --wait)
#
# To reliably test cloud-init hotplug, wait for completion before
# testing behaviors.
wait_for_cloud_init(client)
client.instance.add_network_interface(ipv6_address_count=1)

_wait_till_hotplug_complete(client)
@@ -348,13 +369,14 @@ def test_multi_nic_hotplug_vpc(session_cloud: IntegrationCloud):
assert len(ips_after_add) == len(ips_before) + 1

# pings to primary and secondary NICs work
r = bastion.execute(f"ping -c1 {primary_priv_ip4}")
# use -w so that test is less flaky with temporary network failure
r = bastion.execute(f"ping -c1 -w5 {primary_priv_ip4}")
assert r.ok, r.stdout
r = bastion.execute(f"ping -c1 {secondary_priv_ip4}")
r = bastion.execute(f"ping -c1 -w5 {secondary_priv_ip4}")
assert r.ok, r.stdout
r = bastion.execute(f"ping -c1 {primary_priv_ip6}")
r = bastion.execute(f"ping -c1 -w5 {primary_priv_ip6}")
assert r.ok, r.stdout
r = bastion.execute(f"ping -c1 {secondary_priv_ip6}")
r = bastion.execute(f"ping -c1 -w5 {secondary_priv_ip6}")
assert r.ok, r.stdout

# Check every route has metrics associated. See LP: #2055397
@@ -368,12 +390,31 @@ def test_multi_nic_hotplug_vpc(session_cloud: IntegrationCloud):
_wait_till_hotplug_complete(client, expected_runs=2)

# ping to primary NIC works
assert bastion.execute(f"ping -c1 {primary_priv_ip4}").ok
assert bastion.execute(f"ping -c1 {primary_priv_ip6}").ok
retries = 32
error = ""
for i in range(retries):
if bastion.execute(f"ping -c1 -w5 {primary_priv_ip4}").ok:
break
LOG.info("Failed to ping %s on try #%s", primary_priv_ip4, i + 1)
else:
error = (
f"Failed to ping {primary_priv_ip4} after {retries} retries"
)

for i in range(retries):
if bastion.execute(f"ping -c1 -w5 {primary_priv_ip6}").ok:
break
LOG.info("Failed to ping %s on try #%s", primary_priv_ip6, i + 1)
else:
error = (
f"Failed to ping {primary_priv_ip6} after {retries} retries"
)

log_content = client.read_from_file("/var/log/cloud-init.log")
verify_clean_log(log_content)
verify_clean_boot(client)
if error:
raise Exception(error)


@pytest.mark.skipif(PLATFORM != "ec2", reason="test is ec2 specific")
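As an aside on the test changes above, the two for/else ping loops implement the same retry pattern. A small helper like the following (hypothetical, not part of this commit) captures it, assuming a callable with the same contract as bastion.execute, i.e. it returns an object with an .ok attribute.

import logging

LOG = logging.getLogger()


def ping_with_retries(execute, host: str, retries: int = 32) -> str:
    """Return "" on success, or an error message once retries are exhausted."""
    for attempt in range(1, retries + 1):
        if execute(f"ping -c1 -w5 {host}").ok:
            return ""
        LOG.info("Failed to ping %s on try #%s", host, attempt)
    return f"Failed to ping {host} after {retries} retries"


# Usage in the test above would look like:
#     error = ping_with_retries(bastion.execute, primary_priv_ip4)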
5 changes: 5 additions & 0 deletions tools/hook-hotplug
@@ -5,6 +5,7 @@
# cloud-init is ready; if so invoke cloud-init hotplug-hook

fifo=/run/cloud-init/hook-hotplug-cmd
log_file=/run/cloud-init/hook-hotplug.log

should_run() {
if [ -d /run/systemd ]; then
@@ -17,6 +18,9 @@ should_run() {
}

if ! should_run; then
# This happens when a device is hotplugged before cloud-init-hotplugd.socket is
# listening on the socket.
echo "Not running hotplug, not ready yet" >> ${log_file}
exit 0
fi

@@ -25,3 +29,4 @@ exec 3<>$fifo
env_params=" --subsystem=${SUBSYSTEM} handle --devpath=${DEVPATH} --udevaction=${ACTION}"
# write params to cloud-init's hotplug-hook fifo
echo "${env_params}" >&3
echo "Running hotplug hook: $env_params" >> ${log_file}
