Skip to content

Commit

Permalink
sources/azure: refactor _report_ready_if_needed and _poll_imds (#1222)
Browse files Browse the repository at this point in the history
Refactor _report_ready_if_needed() to work for both Savable PPS
and Runnable PPS:
* rename _report_ready_if_needed() to _report_ready_for_pps()
* return interface name from lease to support _poll_imds() behavior
  without changing it.
* fixes an issue where reporting ready return value was silently
  ignored for Savable PPS.
* add explicit handling for failure to obtain DHCP lease to
  result in sources.InvalidMetaDataException.

Refactor _poll_imds():
* use _report_ready_for_pps() for reporting ready, removing this logic
  to simplify loop logic.
* move netlink and vnetswitch out of while loop to simplify loop logic,
  leaving only reprovision polling in loop.
* add explicit handling for failure to obtain DHCP lease and
  retry in the next iteration.

Signed-off-by: Chris Patterson cpatterson@microsoft.com
  • Loading branch information
cjp256 authored Jan 31, 2022
1 parent 0362abc commit 902aa44
Show file tree
Hide file tree
Showing 2 changed files with 112 additions and 123 deletions.
230 changes: 111 additions & 119 deletions cloudinit/sources/DataSourceAzure.py
Original file line number Diff line number Diff line change
Expand Up @@ -24,6 +24,7 @@
from cloudinit import net, sources, ssh_util, subp, util
from cloudinit.event import EventScope, EventType
from cloudinit.net import device_driver
from cloudinit.net.dhcp import NoDHCPLeaseError
from cloudinit.reporting import events
from cloudinit.sources.helpers import netlink
from cloudinit.sources.helpers.azure import (
Expand Down Expand Up @@ -890,36 +891,18 @@ def _create_report_ready_marker(self):
)

@azure_ds_telemetry_reporter
def _report_ready_if_needed(self):
"""Report ready to the platform if the marker file is not present,
and create the marker file.
def _report_ready_for_pps(self, lease: dict) -> None:
"""Report ready for PPS, creating the marker file upon completion.
:raises sources.InvalidMetaDataException: On error reporting ready.
"""
have_not_reported_ready = not os.path.isfile(
REPORTED_READY_MARKER_FILE
)
report_ready_succeeded = self._report_ready(lease=lease)
if not report_ready_succeeded:
msg = "Failed reporting ready while in the preprovisioning pool."
report_diagnostic_event(msg, logger_func=LOG.error)
raise sources.InvalidMetaDataException(msg)

if have_not_reported_ready:
report_diagnostic_event(
"Reporting ready before nic detach", logger_func=LOG.info
)
try:
with EphemeralDHCPv4WithReporting(azure_ds_reporter) as lease:
self._report_ready(lease=lease)
except Exception as e:
report_diagnostic_event(
"Exception reporting ready during "
"preprovisioning before nic detach: %s" % e,
logger_func=LOG.error,
)
raise
self._create_report_ready_marker()
else:
report_diagnostic_event(
"Already reported ready before nic detach."
" The marker file already exists: %s"
% REPORTED_READY_MARKER_FILE,
logger_func=LOG.error,
)
self._create_report_ready_marker()

@azure_ds_telemetry_reporter
def _check_if_nic_is_primary(self, ifname):
Expand Down Expand Up @@ -1113,7 +1096,20 @@ def _wait_for_all_nics_ready(self):
# Report ready if the marker file is not already present.
# The nic of the preprovisioned vm gets hot-detached as soon as
# we report ready. So no need to save the dhcp context.
self._report_ready_if_needed()
if not os.path.isfile(REPORTED_READY_MARKER_FILE):
try:
with EphemeralDHCPv4WithReporting(
azure_ds_reporter
) as lease:
self._report_ready_for_pps(lease)
except NoDHCPLeaseError as error:
report_diagnostic_event(
"DHCP failed while in provisioning pool",
logger_func=LOG.warning,
)
raise sources.InvalidMetaDataException(
"Failed to report ready while in provisioning pool."
) from error

has_nic_been_detached = bool(
os.path.isfile(REPROVISION_NIC_DETACHED_MARKER_FILE)
Expand Down Expand Up @@ -1159,8 +1155,7 @@ def _poll_imds(self):
self.imds_logging_threshold = 1
self.imds_poll_counter = 1
dhcp_attempts = 0
vnet_switched = False
return_val = None
reprovision_data = None

def exc_cb(msg, exception):
if isinstance(exception, UrlError):
Expand Down Expand Up @@ -1201,110 +1196,107 @@ def exc_cb(msg, exception):
# When the interface is hot-attached, we would have already
# done dhcp and set the dhcp context. In that case, skip
# the attempt to do dhcp.
is_ephemeral_ctx_present = self._ephemeral_dhcp_ctx is not None
msg = (
"Unexpected error. Dhcp context is not expected to be already "
"set when we need to wait for vnet switch"
)
if is_ephemeral_ctx_present and report_ready:
if self._ephemeral_dhcp_ctx is not None and report_ready:
report_diagnostic_event(msg, logger_func=LOG.error)
raise RuntimeError(msg)

while True:
if report_ready:
try:
# Since is_ephemeral_ctx_present is set only once, this ensures
# that with regular reprovisioning, dhcp is always done every
# time the loop runs.
if not is_ephemeral_ctx_present:
# Save our EphemeralDHCPv4 context to avoid repeated dhcp
# later when we report ready
self._ephemeral_dhcp_ctx = EphemeralDHCPv4WithReporting(
azure_ds_reporter
)
lease = self._ephemeral_dhcp_ctx.obtain_lease()

if vnet_switched:
dhcp_attempts += 1
if report_ready:
self._ephemeral_dhcp_ctx = EphemeralDHCPv4WithReporting(
azure_ds_reporter
)
lease = self._ephemeral_dhcp_ctx.obtain_lease()
nl_sock = netlink.create_bound_netlink_socket()
self._report_ready_for_pps(lease)

# Networking must remain up for netlink to detect
# media disconnect/connect.
LOG.debug(
"Wait for vnetswitch to happen on %s",
lease["interface"],
)
with events.ReportEventStack(
name="wait-for-media-disconnect-connect",
description="wait for vnet switch",
parent=azure_ds_reporter,
):
try:
nl_sock = netlink.create_bound_netlink_socket()
except netlink.NetlinkCreateSocketError as e:
report_diagnostic_event(
"Failed to create bound netlink socket: %s" % e,
logger_func=LOG.warning,
netlink.wait_for_media_disconnect_connect(
nl_sock, lease["interface"]
)
self._ephemeral_dhcp_ctx.clean_network()
break

report_ready_succeeded = self._report_ready(lease=lease)
if not report_ready_succeeded:
msg = (
"Failed reporting ready while in "
"the preprovisioning pool."
except AssertionError as e:
report_diagnostic_event(
"Error while waiting for vnet switch: %s" % e,
logger_func=LOG.error,
)
report_diagnostic_event(msg, logger_func=LOG.error)
self._ephemeral_dhcp_ctx.clean_network()
raise sources.InvalidMetaDataException(msg)

self._create_report_ready_marker()
report_ready = False

LOG.debug("Wait for vnetswitch to happen")
with events.ReportEventStack(
name="wait-for-media-disconnect-connect",
description="wait for vnet switch",
parent=azure_ds_reporter,
):
try:
netlink.wait_for_media_disconnect_connect(
nl_sock, lease["interface"]
)
except AssertionError as e:
report_diagnostic_event(
"Error while waiting for vnet switch: %s" % e,
logger_func=LOG.error,
)
break

vnet_switched = True
self._ephemeral_dhcp_ctx.clean_network()
else:
with events.ReportEventStack(
name="get-reprovision-data-from-imds",
description="get reprovision data from imds",
parent=azure_ds_reporter,
):
return_val = readurl(
url,
timeout=IMDS_TIMEOUT_IN_SECONDS,
headers=headers,
exception_cb=exc_cb,
infinite=True,
log_req_resp=False,
).contents
break
except UrlError:
# Teardown our EphemeralDHCPv4 context on failure as we retry
self._ephemeral_dhcp_ctx.clean_network()

# Also reset this flag which determines if we should do dhcp
# during retries.
is_ephemeral_ctx_present = False
except netlink.NetlinkCreateSocketError as e:
report_diagnostic_event(
"Failed to create bound netlink socket: %s" % e,
logger_func=LOG.warning,
)
raise sources.InvalidMetaDataException(
"Failed to report ready while in provisioning pool."
) from e
except NoDHCPLeaseError as e:
report_diagnostic_event(
"DHCP failed while in provisioning pool",
logger_func=LOG.warning,
)
raise sources.InvalidMetaDataException(
"Failed to report ready while in provisioning pool."
) from e
finally:
if nl_sock:
nl_sock.close()

if vnet_switched:
report_diagnostic_event(
"attempted dhcp %d times after reuse" % dhcp_attempts,
logger_func=LOG.debug,
)
report_diagnostic_event(
"polled imds %d times after reuse" % self.imds_poll_counter,
logger_func=LOG.debug,
)
# Teardown old network configuration.
self._ephemeral_dhcp_ctx.clean_network()
self._ephemeral_dhcp_ctx = None

while not reprovision_data:
if self._ephemeral_dhcp_ctx is None:
self._ephemeral_dhcp_ctx = EphemeralDHCPv4WithReporting(
azure_ds_reporter
)
dhcp_attempts += 1
try:
self._ephemeral_dhcp_ctx.obtain_lease()
except NoDHCPLeaseError:
continue

with events.ReportEventStack(
name="get-reprovision-data-from-imds",
description="get reprovision data from imds",
parent=azure_ds_reporter,
):
try:
reprovision_data = readurl(
url,
timeout=IMDS_TIMEOUT_IN_SECONDS,
headers=headers,
exception_cb=exc_cb,
infinite=True,
log_req_resp=False,
).contents
except UrlError:
self._ephemeral_dhcp_ctx.clean_network()
self._ephemeral_dhcp_ctx = None
continue

report_diagnostic_event(
"attempted dhcp %d times after reuse" % dhcp_attempts,
logger_func=LOG.debug,
)
report_diagnostic_event(
"polled imds %d times after reuse" % self.imds_poll_counter,
logger_func=LOG.debug,
)

return return_val
return reprovision_data

@azure_ds_telemetry_reporter
def _report_failure(self, description: Optional[str] = None) -> bool:
Expand Down
5 changes: 1 addition & 4 deletions tests/unittests/sources/test_azure.py
Original file line number Diff line number Diff line change
Expand Up @@ -3213,10 +3213,8 @@ def setUp(self):
dsaz.BUILTIN_DS_CONFIG["data_dir"] = self.waagent_d

@mock.patch("time.sleep", mock.MagicMock())
@mock.patch(MOCKPATH + "EphemeralDHCPv4WithReporting", autospec=True)
def test_poll_imds_re_dhcp_on_timeout(
self,
m_dhcpv4,
m_report_ready,
m_request,
m_media_switch,
Expand All @@ -3236,7 +3234,6 @@ def test_poll_imds_re_dhcp_on_timeout(
m_media_switch.return_value = None
dhcp_ctx = mock.MagicMock(lease=lease)
dhcp_ctx.obtain_lease.return_value = lease
m_dhcpv4.return_value = dhcp_ctx

self.tries = 0

Expand All @@ -3260,7 +3257,7 @@ def fake_timeout_once(**kwargs):
dsa._poll_imds()
self.assertEqual(m_report_ready.call_count, 1)
m_report_ready.assert_called_with(lease=lease)
self.assertEqual(3, m_dhcpv4.call_count, "Expected 3 DHCP calls")
self.assertEqual(3, m_dhcp.call_count, "Expected 3 DHCP calls")
self.assertEqual(4, self.tries, "Expected 4 total reads from IMDS")

@mock.patch("os.path.isfile")
Expand Down

0 comments on commit 902aa44

Please sign in to comment.