Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Azure: Retry getting network metadata during nic attach for all non-timeout errors #878

Merged
merged 2 commits into from
Apr 26, 2021
Merged
Show file tree
Hide file tree
Changes from 1 commit
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
65 changes: 56 additions & 9 deletions cloudinit/sources/DataSourceAzure.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,7 @@
from xml.dom import minidom
import xml.etree.ElementTree as ET
from enum import Enum
import requests

from cloudinit import dmi
from cloudinit import log as logging
Expand Down Expand Up @@ -612,7 +613,9 @@ def get_imds_data_with_api_fallback(
self,
fallback_nic,
retries,
md_type=metadata_type.compute):
md_type=metadata_type.compute,
exc_cb=retry_on_url_exc,
infinite=False):
"""
Wrapper for get_metadata_from_imds so that we can have flexibility
in which IMDS api-version we use. If a particular instance of IMDS
Expand All @@ -632,7 +635,8 @@ def get_imds_data_with_api_fallback(
fallback_nic=fallback_nic,
retries=0,
md_type=md_type,
api_version=IMDS_VER_WANT
api_version=IMDS_VER_WANT,
exc_cb=exc_cb
)
except UrlError as err:
LOG.info(
Expand All @@ -655,7 +659,9 @@ def get_imds_data_with_api_fallback(
fallback_nic=fallback_nic,
retries=retries,
md_type=md_type,
api_version=IMDS_VER_MIN
api_version=IMDS_VER_MIN,
exc_cb=exc_cb,
infinite=infinite
)

def device_name_to_device(self, name):
Expand Down Expand Up @@ -858,6 +864,9 @@ def _check_if_nic_is_primary(self, ifname):
is_primary = False
expected_nic_count = -1
imds_md = None
metadata_poll_count = 0
metadata_logging_threshold = 1
metadata_timeout_count = 0

# For now, only a VM's primary NIC can contact IMDS and WireServer. If
# DHCP fails for a NIC, we have no mechanism to determine if the NIC is
Expand All @@ -882,14 +891,48 @@ def _check_if_nic_is_primary(self, ifname):
% (ifname, e), logger_func=LOG.error)
raise

# Retry polling network metadata for a limited duration only when the
# calls fail due to timeout. This is because the platform drops packets
# going towards IMDS when it is not a primary nic. If the calls fail
# due to other issues like 410, 503 etc, then it means we are primary
# but IMDS service is unavailable at the moment. Retry indefinitely in
# those cases since we cannot move on without the network metadata.
def network_metadata_exc_cb(msg, exc):
nonlocal metadata_timeout_count, metadata_poll_count
nonlocal metadata_logging_threshold

metadata_poll_count = metadata_poll_count + 1

# Log when needed but back off exponentially to avoid exploding
# the log file.
if metadata_poll_count >= metadata_logging_threshold:
metadata_logging_threshold *= 2
report_diagnostic_event(
"Ran into exception when attempting to reach %s "
"after %d polls." % (msg, metadata_poll_count),
logger_func=LOG.error)

if isinstance(exc, UrlError):
report_diagnostic_event("poll IMDS with %s failed. "
"Exception: %s and code: %s" %
(msg, exc.cause, exc.code),
logger_func=LOG.error)

if exc.cause and isinstance(exc.cause, requests.Timeout):
metadata_timeout_count = metadata_timeout_count + 1
return (metadata_timeout_count <= 10)
return True

# Primary nic detection will be optimized in the future. The fact that
# primary nic is being attached first helps here. Otherwise each nic
# could add several seconds of delay.
try:
imds_md = self.get_imds_data_with_api_fallback(
ifname,
5,
metadata_type.network
0,
metadata_type.network,
network_metadata_exc_cb,
True
)
except Exception as e:
LOG.warning(
Expand Down Expand Up @@ -2030,7 +2073,9 @@ def _generate_network_config_from_fallback_config() -> dict:
def get_metadata_from_imds(fallback_nic,
retries,
md_type=metadata_type.compute,
api_version=IMDS_VER_MIN):
api_version=IMDS_VER_MIN,
exc_cb=retry_on_url_exc,
infinite=False):
"""Query Azure's instance metadata service, returning a dictionary.

If network is not up, setup ephemeral dhcp on fallback_nic to talk to the
Expand All @@ -2049,7 +2094,7 @@ def get_metadata_from_imds(fallback_nic,
kwargs = {'logfunc': LOG.debug,
'msg': 'Crawl of Azure Instance Metadata Service (IMDS)',
'func': _get_metadata_from_imds,
'args': (retries, md_type, api_version,)}
'args': (retries, exc_cb, md_type, api_version, infinite)}
if net.is_up(fallback_nic):
return util.log_time(**kwargs)
else:
Expand All @@ -2067,14 +2112,16 @@ def get_metadata_from_imds(fallback_nic,
@azure_ds_telemetry_reporter
def _get_metadata_from_imds(
retries,
exc_cb,
md_type=metadata_type.compute,
api_version=IMDS_VER_MIN):
api_version=IMDS_VER_MIN,
infinite=False):
url = "{}?api-version={}".format(md_type.value, api_version)
headers = {"Metadata": "true"}
try:
response = readurl(
url, timeout=IMDS_TIMEOUT_IN_SECONDS, headers=headers,
retries=retries, exception_cb=retry_on_url_exc)
retries=retries, exception_cb=exc_cb, infinite=infinite)
except Exception as e:
# pylint:disable=no-member
if isinstance(e, UrlError) and e.code == 400:
Expand Down
95 changes: 84 additions & 11 deletions tests/unittests/test_datasource/test_azure.py
Original file line number Diff line number Diff line change
Expand Up @@ -448,7 +448,7 @@ def test_get_compute_metadata_uses_compute_url(
"http://169.254.169.254/metadata/instance?api-version="
"2019-06-01", exception_cb=mock.ANY,
headers=mock.ANY, retries=mock.ANY,
timeout=mock.ANY)
timeout=mock.ANY, infinite=False)

@mock.patch(MOCKPATH + 'readurl', autospec=True)
@mock.patch(MOCKPATH + 'EphemeralDHCPv4')
Expand All @@ -467,7 +467,7 @@ def test_get_network_metadata_uses_network_url(
"http://169.254.169.254/metadata/instance/network?api-version="
"2019-06-01", exception_cb=mock.ANY,
headers=mock.ANY, retries=mock.ANY,
timeout=mock.ANY)
timeout=mock.ANY, infinite=False)

@mock.patch(MOCKPATH + 'readurl', autospec=True)
@mock.patch(MOCKPATH + 'EphemeralDHCPv4')
Expand All @@ -486,7 +486,7 @@ def test_get_default_metadata_uses_compute_url(
"http://169.254.169.254/metadata/instance?api-version="
"2019-06-01", exception_cb=mock.ANY,
headers=mock.ANY, retries=mock.ANY,
timeout=mock.ANY)
timeout=mock.ANY, infinite=False)

@mock.patch(MOCKPATH + 'readurl', autospec=True)
@mock.patch(MOCKPATH + 'EphemeralDHCPv4WithReporting', autospec=True)
Expand All @@ -511,7 +511,7 @@ def test_get_metadata_performs_dhcp_when_network_is_down(
m_readurl.assert_called_with(
self.network_md_url, exception_cb=mock.ANY,
headers={'Metadata': 'true'}, retries=2,
timeout=dsaz.IMDS_TIMEOUT_IN_SECONDS)
timeout=dsaz.IMDS_TIMEOUT_IN_SECONDS, infinite=False)

@mock.patch('cloudinit.url_helper.time.sleep')
@mock.patch(MOCKPATH + 'net.is_up', autospec=True)
Expand Down Expand Up @@ -2613,15 +2613,22 @@ def test_wait_for_nic_attach_multinic_attach(

def nic_attach_ret(nl_sock, nics_found):
nonlocal m_attach_call_count
if m_attach_call_count == 0:
m_attach_call_count = m_attach_call_count + 1
m_attach_call_count = m_attach_call_count + 1
if m_attach_call_count == 1:
return "eth0"
return "eth1"
elif m_attach_call_count == 2:
return "eth1"
raise RuntimeError("Must have found primary nic by now.")

# Simulate two NICs by adding the same one twice.
md = {
"interface": [
IMDS_NETWORK_METADATA['interface'][0],
IMDS_NETWORK_METADATA['interface'][0]
]
}

def network_metadata_ret(ifname, retries, type):
# Simulate two NICs by adding the same one twice.
md = IMDS_NETWORK_METADATA
md['interface'].append(md['interface'][0])
def network_metadata_ret(ifname, retries, type, exc_cb, infinite):
if ifname == "eth0":
return md
raise requests.Timeout('Fake connection timeout')
Expand All @@ -2643,6 +2650,72 @@ def network_metadata_ret(ifname, retries, type):
self.assertEqual(1, m_imds.call_count)
self.assertEqual(2, m_link_up.call_count)

@mock.patch(MOCKPATH + 'DataSourceAzure.get_imds_data_with_api_fallback')
@mock.patch(MOCKPATH + 'EphemeralDHCPv4')
def test_check_if_nic_is_primary_retries_on_failures(
self, m_dhcpv4, m_imds):
"""Retry polling for network metadata on all failures except timeout"""
dsa = dsaz.DataSourceAzure({}, distro=None, paths=self.paths)
lease = {
'interface': 'eth9', 'fixed-address': '192.168.2.9',
'routers': '192.168.2.1', 'subnet-mask': '255.255.255.0',
'unknown-245': '624c3620'}

eth0Retries = []
eth1Retries = []
# Simulate two NICs by adding the same one twice.
md = {
"interface": [
IMDS_NETWORK_METADATA['interface'][0],
IMDS_NETWORK_METADATA['interface'][0]
]
}

def network_metadata_ret(ifname, retries, type, exc_cb, infinite):
nonlocal eth0Retries, eth1Retries

# Simulate readurl functionality with retries and
# exception callbacks so that the callback logic can be
# validated.
if ifname == "eth0":
cause = requests.HTTPError()
for _ in range(0, 15):
error = url_helper.UrlError(cause=cause, code=410)
eth0Retries.append(exc_cb("No goal state.", error))
else:
cause = requests.Timeout('Fake connection timeout')
for _ in range(0, 10):
error = url_helper.UrlError(cause=cause)
eth1Retries.append(exc_cb("Connection timeout", error))
# Should stop retrying after 10 retries
eth1Retries.append(exc_cb("Connection timeout", error))
raise cause
return md

m_imds.side_effect = network_metadata_ret

dhcp_ctx = mock.MagicMock(lease=lease)
dhcp_ctx.obtain_lease.return_value = lease
m_dhcpv4.return_value = dhcp_ctx

is_primary, expected_nic_count = dsa._check_if_nic_is_primary("eth0")
self.assertEqual(True, is_primary)
self.assertEqual(2, expected_nic_count)

# All Eth0 errors are non-timeout errors. So we should have been
# retrying indefinitely until success.
for i in eth0Retries:
self.assertTrue(i)

is_primary, expected_nic_count = dsa._check_if_nic_is_primary("eth1")
self.assertEqual(False, is_primary)

# All Eth1 errors are timeout errors. Retry happens for a max of 10 and
# then we should have moved on assuming it is not the primary nic.
for i in range(0, 10):
self.assertTrue(eth1Retries[i])
self.assertFalse(eth1Retries[10])

@mock.patch('cloudinit.distros.networking.LinuxNetworking.try_set_link_up')
def test_wait_for_link_up_returns_if_already_up(
self, m_is_link_up):
Expand Down