Skip to content

Commit

Permalink
Retry getting network metadata during nic attach when we get 410
Browse files Browse the repository at this point in the history
  • Loading branch information
Aswin Rajamannar committed Apr 20, 2021
1 parent d132356 commit 1b6acc0
Show file tree
Hide file tree
Showing 2 changed files with 120 additions and 20 deletions.
45 changes: 36 additions & 9 deletions cloudinit/sources/DataSourceAzure.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,7 @@
from xml.dom import minidom
import xml.etree.ElementTree as ET
from enum import Enum
import requests

from cloudinit import dmi
from cloudinit import log as logging
Expand Down Expand Up @@ -612,7 +613,9 @@ def get_imds_data_with_api_fallback(
self,
fallback_nic,
retries,
md_type=metadata_type.compute):
md_type=metadata_type.compute,
exc_cb=retry_on_url_exc,
infinite=False):
"""
Wrapper for get_metadata_from_imds so that we can have flexibility
in which IMDS api-version we use. If a particular instance of IMDS
Expand All @@ -632,7 +635,8 @@ def get_imds_data_with_api_fallback(
fallback_nic=fallback_nic,
retries=0,
md_type=md_type,
api_version=IMDS_VER_WANT
api_version=IMDS_VER_WANT,
exc_cb=exc_cb
)
except UrlError as err:
LOG.info(
Expand All @@ -655,7 +659,9 @@ def get_imds_data_with_api_fallback(
fallback_nic=fallback_nic,
retries=retries,
md_type=md_type,
api_version=IMDS_VER_MIN
api_version=IMDS_VER_MIN,
exc_cb=exc_cb,
infinite=infinite
)

def device_name_to_device(self, name):
Expand Down Expand Up @@ -858,6 +864,7 @@ def _check_if_nic_is_primary(self, ifname):
is_primary = False
expected_nic_count = -1
imds_md = None
network_md_call_count = 0

# For now, only a VM's primary NIC can contact IMDS and WireServer. If
# DHCP fails for a NIC, we have no mechanism to determine if the NIC is
Expand All @@ -882,14 +889,30 @@ def _check_if_nic_is_primary(self, ifname):
% (ifname, e), logger_func=LOG.error)
raise

# Retry polling network metadata for a limited duration only when the
# calls fail due to timeout. This is because the platform drops packets
# going towards IMDS when it is not a primary nic. If the calls fail
# due to other issues like 410, 503 etc, then it means we are primary
# but IMDS service is unavailable at the moment. Retry indefinitely in
# those cases since we cannot move on without the network metadata.
def network_metadata_exc_cb(msg, exc):
nonlocal network_md_call_count

if exc.cause and isinstance(exc.cause, requests.Timeout):
network_md_call_count = network_md_call_count + 1
return (network_md_call_count <= 10)
return True

# Primary nic detection will be optimized in the future. The fact that
# primary nic is being attached first helps here. Otherwise each nic
# could add several seconds of delay.
try:
imds_md = self.get_imds_data_with_api_fallback(
ifname,
5,
metadata_type.network
0,
metadata_type.network,
network_metadata_exc_cb,
True
)
except Exception as e:
LOG.warning(
Expand Down Expand Up @@ -2030,7 +2053,9 @@ def _generate_network_config_from_fallback_config() -> dict:
def get_metadata_from_imds(fallback_nic,
retries,
md_type=metadata_type.compute,
api_version=IMDS_VER_MIN):
api_version=IMDS_VER_MIN,
exc_cb=retry_on_url_exc,
infinite=False):
"""Query Azure's instance metadata service, returning a dictionary.
If network is not up, setup ephemeral dhcp on fallback_nic to talk to the
Expand All @@ -2049,7 +2074,7 @@ def get_metadata_from_imds(fallback_nic,
kwargs = {'logfunc': LOG.debug,
'msg': 'Crawl of Azure Instance Metadata Service (IMDS)',
'func': _get_metadata_from_imds,
'args': (retries, md_type, api_version,)}
'args': (retries, exc_cb, md_type, api_version, infinite)}
if net.is_up(fallback_nic):
return util.log_time(**kwargs)
else:
Expand All @@ -2067,14 +2092,16 @@ def get_metadata_from_imds(fallback_nic,
@azure_ds_telemetry_reporter
def _get_metadata_from_imds(
retries,
exc_cb,
md_type=metadata_type.compute,
api_version=IMDS_VER_MIN):
api_version=IMDS_VER_MIN,
infinite=False):
url = "{}?api-version={}".format(md_type.value, api_version)
headers = {"Metadata": "true"}
try:
response = readurl(
url, timeout=IMDS_TIMEOUT_IN_SECONDS, headers=headers,
retries=retries, exception_cb=retry_on_url_exc)
retries=retries, exception_cb=exc_cb, infinite=infinite)
except Exception as e:
# pylint:disable=no-member
if isinstance(e, UrlError) and e.code == 400:
Expand Down
95 changes: 84 additions & 11 deletions tests/unittests/test_datasource/test_azure.py
Original file line number Diff line number Diff line change
Expand Up @@ -448,7 +448,7 @@ def test_get_compute_metadata_uses_compute_url(
"http://169.254.169.254/metadata/instance?api-version="
"2019-06-01", exception_cb=mock.ANY,
headers=mock.ANY, retries=mock.ANY,
timeout=mock.ANY)
timeout=mock.ANY, infinite=False)

@mock.patch(MOCKPATH + 'readurl', autospec=True)
@mock.patch(MOCKPATH + 'EphemeralDHCPv4')
Expand All @@ -467,7 +467,7 @@ def test_get_network_metadata_uses_network_url(
"http://169.254.169.254/metadata/instance/network?api-version="
"2019-06-01", exception_cb=mock.ANY,
headers=mock.ANY, retries=mock.ANY,
timeout=mock.ANY)
timeout=mock.ANY, infinite=False)

@mock.patch(MOCKPATH + 'readurl', autospec=True)
@mock.patch(MOCKPATH + 'EphemeralDHCPv4')
Expand All @@ -486,7 +486,7 @@ def test_get_default_metadata_uses_compute_url(
"http://169.254.169.254/metadata/instance?api-version="
"2019-06-01", exception_cb=mock.ANY,
headers=mock.ANY, retries=mock.ANY,
timeout=mock.ANY)
timeout=mock.ANY, infinite=False)

@mock.patch(MOCKPATH + 'readurl', autospec=True)
@mock.patch(MOCKPATH + 'EphemeralDHCPv4WithReporting', autospec=True)
Expand All @@ -511,7 +511,7 @@ def test_get_metadata_performs_dhcp_when_network_is_down(
m_readurl.assert_called_with(
self.network_md_url, exception_cb=mock.ANY,
headers={'Metadata': 'true'}, retries=2,
timeout=dsaz.IMDS_TIMEOUT_IN_SECONDS)
timeout=dsaz.IMDS_TIMEOUT_IN_SECONDS, infinite=False)

@mock.patch('cloudinit.url_helper.time.sleep')
@mock.patch(MOCKPATH + 'net.is_up', autospec=True)
Expand Down Expand Up @@ -2613,15 +2613,22 @@ def test_wait_for_nic_attach_multinic_attach(

def nic_attach_ret(nl_sock, nics_found):
nonlocal m_attach_call_count
if m_attach_call_count == 0:
m_attach_call_count = m_attach_call_count + 1
m_attach_call_count = m_attach_call_count + 1
if m_attach_call_count == 1:
return "eth0"
return "eth1"
elif m_attach_call_count == 2:
return "eth1"
raise RuntimeError("Must have found primary nic by now.")

# Simulate two NICs by adding the same one twice.
md = {
"interface": [
IMDS_NETWORK_METADATA['interface'][0],
IMDS_NETWORK_METADATA['interface'][0]
]
}

def network_metadata_ret(ifname, retries, type):
# Simulate two NICs by adding the same one twice.
md = IMDS_NETWORK_METADATA
md['interface'].append(md['interface'][0])
def network_metadata_ret(ifname, retries, type, exc_cb, infinite):
if ifname == "eth0":
return md
raise requests.Timeout('Fake connection timeout')
Expand All @@ -2643,6 +2650,72 @@ def network_metadata_ret(ifname, retries, type):
self.assertEqual(1, m_imds.call_count)
self.assertEqual(2, m_link_up.call_count)

@mock.patch(MOCKPATH + 'DataSourceAzure.get_imds_data_with_api_fallback')
@mock.patch(MOCKPATH + 'EphemeralDHCPv4')
def test_check_if_nic_is_primary_retries_on_failures(
self, m_dhcpv4, m_imds):
"""Retry polling for network metadata on all failures except timeout"""
dsa = dsaz.DataSourceAzure({}, distro=None, paths=self.paths)
lease = {
'interface': 'eth9', 'fixed-address': '192.168.2.9',
'routers': '192.168.2.1', 'subnet-mask': '255.255.255.0',
'unknown-245': '624c3620'}

eth0Retries = []
eth1Retries = []
# Simulate two NICs by adding the same one twice.
md = {
"interface": [
IMDS_NETWORK_METADATA['interface'][0],
IMDS_NETWORK_METADATA['interface'][0]
]
}

def network_metadata_ret(ifname, retries, type, exc_cb, infinite):
nonlocal eth0Retries, eth1Retries

# Simulate readurl functionality with retries and
# exception callbacks so that the callback logic can be
# validated.
if ifname == "eth0":
cause = requests.HTTPError()
for i in range(0, 15):
error = url_helper.UrlError(cause=cause, code=410)
eth0Retries.append(exc_cb("No goal state.", error))
else:
cause = requests.Timeout('Fake connection timeout')
for i in range(0, 10):
error = url_helper.UrlError(cause=cause)
eth1Retries.append(exc_cb("Connection timeout", error))
# Should stop retrying after 10 retries
eth1Retries.append(exc_cb("Connection timeout", error))
raise cause
return md

m_imds.side_effect = network_metadata_ret

dhcp_ctx = mock.MagicMock(lease=lease)
dhcp_ctx.obtain_lease.return_value = lease
m_dhcpv4.return_value = dhcp_ctx

is_primary, expected_nic_count = dsa._check_if_nic_is_primary("eth0")
self.assertEqual(True, is_primary)
self.assertEqual(2, expected_nic_count)

# All Eth0 errors are non-timeout errors. So we should have been
# retrying indefinitely until success.
for i in eth0Retries:
self.assertTrue(i)

is_primary, expected_nic_count = dsa._check_if_nic_is_primary("eth1")
self.assertEqual(False, is_primary)

# All Eth1 errors are timeout errors. Retry happens for a max of 10 and
# then we should have moved on assuming it is not the primary nic.
for i in range(0, 10):
self.assertTrue(eth1Retries[i])
self.assertFalse(eth1Retries[10])

@mock.patch('cloudinit.distros.networking.LinuxNetworking.try_set_link_up')
def test_wait_for_link_up_returns_if_already_up(
self, m_is_link_up):
Expand Down

0 comments on commit 1b6acc0

Please sign in to comment.