Skip to content

Commit

Permalink
azure: workaround to disable reporting IMDS failures on Azure Stack
Browse files Browse the repository at this point in the history
There are Azure Stack implementations which do not support IMDS.  The
only way to detect this is by checking for the presence of the
static route to IMDS.  If we don't see the route to IMDS, limit the
failure report to the host via KVP so we don't report failure to
wireserver in these cases.

There will be future work to detect that we are running on Azure
Stack and have explicit configuration passed along via ovf-env.xml
to toggle IMDS support as the environment dictates.  Until then,
this approach minimizes the risk of regression on Azure public cloud
while allowing Azure Stack VMs to provision, albeit with a logged
error that it could not fetch metadata.

- refactor _check_if_primary() to track configured routes

- use this to toggle reporting behavior

- add duration check to TestGetMetadataFromImds tests in case we
  want to toggle duration in the future.

Example diagnostic on Azure public cloud:

```
2023-10-27 12:37:04,634 - azure.py[DEBUG]: Obtained DHCP lease on interface 'eth0' (primary=True driver='hv_netvsc' router='10.0.0.1' routes=[('0.0.0.0/0', '10.0.0.1'), ('168.63.129.16/32', '10.0.0.1'), ('169.254.169.254/32', '10.0.0.1')] lease={'interface': 'eth0', 'fixed-address': '10.0.0.4', 'server-name': 'IAD011091108004SOC', 'subnet-mask': '255.255.255.0', 'dhcp-lease-time': '4294967295', 'routers': '10.0.0.1', 'dhcp-message-type': '5', 'domain-name-servers': '168.63.129.16', 'dhcp-server-identifier': '168.63.129.16', 'dhcp-renewal-time': '4294967295', 'rfc3442-classless-static-routes': '0,10,0,0,1,32,168,63,129,16,10,0,0,1,32,169,254,169,254,10,0,0,1', 'dhcp-rebinding-time': '4294967295', 'unknown-245': 'a8:3f:81:10', 'domain-name': 'uejkdvkrjiqe1jxrijlvafqihe.bx.internal.cloudapp.net', 'renew': '1 2159/12/03 19:05:19', 'rebind': '1 2159/12/03 19:05:19', 'expire': '1 2159/12/03 19:05:19'} imds_routed=True wireserver_routed=True)
```

Example diagnostic on Azure Stack:

```
2023-10-27 12:35:47,363 - azure.py[DEBUG]: Obtained DHCP lease on interface 'eth0' (primary=True driver='hv_netvsc' router='10.126.64.1' routes=[('0.0.0.0/0', '10.126.64.1'), ('168.63.129.16/32', '10.126.64.1')] lease={'interface': 'eth0', 'fixed-address': '10.126.64.35', 'subnet-mask': '255.255.252.0', 'routers': '10.126.64.1', 'dhcp-lease-time': '4294967295', 'dhcp-message-type': '5', 'domain-name-servers': '10.50.10.50,10.50.50.50', 'dhcp-server-identifier': '168.63.129.16', 'interface-mtu': '1500', 'dhcp-renewal-time': '4294967295', 'unknown-245': 'a8:3f:81:10', 'rfc3442-classless-static-routes': '0,10,126,64,1,32,168,63,129,16,10,126,64,1', 'dhcp-rebinding-time': '4294967295', 'domain-name': 'corp.microsoft.com', 'renew': '1 2159/12/03 19:04:02', 'rebind': '1 2159/12/03 19:04:02', 'expire': '1 2159/12/03 19:04:02'} imds_routed=False wireserver_routed=True)
```

We can see that IMDS routing is detected appropriately.

Signed-off-by: Chris Patterson <cpatterson@microsoft.com>
  • Loading branch information
cjp256 authored and blackboxsw committed Nov 13, 2023
1 parent 9def422 commit 745fbf6
Show file tree
Hide file tree
Showing 2 changed files with 55 additions and 12 deletions.
44 changes: 34 additions & 10 deletions cloudinit/sources/DataSourceAzure.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,8 @@
from time import sleep, time
from typing import Any, Dict, List, Optional

import requests

from cloudinit import net, sources, ssh_util, subp, util
from cloudinit.event import EventScope, EventType
from cloudinit.net import device_driver
Expand Down Expand Up @@ -334,6 +336,8 @@ def __init__(self, sys_cfg, distro, paths):
self._iso_dev = None
self._network_config = None
self._ephemeral_dhcp_ctx = None
self._route_configured_for_imds = False
self._route_configured_for_wireserver = False
self._wireserver_endpoint = DEFAULT_WIRESERVER_ENDPOINT
self._reported_ready_marker_file = os.path.join(
paths.cloud_dir, "data", "reported_ready"
Expand All @@ -344,6 +348,8 @@ def _unpickle(self, ci_pkl_version: int) -> None:

self._ephemeral_dhcp_ctx = None
self._iso_dev = None
self._route_configured_for_imds = False
self._route_configured_for_wireserver = False
self._wireserver_endpoint = DEFAULT_WIRESERVER_ENDPOINT
self._reported_ready_marker_file = os.path.join(
self.paths.cloud_dir, "data", "reported_ready"
Expand Down Expand Up @@ -371,17 +377,21 @@ def _check_if_primary(self, ephipv4: EphemeralIPv4Network) -> bool:
# Primary nics must contain routes.
return False

routed_networks = [r[0] for r in ephipv4.static_routes]
wireserver_route = f"{self._wireserver_endpoint}/32"
primary = any(
n in routed_networks
for n in [
"169.254.169.254/32",
wireserver_route,
]
routed_networks = [r[0].split("/")[0] for r in ephipv4.static_routes]

# Expected to be true for all of Azure public cloud and future Azure
# Stack versions with IMDS capabilities, but false for existing ones.
self._route_configured_for_imds = "169.254.169.254" in routed_networks

# Expected to be true for Azure public cloud and Azure Stack.
self._route_configured_for_wireserver = (
self._wireserver_endpoint in routed_networks
)

return primary
return (
self._route_configured_for_imds
or self._route_configured_for_wireserver
)

@azure_ds_telemetry_reporter
def _setup_ephemeral_networking(
Expand Down Expand Up @@ -503,14 +513,17 @@ def _setup_ephemeral_networking(
primary = self._check_if_primary(ephipv4)
report_diagnostic_event(
"Obtained DHCP lease on interface %r "
"(primary=%r driver=%r router=%r routes=%r lease=%r)"
"(primary=%r driver=%r router=%r routes=%r lease=%r "
"imds_routed=%r wireserver_routed=%r)"
% (
iface,
primary,
driver,
ephipv4.router,
ephipv4.static_routes,
lease,
self._route_configured_for_imds,
self._route_configured_for_wireserver,
),
logger_func=LOG.debug,
)
Expand All @@ -531,6 +544,8 @@ def _setup_ephemeral_networking(
@azure_ds_telemetry_reporter
def _teardown_ephemeral_networking(self) -> None:
"""Teardown ephemeral networking."""
self._route_configured_for_imds = False
self._route_configured_for_wireserver = False
if self._ephemeral_dhcp_ctx is None:
return

Expand Down Expand Up @@ -753,6 +768,7 @@ def crawl_metadata(self):
def get_metadata_from_imds(self, report_failure: bool) -> Dict:
start_time = time()
retry_deadline = start_time + 300

error_string: Optional[str] = None
error_report: Optional[errors.ReportableError] = None
try:
Expand All @@ -765,6 +781,14 @@ def get_metadata_from_imds(self, report_failure: bool) -> Dict:
error_report = errors.ReportableErrorImdsUrlError(
exception=error, duration=duration
)

# As a temporary workaround to support Azure Stack implementations
# which may not enable IMDS, don't report connection errors to
# wireserver if route is not configured.
if not self._route_configured_for_imds and isinstance(
error.cause, requests.ConnectionError
):
report_failure = False
except ValueError as error:
error_string = str(error)
error_report = errors.ReportableErrorImdsMetadataParsingException(
Expand Down
23 changes: 21 additions & 2 deletions tests/unittests/sources/test_azure.py
Original file line number Diff line number Diff line change
Expand Up @@ -3440,7 +3440,8 @@ def test_retry_process_error(
"routes=[('0.0.0.0/0', '10.0.0.1'), "
"('168.63.129.16/32', '10.0.0.1'), "
"('169.254.169.254/32', '10.0.0.1')] "
"lease={'interface': 'fakeEth0'})",
"lease={'interface': 'fakeEth0'} "
"imds_routed=True wireserver_routed=True)",
logger_func=dsaz.LOG.debug,
),
]
Expand Down Expand Up @@ -4366,6 +4367,7 @@ def test_imds_failure_results_in_provisioning_failure(self):


class TestGetMetadataFromImds:
@pytest.mark.parametrize("route_configured_for_imds", [False, True])
@pytest.mark.parametrize("report_failure", [False, True])
@pytest.mark.parametrize(
"exception,reported_error_type",
Expand All @@ -4387,14 +4389,18 @@ def test_errors(
mock_azure_report_failure_to_fabric,
mock_imds_fetch_metadata_with_api_fallback,
mock_kvp_report_failure_to_host,
mock_time,
monkeypatch,
report_failure,
reported_error_type,
route_configured_for_imds,
):
monkeypatch.setattr(
azure_ds, "_is_ephemeral_networking_up", lambda: True
)
azure_ds._route_configured_for_imds = route_configured_for_imds
mock_imds_fetch_metadata_with_api_fallback.side_effect = exception
mock_time.return_value = 0.0

assert (
azure_ds.get_metadata_from_imds(report_failure=report_failure)
Expand All @@ -4404,13 +4410,26 @@ def test_errors(
mock.call(retry_deadline=mock.ANY)
]

expected_duration = 300
assert (
mock_imds_fetch_metadata_with_api_fallback.call_args[1][
"retry_deadline"
]
== expected_duration
)

reported_error = mock_kvp_report_failure_to_host.call_args[0][0]
assert isinstance(reported_error, reported_error_type)
assert reported_error.supporting_data["exception"] == repr(exception)
assert mock_kvp_report_failure_to_host.mock_calls == [
mock.call(reported_error)
]
if report_failure:

if report_failure and (
route_configured_for_imds
or not isinstance(exception, url_helper.UrlError)
or not isinstance(exception.cause, requests.ConnectionError)
):
assert mock_azure_report_failure_to_fabric.mock_calls == [
mock.call(endpoint=mock.ANY, error=reported_error)
]
Expand Down

0 comments on commit 745fbf6

Please sign in to comment.