Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Move fetch_vm_settings() to HostPluginProtocol #2486

Merged
merged 5 commits into from
Jan 26, 2022
Merged
Show file tree
Hide file tree
Changes from 1 commit
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
182 changes: 180 additions & 2 deletions azurelinuxagent/common/protocol/hostplugin.py
Original file line number Diff line number Diff line change
Expand Up @@ -25,9 +25,11 @@
from azurelinuxagent.common import logger
from azurelinuxagent.common.errorstate import ErrorState, ERROR_STATE_HOST_PLUGIN_FAILURE
from azurelinuxagent.common.event import WALAEventOperation, add_event
from azurelinuxagent.common.exception import HttpError, ProtocolError
from azurelinuxagent.common.future import ustr
from azurelinuxagent.common.exception import HttpError, ProtocolError, ResourceGoneError
from azurelinuxagent.common.utils.flexible_version import FlexibleVersion
from azurelinuxagent.common.future import ustr, httpclient
from azurelinuxagent.common.protocol.healthservice import HealthService
from azurelinuxagent.common.protocol.extensions_goal_state_factory import ExtensionsGoalStateFactory
from azurelinuxagent.common.utils import restutil
from azurelinuxagent.common.utils import textutil
from azurelinuxagent.common.utils.textutil import remove_bom
Expand Down Expand Up @@ -79,6 +81,11 @@ def __init__(self, endpoint, container_id, role_config_name):
self.status_error_state = ErrorState(min_timedelta=ERROR_STATE_HOST_PLUGIN_FAILURE)
self.fetch_last_timestamp = None
self.status_last_timestamp = None
self._host_plugin_version = FlexibleVersion("0.0.0.0") # Version 0 means "unknown"
self._host_plugin_supports_vm_settings = False
self._host_plugin_supports_vm_settings_next_check = datetime.datetime.now()
self._vm_settings_error_reporter = _VmSettingsErrorReporter()
self._cached_vm_settings = None # Cached value of the most recent ExtensionsGoalStateFromVmSettings

@staticmethod
def _extract_deployment_id(role_config_name):
Expand Down Expand Up @@ -383,3 +390,174 @@ def _base64_encode(self, data):
if PY_VERSION_MAJOR > 2:
return s.decode('utf-8')
return s

Copy link
Member Author

@narrieta narrieta Jan 25, 2022

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Moved from the WireClient, the only difference in the implementation is the use of ResourceGoneError to report 410s (added the exception here because the retry logic needs to be implemented by the caller)

def fetch_vm_settings(self, force_update):
"""
Queries the vmSettings from the HostGAPlugin and returns an (ExtensionsGoalStateFromVmSettings, bool) tuple with the vmSettings and
a boolean indicating if they are an updated (True) or a cached value (False).
larohra marked this conversation as resolved.
Show resolved Hide resolved

Raises TypeError if the HostGAPlugin does not support the vmSettings API, ResourceGoneError if the container ID and roleconfig name
nagworld9 marked this conversation as resolved.
Show resolved Hide resolved
need to be refreshed, or ProtocolError if the request fails for any other reason (e.g. not supported, time out, server error).
"""
def raise_not_supported(reset_state=False):
if reset_state:
self._host_plugin_supports_vm_settings = False
self._host_plugin_supports_vm_settings_next_check = datetime.datetime.now() + datetime.timedelta(hours=6) # check again in 6 hours
# "Not supported" is not considered an error, so don't use self._vm_settings_error_reporter to report it
logger.info("vmSettings is not supported")
add_event(op=WALAEventOperation.HostPlugin, message="vmSettings is not supported", is_success=True)
raise VmSettingsNotSupported()

try:
# Raise if VmSettings are not supported but check for periodically since the HostGAPlugin could have been updated since the last check
if not self._host_plugin_supports_vm_settings and self._host_plugin_supports_vm_settings_next_check > datetime.datetime.now():
raise_not_supported()

etag = None if force_update or self._cached_vm_settings is None else self._cached_vm_settings.etag
correlation_id = str(uuid.uuid4())

def format_message(msg):
return "GET vmSettings [correlation ID: {0} eTag: {1}]: {2}".format(correlation_id, etag, msg)

def get_vm_settings():
url, headers = self.get_vm_settings_request(correlation_id)
if etag is not None:
headers['if-none-match'] = etag
return restutil.http_get(url, headers=headers, use_proxy=False, max_retry=1, return_raw_response=True)

self._vm_settings_error_reporter.report_request()

response = get_vm_settings()

if response.status == httpclient.GONE:
raise ResourceGoneError()

if response.status == httpclient.NOT_FOUND: # the HostGAPlugin does not support FastTrack
raise_not_supported(reset_state=True)

if response.status == httpclient.NOT_MODIFIED: # The goal state hasn't changed, return the current instance
return self._cached_vm_settings, False

if response.status != httpclient.OK:
error_description = restutil.read_response_error(response)
# For historical reasons the HostGAPlugin returns 502 (BAD_GATEWAY) for internal errors instead of using
# 500 (INTERNAL_SERVER_ERROR). We add a short prefix to the error message in the hope that it will help
# clear any confusion produced by the poor choice of status code.
if response.status == httpclient.BAD_GATEWAY:
error_description = "[Internal error in HostGAPlugin] {0}".format(error_description)
error_description = format_message(error_description)

if 400 <= response.status <= 499:
self._vm_settings_error_reporter.report_error(error_description, _VmSettingsError.ClientError)
elif 500 <= response.status <= 599:
self._vm_settings_error_reporter.report_error(error_description, _VmSettingsError.ServerError)
else:
self._vm_settings_error_reporter.report_error(error_description)

raise ProtocolError(error_description)

for h in response.getheaders():
if h[0].lower() == 'etag':
response_etag = h[1]
break
else: # since the vmSettings were updated, the response must include an etag
nagworld9 marked this conversation as resolved.
Show resolved Hide resolved
message = format_message("The vmSettings response does not include an Etag header")
self._vm_settings_error_reporter.report_error(message)
raise ProtocolError(message)

response_content = ustr(response.read(), encoding='utf-8')

vm_settings = ExtensionsGoalStateFactory.create_from_vm_settings(response_etag, response_content)

# log the HostGAPlugin version
if vm_settings.host_ga_plugin_version != self._host_plugin_version:
self._host_plugin_version = vm_settings.host_ga_plugin_version
message = "HostGAPlugin version: {0}".format(vm_settings.host_ga_plugin_version)
logger.info(message)
add_event(op=WALAEventOperation.HostPlugin, message=message, is_success=True)

# Don't support HostGAPlugin versions older than 115
if vm_settings.host_ga_plugin_version < FlexibleVersion("1.0.8.115"):
raise_not_supported(reset_state=True)

logger.info("Fetched new vmSettings [correlation ID: {0} New eTag: {1}]", correlation_id, vm_settings.etag)
self._host_plugin_supports_vm_settings = True
self._cached_vm_settings = vm_settings
return vm_settings, True

except (ProtocolError, ResourceGoneError, VmSettingsNotSupported):
raise
except Exception as exception:
if isinstance(exception, IOError) and "timed out" in ustr(exception):
message = format_message("Timeout")
self._vm_settings_error_reporter.report_error(message, _VmSettingsError.Timeout)
else:
message = format_message("Request failed: {0}".format(textutil.format_exception(exception)))
self._vm_settings_error_reporter.report_error(message, _VmSettingsError.RequestFailed)
raise ProtocolError(message)
finally:
self._vm_settings_error_reporter.report_summary()


class VmSettingsNotSupported(TypeError):
pass


class _VmSettingsError(object):
ServerError = 'ServerError'
ClientError = 'ClientError'
Timeout = 'Timeout'
RequestFailed = 'RequestFailed'


class _VmSettingsErrorReporter(object):
_MaxErrors = 5 # Max number of error reported by period
_Period = datetime.timedelta(hours=1) # How often to report the summary

def __init__(self):
self._reset()

def _reset(self):
self._request_count = 0 # Total number of vmSettings HTTP requests
self._error_count = 0 # Total number of errors issuing vmSettings requests (includes all kinds of errors)
self._server_error_count = 0 # Count of server side errors (HTTP status in the 500s)
self._client_error_count = 0 # Count of client side errors (HTTP status in the 400s)
self._timeout_count = 0 # Count of timeouts on vmSettings requests
self._request_failure_count = 0 # Total count of requests that could not be issued (does not include timeouts or requests that were actually issued and failed, for example, with 500 or 400 statuses)
self._next_period = datetime.datetime.now() + _VmSettingsErrorReporter._Period

def report_request(self):
self._request_count += 1

def report_error(self, error, category=None):
self._error_count += 1

if self._error_count <= _VmSettingsErrorReporter._MaxErrors:
add_event(op=WALAEventOperation.VmSettings, message=error, is_success=False, log_event=False)

if category == _VmSettingsError.ServerError:
self._server_error_count += 1
elif category == _VmSettingsError.ClientError:
self._client_error_count += 1
elif category == _VmSettingsError.Timeout:
self._timeout_count += 1
elif category == _VmSettingsError.RequestFailed:
self._request_failure_count += 1

def report_summary(self):
if datetime.datetime.now() >= self._next_period:
summary = {
"requests": self._request_count,
"errors": self._error_count,
"serverErrors": self._server_error_count,
"clientErrors": self._client_error_count,
"timeouts": self._timeout_count,
"failedRequests": self._request_failure_count
}
# always send telemetry, but log errors only
larohra marked this conversation as resolved.
Show resolved Hide resolved
message = json.dumps(summary)
add_event(op=WALAEventOperation.VmSettingsSummary, message=message, is_success=False, log_event=False)
if self._error_count > 0:
logger.info("[VmSettingsSummary] {0}", message)

self._reset()
Loading