From 6f0b291682a490d9c227f2a0fa8b1a64f5bc4f06 Mon Sep 17 00:00:00 2001 From: Hans Krijger Date: Tue, 29 May 2018 12:46:38 -0700 Subject: [PATCH 01/61] refactor monitor daemon --- azurelinuxagent/ga/exthandlers.py | 4 - azurelinuxagent/ga/monitor.py | 172 ++++++++++++++++-------------- 2 files changed, 94 insertions(+), 82 deletions(-) diff --git a/azurelinuxagent/ga/exthandlers.py b/azurelinuxagent/ga/exthandlers.py index a802e1927d..28953dd7e9 100644 --- a/azurelinuxagent/ga/exthandlers.py +++ b/azurelinuxagent/ga/exthandlers.py @@ -236,10 +236,6 @@ def run(self): message=msg) return - def run_status(self): - self.report_ext_handlers_status() - return - def get_upgrade_guid(self, name): return self.last_upgrade_guids.get(name, (None, False))[0] diff --git a/azurelinuxagent/ga/monitor.py b/azurelinuxagent/ga/monitor.py index e1bfd2a6f3..ce2b2b3d03 100644 --- a/azurelinuxagent/ga/monitor.py +++ b/azurelinuxagent/ga/monitor.py @@ -91,6 +91,10 @@ def get_monitor_handler(): class MonitorHandler(object): + + TELEMETRY_HEARTBEAT_PERIOD = datetime.timedelta(minutes=30) + EVENT_COLLECTION_PERIOD = datetime.timedelta(minutes=1) + def __init__(self): self.osutil = get_osutil() self.protocol_util = get_protocol_util() @@ -172,91 +176,62 @@ def collect_event(self, evt_file_name): msg = "Failed to process {0}, {1}".format(evt_file_name, e) raise EventError(msg) - def collect_and_send_events(self): - event_list = TelemetryEventList() - event_dir = os.path.join(conf.get_lib_dir(), "events") - event_files = os.listdir(event_dir) - for event_file in event_files: - if not event_file.endswith(".tld"): - continue - event_file_path = os.path.join(event_dir, event_file) - try: - data_str = self.collect_event(event_file_path) - except EventError as e: - logger.error("{0}", e) - continue + def collect_and_send_events(self, protocol, last_event_collection): + if last_event_collection is None: + last_event_collection = datetime.datetime.utcnow() - MonitorHandler.EVENT_COLLECTION_PERIOD - try: - event = parse_event(data_str) - self.add_sysinfo(event) - event_list.events.append(event) - except (ValueError, ProtocolError) as e: - logger.warn("Failed to decode event file: {0}", e) - continue - - if len(event_list.events) == 0: - return + if datetime.datetime.utcnow() < (last_event_collection + MonitorHandler.EVENT_COLLECTION_PERIOD): + return last_event_collection try: - protocol = self.protocol_util.get_protocol() - protocol.report_event(event_list) - except ProtocolError as e: - logger.error("{0}", e) + event_list = TelemetryEventList() + event_dir = os.path.join(conf.get_lib_dir(), "events") + event_files = os.listdir(event_dir) + for event_file in event_files: + if not event_file.endswith(".tld"): + continue + event_file_path = os.path.join(event_dir, event_file) + try: + data_str = self.collect_event(event_file_path) + except EventError as e: + logger.error("{0}", e) + continue + + try: + event = parse_event(data_str) + self.add_sysinfo(event) + event_list.events.append(event) + except (ValueError, ProtocolError) as e: + logger.warn("Failed to decode event file: {0}", e) + continue + + if len(event_list.events) == 0: + return - def daemon(self): - period = datetime.timedelta(minutes=30) - protocol = self.protocol_util.get_protocol() - last_heartbeat = datetime.datetime.utcnow() - period + try: + protocol.report_event(event_list) + except ProtocolError as e: + logger.error("{0}", e) + except Exception as e: + logger.warn("Failed to send events: {0}", e) - # Create a new identifier on each restart and reset the counter - heartbeat_id = str(uuid.uuid4()).upper() + return datetime.datetime.utcnow() + + def daemon(self): + # Create a new identifier on each restart, reset the counter and all events counter = 0 + last_event_collection = None + last_telemetry_heartbeat = None + heartbeat_id = str(uuid.uuid4()).upper() + protocol = self.protocol_util.get_protocol() while True: - if datetime.datetime.utcnow() >= (last_heartbeat + period): - last_heartbeat = datetime.datetime.utcnow() - incarnation = protocol.get_incarnation() - dropped_packets = self.osutil.get_firewall_dropped_packets( - protocol.endpoint) - - msg = "{0};{1};{2};{3}".format( - incarnation, counter, heartbeat_id, dropped_packets) - - add_event( - name=AGENT_NAME, - version=CURRENT_VERSION, - op=WALAEventOperation.HeartBeat, - is_success=True, - message=msg, - log_event=False) - - counter += 1 - - io_errors = IOErrorCounter.get_and_reset() - hostplugin_errors = io_errors.get("hostplugin") - protocol_errors = io_errors.get("protocol") - other_errors = io_errors.get("other") - - if hostplugin_errors > 0 \ - or protocol_errors > 0 \ - or other_errors > 0: - - msg = "hostplugin:{0};protocol:{1};other:{2}"\ - .format(hostplugin_errors, - protocol_errors, - other_errors) - add_event( - name=AGENT_NAME, - version=CURRENT_VERSION, - op=WALAEventOperation.HttpErrors, - is_success=True, - message=msg, - log_event=False) - - try: - self.collect_and_send_events() - except Exception as e: - logger.warn("Failed to send events: {0}", e) - time.sleep(60) + last_telemetry_heartbeat = self.send_telemetry_heartbeat(protocol, + counter, + heartbeat_id, + last_telemetry_heartbeat) + last_event_collection = self.collect_and_send_events(protocol, + last_event_collection) + time.sleep(5) def add_sysinfo(self, event): sysinfo_names = [v.name for v in self.sysinfo] @@ -267,3 +242,44 @@ def add_sysinfo(self, event): param.value) event.parameters.remove(param) event.parameters.extend(self.sysinfo) + + def send_telemetry_heartbeat(self, protocol, counter, heartbeat_id, last_telemetry_heartbeat): + + if last_telemetry_heartbeat is None: + last_telemetry_heartbeat = datetime.datetime.utcnow() - MonitorHandler.TELEMETRY_HEARTBEAT_PERIOD + + if datetime.datetime.utcnow() < (last_telemetry_heartbeat + MonitorHandler.TELEMETRY_HEARTBEAT_PERIOD): + return last_telemetry_heartbeat + + incarnation = protocol.get_incarnation() + dropped_packets = self.osutil.get_firewall_dropped_packets(protocol.endpoint) + msg = "{0};{1};{2};{3}".format(incarnation, counter, heartbeat_id, dropped_packets) + + add_event( + name=AGENT_NAME, + version=CURRENT_VERSION, + op=WALAEventOperation.HeartBeat, + is_success=True, + message=msg, + log_event=False) + + counter += 1 + + io_errors = IOErrorCounter.get_and_reset() + hostplugin_errors = io_errors.get("hostplugin") + protocol_errors = io_errors.get("protocol") + other_errors = io_errors.get("other") + + if hostplugin_errors > 0 or protocol_errors > 0 or other_errors > 0: + msg = "hostplugin:{0};protocol:{1};other:{2}".format(hostplugin_errors, + protocol_errors, + other_errors) + add_event( + name=AGENT_NAME, + version=CURRENT_VERSION, + op=WALAEventOperation.HttpErrors, + is_success=True, + message=msg, + log_event=False) + + return datetime.datetime.utcnow() From 97aa665fbeecbd3561bd4ed2fb9b8db817cd5c46 Mon Sep 17 00:00:00 2001 From: Hans Krijger Date: Tue, 29 May 2018 13:59:39 -0700 Subject: [PATCH 02/61] initial commit for hostga health monitor --- azurelinuxagent/common/protocol/hostplugin.py | 17 +++++++ azurelinuxagent/common/utils/restutil.py | 3 ++ azurelinuxagent/ga/monitor.py | 45 ++++++++++++++++++- 3 files changed, 64 insertions(+), 1 deletion(-) diff --git a/azurelinuxagent/common/protocol/hostplugin.py b/azurelinuxagent/common/protocol/hostplugin.py index f80836d2ec..9e6445243d 100644 --- a/azurelinuxagent/common/protocol/hostplugin.py +++ b/azurelinuxagent/common/protocol/hostplugin.py @@ -35,6 +35,7 @@ URI_FORMAT_GET_EXTENSION_ARTIFACT = "http://{0}:{1}/extensionArtifact" URI_FORMAT_PUT_VM_STATUS = "http://{0}:{1}/status" URI_FORMAT_PUT_LOG = "http://{0}:{1}/vmAgentLog" +URI_FORMAT_HEALTH = "http://{0}:{1}/health" API_VERSION = "2015-09-01" HEADER_CONTAINER_ID = "x-ms-containerid" HEADER_VERSION = "x-ms-version" @@ -77,6 +78,22 @@ def ensure_initialized(self): is_success=self.is_available) return self.is_available + def get_health(self): + """ + Call the /health endpoint + :return: True if 200 received, False otherwise + """ + url = URI_FORMAT_HEALTH.format(self.endpoint, + HOST_PLUGIN_PORT) + logger.verbose("HostGAPlugin: Getting health from [{0}]", url) + status_ok = False + try: + response = restutil.http_get(url, max_retry=1) + status_ok = restutil.request_succeeded(response) + except HttpError as e: + logger.verbose("HostGAPlugin: Exception getting health", ustr(e)) + return status_ok + def get_api_versions(self): url = URI_FORMAT_GET_API_VERSIONS.format(self.endpoint, HOST_PLUGIN_PORT) diff --git a/azurelinuxagent/common/utils/restutil.py b/azurelinuxagent/common/utils/restutil.py index 5ceb4c949d..f15ceb5499 100644 --- a/azurelinuxagent/common/utils/restutil.py +++ b/azurelinuxagent/common/utils/restutil.py @@ -385,12 +385,15 @@ def http_delete(url, headers=None, use_proxy=False, retry_codes=retry_codes, retry_delay=retry_delay) + def request_failed(resp, ok_codes=OK_CODES): return not request_succeeded(resp, ok_codes=ok_codes) + def request_succeeded(resp, ok_codes=OK_CODES): return resp is not None and resp.status in ok_codes + def read_response_error(resp): result = '' if resp is not None: diff --git a/azurelinuxagent/ga/monitor.py b/azurelinuxagent/ga/monitor.py index ce2b2b3d03..42b5ddb6ec 100644 --- a/azurelinuxagent/ga/monitor.py +++ b/azurelinuxagent/ga/monitor.py @@ -26,6 +26,7 @@ import azurelinuxagent.common.conf as conf import azurelinuxagent.common.utils.fileutil as fileutil import azurelinuxagent.common.logger as logger +from azurelinuxagent.common.errorstate import ErrorState from azurelinuxagent.common.event import add_event, WALAEventOperation from azurelinuxagent.common.exception import EventError, ProtocolError, OSUtilError, HttpError @@ -92,8 +93,10 @@ def get_monitor_handler(): class MonitorHandler(object): - TELEMETRY_HEARTBEAT_PERIOD = datetime.timedelta(minutes=30) EVENT_COLLECTION_PERIOD = datetime.timedelta(minutes=1) + TELEMETRY_HEARTBEAT_PERIOD = datetime.timedelta(minutes=30) + HOST_PLUGIN_HEARTBEAT_PERIOD = datetime.timedelta(minutes=1) + HOST_PLUGIN_HEALTH_PERIOD = datetime.timedelta(minutes=5) def __init__(self): self.osutil = get_osutil() @@ -218,12 +221,16 @@ def collect_and_send_events(self, protocol, last_event_collection): return datetime.datetime.utcnow() def daemon(self): + # Create a new identifier on each restart, reset the counter and all events counter = 0 last_event_collection = None last_telemetry_heartbeat = None + last_host_plugin_heartbeat = None heartbeat_id = str(uuid.uuid4()).upper() protocol = self.protocol_util.get_protocol() + host_plugin_errorstate = ErrorState(min_timedelta=MonitorHandler.HOST_PLUGIN_HEALTH_PERIOD) + while True: last_telemetry_heartbeat = self.send_telemetry_heartbeat(protocol, counter, @@ -231,6 +238,9 @@ def daemon(self): last_telemetry_heartbeat) last_event_collection = self.collect_and_send_events(protocol, last_event_collection) + last_host_plugin_heartbeat = self.send_host_plugin_heartbeat(protocol, + last_host_plugin_heartbeat, + host_plugin_errorstate) time.sleep(5) def add_sysinfo(self, event): @@ -243,6 +253,39 @@ def add_sysinfo(self, event): event.parameters.remove(param) event.parameters.extend(self.sysinfo) + def send_host_plugin_heartbeat(self, protocol, last_host_plugin_heartbeat, host_plugin_errorstate): + + """ + Send a health signal every HOST_PLUGIN_HEARTBEAT_PERIOD. The signal is 'Healthy' when we have been able to + communicate with HostGAPlugin at least once in the last HOST_PLUGIN_HEALTH_PERIOD. + """ + + if last_host_plugin_heartbeat is None: + last_host_plugin_heartbeat = datetime.datetime.utcnow() - MonitorHandler.HOST_PLUGIN_HEARTBEAT_PERIOD + + if datetime.datetime.utcnow() < (last_host_plugin_heartbeat + MonitorHandler.HOST_PLUGIN_HEARTBEAT_PERIOD): + return last_host_plugin_heartbeat + + try: + host_plugin = protocol.client.get_host_plugin() + host_plugin.ensure_initialized() + is_currently_healthy = host_plugin.get_health() + + if is_currently_healthy: + host_plugin_errorstate.reset() + else: + host_plugin_errorstate.incr() + + is_healthy = host_plugin_errorstate.is_triggered() is False + + # TODO: send healthstore signal + logger.info("HostGAPlugin health: {0}", is_healthy) + + except Exception as e: + logger.error("Could not send host plugin heartbeat: {0}", ustr(e)) + + return datetime.datetime.utcnow() + def send_telemetry_heartbeat(self, protocol, counter, heartbeat_id, last_telemetry_heartbeat): if last_telemetry_heartbeat is None: From 1920e3fcbc795ea010bce3973df7eed785732251 Mon Sep 17 00:00:00 2001 From: Hans Krijger Date: Tue, 29 May 2018 14:08:50 -0700 Subject: [PATCH 03/61] send telemetry when hostplugin heartbeat fails --- azurelinuxagent/common/event.py | 1 + azurelinuxagent/ga/monitor.py | 9 ++++++++- 2 files changed, 9 insertions(+), 1 deletion(-) diff --git a/azurelinuxagent/common/event.py b/azurelinuxagent/common/event.py index 1c26a13990..1f55c23bbe 100644 --- a/azurelinuxagent/common/event.py +++ b/azurelinuxagent/common/event.py @@ -56,6 +56,7 @@ class WALAEventOperation: HealthCheck = "HealthCheck" HeartBeat = "HeartBeat" HostPlugin = "HostPlugin" + HostPluginHeartbeat = "HostPluginHeartbeat" HttpErrors = "HttpErrors" Install = "Install" InitializeHostPlugin = "InitializeHostPlugin" diff --git a/azurelinuxagent/ga/monitor.py b/azurelinuxagent/ga/monitor.py index 42b5ddb6ec..ca644f889b 100644 --- a/azurelinuxagent/ga/monitor.py +++ b/azurelinuxagent/ga/monitor.py @@ -282,7 +282,14 @@ def send_host_plugin_heartbeat(self, protocol, last_host_plugin_heartbeat, host_ logger.info("HostGAPlugin health: {0}", is_healthy) except Exception as e: - logger.error("Could not send host plugin heartbeat: {0}", ustr(e)) + msg = "Exception sending host plugin heartbeat: {0}".format(ustr(e)) + add_event( + name=AGENT_NAME, + version=CURRENT_VERSION, + op=WALAEventOperation.HostPluginHeartbeat, + is_success=False, + message=msg, + log_event=False) return datetime.datetime.utcnow() From 33240bbd546b10c428658a508a91e5291a592b78 Mon Sep 17 00:00:00 2001 From: Hans Krijger Date: Tue, 29 May 2018 14:40:58 -0700 Subject: [PATCH 04/61] adjust monitor delta --- azurelinuxagent/ga/monitor.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/azurelinuxagent/ga/monitor.py b/azurelinuxagent/ga/monitor.py index ca644f889b..6de8f2f59d 100644 --- a/azurelinuxagent/ga/monitor.py +++ b/azurelinuxagent/ga/monitor.py @@ -241,7 +241,8 @@ def daemon(self): last_host_plugin_heartbeat = self.send_host_plugin_heartbeat(protocol, last_host_plugin_heartbeat, host_plugin_errorstate) - time.sleep(5) + # currently the smallest delta is 1 minute + time.sleep(60) def add_sysinfo(self, event): sysinfo_names = [v.name for v in self.sysinfo] From 0a73ce6ff4b65b5725acf1e220fbb8e4fa552db2 Mon Sep 17 00:00:00 2001 From: Hans Krijger Date: Tue, 29 May 2018 15:41:03 -0700 Subject: [PATCH 05/61] fix call --- .../common/protocol/healthservice.py | 85 +++++++++++++++++++ azurelinuxagent/ga/monitor.py | 12 ++- 2 files changed, 93 insertions(+), 4 deletions(-) create mode 100644 azurelinuxagent/common/protocol/healthservice.py diff --git a/azurelinuxagent/common/protocol/healthservice.py b/azurelinuxagent/common/protocol/healthservice.py new file mode 100644 index 0000000000..8e1e865fd2 --- /dev/null +++ b/azurelinuxagent/common/protocol/healthservice.py @@ -0,0 +1,85 @@ +# Microsoft Azure Linux Agent +# +# Copyright 2018 Microsoft Corporation +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# +# Requires Python 2.6+ and Openssl 1.0+ +# + +import json + +from azurelinuxagent.common import logger +from azurelinuxagent.common.exception import HttpError +from azurelinuxagent.common.future import ustr +from azurelinuxagent.common.utils import restutil + + +class Observation(object): + def __init__(self, name, is_healthy, description, value): + self.name = name + self.is_healthy = is_healthy + self.description = description + self.value = value + + @property + def as_obj(self): + return { + "ObservationName": self.name, + "IsHealthy": self.is_healthy, + "Description": self.description, + "Value": self.value + } + + +class HealthService(object): + + ENDPOINT = 'http://{0}:80/HealthService' + API = 'reporttargethealth' + OBSERVER_NAME = 'WALinuxAgent' + VERSION = 1 + HOST_PLUGIN_HEARTBEAT_OBSERVATION_NAME = 'HostPluginHeartbeat' + + def __init__(self, endpoint): + self.endpoint = HealthService.ENDPOINT.format(endpoint) + self.api = HealthService.API + self.version = HealthService.VERSION + self.source = HealthService.OBSERVER_NAME + self.observations = list() + + @property + def as_json(self): + data = { + "Api": self.api, + "Version": self.version, + "Source": self.source, + "Observations": [o.as_obj for o in self.observations] + } + return json.dumps(data) + + def observe_host_plugin_heartbeat(self, is_healthy): + self.observations.append(Observation(name=HealthService.HOST_PLUGIN_HEARTBEAT_OBSERVATION_NAME, + is_healthy=is_healthy, + description='', + value='')) + + def report(self): + logger.verbose('HealthService: report observations') + try: + # TODO: remove + logger.info('Report observation to {0}: {1}', self.endpoint, self.as_json) + + restutil.http_post(self.endpoint, self.as_json) + del self.observations[:] + except HttpError as e: + logger.warn("HealthService could not report observations: {0}", ustr(e)) diff --git a/azurelinuxagent/ga/monitor.py b/azurelinuxagent/ga/monitor.py index 6de8f2f59d..d06c921134 100644 --- a/azurelinuxagent/ga/monitor.py +++ b/azurelinuxagent/ga/monitor.py @@ -33,6 +33,7 @@ from azurelinuxagent.common.future import ustr from azurelinuxagent.common.osutil import get_osutil from azurelinuxagent.common.protocol import get_protocol_util +from azurelinuxagent.common.protocol.healthservice import HealthService from azurelinuxagent.common.protocol.imds import get_imds_client from azurelinuxagent.common.protocol.restapi import TelemetryEventParam, \ TelemetryEventList, \ @@ -230,6 +231,7 @@ def daemon(self): heartbeat_id = str(uuid.uuid4()).upper() protocol = self.protocol_util.get_protocol() host_plugin_errorstate = ErrorState(min_timedelta=MonitorHandler.HOST_PLUGIN_HEALTH_PERIOD) + health_service = HealthService(protocol.endpoint) while True: last_telemetry_heartbeat = self.send_telemetry_heartbeat(protocol, @@ -240,7 +242,8 @@ def daemon(self): last_event_collection) last_host_plugin_heartbeat = self.send_host_plugin_heartbeat(protocol, last_host_plugin_heartbeat, - host_plugin_errorstate) + host_plugin_errorstate, + health_service) # currently the smallest delta is 1 minute time.sleep(60) @@ -254,7 +257,7 @@ def add_sysinfo(self, event): event.parameters.remove(param) event.parameters.extend(self.sysinfo) - def send_host_plugin_heartbeat(self, protocol, last_host_plugin_heartbeat, host_plugin_errorstate): + def send_host_plugin_heartbeat(self, protocol, last_host_plugin_heartbeat, host_plugin_errorstate, health_service): """ Send a health signal every HOST_PLUGIN_HEARTBEAT_PERIOD. The signal is 'Healthy' when we have been able to @@ -278,9 +281,10 @@ def send_host_plugin_heartbeat(self, protocol, last_host_plugin_heartbeat, host_ host_plugin_errorstate.incr() is_healthy = host_plugin_errorstate.is_triggered() is False + logger.verbose("HostGAPlugin health: {0}", is_healthy) - # TODO: send healthstore signal - logger.info("HostGAPlugin health: {0}", is_healthy) + health_service.observe_host_plugin_heartbeat(is_healthy) + health_service.report() except Exception as e: msg = "Exception sending host plugin heartbeat: {0}".format(ustr(e)) From 3f90f20ef208a5e510fd2a41315244bdd1451dd5 Mon Sep 17 00:00:00 2001 From: Hans Krijger Date: Tue, 29 May 2018 15:56:05 -0700 Subject: [PATCH 06/61] use correct endpoint --- azurelinuxagent/common/protocol/healthservice.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/azurelinuxagent/common/protocol/healthservice.py b/azurelinuxagent/common/protocol/healthservice.py index 8e1e865fd2..b0dfdf09f6 100644 --- a/azurelinuxagent/common/protocol/healthservice.py +++ b/azurelinuxagent/common/protocol/healthservice.py @@ -20,6 +20,7 @@ import json from azurelinuxagent.common import logger +from azurelinuxagent.common.dhcp import KNOWN_WIRESERVER_IP from azurelinuxagent.common.exception import HttpError from azurelinuxagent.common.future import ustr from azurelinuxagent.common.utils import restutil @@ -44,6 +45,7 @@ def as_obj(self): class HealthService(object): + VNET_ENDPOINT = '169.254.169.254' ENDPOINT = 'http://{0}:80/HealthService' API = 'reporttargethealth' OBSERVER_NAME = 'WALinuxAgent' @@ -51,7 +53,8 @@ class HealthService(object): HOST_PLUGIN_HEARTBEAT_OBSERVATION_NAME = 'HostPluginHeartbeat' def __init__(self, endpoint): - self.endpoint = HealthService.ENDPOINT.format(endpoint) + self.endpoint = HealthService.ENDPOINT.format(endpoint if endpoint != KNOWN_WIRESERVER_IP + else HealthService.VNET_ENDPOINT) self.api = HealthService.API self.version = HealthService.VERSION self.source = HealthService.OBSERVER_NAME From 051e9f5a5bfa20a65eafb16bf2e316e9bcaf2346 Mon Sep 17 00:00:00 2001 From: Hans Krijger Date: Tue, 29 May 2018 16:18:50 -0700 Subject: [PATCH 07/61] update version --- azurelinuxagent/common/protocol/healthservice.py | 12 +++++------- 1 file changed, 5 insertions(+), 7 deletions(-) diff --git a/azurelinuxagent/common/protocol/healthservice.py b/azurelinuxagent/common/protocol/healthservice.py index b0dfdf09f6..672dd6eec6 100644 --- a/azurelinuxagent/common/protocol/healthservice.py +++ b/azurelinuxagent/common/protocol/healthservice.py @@ -18,9 +18,9 @@ # import json +from datetime import datetime from azurelinuxagent.common import logger -from azurelinuxagent.common.dhcp import KNOWN_WIRESERVER_IP from azurelinuxagent.common.exception import HttpError from azurelinuxagent.common.future import ustr from azurelinuxagent.common.utils import restutil @@ -45,16 +45,14 @@ def as_obj(self): class HealthService(object): - VNET_ENDPOINT = '169.254.169.254' ENDPOINT = 'http://{0}:80/HealthService' API = 'reporttargethealth' + VERSION = "1.0" OBSERVER_NAME = 'WALinuxAgent' - VERSION = 1 HOST_PLUGIN_HEARTBEAT_OBSERVATION_NAME = 'HostPluginHeartbeat' def __init__(self, endpoint): - self.endpoint = HealthService.ENDPOINT.format(endpoint if endpoint != KNOWN_WIRESERVER_IP - else HealthService.VNET_ENDPOINT) + self.endpoint = HealthService.ENDPOINT.format(endpoint) self.api = HealthService.API self.version = HealthService.VERSION self.source = HealthService.OBSERVER_NAME @@ -73,8 +71,8 @@ def as_json(self): def observe_host_plugin_heartbeat(self, is_healthy): self.observations.append(Observation(name=HealthService.HOST_PLUGIN_HEARTBEAT_OBSERVATION_NAME, is_healthy=is_healthy, - description='', - value='')) + description=datetime.utcnow(), + value=is_healthy)) def report(self): logger.verbose('HealthService: report observations') From 427a04fa4734fa69c87b0ed9434232de9abd003d Mon Sep 17 00:00:00 2001 From: Hans Krijger Date: Tue, 29 May 2018 16:20:05 -0700 Subject: [PATCH 08/61] cleanup --- azurelinuxagent/common/protocol/healthservice.py | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/azurelinuxagent/common/protocol/healthservice.py b/azurelinuxagent/common/protocol/healthservice.py index 672dd6eec6..7a99aab74a 100644 --- a/azurelinuxagent/common/protocol/healthservice.py +++ b/azurelinuxagent/common/protocol/healthservice.py @@ -18,7 +18,6 @@ # import json -from datetime import datetime from azurelinuxagent.common import logger from azurelinuxagent.common.exception import HttpError @@ -71,8 +70,8 @@ def as_json(self): def observe_host_plugin_heartbeat(self, is_healthy): self.observations.append(Observation(name=HealthService.HOST_PLUGIN_HEARTBEAT_OBSERVATION_NAME, is_healthy=is_healthy, - description=datetime.utcnow(), - value=is_healthy)) + description='', + value='')) def report(self): logger.verbose('HealthService: report observations') From 3399ef934f69e5ec59b76649821253d47549674f Mon Sep 17 00:00:00 2001 From: Hans Krijger Date: Tue, 29 May 2018 16:23:49 -0700 Subject: [PATCH 09/61] add content type header --- azurelinuxagent/common/protocol/healthservice.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/azurelinuxagent/common/protocol/healthservice.py b/azurelinuxagent/common/protocol/healthservice.py index 7a99aab74a..aefb30574e 100644 --- a/azurelinuxagent/common/protocol/healthservice.py +++ b/azurelinuxagent/common/protocol/healthservice.py @@ -79,7 +79,7 @@ def report(self): # TODO: remove logger.info('Report observation to {0}: {1}', self.endpoint, self.as_json) - restutil.http_post(self.endpoint, self.as_json) + restutil.http_post(self.endpoint, self.as_json, headers={'Content-Type': 'application/json'}) del self.observations[:] except HttpError as e: logger.warn("HealthService could not report observations: {0}", ustr(e)) From 89e67f68896160d17e985aa10bf8525f9a7aa8cd Mon Sep 17 00:00:00 2001 From: Hans Krijger Date: Tue, 29 May 2018 16:26:54 -0700 Subject: [PATCH 10/61] cleanup logging --- azurelinuxagent/common/protocol/healthservice.py | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/azurelinuxagent/common/protocol/healthservice.py b/azurelinuxagent/common/protocol/healthservice.py index aefb30574e..d88334b599 100644 --- a/azurelinuxagent/common/protocol/healthservice.py +++ b/azurelinuxagent/common/protocol/healthservice.py @@ -76,10 +76,8 @@ def observe_host_plugin_heartbeat(self, is_healthy): def report(self): logger.verbose('HealthService: report observations') try: - # TODO: remove - logger.info('Report observation to {0}: {1}', self.endpoint, self.as_json) - restutil.http_post(self.endpoint, self.as_json, headers={'Content-Type': 'application/json'}) del self.observations[:] + logger.verbose('HealthService: Reported observations to {0}: {1}', self.endpoint, self.as_json) except HttpError as e: - logger.warn("HealthService could not report observations: {0}", ustr(e)) + logger.warn("HealthService: could not report observations: {0}", ustr(e)) From c466c11655c01879e138882e3a3659330aa9811f Mon Sep 17 00:00:00 2001 From: Hans Krijger Date: Tue, 29 May 2018 16:30:59 -0700 Subject: [PATCH 11/61] update observation name --- azurelinuxagent/common/protocol/healthservice.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/azurelinuxagent/common/protocol/healthservice.py b/azurelinuxagent/common/protocol/healthservice.py index d88334b599..6feed49cc2 100644 --- a/azurelinuxagent/common/protocol/healthservice.py +++ b/azurelinuxagent/common/protocol/healthservice.py @@ -48,7 +48,7 @@ class HealthService(object): API = 'reporttargethealth' VERSION = "1.0" OBSERVER_NAME = 'WALinuxAgent' - HOST_PLUGIN_HEARTBEAT_OBSERVATION_NAME = 'HostPluginHeartbeat' + HOST_PLUGIN_HEARTBEAT_OBSERVATION_NAME = 'GuestAgentPluginHeartbeat' def __init__(self, endpoint): self.endpoint = HealthService.ENDPOINT.format(endpoint) From f386a74453c99ef55a5742b12bb83e1e7a34671e Mon Sep 17 00:00:00 2001 From: Hans Krijger Date: Wed, 30 May 2018 12:08:16 -0700 Subject: [PATCH 12/61] minor refactor --- azurelinuxagent/common/protocol/hostplugin.py | 4 ++-- azurelinuxagent/common/protocol/wire.py | 10 +++++----- 2 files changed, 7 insertions(+), 7 deletions(-) diff --git a/azurelinuxagent/common/protocol/hostplugin.py b/azurelinuxagent/common/protocol/hostplugin.py index 9e6445243d..d420745484 100644 --- a/azurelinuxagent/common/protocol/hostplugin.py +++ b/azurelinuxagent/common/protocol/hostplugin.py @@ -97,8 +97,8 @@ def get_health(self): def get_api_versions(self): url = URI_FORMAT_GET_API_VERSIONS.format(self.endpoint, HOST_PLUGIN_PORT) - logger.verbose("HostGAPlugin: Getting API versions at [{0}]".format( - url)) + logger.verbose("HostGAPlugin: Getting API versions at [{0}]" + .format(url)) return_val = [] try: headers = {HEADER_CONTAINER_ID: self.container_id} diff --git a/azurelinuxagent/common/protocol/wire.py b/azurelinuxagent/common/protocol/wire.py index 265b1f6fd5..f5931f981b 100644 --- a/azurelinuxagent/common/protocol/wire.py +++ b/azurelinuxagent/common/protocol/wire.py @@ -251,12 +251,10 @@ def _build_health_report(incarnation, container_id, role_instance_id, return xml -""" -Convert VMStatus object to status blob format -""" - - def ga_status_to_guest_info(ga_status): + """ + Convert VMStatus object to status blob format + """ v1_ga_guest_info = { "computerName" : ga_status.hostname, "osName" : ga_status.osname, @@ -278,6 +276,7 @@ def ga_status_to_v1(ga_status): } return v1_ga_status + def ext_substatus_to_v1(sub_status_list): status_list = [] for substatus in sub_status_list: @@ -1156,6 +1155,7 @@ def get_artifacts_profile(self): return None + class VersionInfo(object): def __init__(self, xml_text): """ From 88c34ae68782ff5ddee3c1f50bc393380a7868e3 Mon Sep 17 00:00:00 2001 From: Hans Krijger Date: Wed, 30 May 2018 13:46:02 -0700 Subject: [PATCH 13/61] add signal for /versions --- .../common/protocol/healthservice.py | 26 +++++++++++++++++-- azurelinuxagent/common/protocol/hostplugin.py | 13 +++++++--- azurelinuxagent/ga/monitor.py | 5 ++-- 3 files changed, 35 insertions(+), 9 deletions(-) diff --git a/azurelinuxagent/common/protocol/healthservice.py b/azurelinuxagent/common/protocol/healthservice.py index 6feed49cc2..b06d07ec0c 100644 --- a/azurelinuxagent/common/protocol/healthservice.py +++ b/azurelinuxagent/common/protocol/healthservice.py @@ -49,6 +49,9 @@ class HealthService(object): VERSION = "1.0" OBSERVER_NAME = 'WALinuxAgent' HOST_PLUGIN_HEARTBEAT_OBSERVATION_NAME = 'GuestAgentPluginHeartbeat' + HOST_PLUGIN_STATUS_OBSERVATION_NAME = 'GuestAgentPluginStatus' + HOST_PLUGIN_VERSIONS_OBSERVATION_NAME = 'GuestAgentPluginVersions' + HOST_PLUGIN_ARTIFACT_OBSERVATION_NAME = 'GuestAgentPluginArtifact' def __init__(self, endpoint): self.endpoint = HealthService.ENDPOINT.format(endpoint) @@ -67,17 +70,36 @@ def as_json(self): } return json.dumps(data) - def observe_host_plugin_heartbeat(self, is_healthy): + def report_host_plugin_heartbeat(self, is_healthy): + """ + Reports a signal for /health + :param is_healthy: whether the call suceeded + """ self.observations.append(Observation(name=HealthService.HOST_PLUGIN_HEARTBEAT_OBSERVATION_NAME, is_healthy=is_healthy, description='', value='')) + self.report() + + def report_host_plugin_versions(self, is_healthy, response): + """ + Reports a signal for /versions + :param is_healthy: whether the api call succeeded + :param response: debugging information for failures + """ + self.observations.append(Observation(name=HealthService.HOST_PLUGIN_VERSIONS_OBSERVATION_NAME, + is_healthy=is_healthy, + description='', + value=response)) + self.report() def report(self): logger.verbose('HealthService: report observations') try: restutil.http_post(self.endpoint, self.as_json, headers={'Content-Type': 'application/json'}) - del self.observations[:] logger.verbose('HealthService: Reported observations to {0}: {1}', self.endpoint, self.as_json) except HttpError as e: logger.warn("HealthService: could not report observations: {0}", ustr(e)) + + # these signals are not timestamped, so there is no value in persisting data + del self.observations[:] diff --git a/azurelinuxagent/common/protocol/hostplugin.py b/azurelinuxagent/common/protocol/hostplugin.py index d420745484..bbe044f2ed 100644 --- a/azurelinuxagent/common/protocol/hostplugin.py +++ b/azurelinuxagent/common/protocol/hostplugin.py @@ -25,6 +25,7 @@ from azurelinuxagent.common.exception import HttpError, ProtocolError, \ ResourceGoneError from azurelinuxagent.common.future import ustr, httpclient +from azurelinuxagent.common.protocol.healthservice import HealthService from azurelinuxagent.common.utils import restutil from azurelinuxagent.common.utils import textutil from azurelinuxagent.common.utils.textutil import remove_bom @@ -59,6 +60,7 @@ def __init__(self, endpoint, container_id, role_config_name): self.deployment_id = None self.role_config_name = role_config_name self.manifest_uri = None + self.health_service = HealthService(endpoint) @staticmethod def is_default_channel(): @@ -100,19 +102,22 @@ def get_api_versions(self): logger.verbose("HostGAPlugin: Getting API versions at [{0}]" .format(url)) return_val = [] + error_response = '' + is_healthy = False try: headers = {HEADER_CONTAINER_ID: self.container_id} response = restutil.http_get(url, headers) if restutil.request_failed(response): - logger.error( - "HostGAPlugin: Failed Get API versions: {0}".format( - restutil.read_response_error(response))) + error_response = restutil.read_response_error(response) + logger.error("HostGAPlugin: Failed Get API versions: {0}".format(error_response)) else: return_val = ustr(remove_bom(response.read()), encoding='utf-8') - + is_healthy = True except HttpError as e: logger.error("HostGAPlugin: Exception Get API versions: {0}".format(e)) + self.health_service.report_host_plugin_versions(is_healthy=is_healthy, response=error_response) + return return_val def get_artifact_request(self, artifact_url, artifact_manifest_url=None): diff --git a/azurelinuxagent/ga/monitor.py b/azurelinuxagent/ga/monitor.py index d06c921134..3cb619034e 100644 --- a/azurelinuxagent/ga/monitor.py +++ b/azurelinuxagent/ga/monitor.py @@ -231,7 +231,7 @@ def daemon(self): heartbeat_id = str(uuid.uuid4()).upper() protocol = self.protocol_util.get_protocol() host_plugin_errorstate = ErrorState(min_timedelta=MonitorHandler.HOST_PLUGIN_HEALTH_PERIOD) - health_service = HealthService(protocol.endpoint) + health_service = HealthService() while True: last_telemetry_heartbeat = self.send_telemetry_heartbeat(protocol, @@ -283,8 +283,7 @@ def send_host_plugin_heartbeat(self, protocol, last_host_plugin_heartbeat, host_ is_healthy = host_plugin_errorstate.is_triggered() is False logger.verbose("HostGAPlugin health: {0}", is_healthy) - health_service.observe_host_plugin_heartbeat(is_healthy) - health_service.report() + health_service.report_host_plugin_heartbeat(is_healthy) except Exception as e: msg = "Exception sending host plugin heartbeat: {0}".format(ustr(e)) From 9a25a694ab84b09ab7c12cbcc91d770bd6ca9cef Mon Sep 17 00:00:00 2001 From: Hans Krijger Date: Wed, 30 May 2018 13:52:10 -0700 Subject: [PATCH 14/61] debugging --- azurelinuxagent/ga/monitor.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/azurelinuxagent/ga/monitor.py b/azurelinuxagent/ga/monitor.py index 3cb619034e..43c116254c 100644 --- a/azurelinuxagent/ga/monitor.py +++ b/azurelinuxagent/ga/monitor.py @@ -281,7 +281,7 @@ def send_host_plugin_heartbeat(self, protocol, last_host_plugin_heartbeat, host_ host_plugin_errorstate.incr() is_healthy = host_plugin_errorstate.is_triggered() is False - logger.verbose("HostGAPlugin health: {0}", is_healthy) + logger.info("HostGAPlugin health: {0}", is_healthy) health_service.report_host_plugin_heartbeat(is_healthy) @@ -293,7 +293,7 @@ def send_host_plugin_heartbeat(self, protocol, last_host_plugin_heartbeat, host_ op=WALAEventOperation.HostPluginHeartbeat, is_success=False, message=msg, - log_event=False) + log_event=True) return datetime.datetime.utcnow() From 165f5c875f8758437535bc012acbf10b7e909a6c Mon Sep 17 00:00:00 2001 From: Hans Krijger Date: Wed, 30 May 2018 13:53:33 -0700 Subject: [PATCH 15/61] fix --- azurelinuxagent/ga/monitor.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/azurelinuxagent/ga/monitor.py b/azurelinuxagent/ga/monitor.py index 43c116254c..a494c2ef0d 100644 --- a/azurelinuxagent/ga/monitor.py +++ b/azurelinuxagent/ga/monitor.py @@ -231,7 +231,7 @@ def daemon(self): heartbeat_id = str(uuid.uuid4()).upper() protocol = self.protocol_util.get_protocol() host_plugin_errorstate = ErrorState(min_timedelta=MonitorHandler.HOST_PLUGIN_HEALTH_PERIOD) - health_service = HealthService() + health_service = HealthService(protocol.endpoint) while True: last_telemetry_heartbeat = self.send_telemetry_heartbeat(protocol, From 90b1727e13b1d83ff20dcd06e0e4611955abb9e3 Mon Sep 17 00:00:00 2001 From: Hans Krijger Date: Wed, 30 May 2018 13:54:40 -0700 Subject: [PATCH 16/61] remove debugging --- azurelinuxagent/ga/monitor.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/azurelinuxagent/ga/monitor.py b/azurelinuxagent/ga/monitor.py index a494c2ef0d..db121c5bb6 100644 --- a/azurelinuxagent/ga/monitor.py +++ b/azurelinuxagent/ga/monitor.py @@ -281,7 +281,7 @@ def send_host_plugin_heartbeat(self, protocol, last_host_plugin_heartbeat, host_ host_plugin_errorstate.incr() is_healthy = host_plugin_errorstate.is_triggered() is False - logger.info("HostGAPlugin health: {0}", is_healthy) + logger.verbose("HostGAPlugin health: {0}", is_healthy) health_service.report_host_plugin_heartbeat(is_healthy) @@ -293,7 +293,7 @@ def send_host_plugin_heartbeat(self, protocol, last_host_plugin_heartbeat, host_ op=WALAEventOperation.HostPluginHeartbeat, is_success=False, message=msg, - log_event=True) + log_event=False) return datetime.datetime.utcnow() From 55ac6fe93067e4f955955642d629fc737ba5c622 Mon Sep 17 00:00:00 2001 From: Hans Krijger Date: Wed, 30 May 2018 13:55:52 -0700 Subject: [PATCH 17/61] verbose logs --- azurelinuxagent/common/protocol/healthservice.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/azurelinuxagent/common/protocol/healthservice.py b/azurelinuxagent/common/protocol/healthservice.py index b06d07ec0c..6152f84fa5 100644 --- a/azurelinuxagent/common/protocol/healthservice.py +++ b/azurelinuxagent/common/protocol/healthservice.py @@ -96,6 +96,9 @@ def report_host_plugin_versions(self, is_healthy, response): def report(self): logger.verbose('HealthService: report observations') try: + # TODO: debugging + logger.info("{0}", self.as_json) + restutil.http_post(self.endpoint, self.as_json, headers={'Content-Type': 'application/json'}) logger.verbose('HealthService: Reported observations to {0}: {1}', self.endpoint, self.as_json) except HttpError as e: From 28994c0cb299114d89774295a662781835fc3171 Mon Sep 17 00:00:00 2001 From: Hans Krijger Date: Wed, 30 May 2018 14:10:16 -0700 Subject: [PATCH 18/61] cleanup, add report for artifacts --- .../common/protocol/healthservice.py | 21 +++++++++++++------ 1 file changed, 15 insertions(+), 6 deletions(-) diff --git a/azurelinuxagent/common/protocol/healthservice.py b/azurelinuxagent/common/protocol/healthservice.py index 6152f84fa5..da9eab0bbd 100644 --- a/azurelinuxagent/common/protocol/healthservice.py +++ b/azurelinuxagent/common/protocol/healthservice.py @@ -26,7 +26,7 @@ class Observation(object): - def __init__(self, name, is_healthy, description, value): + def __init__(self, name, is_healthy, description='', value=''): self.name = name self.is_healthy = is_healthy self.description = description @@ -73,12 +73,10 @@ def as_json(self): def report_host_plugin_heartbeat(self, is_healthy): """ Reports a signal for /health - :param is_healthy: whether the call suceeded + :param is_healthy: whether the call succeeded """ self.observations.append(Observation(name=HealthService.HOST_PLUGIN_HEARTBEAT_OBSERVATION_NAME, - is_healthy=is_healthy, - description='', - value='')) + is_healthy=is_healthy)) self.report() def report_host_plugin_versions(self, is_healthy, response): @@ -89,7 +87,18 @@ def report_host_plugin_versions(self, is_healthy, response): """ self.observations.append(Observation(name=HealthService.HOST_PLUGIN_VERSIONS_OBSERVATION_NAME, is_healthy=is_healthy, - description='', + value=response)) + self.report() + + def report_host_plugin_extension_artifact(self, is_healthy, response): + """ + Reports a signal for /extensionArtifact + :param is_healthy: whether the api call succeeded + :param response: debugging information for failures + :return: + """ + self.observations.append(Observation(name=HealthService.HOST_PLUGIN_ARTIFACT_OBSERVATION_NAME, + is_healthy=is_healthy, value=response)) self.report() From 21d76b1cba85b380dfba39ddf081a5ae7950eb74 Mon Sep 17 00:00:00 2001 From: Hans Krijger Date: Wed, 30 May 2018 15:55:21 -0700 Subject: [PATCH 19/61] minor cleanup --- azurelinuxagent/common/protocol/healthservice.py | 4 +++- azurelinuxagent/common/protocol/wire.py | 4 ++-- 2 files changed, 5 insertions(+), 3 deletions(-) diff --git a/azurelinuxagent/common/protocol/healthservice.py b/azurelinuxagent/common/protocol/healthservice.py index da9eab0bbd..239ce2a19e 100644 --- a/azurelinuxagent/common/protocol/healthservice.py +++ b/azurelinuxagent/common/protocol/healthservice.py @@ -90,15 +90,17 @@ def report_host_plugin_versions(self, is_healthy, response): value=response)) self.report() - def report_host_plugin_extension_artifact(self, is_healthy, response): + def report_host_plugin_extension_artifact(self, is_healthy, source, response): """ Reports a signal for /extensionArtifact :param is_healthy: whether the api call succeeded + :param source: specifies the api caller for debugging failures :param response: debugging information for failures :return: """ self.observations.append(Observation(name=HealthService.HOST_PLUGIN_ARTIFACT_OBSERVATION_NAME, is_healthy=is_healthy, + description=source, value=response)) self.report() diff --git a/azurelinuxagent/common/protocol/wire.py b/azurelinuxagent/common/protocol/wire.py index f5931f981b..e08503e053 100644 --- a/azurelinuxagent/common/protocol/wire.py +++ b/azurelinuxagent/common/protocol/wire.py @@ -614,7 +614,7 @@ def fetch_manifest(self, version_uris): logger.verbose("Using host plugin as default channel") else: logger.verbose("Failed to download manifest, " - "switching to host plugin") + "switching to host plugin") try: host = self.get_host_plugin() @@ -1132,7 +1132,7 @@ def get_artifacts_profile(self): logger.verbose("Using host plugin as default channel") else: logger.verbose("Failed to download artifacts profile, " - "switching to host plugin") + "switching to host plugin") host = self.get_host_plugin() uri, headers = host.get_artifact_request(blob) From 6eaa000c123f8689308b556be0272ace70619995 Mon Sep 17 00:00:00 2001 From: Hans Krijger Date: Wed, 30 May 2018 16:32:20 -0700 Subject: [PATCH 20/61] minor refactor and cleanup --- azurelinuxagent/common/protocol/restapi.py | 4 +++- azurelinuxagent/common/protocol/wire.py | 9 ++++----- 2 files changed, 7 insertions(+), 6 deletions(-) diff --git a/azurelinuxagent/common/protocol/restapi.py b/azurelinuxagent/common/protocol/restapi.py index fe3b62ffed..7bcdd70cc0 100644 --- a/azurelinuxagent/common/protocol/restapi.py +++ b/azurelinuxagent/common/protocol/restapi.py @@ -333,12 +333,14 @@ def get_artifacts_profile(self): raise NotImplementedError() def download_ext_handler_pkg(self, uri, headers=None, use_proxy=True): + pkg = None try: resp = restutil.http_get(uri, headers=headers, use_proxy=use_proxy) if restutil.request_succeeded(resp): - return resp.read() + pkg = resp.read() except Exception as e: logger.warn("Failed to download from: {0}".format(uri), e) + return pkg def report_provision_status(self, provision_status): raise NotImplementedError() diff --git a/azurelinuxagent/common/protocol/wire.py b/azurelinuxagent/common/protocol/wire.py index e08503e053..30cb62408a 100644 --- a/azurelinuxagent/common/protocol/wire.py +++ b/azurelinuxagent/common/protocol/wire.py @@ -158,16 +158,15 @@ def get_artifacts_profile(self): logger.verbose("Get In-VM Artifacts Profile") return self.client.get_artifacts_profile() - def download_ext_handler_pkg(self, uri, headers=None): - package = super(WireProtocol, self).download_ext_handler_pkg(uri) + def download_ext_handler_pkg(self, uri, headers=None, use_proxy=True): + package = super(WireProtocol, self).download_ext_handler_pkg(uri, use_proxy=use_proxy) - if package is not None: - return package - else: + if package is None: logger.verbose("Download did not succeed, falling back to host plugin") host = self.client.get_host_plugin() uri, headers = host.get_artifact_request(uri, host.manifest_uri) package = super(WireProtocol, self).download_ext_handler_pkg(uri, headers=headers, use_proxy=False) + return package def report_provision_status(self, provision_status): From 1388bb5119e2d4eeb289800a4e43f9008461be8b Mon Sep 17 00:00:00 2001 From: Hans Krijger Date: Wed, 30 May 2018 17:58:52 -0700 Subject: [PATCH 21/61] switch to using fetch over download_ext_handler_pkg directly --- azurelinuxagent/common/protocol/wire.py | 37 +++++++++++++++++------- azurelinuxagent/common/utils/restutil.py | 10 +++++++ tests/protocol/test_wire.py | 30 +++++++------------ 3 files changed, 48 insertions(+), 29 deletions(-) diff --git a/azurelinuxagent/common/protocol/wire.py b/azurelinuxagent/common/protocol/wire.py index 30cb62408a..14c8f782cf 100644 --- a/azurelinuxagent/common/protocol/wire.py +++ b/azurelinuxagent/common/protocol/wire.py @@ -32,7 +32,8 @@ from azurelinuxagent.common.exception import ProtocolNotFoundError, \ ResourceGoneError from azurelinuxagent.common.future import httpclient, bytebuffer -from azurelinuxagent.common.protocol.hostplugin import HostPluginProtocol +from azurelinuxagent.common.protocol.hostplugin import HostPluginProtocol, URI_FORMAT_GET_EXTENSION_ARTIFACT, \ + HOST_PLUGIN_PORT from azurelinuxagent.common.protocol.restapi import * from azurelinuxagent.common.utils.archive import StateFlusher from azurelinuxagent.common.utils.cryptutil import CryptUtil @@ -159,13 +160,13 @@ def get_artifacts_profile(self): return self.client.get_artifacts_profile() def download_ext_handler_pkg(self, uri, headers=None, use_proxy=True): - package = super(WireProtocol, self).download_ext_handler_pkg(uri, use_proxy=use_proxy) + package = self.client.fetch(uri, headers=headers, use_proxy=use_proxy, decode=False) if package is None: logger.verbose("Download did not succeed, falling back to host plugin") host = self.client.get_host_plugin() uri, headers = host.get_artifact_request(uri, host.manifest_uri) - package = super(WireProtocol, self).download_ext_handler_pkg(uri, headers=headers, use_proxy=False) + package = self.client.fetch(uri, headers=headers, use_proxy=False, decode=False) return package @@ -637,7 +638,8 @@ def fetch_manifest(self, version_uris): raise ProtocolError("Failed to fetch manifest from all sources") - def fetch(self, uri, headers=None, use_proxy=None): + def fetch(self, uri, headers=None, use_proxy=None, decode=True): + content = None logger.verbose("Fetch [{0}] with headers [{1}]", uri, headers) try: resp = self.call_storage_service( @@ -647,21 +649,36 @@ def fetch(self, uri, headers=None, use_proxy=None): use_proxy=use_proxy) if restutil.request_failed(resp): - msg = "[Storage Failed] URI {0} ".format(uri) - if resp is not None: - msg += restutil.read_response_error(resp) + error_response = restutil.read_response_error(resp) + msg = "Fetch failed from [{0}]: {1}".format(uri, error_response) logger.warn(msg) + + self.report_fetch(uri, + is_healthy=restutil.request_failed_at_hostplugin(resp), + response=error_response) + raise ProtocolError(msg) - return self.decode_config(resp.read()) + response_content = resp.read() + content = self.decode_config(response_content) if decode else response_content - except (HttpError, ProtocolError) as e: + self.report_fetch(uri) + + except (HttpError, ProtocolError, IOError) as e: logger.verbose("Fetch failed from [{0}]: {1}", uri, e) if isinstance(e, ResourceGoneError): raise - return None + return content + + def report_fetch(self, uri, is_healthy=True, source='WireClient', response=''): + if uri == URI_FORMAT_GET_EXTENSION_ARTIFACT.format(self.endpoint, HOST_PLUGIN_PORT) \ + and self.host_plugin is not None \ + and self.host_plugin.health_service is not None: + self.host_plugin.health_service.report_host_plugin_extension_artifact(is_healthy=is_healthy, + source=source, + response=response) def update_hosting_env(self, goal_state): if goal_state.hosting_env_uri is None: diff --git a/azurelinuxagent/common/utils/restutil.py b/azurelinuxagent/common/utils/restutil.py index f15ceb5499..310ece91e6 100644 --- a/azurelinuxagent/common/utils/restutil.py +++ b/azurelinuxagent/common/utils/restutil.py @@ -62,6 +62,12 @@ httpclient.ACCEPTED ] +HOSTPLUGIN_FAILURE_CODES = [ + 500, + 502, + 503 +] + THROTTLE_CODES = [ httpclient.FORBIDDEN, httpclient.SERVICE_UNAVAILABLE, @@ -394,6 +400,10 @@ def request_succeeded(resp, ok_codes=OK_CODES): return resp is not None and resp.status in ok_codes +def request_failed_at_hostplugin(resp, failure_codes=HOSTPLUGIN_FAILURE_CODES): + return resp is not None and resp.status in failure_codes + + def read_response_error(resp): result = '' if resp is not None: diff --git a/tests/protocol/test_wire.py b/tests/protocol/test_wire.py index 99dc559658..e51c6a0bfb 100644 --- a/tests/protocol/test_wire.py +++ b/tests/protocol/test_wire.py @@ -155,29 +155,21 @@ def test_get_host_ga_plugin(self, *args): self.assertEqual(goal_state.role_config_name, host_plugin.role_config_name) self.assertEqual(1, patch_get_goal_state.call_count) - def test_download_ext_handler_pkg_fallback(self, *args): + @patch("azurelinuxagent.common.utils.restutil.http_request", side_effect=IOError) + @patch("azurelinuxagent.common.protocol.wire.WireClient.get_host_plugin") + @patch("azurelinuxagent.common.protocol.hostplugin.HostPluginProtocol.get_artifact_request") + def test_download_ext_handler_pkg_fallback(self, patch_request, patch_get_host, patch_http, *args): ext_uri = 'extension_uri' host_uri = 'host_uri' - mock_host = HostPluginProtocol(host_uri, 'container_id', 'role_config') - with patch.object(restutil, - "http_request", - side_effect=IOError) as patch_http: - with patch.object(WireClient, - "get_host_plugin", - return_value=mock_host): - with patch.object(HostPluginProtocol, - "get_artifact_request", - return_value=[host_uri, {}]) as patch_request: - - WireProtocol(wireserver_url).download_ext_handler_pkg(ext_uri) + patch_get_host.return_value = HostPluginProtocol(host_uri, 'container_id', 'role_config') + patch_request.return_value = [host_uri, {}] - self.assertEqual(patch_http.call_count, 2) - self.assertEqual(patch_request.call_count, 1) + WireProtocol(wireserver_url).download_ext_handler_pkg(ext_uri) - self.assertEqual(patch_http.call_args_list[0][0][1], - ext_uri) - self.assertEqual(patch_http.call_args_list[1][0][1], - host_uri) + self.assertEqual(patch_http.call_count, 2) + self.assertEqual(patch_request.call_count, 1) + self.assertEqual(patch_http.call_args_list[0][0][1], ext_uri) + self.assertEqual(patch_http.call_args_list[1][0][1], host_uri) def test_upload_status_blob_default(self, *args): vmstatus = VMStatus(message="Ready", status="Ready") From 2f2abfc8a0a477128fdf94f13ccf008995f86d49 Mon Sep 17 00:00:00 2001 From: Hans Krijger Date: Wed, 30 May 2018 18:02:33 -0700 Subject: [PATCH 22/61] whitespace cleanup --- azurelinuxagent/common/protocol/wire.py | 14 +++++--------- 1 file changed, 5 insertions(+), 9 deletions(-) diff --git a/azurelinuxagent/common/protocol/wire.py b/azurelinuxagent/common/protocol/wire.py index 14c8f782cf..992050550e 100644 --- a/azurelinuxagent/common/protocol/wire.py +++ b/azurelinuxagent/common/protocol/wire.py @@ -652,21 +652,17 @@ def fetch(self, uri, headers=None, use_proxy=None, decode=True): error_response = restutil.read_response_error(resp) msg = "Fetch failed from [{0}]: {1}".format(uri, error_response) logger.warn(msg) - self.report_fetch(uri, is_healthy=restutil.request_failed_at_hostplugin(resp), response=error_response) - raise ProtocolError(msg) - - response_content = resp.read() - content = self.decode_config(response_content) if decode else response_content - - self.report_fetch(uri) - + else: + response_content = resp.read() + content = self.decode_config(response_content) if decode else response_content + self.report_fetch(uri) + except (HttpError, ProtocolError, IOError) as e: logger.verbose("Fetch failed from [{0}]: {1}", uri, e) - if isinstance(e, ResourceGoneError): raise From d5b78749ab273fd637147d45a180e3c26ebbb436 Mon Sep 17 00:00:00 2001 From: Hans Krijger Date: Wed, 30 May 2018 18:04:23 -0700 Subject: [PATCH 23/61] switch default --- azurelinuxagent/common/protocol/hostplugin.py | 3 ++- azurelinuxagent/common/protocol/wire.py | 2 +- 2 files changed, 3 insertions(+), 2 deletions(-) diff --git a/azurelinuxagent/common/protocol/hostplugin.py b/azurelinuxagent/common/protocol/hostplugin.py index bbe044f2ed..1e1394addc 100644 --- a/azurelinuxagent/common/protocol/hostplugin.py +++ b/azurelinuxagent/common/protocol/hostplugin.py @@ -47,7 +47,8 @@ class HostPluginProtocol(object): - _is_default_channel = False + # TODO: debugging + _is_default_channel = True def __init__(self, endpoint, container_id, role_config_name): if endpoint is None: diff --git a/azurelinuxagent/common/protocol/wire.py b/azurelinuxagent/common/protocol/wire.py index 992050550e..912c05b0dd 100644 --- a/azurelinuxagent/common/protocol/wire.py +++ b/azurelinuxagent/common/protocol/wire.py @@ -660,7 +660,7 @@ def fetch(self, uri, headers=None, use_proxy=None, decode=True): response_content = resp.read() content = self.decode_config(response_content) if decode else response_content self.report_fetch(uri) - + except (HttpError, ProtocolError, IOError) as e: logger.verbose("Fetch failed from [{0}]: {1}", uri, e) if isinstance(e, ResourceGoneError): From 1806eb48fd2daba515a11adb2fc0cd0c195040e1 Mon Sep 17 00:00:00 2001 From: Hans Krijger Date: Thu, 31 May 2018 00:20:17 -0700 Subject: [PATCH 24/61] exception formatting --- azurelinuxagent/ga/exthandlers.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/azurelinuxagent/ga/exthandlers.py b/azurelinuxagent/ga/exthandlers.py index 28953dd7e9..3fbe420ac9 100644 --- a/azurelinuxagent/ga/exthandlers.py +++ b/azurelinuxagent/ga/exthandlers.py @@ -1113,7 +1113,7 @@ def set_handler_status(self, status="NotReady", message="", code=0): except (IOError, ValueError, ProtocolError) as e: fileutil.clean_ioerror(e, paths=[status_file]) - self.logger.error("Failed to save handler status: {0}", traceback.format_exc()) + self.logger.error("Failed to save handler status: {0}", ustr(e)) def get_handler_status(self): state_dir = self.get_conf_dir() From 21493bc2eeb319e409697433f311474a97d5bf4e Mon Sep 17 00:00:00 2001 From: Hans Krijger Date: Thu, 31 May 2018 00:33:41 -0700 Subject: [PATCH 25/61] debugging --- azurelinuxagent/common/protocol/wire.py | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/azurelinuxagent/common/protocol/wire.py b/azurelinuxagent/common/protocol/wire.py index 912c05b0dd..616c9785e8 100644 --- a/azurelinuxagent/common/protocol/wire.py +++ b/azurelinuxagent/common/protocol/wire.py @@ -659,7 +659,9 @@ def fetch(self, uri, headers=None, use_proxy=None, decode=True): else: response_content = resp.read() content = self.decode_config(response_content) if decode else response_content - self.report_fetch(uri) + + # TODO: debugging + self.report_fetch(uri, source=headers['x-ms-artifact-location']) except (HttpError, ProtocolError, IOError) as e: logger.verbose("Fetch failed from [{0}]: {1}", uri, e) @@ -672,6 +674,8 @@ def report_fetch(self, uri, is_healthy=True, source='WireClient', response=''): if uri == URI_FORMAT_GET_EXTENSION_ARTIFACT.format(self.endpoint, HOST_PLUGIN_PORT) \ and self.host_plugin is not None \ and self.host_plugin.health_service is not None: + # TODO: debugging + logger.info("report_fetch: {0}", source) self.host_plugin.health_service.report_host_plugin_extension_artifact(is_healthy=is_healthy, source=source, response=response) From 17157be6f858a40affb0e95c48895c6c638a3154 Mon Sep 17 00:00:00 2001 From: Hans Krijger Date: Thu, 31 May 2018 00:40:19 -0700 Subject: [PATCH 26/61] remove debugging --- azurelinuxagent/common/protocol/wire.py | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/azurelinuxagent/common/protocol/wire.py b/azurelinuxagent/common/protocol/wire.py index 616c9785e8..123ee2bf91 100644 --- a/azurelinuxagent/common/protocol/wire.py +++ b/azurelinuxagent/common/protocol/wire.py @@ -660,8 +660,7 @@ def fetch(self, uri, headers=None, use_proxy=None, decode=True): response_content = resp.read() content = self.decode_config(response_content) if decode else response_content - # TODO: debugging - self.report_fetch(uri, source=headers['x-ms-artifact-location']) + self.report_fetch(uri) except (HttpError, ProtocolError, IOError) as e: logger.verbose("Fetch failed from [{0}]: {1}", uri, e) @@ -674,8 +673,6 @@ def report_fetch(self, uri, is_healthy=True, source='WireClient', response=''): if uri == URI_FORMAT_GET_EXTENSION_ARTIFACT.format(self.endpoint, HOST_PLUGIN_PORT) \ and self.host_plugin is not None \ and self.host_plugin.health_service is not None: - # TODO: debugging - logger.info("report_fetch: {0}", source) self.host_plugin.health_service.report_host_plugin_extension_artifact(is_healthy=is_healthy, source=source, response=response) From edb824a1c563f144ce42c6d3ae217eeccb4aa2b0 Mon Sep 17 00:00:00 2001 From: Hans Krijger Date: Thu, 31 May 2018 00:43:49 -0700 Subject: [PATCH 27/61] remove 400 from resource gone codes --- azurelinuxagent/common/utils/restutil.py | 1 - 1 file changed, 1 deletion(-) diff --git a/azurelinuxagent/common/utils/restutil.py b/azurelinuxagent/common/utils/restutil.py index 310ece91e6..b0b92c9082 100644 --- a/azurelinuxagent/common/utils/restutil.py +++ b/azurelinuxagent/common/utils/restutil.py @@ -52,7 +52,6 @@ ] RESOURCE_GONE_CODES = [ - httpclient.BAD_REQUEST, httpclient.GONE ] From a0b53ec27d0c212c904c90521f0b3c33c27e7eda Mon Sep 17 00:00:00 2001 From: Hans Krijger Date: Thu, 31 May 2018 01:35:25 -0700 Subject: [PATCH 28/61] rollup hostplugin fetch reporting --- azurelinuxagent/common/errorstate.py | 1 + azurelinuxagent/common/protocol/hostplugin.py | 29 ++++++++++++++++++- azurelinuxagent/common/protocol/wire.py | 17 +++-------- 3 files changed, 33 insertions(+), 14 deletions(-) diff --git a/azurelinuxagent/common/errorstate.py b/azurelinuxagent/common/errorstate.py index 52e2ddd5ec..902d831ae6 100644 --- a/azurelinuxagent/common/errorstate.py +++ b/azurelinuxagent/common/errorstate.py @@ -2,6 +2,7 @@ ERROR_STATE_DELTA_DEFAULT = timedelta(minutes=15) ERROR_STATE_DELTA_INSTALL = timedelta(minutes=5) +ERROR_STATE_HOST_PLUGIN_FAILURE = timedelta(minutes=5) class ErrorState(object): def __init__(self, min_timedelta=ERROR_STATE_DELTA_DEFAULT): diff --git a/azurelinuxagent/common/protocol/hostplugin.py b/azurelinuxagent/common/protocol/hostplugin.py index 1e1394addc..33e296ec4d 100644 --- a/azurelinuxagent/common/protocol/hostplugin.py +++ b/azurelinuxagent/common/protocol/hostplugin.py @@ -18,10 +18,11 @@ # import base64 +import datetime import json -import traceback from azurelinuxagent.common import logger +from azurelinuxagent.common.errorstate import ErrorState, ERROR_STATE_HOST_PLUGIN_FAILURE from azurelinuxagent.common.exception import HttpError, ProtocolError, \ ResourceGoneError from azurelinuxagent.common.future import ustr, httpclient @@ -50,6 +51,8 @@ class HostPluginProtocol(object): # TODO: debugging _is_default_channel = True + FETCH_REPORTING_PERIOD = datetime.timedelta(minutes=1) + def __init__(self, endpoint, container_id, role_config_name): if endpoint is None: raise ProtocolError("HostGAPlugin: Endpoint not provided") @@ -62,6 +65,8 @@ def __init__(self, endpoint, container_id, role_config_name): self.role_config_name = role_config_name self.manifest_uri = None self.health_service = HealthService(endpoint) + self.fetch_error_state = ErrorState(min_timedelta=ERROR_STATE_HOST_PLUGIN_FAILURE) + self.fetch_last_timestamp = None @staticmethod def is_default_channel(): @@ -140,6 +145,28 @@ def get_artifact_request(self, artifact_url, artifact_manifest_url=None): return url, headers + def report_fetch(self, uri, is_healthy=True, source='WireClient', response=''): + + if uri != URI_FORMAT_GET_EXTENSION_ARTIFACT.format(self.endpoint, HOST_PLUGIN_PORT): + return + + if is_healthy: + self.fetch_error_state.reset() + else: + self.fetch_error_state.incr() + + if self.fetch_last_timestamp is None: + self.fetch_last_timestamp = datetime.datetime.utcnow() - HostPluginProtocol.FETCH_REPORTING_PERIOD + + if datetime.datetime.utcnow() < (self.fetch_last_timestamp + HostPluginProtocol.FETCH_REPORTING_PERIOD): + return + + self.fetch_last_timestamp = datetime.datetime.utcnow() + overall_health = self.fetch_error_state.is_triggered() is False + self.health_service.report_host_plugin_extension_artifact(is_healthy=overall_health, + source=source, + response=response) + def put_vm_log(self, content): raise NotImplementedError("Unimplemented") diff --git a/azurelinuxagent/common/protocol/wire.py b/azurelinuxagent/common/protocol/wire.py index 123ee2bf91..eabfb32d7d 100644 --- a/azurelinuxagent/common/protocol/wire.py +++ b/azurelinuxagent/common/protocol/wire.py @@ -652,15 +652,14 @@ def fetch(self, uri, headers=None, use_proxy=None, decode=True): error_response = restutil.read_response_error(resp) msg = "Fetch failed from [{0}]: {1}".format(uri, error_response) logger.warn(msg) - self.report_fetch(uri, - is_healthy=restutil.request_failed_at_hostplugin(resp), - response=error_response) + self.host_plugin.report_fetch(uri, + is_healthy=restutil.request_failed_at_hostplugin(resp), + response=error_response) raise ProtocolError(msg) else: response_content = resp.read() content = self.decode_config(response_content) if decode else response_content - - self.report_fetch(uri) + self.host_plugin.report_fetch(uri) except (HttpError, ProtocolError, IOError) as e: logger.verbose("Fetch failed from [{0}]: {1}", uri, e) @@ -669,14 +668,6 @@ def fetch(self, uri, headers=None, use_proxy=None, decode=True): return content - def report_fetch(self, uri, is_healthy=True, source='WireClient', response=''): - if uri == URI_FORMAT_GET_EXTENSION_ARTIFACT.format(self.endpoint, HOST_PLUGIN_PORT) \ - and self.host_plugin is not None \ - and self.host_plugin.health_service is not None: - self.host_plugin.health_service.report_host_plugin_extension_artifact(is_healthy=is_healthy, - source=source, - response=response) - def update_hosting_env(self, goal_state): if goal_state.hosting_env_uri is None: raise ProtocolError("HostingEnvironmentConfig uri is empty") From f3ee5cbc6f517693e5a8ba98542fb448cc8eb517 Mon Sep 17 00:00:00 2001 From: Hans Krijger Date: Thu, 31 May 2018 01:38:03 -0700 Subject: [PATCH 29/61] update exthandler prefix --- azurelinuxagent/agent.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/azurelinuxagent/agent.py b/azurelinuxagent/agent.py index 4229194cd7..e1dfc4ed59 100644 --- a/azurelinuxagent/agent.py +++ b/azurelinuxagent/agent.py @@ -129,7 +129,7 @@ def run_exthandlers(self): """ Run the update and extension handler """ - logger.set_prefix("Upd/Ext-Handler") + logger.set_prefix("ExtHandler") from azurelinuxagent.ga.update import get_update_handler update_handler = get_update_handler() update_handler.run() From 38f53b293340e07e8bc53f528bba27d031fa7cf2 Mon Sep 17 00:00:00 2001 From: Hans Krijger Date: Thu, 31 May 2018 12:28:15 -0700 Subject: [PATCH 30/61] report fetch from GuestAgent --- azurelinuxagent/common/protocol/hostplugin.py | 2 +- azurelinuxagent/common/protocol/wire.py | 3 ++- azurelinuxagent/ga/update.py | 12 ++++++++++-- 3 files changed, 13 insertions(+), 4 deletions(-) diff --git a/azurelinuxagent/common/protocol/hostplugin.py b/azurelinuxagent/common/protocol/hostplugin.py index 33e296ec4d..716fe71280 100644 --- a/azurelinuxagent/common/protocol/hostplugin.py +++ b/azurelinuxagent/common/protocol/hostplugin.py @@ -145,7 +145,7 @@ def get_artifact_request(self, artifact_url, artifact_manifest_url=None): return url, headers - def report_fetch(self, uri, is_healthy=True, source='WireClient', response=''): + def report_fetch(self, uri, is_healthy=True, source='', response=''): if uri != URI_FORMAT_GET_EXTENSION_ARTIFACT.format(self.endpoint, HOST_PLUGIN_PORT): return diff --git a/azurelinuxagent/common/protocol/wire.py b/azurelinuxagent/common/protocol/wire.py index eabfb32d7d..bd03d1646c 100644 --- a/azurelinuxagent/common/protocol/wire.py +++ b/azurelinuxagent/common/protocol/wire.py @@ -654,12 +654,13 @@ def fetch(self, uri, headers=None, use_proxy=None, decode=True): logger.warn(msg) self.host_plugin.report_fetch(uri, is_healthy=restutil.request_failed_at_hostplugin(resp), + source='WireClient', response=error_response) raise ProtocolError(msg) else: response_content = resp.read() content = self.decode_config(response_content) if decode else response_content - self.host_plugin.report_fetch(uri) + self.host_plugin.report_fetch(uri, source='WireClient') except (HttpError, ProtocolError, IOError) as e: logger.verbose("Fetch failed from [{0}]: {1}", uri, e) diff --git a/azurelinuxagent/ga/update.py b/azurelinuxagent/ga/update.py index ec9329f440..95c6e01418 100644 --- a/azurelinuxagent/ga/update.py +++ b/azurelinuxagent/ga/update.py @@ -85,6 +85,7 @@ "ovf-env.xml" ] + def get_update_handler(): return UpdateHandler() @@ -874,6 +875,8 @@ def _download(self): def _fetch(self, uri, headers=None, use_proxy=True): package = None try: + is_healthy = True + error_response = '' resp = restutil.http_get(uri, use_proxy=use_proxy, headers=headers) if restutil.request_succeeded(resp): package = resp.read() @@ -882,8 +885,13 @@ def _fetch(self, uri, headers=None, use_proxy=True): asbin=True) logger.verbose(u"Agent {0} downloaded from {1}", self.name, uri) else: - logger.verbose("Fetch was unsuccessful [{0}]", - restutil.read_response_error(resp)) + error_response = restutil.read_response_error(resp) + logger.verbose("Fetch was unsuccessful [{0}]", error_response) + is_healthy = restutil.request_failed_at_hostplugin(resp) + + if self.host is not None: + self.host.report_fetch(uri, is_healthy, source='GuestAgent', response=error_response) + except restutil.HttpError as http_error: if isinstance(http_error, ResourceGoneError): raise From 60d63456595cb09b21a5a3b52f85885785647f45 Mon Sep 17 00:00:00 2001 From: Hans Krijger Date: Thu, 31 May 2018 13:05:20 -0700 Subject: [PATCH 31/61] refactor, report status health --- .../common/protocol/healthservice.py | 12 +++ azurelinuxagent/common/protocol/hostplugin.py | 80 ++++++++++++------- azurelinuxagent/common/protocol/wire.py | 10 +-- azurelinuxagent/ga/update.py | 2 +- 4 files changed, 71 insertions(+), 33 deletions(-) diff --git a/azurelinuxagent/common/protocol/healthservice.py b/azurelinuxagent/common/protocol/healthservice.py index 239ce2a19e..2a8f2cf96f 100644 --- a/azurelinuxagent/common/protocol/healthservice.py +++ b/azurelinuxagent/common/protocol/healthservice.py @@ -104,6 +104,18 @@ def report_host_plugin_extension_artifact(self, is_healthy, source, response): value=response)) self.report() + def report_host_plugin_status(self, is_healthy, response): + """ + Reports a signal for /status + :param is_healthy: whether the api call succeeded + :param response: debugging information for failures + :return: + """ + self.observations.append(Observation(name=HealthService.HOST_PLUGIN_STATUS_OBSERVATION_NAME, + is_healthy=is_healthy, + value=response)) + self.report() + def report(self): logger.verbose('HealthService: report observations') try: diff --git a/azurelinuxagent/common/protocol/hostplugin.py b/azurelinuxagent/common/protocol/hostplugin.py index 716fe71280..0a4631a38d 100644 --- a/azurelinuxagent/common/protocol/hostplugin.py +++ b/azurelinuxagent/common/protocol/hostplugin.py @@ -52,6 +52,7 @@ class HostPluginProtocol(object): _is_default_channel = True FETCH_REPORTING_PERIOD = datetime.timedelta(minutes=1) + STATUS_REPORTING_PERIOD = datetime.timedelta(minutes=1) def __init__(self, endpoint, container_id, role_config_name): if endpoint is None: @@ -66,7 +67,9 @@ def __init__(self, endpoint, container_id, role_config_name): self.manifest_uri = None self.health_service = HealthService(endpoint) self.fetch_error_state = ErrorState(min_timedelta=ERROR_STATE_HOST_PLUGIN_FAILURE) + self.status_error_state = ErrorState(min_timedelta=ERROR_STATE_HOST_PLUGIN_FAILURE) self.fetch_last_timestamp = None + self.status_last_timestamp = None @staticmethod def is_default_channel(): @@ -145,27 +148,43 @@ def get_artifact_request(self, artifact_url, artifact_manifest_url=None): return url, headers - def report_fetch(self, uri, is_healthy=True, source='', response=''): + def report_fetch_health(self, uri, is_healthy=True, source='', response=''): if uri != URI_FORMAT_GET_EXTENSION_ARTIFACT.format(self.endpoint, HOST_PLUGIN_PORT): return + if self.should_report(is_healthy, + self.fetch_error_state, + self.fetch_last_timestamp, + HostPluginProtocol.FETCH_REPORTING_PERIOD): + self.fetch_last_timestamp = datetime.datetime.utcnow() + health_signal = self.fetch_error_state.is_triggered() is False + self.health_service.report_host_plugin_extension_artifact(is_healthy=health_signal, + source=source, + response=response) + + def report_status_health(self, is_healthy, response=''): + if self.should_report(is_healthy, + self.status_error_state, + self.status_last_timestamp, + HostPluginProtocol.STATUS_REPORTING_PERIOD): + self.status_last_timestamp = datetime.datetime.utcnow() + health_signal = self.status_error_state.is_triggered() is False + self.health_service.report_host_plugin_status(is_healthy=health_signal, + response=response) + + @staticmethod + def should_report(is_healthy, error_state, last_timestamp, period): + if is_healthy: - self.fetch_error_state.reset() + error_state.reset() else: - self.fetch_error_state.incr() + error_state.incr() - if self.fetch_last_timestamp is None: - self.fetch_last_timestamp = datetime.datetime.utcnow() - HostPluginProtocol.FETCH_REPORTING_PERIOD - - if datetime.datetime.utcnow() < (self.fetch_last_timestamp + HostPluginProtocol.FETCH_REPORTING_PERIOD): - return + if last_timestamp is None: + last_timestamp = datetime.datetime.utcnow() - period - self.fetch_last_timestamp = datetime.datetime.utcnow() - overall_health = self.fetch_error_state.is_triggered() is False - self.health_service.report_host_plugin_extension_artifact(is_healthy=overall_health, - source=source, - response=response) + return datetime.datetime.utcnow() >= (last_timestamp + period) def put_vm_log(self, content): raise NotImplementedError("Unimplemented") @@ -206,16 +225,20 @@ def _put_block_blob_status(self, sas_url, status_blob): url = URI_FORMAT_PUT_VM_STATUS.format(self.endpoint, HOST_PLUGIN_PORT) response = restutil.http_put(url, - data=self._build_status_data( - sas_url, - status_blob.get_block_blob_headers(len(status_blob.data)), - bytearray(status_blob.data, encoding='utf-8')), - headers=self._build_status_headers()) + data=self._build_status_data( + sas_url, + status_blob.get_block_blob_headers(len(status_blob.data)), + bytearray(status_blob.data, encoding='utf-8')), + headers=self._build_status_headers()) if restutil.request_failed(response): - raise HttpError("HostGAPlugin: Put BlockBlob failed: {0}".format( - restutil.read_response_error(response))) + error_response = restutil.read_response_error(response) + is_healthy = restutil.request_failed_at_hostplugin(response) + self.report_status_health(is_healthy=is_healthy, response=error_response) + raise HttpError("HostGAPlugin: Put BlockBlob failed: {0}" + .format(error_response)) else: + self.report_status_health(is_healthy=True) logger.verbose("HostGAPlugin: Put BlockBlob status succeeded") def _put_page_blob_status(self, sas_url, status_blob): @@ -228,16 +251,19 @@ def _put_page_blob_status(self, sas_url, status_blob): # First, initialize an empty blob response = restutil.http_put(url, - data=self._build_status_data( - sas_url, - status_blob.get_page_blob_create_headers(status_size)), - headers=self._build_status_headers()) + data=self._build_status_data( + sas_url, + status_blob.get_page_blob_create_headers(status_size)), + headers=self._build_status_headers()) if restutil.request_failed(response): - raise HttpError( - "HostGAPlugin: Failed PageBlob clean-up: {0}".format( - restutil.read_response_error(response))) + error_response = restutil.read_response_error(response) + is_healthy = restutil.request_failed_at_hostplugin(response) + self.report_status_health(is_healthy=is_healthy, response=error_response) + raise HttpError("HostGAPlugin: Failed PageBlob clean-up: {0}" + .format(error_response)) else: + self.report_status_health(is_healthy=True) logger.verbose("HostGAPlugin: PageBlob clean-up succeeded") # Then, upload the blob in pages diff --git a/azurelinuxagent/common/protocol/wire.py b/azurelinuxagent/common/protocol/wire.py index bd03d1646c..d75b2f1e3c 100644 --- a/azurelinuxagent/common/protocol/wire.py +++ b/azurelinuxagent/common/protocol/wire.py @@ -652,15 +652,15 @@ def fetch(self, uri, headers=None, use_proxy=None, decode=True): error_response = restutil.read_response_error(resp) msg = "Fetch failed from [{0}]: {1}".format(uri, error_response) logger.warn(msg) - self.host_plugin.report_fetch(uri, - is_healthy=restutil.request_failed_at_hostplugin(resp), - source='WireClient', - response=error_response) + self.host_plugin.report_fetch_health(uri, + is_healthy=restutil.request_failed_at_hostplugin(resp), + source='WireClient', + response=error_response) raise ProtocolError(msg) else: response_content = resp.read() content = self.decode_config(response_content) if decode else response_content - self.host_plugin.report_fetch(uri, source='WireClient') + self.host_plugin.report_fetch_health(uri, source='WireClient') except (HttpError, ProtocolError, IOError) as e: logger.verbose("Fetch failed from [{0}]: {1}", uri, e) diff --git a/azurelinuxagent/ga/update.py b/azurelinuxagent/ga/update.py index 95c6e01418..e4db815818 100644 --- a/azurelinuxagent/ga/update.py +++ b/azurelinuxagent/ga/update.py @@ -890,7 +890,7 @@ def _fetch(self, uri, headers=None, use_proxy=True): is_healthy = restutil.request_failed_at_hostplugin(resp) if self.host is not None: - self.host.report_fetch(uri, is_healthy, source='GuestAgent', response=error_response) + self.host.report_fetch_health(uri, is_healthy, source='GuestAgent', response=error_response) except restutil.HttpError as http_error: if isinstance(http_error, ResourceGoneError): From 86e2eadf1ab3d8410083e1636fd583a38bc661ff Mon Sep 17 00:00:00 2001 From: Hans Krijger Date: Thu, 31 May 2018 13:14:43 -0700 Subject: [PATCH 32/61] remove debugging --- azurelinuxagent/common/protocol/healthservice.py | 3 --- azurelinuxagent/common/protocol/hostplugin.py | 3 +-- 2 files changed, 1 insertion(+), 5 deletions(-) diff --git a/azurelinuxagent/common/protocol/healthservice.py b/azurelinuxagent/common/protocol/healthservice.py index 2a8f2cf96f..ef1267d2cc 100644 --- a/azurelinuxagent/common/protocol/healthservice.py +++ b/azurelinuxagent/common/protocol/healthservice.py @@ -119,9 +119,6 @@ def report_host_plugin_status(self, is_healthy, response): def report(self): logger.verbose('HealthService: report observations') try: - # TODO: debugging - logger.info("{0}", self.as_json) - restutil.http_post(self.endpoint, self.as_json, headers={'Content-Type': 'application/json'}) logger.verbose('HealthService: Reported observations to {0}: {1}', self.endpoint, self.as_json) except HttpError as e: diff --git a/azurelinuxagent/common/protocol/hostplugin.py b/azurelinuxagent/common/protocol/hostplugin.py index 0a4631a38d..bc39eb09ae 100644 --- a/azurelinuxagent/common/protocol/hostplugin.py +++ b/azurelinuxagent/common/protocol/hostplugin.py @@ -48,8 +48,7 @@ class HostPluginProtocol(object): - # TODO: debugging - _is_default_channel = True + _is_default_channel = False FETCH_REPORTING_PERIOD = datetime.timedelta(minutes=1) STATUS_REPORTING_PERIOD = datetime.timedelta(minutes=1) From 29aaaa6a6f5142ae049f62f560bedfe27028b848 Mon Sep 17 00:00:00 2001 From: Hans Krijger Date: Thu, 31 May 2018 14:03:13 -0700 Subject: [PATCH 33/61] update for tests --- azurelinuxagent/common/protocol/hostplugin.py | 18 ++++++++++-------- azurelinuxagent/common/protocol/wire.py | 12 +++++++----- tests/ga/test_update.py | 3 ++- tests/protocol/test_hostplugin.py | 8 ++++---- tests/protocol/test_wire.py | 11 ++++------- tests/utils/test_rest_util.py | 10 ---------- 6 files changed, 27 insertions(+), 35 deletions(-) diff --git a/azurelinuxagent/common/protocol/hostplugin.py b/azurelinuxagent/common/protocol/hostplugin.py index bc39eb09ae..cfbf350dc3 100644 --- a/azurelinuxagent/common/protocol/hostplugin.py +++ b/azurelinuxagent/common/protocol/hostplugin.py @@ -282,17 +282,19 @@ def _put_page_blob_status(self, sas_url, status_blob): # Send the page response = restutil.http_put(url, - data=self._build_status_data( - sas_url, - status_blob.get_page_blob_page_headers(start, end), - buf), - headers=self._build_status_headers()) + data=self._build_status_data( + sas_url, + status_blob.get_page_blob_page_headers(start, end), + buf), + headers=self._build_status_headers()) if restutil.request_failed(response): + error_response = restutil.read_response_error(response) + is_healthy = restutil.request_failed_at_hostplugin(response) + self.report_status_health(is_healthy=is_healthy, response=error_response) raise HttpError( - "HostGAPlugin Error: Put PageBlob bytes [{0},{1}]: " \ - "{2}".format( - start, end, restutil.read_response_error(response))) + "HostGAPlugin Error: Put PageBlob bytes " + "[{0},{1}]: {2}".format(start, end, error_response)) # Advance to the next page (if any) start = end diff --git a/azurelinuxagent/common/protocol/wire.py b/azurelinuxagent/common/protocol/wire.py index d75b2f1e3c..33d24318c7 100644 --- a/azurelinuxagent/common/protocol/wire.py +++ b/azurelinuxagent/common/protocol/wire.py @@ -652,15 +652,17 @@ def fetch(self, uri, headers=None, use_proxy=None, decode=True): error_response = restutil.read_response_error(resp) msg = "Fetch failed from [{0}]: {1}".format(uri, error_response) logger.warn(msg) - self.host_plugin.report_fetch_health(uri, - is_healthy=restutil.request_failed_at_hostplugin(resp), - source='WireClient', - response=error_response) + if self.host_plugin is not None: + self.host_plugin.report_fetch_health(uri, + is_healthy=restutil.request_failed_at_hostplugin(resp), + source='WireClient', + response=error_response) raise ProtocolError(msg) else: response_content = resp.read() content = self.decode_config(response_content) if decode else response_content - self.host_plugin.report_fetch_health(uri, source='WireClient') + if self.host_plugin is not None: + self.host_plugin.report_fetch_health(uri, source='WireClient') except (HttpError, ProtocolError, IOError) as e: logger.verbose("Fetch failed from [{0}]: {1}", uri, e) diff --git a/tests/ga/test_update.py b/tests/ga/test_update.py index 0a267965f4..d53bd88060 100644 --- a/tests/ga/test_update.py +++ b/tests/ga/test_update.py @@ -514,7 +514,8 @@ def test_download_fail(self, mock_http_get, mock_loaded, mock_downloaded): @patch("azurelinuxagent.ga.update.GuestAgent._ensure_downloaded") @patch("azurelinuxagent.ga.update.GuestAgent._ensure_loaded") @patch("azurelinuxagent.ga.update.restutil.http_get") - def test_download_fallback(self, mock_http_get, mock_loaded, mock_downloaded): + @patch("azurelinuxagent.ga.update.restutil.http_post") + def test_download_fallback(self, mock_http_post, mock_http_get, mock_loaded, mock_downloaded): self.remove_agents() self.assertFalse(os.path.isdir(self.agent_path)) diff --git a/tests/protocol/test_hostplugin.py b/tests/protocol/test_hostplugin.py index 5eed8522a2..fb9022b049 100644 --- a/tests/protocol/test_hostplugin.py +++ b/tests/protocol/test_hostplugin.py @@ -222,7 +222,7 @@ def test_validate_http_request(self): patch_api.return_value = API_VERSION plugin.put_vm_status(status_blob, sas_url, block_blob_type) - self.assertTrue(patch_http.call_count == 1) + self.assertTrue(patch_http.call_count == 2) self._validate_hostplugin_args( patch_http.call_args_list[0], test_goal_state, @@ -274,7 +274,7 @@ def test_validate_block_blob(self): patch_get.return_value = api_versions host_client.put_vm_status(status_blob, sas_url) - self.assertTrue(patch_http.call_count == 1) + self.assertTrue(patch_http.call_count == 2) self._validate_hostplugin_args( patch_http.call_args_list[0], test_goal_state, @@ -314,7 +314,7 @@ def test_validate_page_blobs(self): patch_get.return_value = api_versions host_client.put_vm_status(status_blob, sas_url) - self.assertTrue(patch_http.call_count == 2) + self.assertTrue(patch_http.call_count == 3) exp_data = self._hostplugin_data( status_blob.get_page_blob_create_headers( @@ -330,7 +330,7 @@ def test_validate_page_blobs(self): page) exp_data['requestUri'] += "?comp=page" self._validate_hostplugin_args( - patch_http.call_args_list[1], + patch_http.call_args_list[2], test_goal_state, exp_method, exp_url, exp_data) diff --git a/tests/protocol/test_wire.py b/tests/protocol/test_wire.py index e51c6a0bfb..a8a8ba26e1 100644 --- a/tests/protocol/test_wire.py +++ b/tests/protocol/test_wire.py @@ -28,13 +28,14 @@ @patch("time.sleep") @patch("azurelinuxagent.common.protocol.wire.CryptUtil") +@patch("azurelinuxagent.common.protocol.healthservice.HealthService.report") class TestWireProtocol(AgentTestCase): def setUp(self): super(TestWireProtocol, self).setUp() HostPluginProtocol.set_default_channel(False) - def _test_getters(self, test_data, MockCryptUtil, _): + def _test_getters(self, test_data, __, MockCryptUtil, _): MockCryptUtil.side_effect = test_data.mock_crypt_util with patch.object(restutil, 'http_get', test_data.mock_http_get): @@ -93,9 +94,7 @@ def test_getters_with_stale_goal_state(self, *args): # HostingEnvironmentConfig, will be retrieved the expected number self.assertEqual(2, test_data.call_counts["hostingenvuri"]) - def test_call_storage_kwargs(self, - mock_cryptutil, - mock_sleep): + def test_call_storage_kwargs(self, *args): from azurelinuxagent.common.utils import restutil with patch.object(restutil, 'http_get') as http_patch: http_req = restutil.http_get @@ -289,7 +288,6 @@ def test_get_in_vm_artifacts_profile_response_body_not_valid(self, *args): host_plugin_get_artifact_url_and_headers.assert_called_with(testurl) - def test_get_in_vm_artifacts_profile_default(self, *args): wire_protocol_client = WireProtocol(wireserver_url).client wire_protocol_client.ext_conf = ExtensionsConfig(None) @@ -302,8 +300,7 @@ def test_get_in_vm_artifacts_profile_default(self, *args): self.assertEqual(dict(onHold='true'), in_vm_artifacts_profile.__dict__) self.assertTrue(in_vm_artifacts_profile.is_on_hold()) - @patch("time.sleep") - def test_fetch_manifest_fallback(self, patch_sleep, *args): + def test_fetch_manifest_fallback(self, *args): uri1 = ExtHandlerVersionUri() uri1.uri = 'ext_uri' uris = DataContractList(ExtHandlerVersionUri) diff --git a/tests/utils/test_rest_util.py b/tests/utils/test_rest_util.py index adeb814185..05911f8229 100644 --- a/tests/utils/test_rest_util.py +++ b/tests/utils/test_rest_util.py @@ -387,16 +387,6 @@ def test_http_request_retries_for_safe_minimum_number_when_throttled(self, _http [call(1) for i in range(restutil.THROTTLE_RETRIES-1)], _sleep.call_args_list) - @patch("time.sleep") - @patch("azurelinuxagent.common.utils.restutil._http_request") - def test_http_request_raises_for_bad_request(self, _http_request, _sleep): - _http_request.side_effect = [ - Mock(status=httpclient.BAD_REQUEST) - ] - - self.assertRaises(ResourceGoneError, restutil.http_get, "https://foo.bar") - self.assertEqual(1, _http_request.call_count) - @patch("time.sleep") @patch("azurelinuxagent.common.utils.restutil._http_request") def test_http_request_raises_for_resource_gone(self, _http_request, _sleep): From 532c6b738c037e6fdb8c733aca053937fbae3af2 Mon Sep 17 00:00:00 2001 From: Hans Krijger Date: Fri, 1 Jun 2018 12:13:29 -0700 Subject: [PATCH 34/61] limit the observation field sizes --- .../common/protocol/healthservice.py | 18 +++++++++++++++--- 1 file changed, 15 insertions(+), 3 deletions(-) diff --git a/azurelinuxagent/common/protocol/healthservice.py b/azurelinuxagent/common/protocol/healthservice.py index ef1267d2cc..7976ff169a 100644 --- a/azurelinuxagent/common/protocol/healthservice.py +++ b/azurelinuxagent/common/protocol/healthservice.py @@ -27,6 +27,18 @@ class Observation(object): def __init__(self, name, is_healthy, description='', value=''): + if name is None: + raise ValueError("Observation name must be provided") + + if is_healthy is None: + raise ValueError("Observation health must be provided") + + if value is None: + value = '' + + if description is None: + description = '' + self.name = name self.is_healthy = is_healthy self.description = description @@ -35,10 +47,10 @@ def __init__(self, name, is_healthy, description='', value=''): @property def as_obj(self): return { - "ObservationName": self.name, + "ObservationName": self.name[:64], "IsHealthy": self.is_healthy, - "Description": self.description, - "Value": self.value + "Description": self.description[:128], + "Value": self.value[:128] } From a1c5bedb396a10c08893cd56cfe6d7df946ab404 Mon Sep 17 00:00:00 2001 From: Hans Krijger Date: Fri, 1 Jun 2018 12:23:07 -0700 Subject: [PATCH 35/61] change manifest message to verbose --- azurelinuxagent/ga/update.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/azurelinuxagent/ga/update.py b/azurelinuxagent/ga/update.py index e4db815818..d33eb712b1 100644 --- a/azurelinuxagent/ga/update.py +++ b/azurelinuxagent/ga/update.py @@ -541,7 +541,7 @@ def _purge_agents(self): known_versions = [agent.version for agent in self.agents] if CURRENT_VERSION not in known_versions: - logger.info( + logger.verbose( u"Running Agent {0} was not found in the agent manifest - adding to list", CURRENT_VERSION) known_versions.append(CURRENT_VERSION) From 653f2b4bf61586e3693d0f1fbfcf48a019cb0139 Mon Sep 17 00:00:00 2001 From: Hans Krijger Date: Fri, 1 Jun 2018 12:34:28 -0700 Subject: [PATCH 36/61] remove enable events from the log --- azurelinuxagent/ga/exthandlers.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/azurelinuxagent/ga/exthandlers.py b/azurelinuxagent/ga/exthandlers.py index 3fbe420ac9..32a04cdc4d 100644 --- a/azurelinuxagent/ga/exthandlers.py +++ b/azurelinuxagent/ga/exthandlers.py @@ -717,10 +717,10 @@ def copy_status_files(self, old_ext_handler_i): def set_operation(self, op): self.operation = op - def report_event(self, message="", is_success=True, duration=0): + def report_event(self, message="", is_success=True, duration=0, log_event=True): ext_handler_version = self.ext_handler.properties.version add_event(name=self.ext_handler.name, version=ext_handler_version, message=message, - op=self.operation, is_success=is_success, duration=duration) + op=self.operation, is_success=is_success, duration=duration, log_event=log_event) def download(self): begin_utc = datetime.datetime.utcnow() @@ -1004,7 +1004,7 @@ def launch_command(self, cmd, timeout=300): raise ExtensionError("Non-zero exit code: {0}, {1}\n{2}".format(ret, cmd, msg)) duration = elapsed_milliseconds(begin_utc) - self.report_event(message="{0}\n{1}".format(cmd, msg), duration=duration) + self.report_event(message="{0}\n{1}".format(cmd, msg), duration=duration, log_event=False) def load_manifest(self): man_file = self.get_manifest_file() From b32b9d0c90b1b810218b02bee1d0cee90a9e715f Mon Sep 17 00:00:00 2001 From: Hans Krijger Date: Fri, 1 Jun 2018 12:44:17 -0700 Subject: [PATCH 37/61] test layout cleanup --- tests/protocol/test_hostplugin.py | 62 +++++++++++++++---------------- 1 file changed, 29 insertions(+), 33 deletions(-) diff --git a/tests/protocol/test_hostplugin.py b/tests/protocol/test_hostplugin.py index fb9022b049..9341f0b7d0 100644 --- a/tests/protocol/test_hostplugin.py +++ b/tests/protocol/test_hostplugin.py @@ -19,27 +19,23 @@ import json import sys -from azurelinuxagent.common.future import ustr - -if sys.version_info[0] == 3: - import http.client as httpclient - bytebuffer = memoryview -elif sys.version_info[0] == 2: - import httplib as httpclient - bytebuffer = buffer - import azurelinuxagent.common.protocol.restapi as restapi import azurelinuxagent.common.protocol.wire as wire import azurelinuxagent.common.protocol.hostplugin as hostplugin -from azurelinuxagent.common import event -from azurelinuxagent.common.exception import ProtocolError, HttpError from azurelinuxagent.common.protocol.hostplugin import API_VERSION from azurelinuxagent.common.utils import restutil - from tests.protocol.mockwiredata import WireProtocolData, DATA_FILE from tests.tools import * +if sys.version_info[0] == 3: + import http.client as httpclient + bytebuffer = memoryview +elif sys.version_info[0] == 2: + import httplib as httpclient + bytebuffer = buffer + + hostplugin_status_url = "http://168.63.129.16:32526/status" sas_url = "http://sas_url" wireserver_url = "168.63.129.16" @@ -55,15 +51,15 @@ if PY_VERSION_MAJOR > 2: faux_status_b64 = faux_status_b64.decode('utf-8') + class TestHostPlugin(AgentTestCase): def _compare_data(self, actual, expected): for k in iter(expected.keys()): if k == 'content' or k == 'requestUri': if actual[k] != expected[k]: - print("Mismatch: Actual '{0}'='{1}', " \ - "Expected '{0}'='{3}'".format( - k, actual[k], expected[k])) + print("Mismatch: Actual '{0}'='{1}', " + "Expected '{0}'='{2}'".format(k, actual[k], expected[k])) return False elif k == 'headers': for h in expected['headers']: @@ -93,7 +89,7 @@ def _hostplugin_data(self, blob_headers, content=None): s = s.decode('utf-8') data['content'] = s return data - + def _hostplugin_headers(self, goal_state): return { 'x-ms-version': '2015-09-01', @@ -101,7 +97,7 @@ def _hostplugin_headers(self, goal_state): 'x-ms-containerid': goal_state.container_id, 'x-ms-host-config-name': goal_state.role_config_name } - + def _validate_hostplugin_args(self, args, goal_state, exp_method, exp_url, exp_data): args, kwargs = args self.assertEqual(exp_method, args[0]) @@ -195,7 +191,6 @@ def test_put_status_error_reporting(self): self.assertFalse(wire.HostPluginProtocol.is_default_channel()) self.assertTrue(patch_add_event.call_count == 1) - def test_validate_http_request(self): """Validate correct set of data is sent to HostGAPlugin when reporting VM status""" @@ -209,8 +204,8 @@ def test_validate_http_request(self): exp_method = 'PUT' exp_url = hostplugin_status_url exp_data = self._hostplugin_data( - status_blob.get_block_blob_headers(len(faux_status)), - bytearray(faux_status, encoding='utf-8')) + status_blob.get_block_blob_headers(len(faux_status)), + bytearray(faux_status, encoding='utf-8')) with patch.object(restutil, "http_request") as patch_http: patch_http.return_value = Mock(status=httpclient.OK) @@ -263,14 +258,14 @@ def test_validate_block_blob(self): exp_method = 'PUT' exp_url = hostplugin_status_url exp_data = self._hostplugin_data( - status_blob.get_block_blob_headers(len(faux_status)), - bytearray(faux_status, encoding='utf-8')) + status_blob.get_block_blob_headers(len(faux_status)), + bytearray(faux_status, encoding='utf-8')) with patch.object(restutil, "http_request") as patch_http: patch_http.return_value = Mock(status=httpclient.OK) with patch.object(wire.HostPluginProtocol, - "get_api_versions") as patch_get: + "get_api_versions") as patch_get: patch_get.return_value = api_versions host_client.put_vm_status(status_blob, sas_url) @@ -279,7 +274,7 @@ def test_validate_block_blob(self): patch_http.call_args_list[0], test_goal_state, exp_method, exp_url, exp_data) - + def test_validate_page_blobs(self): """Validate correct set of data is sent for page blobs""" wire_protocol_client = wire.WireProtocol(wireserver_url).client @@ -308,27 +303,27 @@ def test_validate_page_blobs(self): mock_response = MockResponse('', httpclient.OK) with patch.object(restutil, "http_request", - return_value=mock_response) as patch_http: + return_value=mock_response) as patch_http: with patch.object(wire.HostPluginProtocol, - "get_api_versions") as patch_get: + "get_api_versions") as patch_get: patch_get.return_value = api_versions host_client.put_vm_status(status_blob, sas_url) self.assertTrue(patch_http.call_count == 3) exp_data = self._hostplugin_data( - status_blob.get_page_blob_create_headers( - page_size)) + status_blob.get_page_blob_create_headers( + page_size)) self._validate_hostplugin_args( patch_http.call_args_list[0], test_goal_state, exp_method, exp_url, exp_data) exp_data = self._hostplugin_data( - status_blob.get_page_blob_page_headers( - 0, page_size), - page) - exp_data['requestUri'] += "?comp=page" + status_blob.get_page_blob_page_headers( + 0, page_size), + page) + exp_data['requestUri'] += "?comp=page" self._validate_hostplugin_args( patch_http.call_args_list[2], test_goal_state, @@ -356,7 +351,7 @@ def test_validate_get_extension_artifacts(self): for k in expected_headers: self.assertTrue(k in actual_headers) self.assertEqual(expected_headers[k], actual_headers[k]) - + class MockResponse: def __init__(self, body, status_code): @@ -366,5 +361,6 @@ def __init__(self, body, status_code): def read(self): return self.body + if __name__ == '__main__': unittest.main() From f5f26ae3f19f5d87ecb1c057cc1202db0515d6dd Mon Sep 17 00:00:00 2001 From: Hans Krijger Date: Fri, 1 Jun 2018 13:17:13 -0700 Subject: [PATCH 38/61] update existing host plugin tests to cover health service calls --- tests/protocol/test_hostplugin.py | 18 ++++++++++++++++++ 1 file changed, 18 insertions(+) diff --git a/tests/protocol/test_hostplugin.py b/tests/protocol/test_hostplugin.py index 9341f0b7d0..9496895fcc 100644 --- a/tests/protocol/test_hostplugin.py +++ b/tests/protocol/test_hostplugin.py @@ -218,11 +218,17 @@ def test_validate_http_request(self): plugin.put_vm_status(status_blob, sas_url, block_blob_type) self.assertTrue(patch_http.call_count == 2) + + # first call is to host plugin self._validate_hostplugin_args( patch_http.call_args_list[0], test_goal_state, exp_method, exp_url, exp_data) + # second call is to health service + self.assertEqual('POST', patch_http.call_args_list[1][0][0]) + self.assertEqual('http://168.63.129.16:80/HealthService', patch_http.call_args_list[1][0][1]) + def test_no_fallback(self): """ Validate fallback to upload status using HostGAPlugin is not happening @@ -270,11 +276,17 @@ def test_validate_block_blob(self): host_client.put_vm_status(status_blob, sas_url) self.assertTrue(patch_http.call_count == 2) + + # first call is to host plugin self._validate_hostplugin_args( patch_http.call_args_list[0], test_goal_state, exp_method, exp_url, exp_data) + # second call is to health service + self.assertEqual('POST', patch_http.call_args_list[1][0][0]) + self.assertEqual('http://168.63.129.16:80/HealthService', patch_http.call_args_list[1][0][1]) + def test_validate_page_blobs(self): """Validate correct set of data is sent for page blobs""" wire_protocol_client = wire.WireProtocol(wireserver_url).client @@ -311,6 +323,7 @@ def test_validate_page_blobs(self): self.assertTrue(patch_http.call_count == 3) + # first call is to host plugin exp_data = self._hostplugin_data( status_blob.get_page_blob_create_headers( page_size)) @@ -319,6 +332,11 @@ def test_validate_page_blobs(self): test_goal_state, exp_method, exp_url, exp_data) + # second call is to health service + self.assertEqual('POST', patch_http.call_args_list[1][0][0]) + self.assertEqual('http://168.63.129.16:80/HealthService', patch_http.call_args_list[1][0][1]) + + # last call is to host plugin exp_data = self._hostplugin_data( status_blob.get_page_blob_page_headers( 0, page_size), From 0821103d19572fd5df59fda0f98ed11d735c4bc9 Mon Sep 17 00:00:00 2001 From: Hans Krijger Date: Fri, 1 Jun 2018 15:14:14 -0700 Subject: [PATCH 39/61] do not log agent enable event --- azurelinuxagent/ga/update.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/azurelinuxagent/ga/update.py b/azurelinuxagent/ga/update.py index d33eb712b1..4fc32d975f 100644 --- a/azurelinuxagent/ga/update.py +++ b/azurelinuxagent/ga/update.py @@ -191,7 +191,8 @@ def run_latest(self, child_args=None): version=agent_version, op=WALAEventOperation.Enable, is_success=True, - message=msg) + message=msg, + log_event=False) if ret is None: ret = self.child_process.wait() From 0370af7935e714ea55d2f8234946a06f6e26f193 Mon Sep 17 00:00:00 2001 From: Hans Krijger Date: Fri, 1 Jun 2018 16:25:46 -0700 Subject: [PATCH 40/61] fix failure conditions, status codes, add tests --- azurelinuxagent/common/protocol/hostplugin.py | 6 +- azurelinuxagent/common/protocol/wire.py | 2 +- azurelinuxagent/common/utils/restutil.py | 13 +- azurelinuxagent/ga/update.py | 2 +- tests/protocol/test_hostplugin.py | 164 +++++++++++++++++- 5 files changed, 173 insertions(+), 14 deletions(-) diff --git a/azurelinuxagent/common/protocol/hostplugin.py b/azurelinuxagent/common/protocol/hostplugin.py index cfbf350dc3..c4416a3549 100644 --- a/azurelinuxagent/common/protocol/hostplugin.py +++ b/azurelinuxagent/common/protocol/hostplugin.py @@ -232,7 +232,7 @@ def _put_block_blob_status(self, sas_url, status_blob): if restutil.request_failed(response): error_response = restutil.read_response_error(response) - is_healthy = restutil.request_failed_at_hostplugin(response) + is_healthy = not restutil.request_failed_at_hostplugin(response) self.report_status_health(is_healthy=is_healthy, response=error_response) raise HttpError("HostGAPlugin: Put BlockBlob failed: {0}" .format(error_response)) @@ -257,7 +257,7 @@ def _put_page_blob_status(self, sas_url, status_blob): if restutil.request_failed(response): error_response = restutil.read_response_error(response) - is_healthy = restutil.request_failed_at_hostplugin(response) + is_healthy = not restutil.request_failed_at_hostplugin(response) self.report_status_health(is_healthy=is_healthy, response=error_response) raise HttpError("HostGAPlugin: Failed PageBlob clean-up: {0}" .format(error_response)) @@ -290,7 +290,7 @@ def _put_page_blob_status(self, sas_url, status_blob): if restutil.request_failed(response): error_response = restutil.read_response_error(response) - is_healthy = restutil.request_failed_at_hostplugin(response) + is_healthy = not restutil.request_failed_at_hostplugin(response) self.report_status_health(is_healthy=is_healthy, response=error_response) raise HttpError( "HostGAPlugin Error: Put PageBlob bytes " diff --git a/azurelinuxagent/common/protocol/wire.py b/azurelinuxagent/common/protocol/wire.py index 33d24318c7..843bd46039 100644 --- a/azurelinuxagent/common/protocol/wire.py +++ b/azurelinuxagent/common/protocol/wire.py @@ -654,7 +654,7 @@ def fetch(self, uri, headers=None, use_proxy=None, decode=True): logger.warn(msg) if self.host_plugin is not None: self.host_plugin.report_fetch_health(uri, - is_healthy=restutil.request_failed_at_hostplugin(resp), + is_healthy=not restutil.request_failed_at_hostplugin(resp), source='WireClient', response=error_response) raise ProtocolError(msg) diff --git a/azurelinuxagent/common/utils/restutil.py b/azurelinuxagent/common/utils/restutil.py index b0b92c9082..54f1195c28 100644 --- a/azurelinuxagent/common/utils/restutil.py +++ b/azurelinuxagent/common/utils/restutil.py @@ -61,10 +61,8 @@ httpclient.ACCEPTED ] -HOSTPLUGIN_FAILURE_CODES = [ - 500, - 502, - 503 +UPSTREAM_FAILURE_CODES = [ + 502 ] THROTTLE_CODES = [ @@ -399,8 +397,11 @@ def request_succeeded(resp, ok_codes=OK_CODES): return resp is not None and resp.status in ok_codes -def request_failed_at_hostplugin(resp, failure_codes=HOSTPLUGIN_FAILURE_CODES): - return resp is not None and resp.status in failure_codes +def request_failed_at_hostplugin(resp, upstream_failure_codes=UPSTREAM_FAILURE_CODES): + """ + Host plugin will return 502 for any upstream issue, so a failure is any 5xx except 502 + """ + return resp is not None and resp.status >= 500 and resp.status not in upstream_failure_codes def read_response_error(resp): diff --git a/azurelinuxagent/ga/update.py b/azurelinuxagent/ga/update.py index 4fc32d975f..7d0cee06ee 100644 --- a/azurelinuxagent/ga/update.py +++ b/azurelinuxagent/ga/update.py @@ -888,7 +888,7 @@ def _fetch(self, uri, headers=None, use_proxy=True): else: error_response = restutil.read_response_error(resp) logger.verbose("Fetch was unsuccessful [{0}]", error_response) - is_healthy = restutil.request_failed_at_hostplugin(resp) + is_healthy = not restutil.request_failed_at_hostplugin(resp) if self.host is not None: self.host.report_fetch_health(uri, is_healthy, source='GuestAgent', response=error_response) diff --git a/tests/protocol/test_hostplugin.py b/tests/protocol/test_hostplugin.py index 9496895fcc..7f2530b864 100644 --- a/tests/protocol/test_hostplugin.py +++ b/tests/protocol/test_hostplugin.py @@ -22,10 +22,12 @@ import azurelinuxagent.common.protocol.restapi as restapi import azurelinuxagent.common.protocol.wire as wire import azurelinuxagent.common.protocol.hostplugin as hostplugin +from azurelinuxagent.common.exception import HttpError from azurelinuxagent.common.protocol.hostplugin import API_VERSION from azurelinuxagent.common.utils import restutil from tests.protocol.mockwiredata import WireProtocolData, DATA_FILE +from tests.protocol.test_wire import MockResponse from tests.tools import * if sys.version_info[0] == 3: @@ -37,6 +39,8 @@ hostplugin_status_url = "http://168.63.129.16:32526/status" +hostplugin_versions_url = "http://168.63.129.16:32526/versions" +health_service_url = 'http://168.63.129.16:80/HealthService' sas_url = "http://sas_url" wireserver_url = "168.63.129.16" @@ -54,6 +58,21 @@ class TestHostPlugin(AgentTestCase): + def _init_host(self): + test_goal_state = wire.GoalState(WireProtocolData(DATA_FILE).goal_state) + host_plugin = wire.HostPluginProtocol(wireserver_url, + test_goal_state.container_id, + test_goal_state.role_config_name) + self.assertTrue(host_plugin.health_service is not None) + return host_plugin + + def _init_status_blob(self): + wire_protocol_client = wire.WireProtocol(wireserver_url).client + status_blob = wire_protocol_client.status_blob + status_blob.data = faux_status + status_blob.vm_status = restapi.VMStatus(message="Ready", status="Ready") + return status_blob + def _compare_data(self, actual, expected): for k in iter(expected.keys()): if k == 'content' or k == 'requestUri': @@ -227,7 +246,7 @@ def test_validate_http_request(self): # second call is to health service self.assertEqual('POST', patch_http.call_args_list[1][0][0]) - self.assertEqual('http://168.63.129.16:80/HealthService', patch_http.call_args_list[1][0][1]) + self.assertEqual(health_service_url, patch_http.call_args_list[1][0][1]) def test_no_fallback(self): """ @@ -255,6 +274,7 @@ def test_validate_block_blob(self): test_goal_state.role_config_name) self.assertFalse(host_client.is_initialized) self.assertTrue(host_client.api_versions is None) + self.assertTrue(host_client.health_service is not None) status_blob = wire_protocol_client.status_blob status_blob.data = faux_status @@ -285,7 +305,7 @@ def test_validate_block_blob(self): # second call is to health service self.assertEqual('POST', patch_http.call_args_list[1][0][0]) - self.assertEqual('http://168.63.129.16:80/HealthService', patch_http.call_args_list[1][0][1]) + self.assertEqual(health_service_url, patch_http.call_args_list[1][0][1]) def test_validate_page_blobs(self): """Validate correct set of data is sent for page blobs""" @@ -334,7 +354,7 @@ def test_validate_page_blobs(self): # second call is to health service self.assertEqual('POST', patch_http.call_args_list[1][0][0]) - self.assertEqual('http://168.63.129.16:80/HealthService', patch_http.call_args_list[1][0][1]) + self.assertEqual(health_service_url, patch_http.call_args_list[1][0][1]) # last call is to host plugin exp_data = self._hostplugin_data( @@ -360,6 +380,7 @@ def test_validate_get_extension_artifacts(self): test_goal_state.role_config_name) self.assertFalse(host_client.is_initialized) self.assertTrue(host_client.api_versions is None) + self.assertTrue(host_client.health_service is not None) with patch.object(wire.HostPluginProtocol, "get_api_versions", return_value=api_versions) as patch_get: actual_url, actual_headers = host_client.get_artifact_request(sas_url) @@ -370,6 +391,143 @@ def test_validate_get_extension_artifacts(self): self.assertTrue(k in actual_headers) self.assertEqual(expected_headers[k], actual_headers[k]) + @patch("azurelinuxagent.common.utils.restutil.http_get") + def test_health(self, patch_http_get): + host_plugin = self._init_host() + + patch_http_get.return_value = MockResponse('', 200) + result = host_plugin.get_health() + self.assertEqual(1, patch_http_get.call_count) + self.assertTrue(result) + + patch_http_get.return_value = MockResponse('', 500) + result = host_plugin.get_health() + self.assertFalse(result) + + @patch("azurelinuxagent.common.utils.restutil.http_get") + @patch("azurelinuxagent.common.protocol.healthservice.HealthService.report_host_plugin_versions") + def test_ensure_health_service_called(self, patch_http_get, patch_report_versions): + host_plugin = self._init_host() + + host_plugin.get_api_versions() + self.assertEqual(1, patch_http_get.call_count) + self.assertEqual(1, patch_report_versions.call_count) + + @patch("azurelinuxagent.common.utils.restutil.http_get") + @patch("azurelinuxagent.common.utils.restutil.http_post") + @patch("azurelinuxagent.common.utils.restutil.http_put") + def test_put_status_healthy_signal(self, patch_http_put, patch_http_post, patch_http_get): + host_plugin = self._init_host() + status_blob = self._init_status_blob() + # get_api_versions + patch_http_get.return_value = MockResponse(api_versions, 200) + # put status blob + patch_http_put.return_value = MockResponse('', 201) + + host_plugin.put_vm_status(status_blob=status_blob, sas_url=sas_url) + self.assertEqual(1, patch_http_get.call_count) + self.assertEqual(hostplugin_versions_url, patch_http_get.call_args[0][0]) + + self.assertEqual(2, patch_http_put.call_count) + self.assertEqual(hostplugin_status_url, patch_http_put.call_args_list[0][0][0]) + self.assertEqual(hostplugin_status_url, patch_http_put.call_args_list[1][0][0]) + + self.assertEqual(2, patch_http_post.call_count) + + # signal for /versions + self.assertEqual(health_service_url, patch_http_post.call_args_list[0][0][0]) + jstr = patch_http_post.call_args_list[0][0][1] + obj = json.loads(jstr) + self.assertEqual(1, len(obj['Observations'])) + self.assertTrue(obj['Observations'][0]['IsHealthy']) + self.assertEqual('GuestAgentPluginVersions', obj['Observations'][0]['ObservationName']) + + # signal for /status + self.assertEqual(health_service_url, patch_http_post.call_args_list[1][0][0]) + jstr = patch_http_post.call_args_list[1][0][1] + obj = json.loads(jstr) + self.assertEqual(1, len(obj['Observations'])) + self.assertTrue(obj['Observations'][0]['IsHealthy']) + self.assertEqual('GuestAgentPluginStatus', obj['Observations'][0]['ObservationName']) + + @patch("azurelinuxagent.common.utils.restutil.http_get") + @patch("azurelinuxagent.common.utils.restutil.http_post") + @patch("azurelinuxagent.common.utils.restutil.http_put") + def test_put_status_unhealthy_signal_transient(self, patch_http_put, patch_http_post, patch_http_get): + host_plugin = self._init_host() + status_blob = self._init_status_blob() + # get_api_versions + patch_http_get.return_value = MockResponse(api_versions, 200) + # put status blob + patch_http_put.return_value = MockResponse('', 500) + + with self.assertRaises(HttpError): + host_plugin.put_vm_status(status_blob=status_blob, sas_url=sas_url) + + self.assertEqual(1, patch_http_get.call_count) + self.assertEqual(hostplugin_versions_url, patch_http_get.call_args[0][0]) + + self.assertEqual(1, patch_http_put.call_count) + self.assertEqual(hostplugin_status_url, patch_http_put.call_args[0][0]) + + self.assertEqual(2, patch_http_post.call_count) + + # signal for /versions + self.assertEqual(health_service_url, patch_http_post.call_args_list[0][0][0]) + jstr = patch_http_post.call_args_list[0][0][1] + obj = json.loads(jstr) + self.assertEqual(1, len(obj['Observations'])) + self.assertTrue(obj['Observations'][0]['IsHealthy']) + self.assertEqual('GuestAgentPluginVersions', obj['Observations'][0]['ObservationName']) + + # signal for /status + self.assertEqual(health_service_url, patch_http_post.call_args_list[1][0][0]) + jstr = patch_http_post.call_args_list[1][0][1] + obj = json.loads(jstr) + self.assertEqual(1, len(obj['Observations'])) + self.assertTrue(obj['Observations'][0]['IsHealthy']) + self.assertEqual('GuestAgentPluginStatus', obj['Observations'][0]['ObservationName']) + + @patch("azurelinuxagent.common.utils.restutil.http_get") + @patch("azurelinuxagent.common.utils.restutil.http_post") + @patch("azurelinuxagent.common.utils.restutil.http_put") + def test_put_status_unhealthy_signal_permanent(self, patch_http_put, patch_http_post, patch_http_get): + host_plugin = self._init_host() + status_blob = self._init_status_blob() + # get_api_versions + patch_http_get.return_value = MockResponse(api_versions, 200) + # put status blob + patch_http_put.return_value = MockResponse('', 500) + + host_plugin.status_error_state.is_triggered = Mock(return_value=True) + + with self.assertRaises(HttpError): + host_plugin.put_vm_status(status_blob=status_blob, sas_url=sas_url) + + self.assertEqual(1, patch_http_get.call_count) + self.assertEqual(hostplugin_versions_url, patch_http_get.call_args[0][0]) + + self.assertEqual(1, patch_http_put.call_count) + self.assertEqual(hostplugin_status_url, patch_http_put.call_args[0][0]) + + self.assertEqual(2, patch_http_post.call_count) + + # signal for /versions + self.assertEqual(health_service_url, patch_http_post.call_args_list[0][0][0]) + jstr = patch_http_post.call_args_list[0][0][1] + obj = json.loads(jstr) + self.assertEqual(1, len(obj['Observations'])) + self.assertTrue(obj['Observations'][0]['IsHealthy']) + self.assertEqual('GuestAgentPluginVersions', obj['Observations'][0]['ObservationName']) + + # signal for /status + self.assertEqual(health_service_url, patch_http_post.call_args_list[1][0][0]) + jstr = patch_http_post.call_args_list[1][0][1] + obj = json.loads(jstr) + self.assertEqual(1, len(obj['Observations'])) + self.assertFalse(obj['Observations'][0]['IsHealthy']) + self.assertEqual('GuestAgentPluginStatus', obj['Observations'][0]['ObservationName']) + class MockResponse: def __init__(self, body, status_code): From 0cee3f12d67c60afe9c96b08ac2033c7ce4bd203 Mon Sep 17 00:00:00 2001 From: Hans Krijger Date: Fri, 1 Jun 2018 16:35:37 -0700 Subject: [PATCH 41/61] merge fix --- tests/protocol/test_hostplugin.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/tests/protocol/test_hostplugin.py b/tests/protocol/test_hostplugin.py index fb63ff4848..7cfd62baab 100644 --- a/tests/protocol/test_hostplugin.py +++ b/tests/protocol/test_hostplugin.py @@ -22,8 +22,9 @@ import azurelinuxagent.common.protocol.restapi as restapi import azurelinuxagent.common.protocol.wire as wire import azurelinuxagent.common.protocol.hostplugin as hostplugin -from azurelinuxagent.common.exception import HttpError +from azurelinuxagent.common import event +from azurelinuxagent.common.exception import HttpError from azurelinuxagent.common.protocol.hostplugin import API_VERSION from azurelinuxagent.common.utils import restutil from tests.protocol.mockwiredata import WireProtocolData, DATA_FILE From aa19959f04043c3efe640f2c6a8c8c21c56e4a83 Mon Sep 17 00:00:00 2001 From: Hans Krijger Date: Fri, 1 Jun 2018 16:48:28 -0700 Subject: [PATCH 42/61] fix event import --- tests/protocol/test_hostplugin.py | 38 +++++++++++++++---------------- 1 file changed, 18 insertions(+), 20 deletions(-) diff --git a/tests/protocol/test_hostplugin.py b/tests/protocol/test_hostplugin.py index 7cfd62baab..be21f1cf34 100644 --- a/tests/protocol/test_hostplugin.py +++ b/tests/protocol/test_hostplugin.py @@ -23,7 +23,6 @@ import azurelinuxagent.common.protocol.wire as wire import azurelinuxagent.common.protocol.hostplugin as hostplugin -from azurelinuxagent.common import event from azurelinuxagent.common.exception import HttpError from azurelinuxagent.common.protocol.hostplugin import API_VERSION from azurelinuxagent.common.utils import restutil @@ -185,7 +184,8 @@ def test_fallback_failure(self): self.assertEqual(1, patch_upload.call_count) self.assertFalse(wire.HostPluginProtocol.is_default_channel()) - def test_put_status_error_reporting(self): + @patch("azurelinuxagent.common.event.add_event") + def test_put_status_error_reporting(self, patch_add_event): """ Validate the telemetry when uploading status fails """ @@ -202,24 +202,22 @@ def test_put_status_error_reporting(self): wire_protocol_client.ext_conf.status_upload_blob = sas_url wire_protocol_client.status_blob.set_vm_status(status) put_error = wire.HttpError("put status http error") - with patch.object(event, - "add_event") as patch_add_event: - with patch.object(restutil, - "http_put", - side_effect=put_error) as patch_http_put: - with patch.object(wire.HostPluginProtocol, - "ensure_initialized", return_value=True): - wire_protocol_client.upload_status_blob() - - # The agent tries to upload via HostPlugin and that fails due to - # http_put having a side effect of "put_error" - # - # The agent tries to upload using a direct connection, and that succeeds. - self.assertEqual(1, wire_protocol_client.status_blob.upload.call_count) - # The agent never touches the default protocol is this code path, so no change. - self.assertFalse(wire.HostPluginProtocol.is_default_channel()) - # The agent never logs a telemetry event for a bad HTTP call - self.assertEqual(patch_add_event.call_count, 0) + with patch.object(restutil, + "http_put", + side_effect=put_error) as patch_http_put: + with patch.object(wire.HostPluginProtocol, + "ensure_initialized", return_value=True): + wire_protocol_client.upload_status_blob() + + # The agent tries to upload via HostPlugin and that fails due to + # http_put having a side effect of "put_error" + # + # The agent tries to upload using a direct connection, and that succeeds. + self.assertEqual(1, wire_protocol_client.status_blob.upload.call_count) + # The agent never touches the default protocol is this code path, so no change. + self.assertFalse(wire.HostPluginProtocol.is_default_channel()) + # The agent never logs a telemetry event for a bad HTTP call + self.assertEqual(patch_add_event.call_count, 0) def test_validate_http_request(self): """Validate correct set of data is sent to HostGAPlugin when reporting VM status""" From a401c003ac0d0d4b009b03981090fb9022dd6ada Mon Sep 17 00:00:00 2001 From: Hans Krijger Date: Fri, 1 Jun 2018 17:07:19 -0700 Subject: [PATCH 43/61] unit test updates --- tests/protocol/test_hostplugin.py | 17 +++++++++++------ 1 file changed, 11 insertions(+), 6 deletions(-) diff --git a/tests/protocol/test_hostplugin.py b/tests/protocol/test_hostplugin.py index be21f1cf34..e454a88c72 100644 --- a/tests/protocol/test_hostplugin.py +++ b/tests/protocol/test_hostplugin.py @@ -431,7 +431,7 @@ def test_put_status_healthy_signal(self, patch_http_put, patch_http_post, patch_ # get_api_versions patch_http_get.return_value = MockResponse(api_versions, 200) # put status blob - patch_http_put.return_value = MockResponse('', 201) + patch_http_put.return_value = MockResponse(None, 201) host_plugin.put_vm_status(status_blob=status_blob, sas_url=sas_url) self.assertEqual(1, patch_http_get.call_count) @@ -468,8 +468,10 @@ def test_put_status_unhealthy_signal_transient(self, patch_http_put, patch_http_ # get_api_versions patch_http_get.return_value = MockResponse(api_versions, 200) # put status blob - patch_http_put.return_value = MockResponse('', 500) + patch_http_put.return_value = MockResponse(None, 500) + if sys.version_info < (2, 7): + self.assertRaises(HttpError, host_plugin.put_vm_status, status_blob, sas_url) with self.assertRaises(HttpError): host_plugin.put_vm_status(status_blob=status_blob, sas_url=sas_url) @@ -506,12 +508,15 @@ def test_put_status_unhealthy_signal_permanent(self, patch_http_put, patch_http_ # get_api_versions patch_http_get.return_value = MockResponse(api_versions, 200) # put status blob - patch_http_put.return_value = MockResponse('', 500) + patch_http_put.return_value = MockResponse(None, 500) host_plugin.status_error_state.is_triggered = Mock(return_value=True) - with self.assertRaises(HttpError): - host_plugin.put_vm_status(status_blob=status_blob, sas_url=sas_url) + if sys.version_info < (2, 7): + self.assertRaises(HttpError, host_plugin.put_vm_status, status_blob, sas_url) + else: + with self.assertRaises(HttpError): + host_plugin.put_vm_status(status_blob=status_blob, sas_url=sas_url) self.assertEqual(1, patch_http_get.call_count) self.assertEqual(hostplugin_versions_url, patch_http_get.call_args[0][0]) @@ -544,7 +549,7 @@ def __init__(self, body, status_code): self.status = status_code def read(self): - return self.body + return self.body if sys.version_info[0] == 2 else bytes(self.body, encoding='utf-8') if __name__ == '__main__': From 8e1e30a3b10d4206a6bead63ca1cb47eb5d8ded0 Mon Sep 17 00:00:00 2001 From: Hans Krijger Date: Fri, 1 Jun 2018 17:13:35 -0700 Subject: [PATCH 44/61] unit test fix --- tests/protocol/test_hostplugin.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/tests/protocol/test_hostplugin.py b/tests/protocol/test_hostplugin.py index e454a88c72..2f56559dab 100644 --- a/tests/protocol/test_hostplugin.py +++ b/tests/protocol/test_hostplugin.py @@ -472,8 +472,9 @@ def test_put_status_unhealthy_signal_transient(self, patch_http_put, patch_http_ if sys.version_info < (2, 7): self.assertRaises(HttpError, host_plugin.put_vm_status, status_blob, sas_url) - with self.assertRaises(HttpError): - host_plugin.put_vm_status(status_blob=status_blob, sas_url=sas_url) + else: + with self.assertRaises(HttpError): + host_plugin.put_vm_status(status_blob=status_blob, sas_url=sas_url) self.assertEqual(1, patch_http_get.call_count) self.assertEqual(hostplugin_versions_url, patch_http_get.call_args[0][0]) From 3d952b47474b98f1034aef76f99db7abcdd2e603 Mon Sep 17 00:00:00 2001 From: Hans Krijger Date: Mon, 4 Jun 2018 14:40:08 -0700 Subject: [PATCH 45/61] cleanup the monitor thread; heartbeat tests --- azurelinuxagent/common/errorstate.py | 1 + azurelinuxagent/common/protocol/util.py | 1 + azurelinuxagent/ga/monitor.py | 255 +++++++++++------------- tests/ga/test_monitor.py | 93 ++++++++- tests/protocol/test_wire.py | 7 +- 5 files changed, 217 insertions(+), 140 deletions(-) diff --git a/azurelinuxagent/common/errorstate.py b/azurelinuxagent/common/errorstate.py index 902d831ae6..38aaa1f916 100644 --- a/azurelinuxagent/common/errorstate.py +++ b/azurelinuxagent/common/errorstate.py @@ -4,6 +4,7 @@ ERROR_STATE_DELTA_INSTALL = timedelta(minutes=5) ERROR_STATE_HOST_PLUGIN_FAILURE = timedelta(minutes=5) + class ErrorState(object): def __init__(self, min_timedelta=ERROR_STATE_DELTA_DEFAULT): self.min_timedelta = min_timedelta diff --git a/azurelinuxagent/common/protocol/util.py b/azurelinuxagent/common/protocol/util.py index f6383cc6f6..a3e3176fad 100644 --- a/azurelinuxagent/common/protocol/util.py +++ b/azurelinuxagent/common/protocol/util.py @@ -54,6 +54,7 @@ def get_protocol_util(): class ProtocolUtil(object): + """ ProtocolUtil handles initialization for protocol instance. 2 protocol types are invoked, wire protocol and metadata protocols. diff --git a/azurelinuxagent/ga/monitor.py b/azurelinuxagent/ga/monitor.py index db121c5bb6..599bc9bb40 100644 --- a/azurelinuxagent/ga/monitor.py +++ b/azurelinuxagent/ga/monitor.py @@ -101,10 +101,19 @@ class MonitorHandler(object): def __init__(self): self.osutil = get_osutil() - self.protocol_util = get_protocol_util() + self.protocol = get_protocol_util().get_protocol() self.imds_client = get_imds_client() self.sysinfo = [] self.event_thread = None + self.last_event_collection = None + self.last_telemetry_heartbeat = None + self.last_host_plugin_heartbeat = None + self.counter = 0 + self.heartbeat_id = str(uuid.uuid4()).upper() + self.host_plugin_errorstate = None + self.health_service = None + self.health_service = HealthService(self.protocol.endpoint) + self.host_plugin_errorstate = ErrorState(min_timedelta=MonitorHandler.HOST_PLUGIN_HEALTH_PERIOD) def run(self): self.init_sysinfo() @@ -137,8 +146,7 @@ def init_sysinfo(self): logger.warn("Failed to get system info: {0}", e) try: - protocol = self.protocol_util.get_protocol() - vminfo = protocol.get_vminfo() + vminfo = self.protocol.get_vminfo() self.sysinfo.append(TelemetryEventParam("VMName", vminfo.vmName)) self.sysinfo.append(TelemetryEventParam("TenantName", @@ -180,72 +188,54 @@ def collect_event(self, evt_file_name): msg = "Failed to process {0}, {1}".format(evt_file_name, e) raise EventError(msg) - def collect_and_send_events(self, protocol, last_event_collection): - if last_event_collection is None: - last_event_collection = datetime.datetime.utcnow() - MonitorHandler.EVENT_COLLECTION_PERIOD + def collect_and_send_events(self): + if self.last_event_collection is None: + self.last_event_collection = datetime.datetime.utcnow() - MonitorHandler.EVENT_COLLECTION_PERIOD - if datetime.datetime.utcnow() < (last_event_collection + MonitorHandler.EVENT_COLLECTION_PERIOD): - return last_event_collection + if datetime.datetime.utcnow() >= (self.last_event_collection + MonitorHandler.EVENT_COLLECTION_PERIOD): + try: + event_list = TelemetryEventList() + event_dir = os.path.join(conf.get_lib_dir(), "events") + event_files = os.listdir(event_dir) + for event_file in event_files: + if not event_file.endswith(".tld"): + continue + event_file_path = os.path.join(event_dir, event_file) + try: + data_str = self.collect_event(event_file_path) + except EventError as e: + logger.error("{0}", e) + continue + + try: + event = parse_event(data_str) + self.add_sysinfo(event) + event_list.events.append(event) + except (ValueError, ProtocolError) as e: + logger.warn("Failed to decode event file: {0}", e) + continue + + if len(event_list.events) == 0: + return - try: - event_list = TelemetryEventList() - event_dir = os.path.join(conf.get_lib_dir(), "events") - event_files = os.listdir(event_dir) - for event_file in event_files: - if not event_file.endswith(".tld"): - continue - event_file_path = os.path.join(event_dir, event_file) try: - data_str = self.collect_event(event_file_path) - except EventError as e: + self.protocol.report_event(event_list) + except ProtocolError as e: logger.error("{0}", e) - continue - - try: - event = parse_event(data_str) - self.add_sysinfo(event) - event_list.events.append(event) - except (ValueError, ProtocolError) as e: - logger.warn("Failed to decode event file: {0}", e) - continue - - if len(event_list.events) == 0: - return + except Exception as e: + logger.warn("Failed to send events: {0}", e) - try: - protocol.report_event(event_list) - except ProtocolError as e: - logger.error("{0}", e) - except Exception as e: - logger.warn("Failed to send events: {0}", e) - - return datetime.datetime.utcnow() + self.last_event_collection = datetime.datetime.utcnow() def daemon(self): - - # Create a new identifier on each restart, reset the counter and all events - counter = 0 - last_event_collection = None - last_telemetry_heartbeat = None - last_host_plugin_heartbeat = None - heartbeat_id = str(uuid.uuid4()).upper() - protocol = self.protocol_util.get_protocol() - host_plugin_errorstate = ErrorState(min_timedelta=MonitorHandler.HOST_PLUGIN_HEALTH_PERIOD) - health_service = HealthService(protocol.endpoint) - + min_delta = min(MonitorHandler.TELEMETRY_HEARTBEAT_PERIOD, + MonitorHandler.EVENT_COLLECTION_PERIOD, + MonitorHandler.HOST_PLUGIN_HEARTBEAT_PERIOD).seconds while True: - last_telemetry_heartbeat = self.send_telemetry_heartbeat(protocol, - counter, - heartbeat_id, - last_telemetry_heartbeat) - last_event_collection = self.collect_and_send_events(protocol, - last_event_collection) - last_host_plugin_heartbeat = self.send_host_plugin_heartbeat(protocol, - last_host_plugin_heartbeat, - host_plugin_errorstate, - health_service) - # currently the smallest delta is 1 minute - time.sleep(60) + self.send_telemetry_heartbeat() + self.collect_and_send_events() + self.send_host_plugin_heartbeat() + time.sleep(min_delta) def add_sysinfo(self, event): sysinfo_names = [v.name for v in self.sysinfo] @@ -257,83 +247,80 @@ def add_sysinfo(self, event): event.parameters.remove(param) event.parameters.extend(self.sysinfo) - def send_host_plugin_heartbeat(self, protocol, last_host_plugin_heartbeat, host_plugin_errorstate, health_service): - + def send_host_plugin_heartbeat(self): """ Send a health signal every HOST_PLUGIN_HEARTBEAT_PERIOD. The signal is 'Healthy' when we have been able to communicate with HostGAPlugin at least once in the last HOST_PLUGIN_HEALTH_PERIOD. """ + if self.last_host_plugin_heartbeat is None: + self.last_host_plugin_heartbeat = datetime.datetime.utcnow() - MonitorHandler.HOST_PLUGIN_HEARTBEAT_PERIOD - if last_host_plugin_heartbeat is None: - last_host_plugin_heartbeat = datetime.datetime.utcnow() - MonitorHandler.HOST_PLUGIN_HEARTBEAT_PERIOD + if datetime.datetime.utcnow() >= (self.last_host_plugin_heartbeat + MonitorHandler.HOST_PLUGIN_HEARTBEAT_PERIOD): + try: + host_plugin = self.protocol.client.get_host_plugin() + host_plugin.ensure_initialized() + is_currently_healthy = host_plugin.get_health() - if datetime.datetime.utcnow() < (last_host_plugin_heartbeat + MonitorHandler.HOST_PLUGIN_HEARTBEAT_PERIOD): - return last_host_plugin_heartbeat + if is_currently_healthy: + self.host_plugin_errorstate.reset() + else: + self.host_plugin_errorstate.incr() - try: - host_plugin = protocol.client.get_host_plugin() - host_plugin.ensure_initialized() - is_currently_healthy = host_plugin.get_health() - - if is_currently_healthy: - host_plugin_errorstate.reset() - else: - host_plugin_errorstate.incr() - - is_healthy = host_plugin_errorstate.is_triggered() is False - logger.verbose("HostGAPlugin health: {0}", is_healthy) - - health_service.report_host_plugin_heartbeat(is_healthy) - - except Exception as e: - msg = "Exception sending host plugin heartbeat: {0}".format(ustr(e)) - add_event( - name=AGENT_NAME, - version=CURRENT_VERSION, - op=WALAEventOperation.HostPluginHeartbeat, - is_success=False, - message=msg, - log_event=False) - - return datetime.datetime.utcnow() - - def send_telemetry_heartbeat(self, protocol, counter, heartbeat_id, last_telemetry_heartbeat): - - if last_telemetry_heartbeat is None: - last_telemetry_heartbeat = datetime.datetime.utcnow() - MonitorHandler.TELEMETRY_HEARTBEAT_PERIOD - - if datetime.datetime.utcnow() < (last_telemetry_heartbeat + MonitorHandler.TELEMETRY_HEARTBEAT_PERIOD): - return last_telemetry_heartbeat - - incarnation = protocol.get_incarnation() - dropped_packets = self.osutil.get_firewall_dropped_packets(protocol.endpoint) - msg = "{0};{1};{2};{3}".format(incarnation, counter, heartbeat_id, dropped_packets) - - add_event( - name=AGENT_NAME, - version=CURRENT_VERSION, - op=WALAEventOperation.HeartBeat, - is_success=True, - message=msg, - log_event=False) - - counter += 1 - - io_errors = IOErrorCounter.get_and_reset() - hostplugin_errors = io_errors.get("hostplugin") - protocol_errors = io_errors.get("protocol") - other_errors = io_errors.get("other") - - if hostplugin_errors > 0 or protocol_errors > 0 or other_errors > 0: - msg = "hostplugin:{0};protocol:{1};other:{2}".format(hostplugin_errors, - protocol_errors, - other_errors) - add_event( - name=AGENT_NAME, - version=CURRENT_VERSION, - op=WALAEventOperation.HttpErrors, - is_success=True, - message=msg, - log_event=False) - - return datetime.datetime.utcnow() + is_healthy = self.host_plugin_errorstate.is_triggered() is False + logger.verbose("HostGAPlugin health: {0}", is_healthy) + + self.health_service.report_host_plugin_heartbeat(is_healthy) + + except Exception as e: + msg = "Exception sending host plugin heartbeat: {0}".format(ustr(e)) + add_event( + name=AGENT_NAME, + version=CURRENT_VERSION, + op=WALAEventOperation.HostPluginHeartbeat, + is_success=False, + message=msg, + log_event=False) + + self.last_host_plugin_heartbeat = datetime.datetime.utcnow() + + def send_telemetry_heartbeat(self): + + if self.last_telemetry_heartbeat is None: + self.last_telemetry_heartbeat = datetime.datetime.utcnow() - MonitorHandler.TELEMETRY_HEARTBEAT_PERIOD + + if datetime.datetime.utcnow() >= (self.last_telemetry_heartbeat + MonitorHandler.TELEMETRY_HEARTBEAT_PERIOD): + try: + incarnation = self.protocol.get_incarnation() + dropped_packets = self.osutil.get_firewall_dropped_packets(self.protocol.endpoint) + msg = "{0};{1};{2};{3}".format(incarnation, self.counter, self.heartbeat_id, dropped_packets) + + add_event( + name=AGENT_NAME, + version=CURRENT_VERSION, + op=WALAEventOperation.HeartBeat, + is_success=True, + message=msg, + log_event=False) + + self.counter += 1 + + io_errors = IOErrorCounter.get_and_reset() + hostplugin_errors = io_errors.get("hostplugin") + protocol_errors = io_errors.get("protocol") + other_errors = io_errors.get("other") + + if hostplugin_errors > 0 or protocol_errors > 0 or other_errors > 0: + msg = "hostplugin:{0};protocol:{1};other:{2}".format(hostplugin_errors, + protocol_errors, + other_errors) + add_event( + name=AGENT_NAME, + version=CURRENT_VERSION, + op=WALAEventOperation.HttpErrors, + is_success=True, + message=msg, + log_event=False) + except Exception as e: + logger.warn("Failed to send heartbeat: {0}", e) + + self.last_telemetry_heartbeat = datetime.datetime.utcnow() diff --git a/tests/ga/test_monitor.py b/tests/ga/test_monitor.py index 59d066db03..74623c0c2a 100644 --- a/tests/ga/test_monitor.py +++ b/tests/ga/test_monitor.py @@ -14,22 +14,26 @@ # # Requires Python 2.6+ and Openssl 1.0+ # +from datetime import timedelta +from azurelinuxagent.common.protocol.wire import WireProtocol from tests.tools import * from azurelinuxagent.ga.monitor import * +@patch('azurelinuxagent.common.osutil.get_osutil') +@patch('azurelinuxagent.common.protocol.get_protocol_util') +@patch('azurelinuxagent.common.protocol.util.ProtocolUtil.get_protocol') +@patch("azurelinuxagent.common.protocol.healthservice.HealthService.report") class TestMonitor(AgentTestCase): - def test_parse_xml_event(self): + def test_parse_xml_event(self, *args): data_str = load_data('ext/event.xml') event = parse_xml_event(data_str) self.assertNotEquals(None, event) self.assertNotEquals(0, event.parameters) self.assertNotEquals(None, event.parameters[0]) - @patch('azurelinuxagent.common.osutil.get_osutil') - @patch('azurelinuxagent.common.protocol.get_protocol_util') - def test_add_sysinfo(self, _, __): + def test_add_sysinfo(self, *args): data_str = load_data('ext/event.xml') event = parse_xml_event(data_str) monitor_handler = get_monitor_handler() @@ -76,3 +80,84 @@ def test_add_sysinfo(self, _, __): counter += 1 self.assertEquals(5, counter) + + @patch("azurelinuxagent.ga.monitor.MonitorHandler.send_telemetry_heartbeat") + @patch("azurelinuxagent.ga.monitor.MonitorHandler.collect_and_send_events") + @patch("azurelinuxagent.ga.monitor.MonitorHandler.send_host_plugin_heartbeat") + def test_heartbeats(self, patch_hostplugin_heartbeat, patch_send_events, patch_telemetry_heartbeat, *args): + monitor_handler = get_monitor_handler() + + self.assertEqual(0, patch_hostplugin_heartbeat.call_count) + self.assertEqual(0, patch_send_events.call_count) + self.assertEqual(0, patch_telemetry_heartbeat.call_count) + monitor_handler.start() + time.sleep(1) + self.assertTrue(monitor_handler.is_alive()) + self.assertNotEqual(0, patch_hostplugin_heartbeat.call_count) + self.assertNotEqual(0, patch_send_events.call_count) + self.assertNotEqual(0, patch_telemetry_heartbeat.call_count) + + def test_heartbeat_timings_updates_after_window(self, *args): + monitor_handler = get_monitor_handler() + + MonitorHandler.TELEMETRY_HEARTBEAT_PERIOD = timedelta(seconds=1) + MonitorHandler.EVENT_COLLECTION_PERIOD = timedelta(seconds=1) + MonitorHandler.HOST_PLUGIN_HEARTBEAT_PERIOD = timedelta(seconds=1) + + self.assertEqual(None, monitor_handler.last_host_plugin_heartbeat) + self.assertEqual(None, monitor_handler.last_event_collection) + self.assertEqual(None, monitor_handler.last_telemetry_heartbeat) + + monitor_handler.start() + time.sleep(1) + self.assertTrue(monitor_handler.is_alive()) + + self.assertNotEqual(None, monitor_handler.last_host_plugin_heartbeat) + self.assertNotEqual(None, monitor_handler.last_event_collection) + self.assertNotEqual(None, monitor_handler.last_telemetry_heartbeat) + + heartbeat_hostplugin = monitor_handler.last_host_plugin_heartbeat + heartbeat_telemetry = monitor_handler.last_telemetry_heartbeat + events_collection = monitor_handler.last_event_collection + + time.sleep(2) + + self.assertNotEqual(heartbeat_hostplugin, monitor_handler.last_host_plugin_heartbeat) + self.assertNotEqual(events_collection, monitor_handler.last_event_collection) + self.assertNotEqual(heartbeat_telemetry, monitor_handler.last_telemetry_heartbeat) + + def test_heartbeat_timings_no_updates_within_window(self, *args): + monitor_handler = get_monitor_handler() + + MonitorHandler.TELEMETRY_HEARTBEAT_PERIOD = timedelta(seconds=100) + MonitorHandler.EVENT_COLLECTION_PERIOD = timedelta(seconds=100) + MonitorHandler.HOST_PLUGIN_HEARTBEAT_PERIOD = timedelta(seconds=100) + + self.assertEqual(None, monitor_handler.last_host_plugin_heartbeat) + self.assertEqual(None, monitor_handler.last_event_collection) + self.assertEqual(None, monitor_handler.last_telemetry_heartbeat) + + monitor_handler.start() + time.sleep(1) + self.assertTrue(monitor_handler.is_alive()) + + self.assertNotEqual(None, monitor_handler.last_host_plugin_heartbeat) + self.assertNotEqual(None, monitor_handler.last_event_collection) + self.assertNotEqual(None, monitor_handler.last_telemetry_heartbeat) + + heartbeat_hostplugin = monitor_handler.last_host_plugin_heartbeat + heartbeat_telemetry = monitor_handler.last_telemetry_heartbeat + events_collection = monitor_handler.last_event_collection + + time.sleep(2) + + self.assertEqual(heartbeat_hostplugin, monitor_handler.last_host_plugin_heartbeat) + self.assertEqual(events_collection, monitor_handler.last_event_collection) + self.assertEqual(heartbeat_telemetry, monitor_handler.last_telemetry_heartbeat) + + @patch("azurelinuxagent.common.protocol.healthservice.HealthService.report_host_plugin_heartbeat") + def test_heartbeat_creates_signal(self, patch_report_heartbeat, *args): + monitor_handler = get_monitor_handler() + monitor_handler.last_host_plugin_heartbeat = datetime.datetime.utcnow() - timedelta(hours=1) + monitor_handler.send_host_plugin_heartbeat() + self.assertEqual(1, patch_report_heartbeat.call_count) diff --git a/tests/protocol/test_wire.py b/tests/protocol/test_wire.py index 1c7f73fb8d..412a850918 100644 --- a/tests/protocol/test_wire.py +++ b/tests/protocol/test_wire.py @@ -80,7 +80,8 @@ def test_getters_ext_no_public(self, *args): test_data = WireProtocolData(DATA_FILE_EXT_NO_PUBLIC) self._test_getters(test_data, *args) - def test_getters_with_stale_goal_state(self, *args): + @patch("azurelinuxagent.common.protocol.healthservice.HealthService.report_host_plugin_extension_artifact") + def test_getters_with_stale_goal_state(self, patch_report, *args): test_data = WireProtocolData(DATA_FILE) test_data.emulate_stale_goal_state = True @@ -93,6 +94,7 @@ def test_getters_with_stale_goal_state(self, *args): # fetched often; however, the dependent documents, such as the # HostingEnvironmentConfig, will be retrieved the expected number self.assertEqual(2, test_data.call_counts["hostingenvuri"]) + self.assertEqual(1, patch_report.call_count) def test_call_storage_kwargs(self, *args): from azurelinuxagent.common.utils import restutil @@ -219,7 +221,8 @@ def test_upload_status_blob_host_ga_plugin(self, *args): patch_http.assert_called_once_with(testurl, wire_protocol_client.status_blob) self.assertFalse(HostPluginProtocol.is_default_channel()) - def test_upload_status_blob_unknown_type_assumes_block(self, *args): + @patch("azurelinuxagent.common.protocol.hostplugin.HostPluginProtocol.ensure_initialized") + def test_upload_status_blob_unknown_type_assumes_block(self, _, *args): vmstatus = VMStatus(message="Ready", status="Ready") wire_protocol_client = WireProtocol(wireserver_url).client wire_protocol_client.ext_conf = ExtensionsConfig(None) From f9e16d01a462d2e075d0337112b0d23d23eb7acd Mon Sep 17 00:00:00 2001 From: Hans Krijger Date: Mon, 4 Jun 2018 14:47:47 -0700 Subject: [PATCH 46/61] update naming for PR feedback --- azurelinuxagent/common/utils/restutil.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/azurelinuxagent/common/utils/restutil.py b/azurelinuxagent/common/utils/restutil.py index 54f1195c28..b64c546f61 100644 --- a/azurelinuxagent/common/utils/restutil.py +++ b/azurelinuxagent/common/utils/restutil.py @@ -61,7 +61,7 @@ httpclient.ACCEPTED ] -UPSTREAM_FAILURE_CODES = [ +HOSTPLUGIN_UPSTREAM_FAILURE_CODES = [ 502 ] @@ -397,7 +397,7 @@ def request_succeeded(resp, ok_codes=OK_CODES): return resp is not None and resp.status in ok_codes -def request_failed_at_hostplugin(resp, upstream_failure_codes=UPSTREAM_FAILURE_CODES): +def request_failed_at_hostplugin(resp, upstream_failure_codes=HOSTPLUGIN_UPSTREAM_FAILURE_CODES): """ Host plugin will return 502 for any upstream issue, so a failure is any 5xx except 502 """ From 493f347358808eeae3e18dc5971db8cdeb713e68 Mon Sep 17 00:00:00 2001 From: Hans Krijger Date: Mon, 4 Jun 2018 15:03:00 -0700 Subject: [PATCH 47/61] fix import test --- tests/test_import.py | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/tests/test_import.py b/tests/test_import.py index 39a48abd75..05a4f4cf0f 100644 --- a/tests/test_import.py +++ b/tests/test_import.py @@ -11,8 +11,12 @@ import azurelinuxagent.ga.monitor as monitor import azurelinuxagent.ga.update as update + +@patch('azurelinuxagent.common.osutil.get_osutil') +@patch('azurelinuxagent.common.protocol.get_protocol_util') +@patch('azurelinuxagent.common.protocol.util.ProtocolUtil.get_protocol') class TestImportHandler(AgentTestCase): - def test_get_handler(self): + def test_get_handler(self, *args): osutil.get_osutil() protocol.get_protocol_util() dhcp.get_dhcp_handler() From e76f5789e6a6f6f5e2a41f13b26d7e02d295a831 Mon Sep 17 00:00:00 2001 From: Hans Krijger Date: Mon, 4 Jun 2018 15:44:27 -0700 Subject: [PATCH 48/61] mock events for monitor tests --- tests/ga/test_monitor.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/ga/test_monitor.py b/tests/ga/test_monitor.py index 74623c0c2a..56ec9eda9a 100644 --- a/tests/ga/test_monitor.py +++ b/tests/ga/test_monitor.py @@ -16,11 +16,11 @@ # from datetime import timedelta -from azurelinuxagent.common.protocol.wire import WireProtocol from tests.tools import * from azurelinuxagent.ga.monitor import * +@patch('azurelinuxagent.common.event.EventLogger.add_event') @patch('azurelinuxagent.common.osutil.get_osutil') @patch('azurelinuxagent.common.protocol.get_protocol_util') @patch('azurelinuxagent.common.protocol.util.ProtocolUtil.get_protocol') From 98dcf0ee54bce6420377403fae0aed85b3632ff3 Mon Sep 17 00:00:00 2001 From: Hans Krijger Date: Mon, 4 Jun 2018 16:06:35 -0700 Subject: [PATCH 49/61] adjust handler model --- azurelinuxagent/ga/monitor.py | 16 +++++++++++----- tests/ga/test_monitor.py | 1 + tests/test_import.py | 5 +---- 3 files changed, 13 insertions(+), 9 deletions(-) diff --git a/azurelinuxagent/ga/monitor.py b/azurelinuxagent/ga/monitor.py index 599bc9bb40..95c47a5d03 100644 --- a/azurelinuxagent/ga/monitor.py +++ b/azurelinuxagent/ga/monitor.py @@ -101,24 +101,30 @@ class MonitorHandler(object): def __init__(self): self.osutil = get_osutil() - self.protocol = get_protocol_util().get_protocol() + self.protocol_util = get_protocol_util() self.imds_client = get_imds_client() - self.sysinfo = [] + self.event_thread = None self.last_event_collection = None self.last_telemetry_heartbeat = None self.last_host_plugin_heartbeat = None + self.protocol = None + self.health_service = None + self.counter = 0 + self.sysinfo = [] self.heartbeat_id = str(uuid.uuid4()).upper() - self.host_plugin_errorstate = None - self.health_service = None - self.health_service = HealthService(self.protocol.endpoint) self.host_plugin_errorstate = ErrorState(min_timedelta=MonitorHandler.HOST_PLUGIN_HEALTH_PERIOD) def run(self): self.init_sysinfo() + self.init_protocols() self.start() + def init_protocols(self): + self.protocol = self.protocol_util.get_protocol() + self.health_service = HealthService(self.protocol.endpoint) + def is_alive(self): return self.event_thread.is_alive() diff --git a/tests/ga/test_monitor.py b/tests/ga/test_monitor.py index 56ec9eda9a..18c2035458 100644 --- a/tests/ga/test_monitor.py +++ b/tests/ga/test_monitor.py @@ -158,6 +158,7 @@ def test_heartbeat_timings_no_updates_within_window(self, *args): @patch("azurelinuxagent.common.protocol.healthservice.HealthService.report_host_plugin_heartbeat") def test_heartbeat_creates_signal(self, patch_report_heartbeat, *args): monitor_handler = get_monitor_handler() + monitor_handler.init_protocols() monitor_handler.last_host_plugin_heartbeat = datetime.datetime.utcnow() - timedelta(hours=1) monitor_handler.send_host_plugin_heartbeat() self.assertEqual(1, patch_report_heartbeat.call_count) diff --git a/tests/test_import.py b/tests/test_import.py index 05a4f4cf0f..c5fd31c062 100644 --- a/tests/test_import.py +++ b/tests/test_import.py @@ -12,11 +12,8 @@ import azurelinuxagent.ga.update as update -@patch('azurelinuxagent.common.osutil.get_osutil') -@patch('azurelinuxagent.common.protocol.get_protocol_util') -@patch('azurelinuxagent.common.protocol.util.ProtocolUtil.get_protocol') class TestImportHandler(AgentTestCase): - def test_get_handler(self, *args): + def test_get_handler(self): osutil.get_osutil() protocol.get_protocol_util() dhcp.get_dhcp_handler() From a6087726c6916a1ca5100ec9e0404fba494db5b2 Mon Sep 17 00:00:00 2001 From: Hans Krijger Date: Mon, 4 Jun 2018 16:19:29 -0700 Subject: [PATCH 50/61] monitor thread clean shutdown for unit tests --- azurelinuxagent/ga/monitor.py | 3 ++- tests/ga/test_monitor.py | 8 ++++++++ 2 files changed, 10 insertions(+), 1 deletion(-) diff --git a/azurelinuxagent/ga/monitor.py b/azurelinuxagent/ga/monitor.py index 95c47a5d03..412beb8a8f 100644 --- a/azurelinuxagent/ga/monitor.py +++ b/azurelinuxagent/ga/monitor.py @@ -113,6 +113,7 @@ def __init__(self): self.counter = 0 self.sysinfo = [] + self.should_run = True self.heartbeat_id = str(uuid.uuid4()).upper() self.host_plugin_errorstate = ErrorState(min_timedelta=MonitorHandler.HOST_PLUGIN_HEALTH_PERIOD) @@ -237,7 +238,7 @@ def daemon(self): min_delta = min(MonitorHandler.TELEMETRY_HEARTBEAT_PERIOD, MonitorHandler.EVENT_COLLECTION_PERIOD, MonitorHandler.HOST_PLUGIN_HEARTBEAT_PERIOD).seconds - while True: + while self.should_run: self.send_telemetry_heartbeat() self.collect_and_send_events() self.send_host_plugin_heartbeat() diff --git a/tests/ga/test_monitor.py b/tests/ga/test_monitor.py index 18c2035458..5d6ebd1fd8 100644 --- a/tests/ga/test_monitor.py +++ b/tests/ga/test_monitor.py @@ -97,6 +97,8 @@ def test_heartbeats(self, patch_hostplugin_heartbeat, patch_send_events, patch_t self.assertNotEqual(0, patch_send_events.call_count) self.assertNotEqual(0, patch_telemetry_heartbeat.call_count) + monitor_handler.should_run = False + def test_heartbeat_timings_updates_after_window(self, *args): monitor_handler = get_monitor_handler() @@ -126,6 +128,8 @@ def test_heartbeat_timings_updates_after_window(self, *args): self.assertNotEqual(events_collection, monitor_handler.last_event_collection) self.assertNotEqual(heartbeat_telemetry, monitor_handler.last_telemetry_heartbeat) + monitor_handler.should_run = False + def test_heartbeat_timings_no_updates_within_window(self, *args): monitor_handler = get_monitor_handler() @@ -155,6 +159,8 @@ def test_heartbeat_timings_no_updates_within_window(self, *args): self.assertEqual(events_collection, monitor_handler.last_event_collection) self.assertEqual(heartbeat_telemetry, monitor_handler.last_telemetry_heartbeat) + monitor_handler.should_run = False + @patch("azurelinuxagent.common.protocol.healthservice.HealthService.report_host_plugin_heartbeat") def test_heartbeat_creates_signal(self, patch_report_heartbeat, *args): monitor_handler = get_monitor_handler() @@ -162,3 +168,5 @@ def test_heartbeat_creates_signal(self, patch_report_heartbeat, *args): monitor_handler.last_host_plugin_heartbeat = datetime.datetime.utcnow() - timedelta(hours=1) monitor_handler.send_host_plugin_heartbeat() self.assertEqual(1, patch_report_heartbeat.call_count) + monitor_handler.should_run = False + From 3dbaff7d28f7efbe9008e819fb2b8bf7b299e71c Mon Sep 17 00:00:00 2001 From: Hans Krijger Date: Mon, 4 Jun 2018 16:22:58 -0700 Subject: [PATCH 51/61] switch to thread join --- azurelinuxagent/ga/monitor.py | 5 +++++ tests/ga/test_monitor.py | 11 ++++++----- 2 files changed, 11 insertions(+), 5 deletions(-) diff --git a/azurelinuxagent/ga/monitor.py b/azurelinuxagent/ga/monitor.py index 412beb8a8f..48bad71c31 100644 --- a/azurelinuxagent/ga/monitor.py +++ b/azurelinuxagent/ga/monitor.py @@ -122,6 +122,11 @@ def run(self): self.init_protocols() self.start() + def stop(self): + self.should_run = False + if self.is_alive(): + self.event_thread.join() + def init_protocols(self): self.protocol = self.protocol_util.get_protocol() self.health_service = HealthService(self.protocol.endpoint) diff --git a/tests/ga/test_monitor.py b/tests/ga/test_monitor.py index 5d6ebd1fd8..d1c8a3e2de 100644 --- a/tests/ga/test_monitor.py +++ b/tests/ga/test_monitor.py @@ -90,14 +90,16 @@ def test_heartbeats(self, patch_hostplugin_heartbeat, patch_send_events, patch_t self.assertEqual(0, patch_hostplugin_heartbeat.call_count) self.assertEqual(0, patch_send_events.call_count) self.assertEqual(0, patch_telemetry_heartbeat.call_count) + monitor_handler.start() time.sleep(1) self.assertTrue(monitor_handler.is_alive()) + self.assertNotEqual(0, patch_hostplugin_heartbeat.call_count) self.assertNotEqual(0, patch_send_events.call_count) self.assertNotEqual(0, patch_telemetry_heartbeat.call_count) - monitor_handler.should_run = False + monitor_handler.stop() def test_heartbeat_timings_updates_after_window(self, *args): monitor_handler = get_monitor_handler() @@ -128,7 +130,7 @@ def test_heartbeat_timings_updates_after_window(self, *args): self.assertNotEqual(events_collection, monitor_handler.last_event_collection) self.assertNotEqual(heartbeat_telemetry, monitor_handler.last_telemetry_heartbeat) - monitor_handler.should_run = False + monitor_handler.stop() def test_heartbeat_timings_no_updates_within_window(self, *args): monitor_handler = get_monitor_handler() @@ -159,7 +161,7 @@ def test_heartbeat_timings_no_updates_within_window(self, *args): self.assertEqual(events_collection, monitor_handler.last_event_collection) self.assertEqual(heartbeat_telemetry, monitor_handler.last_telemetry_heartbeat) - monitor_handler.should_run = False + monitor_handler.stop() @patch("azurelinuxagent.common.protocol.healthservice.HealthService.report_host_plugin_heartbeat") def test_heartbeat_creates_signal(self, patch_report_heartbeat, *args): @@ -168,5 +170,4 @@ def test_heartbeat_creates_signal(self, patch_report_heartbeat, *args): monitor_handler.last_host_plugin_heartbeat = datetime.datetime.utcnow() - timedelta(hours=1) monitor_handler.send_host_plugin_heartbeat() self.assertEqual(1, patch_report_heartbeat.call_count) - monitor_handler.should_run = False - + monitor_handler.stop() From 9d8640e2a8fdfcb89991a2667ccd805b82c3e49c Mon Sep 17 00:00:00 2001 From: Hans Krijger Date: Mon, 4 Jun 2018 16:25:16 -0700 Subject: [PATCH 52/61] correct init sequence --- azurelinuxagent/ga/monitor.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/azurelinuxagent/ga/monitor.py b/azurelinuxagent/ga/monitor.py index 48bad71c31..d2e94c77ee 100644 --- a/azurelinuxagent/ga/monitor.py +++ b/azurelinuxagent/ga/monitor.py @@ -118,8 +118,8 @@ def __init__(self): self.host_plugin_errorstate = ErrorState(min_timedelta=MonitorHandler.HOST_PLUGIN_HEALTH_PERIOD) def run(self): - self.init_sysinfo() self.init_protocols() + self.init_sysinfo() self.start() def stop(self): From 58586a0a0f82a576efdb271393721fa6d183a923 Mon Sep 17 00:00:00 2001 From: Hans Krijger Date: Mon, 4 Jun 2018 16:27:50 -0700 Subject: [PATCH 53/61] event thread None check --- azurelinuxagent/ga/monitor.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/azurelinuxagent/ga/monitor.py b/azurelinuxagent/ga/monitor.py index d2e94c77ee..2e435e0d32 100644 --- a/azurelinuxagent/ga/monitor.py +++ b/azurelinuxagent/ga/monitor.py @@ -132,7 +132,7 @@ def init_protocols(self): self.health_service = HealthService(self.protocol.endpoint) def is_alive(self): - return self.event_thread.is_alive() + return self.event_thread is not None and self.event_thread.is_alive() def start(self): self.event_thread = threading.Thread(target=self.daemon) From 8a16a866bdfbdf03653ea6c632f121fd92419d7e Mon Sep 17 00:00:00 2001 From: Hans Krijger Date: Mon, 4 Jun 2018 23:11:55 -0700 Subject: [PATCH 54/61] add healthservice tests --- tests/protocol/test_healthservice.py | 159 +++++++++++++++++++++++++++ 1 file changed, 159 insertions(+) create mode 100644 tests/protocol/test_healthservice.py diff --git a/tests/protocol/test_healthservice.py b/tests/protocol/test_healthservice.py new file mode 100644 index 0000000000..67eaa0edd5 --- /dev/null +++ b/tests/protocol/test_healthservice.py @@ -0,0 +1,159 @@ +# Copyright 2018 Microsoft Corporation +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# +# Requires Python 2.6+ and Openssl 1.0+ +import json + +from azurelinuxagent.common.protocol.healthservice import Observation, HealthService +from tests.tools import * + + +class TestHealthService(AgentTestCase): + + def assert_observation(self, call_args, name, is_healthy, value, description): + endpoint = call_args[0][0] + content = call_args[0][1] + + jo = json.loads(content) + api = jo['Api'] + source = jo['Source'] + version = jo['Version'] + obs = jo['Observations'] + fo = obs[0] + obs_name = fo['ObservationName'] + obs_healthy = fo['IsHealthy'] + obs_value = fo['Value'] + obs_description = fo['Description'] + + self.assertEqual('application/json', call_args[1]['headers']['Content-Type']) + self.assertEqual('http://endpoint:80/HealthService', endpoint) + self.assertEqual('reporttargethealth', api) + self.assertEqual('WALinuxAgent', source) + self.assertEqual('1.0', version) + + self.assertEqual(name, obs_name) + self.assertEqual(value, obs_value) + self.assertEqual(is_healthy, obs_healthy) + self.assertEqual(description, obs_description) + + def test_observation_validity(self): + try: + Observation(name=None, is_healthy=True) + self.fail('Empty observation name should raise ValueError') + except ValueError: + pass + + try: + Observation(name='Name', is_healthy=None) + self.fail('Empty measurement should raise ValueError') + except ValueError: + pass + + o = Observation(name='Name', is_healthy=True, value=None, description=None) + self.assertEqual('', o.value) + self.assertEqual('', o.description) + + long_str = 's' * 200 + o = Observation(name=long_str, is_healthy=True, value=long_str, description=long_str) + self.assertEqual(200, len(o.name)) + self.assertEqual(200, len(o.value)) + self.assertEqual(200, len(o.description)) + + self.assertEqual(64, len(o.as_obj['ObservationName'])) + self.assertEqual(128, len(o.as_obj['Value'])) + self.assertEqual(128, len(o.as_obj['Description'])) + + def test_observation_json(self): + health_service = HealthService('endpoint') + health_service.observations.append(Observation(name='name', + is_healthy=True, + value='value', + description='description')) + expected_json = '{"Source": "WALinuxAgent", ' \ + '"Api": "reporttargethealth", ' \ + '"Version": "1.0", ' \ + '"Observations": [{' \ + '"Value": "value", ' \ + '"ObservationName": "name", ' \ + '"Description": "description", ' \ + '"IsHealthy": true' \ + '}]}' + self.assertEqual(expected_json, health_service.as_json) + + @patch("azurelinuxagent.common.utils.restutil.http_post") + def test_reporting(self, patch_post): + health_service = HealthService('endpoint') + health_service.report_host_plugin_status(is_healthy=True, response='response') + self.assertEqual(1, patch_post.call_count) + self.assert_observation(call_args=patch_post.call_args, + name=HealthService.HOST_PLUGIN_STATUS_OBSERVATION_NAME, + is_healthy=True, + value='response', + description='') + + health_service.report_host_plugin_status(is_healthy=False, response='error') + self.assertEqual(2, patch_post.call_count) + self.assert_observation(call_args=patch_post.call_args, + name=HealthService.HOST_PLUGIN_STATUS_OBSERVATION_NAME, + is_healthy=False, + value='error', + description='') + + health_service.report_host_plugin_extension_artifact(is_healthy=True, source='source', response='response') + self.assertEqual(3, patch_post.call_count) + self.assert_observation(call_args=patch_post.call_args, + name=HealthService.HOST_PLUGIN_ARTIFACT_OBSERVATION_NAME, + is_healthy=True, + value='response', + description='source') + + health_service.report_host_plugin_extension_artifact(is_healthy=False, source='source', response='response') + self.assertEqual(4, patch_post.call_count) + self.assert_observation(call_args=patch_post.call_args, + name=HealthService.HOST_PLUGIN_ARTIFACT_OBSERVATION_NAME, + is_healthy=False, + value='response', + description='source') + + health_service.report_host_plugin_heartbeat(is_healthy=True) + self.assertEqual(5, patch_post.call_count) + self.assert_observation(call_args=patch_post.call_args, + name=HealthService.HOST_PLUGIN_HEARTBEAT_OBSERVATION_NAME, + is_healthy=True, + value='', + description='') + + health_service.report_host_plugin_heartbeat(is_healthy=False) + self.assertEqual(6, patch_post.call_count) + self.assert_observation(call_args=patch_post.call_args, + name=HealthService.HOST_PLUGIN_HEARTBEAT_OBSERVATION_NAME, + is_healthy=False, + value='', + description='') + + health_service.report_host_plugin_versions(is_healthy=True, response='response') + self.assertEqual(7, patch_post.call_count) + self.assert_observation(call_args=patch_post.call_args, + name=HealthService.HOST_PLUGIN_VERSIONS_OBSERVATION_NAME, + is_healthy=True, + value='response', + description='') + + health_service.report_host_plugin_versions(is_healthy=False, response='response') + self.assertEqual(8, patch_post.call_count) + self.assert_observation(call_args=patch_post.call_args, + name=HealthService.HOST_PLUGIN_VERSIONS_OBSERVATION_NAME, + is_healthy=False, + value='response', + description='') From 91a6c37b3a0b9759640b9b4c96076387273135b5 Mon Sep 17 00:00:00 2001 From: Hans Krijger Date: Tue, 5 Jun 2018 00:53:47 -0700 Subject: [PATCH 55/61] update health service tests --- tests/protocol/test_healthservice.py | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/tests/protocol/test_healthservice.py b/tests/protocol/test_healthservice.py index 67eaa0edd5..9c1eed641c 100644 --- a/tests/protocol/test_healthservice.py +++ b/tests/protocol/test_healthservice.py @@ -101,6 +101,7 @@ def test_reporting(self, patch_post): is_healthy=True, value='response', description='') + self.assertEqual(0, len(health_service.observations)) health_service.report_host_plugin_status(is_healthy=False, response='error') self.assertEqual(2, patch_post.call_count) @@ -109,6 +110,7 @@ def test_reporting(self, patch_post): is_healthy=False, value='error', description='') + self.assertEqual(0, len(health_service.observations)) health_service.report_host_plugin_extension_artifact(is_healthy=True, source='source', response='response') self.assertEqual(3, patch_post.call_count) @@ -117,6 +119,7 @@ def test_reporting(self, patch_post): is_healthy=True, value='response', description='source') + self.assertEqual(0, len(health_service.observations)) health_service.report_host_plugin_extension_artifact(is_healthy=False, source='source', response='response') self.assertEqual(4, patch_post.call_count) @@ -125,6 +128,7 @@ def test_reporting(self, patch_post): is_healthy=False, value='response', description='source') + self.assertEqual(0, len(health_service.observations)) health_service.report_host_plugin_heartbeat(is_healthy=True) self.assertEqual(5, patch_post.call_count) @@ -133,6 +137,7 @@ def test_reporting(self, patch_post): is_healthy=True, value='', description='') + self.assertEqual(0, len(health_service.observations)) health_service.report_host_plugin_heartbeat(is_healthy=False) self.assertEqual(6, patch_post.call_count) @@ -141,6 +146,7 @@ def test_reporting(self, patch_post): is_healthy=False, value='', description='') + self.assertEqual(0, len(health_service.observations)) health_service.report_host_plugin_versions(is_healthy=True, response='response') self.assertEqual(7, patch_post.call_count) @@ -149,6 +155,7 @@ def test_reporting(self, patch_post): is_healthy=True, value='response', description='') + self.assertEqual(0, len(health_service.observations)) health_service.report_host_plugin_versions(is_healthy=False, response='response') self.assertEqual(8, patch_post.call_count) @@ -157,3 +164,4 @@ def test_reporting(self, patch_post): is_healthy=False, value='response', description='') + self.assertEqual(0, len(health_service.observations)) From ccccd19cc54ae8909001d013b05743e7ca35eefd Mon Sep 17 00:00:00 2001 From: Hans Krijger Date: Tue, 5 Jun 2018 00:59:44 -0700 Subject: [PATCH 56/61] test observationsare cleared after failures --- tests/protocol/test_healthservice.py | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/tests/protocol/test_healthservice.py b/tests/protocol/test_healthservice.py index 9c1eed641c..040d91b082 100644 --- a/tests/protocol/test_healthservice.py +++ b/tests/protocol/test_healthservice.py @@ -15,6 +15,7 @@ # Requires Python 2.6+ and Openssl 1.0+ import json +from azurelinuxagent.common.exception import HttpError from azurelinuxagent.common.protocol.healthservice import Observation, HealthService from tests.tools import * @@ -165,3 +166,9 @@ def test_reporting(self, patch_post): value='response', description='') self.assertEqual(0, len(health_service.observations)) + + patch_post.side_effect = HttpError() + health_service.report_host_plugin_versions(is_healthy=True, response='') + + self.assertEqual(9, patch_post.call_count) + self.assertEqual(0, len(health_service.observations)) From a5673be46388c5cb7739ef534aa52cd328dd2d6d Mon Sep 17 00:00:00 2001 From: Hans Krijger Date: Tue, 5 Jun 2018 01:09:03 -0700 Subject: [PATCH 57/61] test for status codes --- tests/protocol/test_healthservice.py | 23 +++++++++++++++++++++++ 1 file changed, 23 insertions(+) diff --git a/tests/protocol/test_healthservice.py b/tests/protocol/test_healthservice.py index 040d91b082..381af0acaa 100644 --- a/tests/protocol/test_healthservice.py +++ b/tests/protocol/test_healthservice.py @@ -17,11 +17,18 @@ from azurelinuxagent.common.exception import HttpError from azurelinuxagent.common.protocol.healthservice import Observation, HealthService +from azurelinuxagent.common.utils import restutil +from tests.protocol.test_hostplugin import MockResponse from tests.tools import * class TestHealthService(AgentTestCase): + def assert_status_code(self, status_code, expected_healthy): + response = MockResponse('response', status_code) + is_healthy = not restutil.request_failed_at_hostplugin(response) + self.assertEqual(expected_healthy, is_healthy) + def assert_observation(self, call_args, name, is_healthy, value, description): endpoint = call_args[0][0] content = call_args[0][1] @@ -172,3 +179,19 @@ def test_reporting(self, patch_post): self.assertEqual(9, patch_post.call_count) self.assertEqual(0, len(health_service.observations)) + + def test_status_codes(self): + # healthy + self.assert_status_code(status_code=200, expected_healthy=True) + self.assert_status_code(status_code=201, expected_healthy=True) + self.assert_status_code(status_code=302, expected_healthy=True) + self.assert_status_code(status_code=416, expected_healthy=True) + self.assert_status_code(status_code=419, expected_healthy=True) + self.assert_status_code(status_code=429, expected_healthy=True) + self.assert_status_code(status_code=502, expected_healthy=True) + + # unhealthy + self.assert_status_code(status_code=500, expected_healthy=False) + self.assert_status_code(status_code=501, expected_healthy=False) + self.assert_status_code(status_code=503, expected_healthy=False) + self.assert_status_code(status_code=504, expected_healthy=False) From 3ee3f271b158472937355dc8935c60a7370a98c1 Mon Sep 17 00:00:00 2001 From: Hans Krijger Date: Tue, 5 Jun 2018 02:43:57 -0700 Subject: [PATCH 58/61] additional tests for reporting --- tests/protocol/test_hostplugin.py | 82 +++++++++++++++++++++++++++++++ 1 file changed, 82 insertions(+) diff --git a/tests/protocol/test_hostplugin.py b/tests/protocol/test_hostplugin.py index 2f56559dab..ad4a308b4d 100644 --- a/tests/protocol/test_hostplugin.py +++ b/tests/protocol/test_hostplugin.py @@ -18,10 +18,12 @@ import base64 import json import sys +import datetime import azurelinuxagent.common.protocol.restapi as restapi import azurelinuxagent.common.protocol.wire as wire import azurelinuxagent.common.protocol.hostplugin as hostplugin +from azurelinuxagent.common.errorstate import ErrorState from azurelinuxagent.common.exception import HttpError from azurelinuxagent.common.protocol.hostplugin import API_VERSION @@ -543,6 +545,86 @@ def test_put_status_unhealthy_signal_permanent(self, patch_http_put, patch_http_ self.assertFalse(obj['Observations'][0]['IsHealthy']) self.assertEqual('GuestAgentPluginStatus', obj['Observations'][0]['ObservationName']) + @patch("azurelinuxagent.common.protocol.hostplugin.HostPluginProtocol.should_report", return_value=True) + @patch("azurelinuxagent.common.protocol.healthservice.HealthService.report_host_plugin_extension_artifact") + def test_report_fetch_health(self, patch_report_artifact, patch_should_report): + host_plugin = self._init_host() + host_plugin.report_fetch_health(uri='', is_healthy=True) + self.assertEqual(0, patch_should_report.call_count) + + host_plugin.report_fetch_health(uri='http://169.254.169.254/extensionArtifact', is_healthy=True) + self.assertEqual(0, patch_should_report.call_count) + + host_plugin.report_fetch_health(uri='http://168.63.129.16:32526/status', is_healthy=True) + self.assertEqual(0, patch_should_report.call_count) + + self.assertEqual(None, host_plugin.fetch_last_timestamp) + host_plugin.report_fetch_health(uri='http://168.63.129.16:32526/extensionArtifact', is_healthy=True) + self.assertNotEqual(None, host_plugin.fetch_last_timestamp) + self.assertEqual(1, patch_should_report.call_count) + self.assertEqual(1, patch_report_artifact.call_count) + + @patch("azurelinuxagent.common.protocol.hostplugin.HostPluginProtocol.should_report", return_value=True) + @patch("azurelinuxagent.common.protocol.healthservice.HealthService.report_host_plugin_status") + def test_report_status_health(self, patch_report_status, patch_should_report): + host_plugin = self._init_host() + self.assertEqual(None, host_plugin.status_last_timestamp) + host_plugin.report_status_health(is_healthy=True) + self.assertNotEqual(None, host_plugin.status_last_timestamp) + self.assertEqual(1, patch_should_report.call_count) + self.assertEqual(1, patch_report_status.call_count) + + def test_should_report(self): + host_plugin = self._init_host() + error_state = ErrorState(min_timedelta=datetime.timedelta(minutes=5)) + period = datetime.timedelta(minutes=1) + last_timestamp = None + + # first measurement at 0s, should report + is_healthy = True + actual = host_plugin.should_report(is_healthy, + error_state, + last_timestamp, + period) + self.assertEqual(True, actual) + + # second measurement at 30s, should not report + last_timestamp = datetime.datetime.utcnow() - datetime.timedelta(seconds=30) + actual = host_plugin.should_report(is_healthy, + error_state, + last_timestamp, + period) + self.assertEqual(False, actual) + + # third measurement at 60s, should report + last_timestamp = datetime.datetime.utcnow() - datetime.timedelta(seconds=60) + actual = host_plugin.should_report(is_healthy, + error_state, + last_timestamp, + period) + self.assertEqual(True, actual) + + # fourth measurement unhealthy, should report and increment counter + is_healthy = False + self.assertEqual(0, error_state.count) + actual = host_plugin.should_report(is_healthy, + error_state, + last_timestamp, + period) + self.assertEqual(1, error_state.count) + self.assertEqual(True, actual) + + # fifth measurement, should not report and reset counter + is_healthy = True + last_timestamp = datetime.datetime.utcnow() - datetime.timedelta(seconds=30) + self.assertEqual(1, error_state.count) + actual = host_plugin.should_report(is_healthy, + error_state, + last_timestamp, + period) + self.assertEqual(0, error_state.count) + self.assertEqual(False, actual) + class MockResponse: def __init__(self, body, status_code): From 14f022f789cbf5f0aa0bdc900cf52f157b3ff0cd Mon Sep 17 00:00:00 2001 From: Hans Krijger Date: Tue, 5 Jun 2018 02:56:45 -0700 Subject: [PATCH 59/61] better json compare --- tests/protocol/test_healthservice.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/tests/protocol/test_healthservice.py b/tests/protocol/test_healthservice.py index 381af0acaa..bae195f7a4 100644 --- a/tests/protocol/test_healthservice.py +++ b/tests/protocol/test_healthservice.py @@ -97,7 +97,9 @@ def test_observation_json(self): '"Description": "description", ' \ '"IsHealthy": true' \ '}]}' - self.assertEqual(expected_json, health_service.as_json) + expected = sorted(json.loads(expected_json).items()) + actual = sorted(json.loads(health_service.as_json).items()) + self.assertEqual(expected, actual) @patch("azurelinuxagent.common.utils.restutil.http_post") def test_reporting(self, patch_post): From 1a465faac6c24d9c80575525d37d0395eb94f128 Mon Sep 17 00:00:00 2001 From: Hans Krijger Date: Tue, 5 Jun 2018 12:08:48 -0700 Subject: [PATCH 60/61] PR feedback --- .../common/protocol/healthservice.py | 17 +++++++------- azurelinuxagent/common/protocol/hostplugin.py | 11 ++++++++++ azurelinuxagent/ga/exthandlers.py | 5 ++--- tests/ga/test_monitor.py | 22 +++++++++---------- tests/protocol/test_wire.py | 2 +- 5 files changed, 34 insertions(+), 23 deletions(-) diff --git a/azurelinuxagent/common/protocol/healthservice.py b/azurelinuxagent/common/protocol/healthservice.py index 7976ff169a..99f0e9fe80 100644 --- a/azurelinuxagent/common/protocol/healthservice.py +++ b/azurelinuxagent/common/protocol/healthservice.py @@ -89,7 +89,7 @@ def report_host_plugin_heartbeat(self, is_healthy): """ self.observations.append(Observation(name=HealthService.HOST_PLUGIN_HEARTBEAT_OBSERVATION_NAME, is_healthy=is_healthy)) - self.report() + self._report() def report_host_plugin_versions(self, is_healthy, response): """ @@ -100,7 +100,7 @@ def report_host_plugin_versions(self, is_healthy, response): self.observations.append(Observation(name=HealthService.HOST_PLUGIN_VERSIONS_OBSERVATION_NAME, is_healthy=is_healthy, value=response)) - self.report() + self._report() def report_host_plugin_extension_artifact(self, is_healthy, source, response): """ @@ -114,7 +114,7 @@ def report_host_plugin_extension_artifact(self, is_healthy, source, response): is_healthy=is_healthy, description=source, value=response)) - self.report() + self._report() def report_host_plugin_status(self, is_healthy, response): """ @@ -126,15 +126,16 @@ def report_host_plugin_status(self, is_healthy, response): self.observations.append(Observation(name=HealthService.HOST_PLUGIN_STATUS_OBSERVATION_NAME, is_healthy=is_healthy, value=response)) - self.report() + self._report() - def report(self): + def _report(self): logger.verbose('HealthService: report observations') try: restutil.http_post(self.endpoint, self.as_json, headers={'Content-Type': 'application/json'}) logger.verbose('HealthService: Reported observations to {0}: {1}', self.endpoint, self.as_json) except HttpError as e: logger.warn("HealthService: could not report observations: {0}", ustr(e)) - - # these signals are not timestamped, so there is no value in persisting data - del self.observations[:] + finally: + # TODO: add safety boundaries + # these signals are not timestamped, so there is no value in persisting data + del self.observations[:] diff --git a/azurelinuxagent/common/protocol/hostplugin.py b/azurelinuxagent/common/protocol/hostplugin.py index c4416a3549..c958c02013 100644 --- a/azurelinuxagent/common/protocol/hostplugin.py +++ b/azurelinuxagent/common/protocol/hostplugin.py @@ -174,8 +174,19 @@ def report_status_health(self, is_healthy, response=''): @staticmethod def should_report(is_healthy, error_state, last_timestamp, period): + """ + Determine whether a health signal should be reported + :param is_healthy: whether the current measurement is healthy + :param error_state: the error state which is tracking time since failure + :param last_timestamp: the last measurement time stamp + :param period: the reporting period + :return: True if the signal should be reported, False otherwise + """ if is_healthy: + # we only reset the error state upon success, since we want to keep + # reporting the failure; this is different to other uses of error states + # which do not have a separate periodicity error_state.reset() else: error_state.incr() diff --git a/azurelinuxagent/ga/exthandlers.py b/azurelinuxagent/ga/exthandlers.py index 32a04cdc4d..1d49187d44 100644 --- a/azurelinuxagent/ga/exthandlers.py +++ b/azurelinuxagent/ga/exthandlers.py @@ -1111,9 +1111,8 @@ def set_handler_status(self, status="NotReady", message="", code=0): self.ext_handler.name, self.ext_handler.properties.version)) except (IOError, ValueError, ProtocolError) as e: - fileutil.clean_ioerror(e, - paths=[status_file]) - self.logger.error("Failed to save handler status: {0}", ustr(e)) + fileutil.clean_ioerror(e, paths=[status_file]) + self.logger.error("Failed to save handler status: {0}, {1}", ustr(e), traceback.format_exc()) def get_handler_status(self): state_dir = self.get_conf_dir() diff --git a/tests/ga/test_monitor.py b/tests/ga/test_monitor.py index d1c8a3e2de..5608396211 100644 --- a/tests/ga/test_monitor.py +++ b/tests/ga/test_monitor.py @@ -24,7 +24,7 @@ @patch('azurelinuxagent.common.osutil.get_osutil') @patch('azurelinuxagent.common.protocol.get_protocol_util') @patch('azurelinuxagent.common.protocol.util.ProtocolUtil.get_protocol') -@patch("azurelinuxagent.common.protocol.healthservice.HealthService.report") +@patch("azurelinuxagent.common.protocol.healthservice.HealthService._report") class TestMonitor(AgentTestCase): def test_parse_xml_event(self, *args): data_str = load_data('ext/event.xml') @@ -104,16 +104,16 @@ def test_heartbeats(self, patch_hostplugin_heartbeat, patch_send_events, patch_t def test_heartbeat_timings_updates_after_window(self, *args): monitor_handler = get_monitor_handler() - MonitorHandler.TELEMETRY_HEARTBEAT_PERIOD = timedelta(seconds=1) - MonitorHandler.EVENT_COLLECTION_PERIOD = timedelta(seconds=1) - MonitorHandler.HOST_PLUGIN_HEARTBEAT_PERIOD = timedelta(seconds=1) + MonitorHandler.TELEMETRY_HEARTBEAT_PERIOD = timedelta(milliseconds=100) + MonitorHandler.EVENT_COLLECTION_PERIOD = timedelta(milliseconds=100) + MonitorHandler.HOST_PLUGIN_HEARTBEAT_PERIOD = timedelta(milliseconds=100) self.assertEqual(None, monitor_handler.last_host_plugin_heartbeat) self.assertEqual(None, monitor_handler.last_event_collection) self.assertEqual(None, monitor_handler.last_telemetry_heartbeat) monitor_handler.start() - time.sleep(1) + time.sleep(0.2) self.assertTrue(monitor_handler.is_alive()) self.assertNotEqual(None, monitor_handler.last_host_plugin_heartbeat) @@ -124,7 +124,7 @@ def test_heartbeat_timings_updates_after_window(self, *args): heartbeat_telemetry = monitor_handler.last_telemetry_heartbeat events_collection = monitor_handler.last_event_collection - time.sleep(2) + time.sleep(0.5) self.assertNotEqual(heartbeat_hostplugin, monitor_handler.last_host_plugin_heartbeat) self.assertNotEqual(events_collection, monitor_handler.last_event_collection) @@ -135,16 +135,16 @@ def test_heartbeat_timings_updates_after_window(self, *args): def test_heartbeat_timings_no_updates_within_window(self, *args): monitor_handler = get_monitor_handler() - MonitorHandler.TELEMETRY_HEARTBEAT_PERIOD = timedelta(seconds=100) - MonitorHandler.EVENT_COLLECTION_PERIOD = timedelta(seconds=100) - MonitorHandler.HOST_PLUGIN_HEARTBEAT_PERIOD = timedelta(seconds=100) + MonitorHandler.TELEMETRY_HEARTBEAT_PERIOD = timedelta(seconds=1) + MonitorHandler.EVENT_COLLECTION_PERIOD = timedelta(seconds=1) + MonitorHandler.HOST_PLUGIN_HEARTBEAT_PERIOD = timedelta(seconds=1) self.assertEqual(None, monitor_handler.last_host_plugin_heartbeat) self.assertEqual(None, monitor_handler.last_event_collection) self.assertEqual(None, monitor_handler.last_telemetry_heartbeat) monitor_handler.start() - time.sleep(1) + time.sleep(0.2) self.assertTrue(monitor_handler.is_alive()) self.assertNotEqual(None, monitor_handler.last_host_plugin_heartbeat) @@ -155,7 +155,7 @@ def test_heartbeat_timings_no_updates_within_window(self, *args): heartbeat_telemetry = monitor_handler.last_telemetry_heartbeat events_collection = monitor_handler.last_event_collection - time.sleep(2) + time.sleep(0.5) self.assertEqual(heartbeat_hostplugin, monitor_handler.last_host_plugin_heartbeat) self.assertEqual(events_collection, monitor_handler.last_event_collection) diff --git a/tests/protocol/test_wire.py b/tests/protocol/test_wire.py index 412a850918..91de36206d 100644 --- a/tests/protocol/test_wire.py +++ b/tests/protocol/test_wire.py @@ -28,7 +28,7 @@ @patch("time.sleep") @patch("azurelinuxagent.common.protocol.wire.CryptUtil") -@patch("azurelinuxagent.common.protocol.healthservice.HealthService.report") +@patch("azurelinuxagent.common.protocol.healthservice.HealthService._report") class TestWireProtocol(AgentTestCase): def setUp(self): From 47f18c530982b5d9e1235bfc745180d2bf134053 Mon Sep 17 00:00:00 2001 From: Hans Krijger Date: Tue, 5 Jun 2018 12:37:57 -0700 Subject: [PATCH 61/61] additional bounds safety; new test --- .../common/protocol/healthservice.py | 35 ++++++++++++------- tests/protocol/test_healthservice.py | 14 ++++++++ 2 files changed, 36 insertions(+), 13 deletions(-) diff --git a/azurelinuxagent/common/protocol/healthservice.py b/azurelinuxagent/common/protocol/healthservice.py index 99f0e9fe80..29a44f2a2a 100644 --- a/azurelinuxagent/common/protocol/healthservice.py +++ b/azurelinuxagent/common/protocol/healthservice.py @@ -64,6 +64,7 @@ class HealthService(object): HOST_PLUGIN_STATUS_OBSERVATION_NAME = 'GuestAgentPluginStatus' HOST_PLUGIN_VERSIONS_OBSERVATION_NAME = 'GuestAgentPluginVersions' HOST_PLUGIN_ARTIFACT_OBSERVATION_NAME = 'GuestAgentPluginArtifact' + MAX_OBSERVATIONS = 10 def __init__(self, endpoint): self.endpoint = HealthService.ENDPOINT.format(endpoint) @@ -87,8 +88,8 @@ def report_host_plugin_heartbeat(self, is_healthy): Reports a signal for /health :param is_healthy: whether the call succeeded """ - self.observations.append(Observation(name=HealthService.HOST_PLUGIN_HEARTBEAT_OBSERVATION_NAME, - is_healthy=is_healthy)) + self._observe(name=HealthService.HOST_PLUGIN_HEARTBEAT_OBSERVATION_NAME, + is_healthy=is_healthy) self._report() def report_host_plugin_versions(self, is_healthy, response): @@ -97,9 +98,9 @@ def report_host_plugin_versions(self, is_healthy, response): :param is_healthy: whether the api call succeeded :param response: debugging information for failures """ - self.observations.append(Observation(name=HealthService.HOST_PLUGIN_VERSIONS_OBSERVATION_NAME, - is_healthy=is_healthy, - value=response)) + self._observe(name=HealthService.HOST_PLUGIN_VERSIONS_OBSERVATION_NAME, + is_healthy=is_healthy, + value=response) self._report() def report_host_plugin_extension_artifact(self, is_healthy, source, response): @@ -110,10 +111,10 @@ def report_host_plugin_extension_artifact(self, is_healthy, source, response): :param response: debugging information for failures :return: """ - self.observations.append(Observation(name=HealthService.HOST_PLUGIN_ARTIFACT_OBSERVATION_NAME, - is_healthy=is_healthy, - description=source, - value=response)) + self._observe(name=HealthService.HOST_PLUGIN_ARTIFACT_OBSERVATION_NAME, + is_healthy=is_healthy, + description=source, + value=response) self._report() def report_host_plugin_status(self, is_healthy, response): @@ -123,11 +124,20 @@ def report_host_plugin_status(self, is_healthy, response): :param response: debugging information for failures :return: """ - self.observations.append(Observation(name=HealthService.HOST_PLUGIN_STATUS_OBSERVATION_NAME, - is_healthy=is_healthy, - value=response)) + self._observe(name=HealthService.HOST_PLUGIN_STATUS_OBSERVATION_NAME, + is_healthy=is_healthy, + value=response) self._report() + def _observe(self, name, is_healthy, value='', description=''): + # ensure we keep the list size within bounds + if len(self.observations) >= HealthService.MAX_OBSERVATIONS: + del self.observations[:HealthService.MAX_OBSERVATIONS-1] + self.observations.append(Observation(name=name, + is_healthy=is_healthy, + value=value, + description=description)) + def _report(self): logger.verbose('HealthService: report observations') try: @@ -136,6 +146,5 @@ def _report(self): except HttpError as e: logger.warn("HealthService: could not report observations: {0}", ustr(e)) finally: - # TODO: add safety boundaries # these signals are not timestamped, so there is no value in persisting data del self.observations[:] diff --git a/tests/protocol/test_healthservice.py b/tests/protocol/test_healthservice.py index bae195f7a4..e9646c01a0 100644 --- a/tests/protocol/test_healthservice.py +++ b/tests/protocol/test_healthservice.py @@ -182,6 +182,20 @@ def test_reporting(self, patch_post): self.assertEqual(9, patch_post.call_count) self.assertEqual(0, len(health_service.observations)) + def test_observation_length(self): + health_service = HealthService('endpoint') + + # make 100 observations + for i in range(0, 100): + health_service._observe(is_healthy=True, name='{0}'.format(i)) + + # ensure we keep only 10 + self.assertEqual(10, len(health_service.observations)) + + # ensure we keep the most recent 10 + self.assertEqual('90', health_service.observations[0].name) + self.assertEqual('99', health_service.observations[9].name) + def test_status_codes(self): # healthy self.assert_status_code(status_code=200, expected_healthy=True)