From 04b4bb4e002ffcffbc0f19a6e0f339d39414d9d1 Mon Sep 17 00:00:00 2001 From: larohra Date: Fri, 11 Sep 2020 11:45:23 -0700 Subject: [PATCH 01/63] Added a new thread for sending telemetry data --- azurelinuxagent/common/future.py | 2 + azurelinuxagent/common/protocol/wire.py | 4 +- azurelinuxagent/ga/telemetry_service.py | 104 ++++++++++++++++++++++++ 3 files changed, 108 insertions(+), 2 deletions(-) create mode 100644 azurelinuxagent/ga/telemetry_service.py diff --git a/azurelinuxagent/common/future.py b/azurelinuxagent/common/future.py index 79e6e3ba12..bf65cf6b0c 100644 --- a/azurelinuxagent/common/future.py +++ b/azurelinuxagent/common/future.py @@ -26,10 +26,12 @@ bytebuffer = memoryview # pylint: disable=C0103 from collections import OrderedDict # pylint: disable=W0611 + from queue import PriorityQueue elif sys.version_info[0] == 2: import httplib as httpclient # pylint: disable=E0401,W0611 from urlparse import urlparse # pylint: disable=E0401 + from Queue import PriorityQueue """Rename Python2 unicode to ustr""" # pylint: disable=W0105 ustr = unicode # pylint: disable=E0602,invalid-name diff --git a/azurelinuxagent/common/protocol/wire.py b/azurelinuxagent/common/protocol/wire.py index b92d8b0ae8..aeb21a9bed 100644 --- a/azurelinuxagent/common/protocol/wire.py +++ b/azurelinuxagent/common/protocol/wire.py @@ -190,7 +190,7 @@ def report_ext_status(self, ext_handler_name, ext_name, ext_status): # pylint: d self.client.status_blob.set_ext_status(ext_handler_name, ext_status) def report_event(self, events): - validate_param(EVENTS_DIRECTORY, events, TelemetryEventList) + # validate_param(EVENTS_DIRECTORY, events, TelemetryEventList) self.client.report_event(events) def upload_logs(self, logs): @@ -1106,7 +1106,7 @@ def report_event(self, event_list): event_report_error_count, event_report_errors = 0, [] # Group events by providerId - for event in event_list.events: + for event in event_list: try: if event.providerId not in buf: buf[event.providerId] = b'' diff 
--git a/azurelinuxagent/ga/telemetry_service.py b/azurelinuxagent/ga/telemetry_service.py new file mode 100644 index 0000000000..1d1a39b3e3 --- /dev/null +++ b/azurelinuxagent/ga/telemetry_service.py @@ -0,0 +1,104 @@ +# Microsoft Azure Linux Agent +# +# Copyright 2020 Microsoft Corporation +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# +# Requires Python 2.6+ and Openssl 1.0+ +# +import threading + +from azurelinuxagent.common import logger +from azurelinuxagent.common.future import ustr, PriorityQueue + + +def get_telemetry_service_handler(protocol_util): + return TelemetryServiceHandler(protocol_util) + +class TelemetryServiceHandler(object): + """ + This Handler takes care of sending all telemetry out of the agent to Wireserver. It sends out data as soon as + there's any data available in the queue to send. 
+ """ + + _THREAD_NAME = "TelemetryServiceHandler" + + def __init__(self, protocol_util): + self._protocol = protocol_util.get_protocol() + self.should_run = True + self.thread = None + self._should_process_event = threading.Event() + self._queue = PriorityQueue() + + @staticmethod + def get_thread_name(): + return TelemetryServiceHandler._THREAD_NAME + + def run(self): + logger.info("Start Extension Telemetry service.") + self.start() + + def is_alive(self): + return self.thread is not None and self.thread.is_alive() + + def start(self): + self.thread = threading.Thread(target=self.daemon) + self.thread.setDaemon(True) + self.thread.setName(self.get_thread_name()) + self.thread.start() + + def stop(self): + """ + Stop server communication and join the thread to main thread. + """ + self.should_run = False + if self.is_alive(): + self.thread.join() + + def stopped(self): + return not self.should_run + + def enqueue_event(self, event, priority): + # Add event to queue and set event + self._queue.put((priority, event)) + if not self._should_process_event.is_set(): + self._should_process_event.set() + + def daemon(self): + logger.info("Successfully started the {0} thread".format(self.get_thread_name())) + try: + # On demand wait, start processing as soon as there is any data available in the queue + while self._should_process_event.wait(): + self.send_events_in_queue() + + except Exception as error: + logger.warn("An unknown error occurred in the {0} thread main loop, stopping thread. 
Error: {1}", + self.get_thread_name(), ustr(error)) + + def get_events(self): + while not self._queue.empty(): + try: + yield self._queue.get() + finally: + # Mark the event as processed once done + self._queue.task_done() + + def send_events_in_queue(self): + # Process everything in Queue + if not self._queue.empty(): + self._protocol.report_event(self.get_events) + + # Clear event when done + if self._should_process_event.is_set(): + self._should_process_event.clear() + raise NotImplementedError() \ No newline at end of file From 97bc0b6ce87c2f7932bdef2855e9464d8c4920f0 Mon Sep 17 00:00:00 2001 From: larohra Date: Mon, 14 Sep 2020 16:25:23 -0700 Subject: [PATCH 02/63] Added a new priority queue for sending events --- azurelinuxagent/common/event.py | 23 +++++++++-- azurelinuxagent/common/protocol/wire.py | 2 +- azurelinuxagent/ga/monitor.py | 52 ++++++++++++++++--------- azurelinuxagent/ga/telemetry_service.py | 21 ++++++---- azurelinuxagent/ga/update.py | 7 +++- 5 files changed, 71 insertions(+), 34 deletions(-) diff --git a/azurelinuxagent/common/event.py b/azurelinuxagent/common/event.py index 43305f0153..b2c5118c36 100644 --- a/azurelinuxagent/common/event.py +++ b/azurelinuxagent/common/event.py @@ -281,6 +281,18 @@ def _log_event(name, op, message, duration, is_success=True): # pylint: disable= logger.info(_EVENT_MSG, name, op, message, duration) +class TelemetryEventPriorities(object): + """ + Class defining the priorities for telemetry events. Lower the number, higher the priority + + Note: 0 is reserved for a feature like QuickLog in the Windows Agent (i.e. 
the ability to send out telemetry + instantly rather than waiting for a minute for the monitor thread to pick up the events) + """ + AGENT_EVENT = 1 # Agent events always get the highest priority + EXTENSION_EVENT_NEW_PIPELINE = 2 # Prioritize extensions using the new dedicated pipeline over extensions hijacking the agent pipeline + EXTENSION_EVENT_OLD_PIPELINE = 3 + + class EventLogger(object): def __init__(self): self.event_dir = None @@ -587,7 +599,7 @@ def report_dropped_events_error(count, errors, op, max_errors_to_report): # pyli message=err_msg_format.format(count, max_errors_to_report, ', '.join(errors)), is_success=False) - def collect_events(self): # pylint: disable=R0914 + def collect_events(self, enqueue_event): # pylint: disable=R0914 """ Retuns a list of events that need to be sent to the telemetry pipeline and deletes the corresponding files from the events directory. @@ -614,6 +626,7 @@ def collect_events(self): # pylint: disable=R0914 event_data = fd.read().decode("utf-8") event = parse_event(event_data) + priority = TelemetryEventPriorities.AGENT_EVENT # "legacy" events are events produced by previous versions of the agent (<= 2.2.46) and extensions; # they do not include all the telemetry fields, so we add them here @@ -627,10 +640,12 @@ def collect_events(self): # pylint: disable=R0914 if event.is_extension_event(): EventLogger._trim_extension_event_parameters(event) self.add_common_event_parameters(event, event_file_creation_time) + priority = TelemetryEventPriorities.EXTENSION_EVENT_OLD_PIPELINE else: self._update_legacy_agent_event(event, event_file_creation_time) - event_list.events.append(event) + # event_list.events.append(event) + enqueue_event(event, priority) finally: os.remove(event_file_path) except UnicodeError as e: # pylint: disable=C0103 @@ -774,8 +789,8 @@ def add_periodic(delta, name, op=WALAEventOperation.Unknown, is_success=True, du message=message, log_event=log_event, force=force) -def 
collect_events(reporter=__event_logger__): - return reporter.collect_events() +def collect_events(enqueue_event, reporter=__event_logger__): + return reporter.collect_events(enqueue_event) def mark_event_status(name, version, op, status): # pylint: disable=C0103 diff --git a/azurelinuxagent/common/protocol/wire.py b/azurelinuxagent/common/protocol/wire.py index aeb21a9bed..4bc23d2661 100644 --- a/azurelinuxagent/common/protocol/wire.py +++ b/azurelinuxagent/common/protocol/wire.py @@ -1106,7 +1106,7 @@ def report_event(self, event_list): event_report_error_count, event_report_errors = 0, [] # Group events by providerId - for event in event_list: + for event in event_list(): try: if event.providerId not in buf: buf[event.providerId] = b'' diff --git a/azurelinuxagent/ga/monitor.py b/azurelinuxagent/ga/monitor.py index 6392086d16..0c12bc6ab8 100644 --- a/azurelinuxagent/ga/monitor.py +++ b/azurelinuxagent/ga/monitor.py @@ -38,8 +38,8 @@ from azurelinuxagent.ga.periodic_operation import PeriodicOperation -def get_monitor_handler(): - return MonitorHandler() +def get_monitor_handler(telemetry_handler): + return MonitorHandler(telemetry_handler) class PollResourceUsageOperation(PeriodicOperation): @@ -109,6 +109,35 @@ def _operation_impl(): logger.reset_periodic() +class CollectAndEnqueueEventsPeriodicOperation(PeriodicOperation): + """ + Periodic operation to collect and send telemetry events located in the events folder + """ + + _EVENT_COLLECTION_PERIOD = datetime.timedelta(minutes=1) + + def __init__(self, telemetry_handler): + super(CollectAndEnqueueEventsPeriodicOperation, self).__init__( + name="collect_and_enqueue_events", + operation=self.collect_and_enqueue_events, + period=CollectAndEnqueueEventsPeriodicOperation._EVENT_COLLECTION_PERIOD) + self.enqueue_events = telemetry_handler.enqueue_event + + def collect_and_enqueue_events(self): + """ + Periodically send any events located in the events folder + """ + try: + # event_list = collect_events() + 
collect_events(self.enqueue_events) + + # if len(event_list.events) > 0: # pylint: disable=len-as-condition + # self.protocol.report_event(event_list) + except Exception as e: # pylint: disable=C0103 + err_msg = "Failure in collecting Agent events: {0}".format(ustr(e)) + add_event(op=WALAEventOperation.UnhandledError, message=err_msg, is_success=False) + + class ReportNetworkErrorsOperation(PeriodicOperation): def __init__(self): super(ReportNetworkErrorsOperation, self).__init__( @@ -158,8 +187,6 @@ def _operation_impl(self): class MonitorHandler(object): # pylint: disable=R0902 - # telemetry - EVENT_COLLECTION_PERIOD = datetime.timedelta(minutes=1) # host plugin HOST_PLUGIN_HEARTBEAT_PERIOD = datetime.timedelta(minutes=1) HOST_PLUGIN_HEALTH_PERIOD = datetime.timedelta(minutes=5) @@ -173,14 +200,14 @@ class MonitorHandler(object): # pylint: disable=R0902 def get_thread_name(): return MonitorHandler._THREAD_NAME - def __init__(self): + def __init__(self, telemetry_handler): self.osutil = get_osutil() self.imds_client = None self.event_thread = None self._periodic_operations = [ ResetPeriodicLogMessagesOperation(), - PeriodicOperation("collect_and_send_events", self.collect_and_send_events, self.EVENT_COLLECTION_PERIOD), + CollectAndEnqueueEventsPeriodicOperation(telemetry_handler), ReportNetworkErrorsOperation(), PollResourceUsageOperation(), PeriodicOperation("send_host_plugin_heartbeat", self.send_host_plugin_heartbeat, self.HOST_PLUGIN_HEARTBEAT_PERIOD), @@ -251,19 +278,6 @@ def daemon(self, init_data=False): except Exception as e: # pylint: disable=C0103 logger.error("An error occurred in the monitor thread; will exit the thread.\n{0}", ustr(e)) - def collect_and_send_events(self): - """ - Periodically send any events located in the events folder - """ - try: - event_list = collect_events() - - if len(event_list.events) > 0: # pylint: disable=len-as-condition - self.protocol.report_event(event_list) - except Exception as e: # pylint: disable=C0103 - err_msg = 
"Failure in collecting/sending Agent events: {0}".format(ustr(e)) - add_event(op=WALAEventOperation.UnhandledError, message=err_msg, is_success=False) - def send_imds_heartbeat(self): """ Send a health signal every IMDS_HEARTBEAT_PERIOD. The signal is 'Healthy' when we have diff --git a/azurelinuxagent/ga/telemetry_service.py b/azurelinuxagent/ga/telemetry_service.py index 1d1a39b3e3..d8dc0c43f7 100644 --- a/azurelinuxagent/ga/telemetry_service.py +++ b/azurelinuxagent/ga/telemetry_service.py @@ -16,6 +16,7 @@ # # Requires Python 2.6+ and Openssl 1.0+ # +import datetime import threading from azurelinuxagent.common import logger @@ -32,12 +33,13 @@ class TelemetryServiceHandler(object): """ _THREAD_NAME = "TelemetryServiceHandler" + _MAX_TIMEOUT = datetime.timedelta(minutes=5).seconds def __init__(self, protocol_util): self._protocol = protocol_util.get_protocol() self.should_run = True self.thread = None - self._should_process_event = threading.Event() + self._should_process_events = threading.Event() self._queue = PriorityQueue() @staticmethod @@ -71,14 +73,16 @@ def stopped(self): def enqueue_event(self, event, priority): # Add event to queue and set event self._queue.put((priority, event)) - if not self._should_process_event.is_set(): - self._should_process_event.set() + + # Always set the event if any enqueue happens (even if already set) + self._should_process_events.set() def daemon(self): logger.info("Successfully started the {0} thread".format(self.get_thread_name())) try: # On demand wait, start processing as soon as there is any data available in the queue - while self._should_process_event.wait(): + # In worst case, also keep checking every 5 mins to ensure that no data is being missed + while self._should_process_events.wait(timeout=TelemetryServiceHandler._MAX_TIMEOUT): self.send_events_in_queue() except Exception as error: @@ -88,7 +92,8 @@ def daemon(self): def get_events(self): while not self._queue.empty(): try: - yield self._queue.get() + _, event 
= self._queue.get() + yield event finally: # Mark the event as processed once done self._queue.task_done() @@ -99,6 +104,6 @@ def send_events_in_queue(self): self._protocol.report_event(self.get_events) # Clear event when done - if self._should_process_event.is_set(): - self._should_process_event.clear() - raise NotImplementedError() \ No newline at end of file + # There might be a rare race condition where the loop exits and we get a new event, in that case not unsetting the event. + if self._should_process_events.is_set() and not self._queue.empty(): + self._should_process_events.clear() \ No newline at end of file diff --git a/azurelinuxagent/ga/update.py b/azurelinuxagent/ga/update.py index 98b5926945..10dd3a22ab 100644 --- a/azurelinuxagent/ga/update.py +++ b/azurelinuxagent/ga/update.py @@ -60,6 +60,7 @@ from azurelinuxagent.ga.monitor import get_monitor_handler # pylint: disable=C0302 +from azurelinuxagent.ga.telemetry_service import get_telemetry_service_handler AGENT_ERROR_FILE = "error.json" # File name for agent error record AGENT_MANIFEST_FILE = "HandlerManifest.json" @@ -274,9 +275,11 @@ def run(self, debug=False): # pylint: disable=R0912,R0914 add_event(AGENT_NAME, op=WALAEventOperation.OSInfo, message=os_info_msg) # Get all thread handlers + telemetry_handler = get_telemetry_service_handler(self.protocol_util) all_thread_handlers = [ - get_monitor_handler(), - get_env_handler() + get_monitor_handler(telemetry_handler), + get_env_handler(), + telemetry_handler ] if is_extension_telemetry_pipeline_enabled(): From 10added37cdec765b2309b31f0c9e299d680c1ee Mon Sep 17 00:00:00 2001 From: larohra Date: Tue, 15 Sep 2020 12:40:46 -0700 Subject: [PATCH 03/63] Added stacktrace for failure --- azurelinuxagent/ga/telemetry_service.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/azurelinuxagent/ga/telemetry_service.py b/azurelinuxagent/ga/telemetry_service.py index d8dc0c43f7..4baf6c6c20 100644 --- 
a/azurelinuxagent/ga/telemetry_service.py +++ b/azurelinuxagent/ga/telemetry_service.py @@ -19,6 +19,8 @@ import datetime import threading +import traceback + from azurelinuxagent.common import logger from azurelinuxagent.common.future import ustr, PriorityQueue @@ -86,8 +88,8 @@ def daemon(self): self.send_events_in_queue() except Exception as error: - logger.warn("An unknown error occurred in the {0} thread main loop, stopping thread. Error: {1}", - self.get_thread_name(), ustr(error)) + logger.warn("An unknown error occurred in the {0} thread main loop, stopping thread. Error: {1}, Stack: {2}", + self.get_thread_name(), ustr(error), traceback.format_exc()) def get_events(self): while not self._queue.empty(): From 647bbaf5c48d7c837b016928ca465060622af3d2 Mon Sep 17 00:00:00 2001 From: larohra Date: Tue, 15 Sep 2020 13:09:14 -0700 Subject: [PATCH 04/63] Added stacktrace for failures and made it a set --- azurelinuxagent/common/event.py | 4 ++-- azurelinuxagent/common/protocol/wire.py | 4 ++-- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/azurelinuxagent/common/event.py b/azurelinuxagent/common/event.py index b2c5118c36..55b0992d90 100644 --- a/azurelinuxagent/common/event.py +++ b/azurelinuxagent/common/event.py @@ -609,7 +609,7 @@ def collect_events(self, enqueue_event): # pylint: disable=R0914 event_directory_full_path = os.path.join(conf.get_lib_dir(), EVENTS_DIRECTORY) event_files = os.listdir(event_directory_full_path) unicode_error_count, unicode_errors = 0, [] - collect_event_error_count, collect_event_errors = 0, [] + collect_event_error_count, collect_event_errors = 0, set() for event_file in event_files: try: @@ -655,7 +655,7 @@ def collect_events(self, enqueue_event): # pylint: disable=R0914 except Exception as e: # pylint: disable=C0103 collect_event_error_count += 1 if len(collect_event_errors) < max_collect_errors_to_report: - collect_event_errors.append(ustr(e)) + collect_event_errors.add(traceback.format_exc()) 
EventLogger.report_dropped_events_error(collect_event_error_count, collect_event_errors, WALAEventOperation.CollectEventErrors, max_collect_errors_to_report) diff --git a/azurelinuxagent/common/protocol/wire.py b/azurelinuxagent/common/protocol/wire.py index 4bc23d2661..a0f29236b1 100644 --- a/azurelinuxagent/common/protocol/wire.py +++ b/azurelinuxagent/common/protocol/wire.py @@ -1137,9 +1137,9 @@ def report_event(self, event_list): event_report_errors.append(ustr(e)) EventLogger.report_dropped_events_error(event_report_error_count, event_report_errors, - WALAEventOperation.CollectEventErrors, max_send_errors_to_report) + WALAEventOperation.ReportEventErrors, max_send_errors_to_report) EventLogger.report_dropped_events_error(unicode_error_count, unicode_errors, - WALAEventOperation.CollectEventUnicodeErrors, + WALAEventOperation.ReportEventUnicodeErrors, max_send_errors_to_report) # Send out all events left in buffer. From 2896b23ebc62cdf28b880a1539b410a130c154db Mon Sep 17 00:00:00 2001 From: larohra Date: Tue, 15 Sep 2020 14:55:19 -0700 Subject: [PATCH 05/63] Added a counter to pq to avoid collisions and more logging --- azurelinuxagent/common/protocol/wire.py | 1 + azurelinuxagent/ga/telemetry_service.py | 14 +++++++++++--- 2 files changed, 12 insertions(+), 3 deletions(-) diff --git a/azurelinuxagent/common/protocol/wire.py b/azurelinuxagent/common/protocol/wire.py index a0f29236b1..0753882e50 100644 --- a/azurelinuxagent/common/protocol/wire.py +++ b/azurelinuxagent/common/protocol/wire.py @@ -1135,6 +1135,7 @@ def report_event(self, event_list): event_report_error_count += 1 if len(event_report_errors) < max_send_errors_to_report: event_report_errors.append(ustr(e)) + logger.info("Event {0} done processing!".format(event)) EventLogger.report_dropped_events_error(event_report_error_count, event_report_errors, WALAEventOperation.ReportEventErrors, max_send_errors_to_report) diff --git a/azurelinuxagent/ga/telemetry_service.py 
b/azurelinuxagent/ga/telemetry_service.py index 4baf6c6c20..e6c7763c40 100644 --- a/azurelinuxagent/ga/telemetry_service.py +++ b/azurelinuxagent/ga/telemetry_service.py @@ -28,6 +28,7 @@ def get_telemetry_service_handler(protocol_util): return TelemetryServiceHandler(protocol_util) + class TelemetryServiceHandler(object): """ This Handler takes care of sending all telemetry out of the agent to Wireserver. It sends out data as soon as @@ -43,6 +44,10 @@ def __init__(self, protocol_util): self.thread = None self._should_process_events = threading.Event() self._queue = PriorityQueue() + # The basic PriorityQueue sorts based on the 2nd item if the priority is same, but since TelemetryEvent is not orderable, it throws. + # This property takes care of that collision by maintaining a counter to ensure that collision can never occur. + # It is reset every time the threading event self._should_process_events is unset + self._queue_counter = 0 @staticmethod def get_thread_name(): @@ -74,7 +79,8 @@ def stopped(self): def enqueue_event(self, event, priority): # Add event to queue and set event - self._queue.put((priority, event)) + self._queue.put((priority, self._queue_counter, event)) + self._queue_counter += 1 # Always set the event if any enqueue happens (even if already set) self._should_process_events.set() @@ -94,10 +100,11 @@ def daemon(self): def get_events(self): while not self._queue.empty(): try: - _, event = self._queue.get() + _, __, event = self._queue.get() yield event finally: # Mark the event as processed once done + logger.info("Marking event {0} as done!".format(event)) self._queue.task_done() def send_events_in_queue(self): @@ -108,4 +115,5 @@ def send_events_in_queue(self): # Clear event when done # There might be a rare race condition where the loop exits and we get a new event, in that case not unsetting the event. 
if self._should_process_events.is_set() and not self._queue.empty(): - self._should_process_events.clear() \ No newline at end of file + self._should_process_events.clear() + self._queue_counter = 0 \ No newline at end of file From a9e7bebfe01de4d5c239ddab82583345c1955a9f Mon Sep 17 00:00:00 2001 From: larohra Date: Tue, 15 Sep 2020 15:22:55 -0700 Subject: [PATCH 06/63] Changed logging to verbose --- azurelinuxagent/common/protocol/wire.py | 2 +- azurelinuxagent/ga/telemetry_service.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/azurelinuxagent/common/protocol/wire.py b/azurelinuxagent/common/protocol/wire.py index 0753882e50..cbceeeb200 100644 --- a/azurelinuxagent/common/protocol/wire.py +++ b/azurelinuxagent/common/protocol/wire.py @@ -1135,7 +1135,7 @@ def report_event(self, event_list): event_report_error_count += 1 if len(event_report_errors) < max_send_errors_to_report: event_report_errors.append(ustr(e)) - logger.info("Event {0} done processing!".format(event)) + logger.verbose("done reporting for Event {0}".format(event)) EventLogger.report_dropped_events_error(event_report_error_count, event_report_errors, WALAEventOperation.ReportEventErrors, max_send_errors_to_report) diff --git a/azurelinuxagent/ga/telemetry_service.py b/azurelinuxagent/ga/telemetry_service.py index e6c7763c40..dd77ef4bc5 100644 --- a/azurelinuxagent/ga/telemetry_service.py +++ b/azurelinuxagent/ga/telemetry_service.py @@ -104,7 +104,7 @@ def get_events(self): yield event finally: # Mark the event as processed once done - logger.info("Marking event {0} as done!".format(event)) + logger.verbose("Finished working with event {0}".format(event)) self._queue.task_done() def send_events_in_queue(self): From 540f5f1ee8302405bb81e5f2205d6772af78b2d9 Mon Sep 17 00:00:00 2001 From: larohra Date: Wed, 16 Sep 2020 11:33:28 -0700 Subject: [PATCH 07/63] Added a global counter for PQ --- azurelinuxagent/ga/telemetry_service.py | 26 +++++++++++++++++++++---- 1 file changed, 22 
insertions(+), 4 deletions(-) diff --git a/azurelinuxagent/ga/telemetry_service.py b/azurelinuxagent/ga/telemetry_service.py index dd77ef4bc5..f643132cc6 100644 --- a/azurelinuxagent/ga/telemetry_service.py +++ b/azurelinuxagent/ga/telemetry_service.py @@ -29,6 +29,24 @@ def get_telemetry_service_handler(protocol_util): return TelemetryServiceHandler(protocol_util) +class QueueCounter(object): + def __init__(self): + self._value = 0 + self._lock = threading.RLock() + + def increment(self): + with self._lock: + self._value += 1 + + @property + def value(self): + return self._value + + def reset(self): + with self._lock: + self._value = 0 + + class TelemetryServiceHandler(object): """ This Handler takes care of sending all telemetry out of the agent to Wireserver. It sends out data as soon as @@ -47,7 +65,7 @@ def __init__(self, protocol_util): # The basic PriorityQueue sorts based on the 2nd item if the priority is same, but since TelemetryEvent is not orderable, it throws. # This property takes care of that collision by maintaining a counter to ensure that collision can never occur. # It is reset every time the threading event self._should_process_events is unset - self._queue_counter = 0 + self._queue_counter = QueueCounter() @staticmethod def get_thread_name(): @@ -79,8 +97,8 @@ def stopped(self): def enqueue_event(self, event, priority): # Add event to queue and set event - self._queue.put((priority, self._queue_counter, event)) - self._queue_counter += 1 + self._queue.put((priority, self._queue_counter.value, event)) + self._queue_counter.increment() # Always set the event if any enqueue happens (even if already set) self._should_process_events.set() @@ -116,4 +134,4 @@ def send_events_in_queue(self): # There might be a rare race condition where the loop exits and we get a new event, in that case not unsetting the event. 
if self._should_process_events.is_set() and not self._queue.empty(): self._should_process_events.clear() - self._queue_counter = 0 \ No newline at end of file + self._queue_counter.reset() \ No newline at end of file From b21f97ced49e4151be3001fa9ac6dbc00984f648 Mon Sep 17 00:00:00 2001 From: larohra Date: Wed, 16 Sep 2020 15:59:02 -0700 Subject: [PATCH 08/63] Added more logging --- azurelinuxagent/common/protocol/wire.py | 4 ++-- azurelinuxagent/ga/telemetry_service.py | 10 ++++++++-- 2 files changed, 10 insertions(+), 4 deletions(-) diff --git a/azurelinuxagent/common/protocol/wire.py b/azurelinuxagent/common/protocol/wire.py index cbceeeb200..54a1205deb 100644 --- a/azurelinuxagent/common/protocol/wire.py +++ b/azurelinuxagent/common/protocol/wire.py @@ -1098,7 +1098,7 @@ def send_encoded_event(self, provider_id, event_str, encoding='utf-8'): raise ProtocolError( "Failed to send events:{0}".format(resp.status)) - def report_event(self, event_list): + def report_event(self, get_events): max_send_errors_to_report = 5 buf = {} events_per_request = 0 @@ -1106,7 +1106,7 @@ def report_event(self, event_list): event_report_error_count, event_report_errors = 0, [] # Group events by providerId - for event in event_list(): + for event in get_events(): try: if event.providerId not in buf: buf[event.providerId] = b'' diff --git a/azurelinuxagent/ga/telemetry_service.py b/azurelinuxagent/ga/telemetry_service.py index f643132cc6..5d240eac0c 100644 --- a/azurelinuxagent/ga/telemetry_service.py +++ b/azurelinuxagent/ga/telemetry_service.py @@ -32,7 +32,7 @@ def get_telemetry_service_handler(protocol_util): class QueueCounter(object): def __init__(self): self._value = 0 - self._lock = threading.RLock() + self._lock = threading.Lock() def increment(self): with self._lock: @@ -98,6 +98,8 @@ def stopped(self): def enqueue_event(self, event, priority): # Add event to queue and set event self._queue.put((priority, self._queue_counter.value, event)) + logger.verbose( + "Added event 
Priority: {0}, Counter: {1}, Event: {2}".format(priority, self._queue_counter.value, event)) self._queue_counter.increment() # Always set the event if any enqueue happens (even if already set) @@ -117,16 +119,20 @@ def daemon(self): def get_events(self): while not self._queue.empty(): + event = None try: _, __, event = self._queue.get() + logger.verbose("Fetched event Priority: {0}, Counter: {1}, Event: {2}".format(_, __, event)) yield event finally: # Mark the event as processed once done logger.verbose("Finished working with event {0}".format(event)) - self._queue.task_done() + # self._queue.task_done() def send_events_in_queue(self): # Process everything in Queue + logger.verbose("Processing data in the telemetry service queue, approx qsize: {0}; Counter val: {1}", + self._queue.qsize(), self._queue_counter.value) if not self._queue.empty(): self._protocol.report_event(self.get_events) From c00b28df00f226909a8d31c54627ff3f9038d754 Mon Sep 17 00:00:00 2001 From: larohra Date: Wed, 16 Sep 2020 17:11:10 -0700 Subject: [PATCH 09/63] Fixed faulty condition --- azurelinuxagent/ga/telemetry_service.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/azurelinuxagent/ga/telemetry_service.py b/azurelinuxagent/ga/telemetry_service.py index 5d240eac0c..365c87796b 100644 --- a/azurelinuxagent/ga/telemetry_service.py +++ b/azurelinuxagent/ga/telemetry_service.py @@ -138,6 +138,7 @@ def send_events_in_queue(self): # Clear event when done # There might be a rare race condition where the loop exits and we get a new event, in that case not unsetting the event. 
- if self._should_process_events.is_set() and not self._queue.empty(): + if self._should_process_events.is_set() and self._queue.empty(): + logger.verbose("Resetting the event and counter with val: {0}", self._queue_counter.value) self._should_process_events.clear() self._queue_counter.reset() \ No newline at end of file From 3b7b3937d92c91fc3e3d84c44de7b219ff868f5d Mon Sep 17 00:00:00 2001 From: larohra Date: Thu, 17 Sep 2020 10:26:50 -0700 Subject: [PATCH 10/63] Implemented comparator for TelemetryEvent class --- azurelinuxagent/common/telemetryevent.py | 30 ++++++++++++++++++++++-- azurelinuxagent/ga/telemetry_service.py | 9 ++++--- 2 files changed, 34 insertions(+), 5 deletions(-) diff --git a/azurelinuxagent/common/telemetryevent.py b/azurelinuxagent/common/telemetryevent.py index 78117b5d46..a424d69ee9 100644 --- a/azurelinuxagent/common/telemetryevent.py +++ b/azurelinuxagent/common/telemetryevent.py @@ -18,6 +18,7 @@ # from azurelinuxagent.common.datacontract import DataContract, DataContractList +from azurelinuxagent.common.event import TelemetryEventPriorities from azurelinuxagent.common.version import AGENT_NAME class CommonTelemetryEventSchema(object): # pylint: disable=R0903 @@ -83,23 +84,48 @@ def __eq__(self, other): class TelemetryEvent(DataContract): - def __init__(self, eventId=None, providerId=None): + def __init__(self, eventId=None, providerId=None, priority=TelemetryEventPriorities.AGENT_EVENT): self.eventId = eventId # pylint: disable=C0103 self.providerId = providerId # pylint: disable=C0103 self.parameters = DataContractList(TelemetryEventParam) self.file_type = "" + self._priority = priority # Checking if the particular param name is in the TelemetryEvent. 
def __contains__(self, param_name): return param_name in [param.name for param in self.parameters] + def __le__(self, other): + raise self.priority <= other.priority + + def __ge__(self, other): + raise self.priority >= other.priority + + def __eq__(self, other): + raise self.priority == other.priority + + def __lt__(self, other): + raise self.priority < other.priority + + def __gt__(self, other): + raise self.priority > other.priority + + def __ne__(self, other): + raise self.priority != other.priority + + @property + def priority(self): + return self._priority + def is_extension_event(self): # Events originating from the agent have "WALinuxAgent" as the Name parameter, or they don't have a Name # parameter, in the case of log and metric events. So, in case the Name parameter exists and it is not # "WALinuxAgent", it is an extension event. for param in self.parameters: if param.name == GuestAgentExtensionEventsSchema.Name: - return param.value != AGENT_NAME + if param.value != AGENT_NAME: + self._priority = TelemetryEventPriorities.EXTENSION_EVENT_OLD_PIPELINE + return True return False def get_version(self): diff --git a/azurelinuxagent/ga/telemetry_service.py b/azurelinuxagent/ga/telemetry_service.py index 365c87796b..f85a023d19 100644 --- a/azurelinuxagent/ga/telemetry_service.py +++ b/azurelinuxagent/ga/telemetry_service.py @@ -97,7 +97,8 @@ def stopped(self): def enqueue_event(self, event, priority): # Add event to queue and set event - self._queue.put((priority, self._queue_counter.value, event)) + self._queue.put(event) + # self._queue.put((priority, self._queue_counter.value, event)) logger.verbose( "Added event Priority: {0}, Counter: {1}, Event: {2}".format(priority, self._queue_counter.value, event)) self._queue_counter.increment() @@ -121,8 +122,10 @@ def get_events(self): while not self._queue.empty(): event = None try: - _, __, event = self._queue.get() - logger.verbose("Fetched event Priority: {0}, Counter: {1}, Event: {2}".format(_, __, event)) + # 
_, __, event = self._queue.get() + event = self._queue.get() + # logger.verbose("Fetched event Priority: {0}, Counter: {1}, Event: {2}".format(_, __, event)) + logger.verbose("Fetched event Priority: {0}, Event: {1}".format(event.priority if event is not None else 100, event)) yield event finally: # Mark the event as processed once done From 859843ce19a313f12b29972085c725536b24287c Mon Sep 17 00:00:00 2001 From: larohra Date: Thu, 17 Sep 2020 10:43:33 -0700 Subject: [PATCH 11/63] Moved class --- azurelinuxagent/common/telemetryevent.py | 12 +++++++++++- 1 file changed, 11 insertions(+), 1 deletion(-) diff --git a/azurelinuxagent/common/telemetryevent.py b/azurelinuxagent/common/telemetryevent.py index a424d69ee9..8aeb2919c5 100644 --- a/azurelinuxagent/common/telemetryevent.py +++ b/azurelinuxagent/common/telemetryevent.py @@ -18,7 +18,6 @@ # from azurelinuxagent.common.datacontract import DataContract, DataContractList -from azurelinuxagent.common.event import TelemetryEventPriorities from azurelinuxagent.common.version import AGENT_NAME class CommonTelemetryEventSchema(object): # pylint: disable=R0903 @@ -83,6 +82,17 @@ def __eq__(self, other): return isinstance(other, TelemetryEventParam) and other.name == self.name and other.value == self.value +class TelemetryEventPriorities(object): + """ + Class defining the priorities for telemetry events. Lower the number, higher the priority + + Note: 0 is reserved for a feature like QuickLog in the Windows Agent (i.e. 
the ability to send out telemetry + instantly rather than waiting for a minute for the monitor thread to pick up the events) + """ + AGENT_EVENT = 1 # Agent events always get the highest priority + EXTENSION_EVENT_NEW_PIPELINE = 2 # Prioritize extensions using the new dedicated pipeline over extensions hijacking the agent pipeline + EXTENSION_EVENT_OLD_PIPELINE = 3 + class TelemetryEvent(DataContract): def __init__(self, eventId=None, providerId=None, priority=TelemetryEventPriorities.AGENT_EVENT): self.eventId = eventId # pylint: disable=C0103 From 76f1290428a49f0f0f38509a0e98c7f8d9cb6520 Mon Sep 17 00:00:00 2001 From: larohra Date: Thu, 17 Sep 2020 13:02:23 -0700 Subject: [PATCH 12/63] Code fix and more logging --- azurelinuxagent/common/telemetryevent.py | 12 ++++++------ azurelinuxagent/ga/telemetry_service.py | 2 ++ 2 files changed, 8 insertions(+), 6 deletions(-) diff --git a/azurelinuxagent/common/telemetryevent.py b/azurelinuxagent/common/telemetryevent.py index 8aeb2919c5..a0797f0c23 100644 --- a/azurelinuxagent/common/telemetryevent.py +++ b/azurelinuxagent/common/telemetryevent.py @@ -106,22 +106,22 @@ def __contains__(self, param_name): return param_name in [param.name for param in self.parameters] def __le__(self, other): - raise self.priority <= other.priority + return self.priority <= other.priority def __ge__(self, other): - raise self.priority >= other.priority + return self.priority >= other.priority def __eq__(self, other): - raise self.priority == other.priority + return self.priority == other.priority def __lt__(self, other): - raise self.priority < other.priority + return self.priority < other.priority def __gt__(self, other): - raise self.priority > other.priority + return self.priority > other.priority def __ne__(self, other): - raise self.priority != other.priority + return self.priority != other.priority @property def priority(self): diff --git a/azurelinuxagent/ga/telemetry_service.py b/azurelinuxagent/ga/telemetry_service.py index 
f85a023d19..8b24497f11 100644 --- a/azurelinuxagent/ga/telemetry_service.py +++ b/azurelinuxagent/ga/telemetry_service.py @@ -127,6 +127,8 @@ def get_events(self): # logger.verbose("Fetched event Priority: {0}, Counter: {1}, Event: {2}".format(_, __, event)) logger.verbose("Fetched event Priority: {0}, Event: {1}".format(event.priority if event is not None else 100, event)) yield event + except Exception as e: + logger.error("Some exception: {0}, now the event will be None".format(ustr(e))) finally: # Mark the event as processed once done logger.verbose("Finished working with event {0}".format(event)) From e0eb752ef871868909a9d88e910fa6a98cf13511 Mon Sep 17 00:00:00 2001 From: larohra Date: Thu, 17 Sep 2020 13:27:44 -0700 Subject: [PATCH 13/63] Removed dead code --- azurelinuxagent/common/event.py | 16 +---------- azurelinuxagent/common/telemetryevent.py | 1 + azurelinuxagent/ga/telemetry_service.py | 36 ++++-------------------- 3 files changed, 7 insertions(+), 46 deletions(-) diff --git a/azurelinuxagent/common/event.py b/azurelinuxagent/common/event.py index 55b0992d90..7446017088 100644 --- a/azurelinuxagent/common/event.py +++ b/azurelinuxagent/common/event.py @@ -281,18 +281,6 @@ def _log_event(name, op, message, duration, is_success=True): # pylint: disable= logger.info(_EVENT_MSG, name, op, message, duration) -class TelemetryEventPriorities(object): - """ - Class defining the priorities for telemetry events. Lower the number, higher the priority - - Note: 0 is reserved for a feature like QuickLog in the Windows Agent (i.e. 
the ability to send out telemetry - instantly rather than waiting for a minute for the monitor thread to pick up the events) - """ - AGENT_EVENT = 1 # Agent events always get the highest priority - EXTENSION_EVENT_NEW_PIPELINE = 2 # Prioritize extensions using the new dedicated pipeline over extensions hijacking the agent pipeline - EXTENSION_EVENT_OLD_PIPELINE = 3 - - class EventLogger(object): def __init__(self): self.event_dir = None @@ -626,7 +614,6 @@ def collect_events(self, enqueue_event): # pylint: disable=R0914 event_data = fd.read().decode("utf-8") event = parse_event(event_data) - priority = TelemetryEventPriorities.AGENT_EVENT # "legacy" events are events produced by previous versions of the agent (<= 2.2.46) and extensions; # they do not include all the telemetry fields, so we add them here @@ -640,12 +627,11 @@ def collect_events(self, enqueue_event): # pylint: disable=R0914 if event.is_extension_event(): EventLogger._trim_extension_event_parameters(event) self.add_common_event_parameters(event, event_file_creation_time) - priority = TelemetryEventPriorities.EXTENSION_EVENT_OLD_PIPELINE else: self._update_legacy_agent_event(event, event_file_creation_time) # event_list.events.append(event) - enqueue_event(event, priority) + enqueue_event(event) finally: os.remove(event_file_path) except UnicodeError as e: # pylint: disable=C0103 diff --git a/azurelinuxagent/common/telemetryevent.py b/azurelinuxagent/common/telemetryevent.py index a0797f0c23..ea6250d733 100644 --- a/azurelinuxagent/common/telemetryevent.py +++ b/azurelinuxagent/common/telemetryevent.py @@ -93,6 +93,7 @@ class TelemetryEventPriorities(object): EXTENSION_EVENT_NEW_PIPELINE = 2 # Prioritize extensions using the new dedicated pipeline over extensions hijacking the agent pipeline EXTENSION_EVENT_OLD_PIPELINE = 3 + class TelemetryEvent(DataContract): def __init__(self, eventId=None, providerId=None, priority=TelemetryEventPriorities.AGENT_EVENT): self.eventId = eventId # pylint: 
disable=C0103 diff --git a/azurelinuxagent/ga/telemetry_service.py b/azurelinuxagent/ga/telemetry_service.py index 8b24497f11..1c8a983f67 100644 --- a/azurelinuxagent/ga/telemetry_service.py +++ b/azurelinuxagent/ga/telemetry_service.py @@ -29,24 +29,6 @@ def get_telemetry_service_handler(protocol_util): return TelemetryServiceHandler(protocol_util) -class QueueCounter(object): - def __init__(self): - self._value = 0 - self._lock = threading.Lock() - - def increment(self): - with self._lock: - self._value += 1 - - @property - def value(self): - return self._value - - def reset(self): - with self._lock: - self._value = 0 - - class TelemetryServiceHandler(object): """ This Handler takes care of sending all telemetry out of the agent to Wireserver. It sends out data as soon as @@ -62,10 +44,6 @@ def __init__(self, protocol_util): self.thread = None self._should_process_events = threading.Event() self._queue = PriorityQueue() - # The basic PriorityQueue sorts based on the 2nd item if the priority is same, but since TelemetryEvent is not orderable, it throws. - # This property takes care of that collision by maintaining a counter to ensure that collision can never occur. 
- # It is reset every time the threading event self._should_process_events is unset - self._queue_counter = QueueCounter() @staticmethod def get_thread_name(): @@ -95,13 +73,11 @@ def stop(self): def stopped(self): return not self.should_run - def enqueue_event(self, event, priority): + def enqueue_event(self, event): # Add event to queue and set event self._queue.put(event) # self._queue.put((priority, self._queue_counter.value, event)) - logger.verbose( - "Added event Priority: {0}, Counter: {1}, Event: {2}".format(priority, self._queue_counter.value, event)) - self._queue_counter.increment() + logger.verbose("Added event Priority: {0}, Event: {1}", event.priority, event) # Always set the event if any enqueue happens (even if already set) self._should_process_events.set() @@ -136,14 +112,12 @@ def get_events(self): def send_events_in_queue(self): # Process everything in Queue - logger.verbose("Processing data in the telemetry service queue, approx qsize: {0}; Counter val: {1}", - self._queue.qsize(), self._queue_counter.value) + logger.verbose("Processing data in the telemetry service queue, approx qsize: {0}", self._queue.qsize()) if not self._queue.empty(): self._protocol.report_event(self.get_events) # Clear event when done # There might be a rare race condition where the loop exits and we get a new event, in that case not unsetting the event. 
if self._should_process_events.is_set() and self._queue.empty(): - logger.verbose("Resetting the event and counter with val: {0}", self._queue_counter.value) - self._should_process_events.clear() - self._queue_counter.reset() \ No newline at end of file + logger.verbose("Resetting the event") + self._should_process_events.clear() \ No newline at end of file From d45d73e5b30a319377c02286a668dd3e1324623c Mon Sep 17 00:00:00 2001 From: larohra Date: Thu, 17 Sep 2020 17:03:24 -0700 Subject: [PATCH 14/63] Made ETP code compatible with new telemetry service --- azurelinuxagent/ga/extension_telemetry.py | 113 +++++++++++----------- azurelinuxagent/ga/exthandlers.py | 2 +- azurelinuxagent/ga/update.py | 2 +- tests/ga/test_extension_telemetry.py | 3 +- 4 files changed, 58 insertions(+), 62 deletions(-) diff --git a/azurelinuxagent/ga/extension_telemetry.py b/azurelinuxagent/ga/extension_telemetry.py index 82d871d12e..8cdd4ebc78 100644 --- a/azurelinuxagent/ga/extension_telemetry.py +++ b/azurelinuxagent/ga/extension_telemetry.py @@ -29,14 +29,14 @@ TELEMETRY_LOG_PROVIDER_ID, add_event, WALAEventOperation, add_log_event, get_event_logger from azurelinuxagent.common.exception import InvalidExtensionEventError from azurelinuxagent.common.future import ustr -from azurelinuxagent.common.telemetryevent import TelemetryEventList, TelemetryEvent, TelemetryEventParam, \ - GuestAgentGenericLogsSchema +from azurelinuxagent.common.telemetryevent import TelemetryEvent, TelemetryEventParam, \ + GuestAgentGenericLogsSchema, TelemetryEventPriorities from azurelinuxagent.ga.exthandlers import HANDLER_NAME_PATTERN from azurelinuxagent.ga.periodic_operation import PeriodicOperation -def get_extension_telemetry_handler(protocol_util): - return ExtensionTelemetryHandler(protocol_util) +def get_extension_telemetry_handler(protocol_util, telemetry_handler): + return ExtensionTelemetryHandler(protocol_util, telemetry_handler) class ExtensionEventSchema(object): # pylint: disable=R0903 """ @@ 
-68,45 +68,37 @@ class ProcessExtensionTelemetry(PeriodicOperation): _EXTENSION_EVENT_REQUIRED_FIELDS = [attr.lower() for attr in dir(ExtensionEventSchema) if not callable(getattr(ExtensionEventSchema, attr)) and not attr.startswith("__")] - def __init__(self, protocol_util): + def __init__(self, protocol_util, enqueue_event): super(ProcessExtensionTelemetry, self).__init__( - name="collect and send extension events", - operation=self._collect_and_send_events, + name="collect_and_enqueue_extension_events", + operation=self._collect_and_enqueue_extension_events, period=ProcessExtensionTelemetry._EXTENSION_EVENT_COLLECTION_PERIOD) self._protocol = protocol_util.get_protocol() + self._enqueue_event = enqueue_event - def _collect_and_send_events(self): - event_list = self._collect_extension_events() - - if len(event_list.events) > 0: # pylint: disable=C1801 - self._protocol.report_event(event_list) - - def _collect_extension_events(self): - events_list = TelemetryEventList() + def _collect_and_enqueue_extension_events(self): extension_handler_with_event_dirs = [] try: extension_handler_with_event_dirs = self._get_extension_events_dir_with_handler_name(conf.get_ext_log_dir()) - if len(extension_handler_with_event_dirs) == 0: # pylint: disable=C1801 + if not extension_handler_with_event_dirs: logger.verbose("No Extension events directory exist") - return events_list + return for extension_handler_with_event_dir in extension_handler_with_event_dirs: handler_name = extension_handler_with_event_dir[0] handler_event_dir_path = extension_handler_with_event_dir[1] - self._capture_extension_events(handler_name, handler_event_dir_path, events_list) - except Exception as e: # pylint: disable=C0103 - msg = "Unknown error occurred when trying to collect extension events. Error: {0}".format(ustr(e)) + self._capture_extension_events(handler_name, handler_event_dir_path) + except Exception as error: + msg = "Unknown error occurred when trying to collect extension events. 
Error: {0}".format(ustr(error)) add_event(op=WALAEventOperation.ExtensionTelemetryEventProcessing, message=msg, is_success=False) finally: # Always ensure that the events directory are being deleted each run, # even if we run into an error and dont process them this run. self._ensure_all_events_directories_empty(extension_handler_with_event_dirs) - return events_list - @staticmethod def _get_extension_events_dir_with_handler_name(extension_log_dir): """ @@ -129,16 +121,26 @@ def _get_extension_events_dir_with_handler_name(extension_log_dir): return extension_handler_with_event_dirs - def _capture_extension_events(self, handler_name, handler_event_dir_path, events_list): # pylint: disable=R0914 + def _event_file_size_allowed(self, event_file_path): + + event_file_size = os.stat(event_file_path).st_size + if event_file_size > self._EXTENSION_EVENT_FILE_MAX_SIZE: + convert_to_mb = lambda x: (1.0 * x) / (1000 * 1000) + msg = "Skipping file: {0} as its size is {1:.2f} Mb > Max size allowed {2:.1f} Mb".format( + event_file_path, convert_to_mb(event_file_size), + convert_to_mb(self._EXTENSION_EVENT_FILE_MAX_SIZE)) + logger.warn(msg) + add_log_event(level=logger.LogLevel.WARNING, message=msg, forced=True) + return False + return True + + def _capture_extension_events(self, handler_name, handler_event_dir_path): """ Capture Extension events and add them to the events_list :param handler_name: Complete Handler Name. Eg: Microsoft.CPlat.Core.RunCommandLinux :param handler_event_dir_path: Full path. 
Eg: '/var/log/azure/Microsoft.CPlat.Core.RunCommandLinux/events' - :param events_list: List of captured extension events """ - convert_to_mb = lambda x: (1.0 * x)/(1000 * 1000) - # Filter out the files that do not follow the pre-defined EXTENSION_EVENT_FILE_NAME_REGEX event_files = [event_file for event_file in os.listdir(handler_event_dir_path) if re.match(self._EXTENSION_EVENT_FILE_NAME_REGEX, event_file) is not None] @@ -154,22 +156,13 @@ def _capture_extension_events(self, handler_name, handler_event_dir_path, events try: logger.verbose("Processing event file: {0}", event_file_path) - # We only support _EXTENSION_EVENT_FILE_MAX_SIZE=4Mb max file size - event_file_size = os.stat(event_file_path).st_size - if event_file_size > self._EXTENSION_EVENT_FILE_MAX_SIZE: - msg = "Skipping file: {0} as its size is {1:.2f} Mb > Max size allowed {2:.1f} Mb".format( - event_file_path, convert_to_mb(event_file_size), - convert_to_mb(self._EXTENSION_EVENT_FILE_MAX_SIZE)) - logger.warn(msg) - add_log_event(level=logger.LogLevel.WARNING, message=msg, forced=True) + if not self._event_file_size_allowed(event_file_path): continue # We support multiple events in a file, read the file and parse events. 
- parsed_events = self._parse_event_file_and_capture_events(handler_name, event_file_path, - captured_extension_events_count, - dropped_events_with_error_count) - events_list.events.extend(parsed_events) - captured_extension_events_count += len(parsed_events) + captured_extension_events_count = self._get_captured_events_count(handler_name, event_file_path, + captured_extension_events_count, + dropped_events_with_error_count) # We only allow MAX_NUMBER_OF_EVENTS_PER_EXTENSION_PER_PERIOD=300 maximum events per period per handler if captured_extension_events_count >= self._MAX_NUMBER_OF_EVENTS_PER_EXTENSION_PER_PERIOD: @@ -179,14 +172,14 @@ def _capture_extension_events(self, handler_name, handler_event_dir_path, events add_log_event(level=logger.LogLevel.WARNING, message=msg, forced=True) break - except Exception as e: # pylint: disable=C0103 - msg = "Failed to process event file {0}: {1}", event_file, ustr(e) + except Exception as error: + msg = "Failed to process event file {0}: {1}", event_file, ustr(error) logger.warn(msg) add_log_event(level=logger.LogLevel.WARNING, message=msg, forced=True) finally: os.remove(event_file_path) - if dropped_events_with_error_count is not None and len(dropped_events_with_error_count) > 0: # pylint: disable=C1801 + if dropped_events_with_error_count: msg = "Dropped events for Extension: {0}; Details:\n\t{1}".format(handler_name, '\n\t'.join( ["Reason: {0}; Dropped Count: {1}".format(k, v) for k, v in dropped_events_with_error_count.items()])) logger.warn(msg) @@ -197,7 +190,7 @@ def _capture_extension_events(self, handler_name, handler_event_dir_path, events @staticmethod def _ensure_all_events_directories_empty(extension_events_directories): - if len(extension_events_directories) == 0: # pylint: disable=C1801 + if not extension_events_directories: return for extension_handler_with_event_dir in extension_events_directories: @@ -210,16 +203,16 @@ def _ensure_all_events_directories_empty(extension_events_directories): for 
residue_file in os.listdir(event_dir_path): try: os.remove(os.path.join(event_dir_path, residue_file)) - except Exception as e: # pylint: disable=C0103 + except Exception as error: # Only log the first error once per handler per run if unable to clean off residue files - err = ustr(e) if err is None else err + err = ustr(error) if err is None else err if err is not None: logger.error("Failed to completely clear the {0} directory. Exception: {1}", event_dir_path, err) - def _parse_event_file_and_capture_events(self, handler_name, event_file_path, captured_events_count, - dropped_events_with_error_count): - events_list = [] + def _get_captured_events_count(self, handler_name, event_file_path, captured_events_count, + dropped_events_with_error_count): + event_file_time = datetime.datetime.fromtimestamp(os.path.getmtime(event_file_path)) # Read event file and decode it properly @@ -236,21 +229,21 @@ def _parse_event_file_and_capture_events(self, handler_name, event_file_path, ca for event in events: try: - events_list.append(self._parse_telemetry_event(handler_name, event, event_file_time)) + self._enqueue_event(self._parse_telemetry_event(handler_name, event, event_file_time)) captured_events_count += 1 - except InvalidExtensionEventError as e: # pylint: disable=C0103 + except InvalidExtensionEventError as invalid_error: # These are the errors thrown if there's an error parsing the event. We want to report these back to the # extension publishers so that they are aware of the issues. 
# The error messages are all static messages, we will use this to create a dict and emit an event at the # end of each run to notify if there were any errors parsing events for the extension - dropped_events_with_error_count[ustr(e)] += 1 - except Exception as e: # pylint: disable=C0103 - logger.warn("Unable to parse and transmit event, error: {0}".format(e)) + dropped_events_with_error_count[ustr(invalid_error)] += 1 + except Exception as error: + logger.warn("Unable to parse and transmit event, error: {0}".format(error)) if captured_events_count >= self._MAX_NUMBER_OF_EVENTS_PER_EXTENSION_PER_PERIOD: break - return events_list + return captured_events_count def _parse_telemetry_event(self, handler_name, extension_unparsed_event, event_file_time): """ @@ -263,7 +256,8 @@ def _parse_telemetry_event(self, handler_name, extension_unparsed_event, event_f # Create a telemetry event, add all common parameters to the event # and then overwrite all the common params with extension events params if same - event = TelemetryEvent(TELEMETRY_LOG_EVENT_ID, TELEMETRY_LOG_PROVIDER_ID) + event = TelemetryEvent(TELEMETRY_LOG_EVENT_ID, TELEMETRY_LOG_PROVIDER_ID, + priority=TelemetryEventPriorities.EXTENSION_EVENT_NEW_PIPELINE) event.file_type = "json" self.add_common_params_to_extension_event(event, event_file_time) @@ -306,14 +300,14 @@ def _parse_event_and_ensure_it_is_valid(self, extension_event): raise InvalidExtensionEventError( key_err_msg.format(InvalidExtensionEventError.MissingKeyError, ExtensionEventSchema.Message)) - if event[message_key] is None or len(event[message_key]) == 0: # pylint: disable=C1801 + if not event[message_key]: raise InvalidExtensionEventError( "{0}: {1} should not be empty".format(InvalidExtensionEventError.EmptyMessageError, ExtensionEventSchema.Message)) for required_key in self._EXTENSION_EVENT_REQUIRED_FIELDS: # If all required keys not in event then raise - if not required_key in event: + if required_key not in event: raise 
InvalidExtensionEventError( key_err_msg.format(InvalidExtensionEventError.MissingKeyError, required_key)) @@ -355,10 +349,11 @@ class ExtensionTelemetryHandler(object): _THREAD_NAME = "ExtensionTelemetryHandler" - def __init__(self, protocol_util): + def __init__(self, protocol_util, telemetry_handler): self.protocol_util = protocol_util self.should_run = True self.thread = None + self._enqueue_event = telemetry_handler.enqueue_event @staticmethod def get_thread_name(): @@ -389,7 +384,7 @@ def stopped(self): return not self.should_run def daemon(self): - op = ProcessExtensionTelemetry(self.protocol_util) # pylint: disable=C0103 + op = ProcessExtensionTelemetry(self.protocol_util, self._enqueue_event) # pylint: disable=C0103 logger.info("Successfully started the {0} thread".format(self.get_thread_name())) while not self.stopped(): try: diff --git a/azurelinuxagent/ga/exthandlers.py b/azurelinuxagent/ga/exthandlers.py index 5ab1a6d7f6..4afe0c2346 100644 --- a/azurelinuxagent/ga/exthandlers.py +++ b/azurelinuxagent/ga/exthandlers.py @@ -81,7 +81,7 @@ _NUM_OF_STATUS_FILE_RETRIES = 5 _STATUS_FILE_RETRY_DELAY = 2 # seconds -_ENABLE_EXTENSION_TELEMETRY_PIPELINE = False +_ENABLE_EXTENSION_TELEMETRY_PIPELINE = True def is_extension_telemetry_pipeline_enabled(): return _ENABLE_EXTENSION_TELEMETRY_PIPELINE diff --git a/azurelinuxagent/ga/update.py b/azurelinuxagent/ga/update.py index 10dd3a22ab..039ed09832 100644 --- a/azurelinuxagent/ga/update.py +++ b/azurelinuxagent/ga/update.py @@ -284,7 +284,7 @@ def run(self, debug=False): # pylint: disable=R0912,R0914 if is_extension_telemetry_pipeline_enabled(): # Reuse the same protocol_util as the UpdateHandler class to avoid new initializations - all_thread_handlers.append(get_extension_telemetry_handler(self.protocol_util)) + all_thread_handlers.append(get_extension_telemetry_handler(self.protocol_util, telemetry_handler)) # Launch all monitoring threads for thread_handler in all_thread_handlers: diff --git 
a/tests/ga/test_extension_telemetry.py b/tests/ga/test_extension_telemetry.py index 172059f746..54d9b0da78 100644 --- a/tests/ga/test_extension_telemetry.py +++ b/tests/ga/test_extension_telemetry.py @@ -170,7 +170,8 @@ def http_post_handler(url, body, **__): with mock_wire_protocol(DATA_FILE, http_post_handler=http_post_handler) as protocol: protocol_util = MagicMock() protocol_util.get_protocol = Mock(return_value=protocol) - extension_telemetry_processor = ProcessExtensionTelemetry(protocol_util) + enqueue_event = MagicMock() + extension_telemetry_processor = ProcessExtensionTelemetry(protocol_util, enqueue_event) extension_telemetry_processor.event_body = [] yield extension_telemetry_processor From b55c8e54f100f5f8acc1bcd7c2e4a478e5b88878 Mon Sep 17 00:00:00 2001 From: larohra Date: Thu, 17 Sep 2020 17:22:38 -0700 Subject: [PATCH 15/63] Added more debug data --- azurelinuxagent/ga/extension_telemetry.py | 5 ++++- azurelinuxagent/ga/telemetry_service.py | 2 +- 2 files changed, 5 insertions(+), 2 deletions(-) diff --git a/azurelinuxagent/ga/extension_telemetry.py b/azurelinuxagent/ga/extension_telemetry.py index 8cdd4ebc78..50210b5da6 100644 --- a/azurelinuxagent/ga/extension_telemetry.py +++ b/azurelinuxagent/ga/extension_telemetry.py @@ -23,6 +23,8 @@ import threading from collections import defaultdict +import traceback + import azurelinuxagent.common.logger as logger from azurelinuxagent.common import conf from azurelinuxagent.common.event import EVENTS_DIRECTORY, TELEMETRY_LOG_EVENT_ID, \ @@ -92,7 +94,8 @@ def _collect_and_enqueue_extension_events(self): handler_event_dir_path = extension_handler_with_event_dir[1] self._capture_extension_events(handler_name, handler_event_dir_path) except Exception as error: - msg = "Unknown error occurred when trying to collect extension events. Error: {0}".format(ustr(error)) + msg = "Unknown error occurred when trying to collect extension events. 
Error: {0}, Stack: {1}".format( + ustr(error), traceback.format_exc()) add_event(op=WALAEventOperation.ExtensionTelemetryEventProcessing, message=msg, is_success=False) finally: # Always ensure that the events directory are being deleted each run, diff --git a/azurelinuxagent/ga/telemetry_service.py b/azurelinuxagent/ga/telemetry_service.py index 1c8a983f67..bd732e7a3d 100644 --- a/azurelinuxagent/ga/telemetry_service.py +++ b/azurelinuxagent/ga/telemetry_service.py @@ -77,7 +77,7 @@ def enqueue_event(self, event): # Add event to queue and set event self._queue.put(event) # self._queue.put((priority, self._queue_counter.value, event)) - logger.verbose("Added event Priority: {0}, Event: {1}", event.priority, event) + logger.verbose("Added event for {0}, Priority: {1}, Event: {2}", self.get_thread_name(), event.priority, event) # Always set the event if any enqueue happens (even if already set) self._should_process_events.set() From 1b4a2405ec29f16593640ceba95da6e8659022fb Mon Sep 17 00:00:00 2001 From: larohra Date: Thu, 17 Sep 2020 17:33:07 -0700 Subject: [PATCH 16/63] More logging --- azurelinuxagent/ga/extension_telemetry.py | 1 + 1 file changed, 1 insertion(+) diff --git a/azurelinuxagent/ga/extension_telemetry.py b/azurelinuxagent/ga/extension_telemetry.py index 50210b5da6..2182e2a720 100644 --- a/azurelinuxagent/ga/extension_telemetry.py +++ b/azurelinuxagent/ga/extension_telemetry.py @@ -96,6 +96,7 @@ def _collect_and_enqueue_extension_events(self): except Exception as error: msg = "Unknown error occurred when trying to collect extension events. 
Error: {0}, Stack: {1}".format( ustr(error), traceback.format_exc()) + logger.warn(msg) add_event(op=WALAEventOperation.ExtensionTelemetryEventProcessing, message=msg, is_success=False) finally: # Always ensure that the events directory are being deleted each run, From 27d5ca8ece27b7844571a40d77c040abee729ead Mon Sep 17 00:00:00 2001 From: larohra Date: Thu, 17 Sep 2020 17:53:02 -0700 Subject: [PATCH 17/63] bug fix --- azurelinuxagent/ga/extension_telemetry.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/azurelinuxagent/ga/extension_telemetry.py b/azurelinuxagent/ga/extension_telemetry.py index 2182e2a720..7eee5fa14f 100644 --- a/azurelinuxagent/ga/extension_telemetry.py +++ b/azurelinuxagent/ga/extension_telemetry.py @@ -96,7 +96,6 @@ def _collect_and_enqueue_extension_events(self): except Exception as error: msg = "Unknown error occurred when trying to collect extension events. Error: {0}, Stack: {1}".format( ustr(error), traceback.format_exc()) - logger.warn(msg) add_event(op=WALAEventOperation.ExtensionTelemetryEventProcessing, message=msg, is_success=False) finally: # Always ensure that the events directory are being deleted each run, @@ -177,7 +176,7 @@ def _capture_extension_events(self, handler_name, handler_event_dir_path): break except Exception as error: - msg = "Failed to process event file {0}: {1}", event_file, ustr(error) + msg = "Failed to process event file {0}: {1}".format(event_file, ustr(error)) logger.warn(msg) add_log_event(level=logger.LogLevel.WARNING, message=msg, forced=True) finally: From b0198400666aa864af6b9f4ebb1324aaefb525e2 Mon Sep 17 00:00:00 2001 From: larohra Date: Fri, 18 Sep 2020 10:52:53 -0700 Subject: [PATCH 18/63] Improved logging --- azurelinuxagent/common/protocol/wire.py | 11 ++++++----- azurelinuxagent/ga/extension_telemetry.py | 3 ++- tests/ga/test_extension_telemetry.py | 2 +- 3 files changed, 9 insertions(+), 7 deletions(-) diff --git a/azurelinuxagent/common/protocol/wire.py 
b/azurelinuxagent/common/protocol/wire.py index 54a1205deb..f46bbd91d7 100644 --- a/azurelinuxagent/common/protocol/wire.py +++ b/azurelinuxagent/common/protocol/wire.py @@ -23,6 +23,7 @@ import time import traceback import xml.sax.saxutils as saxutils +from collections import defaultdict from datetime import datetime # pylint: disable=ungrouped-imports import azurelinuxagent.common.conf as conf @@ -1101,7 +1102,7 @@ def send_encoded_event(self, provider_id, event_str, encoding='utf-8'): def report_event(self, get_events): max_send_errors_to_report = 5 buf = {} - events_per_request = 0 + events_per_request = defaultdict(int) unicode_error_count, unicode_errors = 0, [] event_report_error_count, event_report_errors = 0, [] @@ -1121,12 +1122,12 @@ def report_event(self, get_events): .format(str(details_of_event), len(event_str), MAX_EVENT_BUFFER_SIZE)) continue if len(buf[event.providerId] + event_str) >= MAX_EVENT_BUFFER_SIZE: + logger.verbose("No of events this request = {0}".format(events_per_request[event.providerId])) self.send_encoded_event(event.providerId, buf[event.providerId]) buf[event.providerId] = b'' - logger.verbose("No of events this request = {0}".format(events_per_request)) - events_per_request = 0 + events_per_request[event.providerId] = 0 buf[event.providerId] = buf[event.providerId] + event_str - events_per_request += 1 + events_per_request[event.providerId] += 1 except UnicodeError as e: # pylint: disable=C0103 unicode_error_count += 1 if len(unicode_errors) < max_send_errors_to_report: @@ -1146,7 +1147,7 @@ def report_event(self, get_events): # Send out all events left in buffer. 
for provider_id in list(buf.keys()): if len(buf[provider_id]) > 0: # pylint: disable=len-as-condition - logger.verbose("No of events this request = {0}".format(events_per_request)) + logger.verbose("No of events this request = {0}".format(events_per_request[provider_id])) self.send_encoded_event(provider_id, buf[provider_id]) def report_status_event(self, message, is_success): diff --git a/azurelinuxagent/ga/extension_telemetry.py b/azurelinuxagent/ga/extension_telemetry.py index 7eee5fa14f..c8be13297f 100644 --- a/azurelinuxagent/ga/extension_telemetry.py +++ b/azurelinuxagent/ga/extension_telemetry.py @@ -176,7 +176,8 @@ def _capture_extension_events(self, handler_name, handler_event_dir_path): break except Exception as error: - msg = "Failed to process event file {0}: {1}".format(event_file, ustr(error)) + msg = "Failed to process event file {0}: {1}, {2}".format(event_file, ustr(error), + traceback.format_exc()) logger.warn(msg) add_log_event(level=logger.LogLevel.WARNING, message=msg, forced=True) finally: diff --git a/tests/ga/test_extension_telemetry.py b/tests/ga/test_extension_telemetry.py index 54d9b0da78..4373af13ca 100644 --- a/tests/ga/test_extension_telemetry.py +++ b/tests/ga/test_extension_telemetry.py @@ -23,7 +23,7 @@ from azurelinuxagent.ga.extension_telemetry import ExtensionEventSchema, ProcessExtensionTelemetry from tests.protocol.mocks import mock_wire_protocol, HttpRequestPredicates, MockHttpResponse from tests.protocol.mockwiredata import DATA_FILE -from tests.tools import AgentTestCase, clear_singleton_instances, data_dir, skip_if_predicate_true # pylint: disable=unused-import +from tests.tools import AgentTestCase, clear_singleton_instances, data_dir class TestExtensionTelemetryHandler(AgentTestCase, HttpRequestPredicates): From b8a724836c5945571b5355e55e82be89a0ca7b7d Mon Sep 17 00:00:00 2001 From: larohra Date: Fri, 18 Sep 2020 17:36:27 -0700 Subject: [PATCH 19/63] Fixed some failing tests --- azurelinuxagent/common/event.py | 4 +- 
azurelinuxagent/ga/exthandlers.py | 2 +- azurelinuxagent/ga/monitor.py | 16 +++---- azurelinuxagent/ga/update.py | 2 +- tests/common/test_event.py | 79 ++++++++++++++++++------------- tests/ga/test_monitor.py | 21 ++++---- tests/protocol/test_wire.py | 34 ++++++++----- 7 files changed, 93 insertions(+), 65 deletions(-) diff --git a/azurelinuxagent/common/event.py b/azurelinuxagent/common/event.py index 7446017088..40b7cc7a88 100644 --- a/azurelinuxagent/common/event.py +++ b/azurelinuxagent/common/event.py @@ -593,7 +593,7 @@ def collect_events(self, enqueue_event): # pylint: disable=R0914 from the events directory. """ max_collect_errors_to_report = 5 - event_list = TelemetryEventList() + # event_list = TelemetryEventList() event_directory_full_path = os.path.join(conf.get_lib_dir(), EVENTS_DIRECTORY) event_files = os.listdir(event_directory_full_path) unicode_error_count, unicode_errors = 0, [] @@ -649,7 +649,7 @@ def collect_events(self, enqueue_event): # pylint: disable=R0914 WALAEventOperation.CollectEventUnicodeErrors, max_collect_errors_to_report) - return event_list + # return event_list def _update_legacy_agent_event(self, event, event_creation_time): # Ensure that if an agent event is missing a field from the schema defined since 2.2.47, the missing fields diff --git a/azurelinuxagent/ga/exthandlers.py b/azurelinuxagent/ga/exthandlers.py index 4afe0c2346..5ab1a6d7f6 100644 --- a/azurelinuxagent/ga/exthandlers.py +++ b/azurelinuxagent/ga/exthandlers.py @@ -81,7 +81,7 @@ _NUM_OF_STATUS_FILE_RETRIES = 5 _STATUS_FILE_RETRY_DELAY = 2 # seconds -_ENABLE_EXTENSION_TELEMETRY_PIPELINE = True +_ENABLE_EXTENSION_TELEMETRY_PIPELINE = False def is_extension_telemetry_pipeline_enabled(): return _ENABLE_EXTENSION_TELEMETRY_PIPELINE diff --git a/azurelinuxagent/ga/monitor.py b/azurelinuxagent/ga/monitor.py index 0c12bc6ab8..841f3684e9 100644 --- a/azurelinuxagent/ga/monitor.py +++ b/azurelinuxagent/ga/monitor.py @@ -38,8 +38,8 @@ from 
azurelinuxagent.ga.periodic_operation import PeriodicOperation -def get_monitor_handler(telemetry_handler): - return MonitorHandler(telemetry_handler) +def get_monitor_handler(enqueue_event): + return MonitorHandler(enqueue_event) class PollResourceUsageOperation(PeriodicOperation): @@ -116,12 +116,12 @@ class CollectAndEnqueueEventsPeriodicOperation(PeriodicOperation): _EVENT_COLLECTION_PERIOD = datetime.timedelta(minutes=1) - def __init__(self, telemetry_handler): + def __init__(self, enqueue_event): super(CollectAndEnqueueEventsPeriodicOperation, self).__init__( name="collect_and_enqueue_events", operation=self.collect_and_enqueue_events, period=CollectAndEnqueueEventsPeriodicOperation._EVENT_COLLECTION_PERIOD) - self.enqueue_events = telemetry_handler.enqueue_event + self.enqueue_events = enqueue_event def collect_and_enqueue_events(self): """ @@ -133,8 +133,8 @@ def collect_and_enqueue_events(self): # if len(event_list.events) > 0: # pylint: disable=len-as-condition # self.protocol.report_event(event_list) - except Exception as e: # pylint: disable=C0103 - err_msg = "Failure in collecting Agent events: {0}".format(ustr(e)) + except Exception as error: + err_msg = "Failure in collecting Agent events: {0}".format(ustr(error)) add_event(op=WALAEventOperation.UnhandledError, message=err_msg, is_success=False) @@ -200,14 +200,14 @@ class MonitorHandler(object): # pylint: disable=R0902 def get_thread_name(): return MonitorHandler._THREAD_NAME - def __init__(self, telemetry_handler): + def __init__(self, enqueue_event): self.osutil = get_osutil() self.imds_client = None self.event_thread = None self._periodic_operations = [ ResetPeriodicLogMessagesOperation(), - CollectAndEnqueueEventsPeriodicOperation(telemetry_handler), + CollectAndEnqueueEventsPeriodicOperation(enqueue_event), ReportNetworkErrorsOperation(), PollResourceUsageOperation(), PeriodicOperation("send_host_plugin_heartbeat", self.send_host_plugin_heartbeat, self.HOST_PLUGIN_HEARTBEAT_PERIOD), diff --git 
a/azurelinuxagent/ga/update.py b/azurelinuxagent/ga/update.py index 039ed09832..661dadfe34 100644 --- a/azurelinuxagent/ga/update.py +++ b/azurelinuxagent/ga/update.py @@ -277,7 +277,7 @@ def run(self, debug=False): # pylint: disable=R0912,R0914 # Get all thread handlers telemetry_handler = get_telemetry_service_handler(self.protocol_util) all_thread_handlers = [ - get_monitor_handler(telemetry_handler), + get_monitor_handler(telemetry_handler.enqueue_event), get_env_handler(), telemetry_handler ] diff --git a/tests/common/test_event.py b/tests/common/test_event.py index 19b49a3c5a..8d6c9e66e9 100644 --- a/tests/common/test_event.py +++ b/tests/common/test_event.py @@ -83,6 +83,20 @@ def setUp(self): GuestAgentExtensionEventsSchema.ExtensionType: "" } + @staticmethod + def _report_events(protocol, event_list): + def _yield_events(): + for telemetry_event in event_list: + yield telemetry_event + + protocol.client.report_event(_yield_events) + + @staticmethod + def _collect_events(): + event_list = [] + event.collect_events(lambda telemetry_event: event_list.append(telemetry_event)) + return event_list + @staticmethod def _is_guest_extension_event(event): # pylint: disable=redefined-outer-name return event.eventId == TELEMETRY_EVENT_EVENT_ID and event.providerId == TELEMETRY_EVENT_PROVIDER_ID @@ -94,24 +108,24 @@ def _is_telemetry_log_event(event): # pylint: disable=redefined-outer-name def test_parse_xml_event(self, *args): # pylint: disable=unused-argument data_str = load_data('ext/event_from_extension.xml') event = parse_xml_event(data_str) # pylint: disable=redefined-outer-name - self.assertNotEqual(None, event) + self.assertIsNotNone(event) self.assertNotEqual(0, event.parameters) self.assertTrue(all(param is not None for param in event.parameters)) def test_parse_json_event(self, *args): # pylint: disable=unused-argument data_str = load_data('ext/event.json') event = parse_json_event(data_str) # pylint: disable=redefined-outer-name - self.assertNotEqual(None, 
event) + self.assertIsNotNone(event) self.assertNotEqual(0, event.parameters) self.assertTrue(all(param is not None for param in event.parameters)) def test_add_event_should_use_the_container_id_from_the_most_recent_goal_state(self): def create_event_and_return_container_id(): # pylint: disable=inconsistent-return-statements event.add_event(name='Event') - event_list = event.collect_events() - self.assertEquals(len(event_list.events), 1, "Could not find the event created by add_event") # pylint: disable=deprecated-method + event_list = self._collect_events() + self.assertEquals(len(event_list), 1, "Could not find the event created by add_event") # pylint: disable=deprecated-method - for p in event_list.events[0].parameters: # pylint: disable=invalid-name + for p in event_list[0].parameters: # pylint: disable=invalid-name if p.name == CommonTelemetryEventSchema.ContainerId: return p.value @@ -346,10 +360,10 @@ def test_collect_events_should_delete_event_files(self): event_files = os.listdir(self.event_dir) self.assertEquals(len(event_files), 3, "Did not find all the event files that were created") # pylint: disable=deprecated-method - event_list = event.collect_events() + event_list = self._collect_events() event_files = os.listdir(self.event_dir) - self.assertEquals(len(event_list.events), 3, "Did not collect all the events that were created") # pylint: disable=deprecated-method + self.assertEquals(len(event_list), 3, "Did not collect all the events that were created") # pylint: disable=deprecated-method self.assertEquals(len(event_files), 0, "The event files were not deleted") # pylint: disable=deprecated-method def test_save_event(self): @@ -371,10 +385,11 @@ def _get_event_message(evt): def test_collect_events_should_be_able_to_process_events_with_non_ascii_characters(self): self._create_test_event_file("custom_script_nonascii_characters.tld") - event_list = event.collect_events() + event_list = self._collect_events() - self.assertEquals(len(event_list.events), 
1) # pylint: disable=deprecated-method - self.assertEquals(TestEvent._get_event_message(event_list.events[0]), u'World\u05e2\u05d9\u05d5\u05ea \u05d0\u05d7\u05e8\u05d5\u05ea\u0906\u091c') # pylint: disable=deprecated-method + self.assertEquals(len(event_list), 1) # pylint: disable=deprecated-method + self.assertEquals(TestEvent._get_event_message(event_list[0]), + u'World\u05e2\u05d9\u05d5\u05ea \u05d0\u05d7\u05e8\u05d5\u05ea\u0906\u091c') # pylint: disable=deprecated-method def test_collect_events_should_ignore_invalid_event_files(self): self._create_test_event_file("custom_script_1.tld") # a valid event @@ -384,12 +399,12 @@ def test_collect_events_should_ignore_invalid_event_files(self): self._create_test_event_file("custom_script_2.tld") # another valid event with patch("azurelinuxagent.common.event.add_event") as mock_add_event: - event_list = event.collect_events() + event_list = self._collect_events() self.assertEquals( # pylint: disable=deprecated-method - len(event_list.events), 2) + len(event_list), 2) self.assertTrue( - all(TestEvent._get_event_message(evt) == "A test telemetry message." for evt in event_list.events), + all(TestEvent._get_event_message(evt) == "A test telemetry message." 
for evt in event_list), "The valid events were not found") invalid_events = [] @@ -513,14 +528,13 @@ def _test_create_event_function_should_create_events_that_have_all_the_parameter create_event_function() timestamp_upper = TestEvent._datetime_to_event_timestamp(datetime.utcnow()) - # retrieve the event that was created - event_list = event.collect_events() + event_list = self._collect_events() - self.assertEquals(len(event_list.events), 1) # pylint: disable=deprecated-method + self.assertEquals(len(event_list), 1) # pylint: disable=deprecated-method # verify the event parameters self._assert_event_includes_all_parameters_in_the_telemetry_schema( - event_list.events[0], + event_list[0], expected_parameters, assert_timestamp=lambda timestamp: self.assertTrue(timestamp_lower <= timestamp <= timestamp_upper, "The event timestamp (opcode) is incorrect") @@ -589,8 +603,8 @@ def test_add_log_event_should_always_create_events_when_forced(self): def test_add_log_event_should_not_create_event_if_not_allowed_and_not_forced(self): add_log_event(logger.LogLevel.WARNING, 'A test WARNING log event') - event_list = event.collect_events() - self.assertEquals(len(event_list.events), 0, "No events should be created if not forced and not allowed") # pylint: disable=deprecated-method + event_list = self._collect_events() + self.assertEquals(len(event_list), 0, "No events should be created if not forced and not allowed") # pylint: disable=deprecated-method def test_report_metric_should_create_events_that_have_all_the_parameters_in_the_telemetry_schema(self): self._test_create_event_function_should_create_events_that_have_all_the_parameters_in_the_telemetry_schema( @@ -617,12 +631,12 @@ def test_collect_events_should_add_all_the_parameters_in_the_telemetry_schema_to # only a subset of fields; the rest are added by the current agent when events are collected. 
self._create_test_event_file("legacy_agent.tld") - event_list = event.collect_events() + event_list = self._collect_events() - self.assertEquals(len(event_list.events), 1) # pylint: disable=deprecated-method + self.assertEquals(len(event_list), 1) # pylint: disable=deprecated-method self._assert_event_includes_all_parameters_in_the_telemetry_schema( - event_list.events[0], + event_list[0], expected_parameters={ GuestAgentExtensionEventsSchema.Name: "WALinuxAgent", GuestAgentExtensionEventsSchema.Version: "9.9.9", @@ -647,12 +661,12 @@ def test_collect_events_should_use_the_file_creation_time_for_legacy_agent_event event_creation_time = TestEvent._get_file_creation_timestamp(test_file) - event_list = event.collect_events() + event_list = self._collect_events() - self.assertEquals(len(event_list.events), 1) # pylint: disable=deprecated-method + self.assertEquals(len(event_list), 1) # pylint: disable=deprecated-method self._assert_event_includes_all_parameters_in_the_telemetry_schema( - event_list.events[0], + event_list[0], expected_parameters={ GuestAgentExtensionEventsSchema.Name: "WALinuxAgent", GuestAgentExtensionEventsSchema.Version: "9.9.9", @@ -679,12 +693,12 @@ def _assert_extension_event_includes_all_parameters_in_the_telemetry_schema(self event_creation_time = TestEvent._get_file_creation_timestamp(test_file) - event_list = event.collect_events() + event_list = self._collect_events() - self.assertEquals(len(event_list.events), 1) # pylint: disable=deprecated-method + self.assertEquals(len(event_list), 1) # pylint: disable=deprecated-method self._assert_event_includes_all_parameters_in_the_telemetry_schema( - event_list.events[0], + event_list[0], expected_parameters={ GuestAgentExtensionEventsSchema.Name: 'Microsoft.Azure.Extensions.CustomScript', GuestAgentExtensionEventsSchema.Version: '2.0.4', @@ -762,9 +776,10 @@ def http_post_handler(url, body, **__): event_file_path = self._create_test_event_file("event_with_callstack.waagent.tld") expected_message = 
get_event_message_from_event_file(event_file_path) - event_list = event.collect_events() + event_list = self._collect_events() + - protocol.client.report_event(event_list) + self._report_events(protocol, event_list) event_message = get_event_message_from_http_request_body(http_post_handler.request_body) @@ -799,8 +814,8 @@ def http_post_handler(url, body, **__): ] for msg in test_messages: add_event('TestEventEncoding', message=msg) - event_list = event.collect_events() - protocol.client.report_event(event_list) + event_list = self._collect_events() + self._report_events(protocol, event_list) # In Py2, encode() produces a str and in py3 it produces a bytes string. # type(bytes) == type(str) for Py2 so this check is mainly for Py3 to ensure that the event is encoded properly. self.assertIsInstance(http_post_handler.request_body, bytes, "The Event request body should be encoded") diff --git a/tests/ga/test_monitor.py b/tests/ga/test_monitor.py index 7735e4d9b9..c07fe957db 100644 --- a/tests/ga/test_monitor.py +++ b/tests/ga/test_monitor.py @@ -68,6 +68,10 @@ def _create_monitor_handler(enabled_operations=[], iterations=1): # pylint: disa * run_and_wait() - invokes run() and wait() on the MonitorHandler """ + event_list = [] + def _enqueue_events(telemetry_event): + event_list.append(telemetry_event) + def run(self): if len(enabled_operations) == 0 or self._name in enabled_operations: # pylint: disable=protected-access,len-as-condition run.original_definition(self) @@ -84,9 +88,10 @@ def run_and_wait(): monitor_handler.run() monitor_handler.join() - monitor_handler = get_monitor_handler() + monitor_handler = get_monitor_handler(_enqueue_events) monitor_handler.get_mock_wire_protocol = lambda: protocol monitor_handler.run_and_wait = run_and_wait + monitor_handler.event_list = event_list yield monitor_handler @@ -221,12 +226,12 @@ def _assert_error_event_reported(self, mock_add_event, expected_msg): self.assertTrue(found_msg, "Error event not reported") 
@patch("azurelinuxagent.common.event.TELEMETRY_EVENT_PROVIDER_ID", _TEST_EVENT_PROVIDER_ID) - @patch("azurelinuxagent.common.protocol.wire.WireClient.send_encoded_event") + # @patch("azurelinuxagent.common.protocol.wire.WireClient.send_encoded_event") @patch("azurelinuxagent.common.conf.get_lib_dir") - def test_collect_and_send_events(self, mock_lib_dir, patch_send_event, *_): + def test_collect_and_send_events(self, mock_lib_dir, *_): mock_lib_dir.return_value = self.lib_dir - with _create_monitor_handler(enabled_operations=["collect_and_send_events"]) as monitor_handler: + with _create_monitor_handler(enabled_operations=["collect_and_enqueue_events"]) as monitor_handler: self._create_extension_event(message="Message-Test") test_mtime = 1000 # epoch time, in ms @@ -242,8 +247,8 @@ def test_collect_and_send_events(self, mock_lib_dir, patch_send_event, *_): monitor_handler.run_and_wait() # Validating the crafted message by the collect_and_send_events call. - self.assertEqual(1, patch_send_event.call_count) - send_event_call_args = monitor_handler.get_mock_wire_protocol().client.send_encoded_event.call_args[0] # pylint: disable=no-member + self.assertEqual(1, len(monitor_handler.event_list)) + collected_event = monitor_handler.get_mock_wire_protocol().client.send_encoded_event.call_args[0] # pylint: disable=no-member # Some of those expected values come from the mock protocol and imds client set up during test initialization osutil = get_osutil() @@ -282,7 +287,7 @@ def test_collect_and_send_events(self, mock_lib_dir, patch_send_event, *_): osutil.get_processor_cores()) self.maxDiff = None # pylint: disable=invalid-name - self.assertEqual(sample_message.encode('utf-8'), send_event_call_args[1]) + self.assertEqual(sample_message.encode('utf-8'), collected_event[1]) @patch("azurelinuxagent.common.protocol.wire.WireClient.send_encoded_event") @patch("azurelinuxagent.common.conf.get_lib_dir") @@ -588,7 +593,7 @@ class TestMonitorFailure(AgentTestCase): 
@patch("azurelinuxagent.common.protocol.healthservice.HealthService.report_host_plugin_heartbeat") def test_error_heartbeat_creates_no_signal(self, patch_report_heartbeat, patch_http_get, patch_add_event, *args): # pylint: disable=unused-argument - monitor_handler = get_monitor_handler() + monitor_handler = get_monitor_handler(MagicMock()) protocol = WireProtocol('endpoint') protocol.update_goal_state = MagicMock() with patch('azurelinuxagent.common.protocol.util.ProtocolUtil.get_protocol', return_value=protocol): diff --git a/tests/protocol/test_wire.py b/tests/protocol/test_wire.py index ddd5a31f07..1accae43ac 100644 --- a/tests/protocol/test_wire.py +++ b/tests/protocol/test_wire.py @@ -115,6 +115,14 @@ def _test_getters(self, test_data, certsMustBePresent, __, MockCryptUtil, _): # self.assertFalse(os.path.isfile(prv2)) self.assertEqual("1", protocol.get_incarnation()) + @staticmethod + def _get_telemetry_events_generator(event_list): + def _yield_events(): + for telemetry_event in event_list: + yield telemetry_event + + return _yield_events + def test_getters(self, *args): """Normal case""" test_data = mockwiredata.WireProtocolData(mockwiredata.DATA_FILE) @@ -376,47 +384,47 @@ def test_send_encoded_event(self, mock_http_request, *args): @patch("azurelinuxagent.common.protocol.wire.WireClient.send_encoded_event") def test_report_event_small_event(self, patch_send_event, *args): # pylint: disable=unused-argument - event_list = TelemetryEventList() + event_list = [] client = WireProtocol(WIRESERVER_URL).client event_str = random_generator(10) - event_list.events.append(get_event(message=event_str)) + event_list.append(get_event(message=event_str)) event_str = random_generator(100) - event_list.events.append(get_event(message=event_str)) + event_list.append(get_event(message=event_str)) event_str = random_generator(1000) - event_list.events.append(get_event(message=event_str)) + event_list.append(get_event(message=event_str)) event_str = random_generator(10000) - 
event_list.events.append(get_event(message=event_str)) + event_list.append(get_event(message=event_str)) - client.report_event(event_list) + client.report_event(self._get_telemetry_events_generator(event_list)) # It merges the messages into one message self.assertEqual(patch_send_event.call_count, 1) @patch("azurelinuxagent.common.protocol.wire.WireClient.send_encoded_event") def test_report_event_multiple_events_to_fill_buffer(self, patch_send_event, *args): # pylint: disable=unused-argument - event_list = TelemetryEventList() + event_list = [] client = WireProtocol(WIRESERVER_URL).client event_str = random_generator(2 ** 15) - event_list.events.append(get_event(message=event_str)) - event_list.events.append(get_event(message=event_str)) + event_list.append(get_event(message=event_str)) + event_list.append(get_event(message=event_str)) - client.report_event(event_list) + client.report_event(self._get_telemetry_events_generator(event_list)) # It merges the messages into one message self.assertEqual(patch_send_event.call_count, 2) @patch("azurelinuxagent.common.protocol.wire.WireClient.send_encoded_event") def test_report_event_large_event(self, patch_send_event, *args): # pylint: disable=unused-argument - event_list = TelemetryEventList() + event_list = [] event_str = random_generator(2 ** 18) - event_list.events.append(get_event(message=event_str)) + event_list.append(get_event(message=event_str)) client = WireProtocol(WIRESERVER_URL).client - client.report_event(event_list) + client.report_event(self._get_telemetry_events_generator(event_list)) self.assertEqual(patch_send_event.call_count, 0) From 38c4e86e71659ac4b4e5a7dec5f739f82110d68d Mon Sep 17 00:00:00 2001 From: larohra Date: Mon, 21 Sep 2020 11:41:20 -0700 Subject: [PATCH 20/63] Hijacking verbose flag for testing --- azurelinuxagent/ga/exthandlers.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/azurelinuxagent/ga/exthandlers.py b/azurelinuxagent/ga/exthandlers.py index 
5ab1a6d7f6..40d9aeeebf 100644 --- a/azurelinuxagent/ga/exthandlers.py +++ b/azurelinuxagent/ga/exthandlers.py @@ -84,7 +84,7 @@ _ENABLE_EXTENSION_TELEMETRY_PIPELINE = False def is_extension_telemetry_pipeline_enabled(): - return _ENABLE_EXTENSION_TELEMETRY_PIPELINE + return conf.get_logs_verbose() class ValidHandlerStatus(object): # pylint: disable=R0903 From 7f9a7239ab0de0153150ec36a0df3552c882d67a Mon Sep 17 00:00:00 2001 From: larohra Date: Mon, 21 Sep 2020 13:39:17 -0700 Subject: [PATCH 21/63] Refactored and fixed ETP and ETP tests --- azurelinuxagent/ga/extension_telemetry.py | 24 ++-- azurelinuxagent/ga/update.py | 3 +- tests/ga/test_extension_telemetry.py | 151 ++++++++++------------ 3 files changed, 78 insertions(+), 100 deletions(-) diff --git a/azurelinuxagent/ga/extension_telemetry.py b/azurelinuxagent/ga/extension_telemetry.py index c8be13297f..8137aa9113 100644 --- a/azurelinuxagent/ga/extension_telemetry.py +++ b/azurelinuxagent/ga/extension_telemetry.py @@ -37,9 +37,11 @@ from azurelinuxagent.ga.periodic_operation import PeriodicOperation -def get_extension_telemetry_handler(protocol_util, telemetry_handler): - return ExtensionTelemetryHandler(protocol_util, telemetry_handler) +def get_extension_telemetry_handler(enqueue_events): + return ExtensionTelemetryHandler(enqueue_events) + +# Pylint R0903 (too-few-public-methods) : Disabling here because this class is an Enum, no public methods needed. class ExtensionEventSchema(object): # pylint: disable=R0903 """ Class for defining the schema for Extension Events. 
@@ -70,13 +72,12 @@ class ProcessExtensionTelemetry(PeriodicOperation): _EXTENSION_EVENT_REQUIRED_FIELDS = [attr.lower() for attr in dir(ExtensionEventSchema) if not callable(getattr(ExtensionEventSchema, attr)) and not attr.startswith("__")] - def __init__(self, protocol_util, enqueue_event): + def __init__(self, enqueue_event): super(ProcessExtensionTelemetry, self).__init__( name="collect_and_enqueue_extension_events", operation=self._collect_and_enqueue_extension_events, period=ProcessExtensionTelemetry._EXTENSION_EVENT_COLLECTION_PERIOD) - self._protocol = protocol_util.get_protocol() self._enqueue_event = enqueue_event def _collect_and_enqueue_extension_events(self): @@ -220,8 +221,8 @@ def _get_captured_events_count(self, handler_name, event_file_path, captured_eve event_file_time = datetime.datetime.fromtimestamp(os.path.getmtime(event_file_path)) # Read event file and decode it properly - with open(event_file_path, "rb") as fd: # pylint: disable=C0103 - event_data = fd.read().decode("utf-8") + with open(event_file_path, "rb") as event_file_descriptor: + event_data = event_file_descriptor.read().decode("utf-8") # Parse the string and get the list of events events = json.loads(event_data) @@ -353,11 +354,10 @@ class ExtensionTelemetryHandler(object): _THREAD_NAME = "ExtensionTelemetryHandler" - def __init__(self, protocol_util, telemetry_handler): - self.protocol_util = protocol_util + def __init__(self, enqueue_events): self.should_run = True self.thread = None - self._enqueue_event = telemetry_handler.enqueue_event + self._enqueue_event = enqueue_events @staticmethod def get_thread_name(): @@ -388,15 +388,15 @@ def stopped(self): return not self.should_run def daemon(self): - op = ProcessExtensionTelemetry(self.protocol_util, self._enqueue_event) # pylint: disable=C0103 + op = ProcessExtensionTelemetry(self._enqueue_event) logger.info("Successfully started the {0} thread".format(self.get_thread_name())) while not self.stopped(): try: op.run() - except 
Exception as e: # pylint: disable=C0103 + except Exception as error: logger.warn( "An error occurred in the Telemetry Extension thread main loop; will skip the current iteration.\n{0}", - ustr(e)) + ustr(error)) finally: PeriodicOperation.sleep_until_next_operation([op]) \ No newline at end of file diff --git a/azurelinuxagent/ga/update.py b/azurelinuxagent/ga/update.py index 661dadfe34..9b6872a017 100644 --- a/azurelinuxagent/ga/update.py +++ b/azurelinuxagent/ga/update.py @@ -283,8 +283,7 @@ def run(self, debug=False): # pylint: disable=R0912,R0914 ] if is_extension_telemetry_pipeline_enabled(): - # Reuse the same protocol_util as the UpdateHandler class to avoid new initializations - all_thread_handlers.append(get_extension_telemetry_handler(self.protocol_util, telemetry_handler)) + all_thread_handlers.append(get_extension_telemetry_handler(telemetry_handler.enqueue_event)) # Launch all monitoring threads for thread_handler in all_thread_handlers: diff --git a/tests/ga/test_extension_telemetry.py b/tests/ga/test_extension_telemetry.py index 4373af13ca..7be87ff419 100644 --- a/tests/ga/test_extension_telemetry.py +++ b/tests/ga/test_extension_telemetry.py @@ -123,37 +123,29 @@ def _replace_in_file(file_path, replace_from, replace_to): f.write(content) @staticmethod - def _get_handlers_with_version_from_event_body(event_body): + def _get_param_from_events(event_list): + for event in event_list: + for param in event.parameters: + yield param + + @staticmethod + def _get_handlers_with_version(event_list): event_with_name_and_versions = defaultdict(list) - regex_pattern = r'' - for body in event_body: - body = TestExtensionTelemetryHandler._get_ustr_from_event_body(body) - xml_doc = textutil.parse_doc(body) - events = textutil.findall(xml_doc, "Event") - for event in events: - # Since this is a telemetry event, we should always have an EventName param in the event - handler_name, version = re.search(regex_pattern, event.childNodes[0].nodeValue).groups() + for param 
in TestExtensionTelemetryHandler._get_param_from_events(event_list): + if param.name == GuestAgentGenericLogsSchema.EventName: + handler_name, version = param.value.split("-") event_with_name_and_versions[handler_name].append(version) return event_with_name_and_versions @staticmethod - def _get_param_value_from_event_body_if_exists(event_body, param_name): - regex_pattern = r'.+?)(\"|\') .* \/>'.format(param_name) - param_values = [] - - for body in event_body: - body = TestExtensionTelemetryHandler._get_ustr_from_event_body(body) - xml_doc = textutil.parse_doc(body) - events = textutil.findall(xml_doc, "Event") - - for event in events: + def _get_param_value_from_event_body_if_exists(event_list, param_name): - regex = re.search(regex_pattern, event.childNodes[0].nodeValue) - if regex is None: - continue - param_values.append(regex.group('value')) + param_values = [] + for param in TestExtensionTelemetryHandler._get_param_from_events(event_list): + if param.name == param_name: + param_values.append(param.value) return param_values @@ -161,21 +153,15 @@ def _get_param_value_from_event_body_if_exists(event_body, param_name): @contextlib.contextmanager def _create_extension_telemetry_processor(self): - def http_post_handler(url, body, **__): - if self.is_telemetry_request(url): - extension_telemetry_processor.event_body.append(body) - return MockHttpResponse(status=200) - return None + event_list = [] + def _enqueue_events(telemetry_event): + event_list.append(telemetry_event) - with mock_wire_protocol(DATA_FILE, http_post_handler=http_post_handler) as protocol: - protocol_util = MagicMock() - protocol_util.get_protocol = Mock(return_value=protocol) - enqueue_event = MagicMock() - extension_telemetry_processor = ProcessExtensionTelemetry(protocol_util, enqueue_event) - extension_telemetry_processor.event_body = [] - yield extension_telemetry_processor + extension_telemetry_processor = ProcessExtensionTelemetry(_enqueue_events) + 
extension_telemetry_processor.event_list = event_list + yield extension_telemetry_processor - def _assert_handler_data_in_event_body(self, telemetry_events, ext_names_with_count, expected_count=None): + def _assert_handler_data_in_event_list(self, telemetry_events, ext_names_with_count, expected_count=None): for ext_name, test_file_event_count in ext_names_with_count.items(): # If expected_count is not given, then the take the no of good events in the test file as the source of truth @@ -190,21 +176,16 @@ def _assert_handler_data_in_event_body(self, telemetry_events, ext_names_with_co self.assertEqual(len(telemetry_events[ext_name]), count, "No of good events for ext {0} do not match".format(ext_name)) - def _assert_param_in_events(self, event_body, param_key, param_value, min_count=1): + def _assert_param_in_events(self, event_list, param_key, param_value, min_count=1): - param = TelemetryEventParam(param_key, param_value) - param_str = event_param_to_v1(param) count = 0 - for body in event_body: - body = TestExtensionTelemetryHandler._get_ustr_from_event_body(body) - if count >= min_count: - break - count += body.count(param_str) + for param in TestExtensionTelemetryHandler._get_param_from_events(event_list): + if param.name == param_key and param.value == param_value: + count += 1 self.assertGreaterEqual(count, min_count, - "'{0}' param only found {1} times in events. Min_count required: {2}".format(param_str, - count, - min_count)) + "'{0}: {1}' param only found {2} times in events. 
Min_count required: {3}".format( + param_key, param_value, count, min_count)) @staticmethod def _get_ustr_from_event_body(body): return body if (body is None or type(body) is ustr) else textutil.str_to_encoded_ustr(body) # pylint: disable=unidiomatic-typecheck @@ -227,10 +208,10 @@ def test_it_should_not_capture_malformed_events(self): self._MALFORMED_FILES, "bad_json_files", "1591816395.json")) extension_telemetry_processor.run() - telemetry_events = self._get_handlers_with_version_from_event_body(extension_telemetry_processor.event_body) + telemetry_events = self._get_handlers_with_version(extension_telemetry_processor.event_list) - self._assert_handler_data_in_event_body(telemetry_events, bad_name_ext_with_count, expected_count=0) - self._assert_handler_data_in_event_body(telemetry_events, bad_json_ext_with_count, expected_count=0) + self._assert_handler_data_in_event_list(telemetry_events, bad_name_ext_with_count, expected_count=0) + self._assert_handler_data_in_event_list(telemetry_events, bad_json_ext_with_count, expected_count=0) def test_it_should_capture_and_send_correct_events(self): @@ -241,9 +222,9 @@ def test_it_should_capture_and_send_correct_events(self): self._MIX_FILES, "1591835859.json"))) extension_telemetry_processor.run() - telemetry_events = self._get_handlers_with_version_from_event_body(extension_telemetry_processor.event_body) + telemetry_events = self._get_handlers_with_version(extension_telemetry_processor.event_list) - self._assert_handler_data_in_event_body(telemetry_events, ext_names_with_count) + self._assert_handler_data_in_event_list(telemetry_events, ext_names_with_count) def test_it_should_disregard_bad_events_and_keep_good_ones_in_a_mixed_file(self): with self._create_extension_telemetry_processor() as extension_telemetry_processor: @@ -252,9 +233,9 @@ def test_it_should_disregard_bad_events_and_keep_good_ones_in_a_mixed_file(self) self._MALFORMED_FILES, "bad_name_file.json"))) extension_telemetry_processor.run() - 
telemetry_events = self._get_handlers_with_version_from_event_body(extension_telemetry_processor.event_body) + telemetry_events = self._get_handlers_with_version(extension_telemetry_processor.event_list) - self._assert_handler_data_in_event_body(telemetry_events, extensions_with_count) + self._assert_handler_data_in_event_list(telemetry_events, extensions_with_count) def test_it_should_limit_max_no_of_events_to_send_per_run_per_extension_and_report_event(self): max_events = 5 @@ -264,13 +245,13 @@ def test_it_should_limit_max_no_of_events_to_send_per_run_per_extension_and_repo ext_names_with_count = self._create_random_extension_events_dir_with_events(5, self._WELL_FORMED_FILES) extension_telemetry_processor.run() - telemetry_events = self._get_handlers_with_version_from_event_body(extension_telemetry_processor.event_body) - self._assert_handler_data_in_event_body(telemetry_events, ext_names_with_count, expected_count=max_events) + telemetry_events = self._get_handlers_with_version(extension_telemetry_processor.event_list) + self._assert_handler_data_in_event_list(telemetry_events, ext_names_with_count, expected_count=max_events) pattern = r'Reached max count for the extension:\s*(?P.+?);\s*.+' self._assert_event_reported(mock_event, ext_names_with_count, pattern) - def test_it_should_send_the_latest_events(self): + def test_it_should_only_process_the_newer_events(self): max_events = 5 no_of_extension = 2 test_guid = str(uuid.uuid4()) @@ -286,13 +267,13 @@ def test_it_should_send_the_latest_events(self): replace_to='"{0}": "{1}"'.format(ExtensionEventSchema.OperationId, test_guid)) extension_telemetry_processor.run() - telemetry_events = self._get_handlers_with_version_from_event_body(extension_telemetry_processor.event_body) - self._assert_handler_data_in_event_body(telemetry_events, ext_names_with_count, + telemetry_events = self._get_handlers_with_version(extension_telemetry_processor.event_list) + self._assert_handler_data_in_event_list(telemetry_events, 
ext_names_with_count, expected_count=max_events) - self._assert_param_in_events(extension_telemetry_processor.event_body, + self._assert_param_in_events(extension_telemetry_processor.event_list, param_key=GuestAgentGenericLogsSchema.Context1, param_value="This is the latest event", min_count=no_of_extension*max_events) - self._assert_param_in_events(extension_telemetry_processor.event_body, + self._assert_param_in_events(extension_telemetry_processor.event_list, param_key=GuestAgentGenericLogsSchema.Context3, param_value=test_guid, min_count=no_of_extension*max_events) @@ -303,9 +284,9 @@ def test_it_should_parse_extension_event_irrespective_of_case(self): self._TEST_DATA_DIR, "different_cases")) extension_telemetry_processor.run() - telemetry_events = self._get_handlers_with_version_from_event_body(extension_telemetry_processor.event_body) + telemetry_events = self._get_handlers_with_version(extension_telemetry_processor.event_list) - self._assert_handler_data_in_event_body(telemetry_events, extensions_with_count) + self._assert_handler_data_in_event_list(telemetry_events, extensions_with_count) def test_it_should_parse_special_chars_properly(self): with self._create_extension_telemetry_processor() as extension_telemetry_processor: @@ -313,9 +294,9 @@ def test_it_should_parse_special_chars_properly(self): self._TEST_DATA_DIR, "special_chars")) extension_telemetry_processor.run() - telemetry_events = self._get_handlers_with_version_from_event_body(extension_telemetry_processor.event_body) + telemetry_events = self._get_handlers_with_version(extension_telemetry_processor.event_list) - self._assert_handler_data_in_event_body(telemetry_events, extensions_with_count) + self._assert_handler_data_in_event_list(telemetry_events, extensions_with_count) def _setup_and_assert_tests_for_max_sizes(self, no_of_extensions=2, expected_count=None): with self._create_extension_telemetry_processor() as extension_telemetry_processor: @@ -325,10 +306,10 @@ def 
_setup_and_assert_tests_for_max_sizes(self, no_of_extensions=2, expected_cou "large_messages")) extension_telemetry_processor.run() - telemetry_events = self._get_handlers_with_version_from_event_body(extension_telemetry_processor.event_body) - self._assert_handler_data_in_event_body(telemetry_events, extensions_with_count, expected_count) + telemetry_events = self._get_handlers_with_version(extension_telemetry_processor.event_list) + self._assert_handler_data_in_event_list(telemetry_events, extensions_with_count, expected_count) - return extensions_with_count, extension_telemetry_processor.event_body + return extensions_with_count, extension_telemetry_processor.event_list def _assert_invalid_extension_error_event_reported(self, mock_event, handler_name_with_count, error, expected_drop_count=None): @@ -363,8 +344,8 @@ def test_it_should_trim_message_if_more_than_limit(self): max_len = 100 no_of_extensions = 2 with patch("azurelinuxagent.ga.extension_telemetry.ProcessExtensionTelemetry._EXTENSION_EVENT_MAX_MSG_LEN", max_len): - handler_name_with_count, event_body = self._setup_and_assert_tests_for_max_sizes() # pylint: disable=unused-variable - context1_vals = self._get_param_value_from_event_body_if_exists(event_body, + handler_name_with_count, event_list = self._setup_and_assert_tests_for_max_sizes() # pylint: disable=unused-variable + context1_vals = self._get_param_value_from_event_body_if_exists(event_list, GuestAgentGenericLogsSchema.Context1) self.assertEqual(no_of_extensions, len(context1_vals), "There should be {0} Context1 values".format(no_of_extensions)) @@ -415,7 +396,7 @@ def test_it_should_map_extension_event_json_correctly_to_telemetry_event(self): telemetry_event_map = defaultdict(list) for telemetry_event_key in expected_mapping: telemetry_event_map[telemetry_event_key] = self._get_param_value_from_event_body_if_exists( - extension_telemetry_processor.event_body, telemetry_event_key) + extension_telemetry_processor.event_list, telemetry_event_key) 
with open(test_file, 'r') as event_file: data = json.load(event_file) @@ -454,9 +435,9 @@ def test_it_should_always_cleanup_files_on_good_and_bad_cases(self): random_file.write('1*2*3' * 100) extension_telemetry_processor.run() - telemetry_events = self._get_handlers_with_version_from_event_body(extension_telemetry_processor.event_body) + telemetry_events = self._get_handlers_with_version(extension_telemetry_processor.event_list) - self._assert_handler_data_in_event_body(telemetry_events, extensions_with_count) + self._assert_handler_data_in_event_list(telemetry_events, extensions_with_count) for handler_name in extensions_with_count.keys(): events_path = os.path.join(conf.get_ext_log_dir(), handler_name, EVENTS_DIRECTORY) @@ -465,21 +446,19 @@ def test_it_should_always_cleanup_files_on_good_and_bad_cases(self): def test_it_should_skip_unwanted_parameters_in_event_file(self): extra_params = ["SomethingNewButNotCool", "SomethingVeryWeird"] - param_format = ' Date: Mon, 21 Sep 2020 15:52:50 -0700 Subject: [PATCH 22/63] Added queue.join for telemetry_service --- azurelinuxagent/ga/telemetry_service.py | 38 ++++++------ tests/ga/test_extension_telemetry.py | 25 ++++++-- tests/ga/test_telemetry_service.py | 77 +++++++++++++++++++++++++ 3 files changed, 117 insertions(+), 23 deletions(-) create mode 100644 tests/ga/test_telemetry_service.py diff --git a/azurelinuxagent/ga/telemetry_service.py b/azurelinuxagent/ga/telemetry_service.py index bd732e7a3d..21d7fbbb99 100644 --- a/azurelinuxagent/ga/telemetry_service.py +++ b/azurelinuxagent/ga/telemetry_service.py @@ -41,7 +41,7 @@ class TelemetryServiceHandler(object): def __init__(self, protocol_util): self._protocol = protocol_util.get_protocol() self.should_run = True - self.thread = None + self._thread = None self._should_process_events = threading.Event() self._queue = PriorityQueue() @@ -54,13 +54,13 @@ def run(self): self.start() def is_alive(self): - return self.thread is not None and self.thread.is_alive() + return 
self._thread is not None and self._thread.is_alive() def start(self): - self.thread = threading.Thread(target=self.daemon) - self.thread.setDaemon(True) - self.thread.setName(self.get_thread_name()) - self.thread.start() + self._thread = threading.Thread(target=self._process_telemetry_thread) + self._thread.setDaemon(True) + self._thread.setName(self.get_thread_name()) + self._thread.start() def stop(self): """ @@ -68,7 +68,11 @@ def stop(self): """ self.should_run = False if self.is_alive(): - self.thread.join() + self.join() + + def join(self): + self._queue.join() + self._thread.join() def stopped(self): return not self.should_run @@ -82,39 +86,37 @@ def enqueue_event(self, event): # Always set the event if any enqueue happens (even if already set) self._should_process_events.set() - def daemon(self): + def _process_telemetry_thread(self): logger.info("Successfully started the {0} thread".format(self.get_thread_name())) try: # On demand wait, start processing as soon as there is any data available in the queue # In worst case, also keep checking every 5 mins to ensure that no data is being missed while self._should_process_events.wait(timeout=TelemetryServiceHandler._MAX_TIMEOUT): - self.send_events_in_queue() + self._send_events_in_queue() except Exception as error: logger.warn("An unknown error occurred in the {0} thread main loop, stopping thread. Error: {1}, Stack: {2}", self.get_thread_name(), ustr(error), traceback.format_exc()) - def get_events(self): + def _get_events_in_queue(self): while not self._queue.empty(): - event = None try: # _, __, event = self._queue.get() event = self._queue.get() # logger.verbose("Fetched event Priority: {0}, Counter: {1}, Event: {2}".format(_, __, event)) logger.verbose("Fetched event Priority: {0}, Event: {1}".format(event.priority if event is not None else 100, event)) yield event + # Mark task_done once data processed. 
Do not mark task_done if error fetching from queue, else that will raise errors + logger.verbose("Marking event as done now: {0}".format(event)) + self._queue.task_done() except Exception as e: - logger.error("Some exception: {0}, now the event will be None".format(ustr(e))) - finally: - # Mark the event as processed once done - logger.verbose("Finished working with event {0}".format(event)) - # self._queue.task_done() + logger.error("Some exception when fetching event from queue: {0}".format(ustr(e))) - def send_events_in_queue(self): + def _send_events_in_queue(self): # Process everything in Queue logger.verbose("Processing data in the telemetry service queue, approx qsize: {0}", self._queue.qsize()) if not self._queue.empty(): - self._protocol.report_event(self.get_events) + self._protocol.report_event(self._get_events_in_queue) # Clear event when done # There might be a rare race condition where the loop exits and we get a new event, in that case not unsetting the event. diff --git a/tests/ga/test_extension_telemetry.py b/tests/ga/test_extension_telemetry.py index 7be87ff419..6fcd3512ea 100644 --- a/tests/ga/test_extension_telemetry.py +++ b/tests/ga/test_extension_telemetry.py @@ -1,3 +1,20 @@ +# Copyright 2020 Microsoft Corporation +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+# +# Requires Python 2.6+ and Openssl 1.0+ +# + import contextlib import glob import json @@ -9,20 +26,18 @@ import uuid from collections import defaultdict -from mock import MagicMock, Mock, patch +from mock import patch from azurelinuxagent.common import conf from azurelinuxagent.common.event import EVENTS_DIRECTORY from azurelinuxagent.common.exception import InvalidExtensionEventError from azurelinuxagent.common.future import ustr from azurelinuxagent.common.protocol.util import ProtocolUtil -from azurelinuxagent.common.protocol.wire import event_param_to_v1 -from azurelinuxagent.common.telemetryevent import TelemetryEventParam, GuestAgentGenericLogsSchema, \ +from azurelinuxagent.common.telemetryevent import GuestAgentGenericLogsSchema, \ CommonTelemetryEventSchema from azurelinuxagent.common.utils import fileutil, textutil from azurelinuxagent.ga.extension_telemetry import ExtensionEventSchema, ProcessExtensionTelemetry -from tests.protocol.mocks import mock_wire_protocol, HttpRequestPredicates, MockHttpResponse -from tests.protocol.mockwiredata import DATA_FILE +from tests.protocol.mocks import HttpRequestPredicates from tests.tools import AgentTestCase, clear_singleton_instances, data_dir diff --git a/tests/ga/test_telemetry_service.py b/tests/ga/test_telemetry_service.py new file mode 100644 index 0000000000..18085c2c93 --- /dev/null +++ b/tests/ga/test_telemetry_service.py @@ -0,0 +1,77 @@ +# Copyright 2020 Microsoft Corporation +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+# See the License for the specific language governing permissions and +# limitations under the License. +# +# Requires Python 2.6+ and Openssl 1.0+ +# +import contextlib + +import uuid +from mock import MagicMock, Mock + +from azurelinuxagent.common.future import ustr +from azurelinuxagent.common.protocol.util import ProtocolUtil +from azurelinuxagent.common.telemetryevent import TelemetryEvent +from azurelinuxagent.ga.telemetry_service import get_telemetry_service_handler +from tests.protocol.mocks import MockHttpResponse, mock_wire_protocol +from tests.protocol.mockwiredata import DATA_FILE +from tests.tools import AgentTestCase, clear_singleton_instances + + +class TestExtensionTelemetryHandler(AgentTestCase): + def setUp(self): + AgentTestCase.setUp(self) + clear_singleton_instances(ProtocolUtil) + + @contextlib.contextmanager + def _create_telemetry_service_handler(self): + def http_post_handler(url, body, **__): + if self.is_telemetry_request(url): + telemetry_service_handler.event_body.append(body) + return MockHttpResponse(status=200) + return None + + with mock_wire_protocol(DATA_FILE, http_post_handler=http_post_handler) as protocol: + protocol_util = MagicMock() + protocol_util.get_protocol = Mock(return_value=protocol) + telemetry_service_handler = get_telemetry_service_handler(protocol_util) + telemetry_service_handler.event_body = [] + telemetry_service_handler.start() + yield telemetry_service_handler + + def test_it_should_send_events_properly(self): + raise NotImplementedError + test_guid = ustr(uuid.uuid4()) + events = [TelemetryEvent(eventId=test_guid), TelemetryEvent(providerId=test_guid)] + + with self._create_telemetry_service_handler() as telemetry_handler: + for test_event in events: + telemetry_handler.enqueue_event(test_event) + + + + + def test_it_should_send_as_soon_as_events_available_in_queue(self): + raise NotImplementedError + + def test_thread_should_wait_for_events_in_queue(self): + raise NotImplementedError + + def 
test_it_should_honour_the_priority_order_of_events(self): + raise NotImplementedError + + def test_it_should_try_sending_events_periodically(self): + raise NotImplementedError + + def test_it_should_send_events(self): + raise NotImplementedError \ No newline at end of file From a072f3d86ee9d378b67986cb4a545605ad5135a2 Mon Sep 17 00:00:00 2001 From: larohra Date: Tue, 22 Sep 2020 14:15:18 -0700 Subject: [PATCH 23/63] Added tests for the telemetry_service --- azurelinuxagent/ga/telemetry_service.py | 9 +- tests/ga/test_telemetry_service.py | 112 +++++++++++++++++++----- 2 files changed, 94 insertions(+), 27 deletions(-) diff --git a/azurelinuxagent/ga/telemetry_service.py b/azurelinuxagent/ga/telemetry_service.py index 21d7fbbb99..e6ff82eccb 100644 --- a/azurelinuxagent/ga/telemetry_service.py +++ b/azurelinuxagent/ga/telemetry_service.py @@ -91,7 +91,8 @@ def _process_telemetry_thread(self): try: # On demand wait, start processing as soon as there is any data available in the queue # In worst case, also keep checking every 5 mins to ensure that no data is being missed - while self._should_process_events.wait(timeout=TelemetryServiceHandler._MAX_TIMEOUT): + while not self.stopped(): + self._should_process_events.wait(timeout=TelemetryServiceHandler._MAX_TIMEOUT) self._send_events_in_queue() except Exception as error: @@ -101,16 +102,14 @@ def _process_telemetry_thread(self): def _get_events_in_queue(self): while not self._queue.empty(): try: - # _, __, event = self._queue.get() event = self._queue.get() - # logger.verbose("Fetched event Priority: {0}, Counter: {1}, Event: {2}".format(_, __, event)) logger.verbose("Fetched event Priority: {0}, Event: {1}".format(event.priority if event is not None else 100, event)) yield event # Mark task_done once data processed. 
Do not mark task_done if error fetching from queue, else that will raise errors logger.verbose("Marking event as done now: {0}".format(event)) self._queue.task_done() - except Exception as e: - logger.error("Some exception when fetching event from queue: {0}".format(ustr(e))) + except Exception as error: + logger.error("Some exception when fetching event from queue: {0}".format(ustr(error))) def _send_events_in_queue(self): # Process everything in Queue diff --git a/tests/ga/test_telemetry_service.py b/tests/ga/test_telemetry_service.py index 18085c2c93..5da606cec0 100644 --- a/tests/ga/test_telemetry_service.py +++ b/tests/ga/test_telemetry_service.py @@ -15,29 +15,33 @@ # Requires Python 2.6+ and Openssl 1.0+ # import contextlib - +import re +import time import uuid -from mock import MagicMock, Mock +from datetime import datetime, timedelta + +from mock import MagicMock, Mock, patch from azurelinuxagent.common.future import ustr from azurelinuxagent.common.protocol.util import ProtocolUtil -from azurelinuxagent.common.telemetryevent import TelemetryEvent +from azurelinuxagent.common.protocol.wire import event_to_v1_encoded +from azurelinuxagent.common.telemetryevent import TelemetryEvent, TelemetryEventPriorities, TelemetryEventParam from azurelinuxagent.ga.telemetry_service import get_telemetry_service_handler -from tests.protocol.mocks import MockHttpResponse, mock_wire_protocol +from tests.protocol.mocks import MockHttpResponse, mock_wire_protocol, HttpRequestPredicates from tests.protocol.mockwiredata import DATA_FILE from tests.tools import AgentTestCase, clear_singleton_instances -class TestExtensionTelemetryHandler(AgentTestCase): +class TestExtensionTelemetryHandler(AgentTestCase, HttpRequestPredicates): def setUp(self): AgentTestCase.setUp(self) clear_singleton_instances(ProtocolUtil) @contextlib.contextmanager - def _create_telemetry_service_handler(self): + def _create_telemetry_service_handler(self, timeout=0.5, start_thread=True): def 
http_post_handler(url, body, **__): if self.is_telemetry_request(url): - telemetry_service_handler.event_body.append(body) + telemetry_service_handler.event_calls.append((datetime.now(), body)) return MockHttpResponse(status=200) return None @@ -45,33 +49,97 @@ def http_post_handler(url, body, **__): protocol_util = MagicMock() protocol_util.get_protocol = Mock(return_value=protocol) telemetry_service_handler = get_telemetry_service_handler(protocol_util) - telemetry_service_handler.event_body = [] - telemetry_service_handler.start() - yield telemetry_service_handler + telemetry_service_handler.event_calls = [] + with patch("azurelinuxagent.ga.telemetry_service.TelemetryServiceHandler._MAX_TIMEOUT", timeout): + if start_thread: + telemetry_service_handler.start() + self.assertTrue(telemetry_service_handler.is_alive(), "Thread didn't start properly!") + yield telemetry_service_handler + + def _assert_test_data_in_event_body(self, telemetry_handler, test_events): + # Stop the thread and Wait for the queue and thread to join + telemetry_handler.stop() + + for event in test_events: + event_str = event_to_v1_encoded(event) + found = False + for _, event_body in telemetry_handler.event_calls: + if event_str in event_body: + found = True + break + + self.assertTrue(found, "Event {0} not found in any telemetry calls".format(event_str)) def test_it_should_send_events_properly(self): - raise NotImplementedError - test_guid = ustr(uuid.uuid4()) - events = [TelemetryEvent(eventId=test_guid), TelemetryEvent(providerId=test_guid)] + events = [TelemetryEvent(eventId=ustr(uuid.uuid4())), TelemetryEvent(eventId=ustr(uuid.uuid4()))] + + with self._create_telemetry_service_handler() as telemetry_handler: + for test_event in events: + telemetry_handler.enqueue_event(test_event) + + self._assert_test_data_in_event_body(telemetry_handler, events) + + def test_it_should_send_as_soon_as_events_available_in_queue(self): + events = [TelemetryEvent(eventId=ustr(uuid.uuid4())), 
TelemetryEvent(eventId=ustr(uuid.uuid4()))] with self._create_telemetry_service_handler() as telemetry_handler: + test_start_time = datetime.now() for test_event in events: telemetry_handler.enqueue_event(test_event) + self._assert_test_data_in_event_body(telemetry_handler, events) + # Ensure that we send out the data as soon as we enqueue the events + for event_time, _ in telemetry_handler.event_calls: + elapsed = event_time - test_start_time + self.assertLessEqual(elapsed, timedelta(seconds=2), "Request was not sent as soon as possible") + def test_thread_should_wait_for_events_to_get_in_queue_before_processing(self): + events = [TelemetryEvent(eventId=ustr(uuid.uuid4())), TelemetryEvent(eventId=ustr(uuid.uuid4()))] - def test_it_should_send_as_soon_as_events_available_in_queue(self): - raise NotImplementedError + with self._create_telemetry_service_handler(timeout=0.1) as telemetry_handler: - def test_thread_should_wait_for_events_in_queue(self): - raise NotImplementedError + # Do nothing for some time + time.sleep(0.3) + + # Ensure that no events were transmitted by the telemetry handler during this time, i.e. 
telemetry thread was idle + self.assertEqual(0, len(telemetry_handler.event_calls), "Unwanted calls to telemetry") + + # Now enqueue data and verify telemetry_service sends them asap + for test_event in events: + telemetry_handler.enqueue_event(test_event) + + self._assert_test_data_in_event_body(telemetry_handler, events) def test_it_should_honour_the_priority_order_of_events(self): - raise NotImplementedError - def test_it_should_try_sending_events_periodically(self): - raise NotImplementedError + # In general, lower the number, higher the priority + # Priority Order: AGENT_EVENT > EXTENSION_EVENT_NEW_PIPELINE > EXTENSION_EVENT_OLD_PIPELINE + events = [ + TelemetryEvent(eventId=ustr(uuid.uuid4()), priority=TelemetryEventPriorities.EXTENSION_EVENT_OLD_PIPELINE), + TelemetryEvent(eventId=ustr(uuid.uuid4()), priority=TelemetryEventPriorities.EXTENSION_EVENT_OLD_PIPELINE), + TelemetryEvent(eventId=ustr(uuid.uuid4()), priority=TelemetryEventPriorities.AGENT_EVENT), + TelemetryEvent(eventId=ustr(uuid.uuid4()), priority=TelemetryEventPriorities.EXTENSION_EVENT_NEW_PIPELINE), + TelemetryEvent(eventId=ustr(uuid.uuid4()), priority=TelemetryEventPriorities.EXTENSION_EVENT_NEW_PIPELINE), + TelemetryEvent(eventId=ustr(uuid.uuid4()), priority=TelemetryEventPriorities.AGENT_EVENT) + ] + expected_priority_order = [] + + with self._create_telemetry_service_handler(timeout=0.3, start_thread=False) as telemetry_handler: + for test_event in events: + test_event.parameters.append(TelemetryEventParam("Priority", test_event.priority)) + expected_priority_order.append(str(test_event.priority)) + telemetry_handler.enqueue_event(test_event) + + telemetry_handler.start() + # Give the thread some time to start up, this was causing concurrency issues in UTs + time.sleep(0.005) + self.assertTrue(telemetry_handler.is_alive(), "Thread not alive") + self._assert_test_data_in_event_body(telemetry_handler, events) + + priorities = [] + regex_pattern = r'' + for _, event_body in 
telemetry_handler.event_calls: + priorities.extend(re.findall(regex_pattern, event_body)) - def test_it_should_send_events(self): - raise NotImplementedError \ No newline at end of file + self.assertEqual(sorted(expected_priority_order), priorities, "Priorities dont match") \ No newline at end of file From de7d744c58edbf376809c1763dad44ee8951adf9 Mon Sep 17 00:00:00 2001 From: larohra Date: Tue, 22 Sep 2020 17:20:58 -0700 Subject: [PATCH 24/63] Fixed some tests and code --- azurelinuxagent/common/event.py | 17 ++--- azurelinuxagent/common/protocol/wire.py | 51 +++++++++------ azurelinuxagent/common/telemetryevent.py | 5 -- azurelinuxagent/ga/monitor.py | 8 +-- azurelinuxagent/ga/telemetry_service.py | 9 ++- tests/ga/test_monitor.py | 83 ------------------------ tests/ga/test_telemetry_service.py | 65 ++++++++++++++++++- tests/protocol/test_wire.py | 2 +- 8 files changed, 111 insertions(+), 129 deletions(-) diff --git a/azurelinuxagent/common/event.py b/azurelinuxagent/common/event.py index 40b7cc7a88..e6a016a4e0 100644 --- a/azurelinuxagent/common/event.py +++ b/azurelinuxagent/common/event.py @@ -38,7 +38,6 @@ from azurelinuxagent.common.utils import fileutil, textutil from azurelinuxagent.common.utils.textutil import parse_doc, findall, find, getattrib from azurelinuxagent.common.version import CURRENT_VERSION, CURRENT_AGENT, AGENT_NAME, DISTRO_NAME, DISTRO_VERSION, DISTRO_CODE_NAME, AGENT_EXECUTION_MODE -from azurelinuxagent.common.telemetryevent import TelemetryEventList from azurelinuxagent.common.protocol.imds import get_imds_client EVENTS_DIRECTORY = "events" @@ -580,7 +579,7 @@ def _trim_extension_event_parameters(event): event.parameters = trimmed_params @staticmethod - def report_dropped_events_error(count, errors, op, max_errors_to_report): # pylint: disable=C0103 + def report_dropped_events_error(count, errors, op, max_errors_to_report): err_msg_format = "DroppedEventsCount: {0}\nReasons (first {1} errors): {2}" if count > 0: add_event(op=op, @@ -593,10 
+592,9 @@ def collect_events(self, enqueue_event): # pylint: disable=R0914 from the events directory. """ max_collect_errors_to_report = 5 - # event_list = TelemetryEventList() event_directory_full_path = os.path.join(conf.get_lib_dir(), EVENTS_DIRECTORY) event_files = os.listdir(event_directory_full_path) - unicode_error_count, unicode_errors = 0, [] + unicode_error_count, unicode_errors = 0, set() collect_event_error_count, collect_event_errors = 0, set() for event_file in event_files: @@ -630,18 +628,17 @@ def collect_events(self, enqueue_event): # pylint: disable=R0914 else: self._update_legacy_agent_event(event, event_file_creation_time) - # event_list.events.append(event) enqueue_event(event) finally: os.remove(event_file_path) - except UnicodeError as e: # pylint: disable=C0103 + except UnicodeError as uni_err: unicode_error_count += 1 if len(unicode_errors) < max_collect_errors_to_report: - unicode_errors.append(ustr(e)) - except Exception as e: # pylint: disable=C0103 + unicode_errors.add("{0}: {1}".format(ustr(uni_err), traceback.format_exc())) + except Exception as error: collect_event_error_count += 1 if len(collect_event_errors) < max_collect_errors_to_report: - collect_event_errors.add(traceback.format_exc()) + collect_event_errors.add("{0}: {1}".format(ustr(error), traceback.format_exc())) EventLogger.report_dropped_events_error(collect_event_error_count, collect_event_errors, WALAEventOperation.CollectEventErrors, max_collect_errors_to_report) @@ -649,8 +646,6 @@ def collect_events(self, enqueue_event): # pylint: disable=R0914 WALAEventOperation.CollectEventUnicodeErrors, max_collect_errors_to_report) - # return event_list - def _update_legacy_agent_event(self, event, event_creation_time): # Ensure that if an agent event is missing a field from the schema defined since 2.2.47, the missing fields # will be appended, ensuring the event schema is complete before the event is reported. 
diff --git a/azurelinuxagent/common/protocol/wire.py b/azurelinuxagent/common/protocol/wire.py index f46bbd91d7..6cd81f8c2a 100644 --- a/azurelinuxagent/common/protocol/wire.py +++ b/azurelinuxagent/common/protocol/wire.py @@ -39,7 +39,7 @@ from azurelinuxagent.common.protocol.hostplugin import HostPluginProtocol from azurelinuxagent.common.protocol.restapi import DataContract, ExtensionStatus, ExtHandlerPackage, \ ExtHandlerPackageList, ExtHandlerVersionUri, ProvisionStatus, VMInfo, VMStatus -from azurelinuxagent.common.telemetryevent import TelemetryEventList, GuestAgentExtensionEventsSchema +from azurelinuxagent.common.telemetryevent import GuestAgentExtensionEventsSchema from azurelinuxagent.common.utils import fileutil, restutil from azurelinuxagent.common.utils.archive import StateFlusher from azurelinuxagent.common.utils.cryptutil import CryptUtil @@ -191,7 +191,6 @@ def report_ext_status(self, ext_handler_name, ext_name, ext_status): # pylint: d self.client.status_blob.set_ext_status(ext_handler_name, ext_status) def report_event(self, events): - # validate_param(EVENTS_DIRECTORY, events, TelemetryEventList) self.client.report_event(events) def upload_logs(self, logs): @@ -1103,8 +1102,24 @@ def report_event(self, get_events): max_send_errors_to_report = 5 buf = {} events_per_request = defaultdict(int) - unicode_error_count, unicode_errors = 0, [] - event_report_error_count, event_report_errors = 0, [] + unicode_error_count, unicode_errors = 0, set() + event_report_error_count, event_report_errors = 0, set() + + def _send_event(provider_id): + uni_err_count, err_count = 0, 0 + try: + self.send_encoded_event(provider_id, buf[provider_id]) + except UnicodeError as uni_error: + uni_err_count += 1 + if len(unicode_errors) < max_send_errors_to_report: + unicode_errors.add("{0}: {1}".format(ustr(uni_error), traceback.format_exc())) + except Exception as error: + err_count += 1 + if len(event_report_errors) < max_send_errors_to_report: + 
event_report_errors.add("{0}: {1}".format(ustr(error), traceback.format_exc())) + + return uni_err_count, err_count + # Group events by providerId for event in get_events(): @@ -1123,33 +1138,31 @@ def report_event(self, get_events): continue if len(buf[event.providerId] + event_str) >= MAX_EVENT_BUFFER_SIZE: logger.verbose("No of events this request = {0}".format(events_per_request[event.providerId])) - self.send_encoded_event(event.providerId, buf[event.providerId]) + uni_err_count, err_count = _send_event(event.providerId) + unicode_error_count += uni_err_count + event_report_error_count += err_count buf[event.providerId] = b'' events_per_request[event.providerId] = 0 buf[event.providerId] = buf[event.providerId] + event_str events_per_request[event.providerId] += 1 - except UnicodeError as e: # pylint: disable=C0103 - unicode_error_count += 1 - if len(unicode_errors) < max_send_errors_to_report: - unicode_errors.append(ustr(e)) - except Exception as e: # pylint: disable=C0103 - event_report_error_count += 1 - if len(event_report_errors) < max_send_errors_to_report: - event_report_errors.append(ustr(e)) + except Exception as error: + logger.warn("Unexpected error when generating Events: {0}, {1}", ustr(error), traceback.format_exc()) logger.verbose("done reporting for Event {0}".format(event)) + # Send out all events left in buffer. + for provider_id in list(buf.keys()): + if buf[provider_id]: + logger.verbose("No of events this request = {0}".format(events_per_request[provider_id])) + uni_err_count, err_count = _send_event(provider_id) + unicode_error_count += uni_err_count + event_report_error_count += err_count + EventLogger.report_dropped_events_error(event_report_error_count, event_report_errors, WALAEventOperation.ReportEventErrors, max_send_errors_to_report) EventLogger.report_dropped_events_error(unicode_error_count, unicode_errors, WALAEventOperation.ReportEventUnicodeErrors, max_send_errors_to_report) - # Send out all events left in buffer. 
- for provider_id in list(buf.keys()): - if len(buf[provider_id]) > 0: # pylint: disable=len-as-condition - logger.verbose("No of events this request = {0}".format(events_per_request[provider_id])) - self.send_encoded_event(provider_id, buf[provider_id]) - def report_status_event(self, message, is_success): report_event(op=WALAEventOperation.ReportStatus, is_success=is_success, diff --git a/azurelinuxagent/common/telemetryevent.py b/azurelinuxagent/common/telemetryevent.py index ea6250d733..055cff1d91 100644 --- a/azurelinuxagent/common/telemetryevent.py +++ b/azurelinuxagent/common/telemetryevent.py @@ -144,8 +144,3 @@ def get_version(self): if param.name == GuestAgentExtensionEventsSchema.Version: return param.value return None - - -class TelemetryEventList(DataContract): # pylint: disable=R0903 - def __init__(self): - self.events = DataContractList(TelemetryEvent) diff --git a/azurelinuxagent/ga/monitor.py b/azurelinuxagent/ga/monitor.py index 841f3684e9..1d1032ae1f 100644 --- a/azurelinuxagent/ga/monitor.py +++ b/azurelinuxagent/ga/monitor.py @@ -119,20 +119,16 @@ class CollectAndEnqueueEventsPeriodicOperation(PeriodicOperation): def __init__(self, enqueue_event): super(CollectAndEnqueueEventsPeriodicOperation, self).__init__( name="collect_and_enqueue_events", - operation=self.collect_and_enqueue_events, + operation=self._collect_and_enqueue_events, period=CollectAndEnqueueEventsPeriodicOperation._EVENT_COLLECTION_PERIOD) self.enqueue_events = enqueue_event - def collect_and_enqueue_events(self): + def _collect_and_enqueue_events(self): """ Periodically send any events located in the events folder """ try: - # event_list = collect_events() collect_events(self.enqueue_events) - - # if len(event_list.events) > 0: # pylint: disable=len-as-condition - # self.protocol.report_event(event_list) except Exception as error: err_msg = "Failure in collecting Agent events: {0}".format(ustr(error)) add_event(op=WALAEventOperation.UnhandledError, message=err_msg, 
is_success=False) diff --git a/azurelinuxagent/ga/telemetry_service.py b/azurelinuxagent/ga/telemetry_service.py index e6ff82eccb..0632be9ad1 100644 --- a/azurelinuxagent/ga/telemetry_service.py +++ b/azurelinuxagent/ga/telemetry_service.py @@ -22,6 +22,7 @@ import traceback from azurelinuxagent.common import logger +from azurelinuxagent.common.event import add_event, WALAEventOperation from azurelinuxagent.common.future import ustr, PriorityQueue @@ -96,8 +97,9 @@ def _process_telemetry_thread(self): self._send_events_in_queue() except Exception as error: - logger.warn("An unknown error occurred in the {0} thread main loop, stopping thread. Error: {1}, Stack: {2}", - self.get_thread_name(), ustr(error), traceback.format_exc()) + err_msg = "An unknown error occurred in the {0} thread main loop, stopping thread. Error: {1}, Stack: {2}".format( + self.get_thread_name(), ustr(error), traceback.format_exc()) + add_event(op=WALAEventOperation.UnhandledError, message=err_msg, is_success=False) def _get_events_in_queue(self): while not self._queue.empty(): @@ -107,9 +109,10 @@ def _get_events_in_queue(self): yield event # Mark task_done once data processed. 
Do not mark task_done if error fetching from queue, else that will raise errors logger.verbose("Marking event as done now: {0}".format(event)) - self._queue.task_done() except Exception as error: logger.error("Some exception when fetching event from queue: {0}".format(ustr(error))) + finally: + self._queue.task_done() def _send_events_in_queue(self): # Process everything in Queue diff --git a/tests/ga/test_monitor.py b/tests/ga/test_monitor.py index c07fe957db..f399d7b445 100644 --- a/tests/ga/test_monitor.py +++ b/tests/ga/test_monitor.py @@ -216,15 +216,6 @@ def _get_event_data(duration, is_success, message, name, op, version, eventId=1) data = get_properties(event) return json.dumps(data) - def _assert_error_event_reported(self, mock_add_event, expected_msg): - found_msg = False - for call_args in mock_add_event.call_args_list: - _, kwargs = call_args - if expected_msg in kwargs['message']: - found_msg = True - break - self.assertTrue(found_msg, "Error event not reported") - @patch("azurelinuxagent.common.event.TELEMETRY_EVENT_PROVIDER_ID", _TEST_EVENT_PROVIDER_ID) # @patch("azurelinuxagent.common.protocol.wire.WireClient.send_encoded_event") @patch("azurelinuxagent.common.conf.get_lib_dir") @@ -329,80 +320,6 @@ def test_collect_and_send_events_with_large_events(self, mock_lib_dir, patch_sen # The send_event call should never be called as the events are larger than 2**16. 
self.assertEqual(0, patch_send_event.call_count) - @patch("azurelinuxagent.common.conf.get_lib_dir") - def test_collect_and_send_with_http_post_returning_503(self, mock_lib_dir, *_): - mock_lib_dir.return_value = self.lib_dir - fileutil.mkdir(self.event_dir) - - with _create_monitor_handler(enabled_operations=["collect_and_send_events"]) as monitor_handler: - def http_post_handler(url, _, **__): - if self.is_telemetry_request(url): - return MockHttpResponse(restutil.httpclient.SERVICE_UNAVAILABLE) - return None - - protocol = monitor_handler.get_mock_wire_protocol() - protocol.set_http_handlers(http_post_handler=http_post_handler) - - sizes = [1, 2, 3] # get the powers of 2, and multiple by 1024. - - for power in sizes: - size = 2 ** power * 1024 - self._create_extension_event(size) - - with patch("azurelinuxagent.ga.monitor.add_event") as mock_add_event: - monitor_handler.run_and_wait() - self.assertEqual(1, mock_add_event.call_count) - self.assertEqual(0, len(os.listdir(self.event_dir))) - expected_msg = "[ProtocolError] [Wireserver Exception] [ProtocolError] [Wireserver Failed] URI http://{0}/machine?comp=telemetrydata [HTTP Failed] Status Code 503".format( - protocol.get_endpoint()) - self._assert_error_event_reported(mock_add_event, expected_msg) - - - @patch("azurelinuxagent.common.conf.get_lib_dir") - def test_collect_and_send_with_send_event_generating_exception(self, mock_lib_dir, *args): # pylint: disable=unused-argument - mock_lib_dir.return_value = self.lib_dir - fileutil.mkdir(self.event_dir) - - with _create_monitor_handler(enabled_operations=["collect_and_send_events"]) as monitor_handler: - sizes = [1, 2, 3] # get the powers of 2, and multiple by 1024. - - for power in sizes: - size = 2 ** power * 1024 - self._create_extension_event(size) - - # This test validates that if we hit an issue while sending an event, we never send it again. 
- with patch("azurelinuxagent.ga.monitor.add_event") as mock_add_event: - with patch("azurelinuxagent.common.protocol.wire.WireClient.send_encoded_event") as patch_send_event: - test_str = "Test exception, Guid: {0}".format(str(uuid.uuid4())) - patch_send_event.side_effect = Exception(test_str) - - monitor_handler.run_and_wait() - - self.assertEqual(0, len(os.listdir(self.event_dir))) - self._assert_error_event_reported(mock_add_event, test_str) - - @patch("azurelinuxagent.common.conf.get_lib_dir") - def test_collect_and_send_with_call_wireserver_returns_http_error_and_reports_event(self, mock_lib_dir, *args): # pylint: disable=unused-argument - mock_lib_dir.return_value = self.lib_dir - fileutil.mkdir(self.event_dir) - add_event(name="MonitorTests", op=WALAEventOperation.HeartBeat, is_success=True, message="Test heartbeat") - - with _create_monitor_handler(enabled_operations=["collect_and_send_events"]) as monitor_handler: - test_str = "A test exception, Guid: {0}".format(str(uuid.uuid4())) - - def http_post_handler(url, _, **__): - if self.is_telemetry_request(url): - return HttpError(test_str) - return None - - monitor_handler.get_mock_wire_protocol().set_http_handlers(http_post_handler=http_post_handler) - - with patch("azurelinuxagent.ga.monitor.add_event") as mock_add_event: - monitor_handler.run_and_wait() - - self.assertEqual(0, len(os.listdir(self.event_dir))) - self._assert_error_event_reported(mock_add_event, test_str) - @patch('azurelinuxagent.common.osutil.get_osutil') @patch('azurelinuxagent.common.protocol.util.get_protocol_util') diff --git a/tests/ga/test_telemetry_service.py b/tests/ga/test_telemetry_service.py index 5da606cec0..e3af22d6f0 100644 --- a/tests/ga/test_telemetry_service.py +++ b/tests/ga/test_telemetry_service.py @@ -20,8 +20,11 @@ import uuid from datetime import datetime, timedelta +from azurelinuxagent.common.utils import restutil from mock import MagicMock, Mock, patch +from azurelinuxagent.common.event import WALAEventOperation 
+from azurelinuxagent.common.exception import HttpError from azurelinuxagent.common.future import ustr from azurelinuxagent.common.protocol.util import ProtocolUtil from azurelinuxagent.common.protocol.wire import event_to_v1_encoded @@ -51,6 +54,7 @@ def http_post_handler(url, body, **__): telemetry_service_handler = get_telemetry_service_handler(protocol_util) telemetry_service_handler.event_calls = [] with patch("azurelinuxagent.ga.telemetry_service.TelemetryServiceHandler._MAX_TIMEOUT", timeout): + telemetry_service_handler.get_mock_wire_protocol = lambda: protocol if start_thread: telemetry_service_handler.start() self.assertTrue(telemetry_service_handler.is_alive(), "Thread didn't start properly!") @@ -70,6 +74,26 @@ def _assert_test_data_in_event_body(self, telemetry_handler, test_events): self.assertTrue(found, "Event {0} not found in any telemetry calls".format(event_str)) + def _assert_error_event_reported(self, mock_add_event, expected_msg, op=WALAEventOperation.ReportEventErrors): + found_msg = False + for call_args in mock_add_event.call_args_list: + _, kwargs = call_args + if expected_msg in kwargs['message'] and kwargs['op'] == op: + found_msg = True + break + self.assertTrue(found_msg, "Error msg: {0} not reported".format(expected_msg)) + + def _setup_and_assert_bad_request_scenarios(self, http_post_handler, expected_msgs): + with self._create_telemetry_service_handler() as telemetry_handler: + + telemetry_handler.get_mock_wire_protocol().set_http_handlers(http_post_handler=http_post_handler) + + with patch("azurelinuxagent.common.event.add_event") as mock_add_event: + telemetry_handler.enqueue_event(TelemetryEvent()) + telemetry_handler.stop() + for msg in expected_msgs: + self._assert_error_event_reported(mock_add_event, msg) + def test_it_should_send_events_properly(self): events = [TelemetryEvent(eventId=ustr(uuid.uuid4())), TelemetryEvent(eventId=ustr(uuid.uuid4()))] @@ -142,4 +166,43 @@ def 
test_it_should_honour_the_priority_order_of_events(self): for _, event_body in telemetry_handler.event_calls: priorities.extend(re.findall(regex_pattern, event_body)) - self.assertEqual(sorted(expected_priority_order), priorities, "Priorities dont match") \ No newline at end of file + self.assertEqual(sorted(expected_priority_order), priorities, "Priorities dont match") + + def test_telemetry_service_with_call_wireserver_returns_http_error_and_reports_event(self): + + test_str = "A test exception, Guid: {0}".format(str(uuid.uuid4())) + + def http_post_handler(url, _, **__): + if self.is_telemetry_request(url): + return HttpError(test_str) + return None + + self._setup_and_assert_bad_request_scenarios(http_post_handler, [test_str]) + + def test_telemetry_service_should_report_event_when_http_post_returning_503(self): + + def http_post_handler(url, _, **__): + if self.is_telemetry_request(url): + return MockHttpResponse(restutil.httpclient.SERVICE_UNAVAILABLE) + return None + + expected_msgs = ["[ProtocolError] [Wireserver Exception] [ProtocolError] [Wireserver Failed]", + "[HTTP Failed] Status Code 503"] + + self._setup_and_assert_bad_request_scenarios(http_post_handler, expected_msgs) + + def test_telemetry_service_should_add_event_on_unexpected_errors(self): + + with self._create_telemetry_service_handler() as telemetry_handler: + + # This test validates that if we hit an issue while sending an event, we never send it again. 
+ with patch("azurelinuxagent.ga.telemetry_service.add_event") as mock_add_event: + with patch("azurelinuxagent.common.protocol.wire.WireClient.report_event") as patch_report_event: + test_str = "Test exception, Guid: {0}".format(str(uuid.uuid4())) + patch_report_event.side_effect = Exception(test_str) + + telemetry_handler.enqueue_event(TelemetryEvent()) + time.sleep(0.05) + telemetry_handler.stop() + + self._assert_error_event_reported(mock_add_event, test_str, op=WALAEventOperation.UnhandledError) diff --git a/tests/protocol/test_wire.py b/tests/protocol/test_wire.py index 1accae43ac..aca7bf567f 100644 --- a/tests/protocol/test_wire.py +++ b/tests/protocol/test_wire.py @@ -32,7 +32,7 @@ from azurelinuxagent.common.protocol.restapi import VMAgentManifestUri from azurelinuxagent.common.protocol.wire import WireProtocol, WireClient, \ InVMArtifactsProfile, StatusBlob, VMStatus -from azurelinuxagent.common.telemetryevent import TelemetryEventList, GuestAgentExtensionEventsSchema, \ +from azurelinuxagent.common.telemetryevent import GuestAgentExtensionEventsSchema, \ TelemetryEventParam, TelemetryEvent from azurelinuxagent.common.utils import restutil from azurelinuxagent.common.version import CURRENT_VERSION, DISTRO_NAME, DISTRO_VERSION From 318f7a1e232619e1ef9ebafd96aca8273d27d7fe Mon Sep 17 00:00:00 2001 From: larohra Date: Wed, 23 Sep 2020 10:10:40 -0700 Subject: [PATCH 25/63] Fixed final failing tests --- azurelinuxagent/ga/exthandlers.py | 2 +- tests/ga/test_monitor.py | 149 ----------------------- tests/ga/test_telemetry_service.py | 185 +++++++++++++++++++++++++++-- 3 files changed, 177 insertions(+), 159 deletions(-) diff --git a/azurelinuxagent/ga/exthandlers.py b/azurelinuxagent/ga/exthandlers.py index 40d9aeeebf..5ab1a6d7f6 100644 --- a/azurelinuxagent/ga/exthandlers.py +++ b/azurelinuxagent/ga/exthandlers.py @@ -84,7 +84,7 @@ _ENABLE_EXTENSION_TELEMETRY_PIPELINE = False def is_extension_telemetry_pipeline_enabled(): - return conf.get_logs_verbose() + 
return _ENABLE_EXTENSION_TELEMETRY_PIPELINE class ValidHandlerStatus(object): # pylint: disable=R0903 diff --git a/tests/ga/test_monitor.py b/tests/ga/test_monitor.py index f399d7b445..62478056b1 100644 --- a/tests/ga/test_monitor.py +++ b/tests/ga/test_monitor.py @@ -172,155 +172,6 @@ def test_it_should_clear_periodic_log_messages(self): self.assertEqual(0, len(logger.DEFAULT_LOGGER.periodic_messages), "The monitor thread did not reset the periodic log messages") -class TestEventMonitoring(AgentTestCase, HttpRequestPredicates): - def setUp(self): - AgentTestCase.setUp(self) - self.lib_dir = tempfile.mkdtemp() - self.event_dir = os.path.join(self.lib_dir, event.EVENTS_DIRECTORY) - - EventLoggerTools.initialize_event_logger(self.event_dir) - - def tearDown(self): - fileutil.rm_dirs(self.lib_dir) - - _TEST_EVENT_PROVIDER_ID = "XXXXXXXX-XXXX-XXXX-XXXX-XXXXXXXXXXXX" - - def _create_extension_event(self, # pylint: disable=invalid-name,too-many-arguments - size=0, - name="DummyExtension", - op=WALAEventOperation.Unknown, - is_success=True, - duration=0, - version=CURRENT_VERSION, - message="DummyMessage"): - event_data = TestEventMonitoring._get_event_data(name=size if size != 0 else name, - op=op, - is_success=is_success, - duration=duration, - version=version, - message=random_generator(size) if size != 0 else message) - event_file = os.path.join(self.event_dir, "{0}.tld".format(int(time.time() * 1000000))) - with open(event_file, 'wb+') as fd: # pylint: disable=invalid-name - fd.write(event_data.encode('utf-8')) - - @staticmethod - def _get_event_data(duration, is_success, message, name, op, version, eventId=1): # pylint: disable=invalid-name,too-many-arguments - event = TelemetryEvent(eventId, TestEventMonitoring._TEST_EVENT_PROVIDER_ID) # pylint: disable=redefined-outer-name - event.parameters.append(TelemetryEventParam(GuestAgentExtensionEventsSchema.Name, name)) - event.parameters.append(TelemetryEventParam(GuestAgentExtensionEventsSchema.Version, str(version))) - 
event.parameters.append(TelemetryEventParam(GuestAgentExtensionEventsSchema.Operation, op)) - event.parameters.append(TelemetryEventParam(GuestAgentExtensionEventsSchema.OperationSuccess, is_success)) - event.parameters.append(TelemetryEventParam(GuestAgentExtensionEventsSchema.Message, message)) - event.parameters.append(TelemetryEventParam(GuestAgentExtensionEventsSchema.Duration, duration)) - - data = get_properties(event) - return json.dumps(data) - - @patch("azurelinuxagent.common.event.TELEMETRY_EVENT_PROVIDER_ID", _TEST_EVENT_PROVIDER_ID) - # @patch("azurelinuxagent.common.protocol.wire.WireClient.send_encoded_event") - @patch("azurelinuxagent.common.conf.get_lib_dir") - def test_collect_and_send_events(self, mock_lib_dir, *_): - mock_lib_dir.return_value = self.lib_dir - - with _create_monitor_handler(enabled_operations=["collect_and_enqueue_events"]) as monitor_handler: - self._create_extension_event(message="Message-Test") - - test_mtime = 1000 # epoch time, in ms - test_opcodename = datetime.datetime.fromtimestamp(test_mtime).strftime(logger.Logger.LogTimeFormatInUTC) - test_eventtid = 42 - test_eventpid = 24 - test_taskname = "TEST_TaskName" - - with patch("os.path.getmtime", return_value=test_mtime): - with patch('os.getpid', return_value=test_eventpid): - with patch("threading.Thread.ident", new_callable=PropertyMock(return_value=test_eventtid)): - with patch("threading.Thread.getName", return_value=test_taskname): - monitor_handler.run_and_wait() - - # Validating the crafted message by the collect_and_send_events call. 
- self.assertEqual(1, len(monitor_handler.event_list)) - collected_event = monitor_handler.get_mock_wire_protocol().client.send_encoded_event.call_args[0] # pylint: disable=no-member - - # Some of those expected values come from the mock protocol and imds client set up during test initialization - osutil = get_osutil() - osversion = u"{0}:{1}-{2}-{3}:{4}".format(platform.system(), DISTRO_NAME, DISTRO_VERSION, DISTRO_CODE_NAME,platform.release()) - - sample_message = '' \ - '' \ - '' \ - '' \ - '' \ - '' \ - '' \ - '' \ - '' \ - '' \ - '' \ - '' \ - '' \ - '' \ - '' \ - '' \ - '' \ - '' \ - '' \ - '' \ - '' \ - '' \ - '' \ - '' \ - '' \ - '' \ - '' \ - ']]>'.format(AGENT_VERSION, CURRENT_AGENT, test_opcodename, test_eventtid, - test_eventpid, test_taskname, osversion, int(osutil.get_total_mem()), - osutil.get_processor_cores()) - - self.maxDiff = None # pylint: disable=invalid-name - self.assertEqual(sample_message.encode('utf-8'), collected_event[1]) - - @patch("azurelinuxagent.common.protocol.wire.WireClient.send_encoded_event") - @patch("azurelinuxagent.common.conf.get_lib_dir") - def test_collect_and_send_events_with_small_events(self, mock_lib_dir, patch_send_event, *_): - mock_lib_dir.return_value = self.lib_dir - - with _create_monitor_handler(enabled_operations=["collect_and_send_events"]) as monitor_handler: - - sizes = [15, 15, 15, 15] # get the powers of 2 - 2**16 is the limit - - for power in sizes: - size = 2 ** power - self._create_extension_event(size) - - monitor_handler.run_and_wait() - - # The send_event call would be called each time, as we are filling up the buffer up to the brim for each call. 
- - self.assertEqual(4, patch_send_event.call_count) - - @patch("azurelinuxagent.common.protocol.wire.WireClient.send_encoded_event") - @patch("azurelinuxagent.common.conf.get_lib_dir") - def test_collect_and_send_events_with_large_events(self, mock_lib_dir, patch_send_event, *_): - mock_lib_dir.return_value = self.lib_dir - - with _create_monitor_handler(enabled_operations=["collect_and_send_events"]) as monitor_handler: - - sizes = [17, 17, 17] # get the powers of 2 - - for power in sizes: - size = 2 ** power - self._create_extension_event(size) - - with patch("azurelinuxagent.common.logger.periodic_warn") as patch_periodic_warn: - monitor_handler.run_and_wait() - - self.assertEqual(3, patch_periodic_warn.call_count) - - # The send_event call should never be called as the events are larger than 2**16. - self.assertEqual(0, patch_send_event.call_count) - - @patch('azurelinuxagent.common.osutil.get_osutil') @patch('azurelinuxagent.common.protocol.util.get_protocol_util') @patch("azurelinuxagent.common.protocol.healthservice.HealthService._report") diff --git a/tests/ga/test_telemetry_service.py b/tests/ga/test_telemetry_service.py index e3af22d6f0..84ae003297 100644 --- a/tests/ga/test_telemetry_service.py +++ b/tests/ga/test_telemetry_service.py @@ -20,25 +20,54 @@ import uuid from datetime import datetime, timedelta -from azurelinuxagent.common.utils import restutil -from mock import MagicMock, Mock, patch +import tempfile + +import os + +import json + +import platform + +from azurelinuxagent.common.osutil.factory import get_osutil + +from azurelinuxagent.common import event, logger +from azurelinuxagent.common.datacontract import get_properties + +from azurelinuxagent.common.utils import restutil, fileutil +from mock import MagicMock, Mock, patch, PropertyMock from azurelinuxagent.common.event import WALAEventOperation from azurelinuxagent.common.exception import HttpError from azurelinuxagent.common.future import ustr from azurelinuxagent.common.protocol.util 
import ProtocolUtil from azurelinuxagent.common.protocol.wire import event_to_v1_encoded -from azurelinuxagent.common.telemetryevent import TelemetryEvent, TelemetryEventPriorities, TelemetryEventParam +from azurelinuxagent.common.telemetryevent import TelemetryEvent, TelemetryEventPriorities, TelemetryEventParam, \ + GuestAgentExtensionEventsSchema +from azurelinuxagent.common.version import CURRENT_VERSION, DISTRO_NAME, DISTRO_VERSION, AGENT_VERSION, CURRENT_AGENT, \ + DISTRO_CODE_NAME +from azurelinuxagent.ga.monitor import CollectAndEnqueueEventsPeriodicOperation from azurelinuxagent.ga.telemetry_service import get_telemetry_service_handler +from tests.ga.test_monitor import random_generator from tests.protocol.mocks import MockHttpResponse, mock_wire_protocol, HttpRequestPredicates from tests.protocol.mockwiredata import DATA_FILE from tests.tools import AgentTestCase, clear_singleton_instances +from tests.utils.event_logger_tools import EventLoggerTools -class TestExtensionTelemetryHandler(AgentTestCase, HttpRequestPredicates): +class TestTelemetryServiceHandler(AgentTestCase, HttpRequestPredicates): def setUp(self): AgentTestCase.setUp(self) clear_singleton_instances(ProtocolUtil) + self.lib_dir = tempfile.mkdtemp() + self.event_dir = os.path.join(self.lib_dir, event.EVENTS_DIRECTORY) + + EventLoggerTools.initialize_event_logger(self.event_dir) + + def tearDown(self): + AgentTestCase.tearDown(self) + fileutil.rm_dirs(self.lib_dir) + + _TEST_EVENT_PROVIDER_ID = "XXXXXXXX-XXXX-XXXX-XXXX-XXXXXXXXXXXX" @contextlib.contextmanager def _create_telemetry_service_handler(self, timeout=0.5, start_thread=True): @@ -60,9 +89,15 @@ def http_post_handler(url, body, **__): self.assertTrue(telemetry_service_handler.is_alive(), "Thread didn't start properly!") yield telemetry_service_handler + @staticmethod + def _stop_handler(telemetry_handler, timeout=0.001): + # Giving it some grace time to finish execution and then stopping thread + time.sleep(timeout) + 
telemetry_handler.stop() + def _assert_test_data_in_event_body(self, telemetry_handler, test_events): # Stop the thread and Wait for the queue and thread to join - telemetry_handler.stop() + TestTelemetryServiceHandler._stop_handler(telemetry_handler) for event in test_events: event_str = event_to_v1_encoded(event) @@ -90,7 +125,7 @@ def _setup_and_assert_bad_request_scenarios(self, http_post_handler, expected_ms with patch("azurelinuxagent.common.event.add_event") as mock_add_event: telemetry_handler.enqueue_event(TelemetryEvent()) - telemetry_handler.stop() + TestTelemetryServiceHandler._stop_handler(telemetry_handler) for msg in expected_msgs: self._assert_error_event_reported(mock_add_event, msg) @@ -193,7 +228,7 @@ def http_post_handler(url, _, **__): def test_telemetry_service_should_add_event_on_unexpected_errors(self): - with self._create_telemetry_service_handler() as telemetry_handler: + with self._create_telemetry_service_handler(timeout=0.1) as telemetry_handler: # This test validates that if we hit an issue while sending an event, we never send it again. 
with patch("azurelinuxagent.ga.telemetry_service.add_event") as mock_add_event: @@ -202,7 +237,139 @@ def test_telemetry_service_should_add_event_on_unexpected_errors(self): patch_report_event.side_effect = Exception(test_str) telemetry_handler.enqueue_event(TelemetryEvent()) - time.sleep(0.05) - telemetry_handler.stop() + TestTelemetryServiceHandler._stop_handler(telemetry_handler, timeout=0.01) self._assert_error_event_reported(mock_add_event, test_str, op=WALAEventOperation.UnhandledError) + + def _create_extension_event(self, # pylint: disable=invalid-name,too-many-arguments + size=0, + name="DummyExtension", + op=WALAEventOperation.Unknown, + is_success=True, + duration=0, + version=CURRENT_VERSION, + message="DummyMessage"): + event_data = self._get_event_data(name=size if size != 0 else name, + op=op, + is_success=is_success, + duration=duration, + version=version, + message=random_generator(size) if size != 0 else message) + event_file = os.path.join(self.event_dir, "{0}.tld".format(int(time.time() * 1000000))) + with open(event_file, 'wb+') as fd: # pylint: disable=invalid-name + fd.write(event_data.encode('utf-8')) + + @staticmethod + def _get_event_data(duration, is_success, message, name, op, version, eventId=1): # pylint: disable=invalid-name,too-many-arguments + event = TelemetryEvent(eventId, TestTelemetryServiceHandler._TEST_EVENT_PROVIDER_ID) # pylint: disable=redefined-outer-name + event.parameters.append(TelemetryEventParam(GuestAgentExtensionEventsSchema.Name, name)) + event.parameters.append(TelemetryEventParam(GuestAgentExtensionEventsSchema.Version, str(version))) + event.parameters.append(TelemetryEventParam(GuestAgentExtensionEventsSchema.Operation, op)) + event.parameters.append(TelemetryEventParam(GuestAgentExtensionEventsSchema.OperationSuccess, is_success)) + event.parameters.append(TelemetryEventParam(GuestAgentExtensionEventsSchema.Message, message)) + 
event.parameters.append(TelemetryEventParam(GuestAgentExtensionEventsSchema.Duration, duration)) + + data = get_properties(event) + return json.dumps(data) + + @patch("azurelinuxagent.common.event.TELEMETRY_EVENT_PROVIDER_ID", _TEST_EVENT_PROVIDER_ID) + @patch("azurelinuxagent.common.conf.get_lib_dir") + def test_it_should_enqueue_and_send_events_properly(self, mock_lib_dir, *_): + mock_lib_dir.return_value = self.lib_dir + + with self._create_telemetry_service_handler() as telemetry_handler: + monitor_handler = CollectAndEnqueueEventsPeriodicOperation(telemetry_handler.enqueue_event) + self._create_extension_event(message="Message-Test") + + test_mtime = 1000 # epoch time, in ms + test_opcodename = datetime.fromtimestamp(test_mtime).strftime(logger.Logger.LogTimeFormatInUTC) + test_eventtid = 42 + test_eventpid = 24 + test_taskname = "TEST_TaskName" + + with patch("os.path.getmtime", return_value=test_mtime): + with patch('os.getpid', return_value=test_eventpid): + with patch("threading.Thread.ident", new_callable=PropertyMock(return_value=test_eventtid)): + with patch("threading.Thread.getName", return_value=test_taskname): + monitor_handler.run() + + TestTelemetryServiceHandler._stop_handler(telemetry_handler) + # Validating the crafted message by the collect_and_send_events call. 
+ self.assertEqual(1, len(telemetry_handler.event_calls), "Only 1 event should be sent") + + _, collected_event = telemetry_handler.event_calls[0] + + # Some of those expected values come from the mock protocol and imds client set up during test initialization + osutil = get_osutil() + osversion = u"{0}:{1}-{2}-{3}:{4}".format(platform.system(), DISTRO_NAME, DISTRO_VERSION, DISTRO_CODE_NAME, + platform.release()) + + sample_message = '' \ + '' \ + '' \ + '' \ + '' \ + '' \ + '' \ + '' \ + '' \ + '' \ + '' \ + '' \ + '' \ + '' \ + '' \ + '' \ + '' \ + '' \ + '' \ + '' \ + '' \ + '' \ + '' \ + '' \ + '' \ + '' \ + '' \ + ']]>'.format(AGENT_VERSION, CURRENT_AGENT, test_opcodename, test_eventtid, + test_eventpid, test_taskname, osversion, int(osutil.get_total_mem()), + osutil.get_processor_cores()) + + self.assertIn(sample_message.encode('utf-8'), collected_event) + + @patch("azurelinuxagent.common.conf.get_lib_dir") + def test_collect_and_send_events_with_small_events(self, mock_lib_dir): + mock_lib_dir.return_value = self.lib_dir + + with self._create_telemetry_service_handler() as telemetry_handler: + sizes = [15, 15, 15, 15] # get the powers of 2 - 2**16 is the limit + + for power in sizes: + size = 2 ** power + self._create_extension_event(size) + + CollectAndEnqueueEventsPeriodicOperation(telemetry_handler.enqueue_event).run() + + # The send_event call would be called each time, as we are filling up the buffer up to the brim for each call. 
+ TestTelemetryServiceHandler._stop_handler(telemetry_handler) + self.assertEqual(4, len(telemetry_handler.event_calls)) + + @patch("azurelinuxagent.common.conf.get_lib_dir") + def test_collect_and_send_events_with_large_events(self, mock_lib_dir): + mock_lib_dir.return_value = self.lib_dir + + with self._create_telemetry_service_handler() as telemetry_handler: + sizes = [17, 17, 17] # get the powers of 2 + + for power in sizes: + size = 2 ** power + self._create_extension_event(size) + + with patch("azurelinuxagent.common.logger.periodic_warn") as patch_periodic_warn: + CollectAndEnqueueEventsPeriodicOperation(telemetry_handler.enqueue_event).run() + TestTelemetryServiceHandler._stop_handler(telemetry_handler) + self.assertEqual(3, patch_periodic_warn.call_count) + + # The send_event call should never be called as the events are larger than 2**16. + self.assertEqual(0, len(telemetry_handler.event_calls)) \ No newline at end of file From 16561feaa3974d57ef66edd254f6550ab6ca23a1 Mon Sep 17 00:00:00 2001 From: larohra Date: Wed, 23 Sep 2020 10:41:50 -0700 Subject: [PATCH 26/63] Fixed linter errors --- azurelinuxagent/common/event.py | 4 ++-- azurelinuxagent/common/future.py | 4 ++-- azurelinuxagent/common/protocol/wire.py | 4 ++-- azurelinuxagent/ga/extension_telemetry.py | 6 ++--- tests/common/test_event.py | 6 ++--- tests/ga/test_monitor.py | 28 +++++++---------------- tests/ga/test_telemetry_service.py | 18 +++++++-------- 7 files changed, 29 insertions(+), 41 deletions(-) diff --git a/azurelinuxagent/common/event.py b/azurelinuxagent/common/event.py index e6a016a4e0..3e2030ce48 100644 --- a/azurelinuxagent/common/event.py +++ b/azurelinuxagent/common/event.py @@ -579,10 +579,10 @@ def _trim_extension_event_parameters(event): event.parameters = trimmed_params @staticmethod - def report_dropped_events_error(count, errors, op, max_errors_to_report): + def report_dropped_events_error(count, errors, operation_name, max_errors_to_report): err_msg_format = 
"DroppedEventsCount: {0}\nReasons (first {1} errors): {2}" if count > 0: - add_event(op=op, + add_event(op=operation_name, message=err_msg_format.format(count, max_errors_to_report, ', '.join(errors)), is_success=False) diff --git a/azurelinuxagent/common/future.py b/azurelinuxagent/common/future.py index bf65cf6b0c..01ab8a073d 100644 --- a/azurelinuxagent/common/future.py +++ b/azurelinuxagent/common/future.py @@ -26,12 +26,12 @@ bytebuffer = memoryview # pylint: disable=C0103 from collections import OrderedDict # pylint: disable=W0611 - from queue import PriorityQueue + from queue import PriorityQueue # pylint: disable=W0611 elif sys.version_info[0] == 2: import httplib as httpclient # pylint: disable=E0401,W0611 from urlparse import urlparse # pylint: disable=E0401 - from Queue import PriorityQueue + from Queue import PriorityQueue # pylint: disable=W0611 """Rename Python2 unicode to ustr""" # pylint: disable=W0105 ustr = unicode # pylint: disable=E0602,invalid-name diff --git a/azurelinuxagent/common/protocol/wire.py b/azurelinuxagent/common/protocol/wire.py index 6cd81f8c2a..d9f9bb3937 100644 --- a/azurelinuxagent/common/protocol/wire.py +++ b/azurelinuxagent/common/protocol/wire.py @@ -24,13 +24,13 @@ import traceback import xml.sax.saxutils as saxutils from collections import defaultdict -from datetime import datetime # pylint: disable=ungrouped-imports +from datetime import datetime # pylint: disable=ungrouped-imports import azurelinuxagent.common.conf as conf import azurelinuxagent.common.logger as logger import azurelinuxagent.common.utils.textutil as textutil from azurelinuxagent.common.datacontract import validate_param -from azurelinuxagent.common.event import add_event, add_periodic, WALAEventOperation, EVENTS_DIRECTORY, EventLogger, \ +from azurelinuxagent.common.event import add_event, add_periodic, WALAEventOperation, EventLogger, \ report_event from azurelinuxagent.common.exception import ProtocolNotFoundError, \ ResourceGoneError, 
ExtensionDownloadError, InvalidContainerError, ProtocolError, HttpError diff --git a/azurelinuxagent/ga/extension_telemetry.py b/azurelinuxagent/ga/extension_telemetry.py index 8137aa9113..4bba368e8e 100644 --- a/azurelinuxagent/ga/extension_telemetry.py +++ b/azurelinuxagent/ga/extension_telemetry.py @@ -388,15 +388,15 @@ def stopped(self): return not self.should_run def daemon(self): - op = ProcessExtensionTelemetry(self._enqueue_event) + periodic_operation = ProcessExtensionTelemetry(self._enqueue_event) logger.info("Successfully started the {0} thread".format(self.get_thread_name())) while not self.stopped(): try: - op.run() + periodic_operation.run() except Exception as error: logger.warn( "An error occurred in the Telemetry Extension thread main loop; will skip the current iteration.\n{0}", ustr(error)) finally: - PeriodicOperation.sleep_until_next_operation([op]) \ No newline at end of file + PeriodicOperation.sleep_until_next_operation([periodic_operation]) \ No newline at end of file diff --git a/tests/common/test_event.py b/tests/common/test_event.py index 8d6c9e66e9..966c606805 100644 --- a/tests/common/test_event.py +++ b/tests/common/test_event.py @@ -94,7 +94,7 @@ def _yield_events(): @staticmethod def _collect_events(): event_list = [] - event.collect_events(lambda telemetry_event: event_list.append(telemetry_event)) + event.collect_events(event_list.append) return event_list @staticmethod @@ -388,8 +388,8 @@ def test_collect_events_should_be_able_to_process_events_with_non_ascii_characte event_list = self._collect_events() self.assertEquals(len(event_list), 1) # pylint: disable=deprecated-method - self.assertEquals(TestEvent._get_event_message(event_list[0]), - u'World\u05e2\u05d9\u05d5\u05ea \u05d0\u05d7\u05e8\u05d5\u05ea\u0906\u091c') # pylint: disable=deprecated-method + self.assertEquals(TestEvent._get_event_message(event_list[0]), # pylint: disable=deprecated-method + u'World\u05e2\u05d9\u05d5\u05ea \u05d0\u05d7\u05e8\u05d5\u05ea\u0906\u091c') 
def test_collect_events_should_ignore_invalid_event_files(self): self._create_test_event_file("custom_script_1.tld") # a valid event diff --git a/tests/ga/test_monitor.py b/tests/ga/test_monitor.py index 62478056b1..4fce933158 100644 --- a/tests/ga/test_monitor.py +++ b/tests/ga/test_monitor.py @@ -15,40 +15,28 @@ # # Requires Python 2.6+ and Openssl 1.0+ # -import datetime import contextlib -import json +import datetime import os -import platform import random import re import string -import tempfile -import time -import uuid -from datetime import timedelta # pylint: disable=ungrouped-imports - -from azurelinuxagent.common.protocol.util import ProtocolUtil +from datetime import timedelta # pylint: disable=ungrouped-imports from azurelinuxagent.common import event, logger from azurelinuxagent.common.cgroup import CGroup, CpuCgroup, MemoryCgroup, MetricValue from azurelinuxagent.common.cgroupconfigurator import CGroupConfigurator from azurelinuxagent.common.cgroupstelemetry import CGroupsTelemetry -from azurelinuxagent.common.datacontract import get_properties -from azurelinuxagent.common.event import add_event, WALAEventOperation, EVENTS_DIRECTORY -from azurelinuxagent.common.exception import HttpError +from azurelinuxagent.common.event import EVENTS_DIRECTORY from azurelinuxagent.common.logger import Logger -from azurelinuxagent.common.osutil import get_osutil +from azurelinuxagent.common.protocol.util import ProtocolUtil from azurelinuxagent.common.protocol.wire import WireProtocol -from azurelinuxagent.common.telemetryevent import TelemetryEvent, TelemetryEventParam, GuestAgentExtensionEventsSchema -from azurelinuxagent.common.utils import fileutil, restutil -from azurelinuxagent.common.version import AGENT_VERSION, CURRENT_VERSION, CURRENT_AGENT, DISTRO_NAME, DISTRO_VERSION, DISTRO_CODE_NAME -from azurelinuxagent.ga.monitor import get_monitor_handler, MonitorHandler, PeriodicOperation, ResetPeriodicLogMessagesOperation, PollResourceUsageOperation +from 
azurelinuxagent.ga.monitor import get_monitor_handler, MonitorHandler, PeriodicOperation, \ + ResetPeriodicLogMessagesOperation, PollResourceUsageOperation from tests.common.mock_cgroup_commands import mock_cgroup_commands -from tests.protocol.mockwiredata import DATA_FILE from tests.protocol.mocks import mock_wire_protocol, HttpRequestPredicates, MockHttpResponse -from tests.tools import Mock, MagicMock, patch, AgentTestCase, clear_singleton_instances, PropertyMock -from tests.utils.event_logger_tools import EventLoggerTools +from tests.protocol.mockwiredata import DATA_FILE +from tests.tools import Mock, MagicMock, patch, AgentTestCase, clear_singleton_instances def random_generator(size=6, chars=string.ascii_uppercase + string.digits + string.ascii_lowercase): diff --git a/tests/ga/test_telemetry_service.py b/tests/ga/test_telemetry_service.py index 84ae003297..9a750ee2c0 100644 --- a/tests/ga/test_telemetry_service.py +++ b/tests/ga/test_telemetry_service.py @@ -28,15 +28,15 @@ import platform +from mock import MagicMock, Mock, patch, PropertyMock from azurelinuxagent.common.osutil.factory import get_osutil -from azurelinuxagent.common import event, logger +from azurelinuxagent.common import logger from azurelinuxagent.common.datacontract import get_properties from azurelinuxagent.common.utils import restutil, fileutil -from mock import MagicMock, Mock, patch, PropertyMock -from azurelinuxagent.common.event import WALAEventOperation +from azurelinuxagent.common.event import WALAEventOperation, EVENTS_DIRECTORY from azurelinuxagent.common.exception import HttpError from azurelinuxagent.common.future import ustr from azurelinuxagent.common.protocol.util import ProtocolUtil @@ -59,7 +59,7 @@ def setUp(self): AgentTestCase.setUp(self) clear_singleton_instances(ProtocolUtil) self.lib_dir = tempfile.mkdtemp() - self.event_dir = os.path.join(self.lib_dir, event.EVENTS_DIRECTORY) + self.event_dir = os.path.join(self.lib_dir, EVENTS_DIRECTORY) 
EventLoggerTools.initialize_event_logger(self.event_dir) @@ -99,8 +99,8 @@ def _assert_test_data_in_event_body(self, telemetry_handler, test_events): # Stop the thread and Wait for the queue and thread to join TestTelemetryServiceHandler._stop_handler(telemetry_handler) - for event in test_events: - event_str = event_to_v1_encoded(event) + for telemetry_event in test_events: + event_str = event_to_v1_encoded(telemetry_event) found = False for _, event_body in telemetry_handler.event_calls: if event_str in event_body: @@ -109,11 +109,11 @@ def _assert_test_data_in_event_body(self, telemetry_handler, test_events): self.assertTrue(found, "Event {0} not found in any telemetry calls".format(event_str)) - def _assert_error_event_reported(self, mock_add_event, expected_msg, op=WALAEventOperation.ReportEventErrors): + def _assert_error_event_reported(self, mock_add_event, expected_msg, operation=WALAEventOperation.ReportEventErrors): found_msg = False for call_args in mock_add_event.call_args_list: _, kwargs = call_args - if expected_msg in kwargs['message'] and kwargs['op'] == op: + if expected_msg in kwargs['message'] and kwargs['op'] == operation: found_msg = True break self.assertTrue(found_msg, "Error msg: {0} not reported".format(expected_msg)) @@ -239,7 +239,7 @@ def test_telemetry_service_should_add_event_on_unexpected_errors(self): telemetry_handler.enqueue_event(TelemetryEvent()) TestTelemetryServiceHandler._stop_handler(telemetry_handler, timeout=0.01) - self._assert_error_event_reported(mock_add_event, test_str, op=WALAEventOperation.UnhandledError) + self._assert_error_event_reported(mock_add_event, test_str, operation=WALAEventOperation.UnhandledError) def _create_extension_event(self, # pylint: disable=invalid-name,too-many-arguments size=0, From 8c7207b2537afbc2123afa12949de5d732b6d759 Mon Sep 17 00:00:00 2001 From: larohra Date: Wed, 23 Sep 2020 11:03:49 -0700 Subject: [PATCH 27/63] Fixed linter errors - pt2 --- azurelinuxagent/common/future.py | 4 ++-- 
azurelinuxagent/common/protocol/wire.py | 3 ++- azurelinuxagent/common/telemetryevent.py | 3 ++- 3 files changed, 6 insertions(+), 4 deletions(-) diff --git a/azurelinuxagent/common/future.py b/azurelinuxagent/common/future.py index 01ab8a073d..844c196211 100644 --- a/azurelinuxagent/common/future.py +++ b/azurelinuxagent/common/future.py @@ -26,12 +26,12 @@ bytebuffer = memoryview # pylint: disable=C0103 from collections import OrderedDict # pylint: disable=W0611 - from queue import PriorityQueue # pylint: disable=W0611 + from queue import PriorityQueue # pylint: disable=W0611,import-error elif sys.version_info[0] == 2: import httplib as httpclient # pylint: disable=E0401,W0611 from urlparse import urlparse # pylint: disable=E0401 - from Queue import PriorityQueue # pylint: disable=W0611 + from Queue import PriorityQueue # pylint: disable=W0611,import-error """Rename Python2 unicode to ustr""" # pylint: disable=W0105 ustr = unicode # pylint: disable=E0602,invalid-name diff --git a/azurelinuxagent/common/protocol/wire.py b/azurelinuxagent/common/protocol/wire.py index d9f9bb3937..20d9455543 100644 --- a/azurelinuxagent/common/protocol/wire.py +++ b/azurelinuxagent/common/protocol/wire.py @@ -1098,7 +1098,8 @@ def send_encoded_event(self, provider_id, event_str, encoding='utf-8'): raise ProtocolError( "Failed to send events:{0}".format(resp.status)) - def report_event(self, get_events): + # Pylint too-many-locals: Disabling this here as a lot of the locals are used for error debugging + def report_event(self, get_events): # pylint: disable=too-many-locals max_send_errors_to_report = 5 buf = {} events_per_request = defaultdict(int) diff --git a/azurelinuxagent/common/telemetryevent.py b/azurelinuxagent/common/telemetryevent.py index 055cff1d91..0950084def 100644 --- a/azurelinuxagent/common/telemetryevent.py +++ b/azurelinuxagent/common/telemetryevent.py @@ -82,7 +82,8 @@ def __eq__(self, other): return isinstance(other, TelemetryEventParam) and other.name == 
self.name and other.value == self.value -class TelemetryEventPriorities(object): +# Pylint R0903 (too-few-public-methods) : Disabling here as this class is used as an Enum +class TelemetryEventPriorities(object): # pylint: disable=R0903 """ Class defining the priorities for telemetry events. Lower the number, higher the priority From 98e8c3a44726a6c9330b471caa058d612cde2a0b Mon Sep 17 00:00:00 2001 From: larohra Date: Wed, 23 Sep 2020 11:23:13 -0700 Subject: [PATCH 28/63] Fixed failing py3 test --- tests/ga/test_telemetry_service.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tests/ga/test_telemetry_service.py b/tests/ga/test_telemetry_service.py index 9a750ee2c0..9b89fb063b 100644 --- a/tests/ga/test_telemetry_service.py +++ b/tests/ga/test_telemetry_service.py @@ -34,7 +34,7 @@ from azurelinuxagent.common import logger from azurelinuxagent.common.datacontract import get_properties -from azurelinuxagent.common.utils import restutil, fileutil +from azurelinuxagent.common.utils import restutil, fileutil, textutil from azurelinuxagent.common.event import WALAEventOperation, EVENTS_DIRECTORY from azurelinuxagent.common.exception import HttpError @@ -199,7 +199,7 @@ def test_it_should_honour_the_priority_order_of_events(self): priorities = [] regex_pattern = r'' for _, event_body in telemetry_handler.event_calls: - priorities.extend(re.findall(regex_pattern, event_body)) + priorities.extend(re.findall(regex_pattern, textutil.str_to_encoded_ustr(event_body))) self.assertEqual(sorted(expected_priority_order), priorities, "Priorities dont match") From ea94628f9708ea9d89555bada5f9b19cae4f12b7 Mon Sep 17 00:00:00 2001 From: larohra Date: Wed, 23 Sep 2020 12:06:33 -0700 Subject: [PATCH 29/63] Added thread interface to Telemetry service and minor bug fixes --- azurelinuxagent/ga/monitor.py | 1 - azurelinuxagent/ga/telemetry_service.py | 3 ++- tests/ga/test_telemetry_service.py | 2 -- 3 files changed, 2 insertions(+), 4 deletions(-) diff --git 
a/azurelinuxagent/ga/monitor.py b/azurelinuxagent/ga/monitor.py index 8724dd8f55..6096c80003 100644 --- a/azurelinuxagent/ga/monitor.py +++ b/azurelinuxagent/ga/monitor.py @@ -28,7 +28,6 @@ from azurelinuxagent.common.future import ustr from azurelinuxagent.common.interfaces import ThreadHandlerInterface from azurelinuxagent.common.osutil import get_osutil -from azurelinuxagent.common.protocol.util import get_protocol_util from azurelinuxagent.common.protocol.healthservice import HealthService from azurelinuxagent.common.protocol.imds import get_imds_client from azurelinuxagent.common.protocol.util import get_protocol_util diff --git a/azurelinuxagent/ga/telemetry_service.py b/azurelinuxagent/ga/telemetry_service.py index 0632be9ad1..2f476ada9b 100644 --- a/azurelinuxagent/ga/telemetry_service.py +++ b/azurelinuxagent/ga/telemetry_service.py @@ -24,13 +24,14 @@ from azurelinuxagent.common import logger from azurelinuxagent.common.event import add_event, WALAEventOperation from azurelinuxagent.common.future import ustr, PriorityQueue +from azurelinuxagent.common.interfaces import ThreadHandlerInterface def get_telemetry_service_handler(protocol_util): return TelemetryServiceHandler(protocol_util) -class TelemetryServiceHandler(object): +class TelemetryServiceHandler(ThreadHandlerInterface): """ This Handler takes care of sending all telemetry out of the agent to Wireserver. It sends out data as soon as there's any data available in the queue to send. 
diff --git a/tests/ga/test_telemetry_service.py b/tests/ga/test_telemetry_service.py index 9b89fb063b..c0559dee68 100644 --- a/tests/ga/test_telemetry_service.py +++ b/tests/ga/test_telemetry_service.py @@ -191,8 +191,6 @@ def test_it_should_honour_the_priority_order_of_events(self): telemetry_handler.enqueue_event(test_event) telemetry_handler.start() - # Give the thread some time to start up, this was causing concurrency issues in UTs - time.sleep(0.005) self.assertTrue(telemetry_handler.is_alive(), "Thread not alive") self._assert_test_data_in_event_body(telemetry_handler, events) From 340c6f4213a9285177931f78e547b73485730769 Mon Sep 17 00:00:00 2001 From: larohra Date: Wed, 23 Sep 2020 13:38:37 -0700 Subject: [PATCH 30/63] Final checks, code cleanup --- azurelinuxagent/common/event.py | 8 ++-- azurelinuxagent/common/future.py | 3 +- azurelinuxagent/common/protocol/wire.py | 10 ++--- azurelinuxagent/common/telemetryevent.py | 2 +- azurelinuxagent/ga/extension_telemetry.py | 14 +++---- azurelinuxagent/ga/monitor.py | 14 +++---- azurelinuxagent/ga/telemetry_service.py | 16 +++----- azurelinuxagent/ga/update.py | 4 +- tests/common/test_event.py | 2 +- tests/ga/test_extension_telemetry.py | 5 +-- tests/ga/test_monitor.py | 10 ++--- tests/ga/test_telemetry_service.py | 47 +++++++++-------------- 12 files changed, 57 insertions(+), 78 deletions(-) diff --git a/azurelinuxagent/common/event.py b/azurelinuxagent/common/event.py index 8520f28ee3..3781ba6833 100644 --- a/azurelinuxagent/common/event.py +++ b/azurelinuxagent/common/event.py @@ -587,7 +587,7 @@ def report_dropped_events_error(count, errors, operation_name, max_errors_to_rep message=err_msg_format.format(count, max_errors_to_report, ', '.join(errors)), is_success=False) - def collect_events(self, enqueue_event): # pylint: disable=R0914 + def collect_events(self, enqueue_event_func): # pylint: disable=R0914 """ Retuns a list of events that need to be sent to the telemetry pipeline and deletes the corresponding 
files from the events directory. @@ -629,7 +629,7 @@ def collect_events(self, enqueue_event): # pylint: disable=R0914 else: self._update_legacy_agent_event(event, event_file_creation_time) - enqueue_event(event) + enqueue_event_func(event) finally: os.remove(event_file_path) except UnicodeError as uni_err: @@ -771,8 +771,8 @@ def add_periodic(delta, name, op=WALAEventOperation.Unknown, is_success=True, du message=message, log_event=log_event, force=force) -def collect_events(enqueue_event, reporter=__event_logger__): - return reporter.collect_events(enqueue_event) +def collect_events(enqueue_event_func, reporter=__event_logger__): + return reporter.collect_events(enqueue_event_func) def mark_event_status(name, version, op, status): # pylint: disable=C0103 diff --git a/azurelinuxagent/common/future.py b/azurelinuxagent/common/future.py index dd3d94b9e6..c019b695c7 100644 --- a/azurelinuxagent/common/future.py +++ b/azurelinuxagent/common/future.py @@ -29,8 +29,9 @@ # to import from this module in others. # Additionally, python2 doesn't have this, so we need to disable import-error # as well. 
- from builtins import int, range # pylint: disable=unused-import,import-error + # unused-import, import-error Disabled: Due to backward compatibility between py2 and py3 + from builtins import int, range # pylint: disable=unused-import,import-error from collections import OrderedDict # pylint: disable=W0611 from queue import PriorityQueue # pylint: disable=W0611,import-error diff --git a/azurelinuxagent/common/protocol/wire.py b/azurelinuxagent/common/protocol/wire.py index 20d9455543..3b17540fbc 100644 --- a/azurelinuxagent/common/protocol/wire.py +++ b/azurelinuxagent/common/protocol/wire.py @@ -190,8 +190,8 @@ def report_ext_status(self, ext_handler_name, ext_name, ext_status): # pylint: d validate_param("ext_status", ext_status, ExtensionStatus) self.client.status_blob.set_ext_status(ext_handler_name, ext_status) - def report_event(self, events): - self.client.report_event(events) + def report_event(self, get_events_in_queue): + self.client.report_event(get_events_in_queue) def upload_logs(self, logs): self.client.upload_logs(logs) @@ -1098,8 +1098,8 @@ def send_encoded_event(self, provider_id, event_str, encoding='utf-8'): raise ProtocolError( "Failed to send events:{0}".format(resp.status)) - # Pylint too-many-locals: Disabling this here as a lot of the locals are used for error debugging - def report_event(self, get_events): # pylint: disable=too-many-locals + # too-many-locals Disabled: Most of the locals are used for error debugging + def report_event(self, get_events_in_queue): # pylint: disable=too-many-locals max_send_errors_to_report = 5 buf = {} events_per_request = defaultdict(int) @@ -1123,7 +1123,7 @@ def _send_event(provider_id): # Group events by providerId - for event in get_events(): + for event in get_events_in_queue(): try: if event.providerId not in buf: buf[event.providerId] = b'' diff --git a/azurelinuxagent/common/telemetryevent.py b/azurelinuxagent/common/telemetryevent.py index 0950084def..4af0aa2dd7 100644 --- 
a/azurelinuxagent/common/telemetryevent.py +++ b/azurelinuxagent/common/telemetryevent.py @@ -82,7 +82,7 @@ def __eq__(self, other): return isinstance(other, TelemetryEventParam) and other.name == self.name and other.value == self.value -# Pylint R0903 (too-few-public-methods) : Disabling here as this class is used as an Enum +# too-few-public-methods Disabled: This class is used as an Enum class TelemetryEventPriorities(object): # pylint: disable=R0903 """ Class defining the priorities for telemetry events. Lower the number, higher the priority diff --git a/azurelinuxagent/ga/extension_telemetry.py b/azurelinuxagent/ga/extension_telemetry.py index dac2513bb9..6802d90ce0 100644 --- a/azurelinuxagent/ga/extension_telemetry.py +++ b/azurelinuxagent/ga/extension_telemetry.py @@ -42,7 +42,7 @@ def get_extension_telemetry_handler(enqueue_events): return ExtensionTelemetryHandler(enqueue_events) -# Pylint R0903 (too-few-public-methods) : Disabling here because this class is an Enum, no public methods needed. +# too-few-public-methods Disabled: This class is used as an Enum class ExtensionEventSchema(object): # pylint: disable=R0903 """ Class for defining the schema for Extension Events. 
@@ -73,13 +73,13 @@ class ProcessExtensionTelemetry(PeriodicOperation): _EXTENSION_EVENT_REQUIRED_FIELDS = [attr.lower() for attr in dir(ExtensionEventSchema) if not callable(getattr(ExtensionEventSchema, attr)) and not attr.startswith("__")] - def __init__(self, enqueue_event): + def __init__(self, enqueue_event_func): super(ProcessExtensionTelemetry, self).__init__( name="collect_and_enqueue_extension_events", operation=self._collect_and_enqueue_extension_events, period=ProcessExtensionTelemetry._EXTENSION_EVENT_COLLECTION_PERIOD) - self._enqueue_event = enqueue_event + self._enqueue_event_func = enqueue_event_func def _collect_and_enqueue_extension_events(self): extension_handler_with_event_dirs = [] @@ -235,7 +235,7 @@ def _get_captured_events_count(self, handler_name, event_file_path, captured_eve for event in events: try: - self._enqueue_event(self._parse_telemetry_event(handler_name, event, event_file_time)) + self._enqueue_event_func(self._parse_telemetry_event(handler_name, event, event_file_time)) captured_events_count += 1 except InvalidExtensionEventError as invalid_error: # These are the errors thrown if there's an error parsing the event. 
We want to report these back to the @@ -355,10 +355,10 @@ class ExtensionTelemetryHandler(ThreadHandlerInterface): _THREAD_NAME = "ExtensionTelemetryHandler" - def __init__(self, enqueue_events): + def __init__(self, enqueue_event_func): self.should_run = True self.thread = None - self._enqueue_event = enqueue_events + self._enqueue_event_func = enqueue_event_func @staticmethod def get_thread_name(): @@ -389,7 +389,7 @@ def stopped(self): return not self.should_run def daemon(self): - periodic_operation = ProcessExtensionTelemetry(self._enqueue_event) + periodic_operation = ProcessExtensionTelemetry(self._enqueue_event_func) logger.info("Successfully started the {0} thread".format(self.get_thread_name())) while not self.stopped(): try: diff --git a/azurelinuxagent/ga/monitor.py b/azurelinuxagent/ga/monitor.py index 6096c80003..4e0f80c492 100644 --- a/azurelinuxagent/ga/monitor.py +++ b/azurelinuxagent/ga/monitor.py @@ -37,8 +37,8 @@ from azurelinuxagent.ga.periodic_operation import PeriodicOperation -def get_monitor_handler(enqueue_event): - return MonitorHandler(enqueue_event) +def get_monitor_handler(enqueue_event_func): + return MonitorHandler(enqueue_event_func) class PollResourceUsageOperation(PeriodicOperation): @@ -115,19 +115,19 @@ class CollectAndEnqueueEventsPeriodicOperation(PeriodicOperation): _EVENT_COLLECTION_PERIOD = datetime.timedelta(minutes=1) - def __init__(self, enqueue_event): + def __init__(self, enqueue_event_func): super(CollectAndEnqueueEventsPeriodicOperation, self).__init__( name="collect_and_enqueue_events", operation=self._collect_and_enqueue_events, period=CollectAndEnqueueEventsPeriodicOperation._EVENT_COLLECTION_PERIOD) - self.enqueue_events = enqueue_event + self._enqueue_event_func = enqueue_event_func def _collect_and_enqueue_events(self): """ Periodically send any events located in the events folder """ try: - collect_events(self.enqueue_events) + collect_events(self._enqueue_event_func) except Exception as error: err_msg = 
"Failure in collecting Agent events: {0}".format(ustr(error)) add_event(op=WALAEventOperation.UnhandledError, message=err_msg, is_success=False) @@ -197,14 +197,14 @@ class MonitorHandler(ThreadHandlerInterface): # pylint: disable=R0902 def get_thread_name(): return MonitorHandler._THREAD_NAME - def __init__(self, enqueue_event): + def __init__(self, enqueue_event_func): self.osutil = get_osutil() self.imds_client = None self.event_thread = None self._periodic_operations = [ ResetPeriodicLogMessagesOperation(), - CollectAndEnqueueEventsPeriodicOperation(enqueue_event), + CollectAndEnqueueEventsPeriodicOperation(enqueue_event_func), ReportNetworkErrorsOperation(), PollResourceUsageOperation(), PeriodicOperation("send_host_plugin_heartbeat", self.send_host_plugin_heartbeat, self.HOST_PLUGIN_HEARTBEAT_PERIOD), diff --git a/azurelinuxagent/ga/telemetry_service.py b/azurelinuxagent/ga/telemetry_service.py index 2f476ada9b..7ddc481591 100644 --- a/azurelinuxagent/ga/telemetry_service.py +++ b/azurelinuxagent/ga/telemetry_service.py @@ -79,13 +79,11 @@ def join(self): def stopped(self): return not self.should_run - def enqueue_event(self, event): + def enqueue_event_func(self, event): # Add event to queue and set event self._queue.put(event) - # self._queue.put((priority, self._queue_counter.value, event)) - logger.verbose("Added event for {0}, Priority: {1}, Event: {2}", self.get_thread_name(), event.priority, event) - # Always set the event if any enqueue happens (even if already set) + # Set the event if any enqueue happens (even if already set) to trigger sending those events self._should_process_events.set() def _process_telemetry_thread(self): @@ -106,23 +104,19 @@ def _get_events_in_queue(self): while not self._queue.empty(): try: event = self._queue.get() - logger.verbose("Fetched event Priority: {0}, Event: {1}".format(event.priority if event is not None else 100, event)) yield event - # Mark task_done once data processed. 
Do not mark task_done if error fetching from queue, else that will raise errors - logger.verbose("Marking event as done now: {0}".format(event)) except Exception as error: - logger.error("Some exception when fetching event from queue: {0}".format(ustr(error))) + logger.error("Some exception when fetching event from queue: {0}, {1}".format(ustr(error), + traceback.format_exc())) finally: self._queue.task_done() def _send_events_in_queue(self): # Process everything in Queue - logger.verbose("Processing data in the telemetry service queue, approx qsize: {0}", self._queue.qsize()) if not self._queue.empty(): self._protocol.report_event(self._get_events_in_queue) - # Clear event when done - # There might be a rare race condition where the loop exits and we get a new event, in that case not unsetting the event. + # Reset the event when done processing all events in queue if self._should_process_events.is_set() and self._queue.empty(): logger.verbose("Resetting the event") self._should_process_events.clear() \ No newline at end of file diff --git a/azurelinuxagent/ga/update.py b/azurelinuxagent/ga/update.py index 138bbb14e0..d45230543e 100644 --- a/azurelinuxagent/ga/update.py +++ b/azurelinuxagent/ga/update.py @@ -271,7 +271,7 @@ def run(self, debug=False): # pylint: disable=R0912,R0914 # Get all thread handlers telemetry_handler = get_telemetry_service_handler(self.protocol_util) all_thread_handlers = [ - get_monitor_handler(telemetry_handler.enqueue_event), + get_monitor_handler(telemetry_handler.enqueue_event_func), get_env_handler(), telemetry_handler ] @@ -280,7 +280,7 @@ def run(self, debug=False): # pylint: disable=R0912,R0914 all_thread_handlers.append(get_collect_logs_handler()) if is_extension_telemetry_pipeline_enabled(): - all_thread_handlers.append(get_extension_telemetry_handler(telemetry_handler.enqueue_event)) + all_thread_handlers.append(get_extension_telemetry_handler(telemetry_handler.enqueue_event_func)) # Launch all monitoring threads for 
thread_handler in all_thread_handlers: diff --git a/tests/common/test_event.py b/tests/common/test_event.py index 0dde04c020..76646d1a89 100644 --- a/tests/common/test_event.py +++ b/tests/common/test_event.py @@ -123,7 +123,7 @@ def test_add_event_should_use_the_container_id_from_the_most_recent_goal_state(s def create_event_and_return_container_id(): # pylint: disable=inconsistent-return-statements event.add_event(name='Event') event_list = self._collect_events() - self.assertEquals(len(event_list), 1, "Could not find the event created by add_event") # pylint: disable=deprecated-method + self.assertEqual(len(event_list), 1, "Could not find the event created by add_event") for p in event_list[0].parameters: # pylint: disable=invalid-name if p.name == CommonTelemetryEventSchema.ContainerId: diff --git a/tests/ga/test_extension_telemetry.py b/tests/ga/test_extension_telemetry.py index 6fcd3512ea..e5062a07ce 100644 --- a/tests/ga/test_extension_telemetry.py +++ b/tests/ga/test_extension_telemetry.py @@ -169,10 +169,7 @@ def _get_param_value_from_event_body_if_exists(event_list, param_name): def _create_extension_telemetry_processor(self): event_list = [] - def _enqueue_events(telemetry_event): - event_list.append(telemetry_event) - - extension_telemetry_processor = ProcessExtensionTelemetry(_enqueue_events) + extension_telemetry_processor = ProcessExtensionTelemetry(event_list.append) extension_telemetry_processor.event_list = event_list yield extension_telemetry_processor diff --git a/tests/ga/test_monitor.py b/tests/ga/test_monitor.py index 2ca574bdb3..99e80c95b8 100644 --- a/tests/ga/test_monitor.py +++ b/tests/ga/test_monitor.py @@ -21,7 +21,6 @@ import random import re import string -from datetime import timedelta # pylint: disable=ungrouped-imports from azurelinuxagent.common import event, logger from azurelinuxagent.common.cgroup import CGroup, CpuCgroup, MemoryCgroup, MetricValue @@ -56,15 +55,12 @@ def _create_monitor_handler(enabled_operations=[], 
iterations=1): # pylint: disa * run_and_wait() - invokes run() and wait() on the MonitorHandler """ - event_list = [] - def _enqueue_events(telemetry_event): - event_list.append(telemetry_event) - def run(self): if len(enabled_operations) == 0 or self._name in enabled_operations: # pylint: disable=protected-access,len-as-condition run.original_definition(self) run.original_definition = PeriodicOperation.run + event_list = [] with mock_wire_protocol(DATA_FILE) as protocol: protocol_util = MagicMock() protocol_util.get_protocol = Mock(return_value=protocol) @@ -76,7 +72,7 @@ def run_and_wait(): monitor_handler.run() monitor_handler.join() - monitor_handler = get_monitor_handler(_enqueue_events) + monitor_handler = get_monitor_handler(event_list.append) monitor_handler.get_mock_wire_protocol = lambda: protocol monitor_handler.run_and_wait = run_and_wait monitor_handler.event_list = event_list @@ -354,7 +350,7 @@ def test_error_heartbeat_creates_no_signal(self, patch_report_heartbeat, patch_h protocol.update_goal_state = MagicMock() with patch('azurelinuxagent.common.protocol.util.ProtocolUtil.get_protocol', return_value=protocol): monitor_handler.init_protocols() - monitor_handler.last_host_plugin_heartbeat = datetime.datetime.utcnow() - timedelta(hours=1) + monitor_handler.last_host_plugin_heartbeat = datetime.datetime.utcnow() - datetime.timedelta(hours=1) patch_http_get.side_effect = IOError('client error') monitor_handler.send_host_plugin_heartbeat() diff --git a/tests/ga/test_telemetry_service.py b/tests/ga/test_telemetry_service.py index c0559dee68..3438654d7d 100644 --- a/tests/ga/test_telemetry_service.py +++ b/tests/ga/test_telemetry_service.py @@ -124,7 +124,7 @@ def _setup_and_assert_bad_request_scenarios(self, http_post_handler, expected_ms telemetry_handler.get_mock_wire_protocol().set_http_handlers(http_post_handler=http_post_handler) with patch("azurelinuxagent.common.event.add_event") as mock_add_event: - 
telemetry_handler.enqueue_event(TelemetryEvent()) + telemetry_handler.enqueue_event_func(TelemetryEvent()) TestTelemetryServiceHandler._stop_handler(telemetry_handler) for msg in expected_msgs: self._assert_error_event_reported(mock_add_event, msg) @@ -134,7 +134,7 @@ def test_it_should_send_events_properly(self): with self._create_telemetry_service_handler() as telemetry_handler: for test_event in events: - telemetry_handler.enqueue_event(test_event) + telemetry_handler.enqueue_event_func(test_event) self._assert_test_data_in_event_body(telemetry_handler, events) @@ -144,7 +144,7 @@ def test_it_should_send_as_soon_as_events_available_in_queue(self): with self._create_telemetry_service_handler() as telemetry_handler: test_start_time = datetime.now() for test_event in events: - telemetry_handler.enqueue_event(test_event) + telemetry_handler.enqueue_event_func(test_event) self._assert_test_data_in_event_body(telemetry_handler, events) @@ -166,7 +166,7 @@ def test_thread_should_wait_for_events_to_get_in_queue_before_processing(self): # Now enqueue data and verify telemetry_service sends them asap for test_event in events: - telemetry_handler.enqueue_event(test_event) + telemetry_handler.enqueue_event_func(test_event) self._assert_test_data_in_event_body(telemetry_handler, events) @@ -188,7 +188,7 @@ def test_it_should_honour_the_priority_order_of_events(self): for test_event in events: test_event.parameters.append(TelemetryEventParam("Priority", test_event.priority)) expected_priority_order.append(str(test_event.priority)) - telemetry_handler.enqueue_event(test_event) + telemetry_handler.enqueue_event_func(test_event) telemetry_handler.start() self.assertTrue(telemetry_handler.is_alive(), "Thread not alive") @@ -201,7 +201,7 @@ def test_it_should_honour_the_priority_order_of_events(self): self.assertEqual(sorted(expected_priority_order), priorities, "Priorities dont match") - def test_telemetry_service_with_call_wireserver_returns_http_error_and_reports_event(self): + 
def test_telemetry_service_should_report_event_if_wireserver_returns_http_error(self): test_str = "A test exception, Guid: {0}".format(str(uuid.uuid4())) @@ -228,44 +228,35 @@ def test_telemetry_service_should_add_event_on_unexpected_errors(self): with self._create_telemetry_service_handler(timeout=0.1) as telemetry_handler: - # This test validates that if we hit an issue while sending an event, we never send it again. with patch("azurelinuxagent.ga.telemetry_service.add_event") as mock_add_event: with patch("azurelinuxagent.common.protocol.wire.WireClient.report_event") as patch_report_event: test_str = "Test exception, Guid: {0}".format(str(uuid.uuid4())) patch_report_event.side_effect = Exception(test_str) - telemetry_handler.enqueue_event(TelemetryEvent()) + telemetry_handler.enqueue_event_func(TelemetryEvent()) TestTelemetryServiceHandler._stop_handler(telemetry_handler, timeout=0.01) self._assert_error_event_reported(mock_add_event, test_str, operation=WALAEventOperation.UnhandledError) - def _create_extension_event(self, # pylint: disable=invalid-name,too-many-arguments + def _create_extension_event(self, size=0, name="DummyExtension", - op=WALAEventOperation.Unknown, - is_success=True, - duration=0, - version=CURRENT_VERSION, message="DummyMessage"): event_data = self._get_event_data(name=size if size != 0 else name, - op=op, - is_success=is_success, - duration=duration, - version=version, message=random_generator(size) if size != 0 else message) event_file = os.path.join(self.event_dir, "{0}.tld".format(int(time.time() * 1000000))) - with open(event_file, 'wb+') as fd: # pylint: disable=invalid-name - fd.write(event_data.encode('utf-8')) + with open(event_file, 'wb+') as file_descriptor: + file_descriptor.write(event_data.encode('utf-8')) @staticmethod - def _get_event_data(duration, is_success, message, name, op, version, eventId=1): # pylint: disable=invalid-name,too-many-arguments - event = TelemetryEvent(eventId, 
TestTelemetryServiceHandler._TEST_EVENT_PROVIDER_ID) # pylint: disable=redefined-outer-name + def _get_event_data(message, name): + event = TelemetryEvent(1, TestTelemetryServiceHandler._TEST_EVENT_PROVIDER_ID) event.parameters.append(TelemetryEventParam(GuestAgentExtensionEventsSchema.Name, name)) - event.parameters.append(TelemetryEventParam(GuestAgentExtensionEventsSchema.Version, str(version))) - event.parameters.append(TelemetryEventParam(GuestAgentExtensionEventsSchema.Operation, op)) - event.parameters.append(TelemetryEventParam(GuestAgentExtensionEventsSchema.OperationSuccess, is_success)) + event.parameters.append(TelemetryEventParam(GuestAgentExtensionEventsSchema.Version, str(CURRENT_VERSION))) + event.parameters.append(TelemetryEventParam(GuestAgentExtensionEventsSchema.Operation, WALAEventOperation.Unknown)) + event.parameters.append(TelemetryEventParam(GuestAgentExtensionEventsSchema.OperationSuccess, True)) event.parameters.append(TelemetryEventParam(GuestAgentExtensionEventsSchema.Message, message)) - event.parameters.append(TelemetryEventParam(GuestAgentExtensionEventsSchema.Duration, duration)) + event.parameters.append(TelemetryEventParam(GuestAgentExtensionEventsSchema.Duration, 0)) data = get_properties(event) return json.dumps(data) @@ -276,7 +267,7 @@ def test_it_should_enqueue_and_send_events_properly(self, mock_lib_dir, *_): mock_lib_dir.return_value = self.lib_dir with self._create_telemetry_service_handler() as telemetry_handler: - monitor_handler = CollectAndEnqueueEventsPeriodicOperation(telemetry_handler.enqueue_event) + monitor_handler = CollectAndEnqueueEventsPeriodicOperation(telemetry_handler.enqueue_event_func) self._create_extension_event(message="Message-Test") test_mtime = 1000 # epoch time, in ms @@ -347,7 +338,7 @@ def test_collect_and_send_events_with_small_events(self, mock_lib_dir): size = 2 ** power self._create_extension_event(size) - CollectAndEnqueueEventsPeriodicOperation(telemetry_handler.enqueue_event).run() + 
CollectAndEnqueueEventsPeriodicOperation(telemetry_handler.enqueue_event_func).run() # The send_event call would be called each time, as we are filling up the buffer up to the brim for each call. TestTelemetryServiceHandler._stop_handler(telemetry_handler) @@ -365,7 +356,7 @@ def test_collect_and_send_events_with_large_events(self, mock_lib_dir): self._create_extension_event(size) with patch("azurelinuxagent.common.logger.periodic_warn") as patch_periodic_warn: - CollectAndEnqueueEventsPeriodicOperation(telemetry_handler.enqueue_event).run() + CollectAndEnqueueEventsPeriodicOperation(telemetry_handler.enqueue_event_func).run() TestTelemetryServiceHandler._stop_handler(telemetry_handler) self.assertEqual(3, patch_periodic_warn.call_count) From a686f257a8234cca10e81c9bf619290cc2036082 Mon Sep 17 00:00:00 2001 From: larohra Date: Wed, 30 Sep 2020 17:57:28 -0700 Subject: [PATCH 31/63] Minor PR comments --- azurelinuxagent/ga/telemetry_service.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/azurelinuxagent/ga/telemetry_service.py b/azurelinuxagent/ga/telemetry_service.py index 7ddc481591..42f00d908c 100644 --- a/azurelinuxagent/ga/telemetry_service.py +++ b/azurelinuxagent/ga/telemetry_service.py @@ -69,6 +69,8 @@ def stop(self): Stop server communication and join the thread to main thread. """ self.should_run = False + # Set the event to unblock the thread to ensure that the thread is not blocking shutdown. 
+ self._should_process_events.set() if self.is_alive(): self.join() From f48fab5d6a8f5020e0d7a15917c536177a4ed548 Mon Sep 17 00:00:00 2001 From: larohra Date: Thu, 1 Oct 2020 16:35:28 -0700 Subject: [PATCH 32/63] Added a linter comment --- azurelinuxagent/common/event.py | 1 + 1 file changed, 1 insertion(+) diff --git a/azurelinuxagent/common/event.py b/azurelinuxagent/common/event.py index 3781ba6833..89f1eceb84 100644 --- a/azurelinuxagent/common/event.py +++ b/azurelinuxagent/common/event.py @@ -587,6 +587,7 @@ def report_dropped_events_error(count, errors, operation_name, max_errors_to_rep message=err_msg_format.format(count, max_errors_to_report, ', '.join(errors)), is_success=False) + # too-many-locals Disabled: Most local variables are being used for debugging which is acceptable. def collect_events(self, enqueue_event_func): # pylint: disable=R0914 """ Retuns a list of events that need to be sent to the telemetry pipeline and deletes the corresponding files From 357403a4b0ea15fd9debfbda987e3badbca78cde Mon Sep 17 00:00:00 2001 From: larohra Date: Tue, 6 Oct 2020 17:07:45 -0700 Subject: [PATCH 33/63] Combined both telemetry event collections to a single thread --- ...lemetry.py => collect_telemetry_events.py} | 37 +++++++++++++++++-- azurelinuxagent/ga/monitor.py | 28 +------------- azurelinuxagent/ga/update.py | 2 +- ...ry.py => test_collect_telemetry_events.py} | 18 ++++----- 4 files changed, 44 insertions(+), 41 deletions(-) rename azurelinuxagent/ga/{extension_telemetry.py => collect_telemetry_events.py} (92%) rename tests/ga/{test_extension_telemetry.py => test_collect_telemetry_events.py} (96%) diff --git a/azurelinuxagent/ga/extension_telemetry.py b/azurelinuxagent/ga/collect_telemetry_events.py similarity index 92% rename from azurelinuxagent/ga/extension_telemetry.py rename to azurelinuxagent/ga/collect_telemetry_events.py index 6802d90ce0..9c22f13467 100644 --- a/azurelinuxagent/ga/extension_telemetry.py +++ 
b/azurelinuxagent/ga/collect_telemetry_events.py @@ -28,7 +28,7 @@ import azurelinuxagent.common.logger as logger from azurelinuxagent.common import conf from azurelinuxagent.common.event import EVENTS_DIRECTORY, TELEMETRY_LOG_EVENT_ID, \ - TELEMETRY_LOG_PROVIDER_ID, add_event, WALAEventOperation, add_log_event, get_event_logger + TELEMETRY_LOG_PROVIDER_ID, add_event, WALAEventOperation, add_log_event, get_event_logger, collect_events from azurelinuxagent.common.exception import InvalidExtensionEventError from azurelinuxagent.common.future import ustr from azurelinuxagent.common.telemetryevent import TelemetryEvent, TelemetryEventParam, \ @@ -347,6 +347,31 @@ def add_common_params_to_extension_event(event, event_time): reporter.add_common_event_parameters(event, event_time) +class CollectAndEnqueueEventsPeriodicOperation(PeriodicOperation): + """ + Periodic operation to collect and send telemetry events located in the events folder + """ + + _EVENT_COLLECTION_PERIOD = datetime.timedelta(minutes=1) + + def __init__(self, enqueue_event_func): + super(CollectAndEnqueueEventsPeriodicOperation, self).__init__( + name="collect_and_enqueue_events", + operation=self._collect_and_enqueue_events, + period=CollectAndEnqueueEventsPeriodicOperation._EVENT_COLLECTION_PERIOD) + self._enqueue_event_func = enqueue_event_func + + def _collect_and_enqueue_events(self): + """ + Periodically send any events located in the events folder + """ + try: + collect_events(self._enqueue_event_func) + except Exception as error: + err_msg = "Failure in collecting Agent events: {0}".format(ustr(error)) + add_event(op=WALAEventOperation.UnhandledError, message=err_msg, is_success=False) + + class ExtensionTelemetryHandler(ThreadHandlerInterface): """ This Handler takes care of fetching the Extension Telemetry events from the {extension_events_dir} and sends it to @@ -389,15 +414,19 @@ def stopped(self): return not self.should_run def daemon(self): - periodic_operation = 
ProcessExtensionTelemetry(self._enqueue_event_func) + periodic_operations = [ + CollectAndEnqueueEventsPeriodicOperation(self._enqueue_event_func), + ProcessExtensionTelemetry(self._enqueue_event_func) + ] logger.info("Successfully started the {0} thread".format(self.get_thread_name())) while not self.stopped(): try: - periodic_operation.run() + for periodic_op in periodic_operations: + periodic_op.run() except Exception as error: logger.warn( "An error occurred in the Telemetry Extension thread main loop; will skip the current iteration.\n{0}", ustr(error)) finally: - PeriodicOperation.sleep_until_next_operation([periodic_operation]) \ No newline at end of file + PeriodicOperation.sleep_until_next_operation(periodic_operations) \ No newline at end of file diff --git a/azurelinuxagent/ga/monitor.py b/azurelinuxagent/ga/monitor.py index d588255030..ddabcbcd7d 100644 --- a/azurelinuxagent/ga/monitor.py +++ b/azurelinuxagent/ga/monitor.py @@ -24,7 +24,7 @@ from azurelinuxagent.common.cgroupconfigurator import CGroupConfigurator from azurelinuxagent.common.cgroupstelemetry import CGroupsTelemetry from azurelinuxagent.common.errorstate import ErrorState -from azurelinuxagent.common.event import add_event, WALAEventOperation, report_metric, collect_events +from azurelinuxagent.common.event import add_event, WALAEventOperation, report_metric from azurelinuxagent.common.future import ustr from azurelinuxagent.common.interfaces import ThreadHandlerInterface from azurelinuxagent.common.osutil import get_osutil @@ -108,31 +108,6 @@ def _operation_impl(): logger.reset_periodic() -class CollectAndEnqueueEventsPeriodicOperation(PeriodicOperation): - """ - Periodic operation to collect and send telemetry events located in the events folder - """ - - _EVENT_COLLECTION_PERIOD = datetime.timedelta(minutes=1) - - def __init__(self, enqueue_event_func): - super(CollectAndEnqueueEventsPeriodicOperation, self).__init__( - name="collect_and_enqueue_events", - 
operation=self._collect_and_enqueue_events, - period=CollectAndEnqueueEventsPeriodicOperation._EVENT_COLLECTION_PERIOD) - self._enqueue_event_func = enqueue_event_func - - def _collect_and_enqueue_events(self): - """ - Periodically send any events located in the events folder - """ - try: - collect_events(self._enqueue_event_func) - except Exception as error: - err_msg = "Failure in collecting Agent events: {0}".format(ustr(error)) - add_event(op=WALAEventOperation.UnhandledError, message=err_msg, is_success=False) - - class ReportNetworkErrorsOperation(PeriodicOperation): def __init__(self): super(ReportNetworkErrorsOperation, self).__init__( @@ -204,7 +179,6 @@ def __init__(self, enqueue_event_func): self.event_thread = None self._periodic_operations = [ ResetPeriodicLogMessagesOperation(), - CollectAndEnqueueEventsPeriodicOperation(enqueue_event_func), ReportNetworkErrorsOperation(), PeriodicOperation("send_host_plugin_heartbeat", self.send_host_plugin_heartbeat, self.HOST_PLUGIN_HEARTBEAT_PERIOD), PeriodicOperation("send_imds_heartbeat", self.send_imds_heartbeat, self.IMDS_HEARTBEAT_PERIOD), diff --git a/azurelinuxagent/ga/update.py b/azurelinuxagent/ga/update.py index 16ea7643eb..a864852745 100644 --- a/azurelinuxagent/ga/update.py +++ b/azurelinuxagent/ga/update.py @@ -53,7 +53,7 @@ PY_VERSION_MINOR, PY_VERSION_MICRO from azurelinuxagent.ga.collect_logs import get_collect_logs_handler, is_log_collection_allowed from azurelinuxagent.ga.env import get_env_handler -from azurelinuxagent.ga.extension_telemetry import get_extension_telemetry_handler +from azurelinuxagent.ga.collect_telemetry_events import get_extension_telemetry_handler from azurelinuxagent.ga.exthandlers import HandlerManifest, get_traceback, ExtHandlersHandler, \ is_extension_telemetry_pipeline_enabled, list_agent_lib_directory diff --git a/tests/ga/test_extension_telemetry.py b/tests/ga/test_collect_telemetry_events.py similarity index 96% rename from tests/ga/test_extension_telemetry.py rename 
to tests/ga/test_collect_telemetry_events.py index e5062a07ce..73d8282f14 100644 --- a/tests/ga/test_extension_telemetry.py +++ b/tests/ga/test_collect_telemetry_events.py @@ -36,7 +36,7 @@ from azurelinuxagent.common.telemetryevent import GuestAgentGenericLogsSchema, \ CommonTelemetryEventSchema from azurelinuxagent.common.utils import fileutil, textutil -from azurelinuxagent.ga.extension_telemetry import ExtensionEventSchema, ProcessExtensionTelemetry +from azurelinuxagent.ga.collect_telemetry_events import ExtensionEventSchema, ProcessExtensionTelemetry from tests.protocol.mocks import HttpRequestPredicates from tests.tools import AgentTestCase, clear_singleton_instances, data_dir @@ -251,7 +251,7 @@ def test_it_should_disregard_bad_events_and_keep_good_ones_in_a_mixed_file(self) def test_it_should_limit_max_no_of_events_to_send_per_run_per_extension_and_report_event(self): max_events = 5 - with patch("azurelinuxagent.ga.extension_telemetry.add_log_event") as mock_event: + with patch("azurelinuxagent.ga.collect_telemetry_events.add_log_event") as mock_event: with self._create_extension_telemetry_processor() as extension_telemetry_processor: with patch.object(extension_telemetry_processor, "_MAX_NUMBER_OF_EVENTS_PER_EXTENSION_PER_PERIOD", max_events): ext_names_with_count = self._create_random_extension_events_dir_with_events(5, self._WELL_FORMED_FILES) @@ -355,7 +355,7 @@ def _assert_event_reported(self, mock_event, handler_name_with_count, pattern): def test_it_should_trim_message_if_more_than_limit(self): max_len = 100 no_of_extensions = 2 - with patch("azurelinuxagent.ga.extension_telemetry.ProcessExtensionTelemetry._EXTENSION_EVENT_MAX_MSG_LEN", max_len): + with patch("azurelinuxagent.ga.collect_telemetry_events.ProcessExtensionTelemetry._EXTENSION_EVENT_MAX_MSG_LEN", max_len): handler_name_with_count, event_list = self._setup_and_assert_tests_for_max_sizes() # pylint: disable=unused-variable context1_vals = 
self._get_param_value_from_event_body_if_exists(event_list, GuestAgentGenericLogsSchema.Context1) @@ -368,8 +368,8 @@ def test_it_should_trim_message_if_more_than_limit(self): def test_it_should_skip_events_larger_than_max_size_and_report_event(self): max_size = 1000 no_of_extensions = 3 - with patch("azurelinuxagent.ga.extension_telemetry.add_log_event") as mock_event: - with patch("azurelinuxagent.ga.extension_telemetry.ProcessExtensionTelemetry._EXTENSION_EVENT_MAX_SIZE", + with patch("azurelinuxagent.ga.collect_telemetry_events.add_log_event") as mock_event: + with patch("azurelinuxagent.ga.collect_telemetry_events.ProcessExtensionTelemetry._EXTENSION_EVENT_MAX_SIZE", max_size): handler_name_with_count, _ = self._setup_and_assert_tests_for_max_sizes(no_of_extensions, expected_count=0) self._assert_invalid_extension_error_event_reported(mock_event, handler_name_with_count, @@ -378,8 +378,8 @@ def test_it_should_skip_events_larger_than_max_size_and_report_event(self): def test_it_should_skip_large_files_greater_than_max_file_size_and_report_event(self): max_file_size = 10000 no_of_extensions = 5 - with patch("azurelinuxagent.ga.extension_telemetry.add_log_event") as mock_event: - with patch("azurelinuxagent.ga.extension_telemetry.ProcessExtensionTelemetry._EXTENSION_EVENT_FILE_MAX_SIZE", + with patch("azurelinuxagent.ga.collect_telemetry_events.add_log_event") as mock_event: + with patch("azurelinuxagent.ga.collect_telemetry_events.ProcessExtensionTelemetry._EXTENSION_EVENT_FILE_MAX_SIZE", max_file_size): handler_name_with_count, _ = self._setup_and_assert_tests_for_max_sizes(no_of_extensions, expected_count=0) @@ -473,7 +473,7 @@ def test_it_should_skip_unwanted_parameters_in_event_file(self): "Unwanted param {0} found".format(param)) def test_it_should_not_send_events_which_dont_have_all_required_keys_and_report_event(self): - with patch("azurelinuxagent.ga.extension_telemetry.add_log_event") as mock_event: + with 
patch("azurelinuxagent.ga.collect_telemetry_events.add_log_event") as mock_event: with self._create_extension_telemetry_processor() as extension_telemetry_processor: extensions_with_count = self._create_random_extension_events_dir_with_events(3, os.path.join( self._TEST_DATA_DIR, "missing_parameters")) @@ -516,7 +516,7 @@ def test_it_should_not_send_events_which_dont_have_all_required_keys_and_report_ self.assertEqual(len(extensions_with_count), 0, "All extension events not matched") def test_it_should_not_send_event_where_message_is_empty_and_report_event(self): - with patch("azurelinuxagent.ga.extension_telemetry.add_log_event") as mock_event: + with patch("azurelinuxagent.ga.collect_telemetry_events.add_log_event") as mock_event: with self._create_extension_telemetry_processor() as extension_telemetry_processor: extensions_with_count = self._create_random_extension_events_dir_with_events(3, os.path.join( self._TEST_DATA_DIR, "empty_message")) From ea760abe041dbe17429ec581c072716cbf4a8b04 Mon Sep 17 00:00:00 2001 From: larohra Date: Wed, 7 Oct 2020 12:11:39 -0700 Subject: [PATCH 34/63] Renamed enqueue_event_func to enqueue_event --- azurelinuxagent/common/event.py | 8 ++++---- azurelinuxagent/common/protocol/wire.py | 1 - .../ga/collect_telemetry_events.py | 20 +++++++++---------- azurelinuxagent/ga/monitor.py | 6 +++--- azurelinuxagent/ga/telemetry_service.py | 2 +- azurelinuxagent/ga/update.py | 4 ++-- tests/ga/test_telemetry_service.py | 18 ++++++++--------- 7 files changed, 29 insertions(+), 30 deletions(-) diff --git a/azurelinuxagent/common/event.py b/azurelinuxagent/common/event.py index 89f1eceb84..9099dd7e8a 100644 --- a/azurelinuxagent/common/event.py +++ b/azurelinuxagent/common/event.py @@ -588,7 +588,7 @@ def report_dropped_events_error(count, errors, operation_name, max_errors_to_rep is_success=False) # too-many-locals Disabled: Most local variables are being used for debugging which is acceptable. 
- def collect_events(self, enqueue_event_func): # pylint: disable=R0914 + def collect_events(self, enqueue_event): # pylint: disable=R0914 """ Retuns a list of events that need to be sent to the telemetry pipeline and deletes the corresponding files from the events directory. @@ -630,7 +630,7 @@ def collect_events(self, enqueue_event_func): # pylint: disable=R0914 else: self._update_legacy_agent_event(event, event_file_creation_time) - enqueue_event_func(event) + enqueue_event(event) finally: os.remove(event_file_path) except UnicodeError as uni_err: @@ -772,8 +772,8 @@ def add_periodic(delta, name, op=WALAEventOperation.Unknown, is_success=True, du message=message, log_event=log_event, force=force) -def collect_events(enqueue_event_func, reporter=__event_logger__): - return reporter.collect_events(enqueue_event_func) +def collect_events(enqueue_event, reporter=__event_logger__): + return reporter.collect_events(enqueue_event) def mark_event_status(name, version, op, status): # pylint: disable=C0103 diff --git a/azurelinuxagent/common/protocol/wire.py b/azurelinuxagent/common/protocol/wire.py index 6412e05e83..384b25732e 100644 --- a/azurelinuxagent/common/protocol/wire.py +++ b/azurelinuxagent/common/protocol/wire.py @@ -1102,7 +1102,6 @@ def _send_event(provider_id): return uni_err_count, err_count - # Group events by providerId for event in get_events_in_queue(): try: diff --git a/azurelinuxagent/ga/collect_telemetry_events.py b/azurelinuxagent/ga/collect_telemetry_events.py index 9c22f13467..41a7c01f4b 100644 --- a/azurelinuxagent/ga/collect_telemetry_events.py +++ b/azurelinuxagent/ga/collect_telemetry_events.py @@ -73,13 +73,13 @@ class ProcessExtensionTelemetry(PeriodicOperation): _EXTENSION_EVENT_REQUIRED_FIELDS = [attr.lower() for attr in dir(ExtensionEventSchema) if not callable(getattr(ExtensionEventSchema, attr)) and not attr.startswith("__")] - def __init__(self, enqueue_event_func): + def __init__(self, enqueue_event): super(ProcessExtensionTelemetry, 
self).__init__( name="collect_and_enqueue_extension_events", operation=self._collect_and_enqueue_extension_events, period=ProcessExtensionTelemetry._EXTENSION_EVENT_COLLECTION_PERIOD) - self._enqueue_event_func = enqueue_event_func + self._enqueue_event = enqueue_event def _collect_and_enqueue_extension_events(self): extension_handler_with_event_dirs = [] @@ -235,7 +235,7 @@ def _get_captured_events_count(self, handler_name, event_file_path, captured_eve for event in events: try: - self._enqueue_event_func(self._parse_telemetry_event(handler_name, event, event_file_time)) + self._enqueue_event(self._parse_telemetry_event(handler_name, event, event_file_time)) captured_events_count += 1 except InvalidExtensionEventError as invalid_error: # These are the errors thrown if there's an error parsing the event. We want to report these back to the @@ -354,19 +354,19 @@ class CollectAndEnqueueEventsPeriodicOperation(PeriodicOperation): _EVENT_COLLECTION_PERIOD = datetime.timedelta(minutes=1) - def __init__(self, enqueue_event_func): + def __init__(self, enqueue_event): super(CollectAndEnqueueEventsPeriodicOperation, self).__init__( name="collect_and_enqueue_events", operation=self._collect_and_enqueue_events, period=CollectAndEnqueueEventsPeriodicOperation._EVENT_COLLECTION_PERIOD) - self._enqueue_event_func = enqueue_event_func + self._enqueue_event = enqueue_event def _collect_and_enqueue_events(self): """ Periodically send any events located in the events folder """ try: - collect_events(self._enqueue_event_func) + collect_events(self._enqueue_event) except Exception as error: err_msg = "Failure in collecting Agent events: {0}".format(ustr(error)) add_event(op=WALAEventOperation.UnhandledError, message=err_msg, is_success=False) @@ -380,10 +380,10 @@ class ExtensionTelemetryHandler(ThreadHandlerInterface): _THREAD_NAME = "ExtensionTelemetryHandler" - def __init__(self, enqueue_event_func): + def __init__(self, enqueue_event): self.should_run = True self.thread = None - 
self._enqueue_event_func = enqueue_event_func + self._enqueue_event = enqueue_event @staticmethod def get_thread_name(): @@ -415,8 +415,8 @@ def stopped(self): def daemon(self): periodic_operations = [ - CollectAndEnqueueEventsPeriodicOperation(self._enqueue_event_func), - ProcessExtensionTelemetry(self._enqueue_event_func) + CollectAndEnqueueEventsPeriodicOperation(self._enqueue_event), + ProcessExtensionTelemetry(self._enqueue_event) ] logger.info("Successfully started the {0} thread".format(self.get_thread_name())) while not self.stopped(): diff --git a/azurelinuxagent/ga/monitor.py b/azurelinuxagent/ga/monitor.py index ddabcbcd7d..87b17c8241 100644 --- a/azurelinuxagent/ga/monitor.py +++ b/azurelinuxagent/ga/monitor.py @@ -37,8 +37,8 @@ from azurelinuxagent.ga.periodic_operation import PeriodicOperation -def get_monitor_handler(enqueue_event_func): - return MonitorHandler(enqueue_event_func) +def get_monitor_handler(): + return MonitorHandler() class PollResourceUsageOperation(PeriodicOperation): @@ -172,7 +172,7 @@ class MonitorHandler(ThreadHandlerInterface): # pylint: disable=R0902 def get_thread_name(): return MonitorHandler._THREAD_NAME - def __init__(self, enqueue_event_func): + def __init__(self): self.osutil = get_osutil() self.imds_client = None diff --git a/azurelinuxagent/ga/telemetry_service.py b/azurelinuxagent/ga/telemetry_service.py index 42f00d908c..9e24cabfa7 100644 --- a/azurelinuxagent/ga/telemetry_service.py +++ b/azurelinuxagent/ga/telemetry_service.py @@ -81,7 +81,7 @@ def join(self): def stopped(self): return not self.should_run - def enqueue_event_func(self, event): + def enqueue_event(self, event): # Add event to queue and set event self._queue.put(event) diff --git a/azurelinuxagent/ga/update.py b/azurelinuxagent/ga/update.py index a864852745..4c3d4bed28 100644 --- a/azurelinuxagent/ga/update.py +++ b/azurelinuxagent/ga/update.py @@ -290,7 +290,7 @@ def run(self, debug=False): # pylint: disable=R0912,R0914 # Get all thread handlers 
telemetry_handler = get_telemetry_service_handler(self.protocol_util) all_thread_handlers = [ - get_monitor_handler(telemetry_handler.enqueue_event_func), + get_monitor_handler(), get_env_handler(), telemetry_handler ] @@ -299,7 +299,7 @@ def run(self, debug=False): # pylint: disable=R0912,R0914 all_thread_handlers.append(get_collect_logs_handler()) if is_extension_telemetry_pipeline_enabled(): - all_thread_handlers.append(get_extension_telemetry_handler(telemetry_handler.enqueue_event_func)) + all_thread_handlers.append(get_extension_telemetry_handler(telemetry_handler.enqueue_event)) # Launch all monitoring threads for thread_handler in all_thread_handlers: diff --git a/tests/ga/test_telemetry_service.py b/tests/ga/test_telemetry_service.py index 3438654d7d..29cc32490f 100644 --- a/tests/ga/test_telemetry_service.py +++ b/tests/ga/test_telemetry_service.py @@ -124,7 +124,7 @@ def _setup_and_assert_bad_request_scenarios(self, http_post_handler, expected_ms telemetry_handler.get_mock_wire_protocol().set_http_handlers(http_post_handler=http_post_handler) with patch("azurelinuxagent.common.event.add_event") as mock_add_event: - telemetry_handler.enqueue_event_func(TelemetryEvent()) + telemetry_handler.enqueue_event(TelemetryEvent()) TestTelemetryServiceHandler._stop_handler(telemetry_handler) for msg in expected_msgs: self._assert_error_event_reported(mock_add_event, msg) @@ -134,7 +134,7 @@ def test_it_should_send_events_properly(self): with self._create_telemetry_service_handler() as telemetry_handler: for test_event in events: - telemetry_handler.enqueue_event_func(test_event) + telemetry_handler.enqueue_event(test_event) self._assert_test_data_in_event_body(telemetry_handler, events) @@ -144,7 +144,7 @@ def test_it_should_send_as_soon_as_events_available_in_queue(self): with self._create_telemetry_service_handler() as telemetry_handler: test_start_time = datetime.now() for test_event in events: - telemetry_handler.enqueue_event_func(test_event) + 
telemetry_handler.enqueue_event(test_event) self._assert_test_data_in_event_body(telemetry_handler, events) @@ -166,7 +166,7 @@ def test_thread_should_wait_for_events_to_get_in_queue_before_processing(self): # Now enqueue data and verify telemetry_service sends them asap for test_event in events: - telemetry_handler.enqueue_event_func(test_event) + telemetry_handler.enqueue_event(test_event) self._assert_test_data_in_event_body(telemetry_handler, events) @@ -188,7 +188,7 @@ def test_it_should_honour_the_priority_order_of_events(self): for test_event in events: test_event.parameters.append(TelemetryEventParam("Priority", test_event.priority)) expected_priority_order.append(str(test_event.priority)) - telemetry_handler.enqueue_event_func(test_event) + telemetry_handler.enqueue_event(test_event) telemetry_handler.start() self.assertTrue(telemetry_handler.is_alive(), "Thread not alive") @@ -233,7 +233,7 @@ def test_telemetry_service_should_add_event_on_unexpected_errors(self): test_str = "Test exception, Guid: {0}".format(str(uuid.uuid4())) patch_report_event.side_effect = Exception(test_str) - telemetry_handler.enqueue_event_func(TelemetryEvent()) + telemetry_handler.enqueue_event(TelemetryEvent()) TestTelemetryServiceHandler._stop_handler(telemetry_handler, timeout=0.01) self._assert_error_event_reported(mock_add_event, test_str, operation=WALAEventOperation.UnhandledError) @@ -267,7 +267,7 @@ def test_it_should_enqueue_and_send_events_properly(self, mock_lib_dir, *_): mock_lib_dir.return_value = self.lib_dir with self._create_telemetry_service_handler() as telemetry_handler: - monitor_handler = CollectAndEnqueueEventsPeriodicOperation(telemetry_handler.enqueue_event_func) + monitor_handler = CollectAndEnqueueEventsPeriodicOperation(telemetry_handler.enqueue_event) self._create_extension_event(message="Message-Test") test_mtime = 1000 # epoch time, in ms @@ -338,7 +338,7 @@ def test_collect_and_send_events_with_small_events(self, mock_lib_dir): size = 2 ** power 
self._create_extension_event(size) - CollectAndEnqueueEventsPeriodicOperation(telemetry_handler.enqueue_event_func).run() + CollectAndEnqueueEventsPeriodicOperation(telemetry_handler.enqueue_event).run() # The send_event call would be called each time, as we are filling up the buffer up to the brim for each call. TestTelemetryServiceHandler._stop_handler(telemetry_handler) @@ -356,7 +356,7 @@ def test_collect_and_send_events_with_large_events(self, mock_lib_dir): self._create_extension_event(size) with patch("azurelinuxagent.common.logger.periodic_warn") as patch_periodic_warn: - CollectAndEnqueueEventsPeriodicOperation(telemetry_handler.enqueue_event_func).run() + CollectAndEnqueueEventsPeriodicOperation(telemetry_handler.enqueue_event).run() TestTelemetryServiceHandler._stop_handler(telemetry_handler) self.assertEqual(3, patch_periodic_warn.call_count) From 59174c1d3b28d2d307ef3b0f1012fac093b87067 Mon Sep 17 00:00:00 2001 From: larohra Date: Wed, 7 Oct 2020 14:42:39 -0700 Subject: [PATCH 35/63] Added batching support --- azurelinuxagent/ga/telemetry_service.py | 12 +++++++++++- 1 file changed, 11 insertions(+), 1 deletion(-) diff --git a/azurelinuxagent/ga/telemetry_service.py b/azurelinuxagent/ga/telemetry_service.py index 9e24cabfa7..2a35eaa1b1 100644 --- a/azurelinuxagent/ga/telemetry_service.py +++ b/azurelinuxagent/ga/telemetry_service.py @@ -20,6 +20,7 @@ import threading import traceback +import time from azurelinuxagent.common import logger from azurelinuxagent.common.event import add_event, WALAEventOperation @@ -39,6 +40,9 @@ class TelemetryServiceHandler(ThreadHandlerInterface): _THREAD_NAME = "TelemetryServiceHandler" _MAX_TIMEOUT = datetime.timedelta(minutes=5).seconds + _MIN_QUEUE_LIMIT = 30 + _MIN_WAIT_TIME = datetime.timedelta(seconds=5) + def __init__(self, protocol_util): self._protocol = protocol_util.get_protocol() @@ -105,7 +109,7 @@ def _process_telemetry_thread(self): def _get_events_in_queue(self): while not self._queue.empty(): try: - 
event = self._queue.get() + event = self._queue.get_nowait() yield event except Exception as error: logger.error("Some exception when fetching event from queue: {0}, {1}".format(ustr(error), @@ -116,6 +120,12 @@ def _get_events_in_queue(self): def _send_events_in_queue(self): # Process everything in Queue if not self._queue.empty(): + start_time = datetime.datetime.utcnow() + while self._queue.qsize() < self._MIN_QUEUE_LIMIT or \ + (start_time + self._MIN_WAIT_TIME) <= datetime.datetime.utcnow(): + # To promote batching, we either wait for atleast 30 events or 5 secs before sending out the first + # request to wireserver + time.sleep(secs=1) self._protocol.report_event(self._get_events_in_queue) # Reset the event when done processing all events in queue From 484031588ede75827a479198bf0b7ac5587e48dc Mon Sep 17 00:00:00 2001 From: larohra Date: Wed, 7 Oct 2020 14:59:39 -0700 Subject: [PATCH 36/63] Removed priority queue --- azurelinuxagent/common/future.py | 4 +- azurelinuxagent/common/telemetryevent.py | 68 +++++++++---------- .../ga/collect_telemetry_events.py | 5 +- azurelinuxagent/ga/telemetry_service.py | 7 +- tests/ga/test_telemetry_service.py | 64 ++++++++--------- 5 files changed, 73 insertions(+), 75 deletions(-) diff --git a/azurelinuxagent/common/future.py b/azurelinuxagent/common/future.py index c019b695c7..5458d1179e 100644 --- a/azurelinuxagent/common/future.py +++ b/azurelinuxagent/common/future.py @@ -33,12 +33,12 @@ # unused-import, import-error Disabled: Due to backward compatibility between py2 and py3 from builtins import int, range # pylint: disable=unused-import,import-error from collections import OrderedDict # pylint: disable=W0611 - from queue import PriorityQueue # pylint: disable=W0611,import-error + from queue import Queue # pylint: disable=W0611,import-error elif sys.version_info[0] == 2: import httplib as httpclient # pylint: disable=E0401,W0611 from urlparse import urlparse # pylint: disable=E0401 - from Queue import PriorityQueue # 
pylint: disable=W0611,import-error + from Queue import Queue # pylint: disable=W0611,import-error # We want to suppress the following: diff --git a/azurelinuxagent/common/telemetryevent.py b/azurelinuxagent/common/telemetryevent.py index 4af0aa2dd7..13155d2f31 100644 --- a/azurelinuxagent/common/telemetryevent.py +++ b/azurelinuxagent/common/telemetryevent.py @@ -83,51 +83,51 @@ def __eq__(self, other): # too-few-public-methods Disabled: This class is used as an Enum -class TelemetryEventPriorities(object): # pylint: disable=R0903 - """ - Class defining the priorities for telemetry events. Lower the number, higher the priority - - Note: 0 is reserved for a feature like QuickLog in the Windows Agent (i.e. the ability to send out telemetry - instantly rather than waiting for a minute for the monitor thread to pick up the events) - """ - AGENT_EVENT = 1 # Agent events always get the highest priority - EXTENSION_EVENT_NEW_PIPELINE = 2 # Prioritize extensions using the new dedicated pipeline over extensions hijacking the agent pipeline - EXTENSION_EVENT_OLD_PIPELINE = 3 +# class TelemetryEventPriorities(object): # pylint: disable=R0903 +# """ +# Class defining the priorities for telemetry events. Lower the number, higher the priority +# +# Note: 0 is reserved for a feature like QuickLog in the Windows Agent (i.e. 
the ability to send out telemetry +# instantly rather than waiting for a minute for the monitor thread to pick up the events) +# """ +# AGENT_EVENT = 1 # Agent events always get the highest priority +# EXTENSION_EVENT_NEW_PIPELINE = 2 # Prioritize extensions using the new dedicated pipeline over extensions hijacking the agent pipeline +# EXTENSION_EVENT_OLD_PIPELINE = 3 class TelemetryEvent(DataContract): - def __init__(self, eventId=None, providerId=None, priority=TelemetryEventPriorities.AGENT_EVENT): + def __init__(self, eventId=None, providerId=None): self.eventId = eventId # pylint: disable=C0103 self.providerId = providerId # pylint: disable=C0103 self.parameters = DataContractList(TelemetryEventParam) self.file_type = "" - self._priority = priority + # self._priority = priority # Checking if the particular param name is in the TelemetryEvent. def __contains__(self, param_name): return param_name in [param.name for param in self.parameters] - def __le__(self, other): - return self.priority <= other.priority - - def __ge__(self, other): - return self.priority >= other.priority - - def __eq__(self, other): - return self.priority == other.priority - - def __lt__(self, other): - return self.priority < other.priority - - def __gt__(self, other): - return self.priority > other.priority - - def __ne__(self, other): - return self.priority != other.priority - - @property - def priority(self): - return self._priority + # def __le__(self, other): + # return self.priority <= other.priority + # + # def __ge__(self, other): + # return self.priority >= other.priority + # + # def __eq__(self, other): + # return self.priority == other.priority + # + # def __lt__(self, other): + # return self.priority < other.priority + # + # def __gt__(self, other): + # return self.priority > other.priority + # + # def __ne__(self, other): + # return self.priority != other.priority + + # @property + # def priority(self): + # return self._priority def is_extension_event(self): # Events 
originating from the agent have "WALinuxAgent" as the Name parameter, or they don't have a Name @@ -136,7 +136,7 @@ def is_extension_event(self): for param in self.parameters: if param.name == GuestAgentExtensionEventsSchema.Name: if param.value != AGENT_NAME: - self._priority = TelemetryEventPriorities.EXTENSION_EVENT_OLD_PIPELINE + # self._priority = TelemetryEventPriorities.EXTENSION_EVENT_OLD_PIPELINE return True return False diff --git a/azurelinuxagent/ga/collect_telemetry_events.py b/azurelinuxagent/ga/collect_telemetry_events.py index 41a7c01f4b..662958fce8 100644 --- a/azurelinuxagent/ga/collect_telemetry_events.py +++ b/azurelinuxagent/ga/collect_telemetry_events.py @@ -32,7 +32,7 @@ from azurelinuxagent.common.exception import InvalidExtensionEventError from azurelinuxagent.common.future import ustr from azurelinuxagent.common.telemetryevent import TelemetryEvent, TelemetryEventParam, \ - GuestAgentGenericLogsSchema, TelemetryEventPriorities + GuestAgentGenericLogsSchema from azurelinuxagent.common.interfaces import ThreadHandlerInterface from azurelinuxagent.ga.exthandlers import HANDLER_NAME_PATTERN from azurelinuxagent.ga.periodic_operation import PeriodicOperation @@ -262,8 +262,7 @@ def _parse_telemetry_event(self, handler_name, extension_unparsed_event, event_f # Create a telemetry event, add all common parameters to the event # and then overwrite all the common params with extension events params if same - event = TelemetryEvent(TELEMETRY_LOG_EVENT_ID, TELEMETRY_LOG_PROVIDER_ID, - priority=TelemetryEventPriorities.EXTENSION_EVENT_NEW_PIPELINE) + event = TelemetryEvent(TELEMETRY_LOG_EVENT_ID, TELEMETRY_LOG_PROVIDER_ID) event.file_type = "json" self.add_common_params_to_extension_event(event, event_file_time) diff --git a/azurelinuxagent/ga/telemetry_service.py b/azurelinuxagent/ga/telemetry_service.py index 2a35eaa1b1..22ad7b8938 100644 --- a/azurelinuxagent/ga/telemetry_service.py +++ b/azurelinuxagent/ga/telemetry_service.py @@ -18,13 +18,12 @@ # 
import datetime import threading - -import traceback import time +import traceback from azurelinuxagent.common import logger from azurelinuxagent.common.event import add_event, WALAEventOperation -from azurelinuxagent.common.future import ustr, PriorityQueue +from azurelinuxagent.common.future import ustr, Queue from azurelinuxagent.common.interfaces import ThreadHandlerInterface @@ -49,7 +48,7 @@ def __init__(self, protocol_util): self.should_run = True self._thread = None self._should_process_events = threading.Event() - self._queue = PriorityQueue() + self._queue = Queue() #PriorityQueue() @staticmethod def get_thread_name(): diff --git a/tests/ga/test_telemetry_service.py b/tests/ga/test_telemetry_service.py index 29cc32490f..9279940279 100644 --- a/tests/ga/test_telemetry_service.py +++ b/tests/ga/test_telemetry_service.py @@ -41,11 +41,11 @@ from azurelinuxagent.common.future import ustr from azurelinuxagent.common.protocol.util import ProtocolUtil from azurelinuxagent.common.protocol.wire import event_to_v1_encoded -from azurelinuxagent.common.telemetryevent import TelemetryEvent, TelemetryEventPriorities, TelemetryEventParam, \ +from azurelinuxagent.common.telemetryevent import TelemetryEvent, TelemetryEventParam, \ GuestAgentExtensionEventsSchema from azurelinuxagent.common.version import CURRENT_VERSION, DISTRO_NAME, DISTRO_VERSION, AGENT_VERSION, CURRENT_AGENT, \ DISTRO_CODE_NAME -from azurelinuxagent.ga.monitor import CollectAndEnqueueEventsPeriodicOperation +from azurelinuxagent.ga.collect_telemetry_events import CollectAndEnqueueEventsPeriodicOperation from azurelinuxagent.ga.telemetry_service import get_telemetry_service_handler from tests.ga.test_monitor import random_generator from tests.protocol.mocks import MockHttpResponse, mock_wire_protocol, HttpRequestPredicates @@ -170,36 +170,36 @@ def test_thread_should_wait_for_events_to_get_in_queue_before_processing(self): self._assert_test_data_in_event_body(telemetry_handler, events) - def 
test_it_should_honour_the_priority_order_of_events(self): - - # In general, lower the number, higher the priority - # Priority Order: AGENT_EVENT > EXTENSION_EVENT_NEW_PIPELINE > EXTENSION_EVENT_OLD_PIPELINE - events = [ - TelemetryEvent(eventId=ustr(uuid.uuid4()), priority=TelemetryEventPriorities.EXTENSION_EVENT_OLD_PIPELINE), - TelemetryEvent(eventId=ustr(uuid.uuid4()), priority=TelemetryEventPriorities.EXTENSION_EVENT_OLD_PIPELINE), - TelemetryEvent(eventId=ustr(uuid.uuid4()), priority=TelemetryEventPriorities.AGENT_EVENT), - TelemetryEvent(eventId=ustr(uuid.uuid4()), priority=TelemetryEventPriorities.EXTENSION_EVENT_NEW_PIPELINE), - TelemetryEvent(eventId=ustr(uuid.uuid4()), priority=TelemetryEventPriorities.EXTENSION_EVENT_NEW_PIPELINE), - TelemetryEvent(eventId=ustr(uuid.uuid4()), priority=TelemetryEventPriorities.AGENT_EVENT) - ] - expected_priority_order = [] - - with self._create_telemetry_service_handler(timeout=0.3, start_thread=False) as telemetry_handler: - for test_event in events: - test_event.parameters.append(TelemetryEventParam("Priority", test_event.priority)) - expected_priority_order.append(str(test_event.priority)) - telemetry_handler.enqueue_event(test_event) - - telemetry_handler.start() - self.assertTrue(telemetry_handler.is_alive(), "Thread not alive") - self._assert_test_data_in_event_body(telemetry_handler, events) - - priorities = [] - regex_pattern = r'' - for _, event_body in telemetry_handler.event_calls: - priorities.extend(re.findall(regex_pattern, textutil.str_to_encoded_ustr(event_body))) - - self.assertEqual(sorted(expected_priority_order), priorities, "Priorities dont match") + # def test_it_should_honour_the_priority_order_of_events(self): + # + # # In general, lower the number, higher the priority + # # Priority Order: AGENT_EVENT > EXTENSION_EVENT_NEW_PIPELINE > EXTENSION_EVENT_OLD_PIPELINE + # events = [ + # TelemetryEvent(eventId=ustr(uuid.uuid4()), priority=TelemetryEventPriorities.EXTENSION_EVENT_OLD_PIPELINE), + # 
TelemetryEvent(eventId=ustr(uuid.uuid4()), priority=TelemetryEventPriorities.EXTENSION_EVENT_OLD_PIPELINE), + # TelemetryEvent(eventId=ustr(uuid.uuid4()), priority=TelemetryEventPriorities.AGENT_EVENT), + # TelemetryEvent(eventId=ustr(uuid.uuid4()), priority=TelemetryEventPriorities.EXTENSION_EVENT_NEW_PIPELINE), + # TelemetryEvent(eventId=ustr(uuid.uuid4()), priority=TelemetryEventPriorities.EXTENSION_EVENT_NEW_PIPELINE), + # TelemetryEvent(eventId=ustr(uuid.uuid4()), priority=TelemetryEventPriorities.AGENT_EVENT) + # ] + # expected_priority_order = [] + # + # with self._create_telemetry_service_handler(timeout=0.3, start_thread=False) as telemetry_handler: + # for test_event in events: + # test_event.parameters.append(TelemetryEventParam("Priority", test_event.priority)) + # expected_priority_order.append(str(test_event.priority)) + # telemetry_handler.enqueue_event(test_event) + # + # telemetry_handler.start() + # self.assertTrue(telemetry_handler.is_alive(), "Thread not alive") + # self._assert_test_data_in_event_body(telemetry_handler, events) + # + # priorities = [] + # regex_pattern = r'' + # for _, event_body in telemetry_handler.event_calls: + # priorities.extend(re.findall(regex_pattern, textutil.str_to_encoded_ustr(event_body))) + # + # self.assertEqual(sorted(expected_priority_order), priorities, "Priorities dont match") def test_telemetry_service_should_report_event_if_wireserver_returns_http_error(self): From 8255dea4345cac2a2c064613807ffcb6b6791d59 Mon Sep 17 00:00:00 2001 From: larohra Date: Wed, 7 Oct 2020 16:28:11 -0700 Subject: [PATCH 37/63] Introduced ServiceStoppedError and changes around it --- azurelinuxagent/common/event.py | 6 +- azurelinuxagent/common/exception.py | 8 ++ .../ga/collect_telemetry_events.py | 121 +++++++++++------- azurelinuxagent/ga/telemetry_service.py | 4 + azurelinuxagent/ga/update.py | 2 +- 5 files changed, 93 insertions(+), 48 deletions(-) diff --git a/azurelinuxagent/common/event.py b/azurelinuxagent/common/event.py 
index 9099dd7e8a..91daede8e4 100644
--- a/azurelinuxagent/common/event.py
+++ b/azurelinuxagent/common/event.py
@@ -29,7 +29,7 @@
 import azurelinuxagent.common.conf as conf
 import azurelinuxagent.common.logger as logger
 from azurelinuxagent.common.AgentGlobals import AgentGlobals
-from azurelinuxagent.common.exception import EventError, OSUtilError
+from azurelinuxagent.common.exception import EventError, OSUtilError, ServiceStoppedError
 from azurelinuxagent.common.future import ustr
 from azurelinuxagent.common.datacontract import get_properties, set_properties
 from azurelinuxagent.common.osutil import get_osutil
@@ -633,6 +633,10 @@ def collect_events(self, enqueue_event):  # pylint: disable=R0914
                         enqueue_event(event)
                 finally:
                     os.remove(event_file_path)
+            except ServiceStoppedError as stopped_error:
+                logger.error(
+                    "Unable to enqueue events as service stopped: {0}, skipping events collection".format(
+                        ustr(stopped_error)))
             except UnicodeError as uni_err:
                 unicode_error_count += 1
                 if len(unicode_errors) < max_collect_errors_to_report:
diff --git a/azurelinuxagent/common/exception.py b/azurelinuxagent/common/exception.py
index 0f8709f67a..e8ab977c1d 100644
--- a/azurelinuxagent/common/exception.py
+++ b/azurelinuxagent/common/exception.py
@@ -227,6 +227,14 @@ def __init__(self, msg=None, inner=None):
         super(InvalidExtensionEventError, self).__init__(msg, inner)
 
 
+class ServiceStoppedError(AgentError):
+    """
+    Error thrown when trying to access a Service which is stopped
+    """
+    def __init__(self, msg=None, inner=None):
+        super(ServiceStoppedError, self).__init__(msg, inner)
+
+
 class ExtensionErrorCodes(object):  # pylint: disable=R0903
     """
     Common Error codes used across by Compute RP for better understanding
diff --git a/azurelinuxagent/ga/collect_telemetry_events.py b/azurelinuxagent/ga/collect_telemetry_events.py
index 662958fce8..a4450cd4cf 100644
--- a/azurelinuxagent/ga/collect_telemetry_events.py
+++ b/azurelinuxagent/ga/collect_telemetry_events.py
@@ -29,7
+29,7 @@ from azurelinuxagent.common import conf from azurelinuxagent.common.event import EVENTS_DIRECTORY, TELEMETRY_LOG_EVENT_ID, \ TELEMETRY_LOG_PROVIDER_ID, add_event, WALAEventOperation, add_log_event, get_event_logger, collect_events -from azurelinuxagent.common.exception import InvalidExtensionEventError +from azurelinuxagent.common.exception import InvalidExtensionEventError, ServiceStoppedError from azurelinuxagent.common.future import ustr from azurelinuxagent.common.telemetryevent import TelemetryEvent, TelemetryEventParam, \ GuestAgentGenericLogsSchema @@ -38,8 +38,8 @@ from azurelinuxagent.ga.periodic_operation import PeriodicOperation -def get_extension_telemetry_handler(enqueue_events): - return ExtensionTelemetryHandler(enqueue_events) +def get_extension_telemetry_handler(telemetry_service_handler): + return ExtensionTelemetryHandler(telemetry_service_handler) # too-few-public-methods Disabled: This class is used as an Enum @@ -73,15 +73,22 @@ class ProcessExtensionTelemetry(PeriodicOperation): _EXTENSION_EVENT_REQUIRED_FIELDS = [attr.lower() for attr in dir(ExtensionEventSchema) if not callable(getattr(ExtensionEventSchema, attr)) and not attr.startswith("__")] - def __init__(self, enqueue_event): + def __init__(self, telemetry_service_handler): super(ProcessExtensionTelemetry, self).__init__( name="collect_and_enqueue_extension_events", operation=self._collect_and_enqueue_extension_events, period=ProcessExtensionTelemetry._EXTENSION_EVENT_COLLECTION_PERIOD) - self._enqueue_event = enqueue_event + self._telemetry_service_handler = telemetry_service_handler def _collect_and_enqueue_extension_events(self): + + if self._telemetry_service_handler.stopped(): + logger.warn("{0} service is not running, skipping current iteration".format( + self._telemetry_service_handler.get_thread_name())) + return + + delete_all_event_files = True extension_handler_with_event_dirs = [] try: @@ -95,14 +102,19 @@ def _collect_and_enqueue_extension_events(self): 
handler_name = extension_handler_with_event_dir[0] handler_event_dir_path = extension_handler_with_event_dir[1] self._capture_extension_events(handler_name, handler_event_dir_path) + except ServiceStoppedError: + # Since the service stopped, we should not delete the extension files and retry sending them whenever + # the telemetry service comes back up + delete_all_event_files = False except Exception as error: msg = "Unknown error occurred when trying to collect extension events. Error: {0}, Stack: {1}".format( ustr(error), traceback.format_exc()) add_event(op=WALAEventOperation.ExtensionTelemetryEventProcessing, message=msg, is_success=False) finally: - # Always ensure that the events directory are being deleted each run, + # Always ensure that the events directory are being deleted each run except when Telemetry Service is stopped, # even if we run into an error and dont process them this run. - self._ensure_all_events_directories_empty(extension_handler_with_event_dirs) + if delete_all_event_files: + self._ensure_all_events_directories_empty(extension_handler_with_event_dirs) @staticmethod def _get_extension_events_dir_with_handler_name(extension_log_dir): @@ -155,44 +167,49 @@ def _capture_extension_events(self, handler_name, handler_event_dir_path): captured_extension_events_count = 0 dropped_events_with_error_count = defaultdict(int) - for event_file in event_files: - - event_file_path = os.path.join(handler_event_dir_path, event_file) - try: - logger.verbose("Processing event file: {0}", event_file_path) - - if not self._event_file_size_allowed(event_file_path): - continue - - # We support multiple events in a file, read the file and parse events. 
- captured_extension_events_count = self._get_captured_events_count(handler_name, event_file_path, - captured_extension_events_count, - dropped_events_with_error_count) + try: + for event_file in event_files: - # We only allow MAX_NUMBER_OF_EVENTS_PER_EXTENSION_PER_PERIOD=300 maximum events per period per handler - if captured_extension_events_count >= self._MAX_NUMBER_OF_EVENTS_PER_EXTENSION_PER_PERIOD: - msg = "Reached max count for the extension: {0}; Max Limit: {1}. Skipping the rest.".format( - handler_name, self._MAX_NUMBER_OF_EVENTS_PER_EXTENSION_PER_PERIOD) + event_file_path = os.path.join(handler_event_dir_path, event_file) + try: + logger.verbose("Processing event file: {0}", event_file_path) + + if not self._event_file_size_allowed(event_file_path): + continue + + # We support multiple events in a file, read the file and parse events. + captured_extension_events_count = self._get_captured_events_count(handler_name, event_file_path, + captured_extension_events_count, + dropped_events_with_error_count) + + # We only allow MAX_NUMBER_OF_EVENTS_PER_EXTENSION_PER_PERIOD=300 maximum events per period per handler + if captured_extension_events_count >= self._MAX_NUMBER_OF_EVENTS_PER_EXTENSION_PER_PERIOD: + msg = "Reached max count for the extension: {0}; Max Limit: {1}. 
Skipping the rest.".format( + handler_name, self._MAX_NUMBER_OF_EVENTS_PER_EXTENSION_PER_PERIOD) + logger.warn(msg) + add_log_event(level=logger.LogLevel.WARNING, message=msg, forced=True) + break + except ServiceStoppedError: + # Not logging here as already logged once, re-raising + # Since we already started processing this file, deleting it as we could've already sent some events out + raise + except Exception as error: + msg = "Failed to process event file {0}: {1}, {2}".format(event_file, ustr(error), + traceback.format_exc()) logger.warn(msg) add_log_event(level=logger.LogLevel.WARNING, message=msg, forced=True) - break + finally: + os.remove(event_file_path) - except Exception as error: - msg = "Failed to process event file {0}: {1}, {2}".format(event_file, ustr(error), - traceback.format_exc()) + finally: + if dropped_events_with_error_count: + msg = "Dropped events for Extension: {0}; Details:\n\t{1}".format(handler_name, '\n\t'.join( + ["Reason: {0}; Dropped Count: {1}".format(k, v) for k, v in dropped_events_with_error_count.items()])) logger.warn(msg) add_log_event(level=logger.LogLevel.WARNING, message=msg, forced=True) - finally: - os.remove(event_file_path) - if dropped_events_with_error_count: - msg = "Dropped events for Extension: {0}; Details:\n\t{1}".format(handler_name, '\n\t'.join( - ["Reason: {0}; Dropped Count: {1}".format(k, v) for k, v in dropped_events_with_error_count.items()])) - logger.warn(msg) - add_log_event(level=logger.LogLevel.WARNING, message=msg, forced=True) - - if captured_extension_events_count > 0: - logger.info("Collected {0} events for extension: {1}".format(captured_extension_events_count, handler_name)) + if captured_extension_events_count > 0: + logger.info("Collected {0} events for extension: {1}".format(captured_extension_events_count, handler_name)) @staticmethod def _ensure_all_events_directories_empty(extension_events_directories): @@ -235,7 +252,9 @@ def _get_captured_events_count(self, handler_name, 
event_file_path, captured_eve for event in events: try: - self._enqueue_event(self._parse_telemetry_event(handler_name, event, event_file_time)) + self._telemetry_service_handler.enqueue_event( + self._parse_telemetry_event(handler_name, event, event_file_time) + ) captured_events_count += 1 except InvalidExtensionEventError as invalid_error: # These are the errors thrown if there's an error parsing the event. We want to report these back to the @@ -243,6 +262,11 @@ def _get_captured_events_count(self, handler_name, event_file_path, captured_eve # The error messages are all static messages, we will use this to create a dict and emit an event at the # end of each run to notify if there were any errors parsing events for the extension dropped_events_with_error_count[ustr(invalid_error)] += 1 + except ServiceStoppedError as stopped_error: + logger.error( + "Unable to enqueue events as service stopped: {0}. Stopping collecting extension events".format( + ustr(stopped_error))) + raise except Exception as error: logger.warn("Unable to parse and transmit event, error: {0}".format(error)) @@ -353,19 +377,24 @@ class CollectAndEnqueueEventsPeriodicOperation(PeriodicOperation): _EVENT_COLLECTION_PERIOD = datetime.timedelta(minutes=1) - def __init__(self, enqueue_event): + def __init__(self, telemetry_service_handler): super(CollectAndEnqueueEventsPeriodicOperation, self).__init__( name="collect_and_enqueue_events", operation=self._collect_and_enqueue_events, period=CollectAndEnqueueEventsPeriodicOperation._EVENT_COLLECTION_PERIOD) - self._enqueue_event = enqueue_event + self._telemetry_service_handler = telemetry_service_handler def _collect_and_enqueue_events(self): """ Periodically send any events located in the events folder """ try: - collect_events(self._enqueue_event) + if self._telemetry_service_handler.stopped(): + logger.warn("{0} service is not running, skipping iteration.".format( + self._telemetry_service_handler.get_thread_name())) + return + + 
collect_events(self._telemetry_service_handler.enqueue_event) except Exception as error: err_msg = "Failure in collecting Agent events: {0}".format(ustr(error)) add_event(op=WALAEventOperation.UnhandledError, message=err_msg, is_success=False) @@ -379,10 +408,10 @@ class ExtensionTelemetryHandler(ThreadHandlerInterface): _THREAD_NAME = "ExtensionTelemetryHandler" - def __init__(self, enqueue_event): + def __init__(self, telemetry_service_handler): self.should_run = True self.thread = None - self._enqueue_event = enqueue_event + self._telemetry_service_handler = telemetry_service_handler @staticmethod def get_thread_name(): @@ -414,8 +443,8 @@ def stopped(self): def daemon(self): periodic_operations = [ - CollectAndEnqueueEventsPeriodicOperation(self._enqueue_event), - ProcessExtensionTelemetry(self._enqueue_event) + CollectAndEnqueueEventsPeriodicOperation(self._telemetry_service_handler), + ProcessExtensionTelemetry(self._telemetry_service_handler) ] logger.info("Successfully started the {0} thread".format(self.get_thread_name())) while not self.stopped(): diff --git a/azurelinuxagent/ga/telemetry_service.py b/azurelinuxagent/ga/telemetry_service.py index 22ad7b8938..a981632af6 100644 --- a/azurelinuxagent/ga/telemetry_service.py +++ b/azurelinuxagent/ga/telemetry_service.py @@ -23,6 +23,7 @@ from azurelinuxagent.common import logger from azurelinuxagent.common.event import add_event, WALAEventOperation +from azurelinuxagent.common.exception import ServiceStoppedError from azurelinuxagent.common.future import ustr, Queue from azurelinuxagent.common.interfaces import ThreadHandlerInterface @@ -86,6 +87,9 @@ def stopped(self): def enqueue_event(self, event): # Add event to queue and set event + if self.stopped(): + raise ServiceStoppedError("{0} is stopped, not accepting anymore events".format(self.get_thread_name())) + self._queue.put(event) # Set the event if any enqueue happens (even if already set) to trigger sending those events diff --git 
a/azurelinuxagent/ga/update.py b/azurelinuxagent/ga/update.py index 4c3d4bed28..8b19b7329c 100644 --- a/azurelinuxagent/ga/update.py +++ b/azurelinuxagent/ga/update.py @@ -299,7 +299,7 @@ def run(self, debug=False): # pylint: disable=R0912,R0914 all_thread_handlers.append(get_collect_logs_handler()) if is_extension_telemetry_pipeline_enabled(): - all_thread_handlers.append(get_extension_telemetry_handler(telemetry_handler.enqueue_event)) + all_thread_handlers.append(get_extension_telemetry_handler(telemetry_handler)) # Launch all monitoring threads for thread_handler in all_thread_handlers: From 99b7954da428a5f6e27137d31afad6ba38b562fa Mon Sep 17 00:00:00 2001 From: larohra Date: Wed, 7 Oct 2020 18:04:20 -0700 Subject: [PATCH 38/63] Code cleanup, introduced a small class only for debug purposes --- azurelinuxagent/common/event.py | 68 ++++++++++++++++++------- azurelinuxagent/common/protocol/wire.py | 45 +++++++--------- 2 files changed, 68 insertions(+), 45 deletions(-) diff --git a/azurelinuxagent/common/event.py b/azurelinuxagent/common/event.py index 91daede8e4..b91cd2b2b4 100644 --- a/azurelinuxagent/common/event.py +++ b/azurelinuxagent/common/event.py @@ -281,6 +281,51 @@ def _log_event(name, op, message, duration, is_success=True): # pylint: disable= logger.info(_EVENT_MSG, name, op, message, duration) +class EventDebugInfo(object): + MAX_ERRORS = 5 + OP_REPORT = "Report" + OP_COLLECT = "Collect" + + def __init__(self, operation=OP_REPORT): + self.unicode_error_count = 0 + self.unicode_errors = set() + self.op_error_count = 0 + self.op_errors = set() + + if operation == self.OP_REPORT: + self.unicode_error_event = WALAEventOperation.ReportEventUnicodeErrors + self.op_errors_event = WALAEventOperation.ReportEventErrors + elif operation == self.OP_COLLECT: + self.unicode_error_event = WALAEventOperation.CollectEventUnicodeErrors + self.op_errors_event = WALAEventOperation.CollectEventErrors + + def report_debug_info(self): + + def 
report_dropped_events_error(count, errors, operation_name):
+            err_msg_format = "DroppedEventsCount: {0}\nReasons (first {1} errors): {2}"
+            if count > 0:
+                add_event(op=operation_name,
+                          message=err_msg_format.format(count, EventDebugInfo.MAX_ERRORS, ', '.join(errors)),
+                          is_success=False)
+
+        report_dropped_events_error(self.op_error_count, self.op_errors, self.op_errors_event)
+        report_dropped_events_error(self.unicode_error_count, self.unicode_errors, self.unicode_error_event)
+
+    @staticmethod
+    def _update_errors_and_get_count(error_count, errors, error):
+        error_count += 1
+        if len(errors) < EventDebugInfo.MAX_ERRORS:
+            errors.add("{0}: {1}".format(ustr(error), traceback.format_exc()))
+        return error_count
+
+    def update_unicode_error(self, unicode_err):
+        self.unicode_error_count = self._update_errors_and_get_count(self.unicode_error_count, self.unicode_errors,
+                                                                     unicode_err)
+
+    def update_op_error(self, op_err):
+        self.op_error_count = self._update_errors_and_get_count(self.op_error_count, self.op_errors, op_err)
+
+
 class EventLogger(object):
     def __init__(self):
         self.event_dir = None
@@ -587,17 +632,14 @@ def report_dropped_events_error(count, errors, operation_name, max_errors_to_rep
                       message=err_msg_format.format(count, max_errors_to_report, ', '.join(errors)),
                       is_success=False)
 
-    # too-many-locals Disabled: Most local variables are being used for debugging which is acceptable.
-    def collect_events(self, enqueue_event):  # pylint: disable=R0914
+    def collect_events(self, enqueue_event):
         """
         Returns a list of events that need to be sent to the telemetry pipeline and deletes the corresponding files
         from the events directory.
""" - max_collect_errors_to_report = 5 event_directory_full_path = os.path.join(conf.get_lib_dir(), EVENTS_DIRECTORY) event_files = os.listdir(event_directory_full_path) - unicode_error_count, unicode_errors = 0, set() - collect_event_error_count, collect_event_errors = 0, set() + debug_info = EventDebugInfo(operation=EventDebugInfo.OP_COLLECT) for event_file in event_files: try: @@ -638,19 +680,11 @@ def collect_events(self, enqueue_event): # pylint: disable=R0914 "Unable to enqueue events as service stopped: {0}, skipping events collection".format( ustr(stopped_error))) except UnicodeError as uni_err: - unicode_error_count += 1 - if len(unicode_errors) < max_collect_errors_to_report: - unicode_errors.add("{0}: {1}".format(ustr(uni_err), traceback.format_exc())) + debug_info.update_unicode_error(uni_err) except Exception as error: - collect_event_error_count += 1 - if len(collect_event_errors) < max_collect_errors_to_report: - collect_event_errors.add("{0}: {1}".format(ustr(error), traceback.format_exc())) - - EventLogger.report_dropped_events_error(collect_event_error_count, collect_event_errors, - WALAEventOperation.CollectEventErrors, max_collect_errors_to_report) - EventLogger.report_dropped_events_error(unicode_error_count, unicode_errors, - WALAEventOperation.CollectEventUnicodeErrors, - max_collect_errors_to_report) + debug_info.update_op_error(error) + + debug_info.report_debug_info() def _update_legacy_agent_event(self, event, event_creation_time): # Ensure that if an agent event is missing a field from the schema defined since 2.2.47, the missing fields diff --git a/azurelinuxagent/common/protocol/wire.py b/azurelinuxagent/common/protocol/wire.py index 384b25732e..05c347cf44 100644 --- a/azurelinuxagent/common/protocol/wire.py +++ b/azurelinuxagent/common/protocol/wire.py @@ -31,7 +31,7 @@ import azurelinuxagent.common.utils.textutil as textutil from azurelinuxagent.common.datacontract import validate_param from azurelinuxagent.common.event import 
add_event, add_periodic, WALAEventOperation, EventLogger, \ - report_event + report_event, EventDebugInfo from azurelinuxagent.common.exception import ProtocolNotFoundError, \ ResourceGoneError, ExtensionDownloadError, InvalidContainerError, ProtocolError, HttpError from azurelinuxagent.common.future import httpclient, bytebuffer, ustr @@ -1079,28 +1079,18 @@ def send_encoded_event(self, provider_id, event_str, encoding='utf-8'): raise ProtocolError( "Failed to send events:{0}".format(resp.status)) - # too-many-locals Disabled: Most of the locals are used for error debugging - def report_event(self, get_events_in_queue): # pylint: disable=too-many-locals - max_send_errors_to_report = 5 + def report_event(self, get_events_in_queue): buf = {} + debug_info = EventDebugInfo(operation=EventDebugInfo.OP_REPORT) events_per_request = defaultdict(int) - unicode_error_count, unicode_errors = 0, set() - event_report_error_count, event_report_errors = 0, set() - def _send_event(provider_id): - uni_err_count, err_count = 0, 0 + def _send_event(provider_id, debug_info): try: self.send_encoded_event(provider_id, buf[provider_id]) except UnicodeError as uni_error: - uni_err_count += 1 - if len(unicode_errors) < max_send_errors_to_report: - unicode_errors.add("{0}: {1}".format(ustr(uni_error), traceback.format_exc())) + debug_info.update_unicode_error(uni_error) except Exception as error: - err_count += 1 - if len(event_report_errors) < max_send_errors_to_report: - event_report_errors.add("{0}: {1}".format(ustr(error), traceback.format_exc())) - - return uni_err_count, err_count + debug_info.update_op_error(error) # Group events by providerId for event in get_events_in_queue(): @@ -1108,7 +1098,9 @@ def _send_event(provider_id): if event.providerId not in buf: buf[event.providerId] = b'' event_str = event_to_v1_encoded(event) + if len(event_str) >= MAX_EVENT_BUFFER_SIZE: + # Ignore single events that are too large to send out details_of_event = [ustr(x.name) + ":" + ustr(x.value) 
for x in event.parameters if x.name in [GuestAgentExtensionEventsSchema.Name, GuestAgentExtensionEventsSchema.Version, GuestAgentExtensionEventsSchema.Operation, @@ -1117,15 +1109,18 @@ def _send_event(provider_id): "Single event too large: {0}, with the length: {1} more than the limit({2})" .format(str(details_of_event), len(event_str), MAX_EVENT_BUFFER_SIZE)) continue + + # If buffer is full, send out the events in buffer and reset buffer if len(buf[event.providerId] + event_str) >= MAX_EVENT_BUFFER_SIZE: logger.verbose("No of events this request = {0}".format(events_per_request[event.providerId])) - uni_err_count, err_count = _send_event(event.providerId) - unicode_error_count += uni_err_count - event_report_error_count += err_count + _send_event(event.providerId, debug_info) buf[event.providerId] = b'' events_per_request[event.providerId] = 0 + + # Add encoded events to the buffer buf[event.providerId] = buf[event.providerId] + event_str events_per_request[event.providerId] += 1 + except Exception as error: logger.warn("Unexpected error when generating Events: {0}, {1}", ustr(error), traceback.format_exc()) logger.verbose("done reporting for Event {0}".format(event)) @@ -1134,15 +1129,9 @@ def _send_event(provider_id): for provider_id in list(buf.keys()): if buf[provider_id]: logger.verbose("No of events this request = {0}".format(events_per_request[provider_id])) - uni_err_count, err_count = _send_event(provider_id) - unicode_error_count += uni_err_count - event_report_error_count += err_count - - EventLogger.report_dropped_events_error(event_report_error_count, event_report_errors, - WALAEventOperation.ReportEventErrors, max_send_errors_to_report) - EventLogger.report_dropped_events_error(unicode_error_count, unicode_errors, - WALAEventOperation.ReportEventUnicodeErrors, - max_send_errors_to_report) + _send_event(provider_id, debug_info) + + debug_info.report_debug_info() def report_status_event(self, message, is_success): 
report_event(op=WALAEventOperation.ReportStatus, From 905710ac52e7ede9242767f6ab0421a65ad40607 Mon Sep 17 00:00:00 2001 From: larohra Date: Fri, 9 Oct 2020 14:31:24 -0700 Subject: [PATCH 39/63] Final review changes, code cleanup and added more try-catches --- azurelinuxagent/common/future.py | 4 +-- .../ga/collect_telemetry_events.py | 25 +++++++++++++------ azurelinuxagent/ga/exthandlers.py | 5 ---- azurelinuxagent/ga/telemetry_service.py | 23 ++++++++++------- azurelinuxagent/ga/update.py | 13 ++++------ tests/ga/test_update.py | 2 +- 6 files changed, 40 insertions(+), 32 deletions(-) diff --git a/azurelinuxagent/common/future.py b/azurelinuxagent/common/future.py index 5458d1179e..963bb37a6c 100644 --- a/azurelinuxagent/common/future.py +++ b/azurelinuxagent/common/future.py @@ -33,12 +33,12 @@ # unused-import, import-error Disabled: Due to backward compatibility between py2 and py3 from builtins import int, range # pylint: disable=unused-import,import-error from collections import OrderedDict # pylint: disable=W0611 - from queue import Queue # pylint: disable=W0611,import-error + from queue import Queue, Full # pylint: disable=W0611,import-error elif sys.version_info[0] == 2: import httplib as httpclient # pylint: disable=E0401,W0611 from urlparse import urlparse # pylint: disable=E0401 - from Queue import Queue # pylint: disable=W0611,import-error + from Queue import Queue, Full # pylint: disable=W0611,import-error # We want to suppress the following: diff --git a/azurelinuxagent/ga/collect_telemetry_events.py b/azurelinuxagent/ga/collect_telemetry_events.py index a4450cd4cf..7d2b809a11 100644 --- a/azurelinuxagent/ga/collect_telemetry_events.py +++ b/azurelinuxagent/ga/collect_telemetry_events.py @@ -38,8 +38,8 @@ from azurelinuxagent.ga.periodic_operation import PeriodicOperation -def get_extension_telemetry_handler(telemetry_service_handler): - return ExtensionTelemetryHandler(telemetry_service_handler) +def 
get_telemetry_collector_handler(telemetry_service_handler): + return CollectTelemetryEventsHandler(telemetry_service_handler) # too-few-public-methods Disabled: This class is used as an Enum @@ -73,6 +73,12 @@ class ProcessExtensionTelemetry(PeriodicOperation): _EXTENSION_EVENT_REQUIRED_FIELDS = [attr.lower() for attr in dir(ExtensionEventSchema) if not callable(getattr(ExtensionEventSchema, attr)) and not attr.startswith("__")] + _ENABLE_EXTENSION_TELEMETRY_PIPELINE = False + + @staticmethod + def is_extension_telemetry_pipeline_enabled(): + return ProcessExtensionTelemetry._ENABLE_EXTENSION_TELEMETRY_PIPELINE + def __init__(self, telemetry_service_handler): super(ProcessExtensionTelemetry, self).__init__( name="collect_and_enqueue_extension_events", @@ -400,7 +406,7 @@ def _collect_and_enqueue_events(self): add_event(op=WALAEventOperation.UnhandledError, message=err_msg, is_success=False) -class ExtensionTelemetryHandler(ThreadHandlerInterface): +class CollectTelemetryEventsHandler(ThreadHandlerInterface): """ This Handler takes care of fetching the Extension Telemetry events from the {extension_events_dir} and sends it to Kusto for advanced debuggability. 
@@ -415,7 +421,7 @@ def __init__(self, telemetry_service_handler): @staticmethod def get_thread_name(): - return ExtensionTelemetryHandler._THREAD_NAME + return CollectTelemetryEventsHandler._THREAD_NAME def run(self): logger.info("Start Extension Telemetry service.") @@ -427,7 +433,7 @@ def is_alive(self): def start(self): self.thread = threading.Thread(target=self.daemon) self.thread.setDaemon(True) - self.thread.setName(ExtensionTelemetryHandler.get_thread_name()) + self.thread.setName(CollectTelemetryEventsHandler.get_thread_name()) self.thread.start() def stop(self): @@ -443,9 +449,14 @@ def stopped(self): def daemon(self): periodic_operations = [ - CollectAndEnqueueEventsPeriodicOperation(self._telemetry_service_handler), - ProcessExtensionTelemetry(self._telemetry_service_handler) + CollectAndEnqueueEventsPeriodicOperation(self._telemetry_service_handler) ] + + logger.info("Extension Telemetry pipeline enabled: {0}".format( + ProcessExtensionTelemetry.is_extension_telemetry_pipeline_enabled())) + if ProcessExtensionTelemetry.is_extension_telemetry_pipeline_enabled(): + periodic_operations.append(ProcessExtensionTelemetry(self._telemetry_service_handler)) + logger.info("Successfully started the {0} thread".format(self.get_thread_name())) while not self.stopped(): try: diff --git a/azurelinuxagent/ga/exthandlers.py b/azurelinuxagent/ga/exthandlers.py index f610dfa952..d4c476b8fb 100644 --- a/azurelinuxagent/ga/exthandlers.py +++ b/azurelinuxagent/ga/exthandlers.py @@ -81,11 +81,6 @@ _NUM_OF_STATUS_FILE_RETRIES = 5 _STATUS_FILE_RETRY_DELAY = 2 # seconds -_ENABLE_EXTENSION_TELEMETRY_PIPELINE = False - -def is_extension_telemetry_pipeline_enabled(): - return _ENABLE_EXTENSION_TELEMETRY_PIPELINE - class ValidHandlerStatus(object): # pylint: disable=R0903 transitioning = "transitioning" diff --git a/azurelinuxagent/ga/telemetry_service.py b/azurelinuxagent/ga/telemetry_service.py index a981632af6..cbe09fd238 100644 --- a/azurelinuxagent/ga/telemetry_service.py +++ 
b/azurelinuxagent/ga/telemetry_service.py @@ -24,7 +24,7 @@ from azurelinuxagent.common import logger from azurelinuxagent.common.event import add_event, WALAEventOperation from azurelinuxagent.common.exception import ServiceStoppedError -from azurelinuxagent.common.future import ustr, Queue +from azurelinuxagent.common.future import ustr, Queue, Full from azurelinuxagent.common.interfaces import ThreadHandlerInterface @@ -39,9 +39,9 @@ class TelemetryServiceHandler(ThreadHandlerInterface): """ _THREAD_NAME = "TelemetryServiceHandler" - _MAX_TIMEOUT = datetime.timedelta(minutes=5).seconds + _MAX_TIMEOUT = datetime.timedelta(seconds=5).seconds _MIN_QUEUE_LIMIT = 30 - _MIN_WAIT_TIME = datetime.timedelta(seconds=5) + _MIN_BATCH_WAIT_TIME = datetime.timedelta(seconds=5) def __init__(self, protocol_util): @@ -90,7 +90,12 @@ def enqueue_event(self, event): if self.stopped(): raise ServiceStoppedError("{0} is stopped, not accepting anymore events".format(self.get_thread_name())) - self._queue.put(event) + # Queue.put() can block if the queue is full which can be an uninterruptible wait. Blocking for a max of + # TelemetryServiceHandler._MAX_TIMEOUT seconds and raising a ServiceStoppedError to retry later. + try: + self._queue.put(event, timeout=TelemetryServiceHandler._MAX_TIMEOUT) + except Full as error: + raise ServiceStoppedError("Queue full, stopping any more enqueuing until the next run. {0}", ustr(error)) # Set the event if any enqueue happens (even if already set) to trigger sending those events self._should_process_events.set() @@ -98,8 +103,8 @@ def enqueue_event(self, event): def _process_telemetry_thread(self): logger.info("Successfully started the {0} thread".format(self.get_thread_name())) try: - # On demand wait, start processing as soon as there is any data available in the queue - # In worst case, also keep checking every 5 mins to ensure that no data is being missed + # On demand wait, start processing as soon as there is any data available in the queue. 
In worst case, + # also keep checking every TelemetryServiceHandler._MAX_TIMEOUT secs to avoid uninterruptible waits while not self.stopped(): self._should_process_events.wait(timeout=TelemetryServiceHandler._MAX_TIMEOUT) self._send_events_in_queue() @@ -125,9 +130,9 @@ def _send_events_in_queue(self): if not self._queue.empty(): start_time = datetime.datetime.utcnow() while self._queue.qsize() < self._MIN_QUEUE_LIMIT or \ - (start_time + self._MIN_WAIT_TIME) <= datetime.datetime.utcnow(): - # To promote batching, we either wait for atleast 30 events or 5 secs before sending out the first - # request to wireserver + (start_time + self._MIN_BATCH_WAIT_TIME) <= datetime.datetime.utcnow(): + # To promote batching, we either wait for atleast _MIN_QUEUE_LIMIT events or _MIN_BATCH_WAIT_TIME secs + # before sending out the first request to wireserver time.sleep(secs=1) self._protocol.report_event(self._get_events_in_queue) diff --git a/azurelinuxagent/ga/update.py b/azurelinuxagent/ga/update.py index 8b19b7329c..192e643c16 100644 --- a/azurelinuxagent/ga/update.py +++ b/azurelinuxagent/ga/update.py @@ -53,10 +53,9 @@ PY_VERSION_MINOR, PY_VERSION_MICRO from azurelinuxagent.ga.collect_logs import get_collect_logs_handler, is_log_collection_allowed from azurelinuxagent.ga.env import get_env_handler -from azurelinuxagent.ga.collect_telemetry_events import get_extension_telemetry_handler +from azurelinuxagent.ga.collect_telemetry_events import get_telemetry_collector_handler, ProcessExtensionTelemetry -from azurelinuxagent.ga.exthandlers import HandlerManifest, get_traceback, ExtHandlersHandler, \ - is_extension_telemetry_pipeline_enabled, list_agent_lib_directory +from azurelinuxagent.ga.exthandlers import HandlerManifest, get_traceback, ExtHandlersHandler, list_agent_lib_directory from azurelinuxagent.ga.monitor import get_monitor_handler # pylint: disable=C0302 @@ -292,15 +291,13 @@ def run(self, debug=False): # pylint: disable=R0912,R0914 all_thread_handlers = [ 
get_monitor_handler(), get_env_handler(), - telemetry_handler + telemetry_handler, + get_telemetry_collector_handler(telemetry_handler) ] if is_log_collection_allowed(): all_thread_handlers.append(get_collect_logs_handler()) - if is_extension_telemetry_pipeline_enabled(): - all_thread_handlers.append(get_extension_telemetry_handler(telemetry_handler)) - # Launch all monitoring threads for thread_handler in all_thread_handlers: thread_handler.run() @@ -803,7 +800,7 @@ def _ensure_extension_telemetry_state_configured_properly(protocol): continue try: - if not is_extension_telemetry_pipeline_enabled(): + if not ProcessExtensionTelemetry.is_extension_telemetry_pipeline_enabled(): # If extension telemetry pipeline is disabled, ensure we delete all existing extension events directory # because the agent will not be listening on those events. extension_event_dirs = glob.glob(os.path.join(conf.get_ext_log_dir(), "*", EVENTS_DIRECTORY)) diff --git a/tests/ga/test_update.py b/tests/ga/test_update.py index fa605fdcd0..e0af052d07 100644 --- a/tests/ga/test_update.py +++ b/tests/ga/test_update.py @@ -1557,7 +1557,7 @@ def test_it_should_recreate_handler_env_on_service_startup(self): # Rerun the update handler and ensure that the HandlerEnvironment file is recreated with eventsFolder # flag in HandlerEnvironment.json file - with patch('azurelinuxagent.ga.exthandlers._ENABLE_EXTENSION_TELEMETRY_PIPELINE', + with patch('azurelinuxagent.ga.collect_telemetry_events.ProcessExtensionTelemetry.is_extension_telemetry_pipeline_enabled', return_value=True): update_handler.set_iterations(1) update_handler.run(debug=True) From 6e2f6e792d336ab09a25437436783d1bd5be96f6 Mon Sep 17 00:00:00 2001 From: larohra Date: Fri, 9 Oct 2020 15:19:26 -0700 Subject: [PATCH 40/63] Reverted the is_extension_telemetry_pipeline_enabled flag position --- azurelinuxagent/ga/collect_telemetry_events.py | 12 +++--------- azurelinuxagent/ga/exthandlers.py | 7 ++++++- azurelinuxagent/ga/update.py | 7 ++++--- 
tests/ga/test_update.py | 3 +-- 4 files changed, 14 insertions(+), 15 deletions(-) diff --git a/azurelinuxagent/ga/collect_telemetry_events.py b/azurelinuxagent/ga/collect_telemetry_events.py index 7d2b809a11..c57c65eb7a 100644 --- a/azurelinuxagent/ga/collect_telemetry_events.py +++ b/azurelinuxagent/ga/collect_telemetry_events.py @@ -34,7 +34,7 @@ from azurelinuxagent.common.telemetryevent import TelemetryEvent, TelemetryEventParam, \ GuestAgentGenericLogsSchema from azurelinuxagent.common.interfaces import ThreadHandlerInterface -from azurelinuxagent.ga.exthandlers import HANDLER_NAME_PATTERN +from azurelinuxagent.ga.exthandlers import HANDLER_NAME_PATTERN, is_extension_telemetry_pipeline_enabled from azurelinuxagent.ga.periodic_operation import PeriodicOperation @@ -73,12 +73,6 @@ class ProcessExtensionTelemetry(PeriodicOperation): _EXTENSION_EVENT_REQUIRED_FIELDS = [attr.lower() for attr in dir(ExtensionEventSchema) if not callable(getattr(ExtensionEventSchema, attr)) and not attr.startswith("__")] - _ENABLE_EXTENSION_TELEMETRY_PIPELINE = False - - @staticmethod - def is_extension_telemetry_pipeline_enabled(): - return ProcessExtensionTelemetry._ENABLE_EXTENSION_TELEMETRY_PIPELINE - def __init__(self, telemetry_service_handler): super(ProcessExtensionTelemetry, self).__init__( name="collect_and_enqueue_extension_events", @@ -453,8 +447,8 @@ def daemon(self): ] logger.info("Extension Telemetry pipeline enabled: {0}".format( - ProcessExtensionTelemetry.is_extension_telemetry_pipeline_enabled())) - if ProcessExtensionTelemetry.is_extension_telemetry_pipeline_enabled(): + is_extension_telemetry_pipeline_enabled())) + if is_extension_telemetry_pipeline_enabled(): periodic_operations.append(ProcessExtensionTelemetry(self._telemetry_service_handler)) logger.info("Successfully started the {0} thread".format(self.get_thread_name())) diff --git a/azurelinuxagent/ga/exthandlers.py b/azurelinuxagent/ga/exthandlers.py index d4c476b8fb..074a00ae87 100644 --- 
a/azurelinuxagent/ga/exthandlers.py +++ b/azurelinuxagent/ga/exthandlers.py @@ -52,7 +52,7 @@ _HANDLER_NAME_PATTERN = r'^([^-]+)' _HANDLER_VERSION_PATTERN = r'(\d+(?:\.\d+)*)' -_HANDLER_PATTERN = _HANDLER_NAME_PATTERN + r"-" + _HANDLER_VERSION_PATTERN #r'^([^-]+)-(\d+(?:\.\d+)*)' +_HANDLER_PATTERN = _HANDLER_NAME_PATTERN + r"-" + _HANDLER_VERSION_PATTERN _HANDLER_PKG_PATTERN = re.compile(_HANDLER_PATTERN + r'\.zip$', re.IGNORECASE) _DEFAULT_EXT_TIMEOUT_MINUTES = 90 @@ -81,6 +81,11 @@ _NUM_OF_STATUS_FILE_RETRIES = 5 _STATUS_FILE_RETRY_DELAY = 2 # seconds +_ENABLE_EXTENSION_TELEMETRY_PIPELINE = False + +def is_extension_telemetry_pipeline_enabled(): + return _ENABLE_EXTENSION_TELEMETRY_PIPELINE + class ValidHandlerStatus(object): # pylint: disable=R0903 transitioning = "transitioning" diff --git a/azurelinuxagent/ga/update.py b/azurelinuxagent/ga/update.py index 192e643c16..0d75806c47 100644 --- a/azurelinuxagent/ga/update.py +++ b/azurelinuxagent/ga/update.py @@ -53,9 +53,10 @@ PY_VERSION_MINOR, PY_VERSION_MICRO from azurelinuxagent.ga.collect_logs import get_collect_logs_handler, is_log_collection_allowed from azurelinuxagent.ga.env import get_env_handler -from azurelinuxagent.ga.collect_telemetry_events import get_telemetry_collector_handler, ProcessExtensionTelemetry +from azurelinuxagent.ga.collect_telemetry_events import get_telemetry_collector_handler -from azurelinuxagent.ga.exthandlers import HandlerManifest, get_traceback, ExtHandlersHandler, list_agent_lib_directory +from azurelinuxagent.ga.exthandlers import HandlerManifest, get_traceback, ExtHandlersHandler, list_agent_lib_directory, \ + is_extension_telemetry_pipeline_enabled from azurelinuxagent.ga.monitor import get_monitor_handler # pylint: disable=C0302 @@ -800,7 +801,7 @@ def _ensure_extension_telemetry_state_configured_properly(protocol): continue try: - if not ProcessExtensionTelemetry.is_extension_telemetry_pipeline_enabled(): + if not is_extension_telemetry_pipeline_enabled(): # If extension 
telemetry pipeline is disabled, ensure we delete all existing extension events directory # because the agent will not be listening on those events. extension_event_dirs = glob.glob(os.path.join(conf.get_ext_log_dir(), "*", EVENTS_DIRECTORY)) diff --git a/tests/ga/test_update.py b/tests/ga/test_update.py index e0af052d07..32936b9c23 100644 --- a/tests/ga/test_update.py +++ b/tests/ga/test_update.py @@ -1557,8 +1557,7 @@ def test_it_should_recreate_handler_env_on_service_startup(self): # Rerun the update handler and ensure that the HandlerEnvironment file is recreated with eventsFolder # flag in HandlerEnvironment.json file - with patch('azurelinuxagent.ga.collect_telemetry_events.ProcessExtensionTelemetry.is_extension_telemetry_pipeline_enabled', - return_value=True): + with patch('azurelinuxagent.ga.exthandlers.is_extension_telemetry_pipeline_enabled', return_value=True): update_handler.set_iterations(1) update_handler.run(debug=True) From 4c43bd0fd9321d2135f2a05442e1b1c7e62bf193 Mon Sep 17 00:00:00 2001 From: larohra Date: Fri, 9 Oct 2020 15:28:08 -0700 Subject: [PATCH 41/63] Fixed test_monitor.py --- tests/ga/test_monitor.py | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/tests/ga/test_monitor.py b/tests/ga/test_monitor.py index 99e80c95b8..83b0d8b4f9 100644 --- a/tests/ga/test_monitor.py +++ b/tests/ga/test_monitor.py @@ -60,7 +60,6 @@ def run(self): run.original_definition(self) run.original_definition = PeriodicOperation.run - event_list = [] with mock_wire_protocol(DATA_FILE) as protocol: protocol_util = MagicMock() protocol_util.get_protocol = Mock(return_value=protocol) @@ -72,10 +71,9 @@ def run_and_wait(): monitor_handler.run() monitor_handler.join() - monitor_handler = get_monitor_handler(event_list.append) + monitor_handler = get_monitor_handler() monitor_handler.get_mock_wire_protocol = lambda: protocol monitor_handler.run_and_wait = run_and_wait - monitor_handler.event_list = event_list yield monitor_handler @@ -345,7 +343,7 @@ 
class TestMonitorFailure(AgentTestCase): @patch("azurelinuxagent.common.protocol.healthservice.HealthService.report_host_plugin_heartbeat") def test_error_heartbeat_creates_no_signal(self, patch_report_heartbeat, patch_http_get, patch_add_event, *args): # pylint: disable=unused-argument - monitor_handler = get_monitor_handler(MagicMock()) + monitor_handler = get_monitor_handler() protocol = WireProtocol('endpoint') protocol.update_goal_state = MagicMock() with patch('azurelinuxagent.common.protocol.util.ProtocolUtil.get_protocol', return_value=protocol): From dfc522b98dbe88d6a397ef9b2609d2417f660725 Mon Sep 17 00:00:00 2001 From: larohra Date: Mon, 12 Oct 2020 15:09:07 -0700 Subject: [PATCH 42/63] Fixed failing tests --- azurelinuxagent/ga/telemetry_service.py | 9 +- tests/ga/test_collect_telemetry_events.py | 51 ++++++- tests/ga/test_telemetry_service.py | 154 ++++++++++++++++------ 3 files changed, 162 insertions(+), 52 deletions(-) diff --git a/azurelinuxagent/ga/telemetry_service.py b/azurelinuxagent/ga/telemetry_service.py index cbe09fd238..29319edf6e 100644 --- a/azurelinuxagent/ga/telemetry_service.py +++ b/azurelinuxagent/ga/telemetry_service.py @@ -129,11 +129,12 @@ def _send_events_in_queue(self): # Process everything in Queue if not self._queue.empty(): start_time = datetime.datetime.utcnow() - while self._queue.qsize() < self._MIN_QUEUE_LIMIT or \ - (start_time + self._MIN_BATCH_WAIT_TIME) <= datetime.datetime.utcnow(): + while not self.stopped() and self._queue.qsize() < self._MIN_QUEUE_LIMIT and ( + start_time + self._MIN_BATCH_WAIT_TIME) > datetime.datetime.utcnow(): # To promote batching, we either wait for atleast _MIN_QUEUE_LIMIT events or _MIN_BATCH_WAIT_TIME secs - # before sending out the first request to wireserver - time.sleep(secs=1) + # before sending out the first request to wireserver. + # If the thread is requested to stop midway, we skip batching and send whatever we have in the queue. 
+ time.sleep(1) self._protocol.report_event(self._get_events_in_queue) # Reset the event when done processing all events in queue diff --git a/tests/ga/test_collect_telemetry_events.py b/tests/ga/test_collect_telemetry_events.py index 73d8282f14..f8918fc746 100644 --- a/tests/ga/test_collect_telemetry_events.py +++ b/tests/ga/test_collect_telemetry_events.py @@ -26,11 +26,11 @@ import uuid from collections import defaultdict -from mock import patch +from mock import patch, MagicMock from azurelinuxagent.common import conf from azurelinuxagent.common.event import EVENTS_DIRECTORY -from azurelinuxagent.common.exception import InvalidExtensionEventError +from azurelinuxagent.common.exception import InvalidExtensionEventError, ServiceStoppedError from azurelinuxagent.common.future import ustr from azurelinuxagent.common.protocol.util import ProtocolUtil from azurelinuxagent.common.telemetryevent import GuestAgentGenericLogsSchema, \ @@ -166,10 +166,14 @@ def _get_param_value_from_event_body_if_exists(event_list, param_name): @contextlib.contextmanager - def _create_extension_telemetry_processor(self): + def _create_extension_telemetry_processor(self, telemetry_handler=None): event_list = [] - extension_telemetry_processor = ProcessExtensionTelemetry(event_list.append) + if not telemetry_handler: + telemetry_handler = MagicMock(autospec=True) + telemetry_handler.stopped = MagicMock(return_value=False) + telemetry_handler.enqueue_event = MagicMock(wraps=event_list.append) + extension_telemetry_processor = ProcessExtensionTelemetry(telemetry_handler) extension_telemetry_processor.event_list = event_list yield extension_telemetry_processor @@ -527,4 +531,41 @@ def test_it_should_not_send_event_where_message_is_empty_and_report_event(self): self._assert_handler_data_in_event_list(telemetry_events, extensions_with_count, expected_count=0) self._assert_invalid_extension_error_event_reported(mock_event, extensions_with_count, InvalidExtensionEventError.EmptyMessageError, - 
expected_drop_count=1) \ No newline at end of file + expected_drop_count=1) + + def test_it_should_not_process_events_if_telemetry_service_stopped(self): + event_list = [] + telemetry_handler = MagicMock(autospec=True) + telemetry_handler.stopped = MagicMock(return_value=True) + telemetry_handler.enqueue_event = MagicMock(wraps=event_list.append) + + with self._create_extension_telemetry_processor(telemetry_handler) as extension_telemetry_processor: + self._create_random_extension_events_dir_with_events(3, self._WELL_FORMED_FILES) + extension_telemetry_processor.run() + + self.assertEqual(0, len(event_list), "No events should have been enqueued") + + def test_it_should_not_delete_event_files_except_current_one_if_service_stopped_midway(self): + event_list = [] + telemetry_handler = MagicMock(autospec=True) + telemetry_handler.stopped = MagicMock(return_value=False) + telemetry_handler.enqueue_event = MagicMock(side_effect=ServiceStoppedError("Telemetry service stopped"), + wraps=event_list.append) + no_of_extensions = 3 + # self._WELL_FORMED_FILES has 3 event files, i.e. 
total files for 3 extensions = 3 * 3 = 9 + # But since we delete the file that we were processing last, expected count = 8 + expected_event_file_count = 8 + + with self._create_extension_telemetry_processor(telemetry_handler) as extension_telemetry_processor: + ext_names = self._create_random_extension_events_dir_with_events(no_of_extensions, self._WELL_FORMED_FILES) + extension_telemetry_processor.run() + + self.assertEqual(0, len(event_list), "No events should have been enqueued") + total_file_count = 0 + for ext_name in ext_names: + event_dir = os.path.join(conf.get_ext_log_dir(), ext_name, EVENTS_DIRECTORY) + file_count = len(os.listdir(event_dir)) + self.assertGreater(file_count, 0, "Some event files should still be there") + total_file_count += file_count + + self.assertEqual(expected_event_file_count, total_file_count, "Expected File count doesn't match") diff --git a/tests/ga/test_telemetry_service.py b/tests/ga/test_telemetry_service.py index 9279940279..2289669925 100644 --- a/tests/ga/test_telemetry_service.py +++ b/tests/ga/test_telemetry_service.py @@ -37,7 +37,7 @@ from azurelinuxagent.common.utils import restutil, fileutil, textutil from azurelinuxagent.common.event import WALAEventOperation, EVENTS_DIRECTORY -from azurelinuxagent.common.exception import HttpError +from azurelinuxagent.common.exception import HttpError, ServiceStoppedError from azurelinuxagent.common.future import ustr from azurelinuxagent.common.protocol.util import ProtocolUtil from azurelinuxagent.common.protocol.wire import event_to_v1_encoded @@ -50,7 +50,7 @@ from tests.ga.test_monitor import random_generator from tests.protocol.mocks import MockHttpResponse, mock_wire_protocol, HttpRequestPredicates from tests.protocol.mockwiredata import DATA_FILE -from tests.tools import AgentTestCase, clear_singleton_instances +from tests.tools import AgentTestCase, clear_singleton_instances, mock_sleep from tests.utils.event_logger_tools import EventLoggerTools @@ -70,7 +70,7 @@ def 
tearDown(self): _TEST_EVENT_PROVIDER_ID = "XXXXXXXX-XXXX-XXXX-XXXX-XXXXXXXXXXXX" @contextlib.contextmanager - def _create_telemetry_service_handler(self, timeout=0.5, start_thread=True): + def _create_telemetry_service_handler(self, timeout=0.5, start_thread=True, batching_queue_limit=1): def http_post_handler(url, body, **__): if self.is_telemetry_request(url): telemetry_service_handler.event_calls.append((datetime.now(), body)) @@ -82,12 +82,15 @@ def http_post_handler(url, body, **__): protocol_util.get_protocol = Mock(return_value=protocol) telemetry_service_handler = get_telemetry_service_handler(protocol_util) telemetry_service_handler.event_calls = [] - with patch("azurelinuxagent.ga.telemetry_service.TelemetryServiceHandler._MAX_TIMEOUT", timeout): - telemetry_service_handler.get_mock_wire_protocol = lambda: protocol - if start_thread: - telemetry_service_handler.start() - self.assertTrue(telemetry_service_handler.is_alive(), "Thread didn't start properly!") - yield telemetry_service_handler + with patch("azurelinuxagent.ga.telemetry_service.TelemetryServiceHandler._MIN_QUEUE_LIMIT", + batching_queue_limit): + with patch("azurelinuxagent.ga.telemetry_service.TelemetryServiceHandler._MAX_TIMEOUT", timeout): + + telemetry_service_handler.get_mock_wire_protocol = lambda: protocol + if start_thread: + telemetry_service_handler.start() + self.assertTrue(telemetry_service_handler.is_alive(), "Thread didn't start properly!") + yield telemetry_service_handler @staticmethod def _stop_handler(telemetry_handler, timeout=0.001): @@ -138,7 +141,7 @@ def test_it_should_send_events_properly(self): self._assert_test_data_in_event_body(telemetry_handler, events) - def test_it_should_send_as_soon_as_events_available_in_queue(self): + def test_it_should_send_as_soon_as_events_available_in_queue_with_minimal_batching_limits(self): events = [TelemetryEvent(eventId=ustr(uuid.uuid4())), TelemetryEvent(eventId=ustr(uuid.uuid4()))] with self._create_telemetry_service_handler() as 
telemetry_handler: @@ -170,36 +173,101 @@ def test_thread_should_wait_for_events_to_get_in_queue_before_processing(self): self._assert_test_data_in_event_body(telemetry_handler, events) - # def test_it_should_honour_the_priority_order_of_events(self): - # - # # In general, lower the number, higher the priority - # # Priority Order: AGENT_EVENT > EXTENSION_EVENT_NEW_PIPELINE > EXTENSION_EVENT_OLD_PIPELINE - # events = [ - # TelemetryEvent(eventId=ustr(uuid.uuid4()), priority=TelemetryEventPriorities.EXTENSION_EVENT_OLD_PIPELINE), - # TelemetryEvent(eventId=ustr(uuid.uuid4()), priority=TelemetryEventPriorities.EXTENSION_EVENT_OLD_PIPELINE), - # TelemetryEvent(eventId=ustr(uuid.uuid4()), priority=TelemetryEventPriorities.AGENT_EVENT), - # TelemetryEvent(eventId=ustr(uuid.uuid4()), priority=TelemetryEventPriorities.EXTENSION_EVENT_NEW_PIPELINE), - # TelemetryEvent(eventId=ustr(uuid.uuid4()), priority=TelemetryEventPriorities.EXTENSION_EVENT_NEW_PIPELINE), - # TelemetryEvent(eventId=ustr(uuid.uuid4()), priority=TelemetryEventPriorities.AGENT_EVENT) - # ] - # expected_priority_order = [] - # - # with self._create_telemetry_service_handler(timeout=0.3, start_thread=False) as telemetry_handler: - # for test_event in events: - # test_event.parameters.append(TelemetryEventParam("Priority", test_event.priority)) - # expected_priority_order.append(str(test_event.priority)) - # telemetry_handler.enqueue_event(test_event) - # - # telemetry_handler.start() - # self.assertTrue(telemetry_handler.is_alive(), "Thread not alive") - # self._assert_test_data_in_event_body(telemetry_handler, events) - # - # priorities = [] - # regex_pattern = r'' - # for _, event_body in telemetry_handler.event_calls: - # priorities.extend(re.findall(regex_pattern, textutil.str_to_encoded_ustr(event_body))) - # - # self.assertEqual(sorted(expected_priority_order), priorities, "Priorities dont match") + def test_it_should_honor_batch_time_limits_before_sending_telemetry(self): + events = 
[TelemetryEvent(eventId=ustr(uuid.uuid4())), TelemetryEvent(eventId=ustr(uuid.uuid4()))] + wait_time = timedelta(seconds=10) + orig_sleep = time.sleep + + with patch("time.sleep", lambda *_: orig_sleep(0.01)): + with patch("azurelinuxagent.ga.telemetry_service.TelemetryServiceHandler._MIN_BATCH_WAIT_TIME", wait_time): + with self._create_telemetry_service_handler(batching_queue_limit=5) as telemetry_handler: + for test_event in events: + telemetry_handler.enqueue_event(test_event) + + self.assertEqual(0, len(telemetry_handler.event_calls), "No events should have been logged") + TestTelemetryServiceHandler._stop_handler(telemetry_handler, timeout=0.01) + + wait_time = timedelta(seconds=0.2) + with patch("time.sleep", lambda *_: orig_sleep(0.05)): + with patch("azurelinuxagent.ga.telemetry_service.TelemetryServiceHandler._MIN_BATCH_WAIT_TIME", wait_time): + with self._create_telemetry_service_handler(batching_queue_limit=5) as telemetry_handler: + test_start_time = datetime.now() + for test_event in events: + telemetry_handler.enqueue_event(test_event) + + while len(telemetry_handler.event_calls) == 0 and \ + (test_start_time + timedelta(seconds=1)) > datetime.now(): + # Wait for event calls to be made, wait a max of 1 secs + orig_sleep(0.1) + + self.assertGreater(len(telemetry_handler.event_calls), 0, "No event calls made at all!") + self._assert_test_data_in_event_body(telemetry_handler, events) + for event_time, _ in telemetry_handler.event_calls: + elapsed = event_time - test_start_time + # Technically we should send out data after 0.2 secs, but keeping a buffer of 1sec while testing + self.assertLessEqual(elapsed, timedelta(seconds=1), "Request was not sent properly") + + def test_it_should_clear_queue_before_stopping(self): + events = [TelemetryEvent(eventId=ustr(uuid.uuid4())), TelemetryEvent(eventId=ustr(uuid.uuid4()))] + wait_time = timedelta(seconds=10) + + with patch("time.sleep", lambda *_: mock_sleep(0.01)): + with 
patch("azurelinuxagent.ga.telemetry_service.TelemetryServiceHandler._MIN_BATCH_WAIT_TIME", wait_time): + with self._create_telemetry_service_handler(batching_queue_limit=5) as telemetry_handler: + for test_event in events: + telemetry_handler.enqueue_event(test_event) + + self.assertEqual(0, len(telemetry_handler.event_calls), "No events should have been logged") + TestTelemetryServiceHandler._stop_handler(telemetry_handler, timeout=0.01) + # After the service is asked to stop, we should send all data in the queue + self._assert_test_data_in_event_body(telemetry_handler, events) + + def test_it_should_honor_batch_queue_limits_before_sending_telemetry(self): + + batch_limit = 5 + + with self._create_telemetry_service_handler(batching_queue_limit=batch_limit) as telemetry_handler: + events = [] + + for _ in range(batch_limit-1): + test_event = TelemetryEvent(eventId=ustr(uuid.uuid4())) + events.append(test_event) + telemetry_handler.enqueue_event(test_event) + + self.assertEqual(0, len(telemetry_handler.event_calls), "No events should have been logged") + + for _ in range(batch_limit): + test_event = TelemetryEvent(eventId=ustr(uuid.uuid4())) + events.append(test_event) + telemetry_handler.enqueue_event(test_event) + + self._assert_test_data_in_event_body(telemetry_handler, events) + + def test_it_should_raise_on_enqueue_if_service_stopped(self): + with self._create_telemetry_service_handler(start_thread=False) as telemetry_handler: + # Ensure the thread is stopped + telemetry_handler.stop() + with self.assertRaises(ServiceStoppedError) as context_manager: + telemetry_handler.enqueue_event(TelemetryEvent(eventId=ustr(uuid.uuid4()))) + + exception = context_manager.exception + self.assertIn("TelemetryServiceHandler is stopped, not accepting anymore events", str(exception)) + + + def test_it_should_honour_the_incoming_order_of_events(self): + + with self._create_telemetry_service_handler(timeout=0.3, start_thread=False) as telemetry_handler: + for index in range(5): + 
telemetry_handler.enqueue_event(TelemetryEvent(eventId=index)) + + telemetry_handler.start() + self.assertTrue(telemetry_handler.is_alive(), "Thread not alive") + TestTelemetryServiceHandler._stop_handler(telemetry_handler) + _, event_body = telemetry_handler.event_calls[0] + event_orders = re.findall(r'', + textutil.str_to_encoded_ustr(event_body)) + self.assertEqual(sorted(event_orders), event_orders, "Events not ordered correctly") + def test_telemetry_service_should_report_event_if_wireserver_returns_http_error(self): @@ -267,7 +335,7 @@ def test_it_should_enqueue_and_send_events_properly(self, mock_lib_dir, *_): mock_lib_dir.return_value = self.lib_dir with self._create_telemetry_service_handler() as telemetry_handler: - monitor_handler = CollectAndEnqueueEventsPeriodicOperation(telemetry_handler.enqueue_event) + monitor_handler = CollectAndEnqueueEventsPeriodicOperation(telemetry_handler) self._create_extension_event(message="Message-Test") test_mtime = 1000 # epoch time, in ms @@ -338,7 +406,7 @@ def test_collect_and_send_events_with_small_events(self, mock_lib_dir): size = 2 ** power self._create_extension_event(size) - CollectAndEnqueueEventsPeriodicOperation(telemetry_handler.enqueue_event).run() + CollectAndEnqueueEventsPeriodicOperation(telemetry_handler).run() # The send_event call would be called each time, as we are filling up the buffer up to the brim for each call. 
TestTelemetryServiceHandler._stop_handler(telemetry_handler) @@ -356,7 +424,7 @@ def test_collect_and_send_events_with_large_events(self, mock_lib_dir): self._create_extension_event(size) with patch("azurelinuxagent.common.logger.periodic_warn") as patch_periodic_warn: - CollectAndEnqueueEventsPeriodicOperation(telemetry_handler.enqueue_event).run() + CollectAndEnqueueEventsPeriodicOperation(telemetry_handler).run() TestTelemetryServiceHandler._stop_handler(telemetry_handler) self.assertEqual(3, patch_periodic_warn.call_count) From e33500dd49935f0b38e4fe1e3c3e18fb23b841d0 Mon Sep 17 00:00:00 2001 From: larohra Date: Mon, 12 Oct 2020 17:35:33 -0700 Subject: [PATCH 43/63] Fixed failing TestUpdate tests --- tests/ga/test_update.py | 122 ++++++++++++++++++++++------------------ 1 file changed, 67 insertions(+), 55 deletions(-) diff --git a/tests/ga/test_update.py b/tests/ga/test_update.py index 32936b9c23..3302077dac 100644 --- a/tests/ga/test_update.py +++ b/tests/ga/test_update.py @@ -1257,27 +1257,31 @@ def iterator(*args, **kwargs): # pylint: disable=useless-return,unused-argument with patch('azurelinuxagent.ga.update.get_monitor_handler') as mock_monitor: with patch('azurelinuxagent.ga.update.get_env_handler') as mock_env: with patch('azurelinuxagent.ga.update.get_collect_logs_handler') as mock_collect_logs: - with patch('azurelinuxagent.ga.update.initialize_event_logger_vminfo_common_parameters'): - with patch('azurelinuxagent.ga.update.is_log_collection_allowed', return_value=True): - with patch('time.sleep', side_effect=iterator) as mock_sleep: # pylint: disable=redefined-outer-name - with patch('sys.exit') as mock_exit: - if isinstance(os.getppid, MagicMock): - self.update_handler.run() - else: - with patch('os.getppid', return_value=42): - self.update_handler.run() - - self.assertEqual(1, mock_handler.call_count) - self.assertEqual(mock_handler.return_value.method_calls, calls) - self.assertEqual(1, mock_ra_handler.call_count) - 
self.assertEqual(mock_ra_handler.return_value.method_calls, calls) - self.assertEqual(invocations, mock_sleep.call_count) - if invocations > 0: - self.assertEqual(sleep_interval, mock_sleep.call_args[0]) - self.assertEqual(1, mock_monitor.call_count) - self.assertEqual(1, mock_env.call_count) - self.assertEqual(1, mock_collect_logs.call_count) - self.assertEqual(1, mock_exit.call_count) + with patch('azurelinuxagent.ga.update.get_telemetry_service_handler') as mock_telemetry_service: + with patch('azurelinuxagent.ga.update.get_telemetry_collector_handler') as mock_event_collector: + with patch('azurelinuxagent.ga.update.initialize_event_logger_vminfo_common_parameters'): + with patch('azurelinuxagent.ga.update.is_log_collection_allowed', return_value=True): + with patch('time.sleep', side_effect=iterator) as mock_sleep: # pylint: disable=redefined-outer-name + with patch('sys.exit') as mock_exit: + if isinstance(os.getppid, MagicMock): + self.update_handler.run() + else: + with patch('os.getppid', return_value=42): + self.update_handler.run() + + self.assertEqual(1, mock_handler.call_count) + self.assertEqual(mock_handler.return_value.method_calls, calls) + self.assertEqual(1, mock_ra_handler.call_count) + self.assertEqual(mock_ra_handler.return_value.method_calls, calls) + self.assertEqual(invocations, mock_sleep.call_count) + if invocations > 0: + self.assertEqual(sleep_interval, mock_sleep.call_args[0]) + self.assertEqual(1, mock_monitor.call_count) + self.assertEqual(1, mock_env.call_count) + self.assertEqual(1, mock_collect_logs.call_count) + self.assertEqual(1, mock_telemetry_service.call_count) + self.assertEqual(1, mock_event_collector.call_count) + self.assertEqual(1, mock_exit.call_count) def test_run(self): self._test_run() @@ -1571,22 +1575,25 @@ def test_it_should_recreate_handler_env_on_service_startup(self): @contextlib.contextmanager def _setup_test_for_ext_event_dirs_retention(self): - tempdir = tempfile.mkdtemp() + temp_ext_log_dir = 
tempfile.mkdtemp() + temp_lib_dir = tempfile.mkdtemp() try: - with patch.object(conf, "get_ext_log_dir", return_value=tempdir): - with self._get_update_handler(test_data=DATA_FILE_MULTIPLE_EXT) as (update_handler, protocol): - with patch('azurelinuxagent.ga.exthandlers._ENABLE_EXTENSION_TELEMETRY_PIPELINE', True): - update_handler.run(debug=True) - expected_events_dirs = glob.glob(os.path.join(conf.get_ext_log_dir(), "*", EVENTS_DIRECTORY)) - no_of_extensions = protocol.mock_wire_data.get_no_of_plugins_in_extension_config() - # Ensure extensions installed and events directory created - self.assertEqual(len(expected_events_dirs), no_of_extensions, "Extension events directories dont match") - for ext_dir in expected_events_dirs: - self.assertTrue(os.path.exists(ext_dir), "Extension directory {0} not created!".format(ext_dir)) - - yield update_handler, expected_events_dirs + with patch.object(conf, "get_lib_dir", return_value=temp_lib_dir): + with patch.object(conf, "get_ext_log_dir", return_value=temp_ext_log_dir): + with self._get_update_handler(test_data=DATA_FILE_MULTIPLE_EXT) as (update_handler, protocol): + with patch('azurelinuxagent.ga.exthandlers._ENABLE_EXTENSION_TELEMETRY_PIPELINE', True): + update_handler.run(debug=True) + expected_events_dirs = glob.glob(os.path.join(conf.get_ext_log_dir(), "*", EVENTS_DIRECTORY)) + no_of_extensions = protocol.mock_wire_data.get_no_of_plugins_in_extension_config() + # Ensure extensions installed and events directory created + self.assertEqual(len(expected_events_dirs), no_of_extensions, "Extension events directories dont match") + for ext_dir in expected_events_dirs: + self.assertTrue(os.path.exists(ext_dir), "Extension directory {0} not created!".format(ext_dir)) + + yield update_handler, expected_events_dirs finally: - shutil.rmtree(tempdir, ignore_errors=True) + shutil.rmtree(temp_ext_log_dir, ignore_errors=True) + shutil.rmtree(temp_lib_dir, ignore_errors=True) def 
test_it_should_delete_extension_events_directory_if_extension_telemetry_pipeline_disabled(self): # Disable extension telemetry pipeline and ensure events directory got deleted @@ -1604,6 +1611,8 @@ def test_it_should_retain_extension_events_directories_if_extension_telemetry_pi self.assertTrue(os.path.exists(ext_dir), "Extension directory {0} should exist!".format(ext_dir)) +@patch('azurelinuxagent.ga.update.get_telemetry_collector_handler') +@patch('azurelinuxagent.ga.update.get_telemetry_service_handler') @patch('azurelinuxagent.ga.update.get_collect_logs_handler') @patch('azurelinuxagent.ga.update.get_monitor_handler') @patch('azurelinuxagent.ga.update.get_env_handler') @@ -1649,65 +1658,68 @@ def _setup_mock_thread_and_start_test_run(self, mock_thread, is_alive=True, invo self._test_run(invocations=invocations) return thread - def test_start_threads(self, mock_env, mock_monitor, mock_collect_logs): + def test_start_threads(self, mock_env, mock_monitor, mock_collect_logs, mock_telemetry_service, mock_telemetry_collector): self.assertTrue(self.update_handler.running) - mock_monitor_thread = MagicMock() - mock_monitor_thread.run = MagicMock() - mock_monitor.return_value = mock_monitor_thread - - mock_env_thread = MagicMock() - mock_env_thread.run = MagicMock() - mock_env.return_value = mock_env_thread + def _get_mock_thread(): + thread = MagicMock() + thread.run = MagicMock() + return thread - mock_collect_logs_thread = MagicMock() - mock_collect_logs_thread.run = MagicMock() - mock_collect_logs.return_value = mock_collect_logs_thread + mock_monitor.return_value = _get_mock_thread() + mock_env.return_value = _get_mock_thread() + mock_collect_logs.return_value = _get_mock_thread() + mock_telemetry_service.return_value = _get_mock_thread() + mock_telemetry_collector.return_value = _get_mock_thread() self._test_run(invocations=0) self.assertEqual(1, mock_monitor.call_count) - self.assertEqual(1, mock_monitor_thread.run.call_count) + self.assertEqual(1, 
mock_monitor().run.call_count) self.assertEqual(1, mock_env.call_count) - self.assertEqual(1, mock_env_thread.run.call_count) + self.assertEqual(1, mock_env().run.call_count) self.assertEqual(1, mock_collect_logs.call_count) - self.assertEqual(1, mock_collect_logs_thread.run.call_count) + self.assertEqual(1, mock_collect_logs().run.call_count) + self.assertEqual(1, mock_telemetry_collector.call_count) + self.assertEqual(1, mock_telemetry_collector().run.call_count) + self.assertEqual(1, mock_telemetry_service.call_count) + self.assertEqual(1, mock_telemetry_service().run.call_count) - def test_check_if_monitor_thread_is_alive(self, mock_env, mock_monitor, mock_collect_logs): # pylint: disable=unused-argument + def test_check_if_monitor_thread_is_alive(self, _, mock_monitor, *args): # pylint: disable=unused-argument mock_monitor_thread = self._setup_mock_thread_and_start_test_run(mock_monitor, is_alive=True, invocations=0) self.assertEqual(1, mock_monitor.call_count) self.assertEqual(1, mock_monitor_thread.run.call_count) self.assertEqual(1, mock_monitor_thread.is_alive.call_count) self.assertEqual(0, mock_monitor_thread.start.call_count) - def test_check_if_env_thread_is_alive(self, mock_env, mock_monitor, mock_collect_logs): # pylint: disable=unused-argument + def test_check_if_env_thread_is_alive(self, mock_env, *args): # pylint: disable=unused-argument mock_env_thread = self._setup_mock_thread_and_start_test_run(mock_env, is_alive=True, invocations=1) self.assertEqual(1, mock_env.call_count) self.assertEqual(1, mock_env_thread.run.call_count) self.assertEqual(1, mock_env_thread.is_alive.call_count) self.assertEqual(0, mock_env_thread.start.call_count) - def test_restart_monitor_thread_if_not_alive(self, mock_env, mock_monitor, mock_collect_logs): # pylint: disable=unused-argument + def test_restart_monitor_thread_if_not_alive(self, _, mock_monitor, *args): # pylint: disable=unused-argument mock_monitor_thread = 
self._setup_mock_thread_and_start_test_run(mock_monitor, is_alive=False, invocations=1) self.assertEqual(1, mock_monitor.call_count) self.assertEqual(1, mock_monitor_thread.run.call_count) self.assertEqual(1, mock_monitor_thread.is_alive.call_count) self.assertEqual(1, mock_monitor_thread.start.call_count) - def test_restart_env_thread_if_not_alive(self, mock_env, mock_monitor, mock_collect_logs): # pylint: disable=unused-argument + def test_restart_env_thread_if_not_alive(self, mock_env, *args): # pylint: disable=unused-argument mock_env_thread = self._setup_mock_thread_and_start_test_run(mock_env, is_alive=False, invocations=1) self.assertEqual(1, mock_env.call_count) self.assertEqual(1, mock_env_thread.run.call_count) self.assertEqual(1, mock_env_thread.is_alive.call_count) self.assertEqual(1, mock_env_thread.start.call_count) - def test_restart_monitor_thread(self, mock_env, mock_monitor, mock_collect_logs): # pylint: disable=unused-argument + def test_restart_monitor_thread(self, _, mock_monitor, *args): # pylint: disable=unused-argument mock_monitor_thread = self._setup_mock_thread_and_start_test_run(mock_monitor, is_alive=False, invocations=0) self.assertEqual(True, mock_monitor.called) self.assertEqual(True, mock_monitor_thread.run.called) self.assertEqual(True, mock_monitor_thread.is_alive.called) self.assertEqual(True, mock_monitor_thread.start.called) - def test_restart_env_thread(self, mock_env, mock_monitor, mock_collect_logs): # pylint: disable=unused-argument + def test_restart_env_thread(self, mock_env, *args): # pylint: disable=unused-argument mock_env_thread = self._setup_mock_thread_and_start_test_run(mock_env, is_alive=False, invocations=0) self.assertEqual(True, mock_env.called) self.assertEqual(True, mock_env_thread.run.called) From cc3e23acefe28fc82523f4e6aed1fb3491f5e729 Mon Sep 17 00:00:00 2001 From: larohra Date: Tue, 13 Oct 2020 11:45:33 -0700 Subject: [PATCH 44/63] Fixed linter errors --- azurelinuxagent/common/event.py | 7 ++--- 
azurelinuxagent/common/protocol/wire.py | 3 +-- azurelinuxagent/ga/telemetry_service.py | 3 ++- tests/ga/test_telemetry_service.py | 3 +-- tests/ga/test_update.py | 35 +++++++++++-------------- 5 files changed, 23 insertions(+), 28 deletions(-) diff --git a/azurelinuxagent/common/event.py b/azurelinuxagent/common/event.py index b91cd2b2b4..61e857a2e8 100644 --- a/azurelinuxagent/common/event.py +++ b/azurelinuxagent/common/event.py @@ -632,7 +632,8 @@ def report_dropped_events_error(count, errors, operation_name, max_errors_to_rep message=err_msg_format.format(count, max_errors_to_report, ', '.join(errors)), is_success=False) - def collect_events(self, enqueue_event): + # too-many-locals Disabled: The number of local variables is OK + def collect_events(self, enqueue_event): # pylint: disable=too-many-locals """ Retuns a list of events that need to be sent to the telemetry pipeline and deletes the corresponding files from the events directory. @@ -652,8 +653,8 @@ def collect_events(self, enqueue_event): try: logger.verbose("Processing event file: {0}", event_file_path) - with open(event_file_path, "rb") as fd: # pylint: disable=C0103 - event_data = fd.read().decode("utf-8") + with open(event_file_path, "rb") as event_fd: + event_data = event_fd.read().decode("utf-8") event = parse_event(event_data) diff --git a/azurelinuxagent/common/protocol/wire.py b/azurelinuxagent/common/protocol/wire.py index 05c347cf44..61a29f3e32 100644 --- a/azurelinuxagent/common/protocol/wire.py +++ b/azurelinuxagent/common/protocol/wire.py @@ -30,8 +30,7 @@ import azurelinuxagent.common.logger as logger import azurelinuxagent.common.utils.textutil as textutil from azurelinuxagent.common.datacontract import validate_param -from azurelinuxagent.common.event import add_event, add_periodic, WALAEventOperation, EventLogger, \ - report_event, EventDebugInfo +from azurelinuxagent.common.event import add_event, add_periodic, WALAEventOperation, report_event, EventDebugInfo from 
azurelinuxagent.common.exception import ProtocolNotFoundError, \ ResourceGoneError, ExtensionDownloadError, InvalidContainerError, ProtocolError, HttpError from azurelinuxagent.common.future import httpclient, bytebuffer, ustr diff --git a/azurelinuxagent/ga/telemetry_service.py b/azurelinuxagent/ga/telemetry_service.py index 29319edf6e..b24769c4b0 100644 --- a/azurelinuxagent/ga/telemetry_service.py +++ b/azurelinuxagent/ga/telemetry_service.py @@ -95,7 +95,8 @@ def enqueue_event(self, event): try: self._queue.put(event, timeout=TelemetryServiceHandler._MAX_TIMEOUT) except Full as error: - raise ServiceStoppedError("Queue full, stopping any more enqueuing until the next run. {0}", ustr(error)) + raise ServiceStoppedError( + "Queue full, stopping any more enqueuing until the next run. {0}".format(ustr(error))) # Set the event if any enqueue happens (even if already set) to trigger sending those events self._should_process_events.set() diff --git a/tests/ga/test_telemetry_service.py b/tests/ga/test_telemetry_service.py index 2289669925..547ae469d9 100644 --- a/tests/ga/test_telemetry_service.py +++ b/tests/ga/test_telemetry_service.py @@ -195,8 +195,7 @@ def test_it_should_honor_batch_time_limits_before_sending_telemetry(self): for test_event in events: telemetry_handler.enqueue_event(test_event) - while len(telemetry_handler.event_calls) == 0 and \ - (test_start_time + timedelta(seconds=1)) > datetime.now(): + while not telemetry_handler.event_calls and (test_start_time + timedelta(seconds=1)) > datetime.now(): # Wait for event calls to be made, wait a max of 1 secs orig_sleep(0.1) diff --git a/tests/ga/test_update.py b/tests/ga/test_update.py index 3302077dac..c6e4b30a83 100644 --- a/tests/ga/test_update.py +++ b/tests/ga/test_update.py @@ -1233,7 +1233,8 @@ def test_run_latest_creates_only_one_signal_handler(self, mock_signal): self._test_run_latest() self.assertEqual(0, mock_signal.call_count) - def _test_run(self, invocations=1, calls=[call.run()], 
enable_updates=False, sleep_interval=(6,)): # pylint: disable=dangerous-default-value + # too-many-locals Disabled: The number of local variables is OK + def _test_run(self, invocations=1, calls=[call.run()], enable_updates=False, sleep_interval=(6,)): # pylint: disable=dangerous-default-value,too-many-locals conf.get_autoupdate_enabled = Mock(return_value=enable_updates) # Note: @@ -1261,7 +1262,7 @@ def iterator(*args, **kwargs): # pylint: disable=useless-return,unused-argument with patch('azurelinuxagent.ga.update.get_telemetry_collector_handler') as mock_event_collector: with patch('azurelinuxagent.ga.update.initialize_event_logger_vminfo_common_parameters'): with patch('azurelinuxagent.ga.update.is_log_collection_allowed', return_value=True): - with patch('time.sleep', side_effect=iterator) as mock_sleep: # pylint: disable=redefined-outer-name + with patch('time.sleep', side_effect=iterator) as sleep_mock: with patch('sys.exit') as mock_exit: if isinstance(os.getppid, MagicMock): self.update_handler.run() @@ -1273,9 +1274,9 @@ def iterator(*args, **kwargs): # pylint: disable=useless-return,unused-argument self.assertEqual(mock_handler.return_value.method_calls, calls) self.assertEqual(1, mock_ra_handler.call_count) self.assertEqual(mock_ra_handler.return_value.method_calls, calls) - self.assertEqual(invocations, mock_sleep.call_count) + self.assertEqual(invocations, sleep_mock.call_count) if invocations > 0: - self.assertEqual(sleep_interval, mock_sleep.call_args[0]) + self.assertEqual(sleep_interval, sleep_mock.call_args[0]) self.assertEqual(1, mock_monitor.call_count) self.assertEqual(1, mock_env.call_count) self.assertEqual(1, mock_collect_logs.call_count) @@ -1658,7 +1659,8 @@ def _setup_mock_thread_and_start_test_run(self, mock_thread, is_alive=True, invo self._test_run(invocations=invocations) return thread - def test_start_threads(self, mock_env, mock_monitor, mock_collect_logs, mock_telemetry_service, mock_telemetry_collector): + # too-many-arguments 
Disabled: The number of arguments maps to the number of threads + def test_start_threads(self, mock_env, mock_monitor, mock_collect_logs, mock_telemetry_service, mock_telemetry_collector): # pylint: disable=too-many-arguments self.assertTrue(self.update_handler.running) def _get_mock_thread(): @@ -1666,23 +1668,16 @@ def _get_mock_thread(): thread.run = MagicMock() return thread - mock_monitor.return_value = _get_mock_thread() - mock_env.return_value = _get_mock_thread() - mock_collect_logs.return_value = _get_mock_thread() - mock_telemetry_service.return_value = _get_mock_thread() - mock_telemetry_collector.return_value = _get_mock_thread() + all_threads = [mock_telemetry_service, mock_telemetry_collector, mock_env, mock_monitor, mock_collect_logs] + + for thread in all_threads: + thread.return_value = _get_mock_thread() self._test_run(invocations=0) - self.assertEqual(1, mock_monitor.call_count) - self.assertEqual(1, mock_monitor().run.call_count) - self.assertEqual(1, mock_env.call_count) - self.assertEqual(1, mock_env().run.call_count) - self.assertEqual(1, mock_collect_logs.call_count) - self.assertEqual(1, mock_collect_logs().run.call_count) - self.assertEqual(1, mock_telemetry_collector.call_count) - self.assertEqual(1, mock_telemetry_collector().run.call_count) - self.assertEqual(1, mock_telemetry_service.call_count) - self.assertEqual(1, mock_telemetry_service().run.call_count) + + for thread in all_threads: + self.assertEqual(1, thread.call_count) + self.assertEqual(1, thread().run.call_count) def test_check_if_monitor_thread_is_alive(self, _, mock_monitor, *args): # pylint: disable=unused-argument mock_monitor_thread = self._setup_mock_thread_and_start_test_run(mock_monitor, is_alive=True, invocations=0) From a6373bd8b7ae611f6f1a5f53648fdcd4e06f11e0 Mon Sep 17 00:00:00 2001 From: larohra Date: Tue, 13 Oct 2020 12:39:46 -0700 Subject: [PATCH 45/63] Change thread name --- azurelinuxagent/ga/collect_telemetry_events.py | 2 +- 1 file changed, 1 insertion(+), 
1 deletion(-) diff --git a/azurelinuxagent/ga/collect_telemetry_events.py b/azurelinuxagent/ga/collect_telemetry_events.py index c57c65eb7a..5ffe52804a 100644 --- a/azurelinuxagent/ga/collect_telemetry_events.py +++ b/azurelinuxagent/ga/collect_telemetry_events.py @@ -406,7 +406,7 @@ class CollectTelemetryEventsHandler(ThreadHandlerInterface): Kusto for advanced debuggability. """ - _THREAD_NAME = "ExtensionTelemetryHandler" + _THREAD_NAME = "TelemetryEventsCollector" def __init__(self, telemetry_service_handler): self.should_run = True From 560d974cb51c247bda50039e05113413a8f9806d Mon Sep 17 00:00:00 2001 From: larohra Date: Tue, 13 Oct 2020 12:53:38 -0700 Subject: [PATCH 46/63] Cleaning up before starting the flaky test --- tests/ga/test_update.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/tests/ga/test_update.py b/tests/ga/test_update.py index c6e4b30a83..0fe2153b48 100644 --- a/tests/ga/test_update.py +++ b/tests/ga/test_update.py @@ -1576,6 +1576,10 @@ def test_it_should_recreate_handler_env_on_service_startup(self): @contextlib.contextmanager def _setup_test_for_ext_event_dirs_retention(self): + # This test class creates some files in the lib dir on setup, since we're using a new temp dir for our test, + # making sure the dir created by TestUpdate.setUp() is always clean for this test. 
+ shutil.rmtree(conf.get_lib_dir(), ignore_errors=True) + temp_ext_log_dir = tempfile.mkdtemp() temp_lib_dir = tempfile.mkdtemp() try: From 457cba42d9bed008b3cb94fa57f8081871cbd71f Mon Sep 17 00:00:00 2001 From: larohra Date: Tue, 13 Oct 2020 16:17:36 -0700 Subject: [PATCH 47/63] Removed unnecessary logs and added a verbose log --- azurelinuxagent/common/protocol/wire.py | 1 - azurelinuxagent/ga/telemetry_service.py | 2 ++ 2 files changed, 2 insertions(+), 1 deletion(-) diff --git a/azurelinuxagent/common/protocol/wire.py b/azurelinuxagent/common/protocol/wire.py index 61a29f3e32..ef38683623 100644 --- a/azurelinuxagent/common/protocol/wire.py +++ b/azurelinuxagent/common/protocol/wire.py @@ -1122,7 +1122,6 @@ def _send_event(provider_id, debug_info): except Exception as error: logger.warn("Unexpected error when generating Events: {0}, {1}", ustr(error), traceback.format_exc()) - logger.verbose("done reporting for Event {0}".format(event)) # Send out all events left in buffer. for provider_id in list(buf.keys()): diff --git a/azurelinuxagent/ga/telemetry_service.py b/azurelinuxagent/ga/telemetry_service.py index b24769c4b0..02d61ecd63 100644 --- a/azurelinuxagent/ga/telemetry_service.py +++ b/azurelinuxagent/ga/telemetry_service.py @@ -135,6 +135,8 @@ def _send_events_in_queue(self): # To promote batching, we either wait for atleast _MIN_QUEUE_LIMIT events or _MIN_BATCH_WAIT_TIME secs # before sending out the first request to wireserver. # If the thread is requested to stop midway, we skip batching and send whatever we have in the queue. + logger.verbose("Waiting for events to batch. 
Queue size: {0}, First request time: {1}", + self._queue.qsize(), start_time) time.sleep(1) self._protocol.report_event(self._get_events_in_queue) From 7d59ccae24386fb2ede75559b23bcfcbf5adb4d3 Mon Sep 17 00:00:00 2001 From: larohra Date: Tue, 13 Oct 2020 16:52:20 -0700 Subject: [PATCH 48/63] Improved log message --- azurelinuxagent/ga/telemetry_service.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/azurelinuxagent/ga/telemetry_service.py b/azurelinuxagent/ga/telemetry_service.py index 02d61ecd63..31b03097d7 100644 --- a/azurelinuxagent/ga/telemetry_service.py +++ b/azurelinuxagent/ga/telemetry_service.py @@ -135,8 +135,8 @@ def _send_events_in_queue(self): # To promote batching, we either wait for atleast _MIN_QUEUE_LIMIT events or _MIN_BATCH_WAIT_TIME secs # before sending out the first request to wireserver. # If the thread is requested to stop midway, we skip batching and send whatever we have in the queue. - logger.verbose("Waiting for events to batch. Queue size: {0}, First request time: {1}", - self._queue.qsize(), start_time) + logger.verbose("Waiting for events to batch. 
Queue size: {0}, Time elapsed: {1} secs", + self._queue.qsize(), (datetime.datetime.utcnow() - start_time).seconds) time.sleep(1) self._protocol.report_event(self._get_events_in_queue) From c70e3d459b41dfb7154837ff4e6c4ab59f76ac04 Mon Sep 17 00:00:00 2001 From: larohra Date: Wed, 14 Oct 2020 12:14:45 -0700 Subject: [PATCH 49/63] Code cleanup --- azurelinuxagent/common/protocol/wire.py | 3 +- azurelinuxagent/common/telemetryevent.py | 37 ------------------------ 2 files changed, 1 insertion(+), 39 deletions(-) diff --git a/azurelinuxagent/common/protocol/wire.py b/azurelinuxagent/common/protocol/wire.py index ef38683623..02ec897fee 100644 --- a/azurelinuxagent/common/protocol/wire.py +++ b/azurelinuxagent/common/protocol/wire.py @@ -16,7 +16,6 @@ # # Requires Python 2.6+ and Openssl 1.0+ -import datetime import json import os import random @@ -24,7 +23,7 @@ import traceback import xml.sax.saxutils as saxutils from collections import defaultdict -from datetime import datetime # pylint: disable=ungrouped-imports +from datetime import datetime import azurelinuxagent.common.conf as conf import azurelinuxagent.common.logger as logger diff --git a/azurelinuxagent/common/telemetryevent.py b/azurelinuxagent/common/telemetryevent.py index 13155d2f31..544820f76d 100644 --- a/azurelinuxagent/common/telemetryevent.py +++ b/azurelinuxagent/common/telemetryevent.py @@ -82,53 +82,17 @@ def __eq__(self, other): return isinstance(other, TelemetryEventParam) and other.name == self.name and other.value == self.value -# too-few-public-methods Disabled: This class is used as an Enum -# class TelemetryEventPriorities(object): # pylint: disable=R0903 -# """ -# Class defining the priorities for telemetry events. Lower the number, higher the priority -# -# Note: 0 is reserved for a feature like QuickLog in the Windows Agent (i.e. 
the ability to send out telemetry -# instantly rather than waiting for a minute for the monitor thread to pick up the events) -# """ -# AGENT_EVENT = 1 # Agent events always get the highest priority -# EXTENSION_EVENT_NEW_PIPELINE = 2 # Prioritize extensions using the new dedicated pipeline over extensions hijacking the agent pipeline -# EXTENSION_EVENT_OLD_PIPELINE = 3 - - class TelemetryEvent(DataContract): def __init__(self, eventId=None, providerId=None): self.eventId = eventId # pylint: disable=C0103 self.providerId = providerId # pylint: disable=C0103 self.parameters = DataContractList(TelemetryEventParam) self.file_type = "" - # self._priority = priority # Checking if the particular param name is in the TelemetryEvent. def __contains__(self, param_name): return param_name in [param.name for param in self.parameters] - # def __le__(self, other): - # return self.priority <= other.priority - # - # def __ge__(self, other): - # return self.priority >= other.priority - # - # def __eq__(self, other): - # return self.priority == other.priority - # - # def __lt__(self, other): - # return self.priority < other.priority - # - # def __gt__(self, other): - # return self.priority > other.priority - # - # def __ne__(self, other): - # return self.priority != other.priority - - # @property - # def priority(self): - # return self._priority - def is_extension_event(self): # Events originating from the agent have "WALinuxAgent" as the Name parameter, or they don't have a Name # parameter, in the case of log and metric events. 
So, in case the Name parameter exists and it is not @@ -136,7 +100,6 @@ def is_extension_event(self): for param in self.parameters: if param.name == GuestAgentExtensionEventsSchema.Name: if param.value != AGENT_NAME: - # self._priority = TelemetryEventPriorities.EXTENSION_EVENT_OLD_PIPELINE return True return False From 589e6695616e81d6328f6e3cf3aa1352ab809d6d Mon Sep 17 00:00:00 2001 From: larohra Date: Thu, 15 Oct 2020 15:52:55 -0700 Subject: [PATCH 50/63] addressed PR comments --- azurelinuxagent/common/event.py | 58 +++++++++---------- azurelinuxagent/common/exception.py | 2 +- azurelinuxagent/common/protocol/wire.py | 4 +- .../ga/collect_telemetry_events.py | 4 +- azurelinuxagent/ga/telemetry_service.py | 8 +-- tests/common/test_event.py | 2 +- tests/ga/test_telemetry_service.py | 2 +- 7 files changed, 39 insertions(+), 41 deletions(-) diff --git a/azurelinuxagent/common/event.py b/azurelinuxagent/common/event.py index 61e857a2e8..8fe20b3212 100644 --- a/azurelinuxagent/common/event.py +++ b/azurelinuxagent/common/event.py @@ -281,23 +281,29 @@ def _log_event(name, op, message, duration, is_success=True): # pylint: disable= logger.info(_EVENT_MSG, name, op, message, duration) -class EventDebugInfo(object): - MAX_ERRORS = 5 +class CollectOrReportEventDebugInfo(object): + """ + This class is used for capturing and reporting debug info that is captured during event collection and + reporting to wireserver. + It captures the count of unicode errors and any unexpected errors and also a subset of errors with stacks to help + with debugging any potential issues. 
+ """ + __MAX_ERRORS_TO_REPORT = 5 OP_REPORT = "Report" OP_COLLECT = "Collect" def __init__(self, operation=OP_REPORT): - self.unicode_error_count = 0 - self.unicode_errors = set() - self.op_error_count = 0 - self.op_errors = set() + self.__unicode_error_count = 0 + self.__unicode_errors = set() + self.__op_error_count = 0 + self.__op_errors = set() if operation == self.OP_REPORT: - self.unicode_error_event = WALAEventOperation.ReportEventUnicodeErrors - self.op_errors_event = WALAEventOperation.ReportEventErrors + self.__unicode_error_event = WALAEventOperation.ReportEventUnicodeErrors + self.__op_errors_event = WALAEventOperation.ReportEventErrors elif operation == self.OP_COLLECT: - self.unicode_error_event = WALAEventOperation.CollectEventUnicodeErrors - self.op_errors_event = WALAEventOperation.CollectEventErrors + self.__unicode_error_event = WALAEventOperation.CollectEventUnicodeErrors + self.__op_errors_event = WALAEventOperation.CollectEventErrors def report_debug_info(self): @@ -305,25 +311,25 @@ def report_dropped_events_error(count, errors, operation_name): err_msg_format = "DroppedEventsCount: {0}\nReasons (first {1} errors): {2}" if count > 0: add_event(op=operation_name, - message=err_msg_format.format(count, EventDebugInfo.MAX_ERRORS, ', '.join(errors)), + message=err_msg_format.format(count, CollectOrReportEventDebugInfo.__MAX_ERRORS_TO_REPORT, ', '.join(errors)), is_success=False) - report_dropped_events_error(self.op_error_count, self.op_errors, self.op_errors_event) - report_dropped_events_error(self.unicode_error_count, self.unicode_errors, self.unicode_error_event) + report_dropped_events_error(self.__op_error_count, self.__op_errors, self.__op_errors_event) + report_dropped_events_error(self.__unicode_error_count, self.__unicode_errors, self.__unicode_error_event) @staticmethod def _update_errors_and_get_count(error_count, errors, error): error_count += 1 - if len(errors) < EventDebugInfo.MAX_ERRORS: + if len(errors) < 
CollectOrReportEventDebugInfo.__MAX_ERRORS_TO_REPORT: errors.add("{0}: {1}".format(ustr(error), traceback.format_exc())) return error_count def update_unicode_error(self, unicode_err): - self.unicode_error_count = self._update_errors_and_get_count(self.unicode_error_count, self.unicode_errors, - unicode_err) + self.__unicode_error_count = self._update_errors_and_get_count(self.__unicode_error_count, self.__unicode_errors, + unicode_err) def update_op_error(self, op_err): - self.op_error_count = self._update_errors_and_get_count(self.op_error_count, self.op_errors, op_err) + self.__op_error_count = self._update_errors_and_get_count(self.__op_error_count, self.__op_errors, op_err) class EventLogger(object): @@ -624,23 +630,15 @@ def _trim_extension_event_parameters(event): event.parameters = trimmed_params - @staticmethod - def report_dropped_events_error(count, errors, operation_name, max_errors_to_report): - err_msg_format = "DroppedEventsCount: {0}\nReasons (first {1} errors): {2}" - if count > 0: - add_event(op=operation_name, - message=err_msg_format.format(count, max_errors_to_report, ', '.join(errors)), - is_success=False) - # too-many-locals Disabled: The number of local variables is OK - def collect_events(self, enqueue_event): # pylint: disable=too-many-locals + def process_events(self, process_event_operation): # pylint: disable=too-many-locals """ Retuns a list of events that need to be sent to the telemetry pipeline and deletes the corresponding files from the events directory. 
""" event_directory_full_path = os.path.join(conf.get_lib_dir(), EVENTS_DIRECTORY) event_files = os.listdir(event_directory_full_path) - debug_info = EventDebugInfo(operation=EventDebugInfo.OP_COLLECT) + debug_info = CollectOrReportEventDebugInfo(operation=CollectOrReportEventDebugInfo.OP_COLLECT) for event_file in event_files: try: @@ -673,7 +671,7 @@ def collect_events(self, enqueue_event): # pylint: disable=too-many-locals else: self._update_legacy_agent_event(event, event_file_creation_time) - enqueue_event(event) + process_event_operation(event) finally: os.remove(event_file_path) except ServiceStoppedError as stopped_error: @@ -811,8 +809,8 @@ def add_periodic(delta, name, op=WALAEventOperation.Unknown, is_success=True, du message=message, log_event=log_event, force=force) -def collect_events(enqueue_event, reporter=__event_logger__): - return reporter.collect_events(enqueue_event) +def process_events(process_event_operation, reporter=__event_logger__): + return reporter.process_events(process_event_operation) def mark_event_status(name, version, op, status): # pylint: disable=C0103 diff --git a/azurelinuxagent/common/exception.py b/azurelinuxagent/common/exception.py index e8ab977c1d..a40dc3e6d4 100644 --- a/azurelinuxagent/common/exception.py +++ b/azurelinuxagent/common/exception.py @@ -229,7 +229,7 @@ def __init__(self, msg=None, inner=None): class ServiceStoppedError(AgentError): """ - Error thrown when trying to access a Servive which is stopped + Error thrown when trying to access a Service which is stopped """ def __init__(self, msg=None, inner=None): super(ServiceStoppedError, self).__init__(msg, inner) diff --git a/azurelinuxagent/common/protocol/wire.py b/azurelinuxagent/common/protocol/wire.py index 02ec897fee..65dea16ef1 100644 --- a/azurelinuxagent/common/protocol/wire.py +++ b/azurelinuxagent/common/protocol/wire.py @@ -29,7 +29,7 @@ import azurelinuxagent.common.logger as logger import azurelinuxagent.common.utils.textutil as textutil from 
azurelinuxagent.common.datacontract import validate_param -from azurelinuxagent.common.event import add_event, add_periodic, WALAEventOperation, report_event, EventDebugInfo +from azurelinuxagent.common.event import add_event, add_periodic, WALAEventOperation, report_event, CollectOrReportEventDebugInfo from azurelinuxagent.common.exception import ProtocolNotFoundError, \ ResourceGoneError, ExtensionDownloadError, InvalidContainerError, ProtocolError, HttpError from azurelinuxagent.common.future import httpclient, bytebuffer, ustr @@ -1079,7 +1079,7 @@ def send_encoded_event(self, provider_id, event_str, encoding='utf-8'): def report_event(self, get_events_in_queue): buf = {} - debug_info = EventDebugInfo(operation=EventDebugInfo.OP_REPORT) + debug_info = CollectOrReportEventDebugInfo(operation=CollectOrReportEventDebugInfo.OP_REPORT) events_per_request = defaultdict(int) def _send_event(provider_id, debug_info): diff --git a/azurelinuxagent/ga/collect_telemetry_events.py b/azurelinuxagent/ga/collect_telemetry_events.py index 5ffe52804a..53efe040dd 100644 --- a/azurelinuxagent/ga/collect_telemetry_events.py +++ b/azurelinuxagent/ga/collect_telemetry_events.py @@ -28,7 +28,7 @@ import azurelinuxagent.common.logger as logger from azurelinuxagent.common import conf from azurelinuxagent.common.event import EVENTS_DIRECTORY, TELEMETRY_LOG_EVENT_ID, \ - TELEMETRY_LOG_PROVIDER_ID, add_event, WALAEventOperation, add_log_event, get_event_logger, collect_events + TELEMETRY_LOG_PROVIDER_ID, add_event, WALAEventOperation, add_log_event, get_event_logger, process_events from azurelinuxagent.common.exception import InvalidExtensionEventError, ServiceStoppedError from azurelinuxagent.common.future import ustr from azurelinuxagent.common.telemetryevent import TelemetryEvent, TelemetryEventParam, \ @@ -394,7 +394,7 @@ def _collect_and_enqueue_events(self): self._telemetry_service_handler.get_thread_name())) return - collect_events(self._telemetry_service_handler.enqueue_event) + 
process_events(self._telemetry_service_handler.enqueue_event)
         except Exception as error:
             err_msg = "Failure in collecting Agent events: {0}".format(ustr(error))
             add_event(op=WALAEventOperation.UnhandledError, message=err_msg, is_success=False)
diff --git a/azurelinuxagent/ga/telemetry_service.py b/azurelinuxagent/ga/telemetry_service.py
index 31b03097d7..5f5d139c46 100644
--- a/azurelinuxagent/ga/telemetry_service.py
+++ b/azurelinuxagent/ga/telemetry_service.py
@@ -40,7 +40,7 @@ class TelemetryServiceHandler(ThreadHandlerInterface):
 
     _THREAD_NAME = "TelemetryServiceHandler"
     _MAX_TIMEOUT = datetime.timedelta(seconds=5).seconds
-    _MIN_QUEUE_LIMIT = 30
+    _MIN_EVENTS_TO_BATCH = 30
     _MIN_BATCH_WAIT_TIME = datetime.timedelta(seconds=5)
 
 
@@ -49,7 +49,7 @@ def __init__(self, protocol_util):
         self.should_run = True
         self._thread = None
         self._should_process_events = threading.Event()
-        self._queue = Queue() #PriorityQueue()
+        self._queue = Queue()
 
     @staticmethod
     def get_thread_name():
@@ -130,9 +130,9 @@ def _send_events_in_queue(self):
         # Process everything in Queue
         if not self._queue.empty():
             start_time = datetime.datetime.utcnow()
-            while not self.stopped() and self._queue.qsize() < self._MIN_QUEUE_LIMIT and (
+            while not self.stopped() and self._queue.qsize() < self._MIN_EVENTS_TO_BATCH and (
                     start_time + self._MIN_BATCH_WAIT_TIME) > datetime.datetime.utcnow():
-                # To promote batching, we either wait for atleast _MIN_QUEUE_LIMIT events or _MIN_BATCH_WAIT_TIME secs
+                # To promote batching, we either wait for at least _MIN_EVENTS_TO_BATCH events or _MIN_BATCH_WAIT_TIME secs
                 # before sending out the first request to wireserver.
                 # If the thread is requested to stop midway, we skip batching and send whatever we have in the queue.
                 logger.verbose("Waiting for events to batch. 
Queue size: {0}, Time elapsed: {1} secs", diff --git a/tests/common/test_event.py b/tests/common/test_event.py index 76646d1a89..aa82e9de56 100644 --- a/tests/common/test_event.py +++ b/tests/common/test_event.py @@ -94,7 +94,7 @@ def _yield_events(): @staticmethod def _collect_events(): event_list = [] - event.collect_events(event_list.append) + event.process_events(event_list.append) return event_list @staticmethod diff --git a/tests/ga/test_telemetry_service.py b/tests/ga/test_telemetry_service.py index 547ae469d9..501415f066 100644 --- a/tests/ga/test_telemetry_service.py +++ b/tests/ga/test_telemetry_service.py @@ -82,7 +82,7 @@ def http_post_handler(url, body, **__): protocol_util.get_protocol = Mock(return_value=protocol) telemetry_service_handler = get_telemetry_service_handler(protocol_util) telemetry_service_handler.event_calls = [] - with patch("azurelinuxagent.ga.telemetry_service.TelemetryServiceHandler._MIN_QUEUE_LIMIT", + with patch("azurelinuxagent.ga.telemetry_service.TelemetryServiceHandler._MIN_EVENTS_TO_BATCH", batching_queue_limit): with patch("azurelinuxagent.ga.telemetry_service.TelemetryServiceHandler._MAX_TIMEOUT", timeout): From af0465c62a6c8572d71fc38b5adda43d8c129829 Mon Sep 17 00:00:00 2001 From: larohra Date: Fri, 16 Oct 2020 13:24:17 -0700 Subject: [PATCH 51/63] PR comments pt2 --- azurelinuxagent/common/protocol/wire.py | 18 +++++++++--------- azurelinuxagent/common/telemetryevent.py | 3 +-- 2 files changed, 10 insertions(+), 11 deletions(-) diff --git a/azurelinuxagent/common/protocol/wire.py b/azurelinuxagent/common/protocol/wire.py index 65dea16ef1..fe4ac02e95 100644 --- a/azurelinuxagent/common/protocol/wire.py +++ b/azurelinuxagent/common/protocol/wire.py @@ -188,8 +188,8 @@ def report_ext_status(self, ext_handler_name, ext_name, ext_status): # pylint: d validate_param("ext_status", ext_status, ExtensionStatus) self.client.status_blob.set_ext_status(ext_handler_name, ext_status) - def report_event(self, get_events_in_queue): - 
self.client.report_event(get_events_in_queue) + def report_event(self, events_iterator): + self.client.report_event(events_iterator) def upload_logs(self, logs): self.client.upload_logs(logs) @@ -1077,10 +1077,10 @@ def send_encoded_event(self, provider_id, event_str, encoding='utf-8'): raise ProtocolError( "Failed to send events:{0}".format(resp.status)) - def report_event(self, get_events_in_queue): + def report_event(self, events_iterator): buf = {} debug_info = CollectOrReportEventDebugInfo(operation=CollectOrReportEventDebugInfo.OP_REPORT) - events_per_request = defaultdict(int) + events_per_provider = defaultdict(int) def _send_event(provider_id, debug_info): try: @@ -1091,7 +1091,7 @@ def _send_event(provider_id, debug_info): debug_info.update_op_error(error) # Group events by providerId - for event in get_events_in_queue(): + for event in events_iterator(): try: if event.providerId not in buf: buf[event.providerId] = b'' @@ -1110,14 +1110,14 @@ def _send_event(provider_id, debug_info): # If buffer is full, send out the events in buffer and reset buffer if len(buf[event.providerId] + event_str) >= MAX_EVENT_BUFFER_SIZE: - logger.verbose("No of events this request = {0}".format(events_per_request[event.providerId])) + logger.verbose("No of events this request = {0}".format(events_per_provider[event.providerId])) _send_event(event.providerId, debug_info) buf[event.providerId] = b'' - events_per_request[event.providerId] = 0 + events_per_provider[event.providerId] = 0 # Add encoded events to the buffer buf[event.providerId] = buf[event.providerId] + event_str - events_per_request[event.providerId] += 1 + events_per_provider[event.providerId] += 1 except Exception as error: logger.warn("Unexpected error when generating Events: {0}, {1}", ustr(error), traceback.format_exc()) @@ -1125,7 +1125,7 @@ def _send_event(provider_id, debug_info): # Send out all events left in buffer. 
for provider_id in list(buf.keys()): if buf[provider_id]: - logger.verbose("No of events this request = {0}".format(events_per_request[provider_id])) + logger.verbose("No of events this request = {0}".format(events_per_provider[provider_id])) _send_event(provider_id, debug_info) debug_info.report_debug_info() diff --git a/azurelinuxagent/common/telemetryevent.py b/azurelinuxagent/common/telemetryevent.py index 544820f76d..e78d2cd5d3 100644 --- a/azurelinuxagent/common/telemetryevent.py +++ b/azurelinuxagent/common/telemetryevent.py @@ -99,8 +99,7 @@ def is_extension_event(self): # "WALinuxAgent", it is an extension event. for param in self.parameters: if param.name == GuestAgentExtensionEventsSchema.Name: - if param.value != AGENT_NAME: - return True + return param.value != AGENT_NAME return False def get_version(self): From a7e2f6a9a0ae00c6774fdb3953cff218bc630ad3 Mon Sep 17 00:00:00 2001 From: larohra Date: Fri, 16 Oct 2020 14:24:01 -0700 Subject: [PATCH 52/63] Fix for flaky test --- tests/ga/test_update.py | 37 ++++++++++++++++--------------------- 1 file changed, 16 insertions(+), 21 deletions(-) diff --git a/tests/ga/test_update.py b/tests/ga/test_update.py index 0fe2153b48..2f1d26bd9c 100644 --- a/tests/ga/test_update.py +++ b/tests/ga/test_update.py @@ -1576,29 +1576,24 @@ def test_it_should_recreate_handler_env_on_service_startup(self): @contextlib.contextmanager def _setup_test_for_ext_event_dirs_retention(self): - # This test class creates some files in the lib dir on setup, since we're using a new temp dir for our test, - # making sure the dir created by TestUpdate.setUp() is always clean for this test. 
- shutil.rmtree(conf.get_lib_dir(), ignore_errors=True) - - temp_ext_log_dir = tempfile.mkdtemp() - temp_lib_dir = tempfile.mkdtemp() try: - with patch.object(conf, "get_lib_dir", return_value=temp_lib_dir): - with patch.object(conf, "get_ext_log_dir", return_value=temp_ext_log_dir): - with self._get_update_handler(test_data=DATA_FILE_MULTIPLE_EXT) as (update_handler, protocol): - with patch('azurelinuxagent.ga.exthandlers._ENABLE_EXTENSION_TELEMETRY_PIPELINE', True): - update_handler.run(debug=True) - expected_events_dirs = glob.glob(os.path.join(conf.get_ext_log_dir(), "*", EVENTS_DIRECTORY)) - no_of_extensions = protocol.mock_wire_data.get_no_of_plugins_in_extension_config() - # Ensure extensions installed and events directory created - self.assertEqual(len(expected_events_dirs), no_of_extensions, "Extension events directories dont match") - for ext_dir in expected_events_dirs: - self.assertTrue(os.path.exists(ext_dir), "Extension directory {0} not created!".format(ext_dir)) - - yield update_handler, expected_events_dirs + with self._get_update_handler(test_data=DATA_FILE_MULTIPLE_EXT) as (update_handler, protocol): + with patch('azurelinuxagent.ga.exthandlers._ENABLE_EXTENSION_TELEMETRY_PIPELINE', True): + update_handler.run(debug=True) + expected_events_dirs = glob.glob(os.path.join(conf.get_ext_log_dir(), "*", EVENTS_DIRECTORY)) + no_of_extensions = protocol.mock_wire_data.get_no_of_plugins_in_extension_config() + # Ensure extensions installed and events directory created + self.assertEqual(len(expected_events_dirs), no_of_extensions, "Extension events directories dont match") + for ext_dir in expected_events_dirs: + self.assertTrue(os.path.exists(ext_dir), "Extension directory {0} not created!".format(ext_dir)) + + yield update_handler, expected_events_dirs finally: - shutil.rmtree(temp_ext_log_dir, ignore_errors=True) - shutil.rmtree(temp_lib_dir, ignore_errors=True) + # The TestUpdate.setUp() initializes the self.tmp_dir to be used as a placeholder + # for 
everything (event logger, status logger, conf.get_lib_dir() and more). + # Since we add more data to the dir for this test, ensuring its completely clean before exiting the test. + shutil.rmtree(self.tmp_dir, ignore_errors=True) + self.tmp_dir = None def test_it_should_delete_extension_events_directory_if_extension_telemetry_pipeline_disabled(self): # Disable extension telemetry pipeline and ensure events directory got deleted From 6c82c1e658a46ac03ef6003c09a7151f55863776 Mon Sep 17 00:00:00 2001 From: larohra Date: Mon, 19 Oct 2020 15:02:21 -0700 Subject: [PATCH 53/63] Addressed PR comments --- azurelinuxagent/common/event.py | 103 ------------ .../common/utils/extensionprocessutil.py | 2 +- .../ga/collect_telemetry_events.py | 148 +++++++++++++++--- azurelinuxagent/ga/telemetry_service.py | 18 +-- azurelinuxagent/ga/update.py | 8 +- tests/common/test_event.py | 3 +- tests/ga/test_collect_telemetry_events.py | 10 +- tests/ga/test_telemetry_service.py | 39 ++--- tests/ga/test_update.py | 8 +- 9 files changed, 171 insertions(+), 168 deletions(-) diff --git a/azurelinuxagent/common/event.py b/azurelinuxagent/common/event.py index 8fe20b3212..a40471000a 100644 --- a/azurelinuxagent/common/event.py +++ b/azurelinuxagent/common/event.py @@ -603,105 +603,6 @@ def add_common_event_parameters(self, event, event_timestamp): event.parameters.extend(common_params) event.parameters.extend(self._common_parameters) - @staticmethod - def _trim_extension_event_parameters(event): - """ - This method is called for extension events before they are sent out. Per the agreement with extension - publishers, the parameters that belong to extensions and will be reported intact are Name, Version, Operation, - OperationSuccess, Message, and Duration. Since there is nothing preventing extensions to instantiate other - fields (which belong to the agent), we call this method to ensure the rest of the parameters are trimmed since - they will be replaced with values coming from the agent. 
- :param event: Extension event to trim. - :return: Trimmed extension event; containing only extension-specific parameters. - """ - params_to_keep = dict().fromkeys([ - GuestAgentExtensionEventsSchema.Name, - GuestAgentExtensionEventsSchema.Version, - GuestAgentExtensionEventsSchema.Operation, - GuestAgentExtensionEventsSchema.OperationSuccess, - GuestAgentExtensionEventsSchema.Message, - GuestAgentExtensionEventsSchema.Duration - ]) - trimmed_params = [] - - for param in event.parameters: - if param.name in params_to_keep: - trimmed_params.append(param) - - event.parameters = trimmed_params - - # too-many-locals Disabled: The number of local variables is OK - def process_events(self, process_event_operation): # pylint: disable=too-many-locals - """ - Retuns a list of events that need to be sent to the telemetry pipeline and deletes the corresponding files - from the events directory. - """ - event_directory_full_path = os.path.join(conf.get_lib_dir(), EVENTS_DIRECTORY) - event_files = os.listdir(event_directory_full_path) - debug_info = CollectOrReportEventDebugInfo(operation=CollectOrReportEventDebugInfo.OP_COLLECT) - - for event_file in event_files: - try: - match = EVENT_FILE_REGEX.search(event_file) - if match is None: - continue - - event_file_path = os.path.join(event_directory_full_path, event_file) - - try: - logger.verbose("Processing event file: {0}", event_file_path) - - with open(event_file_path, "rb") as event_fd: - event_data = event_fd.read().decode("utf-8") - - event = parse_event(event_data) - - # "legacy" events are events produced by previous versions of the agent (<= 2.2.46) and extensions; - # they do not include all the telemetry fields, so we add them here - is_legacy_event = match.group('agent_event') is None - - if is_legacy_event: - # We'll use the file creation time for the event's timestamp - event_file_creation_time_epoch = os.path.getmtime(event_file_path) - event_file_creation_time = 
datetime.fromtimestamp(event_file_creation_time_epoch) - - if event.is_extension_event(): - EventLogger._trim_extension_event_parameters(event) - self.add_common_event_parameters(event, event_file_creation_time) - else: - self._update_legacy_agent_event(event, event_file_creation_time) - - process_event_operation(event) - finally: - os.remove(event_file_path) - except ServiceStoppedError as stopped_error: - logger.error( - "Unable to enqueue events as service stopped: {0}, skipping events collection".format( - ustr(stopped_error))) - except UnicodeError as uni_err: - debug_info.update_unicode_error(uni_err) - except Exception as error: - debug_info.update_op_error(error) - - debug_info.report_debug_info() - - def _update_legacy_agent_event(self, event, event_creation_time): - # Ensure that if an agent event is missing a field from the schema defined since 2.2.47, the missing fields - # will be appended, ensuring the event schema is complete before the event is reported. - new_event = TelemetryEvent() - new_event.parameters = [] - self.add_common_event_parameters(new_event, event_creation_time) - - event_params = dict([(param.name, param.value) for param in event.parameters]) - new_event_params = dict([(param.name, param.value) for param in new_event.parameters]) - - missing_params = set(new_event_params.keys()).difference(set(event_params.keys())) - params_to_add = [] - for param_name in missing_params: - params_to_add.append(TelemetryEventParam(param_name, new_event_params[param_name])) - - event.parameters.extend(params_to_add) - __event_logger__ = EventLogger() @@ -809,10 +710,6 @@ def add_periodic(delta, name, op=WALAEventOperation.Unknown, is_success=True, du message=message, log_event=log_event, force=force) -def process_events(process_event_operation, reporter=__event_logger__): - return reporter.process_events(process_event_operation) - - def mark_event_status(name, version, op, status): # pylint: disable=C0103 if op in __event_status_operations__: 
__event_status__.mark_event_status(name, version, op, status)
diff --git a/azurelinuxagent/common/utils/extensionprocessutil.py b/azurelinuxagent/common/utils/extensionprocessutil.py
index 6aaffe7ba9..c85b145519 100644
--- a/azurelinuxagent/common/utils/extensionprocessutil.py
+++ b/azurelinuxagent/common/utils/extensionprocessutil.py
@@ -103,7 +103,7 @@ def format_stdout_stderr(stdout, stderr):
     """
     Format stdout and stderr's output to make it suitable in telemetry.
     The goal is to maximize the amount of output given the constraints
-    of telemetry.
+    of telemetry.
 
     For example, if there is more stderr output than stdout output give
     more buffer space to stderr.
diff --git a/azurelinuxagent/ga/collect_telemetry_events.py b/azurelinuxagent/ga/collect_telemetry_events.py
index 53efe040dd..d7e9d45427 100644
--- a/azurelinuxagent/ga/collect_telemetry_events.py
+++ b/azurelinuxagent/ga/collect_telemetry_events.py
@@ -21,25 +21,25 @@
 import os
 import re
 import threading
-from collections import defaultdict
-
 import traceback
+from collections import defaultdict
 
 import azurelinuxagent.common.logger as logger
 from azurelinuxagent.common import conf
 from azurelinuxagent.common.event import EVENTS_DIRECTORY, TELEMETRY_LOG_EVENT_ID, \
-    TELEMETRY_LOG_PROVIDER_ID, add_event, WALAEventOperation, add_log_event, get_event_logger, process_events
+    TELEMETRY_LOG_PROVIDER_ID, add_event, WALAEventOperation, add_log_event, get_event_logger, \
+    CollectOrReportEventDebugInfo, EVENT_FILE_REGEX, parse_event
 from azurelinuxagent.common.exception import InvalidExtensionEventError, ServiceStoppedError
 from azurelinuxagent.common.future import ustr
-from azurelinuxagent.common.telemetryevent import TelemetryEvent, TelemetryEventParam, \
-    GuestAgentGenericLogsSchema
 from azurelinuxagent.common.interfaces import ThreadHandlerInterface
+from azurelinuxagent.common.telemetryevent import TelemetryEvent, TelemetryEventParam, \
+    
GuestAgentGenericLogsSchema, GuestAgentExtensionEventsSchema from azurelinuxagent.ga.exthandlers import HANDLER_NAME_PATTERN, is_extension_telemetry_pipeline_enabled from azurelinuxagent.ga.periodic_operation import PeriodicOperation -def get_telemetry_collector_handler(telemetry_service_handler): - return CollectTelemetryEventsHandler(telemetry_service_handler) +def get_collect_telemetry_events_handler(send_telemetry_events_handler): + return CollectTelemetryEventsHandler(send_telemetry_events_handler) # too-few-public-methods Disabled: This class is used as an Enum @@ -56,7 +56,8 @@ class ExtensionEventSchema(object): # pylint: disable=R0903 EventTid = "EventTid" OperationId = "OperationId" -class ProcessExtensionTelemetry(PeriodicOperation): + +class ProcessExtensionEventsPeriodicOperation(PeriodicOperation): """ Periodic operation for collecting and sending extension telemetry events to Wireserver. """ @@ -74,10 +75,10 @@ class ProcessExtensionTelemetry(PeriodicOperation): not callable(getattr(ExtensionEventSchema, attr)) and not attr.startswith("__")] def __init__(self, telemetry_service_handler): - super(ProcessExtensionTelemetry, self).__init__( + super(ProcessExtensionEventsPeriodicOperation, self).__init__( name="collect_and_enqueue_extension_events", operation=self._collect_and_enqueue_extension_events, - period=ProcessExtensionTelemetry._EXTENSION_EVENT_COLLECTION_PERIOD) + period=ProcessExtensionEventsPeriodicOperation._EXTENSION_EVENT_COLLECTION_PERIOD) self._telemetry_service_handler = telemetry_service_handler @@ -288,7 +289,7 @@ def _parse_telemetry_event(self, handler_name, extension_unparsed_event, event_f event = TelemetryEvent(TELEMETRY_LOG_EVENT_ID, TELEMETRY_LOG_PROVIDER_ID) event.file_type = "json" - self.add_common_params_to_extension_event(event, event_file_time) + CollectTelemetryEventsHandler.add_common_params_to_telemetry_event(event, event_file_time) replace_or_add_params = { GuestAgentGenericLogsSchema.EventName: 
"{0}-{1}".format(handler_name, extension_event[ @@ -364,11 +365,6 @@ def _replace_or_add_param_in_event(event, replace_or_add_params): for param_name in replace_or_add_params: event.parameters.append(TelemetryEventParam(param_name, replace_or_add_params[param_name])) - @staticmethod - def add_common_params_to_extension_event(event, event_time): - reporter = get_event_logger() - reporter.add_common_event_parameters(event, event_time) - class CollectAndEnqueueEventsPeriodicOperation(PeriodicOperation): """ @@ -394,11 +390,114 @@ def _collect_and_enqueue_events(self): self._telemetry_service_handler.get_thread_name())) return - process_events(self._telemetry_service_handler.enqueue_event) + self.process_events(self._telemetry_service_handler.enqueue_event) except Exception as error: err_msg = "Failure in collecting Agent events: {0}".format(ustr(error)) add_event(op=WALAEventOperation.UnhandledError, message=err_msg, is_success=False) + # too-many-locals Disabled: The number of local variables is OK + @staticmethod + def process_events(process_event_operation): # pylint: disable=too-many-locals + """ + Retuns a list of events that need to be sent to the telemetry pipeline and deletes the corresponding files + from the events directory. 
+ """ + event_directory_full_path = os.path.join(conf.get_lib_dir(), EVENTS_DIRECTORY) + event_files = os.listdir(event_directory_full_path) + debug_info = CollectOrReportEventDebugInfo(operation=CollectOrReportEventDebugInfo.OP_COLLECT) + + for event_file in event_files: + try: + match = EVENT_FILE_REGEX.search(event_file) + if match is None: + continue + + event_file_path = os.path.join(event_directory_full_path, event_file) + + try: + logger.verbose("Processing event file: {0}", event_file_path) + + with open(event_file_path, "rb") as event_fd: + event_data = event_fd.read().decode("utf-8") + + event = parse_event(event_data) + + # "legacy" events are events produced by previous versions of the agent (<= 2.2.46) and extensions; + # they do not include all the telemetry fields, so we add them here + is_legacy_event = match.group('agent_event') is None + + if is_legacy_event: + # We'll use the file creation time for the event's timestamp + event_file_creation_time_epoch = os.path.getmtime(event_file_path) + event_file_creation_time = datetime.datetime.fromtimestamp(event_file_creation_time_epoch) + + if event.is_extension_event(): + CollectAndEnqueueEventsPeriodicOperation._trim_legacy_extension_event_parameters(event) + CollectTelemetryEventsHandler.add_common_params_to_telemetry_event(event, + event_file_creation_time) + else: + CollectAndEnqueueEventsPeriodicOperation._update_legacy_agent_event(event, + event_file_creation_time) + + process_event_operation(event) + finally: + os.remove(event_file_path) + except ServiceStoppedError as stopped_error: + logger.error( + "Unable to enqueue events as service stopped: {0}, skipping events collection".format( + ustr(stopped_error))) + except UnicodeError as uni_err: + debug_info.update_unicode_error(uni_err) + except Exception as error: + debug_info.update_op_error(error) + + debug_info.report_debug_info() + + @staticmethod + def _update_legacy_agent_event(event, event_creation_time): + # Ensure that if an agent event 
is missing a field from the schema defined since 2.2.47, the missing fields + # will be appended, ensuring the event schema is complete before the event is reported. + new_event = TelemetryEvent() + new_event.parameters = [] + CollectTelemetryEventsHandler.add_common_params_to_telemetry_event(new_event, event_creation_time) + + event_params = dict([(param.name, param.value) for param in event.parameters]) + new_event_params = dict([(param.name, param.value) for param in new_event.parameters]) + + missing_params = set(new_event_params.keys()).difference(set(event_params.keys())) + params_to_add = [] + for param_name in missing_params: + params_to_add.append(TelemetryEventParam(param_name, new_event_params[param_name])) + + event.parameters.extend(params_to_add) + + @staticmethod + def _trim_legacy_extension_event_parameters(event): + """ + This method is called for extension events before they are sent out. Per the agreement with extension + publishers, the parameters that belong to extensions and will be reported intact are Name, Version, Operation, + OperationSuccess, Message, and Duration. Since there is nothing preventing extensions to instantiate other + fields (which belong to the agent), we call this method to ensure the rest of the parameters are trimmed since + they will be replaced with values coming from the agent. + :param event: Extension event to trim. + :return: Trimmed extension event; containing only extension-specific parameters. 
+ """ + params_to_keep = dict().fromkeys([ + GuestAgentExtensionEventsSchema.Name, + GuestAgentExtensionEventsSchema.Version, + GuestAgentExtensionEventsSchema.Operation, + GuestAgentExtensionEventsSchema.OperationSuccess, + GuestAgentExtensionEventsSchema.Message, + GuestAgentExtensionEventsSchema.Duration + ]) + trimmed_params = [] + + for param in event.parameters: + if param.name in params_to_keep: + trimmed_params.append(param) + + event.parameters = trimmed_params + class CollectTelemetryEventsHandler(ThreadHandlerInterface): """ @@ -408,10 +507,10 @@ class CollectTelemetryEventsHandler(ThreadHandlerInterface): _THREAD_NAME = "TelemetryEventsCollector" - def __init__(self, telemetry_service_handler): + def __init__(self, send_telemetry_events_handler): self.should_run = True self.thread = None - self._telemetry_service_handler = telemetry_service_handler + self._send_telemetry_events_handler = send_telemetry_events_handler @staticmethod def get_thread_name(): @@ -443,13 +542,13 @@ def stopped(self): def daemon(self): periodic_operations = [ - CollectAndEnqueueEventsPeriodicOperation(self._telemetry_service_handler) + CollectAndEnqueueEventsPeriodicOperation(self._send_telemetry_events_handler) ] logger.info("Extension Telemetry pipeline enabled: {0}".format( is_extension_telemetry_pipeline_enabled())) if is_extension_telemetry_pipeline_enabled(): - periodic_operations.append(ProcessExtensionTelemetry(self._telemetry_service_handler)) + periodic_operations.append(ProcessExtensionEventsPeriodicOperation(self._send_telemetry_events_handler)) logger.info("Successfully started the {0} thread".format(self.get_thread_name())) while not self.stopped(): @@ -462,4 +561,9 @@ def daemon(self): "An error occurred in the Telemetry Extension thread main loop; will skip the current iteration.\n{0}", ustr(error)) finally: - PeriodicOperation.sleep_until_next_operation(periodic_operations) \ No newline at end of file + 
PeriodicOperation.sleep_until_next_operation(periodic_operations) + + @staticmethod + def add_common_params_to_telemetry_event(event, event_time): + reporter = get_event_logger() + reporter.add_common_event_parameters(event, event_time) \ No newline at end of file diff --git a/azurelinuxagent/ga/telemetry_service.py b/azurelinuxagent/ga/telemetry_service.py index 5f5d139c46..236b86410d 100644 --- a/azurelinuxagent/ga/telemetry_service.py +++ b/azurelinuxagent/ga/telemetry_service.py @@ -28,17 +28,17 @@ from azurelinuxagent.common.interfaces import ThreadHandlerInterface -def get_telemetry_service_handler(protocol_util): - return TelemetryServiceHandler(protocol_util) +def get_send_telemetry_events_handler(protocol_util): + return SendTelemetryEventsHandler(protocol_util) -class TelemetryServiceHandler(ThreadHandlerInterface): +class SendTelemetryEventsHandler(ThreadHandlerInterface): """ This Handler takes care of sending all telemetry out of the agent to Wireserver. It sends out data as soon as there's any data available in the queue to send. """ - _THREAD_NAME = "TelemetryServiceHandler" + _THREAD_NAME = "SendTelemetryEventsHandler" _MAX_TIMEOUT = datetime.timedelta(seconds=5).seconds _MIN_EVENTS_TO_BATCH = 30 _MIN_BATCH_WAIT_TIME = datetime.timedelta(seconds=5) @@ -53,7 +53,7 @@ def __init__(self, protocol_util): @staticmethod def get_thread_name(): - return TelemetryServiceHandler._THREAD_NAME + return SendTelemetryEventsHandler._THREAD_NAME def run(self): logger.info("Start Extension Telemetry service.") @@ -91,9 +91,9 @@ def enqueue_event(self, event): raise ServiceStoppedError("{0} is stopped, not accepting anymore events".format(self.get_thread_name())) # Queue.put() can block if the queue is full which can be an uninterruptible wait. Blocking for a max of - # TelemetryServiceHandler._MAX_TIMEOUT seconds and raising a ServiceStoppedError to retry later. + # SendTelemetryEventsHandler._MAX_TIMEOUT seconds and raising a ServiceStoppedError to retry later. 
try: - self._queue.put(event, timeout=TelemetryServiceHandler._MAX_TIMEOUT) + self._queue.put(event, timeout=SendTelemetryEventsHandler._MAX_TIMEOUT) except Full as error: raise ServiceStoppedError( "Queue full, stopping any more enqueuing until the next run. {0}".format(ustr(error))) @@ -105,9 +105,9 @@ def _process_telemetry_thread(self): logger.info("Successfully started the {0} thread".format(self.get_thread_name())) try: # On demand wait, start processing as soon as there is any data available in the queue. In worst case, - # also keep checking every TelemetryServiceHandler._MAX_TIMEOUT secs to avoid uninterruptible waits + # also keep checking every SendTelemetryEventsHandler._MAX_TIMEOUT secs to avoid uninterruptible waits while not self.stopped(): - self._should_process_events.wait(timeout=TelemetryServiceHandler._MAX_TIMEOUT) + self._should_process_events.wait(timeout=SendTelemetryEventsHandler._MAX_TIMEOUT) self._send_events_in_queue() except Exception as error: diff --git a/azurelinuxagent/ga/update.py b/azurelinuxagent/ga/update.py index c95f899040..aff1678516 100644 --- a/azurelinuxagent/ga/update.py +++ b/azurelinuxagent/ga/update.py @@ -54,14 +54,14 @@ PY_VERSION_MINOR, PY_VERSION_MICRO from azurelinuxagent.ga.collect_logs import get_collect_logs_handler, is_log_collection_allowed from azurelinuxagent.ga.env import get_env_handler -from azurelinuxagent.ga.collect_telemetry_events import get_telemetry_collector_handler +from azurelinuxagent.ga.collect_telemetry_events import get_collect_telemetry_events_handler from azurelinuxagent.ga.exthandlers import HandlerManifest, get_traceback, ExtHandlersHandler, list_agent_lib_directory, \ is_extension_telemetry_pipeline_enabled from azurelinuxagent.ga.monitor import get_monitor_handler # pylint: disable=C0302 -from azurelinuxagent.ga.telemetry_service import get_telemetry_service_handler +from azurelinuxagent.ga.telemetry_service import get_send_telemetry_events_handler AGENT_ERROR_FILE = "error.json" # File 
name for agent error record AGENT_MANIFEST_FILE = "HandlerManifest.json" @@ -289,12 +289,12 @@ def run(self, debug=False): # pylint: disable=R0912,R0914 self._ensure_extension_telemetry_state_configured_properly(protocol) # Get all thread handlers - telemetry_handler = get_telemetry_service_handler(self.protocol_util) + telemetry_handler = get_send_telemetry_events_handler(self.protocol_util) all_thread_handlers = [ get_monitor_handler(), get_env_handler(), telemetry_handler, - get_telemetry_collector_handler(telemetry_handler) + get_collect_telemetry_events_handler(telemetry_handler) ] if is_log_collection_allowed(): diff --git a/tests/common/test_event.py b/tests/common/test_event.py index aa82e9de56..7275b65e65 100644 --- a/tests/common/test_event.py +++ b/tests/common/test_event.py @@ -37,6 +37,7 @@ from azurelinuxagent.common.telemetryevent import CommonTelemetryEventSchema, GuestAgentGenericLogsSchema, \ GuestAgentExtensionEventsSchema, GuestAgentPerfCounterEventsSchema from azurelinuxagent.common.version import CURRENT_AGENT, CURRENT_VERSION, AGENT_EXECUTION_MODE +from azurelinuxagent.ga.collect_telemetry_events import CollectAndEnqueueEventsPeriodicOperation from tests.protocol import mockwiredata from tests.protocol.mocks import mock_wire_protocol, HttpRequestPredicates, MockHttpResponse from tests.tools import AgentTestCase, data_dir, load_data, patch, skip_if_predicate_true @@ -94,7 +95,7 @@ def _yield_events(): @staticmethod def _collect_events(): event_list = [] - event.process_events(event_list.append) + CollectAndEnqueueEventsPeriodicOperation.process_events(event_list.append) return event_list @staticmethod diff --git a/tests/ga/test_collect_telemetry_events.py b/tests/ga/test_collect_telemetry_events.py index f8918fc746..473bab615e 100644 --- a/tests/ga/test_collect_telemetry_events.py +++ b/tests/ga/test_collect_telemetry_events.py @@ -36,7 +36,7 @@ from azurelinuxagent.common.telemetryevent import GuestAgentGenericLogsSchema, \ 
CommonTelemetryEventSchema from azurelinuxagent.common.utils import fileutil, textutil -from azurelinuxagent.ga.collect_telemetry_events import ExtensionEventSchema, ProcessExtensionTelemetry +from azurelinuxagent.ga.collect_telemetry_events import ExtensionEventSchema, ProcessExtensionEventsPeriodicOperation from tests.protocol.mocks import HttpRequestPredicates from tests.tools import AgentTestCase, clear_singleton_instances, data_dir @@ -173,7 +173,7 @@ def _create_extension_telemetry_processor(self, telemetry_handler=None): telemetry_handler = MagicMock(autospec=True) telemetry_handler.stopped = MagicMock(return_value=False) telemetry_handler.enqueue_event = MagicMock(wraps=event_list.append) - extension_telemetry_processor = ProcessExtensionTelemetry(telemetry_handler) + extension_telemetry_processor = ProcessExtensionEventsPeriodicOperation(telemetry_handler) extension_telemetry_processor.event_list = event_list yield extension_telemetry_processor @@ -359,7 +359,7 @@ def _assert_event_reported(self, mock_event, handler_name_with_count, pattern): def test_it_should_trim_message_if_more_than_limit(self): max_len = 100 no_of_extensions = 2 - with patch("azurelinuxagent.ga.collect_telemetry_events.ProcessExtensionTelemetry._EXTENSION_EVENT_MAX_MSG_LEN", max_len): + with patch("azurelinuxagent.ga.collect_telemetry_events.ProcessExtensionTelemetryPeriodicOperation._EXTENSION_EVENT_MAX_MSG_LEN", max_len): handler_name_with_count, event_list = self._setup_and_assert_tests_for_max_sizes() # pylint: disable=unused-variable context1_vals = self._get_param_value_from_event_body_if_exists(event_list, GuestAgentGenericLogsSchema.Context1) @@ -373,7 +373,7 @@ def test_it_should_skip_events_larger_than_max_size_and_report_event(self): max_size = 1000 no_of_extensions = 3 with patch("azurelinuxagent.ga.collect_telemetry_events.add_log_event") as mock_event: - with patch("azurelinuxagent.ga.collect_telemetry_events.ProcessExtensionTelemetry._EXTENSION_EVENT_MAX_SIZE", + with 
patch("azurelinuxagent.ga.collect_telemetry_events.ProcessExtensionTelemetryPeriodicOperation._EXTENSION_EVENT_MAX_SIZE", max_size): handler_name_with_count, _ = self._setup_and_assert_tests_for_max_sizes(no_of_extensions, expected_count=0) self._assert_invalid_extension_error_event_reported(mock_event, handler_name_with_count, @@ -383,7 +383,7 @@ def test_it_should_skip_large_files_greater_than_max_file_size_and_report_event( max_file_size = 10000 no_of_extensions = 5 with patch("azurelinuxagent.ga.collect_telemetry_events.add_log_event") as mock_event: - with patch("azurelinuxagent.ga.collect_telemetry_events.ProcessExtensionTelemetry._EXTENSION_EVENT_FILE_MAX_SIZE", + with patch("azurelinuxagent.ga.collect_telemetry_events.ProcessExtensionTelemetryPeriodicOperation._EXTENSION_EVENT_FILE_MAX_SIZE", max_file_size): handler_name_with_count, _ = self._setup_and_assert_tests_for_max_sizes(no_of_extensions, expected_count=0) diff --git a/tests/ga/test_telemetry_service.py b/tests/ga/test_telemetry_service.py index 501415f066..b192cd843e 100644 --- a/tests/ga/test_telemetry_service.py +++ b/tests/ga/test_telemetry_service.py @@ -46,7 +46,7 @@ from azurelinuxagent.common.version import CURRENT_VERSION, DISTRO_NAME, DISTRO_VERSION, AGENT_VERSION, CURRENT_AGENT, \ DISTRO_CODE_NAME from azurelinuxagent.ga.collect_telemetry_events import CollectAndEnqueueEventsPeriodicOperation -from azurelinuxagent.ga.telemetry_service import get_telemetry_service_handler +from azurelinuxagent.ga.telemetry_service import get_send_telemetry_events_handler from tests.ga.test_monitor import random_generator from tests.protocol.mocks import MockHttpResponse, mock_wire_protocol, HttpRequestPredicates from tests.protocol.mockwiredata import DATA_FILE @@ -54,7 +54,7 @@ from tests.utils.event_logger_tools import EventLoggerTools -class TestTelemetryServiceHandler(AgentTestCase, HttpRequestPredicates): +class TestSendTelemetryEventsHandler(AgentTestCase, HttpRequestPredicates): def setUp(self): 
AgentTestCase.setUp(self) clear_singleton_instances(ProtocolUtil) @@ -80,11 +80,11 @@ def http_post_handler(url, body, **__): with mock_wire_protocol(DATA_FILE, http_post_handler=http_post_handler) as protocol: protocol_util = MagicMock() protocol_util.get_protocol = Mock(return_value=protocol) - telemetry_service_handler = get_telemetry_service_handler(protocol_util) + telemetry_service_handler = get_send_telemetry_events_handler(protocol_util) telemetry_service_handler.event_calls = [] - with patch("azurelinuxagent.ga.telemetry_service.TelemetryServiceHandler._MIN_EVENTS_TO_BATCH", + with patch("azurelinuxagent.ga.telemetry_service.SendTelemetryEventsHandler._MIN_EVENTS_TO_BATCH", batching_queue_limit): - with patch("azurelinuxagent.ga.telemetry_service.TelemetryServiceHandler._MAX_TIMEOUT", timeout): + with patch("azurelinuxagent.ga.telemetry_service.SendTelemetryEventsHandler._MAX_TIMEOUT", timeout): telemetry_service_handler.get_mock_wire_protocol = lambda: protocol if start_thread: @@ -100,7 +100,7 @@ def _stop_handler(telemetry_handler, timeout=0.001): def _assert_test_data_in_event_body(self, telemetry_handler, test_events): # Stop the thread and Wait for the queue and thread to join - TestTelemetryServiceHandler._stop_handler(telemetry_handler) + TestSendTelemetryEventsHandler._stop_handler(telemetry_handler) for telemetry_event in test_events: event_str = event_to_v1_encoded(telemetry_event) @@ -128,7 +128,7 @@ def _setup_and_assert_bad_request_scenarios(self, http_post_handler, expected_ms with patch("azurelinuxagent.common.event.add_event") as mock_add_event: telemetry_handler.enqueue_event(TelemetryEvent()) - TestTelemetryServiceHandler._stop_handler(telemetry_handler) + TestSendTelemetryEventsHandler._stop_handler(telemetry_handler) for msg in expected_msgs: self._assert_error_event_reported(mock_add_event, msg) @@ -179,17 +179,17 @@ def test_it_should_honor_batch_time_limits_before_sending_telemetry(self): orig_sleep = time.sleep with 
patch("time.sleep", lambda *_: orig_sleep(0.01)): - with patch("azurelinuxagent.ga.telemetry_service.TelemetryServiceHandler._MIN_BATCH_WAIT_TIME", wait_time): + with patch("azurelinuxagent.ga.telemetry_service.SendTelemetryEventsHandler._MIN_BATCH_WAIT_TIME", wait_time): with self._create_telemetry_service_handler(batching_queue_limit=5) as telemetry_handler: for test_event in events: telemetry_handler.enqueue_event(test_event) self.assertEqual(0, len(telemetry_handler.event_calls), "No events should have been logged") - TestTelemetryServiceHandler._stop_handler(telemetry_handler, timeout=0.01) + TestSendTelemetryEventsHandler._stop_handler(telemetry_handler, timeout=0.01) wait_time = timedelta(seconds=0.2) with patch("time.sleep", lambda *_: orig_sleep(0.05)): - with patch("azurelinuxagent.ga.telemetry_service.TelemetryServiceHandler._MIN_BATCH_WAIT_TIME", wait_time): + with patch("azurelinuxagent.ga.telemetry_service.SendTelemetryEventsHandler._MIN_BATCH_WAIT_TIME", wait_time): with self._create_telemetry_service_handler(batching_queue_limit=5) as telemetry_handler: test_start_time = datetime.now() for test_event in events: @@ -211,13 +211,13 @@ def test_it_should_clear_queue_before_stopping(self): wait_time = timedelta(seconds=10) with patch("time.sleep", lambda *_: mock_sleep(0.01)): - with patch("azurelinuxagent.ga.telemetry_service.TelemetryServiceHandler._MIN_BATCH_WAIT_TIME", wait_time): + with patch("azurelinuxagent.ga.telemetry_service.SendTelemetryEventsHandler._MIN_BATCH_WAIT_TIME", wait_time): with self._create_telemetry_service_handler(batching_queue_limit=5) as telemetry_handler: for test_event in events: telemetry_handler.enqueue_event(test_event) self.assertEqual(0, len(telemetry_handler.event_calls), "No events should have been logged") - TestTelemetryServiceHandler._stop_handler(telemetry_handler, timeout=0.01) + TestSendTelemetryEventsHandler._stop_handler(telemetry_handler, timeout=0.01) # After the service is asked to stop, we should send all 
data in the queue self._assert_test_data_in_event_body(telemetry_handler, events) @@ -250,7 +250,8 @@ def test_it_should_raise_on_enqueue_if_service_stopped(self): telemetry_handler.enqueue_event(TelemetryEvent(eventId=ustr(uuid.uuid4()))) exception = context_manager.exception - self.assertIn("TelemetryServiceHandler is stopped, not accepting anymore events", str(exception)) + self.assertIn("{0} is stopped, not accepting anymore events".format(telemetry_handler.get_thread_name()), + str(exception)) def test_it_should_honour_the_incoming_order_of_events(self): @@ -261,7 +262,7 @@ def test_it_should_honour_the_incoming_order_of_events(self): telemetry_handler.start() self.assertTrue(telemetry_handler.is_alive(), "Thread not alive") - TestTelemetryServiceHandler._stop_handler(telemetry_handler) + TestSendTelemetryEventsHandler._stop_handler(telemetry_handler) _, event_body = telemetry_handler.event_calls[0] event_orders = re.findall(r'', textutil.str_to_encoded_ustr(event_body)) @@ -301,7 +302,7 @@ def test_telemetry_service_should_add_event_on_unexpected_errors(self): patch_report_event.side_effect = Exception(test_str) telemetry_handler.enqueue_event(TelemetryEvent()) - TestTelemetryServiceHandler._stop_handler(telemetry_handler, timeout=0.01) + TestSendTelemetryEventsHandler._stop_handler(telemetry_handler, timeout=0.01) self._assert_error_event_reported(mock_add_event, test_str, operation=WALAEventOperation.UnhandledError) @@ -317,7 +318,7 @@ def _create_extension_event(self, @staticmethod def _get_event_data(message, name): - event = TelemetryEvent(1, TestTelemetryServiceHandler._TEST_EVENT_PROVIDER_ID) + event = TelemetryEvent(1, TestSendTelemetryEventsHandler._TEST_EVENT_PROVIDER_ID) event.parameters.append(TelemetryEventParam(GuestAgentExtensionEventsSchema.Name, name)) event.parameters.append(TelemetryEventParam(GuestAgentExtensionEventsSchema.Version, str(CURRENT_VERSION))) 
event.parameters.append(TelemetryEventParam(GuestAgentExtensionEventsSchema.Operation, WALAEventOperation.Unknown)) @@ -349,7 +350,7 @@ def test_it_should_enqueue_and_send_events_properly(self, mock_lib_dir, *_): with patch("threading.Thread.getName", return_value=test_taskname): monitor_handler.run() - TestTelemetryServiceHandler._stop_handler(telemetry_handler) + TestSendTelemetryEventsHandler._stop_handler(telemetry_handler) # Validating the crafted message by the collect_and_send_events call. self.assertEqual(1, len(telemetry_handler.event_calls), "Only 1 event should be sent") @@ -408,7 +409,7 @@ def test_collect_and_send_events_with_small_events(self, mock_lib_dir): CollectAndEnqueueEventsPeriodicOperation(telemetry_handler).run() # The send_event call would be called each time, as we are filling up the buffer up to the brim for each call. - TestTelemetryServiceHandler._stop_handler(telemetry_handler) + TestSendTelemetryEventsHandler._stop_handler(telemetry_handler) self.assertEqual(4, len(telemetry_handler.event_calls)) @patch("azurelinuxagent.common.conf.get_lib_dir") @@ -424,7 +425,7 @@ def test_collect_and_send_events_with_large_events(self, mock_lib_dir): with patch("azurelinuxagent.common.logger.periodic_warn") as patch_periodic_warn: CollectAndEnqueueEventsPeriodicOperation(telemetry_handler).run() - TestTelemetryServiceHandler._stop_handler(telemetry_handler) + TestSendTelemetryEventsHandler._stop_handler(telemetry_handler) self.assertEqual(3, patch_periodic_warn.call_count) # The send_event call should never be called as the events are larger than 2**16. 
diff --git a/tests/ga/test_update.py b/tests/ga/test_update.py index 2f1d26bd9c..4382d64def 100644 --- a/tests/ga/test_update.py +++ b/tests/ga/test_update.py @@ -1258,8 +1258,8 @@ def iterator(*args, **kwargs): # pylint: disable=useless-return,unused-argument with patch('azurelinuxagent.ga.update.get_monitor_handler') as mock_monitor: with patch('azurelinuxagent.ga.update.get_env_handler') as mock_env: with patch('azurelinuxagent.ga.update.get_collect_logs_handler') as mock_collect_logs: - with patch('azurelinuxagent.ga.update.get_telemetry_service_handler') as mock_telemetry_service: - with patch('azurelinuxagent.ga.update.get_telemetry_collector_handler') as mock_event_collector: + with patch('azurelinuxagent.ga.update.get_send_telemetry_events_handler') as mock_telemetry_service: + with patch('azurelinuxagent.ga.update.get_collect_telemetry_events_handler') as mock_event_collector: with patch('azurelinuxagent.ga.update.initialize_event_logger_vminfo_common_parameters'): with patch('azurelinuxagent.ga.update.is_log_collection_allowed', return_value=True): with patch('time.sleep', side_effect=iterator) as sleep_mock: @@ -1611,8 +1611,8 @@ def test_it_should_retain_extension_events_directories_if_extension_telemetry_pi self.assertTrue(os.path.exists(ext_dir), "Extension directory {0} should exist!".format(ext_dir)) -@patch('azurelinuxagent.ga.update.get_telemetry_collector_handler') -@patch('azurelinuxagent.ga.update.get_telemetry_service_handler') +@patch('azurelinuxagent.ga.update.get_collect_telemetry_events_handler') +@patch('azurelinuxagent.ga.update.get_send_telemetry_events_handler') @patch('azurelinuxagent.ga.update.get_collect_logs_handler') @patch('azurelinuxagent.ga.update.get_monitor_handler') @patch('azurelinuxagent.ga.update.get_env_handler') From c990cd8b61d75106695ca31eb4e2da89c38db69d Mon Sep 17 00:00:00 2001 From: larohra Date: Mon, 19 Oct 2020 15:11:51 -0700 Subject: [PATCH 54/63] Fixed failing tests --- azurelinuxagent/common/event.py | 2 +- 
tests/ga/test_collect_telemetry_events.py | 6 +++--- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/azurelinuxagent/common/event.py b/azurelinuxagent/common/event.py index a40471000a..eeee6a41f1 100644 --- a/azurelinuxagent/common/event.py +++ b/azurelinuxagent/common/event.py @@ -29,7 +29,7 @@ import azurelinuxagent.common.conf as conf import azurelinuxagent.common.logger as logger from azurelinuxagent.common.AgentGlobals import AgentGlobals -from azurelinuxagent.common.exception import EventError, OSUtilError, ServiceStoppedError +from azurelinuxagent.common.exception import EventError, OSUtilError from azurelinuxagent.common.future import ustr from azurelinuxagent.common.datacontract import get_properties, set_properties from azurelinuxagent.common.osutil import get_osutil diff --git a/tests/ga/test_collect_telemetry_events.py b/tests/ga/test_collect_telemetry_events.py index 473bab615e..5138fb11c2 100644 --- a/tests/ga/test_collect_telemetry_events.py +++ b/tests/ga/test_collect_telemetry_events.py @@ -359,7 +359,7 @@ def _assert_event_reported(self, mock_event, handler_name_with_count, pattern): def test_it_should_trim_message_if_more_than_limit(self): max_len = 100 no_of_extensions = 2 - with patch("azurelinuxagent.ga.collect_telemetry_events.ProcessExtensionTelemetryPeriodicOperation._EXTENSION_EVENT_MAX_MSG_LEN", max_len): + with patch("azurelinuxagent.ga.collect_telemetry_events.ProcessExtensionEventsPeriodicOperation._EXTENSION_EVENT_MAX_MSG_LEN", max_len): handler_name_with_count, event_list = self._setup_and_assert_tests_for_max_sizes() # pylint: disable=unused-variable context1_vals = self._get_param_value_from_event_body_if_exists(event_list, GuestAgentGenericLogsSchema.Context1) @@ -373,7 +373,7 @@ def test_it_should_skip_events_larger_than_max_size_and_report_event(self): max_size = 1000 no_of_extensions = 3 with patch("azurelinuxagent.ga.collect_telemetry_events.add_log_event") as mock_event: - with 
patch("azurelinuxagent.ga.collect_telemetry_events.ProcessExtensionTelemetryPeriodicOperation._EXTENSION_EVENT_MAX_SIZE", + with patch("azurelinuxagent.ga.collect_telemetry_events.ProcessExtensionEventsPeriodicOperation._EXTENSION_EVENT_MAX_SIZE", max_size): handler_name_with_count, _ = self._setup_and_assert_tests_for_max_sizes(no_of_extensions, expected_count=0) self._assert_invalid_extension_error_event_reported(mock_event, handler_name_with_count, @@ -383,7 +383,7 @@ def test_it_should_skip_large_files_greater_than_max_file_size_and_report_event( max_file_size = 10000 no_of_extensions = 5 with patch("azurelinuxagent.ga.collect_telemetry_events.add_log_event") as mock_event: - with patch("azurelinuxagent.ga.collect_telemetry_events.ProcessExtensionTelemetryPeriodicOperation._EXTENSION_EVENT_FILE_MAX_SIZE", + with patch("azurelinuxagent.ga.collect_telemetry_events.ProcessExtensionEventsPeriodicOperation._EXTENSION_EVENT_FILE_MAX_SIZE", max_file_size): handler_name_with_count, _ = self._setup_and_assert_tests_for_max_sizes(no_of_extensions, expected_count=0) From 6d37dff78f80491cff3af89a83ecde7dd31c1c54 Mon Sep 17 00:00:00 2001 From: larohra Date: Mon, 19 Oct 2020 15:34:42 -0700 Subject: [PATCH 55/63] Minor code cleanup --- azurelinuxagent/ga/collect_telemetry_events.py | 8 +++----- tests/common/test_event.py | 7 ++++++- 2 files changed, 9 insertions(+), 6 deletions(-) diff --git a/azurelinuxagent/ga/collect_telemetry_events.py b/azurelinuxagent/ga/collect_telemetry_events.py index d7e9d45427..9faee4bdc5 100644 --- a/azurelinuxagent/ga/collect_telemetry_events.py +++ b/azurelinuxagent/ga/collect_telemetry_events.py @@ -389,15 +389,13 @@ def _collect_and_enqueue_events(self): logger.warn("{0} service is not running, skipping iteration.".format( self._telemetry_service_handler.get_thread_name())) return - - self.process_events(self._telemetry_service_handler.enqueue_event) + self.process_events() except Exception as error: err_msg = "Failure in collecting Agent 
events: {0}".format(ustr(error)) add_event(op=WALAEventOperation.UnhandledError, message=err_msg, is_success=False) # too-many-locals Disabled: The number of local variables is OK - @staticmethod - def process_events(process_event_operation): # pylint: disable=too-many-locals + def process_events(self): # pylint: disable=too-many-locals """ Retuns a list of events that need to be sent to the telemetry pipeline and deletes the corresponding files from the events directory. @@ -439,7 +437,7 @@ def process_events(process_event_operation): # pylint: disable=too-many-locals CollectAndEnqueueEventsPeriodicOperation._update_legacy_agent_event(event, event_file_creation_time) - process_event_operation(event) + self._telemetry_service_handler.enqueue_event(event) finally: os.remove(event_file_path) except ServiceStoppedError as stopped_error: diff --git a/tests/common/test_event.py b/tests/common/test_event.py index 7275b65e65..492d1b7c81 100644 --- a/tests/common/test_event.py +++ b/tests/common/test_event.py @@ -26,6 +26,8 @@ import xml.dom from datetime import datetime, timedelta +from mock import MagicMock + import azurelinuxagent.common.utils.textutil as textutil from azurelinuxagent.common import event, logger from azurelinuxagent.common.AgentGlobals import AgentGlobals @@ -95,7 +97,10 @@ def _yield_events(): @staticmethod def _collect_events(): event_list = [] - CollectAndEnqueueEventsPeriodicOperation.process_events(event_list.append) + telemetry_service = MagicMock() + telemetry_service.enqueue_event = MagicMock(wraps=event_list.append) + event_collector = CollectAndEnqueueEventsPeriodicOperation(telemetry_service) + event_collector.process_events() return event_list @staticmethod From 5bc6a106a7a37e2b907b5d6a074ff5d3441954c0 Mon Sep 17 00:00:00 2001 From: larohra Date: Tue, 20 Oct 2020 15:56:30 -0700 Subject: [PATCH 56/63] Renamed telemetry_service.py to send_telemetry_events.py --- .../ga/collect_telemetry_events.py | 20 +++--- ...ry_service.py => 
send_telemetry_events.py} | 4 ++ azurelinuxagent/ga/update.py | 2 +- tests/common/test_event.py | 6 +- tests/ga/test_collect_telemetry_events.py | 2 +- ...rvice.py => test_send_telemetry_events.py} | 66 +++++++++---------- tests/ga/test_update.py | 8 +-- 7 files changed, 56 insertions(+), 52 deletions(-) rename azurelinuxagent/ga/{telemetry_service.py => send_telemetry_events.py} (95%) rename tests/ga/{test_telemetry_service.py => test_send_telemetry_events.py} (86%) diff --git a/azurelinuxagent/ga/collect_telemetry_events.py b/azurelinuxagent/ga/collect_telemetry_events.py index 9faee4bdc5..df4d34e8bb 100644 --- a/azurelinuxagent/ga/collect_telemetry_events.py +++ b/azurelinuxagent/ga/collect_telemetry_events.py @@ -74,19 +74,19 @@ class ProcessExtensionEventsPeriodicOperation(PeriodicOperation): _EXTENSION_EVENT_REQUIRED_FIELDS = [attr.lower() for attr in dir(ExtensionEventSchema) if not callable(getattr(ExtensionEventSchema, attr)) and not attr.startswith("__")] - def __init__(self, telemetry_service_handler): + def __init__(self, send_telemetry_events_handler): super(ProcessExtensionEventsPeriodicOperation, self).__init__( name="collect_and_enqueue_extension_events", operation=self._collect_and_enqueue_extension_events, period=ProcessExtensionEventsPeriodicOperation._EXTENSION_EVENT_COLLECTION_PERIOD) - self._telemetry_service_handler = telemetry_service_handler + self._send_telemetry_events_handler = send_telemetry_events_handler def _collect_and_enqueue_extension_events(self): - if self._telemetry_service_handler.stopped(): + if self._send_telemetry_events_handler.stopped(): logger.warn("{0} service is not running, skipping current iteration".format( - self._telemetry_service_handler.get_thread_name())) + self._send_telemetry_events_handler.get_thread_name())) return delete_all_event_files = True @@ -253,7 +253,7 @@ def _get_captured_events_count(self, handler_name, event_file_path, captured_eve for event in events: try: - 
self._telemetry_service_handler.enqueue_event( + self._send_telemetry_events_handler.enqueue_event( self._parse_telemetry_event(handler_name, event, event_file_time) ) captured_events_count += 1 @@ -373,21 +373,21 @@ class CollectAndEnqueueEventsPeriodicOperation(PeriodicOperation): _EVENT_COLLECTION_PERIOD = datetime.timedelta(minutes=1) - def __init__(self, telemetry_service_handler): + def __init__(self, send_telemetry_events_handler): super(CollectAndEnqueueEventsPeriodicOperation, self).__init__( name="collect_and_enqueue_events", operation=self._collect_and_enqueue_events, period=CollectAndEnqueueEventsPeriodicOperation._EVENT_COLLECTION_PERIOD) - self._telemetry_service_handler = telemetry_service_handler + self._send_telemetry_events_handler = send_telemetry_events_handler def _collect_and_enqueue_events(self): """ Periodically send any events located in the events folder """ try: - if self._telemetry_service_handler.stopped(): + if self._send_telemetry_events_handler.stopped(): logger.warn("{0} service is not running, skipping iteration.".format( - self._telemetry_service_handler.get_thread_name())) + self._send_telemetry_events_handler.get_thread_name())) return self.process_events() except Exception as error: @@ -437,7 +437,7 @@ def process_events(self): # pylint: disable=too-many-locals CollectAndEnqueueEventsPeriodicOperation._update_legacy_agent_event(event, event_file_creation_time) - self._telemetry_service_handler.enqueue_event(event) + self._send_telemetry_events_handler.enqueue_event(event) finally: os.remove(event_file_path) except ServiceStoppedError as stopped_error: diff --git a/azurelinuxagent/ga/telemetry_service.py b/azurelinuxagent/ga/send_telemetry_events.py similarity index 95% rename from azurelinuxagent/ga/telemetry_service.py rename to azurelinuxagent/ga/send_telemetry_events.py index 236b86410d..1f2d0ae13b 100644 --- a/azurelinuxagent/ga/telemetry_service.py +++ b/azurelinuxagent/ga/send_telemetry_events.py @@ -49,6 +49,10 @@ def 
__init__(self, protocol_util): self.should_run = True self._thread = None self._should_process_events = threading.Event() + + # We're using a Queue for handling the communication between threads. We plan to remove any dependency on the + # filesystem in the future and use add_event to directly queue events into the queue rather than writing to + # a file and then parsing it later. self._queue = Queue() @staticmethod diff --git a/azurelinuxagent/ga/update.py b/azurelinuxagent/ga/update.py index aff1678516..11390ffac3 100644 --- a/azurelinuxagent/ga/update.py +++ b/azurelinuxagent/ga/update.py @@ -61,7 +61,7 @@ from azurelinuxagent.ga.monitor import get_monitor_handler # pylint: disable=C0302 -from azurelinuxagent.ga.telemetry_service import get_send_telemetry_events_handler +from azurelinuxagent.ga.send_telemetry_events import get_send_telemetry_events_handler AGENT_ERROR_FILE = "error.json" # File name for agent error record AGENT_MANIFEST_FILE = "HandlerManifest.json" diff --git a/tests/common/test_event.py b/tests/common/test_event.py index 492d1b7c81..bbbe6ab67d 100644 --- a/tests/common/test_event.py +++ b/tests/common/test_event.py @@ -97,9 +97,9 @@ def _yield_events(): @staticmethod def _collect_events(): event_list = [] - telemetry_service = MagicMock() - telemetry_service.enqueue_event = MagicMock(wraps=event_list.append) - event_collector = CollectAndEnqueueEventsPeriodicOperation(telemetry_service) + send_telemetry_events = MagicMock() + send_telemetry_events.enqueue_event = MagicMock(wraps=event_list.append) + event_collector = CollectAndEnqueueEventsPeriodicOperation(send_telemetry_events) event_collector.process_events() return event_list diff --git a/tests/ga/test_collect_telemetry_events.py b/tests/ga/test_collect_telemetry_events.py index 5138fb11c2..5acc34bda2 100644 --- a/tests/ga/test_collect_telemetry_events.py +++ b/tests/ga/test_collect_telemetry_events.py @@ -533,7 +533,7 @@ def 
test_it_should_not_send_event_where_message_is_empty_and_report_event(self): InvalidExtensionEventError.EmptyMessageError, expected_drop_count=1) - def test_it_should_not_process_events_if_telemetry_service_stopped(self): + def test_it_should_not_process_events_if_send_telemetry_events_handler_stopped(self): event_list = [] telemetry_handler = MagicMock(autospec=True) telemetry_handler.stopped = MagicMock(return_value=True) diff --git a/tests/ga/test_telemetry_service.py b/tests/ga/test_send_telemetry_events.py similarity index 86% rename from tests/ga/test_telemetry_service.py rename to tests/ga/test_send_telemetry_events.py index b192cd843e..12ab22515f 100644 --- a/tests/ga/test_telemetry_service.py +++ b/tests/ga/test_send_telemetry_events.py @@ -46,7 +46,7 @@ from azurelinuxagent.common.version import CURRENT_VERSION, DISTRO_NAME, DISTRO_VERSION, AGENT_VERSION, CURRENT_AGENT, \ DISTRO_CODE_NAME from azurelinuxagent.ga.collect_telemetry_events import CollectAndEnqueueEventsPeriodicOperation -from azurelinuxagent.ga.telemetry_service import get_send_telemetry_events_handler +from azurelinuxagent.ga.send_telemetry_events import get_send_telemetry_events_handler from tests.ga.test_monitor import random_generator from tests.protocol.mocks import MockHttpResponse, mock_wire_protocol, HttpRequestPredicates from tests.protocol.mockwiredata import DATA_FILE @@ -70,27 +70,27 @@ def tearDown(self): _TEST_EVENT_PROVIDER_ID = "XXXXXXXX-XXXX-XXXX-XXXX-XXXXXXXXXXXX" @contextlib.contextmanager - def _create_telemetry_service_handler(self, timeout=0.5, start_thread=True, batching_queue_limit=1): + def _create_send_telemetry_events_handler(self, timeout=0.5, start_thread=True, batching_queue_limit=1): def http_post_handler(url, body, **__): if self.is_telemetry_request(url): - telemetry_service_handler.event_calls.append((datetime.now(), body)) + send_telemetry_events_handler.event_calls.append((datetime.now(), body)) return MockHttpResponse(status=200) return None with 
mock_wire_protocol(DATA_FILE, http_post_handler=http_post_handler) as protocol: protocol_util = MagicMock() protocol_util.get_protocol = Mock(return_value=protocol) - telemetry_service_handler = get_send_telemetry_events_handler(protocol_util) - telemetry_service_handler.event_calls = [] - with patch("azurelinuxagent.ga.telemetry_service.SendTelemetryEventsHandler._MIN_EVENTS_TO_BATCH", + send_telemetry_events_handler = get_send_telemetry_events_handler(protocol_util) + send_telemetry_events_handler.event_calls = [] + with patch("azurelinuxagent.ga.send_telemetry_events.SendTelemetryEventsHandler._MIN_EVENTS_TO_BATCH", batching_queue_limit): - with patch("azurelinuxagent.ga.telemetry_service.SendTelemetryEventsHandler._MAX_TIMEOUT", timeout): + with patch("azurelinuxagent.ga.send_telemetry_events.SendTelemetryEventsHandler._MAX_TIMEOUT", timeout): - telemetry_service_handler.get_mock_wire_protocol = lambda: protocol + send_telemetry_events_handler.get_mock_wire_protocol = lambda: protocol if start_thread: - telemetry_service_handler.start() - self.assertTrue(telemetry_service_handler.is_alive(), "Thread didn't start properly!") - yield telemetry_service_handler + send_telemetry_events_handler.start() + self.assertTrue(send_telemetry_events_handler.is_alive(), "Thread didn't start properly!") + yield send_telemetry_events_handler @staticmethod def _stop_handler(telemetry_handler, timeout=0.001): @@ -122,7 +122,7 @@ def _assert_error_event_reported(self, mock_add_event, expected_msg, operation=W self.assertTrue(found_msg, "Error msg: {0} not reported".format(expected_msg)) def _setup_and_assert_bad_request_scenarios(self, http_post_handler, expected_msgs): - with self._create_telemetry_service_handler() as telemetry_handler: + with self._create_send_telemetry_events_handler() as telemetry_handler: telemetry_handler.get_mock_wire_protocol().set_http_handlers(http_post_handler=http_post_handler) @@ -135,7 +135,7 @@ def _setup_and_assert_bad_request_scenarios(self, 
http_post_handler, expected_ms def test_it_should_send_events_properly(self): events = [TelemetryEvent(eventId=ustr(uuid.uuid4())), TelemetryEvent(eventId=ustr(uuid.uuid4()))] - with self._create_telemetry_service_handler() as telemetry_handler: + with self._create_send_telemetry_events_handler() as telemetry_handler: for test_event in events: telemetry_handler.enqueue_event(test_event) @@ -144,7 +144,7 @@ def test_it_should_send_events_properly(self): def test_it_should_send_as_soon_as_events_available_in_queue_with_minimal_batching_limits(self): events = [TelemetryEvent(eventId=ustr(uuid.uuid4())), TelemetryEvent(eventId=ustr(uuid.uuid4()))] - with self._create_telemetry_service_handler() as telemetry_handler: + with self._create_send_telemetry_events_handler() as telemetry_handler: test_start_time = datetime.now() for test_event in events: telemetry_handler.enqueue_event(test_event) @@ -159,7 +159,7 @@ def test_it_should_send_as_soon_as_events_available_in_queue_with_minimal_batchi def test_thread_should_wait_for_events_to_get_in_queue_before_processing(self): events = [TelemetryEvent(eventId=ustr(uuid.uuid4())), TelemetryEvent(eventId=ustr(uuid.uuid4()))] - with self._create_telemetry_service_handler(timeout=0.1) as telemetry_handler: + with self._create_send_telemetry_events_handler(timeout=0.1) as telemetry_handler: # Do nothing for some time time.sleep(0.3) @@ -167,7 +167,7 @@ def test_thread_should_wait_for_events_to_get_in_queue_before_processing(self): # Ensure that no events were transmitted by the telemetry handler during this time, i.e. 
telemetry thread was idle self.assertEqual(0, len(telemetry_handler.event_calls), "Unwanted calls to telemetry") - # Now enqueue data and verify telemetry_service sends them asap + # Now enqueue data and verify send_telemetry_events sends them asap for test_event in events: telemetry_handler.enqueue_event(test_event) @@ -179,8 +179,8 @@ def test_it_should_honor_batch_time_limits_before_sending_telemetry(self): orig_sleep = time.sleep with patch("time.sleep", lambda *_: orig_sleep(0.01)): - with patch("azurelinuxagent.ga.telemetry_service.SendTelemetryEventsHandler._MIN_BATCH_WAIT_TIME", wait_time): - with self._create_telemetry_service_handler(batching_queue_limit=5) as telemetry_handler: + with patch("azurelinuxagent.ga.send_telemetry_events.SendTelemetryEventsHandler._MIN_BATCH_WAIT_TIME", wait_time): + with self._create_send_telemetry_events_handler(batching_queue_limit=5) as telemetry_handler: for test_event in events: telemetry_handler.enqueue_event(test_event) @@ -189,8 +189,8 @@ def test_it_should_honor_batch_time_limits_before_sending_telemetry(self): wait_time = timedelta(seconds=0.2) with patch("time.sleep", lambda *_: orig_sleep(0.05)): - with patch("azurelinuxagent.ga.telemetry_service.SendTelemetryEventsHandler._MIN_BATCH_WAIT_TIME", wait_time): - with self._create_telemetry_service_handler(batching_queue_limit=5) as telemetry_handler: + with patch("azurelinuxagent.ga.send_telemetry_events.SendTelemetryEventsHandler._MIN_BATCH_WAIT_TIME", wait_time): + with self._create_send_telemetry_events_handler(batching_queue_limit=5) as telemetry_handler: test_start_time = datetime.now() for test_event in events: telemetry_handler.enqueue_event(test_event) @@ -211,8 +211,8 @@ def test_it_should_clear_queue_before_stopping(self): wait_time = timedelta(seconds=10) with patch("time.sleep", lambda *_: mock_sleep(0.01)): - with patch("azurelinuxagent.ga.telemetry_service.SendTelemetryEventsHandler._MIN_BATCH_WAIT_TIME", wait_time): - with 
self._create_telemetry_service_handler(batching_queue_limit=5) as telemetry_handler: + with patch("azurelinuxagent.ga.send_telemetry_events.SendTelemetryEventsHandler._MIN_BATCH_WAIT_TIME", wait_time): + with self._create_send_telemetry_events_handler(batching_queue_limit=5) as telemetry_handler: for test_event in events: telemetry_handler.enqueue_event(test_event) @@ -225,7 +225,7 @@ def test_it_should_honor_batch_queue_limits_before_sending_telemetry(self): batch_limit = 5 - with self._create_telemetry_service_handler(batching_queue_limit=batch_limit) as telemetry_handler: + with self._create_send_telemetry_events_handler(batching_queue_limit=batch_limit) as telemetry_handler: events = [] for _ in range(batch_limit-1): @@ -243,7 +243,7 @@ def test_it_should_honor_batch_queue_limits_before_sending_telemetry(self): self._assert_test_data_in_event_body(telemetry_handler, events) def test_it_should_raise_on_enqueue_if_service_stopped(self): - with self._create_telemetry_service_handler(start_thread=False) as telemetry_handler: + with self._create_send_telemetry_events_handler(start_thread=False) as telemetry_handler: # Ensure the thread is stopped telemetry_handler.stop() with self.assertRaises(ServiceStoppedError) as context_manager: @@ -256,7 +256,7 @@ def test_it_should_raise_on_enqueue_if_service_stopped(self): def test_it_should_honour_the_incoming_order_of_events(self): - with self._create_telemetry_service_handler(timeout=0.3, start_thread=False) as telemetry_handler: + with self._create_send_telemetry_events_handler(timeout=0.3, start_thread=False) as telemetry_handler: for index in range(5): telemetry_handler.enqueue_event(TelemetryEvent(eventId=index)) @@ -269,7 +269,7 @@ def test_it_should_honour_the_incoming_order_of_events(self): self.assertEqual(sorted(event_orders), event_orders, "Events not ordered correctly") - def test_telemetry_service_should_report_event_if_wireserver_returns_http_error(self): + def 
test_send_telemetry_events_should_report_event_if_wireserver_returns_http_error(self): test_str = "A test exception, Guid: {0}".format(str(uuid.uuid4())) @@ -280,7 +280,7 @@ def http_post_handler(url, _, **__): self._setup_and_assert_bad_request_scenarios(http_post_handler, [test_str]) - def test_telemetry_service_should_report_event_when_http_post_returning_503(self): + def test_send_telemetry_events_should_report_event_when_http_post_returning_503(self): def http_post_handler(url, _, **__): if self.is_telemetry_request(url): @@ -292,11 +292,11 @@ def http_post_handler(url, _, **__): self._setup_and_assert_bad_request_scenarios(http_post_handler, expected_msgs) - def test_telemetry_service_should_add_event_on_unexpected_errors(self): + def test_send_telemetry_events_should_add_event_on_unexpected_errors(self): - with self._create_telemetry_service_handler(timeout=0.1) as telemetry_handler: + with self._create_send_telemetry_events_handler(timeout=0.1) as telemetry_handler: - with patch("azurelinuxagent.ga.telemetry_service.add_event") as mock_add_event: + with patch("azurelinuxagent.ga.send_telemetry_events.add_event") as mock_add_event: with patch("azurelinuxagent.common.protocol.wire.WireClient.report_event") as patch_report_event: test_str = "Test exception, Guid: {0}".format(str(uuid.uuid4())) patch_report_event.side_effect = Exception(test_str) @@ -334,7 +334,7 @@ def _get_event_data(message, name): def test_it_should_enqueue_and_send_events_properly(self, mock_lib_dir, *_): mock_lib_dir.return_value = self.lib_dir - with self._create_telemetry_service_handler() as telemetry_handler: + with self._create_send_telemetry_events_handler() as telemetry_handler: monitor_handler = CollectAndEnqueueEventsPeriodicOperation(telemetry_handler) self._create_extension_event(message="Message-Test") @@ -399,7 +399,7 @@ def test_it_should_enqueue_and_send_events_properly(self, mock_lib_dir, *_): def test_collect_and_send_events_with_small_events(self, mock_lib_dir): 
mock_lib_dir.return_value = self.lib_dir - with self._create_telemetry_service_handler() as telemetry_handler: + with self._create_send_telemetry_events_handler() as telemetry_handler: sizes = [15, 15, 15, 15] # get the powers of 2 - 2**16 is the limit for power in sizes: @@ -416,7 +416,7 @@ def test_collect_and_send_events_with_small_events(self, mock_lib_dir): def test_collect_and_send_events_with_large_events(self, mock_lib_dir): mock_lib_dir.return_value = self.lib_dir - with self._create_telemetry_service_handler() as telemetry_handler: + with self._create_send_telemetry_events_handler() as telemetry_handler: sizes = [17, 17, 17] # get the powers of 2 for power in sizes: diff --git a/tests/ga/test_update.py b/tests/ga/test_update.py index 4382d64def..246c9df57f 100644 --- a/tests/ga/test_update.py +++ b/tests/ga/test_update.py @@ -1258,7 +1258,7 @@ def iterator(*args, **kwargs): # pylint: disable=useless-return,unused-argument with patch('azurelinuxagent.ga.update.get_monitor_handler') as mock_monitor: with patch('azurelinuxagent.ga.update.get_env_handler') as mock_env: with patch('azurelinuxagent.ga.update.get_collect_logs_handler') as mock_collect_logs: - with patch('azurelinuxagent.ga.update.get_send_telemetry_events_handler') as mock_telemetry_service: + with patch('azurelinuxagent.ga.update.get_send_telemetry_events_handler') as mock_telemetry_send_events: with patch('azurelinuxagent.ga.update.get_collect_telemetry_events_handler') as mock_event_collector: with patch('azurelinuxagent.ga.update.initialize_event_logger_vminfo_common_parameters'): with patch('azurelinuxagent.ga.update.is_log_collection_allowed', return_value=True): @@ -1280,7 +1280,7 @@ def iterator(*args, **kwargs): # pylint: disable=useless-return,unused-argument self.assertEqual(1, mock_monitor.call_count) self.assertEqual(1, mock_env.call_count) self.assertEqual(1, mock_collect_logs.call_count) - self.assertEqual(1, mock_telemetry_service.call_count) + self.assertEqual(1, 
mock_telemetry_send_events.call_count) self.assertEqual(1, mock_event_collector.call_count) self.assertEqual(1, mock_exit.call_count) @@ -1659,7 +1659,7 @@ def _setup_mock_thread_and_start_test_run(self, mock_thread, is_alive=True, invo return thread # too-many-arguments Disabled: The number of arguments maps to the number of threads - def test_start_threads(self, mock_env, mock_monitor, mock_collect_logs, mock_telemetry_service, mock_telemetry_collector): # pylint: disable=too-many-arguments + def test_start_threads(self, mock_env, mock_monitor, mock_collect_logs, mock_telemetry_send_events, mock_telemetry_collector): # pylint: disable=too-many-arguments self.assertTrue(self.update_handler.running) def _get_mock_thread(): @@ -1667,7 +1667,7 @@ def _get_mock_thread(): thread.run = MagicMock() return thread - all_threads = [mock_telemetry_service, mock_telemetry_collector, mock_env, mock_monitor, mock_collect_logs] + all_threads = [mock_telemetry_send_events, mock_telemetry_collector, mock_env, mock_monitor, mock_collect_logs] for thread in all_threads: thread.return_value = _get_mock_thread() From 0cf5543515aba2a8149bf0a0a005954c26993582 Mon Sep 17 00:00:00 2001 From: larohra Date: Mon, 26 Oct 2020 16:25:38 -0700 Subject: [PATCH 57/63] Removed event based synchronization --- azurelinuxagent/common/future.py | 4 +- azurelinuxagent/common/protocol/wire.py | 2 +- .../ga/collect_telemetry_events.py | 9 +- azurelinuxagent/ga/send_telemetry_events.py | 93 ++++++++++++------- azurelinuxagent/ga/update.py | 2 + 5 files changed, 71 insertions(+), 39 deletions(-) diff --git a/azurelinuxagent/common/future.py b/azurelinuxagent/common/future.py index 963bb37a6c..d2c0396910 100644 --- a/azurelinuxagent/common/future.py +++ b/azurelinuxagent/common/future.py @@ -33,12 +33,12 @@ # unused-import, import-error Disabled: Due to backward compatibility between py2 and py3 from builtins import int, range # pylint: disable=unused-import,import-error from collections import OrderedDict # 
pylint: disable=W0611 - from queue import Queue, Full # pylint: disable=W0611,import-error + from queue import Queue, Empty # pylint: disable=W0611,import-error elif sys.version_info[0] == 2: import httplib as httpclient # pylint: disable=E0401,W0611 from urlparse import urlparse # pylint: disable=E0401 - from Queue import Queue, Full # pylint: disable=W0611,import-error + from Queue import Queue, Empty # pylint: disable=W0611,import-error # We want to suppress the following: diff --git a/azurelinuxagent/common/protocol/wire.py b/azurelinuxagent/common/protocol/wire.py index fe4ac02e95..f5e02b9f57 100644 --- a/azurelinuxagent/common/protocol/wire.py +++ b/azurelinuxagent/common/protocol/wire.py @@ -1091,7 +1091,7 @@ def _send_event(provider_id, debug_info): debug_info.update_op_error(error) # Group events by providerId - for event in events_iterator(): + for event in events_iterator: try: if event.providerId not in buf: buf[event.providerId] = b'' diff --git a/azurelinuxagent/ga/collect_telemetry_events.py b/azurelinuxagent/ga/collect_telemetry_events.py index df4d34e8bb..cff6c2f6dc 100644 --- a/azurelinuxagent/ga/collect_telemetry_events.py +++ b/azurelinuxagent/ga/collect_telemetry_events.py @@ -193,6 +193,7 @@ def _capture_extension_events(self, handler_name, handler_event_dir_path): except ServiceStoppedError: # Not logging here as already logged once, re-raising # Since we already started processing this file, deleting it as we could've already sent some events out + # This is a trade-off between data replication vs data loss. 
raise except Exception as error: msg = "Failed to process event file {0}: {1}, {2}".format(event_file, ustr(error), @@ -200,6 +201,9 @@ def _capture_extension_events(self, handler_name, handler_event_dir_path): logger.warn(msg) add_log_event(level=logger.LogLevel.WARNING, message=msg, forced=True) finally: + # Todo: We should delete files after ensuring that we sent the data to Wireserver successfully + # from our end rather than deleting first and sending later. This is to ensure the data reliability + # of the agent telemetry pipeline. os.remove(event_file_path) finally: @@ -397,7 +401,7 @@ def _collect_and_enqueue_events(self): # too-many-locals Disabled: The number of local variables is OK def process_events(self): # pylint: disable=too-many-locals """ - Retuns a list of events that need to be sent to the telemetry pipeline and deletes the corresponding files + Returns a list of events that need to be sent to the telemetry pipeline and deletes the corresponding files from the events directory. """ event_directory_full_path = os.path.join(conf.get_lib_dir(), EVENTS_DIRECTORY) @@ -439,6 +443,9 @@ def process_events(self): # pylint: disable=too-many-locals self._send_telemetry_events_handler.enqueue_event(event) finally: + # Todo: We should delete files after ensuring that we sent the data to Wireserver successfully + # from our end rather than deleting first and sending later. This is to ensure the data reliability + # of the agent telemetry pipeline. 
os.remove(event_file_path) except ServiceStoppedError as stopped_error: logger.error( diff --git a/azurelinuxagent/ga/send_telemetry_events.py b/azurelinuxagent/ga/send_telemetry_events.py index 1f2d0ae13b..580d8a55c1 100644 --- a/azurelinuxagent/ga/send_telemetry_events.py +++ b/azurelinuxagent/ga/send_telemetry_events.py @@ -24,7 +24,7 @@ from azurelinuxagent.common import logger from azurelinuxagent.common.event import add_event, WALAEventOperation from azurelinuxagent.common.exception import ServiceStoppedError -from azurelinuxagent.common.future import ustr, Queue, Full +from azurelinuxagent.common.future import ustr, Queue, Empty from azurelinuxagent.common.interfaces import ThreadHandlerInterface @@ -48,11 +48,15 @@ def __init__(self, protocol_util): self._protocol = protocol_util.get_protocol() self.should_run = True self._thread = None - self._should_process_events = threading.Event() + # self._should_process_events = threading.Event() # We're using a Queue for handling the communication between threads. We plan to remove any dependency on the # filesystem in the future and use add_event to directly queue events into the queue rather than writing to # a file and then parsing it later. + + # Once we move add_event to directly queue events, we need to add a maxsize here to ensure some limitations are + # being set (currently our limits are enforced by collector_threads but that would become obsolete once we + # start enqueuing events directly). self._queue = Queue() @staticmethod @@ -78,7 +82,7 @@ def stop(self): """ self.should_run = False # Set the event to unblock the thread to ensure that the thread is not blocking shutdown. - self._should_process_events.set() + # self._should_process_events.set() if self.is_alive(): self.join() @@ -96,55 +100,74 @@ def enqueue_event(self, event): # Queue.put() can block if the queue is full which can be an uninterruptible wait. 
Blocking for a max of # SendTelemetryEventsHandler._MAX_TIMEOUT seconds and raising a ServiceStoppedError to retry later. + + # Todo: Queue.put() will only raise a Full exception if a maxsize is set for the Queue. Once some size + # limitations are set for the Queue, ensure to handle that correctly here. try: self._queue.put(event, timeout=SendTelemetryEventsHandler._MAX_TIMEOUT) - except Full as error: + except Exception as error: raise ServiceStoppedError( - "Queue full, stopping any more enqueuing until the next run. {0}".format(ustr(error))) + "Unable to enqueue due to: {0}, stopping any more enqueuing until the next run".format(ustr(error))) # Set the event if any enqueue happens (even if already set) to trigger sending those events - self._should_process_events.set() + # self._should_process_events.set() + + def _wait_for_event_in_queue(self): + event = None + try: + event = self._queue.get(timeout=SendTelemetryEventsHandler._MAX_TIMEOUT) + self._queue.task_done() + except Empty: + # No elements in Queue, do nothing + pass + + return event def _process_telemetry_thread(self): logger.info("Successfully started the {0} thread".format(self.get_thread_name())) try: # On demand wait, start processing as soon as there is any data available in the queue. In worst case, - # also keep checking every SendTelemetryEventsHandler._MAX_TIMEOUT secs to avoid uninterruptible waits - while not self.stopped(): - self._should_process_events.wait(timeout=SendTelemetryEventsHandler._MAX_TIMEOUT) - self._send_events_in_queue() + # also keep checking every SendTelemetryEventsHandler._MAX_TIMEOUT secs to avoid uninterruptible waits. + # Incase the service is stopped but we have events in queue, ensure we send them out before killing the thread. 
+ while not self.stopped() or not self._queue.empty(): + # self._should_process_events.wait(timeout=SendTelemetryEventsHandler._MAX_TIMEOUT) + event = self._wait_for_event_in_queue() + if event: + # Start processing queue only if initial event is not None (i.e. Queue has atleast 1 event), + # else do nothing + self._send_events_in_queue(event) except Exception as error: err_msg = "An unknown error occurred in the {0} thread main loop, stopping thread. Error: {1}, Stack: {2}".format( self.get_thread_name(), ustr(error), traceback.format_exc()) add_event(op=WALAEventOperation.UnhandledError, message=err_msg, is_success=False) - def _get_events_in_queue(self): + def _send_events_in_queue(self, event): + # Process everything in Queue + # if not self._queue.empty(): + start_time = datetime.datetime.utcnow() + while not self.stopped() and (self._queue.qsize() + 1) < self._MIN_EVENTS_TO_BATCH and ( + start_time + self._MIN_BATCH_WAIT_TIME) > datetime.datetime.utcnow(): + # To promote batching, we either wait for atleast _MIN_EVENTS_TO_BATCH events or _MIN_BATCH_WAIT_TIME secs + # before sending out the first request to wireserver. + # If the thread is requested to stop midway, we skip batching and send whatever we have in the queue. + logger.verbose("Waiting for events to batch. 
Total events so far: {0}, Time elapsed: {1} secs", + self._queue.qsize()+1, (datetime.datetime.utcnow() - start_time).seconds) + time.sleep(1) + # Delete files after sending the data rather than deleting and sending + self._protocol.report_event(self._get_events_in_queue(event)) + + # Reset the event when done processing all events in queue + # if self._should_process_events.is_set() and self._queue.empty(): + # logger.verbose("Resetting the event") + # self._should_process_events.clear() + + def _get_events_in_queue(self, first_event): + yield first_event while not self._queue.empty(): try: - event = self._queue.get_nowait() - yield event + yield self._queue.get_nowait() + self._queue.task_done() except Exception as error: logger.error("Some exception when fetching event from queue: {0}, {1}".format(ustr(error), - traceback.format_exc())) - finally: - self._queue.task_done() - - def _send_events_in_queue(self): - # Process everything in Queue - if not self._queue.empty(): - start_time = datetime.datetime.utcnow() - while not self.stopped() and self._queue.qsize() < self._MIN_EVENTS_TO_BATCH and ( - start_time + self._MIN_BATCH_WAIT_TIME) > datetime.datetime.utcnow(): - # To promote batching, we either wait for atleast _MIN_EVENTS_TO_BATCH events or _MIN_BATCH_WAIT_TIME secs - # before sending out the first request to wireserver. - # If the thread is requested to stop midway, we skip batching and send whatever we have in the queue. - logger.verbose("Waiting for events to batch. 
Queue size: {0}, Time elapsed: {1} secs", - self._queue.qsize(), (datetime.datetime.utcnow() - start_time).seconds) - time.sleep(1) - self._protocol.report_event(self._get_events_in_queue) - - # Reset the event when done processing all events in queue - if self._should_process_events.is_set() and self._queue.empty(): - logger.verbose("Resetting the event") - self._should_process_events.clear() \ No newline at end of file + traceback.format_exc())) \ No newline at end of file diff --git a/azurelinuxagent/ga/update.py b/azurelinuxagent/ga/update.py index 11390ffac3..647dd37056 100644 --- a/azurelinuxagent/ga/update.py +++ b/azurelinuxagent/ga/update.py @@ -660,6 +660,8 @@ def _sentinel_file_path(self): return os.path.join(conf.get_lib_dir(), AGENT_SENTINEL_FILE) def _shutdown(self): + # Todo: Ensure all threads stopped when shutting down the main extension handler to ensure that the state of + # all threads is clean. self.running = False if not os.path.isfile(self._sentinel_file_path()): From 5af6e1a1b30e13d47196f2fa0a431d7744a4d091 Mon Sep 17 00:00:00 2001 From: larohra Date: Mon, 26 Oct 2020 16:43:33 -0700 Subject: [PATCH 58/63] Fix failing tests --- tests/common/test_event.py | 2 +- tests/protocol/test_wire.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/tests/common/test_event.py b/tests/common/test_event.py index bbbe6ab67d..ecb7049c1e 100644 --- a/tests/common/test_event.py +++ b/tests/common/test_event.py @@ -92,7 +92,7 @@ def _yield_events(): for telemetry_event in event_list: yield telemetry_event - protocol.client.report_event(_yield_events) + protocol.client.report_event(_yield_events()) @staticmethod def _collect_events(): diff --git a/tests/protocol/test_wire.py b/tests/protocol/test_wire.py index 9f461278b7..cea07bd4fb 100644 --- a/tests/protocol/test_wire.py +++ b/tests/protocol/test_wire.py @@ -122,7 +122,7 @@ def _yield_events(): for telemetry_event in event_list: yield telemetry_event - return _yield_events + return 
_yield_events() def test_getters(self, *args): """Normal case""" From 881818050882d772a1886455b59079391cc5b08a Mon Sep 17 00:00:00 2001 From: larohra Date: Tue, 27 Oct 2020 13:25:07 -0700 Subject: [PATCH 59/63] Addressed PR comments --- .../common/utils/extensionprocessutil.py | 2 +- .../ga/collect_telemetry_events.py | 34 +++++++++++++------ azurelinuxagent/ga/send_telemetry_events.py | 13 +------ tests/common/test_event.py | 4 +-- tests/ga/test_collect_telemetry_events.py | 10 +++--- tests/ga/test_send_telemetry_events.py | 10 +++--- 6 files changed, 36 insertions(+), 37 deletions(-) diff --git a/azurelinuxagent/common/utils/extensionprocessutil.py b/azurelinuxagent/common/utils/extensionprocessutil.py index c85b145519..6aaffe7ba9 100644 --- a/azurelinuxagent/common/utils/extensionprocessutil.py +++ b/azurelinuxagent/common/utils/extensionprocessutil.py @@ -103,7 +103,7 @@ def format_stdout_stderr(stdout, stderr): """ Format stdout and stderr's output to make it suitable in telemetry. The goal is to maximize the amount of output given the constraints - of telemetry. too-few-public-methods Disabled: This class is used as an Enum + of telemetry. For example, if there is more stderr output than stdout output give more buffer space to stderr. diff --git a/azurelinuxagent/ga/collect_telemetry_events.py b/azurelinuxagent/ga/collect_telemetry_events.py index cff6c2f6dc..ed2cfa337d 100644 --- a/azurelinuxagent/ga/collect_telemetry_events.py +++ b/azurelinuxagent/ga/collect_telemetry_events.py @@ -46,6 +46,18 @@ def get_collect_telemetry_events_handler(send_telemetry_events_handler): class ExtensionEventSchema(object): # pylint: disable=R0903 """ Class for defining the schema for Extension Events. 
+ + Sample Extension Event Example: + { + "Version":"1.0.0.23", + "Timestamp":"2018-01-02T22:08:12.510696Z" //(time in UTC (ISO-8601 standard), + "TaskName":"TestRun" //Open for publishers, + "EventLevel":"Critical/Error/Warning/Verbose/Informational/LogAlways", + "Message": "Successful test" //(max 3K, 3072 characters), + "EventPid":"1", + "EventTid":"2", + "OperationId":"Guid (str)" + } """ Version = "Version" Timestamp = "Timestamp" @@ -57,7 +69,7 @@ class ExtensionEventSchema(object): # pylint: disable=R0903 OperationId = "OperationId" -class ProcessExtensionEventsPeriodicOperation(PeriodicOperation): +class _ProcessExtensionEventsPeriodicOperation(PeriodicOperation): """ Periodic operation for collecting and sending extension telemetry events to Wireserver. """ @@ -75,10 +87,10 @@ class ProcessExtensionEventsPeriodicOperation(PeriodicOperation): not callable(getattr(ExtensionEventSchema, attr)) and not attr.startswith("__")] def __init__(self, send_telemetry_events_handler): - super(ProcessExtensionEventsPeriodicOperation, self).__init__( + super(_ProcessExtensionEventsPeriodicOperation, self).__init__( name="collect_and_enqueue_extension_events", operation=self._collect_and_enqueue_extension_events, - period=ProcessExtensionEventsPeriodicOperation._EXTENSION_EVENT_COLLECTION_PERIOD) + period=_ProcessExtensionEventsPeriodicOperation._EXTENSION_EVENT_COLLECTION_PERIOD) self._send_telemetry_events_handler = send_telemetry_events_handler @@ -370,7 +382,7 @@ def _replace_or_add_param_in_event(event, replace_or_add_params): event.parameters.append(TelemetryEventParam(param_name, replace_or_add_params[param_name])) -class CollectAndEnqueueEventsPeriodicOperation(PeriodicOperation): +class _CollectAndEnqueueEventsPeriodicOperation(PeriodicOperation): """ Periodic operation to collect and send telemetry events located in the events folder """ @@ -378,10 +390,10 @@ class CollectAndEnqueueEventsPeriodicOperation(PeriodicOperation): _EVENT_COLLECTION_PERIOD = 
datetime.timedelta(minutes=1) def __init__(self, send_telemetry_events_handler): - super(CollectAndEnqueueEventsPeriodicOperation, self).__init__( + super(_CollectAndEnqueueEventsPeriodicOperation, self).__init__( name="collect_and_enqueue_events", operation=self._collect_and_enqueue_events, - period=CollectAndEnqueueEventsPeriodicOperation._EVENT_COLLECTION_PERIOD) + period=_CollectAndEnqueueEventsPeriodicOperation._EVENT_COLLECTION_PERIOD) self._send_telemetry_events_handler = send_telemetry_events_handler def _collect_and_enqueue_events(self): @@ -434,12 +446,12 @@ def process_events(self): # pylint: disable=too-many-locals event_file_creation_time = datetime.datetime.fromtimestamp(event_file_creation_time_epoch) if event.is_extension_event(): - CollectAndEnqueueEventsPeriodicOperation._trim_legacy_extension_event_parameters(event) + _CollectAndEnqueueEventsPeriodicOperation._trim_legacy_extension_event_parameters(event) CollectTelemetryEventsHandler.add_common_params_to_telemetry_event(event, event_file_creation_time) else: - CollectAndEnqueueEventsPeriodicOperation._update_legacy_agent_event(event, - event_file_creation_time) + _CollectAndEnqueueEventsPeriodicOperation._update_legacy_agent_event(event, + event_file_creation_time) self._send_telemetry_events_handler.enqueue_event(event) finally: @@ -547,13 +559,13 @@ def stopped(self): def daemon(self): periodic_operations = [ - CollectAndEnqueueEventsPeriodicOperation(self._send_telemetry_events_handler) + _CollectAndEnqueueEventsPeriodicOperation(self._send_telemetry_events_handler) ] logger.info("Extension Telemetry pipeline enabled: {0}".format( is_extension_telemetry_pipeline_enabled())) if is_extension_telemetry_pipeline_enabled(): - periodic_operations.append(ProcessExtensionEventsPeriodicOperation(self._send_telemetry_events_handler)) + periodic_operations.append(_ProcessExtensionEventsPeriodicOperation(self._send_telemetry_events_handler)) logger.info("Successfully started the {0} 
thread".format(self.get_thread_name())) while not self.stopped(): diff --git a/azurelinuxagent/ga/send_telemetry_events.py b/azurelinuxagent/ga/send_telemetry_events.py index 580d8a55c1..43538d6951 100644 --- a/azurelinuxagent/ga/send_telemetry_events.py +++ b/azurelinuxagent/ga/send_telemetry_events.py @@ -38,7 +38,7 @@ class SendTelemetryEventsHandler(ThreadHandlerInterface): there's any data available in the queue to send. """ - _THREAD_NAME = "SendTelemetryEventsHandler" + _THREAD_NAME = "SendTelemetryHandler" _MAX_TIMEOUT = datetime.timedelta(seconds=5).seconds _MIN_EVENTS_TO_BATCH = 30 _MIN_BATCH_WAIT_TIME = datetime.timedelta(seconds=5) @@ -81,8 +81,6 @@ def stop(self): Stop server communication and join the thread to main thread. """ self.should_run = False - # Set the event to unblock the thread to ensure that the thread is not blocking shutdown. - # self._should_process_events.set() if self.is_alive(): self.join() @@ -109,9 +107,6 @@ def enqueue_event(self, event): raise ServiceStoppedError( "Unable to enqueue due to: {0}, stopping any more enqueuing until the next run".format(ustr(error))) - # Set the event if any enqueue happens (even if already set) to trigger sending those events - # self._should_process_events.set() - def _wait_for_event_in_queue(self): event = None try: @@ -144,7 +139,6 @@ def _process_telemetry_thread(self): def _send_events_in_queue(self, event): # Process everything in Queue - # if not self._queue.empty(): start_time = datetime.datetime.utcnow() while not self.stopped() and (self._queue.qsize() + 1) < self._MIN_EVENTS_TO_BATCH and ( start_time + self._MIN_BATCH_WAIT_TIME) > datetime.datetime.utcnow(): @@ -157,11 +151,6 @@ def _send_events_in_queue(self, event): # Delete files after sending the data rather than deleting and sending self._protocol.report_event(self._get_events_in_queue(event)) - # Reset the event when done processing all events in queue - # if self._should_process_events.is_set() and self._queue.empty(): - # 
logger.verbose("Resetting the event") - # self._should_process_events.clear() - def _get_events_in_queue(self, first_event): yield first_event while not self._queue.empty(): diff --git a/tests/common/test_event.py b/tests/common/test_event.py index ecb7049c1e..fc880762f8 100644 --- a/tests/common/test_event.py +++ b/tests/common/test_event.py @@ -39,7 +39,7 @@ from azurelinuxagent.common.telemetryevent import CommonTelemetryEventSchema, GuestAgentGenericLogsSchema, \ GuestAgentExtensionEventsSchema, GuestAgentPerfCounterEventsSchema from azurelinuxagent.common.version import CURRENT_AGENT, CURRENT_VERSION, AGENT_EXECUTION_MODE -from azurelinuxagent.ga.collect_telemetry_events import CollectAndEnqueueEventsPeriodicOperation +from azurelinuxagent.ga.collect_telemetry_events import _CollectAndEnqueueEventsPeriodicOperation from tests.protocol import mockwiredata from tests.protocol.mocks import mock_wire_protocol, HttpRequestPredicates, MockHttpResponse from tests.tools import AgentTestCase, data_dir, load_data, patch, skip_if_predicate_true @@ -99,7 +99,7 @@ def _collect_events(): event_list = [] send_telemetry_events = MagicMock() send_telemetry_events.enqueue_event = MagicMock(wraps=event_list.append) - event_collector = CollectAndEnqueueEventsPeriodicOperation(send_telemetry_events) + event_collector = _CollectAndEnqueueEventsPeriodicOperation(send_telemetry_events) event_collector.process_events() return event_list diff --git a/tests/ga/test_collect_telemetry_events.py b/tests/ga/test_collect_telemetry_events.py index 5acc34bda2..2677770f40 100644 --- a/tests/ga/test_collect_telemetry_events.py +++ b/tests/ga/test_collect_telemetry_events.py @@ -36,7 +36,7 @@ from azurelinuxagent.common.telemetryevent import GuestAgentGenericLogsSchema, \ CommonTelemetryEventSchema from azurelinuxagent.common.utils import fileutil, textutil -from azurelinuxagent.ga.collect_telemetry_events import ExtensionEventSchema, ProcessExtensionEventsPeriodicOperation +from 
azurelinuxagent.ga.collect_telemetry_events import ExtensionEventSchema, _ProcessExtensionEventsPeriodicOperation from tests.protocol.mocks import HttpRequestPredicates from tests.tools import AgentTestCase, clear_singleton_instances, data_dir @@ -173,7 +173,7 @@ def _create_extension_telemetry_processor(self, telemetry_handler=None): telemetry_handler = MagicMock(autospec=True) telemetry_handler.stopped = MagicMock(return_value=False) telemetry_handler.enqueue_event = MagicMock(wraps=event_list.append) - extension_telemetry_processor = ProcessExtensionEventsPeriodicOperation(telemetry_handler) + extension_telemetry_processor = _ProcessExtensionEventsPeriodicOperation(telemetry_handler) extension_telemetry_processor.event_list = event_list yield extension_telemetry_processor @@ -359,7 +359,7 @@ def _assert_event_reported(self, mock_event, handler_name_with_count, pattern): def test_it_should_trim_message_if_more_than_limit(self): max_len = 100 no_of_extensions = 2 - with patch("azurelinuxagent.ga.collect_telemetry_events.ProcessExtensionEventsPeriodicOperation._EXTENSION_EVENT_MAX_MSG_LEN", max_len): + with patch("azurelinuxagent.ga.collect_telemetry_events._ProcessExtensionEventsPeriodicOperation._EXTENSION_EVENT_MAX_MSG_LEN", max_len): handler_name_with_count, event_list = self._setup_and_assert_tests_for_max_sizes() # pylint: disable=unused-variable context1_vals = self._get_param_value_from_event_body_if_exists(event_list, GuestAgentGenericLogsSchema.Context1) @@ -373,7 +373,7 @@ def test_it_should_skip_events_larger_than_max_size_and_report_event(self): max_size = 1000 no_of_extensions = 3 with patch("azurelinuxagent.ga.collect_telemetry_events.add_log_event") as mock_event: - with patch("azurelinuxagent.ga.collect_telemetry_events.ProcessExtensionEventsPeriodicOperation._EXTENSION_EVENT_MAX_SIZE", + with patch("azurelinuxagent.ga.collect_telemetry_events._ProcessExtensionEventsPeriodicOperation._EXTENSION_EVENT_MAX_SIZE", max_size): handler_name_with_count, _ 
= self._setup_and_assert_tests_for_max_sizes(no_of_extensions, expected_count=0) self._assert_invalid_extension_error_event_reported(mock_event, handler_name_with_count, @@ -383,7 +383,7 @@ def test_it_should_skip_large_files_greater_than_max_file_size_and_report_event( max_file_size = 10000 no_of_extensions = 5 with patch("azurelinuxagent.ga.collect_telemetry_events.add_log_event") as mock_event: - with patch("azurelinuxagent.ga.collect_telemetry_events.ProcessExtensionEventsPeriodicOperation._EXTENSION_EVENT_FILE_MAX_SIZE", + with patch("azurelinuxagent.ga.collect_telemetry_events._ProcessExtensionEventsPeriodicOperation._EXTENSION_EVENT_FILE_MAX_SIZE", max_file_size): handler_name_with_count, _ = self._setup_and_assert_tests_for_max_sizes(no_of_extensions, expected_count=0) diff --git a/tests/ga/test_send_telemetry_events.py b/tests/ga/test_send_telemetry_events.py index 12ab22515f..aed7992a04 100644 --- a/tests/ga/test_send_telemetry_events.py +++ b/tests/ga/test_send_telemetry_events.py @@ -45,7 +45,7 @@ GuestAgentExtensionEventsSchema from azurelinuxagent.common.version import CURRENT_VERSION, DISTRO_NAME, DISTRO_VERSION, AGENT_VERSION, CURRENT_AGENT, \ DISTRO_CODE_NAME -from azurelinuxagent.ga.collect_telemetry_events import CollectAndEnqueueEventsPeriodicOperation +from azurelinuxagent.ga.collect_telemetry_events import _CollectAndEnqueueEventsPeriodicOperation from azurelinuxagent.ga.send_telemetry_events import get_send_telemetry_events_handler from tests.ga.test_monitor import random_generator from tests.protocol.mocks import MockHttpResponse, mock_wire_protocol, HttpRequestPredicates @@ -253,7 +253,6 @@ def test_it_should_raise_on_enqueue_if_service_stopped(self): self.assertIn("{0} is stopped, not accepting anymore events".format(telemetry_handler.get_thread_name()), str(exception)) - def test_it_should_honour_the_incoming_order_of_events(self): with self._create_send_telemetry_events_handler(timeout=0.3, start_thread=False) as telemetry_handler: @@ 
-268,7 +267,6 @@ def test_it_should_honour_the_incoming_order_of_events(self): textutil.str_to_encoded_ustr(event_body)) self.assertEqual(sorted(event_orders), event_orders, "Events not ordered correctly") - def test_send_telemetry_events_should_report_event_if_wireserver_returns_http_error(self): test_str = "A test exception, Guid: {0}".format(str(uuid.uuid4())) @@ -335,7 +333,7 @@ def test_it_should_enqueue_and_send_events_properly(self, mock_lib_dir, *_): mock_lib_dir.return_value = self.lib_dir with self._create_send_telemetry_events_handler() as telemetry_handler: - monitor_handler = CollectAndEnqueueEventsPeriodicOperation(telemetry_handler) + monitor_handler = _CollectAndEnqueueEventsPeriodicOperation(telemetry_handler) self._create_extension_event(message="Message-Test") test_mtime = 1000 # epoch time, in ms @@ -406,7 +404,7 @@ def test_collect_and_send_events_with_small_events(self, mock_lib_dir): size = 2 ** power self._create_extension_event(size) - CollectAndEnqueueEventsPeriodicOperation(telemetry_handler).run() + _CollectAndEnqueueEventsPeriodicOperation(telemetry_handler).run() # The send_event call would be called each time, as we are filling up the buffer up to the brim for each call. 
TestSendTelemetryEventsHandler._stop_handler(telemetry_handler) @@ -424,7 +422,7 @@ def test_collect_and_send_events_with_large_events(self, mock_lib_dir): self._create_extension_event(size) with patch("azurelinuxagent.common.logger.periodic_warn") as patch_periodic_warn: - CollectAndEnqueueEventsPeriodicOperation(telemetry_handler).run() + _CollectAndEnqueueEventsPeriodicOperation(telemetry_handler).run() TestSendTelemetryEventsHandler._stop_handler(telemetry_handler) self.assertEqual(3, patch_periodic_warn.call_count) From 7e8a223434544eb2b84fbc7d492826d3ae64f158 Mon Sep 17 00:00:00 2001 From: larohra Date: Wed, 28 Oct 2020 10:17:13 -0700 Subject: [PATCH 60/63] Updated changes and removed dead linter errors --- azurelinuxagent/ga/collect_telemetry_events.py | 3 +-- tests/ga/test_update.py | 3 +-- 2 files changed, 2 insertions(+), 4 deletions(-) diff --git a/azurelinuxagent/ga/collect_telemetry_events.py b/azurelinuxagent/ga/collect_telemetry_events.py index ed2cfa337d..8e68933777 100644 --- a/azurelinuxagent/ga/collect_telemetry_events.py +++ b/azurelinuxagent/ga/collect_telemetry_events.py @@ -410,8 +410,7 @@ def _collect_and_enqueue_events(self): err_msg = "Failure in collecting Agent events: {0}".format(ustr(error)) add_event(op=WALAEventOperation.UnhandledError, message=err_msg, is_success=False) - # too-many-locals Disabled: The number of local variables is OK - def process_events(self): # pylint: disable=too-many-locals + def process_events(self): """ Returns a list of events that need to be sent to the telemetry pipeline and deletes the corresponding files from the events directory. 
diff --git a/tests/ga/test_update.py b/tests/ga/test_update.py index 246c9df57f..e0d048fb9f 100644 --- a/tests/ga/test_update.py +++ b/tests/ga/test_update.py @@ -1233,8 +1233,7 @@ def test_run_latest_creates_only_one_signal_handler(self, mock_signal): self._test_run_latest() self.assertEqual(0, mock_signal.call_count) - # too-many-locals Disabled: The number of local variables is OK - def _test_run(self, invocations=1, calls=[call.run()], enable_updates=False, sleep_interval=(6,)): # pylint: disable=dangerous-default-value,too-many-locals + def _test_run(self, invocations=1, calls=[call.run()], enable_updates=False, sleep_interval=(6,)): # pylint: disable=dangerous-default-value conf.get_autoupdate_enabled = Mock(return_value=enable_updates) # Note: From 8f6193367f3e74f90dfc661fc29fd7b5ac6f9c8a Mon Sep 17 00:00:00 2001 From: larohra Date: Thu, 29 Oct 2020 14:11:56 -0700 Subject: [PATCH 61/63] Addressed PR comments --- azurelinuxagent/ga/send_telemetry_events.py | 28 ++++++++++++--------- 1 file changed, 16 insertions(+), 12 deletions(-) diff --git a/azurelinuxagent/ga/send_telemetry_events.py b/azurelinuxagent/ga/send_telemetry_events.py index 43538d6951..6bccfa17e4 100644 --- a/azurelinuxagent/ga/send_telemetry_events.py +++ b/azurelinuxagent/ga/send_telemetry_events.py @@ -48,7 +48,6 @@ def __init__(self, protocol_util): self._protocol = protocol_util.get_protocol() self.should_run = True self._thread = None - # self._should_process_events = threading.Event() # We're using a Queue for handling the communication between threads. 
We plan to remove any dependency on the # filesystem in the future and use add_event to directly queue events into the queue rather than writing to @@ -108,13 +107,18 @@ def enqueue_event(self, event): "Unable to enqueue due to: {0}, stopping any more enqueuing until the next run".format(ustr(error))) def _wait_for_event_in_queue(self): - event = None + """ + Wait for atleast one event in Queue or timeout after SendTelemetryEventsHandler._MAX_TIMEOUT seconds. + In case of a timeout, set the event to None. + :return: event if an event is added to the Queue or None to signify no events were added in queue. + This would raise in case of an error. + """ try: event = self._queue.get(timeout=SendTelemetryEventsHandler._MAX_TIMEOUT) self._queue.task_done() except Empty: - # No elements in Queue, do nothing - pass + # No elements in Queue, return None + event = None return event @@ -125,19 +129,18 @@ def _process_telemetry_thread(self): # also keep checking every SendTelemetryEventsHandler._MAX_TIMEOUT secs to avoid uninterruptible waits. # Incase the service is stopped but we have events in queue, ensure we send them out before killing the thread. while not self.stopped() or not self._queue.empty(): - # self._should_process_events.wait(timeout=SendTelemetryEventsHandler._MAX_TIMEOUT) - event = self._wait_for_event_in_queue() - if event: - # Start processing queue only if initial event is not None (i.e. Queue has atleast 1 event), + first_event = self._wait_for_event_in_queue() + if first_event: + # Start processing queue only if first event is not None (i.e. Queue has atleast 1 event), # else do nothing - self._send_events_in_queue(event) + self._send_events_in_queue(first_event) except Exception as error: err_msg = "An unknown error occurred in the {0} thread main loop, stopping thread. 
Error: {1}, Stack: {2}".format( self.get_thread_name(), ustr(error), traceback.format_exc()) add_event(op=WALAEventOperation.UnhandledError, message=err_msg, is_success=False) - def _send_events_in_queue(self, event): + def _send_events_in_queue(self, first_event): # Process everything in Queue start_time = datetime.datetime.utcnow() while not self.stopped() and (self._queue.qsize() + 1) < self._MIN_EVENTS_TO_BATCH and ( @@ -149,14 +152,15 @@ def _send_events_in_queue(self, event): self._queue.qsize()+1, (datetime.datetime.utcnow() - start_time).seconds) time.sleep(1) # Delete files after sending the data rather than deleting and sending - self._protocol.report_event(self._get_events_in_queue(event)) + self._protocol.report_event(self._get_events_in_queue(first_event)) def _get_events_in_queue(self, first_event): yield first_event while not self._queue.empty(): try: - yield self._queue.get_nowait() + event = self._queue.get_nowait() self._queue.task_done() + yield event except Exception as error: logger.error("Some exception when fetching event from queue: {0}, {1}".format(ustr(error), traceback.format_exc())) \ No newline at end of file From 24b4a139101062d8cb568dd0b3e115f5286e26d9 Mon Sep 17 00:00:00 2001 From: larohra Date: Tue, 3 Nov 2020 17:13:08 -0800 Subject: [PATCH 62/63] Improved comments and function names --- azurelinuxagent/ga/collect_telemetry_events.py | 15 ++++++++------- 1 file changed, 8 insertions(+), 7 deletions(-) diff --git a/azurelinuxagent/ga/collect_telemetry_events.py b/azurelinuxagent/ga/collect_telemetry_events.py index 8e68933777..d8f47b9d73 100644 --- a/azurelinuxagent/ga/collect_telemetry_events.py +++ b/azurelinuxagent/ga/collect_telemetry_events.py @@ -71,7 +71,7 @@ class ExtensionEventSchema(object): # pylint: disable=R0903 class _ProcessExtensionEventsPeriodicOperation(PeriodicOperation): """ - Periodic operation for collecting and sending extension telemetry events to Wireserver. 
+ Periodic operation for collecting extension telemetry events and enqueueing them for the SendTelemetryHandler thread. """ _EXTENSION_EVENT_COLLECTION_PERIOD = datetime.timedelta(minutes=5) @@ -191,9 +191,9 @@ def _capture_extension_events(self, handler_name, handler_event_dir_path): continue # We support multiple events in a file, read the file and parse events. - captured_extension_events_count = self._get_captured_events_count(handler_name, event_file_path, - captured_extension_events_count, - dropped_events_with_error_count) + captured_extension_events_count = self._enqueue_events_and_get_count(handler_name, event_file_path, + captured_extension_events_count, + dropped_events_with_error_count) # We only allow MAX_NUMBER_OF_EVENTS_PER_EXTENSION_PER_PERIOD=300 maximum events per period per handler if captured_extension_events_count >= self._MAX_NUMBER_OF_EVENTS_PER_EXTENSION_PER_PERIOD: @@ -250,8 +250,8 @@ def _ensure_all_events_directories_empty(extension_events_directories): if err is not None: logger.error("Failed to completely clear the {0} directory. Exception: {1}", event_dir_path, err) - def _get_captured_events_count(self, handler_name, event_file_path, captured_events_count, - dropped_events_with_error_count): + def _enqueue_events_and_get_count(self, handler_name, event_file_path, captured_events_count, + dropped_events_with_error_count): event_file_time = datetime.datetime.fromtimestamp(os.path.getmtime(event_file_path)) @@ -384,7 +384,8 @@ def _replace_or_add_param_in_event(event, replace_or_add_params): class _CollectAndEnqueueEventsPeriodicOperation(PeriodicOperation): """ - Periodic operation to collect and send telemetry events located in the events folder + Periodic operation to collect telemetry events located in the events folder and enqueue them for the + SendTelemetryHandler thread. 
""" _EVENT_COLLECTION_PERIOD = datetime.timedelta(minutes=1) From 30e56c06c7e494448f39149099dd22c6aa82bb4a Mon Sep 17 00:00:00 2001 From: larohra Date: Wed, 4 Nov 2020 14:56:05 -0800 Subject: [PATCH 63/63] Addressed PR comments and removed unused import --- azurelinuxagent/common/protocol/wire.py | 4 ++-- azurelinuxagent/ga/collect_telemetry_events.py | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/azurelinuxagent/common/protocol/wire.py b/azurelinuxagent/common/protocol/wire.py index d7ae301517..5f7dfc280a 100644 --- a/azurelinuxagent/common/protocol/wire.py +++ b/azurelinuxagent/common/protocol/wire.py @@ -23,13 +23,13 @@ import traceback import xml.sax.saxutils as saxutils from collections import defaultdict -from datetime import datetime import azurelinuxagent.common.conf as conf import azurelinuxagent.common.logger as logger import azurelinuxagent.common.utils.textutil as textutil from azurelinuxagent.common.datacontract import validate_param -from azurelinuxagent.common.event import add_event, add_periodic, WALAEventOperation, report_event, CollectOrReportEventDebugInfo +from azurelinuxagent.common.event import add_event, add_periodic, WALAEventOperation, report_event, \ + CollectOrReportEventDebugInfo from azurelinuxagent.common.exception import ProtocolNotFoundError, \ ResourceGoneError, ExtensionDownloadError, InvalidContainerError, ProtocolError, HttpError from azurelinuxagent.common.future import httpclient, bytebuffer, ustr diff --git a/azurelinuxagent/ga/collect_telemetry_events.py b/azurelinuxagent/ga/collect_telemetry_events.py index d8f47b9d73..48f535be34 100644 --- a/azurelinuxagent/ga/collect_telemetry_events.py +++ b/azurelinuxagent/ga/collect_telemetry_events.py @@ -408,7 +408,7 @@ def _collect_and_enqueue_events(self): return self.process_events() except Exception as error: - err_msg = "Failure in collecting Agent events: {0}".format(ustr(error)) + err_msg = "Failure in collecting telemetry events: {0}".format(ustr(error)) 
add_event(op=WALAEventOperation.UnhandledError, message=err_msg, is_success=False) def process_events(self):