From 2f4b33e8c76260ad7a0f52f4e52852ec043e7a95 Mon Sep 17 00:00:00 2001 From: Laveesh Rohra Date: Fri, 14 Jan 2022 02:10:48 +0530 Subject: [PATCH] Don't test for agent log in between tests ETP (#2471) --- dcr/scenario_utils/agent_log_parser.py | 8 +++++ dcr/scenario_utils/check_waagent_log.py | 10 ++---- .../etp_helpers.py | 4 +++ .../extension-telemetry-pipeline/run.py | 31 ++++++++++++++++--- 4 files changed, 41 insertions(+), 12 deletions(-) diff --git a/dcr/scenario_utils/agent_log_parser.py b/dcr/scenario_utils/agent_log_parser.py index ceca589ba6..b316b0cdb0 100644 --- a/dcr/scenario_utils/agent_log_parser.py +++ b/dcr/scenario_utils/agent_log_parser.py @@ -1,3 +1,5 @@ +from __future__ import print_function + import os import re from datetime import datetime @@ -37,6 +39,8 @@ class AgentLogRecord: + __ERROR_TAGS = ['Exception', 'Traceback', '[CGW]'] + def __init__(self, match): self.text = match.string self.when = match.group("when") @@ -48,6 +52,10 @@ def __init__(self, match): def get_timestamp(self): return datetime.strptime(self.when, u'%Y-%m-%dT%H:%M:%S.%fZ') + @property + def is_error(self): + return self.level in ('ERROR', 'WARNING') or any(err in self.text for err in self.__ERROR_TAGS) + def parse_agent_log_file(waagent_log_path=AGENT_LOG_FILE): if not os.path.exists(waagent_log_path): diff --git a/dcr/scenario_utils/check_waagent_log.py b/dcr/scenario_utils/check_waagent_log.py index b3795fd730..0dcb8972a4 100644 --- a/dcr/scenario_utils/check_waagent_log.py +++ b/dcr/scenario_utils/check_waagent_log.py @@ -16,8 +16,6 @@ def check_waagent_log_for_errors(waagent_log=AGENT_LOG_FILE, ignore=None): distro = "".join(get_distro()) systemd_enabled = is_systemd_distro() - error_tags = ['ERROR', 'Exception', 'Traceback', 'WARNING', '[CGW]'] - # # NOTES: # * 'message' is matched using re.search; be sure to escape any regex metacharacters @@ -80,7 +78,8 @@ def check_waagent_log_for_errors(waagent_log=AGENT_LOG_FILE, ignore=None): # Ignoring this error for Deb 8 as its not a blocker and since Deb 8 is old and not widely used { 'message': r"journalctl: unrecognized option '--utc'", - 'if': lambda log_line: re.match(r"(debian8\.11)\D*", distro, flags=re.IGNORECASE) is not None and log_line.level == "WARNING" + 'if': lambda log_line: re.match(r"(debian8\.11)\D*", distro, + flags=re.IGNORECASE) is not None and log_line.level == "WARNING" }, # 2021-07-09T01:46:53.307959Z INFO MonitorHandler ExtHandler [CGW] Disabling resource usage monitoring. Reason: Check on cgroups failed: # [CGroupsException] The agent's cgroup includes unexpected processes: ['[PID: 2367] UNKNOWN'] @@ -104,16 +103,13 @@ def check_waagent_log_for_errors(waagent_log=AGENT_LOG_FILE, ignore=None): if ignore is not None: ignore_list.extend(ignore) - def is_error(log_line): - return any(err in log_line.text for err in error_tags) - def can_be_ignored(log_line): return any(re.search(msg['message'], log_line.text) is not None and ('if' not in msg or msg['if'](log_line)) for msg in ignore_list) errors = [] for agent_log_line in parse_agent_log_file(waagent_log): - if is_error(agent_log_line) and not can_be_ignored(agent_log_line): + if agent_log_line.is_error and not can_be_ignored(agent_log_line): # Handle "/proc/net/route contains no routes" as a special case since it can take time for the # primary interface to come up and we don't want to report transient errors as actual errors if "/proc/net/route contains no routes" in agent_log_line.text: diff --git a/dcr/scenarios/extension-telemetry-pipeline/etp_helpers.py b/dcr/scenarios/extension-telemetry-pipeline/etp_helpers.py index 282526ed57..1e5ad6e1dc 100644 --- a/dcr/scenarios/extension-telemetry-pipeline/etp_helpers.py +++ b/dcr/scenarios/extension-telemetry-pipeline/etp_helpers.py @@ -10,6 +10,10 @@ from random import choice +def get_collect_telemetry_thread_name(): + return "TelemetryEventsCollector" + + def wait_for_extension_events_dir_empty(timeout=timedelta(minutes=2)): # By ensuring events dir to be empty, we verify that the telemetry events collector has completed its run event_dirs = glob.glob(os.path.join("/var/log/azure/", "*", "events")) diff --git a/dcr/scenarios/extension-telemetry-pipeline/run.py b/dcr/scenarios/extension-telemetry-pipeline/run.py index 61c1c895d3..9ae7d63b46 100644 --- a/dcr/scenarios/extension-telemetry-pipeline/run.py +++ b/dcr/scenarios/extension-telemetry-pipeline/run.py @@ -3,13 +3,12 @@ import random import time -from dotenv import load_dotenv - +from dcr.scenario_utils.agent_log_parser import parse_agent_log_file from dcr.scenario_utils.check_waagent_log import is_data_in_waagent_log, check_waagent_log_for_errors -from dcr.scenario_utils.models import get_vm_data_from_env from dcr.scenario_utils.test_orchestrator import TestFuncObj from dcr.scenario_utils.test_orchestrator import TestOrchestrator -from etp_helpers import add_extension_events_and_get_count, wait_for_extension_events_dir_empty +from etp_helpers import add_extension_events_and_get_count, wait_for_extension_events_dir_empty, \ + get_collect_telemetry_thread_name def add_good_extension_events_and_verify(): @@ -22,7 +21,15 @@ def add_good_extension_events_and_verify(): # Sleep for a min to ensure that the TelemetryService has enough time to send events and report errors if any time.sleep(60) - check_waagent_log_for_errors() + telemetry_event_collector_name = get_collect_telemetry_thread_name() + errors_reported = False + for agent_log_line in parse_agent_log_file(): + if agent_log_line.thread == telemetry_event_collector_name and agent_log_line.is_error: + if not errors_reported: + print( + f"waagent.log contains the following errors emitted by the {telemetry_event_collector_name} thread (none expected):") + errors_reported = True + print(agent_log_line.text.rstrip()) for ext_name in ext_event_count: good_count = ext_event_count[ext_name]['good'] @@ -65,12 +72,26 @@ def verify_etp_enabled(): raise AssertionError("Event directory not found for all extensions!") +def check_agent_log(): + # Since we're injecting bad events in the add_bad_events_and_verify_count() function test, + # we expect some warnings to be emitted by the agent. + # We're already verifying if these warnings are being emitted properly in the specified test, so ignoring those here. + ignore = [ + { + 'message': r"Dropped events for Extension: Microsoft\.(OSTCExtensions.VMAccessForLinux|Azure.Extensions.CustomScript); Details:", + 'if': lambda log_line: log_line.level == "WARNING" and log_line.thread == get_collect_telemetry_thread_name() + } + ] + check_waagent_log_for_errors(ignore=ignore) + + if __name__ == '__main__': tests = [ TestFuncObj("Verify ETP enabled", verify_etp_enabled, raise_on_error=True, retry=3), TestFuncObj("Add Good extension events and verify", add_good_extension_events_and_verify), TestFuncObj("Add Bad extension events and verify", add_bad_events_and_verify_count), TestFuncObj("Verify all events processed", wait_for_extension_events_dir_empty), + TestFuncObj("Check Agent log", check_agent_log), ] test_orchestrator = TestOrchestrator("ETPTests-VM", tests=tests)