From 6497374fa6b742505321fa22bdef1706a89d61ae Mon Sep 17 00:00:00 2001 From: Nageswara Nandigam <84482346+nagworld9@users.noreply.github.com> Date: Thu, 9 Mar 2023 15:30:18 -0800 Subject: [PATCH] logging manifest type and don't log same error until next period in agent update. (#2778) * improve logging and don't log same error until next period * address comments * update comment * update comment --- azurelinuxagent/common/protocol/goal_state.py | 2 +- azurelinuxagent/common/protocol/wire.py | 4 +-- azurelinuxagent/ga/agent_update.py | 27 +++++++++++++------ tests/ga/test_agent_update.py | 26 ++++++++++++++++++ tests/ga/test_extension.py | 2 +- tests/protocol/test_wire.py | 8 +++--- tests_e2e/pipeline/pipeline.yml | 2 +- 7 files changed, 54 insertions(+), 17 deletions(-) diff --git a/azurelinuxagent/common/protocol/goal_state.py b/azurelinuxagent/common/protocol/goal_state.py index ed96159c8a..d44ee96026 100644 --- a/azurelinuxagent/common/protocol/goal_state.py +++ b/azurelinuxagent/common/protocol/goal_state.py @@ -183,7 +183,7 @@ def fetch_extension_manifest(self, extension_name, uris): def _fetch_manifest(self, manifest_type, name, uris): try: is_fast_track = self.extensions_goal_state.source == GoalStateSource.FastTrack - xml_text = self._wire_client.fetch_manifest(uris, use_verify_header=is_fast_track) + xml_text = self._wire_client.fetch_manifest(manifest_type, uris, use_verify_header=is_fast_track) self._history.save_manifest(name, xml_text) return ExtensionManifest(xml_text) except Exception as e: diff --git a/azurelinuxagent/common/protocol/wire.py b/azurelinuxagent/common/protocol/wire.py index 38a3e0621d..ea397f0497 100644 --- a/azurelinuxagent/common/protocol/wire.py +++ b/azurelinuxagent/common/protocol/wire.py @@ -582,8 +582,8 @@ def call_storage_service(http_req, *args, **kwargs): def fetch_artifacts_profile_blob(self, uri): return self._fetch_content("artifacts profile blob", [uri], use_verify_header=False)[1] # _fetch_content returns a (uri, content) tuple - def fetch_manifest(self, uris, use_verify_header): - uri, content = self._fetch_content("manifest", uris, use_verify_header=use_verify_header) + def fetch_manifest(self, manifest_type, uris, use_verify_header): + uri, content = self._fetch_content("{0} manifest".format(manifest_type), uris, use_verify_header=use_verify_header) self.get_host_plugin().update_manifest_uri(uri) return content diff --git a/azurelinuxagent/ga/agent_update.py b/azurelinuxagent/ga/agent_update.py index 206a628eaa..3728e57ed8 100644 --- a/azurelinuxagent/ga/agent_update.py +++ b/azurelinuxagent/ga/agent_update.py @@ -29,6 +29,8 @@ def __init__(self, protocol): self._gs_id = self._protocol.get_goal_state().extensions_goal_state.id self._last_attempted_update_time = datetime.datetime.min self._last_attempted_update_version = FlexibleVersion("0.0.0.0") + self._last_warning = "" + self._last_warning_time = datetime.datetime.min def __should_update_agent(self, requested_version): """ @@ -170,15 +172,24 @@ def __get_all_agents_on_disk(): path = os.path.join(conf.get_lib_dir(), "{0}-*".format(AGENT_NAME)) return [GuestAgent.from_installed_agent(path=agent_dir) for agent_dir in glob.iglob(path) if os.path.isdir(agent_dir)] - @staticmethod - def __log_event(level, msg_, success_=True): - if level == LogLevel.WARNING: - logger.warn(msg_) - elif level == LogLevel.ERROR: - logger.error(msg_) - elif level == LogLevel.INFO: + def __log_event(self, level, msg_, success_=True): + if level == LogLevel.INFO: logger.info(msg_) - add_event(op=WALAEventOperation.AgentUpgrade, is_success=success_, message=msg_, log_event=False) + add_event(op=WALAEventOperation.AgentUpgrade, is_success=success_, message=msg_, log_event=False) + else: + msg_ += "[NOTE: Will not log the same error for the next 6 hours]" + # Incarnation may change if we get new goal state that would make whole string unique every time. So comparing only the substring until Incarnation if Incarnation included in msg + # Example msg "Unable to update Agent: No manifest links found for agent family: Prod for incarnation: incarnation_1, skipping agent update" + prefix_msg = msg_.split("incarnation", 1)[0] + prefix_last_warning_msg = self._last_warning.split("incarnation", 1)[0] + if prefix_msg != prefix_last_warning_msg or self._last_warning_time == datetime.datetime.min or datetime.datetime.now() >= self._last_warning_time + datetime.timedelta(hours=6): + if level == LogLevel.WARNING: + logger.warn(msg_) + elif level == LogLevel.ERROR: + logger.error(msg_) + add_event(op=WALAEventOperation.AgentUpgrade, is_success=success_, message=msg_, log_event=False) + self._last_warning_time = datetime.datetime.now() + self._last_warning = msg_ def run(self, goal_state): try: diff --git a/tests/ga/test_agent_update.py b/tests/ga/test_agent_update.py index f484a1dae0..73339d7c3c 100644 --- a/tests/ga/test_agent_update.py +++ b/tests/ga/test_agent_update.py @@ -309,3 +309,29 @@ def get_handler(url, **kwargs): self.assertEqual(VMAgentUpdateStatuses.Error, vm_agent_update_status.status) self.assertEqual(1, vm_agent_update_status.code) self.assertIn("Missing requested version", vm_agent_update_status.message) + + def test_it_should_not_log_same_error_next_hours(self): + data_file = DATA_FILE.copy() + data_file["ext_conf"] = "wire/ext_conf_missing_family.xml" + + # Set the test environment by adding 20 random agents to the agent directory + self.prepare_agents() + self.assertEqual(20, self.agent_count(), "Agent directories not set properly") + + with self.__get_agent_update_handler(test_data=data_file) as (agent_update_handler, mock_telemetry): + agent_update_handler.run(agent_update_handler._protocol.get_goal_state()) + + self.assertFalse(os.path.exists(self.agent_dir("99999.0.0.0")), + "New agent directory should not be found") + + self.assertEqual(1, len([kwarg['message'] for _, kwarg in mock_telemetry.call_args_list if + "No manifest links found for agent family" in kwarg[ + 'message'] and kwarg[ + 'op'] == WALAEventOperation.AgentUpgrade]), "Agent manifest should not be in GS") + + agent_update_handler.run(agent_update_handler._protocol.get_goal_state()) + + self.assertEqual(1, len([kwarg['message'] for _, kwarg in mock_telemetry.call_args_list if + "No manifest links found for agent family" in kwarg[ + 'message'] and kwarg[ + 'op'] == WALAEventOperation.AgentUpgrade]), "Agent manifest should not be in GS") \ No newline at end of file diff --git a/tests/ga/test_extension.py b/tests/ga/test_extension.py index 2272a1907b..a6604dcb32 100644 --- a/tests/ga/test_extension.py +++ b/tests/ga/test_extension.py @@ -3185,7 +3185,7 @@ def manifest_location_handler(url, **kwargs): wire._DOWNLOAD_TIMEOUT = datetime.timedelta(minutes=0) try: with self.assertRaises(ExtensionDownloadError): - protocol.client.fetch_manifest(ext_handlers[0].manifest_uris, use_verify_header=False) + protocol.client.fetch_manifest("extension", ext_handlers[0].manifest_uris, use_verify_header=False) finally: wire._DOWNLOAD_TIMEOUT = download_timeout diff --git a/tests/protocol/test_wire.py b/tests/protocol/test_wire.py index de365623b9..f771a1e607 100644 --- a/tests/protocol/test_wire.py +++ b/tests/protocol/test_wire.py @@ -684,7 +684,7 @@ def http_get_handler(url, *_, **__): with mock_wire_protocol(mockwiredata.DATA_FILE, http_get_handler=http_get_handler) as protocol: HostPluginProtocol.is_default_channel = False - manifest = protocol.client.fetch_manifest([manifest_url], use_verify_header=False) + manifest = protocol.client.fetch_manifest("test", [manifest_url], use_verify_header=False) urls = protocol.get_tracked_urls() self.assertEqual(manifest, manifest_xml, 'The expected manifest was not downloaded') @@ -707,7 +707,7 @@ def http_get_handler(url, *_, **kwargs): HostPluginProtocol.is_default_channel = False try: - manifest = protocol.client.fetch_manifest([manifest_url], use_verify_header=False) + manifest = protocol.client.fetch_manifest("test", [manifest_url], use_verify_header=False) urls = protocol.get_tracked_urls() self.assertEqual(manifest, manifest_xml, 'The expected manifest was not downloaded') @@ -744,7 +744,7 @@ def http_get_handler(url, *_, **kwargs): protocol.client.get_host_plugin() protocol.set_http_handlers(http_get_handler=http_get_handler) - manifest = protocol.client.fetch_manifest([manifest_url], use_verify_header=False) + manifest = protocol.client.fetch_manifest("test", [manifest_url], use_verify_header=False) urls = protocol.get_tracked_urls() self.assertEqual(manifest, manifest_xml) @@ -778,7 +778,7 @@ def http_get_handler(url, *_, **kwargs): protocol.set_http_handlers(http_get_handler=http_get_handler) with self.assertRaises(ExtensionDownloadError): - protocol.client.fetch_manifest([manifest_url], use_verify_header=False) + protocol.client.fetch_manifest("test", [manifest_url], use_verify_header=False) urls = protocol.get_tracked_urls() self.assertEqual(len(urls), 4, "Unexpected number of HTTP requests: [{0}]".format(urls)) diff --git a/tests_e2e/pipeline/pipeline.yml b/tests_e2e/pipeline/pipeline.yml index fe532d6d74..9fdfe07f7d 100644 --- a/tests_e2e/pipeline/pipeline.yml +++ b/tests_e2e/pipeline/pipeline.yml @@ -9,7 +9,7 @@ parameters: - name: test_suites displayName: Test Suites type: string - default: agent_bvt + default: agent_bvt,agent_update # NOTES: # * 'image', 'location' and 'vm_size' override any values in the test suites/images definition