From 2f6b4467c717dc7d404a8e3cabdb2b8dccb8eb00 Mon Sep 17 00:00:00 2001 From: Laveesh Rohra Date: Fri, 14 Jan 2022 02:05:04 +0530 Subject: [PATCH] GA Versioning: Download required version if available (#2467) --- azurelinuxagent/ga/update.py | 139 ++++++++++---- .../ext_conf_missing_requested_version.xml | 38 ++++ tests/data/wire/ga_manifest.xml | 6 + tests/ga/test_update.py | 174 +++++++++++++++++- tests/protocol/mockwiredata.py | 1 + 5 files changed, 324 insertions(+), 34 deletions(-) create mode 100644 tests/data/wire/ext_conf_missing_requested_version.xml diff --git a/azurelinuxagent/ga/update.py b/azurelinuxagent/ga/update.py index 293dd2313a..83db4be7a1 100644 --- a/azurelinuxagent/ga/update.py +++ b/azurelinuxagent/ga/update.py @@ -46,7 +46,7 @@ from azurelinuxagent.common.exception import ResourceGoneError, UpdateError, ExitException, AgentUpgradeExitException from azurelinuxagent.common.future import ustr from azurelinuxagent.common.osutil import get_osutil, systemd -from azurelinuxagent.common.protocol.restapi import VMAgentUpdateStatus, VMAgentUpdateStatuses +from azurelinuxagent.common.protocol.restapi import VMAgentUpdateStatus, VMAgentUpdateStatuses, ExtHandlerPackageList from azurelinuxagent.common.protocol.util import get_protocol_util from azurelinuxagent.common.protocol.hostplugin import HostPluginProtocol from azurelinuxagent.common.utils import shellutil @@ -449,6 +449,15 @@ def _try_update_goal_state(self, protocol): return False return True + def __goal_state_updated(self, incarnation): + """ + This function returns if the Goal State updated. + We currently rely on the incarnation number to determine that; i.e. if it changed from the last processed GS + """ + # TODO: This check should be based on the ExtensionsGoalState.id property + # (this property abstracts incarnation/etag logic based on the delivery pipeline of the Goal State) + return incarnation != self.last_incarnation + def _process_goal_state(self, exthandlers_handler, remote_access_handler): protocol = exthandlers_handler.protocol if not self._try_update_goal_state(protocol): @@ -479,7 +488,7 @@ def _process_goal_state(self, exthandlers_handler, remote_access_handler): incarnation = protocol.get_incarnation() try: - if incarnation != self.last_incarnation: # TODO: This check should be based in the etag for the extensions goal state + if self.__goal_state_updated(incarnation): if not self._extensions_summary.converged: message = "A new goal state was received, but not all the extensions in the previous goal state have completed: {0}".format(self._extensions_summary) logger.warn(message) @@ -491,9 +500,9 @@ def _process_goal_state(self, exthandlers_handler, remote_access_handler): # report status always, even if the goal state did not change # do it before processing the remote access, since that operation can take a long time - self._report_status(exthandlers_handler, incarnation_changed=incarnation != self.last_incarnation) + self._report_status(exthandlers_handler, incarnation_changed=self.__goal_state_updated(incarnation)) - if incarnation != self.last_incarnation: + if self.__goal_state_updated(incarnation): remote_access_handler.run() finally: self.last_incarnation = incarnation @@ -903,7 +912,9 @@ def _shutdown(self): def _check_and_download_agent_if_upgrade_available(self, protocol, base_version=CURRENT_VERSION): """ - This function periodically (1hr by default) checks if new Agent upgrade is available and downloads it on filesystem if it is. + This function downloads the new agent if an update is available. + If a requested version is available in goal state, then only that version is downloaded (new-update model) + Else, we periodically (1hr by default) checks if new Agent upgrade is available and download it on filesystem if available (old-update model) rtype: Boolean return: True if current agent is no longer available or an agent with a higher version number is available else False @@ -912,40 +923,103 @@ def _check_and_download_agent_if_upgrade_available(self, protocol, base_version= if not conf.get_autoupdate_enabled(): return False - now = time.time() - if self.last_attempt_time is not None: - next_attempt_time = self.last_attempt_time + conf.get_autoupdate_frequency() - else: - next_attempt_time = now - if next_attempt_time > now: - return False + def report_error(msg_, version=CURRENT_VERSION): + logger.warn(msg_) + add_event(AGENT_NAME, op=WALAEventOperation.Download, version=version, is_success=False, message=msg_) family = conf.get_autoupdate_gafamily() - logger.info("Checking for agent updates (family: {0})", family) - - self.last_attempt_time = now - + incarnation_changed = False try: - manifest_list, etag = protocol.get_vmagent_manifests() - + # Fetch the agent manifests from the latest Goal State + manifest_list, incarnation = protocol.get_vmagent_manifests() + incarnation_changed = self.__goal_state_updated(incarnation) manifests = [m for m in manifest_list if m.family == family and len(m.uris) > 0] if len(manifests) == 0: - logger.verbose(u"Incarnation {0} has no {1} agent updates", - etag, family) + logger.verbose( + u"No manifest links found for agent family: {0} for incarnation: {1}, skipping update check".format( + family, incarnation)) return False + except Exception as err: + # If there's some issues in fetching the agent manifests, report it only on incarnation change + if incarnation_changed: + msg = u"Exception retrieving agent manifests: {0}".format(textutil.format_exception(err)) + report_error(msg) + return False - pkg_list = protocol.get_vmagent_pkgs(manifests[0]) + requested_version = None + if conf.get_enable_ga_versioning() and manifests[0].is_requested_version_specified: + # If GA versioning is enabled and requested version present in GS, and it's a new GS, follow new logic + if incarnation_changed: + # With the new model, we will get a new GS when CRP wants us to auto-update using required version. + # If there's no new incarnation, don't proceed with anything + requested_version = manifests[0].requested_version + msg = "Found requested version in manifest: {0} for incarnation: {1}".format( + requested_version, incarnation) + logger.info(msg) + add_event(AGENT_NAME, op=WALAEventOperation.AgentUpgrade, is_success=True, message=msg) + else: + # If incarnation didn't change, don't process anything. + return False + else: + # If no requested version specified in the Goal State, follow the old auto-update logic + # Note: If the first Goal State contains a requested version, this timer won't start (i.e. self.last_attempt_time won't be updated). + # If any subsequent goal state does not contain requested version, this timer will start then, and we will + # download all versions available in PIR and auto-update to the highest available version on that goal state. + now = time.time() + if self.last_attempt_time is not None: + next_attempt_time = self.last_attempt_time + conf.get_autoupdate_frequency() + else: + next_attempt_time = now + if next_attempt_time > now: + return False - # Set the agents to those available for download at least as - # current as the existing agent and remove from disk any agent - # no longer reported to the VM. - # Note: - # The code leaves on disk available, but blacklisted, agents - # so as to preserve the state. Otherwise, those agents could be - # again downloaded and inappropriately retried. + logger.info("No requested version specified, checking for all versions for agent update (family: {0})", + family) + + self.last_attempt_time = now + + try: + # If we make it to this point, then either there is a requested version in a new GS (new auto-update model), + # or the 1hr time limit has elapsed for us to check the agent manifest for updates (old auto-update model). + pkg_list = ExtHandlerPackageList() + + # If the requested version is the current version, don't download anything; + # the call to purge() below will delete all other agents from disk + # In this case, no need to even fetch the GA family manifest as we don't need to download any agent. + if requested_version is not None and requested_version == CURRENT_VERSION: + packages_to_download = [] + logger.info("The requested version is running as the current version: {0}".format(requested_version)) + else: + pkg_list = protocol.get_vmagent_pkgs(manifests[0]) + packages_to_download = pkg_list.versions + + # Verify the requested version is in GA family manifest (if specified) + if requested_version is not None and requested_version != CURRENT_VERSION: + package_found = False + for pkg in pkg_list.versions: + if FlexibleVersion(pkg.version) == requested_version: + # Found a matching package, only download that one + packages_to_download = [pkg] + package_found = True + break + + if not package_found: + msg = "No matching package found in the agent manifest for requested version: {0} in incarnation: {1}, skipping agent update".format( + requested_version, incarnation) + report_error(msg, version=requested_version) + return False + + # Set the agents to those available for download at least as current as the existing agent + # or to the requested version (if specified) host = self._get_host_plugin(protocol=protocol) - self._set_agents([GuestAgent(pkg=pkg, host=host) for pkg in pkg_list.versions]) + self._set_agents([GuestAgent(pkg=pkg, host=host) for pkg in packages_to_download]) + # Remove from disk any agent no longer needed in the VM. + # If requested version is provided, this would delete all other agents present on the VM except the + # current one and the requested one if they're different, and only the current one if same. + # Note: + # The code leaves on disk available, but blacklisted, agents to preserve the state. + # Otherwise, those agents could be downloaded again and inappropriately retried. self._purge_agents() self._filter_blacklisted_agents() @@ -954,10 +1028,9 @@ def _check_and_download_agent_if_upgrade_available(self, protocol, base_version= return not self._is_version_eligible(base_version) \ or (len(self.agents) > 0 and self.agents[0].version > base_version) - except Exception as e: # pylint: disable=W0612 - msg = u"Exception retrieving agent manifests: {0}".format(textutil.format_exception(e)) - add_event(AGENT_NAME, op=WALAEventOperation.Download, version=CURRENT_VERSION, is_success=False, - message=msg) + except Exception as err: + msg = u"Exception downloading agents for update: {0}".format(textutil.format_exception(err)) + report_error(msg) return False def _write_pid_file(self): diff --git a/tests/data/wire/ext_conf_missing_requested_version.xml b/tests/data/wire/ext_conf_missing_requested_version.xml new file mode 100644 index 0000000000..e68fcaf995 --- /dev/null +++ b/tests/data/wire/ext_conf_missing_requested_version.xml @@ -0,0 +1,38 @@ + + + + + Prod + 5.2.1.0 + + http://mock-goal-state/manifest_of_ga.xml + + + + Test + 5.2.1.0 + + http://mock-goal-state/manifest_of_ga.xml + + + + + + + + + + + {"runtimeSettings":[{"handlerSettings":{"protectedSettingsCertThumbprint":"4037FBF5F1F3014F99B5D6C7799E9B20E6871CB3","protectedSettings":"MIICWgYJK","publicSettings":{"foo":"bar"}}}]} + + + + + https://test.blob.core.windows.net/vhds/test-cs12.test-cs12.test-cs12.status?sr=b&sp=rw&se=9999-01-01&sk=key1&sv=2014-02-14&sig=hfRh7gzUE7sUtYwke78IOlZOrTRCYvkec4hGZ9zZzXo + + + diff --git a/tests/data/wire/ga_manifest.xml b/tests/data/wire/ga_manifest.xml index 48a1f4f94c..e12f054916 100644 --- a/tests/data/wire/ga_manifest.xml +++ b/tests/data/wire/ga_manifest.xml @@ -25,6 +25,12 @@ 2.1.0http://mock-goal-state/ga-manifests/OSTCExtensions.WALinuxAgent__2.1.0 + + 9.9.9.10 + + http://mock-goal-state/ga-manifests/OSTCExtensions.WALinuxAgent__99999.0.0.0 + + 99999.0.0.0http://mock-goal-state/ga-manifests/OSTCExtensions.WALinuxAgent__99999.0.0.0 diff --git a/tests/ga/test_update.py b/tests/ga/test_update.py index 9ab3f4bfde..e40c05eaf3 100644 --- a/tests/ga/test_update.py +++ b/tests/ga/test_update.py @@ -1918,7 +1918,8 @@ def create_conf_mocks(self, hotfix_frequency, normal_frequency): return_value=hotfix_frequency): with patch("azurelinuxagent.common.conf.get_normal_upgrade_frequency", return_value=normal_frequency): - yield + with patch("azurelinuxagent.common.conf.get_autoupdate_gafamily", return_value="Prod"): + yield @contextlib.contextmanager def __get_update_handler(self, iterations=1, test_data=None, hotfix_frequency=1.0, normal_frequency=2.0, @@ -1934,12 +1935,14 @@ def get_handler(url, **kwargs): if HttpRequestPredicates.is_agent_package_request(url): agent_pkg = load_bin_data(self._get_agent_file_name(), self._agent_zip_dir) + protocol.mock_wire_data.call_counts['agentArtifact'] += 1 return ResponseMock(response=agent_pkg) return protocol.mock_wire_data.mock_http_get(url, **kwargs) protocol.set_http_handlers(http_get_handler=get_handler) with self.create_conf_mocks(hotfix_frequency, normal_frequency): with patch("azurelinuxagent.ga.update.add_event") as mock_telemetry: + update_handler._protocol = protocol yield update_handler, mock_telemetry def __assert_exit_code_successful(self, exit_mock): @@ -1958,6 +1961,13 @@ def __assert_agent_directories_available(self, versions): for version in versions: self.assertTrue(os.path.exists(self.agent_dir(version)), "Agent directory {0} not found".format(version)) + def __assert_agent_directories_exist_and_others_dont_exist(self, versions): + self.__assert_agent_directories_available(versions=versions) + other_agents = [agent_dir for agent_dir in self.agent_dirs() if + agent_dir not in [self.agent_dir(version) for version in versions]] + self.assertFalse(any(other_agents), + "All other agents should be purged from agent dir: {0}".format(other_agents)) + def __assert_no_agent_upgrade_telemetry(self, mock_telemetry): self.assertEqual(0, len([kwarg['message'] for _, kwarg in mock_telemetry.call_args_list if "Agent upgrade discovered, updating to" in kwarg['message'] and kwarg[ @@ -2073,6 +2083,168 @@ def reload_conf(url, mock_wire_data): 'Discovered new {0} upgrade WALinuxAgent-99999.0.0.0; Will upgrade on or after'.format( AgentUpgradeType.Normal) in msg]), 0, "Error message not propagated properly") + def test_it_should_download_only_requested_version_if_available(self): + data_file = mockwiredata.DATA_FILE.copy() + data_file["ext_conf"] = "wire/ext_conf_requested_version.xml" + with self.__get_update_handler(test_data=data_file) as (update_handler, mock_telemetry): + with patch.object(conf, "get_enable_ga_versioning", return_value=True): + update_handler.run(debug=True) + + self.__assert_exit_code_successful(update_handler.exit_mock) + upgrade_event_msgs = [kwarg['message'] for _, kwarg in mock_telemetry.call_args_list if + 'Agent upgrade discovered, updating to WALinuxAgent-9.9.9.10 -- exiting' in kwarg[ + 'message'] and kwarg['op'] == WALAEventOperation.AgentUpgrade] + self.assertEqual(1, len(upgrade_event_msgs), + "Did not find the event indicating that the agent was upgraded. Got: {0}".format( + mock_telemetry.call_args_list)) + self.__assert_agent_directories_exist_and_others_dont_exist(versions=["9.9.9.10"]) + + def test_it_should_cleanup_all_agents_except_requested_version_and_current_version(self): + data_file = mockwiredata.DATA_FILE.copy() + data_file["ext_conf"] = "wire/ext_conf_requested_version.xml" + + # Set the test environment by adding 20 random agents to the agent directory + self.prepare_agents() + self.assertEqual(20, self.agent_count(), "Agent directories not set properly") + + with self.__get_update_handler(test_data=data_file) as (update_handler, mock_telemetry): + with patch.object(conf, "get_enable_ga_versioning", return_value=True): + update_handler.run(debug=True) + + self.__assert_exit_code_successful(update_handler.exit_mock) + upgrade_event_msgs = [kwarg['message'] for _, kwarg in mock_telemetry.call_args_list if + 'Agent upgrade discovered, updating to WALinuxAgent-9.9.9.10 -- exiting' in kwarg[ + 'message'] and kwarg['op'] == WALAEventOperation.AgentUpgrade] + self.assertEqual(1, len(upgrade_event_msgs), "Agent not upgraded properly") + self.__assert_agent_directories_exist_and_others_dont_exist(versions=["9.9.9.10", str(CURRENT_VERSION)]) + + def test_it_should_not_update_if_requested_version_not_found_in_manifest(self): + data_file = mockwiredata.DATA_FILE.copy() + data_file["ext_conf"] = "wire/ext_conf_missing_requested_version.xml" + with self.__get_update_handler(test_data=data_file) as (update_handler, mock_telemetry): + with patch.object(conf, "get_enable_ga_versioning", return_value=True): + update_handler.run(debug=True) + + self.__assert_exit_code_successful(update_handler.exit_mock) + self.__assert_no_agent_upgrade_telemetry(mock_telemetry) + agent_msgs = [kwarg for _, kwarg in mock_telemetry.call_args_list if + kwarg['op'] in (WALAEventOperation.AgentUpgrade, WALAEventOperation.Download)] + # This will throw if corresponding message not found so not asserting on that + requested_version_found = next(kwarg for kwarg in agent_msgs if + "Found requested version in manifest: 5.2.1.0 for incarnation: 1" in kwarg['message']) + self.assertTrue(requested_version_found['is_success'], + "The requested version found op should be reported as a success") + + skipping_update = next(kwarg for kwarg in agent_msgs if + "No matching package found in the agent manifest for requested version: 5.2.1.0 in incarnation: 1, skipping agent update" in kwarg['message']) + self.assertEqual(skipping_update['version'], FlexibleVersion("5.2.1.0"), + "The not found message should be reported from requested agent version") + self.assertFalse(skipping_update['is_success'], "The not found op should be reported as a failure") + + def test_it_should_only_try_downloading_requested_version_on_new_incarnation(self): + no_of_iterations = 1000 + + # Set the test environment by adding 20 random agents to the agent directory + self.prepare_agents() + self.assertEqual(20, self.agent_count(), "Agent directories not set properly") + + def reload_conf(url, mock_wire_data): + # This function reloads the conf mid-run to mimic an actual customer scenario + + if HttpRequestPredicates.is_goal_state_request(url) and mock_wire_data.call_counts[ + "goalstate"] >= 10 and mock_wire_data.call_counts["goalstate"] < 15: + + # Ensure we didn't try to download any agents except during the incarnation change + self.__assert_agent_directories_exist_and_others_dont_exist(versions=[str(CURRENT_VERSION)]) + + # Update the requested version to "99999.0.0.0" + update_handler._protocol.mock_wire_data.set_extension_config_requested_version("99999.0.0.0") + reload_conf.call_count += 1 + self._add_write_permission_to_goal_state_files() + reload_conf.incarnation += 1 + mock_wire_data.set_incarnation(reload_conf.incarnation) + + reload_conf.call_count = 0 + reload_conf.incarnation = 2 + + data_file = mockwiredata.DATA_FILE.copy() + data_file["ext_conf"] = "wire/ext_conf_requested_version.xml" + with self.__get_update_handler(iterations=no_of_iterations, test_data=data_file, reload_conf=reload_conf, + normal_frequency=0.01, hotfix_frequency=0.01) as (update_handler, mock_telemetry): + with patch.object(conf, "get_enable_ga_versioning", return_value=True): + update_handler._protocol.mock_wire_data.set_extension_config_requested_version(str(CURRENT_VERSION)) + update_handler._protocol.mock_wire_data.set_incarnation(2) + update_handler.run(debug=True) + + self.assertGreaterEqual(reload_conf.call_count, 1, "Reload conf not updated as expected") + self.__assert_exit_code_successful(update_handler.exit_mock) + self.__assert_upgrade_telemetry_emitted(mock_telemetry) + self.__assert_agent_directories_exist_and_others_dont_exist(versions=["99999.0.0.0", str(CURRENT_VERSION)]) + self.assertEqual(update_handler._protocol.mock_wire_data.call_counts['agentArtifact'], 1, + "only 1 agent should've been downloaded - 1 per incarnation") + self.assertEqual(update_handler._protocol.mock_wire_data.call_counts["manifest_of_ga.xml"], 1, + "only 1 agent manifest call should've been made - 1 per incarnation") + + def test_it_should_fallback_to_old_update_logic_if_requested_version_not_available(self): + no_of_iterations = 100 + + # Set the test environment by adding 20 random agents to the agent directory + self.prepare_agents() + self.assertEqual(20, self.agent_count(), "Agent directories not set properly") + + def reload_conf(url, mock_wire_data): + # This function reloads the conf mid-run to mimic an actual customer scenario + if HttpRequestPredicates.is_goal_state_request(url) and mock_wire_data.call_counts[ + "goalstate"] >= 5: + reload_conf.call_count += 1 + + # By this point, the GS with requested version should've been executed. Verify that + self.__assert_agent_directories_exist_and_others_dont_exist(versions=[str(CURRENT_VERSION)]) + + # Update the ext-conf and incarnation and remove requested versions from GS, + # this should download all versions requested in config + mock_wire_data.data_files["ext_conf"] = "wire/ext_conf.xml" + mock_wire_data.reload() + self._add_write_permission_to_goal_state_files() + reload_conf.incarnation += 1 + mock_wire_data.set_incarnation(reload_conf.incarnation) + + reload_conf.call_count = 0 + reload_conf.incarnation = 2 + + data_file = mockwiredata.DATA_FILE.copy() + data_file["ext_conf"] = "wire/ext_conf_requested_version.xml" + with self.__get_update_handler(iterations=no_of_iterations, test_data=data_file, reload_conf=reload_conf, + normal_frequency=0.001) as (update_handler, mock_telemetry): + with patch.object(conf, "get_enable_ga_versioning", return_value=True): + update_handler._protocol.mock_wire_data.set_extension_config_requested_version(str(CURRENT_VERSION)) + update_handler._protocol.mock_wire_data.set_incarnation(2) + update_handler.run(debug=True) + + self.assertGreater(reload_conf.call_count, 0, "Reload conf not updated") + self.__assert_exit_code_successful(update_handler.exit_mock) + self.__assert_upgrade_telemetry_emitted(mock_telemetry) + self.__assert_agent_directories_exist_and_others_dont_exist( + versions=["1.0.0", "1.1.0", "1.2.0", "2.0.0", "2.1.0", "9.9.9.10", "99999.0.0.0", str(CURRENT_VERSION)]) + + def test_it_should_not_download_anything_if_requested_version_is_current_version_and_delete_all_agents(self): + data_file = mockwiredata.DATA_FILE.copy() + data_file["ext_conf"] = "wire/ext_conf_requested_version.xml" + + # Set the test environment by adding 20 random agents to the agent directory + self.prepare_agents() + self.assertEqual(20, self.agent_count(), "Agent directories not set properly") + + with self.__get_update_handler(test_data=data_file) as (update_handler, mock_telemetry): + with patch.object(conf, "get_enable_ga_versioning", return_value=True): + update_handler._protocol.mock_wire_data.set_extension_config_requested_version(str(CURRENT_VERSION)) + update_handler._protocol.mock_wire_data.set_incarnation(2) + update_handler.run(debug=True) + + self.__assert_exit_code_successful(update_handler.exit_mock) + self.__assert_no_agent_upgrade_telemetry(mock_telemetry) + self.__assert_agent_directories_exist_and_others_dont_exist(versions=[str(CURRENT_VERSION)]) + @patch('azurelinuxagent.ga.update.get_collect_telemetry_events_handler') @patch('azurelinuxagent.ga.update.get_send_telemetry_events_handler') diff --git a/tests/protocol/mockwiredata.py b/tests/protocol/mockwiredata.py index b49f91569b..d7d6a09391 100644 --- a/tests/protocol/mockwiredata.py +++ b/tests/protocol/mockwiredata.py @@ -140,6 +140,7 @@ def __init__(self, data_files=None): "extensionsconfiguri": 0, "remoteaccessinfouri": 0, "extensionArtifact": 0, + "agentArtifact": 0, "manifest.xml": 0, "manifest_of_ga.xml": 0, "ExampleHandlerLinux": 0,