From a2ce6c80f3d7251cf2242c6d2b05967eb80fe5ef Mon Sep 17 00:00:00 2001 From: Nageswara Nandigam <84482346+nagworld9@users.noreply.github.com> Date: Wed, 20 Apr 2022 16:49:32 -0700 Subject: [PATCH 01/13] enforce extension cpu limits (#2555) * enforce ext cpu limits * fix UTs --- azurelinuxagent/common/cgroupconfigurator.py | 22 ++++++++++++++++---- azurelinuxagent/ga/exthandlers.py | 6 +++--- tests/common/test_cgroupconfigurator.py | 22 +++++++++++++++++--- 3 files changed, 40 insertions(+), 10 deletions(-) diff --git a/azurelinuxagent/common/cgroupconfigurator.py b/azurelinuxagent/common/cgroupconfigurator.py index 866e4f866f..7dc0a80a99 100644 --- a/azurelinuxagent/common/cgroupconfigurator.py +++ b/azurelinuxagent/common/cgroupconfigurator.py @@ -56,6 +56,7 @@ Before=slices.target [Slice] CPUAccounting=yes +CPUQuota={cpu_quota} """ LOGCOLLECTOR_SLICE = "azure-walinuxagent-logcollector.slice" # More info on resource limits properties in systemd here: @@ -762,21 +763,22 @@ def start_extension_command(self, extension_name, command, cmd_name, timeout, sh process = subprocess.Popen(command, shell=shell, cwd=cwd, env=env, stdout=stdout, stderr=stderr, preexec_fn=os.setsid) # pylint: disable=W1509 return handle_process_completion(process=process, command=command, timeout=timeout, stdout=stdout, stderr=stderr, error_code=error_code) - def setup_extension_slice(self, extension_name): + def setup_extension_slice(self, extension_name, cpu_quota): """ Each extension runs under its own slice (Ex "Microsoft.CPlat.Extension.slice"). All the slices for extensions are grouped under "azure-vmextensions.slice. This method ensures that the extension slice is created. Setup should create under /lib/systemd/system if it is not exist. - TODO: set cpu and memory quotas + TODO: set memory quotas """ if self.enabled(): unit_file_install_path = systemd.get_unit_file_install_path() extension_slice_path = os.path.join(unit_file_install_path, SystemdCgroupsApi.get_extension_slice_name(extension_name)) try: - slice_contents = _EXTENSION_SLICE_CONTENTS.format(extension_name=extension_name) + cpu_quota = str(cpu_quota) + "%" if cpu_quota is not None else "" + slice_contents = _EXTENSION_SLICE_CONTENTS.format(extension_name=extension_name, cpu_quota=cpu_quota) CGroupConfigurator._Impl.__create_unit_file(extension_slice_path, slice_contents) except Exception as exception: _log_cgroup_warning("Failed to create unit files for the extension slice: {0}", ustr(exception)) @@ -800,7 +802,7 @@ def set_extension_services_cpu_memory_quota(self, services_list): Each extension service will have name, systemd path and it's quotas. This method ensures that drop-in files are created under service.d folder if quotas given. ex: /lib/systemd/system/extension.service.d/11-CPUAccounting.conf - TODO: set cpu and memory quotas + TODO: set memory quotas """ if self.enabled() and services_list is not None: for service in services_list: @@ -813,6 +815,13 @@ def set_extension_services_cpu_memory_quota(self, services_list): _DROP_IN_FILE_CPU_ACCOUNTING) files_to_create.append((drop_in_file_cpu_accounting, _DROP_IN_FILE_CPU_ACCOUNTING_CONTENTS)) + cpu_quota = service.get('cpuQuotaPercentage', None) + if cpu_quota is not None: + cpu_quota = str(cpu_quota) + "%" + drop_in_file_cpu_quota = os.path.join(drop_in_path, _DROP_IN_FILE_CPU_QUOTA) + cpu_quota_contents = _DROP_IN_FILE_CPU_QUOTA_CONTENTS_FORMAT.format(cpu_quota) + files_to_create.append((drop_in_file_cpu_quota, cpu_quota_contents)) + self.__create_all_files(files_to_create) # reload the systemd configuration; the new unit will be used once the service restarts @@ -836,6 +845,11 @@ def remove_extension_services_drop_in_files(self, services_list): drop_in_file_cpu_accounting = os.path.join(drop_in_path, _DROP_IN_FILE_CPU_ACCOUNTING) files_to_cleanup.append(drop_in_file_cpu_accounting) + cpu_quota = service.get('cpuQuotaPercentage', None) + if cpu_quota is not None: + drop_in_file_cpu_quota = os.path.join(drop_in_path, _DROP_IN_FILE_CPU_QUOTA) + files_to_cleanup.append(drop_in_file_cpu_quota) + CGroupConfigurator._Impl.__cleanup_all_files(files_to_cleanup) _log_cgroup_info("Drop in files removed for {0}".format(service_name)) diff --git a/azurelinuxagent/ga/exthandlers.py b/azurelinuxagent/ga/exthandlers.py index 13d0ca3e09..edbb9c3db0 100644 --- a/azurelinuxagent/ga/exthandlers.py +++ b/azurelinuxagent/ga/exthandlers.py @@ -1361,7 +1361,7 @@ def set_extension_resource_limits(self): man = self.load_manifest() resource_limits = man.get_resource_limits(extension_name, self.ext_handler.version) CGroupConfigurator.get_instance().setup_extension_slice( - extension_name=extension_name) + extension_name=extension_name, cpu_quota=resource_limits.get_extension_slice_cpu_quota()) CGroupConfigurator.get_instance().set_extension_services_cpu_memory_quota(resource_limits.get_service_list()) def create_status_file_if_not_exist(self, extension, status, code, operation, message): @@ -2318,12 +2318,12 @@ def __init__(self, data): def get_extension_slice_cpu_quota(self): if self.data is not None: - return self.data.get('cpuQuota', None) + return self.data.get('cpuQuotaPercentage', None) return None def get_extension_slice_memory_quota(self): if self.data is not None: - return self.data.get('memoryQuota', None) + return self.data.get('memoryQuotaInMB', None) return None def get_service_list(self): diff --git a/tests/common/test_cgroupconfigurator.py b/tests/common/test_cgroupconfigurator.py index 56aa3d48ed..6a945c5e18 100644 --- a/tests/common/test_cgroupconfigurator.py +++ b/tests/common/test_cgroupconfigurator.py @@ -186,14 +186,18 @@ def test_setup_extension_slice_should_create_unit_files(self): # get the paths to the mocked files extension_slice_unit_file = configurator.mocks.get_mapped_path(UnitFilePaths.extensionslice) - configurator.setup_extension_slice(extension_name="Microsoft.CPlat.Extension") + configurator.setup_extension_slice(extension_name="Microsoft.CPlat.Extension", cpu_quota=5) expected_cpu_accounting = "CPUAccounting=yes" + expected_cpu_quota_percentage = "5%" self.assertTrue(os.path.exists(extension_slice_unit_file), "{0} was not created".format(extension_slice_unit_file)) self.assertTrue(fileutil.findre_in_file(extension_slice_unit_file, expected_cpu_accounting), "CPUAccounting was not set correctly. Expected: {0}. Got:\n{1}".format(expected_cpu_accounting, fileutil.read_file( extension_slice_unit_file))) + self.assertTrue(fileutil.findre_in_file(extension_slice_unit_file, expected_cpu_quota_percentage), + "CPUQuota was not set correctly. Expected: {0}. Got:\n{1}".format(expected_cpu_quota_percentage, fileutil.read_file( + extension_slice_unit_file))) def test_remove_extension_slice_should_remove_unit_files(self): with self._get_cgroup_configurator() as configurator: @@ -583,15 +587,18 @@ def mock_popen(command, *args, **kwargs): def test_it_should_set_extension_services_cpu_memory_quota(self): service_list = [ { - "name": "extension.service" + "name": "extension.service", + "cpuQuotaPercentage": 5 } ] with self._get_cgroup_configurator() as configurator: # get the paths to the mocked files extension_service_cpu_accounting = configurator.mocks.get_mapped_path(UnitFilePaths.extension_service_cpu_accounting) + extension_service_cpu_quota = configurator.mocks.get_mapped_path(UnitFilePaths.extension_service_cpu_quota) configurator.set_extension_services_cpu_memory_quota(service_list) expected_cpu_accounting = "CPUAccounting=yes" + expected_cpu_quota_percentage = "CPUQuota=5%" # create drop in files to set those properties self.assertTrue(os.path.exists(extension_service_cpu_accounting), "{0} was not created".format(extension_service_cpu_accounting)) @@ -599,6 +606,11 @@ def test_it_should_set_extension_services_cpu_memory_quota(self): fileutil.findre_in_file(extension_service_cpu_accounting, expected_cpu_accounting), "CPUAccounting was not enabled. Expected: {0}. Got:\n{1}".format(expected_cpu_accounting, fileutil.read_file(extension_service_cpu_accounting))) + self.assertTrue(os.path.exists(extension_service_cpu_quota), "{0} was not created".format(extension_service_cpu_quota)) + self.assertTrue( + fileutil.findre_in_file(extension_service_cpu_quota, expected_cpu_quota_percentage), + "CPUQuota was not set. Expected: {0}. Got:\n{1}".format(expected_cpu_quota_percentage, fileutil.read_file(extension_service_cpu_quota))) + def test_it_should_set_extension_services_when_quotas_not_defined(self): service_list = [ { @@ -654,15 +666,19 @@ def test_it_should_stop_tracking_extension_services_cgroups(self): def test_it_should_remove_extension_services_drop_in_files(self): service_list = [ { - "name": "extension.service" + "name": "extension.service", + "cpuQuotaPercentage": 5 } ] with self._get_cgroup_configurator() as configurator: extension_service_cpu_accounting = configurator.mocks.get_mapped_path( UnitFilePaths.extension_service_cpu_accounting) + extension_service_cpu_quota = configurator.mocks.get_mapped_path(UnitFilePaths.extension_service_cpu_quota) configurator.remove_extension_services_drop_in_files(service_list) self.assertFalse(os.path.exists(extension_service_cpu_accounting), "{0} should not have been created".format(extension_service_cpu_accounting)) + self.assertFalse(os.path.exists(extension_service_cpu_quota), + "{0} should not have been created".format(extension_service_cpu_quota)) def test_it_should_start_tracking_unit_cgroups(self): From d6678b4055b525d919ee5e37d666e62503bfa8d3 Mon Sep 17 00:00:00 2001 From: Norberto Arrieta Date: Tue, 26 Apr 2022 11:55:03 -0700 Subject: [PATCH 02/13] Improvements in waagent.log (#2558) (#2564) * Improvements in waagent.log * fix function call * update comment * typo Co-authored-by: narrieta (cherry picked from commit fa8e3708328de50c899e68b6cfc519c1bda40361) --- .../extensions_goal_state_from_vm_settings.py | 2 +- azurelinuxagent/common/protocol/goal_state.py | 29 +++++--- azurelinuxagent/common/utils/archive.py | 9 ++- azurelinuxagent/daemon/main.py | 2 +- azurelinuxagent/ga/exthandlers.py | 3 +- azurelinuxagent/ga/update.py | 36 ++++++---- .../ext_conf-requested_version.xml | 4 +- tests/data/hostgaplugin/ext_conf.xml | 4 +- ...tings-difference_in_required_features.json | 4 +- .../vm_settings-missing_cert.json | 68 +++++++++++++++++++ .../hostgaplugin/vm_settings-out-of-sync.json | 2 +- .../vm_settings-requested_version.json | 4 +- tests/data/hostgaplugin/vm_settings.json | 6 +- tests/protocol/test_goal_state.py | 22 +++++- tests/utils/test_archive.py | 3 +- 15 files changed, 154 insertions(+), 44 deletions(-) create mode 100644 tests/data/hostgaplugin/vm_settings-missing_cert.json diff --git a/azurelinuxagent/common/protocol/extensions_goal_state_from_vm_settings.py b/azurelinuxagent/common/protocol/extensions_goal_state_from_vm_settings.py index ce99a26079..38cca48f1b 100644 --- a/azurelinuxagent/common/protocol/extensions_goal_state_from_vm_settings.py +++ b/azurelinuxagent/common/protocol/extensions_goal_state_from_vm_settings.py @@ -289,7 +289,7 @@ def _parse_extensions(self, vm_settings): # "settingsSeqNo": 0, # "settings": [ # { - # "protectedSettingsCertThumbprint": "4C4F304667711036E64AF4894B76EB208A863BD4", + # "protectedSettingsCertThumbprint": "4037FBF5F1F3014F99B5D6C7799E9B20E6871CB3", # "protectedSettings": "MIIBsAYJKoZIhvcNAQcDoIIBoTCCAZ0CAQAxggFpMIIBZQIBADBNMDkxNzA1BgoJkiaJk/IsZAEZFidXaW5kb3dzIEF6dXJlIENSUCBDZXJ0aWZpY2F0ZSBHZW5lcmF0b3ICEFpB/HKM/7evRk+DBz754wUwDQYJKoZIhvcNAQEBBQAEggEADPJwniDeIUXzxNrZCloitFdscQ59Bz1dj9DLBREAiM8jmxM0LLicTJDUv272Qm/4ZQgdqpFYBFjGab/9MX+Ih2x47FkVY1woBkckMaC/QOFv84gbboeQCmJYZC/rZJdh8rCMS+CEPq3uH1PVrvtSdZ9uxnaJ+E4exTPPviIiLIPtqWafNlzdbBt8HZjYaVw+SSe+CGzD2pAQeNttq3Rt/6NjCzrjG8ufKwvRoqnrInMs4x6nnN5/xvobKIBSv4/726usfk8Ug+9Q6Benvfpmre2+1M5PnGTfq78cO3o6mI3cPoBUjp5M0iJjAMGeMt81tyHkimZrEZm6pLa4NQMOEjArBgkqhkiG9w0BBwEwFAYIKoZIhvcNAwcECC5nVaiJaWt+gAhgeYvxUOYHXw==", # "publicSettings": "{\"GCS_AUTO_CONFIG\":true}" # } diff --git a/azurelinuxagent/common/protocol/goal_state.py b/azurelinuxagent/common/protocol/goal_state.py index ae01e4d22c..6ec0a5ab64 100644 --- a/azurelinuxagent/common/protocol/goal_state.py +++ b/azurelinuxagent/common/protocol/goal_state.py @@ -18,6 +18,7 @@ import os import re import time +import json import azurelinuxagent.common.conf as conf import azurelinuxagent.common.logger as logger @@ -145,6 +146,7 @@ def update(self): return if vm_settings_updated: + logger.info('') logger.info("Fetched new vmSettings [HostGAPlugin correlation ID: {0} eTag: {1} source: {2}]", vm_settings.hostga_plugin_correlation_id, vm_settings.etag, vm_settings.source) # Ignore the vmSettings if their source is Fabric (processing a Fabric goal state may require the tenant certificate and the vmSettings don't include it.) if vm_settings is not None and vm_settings.source == GoalStateSource.Fabric: @@ -184,6 +186,17 @@ def update(self): if self._extensions_goal_state is None or most_recent.created_on_timestamp > self._extensions_goal_state.created_on_timestamp: self._extensions_goal_state = most_recent + # For Fast Track goal states, verify that the required certificates are in the goal state + if self.extensions_goal_state.source == GoalStateSource.FastTrack: + for extension in self.extensions_goal_state.extensions: + for settings in extension.settings: + if settings.protectedSettings is None: + continue + certificates = self.certs.summary + if not any(settings.certificateThumbprint == c['thumbprint'] for c in certificates): + message = "Certificate {0} needed by {1} is missing from the goal state".format(settings.certificateThumbprint, extension.name) + add_event(op=WALAEventOperation.VmSettings, message=message, is_success=False) + def _restore_wire_server_goal_state(self, incarnation, xml_text, xml_doc, vm_settings_support_stopped_error): logger.info('The HGAP stopped supporting vmSettings; will fetched the goal state from the WireServer.') self._history = GoalStateHistory(timeutil.create_timestamp(), incarnation) @@ -270,6 +283,7 @@ def _fetch_full_wire_server_goal_state(self, incarnation, xml_doc): Returns the value of ExtensionsConfig. """ try: + logger.info('') logger.info('Fetching full goal state from the WireServer [incarnation {0}]', incarnation) role_instance = find(xml_doc, "RoleInstance") @@ -300,9 +314,10 @@ def _fetch_full_wire_server_goal_state(self, incarnation, xml_doc): certs = None certs_uri = findtext(xml_doc, "Certificates") if certs_uri is not None: - # Note that we do not save the certificates to the goal state history xml_text = self._wire_client.fetch_config(certs_uri, self._wire_client.get_header_for_cert()) certs = Certificates(xml_text) + # Save the certificate summary, which includes only the thumbprint but not the certificate itself, to the goal state history + self._history.save_certificates(json.dumps(certs.summary)) remote_access = None remote_access_uri = findtext(container, "RemoteAccessInfo") @@ -349,6 +364,7 @@ def __init__(self, xml_text): class Certificates(object): def __init__(self, xml_text): self.cert_list = CertList() + self.summary = [] # debugging info # Save the certificates local_file = os.path.join(conf.get_lib_dir(), CERTS_FILE_NAME) @@ -428,18 +444,15 @@ def __init__(self, xml_text): tmp_file = prvs[pubkey] prv = "{0}.prv".format(thumbprint) os.rename(tmp_file, os.path.join(conf.get_lib_dir(), prv)) - logger.info("Found private key matching thumbprint {0}".format(thumbprint)) else: # Since private key has *no* matching certificate, # it will not be named correctly logger.warn("Found NO matching cert/thumbprint for private key!") - # Log if any certificates were found without matching private keys - # This can happen (rarely), and is useful to know for debugging - for pubkey in thumbprints: - if not pubkey in prvs: - msg = "Certificate with thumbprint {0} has no matching private key." - logger.info(msg.format(thumbprints[pubkey])) + for pubkey, thumbprint in thumbprints.items(): + has_private_key = pubkey in prvs + logger.info("Downloaded certificate with thumbprint {0} (has private key: {1})".format(thumbprint, has_private_key)) + self.summary.append({"thumbprint": thumbprint, "hasPrivateKey": has_private_key}) for v1_cert in v1_cert_list: cert = Cert() diff --git a/azurelinuxagent/common/utils/archive.py b/azurelinuxagent/common/utils/archive.py index ed8122e970..880a23a119 100644 --- a/azurelinuxagent/common/utils/archive.py +++ b/azurelinuxagent/common/utils/archive.py @@ -42,13 +42,14 @@ _MAX_ARCHIVED_STATES = 50 _CACHE_PATTERNS = [ - re.compile(r"^VmSettings.\d+\.json$"), + re.compile(r"^VmSettings\.\d+\.json$"), re.compile(r"^(.*)\.(\d+)\.(agentsManifest)$", re.IGNORECASE), re.compile(r"^(.*)\.(\d+)\.(manifest\.xml)$", re.IGNORECASE), re.compile(r"^(.*)\.(\d+)\.(xml)$", re.IGNORECASE), re.compile(r"^SharedConfig\.xml$", re.IGNORECASE), re.compile(r"^HostingEnvironmentConfig\.xml$", re.IGNORECASE), - re.compile(r"^RemoteAccess\.xml$", re.IGNORECASE) + re.compile(r"^RemoteAccess\.xml$", re.IGNORECASE), + re.compile(r"^waagent_status\.\d+\.json$"), ] # @@ -69,6 +70,7 @@ _GOAL_STATE_FILE_NAME = "GoalState.xml" _VM_SETTINGS_FILE_NAME = "VmSettings.json" +_CERTIFICATES_FILE_NAME = "Certificates.json" _HOSTING_ENV_FILE_NAME = "HostingEnvironmentConfig.xml" _SHARED_CONF_FILE_NAME = "SharedConfig.xml" _REMOTE_ACCESS_FILE_NAME = "RemoteAccess.xml" @@ -239,6 +241,9 @@ def save_vm_settings(self, text): def save_remote_access(self, text): self.save(text, _REMOTE_ACCESS_FILE_NAME) + def save_certificates(self, text): + self.save(text, _CERTIFICATES_FILE_NAME) + def save_hosting_env(self, text): self.save(text, _HOSTING_ENV_FILE_NAME) diff --git a/azurelinuxagent/daemon/main.py b/azurelinuxagent/daemon/main.py index 91685bc641..c608768a67 100644 --- a/azurelinuxagent/daemon/main.py +++ b/azurelinuxagent/daemon/main.py @@ -64,7 +64,7 @@ def run(self, child_args=None): # # Be aware that telemetry events emitted before that will not include the Container ID. # - logger.info("{0} Version:{1}", AGENT_LONG_NAME, AGENT_VERSION) + logger.info("{0} Version: {1}", AGENT_LONG_NAME, AGENT_VERSION) logger.info("OS: {0} {1}", DISTRO_NAME, DISTRO_VERSION) logger.info("Python: {0}.{1}.{2}", PY_VERSION_MAJOR, PY_VERSION_MINOR, PY_VERSION_MICRO) diff --git a/azurelinuxagent/ga/exthandlers.py b/azurelinuxagent/ga/exthandlers.py index edbb9c3db0..b7da7b4f14 100644 --- a/azurelinuxagent/ga/exthandlers.py +++ b/azurelinuxagent/ga/exthandlers.py @@ -308,6 +308,7 @@ def run(self): error = None message = "ProcessExtensionsGoalState started [{0} channel: {1} source: {2} activity: {3} correlation {4} created: {5}]".format( egs.id, egs.channel, egs.source, egs.activity_id, egs.correlation_id, egs.created_on_timestamp) + logger.info('') logger.info(message) add_event(op=WALAEventOperation.ExtensionProcessing, message=message) @@ -319,7 +320,7 @@ def run(self): finally: duration = elapsed_milliseconds(utc_start) if error is None: - message = 'ProcessExtensionsGoalState completed [{0} {1} ms]'.format(egs.id, duration) + message = 'ProcessExtensionsGoalState completed [{0} {1} ms]\n'.format(egs.id, duration) logger.info(message) else: message = 'ProcessExtensionsGoalState failed [{0} {1} ms]\n{2}'.format(egs.id, duration, error) diff --git a/azurelinuxagent/ga/update.py b/azurelinuxagent/ga/update.py index 1bdda2fc44..d8b879ce42 100644 --- a/azurelinuxagent/ga/update.py +++ b/azurelinuxagent/ga/update.py @@ -54,7 +54,7 @@ from azurelinuxagent.common.utils.flexible_version import FlexibleVersion from azurelinuxagent.common.utils.networkutil import AddFirewallRules from azurelinuxagent.common.utils.shellutil import CommandError -from azurelinuxagent.common.version import AGENT_NAME, AGENT_DIR_PATTERN, CURRENT_AGENT, \ +from azurelinuxagent.common.version import AGENT_LONG_NAME, AGENT_NAME, AGENT_DIR_PATTERN, CURRENT_AGENT, \ CURRENT_VERSION, DISTRO_NAME, DISTRO_VERSION, get_lis_version, \ has_logrotate, PY_VERSION_MAJOR, PY_VERSION_MINOR, PY_VERSION_MICRO, get_daemon_version from azurelinuxagent.ga.collect_logs import get_collect_logs_handler, is_log_collection_allowed @@ -324,20 +324,11 @@ def run(self, debug=False): """ try: + logger.info("{0} Version: {1}", AGENT_LONG_NAME, CURRENT_AGENT) + logger.info("OS: {0} {1}", DISTRO_NAME, DISTRO_VERSION) + logger.info("Python: {0}.{1}.{2}", PY_VERSION_MAJOR, PY_VERSION_MINOR, PY_VERSION_MICRO) logger.info(u"Agent {0} is running as the goal state agent", CURRENT_AGENT) - # - # Initialize the goal state; some components depend on information provided by the goal state and this - # call ensures the required info is initialized (e.g. telemetry depends on the container ID.) - # - protocol = self.protocol_util.get_protocol() - - self._initialize_goal_state(protocol) - - # Initialize the common parameters for telemetry events - initialize_event_logger_vminfo_common_parameters(protocol) - - # Log OS-specific info. os_info_msg = u"Distro: {dist_name}-{dist_ver}; "\ u"OSUtil: {util_name}; AgentService: {service_name}; "\ u"Python: {py_major}.{py_minor}.{py_micro}; "\ @@ -351,8 +342,20 @@ def run(self, debug=False): py_micro=PY_VERSION_MICRO, systemd=systemd.is_systemd(), lis_ver=get_lis_version(), has_logrotate=has_logrotate() ) - logger.info(os_info_msg) + + # + # Initialize the goal state; some components depend on information provided by the goal state and this + # call ensures the required info is initialized (e.g. telemetry depends on the container ID.) + # + protocol = self.protocol_util.get_protocol() + + self._initialize_goal_state(protocol) + + # Initialize the common parameters for telemetry events + initialize_event_logger_vminfo_common_parameters(protocol) + + # Send telemetry for the OS-specific info. add_event(AGENT_NAME, op=WALAEventOperation.OSInfo, message=os_info_msg) # @@ -738,7 +741,7 @@ def forward_signal(self, signum, frame): return logger.info( - u"Agent {0} forwarding signal {1} to {2}", + u"Agent {0} forwarding signal {1} to {2}\n", CURRENT_AGENT, signum, self.child_agent.name if self.child_agent is not None else CURRENT_AGENT) @@ -827,6 +830,9 @@ def log_if_op_disabled(name, value): if conf.get_autoupdate_enabled(): log_if_int_changed_from_default("Autoupdate.Frequency", conf.get_autoupdate_frequency()) + if conf.get_enable_fast_track(): + log_if_op_disabled("Debug.EnableFastTrack", conf.get_enable_fast_track()) + if conf.get_lib_dir() != "/var/lib/waagent": log_event("lib dir is in an unexpected location: {0}".format(conf.get_lib_dir())) diff --git a/tests/data/hostgaplugin/ext_conf-requested_version.xml b/tests/data/hostgaplugin/ext_conf-requested_version.xml index c3bd928236..bbb8a20feb 100644 --- a/tests/data/hostgaplugin/ext_conf-requested_version.xml +++ b/tests/data/hostgaplugin/ext_conf-requested_version.xml @@ -60,7 +60,7 @@ "runtimeSettings": [ { "handlerSettings": { - "protectedSettingsCertThumbprint": "4C4F304667711036E64AF4894B76EB208A863BD4", + "protectedSettingsCertThumbprint": "4037FBF5F1F3014F99B5D6C7799E9B20E6871CB3", "protectedSettings": "MIIBsAYJKoZIhvcNAQcDoIIBoTCCAZ0CAQAxggFpMIIBZQIBADBNMDkxNzA1BgoJkiaJk/IsZAEZFidXaW5kb3dzIEF6dXJlIENSUCBDZXJ0aWZpY2F0ZSBHZW5lcmF0b3ICEFpB/HKM/7evRk+DBz754wUwDQYJKoZIhvcNAQEBBQAEggEADPJwniDeIUXzxNrZCloitFdscQ59Bz1dj9DLBREAiM8jmxM0LLicTJDUv272Qm/4ZQgdqpFYBFjGab/9MX+Ih2x47FkVY1woBkckMaC/QOFv84gbboeQCmJYZC/rZJdh8rCMS+CEPq3uH1PVrvtSdZ9uxnaJ+E4exTPPviIiLIPtqWafNlzdbBt8HZjYaVw+SSe+CGzD2pAQeNttq3Rt/6NjCzrjG8ufKwvRoqnrInMs4x6nnN5/xvobKIBSv4/726usfk8Ug+9Q6Benvfpmre2+1M5PnGTfq78cO3o6mI3cPoBUjp5M0iJjAMGeMt81tyHkimZrEZm6pLa4NQMOEjArBgkqhkiG9w0BBwEwFAYIKoZIhvcNAwcECC5nVaiJaWt+gAhgeYvxUOYHXw==", "publicSettings": {"GCS_AUTO_CONFIG":true} } @@ -73,7 +73,7 @@ "runtimeSettings": [ { "handlerSettings": { - "protectedSettingsCertThumbprint": "4C4F304667711036E64AF4894B76EB208A863BD4", + "protectedSettingsCertThumbprint": "4037FBF5F1F3014F99B5D6C7799E9B20E6871CB3", "protectedSettings": "MIIBsAYJKoZIhvcNAQcDoIIBoTCCAZ0CAQAxggFpMIIBZQIBADBNMDkxNzA1BgoJkiaJk/IsZAEZFidXaW5kb3dzIEF6dXJlIENSUCBDZXJ0aWZpY2F0ZSBHZW5lcmF0b3ICEFpB/HKM/7evRk+DBz754wUwDQYJKoZIhvcNAQEBBQAEggEADPJwniDeIUXzxNrZCloitFdscQ59Bz1dj9DLBREAiM8jmxM0LLicTJDUv272Qm/4ZQgdqpFYBFjGab/9MX+Ih2x47FkVY1woBkckMaC/QOFv84gbboeQCmJYZC/rZJdh8rCMS+CEPq3uH1PVrvtSdZ9uxnaJ+E4exTPPviIiLIPtqWafNlzdbBt8HZjYaVw+SSe+CGzD2pAQeNttq3Rt/6NjCzrjG8ufKwvRoqnrInMs4x6nnN5/xvobKIBSv4/726usfk8Ug+9Q6Benvfpmre2+1M5PnGTfq78cO3o6mI3cPoBUjp5M0iJjAMGeMt81tyHkimZrEZm6pLa4NQMOEjArBgkqhkiG9w0BBwEwFAYIKoZIhvcNAwcECC5nVaiJaWt+gAhgeYvxUOYHXw==", "publicSettings": {"enableGenevaUpload":true} } diff --git a/tests/data/hostgaplugin/ext_conf.xml b/tests/data/hostgaplugin/ext_conf.xml index eac5d63647..ebd90aa0b2 100644 --- a/tests/data/hostgaplugin/ext_conf.xml +++ b/tests/data/hostgaplugin/ext_conf.xml @@ -58,7 +58,7 @@ "runtimeSettings": [ { "handlerSettings": { - "protectedSettingsCertThumbprint": "4C4F304667711036E64AF4894B76EB208A863BD4", + "protectedSettingsCertThumbprint": "4037FBF5F1F3014F99B5D6C7799E9B20E6871CB3", "protectedSettings": "MIIBsAYJKoZIhvcNAQcDoIIBoTCCAZ0CAQAxggFpMIIBZQIBADBNMDkxNzA1BgoJkiaJk/Microsoft.Azure.Monitor.AzureMonitorLinuxAgent==", "publicSettings": {"GCS_AUTO_CONFIG":true} } @@ -71,7 +71,7 @@ "runtimeSettings": [ { "handlerSettings": { - "protectedSettingsCertThumbprint": "4C4F304667711036E64AF4894B76EB208A863BD4", + "protectedSettingsCertThumbprint": "4037FBF5F1F3014F99B5D6C7799E9B20E6871CB3", "protectedSettings": "MIIBsAYJKoZIhvcNAQcDoIIBoTCCAZ0CAQAxggFpMIIBZQIBADBNMDkxNzA1BgoJkiaJk/Microsoft.Azure.Security.Monitoring.AzureSecurityLinuxAgent==", "publicSettings": {"enableGenevaUpload":true} } diff --git a/tests/data/hostgaplugin/vm_settings-difference_in_required_features.json b/tests/data/hostgaplugin/vm_settings-difference_in_required_features.json index 9fd2e4f722..5601268706 100644 --- a/tests/data/hostgaplugin/vm_settings-difference_in_required_features.json +++ b/tests/data/hostgaplugin/vm_settings-difference_in_required_features.json @@ -56,7 +56,7 @@ "settingsSeqNo": 0, "settings": [ { - "protectedSettingsCertThumbprint": "4C4F304667711036E64AF4894B76EB208A863BD4", + "protectedSettingsCertThumbprint": "4037FBF5F1F3014F99B5D6C7799E9B20E6871CB3", "protectedSettings": "MIIBsAYJKoZIhvcNAQcDoIIBoTCCAZ0CAQAxggFpMIIBZQIBADBNMDkxNzA1BgoJkiaJk/IsZAEZFidXaW5kb3dzIEF6dXJlIENSUCBDZXJ0aWZpY2F0ZSBHZW5lcmF0b3ICEFpB/HKM/7evRk+DBz754wUwDQYJKoZIhvcNAQEBBQAEggEADPJwniDeIUXzxNrZCloitFdscQ59Bz1dj9DLBREAiM8jmxM0LLicTJDUv272Qm/4ZQgdqpFYBFjGab/9MX+Ih2x47FkVY1woBkckMaC/QOFv84gbboeQCmJYZC/rZJdh8rCMS+CEPq3uH1PVrvtSdZ9uxnaJ+E4exTPPviIiLIPtqWafNlzdbBt8HZjYaVw+SSe+CGzD2pAQeNttq3Rt/6NjCzrjG8ufKwvRoqnrInMs4x6nnN5/xvobKIBSv4/726usfk8Ug+9Q6Benvfpmre2+1M5PnGTfq78cO3o6mI3cPoBUjp5M0iJjAMGeMt81tyHkimZrEZm6pLa4NQMOEjArBgkqhkiG9w0BBwEwFAYIKoZIhvcNAwcECC5nVaiJaWt+gAhgeYvxUOYHXw==", "publicSettings": "{\"GCS_AUTO_CONFIG\":true}" } @@ -76,7 +76,7 @@ "settingsSeqNo": 0, "settings": [ { - "protectedSettingsCertThumbprint": "4C4F304667711036E64AF4894B76EB208A863BD4", + "protectedSettingsCertThumbprint": "4037FBF5F1F3014F99B5D6C7799E9B20E6871CB3", "protectedSettings": "MIIBsAYJKoZIhvcNAQcDoIIBoTCCAZ0CAQAxggFpMIIBZQIBADBNMDkxNzA1BgoJkiaJk/IsZAEZFidXaW5kb3dzIEF6dXJlIENSUCBDZXJ0aWZpY2F0ZSBHZW5lcmF0b3ICEFpB/HKM/7evRk+DBz754wUwDQYJKoZIhvcNAQEBBQAEggEADPJwniDeIUXzxNrZCloitFdscQ59Bz1dj9DLBREAiM8jmxM0LLicTJDUv272Qm/4ZQgdqpFYBFjGab/9MX+Ih2x47FkVY1woBkckMaC/QOFv84gbboeQCmJYZC/rZJdh8rCMS+CEPq3uH1PVrvtSdZ9uxnaJ+E4exTPPviIiLIPtqWafNlzdbBt8HZjYaVw+SSe+CGzD2pAQeNttq3Rt/6NjCzrjG8ufKwvRoqnrInMs4x6nnN5/xvobKIBSv4/726usfk8Ug+9Q6Benvfpmre2+1M5PnGTfq78cO3o6mI3cPoBUjp5M0iJjAMGeMt81tyHkimZrEZm6pLa4NQMOEjArBgkqhkiG9w0BBwEwFAYIKoZIhvcNAwcECC5nVaiJaWt+gAhgeYvxUOYHXw==", "publicSettings": "{\"enableGenevaUpload\":true}" } diff --git a/tests/data/hostgaplugin/vm_settings-missing_cert.json b/tests/data/hostgaplugin/vm_settings-missing_cert.json new file mode 100644 index 0000000000..a7192e942d --- /dev/null +++ b/tests/data/hostgaplugin/vm_settings-missing_cert.json @@ -0,0 +1,68 @@ +{ + "hostGAPluginVersion": "1.0.8.124", + "vmSettingsSchemaVersion": "0.0", + "activityId": "a33f6f53-43d6-4625-b322-1a39651a00c9", + "correlationId": "9a47a2a2-e740-4bfc-b11b-4f2f7cfe7d2e", + "inSvdSeqNo": 1, + "extensionsLastModifiedTickCount": 637726657706205299, + "extensionGoalStatesSource": "FastTrack", + "onHold": true, + "statusUploadBlob": { + "statusBlobType": "BlockBlob", + "value": "https://dcrcl3a0xs.blob.core.windows.net/$system/edp0plkw2b.86f4ae0a-61f8-48ae-9199-40f402d56864.status?sv=2018-03-28&sr=b&sk=system-1&sig=KNWgC2%3d&se=9999-01-01T00%3a00%3a00Z&sp=w" + }, + "inVMMetadata": { + "subscriptionId": "8e037ad4-618f-4466-8bc8-5099d41ac15b", + "resourceGroupName": "rg-dc-86fjzhp", + "vmName": "edp0plkw2b", + "location": "CentralUSEUAP", + "vmId": "86f4ae0a-61f8-48ae-9199-40f402d56864", + "vmSize": "Standard_B2s", + "osType": "Linux" + }, + "requiredFeatures": [ + { + "name": "MultipleExtensionsPerHandler" + } + ], + "gaFamilies": [ + { + "name": "Prod", + "uris": [ + "https://zrdfepirv2cdm03prdstr01a.blob.core.windows.net/7d89d439b79f4452950452399add2c90/Microsoft.OSTCLinuxAgent_Prod_uscentraleuap_manifest.xml", + "https://ardfepirv2cdm03prdstr01a.blob.core.windows.net/7d89d439b79f4452950452399add2c90/Microsoft.OSTCLinuxAgent_Prod_uscentraleuap_manifest.xml" + ] + }, + { + "name": "Test", + "uris": [ + "https://zrdfepirv2cdm03prdstr01a.blob.core.windows.net/7d89d439b79f4452950452399add2c90/Microsoft.OSTCLinuxAgent_Test_uscentraleuap_manifest.xml", + "https://ardfepirv2cdm03prdstr01a.blob.core.windows.net/7d89d439b79f4452950452399add2c90/Microsoft.OSTCLinuxAgent_Test_uscentraleuap_manifest.xml" + ] + } + ], + "extensionGoalStates": [ + { + "name": "Microsoft.OSTCExtensions.VMAccessForLinux", + "version": "1.5.11", + "location": "https://umsasc25p0kjg0c1dg4b.blob.core.windows.net/2bbece4f-0283-d415-b034-cc0adc6997a1/2bbece4f-0283-d415-b034-cc0adc6997a1_manifest.xml", + "failoverlocation": "https://umsamfwlmfshvxx2lsjm.blob.core.windows.net/2bbece4f-0283-d415-b034-cc0adc6997a1/2bbece4f-0283-d415-b034-cc0adc6997a1_manifest.xml", + "additionalLocations": [ + "https://umsah3cwjlctnmhsvzqv.blob.core.windows.net/2bbece4f-0283-d415-b034-cc0adc6997a1/2bbece4f-0283-d415-b034-cc0adc6997a1_manifest.xml" + ], + "state": "enabled", + "autoUpgrade": false, + "runAsStartupTask": false, + "isJson": true, + "useExactVersion": true, + "settingsSeqNo": 0, + "isMultiConfig": false, + "settings": [ + { + "protectedSettingsCertThumbprint": "59A10F50FFE2A0408D3F03FE336C8FD5716CF25C", + "protectedSettings": "MIIBsAYJKoZIhvcNAQcDoIIBoTCCAZ0CAQAxggFpddesZQewdDBgegkxNzA1BgoJkgergres/Microsoft.OSTCExtensions.VMAccessForLinux==" + } + ] + } + ] +} diff --git a/tests/data/hostgaplugin/vm_settings-out-of-sync.json b/tests/data/hostgaplugin/vm_settings-out-of-sync.json index 0aae82031c..1f369ae5bc 100644 --- a/tests/data/hostgaplugin/vm_settings-out-of-sync.json +++ b/tests/data/hostgaplugin/vm_settings-out-of-sync.json @@ -56,7 +56,7 @@ "settingsSeqNo": 0, "settings": [ { - "protectedSettingsCertThumbprint": "4C4F304667711036E64AF4894B76EB208A863BD4", + "protectedSettingsCertThumbprint": "4037FBF5F1F3014F99B5D6C7799E9B20E6871CB3", "protectedSettings": "MIIBsAYJKoZIhvcNAQcDoIIBoTCCAZ0CAQAxggFpMIIBZQIBADBNMDkxNzA1BgoJkiaJk/IsZAEZFidXaW5kb3dzIEF6dXJlIENSUCBDZXJ0aWZpY2F0ZSBHZW5lcmF0b3ICEFpB/HKM/7evRk+DBz754wUwDQYJKoZIhvcNAQEBBQAEggEADPJwniDeIUXzxNrZCloitFdscQ59Bz1dj9DLBREAiM8jmxM0LLicTJDUv272Qm/4ZQgdqpFYBFjGab/9MX+Ih2x47FkVY1woBkckMaC/QOFv84gbboeQCmJYZC/rZJdh8rCMS+CEPq3uH1PVrvtSdZ9uxnaJ+E4exTPPviIiLIPtqWafNlzdbBt8HZjYaVw+SSe+CGzD2pAQeNttq3Rt/6NjCzrjG8ufKwvRoqnrInMs4x6nnN5/xvobKIBSv4/726usfk8Ug+9Q6Benvfpmre2+1M5PnGTfq78cO3o6mI3cPoBUjp5M0iJjAMGeMt81tyHkimZrEZm6pLa4NQMOEjArBgkqhkiG9w0BBwEwFAYIKoZIhvcNAwcECC5nVaiJaWt+gAhgeYvxUOYHXw==", "publicSettings": "{\"GCS_AUTO_CONFIG\":true}" } diff --git a/tests/data/hostgaplugin/vm_settings-requested_version.json b/tests/data/hostgaplugin/vm_settings-requested_version.json index 49e6a27780..98959dd4ec 100644 --- a/tests/data/hostgaplugin/vm_settings-requested_version.json +++ b/tests/data/hostgaplugin/vm_settings-requested_version.json @@ -56,7 +56,7 @@ "settingsSeqNo": 0, "settings": [ { - "protectedSettingsCertThumbprint": "4C4F304667711036E64AF4894B76EB208A863BD4", + "protectedSettingsCertThumbprint": "4037FBF5F1F3014F99B5D6C7799E9B20E6871CB3", "protectedSettings": "MIIBsAYJKoZIhvcNAQcDoIIBoTCCAZ0CAQAxggFpMIIBZQIBADBNMDkxNzA1BgoJkiaJk/IsZAEZFidXaW5kb3dzIEF6dXJlIENSUCBDZXJ0aWZpY2F0ZSBHZW5lcmF0b3ICEFpB/HKM/7evRk+DBz754wUwDQYJKoZIhvcNAQEBBQAEggEADPJwniDeIUXzxNrZCloitFdscQ59Bz1dj9DLBREAiM8jmxM0LLicTJDUv272Qm/4ZQgdqpFYBFjGab/9MX+Ih2x47FkVY1woBkckMaC/QOFv84gbboeQCmJYZC/rZJdh8rCMS+CEPq3uH1PVrvtSdZ9uxnaJ+E4exTPPviIiLIPtqWafNlzdbBt8HZjYaVw+SSe+CGzD2pAQeNttq3Rt/6NjCzrjG8ufKwvRoqnrInMs4x6nnN5/xvobKIBSv4/726usfk8Ug+9Q6Benvfpmre2+1M5PnGTfq78cO3o6mI3cPoBUjp5M0iJjAMGeMt81tyHkimZrEZm6pLa4NQMOEjArBgkqhkiG9w0BBwEwFAYIKoZIhvcNAwcECC5nVaiJaWt+gAhgeYvxUOYHXw==", "publicSettings": "{\"GCS_AUTO_CONFIG\":true}" } @@ -74,7 +74,7 @@ "settingsSeqNo": 0, "settings": [ { - "protectedSettingsCertThumbprint": "4C4F304667711036E64AF4894B76EB208A863BD4", + "protectedSettingsCertThumbprint": "4037FBF5F1F3014F99B5D6C7799E9B20E6871CB3", "protectedSettings": "MIIBsAYJKoZIhvcNAQcDoIIBoTCCAZ0CAQAxggFpMIIBZQIBADBNMDkxNzA1BgoJkiaJk/IsZAEZFidXaW5kb3dzIEF6dXJlIENSUCBDZXJ0aWZpY2F0ZSBHZW5lcmF0b3ICEFpB/HKM/7evRk+DBz754wUwDQYJKoZIhvcNAQEBBQAEggEADPJwniDeIUXzxNrZCloitFdscQ59Bz1dj9DLBREAiM8jmxM0LLicTJDUv272Qm/4ZQgdqpFYBFjGab/9MX+Ih2x47FkVY1woBkckMaC/QOFv84gbboeQCmJYZC/rZJdh8rCMS+CEPq3uH1PVrvtSdZ9uxnaJ+E4exTPPviIiLIPtqWafNlzdbBt8HZjYaVw+SSe+CGzD2pAQeNttq3Rt/6NjCzrjG8ufKwvRoqnrInMs4x6nnN5/xvobKIBSv4/726usfk8Ug+9Q6Benvfpmre2+1M5PnGTfq78cO3o6mI3cPoBUjp5M0iJjAMGeMt81tyHkimZrEZm6pLa4NQMOEjArBgkqhkiG9w0BBwEwFAYIKoZIhvcNAwcECC5nVaiJaWt+gAhgeYvxUOYHXw==", "publicSettings": "{\"enableGenevaUpload\":true}" } diff --git a/tests/data/hostgaplugin/vm_settings.json b/tests/data/hostgaplugin/vm_settings.json index 96836e7663..a4ef0f785f 100644 --- a/tests/data/hostgaplugin/vm_settings.json +++ b/tests/data/hostgaplugin/vm_settings.json @@ -56,7 +56,7 @@ "settingsSeqNo": 0, "settings": [ { - "protectedSettingsCertThumbprint": "4C4F304667711036E64AF4894B76EB208A863BD4", + "protectedSettingsCertThumbprint": "4037FBF5F1F3014F99B5D6C7799E9B20E6871CB3", "protectedSettings": "MIIBsAYJKoZIhvcNAQcDoIIBoTCCAZ0CAQAxggFpMIIBZQIBADBNMDkxNzA1BgoJkiaJk/Microsoft.Azure.Monitor.AzureMonitorLinuxAgent==", "publicSettings": "{\"GCS_AUTO_CONFIG\":true}" } @@ -76,7 +76,7 @@ "settingsSeqNo": 0, "settings": [ { - "protectedSettingsCertThumbprint": "4C4F304667711036E64AF4894B76EB208A863BD4", + "protectedSettingsCertThumbprint": "4037FBF5F1F3014F99B5D6C7799E9B20E6871CB3", "protectedSettings": "MIIBsAYJKoZIhvcNAQcDoIIBoTCCAZ0CAQAxggFpMIIBZQIBADBNMDkxNzA1BgoJkiaJk/Microsoft.Azure.Security.Monitoring.AzureSecurityLinuxAgent==", "publicSettings": "{\"enableGenevaUpload\":true}" } @@ -192,7 +192,7 @@ "isMultiConfig": false, "settings": [ { - "protectedSettingsCertThumbprint": "59A10F50FFE2A0408D3F03FE336C8FD5716CF25C", + "protectedSettingsCertThumbprint": "4037FBF5F1F3014F99B5D6C7799E9B20E6871CB3", "protectedSettings": "MIIBsAYJKoZIhvcNAQcDoIIBoTCCAZ0CAQAxggFpddesZQewdDBgegkxNzA1BgoJkgergres/Microsoft.OSTCExtensions.VMAccessForLinux==" } ] diff --git a/tests/protocol/test_goal_state.py b/tests/protocol/test_goal_state.py index 65d050d862..8cfa4842a8 100644 --- a/tests/protocol/test_goal_state.py +++ b/tests/protocol/test_goal_state.py @@ -8,6 +8,7 @@ import re import time +from azurelinuxagent.common.event import WALAEventOperation from azurelinuxagent.common.future import httpclient from azurelinuxagent.common.protocol.extensions_goal_state import GoalStateSource, GoalStateChannel from azurelinuxagent.common.protocol.extensions_goal_state_from_extensions_config import ExtensionsGoalStateFromExtensionsConfig @@ -105,7 +106,7 @@ def test_instantiating_goal_state_should_save_the_goal_state_to_the_history_dire self._assert_directory_contents( self._find_history_subdirectory("999-888"), - ["GoalState.xml", "ExtensionsConfig.xml", "VmSettings.json", "SharedConfig.xml", "HostingEnvironmentConfig.xml"]) + ["GoalState.xml", "ExtensionsConfig.xml", "VmSettings.json", "Certificates.json", "SharedConfig.xml", "HostingEnvironmentConfig.xml"]) def _find_history_subdirectory(self, tag): matches = glob.glob(os.path.join(self.tmp_dir, ARCHIVE_DIRECTORY_NAME, "*_{0}".format(tag))) @@ -128,7 +129,7 @@ def test_update_should_create_new_history_subdirectories(self): goal_state = GoalState(protocol.client) self._assert_directory_contents( self._find_history_subdirectory("123-654"), - ["GoalState.xml", "ExtensionsConfig.xml", "VmSettings.json", "SharedConfig.xml", "HostingEnvironmentConfig.xml"]) + ["GoalState.xml", "ExtensionsConfig.xml", "VmSettings.json", "Certificates.json", "SharedConfig.xml", "HostingEnvironmentConfig.xml"]) def http_get_handler(url, *_, **__): if HttpRequestPredicates.is_host_plugin_vm_settings_request(url): @@ -140,7 +141,7 @@ def http_get_handler(url, *_, **__): goal_state.update() self._assert_directory_contents( self._find_history_subdirectory("234-654"), - ["GoalState.xml", "ExtensionsConfig.xml", "SharedConfig.xml", "HostingEnvironmentConfig.xml"]) + ["GoalState.xml", "ExtensionsConfig.xml", "Certificates.json", "SharedConfig.xml", "HostingEnvironmentConfig.xml"]) protocol.mock_wire_data.set_etag(987) protocol.set_http_handlers(http_get_handler=None) @@ -358,3 +359,18 @@ def http_get_handler(url, *_, **__): self._assert_goal_state(goal_state, initial_incarnation, channel=GoalStateChannel.WireServer, source=GoalStateSource.Fabric) self.assertEqual(initial_timestamp, goal_state.extensions_goal_state.created_on_timestamp, "The timestamp of the updated goal state is incorrect") self.assertTrue(goal_state.extensions_goal_state.is_outdated, "The updated goal state should be marked as outdated") + + def test_it_should_report_missing_certificates(self): + data_file = mockwiredata.DATA_FILE_VM_SETTINGS.copy() + data_file["vm_settings"] = "hostgaplugin/vm_settings-missing_cert.json" + + with mock_wire_protocol(data_file) as protocol: + with patch("azurelinuxagent.common.protocol.goal_state.add_event") as add_event: + _ = GoalState(protocol.client) + + expected_message = "Certificate 59A10F50FFE2A0408D3F03FE336C8FD5716CF25C needed by Microsoft.OSTCExtensions.VMAccessForLinux is missing from the goal state" + events = [kwargs for _, kwargs in add_event.call_args_list if kwargs['op'] == WALAEventOperation.VmSettings and kwargs['message'] == expected_message] + + self.assertTrue( + len(events) == 1, + "Missing certificate 59A10F50FFE2A0408D3F03FE336C8FD5716CF25C was note reported. Telemetry: {0}".format([kwargs['message'] for _, kwargs in add_event.call_args_list])) diff --git a/tests/utils/test_archive.py b/tests/utils/test_archive.py index 466d674c79..61214558c5 100644 --- a/tests/utils/test_archive.py +++ b/tests/utils/test_archive.py @@ -135,7 +135,8 @@ def test_purge_legacy_goal_state_history(self): 'Microsoft.Azure.Extensions.CustomScript.1.xml', 'SharedConfig.xml', 'HostingEnvironmentConfig.xml', - 'RemoteAccess.xml' + 'RemoteAccess.xml', + 'waagent_status.1.json' ] legacy_files = [os.path.join(self.tmp_dir, f) for f in legacy_files] for f in legacy_files: From 03df04a07c64439b51a20626471fc0ceabecc6a5 Mon Sep 17 00:00:00 2001 From: Norberto Arrieta Date: Tue, 26 Apr 2022 12:57:14 -0700 Subject: [PATCH 03/13] Change format of history items (#2560) (#2565) * Change format of history directory * Update message; fix typo * py2 compat * py2 compat Co-authored-by: narrieta (cherry picked from commit 93a2564fd6509a93ddab1417507a61f40ba56424) --- azurelinuxagent/common/protocol/goal_state.py | 9 +++++---- azurelinuxagent/common/utils/archive.py | 19 +++++++++++-------- azurelinuxagent/common/utils/timeutil.py | 11 ++++++++++- azurelinuxagent/ga/update.py | 5 ++--- tests/protocol/test_goal_state.py | 2 +- tests/utils/test_archive.py | 10 ++++++++-- 6 files changed, 37 insertions(+), 19 deletions(-) diff --git a/azurelinuxagent/common/protocol/goal_state.py b/azurelinuxagent/common/protocol/goal_state.py index 6ec0a5ab64..da0de9a032 100644 --- a/azurelinuxagent/common/protocol/goal_state.py +++ b/azurelinuxagent/common/protocol/goal_state.py @@ -15,6 +15,7 @@ # limitations under the License. # # Requires Python 2.6+ and Openssl 1.0+ +import datetime import os import re import time @@ -31,7 +32,7 @@ from azurelinuxagent.common.protocol.extensions_goal_state import VmSettingsParseError, GoalStateSource from azurelinuxagent.common.protocol.hostplugin import VmSettingsNotSupported, VmSettingsSupportStopped from azurelinuxagent.common.protocol.restapi import Cert, CertList, RemoteAccessUser, RemoteAccessUsersList -from azurelinuxagent.common.utils import fileutil, timeutil +from azurelinuxagent.common.utils import fileutil from azurelinuxagent.common.utils.archive import GoalStateHistory from azurelinuxagent.common.utils.cryptutil import CryptUtil from azurelinuxagent.common.utils.textutil import parse_doc, findall, find, findtext, getattrib @@ -131,7 +132,7 @@ def update(self): # # Fetch the goal state from both the HGAP and the WireServer # - timestamp = timeutil.create_timestamp() + timestamp = datetime.datetime.utcnow() incarnation, xml_text, xml_doc = GoalState._fetch_goal_state(self._wire_client) goal_state_updated = incarnation != self._incarnation @@ -199,7 +200,7 @@ def update(self): def _restore_wire_server_goal_state(self, incarnation, xml_text, xml_doc, vm_settings_support_stopped_error): logger.info('The HGAP stopped supporting vmSettings; will fetched the goal state from the WireServer.') - self._history = GoalStateHistory(timeutil.create_timestamp(), incarnation) + self._history = GoalStateHistory(datetime.datetime.utcnow(), incarnation) self._history.save_goal_state(xml_text) self._extensions_goal_state = self._fetch_full_wire_server_goal_state(incarnation, xml_doc) if self._extensions_goal_state.created_on_timestamp < vm_settings_support_stopped_error.timestamp: @@ -270,7 +271,7 @@ def _fetch_vm_settings(wire_client): except VmSettingsParseError as exception: # ensure we save the vmSettings if there were parsing errors, but save them only once per ETag if not GoalStateHistory.tag_exists(exception.etag): - GoalStateHistory(timeutil.create_timestamp(), exception.etag).save_vm_settings(exception.vm_settings_text) + GoalStateHistory(datetime.datetime.utcnow(), exception.etag).save_vm_settings(exception.vm_settings_text) raise return vm_settings, vm_settings_updated diff --git a/azurelinuxagent/common/utils/archive.py b/azurelinuxagent/common/utils/archive.py index 880a23a119..6123fdb0d2 100644 --- a/azurelinuxagent/common/utils/archive.py +++ b/azurelinuxagent/common/utils/archive.py @@ -9,7 +9,7 @@ import azurelinuxagent.common.logger as logger import azurelinuxagent.common.conf as conf -from azurelinuxagent.common.utils import fileutil +from azurelinuxagent.common.utils import fileutil, timeutil # pylint: disable=W0105 @@ -58,13 +58,15 @@ # 2018-04-06T08:21:37.142697.zip # 2018-04-06T08:21:37.142697_incarnation_N # 2018-04-06T08:21:37.142697_incarnation_N.zip +# 2018-04-06T08:21:37.142697_N-M +# 2018-04-06T08:21:37.142697_N-M.zip # # Current names # -# 2018-04-06T08:21:37.142697_N-M -# 2018-04-06T08:21:37.142697_N-M.zip +# 2018-04-06T08-21-37__N-M +# 2018-04-06T08-21-37__N-M.zip # -_ARCHIVE_BASE_PATTERN = r"\d{4}\-\d{2}\-\d{2}T\d{2}:\d{2}:\d{2}\.\d+((_incarnation)?_(\d+|status)(-\d+)?)?" +_ARCHIVE_BASE_PATTERN = r"\d{4}\-\d{2}\-\d{2}T\d{2}[:-]\d{2}[:-]\d{2}(\.\d+)?((_incarnation)?_+(\d+|status)(-\d+)?)?" _ARCHIVE_PATTERNS_DIRECTORY = re.compile(r'^{0}$'.format(_ARCHIVE_BASE_PATTERN)) _ARCHIVE_PATTERNS_ZIP = re.compile(r'^{0}\.zip$'.format(_ARCHIVE_BASE_PATTERN)) @@ -163,7 +165,6 @@ def purge(self): newest ones. Also, clean up any legacy history files. """ states = self._get_archive_states() - states.sort(reverse=True) for state in states[_MAX_ARCHIVED_STATES:]: state.delete() @@ -184,7 +185,6 @@ def purge_legacy_goal_state_history(): def archive(self): states = self._get_archive_states() - states.sort(reverse=True) if len(states) > 0: # Skip the most recent goal state, since it may still be in use @@ -203,13 +203,16 @@ def _get_archive_states(self): if match is not None: states.append(StateZip(full_path, match.group(0))) + states.sort(key=lambda state: os.path.getctime(state._path), reverse=True) + return states class GoalStateHistory(object): - def __init__(self, timestamp, tag): + def __init__(self, time, tag): self._errors = False - self._root = os.path.join(conf.get_lib_dir(), ARCHIVE_DIRECTORY_NAME, "{0}_{1}".format(timestamp, tag) if tag is not None else timestamp) + timestamp = timeutil.create_history_timestamp(time) + self._root = os.path.join(conf.get_lib_dir(), ARCHIVE_DIRECTORY_NAME, "{0}__{1}".format(timestamp, tag) if tag is not None else timestamp) @staticmethod def tag_exists(tag): diff --git a/azurelinuxagent/common/utils/timeutil.py b/azurelinuxagent/common/utils/timeutil.py index c4dd755a0c..c8fa37647c 100644 --- a/azurelinuxagent/common/utils/timeutil.py +++ b/azurelinuxagent/common/utils/timeutil.py @@ -5,7 +5,7 @@ def create_timestamp(dt=None): """ - Returns a string with the given datetime iso format. If no datetime is given as parameter, it + Returns a string with the given datetime in iso format. If no datetime is given as parameter, it uses datetime.utcnow(). """ if dt is None: @@ -13,6 +13,15 @@ def create_timestamp(dt=None): return dt.isoformat() +def create_history_timestamp(dt=None): + """ + Returns a string with the given datetime formatted as a timestamp for the agent's history folder + """ + if dt is None: + dt = datetime.datetime.utcnow() + return dt.strftime('%Y-%m-%dT%H-%M-%S') + + def datetime_to_ticks(dt): """ Converts 'dt', a datetime, to the number of ticks (1 tick == 1/10000000 sec) since datetime.min (0001-01-01 00:00:00). diff --git a/azurelinuxagent/ga/update.py b/azurelinuxagent/ga/update.py index d8b879ce42..1a453c8c05 100644 --- a/azurelinuxagent/ga/update.py +++ b/azurelinuxagent/ga/update.py @@ -54,7 +54,7 @@ from azurelinuxagent.common.utils.flexible_version import FlexibleVersion from azurelinuxagent.common.utils.networkutil import AddFirewallRules from azurelinuxagent.common.utils.shellutil import CommandError -from azurelinuxagent.common.version import AGENT_LONG_NAME, AGENT_NAME, AGENT_DIR_PATTERN, CURRENT_AGENT, \ +from azurelinuxagent.common.version import AGENT_LONG_NAME, AGENT_NAME, AGENT_DIR_PATTERN, CURRENT_AGENT, AGENT_VERSION, \ CURRENT_VERSION, DISTRO_NAME, DISTRO_VERSION, get_lis_version, \ has_logrotate, PY_VERSION_MAJOR, PY_VERSION_MINOR, PY_VERSION_MICRO, get_daemon_version from azurelinuxagent.ga.collect_logs import get_collect_logs_handler, is_log_collection_allowed @@ -324,10 +324,9 @@ def run(self, debug=False): """ try: - logger.info("{0} Version: {1}", AGENT_LONG_NAME, CURRENT_AGENT) + logger.info("{0} (Goal State Agent version {1})", AGENT_LONG_NAME, AGENT_VERSION) logger.info("OS: {0} {1}", DISTRO_NAME, DISTRO_VERSION) logger.info("Python: {0}.{1}.{2}", PY_VERSION_MAJOR, PY_VERSION_MINOR, PY_VERSION_MICRO) - logger.info(u"Agent {0} is running as the goal state agent", CURRENT_AGENT) os_info_msg = u"Distro: {dist_name}-{dist_ver}; "\ u"OSUtil: {util_name}; AgentService: {service_name}; "\ diff --git a/tests/protocol/test_goal_state.py b/tests/protocol/test_goal_state.py index 8cfa4842a8..c54a65f9f7 100644 --- a/tests/protocol/test_goal_state.py +++ b/tests/protocol/test_goal_state.py @@ -373,4 +373,4 @@ def test_it_should_report_missing_certificates(self): self.assertTrue( len(events) == 1, - "Missing certificate 59A10F50FFE2A0408D3F03FE336C8FD5716CF25C was note reported. Telemetry: {0}".format([kwargs['message'] for _, kwargs in add_event.call_args_list])) + "Missing certificate 59A10F50FFE2A0408D3F03FE336C8FD5716CF25C was not reported. Telemetry: {0}".format([kwargs['message'] for _, kwargs in add_event.call_args_list])) diff --git a/tests/utils/test_archive.py b/tests/utils/test_archive.py index 61214558c5..0c649c9e24 100644 --- a/tests/utils/test_archive.py +++ b/tests/utils/test_archive.py @@ -67,7 +67,10 @@ def test_archive_should_zip_all_but_the_latest_goal_state_in_the_history_folder( test_directories.append(directory) test_subject = StateArchiver(self.tmp_dir) - test_subject.archive() + # NOTE: StateArchiver sorts the state directories by creation time, but the test files are created too fast and the + # time resolution is too coarse, so instead we mock getctime to simply return the path of the file + with patch("azurelinuxagent.common.utils.archive.os.path.getctime", side_effect=lambda path: path): + test_subject.archive() for directory in test_directories[0:2]: zip_file = directory + ".zip" @@ -110,7 +113,10 @@ def test_archive02(self): self.assertEqual(total, len(os.listdir(self.history_dir))) test_subject = StateArchiver(self.tmp_dir) - test_subject.purge() + # NOTE: StateArchiver sorts the state directories by creation time, but the test files are created too fast and the + # time resolution is too coarse, so instead we mock getctime to simply return the path of the file + with patch("azurelinuxagent.common.utils.archive.os.path.getctime", side_effect=lambda path: path): + test_subject.purge() archived_entries = os.listdir(self.history_dir) self.assertEqual(_MAX_ARCHIVED_STATES, len(archived_entries)) From 1b3123b023a42c7d1b9fbe65a09a79bb15368655 Mon Sep 17 00:00:00 2001 From: Norberto Arrieta Date: Tue, 26 Apr 2022 16:18:21 -0700 Subject: [PATCH 04/13] Update log messages (#2567) Co-authored-by: narrieta --- dcr/scenario_utils/check_waagent_log.py | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/dcr/scenario_utils/check_waagent_log.py b/dcr/scenario_utils/check_waagent_log.py index bff1a3b295..cfc6668bcd 100644 --- a/dcr/scenario_utils/check_waagent_log.py +++ b/dcr/scenario_utils/check_waagent_log.py @@ -96,23 +96,23 @@ def check_waagent_log_for_errors(waagent_log=AGENT_LOG_FILE, ignore=None): 'if': lambda log_line: log_line.level == "WARNING" and log_line.who == "Daemon" }, # - # 2022-02-09T04:50:37.384810Z WARNING ExtHandler ExtHandler An error occurred while retrieving the goal state: [ProtocolError] GET vmSettings [correlation ID: 2bed9b62-188e-4668-b1a8-87c35cfa4927 eTag: 7031887032544600793]: [Internal error in HostGAPlugin] [HTTP Failed] [502: Bad Gateway] b'{ "errorCode": "VMArtifactsProfileBlobContentNotFound", "message": "VM artifacts profile blob has no content in it.", "details": ""}' + # 2022-02-09T04:50:37.384810Z ERROR ExtHandler ExtHandler Error fetching the goal state: [ProtocolError] GET vmSettings [correlation ID: 2bed9b62-188e-4668-b1a8-87c35cfa4927 eTag: 7031887032544600793]: [Internal error in HostGAPlugin] [HTTP Failed] [502: Bad Gateway] b'{ "errorCode": "VMArtifactsProfileBlobContentNotFound", "message": "VM artifacts profile blob has no content in it.", "details": ""}' # # Fetching the goal state may catch the HostGAPlugin in the process of computing the vmSettings. This can be ignored, if the issue persist the log would include additional instances. # { 'message': r"\[ProtocolError\] GET vmSettings.*VMArtifactsProfileBlobContentNotFound", - 'if': lambda log_line: log_line.level == "WARNING" + 'if': lambda log_line: log_line.level == "ERROR" }, # - # 2021-12-29T06:50:49.904601Z WARNING ExtHandler ExtHandler An error occurred while retrieving the goal state: [ProtocolError] Error fetching goal state: [ProtocolError] Error fetching goal state Inner error: [ResourceGoneError] [HTTP Failed] [410: Gone] The page you requested was removed. - # 2022-03-21T02:44:03.770017Z WARNING ExtHandler ExtHandler An error occurred while retrieving the goal state: [ProtocolError] Error fetching goal state Inner error: [ResourceGoneError] Resource is gone + # 2021-12-29T06:50:49.904601Z ERROR ExtHandler ExtHandler Error fetching the goal state: [ProtocolError] Error fetching goal state Inner error: [ResourceGoneError] [HTTP Failed] [410: Gone] The page you requested was removed. + # 2022-03-21T02:44:03.770017Z ERROR ExtHandler ExtHandler Error fetching the goal state: [ProtocolError] Error fetching goal state Inner error: [ResourceGoneError] Resource is gone # 2022-02-16T04:46:50.477315Z WARNING Daemon Daemon Fetching the goal state failed: [ResourceGoneError] [HTTP Failed] [410: Gone] b'\n\n ResourceNotAvailable\n The resource requested is no longer available. Please refresh your cache.\n
\n
' # # ResourceGone can happen if we are fetching one of the URIs in the goal state and a new goal state arrives { - 'message': r"(?s)(An error occurred while retrieving the goal state|Fetching the goal state failed|Error fetching goal state).*(\[ResourceGoneError\]|\[410: Gone\]|Resource is gone)", - 'if': lambda log_line: log_line.level == "WARNING" + 'message': r"(?s)(Fetching the goal state failed|Error fetching goal state|Error fetching the goal state).*(\[ResourceGoneError\]|\[410: Gone\]|Resource is gone)", + 'if': lambda log_line: log_line.level in ("WARNING", "ERROR") }, # # 2022-03-08T03:03:23.036161Z WARNING ExtHandler ExtHandler Fetch failed from [http://168.63.129.16:32526/extensionArtifact]: [HTTP Failed] [400: Bad Request] b'' From a9213e449799ce1178285bb08b36343e827a8a67 Mon Sep 17 00:00:00 2001 From: Norberto Arrieta Date: Wed, 27 Apr 2022 17:00:20 -0700 Subject: [PATCH 05/13] Refresh goal state when certificates are missing (#2562) (#2566) * Refresh goal state when certificates are missing * Improve error reporting * Fix assert message Co-authored-by: narrieta (cherry picked from commit cd03ff2c22c572fcafb74d645ed5a437a2c3bf5d) --- azurelinuxagent/common/logger.py | 4 + azurelinuxagent/common/protocol/goal_state.py | 100 ++++++++++++------ azurelinuxagent/common/protocol/hostplugin.py | 4 +- azurelinuxagent/common/protocol/wire.py | 12 +-- azurelinuxagent/ga/update.py | 52 +++++---- tests/data/wire/certs-2.xml | 85 +++++++++++++++ tests/data/wire/goal_state.xml | 12 +-- tests/data/wire/goal_state_no_ext.xml | 10 +- tests/data/wire/goal_state_remote_access.xml | 13 +-- tests/ga/test_update.py | 50 ++++----- tests/protocol/HttpRequestPredicates.py | 16 +++ tests/protocol/mockwiredata.py | 28 ++--- tests/protocol/test_goal_state.py | 56 ++++++++-- tests/protocol/test_hostplugin.py | 12 +-- tests/protocol/test_wire.py | 2 +- 15 files changed, 325 insertions(+), 131 deletions(-) create mode 100644 tests/data/wire/certs-2.xml diff --git a/azurelinuxagent/common/logger.py b/azurelinuxagent/common/logger.py index 07e3f23936..3d0dc617d3 100644 --- a/azurelinuxagent/common/logger.py +++ b/azurelinuxagent/common/logger.py @@ -45,6 +45,7 @@ def __init__(self, logger=None, prefix=None): self.logger = self if logger is None else logger self.periodic_messages = {} self.prefix = prefix + self.silent = False def reset_periodic(self): self.logger.periodic_messages = {} @@ -124,6 +125,9 @@ def write_log(log_appender): # pylint: disable=W0612 finally: log_appender.appender_lock = False + if self.silent: + return + # if msg_format is not unicode convert it to unicode if type(msg_format) is not ustr: msg_format = ustr(msg_format, errors="backslashreplace") diff --git a/azurelinuxagent/common/protocol/goal_state.py b/azurelinuxagent/common/protocol/goal_state.py index da0de9a032..edfd9d14f7 100644 --- a/azurelinuxagent/common/protocol/goal_state.py +++ b/azurelinuxagent/common/protocol/goal_state.py @@ -48,8 +48,16 @@ _GET_GOAL_STATE_MAX_ATTEMPTS = 6 +class GoalStateInconsistentError(ProtocolError): + """ + Indicates an inconsistency in the goal state (e.g. missing tenant certificate) + """ + def __init__(self, msg, inner=None): + super(GoalStateInconsistentError, self).__init__(msg, inner) + + class GoalState(object): - def __init__(self, wire_client): + def __init__(self, wire_client, silent=False): """ Fetches the goal state using the given wire client. @@ -64,6 +72,8 @@ def __init__(self, wire_client): self._wire_client = wire_client self._history = None self._extensions_goal_state = None # populated from vmSettings or extensionsConfig + self.logger = logger.Logger(logger.DEFAULT_LOGGER) + self.logger.silent = silent # These properties hold the goal state from the WireServer and are initialized by self._fetch_full_wire_server_goal_state() self._incarnation = None @@ -75,8 +85,10 @@ def __init__(self, wire_client): self._certs = None self._remote_access = None - self.update() + self.update(silent=silent) + except ProtocolError: + raise except Exception as exception: # We don't log the error here since fetching the goal state is done every few seconds raise ProtocolError(msg="Error fetching goal state", inner=exception) @@ -125,34 +137,47 @@ def update_host_plugin_headers(wire_client): # Fetching the goal state updates the HostGAPlugin so simply trigger the request GoalState._fetch_goal_state(wire_client) - def update(self): + def update(self, silent=False): """ Updates the current GoalState instance fetching values from the WireServer/HostGAPlugin as needed """ + self.logger.silent = silent + + try: + self._update(force_update=False) + except GoalStateInconsistentError as e: + self.logger.warn("Detected an inconsistency in the goal state: {0}", ustr(e)) + self._update(force_update=True) + self.logger.info("The goal state is consistent") + + def _update(self, force_update): # # Fetch the goal state from both the HGAP and the WireServer # timestamp = datetime.datetime.utcnow() + if force_update: + self.logger.info("Refreshing goal state and vmSettings") + incarnation, xml_text, xml_doc = GoalState._fetch_goal_state(self._wire_client) - goal_state_updated = incarnation != self._incarnation + goal_state_updated = force_update or incarnation != self._incarnation if goal_state_updated: - logger.info('Fetched a new incarnation for the WireServer goal state [incarnation {0}]', incarnation) + self.logger.info('Fetched a new incarnation for the WireServer goal state [incarnation {0}]', incarnation) vm_settings, vm_settings_updated = None, False try: - vm_settings, vm_settings_updated = GoalState._fetch_vm_settings(self._wire_client) + vm_settings, vm_settings_updated = GoalState._fetch_vm_settings(self._wire_client, force_update=force_update) except VmSettingsSupportStopped as exception: # If the HGAP stopped supporting vmSettings, we need to use the goal state from the WireServer self._restore_wire_server_goal_state(incarnation, xml_text, xml_doc, exception) return if vm_settings_updated: - logger.info('') - logger.info("Fetched new vmSettings [HostGAPlugin correlation ID: {0} eTag: {1} source: {2}]", vm_settings.hostga_plugin_correlation_id, vm_settings.etag, vm_settings.source) + self.logger.info('') + self.logger.info("Fetched new vmSettings [HostGAPlugin correlation ID: {0} eTag: {1} source: {2}]", vm_settings.hostga_plugin_correlation_id, vm_settings.etag, vm_settings.source) # Ignore the vmSettings if their source is Fabric (processing a Fabric goal state may require the tenant certificate and the vmSettings don't include it.) if vm_settings is not None and vm_settings.source == GoalStateSource.Fabric: if vm_settings_updated: - logger.info("The vmSettings originated via Fabric; will ignore them.") + self.logger.info("The vmSettings originated via Fabric; will ignore them.") vm_settings, vm_settings_updated = None, False # If neither goal state has changed we are done with the update @@ -187,19 +212,30 @@ def update(self): if self._extensions_goal_state is None or most_recent.created_on_timestamp > self._extensions_goal_state.created_on_timestamp: self._extensions_goal_state = most_recent - # For Fast Track goal states, verify that the required certificates are in the goal state + # + # For Fast Track goal states, verify that the required certificates are in the goal state. + # + # Some scenarios can produce inconsistent goal states. For example, during hibernation/resume, the Fabric goal state changes (the + # tenant certificate is re-generated when the VM is restarted) *without* the incarnation necessarily changing (e.g. if the incarnation + # is 1 before the hibernation; on resume the incarnation is set to 1 even though the goal state has a new certificate). If a Fast + # Track goal state comes after that, the extensions will need the new certificate. The Agent needs to refresh the goal state in that + # case, to ensure it fetches the new certificate. + # if self.extensions_goal_state.source == GoalStateSource.FastTrack: - for extension in self.extensions_goal_state.extensions: - for settings in extension.settings: - if settings.protectedSettings is None: - continue - certificates = self.certs.summary - if not any(settings.certificateThumbprint == c['thumbprint'] for c in certificates): - message = "Certificate {0} needed by {1} is missing from the goal state".format(settings.certificateThumbprint, extension.name) - add_event(op=WALAEventOperation.VmSettings, message=message, is_success=False) + self._check_certificates() + + def _check_certificates(self): + for extension in self.extensions_goal_state.extensions: + for settings in extension.settings: + if settings.protectedSettings is None: + continue + certificates = self.certs.summary + if not any(settings.certificateThumbprint == c['thumbprint'] for c in certificates): + message = "Certificate {0} needed by {1} is missing from the goal state".format(settings.certificateThumbprint, extension.name) + raise GoalStateInconsistentError(message) def _restore_wire_server_goal_state(self, incarnation, xml_text, xml_doc, vm_settings_support_stopped_error): - logger.info('The HGAP stopped supporting vmSettings; will fetched the goal state from the WireServer.') + self.logger.info('The HGAP stopped supporting vmSettings; will fetched the goal state from the WireServer.') self._history = GoalStateHistory(datetime.datetime.utcnow(), incarnation) self._history.save_goal_state(xml_text) self._extensions_goal_state = self._fetch_full_wire_server_goal_state(incarnation, xml_doc) @@ -207,7 +243,7 @@ def _restore_wire_server_goal_state(self, incarnation, xml_text, xml_doc, vm_set self._extensions_goal_state.is_outdated = True msg = "Fetched a Fabric goal state older than the most recent FastTrack goal state; will skip it.\nFabric: {0}\nFastTrack: {1}".format( self._extensions_goal_state.created_on_timestamp, vm_settings_support_stopped_error.timestamp) - logger.info(msg) + self.logger.info(msg) add_event(op=WALAEventOperation.VmSettings, message=msg, is_success=True) def save_to_history(self, data, file_name): @@ -249,7 +285,7 @@ def _fetch_goal_state(wire_client): return incarnation, xml_text, xml_doc @staticmethod - def _fetch_vm_settings(wire_client): + def _fetch_vm_settings(wire_client, force_update=False): """ Issues an HTTP request (HostGAPlugin) for the vm settings and returns the response as an ExtensionsGoalState. """ @@ -258,11 +294,11 @@ def _fetch_vm_settings(wire_client): if conf.get_enable_fast_track(): try: try: - vm_settings, vm_settings_updated = wire_client.get_host_plugin().fetch_vm_settings() + vm_settings, vm_settings_updated = wire_client.get_host_plugin().fetch_vm_settings(force_update=force_update) except ResourceGoneError: # retry after refreshing the HostGAPlugin GoalState.update_host_plugin_headers(wire_client) - vm_settings, vm_settings_updated = wire_client.get_host_plugin().fetch_vm_settings() + vm_settings, vm_settings_updated = wire_client.get_host_plugin().fetch_vm_settings(force_update=force_update) except VmSettingsSupportStopped: raise @@ -284,8 +320,8 @@ def _fetch_full_wire_server_goal_state(self, incarnation, xml_doc): Returns the value of ExtensionsConfig. """ try: - logger.info('') - logger.info('Fetching full goal state from the WireServer [incarnation {0}]', incarnation) + self.logger.info('') + self.logger.info('Fetching full goal state from the WireServer [incarnation {0}]', incarnation) role_instance = find(xml_doc, "RoleInstance") role_instance_id = findtext(role_instance, "InstanceId") @@ -317,7 +353,11 @@ def _fetch_full_wire_server_goal_state(self, incarnation, xml_doc): if certs_uri is not None: xml_text = self._wire_client.fetch_config(certs_uri, self._wire_client.get_header_for_cert()) certs = Certificates(xml_text) - # Save the certificate summary, which includes only the thumbprint but not the certificate itself, to the goal state history + # Log and save the certificates summary (i.e. the thumbprint but not the certificate itself) to the goal state history + for c in certs.summary: + logger.info("Downloaded certificate {0}".format(c)) + if len(certs.warnings) > 0: + logger.warn(certs.warnings) self._history.save_certificates(json.dumps(certs.summary)) remote_access = None @@ -339,10 +379,10 @@ def _fetch_full_wire_server_goal_state(self, incarnation, xml_doc): return extensions_config except Exception as exception: - logger.warn("Fetching the goal state failed: {0}", ustr(exception)) + self.logger.warn("Fetching the goal state failed: {0}", ustr(exception)) raise ProtocolError(msg="Error fetching goal state", inner=exception) finally: - logger.info('Fetch goal state completed') + self.logger.info('Fetch goal state completed') class HostingEnv(object): @@ -366,6 +406,7 @@ class Certificates(object): def __init__(self, xml_text): self.cert_list = CertList() self.summary = [] # debugging info + self.warnings = [] # Save the certificates local_file = os.path.join(conf.get_lib_dir(), CERTS_FILE_NAME) @@ -448,11 +489,10 @@ def __init__(self, xml_text): else: # Since private key has *no* matching certificate, # it will not be named correctly - logger.warn("Found NO matching cert/thumbprint for private key!") + self.warnings.append("Found NO matching cert/thumbprint for private key!") for pubkey, thumbprint in thumbprints.items(): has_private_key = pubkey in prvs - logger.info("Downloaded certificate with thumbprint {0} (has private key: {1})".format(thumbprint, has_private_key)) self.summary.append({"thumbprint": thumbprint, "hasPrivateKey": has_private_key}) for v1_cert in v1_cert_list: diff --git a/azurelinuxagent/common/protocol/hostplugin.py b/azurelinuxagent/common/protocol/hostplugin.py index 81b9062566..5f795dca3e 100644 --- a/azurelinuxagent/common/protocol/hostplugin.py +++ b/azurelinuxagent/common/protocol/hostplugin.py @@ -453,7 +453,7 @@ def get_fast_track_timestamp(): HostPluginProtocol._get_fast_track_state_file(), ustr(e)) return timeutil.create_timestamp(datetime.datetime.utcnow()) - def fetch_vm_settings(self): + def fetch_vm_settings(self, force_update=False): """ Queries the vmSettings from the HostGAPlugin and returns an (ExtensionsGoalState, bool) tuple with the vmSettings and a boolean indicating if they are an updated (True) or a cached value (False). @@ -491,7 +491,7 @@ def format_message(msg): # Raise VmSettingsNotSupported directly instead of using raise_not_supported() to avoid resetting the timestamp for the next check raise VmSettingsNotSupported() - etag = None if self._cached_vm_settings is None else self._cached_vm_settings.etag + etag = None if force_update or self._cached_vm_settings is None else self._cached_vm_settings.etag correlation_id = str(uuid.uuid4()) self._vm_settings_error_reporter.report_request() diff --git a/azurelinuxagent/common/protocol/wire.py b/azurelinuxagent/common/protocol/wire.py index 40e58cc0f5..7923bea75d 100644 --- a/azurelinuxagent/common/protocol/wire.py +++ b/azurelinuxagent/common/protocol/wire.py @@ -83,8 +83,8 @@ def detect(self): logger.info('Initializing goal state during protocol detection') self.client.update_goal_state(force_update=True) - def update_goal_state(self): - self.client.update_goal_state() + def update_goal_state(self, silent=False): + self.client.update_goal_state(silent=silent) def update_host_plugin_from_goal_state(self): self.client.update_host_plugin_from_goal_state() @@ -759,18 +759,18 @@ def update_host_plugin(self, container_id, role_config_name): self._host_plugin.update_container_id(container_id) self._host_plugin.update_role_config_name(role_config_name) - def update_goal_state(self, force_update=False): + def update_goal_state(self, force_update=False, silent=False): """ Updates the goal state if the incarnation or etag changed or if 'force_update' is True """ try: if force_update: - logger.info("Forcing an update of the goal state..") + logger.info("Forcing an update of the goal state.") if self._goal_state is None or force_update: - self._goal_state = GoalState(self) + self._goal_state = GoalState(self, silent=silent) else: - self._goal_state.update() + self._goal_state.update(silent=silent) except ProtocolError: raise diff --git a/azurelinuxagent/ga/update.py b/azurelinuxagent/ga/update.py index 1a453c8c05..c800ce4700 100644 --- a/azurelinuxagent/ga/update.py +++ b/azurelinuxagent/ga/update.py @@ -168,7 +168,8 @@ def __init__(self): # these members are used to avoid reporting errors too frequently self._heartbeat_update_goal_state_error_count = 0 - self._last_try_update_goal_state_failed = False + self._update_goal_state_error_count = 0 + self._update_goal_state_last_error_report = datetime.min self._report_status_last_failed_goal_state = None # incarnation of the last goal state that has been fully processed @@ -481,13 +482,16 @@ def _try_update_goal_state(self, protocol): Attempts to update the goal state and returns True on success or False on failure, sending telemetry events about the failures. """ try: - protocol.update_goal_state() + max_errors_to_log = 3 + + protocol.update_goal_state(silent=self._update_goal_state_error_count >= max_errors_to_log) self._goal_state = protocol.get_goal_state() - if self._last_try_update_goal_state_failed: - self._last_try_update_goal_state_failed = False - message = u"Retrieving the goal state recovered from previous errors" + if self._update_goal_state_error_count > 0: + self._update_goal_state_error_count = 0 + message = u"Fetching the goal state recovered from previous errors. Fetched {0} (certificates: {1})".format( + self._goal_state.extensions_goal_state.id, self._goal_state.certs.summary) add_event(AGENT_NAME, op=WALAEventOperation.FetchGoalState, version=CURRENT_VERSION, is_success=True, message=message, log_event=False) logger.info(message) @@ -497,15 +501,21 @@ def _try_update_goal_state(self, protocol): self._supports_fast_track = False except Exception as e: - if not self._last_try_update_goal_state_failed: - self._last_try_update_goal_state_failed = True - message = u"An error occurred while retrieving the goal state: {0}".format(textutil.format_exception(e)) - logger.warn(message) - add_event(AGENT_NAME, op=WALAEventOperation.FetchGoalState, version=CURRENT_VERSION, is_success=False, message=message, log_event=False) - message = u"Attempts to retrieve the goal state are failing: {0}".format(ustr(e)) - logger.periodic_warn(logger.EVERY_SIX_HOURS, "[PERIODIC] {0}".format(message)) + self._update_goal_state_error_count += 1 self._heartbeat_update_goal_state_error_count += 1 + if self._update_goal_state_error_count <= max_errors_to_log: + message = u"Error fetching the goal state: {0}".format(textutil.format_exception(e)) + logger.error(message) + add_event(op=WALAEventOperation.FetchGoalState, is_success=False, message=message, log_event=False) + self._update_goal_state_last_error_report = datetime.now() + else: + if self._update_goal_state_last_error_report + timedelta(hours=6) > datetime.now(): + self._update_goal_state_last_error_report = datetime.now() + message = u"Fetching the goal state is still failing: {0}".format(textutil.format_exception(e)) + logger.error(message) + add_event(op=WALAEventOperation.FetchGoalState, is_success=False, message=message, log_event=False) return False + return True def __update_guest_agent(self, protocol): @@ -559,8 +569,8 @@ def handle_updates_for_requested_version(): raise AgentUpgradeExitException( "Exiting current process to {0} to the request Agent version {1}".format(prefix, requested_version)) - # Ignore new agents if updating is disabled - if not conf.get_autoupdate_enabled(): + # Skip the update if there is no goal state yet or auto-update is disabled + if self._goal_state is None or not conf.get_autoupdate_enabled(): return False if self._download_agent_if_upgrade_available(protocol): @@ -600,11 +610,14 @@ def _process_goal_state(self, exthandlers_handler, remote_access_handler): protocol = exthandlers_handler.protocol # update self._goal_state - self._try_update_goal_state(protocol) - - # Update the Guest Agent if a new version is available - if self._goal_state is not None: + if not self._try_update_goal_state(protocol): + # agent updates and status reporting should be done even when the goal state is not updated self.__update_guest_agent(protocol) + self._report_status(exthandlers_handler) + return + + # check for agent updates + self.__update_guest_agent(protocol) if self._processing_new_extensions_goal_state(): if not self._extensions_summary.converged: @@ -620,8 +633,7 @@ def _process_goal_state(self, exthandlers_handler, remote_access_handler): # Note: Monitor thread periodically checks this in addition to here. CGroupConfigurator.get_instance().check_cgroups(cgroup_metrics=[]) - # always report status, even if the goal state did not change - # do it before processing the remote access, since that operation can take a long time + # report status before processing the remote access, since that operation can take a long time self._report_status(exthandlers_handler) if self._processing_new_incarnation(): diff --git a/tests/data/wire/certs-2.xml b/tests/data/wire/certs-2.xml new file mode 100644 index 0000000000..66a231ee87 --- /dev/null +++ b/tests/data/wire/certs-2.xml @@ -0,0 +1,85 @@ + + + 2012-11-30 + 5 + Pkcs7BlobWithPfxContents + MIIOgwYJKoZIhvcNAQcDoIIOdDCCDnACAQIxggEwMIIBLAIBAoAUiF8ZYMs9mMa8 +QOEMxDaIhGza+0IwDQYJKoZIhvcNAQEBBQAEggEAQW7GyeRVEhHSU1/dzV0IndH0 +rDQk+27MvlsWTcpNcgGFtfRYxu5bzmp0+DoimX3pRBlSFOpMJ34jpg4xs78EsSWH +FRhCf3EGuEUBHo6yR8FhXDTuS7kZ0UmquiCI2/r8j8gbaGBNeP8IRizcAYrPMA5S +E8l1uCrw7DHuLscbVni/7UglGaTfFS3BqS5jYbiRt2Qh3p+JPUfm51IG3WCIw/WS +2QHebmHxvMFmAp8AiBWSQJizQBEJ1lIfhhBMN4A7NadMWAe6T2DRclvdrQhJX32k +amOiogbW4HJsL6Hphn7Frrw3CENOdWMAvgQBvZ3EjAXgsJuhBA1VIrwofzlDljCC +DTUGCSqGSIb3DQEHATAUBggqhkiG9w0DBwQIxcvw9qx4y0qAgg0QrINXpC23BWT2 +Fb9N8YS3Be9eO3fF8KNdM6qGf0kKR16l/PWyP2L+pZxCcCPk83d070qPdnJK9qpJ +6S1hI80Y0oQnY9VBFrdfkc8fGZHXqm5jNS9G32v/AxYpJJC/qrAQnWuOdLtOZaGL +94GEh3XRagvz1wifv8SRI8B1MzxrpCimeMxHkL3zvJFg9FjLGdrak868feqhr6Nb +pqH9zL7bMq8YP788qTRELUnL72aDzGAM7HEj7V4yu2uD3i3Ryz3bqWaj9IF38Sa0 +6rACBkiNfZBPgExoMUm2GNVyx8hTis2XKRgz4NLh29bBkKrArK9sYDncE9ocwrrX +AQ99yn03Xv6TH8bRp0cSj4jzBXc5RFsUQG/LxzJVMjvnkDbwNE41DtFiYz5QVcv1 +cMpTH16YfzSL34a479eNq/4+JAs/zcb2wjBskJipMUU4hNx5fhthvfKwDOQbLTqN +HcP23iPQIhjdUXf6gpu5RGu4JZ0dAMHMHFKvNL6TNejwx/H6KAPp6rCRsYi6QhAb +42SXdZmhAyQsFpGD9U5ieJApqeCHfj9Xhld61GqLJA9+WLVhDPADjqHoAVvrOkKH +OtPegId/lWnCB7p551klAjiEA2/DKxFBIAEhqZpiLl+juZfMXovkdmGxMP4gvNNF +gbS2k5A0IJ8q51gZcH1F56smdAmi5kvhPnFdy/9gqeI/F11F1SkbPVLImP0mmrFi +zQD5JGfEu1psUYvhpOdaYDkmAK5qU5xHSljqZFz5hXNt4ebvSlurHAhunJb2ln3g +AJUHwtZnVBrtYMB0w6fdwYqMxXi4vLeqUiHtIQtbOq32zlSryNPQqG9H0iP9l/G1 +t7oUfr9woI/B0kduaY9jd5Qtkqs1DoyfNMSaPNohUK/CWOTD51qOadzSvK0hJ+At +033PFfv9ilaX6GmzHdEVEanrn9a+BoBCnGnuysHk/8gdswj9OzeCemyIFJD7iObN +rNex3SCf3ucnAejJOA0awaLx88O1XTteUjcFn26EUji6DRK+8JJiN2lXSyQokNeY +ox6Z4hFQDmw/Q0k/iJqe9/Dq4zA0l3Krkpra0DZoWh5kzYUA0g5+Yg6GmRNRa8YG +tuuD6qK1SBEzmCYff6ivjgsXV5+vFBSjEpx2dPEaKdYxtHMOjkttuTi1mr+19dVf +hSltbzfISbV9HafX76dhwZJ0QwsUx+aOW6OrnK8zoQc5AFOXpe9BrrOuEX01qrM0 +KX5tS8Zx5HqDLievjir194oi3r+nAiG14kYlGmOTHshu7keGCgJmzJ0iVG/i+TnV +ZSLyd8OqV1F6MET1ijgR3OPL3kt81Zy9lATWk/DgKbGBkkKAnXO2HUw9U34JFyEy +vEc81qeHci8sT5QKSFHiP3r8EcK8rT5k9CHpnbFmg7VWSMVD0/wRB/C4BiIw357a +xyJ/q1NNvOZVAyYzIzf9TjwREtyeHEo5kS6hyWSn7fbFf3sNGO2I30veWOvE6kFA +HMtF3NplOrTYcM7fAK5zJCBK20oU645TxI8GsICMog7IFidFMdRn4MaXpwAjEZO4 +44m2M+4XyeRCAZhp1Fu4mDiHGqgd44mKtwvLACVF4ygWZnACDpI17X88wMnwL4uU +vgehLZdAE89gvukSCsET1inVBnn/hVenCRbbZ++IGv2XoYvRfeezfOoNUcJXyawQ +JFqN0CRB5pliuCesTO2urn4HSwGGoeBd507pGWZmOAjbNjGswlJJXF0NFnNW/zWw +UFYy+BI9axuhWTSnCXbNbngdNQKHznKe1Lwit6AI3U9jS33pM3W+pwUAQegVdtpG +XT01YgiMCBX+b8B/xcWTww0JbeUwKXudzKsPhQmaA0lubAo04JACMfON8jSZCeRV +TyIzgacxGU6YbEKH4PhYTGl9srcWIT9iGSYD53V7Kyvjumd0Y3Qc3JLnuWZT6Oe3 +uJ4xz9jJtoaTDvPJQNK3igscjZnWZSP8XMJo1/f7vbvD57pPt1Hqdirp1EBQNshk +iX9CUh4fuGFFeHf6MtGxPofbXmvA2GYcFsOez4/2eOTEmo6H3P4Hrya97XHS0dmD +zFSAjzAlacTrn1uuxtxFTikdOwvdmQJJEfyYWCB1lqWOZi97+7nzqyXMLvMgmwug +ZF/xHFMhFTR8Wn7puuwf36JpPQiM4oQ/Lp66zkS4UlKrVsmSXIXudLMg8SQ5WqK8 +DjevEZwsHHaMtfDsnCAhAdRc2jCpyHKKnmhCDdkcdJJEymWKILUJI5PJ3XtiMHnR +Sa35OOICS0lTq4VwhUdkGwGjRoY1GsriPHd6LOt1aom14yJros1h7ta604hSCn4k +zj9p7wY9gfgkXWXNfmarrZ9NNwlHxzgSva+jbJcLmE4GMX5OFHHGlRj/9S1xC2Wf +MY9orzlooGM74NtmRi4qNkFj3dQCde8XRR4wh2IvPUCsr4j+XaoCoc3R5Rn/yNJK +zIkccJ2K14u9X/A0BLXHn5Gnd0tBYcVOqP6dQlW9UWdJC/Xooh7+CVU5cZIxuF/s +Vvg+Xwiv3XqekJRu3cMllJDp5rwe5EWZSmnoAiGKjouKAIszlevaRiD/wT6Zra3c +Wn/1U/sGop6zRscHR7pgI99NSogzpVGThUs+ez7otDBIdDbLpMjktahgWoi1Vqhc +fNZXjA6ob4zTWY/16Ys0YWxHO+MtyWTMP1dnsqePDfYXGUHe8yGxylbcjfrsVYta +4H6eYR86eU3eXB+MpS/iA4jBq4QYWR9QUkd6FDfmRGgWlMXhisPv6Pfnj384NzEV +Emeg7tW8wzWR64EON9iGeGYYa2BBl2FVaayMEoUhthhFcDM1r3/Mox5xF0qnlys4 +goWkMzqbzA2t97bC0KDGzkcHT4wMeiJBLDZ7S2J2nDAEhcTLY0P2zvOB4879pEWx +Bd15AyG1DvNssA5ooaDzKi/Li6NgDuMJ8W7+tmsBwDvwuf2N3koqBeXfKhR4rTqu +Wg1k9fX3+8DzDf0EjtDZJdfWZAynONi1PhZGbNbaMKsQ+6TflkCACInRdOADR5GM +rL7JtrgF1a9n0HD9vk2WGZqKI71tfS8zODkOZDD8aAusD2DOSmVZl48HX/t4i4Wc +3dgi/gkCMrfK3wOujb8tL4zjnlVkM7kzKk0MgHuA1w81zFjeMFvigHes4IWhQVcz +ek3l4bGifI2kzU7bGIi5e/019ppJzGsVcrOE/3z4GS0DJVk6fy7MEMIFx0LhJPlL +T+9HMH85sSYb97PTiMWpfBvNw3FSC7QQT9FC3L8d/XtMY3NvZoc7Fz7cSGaj7NXG +1OgVnAzMunPa3QaduoxMF9346s+4a+FrpRxL/3bb4skojjmmLqP4dsbD1uz0fP9y +xSifnTnrtjumYWMVi+pEb5kR0sTHl0XS7qKRi3SEfv28uh72KdvcufonIA5rnEb5 ++yqAZiqW2OxVsRoVLVODPswP4VIDiun2kCnfkQygPzxlZUeDZur0mmZ3vwC81C1Q +dZcjlukZcqUaxybUloUilqfNeby+2Uig0krLh2+AM4EqR63LeZ/tk+zCitHeRBW0 +wl3Bd7ShBFg6kN5tCJlHf/G6suIJVr+A9BXfwekO9+//CutKakCwmJTUiNWbQbtN +q3aNCnomyD3WjvUbitVO0CWYjZrmMLIsPtzyLQydpT7tjXpHgvwm5GYWdUGnNs4y +NbA262sUl7Ku/GDw1CnFYXbxl+qxbucLtCdSIFR2xUq3rEO1MXlD/txdTxn6ANax +hi9oBg8tHzuGYJFiCDCvbVVTHgWUSnm/EqfclpJzGmxt8g7vbaohW7NMmMQrLBFP +G6qBypgvotx1iJWaHVLNNiXvyqQwTtelNPAUweRoNawBp/5KTwwy/tHeF0gsVQ7y +mFX4umub9YT34Lpe7qUPKNxXzFcUgAf1SA6vyZ20UI7p42S2OT2PrahJ+uO6LQVD ++REhtN0oyS3G6HzAmKkBgw7LcV3XmAr39iSR7mdmoHSJuI9bjveAPhniK+N6uuln +xf17Qnw5NWfr9MXcLli7zqwMglU/1bNirkwVqf/ogi/zQ3JYCo6tFGf/rnGQAORJ +hvOq2SEYXnizPPIH7VrpE16+jUXwgpiQ8TDyeLPmpZVuhXTXiCaJO5lIwmLQqkmg +JqNiT9V44sksNFTGNKgZo5O9rEqfqX4dLjfv6pGJL+MFXD9if4f1JQiXJfhcRcDh +Ff9B6HukgbJ1H96eLUUNj8sL1+WPOqawkS4wg7tVaERE8CW7mqk15dCysn9shSut +I+7JU7+dZsxpj0ownrxuPAFuT8ZlcBPrFzPUwTlW1G0CbuEco8ijfy5IfbyGCn5s +K/0bOfAuNVGoOpLZ1dMki2bGdBwQOQlkLKhAxYcCVQ0/urr1Ab+VXU9kBsIU8ssN +GogKngYpuUV0PHmpzmobielOHLjNqA2v9vQSV3Ed48wRy5OCwLX1+vYmYlggMDGt +wfl+7QbXYf+k5WnELf3IqYvh8ZWexa0= + + \ No newline at end of file diff --git a/tests/data/wire/goal_state.xml b/tests/data/wire/goal_state.xml index 579b5e87ad..0ccff211c9 100644 --- a/tests/data/wire/goal_state.xml +++ b/tests/data/wire/goal_state.xml @@ -15,12 +15,12 @@ b61f93d0-e1ed-40b2-b067-22c243233448.MachineRole_IN_0 Started - http://168.63.129.16:80/hostingenvuri/ - http://168.63.129.16:80/sharedconfiguri/ - http://168.63.129.16:80/certificatesuri/ - http://168.63.129.16:80/extensionsconfiguri/ - http://168.63.129.16:80/fullconfiguri/ - b61f93d0-e1ed-40b2-b067-22c243233448.1.b61f93d0-e1ed-40b2-b067-22c243233448.2.MachineRole_IN_0.xml + http://168.63.129.16:80/machine/865a6683-91d8-450f-99ae/bc8b9d47%2Db5ed%2D4704%2D85d9%2Dfd74cc967ec2.%5Fcanary?comp=config&type=hostingEnvironmentConfig&incarnation=1 + http://168.63.129.16:80/machine/865a6683-91d8-450f-99ae/bc8b9d47%2Db5ed%2D4704%2D85d9%2Dfd74cc967ec2.%5Fcanary?comp=config&type=sharedConfig&incarnation=1 + http://168.63.129.16:80/machine/865a6683-91d8-450f-99ae/bc8b9d47%2Db5ed%2D4704%2D85d9%2Dfd74cc967ec2.%5Fcanary?comp=config&type=extensionsConfig&incarnation=1 + http://168.63.129.16:80/machine/865a6683-91d8-450f-99ae/bc8b9d47%2Db5ed%2D4704%2D85d9%2Dfd74cc967ec2.%5Fcanary?comp=config&type=fullConfig&incarnation=1 + http://168.63.129.16:80/machine/865a6683-91d8-450f-99ae/bc8b9d47%2Db5ed%2D4704%2D85d9%2Dfd74cc967ec2.%5Fcanary?comp=certificates&incarnation=1 + bc8b9d47-b5ed-4704-85d9-fd74cc967ec2.5.bc8b9d47-b5ed-4704-85d9-fd74cc967ec2.5._canary.1.xml diff --git a/tests/data/wire/goal_state_no_ext.xml b/tests/data/wire/goal_state_no_ext.xml index ef7e3989e6..e9048daf6e 100644 --- a/tests/data/wire/goal_state_no_ext.xml +++ b/tests/data/wire/goal_state_no_ext.xml @@ -15,11 +15,11 @@ b61f93d0-e1ed-40b2-b067-22c243233448.MachineRole_IN_0 Started - http://168.63.129.16:80/hostingenvuri/ - http://168.63.129.16:80/sharedconfiguri/ - http://168.63.129.16:80/certificatesuri/ - http://168.63.129.16:80/fullconfiguri/ - b61f93d0-e1ed-40b2-b067-22c243233448.1.b61f93d0-e1ed-40b2-b067-22c243233448.2.MachineRole_IN_0.xml + http://168.63.129.16:80/machine/865a6683-91d8-450f-99ae/bc8b9d47%2Db5ed%2D4704%2D85d9%2Dfd74cc967ec2.%5Fcanary?comp=config&type=hostingEnvironmentConfig&incarnation=1 + http://168.63.129.16:80/machine/865a6683-91d8-450f-99ae/bc8b9d47%2Db5ed%2D4704%2D85d9%2Dfd74cc967ec2.%5Fcanary?comp=config&type=sharedConfig&incarnation=1 + http://168.63.129.16:80/machine/865a6683-91d8-450f-99ae/bc8b9d47%2Db5ed%2D4704%2D85d9%2Dfd74cc967ec2.%5Fcanary?comp=config&type=fullConfig&incarnation=1 + http://168.63.129.16:80/machine/865a6683-91d8-450f-99ae/bc8b9d47%2Db5ed%2D4704%2D85d9%2Dfd74cc967ec2.%5Fcanary?comp=certificates&incarnation=1 + bc8b9d47-b5ed-4704-85d9-fd74cc967ec2.5.bc8b9d47-b5ed-4704-85d9-fd74cc967ec2.5._canary.1.xml diff --git a/tests/data/wire/goal_state_remote_access.xml b/tests/data/wire/goal_state_remote_access.xml index c2840645fd..279006f219 100644 --- a/tests/data/wire/goal_state_remote_access.xml +++ b/tests/data/wire/goal_state_remote_access.xml @@ -17,12 +17,13 @@ b61f93d0-e1ed-40b2-b067-22c243233448.MachineRole_IN_0 Started - http://168.63.129.16:80/hostingenvuri/ - http://168.63.129.16:80/sharedconfiguri/ - http://168.63.129.16:80/certificatesuri/ - http://168.63.129.16:80/extensionsconfiguri/ - http://168.63.129.16:80/fullconfiguri/ - b61f93d0-e1ed-40b2-b067-22c243233448.1.b61f93d0-e1ed-40b2-b067-22c243233448.2.MachineRole_IN_0.xml + b61f93d0-e1ed-40b2-b067-22c243233448.1.b61f93d0-e1ed-40b2-b067-22c243233448.2.MachineRole_IN_0.xml + http://168.63.129.16:80/machine/865a6683-91d8-450f-99ae/bc8b9d47%2Db5ed%2D4704%2D85d9%2Dfd74cc967ec2.%5Fcanary?comp=config&type=hostingEnvironmentConfig&incarnation=1 + http://168.63.129.16:80/machine/865a6683-91d8-450f-99ae/bc8b9d47%2Db5ed%2D4704%2D85d9%2Dfd74cc967ec2.%5Fcanary?comp=config&type=sharedConfig&incarnation=1 + http://168.63.129.16:80/machine/865a6683-91d8-450f-99ae/bc8b9d47%2Db5ed%2D4704%2D85d9%2Dfd74cc967ec2.%5Fcanary?comp=config&type=extensionsConfig&incarnation=1 + http://168.63.129.16:80/machine/865a6683-91d8-450f-99ae/bc8b9d47%2Db5ed%2D4704%2D85d9%2Dfd74cc967ec2.%5Fcanary?comp=config&type=fullConfig&incarnation=1 + http://168.63.129.16:80/machine/865a6683-91d8-450f-99ae/bc8b9d47%2Db5ed%2D4704%2D85d9%2Dfd74cc967ec2.%5Fcanary?comp=certificates&incarnation=1 + bc8b9d47-b5ed-4704-85d9-fd74cc967ec2.5.bc8b9d47-b5ed-4704-85d9-fd74cc967ec2.5._canary.1.xml diff --git a/tests/ga/test_update.py b/tests/ga/test_update.py index 664c78e205..f301a8a7de 100644 --- a/tests/ga/test_update.py +++ b/tests/ga/test_update.py @@ -1785,7 +1785,7 @@ def update_goal_state_and_run_handler(): def test_it_should_wait_to_fetch_first_goal_state(self): with _get_update_handler() as (update_handler, protocol): - with patch("azurelinuxagent.common.logger.warn") as patch_warn: + with patch("azurelinuxagent.common.logger.error") as patch_error: with patch("azurelinuxagent.common.logger.info") as patch_info: # Fail GS fetching for the 1st 5 times the agent asks for it update_handler._fail_gs_count = 5 @@ -1799,13 +1799,15 @@ def get_handler(url, **kwargs): protocol.set_http_handlers(http_get_handler=get_handler) update_handler.run(debug=True) - self.assertEqual(0, update_handler.get_exit_code(), "Exit code should be 0; List of all warnings logged by the agent: {0}".format( - patch_warn.call_args_list)) - warn_msgs = [args[0] for (args, _) in patch_warn.call_args_list if - "An error occurred while retrieving the goal state" in args[0]] - self.assertTrue(len(warn_msgs) > 0, "Error should've been reported when failed to retrieve GS") + self.assertEqual(0, update_handler.get_exit_code(), "Exit code should be 0; List of all errors logged by the agent: {0}".format( + patch_error.call_args_list)) + + error_msgs = [args[0] for (args, _) in patch_error.call_args_list if + "Error fetching the goal state" in args[0]] + self.assertTrue(len(error_msgs) > 0, "Error should've been reported when failed to retrieve GS") + info_msgs = [args[0] for (args, _) in patch_info.call_args_list if - "Retrieving the goal state recovered from previous errors" in args[0]] + "Fetching the goal state recovered from previous errors." in args[0]] self.assertTrue(len(info_msgs) > 0, "Agent should've logged a message when recovered from GS errors") def test_it_should_reset_legacy_blacklisted_agents_on_process_start(self): @@ -2637,9 +2639,9 @@ def create_log_and_telemetry_mocks(): calls_to_strings = lambda calls: (str(c) for c in calls) filter_calls = lambda calls, regex=None: (c for c in calls_to_strings(calls) if regex is None or re.match(regex, c)) logger_calls = lambda regex=None: [m for m in filter_calls(logger.method_calls, regex)] # pylint: disable=used-before-assignment,unnecessary-comprehension - warnings = lambda: logger_calls(r'call.warn\(.*An error occurred while retrieving the goal state.*') - periodic_warnings = lambda: logger_calls(r'call.periodic_warn\(.*Attempts to retrieve the goal state are failing.*') - success_messages = lambda: logger_calls(r'call.info\(.*Retrieving the goal state recovered from previous errors.*') + errors = lambda: logger_calls(r'call.error\(.*Error fetching the goal state.*') + periodic_errors = lambda: logger_calls(r'call.error\(.*Fetching the goal state is still failing*') + success_messages = lambda: logger_calls(r'call.info\(.*Fetching the goal state recovered from previous errors.*') telemetry_calls = lambda regex=None: [m for m in filter_calls(add_event.mock_calls, regex)] # pylint: disable=used-before-assignment,unnecessary-comprehension goal_state_events = lambda: telemetry_calls(r".*op='FetchGoalState'.*") @@ -2664,10 +2666,8 @@ def create_log_and_telemetry_mocks(): with create_log_and_telemetry_mocks() as (logger, add_event): update_handler._try_update_goal_state(protocol) - w = warnings() - pw = periodic_warnings() - self.assertEqual(1, len(w), "A failure should have produced a warning: [{0}]".format(w)) - self.assertEqual(1, len(pw), "A failure should have produced a periodic warning: [{0}]".format(pw)) + e = errors() + self.assertEqual(1, len(e), "A failure should have produced an error: [{0}]".format(e)) gs = goal_state_events() self.assertTrue(len(gs) == 1 and 'is_success=False' in gs[0], "A failure should produce a telemetry event (success=false): [{0}]".format(gs)) @@ -2676,17 +2676,17 @@ def create_log_and_telemetry_mocks(): # ... and errors continue happening... # with create_log_and_telemetry_mocks() as (logger, add_event): - update_handler._try_update_goal_state(protocol) - update_handler._try_update_goal_state(protocol) - update_handler._try_update_goal_state(protocol) + for _ in range(5): + update_handler._update_goal_state_last_error_report = datetime.now() + timedelta(days=1) + update_handler._try_update_goal_state(protocol) - w = warnings() - pw = periodic_warnings() - self.assertTrue(len(w) == 0, "Subsequent failures should not produce warnings: [{0}]".format(w)) - self.assertEqual(len(pw), 3, "Subsequent failures should produce periodic warnings: [{0}]".format(pw)) + e = errors() + pe = periodic_errors() + self.assertEqual(2, len(e), "Two additional errors should have been reported: [{0}]".format(e)) + self.assertEqual(len(pe), 3, "Subsequent failures should produce periodic errors: [{0}]".format(pe)) tc = telemetry_calls() - self.assertTrue(len(tc) == 0, "Subsequent failures should not produce any telemetry events: [{0}]".format(tc)) + self.assertTrue(len(tc) == 5, "The failures should have produced telemetry events. Got: [{0}]".format(tc)) # # ... until we finally succeed @@ -2696,10 +2696,10 @@ def create_log_and_telemetry_mocks(): update_handler._try_update_goal_state(protocol) s = success_messages() - w = warnings() - pw = periodic_warnings() + e = errors() + pe = periodic_errors() self.assertEqual(len(s), 1, "Recovering after failures should have produced an info message: [{0}]".format(s)) - self.assertTrue(len(w) == 0 and len(pw) == 0, "Recovering after failures should have not produced any warnings: [{0}] [{1}]".format(w, pw)) + self.assertTrue(len(e) == 0 and len(pe) == 0, "Recovering after failures should have not produced any errors: [{0}] [{1}]".format(e, pe)) gs = goal_state_events() self.assertTrue(len(gs) == 1 and 'is_success=True' in gs[0], "Recovering after failures should produce a telemetry event (success=true): [{0}]".format(gs)) diff --git a/tests/protocol/HttpRequestPredicates.py b/tests/protocol/HttpRequestPredicates.py index 39243d5431..db3ab8b2a6 100644 --- a/tests/protocol/HttpRequestPredicates.py +++ b/tests/protocol/HttpRequestPredicates.py @@ -11,6 +11,22 @@ class HttpRequestPredicates(object): def is_goal_state_request(url): return url.lower() == 'http://{0}/machine/?comp=goalstate'.format(restutil.KNOWN_WIRESERVER_IP) + @staticmethod + def is_certificates_request(url): + return re.match(r'http://{0}(:80)?/machine/.*?comp=certificates'.format(restutil.KNOWN_WIRESERVER_IP), url, re.IGNORECASE) + + @staticmethod + def is_extensions_config_request(url): + return re.match(r'http://{0}(:80)?/machine/.*?comp=config&type=extensionsConfig'.format(restutil.KNOWN_WIRESERVER_IP), url, re.IGNORECASE) + + @staticmethod + def is_hosting_environment_config_request(url): + return re.match(r'http://{0}(:80)?/machine/.*?comp=config&type=hostingEnvironmentConfig'.format(restutil.KNOWN_WIRESERVER_IP), url, re.IGNORECASE) + + @staticmethod + def is_shared_config_request(url): + return re.match(r'http://{0}(:80)?/machine/.*?comp=config&type=sharedConfig'.format(restutil.KNOWN_WIRESERVER_IP), url, re.IGNORECASE) + @staticmethod def is_telemetry_request(url): return url.lower() == 'http://{0}/machine?comp=telemetrydata'.format(restutil.KNOWN_WIRESERVER_IP) diff --git a/tests/protocol/mockwiredata.py b/tests/protocol/mockwiredata.py index 218bd29377..7ec311af46 100644 --- a/tests/protocol/mockwiredata.py +++ b/tests/protocol/mockwiredata.py @@ -135,10 +135,10 @@ def __init__(self, data_files=None): "/HealthService": 0, "/vmAgentLog": 0, "goalstate": 0, - "hostingenvuri": 0, - "sharedconfiguri": 0, - "certificatesuri": 0, - "extensionsconfiguri": 0, + "hostingEnvironmentConfig": 0, + "sharedConfig": 0, + "certificates": 0, + "extensionsConfig": 0, "remoteaccessinfouri": 0, "extensionArtifact": 0, "agentArtifact": 0, @@ -198,6 +198,10 @@ def reload(self): if in_vm_artifacts_profile_file is not None: self.in_vm_artifacts_profile = load_data(in_vm_artifacts_profile_file) + def reset_call_counts(self): + for counter in self.call_counts: + self.call_counts[counter] = 0 + def mock_http_get(self, url, *_, **kwargs): content = '' response_headers = [] @@ -217,18 +221,18 @@ def mock_http_get(self, url, *_, **kwargs): elif "goalstate" in url: content = self.goal_state self.call_counts["goalstate"] += 1 - elif "hostingenvuri" in url: + elif HttpRequestPredicates.is_hosting_environment_config_request(url): content = self.hosting_env - self.call_counts["hostingenvuri"] += 1 - elif "sharedconfiguri" in url: + self.call_counts["hostingEnvironmentConfig"] += 1 + elif HttpRequestPredicates.is_shared_config_request(url): content = self.shared_config - self.call_counts["sharedconfiguri"] += 1 - elif "certificatesuri" in url: + self.call_counts["sharedConfig"] += 1 + elif HttpRequestPredicates.is_certificates_request(url): content = self.certs - self.call_counts["certificatesuri"] += 1 - elif "extensionsconfiguri" in url: + self.call_counts["certificates"] += 1 + elif HttpRequestPredicates.is_extensions_config_request(url): content = self.ext_conf - self.call_counts["extensionsconfiguri"] += 1 + self.call_counts["extensionsConfig"] += 1 elif "remoteaccessinfouri" in url: content = self.remote_access self.call_counts["remoteaccessinfouri"] += 1 diff --git a/tests/protocol/test_goal_state.py b/tests/protocol/test_goal_state.py index c54a65f9f7..c774171595 100644 --- a/tests/protocol/test_goal_state.py +++ b/tests/protocol/test_goal_state.py @@ -8,13 +8,12 @@ import re import time -from azurelinuxagent.common.event import WALAEventOperation from azurelinuxagent.common.future import httpclient from azurelinuxagent.common.protocol.extensions_goal_state import GoalStateSource, GoalStateChannel from azurelinuxagent.common.protocol.extensions_goal_state_from_extensions_config import ExtensionsGoalStateFromExtensionsConfig from azurelinuxagent.common.protocol.extensions_goal_state_from_vm_settings import ExtensionsGoalStateFromVmSettings from azurelinuxagent.common.protocol import hostplugin -from azurelinuxagent.common.protocol.goal_state import GoalState, _GET_GOAL_STATE_MAX_ATTEMPTS +from azurelinuxagent.common.protocol.goal_state import GoalState, GoalStateInconsistentError, _GET_GOAL_STATE_MAX_ATTEMPTS from azurelinuxagent.common.exception import ProtocolError from azurelinuxagent.common.utils import fileutil from azurelinuxagent.common.utils.archive import ARCHIVE_DIRECTORY_NAME @@ -360,17 +359,54 @@ def http_get_handler(url, *_, **__): self.assertEqual(initial_timestamp, goal_state.extensions_goal_state.created_on_timestamp, "The timestamp of the updated goal state is incorrect") self.assertTrue(goal_state.extensions_goal_state.is_outdated, "The updated goal state should be marked as outdated") - def test_it_should_report_missing_certificates(self): + def test_it_should_raise_when_the_tenant_certificate_is_missing(self): data_file = mockwiredata.DATA_FILE_VM_SETTINGS.copy() - data_file["vm_settings"] = "hostgaplugin/vm_settings-missing_cert.json" with mock_wire_protocol(data_file) as protocol: - with patch("azurelinuxagent.common.protocol.goal_state.add_event") as add_event: + data_file["vm_settings"] = "hostgaplugin/vm_settings-missing_cert.json" + protocol.mock_wire_data.reload() + + with self.assertRaises(GoalStateInconsistentError) as context: _ = GoalState(protocol.client) - expected_message = "Certificate 59A10F50FFE2A0408D3F03FE336C8FD5716CF25C needed by Microsoft.OSTCExtensions.VMAccessForLinux is missing from the goal state" - events = [kwargs for _, kwargs in add_event.call_args_list if kwargs['op'] == WALAEventOperation.VmSettings and kwargs['message'] == expected_message] + expected_message = "Certificate 59A10F50FFE2A0408D3F03FE336C8FD5716CF25C needed by Microsoft.OSTCExtensions.VMAccessForLinux is missing from the goal state" + self.assertIn(expected_message, str(context.exception)) + + def test_it_should_refresh_the_goal_state_when_it_is_inconsistent(self): + # + # Some scenarios can produce inconsistent goal states. For example, during hibernation/resume, the Fabric goal state changes (the + # tenant certificate is re-generated when the VM is restarted) *without* the incarnation changing. If a Fast Track goal state + # comes after that, the extensions will need the new certificate. This test simulates that scenario by mocking the certificates + # request and returning first a set of certificates (certs-2.xml) that do not match those needed by the extensions, and then a + # set (certs.xml) that does match. The test then ensures that the goal state was refreshed and the correct certificates were + # fetched. + # + data_files = [ + "wire/certs-2.xml", + "wire/certs.xml" + ] + + def http_get_handler(url, *_, **__): + if HttpRequestPredicates.is_certificates_request(url): + http_get_handler.certificate_requests += 1 + if http_get_handler.certificate_requests < len(data_files): + data = load_data(data_files[http_get_handler.certificate_requests - 1]) + return MockHttpResponse(status=200, body=data.encode('utf-8')) + return None + http_get_handler.certificate_requests = 0 + + with mock_wire_protocol(mockwiredata.DATA_FILE_VM_SETTINGS) as protocol: + protocol.set_http_handlers(http_get_handler=http_get_handler) + protocol.mock_wire_data.reset_call_counts() + + goal_state = GoalState(protocol.client) + + self.assertEqual(2, protocol.mock_wire_data.call_counts['goalstate'], "There should have been exactly 2 requests for the goal state (original + refresh)") + self.assertEqual(2, http_get_handler.certificate_requests, "There should have been exactly 2 requests for the goal state certificates (original + refresh)") + + thumbprints = [c.thumbprint for c in goal_state.certs.cert_list.certificates] - self.assertTrue( - len(events) == 1, - "Missing certificate 59A10F50FFE2A0408D3F03FE336C8FD5716CF25C was not reported. Telemetry: {0}".format([kwargs['message'] for _, kwargs in add_event.call_args_list])) + for extension in goal_state.extensions_goal_state.extensions: + for settings in extension.settings: + if settings.protectedSettings is not None: + self.assertIn(settings.certificateThumbprint, thumbprints, "Certificate is missing from the goal state.") diff --git a/tests/protocol/test_hostplugin.py b/tests/protocol/test_hostplugin.py index 16bb7ef0b6..9f96f7d554 100644 --- a/tests/protocol/test_hostplugin.py +++ b/tests/protocol/test_hostplugin.py @@ -257,9 +257,8 @@ def test_default_channel(self, patch_put, patch_upload, _): # assert host plugin route is called self.assertEqual(1, patch_put.call_count, "Host plugin was not used") - # assert update goal state is only called once, non-forced + # assert update goal state is only called once self.assertEqual(1, wire_protocol.client.update_goal_state.call_count, "Unexpected call count") - self.assertEqual(0, len(wire_protocol.client.update_goal_state.call_args[1]), "Unexpected parameters") # ensure the correct url is used self.assertEqual(sas_url, patch_put.call_args[0][0]) @@ -291,9 +290,8 @@ def test_fallback_channel_503(self, patch_put, patch_upload, _): # assert host plugin route is called self.assertEqual(1, patch_put.call_count, "Host plugin was not used") - # assert update goal state is only called once, non-forced + # assert update goal state is only called once self.assertEqual(1, wire_protocol.client.update_goal_state.call_count, "Update goal state unexpected call count") - self.assertEqual(0, len(wire_protocol.client.update_goal_state.call_args[1]), "Update goal state unexpected call count") # ensure the correct url is used self.assertEqual(sas_url, patch_put.call_args[0][0]) @@ -326,9 +324,8 @@ def test_fallback_channel_410(self, patch_refresh_host_plugin, patch_put, patch_ # assert host plugin route is called self.assertEqual(1, patch_put.call_count, "Host plugin was not used") - # assert update goal state is called with no arguments (forced=False), then update_host_plugin_from_goal_state is called + # assert update goal state is called, then update_host_plugin_from_goal_state is called self.assertEqual(1, wire_protocol.client.update_goal_state.call_count, "Update goal state unexpected call count") - self.assertEqual(0, len(wire_protocol.client.update_goal_state.call_args[1]), "Update goal state unexpected argument count") self.assertEqual(1, patch_refresh_host_plugin.call_count, "Refresh host plugin unexpected call count") # ensure the correct url is used @@ -361,9 +358,8 @@ def test_fallback_channel_failure(self, patch_put, patch_upload, _): # assert host plugin route is called self.assertEqual(1, patch_put.call_count, "Host plugin was not used") - # assert update goal state is called twice, forced=True on the second + # assert update goal state is called twice self.assertEqual(1, wire_protocol.client.update_goal_state.call_count, "Update goal state unexpected call count") - self.assertEqual(0, len(wire_protocol.client.update_goal_state.call_args[1]), "Update goal state unexpected call count") # ensure the correct url is used self.assertEqual(sas_url, patch_put.call_args[0][0]) diff --git a/tests/protocol/test_wire.py b/tests/protocol/test_wire.py index 0cc8a01e9a..c564af7217 100644 --- a/tests/protocol/test_wire.py +++ b/tests/protocol/test_wire.py @@ -160,7 +160,7 @@ def test_getters_with_stale_goal_state(self, patch_report, *args): # -- Tracking calls to retrieve GoalState is problematic since it is # fetched often; however, the dependent documents, such as the # HostingEnvironmentConfig, will be retrieved the expected number - self.assertEqual(1, test_data.call_counts["hostingenvuri"]) + self.assertEqual(1, test_data.call_counts["hostingEnvironmentConfig"]) self.assertEqual(1, patch_report.call_count) def test_call_storage_kwargs(self, *args): # pylint: disable=unused-argument From 10c49a11c7c2a03b8f7fbd03f492b0b4ea57eda2 Mon Sep 17 00:00:00 2001 From: Norberto Arrieta Date: Fri, 29 Apr 2022 10:41:01 -0700 Subject: [PATCH 06/13] Do not mark goal state as processed when goal state fails to update (#2569) (#2571) Co-authored-by: narrieta (cherry picked from commit 4bc921590409e610d225387afb13f9b7c821777a) --- azurelinuxagent/common/protocol/goal_state.py | 4 ++-- azurelinuxagent/ga/update.py | 20 +++++++++---------- 2 files changed, 12 insertions(+), 12 deletions(-) diff --git a/azurelinuxagent/common/protocol/goal_state.py b/azurelinuxagent/common/protocol/goal_state.py index edfd9d14f7..3301d783da 100644 --- a/azurelinuxagent/common/protocol/goal_state.py +++ b/azurelinuxagent/common/protocol/goal_state.py @@ -209,7 +209,7 @@ def _update(self, force_update): else: # vm_settings_updated most_recent = vm_settings - if self._extensions_goal_state is None or most_recent.created_on_timestamp > self._extensions_goal_state.created_on_timestamp: + if self._extensions_goal_state is None or most_recent.created_on_timestamp >= self._extensions_goal_state.created_on_timestamp: self._extensions_goal_state = most_recent # @@ -221,7 +221,7 @@ def _update(self, force_update): # Track goal state comes after that, the extensions will need the new certificate. The Agent needs to refresh the goal state in that # case, to ensure it fetches the new certificate. # - if self.extensions_goal_state.source == GoalStateSource.FastTrack: + if self._extensions_goal_state.source == GoalStateSource.FastTrack: self._check_certificates() def _check_certificates(self): diff --git a/azurelinuxagent/ga/update.py b/azurelinuxagent/ga/update.py index c800ce4700..eb14c6bac1 100644 --- a/azurelinuxagent/ga/update.py +++ b/azurelinuxagent/ga/update.py @@ -606,19 +606,19 @@ def _processing_new_extensions_goal_state(self): return self._goal_state is not None and egs.id != self._last_extensions_gs_id and not egs.is_outdated def _process_goal_state(self, exthandlers_handler, remote_access_handler): - try: - protocol = exthandlers_handler.protocol - - # update self._goal_state - if not self._try_update_goal_state(protocol): - # agent updates and status reporting should be done even when the goal state is not updated - self.__update_guest_agent(protocol) - self._report_status(exthandlers_handler) - return + protocol = exthandlers_handler.protocol - # check for agent updates + # update self._goal_state + if not self._try_update_goal_state(protocol): + # agent updates and status reporting should be done even when the goal state is not updated self.__update_guest_agent(protocol) + self._report_status(exthandlers_handler) + return + + # check for agent updates + self.__update_guest_agent(protocol) + try: if self._processing_new_extensions_goal_state(): if not self._extensions_summary.converged: message = "A new goal state was received, but not all the extensions in the previous goal state have completed: {0}".format(self._extensions_summary) From f0c0fb8214ef12b5279fa1e1847c44aab9bbee80 Mon Sep 17 00:00:00 2001 From: Shantanu Mishra Date: Sat, 30 Apr 2022 03:13:31 +0530 Subject: [PATCH 07/13] Debian - string conversion for systemd service (#2574) --- setup.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/setup.py b/setup.py index 56f7f54f11..f929435852 100755 --- a/setup.py +++ b/setup.py @@ -251,7 +251,7 @@ def get_data_files(name, version, fullname): # pylint: disable=R0912 def debian_has_systemd(): try: return subprocess.check_output( - ['cat', '/proc/1/comm']).strip() == 'systemd' + ['cat', '/proc/1/comm']).strip().decode() == 'systemd' except subprocess.CalledProcessError: return False From 685348dcba4a59c49a9c8e553c162075b6a5d0e2 Mon Sep 17 00:00:00 2001 From: Kevin Clark Date: Wed, 11 May 2022 09:49:35 -0700 Subject: [PATCH 08/13] Bug fix for fetching a goal state with empty certificates property (#2575) (#2585) (cherry picked from commit 8e9c1b5b514ffccb10f664f9f1b7c9edee1ddced) --- azurelinuxagent/common/protocol/goal_state.py | 9 +++- tests/data/wire/goal_state_no_certs.xml | 27 +++++++++++ tests/ga/test_update.py | 47 +++++++++++++++++++ 3 files changed, 81 insertions(+), 2 deletions(-) create mode 100644 tests/data/wire/goal_state_no_certs.xml diff --git a/azurelinuxagent/common/protocol/goal_state.py b/azurelinuxagent/common/protocol/goal_state.py index 3301d783da..8b508f61ad 100644 --- a/azurelinuxagent/common/protocol/goal_state.py +++ b/azurelinuxagent/common/protocol/goal_state.py @@ -82,7 +82,7 @@ def __init__(self, wire_client, silent=False): self._container_id = None self._hosting_env = None self._shared_conf = None - self._certs = None + self._certs = EmptyCertificates() self._remote_access = None self.update(silent=silent) @@ -348,7 +348,7 @@ def _fetch_full_wire_server_goal_state(self, incarnation, xml_doc): shared_conf = SharedConfig(xml_text) self._history.save_shared_conf(xml_text) - certs = None + certs = EmptyCertificates() certs_uri = findtext(xml_doc, "Certificates") if certs_uri is not None: xml_text = self._wire_client.fetch_config(certs_uri, self._wire_client.get_header_for_cert()) @@ -506,6 +506,11 @@ def _write_to_tmp_file(index, suffix, buf): fileutil.write_file(file_name, "".join(buf)) return file_name +class EmptyCertificates: + def __init__(self): + self.cert_list = CertList() + self.summary = [] # debugging info + self.warnings = [] class RemoteAccess(object): """ diff --git a/tests/data/wire/goal_state_no_certs.xml b/tests/data/wire/goal_state_no_certs.xml new file mode 100644 index 0000000000..1ab7fa2172 --- /dev/null +++ b/tests/data/wire/goal_state_no_certs.xml @@ -0,0 +1,27 @@ + + + 2010-12-15 + 1 + + Started + + 16001 + + + + c6d5526c-5ac2-4200-b6e2-56f2b70c5ab2 + + + b61f93d0-e1ed-40b2-b067-22c243233448.MachineRole_IN_0 + Started + + http://168.63.129.16:80/machine/865a6683-91d8-450f-99ae/bc8b9d47%2Db5ed%2D4704%2D85d9%2Dfd74cc967ec2.%5Fcanary?comp=config&type=hostingEnvironmentConfig&incarnation=1 + http://168.63.129.16:80/machine/865a6683-91d8-450f-99ae/bc8b9d47%2Db5ed%2D4704%2D85d9%2Dfd74cc967ec2.%5Fcanary?comp=config&type=sharedConfig&incarnation=1 + http://168.63.129.16:80/machine/865a6683-91d8-450f-99ae/bc8b9d47%2Db5ed%2D4704%2D85d9%2Dfd74cc967ec2.%5Fcanary?comp=config&type=extensionsConfig&incarnation=1 + http://168.63.129.16:80/machine/865a6683-91d8-450f-99ae/bc8b9d47%2Db5ed%2D4704%2D85d9%2Dfd74cc967ec2.%5Fcanary?comp=config&type=fullConfig&incarnation=1 + bc8b9d47-b5ed-4704-85d9-fd74cc967ec2.5.bc8b9d47-b5ed-4704-85d9-fd74cc967ec2.5._canary.1.xml + + + + + diff --git a/tests/ga/test_update.py b/tests/ga/test_update.py index f301a8a7de..92bd858374 100644 --- a/tests/ga/test_update.py +++ b/tests/ga/test_update.py @@ -1458,6 +1458,53 @@ def _get_test_ext_handler_instance(protocol, name="OSTCExtensions.ExampleHandler eh = Extension(name=name) eh.version = version return ExtHandlerInstance(eh, protocol) + + def test_update_handler_recovers_from_error_with_no_certs(self): + data = DATA_FILE.copy() + data['goal_state'] = 'wire/goal_state_no_certs.xml' + + def fail_gs_fetch(url, *_, **__): + if HttpRequestPredicates.is_goal_state_request(url): + return MockHttpResponse(status=500) + return None + + with mock_wire_protocol(data) as protocol: + + def fail_fetch_on_second_iter(iteration): + if iteration == 2: + protocol.set_http_handlers(http_get_handler=fail_gs_fetch) + if iteration > 2: # Zero out the fail handler for subsequent iterations. + protocol.set_http_handlers(http_get_handler=None) + + with mock_update_handler(protocol, 3, on_new_iteration=fail_fetch_on_second_iter) as update_handler: + with patch("azurelinuxagent.ga.update.logger.error") as patched_error: + with patch("azurelinuxagent.ga.update.logger.info") as patched_info: + def match_unexpected_errors(): + unexpected_msg_fragment = "Error fetching the goal state:" + + matching_errors = [] + for (args, _) in filter(lambda a: len(a) > 0, patched_error.call_args_list): + if unexpected_msg_fragment in args[0]: + matching_errors.append(args[0]) + + if len(matching_errors) > 1: + self.fail("Guest Agent did not recover, with new error(s): {}"\ + .format(matching_errors[1:])) + + def match_expected_info(): + expected_msg_fragment = "Fetching the goal state recovered from previous errors" + + for (call_args, _) in filter(lambda a: len(a) > 0, patched_info.call_args_list): + if expected_msg_fragment in call_args[0]: + break + else: + self.fail("Expected the guest agent to recover with '{}', but it didn't"\ + .format(expected_msg_fragment)) + + update_handler.run(debug=True) + match_unexpected_errors() # Match on errors first, they can provide more info. + match_expected_info() + def test_it_should_recreate_handler_env_on_service_startup(self): iterations = 5 From a867dffdb50fb569dc53759d491eb4cca003377b Mon Sep 17 00:00:00 2001 From: Kevin Clark Date: Mon, 16 May 2022 10:27:26 -0700 Subject: [PATCH 09/13] Move error counter reset down to end of block. (#2576) (#2586) (cherry picked from commit dee5fef9b98bf3e50b8fce213f67bd0d0b288c21) --- azurelinuxagent/ga/update.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/azurelinuxagent/ga/update.py b/azurelinuxagent/ga/update.py index eb14c6bac1..fd11c4ca09 100644 --- a/azurelinuxagent/ga/update.py +++ b/azurelinuxagent/ga/update.py @@ -489,11 +489,11 @@ def _try_update_goal_state(self, protocol): self._goal_state = protocol.get_goal_state() if self._update_goal_state_error_count > 0: - self._update_goal_state_error_count = 0 message = u"Fetching the goal state recovered from previous errors. Fetched {0} (certificates: {1})".format( self._goal_state.extensions_goal_state.id, self._goal_state.certs.summary) add_event(AGENT_NAME, op=WALAEventOperation.FetchGoalState, version=CURRENT_VERSION, is_success=True, message=message, log_event=False) logger.info(message) + self._update_goal_state_error_count = 0 try: self._supports_fast_track = conf.get_enable_fast_track() and protocol.client.get_host_plugin().check_vm_settings_support() From 81563a6be32ca2f028ff2adb4fbe3b23bc686b63 Mon Sep 17 00:00:00 2001 From: Kevin Clark Date: Mon, 16 May 2022 11:48:34 -0700 Subject: [PATCH 10/13] Bug Fix: Change fast track timestamp default from None to datetime.min (#2577) (#2587) (cherry picked from commit 35fed83afb5da1e93d0e207f4b54a863f188ec6c) --- azurelinuxagent/common/protocol/hostplugin.py | 4 +- .../vm_settings-fabric-no_thumbprints.json | 192 ++++++++++++++++++ tests/ga/test_update.py | 55 ++++- 3 files changed, 245 insertions(+), 6 deletions(-) create mode 100644 tests/data/hostgaplugin/vm_settings-fabric-no_thumbprints.json diff --git a/azurelinuxagent/common/protocol/hostplugin.py b/azurelinuxagent/common/protocol/hostplugin.py index 5f795dca3e..f79076f8ef 100644 --- a/azurelinuxagent/common/protocol/hostplugin.py +++ b/azurelinuxagent/common/protocol/hostplugin.py @@ -95,7 +95,7 @@ def __init__(self, endpoint): if not os.path.exists(self._get_fast_track_state_file()): self._supports_vm_settings = False self._supports_vm_settings_next_check = datetime.datetime.now() - self._fast_track_timestamp = None + self._fast_track_timestamp = timeutil.create_timestamp(datetime.datetime.min) else: self._supports_vm_settings = True self._supports_vm_settings_next_check = datetime.datetime.now() @@ -443,7 +443,7 @@ def get_fast_track_timestamp(): goal state was Fabric or fetch_vm_settings() has not been invoked. """ if not os.path.exists(HostPluginProtocol._get_fast_track_state_file()): - return None + return timeutil.create_timestamp(datetime.datetime.min) try: with open(HostPluginProtocol._get_fast_track_state_file(), "r") as file_: diff --git a/tests/data/hostgaplugin/vm_settings-fabric-no_thumbprints.json b/tests/data/hostgaplugin/vm_settings-fabric-no_thumbprints.json new file mode 100644 index 0000000000..bbd9459336 --- /dev/null +++ b/tests/data/hostgaplugin/vm_settings-fabric-no_thumbprints.json @@ -0,0 +1,192 @@ +{ + "hostGAPluginVersion": "1.0.8.124", + "vmSettingsSchemaVersion": "0.0", + "activityId": "a33f6f53-43d6-4625-b322-1a39651a00c9", + "correlationId": "9a47a2a2-e740-4bfc-b11b-4f2f7cfe7d2e", + "inSvdSeqNo": 1, + "extensionsLastModifiedTickCount": 637726657706205299, + "extensionGoalStatesSource": "Fabric", + "onHold": true, + "statusUploadBlob": { + "statusBlobType": "BlockBlob", + "value": "https://dcrcl3a0xs.blob.core.windows.net/$system/edp0plkw2b.86f4ae0a-61f8-48ae-9199-40f402d56864.status?sv=2018-03-28&sr=b&sk=system-1&sig=KNWgC2%3d&se=9999-01-01T00%3a00%3a00Z&sp=w" + }, + "inVMMetadata": { + "subscriptionId": "8e037ad4-618f-4466-8bc8-5099d41ac15b", + "resourceGroupName": "rg-dc-86fjzhp", + "vmName": "edp0plkw2b", + "location": "CentralUSEUAP", + "vmId": "86f4ae0a-61f8-48ae-9199-40f402d56864", + "vmSize": "Standard_B2s", + "osType": "Linux" + }, + "requiredFeatures": [ + { + "name": "MultipleExtensionsPerHandler" + } + ], + "gaFamilies": [ + { + "name": "Prod", + "uris": [ + "https://zrdfepirv2cdm03prdstr01a.blob.core.windows.net/7d89d439b79f4452950452399add2c90/Microsoft.OSTCLinuxAgent_Prod_uscentraleuap_manifest.xml", + "https://ardfepirv2cdm03prdstr01a.blob.core.windows.net/7d89d439b79f4452950452399add2c90/Microsoft.OSTCLinuxAgent_Prod_uscentraleuap_manifest.xml" + ] + }, + { + "name": "Test", + "uris": [ + "https://zrdfepirv2cdm03prdstr01a.blob.core.windows.net/7d89d439b79f4452950452399add2c90/Microsoft.OSTCLinuxAgent_Test_uscentraleuap_manifest.xml", + "https://ardfepirv2cdm03prdstr01a.blob.core.windows.net/7d89d439b79f4452950452399add2c90/Microsoft.OSTCLinuxAgent_Test_uscentraleuap_manifest.xml" + ] + } + ], + "extensionGoalStates": [ + { + "name": "Microsoft.Azure.Monitor.AzureMonitorLinuxAgent", + "version": "1.9.1", + "location": "https://zrdfepirv2cbn04prdstr01a.blob.core.windows.net/a47f0806d764480a8d989d009c75007d/Microsoft.Azure.Monitor_AzureMonitorLinuxAgent_useast2euap_manifest.xml", + "failoverlocation": "https://zrdfepirv2cbn06prdstr01a.blob.core.windows.net/a47f0806d764480a8d989d009c75007d/Microsoft.Azure.Monitor_AzureMonitorLinuxAgent_useast2euap_manifest.xml", + "additionalLocations": ["https://zrdfepirv2cbn09pr02a.blob.core.windows.net/a47f0806d764480a8d989d009c75007d/Microsoft.Azure.Monitor_AzureMonitorLinuxAgent_useast2euap_manifest.xml"], + "state": "enabled", + "autoUpgrade": true, + "runAsStartupTask": false, + "isJson": true, + "useExactVersion": true, + "settingsSeqNo": 0, + "settings": [ + { + "publicSettings": "{\"GCS_AUTO_CONFIG\":true}" + } + ] + }, + { + "name": "Microsoft.Azure.Security.Monitoring.AzureSecurityLinuxAgent", + "version": "2.15.112", + "location": "https://zrdfepirv2cbn04prdstr01a.blob.core.windows.net/4ef06ad957494df49c807a5334f2b5d2/Microsoft.Azure.Security.Monitoring_AzureSecurityLinuxAgent_useast2euap_manifest.xml", + "failoverlocation": "https://zrdfepirv2cbz06prdstr01a.blob.core.windows.net/4ef06ad957494df49c807a5334f2b5d2/Microsoft.Azure.Security.Monitoring_AzureSecurityLinuxAgent_useast2euap_manifest.xml", + "additionalLocations": ["https://zrdfepirv2cbn06prdstr01a.blob.core.windows.net/4ef06ad957494df49c807a5334f2b5d2/Microsoft.Azure.Security.Monitoring_AzureSecurityLinuxAgent_useast2euap_manifest.xml"], + "state": "enabled", + "autoUpgrade": true, + "runAsStartupTask": false, + "isJson": true, + "useExactVersion": true, + "settingsSeqNo": 0, + "settings": [ + { + "publicSettings": "{\"enableGenevaUpload\":true}" + } + ] + }, + { + "name": "Microsoft.Azure.Extensions.CustomScript", + "version": "2.1.6", + "location": "https://umsavwggj2v40kvqhc0w.blob.core.windows.net/5237dd14-0aad-f051-0fad-1e33e1b63091/5237dd14-0aad-f051-0fad-1e33e1b63091_manifest.xml", + "failoverlocation": "https://umsafwzhkbm1rfrhl0ws.blob.core.windows.net/5237dd14-0aad-f051-0fad-1e33e1b63091/5237dd14-0aad-f051-0fad-1e33e1b63091_manifest.xml", + "additionalLocations": [ + "https://umsanh4b5rfz0q0p4pwm.blob.core.windows.net/5237dd14-0aad-f051-0fad-1e33e1b63091/5237dd14-0aad-f051-0fad-1e33e1b63091_manifest.xml" + ], + "state": "enabled", + "autoUpgrade": true, + "runAsStartupTask": false, + "isJson": true, + "useExactVersion": true, + "settingsSeqNo": 0, + "isMultiConfig": false, + "settings": [ + { + "publicSettings": "{\"commandToExecute\":\"echo 'cee174d4-4daa-4b07-9958-53b9649445c2'\"}" + } + ], + "dependsOn": [ + { + "DependsOnExtension": [ + { + "handler": "Microsoft.Azure.Security.Monitoring.AzureSecurityLinuxAgent" + } + ], + "dependencyLevel": 1 + } + ] + }, + { + "name": "Microsoft.CPlat.Core.RunCommandHandlerLinux", + "version": "1.2.0", + "location": "https://umsavbvncrpzbnxmxzmr.blob.core.windows.net/f4086d41-69f9-3103-78e0-8a2c7e789d0f/f4086d41-69f9-3103-78e0-8a2c7e789d0f_manifest.xml", + "failoverlocation": "https://umsajbjtqrb3zqjvgb2z.blob.core.windows.net/f4086d41-69f9-3103-78e0-8a2c7e789d0f/f4086d41-69f9-3103-78e0-8a2c7e789d0f_manifest.xml", + "additionalLocations": [ + "https://umsawqtlsshtn5v2nfgh.blob.core.windows.net/f4086d41-69f9-3103-78e0-8a2c7e789d0f/f4086d41-69f9-3103-78e0-8a2c7e789d0f_manifest.xml" + ], + "state": "enabled", + "autoUpgrade": true, + "runAsStartupTask": false, + "isJson": true, + "useExactVersion": true, + "settingsSeqNo": 0, + "isMultiConfig": true, + "settings": [ + { + "publicSettings": "{\"source\":{\"script\":\"echo '4abb1e88-f349-41f8-8442-247d9fdfcac5'\"}}", + "seqNo": 0, + "extensionName": "MCExt1", + "extensionState": "enabled" + }, + { + "publicSettings": "{\"source\":{\"script\":\"echo 'e865c9bc-a7b3-42c6-9a79-cfa98a1ee8b3'\"}}", + "seqNo": 0, + "extensionName": "MCExt2", + "extensionState": "enabled" + }, + { + "publicSettings": "{\"source\":{\"script\":\"echo 'f923e416-0340-485c-9243-8b84fb9930c6'\"}}", + "seqNo": 0, + "extensionName": "MCExt3", + "extensionState": "enabled" + } + ], + "dependsOn": [ + { + "dependsOnExtension": [ + { + "extension": "...", + "handler": "..." + }, + { + "extension": "...", + "handler": "..." + } + ], + "dependencyLevel": 2, + "name": "MCExt1" + }, + { + "dependsOnExtension": [ + { + "extension": "...", + "handler": "..." + } + ], + "dependencyLevel": 1, + "name": "MCExt2" + } + ] + }, + { + "name": "Microsoft.OSTCExtensions.VMAccessForLinux", + "version": "1.5.11", + "location": "https://umsasc25p0kjg0c1dg4b.blob.core.windows.net/2bbece4f-0283-d415-b034-cc0adc6997a1/2bbece4f-0283-d415-b034-cc0adc6997a1_manifest.xml", + "failoverlocation": "https://umsamfwlmfshvxx2lsjm.blob.core.windows.net/2bbece4f-0283-d415-b034-cc0adc6997a1/2bbece4f-0283-d415-b034-cc0adc6997a1_manifest.xml", + "additionalLocations": [ + "https://umsah3cwjlctnmhsvzqv.blob.core.windows.net/2bbece4f-0283-d415-b034-cc0adc6997a1/2bbece4f-0283-d415-b034-cc0adc6997a1_manifest.xml" + ], + "state": "enabled", + "autoUpgrade": false, + "runAsStartupTask": false, + "isJson": true, + "useExactVersion": true, + "settingsSeqNo": 0, + "isMultiConfig": false, + "settings": [ ] + } + ] +} diff --git a/tests/ga/test_update.py b/tests/ga/test_update.py index 92bd858374..edcf669d62 100644 --- a/tests/ga/test_update.py +++ b/tests/ga/test_update.py @@ -38,7 +38,7 @@ VMAgentUpdateStatuses from azurelinuxagent.common.protocol.util import ProtocolUtil from azurelinuxagent.common.protocol.wire import WireProtocol -from azurelinuxagent.common.utils import fileutil, restutil, textutil +from azurelinuxagent.common.utils import fileutil, restutil, textutil, timeutil from azurelinuxagent.common.utils.archive import ARCHIVE_DIRECTORY_NAME, AGENT_STATUS_FILE from azurelinuxagent.common.utils.flexible_version import FlexibleVersion from azurelinuxagent.common.utils.networkutil import FirewallCmdDirectCommands, AddFirewallRules @@ -52,7 +52,7 @@ READONLY_FILE_GLOBS, ExtensionsSummary, AgentUpgradeType from tests.ga.mocks import mock_update_handler from tests.protocol.mocks import mock_wire_protocol, MockHttpResponse -from tests.protocol.mockwiredata import DATA_FILE, DATA_FILE_MULTIPLE_EXT +from tests.protocol.mockwiredata import DATA_FILE, DATA_FILE_MULTIPLE_EXT, DATA_FILE_VM_SETTINGS from tests.tools import AgentTestCase, AgentTestCaseWithGetVmSizeMock, data_dir, DEFAULT, patch, load_bin_data, Mock, MagicMock, \ clear_singleton_instances from tests.protocol import mockwiredata @@ -2874,7 +2874,7 @@ def test_it_should_mark_outdated_goal_states_on_service_restart_when_host_ga_plu def test_it_should_clear_the_timestamp_for_the_most_recent_fast_track_goal_state(self): data_file = self._prepare_fast_track_goal_state() - if HostPluginProtocol.get_fast_track_timestamp() is None: + if HostPluginProtocol.get_fast_track_timestamp() == timeutil.create_timestamp(datetime.min): raise Exception("The test setup did not save the Fast Track state") with patch("azurelinuxagent.common.conf.get_enable_fast_track", return_value=False): @@ -2882,8 +2882,55 @@ def test_it_should_clear_the_timestamp_for_the_most_recent_fast_track_goal_state with mock_update_handler(protocol) as update_handler: update_handler.run() - self.assertIsNone(HostPluginProtocol.get_fast_track_timestamp(), "The Fast Track state was not cleared") + self.assertEqual(HostPluginProtocol.get_fast_track_timestamp(), timeutil.create_timestamp(datetime.min), + "The Fast Track state was not cleared") + + def test_it_should_default_fast_track_timestamp_to_datetime_min(self): + data = DATA_FILE_VM_SETTINGS.copy() + # TODO: Currently, there's a limitation in the mocks where bumping the incarnation but the goal + # state will cause the agent to error out while trying to write the certificates to disk. These + # files have no dependencies on certs, so using them does not present that issue. + # + # Note that the scenario this test is representing does not depend on certificates at all, and + # can be changed to use the default files when the above limitation is addressed. + data["vm_settings"] = "hostgaplugin/vm_settings-fabric-no_thumbprints.json" + data['goal_state'] = 'wire/goal_state_no_certs.xml' + + def vm_settings_no_change(url, *_, **__): + if HttpRequestPredicates.is_host_plugin_vm_settings_request(url): + return MockHttpResponse(httpclient.NOT_MODIFIED) + return None + + def vm_settings_not_supported(url, *_, **__): + if HttpRequestPredicates.is_host_plugin_vm_settings_request(url): + return MockHttpResponse(404) + return None + + with mock_wire_protocol(data) as protocol: + + def mock_live_migration(iteration): + if iteration == 1: + protocol.mock_wire_data.set_incarnation(2) + protocol.set_http_handlers(http_get_handler=vm_settings_no_change) + elif iteration == 2: + protocol.mock_wire_data.set_incarnation(3) + protocol.set_http_handlers(http_get_handler=vm_settings_not_supported) + + with mock_update_handler(protocol, 3, on_new_iteration=mock_live_migration) as update_handler: + with patch("azurelinuxagent.ga.update.logger.error") as patched_error: + def check_for_errors(): + msg_fragment = "Error fetching the goal state:" + + for (args, _) in filter(lambda a: len(a) > 0, patched_error.call_args_list): + if msg_fragment in args[0]: + self.fail("Found error: {}".format(args[0])) + update_handler.run(debug=True) + check_for_errors() + + timestamp = protocol.client.get_host_plugin()._fast_track_timestamp + self.assertEqual(timestamp, timeutil.create_timestamp(datetime.min), + "Expected fast track time stamp to be set to {0}, got {1}".format(datetime.min, timestamp)) class HeartbeatTestCase(AgentTestCase): From e4ec4c96497ac2e7d9869d54829562cbd07febbe Mon Sep 17 00:00:00 2001 From: Norberto Arrieta Date: Mon, 16 May 2022 16:07:36 -0700 Subject: [PATCH 11/13] Update CODEOWNERS (#2590) Co-authored-by: narrieta --- CODEOWNERS | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/CODEOWNERS b/CODEOWNERS index 9801efc186..e85dfc8d3d 100644 --- a/CODEOWNERS +++ b/CODEOWNERS @@ -20,4 +20,4 @@ # # Linux Agent team # -* @narrieta @larohra @kevinclark19a @ZhidongPeng @dhivyaganesan @nagworld9 +* @narrieta @kevinclark19a @ZhidongPeng @dhivyaganesan @nagworld9 From aa6d82c97082705ee47875fb4429152a3c53f898 Mon Sep 17 00:00:00 2001 From: Nageswara Nandigam <84482346+nagworld9@users.noreply.github.com> Date: Tue, 17 May 2022 12:21:19 -0700 Subject: [PATCH 12/13] ext cpu throttling scenarios (#2581) * ext cpu throttling scenarios * address comments * remove unused import --- azurelinuxagent/common/cgroup.py | 14 ++- azurelinuxagent/common/cgroupapi.py | 6 +- azurelinuxagent/common/cgroupconfigurator.py | 103 +++++++++++++++--- .../common/utils/extensionprocessutil.py | 30 ++++- tests/common/test_cgroupconfigurator.py | 54 ++++++++- tests/common/test_cgroups.py | 2 +- tests/common/test_cgroupstelemetry.py | 2 +- tests/utils/test_extension_process_util.py | 50 ++++++++- 8 files changed, 223 insertions(+), 38 deletions(-) diff --git a/azurelinuxagent/common/cgroup.py b/azurelinuxagent/common/cgroup.py index 900f4f5b46..5fadf6bfb1 100644 --- a/azurelinuxagent/common/cgroup.py +++ b/azurelinuxagent/common/cgroup.py @@ -171,7 +171,7 @@ def _get_cpu_ticks(self, allow_no_such_file_or_directory_error=False): return cpu_ticks - def _get_throttled_time(self): + def get_throttled_time(self): try: with open(os.path.join(self.path, 'cpu.stat')) as cpu_stat: # @@ -205,7 +205,7 @@ def initialize_cpu_usage(self): raise CGroupsException("initialize_cpu_usage() should be invoked only once") self._current_cgroup_cpu = self._get_cpu_ticks(allow_no_such_file_or_directory_error=True) self._current_system_cpu = self._osutil.get_total_cpu_ticks_since_boot() - self._current_throttled_time = self._get_throttled_time() + self._current_throttled_time = self.get_throttled_time() def get_cpu_usage(self): """ @@ -229,16 +229,20 @@ def get_cpu_usage(self): return round(100.0 * self._osutil.get_processor_cores() * float(cgroup_delta) / float(system_delta), 3) - def get_throttled_time(self): + def get_cpu_throttled_time(self, read_previous_throttled_time=True): """ Computes the throttled time (in seconds) since the last call to this function. NOTE: initialize_cpu_usage() must be invoked before calling this function + Compute only current throttled time if read_previous_throttled_time set to False """ + if not read_previous_throttled_time: + return float(self.get_throttled_time() / 1E9) + if not self._cpu_usage_initialized(): raise CGroupsException("initialize_cpu_usage() must be invoked before the first call to get_throttled_time()") self._previous_throttled_time = self._current_throttled_time - self._current_throttled_time = self._get_throttled_time() + self._current_throttled_time = self.get_throttled_time() return float(self._current_throttled_time - self._previous_throttled_time) / 1E9 @@ -249,7 +253,7 @@ def get_tracked_metrics(self, **kwargs): tracked.append(MetricValue(MetricsCategory.CPU_CATEGORY, MetricsCounter.PROCESSOR_PERCENT_TIME, self.name, cpu_usage)) if 'track_throttled_time' in kwargs and kwargs['track_throttled_time']: - throttled_time = self.get_throttled_time() + throttled_time = self.get_cpu_throttled_time() if cpu_usage >= float(0) and throttled_time >= float(0): tracked.append(MetricValue(MetricsCategory.CPU_CATEGORY, MetricsCounter.THROTTLED_TIME, self.name, throttled_time)) diff --git a/azurelinuxagent/common/cgroupapi.py b/azurelinuxagent/common/cgroupapi.py index f69b1591c0..7b7e688a1a 100644 --- a/azurelinuxagent/common/cgroupapi.py +++ b/azurelinuxagent/common/cgroupapi.py @@ -280,6 +280,7 @@ def start_extension_command(self, extension_name, command, cmd_name, timeout, sh logger.info("Started extension in unit '{0}'", scope_name) + cpu_cgroup = None try: cgroup_relative_path = os.path.join('azure.slice/azure-vmextensions.slice', extension_slice_name) @@ -289,7 +290,8 @@ def start_extension_command(self, extension_name, command, cmd_name, timeout, sh logger.info("The CPU controller is not mounted; will not track resource usage") else: cpu_cgroup_path = os.path.join(cpu_cgroup_mountpoint, cgroup_relative_path) - CGroupsTelemetry.track_cgroup(CpuCgroup(extension_name, cpu_cgroup_path)) + cpu_cgroup = CpuCgroup(extension_name, cpu_cgroup_path) + CGroupsTelemetry.track_cgroup(cpu_cgroup) except IOError as e: if e.errno == 2: # 'No such file or directory' @@ -301,7 +303,7 @@ def start_extension_command(self, extension_name, command, cmd_name, timeout, sh # Wait for process completion or timeout try: return handle_process_completion(process=process, command=command, timeout=timeout, stdout=stdout, - stderr=stderr, error_code=error_code) + stderr=stderr, error_code=error_code, cpu_cgroup=cpu_cgroup) except ExtensionError as e: # The extension didn't terminate successfully. Determine whether it was due to systemd errors or # extension errors. diff --git a/azurelinuxagent/common/cgroupconfigurator.py b/azurelinuxagent/common/cgroupconfigurator.py index 7dc0a80a99..308838a731 100644 --- a/azurelinuxagent/common/cgroupconfigurator.py +++ b/azurelinuxagent/common/cgroupconfigurator.py @@ -14,6 +14,8 @@ # limitations under the License. # # Requires Python 2.6+ and Openssl 1.0+ +import glob +import json import os import re import subprocess @@ -358,12 +360,16 @@ def __setup_azure_slice(): CGroupConfigurator._Impl.__cleanup_unit_file(unit_file) return - # reload the systemd configuration; the new slices will be used once the agent's service restarts - try: - logger.info("Executing systemctl daemon-reload...") - shellutil.run_command(["systemctl", "daemon-reload"]) - except Exception as exception: - _log_cgroup_warning("daemon-reload failed (create azure slice): {0}", ustr(exception)) + CGroupConfigurator._Impl.__reload_systemd_config() + + @staticmethod + def __reload_systemd_config(): + # reload the systemd configuration; the new slices will be used once the agent's service restarts + try: + logger.info("Executing systemctl daemon-reload...") + shellutil.run_command(["systemctl", "daemon-reload"]) + except Exception as exception: + _log_cgroup_warning("daemon-reload failed (create azure slice): {0}", ustr(exception)) @staticmethod def __create_unit_file(path, contents): @@ -479,18 +485,23 @@ def enable(self): self.__set_cpu_quota(conf.get_agent_cpu_quota()) def disable(self, reason, disable_cgroups): - # Todo: disable/reset extension when ext quotas introduced if disable_cgroups == DisableCgroups.ALL: # disable all - self._agent_cgroups_enabled = False - self._extensions_cgroups_enabled = False + # Reset quotas self.__reset_agent_cpu_quota() + extension_services = self.get_extension_services_list() + for extension in extension_services: + logger.info("Resetting extension : {0} and it's services: {1} CPUQuota".format(extension, extension_services[extension])) + self.__reset_extension_cpu_quota(extension_name=extension) + self.__reset_extension_services_cpu_quota(extension_services[extension]) + self.__reload_systemd_config() + CGroupsTelemetry.reset() + self._agent_cgroups_enabled = False + self._extensions_cgroups_enabled = False elif disable_cgroups == DisableCgroups.AGENT: # disable agent self._agent_cgroups_enabled = False self.__reset_agent_cpu_quota() CGroupsTelemetry.stop_tracking(CpuCgroup(AGENT_NAME_TELEMETRY, self._agent_cpu_cgroup_path)) - elif disable_cgroups == DisableCgroups.EXTENSIONS: # disable extensions - self._extensions_cgroups_enabled = False message = "[CGW] Disabling resource usage monitoring. Reason: {0}".format(reason) logger.info(message) # log as INFO for now, in the future it should be logged as WARNING @@ -519,7 +530,6 @@ def __reset_agent_cpu_quota(): """ logger.info("Resetting agent's CPUQuota") if CGroupConfigurator._Impl.__try_set_cpu_quota(''): # setting an empty value resets to the default (infinity) - CGroupsTelemetry.set_track_throttled_time(False) _log_cgroup_info('CPUQuota: {0}', systemd.get_unit_property(systemd.get_agent_unit_name(), "CPUQuotaPerSecUSec")) @staticmethod @@ -763,6 +773,16 @@ def start_extension_command(self, extension_name, command, cmd_name, timeout, sh process = subprocess.Popen(command, shell=shell, cwd=cwd, env=env, stdout=stdout, stderr=stderr, preexec_fn=os.setsid) # pylint: disable=W1509 return handle_process_completion(process=process, command=command, timeout=timeout, stdout=stdout, stderr=stderr, error_code=error_code) + def __reset_extension_cpu_quota(self, extension_name): + """ + Removes any CPUQuota on the extension + + NOTE: This resets the quota on the extension's slice; any local overrides on the VM will take precedence + over this setting. + """ + if self.enabled(): + self.setup_extension_slice(extension_name, cpu_quota=None) + def setup_extension_slice(self, extension_name, cpu_quota): """ Each extension runs under its own slice (Ex "Microsoft.CPlat.Extension.slice"). All the slices for @@ -777,11 +797,11 @@ def setup_extension_slice(self, extension_name, cpu_quota): extension_slice_path = os.path.join(unit_file_install_path, SystemdCgroupsApi.get_extension_slice_name(extension_name)) try: - cpu_quota = str(cpu_quota) + "%" if cpu_quota is not None else "" + cpu_quota = str(cpu_quota) + "%" if cpu_quota is not None else "" # setting an empty value resets to the default (infinity) slice_contents = _EXTENSION_SLICE_CONTENTS.format(extension_name=extension_name, cpu_quota=cpu_quota) CGroupConfigurator._Impl.__create_unit_file(extension_slice_path, slice_contents) except Exception as exception: - _log_cgroup_warning("Failed to create unit files for the extension slice: {0}", ustr(exception)) + _log_cgroup_warning("Failed to set the extension {0} slice and quotas: {1}", extension_name, ustr(exception)) CGroupConfigurator._Impl.__cleanup_unit_file(extension_slice_path) def remove_extension_slice(self, extension_name): @@ -823,13 +843,35 @@ def set_extension_services_cpu_memory_quota(self, services_list): files_to_create.append((drop_in_file_cpu_quota, cpu_quota_contents)) self.__create_all_files(files_to_create) + self.__reload_systemd_config() + + def __reset_extension_services_cpu_quota(self, services_list): + """ + Removes any CPUQuota on the extension service - # reload the systemd configuration; the new unit will be used once the service restarts + NOTE: This resets the quota on the extension service's default dropin file; any local overrides on the VM will take precedence + over this setting. + """ + if self.enabled() and services_list is not None: try: - logger.info("Executing systemctl daemon-reload...") - shellutil.run_command(["systemctl", "daemon-reload"]) + service_name = None + for service in services_list: + service_name = service.get('name', None) + unit_file_path = systemd.get_unit_file_install_path() + if service_name is not None and unit_file_path is not None: + files_to_create = [] + drop_in_path = os.path.join(unit_file_path, "{0}.d".format(service_name)) + cpu_quota = "" # setting an empty value resets to the default (infinity) + drop_in_file_cpu_quota = os.path.join(drop_in_path, _DROP_IN_FILE_CPU_QUOTA) + cpu_quota_contents = _DROP_IN_FILE_CPU_QUOTA_CONTENTS_FORMAT.format(cpu_quota) + if os.path.exists(drop_in_file_cpu_quota): + with open(drop_in_file_cpu_quota, "r") as file_: + if file_.read() == cpu_quota_contents: + return + files_to_create.append((drop_in_file_cpu_quota, cpu_quota_contents)) + self.__create_all_files(files_to_create) except Exception as exception: - _log_cgroup_warning("daemon-reload failed (create service unit files): {0}", ustr(exception)) + _log_cgroup_warning('Failed to reset CPUQuota for {0} : {1}', service_name, ustr(exception)) def remove_extension_services_drop_in_files(self, services_list): """ @@ -873,6 +915,31 @@ def start_tracking_extension_services_cgroups(self, services_list): if service_name is not None: self.start_tracking_unit_cgroups(service_name) + @staticmethod + def get_extension_services_list(): + """ + ResourceLimits for extensions are coming from /HandlerManifest.json file. + Use this pattern to determine all the installed extension HandlerManifest files and + read the extension services if ResourceLimits are present. + """ + extensions_services = {} + for manifest_path in glob.iglob(os.path.join(conf.get_lib_dir(), "*/HandlerManifest.json")): + match = re.search("(?P[\\w+\\.-]+).HandlerManifest\\.json", manifest_path) + if match is not None: + extensions_name = match.group('extname') + if not extensions_name.startswith('WALinuxAgent'): + try: + data = json.loads(fileutil.read_file(manifest_path)) + resource_limits = data[0].get('resourceLimits', None) + services = resource_limits.get('services') if resource_limits else None + extensions_services[extensions_name] = services + except (IOError, OSError) as e: + _log_cgroup_warning( + 'Failed to load manifest file ({0}): {1}'.format(manifest_path, e.strerror)) + except ValueError: + _log_cgroup_warning('Malformed manifest file ({0}).'.format(manifest_path)) + return extensions_services + # unique instance for the singleton _instance = None diff --git a/azurelinuxagent/common/utils/extensionprocessutil.py b/azurelinuxagent/common/utils/extensionprocessutil.py index dda83fda29..9038f6145c 100644 --- a/azurelinuxagent/common/utils/extensionprocessutil.py +++ b/azurelinuxagent/common/utils/extensionprocessutil.py @@ -21,13 +21,14 @@ import signal import time +from azurelinuxagent.common import logger from azurelinuxagent.common.exception import ExtensionErrorCodes, ExtensionOperationError, ExtensionError from azurelinuxagent.common.future import ustr TELEMETRY_MESSAGE_MAX_LEN = 3200 -def wait_for_process_completion_or_timeout(process, timeout): +def wait_for_process_completion_or_timeout(process, timeout, cpu_cgroup): """ Utility function that waits for the process to complete within the given time frame. This function will terminate the process if when the given time frame elapses. @@ -40,18 +41,20 @@ def wait_for_process_completion_or_timeout(process, timeout): timeout -= 1 return_code = None + throttled_time = 0 if timeout == 0: + throttled_time = get_cpu_throttled_time(cpu_cgroup) os.killpg(os.getpgid(process.pid), signal.SIGKILL) else: # process completed or forked; sleep 1 sec to give the child process (if any) a chance to start time.sleep(1) return_code = process.wait() - return timeout == 0, return_code + return timeout == 0, return_code, throttled_time -def handle_process_completion(process, command, timeout, stdout, stderr, error_code): +def handle_process_completion(process, command, timeout, stdout, stderr, error_code, cpu_cgroup=None): """ Utility function that waits for process completion and retrieves its output (stdout and stderr) if it completed before the timeout period. Otherwise, the process will get killed and an ExtensionError will be raised. @@ -62,13 +65,18 @@ def handle_process_completion(process, command, timeout, stdout, stderr, error_c :param stdout: Must be a file since we seek on it when parsing the subprocess output :param stderr: Must be a file since we seek on it when parsing the subprocess outputs :param error_code: The error code to set if we raise an ExtensionError + :param cpu_cgroup: Reference the cpu cgroup name and path :return: """ # Wait for process completion or timeout - timed_out, return_code = wait_for_process_completion_or_timeout(process, timeout) + timed_out, return_code, throttled_time = wait_for_process_completion_or_timeout(process, timeout, cpu_cgroup) process_output = read_output(stdout, stderr) if timed_out: + if cpu_cgroup is not None:# Report CPUThrottledTime when timeout happens + raise ExtensionError("Timeout({0});CPUThrottledTime({1}secs): {2}\n{3}".format(timeout, throttled_time, command, process_output), + code=ExtensionErrorCodes.PluginHandlerScriptTimedout) + raise ExtensionError("Timeout({0}): {1}\n{2}".format(timeout, command, process_output), code=ExtensionErrorCodes.PluginHandlerScriptTimedout) @@ -141,3 +149,17 @@ def to_s(captured_stdout, stdout_offset, captured_stderr, stderr_offset): return to_s(stdout, -1*stdout_len, stderr, 0) else: return to_s(stdout, -1*max_len_each, stderr, -1*max_len_each) + + +def get_cpu_throttled_time(cpu_cgroup): + """ + return the throttled time for the given cgroup. + """ + throttled_time = 0 + if cpu_cgroup is not None: + try: + throttled_time = cpu_cgroup.get_cpu_throttled_time(read_previous_throttled_time=False) + except Exception as e: + logger.warn("Failed to get cpu throttled time for the extension: {0}", ustr(e)) + + return throttled_time diff --git a/tests/common/test_cgroupconfigurator.py b/tests/common/test_cgroupconfigurator.py index 6a945c5e18..e2f06a67ac 100644 --- a/tests/common/test_cgroupconfigurator.py +++ b/tests/common/test_cgroupconfigurator.py @@ -254,7 +254,7 @@ def test_enable_should_not_track_throttled_time_when_setting_the_cpu_quota_fails self.assertFalse(CGroupsTelemetry.get_track_throttled_time(), "Throttle time should not be tracked") - def test_disable_should_reset_cpu_quota_and_tracked_cgroups(self): + def test_disable_should_reset_cpu_quota(self): with self._get_cgroup_configurator() as configurator: if len(CGroupsTelemetry._tracked) == 0: raise Exception("Test setup should have started tracking at least 1 cgroup (the agent's)") @@ -269,7 +269,55 @@ def test_disable_should_reset_cpu_quota_and_tracked_cgroups(self): fileutil.findre_in_file(agent_drop_in_file_cpu_quota, "^CPUQuota=$"), "CPUQuota was not set correctly. Expected an empty value. Got:\n{0}".format(fileutil.read_file(agent_drop_in_file_cpu_quota))) self.assertEqual(len(CGroupsTelemetry._tracked), 0, "No cgroups should be tracked after disable. Tracking: {0}".format(CGroupsTelemetry._tracked)) - self.assertFalse(CGroupsTelemetry._track_throttled_time, "Throttle Time should not be tracked after disable") + + def test_disable_should_reset_cpu_quota_for_all_cgroups(self): + service_list = [ + { + "name": "extension.service", + "cpuQuotaPercentage": 5 + } + ] + extension_name = "Microsoft.CPlat.Extension" + extension_services = {extension_name: service_list} + with self._get_cgroup_configurator() as configurator: + with patch.object(configurator, "get_extension_services_list", return_value=extension_services): + # get the paths to the mocked files + agent_drop_in_file_cpu_quota = configurator.mocks.get_mapped_path(UnitFilePaths.cpu_quota) + extension_slice_unit_file = configurator.mocks.get_mapped_path(UnitFilePaths.extensionslice) + extension_service_cpu_quota = configurator.mocks.get_mapped_path(UnitFilePaths.extension_service_cpu_quota) + + configurator.setup_extension_slice(extension_name=extension_name, cpu_quota=5) + configurator.set_extension_services_cpu_memory_quota(service_list) + CGroupsTelemetry._tracked['/sys/fs/cgroup/cpu,cpuacct/system.slice/extension.service'] = \ + CpuCgroup('extension.service', '/sys/fs/cgroup/cpu,cpuacct/system.slice/extension.service') + CGroupsTelemetry._tracked['/sys/fs/cgroup/cpu,cpuacct/azure.slice/azure-vmextensions.slice/' \ + 'azure-vmextensions-Microsoft.CPlat.Extension.slice'] = \ + CpuCgroup('Microsoft.CPlat.Extension', + '/sys/fs/cgroup/cpu,cpuacct/azure.slice/azure-vmextensions.slice/azure-vmextensions-Microsoft.CPlat.Extension.slice') + + configurator.disable("UNIT TEST", DisableCgroups.ALL) + + self.assertTrue(os.path.exists(agent_drop_in_file_cpu_quota), + "{0} was not created".format(agent_drop_in_file_cpu_quota)) + self.assertTrue( + fileutil.findre_in_file(agent_drop_in_file_cpu_quota, "^CPUQuota=$"), + "CPUQuota was not set correctly. Expected an empty value. Got:\n{0}".format( + fileutil.read_file(agent_drop_in_file_cpu_quota))) + self.assertTrue(os.path.exists(extension_slice_unit_file), + "{0} was not created".format(extension_slice_unit_file)) + self.assertTrue( + fileutil.findre_in_file(extension_slice_unit_file, "^CPUQuota=$"), + "CPUQuota was not set correctly. Expected an empty value. Got:\n{0}".format( + fileutil.read_file(extension_slice_unit_file))) + self.assertTrue(os.path.exists(extension_service_cpu_quota), + "{0} was not created".format(extension_service_cpu_quota)) + self.assertTrue( + fileutil.findre_in_file(extension_service_cpu_quota, "^CPUQuota=$"), + "CPUQuota was not set correctly. Expected an empty value. Got:\n{0}".format( + fileutil.read_file(extension_service_cpu_quota))) + self.assertEqual(len(CGroupsTelemetry._tracked), 0, + "No cgroups should be tracked after disable. Tracking: {0}".format( + CGroupsTelemetry._tracked)) @patch('time.sleep', side_effect=lambda _: mock_sleep()) def test_start_extension_command_should_not_use_systemd_when_cgroups_are_not_enabled(self, _): @@ -529,7 +577,7 @@ def test_start_extension_command_should_not_use_fallback_option_if_extension_tim with tempfile.TemporaryFile(dir=self.tmp_dir, mode="w+b") as stdout: with tempfile.TemporaryFile(dir=self.tmp_dir, mode="w+b") as stderr: with patch("azurelinuxagent.common.utils.extensionprocessutil.wait_for_process_completion_or_timeout", - return_value=[True, None]): + return_value=[True, None, 0]): with patch("azurelinuxagent.common.cgroupapi.SystemdCgroupsApi._is_systemd_failure", return_value=False): with self.assertRaises(ExtensionError) as context_manager: diff --git a/tests/common/test_cgroups.py b/tests/common/test_cgroups.py index 1f994b8593..61c3178546 100644 --- a/tests/common/test_cgroups.py +++ b/tests/common/test_cgroups.py @@ -183,7 +183,7 @@ def test_get_throttled_time_should_return_the_value_since_its_last_invocation(se cgroup.initialize_cpu_usage() shutil.copyfile(os.path.join(data_dir, "cgroups", "cpu.stat_t1"), test_file) # throttled_time = 2075541442327 - throttled_time = cgroup.get_throttled_time() + throttled_time = cgroup.get_cpu_throttled_time() self.assertEqual(throttled_time, float(2075541442327 - 50) / 1E9, "The value of throttled_time is incorrect") diff --git a/tests/common/test_cgroupstelemetry.py b/tests/common/test_cgroupstelemetry.py index 9d96a1d825..dc612244d8 100644 --- a/tests/common/test_cgroupstelemetry.py +++ b/tests/common/test_cgroupstelemetry.py @@ -375,7 +375,7 @@ def test_extension_telemetry_not_sent_for_empty_perf_metrics(self, *args): # py self.assertEqual(0, len(metrics)) @patch("azurelinuxagent.common.cgroup.CpuCgroup.get_cpu_usage") - @patch("azurelinuxagent.common.cgroup.CpuCgroup.get_throttled_time") + @patch("azurelinuxagent.common.cgroup.CpuCgroup.get_cpu_throttled_time") @patch("azurelinuxagent.common.cgroup.CGroup.is_active") def test_cgroup_telemetry_should_not_report_cpu_negative_value(self, patch_is_active, path_get_throttled_time, patch_get_cpu_usage): diff --git a/tests/utils/test_extension_process_util.py b/tests/utils/test_extension_process_util.py index e950338fa6..a74c4ff733 100644 --- a/tests/utils/test_extension_process_util.py +++ b/tests/utils/test_extension_process_util.py @@ -15,14 +15,16 @@ # Requires Python 2.6+ and Openssl 1.0+ # import os +import shutil import subprocess import tempfile +from azurelinuxagent.common.cgroup import CpuCgroup from azurelinuxagent.common.exception import ExtensionError, ExtensionErrorCodes from azurelinuxagent.common.future import ustr from azurelinuxagent.common.utils.extensionprocessutil import format_stdout_stderr, read_output, \ wait_for_process_completion_or_timeout, handle_process_completion -from tests.tools import AgentTestCase, patch +from tests.tools import AgentTestCase, patch, data_dir class TestProcessUtils(AgentTestCase): @@ -50,7 +52,7 @@ def test_wait_for_process_completion_or_timeout_should_terminate_cleanly(self): stdout=subprocess.PIPE, stderr=subprocess.PIPE) - timed_out, ret = wait_for_process_completion_or_timeout(process=process, timeout=5) + timed_out, ret, _ = wait_for_process_completion_or_timeout(process=process, timeout=5, cpu_cgroup=None) self.assertEqual(timed_out, False) self.assertEqual(ret, 0) @@ -68,7 +70,7 @@ def test_wait_for_process_completion_or_timeout_should_kill_process_on_timeout(s # We don't actually mock the kill, just wrap it so we can assert its call count with patch('azurelinuxagent.common.utils.extensionprocessutil.os.killpg', wraps=os.killpg) as patch_kill: with patch('time.sleep') as mock_sleep: - timed_out, ret = wait_for_process_completion_or_timeout(process=process, timeout=timeout) + timed_out, ret, _ = wait_for_process_completion_or_timeout(process=process, timeout=timeout, cpu_cgroup=None) # We're mocking sleep to avoid prolonging the test execution time, but we still want to make sure # we're "waiting" the correct amount of time before killing the process @@ -87,7 +89,7 @@ def test_handle_process_completion_should_return_nonzero_when_process_fails(self stdout=subprocess.PIPE, stderr=subprocess.PIPE) - timed_out, ret = wait_for_process_completion_or_timeout(process=process, timeout=5) + timed_out, ret, _ = wait_for_process_completion_or_timeout(process=process, timeout=5, cpu_cgroup=None) self.assertEqual(timed_out, False) self.assertEqual(ret, 2) @@ -143,6 +145,46 @@ def test_handle_process_completion_should_raise_on_timeout(self): self.assertEqual(context_manager.exception.code, ExtensionErrorCodes.PluginHandlerScriptTimedout) self.assertIn("Timeout({0})".format(timeout), ustr(context_manager.exception)) + self.assertNotIn("CPUThrottledTime({0}secs)".format(timeout), ustr(context_manager.exception)) #Extension not started in cpuCgroup + + + def test_handle_process_completion_should_log_throttled_time_on_timeout(self): + command = "sleep 1m" + timeout = 20 + with tempfile.TemporaryFile(dir=self.tmp_dir, mode="w+b") as stdout: + with tempfile.TemporaryFile(dir=self.tmp_dir, mode="w+b") as stderr: + with patch('time.sleep') as mock_sleep: + with self.assertRaises(ExtensionError) as context_manager: + test_file = os.path.join(self.tmp_dir, "cpu.stat") + shutil.copyfile(os.path.join(data_dir, "cgroups", "cpu.stat_t0"), + test_file) # throttled_time = 50 + cgroup = CpuCgroup("test", self.tmp_dir) + process = subprocess.Popen(command, # pylint: disable=subprocess-popen-preexec-fn + shell=True, + cwd=self.tmp_dir, + env={}, + stdout=stdout, + stderr=stderr, + preexec_fn=os.setsid) + + handle_process_completion(process=process, + command=command, + timeout=timeout, + stdout=stdout, + stderr=stderr, + error_code=42, + cpu_cgroup=cgroup) + + # We're mocking sleep to avoid prolonging the test execution time, but we still want to make sure + # we're "waiting" the correct amount of time before killing the process and raising an exception + # Due to an extra call to sleep at some point in the call stack which only happens sometimes, + # we are relaxing this assertion to allow +/- 2 sleep calls. + self.assertTrue(abs(mock_sleep.call_count - timeout) <= 2) + + self.assertEqual(context_manager.exception.code, ExtensionErrorCodes.PluginHandlerScriptTimedout) + self.assertIn("Timeout({0})".format(timeout), ustr(context_manager.exception)) + throttled_time = float(50 / 1E9) + self.assertIn("CPUThrottledTime({0}secs)".format(throttled_time), ustr(context_manager.exception)) def test_handle_process_completion_should_raise_on_nonzero_exit_code(self): command = "ls folder_does_not_exist" From b8ca4323d91fd2fac54fbcb7aab0f6988b4cd0e5 Mon Sep 17 00:00:00 2001 From: Nageswara Nandigam <84482346+nagworld9@users.noreply.github.com> Date: Tue, 17 May 2022 14:27:46 -0700 Subject: [PATCH 13/13] fix network interface restart in RHEL9 (#2592) --- azurelinuxagent/common/osutil/factory.py | 5 ++++- azurelinuxagent/common/osutil/redhat.py | 22 ++++++++++++++++++++++ 2 files changed, 26 insertions(+), 1 deletion(-) diff --git a/azurelinuxagent/common/osutil/factory.py b/azurelinuxagent/common/osutil/factory.py index 2ed4be78ba..61c2e6d20d 100644 --- a/azurelinuxagent/common/osutil/factory.py +++ b/azurelinuxagent/common/osutil/factory.py @@ -34,7 +34,7 @@ from .nsbsd import NSBSDOSUtil from .openbsd import OpenBSDOSUtil from .openwrt import OpenWRTOSUtil -from .redhat import RedhatOSUtil, Redhat6xOSUtil +from .redhat import RedhatOSUtil, Redhat6xOSUtil, RedhatOSModernUtil from .suse import SUSEOSUtil, SUSE11OSUtil from .photonos import PhotonOSUtil from .ubuntu import UbuntuOSUtil, Ubuntu12OSUtil, Ubuntu14OSUtil, \ @@ -107,6 +107,9 @@ def _get_osutil(distro_name, distro_code_name, distro_version, distro_full_name) if Version(distro_version) < Version("7"): return Redhat6xOSUtil() + if Version(distro_version) == Version("8.6") or Version(distro_version) > Version("9"): + return RedhatOSModernUtil() + return RedhatOSUtil() if distro_name == "euleros": diff --git a/azurelinuxagent/common/osutil/redhat.py b/azurelinuxagent/common/osutil/redhat.py index 9759d1136b..312dd16084 100644 --- a/azurelinuxagent/common/osutil/redhat.py +++ b/azurelinuxagent/common/osutil/redhat.py @@ -142,3 +142,25 @@ def get_dhcp_lease_endpoint(self): endpoint = self.get_endpoint_from_leases_path('/var/lib/NetworkManager/dhclient-*.lease') return endpoint + + +class RedhatOSModernUtil(RedhatOSUtil): + def __init__(self): # pylint: disable=W0235 + super(RedhatOSModernUtil, self).__init__() + + def restart_if(self, ifname, retries=3, wait=5): + """ + Restart an interface by bouncing the link. systemd-networkd observes + this event, and forces a renew of DHCP. + """ + retry_limit = retries + 1 + for attempt in range(1, retry_limit): + return_code = shellutil.run("ip link set {0} down && ip link set {0} up".format(ifname)) + if return_code == 0: + return + logger.warn("failed to restart {0}: return code {1}".format(ifname, return_code)) + if attempt < retry_limit: + logger.info("retrying in {0} seconds".format(wait)) + time.sleep(wait) + else: + logger.warn("exceeded restart retries")