Skip to content

Commit

Permalink
Merge branch 'develop' into newflag-readme
Browse files Browse the repository at this point in the history
  • Loading branch information
nagworld9 authored Feb 5, 2024
2 parents 19bfb7e + 20f0670 commit 69096a4
Show file tree
Hide file tree
Showing 31 changed files with 602 additions and 149 deletions.
24 changes: 24 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -261,6 +261,30 @@ without the agent. In order to do that, the `provisionVMAgent` flag must be set
provisioning time, via whichever API is being used. We will provide more details on
this on our wiki when it is generally available.

#### __Extensions.WaitForCloudInit__

_Type: Boolean_
_Default: n_

Waits for cloud-init to complete (cloud-init status --wait) before executing VM extensions.

Both cloud-init and VM extensions are common ways to customize a VM during initial deployment. By
default, the agent will start executing extensions while cloud-init may still be in the 'config'
stage and won't wait for the 'final' stage to complete. Cloud-init and extensions may execute operations
that conflict with each other (for example, both of them may try to install packages). Setting this option
to 'y' ensures that VM extensions are executed only after cloud-init has completed all its stages.

Note that using this option requires creating a custom image with the value of this option set to 'y', in
order to ensure that the wait is performed during the initial deployment of the VM.

#### __Extensions.WaitForCloudInitTimeout__

_Type: Integer_
_Default: 3600_

Timeout in seconds for the Agent to wait on cloud-init. If the timeout elapses, the Agent will continue
executing VM extensions. See Extensions.WaitForCloudInit for more details.

#### __Extensions.GoalStatePeriod__

_Type: Integer_
Expand Down
10 changes: 10 additions & 0 deletions azurelinuxagent/common/conf.py
Original file line number Diff line number Diff line change
Expand Up @@ -117,6 +117,7 @@ def load_conf_from_file(conf_file_path, conf=__conf__):
"Logs.Console": True,
"Logs.Collect": True,
"Extensions.Enabled": True,
"Extensions.WaitForCloudInit": False,
"Provisioning.AllowResetSysUser": False,
"Provisioning.RegenerateSshHostKeyPair": False,
"Provisioning.DeleteRootPassword": False,
Expand Down Expand Up @@ -170,6 +171,7 @@ def load_conf_from_file(conf_file_path, conf=__conf__):
__INTEGER_OPTIONS__ = {
"Extensions.GoalStatePeriod": 6,
"Extensions.InitialGoalStatePeriod": 6,
"Extensions.WaitForCloudInitTimeout": 3600,
"OS.EnableFirewallPeriod": 300,
"OS.RemovePersistentNetRulesPeriod": 30,
"OS.RootDeviceScsiTimeoutPeriod": 30,
Expand Down Expand Up @@ -372,6 +374,14 @@ def get_extensions_enabled(conf=__conf__):
return conf.get_switch("Extensions.Enabled", True)


def get_wait_for_cloud_init(conf=__conf__):
    """
    Reads the boolean option "Extensions.WaitForCloudInit" from the given configuration,
    passing False as the default value.
    """
    wait_for_cloud_init = conf.get_switch("Extensions.WaitForCloudInit", False)
    return wait_for_cloud_init


def get_wait_for_cloud_init_timeout(conf=__conf__):
    """
    Reads the option "Extensions.WaitForCloudInitTimeout" (a timeout, in seconds) from the given
    configuration, passing 3600 as the default value.

    The option is an integer (it is declared in __INTEGER_OPTIONS__), so it must be read with
    get_int; get_switch is the accessor used for the boolean options.
    """
    return conf.get_int("Extensions.WaitForCloudInitTimeout", 3600)


def get_goal_state_period(conf=__conf__):
    """
    Reads the integer option "Extensions.GoalStatePeriod" from the given configuration,
    passing 6 as the default value.
    """
    goal_state_period = conf.get_int("Extensions.GoalStatePeriod", 6)
    return goal_state_period

Expand Down
1 change: 1 addition & 0 deletions azurelinuxagent/common/event.py
Original file line number Diff line number Diff line change
Expand Up @@ -75,6 +75,7 @@ class WALAEventOperation:
CGroupsCleanUp = "CGroupsCleanUp"
CGroupsDisabled = "CGroupsDisabled"
CGroupsInfo = "CGroupsInfo"
CloudInit = "CloudInit"
CollectEventErrors = "CollectEventErrors"
CollectEventUnicodeErrors = "CollectEventUnicodeErrors"
ConfigurationChange = "ConfigurationChange"
Expand Down
9 changes: 9 additions & 0 deletions azurelinuxagent/common/exception.py
Original file line number Diff line number Diff line change
Expand Up @@ -84,6 +84,15 @@ def __init__(self, msg=None, inner=None):
super(AgentUpdateError, self).__init__(msg, inner)


class AgentFamilyMissingError(AgentError):
    """
    Error raised when the agent family is missing.
    """

    def __init__(self, msg=None, inner=None):
        # The explicit two-argument form of super() is kept for Python 2 compatibility.
        super(AgentFamilyMissingError, self).__init__(msg, inner)


class CGroupsException(AgentError):
"""
Exception to classify any cgroups related issue.
Expand Down
7 changes: 7 additions & 0 deletions azurelinuxagent/common/osutil/redhat.py
Original file line number Diff line number Diff line change
Expand Up @@ -192,3 +192,10 @@ def restart_if(self, ifname, retries=3, wait=5):
time.sleep(wait)
else:
logger.warn("exceeded restart retries")

def publish_hostname(self, hostname):
    """
    Restarts the NetworkManager service and then delegates to DefaultOSUtil.publish_hostname.
    """
    # RedhatOSUtil now restarts NetworkManager only conditionally, because that restart raced with the
    # agent restarting the network interface during publish_hostname. The race was not reproduced on the
    # versions handled by RedhatOSModernUtil, so the unconditional restart is kept here.
    shellutil.run("service NetworkManager restart")
    DefaultOSUtil.publish_hostname(self, hostname)
39 changes: 36 additions & 3 deletions azurelinuxagent/common/utils/shellutil.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,9 +18,17 @@
#
import os
import subprocess
import sys
import tempfile
import threading

if sys.version_info[0] != 2:
    from subprocess import TimeoutExpired
else:
    # subprocess.TimeoutExpired exists only on Python 3; on Python 2 a placeholder class is
    # defined so that code referencing the name still works
    class TimeoutExpired(Exception):
        pass

import azurelinuxagent.common.logger as logger
from azurelinuxagent.common.future import ustr

Expand Down Expand Up @@ -206,7 +214,7 @@ def __run_command(command_action, command, log_error, encode_output):


# W0622: Redefining built-in 'input' -- disabled: the parameter name mimics subprocess.communicate()
def run_command(command, input=None, stdin=None, stdout=subprocess.PIPE, stderr=subprocess.PIPE, log_error=False, encode_input=True, encode_output=True, track_process=True): # pylint:disable=W0622
def run_command(command, input=None, stdin=None, stdout=subprocess.PIPE, stderr=subprocess.PIPE, log_error=False, encode_input=True, encode_output=True, track_process=True, timeout=None): # pylint:disable=W0622
"""
Executes the given command and returns its stdout.
Expand All @@ -227,7 +235,9 @@ def run_command(command, input=None, stdin=None, stdout=subprocess.PIPE, stderr=
value for these parameters is anything other than the default (subprocess.PIPE)), then the corresponding
values returned by this function or the CommandError exception will be empty strings.
Note: This is the preferred method to execute shell commands over `azurelinuxagent.common.utils.shellutil.run` function.
NOTE: The 'timeout' parameter is ignored on Python 2
NOTE: This is the preferred method to execute shell commands over `azurelinuxagent.common.utils.shellutil.run` function.
"""
if input is not None and stdin is not None:
raise ValueError("The input and stdin arguments are mutually exclusive")
Expand All @@ -246,7 +256,30 @@ def command_action():
else:
process = subprocess.Popen(command, stdin=popen_stdin, stdout=stdout, stderr=stderr, shell=False)

command_stdout, command_stderr = process.communicate(input=communicate_input)
try:
if sys.version_info[0] == 2: # communicate() doesn't support timeout on Python 2
command_stdout, command_stderr = process.communicate(input=communicate_input)
else:
command_stdout, command_stderr = process.communicate(input=communicate_input, timeout=timeout)
except TimeoutExpired:
if log_error:
logger.error(u"Command [{0}] timed out", __format_command(command))

command_stdout, command_stderr = '', ''

try:
process.kill()
# try to get any output from the command, but ignore any errors if we can't
try:
command_stdout, command_stderr = process.communicate()
# W0702: No exception type(s) specified (bare-except)
except: # pylint: disable=W0702
pass
except Exception as exception:
if log_error:
logger.error(u"Can't terminate timed out process: {0}", ustr(exception))
raise CommandError(command=__format_command(command), return_code=-1, stdout=command_stdout, stderr="command timeout\n{0}".format(command_stderr))

if track_process:
_on_command_completed(process.pid)

Expand Down
52 changes: 33 additions & 19 deletions azurelinuxagent/ga/agent_update_handler.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,18 +15,16 @@
# limitations under the License.
#
# Requires Python 2.6+ and Openssl 1.0+
import datetime
import os

from azurelinuxagent.common import conf, logger
from azurelinuxagent.common.event import add_event, WALAEventOperation
from azurelinuxagent.common.exception import AgentUpgradeExitException, AgentUpdateError
from azurelinuxagent.common.exception import AgentUpgradeExitException, AgentUpdateError, AgentFamilyMissingError
from azurelinuxagent.common.future import ustr
from azurelinuxagent.common.protocol.restapi import VMAgentUpdateStatuses, VMAgentUpdateStatus, VERSION_0
from azurelinuxagent.common.utils import textutil
from azurelinuxagent.common.utils.flexible_version import FlexibleVersion
from azurelinuxagent.common.version import get_daemon_version
from azurelinuxagent.ga.ga_version_updater import RSMUpdates
from azurelinuxagent.ga.rsm_version_updater import RSMVersionUpdater
from azurelinuxagent.ga.self_update_version_updater import SelfUpdateVersionUpdater

Expand Down Expand Up @@ -67,7 +65,7 @@ def __init__(self, protocol):

# restore the state of rsm update. Default to self-update if last update is not with RSM.
if not self._get_is_last_update_with_rsm():
self._updater = SelfUpdateVersionUpdater(self._gs_id, datetime.datetime.min)
self._updater = SelfUpdateVersionUpdater(self._gs_id)
else:
self._updater = RSMVersionUpdater(self._gs_id, self._daemon_version)

Expand Down Expand Up @@ -117,7 +115,7 @@ def _get_agent_family_manifest(self, goal_state):
"""
Get the agent_family from last GS for the given family
Returns: first entry of Manifest
Exception if no manifests found in the last GS
Exception if no manifests found in the last GS and log it only on new goal state
"""
family = self._ga_family_type
agent_families = goal_state.extensions_goal_state.agent_families
Expand All @@ -130,11 +128,13 @@ def _get_agent_family_manifest(self, goal_state):
agent_family_manifests.append(m)

if not family_found:
raise AgentUpdateError(u"Agent family: {0} not found in the goal state: {1}, skipping agent update".format(family, self._gs_id))
raise AgentFamilyMissingError(u"Agent family: {0} not found in the goal state: {1}, skipping agent update \n"
u"[Note: This error is permanent for this goal state and Will not log same error until we receive new goal state]".format(family, self._gs_id))

if len(agent_family_manifests) == 0:
raise AgentUpdateError(
u"No manifest links found for agent family: {0} for goal state: {1}, skipping agent update".format(
raise AgentFamilyMissingError(
u"No manifest links found for agent family: {0} for goal state: {1}, skipping agent update \n"
u"[Note: This error is permanent for this goal state and will not log same error until we receive new goal state]".format(
family, self._gs_id))
return agent_family_manifests[0]

Expand All @@ -145,30 +145,38 @@ def run(self, goal_state, ext_gs_updated):
if not conf.get_autoupdate_enabled() or not conf.get_download_new_agents():
return

# verify if agent update is allowed this time (RSM checks new goal state; self-update checks manifest download interval)
if not self._updater.is_update_allowed_this_time(ext_gs_updated):
return
# Update the state only on new goal state
if ext_gs_updated:
self._gs_id = goal_state.extensions_goal_state.id
self._updater.sync_new_gs_id(self._gs_id)

self._gs_id = goal_state.extensions_goal_state.id
agent_family = self._get_agent_family_manifest(goal_state)

# updater will return RSM enabled or disabled if we need to switch to self-update or rsm update
updater_mode = self._updater.check_and_switch_updater_if_changed(agent_family, self._gs_id, ext_gs_updated)
# Updater will return True or False if we need to switch the updater
# If self-updater receives RSM update enabled, it will switch to RSM updater
# If RSM updater receives RSM update disabled, it will switch to self-update
# No change in updater if GS not updated
is_rsm_update_enabled = self._updater.is_rsm_update_enabled(agent_family, ext_gs_updated)

if updater_mode == RSMUpdates.Disabled:
if not is_rsm_update_enabled and isinstance(self._updater, RSMVersionUpdater):
msg = "VM not enabled for RSM updates, switching to self-update mode"
logger.info(msg)
add_event(op=WALAEventOperation.AgentUpgrade, message=msg, log_event=False)
self._updater = SelfUpdateVersionUpdater(self._gs_id, datetime.datetime.now())
self._updater = SelfUpdateVersionUpdater(self._gs_id)
self._remove_rsm_update_state()

if updater_mode == RSMUpdates.Enabled:
if is_rsm_update_enabled and isinstance(self._updater, SelfUpdateVersionUpdater):
msg = "VM enabled for RSM updates, switching to RSM update mode"
logger.info(msg)
add_event(op=WALAEventOperation.AgentUpgrade, message=msg, log_event=False)
self._updater = RSMVersionUpdater(self._gs_id, self._daemon_version)
self._save_rsm_update_state()

# If updater is changed in previous step, we allow update as it consider as first attempt. If not, it checks below condition
# RSM checks new goal state; self-update checks manifest download interval
if not self._updater.is_update_allowed_this_time(ext_gs_updated):
return

self._updater.retrieve_agent_version(agent_family, goal_state)

if not self._updater.is_retrieved_version_allowed_to_update(agent_family):
Expand All @@ -183,14 +191,20 @@ def run(self, goal_state, ext_gs_updated):
self._updater.proceed_with_update()

except Exception as err:
log_error = True
if isinstance(err, AgentUpgradeExitException):
raise err
elif isinstance(err, AgentUpdateError):
error_msg = ustr(err)
elif isinstance(err, AgentFamilyMissingError):
error_msg = ustr(err)
# Agent family missing error is permanent in the given goal state, so we don't want to log it on every iteration of main loop if there is no new goal state
log_error = ext_gs_updated
else:
error_msg = "Unable to update Agent: {0}".format(textutil.format_exception(err))
logger.warn(error_msg)
add_event(op=WALAEventOperation.AgentUpgrade, is_success=False, message=error_msg, log_event=False)
if log_error:
logger.warn(error_msg)
add_event(op=WALAEventOperation.AgentUpgrade, is_success=False, message=error_msg, log_event=False)
self._last_attempted_update_error_msg = error_msg

def get_vmagent_update_status(self):
Expand Down
25 changes: 11 additions & 14 deletions azurelinuxagent/ga/ga_version_updater.py
Original file line number Diff line number Diff line change
Expand Up @@ -30,14 +30,6 @@
from azurelinuxagent.ga.guestagent import GuestAgent


class RSMUpdates(object):
"""
Enum for switching between RSM updates and self updates
"""
Enabled = "Enabled"
Disabled = "Disabled"


class GAVersionUpdater(object):

def __init__(self, gs_id):
Expand All @@ -53,15 +45,13 @@ def is_update_allowed_this_time(self, ext_gs_updated):
"""
raise NotImplementedError

def check_and_switch_updater_if_changed(self, agent_family, gs_id, ext_gs_updated):
def is_rsm_update_enabled(self, agent_family, ext_gs_updated):
"""
checks and raise the updater exception if we need to switch to self-update from rsm update or vice versa
return True if we need to switch to RSM-update from self-update and vice versa.
@param agent_family: agent family
@param gs_id: incarnation of the goal state
@param ext_gs_updated: True if extension goal state updated else False
@return: RSMUpdates.Disabled: return when agent need to stop rsm updates and switch to self-update
RSMUpdates.Enabled: return when agent need to switch to rsm update
None: return when no need to switch
@return: False when agent need to stop rsm updates
True: when agent need to switch to rsm update
"""
raise NotImplementedError

Expand Down Expand Up @@ -107,6 +97,13 @@ def version(self):
"""
return self._version

def sync_new_gs_id(self, gs_id):
    """
    Stores the given goal state id in this updater, replacing the previous value.
    @param gs_id: goal state id
    """
    self._gs_id = gs_id

def download_and_get_new_agent(self, protocol, agent_family, goal_state):
"""
Function downloads the new agent and returns the downloaded version.
Expand Down
2 changes: 1 addition & 1 deletion azurelinuxagent/ga/logcollector_manifests.py
Original file line number Diff line number Diff line change
Expand Up @@ -122,6 +122,6 @@
diskinfo,
echo,### Gathering Guest ProxyAgent Log Files ###
copy,/var/log/proxyagent/*
copy,/var/log/azure-proxy-agent/*
echo,
"""
15 changes: 7 additions & 8 deletions azurelinuxagent/ga/rsm_version_updater.py
Original file line number Diff line number Diff line change
Expand Up @@ -24,7 +24,7 @@
from azurelinuxagent.common.exception import AgentUpgradeExitException, AgentUpdateError
from azurelinuxagent.common.utils.flexible_version import FlexibleVersion
from azurelinuxagent.common.version import CURRENT_VERSION, AGENT_NAME
from azurelinuxagent.ga.ga_version_updater import GAVersionUpdater, RSMUpdates
from azurelinuxagent.ga.ga_version_updater import GAVersionUpdater
from azurelinuxagent.ga.guestagent import GuestAgent


Expand All @@ -49,24 +49,23 @@ def is_update_allowed_this_time(self, ext_gs_updated):
"""
return ext_gs_updated

def check_and_switch_updater_if_changed(self, agent_family, gs_id, ext_gs_updated):
def is_rsm_update_enabled(self, agent_family, ext_gs_updated):
"""
Checks if there is a new goal state and decide if we need to continue with rsm update or switch to self-update.
Firstly it checks agent supports GA versioning or not. If not, we return rsm updates disabled to switch to self-update.
if vm is enabled for RSM updates and continue with rsm update, otherwise we return rsm updates disabled to switch to self-update.
Firstly it checks agent supports GA versioning or not. If not, we return false to switch to self-update.
if vm is enabled for RSM updates and continue with rsm update, otherwise we return false to switch to self-update.
if either isVersionFromRSM or isVMEnabledForRSMUpgrades or version is missing in the goal state, we ignore the update as we consider it as invalid goal state.
"""
if ext_gs_updated:
self._gs_id = gs_id
if not conf.get_enable_ga_versioning():
return RSMUpdates.Disabled
return False

if agent_family.is_vm_enabled_for_rsm_upgrades is None:
raise AgentUpdateError(
"Received invalid goal state:{0}, missing isVMEnabledForRSMUpgrades property. So, skipping agent update".format(
self._gs_id))
elif not agent_family.is_vm_enabled_for_rsm_upgrades:
return RSMUpdates.Disabled
return False
else:
if agent_family.is_version_from_rsm is None:
raise AgentUpdateError(
Expand All @@ -77,7 +76,7 @@ def check_and_switch_updater_if_changed(self, agent_family, gs_id, ext_gs_update
"Received invalid goal state:{0}, missing version property. So, skipping agent update".format(
self._gs_id))

return None
return True

def retrieve_agent_version(self, agent_family, goal_state):
"""
Expand Down
Loading

0 comments on commit 69096a4

Please sign in to comment.