Skip to content

Commit

Permalink
Merge branch 'develop' into disable-RSM
Browse files Browse the repository at this point in the history
  • Loading branch information
nagworld9 authored Feb 9, 2024
2 parents 62aa95a + 7521421 commit 12f1adb
Show file tree
Hide file tree
Showing 11 changed files with 277 additions and 206 deletions.
19 changes: 15 additions & 4 deletions azurelinuxagent/ga/agent_update_handler.py
Original file line number Diff line number Diff line change
Expand Up @@ -182,12 +182,23 @@ def run(self, goal_state, ext_gs_updated):
if not self._updater.is_retrieved_version_allowed_to_update(agent_family):
return
self._updater.log_new_agent_update_message()
self._updater.purge_extra_agents_from_disk()
agent = self._updater.download_and_get_new_agent(self._protocol, agent_family, goal_state)
if agent.is_blacklisted or not agent.is_downloaded:
msg = "Downloaded agent version is in bad state : {0} , skipping agent update".format(
str(agent.version))

# Below condition is to break the update loop if new agent is in bad state in previous attempts
# If the bad agent update already attempted 3 times, we don't want to continue with update anymore.
# Otherwise we allow the update by incrementing the update attempt count and clearing the bad state so the agent is treated as good
# [Note: As a result, it is breaking contract between RSM and agent, we may NOT honor the RSM retries for that version]
if agent.get_update_attempt_count() >= 3:
msg = "Attempted enough update retries for version: {0} but still agent not recovered from bad state. So, we stop updating to this version".format(str(agent.version))
raise AgentUpdateError(msg)
else:
agent.clear_error()
agent.inc_update_attempt_count()
msg = "Agent update attempt count: {0} for version: {1}".format(agent.get_update_attempt_count(), str(agent.version))
logger.info(msg)
add_event(op=WALAEventOperation.AgentUpgrade, message=msg, log_event=False)

self._updater.purge_extra_agents_from_disk()
self._updater.proceed_with_update()

except Exception as err:
Expand Down
47 changes: 38 additions & 9 deletions azurelinuxagent/ga/ga_version_updater.py
Original file line number Diff line number Diff line change
Expand Up @@ -26,8 +26,8 @@
from azurelinuxagent.common.protocol.extensions_goal_state import GoalStateSource
from azurelinuxagent.common.utils import fileutil
from azurelinuxagent.common.utils.flexible_version import FlexibleVersion
from azurelinuxagent.common.version import AGENT_NAME, AGENT_DIR_PATTERN
from azurelinuxagent.ga.guestagent import GuestAgent
from azurelinuxagent.common.version import AGENT_NAME, AGENT_DIR_PATTERN, CURRENT_VERSION
from azurelinuxagent.ga.guestagent import GuestAgent, AGENT_MANIFEST_FILE


class GAVersionUpdater(object):
Expand Down Expand Up @@ -77,12 +77,6 @@ def log_new_agent_update_message(self):
"""
raise NotImplementedError

def purge_extra_agents_from_disk(self):
    """
    Remove the extra agents from disk.
    This implementation is only a placeholder: it always raises NotImplementedError.
    """
    raise NotImplementedError

def proceed_with_update(self):
"""
performs upgrade/downgrade
Expand All @@ -104,6 +98,33 @@ def sync_new_gs_id(self, gs_id):
"""
self._gs_id = gs_id

@staticmethod
def download_new_agent_pkg(package_to_download, protocol, is_fast_track_goal_state):
    """
    Downloads the agent package for the requested version, unless it is already on disk, and then
    verifies that the agent handler manifest file is present.
    @param package_to_download: package to download (its 'version' and 'uris' are used)
    @param protocol: protocol object; its client performs the zip download
    @param is_fast_track_goal_state: True if goal state is fast track else False
    @raise AgentUpdateError: if the manifest file is missing after the download step
    """
    agent_name = "{0}-{1}".format(AGENT_NAME, package_to_download.version)
    agent_dir = os.path.join(conf.get_lib_dir(), agent_name)
    agent_pkg_path = os.path.join(conf.get_lib_dir(), agent_name) + ".zip"
    manifest_path = os.path.join(agent_dir, AGENT_MANIFEST_FILE)

    # The agent counts as already downloaded only when both its directory and its manifest exist
    already_downloaded = os.path.exists(agent_dir) and os.path.isfile(manifest_path)
    if already_downloaded:
        logger.info("Agent {0} was previously downloaded - skipping download", agent_name)
    else:
        protocol.client.download_zip_package("agent package", package_to_download.uris, agent_pkg_path, agent_dir, use_verify_header=is_fast_track_goal_state)

    if os.path.isfile(manifest_path):
        return

    # Without the manifest the package is unusable: remove the directory (best effort) and report the failure
    try:
        logger.info("Agent handler manifest file is missing, cleaning up the agent directory: {0}".format(agent_dir))
        if os.path.isdir(agent_dir):
            shutil.rmtree(agent_dir, ignore_errors=True)
    except Exception as cleanup_error:
        logger.warn("Unable to delete Agent directory: {0}".format(cleanup_error))
    raise AgentUpdateError("Downloaded agent package: {0} is missing agent handler manifest file: {1}".format(agent_name, manifest_path))

def download_and_get_new_agent(self, protocol, agent_family, goal_state):
"""
Function downloads the new agent and returns the downloaded version.
Expand All @@ -116,9 +137,17 @@ def download_and_get_new_agent(self, protocol, agent_family, goal_state):
self._agent_manifest = goal_state.fetch_agent_manifest(agent_family.name, agent_family.uris)
package_to_download = self._get_agent_package_to_download(self._agent_manifest, self._version)
is_fast_track_goal_state = goal_state.extensions_goal_state.source == GoalStateSource.FastTrack
agent = GuestAgent.from_agent_package(package_to_download, protocol, is_fast_track_goal_state)
self.download_new_agent_pkg(package_to_download, protocol, is_fast_track_goal_state)
agent = GuestAgent.from_agent_package(package_to_download)
return agent

def purge_extra_agents_from_disk(self):
    """
    Purge agents from disk, keeping only CURRENT_VERSION and the new agent version (self._version).
    """
    versions_to_keep = [CURRENT_VERSION, self._version]
    self._purge_unknown_agents_from_disk(versions_to_keep)

def _get_agent_package_to_download(self, agent_manifest, version):
"""
Returns the package of the given Version found in the manifest. If not found, returns exception
Expand Down
114 changes: 70 additions & 44 deletions azurelinuxagent/ga/guestagent.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,29 +10,26 @@
from azurelinuxagent.common import logger, conf
from azurelinuxagent.common.exception import UpdateError
from azurelinuxagent.common.utils.flexible_version import FlexibleVersion
from azurelinuxagent.common.version import AGENT_DIR_PATTERN, AGENT_NAME, CURRENT_VERSION
from azurelinuxagent.common.version import AGENT_DIR_PATTERN, AGENT_NAME
from azurelinuxagent.ga.exthandlers import HandlerManifest

AGENT_ERROR_FILE = "error.json" # File name for agent error record
AGENT_MANIFEST_FILE = "HandlerManifest.json"
MAX_FAILURE = 3 # Max failure allowed for agent before declare bad agent
AGENT_UPDATE_COUNT_FILE = "update_attempt.json" # File for tracking agent update attempt count


class GuestAgent(object):
def __init__(self, path, pkg, protocol, is_fast_track_goal_state):
def __init__(self, path, pkg):
"""
If 'path' is given, the object is initialized to the version installed under that path.
If 'pkg' is given, the version specified in the package information is downloaded and the object is
initialized to that version.
'is_fast_track_goal_state' and 'protocol' are used only when a package is downloaded.
NOTE: Prefer using the from_installed_agent and from_agent_package methods instead of calling __init__ directly
"""
self._is_fast_track_goal_state = is_fast_track_goal_state
self.pkg = pkg
self._protocol = protocol
version = None
if path is not None:
m = AGENT_DIR_PATTERN.match(path)
Expand All @@ -52,11 +49,13 @@ def __init__(self, path, pkg, protocol, is_fast_track_goal_state):
self.error = GuestAgentError(self.get_agent_error_file())
self.error.load()

self.update_attempt_data = GuestAgentUpdateAttempt(self.get_agent_update_count_file())
self.update_attempt_data.load()

try:
self._ensure_downloaded()
self._ensure_loaded()
except Exception as e:
# If we're unable to download/unpack the agent, delete the Agent directory
# If we're unable to unpack the agent, delete the Agent directory
try:
if os.path.isdir(self.get_agent_dir()):
shutil.rmtree(self.get_agent_dir(), ignore_errors=True)
Expand All @@ -77,14 +76,14 @@ def from_installed_agent(path):
"""
Creates an instance of GuestAgent using the agent installed in the given 'path'.
"""
return GuestAgent(path, None, None, False)
return GuestAgent(path, None)

@staticmethod
def from_agent_package(package, protocol, is_fast_track_goal_state):
def from_agent_package(package):
"""
Creates an instance of GuestAgent using the information provided in the 'package'; if that version of the agent is not installed it, it installs it.
"""
return GuestAgent(None, package, protocol, is_fast_track_goal_state)
return GuestAgent(None, package)

@property
def name(self):
Expand All @@ -99,6 +98,9 @@ def get_agent_dir(self):
def get_agent_error_file(self):
    """
    Returns the path of this agent's error record file, located under <lib dir>/<agent name>.
    """
    agent_home = os.path.join(conf.get_lib_dir(), self.name)
    return os.path.join(agent_home, AGENT_ERROR_FILE)

def get_agent_update_count_file(self):
    """
    Returns the path of this agent's update attempt count file, located under <lib dir>/<agent name>.
    """
    agent_home = os.path.join(conf.get_lib_dir(), self.name)
    return os.path.join(agent_home, AGENT_UPDATE_COUNT_FILE)

def get_agent_manifest_path(self):
    """
    Returns the path of the handler manifest file inside this agent's directory.
    """
    agent_dir = self.get_agent_dir()
    return os.path.join(agent_dir, AGENT_MANIFEST_FILE)

Expand Down Expand Up @@ -136,45 +138,20 @@ def mark_failure(self, is_fatal=False, reason=''):
except Exception as e:
logger.warn(u"Agent {0} failed recording error state: {1}", self.name, ustr(e))

def _ensure_downloaded(self):
logger.verbose(u"Ensuring Agent {0} is downloaded", self.name)

if self.is_downloaded:
logger.verbose(u"Agent {0} was previously downloaded - skipping download", self.name)
return

if self.pkg is None:
raise UpdateError(u"Agent {0} is missing package and download URIs".format(
self.name))

self._download()
def inc_update_attempt_count(self):
    """
    Increments the update attempt counter and saves it. Any failure is logged as a warning and
    not propagated to the caller.
    """
    try:
        self.update_attempt_data.inc_count()
        self.update_attempt_data.save()
    except Exception as error:
        logger.warn(u"Agent {0} failed recording update attempt: {1}", self.name, ustr(error))

msg = u"Agent {0} downloaded successfully".format(self.name)
logger.verbose(msg)
add_event(
AGENT_NAME,
version=self.version,
op=WALAEventOperation.Install,
is_success=True,
message=msg)
def get_update_attempt_count(self):
    """
    Returns the update attempt count currently held in self.update_attempt_data.
    """
    attempt_data = self.update_attempt_data
    return attempt_data.count

def _ensure_loaded(self):
self._load_manifest()
self._load_error()

def _download(self):
try:
self._protocol.client.download_zip_package("agent package", self.pkg.uris, self.get_agent_pkg_path(), self.get_agent_dir(), use_verify_header=self._is_fast_track_goal_state)
except Exception as exception:
msg = "Unable to download Agent {0}: {1}".format(self.name, ustr(exception))
add_event(
AGENT_NAME,
op=WALAEventOperation.Download,
version=CURRENT_VERSION,
is_success=False,
message=msg)
raise UpdateError(msg)

def _load_error(self):
try:
self.error = GuestAgentError(self.get_agent_error_file())
Expand Down Expand Up @@ -303,3 +280,52 @@ def __str__(self):
self.failure_count,
self.was_fatal,
self.reason)


class GuestAgentUpdateAttempt(object):
    """
    Tracks how many times an update to an agent version has been attempted. The count is persisted
    as a JSON document of the form {"count": <int>} in the file given by 'path'.
    """
    def __init__(self, path):
        """
        @param path: location of the update attempt file; must not be None
        @raise UpdateError: if 'path' is None
        """
        if path is None:
            raise UpdateError(u"GuestAgentUpdateAttempt requires a path")
        self.path = path
        self.count = 0

    def inc_count(self):
        """
        Increments the in-memory count; call save() to persist it.
        """
        self.count += 1

    def clear(self):
        """
        Resets the in-memory count to zero; call save() to persist it.
        """
        self.count = 0

    def load(self):
        """
        Loads the count from the update attempt file, if the file exists.
        If the file cannot be read or its content is invalid, the count is reset to zero and the file
        is deleted (best effort) so that the next attempt starts from a clean state.
        """
        if self.path is None or not os.path.isfile(self.path):
            return

        try:
            with open(self.path, 'r') as f:
                self.from_json(json.load(f))
        except Exception as error:
            # The update_attempt.json file is supposed to be written only by the agent.
            # If for whatever reason the file is malformed, reset the count and delete the file.
            self.clear()
            logger.warn(
                "Ran into error when trying to load update attempt file {0}, deleting it to clean state. Error: {1}".format(
                    self.path, textutil.format_exception(error)))
            try:
                os.remove(self.path)
            except Exception:
                # Deleting the file is best effort; ignore the error if we are unable to do so
                pass

    def save(self):
        """
        Writes the count to the update attempt file. Nothing is written if the parent directory
        of the file does not exist.
        """
        if os.path.isdir(os.path.dirname(self.path)):
            with open(self.path, 'w') as f:
                json.dump(self.to_json(), f)

    def from_json(self, data):
        """
        Sets the count from a dictionary as produced by to_json(); a missing "count" key means zero.
        @raise ValueError: if the count is not a non-negative integer; self.count is left unchanged
        """
        count = data.get(u"count", 0)
        # bool is a subclass of int, so it has to be rejected explicitly
        if isinstance(count, bool) or not isinstance(count, int) or count < 0:
            raise ValueError(u"Invalid update attempt count: {0}".format(repr(count)))
        self.count = count

    def to_json(self):
        """
        Returns the dictionary that is serialized to the update attempt file.
        """
        return {
            u"count": self.count
        }


8 changes: 0 additions & 8 deletions azurelinuxagent/ga/rsm_version_updater.py
Original file line number Diff line number Diff line change
Expand Up @@ -104,14 +104,6 @@ def log_new_agent_update_message(self):
logger.info(msg)
add_event(op=WALAEventOperation.AgentUpgrade, message=msg, log_event=False)

def purge_extra_agents_from_disk(self):
    """
    Purge every agent from disk except the current version; the rsm version is removed too if it exists.
    The rsm version may be left over, and possibly blacklisted, from previous update attempts, so it is
    removed in order to honor the current rsm version update.
    """
    versions_to_keep = [CURRENT_VERSION]
    self._purge_unknown_agents_from_disk(versions_to_keep)

def proceed_with_update(self):
"""
upgrade/downgrade to the new version.
Expand Down
7 changes: 0 additions & 7 deletions azurelinuxagent/ga/self_update_version_updater.py
Original file line number Diff line number Diff line change
Expand Up @@ -171,13 +171,6 @@ def log_new_agent_update_message(self):
logger.info(msg)
add_event(op=WALAEventOperation.AgentUpgrade, message=msg, log_event=False)

def purge_extra_agents_from_disk(self):
    """
    Purge agents from disk, keeping the current version and the new agent version (if it exists).
    """
    versions_to_keep = [CURRENT_VERSION, self._version]
    self._purge_unknown_agents_from_disk(versions_to_keep)

def proceed_with_update(self):
"""
upgrade to largest version. Downgrade is not supported.
Expand Down
14 changes: 0 additions & 14 deletions azurelinuxagent/ga/update.py
Original file line number Diff line number Diff line change
Expand Up @@ -372,7 +372,6 @@ def run(self, debug=False):
self._ensure_extension_telemetry_state_configured_properly(protocol)
self._ensure_firewall_rules_persisted(dst_ip=protocol.get_endpoint())
self._add_accept_tcp_firewall_rule_if_not_enabled(dst_ip=protocol.get_endpoint())
self._reset_legacy_blacklisted_agents()
self._cleanup_legacy_goal_state_history()

# Get all thread handlers
Expand Down Expand Up @@ -1199,16 +1198,3 @@ def _execute_run_command(command):
except Exception as e:
msg = "Error while checking ip table rules:{0}".format(ustr(e))
logger.error(msg)

def _reset_legacy_blacklisted_agents(self):
# Reset the state of all blacklisted agents that were blacklisted by legacy agents (i.e. not during auto-update)

# Filter legacy agents which are blacklisted but do not contain a `reason` in their error.json files
# (this flag signifies that this agent was blacklisted by the newer agents).
try:
legacy_blacklisted_agents = [agent for agent in self._load_agents() if
agent.is_blacklisted and agent.error.reason == '']
for agent in legacy_blacklisted_agents:
agent.clear_error()
except Exception as err:
logger.warn("Unable to reset legacy blacklisted agents due to: {0}".format(err))
2 changes: 1 addition & 1 deletion tests/data/wire/ga_manifest.xml
Original file line number Diff line number Diff line change
Expand Up @@ -31,7 +31,7 @@
<Plugin>
<Version>9.9.9.10</Version>
<Uris>
<Uri>http://mock-goal-state/ga-manifests/OSTCExtensions.WALinuxAgent__99999.0.0.0</Uri>
<Uri>http://mock-goal-state/ga-manifests/OSTCExtensions.WALinuxAgent__9.9.9.10</Uri>
</Uris>
</Plugin>
<Plugin>
Expand Down
Loading

0 comments on commit 12f1adb

Please sign in to comment.