Skip to content

Commit

Permalink
Add retry logic to gs processing lag check
Browse files Browse the repository at this point in the history
  • Loading branch information
maddieford committed Jul 10, 2023
1 parent 7e959d4 commit fd37fc8
Show file tree
Hide file tree
Showing 2 changed files with 81 additions and 65 deletions.
4 changes: 0 additions & 4 deletions tests_e2e/tests/agent_ext_workflow/extension_workflow.py
Original file line number Diff line number Diff line change
Expand Up @@ -437,10 +437,6 @@ def run(self):
log.info("")
log.info("*******Verifying no lag between agent start and gs processing*******")

log.info("")
log.info("Sleeping for 15 seconds to allow goal state processing to complete...")
time.sleep(15)

log.info("")
log.info("Running agent_ext_workflow-validate_no_lag_between_agent_start_and_gs_processing.py remotely...")
result = self._ssh_client.run_command("agent_ext_workflow-validate_no_lag_between_agent_start_and_gs_processing.py", use_sudo=True)
Expand Down
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
#!/usr/bin/env pypy3

import time
# Microsoft Azure Linux Agent
#
# Copyright 2018 Microsoft Corporation
Expand Down Expand Up @@ -28,66 +28,86 @@

def main():
success = True

agent_started_time = []
agent_msg = []
time_diff_max_secs = 15

# Example: Agent WALinuxAgent-2.2.47.2 is running as the goal state agent
agent_started_regex = r"Azure Linux Agent \(Goal State Agent version [0-9.]+\)"
gs_completed_regex = r"ProcessExtensionsGoalState completed\s\[(?P<id>[a-z_\d]{13,14})\s(?P<duration>\d+)\sms\]"

verified_atleast_one_log_line = False
verified_atleast_one_agent_started_log_line = False
verified_atleast_one_gs_complete_log_line = False

agent_log = AgentLog(Path('/var/log/waagent.log'))

try:
for agent_record in agent_log.read():
verified_atleast_one_log_line = True

agent_started = re.match(agent_started_regex, agent_record.message)
verified_atleast_one_agent_started_log_line = verified_atleast_one_agent_started_log_line or agent_started
if agent_started:
agent_started_time.append(agent_record.timestamp)
agent_msg.append(agent_record.text)

gs_complete = re.match(gs_completed_regex, agent_record.message)
verified_atleast_one_gs_complete_log_line = verified_atleast_one_gs_complete_log_line or gs_complete
if agent_started_time and gs_complete:
duration = gs_complete.group('duration')
diff = agent_record.timestamp - agent_started_time.pop()
# Reduce the duration it took to complete the Goalstate, essentially we should only care about how long
# the agent took after start/restart to start processing GS
diff -= timedelta(milliseconds=int(duration))
agent_msg_line = agent_msg.pop()
if diff.seconds > time_diff_max_secs:
success = False
print("Found delay between agent start and GoalState Processing > {0}secs: "
"Messages: \n {1} {2}".format(time_diff_max_secs, agent_msg_line, agent_record.text))

except IOError as e:
print("Unable to validate no lag time: {0}".format(str(e)))

if not verified_atleast_one_log_line:
success = False
print("Didn't parse a single log line, ensure the log_parser is working fine and verify log regex")

if not verified_atleast_one_agent_started_log_line:
success = False
print("Didn't parse a single agent started log line, ensure the Regex is working fine: {0}"
.format(agent_started_regex))

if not verified_atleast_one_gs_complete_log_line:
success = False
print("Didn't parse a single GS completed log line, ensure the Regex is working fine: {0}"
.format(gs_completed_regex))

if agent_started_time or agent_msg:
success = False
print("Mismatch between number of agent start messages and number of GoalState Processing messages\n "
"Agent Start Messages: \n {0}".format('\n'.join(agent_msg)))
retry = 3

while retry > 0:
success = True

agent_started_time = []
agent_msg = []
time_diff_max_secs = 15
last_agent_log_timestamp = None

# Example: Agent WALinuxAgent-2.2.47.2 is running as the goal state agent
agent_started_regex = r"Azure Linux Agent \(Goal State Agent version [0-9.]+\)"
gs_completed_regex = r"ProcessExtensionsGoalState completed\s\[(?P<id>[a-z_\d]{13,14})\s(?P<duration>\d+)\sms\]"

verified_atleast_one_log_line = False
verified_atleast_one_agent_started_log_line = False
verified_atleast_one_gs_complete_log_line = False

agent_log = AgentLog(Path('/var/log/waagent.log'))

try:
for agent_record in agent_log.read():
last_agent_log_timestamp = agent_record.timestamp
verified_atleast_one_log_line = True

agent_started = re.match(agent_started_regex, agent_record.message)
verified_atleast_one_agent_started_log_line = verified_atleast_one_agent_started_log_line or agent_started
if agent_started:
agent_started_time.append(agent_record.timestamp)
agent_msg.append(agent_record.text)

gs_complete = re.match(gs_completed_regex, agent_record.message)
verified_atleast_one_gs_complete_log_line = verified_atleast_one_gs_complete_log_line or gs_complete
if agent_started_time and gs_complete:
duration = gs_complete.group('duration')
diff = agent_record.timestamp - agent_started_time.pop()
# Reduce the duration it took to complete the Goalstate, essentially we should only care about how long
# the agent took after start/restart to start processing GS
diff -= timedelta(milliseconds=int(duration))
agent_msg_line = agent_msg.pop()
if diff.seconds > time_diff_max_secs:
success = False
print("Found delay between agent start and GoalState Processing > {0}secs: "
"Messages: \n {1} {2}".format(time_diff_max_secs, agent_msg_line, agent_record.text))

except IOError as e:
print("Unable to validate no lag time: {0}".format(str(e)))

if not verified_atleast_one_log_line:
success = False
print("Didn't parse a single log line, ensure the log_parser is working fine and verify log regex")

if not verified_atleast_one_agent_started_log_line:
success = False
print("Didn't parse a single agent started log line, ensure the Regex is working fine: {0}"
.format(agent_started_regex))

if not verified_atleast_one_gs_complete_log_line:
success = False
print("Didn't parse a single GS completed log line, ensure the Regex is working fine: {0}"
.format(gs_completed_regex))

if agent_started_time or agent_msg:
# If agent_started_time or agent_msg is not empty, there is a mismatch in the number of agent start messages
# and GoalState Processing messages
# If another check hasn't already failed, and the last parsed log is less than 15 seconds after the
# mismatched agent start log, we should retry after sleeping for 5s to give the agent time to finish
# GoalState processing
if success and last_agent_log_timestamp < (agent_started_time[-1] + timedelta(seconds=15)):
print("Sleeping for 5 seconds to allow goal state processing to complete...")
time.sleep(5)
else:
success = False
print("Mismatch between number of agent start messages and number of GoalState Processing messages\n "
"Agent Start Messages: \n {0}".format('\n'.join(agent_msg)))
retry == 0
else:
retry == 0

retry -= 1

sys.exit(0 if success else 1)

Expand Down

0 comments on commit fd37fc8

Please sign in to comment.