Add retry logic to gs processing lag check

Azure · Jul 10, 2023 · fd37fc8 · fd37fc8
1 parent 7e959d4
commit fd37fc8
Show file tree

Hide file tree

Showing 2 changed files with 81 additions and 65 deletions.
diff --git a/tests_e2e/tests/agent_ext_workflow/extension_workflow.py b/tests_e2e/tests/agent_ext_workflow/extension_workflow.py
@@ -437,10 +437,6 @@ def run(self):
             log.info("")
             log.info("*******Verifying no lag between agent start and gs processing*******")
 
-            log.info("")
-            log.info("Sleeping for 15 seconds to allow goal state processing to complete...")
-            time.sleep(15)
-
             log.info("")
             log.info("Running agent_ext_workflow-validate_no_lag_between_agent_start_and_gs_processing.py remotely...")
             result = self._ssh_client.run_command("agent_ext_workflow-validate_no_lag_between_agent_start_and_gs_processing.py", use_sudo=True)

diff --git a/...tests/scripts/agent_ext_workflow-validate_no_lag_between_agent_start_and_gs_processing.py b/...tests/scripts/agent_ext_workflow-validate_no_lag_between_agent_start_and_gs_processing.py
@@ -1,5 +1,5 @@
 #!/usr/bin/env pypy3
-
+import time
 # Microsoft Azure Linux Agent
 #
 # Copyright 2018 Microsoft Corporation
@@ -28,66 +28,86 @@
 
 def main():
     success = True
-
-    agent_started_time = []
-    agent_msg = []
-    time_diff_max_secs = 15
-
-    # Example: Agent WALinuxAgent-2.2.47.2 is running as the goal state agent
-    agent_started_regex = r"Azure Linux Agent \(Goal State Agent version [0-9.]+\)"
-    gs_completed_regex = r"ProcessExtensionsGoalState completed\s\[(?P<id>[a-z_\d]{13,14})\s(?P<duration>\d+)\sms\]"
-
-    verified_atleast_one_log_line = False
-    verified_atleast_one_agent_started_log_line = False
-    verified_atleast_one_gs_complete_log_line = False
-
-    agent_log = AgentLog(Path('/var/log/waagent.log'))
-
-    try:
-        for agent_record in agent_log.read():
-            verified_atleast_one_log_line = True
-
-            agent_started = re.match(agent_started_regex, agent_record.message)
-            verified_atleast_one_agent_started_log_line = verified_atleast_one_agent_started_log_line or agent_started
-            if agent_started:
-                agent_started_time.append(agent_record.timestamp)
-                agent_msg.append(agent_record.text)
-
-            gs_complete = re.match(gs_completed_regex, agent_record.message)
-            verified_atleast_one_gs_complete_log_line = verified_atleast_one_gs_complete_log_line or gs_complete
-            if agent_started_time and gs_complete:
-                duration = gs_complete.group('duration')
-                diff = agent_record.timestamp - agent_started_time.pop()
-                # Reduce the duration it took to complete the Goalstate, essentially we should only care about how long
-                # the agent took after start/restart to start processing GS
-                diff -= timedelta(milliseconds=int(duration))
-                agent_msg_line = agent_msg.pop()
-                if diff.seconds > time_diff_max_secs:
-                    success = False
-                    print("Found delay between agent start and GoalState Processing > {0}secs: "
-                             "Messages: \n {1} {2}".format(time_diff_max_secs, agent_msg_line, agent_record.text))
-
-    except IOError as e:
-        print("Unable to validate no lag time: {0}".format(str(e)))
-
-    if not verified_atleast_one_log_line:
-        success = False
-        print("Didn't parse a single log line, ensure the log_parser is working fine and verify log regex")
-
-    if not verified_atleast_one_agent_started_log_line:
-        success = False
-        print("Didn't parse a single agent started log line, ensure the Regex is working fine: {0}"
-              .format(agent_started_regex))
-
-    if not verified_atleast_one_gs_complete_log_line:
-        success = False
-        print("Didn't parse a single GS completed log line, ensure the Regex is working fine: {0}"
-              .format(gs_completed_regex))
-
-    if agent_started_time or agent_msg:
-        success = False
-        print("Mismatch between number of agent start messages and number of GoalState Processing messages\n "
-              "Agent Start Messages: \n {0}".format('\n'.join(agent_msg)))
+    retry = 3
+
+    while retry > 0:
+        success = True
+
+        agent_started_time = []
+        agent_msg = []
+        time_diff_max_secs = 15
+        last_agent_log_timestamp = None
+
+        # Example: Agent WALinuxAgent-2.2.47.2 is running as the goal state agent
+        agent_started_regex = r"Azure Linux Agent \(Goal State Agent version [0-9.]+\)"
+        gs_completed_regex = r"ProcessExtensionsGoalState completed\s\[(?P<id>[a-z_\d]{13,14})\s(?P<duration>\d+)\sms\]"
+
+        verified_atleast_one_log_line = False
+        verified_atleast_one_agent_started_log_line = False
+        verified_atleast_one_gs_complete_log_line = False
+
+        agent_log = AgentLog(Path('/var/log/waagent.log'))
+
+        try:
+            for agent_record in agent_log.read():
+                last_agent_log_timestamp = agent_record.timestamp
+                verified_atleast_one_log_line = True
+
+                agent_started = re.match(agent_started_regex, agent_record.message)
+                verified_atleast_one_agent_started_log_line = verified_atleast_one_agent_started_log_line or agent_started
+                if agent_started:
+                    agent_started_time.append(agent_record.timestamp)
+                    agent_msg.append(agent_record.text)
+
+                gs_complete = re.match(gs_completed_regex, agent_record.message)
+                verified_atleast_one_gs_complete_log_line = verified_atleast_one_gs_complete_log_line or gs_complete
+                if agent_started_time and gs_complete:
+                    duration = gs_complete.group('duration')
+                    diff = agent_record.timestamp - agent_started_time.pop()
+                    # Reduce the duration it took to complete the Goalstate, essentially we should only care about how long
+                    # the agent took after start/restart to start processing GS
+                    diff -= timedelta(milliseconds=int(duration))
+                    agent_msg_line = agent_msg.pop()
+                    if diff.seconds > time_diff_max_secs:
+                        success = False
+                        print("Found delay between agent start and GoalState Processing > {0}secs: "
+                                 "Messages: \n {1} {2}".format(time_diff_max_secs, agent_msg_line, agent_record.text))
+
+        except IOError as e:
+            print("Unable to validate no lag time: {0}".format(str(e)))
+
+        if not verified_atleast_one_log_line:
+            success = False
+            print("Didn't parse a single log line, ensure the log_parser is working fine and verify log regex")
+
+        if not verified_atleast_one_agent_started_log_line:
+            success = False
+            print("Didn't parse a single agent started log line, ensure the Regex is working fine: {0}"
+                  .format(agent_started_regex))
+
+        if not verified_atleast_one_gs_complete_log_line:
+            success = False
+            print("Didn't parse a single GS completed log line, ensure the Regex is working fine: {0}"
+                  .format(gs_completed_regex))
+
+        if agent_started_time or agent_msg:
+            # If agent_started_time or agent_msg is not empty, there is a mismatch in the number of agent start messages
+            # and GoalState Processing messages
+            # If another check hasn't already failed, and the last parsed log is less than 15 seconds after the
+            # mismatched agent start log, we should retry after sleeping for 5s to give the agent time to finish
+            # GoalState processing
+            if success and last_agent_log_timestamp < (agent_started_time[-1] + timedelta(seconds=15)):
+                print("Sleeping for 5 seconds to allow goal state processing to complete...")
+                time.sleep(5)
+            else:
+                success = False
+                print("Mismatch between number of agent start messages and number of GoalState Processing messages\n "
+                      "Agent Start Messages: \n {0}".format('\n'.join(agent_msg)))
+                retry == 0
+        else:
+            retry == 0
+
+        retry -= 1
 
     sys.exit(0 if success else 1)