Merge branch 'develop' into scale-sets-working

Azure · Oct 27, 2023 · d801233 · d801233
2 parents b184344 + c323eb4
commit d801233
Show file tree

Hide file tree

Showing 11 changed files with 462 additions and 6 deletions.
diff --git a/tests_e2e/orchestrator/lib/agent_test_suite.py b/tests_e2e/orchestrator/lib/agent_test_suite.py
@@ -569,6 +569,7 @@ def _execute_test_suite(self, suite: TestSuiteInfo, test_context: AgentTestConte
 
                     summary: List[str] = []
                     ignore_error_rules: List[Dict[str, Any]] = []
+                    before_timestamp = datetime.datetime.min
 
                     for test in suite.tests:
                         test_full_name = f"{suite_name}-{test.name}"
@@ -642,6 +643,14 @@ def _execute_test_suite(self, suite: TestSuiteInfo, test_context: AgentTestConte
 
                         ignore_error_rules.extend(test_instance.get_ignore_error_rules())
 
+                        # If the test has a timestamp before which errors should be ignored in the agent log, use that timestamp
+                        # if multiple tests have this setting, use the earliest timestamp
+                        if test_instance.get_ignore_errors_before_timestamp() != datetime.datetime.min:
+                            if before_timestamp != datetime.datetime.min:
+                                before_timestamp = min(before_timestamp, test_instance.get_ignore_errors_before_timestamp())
+                            else:
+                                before_timestamp = test_instance.get_ignore_errors_before_timestamp()
+
                         if not test_success and test.blocks_suite:
                             log.warning("%s failed and blocks the suite. Stopping suite execution.", test.name)
                             break
@@ -666,11 +675,11 @@ def _execute_test_suite(self, suite: TestSuiteInfo, test_context: AgentTestConte
                     if not suite_success:
                         self._mark_log_as_failed()
 
-                suite_success = suite_success and self._check_agent_log_on_test_nodes(ignore_error_rules)
+                suite_success = suite_success and self._check_agent_log_on_test_nodes(ignore_error_rules, before_timestamp)
 
                 return suite_success
 
-    def _check_agent_log_on_test_nodes(self, ignore_error_rules: List[Dict[str, Any]]) -> bool:
+    def _check_agent_log_on_test_nodes(self, ignore_error_rules: List[Dict[str, Any]], before_timestamp: datetime) -> bool:
         """
         Checks the agent log on the test nodes for errors; returns true on success (no errors in the logs)
         """
@@ -698,6 +707,9 @@ def _check_agent_log_on_test_nodes(self, ignore_error_rules: List[Dict[str, Any]
                 if len(ignore_error_rules) > 0:
                     new = []
                     for e in errors:
+                        # Ignore errors that occurred before the timestamp
+                        if e.timestamp < before_timestamp:
+                            continue
                         if not AgentLog.matches_ignore_rule(e, ignore_error_rules):
                             new.append(e)
                     errors = new

diff --git a/tests_e2e/test_suites/agent_cgroups.yml b/tests_e2e/test_suites/agent_cgroups.yml
@@ -1,7 +1,8 @@
 #
-# The test suite verify the agent running in expected cgroups and also, checks agent tracking the cgroups for polling resource metrics.
+# The test suite verify the agent running in expected cgroups and also, checks agent tracking the cgroups for polling resource metrics. Also, it verifies the agent cpu quota is set as expected.
 #
 name: "AgentCgroups"
 tests:
   - "agent_cgroups/agent_cgroups.py"
+  - "agent_cgroups/agent_cpu_quota.py"
 images: "cgroups-endorsed"
diff --git a/tests_e2e/test_suites/agent_publish.yml b/tests_e2e/test_suites/agent_publish.yml
@@ -8,4 +8,5 @@ images:
   - "random(endorsed, 10)"
   - "random(endorsed-arm64, 2)"
 locations: "AzureCloud:centraluseuap"
-owns_vm: true
+owns_vm: true
+install_test_agent: false
diff --git a/tests_e2e/test_suites/images.yml b/tests_e2e/test_suites/images.yml
@@ -81,7 +81,11 @@ images:
          AzureChinaCloud: []
    centos_610: "OpenLogic CentOS 6.10 latest"
    centos_79: "OpenLogic CentOS 7_9 latest"
-   centos_82: "OpenLogic CentOS 8_2 latest"
+   centos_82:
+      urn: "OpenLogic CentOS 8_2 latest"
+      vm_sizes:
+         # Since centos derived from redhat, please see the comment for vm size in rhel_82
+         - "Standard_B2s"
    debian_8: "credativ Debian 8 latest"
    debian_9: "credativ Debian 9 latest"
    debian_10: "Debian debian-10 10 latest"
@@ -126,6 +130,9 @@ images:
       urn: "RedHat RHEL 8.2 latest"
       locations:
          AzureChinaCloud: []
+      vm_sizes:
+      # Previously one user reported agent hang on this VM size for redhat 7+ but not observed in rhel 8. So I'm using same vm size to test agent cgroups scenario for rhel 8 to make sure we don't see any issue in automation.
+         - "Standard_B2s"
    rhel_90:
       urn: "RedHat RHEL 9_0 latest"
       locations:

diff --git a/tests_e2e/tests/agent_cgroups/agent_cpu_quota.py b/tests_e2e/tests/agent_cgroups/agent_cpu_quota.py
@@ -0,0 +1,39 @@
+from typing import List, Dict, Any
+
+from tests_e2e.tests.lib.agent_test import AgentTest
+from tests_e2e.tests.lib.logging import log
+
+
+class AgentCPUQuota(AgentTest):
+    """
+    The test verify that the agent detects when it is throttled for using too much CPU, that it detects processes that do belong to the agent's cgroup, and that resource metrics are generated.
+    """
+    def __init__(self, context):
+        super().__init__(context)
+        self._ssh_client = self._context.create_ssh_client()
+
+    def run(self):
+        log.info("=====Validating agent cpu quota checks")
+        self._run_remote_test("agent_cpu_quota-check_agent_cpu_quota.py", use_sudo=True)
+        log.info("Successfully Verified that agent running in expected CPU quotas")
+
+    def get_ignore_error_rules(self) -> List[Dict[str, Any]]:
+        ignore_rules = [
+            # This is produced by the test, so it is expected
+            # Examples:
+            #     2023-10-03T17:59:03.007572Z INFO MonitorHandler ExtHandler [CGW] Disabling resource usage monitoring. Reason: Check on cgroups failed:
+            #     [CGroupsException] The agent's cgroup includes unexpected processes: ['[PID: 3190] /usr/bin/python3\x00/home/azureuser/bin/agent_cpu_quota-start_servi', '[PID: 3293] dd\x00if=/dev/zero\x00of=/dev/null\x00']
+            #     [CGroupsException] The agent has been throttled for 5.7720997 seconds
+            {'message': r"Disabling resource usage monitoring. Reason: Check on cgroups failed"},
+            # This may happen during service stop while terminating the process
+            # Example:
+            #     2022-03-11T21:11:11.713161Z ERROR E2ETest [Errno 3] No such process:
+            {'message': r'E2ETest.*No such process'},
+            #     2022-10-26T15:38:39.655677Z ERROR E2ETest 'dd if=/dev/zero of=/dev/null' failed: -15 ():
+            {'message': r"E2ETest.*dd.*failed: -15"}
+        ]
+        return ignore_rules
+
+
+if __name__ == "__main__":
+    AgentCPUQuota.run_from_command_line()
diff --git a/tests_e2e/tests/agent_publish/agent_publish.py b/tests_e2e/tests/agent_publish/agent_publish.py
@@ -17,6 +17,7 @@
 # limitations under the License.
 #
 import uuid
+from datetime import datetime
 from typing import Any, Dict, List
 
 from tests_e2e.tests.lib.agent_test import AgentVmTest
@@ -51,13 +52,17 @@ def run(self):
         self._get_agent_info()
         self._check_cse()
 
+    def get_ignore_errors_before_timestamp(self) -> datetime:
+        timestamp = self._ssh_client.run_command("agent_publish-get_agent_log_record_timestamp.py")
+        return datetime.strptime(timestamp.strip(), u'%Y-%m-%d %H:%M:%S.%f')
+
     def _get_agent_info(self) -> None:
         stdout: str = self._ssh_client.run_command("waagent-version", use_sudo=True)
         log.info('Agent info \n%s', stdout)
 
     def _prepare_agent(self) -> None:
         log.info("Modifying agent update related config flags")
-        self._run_remote_test(self._ssh_client, "update-waagent-conf Debug.DownloadNewAgents=y AutoUpdate.GAFamily=Test", use_sudo=True)
+        self._run_remote_test(self._ssh_client, "update-waagent-conf Debug.DownloadNewAgents=y AutoUpdate.GAFamily=Test AutoUpdate.Enabled=y Extensions.Enabled=y", use_sudo=True)
         log.info('Updated agent-update DownloadNewAgents  GAFamily config flags')
 
     def _check_update(self) -> None:

diff --git a/tests_e2e/tests/lib/agent_test.py b/tests_e2e/tests/lib/agent_test.py
@@ -20,6 +20,8 @@
 import sys
 
 from abc import ABC, abstractmethod
+from datetime import datetime
+
 from assertpy import fail
 from typing import Any, Dict, List
 
@@ -62,6 +64,10 @@ def get_ignore_error_rules(self) -> List[Dict[str, Any]]:
         """
         return []
 
+    def get_ignore_errors_before_timestamp(self) -> datetime:
+        # Ignore errors in the agent log before this timestamp
+        return datetime.min
+
     @classmethod
     def run_from_command_line(cls):
         """

diff --git a/tests_e2e/tests/lib/cgroup_helpers.py b/tests_e2e/tests/lib/cgroup_helpers.py
@@ -133,6 +133,7 @@ def check_agent_quota_disabled():
     Returns True if the cpu quota is infinity
     """
     cpu_quota = get_agent_cpu_quota()
+    # the quota can be expressed as seconds (s) or milliseconds (ms); no quota is expressed as "infinity"
     return cpu_quota == 'infinity'
 
 

diff --git a/tests_e2e/tests/scripts/agent_cpu_quota-check_agent_cpu_quota.py b/tests_e2e/tests/scripts/agent_cpu_quota-check_agent_cpu_quota.py
@@ -0,0 +1,213 @@
+#!/usr/bin/env pypy3
+
+# Microsoft Azure Linux Agent
+#
+# Copyright 2018 Microsoft Corporation
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+
+import datetime
+import os
+import re
+import shutil
+import time
+
+from assertpy import fail
+
+from azurelinuxagent.common.osutil import systemd
+from azurelinuxagent.common.utils import shellutil
+from azurelinuxagent.ga.cgroupconfigurator import _DROP_IN_FILE_CPU_QUOTA
+from tests_e2e.tests.lib.agent_log import AgentLog
+from tests_e2e.tests.lib.cgroup_helpers import check_agent_quota_disabled, \
+    get_agent_cpu_quota
+from tests_e2e.tests.lib.logging import log
+from tests_e2e.tests.lib.remote_test import run_remote_test
+from tests_e2e.tests.lib.retry import retry_if_false
+
+
+def prepare_agent():
+    # This function prepares the agent:
+    #    1) It modifies the service unit file to wrap the agent process with a script that starts the actual agent and then
+    #       launches an instance of the dummy process to consume the CPU. Since all these processes are in the same cgroup,
+    #       this has the same effect as the agent itself consuming the CPU.
+    #
+    #       The process tree is similar to
+    #
+    #           /usr/bin/python3 /home/azureuser/bin/agent_cpu_quota-start_service.py /usr/bin/python3 -u /usr/sbin/waagent -daemon
+    #             ├─/usr/bin/python3 -u /usr/sbin/waagent -daemon
+    #             │   └─python3 -u bin/WALinuxAgent-9.9.9.9-py3.8.egg -run-exthandlers
+    #             │       └─4*[{python3}]
+    #             ├─dd if=/dev/zero of=/dev/null
+    #             │
+    #             └─{python3}
+    #
+    #       And the agent's cgroup looks like
+    #
+    #       CGroup: /azure.slice/walinuxagent.service
+    #              ├─10507 /usr/bin/python3 /home/azureuser/bin/agent_cpu_quota-start_service.py /usr/bin/python3 -u /usr/sbin/waagent -daemon
+    #              ├─10508 /usr/bin/python3 -u /usr/sbin/waagent -daemon
+    #              ├─10516 python3 -u bin/WALinuxAgent-9.9.9.9-py3.8.egg -run-exthandlers
+    #              ├─10711 dd if=/dev/zero of=/dev/null
+    #
+    #    2) It turns on a few debug flags and resart the agent
+    log.info("***Preparing agent for testing cpu quota")
+    #
+    # Create a drop in file to wrap "start-service.py" around the actual agent: This will ovveride the ExecStart line in the agent's unit file
+    #
+    #     ExecStart= (need to be empty to clear the original ExecStart)
+    #     ExecStart=/home/.../agent_cgroups-start-service.py /usr/bin/python3 -u /usr/sbin/waagent -daemon
+    #
+    service_file = systemd.get_agent_unit_file()
+    exec_start = None
+    with open(service_file, "r") as file_:
+        for line in file_:
+            match = re.match("ExecStart=(.+)", line)
+            if match is not None:
+                exec_start = match.group(1)
+                break
+        else:
+            file_.seek(0)
+            raise Exception("Could not find ExecStart in {0}\n:{1}".format(service_file, file_.read()))
+    agent_python = exec_start.split()[0]
+    current_directory = os.path.dirname(os.path.abspath(__file__))
+    start_service_script = os.path.join(current_directory, "agent_cpu_quota-start_service.py")
+    drop_in_file = os.path.join(systemd.get_agent_drop_in_path(), "99-ExecStart.conf")
+    log.info("Creating %s...", drop_in_file)
+    with open(drop_in_file, "w") as file_:
+        file_.write("""
+[Service]
+ExecStart=
+ExecStart={0} {1} {2}
+""".format(agent_python, start_service_script, exec_start))
+    log.info("Executing daemon-reload")
+    shellutil.run_command(["systemctl", "daemon-reload"])
+
+    # Disable all checks on cgroups and enable log metrics every 20 sec
+    log.info("Executing script update-waagent-conf to enable agent cgroups config flag")
+    result = shellutil.run_command(["update-waagent-conf", "Debug.CgroupCheckPeriod=20", "Debug.CgroupLogMetrics=y",
+                           "Debug.CgroupDisableOnProcessCheckFailure=n", "Debug.CgroupDisableOnQuotaCheckFailure=n"])
+    log.info("Successfully enabled agent cgroups config flag: {0}".format(result))
+
+
+def verify_agent_reported_metrics():
+    """
+    This method verifies that the agent reports % Processor Time and Throttled Time metrics
+    """
+    log.info("** Verifying agent reported metrics")
+    log.info("Parsing agent log for metrics")
+    processor_time = []
+    throttled_time = []
+
+    def check_agent_log_for_metrics() -> bool:
+        for record in AgentLog().read():
+            match = re.search(r"% Processor Time\s*\[walinuxagent.service\]\s*=\s*([0-9.]+)", record.message)
+            if match is not None:
+                processor_time.append(float(match.group(1)))
+            else:
+                match = re.search(r"Throttled Time\s*\[walinuxagent.service\]\s*=\s*([0-9.]+)", record.message)
+                if match is not None:
+                    throttled_time.append(float(match.group(1)))
+        if len(processor_time) < 1 or len(throttled_time) < 1:
+            return False
+        return True
+
+    found: bool = retry_if_false(check_agent_log_for_metrics)
+    if found:
+        log.info("%% Processor Time: %s", processor_time)
+        log.info("Throttled Time: %s", throttled_time)
+        log.info("Successfully verified agent reported resource metrics")
+    else:
+        fail(
+            "The agent doesn't seem to be collecting % Processor Time and Throttled Time metrics. Agent found Processor Time: {0}, Throttled Time: {1}".format(
+                processor_time, throttled_time))
+
+
+def wait_for_log_message(message, timeout=datetime.timedelta(minutes=5)):
+    log.info("Checking agent's log for message matching [%s]", message)
+    start_time = datetime.datetime.now()
+    while datetime.datetime.now() - start_time <= timeout:
+        for record in AgentLog().read():
+            match = re.search(message, record.message, flags=re.DOTALL)
+            if match is not None:
+                log.info("Found message:\n\t%s", record.text.replace("\n", "\n\t"))
+                return
+        time.sleep(30)
+    fail("The agent did not find [{0}] in its log within the allowed timeout".format(message))
+
+
+def verify_process_check_on_agent_cgroups():
+    """
+    This method checks agent detect unexpected processes in its cgroup and disables the CPUQuota
+    """
+    log.info("***Verifying process check on  agent cgroups")
+    log.info("Ensuring agent CPUQuota is enabled and backup the drop-in file to restore later in further tests")
+    if check_agent_quota_disabled():
+        fail("The agent's CPUQuota is not enabled: {0}".format(get_agent_cpu_quota()))
+    quota_drop_in = os.path.join(systemd.get_agent_drop_in_path(), _DROP_IN_FILE_CPU_QUOTA)
+    quota_drop_in_backup = quota_drop_in + ".bk"
+    log.info("Backing up %s to %s...", quota_drop_in, quota_drop_in_backup)
+    shutil.copy(quota_drop_in, quota_drop_in_backup)
+    #
+    # Re-enable Process checks on cgroups and verify that the agent detects unexpected processes in its cgroup and disables the CPUQuota wehen
+    # that happens
+    #
+    shellutil.run_command(["update-waagent-conf", "Debug.CgroupDisableOnProcessCheckFailure=y"])
+
+    # The log message indicating the check failed is similar to
+    #     2021-03-29T23:33:15.603530Z INFO MonitorHandler ExtHandler Disabling resource usage monitoring. Reason: Check on cgroups failed:
+    #     [CGroupsException] The agent's cgroup includes unexpected processes: ['[PID: 25826] python3\x00/home/nam/Compute-Runtime-Tux-Pipeline/dungeon_crawler/s']
+    wait_for_log_message(
+        "Disabling resource usage monitoring. Reason: Check on cgroups failed:.+The agent's cgroup includes unexpected processes")
+    if not check_agent_quota_disabled():
+        fail("The agent did not disable its CPUQuota: {0}".format(get_agent_cpu_quota()))
+
+
+def verify_throttling_time_check_on_agent_cgroups():
+    """
+    This method checks agent disables its CPUQuota when it exceeds its throttling limit
+    """
+    log.info("***Verifying CPU throttling check on  agent cgroups")
+    # Now disable the check on unexpected processes and enable the check on throttledtime and verify that the agent disables its CPUQuota when it exceeds its throttling limit
+    log.info("Re-enabling CPUQuota...")
+    quota_drop_in = os.path.join(systemd.get_agent_drop_in_path(), _DROP_IN_FILE_CPU_QUOTA)
+    quota_drop_in_backup = quota_drop_in + ".bk"
+    log.info("Restoring %s from %s...", quota_drop_in, quota_drop_in_backup)
+    shutil.copy(quota_drop_in_backup, quota_drop_in)
+    shellutil.run_command(["systemctl", "daemon-reload"])
+    shellutil.run_command(["update-waagent-conf", "Debug.CgroupDisableOnProcessCheckFailure=n", "Debug.CgroupDisableOnQuotaCheckFailure=y", "Debug.AgentCpuThrottledTimeThreshold=5"])
+
+    # The log message indicating the check failed is similar to
+    #     2021-04-01T20:47:55.892569Z INFO MonitorHandler ExtHandler Disabling resource usage monitoring. Reason: Check on cgroups failed:
+    #     [CGroupsException] The agent has been throttled for 121.339916938 seconds
+    #
+    # After we need to wait for a little longer for the agent to update systemd:
+    #     2021-04-14T01:51:44.399860Z INFO MonitorHandler ExtHandler Executing systemctl daemon-reload...
+    #
+    wait_for_log_message(
+        "Disabling resource usage monitoring. Reason: Check on cgroups failed:.+The agent has been throttled",
+        timeout=datetime.timedelta(minutes=10))
+    wait_for_log_message("Stopped tracking cgroup walinuxagent.service", timeout=datetime.timedelta(minutes=10))
+    wait_for_log_message("Executing systemctl daemon-reload...", timeout=datetime.timedelta(minutes=5))
+    if not check_agent_quota_disabled():
+        fail("The agent did not disable its CPUQuota: {0}".format(get_agent_cpu_quota()))
+
+
+def main():
+    prepare_agent()
+    verify_agent_reported_metrics()
+    verify_process_check_on_agent_cgroups()
+    verify_throttling_time_check_on_agent_cgroups()
+
+
+run_remote_test(main)