From 47ea3b36f13e12680397ee82e7334f032c399e44 Mon Sep 17 00:00:00 2001 From: Nageswara Nandigam <84482346+nagworld9@users.noreply.github.com> Date: Fri, 20 Oct 2023 14:41:52 -0700 Subject: [PATCH] agent cpu quota scenario (#2937) * agent_cpu_quota scenario * addressed comments * addressed comments --- tests_e2e/test_suites/agent_cgroups.yml | 3 +- tests_e2e/test_suites/images.yml | 9 +- .../tests/agent_cgroups/agent_cpu_quota.py | 39 ++++ tests_e2e/tests/lib/cgroup_helpers.py | 1 + .../agent_cpu_quota-check_agent_cpu_quota.py | 213 ++++++++++++++++++ .../scripts/agent_cpu_quota-start_service.py | 96 ++++++++ 6 files changed, 359 insertions(+), 2 deletions(-) create mode 100644 tests_e2e/tests/agent_cgroups/agent_cpu_quota.py create mode 100755 tests_e2e/tests/scripts/agent_cpu_quota-check_agent_cpu_quota.py create mode 100755 tests_e2e/tests/scripts/agent_cpu_quota-start_service.py diff --git a/tests_e2e/test_suites/agent_cgroups.yml b/tests_e2e/test_suites/agent_cgroups.yml index 239f37e32a..32a290c9ef 100644 --- a/tests_e2e/test_suites/agent_cgroups.yml +++ b/tests_e2e/test_suites/agent_cgroups.yml @@ -1,7 +1,8 @@ # -# The test suite verify the agent running in expected cgroups and also, checks agent tracking the cgroups for polling resource metrics. +# The test suite verify the agent running in expected cgroups and also, checks agent tracking the cgroups for polling resource metrics. Also, it verifies the agent cpu quota is set as expected. # name: "AgentCgroups" tests: - "agent_cgroups/agent_cgroups.py" + - "agent_cgroups/agent_cpu_quota.py" images: "cgroups-endorsed" \ No newline at end of file diff --git a/tests_e2e/test_suites/images.yml b/tests_e2e/test_suites/images.yml index c254cccd71..02392b375b 100644 --- a/tests_e2e/test_suites/images.yml +++ b/tests_e2e/test_suites/images.yml @@ -81,7 +81,11 @@ images: AzureChinaCloud: [] centos_610: "OpenLogic CentOS 6.10 latest" centos_79: "OpenLogic CentOS 7_9 latest" - centos_82: "OpenLogic CentOS 8_2 latest" + centos_82: + urn: "OpenLogic CentOS 8_2 latest" + vm_sizes: + # Since centos derived from redhat, please see the comment for vm size in rhel_82 + - "Standard_B2s" debian_8: "credativ Debian 8 latest" debian_9: "credativ Debian 9 latest" debian_10: "Debian debian-10 10 latest" @@ -126,6 +130,9 @@ images: urn: "RedHat RHEL 8.2 latest" locations: AzureChinaCloud: [] + vm_sizes: + # Previously one user reported agent hang on this VM size for redhat 7+ but not observed in rhel 8. So I'm using same vm size to test agent cgroups scenario for rhel 8 to make sure we don't see any issue in automation. + - "Standard_B2s" rhel_90: urn: "RedHat RHEL 9_0 latest" locations: diff --git a/tests_e2e/tests/agent_cgroups/agent_cpu_quota.py b/tests_e2e/tests/agent_cgroups/agent_cpu_quota.py new file mode 100644 index 0000000000..79f95fc547 --- /dev/null +++ b/tests_e2e/tests/agent_cgroups/agent_cpu_quota.py @@ -0,0 +1,39 @@ +from typing import List, Dict, Any + +from tests_e2e.tests.lib.agent_test import AgentTest +from tests_e2e.tests.lib.logging import log + + +class AgentCPUQuota(AgentTest): + """ + The test verify that the agent detects when it is throttled for using too much CPU, that it detects processes that do belong to the agent's cgroup, and that resource metrics are generated. + """ + def __init__(self, context): + super().__init__(context) + self._ssh_client = self._context.create_ssh_client() + + def run(self): + log.info("=====Validating agent cpu quota checks") + self._run_remote_test("agent_cpu_quota-check_agent_cpu_quota.py", use_sudo=True) + log.info("Successfully Verified that agent running in expected CPU quotas") + + def get_ignore_error_rules(self) -> List[Dict[str, Any]]: + ignore_rules = [ + # This is produced by the test, so it is expected + # Examples: + # 2023-10-03T17:59:03.007572Z INFO MonitorHandler ExtHandler [CGW] Disabling resource usage monitoring. Reason: Check on cgroups failed: + # [CGroupsException] The agent's cgroup includes unexpected processes: ['[PID: 3190] /usr/bin/python3\x00/home/azureuser/bin/agent_cpu_quota-start_servi', '[PID: 3293] dd\x00if=/dev/zero\x00of=/dev/null\x00'] + # [CGroupsException] The agent has been throttled for 5.7720997 seconds + {'message': r"Disabling resource usage monitoring. Reason: Check on cgroups failed"}, + # This may happen during service stop while terminating the process + # Example: + # 2022-03-11T21:11:11.713161Z ERROR E2ETest [Errno 3] No such process: + {'message': r'E2ETest.*No such process'}, + # 2022-10-26T15:38:39.655677Z ERROR E2ETest 'dd if=/dev/zero of=/dev/null' failed: -15 (): + {'message': r"E2ETest.*dd.*failed: -15"} + ] + return ignore_rules + + +if __name__ == "__main__": + AgentCPUQuota.run_from_command_line() diff --git a/tests_e2e/tests/lib/cgroup_helpers.py b/tests_e2e/tests/lib/cgroup_helpers.py index 7eb3a9b1f7..6da2865c21 100644 --- a/tests_e2e/tests/lib/cgroup_helpers.py +++ b/tests_e2e/tests/lib/cgroup_helpers.py @@ -133,6 +133,7 @@ def check_agent_quota_disabled(): Returns True if the cpu quota is infinity """ cpu_quota = get_agent_cpu_quota() + # the quota can be expressed as seconds (s) or milliseconds (ms); no quota is expressed as "infinity" return cpu_quota == 'infinity' diff --git a/tests_e2e/tests/scripts/agent_cpu_quota-check_agent_cpu_quota.py b/tests_e2e/tests/scripts/agent_cpu_quota-check_agent_cpu_quota.py new file mode 100755 index 0000000000..63871b43a6 --- /dev/null +++ b/tests_e2e/tests/scripts/agent_cpu_quota-check_agent_cpu_quota.py @@ -0,0 +1,213 @@ +#!/usr/bin/env pypy3 + +# Microsoft Azure Linux Agent +# +# Copyright 2018 Microsoft Corporation +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# + +import datetime +import os +import re +import shutil +import time + +from assertpy import fail + +from azurelinuxagent.common.osutil import systemd +from azurelinuxagent.common.utils import shellutil +from azurelinuxagent.ga.cgroupconfigurator import _DROP_IN_FILE_CPU_QUOTA +from tests_e2e.tests.lib.agent_log import AgentLog +from tests_e2e.tests.lib.cgroup_helpers import check_agent_quota_disabled, \ + get_agent_cpu_quota +from tests_e2e.tests.lib.logging import log +from tests_e2e.tests.lib.remote_test import run_remote_test +from tests_e2e.tests.lib.retry import retry_if_false + + +def prepare_agent(): + # This function prepares the agent: + # 1) It modifies the service unit file to wrap the agent process with a script that starts the actual agent and then + # launches an instance of the dummy process to consume the CPU. Since all these processes are in the same cgroup, + # this has the same effect as the agent itself consuming the CPU. + # + # The process tree is similar to + # + # /usr/bin/python3 /home/azureuser/bin/agent_cpu_quota-start_service.py /usr/bin/python3 -u /usr/sbin/waagent -daemon + # ├─/usr/bin/python3 -u /usr/sbin/waagent -daemon + # │ └─python3 -u bin/WALinuxAgent-9.9.9.9-py3.8.egg -run-exthandlers + # │ └─4*[{python3}] + # ├─dd if=/dev/zero of=/dev/null + # │ + # └─{python3} + # + # And the agent's cgroup looks like + # + # CGroup: /azure.slice/walinuxagent.service + # ├─10507 /usr/bin/python3 /home/azureuser/bin/agent_cpu_quota-start_service.py /usr/bin/python3 -u /usr/sbin/waagent -daemon + # ├─10508 /usr/bin/python3 -u /usr/sbin/waagent -daemon + # ├─10516 python3 -u bin/WALinuxAgent-9.9.9.9-py3.8.egg -run-exthandlers + # ├─10711 dd if=/dev/zero of=/dev/null + # + # 2) It turns on a few debug flags and resart the agent + log.info("***Preparing agent for testing cpu quota") + # + # Create a drop in file to wrap "start-service.py" around the actual agent: This will ovveride the ExecStart line in the agent's unit file + # + # ExecStart= (need to be empty to clear the original ExecStart) + # ExecStart=/home/.../agent_cgroups-start-service.py /usr/bin/python3 -u /usr/sbin/waagent -daemon + # + service_file = systemd.get_agent_unit_file() + exec_start = None + with open(service_file, "r") as file_: + for line in file_: + match = re.match("ExecStart=(.+)", line) + if match is not None: + exec_start = match.group(1) + break + else: + file_.seek(0) + raise Exception("Could not find ExecStart in {0}\n:{1}".format(service_file, file_.read())) + agent_python = exec_start.split()[0] + current_directory = os.path.dirname(os.path.abspath(__file__)) + start_service_script = os.path.join(current_directory, "agent_cpu_quota-start_service.py") + drop_in_file = os.path.join(systemd.get_agent_drop_in_path(), "99-ExecStart.conf") + log.info("Creating %s...", drop_in_file) + with open(drop_in_file, "w") as file_: + file_.write(""" +[Service] +ExecStart= +ExecStart={0} {1} {2} +""".format(agent_python, start_service_script, exec_start)) + log.info("Executing daemon-reload") + shellutil.run_command(["systemctl", "daemon-reload"]) + + # Disable all checks on cgroups and enable log metrics every 20 sec + log.info("Executing script update-waagent-conf to enable agent cgroups config flag") + result = shellutil.run_command(["update-waagent-conf", "Debug.CgroupCheckPeriod=20", "Debug.CgroupLogMetrics=y", + "Debug.CgroupDisableOnProcessCheckFailure=n", "Debug.CgroupDisableOnQuotaCheckFailure=n"]) + log.info("Successfully enabled agent cgroups config flag: {0}".format(result)) + + +def verify_agent_reported_metrics(): + """ + This method verifies that the agent reports % Processor Time and Throttled Time metrics + """ + log.info("** Verifying agent reported metrics") + log.info("Parsing agent log for metrics") + processor_time = [] + throttled_time = [] + + def check_agent_log_for_metrics() -> bool: + for record in AgentLog().read(): + match = re.search(r"% Processor Time\s*\[walinuxagent.service\]\s*=\s*([0-9.]+)", record.message) + if match is not None: + processor_time.append(float(match.group(1))) + else: + match = re.search(r"Throttled Time\s*\[walinuxagent.service\]\s*=\s*([0-9.]+)", record.message) + if match is not None: + throttled_time.append(float(match.group(1))) + if len(processor_time) < 1 or len(throttled_time) < 1: + return False + return True + + found: bool = retry_if_false(check_agent_log_for_metrics) + if found: + log.info("%% Processor Time: %s", processor_time) + log.info("Throttled Time: %s", throttled_time) + log.info("Successfully verified agent reported resource metrics") + else: + fail( + "The agent doesn't seem to be collecting % Processor Time and Throttled Time metrics. Agent found Processor Time: {0}, Throttled Time: {1}".format( + processor_time, throttled_time)) + + +def wait_for_log_message(message, timeout=datetime.timedelta(minutes=5)): + log.info("Checking agent's log for message matching [%s]", message) + start_time = datetime.datetime.now() + while datetime.datetime.now() - start_time <= timeout: + for record in AgentLog().read(): + match = re.search(message, record.message, flags=re.DOTALL) + if match is not None: + log.info("Found message:\n\t%s", record.text.replace("\n", "\n\t")) + return + time.sleep(30) + fail("The agent did not find [{0}] in its log within the allowed timeout".format(message)) + + +def verify_process_check_on_agent_cgroups(): + """ + This method checks agent detect unexpected processes in its cgroup and disables the CPUQuota + """ + log.info("***Verifying process check on agent cgroups") + log.info("Ensuring agent CPUQuota is enabled and backup the drop-in file to restore later in further tests") + if check_agent_quota_disabled(): + fail("The agent's CPUQuota is not enabled: {0}".format(get_agent_cpu_quota())) + quota_drop_in = os.path.join(systemd.get_agent_drop_in_path(), _DROP_IN_FILE_CPU_QUOTA) + quota_drop_in_backup = quota_drop_in + ".bk" + log.info("Backing up %s to %s...", quota_drop_in, quota_drop_in_backup) + shutil.copy(quota_drop_in, quota_drop_in_backup) + # + # Re-enable Process checks on cgroups and verify that the agent detects unexpected processes in its cgroup and disables the CPUQuota wehen + # that happens + # + shellutil.run_command(["update-waagent-conf", "Debug.CgroupDisableOnProcessCheckFailure=y"]) + + # The log message indicating the check failed is similar to + # 2021-03-29T23:33:15.603530Z INFO MonitorHandler ExtHandler Disabling resource usage monitoring. Reason: Check on cgroups failed: + # [CGroupsException] The agent's cgroup includes unexpected processes: ['[PID: 25826] python3\x00/home/nam/Compute-Runtime-Tux-Pipeline/dungeon_crawler/s'] + wait_for_log_message( + "Disabling resource usage monitoring. Reason: Check on cgroups failed:.+The agent's cgroup includes unexpected processes") + if not check_agent_quota_disabled(): + fail("The agent did not disable its CPUQuota: {0}".format(get_agent_cpu_quota())) + + +def verify_throttling_time_check_on_agent_cgroups(): + """ + This method checks agent disables its CPUQuota when it exceeds its throttling limit + """ + log.info("***Verifying CPU throttling check on agent cgroups") + # Now disable the check on unexpected processes and enable the check on throttledtime and verify that the agent disables its CPUQuota when it exceeds its throttling limit + log.info("Re-enabling CPUQuota...") + quota_drop_in = os.path.join(systemd.get_agent_drop_in_path(), _DROP_IN_FILE_CPU_QUOTA) + quota_drop_in_backup = quota_drop_in + ".bk" + log.info("Restoring %s from %s...", quota_drop_in, quota_drop_in_backup) + shutil.copy(quota_drop_in_backup, quota_drop_in) + shellutil.run_command(["systemctl", "daemon-reload"]) + shellutil.run_command(["update-waagent-conf", "Debug.CgroupDisableOnProcessCheckFailure=n", "Debug.CgroupDisableOnQuotaCheckFailure=y", "Debug.AgentCpuThrottledTimeThreshold=5"]) + + # The log message indicating the check failed is similar to + # 2021-04-01T20:47:55.892569Z INFO MonitorHandler ExtHandler Disabling resource usage monitoring. Reason: Check on cgroups failed: + # [CGroupsException] The agent has been throttled for 121.339916938 seconds + # + # After we need to wait for a little longer for the agent to update systemd: + # 2021-04-14T01:51:44.399860Z INFO MonitorHandler ExtHandler Executing systemctl daemon-reload... + # + wait_for_log_message( + "Disabling resource usage monitoring. Reason: Check on cgroups failed:.+The agent has been throttled", + timeout=datetime.timedelta(minutes=10)) + wait_for_log_message("Stopped tracking cgroup walinuxagent.service", timeout=datetime.timedelta(minutes=10)) + wait_for_log_message("Executing systemctl daemon-reload...", timeout=datetime.timedelta(minutes=5)) + if not check_agent_quota_disabled(): + fail("The agent did not disable its CPUQuota: {0}".format(get_agent_cpu_quota())) + + +def main(): + prepare_agent() + verify_agent_reported_metrics() + verify_process_check_on_agent_cgroups() + verify_throttling_time_check_on_agent_cgroups() + + +run_remote_test(main) diff --git a/tests_e2e/tests/scripts/agent_cpu_quota-start_service.py b/tests_e2e/tests/scripts/agent_cpu_quota-start_service.py new file mode 100755 index 0000000000..ba0f5abb23 --- /dev/null +++ b/tests_e2e/tests/scripts/agent_cpu_quota-start_service.py @@ -0,0 +1,96 @@ +#!/usr/bin/env pypy3 + +# Microsoft Azure Linux Agent +# +# Copyright 2018 Microsoft Corporation +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# +# This script starts the actual agent and then launches an instance of the dummy process periodically to consume the CPU +# +import signal +import subprocess +import sys +import threading +import time +import traceback + +from azurelinuxagent.common import logger + + +class CpuConsumer(threading.Thread): + def __init__(self): + threading.Thread.__init__(self) + self._stopped = False + + def run(self): + threading.current_thread().setName("*Stress*") + + while not self._stopped: + try: + # Dummy operation(reads empty streams and drops) which creates load on the CPU + dd_command = ["dd", "if=/dev/zero", "of=/dev/null"] + logger.info("Starting dummy dd command: {0} to stress CPU", ' '.join(dd_command)) + subprocess.Popen(dd_command) + logger.info("dd command completed; sleeping...") + i = 0 + while i < 30 and not self._stopped: + time.sleep(1) + i += 1 + except Exception as exception: + logger.error("{0}:\n{1}", exception, traceback.format_exc()) + + def stop(self): + self._stopped = True + + +try: + threading.current_thread().setName("*StartService*") + logger.set_prefix("E2ETest") + logger.add_logger_appender(logger.AppenderType.FILE, logger.LogLevel.INFO, "/var/log/waagent.log") + + agent_command_line = sys.argv[1:] + + logger.info("Starting Agent: {0}", ' '.join(agent_command_line)) + agent_process = subprocess.Popen(agent_command_line) + + # sleep a little to give the agent a chance to initialize + time.sleep(15) + + cpu_consumer = CpuConsumer() + cpu_consumer.start() + + + def forward_signal(signum, _): + if signum == signal.SIGTERM: + logger.info("Stopping stress thread...") + cpu_consumer.stop() + logger.info("Forwarding signal {0} to Agent", signum) + agent_process.send_signal(signum) + + + signal.signal(signal.SIGTERM, forward_signal) + + agent_process.wait() + logger.info("Agent completed") + + cpu_consumer.stop() + cpu_consumer.join() + logger.info("Stress completed") + + logger.info("Exiting...") + sys.exit(agent_process.returncode) + +except Exception as exception: + logger.error("Unexpected error occurred while starting agent service : {0}", exception) + raise