Skip to content

Commit

Permalink
Merge branch 'develop' into scale-sets-working
Browse files Browse the repository at this point in the history
  • Loading branch information
narrieta authored Oct 27, 2023
2 parents b184344 + c323eb4 commit d801233
Show file tree
Hide file tree
Showing 11 changed files with 462 additions and 6 deletions.
16 changes: 14 additions & 2 deletions tests_e2e/orchestrator/lib/agent_test_suite.py
Original file line number Diff line number Diff line change
Expand Up @@ -569,6 +569,7 @@ def _execute_test_suite(self, suite: TestSuiteInfo, test_context: AgentTestConte

summary: List[str] = []
ignore_error_rules: List[Dict[str, Any]] = []
before_timestamp = datetime.datetime.min

for test in suite.tests:
test_full_name = f"{suite_name}-{test.name}"
Expand Down Expand Up @@ -642,6 +643,14 @@ def _execute_test_suite(self, suite: TestSuiteInfo, test_context: AgentTestConte

ignore_error_rules.extend(test_instance.get_ignore_error_rules())

# If the test has a timestamp before which errors should be ignored in the agent log, use that timestamp
# if multiple tests have this setting, use the earliest timestamp
if test_instance.get_ignore_errors_before_timestamp() != datetime.datetime.min:
if before_timestamp != datetime.datetime.min:
before_timestamp = min(before_timestamp, test_instance.get_ignore_errors_before_timestamp())
else:
before_timestamp = test_instance.get_ignore_errors_before_timestamp()

if not test_success and test.blocks_suite:
log.warning("%s failed and blocks the suite. Stopping suite execution.", test.name)
break
Expand All @@ -666,11 +675,11 @@ def _execute_test_suite(self, suite: TestSuiteInfo, test_context: AgentTestConte
if not suite_success:
self._mark_log_as_failed()

suite_success = suite_success and self._check_agent_log_on_test_nodes(ignore_error_rules)
suite_success = suite_success and self._check_agent_log_on_test_nodes(ignore_error_rules, before_timestamp)

return suite_success

def _check_agent_log_on_test_nodes(self, ignore_error_rules: List[Dict[str, Any]]) -> bool:
def _check_agent_log_on_test_nodes(self, ignore_error_rules: List[Dict[str, Any]], before_timestamp: datetime) -> bool:
"""
Checks the agent log on the test nodes for errors; returns true on success (no errors in the logs)
"""
Expand Down Expand Up @@ -698,6 +707,9 @@ def _check_agent_log_on_test_nodes(self, ignore_error_rules: List[Dict[str, Any]
if len(ignore_error_rules) > 0:
new = []
for e in errors:
# Ignore errors that occurred before the timestamp
if e.timestamp < before_timestamp:
continue
if not AgentLog.matches_ignore_rule(e, ignore_error_rules):
new.append(e)
errors = new
Expand Down
3 changes: 2 additions & 1 deletion tests_e2e/test_suites/agent_cgroups.yml
Original file line number Diff line number Diff line change
@@ -1,7 +1,8 @@
#
# The test suite verify the agent running in expected cgroups and also, checks agent tracking the cgroups for polling resource metrics.
# The test suite verify the agent running in expected cgroups and also, checks agent tracking the cgroups for polling resource metrics. Also, it verifies the agent cpu quota is set as expected.
#
name: "AgentCgroups"
tests:
- "agent_cgroups/agent_cgroups.py"
- "agent_cgroups/agent_cpu_quota.py"
images: "cgroups-endorsed"
3 changes: 2 additions & 1 deletion tests_e2e/test_suites/agent_publish.yml
Original file line number Diff line number Diff line change
Expand Up @@ -8,4 +8,5 @@ images:
- "random(endorsed, 10)"
- "random(endorsed-arm64, 2)"
locations: "AzureCloud:centraluseuap"
owns_vm: true
owns_vm: true
install_test_agent: false
9 changes: 8 additions & 1 deletion tests_e2e/test_suites/images.yml
Original file line number Diff line number Diff line change
Expand Up @@ -81,7 +81,11 @@ images:
AzureChinaCloud: []
centos_610: "OpenLogic CentOS 6.10 latest"
centos_79: "OpenLogic CentOS 7_9 latest"
centos_82: "OpenLogic CentOS 8_2 latest"
centos_82:
urn: "OpenLogic CentOS 8_2 latest"
vm_sizes:
# Since centos derived from redhat, please see the comment for vm size in rhel_82
- "Standard_B2s"
debian_8: "credativ Debian 8 latest"
debian_9: "credativ Debian 9 latest"
debian_10: "Debian debian-10 10 latest"
Expand Down Expand Up @@ -126,6 +130,9 @@ images:
urn: "RedHat RHEL 8.2 latest"
locations:
AzureChinaCloud: []
vm_sizes:
# Previously one user reported agent hang on this VM size for redhat 7+ but not observed in rhel 8. So I'm using same vm size to test agent cgroups scenario for rhel 8 to make sure we don't see any issue in automation.
- "Standard_B2s"
rhel_90:
urn: "RedHat RHEL 9_0 latest"
locations:
Expand Down
39 changes: 39 additions & 0 deletions tests_e2e/tests/agent_cgroups/agent_cpu_quota.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,39 @@
from typing import List, Dict, Any

from tests_e2e.tests.lib.agent_test import AgentTest
from tests_e2e.tests.lib.logging import log


class AgentCPUQuota(AgentTest):
"""
The test verify that the agent detects when it is throttled for using too much CPU, that it detects processes that do belong to the agent's cgroup, and that resource metrics are generated.
"""
def __init__(self, context):
super().__init__(context)
self._ssh_client = self._context.create_ssh_client()

def run(self):
log.info("=====Validating agent cpu quota checks")
self._run_remote_test("agent_cpu_quota-check_agent_cpu_quota.py", use_sudo=True)
log.info("Successfully Verified that agent running in expected CPU quotas")

def get_ignore_error_rules(self) -> List[Dict[str, Any]]:
ignore_rules = [
# This is produced by the test, so it is expected
# Examples:
# 2023-10-03T17:59:03.007572Z INFO MonitorHandler ExtHandler [CGW] Disabling resource usage monitoring. Reason: Check on cgroups failed:
# [CGroupsException] The agent's cgroup includes unexpected processes: ['[PID: 3190] /usr/bin/python3\x00/home/azureuser/bin/agent_cpu_quota-start_servi', '[PID: 3293] dd\x00if=/dev/zero\x00of=/dev/null\x00']
# [CGroupsException] The agent has been throttled for 5.7720997 seconds
{'message': r"Disabling resource usage monitoring. Reason: Check on cgroups failed"},
# This may happen during service stop while terminating the process
# Example:
# 2022-03-11T21:11:11.713161Z ERROR E2ETest [Errno 3] No such process:
{'message': r'E2ETest.*No such process'},
# 2022-10-26T15:38:39.655677Z ERROR E2ETest 'dd if=/dev/zero of=/dev/null' failed: -15 ():
{'message': r"E2ETest.*dd.*failed: -15"}
]
return ignore_rules


if __name__ == "__main__":
AgentCPUQuota.run_from_command_line()
7 changes: 6 additions & 1 deletion tests_e2e/tests/agent_publish/agent_publish.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,7 @@
# limitations under the License.
#
import uuid
from datetime import datetime
from typing import Any, Dict, List

from tests_e2e.tests.lib.agent_test import AgentVmTest
Expand Down Expand Up @@ -51,13 +52,17 @@ def run(self):
self._get_agent_info()
self._check_cse()

def get_ignore_errors_before_timestamp(self) -> datetime:
timestamp = self._ssh_client.run_command("agent_publish-get_agent_log_record_timestamp.py")
return datetime.strptime(timestamp.strip(), u'%Y-%m-%d %H:%M:%S.%f')

def _get_agent_info(self) -> None:
stdout: str = self._ssh_client.run_command("waagent-version", use_sudo=True)
log.info('Agent info \n%s', stdout)

def _prepare_agent(self) -> None:
log.info("Modifying agent update related config flags")
self._run_remote_test(self._ssh_client, "update-waagent-conf Debug.DownloadNewAgents=y AutoUpdate.GAFamily=Test", use_sudo=True)
self._run_remote_test(self._ssh_client, "update-waagent-conf Debug.DownloadNewAgents=y AutoUpdate.GAFamily=Test AutoUpdate.Enabled=y Extensions.Enabled=y", use_sudo=True)
log.info('Updated agent-update DownloadNewAgents GAFamily config flags')

def _check_update(self) -> None:
Expand Down
6 changes: 6 additions & 0 deletions tests_e2e/tests/lib/agent_test.py
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,8 @@
import sys

from abc import ABC, abstractmethod
from datetime import datetime

from assertpy import fail
from typing import Any, Dict, List

Expand Down Expand Up @@ -62,6 +64,10 @@ def get_ignore_error_rules(self) -> List[Dict[str, Any]]:
"""
return []

def get_ignore_errors_before_timestamp(self) -> datetime:
# Ignore errors in the agent log before this timestamp
return datetime.min

@classmethod
def run_from_command_line(cls):
"""
Expand Down
1 change: 1 addition & 0 deletions tests_e2e/tests/lib/cgroup_helpers.py
Original file line number Diff line number Diff line change
Expand Up @@ -133,6 +133,7 @@ def check_agent_quota_disabled():
Returns True if the cpu quota is infinity
"""
cpu_quota = get_agent_cpu_quota()
# the quota can be expressed as seconds (s) or milliseconds (ms); no quota is expressed as "infinity"
return cpu_quota == 'infinity'


Expand Down
213 changes: 213 additions & 0 deletions tests_e2e/tests/scripts/agent_cpu_quota-check_agent_cpu_quota.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,213 @@
#!/usr/bin/env pypy3

# Microsoft Azure Linux Agent
#
# Copyright 2018 Microsoft Corporation
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#

import datetime
import os
import re
import shutil
import time

from assertpy import fail

from azurelinuxagent.common.osutil import systemd
from azurelinuxagent.common.utils import shellutil
from azurelinuxagent.ga.cgroupconfigurator import _DROP_IN_FILE_CPU_QUOTA
from tests_e2e.tests.lib.agent_log import AgentLog
from tests_e2e.tests.lib.cgroup_helpers import check_agent_quota_disabled, \
get_agent_cpu_quota
from tests_e2e.tests.lib.logging import log
from tests_e2e.tests.lib.remote_test import run_remote_test
from tests_e2e.tests.lib.retry import retry_if_false


def prepare_agent():
# This function prepares the agent:
# 1) It modifies the service unit file to wrap the agent process with a script that starts the actual agent and then
# launches an instance of the dummy process to consume the CPU. Since all these processes are in the same cgroup,
# this has the same effect as the agent itself consuming the CPU.
#
# The process tree is similar to
#
# /usr/bin/python3 /home/azureuser/bin/agent_cpu_quota-start_service.py /usr/bin/python3 -u /usr/sbin/waagent -daemon
# ├─/usr/bin/python3 -u /usr/sbin/waagent -daemon
# │ └─python3 -u bin/WALinuxAgent-9.9.9.9-py3.8.egg -run-exthandlers
# │ └─4*[{python3}]
# ├─dd if=/dev/zero of=/dev/null
# │
# └─{python3}
#
# And the agent's cgroup looks like
#
# CGroup: /azure.slice/walinuxagent.service
# ├─10507 /usr/bin/python3 /home/azureuser/bin/agent_cpu_quota-start_service.py /usr/bin/python3 -u /usr/sbin/waagent -daemon
# ├─10508 /usr/bin/python3 -u /usr/sbin/waagent -daemon
# ├─10516 python3 -u bin/WALinuxAgent-9.9.9.9-py3.8.egg -run-exthandlers
# ├─10711 dd if=/dev/zero of=/dev/null
#
# 2) It turns on a few debug flags and resart the agent
log.info("***Preparing agent for testing cpu quota")
#
# Create a drop in file to wrap "start-service.py" around the actual agent: This will ovveride the ExecStart line in the agent's unit file
#
# ExecStart= (need to be empty to clear the original ExecStart)
# ExecStart=/home/.../agent_cgroups-start-service.py /usr/bin/python3 -u /usr/sbin/waagent -daemon
#
service_file = systemd.get_agent_unit_file()
exec_start = None
with open(service_file, "r") as file_:
for line in file_:
match = re.match("ExecStart=(.+)", line)
if match is not None:
exec_start = match.group(1)
break
else:
file_.seek(0)
raise Exception("Could not find ExecStart in {0}\n:{1}".format(service_file, file_.read()))
agent_python = exec_start.split()[0]
current_directory = os.path.dirname(os.path.abspath(__file__))
start_service_script = os.path.join(current_directory, "agent_cpu_quota-start_service.py")
drop_in_file = os.path.join(systemd.get_agent_drop_in_path(), "99-ExecStart.conf")
log.info("Creating %s...", drop_in_file)
with open(drop_in_file, "w") as file_:
file_.write("""
[Service]
ExecStart=
ExecStart={0} {1} {2}
""".format(agent_python, start_service_script, exec_start))
log.info("Executing daemon-reload")
shellutil.run_command(["systemctl", "daemon-reload"])

# Disable all checks on cgroups and enable log metrics every 20 sec
log.info("Executing script update-waagent-conf to enable agent cgroups config flag")
result = shellutil.run_command(["update-waagent-conf", "Debug.CgroupCheckPeriod=20", "Debug.CgroupLogMetrics=y",
"Debug.CgroupDisableOnProcessCheckFailure=n", "Debug.CgroupDisableOnQuotaCheckFailure=n"])
log.info("Successfully enabled agent cgroups config flag: {0}".format(result))


def verify_agent_reported_metrics():
"""
This method verifies that the agent reports % Processor Time and Throttled Time metrics
"""
log.info("** Verifying agent reported metrics")
log.info("Parsing agent log for metrics")
processor_time = []
throttled_time = []

def check_agent_log_for_metrics() -> bool:
for record in AgentLog().read():
match = re.search(r"% Processor Time\s*\[walinuxagent.service\]\s*=\s*([0-9.]+)", record.message)
if match is not None:
processor_time.append(float(match.group(1)))
else:
match = re.search(r"Throttled Time\s*\[walinuxagent.service\]\s*=\s*([0-9.]+)", record.message)
if match is not None:
throttled_time.append(float(match.group(1)))
if len(processor_time) < 1 or len(throttled_time) < 1:
return False
return True

found: bool = retry_if_false(check_agent_log_for_metrics)
if found:
log.info("%% Processor Time: %s", processor_time)
log.info("Throttled Time: %s", throttled_time)
log.info("Successfully verified agent reported resource metrics")
else:
fail(
"The agent doesn't seem to be collecting % Processor Time and Throttled Time metrics. Agent found Processor Time: {0}, Throttled Time: {1}".format(
processor_time, throttled_time))


def wait_for_log_message(message, timeout=datetime.timedelta(minutes=5)):
log.info("Checking agent's log for message matching [%s]", message)
start_time = datetime.datetime.now()
while datetime.datetime.now() - start_time <= timeout:
for record in AgentLog().read():
match = re.search(message, record.message, flags=re.DOTALL)
if match is not None:
log.info("Found message:\n\t%s", record.text.replace("\n", "\n\t"))
return
time.sleep(30)
fail("The agent did not find [{0}] in its log within the allowed timeout".format(message))


def verify_process_check_on_agent_cgroups():
"""
This method checks agent detect unexpected processes in its cgroup and disables the CPUQuota
"""
log.info("***Verifying process check on agent cgroups")
log.info("Ensuring agent CPUQuota is enabled and backup the drop-in file to restore later in further tests")
if check_agent_quota_disabled():
fail("The agent's CPUQuota is not enabled: {0}".format(get_agent_cpu_quota()))
quota_drop_in = os.path.join(systemd.get_agent_drop_in_path(), _DROP_IN_FILE_CPU_QUOTA)
quota_drop_in_backup = quota_drop_in + ".bk"
log.info("Backing up %s to %s...", quota_drop_in, quota_drop_in_backup)
shutil.copy(quota_drop_in, quota_drop_in_backup)
#
# Re-enable Process checks on cgroups and verify that the agent detects unexpected processes in its cgroup and disables the CPUQuota wehen
# that happens
#
shellutil.run_command(["update-waagent-conf", "Debug.CgroupDisableOnProcessCheckFailure=y"])

# The log message indicating the check failed is similar to
# 2021-03-29T23:33:15.603530Z INFO MonitorHandler ExtHandler Disabling resource usage monitoring. Reason: Check on cgroups failed:
# [CGroupsException] The agent's cgroup includes unexpected processes: ['[PID: 25826] python3\x00/home/nam/Compute-Runtime-Tux-Pipeline/dungeon_crawler/s']
wait_for_log_message(
"Disabling resource usage monitoring. Reason: Check on cgroups failed:.+The agent's cgroup includes unexpected processes")
if not check_agent_quota_disabled():
fail("The agent did not disable its CPUQuota: {0}".format(get_agent_cpu_quota()))


def verify_throttling_time_check_on_agent_cgroups():
"""
This method checks agent disables its CPUQuota when it exceeds its throttling limit
"""
log.info("***Verifying CPU throttling check on agent cgroups")
# Now disable the check on unexpected processes and enable the check on throttledtime and verify that the agent disables its CPUQuota when it exceeds its throttling limit
log.info("Re-enabling CPUQuota...")
quota_drop_in = os.path.join(systemd.get_agent_drop_in_path(), _DROP_IN_FILE_CPU_QUOTA)
quota_drop_in_backup = quota_drop_in + ".bk"
log.info("Restoring %s from %s...", quota_drop_in, quota_drop_in_backup)
shutil.copy(quota_drop_in_backup, quota_drop_in)
shellutil.run_command(["systemctl", "daemon-reload"])
shellutil.run_command(["update-waagent-conf", "Debug.CgroupDisableOnProcessCheckFailure=n", "Debug.CgroupDisableOnQuotaCheckFailure=y", "Debug.AgentCpuThrottledTimeThreshold=5"])

# The log message indicating the check failed is similar to
# 2021-04-01T20:47:55.892569Z INFO MonitorHandler ExtHandler Disabling resource usage monitoring. Reason: Check on cgroups failed:
# [CGroupsException] The agent has been throttled for 121.339916938 seconds
#
# After we need to wait for a little longer for the agent to update systemd:
# 2021-04-14T01:51:44.399860Z INFO MonitorHandler ExtHandler Executing systemctl daemon-reload...
#
wait_for_log_message(
"Disabling resource usage monitoring. Reason: Check on cgroups failed:.+The agent has been throttled",
timeout=datetime.timedelta(minutes=10))
wait_for_log_message("Stopped tracking cgroup walinuxagent.service", timeout=datetime.timedelta(minutes=10))
wait_for_log_message("Executing systemctl daemon-reload...", timeout=datetime.timedelta(minutes=5))
if not check_agent_quota_disabled():
fail("The agent did not disable its CPUQuota: {0}".format(get_agent_cpu_quota()))


def main():
prepare_agent()
verify_agent_reported_metrics()
verify_process_check_on_agent_cgroups()
verify_throttling_time_check_on_agent_cgroups()


run_remote_test(main)
Loading

0 comments on commit d801233

Please sign in to comment.