diff --git a/tests_e2e/orchestrator/lib/agent_test_suite.py b/tests_e2e/orchestrator/lib/agent_test_suite.py index 204546b6e7..7abd714343 100644 --- a/tests_e2e/orchestrator/lib/agent_test_suite.py +++ b/tests_e2e/orchestrator/lib/agent_test_suite.py @@ -16,6 +16,7 @@ # import contextlib import datetime +import json import logging import re import traceback @@ -29,6 +30,7 @@ # E0401: Unable to import 'lisa' (import-error) # etc from lisa import ( # pylint: disable=E0401 + Environment, Logger, Node, notifier, @@ -43,11 +45,13 @@ import makepkg from azurelinuxagent.common.version import AGENT_VERSION from tests_e2e.orchestrator.lib.agent_test_loader import TestSuiteInfo +from tests_e2e.tests.lib.agent_log import AgentLog from tests_e2e.tests.lib.agent_test import TestSkipped from tests_e2e.tests.lib.agent_test_context import AgentTestContext from tests_e2e.tests.lib.identifiers import VmIdentifier from tests_e2e.tests.lib.logging import log as agent_test_logger # Logger used by the tests from tests_e2e.tests.lib.logging import set_current_thread_log +from tests_e2e.tests.lib.agent_log import AgentLogRecord from tests_e2e.tests.lib.shell import run_command from tests_e2e.tests.lib.ssh_client import SshClient @@ -256,7 +260,7 @@ def _clean_up(self) -> None: def _setup_node(self) -> None: """ - Prepares the remote node for executing the test suite. + Prepares the remote node for executing the test suite (installs tools and the test agent, etc) """ self._log.info("") self._log.info("************************************** [Node Setup] **************************************") @@ -265,23 +269,27 @@ def _setup_node(self) -> None: self._log.info("Resource Group: %s", self.context.vm.resource_group) self._log.info("") - self._install_tools_on_node() + self.context.ssh_client.run_command("mkdir -p ~/bin/tests_e2e/tests; touch ~/bin/agent-env") - if self.context.is_vhd: - self._log.info("Using a VHD; will not install the test Agent.") - else: - self._install_agent_on_node() + # Copy the test tools + tools_path = self.context.test_source_directory/"orchestrator"/"scripts" + tools_target_path = Path("~/bin") + self._log.info("Copying %s to %s:%s", tools_path, self.context.node.name, tools_target_path) + self.context.ssh_client.copy_to_node(tools_path, tools_target_path, recursive=True) - def _install_tools_on_node(self) -> None: - """ - Installs the test tools on the test node - """ - self.context.ssh_client.run_command("mkdir -p ~/bin") + # Copy the test libraries + lib_path = self.context.test_source_directory/"tests"/"lib" + lib_target_path = Path("~/bin/tests_e2e/tests") + self._log.info("Copying %s to %s:%s", lib_path, self.context.node.name, lib_target_path) + self.context.ssh_client.copy_to_node(lib_path, lib_target_path, recursive=True) - tools_path = self.context.test_source_directory/"orchestrator"/"scripts" - self._log.info(f"Copying {tools_path} to the test node") - self.context.ssh_client.copy(tools_path, Path("~/bin"), remote_target=True, recursive=True) + # Copy the test agent + agent_package_path: Path = self._get_agent_package_path() + agent_package_target_path = Path("~/bin")/agent_package_path.name + self._log.info("Copying %s to %s:%s", agent_package_path, self.context.node.name, agent_package_target_path) + self.context.ssh_client.copy_to_node(agent_package_path, agent_package_target_path) + # Copy Pypy if self.context.ssh_client.get_architecture() == "aarch64": pypy_path = Path("/tmp/pypy3.7-arm64.tar.bz2") pypy_download = "https://downloads.python.org/pypy/pypy3.7-v7.3.5-aarch64.tar.bz2" @@ -292,26 +300,21 @@ def _install_tools_on_node(self) -> None: if not pypy_path.exists(): self._log.info(f"Downloading {pypy_download} to {pypy_path}") run_command(["wget", pypy_download, "-O", pypy_path]) - self._log.info(f"Copying {pypy_path} to the test node") - self.context.ssh_client.copy(pypy_path, Path("~/bin/pypy3.7.tar.bz2"), remote_target=True) + pypy_target_path = Path("~/bin/pypy3.7.tar.bz2") + self._log.info("Copying %s to %s:%s", pypy_path, self.context.node.name, pypy_target_path) + self.context.ssh_client.copy_to_node(pypy_path, pypy_target_path) - self._log.info(f'Installing tools on the test node\n{self.context.ssh_client.run_command("~/bin/scripts/install-tools")}') - self._log.info(f'Remote commands will use {self.context.ssh_client.run_command("which python3")}') + # Install the tools and libraries + install_command = lambda: self.context.ssh_client.run_command(f"~/bin/scripts/install-tools --agent-package {agent_package_target_path}") + self._log.info('Installing tools on the test node\n%s', install_command()) + self._log.info('Remote commands will use %s', self.context.ssh_client.run_command("which python3")) - def _install_agent_on_node(self) -> None: - """ - Installs the given agent package on the test node. - """ - agent_package_path: Path = self._get_agent_package_path() - - self._log.info("Installing %s on %s", agent_package_path, self.context.node.name) - agent_package_remote_path = self.context.remote_working_directory/agent_package_path.name - self._log.info("Copying %s to %s:%s", agent_package_path, self.context.node.name, agent_package_remote_path) - self.context.ssh_client.copy(agent_package_path, agent_package_remote_path, remote_target=True) - stdout = self.context.ssh_client.run_command(f"install-agent --package {agent_package_remote_path} --version {AGENT_VERSION}", use_sudo=True) - self._log.info(stdout) - - self._log.info("The agent was installed successfully.") + # Install the agent + if self.context.is_vhd: + self._log.info("Using a VHD; will not install the Test Agent.") + else: + install_command = lambda: self.context.ssh_client.run_command(f"install-agent --package {agent_package_target_path} --version {AGENT_VERSION}", use_sudo=True) + self._log.info("Installing the Test Agent on %s\n%s", self.context.node.name, install_command()) def _collect_node_logs(self) -> None: """ @@ -327,23 +330,25 @@ def _collect_node_logs(self) -> None: remote_path = "/tmp/waagent-logs.tgz" local_path = self.context.log_path/'{0}.tgz'.format(self.context.image_name) self._log.info("Copying %s:%s to %s", self.context.node.name, remote_path, local_path) - self.context.ssh_client.copy(remote_path, local_path, remote_source=True) + self.context.ssh_client.copy_from_node(remote_path, local_path) + except: # pylint: disable=bare-except self._log.exception("Failed to collect logs from the test machine") @TestCaseMetadata(description="", priority=0) - def agent_test_suite(self, node: Node, variables: Dict[str, Any], log_path: str, log: Logger) -> None: + def agent_test_suite(self, node: Node, environment: Environment, variables: Dict[str, Any], log_path: str, log: Logger) -> None: """ Executes each of the AgentTests included in the "c_test_suites" variable (which is generated by the AgentTestSuitesCombinator). """ self._set_context(node, variables, log_path, log) - with _set_thread_name(self.context.image_name): # The thread name is added to self._log - # E1133: Non-iterable value self.context.test_suites is used in an iterating context (not-an-iterable) - # (OK to iterate, test_suite is a List) + # Set the thread name to the image; this name is added to self._log + with _set_thread_name(self.context.image_name): + # Log the environment's name and the variables received from the runbook (note that we need to expand the names of the test suites) + self._log.info("LISA Environment: %s", environment.name) self._log.info( - "Test suite parameters: [test_suites: %s] [skip_setup: %s] [collect_logs: %s]", - [t.name for t in self.context.test_suites], self.context.skip_setup, self.context.collect_logs) # pylint: disable=E1133 + "Runbook variables:\n%s", + '\n'.join([f"\t{name}: {value if name != 'c_test_suites' else [t.name for t in value] }" for name, value in variables.items()])) start_time: datetime.datetime = datetime.datetime.now() test_suite_success = True @@ -361,6 +366,8 @@ def agent_test_suite(self, node: Node, variables: Dict[str, Any], log_path: str, for suite in self.context.test_suites: # pylint: disable=E1133 test_suite_success = self._execute_test_suite(suite) and test_suite_success + test_suite_success = self._check_agent_log() and test_suite_success + finally: collect = self.context.collect_logs if collect == CollectLogs.Always or collect == CollectLogs.Failed and not test_suite_success: @@ -475,6 +482,53 @@ def _execute_test_suite(self, suite: TestSuiteInfo) -> bool: return success + def _check_agent_log(self) -> bool: + """ + Checks the agent log for errors; returns true on success (no errors int the log) + """ + start_time: datetime.datetime = datetime.datetime.now() + + self._log.info("Checking agent log on the test node") + output = self.context.ssh_client.run_command("check-agent-log.py -j") + errors = json.loads(output, object_hook=AgentLogRecord.from_dictionary) + + # Individual tests may have rules to ignore known errors; filter those out + ignore_error_rules = [] + # pylint seems to think self.context.test_suites is not iterable. Suppressing warning, since its type is List[AgentTestSuite] + # E1133: Non-iterable value self.context.test_suites is used in an iterating context (not-an-iterable) + for suite in self.context.test_suites: # pylint: disable=E1133 + for test in suite.tests: + ignore_error_rules.extend(test(self.context).get_ignore_error_rules()) + + if len(ignore_error_rules) > 0: + new = [] + for e in errors: + if not AgentLog.matches_ignore_rule(e, ignore_error_rules): + new.append(e) + errors = new + + if len(errors) == 0: + # If no errors, we are done; don't create a log or test result. + self._log.info("There are no errors in the agent log") + return True + + log_path: Path = self.context.log_path/f"CheckAgentLog-{self.context.image_name}.log" + message = f"Detected {len(errors)} error(s) in the agent log. See {log_path} for a full report." + self._log.info(message) + + with set_current_thread_log(log_path): + agent_test_logger.info("Detected %s error(s) in the agent log:\n\n%s", len(errors), '\n'.join(['\t' + e.text for e in errors])) + + self._report_test_result( + self.context.image_name, + "CheckAgentLog", + TestStatus.FAILED, + start_time, + message=message + '\n' + '\n'.join([e.text for e in errors[0:3]]), + add_exception_stack_trace=True) + + return False + @staticmethod def _report_test_result( suite_name: str, diff --git a/tests_e2e/orchestrator/scripts/check-agent-log.py b/tests_e2e/orchestrator/scripts/check-agent-log.py new file mode 100755 index 0000000000..231e7bcd05 --- /dev/null +++ b/tests_e2e/orchestrator/scripts/check-agent-log.py @@ -0,0 +1,49 @@ +#!/usr/bin/env python3 + +# Microsoft Azure Linux Agent +# +# Copyright 2018 Microsoft Corporation +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# + +import argparse +import json +import sys + +from pathlib import Path +from tests_e2e.tests.lib.agent_log import AgentLog + +try: + parser = argparse.ArgumentParser() + parser.add_argument('path', nargs='?', help='Path of the log file', default='/var/log/waagent.log') + parser.add_argument('-j', '--json', action='store_true', help='Produce a JSON report') + parser.set_defaults(json=False) + args = parser.parse_args() + + error_list = AgentLog(Path(args.path)).get_errors() + + if args.json: + print(json.dumps(error_list, default=lambda o: o.__dict__)) + else: + if len(error_list) == 0: + print("No errors were found.") + else: + for e in error_list: + print(e.text) + +except Exception as e: + print(f"{e}", file=sys.stderr) + sys.exit(1) + +sys.exit(0) diff --git a/tests_e2e/orchestrator/scripts/get-waagent-path b/tests_e2e/orchestrator/scripts/get-agent-path similarity index 98% rename from tests_e2e/orchestrator/scripts/get-waagent-path rename to tests_e2e/orchestrator/scripts/get-agent-path index c177585695..e2e44f453f 100755 --- a/tests_e2e/orchestrator/scripts/get-waagent-path +++ b/tests_e2e/orchestrator/scripts/get-agent-path @@ -22,8 +22,6 @@ # set -euo pipefail -# - # On most distros, 'waagent' is in PATH if which waagent 2> /dev/null; then exit 0 @@ -42,6 +40,7 @@ fi # try some well-known locations declare -a known_locations=( + "/usr/sbin/waagent" "/usr/share/oem/bin/waagent" ) diff --git a/tests_e2e/orchestrator/scripts/install-agent b/tests_e2e/orchestrator/scripts/install-agent index 64d96307a3..868b9fa788 100755 --- a/tests_e2e/orchestrator/scripts/install-agent +++ b/tests_e2e/orchestrator/scripts/install-agent @@ -72,6 +72,14 @@ else fi echo "Service name: $service_name" +# +# Find the path to the Agent's executable file +# +waagent=$(get-agent-path) +echo "Agent's path: $waagent" +$waagent --version +echo "" + # # Install the package # @@ -98,20 +106,13 @@ service-start $service_name # # Verify that the new agent is running and output its status. # Note that the extension handler may take some time to start so give 1 minute. -# Also, note that the default Python is set to Pypy, so before executing 'waagent' we need to set the -# Python path to the location of the azurelinuxagent module. # echo "Verifying agent installation..." -PYTHONPATH=$(get-agent-pythonpath) -export PYTHONPATH - check-version() { - waagent=$(get-waagent-path) - for i in {0..5} do - if $waagent --version | grep -E "Goal state agent:\s+$1" > /dev/null; then + if $waagent --version | grep -E "Goal state agent:\s+$version" > /dev/null; then return 0 fi sleep 10 @@ -128,7 +129,6 @@ else exit_code=1 fi -waagent=$(get-waagent-path) $waagent --version printf "\n" service-status $service_name diff --git a/tests_e2e/orchestrator/scripts/install-tools b/tests_e2e/orchestrator/scripts/install-tools index 04dbb4f653..6feb71e530 100755 --- a/tests_e2e/orchestrator/scripts/install-tools +++ b/tests_e2e/orchestrator/scripts/install-tools @@ -25,47 +25,112 @@ set -euo pipefail +usage() ( + echo "Usage: install-tools -p|--agent-package " + exit 1 +) + +while [[ $# -gt 0 ]]; do + case $1 in + -p|--agent-package) + shift + if [ "$#" -lt 1 ]; then + usage + fi + agent_package=$1 + shift + ;; + *) + usage + esac +done +if [ "$#" -ne 0 ] || [ -z ${agent_package+x} ]; then + usage +fi + echo "Installing scripts to ~/bin" mv ~/bin/scripts/* ~/bin rm -r ~/bin/scripts +PATH="$HOME/bin:$PATH" -echo "Installing Pypy to ~/bin" -# bzip2/lbzip2 (used by tar to uncompress *.bz2 files) are not available by default in some distros; -# use Python to uncompress the Pypy tarball. +# If the system's Python is <= 3.7, install Pypy and make it the default Python for the user executing the tests python=$(~/bin/find-python) -$python ~/bin/uncompress.py ~/bin/pypy3.7.tar.bz2 ~/bin/pypy3.7.tar -tar xf ~/bin/pypy3.7.tar -C ~/bin -rm ~/bin/pypy3.7.tar ~/bin/pypy3.7.tar.bz2 +python_version=$($python -c 'import sys; print("{0:02}.{1:02}".format(sys.version_info[0], sys.version_info[1]))') +echo "Python: $python ($python_version)" +if [[ $python_version < "03.07" ]]; then + echo "Installing Pypy 3.7 to ~/bin and making it the default Python for user $USER" + # bzip2/lbzip2 (used by tar to uncompress *.bz2 files) are not available by default in some distros; + # use Python to uncompress the Pypy tarball. + $python ~/bin/uncompress.py ~/bin/pypy3.7.tar.bz2 ~/bin/pypy3.7.tar + tar xf ~/bin/pypy3.7.tar -C ~/bin + rm ~/bin/pypy3.7.tar ~/bin/pypy3.7.tar.bz2 -if [[ -e ~/bin/python ]]; then - rm ~/bin/python -fi -ln -s ~/bin/pypy*/bin/pypy3.7 ~/bin/python + if [[ -e ~/bin/python ]]; then + rm ~/bin/python + fi + ln -s ~/bin/pypy*/bin/pypy3.7 ~/bin/python + + if [[ -e ~/bin/python3 ]]; then + rm ~/bin/python3 + fi + ln -s ~/bin/pypy*/bin/pypy3.7 ~/bin/python3 -if [[ -e ~/bin/python3 ]]; then - rm ~/bin/python3 + echo "Installing the 'distro' module" + python3 -m ensurepip + python3 -mpip install -U pip wheel + python3 -mpip install distro +else + # In some distros (e.g. Flatcar), Python is not in PATH; in that case create a symlink under ~/bin + if ! which python3 > /dev/null 2>&1; then + if [[ ! $python_version < "03.00" ]]; then + echo "Python ($python) is not in PATH; creating symbolic links as ~/bin/python and ~/bin/python3" + ln -s "$python" ~/bin/python + ln -s "$python" ~/bin/python3 + fi + fi fi -ln -s ~/bin/pypy*/bin/pypy3.7 ~/bin/python3 -# Note that we place $HOME/bin at the front of PATH, so the tools in ~/bin (including python) will +echo "Installing Agent modules to ~/bin" +unzip.py "$agent_package" ~/bin/WALinuxAgent +unzip.py ~/bin/WALinuxAgent/bin/WALinuxAgent-*.egg ~/bin/WALinuxAgent/bin/WALinuxAgent.egg +mv ~/bin/WALinuxAgent/bin/WALinuxAgent.egg/azurelinuxagent ~/bin +rm -rf ~/bin/WALinuxAgent + +# +# Create ~/bin/agent-env to set PATH and PYTHONPATH. +# +# We add $HOME/bin to the front of PATH, so tools in ~/bin (including python) will # take precedence over system tools with the same name. +# +# We set PYTHONPATH to include the location of the agent modules installed on the VM image and also +# the test modules we copied to ~/bin. +# +# +echo "Creating ~/bin/agent-env to set PATH and PYTHONPATH" +echo ' +if [[ $PATH != *"$HOME/bin"* ]]; then + PATH="$HOME/bin:$PATH:" +fi + +export PYTHONPATH="$HOME/bin" +' > ~/bin/agent-env +chmod u+x ~/bin/agent-env + +echo "Adding ~/bin/agent-env to ~/.bash_profile" # In some distros, e.g. Flatcar, .bash_profile is a symbolic link to a read-only file. Make a copy in # that case. -echo "Adding ~/bin to PATH in .bash_profile" if test -e ~/.bash_profile && ls -l .bash_profile | grep '\->'; then cp ~/.bash_profile ~/.bash_profile-bk rm ~/.bash_profile mv ~/.bash_profile-bk ~/.bash_profile fi -if ! test -e ~/.bash_profile || ! grep '# Add $HOME/bin to $PATH' ~/.bash_profile > /dev/null; then - echo ' -# Add $HOME/bin to $PATH -if [[ $PATH != *"$HOME/bin"* ]]; then - PATH="$HOME/bin:$PATH:" -fi +if ! test -e ~/.bash_profile || ! grep '~/bin/agent-env' ~/.bash_profile > /dev/null; then + echo 'source ~/bin/agent-env ' >> ~/.bash_profile fi -echo "python3 has been set to $(which python3)" +source ~/bin/agent-env +echo "PATH=$PATH" +echo "PYTHONPATH=$PYTHONPATH" +echo "python3 -> $(which python3)" python3 --version - diff --git a/tests_e2e/tests/lib/agent_log.py b/tests_e2e/tests/lib/agent_log.py new file mode 100644 index 0000000000..61b4ca85cd --- /dev/null +++ b/tests_e2e/tests/lib/agent_log.py @@ -0,0 +1,446 @@ +# +# Copyright 2018 Microsoft Corporation +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# + +import os +import re + +from datetime import datetime +from pathlib import Path +from typing import Any, AnyStr, Dict, Iterable, List, Match + +from azurelinuxagent.common.version import DISTRO_NAME, DISTRO_VERSION + + +class AgentLogRecord: + """ + Represents an entry in the Agent's log (note that entries can span multiple lines in the log) + + Sample message: + 2023-03-13T15:44:04.906673Z INFO ExtHandler ExtHandler Azure Linux Agent (Goal State Agent version 9.9.9.9) + """ + text: str # Full text of the record + when: str # Timestamp (as text) + level: str # Level (INFO, ERROR, etc) + thread: str # Thread name (e.g. 'Daemon', 'ExtHandler') + prefix: str # Prefix (e.g. 'Daemon', 'ExtHandler', ) + message: str # Message + + @staticmethod + def from_match(match: Match[AnyStr]): + """Builds a record from a regex match""" + record = AgentLogRecord() + record.text = match.string + record.when = match.group("when") + record.level = match.group("level") + record.thread = match.group("thread") + record.prefix = match.group("prefix") + record.message = match.group("message") + return record + + @staticmethod + def from_dictionary(dictionary: Dict[str, str]): + """Deserializes from a dict""" + record = AgentLogRecord() + record.text = dictionary["text"] + record.when = dictionary["when"] + record.level = dictionary["level"] + record.thread = dictionary["thread"] + record.prefix = dictionary["prefix"] + record.message = dictionary["message"] + return record + + @property + def timestamp(self) -> datetime: + return datetime.strptime(self.when, u'%Y-%m-%dT%H:%M:%S.%fZ') + + +class AgentLog(object): + """ + Provides facilities to parse and/or extract errors from the agent's log. + """ + def __init__(self, path: Path = Path('/var/log/waagent.log')): + self._path: Path = path + self._counter_table: Dict[str, int] = {} + + def get_errors(self) -> List[AgentLogRecord]: + """ + Returns any ERRORs or WARNINGs in the agent log. + + The function filters out known/uninteresting errors, which are kept in the 'ignore_list' variable. + """ + # + # Items in this list are known errors and they are ignored. + # + # * 'message' - A regular expression matched using re.search; be sure to escape any regex metacharacters. A positive match indicates + # that the error should be ignored + # * 'if' - A lambda that takes as parameter an AgentLogRecord representing an error and returns true if the error should be ignored + # + ignore_rules = [ + # + # NOTE: This list was taken from the older agent tests and needs to be cleaned up. Feel free to un-comment rules as new tests are added. + # + # # This warning is expected on CentOS/RedHat 7.4, 7.8 and Redhat 7.6 + # { + # 'message': r"Move rules file 70-persistent-net.rules to /var/lib/waagent/70-persistent-net.rules", + # 'if': lambda r: + # re.match(r"(((centos|redhat)7\.[48])|(redhat7\.6)|(redhat8\.2))\D*", DISTRO_NAME, flags=re.IGNORECASE) is not None + # and r.level == "WARNING" + # and r.prefix == "ExtHandler" and r.thread in ("", "EnvHandler") + # }, + # # This warning is expected on SUSE 12 + # { + # 'message': r"WARNING EnvHandler ExtHandler Move rules file 75-persistent-net-generator.rules to /var/lib/waagent/75-persistent-net-generator.rules", + # 'if': lambda _: re.match(r"((sles15\.2)|suse12)\D*", DISTRO_NAME, flags=re.IGNORECASE) is not None + # }, + # # The following message is expected to log an error if systemd is not enabled on it + # { + # 'message': r"Did not detect Systemd, unable to set wa(|linux)agent-network-setup.service", + # 'if': lambda _: not self._is_systemd() + # }, + # # + # # Journalctl in Debian 8.11 does not have the --utc option by default. + # # Ignoring this error for Deb 8 as its not a blocker and since Deb 8 is old and not widely used + # { + # 'message': r"journalctl: unrecognized option '--utc'", + # 'if': lambda r: re.match(r"(debian8\.11)\D*", DISTRO_NAME, flags=re.IGNORECASE) is not None and r.level == "WARNING" + # }, + # # Sometimes it takes the Daemon some time to identify primary interface and the route to Wireserver, + # # ignoring those errors if they come from the Daemon. + # { + # 'message': r"(No route exists to \d+\.\d+\.\d+\.\d+|" + # r"Could not determine primary interface, please ensure \/proc\/net\/route is correct|" + # r"Contents of \/proc\/net\/route:|Primary interface examination will retry silently)", + # 'if': lambda r: r.prefix == "Daemon" + # }, + # + # # This happens in CENTOS and RHEL when waagent attempt to format and mount the error while cloud init is already doing it + # # 2021-09-20T06:45:57.253801Z WARNING Daemon Daemon Could not mount resource disk: mount: /dev/sdb1 is already mounted or /mnt/resource busy + # # /dev/sdb1 is already mounted on /mnt/resource + # { + # 'message': r"Could not mount resource disk: mount: \/dev\/sdb1 is already mounted or \/mnt\/resource busy", + # 'if': lambda r: + # re.match(r"((centos7\.8)|(redhat7\.8)|(redhat7\.6)|(redhat8\.2))\D*", DISTRO_NAME, flags=re.IGNORECASE) + # and r.level == "WARNING" + # and r.prefix == "Daemon" + # }, + # # + # # 2021-09-20T06:45:57.246593Z ERROR Daemon Daemon Command: [mkfs.ext4 -F /dev/sdb1], return code: [1], result: [mke2fs 1.42.9 (28-Dec-2013) + # # /dev/sdb1 is mounted; will not make a filesystem here! + # { + # 'message': r"Command: \[mkfs.ext4 -F \/dev\/sdb1\], return code: \[1\]", + # 'if': lambda r: + # re.match(r"((centos7\.8)|(redhat7\.8)|(redhat7\.6)|(redhat8\.2))\D*", DISTRO_NAME, flags=re.IGNORECASE) + # and r.level == "ERROR" + # and r.prefix == "Daemon" + # }, + # # + # # 2022-01-20T06:52:21.515447Z WARNING Daemon Daemon Fetch failed: [HttpError] [HTTP Failed] GET https://dcrgajhx62.blob.core.windows.net/$system/edprpwqbj6.5c2ddb5b-d6c3-4d73-9468-54419ca87a97.vmSettings -- IOError timed out -- 6 attempts made + # # + # # The daemon does not need the artifacts profile blob, but the request is done as part of protocol initialization. This timeout can be ignored, if the issue persist the log would include additional instances. + # # + # { + # 'message': r"\[HTTP Failed\] GET https://.*\.vmSettings -- IOError timed out", + # 'if': lambda r: r.level == "WARNING" and r.prefix == "Daemon" + # }, + # + # Probably the agent should log this as INFO, but for now it is a warning + # e.g. + # 2021-07-29T04:40:17.190879Z WARNING EnvHandler ExtHandler Dhcp client is not running. + # old agents logs don't have a prefix of thread and/or logger names. + { + 'message': r"Dhcp client is not running.", + 'if': lambda r: r.level == "WARNING" + }, + # Known bug fixed in the current agent, but still present in older daemons + # + { + 'message': r"\[CGroupsException\].*Error: join\(\) argument must be str, bytes, or os.PathLike object, not 'NoneType'", + 'if': lambda r: r.level == "WARNING" and r.prefix == "Daemon" + }, + # This warning is expected on when WireServer gives us the incomplete goalstate without roleinstance data + { + 'message': r"\[ProtocolError\] Fetched goal state without a RoleInstance", + }, + # + # Download warnings (manifest and zips). + # + # Examples: + # 2021-03-31T03:48:35.216494Z WARNING ExtHandler ExtHandler Fetch failed: [HttpError] [HTTP Failed] GET https://zrdfepirv2cbn04prdstr01a.blob.core.windows.net/f72653efd9e349ed9842c8b99e4c1712/Microsoft.CPlat.Core_NullSeqA_useast2euap_manifest.xml -- IOError ('The read operation timed out',) -- 1 attempts made + # 2021-03-31T06:54:29.655861Z WARNING ExtHandler ExtHandler Fetch failed: [HttpError] [HTTP Retry] GET http://168.63.129.16:32526/extensionArtifact -- Status Code 502 -- 1 attempts made + # 2021-03-31T06:43:17.806663Z WARNING ExtHandler ExtHandler Download failed, switching to host plugin + { + 'message': r"(Fetch failed: \[HttpError\] .+ GET .+ -- [0-9]+ attempts made)|(Download failed, switching to host plugin)", + 'if': lambda r: r.level == "WARNING" and r.prefix == "ExtHandler" and r.thread == "ExtHandler" + }, + # 2021-07-09T01:46:53.307959Z INFO MonitorHandler ExtHandler [CGW] Disabling resource usage monitoring. Reason: Check on cgroups failed: + # [CGroupsException] The agent's cgroup includes unexpected processes: ['[PID: 2367] UNKNOWN'] + { + 'message': r"The agent's cgroup includes unexpected processes: \[('\[PID:\s?\d+\]\s*UNKNOWN'(,\s*)?)+\]" + }, + # 2021-12-20T07:46:23.020197Z INFO ExtHandler ExtHandler [CGW] The agent's process is not within a memory cgroup + # Ignoring this since memory cgroup(MemoryAccounting) not enabled. + { + 'message': r"The agent's process is not within a memory cgroup", + 'if': lambda r: re.match(r"(((centos|redhat)7\.[48])|(redhat7\.6)|(redhat8\.2))\D*", DISTRO_NAME, flags=re.IGNORECASE) + }, + # + # Ubuntu 22 uses cgroups v2, so we need to ignore these: + # + # 2023-03-15T20:47:56.684849Z INFO ExtHandler ExtHandler [CGW] The CPU cgroup controller is not mounted + # 2023-03-15T20:47:56.685392Z INFO ExtHandler ExtHandler [CGW] The memory cgroup controller is not mounted + # 2023-03-15T20:47:56.688576Z INFO ExtHandler ExtHandler [CGW] The agent's process is not within a CPU cgroup + # 2023-03-15T20:47:56.688981Z INFO ExtHandler ExtHandler [CGW] The agent's process is not within a memory cgroup + # + { + 'message': r"\[CGW\]\s*(The (CPU|memory) cgroup controller is not mounted)|(The agent's process is not within a (CPU|memory) cgroup)", + 'if': lambda r: DISTRO_NAME == 'ubuntu' and DISTRO_VERSION >= '22.00' + }, + # + # 2022-02-09T04:50:37.384810Z ERROR ExtHandler ExtHandler Error fetching the goal state: [ProtocolError] GET vmSettings [correlation ID: 2bed9b62-188e-4668-b1a8-87c35cfa4927 eTag: 7031887032544600793]: [Internal error in HostGAPlugin] [HTTP Failed] [502: Bad Gateway] b'{ "errorCode": "VMArtifactsProfileBlobContentNotFound", "message": "VM artifacts profile blob has no content in it.", "details": ""}' + # + # Fetching the goal state may catch the HostGAPlugin in the process of computing the vmSettings. This can be ignored, if the issue persist the log would include other errors as well. + # + { + 'message': r"\[ProtocolError\] GET vmSettings.*VMArtifactsProfileBlobContentNotFound", + 'if': lambda r: r.level == "ERROR" + }, + # + # 2022-11-01T02:45:55.513692Z ERROR ExtHandler ExtHandler Error fetching the goal state: [ProtocolError] GET vmSettings [correlation ID: 616873cc-be87-41b6-83b7-ef3a76370628 eTag: 3693655388249891516]: [Internal error in HostGAPlugin] [HTTP Failed] [502: Bad Gateway] { "errorCode": "InternalError", "message": "The server encountered an internal error. Please retry the request.", "details": ""} + # + # Fetching the goal state may catch the HostGAPlugin in the process of computing the vmSettings. This can be ignored, if the issue persist the log would include other errors as well. + # + { + 'message': r"\[ProtocolError\] GET vmSettings.*Please retry the request", + 'if': lambda r: r.level == "ERROR" + }, + # + # 2022-08-16T01:50:10.759502Z ERROR ExtHandler ExtHandler Error fetching the goal state: [ProtocolError] GET vmSettings [correlation ID: e162f7c3-8d0c-4a9b-a987-8f9ec0699dae eTag: 9757461589808963322]: Timeout + # + # Fetching the goal state may hit timeouts in the HostGAPlugin's vmSettings. This can be ignored, if the issue persist the log would include other errors as well. + # + { + 'message': r"\[ProtocolError\] GET vmSettings.*Timeout", + 'if': lambda r: r.level == "ERROR" + }, + # + # 2021-12-29T06:50:49.904601Z ERROR ExtHandler ExtHandler Error fetching the goal state: [ProtocolError] Error fetching goal state Inner error: [ResourceGoneError] [HTTP Failed] [410: Gone] The page you requested was removed. + # 2022-03-21T02:44:03.770017Z ERROR ExtHandler ExtHandler Error fetching the goal state: [ProtocolError] Error fetching goal state Inner error: [ResourceGoneError] Resource is gone + # 2022-02-16T04:46:50.477315Z WARNING Daemon Daemon Fetching the goal state failed: [ResourceGoneError] [HTTP Failed] [410: Gone] b'\n\n ResourceNotAvailable\n The resource requested is no longer available. Please refresh your cache.\n
\n
' + # + # ResourceGone can happen if we are fetching one of the URIs in the goal state and a new goal state arrives + { + 'message': r"(?s)(Fetching the goal state failed|Error fetching goal state|Error fetching the goal state).*(\[ResourceGoneError\]|\[410: Gone\]|Resource is gone)", + 'if': lambda r: r.level in ("WARNING", "ERROR") + }, + # + # 2022-12-02T05:45:51.771876Z ERROR ExtHandler ExtHandler Error fetching the goal state: [ProtocolError] [Wireserver Exception] [HttpError] [HTTP Failed] GET http://168.63.129.16/machine/ -- IOError [Errno 104] Connection reset by peer -- 6 attempts made + # + { + 'message': r"\[HttpError\] \[HTTP Failed\] GET http://168.63.129.16/machine/ -- IOError \[Errno 104\] Connection reset by peer", + 'if': lambda r: r.level in ("WARNING", "ERROR") + }, + # + # 2022-03-08T03:03:23.036161Z WARNING ExtHandler ExtHandler Fetch failed from [http://168.63.129.16:32526/extensionArtifact]: [HTTP Failed] [400: Bad Request] b'' + # 2022-03-08T03:03:23.042008Z WARNING ExtHandler ExtHandler Fetch failed: [ProtocolError] Fetch failed from [http://168.63.129.16:32526/extensionArtifact]: [HTTP Failed] [400: Bad Request] b'' + # + # Warning downloading extension manifest. If the issue persists, this would cause errors elsewhere so safe to ignore + { + 'message': r"\[http://168.63.129.16:32526/extensionArtifact\]: \[HTTP Failed\] \[400: Bad Request\]", + 'if': lambda r: r.level == "WARNING" + }, + # + # 2022-03-29T05:52:10.089958Z WARNING ExtHandler ExtHandler An error occurred while retrieving the goal state: [ProtocolError] GET vmSettings [correlation ID: da106cf5-83a0-44ec-9484-d0e9223847ab eTag: 9856274988128027586]: Timeout + # + # Ignore warnings about timeouts in vmSettings; if the condition persists, an error will occur elsewhere. + # + { + 'message': r"GET vmSettings \[[^]]+\]: Timeout", + 'if': lambda r: r.level == "WARNING" + }, + # + # 2022-09-30T02:48:33.134649Z WARNING MonitorHandler ExtHandler Error in SendHostPluginHeartbeat: [HttpError] [HTTP Failed] GET http://168.63.129.16:32526/health -- IOError timed out -- 1 attempts made --- [NOTE: Will not log the same error for the next hour] + # + # Ignore timeouts in the HGAP's health API... those are tracked in the HGAP dashboard so no need to worry about them on test runs + # + { + 'message': r"SendHostPluginHeartbeat:.*GET http://168.63.129.16:32526/health.*timed out", + 'if': lambda r: r.level == "WARNING" + }, + # + # 2022-09-30T03:09:25.013398Z WARNING MonitorHandler ExtHandler Error in SendHostPluginHeartbeat: [ResourceGoneError] [HTTP Failed] [410: Gone] + # + # ResourceGone should not happen very often, since the monitor thread already refreshes the goal state before sending the HostGAPlugin heartbeat. Errors can still happen, though, since the goal state + # can change in-between the time at which the monitor thread refreshes and the time at which it sends the heartbeat. Ignore these warnings unless there are 2 or more of them. + # + { + 'message': r"SendHostPluginHeartbeat:.*ResourceGoneError.*410", + 'if': lambda r: r.level == "WARNING" and self._increment_counter("SendHostPluginHeartbeat-ResourceGoneError-410") < 2 # ignore unless there are 2 or more instances + }, + # 2023-01-18T02:58:25.589492Z ERROR SendTelemetryHandler ExtHandler Event: name=WALinuxAgent, op=ReportEventErrors, message=DroppedEventsCount: 1 + # Reasons (first 5 errors): [ProtocolError] [Wireserver Exception] [ProtocolError] [Wireserver Failed] URI http://168.63.129.16/machine?comp=telemetrydata [HTTP Failed] Status Code 400: Traceback (most recent call last): + # + { + 'message': r"(?s)SendTelemetryHandler.*http://168.63.129.16/machine\?comp=telemetrydata.*Status Code 400", + 'if': lambda _: self._increment_counter("SendTelemetryHandler-telemetrydata-Status Code 400") < 2 # ignore unless there are 2 or more instances + }, + # + # Ignore these errors in flatcar: + # + # 1) 2023-03-16T14:30:33.091427Z ERROR Daemon Daemon Failed to mount resource disk [ResourceDiskError] unable to detect disk topology + # 2) 2023-03-16T14:30:33.091708Z ERROR Daemon Daemon Event: name=WALinuxAgent, op=ActivateResourceDisk, message=[ResourceDiskError] unable to detect disk topology, duration=0 + # 3) 2023-03-16T14:30:34.660976Z WARNING ExtHandler ExtHandler Fetch failed: [HttpError] HTTPS is unavailable and required + # 4) 2023-03-16T14:30:34.800112Z ERROR ExtHandler ExtHandler Unable to setup the persistent firewall rules: [Errno 30] Read-only file system: '/lib/systemd/system/waagent-network-setup.service' + # + # 1, 2) under investigation + # 3) There seems to be a configuration issue in flatcar that prevents python from using HTTPS when trying to reach storage. This does not produce any actual errors, since the agent fallbacks to the HGAP. + # 4) Remove this when bug 17523033 is fixed. + # + { + 'message': r"(Failed to mount resource disk)|(unable to detect disk topology)", + 'if': lambda r: r.prefix == 'Daemon' and DISTRO_NAME == 'flatcar' + }, + { + 'message': r"(HTTPS is unavailable and required)|(Unable to setup the persistent firewall rules.*Read-only file system)", + 'if': lambda r: DISTRO_NAME == 'flatcar' + }, + # + # AzureSecurityLinuxAgent fails to install on a few distros (e.g. Debian 11) + # + # 2023-03-16T14:29:48.798415Z ERROR ExtHandler ExtHandler Event: name=Microsoft.Azure.Security.Monitoring.AzureSecurityLinuxAgent, op=Install, message=[ExtensionOperationError] Non-zero exit code: 56, /var/lib/waagent/Microsoft.Azure.Security.Monitoring.AzureSecurityLinuxAgent-2.21.115/handler.sh install + # + { + 'message': r"Microsoft.Azure.Security.Monitoring.AzureSecurityLinuxAgent.*op=Install.*Non-zero exit code: 56,", + }, + + ] + + def is_error(r: AgentLogRecord) -> bool: + return r.level in ('ERROR', 'WARNING') or any(err in r.text for err in ['Exception', 'Traceback', '[CGW]']) + + errors = [] + primary_interface_error = None + provisioning_complete = False + + for record in self.read(): + if is_error(record) and not self.matches_ignore_rule(record, ignore_rules): + # Handle "/proc/net/route contains no routes" and "/proc/net/route is missing headers" as a special case + # since it can take time for the primary interface to come up, and we don't want to report transient + # errors as actual errors. The last of these errors in the log will be reported + if "/proc/net/route contains no routes" in record.text or "/proc/net/route is missing headers" in record.text and record.prefix == "Daemon": + primary_interface_error = record + provisioning_complete = False + else: + errors.append(record) + + if "Provisioning complete" in record.text and record.prefix == "Daemon": + provisioning_complete = True + + # Keep the "no routes found" as a genuine error message if it was never corrected + if primary_interface_error is not None and not provisioning_complete: + errors.append(primary_interface_error) + + return errors + + @staticmethod + def _is_systemd(): + # Taken from azurelinuxagent/common/osutil/systemd.py; repeated here because it is available only on agents >= 2.3 + return os.path.exists("/run/systemd/system/") + + def _increment_counter(self, counter_name) -> int: + """ + Keeps a table of counters indexed by the given 'counter_name'. Each call to the function + increments the value of that counter and returns the new value. + """ + count = self._counter_table.get(counter_name) + count = 1 if count is None else count + 1 + self._counter_table[counter_name] = count + return count + + @staticmethod + def matches_ignore_rule(record: AgentLogRecord, ignore_rules: List[Dict[str, Any]]) -> bool: + """ + Returns True if the given 'record' matches any of the 'ignore_rules' + """ + return any(re.search(rule['message'], record.message) is not None and ('if' not in rule or rule['if'](record)) for rule in ignore_rules) + + # The format of the log has changed over time and the current log may include records from different sources. Most records are single-line, but some of them + # can span across multiple lines. We will assume records always start with a line similar to the examples below; any other lines will be assumed to be part + # of the record that is being currently parsed. + # + # Newer Agent: 2019-11-27T22:22:48.123985Z VERBOSE ExtHandler ExtHandler Report vm agent status + # 2021-03-30T19:45:33.793213Z INFO ExtHandler [Microsoft.Azure.Security.Monitoring.AzureSecurityLinuxAgent-2.14.64] Target handler state: enabled [incarnation 3] + # + # 2.2.46: the date time was changed to ISO-8601 format but the thread name was not added. + # 2021-05-28T01:17:40.683072Z INFO ExtHandler Wire server endpoint:168.63.129.16 + # 2021-05-28T01:17:40.683823Z WARNING ExtHandler Move rules file 70-persistent-net.rules to /var/lib/waagent/70-persistent-net.rules + # 2021-05-28T01:17:40.767600Z INFO ExtHandler Successfully added Azure fabric firewall rules + # + # Older Agent: 2021/03/30 19:35:35.971742 INFO Daemon Azure Linux Agent Version:2.2.45 + # + # Extension: 2021/03/30 19:45:31 Azure Monitoring Agent for Linux started to handle. + # 2021/03/30 19:45:31 [Microsoft.Azure.Monitor.AzureMonitorLinuxAgent-1.7.0] cwd is /var/lib/waagent/Microsoft.Azure.Monitor.AzureMonitorLinuxAgent-1.7.0 + # + _NEWER_AGENT_RECORD = re.compile(r'(?P[\d-]+T[\d:.]+Z)\s(?PVERBOSE|INFO|WARNING|ERROR)\s(?P\S+)\s(?P(Daemon)|(ExtHandler)|(\[\S+\]))\s(?P.*)') + _2_2_46_AGENT_RECORD = re.compile(r'(?P[\d-]+T[\d:.]+Z)\s(?PVERBOSE|INFO|WARNING|ERROR)\s(?P)(?PDaemon|ExtHandler|\[\S+\])\s(?P.*)') + _OLDER_AGENT_RECORD = re.compile(r'(?P[\d/]+\s[\d:.]+)\s(?PVERBOSE|INFO|WARNING|ERROR)\s(?P)(?P\S*)\s(?P.*)') + _EXTENSION_RECORD = re.compile(r'(?P[\d/]+\s[\d:.]+)\s(?P)(?P)((?P\[[^\]]+\])\s)?(?P.*)') + + def read(self) -> Iterable[AgentLogRecord]: + """ + Generator function that returns each of the entries in the agent log parsed as AgentLogRecords. + + The function can be used following this pattern: + + for record in read_agent_log(): + ... do something... + + """ + if not self._path.exists(): + raise IOError('{0} is not found'.format(self._path)) + + def match_record(): + for regex in [self._NEWER_AGENT_RECORD, self._2_2_46_AGENT_RECORD, self._OLDER_AGENT_RECORD]: + m = regex.match(line) + if m is not None: + return m + # The extension regex also matches the old agent records, so it needs to be last + return self._EXTENSION_RECORD.match(line) + + def complete_record(): + record.text = record.text.rstrip() # the text includes \n + if extra_lines != "": + record.text = record.text + "\n" + extra_lines.rstrip() + record.message = record.message + "\n" + extra_lines.rstrip() + return record + + with self._path.open() as file_: + record = None + extra_lines = "" + + line = file_.readline() + while line != "": # while not EOF + match = match_record() + if match is not None: + if record is not None: + yield complete_record() + record = AgentLogRecord.from_match(match) + extra_lines = "" + else: + extra_lines = extra_lines + line + line = file_.readline() + + if record is not None: + yield complete_record() diff --git a/tests_e2e/tests/lib/agent_test.py b/tests_e2e/tests/lib/agent_test.py index 6e79ad4f2f..22f865a6f3 100644 --- a/tests_e2e/tests/lib/agent_test.py +++ b/tests_e2e/tests/lib/agent_test.py @@ -20,6 +20,7 @@ import sys from abc import ABC, abstractmethod +from typing import Any, Dict, List from tests_e2e.tests.lib.agent_test_context import AgentTestContext from tests_e2e.tests.lib.logging import log @@ -44,6 +45,10 @@ def __init__(self, context: AgentTestContext): def run(self): pass + def get_ignore_error_rules(self) -> List[Dict[str, Any]]: + # Tests can override this method to return a list with rules to ignore errors in the agent log (see agent_log.py for sample rules). + return [] + @classmethod def run_from_command_line(cls): """ diff --git a/tests_e2e/tests/lib/ssh_client.py b/tests_e2e/tests/lib/ssh_client.py index e0c07420e6..50ff9f8086 100644 --- a/tests_e2e/tests/lib/ssh_client.py +++ b/tests_e2e/tests/lib/ssh_client.py @@ -16,6 +16,8 @@ # See the License for the specific language governing permissions and # limitations under the License. # +import re + from pathlib import Path from tests_e2e.tests.lib import shell @@ -33,14 +35,17 @@ def run_command(self, command: str, use_sudo: bool = False) -> str: Executes the given command over SSH and returns its stdout. If the command returns a non-zero exit code, the function raises a RunCommandException. """ + if re.match(r"^\s*sudo\s*", command): + raise Exception("Do not include 'sudo' in the 'command' argument, use the 'use_sudo' parameter instead") + destination = f"ssh://{self._username}@{self._ip_address}:{self._port}" # Note that we add ~/bin to the remote PATH, since Python (Pypy) and other test tools are installed there. # Note, too, that when using sudo we need to carry over the value of PATH to the sudo session - sudo = "sudo env PATH=$PATH" if use_sudo else '' + sudo = "sudo env PATH=$PATH PYTHONPATH=$PYTHONPATH" if use_sudo else '' return shell.run_command([ "ssh", "-o", "StrictHostKeyChecking=no", "-i", self._private_key_file, destination, - f"PATH=~/bin:$PATH;{sudo} {command}"]) + f"source ~/bin/agent-env;{sudo} {command}"]) @staticmethod def generate_ssh_key(private_key_file: Path): @@ -52,10 +57,19 @@ def generate_ssh_key(private_key_file: Path): def get_architecture(self): return self.run_command("uname -m").rstrip() - def copy(self, source: Path, target: Path, remote_source: bool = False, remote_target: bool = False, recursive: bool = False): + def copy_to_node(self, local_path: Path, remote_path: Path, recursive: bool = False) -> None: + """ + File copy to a remote node + """ + self._copy(local_path, remote_path, remote_source=False, remote_target=True, recursive=recursive) + + def copy_from_node(self, remote_path: Path, local_path: Path, recursive: bool = False) -> None: """ - Copy file from local to remote machine + File copy from a remote node """ + self._copy(remote_path, local_path, remote_source=True, remote_target=False, recursive=recursive) + + def _copy(self, source: Path, target: Path, remote_source: bool, remote_target: bool, recursive: bool) -> None: if remote_source: source = f"{self._username}@{self._ip_address}:{source}" if remote_target: