Check agent log for errors; install test libraries (#2787)

* Check agent log for errors; install test libraries --------- Co-authored-by: narrieta <narrieta>
Azure · Mar 22, 2023 · 1640510 · 1640510
1 parent e9b51d7
commit 1640510
Show file tree

Hide file tree

Showing 8 changed files with 709 additions and 77 deletions.
diff --git a/tests_e2e/orchestrator/lib/agent_test_suite.py b/tests_e2e/orchestrator/lib/agent_test_suite.py
@@ -16,6 +16,7 @@
 #
 import contextlib
 import datetime
+import json
 import logging
 import re
 import traceback
@@ -29,6 +30,7 @@
 #     E0401: Unable to import 'lisa' (import-error)
 #     etc
 from lisa import (  # pylint: disable=E0401
+    Environment,
     Logger,
     Node,
     notifier,
@@ -43,11 +45,13 @@
 import makepkg
 from azurelinuxagent.common.version import AGENT_VERSION
 from tests_e2e.orchestrator.lib.agent_test_loader import TestSuiteInfo
+from tests_e2e.tests.lib.agent_log import AgentLog
 from tests_e2e.tests.lib.agent_test import TestSkipped
 from tests_e2e.tests.lib.agent_test_context import AgentTestContext
 from tests_e2e.tests.lib.identifiers import VmIdentifier
 from tests_e2e.tests.lib.logging import log as agent_test_logger  # Logger used by the tests
 from tests_e2e.tests.lib.logging import set_current_thread_log
+from tests_e2e.tests.lib.agent_log import AgentLogRecord
 from tests_e2e.tests.lib.shell import run_command
 from tests_e2e.tests.lib.ssh_client import SshClient
 
@@ -256,7 +260,7 @@ def _clean_up(self) -> None:
 
     def _setup_node(self) -> None:
         """
-        Prepares the remote node for executing the test suite.
+        Prepares the remote node for executing the test suite (installs tools and the test agent, etc)
         """
         self._log.info("")
         self._log.info("************************************** [Node Setup] **************************************")
@@ -265,23 +269,27 @@ def _setup_node(self) -> None:
         self._log.info("Resource Group: %s", self.context.vm.resource_group)
         self._log.info("")
 
-        self._install_tools_on_node()
+        self.context.ssh_client.run_command("mkdir -p ~/bin/tests_e2e/tests; touch ~/bin/agent-env")
 
-        if self.context.is_vhd:
-            self._log.info("Using a VHD; will not install the test Agent.")
-        else:
-            self._install_agent_on_node()
+        # Copy the test tools
+        tools_path = self.context.test_source_directory/"orchestrator"/"scripts"
+        tools_target_path = Path("~/bin")
+        self._log.info("Copying %s to %s:%s", tools_path, self.context.node.name, tools_target_path)
+        self.context.ssh_client.copy_to_node(tools_path, tools_target_path, recursive=True)
 
-    def _install_tools_on_node(self) -> None:
-        """
-        Installs the test tools on the test node
-        """
-        self.context.ssh_client.run_command("mkdir -p ~/bin")
+        # Copy the test libraries
+        lib_path = self.context.test_source_directory/"tests"/"lib"
+        lib_target_path = Path("~/bin/tests_e2e/tests")
+        self._log.info("Copying %s to %s:%s", lib_path, self.context.node.name, lib_target_path)
+        self.context.ssh_client.copy_to_node(lib_path, lib_target_path, recursive=True)
 
-        tools_path = self.context.test_source_directory/"orchestrator"/"scripts"
-        self._log.info(f"Copying {tools_path} to the test node")
-        self.context.ssh_client.copy(tools_path, Path("~/bin"), remote_target=True, recursive=True)
+        # Copy the test agent
+        agent_package_path: Path = self._get_agent_package_path()
+        agent_package_target_path = Path("~/bin")/agent_package_path.name
+        self._log.info("Copying %s to %s:%s", agent_package_path, self.context.node.name, agent_package_target_path)
+        self.context.ssh_client.copy_to_node(agent_package_path, agent_package_target_path)
 
+        # Copy Pypy
         if self.context.ssh_client.get_architecture() == "aarch64":
             pypy_path = Path("/tmp/pypy3.7-arm64.tar.bz2")
             pypy_download = "https://downloads.python.org/pypy/pypy3.7-v7.3.5-aarch64.tar.bz2"
@@ -292,26 +300,21 @@ def _install_tools_on_node(self) -> None:
         if not pypy_path.exists():
             self._log.info(f"Downloading {pypy_download} to {pypy_path}")
             run_command(["wget", pypy_download, "-O",  pypy_path])
-        self._log.info(f"Copying {pypy_path} to the test node")
-        self.context.ssh_client.copy(pypy_path, Path("~/bin/pypy3.7.tar.bz2"), remote_target=True)
+        pypy_target_path = Path("~/bin/pypy3.7.tar.bz2")
+        self._log.info("Copying %s to %s:%s", pypy_path, self.context.node.name, pypy_target_path)
+        self.context.ssh_client.copy_to_node(pypy_path, pypy_target_path)
 
-        self._log.info(f'Installing tools on the test node\n{self.context.ssh_client.run_command("~/bin/scripts/install-tools")}')
-        self._log.info(f'Remote commands will use {self.context.ssh_client.run_command("which python3")}')
+        # Install the tools and libraries
+        install_command = lambda: self.context.ssh_client.run_command(f"~/bin/scripts/install-tools --agent-package {agent_package_target_path}")
+        self._log.info('Installing tools on the test node\n%s', install_command())
+        self._log.info('Remote commands will use %s', self.context.ssh_client.run_command("which python3"))
 
-    def _install_agent_on_node(self) -> None:
-        """
-        Installs the given agent package on the test node.
-        """
-        agent_package_path: Path = self._get_agent_package_path()
-
-        self._log.info("Installing %s on %s", agent_package_path, self.context.node.name)
-        agent_package_remote_path = self.context.remote_working_directory/agent_package_path.name
-        self._log.info("Copying %s to %s:%s", agent_package_path, self.context.node.name, agent_package_remote_path)
-        self.context.ssh_client.copy(agent_package_path, agent_package_remote_path, remote_target=True)
-        stdout = self.context.ssh_client.run_command(f"install-agent --package {agent_package_remote_path} --version {AGENT_VERSION}", use_sudo=True)
-        self._log.info(stdout)
-
-        self._log.info("The agent was installed successfully.")
+        # Install the agent
+        if self.context.is_vhd:
+            self._log.info("Using a VHD; will not install the Test Agent.")
+        else:
+            install_command = lambda: self.context.ssh_client.run_command(f"install-agent --package {agent_package_target_path} --version {AGENT_VERSION}", use_sudo=True)
+            self._log.info("Installing the Test Agent on %s\n%s", self.context.node.name, install_command())
 
     def _collect_node_logs(self) -> None:
         """
@@ -327,23 +330,25 @@ def _collect_node_logs(self) -> None:
             remote_path = "/tmp/waagent-logs.tgz"
             local_path = self.context.log_path/'{0}.tgz'.format(self.context.image_name)
             self._log.info("Copying %s:%s to %s", self.context.node.name, remote_path, local_path)
-            self.context.ssh_client.copy(remote_path, local_path, remote_source=True)
+            self.context.ssh_client.copy_from_node(remote_path, local_path)
+
         except:  # pylint: disable=bare-except
             self._log.exception("Failed to collect logs from the test machine")
 
     @TestCaseMetadata(description="", priority=0)
-    def agent_test_suite(self, node: Node, variables: Dict[str, Any], log_path: str, log: Logger) -> None:
+    def agent_test_suite(self, node: Node, environment: Environment, variables: Dict[str, Any], log_path: str, log: Logger) -> None:
         """
         Executes each of the AgentTests included in the "c_test_suites" variable (which is generated by the AgentTestSuitesCombinator).
         """
         self._set_context(node, variables, log_path, log)
 
-        with _set_thread_name(self.context.image_name):  # The thread name is added to self._log
-            # E1133: Non-iterable value self.context.test_suites is used in an iterating context (not-an-iterable)
-            # (OK to iterate, test_suite is a List)
+        # Set the thread name to the image; this name is added to self._log
+        with _set_thread_name(self.context.image_name):
+            # Log the environment's name and the variables received from the runbook (note that we need to expand the names of the test suites)
+            self._log.info("LISA Environment: %s", environment.name)
             self._log.info(
-                "Test suite parameters:  [test_suites: %s] [skip_setup: %s] [collect_logs: %s]",
-                [t.name for t in self.context.test_suites],  self.context.skip_setup, self.context.collect_logs)  # pylint: disable=E1133
+                "Runbook variables:\n%s",
+                '\n'.join([f"\t{name}: {value if name != 'c_test_suites' else [t.name for t in value] }" for name, value in variables.items()]))
 
             start_time: datetime.datetime = datetime.datetime.now()
             test_suite_success = True
@@ -361,6 +366,8 @@ def agent_test_suite(self, node: Node, variables: Dict[str, Any], log_path: str,
                     for suite in self.context.test_suites:  # pylint: disable=E1133
                         test_suite_success = self._execute_test_suite(suite) and test_suite_success
 
+                    test_suite_success = self._check_agent_log() and test_suite_success
+
                 finally:
                     collect = self.context.collect_logs
                     if collect == CollectLogs.Always or collect == CollectLogs.Failed and not test_suite_success:
@@ -475,6 +482,53 @@ def _execute_test_suite(self, suite: TestSuiteInfo) -> bool:
 
         return success
 
+    def _check_agent_log(self) -> bool:
+        """
+        Checks the agent log for errors; returns True on success (no errors in the log)
+        """
+        start_time: datetime.datetime = datetime.datetime.now()
+
+        self._log.info("Checking agent log on the test node")
+        output = self.context.ssh_client.run_command("check-agent-log.py -j")
+        errors = json.loads(output, object_hook=AgentLogRecord.from_dictionary)
+
+        # Individual tests may have rules to ignore known errors; filter those out
+        ignore_error_rules = []
+        # pylint seems to think self.context.test_suites is not iterable. Suppressing warning, since its type is List[TestSuiteInfo]
+        #  E1133: Non-iterable value self.context.test_suites is used in an iterating context (not-an-iterable)
+        for suite in self.context.test_suites:  # pylint: disable=E1133
+            for test in suite.tests:
+                ignore_error_rules.extend(test(self.context).get_ignore_error_rules())
+
+        if len(ignore_error_rules) > 0:
+            new = []
+            for e in errors:
+                if not AgentLog.matches_ignore_rule(e, ignore_error_rules):
+                    new.append(e)
+            errors = new
+
+        if len(errors) == 0:
+            # If no errors, we are done; don't create a log or test result.
+            self._log.info("There are no errors in the agent log")
+            return True
+
+        log_path: Path = self.context.log_path/f"CheckAgentLog-{self.context.image_name}.log"
+        message = f"Detected {len(errors)} error(s) in the agent log. See {log_path} for a full report."
+        self._log.info(message)
+
+        with set_current_thread_log(log_path):
+            agent_test_logger.info("Detected %s error(s) in the agent log:\n\n%s", len(errors), '\n'.join(['\t' + e.text for e in errors]))
+
+        self._report_test_result(
+            self.context.image_name,
+            "CheckAgentLog",
+            TestStatus.FAILED,
+            start_time,
+            message=message + '\n' + '\n'.join([e.text for e in errors[0:3]]),
+            add_exception_stack_trace=True)
+
+        return False
+
     @staticmethod
     def _report_test_result(
             suite_name: str,

diff --git a/tests_e2e/orchestrator/scripts/check-agent-log.py b/tests_e2e/orchestrator/scripts/check-agent-log.py
@@ -0,0 +1,49 @@
+#!/usr/bin/env python3
+
+# Microsoft Azure Linux Agent
+#
+# Copyright 2018 Microsoft Corporation
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+
+import argparse
+import json
+import sys
+
+from pathlib import Path
+from tests_e2e.tests.lib.agent_log import AgentLog
+
+try:
+    parser = argparse.ArgumentParser()
+    parser.add_argument('path', nargs='?', help='Path of the log file', default='/var/log/waagent.log')
+    parser.add_argument('-j', '--json', action='store_true', help='Produce a JSON report')
+    parser.set_defaults(json=False)
+    args = parser.parse_args()
+
+    error_list = AgentLog(Path(args.path)).get_errors()
+
+    if args.json:
+        print(json.dumps(error_list, default=lambda o: o.__dict__))
+    else:
+        if len(error_list) == 0:
+            print("No errors were found.")
+        else:
+            for e in error_list:
+                print(e.text)
+
+except Exception as e:
+    print(f"{e}", file=sys.stderr)
+    sys.exit(1)
+
+sys.exit(0)
diff --git a/...e2e/orchestrator/scripts/get-waagent-path → ...s_e2e/orchestrator/scripts/get-agent-path b/...e2e/orchestrator/scripts/get-waagent-path → ...s_e2e/orchestrator/scripts/get-agent-path
@@ -22,8 +22,6 @@
 #
 set -euo pipefail
 
-#
-
 # On most distros, 'waagent' is in PATH
 if which waagent 2> /dev/null; then
   exit 0
@@ -42,6 +40,7 @@ fi
 
 # try some well-known locations
 declare -a known_locations=(
+  "/usr/sbin/waagent"
   "/usr/share/oem/bin/waagent"
 )
 

diff --git a/tests_e2e/orchestrator/scripts/install-agent b/tests_e2e/orchestrator/scripts/install-agent
@@ -72,6 +72,14 @@ else
 fi
 echo "Service name: $service_name"
 
+#
+# Find the path to the Agent's executable file
+#
+waagent=$(get-agent-path)
+echo "Agent's path: $waagent"
+$waagent --version
+echo ""
+
 #
 # Install the package
 #
@@ -98,20 +106,13 @@ service-start $service_name
 #
 # Verify that the new agent is running and output its status.
 # Note that the extension handler may take some time to start so give 1 minute.
-# Also, note that the default Python is set to Pypy, so before executing 'waagent' we need to set the
-# Python path to the location of the azurelinuxagent module.
 #
 echo "Verifying agent installation..."
 
-PYTHONPATH=$(get-agent-pythonpath)
-export PYTHONPATH
-
 check-version() {
-  waagent=$(get-waagent-path)
-
   for i in {0..5}
   do
-    if $waagent --version | grep -E "Goal state agent:\s+$1" > /dev/null; then
+    if $waagent --version | grep -E "Goal state agent:\s+$version" > /dev/null; then
       return 0
     fi
     sleep 10
@@ -128,7 +129,6 @@ else
   exit_code=1
 fi
 
-waagent=$(get-waagent-path)
 $waagent --version
 printf "\n"
 service-status $service_name