Improvements in error handling on end-to-end tests #2716
@@ -14,9 +14,9 @@
# See the License for the specific language governing permissions and
# limitations under the License.
#

from collections.abc import Callable
from pathlib import Path
import shutil
from shutil import rmtree

import makepkg

@@ -25,69 +25,42 @@
    CustomScriptBuilder,
    Logger,
    Node,
    TestSuite,
    TestSuiteMetadata,
)
# E0401: Unable to import 'lisa.sut_orchestrator.azure.common' (import-error)
from lisa.sut_orchestrator.azure.common import get_node_context  # pylint: disable=E0401

from azurelinuxagent.common.version import AGENT_VERSION


class AgentTestSuite(TestSuite):
class AgentTestScenario(object):
Review comment: This class used to be a specialization of LISA's TestSuite in order to use the before_case/after_case protocol, but since I am not using that anymore there is no need for that class relationship. Now this is a plain AgentTestScenario class (it is very similar in concept to a DCR scenario). I renamed the class, but will not rename the .py file until my next PR; otherwise reviewing the actual code changes would be more difficult.
    """
    Base class for VM Agent tests. It provides initialization, cleanup, and utilities common to all VM Agent test suites.
    Instances of this class are used to execute Agent test scenarios. It also provides facilities to execute commands over SSH.
    """
    def __init__(self, metadata: TestSuiteMetadata):
        super().__init__(metadata)
        # The actual initialization happens in _initialize()
        self._log = None
        self._node = None
        self._subscription_id = None
        self._resource_group_name = None
        self._vm_name = None
        self._test_source_directory = None
        self._working_directory = None
        self._node_home_directory = None

    def before_case(self, *_, **kwargs) -> None:
        self._initialize(kwargs['node'], kwargs['log'])
        self._setup_node()

    def after_case(self, *_, **__) -> None:
        try:
            self._collect_node_logs()
        finally:
            self._clean_up()

    def _initialize(self, node: Node, log: Logger) -> None:
        self._node = node
Review comment: This is now done in __init__().
        self._log = log

        node_context = get_node_context(node)
        self._subscription_id = node.features._platform.subscription_id
        self._resource_group_name = node_context.resource_group_name
        self._vm_name = node_context.vm_name

        self._test_source_directory = AgentTestSuite._get_test_source_directory()
        self._working_directory = Path().home()/"waagent-tmp"
        self._node_home_directory = Path('/home')/self._node.connection_info['username']
    class Context:
        """
        Execution context for test scenarios, this information is passed to test scenarios by AgentTestScenario.execute()
        """
        subscription_id: str
        resource_group_name: str
        vm_name: str
        test_source_directory: Path
        working_directory: Path
        node_home_directory: Path

        self._log.info(f"Test Node: {self._vm_name}")
Review comment: This logic is now in the _setup() method.
        self._log.info(f"Resource Group: {self._resource_group_name}")
        self._log.info(f"Working directory: {self._working_directory}...")
    def __init__(self, node: Node, log: Logger) -> None:
        self._node: Node = node
        self._log: Logger = log

        if self._working_directory.exists():
            self._log.info(f"Removing existing working directory: {self._working_directory}...")
            try:
                shutil.rmtree(self._working_directory.as_posix())
            except Exception as exception:
                self._log.warning(f"Failed to remove the working directory: {exception}")
        self._working_directory.mkdir()
        node_context = get_node_context(node)
        self._context: AgentTestScenario.Context = AgentTestScenario.Context()
        self._context.subscription_id = node.features._platform.subscription_id
        self._context.resource_group_name = node_context.resource_group_name
        self._context.vm_name = node_context.vm_name

    def _clean_up(self) -> None:
        self._log.info(f"Removing {self._working_directory}...")
        shutil.rmtree(self._working_directory.as_posix(), ignore_errors=True)
        self._context.test_source_directory = AgentTestScenario._get_test_source_directory()
        self._context.working_directory = Path().home()/"waagent-tmp"
        self._context.node_home_directory = Path('/home')/node.connection_info['username']

    @staticmethod
    def _get_test_source_directory() -> Path:

@@ -101,6 +74,29 @@ def _get_test_source_directory() -> Path:
            path = path.parent
        raise Exception("Can't find the test root directory (tests_e2e)")

    def _setup(self) -> None:
        """
        Prepares the test scenario for execution
        """
        self._log.info(f"Test Node: {self._context.vm_name}")
        self._log.info(f"Resource Group: {self._context.resource_group_name}")
        self._log.info(f"Working directory: {self._context.working_directory}...")

        if self._context.working_directory.exists():
            self._log.info(f"Removing existing working directory: {self._context.working_directory}...")
            try:
                rmtree(self._context.working_directory.as_posix())
            except Exception as exception:
                self._log.warning(f"Failed to remove the working directory: {exception}")
        self._context.working_directory.mkdir()
Review comment: So, do we continue the setup if we failed to remove? Or does mkdir raise the exception if the directory already exists?
Reply: Continue with a warning. This is mainly for the dev scenario; the automation run works on fresh VMs.
Reply: I was reading that mkdir() raises an exception if the directory exists. So, in the dev scenario it won't work if the removal failed.
Reply: My point is that we could raise the exception if we failed to remove, because anyway in the next step mkdir throws an exception if the folder exists.
Reply: Yes, it will fail and there will be a warning with the info about the delete failure; the dev will get an error, see the warning in the log, and fix the issue.
Reply: Same difference, with the possibility that the working directory goes away in between the two calls and there is no error.
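To make the behavior discussed in this thread concrete, here is a small standalone sketch (not part of the PR; the print call stands in for the logger) showing that Path.mkdir() raises FileExistsError by default when the directory still exists, so a failed removal surfaces as an error right after the warning:

```python
import shutil
from pathlib import Path


def prepare_working_directory(working_directory: Path) -> None:
    # Mirrors the flow above: try to remove any leftover directory and warn on failure...
    if working_directory.exists():
        try:
            shutil.rmtree(working_directory.as_posix())
        except Exception as exception:
            print(f"WARNING: Failed to remove the working directory: {exception}")
    # ...then let mkdir() raise FileExistsError (its default is exist_ok=False)
    # if the removal did not actually succeed.
    working_directory.mkdir()
```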

    def _clean_up(self) -> None:
        """
        Cleans up any leftovers from the test scenario run.
        """
        self._log.info(f"Removing {self._context.working_directory}...")
        rmtree(self._context.working_directory.as_posix(), ignore_errors=True)

    def _setup_node(self) -> None:
        """
        Prepares the remote node for executing the test suite.

@@ -112,18 +108,10 @@ def _build_agent_package(self) -> Path:
        """
        Builds the agent package and returns the path to the package.
        """
        build_path = self._working_directory/"build"

        # The same orchestrator machine may be executing multiple suites on the same test VM, or
Review comment: I removed the "*.done" files (they actually never worked, since I am removing the working directory on cleanup). In the current DCR automation we use a new test VM to run a single scenario. I want to be able to run multiple scenarios on the same test VM in the new automation pipeline, and the intention of those "*.done" files was to avoid duplicated setup. However, not only was the implementation incorrect, but that approach was cumbersome during test development. I will come up with a better approach when I start combining scenarios.
        # the same suite on one or more test VMs; we use this file to mark the build is already done
        build_done_path = self._working_directory/"build.done"
        if build_done_path.exists():
Review comment: Did we remove this case because we're no longer executing multiple suites on the same VM?
Reply: Not quite, I explained the motivation in the PR comment just a couple of lines above.
            self._log.info("The agent build is already completed, will use existing package.")
        else:
            self._log.info(f"Building agent package to {build_path}")
            makepkg.run(agent_family="Test", output_directory=str(build_path), log=self._log)
            build_done_path.touch()
        build_path = self._context.working_directory/"build"

        self._log.info(f"Building agent package to {build_path}")
        makepkg.run(agent_family="Test", output_directory=str(build_path), log=self._log)
        package_path = build_path/"eggs"/f"WALinuxAgent-{AGENT_VERSION}.zip"
        if not package_path.exists():
            raise Exception(f"Can't find the agent package at {package_path}")

@@ -136,54 +124,70 @@ def _install_agent_on_node(self, agent_package: Path) -> None:
        """
        Installs the given agent package on the test node.
        """
        # The same orchestrator machine may be executing multiple suites on the same test VM,
        # we use this file to mark the agent is already installed on the test VM.
        install_done_path = self._working_directory/f"agent-install.{self._vm_name}.done"
        if install_done_path.exists():
            self._log.info(f"Package {agent_package} is already installed on {self._vm_name}...")
            return

        # The install script needs to unzip the agent package; ensure unzip is installed on the test node
        self._log.info(f"Installing unzip on {self._vm_name}...")
        self._log.info(f"Installing unzip on {self._context.vm_name}...")
        self._node.os.install_packages("unzip")

        self._log.info(f"Installing {agent_package} on {self._vm_name}...")
        agent_package_remote_path = self._node_home_directory/agent_package.name
        self._log.info(f"Copying {agent_package} to {self._vm_name}:{agent_package_remote_path}")
        self._log.info(f"Installing {agent_package} on {self._context.vm_name}...")
        agent_package_remote_path = self._context.node_home_directory/agent_package.name
        self._log.info(f"Copying {agent_package} to {self._context.vm_name}:{agent_package_remote_path}")
        self._node.shell.copy(agent_package, agent_package_remote_path)
        self._execute_script_on_node(
            self._test_source_directory/"orchestrator"/"scripts"/"install-agent",
        self.execute_script_on_node(
            self._context.test_source_directory/"orchestrator"/"scripts"/"install-agent",
            parameters=f"--package {agent_package_remote_path} --version {AGENT_VERSION}",
            sudo=True)

        self._log.info("The agent was installed successfully.")
        install_done_path.touch()

    def _collect_node_logs(self) -> None:
        """
        Collects the test logs from the remote machine and copied them to the local machine
        Collects the test logs from the remote machine and copies them to the local machine
        """
        # Collect the logs on the test machine into a compressed tarball
        self._log.info("Collecting logs on test machine [%s]...", self._node.name)
        self._execute_script_on_node(self._test_source_directory/"orchestrator"/"scripts"/"collect-logs", sudo=True)

        # Copy the tarball to the local logs directory
        remote_path = self._node_home_directory/"logs.tgz"
        local_path = Path.home()/'logs'/'vm-logs-{0}.tgz'.format(self._node.name)
        self._log.info(f"Copying {self._node.name}:{remote_path} to {local_path}")
        self._node.shell.copy_back(remote_path, local_path)
        try:
            # Collect the logs on the test machine into a compressed tarball
            self._log.info("Collecting logs on test machine [%s]...", self._node.name)
            self.execute_script_on_node(self._context.test_source_directory/"orchestrator"/"scripts"/"collect-logs", sudo=True)

            # Copy the tarball to the local logs directory
            remote_path = self._context.node_home_directory/"logs.tgz"
            local_path = Path.home()/'logs'/'vm-logs-{0}.tgz'.format(self._node.name)
            self._log.info(f"Copying {self._node.name}:{remote_path} to {local_path}")
            self._node.shell.copy_back(remote_path, local_path)
        except Exception as e:
            self._log.warning(f"Failed to collect logs from the test machine: {e}")
Review comment: Added an exception handler since errors collecting logs should not be reported as test errors.
|
||
def execute(self, scenario: Callable[[Context], None]) -> None: | ||
Review comment: This function is used to enforce the setup/execution/cleanup of test scenarios and replaces the previous use of before_case/after_case. See agent_bvt.py for an example of its usage (a hypothetical sketch of that usage follows this file's diff).
        """
        Executes the given scenario
        """
        try:
            self._setup()
            try:
                self._setup_node()
                scenario(self._context)
            finally:
                self._collect_node_logs()
        finally:
            self._clean_up()

    def _execute_script_on_node(self, script_path: Path, parameters: str = "", sudo: bool = False) -> int:
    def execute_script_on_node(self, script_path: Path, parameters: str = "", sudo: bool = False) -> int:
        """
Review comment: Made it public, added a docstring, and did minor formatting improvements in its logging.
        Executes the given script on the test node; if 'sudo' is True, the script is executed using the sudo command.
        """
        custom_script_builder = CustomScriptBuilder(script_path.parent, [script_path.name])
        custom_script = self._node.tools[custom_script_builder]

        command_line = f"{script_path} {parameters}"
        self._log.info(f"Executing {command_line}")
        if parameters == '':
            command_line = f"{script_path}"
        else:
            command_line = f"{script_path} {parameters}"

        self._log.info(f"Executing [{command_line}]")
        result = custom_script.run(parameters=parameters, sudo=sudo)

        # LISA appends stderr to stdout so no need to check for stderr
        if result.exit_code != 0:
            raise Exception(f"[{command_line}] failed.\n{result.stdout}")
            raise Exception(f"Command [{command_line}] failed.\n{result.stdout}")

        return result.exit_code
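As referenced in the execute() comment above, here is a hedged sketch of how a scenario module such as agent_bvt.py might drive AgentTestScenario.execute(); the import path, suite metadata, and run_bvt function below are illustrative assumptions, not the actual contents of that file:

```python
# Hypothetical scenario module; names and import paths are assumptions for illustration.
# E0401: Unable to import 'lisa' (import-error)
from lisa import (  # pylint: disable=E0401
    Logger,
    Node,
    TestCaseMetadata,
    TestSuite,
    TestSuiteMetadata,
)

# The class still lives in its original .py file until the rename mentioned in the review comment above.
from agent_test_suite import AgentTestScenario  # assumed import path


def run_bvt(context: AgentTestScenario.Context) -> None:
    # The scenario receives the Context populated in AgentTestScenario.__init__()
    print(f"Running BVT against {context.vm_name} in {context.resource_group_name}")


@TestSuiteMetadata(area="bvt", category="", description="")
class AgentBvt(TestSuite):
    @TestCaseMetadata(description="")
    def main(self, node: Node, log: Logger) -> None:
        # execute() enforces setup -> scenario -> log collection -> cleanup, so a
        # setup failure is reported as a test failure instead of a silent skip.
        AgentTestScenario(node, log).execute(run_bvt)
```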
---
@@ -15,5 +15,11 @@ tar --exclude='journal/*' --exclude='omsbundle' --exclude='omsagent' --exclude='
    /var/lib/waagent/ \
    /etc/waagent.conf

# tar exits with 1 on warnings; ignore those
Review comment: We are hitting a warning about the logs changing while we are creating the tarball. It's just a warning, but tar exits with code 1 (fatal errors are 2; see the man page for details).
exit_code=$?
if [ "$exit_code" != "1" ] && [ "$exit_code" != "0" ]; then
    exit $exit_code
fi

chmod +r "$logs_file_name"
---
@@ -1,7 +1,22 @@
#!/usr/bin/env bash

# Microsoft Azure Linux Agent
#
# Copyright 2018 Microsoft Corporation
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#

set -euo pipefail

usage() (

@@ -59,7 +74,7 @@ echo "Restarting service..."
service $service_name stop

# Rename the previous log to ensure the new log starts with the agent we just installed
mv /var/log/waagent.log /var/log/waagent.pre-install.log
mv /var/log/waagent.log /var/log/waagent."$(date --iso-8601=seconds)".log
Review comment: Added a timestamp to the previous log. Useful if the scenario is run multiple times on the same VM (e.g. during test development) or if multiple scenarios run on the same VM (in the future).

if command -v systemctl &> /dev/null; then
    systemctl daemon-reload
---
@@ -0,0 +1,32 @@
# Microsoft Azure Linux Agent
Review comment: Added a "sample" LISA test suite that runs on the local machine. Useful to do quick tests/experiments with LISA.
#
# Copyright 2018 Microsoft Corporation
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#

# E0401: Unable to import 'lisa' (import-error)
from lisa import (  # pylint: disable=E0401
    Logger,
    Node,
    TestCaseMetadata,
    TestSuite,
    TestSuiteMetadata,
)


@TestSuiteMetadata(area="sample", category="", description="")
class HelloWorld(TestSuite):
    @TestCaseMetadata(description="")
    def main(self, node: Node, log: Logger) -> None:
        log.info(f"Hello world from {node.os.name}!")
---
@@ -0,0 +1,28 @@
# Microsoft Azure Linux Agent
#
# Copyright 2018 Microsoft Corporation
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#

extension:
  - "."
environment:
  environments:
    - nodes:
        - type: local
notifier:
  - type: console
testcase:
  - criteria:
      area: sample
Review comment: The main change in this module is that I stopped using before_case and after_case for setup and cleanup.
The issue is that errors in before_case are not reported as failures; they simply make LISA skip test execution. Tests can be skipped for valid reasons (e.g. an unsupported OS), but skipping tests on setup failures without reporting an error is not correct, since those errors may go unnoticed on daily runs.
I replaced the use of before_case and after_case with custom code (mainly in the execute() method).