Skip to content

Commit

Permalink
Add e2e test scenario for hostname monitoring (#3003)
Browse files Browse the repository at this point in the history
* Validate hostname is published

* Run on distro without known issues

* Add comment about debugging network down

* Create e2e scenario for hostname monitoring

* Remove unused import

* Increase timeout for hostname change

* Add password to VM and check for agent status if ssh fails

* run scenario on all endorsed distros

* Use getdistro() to check distro

* Add comment to get_distro

* Add publish_hostname to runbook

* Make get_distro.py executable

* Address first round of PR comments

* Do not enable hostname monitoring on distros where it is disabled

* Skip test on ubuntu

* Update get-waagent-conf-value to remove unused variable
  • Loading branch information
maddieford authored Dec 29, 2023
1 parent 847bb08 commit 284fbd5
Show file tree
Hide file tree
Showing 6 changed files with 304 additions and 1 deletion.
2 changes: 1 addition & 1 deletion tests_e2e/orchestrator/runbook.yml
Original file line number Diff line number Diff line change
Expand Up @@ -29,7 +29,7 @@ variable:
# Test suites to execute
#
- name: test_suites
value: "agent_bvt, no_outbound_connections, extensions_disabled, agent_not_provisioned, fips, agent_ext_workflow, agent_status, multi_config_ext, agent_cgroups, ext_cgroups, agent_firewall, ext_telemetry_pipeline, ext_sequencing, agent_persist_firewall"
value: "agent_bvt, no_outbound_connections, extensions_disabled, agent_not_provisioned, fips, agent_ext_workflow, agent_status, multi_config_ext, agent_cgroups, ext_cgroups, agent_firewall, ext_telemetry_pipeline, ext_sequencing, agent_persist_firewall, publish_hostname"

#
# Parameters used to create test VMs
Expand Down
8 changes: 8 additions & 0 deletions tests_e2e/test_suites/publish_hostname.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,8 @@
#
# Changes hostname and checks that the agent published the updated hostname to dns.
#
name: "PublishHostname"
tests:
- "publish_hostname/publish_hostname.py"
images:
- "endorsed"
12 changes: 12 additions & 0 deletions tests_e2e/tests/lib/virtual_machine_client.py
Original file line number Diff line number Diff line change
Expand Up @@ -65,6 +65,18 @@ def get_ip_address(self) -> str:
public_ip_address_name=nic.ip_configurations[0].public_ip_address.id.split('/')[-1]) # the name of the ip address is the last component of the id
return public_ip.ip_address

def get_private_ip_address(self) -> str:
"""
Retrieves the private IP address of the virtual machine
"""
vm_model = self.get_model()
nic: NetworkInterface = self._network_client.network_interfaces.get(
resource_group_name=self.resource_group,
network_interface_name=vm_model.network_profile.network_interfaces[0].id.split('/')[
-1]) # the name of the interface is the last component of the id
private_ip = nic.ip_configurations[0].private_ip_address
return private_ip

def get_model(self) -> VirtualMachine:
"""
Retrieves the model of the virtual machine.
Expand Down
207 changes: 207 additions & 0 deletions tests_e2e/tests/publish_hostname/publish_hostname.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,207 @@
#!/usr/bin/env python3

# Microsoft Azure Linux Agent
#
# Copyright 2018 Microsoft Corporation
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#

#
# This test updates the hostname and checks that the agent published the hostname to DNS. It also checks that the
# primary network is up after publishing the hostname. This test was added in response to a bug in publishing the
# hostname on fedora distros, where there was a race condition between NetworkManager restart and Network Interface
# restart which caused the primary interface to go down.
#

import datetime
import re

from assertpy import fail
from time import sleep

from tests_e2e.tests.lib.shell import CommandError
from tests_e2e.tests.lib.agent_test import AgentVmTest, TestSkipped
from tests_e2e.tests.lib.agent_test_context import AgentVmTestContext
from tests_e2e.tests.lib.logging import log


class PublishHostname(AgentVmTest):
def __init__(self, context: AgentVmTestContext):
super().__init__(context)
self._context = context
self._ssh_client = context.create_ssh_client()
self._private_ip = context.vm.get_private_ip_address()
self._vm_password = ""

def add_vm_password(self):
# Add password to VM to help with debugging in case of failure
# REMOVE PWD FROM LOGS IF WE EVER MAKE THESE RUNS/LOGS PUBLIC
username = self._ssh_client.username
pwd = self._ssh_client.run_command("openssl rand -base64 32 | tr : .").rstrip()
self._vm_password = pwd
log.info("VM Username: {0}; VM Password: {1}".format(username, pwd))
self._ssh_client.run_command("echo '{0}:{1}' | sudo -S chpasswd".format(username, pwd))

def check_and_install_dns_tools(self):
lookup_cmd = "dig -x {0}".format(self._private_ip)
dns_regex = r"[\S\s]*;; ANSWER SECTION:\s.*PTR\s*(?P<hostname>.*).internal.cloudapp.net.[\S\s]*"

# Not all distros come with dig. Install dig if not on machine
try:
self._ssh_client.run_command("dig -v")
except CommandError as e:
if "dig: command not found" in e.stderr:
distro = self._ssh_client.run_command("get_distro.py").rstrip().lower()
if "debian_9" in distro:
# Debian 9 hostname look up needs to be done with "host" instead of dig
lookup_cmd = "host {0}".format(self._private_ip)
dns_regex = r".*pointer\s(?P<hostname>.*).internal.cloudapp.net."
elif "debian" in distro:
self._ssh_client.run_command("apt install -y dnsutils", use_sudo=True)
elif "alma" in distro or "rocky" in distro:
self._ssh_client.run_command("dnf install -y bind-utils", use_sudo=True)
else:
raise
else:
raise

return lookup_cmd, dns_regex

def check_agent_reports_status(self):
status_updated = False
last_agent_status_time = self._context.vm.get_instance_view().vm_agent.statuses[0].time
log.info("Agent reported status at {0}".format(last_agent_status_time))
retries = 3

while retries > 0 and not status_updated:
agent_status_time = self._context.vm.get_instance_view().vm_agent.statuses[0].time
if agent_status_time != last_agent_status_time:
status_updated = True
log.info("Agent reported status at {0}".format(last_agent_status_time))
else:
retries -= 1
sleep(60)

if not status_updated:
fail("Agent hasn't reported status since {0} and ssh connection failed. Use the serial console in portal "
"to check the contents of '/sys/class/net/eth0/operstate'. If the contents of this file are 'up', "
"no further action is needed. If contents are 'down', that indicates the network interface is down "
"and more debugging needs to be done to confirm this is not caused by the agent.\n VM: {1}\n RG: {2}"
"\nSubscriptionId: {3}\nUsername: {4}\nPassword: {5}".format(last_agent_status_time,
self._context.vm,
self._context.vm.resource_group,
self._context.vm.subscription,
self._context.username,
self._vm_password))

def retry_ssh_if_connection_reset(self, command: str, use_sudo=False):
# The agent may bring the network down and back up to publish the hostname, which can reset the ssh connection.
# Adding retry here for connection reset.
retries = 3
while retries > 0:
try:
return self._ssh_client.run_command(command, use_sudo=use_sudo)
except CommandError as e:
retries -= 1
retryable = e.exit_code == 255 and "Connection reset by peer" in e.stderr
if not retryable or retries == 0:
raise
log.warning("The SSH operation failed, retrying in 30 secs")
sleep(30)

def run(self):
# TODO: Investigate why hostname is not being published on Ubuntu as expected
if "ubuntu" in self._ssh_client.run_command("get_distro.py").lower():
raise TestSkipped("Known issue with hostname publishing on ubuntu. Will skip test until we continue "
"investigation.")

# Add password to VM and log. This allows us to debug with serial console if necessary
self.add_vm_password()

# This test looks up what hostname is published to dns. Check that the tools necessary to get hostname are
# installed, and if not install them.
lookup_cmd, dns_regex = self.check_and_install_dns_tools()

# Check if this distro monitors hostname changes. If it does, we should check that the agent detects the change
# and publishes the host name. If it doesn't, we should check that the hostname is automatically published.
monitors_hostname = self._ssh_client.run_command("get-waagent-conf-value Provisioning.MonitorHostName", use_sudo=True).rstrip().lower()

hostname_change_ctr = 0
# Update the hostname 3 times
while hostname_change_ctr < 3:
try:
hostname = "hostname-monitor-{0}".format(hostname_change_ctr)
log.info("Update hostname to {0}".format(hostname))
self.retry_ssh_if_connection_reset("hostnamectl set-hostname {0}".format(hostname), use_sudo=True)

# Wait for the agent to detect the hostname change for up to 2 minutes if hostname monitoring is enabled
if monitors_hostname == "y" or monitors_hostname == "yes":
log.info("Agent hostname monitoring is enabled")
timeout = datetime.datetime.now() + datetime.timedelta(minutes=2)
hostname_detected = ""
while datetime.datetime.now() <= timeout:
try:
hostname_detected = self.retry_ssh_if_connection_reset("grep -n 'Detected hostname change:.*-> {0}' /var/log/waagent.log".format(hostname), use_sudo=True)
if hostname_detected:
log.info("Agent detected hostname change: {0}".format(hostname_detected))
break
except CommandError as e:
# Exit code 1 indicates grep did not find a match. Sleep if exit code is 1, otherwise raise.
if e.exit_code != 1:
raise
sleep(15)

if not hostname_detected:
fail("Agent did not detect hostname change: {0}".format(hostname))
else:
log.info("Agent hostname monitoring is disabled")

# Check that the expected hostname is published with 4 minute timeout
timeout = datetime.datetime.now() + datetime.timedelta(minutes=4)
published_hostname = ""
while datetime.datetime.now() <= timeout:
try:
dns_info = self.retry_ssh_if_connection_reset(lookup_cmd)
actual_hostname = re.match(dns_regex, dns_info)
if actual_hostname:
# Compare published hostname to expected hostname
published_hostname = actual_hostname.group('hostname')
if hostname == published_hostname:
log.info("SUCCESS Hostname {0} was published successfully".format(hostname))
break
else:
log.info("Unable to parse the dns info: {0}".format(dns_info))
except CommandError as e:
if "NXDOMAIN" in e.stdout:
log.info("DNS Lookup could not find domain. Will try again.")
else:
raise
sleep(30)

if published_hostname == "" or published_hostname != hostname:
fail("Hostname {0} was not published successfully. Actual host name is: {1}".format(hostname, published_hostname))

hostname_change_ctr += 1

except CommandError as e:
# If failure is ssh issue, we should confirm that the VM did not lose network connectivity due to the
# agent's operations on the network. If agent reports status after this failure, then we know the
# network is up.
if e.exit_code == 255 and ("Connection timed out" in e.stderr or "Connection refused" in e.stderr):
self.check_agent_reports_status()
raise


if __name__ == "__main__":
PublishHostname.run_from_command_line()
41 changes: 41 additions & 0 deletions tests_e2e/tests/scripts/get-waagent-conf-value
Original file line number Diff line number Diff line change
@@ -0,0 +1,41 @@
#!/usr/bin/env bash

# Microsoft Azure Linux Agent
#
# Copyright 2018 Microsoft Corporation
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#

#
# Echos the value in waagent.conf for the specified setting if it exists.
#

set -euo pipefail

if [[ $# -lt 1 ]]; then
echo "Usage: get-waagent-conf-value <setting>"
exit 1
fi

PYTHON=$(get-agent-python)
waagent_conf=$($PYTHON -c 'from azurelinuxagent.common.osutil import get_osutil; print(get_osutil().agent_conf_file_path)')

cat $waagent_conf | while read line
do
if [[ $line == $1* ]]; then
IFS='=' read -a values <<< "$line"
echo ${values[1]}
exit 0
fi
done
35 changes: 35 additions & 0 deletions tests_e2e/tests/scripts/get_distro.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,35 @@
#!/usr/bin/env pypy3

# Microsoft Azure Linux Agent
#
# Copyright 2018 Microsoft Corporation
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
# Prints the distro and version of the machine
#

import sys

from azurelinuxagent.common.version import get_distro


def main():
# Prints '<distro>_<version>'
distro = get_distro()
print(distro[0] + "_" + distro[1].replace('.', ''))
sys.exit(0)


if __name__ == "__main__":
main()

0 comments on commit 284fbd5

Please sign in to comment.