Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add e2e test scenario for hostname monitoring #3003

Merged
merged 17 commits into from
Dec 29, 2023
Merged
Show file tree
Hide file tree
Changes from 12 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion tests_e2e/orchestrator/runbook.yml
Original file line number Diff line number Diff line change
Expand Up @@ -29,7 +29,7 @@ variable:
# Test suites to execute
#
- name: test_suites
value: "agent_bvt, no_outbound_connections, extensions_disabled, agent_not_provisioned, fips, agent_ext_workflow, agent_status, multi_config_ext, agent_cgroups, ext_cgroups, agent_firewall, ext_telemetry_pipeline, ext_sequencing, agent_persist_firewall"
value: "agent_bvt, no_outbound_connections, extensions_disabled, agent_not_provisioned, fips, agent_ext_workflow, agent_status, multi_config_ext, agent_cgroups, ext_cgroups, agent_firewall, ext_telemetry_pipeline, ext_sequencing, agent_persist_firewall, publish_hostname"

#
# Parameters used to create test VMs
Expand Down
8 changes: 8 additions & 0 deletions tests_e2e/test_suites/publish_hostname.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,8 @@
#
# Changes the hostname and checks that the agent published the updated hostname to DNS.
#
name: "PublishHostname"
tests:
- "publish_hostname/publish_hostname.py"
images:
- "endorsed"
1 change: 1 addition & 0 deletions tests_e2e/tests/lib/agent_test_context.py
Original file line number Diff line number Diff line change
Expand Up @@ -65,6 +65,7 @@ def __init__(self, working_directory: Path, vm: VirtualMachineClient, ip_address
super().__init__(working_directory, username, identity_file, ssh_port)
self.vm: VirtualMachineClient = vm
self.ip_address: str = ip_address
self.private_ip_address: str = self.vm.get_private_ip_address()
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

minor: since most tests do not need the private IP address and this involves a network call, consider getting the address on demand


def create_ssh_client(self) -> SshClient:
"""
Expand Down
12 changes: 12 additions & 0 deletions tests_e2e/tests/lib/virtual_machine_client.py
Original file line number Diff line number Diff line change
Expand Up @@ -65,6 +65,18 @@ def get_ip_address(self) -> str:
public_ip_address_name=nic.ip_configurations[0].public_ip_address.id.split('/')[-1]) # the name of the ip address is the last component of the id
return public_ip.ip_address

def get_private_ip_address(self) -> str:
    """
    Returns the private IP address found on the first IP configuration of the
    virtual machine's first network interface.
    """
    first_nic_id = self.get_model().network_profile.network_interfaces[0].id
    # the name of the interface is the last component of the id
    first_nic_name = first_nic_id.split('/')[-1]
    nic: NetworkInterface = self._network_client.network_interfaces.get(
        resource_group_name=self.resource_group,
        network_interface_name=first_nic_name)
    return nic.ip_configurations[0].private_ip_address

def get_model(self) -> VirtualMachine:
"""
Retrieves the model of the virtual machine.
Expand Down
200 changes: 200 additions & 0 deletions tests_e2e/tests/publish_hostname/publish_hostname.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,200 @@
#!/usr/bin/env python3

# Microsoft Azure Linux Agent
#
# Copyright 2018 Microsoft Corporation
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#

#
# This test updates the hostname and checks that the agent published the hostname to DNS. It also checks that the
# primary network is up after publishing the hostname. This test was added in response to a bug in publishing the
# hostname on fedora distros, where there was a race condition between NetworkManager restart and Network Interface
# restart which caused the primary interface to go down.
#

import datetime
import re

from assertpy import fail
from time import sleep

from tests_e2e.tests.lib.shell import CommandError
from tests_e2e.tests.lib.agent_test import AgentVmTest
from tests_e2e.tests.lib.agent_test_context import AgentVmTestContext
from tests_e2e.tests.lib.logging import log


class PublishHostname(AgentVmTest):
def __init__(self, context: AgentVmTestContext):
    """
    Keeps a reference to the test context and sets up the state shared by the test steps.
    """
    super().__init__(context)
    self._context = context
    # Private address of the test VM, as exposed by the context
    self._private_ip = context.private_ip_address
    self._ssh_client = context.create_ssh_client()
    # Empty until add_vm_password() stores the generated password
    self._vm_password = ""

def add_vm_password(self):
    """
    Generates a random password on the VM, logs it together with the SSH username, and sets it as the
    password of that user. The password is also stored in self._vm_password.
    """
    # Add password to VM to help with debugging in case of failure
    # REMOVE PWD FROM LOGS IF WE EVER MAKE THESE RUNS/LOGS PUBLIC
    ssh = self._ssh_client
    user = ssh.username
    generated = ssh.run_command("openssl rand -base64 32 | tr : .").rstrip()
    self._vm_password = generated
    log.info("VM Username: {0}; VM Password: {1}".format(user, generated))
    set_password_cmd = "echo '{0}:{1}' | sudo -S chpasswd".format(user, generated)
    ssh.run_command(set_password_cmd)

def check_and_install_dns_tools(self):
    """
    Makes sure a reverse DNS lookup tool is usable on the VM.

    Returns a tuple (lookup_cmd, dns_regex): the command that looks up the VM's private IP address and the
    regular expression, with a 'hostname' group, used to parse that command's output.

    If 'dig -v' fails with "dig: command not found", the output of get_distro.py decides what happens next:
    Debian 9 switches to the 'host' command, other Debian versions install dnsutils, Alma and Rocky install
    bind-utils, and any other distro re-raises the error. Any other failure of 'dig -v' is re-raised as well.
    """
    ip = self._private_ip
    dig_lookup = ("dig -x {0}".format(ip),
                  r"[\S\s]*;; ANSWER SECTION:\s.*PTR\s*(?P<hostname>.*).internal.cloudapp.net.[\S\s]*")

    # Not all distros come with dig. Install dig if not on machine
    try:
        self._ssh_client.run_command("dig -v")
    except CommandError as e:
        if "dig: command not found" not in e.stderr:
            raise
        distro = self._ssh_client.run_command("get_distro.py").rstrip().lower()
        if "debian_9" in distro:
            # Debian 9 hostname look up needs to be done with "host" instead of dig
            return "host {0}".format(ip), r".*pointer\s(?P<hostname>.*).internal.cloudapp.net."
        if "debian" in distro:
            install_cmd = "apt install -y dnsutils"
        elif "alma" in distro or "rocky" in distro:
            install_cmd = "dnf install -y bind-utils"
        else:
            raise
        self._ssh_client.run_command(install_cmd, use_sudo=True)

    return dig_lookup

def check_agent_reports_status(self):
    """
    Checks that the agent reports a new status after the current one.

    Reads the timestamp of the first agent status in the VM's instance view as a baseline, then polls the
    instance view up to 3 times, sleeping 60 seconds after each unsuccessful poll. Returns as soon as the
    timestamp differs from the baseline. Otherwise fails the test with a message that includes the VM,
    resource group, subscription, username and password needed to debug through the serial console.
    """
    last_agent_status_time = self._context.vm.get_instance_view().vm_agent.statuses[0].time
    log.info("Agent reported status at {0}".format(last_agent_status_time))

    for _ in range(3):
        agent_status_time = self._context.vm.get_instance_view().vm_agent.statuses[0].time
        if agent_status_time != last_agent_status_time:
            # Log the newly observed timestamp, not the baseline
            log.info("Agent reported status at {0}".format(agent_status_time))
            return
        sleep(60)

    fail("Agent hasn't reported status since {0} and ssh connection failed. Use the serial console in portal "
         "to check the contents of '/sys/class/net/eth0/operstate'. If the contents of this file are 'up', "
         "no further action is needed. If contents are 'down', that indicates the network interface is down "
         "and more debugging needs to be done to confirm this is not caused by the agent.\n VM: {1}\n RG: {2}"
         "\nSubscriptionId: {3}\nUsername: {4}\nPassword: {5}".format(last_agent_status_time,
                                                                      self._context.vm,
                                                                      self._context.vm.resource_group,
                                                                      self._context.vm.subscription,
                                                                      self._context.username,
                                                                      self._vm_password))

def retry_ssh_if_connection_reset(self, command: str, use_sudo=False):
    """
    Runs 'command' on the VM over SSH and returns the result of run_command.

    The command is attempted up to 3 times, waiting 30 seconds between attempts. It is retried only when the
    failure has exit code 255 and stderr contains "Connection reset by peer". Any other CommandError, or a
    connection reset on the last attempt, is re-raised.
    """
    # The agent may bring the network down and back up to publish the hostname, which can reset the ssh connection.
    # Adding retry here for connection reset.
    for attempts_left in reversed(range(3)):
        try:
            return self._ssh_client.run_command(command, use_sudo=use_sudo)
        except CommandError as e:
            connection_reset = e.exit_code == 255 and "Connection reset by peer" in e.stderr
            if attempts_left == 0 or not connection_reset:
                raise
            log.warning("The SSH operation failed, retrying in 30 secs")
            sleep(30)

def run(self):
# Test entry point. Sets a VM password, makes sure a DNS lookup tool is available, enables hostname monitoring
# in the agent configuration, and then changes the hostname 3 times. After each change it waits for the new
# hostname to appear in /var/log/waagent.log and for the DNS lookup of the private IP to return it.
# Add password to VM and log. This allows us to debug with serial console if necessary
self.add_vm_password()

# This test looks up what hostname is published to dns. Check that the tools necessary to get hostname are
# installed, and if not install them.
lookup_cmd, dns_regex = self.check_and_install_dns_tools()

# Enable agent hostname monitoring
log.info("Executing script update-waagent-conf to enable agent hostname monitoring")
result = self._ssh_client.run_command("update-waagent-conf Provisioning.MonitorHostName=y "
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

we shouldn't enable Hostname monitoring on distros where it is disabled (e.g. Ubuntu)

Copy link
Contributor

@nagworld9 nagworld9 Dec 28, 2023

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

why shouldn't we test hostname path for those distros? It's possible that user can enable them at any time. That's what recent github issue reported that hostname changes not publishing to dns server in ubuntu

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

we should test them, but we should not enable this setting on distros where it is disabled

"Provisioning.MonitorHostNamePeriod=30", use_sudo=True)
log.info("Successfully enabled agent hostname monitoring config flag: {0}".format(result))

# The counter is only advanced after the current hostname has been detected and published
hostname_change_ctr = 0
# Update the hostname 3 times
while hostname_change_ctr < 3:
try:
hostname = "lisa-hostname-monitor-{0}".format(hostname_change_ctr)
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

minor: I'd remove "lisa-" from the hostname

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

updated

log.info("Update hostname to {0}".format(hostname))
self.retry_ssh_if_connection_reset("hostnamectl set-hostname {0}".format(hostname), use_sudo=True)
nagworld9 marked this conversation as resolved.
Show resolved Hide resolved

# Wait for the agent to detect the hostname change for up to 2 minutes
timeout = datetime.datetime.now() + datetime.timedelta(minutes=2)
hostname_detected = ""
while datetime.datetime.now() <= timeout:
try:
hostname_detected = self.retry_ssh_if_connection_reset("grep -n {0} /var/log/waagent.log".format(hostname), use_sudo=True)
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

can we add a little more context to grep's regex (on top of the hostname). The message is something like "Detected hostname change, etc"

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

updated

if hostname_detected:
log.info("Agent detected hostname change: {0}".format(hostname_detected))
break
except CommandError as e:
# Exit code 1 indicates grep did not find a match. Sleep if exit code is 1, otherwise raise.
if e.exit_code != 1:
raise
sleep(15)

if not hostname_detected:
fail("Agent did not detect hostname change: {0}".format(hostname))

# Check that the expected hostname is published with 4 minute timeout
timeout = datetime.datetime.now() + datetime.timedelta(minutes=4)
published_hostname = ""
while datetime.datetime.now() <= timeout:
try:
dns_info = self.retry_ssh_if_connection_reset(lookup_cmd)
# dns_regex exposes the published name through its 'hostname' group
actual_hostname = re.match(dns_regex, dns_info)
if actual_hostname:
# Compare published hostname to expected hostname
published_hostname = actual_hostname.group('hostname')
if hostname == published_hostname:
log.info("SUCCESS Hostname {0} was published successfully".format(hostname))
break
else:
log.info("Unable to parse the dns info: {0}".format(dns_info))
except CommandError as e:
if "NXDOMAIN" in e.stdout:
log.info("DNS Lookup could not find domain. Will try again.")
else:
raise
sleep(30)

if published_hostname == "" or published_hostname != hostname:
fail("Hostname {0} was not published successfully. Actual host name is: {1}".format(hostname, published_hostname))

hostname_change_ctr += 1

except CommandError as e:
# If failure is ssh issue, we should confirm that the VM did not lose network connectivity due to the
# agent's operations on the network. If agent reports status after this failure, then we know the
# network is up.
if e.exit_code == 255 and ("Connection timed out" in e.stderr or "Connection refused" in e.stderr):
self.check_agent_reports_status()
raise


if __name__ == "__main__":
PublishHostname.run_from_command_line()
35 changes: 35 additions & 0 deletions tests_e2e/tests/scripts/get_distro.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,35 @@
#!/usr/bin/env pypy3
nagworld9 marked this conversation as resolved.
Show resolved Hide resolved

# Microsoft Azure Linux Agent
#
# Copyright 2018 Microsoft Corporation
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
# Prints the distro and version of the machine
#

import sys

from azurelinuxagent.common.version import get_distro


def main():
    """
    Prints '<distro>_<version>' for the current machine, with the dots removed from the version,
    and then exits with code 0.
    """
    distro_info = get_distro()
    name = distro_info[0]
    version_without_dots = distro_info[1].replace('.', '')
    print(name + "_" + version_without_dots)
    sys.exit(0)


if __name__ == "__main__":
main()
Loading