@staticmethod
def _check_ssh_connectivity(ssh_client: SshClient, *, max_attempts: int = 5, retry_delay: float = 15) -> None:
    """
    Checks that the test node accepts SSH logins, retrying while the node reports that it is still booting.

    We may be trying to connect to the test node while it is still booting. A simple command is executed
    over SSH to check that it is ready; if it is not ready after a few attempts, an exception is raised.

    :param ssh_client: Client used to execute the check command on the test node.
    :param max_attempts: Maximum number of times the check command is executed; must be at least 1.
    :param retry_delay: Seconds to wait between a failed attempt and the next one.
    :raises ValueError: If max_attempts is less than 1 (otherwise the check would be silently skipped).
    :raises CommandError: If the check command fails for any reason other than the "still booting" condition.
    :raises Exception: If the node still reports that it is booting after max_attempts attempts.
    """
    if max_attempts < 1:
        raise ValueError(f"max_attempts must be at least 1 [{max_attempts}]")

    for attempt in range(1, max_attempts + 1):
        try:
            log.info("Checking SSH connectivity to the test node...")
            ssh_client.run_command("echo 'SSH connectivity check'")
        except CommandError as error:
            # Only the pam_nologin condition is retried:
            #     "System is booting up. Unprivileged users are not permitted to log in yet. Please come back later.
            #     For technical details, see pam_nologin(8)."
            # Any other failure is propagated to the caller unchanged.
            if "Unprivileged users are not permitted to log in yet" not in error.stderr:
                raise
            if attempt >= max_attempts:
                # Chain the last failure explicitly so that the traceback shows the underlying command error.
                raise Exception(f"SSH connectivity check failed after {max_attempts} attempts, giving up [{error}]") from error
            log.info("SSH is not ready [%s], will retry after a short delay.", error)
            time.sleep(retry_delay)
        else:
            log.info("SSH is ready.")
            return