From 660454ddb8db4be3790cc0a6c082c7668628b113 Mon Sep 17 00:00:00 2001
From: nnandigam
Date: Fri, 17 Mar 2023 16:05:43 -0700
Subject: [PATCH] ssh retry

---
Review notes (this section, between "---" and the first "diff --git", is not part of the commit message):
 * retry_ssh_run catches only CommandError. The submitted version caught Exception, logged every other
   error as an SSH time out, retried it, and returned None once the attempts ran out.
 * The hunk line counts and the diffstat are unchanged; the post-image blob id on the retry.py "index"
   line is the one from the submitted patch and no longer matches the edited content.

 tests_e2e/orchestrator/runbook.yml |  2 +-
 tests_e2e/tests/lib/retry.py       | 18 ++++++++++++++++++
 tests_e2e/tests/lib/ssh_client.py  | 13 ++++++++-----
 3 files changed, 27 insertions(+), 6 deletions(-)

diff --git a/tests_e2e/orchestrator/runbook.yml b/tests_e2e/orchestrator/runbook.yml
index bb968bad86..8075725eb0 100644
--- a/tests_e2e/orchestrator/runbook.yml
+++ b/tests_e2e/orchestrator/runbook.yml
@@ -126,7 +126,7 @@ combinator:
   location: $(location)
   vm_size: $(vm_size)
 
-concurrency: 32
+concurrency: 16
 
 notifier:
   - type: agent.junit
diff --git a/tests_e2e/tests/lib/retry.py b/tests_e2e/tests/lib/retry.py
index a86227bc6c..ff049e557e 100644
--- a/tests_e2e/tests/lib/retry.py
+++ b/tests_e2e/tests/lib/retry.py
@@ -19,6 +19,7 @@
 from typing import Callable, Any
 
 from tests_e2e.tests.lib.logging import log
+from tests_e2e.tests.lib.shell import CommandError
 
 
 def execute_with_retry(operation: Callable[[], Any]) -> Any:
@@ -39,3 +40,20 @@ def execute_with_retry(operation: Callable[[], Any]) -> Any:
             time.sleep(30)
 
 
+def retry_ssh_run(operation: Callable[[], Any]) -> Any:
+    """
+    Executes the given operation (an ssh command) up to 3 times, waiting 30 secs between attempts. An attempt is
+    retried only when it raises CommandError with exit code 255 (the code ssh returns when ssh itself fails, e.g.
+    a connection time out); any other exception, and the failure of the last attempt, is raised to the caller.
+    """
+    attempts = 3
+    while attempts > 0:
+        attempts -= 1
+        try:
+            return operation()
+        except CommandError as e:
+            # CommandError is raised downstream on exit codes != 0; only ssh's own failures (255) are retried
+            if e.exit_code != 255 or attempts == 0:
+                raise
+            log.warning("The operation failed with SSH Connection time out, retrying in 30 secs. \nError: %s", e)
+            time.sleep(30)
diff --git a/tests_e2e/tests/lib/ssh_client.py b/tests_e2e/tests/lib/ssh_client.py
index e0c07420e6..e6e4e0c6dd 100644
--- a/tests_e2e/tests/lib/ssh_client.py
+++ b/tests_e2e/tests/lib/ssh_client.py
@@ -19,12 +19,13 @@
 from pathlib import Path
 
 from tests_e2e.tests.lib import shell
+from tests_e2e.tests.lib.retry import retry_ssh_run
 
 
 class SshClient(object):
     def __init__(self, ip_address: str, username: str, private_key_file: Path, port: int = 22):
         self._ip_address: str = ip_address
-        self._username:str = username
+        self._username: str = username
         self._private_key_file: Path = private_key_file
         self._port: int = port
 
@@ -38,21 +39,23 @@ def run_command(self, command: str, use_sudo: bool = False) -> str:
         # Note that we add ~/bin to the remote PATH, since Python (Pypy) and other test tools are installed there.
         # Note, too, that when using sudo we need to carry over the value of PATH to the sudo session
         sudo = "sudo env PATH=$PATH" if use_sudo else ''
-        return shell.run_command([
+        return retry_ssh_run(lambda: shell.run_command([
             "ssh", "-o", "StrictHostKeyChecking=no", "-i", self._private_key_file, destination,
-            f"PATH=~/bin:$PATH;{sudo} {command}"])
+            f"PATH=~/bin:$PATH;{sudo} {command}"]))
 
     @staticmethod
     def generate_ssh_key(private_key_file: Path):
         """
         Generates an SSH key on the given Path
         """
-        shell.run_command(["ssh-keygen", "-m", "PEM", "-t", "rsa", "-b", "4096", "-q", "-N", "", "-f", str(private_key_file)])
+        shell.run_command(
+            ["ssh-keygen", "-m", "PEM", "-t", "rsa", "-b", "4096", "-q", "-N", "", "-f", str(private_key_file)])
 
     def get_architecture(self):
         return self.run_command("uname -m").rstrip()
 
-    def copy(self, source: Path, target: Path, remote_source: bool = False, remote_target: bool = False, recursive: bool = False):
+    def copy(self, source: Path, target: Path, remote_source: bool = False, remote_target: bool = False,
+             recursive: bool = False):
         """
         Copy file from local to remote machine
         """