diff --git a/tests_e2e/orchestrator/runbook.yml b/tests_e2e/orchestrator/runbook.yml
index bb968bad86..8075725eb0 100644
--- a/tests_e2e/orchestrator/runbook.yml
+++ b/tests_e2e/orchestrator/runbook.yml
@@ -126,7 +126,7 @@ combinator:
       location: $(location)
       vm_size: $(vm_size)
 
-concurrency: 32
+concurrency: 16
 
 notifier:
   - type: agent.junit
diff --git a/tests_e2e/tests/lib/retry.py b/tests_e2e/tests/lib/retry.py
index a86227bc6c..bbd327cda3 100644
--- a/tests_e2e/tests/lib/retry.py
+++ b/tests_e2e/tests/lib/retry.py
@@ -19,6 +19,7 @@
 from typing import Callable, Any
 
 from tests_e2e.tests.lib.logging import log
+from tests_e2e.tests.lib.shell import CommandError
 
 
 def execute_with_retry(operation: Callable[[], Any]) -> Any:
@@ -39,3 +40,23 @@ def execute_with_retry(operation: Callable[[], Any]) -> Any:
             time.sleep(30)
 
 
+def retry_ssh_run(operation: Callable[[], Any]) -> Any:
+    """
+    Executes the given operation, which is expected to run an ssh command, and returns its result. If the
+    operation fails with exit code 255 (ssh uses this code for its own errors, e.g. a connection time out)
+    it is retried, up to 3 attempts in total, waiting 30 secs between attempts.
+
+    A CommandError with any other exit code, or the CommandError raised by the last attempt, is re-raised to
+    the caller. Exceptions of any other type are not retried and propagate immediately.
+    """
+    attempts = 3
+    while attempts > 0:
+        attempts -= 1
+        try:
+            return operation()
+        except CommandError as e:
+            # The called method raises CommandError on !=0 exit codes; only 255 is worth retrying
+            if e.exit_code != 255 or attempts == 0:
+                raise
+            log.warning("The operation failed with %s, retrying in 30 secs.", e)
+            time.sleep(30)
diff --git a/tests_e2e/tests/lib/ssh_client.py b/tests_e2e/tests/lib/ssh_client.py
index 50ff9f8086..c10d763a47 100644
--- a/tests_e2e/tests/lib/ssh_client.py
+++ b/tests_e2e/tests/lib/ssh_client.py
@@ -21,12 +21,13 @@
 from pathlib import Path
 
 from tests_e2e.tests.lib import shell
+from tests_e2e.tests.lib.retry import retry_ssh_run
 
 
 class SshClient(object):
     def __init__(self, ip_address: str, username: str, private_key_file: Path, port: int = 22):
         self._ip_address: str = ip_address
-        self._username:str = username
+        self._username: str = username
         self._private_key_file: Path = private_key_file
         self._port: int = port
 
@@ -43,16 +44,17 @@ def run_command(self, command: str, use_sudo: bool = False) -> str:
         # Note that we add ~/bin to the remote PATH, since Python (Pypy) and other test tools are installed there.
         # Note, too, that when using sudo we need to carry over the value of PATH to the sudo session
         sudo = "sudo env PATH=$PATH PYTHONPATH=$PYTHONPATH" if use_sudo else ''
-        return shell.run_command([
+        return retry_ssh_run(lambda: shell.run_command([
             "ssh", "-o", "StrictHostKeyChecking=no", "-i", self._private_key_file, destination,
-            f"source ~/bin/agent-env;{sudo} {command}"])
+            f"source ~/bin/agent-env;{sudo} {command}"]))
 
     @staticmethod
     def generate_ssh_key(private_key_file: Path):
         """
         Generates an SSH key on the given Path
         """
-        shell.run_command(["ssh-keygen", "-m", "PEM", "-t", "rsa", "-b", "4096", "-q", "-N", "", "-f", str(private_key_file)])
+        shell.run_command(
+            ["ssh-keygen", "-m", "PEM", "-t", "rsa", "-b", "4096", "-q", "-N", "", "-f", str(private_key_file)])
 
     def get_architecture(self):
         return self.run_command("uname -m").rstrip()