From 78e5e87fd0d2081ea6bb12888cbfd23fd06bf412 Mon Sep 17 00:00:00 2001 From: Michael Barrett Date: Sat, 13 Oct 2018 14:39:57 -0700 Subject: [PATCH] Get rid of recursion for tail retries (#663) * Get rid of recursion for tail retries Also, extend the way we retry/timeout, that should work around #515. Not sure of a great way to test this unfortunately. * Add some tests --- stacker/providers/aws/default.py | 59 ++++++----- stacker/tests/providers/aws/test_default.py | 104 ++++++++++++++++++-- 2 files changed, 133 insertions(+), 30 deletions(-) diff --git a/stacker/providers/aws/default.py b/stacker/providers/aws/default.py index 596ac3a5b..f6824e14c 100644 --- a/stacker/providers/aws/default.py +++ b/stacker/providers/aws/default.py @@ -48,7 +48,14 @@ # https://github.com/boto/botocore/blob/1.6.1/botocore/data/_retry.json#L97-L121 MAX_ATTEMPTS = 10 -MAX_TAIL_RETRIES = 5 +# Updated this to 15 retries with a 1 second sleep between retries. This is +# only used when a call to `get_events` fails due to the stack not being +# found. This is often the case because Cloudformation is taking too long +# to create the stack. 15 seconds should, hopefully, be plenty of time for +# the stack to start showing up in the API. +MAX_TAIL_RETRIES = 15 +TAIL_RETRY_SLEEP = 1 +GET_EVENTS_SLEEP = 1 DEFAULT_CAPABILITIES = ["CAPABILITY_NAMED_IAM", ] @@ -581,8 +588,8 @@ def is_stack_rolling_back(self, stack, **kwargs): def is_stack_failed(self, stack, **kwargs): return self.get_stack_status(stack) in self.FAILED_STATUSES - def tail_stack(self, stack, cancel, retries=0, **kwargs): - def log_func(e): + def tail_stack(self, stack, cancel, log_func=None, **kwargs): + def _log_func(e): event_args = [e['ResourceStatus'], e['ResourceType'], e.get('ResourceStatusReason', None)] # filter out any values that are empty @@ -590,22 +597,26 @@ def log_func(e): template = " ".join(["[%s]"] + ["%s" for _ in event_args]) logger.info(template, *([stack.fqn] + event_args)) - if not retries: - logger.info("Tailing stack: %s", stack.fqn) + log_func = log_func or _log_func - try: - self.tail(stack.fqn, - cancel=cancel, - log_func=log_func, - include_initial=False) - except botocore.exceptions.ClientError as e: - if "does not exist" in str(e) and retries < MAX_TAIL_RETRIES: - # stack might be in the process of launching, wait for a second - # and try again - time.sleep(1) - self.tail_stack(stack, cancel, retries=retries + 1, **kwargs) - else: - raise + logger.info("Tailing stack: %s", stack.fqn) + + attempts = 0 + while True: + attempts += 1 + try: + self.tail(stack.fqn, cancel=cancel, log_func=log_func, + include_initial=False) + break + except botocore.exceptions.ClientError as e: + if "does not exist" in str(e) and attempts < MAX_TAIL_RETRIES: + # stack might be in the process of launching, wait for a + # second and try again + if cancel.wait(TAIL_RETRY_SLEEP): + return + continue + else: + raise @staticmethod def _tail_print(e): @@ -613,24 +624,24 @@ def _tail_print(e): e['ResourceType'], e['EventId'])) - def get_events(self, stackname): + def get_events(self, stack_name): """Get the events in batches and return in chronological order""" next_token = None event_list = [] - while 1: + while True: if next_token is not None: events = self.cloudformation.describe_stack_events( - StackName=stackname, NextToken=next_token + StackName=stack_name, NextToken=next_token ) else: events = self.cloudformation.describe_stack_events( - StackName=stackname + StackName=stack_name ) event_list.append(events['StackEvents']) next_token = events.get('NextToken', None) if next_token is None: break - time.sleep(1) + time.sleep(GET_EVENTS_SLEEP) return reversed(sum(event_list, [])) def tail(self, stack_name, cancel, log_func=_tail_print, sleep_time=5, @@ -646,7 +657,7 @@ def tail(self, stack_name, cancel, log_func=_tail_print, sleep_time=5, seen.add(e['EventId']) # Now keep looping through and dump the new events - while 1: + while True: events = self.get_events(stack_name) for e in events: if e['EventId'] not in seen: diff --git a/stacker/tests/providers/aws/test_default.py b/stacker/tests/providers/aws/test_default.py index fcc14048f..10dc5577c 100644 --- a/stacker/tests/providers/aws/test_default.py +++ b/stacker/tests/providers/aws/test_default.py @@ -6,19 +6,24 @@ from datetime import datetime import random import string +import threading import unittest -from mock import patch +from mock import patch, MagicMock from botocore.stub import Stubber +from botocore.exceptions import ClientError, UnStubbedResponseError import boto3 -from ....actions.diff import DictValue +from stacker.actions.diff import DictValue -from ....providers.base import Template -from ....session_cache import get_session +from stacker.providers.base import Template +from stacker.session_cache import get_session -from ....providers.aws.default import ( +from stacker.providers.aws import default + +from stacker.providers.aws.default import ( DEFAULT_CAPABILITIES, + MAX_TAIL_RETRIES, Provider, requires_replacement, ask_for_approval, @@ -28,7 +33,9 @@ generate_cloudformation_args, ) -from .... import exceptions +from stacker import exceptions + +from stacker.stack import Stack def random_string(length=12): @@ -558,6 +565,91 @@ def test_noninteractive_changeset_update_with_stack_policy(self): parameters=[], stack_policy=Template(body="{}"), tags=[], ) + def test_tail_stack_retry_on_missing_stack(self): + stack_name = "SlowToCreateStack" + stack = MagicMock(spec=Stack) + stack.fqn = "my-namespace-{}".format(stack_name) + + default.TAIL_RETRY_SLEEP = .01 + + # Ensure the stack never appears before we run out of retries + for i in range(MAX_TAIL_RETRIES + 5): + self.stubber.add_client_error( + "describe_stack_events", + service_error_code="ValidationError", + service_message="Stack [{}] does not exist".format(stack_name), + http_status_code=400, + response_meta={"attempt": i + 1}, + ) + + with self.stubber: + try: + self.provider.tail_stack(stack, threading.Event()) + except ClientError as exc: + self.assertEqual( + exc.response["ResponseMetadata"]["attempt"], + MAX_TAIL_RETRIES + ) + + def test_tail_stack_retry_on_missing_stack_eventual_success(self): + stack_name = "SlowToCreateStack" + stack = MagicMock(spec=Stack) + stack.fqn = "my-namespace-{}".format(stack_name) + + default.TAIL_RETRY_SLEEP = .01 + default.GET_EVENTS_SLEEP = .01 + + rcvd_events = [] + + def mock_log_func(e): + rcvd_events.append(e) + + def valid_event_response(stack, event_id): + return { + "StackEvents": [ + { + "StackId": stack.fqn + "12345", + "EventId": event_id, + "StackName": stack.fqn, + "Timestamp": datetime.now() + }, + ] + } + + # Ensure the stack never appears before we run out of retries + for i in range(3): + self.stubber.add_client_error( + "describe_stack_events", + service_error_code="ValidationError", + service_message="Stack [{}] does not exist".format(stack_name), + http_status_code=400, + response_meta={"attempt": i + 1}, + ) + + self.stubber.add_response( + "describe_stack_events", + valid_event_response(stack, "InitialEvents") + ) + + self.stubber.add_response( + "describe_stack_events", + valid_event_response(stack, "Event1") + ) + + with self.stubber: + try: + self.provider.tail_stack(stack, threading.Event(), + log_func=mock_log_func) + except UnStubbedResponseError: + # Eventually we run out of responses - could not happen in + # regular execution + # normally this would just be dealt with when the threads were + # shutdown, but doing so here is a little difficult because + # we can't control the `tail_stack` loop + pass + + self.assertEqual(rcvd_events[0]["EventId"], "Event1") + class TestProviderInteractiveMode(unittest.TestCase): def setUp(self):