Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Removing infinite download of extension manifest without a new GS #1874

Merged
merged 8 commits into from
May 22, 2020
1 change: 0 additions & 1 deletion azurelinuxagent/common/event.py
Original file line number Diff line number Diff line change
Expand Up @@ -82,7 +82,6 @@ class WALAEventOperation:
ExtensionProcessing = "ExtensionProcessing"
FetchGoalState = "FetchGoalState"
Firewall = "Firewall"
GetArtifactExtended = "GetArtifactExtended"
HealthCheck = "HealthCheck"
HealthObservation = "HealthObservation"
HeartBeat = "HeartBeat"
Expand Down
49 changes: 12 additions & 37 deletions azurelinuxagent/ga/exthandlers.py
Original file line number Diff line number Diff line change
Expand Up @@ -236,32 +236,11 @@ def __init__(self, protocol):
self.log_process = False

self.report_status_error_state = ErrorState()
self.get_artifact_error_state = ErrorState(min_timedelta=ERROR_STATE_DELTA_INSTALL)

def run(self):
self.ext_handlers, etag = None, None
try:
self.ext_handlers, etag = self.protocol.get_ext_handlers()
self.get_artifact_error_state.reset()
except Exception as e:
msg = u"Exception retrieving extension handlers: {0}".format(ustr(e))
detailed_msg = '{0} {1}'.format(msg, traceback.extract_tb(get_traceback(e)))

self.get_artifact_error_state.incr()

if self.get_artifact_error_state.is_triggered():
add_event(AGENT_NAME,
version=CURRENT_VERSION,
op=WALAEventOperation.GetArtifactExtended,
larohra marked this conversation as resolved.
Show resolved Hide resolved
is_success=False,
message="Failed to get extension artifact for over "
"{0}: {1}".format(self.get_artifact_error_state.min_timedelta, msg))
self.get_artifact_error_state.reset()

add_event(AGENT_NAME, version=CURRENT_VERSION, op=WALAEventOperation.ExtensionProcessing, is_success=False, message=detailed_msg)
return

try:
self.ext_handlers, etag = self.protocol.get_ext_handlers()
msg = u"Handle extensions updates for incarnation {0}".format(etag)
logger.verbose(msg)
# Log status report success on new config
Expand Down Expand Up @@ -423,7 +402,14 @@ def handle_ext_handler(self, ext_handler, etag):
ext_handler_i = ExtHandlerInstance(ext_handler, self.protocol)

try:
if self.last_etag == etag:
if self.log_etag:
ext_handler_i.logger.verbose("Incarnation {0} did not change, not processing GoalState", etag)
self.log_etag = False
return

state = ext_handler.properties.state

if ext_handler_i.decide_version(target_state=state) is None:
version = ext_handler_i.ext_handler.properties.version
name = ext_handler_i.ext_handler.name
Expand All @@ -433,15 +419,6 @@ def handle_ext_handler(self, ext_handler, etag):
ext_handler_i.report_event(message=ustr(err_msg), is_success=False)
return

self.get_artifact_error_state.reset()
if self.last_etag == etag:
if self.log_etag:
ext_handler_i.logger.verbose("Version {0} is current for etag {1}",
ext_handler_i.pkg.version,
etag)
self.log_etag = False
return

self.log_etag = True

ext_handler_i.logger.info("Target handler state: {0} [incarnation {1}]", state, etag)
Expand Down Expand Up @@ -475,12 +452,8 @@ def handle_ext_handler_download_error(self, ext_handler_i, e, code=-1):
msg = ustr(e)
ext_handler_i.set_handler_status(message=msg, code=code)

self.get_artifact_error_state.incr()
larohra marked this conversation as resolved.
Show resolved Hide resolved
if self.get_artifact_error_state.is_triggered():
report_event(op=WALAEventOperation.Download, is_success=False, log_event=True,
message="Failed to get artifact for over "
"{0}: {1}".format(self.get_artifact_error_state.min_timedelta, msg))
self.get_artifact_error_state.reset()
report_event(op=WALAEventOperation.Download, is_success=False, log_event=True,
message="Failed to download artifacts: {0}".format(msg))

def handle_enable(self, ext_handler_i):
self.log_process = True
Expand Down Expand Up @@ -1389,6 +1362,8 @@ def set_handler_status(self, status="NotReady", message="", code=0):
try:
handler_status_json = json.dumps(get_properties(handler_status))
if handler_status_json is not None:
if not os.path.exists(state_dir):
Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Currently if there is any exceptions in extension processing and the config directory doesnt exist, the error is never reported back to CRP and the customer has to wait for 90 mins before the operation timesout. This is a bad Customer Experience. To mitigate it, I'm creating the config directory incase it doesnt exist while reporting status.

fileutil.mkdir(state_dir, mode=0o700)
fileutil.write_file(status_file, handler_status_json)
else:
self.logger.error("Failed to create JSON document of handler status for {0} version {1}".format(
Expand Down
78 changes: 54 additions & 24 deletions tests/ga/test_extension.py
Original file line number Diff line number Diff line change
Expand Up @@ -24,6 +24,7 @@
import tempfile
import time
import unittest
import uuid
import zipfile

import datetime
Expand Down Expand Up @@ -520,6 +521,36 @@ def test_ext_handler(self, *args):
exthandlers_handler.run()
self._assert_no_handler_status(protocol.report_vm_status)

def test_it_should_only_download_extension_manifest_once_per_goal_state(self, *args):

def _assert_handler_status_and_manifest_download_count(protocol, test_data, manifest_count):
self._assert_handler_status(protocol.report_vm_status, "Ready", 1, "1.0.0")
self._assert_ext_status(protocol.report_ext_status, "success", 0)
self.assertEqual(test_data.call_counts['manifest.xml'], manifest_count,
"We should have downloaded extension manifest {0} times".format(manifest_count))

test_data = mockwiredata.WireProtocolData(mockwiredata.DATA_FILE)
exthandlers_handler, protocol = self._create_mock(test_data, *args)
exthandlers_handler.run()
_assert_handler_status_and_manifest_download_count(protocol, test_data, 1)

for _ in range(5):
exthandlers_handler.run()
# The extension manifest should only be downloaded once as incarnation did not change
_assert_handler_status_and_manifest_download_count(protocol, test_data, 1)

# Update Incarnation
test_data.set_incarnation(2)
protocol.update_goal_state()

exthandlers_handler.run()
_assert_handler_status_and_manifest_download_count(protocol, test_data, 2)

for _ in range(5):
exthandlers_handler.run()
# The extension manifest should be downloaded twice now as incarnation changed once
_assert_handler_status_and_manifest_download_count(protocol, test_data, 2)

def test_ext_zip_file_packages_removed_in_update_case(self, *args):
# Test enable scenario.
test_data = mockwiredata.WireProtocolData(mockwiredata.DATA_FILE)
Expand Down Expand Up @@ -952,39 +983,38 @@ def test_ext_handler_download_failure_permanent_ProtocolError(self, mock_add_eve
self.assertTrue("Failed to get ext handler pkgs" in kw['message'])
self.assertTrue("ProtocolError" in kw['message'])

@patch('azurelinuxagent.common.errorstate.ErrorState.is_triggered')
@patch('azurelinuxagent.common.event.add_event')
def test_ext_handler_download_failure_permanent_with_ExtensionDownloadError_and_triggered(self, mock_add_event,
mock_error_state, *args):
test_data = mockwiredata.WireProtocolData(mockwiredata.DATA_FILE)
exthandlers_handler, protocol = self._create_mock(test_data, *args)
protocol.get_ext_handler_pkgs = Mock(side_effect=ExtensionDownloadError)

mock_error_state.return_value = True

exthandlers_handler.run()
def test_ext_handler_download_errors_should_be_reported_only_on_new_goal_state(self, mock_add_event, *args):

self.assertEquals(1, mock_add_event.call_count)
args, kw = mock_add_event.call_args_list[0]
self.assertEquals(False, kw['is_success'])
self.assertTrue("Failed to get artifact for over" in kw['message'])
self.assertTrue("ExtensionDownloadError" in kw['message'])
self.assertEquals("Download", kw['op'])
def _assert_mock_add_event_call(expected_download_failed_event_count, err_msg_guid):
event_occurrences = [kw for _, kw in mock_add_event.call_args_list if
"Failed to download artifacts: [ExtensionDownloadError] {0}".format(err_msg_guid) in kw['message']]
self.assertEquals(expected_download_failed_event_count, len(event_occurrences), "Call count do not match")
self.assertFalse(any([kw['is_success'] for kw in event_occurrences]), "The events should have failed")
self.assertEqual(expected_download_failed_event_count, len([kw['op'] for kw in event_occurrences]),
"Incorrect Operation, all events should be a download errors")

@patch('azurelinuxagent.common.errorstate.ErrorState.is_triggered')
@patch('azurelinuxagent.common.event.add_event')
def test_ext_handler_download_failure_permanent_with_ExtensionDownloadError_and_not_triggered(self, mock_add_event,
mock_error_state,
*args):
test_data = mockwiredata.WireProtocolData(mockwiredata.DATA_FILE)
exthandlers_handler, protocol = self._create_mock(test_data, *args)
protocol.get_ext_handler_pkgs = Mock(side_effect=ExtensionDownloadError)
unique_error_message_guid = str(uuid.uuid4())
protocol.get_ext_handler_pkgs = Mock(side_effect=ExtensionDownloadError(unique_error_message_guid))

mock_error_state.return_value = False
exthandlers_handler.run()
_assert_mock_add_event_call(expected_download_failed_event_count=1, err_msg_guid=unique_error_message_guid)
self._assert_handler_status(protocol.report_vm_status, "NotReady", 0, "1.0.0")

# Re-run exthandler.run without updating the GS and ensure we dont report error
exthandlers_handler.run()
_assert_mock_add_event_call(expected_download_failed_event_count=1, err_msg_guid=unique_error_message_guid)
self._assert_handler_status(protocol.report_vm_status, "NotReady", 0, "1.0.0")
Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This line will fail this test currently because the cleanup logic cleans up the state and we dont report it again. Once the PR #1889 is merged, this test should pass.


self.assertEquals(0, mock_add_event.call_count)
# Change incarnation and then re-check we report error
test_data.set_incarnation(2)
protocol.update_goal_state()

exthandlers_handler.run()
_assert_mock_add_event_call(expected_download_failed_event_count=2, err_msg_guid=unique_error_message_guid)
self._assert_handler_status(protocol.report_vm_status, "NotReady", 0, "1.0.0")

@patch('azurelinuxagent.ga.exthandlers.fileutil')
def test_ext_handler_io_error(self, mock_fileutil, *args):
Expand Down
23 changes: 0 additions & 23 deletions tests/ga/test_exthandlers.py
Original file line number Diff line number Diff line change
Expand Up @@ -249,29 +249,6 @@ def test_extension_sequence_number(self):
disk_sequence_number=3,
expected_sequence_number=-1)

@patch("azurelinuxagent.ga.exthandlers.add_event")
@patch("azurelinuxagent.common.errorstate.ErrorState.is_triggered")
def test_it_should_report_an_error_if_the_wireserver_cannot_be_reached(self, patch_is_triggered, patch_add_event):
Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This is a redundant test, this scenario is being tested by the tests for try_update_goal_state functionality

test_message = "TEST MESSAGE"

patch_is_triggered.return_value = True # protocol errors are reported only after a delay; force the error to be reported now

protocol = WireProtocol("foo.bar")
protocol.get_ext_handlers = MagicMock(side_effect=ProtocolError(test_message))

get_exthandlers_handler(protocol).run()

self.assertEquals(patch_add_event.call_count, 2)

_, first_call_args = patch_add_event.call_args_list[0]
self.assertEquals(first_call_args['op'], WALAEventOperation.GetArtifactExtended)
self.assertEquals(first_call_args['is_success'], False)

_, second_call_args = patch_add_event.call_args_list[1]
self.assertEquals(second_call_args['op'], WALAEventOperation.ExtensionProcessing)
self.assertEquals(second_call_args['is_success'], False)
self.assertIn(test_message, second_call_args['message'])


class LaunchCommandTestCase(AgentTestCase):
"""
Expand Down