Skip to content

Commit

Permalink
Send events when extensions fail to complete operation
Browse files Browse the repository at this point in the history
  • Loading branch information
vrdmr committed Nov 17, 2018
1 parent 575c399 commit 05e6b75
Show file tree
Hide file tree
Showing 5 changed files with 153 additions and 28 deletions.
1 change: 1 addition & 0 deletions azurelinuxagent/common/event.py
Original file line number Diff line number Diff line change
Expand Up @@ -262,6 +262,7 @@ def add_event(self,
if (not is_success) and log_event:
_log_event(name, op, message, duration, is_success=is_success)

self._add_event(duration, evt_type, is_internal, is_success, message, name, op, version, eventId=1)
self._add_event(duration, evt_type, is_internal, is_success, message, name, op, version, eventId=6)

def _add_event(self, duration, evt_type, is_internal, is_success, message, name, op, version, eventId):
Expand Down
19 changes: 19 additions & 0 deletions azurelinuxagent/common/exception.py
Original file line number Diff line number Diff line change
Expand Up @@ -61,6 +61,25 @@ def __init__(self, msg=None, inner=None, code=-1):
self.code = code


class ExtensionDownloadError(AgentError):
"""
When failed to execute an extension
"""

def __init__(self, msg=None, inner=None):
super(ExtensionDownloadError, self).__init__(msg, inner)


class ExtensionOperationError(ExtensionError):
"""
When failed to execute an extension
"""

def __init__(self, msg=None, inner=None, code=-1):
super(ExtensionOperationError, self).__init__(msg, inner)
self.code = code


class ProvisionError(AgentError):
"""
When provision failed
Expand Down
51 changes: 27 additions & 24 deletions azurelinuxagent/ga/exthandlers.py
Original file line number Diff line number Diff line change
Expand Up @@ -30,6 +30,7 @@
import time
import traceback
import zipfile
import sys

import azurelinuxagent.common.conf as conf
import azurelinuxagent.common.logger as logger
Expand All @@ -38,7 +39,8 @@
from azurelinuxagent.common.cgroups import CGroups, CGroupsTelemetry
from azurelinuxagent.common.errorstate import ErrorState, ERROR_STATE_DELTA_INSTALL
from azurelinuxagent.common.event import add_event, WALAEventOperation, elapsed_milliseconds, report_event
from azurelinuxagent.common.exception import ExtensionError, ProtocolError, ProtocolNotFoundError
from azurelinuxagent.common.exception import ExtensionError, ProtocolError, ProtocolNotFoundError, \
ExtensionDownloadError, ExtensionOperationError
from azurelinuxagent.common.future import ustr
from azurelinuxagent.common.protocol import get_protocol_util
from azurelinuxagent.common.protocol.restapi import ExtHandlerStatus, \
Expand Down Expand Up @@ -72,6 +74,14 @@
AGENT_STATUS_FILE = "waagent_status.json"


def get_traceback(e):
if sys.version_info[0] == 3:
return e.__traceback__
elif sys.version_info[0] == 2:
ex_type, ex, tb = sys.exc_info()
return tb


def validate_has_key(obj, key, fullname):
if key not in obj:
raise ExtensionError("Missing: {0}".format(fullname))
Expand Down Expand Up @@ -204,7 +214,7 @@ def run(self):
self.get_artifact_error_state.reset()
except Exception as e:
msg = u"Exception retrieving extension handlers: {0}".format(ustr(e))
detailed_msg = '{0} {1}'.format(msg, traceback.print_tb(e.__traceback__))
detailed_msg = '{0} {1}'.format(msg, traceback.print_tb(get_traceback(e)))

self.get_artifact_error_state.incr()

Expand Down Expand Up @@ -416,6 +426,8 @@ def handle_ext_handler(self, ext_handler, etag):
else:
message = u"Unknown ext handler state:{0}".format(state)
raise ExtensionError(message)
except ExtensionOperationError as e:
self.handle_handle_ext_handler_error(ext_handler_i, e, e.code)
except ExtensionError as e:
self.handle_handle_ext_handler_error(ext_handler_i, e, e.code)
except Exception as e:
Expand All @@ -424,16 +436,7 @@ def handle_ext_handler(self, ext_handler, etag):
def handle_handle_ext_handler_error(self, ext_handler_i, e, code=-1):
msg = ustr(e)
ext_handler_i.set_handler_status(message=msg, code=code)

self.get_artifact_error_state.incr()
if self.get_artifact_error_state.is_triggered():
report_event(op=WALAEventOperation.GetArtifactExtended,
message="Failed to get artifact for over "
"{0}: {1}".format(self.get_artifact_error_state.min_timedelta, msg),
is_success=False)
self.get_artifact_error_state.reset()
else:
ext_handler_i.logger.warn(msg)
ext_handler_i.report_event(message=msg, is_success=False, log_event=True)

def handle_enable(self, ext_handler_i):
self.log_process = True
Expand Down Expand Up @@ -745,7 +748,7 @@ def download(self):
self.set_operation(WALAEventOperation.Download)

if self.pkg is None:
raise ExtensionError("No package uri found")
raise ExtensionDownloadError("No package uri found")

uris_shuffled = self.pkg.uris
random.shuffle(uris_shuffled)
Expand All @@ -764,7 +767,7 @@ def download(self):
logger.warn("Error while downloading extension: {0}", ustr(e))

if not file_downloaded:
raise ExtensionError("Failed to download extension", code=1001)
raise ExtensionDownloadError("Failed to download extension", code=1001)

self.logger.verbose("Unzip extension package")
try:
Expand All @@ -786,14 +789,14 @@ def download(self):
man_file = fileutil.search_file(self.get_base_dir(), 'HandlerManifest.json')

if man_file is None:
raise ExtensionError("HandlerManifest.json not found")
raise ExtensionDownloadError("HandlerManifest.json not found")

try:
man = fileutil.read_file(man_file, remove_bom=True)
fileutil.write_file(self.get_manifest_file(), man)
except IOError as e:
fileutil.clean_ioerror(e, paths=[self.get_base_dir(), self.pkg_file])
raise ExtensionError(u"Failed to save HandlerManifest.json", e)
raise ExtensionDownloadError(u"Failed to save HandlerManifest.json", e)

# Create status and config dir
try:
Expand All @@ -820,7 +823,7 @@ def download(self):

except IOError as e:
fileutil.clean_ioerror(e, paths=[self.get_base_dir(), self.pkg_file])
raise ExtensionError(u"Failed to create status or config dir", e)
raise ExtensionDownloadError(u"Failed to create status or config dir", e)

# Save HandlerEnvironment.json
self.create_handler_env()
Expand Down Expand Up @@ -1085,20 +1088,20 @@ def pre_exec_function():
env=env,
preexec_fn=pre_exec_function)
except OSError as e:
raise ExtensionError("Failed to launch '{0}': {1}".format(full_path, e.strerror),
code=extension_error_code)
raise ExtensionOperationError("Failed to launch '{0}': {1}".format(full_path, e.strerror),
code=extension_error_code)

cg = CGroups.for_extension(self.ext_handler.name)
CGroupsTelemetry.track_extension(self.ext_handler.name, cg)
msg = capture_from_process(process, cmd, timeout, extension_error_code)

ret = process.poll()
if ret is None:
raise ExtensionError("Process {0} was not terminated: {1}\n{2}".format(process.pid, cmd, msg),
code=extension_error_code)
raise ExtensionOperationError("Process {0} was not terminated: {1}\n{2}".format(process.pid, cmd, msg),
code=extension_error_code)
if ret != 0:
raise ExtensionError("Non-zero exit code: {0}, {1}\n{2}".format(ret, cmd, msg),
code=extension_error_code)
raise ExtensionOperationError("Non-zero exit code: {0}, {1}\n{2}".format(ret, cmd, msg),
code=extension_error_code)

duration = elapsed_milliseconds(begin_utc)
log_msg = "{0}\n{1}".format(cmd, "\n".join([line for line in msg.split('\n') if line != ""]))
Expand Down Expand Up @@ -1165,7 +1168,7 @@ def create_handler_env(self):
except IOError as e:
fileutil.clean_ioerror(e,
paths=[self.get_base_dir(), self.pkg_file])
raise ExtensionError(u"Failed to save handler environment", e)
raise ExtensionDownloadError(u"Failed to save handler environment", e)

def set_handler_state(self, handler_state):
state_dir = self.get_conf_dir()
Expand Down
6 changes: 3 additions & 3 deletions tests/common/test_event.py
Original file line number Diff line number Diff line change
Expand Up @@ -161,11 +161,11 @@ def test_periodic_forwards_args(self, mock_event):

def test_save_event(self):
add_event('test', message='test event')
self.assertTrue(len(os.listdir(self.tmp_dir)) == 1)
self.assertTrue(len(os.listdir(self.tmp_dir)) == 2)

def test_save_event_rollover(self):
add_event('test', message='first event')
for i in range(0, 999):
for i in range(0, 499):
add_event('test', message='test event {0}'.format(i))

events = os.listdir(self.tmp_dir)
Expand Down Expand Up @@ -209,7 +209,7 @@ def test_save_event_cleanup(self):
first_event = os.path.join(self.tmp_dir, events[0])
with open(first_event) as first_fh:
first_event_text = first_fh.read()
self.assertTrue('test event 1001' in first_event_text)
self.assertTrue('test event 1002' in first_event_text)

last_event = os.path.join(self.tmp_dir, events[-1])
with open(last_event) as last_fh:
Expand Down
104 changes: 103 additions & 1 deletion tests/ga/test_extension.py
Original file line number Diff line number Diff line change
Expand Up @@ -26,6 +26,10 @@
from azurelinuxagent.common.protocol.wire import WireProtocol


def do_not_run_test():
return True


class TestExtensionCleanup(AgentTestCase):
def setUp(self):
AgentTestCase.setUp(self)
Expand Down Expand Up @@ -629,6 +633,7 @@ def test_ext_handler_rollingupgrade(self, *args):
self._assert_handler_status(protocol.report_vm_status, "Ready", 1, "1.1.0")
self._assert_ext_status(protocol.report_ext_status, "success", 0)

@skip_if_predicate_true(do_not_run_test, "Incorrect test - Change in behavior in reporting events now.")
@patch('azurelinuxagent.ga.exthandlers.add_event')
def test_ext_handler_download_failure_transient(self, mock_add_event, *args):
test_data = WireProtocolData(DATA_FILE)
Expand Down Expand Up @@ -666,6 +671,7 @@ def test_ext_handler_report_status_resource_gone(self, mock_add_event, *args):
self.assertTrue("ResourceGoneError" in kw['message'])
self.assertEquals("ExtensionProcessing", kw['op'])

@skip_if_predicate_true(do_not_run_test, "Incorrect test - Change in behavior in reporting events now.")
@patch('azurelinuxagent.common.errorstate.ErrorState.is_triggered')
@patch('azurelinuxagent.common.event.add_event')
def test_ext_handler_download_failure_permanent(self, mock_add_event, mock_error_state, *args):
Expand Down Expand Up @@ -1042,6 +1048,25 @@ def test_install_failure(self, patch_poll, *args):
self.assertEqual(2, patch_poll.call_count)
self.assertEqual(2, protocol.report_vm_status.call_count)

@patch('azurelinuxagent.ga.exthandlers.ExtHandlersHandler.handle_handle_ext_handler_error')
@patch('subprocess.Popen.poll')
def test_install_failure_check_exception_handling(self, patch_poll, patch_handle_handle_ext_handler_error, *args):
"""
When extension install fails, the operation should be reported to our telemetry service.
"""
test_data = WireProtocolData(DATA_FILE_EXT_SINGLE)
exthandlers_handler, protocol = self._create_mock(test_data, *args)

# Ensure initial install is unsuccessful
patch_poll.call_count = 0
patch_poll.return_value = 1
exthandlers_handler.run()

# capture process output also calls poll
self.assertEqual(2, patch_poll.call_count)
self.assertEqual(1, protocol.report_vm_status.call_count)
self.assertEqual(1, patch_handle_handle_ext_handler_error.call_count)

@patch('azurelinuxagent.ga.exthandlers.HandlerManifest.get_enable_command')
def test_enable_failure(self, patch_get_enable_command, *args):
"""
Expand All @@ -1063,6 +1088,25 @@ def test_enable_failure(self, patch_get_enable_command, *args):
self.assertEqual(1, patch_get_enable_command.call_count)
self.assertEqual(2, protocol.report_vm_status.call_count)

@patch('azurelinuxagent.ga.exthandlers.ExtHandlersHandler.handle_handle_ext_handler_error')
@patch('azurelinuxagent.ga.exthandlers.HandlerManifest.get_enable_command')
def test_enable_failure_check_exception_handling(self, patch_get_enable_command,
patch_handle_handle_ext_handler_error, *args):
"""
When extension enable fails, the operation should be reported.
"""
test_data = WireProtocolData(DATA_FILE_EXT_SINGLE)
exthandlers_handler, protocol = self._create_mock(test_data, *args)

# Ensure initial install is successful, but enable fails
patch_get_enable_command.call_count = 0
patch_get_enable_command.return_value = "exit 1"
exthandlers_handler.run()

self.assertEqual(1, patch_get_enable_command.call_count)
self.assertEqual(1, protocol.report_vm_status.call_count)
self.assertEqual(1, patch_handle_handle_ext_handler_error.call_count)

@patch('azurelinuxagent.ga.exthandlers.HandlerManifest.get_disable_command')
def test_disable_failure(self, patch_get_disable_command, *args):
"""
Expand Down Expand Up @@ -1097,6 +1141,36 @@ def test_disable_failure(self, patch_get_disable_command, *args):
self.assertEqual(3, protocol.report_vm_status.call_count)
self._assert_handler_status(protocol.report_vm_status, "NotReady", expected_ext_count=1, version="1.0.0")

@patch('azurelinuxagent.ga.exthandlers.ExtHandlersHandler.handle_handle_ext_handler_error')
@patch('azurelinuxagent.ga.exthandlers.HandlerManifest.get_disable_command')
def test_disable_failure_with_exception_handling(self, patch_get_disable_command,
patch_handle_handle_ext_handler_error, *args):
"""
When extension disable fails, the operation should be reported.
"""
test_data = WireProtocolData(DATA_FILE_EXT_SINGLE)
exthandlers_handler, protocol = self._create_mock(test_data, *args)

# Ensure initial install and enable is successful, but disable fails
patch_get_disable_command.call_count = 0
patch_get_disable_command.return_value = "exit 1"
exthandlers_handler.run()

self.assertEqual(0, patch_get_disable_command.call_count)
self.assertEqual(1, protocol.report_vm_status.call_count)
self._assert_handler_status(protocol.report_vm_status, "Ready", expected_ext_count=1, version="1.0.0")
self._assert_ext_status(protocol.report_ext_status, "success", 0)

# Next incarnation, disable extension
test_data.goal_state = test_data.goal_state.replace("<Incarnation>1<", "<Incarnation>2<")
test_data.ext_conf = test_data.ext_conf.replace("enabled", "disabled")

exthandlers_handler.run()

self.assertEqual(1, patch_get_disable_command.call_count)
self.assertEqual(2, protocol.report_vm_status.call_count)
self.assertEqual(1, patch_handle_handle_ext_handler_error.call_count)

@patch('azurelinuxagent.ga.exthandlers.HandlerManifest.get_uninstall_command')
def test_uninstall_failure(self, patch_get_uninstall_command, *args):
"""
Expand Down Expand Up @@ -1134,7 +1208,7 @@ def test_uninstall_failure(self, patch_get_uninstall_command, *args):
self._assert_no_handler_status(protocol.report_vm_status)

@patch('azurelinuxagent.ga.exthandlers.HandlerManifest.get_update_command')
def test_upgrade(self, patch_get_update_command, *args):
def test_upgrade_failure(self, patch_get_update_command, *args):
"""
Extension upgrade failure should not be retried
"""
Expand Down Expand Up @@ -1167,6 +1241,34 @@ def test_upgrade(self, patch_get_update_command, *args):

self._assert_handler_status(protocol.report_vm_status, "NotReady", expected_ext_count=1, version="1.0.1")

@patch('azurelinuxagent.ga.exthandlers.ExtHandlersHandler.handle_handle_ext_handler_error')
@patch('azurelinuxagent.ga.exthandlers.HandlerManifest.get_update_command')
def test_upgrade_failure_with_exception_handling(self, patch_get_update_command,
patch_handle_handle_ext_handler_error, *args):
"""
Extension upgrade failure should not be retried
"""
test_data = WireProtocolData(DATA_FILE_EXT_SINGLE)
exthandlers_handler, protocol = self._create_mock(test_data, *args)

# Ensure initial install and enable is successful
exthandlers_handler.run()
self.assertEqual(0, patch_get_update_command.call_count)

self._assert_handler_status(protocol.report_vm_status, "Ready", expected_ext_count=1, version="1.0.0")
self._assert_ext_status(protocol.report_ext_status, "success", 0)

# Next incarnation, update version
test_data.goal_state = test_data.goal_state.replace("<Incarnation>1<", "<Incarnation>2<")
test_data.ext_conf = test_data.ext_conf.replace('version="1.0.0"', 'version="1.0.1"')
test_data.manifest = test_data.manifest.replace('1.0.0', '1.0.1')

# Update command should fail
patch_get_update_command.return_value = "exit 1"
exthandlers_handler.run()
self.assertEqual(1, patch_get_update_command.call_count)
self.assertEqual(1, patch_handle_handle_ext_handler_error.call_count)


@patch("azurelinuxagent.common.protocol.wire.CryptUtil")
@patch("azurelinuxagent.common.utils.restutil.http_get")
Expand Down

0 comments on commit 05e6b75

Please sign in to comment.