Make error notifications more consistent.

The error notifications sent by wrap_exception are a good source of
information when tracking down errors in the system. It is easy to spot
patterns in the errors by looking at similarities in the exceptions
raised.

This change looks at making the in-line error notifications
follow the same format, so they become equally useful.

Closes-Bug: #1251602
Change-Id: I1cd64a90ecefc64d12af05487eb6e45aa0362d69
This commit is contained in:
John Garbutt
2013-11-09 20:08:37 +08:00
committed by John Garbutt
parent cb31014353
commit f35c63df24
4 changed files with 94 additions and 70 deletions

View File

@@ -958,13 +958,13 @@ class ComputeManager(manager.Manager):
extra_usage_info = {} extra_usage_info = {}
def notify(status, msg="", **kwargs): def notify(status, msg="", fault=None, **kwargs):
"""Send a create.{start,error,end} notification.""" """Send a create.{start,error,end} notification."""
type_ = "create.%(status)s" % dict(status=status) type_ = "create.%(status)s" % dict(status=status)
info = extra_usage_info.copy() info = extra_usage_info.copy()
info['message'] = unicode(msg) info['message'] = unicode(msg)
self._notify_about_instance_usage(context, instance, type_, self._notify_about_instance_usage(context, instance, type_,
extra_usage_info=info, **kwargs) extra_usage_info=info, fault=fault, **kwargs)
try: try:
self._prebuild_instance(context, instance) self._prebuild_instance(context, instance)
@@ -985,7 +985,7 @@ class ComputeManager(manager.Manager):
except exception.RescheduledException as e: except exception.RescheduledException as e:
# Instance build encountered an error, and has been rescheduled. # Instance build encountered an error, and has been rescheduled.
notify("error", msg=unicode(e)) # notify that build failed notify("error", fault=e)
except exception.BuildAbortException as e: except exception.BuildAbortException as e:
# Instance build aborted due to a non-failure # Instance build aborted due to a non-failure
@@ -996,7 +996,7 @@ class ComputeManager(manager.Manager):
# Instance build encountered a non-recoverable error: # Instance build encountered a non-recoverable error:
with excutils.save_and_reraise_exception(): with excutils.save_and_reraise_exception():
self._set_instance_error_state(context, instance['uuid']) self._set_instance_error_state(context, instance['uuid'])
notify("error", msg=unicode(e)) # notify that build failed notify("error", fault=e) # notify that build failed
def _prebuild_instance(self, context, instance): def _prebuild_instance(self, context, instance):
self._check_instance_exists(context, instance) self._check_instance_exists(context, instance)
@@ -1128,6 +1128,8 @@ class ComputeManager(manager.Manager):
compute_utils.add_instance_fault_from_exc(context, self.conductor_api, compute_utils.add_instance_fault_from_exc(context, self.conductor_api,
instance, exc_info[1], exc_info=exc_info) instance, exc_info[1], exc_info=exc_info)
self._notify_about_instance_usage(context, instance,
'instance.create.error', fault=exc_info[1])
try: try:
LOG.debug(_("Clean up resource before rescheduling."), LOG.debug(_("Clean up resource before rescheduling."),
@@ -1515,12 +1517,12 @@ class ComputeManager(manager.Manager):
def _notify_about_instance_usage(self, context, instance, event_suffix, def _notify_about_instance_usage(self, context, instance, event_suffix,
network_info=None, system_metadata=None, network_info=None, system_metadata=None,
extra_usage_info=None): extra_usage_info=None, fault=None):
compute_utils.notify_about_instance_usage( compute_utils.notify_about_instance_usage(
self.notifier, context, instance, event_suffix, self.notifier, context, instance, event_suffix,
network_info=network_info, network_info=network_info,
system_metadata=system_metadata, system_metadata=system_metadata,
extra_usage_info=extra_usage_info) extra_usage_info=extra_usage_info, fault=fault)
def _deallocate_network(self, context, instance, def _deallocate_network(self, context, instance,
requested_networks=None): requested_networks=None):
@@ -1697,29 +1699,22 @@ class ComputeManager(manager.Manager):
'create.end', 'create.end',
extra_usage_info={'message': _('Success')}, extra_usage_info={'message': _('Success')},
network_info=network_info) network_info=network_info)
except exception.InstanceNotFound as e: except (exception.InstanceNotFound,
exception.UnexpectedDeletingTaskStateError) as e:
with excutils.save_and_reraise_exception(): with excutils.save_and_reraise_exception():
self._notify_about_instance_usage(context, instance, self._notify_about_instance_usage(context, instance,
'create.end', 'create.end', fault=e)
extra_usage_info={'message': e.format_message()})
except exception.UnexpectedDeletingTaskStateError as e:
with excutils.save_and_reraise_exception():
msg = e.format_message()
self._notify_about_instance_usage(context, instance,
'create.end', extra_usage_info={'message': msg})
except exception.ComputeResourcesUnavailable as e: except exception.ComputeResourcesUnavailable as e:
LOG.debug(e.format_message(), instance=instance) LOG.debug(e.format_message(), instance=instance)
self._notify_about_instance_usage(context, instance, self._notify_about_instance_usage(context, instance,
'create.error', 'create.error', fault=e)
extra_usage_info={'message': e.format_message()})
raise exception.RescheduledException( raise exception.RescheduledException(
instance_uuid=instance.uuid, reason=e.format_message()) instance_uuid=instance.uuid, reason=e.format_message())
except exception.BuildAbortException as e: except exception.BuildAbortException as e:
with excutils.save_and_reraise_exception(): with excutils.save_and_reraise_exception():
LOG.debug(e.format_message, instance=instance) LOG.debug(e.format_message(), instance=instance)
self._notify_about_instance_usage(context, instance, self._notify_about_instance_usage(context, instance,
'create.error', extra_usage_info={'message': 'create.error', fault=e)
e.format_message()})
except (exception.VirtualInterfaceCreateException, except (exception.VirtualInterfaceCreateException,
exception.VirtualInterfaceMacAddressException, exception.VirtualInterfaceMacAddressException,
exception.FixedIpLimitExceeded, exception.FixedIpLimitExceeded,
@@ -1727,15 +1722,13 @@ class ComputeManager(manager.Manager):
LOG.exception(_('Failed to allocate network(s)'), LOG.exception(_('Failed to allocate network(s)'),
instance=instance) instance=instance)
self._notify_about_instance_usage(context, instance, self._notify_about_instance_usage(context, instance,
'create.error', extra_usage_info={'message': 'create.error', fault=e)
e.format_message()})
msg = _('Failed to allocate the network(s), not rescheduling.') msg = _('Failed to allocate the network(s), not rescheduling.')
raise exception.BuildAbortException(instance_uuid=instance.uuid, raise exception.BuildAbortException(instance_uuid=instance.uuid,
reason=msg) reason=msg)
except Exception as e: except Exception as e:
self._notify_about_instance_usage(context, instance, self._notify_about_instance_usage(context, instance,
'create.error', 'create.error', fault=e)
extra_usage_info={'message': str(e)})
raise exception.RescheduledException( raise exception.RescheduledException(
instance_uuid=instance.uuid, reason=str(e)) instance_uuid=instance.uuid, reason=str(e))
@@ -2400,25 +2393,23 @@ class ComputeManager(manager.Manager):
bad_volumes_callback=bad_volumes_callback) bad_volumes_callback=bad_volumes_callback)
except Exception as error: except Exception as error:
# Can't use save_and_reraise as we don't know yet if we with excutils.save_and_reraise_exception() as ctxt:
# will re-raise or not exc_info = sys.exc_info()
type_, value, tb = sys.exc_info() # if the reboot failed but the VM is running don't
# put it into an error state
compute_utils.add_instance_fault_from_exc(context, new_power_state = self._get_power_state(context, instance)
self.conductor_api, instance, error, if new_power_state == power_state.RUNNING:
sys.exc_info()) LOG.warning(_('Reboot failed but instance is running'),
context=context, instance=instance)
# if the reboot failed but the VM is running don't compute_utils.add_instance_fault_from_exc(context,
# put it into an error state self.conductor_api, instance, error, exc_info)
new_power_state = self._get_power_state(context, instance) self._notify_about_instance_usage(context, instance,
if new_power_state == power_state.RUNNING: 'reboot.error', fault=error)
LOG.warning(_('Reboot failed but instance is running'), ctxt.reraise = False
context=context, instance=instance) else:
else: LOG.error(_('Cannot reboot instance: %s'), error,
LOG.error(_('Cannot reboot instance: %s'), error, context=context, instance=instance)
context=context, instance=instance) self._set_instance_obj_error_state(context, instance)
self._set_instance_obj_error_state(context, instance)
raise type_, value, tb
if not new_power_state: if not new_power_state:
new_power_state = self._get_power_state(context, instance) new_power_state = self._get_power_state(context, instance)
@@ -3127,9 +3118,6 @@ class ComputeManager(manager.Manager):
rescheduled = False rescheduled = False
instance_uuid = instance['uuid'] instance_uuid = instance['uuid']
compute_utils.add_instance_fault_from_exc(context, self.conductor_api,
instance, exc_info[0], exc_info=exc_info)
try: try:
# NOTE(comstud): remove the scheduler RPCAPI method when # NOTE(comstud): remove the scheduler RPCAPI method when
# this is adjusted to send to conductor... and then # this is adjusted to send to conductor... and then
@@ -3143,14 +3131,23 @@ class ComputeManager(manager.Manager):
rescheduled = self._reschedule(context, request_spec, rescheduled = self._reschedule(context, request_spec,
filter_properties, instance_uuid, scheduler_method, filter_properties, instance_uuid, scheduler_method,
method_args, task_state, exc_info) method_args, task_state, exc_info)
except Exception: except Exception as error:
rescheduled = False rescheduled = False
LOG.exception(_("Error trying to reschedule"), LOG.exception(_("Error trying to reschedule"),
instance_uuid=instance_uuid) instance_uuid=instance_uuid)
compute_utils.add_instance_fault_from_exc(context,
self.conductor_api, instance, error,
exc_info=sys.exc_info())
self._notify_about_instance_usage(context, instance,
'resize.error', fault=error)
if rescheduled: if rescheduled:
# log the original build error
self._log_original_error(exc_info, instance_uuid) self._log_original_error(exc_info, instance_uuid)
compute_utils.add_instance_fault_from_exc(context,
self.conductor_api, instance, exc_info[1],
exc_info=exc_info)
self._notify_about_instance_usage(context, instance,
'resize.error', fault=exc_info[1])
else: else:
# not re-scheduling # not re-scheduling
raise exc_info[0], exc_info[1], exc_info[2] raise exc_info[0], exc_info[1], exc_info[2]

View File

@@ -39,12 +39,11 @@ CONF.import_opt('host', 'nova.netconf')
LOG = log.getLogger(__name__) LOG = log.getLogger(__name__)
def add_instance_fault_from_exc(context, conductor, def exception_to_dict(fault):
instance, fault, exc_info=None): """Converts exceptions to a dict for use in notifications."""
"""Adds the specified fault to the database.""" #TODO(johngarbutt) move to nova/exception.py to share with wrap_exception
code = 500 code = 500
if hasattr(fault, "kwargs"): if hasattr(fault, "kwargs"):
code = fault.kwargs.get('code', 500) code = fault.kwargs.get('code', 500)
@@ -65,17 +64,35 @@ def add_instance_fault_from_exc(context, conductor,
# MySQL silently truncates overly long messages, but PostgreSQL throws an # MySQL silently truncates overly long messages, but PostgreSQL throws an
# error if we don't truncate it. # error if we don't truncate it.
u_message = unicode(message)[:255] u_message = unicode(message)[:255]
details = ''
if exc_info and code == 500: fault_dict = dict(exception=fault)
fault_dict["message"] = u_message
fault_dict["code"] = code
return fault_dict
def _get_fault_details(exc_info, error_code):
details = ''
if exc_info and error_code == 500:
tb = exc_info[2] tb = exc_info[2]
details += ''.join(traceback.format_tb(tb)) if tb:
details = ''.join(traceback.format_tb(tb))
return unicode(details)
def add_instance_fault_from_exc(context, conductor,
instance, fault, exc_info=None):
"""Adds the specified fault to the database."""
fault_dict = exception_to_dict(fault)
code = fault_dict["code"]
details = _get_fault_details(exc_info, code)
values = { values = {
'instance_uuid': instance['uuid'], 'instance_uuid': instance['uuid'],
'code': code, 'code': code,
'message': u_message, 'message': fault_dict["message"],
'details': unicode(details), 'details': details,
'host': CONF.host 'host': CONF.host
} }
conductor.instance_fault_create(context, values) conductor.instance_fault_create(context, values)
@@ -294,7 +311,7 @@ def notify_usage_exists(notifier, context, instance_ref, current_period=False,
def notify_about_instance_usage(notifier, context, instance, event_suffix, def notify_about_instance_usage(notifier, context, instance, event_suffix,
network_info=None, system_metadata=None, network_info=None, system_metadata=None,
extra_usage_info=None): extra_usage_info=None, fault=None):
"""Send a notification about an instance. """Send a notification about an instance.
:param notifier: a messaging.Notifier :param notifier: a messaging.Notifier
@@ -311,6 +328,13 @@ def notify_about_instance_usage(notifier, context, instance, event_suffix,
usage_info = notifications.info_from_instance(context, instance, usage_info = notifications.info_from_instance(context, instance,
network_info, system_metadata, **extra_usage_info) network_info, system_metadata, **extra_usage_info)
if fault:
# NOTE(johngarbutt) mirrors the format in wrap_exception
fault_payload = exception_to_dict(fault)
LOG.debug(fault_payload["message"], instance=instance,
exc_info=True)
usage_info.update(fault_payload)
if event_suffix.endswith("error"): if event_suffix.endswith("error"):
method = notifier.error method = notifier.error
else: else:

View File

@@ -2337,6 +2337,7 @@ class ComputeTestCase(BaseTestCase):
'args': (econtext, instance, expected_nw_info, 'args': (econtext, instance, expected_nw_info,
reboot_type), reboot_type),
'kwargs': {'block_device_info': fake_block_dev_info}} 'kwargs': {'block_device_info': fake_block_dev_info}}
fault = exception.InstanceNotFound(instance_id='instance-0000')
def fake_reboot(*args, **kwargs): def fake_reboot(*args, **kwargs):
reboot_call_info['args'] = args reboot_call_info['args'] = args
@@ -2347,7 +2348,7 @@ class ComputeTestCase(BaseTestCase):
# can't stub it out, thus we skip that comparison. # can't stub it out, thus we skip that comparison.
kwargs.pop('bad_volumes_callback') kwargs.pop('bad_volumes_callback')
if fail_reboot: if fail_reboot:
raise exception.InstanceNotFound(instance_id='instance-0000') raise fault
self.stubs.Set(self.compute.driver, 'reboot', fake_reboot) self.stubs.Set(self.compute.driver, 'reboot', fake_reboot)
@@ -2362,6 +2363,8 @@ class ComputeTestCase(BaseTestCase):
instance).AndReturn(fake_power_state3) instance).AndReturn(fake_power_state3)
if test_delete: if test_delete:
fault = exception.InstanceNotFound(
instance_id=instance['uuid'])
db.instance_update_and_get_original( db.instance_update_and_get_original(
econtext, updated_dbinstance1['uuid'], econtext, updated_dbinstance1['uuid'],
{'power_state': new_power_state, {'power_state': new_power_state,
@@ -2369,8 +2372,7 @@ class ComputeTestCase(BaseTestCase):
'vm_state': vm_states.ACTIVE}, 'vm_state': vm_states.ACTIVE},
update_cells=False, update_cells=False,
columns_to_join=['system_metadata'], columns_to_join=['system_metadata'],
).AndRaise(exception.InstanceNotFound( ).AndRaise(fault)
instance_id=instance['uuid']))
self.compute._notify_about_instance_usage( self.compute._notify_about_instance_usage(
econtext, econtext,
instance, instance,
@@ -2381,8 +2383,7 @@ class ComputeTestCase(BaseTestCase):
{'vm_state': vm_states.ERROR}, {'vm_state': vm_states.ERROR},
update_cells=False, update_cells=False,
columns_to_join=['system_metadata'], columns_to_join=['system_metadata'],
).AndRaise(exception.InstanceNotFound( ).AndRaise(fault)
instance_id=instance['uuid']))
else: else:
db.instance_update_and_get_original( db.instance_update_and_get_original(
econtext, updated_dbinstance1['uuid'], econtext, updated_dbinstance1['uuid'],
@@ -2392,6 +2393,9 @@ class ComputeTestCase(BaseTestCase):
update_cells=False, update_cells=False,
columns_to_join=['system_metadata'], columns_to_join=['system_metadata'],
).AndReturn((None, updated_dbinstance2)) ).AndReturn((None, updated_dbinstance2))
if fail_running:
self.compute._notify_about_instance_usage(econtext, instance,
'reboot.error', fault=fault)
self.compute._notify_about_instance_usage( self.compute._notify_about_instance_usage(
econtext, econtext,
instance, instance,

View File

@@ -1138,7 +1138,7 @@ class ComputeManagerBuildInstanceTestCase(test.NoDBTestCase):
network_info=self.network_info, network_info=self.network_info,
block_device_info=self.block_device_info).AndRaise(exc) block_device_info=self.block_device_info).AndRaise(exc)
self._notify_about_instance_usage('create.end', self._notify_about_instance_usage('create.end',
extra_usage_info={'message': exc.format_message()}, stub=False) fault=exc, stub=False)
conductor_rpcapi.ConductorAPI.instance_update( conductor_rpcapi.ConductorAPI.instance_update(
self.context, self.instance['uuid'], mox.IgnoreArg(), 'conductor') self.context, self.instance['uuid'], mox.IgnoreArg(), 'conductor')
self.mox.ReplayAll() self.mox.ReplayAll()
@@ -1161,16 +1161,15 @@ class ComputeManagerBuildInstanceTestCase(test.NoDBTestCase):
self._notify_about_instance_usage('create.start', self._notify_about_instance_usage('create.start',
extra_usage_info={'image_name': self.image.get('name')}) extra_usage_info={'image_name': self.image.get('name')})
self._build_and_run_instance_update() self._build_and_run_instance_update()
exc = test.TestingException()
self.compute.driver.spawn(self.context, self.instance, self.image, self.compute.driver.spawn(self.context, self.instance, self.image,
self.injected_files, self.admin_pass, self.injected_files, self.admin_pass,
network_info=self.network_info, network_info=self.network_info,
block_device_info=self.block_device_info).AndRaise( block_device_info=self.block_device_info).AndRaise(exc)
test.TestingException())
conductor_rpcapi.ConductorAPI.instance_update( conductor_rpcapi.ConductorAPI.instance_update(
self.context, self.instance['uuid'], mox.IgnoreArg(), 'conductor') self.context, self.instance['uuid'], mox.IgnoreArg(), 'conductor')
self._notify_about_instance_usage('create.error', self._notify_about_instance_usage('create.error',
extra_usage_info={'message': str(test.TestingException())}, fault=exc, stub=False)
stub=False)
self.mox.ReplayAll() self.mox.ReplayAll()
self.assertRaises(exception.RescheduledException, self.assertRaises(exception.RescheduledException,
@@ -1214,7 +1213,7 @@ class ComputeManagerBuildInstanceTestCase(test.NoDBTestCase):
mock.call(self.context, self.instance, 'create.start', mock.call(self.context, self.instance, 'create.start',
extra_usage_info={'image_name': self.image.get('name')}), extra_usage_info={'image_name': self.image.get('name')}),
mock.call(self.context, self.instance, 'create.error', mock.call(self.context, self.instance, 'create.error',
extra_usage_info={'message': exc.format_message()})]) fault=exc)])
save.assert_has_calls([ save.assert_has_calls([
mock.call(), mock.call(),
@@ -1249,7 +1248,7 @@ class ComputeManagerBuildInstanceTestCase(test.NoDBTestCase):
self._notify_about_instance_usage('create.start', self._notify_about_instance_usage('create.start',
extra_usage_info={'image_name': self.image.get('name')}) extra_usage_info={'image_name': self.image.get('name')})
self._notify_about_instance_usage('create.error', self._notify_about_instance_usage('create.error',
extra_usage_info={'message': exc.format_message()}, stub=False) fault=exc, stub=False)
self.compute.compute_task_api.build_instances(self.context, self.compute.compute_task_api.build_instances(self.context,
[self.instance], self.image, [], self.admin_pass, [self.instance], self.image, [], self.admin_pass,
self.injected_files, self.requested_networks, self.injected_files, self.requested_networks,
@@ -1283,7 +1282,7 @@ class ComputeManagerBuildInstanceTestCase(test.NoDBTestCase):
self.requested_networks, self.security_groups, self.image, self.requested_networks, self.security_groups, self.image,
self.block_device_mapping).AndRaise(exc) self.block_device_mapping).AndRaise(exc)
self._notify_about_instance_usage('create.error', self._notify_about_instance_usage('create.error',
extra_usage_info={'message': exc.format_message()}, stub=False) fault=exc, stub=False)
self.mox.ReplayAll() self.mox.ReplayAll()
self.assertRaises(exception.BuildAbortException, self.assertRaises(exception.BuildAbortException,
self.compute._build_and_run_instance, self.context, self.compute._build_and_run_instance, self.context,