diff --git a/nova/compute/manager.py b/nova/compute/manager.py index 83523d7860..e3b07384fd 100644 --- a/nova/compute/manager.py +++ b/nova/compute/manager.py @@ -958,13 +958,13 @@ class ComputeManager(manager.Manager): extra_usage_info = {} - def notify(status, msg="", **kwargs): + def notify(status, msg="", fault=None, **kwargs): """Send a create.{start,error,end} notification.""" type_ = "create.%(status)s" % dict(status=status) info = extra_usage_info.copy() info['message'] = unicode(msg) self._notify_about_instance_usage(context, instance, type_, - extra_usage_info=info, **kwargs) + extra_usage_info=info, fault=fault, **kwargs) try: self._prebuild_instance(context, instance) @@ -985,7 +985,7 @@ class ComputeManager(manager.Manager): except exception.RescheduledException as e: # Instance build encountered an error, and has been rescheduled. - notify("error", msg=unicode(e)) # notify that build failed + notify("error", fault=e) except exception.BuildAbortException as e: # Instance build aborted due to a non-failure @@ -996,7 +996,7 @@ class ComputeManager(manager.Manager): # Instance build encountered a non-recoverable error: with excutils.save_and_reraise_exception(): self._set_instance_error_state(context, instance['uuid']) - notify("error", msg=unicode(e)) # notify that build failed + notify("error", fault=e) # notify that build failed def _prebuild_instance(self, context, instance): self._check_instance_exists(context, instance) @@ -1128,6 +1128,8 @@ class ComputeManager(manager.Manager): compute_utils.add_instance_fault_from_exc(context, self.conductor_api, instance, exc_info[1], exc_info=exc_info) + self._notify_about_instance_usage(context, instance, + 'instance.create.error', fault=exc_info[1]) try: LOG.debug(_("Clean up resource before rescheduling."), @@ -1515,12 +1517,12 @@ class ComputeManager(manager.Manager): def _notify_about_instance_usage(self, context, instance, event_suffix, network_info=None, system_metadata=None, - extra_usage_info=None): + extra_usage_info=None, fault=None): compute_utils.notify_about_instance_usage( self.notifier, context, instance, event_suffix, network_info=network_info, system_metadata=system_metadata, - extra_usage_info=extra_usage_info) + extra_usage_info=extra_usage_info, fault=fault) def _deallocate_network(self, context, instance, requested_networks=None): @@ -1697,29 +1699,22 @@ class ComputeManager(manager.Manager): 'create.end', extra_usage_info={'message': _('Success')}, network_info=network_info) - except exception.InstanceNotFound as e: + except (exception.InstanceNotFound, + exception.UnexpectedDeletingTaskStateError) as e: with excutils.save_and_reraise_exception(): self._notify_about_instance_usage(context, instance, - 'create.end', - extra_usage_info={'message': e.format_message()}) - except exception.UnexpectedDeletingTaskStateError as e: - with excutils.save_and_reraise_exception(): - msg = e.format_message() - self._notify_about_instance_usage(context, instance, - 'create.end', extra_usage_info={'message': msg}) + 'create.end', fault=e) except exception.ComputeResourcesUnavailable as e: LOG.debug(e.format_message(), instance=instance) self._notify_about_instance_usage(context, instance, - 'create.error', - extra_usage_info={'message': e.format_message()}) + 'create.error', fault=e) raise exception.RescheduledException( instance_uuid=instance.uuid, reason=e.format_message()) except exception.BuildAbortException as e: with excutils.save_and_reraise_exception(): - LOG.debug(e.format_message, instance=instance) + LOG.debug(e.format_message(), instance=instance) self._notify_about_instance_usage(context, instance, - 'create.error', extra_usage_info={'message': - e.format_message()}) + 'create.error', fault=e) except (exception.VirtualInterfaceCreateException, exception.VirtualInterfaceMacAddressException, exception.FixedIpLimitExceeded, @@ -1727,15 +1722,13 @@ class ComputeManager(manager.Manager): LOG.exception(_('Failed to allocate network(s)'), instance=instance) self._notify_about_instance_usage(context, instance, - 'create.error', extra_usage_info={'message': - e.format_message()}) + 'create.error', fault=e) msg = _('Failed to allocate the network(s), not rescheduling.') raise exception.BuildAbortException(instance_uuid=instance.uuid, reason=msg) except Exception as e: self._notify_about_instance_usage(context, instance, - 'create.error', - extra_usage_info={'message': str(e)}) + 'create.error', fault=e) raise exception.RescheduledException( instance_uuid=instance.uuid, reason=str(e)) @@ -2400,25 +2393,23 @@ class ComputeManager(manager.Manager): bad_volumes_callback=bad_volumes_callback) except Exception as error: - # Can't use save_and_reraise as we don't know yet if we - # will re-raise or not - type_, value, tb = sys.exc_info() - - compute_utils.add_instance_fault_from_exc(context, - self.conductor_api, instance, error, - sys.exc_info()) - - # if the reboot failed but the VM is running don't - # put it into an error state - new_power_state = self._get_power_state(context, instance) - if new_power_state == power_state.RUNNING: - LOG.warning(_('Reboot failed but instance is running'), - context=context, instance=instance) - else: - LOG.error(_('Cannot reboot instance: %s'), error, - context=context, instance=instance) - self._set_instance_obj_error_state(context, instance) - raise type_, value, tb + with excutils.save_and_reraise_exception() as ctxt: + exc_info = sys.exc_info() + # if the reboot failed but the VM is running don't + # put it into an error state + new_power_state = self._get_power_state(context, instance) + if new_power_state == power_state.RUNNING: + LOG.warning(_('Reboot failed but instance is running'), + context=context, instance=instance) + compute_utils.add_instance_fault_from_exc(context, + self.conductor_api, instance, error, exc_info) + self._notify_about_instance_usage(context, instance, + 'reboot.error', fault=error) + ctxt.reraise = False + else: + LOG.error(_('Cannot reboot instance: %s'), error, + context=context, instance=instance) + self._set_instance_obj_error_state(context, instance) if not new_power_state: new_power_state = self._get_power_state(context, instance) @@ -3127,9 +3118,6 @@ class ComputeManager(manager.Manager): rescheduled = False instance_uuid = instance['uuid'] - compute_utils.add_instance_fault_from_exc(context, self.conductor_api, - instance, exc_info[0], exc_info=exc_info) - try: # NOTE(comstud): remove the scheduler RPCAPI method when # this is adjusted to send to conductor... and then @@ -3143,14 +3131,23 @@ class ComputeManager(manager.Manager): rescheduled = self._reschedule(context, request_spec, filter_properties, instance_uuid, scheduler_method, method_args, task_state, exc_info) - except Exception: + except Exception as error: rescheduled = False LOG.exception(_("Error trying to reschedule"), instance_uuid=instance_uuid) + compute_utils.add_instance_fault_from_exc(context, + self.conductor_api, instance, error, + exc_info=sys.exc_info()) + self._notify_about_instance_usage(context, instance, + 'resize.error', fault=error) if rescheduled: - # log the original build error self._log_original_error(exc_info, instance_uuid) + compute_utils.add_instance_fault_from_exc(context, + self.conductor_api, instance, exc_info[1], + exc_info=exc_info) + self._notify_about_instance_usage(context, instance, + 'resize.error', fault=exc_info[1]) else: # not re-scheduling raise exc_info[0], exc_info[1], exc_info[2] diff --git a/nova/compute/utils.py b/nova/compute/utils.py index 43022e3b2c..796206dee5 100644 --- a/nova/compute/utils.py +++ b/nova/compute/utils.py @@ -39,12 +39,11 @@ CONF.import_opt('host', 'nova.netconf') LOG = log.getLogger(__name__) -def add_instance_fault_from_exc(context, conductor, - instance, fault, exc_info=None): - """Adds the specified fault to the database.""" +def exception_to_dict(fault): + """Converts exceptions to a dict for use in notifications.""" + #TODO(johngarbutt) move to nova/exception.py to share with wrap_exception code = 500 - if hasattr(fault, "kwargs"): code = fault.kwargs.get('code', 500) @@ -65,17 +64,35 @@ def add_instance_fault_from_exc(context, conductor, # MySQL silently truncates overly long messages, but PostgreSQL throws an # error if we don't truncate it. u_message = unicode(message)[:255] - details = '' - if exc_info and code == 500: + fault_dict = dict(exception=fault) + fault_dict["message"] = u_message + fault_dict["code"] = code + return fault_dict + + +def _get_fault_details(exc_info, error_code): + details = '' + if exc_info and error_code == 500: tb = exc_info[2] - details += ''.join(traceback.format_tb(tb)) + if tb: + details = ''.join(traceback.format_tb(tb)) + return unicode(details) + + +def add_instance_fault_from_exc(context, conductor, + instance, fault, exc_info=None): + """Adds the specified fault to the database.""" + + fault_dict = exception_to_dict(fault) + code = fault_dict["code"] + details = _get_fault_details(exc_info, code) values = { 'instance_uuid': instance['uuid'], 'code': code, - 'message': u_message, - 'details': unicode(details), + 'message': fault_dict["message"], + 'details': details, 'host': CONF.host } conductor.instance_fault_create(context, values) @@ -294,7 +311,7 @@ def notify_usage_exists(notifier, context, instance_ref, current_period=False, def notify_about_instance_usage(notifier, context, instance, event_suffix, network_info=None, system_metadata=None, - extra_usage_info=None): + extra_usage_info=None, fault=None): """Send a notification about an instance. :param notifier: a messaging.Notifier @@ -311,6 +328,13 @@ def notify_about_instance_usage(notifier, context, instance, event_suffix, usage_info = notifications.info_from_instance(context, instance, network_info, system_metadata, **extra_usage_info) + if fault: + # NOTE(johngarbutt) mirrors the format in wrap_exception + fault_payload = exception_to_dict(fault) + LOG.debug(fault_payload["message"], instance=instance, + exc_info=True) + usage_info.update(fault_payload) + if event_suffix.endswith("error"): method = notifier.error else: diff --git a/nova/tests/compute/test_compute.py b/nova/tests/compute/test_compute.py index 2ed80e0565..61bb924bdf 100644 --- a/nova/tests/compute/test_compute.py +++ b/nova/tests/compute/test_compute.py @@ -2337,6 +2337,7 @@ class ComputeTestCase(BaseTestCase): 'args': (econtext, instance, expected_nw_info, reboot_type), 'kwargs': {'block_device_info': fake_block_dev_info}} + fault = exception.InstanceNotFound(instance_id='instance-0000') def fake_reboot(*args, **kwargs): reboot_call_info['args'] = args @@ -2347,7 +2348,7 @@ class ComputeTestCase(BaseTestCase): # can't stub it out, thus we skip that comparison. kwargs.pop('bad_volumes_callback') if fail_reboot: - raise exception.InstanceNotFound(instance_id='instance-0000') + raise fault self.stubs.Set(self.compute.driver, 'reboot', fake_reboot) @@ -2362,6 +2363,8 @@ class ComputeTestCase(BaseTestCase): instance).AndReturn(fake_power_state3) if test_delete: + fault = exception.InstanceNotFound( + instance_id=instance['uuid']) db.instance_update_and_get_original( econtext, updated_dbinstance1['uuid'], {'power_state': new_power_state, @@ -2369,8 +2372,7 @@ class ComputeTestCase(BaseTestCase): 'vm_state': vm_states.ACTIVE}, update_cells=False, columns_to_join=['system_metadata'], - ).AndRaise(exception.InstanceNotFound( - instance_id=instance['uuid'])) + ).AndRaise(fault) self.compute._notify_about_instance_usage( econtext, instance, @@ -2381,8 +2383,7 @@ class ComputeTestCase(BaseTestCase): {'vm_state': vm_states.ERROR}, update_cells=False, columns_to_join=['system_metadata'], - ).AndRaise(exception.InstanceNotFound( - instance_id=instance['uuid'])) + ).AndRaise(fault) else: db.instance_update_and_get_original( econtext, updated_dbinstance1['uuid'], @@ -2392,6 +2393,9 @@ class ComputeTestCase(BaseTestCase): update_cells=False, columns_to_join=['system_metadata'], ).AndReturn((None, updated_dbinstance2)) + if fail_running: + self.compute._notify_about_instance_usage(econtext, instance, + 'reboot.error', fault=fault) self.compute._notify_about_instance_usage( econtext, instance, diff --git a/nova/tests/compute/test_compute_mgr.py b/nova/tests/compute/test_compute_mgr.py index 07489eda8c..7e0ffdf309 100644 --- a/nova/tests/compute/test_compute_mgr.py +++ b/nova/tests/compute/test_compute_mgr.py @@ -1138,7 +1138,7 @@ class ComputeManagerBuildInstanceTestCase(test.NoDBTestCase): network_info=self.network_info, block_device_info=self.block_device_info).AndRaise(exc) self._notify_about_instance_usage('create.end', - extra_usage_info={'message': exc.format_message()}, stub=False) + fault=exc, stub=False) conductor_rpcapi.ConductorAPI.instance_update( self.context, self.instance['uuid'], mox.IgnoreArg(), 'conductor') self.mox.ReplayAll() @@ -1161,16 +1161,15 @@ class ComputeManagerBuildInstanceTestCase(test.NoDBTestCase): self._notify_about_instance_usage('create.start', extra_usage_info={'image_name': self.image.get('name')}) self._build_and_run_instance_update() + exc = test.TestingException() self.compute.driver.spawn(self.context, self.instance, self.image, self.injected_files, self.admin_pass, network_info=self.network_info, - block_device_info=self.block_device_info).AndRaise( - test.TestingException()) + block_device_info=self.block_device_info).AndRaise(exc) conductor_rpcapi.ConductorAPI.instance_update( self.context, self.instance['uuid'], mox.IgnoreArg(), 'conductor') self._notify_about_instance_usage('create.error', - extra_usage_info={'message': str(test.TestingException())}, - stub=False) + fault=exc, stub=False) self.mox.ReplayAll() self.assertRaises(exception.RescheduledException, @@ -1214,7 +1213,7 @@ class ComputeManagerBuildInstanceTestCase(test.NoDBTestCase): mock.call(self.context, self.instance, 'create.start', extra_usage_info={'image_name': self.image.get('name')}), mock.call(self.context, self.instance, 'create.error', - extra_usage_info={'message': exc.format_message()})]) + fault=exc)]) save.assert_has_calls([ mock.call(), @@ -1249,7 +1248,7 @@ class ComputeManagerBuildInstanceTestCase(test.NoDBTestCase): self._notify_about_instance_usage('create.start', extra_usage_info={'image_name': self.image.get('name')}) self._notify_about_instance_usage('create.error', - extra_usage_info={'message': exc.format_message()}, stub=False) + fault=exc, stub=False) self.compute.compute_task_api.build_instances(self.context, [self.instance], self.image, [], self.admin_pass, self.injected_files, self.requested_networks, @@ -1283,7 +1282,7 @@ class ComputeManagerBuildInstanceTestCase(test.NoDBTestCase): self.requested_networks, self.security_groups, self.image, self.block_device_mapping).AndRaise(exc) self._notify_about_instance_usage('create.error', - extra_usage_info={'message': exc.format_message()}, stub=False) + fault=exc, stub=False) self.mox.ReplayAll() self.assertRaises(exception.BuildAbortException, self.compute._build_and_run_instance, self.context,