Browse Source

Stop failed live-migrates getting stuck migrating

When there are failures in driver.cleanup, we are seeing live-migrations
that get stuck in the live-migrating state. While there has been a patch
to stop the cause listed in the bug this closes, there are other
failures (such as a token timeout when talking to cinder or neutron)
that could trigger this same failure mode.

When we hit an error this late in live-migration, it should be a very
rare event, so it's best to just put the instance and migration into an
error state, and help alert both the operator and API user to the
failure that has occurred.

For backport into Newton, 'migrate_instance_start' had to be patched
in the unit test (nova/tests/unit/compute/test_compute.py).

Closes-Bug: #1662626

Change-Id: Idfdce9e7dd8106af01db0358ada15737cb846395
(cherry picked from commit b56f8fc2d1)
(cherry picked from commit 012fa9353f)
changes/87/470387/4
John Garbutt 4 years ago
committed by Shane Peters
parent
commit
017e853b95
2 changed files with 52 additions and 3 deletions
  1. +7
    -3
      nova/compute/manager.py
  2. +45
    -0
      nova/tests/unit/compute/test_compute.py

+ 7
- 3
nova/compute/manager.py View File

@ -5287,12 +5287,16 @@ class ComputeManager(manager.Manager):
self._rollback_live_migration,
block_migration, migrate_data)
except Exception:
# Executing live migration
# live_migration might raise exceptions, but
# nothing must be recovered in this version.
LOG.exception(_LE('Live migration failed.'), instance=instance)
with excutils.save_and_reraise_exception():
# Put instance and migration into error state,
# as its almost certainly too late to rollback
self._set_migration_status(migration, 'error')
# first refresh instance as it may have got updated by
# post_live_migration_at_destination
instance.refresh()
self._set_instance_obj_error_state(context, instance,
clean_task_state=True)
@wrap_exception()
@wrap_instance_event(prefix='compute')


+ 45
- 0
nova/tests/unit/compute/test_compute.py View File

@ -5801,6 +5801,51 @@ class ComputeTestCase(BaseTestCase):
mock_post.assert_called_once_with(c, instance, False, dest)
mock_clear.assert_called_once_with(mock.ANY)
@mock.patch.object(compute_rpcapi.ComputeAPI, 'pre_live_migration')
@mock.patch.object(network_api.API, 'migrate_instance_start')
@mock.patch.object(compute_rpcapi.ComputeAPI,
                   'post_live_migration_at_destination')
@mock.patch.object(compute_manager.InstanceEvents,
                   'clear_events_for_instance')
@mock.patch.object(compute_utils, 'EventReporter')
@mock.patch('nova.objects.Migration.save')
def test_live_migration_handles_errors_correctly(self,
        mock_save, mock_event, mock_clear,
        mock_post, mock_migrate, mock_pre):
    """A failure in driver.cleanup during live_migration() must put the
    instance into ERROR state (with task state cleared) and the migration
    record into 'error' status, while re-raising the original exception
    to the caller.
    """
    # Fixtures: an instance hosted on this compute node plus
    # libvirt-style migrate data returned by pre_live_migration.
    ctx = context.get_admin_context()
    instance = self._create_fake_instance_obj(context=ctx)
    instance.host = self.compute.host
    dest_host = 'desthost'
    migrate_data = migrate_data_obj.LibvirtLiveMigrateData(
        is_shared_instance_path=False,
        is_shared_block_storage=False)
    mock_pre.return_value = migrate_data

    migration = objects.Migration()
    # Make the late-stage driver.cleanup call blow up and verify the
    # exception propagates out of live_migration().
    with mock.patch.object(self.compute.driver, 'cleanup',
                           side_effect=test.TestingException):
        self.assertRaises(test.TestingException,
                          self.compute.live_migration,
                          ctx, dest_host, instance, False, migration,
                          migrate_data)

    # Both the instance and the migration record must now be errored
    # out, and the instance's task state reset.
    self.assertEqual(vm_states.ERROR, instance.vm_state)
    self.assertIsNone(instance.task_state)
    self.assertEqual("error", migration.status)

    mock_pre.assert_called_once_with(ctx, instance, False, None,
                                     dest_host, migrate_data)
    self.assertEqual(0, mock_clear.call_count)

    # Tidy up the fake instance.
    instance.destroy()
@mock.patch.object(fake.FakeDriver, 'unfilter_instance')
@mock.patch.object(network_api.API, 'migrate_instance_start')
@mock.patch.object(compute_rpcapi.ComputeAPI,


Loading…
Cancel
Save