[compute] always set instance.host in post_livemigration

This change add a new _post_live_migration_update_host
function that wraps _post_live_migration and just ensures
that if we exit due to an exception instance.host is set
to the destination host.

when we are in _post_live_migration the guest has already
started running on the destination host and we cannot revert.
Sometimes admins or users will hard reboot the instance expecting
that to fix everything when the vm enters the error state after
the failed migrations. Previously this would end up recreating the
instance on the source node leading to possible data corruption if
the instance used shared storage.

Change-Id: Ibc4bc7edf1c8d1e841c72c9188a0a62836e9f153
Partial-Bug: #1628606
This commit is contained in:
Sean Mooney 2021-05-13 12:48:21 +01:00 committed by Amit Uniyal
parent a20baeca1f
commit 8449b7caef
3 changed files with 64 additions and 7 deletions

View File

@ -8761,8 +8761,9 @@ class ComputeManager(manager.Manager):
# host attachment. We fetch BDMs before that to retain connection_info # host attachment. We fetch BDMs before that to retain connection_info
# and attachment_id relating to the source host for post migration # and attachment_id relating to the source host for post migration
# cleanup. # cleanup.
post_live_migration = functools.partial(self._post_live_migration, post_live_migration = functools.partial(
source_bdms=source_bdms) self._post_live_migration_update_host, source_bdms=source_bdms
)
rollback_live_migration = functools.partial( rollback_live_migration = functools.partial(
self._rollback_live_migration, source_bdms=source_bdms) self._rollback_live_migration, source_bdms=source_bdms)
@ -9037,6 +9038,42 @@ class ComputeManager(manager.Manager):
bdm.attachment_id, self.host, bdm.attachment_id, self.host,
str(e), instance=instance) str(e), instance=instance)
# TODO(sean-k-mooney): add typing
def _post_live_migration_update_host(
self, ctxt, instance, dest, block_migration=False,
migrate_data=None, source_bdms=None
):
try:
self._post_live_migration(
ctxt, instance, dest, block_migration, migrate_data,
source_bdms)
except Exception:
# Restore the instance object
node_name = None
try:
# get node name of compute, where instance will be
# running after migration, that is destination host
compute_node = self._get_compute_info(ctxt, dest)
node_name = compute_node.hypervisor_hostname
except exception.ComputeHostNotFound:
LOG.exception('Failed to get compute_info for %s', dest)
# we can never rollback from post live migration and we can only
# get here if the instance is running on the dest so we ensure
# the instance.host is set correctly and reraise the original
# exception unmodified.
if instance.host != dest:
# apply saves the new fields while drop actually removes the
# migration context from the instance, so migration persists.
instance.apply_migration_context()
instance.drop_migration_context()
instance.host = dest
instance.task_state = None
instance.node = node_name
instance.progress = 0
instance.save()
raise
@wrap_exception() @wrap_exception()
@wrap_instance_fault @wrap_instance_fault
def _post_live_migration(self, ctxt, instance, dest, def _post_live_migration(self, ctxt, instance, dest,
@ -9048,7 +9085,7 @@ class ComputeManager(manager.Manager):
and mainly updating database record. and mainly updating database record.
:param ctxt: security context :param ctxt: security context
:param instance: instance dict :param instance: instance object
:param dest: destination host :param dest: destination host
:param block_migration: if true, prepare for block migration :param block_migration: if true, prepare for block migration
:param migrate_data: if not None, it is a dict which has data :param migrate_data: if not None, it is a dict which has data

View File

@ -53,9 +53,8 @@ class PostLiveMigrationFail(
"Failed to remove source vol connection post live migration") "Failed to remove source vol connection post live migration")
mock_migration.side_effect = error mock_migration.side_effect = error
self._live_migrate( server = self._live_migrate(
server, migration_expected_state='error', server, migration_expected_state='error',
server_expected_state='ERROR') server_expected_state='ERROR')
# FIXME(amit): this should point to the dest as after migration
# but does not because of bug 1628606 self.assertEqual(self.dest.host, server['OS-EXT-SRV-ATTR:host'])
self.assertEqual(self.src.host, server['OS-EXT-SRV-ATTR:host'])

View File

@ -10209,6 +10209,27 @@ class ComputeManagerMigrationTestCase(test.NoDBTestCase,
self.instance, self.instance,
migration) migration)
def test_post_live_migration_update_host(self):
@mock.patch.object(self.compute, '_get_compute_info')
def _test_post_live_migration(_get_compute_info):
dest_host = 'dest'
cn = objects.ComputeNode(hypervisor_hostname=dest_host)
_get_compute_info.return_value = cn
instance = fake_instance.fake_instance_obj(self.context,
node='src',
uuid=uuids.instance)
with mock.patch.object(self.compute, "_post_live_migration"
) as plm, mock.patch.object(instance, "save") as save:
error = ValueError("some failure")
plm.side_effect = error
self.assertRaises(
ValueError, self.compute._post_live_migration_update_host,
self.context, instance, dest_host)
save.assert_called_once()
self.assertEqual(instance.host, dest_host)
_test_post_live_migration()
def test_post_live_migration_cinder_pre_344_api(self): def test_post_live_migration_cinder_pre_344_api(self):
# Because live migration has # Because live migration has
# succeeded,_post_live_migration_remove_source_vol_connections() # succeeded,_post_live_migration_remove_source_vol_connections()