compute: Ensure pre-migrating instances are destroyed during init_host

Previously _destroy_evacuated_instances would not remove instances
associated with evacuation migration records in a pre-migrating state.
This could lead to a race between the original source host and the new
destination if the source returned early, calling init_instance during
the evacuation process.

This change now includes pre-migrating migration records when looking
for active evacuations. Additionally the dict of evacuating instances is
then returned to init_host and used to skip running init_instance
against such instances ensuring no race occurs between the two computes.

Closes-bug: #1764883
Change-Id: I379678dfdb2609f12a572d4f99c8e9da4deab803
This commit is contained in:
Lee Yarwood 2018-04-18 14:35:07 +01:00 committed by Matt Riedemann
parent b9213398c7
commit c4988cdabf
4 changed files with 68 additions and 19 deletions

View File

@ -624,10 +624,11 @@ class ComputeManager(manager.Manager):
While nova-compute was down, the instances running on it could be
evacuated to another host. This method looks for evacuation migration
records where this is the source host and which were either started
(accepted) or complete (done). From those migration records, local
instances reported by the hypervisor are compared to the instances
for the migration records and those local guests are destroyed, along
with instance allocation records in Placement for this node.
(accepted), in-progress (pre-migrating) or migrated (done). From those
migration records, local instances reported by the hypervisor are
compared to the instances for the migration records and those local
guests are destroyed, along with instance allocation records in
Placement for this node.
"""
filters = {
'source_compute': self.host,
@ -635,7 +636,13 @@ class ComputeManager(manager.Manager):
# included in case the source node comes back up while instances
# are being evacuated to another host. We don't want the same
# instance being reported from multiple hosts.
'status': ['accepted', 'done'],
# NOTE(lyarwood): pre-migrating is also included here as the
# source compute can come back online shortly after the RT
# claims on the destination that in-turn moves the migration to
# pre-migrating. If the evacuate fails on the destination host,
# the user can rebuild the instance (in ERROR state) on the source
# host.
'status': ['accepted', 'pre-migrating', 'done'],
'migration_type': 'evacuation',
}
with utils.temporary_mutation(context, read_deleted='yes'):
@ -698,6 +705,7 @@ class ComputeManager(manager.Manager):
migration.status = 'completed'
migration.save()
return evacuations
def _is_instance_storage_shared(self, context, instance, host=None):
shared_storage = True
@ -1133,9 +1141,14 @@ class ComputeManager(manager.Manager):
try:
# checking that instance was not already evacuated to other host
self._destroy_evacuated_instances(context)
evacuated_instances = self._destroy_evacuated_instances(context)
# Initialise instances on the host that are not evacuating
for instance in instances:
self._init_instance(context, instance)
if (not evacuated_instances or
instance.uuid not in evacuated_instances):
self._init_instance(context, instance)
finally:
if CONF.defer_iptables_apply:
self.driver.filter_defer_apply_off()

View File

@ -25,13 +25,9 @@ class TestEvacuationWithSourceReturningDuringRebuild(
test.TestCase, integrated_helpers.InstanceHelperMixin):
"""Assert the behaviour of evacuating instances when the src returns early.
This test asserts that evacuating instances end up in an ERROR state on the
source when that host comes back online during an evacuation while the
migration record is in a pre-migrating state.
This currently leads to a race between the source initialising the instance
and destination claim during the rebuild, both of which attempt to update
the underlying task_state of the instance.
This test asserts that evacuating instances end up in an ACTIVE state on
the destination even when the source host comes back online during an
evacuation while the migration record is in a pre-migrating state.
"""
def setUp(self):
@ -113,8 +109,15 @@ class TestEvacuationWithSourceReturningDuringRebuild(
# Start evacuating the instance from the source_host
self.api.post_server_action(server['id'], {'evacuate': {}})
# FIXME(lyarwood): Assert that the evacuation fails at present with the
# instance remaining on the source in an ERROR state.
self._wait_for_state_change(self.api, server, 'ERROR')
# Wait for the instance to go into an ACTIVE state
self._wait_for_state_change(self.api, server, 'ACTIVE')
server = self.api.get_server(server['id'])
self.assertEqual(self.source_compute, server['OS-EXT-SRV-ATTR:host'])
host = server['OS-EXT-SRV-ATTR:host']
migrations = self.api.get_migrations()
# Assert that we have a single `done` migration record after the evac
self.assertEqual(1, len(migrations))
self.assertEqual('done', migrations[0]['status'])
# Assert that the instance is now on the dest
self.assertNotEqual(self.source_compute, host)

View File

@ -7500,7 +7500,10 @@ class ComputeTestCase(BaseTestCase,
mock_get_filter.assert_called_once_with(fake_context,
{'source_compute': self.compute.host,
'status': ['accepted', 'done'],
'status': [
'accepted',
'pre-migrating',
'done'],
'migration_type': 'evacuation'})
mock_get_inst.assert_called_once_with(fake_context)
mock_get_nw.assert_called_once_with(fake_context, evacuated_instance)

View File

@ -766,6 +766,36 @@ class ComputeManagerUnitTestCase(test.NoDBTestCase):
mock.ANY, mock.ANY, mock.ANY)
mock_save.assert_called_once_with()
@mock.patch.object(context, 'get_admin_context')
@mock.patch.object(objects.InstanceList, 'get_by_host')
@mock.patch.object(fake_driver.FakeDriver, 'init_host')
@mock.patch('nova.compute.manager.ComputeManager._init_instance')
@mock.patch('nova.compute.manager.ComputeManager.'
'_destroy_evacuated_instances')
def test_init_host_with_in_progress_evacuations(self, mock_destroy_evac,
mock_init_instance, mock_init_host, mock_host_get,
mock_admin_ctxt):
"""Assert that init_instance is not called for instances that are
evacuating from the host during init_host.
"""
active_instance = fake_instance.fake_instance_obj(
self.context, host=self.compute.host, uuid=uuids.active_instance)
evacuating_instance = fake_instance.fake_instance_obj(
self.context, host=self.compute.host, uuid=uuids.evac_instance)
instance_list = objects.InstanceList(self.context,
objects=[active_instance, evacuating_instance])
mock_host_get.return_value = instance_list
mock_admin_ctxt.return_value = self.context
mock_destroy_evac.return_value = {
uuids.evac_instance: evacuating_instance
}
self.compute.init_host()
mock_init_instance.assert_called_once_with(
self.context, active_instance)
def test_init_instance_with_binding_failed_vif_type(self):
# this instance will plug a 'binding_failed' vif
instance = fake_instance.fake_instance_obj(