Merge "Sanity check instance mapping during scheduling" into stable/stein

This commit is contained in:
Zuul 2020-11-16 17:38:32 +00:00 committed by Gerrit Code Review
commit 051a51f521
2 changed files with 120 additions and 17 deletions

View File

@ -1301,8 +1301,37 @@ class ComputeTaskManager(base.Base):
updates = {'vm_state': vm_states.ERROR, 'task_state': None}
for instance in instances_by_uuid.values():
inst_mapping = None
try:
# We don't need the cell0-targeted context here because the
# instance mapping is in the API DB.
inst_mapping = \
objects.InstanceMapping.get_by_instance_uuid(
context, instance.uuid)
except exception.InstanceMappingNotFound:
# The API created the instance mapping record so it should
# definitely be here. Log an error but continue to create the
# instance in the cell0 database.
LOG.error('While burying instance in cell0, no instance '
'mapping was found.', instance=instance)
# Perform a final sanity check that the instance is not mapped
# to some other cell already because of maybe some crazy
# clustered message queue weirdness.
if inst_mapping and inst_mapping.cell_mapping is not None:
LOG.error('When attempting to bury instance in cell0, the '
'instance is already mapped to cell %s. Ignoring '
'bury in cell0 attempt.',
inst_mapping.cell_mapping.identity,
instance=instance)
continue
with obj_target_cell(instance, cell0) as cctxt:
instance.create()
if inst_mapping:
inst_mapping.cell_mapping = cell0
inst_mapping.save()
# Record an instance action with a failed event.
self._create_instance_action_for_cell0(
@ -1322,16 +1351,6 @@ class ComputeTaskManager(base.Base):
self._set_vm_state_and_notify(
cctxt, instance.uuid, 'build_instances', updates,
exc, request_spec)
try:
# We don't need the cell0-targeted context here because the
# instance mapping is in the API DB.
inst_mapping = \
objects.InstanceMapping.get_by_instance_uuid(
context, instance.uuid)
inst_mapping.cell_mapping = cell0
inst_mapping.save()
except exception.InstanceMappingNotFound:
pass
for build_request in build_requests:
try:
@ -1542,13 +1561,8 @@ class ComputeTaskManager(base.Base):
instance.tags = instance_tags if instance_tags \
else objects.TagList()
# Update mapping for instance. Normally this check is guarded by
# a try/except but if we're here we know that a newer nova-api
# handled the build process and would have created the mapping
inst_mapping = objects.InstanceMapping.get_by_instance_uuid(
context, instance.uuid)
inst_mapping.cell_mapping = cell
inst_mapping.save()
# Update mapping for instance.
self._map_instance_to_cell(context, instance, cell)
if not self._delete_build_request(
context, build_request, instance, cell, instance_bdms,
@ -1576,6 +1590,37 @@ class ComputeTaskManager(base.Base):
host=host.service_host, node=host.nodename,
limits=host.limits, host_list=host_list)
@staticmethod
def _map_instance_to_cell(context, instance, cell):
    """Point the instance mapping at the cell chosen by the scheduler.

    Called during initial scheduling once a host (and therefore a cell)
    has been selected in which to build the instance.

    :param context: nova auth RequestContext
    :param instance: Instance object being built
    :param cell: CellMapping representing the cell in which the instance
        was created and is being built.
    :returns: InstanceMapping object that was updated.
    """
    mapping = objects.InstanceMapping.get_by_instance_uuid(
        context, instance.uuid)
    # Sanity check: at this point the mapping should not yet point at
    # any cell. If it does (e.g. because of clustered message queue
    # weirdness), log loudly but still overwrite it with the chosen
    # cell so scheduling can proceed.
    if mapping.cell_mapping is not None:
        LOG.error('During scheduling instance is already mapped to '
                  'another cell: %s. This should not happen and is an '
                  'indication of bigger problems. If you see this you '
                  'should report it to the nova team. Overwriting '
                  'the mapping to point at cell %s.',
                  mapping.cell_mapping.identity, cell.identity,
                  instance=instance)
    mapping.cell_mapping = cell
    mapping.save()
    return mapping
def _cleanup_build_artifacts(self, context, exc, instances, build_requests,
request_specs, block_device_mappings, tags,
cell_mapping_cache):

View File

@ -2270,6 +2270,27 @@ class ConductorTaskTestCase(_BaseTaskTestCase, test_compute.BaseTestCase):
self.params['request_specs'][0].requested_resources = []
self._do_schedule_and_build_instances_test(self.params)
def test_map_instance_to_cell_already_mapped(self):
    """Instance mapping already points at a cell when scheduling maps it.

    _map_instance_to_cell should log an error but still overwrite the
    stale mapping with the newly selected cell.
    """
    request = self.params['build_requests'][0]
    instance = request.get_new_instance(self.ctxt)
    # Pre-point the mapping at cell0 to simulate MQ split brain
    # craziness before scheduling runs.
    mapping = objects.InstanceMapping.get_by_instance_uuid(
        self.ctxt, instance.uuid)
    mapping.cell_mapping = self.cell_mappings['cell0']
    mapping.save()
    target_cell = self.cell_mappings['cell1']
    result = self.conductor._map_instance_to_cell(
        self.ctxt, instance, target_cell)
    # The mapping should have been overwritten to point at cell1 and an
    # error should have been logged about the pre-existing mapping.
    self.assertEqual(target_cell.uuid, result.cell_mapping.uuid)
    self.assertIn('During scheduling instance is already mapped to '
                  'another cell', self.stdlog.logger.output)
@mock.patch('nova.objects.InstanceMapping.get_by_instance_uuid')
def test_cleanup_build_artifacts(self, inst_map_get):
"""Simple test to ensure the order of operations in the cleanup method
@ -2367,6 +2388,17 @@ class ConductorTaskTestCase(_BaseTaskTestCase, test_compute.BaseTestCase):
build_requests = objects.BuildRequestList.get_all(self.ctxt)
instances = objects.InstanceList.get_all(self.ctxt)
# Verify instance mappings.
inst_mappings = objects.InstanceMappingList.get_by_cell_id(
self.ctxt, self.cell_mappings['cell0'].id)
# bare_br is the only instance that has a mapping from setUp.
self.assertEqual(1, len(inst_mappings))
# Since we did not setup instance mappings for the other fake build
# requests used in this test, we should see a message logged about
# there being no instance mappings.
self.assertIn('While burying instance in cell0, no instance mapping '
'was found', self.stdlog.logger.output)
self.assertEqual(0, len(build_requests))
self.assertEqual(4, len(instances))
inst_states = {inst.uuid: (inst.deleted, inst.vm_state)
@ -2464,6 +2496,32 @@ class ConductorTaskTestCase(_BaseTaskTestCase, test_compute.BaseTestCase):
test.MatchType(context.RequestContext), inst.uuid,
self.params['tags'])
@mock.patch('nova.objects.Instance.create')
def test_bury_in_cell0_already_mapped(self, mock_inst_create):
    """Instance mapping already points at a cell during a cell0 bury.

    _bury_in_cell0 should detect the existing mapping, log an error and
    skip both the cell0 instance create and the mapping update.
    """
    request = self.params['build_requests'][0]
    # Simulate MQ split brain craziness: point the mapping at cell1
    # before attempting to bury the instance in cell0.
    mapping = objects.InstanceMapping.get_by_instance_uuid(
        self.ctxt, request.instance_uuid)
    mapping.cell_mapping = self.cell_mappings['cell1']
    mapping.save()
    with mock.patch.object(mapping, 'save') as mock_mapping_save:
        self.conductor._bury_in_cell0(
            self.ctxt, self.params['request_specs'][0],
            exc.NoValidHost(reason='idk'), build_requests=[request])
        # The bury must have been aborted: no instance created in cell0
        # and the mapping left untouched.
        mock_inst_create.assert_not_called()
        mock_mapping_save.assert_not_called()
        # And an error about the conflicting mapping was logged.
        self.assertIn('When attempting to bury instance in cell0, the '
                      'instance is already mapped to cell',
                      self.stdlog.logger.output)
def test_reset(self):
with mock.patch('nova.compute.rpcapi.ComputeAPI') as mock_rpc:
old_rpcapi = self.conductor_manager.compute_rpcapi