Sanity check instance mapping during scheduling

mnaser reported a weird case where an instance was found
in both cell0 (deleted there) and in cell1 (not deleted
there but in error state from a failed build). It's unclear
how this could happen, other than some weird clustered rabbitmq
issue where the schedule and build request to conductor is
processed twice for the same instance: one picks a host and
tries to build, while the other fails during scheduling and is
buried in cell0.

To avoid a split-brain situation like this, we add a sanity
check in _bury_in_cell0 to make sure the instance mapping is
not already pointing at a cell when we go to update it to cell0.
Similarly, a check is added in the schedule_and_build_instances
flow (the code is moved to a private method to make it easier
to test).

Worst case, this is unnecessary but doesn't hurt anything;
best case, it helps avoid split-brain issues from clustered
rabbitmq.

Closes-Bug: #1775934

Change-Id: I335113f0ec59516cb337d34b6fc9078ea202130f
(cherry picked from commit 5b552518e1)
Matt Riedemann 2019-09-20 17:07:35 -04:00 committed by melanie witt
parent 1ee93b9cd7
commit efc35b1c52
2 changed files with 120 additions and 17 deletions
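
As a standalone illustration of the guard described in the commit message,
here is a minimal sketch. The CellMapping and InstanceMapping classes and
the bury_in_cell0 helper below are simplified stand-ins for this example
only, not the real nova objects or the code in this change (that code is
in the diff that follows).

    import logging

    LOG = logging.getLogger(__name__)


    class CellMapping(object):
        def __init__(self, identity):
            self.identity = identity


    class InstanceMapping(object):
        def __init__(self, instance_uuid, cell_mapping=None):
            self.instance_uuid = instance_uuid
            self.cell_mapping = cell_mapping

        def save(self):
            # The real object persists to the API database; this stand-in
            # only keeps the value in memory.
            pass


    def bury_in_cell0(inst_mapping, cell0):
        # Split-brain guard: if another worker already mapped this instance
        # to a real cell, do not overwrite that mapping with cell0.
        if inst_mapping.cell_mapping is not None:
            LOG.error('Instance %s is already mapped to cell %s; ignoring '
                      'bury in cell0 attempt.', inst_mapping.instance_uuid,
                      inst_mapping.cell_mapping.identity)
            return False
        inst_mapping.cell_mapping = cell0
        inst_mapping.save()
        return True


    if __name__ == '__main__':
        cell0 = CellMapping('cell0')
        cell1 = CellMapping('cell1')
        # The mapping already points at cell1 (e.g. from a duplicate build
        # request), so the bury attempt is skipped and logged instead.
        mapping = InstanceMapping('fake-uuid', cell_mapping=cell1)
        assert bury_in_cell0(mapping, cell0) is False
        assert mapping.cell_mapping is cell1
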


@@ -1307,8 +1307,37 @@ class ComputeTaskManager(base.Base):
updates = {'vm_state': vm_states.ERROR, 'task_state': None}
for instance in instances_by_uuid.values():
inst_mapping = None
try:
# We don't need the cell0-targeted context here because the
# instance mapping is in the API DB.
inst_mapping = \
objects.InstanceMapping.get_by_instance_uuid(
context, instance.uuid)
except exception.InstanceMappingNotFound:
# The API created the instance mapping record so it should
# definitely be here. Log an error but continue to create the
# instance in the cell0 database.
LOG.error('While burying instance in cell0, no instance '
'mapping was found.', instance=instance)
# Perform a final sanity check that the instance is not mapped
# to some other cell already because of maybe some crazy
# clustered message queue weirdness.
if inst_mapping and inst_mapping.cell_mapping is not None:
LOG.error('When attempting to bury instance in cell0, the '
'instance is already mapped to cell %s. Ignoring '
'bury in cell0 attempt.',
inst_mapping.cell_mapping.identity,
instance=instance)
continue
with obj_target_cell(instance, cell0) as cctxt:
instance.create()
if inst_mapping:
inst_mapping.cell_mapping = cell0
inst_mapping.save()
# Record an instance action with a failed event.
self._create_instance_action_for_cell0(
@@ -1328,16 +1357,6 @@ class ComputeTaskManager(base.Base):
self._set_vm_state_and_notify(
cctxt, instance.uuid, 'build_instances', updates,
exc, request_spec)
try:
# We don't need the cell0-targeted context here because the
# instance mapping is in the API DB.
inst_mapping = \
objects.InstanceMapping.get_by_instance_uuid(
context, instance.uuid)
inst_mapping.cell_mapping = cell0
inst_mapping.save()
except exception.InstanceMappingNotFound:
pass
for build_request in build_requests:
try:
@@ -1506,13 +1525,8 @@ class ComputeTaskManager(base.Base):
instance.tags = instance_tags if instance_tags \
else objects.TagList()
# Update mapping for instance. Normally this check is guarded by
# a try/except but if we're here we know that a newer nova-api
# handled the build process and would have created the mapping
inst_mapping = objects.InstanceMapping.get_by_instance_uuid(
context, instance.uuid)
inst_mapping.cell_mapping = cell
inst_mapping.save()
# Update mapping for instance.
self._map_instance_to_cell(context, instance, cell)
if not self._delete_build_request(
context, build_request, instance, cell, instance_bdms,
@@ -1540,6 +1554,37 @@ class ComputeTaskManager(base.Base):
host=host.service_host, node=host.nodename,
limits=host.limits, host_list=host_list)
@staticmethod
def _map_instance_to_cell(context, instance, cell):
"""Update the instance mapping to point at the given cell.
During initial scheduling once a host and cell is selected in which
to build the instance this method is used to update the instance
mapping to point at that cell.
:param context: nova auth RequestContext
:param instance: Instance object being built
:param cell: CellMapping representing the cell in which the instance
was created and is being built.
:returns: InstanceMapping object that was updated.
"""
inst_mapping = objects.InstanceMapping.get_by_instance_uuid(
context, instance.uuid)
# Perform a final sanity check that the instance is not mapped
# to some other cell already because of maybe some crazy
# clustered message queue weirdness.
if inst_mapping.cell_mapping is not None:
LOG.error('During scheduling instance is already mapped to '
'another cell: %s. This should not happen and is an '
'indication of bigger problems. If you see this you '
'should report it to the nova team. Overwriting '
'the mapping to point at cell %s.',
inst_mapping.cell_mapping.identity, cell.identity,
instance=instance)
inst_mapping.cell_mapping = cell
inst_mapping.save()
return inst_mapping
def _cleanup_build_artifacts(self, context, exc, instances, build_requests,
request_specs, block_device_mappings, tags,
cell_mapping_cache):


@@ -2281,6 +2281,27 @@ class ConductorTaskTestCase(_BaseTaskTestCase, test_compute.BaseTestCase):
self.params['request_specs'][0].requested_resources = []
self._do_schedule_and_build_instances_test(self.params)
def test_map_instance_to_cell_already_mapped(self):
"""Tests a scenario where an instance is already mapped to a cell
during scheduling.
"""
build_request = self.params['build_requests'][0]
instance = build_request.get_new_instance(self.ctxt)
# Simulate MQ split brain craziness by updating the instance mapping
# to point at cell0.
inst_mapping = objects.InstanceMapping.get_by_instance_uuid(
self.ctxt, instance.uuid)
inst_mapping.cell_mapping = self.cell_mappings['cell0']
inst_mapping.save()
cell1 = self.cell_mappings['cell1']
inst_mapping = self.conductor._map_instance_to_cell(
self.ctxt, instance, cell1)
# Assert that the instance mapping was updated to point at cell1 but
# also that an error was logged.
self.assertEqual(cell1.uuid, inst_mapping.cell_mapping.uuid)
self.assertIn('During scheduling instance is already mapped to '
'another cell', self.stdlog.logger.output)
@mock.patch('nova.objects.InstanceMapping.get_by_instance_uuid')
def test_cleanup_build_artifacts(self, inst_map_get):
"""Simple test to ensure the order of operations in the cleanup method
@@ -2378,6 +2399,17 @@ class ConductorTaskTestCase(_BaseTaskTestCase, test_compute.BaseTestCase):
build_requests = objects.BuildRequestList.get_all(self.ctxt)
instances = objects.InstanceList.get_all(self.ctxt)
# Verify instance mappings.
inst_mappings = objects.InstanceMappingList.get_by_cell_id(
self.ctxt, self.cell_mappings['cell0'].id)
# bare_br is the only instance that has a mapping from setUp.
self.assertEqual(1, len(inst_mappings))
# Since we did not setup instance mappings for the other fake build
# requests used in this test, we should see a message logged about
# there being no instance mappings.
self.assertIn('While burying instance in cell0, no instance mapping '
'was found', self.stdlog.logger.output)
self.assertEqual(0, len(build_requests))
self.assertEqual(4, len(instances))
inst_states = {inst.uuid: (inst.deleted, inst.vm_state)
@@ -2475,6 +2507,32 @@ class ConductorTaskTestCase(_BaseTaskTestCase, test_compute.BaseTestCase):
test.MatchType(context.RequestContext), inst.uuid,
self.params['tags'])
@mock.patch('nova.objects.Instance.create')
def test_bury_in_cell0_already_mapped(self, mock_inst_create):
"""Tests a scenario where the instance mapping is already mapped to a
cell when we attempt to bury the instance in cell0.
"""
build_request = self.params['build_requests'][0]
# Simulate MQ split brain craziness by updating the instance mapping
# to point at cell1.
inst_mapping = objects.InstanceMapping.get_by_instance_uuid(
self.ctxt, build_request.instance_uuid)
inst_mapping.cell_mapping = self.cell_mappings['cell1']
inst_mapping.save()
# Now attempt to bury the instance in cell0.
with mock.patch.object(inst_mapping, 'save') as mock_inst_map_save:
self.conductor._bury_in_cell0(
self.ctxt, self.params['request_specs'][0],
exc.NoValidHost(reason='idk'), build_requests=[build_request])
# We should have exited without creating the instance in cell0 nor
# should the instance mapping have been updated to point at cell0.
mock_inst_create.assert_not_called()
mock_inst_map_save.assert_not_called()
# And we should have logged an error.
self.assertIn('When attempting to bury instance in cell0, the '
'instance is already mapped to cell',
self.stdlog.logger.output)
def test_reset(self):
with mock.patch('nova.compute.rpcapi.ComputeAPI') as mock_rpc:
old_rpcapi = self.conductor_manager.compute_rpcapi