From c895d3e6bca562225d70e8f81255f38970f7fcda Mon Sep 17 00:00:00 2001
From: Matt Riedemann
Date: Fri, 20 Sep 2019 17:07:35 -0400
Subject: [PATCH] Sanity check instance mapping during scheduling

mnaser reported a weird case where an instance was found in both cell0
(deleted there) and in cell1 (not deleted there, but in error state from
a failed build). It's unclear how this could happen besides some weird
clustered rabbitmq issue where the schedule and build request to
conductor is processed twice for the same instance: one request picks a
host and tries to build, while the other fails during scheduling and is
buried in cell0.

To avoid a split-brain situation like this, we add a sanity check in
_bury_in_cell0 to make sure the instance mapping is not already pointing
at a cell when we go to update it to cell0. Similarly, a check is added
in the schedule_and_build_instances flow (the code is moved to a private
method to make it easier to test).

Worst case, these checks are unnecessary but harmless; best case, they
help avoid the fallout from split-brain clustered rabbit issues.

Closes-Bug: #1775934

Change-Id: I335113f0ec59516cb337d34b6fc9078ea202130f
(cherry picked from commit 5b552518e1abdc63fb33c633661e30e4b2fe775e)
(cherry picked from commit efc35b1c5293c7c6c85f8cf9fd9d8cd8de71d1d5)
---
 nova/conductor/manager.py                   | 79 ++++++++++++++++-----
 nova/tests/unit/conductor/test_conductor.py | 58 +++++++++++++++
 2 files changed, 120 insertions(+), 17 deletions(-)

diff --git a/nova/conductor/manager.py b/nova/conductor/manager.py
index 379057376c72..1153ca727ae0 100644
--- a/nova/conductor/manager.py
+++ b/nova/conductor/manager.py
@@ -1301,8 +1301,37 @@ class ComputeTaskManager(base.Base):
 
         updates = {'vm_state': vm_states.ERROR, 'task_state': None}
         for instance in instances_by_uuid.values():
+
+            inst_mapping = None
+            try:
+                # We don't need the cell0-targeted context here because the
+                # instance mapping is in the API DB.
+                inst_mapping = \
+                    objects.InstanceMapping.get_by_instance_uuid(
+                        context, instance.uuid)
+            except exception.InstanceMappingNotFound:
+                # The API created the instance mapping record so it should
+                # definitely be here. Log an error but continue to create the
+                # instance in the cell0 database.
+                LOG.error('While burying instance in cell0, no instance '
+                          'mapping was found.', instance=instance)
+
+            # Perform a final sanity check that the instance is not mapped
+            # to some other cell already because of maybe some crazy
+            # clustered message queue weirdness.
+            if inst_mapping and inst_mapping.cell_mapping is not None:
+                LOG.error('When attempting to bury instance in cell0, the '
+                          'instance is already mapped to cell %s. Ignoring '
+                          'bury in cell0 attempt.',
+                          inst_mapping.cell_mapping.identity,
+                          instance=instance)
+                continue
+
             with obj_target_cell(instance, cell0) as cctxt:
                 instance.create()
+                if inst_mapping:
+                    inst_mapping.cell_mapping = cell0
+                    inst_mapping.save()
 
                 # Record an instance action with a failed event.
                 self._create_instance_action_for_cell0(
@@ -1322,16 +1351,6 @@ class ComputeTaskManager(base.Base):
                 self._set_vm_state_and_notify(
                     cctxt, instance.uuid, 'build_instances', updates,
                     exc, request_spec)
-            try:
-                # We don't need the cell0-targeted context here because the
-                # instance mapping is in the API DB.
-                inst_mapping = \
-                    objects.InstanceMapping.get_by_instance_uuid(
-                        context, instance.uuid)
-                inst_mapping.cell_mapping = cell0
-                inst_mapping.save()
-            except exception.InstanceMappingNotFound:
-                pass
 
         for build_request in build_requests:
             try:
@@ -1542,13 +1561,8 @@ class ComputeTaskManager(base.Base):
             instance.tags = instance_tags if instance_tags \
                 else objects.TagList()
 
-            # Update mapping for instance. Normally this check is guarded by
-            # a try/except but if we're here we know that a newer nova-api
-            # handled the build process and would have created the mapping
-            inst_mapping = objects.InstanceMapping.get_by_instance_uuid(
-                context, instance.uuid)
-            inst_mapping.cell_mapping = cell
-            inst_mapping.save()
+            # Update mapping for instance.
+            self._map_instance_to_cell(context, instance, cell)
 
             if not self._delete_build_request(
                     context, build_request, instance, cell, instance_bdms,
@@ -1576,6 +1590,37 @@ class ComputeTaskManager(base.Base):
                     host=host.service_host, node=host.nodename,
                     limits=host.limits, host_list=host_list)
 
+    @staticmethod
+    def _map_instance_to_cell(context, instance, cell):
+        """Update the instance mapping to point at the given cell.
+
+        During initial scheduling once a host and cell is selected in which
+        to build the instance this method is used to update the instance
+        mapping to point at that cell.
+
+        :param context: nova auth RequestContext
+        :param instance: Instance object being built
+        :param cell: CellMapping representing the cell in which the instance
+            was created and is being built.
+        :returns: InstanceMapping object that was updated.
+        """
+        inst_mapping = objects.InstanceMapping.get_by_instance_uuid(
+            context, instance.uuid)
+        # Perform a final sanity check that the instance is not mapped
+        # to some other cell already because of maybe some crazy
+        # clustered message queue weirdness.
+        if inst_mapping.cell_mapping is not None:
+            LOG.error('During scheduling instance is already mapped to '
+                      'another cell: %s. This should not happen and is an '
+                      'indication of bigger problems. If you see this you '
+                      'should report it to the nova team. Overwriting '
+                      'the mapping to point at cell %s.',
+                      inst_mapping.cell_mapping.identity, cell.identity,
+                      instance=instance)
+        inst_mapping.cell_mapping = cell
+        inst_mapping.save()
+        return inst_mapping
+
     def _cleanup_build_artifacts(self, context, exc, instances, build_requests,
                                  request_specs, block_device_mappings, tags,
                                  cell_mapping_cache):
diff --git a/nova/tests/unit/conductor/test_conductor.py b/nova/tests/unit/conductor/test_conductor.py
index 418955f3a024..76d9662fe7c7 100644
--- a/nova/tests/unit/conductor/test_conductor.py
+++ b/nova/tests/unit/conductor/test_conductor.py
@@ -2270,6 +2270,27 @@ class ConductorTaskTestCase(_BaseTaskTestCase, test_compute.BaseTestCase):
         self.params['request_specs'][0].requested_resources = []
         self._do_schedule_and_build_instances_test(self.params)
 
+    def test_map_instance_to_cell_already_mapped(self):
+        """Tests a scenario where an instance is already mapped to a cell
+        during scheduling.
+        """
+        build_request = self.params['build_requests'][0]
+        instance = build_request.get_new_instance(self.ctxt)
+        # Simulate MQ split brain craziness by updating the instance mapping
+        # to point at cell0.
+        inst_mapping = objects.InstanceMapping.get_by_instance_uuid(
+            self.ctxt, instance.uuid)
+        inst_mapping.cell_mapping = self.cell_mappings['cell0']
+        inst_mapping.save()
+        cell1 = self.cell_mappings['cell1']
+        inst_mapping = self.conductor._map_instance_to_cell(
+            self.ctxt, instance, cell1)
+        # Assert that the instance mapping was updated to point at cell1 but
+        # also that an error was logged.
+        self.assertEqual(cell1.uuid, inst_mapping.cell_mapping.uuid)
+        self.assertIn('During scheduling instance is already mapped to '
+                      'another cell', self.stdlog.logger.output)
+
     @mock.patch('nova.objects.InstanceMapping.get_by_instance_uuid')
     def test_cleanup_build_artifacts(self, inst_map_get):
         """Simple test to ensure the order of operations in the cleanup method
@@ -2367,6 +2388,17 @@ class ConductorTaskTestCase(_BaseTaskTestCase, test_compute.BaseTestCase):
 
         build_requests = objects.BuildRequestList.get_all(self.ctxt)
         instances = objects.InstanceList.get_all(self.ctxt)
+
+        # Verify instance mappings.
+        inst_mappings = objects.InstanceMappingList.get_by_cell_id(
+            self.ctxt, self.cell_mappings['cell0'].id)
+        # bare_br is the only instance that has a mapping from setUp.
+        self.assertEqual(1, len(inst_mappings))
+        # Since we did not setup instance mappings for the other fake build
+        # requests used in this test, we should see a message logged about
+        # there being no instance mappings.
+        self.assertIn('While burying instance in cell0, no instance mapping '
+                      'was found', self.stdlog.logger.output)
         self.assertEqual(0, len(build_requests))
         self.assertEqual(4, len(instances))
         inst_states = {inst.uuid: (inst.deleted, inst.vm_state)
@@ -2464,6 +2496,32 @@ class ConductorTaskTestCase(_BaseTaskTestCase, test_compute.BaseTestCase):
             test.MatchType(context.RequestContext), inst.uuid,
             self.params['tags'])
 
+    @mock.patch('nova.objects.Instance.create')
+    def test_bury_in_cell0_already_mapped(self, mock_inst_create):
+        """Tests a scenario where the instance mapping is already mapped to a
+        cell when we attempt to bury the instance in cell0.
+        """
+        build_request = self.params['build_requests'][0]
+        # Simulate MQ split brain craziness by updating the instance mapping
+        # to point at cell1.
+        inst_mapping = objects.InstanceMapping.get_by_instance_uuid(
+            self.ctxt, build_request.instance_uuid)
+        inst_mapping.cell_mapping = self.cell_mappings['cell1']
+        inst_mapping.save()
+        # Now attempt to bury the instance in cell0.
+        with mock.patch.object(inst_mapping, 'save') as mock_inst_map_save:
+            self.conductor._bury_in_cell0(
+                self.ctxt, self.params['request_specs'][0],
+                exc.NoValidHost(reason='idk'), build_requests=[build_request])
+        # We should have exited without creating the instance in cell0 nor
+        # should the instance mapping have been updated to point at cell0.
+        mock_inst_create.assert_not_called()
+        mock_inst_map_save.assert_not_called()
+        # And we should have logged an error.
+        self.assertIn('When attempting to bury instance in cell0, the '
+                      'instance is already mapped to cell',
+                      self.stdlog.logger.output)
+
     def test_reset(self):
         with mock.patch('nova.compute.rpcapi.ComputeAPI') as mock_rpc:
             old_rpcapi = self.conductor_manager.compute_rpcapi
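
Reviewer note, not part of the patch to apply: both guards added above share the same
shape (inspect the existing instance mapping before acting on it). The sketch below is a
minimal, self-contained illustration of that decision logic; CellMapping, InstanceMapping
and the two helper functions are hypothetical stand-ins for the nova objects and for
_bury_in_cell0/_map_instance_to_cell, not code from this change.

# Simplified, hypothetical stand-ins for nova's InstanceMapping/CellMapping
# objects; illustrative only, not part of the patch above.
import logging
from dataclasses import dataclass
from typing import Optional

LOG = logging.getLogger(__name__)


@dataclass
class CellMapping:
    identity: str  # e.g. 'cell0' or 'cell1'


@dataclass
class InstanceMapping:
    cell_mapping: Optional[CellMapping] = None


def safe_to_bury_in_cell0(inst_mapping: Optional[InstanceMapping]) -> bool:
    """Mirror the guard added to _bury_in_cell0.

    If the mapping already points at a cell, another worker has claimed the
    instance, so the bury is skipped instead of creating a duplicate record
    in cell0.
    """
    if inst_mapping and inst_mapping.cell_mapping is not None:
        LOG.error('Instance already mapped to cell %s; ignoring bury.',
                  inst_mapping.cell_mapping.identity)
        return False
    return True


def map_instance_to_cell(inst_mapping: InstanceMapping,
                         cell: CellMapping) -> InstanceMapping:
    """Mirror the guard added in _map_instance_to_cell.

    Scheduling wins: an existing mapping is logged loudly as a symptom of
    duplicated conductor work, then overwritten with the chosen cell.
    """
    if inst_mapping.cell_mapping is not None:
        LOG.error('Instance already mapped to cell %s; overwriting with %s.',
                  inst_mapping.cell_mapping.identity, cell.identity)
    inst_mapping.cell_mapping = cell
    return inst_mapping


if __name__ == '__main__':
    logging.basicConfig(level=logging.ERROR)
    # Split-brain simulation: the mapping already points at cell1 when a
    # second worker tries to bury the instance in cell0.
    mapping = InstanceMapping(cell_mapping=CellMapping('cell1'))
    assert not safe_to_bury_in_cell0(mapping)
    # During scheduling the conflict is logged and the mapping overwritten.
    map_instance_to_cell(mapping, CellMapping('cell0'))
    assert mapping.cell_mapping.identity == 'cell0'

Running the module exercises the split-brain case directly: the bury is skipped, while the
scheduling path logs the conflict and overwrites the mapping, mirroring what the new unit
tests in the patch assert.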