diff --git a/nova/compute/manager.py b/nova/compute/manager.py
index ee7de74a739a..86523e006935 100644
--- a/nova/compute/manager.py
+++ b/nova/compute/manager.py
@@ -645,6 +645,10 @@ class ComputeManager(manager.Manager):
         Then allocations are removed from Placement for every instance that is
         evacuated from this host regardless if the instance is reported by the
         hypervisor or not.
+
+        :param context: The request context
+        :return: A dict keyed by instance uuid, mapped to Migration objects
+            for instances that were migrated away from this host
         """
         filters = {
             'source_compute': self.host,
@@ -665,7 +669,7 @@ class ComputeManager(manager.Manager):
         evacuations = objects.MigrationList.get_by_filters(context,
                                                            filters)
         if not evacuations:
-            return
+            return {}
         evacuations = {mig.instance_uuid: mig for mig in evacuations}
 
         # The instances might be deleted in which case we need to avoid
@@ -849,9 +853,8 @@ class ComputeManager(manager.Manager):
             # instance has already been scheduled to this particular host.
             LOG.debug("Instance failed to spawn correctly, "
                       "setting to ERROR state", instance=instance)
-            instance.task_state = None
-            instance.vm_state = vm_states.ERROR
-            instance.save()
+            self._set_instance_obj_error_state(
+                context, instance, clean_task_state=True)
             return
 
         if (instance.vm_state in [vm_states.ACTIVE, vm_states.STOPPED] and
@@ -862,9 +865,8 @@ class ComputeManager(manager.Manager):
             # spawned so set to ERROR state. This is consistent to BUILD
             LOG.debug("Instance failed to rebuild correctly, "
                       "setting to ERROR state", instance=instance)
-            instance.task_state = None
-            instance.vm_state = vm_states.ERROR
-            instance.save()
+            self._set_instance_obj_error_state(
+                context, instance, clean_task_state=True)
             return
 
         if (instance.vm_state != vm_states.ERROR and
@@ -1241,10 +1243,24 @@ class ComputeManager(manager.Manager):
 
             # Initialise instances on the host that are not evacuating
             for instance in instances:
-                if (not evacuated_instances or
-                        instance.uuid not in evacuated_instances):
+                if instance.uuid not in evacuated_instances:
                     self._init_instance(context, instance)
 
+            # NOTE(gibi): collect all the instance uuids that were already
+            # handled above, either by _init_instance or by
+            # _destroy_evacuated_instances. This way we can limit the scope
+            # of the _error_out_instances_whose_build_was_interrupted call
+            # to look only for instances that have allocations on this node
+            # and were not handled by the above calls.
+            already_handled = {instance.uuid for instance in instances}.union(
+                evacuated_instances)
+            # NOTE(gibi): If the ironic or vcenter virt driver's slow start
+            # time becomes problematic here then we should consider adding
+            # a config option or a driver flag telling us whether to run
+            # this in a background thread on startup.
+            self._error_out_instances_whose_build_was_interrupted(
+                context, already_handled)
+
         finally:
             if CONF.defer_iptables_apply:
                 self.driver.filter_defer_apply_off()
@@ -1257,6 +1273,85 @@ class ComputeManager(manager.Manager):
             # _sync_scheduler_instance_info periodic task will.
             self._update_scheduler_instance_info(context, instances)
 
+    def _error_out_instances_whose_build_was_interrupted(
+            self, context, already_handled_instances):
+        """If there are instances in BUILDING state that are not assigned to
+        this host but have allocations in placement towards this compute
+        node, then the nova-compute service was restarted while those
+        instances waited for the resource claim to finish and for
+        _set_instance_host_and_node() to update the instance.host field.
+        We need to push them to ERROR state here to prevent keeping them in
+        BUILDING state forever.
+
+        :param context: The request context
+        :param already_handled_instances: The set of instance UUIDs that the
+            host initialization process already handled in some way.
+        """
+
+        # Strategy:
+        # 1) Get the allocations from placement for our compute node(s)
+        # 2) Remove the already handled instances from the consumer list;
+        #    they are either already initialized or need to be skipped.
+        # 3) Check which remaining consumer is an instance in BUILDING state
+        #    and push it to ERROR state.
+
+        LOG.info(
+            "Looking for unclaimed instances stuck in BUILDING status for "
+            "nodes managed by this host")
+
+        try:
+            node_names = self.driver.get_available_nodes()
+        except exception.VirtDriverNotReady:
+            LOG.warning(
+                "Virt driver is not ready. Therefore unable to error out any "
+                "instances stuck in BUILDING state on this node. If this is "
+                "the first time this service is starting on this host, then "
+                "you can ignore this warning.")
+            return
+
+        for node_name in node_names:
+            try:
+                cn_uuid = objects.ComputeNode.get_by_host_and_nodename(
+                    context, self.host, node_name).uuid
+            except exception.ComputeHostNotFound:
+                LOG.warning(
+                    "Compute node %s not found in the database and therefore "
+                    "unable to error out any instances stuck in BUILDING "
+                    "state on this node. If this is the first time this "
+                    "service is starting on this host, then you can ignore "
+                    "this warning.", node_name)
+                continue
+
+            f = self.reportclient.get_allocations_for_resource_provider
+            allocations = f(context, cn_uuid)
+            if not allocations:
+                LOG.error(
+                    "Could not retrieve compute node resource provider %s and "
+                    "therefore unable to error out any instances stuck in "
+                    "BUILDING state.", cn_uuid)
+                continue
+
+            not_handled_consumers = (set(allocations) -
+                                     already_handled_instances)
+
+            if not not_handled_consumers:
+                continue
+
+            filters = {
+                'vm_state': vm_states.BUILDING,
+                'uuid': not_handled_consumers
+            }
+
+            instances = objects.InstanceList.get_by_filters(
+                context, filters, expected_attrs=[])
+
+            for instance in instances:
+                LOG.debug(
+                    "Instance spawn was interrupted before instance_claim, "
+                    "setting instance to ERROR state", instance=instance)
+                self._set_instance_obj_error_state(
+                    context, instance, clean_task_state=True)
+
     def cleanup_host(self):
         self.driver.register_event_listener(None)
         self.instance_events.cancel_all_events()
diff --git a/nova/tests/functional/compute/test_init_host.py b/nova/tests/functional/compute/test_init_host.py
index 2ce71df8a5d6..14b0a1f9a052 100644
--- a/nova/tests/functional/compute/test_init_host.py
+++ b/nova/tests/functional/compute/test_init_host.py
@@ -175,21 +175,14 @@ class TestComputeRestartInstanceStuckInBuild(
         # instance_claim call we can wait for in the test
         fake_notifier.wait_for_versioned_notifications(
            'instance.create.start')
-        self.restart_compute_service(self.compute1)
 
-        # This is bug 1833581 as the server remains in BUILD state after the
-        # compute restart.
-        self._wait_for_state_change(self.admin_api, server, 'BUILD')
-
-        # Not even the periodic task push this server to ERROR because the
-        # server host is still None since the instance_claim didn't set it.
-        self.flags(instance_build_timeout=1)
-        self.compute1.manager._check_instance_build_time(
-            nova_context.get_admin_context())
-        server = self.admin_api.get_server(server['id'])
-        self.assertEqual('BUILD', server['status'])
-        self.assertIsNone(server['OS-EXT-SRV-ATTR:host'])
+        with mock.patch('nova.compute.manager.LOG.debug') as mock_log:
+            self.restart_compute_service(self.compute1)
 
         # We expect that the instance is pushed to ERROR state during the
         # compute restart.
-        # self._wait_for_state_change(self.admin_api, server, 'ERROR')
+        self._wait_for_state_change(self.admin_api, server, 'ERROR')
+        mock_log.assert_called_with(
+            'Instance spawn was interrupted before instance_claim, setting '
+            'instance to ERROR state',
+            instance=mock.ANY)
diff --git a/nova/tests/unit/compute/test_compute_mgr.py b/nova/tests/unit/compute/test_compute_mgr.py
index 679a66a1703a..85e89a530038 100644
--- a/nova/tests/unit/compute/test_compute_mgr.py
+++ b/nova/tests/unit/compute/test_compute_mgr.py
@@ -686,6 +686,8 @@ class ComputeManagerUnitTestCase(test.NoDBTestCase):
             return instance_obj._make_instance_list(
                 self.context, objects.InstanceList(), db_list, None)
 
+        @mock.patch.object(manager.ComputeManager,
+                           '_error_out_instances_whose_build_was_interrupted')
         @mock.patch.object(fake_driver.FakeDriver, 'init_host')
         @mock.patch.object(fake_driver.FakeDriver, 'filter_defer_apply_on')
         @mock.patch.object(fake_driver.FakeDriver, 'filter_defer_apply_off')
@@ -698,7 +700,7 @@ class ComputeManagerUnitTestCase(test.NoDBTestCase):
         def _do_mock_calls(mock_update_scheduler, mock_inst_init,
                            mock_destroy, mock_admin_ctxt, mock_host_get,
                            mock_filter_off, mock_filter_on, mock_init_host,
-                           defer_iptables_apply):
+                           mock_error_interrupted, defer_iptables_apply):
             mock_admin_ctxt.return_value = self.context
             inst_list = _make_instance_list(startup_instances)
             mock_host_get.return_value = inst_list
@@ -722,6 +724,9 @@ class ComputeManagerUnitTestCase(test.NoDBTestCase):
             mock_update_scheduler.assert_called_once_with(
                 self.context, inst_list)
 
+            mock_error_interrupted.assert_called_once_with(
+                self.context, {inst.uuid for inst in inst_list})
+
         # Test with defer_iptables_apply
         self.flags(defer_iptables_apply=True)
         _do_mock_calls(defer_iptables_apply=True)
@@ -730,6 +735,8 @@ class ComputeManagerUnitTestCase(test.NoDBTestCase):
         self.flags(defer_iptables_apply=False)
         _do_mock_calls(defer_iptables_apply=False)
 
+    @mock.patch('nova.compute.manager.ComputeManager.'
+                '_error_out_instances_whose_build_was_interrupted')
     @mock.patch('nova.objects.InstanceList.get_by_host',
                 return_value=objects.InstanceList())
     @mock.patch('nova.compute.manager.ComputeManager.'
@@ -739,13 +746,15 @@ class ComputeManagerUnitTestCase(test.NoDBTestCase):
     @mock.patch('nova.compute.manager.ComputeManager.'
                 '_update_scheduler_instance_info', mock.NonCallableMock())
     def test_init_host_no_instances(self, mock_destroy_evac_instances,
-                                    mock_get_by_host):
+                                    mock_get_by_host, mock_error_interrupted):
         """Tests the case that init_host runs and there are no instances
         on this host yet (it's brand new). Uses NonCallableMock for the
         methods we assert should not be called.
""" self.compute.init_host() + mock_error_interrupted.assert_called_once_with(mock.ANY, set()) + @mock.patch('nova.objects.InstanceList') @mock.patch('nova.objects.MigrationList.get_by_filters') def test_cleanup_host(self, mock_miglist_get, mock_instance_list): @@ -802,6 +811,8 @@ class ComputeManagerUnitTestCase(test.NoDBTestCase): self.assertFalse(mock_register.called) @mock.patch('nova.context.RequestContext.elevated') + @mock.patch.object(manager.ComputeManager, + '_error_out_instances_whose_build_was_interrupted') @mock.patch('nova.objects.ComputeNode.get_by_host_and_nodename') @mock.patch('nova.scheduler.utils.resources_from_flavor') @mock.patch.object(manager.ComputeManager, '_get_instances_on_driver') @@ -816,7 +827,7 @@ class ComputeManagerUnitTestCase(test.NoDBTestCase): def test_init_host_with_evacuated_instance(self, mock_save, mock_mig_get, mock_temp_mut, mock_init_host, mock_destroy, mock_host_get, mock_admin_ctxt, mock_init_virt, mock_get_inst, mock_resources, - mock_get_node, mock_elevated): + mock_get_node, mock_error_interrupted, mock_elevated): our_host = self.compute.host not_our_host = 'not-' + our_host @@ -866,6 +877,11 @@ class ComputeManagerUnitTestCase(test.NoDBTestCase): mock.ANY, mock.ANY, mock.ANY) mock_save.assert_called_once_with() + mock_error_interrupted.assert_called_once_with( + self.context, {deleted_instance.uuid}) + + @mock.patch.object(manager.ComputeManager, + '_error_out_instances_whose_build_was_interrupted') @mock.patch.object(context, 'get_admin_context') @mock.patch.object(objects.InstanceList, 'get_by_host') @mock.patch.object(fake_driver.FakeDriver, 'init_host') @@ -874,7 +890,7 @@ class ComputeManagerUnitTestCase(test.NoDBTestCase): '_destroy_evacuated_instances') def test_init_host_with_in_progress_evacuations(self, mock_destroy_evac, mock_init_instance, mock_init_host, mock_host_get, - mock_admin_ctxt): + mock_admin_ctxt, mock_error_interrupted): """Assert that init_instance is not called for instances that are evacuating from the host during init_host. """ @@ -895,6 +911,145 @@ class ComputeManagerUnitTestCase(test.NoDBTestCase): mock_init_instance.assert_called_once_with( self.context, active_instance) + mock_error_interrupted.assert_called_once_with( + self.context, {active_instance.uuid, evacuating_instance.uuid}) + + @mock.patch.object(objects.Instance, 'save') + @mock.patch.object(objects.InstanceList, 'get_by_filters') + @mock.patch('nova.scheduler.client.report.SchedulerReportClient.' + 'get_allocations_for_resource_provider') + @mock.patch.object(objects.ComputeNode, 'get_by_host_and_nodename') + @mock.patch.object(fake_driver.FakeDriver, 'get_available_nodes') + def test_init_host_with_interrupted_instance_build( + self, mock_get_nodes, mock_get_by_host_and_node, + mock_get_allocations, mock_get_instances, mock_instance_save): + + mock_get_nodes.return_value = ['fake-node'] + mock_get_by_host_and_node.return_value = objects.ComputeNode( + host=self.compute.host, uuid=uuids.cn_uuid) + + active_instance = fake_instance.fake_instance_obj( + self.context, host=self.compute.host, uuid=uuids.active_instance) + evacuating_instance = fake_instance.fake_instance_obj( + self.context, host=self.compute.host, uuid=uuids.evac_instance) + interrupted_instance = fake_instance.fake_instance_obj( + self.context, host=None, uuid=uuids.interrupted_instance, + vm_state=vm_states.BUILDING) + + # we have 3 different instances. 
+        # We need consumers for each instance in placement and an extra
+        # consumer that is not an instance.
+        allocations = {
+            uuids.active_instance: "fake-resources-active",
+            uuids.evac_instance: "fake-resources-evacuating",
+            uuids.interrupted_instance: "fake-resources-interrupted",
+            uuids.not_an_instance: "fake-resources-not-an-instance",
+        }
+        mock_get_allocations.return_value = allocations
+
+        # get_by_filters is called with a uuid filter containing
+        # interrupted_instance and not_an_instance, but it will only return
+        # interrupted_instance as not_an_instance does not match any
+        # instance in the db.
+        mock_get_instances.return_value = objects.InstanceList(
+            self.context, objects=[interrupted_instance])
+
+        # interrupted_instance is not in the set passed in because it is not
+        # assigned to the compute and therefore not processed by init_host
+        # and _init_instance
+        self.compute._error_out_instances_whose_build_was_interrupted(
+            self.context,
+            {inst.uuid for inst in [active_instance, evacuating_instance]})
+
+        mock_get_by_host_and_node.assert_called_once_with(
+            self.context, self.compute.host, 'fake-node')
+        mock_get_allocations.assert_called_once_with(
+            self.context, uuids.cn_uuid)
+
+        mock_get_instances.assert_called_once_with(
+            self.context,
+            {'vm_state': 'building',
+             'uuid': {uuids.interrupted_instance, uuids.not_an_instance}
+             },
+            expected_attrs=[])
+
+        # save is expected to be called only once, for interrupted_instance
+        mock_instance_save.assert_called_once_with()
+        self.assertEqual(vm_states.ERROR, interrupted_instance.vm_state)
+
+    @mock.patch.object(manager.LOG, 'warning')
+    @mock.patch.object(
+        fake_driver.FakeDriver, 'get_available_nodes',
+        side_effect=exception.VirtDriverNotReady)
+    def test_init_host_with_interrupted_instance_build_driver_not_ready(
+            self, mock_get_nodes, mock_log_warning):
+        self.compute._error_out_instances_whose_build_was_interrupted(
+            self.context, set())
+
+        mock_log_warning.assert_called_once_with(
+            "Virt driver is not ready. Therefore unable to error out any "
+            "instances stuck in BUILDING state on this node. If this is the "
+            "first time this service is starting on this host, then you can "
+            "ignore this warning.")
+
+    @mock.patch('nova.scheduler.client.report.SchedulerReportClient.'
+                'get_allocations_for_resource_provider')
+    @mock.patch.object(objects.ComputeNode, 'get_by_host_and_nodename')
+    @mock.patch.object(fake_driver.FakeDriver, 'get_available_nodes')
+    def test_init_host_with_interrupted_instance_build_compute_node_not_found(
+            self, mock_get_nodes, mock_get_by_host_and_node,
+            mock_get_allocations):
+
+        mock_get_nodes.return_value = ['fake-node1', 'fake-node2']
+        mock_get_by_host_and_node.side_effect = [
+            exception.ComputeHostNotFound(host='fake-node1'),
+            objects.ComputeNode(host=self.compute.host, uuid=uuids.cn_uuid)]
+
+        self.compute._error_out_instances_whose_build_was_interrupted(
+            self.context, set())
+
+        # check that nova skips the node that is not found in the db and
+        # continues with the next one
+        mock_get_by_host_and_node.assert_has_calls(
+            [
+                mock.call(self.context, self.compute.host, 'fake-node1'),
+                mock.call(self.context, self.compute.host, 'fake-node2'),
+            ]
+        )
+
+        # placement is only queried for the existing compute node
+        mock_get_allocations.assert_called_once_with(
+            self.context, uuids.cn_uuid)
+
+    @mock.patch('nova.scheduler.client.report.SchedulerReportClient.'
+                'get_allocations_for_resource_provider')
+    @mock.patch.object(objects.ComputeNode, 'get_by_host_and_nodename')
+    @mock.patch.object(fake_driver.FakeDriver, 'get_available_nodes')
+    def test_init_host_with_interrupted_instance_build_compute_rp_not_found(
+            self, mock_get_nodes, mock_get_by_host_and_node,
+            mock_get_allocations):
+
+        mock_get_nodes.return_value = ['fake-node1', 'fake-node2']
+        mock_get_by_host_and_node.side_effect = [
+            objects.ComputeNode(host=self.compute.host, uuid=uuids.cn1_uuid),
+            objects.ComputeNode(host=self.compute.host, uuid=uuids.cn2_uuid),
+        ]
+
+        mock_get_allocations.side_effect = [
+            {},
+            {uuids.active_instance: "fake-resources"}
+        ]
+
+        self.compute._error_out_instances_whose_build_was_interrupted(
+            self.context, {uuids.active_instance})
+
+        # check that nova skips the node that is not found in placement and
+        # continues with the next one
+        mock_get_allocations.assert_has_calls(
+            [
+                mock.call(self.context, uuids.cn1_uuid),
+                mock.call(self.context, uuids.cn2_uuid),
+            ]
+        )
+
     def test_init_instance_with_binding_failed_vif_type(self):
         # this instance will plug a 'binding_failed' vif
@@ -3781,6 +3936,14 @@ class ComputeManagerUnitTestCase(test.NoDBTestCase):
         self._do_test_set_admin_password_driver_error(
             exc, vm_states.ACTIVE, None, expected_exception)
 
+    def test_destroy_evacuated_instances_no_migrations(self):
+        with mock.patch(
+                'nova.objects.MigrationList.get_by_filters') as migration_list:
+            migration_list.return_value = []
+
+            result = self.compute._destroy_evacuated_instances(self.context)
+            self.assertEqual({}, result)
+
     def test_destroy_evacuated_instances(self):
         our_host = self.compute.host
         flavor = objects.Flavor()