Merge "Error out interrupted builds" into stable/rocky

commit a5269012a3

@@ -645,6 +645,10 @@ class ComputeManager(manager.Manager):
         Then allocations are removed from Placement for every instance that is
         evacuated from this host regardless if the instance is reported by the
         hypervisor or not.
+
+        :param context: The request context
+        :return: A dict keyed by instance uuid mapped to Migration objects
+            for instances that were migrated away from this host
         """
         filters = {
             'source_compute': self.host,
@@ -665,7 +669,7 @@ class ComputeManager(manager.Manager):
         evacuations = objects.MigrationList.get_by_filters(context,
                                                            filters)
         if not evacuations:
-            return
+            return {}
         evacuations = {mig.instance_uuid: mig for mig in evacuations}
 
         # The instances might be deleted in which case we need to avoid

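Note (illustration, not part of the diff): the change from a bare return to
return {} is what lets init_host treat the result uniformly. A minimal sketch
of the contract, with placeholder UUID strings:

# The caller unions the evacuated-instance UUIDs (dict keys) with the set
# of instance UUIDs it initialised itself (see the -1241 hunk below).
evacuated = {}  # the new "nothing is evacuating" return value

handled = {'uuid-1', 'uuid-2'}.union(evacuated)
assert handled == {'uuid-1', 'uuid-2'}

# With the old bare return (i.e. None) the same union would raise:
#   TypeError: 'NoneType' object is not iterable
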
@@ -849,9 +853,8 @@ class ComputeManager(manager.Manager):
             # instance has already been scheduled to this particular host.
             LOG.debug("Instance failed to spawn correctly, "
                       "setting to ERROR state", instance=instance)
-            instance.task_state = None
-            instance.vm_state = vm_states.ERROR
-            instance.save()
+            self._set_instance_obj_error_state(
+                context, instance, clean_task_state=True)
             return
 
         if (instance.vm_state in [vm_states.ACTIVE, vm_states.STOPPED] and

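Note (illustration, not part of the diff): both this failed-spawn path and the
failed-rebuild path in the next hunk now delegate to
_set_instance_obj_error_state. A rough, self-contained sketch of the behavior
being consolidated, inferred from the three removed lines above rather than
copied from nova's actual helper; all names below are stand-ins:

# ERROR stands in for nova.compute.vm_states.ERROR; FakeInstance for a real
# Instance object that persists itself on save().
ERROR = 'error'

class FakeInstance(object):
    vm_state = 'building'
    task_state = 'spawning'

    def save(self):
        pass  # the real object writes the change back to the database

def set_instance_obj_error_state(context, instance, clean_task_state=False):
    # Mirror of what the removed lines did inline: flip the vm_state to
    # ERROR, optionally clear the task_state, and persist.
    instance.vm_state = ERROR
    if clean_task_state:
        instance.task_state = None
    instance.save()

inst = FakeInstance()
set_instance_obj_error_state(None, inst, clean_task_state=True)
assert inst.vm_state == ERROR and inst.task_state is None
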
@@ -862,9 +865,8 @@ class ComputeManager(manager.Manager):
             # spawned so set to ERROR state. This is consistent to BUILD
             LOG.debug("Instance failed to rebuild correctly, "
                       "setting to ERROR state", instance=instance)
-            instance.task_state = None
-            instance.vm_state = vm_states.ERROR
-            instance.save()
+            self._set_instance_obj_error_state(
+                context, instance, clean_task_state=True)
             return
 
         if (instance.vm_state != vm_states.ERROR and
@@ -1241,10 +1243,24 @@ class ComputeManager(manager.Manager):
 
             # Initialise instances on the host that are not evacuating
             for instance in instances:
-                if (not evacuated_instances or
-                        instance.uuid not in evacuated_instances):
+                if instance.uuid not in evacuated_instances:
                     self._init_instance(context, instance)
 
+            # NOTE(gibi): collect all the instance uuids that were already
+            # handled above in some way, either by init_instance or by
+            # _destroy_evacuated_instances. This way we can limit the scope
+            # of the _error_out_instances_whose_build_was_interrupted call to
+            # look only for instances that have allocations on this node and
+            # were not handled by the above calls.
+            already_handled = {instance.uuid for instance in instances}.union(
+                evacuated_instances)
+            # NOTE(gibi): If ironic and vcenter virt driver slow start time
+            # becomes problematic here then we should consider adding a config
+            # option or a driver flag to tell us if we should thread this out
+            # in the background on startup
+            self._error_out_instances_whose_build_was_interrupted(
+                context, already_handled)
+
         finally:
             if CONF.defer_iptables_apply:
                 self.driver.filter_defer_apply_off()

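Note (illustration, not part of the diff): the simplified condition in this
hunk is safe because a membership test against an empty mapping is already
False, so the dropped "not evacuated_instances or" guard added nothing once
_destroy_evacuated_instances was guaranteed to return a dict rather than None
(the -665 hunk above). A quick self-contained check with placeholder values:

# evacuated_instances is the dict returned by _destroy_evacuated_instances,
# keyed by instance uuid; {} means nothing is evacuating from this host.
evacuated_instances = {}
assert 'some-uuid' not in evacuated_instances  # init_instance would run

evacuated_instances = {'evac-uuid': 'stand-in-for-a-Migration-object'}
assert 'evac-uuid' in evacuated_instances      # evacuating: skipped
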
@@ -1257,6 +1273,85 @@ class ComputeManager(manager.Manager):
             # _sync_scheduler_instance_info periodic task will.
             self._update_scheduler_instance_info(context, instances)
 
+    def _error_out_instances_whose_build_was_interrupted(
+            self, context, already_handled_instances):
+        """If there are instances in BUILDING state that are not
+        assigned to this host but have allocations in placement towards
+        this compute, that means the nova-compute service was
+        restarted while those instances waited for the resource claim
+        to finish and for _set_instance_host_and_node() to update the
+        instance.host field. We need to push them to ERROR state here to
+        prevent keeping them in BUILDING state forever.
+
+        :param context: The request context
+        :param already_handled_instances: The set of instance UUIDs that the
+            host initialization process already handled in some way.
+        """
+
+        # Strategy:
+        # 1) Get the allocations from placement for our compute node(s)
+        # 2) Remove the already handled instances from the consumer list;
+        #    they are either already initialized or need to be skipped.
+        # 3) Check which remaining consumer is an instance in BUILDING state
+        #    and push it to ERROR state.
+
+        LOG.info(
+            "Looking for unclaimed instances stuck in BUILDING status for "
+            "nodes managed by this host")
+
+        try:
+            node_names = self.driver.get_available_nodes()
+        except exception.VirtDriverNotReady:
+            LOG.warning(
+                "Virt driver is not ready. Therefore unable to error out any "
+                "instances stuck in BUILDING state on this node. If this is "
+                "the first time this service is starting on this host, then "
+                "you can ignore this warning.")
+            return
+
+        for node_name in node_names:
+            try:
+                cn_uuid = objects.ComputeNode.get_by_host_and_nodename(
+                    context, self.host, node_name).uuid
+            except exception.ComputeHostNotFound:
+                LOG.warning(
+                    "Compute node %s not found in the database and therefore "
+                    "unable to error out any instances stuck in BUILDING "
+                    "state on this node. If this is the first time this "
+                    "service is starting on this host, then you can ignore "
+                    "this warning.", node_name)
+                continue
+
+            f = self.reportclient.get_allocations_for_resource_provider
+            allocations = f(context, cn_uuid)
+            if not allocations:
+                LOG.error(
+                    "Could not retrieve compute node resource provider %s and "
+                    "therefore unable to error out any instances stuck in "
+                    "BUILDING state.", cn_uuid)
+                continue
+
+            not_handled_consumers = (set(allocations) -
+                                     already_handled_instances)
+
+            if not not_handled_consumers:
+                continue
+
+            filters = {
+                'vm_state': vm_states.BUILDING,
+                'uuid': not_handled_consumers
+            }
+
+            instances = objects.InstanceList.get_by_filters(
+                context, filters, expected_attrs=[])
+
+            for instance in instances:
+                LOG.debug(
+                    "Instance spawn was interrupted before instance_claim, "
+                    "setting instance to ERROR state", instance=instance)
+                self._set_instance_obj_error_state(
+                    context, instance, clean_task_state=True)
+
     def cleanup_host(self):
         self.driver.register_event_listener(None)
         self.instance_events.cancel_all_events()

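Note (illustration, not part of the diff): step 2 of the strategy above works
because iterating, or calling set() on, the allocations dict yields the
consumer UUIDs. A small sketch with made-up UUIDs; the nested 'resources'
payload shape is an assumption based on the placement API and is not something
this code inspects:

# get_allocations_for_resource_provider returns a dict keyed by consumer
# UUID; the values here are assumed placement-style payloads for illustration.
allocations = {
    'handled-uuid': {'resources': {'VCPU': 1, 'MEMORY_MB': 512}},
    'stuck-uuid': {'resources': {'VCPU': 2, 'MEMORY_MB': 2048}},
}
already_handled_instances = {'handled-uuid'}

# set(allocations) is just the set of consumer UUIDs (the dict keys), so
# the difference leaves only consumers init_host has not yet seen.
not_handled_consumers = set(allocations) - already_handled_instances
assert not_handled_consumers == {'stuck-uuid'}
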
@@ -175,21 +175,14 @@ class TestComputeRestartInstanceStuckInBuild(
         # instance_claim call we can wait for in the test
         fake_notifier.wait_for_versioned_notifications(
             'instance.create.start')
-        self.restart_compute_service(self.compute1)
-
-        # This is bug 1833581 as the server remains in BUILD state after the
-        # compute restart.
-        self._wait_for_state_change(self.admin_api, server, 'BUILD')
-
-        # Not even the periodic task push this server to ERROR because the
-        # server host is still None since the instance_claim didn't set it.
-        self.flags(instance_build_timeout=1)
-        self.compute1.manager._check_instance_build_time(
-            nova_context.get_admin_context())
-        server = self.admin_api.get_server(server['id'])
-        self.assertEqual('BUILD', server['status'])
-        self.assertIsNone(server['OS-EXT-SRV-ATTR:host'])
+        with mock.patch('nova.compute.manager.LOG.debug') as mock_log:
+            self.restart_compute_service(self.compute1)
 
         # We expect that the instance is pushed to ERROR state during the
         # compute restart.
-        # self._wait_for_state_change(self.admin_api, server, 'ERROR')
+        self._wait_for_state_change(self.admin_api, server, 'ERROR')
+        mock_log.assert_called_with(
+            'Instance spawn was interrupted before instance_claim, setting '
+            'instance to ERROR state',
+            instance=mock.ANY)

@@ -686,6 +686,8 @@ class ComputeManagerUnitTestCase(test.NoDBTestCase):
         return instance_obj._make_instance_list(
             self.context, objects.InstanceList(), db_list, None)
 
+    @mock.patch.object(manager.ComputeManager,
+                       '_error_out_instances_whose_build_was_interrupted')
     @mock.patch.object(fake_driver.FakeDriver, 'init_host')
     @mock.patch.object(fake_driver.FakeDriver, 'filter_defer_apply_on')
    @mock.patch.object(fake_driver.FakeDriver, 'filter_defer_apply_off')
@@ -698,7 +700,7 @@ class ComputeManagerUnitTestCase(test.NoDBTestCase):
         def _do_mock_calls(mock_update_scheduler, mock_inst_init,
                            mock_destroy, mock_admin_ctxt, mock_host_get,
                            mock_filter_off, mock_filter_on, mock_init_host,
-                           defer_iptables_apply):
+                           mock_error_interrupted, defer_iptables_apply):
             mock_admin_ctxt.return_value = self.context
             inst_list = _make_instance_list(startup_instances)
             mock_host_get.return_value = inst_list
@@ -722,6 +724,9 @@ class ComputeManagerUnitTestCase(test.NoDBTestCase):
             mock_update_scheduler.assert_called_once_with(
                 self.context, inst_list)
 
+            mock_error_interrupted.assert_called_once_with(
+                self.context, {inst.uuid for inst in inst_list})
+
         # Test with defer_iptables_apply
         self.flags(defer_iptables_apply=True)
         _do_mock_calls(defer_iptables_apply=True)
@@ -730,6 +735,8 @@ class ComputeManagerUnitTestCase(test.NoDBTestCase):
         self.flags(defer_iptables_apply=False)
         _do_mock_calls(defer_iptables_apply=False)
 
+    @mock.patch('nova.compute.manager.ComputeManager.'
+                '_error_out_instances_whose_build_was_interrupted')
     @mock.patch('nova.objects.InstanceList.get_by_host',
                 return_value=objects.InstanceList())
     @mock.patch('nova.compute.manager.ComputeManager.'
@@ -739,13 +746,15 @@ class ComputeManagerUnitTestCase(test.NoDBTestCase):
     @mock.patch('nova.compute.manager.ComputeManager.'
                 '_update_scheduler_instance_info', mock.NonCallableMock())
     def test_init_host_no_instances(self, mock_destroy_evac_instances,
-                                    mock_get_by_host):
+                                    mock_get_by_host, mock_error_interrupted):
         """Tests the case that init_host runs and there are no instances
         on this host yet (it's brand new). Uses NonCallableMock for the
         methods we assert should not be called.
         """
         self.compute.init_host()
 
+        mock_error_interrupted.assert_called_once_with(mock.ANY, set())
+
     @mock.patch('nova.objects.InstanceList')
     @mock.patch('nova.objects.MigrationList.get_by_filters')
     def test_cleanup_host(self, mock_miglist_get, mock_instance_list):
@@ -802,6 +811,8 @@ class ComputeManagerUnitTestCase(test.NoDBTestCase):
         self.assertFalse(mock_register.called)
 
     @mock.patch('nova.context.RequestContext.elevated')
+    @mock.patch.object(manager.ComputeManager,
+                       '_error_out_instances_whose_build_was_interrupted')
     @mock.patch('nova.objects.ComputeNode.get_by_host_and_nodename')
     @mock.patch('nova.scheduler.utils.resources_from_flavor')
     @mock.patch.object(manager.ComputeManager, '_get_instances_on_driver')
@@ -816,7 +827,7 @@ class ComputeManagerUnitTestCase(test.NoDBTestCase):
     def test_init_host_with_evacuated_instance(self, mock_save, mock_mig_get,
             mock_temp_mut, mock_init_host, mock_destroy, mock_host_get,
             mock_admin_ctxt, mock_init_virt, mock_get_inst, mock_resources,
-            mock_get_node, mock_elevated):
+            mock_get_node, mock_error_interrupted, mock_elevated):
         our_host = self.compute.host
         not_our_host = 'not-' + our_host
 
@@ -866,6 +877,11 @@ class ComputeManagerUnitTestCase(test.NoDBTestCase):
             mock.ANY, mock.ANY, mock.ANY)
         mock_save.assert_called_once_with()
 
+        mock_error_interrupted.assert_called_once_with(
+            self.context, {deleted_instance.uuid})
+
+    @mock.patch.object(manager.ComputeManager,
+                       '_error_out_instances_whose_build_was_interrupted')
     @mock.patch.object(context, 'get_admin_context')
     @mock.patch.object(objects.InstanceList, 'get_by_host')
     @mock.patch.object(fake_driver.FakeDriver, 'init_host')
@@ -874,7 +890,7 @@ class ComputeManagerUnitTestCase(test.NoDBTestCase):
                 '_destroy_evacuated_instances')
     def test_init_host_with_in_progress_evacuations(self, mock_destroy_evac,
             mock_init_instance, mock_init_host, mock_host_get,
-            mock_admin_ctxt):
+            mock_admin_ctxt, mock_error_interrupted):
         """Assert that init_instance is not called for instances that are
         evacuating from the host during init_host.
         """
@@ -895,6 +911,145 @@ class ComputeManagerUnitTestCase(test.NoDBTestCase):
 
         mock_init_instance.assert_called_once_with(
             self.context, active_instance)
+        mock_error_interrupted.assert_called_once_with(
+            self.context, {active_instance.uuid, evacuating_instance.uuid})
+
+    @mock.patch.object(objects.Instance, 'save')
+    @mock.patch.object(objects.InstanceList, 'get_by_filters')
+    @mock.patch('nova.scheduler.client.report.SchedulerReportClient.'
+                'get_allocations_for_resource_provider')
+    @mock.patch.object(objects.ComputeNode, 'get_by_host_and_nodename')
+    @mock.patch.object(fake_driver.FakeDriver, 'get_available_nodes')
+    def test_init_host_with_interrupted_instance_build(
+            self, mock_get_nodes, mock_get_by_host_and_node,
+            mock_get_allocations, mock_get_instances, mock_instance_save):
+
+        mock_get_nodes.return_value = ['fake-node']
+        mock_get_by_host_and_node.return_value = objects.ComputeNode(
+            host=self.compute.host, uuid=uuids.cn_uuid)
+
+        active_instance = fake_instance.fake_instance_obj(
+            self.context, host=self.compute.host, uuid=uuids.active_instance)
+        evacuating_instance = fake_instance.fake_instance_obj(
+            self.context, host=self.compute.host, uuid=uuids.evac_instance)
+        interrupted_instance = fake_instance.fake_instance_obj(
+            self.context, host=None, uuid=uuids.interrupted_instance,
+            vm_state=vm_states.BUILDING)
+
+        # We have three different instances, so we need a consumer for each
+        # of them in placement plus an extra consumer that is not an instance
+        allocations = {
+            uuids.active_instance: "fake-resources-active",
+            uuids.evac_instance: "fake-resources-evacuating",
+            uuids.interrupted_instance: "fake-resources-interrupted",
+            uuids.not_an_instance: "fake-resources-not-an-instance",
+        }
+        mock_get_allocations.return_value = allocations
+
+        # get_by_filters is called with a uuid filter containing
+        # interrupted_instance and not_an_instance but it will only return
+        # the interrupted_instance, as not_an_instance does not match any
+        # instance in the db.
+        mock_get_instances.return_value = objects.InstanceList(
+            self.context, objects=[interrupted_instance])
+
+        # interrupted_instance is not in the set passed in because it is not
+        # assigned to the compute and therefore not processed by init_host
+        # and init_instance
+        self.compute._error_out_instances_whose_build_was_interrupted(
+            self.context,
+            {inst.uuid for inst in [active_instance, evacuating_instance]})
+
+        mock_get_by_host_and_node.assert_called_once_with(
+            self.context, self.compute.host, 'fake-node')
+        mock_get_allocations.assert_called_once_with(
+            self.context, uuids.cn_uuid)
+
+        mock_get_instances.assert_called_once_with(
+            self.context,
+            {'vm_state': 'building',
+             'uuid': {uuids.interrupted_instance, uuids.not_an_instance}
+             },
+            expected_attrs=[])
+
+        # this is expected to be called only once, for interrupted_instance
+        mock_instance_save.assert_called_once_with()
+        self.assertEqual(vm_states.ERROR, interrupted_instance.vm_state)
+
+    @mock.patch.object(manager.LOG, 'warning')
+    @mock.patch.object(
+        fake_driver.FakeDriver, 'get_available_nodes',
+        side_effect=exception.VirtDriverNotReady)
+    def test_init_host_with_interrupted_instance_build_driver_not_ready(
+            self, mock_get_nodes, mock_log_warning):
+        self.compute._error_out_instances_whose_build_was_interrupted(
+            self.context, set())
+
+        mock_log_warning.assert_called_once_with(
+            "Virt driver is not ready. Therefore unable to error out any "
+            "instances stuck in BUILDING state on this node. If this is the "
+            "first time this service is starting on this host, then you can "
+            "ignore this warning.")
+
+    @mock.patch('nova.scheduler.client.report.SchedulerReportClient.'
+                'get_allocations_for_resource_provider')
+    @mock.patch.object(objects.ComputeNode, 'get_by_host_and_nodename')
+    @mock.patch.object(fake_driver.FakeDriver, 'get_available_nodes')
+    def test_init_host_with_interrupted_instance_build_compute_node_not_found(
+            self, mock_get_nodes, mock_get_by_host_and_node,
+            mock_get_allocations):
+
+        mock_get_nodes.return_value = ['fake-node1', 'fake-node2']
+        mock_get_by_host_and_node.side_effect = [
+            exception.ComputeHostNotFound(host='fake-node1'),
+            objects.ComputeNode(host=self.compute.host, uuid=uuids.cn_uuid)]
+
+        self.compute._error_out_instances_whose_build_was_interrupted(
+            self.context, set())
+
+        # check that nova skips the node that is not found in the db and
+        # continues with the next one
+        mock_get_by_host_and_node.assert_has_calls(
+            [
+                mock.call(self.context, self.compute.host, 'fake-node1'),
+                mock.call(self.context, self.compute.host, 'fake-node2'),
+            ]
+        )
+
+        # placement is only queried for the compute node that exists
+        mock_get_allocations.assert_called_once_with(
+            self.context, uuids.cn_uuid)
+
+    @mock.patch('nova.scheduler.client.report.SchedulerReportClient.'
+                'get_allocations_for_resource_provider')
+    @mock.patch.object(objects.ComputeNode, 'get_by_host_and_nodename')
+    @mock.patch.object(fake_driver.FakeDriver, 'get_available_nodes')
+    def test_init_host_with_interrupted_instance_build_compute_rp_not_found(
+            self, mock_get_nodes, mock_get_by_host_and_node,
+            mock_get_allocations):
+
+        mock_get_nodes.return_value = ['fake-node1', 'fake-node2']
+        mock_get_by_host_and_node.side_effect = [
+            objects.ComputeNode(host=self.compute.host, uuid=uuids.cn1_uuid),
+            objects.ComputeNode(host=self.compute.host, uuid=uuids.cn2_uuid),
+        ]
+
+        mock_get_allocations.side_effect = [
+            {},
+            {uuids.active_instance: "fake-resources"}
+        ]
+
+        self.compute._error_out_instances_whose_build_was_interrupted(
+            self.context, {uuids.active_instance})
+
+        # check that nova skips the node that is not found in placement and
+        # continues with the next one
+        mock_get_allocations.assert_has_calls(
+            [
+                mock.call(self.context, uuids.cn1_uuid),
+                mock.call(self.context, uuids.cn2_uuid),
+            ]
+        )
+
     def test_init_instance_with_binding_failed_vif_type(self):
         # this instance will plug a 'binding_failed' vif
@@ -3781,6 +3936,14 @@ class ComputeManagerUnitTestCase(test.NoDBTestCase):
         self._do_test_set_admin_password_driver_error(
             exc, vm_states.ACTIVE, None, expected_exception)
 
+    def test_destroy_evacuated_instances_no_migrations(self):
+        with mock.patch(
+                'nova.objects.MigrationList.get_by_filters') as migration_list:
+            migration_list.return_value = []
+
+            result = self.compute._destroy_evacuated_instances(self.context)
+            self.assertEqual({}, result)
+
     def test_destroy_evacuated_instances(self):
         our_host = self.compute.host
         flavor = objects.Flavor()