Merge "Cleanup allocations in failed prep_resize"

2017-08-25 00:27:39 +00:00 · 2017-08-25 00:27:39 +00:00 · a99e936ff1
commit a99e936ff1
parent a7b49f6690 b53133ba50
4 changed files with 122 additions and 39 deletions
--- a/nova/compute/manager.py
+++ b/nova/compute/manager.py
@ -3789,6 +3789,7 @@ class ComputeManager(manager.Manager):
                                              current_period=True)
            self._notify_about_instance_usage(
                    context, instance, "resize.prep.start")
            failed = False
            try:
                self._prep_resize(context, image, instance,
                                  instance_type, filter_properties,
@ -3797,14 +3798,31 @@ class ComputeManager(manager.Manager):
            #               instance to be migrated is backed by LVM.
            #               Remove when LVM migration is implemented.
            except exception.MigrationPreCheckError:
                # TODO(mriedem): How is it even possible to get here?
                # _prep_resize does not call the driver. The resize_instance
                # method does, but we RPC cast to the source node to do that
                # so we shouldn't even get this exception...
                failed = True
                raise
            except Exception:
                failed = True
                # try to re-schedule the resize elsewhere:
                exc_info = sys.exc_info()
                self._reschedule_resize_or_reraise(context, image, instance,
                        exc_info, instance_type, request_spec,
                        filter_properties)
            finally:
                if failed:
                    # Since we hit a failure, we're either rescheduling or dead
                    # and either way we need to cleanup any allocations created
                    # by the scheduler for the destination node. Note that for
                    # a resize to the same host, the scheduler will merge the
                    # flavors, so here we'd be subtracting the new flavor from
                    # the allocated resources on this node.
                    rt = self._get_resource_tracker()
                    rt.delete_allocation_for_failed_resize(
                        instance, node, instance_type)
                extra_usage_info = dict(
                        new_instance_type=instance_type.name,
                        new_instance_type_id=instance_type.id)
--- a/nova/compute/resource_tracker.py
+++ b/nova/compute/resource_tracker.py
@ -1237,6 +1237,30 @@ class ResourceTracker(object):
                      'on the source node %s', cn.uuid,
                      instance=instance)
    def delete_allocation_for_failed_resize(self, instance, node, flavor):
        """Delete instance allocations for the node during a failed resize
        :param instance: The instance being resized/migrated.
        :param node: The node provider on which the instance should have
            allocations to remove. If this is a resize to the same host, then
            the new_flavor resources are subtracted from the single allocation.
        :param flavor: This is the new_flavor during a resize.
        """
        resources = scheduler_utils.resources_from_flavor(instance, flavor)
        cn = self.compute_nodes[node]
        res = self.reportclient.remove_provider_from_instance_allocation(
            instance.uuid, cn.uuid, instance.user_id, instance.project_id,
            resources)
        if not res:
            if instance.instance_type_id == flavor.id:
                operation = 'migration'
            else:
                operation = 'resize'
            LOG.error('Failed to clean allocation after a failed '
                      '%(operation)s on node %(node)s',
                      {'operation': operation, 'node': cn.uuid},
                      instance=instance)
    def _find_orphaned_instances(self):
        """Given the set of instances and migrations already account for
        by resource tracker, sanity check the hypervisor to determine
--- a/nova/scheduler/client/report.py
+++ b/nova/scheduler/client/report.py
@ -1029,11 +1029,15 @@ class SchedulerReportClient(object):
        then PUTs the resulting allocation back to the placement API for the
        consumer.
-        We call this method on two occasions: on the source host during a
+        We call this method on three occasions:
-        confirm_resize() operation and on the destination host during a
+
-        revert_resize() operation. This is important to reconcile the
+        1. On the source host during a confirm_resize() operation.
-        "doubled-up" allocation that the scheduler constructs when claiming
+        2. On the destination host during a revert_resize() operation.
-        resources against the destination host during a move operation.
+        3. On the destination host when prep_resize fails.
        This is important to reconcile the "doubled-up" allocation that the
        scheduler constructs when claiming resources against the destination
        host during a move operation.
        If the move was between hosts, the entire allocation for rp_uuid will
        be dropped. If the move is a resize on the same host, then we will
--- a/nova/tests/functional/test_servers.py
+++ b/nova/tests/functional/test_servers.py
@ -1949,39 +1949,17 @@ class ServerMovingTests(test.TestCase, integrated_helpers.InstanceHelperMixin):
        self.assertFlavorMatchesAllocation(
           {'vcpus': 0, 'ram': 0, 'disk': 0}, failed_usages)
-    def test_rescheduling_when_migrating_instance(self):
+    def _wait_for_prep_resize_fail_completion(self, server, expected_action):
-        """Tests that allocations are removed from the destination node by
+        """Polls instance action events for the given instance and action
-        the compute service when a cold migrate / resize fails and a reschedule
+        until it finds the compute_prep_resize action event with an error
-        request is sent back to conductor.
+        result.
        """
        source_hostname = self.compute1.manager.host
        server = self._boot_and_check_allocations(
            self.flavor1, source_hostname)
        def fake_prep_resize(*args, **kwargs):
            raise test.TestingException('Simulated _prep_resize failure.')
        # Yes this isn't great in a functional test, but it's simple.
        self.stub_out('nova.compute.manager.ComputeManager._prep_resize',
                      fake_prep_resize)
        # Now migrate the server which is going to fail on the destination.
        self.api.post_server_action(server['id'], {'migrate': None})
        # When the destination compute calls conductor and conductor calls
        # the scheduler to get another host, it's going to fail since there
        # are only two hosts and we started on the first and failed on the
        # second, so the scheduler will raise NoValidHost and a fault will
        # be recorded on the instance. However, the instance is not put into
        # ERROR state since it's still OK on the source host, but faults are
        # only shown in the API for ERROR/DELETED instances, so we can't poll
        # for the fault. So we poll for instance action events instead.
        completion_event = None
        for attempt in range(10):
            actions = self.api.get_instance_actions(server['id'])
            # Look for the migrate action.
            for action in actions:
-                if action['action'] == 'migrate':
+                if action['action'] == expected_action:
                    events = (
                        self.api.api_get(
                            '/servers/%s/os-instance-actions/%s' %
@ -2005,19 +1983,78 @@ class ServerMovingTests(test.TestCase, integrated_helpers.InstanceHelperMixin):
            self.fail('Timed out waiting for compute_prep_resize failure '
                      'event. Current instance actions: %s' % actions)
    def test_rescheduling_when_migrating_instance(self):
        """Tests that allocations are removed from the destination node by
        the compute service when a cold migrate / resize fails and a reschedule
        request is sent back to conductor.
        """
        source_hostname = self.compute1.manager.host
        server = self._boot_and_check_allocations(
            self.flavor1, source_hostname)
        def fake_prep_resize(*args, **kwargs):
            raise test.TestingException('Simulated _prep_resize failure.')
        # Yes this isn't great in a functional test, but it's simple.
        self.stub_out('nova.compute.manager.ComputeManager._prep_resize',
                      fake_prep_resize)
        # Now migrate the server which is going to fail on the destination.
        self.api.post_server_action(server['id'], {'migrate': None})
        self._wait_for_prep_resize_fail_completion(server, 'migrate')
        dest_hostname = self._other_hostname(source_hostname)
        dest_rp_uuid = self._get_provider_uuid_by_host(dest_hostname)
        failed_usages = self._get_provider_usages(dest_rp_uuid)
-        # FIXME(mriedem): Due to bug 1712850 the allocations are not removed
+        # Expects no allocation records on the failed host.
-        # from the destination node before the reschedule. Remove this once
+        self.assertFlavorMatchesAllocation(
-        # the bug is fixed as it should be:
+           {'vcpus': 0, 'ram': 0, 'disk': 0}, failed_usages)
        # # Expects no allocation records on the failed host.
        # self.assertFlavorMatchesAllocation(
        #    {'vcpus': 0, 'ram': 0, 'disk': 0}, failed_usages)
        self.assertFlavorMatchesAllocation(self.flavor1, failed_usages)
        # Ensure the allocation records still exist on the source host.
        source_rp_uuid = self._get_provider_uuid_by_host(source_hostname)
        source_usages = self._get_provider_usages(source_rp_uuid)
        self.assertFlavorMatchesAllocation(self.flavor1, source_usages)
    def test_resize_to_same_host_prep_resize_fails(self):
        """Tests that when we resize to the same host and resize fails in
        the prep_resize method, we cleanup the allocations before rescheduling.
        """
        # make sure that the test only uses a single host
        compute2_service_id = self.admin_api.get_services(
            host=self.compute2.host, binary='nova-compute')[0]['id']
        self.admin_api.put_service(compute2_service_id, {'status': 'disabled'})
        hostname = self.compute1.manager.host
        rp_uuid = self._get_provider_uuid_by_host(hostname)
        server = self._boot_and_check_allocations(self.flavor1, hostname)
        def fake_prep_resize(*args, **kwargs):
            # Ensure the allocations are doubled now before we fail.
            usages = self._get_provider_usages(rp_uuid)
            self.assertFlavorsMatchAllocation(
                self.flavor1, self.flavor2, usages)
            raise test.TestingException('Simulated _prep_resize failure.')
        # Yes this isn't great in a functional test, but it's simple.
        self.stub_out('nova.compute.manager.ComputeManager._prep_resize',
                      fake_prep_resize)
        self.flags(allow_resize_to_same_host=True)
        resize_req = {
            'resize': {
                'flavorRef': self.flavor2['id']
            }
        }
        self.api.post_server_action(server['id'], resize_req)
        self._wait_for_prep_resize_fail_completion(server, 'resize')
        # Ensure the allocation records still exist on the host.
        source_rp_uuid = self._get_provider_uuid_by_host(hostname)
        source_usages = self._get_provider_usages(source_rp_uuid)
        # The new_flavor should have been subtracted from the doubled
        # allocation which just leaves us with the original flavor.
        self.assertFlavorMatchesAllocation(self.flavor1, source_usages)