Merge "Resource tracker compatibility with Ocata and Pike"
This commit is contained in:
commit
4cd6a3a1b4
@@ -13,7 +13,7 @@
     "disabled_reason": null,
     "report_count": 1,
     "forced_down": false,
-    "version": 21,
+    "version": 22,
     "availability_zone": null,
     "uuid": "fa69c544-906b-4a6a-a9c6-c1f7a8078c73"
 }
@@ -52,7 +52,7 @@ def main():
 
     cmd_common.block_db_access('nova-compute')
     objects_base.NovaObject.indirection_api = conductor_rpcapi.ConductorAPI()
-
+    objects.Service.enable_min_version_cache()
     server = service.Service.create(binary='nova-compute',
                                     topic=compute_rpcapi.RPC_TOPIC)
     service.serve(server)
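Both of the cache-related calls touched by this change (objects.Service.enable_min_version_cache() here and clear_min_version_cache() in the test base class further down) exist because the resource tracker now asks for the deployment-wide minimum nova-compute version on every periodic pass. A rough sketch of how the pair is meant to be used, assuming the cache simply memoizes get_minimum_version() results per binary:

    from nova import objects

    # At nova-compute startup (as in the hunk above): turn on memoization so
    # the periodic resource-tracker task does not issue a conductor/DB query
    # for the minimum service version on every single run.
    objects.Service.enable_min_version_cache()

    # In test setUp (see the nova/test.py hunk below): drop any cached value
    # so one test's minimum version cannot leak into the next test.
    objects.Service.clear_min_version_cache()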
@@ -1010,7 +1010,7 @@ class ResourceTracker(object):
             continue
 
     def _update_usage_from_instance(self, context, instance, nodename,
-                                    is_removed=False):
+                                    is_removed=False, has_ocata_computes=False):
         """Update usage for a single instance."""
 
         uuid = instance['uuid']
@@ -1038,7 +1038,40 @@
                 self.pci_tracker.update_pci_for_instance(context,
                                                          instance,
                                                          sign=sign)
-            self.reportclient.update_instance_allocation(cn, instance, sign)
+            if has_ocata_computes:
+                LOG.debug("We're on a Pike compute host in a deployment "
+                          "with Ocata compute hosts. Auto-correcting "
+                          "allocations to handle Ocata-style assumptions.")
+                self.reportclient.update_instance_allocation(cn, instance,
+                                                             sign)
+            else:
+                # NOTE(jaypipes): We're on a Pike compute host or later in
+                # a deployment with all compute hosts upgraded to Pike or
+                # later
+                #
+                # If that is the case, then we know that the scheduler will
+                # have properly created an allocation and that the compute
+                # hosts have not attempted to overwrite allocations
+                # **during the periodic update_available_resource() call**.
+                # However, Pike compute hosts may still rework an
+                # allocation for an instance in a move operation during
+                # confirm_resize() on the source host which will remove the
+                # source resource provider from any allocation for an
+                # instance.
+                #
+                # In Queens and beyond, the scheduler will understand when
+                # a move operation has been requested and instead of
+                # creating a doubled-up allocation that contains both the
+                # source and destination host, the scheduler will take the
+                # original allocation (against the source host) and change
+                # the consumer ID of that allocation to be the migration
+                # UUID and not the instance UUID. The scheduler will
+                # allocate the resources for the destination host to the
+                # instance UUID.
+                LOG.debug("We're on a Pike compute host in a deployment "
+                          "with all Pike compute hosts. Skipping "
+                          "auto-correction of allocations.")
+
             # new instance, update compute node resource usage:
             self._update_usage(self._get_usage_dict(instance), nodename,
                                sign=sign)
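Read in isolation, the new branch above boils down to: only overwrite ("heal") an instance's placement allocation when the deployment still contains Ocata computes; otherwise trust whatever the Pike scheduler wrote. A distilled sketch of that decision, with simplified names rather than the full ResourceTracker method:

    import logging

    LOG = logging.getLogger(__name__)


    def sync_instance_allocation(reportclient, cn, instance, sign,
                                 has_ocata_computes):
        """Mirror the Pike-era auto-healing decision from the hunk above."""
        if has_ocata_computes:
            # An Ocata source host may have trampled this allocation during
            # its own periodic task, so rewrite it from this host's view.
            LOG.debug("Auto-correcting allocation for instance %s",
                      instance.uuid)
            reportclient.update_instance_allocation(cn, instance, sign)
        else:
            # All computes are Pike or newer: the scheduler-created
            # allocation is authoritative, so leave it alone.
            LOG.debug("Skipping allocation auto-correction for instance %s",
                      instance.uuid)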
@@ -1068,9 +1101,44 @@ class ResourceTracker(object):
         cn.current_workload = 0
         cn.running_vms = 0
 
+        # NOTE(jaypipes): In Pike, we need to be tolerant of Ocata compute
+        # nodes that overwrite placement allocations to look like what the
+        # resource tracker *thinks* is correct. When an instance is
+        # migrated from an Ocata compute node to a Pike compute node, the
+        # Pike scheduler will have created a "doubled-up" allocation that
+        # contains allocated resources against both the source and
+        # destination hosts. The Ocata source compute host, during its
+        # update_available_resource() periodic call will find the instance
+        # in its list of known instances and will call
+        # update_instance_allocation() in the report client. That call will
+        # pull the allocations for the instance UUID which will contain
+        # both the source and destination host providers in the allocation
+        # set. Seeing that this is different from what the Ocata source
+        # host thinks it should be and will overwrite the allocation to
+        # only be an allocation against itself.
+        #
+        # And therefore, here we need to have Pike compute hosts
+        # "correct" the improper healing that the Ocata source host did
+        # during its periodic interval. When the instance is fully migrated
+        # to the Pike compute host, the Ocata compute host will find an
+        # allocation that refers to itself for an instance it no longer
+        # controls and will *delete* all allocations that refer to that
+        # instance UUID, assuming that the instance has been deleted. We
+        # need the destination Pike compute host to recreate that
+        # allocation to refer to its own resource provider UUID.
+        #
+        # For Pike compute nodes that migrate to either a Pike compute host
+        # or a Queens compute host, we do NOT want the Pike compute host to
+        # be "healing" allocation information. Instead, we rely on the Pike
+        # scheduler to properly create allocations during scheduling.
+        compute_version = objects.Service.get_minimum_version(
+            context, 'nova-compute')
+        has_ocata_computes = compute_version < 22
+
         for instance in instances:
             if instance.vm_state not in vm_states.ALLOW_RESOURCE_REMOVAL:
-                self._update_usage_from_instance(context, instance, nodename)
+                self._update_usage_from_instance(context, instance, nodename,
+                    has_ocata_computes=has_ocata_computes)
 
         self._remove_deleted_instances_allocations(context, cn)
 
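The has_ocata_computes flag fed into the call above is derived once per periodic run from the deployment-wide minimum nova-compute service version: any live compute still reporting a version below 22 keeps the Ocata-compatible healing path switched on. A minimal standalone sketch of that check, assuming it runs inside a nova service where the object registry is already set up:

    from nova import context as nova_context
    from nova import objects


    def deployment_has_ocata_computes():
        """Return True while any nova-compute is below service version 22."""
        ctxt = nova_context.get_admin_context()
        # get_minimum_version() reports the lowest version recorded by any
        # live nova-compute service, so a single not-yet-upgraded host is
        # enough to keep allocation auto-healing enabled everywhere.
        min_version = objects.Service.get_minimum_version(ctxt, 'nova-compute')
        return min_version < 22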
@@ -1141,7 +1209,6 @@ class ResourceTracker(object):
                         "There are allocations remaining against the source "
                         "host that might need to be removed: %s.",
                         instance_uuid, instance.host, instance.node, alloc)
-                continue
 
     def _find_orphaned_instances(self):
        """Given the set of instances and migrations already account for
@@ -32,7 +32,7 @@ LOG = logging.getLogger(__name__)
 
 
 # NOTE(danms): This is the global service version counter
-SERVICE_VERSION = 21
+SERVICE_VERSION = 22
 
 
 # NOTE(danms): This is our SERVICE_VERSION history. The idea is that any
@@ -110,6 +110,9 @@ SERVICE_VERSION_HISTORY = (
     {'compute_rpc': '4.16'},
     # Version 21: Compute RPC version 4.17
     {'compute_rpc': '4.17'},
+    # Version 22: A marker for the behaviour change of auto-healing code on the
+    # compute host regarding allocations against an instance
+    {'compute_rpc': '4.17'},
 )
 
 
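Version 22 deliberately pins the same compute RPC API (4.17) as version 21: the bump carries no RPC change and acts purely as a marker that other hosts can detect through the minimum-version check above. Since SERVICE_VERSION_HISTORY is a tuple indexed by service version number, the marker reads roughly like this:

    from nova.objects.service import SERVICE_VERSION_HISTORY

    # Entries 21 and 22 advertise the same RPC pin, so 22 only flags the
    # allocation auto-healing behaviour change introduced by this commit.
    assert SERVICE_VERSION_HISTORY[21] == {'compute_rpc': '4.17'}
    assert SERVICE_VERSION_HISTORY[22] == {'compute_rpc': '4.17'}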
@@ -260,6 +260,7 @@ class TestCase(testtools.TestCase):
         self._base_test_obj_backup = copy.copy(
             objects_base.NovaObjectRegistry._registry._obj_classes)
         self.addCleanup(self._restore_obj_registry)
+        objects.Service.clear_min_version_cache()
 
         # NOTE(danms): Reset the cached list of cells
         from nova.compute import api
@@ -290,12 +290,17 @@ class IronicResourceTrackerTest(test.TestCase):
         self.assertEqual(3, len(inv))
 
         # Now "spawn" an instance to the first compute node by calling the
-        # RT's instance_claim(), which should, in the case of an Ironic
-        # instance, grab the full compute node for the instance and write
-        # allocation records for VCPU, MEMORY_MB, and DISK_GB
+        # RT's instance_claim().
         cn1_obj = self.COMPUTE_NODE_FIXTURES[uuids.cn1]
         cn1_nodename = cn1_obj.hypervisor_hostname
         inst = self.INSTANCE_FIXTURES[uuids.instance1]
+        # Since we're pike, the scheduler would have created our
+        # allocation for us. So, we can use our old update routine
+        # here to mimic that before we go do the compute RT claim,
+        # and then the checks below.
+        self.rt.reportclient.update_instance_allocation(cn1_obj,
+                                                         inst,
+                                                         1)
         with self.rt.instance_claim(self.ctx, inst, cn1_nodename):
             pass
 
@@ -41,7 +41,8 @@ class TestLocalDeleteAllocations(test.TestCase,
         self.start_service('conductor')
         self.start_service('consoleauth')
 
-        self.flags(group='scheduler', driver='chance_scheduler')
+        self.flags(enabled_filters=['RetryFilter', 'ComputeFilter'],
+                   group='filter_scheduler')
         self.start_service('scheduler')
 
         self.compute = self.start_service('compute')
@@ -1081,11 +1081,7 @@ class ServerMovingTests(test.TestCase, integrated_helpers.InstanceHelperMixin):
 
         self.useFixture(policy_fixture.RealPolicyFixture())
         self.useFixture(nova_fixtures.NeutronFixture(self))
-
-        # NOTE(gibi): After fix 1707071 we need to set the service version to
-        # pike to test pike only interactions. We need separate tests for
-        # ocata - pike interactions
-        # self.useFixture(nova_fixtures.AllServicesCurrent())
+        self.useFixture(nova_fixtures.AllServicesCurrent())
 
         placement = self.useFixture(nova_fixtures.PlacementFixture())
         self.placement_api = placement.api
@@ -1308,7 +1304,6 @@ class ServerMovingTests(test.TestCase, integrated_helpers.InstanceHelperMixin):
         source_allocation = allocations[source_rp_uuid]['resources']
         self.assertFlavorMatchesAllocation(old_flavor, source_allocation)
 
-        self.assertEqual(2, len(allocations))
         dest_allocation = allocations[dest_rp_uuid]['resources']
         self.assertFlavorMatchesAllocation(new_flavor, dest_allocation)
 
@@ -1394,6 +1389,7 @@ class ServerMovingTests(test.TestCase, integrated_helpers.InstanceHelperMixin):
         # destination host
         allocations = self._get_allocations_by_server_uuid(server['id'])
 
+        # and the server allocates only from the target host
         self.assertEqual(1, len(allocations))
 
         source_usages = self._get_provider_usages(source_rp_uuid)
@@ -1407,6 +1403,12 @@ class ServerMovingTests(test.TestCase, integrated_helpers.InstanceHelperMixin):
                          'The source host %s still has usages after the '
                          'resize has been confirmed' % source_hostname)
 
+        # and the target host allocation should be according to the new flavor
+        self.assertFlavorMatchesAllocation(self.flavor2, dest_usages)
+
+        dest_allocation = allocations[dest_rp_uuid]['resources']
+        self.assertFlavorMatchesAllocation(self.flavor2, dest_allocation)
+
         self._run_periodics()
 
         # Check we're still accurate after running the periodics
@@ -1462,29 +1464,13 @@ class ServerMovingTests(test.TestCase, integrated_helpers.InstanceHelperMixin):
 
         usages = self._get_provider_usages(rp_uuid)
 
-        # NOTE(gibi): This is bug 1707071 where the compute "healing" periodic
-        # tramples on the doubled allocations created in the scheduler.
-        self.assertFlavorMatchesAllocation(new_flavor, usages)
-
-        # NOTE(gibi): After fixing bug 1707252 the following is expected
-        # self.assertEqual(old_flavor['vcpus'] + new_flavor['vcpus'],
-        #                  usages['VCPU'])
-        # self.assertEqual(old_flavor['ram'] + new_flavor['ram'],
-        #                  usages['MEMORY_MB'])
-        # self.assertEqual(old_flavor['disk'] + new_flavor['disk'],
-        #                  usages['DISK_GB'])
+        self.assertFlavorsMatchAllocation(old_flavor, new_flavor, usages)
 
         allocations = self._get_allocations_by_server_uuid(server['id'])
         self.assertEqual(1, len(allocations))
         allocation = allocations[rp_uuid]['resources']
 
-        # NOTE(gibi): After fixing bug 1707252 the following is expected
-        # self.assertEqual(old_flavor['vcpus'] + new_flavor['vcpus'],
-        #                  allocation['VCPU'])
-        # self.assertEqual(old_flavor['ram'] + new_flavor['ram'],
-        #                  allocation['MEMORY_MB'])
-        # self.assertEqual(old_flavor['disk'] + new_flavor['disk'],
-        #                  allocation['DISK_GB'])
+        self.assertFlavorsMatchAllocation(old_flavor, new_flavor, allocation)
 
     def test_resize_revert_same_host(self):
         # make sure that the test only uses a single host
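The removed NOTE(gibi) blocks documented the pre-fix behaviour: on a same-host resize the scheduler doubles up the allocation (old flavor plus new flavor against the single provider), and the healing periodic used to trample that back down to just the new flavor. With the healing now skipped, the commented-out expectations become real assertions via the new assertFlavorsMatchAllocation helper. Its implementation is not part of this hunk, but based on the removed comments it presumably checks the summed resources of both flavors, along these lines (hypothetical sketch):

    def assert_flavors_match_allocation(test, old_flavor, new_flavor, usages):
        # Until the same-host resize is confirmed or reverted, the provider
        # is expected to hold the resources of *both* flavors at once.
        test.assertEqual(old_flavor['vcpus'] + new_flavor['vcpus'],
                         usages['VCPU'])
        test.assertEqual(old_flavor['ram'] + new_flavor['ram'],
                         usages['MEMORY_MB'])
        test.assertEqual(old_flavor['disk'] + new_flavor['disk'],
                         usages['DISK_GB'])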
@@ -5643,6 +5643,8 @@ class ComputeManagerMigrationTestCase(test.NoDBTestCase):
                                      self.instance, self.instance.node)
         self.assertIsNotNone(self.instance.migration_context)
 
+        @mock.patch('nova.objects.Service.get_minimum_version',
+                    return_value=22)
         @mock.patch.object(self.compute, "_notify_about_instance_usage")
         @mock.patch.object(self.compute, "_set_instance_info")
         @mock.patch.object(self.instance, 'save')
|
||||
mock_mig_save,
|
||||
mock_inst_save,
|
||||
mock_set,
|
||||
mock_notify):
|
||||
mock_notify,
|
||||
mock_version):
|
||||
self.compute.finish_revert_resize(context=self.context,
|
||||
instance=self.instance,
|
||||
reservations=None,
|
||||
|
@@ -31,6 +31,7 @@ from nova.objects import base as obj_base
 from nova.objects import fields as obj_fields
 from nova.objects import pci_device
 from nova.pci import manager as pci_manager
+from nova.scheduler import utils as sched_utils
 from nova import test
 from nova.tests.unit.objects import test_pci_device as fake_pci_device
 from nova.tests import uuidsentinel as uuids
@@ -467,7 +468,9 @@ class BaseTestCase(test.NoDBTestCase):
 
 class TestUpdateAvailableResources(BaseTestCase):
 
-    def _update_available_resources(self):
+    @mock.patch('nova.objects.Service.get_minimum_version',
+                return_value=22)
+    def _update_available_resources(self, version_mock):
         # We test RT._update separately, since the complexity
         # of the update_available_resource() function is high enough as
         # it is, we just want to focus here on testing the resources
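The same @mock.patch('nova.objects.Service.get_minimum_version', return_value=22) decorator is added to a number of tests below so that the resource tracker sees an all-Pike deployment. Because stacked mock.patch decorators are applied bottom-up, the mock for a newly inserted decorator is passed to the test just before the mocks of the decorators written above it, which is why each signature gains its version_mock parameter in that particular position. A toy illustration of the ordering rule, unrelated to nova:

    import os
    from unittest import mock


    @mock.patch('os.path.exists', return_value=True)   # outermost decorator
    @mock.patch('os.getcwd', return_value='/tmp')      # innermost decorator
    def check(getcwd_mock, exists_mock):
        # Decorators apply bottom-up, so the innermost patch supplies the
        # first mock argument; adding a new patch above an existing one
        # shifts its mock towards the end of the argument list.
        assert os.getcwd() == '/tmp' and getcwd_mock.called
        assert os.path.exists('/anything') and exists_mock.called


    check()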
@@ -1688,6 +1691,8 @@ class TestInstanceClaim(BaseTestCase):
 class TestResize(BaseTestCase):
     @mock.patch('nova.compute.utils.is_volume_backed_instance',
                 return_value=False)
+    @mock.patch('nova.objects.Service.get_minimum_version',
+                return_value=22)
     @mock.patch('nova.objects.InstancePCIRequests.get_by_instance',
                 return_value=objects.InstancePCIRequests(requests=[]))
     @mock.patch('nova.objects.PciDeviceList.get_by_compute_node',
@@ -1698,7 +1703,7 @@
     @mock.patch('nova.objects.ComputeNode.save')
     def test_resize_claim_same_host(self, save_mock, get_mock, migr_mock,
                                     get_cn_mock, pci_mock, instance_pci_mock,
-                                    is_bfv_mock):
+                                    version_mock, is_bfv_mock):
         # Resize an existing instance from its current flavor (instance type
         # 1) to a new flavor (instance type 2) and verify that the compute
         # node's resources are appropriately updated to account for the new
@@ -1788,6 +1793,8 @@
 
     @mock.patch('nova.compute.utils.is_volume_backed_instance',
                 return_value=False)
+    @mock.patch('nova.objects.Service.get_minimum_version',
+                return_value=22)
     @mock.patch('nova.objects.InstancePCIRequests.get_by_instance_uuid',
                 return_value=objects.InstancePCIRequests(requests=[]))
     @mock.patch('nova.objects.InstancePCIRequests.get_by_instance',
@@ -1809,8 +1816,10 @@
                                     pci_get_by_compute_node_mock,
                                     pci_get_by_instance_mock,
                                     pci_get_by_instance_uuid_mock,
+                                    version_mock,
                                     is_bfv_mock,
                                     revert=False):
+
         self.flags(reserved_host_disk_mb=0,
                    reserved_host_memory_mb=0)
         virt_resources = copy.deepcopy(_VIRT_DRIVER_AVAIL_RESOURCES)
@@ -1893,12 +1902,23 @@
                               ignore=['stats']
                               ))
 
-    def test_instance_build_resize_confirm(self):
-        self._test_instance_build_resize()
+        cn = self.rt.compute_nodes[_NODENAME]
+        cn_uuid = cn.uuid
+        rc = self.sched_client_mock.reportclient
+        remove_method = rc.remove_provider_from_instance_allocation
+        expected_resources = sched_utils.resources_from_flavor(instance,
+                                                               flavor)
+        remove_method.assert_called_once_with(instance.uuid, cn_uuid,
+            instance.user_id, instance.project_id, expected_resources)
 
     def test_instance_build_resize_revert(self):
         self._test_instance_build_resize(revert=True)
 
+    def test_instance_build_resize_confirm(self):
+        self._test_instance_build_resize()
+
+    @mock.patch('nova.objects.Service.get_minimum_version',
+                return_value=22)
     @mock.patch('nova.pci.stats.PciDeviceStats.support_requests',
                 return_value=True)
     @mock.patch('nova.objects.PciDevice.save')
@@ -1911,7 +1931,7 @@
     @mock.patch('nova.objects.ComputeNode.save')
     def test_resize_claim_dest_host_with_pci(self, save_mock, get_mock,
             migr_mock, get_cn_mock, pci_mock, pci_req_mock, pci_claim_mock,
-            pci_dev_save_mock, pci_supports_mock):
+            pci_dev_save_mock, pci_supports_mock, version_mock):
         # Starting from an empty destination compute node, perform a resize
         # operation for an instance containing SR-IOV PCI devices on the
         # original host.
@@ -2038,7 +2058,12 @@
         self.rt.drop_move_claim(ctx, instance, _NODENAME)
         mock_pci_free_device.assert_called_once_with(
             pci_dev, mock.ANY)
+        # Check that we grabbed resourced for the right flavor...
+        mock_resources.assert_called_once_with(instance,
+                                               instance.flavor)
 
+    @mock.patch('nova.objects.Service.get_minimum_version',
+                return_value=22)
     @mock.patch('nova.objects.InstancePCIRequests.get_by_instance',
                 return_value=objects.InstancePCIRequests(requests=[]))
     @mock.patch('nova.objects.PciDeviceList.get_by_compute_node',
@@ -2048,7 +2073,7 @@
     @mock.patch('nova.objects.InstanceList.get_by_host_and_node')
     @mock.patch('nova.objects.ComputeNode.save')
     def test_resize_claim_two_instances(self, save_mock, get_mock, migr_mock,
-            get_cn_mock, pci_mock, instance_pci_mock):
+            get_cn_mock, pci_mock, instance_pci_mock, version_mock):
         # Issue two resize claims against a destination host with no prior
         # instances on it and validate that the accounting for resources is
         # correct.
@@ -2163,6 +2188,8 @@
 
 
 class TestRebuild(BaseTestCase):
+    @mock.patch('nova.objects.Service.get_minimum_version',
+                return_value=22)
     @mock.patch('nova.objects.InstancePCIRequests.get_by_instance',
                 return_value=objects.InstancePCIRequests(requests=[]))
     @mock.patch('nova.objects.PciDeviceList.get_by_compute_node',
@@ -2172,7 +2199,7 @@
     @mock.patch('nova.objects.InstanceList.get_by_host_and_node')
     @mock.patch('nova.objects.ComputeNode.save')
     def test_rebuild_claim(self, save_mock, get_mock, migr_mock, get_cn_mock,
-                           pci_mock, instance_pci_mock):
+                           pci_mock, instance_pci_mock, version_mock):
         # Rebuild an instance, emulating an evacuate command issued against the
         # original instance. The rebuild operation uses the resource tracker's
         # _move_claim() method, but unlike with resize_claim(), rebuild_claim()