diff --git a/doc/notification_samples/service-update.json b/doc/notification_samples/service-update.json
index e080ab9c65b2..9e7220e73a60 100644
--- a/doc/notification_samples/service-update.json
+++ b/doc/notification_samples/service-update.json
@@ -13,7 +13,7 @@
         "disabled_reason": null,
         "report_count": 1,
         "forced_down": false,
-        "version": 21,
+        "version": 22,
         "availability_zone": null,
         "uuid": "fa69c544-906b-4a6a-a9c6-c1f7a8078c73"
     }
diff --git a/nova/cmd/compute.py b/nova/cmd/compute.py
index 53342e2f07eb..eb48853d45fb 100644
--- a/nova/cmd/compute.py
+++ b/nova/cmd/compute.py
@@ -52,7 +52,7 @@ def main():
     cmd_common.block_db_access('nova-compute')
     objects_base.NovaObject.indirection_api = conductor_rpcapi.ConductorAPI()
-
+    objects.Service.enable_min_version_cache()
     server = service.Service.create(binary='nova-compute',
                                     topic=compute_rpcapi.RPC_TOPIC)
     service.serve(server)
diff --git a/nova/compute/resource_tracker.py b/nova/compute/resource_tracker.py
index 2ab30161caf4..994f1ca82f36 100644
--- a/nova/compute/resource_tracker.py
+++ b/nova/compute/resource_tracker.py
@@ -1010,7 +1010,7 @@ class ResourceTracker(object):
             continue

     def _update_usage_from_instance(self, context, instance, nodename,
-                                    is_removed=False):
+                                    is_removed=False, has_ocata_computes=False):
         """Update usage for a single instance."""

         uuid = instance['uuid']
@@ -1038,7 +1038,40 @@ class ResourceTracker(object):
                 self.pci_tracker.update_pci_for_instance(context,
                                                          instance,
                                                          sign=sign)
-            self.reportclient.update_instance_allocation(cn, instance, sign)
+            if has_ocata_computes:
+                LOG.debug("We're on a Pike compute host in a deployment "
+                          "with Ocata compute hosts. Auto-correcting "
+                          "allocations to handle Ocata-style assumptions.")
+                self.reportclient.update_instance_allocation(cn, instance,
+                                                             sign)
+            else:
+                # NOTE(jaypipes): We're on a Pike compute host or later in
+                # a deployment with all compute hosts upgraded to Pike or
+                # later.
+                #
+                # If that is the case, then we know that the scheduler will
+                # have properly created an allocation and that the compute
+                # hosts have not attempted to overwrite allocations
+                # **during the periodic update_available_resource() call**.
+                # However, Pike compute hosts may still rework an
+                # allocation for an instance in a move operation during
+                # confirm_resize() on the source host, which will remove the
+                # source resource provider from any allocation for an
+                # instance.
+                #
+                # In Queens and beyond, the scheduler will understand when
+                # a move operation has been requested and, instead of
+                # creating a doubled-up allocation that contains both the
+                # source and destination host, the scheduler will take the
+                # original allocation (against the source host) and change
+                # the consumer ID of that allocation to be the migration
+                # UUID and not the instance UUID. The scheduler will
+                # allocate the resources for the destination host to the
+                # instance UUID.
+                LOG.debug("We're on a Pike compute host in a deployment "
+                          "with all Pike compute hosts. Skipping "
+                          "auto-correction of allocations.")
+
         # new instance, update compute node resource usage:
         self._update_usage(self._get_usage_dict(instance), nodename,
                            sign=sign)
@@ -1068,9 +1101,44 @@ class ResourceTracker(object):
             cn.current_workload = 0
             cn.running_vms = 0

+        # NOTE(jaypipes): In Pike, we need to be tolerant of Ocata compute
+        # nodes that overwrite placement allocations to look like what the
+        # resource tracker *thinks* is correct. When an instance is
+        # migrated from an Ocata compute node to a Pike compute node, the
+        # Pike scheduler will have created a "doubled-up" allocation that
+        # contains allocated resources against both the source and
+        # destination hosts. The Ocata source compute host, during its
+        # update_available_resource() periodic call, will find the instance
+        # in its list of known instances and will call
+        # update_instance_allocation() in the report client. That call will
+        # pull the allocations for the instance UUID which will contain
+        # both the source and destination host providers in the allocation
+        # set. Seeing that this is different from what the Ocata source
+        # host thinks it should be, it will overwrite the allocation to
+        # only be an allocation against itself.
+        #
+        # And therefore, here we need to have Pike compute hosts
+        # "correct" the improper healing that the Ocata source host did
+        # during its periodic interval. When the instance is fully migrated
+        # to the Pike compute host, the Ocata compute host will find an
+        # allocation that refers to itself for an instance it no longer
+        # controls and will *delete* all allocations that refer to that
+        # instance UUID, assuming that the instance has been deleted. We
+        # need the destination Pike compute host to recreate that
+        # allocation to refer to its own resource provider UUID.
+        #
+        # For Pike compute nodes that migrate to either a Pike compute host
+        # or a Queens compute host, we do NOT want the Pike compute host to
+        # be "healing" allocation information. Instead, we rely on the Pike
+        # scheduler to properly create allocations during scheduling.
+        compute_version = objects.Service.get_minimum_version(
+            context, 'nova-compute')
+        has_ocata_computes = compute_version < 22
+
         for instance in instances:
             if instance.vm_state not in vm_states.ALLOW_RESOURCE_REMOVAL:
-                self._update_usage_from_instance(context, instance, nodename)
+                self._update_usage_from_instance(context, instance, nodename,
+                    has_ocata_computes=has_ocata_computes)

         self._remove_deleted_instances_allocations(context, cn)
@@ -1141,7 +1209,6 @@ class ResourceTracker(object):
                 "There are allocations remaining against the source "
                 "host that might need to be removed: %s.",
                 instance_uuid, instance.host, instance.node, alloc)
-            continue

     def _find_orphaned_instances(self):
         """Given the set of instances and migrations already account for
diff --git a/nova/objects/service.py b/nova/objects/service.py
index 6a7c8ea8a521..ff9058910b80 100644
--- a/nova/objects/service.py
+++ b/nova/objects/service.py
@@ -32,7 +32,7 @@ LOG = logging.getLogger(__name__)


 # NOTE(danms): This is the global service version counter
-SERVICE_VERSION = 21
+SERVICE_VERSION = 22


 # NOTE(danms): This is our SERVICE_VERSION history. The idea is that any
@@ -110,6 +110,9 @@ SERVICE_VERSION_HISTORY = (
     {'compute_rpc': '4.16'},
     # Version 21: Compute RPC version 4.17
     {'compute_rpc': '4.17'},
+    # Version 22: A marker for the behaviour change of auto-healing code on the
+    # compute host regarding allocations against an instance
+    {'compute_rpc': '4.17'},
 )
diff --git a/nova/test.py b/nova/test.py
index c4fc62a10f5e..24d65105db23 100644
--- a/nova/test.py
+++ b/nova/test.py
@@ -260,6 +260,7 @@ class TestCase(testtools.TestCase):
             self._base_test_obj_backup = copy.copy(
                 objects_base.NovaObjectRegistry._registry._obj_classes)
             self.addCleanup(self._restore_obj_registry)
+            objects.Service.clear_min_version_cache()

         # NOTE(danms): Reset the cached list of cells
         from nova.compute import api
diff --git a/nova/tests/functional/compute/test_resource_tracker.py b/nova/tests/functional/compute/test_resource_tracker.py
index ee7d98e472cb..9559ceaee20f 100644
--- a/nova/tests/functional/compute/test_resource_tracker.py
+++ b/nova/tests/functional/compute/test_resource_tracker.py
@@ -290,12 +290,17 @@ class IronicResourceTrackerTest(test.TestCase):
         self.assertEqual(3, len(inv))

         # Now "spawn" an instance to the first compute node by calling the
-        # RT's instance_claim(), which should, in the case of an Ironic
-        # instance, grab the full compute node for the instance and write
-        # allocation records for VCPU, MEMORY_MB, and DISK_GB
+        # RT's instance_claim().
         cn1_obj = self.COMPUTE_NODE_FIXTURES[uuids.cn1]
         cn1_nodename = cn1_obj.hypervisor_hostname
         inst = self.INSTANCE_FIXTURES[uuids.instance1]
+        # Since we're on Pike, the scheduler would have created our
+        # allocation for us. So, we can use our old update routine
+        # here to mimic that before we go do the compute RT claim,
+        # and then the checks below.
+        self.rt.reportclient.update_instance_allocation(cn1_obj,
+                                                        inst,
+                                                        1)
         with self.rt.instance_claim(self.ctx, inst, cn1_nodename):
             pass
diff --git a/nova/tests/functional/regressions/test_bug_1679750.py b/nova/tests/functional/regressions/test_bug_1679750.py
index 6cbaf9934f94..b93ce5107c5c 100644
--- a/nova/tests/functional/regressions/test_bug_1679750.py
+++ b/nova/tests/functional/regressions/test_bug_1679750.py
@@ -41,7 +41,8 @@ class TestLocalDeleteAllocations(test.TestCase,
         self.start_service('conductor')
         self.start_service('consoleauth')

-        self.flags(group='scheduler', driver='chance_scheduler')
+        self.flags(enabled_filters=['RetryFilter', 'ComputeFilter'],
+                   group='filter_scheduler')
         self.start_service('scheduler')

         self.compute = self.start_service('compute')
diff --git a/nova/tests/functional/test_servers.py b/nova/tests/functional/test_servers.py
index ab7439a6f58e..bc32a28f5d8a 100644
--- a/nova/tests/functional/test_servers.py
+++ b/nova/tests/functional/test_servers.py
@@ -1081,11 +1081,7 @@ class ServerMovingTests(test.TestCase, integrated_helpers.InstanceHelperMixin):
         self.useFixture(policy_fixture.RealPolicyFixture())
         self.useFixture(nova_fixtures.NeutronFixture(self))
-
-        # NOTE(gibi): After fix 1707071 we need to set the service version to
-        # pike to test pike only interactions. We need separate tests for
-        # ocata - pike interactions
-        # self.useFixture(nova_fixtures.AllServicesCurrent())
+        self.useFixture(nova_fixtures.AllServicesCurrent())

         placement = self.useFixture(nova_fixtures.PlacementFixture())
         self.placement_api = placement.api
@@ -1308,7 +1304,6 @@ class ServerMovingTests(test.TestCase, integrated_helpers.InstanceHelperMixin):

         source_allocation = allocations[source_rp_uuid]['resources']
         self.assertFlavorMatchesAllocation(old_flavor, source_allocation)
-        self.assertEqual(2, len(allocations))

         dest_allocation = allocations[dest_rp_uuid]['resources']
         self.assertFlavorMatchesAllocation(new_flavor, dest_allocation)
@@ -1394,6 +1389,7 @@ class ServerMovingTests(test.TestCase, integrated_helpers.InstanceHelperMixin):
         # destination host
         allocations = self._get_allocations_by_server_uuid(server['id'])

+        # and the server allocates only from the target host
         self.assertEqual(1, len(allocations))

         source_usages = self._get_provider_usages(source_rp_uuid)
@@ -1407,6 +1403,12 @@ class ServerMovingTests(test.TestCase, integrated_helpers.InstanceHelperMixin):
                          'The source host %s still has usages after the '
                          'resize has been confirmed' % source_hostname)

+        # and the target host allocation should be according to the new flavor
+        self.assertFlavorMatchesAllocation(self.flavor2, dest_usages)
+
+        dest_allocation = allocations[dest_rp_uuid]['resources']
+        self.assertFlavorMatchesAllocation(self.flavor2, dest_allocation)
+
         self._run_periodics()

         # Check we're still accurate after running the periodics
@@ -1462,29 +1464,13 @@ class ServerMovingTests(test.TestCase, integrated_helpers.InstanceHelperMixin):

         usages = self._get_provider_usages(rp_uuid)

-        # NOTE(gibi): This is bug 1707071 where the compute "healing" periodic
-        # tramples on the doubled allocations created in the scheduler.
-        self.assertFlavorMatchesAllocation(new_flavor, usages)
-
-        # NOTE(gibi): After fixing bug 1707252 the following is expected
-        # self.assertEqual(old_flavor['vcpus'] + new_flavor['vcpus'],
-        #                  usages['VCPU'])
-        # self.assertEqual(old_flavor['ram'] + new_flavor['ram'],
-        #                  usages['MEMORY_MB'])
-        # self.assertEqual(old_flavor['disk'] + new_flavor['disk'],
-        #                  usages['DISK_GB'])
+        self.assertFlavorsMatchAllocation(old_flavor, new_flavor, usages)

         allocations = self._get_allocations_by_server_uuid(server['id'])
         self.assertEqual(1, len(allocations))
         allocation = allocations[rp_uuid]['resources']

-        # NOTE(gibi): After fixing bug 1707252 the following is expected
-        # self.assertEqual(old_flavor['vcpus'] + new_flavor['vcpus'],
-        #                  allocation['VCPU'])
-        # self.assertEqual(old_flavor['ram'] + new_flavor['ram'],
-        #                  allocation['MEMORY_MB'])
-        # self.assertEqual(old_flavor['disk'] + new_flavor['disk'],
-        #                  allocation['DISK_GB'])
+        self.assertFlavorsMatchAllocation(old_flavor, new_flavor, allocation)

     def test_resize_revert_same_host(self):
         # make sure that the test only uses a single host
diff --git a/nova/tests/unit/compute/test_compute_mgr.py b/nova/tests/unit/compute/test_compute_mgr.py
index a66182d0670a..beb955d2ab6e 100644
--- a/nova/tests/unit/compute/test_compute_mgr.py
+++ b/nova/tests/unit/compute/test_compute_mgr.py
@@ -5643,6 +5643,8 @@ class ComputeManagerMigrationTestCase(test.NoDBTestCase):
                 self.instance, self.instance.node)
             self.assertIsNotNone(self.instance.migration_context)

+        @mock.patch('nova.objects.Service.get_minimum_version',
+                    return_value=22)
         @mock.patch.object(self.compute, "_notify_about_instance_usage")
         @mock.patch.object(self.compute, "_set_instance_info")
         @mock.patch.object(self.instance, 'save')
@@ -5665,7 +5667,8 @@
                     mock_mig_save,
                     mock_inst_save,
                     mock_set,
-                    mock_notify):
+                    mock_notify,
+                    mock_version):
             self.compute.finish_revert_resize(context=self.context,
                                               instance=self.instance,
                                               reservations=None,
diff --git a/nova/tests/unit/compute/test_resource_tracker.py b/nova/tests/unit/compute/test_resource_tracker.py
index 6c456b7e4fe0..e2c726c2bc91 100644
--- a/nova/tests/unit/compute/test_resource_tracker.py
+++ b/nova/tests/unit/compute/test_resource_tracker.py
@@ -31,6 +31,7 @@ from nova.objects import base as obj_base
 from nova.objects import fields as obj_fields
 from nova.objects import pci_device
 from nova.pci import manager as pci_manager
+from nova.scheduler import utils as sched_utils
 from nova import test
 from nova.tests.unit.objects import test_pci_device as fake_pci_device
 from nova.tests import uuidsentinel as uuids
@@ -467,7 +468,9 @@ class BaseTestCase(test.NoDBTestCase):

 class TestUpdateAvailableResources(BaseTestCase):

-    def _update_available_resources(self):
+    @mock.patch('nova.objects.Service.get_minimum_version',
+                return_value=22)
+    def _update_available_resources(self, version_mock):
         # We test RT._update separately, since the complexity
         # of the update_available_resource() function is high enough as
         # it is, we just want to focus here on testing the resources
@@ -1688,6 +1691,8 @@ class TestInstanceClaim(BaseTestCase):

 class TestResize(BaseTestCase):
     @mock.patch('nova.compute.utils.is_volume_backed_instance',
                 return_value=False)
+    @mock.patch('nova.objects.Service.get_minimum_version',
+                return_value=22)
     @mock.patch('nova.objects.InstancePCIRequests.get_by_instance',
                 return_value=objects.InstancePCIRequests(requests=[]))
     @mock.patch('nova.objects.PciDeviceList.get_by_compute_node',
@@ -1698,7 +1703,7 @@ class TestResize(BaseTestCase):
     @mock.patch('nova.objects.ComputeNode.save')
     def test_resize_claim_same_host(self, save_mock, get_mock, migr_mock,
                                     get_cn_mock, pci_mock, instance_pci_mock,
-                                    is_bfv_mock):
+                                    version_mock, is_bfv_mock):
         # Resize an existing instance from its current flavor (instance type
         # 1) to a new flavor (instance type 2) and verify that the compute
         # node's resources are appropriately updated to account for the new
@@ -1788,6 +1793,8 @@ class TestResize(BaseTestCase):

     @mock.patch('nova.compute.utils.is_volume_backed_instance',
                 return_value=False)
+    @mock.patch('nova.objects.Service.get_minimum_version',
+                return_value=22)
     @mock.patch('nova.objects.InstancePCIRequests.get_by_instance_uuid',
                 return_value=objects.InstancePCIRequests(requests=[]))
     @mock.patch('nova.objects.InstancePCIRequests.get_by_instance',
@@ -1809,8 +1816,10 @@ class TestResize(BaseTestCase):
                            pci_get_by_compute_node_mock,
                            pci_get_by_instance_mock,
                            pci_get_by_instance_uuid_mock,
+                           version_mock,
                            is_bfv_mock,
                            revert=False):
+        self.flags(reserved_host_disk_mb=0, reserved_host_memory_mb=0)
         virt_resources = copy.deepcopy(_VIRT_DRIVER_AVAIL_RESOURCES)
@@ -1893,12 +1902,23 @@ class TestResize(BaseTestCase):
                 ignore=['stats']
             ))

-    def test_instance_build_resize_confirm(self):
-        self._test_instance_build_resize()
+        cn = self.rt.compute_nodes[_NODENAME]
+        cn_uuid = cn.uuid
+        rc = self.sched_client_mock.reportclient
+        remove_method = rc.remove_provider_from_instance_allocation
+        expected_resources = sched_utils.resources_from_flavor(instance,
+                                                               flavor)
+        remove_method.assert_called_once_with(instance.uuid, cn_uuid,
+            instance.user_id, instance.project_id, expected_resources)

     def test_instance_build_resize_revert(self):
         self._test_instance_build_resize(revert=True)

+    def test_instance_build_resize_confirm(self):
+        self._test_instance_build_resize()
+
+    @mock.patch('nova.objects.Service.get_minimum_version',
+                return_value=22)
     @mock.patch('nova.pci.stats.PciDeviceStats.support_requests',
                 return_value=True)
     @mock.patch('nova.objects.PciDevice.save')
@@ -1911,7 +1931,7 @@ class TestResize(BaseTestCase):
     @mock.patch('nova.objects.ComputeNode.save')
     def test_resize_claim_dest_host_with_pci(self, save_mock, get_mock,
             migr_mock, get_cn_mock, pci_mock, pci_req_mock, pci_claim_mock,
-            pci_dev_save_mock, pci_supports_mock):
+            pci_dev_save_mock, pci_supports_mock, version_mock):
         # Starting from an empty destination compute node, perform a resize
         # operation for an instance containing SR-IOV PCI devices on the
         # original host.
@@ -2038,7 +2058,12 @@ class TestResize(BaseTestCase):
         self.rt.drop_move_claim(ctx, instance, _NODENAME)
         mock_pci_free_device.assert_called_once_with(
             pci_dev, mock.ANY)
+        # Check that we grabbed resources for the right flavor...
+        mock_resources.assert_called_once_with(instance,
+                                               instance.flavor)

+    @mock.patch('nova.objects.Service.get_minimum_version',
+                return_value=22)
     @mock.patch('nova.objects.InstancePCIRequests.get_by_instance',
                 return_value=objects.InstancePCIRequests(requests=[]))
     @mock.patch('nova.objects.PciDeviceList.get_by_compute_node',
@@ -2048,7 +2073,7 @@ class TestResize(BaseTestCase):
     @mock.patch('nova.objects.InstanceList.get_by_host_and_node')
     @mock.patch('nova.objects.ComputeNode.save')
     def test_resize_claim_two_instances(self, save_mock, get_mock, migr_mock,
-            get_cn_mock, pci_mock, instance_pci_mock):
+            get_cn_mock, pci_mock, instance_pci_mock, version_mock):
         # Issue two resize claims against a destination host with no prior
         # instances on it and validate that the accounting for resources is
         # correct.
@@ -2163,6 +2188,8 @@ class TestResize(BaseTestCase):

 class TestRebuild(BaseTestCase):
+    @mock.patch('nova.objects.Service.get_minimum_version',
+                return_value=22)
     @mock.patch('nova.objects.InstancePCIRequests.get_by_instance',
                 return_value=objects.InstancePCIRequests(requests=[]))
     @mock.patch('nova.objects.PciDeviceList.get_by_compute_node',
@@ -2172,7 +2199,7 @@ class TestRebuild(BaseTestCase):
     @mock.patch('nova.objects.InstanceList.get_by_host_and_node')
     @mock.patch('nova.objects.ComputeNode.save')
     def test_rebuild_claim(self, save_mock, get_mock, migr_mock, get_cn_mock,
-            pci_mock, instance_pci_mock):
+            pci_mock, instance_pci_mock, version_mock):
         # Rebuild an instance, emulating an evacuate command issued against the
         # original instance. The rebuild operation uses the resource tracker's
         # _move_claim() method, but unlike with resize_claim(), rebuild_claim()
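The crux of the change above is the minimum-service-version gate: the resource tracker only performs Ocata-style allocation "healing" while at least one nova-compute service in the deployment still reports a SERVICE_VERSION below 22. The standalone sketch below illustrates that pattern; the in-memory RUNNING_SERVICES table and the update_available_resource() stub are stand-ins invented for illustration, while SERVICE_VERSION, get_minimum_version(), and enable_min_version_cache() mirror names used in the patch.

```python
# Illustrative sketch of the service-version gate; not nova code.

# Global counter, bumped once per behaviour change (this patch: 21 -> 22).
SERVICE_VERSION = 22

# Pretend view of the services table: each running nova-compute row stores
# the SERVICE_VERSION the service reported when it started.
RUNNING_SERVICES = {
    'compute1': 22,  # upgraded to Pike
    'compute2': 21,  # still running Ocata-era code
}


def get_minimum_version(binary):
    """Return the lowest version reported by any running service.

    Stands in for objects.Service.get_minimum_version(context, binary),
    which queries the database (optionally through the cache enabled by
    enable_min_version_cache() in nova-compute's main()).
    """
    return min(RUNNING_SERVICES.values())


def update_available_resource():
    # The deployment is only as capable as its oldest compute, so the gate
    # keys off the minimum version across all hosts, not the local one.
    has_ocata_computes = get_minimum_version('nova-compute') < 22
    if has_ocata_computes:
        # Mixed Ocata/Pike: keep re-correcting allocations each periodic,
        # because an Ocata host may have overwritten them.
        return 'heal allocations'
    # All-Pike deployment: the scheduler owns allocations; skip healing.
    return 'skip healing'


print(update_available_resource())  # -> 'heal allocations' (compute2 is 21)
```

Bumping SERVICE_VERSION to 22 without changing the RPC version (the history entry repeats '4.17') is what makes this work: version 22 is purely a behaviour marker, so get_minimum_version() can distinguish a mixed Ocata/Pike deployment from an all-Pike one.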