Reduce gen conflict in COMPUTE_STATUS_DISABLED handling

The COMPUTE_STATUS_DISABLED trait is supposed to be added to the compute
RP when the compute service is disabled, and removed when the service is
enabled again. However, adding and removing traits is prone to
generation conflicts in placement. The original implementation of the
blueprint pre-filter-disabled-computes recognized this and only prints a
detailed warning message while the API operation succeeds. We can ignore
the conflict this way because the periodic
update_available_resource() call will re-sync the traits later.
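
For context, placement protects trait updates with the resource provider
generation: the writer sends back the generation it last saw, and
placement rejects the update with HTTP 409 if the provider has changed
in the meantime. A minimal sketch of the exchange, assuming a
keystoneauth1 Adapter pointed at the placement endpoint (the payload
shape follows the placement API; the helper itself is illustrative, not
nova code):

    from keystoneauth1 import adapter

    def put_traits(placement: adapter.Adapter, rp_uuid, traits, generation):
        # Placement only applies the update if `generation` still matches
        # the provider's current generation; otherwise it returns 409.
        resp = placement.put(
            '/resource_providers/%s/traits' % rp_uuid,
            json={'resource_provider_generation': generation,
                  'traits': list(traits)},
            raise_exc=False)
        if resp.status_code == 409:
            # A concurrent writer bumped the generation; the caller has
            # to refresh its view of the provider and retry.
            raise RuntimeError('generation conflict on %s' % rp_uuid)
        # On success placement reports the provider's new generation,
        # which the caller should remember for the next update.
        return resp.json()['resource_provider_generation']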

Still, this leaves a human-noticeable time window during which the trait
and the service state are out of sync.

Setting the compute service to disabled is the smaller problem, as the
scheduler still uses the ComputeFilter, which filters computes based on
the service API. So during the enable->disable race window we only lose
scheduling performance, as the placement pre-filter is ineffective.

In case of setting the compute service to enabled, the race is more
visible, as the placement pre-filter keeps filtering out the compute
that was just enabled by the admin until the re-sync happens. If the
conflict could only happen due to high load on the given compute, such a
delay could be explained by the load itself. However, the conflict can
happen simply because a new instance is booted on the compute, since the
instance's allocation bumps the provider generation.

Fortunately the solution is easy and cheap. The service state handling
code path has already queried placement for the existing traits on the
compute RP, and therefore it also receives the current RP generation.
But it does not use this information; instead it relies on the
potentially stale provider_tree cache.

This patch uses the much fresher RP generation known by the service state
handling code instead of the potentially stale provider_tree cache.
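
In outline, the fixed flow threads that generation through (a condensed,
hypothetical sketch mirroring the ComputeVirtAPI hunk in the first file
below; compute_new_traits stands in for the real trait add/remove
logic):

    def update_compute_provider_status(reportclient, context, rp_uuid,
                                       enabled):
        # trait_info comes straight from placement, so its generation is
        # fresher than anything in the provider_tree cache.
        trait_info = reportclient.get_provider_traits(context, rp_uuid)
        # Hypothetical helper: add or remove COMPUTE_STATUS_DISABLED and
        # return None if the trait set is already in the desired state.
        new_traits = compute_new_traits(trait_info.traits, enabled)
        if new_traits is not None:
            # Pass the fresh generation instead of letting the report
            # client fall back to its potentially stale cache.
            reportclient.set_traits_for_provider(
                context, rp_uuid, new_traits,
                generation=trait_info.generation)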

The change in the notification test is also due to the fixed behavior.
The test disables the compute. Previously this caused the
FilterScheduler to detect that there is no valid host. Now the scheduler
manager already detects this based on the empty placement response. As a
consequence the FilterScheduler is not called at all, and therefore the
select_destinations.start notification is not sent.

Closes-Bug: #1886418

Change-Id: Ib3c455bf21f33923bb82e3f5c53035f6722480d3
Author: Balazs Gibizer
Date: 2020-01-29 19:48:12 +01:00
Commit: 1b661c2669 (parent: f167891218)
6 changed files with 67 additions and 18 deletions

---- file 1 of 6 ----

@@ -507,7 +507,7 @@ class ComputeVirtAPI(virtapi.VirtAPI):
         if new_traits is not None:
             self.reportclient.set_traits_for_provider(
-                context, rp_uuid, new_traits)
+                context, rp_uuid, new_traits, generation=trait_info.generation)


 class ComputeManager(manager.Manager):

---- file 2 of 6 ----

@@ -19,6 +19,7 @@ import copy
 import functools
 import random
 import time
+import typing as ty

 from keystoneauth1 import exceptions as ks_exc
 import os_resource_classes as orc
@@ -32,6 +33,7 @@ import six

 from nova.compute import provider_tree
 import nova.conf
+from nova import context as nova_context
 from nova import exception
 from nova.i18n import _
 from nova import objects
@@ -998,7 +1000,13 @@ class SchedulerReportClient(object):
         raise exception.TraitRetrievalFailed(error=resp.text)

     @safe_connect
-    def set_traits_for_provider(self, context, rp_uuid, traits):
+    def set_traits_for_provider(
+        self,
+        context: nova_context.RequestContext,
+        rp_uuid: str,
+        traits: ty.Iterable[str],
+        generation: int = None
+    ):
         """Replace a provider's traits with those specified.

         The provider must exist - this method does not attempt to create it.
@@ -1006,6 +1014,9 @@
         :param context: The security context
         :param rp_uuid: The UUID of the provider whose traits are to be updated
         :param traits: Iterable of traits to set on the provider
+        :param generation: The resource provider generation if known. If not
+                           provided then the value from the provider tree cache
+                           will be used.
         :raises: ResourceProviderUpdateConflict if the provider's generation
                  doesn't match the generation in the cache. Callers may choose
                  to retrieve the provider and its associations afresh and
@@ -1028,7 +1039,8 @@
         # that method doesn't return content, and we need to update the cached
         # provider tree with the new generation.
         traits = list(traits) if traits else []
-        generation = self._provider_tree.data(rp_uuid).generation
+        if generation is None:
+            generation = self._provider_tree.data(rp_uuid).generation
         payload = {
             'resource_provider_generation': generation,
             'traits': traits,

---- file 3 of 6 ----

@@ -109,9 +109,7 @@ class TestComputeTaskNotificationSample(
         # NoValidHost asynchronously.
         self.admin_api.post_server_action(server['id'], {'migrate': None})
         self._wait_for_notification('compute_task.migrate_server.error')
-        # 0. scheduler.select_destinations.start
-        # 1. compute_task.migrate_server.error
-        self.assertEqual(2, len(fake_notifier.VERSIONED_NOTIFICATIONS),
+        self.assertEqual(1, len(fake_notifier.VERSIONED_NOTIFICATIONS),
                          fake_notifier.VERSIONED_NOTIFICATIONS)
         self._verify_notification(
             'compute_task-migrate_server-error',
@@ -121,8 +119,9 @@
                 'request_spec.security_groups': [],
                 'request_spec.numa_topology.instance_uuid': server['id'],
                 'request_spec.pci_requests.instance_uuid': server['id'],
+                'reason.exception_message': 'No valid host was found. ',
                 'reason.function_name': self.ANY,
                 'reason.module_name': self.ANY,
                 'reason.traceback': self.ANY
             },
-            actual=fake_notifier.VERSIONED_NOTIFICATIONS[1])
+            actual=fake_notifier.VERSIONED_NOTIFICATIONS[0])

---- file 4 of 6 ----

@@ -58,16 +58,9 @@ class TestServices(integrated_helpers._IntegratedTestBase):
         self._create_server(networks=[])

         self._disable_compute()
-        # FIXME(gibi): Check that COMPUTE_STATUS_DISABLED is now on the
-        # compute. Unfortunately it is not true as the compute manager failed
-        # to update the traits in placement due to a stale provide tree cache.
-        # It is stale because a server is booted on the compute since the last
-        # update_available_resource periodic was run.
-        self.assertIn(
-            'An error occurred while updating COMPUTE_STATUS_DISABLED trait '
-            'on compute node resource provider',
-            self.stdlog.logger.output)
-        self.assertFalse(self._has_disabled_trait())
+        # Check that COMPUTE_STATUS_DISABLED is now on the compute.
+        self.assertTrue(self._has_disabled_trait())

         # This would be the expected behavior
         #
@@ -82,3 +75,8 @@ class TestServices(integrated_helpers._IntegratedTestBase):
         self._enable_compute()
         # Check that COMPUTE_STATUS_DISABLED is removed from the compute
         self.assertFalse(self._has_disabled_trait())
+        self.assertNotIn(
+            'An error occurred while updating COMPUTE_STATUS_DISABLED trait '
+            'on compute node resource provider',
+            self.stdlog.logger.output,
+            "This is probably bug 1886418.")

---- file 5 of 6 ----

@@ -98,7 +98,8 @@ class FakeCompute(object):
     def _get_provider_traits(self, context, rp_uuid):
         return mock.Mock(traits=self.provider_traits[rp_uuid])

-    def _set_traits_for_provider(self, context, rp_uuid, traits):
+    def _set_traits_for_provider(
+            self, context, rp_uuid, traits, generation=None):
         self.provider_traits[rp_uuid] = traits

     def _event_waiter(self):

---- file 6 of 6 ----

@@ -2938,6 +2938,45 @@ class TestTraits(SchedulerReportClientTestCase):
         self.assertEqual(
             1, self.client._provider_tree.data(uuids.rp).generation)

+    def test_set_traits_for_provider_with_generation(self):
+        traits = ['HW_NIC_OFFLOAD_UCS', 'HW_NIC_OFFLOAD_RDMA']
+        # Make _ensure_traits succeed without PUTting
+        get_mock = mock.Mock(status_code=200)
+        get_mock.json.return_value = {'traits': traits}
+        self.ks_adap_mock.get.return_value = get_mock
+
+        # Prime the provider tree cache
+        self.client._provider_tree.new_root('rp', uuids.rp, generation=0)
+
+        # Mock the /rp/{u}/traits PUT to succeed
+        put_mock = mock.Mock(status_code=200)
+        put_mock.json.return_value = {'traits': traits,
+                                      'resource_provider_generation': 2}
+        self.ks_adap_mock.put.return_value = put_mock
+
+        # Invoke
+        self.client.set_traits_for_provider(
+            self.context, uuids.rp, traits, generation=1)
+
+        # Verify API calls
+        self.ks_adap_mock.get.assert_called_once_with(
+            '/traits?name=in:' + ','.join(traits),
+            global_request_id=self.context.global_id,
+            **self.trait_api_kwargs)
+        self.ks_adap_mock.put.assert_called_once_with(
+            '/resource_providers/%s/traits' % uuids.rp,
+            json={'traits': traits, 'resource_provider_generation': 1},
+            global_request_id=self.context.global_id,
+            **self.trait_api_kwargs)
+
+        # And ensure the provider tree cache was updated appropriately
+        self.assertFalse(
+            self.client._provider_tree.have_traits_changed(uuids.rp, traits))
+
+        # Validate the generation
+        self.assertEqual(
+            2, self.client._provider_tree.data(uuids.rp).generation)
+
     def test_set_traits_for_provider_fail(self):
         traits = ['HW_NIC_OFFLOAD_UCS', 'HW_NIC_OFFLOAD_RDMA']
         get_mock = mock.Mock()