Merge "Reduce gen conflict in COMPUTE_STATUS_DISABLED handling"

This commit is contained in:
Zuul 2020-07-22 02:22:33 +00:00 committed by Gerrit Code Review
commit b1e34c594b
6 changed files with 67 additions and 18 deletions

View File

@ -507,7 +507,7 @@ class ComputeVirtAPI(virtapi.VirtAPI):
if new_traits is not None:
self.reportclient.set_traits_for_provider(
context, rp_uuid, new_traits)
context, rp_uuid, new_traits, generation=trait_info.generation)
class ComputeManager(manager.Manager):

View File

@ -19,6 +19,7 @@ import copy
import functools
import random
import time
import typing as ty
from keystoneauth1 import exceptions as ks_exc
import os_resource_classes as orc
@ -32,6 +33,7 @@ import six
from nova.compute import provider_tree
import nova.conf
from nova import context as nova_context
from nova import exception
from nova.i18n import _
from nova import objects
@ -998,7 +1000,13 @@ class SchedulerReportClient(object):
raise exception.TraitRetrievalFailed(error=resp.text)
@safe_connect
def set_traits_for_provider(self, context, rp_uuid, traits):
def set_traits_for_provider(
self,
context: nova_context.RequestContext,
rp_uuid: str,
traits: ty.Iterable[str],
generation: int = None
):
"""Replace a provider's traits with those specified.
The provider must exist - this method does not attempt to create it.
@ -1006,6 +1014,9 @@ class SchedulerReportClient(object):
:param context: The security context
:param rp_uuid: The UUID of the provider whose traits are to be updated
:param traits: Iterable of traits to set on the provider
:param generation: The resource provider generation if known. If not
provided then the value from the provider tree cache
will be used.
:raises: ResourceProviderUpdateConflict if the provider's generation
doesn't match the generation in the cache. Callers may choose
to retrieve the provider and its associations afresh and
@ -1028,7 +1039,8 @@ class SchedulerReportClient(object):
# that method doesn't return content, and we need to update the cached
# provider tree with the new generation.
traits = list(traits) if traits else []
generation = self._provider_tree.data(rp_uuid).generation
if generation is None:
generation = self._provider_tree.data(rp_uuid).generation
payload = {
'resource_provider_generation': generation,
'traits': traits,

View File

@ -109,9 +109,7 @@ class TestComputeTaskNotificationSample(
# NoValidHost asynchronously.
self.admin_api.post_server_action(server['id'], {'migrate': None})
self._wait_for_notification('compute_task.migrate_server.error')
# 0. scheduler.select_destinations.start
# 1. compute_task.migrate_server.error
self.assertEqual(2, len(fake_notifier.VERSIONED_NOTIFICATIONS),
self.assertEqual(1, len(fake_notifier.VERSIONED_NOTIFICATIONS),
fake_notifier.VERSIONED_NOTIFICATIONS)
self._verify_notification(
'compute_task-migrate_server-error',
@ -121,8 +119,9 @@ class TestComputeTaskNotificationSample(
'request_spec.security_groups': [],
'request_spec.numa_topology.instance_uuid': server['id'],
'request_spec.pci_requests.instance_uuid': server['id'],
'reason.exception_message': 'No valid host was found. ',
'reason.function_name': self.ANY,
'reason.module_name': self.ANY,
'reason.traceback': self.ANY
},
actual=fake_notifier.VERSIONED_NOTIFICATIONS[1])
actual=fake_notifier.VERSIONED_NOTIFICATIONS[0])

View File

@ -58,16 +58,9 @@ class TestServices(integrated_helpers._IntegratedTestBase):
self._create_server(networks=[])
self._disable_compute()
# FIXME(gibi): Check that COMPUTE_STATUS_DISABLED is now on the
# compute. Unfortunately it is not true as the compute manager failed
# to update the traits in placement due to a stale provide tree cache.
# It is stale because a server is booted on the compute since the last
# update_available_resource periodic was run.
self.assertIn(
'An error occurred while updating COMPUTE_STATUS_DISABLED trait '
'on compute node resource provider',
self.stdlog.logger.output)
self.assertFalse(self._has_disabled_trait())
# Check that COMPUTE_STATUS_DISABLED is now on the compute.
self.assertTrue(self._has_disabled_trait())
# This would be the expected behavior
#
@ -82,3 +75,8 @@ class TestServices(integrated_helpers._IntegratedTestBase):
self._enable_compute()
# Check that COMPUTE_STATUS_DISABLED is removed from the compute
self.assertFalse(self._has_disabled_trait())
self.assertNotIn(
'An error occurred while updating COMPUTE_STATUS_DISABLED trait '
'on compute node resource provider',
self.stdlog.logger.output,
"This is probably bug 1886418.")

View File

@ -98,7 +98,8 @@ class FakeCompute(object):
def _get_provider_traits(self, context, rp_uuid):
return mock.Mock(traits=self.provider_traits[rp_uuid])
def _set_traits_for_provider(self, context, rp_uuid, traits):
def _set_traits_for_provider(
self, context, rp_uuid, traits, generation=None):
self.provider_traits[rp_uuid] = traits
def _event_waiter(self):

View File

@ -2938,6 +2938,45 @@ class TestTraits(SchedulerReportClientTestCase):
self.assertEqual(
1, self.client._provider_tree.data(uuids.rp).generation)
def test_set_traits_for_provider_with_generation(self):
traits = ['HW_NIC_OFFLOAD_UCS', 'HW_NIC_OFFLOAD_RDMA']
# Make _ensure_traits succeed without PUTting
get_mock = mock.Mock(status_code=200)
get_mock.json.return_value = {'traits': traits}
self.ks_adap_mock.get.return_value = get_mock
# Prime the provider tree cache
self.client._provider_tree.new_root('rp', uuids.rp, generation=0)
# Mock the /rp/{u}/traits PUT to succeed
put_mock = mock.Mock(status_code=200)
put_mock.json.return_value = {'traits': traits,
'resource_provider_generation': 2}
self.ks_adap_mock.put.return_value = put_mock
# Invoke
self.client.set_traits_for_provider(
self.context, uuids.rp, traits, generation=1)
# Verify API calls
self.ks_adap_mock.get.assert_called_once_with(
'/traits?name=in:' + ','.join(traits),
global_request_id=self.context.global_id,
**self.trait_api_kwargs)
self.ks_adap_mock.put.assert_called_once_with(
'/resource_providers/%s/traits' % uuids.rp,
json={'traits': traits, 'resource_provider_generation': 1},
global_request_id=self.context.global_id,
**self.trait_api_kwargs)
# And ensure the provider tree cache was updated appropriately
self.assertFalse(
self.client._provider_tree.have_traits_changed(uuids.rp, traits))
# Validate the generation
self.assertEqual(
2, self.client._provider_tree.data(uuids.rp).generation)
def test_set_traits_for_provider_fail(self):
traits = ['HW_NIC_OFFLOAD_UCS', 'HW_NIC_OFFLOAD_RDMA']
get_mock = mock.Mock()