SchedulerReportClient.set_traits_for_provider

Introduce SchedulerReportClient.set_traits_for_provider, which accepts a
provider UUID and list of traits to push back to the placement API.  On
success, the local ProviderTree cache is updated appropriately.  On
failure, an exception is raised.  If the failure was a conflict (409),
the exception is a subclass of PlacementAPIConflict (see below) so that
the caller may decide to refresh and retry.

This change also introduces the _ensure_traits helper, which makes sure
a given set of traits exists in the placement service.

Per design [1], we want consumers of SchedulerReportClient to be able to
detect placement API conflict errors (HTTP 409), as distinct from other
errors.  Such consumers (the resource tracker) may respond to conflicts
by redriving the operation in progress, whereas other exceptions will be
allowed to bubble up.  PlacementAPIConflict will be the (base) class of
exceptions raised in response to placement 409s for which we want to
allow this kind of behavior.

[1] http://eavesdrop.openstack.org/meetings/nova_scheduler/2018/nova_scheduler.2018-01-08-14.00.log.html#l-251

Change-Id: I9d9b4a5cb5f6c5d255c00e22923728892c37383f
blueprint: nested-resource-providers
blueprint: ironic-driver-traits
This commit is contained in:
Eric Fried 2018-01-16 11:11:47 -06:00
parent 281b38e0d6
commit 60b2525817
3 changed files with 274 additions and 2 deletions

View File

@ -2115,6 +2115,27 @@ class ResourceProviderCreationFailed(NovaException):
msg_fmt = _("Failed to create resource provider %(name)s")
class ResourceProviderUpdateFailed(NovaException):
msg_fmt = _("Failed to update resource provider via URL %(url)s: "
"%(error)s")
class PlacementAPIConflict(NovaException):
"""Any 409 error from placement APIs should use (a subclass of) this
exception.
"""
msg_fmt = _("A conflict was encountered attempting to invoke the "
"placement API at URL %(url)s: %(error)s")
class ResourceProviderUpdateConflict(PlacementAPIConflict):
"""A 409 caused by generation mismatch from attempting to update an
existing provider record or its associated data (aggregates, traits, etc.).
"""
msg_fmt = _("A conflict was encountered attempting to update resource "
"provider %(uuid)s (generation %(generation)d): %(error)s")
class InventoryWithResourceClassNotFound(NotFound):
msg_fmt = _("No inventory of class %(resource_class)s found.")
@ -2231,6 +2252,14 @@ class TraitInUse(Invalid):
msg_fmt = _("The trait %(name)s is in use by a resource provider.")
class TraitRetrievalFailed(NovaException):
msg_fmt = _("Failed to retrieve traits from the placement API: %(error)s")
class TraitCreationFailed(NovaException):
msg_fmt = _("Failed to create trait %(name)s: %(error)s")
class CannotMigrateWithTargetHost(NovaException):
msg_fmt = _("Cannot migrate with target host. Retry without a host "
"specified.")

View File

@ -967,6 +967,117 @@ class SchedulerReportClient(object):
else:
self._delete_inventory(rp_uuid)
@safe_connect
def _ensure_traits(self, traits):
"""Make sure all specified traits exist in the placement service.
:param traits: Iterable of trait strings to ensure exist.
:raises: TraitCreationFailed if traits contains a trait that did not
exist in placement, and couldn't be created. When this
exception is raised, it is possible that *some* of the
requested traits were created.
:raises: TraitRetrievalFailed if the initial query of existing traits
was unsuccessful. In this scenario, it is guaranteed that
no traits were created.
"""
if not traits:
return
# Query for all the requested traits. Whichever ones we *don't* get
# back, we need to create.
# NOTE(efried): We don't attempt to filter based on our local idea of
# standard traits, which may not be in sync with what the placement
# service knows. If the caller tries to ensure a nonexistent
# "standard" trait, they deserve the TraitCreationFailed exception
# they'll get.
resp = self.get('/traits?name=in:' + ','.join(traits), version='1.6')
if resp.status_code == 200:
traits_to_create = set(traits) - set(resp.json()['traits'])
# Might be neat to have a batch create. But creating multiple
# traits will generally happen once, at initial startup, if at all.
for trait in traits_to_create:
resp = self.put('/traits/' + trait, None, version='1.6')
if not resp:
raise exception.TraitCreationFailed(name=trait,
error=resp.text)
return
# The initial GET failed
msg = ("[%(placement_req_id)s] Failed to retrieve the list of traits. "
"Got %(status_code)d: %(err_text)s")
args = {
'placement_req_id': get_placement_request_id(resp),
'status_code': resp.status_code,
'err_text': resp.text,
}
LOG.error(msg, args)
raise exception.TraitRetrievalFailed(error=resp.text)
@safe_connect
def set_traits_for_provider(self, rp_uuid, traits):
"""Replace a provider's traits with those specified.
The provider must exist - this method does not attempt to create it.
:param rp_uuid: The UUID of the provider whose traits are to be updated
:param traits: Iterable of traits to set on the provider
:raises: ResourceProviderUpdateConflict if the provider's generation
doesn't match the generation in the cache. Callers may choose
to retrieve the provider and its associations afresh and
redrive this operation.
:raises: ResourceProviderUpdateFailed on any other placement API
failure.
:raises: TraitCreationFailed if traits contains a trait that did not
exist in placement, and couldn't be created.
:raises: TraitRetrievalFailed if the initial query of existing traits
was unsuccessful.
"""
# If not different from what we've got, short out
if not self._provider_tree.have_traits_changed(rp_uuid, traits):
return
self._ensure_traits(traits)
url = '/resource_providers/%s/traits' % rp_uuid
# NOTE(efried): Don't use the DELETE API when traits is empty, because
# that guy doesn't return content, and we need to update the cached
# provider tree with the new generation.
traits = traits or []
generation = self._provider_tree.data(rp_uuid).generation
payload = {
'resource_provider_generation': generation,
'traits': traits,
}
resp = self.put(url, payload, version='1.6')
if resp.status_code == 200:
json = resp.json()
self._provider_tree.update_traits(
rp_uuid, json['traits'],
generation=json['resource_provider_generation'])
return
# Some error occurred; log it
msg = ("[%(placement_req_id)s] Failed to update traits to "
"[%(traits)s] for resource provider with UUID %(uuid)s. Got "
"%(status_code)d: %(err_text)s")
args = {
'placement_req_id': get_placement_request_id(resp),
'uuid': rp_uuid,
'traits': ','.join(traits),
'status_code': resp.status_code,
'err_text': resp.text,
}
LOG.error(msg, args)
# If a conflict, raise special conflict exception
if resp.status_code == 409:
raise exception.ResourceProviderUpdateConflict(
uuid=rp_uuid, generation=generation, error=resp.text)
# Otherwise, raise generic exception
raise exception.ResourceProviderUpdateFailed(url=url, error=resp.text)
@safe_connect
def _ensure_resource_class(self, name):
"""Make sure a custom resource class exists.

View File

@ -15,6 +15,7 @@ import time
from keystoneauth1 import exceptions as ks_exc
import mock
import requests
import six
from six.moves.urllib import parse
@ -1826,6 +1827,8 @@ class TestAggregates(SchedulerReportClientTestCase):
class TestTraits(SchedulerReportClientTestCase):
trait_api_kwargs = {'raise_exc': False, 'microversion': '1.6'}
def test_get_provider_traits_found(self):
uuid = uuids.compute_node
resp_mock = mock.Mock(status_code=200)
@ -1840,7 +1843,7 @@ class TestTraits(SchedulerReportClientTestCase):
expected_url = '/resource_providers/' + uuid + '/traits'
self.ks_adap_mock.get.assert_called_once_with(
expected_url, raise_exc=False, microversion='1.6')
expected_url, **self.trait_api_kwargs)
self.assertEqual(set(traits), result)
@mock.patch.object(report.LOG, 'warning')
@ -1877,12 +1880,141 @@ class TestTraits(SchedulerReportClientTestCase):
expected_url = '/resource_providers/' + uuid + '/traits'
self.ks_adap_mock.get.assert_called_once_with(
expected_url, raise_exc=False, microversion='1.6')
expected_url, **self.trait_api_kwargs)
self.assertTrue(log_mock.called)
self.assertEqual(uuids.request_id,
log_mock.call_args[0][1]['placement_req_id'])
self.assertIsNone(result)
def test_ensure_traits(self):
"""Successful paths, various permutations of traits existing or needing
to be created.
"""
standard_traits = ['HW_NIC_OFFLOAD_UCS', 'HW_NIC_OFFLOAD_RDMA']
custom_traits = ['CUSTOM_GOLD', 'CUSTOM_SILVER']
all_traits = standard_traits + custom_traits
get_mock = mock.Mock(status_code=200)
self.ks_adap_mock.get.return_value = get_mock
# Request all traits; custom traits need to be created
get_mock.json.return_value = {'traits': standard_traits}
self.client._ensure_traits(all_traits)
self.ks_adap_mock.get.assert_called_once_with(
'/traits?name=in:' + ','.join(all_traits), **self.trait_api_kwargs)
self.ks_adap_mock.put.assert_has_calls(
[mock.call('/traits/' + trait, **self.trait_api_kwargs)
for trait in custom_traits], any_order=True)
self.ks_adap_mock.reset_mock()
# Request standard traits; no traits need to be created
get_mock.json.return_value = {'traits': standard_traits}
self.client._ensure_traits(standard_traits)
self.ks_adap_mock.get.assert_called_once_with(
'/traits?name=in:' + ','.join(standard_traits),
**self.trait_api_kwargs)
self.ks_adap_mock.put.assert_not_called()
self.ks_adap_mock.reset_mock()
# Request no traits - short circuit
self.client._ensure_traits(None)
self.client._ensure_traits([])
self.ks_adap_mock.get.assert_not_called()
self.ks_adap_mock.put.assert_not_called()
def test_ensure_traits_fail_retrieval(self):
self.ks_adap_mock.get.return_value = mock.Mock(status_code=400)
self.assertRaises(exception.TraitRetrievalFailed,
self.client._ensure_traits, ['FOO'])
self.ks_adap_mock.get.assert_called_once_with(
'/traits?name=in:FOO', **self.trait_api_kwargs)
self.ks_adap_mock.put.assert_not_called()
def test_ensure_traits_fail_creation(self):
get_mock = mock.Mock(status_code=200)
get_mock.json.return_value = {'traits': []}
self.ks_adap_mock.get.return_value = get_mock
put_mock = requests.Response()
put_mock.status_code = 400
self.ks_adap_mock.put.return_value = put_mock
self.assertRaises(exception.TraitCreationFailed,
self.client._ensure_traits, ['FOO'])
self.ks_adap_mock.get.assert_called_once_with(
'/traits?name=in:FOO', **self.trait_api_kwargs)
self.ks_adap_mock.put.assert_called_once_with(
'/traits/FOO', **self.trait_api_kwargs)
def test_set_traits_for_provider(self):
traits = ['HW_NIC_OFFLOAD_UCS', 'HW_NIC_OFFLOAD_RDMA']
# Make _ensure_traits succeed without PUTting
get_mock = mock.Mock(status_code=200)
get_mock.json.return_value = {'traits': traits}
self.ks_adap_mock.get.return_value = get_mock
# Prime the provider tree cache
self.client._provider_tree.new_root('rp', uuids.rp, 0)
# Mock the /rp/{u}/traits PUT to succeed
put_mock = mock.Mock(status_code=200)
put_mock.json.return_value = {'traits': traits,
'resource_provider_generation': 1}
self.ks_adap_mock.put.return_value = put_mock
# Invoke
self.client.set_traits_for_provider(uuids.rp, traits)
# Verify API calls
self.ks_adap_mock.get.assert_called_once_with(
'/traits?name=in:' + ','.join(traits), **self.trait_api_kwargs)
self.ks_adap_mock.put.assert_called_once_with(
'/resource_providers/%s/traits' % uuids.rp,
json={'traits': traits, 'resource_provider_generation': 0},
**self.trait_api_kwargs)
# And ensure the provider tree cache was updated appropriately
self.assertFalse(
self.client._provider_tree.have_traits_changed(uuids.rp, traits))
# Validate the generation
self.assertEqual(
1, self.client._provider_tree.data(uuids.rp).generation)
def test_set_traits_for_provider_fail(self):
traits = ['HW_NIC_OFFLOAD_UCS', 'HW_NIC_OFFLOAD_RDMA']
get_mock = mock.Mock()
self.ks_adap_mock.get.return_value = get_mock
# Prime the provider tree cache
self.client._provider_tree.new_root('rp', uuids.rp, 0)
# _ensure_traits exception bubbles up
get_mock.status_code = 400
self.assertRaises(
exception.TraitRetrievalFailed,
self.client.set_traits_for_provider, uuids.rp, traits)
self.ks_adap_mock.put.assert_not_called()
get_mock.status_code = 200
get_mock.json.return_value = {'traits': traits}
# Conflict
self.ks_adap_mock.put.return_value = mock.Mock(status_code=409)
self.assertRaises(
exception.ResourceProviderUpdateConflict,
self.client.set_traits_for_provider, uuids.rp, traits)
# Other error
self.ks_adap_mock.put.return_value = mock.Mock(status_code=503)
self.assertRaises(
exception.ResourceProviderUpdateFailed,
self.client.set_traits_for_provider, uuids.rp, traits)
class TestAssociations(SchedulerReportClientTestCase):
@mock.patch('nova.scheduler.client.report.SchedulerReportClient.'