Consumer gen support for put allocations

The placement API version 1.28 introduced consumer generation as a way
to make updating allocations safe even when it is done from multiple
places.

This patch changes the scheduler report client put_allocations
function to raise AllocationUpdateFailed in case of a consumer
generation conflict. The only direct user of this call is the
nova-manage heal_allocations CLI, which will simply fail to heal the
allocations for that instance.

Blueprint: use-nested-allocation-candidates
Change-Id: Iba230201803ef3d33bccaaf83eb10453eea43f20
Balazs Gibizer 2018-08-14 14:10:17 +02:00
parent 6f1a1f5e8e
commit dfa2e6f221
6 changed files with 114 additions and 34 deletions
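
For readers new to the mechanism: under microversion 1.28 every consumer
carries a generation, and PUT /allocations/{consumer_uuid} must echo the
generation the caller last read (or null for a consumer placement has never
seen). Below is a minimal, self-contained sketch of that compare-and-swap
check; the in-memory store and every name in it are illustrative stand-ins,
not Nova or placement code.

# Minimal sketch of placement's consumer generation check (microversion
# 1.28). The in-memory "store" is a stand-in for the placement service.

class GenerationConflict(Exception):
    """The caller's view of the consumer is stale (or wrongly empty)."""


_consumers = {}  # consumer_uuid -> {'generation': int, 'allocations': dict}


def put_allocations(consumer_uuid, allocations, consumer_generation):
    """Replace a consumer's allocations, guarded by its generation.

    consumer_generation must be None for a consumer placement has never
    seen, or the generation the caller last read. Anything else means a
    concurrent writer got there first, so the update is rejected instead
    of silently overwriting the other write.
    """
    current = _consumers.get(consumer_uuid)
    expected = None if current is None else current['generation']
    if consumer_generation != expected:
        raise GenerationConflict(
            'consumer generation conflict: expected %s but got %s'
            % (expected, consumer_generation))
    _consumers[consumer_uuid] = {
        'generation': 1 if expected is None else expected + 1,
        'allocations': allocations,
    }


put_allocations('inst-1', {'rp-1': {'MEMORY_MB': 512}}, None)   # gen -> 1
put_allocations('inst-1', {'rp-1': {'MEMORY_MB': 1024}}, 1)     # gen -> 2
try:
    put_allocations('inst-1', {'rp-1': {'MEMORY_MB': 256}}, 1)  # stale read
except GenerationConflict as exc:
    print(exc)  # consumer generation conflict: expected 2 but got 1

The real service signals the conflict as HTTP 409 with the error code
placement.concurrent_update rather than raising, but the idea is the same.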

View File

@@ -1815,7 +1815,7 @@ class PlacementCommands(object):
                     ctxt, instance.uuid)
             except ks_exc.ClientException as e:
                 raise exception.AllocationUpdateFailed(
-                    instance=instance.uuid,
+                    consumer_uuid=instance.uuid,
                     error=_("Allocation retrieval failed: %s") % e)
             except exception.ConsumerAllocationRetrievalFailed as e:
                 output(_("Allocation retrieval failed: %s") % e)
@@ -1858,7 +1858,7 @@ class PlacementCommands(object):
                 return True
             else:
                 raise exception.AllocationUpdateFailed(
-                    instance=instance.uuid, error=resp.text)
+                    consumer_uuid=instance.uuid, error=resp.text)
 
         # This instance doesn't have allocations so we need to find
         # its compute node resource provider.
@@ -1871,7 +1871,8 @@ class PlacementCommands(object):
             instance, instance.flavor)
         if placement.put_allocations(
                 ctxt, node_uuid, instance.uuid, resources,
-                instance.project_id, instance.user_id):
+                instance.project_id, instance.user_id,
+                consumer_generation=None):
             output(_('Successfully created allocations for '
                      'instance %(instance)s against resource '
                      'provider %(provider)s.') %
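
heal_allocations only reaches this call for an instance that currently has
no allocations, so it passes consumer_generation=None, which under 1.28
asserts "this consumer is new"; if anything created allocations
concurrently, placement rejects the PUT and the new AllocationUpdateFailed
propagates instead of overwriting the other writer. A hedged sketch of that
caller-side pattern (the helper name, exception stand-in, and fixed resource
amounts are hypothetical, not Nova code):

class AllocationUpdateFailed(Exception):
    """Stand-in for nova.exception.AllocationUpdateFailed."""


def heal_instance_allocations(placement, ctxt, node_uuid, instance):
    """Sketch of the heal path: create allocations for a bare instance.

    consumer_generation=None tells placement the consumer is expected to
    be brand new; a concurrent writer turns this PUT into a conflict
    rather than a silent overwrite.
    """
    resources = {'VCPU': 2, 'MEMORY_MB': 2048}  # illustrative amounts
    try:
        return placement.put_allocations(
            ctxt, node_uuid, instance.uuid, resources,
            instance.project_id, instance.user_id,
            consumer_generation=None)
    except AllocationUpdateFailed:
        # Someone else touched this consumer between our check and the
        # PUT; report the instance as not healed instead of clobbering.
        return False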

View File

@@ -2318,7 +2318,7 @@ class AllocationCreateFailed(NovaException):
 
 class AllocationUpdateFailed(NovaException):
-    msg_fmt = _('Failed to update allocations for instance %(instance)s. '
+    msg_fmt = _('Failed to update allocations for consumer %(consumer_uuid)s. '
                 'Error: %(error)s')

View File

@@ -1994,7 +1994,7 @@ class SchedulerReportClient(object):
     @safe_connect
     @retries
     def put_allocations(self, context, rp_uuid, consumer_uuid, alloc_data,
-                        project_id, user_id):
+                        project_id, user_id, consumer_generation):
         """Creates allocation records for the supplied instance UUID against
         the supplied resource provider.
@@ -2009,29 +2009,38 @@ class SchedulerReportClient(object):
                            consume.
         :param project_id: The project_id associated with the allocations.
         :param user_id: The user_id associated with the allocations.
+        :param consumer_generation: The current generation of the consumer
         :returns: True if the allocations were created, False otherwise.
         :raises: Retry if the operation should be retried due to a concurrent
-                 update.
+                 resource provider update.
+        :raises: AllocationUpdateFailed if placement returns a consumer
+                 generation conflict
         """
         payload = {
-            'allocations': [
-                {
-                    'resource_provider': {
-                        'uuid': rp_uuid,
-                    },
-                    'resources': alloc_data,
-                },
-            ],
+            'allocations': {
+                rp_uuid: {'resources': alloc_data},
+            },
             'project_id': project_id,
             'user_id': user_id,
+            'consumer_generation': consumer_generation
         }
         url = '/allocations/%s' % consumer_uuid
-        r = self.put(url, payload, version='1.8',
+        r = self.put(url, payload, version=CONSUMER_GENERATION_VERSION,
                      global_request_id=context.global_id)
         if r.status_code != 204:
+            err = r.json()['errors'][0]
             # NOTE(jaypipes): Yes, it sucks doing string comparison like this
             # but we have no error codes, only error messages.
-            if 'concurrently updated' in r.text:
+            # TODO(gibi): Use more granular error codes when available
+            if err['code'] == 'placement.concurrent_update':
+                if 'consumer generation conflict' in err['detail']:
+                    raise exception.AllocationUpdateFailed(
+                        consumer_uuid=consumer_uuid, error=err['detail'])
+                # this is not a consumer generation conflict so it can only be
+                # a resource provider generation conflict. The caller does not
+                # provide resource provider generation so this is just a
+                # placement internal race. We can blindly retry locally.
                 reason = ('another process changed the resource providers '
                           'involved in our attempt to put allocations for '
                           'consumer %s' % consumer_uuid)
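
Restated outside the client for clarity, the new 409 handling splits
placement.concurrent_update into two cases: a consumer generation conflict
is surfaced to the caller, while a resource provider generation race is
retried locally (the @retries decorator drives the retry). The names below
(handle_put_response and the local exception classes) are stand-ins for
illustration, not the Nova API:

import json


class AllocationUpdateFailed(Exception):
    """Stand-in for nova.exception.AllocationUpdateFailed."""


class Retry(Exception):
    """Stand-in for the retry signal consumed by the @retries decorator."""


def handle_put_response(status_code, body, consumer_uuid):
    """Decide what to do with a PUT /allocations/{consumer} response.

    204 means success; a consumer generation conflict is fatal to the
    caller; any other concurrent_update can only be a resource provider
    race, which is safe to retry locally.
    """
    if status_code == 204:
        return True
    err = json.loads(body)['errors'][0]
    if err['code'] == 'placement.concurrent_update':
        if 'consumer generation conflict' in err['detail']:
            raise AllocationUpdateFailed(err['detail'])
        raise Retry('another process changed the resource providers '
                    'involved in our attempt to put allocations for '
                    'consumer %s' % consumer_uuid)
    return False


# Example: the conflict body used by the new unit test further below.
body = json.dumps({'errors': [{'code': 'placement.concurrent_update',
                               'detail': 'consumer generation conflict'}]})
try:
    handle_put_response(409, body, 'inst-1')
except AllocationUpdateFailed as exc:
    print('failed: %s' % exc)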

View File

@@ -225,7 +225,8 @@ class SchedulerReportClientTests(SchedulerReportClientTestBase):
                                                     self.instance.flavor)
             self.client.put_allocations(
                 self.context, self.compute_uuid, self.instance_uuid,
-                alloc_dict, self.instance.project_id, self.instance.user_id)
+                alloc_dict, self.instance.project_id, self.instance.user_id,
+                None)
 
             # Check that allocations were made
             resp = self.client.get('/allocations/%s' % self.instance_uuid)
@@ -724,7 +725,7 @@ class SchedulerReportClientTests(SchedulerReportClientTestBase):
             self.client.put_allocations(
                 self.context, uuids.cn, uuids.consumer,
                 {fields.ResourceClass.SRIOV_NET_VF: 1},
-                uuids.proj, uuids.user))
+                uuids.proj, uuids.user, None))
         # ...and trying to delete the provider's VF inventory
         bad_inv = {
             'CUSTOM_BANDWIDTH': {

View File

@@ -270,10 +270,11 @@ class TestPutAllocations(SchedulerReportClientTestCase):
         resp = self.client.put_allocations(self.context, rp_uuid,
                                            consumer_uuid, data,
                                            mock.sentinel.project_id,
-                                           mock.sentinel.user_id)
+                                           mock.sentinel.user_id,
+                                           mock.sentinel.consumer_generation)
         self.assertTrue(resp)
         mock_put.assert_called_once_with(
-            expected_url, mock.ANY, version='1.8',
+            expected_url, mock.ANY, version='1.28',
             global_request_id=self.context.global_id)
 
     @mock.patch.object(report.LOG, 'warning')
@@ -288,20 +289,48 @@ class TestPutAllocations(SchedulerReportClientTestCase):
         resp = self.client.put_allocations(self.context, rp_uuid,
                                            consumer_uuid, data,
                                            mock.sentinel.project_id,
-                                           mock.sentinel.user_id)
+                                           mock.sentinel.user_id,
+                                           mock.sentinel.consumer_generation)
         self.assertFalse(resp)
         mock_put.assert_called_once_with(
-            expected_url, mock.ANY, version='1.8',
+            expected_url, mock.ANY, version='1.28',
             global_request_id=self.context.global_id)
         log_msg = mock_warn.call_args[0][0]
         self.assertIn("Unable to submit allocation for instance", log_msg)
 
+    @mock.patch.object(report.LOG, 'warning')
+    @mock.patch('nova.scheduler.client.report.SchedulerReportClient.put')
+    def test_put_allocations_fail_due_to_consumer_generation_conflict(
+            self, mock_put, mock_warn):
+        mock_put.return_value = fake_requests.FakeResponse(
+            status_code=409,
+            content=jsonutils.dumps(
+                {'errors': [{'code': 'placement.concurrent_update',
+                             'detail': 'consumer generation conflict'}]}))
+
+        rp_uuid = mock.sentinel.rp
+        consumer_uuid = mock.sentinel.consumer
+        data = {"MEMORY_MB": 1024}
+        expected_url = "/allocations/%s" % consumer_uuid
+        self.assertRaises(exception.AllocationUpdateFailed,
+                          self.client.put_allocations,
+                          self.context, rp_uuid,
+                          consumer_uuid, data,
+                          mock.sentinel.project_id,
+                          mock.sentinel.user_id,
+                          mock.sentinel.consumer_generation)
+
+        mock_put.assert_called_once_with(
+            expected_url, mock.ANY, version='1.28',
+            global_request_id=self.context.global_id)
+
     @mock.patch('nova.scheduler.client.report.SchedulerReportClient.put')
     def test_put_allocations_retries_conflict(self, mock_put):
-        failed = mock.MagicMock()
-        failed.status_code = 409
-        failed.text = "concurrently updated"
+        failed = fake_requests.FakeResponse(
+            status_code=409,
+            content=jsonutils.dumps(
+                {'errors': [{'code': 'placement.concurrent_update',
+                             'detail': ''}]}))
 
         succeeded = mock.MagicMock()
         succeeded.status_code = 204
@@ -315,18 +344,21 @@ class TestPutAllocations(SchedulerReportClientTestCase):
         resp = self.client.put_allocations(self.context, rp_uuid,
                                            consumer_uuid, data,
                                            mock.sentinel.project_id,
-                                           mock.sentinel.user_id)
+                                           mock.sentinel.user_id,
+                                           mock.sentinel.consumer_generation)
         self.assertTrue(resp)
         mock_put.assert_has_calls([
-            mock.call(expected_url, mock.ANY, version='1.8',
+            mock.call(expected_url, mock.ANY, version='1.28',
                       global_request_id=self.context.global_id)] * 2)
 
     @mock.patch('nova.scheduler.client.report.SchedulerReportClient.put')
     def test_put_allocations_retry_gives_up(self, mock_put):
-        failed = mock.MagicMock()
-        failed.status_code = 409
-        failed.text = "concurrently updated"
+        failed = fake_requests.FakeResponse(
+            status_code=409,
+            content=jsonutils.dumps(
+                {'errors': [{'code': 'placement.concurrent_update',
+                             'detail': ''}]}))
 
         mock_put.return_value = failed
@@ -337,10 +369,11 @@ class TestPutAllocations(SchedulerReportClientTestCase):
         resp = self.client.put_allocations(self.context, rp_uuid,
                                            consumer_uuid, data,
                                            mock.sentinel.project_id,
-                                           mock.sentinel.user_id)
+                                           mock.sentinel.user_id,
+                                           mock.sentinel.consumer_generation)
         self.assertFalse(resp)
         mock_put.assert_has_calls([
-            mock.call(expected_url, mock.ANY, version='1.8',
+            mock.call(expected_url, mock.ANY, version='1.28',
                       global_request_id=self.context.global_id)] * 3)
 
     def test_claim_resources_success_with_old_version(self):

View File

@@ -2535,7 +2535,43 @@ class TestNovaManagePlacement(test.NoDBTestCase):
         mock_put_allocations.assert_called_once_with(
             test.MatchType(context.RequestContext), uuidsentinel.node,
             uuidsentinel.instance, mock.sentinel.resources, 'fake-project',
-            'fake-user')
+            'fake-user', consumer_generation=None)
+
+    @mock.patch('nova.objects.CellMappingList.get_all',
+                return_value=objects.CellMappingList(objects=[
+                    objects.CellMapping(name='cell1',
+                                        uuid=uuidsentinel.cell1)]))
+    @mock.patch('nova.objects.InstanceList.get_by_filters',
+                return_value=objects.InstanceList(objects=[
+                    objects.Instance(
+                        uuid=uuidsentinel.instance, host='fake', node='fake',
+                        task_state=None, flavor=objects.Flavor(),
+                        project_id='fake-project', user_id='fake-user')]))
+    @mock.patch('nova.scheduler.client.report.SchedulerReportClient.'
+                'get_allocs_for_consumer', return_value={})
+    @mock.patch('nova.objects.ComputeNode.get_by_host_and_nodename',
+                return_value=objects.ComputeNode(uuid=uuidsentinel.node))
+    @mock.patch('nova.scheduler.utils.resources_from_flavor',
+                return_value=mock.sentinel.resources)
+    @mock.patch('nova.scheduler.client.report.SchedulerReportClient.'
+                'put_allocations',
+                side_effect=exception.AllocationUpdateFailed(
+                    consumer_uuid=uuidsentinel.instance,
+                    error="consumer generation conflict"))
+    def test_heal_allocations_put_allocations_fails_with_consumer_conflict(
+            self, mock_put_allocations, mock_res_from_flavor,
+            mock_get_compute_node, mock_get_allocs, mock_get_instances,
+            mock_get_all_cells):
+        self.assertEqual(3, self.cli.heal_allocations())
+        self.assertIn('Failed to update allocations for consumer',
+                      self.output.getvalue())
+        instance = mock_get_instances.return_value[0]
+        mock_res_from_flavor.assert_called_once_with(
+            instance, instance.flavor)
+
+        mock_put_allocations.assert_called_once_with(
+            test.MatchType(context.RequestContext), uuidsentinel.node,
+            uuidsentinel.instance, mock.sentinel.resources, 'fake-project',
+            'fake-user', consumer_generation=None)
+
     @mock.patch('nova.objects.CellMappingList.get_all',
                 new=mock.Mock(return_value=objects.CellMappingList(objects=[