Make get_allocations_for_resource_provider raise

In preparation for reshaper work, and because it's The Right Thing To
Do, this patch makes the report client's
get_allocations_for_resource_provider method stop acting like something
didn't go wrong when its placement API request fails. We remove the
@safe_connect decorator so it raises (a subclass of) ksa's
ClientException when something goes wrong with the API communication.
And the method now raises a new
ResourceProviderAllocationRetrievalFailed exception on non-200.

In the spirit of the aggregate and trait retrieval methods, it now
returns a namedtuple containing the allocation information.

Unit tests, which were entirely absent, are added for the method.

The resource tracker's _remove_deleted_instances_allocations, which is
get_allocations_for_resource_provider's only consumer (for now, until
reshaper work starts using it), is refactored to behave the same way it
used to, which is to no-op if the placement API request fails. However,
a) we take an earlier short-circuit out of the method, which saves a
little work copying context stuff; and b) we now emit a warning log
message if the no-op is due to the newly-raised exceptions.

Change-Id: I020e7dc47efc79f8907b7bfb753ec779a8da69a1
This commit is contained in:
Eric Fried
2018-07-21 12:07:28 -05:00
parent d13e5a5011
commit f534495a42
5 changed files with 115 additions and 25 deletions

View File

@@ -22,6 +22,7 @@ import collections
import copy
import retrying
from keystoneauth1 import exceptions as ks_exc
from oslo_log import log as logging
from oslo_serialization import jsonutils
@@ -1271,8 +1272,20 @@ class ResourceTracker(object):
# happen according to the normal flow of events where the scheduler
# always creates allocations for an instance
known_instances = set(self.tracked_instances.keys())
allocations = self.reportclient.get_allocations_for_resource_provider(
context, cn.uuid) or {}
try:
# pai: report.ProviderAllocInfo namedtuple
pai = self.reportclient.get_allocations_for_resource_provider(
context, cn.uuid)
except (exception.ResourceProviderAllocationRetrievalFailed,
ks_exc.ClientException) as e:
LOG.error("Skipping removal of allocations for deleted instances: "
"%s", e)
return
allocations = pai.allocations
if not allocations:
# The main loop below would short-circuit anyway, but this saves us
# the (potentially expensive) context.elevated construction below.
return
read_deleted_context = context.elevated(read_deleted='yes')
for consumer_uuid, alloc in allocations.items():
if consumer_uuid in known_instances:

View File

@@ -2368,3 +2368,8 @@ class ZVMConnectorError(ZVMDriverException):
class NoResourceClass(NovaException):
    """Raised when no resource class could be found for an Ironic node."""
    msg_fmt = _("Resource class not found for Ironic node %(node)s.")
class ResourceProviderAllocationRetrievalFailed(NovaException):
    """Raised when a placement API request for a resource provider's
    allocations fails.
    """
    msg_fmt = _("Failed to retrieve allocations for resource provider "
                "%(rp_uuid)s: %(error)s")

View File

@@ -57,6 +57,8 @@ POST_ALLOCATIONS_API_VERSION = '1.13'
AggInfo = collections.namedtuple('AggInfo', ['aggregates', 'generation'])
TraitInfo = collections.namedtuple('TraitInfo', ['traits', 'generation'])
# Return type of get_allocations_for_resource_provider: carries the
# 'allocations' dict from the placement API response.
ProviderAllocInfo = collections.namedtuple(
    'ProviderAllocInfo', ['allocations'])
def warn_limit(self, msg):
@@ -1933,14 +1935,25 @@ class SchedulerReportClient(object):
else:
self.delete_allocation_for_instance(context, instance.uuid)
def get_allocations_for_resource_provider(self, context, rp_uuid):
    """Retrieves the allocations for a specific provider.

    :param context: The nova.context.RequestContext auth context
    :param rp_uuid: The UUID of the provider.
    :return: ProviderAllocInfo namedtuple.
    :raises: keystoneauth1.exceptions.base.ClientException on failure to
             communicate with the placement API
    :raises: ResourceProviderAllocationRetrievalFailed if the placement API
             call fails.
    """
    url = '/resource_providers/%s/allocations' % rp_uuid
    resp = self.get(url, global_request_id=context.global_id)
    if not resp:
        # Surface the failure to the caller instead of silently returning
        # an empty result (the old @safe_connect/return-{} behavior).
        raise exception.ResourceProviderAllocationRetrievalFailed(
            rp_uuid=rp_uuid, error=resp.text)
    data = resp.json()
    return ProviderAllocInfo(allocations=data['allocations'])
def delete_resource_provider(self, context, compute_node, cascade=False):
"""Deletes the ResourceProvider record for the compute_node.

View File

@@ -13,6 +13,7 @@
import copy
import datetime
from keystoneauth1 import exceptions as ks_exc
import mock
from oslo_config import cfg
from oslo_utils import timeutils
@@ -33,6 +34,7 @@ from nova.objects import fields as obj_fields
from nova.objects import pci_device
from nova.pci import manager as pci_manager
from nova import rc_fields
from nova.scheduler.client import report
from nova import test
from nova.tests.unit import fake_notifier
from nova.tests.unit.objects import test_pci_device as fake_pci_device
@@ -2799,7 +2801,8 @@ class TestUpdateUsageFromInstance(BaseTestCase):
mock_inst_get):
rc = self.rt.reportclient
self.rt.tracked_instances = {}
allocs = {uuids.deleted: "fake_deleted_instance"}
allocs = report.ProviderAllocInfo(
allocations={uuids.deleted: "fake_deleted_instance"})
rc.get_allocations_for_resource_provider = mock.MagicMock(
return_value=allocs)
rc.delete_allocation_for_instance = mock.MagicMock()
@@ -2824,7 +2827,8 @@ class TestUpdateUsageFromInstance(BaseTestCase):
mock_inst_get):
rc = self.rt.reportclient
self.rt.tracked_instances = {}
allocs = {uuids.deleted: "fake_deleted_instance"}
allocs = report.ProviderAllocInfo(
allocations={uuids.deleted: "fake_deleted_instance"})
rc.get_allocations_for_resource_provider = mock.MagicMock(
return_value=allocs)
rc.delete_allocation_for_instance = mock.MagicMock()
@@ -2843,8 +2847,9 @@ class TestUpdateUsageFromInstance(BaseTestCase):
mock_inst_get):
rc = self.rt.reportclient
self.rt.tracked_instances = {}
allocs = {uuids.deleted: "fake_deleted_instance",
uuids.migration: "fake_migration"}
allocs = report.ProviderAllocInfo(
allocations={uuids.deleted: "fake_deleted_instance",
uuids.migration: "fake_migration"})
mig = objects.Migration(uuid=uuids.migration)
rc.get_allocations_for_resource_provider = mock.MagicMock(
return_value=allocs)
@@ -2867,7 +2872,8 @@ class TestUpdateUsageFromInstance(BaseTestCase):
mock_inst_get):
rc = self.rt.reportclient
self.rt.tracked_instances = {}
allocs = {uuids.scheduled: "fake_scheduled_instance"}
allocs = report.ProviderAllocInfo(
allocations={uuids.scheduled: "fake_scheduled_instance"})
rc.get_allocations_for_resource_provider = mock.MagicMock(
return_value=allocs)
rc.delete_allocation_for_instance = mock.MagicMock()
@@ -2916,15 +2922,17 @@ class TestUpdateUsageFromInstance(BaseTestCase):
rc = self.rt.reportclient
self.rt.tracked_instances = {
uuids.known: objects.Instance(uuid=uuids.known)}
allocs = {
uuids.known: {
'resources': {
'VCPU': 1,
'MEMORY_MB': 2048,
'DISK_GB': 20
allocs = report.ProviderAllocInfo(
allocations={
uuids.known: {
'resources': {
'VCPU': 1,
'MEMORY_MB': 2048,
'DISK_GB': 20
}
}
}
}
)
rc.get_allocations_for_resource_provider = mock.MagicMock(
return_value=allocs)
rc.delete_allocation_for_instance = mock.MagicMock()
@@ -2951,15 +2959,17 @@ class TestUpdateUsageFromInstance(BaseTestCase):
# No tracked instances on this node.
self.rt.tracked_instances = {}
# But there is an allocation for an instance on this node.
allocs = {
instance.uuid: {
'resources': {
'VCPU': 1,
'MEMORY_MB': 2048,
'DISK_GB': 20
allocs = report.ProviderAllocInfo(
allocations={
instance.uuid: {
'resources': {
'VCPU': 1,
'MEMORY_MB': 2048,
'DISK_GB': 20
}
}
}
}
)
rc.get_allocations_for_resource_provider = mock.MagicMock(
return_value=allocs)
rc.delete_allocation_for_instance = mock.MagicMock()
@@ -2974,6 +2984,33 @@ class TestUpdateUsageFromInstance(BaseTestCase):
# and figure things out, this should actually be an error.
rc.delete_allocation_for_instance.assert_not_called()
def test_remove_deleted_instances_allocations_retrieval_fail(self):
    """When the report client errs or otherwise retrieves no allocations,
    _remove_deleted_instances_allocations gracefully no-ops.
    """
    cn = self.rt.compute_nodes[_NODENAME]
    rc = self.rt.reportclient
    # We'll test three different ways get_allocations_for_resource_provider
    # can cause us to no-op.
    side_effects = (
        # Actual placement error
        exc.ResourceProviderAllocationRetrievalFailed(
            rp_uuid='rp_uuid', error='error'),
        # API communication failure
        ks_exc.ClientException,
        # Legitimately no allocations
        report.ProviderAllocInfo(allocations={}),
    )
    rc.get_allocations_for_resource_provider = mock.Mock(
        side_effect=side_effects)
    for _ in side_effects:
        # If we didn't no op, this would blow up at 'ctx'.elevated()
        self.rt._remove_deleted_instances_allocations(
            'ctx', cn, [], self.rt.tracked_instances)
        rc.get_allocations_for_resource_provider.assert_called_once_with(
            'ctx', cn.uuid)
        rc.get_allocations_for_resource_provider.reset_mock()
def test_delete_allocation_for_shelve_offloaded_instance(self):
instance = _INSTANCE_FIXTURES[0].obj_clone()
instance.uuid = uuids.inst0

View File

@@ -3406,6 +3406,28 @@ class TestAllocations(SchedulerReportClientTestCase):
self.assertEqual(0, mock_log.info.call_count)
self.assertEqual(1, mock_log.error.call_count)
@mock.patch("nova.scheduler.client.report.SchedulerReportClient.get")
def test_get_allocations_for_resource_provider(self, mock_get):
    # Happy path: a 200 response yields a ProviderAllocInfo namedtuple
    # carrying the 'allocations' payload from the response body.
    mock_get.return_value = fake_requests.FakeResponse(
        200, content=jsonutils.dumps(
            {'allocations': 'fake', 'resource_provider_generation': 42}))
    ret = self.client.get_allocations_for_resource_provider(
        self.context, 'rpuuid')
    self.assertEqual('fake', ret.allocations)
    mock_get.assert_called_once_with(
        '/resource_providers/rpuuid/allocations',
        global_request_id=self.context.global_id)
@mock.patch("nova.scheduler.client.report.SchedulerReportClient.get")
def test_get_allocations_for_resource_provider_fail(self, mock_get):
    # A non-2xx response from placement must surface as
    # ResourceProviderAllocationRetrievalFailed rather than being
    # swallowed into an empty result.
    mock_get.return_value = fake_requests.FakeResponse(400, content="ouch")
    self.assertRaises(exception.ResourceProviderAllocationRetrievalFailed,
                      self.client.get_allocations_for_resource_provider,
                      self.context, 'rpuuid')
    mock_get.assert_called_once_with(
        '/resource_providers/rpuuid/allocations',
        global_request_id=self.context.global_id)
class TestResourceClass(SchedulerReportClientTestCase):
def setUp(self):