Make get_allocations_for_resource_provider raise

In preparation for reshaper work, and because it's The Right Thing To
Do, this patch makes the report client's
get_allocations_for_resource_provider method stop acting like something
didn't go wrong when its placement API request fails. We remove the
@safe_connect decorator so it raises (a subclass of) ksa's
ClientException when something goes wrong with the API communication.
And the method now raises a new
ResourceProviderAllocationRetrievalFailed exception on non-200.

In the spirit of the aggregate and trait retrieval methods, it now
returns a namedtuple containing the allocation information.

Unit tests, which were entirely absent, are added for the method.

The resource tracker's _remove_deleted_instances_allocations, which is
get_allocations_for_resource_provider's only consumer (for now, until
reshaper work starts using it), is refactored to behave the same way it
used to, which is to no-op if the placement API request fails. However,
a) we take an earlier short-circuit out of the method, which saves a
little work copying context stuff; and b) we now emit a warning log
message if the no-op is due to the newly-raised exceptions.

Change-Id: I020e7dc47efc79f8907b7bfb753ec779a8da69a1
This commit is contained in:
Eric Fried
2018-07-21 12:07:28 -05:00
parent d13e5a5011
commit f534495a42
5 changed files with 115 additions and 25 deletions

View File

@@ -22,6 +22,7 @@ import collections
import copy
import retrying
from keystoneauth1 import exceptions as ks_exc
from oslo_log import log as logging
from oslo_serialization import jsonutils
@@ -1271,8 +1272,20 @@ class ResourceTracker(object):
# happen according to the normal flow of events where the scheduler
# always creates allocations for an instance
known_instances = set(self.tracked_instances.keys())
allocations = self.reportclient.get_allocations_for_resource_provider(
context, cn.uuid) or {}
try:
# pai: report.ProviderAllocInfo namedtuple
pai = self.reportclient.get_allocations_for_resource_provider(
context, cn.uuid)
except (exception.ResourceProviderAllocationRetrievalFailed,
ks_exc.ClientException) as e:
LOG.error("Skipping removal of allocations for deleted instances: "
"%s", e)
return
allocations = pai.allocations
if not allocations:
# The main loop below would short-circuit anyway, but this saves us
# the (potentially expensive) context.elevated construction below.
return
read_deleted_context = context.elevated(read_deleted='yes')
for consumer_uuid, alloc in allocations.items():
if consumer_uuid in known_instances:

View File

@@ -2368,3 +2368,8 @@ class ZVMConnectorError(ZVMDriverException):
class NoResourceClass(NovaException):
    """Raised when no resource class could be found for an Ironic node."""
    msg_fmt = _("Resource class not found for Ironic node %(node)s.")
class ResourceProviderAllocationRetrievalFailed(NovaException):
    """Raised when a placement API request for a resource provider's
    allocations fails.
    """
    msg_fmt = _("Failed to retrieve allocations for resource provider "
                "%(rp_uuid)s: %(error)s")

View File

@@ -57,6 +57,8 @@ POST_ALLOCATIONS_API_VERSION = '1.13'
AggInfo = collections.namedtuple('AggInfo', ['aggregates', 'generation'])
TraitInfo = collections.namedtuple('TraitInfo', ['traits', 'generation'])
# Return type of get_allocations_for_resource_provider: carries the
# 'allocations' dict from the placement API response.
ProviderAllocInfo = collections.namedtuple(
    'ProviderAllocInfo', ['allocations'])
def warn_limit(self, msg):
@@ -1933,14 +1935,25 @@ class SchedulerReportClient(object):
else:
self.delete_allocation_for_instance(context, instance.uuid)
def get_allocations_for_resource_provider(self, context, rp_uuid):
    """Retrieves the allocations for a specific provider.

    :param context: The nova.context.RequestContext auth context
    :param rp_uuid: The UUID of the provider.
    :return: ProviderAllocInfo namedtuple.
    :raises: keystoneauth1.exceptions.base.ClientException on failure to
             communicate with the placement API
    :raises: ResourceProviderAllocationRetrievalFailed if the placement API
             call fails.
    """
    url = '/resource_providers/%s/allocations' % rp_uuid
    resp = self.get(url, global_request_id=context.global_id)
    if not resp:
        # Surface the failure to the caller instead of silently returning
        # an empty result (the old @safe_connect/return-{} behavior).
        raise exception.ResourceProviderAllocationRetrievalFailed(
            rp_uuid=rp_uuid, error=resp.text)
    data = resp.json()
    return ProviderAllocInfo(allocations=data['allocations'])
def delete_resource_provider(self, context, compute_node, cascade=False):
"""Deletes the ResourceProvider record for the compute_node.

View File

@@ -13,6 +13,7 @@
import copy
import datetime
from keystoneauth1 import exceptions as ks_exc
import mock
from oslo_config import cfg
from oslo_utils import timeutils
@@ -33,6 +34,7 @@ from nova.objects import fields as obj_fields
from nova.objects import pci_device
from nova.pci import manager as pci_manager
from nova import rc_fields
from nova.scheduler.client import report
from nova import test
from nova.tests.unit import fake_notifier
from nova.tests.unit.objects import test_pci_device as fake_pci_device
@@ -2799,7 +2801,8 @@ class TestUpdateUsageFromInstance(BaseTestCase):
mock_inst_get):
rc = self.rt.reportclient
self.rt.tracked_instances = {}
allocs = {uuids.deleted: "fake_deleted_instance"}
allocs = report.ProviderAllocInfo(
allocations={uuids.deleted: "fake_deleted_instance"})
rc.get_allocations_for_resource_provider = mock.MagicMock(
return_value=allocs)
rc.delete_allocation_for_instance = mock.MagicMock()
@@ -2824,7 +2827,8 @@ class TestUpdateUsageFromInstance(BaseTestCase):
mock_inst_get):
rc = self.rt.reportclient
self.rt.tracked_instances = {}
allocs = {uuids.deleted: "fake_deleted_instance"}
allocs = report.ProviderAllocInfo(
allocations={uuids.deleted: "fake_deleted_instance"})
rc.get_allocations_for_resource_provider = mock.MagicMock(
return_value=allocs)
rc.delete_allocation_for_instance = mock.MagicMock()
@@ -2843,8 +2847,9 @@ class TestUpdateUsageFromInstance(BaseTestCase):
mock_inst_get):
rc = self.rt.reportclient
self.rt.tracked_instances = {}
allocs = {uuids.deleted: "fake_deleted_instance",
uuids.migration: "fake_migration"}
allocs = report.ProviderAllocInfo(
allocations={uuids.deleted: "fake_deleted_instance",
uuids.migration: "fake_migration"})
mig = objects.Migration(uuid=uuids.migration)
rc.get_allocations_for_resource_provider = mock.MagicMock(
return_value=allocs)
@@ -2867,7 +2872,8 @@ class TestUpdateUsageFromInstance(BaseTestCase):
mock_inst_get):
rc = self.rt.reportclient
self.rt.tracked_instances = {}
allocs = {uuids.scheduled: "fake_scheduled_instance"}
allocs = report.ProviderAllocInfo(
allocations={uuids.scheduled: "fake_scheduled_instance"})
rc.get_allocations_for_resource_provider = mock.MagicMock(
return_value=allocs)
rc.delete_allocation_for_instance = mock.MagicMock()
@@ -2916,15 +2922,17 @@ class TestUpdateUsageFromInstance(BaseTestCase):
rc = self.rt.reportclient
self.rt.tracked_instances = {
uuids.known: objects.Instance(uuid=uuids.known)}
allocs = {
uuids.known: {
'resources': {
'VCPU': 1,
'MEMORY_MB': 2048,
'DISK_GB': 20
allocs = report.ProviderAllocInfo(
allocations={
uuids.known: {
'resources': {
'VCPU': 1,
'MEMORY_MB': 2048,
'DISK_GB': 20
}
}
}
}
)
rc.get_allocations_for_resource_provider = mock.MagicMock(
return_value=allocs)
rc.delete_allocation_for_instance = mock.MagicMock()
@@ -2951,15 +2959,17 @@ class TestUpdateUsageFromInstance(BaseTestCase):
# No tracked instances on this node.
self.rt.tracked_instances = {}
# But there is an allocation for an instance on this node.
allocs = {
instance.uuid: {
'resources': {
'VCPU': 1,
'MEMORY_MB': 2048,
'DISK_GB': 20
allocs = report.ProviderAllocInfo(
allocations={
instance.uuid: {
'resources': {
'VCPU': 1,
'MEMORY_MB': 2048,
'DISK_GB': 20
}
}
}
}
)
rc.get_allocations_for_resource_provider = mock.MagicMock(
return_value=allocs)
rc.delete_allocation_for_instance = mock.MagicMock()
@@ -2974,6 +2984,33 @@ class TestUpdateUsageFromInstance(BaseTestCase):
# and figure things out, this should actually be an error.
rc.delete_allocation_for_instance.assert_not_called()
def test_remove_deleted_instances_allocations_retrieval_fail(self):
    """When the report client errs or otherwise retrieves no allocations,
    _remove_deleted_instances_allocations gracefully no-ops.
    """
    cn = self.rt.compute_nodes[_NODENAME]
    rc = self.rt.reportclient
    # We'll test three different ways get_allocations_for_resource_provider
    # can cause us to no-op.
    side_effects = (
        # Actual placement error
        exc.ResourceProviderAllocationRetrievalFailed(
            rp_uuid='rp_uuid', error='error'),
        # API communication failure
        ks_exc.ClientException,
        # Legitimately no allocations
        report.ProviderAllocInfo(allocations={}),
    )
    rc.get_allocations_for_resource_provider = mock.Mock(
        side_effect=side_effects)
    for _ in side_effects:
        # If we didn't no op, this would blow up at 'ctx'.elevated()
        self.rt._remove_deleted_instances_allocations(
            'ctx', cn, [], self.rt.tracked_instances)
        rc.get_allocations_for_resource_provider.assert_called_once_with(
            'ctx', cn.uuid)
        rc.get_allocations_for_resource_provider.reset_mock()
def test_delete_allocation_for_shelve_offloaded_instance(self):
instance = _INSTANCE_FIXTURES[0].obj_clone()
instance.uuid = uuids.inst0

View File

@@ -3406,6 +3406,28 @@ class TestAllocations(SchedulerReportClientTestCase):
self.assertEqual(0, mock_log.info.call_count)
self.assertEqual(1, mock_log.error.call_count)
@mock.patch("nova.scheduler.client.report.SchedulerReportClient.get")
def test_get_allocations_for_resource_provider(self, mock_get):
    # Happy path: a 200 response yields a ProviderAllocInfo namedtuple
    # carrying the 'allocations' payload from the response body.
    mock_get.return_value = fake_requests.FakeResponse(
        200, content=jsonutils.dumps(
            {'allocations': 'fake', 'resource_provider_generation': 42}))
    ret = self.client.get_allocations_for_resource_provider(
        self.context, 'rpuuid')
    self.assertEqual('fake', ret.allocations)
    mock_get.assert_called_once_with(
        '/resource_providers/rpuuid/allocations',
        global_request_id=self.context.global_id)
@mock.patch("nova.scheduler.client.report.SchedulerReportClient.get")
def test_get_allocations_for_resource_provider_fail(self, mock_get):
    # A non-2xx response from placement must surface as
    # ResourceProviderAllocationRetrievalFailed rather than being
    # swallowed into an empty result.
    mock_get.return_value = fake_requests.FakeResponse(400, content="ouch")
    self.assertRaises(exception.ResourceProviderAllocationRetrievalFailed,
                      self.client.get_allocations_for_resource_provider,
                      self.context, 'rpuuid')
    mock_get.assert_called_once_with(
        '/resource_providers/rpuuid/allocations',
        global_request_id=self.context.global_id)
class TestResourceClass(SchedulerReportClientTestCase):
def setUp(self):