From c178d9360665c219cbcc71c9f37b9e6e3055a5e5 Mon Sep 17 00:00:00 2001
From: Dan Smith <dansmith@redhat.com>
Date: Thu, 11 Aug 2022 09:50:30 -0700
Subject: [PATCH] Unify placement client singleton implementations

We have many places where we implement singleton behavior for the
placement client. This unifies them into a single place and
implementation. Not only does this DRY things up, but may cause us
to initialize it fewer times and also allows for emitting a common
set of error messages about expected failures for better
troubleshooting.

Change-Id: Iab8a791f64323f996e1d6e6d5a7e7a7c34eb4fb3
Related-Bug: #1846820
---
 nova/api/openstack/compute/services.py        |  7 +--
 nova/cmd/manage.py                            |  4 +-
 nova/compute/api.py                           | 10 +---
 nova/compute/manager.py                       |  5 ++
 nova/compute/resource_tracker.py              |  2 +-
 nova/conductor/manager.py                     |  2 +-
 nova/conductor/tasks/migrate.py               |  4 +-
 nova/limit/placement.py                       |  6 +--
 nova/quota.py                                 |  7 +--
 nova/scheduler/client/report.py               | 46 +++++++++++++++++++
 nova/scheduler/manager.py                     |  2 +-
 nova/scheduler/request_filter.py              |  2 +-
 nova/test.py                                  |  4 ++
 nova/tests/unit/compute/test_api.py           | 11 ++---
 nova/tests/unit/compute/test_compute.py       | 11 ++---
 .../unit/scheduler/client/test_report.py      | 36 +++++++++++++++
 16 files changed, 115 insertions(+), 44 deletions(-)

diff --git a/nova/api/openstack/compute/services.py b/nova/api/openstack/compute/services.py
index 6deb84a7f1ab..e9d51d4d0c8e 100644
--- a/nova/api/openstack/compute/services.py
+++ b/nova/api/openstack/compute/services.py
@@ -48,13 +48,10 @@ class ServiceController(wsgi.Controller):
         self.actions = {"enable": self._enable,
                         "disable": self._disable,
                         "disable-log-reason": self._disable_log_reason}
-        self._placementclient = None  # Lazy-load on first access.
 
     @property
     def placementclient(self):
-        if self._placementclient is None:
-            self._placementclient = report.SchedulerReportClient()
-        return self._placementclient
+        return report.report_client_singleton()
 
     def _get_services(self, req):
         # The API services are filtered out since they are not RPC services
@@ -328,7 +325,7 @@ class ServiceController(wsgi.Controller):
                             "Failed to delete compute node resource provider "
                             "for compute node %s: %s",
                             compute_node.uuid, str(e))
-                # remove the host_mapping of this host.
+                # Remove the host_mapping of this host.
                 try:
                     hm = objects.HostMapping.get_by_host(context, service.host)
                     hm.destroy()
diff --git a/nova/cmd/manage.py b/nova/cmd/manage.py
index 32b0dd57f808..08b8ebb31047 100644
--- a/nova/cmd/manage.py
+++ b/nova/cmd/manage.py
@@ -2209,7 +2209,7 @@ class PlacementCommands(object):
                 output(_('No cells to process.'))
                 return 4
 
-        placement = report.SchedulerReportClient()
+        placement = report.report_client_singleton()
 
         neutron = None
         if heal_port_allocations:
@@ -2710,7 +2710,7 @@ class PlacementCommands(object):
         if verbose:
             output = lambda msg: print(msg)
 
-        placement = report.SchedulerReportClient()
+        placement = report.report_client_singleton()
         # Resets two in-memory dicts for knowing instances per compute node
         self.cn_uuid_mapping = collections.defaultdict(tuple)
         self.instances_mapping = collections.defaultdict(list)
diff --git a/nova/compute/api.py b/nova/compute/api.py
index 66543f57dc0a..cc400cfa7ac0 100644
--- a/nova/compute/api.py
+++ b/nova/compute/api.py
@@ -386,7 +386,6 @@ class API:
         self.image_api = image_api or glance.API()
         self.network_api = network_api or neutron.API()
         self.volume_api = volume_api or cinder.API()
-        self._placementclient = None  # Lazy-load on first access.
         self.compute_rpcapi = compute_rpcapi.ComputeAPI()
         self.compute_task_api = conductor.ComputeTaskAPI()
         self.servicegroup_api = servicegroup.API()
@@ -2575,9 +2574,7 @@ class API:
 
     @property
     def placementclient(self):
-        if self._placementclient is None:
-            self._placementclient = report.SchedulerReportClient()
-        return self._placementclient
+        return report.report_client_singleton()
 
     def _local_delete(self, context, instance, bdms, delete_type, cb):
         if instance.vm_state == vm_states.SHELVED_OFFLOADED:
@@ -6405,13 +6402,10 @@ class AggregateAPI:
     def __init__(self):
         self.compute_rpcapi = compute_rpcapi.ComputeAPI()
         self.query_client = query.SchedulerQueryClient()
-        self._placement_client = None  # Lazy-load on first access.
 
     @property
     def placement_client(self):
-        if self._placement_client is None:
-            self._placement_client = report.SchedulerReportClient()
-        return self._placement_client
+        return report.report_client_singleton()
 
     @wrap_exception()
     def create_aggregate(self, context, aggregate_name, availability_zone):
diff --git a/nova/compute/manager.py b/nova/compute/manager.py
index 5e39b1b76645..6a42e1769600 100644
--- a/nova/compute/manager.py
+++ b/nova/compute/manager.py
@@ -622,6 +622,11 @@ class ComputeManager(manager.Manager):
         # We want the ComputeManager, ResourceTracker and ComputeVirtAPI all
         # using the same instance of SchedulerReportClient which has the
         # ProviderTree cache for this compute service.
+        # NOTE(danms): We do not use the global placement client
+        # singleton here, because the above-mentioned stack of objects
+        # maintain local state in the client. Thus, keeping our own
+        # private object for that stack avoids any potential conflict
+        # with other users in our process outside of the above.
         self.reportclient = report.SchedulerReportClient()
         self.virtapi = ComputeVirtAPI(self)
         self.network_api = neutron.API()
diff --git a/nova/compute/resource_tracker.py b/nova/compute/resource_tracker.py
index 0b801f7ddf99..058777d1ed08 100644
--- a/nova/compute/resource_tracker.py
+++ b/nova/compute/resource_tracker.py
@@ -103,7 +103,7 @@ class ResourceTracker(object):
         monitor_handler = monitors.MonitorHandler(self)
         self.monitors = monitor_handler.monitors
         self.old_resources = collections.defaultdict(objects.ComputeNode)
-        self.reportclient = reportclient or report.SchedulerReportClient()
+        self.reportclient = reportclient or report.report_client_singleton()
         self.ram_allocation_ratio = CONF.ram_allocation_ratio
         self.cpu_allocation_ratio = CONF.cpu_allocation_ratio
         self.disk_allocation_ratio = CONF.disk_allocation_ratio
diff --git a/nova/conductor/manager.py b/nova/conductor/manager.py
index f6b0815d1b71..5b1b9f9c6055 100644
--- a/nova/conductor/manager.py
+++ b/nova/conductor/manager.py
@@ -243,7 +243,7 @@ class ComputeTaskManager:
         self.network_api = neutron.API()
         self.servicegroup_api = servicegroup.API()
         self.query_client = query.SchedulerQueryClient()
-        self.report_client = report.SchedulerReportClient()
+        self.report_client = report.report_client_singleton()
         self.notifier = rpc.get_notifier('compute')
         # Help us to record host in EventReporter
         self.host = CONF.host
diff --git a/nova/conductor/tasks/migrate.py b/nova/conductor/tasks/migrate.py
index 6ff6206f6599..8838d0240a6c 100644
--- a/nova/conductor/tasks/migrate.py
+++ b/nova/conductor/tasks/migrate.py
@@ -54,7 +54,7 @@ def replace_allocation_with_migration(context, instance, migration):
         # and do any rollback required
         raise
 
-    reportclient = report.SchedulerReportClient()
+    reportclient = report.report_client_singleton()
 
     orig_alloc = reportclient.get_allocs_for_consumer(
         context, instance.uuid)['allocations']
@@ -94,7 +94,7 @@ def replace_allocation_with_migration(context, instance, migration):
 def revert_allocation_for_migration(context, source_cn, instance, migration):
     """Revert an allocation made for a migration back to the instance."""
 
-    reportclient = report.SchedulerReportClient()
+    reportclient = report.report_client_singleton()
 
     # FIXME(gibi): This method is flawed in that it does not handle allocations
     # against sharing providers in any special way. This leads to duplicate
diff --git a/nova/limit/placement.py b/nova/limit/placement.py
index 497986c4ab87..eedf7d69e19f 100644
--- a/nova/limit/placement.py
+++ b/nova/limit/placement.py
@@ -43,10 +43,8 @@ LEGACY_LIMITS = {
 def _get_placement_usages(
     context: 'nova.context.RequestContext', project_id: str
 ) -> ty.Dict[str, int]:
-    global PLACEMENT_CLIENT
-    if not PLACEMENT_CLIENT:
-        PLACEMENT_CLIENT = report.SchedulerReportClient()
-    return PLACEMENT_CLIENT.get_usages_counts_for_limits(context, project_id)
+    return report.report_client_singleton().get_usages_counts_for_limits(
+        context, project_id)
 
 
 def _get_usage(
diff --git a/nova/quota.py b/nova/quota.py
index b9dd76301276..eafad4cd23d5 100644
--- a/nova/quota.py
+++ b/nova/quota.py
@@ -1348,11 +1348,8 @@ def _instances_cores_ram_count_legacy(context, project_id, user_id=None):
 
 
 def _cores_ram_count_placement(context, project_id, user_id=None):
-    global PLACEMENT_CLIENT
-    if not PLACEMENT_CLIENT:
-        PLACEMENT_CLIENT = report.SchedulerReportClient()
-    return PLACEMENT_CLIENT.get_usages_counts_for_quota(context, project_id,
-                                                        user_id=user_id)
+    return report.report_client_singleton().get_usages_counts_for_quota(
+        context, project_id, user_id=user_id)
 
 
 def _instances_cores_ram_count_api_db_placement(context, project_id,
diff --git a/nova/scheduler/client/report.py b/nova/scheduler/client/report.py
index e4d0c8e3db62..ff86527cf55c 100644
--- a/nova/scheduler/client/report.py
+++ b/nova/scheduler/client/report.py
@@ -52,6 +52,7 @@ AGGREGATE_GENERATION_VERSION = '1.19'
 NESTED_PROVIDER_API_VERSION = '1.14'
 POST_ALLOCATIONS_API_VERSION = '1.13'
 GET_USAGES_VERSION = '1.9'
+PLACEMENTCLIENT = None
 
 AggInfo = collections.namedtuple('AggInfo', ['aggregates', 'generation'])
 TraitInfo = collections.namedtuple('TraitInfo', ['traits', 'generation'])
@@ -67,6 +68,51 @@ def warn_limit(self, msg):
         LOG.warning(msg)
 
 
+def report_client_singleton():
+    """Return a reference to the global placement client singleton.
+
+    This initializes the placement client once and returns a reference
+    to that singleton on subsequent calls. Errors are raised
+    (particularly ks_exc.*) but context-specific error messages are
+    logged for consistency.
+    """
+    # NOTE(danms): The report client maintains internal state in the
+    # form of the provider tree, which will be shared across all users
+    # of this global client. That is not a problem now, but in the
+    # future it may be beneficial to fix that. One idea would be to
+    # change the behavior of the client such that the static-config
+    # pieces of the actual keystone client are separate from the
+    # internal state, so that we can return a new object here with a
+    # context-specific local state object, but with the client bits
+    # shared.
+    global PLACEMENTCLIENT
+    if PLACEMENTCLIENT is None:
+        try:
+            PLACEMENTCLIENT = SchedulerReportClient()
+        except ks_exc.EndpointNotFound:
+            LOG.error('The placement API endpoint was not found.')
+            raise
+        except ks_exc.MissingAuthPlugin:
+            LOG.error('No authentication information found for placement API.')
+            raise
+        except ks_exc.Unauthorized:
+            LOG.error('Placement service credentials do not work.')
+            raise
+        except ks_exc.DiscoveryFailure:
+            LOG.error('Discovering suitable URL for placement API failed.')
+            raise
+        except (ks_exc.ConnectFailure,
+                ks_exc.RequestTimeout,
+                ks_exc.GatewayTimeout):
+            LOG.error('Placement API service is not responding.')
+            raise
+        except Exception:
+            LOG.error('Failed to initialize placement client '
+                      '(is keystone available?)')
+            raise
+    return PLACEMENTCLIENT
+
+
 def safe_connect(f):
     @functools.wraps(f)
     def wrapper(self, *a, **k):
diff --git a/nova/scheduler/manager.py b/nova/scheduler/manager.py
index 03df615f6a6c..10b330653deb 100644
--- a/nova/scheduler/manager.py
+++ b/nova/scheduler/manager.py
@@ -66,7 +66,7 @@ class SchedulerManager(manager.Manager):
         self.host_manager = host_manager.HostManager()
         self.servicegroup_api = servicegroup.API()
         self.notifier = rpc.get_notifier('scheduler')
-        self.placement_client = report.SchedulerReportClient()
+        self.placement_client = report.report_client_singleton()
 
         super().__init__(service_name='scheduler', *args, **kwargs)
 
diff --git a/nova/scheduler/request_filter.py b/nova/scheduler/request_filter.py
index bd237b06cac0..3f96b7a88063 100644
--- a/nova/scheduler/request_filter.py
+++ b/nova/scheduler/request_filter.py
@@ -311,7 +311,7 @@ def routed_networks_filter(
 
     # Get the clients we need
     network_api = neutron.API()
-    report_api = report.SchedulerReportClient()
+    report_api = report.report_client_singleton()
 
     for requested_network in requested_networks:
         network_id = None
diff --git a/nova/test.py b/nova/test.py
index d645d06cfa5c..689d5ba29164 100644
--- a/nova/test.py
+++ b/nova/test.py
@@ -61,6 +61,7 @@ from nova import exception
 from nova import objects
 from nova.objects import base as objects_base
 from nova import quota
+from nova.scheduler.client import report
 from nova.tests import fixtures as nova_fixtures
 from nova.tests.unit import matchers
 from nova import utils
@@ -291,6 +292,9 @@ class TestCase(base.BaseTestCase):
         # instead of only once initialized for test worker
         wsgi_app.init_global_data.reset()
 
+        # Reset the placement client singleton
+        report.PLACEMENTCLIENT = None
+
     def _setup_cells(self):
         """Setup a normal cellsv2 environment.
 
diff --git a/nova/tests/unit/compute/test_api.py b/nova/tests/unit/compute/test_api.py
index f62f8c7477ca..b0442afb90f8 100644
--- a/nova/tests/unit/compute/test_api.py
+++ b/nova/tests/unit/compute/test_api.py
@@ -7764,16 +7764,13 @@ class ComputeAPIUnitTestCase(_ComputeAPIUnitTestMixIn, test.NoDBTestCase):
         self.assertTrue(hasattr(self.compute_api, 'host'))
         self.assertEqual(CONF.host, self.compute_api.host)
 
-    @mock.patch('nova.scheduler.client.report.SchedulerReportClient')
+    @mock.patch('nova.scheduler.client.report.report_client_singleton')
     def test_placement_client_init(self, mock_report_client):
         """Tests to make sure that the construction of the placement client
-        only happens once per API class instance.
+        uses the singleton helper, and happens only when needed.
         """
-        self.assertIsNone(self.compute_api._placementclient)
-        # Access the property twice to make sure SchedulerReportClient is
-        # only loaded once.
-        for x in range(2):
-            self.compute_api.placementclient
+        self.assertFalse(mock_report_client.called)
+        self.compute_api.placementclient
         mock_report_client.assert_called_once_with()
 
     def test_validate_host_for_cold_migrate_same_host_fails(self):
diff --git a/nova/tests/unit/compute/test_compute.py b/nova/tests/unit/compute/test_compute.py
index 672e5c894ef2..b5754226a30b 100644
--- a/nova/tests/unit/compute/test_compute.py
+++ b/nova/tests/unit/compute/test_compute.py
@@ -13091,16 +13091,13 @@ class ComputeAPIAggrTestCase(BaseTestCase):
         hosts = aggregate.hosts if 'hosts' in aggregate else None
         self.assertIn(values[0][1][0], hosts)
 
-    @mock.patch('nova.scheduler.client.report.SchedulerReportClient')
+    @mock.patch('nova.scheduler.client.report.report_client_singleton')
     def test_placement_client_init(self, mock_report_client):
         """Tests to make sure that the construction of the placement client
-        only happens once per AggregateAPI class instance.
+        uses the singleton helper, and happens only when needed.
         """
-        self.assertIsNone(self.api._placement_client)
-        # Access the property twice to make sure SchedulerReportClient is
-        # only loaded once.
-        for x in range(2):
-            self.api.placement_client
+        self.assertFalse(mock_report_client.called)
+        self.api.placement_client
         mock_report_client.assert_called_once_with()
 
 
diff --git a/nova/tests/unit/scheduler/client/test_report.py b/nova/tests/unit/scheduler/client/test_report.py
index 9a1766927c3e..2829c160340d 100644
--- a/nova/tests/unit/scheduler/client/test_report.py
+++ b/nova/tests/unit/scheduler/client/test_report.py
@@ -11,6 +11,7 @@
 #    under the License.
 
 import copy
+import ddt
 import time
 from unittest import mock
 from urllib import parse
@@ -157,6 +158,41 @@ class SafeConnectedTestCase(test.NoDBTestCase):
         self.assertTrue(req.called)
 
 
+@ddt.ddt
+class TestSingleton(test.NoDBTestCase):
+    def test_singleton(self):
+        # Make sure we start with a clean slate
+        self.assertIsNone(report.PLACEMENTCLIENT)
+
+        # Make sure the first call creates the singleton, sets it
+        # globally, and returns it
+        client = report.report_client_singleton()
+        self.assertEqual(client, report.PLACEMENTCLIENT)
+
+        # Make sure that a subsequent call returns the same thing
+        # again and that the global is unchanged
+        self.assertEqual(client, report.report_client_singleton())
+        self.assertEqual(client, report.PLACEMENTCLIENT)
+
+    @ddt.data(ks_exc.EndpointNotFound,
+              ks_exc.MissingAuthPlugin,
+              ks_exc.Unauthorized,
+              ks_exc.DiscoveryFailure,
+              ks_exc.ConnectFailure,
+              ks_exc.RequestTimeout,
+              ks_exc.GatewayTimeout,
+              test.TestingException)
+    def test_errors(self, exc):
+        self._test_error(exc)
+
+    @mock.patch.object(report, 'LOG')
+    def _test_error(self, exc, mock_log):
+        with mock.patch.object(report.SchedulerReportClient, '_create_client',
+                               side_effect=exc):
+            self.assertRaises(exc, report.report_client_singleton)
+        mock_log.error.assert_called_once()
+
+
 class TestConstructor(test.NoDBTestCase):
     def setUp(self):
         super(TestConstructor, self).setUp()