Browse Source

Cache database and message queue connection objects

Recently in the gate we have seen a trace on some work-in-progress
patches:

  OperationalError: (pymysql.err.OperationalError)
                    (1040, u'Too many connections')

and at least one operator has reported that the number of database
connections increased significantly going from Mitaka to Newton.

It was suspected that the increase was caused by creating new oslo.db
transaction context managers on-the-fly when switching database
connections for cells. Comparing the dstat --tcp output of runs of the
gate-tempest-dsvm-neutron-full-ubuntu-xenial job with and without
caching of the database connections showed 445 active TCP connections
with caching versus 1495 active TCP connections without it [1].

This adds caching of the oslo.db transaction context managers and the
oslo.messaging transports to avoid creating a large number of objects
that are not being garbage-collected as expected.

Closes-Bug: #1691545

[1] https://docs.google.com/spreadsheets/d/1DIfFfX3kaA_SRoCM-aO7BN4IBEShChXLztOBFeKryt4/edit?usp=sharing

Change-Id: I17e0eb836dd87aac5859f506e7d771d42753d31a
tags/16.0.0.0b2
melanie witt 2 years ago
parent
commit
47fa88d947

+ 25
- 5
nova/context.py View File

@@ -34,6 +34,10 @@ from nova import policy
34 34
 from nova import utils
35 35
 
36 36
 LOG = logging.getLogger(__name__)
37
+# TODO(melwitt): This cache should be cleared whenever WSGIService receives a
38
+# SIGHUP and periodically based on an expiration time. Currently, none of the
39
+# cell caches are purged, so neither is this one, for now.
40
+CELL_CACHE = {}
37 41
 
38 42
 
39 43
 class _ContextAuthPlugin(plugin.BaseAuthPlugin):
@@ -370,15 +374,31 @@ def set_target_cell(context, cell_mapping):
370 374
     :param context: The RequestContext to add connection information
371 375
     :param cell_mapping: An objects.CellMapping object or None
372 376
     """
377
+    global CELL_CACHE
373 378
     if cell_mapping is not None:
374 379
         # avoid circular import
375 380
         from nova import db
376 381
         from nova import rpc
377
-        db_connection_string = cell_mapping.database_connection
378
-        context.db_connection = db.create_context_manager(db_connection_string)
379
-        if not cell_mapping.transport_url.startswith('none'):
380
-            context.mq_connection = rpc.create_transport(
381
-                cell_mapping.transport_url)
382
+
383
+        # Synchronize access to the cache by multiple API workers.
384
+        @utils.synchronized(cell_mapping.uuid)
385
+        def get_or_set_cached_cell_and_set_connections():
386
+            try:
387
+                cell_tuple = CELL_CACHE[cell_mapping.uuid]
388
+            except KeyError:
389
+                db_connection_string = cell_mapping.database_connection
390
+                context.db_connection = db.create_context_manager(
391
+                    db_connection_string)
392
+                if not cell_mapping.transport_url.startswith('none'):
393
+                    context.mq_connection = rpc.create_transport(
394
+                        cell_mapping.transport_url)
395
+                CELL_CACHE[cell_mapping.uuid] = (context.db_connection,
396
+                                                 context.mq_connection)
397
+            else:
398
+                context.db_connection = cell_tuple[0]
399
+                context.mq_connection = cell_tuple[1]
400
+
401
+        get_or_set_cached_cell_and_set_connections()
382 402
     else:
383 403
         context.db_connection = None
384 404
         context.mq_connection = None

+ 1
- 0
nova/test.py View File

@@ -242,6 +242,7 @@ class TestCase(testtools.TestCase):
242 242
         # NOTE(danms): Reset the cached list of cells
243 243
         from nova.compute import api
244 244
         api.CELLS = []
245
+        context.CELL_CACHE = {}
245 246
 
246 247
         self.cell_mappings = {}
247 248
         self.host_mappings = {}

+ 27
- 1
nova/tests/unit/test_context.py View File

@@ -20,6 +20,7 @@ from nova import context
20 20
 from nova import exception
21 21
 from nova import objects
22 22
 from nova import test
23
+from nova.tests import uuidsentinel as uuids
23 24
 
24 25
 
25 26
 class ContextTestCase(test.NoDBTestCase):
@@ -302,7 +303,8 @@ class ContextTestCase(test.NoDBTestCase):
302 303
         ctxt.db_connection = mock.sentinel.db_conn
303 304
         ctxt.mq_connection = mock.sentinel.mq_conn
304 305
         mapping = objects.CellMapping(database_connection='fake://',
305
-                                      transport_url='fake://')
306
+                                      transport_url='fake://',
307
+                                      uuid=uuids.cell)
306 308
         with context.target_cell(ctxt, mapping):
307 309
             self.assertEqual(ctxt.db_connection, mock.sentinel.cdb)
308 310
             self.assertEqual(ctxt.mq_connection, mock.sentinel.cmq)
@@ -333,3 +335,27 @@ class ContextTestCase(test.NoDBTestCase):
333 335
         self.assertIsNone(ctxt.user_id)
334 336
         self.assertIsNone(ctxt.project_id)
335 337
         self.assertFalse(ctxt.is_admin)
338
+
339
+    @mock.patch('nova.rpc.create_transport')
340
+    @mock.patch('nova.db.create_context_manager')
341
+    def test_target_cell_caching(self, mock_create_cm, mock_create_tport):
342
+        mock_create_cm.return_value = mock.sentinel.db_conn_obj
343
+        mock_create_tport.return_value = mock.sentinel.mq_conn_obj
344
+        ctxt = context.get_context()
345
+        mapping = objects.CellMapping(database_connection='fake://db',
346
+                                      transport_url='fake://mq',
347
+                                      uuid=uuids.cell)
348
+        # First call should create new connection objects.
349
+        with context.target_cell(ctxt, mapping):
350
+            self.assertEqual(mock.sentinel.db_conn_obj, ctxt.db_connection)
351
+            self.assertEqual(mock.sentinel.mq_conn_obj, ctxt.mq_connection)
352
+        mock_create_cm.assert_called_once_with('fake://db')
353
+        mock_create_tport.assert_called_once_with('fake://mq')
354
+        # Second call should use cached objects.
355
+        mock_create_cm.reset_mock()
356
+        mock_create_tport.reset_mock()
357
+        with context.target_cell(ctxt, mapping):
358
+            self.assertEqual(mock.sentinel.db_conn_obj, ctxt.db_connection)
359
+            self.assertEqual(mock.sentinel.mq_conn_obj, ctxt.mq_connection)
360
+        mock_create_cm.assert_not_called()
361
+        mock_create_tport.assert_not_called()

+ 10
- 0
releasenotes/notes/bug-1691545-1acd6512effbdffb.yaml View File

@@ -0,0 +1,10 @@
1
+---
2
+fixes:
3
+  - |
4
+    Fixes `bug 1691545`_ in which there was a significant increase in database
5
+    connections because of the way connections to cell databases were being
6
+    established. With this fix, objects related to database connections are
7
+    cached in the API service and reused to prevent new connections being
8
+    established for every communication with cell databases.
9
+
10
+    .. _bug 1691545: https://bugs.launchpad.net/nova/+bug/1691545

Loading…
Cancel
Save