From a86ebc75eb886bd293dca42439762ecdd69ca0d7 Mon Sep 17 00:00:00 2001 From: melanie witt Date: Fri, 3 Apr 2020 21:22:27 +0000 Subject: [PATCH] Reset the cell cache for database access in Service We have had a gate bug for a long time where occasionally the scheduler service gets into a state where many requests fail in it with CellTimeout errors. Example: Timed out waiting for response from cell Through the use of much DNM patch debug logging in oslo.db, it was revealed that service child processes (workers) were sometimes starting off with already locked internal oslo.db locks. This is a known issue in python [1] where if a parent process forks a child process while a lock is held, the child will inherit the held lock which can never be acquired. The python issue is not considered a bug and the recommended way to handle it is by making use of the os.register_at_fork() in the oslo.db to reinitialize its lock. The method is new in python 3.7, so as long as we still support python 3.6, we must handle the situation outside of oslo.db. We can do this by clearing the cell cache that holds oslo.db database transaction context manager objects during service start(). This way, we get fresh oslo.db locks that are in an unlocked state when a child process begins. We can also take this opportunity to resolve part of a TODO to clear the same cell cache during service reset() (SIGHUP) since it is another case where we intended to clear it. The rest of the TODO related to periodic clearing of the cache is removed after discussion on the review, as such clearing would be unsynchronized among multiple services and for periods of time each service might have a different view of cached cells than another. Closes-Bug: #1844929 [1] https://bugs.python.org/issue6721 NOTE(melwitt): The difference from the Stein change in nova/tests/functional/test_service.py is because change Idaed39629095f86d24a54334c699a26c218c6593 is not in Rocky. Change-Id: Id233f673a57461cc312e304873a41442d732c051 (cherry picked from commit 941559042f609ee43ff3160c0f0d0c45187be17f) (cherry picked from commit 88205a4e911268dae7120a6a43ff9042d1534251) (cherry picked from commit 4de766006d9432a7ccbcf6a4d4232db472b2f0e1) --- nova/context.py | 3 - nova/service.py | 37 +++++++++- nova/tests/functional/test_service.py | 101 ++++++++++++++++++++++++++ 3 files changed, 137 insertions(+), 4 deletions(-) create mode 100644 nova/tests/functional/test_service.py diff --git a/nova/context.py b/nova/context.py index 58ea26a9939f..8e2d7f3a153e 100644 --- a/nova/context.py +++ b/nova/context.py @@ -38,9 +38,6 @@ from nova import policy from nova import utils LOG = logging.getLogger(__name__) -# TODO(melwitt): This cache should be cleared whenever WSGIService receives a -# SIGHUP and periodically based on an expiration time. Currently, none of the -# cell caches are purged, so neither is this one, for now. CELL_CACHE = {} # NOTE(melwitt): Used for the scatter-gather utility to indicate we timed out # waiting for a result from a cell. diff --git a/nova/service.py b/nova/service.py index b0655c051342..19d97e79c45e 100644 --- a/nova/service.py +++ b/nova/service.py @@ -153,6 +153,20 @@ class Service(service.Service): This includes starting an RPC service, initializing periodic tasks, etc. """ + # NOTE(melwitt): Clear the cell cache holding database transaction + # context manager objects. We do this to ensure we create new internal + # oslo.db locks to avoid a situation where a child process receives an + # already locked oslo.db lock when it is forked. When a child process + # inherits a locked oslo.db lock, database accesses through that + # transaction context manager will never be able to acquire the lock + # and requests will fail with CellTimeout errors. + # See https://bugs.python.org/issue6721 for more information. + # With python 3.7, it would be possible for oslo.db to make use of the + # os.register_at_fork() method to reinitialize its lock. Until we + # require python 3.7 as a mininum version, we must handle the situation + # outside of oslo.db. + context.CELL_CACHE = {} + assert_eventlet_uses_monotonic_clock() verstr = version.version_string_with_package() @@ -308,6 +322,8 @@ class Service(service.Service): def reset(self): """reset the service.""" self.manager.reset() + # Reset the cell cache that holds database transaction context managers + context.CELL_CACHE = {} class WSGIService(service.Service): @@ -362,13 +378,18 @@ class WSGIService(service.Service): setup_profiler(name, self.host) def reset(self): - """Reset server greenpool size to default and service version cache. + """Reset the following: + + * server greenpool size to default + * service version cache + * cell cache holding database transaction context managers :returns: None """ self.server.reset() service_obj.Service.clear_min_version_cache() + context.CELL_CACHE = {} def _get_manager(self): """Initialize a Manager object appropriate for this service. @@ -396,6 +417,20 @@ class WSGIService(service.Service): :returns: None """ + # NOTE(melwitt): Clear the cell cache holding database transaction + # context manager objects. We do this to ensure we create new internal + # oslo.db locks to avoid a situation where a child process receives an + # already locked oslo.db lock when it is forked. When a child process + # inherits a locked oslo.db lock, database accesses through that + # transaction context manager will never be able to acquire the lock + # and requests will fail with CellTimeout errors. + # See https://bugs.python.org/issue6721 for more information. + # With python 3.7, it would be possible for oslo.db to make use of the + # os.register_at_fork() method to reinitialize its lock. Until we + # require python 3.7 as a mininum version, we must handle the situation + # outside of oslo.db. + context.CELL_CACHE = {} + ctxt = context.get_admin_context() service_ref = objects.Service.get_by_host_and_binary(ctxt, self.host, self.binary) diff --git a/nova/tests/functional/test_service.py b/nova/tests/functional/test_service.py new file mode 100644 index 000000000000..78cac9f625a2 --- /dev/null +++ b/nova/tests/functional/test_service.py @@ -0,0 +1,101 @@ +# Licensed under the Apache License, Version 2.0 (the "License"); you may +# not use this file except in compliance with the License. You may obtain +# a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT +# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the +# License for the specific language governing permissions and limitations +# under the License. + +from nova import context as nova_context +from nova import test +from nova.tests import fixtures as nova_fixtures +from nova.tests.functional import integrated_helpers +from nova.tests.unit.image import fake as fake_image +from nova.tests.unit import policy_fixture + + +class ServiceTestCase(test.TestCase, + integrated_helpers.InstanceHelperMixin): + """Contains scenarios for testing services. + """ + + def setUp(self): + super(ServiceTestCase, self).setUp() + # Use the standard fixtures. + self.useFixture(policy_fixture.RealPolicyFixture()) + self.useFixture(nova_fixtures.NeutronFixture(self)) + self.useFixture(nova_fixtures.PlacementFixture()) + fake_image.stub_out_image_service(self) + self.addCleanup(fake_image.FakeImageService_reset) + # Start nova controller services. + self.api = self.useFixture(nova_fixtures.OSAPIFixture( + api_version='v2.1')).api + self.start_service('conductor') + self.scheduler = self.start_service('scheduler') + # Our OSAPIFixture does not use a WSGIService, so just use the metadata + # server fixture (which uses WSGIService) for testing. + self.metadata = self.useFixture( + nova_fixtures.OSMetadataServer()).metadata + # Start one compute service. + self.start_service('compute') + + def test_service_reset_resets_cell_cache(self): + """Tests that the cell cache for database transaction context managers + is cleared after a service reset (example scenario: SIGHUP). + """ + server_req = self._build_minimal_create_server_request( + self.api, 'test-service-reset') + server = self.api.post_server({'server': server_req}) + self._wait_for_state_change(self.api, server, 'ACTIVE') + # Cell cache should be populated after creating a server. + self.assertTrue(nova_context.CELL_CACHE) + self.scheduler.reset() + # Cell cache should be empty after the service reset. + self.assertEqual({}, nova_context.CELL_CACHE) + + # Now test the WSGI service. + server = self.api.post_server({'server': server_req}) + self._wait_for_state_change(self.api, server, 'ACTIVE') + # Cell cache should be populated after creating a server. + self.assertTrue(nova_context.CELL_CACHE) + self.metadata.reset() + # Cell cache should be empty after the service reset. + self.assertEqual({}, nova_context.CELL_CACHE) + + def test_service_start_resets_cell_cache(self): + """Tests that the cell cache for database transaction context managers + is cleared upon a service start (example scenario: service start after + a SIGTERM and the parent process forks child process workers). + """ + server_req = self._build_minimal_create_server_request( + self.api, 'test-service-start') + server = self.api.post_server({'server': server_req}) + self._wait_for_state_change(self.api, server, 'ACTIVE') + # Cell cache should be populated after creating a server. + self.assertTrue(nova_context.CELL_CACHE) + self.scheduler.stop() + # NOTE(melwitt): Simulate a service starting after being stopped. The + # scenario we want to handle is one where during start, the parent + # process forks child process workers while one or more of its cached + # database transaction context managers is inside a locked code + # section. If the child processes are forked while the lock is locked, + # the child processes begin with an already locked lock that can never + # be acquired again. The result is that requests gets stuck and fail + # with a CellTimeout error. + self.scheduler.start() + # Cell cache should be empty after the service start. + self.assertEqual({}, nova_context.CELL_CACHE) + + # Now test the WSGI service. + server = self.api.post_server({'server': server_req}) + self._wait_for_state_change(self.api, server, 'ACTIVE') + # Cell cache should be populated after creating a server. + self.assertTrue(nova_context.CELL_CACHE) + self.metadata.stop() + self.metadata.start() + # Cell cache should be empty after the service reset. + self.assertEqual({}, nova_context.CELL_CACHE)