From 5aa78acbce9fa1fa18ac963385e82a6367ff445e Mon Sep 17 00:00:00 2001 From: melanie witt Date: Fri, 3 Apr 2020 21:22:27 +0000 Subject: [PATCH] Reset the cell cache for database access in Service We have had a gate bug for a long time where occasionally the scheduler service gets into a state where many requests fail in it with CellTimeout errors. Example: Timed out waiting for response from cell Through the use of much DNM patch debug logging in oslo.db, it was revealed that service child processes (workers) were sometimes starting off with already locked internal oslo.db locks. This is a known issue in python [1] where if a parent process forks a child process while a lock is held, the child will inherit the held lock which can never be acquired. The python issue is not considered a bug and the recommended way to handle it is by making use of the os.register_at_fork() in the oslo.db to reinitialize its lock. The method is new in python 3.7, so as long as we still support python 3.6, we must handle the situation outside of oslo.db. We can do this by clearing the cell cache that holds oslo.db database transaction context manager objects during service start(). This way, we get fresh oslo.db locks that are in an unlocked state when a child process begins. We can also take this opportunity to resolve part of a TODO to clear the same cell cache during service reset() (SIGHUP) since it is another case where we intended to clear it. The rest of the TODO related to periodic clearing of the cache is removed after discussion on the review, as such clearing would be unsynchronized among multiple services and for periods of time each service might have a different view of cached cells than another. Closes-Bug: #1844929 [1] https://bugs.python.org/issue6721 NOTE(melwitt): This backport differs slightly in that the test setup calls set_stub_network_methods because change I1dbccc2be6ba79bf267edac9208c80e187e6256a is not in Queens. Change-Id: Id233f673a57461cc312e304873a41442d732c051 (cherry picked from commit 941559042f609ee43ff3160c0f0d0c45187be17f) (cherry picked from commit 88205a4e911268dae7120a6a43ff9042d1534251) (cherry picked from commit 4de766006d9432a7ccbcf6a4d4232db472b2f0e1) (cherry picked from commit a86ebc75eb886bd293dca42439762ecdd69ca0d7) --- nova/context.py | 3 - nova/service.py | 37 ++++++++- nova/tests/functional/test_service.py | 103 ++++++++++++++++++++++++++ 3 files changed, 139 insertions(+), 4 deletions(-) create mode 100644 nova/tests/functional/test_service.py diff --git a/nova/context.py b/nova/context.py index c7153d98e515..94bbabb6f33f 100644 --- a/nova/context.py +++ b/nova/context.py @@ -37,9 +37,6 @@ from nova import policy from nova import utils LOG = logging.getLogger(__name__) -# TODO(melwitt): This cache should be cleared whenever WSGIService receives a -# SIGHUP and periodically based on an expiration time. Currently, none of the -# cell caches are purged, so neither is this one, for now. CELL_CACHE = {} # NOTE(melwitt): Used for the scatter-gather utility to indicate we timed out # waiting for a result from a cell. diff --git a/nova/service.py b/nova/service.py index ee574ba12438..3b8f2a69a684 100644 --- a/nova/service.py +++ b/nova/service.py @@ -152,6 +152,20 @@ class Service(service.Service): This includes starting an RPC service, initializing periodic tasks, etc. """ + # NOTE(melwitt): Clear the cell cache holding database transaction + # context manager objects. We do this to ensure we create new internal + # oslo.db locks to avoid a situation where a child process receives an + # already locked oslo.db lock when it is forked. When a child process + # inherits a locked oslo.db lock, database accesses through that + # transaction context manager will never be able to acquire the lock + # and requests will fail with CellTimeout errors. + # See https://bugs.python.org/issue6721 for more information. + # With python 3.7, it would be possible for oslo.db to make use of the + # os.register_at_fork() method to reinitialize its lock. Until we + # require python 3.7 as a mininum version, we must handle the situation + # outside of oslo.db. + context.CELL_CACHE = {} + assert_eventlet_uses_monotonic_clock() verstr = version.version_string_with_package() @@ -307,6 +321,8 @@ class Service(service.Service): def reset(self): """reset the service.""" self.manager.reset() + # Reset the cell cache that holds database transaction context managers + context.CELL_CACHE = {} class WSGIService(service.Service): @@ -357,13 +373,18 @@ class WSGIService(service.Service): setup_profiler(name, self.host) def reset(self): - """Reset server greenpool size to default and service version cache. + """Reset the following: + + * server greenpool size to default + * service version cache + * cell cache holding database transaction context managers :returns: None """ self.server.reset() service_obj.Service.clear_min_version_cache() + context.CELL_CACHE = {} def _get_manager(self): """Initialize a Manager object appropriate for this service. @@ -391,6 +412,20 @@ class WSGIService(service.Service): :returns: None """ + # NOTE(melwitt): Clear the cell cache holding database transaction + # context manager objects. We do this to ensure we create new internal + # oslo.db locks to avoid a situation where a child process receives an + # already locked oslo.db lock when it is forked. When a child process + # inherits a locked oslo.db lock, database accesses through that + # transaction context manager will never be able to acquire the lock + # and requests will fail with CellTimeout errors. + # See https://bugs.python.org/issue6721 for more information. + # With python 3.7, it would be possible for oslo.db to make use of the + # os.register_at_fork() method to reinitialize its lock. Until we + # require python 3.7 as a mininum version, we must handle the situation + # outside of oslo.db. + context.CELL_CACHE = {} + ctxt = context.get_admin_context() service_ref = objects.Service.get_by_host_and_binary(ctxt, self.host, self.binary) diff --git a/nova/tests/functional/test_service.py b/nova/tests/functional/test_service.py new file mode 100644 index 000000000000..25e181545c49 --- /dev/null +++ b/nova/tests/functional/test_service.py @@ -0,0 +1,103 @@ +# Licensed under the Apache License, Version 2.0 (the "License"); you may +# not use this file except in compliance with the License. You may obtain +# a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT +# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the +# License for the specific language governing permissions and limitations +# under the License. + +from nova import context as nova_context +from nova import test +from nova.tests import fixtures as nova_fixtures +from nova.tests.functional import integrated_helpers +from nova.tests.unit import fake_network +from nova.tests.unit.image import fake as fake_image +from nova.tests.unit import policy_fixture + + +class ServiceTestCase(test.TestCase, + integrated_helpers.InstanceHelperMixin): + """Contains scenarios for testing services. + """ + + def setUp(self): + super(ServiceTestCase, self).setUp() + # Use the standard fixtures. + self.useFixture(policy_fixture.RealPolicyFixture()) + self.useFixture(nova_fixtures.NeutronFixture(self)) + self.useFixture(nova_fixtures.PlacementFixture()) + fake_image.stub_out_image_service(self) + self.addCleanup(fake_image.FakeImageService_reset) + fake_network.set_stub_network_methods(self) + # Start nova controller services. + self.api = self.useFixture(nova_fixtures.OSAPIFixture( + api_version='v2.1')).api + self.start_service('conductor') + self.scheduler = self.start_service('scheduler') + # Our OSAPIFixture does not use a WSGIService, so just use the metadata + # server fixture (which uses WSGIService) for testing. + self.metadata = self.useFixture( + nova_fixtures.OSMetadataServer()).metadata + # Start one compute service. + self.start_service('compute') + + def test_service_reset_resets_cell_cache(self): + """Tests that the cell cache for database transaction context managers + is cleared after a service reset (example scenario: SIGHUP). + """ + server_req = self._build_minimal_create_server_request( + self.api, 'test-service-reset') + server = self.api.post_server({'server': server_req}) + self._wait_for_state_change(self.api, server, 'ACTIVE') + # Cell cache should be populated after creating a server. + self.assertTrue(nova_context.CELL_CACHE) + self.scheduler.reset() + # Cell cache should be empty after the service reset. + self.assertEqual({}, nova_context.CELL_CACHE) + + # Now test the WSGI service. + server = self.api.post_server({'server': server_req}) + self._wait_for_state_change(self.api, server, 'ACTIVE') + # Cell cache should be populated after creating a server. + self.assertTrue(nova_context.CELL_CACHE) + self.metadata.reset() + # Cell cache should be empty after the service reset. + self.assertEqual({}, nova_context.CELL_CACHE) + + def test_service_start_resets_cell_cache(self): + """Tests that the cell cache for database transaction context managers + is cleared upon a service start (example scenario: service start after + a SIGTERM and the parent process forks child process workers). + """ + server_req = self._build_minimal_create_server_request( + self.api, 'test-service-start') + server = self.api.post_server({'server': server_req}) + self._wait_for_state_change(self.api, server, 'ACTIVE') + # Cell cache should be populated after creating a server. + self.assertTrue(nova_context.CELL_CACHE) + self.scheduler.stop() + # NOTE(melwitt): Simulate a service starting after being stopped. The + # scenario we want to handle is one where during start, the parent + # process forks child process workers while one or more of its cached + # database transaction context managers is inside a locked code + # section. If the child processes are forked while the lock is locked, + # the child processes begin with an already locked lock that can never + # be acquired again. The result is that requests gets stuck and fail + # with a CellTimeout error. + self.scheduler.start() + # Cell cache should be empty after the service start. + self.assertEqual({}, nova_context.CELL_CACHE) + + # Now test the WSGI service. + server = self.api.post_server({'server': server_req}) + self._wait_for_state_change(self.api, server, 'ACTIVE') + # Cell cache should be populated after creating a server. + self.assertTrue(nova_context.CELL_CACHE) + self.metadata.stop() + self.metadata.start() + # Cell cache should be empty after the service reset. + self.assertEqual({}, nova_context.CELL_CACHE)