diff --git a/nova/context.py b/nova/context.py index 90aaa14a398a..e72fd05a9c7d 100644 --- a/nova/context.py +++ b/nova/context.py @@ -37,9 +37,6 @@ from nova import policy from nova import utils LOG = logging.getLogger(__name__) -# TODO(melwitt): This cache should be cleared whenever WSGIService receives a -# SIGHUP and periodically based on an expiration time. Currently, none of the -# cell caches are purged, so neither is this one, for now. CELL_CACHE = {} # NOTE(melwitt): Used for the scatter-gather utility to indicate we timed out # waiting for a result from a cell. diff --git a/nova/service.py b/nova/service.py index 2b903064cf19..16703a945006 100644 --- a/nova/service.py +++ b/nova/service.py @@ -149,6 +149,20 @@ class Service(service.Service): This includes starting an RPC service, initializing periodic tasks, etc. """ + # NOTE(melwitt): Clear the cell cache holding database transaction + # context manager objects. We do this to ensure we create new internal + # oslo.db locks to avoid a situation where a child process receives an + # already locked oslo.db lock when it is forked. When a child process + # inherits a locked oslo.db lock, database accesses through that + # transaction context manager will never be able to acquire the lock + # and requests will fail with CellTimeout errors. + # See https://bugs.python.org/issue6721 for more information. + # With python 3.7, it would be possible for oslo.db to make use of the + # os.register_at_fork() method to reinitialize its lock. Until we + # require python 3.7 as a mininum version, we must handle the situation + # outside of oslo.db. + context.CELL_CACHE = {} + assert_eventlet_uses_monotonic_clock() verstr = version.version_string_with_package() @@ -304,6 +318,8 @@ class Service(service.Service): def reset(self): """reset the service.""" self.manager.reset() + # Reset the cell cache that holds database transaction context managers + context.CELL_CACHE = {} class WSGIService(service.Service): @@ -358,13 +374,18 @@ class WSGIService(service.Service): setup_profiler(name, self.host) def reset(self): - """Reset server greenpool size to default and service version cache. + """Reset the following: + + * server greenpool size to default + * service version cache + * cell cache holding database transaction context managers :returns: None """ self.server.reset() service_obj.Service.clear_min_version_cache() + context.CELL_CACHE = {} def _get_manager(self): """Initialize a Manager object appropriate for this service. @@ -392,6 +413,20 @@ class WSGIService(service.Service): :returns: None """ + # NOTE(melwitt): Clear the cell cache holding database transaction + # context manager objects. We do this to ensure we create new internal + # oslo.db locks to avoid a situation where a child process receives an + # already locked oslo.db lock when it is forked. When a child process + # inherits a locked oslo.db lock, database accesses through that + # transaction context manager will never be able to acquire the lock + # and requests will fail with CellTimeout errors. + # See https://bugs.python.org/issue6721 for more information. + # With python 3.7, it would be possible for oslo.db to make use of the + # os.register_at_fork() method to reinitialize its lock. Until we + # require python 3.7 as a mininum version, we must handle the situation + # outside of oslo.db. + context.CELL_CACHE = {} + ctxt = context.get_admin_context() service_ref = objects.Service.get_by_host_and_binary(ctxt, self.host, self.binary) diff --git a/nova/tests/functional/test_service.py b/nova/tests/functional/test_service.py new file mode 100644 index 000000000000..c046772cd2b3 --- /dev/null +++ b/nova/tests/functional/test_service.py @@ -0,0 +1,100 @@ +# Licensed under the Apache License, Version 2.0 (the "License"); you may +# not use this file except in compliance with the License. You may obtain +# a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT +# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the +# License for the specific language governing permissions and limitations +# under the License. + +from nova import context as nova_context +from nova import test +from nova.tests import fixtures as nova_fixtures +from nova.tests.functional import fixtures as func_fixtures +from nova.tests.functional import integrated_helpers +from nova.tests.unit.image import fake as fake_image +from nova.tests.unit import policy_fixture + + +class ServiceTestCase(test.TestCase, + integrated_helpers.InstanceHelperMixin): + """Contains scenarios for testing services. + """ + + def setUp(self): + super(ServiceTestCase, self).setUp() + # Use the standard fixtures. + self.useFixture(policy_fixture.RealPolicyFixture()) + self.useFixture(nova_fixtures.NeutronFixture(self)) + self.useFixture(func_fixtures.PlacementFixture()) + fake_image.stub_out_image_service(self) + self.addCleanup(fake_image.FakeImageService_reset) + # Start nova controller services. + self.api = self.useFixture(nova_fixtures.OSAPIFixture( + api_version='v2.1')).api + self.start_service('conductor') + self.scheduler = self.start_service('scheduler') + # Our OSAPIFixture does not use a WSGIService, so just use the metadata + # server fixture (which uses WSGIService) for testing. + self.metadata = self.useFixture( + nova_fixtures.OSMetadataServer()).metadata + # Start one compute service. + self.start_service('compute') + + def test_service_reset_resets_cell_cache(self): + """Tests that the cell cache for database transaction context managers + is cleared after a service reset (example scenario: SIGHUP). + """ + server_req = self._build_server() + server = self.api.post_server({'server': server_req}) + self._wait_for_state_change(server, 'ACTIVE') + # Cell cache should be populated after creating a server. + self.assertTrue(nova_context.CELL_CACHE) + self.scheduler.reset() + # Cell cache should be empty after the service reset. + self.assertEqual({}, nova_context.CELL_CACHE) + + # Now test the WSGI service. + server = self.api.post_server({'server': server_req}) + self._wait_for_state_change(server, 'ACTIVE') + # Cell cache should be populated after creating a server. + self.assertTrue(nova_context.CELL_CACHE) + self.metadata.reset() + # Cell cache should be empty after the service reset. + self.assertEqual({}, nova_context.CELL_CACHE) + + def test_service_start_resets_cell_cache(self): + """Tests that the cell cache for database transaction context managers + is cleared upon a service start (example scenario: service start after + a SIGTERM and the parent process forks child process workers). + """ + server_req = self._build_server() + server = self.api.post_server({'server': server_req}) + self._wait_for_state_change(server, 'ACTIVE') + # Cell cache should be populated after creating a server. + self.assertTrue(nova_context.CELL_CACHE) + self.scheduler.stop() + # NOTE(melwitt): Simulate a service starting after being stopped. The + # scenario we want to handle is one where during start, the parent + # process forks child process workers while one or more of its cached + # database transaction context managers is inside a locked code + # section. If the child processes are forked while the lock is locked, + # the child processes begin with an already locked lock that can never + # be acquired again. The result is that requests gets stuck and fail + # with a CellTimeout error. + self.scheduler.start() + # Cell cache should be empty after the service start. + self.assertEqual({}, nova_context.CELL_CACHE) + + # Now test the WSGI service. + server = self.api.post_server({'server': server_req}) + self._wait_for_state_change(server, 'ACTIVE') + # Cell cache should be populated after creating a server. + self.assertTrue(nova_context.CELL_CACHE) + self.metadata.stop() + self.metadata.start() + # Cell cache should be empty after the service reset. + self.assertEqual({}, nova_context.CELL_CACHE)