From 2794748d9c58623045023f34c7793c58ce41447c Mon Sep 17 00:00:00 2001
From: Balazs Gibizer
Date: Wed, 1 May 2019 23:38:40 +0200
Subject: [PATCH] Enhance service restart in functional env

Bugfix Icaf1bae8cb040b939f916a19ce026031ddb84af7 showed that restarting
a compute service in the functional env is unrealistic, causing faults
to slip through. During that bug fix only the minimal change was made
to the compute service restart in the functional env to reproduce the
reported fault. However, the restart of the compute service could be
made even more realistic.

This patch simulates a compute service restart in the functional env by
stopping the original compute service and starting a totally new
compute service for the same host and node. This way we can make sure
that we get a brand new ComputeManager in the new service and no state
can leak between the old and the new service.

This change revealed another shortcoming of the functional env. In the
real world the nova-compute service can be restarted without losing any
running servers on the compute host. But with the naive implementation
of this change the compute service is re-created. This means that a new
ComputeManager is instantiated that loads a new FakeDriver instance as
well. That new FakeDriver instance then reports an empty hypervisor.
This behavior is not totally unrealistic as it simulates a compute host
restart that also cleans the hypervisor state (e.g. compute host
redeployment). However, this type of restart reveals another bug in the
code path that destroys and deallocates evacuated instances from the
source host. Therefore this patch implements the compute service
restart in a way that simulates only a service restart and not a full
compute host restart. A subsequent patch will add a test that uses the
clean hypervisor case to reproduce the revealed bug.

Related-Bug: #1724172
Change-Id: I9d6cd6259659a35383c0c9c21db72a9434ba86b1
---
 .../libvirt-connect-error.json                |  2 +-
 nova/test.py                                  | 59 ++++++++++++++-----
 nova/tests/functional/libvirt/test_reshape.py |  2 +-
 .../regressions/test_bug_1794996.py           |  2 +-
 nova/tests/functional/test_servers.py         | 12 ++--
 5 files changed, 54 insertions(+), 23 deletions(-)

diff --git a/doc/notification_samples/libvirt-connect-error.json b/doc/notification_samples/libvirt-connect-error.json
index a936d339ab21..1d29ac5fa5aa 100644
--- a/doc/notification_samples/libvirt-connect-error.json
+++ b/doc/notification_samples/libvirt-connect-error.json
@@ -21,5 +21,5 @@
         "nova_object.version": "1.0"
     },
     "priority": "ERROR",
-    "publisher_id": "nova-compute:fake-mini"
+    "publisher_id": "nova-compute:compute"
 }
diff --git a/nova/test.py b/nova/test.py
index a833a4b2fcf3..04062d1f58a3 100644
--- a/nova/test.py
+++ b/nova/test.py
@@ -51,7 +51,6 @@ import six
 from six.moves import builtins
 import testtools
 
-from nova.compute import resource_tracker
 from nova.compute import rpcapi as compute_rpcapi
 from nova import context
 from nova.db import api as db
@@ -432,20 +431,29 @@ class TestCase(testtools.TestCase):
             ctxt = context.get_context()
             cell_name = kwargs.pop('cell', CELL1_NAME) or CELL1_NAME
             cell = self.cell_mappings[cell_name]
-            hm = objects.HostMapping(context=ctxt,
-                                     host=host or name,
-                                     cell_mapping=cell)
-            hm.create()
-            self.host_mappings[hm.host] = hm
+            if (host or name) not in self.host_mappings:
+                # NOTE(gibi): If the HostMapping does not exist then this is
+                # the first start of the service so we create the mapping.
+                hm = objects.HostMapping(context=ctxt,
+                                         host=host or name,
+                                         cell_mapping=cell)
+                hm.create()
+                self.host_mappings[hm.host] = hm
         svc = self.useFixture(
             nova_fixtures.ServiceFixture(name, host, cell=cell, **kwargs))
 
         return svc.service
 
-    def restart_compute_service(self, compute):
-        """Restart a compute service in a realistic way.
+    def restart_compute_service(self, compute, keep_hypervisor_state=True):
+        """Stops the service and starts a new one to have a realistic restart
 
         :param:compute: the nova-compute service to be restarted
+        :param:keep_hypervisor_state: If true then already defined instances
+                                      will survive the compute service restart.
+                                      If false then the new service will see
+                                      an empty hypervisor
+        :returns: a new compute service instance serving the same host and
+                  node
         """
 
         # NOTE(gibi): The service interface cannot be used to simulate a real
@@ -455,13 +463,36 @@ class TestCase(testtools.TestCase):
         # a stop start. The service.kill() call cannot help as it deletes
         # the service from the DB which is unrealistic and causes that some
         # operation that refers to the killed host (e.g. evacuate) fails.
-        # So this helper method tries to simulate a better compute service
-        # restart by cleaning up some of the internal state of the compute
-        # manager.
-        host, driver = compute.manager.host, compute.manager.driver
+        # So this helper method will stop the original service and then start
+        # a brand new compute service for the same host and node. This way
+        # a new ComputeManager instance will be created and initialized during
+        # the service startup.
         compute.stop()
-        compute.manager.rt = resource_tracker.ResourceTracker(host, driver)
-        compute.start()
+
+        # this service was running previously so we have to make sure that
+        # we restart it in the same cell
+        cell_name = self.host_mappings[compute.host].cell_mapping.name
+
+        if keep_hypervisor_state:
+            # NOTE(gibi): FakeDriver does not provide a meaningful way to
+            # define servers that already exist on the hypervisor when
+            # the driver is (re)created during the service startup. This means
+            # that we cannot simulate that the definition of a server
+            # survives a nova-compute service restart on the hypervisor.
+            # Instead here we save the FakeDriver instance that knows about
+            # the defined servers and inject that driver into the new
+            # ComputeManager during the startup of the compute service.
+            old_driver = compute.manager.driver
+            with mock.patch(
+                    'nova.virt.driver.load_compute_driver') as load_driver:
+                load_driver.return_value = old_driver
+                new_compute = self.start_service(
+                    'compute', host=compute.host, cell=cell_name)
+        else:
+            new_compute = self.start_service(
+                'compute', host=compute.host, cell=cell_name)
+
+        return new_compute
 
     def assertJsonEqual(self, expected, observed, message=''):
         """Asserts that 2 complex data structures are json equivalent.
diff --git a/nova/tests/functional/libvirt/test_reshape.py b/nova/tests/functional/libvirt/test_reshape.py
index ec827e9d9f08..5d91e9ccaf9c 100644
--- a/nova/tests/functional/libvirt/test_reshape.py
+++ b/nova/tests/functional/libvirt/test_reshape.py
@@ -145,7 +145,7 @@ class VGPUReshapeTests(base.ServersTestBase):
             enabled_vgpu_types=fakelibvirt.NVIDIA_11_VGPU_TYPE,
             group='devices')
         # restart compute which will trigger a reshape
-        self.restart_compute_service(self.compute)
+        self.compute = self.restart_compute_service(self.compute)
 
         # verify that the inventory, usages and allocation are correct after
         # the reshape
diff --git a/nova/tests/functional/regressions/test_bug_1794996.py b/nova/tests/functional/regressions/test_bug_1794996.py
index f854c4e358cf..3b5802789444 100644
--- a/nova/tests/functional/regressions/test_bug_1794996.py
+++ b/nova/tests/functional/regressions/test_bug_1794996.py
@@ -78,4 +78,4 @@ class TestEvacuateDeleteServerRestartOriginalCompute(
         self._delete_and_check_allocations(server)
 
         # restart the source compute
-        self.restart_compute_service(self.compute1)
+        self.compute1 = self.restart_compute_service(self.compute1)
diff --git a/nova/tests/functional/test_servers.py b/nova/tests/functional/test_servers.py
index e26db090db91..ca56d0aee9b2 100644
--- a/nova/tests/functional/test_servers.py
+++ b/nova/tests/functional/test_servers.py
@@ -2284,7 +2284,7 @@ class ServerMovingTests(integrated_helpers.ProviderUsageBaseTestCase):
             self.flavor1, server['id'], source_rp_uuid, dest_rp_uuid)
 
         # restart the source compute
-        self.restart_compute_service(self.compute1)
+        self.compute1 = self.restart_compute_service(self.compute1)
 
         self.admin_api.put_service(
             source_compute_id, {'forced_down': 'false'})
@@ -2355,7 +2355,7 @@ class ServerMovingTests(integrated_helpers.ProviderUsageBaseTestCase):
             self.flavor1, server['id'], source_rp_uuid, dest_rp_uuid)
 
         # restart the source compute
-        self.restart_compute_service(self.compute1)
+        self.compute1 = self.restart_compute_service(self.compute1)
 
         self.admin_api.put_service(
             source_compute_id, {'forced_down': 'false'})
@@ -2453,7 +2453,7 @@ class ServerMovingTests(integrated_helpers.ProviderUsageBaseTestCase):
             self.flavor1, server['id'], source_rp_uuid, dest_rp_uuid)
 
         # restart the source compute
-        self.restart_compute_service(self.compute1)
+        self.compute1 = self.restart_compute_service(self.compute1)
 
         self.admin_api.put_service(
             source_compute_id, {'forced_down': 'false'})
@@ -2533,7 +2533,7 @@ class ServerMovingTests(integrated_helpers.ProviderUsageBaseTestCase):
             source_rp_uuid)
 
         # restart the source compute
-        self.restart_compute_service(self.compute1)
+        self.compute1 = self.restart_compute_service(self.compute1)
 
         self.admin_api.put_service(
             source_compute_id, {'forced_down': 'false'})
@@ -2600,7 +2600,7 @@ class ServerMovingTests(integrated_helpers.ProviderUsageBaseTestCase):
             source_rp_uuid)
 
         # restart the source compute
-        self.restart_compute_service(self.compute1)
+        self.compute1 = self.restart_compute_service(self.compute1)
 
         self.admin_api.put_service(
             source_compute_id, {'forced_down': 'false'})
@@ -5196,7 +5196,7 @@ class ServerMovingTestsWithNestedResourceRequests(
             source_rp_uuid)
 
         # restart the source compute
-        self.restart_compute_service(self.compute1)
+        self.compute1 = self.restart_compute_service(self.compute1)
 
         self.admin_api.put_service(
             source_compute_id, {'forced_down': 'false'})
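A minimal sketch of how a follow-up functional test could consume the new
return value and the keep_hypervisor_state=False mode described in the commit
message. This is not part of the patch: the class name, test name, and test
body are illustrative assumptions, and the helper calls are assumed to be
available on the existing ProviderUsageBaseTestCase base class.

# Illustrative sketch only; restart_compute_service and start_service come
# from this patch, everything else below is an assumption.
from nova.tests.functional import integrated_helpers


class EvacuateAfterHostRedeploySketch(
        integrated_helpers.ProviderUsageBaseTestCase):

    def test_restart_source_compute_with_clean_hypervisor(self):
        source = self._start_compute('host1')
        self._start_compute('host2')
        flavor = self.api.get_flavors()[0]
        self._boot_and_check_allocations(flavor, 'host1')

        # ... force host1 down and evacuate the server to host2 here ...

        # Restart the source compute as if the host had been redeployed:
        # the new service loads a fresh FakeDriver that reports an empty
        # hypervisor, which is the case the follow-up regression test
        # mentioned in the commit message is meant to exercise.
        source = self.restart_compute_service(
            source, keep_hypervisor_state=False)

        # The original service object has been stopped and replaced, so any
        # later assertions must use the returned service instance.
        self.assertIsNotNone(source.manager)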