From bf89a23d91f01a29ba9b19bd0accf8de8c05f2f7 Mon Sep 17 00:00:00 2001 From: Lee Yarwood Date: Wed, 31 Mar 2021 11:00:57 +0100 Subject: [PATCH] api: Reject requests to force up computes when `done` evacuation records exist When evacuating an instance the evacuation migration record moves from a state of `accepted` to `pre-migrating` and eventually `done` once the instance has been rebuilt on the new compute host. At present the migration record remains in this state until the original compute host is restarted and the service cleans up any leftovers of the instance before it is moved to a state of `completed`. Bug #1922053 details a use case where an operator might unintentionally forget to ensure the compute service is restarted before forcing the service up leaving evacuation migration records stuck as `done`. This could become an issue in the future if the instance is moved back to this compute before the service is restarted. Any future restart invoking the cleanup logic and potentially damaging the running instance. This change aims to address this by blocking requests to force up computes associated to `done` evacuation records. Forcing operators to restart the service allowing them to move to a `completed` state before the service can be forced up again. To allow this to be backportable these requests are rejected with a 400 BadRequest return code from the API. A TODO is left to move this to a 409 Conflict during the Xena release under a new microversion. Finally, some additional functional tests have been updated to ensure they restart the source compute service of an evacuation before attempting to force up the service, ensuring any migration records are marked as completed. Closes-Bug: #1922053 Change-Id: I95882ea28564a31a6b4f8b665de462774d84edfc --- nova/api/openstack/compute/services.py | 24 +++++++++++++++++++ .../test_instance.py | 3 +++ .../regressions/test_bug_1922053.py | 21 ++++++++-------- nova/tests/functional/test_servers.py | 4 ++-- 4 files changed, 40 insertions(+), 12 deletions(-) diff --git a/nova/api/openstack/compute/services.py b/nova/api/openstack/compute/services.py index 5b4cfd0d77fc..1cc54e3973cc 100644 --- a/nova/api/openstack/compute/services.py +++ b/nova/api/openstack/compute/services.py @@ -189,6 +189,9 @@ class ServiceController(wsgi.Controller): host = body['host'] binary = body['binary'] + if binary == 'nova-compute' and forced_down is False: + self._check_for_evacuations(context, host) + ret_value = {'service': {'host': host, 'binary': binary, 'forced_down': forced_down}} @@ -225,6 +228,25 @@ class ServiceController(wsgi.Controller): return action(body, context) + def _check_for_evacuations(self, context, hostname): + # NOTE(lyarwood): When forcing a compute service back up ensure that + # there are no evacuation migration records against this host as the + # source that are marked as done, suggesting that the compute service + # hasn't restarted and moved such records to a completed state. + filters = { + 'source_compute': hostname, + 'status': 'done', + 'migration_type': objects.fields.MigrationType.EVACUATION, + } + if any(objects.MigrationList.get_by_filters(context, filters)): + msg = _("Unable to force up host %(host)s as `done` evacuation " + "migration records remain associated with the host. " + "Ensure the compute service has been restarted, " + "allowing these records to move to `completed` before " + "retrying this request.") % {'host': hostname} + # TODO(lyarwood): Move to 409 HTTPConflict under a new microversion + raise webob.exc.HTTPBadRequest(explanation=msg) + @wsgi.response(204) @wsgi.expected_errors((400, 404, 409)) def delete(self, req, id): @@ -446,6 +468,8 @@ class ServiceController(wsgi.Controller): if 'forced_down' in body: service.forced_down = strutils.bool_from_string( body['forced_down'], strict=True) + if service.forced_down is False: + self._check_for_evacuations(context, service.host) # Check to see if anything was actually updated since the schema does # not define any required fields. diff --git a/nova/tests/functional/notification_sample_tests/test_instance.py b/nova/tests/functional/notification_sample_tests/test_instance.py index 21c922845427..a263e5129348 100644 --- a/nova/tests/functional/notification_sample_tests/test_instance.py +++ b/nova/tests/functional/notification_sample_tests/test_instance.py @@ -252,6 +252,7 @@ class TestInstanceNotificationSampleWithMultipleCompute( services = self.admin_api.get_services(host='host2', binary='nova-compute') service_id = services[0]['id'] + self.compute2.stop() self.admin_api.put_service(service_id, {'forced_down': True}) evacuate = { 'evacuate': { @@ -272,6 +273,8 @@ class TestInstanceNotificationSampleWithMultipleCompute( 'reservation_id': server['reservation_id'], 'uuid': server['id']}, actual=notifications[0]) + self.compute2.start() + self._wait_for_migration_status(server, ['completed']) self.admin_api.put_service(service_id, {'forced_down': False}) def _test_live_migration_force_complete(self, server): diff --git a/nova/tests/functional/regressions/test_bug_1922053.py b/nova/tests/functional/regressions/test_bug_1922053.py index 89cc16caccb0..612be27b2b3c 100644 --- a/nova/tests/functional/regressions/test_bug_1922053.py +++ b/nova/tests/functional/regressions/test_bug_1922053.py @@ -10,6 +10,7 @@ # License for the specific language governing permissions and limitations # under the License. +from nova.tests.functional.api import client from nova.tests.functional import integrated_helpers @@ -61,16 +62,13 @@ class ForceUpWithDoneEvacuations(integrated_helpers._IntegratedTestBase): expected_migration_status='done' ) - # FIXME(lyarwood): This is bug #1922053, this shouldn't be allowed with - # `done` evacuation migration records still associated with the host. - # Replace this with the following assertion once fixed: - # ex = self.assertRaises( - # client.OpenStackApiException, - # self._force_up_compute, - # 'compute', - # ) - # self.assertEqual(400, ex.response.status_code) - self._force_up_compute('compute') + # Assert that the request to force up the host is rejected + ex = self.assertRaises( + client.OpenStackApiException, + self._force_up_compute, + 'compute', + ) + self.assertEqual(400, ex.response.status_code) # Assert that the evacuation migration record remains `done` self._wait_for_migration_status(server, ["done"]) @@ -82,6 +80,9 @@ class ForceUpWithDoneEvacuations(integrated_helpers._IntegratedTestBase): # Assert that the evacuation migration record is now `completed` self._wait_for_migration_status(server, ["completed"]) + # Assert that we can now force up the host + self._force_up_compute('compute') + class ForceUpWithDoneEvacuationsv252(ForceUpWithDoneEvacuations): diff --git a/nova/tests/functional/test_servers.py b/nova/tests/functional/test_servers.py index b3b155ff3400..e189d86a7107 100644 --- a/nova/tests/functional/test_servers.py +++ b/nova/tests/functional/test_servers.py @@ -7181,9 +7181,9 @@ class ServerMoveWithPortResourceRequestTest( self._assert_pci_request_pf_device_name(server, 'host2-ens2') # recover source compute + self.compute1 = self.restart_compute_service(self.compute1) self.admin_api.put_service( self.compute1_service_id, {'forced_down': 'false'}) - self.compute1 = self.restart_compute_service(self.compute1) # check that source allocation is cleaned up and the dest allocation # and the port bindings are not touched. @@ -7226,9 +7226,9 @@ class ServerMoveWithPortResourceRequestTest( qos_normal_port, qos_sriov_port, self.flavor_with_group_policy) # recover source compute + self.compute1 = self.restart_compute_service(self.compute1) self.admin_api.put_service( self.compute1_service_id, {'forced_down': 'false'}) - self.compute1 = self.restart_compute_service(self.compute1) # check again that even after source host recovery the source # allocation is intact