api: Reject requests to force up computes when `done` evacuation records exist

When evacuating an instance, the evacuation migration record moves from
a state of `accepted` to `pre-migrating` and finally to `done` once the
instance has been rebuilt on the new compute host. At present the
migration record remains in this state until the compute service on the
original host is restarted and cleans up any leftovers of the instance,
at which point the record moves to a state of `completed`.
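
For reference, the progression described above can be summarised as
follows (illustrative only, not code from this change):

    # Illustrative summary of the evacuation migration record lifecycle.
    EVACUATION_RECORD_STATUSES = (
        'accepted',       # evacuation request accepted
        'pre-migrating',  # rebuild on the destination host starting
        'done',           # instance rebuilt on the destination host
        'completed',      # source compute restarted, leftovers removed
    )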

Bug #1922053 details a use case where an operator might forget to
ensure the compute service has been restarted before forcing the
service up, leaving evacuation migration records stuck in the `done`
state.
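
"Forcing the service up" here means clearing the service's forced_down
flag via the os-services API. A minimal sketch using the 2.53
microversion form, where the endpoint, token and service UUID are
assumed placeholders:

    # Hedged sketch: clearing forced_down via PUT /os-services/{id}
    # (microversion >= 2.53). Endpoint, token and UUID are assumptions.
    import requests

    NOVA_API = 'http://controller:8774/v2.1'
    HEADERS = {
        'X-Auth-Token': 'ADMIN_TOKEN',
        'X-OpenStack-Nova-API-Version': '2.53',
    }
    SERVICE_ID = 'e81d66a4-ddd3-4aba-8a84-171d1cb4d339'

    resp = requests.put(
        '%s/os-services/%s' % (NOVA_API, SERVICE_ID),
        json={'forced_down': False},
        headers=HEADERS,
    )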

This could become an issue later if the instance is moved back to this
compute before the service is restarted: any future restart would then
invoke the cleanup logic, potentially damaging the running instance.
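
The danger can be sketched as follows; this is a hypothetical
simplification of the restart-time cleanup, not Nova's actual
implementation:

    # Hypothetical sketch: on restart, a compute service treats any
    # local instance with a `done` evacuation record sourced from this
    # host as a leftover and destroys it. If the instance was moved back
    # here before the restart, that destroys a running instance.
    def cleanup_evacuated_instances(host, local_instances, migrations):
        for migration in migrations:
            if (migration.source_compute == host
                    and migration.status == 'done'):
                instance = local_instances.get(migration.instance_uuid)
                if instance is not None:
                    instance.destroy()
                migration.status = 'completed'
                migration.save()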

This change addresses the issue by blocking requests to force up
computes associated with `done` evacuation records, forcing operators
to restart the service so that these records move to a `completed`
state before the service can be forced up again.

To keep this change backportable, these requests are rejected with a
400 BadRequest response code from the API. A TODO is left to move this
to a 409 Conflict under a new microversion during the Xena release.
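
As a sketch of the resulting behaviour against the legacy force-down
action (endpoint and token again assumed placeholders); prior to this
change the request below succeeded:

    # Hedged sketch: the legacy action used to force a service back up.
    # With `done` evacuation records against the host it now returns 400.
    import requests

    resp = requests.put(
        'http://controller:8774/v2.1/os-services/force-down',
        json={'host': 'compute1', 'binary': 'nova-compute',
              'forced_down': False},
        headers={'X-Auth-Token': 'ADMIN_TOKEN'},
    )
    assert resp.status_code == 400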

Finally, several existing functional tests have been updated to restart
the source compute service of an evacuation before attempting to force
the service up, ensuring any migration records are first marked as
`completed`.

Closes-Bug: #1922053
Change-Id: I95882ea28564a31a6b4f8b665de462774d84edfc
Author: Lee Yarwood
Date:   2021-03-31 11:00:57 +01:00
Parent: db333070b6
Commit: bf89a23d91

4 changed files with 40 additions and 12 deletions


@@ -189,6 +189,9 @@ class ServiceController(wsgi.Controller):
         host = body['host']
         binary = body['binary']

+        if binary == 'nova-compute' and forced_down is False:
+            self._check_for_evacuations(context, host)
+
         ret_value = {'service': {'host': host,
                                  'binary': binary,
                                  'forced_down': forced_down}}
@@ -225,6 +228,25 @@ class ServiceController(wsgi.Controller):

         return action(body, context)

+    def _check_for_evacuations(self, context, hostname):
+        # NOTE(lyarwood): When forcing a compute service back up ensure that
+        # there are no evacuation migration records against this host as the
+        # source that are marked as done, suggesting that the compute service
+        # hasn't restarted and moved such records to a completed state.
+        filters = {
+            'source_compute': hostname,
+            'status': 'done',
+            'migration_type': objects.fields.MigrationType.EVACUATION,
+        }
+        if any(objects.MigrationList.get_by_filters(context, filters)):
+            msg = _("Unable to force up host %(host)s as `done` evacuation "
+                    "migration records remain associated with the host. "
+                    "Ensure the compute service has been restarted, "
+                    "allowing these records to move to `completed` before "
+                    "retrying this request.") % {'host': hostname}
+            # TODO(lyarwood): Move to 409 HTTPConflict under a new microversion
+            raise webob.exc.HTTPBadRequest(explanation=msg)
+
     @wsgi.response(204)
     @wsgi.expected_errors((400, 404, 409))
     def delete(self, req, id):
@@ -446,6 +468,8 @@ class ServiceController(wsgi.Controller):
         if 'forced_down' in body:
             service.forced_down = strutils.bool_from_string(
                 body['forced_down'], strict=True)
+            if service.forced_down is False:
+                self._check_for_evacuations(context, service.host)

         # Check to see if anything was actually updated since the schema does
         # not define any required fields.


@@ -252,6 +252,7 @@ class TestInstanceNotificationSampleWithMultipleCompute(
         services = self.admin_api.get_services(host='host2',
                                                binary='nova-compute')
         service_id = services[0]['id']
+        self.compute2.stop()
         self.admin_api.put_service(service_id, {'forced_down': True})
         evacuate = {
             'evacuate': {
@@ -272,6 +273,8 @@ class TestInstanceNotificationSampleWithMultipleCompute(
                 'reservation_id': server['reservation_id'],
                 'uuid': server['id']},
             actual=notifications[0])
+        self.compute2.start()
+        self._wait_for_migration_status(server, ['completed'])
         self.admin_api.put_service(service_id, {'forced_down': False})

     def _test_live_migration_force_complete(self, server):


@@ -10,6 +10,7 @@
 # License for the specific language governing permissions and limitations
 # under the License.

+from nova.tests.functional.api import client
 from nova.tests.functional import integrated_helpers
@@ -61,16 +62,13 @@ class ForceUpWithDoneEvacuations(integrated_helpers._IntegratedTestBase):
             expected_migration_status='done'
         )

-        # FIXME(lyarwood): This is bug #1922053, this shouldn't be allowed with
-        # `done` evacuation migration records still associated with the host.
-        # Replace this with the following assertion once fixed:
-        # ex = self.assertRaises(
-        #     client.OpenStackApiException,
-        #     self._force_up_compute,
-        #     'compute',
-        # )
-        # self.assertEqual(400, ex.response.status_code)
-        self._force_up_compute('compute')
+        # Assert that the request to force up the host is rejected
+        ex = self.assertRaises(
+            client.OpenStackApiException,
+            self._force_up_compute,
+            'compute',
+        )
+        self.assertEqual(400, ex.response.status_code)

         # Assert that the evacuation migration record remains `done`
         self._wait_for_migration_status(server, ["done"])
@@ -82,6 +80,9 @@ class ForceUpWithDoneEvacuations(integrated_helpers._IntegratedTestBase):
         # Assert that the evacuation migration record is now `completed`
         self._wait_for_migration_status(server, ["completed"])

+        # Assert that we can now force up the host
+        self._force_up_compute('compute')
+

 class ForceUpWithDoneEvacuationsv252(ForceUpWithDoneEvacuations):


@@ -7181,9 +7181,9 @@ class ServerMoveWithPortResourceRequestTest(
         self._assert_pci_request_pf_device_name(server, 'host2-ens2')

         # recover source compute
+        self.compute1 = self.restart_compute_service(self.compute1)
         self.admin_api.put_service(
             self.compute1_service_id, {'forced_down': 'false'})
-        self.compute1 = self.restart_compute_service(self.compute1)

         # check that source allocation is cleaned up and the dest allocation
         # and the port bindings are not touched.
@@ -7226,9 +7226,9 @@ class ServerMoveWithPortResourceRequestTest(
             qos_normal_port, qos_sriov_port, self.flavor_with_group_policy)

         # recover source compute
+        self.compute1 = self.restart_compute_service(self.compute1)
         self.admin_api.put_service(
             self.compute1_service_id, {'forced_down': 'false'})
-        self.compute1 = self.restart_compute_service(self.compute1)

         # check again that even after source host recovery the source
         # allocation is intact