api: Reject requests to force up computes when `done` evacuation records exist

When evacuating an instance, the evacuation migration record moves from
a state of `accepted` to `pre-migrating` and finally to `done` once the
instance has been rebuilt on the new compute host. At present the
migration record remains in this state until the compute service on the
original host is restarted and cleans up any leftovers of the instance,
at which point the record moves to a state of `completed`.
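
For reference, the progression described above can be summarised as
follows (illustrative only, not code from this change):

    # Illustrative summary of the evacuation migration record lifecycle.
    EVACUATION_RECORD_STATUSES = (
        'accepted',       # evacuation request accepted
        'pre-migrating',  # rebuild on the destination host starting
        'done',           # instance rebuilt on the destination host
        'completed',      # source compute restarted, leftovers removed
    )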

Bug #1922053 details a use case where an operator might forget to
ensure the compute service has been restarted before forcing the
service up, leaving evacuation migration records stuck in the `done`
state.
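
"Forcing the service up" here means clearing the service's forced_down
flag via the os-services API. A minimal sketch using the 2.53
microversion form, where the endpoint, token and service UUID are
assumed placeholders:

    # Hedged sketch: clearing forced_down via PUT /os-services/{id}
    # (microversion >= 2.53). Endpoint, token and UUID are assumptions.
    import requests

    NOVA_API = 'http://controller:8774/v2.1'
    HEADERS = {
        'X-Auth-Token': 'ADMIN_TOKEN',
        'X-OpenStack-Nova-API-Version': '2.53',
    }
    SERVICE_ID = 'e81d66a4-ddd3-4aba-8a84-171d1cb4d339'

    resp = requests.put(
        '%s/os-services/%s' % (NOVA_API, SERVICE_ID),
        json={'forced_down': False},
        headers=HEADERS,
    )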

This could become an issue later if the instance is moved back to this
compute before the service is restarted: any future restart would then
invoke the cleanup logic, potentially damaging the running instance.
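
The danger can be sketched as follows; this is a hypothetical
simplification of the restart-time cleanup, not Nova's actual
implementation:

    # Hypothetical sketch: on restart, a compute service treats any
    # local instance with a `done` evacuation record sourced from this
    # host as a leftover and destroys it. If the instance was moved back
    # here before the restart, that destroys a running instance.
    def cleanup_evacuated_instances(host, local_instances, migrations):
        for migration in migrations:
            if (migration.source_compute == host
                    and migration.status == 'done'):
                instance = local_instances.get(migration.instance_uuid)
                if instance is not None:
                    instance.destroy()
                migration.status = 'completed'
                migration.save()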

This change addresses the issue by blocking requests to force up
computes associated with `done` evacuation records, forcing operators
to restart the service so that these records move to a `completed`
state before the service can be forced up again.

To keep this change backportable, these requests are rejected with a
400 BadRequest response code from the API. A TODO is left to move this
to a 409 Conflict under a new microversion during the Xena release.
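
As a sketch of the resulting behaviour against the legacy force-down
action (endpoint and token again assumed placeholders); prior to this
change the request below succeeded:

    # Hedged sketch: the legacy action used to force a service back up.
    # With `done` evacuation records against the host it now returns 400.
    import requests

    resp = requests.put(
        'http://controller:8774/v2.1/os-services/force-down',
        json={'host': 'compute1', 'binary': 'nova-compute',
              'forced_down': False},
        headers={'X-Auth-Token': 'ADMIN_TOKEN'},
    )
    assert resp.status_code == 400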

Finally, several existing functional tests have been updated to restart
the source compute service of an evacuation before attempting to force
the service up, ensuring any migration records are first marked as
`completed`.

Closes-Bug: #1922053
Change-Id: I95882ea28564a31a6b4f8b665de462774d84edfc
Author: Lee Yarwood
Date:   2021-03-31 11:00:57 +01:00
Parent: db333070b6
Commit: bf89a23d91

4 changed files with 40 additions and 12 deletions


@@ -189,6 +189,9 @@ class ServiceController(wsgi.Controller):
         host = body['host']
         binary = body['binary']

+        if binary == 'nova-compute' and forced_down is False:
+            self._check_for_evacuations(context, host)
+
         ret_value = {'service': {'host': host,
                                  'binary': binary,
                                  'forced_down': forced_down}}
@@ -225,6 +228,25 @@ class ServiceController(wsgi.Controller):

         return action(body, context)

+    def _check_for_evacuations(self, context, hostname):
+        # NOTE(lyarwood): When forcing a compute service back up ensure that
+        # there are no evacuation migration records against this host as the
+        # source that are marked as done, suggesting that the compute service
+        # hasn't restarted and moved such records to a completed state.
+        filters = {
+            'source_compute': hostname,
+            'status': 'done',
+            'migration_type': objects.fields.MigrationType.EVACUATION,
+        }
+        if any(objects.MigrationList.get_by_filters(context, filters)):
+            msg = _("Unable to force up host %(host)s as `done` evacuation "
+                    "migration records remain associated with the host. "
+                    "Ensure the compute service has been restarted, "
+                    "allowing these records to move to `completed` before "
+                    "retrying this request.") % {'host': hostname}
+            # TODO(lyarwood): Move to 409 HTTPConflict under a new microversion
+            raise webob.exc.HTTPBadRequest(explanation=msg)
+
     @wsgi.response(204)
     @wsgi.expected_errors((400, 404, 409))
     def delete(self, req, id):
@@ -446,6 +468,8 @@ class ServiceController(wsgi.Controller):
         if 'forced_down' in body:
             service.forced_down = strutils.bool_from_string(
                 body['forced_down'], strict=True)
+            if service.forced_down is False:
+                self._check_for_evacuations(context, service.host)

         # Check to see if anything was actually updated since the schema does
         # not define any required fields.


@@ -252,6 +252,7 @@ class TestInstanceNotificationSampleWithMultipleCompute(
         services = self.admin_api.get_services(host='host2',
                                                binary='nova-compute')
         service_id = services[0]['id']
+        self.compute2.stop()
         self.admin_api.put_service(service_id, {'forced_down': True})
         evacuate = {
             'evacuate': {
@@ -272,6 +273,8 @@ class TestInstanceNotificationSampleWithMultipleCompute(
                 'reservation_id': server['reservation_id'],
                 'uuid': server['id']},
             actual=notifications[0])
+        self.compute2.start()
+        self._wait_for_migration_status(server, ['completed'])
         self.admin_api.put_service(service_id, {'forced_down': False})

     def _test_live_migration_force_complete(self, server):


@@ -10,6 +10,7 @@
 # License for the specific language governing permissions and limitations
 # under the License.

+from nova.tests.functional.api import client
 from nova.tests.functional import integrated_helpers
@@ -61,16 +62,13 @@ class ForceUpWithDoneEvacuations(integrated_helpers._IntegratedTestBase):
             expected_migration_status='done'
         )

-        # FIXME(lyarwood): This is bug #1922053, this shouldn't be allowed with
-        # `done` evacuation migration records still associated with the host.
-        # Replace this with the following assertion once fixed:
-        # ex = self.assertRaises(
-        #     client.OpenStackApiException,
-        #     self._force_up_compute,
-        #     'compute',
-        # )
-        # self.assertEqual(400, ex.response.status_code)
-        self._force_up_compute('compute')
+        # Assert that the request to force up the host is rejected
+        ex = self.assertRaises(
+            client.OpenStackApiException,
+            self._force_up_compute,
+            'compute',
+        )
+        self.assertEqual(400, ex.response.status_code)

         # Assert that the evacuation migration record remains `done`
         self._wait_for_migration_status(server, ["done"])
@@ -82,6 +80,9 @@ class ForceUpWithDoneEvacuations(integrated_helpers._IntegratedTestBase):
         # Assert that the evacuation migration record is now `completed`
         self._wait_for_migration_status(server, ["completed"])

+        # Assert that we can now force up the host
+        self._force_up_compute('compute')
+

 class ForceUpWithDoneEvacuationsv252(ForceUpWithDoneEvacuations):


@@ -7181,9 +7181,9 @@ class ServerMoveWithPortResourceRequestTest(
         self._assert_pci_request_pf_device_name(server, 'host2-ens2')

         # recover source compute
+        self.compute1 = self.restart_compute_service(self.compute1)
         self.admin_api.put_service(
             self.compute1_service_id, {'forced_down': 'false'})
-        self.compute1 = self.restart_compute_service(self.compute1)

         # check that source allocation is cleaned up and the dest allocation
         # and the port bindings are not touched.
@@ -7226,9 +7226,9 @@ class ServerMoveWithPortResourceRequestTest(
             qos_normal_port, qos_sriov_port, self.flavor_with_group_policy)

         # recover source compute
+        self.compute1 = self.restart_compute_service(self.compute1)
         self.admin_api.put_service(
             self.compute1_service_id, {'forced_down': 'false'})
-        self.compute1 = self.restart_compute_service(self.compute1)

         # check again that even after source host recovery the source
         # allocation is intact