Block deleting compute services with in-progress migrations
This builds on I0bd63b655ad3d3d39af8d15c781ce0a45efc8e3a which made DELETE /os-services/{service_id} fail with a 409 response if the host has instances on it. This change checks for in-progress migrations involving the nodes on the host, either as the source or destination nodes, and returns a 409 error response if any are found. Failling to do this can lead to orphaned resource providers in placement and also failing to properly confirm or revert a pending resize or cold migration. A release note is included for the (justified) behavior change in the API. A new microversion should not be required for this since admins should not have to opt out of broken behavior. Change-Id: I70e06c607045a1c0842f13069e51fef438012a9c Closes-Bug: #1852610
This commit is contained in:
parent
f7dde6054e
commit
92fed02610
@ -349,6 +349,12 @@ Attempts to delete a ``nova-compute`` service which is still hosting instances
|
||||
will result in a 409 HTTPConflict response. The instances will need to be
|
||||
migrated or deleted before a compute service can be deleted.
|
||||
|
||||
Similarly, attempts to delete a ``nova-compute`` service which is involved in
|
||||
in-progress migrations will result in a 409 HTTPConflict response. The
|
||||
migrations will need to be completed, for example confirming or reverting a
|
||||
resize, or the instances will need to be deleted before the compute service can
|
||||
be deleted.
|
||||
|
||||
.. important:: Be sure to stop the actual ``nova-compute`` process on the
|
||||
physical host *before* deleting the service with this API.
|
||||
Failing to do so can lead to the running service re-creating
|
||||
|
@ -264,6 +264,16 @@ class ServiceController(wsgi.Controller):
|
||||
'is hosting instances. Migrate or '
|
||||
'delete the instances first.'))
|
||||
|
||||
# Similarly, check to see if the are any in-progress migrations
|
||||
# involving this host because if there are we need to block the
|
||||
# service delete since we could orphan resource providers and
|
||||
# break the ability to do things like confirm/revert instances
|
||||
# in VERIFY_RESIZE status.
|
||||
compute_nodes = objects.ComputeNodeList.get_all_by_host(
|
||||
context, service.host)
|
||||
self._assert_no_in_progress_migrations(
|
||||
context, id, compute_nodes)
|
||||
|
||||
aggrs = self.aggregate_api.get_aggregates_by_host(context,
|
||||
service.host)
|
||||
for ag in aggrs:
|
||||
@ -274,8 +284,6 @@ class ServiceController(wsgi.Controller):
|
||||
# placement for the compute nodes managed by this service;
|
||||
# remember that an ironic compute service can manage multiple
|
||||
# nodes
|
||||
compute_nodes = objects.ComputeNodeList.get_all_by_host(
|
||||
context, service.host)
|
||||
for compute_node in compute_nodes:
|
||||
try:
|
||||
self.placementclient.delete_resource_provider(
|
||||
@ -303,6 +311,36 @@ class ServiceController(wsgi.Controller):
|
||||
explanation = _("Service id %s refers to multiple services.") % id
|
||||
raise webob.exc.HTTPBadRequest(explanation=explanation)
|
||||
|
||||
@staticmethod
|
||||
def _assert_no_in_progress_migrations(context, service_id, compute_nodes):
|
||||
"""Ensures there are no in-progress migrations on the given nodes.
|
||||
|
||||
:param context: nova auth RequestContext
|
||||
:param service_id: id of the Service being deleted
|
||||
:param compute_nodes: ComputeNodeList of nodes on a compute service
|
||||
:raises: HTTPConflict if there are any in-progress migrations on the
|
||||
nodes
|
||||
"""
|
||||
for cn in compute_nodes:
|
||||
migrations = (
|
||||
objects.MigrationList.get_in_progress_by_host_and_node(
|
||||
context, cn.host, cn.hypervisor_hostname))
|
||||
if migrations:
|
||||
# Log the migrations for the operator and then raise
|
||||
# a 409 error.
|
||||
LOG.info('Unable to delete compute service with id %s '
|
||||
'for host %s. There are %i in-progress '
|
||||
'migrations involving the host. Migrations '
|
||||
'(uuid:status): %s',
|
||||
service_id, cn.host, len(migrations),
|
||||
','.join(['%s:%s' % (mig.uuid, mig.status)
|
||||
for mig in migrations]))
|
||||
raise webob.exc.HTTPConflict(
|
||||
explanation=_(
|
||||
'Unable to delete compute service that has '
|
||||
'in-progress migrations. Complete the '
|
||||
'migrations or delete the instances first.'))
|
||||
|
||||
@validation.query_schema(services.index_query_schema_275, '2.75')
|
||||
@validation.query_schema(services.index_query_schema, '2.0', '2.74')
|
||||
@wsgi.expected_errors(())
|
||||
|
@ -955,6 +955,18 @@ class ProviderUsageBaseTestCase(test.TestCase, InstanceHelperMixin):
|
||||
'compute_confirm_resize', 'success')
|
||||
return server
|
||||
|
||||
def _revert_resize(self, server):
|
||||
self.api.post_server_action(server['id'], {'revertResize': None})
|
||||
server = self._wait_for_state_change(self.api, server, 'ACTIVE')
|
||||
self._wait_for_migration_status(server, ['reverted'])
|
||||
# Note that the migration status is changed to "reverted" in the
|
||||
# dest host revert_resize method but the allocations are cleaned up
|
||||
# in the source host finish_revert_resize method so we need to wait
|
||||
# for the finish_revert_resize method to complete.
|
||||
fake_notifier.wait_for_versioned_notifications(
|
||||
'instance.resize_revert.end')
|
||||
return server
|
||||
|
||||
def get_unused_flavor_name_id(self):
|
||||
flavors = self.api.get_flavors()
|
||||
flavor_names = list()
|
||||
|
@ -194,21 +194,29 @@ class TestServicesAPI(integrated_helpers.ProviderUsageBaseTestCase):
|
||||
# Delete the source compute service.
|
||||
service = self.admin_api.get_services(
|
||||
binary='nova-compute', host='host1')[0]
|
||||
self.admin_api.api_delete('/os-services/%s' % service['id'])
|
||||
# FIXME(mriedem): This is bug 1852610 where the compute service is
|
||||
# deleted but the resource provider is not because there are still
|
||||
# migration-based allocations against the source node provider.
|
||||
# We expect the delete request to fail with a 409 error because of the
|
||||
# instance in VERIFY_RESIZE status even though that instance is marked
|
||||
# as being on host2 now.
|
||||
ex = self.assertRaises(api_client.OpenStackApiException,
|
||||
self.admin_api.api_delete,
|
||||
'/os-services/%s' % service['id'])
|
||||
self.assertEqual(409, ex.response.status_code)
|
||||
self.assertIn('Unable to delete compute service that has in-progress '
|
||||
'migrations', six.text_type(ex))
|
||||
self.assertIn('There are 1 in-progress migrations involving the host',
|
||||
self.stdlog.logger.output)
|
||||
# The provider is still around because we did not delete the service.
|
||||
resp = self.placement_api.get('/resource_providers/%s' % host1_rp_uuid)
|
||||
self.assertEqual(200, resp.status)
|
||||
self.assertFlavorMatchesUsage(host1_rp_uuid, flavor)
|
||||
# Now try to confirm the migration.
|
||||
# FIXME(mriedem): This will fail until bug 1852610 is fixed and the
|
||||
# source compute service delete is blocked while there is an
|
||||
# in-progress migration involving the node.
|
||||
self.assertNotIn('ComputeHostNotFound', self.stdlog.logger.output)
|
||||
self.api.post_server_action(server['id'], {'confirmResize': None})
|
||||
self._wait_for_state_change(self.api, server, 'ERROR')
|
||||
self.assertIn('ComputeHostNotFound', self.stdlog.logger.output)
|
||||
self._confirm_resize(server)
|
||||
# Delete the host1 service since the migration is confirmed and the
|
||||
# server is on host2.
|
||||
self.admin_api.api_delete('/os-services/%s' % service['id'])
|
||||
# The host1 resource provider should be gone.
|
||||
resp = self.placement_api.get('/resource_providers/%s' % host1_rp_uuid)
|
||||
self.assertEqual(404, resp.status)
|
||||
|
||||
def test_resize_revert_after_deleted_source_compute(self):
|
||||
"""Tests a scenario where a server is resized and while in
|
||||
@ -231,25 +239,34 @@ class TestServicesAPI(integrated_helpers.ProviderUsageBaseTestCase):
|
||||
# Delete the source compute service.
|
||||
service = self.admin_api.get_services(
|
||||
binary='nova-compute', host='host1')[0]
|
||||
self.admin_api.api_delete('/os-services/%s' % service['id'])
|
||||
# FIXME(mriedem): This is bug 1852610 where the compute service is
|
||||
# deleted but the resource provider is not because there are still
|
||||
# migration-based allocations against the source node provider.
|
||||
# We expect the delete request to fail with a 409 error because of the
|
||||
# instance in VERIFY_RESIZE status even though that instance is marked
|
||||
# as being on host2 now.
|
||||
ex = self.assertRaises(api_client.OpenStackApiException,
|
||||
self.admin_api.api_delete,
|
||||
'/os-services/%s' % service['id'])
|
||||
self.assertEqual(409, ex.response.status_code)
|
||||
self.assertIn('Unable to delete compute service that has in-progress '
|
||||
'migrations', six.text_type(ex))
|
||||
self.assertIn('There are 1 in-progress migrations involving the host',
|
||||
self.stdlog.logger.output)
|
||||
# The provider is still around because we did not delete the service.
|
||||
resp = self.placement_api.get('/resource_providers/%s' % host1_rp_uuid)
|
||||
self.assertEqual(200, resp.status)
|
||||
self.assertFlavorMatchesUsage(host1_rp_uuid, flavor1)
|
||||
# Now try to revert the resize.
|
||||
# NOTE(mriedem): This actually works because the drop_move_claim
|
||||
# happens in revert_resize on the dest host which still has its
|
||||
# ComputeNode record. The migration-based allocations are reverted
|
||||
# so the instance holds the allocations for the source provider and
|
||||
# the allocations against the dest provider are dropped.
|
||||
self.api.post_server_action(server['id'], {'revertResize': None})
|
||||
self._wait_for_state_change(self.api, server, 'ACTIVE')
|
||||
self.assertNotIn('ComputeHostNotFound', self.stdlog.logger.output)
|
||||
# Now revert the resize.
|
||||
self._revert_resize(server)
|
||||
self.assertFlavorMatchesUsage(host1_rp_uuid, flavor1)
|
||||
zero_flavor = {'vcpus': 0, 'ram': 0, 'disk': 0, 'extra_specs': {}}
|
||||
self.assertFlavorMatchesUsage(host2_rp_uuid, zero_flavor)
|
||||
# Delete the host2 service since the migration is reverted and the
|
||||
# server is on host1 again.
|
||||
service2 = self.admin_api.get_services(
|
||||
binary='nova-compute', host='host2')[0]
|
||||
self.admin_api.api_delete('/os-services/%s' % service2['id'])
|
||||
# The host2 resource provider should be gone.
|
||||
resp = self.placement_api.get('/resource_providers/%s' % host2_rp_uuid)
|
||||
self.assertEqual(404, resp.status)
|
||||
|
||||
|
||||
class ComputeStatusFilterTest(integrated_helpers.ProviderUsageBaseTestCase):
|
||||
|
@ -0,0 +1,10 @@
|
||||
---
|
||||
fixes:
|
||||
- |
|
||||
The ``DELETE /os-services/{service_id}`` compute API will now return a
|
||||
``409 HTTPConflict`` response when trying to delete a ``nova-compute``
|
||||
service which is involved in in-progress migrations. This is because doing
|
||||
so would not only orphan the compute node resource provider in the
|
||||
placement service on which those instances have resource allocations but
|
||||
can also break the ability to confirm/revert a pending resize properly.
|
||||
See https://bugs.launchpad.net/nova/+bug/1852610 for more details.
|
Loading…
x
Reference in New Issue
Block a user