Merge "Block deleting compute services with in-progress migrations"

This commit is contained in:
Zuul 2019-11-16 08:28:41 +00:00 committed by Gerrit Code Review
commit c956a888e6
5 changed files with 109 additions and 26 deletions

View File

@ -349,6 +349,12 @@ Attempts to delete a ``nova-compute`` service which is still hosting instances
will result in a 409 HTTPConflict response. The instances will need to be will result in a 409 HTTPConflict response. The instances will need to be
migrated or deleted before a compute service can be deleted. migrated or deleted before a compute service can be deleted.
Similarly, attempts to delete a ``nova-compute`` service which is involved in
in-progress migrations will result in a 409 HTTPConflict response. The
migrations will need to be completed, for example confirming or reverting a
resize, or the instances will need to be deleted before the compute service can
be deleted.
.. important:: Be sure to stop the actual ``nova-compute`` process on the .. important:: Be sure to stop the actual ``nova-compute`` process on the
physical host *before* deleting the service with this API. physical host *before* deleting the service with this API.
Failing to do so can lead to the running service re-creating Failing to do so can lead to the running service re-creating

View File

@ -264,6 +264,16 @@ class ServiceController(wsgi.Controller):
'is hosting instances. Migrate or ' 'is hosting instances. Migrate or '
'delete the instances first.')) 'delete the instances first.'))
# Similarly, check to see if the are any in-progress migrations
# involving this host because if there are we need to block the
# service delete since we could orphan resource providers and
# break the ability to do things like confirm/revert instances
# in VERIFY_RESIZE status.
compute_nodes = objects.ComputeNodeList.get_all_by_host(
context, service.host)
self._assert_no_in_progress_migrations(
context, id, compute_nodes)
aggrs = self.aggregate_api.get_aggregates_by_host(context, aggrs = self.aggregate_api.get_aggregates_by_host(context,
service.host) service.host)
for ag in aggrs: for ag in aggrs:
@ -274,8 +284,6 @@ class ServiceController(wsgi.Controller):
# placement for the compute nodes managed by this service; # placement for the compute nodes managed by this service;
# remember that an ironic compute service can manage multiple # remember that an ironic compute service can manage multiple
# nodes # nodes
compute_nodes = objects.ComputeNodeList.get_all_by_host(
context, service.host)
for compute_node in compute_nodes: for compute_node in compute_nodes:
try: try:
self.placementclient.delete_resource_provider( self.placementclient.delete_resource_provider(
@ -303,6 +311,36 @@ class ServiceController(wsgi.Controller):
explanation = _("Service id %s refers to multiple services.") % id explanation = _("Service id %s refers to multiple services.") % id
raise webob.exc.HTTPBadRequest(explanation=explanation) raise webob.exc.HTTPBadRequest(explanation=explanation)
@staticmethod
def _assert_no_in_progress_migrations(context, service_id, compute_nodes):
"""Ensures there are no in-progress migrations on the given nodes.
:param context: nova auth RequestContext
:param service_id: id of the Service being deleted
:param compute_nodes: ComputeNodeList of nodes on a compute service
:raises: HTTPConflict if there are any in-progress migrations on the
nodes
"""
for cn in compute_nodes:
migrations = (
objects.MigrationList.get_in_progress_by_host_and_node(
context, cn.host, cn.hypervisor_hostname))
if migrations:
# Log the migrations for the operator and then raise
# a 409 error.
LOG.info('Unable to delete compute service with id %s '
'for host %s. There are %i in-progress '
'migrations involving the host. Migrations '
'(uuid:status): %s',
service_id, cn.host, len(migrations),
','.join(['%s:%s' % (mig.uuid, mig.status)
for mig in migrations]))
raise webob.exc.HTTPConflict(
explanation=_(
'Unable to delete compute service that has '
'in-progress migrations. Complete the '
'migrations or delete the instances first.'))
@validation.query_schema(services.index_query_schema_275, '2.75') @validation.query_schema(services.index_query_schema_275, '2.75')
@validation.query_schema(services.index_query_schema, '2.0', '2.74') @validation.query_schema(services.index_query_schema, '2.0', '2.74')
@wsgi.expected_errors(()) @wsgi.expected_errors(())

View File

@ -962,6 +962,18 @@ class ProviderUsageBaseTestCase(test.TestCase, InstanceHelperMixin):
'compute_confirm_resize', 'success') 'compute_confirm_resize', 'success')
return server return server
def _revert_resize(self, server):
self.api.post_server_action(server['id'], {'revertResize': None})
server = self._wait_for_state_change(self.api, server, 'ACTIVE')
self._wait_for_migration_status(server, ['reverted'])
# Note that the migration status is changed to "reverted" in the
# dest host revert_resize method but the allocations are cleaned up
# in the source host finish_revert_resize method so we need to wait
# for the finish_revert_resize method to complete.
fake_notifier.wait_for_versioned_notifications(
'instance.resize_revert.end')
return server
def get_unused_flavor_name_id(self): def get_unused_flavor_name_id(self):
flavors = self.api.get_flavors() flavors = self.api.get_flavors()
flavor_names = list() flavor_names = list()

View File

@ -194,21 +194,29 @@ class TestServicesAPI(integrated_helpers.ProviderUsageBaseTestCase):
# Delete the source compute service. # Delete the source compute service.
service = self.admin_api.get_services( service = self.admin_api.get_services(
binary='nova-compute', host='host1')[0] binary='nova-compute', host='host1')[0]
self.admin_api.api_delete('/os-services/%s' % service['id']) # We expect the delete request to fail with a 409 error because of the
# FIXME(mriedem): This is bug 1852610 where the compute service is # instance in VERIFY_RESIZE status even though that instance is marked
# deleted but the resource provider is not because there are still # as being on host2 now.
# migration-based allocations against the source node provider. ex = self.assertRaises(api_client.OpenStackApiException,
self.admin_api.api_delete,
'/os-services/%s' % service['id'])
self.assertEqual(409, ex.response.status_code)
self.assertIn('Unable to delete compute service that has in-progress '
'migrations', six.text_type(ex))
self.assertIn('There are 1 in-progress migrations involving the host',
self.stdlog.logger.output)
# The provider is still around because we did not delete the service.
resp = self.placement_api.get('/resource_providers/%s' % host1_rp_uuid) resp = self.placement_api.get('/resource_providers/%s' % host1_rp_uuid)
self.assertEqual(200, resp.status) self.assertEqual(200, resp.status)
self.assertFlavorMatchesUsage(host1_rp_uuid, flavor) self.assertFlavorMatchesUsage(host1_rp_uuid, flavor)
# Now try to confirm the migration. # Now try to confirm the migration.
# FIXME(mriedem): This will fail until bug 1852610 is fixed and the self._confirm_resize(server)
# source compute service delete is blocked while there is an # Delete the host1 service since the migration is confirmed and the
# in-progress migration involving the node. # server is on host2.
self.assertNotIn('ComputeHostNotFound', self.stdlog.logger.output) self.admin_api.api_delete('/os-services/%s' % service['id'])
self.api.post_server_action(server['id'], {'confirmResize': None}) # The host1 resource provider should be gone.
self._wait_for_state_change(self.api, server, 'ERROR') resp = self.placement_api.get('/resource_providers/%s' % host1_rp_uuid)
self.assertIn('ComputeHostNotFound', self.stdlog.logger.output) self.assertEqual(404, resp.status)
def test_resize_revert_after_deleted_source_compute(self): def test_resize_revert_after_deleted_source_compute(self):
"""Tests a scenario where a server is resized and while in """Tests a scenario where a server is resized and while in
@ -231,25 +239,34 @@ class TestServicesAPI(integrated_helpers.ProviderUsageBaseTestCase):
# Delete the source compute service. # Delete the source compute service.
service = self.admin_api.get_services( service = self.admin_api.get_services(
binary='nova-compute', host='host1')[0] binary='nova-compute', host='host1')[0]
self.admin_api.api_delete('/os-services/%s' % service['id']) # We expect the delete request to fail with a 409 error because of the
# FIXME(mriedem): This is bug 1852610 where the compute service is # instance in VERIFY_RESIZE status even though that instance is marked
# deleted but the resource provider is not because there are still # as being on host2 now.
# migration-based allocations against the source node provider. ex = self.assertRaises(api_client.OpenStackApiException,
self.admin_api.api_delete,
'/os-services/%s' % service['id'])
self.assertEqual(409, ex.response.status_code)
self.assertIn('Unable to delete compute service that has in-progress '
'migrations', six.text_type(ex))
self.assertIn('There are 1 in-progress migrations involving the host',
self.stdlog.logger.output)
# The provider is still around because we did not delete the service.
resp = self.placement_api.get('/resource_providers/%s' % host1_rp_uuid) resp = self.placement_api.get('/resource_providers/%s' % host1_rp_uuid)
self.assertEqual(200, resp.status) self.assertEqual(200, resp.status)
self.assertFlavorMatchesUsage(host1_rp_uuid, flavor1) self.assertFlavorMatchesUsage(host1_rp_uuid, flavor1)
# Now try to revert the resize. # Now revert the resize.
# NOTE(mriedem): This actually works because the drop_move_claim self._revert_resize(server)
# happens in revert_resize on the dest host which still has its
# ComputeNode record. The migration-based allocations are reverted
# so the instance holds the allocations for the source provider and
# the allocations against the dest provider are dropped.
self.api.post_server_action(server['id'], {'revertResize': None})
self._wait_for_state_change(self.api, server, 'ACTIVE')
self.assertNotIn('ComputeHostNotFound', self.stdlog.logger.output)
self.assertFlavorMatchesUsage(host1_rp_uuid, flavor1) self.assertFlavorMatchesUsage(host1_rp_uuid, flavor1)
zero_flavor = {'vcpus': 0, 'ram': 0, 'disk': 0, 'extra_specs': {}} zero_flavor = {'vcpus': 0, 'ram': 0, 'disk': 0, 'extra_specs': {}}
self.assertFlavorMatchesUsage(host2_rp_uuid, zero_flavor) self.assertFlavorMatchesUsage(host2_rp_uuid, zero_flavor)
# Delete the host2 service since the migration is reverted and the
# server is on host1 again.
service2 = self.admin_api.get_services(
binary='nova-compute', host='host2')[0]
self.admin_api.api_delete('/os-services/%s' % service2['id'])
# The host2 resource provider should be gone.
resp = self.placement_api.get('/resource_providers/%s' % host2_rp_uuid)
self.assertEqual(404, resp.status)
class ComputeStatusFilterTest(integrated_helpers.ProviderUsageBaseTestCase): class ComputeStatusFilterTest(integrated_helpers.ProviderUsageBaseTestCase):

View File

@ -0,0 +1,10 @@
---
fixes:
- |
The ``DELETE /os-services/{service_id}`` compute API will now return a
``409 HTTPConflict`` response when trying to delete a ``nova-compute``
service which is involved in in-progress migrations. This is because doing
so would not only orphan the compute node resource provider in the
placement service on which those instances have resource allocations but
can also break the ability to confirm/revert a pending resize properly.
See https://bugs.launchpad.net/nova/+bug/1852610 for more details.