Block deleting compute services with in-progress migrations

This builds on I0bd63b655ad3d3d39af8d15c781ce0a45efc8e3a
which made DELETE /os-services/{service_id} fail with a 409
response if the host has instances on it. This change checks
for in-progress migrations involving the nodes on the host,
either as the source or destination nodes, and returns a 409
error response if any are found.

Failling to do this can lead to orphaned resource providers
in placement and also failing to properly confirm or revert
a pending resize or cold migration.

A release note is included for the (justified) behavior
change in the API. A new microversion should not be required
for this since admins should not have to opt out of broken
behavior.

Conflicts:
      nova/tests/functional/integrated_helpers.py

NOTE(mriedem): The conflict is due to change
Iec61f56c05e06924def814a3a6e09ceb91a15894 which is not in Train.

NOTE(mriedem): services.py had to be updated to add the LOG
variable since change I8403a841f21a624a546ae5f26bb9ba19318ece6a
is not in Train.

Change-Id: I70e06c607045a1c0842f13069e51fef438012a9c
Closes-Bug: #1852610
(cherry picked from commit 92fed02610)
This commit is contained in:
Matt Riedemann 2019-11-14 14:19:26 -05:00
parent 3774952410
commit a9650b3cbf
5 changed files with 112 additions and 26 deletions

View File

@ -349,6 +349,12 @@ Attempts to delete a ``nova-compute`` service which is still hosting instances
will result in a 409 HTTPConflict response. The instances will need to be
migrated or deleted before a compute service can be deleted.
Similarly, attempts to delete a ``nova-compute`` service which is involved in
in-progress migrations will result in a 409 HTTPConflict response. The
migrations will need to be completed, for example confirming or reverting a
resize, or the instances will need to be deleted before the compute service can
be deleted.
.. important:: Be sure to stop the actual ``nova-compute`` process on the
physical host *before* deleting the service with this API.
Failing to do so can lead to the running service re-creating

View File

@ -12,6 +12,7 @@
# License for the specific language governing permissions and limitations
# under the License.
from oslo_log import log as logging
from oslo_utils import strutils
from oslo_utils import uuidutils
import webob.exc
@ -33,6 +34,8 @@ from nova import utils
UUID_FOR_ID_MIN_VERSION = '2.53'
PARTIAL_CONSTRUCT_FOR_CELL_DOWN_MIN_VERSION = '2.69'
LOG = logging.getLogger(__name__)
class ServiceController(wsgi.Controller):
@ -259,6 +262,16 @@ class ServiceController(wsgi.Controller):
'is hosting instances. Migrate or '
'delete the instances first.'))
# Similarly, check to see if the are any in-progress migrations
# involving this host because if there are we need to block the
# service delete since we could orphan resource providers and
# break the ability to do things like confirm/revert instances
# in VERIFY_RESIZE status.
compute_nodes = objects.ComputeNodeList.get_all_by_host(
context, service.host)
self._assert_no_in_progress_migrations(
context, id, compute_nodes)
aggrs = self.aggregate_api.get_aggregates_by_host(context,
service.host)
for ag in aggrs:
@ -269,8 +282,6 @@ class ServiceController(wsgi.Controller):
# placement for the compute nodes managed by this service;
# remember that an ironic compute service can manage multiple
# nodes
compute_nodes = objects.ComputeNodeList.get_all_by_host(
context, service.host)
for compute_node in compute_nodes:
self.placementclient.delete_resource_provider(
context, compute_node, cascade=True)
@ -292,6 +303,36 @@ class ServiceController(wsgi.Controller):
explanation = _("Service id %s refers to multiple services.") % id
raise webob.exc.HTTPBadRequest(explanation=explanation)
@staticmethod
def _assert_no_in_progress_migrations(context, service_id, compute_nodes):
"""Ensures there are no in-progress migrations on the given nodes.
:param context: nova auth RequestContext
:param service_id: id of the Service being deleted
:param compute_nodes: ComputeNodeList of nodes on a compute service
:raises: HTTPConflict if there are any in-progress migrations on the
nodes
"""
for cn in compute_nodes:
migrations = (
objects.MigrationList.get_in_progress_by_host_and_node(
context, cn.host, cn.hypervisor_hostname))
if migrations:
# Log the migrations for the operator and then raise
# a 409 error.
LOG.info('Unable to delete compute service with id %s '
'for host %s. There are %i in-progress '
'migrations involving the host. Migrations '
'(uuid:status): %s',
service_id, cn.host, len(migrations),
','.join(['%s:%s' % (mig.uuid, mig.status)
for mig in migrations]))
raise webob.exc.HTTPConflict(
explanation=_(
'Unable to delete compute service that has '
'in-progress migrations. Complete the '
'migrations or delete the instances first.'))
@validation.query_schema(services.index_query_schema_275, '2.75')
@validation.query_schema(services.index_query_schema, '2.0', '2.74')
@wsgi.expected_errors(())

View File

@ -941,3 +941,15 @@ class ProviderUsageBaseTestCase(test.TestCase, InstanceHelperMixin):
self.api, server, instance_actions.CONFIRM_RESIZE,
'compute_confirm_resize', 'success')
return server
def _revert_resize(self, server):
self.api.post_server_action(server['id'], {'revertResize': None})
server = self._wait_for_state_change(self.api, server, 'ACTIVE')
self._wait_for_migration_status(server, ['reverted'])
# Note that the migration status is changed to "reverted" in the
# dest host revert_resize method but the allocations are cleaned up
# in the source host finish_revert_resize method so we need to wait
# for the finish_revert_resize method to complete.
fake_notifier.wait_for_versioned_notifications(
'instance.resize_revert.end')
return server

View File

@ -194,21 +194,29 @@ class TestServicesAPI(integrated_helpers.ProviderUsageBaseTestCase):
# Delete the source compute service.
service = self.admin_api.get_services(
binary='nova-compute', host='host1')[0]
self.admin_api.api_delete('/os-services/%s' % service['id'])
# FIXME(mriedem): This is bug 1852610 where the compute service is
# deleted but the resource provider is not because there are still
# migration-based allocations against the source node provider.
# We expect the delete request to fail with a 409 error because of the
# instance in VERIFY_RESIZE status even though that instance is marked
# as being on host2 now.
ex = self.assertRaises(api_client.OpenStackApiException,
self.admin_api.api_delete,
'/os-services/%s' % service['id'])
self.assertEqual(409, ex.response.status_code)
self.assertIn('Unable to delete compute service that has in-progress '
'migrations', six.text_type(ex))
self.assertIn('There are 1 in-progress migrations involving the host',
self.stdlog.logger.output)
# The provider is still around because we did not delete the service.
resp = self.placement_api.get('/resource_providers/%s' % host1_rp_uuid)
self.assertEqual(200, resp.status)
self.assertFlavorMatchesUsage(host1_rp_uuid, flavor)
# Now try to confirm the migration.
# FIXME(mriedem): This will fail until bug 1852610 is fixed and the
# source compute service delete is blocked while there is an
# in-progress migration involving the node.
self.assertNotIn('ComputeHostNotFound', self.stdlog.logger.output)
self.api.post_server_action(server['id'], {'confirmResize': None})
self._wait_for_state_change(self.api, server, 'ERROR')
self.assertIn('ComputeHostNotFound', self.stdlog.logger.output)
self._confirm_resize(server)
# Delete the host1 service since the migration is confirmed and the
# server is on host2.
self.admin_api.api_delete('/os-services/%s' % service['id'])
# The host1 resource provider should be gone.
resp = self.placement_api.get('/resource_providers/%s' % host1_rp_uuid)
self.assertEqual(404, resp.status)
def test_resize_revert_after_deleted_source_compute(self):
"""Tests a scenario where a server is resized and while in
@ -231,25 +239,34 @@ class TestServicesAPI(integrated_helpers.ProviderUsageBaseTestCase):
# Delete the source compute service.
service = self.admin_api.get_services(
binary='nova-compute', host='host1')[0]
self.admin_api.api_delete('/os-services/%s' % service['id'])
# FIXME(mriedem): This is bug 1852610 where the compute service is
# deleted but the resource provider is not because there are still
# migration-based allocations against the source node provider.
# We expect the delete request to fail with a 409 error because of the
# instance in VERIFY_RESIZE status even though that instance is marked
# as being on host2 now.
ex = self.assertRaises(api_client.OpenStackApiException,
self.admin_api.api_delete,
'/os-services/%s' % service['id'])
self.assertEqual(409, ex.response.status_code)
self.assertIn('Unable to delete compute service that has in-progress '
'migrations', six.text_type(ex))
self.assertIn('There are 1 in-progress migrations involving the host',
self.stdlog.logger.output)
# The provider is still around because we did not delete the service.
resp = self.placement_api.get('/resource_providers/%s' % host1_rp_uuid)
self.assertEqual(200, resp.status)
self.assertFlavorMatchesUsage(host1_rp_uuid, flavor1)
# Now try to revert the resize.
# NOTE(mriedem): This actually works because the drop_move_claim
# happens in revert_resize on the dest host which still has its
# ComputeNode record. The migration-based allocations are reverted
# so the instance holds the allocations for the source provider and
# the allocations against the dest provider are dropped.
self.api.post_server_action(server['id'], {'revertResize': None})
self._wait_for_state_change(self.api, server, 'ACTIVE')
self.assertNotIn('ComputeHostNotFound', self.stdlog.logger.output)
# Now revert the resize.
self._revert_resize(server)
self.assertFlavorMatchesUsage(host1_rp_uuid, flavor1)
zero_flavor = {'vcpus': 0, 'ram': 0, 'disk': 0, 'extra_specs': {}}
self.assertFlavorMatchesUsage(host2_rp_uuid, zero_flavor)
# Delete the host2 service since the migration is reverted and the
# server is on host1 again.
service2 = self.admin_api.get_services(
binary='nova-compute', host='host2')[0]
self.admin_api.api_delete('/os-services/%s' % service2['id'])
# The host2 resource provider should be gone.
resp = self.placement_api.get('/resource_providers/%s' % host2_rp_uuid)
self.assertEqual(404, resp.status)
class ComputeStatusFilterTest(integrated_helpers.ProviderUsageBaseTestCase):

View File

@ -0,0 +1,10 @@
---
fixes:
- |
The ``DELETE /os-services/{service_id}`` compute API will now return a
``409 HTTPConflict`` response when trying to delete a ``nova-compute``
service which is involved in in-progress migrations. This is because doing
so would not only orphan the compute node resource provider in the
placement service on which those instances have resource allocations but
can also break the ability to confirm/revert a pending resize properly.
See https://bugs.launchpad.net/nova/+bug/1852610 for more details.