Block deleting compute services with in-progress migrations
This builds on I0bd63b655ad3d3d39af8d15c781ce0a45efc8e3a which made DELETE /os-services/{service_id} fail with a 409 response if the host has instances on it. This change checks for in-progress migrations involving the nodes on the host, either as the source or destination nodes, and returns a 409 error response if any are found. Failling to do this can lead to orphaned resource providers in placement and also failing to properly confirm or revert a pending resize or cold migration. A release note is included for the (justified) behavior change in the API. A new microversion should not be required for this since admins should not have to opt out of broken behavior. Conflicts: nova/tests/functional/integrated_helpers.py NOTE(mriedem): The conflict is due to not having change Iea283322124cb35fc0bc6d25f35548621e8c8c2f in Queens so _revert_resize is added to ProviderUsageBaseTestCase within test_servers.py. Change-Id: I70e06c607045a1c0842f13069e51fef438012a9c Closes-Bug: #1852610 (cherry picked from commit92fed02610
) (cherry picked from commita9650b3cbf
) (cherry picked from commita0290858b7
) (cherry picked from commit30a6350685
)
This commit is contained in:
parent
57ac269937
commit
d88f353796
|
@ -322,6 +322,12 @@ Attempts to delete a ``nova-compute`` service which is still hosting instances
|
||||||
will result in a 409 HTTPConflict response. The instances will need to be
|
will result in a 409 HTTPConflict response. The instances will need to be
|
||||||
migrated or deleted before a compute service can be deleted.
|
migrated or deleted before a compute service can be deleted.
|
||||||
|
|
||||||
|
Similarly, attempts to delete a ``nova-compute`` service which is involved in
|
||||||
|
in-progress migrations will result in a 409 HTTPConflict response. The
|
||||||
|
migrations will need to be completed, for example confirming or reverting a
|
||||||
|
resize, or the instances will need to be deleted before the compute service can
|
||||||
|
be deleted.
|
||||||
|
|
||||||
.. important:: Be sure to stop the actual ``nova-compute`` process on the
|
.. important:: Be sure to stop the actual ``nova-compute`` process on the
|
||||||
physical host *before* deleting the service with this API.
|
physical host *before* deleting the service with this API.
|
||||||
Failing to do so can lead to the running service re-creating
|
Failing to do so can lead to the running service re-creating
|
||||||
|
|
|
@ -12,6 +12,7 @@
|
||||||
# License for the specific language governing permissions and limitations
|
# License for the specific language governing permissions and limitations
|
||||||
# under the License.
|
# under the License.
|
||||||
|
|
||||||
|
from oslo_log import log as logging
|
||||||
from oslo_utils import strutils
|
from oslo_utils import strutils
|
||||||
from oslo_utils import uuidutils
|
from oslo_utils import uuidutils
|
||||||
import webob.exc
|
import webob.exc
|
||||||
|
@ -32,6 +33,8 @@ from nova import utils
|
||||||
|
|
||||||
UUID_FOR_ID_MIN_VERSION = '2.53'
|
UUID_FOR_ID_MIN_VERSION = '2.53'
|
||||||
|
|
||||||
|
LOG = logging.getLogger(__name__)
|
||||||
|
|
||||||
|
|
||||||
class ServiceController(wsgi.Controller):
|
class ServiceController(wsgi.Controller):
|
||||||
|
|
||||||
|
@ -238,6 +241,16 @@ class ServiceController(wsgi.Controller):
|
||||||
'is hosting instances. Migrate or '
|
'is hosting instances. Migrate or '
|
||||||
'delete the instances first.'))
|
'delete the instances first.'))
|
||||||
|
|
||||||
|
# Similarly, check to see if the are any in-progress migrations
|
||||||
|
# involving this host because if there are we need to block the
|
||||||
|
# service delete since we could orphan resource providers and
|
||||||
|
# break the ability to do things like confirm/revert instances
|
||||||
|
# in VERIFY_RESIZE status.
|
||||||
|
compute_nodes = objects.ComputeNodeList.get_all_by_host(
|
||||||
|
context, service.host)
|
||||||
|
self._assert_no_in_progress_migrations(
|
||||||
|
context, id, compute_nodes)
|
||||||
|
|
||||||
aggrs = self.aggregate_api.get_aggregates_by_host(context,
|
aggrs = self.aggregate_api.get_aggregates_by_host(context,
|
||||||
service.host)
|
service.host)
|
||||||
for ag in aggrs:
|
for ag in aggrs:
|
||||||
|
@ -248,8 +261,6 @@ class ServiceController(wsgi.Controller):
|
||||||
# placement for the compute nodes managed by this service;
|
# placement for the compute nodes managed by this service;
|
||||||
# remember that an ironic compute service can manage multiple
|
# remember that an ironic compute service can manage multiple
|
||||||
# nodes
|
# nodes
|
||||||
compute_nodes = objects.ComputeNodeList.get_all_by_host(
|
|
||||||
context, service.host)
|
|
||||||
for compute_node in compute_nodes:
|
for compute_node in compute_nodes:
|
||||||
self.placementclient.delete_resource_provider(
|
self.placementclient.delete_resource_provider(
|
||||||
context, compute_node, cascade=True)
|
context, compute_node, cascade=True)
|
||||||
|
@ -271,6 +282,36 @@ class ServiceController(wsgi.Controller):
|
||||||
explanation = _("Service id %s refers to multiple services.") % id
|
explanation = _("Service id %s refers to multiple services.") % id
|
||||||
raise webob.exc.HTTPBadRequest(explanation=explanation)
|
raise webob.exc.HTTPBadRequest(explanation=explanation)
|
||||||
|
|
||||||
|
@staticmethod
|
||||||
|
def _assert_no_in_progress_migrations(context, service_id, compute_nodes):
|
||||||
|
"""Ensures there are no in-progress migrations on the given nodes.
|
||||||
|
|
||||||
|
:param context: nova auth RequestContext
|
||||||
|
:param service_id: id of the Service being deleted
|
||||||
|
:param compute_nodes: ComputeNodeList of nodes on a compute service
|
||||||
|
:raises: HTTPConflict if there are any in-progress migrations on the
|
||||||
|
nodes
|
||||||
|
"""
|
||||||
|
for cn in compute_nodes:
|
||||||
|
migrations = (
|
||||||
|
objects.MigrationList.get_in_progress_by_host_and_node(
|
||||||
|
context, cn.host, cn.hypervisor_hostname))
|
||||||
|
if migrations:
|
||||||
|
# Log the migrations for the operator and then raise
|
||||||
|
# a 409 error.
|
||||||
|
LOG.info('Unable to delete compute service with id %s '
|
||||||
|
'for host %s. There are %i in-progress '
|
||||||
|
'migrations involving the host. Migrations '
|
||||||
|
'(uuid:status): %s',
|
||||||
|
service_id, cn.host, len(migrations),
|
||||||
|
','.join(['%s:%s' % (mig.uuid, mig.status)
|
||||||
|
for mig in migrations]))
|
||||||
|
raise webob.exc.HTTPConflict(
|
||||||
|
explanation=_(
|
||||||
|
'Unable to delete compute service that has '
|
||||||
|
'in-progress migrations. Complete the '
|
||||||
|
'migrations or delete the instances first.'))
|
||||||
|
|
||||||
@validation.query_schema(services.index_query_schema)
|
@validation.query_schema(services.index_query_schema)
|
||||||
@wsgi.expected_errors(())
|
@wsgi.expected_errors(())
|
||||||
def index(self, req):
|
def index(self, req):
|
||||||
|
|
|
@ -1762,6 +1762,18 @@ class ProviderUsageBaseTestCase(test.TestCase,
|
||||||
new_flavor=new_flavor, source_rp_uuid=source_rp_uuid,
|
new_flavor=new_flavor, source_rp_uuid=source_rp_uuid,
|
||||||
dest_rp_uuid=dest_rp_uuid)
|
dest_rp_uuid=dest_rp_uuid)
|
||||||
|
|
||||||
|
def _revert_resize(self, server):
|
||||||
|
self.api.post_server_action(server['id'], {'revertResize': None})
|
||||||
|
server = self._wait_for_state_change(self.api, server, 'ACTIVE')
|
||||||
|
self._wait_for_migration_status(server, ['reverted'])
|
||||||
|
# Note that the migration status is changed to "reverted" in the
|
||||||
|
# dest host revert_resize method but the allocations are cleaned up
|
||||||
|
# in the source host finish_revert_resize method so we need to wait
|
||||||
|
# for the finish_revert_resize method to complete.
|
||||||
|
fake_notifier.wait_for_versioned_notifications(
|
||||||
|
'instance.resize_revert.end')
|
||||||
|
return server
|
||||||
|
|
||||||
|
|
||||||
class TraitsTrackingTests(ProviderUsageBaseTestCase):
|
class TraitsTrackingTests(ProviderUsageBaseTestCase):
|
||||||
compute_driver = 'fake.SmallFakeDriver'
|
compute_driver = 'fake.SmallFakeDriver'
|
||||||
|
|
|
@ -193,21 +193,30 @@ class TestServicesAPI(test_servers.ProviderUsageBaseTestCase):
|
||||||
# Delete the source compute service.
|
# Delete the source compute service.
|
||||||
service = self.admin_api.get_services(
|
service = self.admin_api.get_services(
|
||||||
binary='nova-compute', host='host1')[0]
|
binary='nova-compute', host='host1')[0]
|
||||||
self.admin_api.api_delete('/os-services/%s' % service['id'])
|
# We expect the delete request to fail with a 409 error because of the
|
||||||
# FIXME(mriedem): This is bug 1852610 where the compute service is
|
# instance in VERIFY_RESIZE status even though that instance is marked
|
||||||
# deleted but the resource provider is not because there are still
|
# as being on host2 now.
|
||||||
# migration-based allocations against the source node provider.
|
ex = self.assertRaises(api_client.OpenStackApiException,
|
||||||
|
self.admin_api.api_delete,
|
||||||
|
'/os-services/%s' % service['id'])
|
||||||
|
self.assertEqual(409, ex.response.status_code)
|
||||||
|
self.assertIn('Unable to delete compute service that has in-progress '
|
||||||
|
'migrations', six.text_type(ex))
|
||||||
|
self.assertIn('There are 1 in-progress migrations involving the host',
|
||||||
|
self.stdlog.logger.output)
|
||||||
|
# The provider is still around because we did not delete the service.
|
||||||
resp = self.placement_api.get('/resource_providers/%s' % host1_rp_uuid)
|
resp = self.placement_api.get('/resource_providers/%s' % host1_rp_uuid)
|
||||||
self.assertEqual(200, resp.status)
|
self.assertEqual(200, resp.status)
|
||||||
self.assertFlavorMatchesUsage(host1_rp_uuid, flavor)
|
self.assertFlavorMatchesUsage(host1_rp_uuid, flavor)
|
||||||
# Now try to confirm the migration.
|
# Now try to confirm the migration.
|
||||||
# FIXME(mriedem): This will fail until bug 1852610 is fixed and the
|
|
||||||
# source compute service delete is blocked while there is an
|
|
||||||
# in-progress migration involving the node.
|
|
||||||
self.assertNotIn('ComputeHostNotFound', self.stdlog.logger.output)
|
|
||||||
self.api.post_server_action(server['id'], {'confirmResize': None})
|
self.api.post_server_action(server['id'], {'confirmResize': None})
|
||||||
self._wait_for_state_change(self.api, server, 'ERROR')
|
self._wait_for_state_change(self.api, server, 'ACTIVE')
|
||||||
self.assertIn('ComputeHostNotFound', self.stdlog.logger.output)
|
# Delete the host1 service since the migration is confirmed and the
|
||||||
|
# server is on host2.
|
||||||
|
self.admin_api.api_delete('/os-services/%s' % service['id'])
|
||||||
|
# The host1 resource provider should be gone.
|
||||||
|
resp = self.placement_api.get('/resource_providers/%s' % host1_rp_uuid)
|
||||||
|
self.assertEqual(404, resp.status)
|
||||||
|
|
||||||
def test_resize_revert_after_deleted_source_compute(self):
|
def test_resize_revert_after_deleted_source_compute(self):
|
||||||
"""Tests a scenario where a server is resized and while in
|
"""Tests a scenario where a server is resized and while in
|
||||||
|
@ -230,22 +239,31 @@ class TestServicesAPI(test_servers.ProviderUsageBaseTestCase):
|
||||||
# Delete the source compute service.
|
# Delete the source compute service.
|
||||||
service = self.admin_api.get_services(
|
service = self.admin_api.get_services(
|
||||||
binary='nova-compute', host='host1')[0]
|
binary='nova-compute', host='host1')[0]
|
||||||
self.admin_api.api_delete('/os-services/%s' % service['id'])
|
# We expect the delete request to fail with a 409 error because of the
|
||||||
# FIXME(mriedem): This is bug 1852610 where the compute service is
|
# instance in VERIFY_RESIZE status even though that instance is marked
|
||||||
# deleted but the resource provider is not because there are still
|
# as being on host2 now.
|
||||||
# migration-based allocations against the source node provider.
|
ex = self.assertRaises(api_client.OpenStackApiException,
|
||||||
|
self.admin_api.api_delete,
|
||||||
|
'/os-services/%s' % service['id'])
|
||||||
|
self.assertEqual(409, ex.response.status_code)
|
||||||
|
self.assertIn('Unable to delete compute service that has in-progress '
|
||||||
|
'migrations', six.text_type(ex))
|
||||||
|
self.assertIn('There are 1 in-progress migrations involving the host',
|
||||||
|
self.stdlog.logger.output)
|
||||||
|
# The provider is still around because we did not delete the service.
|
||||||
resp = self.placement_api.get('/resource_providers/%s' % host1_rp_uuid)
|
resp = self.placement_api.get('/resource_providers/%s' % host1_rp_uuid)
|
||||||
self.assertEqual(200, resp.status)
|
self.assertEqual(200, resp.status)
|
||||||
self.assertFlavorMatchesUsage(host1_rp_uuid, flavor1)
|
self.assertFlavorMatchesUsage(host1_rp_uuid, flavor1)
|
||||||
# Now try to revert the resize.
|
# Now revert the resize.
|
||||||
# NOTE(mriedem): This actually works because the drop_move_claim
|
self._revert_resize(server)
|
||||||
# happens in revert_resize on the dest host which still has its
|
|
||||||
# ComputeNode record. The migration-based allocations are reverted
|
|
||||||
# so the instance holds the allocations for the source provider and
|
|
||||||
# the allocations against the dest provider are dropped.
|
|
||||||
self.api.post_server_action(server['id'], {'revertResize': None})
|
|
||||||
self._wait_for_state_change(self.api, server, 'ACTIVE')
|
|
||||||
self.assertNotIn('ComputeHostNotFound', self.stdlog.logger.output)
|
|
||||||
self.assertFlavorMatchesUsage(host1_rp_uuid, flavor1)
|
self.assertFlavorMatchesUsage(host1_rp_uuid, flavor1)
|
||||||
zero_flavor = {'vcpus': 0, 'ram': 0, 'disk': 0, 'extra_specs': {}}
|
zero_flavor = {'vcpus': 0, 'ram': 0, 'disk': 0, 'extra_specs': {}}
|
||||||
self.assertFlavorMatchesUsage(host2_rp_uuid, zero_flavor)
|
self.assertFlavorMatchesUsage(host2_rp_uuid, zero_flavor)
|
||||||
|
# Delete the host2 service since the migration is reverted and the
|
||||||
|
# server is on host1 again.
|
||||||
|
service2 = self.admin_api.get_services(
|
||||||
|
binary='nova-compute', host='host2')[0]
|
||||||
|
self.admin_api.api_delete('/os-services/%s' % service2['id'])
|
||||||
|
# The host2 resource provider should be gone.
|
||||||
|
resp = self.placement_api.get('/resource_providers/%s' % host2_rp_uuid)
|
||||||
|
self.assertEqual(404, resp.status)
|
||||||
|
|
|
@ -0,0 +1,10 @@
|
||||||
|
---
|
||||||
|
fixes:
|
||||||
|
- |
|
||||||
|
The ``DELETE /os-services/{service_id}`` compute API will now return a
|
||||||
|
``409 HTTPConflict`` response when trying to delete a ``nova-compute``
|
||||||
|
service which is involved in in-progress migrations. This is because doing
|
||||||
|
so would not only orphan the compute node resource provider in the
|
||||||
|
placement service on which those instances have resource allocations but
|
||||||
|
can also break the ability to confirm/revert a pending resize properly.
|
||||||
|
See https://bugs.launchpad.net/nova/+bug/1852610 for more details.
|
Loading…
Reference in New Issue