Selaa lähdekoodia

Merge "Block deleting compute services with in-progress migrations" into stable/rocky

tags/18.3.0
Zuul Gerrit Code Review 4 kuukautta sitten
vanhempi
commit
619f1e6a97
5 muutettua tiedostoa jossa 112 lisäystä ja 25 poistoa
  1. +6
    -0
      api-ref/source/os-services.inc
  2. +43
    -2
      nova/api/openstack/compute/services.py
  3. +12
    -0
      nova/tests/functional/integrated_helpers.py
  4. +41
    -23
      nova/tests/functional/wsgi/test_services.py
  5. +10
    -0
      releasenotes/notes/bug-1852610-service-delete-with-migrations-ca0565fc0b503519.yaml

+ 6
- 0
api-ref/source/os-services.inc Näytä tiedosto

@@ -322,6 +322,12 @@ Attempts to delete a ``nova-compute`` service which is still hosting instances
will result in a 409 HTTPConflict response. The instances will need to be
migrated or deleted before a compute service can be deleted.

Similarly, attempts to delete a ``nova-compute`` service which is involved in
in-progress migrations will result in a 409 HTTPConflict response. The
migrations will need to be completed, for example confirming or reverting a
resize, or the instances will need to be deleted before the compute service can
be deleted.

.. important:: Be sure to stop the actual ``nova-compute`` process on the
physical host *before* deleting the service with this API.
Failing to do so can lead to the running service re-creating


+ 43
- 2
nova/api/openstack/compute/services.py Näytä tiedosto

@@ -12,6 +12,7 @@
# License for the specific language governing permissions and limitations
# under the License.

from oslo_log import log as logging
from oslo_utils import strutils
from oslo_utils import uuidutils
import webob.exc
@@ -32,6 +33,8 @@ from nova import utils

UUID_FOR_ID_MIN_VERSION = '2.53'

LOG = logging.getLogger(__name__)


class ServiceController(wsgi.Controller):

@@ -238,6 +241,16 @@ class ServiceController(wsgi.Controller):
'is hosting instances. Migrate or '
'delete the instances first.'))

# Similarly, check to see if the are any in-progress migrations
# involving this host because if there are we need to block the
# service delete since we could orphan resource providers and
# break the ability to do things like confirm/revert instances
# in VERIFY_RESIZE status.
compute_nodes = objects.ComputeNodeList.get_all_by_host(
context, service.host)
self._assert_no_in_progress_migrations(
context, id, compute_nodes)

aggrs = self.aggregate_api.get_aggregates_by_host(context,
service.host)
for ag in aggrs:
@@ -248,8 +261,6 @@ class ServiceController(wsgi.Controller):
# placement for the compute nodes managed by this service;
# remember that an ironic compute service can manage multiple
# nodes
compute_nodes = objects.ComputeNodeList.get_all_by_host(
context, service.host)
for compute_node in compute_nodes:
self.placementclient.delete_resource_provider(
context, compute_node, cascade=True)
@@ -271,6 +282,36 @@ class ServiceController(wsgi.Controller):
explanation = _("Service id %s refers to multiple services.") % id
raise webob.exc.HTTPBadRequest(explanation=explanation)

@staticmethod
def _assert_no_in_progress_migrations(context, service_id, compute_nodes):
"""Ensures there are no in-progress migrations on the given nodes.

:param context: nova auth RequestContext
:param service_id: id of the Service being deleted
:param compute_nodes: ComputeNodeList of nodes on a compute service
:raises: HTTPConflict if there are any in-progress migrations on the
nodes
"""
for cn in compute_nodes:
migrations = (
objects.MigrationList.get_in_progress_by_host_and_node(
context, cn.host, cn.hypervisor_hostname))
if migrations:
# Log the migrations for the operator and then raise
# a 409 error.
LOG.info('Unable to delete compute service with id %s '
'for host %s. There are %i in-progress '
'migrations involving the host. Migrations '
'(uuid:status): %s',
service_id, cn.host, len(migrations),
','.join(['%s:%s' % (mig.uuid, mig.status)
for mig in migrations]))
raise webob.exc.HTTPConflict(
explanation=_(
'Unable to delete compute service that has '
'in-progress migrations. Complete the '
'migrations or delete the instances first.'))

@validation.query_schema(services.index_query_schema)
@wsgi.expected_errors(())
def index(self, req):


+ 12
- 0
nova/tests/functional/integrated_helpers.py Näytä tiedosto

@@ -707,3 +707,15 @@ class ProviderUsageBaseTestCase(test.TestCase, InstanceHelperMixin):
server, request=request, old_flavor=old_flavor,
new_flavor=new_flavor, source_rp_uuid=source_rp_uuid,
dest_rp_uuid=dest_rp_uuid)

def _revert_resize(self, server):
self.api.post_server_action(server['id'], {'revertResize': None})
server = self._wait_for_state_change(self.api, server, 'ACTIVE')
self._wait_for_migration_status(server, ['reverted'])
# Note that the migration status is changed to "reverted" in the
# dest host revert_resize method but the allocations are cleaned up
# in the source host finish_revert_resize method so we need to wait
# for the finish_revert_resize method to complete.
fake_notifier.wait_for_versioned_notifications(
'instance.resize_revert.end')
return server

+ 41
- 23
nova/tests/functional/wsgi/test_services.py Näytä tiedosto

@@ -193,21 +193,30 @@ class TestServicesAPI(integrated_helpers.ProviderUsageBaseTestCase):
# Delete the source compute service.
service = self.admin_api.get_services(
binary='nova-compute', host='host1')[0]
self.admin_api.api_delete('/os-services/%s' % service['id'])
# FIXME(mriedem): This is bug 1852610 where the compute service is
# deleted but the resource provider is not because there are still
# migration-based allocations against the source node provider.
# We expect the delete request to fail with a 409 error because of the
# instance in VERIFY_RESIZE status even though that instance is marked
# as being on host2 now.
ex = self.assertRaises(api_client.OpenStackApiException,
self.admin_api.api_delete,
'/os-services/%s' % service['id'])
self.assertEqual(409, ex.response.status_code)
self.assertIn('Unable to delete compute service that has in-progress '
'migrations', six.text_type(ex))
self.assertIn('There are 1 in-progress migrations involving the host',
self.stdlog.logger.output)
# The provider is still around because we did not delete the service.
resp = self.placement_api.get('/resource_providers/%s' % host1_rp_uuid)
self.assertEqual(200, resp.status)
self.assertFlavorMatchesUsage(host1_rp_uuid, flavor)
# Now try to confirm the migration.
# FIXME(mriedem): This will fail until bug 1852610 is fixed and the
# source compute service delete is blocked while there is an
# in-progress migration involving the node.
self.assertNotIn('ComputeHostNotFound', self.stdlog.logger.output)
self.api.post_server_action(server['id'], {'confirmResize': None})
self._wait_for_state_change(self.api, server, 'ERROR')
self.assertIn('ComputeHostNotFound', self.stdlog.logger.output)
self._wait_for_state_change(self.api, server, 'ACTIVE')
# Delete the host1 service since the migration is confirmed and the
# server is on host2.
self.admin_api.api_delete('/os-services/%s' % service['id'])
# The host1 resource provider should be gone.
resp = self.placement_api.get('/resource_providers/%s' % host1_rp_uuid)
self.assertEqual(404, resp.status)

def test_resize_revert_after_deleted_source_compute(self):
"""Tests a scenario where a server is resized and while in
@@ -230,22 +239,31 @@ class TestServicesAPI(integrated_helpers.ProviderUsageBaseTestCase):
# Delete the source compute service.
service = self.admin_api.get_services(
binary='nova-compute', host='host1')[0]
self.admin_api.api_delete('/os-services/%s' % service['id'])
# FIXME(mriedem): This is bug 1852610 where the compute service is
# deleted but the resource provider is not because there are still
# migration-based allocations against the source node provider.
# We expect the delete request to fail with a 409 error because of the
# instance in VERIFY_RESIZE status even though that instance is marked
# as being on host2 now.
ex = self.assertRaises(api_client.OpenStackApiException,
self.admin_api.api_delete,
'/os-services/%s' % service['id'])
self.assertEqual(409, ex.response.status_code)
self.assertIn('Unable to delete compute service that has in-progress '
'migrations', six.text_type(ex))
self.assertIn('There are 1 in-progress migrations involving the host',
self.stdlog.logger.output)
# The provider is still around because we did not delete the service.
resp = self.placement_api.get('/resource_providers/%s' % host1_rp_uuid)
self.assertEqual(200, resp.status)
self.assertFlavorMatchesUsage(host1_rp_uuid, flavor1)
# Now try to revert the resize.
# NOTE(mriedem): This actually works because the drop_move_claim
# happens in revert_resize on the dest host which still has its
# ComputeNode record. The migration-based allocations are reverted
# so the instance holds the allocations for the source provider and
# the allocations against the dest provider are dropped.
self.api.post_server_action(server['id'], {'revertResize': None})
self._wait_for_state_change(self.api, server, 'ACTIVE')
self.assertNotIn('ComputeHostNotFound', self.stdlog.logger.output)
# Now revert the resize.
self._revert_resize(server)
self.assertFlavorMatchesUsage(host1_rp_uuid, flavor1)
zero_flavor = {'vcpus': 0, 'ram': 0, 'disk': 0, 'extra_specs': {}}
self.assertFlavorMatchesUsage(host2_rp_uuid, zero_flavor)
# Delete the host2 service since the migration is reverted and the
# server is on host1 again.
service2 = self.admin_api.get_services(
binary='nova-compute', host='host2')[0]
self.admin_api.api_delete('/os-services/%s' % service2['id'])
# The host2 resource provider should be gone.
resp = self.placement_api.get('/resource_providers/%s' % host2_rp_uuid)
self.assertEqual(404, resp.status)

+ 10
- 0
releasenotes/notes/bug-1852610-service-delete-with-migrations-ca0565fc0b503519.yaml Näytä tiedosto

@@ -0,0 +1,10 @@
---
fixes:
- |
The ``DELETE /os-services/{service_id}`` compute API will now return a
``409 HTTPConflict`` response when trying to delete a ``nova-compute``
service which is involved in in-progress migrations. This is because doing
so would not only orphan the compute node resource provider in the
placement service on which those instances have resource allocations but
can also break the ability to confirm/revert a pending resize properly.
See https://bugs.launchpad.net/nova/+bug/1852610 for more details.

Loading…
Peruuta
Tallenna