From 30a635068512be558acf0f9c83185dc1aaad560f Mon Sep 17 00:00:00 2001 From: Matt Riedemann Date: Thu, 14 Nov 2019 14:19:26 -0500 Subject: [PATCH] Block deleting compute services with in-progress migrations This builds on I0bd63b655ad3d3d39af8d15c781ce0a45efc8e3a which made DELETE /os-services/{service_id} fail with a 409 response if the host has instances on it. This change checks for in-progress migrations involving the nodes on the host, either as the source or destination nodes, and returns a 409 error response if any are found. Failling to do this can lead to orphaned resource providers in placement and also failing to properly confirm or revert a pending resize or cold migration. A release note is included for the (justified) behavior change in the API. A new microversion should not be required for this since admins should not have to opt out of broken behavior. Conflicts: nova/tests/functional/integrated_helpers.py NOTE(mriedem): The conflict is due to not having change Ie991d4b53e9bb5e7ec26da99219178ab7695abf6 in Rocky. Change-Id: I70e06c607045a1c0842f13069e51fef438012a9c Closes-Bug: #1852610 (cherry picked from commit 92fed026103b47fa2a76ea09204a4ba24c21e191) (cherry picked from commit a9650b3cbfc674e283964090fb64ac6297be5b78) (cherry picked from commit a0290858b717b4cefd0d6fc17acc2b143ab12ac4) --- api-ref/source/os-services.inc | 6 ++ nova/api/openstack/compute/services.py | 45 ++++++++++++- nova/tests/functional/integrated_helpers.py | 12 ++++ nova/tests/functional/wsgi/test_services.py | 64 ++++++++++++------- ...lete-with-migrations-ca0565fc0b503519.yaml | 10 +++ 5 files changed, 112 insertions(+), 25 deletions(-) create mode 100644 releasenotes/notes/bug-1852610-service-delete-with-migrations-ca0565fc0b503519.yaml diff --git a/api-ref/source/os-services.inc b/api-ref/source/os-services.inc index 0f2c5cbc2711..b64ddb7d03e4 100644 --- a/api-ref/source/os-services.inc +++ b/api-ref/source/os-services.inc @@ -322,6 +322,12 @@ Attempts to delete a ``nova-compute`` service which is still hosting instances will result in a 409 HTTPConflict response. The instances will need to be migrated or deleted before a compute service can be deleted. +Similarly, attempts to delete a ``nova-compute`` service which is involved in +in-progress migrations will result in a 409 HTTPConflict response. The +migrations will need to be completed, for example confirming or reverting a +resize, or the instances will need to be deleted before the compute service can +be deleted. + .. important:: Be sure to stop the actual ``nova-compute`` process on the physical host *before* deleting the service with this API. Failing to do so can lead to the running service re-creating diff --git a/nova/api/openstack/compute/services.py b/nova/api/openstack/compute/services.py index b5aa42cdd56c..7783b8d415de 100644 --- a/nova/api/openstack/compute/services.py +++ b/nova/api/openstack/compute/services.py @@ -12,6 +12,7 @@ # License for the specific language governing permissions and limitations # under the License. +from oslo_log import log as logging from oslo_utils import strutils from oslo_utils import uuidutils import webob.exc @@ -32,6 +33,8 @@ from nova import utils UUID_FOR_ID_MIN_VERSION = '2.53' +LOG = logging.getLogger(__name__) + class ServiceController(wsgi.Controller): @@ -238,6 +241,16 @@ class ServiceController(wsgi.Controller): 'is hosting instances. Migrate or ' 'delete the instances first.')) + # Similarly, check to see if the are any in-progress migrations + # involving this host because if there are we need to block the + # service delete since we could orphan resource providers and + # break the ability to do things like confirm/revert instances + # in VERIFY_RESIZE status. + compute_nodes = objects.ComputeNodeList.get_all_by_host( + context, service.host) + self._assert_no_in_progress_migrations( + context, id, compute_nodes) + aggrs = self.aggregate_api.get_aggregates_by_host(context, service.host) for ag in aggrs: @@ -248,8 +261,6 @@ class ServiceController(wsgi.Controller): # placement for the compute nodes managed by this service; # remember that an ironic compute service can manage multiple # nodes - compute_nodes = objects.ComputeNodeList.get_all_by_host( - context, service.host) for compute_node in compute_nodes: self.placementclient.delete_resource_provider( context, compute_node, cascade=True) @@ -271,6 +282,36 @@ class ServiceController(wsgi.Controller): explanation = _("Service id %s refers to multiple services.") % id raise webob.exc.HTTPBadRequest(explanation=explanation) + @staticmethod + def _assert_no_in_progress_migrations(context, service_id, compute_nodes): + """Ensures there are no in-progress migrations on the given nodes. + + :param context: nova auth RequestContext + :param service_id: id of the Service being deleted + :param compute_nodes: ComputeNodeList of nodes on a compute service + :raises: HTTPConflict if there are any in-progress migrations on the + nodes + """ + for cn in compute_nodes: + migrations = ( + objects.MigrationList.get_in_progress_by_host_and_node( + context, cn.host, cn.hypervisor_hostname)) + if migrations: + # Log the migrations for the operator and then raise + # a 409 error. + LOG.info('Unable to delete compute service with id %s ' + 'for host %s. There are %i in-progress ' + 'migrations involving the host. Migrations ' + '(uuid:status): %s', + service_id, cn.host, len(migrations), + ','.join(['%s:%s' % (mig.uuid, mig.status) + for mig in migrations])) + raise webob.exc.HTTPConflict( + explanation=_( + 'Unable to delete compute service that has ' + 'in-progress migrations. Complete the ' + 'migrations or delete the instances first.')) + @validation.query_schema(services.index_query_schema) @wsgi.expected_errors(()) def index(self, req): diff --git a/nova/tests/functional/integrated_helpers.py b/nova/tests/functional/integrated_helpers.py index 5920f1f57545..46947c88ffc6 100644 --- a/nova/tests/functional/integrated_helpers.py +++ b/nova/tests/functional/integrated_helpers.py @@ -707,3 +707,15 @@ class ProviderUsageBaseTestCase(test.TestCase, InstanceHelperMixin): server, request=request, old_flavor=old_flavor, new_flavor=new_flavor, source_rp_uuid=source_rp_uuid, dest_rp_uuid=dest_rp_uuid) + + def _revert_resize(self, server): + self.api.post_server_action(server['id'], {'revertResize': None}) + server = self._wait_for_state_change(self.api, server, 'ACTIVE') + self._wait_for_migration_status(server, ['reverted']) + # Note that the migration status is changed to "reverted" in the + # dest host revert_resize method but the allocations are cleaned up + # in the source host finish_revert_resize method so we need to wait + # for the finish_revert_resize method to complete. + fake_notifier.wait_for_versioned_notifications( + 'instance.resize_revert.end') + return server diff --git a/nova/tests/functional/wsgi/test_services.py b/nova/tests/functional/wsgi/test_services.py index e90ec07fddbf..016eefa049b8 100644 --- a/nova/tests/functional/wsgi/test_services.py +++ b/nova/tests/functional/wsgi/test_services.py @@ -193,21 +193,30 @@ class TestServicesAPI(integrated_helpers.ProviderUsageBaseTestCase): # Delete the source compute service. service = self.admin_api.get_services( binary='nova-compute', host='host1')[0] - self.admin_api.api_delete('/os-services/%s' % service['id']) - # FIXME(mriedem): This is bug 1852610 where the compute service is - # deleted but the resource provider is not because there are still - # migration-based allocations against the source node provider. + # We expect the delete request to fail with a 409 error because of the + # instance in VERIFY_RESIZE status even though that instance is marked + # as being on host2 now. + ex = self.assertRaises(api_client.OpenStackApiException, + self.admin_api.api_delete, + '/os-services/%s' % service['id']) + self.assertEqual(409, ex.response.status_code) + self.assertIn('Unable to delete compute service that has in-progress ' + 'migrations', six.text_type(ex)) + self.assertIn('There are 1 in-progress migrations involving the host', + self.stdlog.logger.output) + # The provider is still around because we did not delete the service. resp = self.placement_api.get('/resource_providers/%s' % host1_rp_uuid) self.assertEqual(200, resp.status) self.assertFlavorMatchesUsage(host1_rp_uuid, flavor) # Now try to confirm the migration. - # FIXME(mriedem): This will fail until bug 1852610 is fixed and the - # source compute service delete is blocked while there is an - # in-progress migration involving the node. - self.assertNotIn('ComputeHostNotFound', self.stdlog.logger.output) self.api.post_server_action(server['id'], {'confirmResize': None}) - self._wait_for_state_change(self.api, server, 'ERROR') - self.assertIn('ComputeHostNotFound', self.stdlog.logger.output) + self._wait_for_state_change(self.api, server, 'ACTIVE') + # Delete the host1 service since the migration is confirmed and the + # server is on host2. + self.admin_api.api_delete('/os-services/%s' % service['id']) + # The host1 resource provider should be gone. + resp = self.placement_api.get('/resource_providers/%s' % host1_rp_uuid) + self.assertEqual(404, resp.status) def test_resize_revert_after_deleted_source_compute(self): """Tests a scenario where a server is resized and while in @@ -230,22 +239,31 @@ class TestServicesAPI(integrated_helpers.ProviderUsageBaseTestCase): # Delete the source compute service. service = self.admin_api.get_services( binary='nova-compute', host='host1')[0] - self.admin_api.api_delete('/os-services/%s' % service['id']) - # FIXME(mriedem): This is bug 1852610 where the compute service is - # deleted but the resource provider is not because there are still - # migration-based allocations against the source node provider. + # We expect the delete request to fail with a 409 error because of the + # instance in VERIFY_RESIZE status even though that instance is marked + # as being on host2 now. + ex = self.assertRaises(api_client.OpenStackApiException, + self.admin_api.api_delete, + '/os-services/%s' % service['id']) + self.assertEqual(409, ex.response.status_code) + self.assertIn('Unable to delete compute service that has in-progress ' + 'migrations', six.text_type(ex)) + self.assertIn('There are 1 in-progress migrations involving the host', + self.stdlog.logger.output) + # The provider is still around because we did not delete the service. resp = self.placement_api.get('/resource_providers/%s' % host1_rp_uuid) self.assertEqual(200, resp.status) self.assertFlavorMatchesUsage(host1_rp_uuid, flavor1) - # Now try to revert the resize. - # NOTE(mriedem): This actually works because the drop_move_claim - # happens in revert_resize on the dest host which still has its - # ComputeNode record. The migration-based allocations are reverted - # so the instance holds the allocations for the source provider and - # the allocations against the dest provider are dropped. - self.api.post_server_action(server['id'], {'revertResize': None}) - self._wait_for_state_change(self.api, server, 'ACTIVE') - self.assertNotIn('ComputeHostNotFound', self.stdlog.logger.output) + # Now revert the resize. + self._revert_resize(server) self.assertFlavorMatchesUsage(host1_rp_uuid, flavor1) zero_flavor = {'vcpus': 0, 'ram': 0, 'disk': 0, 'extra_specs': {}} self.assertFlavorMatchesUsage(host2_rp_uuid, zero_flavor) + # Delete the host2 service since the migration is reverted and the + # server is on host1 again. + service2 = self.admin_api.get_services( + binary='nova-compute', host='host2')[0] + self.admin_api.api_delete('/os-services/%s' % service2['id']) + # The host2 resource provider should be gone. + resp = self.placement_api.get('/resource_providers/%s' % host2_rp_uuid) + self.assertEqual(404, resp.status) diff --git a/releasenotes/notes/bug-1852610-service-delete-with-migrations-ca0565fc0b503519.yaml b/releasenotes/notes/bug-1852610-service-delete-with-migrations-ca0565fc0b503519.yaml new file mode 100644 index 000000000000..7bee8090c272 --- /dev/null +++ b/releasenotes/notes/bug-1852610-service-delete-with-migrations-ca0565fc0b503519.yaml @@ -0,0 +1,10 @@ +--- +fixes: + - | + The ``DELETE /os-services/{service_id}`` compute API will now return a + ``409 HTTPConflict`` response when trying to delete a ``nova-compute`` + service which is involved in in-progress migrations. This is because doing + so would not only orphan the compute node resource provider in the + placement service on which those instances have resource allocations but + can also break the ability to confirm/revert a pending resize properly. + See https://bugs.launchpad.net/nova/+bug/1852610 for more details.