From 3c0eadae0b9ec48586087ea6c0c4e9176f0aa3bc Mon Sep 17 00:00:00 2001 From: Balazs Gibizer Date: Fri, 10 Dec 2021 17:53:30 +0100 Subject: [PATCH] Reject AZ changes during aggregate add / remove host After this patch nova rejects the add host to aggregate API action if the host has instances and the new aggregate for the host would mean that these instances need to move from one AZ (even from the default one) to another. Such AZ change is not implemented in nova and currently leads to stuck instances. Similarly nova will reject remove host from aggregate API action if the host has instances and the aggregate removal would mean that the instances need to change AZ. Depends-On: https://review.opendev.org/c/openstack/tempest/+/821732 Change-Id: I19c4c6d34aa2cc1f32d81e8c1a52762fa3a18580 Closes-Bug: #1907775 --- api-ref/source/os-aggregates.inc | 6 ++ doc/source/admin/aggregates.rst | 8 ++ doc/source/admin/availability-zones.rst | 9 ++ nova/api/openstack/compute/aggregates.py | 4 +- nova/compute/api.py | 98 +++++++++++++++++++ nova/tests/functional/test_aggregates.py | 40 ++++++-- nova/tests/unit/compute/test_compute.py | 4 + ...ing-aggregate-update-64d319d0717ed704.yaml | 12 +++ 8 files changed, 168 insertions(+), 13 deletions(-) create mode 100644 releasenotes/notes/reject-az-changes-of-an-instance-during-aggregate-update-64d319d0717ed704.yaml diff --git a/api-ref/source/os-aggregates.inc b/api-ref/source/os-aggregates.inc index 6021c8e8607a..dd5d929d01ee 100644 --- a/api-ref/source/os-aggregates.inc +++ b/api-ref/source/os-aggregates.inc @@ -214,6 +214,9 @@ Adds a host to an aggregate. Specify the ``add_host`` action and host name in the request body. +It is not allowed to move a host with servers between Availability Zones. Such +request is rejected with 409 error. + Normal response codes: 200 Error response codes: badRequest(400), unauthorized(401), forbidden(403), @@ -264,6 +267,9 @@ Removes a host from an aggregate. Specify the ``remove_host`` action and host name in the request body. +It is not allowed to move a host with servers between Availability Zones. Such +request is rejected with 409 error. + Normal response codes: 200 Error response codes: badRequest(400), unauthorized(401), forbidden(403), diff --git a/doc/source/admin/aggregates.rst b/doc/source/admin/aggregates.rst index 621af8caa420..ff22c1bbc332 100644 --- a/doc/source/admin/aggregates.rst +++ b/doc/source/admin/aggregates.rst @@ -32,6 +32,14 @@ availability zone where instances will be scheduled when the user fails to specify one. For more information on how to do this, refer to :doc:`/admin/availability-zones`. +.. note:: + + It is not allowed to move instances between Availability Zones. + If adding a host to an aggregate or removing a host from an aggregate would + cause an instance to move between Availability Zones, including moving + from or moving to the default AZ, then the operation will be rejected. The + administrator should drain the instances from the host first then the host + can be moved. .. _config-sch-for-aggs: diff --git a/doc/source/admin/availability-zones.rst b/doc/source/admin/availability-zones.rst index 282e8fc9b324..72704790a439 100644 --- a/doc/source/admin/availability-zones.rst +++ b/doc/source/admin/availability-zones.rst @@ -55,6 +55,15 @@ when comparing availability zones and host aggregates: users to specify hosts where instances are launched in server creation. See `Using availability zones to select hosts`_ for more information. + .. note:: + + It is not allowed to move instances between Availability Zones. + If adding a host to an aggregate or removing a host from an aggregate + would cause an instance to move between Availability Zones, including + moving from or moving to the default AZ, then the operation will be + rejected. The administrator should drain the instances from the host + first then the host can be moved. + In addition, other services, such as the :neutron-doc:`networking service <>` and the :cinder-doc:`block storage service <>`, also provide an availability zone feature. However, the implementation of these features differs vastly diff --git a/nova/api/openstack/compute/aggregates.py b/nova/api/openstack/compute/aggregates.py index 45bd904ec032..21ec897b5ef9 100644 --- a/nova/api/openstack/compute/aggregates.py +++ b/nova/api/openstack/compute/aggregates.py @@ -219,9 +219,7 @@ class AggregateController(wsgi.Controller): exception.ResourceProviderUpdateConflict) as e: LOG.error('Failed to remove host %s from aggregate %s. Error: %s', host, id, str(e)) - msg = _('Cannot remove host %(host)s in aggregate %(id)s') % { - 'host': host, 'id': id} - raise exc.HTTPConflict(explanation=msg) + raise exc.HTTPConflict(explanation=e.format_message()) return self._marshall_aggregate(req, aggregate) @wsgi.expected_errors((400, 404)) diff --git a/nova/compute/api.py b/nova/compute/api.py index 09dad41dd8db..f77d909f5326 100644 --- a/nova/compute/api.py +++ b/nova/compute/api.py @@ -6683,6 +6683,51 @@ class AggregateAPI: availability_zones.update_host_availability_zone_cache(context, host_name) + def ensure_no_instances_need_to_move_az_when_host_added( + self, context, aggregate, host_name + ): + instances = objects.InstanceList.get_by_host(context, host_name) + if not instances: + # if no instance then nothing moves + return + + new_az = aggregate.metadata.get('availability_zone') + if not new_az: + # if we add a host to an aggregate without AZ that cannot change + # existing, effective AZ of the host. The host was either not + # in any AZ and will not be in an AZ. Or the host was already in + # an AZ but this aggregate does not challenge that as it has no AZ. + return + + # let's gather what is the AZ of the instances on the host before the + # host is added to the aggregate + aggregates = objects.AggregateList.get_by_host(context, host_name) + az = { + agg.metadata['availability_zone'] + for agg in aggregates + if 'availability_zone' in agg.metadata} + + # There can only be one or zero AZ names. Two different AZ names case + # is already rejected by is_safe_to_update_az() + old_az = list(az)[0] if az else None + + # So here we know that the host is being added to a new AZ if it is + # different from the existing, effective AZ of the host then the + # instances on this host would need to move between AZs, that is not + # supported. So reject it. + if old_az != new_az: + msg = _( + "The host cannot be added to the aggregate as the " + "availability zone of the host would change from '%s' to '%s' " + "but the host already has %d instance(s). Changing the AZ of " + "an existing instance is not supported by this action. Move " + "the instances away from this host then try again. If you " + "need to move the instances between AZs then you can use " + "shelve_offload and unshelve to achieve this." + ) % (old_az, new_az, len(instances)) + self._raise_invalid_aggregate_exc( + AGGREGATE_ACTION_ADD, aggregate.id, msg) + @wrap_exception() def add_host_to_aggregate(self, context, aggregate_id, host_name): """Adds the host to an aggregate.""" @@ -6711,6 +6756,8 @@ class AggregateAPI: self.is_safe_to_update_az(context, aggregate.metadata, hosts=[host_name], aggregate=aggregate) + self.ensure_no_instances_need_to_move_az_when_host_added( + context, aggregate, host_name) aggregate.add_host(host_name) self.query_client.update_aggregates(context, [aggregate]) @@ -6746,6 +6793,54 @@ class AggregateAPI: return aggregate + def ensure_no_instances_need_to_move_az_when_host_removed( + self, context, aggregate, host_name + ): + instances = objects.InstanceList.get_by_host(context, host_name) + if not instances: + # if no instance then nothing moves + return + + current_az = aggregate.metadata.get('availability_zone') + if not current_az: + # if we remove a host from an aggregate without AZ that cannot + # change existing, effective AZ of the host. If the host has an AZ + # before the removal then that is due to a different aggregate + # membership so that does not change here. If the host has no AZ + # before the removal then it won't have either after the removal + # from an aggregate without az + return + + # let's gather what would be the AZ of the instances on the host + # if we exclude the current aggregate. + aggregates = objects.AggregateList.get_by_host(context, host_name) + azs = { + agg.metadata['availability_zone'] + for agg in aggregates + if agg.id != aggregate.id and 'availability_zone' in agg.metadata + } + + # There can only be one or zero AZ names. Two different AZ names case + # is already rejected by is_safe_to_update_az() + new_az = list(azs)[0] if azs else None + + # So here we know that the host is being removed from an aggregate + # that has an AZ. So if the new AZ without this aggregate is different + # then, that would mean the instances on this host need to change AZ. + # That is not supported. + if current_az != new_az: + msg = _( + "The host cannot be removed from the aggregate as the " + "availability zone of the host would change from '%s' to '%s' " + "but the host already has %d instance(s). Changing the AZ of " + "an existing instance is not supported by this action. Move " + "the instances away from this host then try again. If you " + "need to move the instances between AZs then you can use " + "shelve_offload and unshelve to achieve this." + ) % (current_az, new_az, len(instances)) + self._raise_invalid_aggregate_exc( + AGGREGATE_ACTION_DELETE, aggregate.id, msg) + @wrap_exception() def remove_host_from_aggregate(self, context, aggregate_id, host_name): """Removes host from the aggregate.""" @@ -6763,6 +6858,9 @@ class AggregateAPI: action=fields_obj.NotificationAction.REMOVE_HOST, phase=fields_obj.NotificationPhase.START) + self.ensure_no_instances_need_to_move_az_when_host_removed( + context, aggregate, host_name) + # Remove the resource provider from the provider aggregate first before # we change anything on the nova side because if we did the nova stuff # first we can't re-attempt this from the compute API if cleaning up diff --git a/nova/tests/functional/test_aggregates.py b/nova/tests/functional/test_aggregates.py index 6bbbf3cfdf46..a3f14000dcd1 100644 --- a/nova/tests/functional/test_aggregates.py +++ b/nova/tests/functional/test_aggregates.py @@ -389,13 +389,23 @@ class AggregateHostMoveTestCase(AggregateRequestFiltersTest): # server will be in default AZ self._boot_server(host=self.host) - # FIXME(stephenfin): This is bug #1907775, where we should reject the - # request to add a host to an aggregate when and instance would need - # to move between AZs - # The server would need to move from default AZ to custom AZ, that # is not OK - self._add_host_to_aggregate('ag3-custom-az', self.host) + ex = self.assertRaises( + client.OpenStackApiException, + self._add_host_to_aggregate, 'ag3-custom-az', self.host + ) + self.assertEqual(409, ex.response.status_code) + self.assertIn( + "Reason: The host cannot be added to the aggregate as the " + "availability zone of the host would change from 'None' to " + "'custom-az' but the host already has 1 instance(s). " + "Changing the AZ of an existing instance is not supported by this " + "action. Move the instances away from this host then try again. " + "If you need to move the instances between AZs then you can use " + "shelve_offload and unshelve to achieve this.", + str(ex) + ) def test_add_host_with_instances_custom_az_then_default(self): self._add_host_to_aggregate('ag3-custom-az', self.host) @@ -432,12 +442,22 @@ class AggregateHostMoveTestCase(AggregateRequestFiltersTest): # server will be in custom AZ self._boot_server(host=self.host) - # FIXME(stephenfin): This is bug #1907775, where we should reject the - # request to remove a host from the aggregate when there are instances - # on said host - # The server would need to move to the default AZ, that is not OK. - self._remove_host_from_aggregate('ag3-custom-az', self.host) + ex = self.assertRaises( + client.OpenStackApiException, + self._remove_host_from_aggregate, 'ag3-custom-az', self.host + ) + self.assertEqual(409, ex.response.status_code) + self.assertIn( + "Reason: The host cannot be removed from the aggregate as the " + "availability zone of the host would change from 'custom-az' to " + "'None' but the host already has 1 instance(s). Changing the AZ " + "of an existing instance is not supported by this action. Move " + "the instances away from this host then try again. If you " + "need to move the instances between AZs then you can use " + "shelve_offload and unshelve to achieve this.", + str(ex) + ) def test_moving_host_around_az_without_instances(self): # host moving from default to custom AZ diff --git a/nova/tests/unit/compute/test_compute.py b/nova/tests/unit/compute/test_compute.py index 14e3fb8d7725..cecefc2c9c3d 100644 --- a/nova/tests/unit/compute/test_compute.py +++ b/nova/tests/unit/compute/test_compute.py @@ -13246,6 +13246,8 @@ class ComputeAPIAggrCallsSchedulerTestCase(test.NoDBTestCase): mock_get_all_by_host, ): self.api.is_safe_to_update_az = mock.Mock() + self.api.ensure_no_instances_need_to_move_az_when_host_added = ( + mock.Mock()) self.api._update_az_cache_for_host = mock.Mock() agg = objects.Aggregate(name='fake', metadata={}, uuid=uuids.agg) agg.add_host = mock.Mock() @@ -13274,6 +13276,8 @@ class ComputeAPIAggrCallsSchedulerTestCase(test.NoDBTestCase): self, update_aggregates, mock_notify, mock_remove_host, mock_get_all_by_host, ): + self.api.ensure_no_instances_need_to_move_az_when_host_removed = ( + mock.Mock()) self.api._update_az_cache_for_host = mock.Mock() agg = objects.Aggregate(name='fake', metadata={}, uuid=uuids.agg) agg.delete_host = mock.Mock() diff --git a/releasenotes/notes/reject-az-changes-of-an-instance-during-aggregate-update-64d319d0717ed704.yaml b/releasenotes/notes/reject-az-changes-of-an-instance-during-aggregate-update-64d319d0717ed704.yaml new file mode 100644 index 000000000000..6639161ee8fd --- /dev/null +++ b/releasenotes/notes/reject-az-changes-of-an-instance-during-aggregate-update-64d319d0717ed704.yaml @@ -0,0 +1,12 @@ +--- +fixes: + - | + Nova now ensures that an instance cannot move between availability zones + when the host of the instance is added or removed to an aggregate that is + part of another availability zone. Moving from or to the default + availability zone is also rejected. + + This resolves `bug 1907775`_ where after such move the instance become + stuck in between availability zones. + + .. _bug 1907775: https://bugs.launchpad.net/nova/+bug/1907775