diff --git a/api-ref/source/os-aggregates.inc b/api-ref/source/os-aggregates.inc index 6021c8e8607a..dd5d929d01ee 100644 --- a/api-ref/source/os-aggregates.inc +++ b/api-ref/source/os-aggregates.inc @@ -214,6 +214,9 @@ Adds a host to an aggregate. Specify the ``add_host`` action and host name in the request body. +It is not allowed to move a host with servers between Availability Zones. Such +request is rejected with 409 error. + Normal response codes: 200 Error response codes: badRequest(400), unauthorized(401), forbidden(403), @@ -264,6 +267,9 @@ Removes a host from an aggregate. Specify the ``remove_host`` action and host name in the request body. +It is not allowed to move a host with servers between Availability Zones. Such +request is rejected with 409 error. + Normal response codes: 200 Error response codes: badRequest(400), unauthorized(401), forbidden(403), diff --git a/doc/source/admin/aggregates.rst b/doc/source/admin/aggregates.rst index 621af8caa420..ff22c1bbc332 100644 --- a/doc/source/admin/aggregates.rst +++ b/doc/source/admin/aggregates.rst @@ -32,6 +32,14 @@ availability zone where instances will be scheduled when the user fails to specify one. For more information on how to do this, refer to :doc:`/admin/availability-zones`. +.. note:: + + It is not allowed to move instances between Availability Zones. + If adding a host to an aggregate or removing a host from an aggregate would + cause an instance to move between Availability Zones, including moving + from or moving to the default AZ, then the operation will be rejected. The + administrator should drain the instances from the host first then the host + can be moved. .. _config-sch-for-aggs: diff --git a/doc/source/admin/availability-zones.rst b/doc/source/admin/availability-zones.rst index 282e8fc9b324..72704790a439 100644 --- a/doc/source/admin/availability-zones.rst +++ b/doc/source/admin/availability-zones.rst @@ -55,6 +55,15 @@ when comparing availability zones and host aggregates: users to specify hosts where instances are launched in server creation. See `Using availability zones to select hosts`_ for more information. + .. note:: + + It is not allowed to move instances between Availability Zones. + If adding a host to an aggregate or removing a host from an aggregate + would cause an instance to move between Availability Zones, including + moving from or moving to the default AZ, then the operation will be + rejected. The administrator should drain the instances from the host + first then the host can be moved. + In addition, other services, such as the :neutron-doc:`networking service <>` and the :cinder-doc:`block storage service <>`, also provide an availability zone feature. However, the implementation of these features differs vastly diff --git a/nova/api/openstack/compute/aggregates.py b/nova/api/openstack/compute/aggregates.py index 45bd904ec032..21ec897b5ef9 100644 --- a/nova/api/openstack/compute/aggregates.py +++ b/nova/api/openstack/compute/aggregates.py @@ -219,9 +219,7 @@ class AggregateController(wsgi.Controller): exception.ResourceProviderUpdateConflict) as e: LOG.error('Failed to remove host %s from aggregate %s. Error: %s', host, id, str(e)) - msg = _('Cannot remove host %(host)s in aggregate %(id)s') % { - 'host': host, 'id': id} - raise exc.HTTPConflict(explanation=msg) + raise exc.HTTPConflict(explanation=e.format_message()) return self._marshall_aggregate(req, aggregate) @wsgi.expected_errors((400, 404)) diff --git a/nova/compute/api.py b/nova/compute/api.py index 09dad41dd8db..f77d909f5326 100644 --- a/nova/compute/api.py +++ b/nova/compute/api.py @@ -6683,6 +6683,51 @@ class AggregateAPI: availability_zones.update_host_availability_zone_cache(context, host_name) + def ensure_no_instances_need_to_move_az_when_host_added( + self, context, aggregate, host_name + ): + instances = objects.InstanceList.get_by_host(context, host_name) + if not instances: + # if no instance then nothing moves + return + + new_az = aggregate.metadata.get('availability_zone') + if not new_az: + # if we add a host to an aggregate without AZ that cannot change + # existing, effective AZ of the host. The host was either not + # in any AZ and will not be in an AZ. Or the host was already in + # an AZ but this aggregate does not challenge that as it has no AZ. + return + + # let's gather what is the AZ of the instances on the host before the + # host is added to the aggregate + aggregates = objects.AggregateList.get_by_host(context, host_name) + az = { + agg.metadata['availability_zone'] + for agg in aggregates + if 'availability_zone' in agg.metadata} + + # There can only be one or zero AZ names. Two different AZ names case + # is already rejected by is_safe_to_update_az() + old_az = list(az)[0] if az else None + + # So here we know that the host is being added to a new AZ if it is + # different from the existing, effective AZ of the host then the + # instances on this host would need to move between AZs, that is not + # supported. So reject it. + if old_az != new_az: + msg = _( + "The host cannot be added to the aggregate as the " + "availability zone of the host would change from '%s' to '%s' " + "but the host already has %d instance(s). Changing the AZ of " + "an existing instance is not supported by this action. Move " + "the instances away from this host then try again. If you " + "need to move the instances between AZs then you can use " + "shelve_offload and unshelve to achieve this." + ) % (old_az, new_az, len(instances)) + self._raise_invalid_aggregate_exc( + AGGREGATE_ACTION_ADD, aggregate.id, msg) + @wrap_exception() def add_host_to_aggregate(self, context, aggregate_id, host_name): """Adds the host to an aggregate.""" @@ -6711,6 +6756,8 @@ class AggregateAPI: self.is_safe_to_update_az(context, aggregate.metadata, hosts=[host_name], aggregate=aggregate) + self.ensure_no_instances_need_to_move_az_when_host_added( + context, aggregate, host_name) aggregate.add_host(host_name) self.query_client.update_aggregates(context, [aggregate]) @@ -6746,6 +6793,54 @@ class AggregateAPI: return aggregate + def ensure_no_instances_need_to_move_az_when_host_removed( + self, context, aggregate, host_name + ): + instances = objects.InstanceList.get_by_host(context, host_name) + if not instances: + # if no instance then nothing moves + return + + current_az = aggregate.metadata.get('availability_zone') + if not current_az: + # if we remove a host from an aggregate without AZ that cannot + # change existing, effective AZ of the host. If the host has an AZ + # before the removal then that is due to a different aggregate + # membership so that does not change here. If the host has no AZ + # before the removal then it won't have either after the removal + # from an aggregate without az + return + + # let's gather what would be the AZ of the instances on the host + # if we exclude the current aggregate. + aggregates = objects.AggregateList.get_by_host(context, host_name) + azs = { + agg.metadata['availability_zone'] + for agg in aggregates + if agg.id != aggregate.id and 'availability_zone' in agg.metadata + } + + # There can only be one or zero AZ names. Two different AZ names case + # is already rejected by is_safe_to_update_az() + new_az = list(azs)[0] if azs else None + + # So here we know that the host is being removed from an aggregate + # that has an AZ. So if the new AZ without this aggregate is different + # then, that would mean the instances on this host need to change AZ. + # That is not supported. + if current_az != new_az: + msg = _( + "The host cannot be removed from the aggregate as the " + "availability zone of the host would change from '%s' to '%s' " + "but the host already has %d instance(s). Changing the AZ of " + "an existing instance is not supported by this action. Move " + "the instances away from this host then try again. If you " + "need to move the instances between AZs then you can use " + "shelve_offload and unshelve to achieve this." + ) % (current_az, new_az, len(instances)) + self._raise_invalid_aggregate_exc( + AGGREGATE_ACTION_DELETE, aggregate.id, msg) + @wrap_exception() def remove_host_from_aggregate(self, context, aggregate_id, host_name): """Removes host from the aggregate.""" @@ -6763,6 +6858,9 @@ class AggregateAPI: action=fields_obj.NotificationAction.REMOVE_HOST, phase=fields_obj.NotificationPhase.START) + self.ensure_no_instances_need_to_move_az_when_host_removed( + context, aggregate, host_name) + # Remove the resource provider from the provider aggregate first before # we change anything on the nova side because if we did the nova stuff # first we can't re-attempt this from the compute API if cleaning up diff --git a/nova/tests/functional/test_aggregates.py b/nova/tests/functional/test_aggregates.py index 6bbbf3cfdf46..a3f14000dcd1 100644 --- a/nova/tests/functional/test_aggregates.py +++ b/nova/tests/functional/test_aggregates.py @@ -389,13 +389,23 @@ class AggregateHostMoveTestCase(AggregateRequestFiltersTest): # server will be in default AZ self._boot_server(host=self.host) - # FIXME(stephenfin): This is bug #1907775, where we should reject the - # request to add a host to an aggregate when and instance would need - # to move between AZs - # The server would need to move from default AZ to custom AZ, that # is not OK - self._add_host_to_aggregate('ag3-custom-az', self.host) + ex = self.assertRaises( + client.OpenStackApiException, + self._add_host_to_aggregate, 'ag3-custom-az', self.host + ) + self.assertEqual(409, ex.response.status_code) + self.assertIn( + "Reason: The host cannot be added to the aggregate as the " + "availability zone of the host would change from 'None' to " + "'custom-az' but the host already has 1 instance(s). " + "Changing the AZ of an existing instance is not supported by this " + "action. Move the instances away from this host then try again. " + "If you need to move the instances between AZs then you can use " + "shelve_offload and unshelve to achieve this.", + str(ex) + ) def test_add_host_with_instances_custom_az_then_default(self): self._add_host_to_aggregate('ag3-custom-az', self.host) @@ -432,12 +442,22 @@ class AggregateHostMoveTestCase(AggregateRequestFiltersTest): # server will be in custom AZ self._boot_server(host=self.host) - # FIXME(stephenfin): This is bug #1907775, where we should reject the - # request to remove a host from the aggregate when there are instances - # on said host - # The server would need to move to the default AZ, that is not OK. - self._remove_host_from_aggregate('ag3-custom-az', self.host) + ex = self.assertRaises( + client.OpenStackApiException, + self._remove_host_from_aggregate, 'ag3-custom-az', self.host + ) + self.assertEqual(409, ex.response.status_code) + self.assertIn( + "Reason: The host cannot be removed from the aggregate as the " + "availability zone of the host would change from 'custom-az' to " + "'None' but the host already has 1 instance(s). Changing the AZ " + "of an existing instance is not supported by this action. Move " + "the instances away from this host then try again. If you " + "need to move the instances between AZs then you can use " + "shelve_offload and unshelve to achieve this.", + str(ex) + ) def test_moving_host_around_az_without_instances(self): # host moving from default to custom AZ diff --git a/nova/tests/unit/compute/test_compute.py b/nova/tests/unit/compute/test_compute.py index 14e3fb8d7725..cecefc2c9c3d 100644 --- a/nova/tests/unit/compute/test_compute.py +++ b/nova/tests/unit/compute/test_compute.py @@ -13246,6 +13246,8 @@ class ComputeAPIAggrCallsSchedulerTestCase(test.NoDBTestCase): mock_get_all_by_host, ): self.api.is_safe_to_update_az = mock.Mock() + self.api.ensure_no_instances_need_to_move_az_when_host_added = ( + mock.Mock()) self.api._update_az_cache_for_host = mock.Mock() agg = objects.Aggregate(name='fake', metadata={}, uuid=uuids.agg) agg.add_host = mock.Mock() @@ -13274,6 +13276,8 @@ class ComputeAPIAggrCallsSchedulerTestCase(test.NoDBTestCase): self, update_aggregates, mock_notify, mock_remove_host, mock_get_all_by_host, ): + self.api.ensure_no_instances_need_to_move_az_when_host_removed = ( + mock.Mock()) self.api._update_az_cache_for_host = mock.Mock() agg = objects.Aggregate(name='fake', metadata={}, uuid=uuids.agg) agg.delete_host = mock.Mock() diff --git a/releasenotes/notes/reject-az-changes-of-an-instance-during-aggregate-update-64d319d0717ed704.yaml b/releasenotes/notes/reject-az-changes-of-an-instance-during-aggregate-update-64d319d0717ed704.yaml new file mode 100644 index 000000000000..6639161ee8fd --- /dev/null +++ b/releasenotes/notes/reject-az-changes-of-an-instance-during-aggregate-update-64d319d0717ed704.yaml @@ -0,0 +1,12 @@ +--- +fixes: + - | + Nova now ensures that an instance cannot move between availability zones + when the host of the instance is added or removed to an aggregate that is + part of another availability zone. Moving from or to the default + availability zone is also rejected. + + This resolves `bug 1907775`_ where after such move the instance become + stuck in between availability zones. + + .. _bug 1907775: https://bugs.launchpad.net/nova/+bug/1907775