Reject AZ changes during aggregate add / remove host

After this patch nova rejects the add host to aggregate API action
if the host has instances and the new aggregate for the host would
mean that these instances need to move from one AZ (even from the
default one) to another. Such AZ change is not implemented in nova
and currently leads to stuck instances.

Similarly nova will reject remove host from aggregate API action if the
host has instances and the aggregate removal would mean that the
instances need to change AZ.

Depends-On: https://review.opendev.org/c/openstack/tempest/+/821732

Change-Id: I19c4c6d34aa2cc1f32d81e8c1a52762fa3a18580
Closes-Bug: #1907775
This commit is contained in:
Balazs Gibizer 2021-12-10 17:53:30 +01:00 committed by Balazs Gibizer
parent ca5be99837
commit 3c0eadae0b
8 changed files with 168 additions and 13 deletions

View File

@ -214,6 +214,9 @@ Adds a host to an aggregate.
Specify the ``add_host`` action and host name in the request body.
It is not allowed to move a host with servers between Availability Zones. Such
request is rejected with 409 error.
Normal response codes: 200
Error response codes: badRequest(400), unauthorized(401), forbidden(403),
@ -264,6 +267,9 @@ Removes a host from an aggregate.
Specify the ``remove_host`` action and host name in the request body.
It is not allowed to move a host with servers between Availability Zones. Such
request is rejected with 409 error.
Normal response codes: 200
Error response codes: badRequest(400), unauthorized(401), forbidden(403),

View File

@ -32,6 +32,14 @@ availability zone where instances will be scheduled when the user fails to
specify one. For more information on how to do this, refer to
:doc:`/admin/availability-zones`.
.. note::
It is not allowed to move instances between Availability Zones.
If adding a host to an aggregate or removing a host from an aggregate would
cause an instance to move between Availability Zones, including moving
from or moving to the default AZ, then the operation will be rejected. The
administrator should drain the instances from the host first then the host
can be moved.
.. _config-sch-for-aggs:

View File

@ -55,6 +55,15 @@ when comparing availability zones and host aggregates:
users to specify hosts where instances are launched in server creation.
See `Using availability zones to select hosts`_ for more information.
.. note::
It is not allowed to move instances between Availability Zones.
If adding a host to an aggregate or removing a host from an aggregate
would cause an instance to move between Availability Zones, including
moving from or moving to the default AZ, then the operation will be
rejected. The administrator should drain the instances from the host
first then the host can be moved.
In addition, other services, such as the :neutron-doc:`networking service <>`
and the :cinder-doc:`block storage service <>`, also provide an availability
zone feature. However, the implementation of these features differs vastly

View File

@ -219,9 +219,7 @@ class AggregateController(wsgi.Controller):
exception.ResourceProviderUpdateConflict) as e:
LOG.error('Failed to remove host %s from aggregate %s. Error: %s',
host, id, str(e))
msg = _('Cannot remove host %(host)s in aggregate %(id)s') % {
'host': host, 'id': id}
raise exc.HTTPConflict(explanation=msg)
raise exc.HTTPConflict(explanation=e.format_message())
return self._marshall_aggregate(req, aggregate)
@wsgi.expected_errors((400, 404))

View File

@ -6683,6 +6683,51 @@ class AggregateAPI:
availability_zones.update_host_availability_zone_cache(context,
host_name)
def ensure_no_instances_need_to_move_az_when_host_added(
self, context, aggregate, host_name
):
instances = objects.InstanceList.get_by_host(context, host_name)
if not instances:
# if no instance then nothing moves
return
new_az = aggregate.metadata.get('availability_zone')
if not new_az:
# if we add a host to an aggregate without AZ that cannot change
# existing, effective AZ of the host. The host was either not
# in any AZ and will not be in an AZ. Or the host was already in
# an AZ but this aggregate does not challenge that as it has no AZ.
return
# let's gather what is the AZ of the instances on the host before the
# host is added to the aggregate
aggregates = objects.AggregateList.get_by_host(context, host_name)
az = {
agg.metadata['availability_zone']
for agg in aggregates
if 'availability_zone' in agg.metadata}
# There can only be one or zero AZ names. Two different AZ names case
# is already rejected by is_safe_to_update_az()
old_az = list(az)[0] if az else None
# So here we know that the host is being added to a new AZ if it is
# different from the existing, effective AZ of the host then the
# instances on this host would need to move between AZs, that is not
# supported. So reject it.
if old_az != new_az:
msg = _(
"The host cannot be added to the aggregate as the "
"availability zone of the host would change from '%s' to '%s' "
"but the host already has %d instance(s). Changing the AZ of "
"an existing instance is not supported by this action. Move "
"the instances away from this host then try again. If you "
"need to move the instances between AZs then you can use "
"shelve_offload and unshelve to achieve this."
) % (old_az, new_az, len(instances))
self._raise_invalid_aggregate_exc(
AGGREGATE_ACTION_ADD, aggregate.id, msg)
@wrap_exception()
def add_host_to_aggregate(self, context, aggregate_id, host_name):
"""Adds the host to an aggregate."""
@ -6711,6 +6756,8 @@ class AggregateAPI:
self.is_safe_to_update_az(context, aggregate.metadata,
hosts=[host_name], aggregate=aggregate)
self.ensure_no_instances_need_to_move_az_when_host_added(
context, aggregate, host_name)
aggregate.add_host(host_name)
self.query_client.update_aggregates(context, [aggregate])
@ -6746,6 +6793,54 @@ class AggregateAPI:
return aggregate
def ensure_no_instances_need_to_move_az_when_host_removed(
self, context, aggregate, host_name
):
instances = objects.InstanceList.get_by_host(context, host_name)
if not instances:
# if no instance then nothing moves
return
current_az = aggregate.metadata.get('availability_zone')
if not current_az:
# if we remove a host from an aggregate without AZ that cannot
# change existing, effective AZ of the host. If the host has an AZ
# before the removal then that is due to a different aggregate
# membership so that does not change here. If the host has no AZ
# before the removal then it won't have either after the removal
# from an aggregate without az
return
# let's gather what would be the AZ of the instances on the host
# if we exclude the current aggregate.
aggregates = objects.AggregateList.get_by_host(context, host_name)
azs = {
agg.metadata['availability_zone']
for agg in aggregates
if agg.id != aggregate.id and 'availability_zone' in agg.metadata
}
# There can only be one or zero AZ names. Two different AZ names case
# is already rejected by is_safe_to_update_az()
new_az = list(azs)[0] if azs else None
# So here we know that the host is being removed from an aggregate
# that has an AZ. So if the new AZ without this aggregate is different
# then, that would mean the instances on this host need to change AZ.
# That is not supported.
if current_az != new_az:
msg = _(
"The host cannot be removed from the aggregate as the "
"availability zone of the host would change from '%s' to '%s' "
"but the host already has %d instance(s). Changing the AZ of "
"an existing instance is not supported by this action. Move "
"the instances away from this host then try again. If you "
"need to move the instances between AZs then you can use "
"shelve_offload and unshelve to achieve this."
) % (current_az, new_az, len(instances))
self._raise_invalid_aggregate_exc(
AGGREGATE_ACTION_DELETE, aggregate.id, msg)
@wrap_exception()
def remove_host_from_aggregate(self, context, aggregate_id, host_name):
"""Removes host from the aggregate."""
@ -6763,6 +6858,9 @@ class AggregateAPI:
action=fields_obj.NotificationAction.REMOVE_HOST,
phase=fields_obj.NotificationPhase.START)
self.ensure_no_instances_need_to_move_az_when_host_removed(
context, aggregate, host_name)
# Remove the resource provider from the provider aggregate first before
# we change anything on the nova side because if we did the nova stuff
# first we can't re-attempt this from the compute API if cleaning up

View File

@ -389,13 +389,23 @@ class AggregateHostMoveTestCase(AggregateRequestFiltersTest):
# server will be in default AZ
self._boot_server(host=self.host)
# FIXME(stephenfin): This is bug #1907775, where we should reject the
# request to add a host to an aggregate when and instance would need
# to move between AZs
# The server would need to move from default AZ to custom AZ, that
# is not OK
self._add_host_to_aggregate('ag3-custom-az', self.host)
ex = self.assertRaises(
client.OpenStackApiException,
self._add_host_to_aggregate, 'ag3-custom-az', self.host
)
self.assertEqual(409, ex.response.status_code)
self.assertIn(
"Reason: The host cannot be added to the aggregate as the "
"availability zone of the host would change from 'None' to "
"'custom-az' but the host already has 1 instance(s). "
"Changing the AZ of an existing instance is not supported by this "
"action. Move the instances away from this host then try again. "
"If you need to move the instances between AZs then you can use "
"shelve_offload and unshelve to achieve this.",
str(ex)
)
def test_add_host_with_instances_custom_az_then_default(self):
self._add_host_to_aggregate('ag3-custom-az', self.host)
@ -432,12 +442,22 @@ class AggregateHostMoveTestCase(AggregateRequestFiltersTest):
# server will be in custom AZ
self._boot_server(host=self.host)
# FIXME(stephenfin): This is bug #1907775, where we should reject the
# request to remove a host from the aggregate when there are instances
# on said host
# The server would need to move to the default AZ, that is not OK.
self._remove_host_from_aggregate('ag3-custom-az', self.host)
ex = self.assertRaises(
client.OpenStackApiException,
self._remove_host_from_aggregate, 'ag3-custom-az', self.host
)
self.assertEqual(409, ex.response.status_code)
self.assertIn(
"Reason: The host cannot be removed from the aggregate as the "
"availability zone of the host would change from 'custom-az' to "
"'None' but the host already has 1 instance(s). Changing the AZ "
"of an existing instance is not supported by this action. Move "
"the instances away from this host then try again. If you "
"need to move the instances between AZs then you can use "
"shelve_offload and unshelve to achieve this.",
str(ex)
)
def test_moving_host_around_az_without_instances(self):
# host moving from default to custom AZ

View File

@ -13246,6 +13246,8 @@ class ComputeAPIAggrCallsSchedulerTestCase(test.NoDBTestCase):
mock_get_all_by_host,
):
self.api.is_safe_to_update_az = mock.Mock()
self.api.ensure_no_instances_need_to_move_az_when_host_added = (
mock.Mock())
self.api._update_az_cache_for_host = mock.Mock()
agg = objects.Aggregate(name='fake', metadata={}, uuid=uuids.agg)
agg.add_host = mock.Mock()
@ -13274,6 +13276,8 @@ class ComputeAPIAggrCallsSchedulerTestCase(test.NoDBTestCase):
self, update_aggregates, mock_notify, mock_remove_host,
mock_get_all_by_host,
):
self.api.ensure_no_instances_need_to_move_az_when_host_removed = (
mock.Mock())
self.api._update_az_cache_for_host = mock.Mock()
agg = objects.Aggregate(name='fake', metadata={}, uuid=uuids.agg)
agg.delete_host = mock.Mock()

View File

@ -0,0 +1,12 @@
---
fixes:
- |
Nova now ensures that an instance cannot move between availability zones
when the host of the instance is added or removed to an aggregate that is
part of another availability zone. Moving from or to the default
availability zone is also rejected.
This resolves `bug 1907775`_ where after such move the instance become
stuck in between availability zones.
.. _bug 1907775: https://bugs.launchpad.net/nova/+bug/1907775