Make scheduler.utils.setup_instance_group query all cells

To check affinity and anti-affinity policies for scheduling instances,
we use the RequestSpec.instance_group.hosts field to check the hosts
that have group members on them. Access of the 'hosts' field calls
InstanceGroup.get_hosts during a lazy-load and get_hosts does a query
for all instances that are members of the group and returns their hosts
after removing duplicates. The InstanceList query isn't targeting any
cells, so it will return [] in a multi-cell environment in both the
instance create case and the instance move case. In the move case, we
do have a cell-targeted RequestContext when setup_instance_group is
called *but* the RequestSpec.instance_group object is queried early in
compute/api before we're targeted to a cell, so a call of
RequestSpec.instance_group.get_hosts() will result in [] still, even
for move operations.

This makes setup_instance_group query all cells for instances that are
members of the instance group if the RequestContext is untargeted, else
it queries the targeted cell for the instances.

Closes-Bug: #1746863

Change-Id: Ia5f5a0d75953b1154a8de3e1eaa15f8042e32d77
This commit is contained in:
melanie witt 2018-02-02 05:41:20 +00:00
parent ac6e51c10d
commit 14f4c502f9
5 changed files with 68 additions and 11 deletions

View File

@ -15,6 +15,7 @@
import copy
from oslo_db import exception as db_exc
from oslo_log import log as logging
from oslo_serialization import jsonutils
from oslo_utils import uuidutils
from oslo_utils import versionutils
@ -31,6 +32,7 @@ from nova.objects import fields
LAZY_LOAD_FIELDS = ['hosts']
LOG = logging.getLogger(__name__)
def _instance_group_get_query(context, id_field=None, id=None):
@ -341,6 +343,12 @@ class InstanceGroup(base.NovaPersistentObject, base.NovaObject,
raise exception.ObjectActionError(
action='obj_load_attr', reason='unable to load %s' % attrname)
LOG.debug("Lazy-loading '%(attr)s' on %(name)s uuid %(uuid)s",
{'attr': attrname,
'name': self.obj_name(),
'uuid': self.uuid,
})
self.hosts = self.get_hosts()
self.obj_reset_changes(['hosts'])

View File

@ -28,6 +28,7 @@ from nova.api.openstack.placement import lib as placement_lib
from nova.compute import flavors
from nova.compute import utils as compute_utils
import nova.conf
from nova import context as nova_context
from nova import exception
from nova.i18n import _
from nova import objects
@ -813,12 +814,44 @@ def _get_group_details(context, instance_uuid, user_group_hosts=None):
msg = _("ServerGroupSoftAntiAffinityWeigher not configured")
LOG.error(msg)
raise exception.UnsupportedPolicyException(reason=msg)
# NOTE(melwitt): If the context is already targeted to a cell (during a
# move operation), we don't need to scatter-gather.
if context.db_connection:
# We don't need to target the group object's context because it was
# retrieved with the targeted context earlier in this method.
group_hosts = set(group.get_hosts())
else:
group_hosts = set(_get_instance_group_hosts_all_cells(context,
group))
user_hosts = set(user_group_hosts) if user_group_hosts else set()
return GroupDetails(hosts=user_hosts | group_hosts,
policy=group.policy, members=group.members)
def _get_instance_group_hosts_all_cells(context, instance_group):
def get_hosts_in_cell(cell_context):
# NOTE(melwitt): The obj_alternate_context is going to mutate the
# cell_instance_group._context and to do this in a scatter-gather
# with multiple parallel greenthreads, we need the instance groups
# to be separate object copies.
cell_instance_group = instance_group.obj_clone()
with cell_instance_group.obj_alternate_context(cell_context):
return cell_instance_group.get_hosts()
results = nova_context.scatter_gather_skip_cell0(context,
get_hosts_in_cell)
hosts = []
for result in results.values():
# TODO(melwitt): We will need to handle scenarios where an exception
# is raised while targeting a cell and when a cell does not respond
# as part of the "handling of a down cell" spec:
# https://blueprints.launchpad.net/nova/+spec/handling-down-cell
if result not in (nova_context.did_not_respond_sentinel,
nova_context.raised_exception_sentinel):
hosts.extend(result)
return hosts
def setup_instance_group(context, request_spec):
"""Add group_hosts and group_policies fields to filter_properties dict
based on instance uuids provided in request_spec, if those instances are
@ -826,11 +859,30 @@ def setup_instance_group(context, request_spec):
:param request_spec: Request spec
"""
# NOTE(melwitt): Proactively query for the instance group hosts instead of
# relying on a lazy-load via the 'hosts' field of the InstanceGroup object.
if (request_spec.instance_group and
'hosts' not in request_spec.instance_group):
group = request_spec.instance_group
# If the context is already targeted to a cell (during a move
# operation), we don't need to scatter-gather. We do need to use
# obj_alternate_context here because the RequestSpec is queried at the
# start of a move operation in compute/api, before the context has been
# targeted.
if context.db_connection:
with group.obj_alternate_context(context):
group.hosts = group.get_hosts()
else:
group.hosts = _get_instance_group_hosts_all_cells(context, group)
if request_spec.instance_group and request_spec.instance_group.hosts:
group_hosts = request_spec.instance_group.hosts
else:
group_hosts = None
instance_uuid = request_spec.instance_uuid
# This queries the group details for the group where the instance is a
# member. The group_hosts passed in are the hosts that contain members of
# the requested instance group.
group_info = _get_group_details(context, instance_uuid, group_hosts)
if group_info is not None:
request_spec.instance_group.hosts = list(group_info.hosts)

View File

@ -966,12 +966,7 @@ class ServerGroupTestMultiCell(ServerGroupTestBase):
# cell1. So boot the server to cell2 where the group member cannot be
# found as a result of the default setting.
self._boot_a_server_to_group(created_group, az='cell2')
# TODO(melwitt): Uncomment this when the bug is fixed.
# self._boot_a_server_to_group(created_group, az='cell1',
# expected_status='ERROR')
# TODO(melwitt): Remove this when the bug is fixed.
# Boot a server to cell1 with the affinity policy. This should fail
# because a group member has landed in cell2 already. But it passes
# because of the bug -- the query for group members doesn't find any
# members in cell2 because the query isn't multi-cell enabled.
self._boot_a_server_to_group(created_group, az='cell1')
# because group members found in cell2 should violate the policy.
self._boot_a_server_to_group(created_group, az='cell1',
expected_status='ERROR')

View File

@ -327,7 +327,8 @@ class _TestInstanceGroupObject(object):
mock_get_by_filt.return_value = [objects.Instance(host='host1'),
objects.Instance(host='host2')]
obj = objects.InstanceGroup(self.context, members=['uuid1'])
obj = objects.InstanceGroup(self.context, members=['uuid1'],
uuid=uuids.group)
self.assertEqual(2, len(obj.hosts))
self.assertIn('host1', obj.hosts)
self.assertIn('host2', obj.hosts)

View File

@ -21,6 +21,7 @@ import six
from nova.compute import flavors
from nova.compute import utils as compute_utils
from nova import context as nova_context
from nova import exception
from nova import objects
from nova.scheduler import utils as scheduler_utils
@ -34,7 +35,7 @@ class SchedulerUtilsTestCase(test.NoDBTestCase):
"""Test case for scheduler utils methods."""
def setUp(self):
super(SchedulerUtilsTestCase, self).setUp()
self.context = 'fake-context'
self.context = nova_context.get_context()
def test_build_request_spec_without_image(self):
instance = {'uuid': uuids.instance}