Merge "Add late server group policy check to rebuild"

Zuul 2018-02-07 03:46:34 +00:00 committed by Gerrit Code Review
commit b37aa54153
12 changed files with 165 additions and 56 deletions

View File

@@ -326,7 +326,7 @@ following:
.. note:: This has been resolved in the Queens release [#]_.
#. Instance affinity reporting from the compute nodes to scheduler
#. The late anti-affinity check
#. The late anti-affinity check during server create and evacuate
#. Querying host aggregates from the cell
#. Attaching a volume and ``[cinder]/cross_az_attach=False``

View File

@@ -485,7 +485,7 @@ class ComputeVirtAPI(virtapi.VirtAPI):
class ComputeManager(manager.Manager):
"""Manages the running instances from creation to destruction."""
target = messaging.Target(version='4.21')
target = messaging.Target(version='4.22')
# How long to wait in seconds before re-issuing a shutdown
# signal to an instance during power off. The overall
@@ -2804,7 +2804,7 @@ class ComputeManager(manager.Manager):
injected_files, new_pass, orig_sys_metadata,
bdms, recreate, on_shared_storage=None,
preserve_ephemeral=False, migration=None,
scheduled_node=None, limits=None):
scheduled_node=None, limits=None, request_spec=None):
"""Destroy and re-make this instance.
A 'rebuild' effectively purges all existing data from the system and
@@ -2834,6 +2834,8 @@ class ComputeManager(manager.Manager):
None
:param limits: Overcommit limits set by the scheduler. If a host was
specified by the user, this will be None
:param request_spec: a RequestSpec object used to schedule the instance
"""
context = context.elevated()
@@ -2886,11 +2888,22 @@ class ComputeManager(manager.Manager):
claim_ctxt, context, instance, orig_image_ref,
image_ref, injected_files, new_pass, orig_sys_metadata,
bdms, recreate, on_shared_storage, preserve_ephemeral,
migration)
except exception.ComputeResourcesUnavailable as e:
LOG.debug("Could not rebuild instance on this host, not "
"enough resources available.", instance=instance)
migration, request_spec)
except (exception.ComputeResourcesUnavailable,
exception.RescheduledException) as e:
if isinstance(e, exception.ComputeResourcesUnavailable):
LOG.debug("Could not rebuild instance on this host, not "
"enough resources available.", instance=instance)
else:
# RescheduledException is raised by the late server group
# policy check during evacuation if parallel scheduling
# violated the policy.
# We catch the RescheduledException here, but we don't have
# the plumbing to do an actual reschedule, so we abort the
# operation.
LOG.debug("Could not rebuild instance on this host, "
"late server group check failed.",
instance=instance)
# NOTE(ndipanov): We just abort the build for now and leave a
# migration record for potential cleanup later
self._set_migration_status(migration, 'failed')
@@ -2949,10 +2962,18 @@ class ComputeManager(manager.Manager):
image_ref, injected_files, new_pass,
orig_sys_metadata, bdms, recreate,
on_shared_storage, preserve_ephemeral,
migration):
migration, request_spec):
orig_vm_state = instance.vm_state
if recreate:
if request_spec:
# NOTE(gibi): Do a late check of the server group policy as
# parallel scheduling could violate that policy. This will
# cause the evacuation to fail as rebuild does not implement
# reschedule.
hints = self._get_scheduler_hints({}, request_spec)
self._validate_instance_group_policy(context, instance, hints)
if not self.driver.capabilities["supports_recreate"]:
raise exception.InstanceRecreateNotSupported
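To make the intent of the late check above concrete, here is a minimal, self-contained sketch of the kind of validation _validate_instance_group_policy performs; the check_group_policy helper and its inputs are illustrative assumptions, not Nova's actual implementation or signatures.

# Illustrative sketch only, not Nova's implementation: decide whether placing
# the instance on target_host keeps the group policy, given the hosts that
# currently run the other group members.
def check_group_policy(policy, member_hosts, target_host):
    if policy == 'anti-affinity':
        # No other member of the group may already run on the target host.
        return target_host not in member_hosts
    if policy == 'affinity':
        # Every member must share the same host.
        return all(host == target_host for host in member_hosts)
    return True  # no policy to enforce


# A parallel evacuation already moved another member to 'host3', so a second
# anti-affinity member must not land there as well; the late check catches it.
assert check_group_policy('anti-affinity', {'host3'}, 'host3') is False
assert check_group_policy('anti-affinity', {'host3'}, 'host2') is True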

View File

@@ -340,6 +340,7 @@ class ComputeAPI(object):
* 4.20 - Add multiattach argument to reserve_block_device_name().
* 4.21 - prep_resize() now gets a 'host_list' parameter representing
potential alternate hosts for retries within a cell.
* 4.22 - Add request_spec to rebuild_instance()
'''
VERSION_ALIASES = {
@@ -811,7 +812,7 @@ class ComputeAPI(object):
image_ref, orig_image_ref, orig_sys_metadata, bdms,
recreate=False, on_shared_storage=False, host=None, node=None,
preserve_ephemeral=False, migration=None, limits=None,
kwargs=None):
request_spec=None, kwargs=None):
# NOTE(edleafe): compute nodes can only use the dict form of limits.
if isinstance(limits, objects.SchedulerLimits):
limits = limits.to_dict()
@@ -820,9 +821,13 @@ class ComputeAPI(object):
extra = {'preserve_ephemeral': preserve_ephemeral,
'migration': migration,
'scheduled_node': node,
'limits': limits}
version = '4.5'
'limits': limits,
'request_spec': request_spec}
version = '4.22'
client = self.router.client(ctxt)
if not client.can_send_version(version):
version = '4.5'
del extra['request_spec']
if not client.can_send_version(version):
version = '4.0'
extra.pop('migration')
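The hunk above uses the usual can_send_version() fallback: try the newest RPC version first, then drop the arguments an older compute cannot accept. A small stand-alone sketch of that pattern follows; FakeClient is an illustrative stand-in, not oslo.messaging's real client class.

# Stand-alone sketch of the version-negotiation pattern above. FakeClient is
# a stand-in for illustration, not oslo.messaging's real client class.
class FakeClient(object):
    def __init__(self, max_version):
        self.max_version = max_version

    def can_send_version(self, version):
        # Compare dotted versions numerically so that '4.22' > '4.5'.
        return (tuple(int(p) for p in version.split('.')) <=
                tuple(int(p) for p in self.max_version.split('.')))


def negotiate_rebuild_call(client, request_spec, migration):
    extra = {'migration': migration, 'request_spec': request_spec}
    version = '4.22'
    if not client.can_send_version(version):
        # Older computes do not know about request_spec; drop it.
        version = '4.5'
        del extra['request_spec']
    if not client.can_send_version(version):
        # Even older computes do not accept the migration argument either.
        version = '4.0'
        extra.pop('migration')
    return version, extra


# A cell pinned to 4.5 gets the cast without the new request_spec argument.
assert negotiate_rebuild_call(FakeClient('4.5'), object(), None) == \
    ('4.5', {'migration': None})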

View File

@@ -988,7 +988,8 @@ class ComputeTaskManager(base.Base):
on_shared_storage=on_shared_storage,
preserve_ephemeral=preserve_ephemeral,
migration=migration,
host=host, node=node, limits=limits)
host=host, node=node, limits=limits,
request_spec=request_spec)
# TODO(avolkov): move method to bdm
@staticmethod

View File

@@ -31,7 +31,7 @@ LOG = logging.getLogger(__name__)
# NOTE(danms): This is the global service version counter
SERVICE_VERSION = 28
SERVICE_VERSION = 29
# NOTE(danms): This is our SERVICE_VERSION history. The idea is that any
@@ -127,6 +127,8 @@ SERVICE_VERSION_HISTORY = (
{'compute_rpc': '4.20'},
# Version 28: Adds a 'host_list' parameter to prep_resize()
{'compute_rpc': '4.21'},
# Version 29: Compute RPC version 4.22
{'compute_rpc': '4.22'},
)
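For context, the history tuple bumped above is what lets the rest of Nova translate a reported service version into the compute RPC version that service understands. A simplified, self-contained sketch of that lookup follows; it is not Nova's actual helper, and the BASE_VERSION offset exists only because the sketch trims the history to its last three entries.

# Simplified sketch, not Nova's actual code: look up the compute RPC version
# that a given reported service version supports. The history here is trimmed
# to the last three entries, hence the BASE_VERSION offset.
SKETCH_SERVICE_VERSION_HISTORY = (
    {'compute_rpc': '4.20'},  # service version 27
    {'compute_rpc': '4.21'},  # service version 28: host_list in prep_resize()
    {'compute_rpc': '4.22'},  # service version 29: request_spec in rebuild_instance()
)
BASE_VERSION = 27


def compute_rpc_for(service_version):
    return SKETCH_SERVICE_VERSION_HISTORY[service_version - BASE_VERSION]['compute_rpc']


# A compute still reporting service version 28 only speaks RPC 4.21, so the
# new request_spec argument must not be sent to it.
assert compute_rpc_for(28) == '4.21'
assert compute_rpc_for(29) == '4.22'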

View File

@@ -316,8 +316,8 @@ class InstanceHelperMixin(object):
self.fail('Timed out waiting for %s failure event. Current '
'instance actions: %s' % (event_name, actions))
def _wait_for_migration_status(self, server, expected_status):
"""Waits for a migration record with the given status to be found
def _wait_for_migration_status(self, server, expected_statuses):
"""Waits for a migration record with the given statuses to be found
for the given server, else the test fails. The migration record, if
found, is returned.
"""
@@ -325,13 +325,13 @@ class InstanceHelperMixin(object):
if api is None:
api = self.api
statuses = [status.lower() for status in expected_statuses]
for attempt in range(10):
migrations = api.api_get('/os-migrations').body['migrations']
for migration in migrations:
if (migration['instance_uuid'] == server['id'] and
migration['status'].lower() ==
expected_status.lower()):
migration['status'].lower() in statuses):
return migration
time.sleep(0.5)
self.fail('Timed out waiting for migration with status "%s" for '
'instance: %s' % (expected_status, server['id']))
'instance: %s' % (expected_statuses, server['id']))
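The helper now accepts a set of acceptable statuses rather than a single one, which the parallel-evacuation test below relies on because racing evacuations can legitimately finish as either 'done' or 'failed'. Stripped of the test-framework plumbing, the generalised polling pattern can be sketched on its own; the names below are illustrative, not the test code itself.

import time


# Illustrative sketch of the generalised polling loop: return the first
# migration record for the instance whose status matches any accepted status,
# or give up after a bounded number of attempts.
def wait_for_migration_status(list_migrations, instance_uuid,
                              expected_statuses, attempts=10, delay=0.5):
    statuses = {status.lower() for status in expected_statuses}
    for _ in range(attempts):
        for migration in list_migrations():
            if (migration['instance_uuid'] == instance_uuid and
                    migration['status'].lower() in statuses):
                return migration
        time.sleep(delay)
    raise AssertionError('Timed out waiting for migration with status %s for '
                         'instance: %s' % (sorted(statuses), instance_uuid))


# Example: a racing evacuation may end up either 'done' or 'failed'.
records = [{'instance_uuid': 'uuid-1', 'status': 'Failed'}]
assert wait_for_migration_status(lambda: records, 'uuid-1',
                                 ['done', 'failed'])['status'] == 'Failed'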

View File

@@ -148,8 +148,8 @@ class TestParallelEvacuationWithServerGroup(
# that. The only thing that happens after the instance.host is set to
# the target host is the migration status being set to 'done'. So we
# have to wait for that to avoid asserting the wrong host below.
self._wait_for_migration_status(server1, 'done')
self._wait_for_migration_status(server2, 'done')
self._wait_for_migration_status(server1, ['done', 'failed'])
self._wait_for_migration_status(server2, ['done', 'failed'])
# get the servers again to have the latest information about their
# hosts
@@ -158,28 +158,19 @@ class TestParallelEvacuationWithServerGroup(
# assert that the anti-affinity policy is enforced during the
# evacuation
# NOTE(gibi): This shows bug 1735407 as both instances end up on the
# same host.
self.assertEqual(server1['OS-EXT-SRV-ATTR:host'],
server2['OS-EXT-SRV-ATTR:host'])
# After the bug 1735407 is fixed the following is expected:
# self.assertNotEqual(server1['OS-EXT-SRV-ATTR:host'],
# server2['OS-EXT-SRV-ATTR:host'])
self.assertNotEqual(server1['OS-EXT-SRV-ATTR:host'],
server2['OS-EXT-SRV-ATTR:host'])
# assert that one of the evacuations was successful and that its server
# is moved to another host, while the evacuation of the other server
# failed
# NOTE(gibi): This shows bug 1735407 as both instances are moved
self.assertNotIn(server1['OS-EXT-SRV-ATTR:host'], {'host1', 'host2'})
self.assertNotIn(server2['OS-EXT-SRV-ATTR:host'], {'host1', 'host2'})
# After fixing the bug 1735407 the following is expected
# if server1['status'] == 'ERROR':
# failed_server = server1
# evacuated_server = server2
# else:
# failed_server = server2
# evacuated_server = server1
# self.assertEqual('ERROR', failed_server['status'])
# self.assertNotEqual('host3', failed_server['OS-EXT-SRV-ATTR:host'])
# self.assertEqual('ACTIVE', evacuated_server['status'])
# self.assertEqual('host3', evacuated_server['OS-EXT-SRV-ATTR:host'])
if server1['status'] == 'ERROR':
failed_server = server1
evacuated_server = server2
else:
failed_server = server2
evacuated_server = server1
self.assertEqual('ERROR', failed_server['status'])
self.assertNotEqual('host3', failed_server['OS-EXT-SRV-ATTR:host'])
self.assertEqual('ACTIVE', evacuated_server['status'])
self.assertEqual('host3', evacuated_server['OS-EXT-SRV-ATTR:host'])

View File

@@ -427,7 +427,7 @@ class ServerGroupTestV21(ServerGroupTestBase):
post = {'evacuate': {'onSharedStorage': False}}
self.admin_api.post_server_action(servers[1]['id'], post)
self._wait_for_migration_status(servers[1], 'done')
self._wait_for_migration_status(servers[1], ['done'])
evacuated_server = self._wait_for_state_change(
self.admin_api, servers[1], 'ACTIVE')
@@ -453,7 +453,7 @@ class ServerGroupTestV21(ServerGroupTestBase):
post = {'evacuate': {'onSharedStorage': False}}
self.admin_api.post_server_action(servers[1]['id'], post)
self._wait_for_migration_status(servers[1], 'error')
self._wait_for_migration_status(servers[1], ['error'])
server_after_failed_evac = self._wait_for_state_change(
self.admin_api, servers[1], 'ERROR')
@@ -477,7 +477,7 @@ class ServerGroupTestV21(ServerGroupTestBase):
post = {'evacuate': {'onSharedStorage': False}}
self.admin_api.post_server_action(servers[1]['id'], post)
self._wait_for_migration_status(servers[1], 'error')
self._wait_for_migration_status(servers[1], ['error'])
server_after_failed_evac = self._wait_for_state_change(
self.admin_api, servers[1], 'ERROR')
@@ -624,7 +624,7 @@ class ServerGroupTestV215(ServerGroupTestV21):
post = {'evacuate': {}}
self.admin_api.post_server_action(servers[1]['id'], post)
self._wait_for_migration_status(servers[1], 'done')
self._wait_for_migration_status(servers[1], ['done'])
evacuated_server = self._wait_for_state_change(
self.admin_api, servers[1], 'ACTIVE')
@@ -651,7 +651,7 @@ class ServerGroupTestV215(ServerGroupTestV21):
post = {'evacuate': {}}
self.admin_api.post_server_action(servers[1]['id'], post)
self._wait_for_migration_status(servers[1], 'error')
self._wait_for_migration_status(servers[1], ['error'])
server_after_failed_evac = self._wait_for_state_change(
self.admin_api, servers[1], 'ERROR')
@@ -675,7 +675,7 @@ class ServerGroupTestV215(ServerGroupTestV21):
post = {'evacuate': {}}
self.admin_api.post_server_action(servers[1]['id'], post)
self._wait_for_migration_status(servers[1], 'error')
self._wait_for_migration_status(servers[1], ['error'])
server_after_failed_evac = self._wait_for_state_change(
self.admin_api, servers[1], 'ERROR')
@@ -817,7 +817,7 @@ class ServerGroupTestV215(ServerGroupTestV21):
post = {'evacuate': {}}
self.admin_api.post_server_action(servers[1]['id'], post)
self._wait_for_migration_status(servers[1], 'done')
self._wait_for_migration_status(servers[1], ['done'])
evacuated_server = self._wait_for_state_change(
self.admin_api, servers[1], 'ACTIVE')

View File

@@ -2588,7 +2588,7 @@ class ServerMovingTests(ProviderUsageBaseTestCase):
# clears the task_state. The migration record status is set to
# 'error', so that's what we need to look for to know when this
# is done.
migration = self._wait_for_migration_status(server, 'error')
migration = self._wait_for_migration_status(server, ['error'])
# The source_compute should be set on the migration record, but the
# destination shouldn't be as we never made it to one.
@@ -2648,7 +2648,7 @@ class ServerMovingTests(ProviderUsageBaseTestCase):
self.api.post_server_action(server['id'], post)
# The compute manager will put the migration record into error status
# when pre_live_migration fails, so wait for that to happen.
migration = self._wait_for_migration_status(server, 'error')
migration = self._wait_for_migration_status(server, ['error'])
# The _rollback_live_migration method in the compute manager will reset
# the task_state on the instance, so wait for that to happen.
server = self._wait_for_server_parameter(
@@ -2916,7 +2916,7 @@ class ServerLiveMigrateForceAndAbort(ProviderUsageBaseTestCase):
}
self.api.post_server_action(server['id'], post)
migration = self._wait_for_migration_status(server, 'running')
migration = self._wait_for_migration_status(server, ['running'])
self.api.force_complete_migration(server['id'],
migration['id'])
@@ -2959,7 +2959,7 @@ class ServerLiveMigrateForceAndAbort(ProviderUsageBaseTestCase):
}
self.api.post_server_action(server['id'], post)
migration = self._wait_for_migration_status(server, 'running')
migration = self._wait_for_migration_status(server, ['running'])
self.api.delete_migration(server['id'], migration['id'])
self._wait_for_server_parameter(self.api, server,

View File

@@ -3935,6 +3935,87 @@ class ComputeManagerUnitTestCase(test.NoDBTestCase):
delete_alloc.assert_called_once_with(instance, 'foo',
node_type='destination')
@mock.patch('nova.context.RequestContext.elevated')
@mock.patch('nova.objects.instance.Instance.drop_migration_context')
@mock.patch('nova.objects.instance.Instance.apply_migration_context')
@mock.patch('nova.objects.instance.Instance.mutated_migration_context')
@mock.patch('nova.objects.BlockDeviceMappingList.get_by_instance_uuid')
@mock.patch('nova.network.neutronv2.api.API.'
'setup_instance_network_on_host')
@mock.patch('nova.network.neutronv2.api.API.setup_networks_on_host')
@mock.patch('nova.objects.instance.Instance.save')
@mock.patch('nova.compute.utils.notify_about_instance_action')
@mock.patch('nova.compute.utils.notify_about_instance_usage')
@mock.patch('nova.compute.utils.notify_usage_exists')
@mock.patch('nova.objects.instance.Instance.image_meta',
new_callable=mock.PropertyMock)
@mock.patch('nova.compute.manager.ComputeManager.'
'_validate_instance_group_policy')
@mock.patch('nova.compute.manager.ComputeManager._set_migration_status')
@mock.patch('nova.compute.resource_tracker.ResourceTracker.rebuild_claim')
def test_evacuate_late_server_group_policy_check(
self, mock_rebuild_claim, mock_set_migration_status,
mock_validate_policy, mock_image_meta, mock_notify_exists,
mock_notify_legacy, mock_notify, mock_instance_save,
mock_setup_networks, mock_setup_instance_network, mock_get_bdms,
mock_mutate_migration, mock_apply_migration, mock_drop_migration,
mock_context_elevated):
instance = fake_instance.fake_instance_obj(self.context)
instance.info_cache = None
elevated_context = mock.Mock()
mock_context_elevated.return_value = elevated_context
request_spec = objects.RequestSpec()
request_spec.scheduler_hints = {'group': [uuids.group]}
self.compute.rebuild_instance(
self.context, instance, None, None, None, None, None,
None, recreate=True, scheduled_node='fake-node',
request_spec=request_spec)
mock_validate_policy.assert_called_once_with(
elevated_context, instance, {'group': [uuids.group]})
@mock.patch('nova.compute.utils.add_instance_fault_from_exc')
@mock.patch('nova.compute.resource_tracker.ResourceTracker.'
'delete_allocation_for_evacuated_instance')
@mock.patch('nova.context.RequestContext.elevated')
@mock.patch('nova.objects.instance.Instance.save')
@mock.patch('nova.compute.utils.notify_about_instance_action')
@mock.patch('nova.compute.utils.notify_about_instance_usage')
@mock.patch('nova.compute.manager.ComputeManager.'
'_validate_instance_group_policy')
@mock.patch('nova.compute.manager.ComputeManager._set_migration_status')
@mock.patch('nova.compute.resource_tracker.ResourceTracker.rebuild_claim')
def test_evacuate_late_server_group_policy_check_fails(
self, mock_rebuild_claim, mock_set_migration_status,
mock_validate_policy, mock_notify_legacy, mock_notify,
mock_instance_save, mock_context_elevated, mock_delete_allocation,
mock_instance_fault):
instance = fake_instance.fake_instance_obj(self.context)
instance.info_cache = None
elevated_context = mock.Mock()
mock_context_elevated.return_value = elevated_context
request_spec = objects.RequestSpec()
request_spec.scheduler_hints = {'group': [uuids.group]}
exc = exception.RescheduledException(
instance_uuid=instance.uuid, reason='policy violation')
mock_validate_policy.side_effect = exc
self.assertRaises(
exception.BuildAbortException, self.compute.rebuild_instance,
self.context, instance, None, None, None, None, None, None,
recreate=True, scheduled_node='fake-node',
request_spec=request_spec)
mock_validate_policy.assert_called_once_with(
elevated_context, instance, {'group': [uuids.group]})
mock_delete_allocation.assert_called_once_with(
instance, 'fake-node', node_type='destination')
mock_notify.assert_called_once_with(
elevated_context, instance, 'fake-mini', action='rebuild',
bdms=None, exception=exc, phase='error')
def test_rebuild_node_not_updated_if_not_recreate(self):
node = uuidutils.generate_uuid() # ironic node uuid
instance = fake_instance.fake_instance_obj(self.context, node=node)

View File

@@ -517,6 +517,15 @@ class ComputeRpcAPITestCase(test.NoDBTestCase):
reboot_type='type')
def test_rebuild_instance(self):
self._test_compute_api('rebuild_instance', 'cast', new_pass='None',
injected_files='None', image_ref='None', orig_image_ref='None',
bdms=[], instance=self.fake_instance_obj, host='new_host',
orig_sys_metadata=None, recreate=True, on_shared_storage=True,
preserve_ephemeral=True, migration=None, node=None,
limits=None, request_spec=None, version='4.22')
def test_rebuild_instance_remove_request_spec(self):
self.flags(group='upgrade_levels', compute='4.21')
self._test_compute_api('rebuild_instance', 'cast', new_pass='None',
injected_files='None', image_ref='None', orig_image_ref='None',
bdms=[], instance=self.fake_instance_obj, host='new_host',

View File

@@ -366,9 +366,6 @@ class _BaseTaskTestCase(object):
compute_rebuild_args['node'] = node
compute_rebuild_args['limits'] = limits
# Args that are passed in to the method but don't get passed to RPC
compute_rebuild_args.pop('request_spec')
return rebuild_args, compute_rebuild_args
@mock.patch.object(objects.InstanceMapping, 'get_by_instance_uuid')
@@ -1312,6 +1309,7 @@ class _BaseTaskTestCase(object):
select_dest_mock.assert_called_once_with(self.context, fake_spec,
inst_uuids, return_objects=True, return_alternates=False)
compute_args['host'] = expected_host
compute_args['request_spec'] = fake_spec
rebuild_mock.assert_called_once_with(self.context,
instance=inst_obj,
**compute_args)
@@ -1469,6 +1467,7 @@ class _BaseTaskTestCase(object):
fake_spec, [inst_obj.uuid], return_objects=True,
return_alternates=False)
compute_args['host'] = expected_host
compute_args['request_spec'] = fake_spec
rebuild_mock.assert_called_once_with(self.context,
instance=inst_obj,
**compute_args)