Ensure source compute is up when confirming a resize

When _poll_unconfirmed_resizes runs, or a user tries to confirm a
resize in the API, and the source compute service is down, the
migration can be left stuck in "confirming" status because the
confirm operation never reached the source compute. Subsequent runs
of _poll_unconfirmed_resizes will not be able to auto-confirm the
resize, nor will the user be able to confirm it manually. An admin
could reset the server status to ACTIVE or ERROR, but then the source
compute never gets cleaned up since a resize can only be confirmed or
reverted on a server in VERIFY_RESIZE status.

This adds a check in the API before the migration record is updated:
if the source compute service is down, the API returns a 409 response
as an indication to try again later.
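
For API clients the practical effect is that a confirm attempt now
fails fast with a 409 while the source host is down, instead of
silently wedging the migration. A minimal retry sketch (the endpoint,
token, and retry budget below are illustrative assumptions, not part
of this change):

    import time

    import requests

    COMPUTE_URL = 'http://controller:8774/v2.1'  # assumed endpoint
    TOKEN = 'gAAAA-example'  # assumed Keystone token

    def confirm_resize_with_retry(server_id, retries=5, delay=30):
        """POST confirmResize, retrying while the API returns 409."""
        url = '%s/servers/%s/action' % (COMPUTE_URL, server_id)
        headers = {'X-Auth-Token': TOKEN}
        for _ in range(retries):
            resp = requests.post(
                url, json={'confirmResize': None}, headers=headers)
            if resp.status_code != 409:
                # 204 means the confirm was cast to the source compute.
                resp.raise_for_status()
                return
            # A 409 may mean the source compute service is down (it is
            # also returned for a locked instance), so wait and retry
            # rather than resetting the server state out of band.
            time.sleep(delay)
        raise RuntimeError(
            'confirmResize still returning 409 after %d attempts' % retries)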

SingleCellSimple._fake_target_cell is updated so that tests using
it can assert when a context was targeted without having to stub
nova.context.target_cell. As a result, some HostManager unit tests
needed to be updated.
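
With that fixture change a test can observe targeting directly on the
context, as the new test_get_source_compute_service unit test below
does. A condensed sketch of the pattern (the ctxt and cell_mapping
variables here are illustrative):

    # Inside a test using the SingleCellSimple fixture, where
    # nova.context.target_cell is replaced by _fake_target_cell.
    with nova_context.target_cell(ctxt, cell_mapping) as cctxt:
        # The fixture stashed the targeted cell's uuid on the context.
        self.assertEqual(cell_mapping.uuid, cctxt.cell_uuid)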

Change-Id: I33aa5e32cb321e5a16da51e227af2f67ed9e6713
Closes-Bug: #1855927
commit e4601c77fb (parent 8ca9b7254b)
Matt Riedemann, 2019-12-16 16:15:24 -05:00; committed by Lee Yarwood
7 changed files with 129 additions and 35 deletions

nova/api/openstack/compute/servers.py

@@ -885,7 +885,10 @@ class ServersController(wsgi.Controller):
         except exception.MigrationNotFound:
             msg = _("Instance has not been resized.")
             raise exc.HTTPBadRequest(explanation=msg)
-        except exception.InstanceIsLocked as e:
+        except (
+            exception.InstanceIsLocked,
+            exception.ServiceUnavailable,
+        ) as e:
             raise exc.HTTPConflict(explanation=e.format_message())
         except exception.InstanceInvalidState as state_error:
             common.raise_http_conflict_for_instance_invalid_state(state_error,

nova/compute/api.py

@@ -3775,10 +3775,40 @@ class API(base.Base):
                                                        migration.dest_compute,
                                                        reqspec)

+    @staticmethod
+    def _get_source_compute_service(context, migration):
+        """Find the source compute Service object given the Migration.
+
+        :param context: nova auth RequestContext targeted at the destination
+            compute cell
+        :param migration: Migration object for the move operation
+        :return: Service object representing the source host nova-compute
+        """
+        if migration.cross_cell_move:
+            # The source compute could be in another cell so look up the
+            # HostMapping to determine the source cell.
+            hm = objects.HostMapping.get_by_host(
+                context, migration.source_compute)
+            with nova_context.target_cell(context, hm.cell_mapping) as cctxt:
+                return objects.Service.get_by_compute_host(
+                    cctxt, migration.source_compute)
+        # Same-cell migration so just use the context we have.
+        return objects.Service.get_by_compute_host(
+            context, migration.source_compute)
+
     @check_instance_lock
     @check_instance_state(vm_state=[vm_states.RESIZED])
     def confirm_resize(self, context, instance, migration=None):
-        """Confirms a migration/resize and deletes the 'old' instance."""
+        """Confirms a migration/resize and deletes the 'old' instance.
+
+        :param context: nova auth RequestContext
+        :param instance: Instance object to confirm the resize
+        :param migration: Migration object; provided if called from the
+            _poll_unconfirmed_resizes periodic task on the dest compute.
+        :raises: MigrationNotFound if migration is not provided and a migration
+            cannot be found for the instance with status "finished".
+        :raises: ServiceUnavailable if the source compute service is down.
+        """
         elevated = context.elevated()
         # NOTE(melwitt): We're not checking quota here because there isn't a
         # change in resource usage when confirming a resize. Resource
@@ -3789,6 +3819,13 @@ class API(base.Base):
             migration = objects.Migration.get_by_instance_and_status(
                 elevated, instance.uuid, 'finished')

+        # Check if the source compute service is up before modifying the
+        # migration record because once we do we cannot come back through this
+        # method since it will be looking for a "finished" status migration.
+        source_svc = self._get_source_compute_service(context, migration)
+        if not self.servicegroup_api.service_is_up(source_svc):
+            raise exception.ServiceUnavailable()
+
         migration.status = 'confirming'
         migration.save()

nova/tests/fixtures.py

@@ -324,8 +324,12 @@ class SingleCellSimple(fixtures.Fixture):

     @contextmanager
     def _fake_target_cell(self, context, target_cell):
-        # NOTE(danms): Just pass through the context without actually
-        # targeting anything.
+        # Just do something simple and set/unset the cell_uuid on the context.
+        if target_cell:
+            context.cell_uuid = getattr(target_cell, 'uuid',
+                                        uuidsentinel.cell1)
+        else:
+            context.cell_uuid = None
         yield context

     def _fake_set_target_cell(self, context, cell_mapping):

nova/tests/functional/test_cross_cell_migrate.py

@@ -963,6 +963,10 @@ class TestMultiCellMigrate(integrated_helpers.ProviderUsageBaseTestCase):
             server2['id'], {'migrate': {'host': 'host2'}})
         self._wait_for_migration_status(server2, ['error'])

+    @mock.patch('nova.compute.api.API._get_source_compute_service',
+                new=mock.Mock())
+    @mock.patch('nova.servicegroup.api.API.service_is_up',
+                new=mock.Mock(return_value=True))
     def test_poll_unconfirmed_resizes_with_upcall(self):
         """Tests the _poll_unconfirmed_resizes periodic task with a cross-cell
         resize once the instance is in VERIFY_RESIZE status on the dest host.
@@ -993,6 +997,10 @@ class TestMultiCellMigrate(integrated_helpers.ProviderUsageBaseTestCase):
         self._assert_confirm(
             server, source_rp_uuid, target_rp_uuid, new_flavor)

+    @mock.patch('nova.compute.api.API._get_source_compute_service',
+                new=mock.Mock())
+    @mock.patch('nova.servicegroup.api.API.service_is_up',
+                new=mock.Mock(return_value=True))
     def test_poll_unconfirmed_resizes_with_no_upcall(self):
         """Tests the _poll_unconfirmed_resizes periodic task with a cross-cell
         resize once the instance is in VERIFY_RESIZE status on the dest host.

nova/tests/functional/test_servers.py

@@ -46,7 +46,6 @@ from nova.tests import fixtures as nova_fixtures
 from nova.tests.functional.api import client
 from nova.tests.functional import integrated_helpers
 from nova.tests.unit.api.openstack import fakes
-from nova.tests.unit import cast_as_call
 from nova.tests.unit import fake_block_device
 from nova.tests.unit import fake_notifier
 from nova.tests.unit import fake_requests
@@ -3359,34 +3358,37 @@ class PollUnconfirmedResizesTest(integrated_helpers.ProviderUsageBaseTestCase):
         self.api.put_service(
             source_service['id'], {'status': 'disabled', 'forced_down': True})
         # Now configure auto-confirm and call the method on the target compute
-        # so we do not have to wait for the periodic to run. Also configure
-        # the RPC call from the API to the source compute to timeout after 1
-        # second.
-        self.flags(resize_confirm_window=1, rpc_response_timeout=1)
+        # so we do not have to wait for the periodic to run.
+        self.flags(resize_confirm_window=1)
         # Stub timeutils so the DB API query finds the unconfirmed migration.
         future = timeutils.utcnow() + datetime.timedelta(hours=1)
         ctxt = context.get_admin_context()
         with osloutils_fixture.TimeFixture(future):
-            # Use the CastAsCall fixture since the fake rpc is not going to
-            # simulate a failure from the service being down.
-            with cast_as_call.CastAsCall(self):
-                self.computes['host2'].manager._poll_unconfirmed_resizes(ctxt)
+            self.computes['host2'].manager._poll_unconfirmed_resizes(ctxt)
         self.assertIn('Error auto-confirming resize',
                       self.stdlog.logger.output)
-        self.assertIn('No reply on topic compute', self.stdlog.logger.output)
-        # FIXME(mriedem): This is bug 1855927 where the migration status is
-        # left in "confirming" so the _poll_unconfirmed_resizes task will not
-        # process it again nor can the user confirm the resize in the API since
-        # the migration status is not "finished".
-        self._wait_for_migration_status(server, ['confirming'])
+        self.assertIn('Service is unavailable at this time',
+                      self.stdlog.logger.output)
+        # The source compute service check should have been done before the
+        # migration status was updated so it should still be "finished".
+        self._wait_for_migration_status(server, ['finished'])
         # Try to confirm in the API while the source compute service is still
         # down to assert the 409 (rather than a 500) error.
         ex = self.assertRaises(client.OpenStackApiException,
                                self.api.post_server_action,
                                server['id'], {'confirmResize': None})
-        self.assertEqual(400, ex.response.status_code)
-        self.assertIn('Instance has not been resized', six.text_type(ex))
-        # That error message is bogus because the server is resized.
-        server = self.api.get_server(server['id'])
-        self.assertEqual('VERIFY_RESIZE', server['status'])
+        self.assertEqual(409, ex.response.status_code)
+        self.assertIn('Service is unavailable at this time', six.text_type(ex))
+        # Bring the source compute back up and try to confirm the resize which
+        # should work since the migration status is still "finished".
+        self.restart_compute_service(self.computes['host1'])
+        self.api.put_service(
+            source_service['id'], {'status': 'enabled', 'forced_down': False})
+        # Use the API to confirm the resize because _poll_unconfirmed_resizes
+        # requires mucking with the current time which causes problems with
+        # the service_is_up check in the API.
+        self.api.post_server_action(server['id'], {'confirmResize': None})
+        self._wait_for_state_change(server, 'ACTIVE')

 class ServerLiveMigrateForceAndAbort(

nova/tests/unit/compute/test_compute_api.py

@@ -1644,10 +1644,13 @@ class _ComputeAPIUnitTestMixIn(object):

         test()

+    @mock.patch('nova.compute.api.API._get_source_compute_service')
+    @mock.patch('nova.servicegroup.api.API.service_is_up', return_value=True)
     @mock.patch.object(objects.Migration, 'save')
     @mock.patch.object(objects.Migration, 'get_by_instance_and_status')
     @mock.patch.object(context.RequestContext, 'elevated')
     def _test_confirm_resize(self, mock_elevated, mock_get, mock_save,
+                             mock_service_is_up, mock_get_service,
                              mig_ref_passed=False):
         params = dict(vm_state=vm_states.RESIZED)
         fake_inst = self._create_instance_obj(params=params)
@@ -1678,6 +1681,8 @@ class _ComputeAPIUnitTestMixIn(object):
             self.compute_api.confirm_resize(self.context, fake_inst)

         mock_elevated.assert_called_once_with()
+        mock_service_is_up.assert_called_once_with(
+            mock_get_service.return_value)
         mock_save.assert_called_once_with()
         mock_record.assert_called_once_with(self.context, fake_inst,
                                             'confirmResize')
@@ -1694,6 +1699,34 @@ class _ComputeAPIUnitTestMixIn(object):
     def test_confirm_resize_with_migration_ref(self):
         self._test_confirm_resize(mig_ref_passed=True)

+    @mock.patch('nova.objects.HostMapping.get_by_host',
+                return_value=objects.HostMapping(
+                    cell_mapping=objects.CellMapping(
+                        database_connection='fake://', transport_url='none://',
+                        uuid=uuids.cell_uuid)))
+    @mock.patch('nova.objects.Service.get_by_compute_host')
+    def test_get_source_compute_service(self, mock_service_get, mock_hm_get):
+        # First start with a same-cell migration.
+        migration = objects.Migration(source_compute='source.host',
+                                      cross_cell_move=False)
+        self.compute_api._get_source_compute_service(self.context, migration)
+        mock_hm_get.assert_not_called()
+        mock_service_get.assert_called_once_with(self.context, 'source.host')
+        # Make sure the context was not targeted.
+        ctxt = mock_service_get.call_args[0][0]
+        self.assertIsNone(ctxt.cell_uuid)
+
+        # Now test with a cross-cell migration.
+        mock_service_get.reset_mock()
+        migration.cross_cell_move = True
+        self.compute_api._get_source_compute_service(self.context, migration)
+        mock_hm_get.assert_called_once_with(self.context, 'source.host')
+        mock_service_get.assert_called_once_with(
+            test.MatchType(context.RequestContext), 'source.host')
+        # Make sure the context was targeted.
+        ctxt = mock_service_get.call_args[0][0]
+        self.assertEqual(uuids.cell_uuid, ctxt.cell_uuid)
+
     @mock.patch('nova.virt.hardware.numa_get_constraints')
     @mock.patch('nova.network.neutron.API.get_requested_resource_for_instance',
                 return_value=[])
@@ -7460,8 +7493,10 @@ class ComputeAPIUnitTestCase(_ComputeAPIUnitTestMixIn, test.NoDBTestCase):
         self._test_get_migrations_sorted_filter_duplicates(
             [older, newer], newer)

+    @mock.patch('nova.servicegroup.api.API.service_is_up', return_value=True)
     @mock.patch('nova.objects.Migration.get_by_instance_and_status')
-    def test_confirm_resize_cross_cell_move_true(self, mock_migration_get):
+    def test_confirm_resize_cross_cell_move_true(self, mock_migration_get,
+                                                 mock_service_is_up):
         """Tests confirm_resize where Migration.cross_cell_move is True"""
         instance = fake_instance.fake_instance_obj(
             self.context, vm_state=vm_states.RESIZED, task_state=None,
@@ -7475,12 +7510,15 @@ class ComputeAPIUnitTestCase(_ComputeAPIUnitTestMixIn, test.NoDBTestCase):
             mock.patch.object(self.compute_api, '_record_action_start'),
             mock.patch.object(self.compute_api.compute_task_api,
                               'confirm_snapshot_based_resize'),
+            mock.patch.object(self.compute_api, '_get_source_compute_service'),
         ) as (
             mock_elevated, mock_migration_save, mock_record_action,
-            mock_conductor_confirm
+            mock_conductor_confirm, mock_get_service
         ):
             self.compute_api.confirm_resize(self.context, instance)
         mock_elevated.assert_called_once_with()
+        mock_service_is_up.assert_called_once_with(
+            mock_get_service.return_value)
         mock_migration_save.assert_called_once_with()
         self.assertEqual('confirming', migration.status)
         mock_record_action.assert_called_once_with(

nova/tests/unit/scheduler/test_host_manager.py

@@ -219,7 +219,8 @@ class HostManagerTestCase(test.NoDBTestCase):
         inst2 = objects.Instance(host='host1', uuid=uuids.instance_2)
         inst3 = objects.Instance(host='host2', uuid=uuids.instance_3)
         cell = objects.CellMapping(database_connection='',
-                                   target_url='')
+                                   target_url='',
+                                   uuid=uuids.cell_uuid)
         mock_get_by_filters.return_value = objects.InstanceList(
             objects=[inst1, inst2, inst3])
         hm = self.host_manager
@@ -589,7 +590,7 @@ class HostManagerTestCase(test.NoDBTestCase):
         mock_get_by_host.return_value = []
         mock_get_all.return_value = fakes.COMPUTE_NODES
         mock_get_by_binary.return_value = fakes.SERVICES
-        context = 'fake_context'
+        context = nova_context.get_admin_context()

         compute_nodes, services = self.host_manager._get_computes_for_cells(
             context, self.host_manager.enabled_cells)
@@ -786,7 +787,7 @@ class HostManagerTestCase(test.NoDBTestCase):

     @mock.patch('nova.objects.InstanceList.get_uuids_by_host')
     def test_host_state_not_updated(self, mock_get_by_host):
-        context = 'fake_context'
+        context = nova_context.get_admin_context()
         hm = self.host_manager
         inst1 = objects.Instance(uuid=uuids.instance)
         cn1 = objects.ComputeNode(host='host1')
@@ -806,10 +807,11 @@ class HostManagerTestCase(test.NoDBTestCase):

     @mock.patch('nova.objects.InstanceList.get_uuids_by_host')
     def test_recreate_instance_info(self, mock_get_by_host):
+        context = nova_context.get_admin_context()
         host_name = 'fake_host'
-        inst1 = fake_instance.fake_instance_obj('fake_context',
+        inst1 = fake_instance.fake_instance_obj(context,
                                                 uuid=uuids.instance_1)
-        inst2 = fake_instance.fake_instance_obj('fake_context',
+        inst2 = fake_instance.fake_instance_obj(context,
                                                 uuid=uuids.instance_2)
         orig_inst_dict = {inst1.uuid: inst1, inst2.uuid: inst2}
         mock_get_by_host.return_value = [uuids.instance_1, uuids.instance_2]
@@ -818,7 +820,7 @@ class HostManagerTestCase(test.NoDBTestCase):
             'instances': orig_inst_dict,
             'updated': True,
         }}
-        self.host_manager._recreate_instance_info('fake_context', host_name)
+        self.host_manager._recreate_instance_info(context, host_name)
         new_info = self.host_manager._instance_info[host_name]
         self.assertEqual(len(new_info['instances']),
                          len(mock_get_by_host.return_value))
@@ -1231,7 +1233,7 @@ class HostManagerChangedNodesTestCase(test.NoDBTestCase):
         mock_get_by_host.return_value = []
         mock_get_all.return_value = fakes.COMPUTE_NODES
         mock_get_by_binary.return_value = fakes.SERVICES
-        context = 'fake_context'
+        context = nova_context.get_admin_context()

         compute_nodes, services = self.host_manager._get_computes_for_cells(
             context, self.host_manager.enabled_cells)
@@ -1256,7 +1258,7 @@ class HostManagerChangedNodesTestCase(test.NoDBTestCase):
         mock_get_by_host.return_value = []
         mock_get_all.side_effect = [fakes.COMPUTE_NODES, running_nodes]
         mock_get_by_binary.side_effect = [fakes.SERVICES, fakes.SERVICES]
-        context = 'fake_context'
+        context = nova_context.get_admin_context()

         # first call: all nodes
         compute_nodes, services = self.host_manager._get_computes_for_cells(
@@ -1286,7 +1288,7 @@ class HostManagerChangedNodesTestCase(test.NoDBTestCase):
         mock_get_by_host.return_value = []
         mock_get_all.side_effect = [fakes.COMPUTE_NODES, []]
         mock_get_by_binary.side_effect = [fakes.SERVICES, fakes.SERVICES]
-        context = 'fake_context'
+        context = nova_context.get_admin_context()

         # first call: all nodes
         compute_nodes, services = self.host_manager._get_computes_for_cells(