Make API always RPC cast to conductor for resize/migrate

This is a follow up to [1] to make the API behave consistently
by always asynchronously casting to conductor during resize
and cold migration regardless of same-cell or cross-cell
migration.

From the end user point of view, not much changes besides
the possibility of some exceptions occurring during scheduling
which would have resulted in a 400 BadRequest error.
The user still gets a 202 response, must poll the server status
until the server goes to VERIFY_RESIZE status or times out, and
can check the instance actions if the resize/migrate fails.

The specific errors that can occur are not really an API contract
and as such end user applications should not be building logic
around, for example, getting a NoValidHost error. It should be
noted, however, that by default non-admin users cannot see
the instance action event traceback that would contain the
error, e.g. NoValidHost.

The only exception types removed from handling in the API are
(1) AllocationMoveFailed which can be raised when the conductor
MigrationTask runs replace_allocation_with_migration and
(2) NoValidHost when the scheduler is called to select destinations.

Because of this, quite a few functional negative tests have to be
adjusted since the API no longer returns a 400 for NoValidHost and
other errors that can happen during scheduling.

Finally, the do_cast kwarg is left on the conductor API method since
the compute service calls it during same-cell reschedule as a
synchronous RPC call and has error handling if rescheduling in
conductor fails.

[1] I098f91d8c498e5a85266e193ad37c08aca4792b2

Change-Id: I711e56bcb4b72605253fa63be230a68e03e45b84
This commit is contained in:
Matt Riedemann 2019-11-12 09:25:56 -05:00
parent fb283dab57
commit a05ef30fb9
15 changed files with 132 additions and 110 deletions

View File

@ -76,15 +76,14 @@ class MigrateServerController(wsgi.Controller):
host_name=host_name)
except (exception.TooManyInstances, exception.QuotaError) as e:
raise exc.HTTPForbidden(explanation=e.format_message())
except (exception.InstanceIsLocked,
exception.AllocationMoveFailed) as e:
except exception.InstanceIsLocked as e:
raise exc.HTTPConflict(explanation=e.format_message())
except exception.InstanceInvalidState as state_error:
common.raise_http_conflict_for_instance_invalid_state(state_error,
'migrate', id)
except exception.InstanceNotFound as e:
raise exc.HTTPNotFound(explanation=e.format_message())
except (exception.NoValidHost, exception.ComputeHostNotFound,
except (exception.ComputeHostNotFound,
exception.CannotMigrateToSameHost) as e:
raise exc.HTTPBadRequest(explanation=e.format_message())

View File

@ -957,8 +957,7 @@ class ServersController(wsgi.Controller):
except exception.QuotaError as error:
raise exc.HTTPForbidden(
explanation=error.format_message())
except (exception.InstanceIsLocked,
exception.AllocationMoveFailed) as e:
except exception.InstanceIsLocked as e:
raise exc.HTTPConflict(explanation=e.format_message())
except exception.InstanceInvalidState as state_error:
common.raise_http_conflict_for_instance_invalid_state(state_error,
@ -974,8 +973,7 @@ class ServersController(wsgi.Controller):
except (exception.AutoDiskConfigDisabledByImage,
exception.CannotResizeDisk,
exception.CannotResizeToSameFlavor,
exception.FlavorNotFound,
exception.NoValidHost) as e:
exception.FlavorNotFound) as e:
raise exc.HTTPBadRequest(explanation=e.format_message())
except INVALID_FLAVOR_IMAGE_EXCEPTIONS as e:
raise exc.HTTPBadRequest(explanation=e.format_message())

View File

@ -3839,17 +3839,15 @@ class API(base.Base):
host=node.host, node=node.hypervisor_hostname,
allow_cross_cell_move=allow_cross_cell_resize)
# This is by default a synchronous RPC call to conductor which does not
# return until the MigrationTask in conductor RPC casts to the
# prep_resize method on the selected destination nova-compute service.
# However, if we are allowed to do a cross-cell resize, then we
# asynchronously RPC cast since the CrossCellMigrationTask is blocking.
# Asynchronously RPC cast to conductor so the response is not blocked
# during scheduling. If something fails the user can find out via
# instance actions.
self.compute_task_api.resize_instance(context, instance,
scheduler_hint=scheduler_hint,
flavor=new_instance_type,
clean_shutdown=clean_shutdown,
request_spec=request_spec,
do_cast=allow_cross_cell_resize)
do_cast=True)
@check_instance_lock
@check_instance_state(vm_state=[vm_states.ACTIVE, vm_states.STOPPED,

View File

@ -184,6 +184,17 @@ class InstanceActionEvent(base.NovaPersistentObject, base.NovaObject,
values['result'] = 'Success'
else:
values['result'] = 'Error'
# FIXME(mriedem): message is not used. The instance_actions_events
# table has a "details" column but not a "message" column which
# means the exc_val is never stored in the record. So far it does
# not matter because the exc_val is not exposed out of the API,
# but we should consider storing at least the exception type so
# we could expose that to non-admin owners of a server in the API
# since then they could see something like NoValidHost to know why
# the operation failed. Note by default policy non-admins are not
# allowed to see the traceback field. If we expose exc_val somehow
# we might consider re-using logic from exception_to_dict which
# is used to store an instance fault message.
values['message'] = exc_val
values['traceback'] = exc_tb
return values

View File

@ -347,6 +347,19 @@ class InstanceHelperMixin(object):
'actions: %s. Events in the last matching action: %s'
% (event_name, actions, events))
def _assert_resize_migrate_action_fail(self, server, action, error_in_tb):
"""Waits for the conductor_migrate_server action event to fail for
the given action and asserts the error is in the event traceback.
:param server: API response dict of the server being resized/migrated
:param action: Either "resize" or "migrate" instance action.
:param error_in_tb: Some expected part of the error event traceback.
"""
api = self.admin_api if hasattr(self, 'admin_api') else self.api
event = self._wait_for_action_fail_completion(
server, action, 'conductor_migrate_server', api=api)
self.assertIn(error_in_tb, event['traceback'])
def _wait_for_migration_status(self, server, expected_statuses):
"""Waits for a migration record with the given statuses to be found
for the given server, else the test fails. The migration record, if

View File

@ -11,7 +11,6 @@
# under the License.
from nova.tests import fixtures
from nova.tests.functional.api import client as api_client
from nova.tests.functional.notification_sample_tests \
import notification_sample_base
from nova.tests.unit import fake_notifier
@ -107,9 +106,9 @@ class TestComputeTaskNotificationSample(
fake_notifier.reset()
self.assertRaises(api_client.OpenStackApiException,
self.admin_api.post_server_action,
server['id'], {'migrate': None})
# Note that the operation will return a 202 response but fail with
# NoValidHost asynchronously.
self.admin_api.post_server_action(server['id'], {'migrate': None})
self._wait_for_notification('compute_task.migrate_server.error')
# 0. scheduler.select_destinations.start
# 1. compute_task.migrate_server.error

View File

@ -11,9 +11,8 @@
# under the License.
from oslo_utils.fixture import uuidsentinel as uuids
import six
from nova.tests.functional.api import client as api_client
from nova.compute import instance_actions
from nova.tests.functional import integrated_helpers
@ -70,8 +69,8 @@ class ResizeSameHostDoubledAllocations(
# fails because DISK_GB inventory allocation ratio is 1.0 and when
# resizing to the same host we'll be trying to claim 2048 but the
# provider only has a total of 1024.
ex = self.assertRaises(api_client.OpenStackApiException,
self.api.post_server_action,
server['id'], resize_req)
self.assertEqual(400, ex.response.status_code)
self.assertIn('No valid host was found', six.text_type(ex))
self.api.post_server_action(server['id'], resize_req,
check_response_status=[202])
# The instance action should have failed with details.
self._assert_resize_migrate_action_fail(
server, instance_actions.RESIZE, 'NoValidHost')

View File

@ -12,13 +12,11 @@
# License for the specific language governing permissions and limitations
# under the License.
import six
from nova.compute import instance_actions
from nova import context
from nova import objects
from nova import test
from nova.tests import fixtures as nova_fixtures
from nova.tests.functional.api import client
from nova.tests.functional import fixtures as func_fixtures
from nova.tests.functional import integrated_helpers
from nova.tests.unit.image import fake as image_fake
@ -118,18 +116,15 @@ class NonPersistentFieldNotResetTest(
source_compute_id, {'forced_down': 'true'})
# Cold migrate a server with a target host.
# If requested_destination is reset, the server is moved to a host
# that is not a requested target host.
# In that case, the response code is 202.
# If requested_destination is not reset, no valid host error (400) is
# returned because the target host is down.
ex = self.assertRaises(client.OpenStackApiException,
self.api.post_server_action,
server['id'],
{'migrate': {'host': target_host}})
self.assertEqual(400, ex.response.status_code)
self.assertIn('No valid host was found. No valid host found '
'for cold migrate', six.text_type(ex))
# The response status code is 202 even though the operation will
# fail because the requested target host is down which will result
# in a NoValidHost error.
self.api.post_server_action(
server['id'], {'migrate': {'host': target_host}},
check_response_status=[202])
# The instance action should have failed with details.
self._assert_resize_migrate_action_fail(
server, instance_actions.MIGRATE, 'NoValidHost')
# Make sure 'is_bfv' is set.
reqspec = objects.RequestSpec.get_by_instance_uuid(self.ctxt,

View File

@ -96,16 +96,17 @@ class DeletedServerAllocationRevertTest(
self._stub_delete_server_during_scheduling(server)
# Now start the cold migration which will fail due to NoValidHost.
# Note that we get a 404 back because this is a blocking call until
# conductor RPC casts to prep_resize on the selected dest host but
# we never get that far.
self.api.post_server_action(server['id'], {'migrate': None},
check_response_status=[404])
check_response_status=[202])
# We cannot monitor the migration from the API since it is deleted
# when the instance is deleted so just wait for the failed instance
# action event after the task rollback happens.
self._wait_for_action_fail_completion(
server, instance_actions.MIGRATE, 'cold_migrate', api=self.api)
# Note that we get InstanceNotFound rather than NoValidHost because
# the NoValidHost handler in ComputeTaskManager._cold_migrate calls
# _set_vm_state_and_notify which raises InstanceNotFound and masks
# the NoValidHost error.
self._assert_resize_migrate_action_fail(
server, instance_actions.MIGRATE, 'InstanceNotFound')
self._assert_no_allocations(server)
def test_live_migration_task_rollback(self):

View File

@ -10,6 +10,7 @@
# License for the specific language governing permissions and limitations
# under the License.
from nova.compute import instance_actions
from nova import test
from nova.tests import fixtures as nova_fixtures
from nova.tests.functional import fixtures as func_fixtures
@ -63,7 +64,7 @@ class MultiCellSchedulerTestCase(test.TestCase,
return self.admin_api.api_post(
'/servers/%s/action' % found_server['id'],
{'migrate': None},
check_response_status=[expected_status])
check_response_status=[expected_status]), found_server
def test_migrate_between_cells(self):
"""Verify that migrating between cells is not allowed.
@ -75,8 +76,10 @@ class MultiCellSchedulerTestCase(test.TestCase,
self.start_service('compute', host='compute1', cell=CELL1_NAME)
self.start_service('compute', host='compute2', cell=CELL2_NAME)
result = self._test_create_and_migrate(expected_status=400)
self.assertIn('No valid host', result.body['badRequest']['message'])
_, server = self._test_create_and_migrate(expected_status=202)
# The instance action should have failed with details.
self._assert_resize_migrate_action_fail(
server, instance_actions.MIGRATE, 'NoValidHost')
def test_migrate_within_cell(self):
"""Verify that migrating within cells is allowed.

View File

@ -17,6 +17,7 @@ import mock
from oslo_config import cfg
import six
from nova.compute import instance_actions
from nova import context
from nova.db import api as db
from nova.db.sqlalchemy import api as db_api
@ -346,11 +347,13 @@ class ServerGroupTestV21(ServerGroupTestBase):
servers = self._boot_servers_to_group(created_group)
post = {'migrate': {}}
ex = self.assertRaises(client.OpenStackApiException,
self.admin_api.post_server_action,
servers[1]['id'], post)
self.assertEqual(400, ex.response.status_code)
self.assertIn('No valid host found for cold migrate', ex.response.text)
# NoValidHost errors are handled in conductor after the API has cast
# off and returned a 202 response to the user.
self.admin_api.post_server_action(servers[1]['id'], post,
check_response_status=[202])
# The instance action should have failed with details.
self._assert_resize_migrate_action_fail(
servers[1], instance_actions.MIGRATE, 'NoValidHost')
def test_migrate_with_group_no_valid_host(self):
for group in [self.affinity, self.anti_affinity]:

View File

@ -2006,11 +2006,10 @@ class ServerMovingTests(integrated_helpers.ProviderUsageBaseTestCase):
}
}
resp = self.api.post_server_action(
server['id'], resize_req, check_response_status=[400])
self.assertEqual(
resp['badRequest']['message'],
"No valid host was found. No valid host found for resize")
self.api.post_server_action(
server['id'], resize_req, check_response_status=[202])
self._assert_resize_migrate_action_fail(
server, instance_actions.RESIZE, 'NoValidHost')
server = self.admin_api.get_server(server['id'])
self.assertEqual(source_hostname, server['OS-EXT-SRV-ATTR:host'])
@ -2210,10 +2209,9 @@ class ServerMovingTests(integrated_helpers.ProviderUsageBaseTestCase):
# migrate the server
post = {'migrate': None}
ex = self.assertRaises(client.OpenStackApiException,
self.api.post_server_action,
server['id'], post)
self.assertIn('No valid host', six.text_type(ex))
self.api.post_server_action(server['id'], post)
self._assert_resize_migrate_action_fail(
server, instance_actions.MIGRATE, 'NoValidHost')
expected_params = {'OS-EXT-SRV-ATTR:host': source_hostname,
'status': 'ACTIVE'}
self._wait_for_server_parameter(self.api, server, expected_params)
@ -3300,15 +3298,6 @@ class ServerMovingTests(integrated_helpers.ProviderUsageBaseTestCase):
data = {"resize": {"flavorRef": self.flavor2["id"]}}
self.api.post_server_action(server_uuid, data)
# fill_provider_mapping should have been called once for the initial
# build, once for the resize scheduling to the primary host and then
# once per reschedule.
expected_fill_count = 2
if num_alts > 1:
expected_fill_count += self.num_fails - 1
self.assertGreaterEqual(mock_fill_provider_map.call_count,
expected_fill_count)
if num_alts < fails:
# We will run out of alternates before populate_retry will
# raise a MaxRetriesExceeded exception, so the migration will
@ -3349,6 +3338,15 @@ class ServerMovingTests(integrated_helpers.ProviderUsageBaseTestCase):
migrations = objects.MigrationList.get_by_filters(ctxt, filters)
self.assertEqual(1, len(migrations.objects))
# fill_provider_mapping should have been called once for the initial
# build, once for the resize scheduling to the primary host and then
# once per reschedule.
expected_fill_count = 2
if num_alts > 1:
expected_fill_count += self.num_fails - 1
self.assertGreaterEqual(mock_fill_provider_map.call_count,
expected_fill_count)
def test_resize_reschedule_uses_host_lists_1_fail(self):
self._test_resize_reschedule_uses_host_lists(fails=1)
@ -4701,14 +4699,19 @@ class ConsumerGenerationConflictTest(
mock_put.return_value = rsp
request = {'migrate': None}
exception = self.assertRaises(client.OpenStackApiException,
self.api.post_server_action,
server['id'], request)
self.api.post_server_action(server['id'], request,
check_response_status=[202])
self._wait_for_server_parameter(self.admin_api, server,
{'OS-EXT-STS:task_state': None})
# I know that HTTP 500 is harsh code but I think this conflict case
# signals either a serious db inconsistency or a bug in nova's
# claim code.
self.assertEqual(500, exception.response.status_code)
# The instance action should have failed with details.
# save_and_reraise_exception gets different results between py2 and py3
# for the traceback but we want to use the more specific
# "claim_resources" for py3. We can remove this when we drop support
# for py2.
error_in_tb = 'claim_resources' if six.PY3 else 'select_destinations'
self._assert_resize_migrate_action_fail(
server, instance_actions.MIGRATE, error_in_tb)
# The migration is aborted so the instance is ACTIVE on the source
# host instead of being in VERIFY_RESIZE state.
@ -4741,15 +4744,16 @@ class ConsumerGenerationConflictTest(
mock_post.return_value = rsp
request = {'migrate': None}
exception = self.assertRaises(client.OpenStackApiException,
self.api.post_server_action,
server['id'], request)
self.api.post_server_action(server['id'], request,
check_response_status=[202])
self._wait_for_server_parameter(self.admin_api, server,
{'OS-EXT-STS:task_state': None})
self._assert_resize_migrate_action_fail(
server, instance_actions.MIGRATE, 'move_allocations')
self.assertEqual(1, mock_post.call_count)
self.assertEqual(409, exception.response.status_code)
self.assertIn('Failed to move allocations', exception.response.text)
migrations = self.api.get_migrations()
self.assertEqual(1, len(migrations))
self.assertEqual('migration', migrations[0]['migration_type'])
@ -6542,12 +6546,13 @@ class ServerMoveWithPortResourceRequestTest(
'nova.objects.Service.get_by_host_and_binary',
side_effect=fake_get_service):
ex = self.assertRaises(
client.OpenStackApiException,
self.api.post_server_action, server['id'], {'migrate': None})
self.api.post_server_action(server['id'], {'migrate': None},
check_response_status=[202])
self._wait_for_server_parameter(self.admin_api, server,
{'OS-EXT-STS:task_state': None})
self.assertEqual(400, ex.response.status_code)
self.assertIn('No valid host was found.', six.text_type(ex))
self._assert_resize_migrate_action_fail(
server, instance_actions.MIGRATE, 'NoValidHost')
# check that the server still allocates from the original host
self._check_allocation(
@ -6607,8 +6612,7 @@ class ServerMoveWithPortResourceRequestTest(
side_effect=fake_get_service):
self.api.post_server_action(server['id'], {'migrate': None})
self._wait_for_state_change(self.api, server, 'VERIFY_RESIZE')
self._wait_for_state_change(self.api, server, 'VERIFY_RESIZE')
migration_uuid = self.get_migration_uuid_for_instance(server['id'])
@ -6954,13 +6958,11 @@ class ServerMoveWithPortResourceRequestTest(
# enough information to do a proper port binding on the target host.
# The MigrationTask in the conductor checks that the RPC is new enough
# for this request for each possible destination provided by the
# scheduler and skips the old hosts.
ex = self.assertRaises(
client.OpenStackApiException, self.api.post_server_action,
server['id'], {'migrate': None})
self.assertEqual(400, ex.response.status_code)
self.assertIn('No valid host was found.', six.text_type(ex))
# scheduler and skips the old hosts. The actual response will be a 202
# so we have to wait for the failed instance action event.
self.api.post_server_action(server['id'], {'migrate': None})
self._assert_resize_migrate_action_fail(
server, instance_actions.MIGRATE, 'NoValidHost')
# The migration is put into error
self._wait_for_migration_status(server, ['error'])

View File

@ -721,8 +721,6 @@ class ServerActionsControllerTestV21(test.TestCase):
(exception.ImageNotFound(image_id=image_id),
webob.exc.HTTPBadRequest),
(exception.Invalid, webob.exc.HTTPBadRequest),
(exception.NoValidHost(reason='Bad host'),
webob.exc.HTTPBadRequest),
(exception.AutoDiskConfigDisabledByImage(image=image_id),
webob.exc.HTTPBadRequest),
]
@ -794,15 +792,6 @@ class ServerActionsControllerTestV21(test.TestCase):
self.controller._action_resize,
self.req, FAKE_UUID, body=body)
@mock.patch('nova.compute.api.API.resize',
side_effect=exception.NoValidHost(reason=''))
def test_resize_raises_no_valid_host(self, mock_resize):
body = dict(resize=dict(flavorRef="http://localhost/3"))
self.assertRaises(webob.exc.HTTPBadRequest,
self.controller._action_resize,
self.req, FAKE_UUID, body=body)
@mock.patch.object(compute_api.API, 'resize')
def test_resize_instance_raise_auto_disk_config_exc(self, mock_resize):
mock_resize.side_effect = exception.AutoDiskConfigDisabledByImage(

View File

@ -1941,7 +1941,7 @@ class _ComputeAPIUnitTestMixIn(object):
scheduler_hint=scheduler_hint,
flavor=test.MatchType(objects.Flavor),
clean_shutdown=clean_shutdown,
request_spec=fake_spec, do_cast=allow_cross_cell_resize)
request_spec=fake_spec, do_cast=True)
else:
mock_resize.assert_not_called()

View File

@ -0,0 +1,12 @@
---
other:
- |
The ``resize`` and ``migrate`` server action APIs used to synchronously
block until a destination host is selected by the scheduler. Those APIs
now asynchronously return a response to the user before scheduling.
The response code remains 202 and users can monitor the operation via the
``status`` and ``OS-EXT-STS:task_state`` fields on the server resource and
also by using the ``os-instance-actions`` API. The most notable
change is ``NoValidHost`` will not be returned in a 400 error response
from the API if scheduling fails but that information is available via the
instance actions API interface.