Merge "Make API always RPC cast to conductor for resize/migrate"

This commit is contained in:
Zuul 2019-11-14 08:49:50 +00:00 committed by Gerrit Code Review
commit 54ac837865
15 changed files with 132 additions and 110 deletions

View File

@ -76,15 +76,14 @@ class MigrateServerController(wsgi.Controller):
host_name=host_name)
except (exception.TooManyInstances, exception.QuotaError) as e:
raise exc.HTTPForbidden(explanation=e.format_message())
except (exception.InstanceIsLocked,
exception.AllocationMoveFailed) as e:
except exception.InstanceIsLocked as e:
raise exc.HTTPConflict(explanation=e.format_message())
except exception.InstanceInvalidState as state_error:
common.raise_http_conflict_for_instance_invalid_state(state_error,
'migrate', id)
except exception.InstanceNotFound as e:
raise exc.HTTPNotFound(explanation=e.format_message())
except (exception.NoValidHost, exception.ComputeHostNotFound,
except (exception.ComputeHostNotFound,
exception.CannotMigrateToSameHost) as e:
raise exc.HTTPBadRequest(explanation=e.format_message())

View File

@ -958,8 +958,7 @@ class ServersController(wsgi.Controller):
except exception.QuotaError as error:
raise exc.HTTPForbidden(
explanation=error.format_message())
except (exception.InstanceIsLocked,
exception.AllocationMoveFailed) as e:
except exception.InstanceIsLocked as e:
raise exc.HTTPConflict(explanation=e.format_message())
except exception.InstanceInvalidState as state_error:
common.raise_http_conflict_for_instance_invalid_state(state_error,
@ -975,8 +974,7 @@ class ServersController(wsgi.Controller):
except (exception.AutoDiskConfigDisabledByImage,
exception.CannotResizeDisk,
exception.CannotResizeToSameFlavor,
exception.FlavorNotFound,
exception.NoValidHost) as e:
exception.FlavorNotFound) as e:
raise exc.HTTPBadRequest(explanation=e.format_message())
except INVALID_FLAVOR_IMAGE_EXCEPTIONS as e:
raise exc.HTTPBadRequest(explanation=e.format_message())

View File

@ -3837,17 +3837,15 @@ class API(base.Base):
host=node.host, node=node.hypervisor_hostname,
allow_cross_cell_move=allow_cross_cell_resize)
# This is by default a synchronous RPC call to conductor which does not
# return until the MigrationTask in conductor RPC casts to the
# prep_resize method on the selected destination nova-compute service.
# However, if we are allowed to do a cross-cell resize, then we
# asynchronously RPC cast since the CrossCellMigrationTask is blocking.
# Asynchronously RPC cast to conductor so the response is not blocked
# during scheduling. If something fails the user can find out via
# instance actions.
self.compute_task_api.resize_instance(context, instance,
scheduler_hint=scheduler_hint,
flavor=new_instance_type,
clean_shutdown=clean_shutdown,
request_spec=request_spec,
do_cast=allow_cross_cell_resize)
do_cast=True)
@check_instance_lock
@check_instance_state(vm_state=[vm_states.ACTIVE, vm_states.STOPPED,

View File

@ -184,6 +184,17 @@ class InstanceActionEvent(base.NovaPersistentObject, base.NovaObject,
values['result'] = 'Success'
else:
values['result'] = 'Error'
# FIXME(mriedem): message is not used. The instance_actions_events
# table has a "details" column but not a "message" column which
# means the exc_val is never stored in the record. So far it does
# not matter because the exc_val is not exposed out of the API,
# but we should consider storing at least the exception type so
# we could expose that to non-admin owners of a server in the API
# since then they could see something like NoValidHost to know why
# the operation failed. Note by default policy non-admins are not
# allowed to see the traceback field. If we expose exc_val somehow
# we might consider re-using logic from exception_to_dict which
# is used to store an instance fault message.
values['message'] = exc_val
values['traceback'] = exc_tb
return values

View File

@ -347,6 +347,19 @@ class InstanceHelperMixin(object):
'actions: %s. Events in the last matching action: %s'
% (event_name, actions, events))
def _assert_resize_migrate_action_fail(self, server, action, error_in_tb):
"""Waits for the conductor_migrate_server action event to fail for
the given action and asserts the error is in the event traceback.
:param server: API response dict of the server being resized/migrated
:param action: Either "resize" or "migrate" instance action.
:param error_in_tb: Some expected part of the error event traceback.
"""
api = self.admin_api if hasattr(self, 'admin_api') else self.api
event = self._wait_for_action_fail_completion(
server, action, 'conductor_migrate_server', api=api)
self.assertIn(error_in_tb, event['traceback'])
def _wait_for_migration_status(self, server, expected_statuses):
"""Waits for a migration record with the given statuses to be found
for the given server, else the test fails. The migration record, if

View File

@ -11,7 +11,6 @@
# under the License.
from nova.tests import fixtures
from nova.tests.functional.api import client as api_client
from nova.tests.functional.notification_sample_tests \
import notification_sample_base
from nova.tests.unit import fake_notifier
@ -107,9 +106,9 @@ class TestComputeTaskNotificationSample(
fake_notifier.reset()
self.assertRaises(api_client.OpenStackApiException,
self.admin_api.post_server_action,
server['id'], {'migrate': None})
# Note that the operation will return a 202 response but fail with
# NoValidHost asynchronously.
self.admin_api.post_server_action(server['id'], {'migrate': None})
self._wait_for_notification('compute_task.migrate_server.error')
# 0. scheduler.select_destinations.start
# 1. compute_task.migrate_server.error

View File

@ -11,9 +11,8 @@
# under the License.
from oslo_utils.fixture import uuidsentinel as uuids
import six
from nova.tests.functional.api import client as api_client
from nova.compute import instance_actions
from nova.tests.functional import integrated_helpers
@ -70,8 +69,8 @@ class ResizeSameHostDoubledAllocations(
# fails because DISK_GB inventory allocation ratio is 1.0 and when
# resizing to the same host we'll be trying to claim 2048 but the
# provider only has a total of 1024.
ex = self.assertRaises(api_client.OpenStackApiException,
self.api.post_server_action,
server['id'], resize_req)
self.assertEqual(400, ex.response.status_code)
self.assertIn('No valid host was found', six.text_type(ex))
self.api.post_server_action(server['id'], resize_req,
check_response_status=[202])
# The instance action should have failed with details.
self._assert_resize_migrate_action_fail(
server, instance_actions.RESIZE, 'NoValidHost')

View File

@ -12,13 +12,11 @@
# License for the specific language governing permissions and limitations
# under the License.
import six
from nova.compute import instance_actions
from nova import context
from nova import objects
from nova import test
from nova.tests import fixtures as nova_fixtures
from nova.tests.functional.api import client
from nova.tests.functional import fixtures as func_fixtures
from nova.tests.functional import integrated_helpers
from nova.tests.unit.image import fake as image_fake
@ -118,18 +116,15 @@ class NonPersistentFieldNotResetTest(
source_compute_id, {'forced_down': 'true'})
# Cold migrate a server with a target host.
# If requested_destination is reset, the server is moved to a host
# that is not a requested target host.
# In that case, the response code is 202.
# If requested_destination is not reset, no valid host error (400) is
# returned because the target host is down.
ex = self.assertRaises(client.OpenStackApiException,
self.api.post_server_action,
server['id'],
{'migrate': {'host': target_host}})
self.assertEqual(400, ex.response.status_code)
self.assertIn('No valid host was found. No valid host found '
'for cold migrate', six.text_type(ex))
# The response status code is 202 even though the operation will
# fail because the requested target host is down which will result
# in a NoValidHost error.
self.api.post_server_action(
server['id'], {'migrate': {'host': target_host}},
check_response_status=[202])
# The instance action should have failed with details.
self._assert_resize_migrate_action_fail(
server, instance_actions.MIGRATE, 'NoValidHost')
# Make sure 'is_bfv' is set.
reqspec = objects.RequestSpec.get_by_instance_uuid(self.ctxt,

View File

@ -96,16 +96,17 @@ class DeletedServerAllocationRevertTest(
self._stub_delete_server_during_scheduling(server)
# Now start the cold migration which will fail due to NoValidHost.
# Note that we get a 404 back because this is a blocking call until
# conductor RPC casts to prep_resize on the selected dest host but
# we never get that far.
self.api.post_server_action(server['id'], {'migrate': None},
check_response_status=[404])
check_response_status=[202])
# We cannot monitor the migration from the API since it is deleted
# when the instance is deleted so just wait for the failed instance
# action event after the task rollback happens.
self._wait_for_action_fail_completion(
server, instance_actions.MIGRATE, 'cold_migrate', api=self.api)
# Note that we get InstanceNotFound rather than NoValidHost because
# the NoValidHost handler in ComputeTaskManager._cold_migrate calls
# _set_vm_state_and_notify which raises InstanceNotFound and masks
# the NoValidHost error.
self._assert_resize_migrate_action_fail(
server, instance_actions.MIGRATE, 'InstanceNotFound')
self._assert_no_allocations(server)
def test_live_migration_task_rollback(self):

View File

@ -10,6 +10,7 @@
# License for the specific language governing permissions and limitations
# under the License.
from nova.compute import instance_actions
from nova import test
from nova.tests import fixtures as nova_fixtures
from nova.tests.functional import fixtures as func_fixtures
@ -63,7 +64,7 @@ class MultiCellSchedulerTestCase(test.TestCase,
return self.admin_api.api_post(
'/servers/%s/action' % found_server['id'],
{'migrate': None},
check_response_status=[expected_status])
check_response_status=[expected_status]), found_server
def test_migrate_between_cells(self):
"""Verify that migrating between cells is not allowed.
@ -75,8 +76,10 @@ class MultiCellSchedulerTestCase(test.TestCase,
self.start_service('compute', host='compute1', cell=CELL1_NAME)
self.start_service('compute', host='compute2', cell=CELL2_NAME)
result = self._test_create_and_migrate(expected_status=400)
self.assertIn('No valid host', result.body['badRequest']['message'])
_, server = self._test_create_and_migrate(expected_status=202)
# The instance action should have failed with details.
self._assert_resize_migrate_action_fail(
server, instance_actions.MIGRATE, 'NoValidHost')
def test_migrate_within_cell(self):
"""Verify that migrating within cells is allowed.

View File

@ -17,6 +17,7 @@ import mock
from oslo_config import cfg
import six
from nova.compute import instance_actions
from nova import context
from nova.db import api as db
from nova.db.sqlalchemy import api as db_api
@ -346,11 +347,13 @@ class ServerGroupTestV21(ServerGroupTestBase):
servers = self._boot_servers_to_group(created_group)
post = {'migrate': {}}
ex = self.assertRaises(client.OpenStackApiException,
self.admin_api.post_server_action,
servers[1]['id'], post)
self.assertEqual(400, ex.response.status_code)
self.assertIn('No valid host found for cold migrate', ex.response.text)
# NoValidHost errors are handled in conductor after the API has cast
# off and returned a 202 response to the user.
self.admin_api.post_server_action(servers[1]['id'], post,
check_response_status=[202])
# The instance action should have failed with details.
self._assert_resize_migrate_action_fail(
servers[1], instance_actions.MIGRATE, 'NoValidHost')
def test_migrate_with_group_no_valid_host(self):
for group in [self.affinity, self.anti_affinity]:

View File

@ -2006,11 +2006,10 @@ class ServerMovingTests(integrated_helpers.ProviderUsageBaseTestCase):
}
}
resp = self.api.post_server_action(
server['id'], resize_req, check_response_status=[400])
self.assertEqual(
resp['badRequest']['message'],
"No valid host was found. No valid host found for resize")
self.api.post_server_action(
server['id'], resize_req, check_response_status=[202])
self._assert_resize_migrate_action_fail(
server, instance_actions.RESIZE, 'NoValidHost')
server = self.admin_api.get_server(server['id'])
self.assertEqual(source_hostname, server['OS-EXT-SRV-ATTR:host'])
@ -2210,10 +2209,9 @@ class ServerMovingTests(integrated_helpers.ProviderUsageBaseTestCase):
# migrate the server
post = {'migrate': None}
ex = self.assertRaises(client.OpenStackApiException,
self.api.post_server_action,
server['id'], post)
self.assertIn('No valid host', six.text_type(ex))
self.api.post_server_action(server['id'], post)
self._assert_resize_migrate_action_fail(
server, instance_actions.MIGRATE, 'NoValidHost')
expected_params = {'OS-EXT-SRV-ATTR:host': source_hostname,
'status': 'ACTIVE'}
self._wait_for_server_parameter(self.api, server, expected_params)
@ -3300,15 +3298,6 @@ class ServerMovingTests(integrated_helpers.ProviderUsageBaseTestCase):
data = {"resize": {"flavorRef": self.flavor2["id"]}}
self.api.post_server_action(server_uuid, data)
# fill_provider_mapping should have been called once for the initial
# build, once for the resize scheduling to the primary host and then
# once per reschedule.
expected_fill_count = 2
if num_alts > 1:
expected_fill_count += self.num_fails - 1
self.assertGreaterEqual(mock_fill_provider_map.call_count,
expected_fill_count)
if num_alts < fails:
# We will run out of alternates before populate_retry will
# raise a MaxRetriesExceeded exception, so the migration will
@ -3349,6 +3338,15 @@ class ServerMovingTests(integrated_helpers.ProviderUsageBaseTestCase):
migrations = objects.MigrationList.get_by_filters(ctxt, filters)
self.assertEqual(1, len(migrations.objects))
# fill_provider_mapping should have been called once for the initial
# build, once for the resize scheduling to the primary host and then
# once per reschedule.
expected_fill_count = 2
if num_alts > 1:
expected_fill_count += self.num_fails - 1
self.assertGreaterEqual(mock_fill_provider_map.call_count,
expected_fill_count)
def test_resize_reschedule_uses_host_lists_1_fail(self):
self._test_resize_reschedule_uses_host_lists(fails=1)
@ -4701,14 +4699,19 @@ class ConsumerGenerationConflictTest(
mock_put.return_value = rsp
request = {'migrate': None}
exception = self.assertRaises(client.OpenStackApiException,
self.api.post_server_action,
server['id'], request)
self.api.post_server_action(server['id'], request,
check_response_status=[202])
self._wait_for_server_parameter(self.admin_api, server,
{'OS-EXT-STS:task_state': None})
# I know that HTTP 500 is harsh code but I think this conflict case
# signals either a serious db inconsistency or a bug in nova's
# claim code.
self.assertEqual(500, exception.response.status_code)
# The instance action should have failed with details.
# save_and_reraise_exception gets different results between py2 and py3
# for the traceback but we want to use the more specific
# "claim_resources" for py3. We can remove this when we drop support
# for py2.
error_in_tb = 'claim_resources' if six.PY3 else 'select_destinations'
self._assert_resize_migrate_action_fail(
server, instance_actions.MIGRATE, error_in_tb)
# The migration is aborted so the instance is ACTIVE on the source
# host instead of being in VERIFY_RESIZE state.
@ -4741,15 +4744,16 @@ class ConsumerGenerationConflictTest(
mock_post.return_value = rsp
request = {'migrate': None}
exception = self.assertRaises(client.OpenStackApiException,
self.api.post_server_action,
server['id'], request)
self.api.post_server_action(server['id'], request,
check_response_status=[202])
self._wait_for_server_parameter(self.admin_api, server,
{'OS-EXT-STS:task_state': None})
self._assert_resize_migrate_action_fail(
server, instance_actions.MIGRATE, 'move_allocations')
self.assertEqual(1, mock_post.call_count)
self.assertEqual(409, exception.response.status_code)
self.assertIn('Failed to move allocations', exception.response.text)
migrations = self.api.get_migrations()
self.assertEqual(1, len(migrations))
self.assertEqual('migration', migrations[0]['migration_type'])
@ -6542,12 +6546,13 @@ class ServerMoveWithPortResourceRequestTest(
'nova.objects.Service.get_by_host_and_binary',
side_effect=fake_get_service):
ex = self.assertRaises(
client.OpenStackApiException,
self.api.post_server_action, server['id'], {'migrate': None})
self.api.post_server_action(server['id'], {'migrate': None},
check_response_status=[202])
self._wait_for_server_parameter(self.admin_api, server,
{'OS-EXT-STS:task_state': None})
self.assertEqual(400, ex.response.status_code)
self.assertIn('No valid host was found.', six.text_type(ex))
self._assert_resize_migrate_action_fail(
server, instance_actions.MIGRATE, 'NoValidHost')
# check that the server still allocates from the original host
self._check_allocation(
@ -6607,7 +6612,6 @@ class ServerMoveWithPortResourceRequestTest(
side_effect=fake_get_service):
self.api.post_server_action(server['id'], {'migrate': None})
self._wait_for_state_change(self.api, server, 'VERIFY_RESIZE')
migration_uuid = self.get_migration_uuid_for_instance(server['id'])
@ -6954,13 +6958,11 @@ class ServerMoveWithPortResourceRequestTest(
# enough information to do a proper port binding on the target host.
# The MigrationTask in the conductor checks that the RPC is new enough
# for this request for each possible destination provided by the
# scheduler and skips the old hosts.
ex = self.assertRaises(
client.OpenStackApiException, self.api.post_server_action,
server['id'], {'migrate': None})
self.assertEqual(400, ex.response.status_code)
self.assertIn('No valid host was found.', six.text_type(ex))
# scheduler and skips the old hosts. The actual response will be a 202
# so we have to wait for the failed instance action event.
self.api.post_server_action(server['id'], {'migrate': None})
self._assert_resize_migrate_action_fail(
server, instance_actions.MIGRATE, 'NoValidHost')
# The migration is put into error
self._wait_for_migration_status(server, ['error'])

View File

@ -721,8 +721,6 @@ class ServerActionsControllerTestV21(test.TestCase):
(exception.ImageNotFound(image_id=image_id),
webob.exc.HTTPBadRequest),
(exception.Invalid, webob.exc.HTTPBadRequest),
(exception.NoValidHost(reason='Bad host'),
webob.exc.HTTPBadRequest),
(exception.AutoDiskConfigDisabledByImage(image=image_id),
webob.exc.HTTPBadRequest),
]
@ -795,15 +793,6 @@ class ServerActionsControllerTestV21(test.TestCase):
self.controller._action_resize,
self.req, FAKE_UUID, body=body)
@mock.patch('nova.compute.api.API.resize',
side_effect=exception.NoValidHost(reason=''))
def test_resize_raises_no_valid_host(self, mock_resize):
body = dict(resize=dict(flavorRef="http://localhost/3"))
self.assertRaises(webob.exc.HTTPBadRequest,
self.controller._action_resize,
self.req, FAKE_UUID, body=body)
@mock.patch.object(compute_api.API, 'resize')
def test_resize_instance_raise_auto_disk_config_exc(self, mock_resize):
mock_resize.side_effect = exception.AutoDiskConfigDisabledByImage(

View File

@ -1941,7 +1941,7 @@ class _ComputeAPIUnitTestMixIn(object):
scheduler_hint=scheduler_hint,
flavor=test.MatchType(objects.Flavor),
clean_shutdown=clean_shutdown,
request_spec=fake_spec, do_cast=allow_cross_cell_resize)
request_spec=fake_spec, do_cast=True)
else:
mock_resize.assert_not_called()

View File

@ -0,0 +1,12 @@
---
other:
- |
The ``resize`` and ``migrate`` server action APIs used to synchronously
block until a destination host is selected by the scheduler. Those APIs
now asynchronously return a response to the user before scheduling.
The response code remains 202 and users can monitor the operation via the
``status`` and ``OS-EXT-STS:task_state`` fields on the server resource and
also by using the ``os-instance-actions`` API. The most notable
change is ``NoValidHost`` will not be returned in a 400 error response
from the API if scheduling fails but that information is available via the
instance actions API interface.