Start functional testing for cross-cell resize

This adds the beginning of functional tests for cross-cell resize
which get the instance to VERIFY_RESIZE status and assert various
aspects of the instance in both the source cell and the target cell.

The fake virt driver is updated to support creating a
server with device tags and also to have a host-specific
volume connector dict to aid in debugging test failures.

Confirm/revert resize flows will build on these tests.

Part of blueprint cross-cell-resize

Change-Id: I398aeb73da6bf09e177e5f1fdc668f59f06de8dd
Matt Riedemann 2019-02-11 20:07:15 -05:00
parent 63094c2883
commit 3b60442456
5 changed files with 403 additions and 1 deletion
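For orientation, the core flow the new functional test exercises, condensed from TestMultiCellMigrate._resize_and_validate in the new test file below (this sketch is not part of the diff):

    # Create a server (it lands on host1/cell1 thanks to the HostNameWeigher),
    # resize it, and wait for VERIFY_RESIZE; the server should then be
    # reported from the host in the other cell.
    server = self._create_server(old_flavor)
    self.api.post_server_action(
        server['id'], {'resize': {'flavorRef': new_flavor['id']}})
    server = self._wait_for_state_change(self.api, server, 'VERIFY_RESIZE')
    self.assertEqual('host2', server['OS-EXT-SRV-ATTR:host'])
    # Old-flavor allocations are held by the migration record on the source
    # compute node provider; new-flavor allocations are held by the instance
    # on the target node provider.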


@@ -2310,6 +2310,9 @@ class API(base.Base):
        src_host = migration.source_compute
        # FIXME(mriedem): If migration.cross_cell_move, we need to also
        # cleanup the instance data from the source cell database.
        self._record_action_start(context, instance,
                                  instance_actions.CONFIRM_RESIZE)


@@ -1947,6 +1947,14 @@ class CinderFixture(fixtures.Fixture):
                    yield volume_id
                    break

    def attachment_ids_for_instance(self, instance_uuid):
        attachment_ids = []
        for volume_id, attachments in self.volume_to_attachment.items():
            for attachment in attachments.values():
                if attachment['instance_uuid'] == instance_uuid:
                    attachment_ids.append(attachment['id'])
        return attachment_ids

    def setUp(self):
        super(CinderFixture, self).setUp()
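The new CinderFixture helper above is what the functional tests use to count per-instance volume attachments across cells; a minimal usage sketch, mirroring _count_volume_attachments in the new test file below (not part of the diff):

    # Inside a functional test that uses the CinderFixture:
    attachment_ids = self.cinder.attachment_ids_for_instance(server['id'])
    # While a volume-backed server sits in VERIFY_RESIZE after a cross-cell
    # resize there should be two attachments: one tracked in the source cell
    # and one in the target cell.
    self.assertEqual(2, len(attachment_ids))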


@@ -426,6 +426,7 @@ class ProviderUsageBaseTestCase(test.TestCase, InstanceHelperMixin):
    expected_fake_driver_capability_traits = set([
        six.u(trait) for trait in [
            os_traits.COMPUTE_IMAGE_TYPE_RAW,
            os_traits.COMPUTE_DEVICE_TAGGING,
            os_traits.COMPUTE_NET_ATTACH_INTERFACE,
            os_traits.COMPUTE_NET_ATTACH_INTERFACE_WITH_TAG,
            os_traits.COMPUTE_VOLUME_ATTACH_WITH_TAG,
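COMPUTE_DEVICE_TAGGING joins the expected trait set because the fake driver now reports the supports_device_tagging capability (see the last file below) and the base test class checks for a trait per reported capability. The naming convention, shown with a hypothetical helper that is not nova's actual code, is simply the supports_* suffix upper-cased onto a COMPUTE_ prefix for the capabilities that are exposed as traits:

    # Hypothetical illustration of the naming convention only:
    def capability_to_trait(capability):
        # e.g. 'supports_device_tagging' -> 'COMPUTE_DEVICE_TAGGING'
        return 'COMPUTE_' + capability[len('supports_'):].upper()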


@@ -0,0 +1,389 @@
# Licensed under the Apache License, Version 2.0 (the "License"); you may
# not use this file except in compliance with the License. You may obtain
# a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
# License for the specific language governing permissions and limitations
# under the License.

from nova import context as nova_context
from nova import objects
from nova.scheduler import weights
from nova.tests import fixtures as nova_fixtures
from nova.tests.functional import integrated_helpers
from nova.tests.unit.image import fake as fake_image


class HostNameWeigher(weights.BaseHostWeigher):
    # TestMultiCellMigrate creates host1 in cell1 and host2 in cell2.
    # Something about migrating from host1 to host2 teases out failures
    # which probably has to do with cell1 being the default cell DB in
    # our base test class setup, so prefer host1 to make the tests
    # deterministic.
    _weights = {'host1': 100, 'host2': 50}

    def _weigh_object(self, host_state, weight_properties):
        # Any undefined host gets no weight.
        return self._weights.get(host_state.host, 0)


class TestMultiCellMigrate(integrated_helpers.ProviderUsageBaseTestCase):
    """Tests for cross-cell cold migration (resize)"""
    NUMBER_OF_CELLS = 2
    compute_driver = 'fake.MediumFakeDriver'

    def setUp(self):
        # Use our custom weigher defined above to make sure that we have
        # a predictable scheduling sort order during server create.
        self.flags(weight_classes=[__name__ + '.HostNameWeigher'],
                   group='filter_scheduler')
        super(TestMultiCellMigrate, self).setUp()
        self.cinder = self.useFixture(nova_fixtures.CinderFixture(self))
        self._enable_cross_cell_resize()

        # Adjust the polling interval and timeout for long RPC calls.
        self.flags(rpc_response_timeout=1)
        self.flags(long_rpc_timeout=60)

        # Set up 2 compute services in different cells
        self.host_to_cell_mappings = {
            'host1': 'cell1', 'host2': 'cell2'}
        # TODO(mriedem): Should probably put the hosts in separate AZs so we
        # can assert the instance.availability_zone value is updated when it
        # moves.
        for host in sorted(self.host_to_cell_mappings):
            self._start_compute(
                host, cell_name=self.host_to_cell_mappings[host])

    def _enable_cross_cell_resize(self):
        # Enable cross-cell resize policy since it defaults to not allow
        # anyone to perform that type of operation. For these tests we'll
        # just allow admins to perform cross-cell resize.
        # TODO(mriedem): Uncomment this when the policy rule is added and
        # used in the compute API _allow_cross_cell_resize method. For now
        # we just stub that method to return True.
        # self.policy_fixture.set_rules({
        #     servers_policies.CROSS_CELL_RESIZE:
        #         base_policies.RULE_ADMIN_API},
        #     overwrite=False)
        self.stub_out('nova.compute.api.API._allow_cross_cell_resize',
                      lambda *a, **kw: True)

    def assertFlavorMatchesAllocation(self, flavor, allocation,
                                      volume_backed=False):
        self.assertEqual(flavor['vcpus'], allocation['VCPU'])
        self.assertEqual(flavor['ram'], allocation['MEMORY_MB'])
        # Volume-backed instances won't have DISK_GB allocations.
        if volume_backed:
            self.assertNotIn('DISK_GB', allocation)
        else:
            self.assertEqual(flavor['disk'], allocation['DISK_GB'])

    def assert_instance_fields_match_flavor(self, instance, flavor):
        self.assertEqual(instance.memory_mb, flavor['ram'])
        self.assertEqual(instance.vcpus, flavor['vcpus'])
        self.assertEqual(instance.root_gb, flavor['disk'])
        self.assertEqual(
            instance.ephemeral_gb, flavor['OS-FLV-EXT-DATA:ephemeral'])

    def _count_volume_attachments(self, server_id):
        attachment_ids = self.cinder.attachment_ids_for_instance(server_id)
        return len(attachment_ids)

    def assert_quota_usage(self, expected_num_instances):
        limits = self.api.get_limits()['absolute']
        self.assertEqual(expected_num_instances, limits['totalInstancesUsed'])

    def _create_server(self, flavor, volume_backed=False):
        """Creates a server and waits for it to be ACTIVE

        :param flavor: dict form of the flavor to use
        :param volume_backed: True if the server should be volume-backed
        :returns: server dict response from the GET /servers/{server_id} API
        """
        # Provide a VIF tag for the pre-existing port. Since VIF tags are
        # stored in the virtual_interfaces table in the cell DB, we want to
        # make sure those survive the resize to another cell.
        networks = [{
            'port': self.neutron.port_1['id'],
            'tag': 'private'
        }]
        image_uuid = fake_image.get_valid_image_id()
        server = self._build_minimal_create_server_request(
            self.api, 'test_cross_cell_resize',
            image_uuid=image_uuid,
            flavor_id=flavor['id'],
            networks=networks)
        # Put a tag on the server to make sure that survives the resize.
        server['tags'] = ['test']
        if volume_backed:
            bdms = [{
                'boot_index': 0,
                'uuid': nova_fixtures.CinderFixture.IMAGE_BACKED_VOL,
                'source_type': 'volume',
                'destination_type': 'volume',
                'tag': 'root'
            }]
            server['block_device_mapping_v2'] = bdms
            # We don't need the imageRef for volume-backed servers.
            server.pop('imageRef', None)
        server = self.api.post_server({'server': server})
        server = self._wait_for_state_change(self.admin_api, server, 'ACTIVE')
        # For volume-backed make sure there is one attachment to start.
        if volume_backed:
            self.assertEqual(1, self._count_volume_attachments(server['id']),
                             self.cinder.volume_to_attachment)
        return server

    def _resize_and_validate(self, volume_backed=False, stopped=False,
                             target_host=None):
        """Creates and resizes the server to another cell. Validates various
        aspects of the server and its related records (allocations, migrations,
        actions, VIF tags, etc).

        :param volume_backed: True if the server should be volume-backed, False
            if image-backed.
        :param stopped: True if the server should be stopped prior to resize,
            False if the server should be ACTIVE
        :param target_host: If not None, triggers a cold migration to the
            specified host.
        :returns: tuple of:
            - server response object
            - source compute node resource provider uuid
            - target compute node resource provider uuid
            - old flavor
            - new flavor
        """
        # Create the server.
        flavors = self.api.get_flavors()
        old_flavor = flavors[0]
        server = self._create_server(old_flavor, volume_backed=volume_backed)
        original_host = server['OS-EXT-SRV-ATTR:host']

        if stopped:
            # Stop the server before resizing it.
            self.api.post_server_action(server['id'], {'os-stop': None})
            self._wait_for_state_change(self.api, server, 'SHUTOFF')

        # Before resizing make sure quota usage is only 1 for total instances.
        self.assert_quota_usage(expected_num_instances=1)

        # Resize it which should migrate the server to the host in the
        # other cell.
        new_flavor = flavors[1]
        body = {'resize': {'flavorRef': new_flavor['id']}}
        self.api.post_server_action(server['id'], body)

        # Wait for the server to be resized and then verify the host has
        # changed to be the host in the other cell.
        server = self._wait_for_state_change(self.api, server, 'VERIFY_RESIZE')
        expected_host = 'host1' if original_host == 'host2' else 'host2'
        self.assertEqual(expected_host, server['OS-EXT-SRV-ATTR:host'])

        # Assert that the instance is only listed one time from the API (to
        # make sure it's not listed out of both cells).
        # Note that we only get one because the DB API excludes hidden
        # instances by default (see instance_get_all_by_filters_sort).
        servers = self.api.get_servers()
        self.assertEqual(1, len(servers),
                         'Unexpected number of servers: %s' % servers)
        self.assertEqual(expected_host, servers[0]['OS-EXT-SRV-ATTR:host'])

        # And that there is only one migration record.
        migrations = self.api.api_get(
            '/os-migrations?instance_uuid=%s' % server['id']
        ).body['migrations']
        self.assertEqual(1, len(migrations),
                         'Unexpected number of migrations records: %s' %
                         migrations)
        migration = migrations[0]
        self.assertEqual('finished', migration['status'])

        # There should be at least two actions, one for create and one for the
        # resize. There will be a third action if the server was stopped.
        actions = self.api.api_get(
            '/servers/%s/os-instance-actions' % server['id']
        ).body['instanceActions']
        expected_num_of_actions = 3 if stopped else 2
        self.assertEqual(expected_num_of_actions, len(actions), actions)
        # Each action should have events (make sure these were copied from
        # the source cell to the target cell).
        for action in actions:
            detail = self.api.api_get(
                '/servers/%s/os-instance-actions/%s' % (
                    server['id'], action['request_id'])).body['instanceAction']
            self.assertNotEqual(0, len(detail['events']), detail)

        # The tag should still be present on the server.
        self.assertEqual(1, len(server['tags']),
                         'Server tags not found in target cell.')
        self.assertEqual('test', server['tags'][0])

        # Confirm the source node has allocations for the old flavor and the
        # target node has allocations for the new flavor.
        source_rp_uuid = self._get_provider_uuid_by_host(original_host)
        # The source node allocations should be on the migration record.
        source_allocations = self._get_allocations_by_provider_uuid(
            source_rp_uuid)[migration['uuid']]['resources']
        self.assertFlavorMatchesAllocation(
            old_flavor, source_allocations, volume_backed=volume_backed)

        target_rp_uuid = self._get_provider_uuid_by_host(expected_host)
        # The target node allocations should be on the instance record.
        target_allocations = self._get_allocations_by_provider_uuid(
            target_rp_uuid)[server['id']]['resources']
        self.assertFlavorMatchesAllocation(
            new_flavor, target_allocations, volume_backed=volume_backed)

        # The instance, in the target cell DB, should have the old and new
        # flavor stored with it with the values we expect at this point.
        target_cell = self.cell_mappings[
            self.host_to_cell_mappings[expected_host]]
        admin_context = nova_context.get_admin_context()
        with nova_context.target_cell(admin_context, target_cell) as cctxt:
            inst = objects.Instance.get_by_uuid(
                cctxt, server['id'], expected_attrs=['flavor'])
            self.assertIsNotNone(
                inst.old_flavor,
                'instance.old_flavor not saved in target cell')
            self.assertIsNotNone(
                inst.new_flavor,
                'instance.new_flavor not saved in target cell')
            self.assertEqual(inst.flavor.flavorid, inst.new_flavor.flavorid)
            self.assertNotEqual(inst.flavor.flavorid, inst.old_flavor.flavorid)
            self.assertEqual(old_flavor['id'], inst.old_flavor.flavorid)
            self.assertEqual(new_flavor['id'], inst.new_flavor.flavorid)
            # Assert the ComputeManager._set_instance_info fields
            # are correct after the resize.
            self.assert_instance_fields_match_flavor(inst, new_flavor)

        # Assert the VIF tag was carried through to the target cell DB.
        interface_attachments = self.api.get_port_interfaces(server['id'])
        self.assertEqual(1, len(interface_attachments))
        self.assertEqual('private', interface_attachments[0]['tag'])

        if volume_backed:
            # Assert the BDM tag was carried through to the target cell DB.
            volume_attachments = self.api.get_server_volumes(server['id'])
            self.assertEqual(1, len(volume_attachments))
            self.assertEqual('root', volume_attachments[0]['tag'])

        # Make sure the guest is no longer tracked on the source node.
        source_guest_uuids = (
            self.computes[original_host].manager.driver.list_instance_uuids())
        self.assertNotIn(server['id'], source_guest_uuids)
        # And the guest is on the target node hypervisor.
        target_guest_uuids = (
            self.computes[expected_host].manager.driver.list_instance_uuids())
        self.assertIn(server['id'], target_guest_uuids)

        # The source hypervisor continues to report usage in the hypervisors
        # API because even though the guest was destroyed there, the instance
        # resources are still claimed on that node in case the user reverts.
        self.assert_hypervisor_usage(source_rp_uuid, old_flavor, volume_backed)
        # The new flavor should show up with resource usage on the target host.
        self.assert_hypervisor_usage(target_rp_uuid, new_flavor, volume_backed)

        # While we have a copy of the instance in each cell database make sure
        # that quota usage is only reporting 1 (because one is hidden).
        self.assert_quota_usage(expected_num_instances=1)

        # For a volume-backed server, at this point there should be two volume
        # attachments for the instance: one tracked in the source cell and
        # one in the target cell.
        if volume_backed:
            self.assertEqual(2, self._count_volume_attachments(server['id']),
                             self.cinder.volume_to_attachment)

        # Assert the expected power state.
        expected_power_state = 4 if stopped else 1
        self.assertEqual(
            expected_power_state, server['OS-EXT-STS:power_state'],
            "Unexpected power state after resize.")

        return server, source_rp_uuid, target_rp_uuid, old_flavor, new_flavor

    def test_resize_confirm_image_backed(self):
        """Creates an image-backed server in one cell and resizes it to the
        host in the other cell. The resize is confirmed.
        """
        self._resize_and_validate()
        # TODO(mriedem): Confirm the resize and make assertions.

    def test_resize_revert_volume_backed(self):
        """Tests a volume-backed resize to another cell where the resize
        is reverted back to the original source cell.
        """
        self._resize_and_validate(volume_backed=True)
        # TODO(mriedem): Revert the resize and make assertions.

    def test_delete_while_in_verify_resize_status(self):
        """Tests that when deleting a server in VERIFY_RESIZE status, the
        data is cleaned from both the source and target cell.
        """
        server = self._resize_and_validate()[0]
        self.api.delete_server(server['id'])
        self._wait_until_deleted(server)
        # Now list servers to make sure it doesn't show up from the source cell
        servers = self.api.get_servers()
        self.assertEqual(0, len(servers), servers)
        # FIXME(mriedem): Need to cleanup from source cell in API method
        # _confirm_resize_on_deleting(). The above check passes because the
        # instance is still hidden in the source cell so the API filters it
        # out.
        target_host = server['OS-EXT-SRV-ATTR:host']
        source_host = 'host1' if target_host == 'host2' else 'host2'
        source_cell = self.cell_mappings[
            self.host_to_cell_mappings[source_host]]
        ctxt = nova_context.get_admin_context()
        with nova_context.target_cell(ctxt, source_cell) as cctxt:
            # Once the API is fixed this should raise InstanceNotFound.
            instance = objects.Instance.get_by_uuid(cctxt, server['id'])
            self.assertTrue(instance.hidden)

    # TODO(mriedem): Test cold migration with a specified target host in
    # another cell.

    # TODO(mriedem): Test cross-cell resize where the source cell has two
    # hosts so the CrossCellWeigher picks the other host in the source cell
    # and we do a traditional resize. Add a variant on this where the flavor
    # being resized to is only available, via aggregate, on the host in the
    # other cell so the CrossCellWeigher is overruled by the filters.

    # TODO(mriedem): Test a bunch of rollback scenarios.

    # TODO(mriedem): Test re-scheduling when the first host fails the
    # resize_claim and a subsequent alternative host works, and also the
    # case that all hosts fail the resize_claim.

    # TODO(mriedem): Test cross-cell anti-affinity group assumptions from
    # scheduler utils setup_instance_group where it assumes moves are within
    # the same cell, so:
    # 0. create 2 hosts in cell1 and 1 host in cell2
    # 1. create two servers in an anti-affinity group in cell1
    # 2. migrate one server to cell2
    # 3. migrate the other server to cell2 - this should fail during scheduling
    #    because there is already a server from the anti-affinity group on the
    #    host in cell2 but setup_instance_group code may not catch it.

    # TODO(mriedem): Perform a resize with at-capacity computes, meaning that
    # when we revert we can only fit the instance with the old flavor back
    # onto the source host in the source cell.

    def test_resize_confirm_from_stopped(self):
        """Tests resizing and confirming a server that was initially stopped
        so it should remain stopped through the resize.
        """
        self._resize_and_validate(volume_backed=True, stopped=True)
        # TODO(mriedem): Confirm the resize and assert the guest remains off

    # TODO(mriedem): Test a failure scenario where the server is recovered via
    # rebuild in the source cell.
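Several of the TODOs above note that confirm/revert assertions are follow-ups. A minimal sketch of what the confirm side could look like with the helpers already used in this file (an assumption about the follow-up change, not part of this diff):

    # e.g. appended to test_resize_confirm_image_backed:
    server = self._resize_and_validate()[0]
    self.api.post_server_action(server['id'], {'confirmResize': None})
    server = self._wait_for_state_change(self.api, server, 'ACTIVE')
    # ...then assert the source cell no longer holds allocations or instance
    # data for the server.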


@@ -106,6 +106,7 @@ class FakeDriver(driver.ComputeDriver):
        "supports_evacuate": True,
        "supports_migrate_to_same_host": True,
        "supports_attach_interface": True,
        "supports_device_tagging": True,
        "supports_tagged_attach_interface": True,
        "supports_tagged_attach_volume": True,
        "supports_extend_volume": True,
@@ -642,7 +643,7 @@ class FakeDriver(driver.ComputeDriver):
    def get_volume_connector(self, instance):
        return {'ip': CONF.my_block_storage_ip,
                'initiator': 'fake',
-               'host': 'fakehost'}
+               'host': self._host}

    def get_available_nodes(self, refresh=False):
        return self._nodes
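The host-specific connector is what the commit message calls an aid for debugging test failures: with two fake computes running as host1 and host2, the connector each one hands to Cinder now identifies that compute instead of every host reporting 'fakehost'. Roughly, with illustrative values only:

    # Connector returned by the fake driver on the compute started as 'host1':
    connector = {'ip': CONF.my_block_storage_ip,
                 'initiator': 'fake',
                 'host': 'host1'}
    # When a cross-cell assertion about volume attachments fails, the host in
    # the connector shows which compute created the attachment.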