Support unshelve with qos ports

This patch adds support for unshelving an offloaded server with qos ports.
To do that this patch (a sketch of the overall flow follows the list):
* collects the port resource requests from neutron before the scheduler
  is called to select the target of the unshelve
* calculates the request group - provider mapping after the scheduler
  has selected the target host
* updates the InstancePCIRequest to drive the pci_claim to allocate VFs
  from the same PF that the scheduler allocated bandwidth from
* updates the binding profile of the qos ports so that the allocation
  key of the binding profile points to the RPs the ports are allocated
  from
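
For illustration, here is a minimal sketch of that conductor-side flow.
The collaborator objects and data shapes are simplified stand-ins: in
nova the data travels in RequestSpec and Selection objects, and step 3
is done by scheduler_utils.fill_provider_mapping rather than the
hand-rolled loop below; `selection.allocations` is a hypothetical
mapping of requester id to provider uuids:

    def unshelve_with_qos_ports(context, instance, request_spec,
                                network_api, scheduler, compute_rpcapi):
        # 1) collect the port resource requests from neutron *before*
        #    the scheduler runs, so placement can take them into account
        request_spec.requested_resources = (
            network_api.get_requested_resource_for_instance(
                context, instance.uuid))

        # 2) let the scheduler select a target host for those resources
        selection = scheduler.select_destinations(context, request_spec)

        # 3) record which resource provider(s) each request group was
        #    allocated from on the selected host
        for group in request_spec.requested_resources:
            group.provider_uuids = selection.allocations[group.requester_id]

        # 4) finish the unshelve on the selected host; the compute
        #    manager then updates the InstancePCIRequests and the qos
        #    port binding profiles from this mapping
        compute_rpcapi.unshelve_instance(
            context, instance, selection.service_host, request_spec)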

As this was the last move operation to be supported, the compute service
version is bumped to indicate such support. This will be used in later
patches to implement a global service level check in the API; a sketch of
such a check follows this paragraph.
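
The later API check could look roughly like the following sketch. Only
get_minimum_version_all_cells is a real nova helper; the constant, the
exception and the function name are illustrative placeholders:

    from nova.objects import service as service_obj

    # Placeholder constant: 49 is the service version introduced by
    # this patch (see the nova/objects/service.py hunk below).
    MIN_VERSION_MOVE_WITH_QOS_PORTS = 49

    class MoveWithQosPortsNotSupported(Exception):
        """Placeholder for a proper nova exception that the API layer
        would translate to an HTTP error."""

    def ensure_move_with_qos_ports_supported(context):
        # A global, service level check: the move operation is only
        # safe once *every* nova-compute in every cell is new enough.
        minimum = service_obj.get_minimum_version_all_cells(
            context, ['nova-compute'])
        if minimum < MIN_VERSION_MOVE_WITH_QOS_PORTS:
            raise MoveWithQosPortsNotSupported()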

Note that unshelve does not have a re-schedule loop, and all the necessary
RPC changes were already committed in Queens.

Two error cases need special care by rolling back allocations before
putting the instance back to SHELVED_OFFLOADED state (a rollback sketch
follows the list):

* if the InstancePCIRequest cannot be updated according to the new target
  host of the unshelve
* if updating the port binding fails in neutron during the unshelve
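
A minimal sketch of the shape of that rollback, with placeholder
helpers (remove_allocations and offload_instance stand in for the real
cleanup paths, they are not nova APIs):

    def unshelve_on_selected_host(compute, context, instance,
                                  provider_mappings):
        try:
            # either of these two steps can fail on the selected host
            compute.update_pci_requests(
                context, instance, provider_mappings)
            compute.setup_instance_network_on_host(
                context, instance, provider_mappings=provider_mappings)
        except Exception:
            # release the allocations the scheduler made for this
            # unshelve, otherwise they would leak, then return the
            # instance to SHELVED_OFFLOADED
            compute.remove_allocations(context, instance)
            compute.offload_instance(context, instance)
            raise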

Change-Id: I678722b3cf295c89110967d5ad8c0c964df4cb42
blueprint: support-move-ops-with-qos-ports-ussuri
Author: Balazs Gibizer  2020-01-29 10:45:00 +01:00
parent ca4226cb87
commit 94c7e7ad43
9 changed files with 433 additions and 17 deletions


@@ -6452,8 +6452,9 @@ class ComputeManager(manager.Manager):
         @utils.synchronized(instance.uuid)
         def do_unshelve_instance():
-            self._unshelve_instance(context, instance, image,
-                                    filter_properties, node)
+            self._unshelve_instance(
+                context, instance, image, filter_properties, node,
+                request_spec)
         do_unshelve_instance()

     def _unshelve_instance_key_scrub(self, instance):
@@ -6470,7 +6471,7 @@ class ComputeManager(manager.Manager):
         instance.update(keys)

     def _unshelve_instance(self, context, instance, image, filter_properties,
-                           node):
+                           node, request_spec):
         LOG.info('Unshelving', instance=instance)
         bdms = objects.BlockDeviceMappingList.get_by_instance_uuid(
             context, instance.uuid)
@@ -6503,9 +6504,18 @@
                 utils.get_image_from_system_metadata(
                     instance.system_metadata))

+        provider_mappings = self._get_request_group_mapping(request_spec)
+
         try:
-            self.network_api.setup_instance_network_on_host(context, instance,
-                                                            self.host)
+            if provider_mappings:
+                update = (
+                    compute_utils.
+                    update_pci_request_spec_with_allocated_interface_name)
+                update(context, self.reportclient, instance, provider_mappings)
+
+            self.network_api.setup_instance_network_on_host(
+                context, instance, self.host,
+                provider_mappings=provider_mappings)
             network_info = self.network_api.get_instance_nw_info(
                 context, instance)

         with self.rt.instance_claim(context, instance, node, allocations,


@@ -935,9 +935,14 @@ class ComputeTaskManager(base.Base):
                     filter_properties = request_spec.\
                         to_legacy_filter_properties_dict()
-                    # TODO(gibi): We need to make sure that the
-                    # requested_resources field is re calculated based on
-                    # neutron ports.
+
+                    port_res_req = (
+                        self.network_api.get_requested_resource_for_instance(
+                            context, instance.uuid))
+                    # NOTE(gibi): When cyborg or other module wants to handle
+                    # similar non-nova resources then here we have to collect
+                    # all the external resource requests in a single list and
+                    # add them to the RequestSpec.
+                    request_spec.requested_resources = port_res_req

                     # NOTE(cfriesen): Ensure that we restrict the scheduler to
                     # the cell specified by the instance mapping.
@@ -959,6 +964,10 @@
                         instance.availability_zone = (
                             availability_zones.get_host_availability_zone(
                                 context, host))
+
+                        scheduler_utils.fill_provider_mapping(
+                            request_spec, selection)
+
                         self.compute_rpcapi.unshelve_instance(
                             context, instance, host, request_spec, image=image,
                             filter_properties=filter_properties, node=node)


@@ -3433,9 +3433,12 @@ class API(base.Base):
                             pci_slot)

             # NOTE(gibi): during live migration the conductor already sets the
-            # allocation key in the port binding
+            # allocation key in the port binding. However during resize, cold
+            # migrate, evacuate and unshelve we have to set the binding here.
+            # Also note that during unshelve no migration object is created.
             if (p.get('resource_request') and
-                    migration['migration_type'] != constants.LIVE_MIGRATION):
+                    (migration is None or
+                     migration['migration_type'] != constants.LIVE_MIGRATION)):
                 if not provider_mappings:
                     # TODO(gibi): Remove this check when compute RPC API is
                     # bumped to 6.0


@@ -31,7 +31,7 @@ LOG = logging.getLogger(__name__)

 # NOTE(danms): This is the global service version counter
-SERVICE_VERSION = 48
+SERVICE_VERSION = 49
# NOTE(danms): This is our SERVICE_VERSION history. The idea is that any
@@ -178,6 +178,8 @@ SERVICE_VERSION_HISTORY = (
     {'compute_rpc': '5.10'},
     # Version 48: Drivers report COMPUTE_SAME_HOST_COLD_MIGRATE trait.
     {'compute_rpc': '5.10'},
+    # Version 49: Compute now support server move operations with qos ports
+    {'compute_rpc': '5.10'},
 )


@@ -22,6 +22,7 @@ import zlib

 from keystoneauth1 import adapter
 import mock
 from neutronclient.common import exceptions as neutron_exception
+import os_resource_classes as orc
 from oslo_config import cfg
 from oslo_log import log as logging
@@ -7408,6 +7409,191 @@ class ServerMoveWithPortResourceRequestTest(
self._delete_server_and_check_allocations(
server, qos_normal_port, qos_sriov_port)
def _turn_off_api_check(self):
# The API actively rejecting the move operations with resource
# request so we have to turn off that check.
# TODO(gibi): Remove this when the move operations are supported and
# the API check is removed.
patcher = mock.patch(
'nova.api.openstack.common.'
'supports_port_resource_request_during_move',
return_value=True)
self.addCleanup(patcher.stop)
patcher.start()
def test_unshelve_offloaded_server_with_qos_port(self):
# TODO(gibi): remove this when live migration is fully supported and
# therefore the check is removed from the api
self._turn_off_api_check()
non_qos_normal_port = self.neutron.port_1
qos_normal_port = self.neutron.port_with_resource_request
qos_sriov_port = self.neutron.port_with_sriov_resource_request
server = self._create_server_with_ports_and_check_allocation(
non_qos_normal_port, qos_normal_port, qos_sriov_port)
# with default config shelve means immediate offload as well
req = {
'shelve': {}
}
self.api.post_server_action(server['id'], req)
self._wait_for_server_parameter(
server, {'status': 'SHELVED_OFFLOADED'})
allocations = self.placement_api.get(
'/allocations/%s' % server['id']).body['allocations']
self.assertEqual(0, len(allocations))
self.api.post_server_action(server['id'], {'unshelve': None})
self._wait_for_server_parameter(
server,
{'OS-EXT-SRV-ATTR:host': 'host1',
'status': 'ACTIVE'})
self._check_allocation(
server, self.compute1_rp_uuid, non_qos_normal_port,
qos_normal_port, qos_sriov_port, self.flavor_with_group_policy)
self._assert_pci_request_pf_device_name(server, 'host1-ens2')
# shelve offload again and then make host1 unusable so the subsequent
# unshelve needs to select host2
req = {
'shelve': {}
}
self.api.post_server_action(server['id'], req)
self._wait_for_server_parameter(
server, {'status': 'SHELVED_OFFLOADED'})
allocations = self.placement_api.get(
'/allocations/%s' % server['id']).body['allocations']
self.assertEqual(0, len(allocations))
self.admin_api.put_service(
self.compute1_service_id, {"status": "disabled"})
self.api.post_server_action(server['id'], {'unshelve': None})
self._wait_for_server_parameter(
server,
{'OS-EXT-SRV-ATTR:host': 'host2',
'status': 'ACTIVE'})
self._check_allocation(
server, self.compute2_rp_uuid, non_qos_normal_port,
qos_normal_port, qos_sriov_port, self.flavor_with_group_policy)
self._assert_pci_request_pf_device_name(server, 'host2-ens2')
self._delete_server_and_check_allocations(
server, qos_normal_port, qos_sriov_port)
def test_unshelve_offloaded_server_with_qos_port_pci_update_fails(self):
# TODO(gibi): remove this when live migration is fully supported and
# therefore the check is removed from the api
self._turn_off_api_check()
# Update the name of the network device RP of PF2 on host2 to something
# unexpected. This will cause
# update_pci_request_spec_with_allocated_interface_name() to raise
# when the instance is unshelved to the host2.
rsp = self.placement_api.put(
'/resource_providers/%s'
% self.sriov_dev_rp_per_host[self.compute2_rp_uuid][self.PF2],
{"name": "invalid-device-rp-name"})
self.assertEqual(200, rsp.status)
non_qos_normal_port = self.neutron.port_1
qos_normal_port = self.neutron.port_with_resource_request
qos_sriov_port = self.neutron.port_with_sriov_resource_request
server = self._create_server_with_ports_and_check_allocation(
non_qos_normal_port, qos_normal_port, qos_sriov_port)
# with default config shelve means immediate offload as well
req = {
'shelve': {}
}
self.api.post_server_action(server['id'], req)
self._wait_for_server_parameter(
server, {'status': 'SHELVED_OFFLOADED'})
allocations = self.placement_api.get(
'/allocations/%s' % server['id']).body['allocations']
self.assertEqual(0, len(allocations))
# make host1 unusable so the subsequent unshelve needs to select host2
self.admin_api.put_service(
self.compute1_service_id, {"status": "disabled"})
self.api.post_server_action(server['id'], {'unshelve': None})
# Unshelve fails on host2 due to
# update_pci_request_spec_with_allocated_interface_name fails so the
# instance goes back to shelve offloaded state
fake_notifier.wait_for_versioned_notifications(
'instance.unshelve.start')
error_notification = fake_notifier.wait_for_versioned_notifications(
'compute.exception')[0]
self.assertEqual(
'UnexpectedResourceProviderNameForPCIRequest',
error_notification['payload']['nova_object.data']['exception'])
server = self._wait_for_server_parameter(
server,
{'OS-EXT-STS:task_state': None,
'status': 'SHELVED_OFFLOADED'})
allocations = self.placement_api.get(
'/allocations/%s' % server['id']).body['allocations']
self.assertEqual(0, len(allocations))
self._delete_server_and_check_allocations(
server, qos_normal_port, qos_sriov_port)
def test_unshelve_offloaded_server_with_qos_port_fails_due_to_neutron(
self):
# TODO(gibi): remove this when live migration is fully supported and
# therefore the check is removed from the api
self._turn_off_api_check()
non_qos_normal_port = self.neutron.port_1
qos_normal_port = self.neutron.port_with_resource_request
qos_sriov_port = self.neutron.port_with_sriov_resource_request
server = self._create_server_with_ports_and_check_allocation(
non_qos_normal_port, qos_normal_port, qos_sriov_port)
# with default config shelve means immediate offload as well
req = {
'shelve': {}
}
self.api.post_server_action(server['id'], req)
self._wait_for_server_parameter(
server, {'status': 'SHELVED_OFFLOADED'})
allocations = self.placement_api.get(
'/allocations/%s' % server['id']).body['allocations']
self.assertEqual(0, len(allocations))
# Simulate that port update fails during unshelve due to neutron is
# unavailable
with mock.patch(
'nova.tests.fixtures.NeutronFixture.'
'update_port') as mock_update_port:
mock_update_port.side_effect = neutron_exception.ConnectionFailed(
reason='test')
req = {'unshelve': None}
self.api.post_server_action(server['id'], req)
fake_notifier.wait_for_versioned_notifications(
'instance.unshelve.start')
self._wait_for_server_parameter(
server,
{'status': 'SHELVED_OFFLOADED',
'OS-EXT-STS:task_state': None})
# As the instance went back to offloaded state we expect no allocation
allocations = self.placement_api.get(
'/allocations/%s' % server['id']).body['allocations']
self.assertEqual(0, len(allocations))
self._delete_server_and_check_allocations(
server, qos_normal_port, qos_sriov_port)
class LiveMigrateAbortWithPortResourceRequestTest(
PortResourceRequestBasedSchedulingTestBase):


@@ -42,6 +42,7 @@ from nova.objects import base
 from nova.objects import block_device as block_device_obj
 from nova.objects import fields
 from nova import rpc
+from nova.scheduler.client import report
 from nova import test
 from nova.tests.unit import fake_block_device
 from nova.tests.unit import fake_crypto
@@ -1548,3 +1549,99 @@ class ComputeUtilsImageFunctionsTestCase(test.TestCase):
self.assertNotIn(p, properties)
for p in CONF.non_inheritable_image_properties:
self.assertNotIn(p, properties)
class PciRequestUpdateTestCase(test.NoDBTestCase):
def setUp(self):
super().setUp()
self.context = context.RequestContext('fake', 'fake')
def test_no_pci_request(self):
instance = objects.Instance(
pci_requests=objects.InstancePCIRequests(requests=[]))
provider_mapping = {}
compute_utils.update_pci_request_spec_with_allocated_interface_name(
self.context, mock.sentinel.report_client, instance,
provider_mapping)
def test_pci_request_from_flavor(self):
instance = objects.Instance(
pci_requests=objects.InstancePCIRequests(requests=[
objects.InstancePCIRequest(requester_id=None)
]))
provider_mapping = {}
compute_utils.update_pci_request_spec_with_allocated_interface_name(
self.context, mock.sentinel.report_client, instance,
provider_mapping)
def test_pci_request_has_no_mapping(self):
instance = objects.Instance(
pci_requests=objects.InstancePCIRequests(requests=[
objects.InstancePCIRequest(requester_id=uuids.port_1)
]))
provider_mapping = {}
compute_utils.update_pci_request_spec_with_allocated_interface_name(
self.context, mock.sentinel.report_client, instance,
provider_mapping)
def test_pci_request_ambiguous_mapping(self):
instance = objects.Instance(
pci_requests=objects.InstancePCIRequests(requests=[
objects.InstancePCIRequest(requester_id=uuids.port_1)
]))
provider_mapping = {uuids.port_1: [uuids.rp1, uuids.rp2]}
self.assertRaises(
exception.AmbiguousResourceProviderForPCIRequest,
(compute_utils.
update_pci_request_spec_with_allocated_interface_name),
self.context, mock.sentinel.report_client, instance,
provider_mapping)
def test_unexpected_provider_name(self):
report_client = mock.Mock(spec=report.SchedulerReportClient)
report_client.get_resource_provider_name.return_value = 'unexpected'
instance = objects.Instance(
pci_requests=objects.InstancePCIRequests(requests=[
objects.InstancePCIRequest(
requester_id=uuids.port_1,
spec=[{}])
]))
provider_mapping = {uuids.port_1: [uuids.rp1]}
self.assertRaises(
exception.UnexpectedResourceProviderNameForPCIRequest,
(compute_utils.
update_pci_request_spec_with_allocated_interface_name),
self.context, report_client, instance,
provider_mapping)
report_client.get_resource_provider_name.assert_called_once_with(
self.context, uuids.rp1)
self.assertNotIn(
'parent_ifname', instance.pci_requests.requests[0].spec[0])
def test_pci_request_updated(self):
report_client = mock.Mock(spec=report.SchedulerReportClient)
report_client.get_resource_provider_name.return_value = (
'host:agent:enp0s31f6')
instance = objects.Instance(
pci_requests=objects.InstancePCIRequests(requests=[
objects.InstancePCIRequest(
requester_id=uuids.port_1,
spec=[{}],
)
]))
provider_mapping = {uuids.port_1: [uuids.rp1]}
compute_utils.update_pci_request_spec_with_allocated_interface_name(
self.context, report_client, instance, provider_mapping)
report_client.get_resource_provider_name.assert_called_once_with(
self.context, uuids.rp1)
self.assertEqual(
'enp0s31f6',
instance.pci_requests.requests[0].spec[0]['parent_ifname'])


@@ -265,6 +265,9 @@ class ShelveComputeManagerTestCase(test_compute.BaseTestCase):
         return instance

+    @mock.patch('nova.compute.utils.'
+                'update_pci_request_spec_with_allocated_interface_name',
+                new=mock.NonCallableMock())
     @mock.patch('nova.objects.BlockDeviceMappingList.get_by_instance_uuid')
     @mock.patch('nova.compute.utils.notify_about_instance_action')
     @mock.patch.object(nova.compute.manager.ComputeManager,
@@ -358,8 +361,8 @@ class ShelveComputeManagerTestCase(test_compute.BaseTestCase):
             mock_notify_instance_usage_call_list)
         mock_prep_block_device.assert_called_once_with(self.context,
                                                        instance, mock.ANY)
-        mock_setup_network.assert_called_once_with(self.context, instance,
-                                                   self.compute.host)
+        mock_setup_network.assert_called_once_with(
+            self.context, instance, self.compute.host, provider_mappings=None)
         mock_spawn.assert_called_once_with(self.context, instance,
             test.MatchType(objects.ImageMeta), injected_files=[],
             admin_password=None, allocations={}, network_info=[],
@@ -458,8 +461,8 @@ class ShelveComputeManagerTestCase(test_compute.BaseTestCase):
             mock_notify_instance_usage_call_list)
         mock_prep_block_device.assert_called_once_with(self.context, instance,
                                                        mock.ANY)
-        mock_setup_network.assert_called_once_with(self.context, instance,
-                                                   self.compute.host)
+        mock_setup_network.assert_called_once_with(
+            self.context, instance, self.compute.host, provider_mappings=None)
         mock_instance_claim.assert_called_once_with(self.context, instance,
                                                     test_compute.NODENAME,
                                                     {}, limits)
@@ -545,8 +548,8 @@ class ShelveComputeManagerTestCase(test_compute.BaseTestCase):
             self.context, instance, 'unshelve.start')
         mock_prep_block_device.assert_called_once_with(
             self.context, instance, mock_bdms)
-        mock_setup_network.assert_called_once_with(self.context, instance,
-                                                   self.compute.host)
+        mock_setup_network.assert_called_once_with(
+            self.context, instance, self.compute.host, provider_mappings=None)
         mock_instance_claim.assert_called_once_with(self.context, instance,
                                                     test_compute.NODENAME,
                                                     {}, limits)
@@ -557,6 +560,54 @@
mock_terminate_volume_connections.assert_called_once_with(
self.context, instance, mock_bdms)
@mock.patch('nova.network.neutron.API.setup_instance_network_on_host')
@mock.patch('nova.compute.utils.'
'update_pci_request_spec_with_allocated_interface_name')
def test_unshelve_with_resource_request(
self, mock_update_pci, mock_setup_network):
requested_res = [objects.RequestGroup(
requester_id=uuids.port_1,
provider_uuids=[uuids.rp1])]
request_spec = objects.RequestSpec(requested_resources=requested_res)
instance = self._create_fake_instance_obj()
self.compute.unshelve_instance(
self.context, instance, image=None,
filter_properties={}, node='fake-node', request_spec=request_spec)
mock_update_pci.assert_called_once_with(
self.context, self.compute.reportclient, instance,
{uuids.port_1: [uuids.rp1]})
mock_setup_network.assert_called_once_with(
self.context, instance, self.compute.host,
provider_mappings={uuids.port_1: [uuids.rp1]})
@mock.patch('nova.network.neutron.API.setup_instance_network_on_host',
new=mock.NonCallableMock())
@mock.patch('nova.compute.utils.'
'update_pci_request_spec_with_allocated_interface_name')
def test_unshelve_with_resource_request_update_raises(
self, mock_update_pci):
requested_res = [objects.RequestGroup(
requester_id=uuids.port_1,
provider_uuids=[uuids.rp1])]
request_spec = objects.RequestSpec(requested_resources=requested_res)
instance = self._create_fake_instance_obj()
mock_update_pci.side_effect = (
exception.UnexpectedResourceProviderNameForPCIRequest(
provider=uuids.rp1,
requester=uuids.port1,
provider_name='unexpected'))
self.assertRaises(
exception.UnexpectedResourceProviderNameForPCIRequest,
self.compute.unshelve_instance, self.context, instance, image=None,
filter_properties={}, node='fake-node', request_spec=request_spec)
mock_update_pci.assert_called_once_with(
self.context, self.compute.reportclient, instance,
{uuids.port_1: [uuids.rp1]})
@mock.patch.object(objects.InstanceList, 'get_by_filters')
def test_shelved_poll_none_offloaded(self, mock_get_by_filters):
# Test instances are not offloaded when shelved_offload_time is -1


@@ -1505,6 +1505,39 @@ class _BaseTaskTestCase(object):
self.context, instance, 'fake_host', fake_spec, image=None,
filter_properties={'limits': {}}, node='fake_node')
@mock.patch('nova.scheduler.utils.fill_provider_mapping')
@mock.patch('nova.network.neutron.API.get_requested_resource_for_instance')
@mock.patch.object(conductor_manager.ComputeTaskManager,
'_schedule_instances', )
def test_unshelve_instance_resource_request(
self, mock_schedule, mock_get_res_req, mock_fill_provider_mapping):
instance = self._create_fake_instance_obj()
instance.vm_state = vm_states.SHELVED_OFFLOADED
instance.save()
request_spec = objects.RequestSpec()
selection = objects.Selection(
service_host='fake_host',
nodename='fake_node',
limits=None)
mock_schedule.return_value = [[selection]]
res_req = [objects.RequestGroup()]
mock_get_res_req.return_value = res_req
self.conductor_manager.unshelve_instance(
self.context, instance, request_spec)
self.assertEqual(res_req, request_spec.requested_resources)
mock_get_res_req.assert_called_once_with(self.context, instance.uuid)
mock_schedule.assert_called_once_with(
self.context, request_spec, [instance.uuid],
return_alternates=False)
mock_fill_provider_mapping.assert_called_once_with(
request_spec, selection)
def test_rebuild_instance(self):
inst_obj = self._create_fake_instance_obj()
rebuild_args, compute_args = self._prepare_rebuild_args(


@@ -4634,6 +4634,31 @@ class TestAPI(TestAPIBase):
'binding:profile': {'allocation': uuids.dest_compute_rp},
'binding:host_id': 'new-host'}})
@mock.patch.object(neutronapi, 'get_client')
def test_update_port_bindings_for_instance_with_resource_req_unshelve(
self, get_client_mock):
instance = fake_instance.fake_instance_obj(self.context)
fake_ports = {'ports': [
{'id': 'fake-port-1',
'binding:vnic_type': 'normal',
constants.BINDING_HOST_ID: 'old-host',
constants.BINDING_PROFILE: {
'allocation': uuids.source_compute_rp},
'resource_request': mock.sentinel.resource_request}]}
list_ports_mock = mock.Mock(return_value=fake_ports)
get_client_mock.return_value.list_ports = list_ports_mock
# NOTE(gibi): during unshelve migration object is not created
self.api._update_port_binding_for_instance(
self.context, instance, 'new-host', None,
{'fake-port-1': [uuids.dest_compute_rp]})
get_client_mock.return_value.update_port.assert_called_once_with(
'fake-port-1',
{'port': {'device_owner': 'compute:None',
'binding:profile': {'allocation': uuids.dest_compute_rp},
'binding:host_id': 'new-host'}})
@mock.patch.object(neutronapi, 'get_client', return_value=mock.Mock())
def test_update_port_bindings_for_instance_with_resource_req_no_mapping(
self, get_client_mock):