nova/nova/tests/functional/libvirt/test_reshape.py
Balazs Gibizer 2794748d9c Enhance service restart in functional env
Bugfix Icaf1bae8cb040b939f916a19ce026031ddb84af7 showed that restarting
a compute service in the functional env is unrealistic causing faults
to slip through. During that bug fix only the minimal change was done
in the functional env regarding compute service restart to reproduce
the reported fault. However the restart of the compute service could
be made even more realistic.

This patch simulates a compute service restart in the functional env
by stopping the original compute service and starting a totally new
compute service for the same host and node. This way we can make sure
that we get a brand new ComputeManager in the new service and no
state can leak between the old and the new service.

This change revealed another shortcoming of the functional env.
In the real world the nova-compute service could be restarted without
losing any running servers on the compute host. But with the naive
implementation of this change the compute service is re-created. This
means that a new ComputeManager is instantiated that loads a new
FakeDriver instance as well. That new FakeDriver instance then reports
an empty hypervisor. This behavior is not totally unrealistic as it
simulates such a compute host restart that cleans the hypervisor state
as well (e.g. compute host redeployment). However this type of restart
shows another bug in the code path that destroys and deallocates
evacuated instances from the source host. Therefore this patch
implements the compute service restart in a way that simulates only a
service restart and not a full compute restart. A subsequent patch will
add a test that uses the clean hypervisor case to reproduce the
revealed bug.

Related-Bug: #1724172
Change-Id: I9d6cd6259659a35383c0c9c21db72a9434ba86b1
2019-06-25 16:11:02 +00:00

226 lines
9.9 KiB
Python

#
# Licensed under the Apache License, Version 2.0 (the "License"); you may
# not use this file except in compliance with the License. You may obtain
# a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
# License for the specific language governing permissions and limitations
# under the License.
import time
import mock
from oslo_config import cfg
from oslo_log import log as logging
from nova import context
from nova import objects
from nova.tests.functional.libvirt import base
from nova.tests.unit.virt.libvirt import fakelibvirt
from nova.virt.libvirt import utils
CONF = cfg.CONF
LOG = logging.getLogger(__name__)
class VGPUReshapeTests(base.ServersTestBase):
    """Functional coverage for reshaping VGPU inventory with libvirt."""

    # the minimum libvirt version needed for vgpu
    MIN_LIBVIRT_MDEV_SUPPORT = 3004000

    def _wait_for_state_change(self, server, expected_status):
        """Poll the API until ``server`` reports ``expected_status``.

        The server is re-fetched at most 50 times, sleeping 0.1 seconds
        after every fetch that does not show the expected status. When the
        attempts run out the last fetched status is asserted, which fails
        the test.

        :param server: server dict; only its 'id' key is read
        :param expected_status: status string to wait for
        :returns: the most recently fetched server dict
        """
        for _attempt in range(50):
            server = self.api.get_server(server['id'])
            if server['status'] != expected_status:
                time.sleep(.1)
                continue
            return server
        self.assertEqual(expected_status, server['status'])
        return server

    def test_create_servers_with_vgpu(self):
        """Verify that the libvirt driver reshapes VGPU inventory correctly

        1) boot two servers while the VGPU inventory still lives on the
           compute (root) resource provider
        2) restart the compute service to trigger the reshape
        3) verify that the allocations of both servers are still valid
        4) boot a third server against the reshaped tree
        """
        def find_rp_uuid(query_url):
            # uuid of the first resource provider returned by the query
            providers = self.placement_api.get(query_url).body[
                'resource_providers']
            return providers[0]['uuid']

        def inventories_body(rp_uuid):
            # complete body of the provider's inventories response
            return self.placement_api.get(
                '/resource_providers/%s/inventories' % rp_uuid).body

        def usages_of(rp_uuid):
            return self.placement_api.get(
                '/resource_providers/%s/usages' % rp_uuid).body['usages']

        def allocations_of(booted_server):
            return self.placement_api.get(
                '/allocations/%s' % booted_server['id']).body['allocations']

        # What every server of this test consumes from the compute RP apart
        # from the VGPU; the flavor has disk=10 and ephemeral=10.
        non_gpu_resources = {'DISK_GB': 20, 'MEMORY_MB': 2048, 'VCPU': 2}

        # NOTE(gibi): The virt driver no longer has a code path that builds
        # the old RP tree (VGPU on the root RP), so the old tree is faked
        # here: the compute service is started without vgpu support so that
        # the compute RP exists, then the VGPU inventory is added to that RP
        # directly in placement. The driver must also be kept from noticing
        # the old tree during the instance claim as that would be a bad time
        # for a reshape. The reshape itself happens later, when the compute
        # service is restarted.
        # We need more RAM or the 3rd server won't be created
        host_info = fakelibvirt.HostInfo(kB_mem=8192)
        libvirt_version = self.MIN_LIBVIRT_MDEV_SUPPORT
        mdev_info = fakelibvirt.HostMdevDevicesInfo()
        self.mock_conn.return_value = self._get_connection(
            host_info=host_info,
            libvirt_version=libvirt_version,
            mdev_info=mdev_info)

        # vgpu support is disabled for the first start of the compute so
        # the driver ignores the HostMdevDevicesInfo set up above
        self.flags(enabled_vgpu_types='', group='devices')
        self.compute = self.start_service('compute', host='compute1')

        # put the VGPU inventory on the compute RP by hand
        compute_rp_uuid = find_rp_uuid('/resource_providers?name=compute1')
        inventories = inventories_body(compute_rp_uuid)
        inventories['inventories']['VGPU'] = {
            'allocation_ratio': 1.0,
            'max_unit': 3,
            'min_unit': 1,
            'reserved': 0,
            'step_size': 1,
            'total': 3}
        self.placement_api.put(
            '/resource_providers/%s/inventories' % compute_rp_uuid,
            inventories)

        # every server in this test asks for a single VGPU
        extra_spec = {"resources:VGPU": 1}
        flavor_id = self._create_flavor(extra_spec=extra_spec)
        server_req = self._build_server(flavor_id)

        def boot_server():
            # create a server and block until it is ACTIVE
            created = self.api.post_server({'server': server_req})
            return self._wait_for_state_change(created, 'ACTIVE')

        # NOTE(gibi): instance_claim() calls driver.update_provider_tree()
        # which would detect the old tree and fail because this is not a
        # good time to reshape. So update_provider_tree is mocked while the
        # first two servers are booted.
        old_tree_servers = []
        with mock.patch('nova.virt.libvirt.driver.LibvirtDriver.'
                        'update_provider_tree'):
            for _boot in range(2):
                old_tree_servers.append(boot_server())
        server1, server2 = old_tree_servers

        # Record the parent pGPU of the mdev assigned to each instance
        # { inst.uuid: pgpu_name }
        admin_ctx = context.get_admin_context()
        driver = self.compute.driver
        pgpu_by_instance = {}
        for booted in (server1, server2):
            instance = objects.Instance.get_by_uuid(admin_ctx, booted['id'])
            assigned = list(
                driver._get_all_assigned_mediated_devices(instance))
            self.assertEqual(1, len(assigned))
            mdev_name = utils.mdev_uuid2name(assigned[0])
            details = driver._get_mediated_device_information(mdev_name)
            pgpu_by_instance[instance.uuid] = details['parent']

        # The VGPUs should have come from different pGPUs
        self.assertNotEqual(*pgpu_by_instance.values())

        # before the reshape the inventory, the usages and the allocations
        # are all expected on the compute RP
        root_inventory = inventories_body(compute_rp_uuid)['inventories']
        self.assertEqual(3, root_inventory['VGPU']['total'])
        root_usages = usages_of(compute_rp_uuid)
        self.assertEqual(2, root_usages['VGPU'])
        for booted in (server1, server2):
            allocs = allocations_of(booted)
            self.assertEqual(
                {'DISK_GB': 20, 'MEMORY_MB': 2048, 'VCPU': 2, 'VGPU': 1},
                allocs[compute_rp_uuid]['resources'])

        # enable vgpu support
        self.flags(
            enabled_vgpu_types=fakelibvirt.NVIDIA_11_VGPU_TYPE,
            group='devices')
        # the restart of the compute service is what triggers the reshape
        self.compute = self.restart_compute_service(self.compute)

        # after the reshape the compute RP has no VGPU inventory any more
        root_inventory = inventories_body(compute_rp_uuid)['inventories']
        self.assertNotIn('VGPU', root_inventory)

        # NOTE(sbauza): The two instances use two different pGPUs, so all
        # the pGPU providers have to be checked to learn which ones are in
        # use.
        usage_by_device = {}
        device_by_rp_uuid = {}
        pgpu_addresses = [fakelibvirt.PGPU1_PCI_ADDR,
                          fakelibvirt.PGPU2_PCI_ADDR,
                          fakelibvirt.PGPU3_PCI_ADDR]
        for pci_address in pgpu_addresses:
            pgpu_rp_uuid = find_rp_uuid(
                '/resource_providers?name=compute1_%s' % pci_address)
            device_by_rp_uuid[pgpu_rp_uuid] = pci_address
            pgpu_inventory = inventories_body(pgpu_rp_uuid)['inventories']
            self.assertEqual(1, pgpu_inventory['VGPU']['total'])
            usage_by_device[pci_address] = usages_of(pgpu_rp_uuid)['VGPU']

        # Make sure that both instances are using different pGPUs
        used_devices = [
            name for name, used in usage_by_device.items() if used == 1]
        avail_devices = list(set(usage_by_device) - set(used_devices))
        self.assertEqual(2, len(used_devices))

        # Make sure that both instances are using the correct pGPUs
        for booted in [server1, server2]:
            allocs = allocations_of(booted)
            self.assertEqual(
                non_gpu_resources,
                allocs[compute_rp_uuid]['resources'])
            rp_uuids = list(allocs.keys())
            # We only have two RPs, the compute RP (the root) and the child
            # pGPU RP
            if rp_uuids[0] == compute_rp_uuid:
                pgpu_rp_uuid = rp_uuids[1]
            else:
                pgpu_rp_uuid = rp_uuids[0]
            self.assertEqual(
                {'VGPU': 1},
                allocs[pgpu_rp_uuid]['resources'])
            # The pGPU's RP name contains the pGPU name
            self.assertIn(pgpu_by_instance[booted['id']],
                          device_by_rp_uuid[pgpu_rp_uuid])

        # boot one more server with a vgpu, now against the reshaped tree
        server3 = boot_server()

        # the third server should have taken the pGPU that was still
        # available before it was created
        free_device = avail_devices[0]
        pgpu_rp_uuid = find_rp_uuid(
            '/resource_providers?name=compute1_%s' % free_device)
        self.assertEqual(1, usages_of(pgpu_rp_uuid)['VGPU'])
        allocs = allocations_of(server3)
        self.assertEqual(
            non_gpu_resources,
            allocs[compute_rp_uuid]['resources'])
        self.assertEqual(
            {'VGPU': 1},
            allocs[pgpu_rp_uuid]['resources'])