libvirt: Ignore DiskNotFound during update_available_resource

There was a previous attempt to fix DiskNotFound errors raised during
update_available_resource in change
Id687e11e235fd6c2f99bb647184310dfdce9a08d. However, there were two
problems with the previous fix:

1. The handling of missing volumes and disks, while typically having the
   same cause, was inconsistent.

2. It failed to consider the very wide race opportunity in
   _get_disk_over_committed_size_total between initially fetching the
   instance list from the DB and later getting disk sizes.

Because _get_disk_over_committed_size_total() can be a very long
operation, we found that we were reliably hitting this race in CI.
It might be possible to close the race, but doing so would add
unnecessary complication to code which isn't critical. It's far more
robust just to log the missing disk and ignore it, which is also
consistent with the handling of missing volumes.

Closes-Bug: #1774249

Change-Id: I48719c02713113a41176b8f5cc3c5831f1284a39
(cherry picked from commit 6198f317be)
This commit is contained in:
Matthew Booth 2019-09-27 16:51:02 +01:00 committed by Lee Yarwood
parent 40c288b1fb
commit 73d9b6e5f6
2 changed files with 10 additions and 51 deletions

View File

@ -16314,30 +16314,6 @@ class LibvirtConnTestCase(test.NoDBTestCase,
drvr = libvirt_driver.LibvirtDriver(fake.FakeVirtAPI(), False)
self.assertEqual(0, drvr._get_disk_over_committed_size_total())
@mock.patch('nova.virt.libvirt.host.Host.list_instance_domains')
@mock.patch('nova.virt.libvirt.driver.LibvirtDriver.'
'_get_instance_disk_info_from_config',
side_effect=exception.DiskNotFound(location='/opt/stack/foo'))
@mock.patch('nova.objects.BlockDeviceMappingList.bdms_by_instance_uuid',
return_value=objects.BlockDeviceMappingList())
@mock.patch('nova.objects.InstanceList.get_by_filters',
return_value=objects.InstanceList(objects=[
objects.Instance(uuid=uuids.instance,
vm_state=vm_states.ACTIVE,
task_state=None)]))
def test_disk_over_committed_size_total_disk_not_found_reraise(
self, mock_get, mock_bdms, mock_get_disk_info, mock_list_domains):
"""Tests that we handle DiskNotFound gracefully for an instance that
is NOT undergoing a task_state transition and the error is re-raised.
"""
mock_dom = mock.Mock()
mock_dom.XMLDesc.return_value = "<domain/>"
mock_dom.UUIDString.return_value = uuids.instance
mock_list_domains.return_value = [mock_dom]
drvr = libvirt_driver.LibvirtDriver(fake.FakeVirtAPI(), False)
self.assertRaises(exception.DiskNotFound,
drvr._get_disk_over_committed_size_total)
@mock.patch('nova.virt.libvirt.storage.lvm.get_volume_size')
@mock.patch('nova.virt.disk.api.get_disk_size',
new_callable=mock.NonCallableMock)

View File

@ -9596,35 +9596,18 @@ class LibvirtDriver(driver.ComputeDriver):
{'i_name': guest.name})
else:
raise
except exception.VolumeBDMPathNotFound as e:
except (exception.VolumeBDMPathNotFound,
exception.DiskNotFound) as e:
if isinstance(e, exception.VolumeBDMPathNotFound):
thing = 'backing volume block device'
elif isinstance(e, exception.DiskNotFound):
thing = 'backing disk storage'
LOG.warning('Periodic task is updating the host stats, '
'it is trying to get disk info for %(i_name)s, '
'but the backing volume block device was removed '
'by concurrent operations such as resize. '
'Error: %(error)s',
{'i_name': guest.name, 'error': e})
except exception.DiskNotFound:
with excutils.save_and_reraise_exception() as err_ctxt:
# If the instance is undergoing a task state transition,
# like moving to another host or is being deleted, we
# should ignore this instance and move on.
if guest.uuid in local_instances:
inst = local_instances[guest.uuid]
# bug 1774249 indicated when instance is in RESIZED
# state it might also can't find back disk
if (inst.task_state is not None or
inst.vm_state == vm_states.RESIZED):
LOG.info('Periodic task is updating the host '
'stats; it is trying to get disk info '
'for %(i_name)s, but the backing disk '
'was removed by a concurrent operation '
'(task_state=%(task_state)s) and '
'(vm_state=%(vm_state)s)',
{'i_name': guest.name,
'task_state': inst.task_state,
'vm_state': inst.vm_state},
instance=inst)
err_ctxt.reraise = False
'but the %(thing)s was removed by a concurrent '
'operation such as resize. Error: %(error)s',
{'i_name': guest.name, 'thing': thing, 'error': e})
# NOTE(gtt116): give other tasks a chance.
greenthread.sleep(0)