Add retry logic for detaching device using LibVirt
Add retry logic for removing a disk device from the LibVirt guest domain XML. This is needed because immediately after a guest reboot, libvirtmod.virDomainDetachDeviceFlags() will silently fail to remove the mapping from the guest domain. The async retry behavior is done in Guest and is generic so it can be re-used by any other detaches which hit this same race condition. Change-Id: I983f80822a5c210929f33e1aa348a0fef91e890b Closes-Bug: #1374508
This commit is contained in:
parent
3f8c69b2ef
commit
3a3fb3cfb2
@ -634,6 +634,14 @@ class VolumeBDMPathNotFound(VolumeBDMNotFound):
|
||||
msg_fmt = _("No volume Block Device Mapping at path: %(path)s")
|
||||
|
||||
|
||||
class DeviceDetachFailed(NovaException):
|
||||
msg_fmt = _("Device detach failed for %(device)s: %(reason)s)")
|
||||
|
||||
|
||||
class DeviceNotFound(NotFound):
|
||||
msg_fmt = _("Device '%(device)s' not found.")
|
||||
|
||||
|
||||
class SnapshotNotFound(NotFound):
|
||||
ec2_code = 'InvalidSnapshot.NotFound'
|
||||
msg_fmt = _("Snapshot %(snapshot_id)s could not be found.")
|
||||
|
@ -5108,16 +5108,25 @@ class LibvirtConnTestCase(test.NoDBTestCase):
|
||||
mock_get_domain):
|
||||
drvr = libvirt_driver.LibvirtDriver(fake.FakeVirtAPI(), False)
|
||||
instance = objects.Instance(**self.test_instance)
|
||||
mock_xml = """<domain>
|
||||
mock_xml_with_disk = """<domain>
|
||||
<devices>
|
||||
<disk type='file'>
|
||||
<source file='/path/to/fake-volume'/>
|
||||
<target dev='vdc' bus='virtio'/>
|
||||
</disk>
|
||||
</devices>
|
||||
</domain>"""
|
||||
mock_xml_without_disk = """<domain>
|
||||
<devices>
|
||||
</devices>
|
||||
</domain>"""
|
||||
mock_dom = mock.MagicMock()
|
||||
mock_dom.XMLDesc.return_value = mock_xml
|
||||
|
||||
# Second time don't return anything about disk vdc so it looks removed
|
||||
return_list = [mock_xml_with_disk, mock_xml_without_disk]
|
||||
# Doubling the size of return list because we test with two guest power
|
||||
# states
|
||||
mock_dom.XMLDesc.side_effect = return_list + return_list
|
||||
|
||||
connection_info = {"driver_volume_type": "fake",
|
||||
"data": {"device_path": "/fake",
|
||||
@ -5130,7 +5139,6 @@ class LibvirtConnTestCase(test.NoDBTestCase):
|
||||
for state in (power_state.RUNNING, power_state.PAUSED):
|
||||
mock_dom.info.return_value = [state, 512, 512, 2, 1234, 5678]
|
||||
mock_get_domain.return_value = mock_dom
|
||||
|
||||
drvr.detach_volume(connection_info, instance, '/dev/vdc')
|
||||
|
||||
mock_get_domain.assert_called_with(instance)
|
||||
@ -5142,6 +5150,28 @@ class LibvirtConnTestCase(test.NoDBTestCase):
|
||||
mock_disconnect_volume.assert_called_with(
|
||||
connection_info, 'vdc')
|
||||
|
||||
@mock.patch('nova.virt.libvirt.host.Host.get_domain')
|
||||
def test_detach_volume_disk_not_found(self, mock_get_domain):
|
||||
drvr = libvirt_driver.LibvirtDriver(fake.FakeVirtAPI(), False)
|
||||
instance = objects.Instance(**self.test_instance)
|
||||
mock_xml_without_disk = """<domain>
|
||||
<devices>
|
||||
</devices>
|
||||
</domain>"""
|
||||
mock_dom = mock.MagicMock(return_value=mock_xml_without_disk)
|
||||
|
||||
connection_info = {"driver_volume_type": "fake",
|
||||
"data": {"device_path": "/fake",
|
||||
"access_mode": "rw"}}
|
||||
|
||||
mock_dom.info.return_value = [power_state.RUNNING, 512, 512, 2, 1234,
|
||||
5678]
|
||||
mock_get_domain.return_value = mock_dom
|
||||
self.assertRaises(exception.DiskNotFound, drvr.detach_volume,
|
||||
connection_info, instance, '/dev/vdc')
|
||||
|
||||
mock_get_domain.assert_called_once_with(instance)
|
||||
|
||||
def test_multi_nic(self):
|
||||
network_info = _fake_network_info(self.stubs, 2)
|
||||
drvr = libvirt_driver.LibvirtDriver(fake.FakeVirtAPI(), True)
|
||||
|
@ -213,6 +213,56 @@ class GuestTestCase(test.NoDBTestCase):
|
||||
"</xml>", flags=(fakelibvirt.VIR_DOMAIN_AFFECT_CONFIG |
|
||||
fakelibvirt.VIR_DOMAIN_AFFECT_LIVE))
|
||||
|
||||
def test_detach_device_with_retry_detach_success(self):
|
||||
conf = mock.Mock(spec=vconfig.LibvirtConfigGuestDevice)
|
||||
conf.to_xml.return_value = "</xml>"
|
||||
get_config = mock.Mock()
|
||||
# Force multiple retries of detach
|
||||
get_config.side_effect = [conf, conf, conf, None]
|
||||
dev_path = "/dev/vdb"
|
||||
|
||||
retry_detach = self.guest.detach_device_with_retry(
|
||||
get_config, dev_path, persistent=True, live=True,
|
||||
inc_sleep_time=.01)
|
||||
# Ensure we've only done the initial detach call
|
||||
self.domain.detachDeviceFlags.assert_called_once_with(
|
||||
"</xml>", flags=(fakelibvirt.VIR_DOMAIN_AFFECT_CONFIG |
|
||||
fakelibvirt.VIR_DOMAIN_AFFECT_LIVE))
|
||||
|
||||
get_config.assert_called_with(dev_path)
|
||||
|
||||
# Some time later, we can do the wait/retry to ensure detach succeeds
|
||||
self.domain.detachDeviceFlags.reset_mock()
|
||||
retry_detach()
|
||||
# Should have two retries before we pretend device is detached
|
||||
self.assertEqual(2, self.domain.detachDeviceFlags.call_count)
|
||||
|
||||
def test_detach_device_with_retry_detach_failure(self):
|
||||
conf = mock.Mock(spec=vconfig.LibvirtConfigGuestDevice)
|
||||
conf.to_xml.return_value = "</xml>"
|
||||
# Continue to return some value for the disk config
|
||||
get_config = mock.Mock(return_value=conf)
|
||||
|
||||
retry_detach = self.guest.detach_device_with_retry(
|
||||
get_config, "/dev/vdb", persistent=True, live=True,
|
||||
inc_sleep_time=.01, max_retry_count=3)
|
||||
# Ensure we've only done the initial detach call
|
||||
self.domain.detachDeviceFlags.assert_called_once_with(
|
||||
"</xml>", flags=(fakelibvirt.VIR_DOMAIN_AFFECT_CONFIG |
|
||||
fakelibvirt.VIR_DOMAIN_AFFECT_LIVE))
|
||||
|
||||
# Some time later, we can do the wait/retry to ensure detach
|
||||
self.domain.detachDeviceFlags.reset_mock()
|
||||
# Should hit max # of retries
|
||||
self.assertRaises(exception.DeviceDetachFailed, retry_detach)
|
||||
self.assertEqual(4, self.domain.detachDeviceFlags.call_count)
|
||||
|
||||
def test_detach_device_with_retry_device_not_found(self):
|
||||
get_config = mock.Mock(return_value=None)
|
||||
self.assertRaises(
|
||||
exception.DeviceNotFound, self.guest.detach_device_with_retry,
|
||||
get_config, "/dev/vdb", persistent=True, live=True)
|
||||
|
||||
def test_get_xml_desc(self):
|
||||
self.guest.get_xml_desc()
|
||||
self.domain.XMLDesc.assert_called_once_with(flags=0)
|
||||
|
@ -139,6 +139,22 @@ class _FakeDriverBackendTestCase(object):
|
||||
def fake_delete_instance_files(_self, _instance):
|
||||
pass
|
||||
|
||||
def fake_wait():
|
||||
pass
|
||||
|
||||
def fake_detach_device_with_retry(_self, get_device_conf_func, device,
|
||||
persistent, live,
|
||||
max_retry_count=7,
|
||||
inc_sleep_time=2,
|
||||
max_sleep_time=30):
|
||||
# Still calling detach, but instead of returning function
|
||||
# that actually checks if device is gone from XML, just continue
|
||||
# because XML never gets updated in these tests
|
||||
_self.detach_device(get_device_conf_func(device),
|
||||
persistent=persistent,
|
||||
live=live)
|
||||
return fake_wait
|
||||
|
||||
self.stubs.Set(nova.virt.libvirt.driver.LibvirtDriver,
|
||||
'_get_instance_disk_info',
|
||||
fake_get_instance_disk_info)
|
||||
@ -150,6 +166,10 @@ class _FakeDriverBackendTestCase(object):
|
||||
'delete_instance_files',
|
||||
fake_delete_instance_files)
|
||||
|
||||
self.stubs.Set(nova.virt.libvirt.guest.Guest,
|
||||
'detach_device_with_retry',
|
||||
fake_detach_device_with_retry)
|
||||
|
||||
# Like the existing fakelibvirt.migrateToURI, do nothing,
|
||||
# but don't fail for these tests.
|
||||
self.stubs.Set(nova.virt.libvirt.driver.libvirt.Domain,
|
||||
|
@ -1229,13 +1229,14 @@ class LibvirtDriver(driver.ComputeDriver):
|
||||
disk_dev = mountpoint.rpartition("/")[2]
|
||||
try:
|
||||
guest = self._host.get_guest(instance)
|
||||
conf = guest.get_disk(disk_dev)
|
||||
if not conf:
|
||||
raise exception.DiskNotFound(location=disk_dev)
|
||||
|
||||
state = guest.get_power_state(self._host)
|
||||
live = state in (power_state.RUNNING, power_state.PAUSED)
|
||||
guest.detach_device(conf, persistent=True, live=live)
|
||||
|
||||
wait_for_detach = guest.detach_device_with_retry(guest.get_disk,
|
||||
disk_dev,
|
||||
persistent=True,
|
||||
live=live)
|
||||
|
||||
if encryption:
|
||||
# The volume must be detached from the VM before
|
||||
@ -1244,12 +1245,16 @@ class LibvirtDriver(driver.ComputeDriver):
|
||||
encryptor = self._get_volume_encryptor(connection_info,
|
||||
encryption)
|
||||
encryptor.detach_volume(**encryption)
|
||||
|
||||
wait_for_detach()
|
||||
except exception.InstanceNotFound:
|
||||
# NOTE(zhaoqin): If the instance does not exist, _lookup_by_name()
|
||||
# will throw InstanceNotFound exception. Need to
|
||||
# disconnect volume under this circumstance.
|
||||
LOG.warn(_LW("During detach_volume, instance disappeared."),
|
||||
instance=instance)
|
||||
except exception.DeviceNotFound:
|
||||
raise exception.DiskNotFound(location=disk_dev)
|
||||
except libvirt.libvirtError as ex:
|
||||
# NOTE(vish): This is called to cleanup volumes after live
|
||||
# migration, so we should still disconnect even if
|
||||
|
@ -29,6 +29,7 @@ then used by all the other libvirt related classes
|
||||
|
||||
from lxml import etree
|
||||
from oslo_log import log as logging
|
||||
from oslo_service import loopingcall
|
||||
from oslo_utils import encodeutils
|
||||
from oslo_utils import excutils
|
||||
from oslo_utils import importutils
|
||||
@ -277,6 +278,55 @@ class Guest(object):
|
||||
devs.append(dev)
|
||||
return devs
|
||||
|
||||
def detach_device_with_retry(self, get_device_conf_func, device,
|
||||
persistent, live, max_retry_count=7,
|
||||
inc_sleep_time=2,
|
||||
max_sleep_time=30):
|
||||
"""Detaches a device from the guest. After the initial detach request,
|
||||
a function is returned which can be used to ensure the device is
|
||||
successfully removed from the guest domain (retrying the removal as
|
||||
necessary).
|
||||
|
||||
:param get_device_conf_func: function which takes device as a parameter
|
||||
and returns the configuration for device
|
||||
:param device: device to detach
|
||||
:param persistent: bool to indicate whether the change is
|
||||
persistent or not
|
||||
:param live: bool to indicate whether it affects the guest in running
|
||||
state
|
||||
:param max_retry_count: number of times the returned function will
|
||||
retry a detach before failing
|
||||
:param inc_sleep_time: incremental time to sleep in seconds between
|
||||
detach retries
|
||||
:param max_sleep_time: max sleep time in seconds beyond which the sleep
|
||||
time will not be incremented using param
|
||||
inc_sleep_time. On reaching this threshold,
|
||||
max_sleep_time will be used as the sleep time.
|
||||
"""
|
||||
|
||||
conf = get_device_conf_func(device)
|
||||
if conf is None:
|
||||
raise exception.DeviceNotFound(device=device)
|
||||
|
||||
self.detach_device(conf, persistent, live)
|
||||
|
||||
@loopingcall.RetryDecorator(max_retry_count=max_retry_count,
|
||||
inc_sleep_time=inc_sleep_time,
|
||||
max_sleep_time=max_sleep_time,
|
||||
exceptions=exception.DeviceDetachFailed)
|
||||
def _do_wait_and_retry_detach():
|
||||
config = get_device_conf_func(device)
|
||||
if config is not None:
|
||||
# Device is already detached from persistent domain
|
||||
# and only transient domain needs update
|
||||
self.detach_device(config, persistent=False, live=live)
|
||||
# Raise error since the device still existed on the guest
|
||||
reason = _("Unable to detach from guest transient domain.")
|
||||
raise exception.DeviceDetachFailed(device=device,
|
||||
reason=reason)
|
||||
|
||||
return _do_wait_and_retry_detach
|
||||
|
||||
def detach_device(self, conf, persistent=False, live=False):
|
||||
"""Detaches device to the guest.
|
||||
|
||||
|
Loading…
Reference in New Issue
Block a user