Add retry logic for detaching device using LibVirt

Add retry logic for removing a disk device from the LibVirt
guest domain XML. This is needed because immediately after a guest
reboot, libvirtmod.virDomainDetachDeviceFlags() will silently fail
to remove the mapping from the guest domain. The async retry
behavior lives in the Guest class and is generic, so it can be reused
by any other detach operations that hit this same race condition.

Change-Id: I983f80822a5c210929f33e1aa348a0fef91e890b
Closes-Bug: #1374508
Author: Ryan McNair  2015-09-24 22:20:23 +00:00
parent 3f8c69b2ef
commit 3a3fb3cfb2
6 changed files with 170 additions and 7 deletions
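
A minimal sketch (not part of this commit) of the calling pattern the change
introduces: detach_device_with_retry issues the initial detach request and
hands back a callable, which the caller invokes later to confirm, and if
necessary retry, the removal. Here `guest` is assumed to be a
nova.virt.libvirt.guest.Guest for a running instance, and 'vdc' is an
illustrative device name:

    # Sketch only; names mirror the diffs below.
    wait_for_detach = guest.detach_device_with_retry(
        guest.get_disk,     # maps a device name to its domain XML config
        'vdc',              # device to remove
        persistent=True,
        live=True)

    # ... unrelated teardown (e.g. volume encryptor detach) may run here ...

    # Re-checks the domain XML, re-issuing the transient-domain detach with
    # growing sleeps; raises DeviceDetachFailed after max_retry_count tries.
    wait_for_detach()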

nova/exception.py

@@ -634,6 +634,14 @@ class VolumeBDMPathNotFound(VolumeBDMNotFound):
     msg_fmt = _("No volume Block Device Mapping at path: %(path)s")
 
 
+class DeviceDetachFailed(NovaException):
+    msg_fmt = _("Device detach failed for %(device)s: %(reason)s)")
+
+
+class DeviceNotFound(NotFound):
+    msg_fmt = _("Device '%(device)s' not found.")
+
+
 class SnapshotNotFound(NotFound):
     ec2_code = 'InvalidSnapshot.NotFound'
     msg_fmt = _("Snapshot %(snapshot_id)s could not be found.")
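
A hedged illustration (not part of the diff) of how the two new exception
classes format their messages: NovaException interpolates msg_fmt with the
keyword arguments passed to the constructor. The device and reason values
below are made up; note that the stray ')' inside the DeviceDetachFailed
format string is in the commit itself and shows up in the rendered message:

    from nova import exception

    try:
        raise exception.DeviceDetachFailed(device='/dev/vdb',
                                           reason='device busy')
    except exception.DeviceDetachFailed as e:
        print(e)  # Device detach failed for /dev/vdb: device busy)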

nova/tests/unit/virt/libvirt/test_driver.py

@@ -5108,16 +5108,25 @@ class LibvirtConnTestCase(test.NoDBTestCase):
                                         mock_get_domain):
         drvr = libvirt_driver.LibvirtDriver(fake.FakeVirtAPI(), False)
         instance = objects.Instance(**self.test_instance)
-        mock_xml = """<domain>
+        mock_xml_with_disk = """<domain>
                         <devices>
                           <disk type='file'>
                             <source file='/path/to/fake-volume'/>
                             <target dev='vdc' bus='virtio'/>
                           </disk>
                         </devices>
                       </domain>"""
+        mock_xml_without_disk = """<domain>
+                                     <devices>
+                                     </devices>
+                                   </domain>"""
         mock_dom = mock.MagicMock()
-        mock_dom.XMLDesc.return_value = mock_xml
+        # Second time don't return anything about disk vdc so it looks removed
+        return_list = [mock_xml_with_disk, mock_xml_without_disk]
+        # Doubling the size of return list because we test with two guest
+        # power states
+        mock_dom.XMLDesc.side_effect = return_list + return_list
 
         connection_info = {"driver_volume_type": "fake",
                            "data": {"device_path": "/fake",
@@ -5130,7 +5139,6 @@ class LibvirtConnTestCase(test.NoDBTestCase):
         for state in (power_state.RUNNING, power_state.PAUSED):
             mock_dom.info.return_value = [state, 512, 512, 2, 1234, 5678]
             mock_get_domain.return_value = mock_dom
-
             drvr.detach_volume(connection_info, instance, '/dev/vdc')
 
             mock_get_domain.assert_called_with(instance)
@@ -5142,6 +5150,28 @@ class LibvirtConnTestCase(test.NoDBTestCase):
             mock_disconnect_volume.assert_called_with(
                 connection_info, 'vdc')
 
+    @mock.patch('nova.virt.libvirt.host.Host.get_domain')
+    def test_detach_volume_disk_not_found(self, mock_get_domain):
+        drvr = libvirt_driver.LibvirtDriver(fake.FakeVirtAPI(), False)
+        instance = objects.Instance(**self.test_instance)
+        mock_xml_without_disk = """<domain>
+                                     <devices>
+                                     </devices>
+                                   </domain>"""
+        mock_dom = mock.MagicMock(return_value=mock_xml_without_disk)
+
+        connection_info = {"driver_volume_type": "fake",
+                           "data": {"device_path": "/fake",
+                                    "access_mode": "rw"}}
+        mock_dom.info.return_value = [power_state.RUNNING, 512, 512, 2, 1234,
+                                      5678]
+        mock_get_domain.return_value = mock_dom
+        self.assertRaises(exception.DiskNotFound, drvr.detach_volume,
+                          connection_info, instance, '/dev/vdc')
+
+        mock_get_domain.assert_called_once_with(instance)
+
     def test_multi_nic(self):
         network_info = _fake_network_info(self.stubs, 2)
         drvr = libvirt_driver.LibvirtDriver(fake.FakeVirtAPI(), True)

nova/tests/unit/virt/libvirt/test_guest.py

@@ -213,6 +213,56 @@ class GuestTestCase(test.NoDBTestCase):
             "</xml>", flags=(fakelibvirt.VIR_DOMAIN_AFFECT_CONFIG |
                              fakelibvirt.VIR_DOMAIN_AFFECT_LIVE))
 
+    def test_detach_device_with_retry_detach_success(self):
+        conf = mock.Mock(spec=vconfig.LibvirtConfigGuestDevice)
+        conf.to_xml.return_value = "</xml>"
+        get_config = mock.Mock()
+        # Force multiple retries of detach
+        get_config.side_effect = [conf, conf, conf, None]
+        dev_path = "/dev/vdb"
+
+        retry_detach = self.guest.detach_device_with_retry(
+            get_config, dev_path, persistent=True, live=True,
+            inc_sleep_time=.01)
+        # Ensure we've only done the initial detach call
+        self.domain.detachDeviceFlags.assert_called_once_with(
+            "</xml>", flags=(fakelibvirt.VIR_DOMAIN_AFFECT_CONFIG |
+                             fakelibvirt.VIR_DOMAIN_AFFECT_LIVE))
+
+        get_config.assert_called_with(dev_path)
+
+        # Some time later, we can do the wait/retry to ensure detach succeeds
+        self.domain.detachDeviceFlags.reset_mock()
+        retry_detach()
+        # Should have two retries before we pretend device is detached
+        self.assertEqual(2, self.domain.detachDeviceFlags.call_count)
+
+    def test_detach_device_with_retry_detach_failure(self):
+        conf = mock.Mock(spec=vconfig.LibvirtConfigGuestDevice)
+        conf.to_xml.return_value = "</xml>"
+        # Continue to return some value for the disk config
+        get_config = mock.Mock(return_value=conf)
+
+        retry_detach = self.guest.detach_device_with_retry(
+            get_config, "/dev/vdb", persistent=True, live=True,
+            inc_sleep_time=.01, max_retry_count=3)
+        # Ensure we've only done the initial detach call
+        self.domain.detachDeviceFlags.assert_called_once_with(
+            "</xml>", flags=(fakelibvirt.VIR_DOMAIN_AFFECT_CONFIG |
+                             fakelibvirt.VIR_DOMAIN_AFFECT_LIVE))
+
+        # Some time later, we can do the wait/retry to ensure detach
+        self.domain.detachDeviceFlags.reset_mock()
+        # Should hit max # of retries
+        self.assertRaises(exception.DeviceDetachFailed, retry_detach)
+        self.assertEqual(4, self.domain.detachDeviceFlags.call_count)
+
+    def test_detach_device_with_retry_device_not_found(self):
+        get_config = mock.Mock(return_value=None)
+        self.assertRaises(
+            exception.DeviceNotFound, self.guest.detach_device_with_retry,
+            get_config, "/dev/vdb", persistent=True, live=True)
+
     def test_get_xml_desc(self):
         self.guest.get_xml_desc()
         self.domain.XMLDesc.assert_called_once_with(flags=0)
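
The retry semantics exercised above come from
oslo_service.loopingcall.RetryDecorator, which guest.py (further below)
applies to the wait function. A self-contained sketch of its behavior, under
the assumption stated in the comments (a toy function that stops raising on
its third call; the StillAttached class and tiny sleep values are invented
for illustration):

    from oslo_service import loopingcall


    class StillAttached(Exception):
        pass


    calls = {'n': 0}


    @loopingcall.RetryDecorator(max_retry_count=3, inc_sleep_time=.01,
                                max_sleep_time=.05,
                                exceptions=StillAttached)
    def poke():
        calls['n'] += 1
        if calls['n'] < 3:
            raise StillAttached()  # retried after an increasing sleep
        return calls['n']


    print(poke())  # 3 -- i.e. two retries before success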

nova/tests/unit/virt/test_virt_drivers.py

@@ -139,6 +139,22 @@ class _FakeDriverBackendTestCase(object):
         def fake_delete_instance_files(_self, _instance):
             pass
 
+        def fake_wait():
+            pass
+
+        def fake_detach_device_with_retry(_self, get_device_conf_func, device,
+                                          persistent, live,
+                                          max_retry_count=7,
+                                          inc_sleep_time=2,
+                                          max_sleep_time=30):
+            # Still calling detach, but instead of returning function
+            # that actually checks if device is gone from XML, just continue
+            # because XML never gets updated in these tests
+            _self.detach_device(get_device_conf_func(device),
+                                persistent=persistent,
+                                live=live)
+            return fake_wait
+
         self.stubs.Set(nova.virt.libvirt.driver.LibvirtDriver,
                        '_get_instance_disk_info',
                        fake_get_instance_disk_info)
@@ -150,6 +166,10 @@ class _FakeDriverBackendTestCase(object):
                        'delete_instance_files',
                        fake_delete_instance_files)
 
+        self.stubs.Set(nova.virt.libvirt.guest.Guest,
+                       'detach_device_with_retry',
+                       fake_detach_device_with_retry)
+
         # Like the existing fakelibvirt.migrateToURI, do nothing,
         # but don't fail for these tests.
         self.stubs.Set(nova.virt.libvirt.driver.libvirt.Domain,

nova/virt/libvirt/driver.py

@@ -1229,13 +1229,14 @@ class LibvirtDriver(driver.ComputeDriver):
         disk_dev = mountpoint.rpartition("/")[2]
         try:
             guest = self._host.get_guest(instance)
-            conf = guest.get_disk(disk_dev)
-            if not conf:
-                raise exception.DiskNotFound(location=disk_dev)
 
             state = guest.get_power_state(self._host)
             live = state in (power_state.RUNNING, power_state.PAUSED)
-            guest.detach_device(conf, persistent=True, live=live)
+            wait_for_detach = guest.detach_device_with_retry(guest.get_disk,
+                                                             disk_dev,
+                                                             persistent=True,
+                                                             live=live)
 
             if encryption:
                 # The volume must be detached from the VM before
@@ -1244,12 +1245,16 @@ class LibvirtDriver(driver.ComputeDriver):
                 encryptor = self._get_volume_encryptor(connection_info,
                                                        encryption)
                 encryptor.detach_volume(**encryption)
 
+            wait_for_detach()
+
         except exception.InstanceNotFound:
             # NOTE(zhaoqin): If the instance does not exist, _lookup_by_name()
             #                will throw InstanceNotFound exception. Need to
             #                disconnect volume under this circumstance.
             LOG.warn(_LW("During detach_volume, instance disappeared."),
                      instance=instance)
+        except exception.DeviceNotFound:
+            raise exception.DiskNotFound(location=disk_dev)
         except libvirt.libvirtError as ex:
             # NOTE(vish): This is called to cleanup volumes after live
             #             migration, so we should still disconnect even if

nova/virt/libvirt/guest.py

@@ -29,6 +29,7 @@ then used by all the other libvirt related classes
 from lxml import etree
 from oslo_log import log as logging
+from oslo_service import loopingcall
 from oslo_utils import encodeutils
 from oslo_utils import excutils
 from oslo_utils import importutils
@@ -277,6 +278,55 @@ class Guest(object):
             devs.append(dev)
         return devs
 
+    def detach_device_with_retry(self, get_device_conf_func, device,
+                                 persistent, live, max_retry_count=7,
+                                 inc_sleep_time=2,
+                                 max_sleep_time=30):
+        """Detaches a device from the guest. After the initial detach request,
+        a function is returned which can be used to ensure the device is
+        successfully removed from the guest domain (retrying the removal as
+        necessary).
+
+        :param get_device_conf_func: function which takes device as a
+                                     parameter and returns the configuration
+                                     for device
+        :param device: device to detach
+        :param persistent: bool to indicate whether the change is
+                           persistent or not
+        :param live: bool to indicate whether it affects the guest in running
+                     state
+        :param max_retry_count: number of times the returned function will
+                                retry a detach before failing
+        :param inc_sleep_time: incremental time to sleep in seconds between
+                               detach retries
+        :param max_sleep_time: max sleep time in seconds beyond which the
+                               sleep time will not be incremented using param
+                               inc_sleep_time. On reaching this threshold,
+                               max_sleep_time will be used as the sleep time.
+        """
+        conf = get_device_conf_func(device)
+        if conf is None:
+            raise exception.DeviceNotFound(device=device)
+
+        self.detach_device(conf, persistent, live)
+
+        @loopingcall.RetryDecorator(max_retry_count=max_retry_count,
+                                    inc_sleep_time=inc_sleep_time,
+                                    max_sleep_time=max_sleep_time,
+                                    exceptions=exception.DeviceDetachFailed)
+        def _do_wait_and_retry_detach():
+            config = get_device_conf_func(device)
+            if config is not None:
+                # Device is already detached from persistent domain
+                # and only transient domain needs update
+                self.detach_device(config, persistent=False, live=live)
+                # Raise error since the device still existed on the guest
+                reason = _("Unable to detach from guest transient domain.")
+                raise exception.DeviceDetachFailed(device=device,
+                                                   reason=reason)
+
+        return _do_wait_and_retry_detach
+
     def detach_device(self, conf, persistent=False, live=False):
         """Detaches device to the guest.