From 3a3fb3cfb2c41ad182545e47649ff12a4f3a743e Mon Sep 17 00:00:00 2001
From: Ryan McNair
Date: Thu, 24 Sep 2015 22:20:23 +0000
Subject: [PATCH] Add retry logic for detaching device using LibVirt

Add retry logic for removing a disk device from the LibVirt guest
domain XML. This is needed because immediately after a guest reboot,
libvirtmod.virDomainDetachDeviceFlags() will silently fail to remove
the mapping from the guest domain. The async retry behavior is done in
Guest and is generic so it can be re-used by any other detaches which
hit this same race condition.

Change-Id: I983f80822a5c210929f33e1aa348a0fef91e890b
Closes-Bug: #1374508
---
 nova/exception.py                           |  8 ++++
 nova/tests/unit/virt/libvirt/test_driver.py | 36 +++++++++++++--
 nova/tests/unit/virt/libvirt/test_guest.py  | 50 +++++++++++++++++++++
 nova/tests/unit/virt/test_virt_drivers.py   | 20 +++++++++
 nova/virt/libvirt/driver.py                 | 13 ++++--
 nova/virt/libvirt/guest.py                  | 50 +++++++++++++++++++++
 6 files changed, 170 insertions(+), 7 deletions(-)

diff --git a/nova/exception.py b/nova/exception.py
index 3dc73bf80b49..9a3f2bfb674c 100644
--- a/nova/exception.py
+++ b/nova/exception.py
@@ -634,6 +634,14 @@ class VolumeBDMPathNotFound(VolumeBDMNotFound):
     msg_fmt = _("No volume Block Device Mapping at path: %(path)s")
 
 
+class DeviceDetachFailed(NovaException):
+    msg_fmt = _("Device detach failed for %(device)s: %(reason)s")
+
+
+class DeviceNotFound(NotFound):
+    msg_fmt = _("Device '%(device)s' not found.")
+
+
 class SnapshotNotFound(NotFound):
     ec2_code = 'InvalidSnapshot.NotFound'
     msg_fmt = _("Snapshot %(snapshot_id)s could not be found.")
diff --git a/nova/tests/unit/virt/libvirt/test_driver.py b/nova/tests/unit/virt/libvirt/test_driver.py
index 48493ef5e1ff..2e02c02d559b 100644
--- a/nova/tests/unit/virt/libvirt/test_driver.py
+++ b/nova/tests/unit/virt/libvirt/test_driver.py
@@ -5108,16 +5108,25 @@ class LibvirtConnTestCase(test.NoDBTestCase):
                                                             mock_get_domain):
         drvr = libvirt_driver.LibvirtDriver(fake.FakeVirtAPI(), False)
         instance = objects.Instance(**self.test_instance)
-        mock_xml = """<domain>
+        mock_xml_with_disk = """<domain>
   <devices>
     <disk type='file'>
       <source file='/path/to/fake-volume'/>
       <target dev='vdc' bus='virtio'/>
     </disk>
   </devices>
+</domain>"""
+        mock_xml_without_disk = """<domain>
+  <devices>
+  </devices>
 </domain>"""
         mock_dom = mock.MagicMock()
-        mock_dom.XMLDesc.return_value = mock_xml
+
+        # Second time don't return anything about disk vdc so it looks removed
+        return_list = [mock_xml_with_disk, mock_xml_without_disk]
+        # Doubling the size of return list because we test with two guest power
+        # states
+        mock_dom.XMLDesc.side_effect = return_list + return_list
         connection_info = {"driver_volume_type": "fake",
                            "data": {"device_path": "/fake",
                                     "access_mode": "rw"}}
@@ -5130,7 +5139,6 @@ class LibvirtConnTestCase(test.NoDBTestCase):
         for state in (power_state.RUNNING, power_state.PAUSED):
             mock_dom.info.return_value = [state, 512, 512, 2, 1234, 5678]
             mock_get_domain.return_value = mock_dom
-
             drvr.detach_volume(connection_info, instance, '/dev/vdc')
 
             mock_get_domain.assert_called_with(instance)
@@ -5142,6 +5150,28 @@ class LibvirtConnTestCase(test.NoDBTestCase):
             mock_disconnect_volume.assert_called_with(
                 connection_info, 'vdc')
 
+    @mock.patch('nova.virt.libvirt.host.Host.get_domain')
+    def test_detach_volume_disk_not_found(self, mock_get_domain):
+        drvr = libvirt_driver.LibvirtDriver(fake.FakeVirtAPI(), False)
+        instance = objects.Instance(**self.test_instance)
+        mock_xml_without_disk = """<domain>
+  <devices>
+  </devices>
+</domain>"""
+        mock_dom = mock.MagicMock(return_value=mock_xml_without_disk)
+
+        connection_info = {"driver_volume_type": "fake",
+                           "data": {"device_path": "/fake",
+                                    "access_mode": "rw"}}
+
+        mock_dom.info.return_value = [power_state.RUNNING, 512, 512, 2, 1234,
+                                      5678]
+        mock_get_domain.return_value = mock_dom
+        self.assertRaises(exception.DiskNotFound, drvr.detach_volume,
+                          connection_info, instance, '/dev/vdc')
+
+        mock_get_domain.assert_called_once_with(instance)
+
     def test_multi_nic(self):
         network_info = _fake_network_info(self.stubs, 2)
         drvr = libvirt_driver.LibvirtDriver(fake.FakeVirtAPI(), True)
diff --git a/nova/tests/unit/virt/libvirt/test_guest.py b/nova/tests/unit/virt/libvirt/test_guest.py
index c467b6c87a94..8bf5e0f3b0df 100644
--- a/nova/tests/unit/virt/libvirt/test_guest.py
+++ b/nova/tests/unit/virt/libvirt/test_guest.py
@@ -213,6 +213,56 @@ class GuestTestCase(test.NoDBTestCase):
             "", flags=(fakelibvirt.VIR_DOMAIN_AFFECT_CONFIG |
                        fakelibvirt.VIR_DOMAIN_AFFECT_LIVE))
 
+    def test_detach_device_with_retry_detach_success(self):
+        conf = mock.Mock(spec=vconfig.LibvirtConfigGuestDevice)
+        conf.to_xml.return_value = ""
+        get_config = mock.Mock()
+        # Force multiple retries of detach
+        get_config.side_effect = [conf, conf, conf, None]
+        dev_path = "/dev/vdb"
+
+        retry_detach = self.guest.detach_device_with_retry(
+            get_config, dev_path, persistent=True, live=True,
+            inc_sleep_time=.01)
+        # Ensure we've only done the initial detach call
+        self.domain.detachDeviceFlags.assert_called_once_with(
+            "", flags=(fakelibvirt.VIR_DOMAIN_AFFECT_CONFIG |
+                       fakelibvirt.VIR_DOMAIN_AFFECT_LIVE))
+
+        get_config.assert_called_with(dev_path)
+
+        # Some time later, we can do the wait/retry to ensure detach succeeds
+        self.domain.detachDeviceFlags.reset_mock()
+        retry_detach()
+        # Should have two retries before we pretend device is detached
+        self.assertEqual(2, self.domain.detachDeviceFlags.call_count)
+
+    def test_detach_device_with_retry_detach_failure(self):
+        conf = mock.Mock(spec=vconfig.LibvirtConfigGuestDevice)
+        conf.to_xml.return_value = ""
+        # Continue to return some value for the disk config
+        get_config = mock.Mock(return_value=conf)
+
+        retry_detach = self.guest.detach_device_with_retry(
+            get_config, "/dev/vdb", persistent=True, live=True,
+            inc_sleep_time=.01, max_retry_count=3)
+        # Ensure we've only done the initial detach call
+        self.domain.detachDeviceFlags.assert_called_once_with(
+            "", flags=(fakelibvirt.VIR_DOMAIN_AFFECT_CONFIG |
+                       fakelibvirt.VIR_DOMAIN_AFFECT_LIVE))
+
+        # Some time later, we can do the wait/retry to ensure detach
+        self.domain.detachDeviceFlags.reset_mock()
+        # Should hit max # of retries
+        self.assertRaises(exception.DeviceDetachFailed, retry_detach)
+        self.assertEqual(4, self.domain.detachDeviceFlags.call_count)
+
+    def test_detach_device_with_retry_device_not_found(self):
+        get_config = mock.Mock(return_value=None)
+        self.assertRaises(
+            exception.DeviceNotFound, self.guest.detach_device_with_retry,
+            get_config, "/dev/vdb", persistent=True, live=True)
+
     def test_get_xml_desc(self):
         self.guest.get_xml_desc()
         self.domain.XMLDesc.assert_called_once_with(flags=0)
diff --git a/nova/tests/unit/virt/test_virt_drivers.py b/nova/tests/unit/virt/test_virt_drivers.py
index 5adcdfb296ab..36fac3ec1cd7 100644
--- a/nova/tests/unit/virt/test_virt_drivers.py
+++ b/nova/tests/unit/virt/test_virt_drivers.py
@@ -139,6 +139,22 @@ class _FakeDriverBackendTestCase(object):
         def fake_delete_instance_files(_self, _instance):
             pass
 
+        def fake_wait():
+            pass
+
+        def fake_detach_device_with_retry(_self, get_device_conf_func, device,
+                                          persistent, live,
+                                          max_retry_count=7,
+                                          inc_sleep_time=2,
+                                          max_sleep_time=30):
+            # Still calling detach, but instead of returning function
+            # that actually checks if device is gone from XML, just continue
+            # because XML never gets updated in these tests
+            _self.detach_device(get_device_conf_func(device),
+                                persistent=persistent,
+                                live=live)
+            return fake_wait
+
         self.stubs.Set(nova.virt.libvirt.driver.LibvirtDriver,
                        '_get_instance_disk_info',
                        fake_get_instance_disk_info)
@@ -150,6 +166,10 @@ class _FakeDriverBackendTestCase(object):
                        'delete_instance_files',
                        fake_delete_instance_files)
 
+        self.stubs.Set(nova.virt.libvirt.guest.Guest,
+                       'detach_device_with_retry',
+                       fake_detach_device_with_retry)
+
         # Like the existing fakelibvirt.migrateToURI, do nothing,
         # but don't fail for these tests.
         self.stubs.Set(nova.virt.libvirt.driver.libvirt.Domain,
diff --git a/nova/virt/libvirt/driver.py b/nova/virt/libvirt/driver.py
index d551cf242e02..1e7bc405e798 100644
--- a/nova/virt/libvirt/driver.py
+++ b/nova/virt/libvirt/driver.py
@@ -1229,13 +1229,14 @@ class LibvirtDriver(driver.ComputeDriver):
         disk_dev = mountpoint.rpartition("/")[2]
         try:
             guest = self._host.get_guest(instance)
-            conf = guest.get_disk(disk_dev)
-            if not conf:
-                raise exception.DiskNotFound(location=disk_dev)
 
             state = guest.get_power_state(self._host)
             live = state in (power_state.RUNNING, power_state.PAUSED)
-            guest.detach_device(conf, persistent=True, live=live)
+
+            wait_for_detach = guest.detach_device_with_retry(guest.get_disk,
+                                                             disk_dev,
+                                                             persistent=True,
+                                                             live=live)
 
             if encryption:
                 # The volume must be detached from the VM before
@@ -1244,12 +1245,16 @@ class LibvirtDriver(driver.ComputeDriver):
                 encryptor = self._get_volume_encryptor(connection_info,
                                                        encryption)
                 encryptor.detach_volume(**encryption)
+
+            wait_for_detach()
         except exception.InstanceNotFound:
             # NOTE(zhaoqin): If the instance does not exist, _lookup_by_name()
             #                will throw InstanceNotFound exception. Need to
             #                disconnect volume under this circumstance.
             LOG.warn(_LW("During detach_volume, instance disappeared."),
                      instance=instance)
+        except exception.DeviceNotFound:
+            raise exception.DiskNotFound(location=disk_dev)
         except libvirt.libvirtError as ex:
             # NOTE(vish): This is called to cleanup volumes after live
             # migration, so we should still disconnect even if
diff --git a/nova/virt/libvirt/guest.py b/nova/virt/libvirt/guest.py
index 263f873f0c4e..5b5f37b68d10 100644
--- a/nova/virt/libvirt/guest.py
+++ b/nova/virt/libvirt/guest.py
@@ -29,6 +29,7 @@ then used by all the other libvirt related classes
 
 from lxml import etree
 from oslo_log import log as logging
+from oslo_service import loopingcall
 from oslo_utils import encodeutils
 from oslo_utils import excutils
 from oslo_utils import importutils
@@ -277,6 +278,55 @@ class Guest(object):
             devs.append(dev)
         return devs
 
+    def detach_device_with_retry(self, get_device_conf_func, device,
+                                 persistent, live, max_retry_count=7,
+                                 inc_sleep_time=2,
+                                 max_sleep_time=30):
+        """Detaches a device from the guest. After the initial detach request,
+        a function is returned which can be used to ensure the device is
+        successfully removed from the guest domain (retrying the removal as
+        necessary).
+
+        :param get_device_conf_func: function which takes device as a parameter
+                                     and returns the configuration for device
+        :param device: device to detach
+        :param persistent: bool to indicate whether the change is
+                           persistent or not
+        :param live: bool to indicate whether it affects the guest in running
+                     state
+        :param max_retry_count: number of times the returned function will
+                                retry a detach before failing
+        :param inc_sleep_time: incremental time to sleep in seconds between
+                               detach retries
+        :param max_sleep_time: max sleep time in seconds beyond which the sleep
+                               time will not be incremented using param
+                               inc_sleep_time. On reaching this threshold,
+                               max_sleep_time will be used as the sleep time.
+        """
+
+        conf = get_device_conf_func(device)
+        if conf is None:
+            raise exception.DeviceNotFound(device=device)
+
+        self.detach_device(conf, persistent, live)
+
+        @loopingcall.RetryDecorator(max_retry_count=max_retry_count,
+                                    inc_sleep_time=inc_sleep_time,
+                                    max_sleep_time=max_sleep_time,
+                                    exceptions=exception.DeviceDetachFailed)
+        def _do_wait_and_retry_detach():
+            config = get_device_conf_func(device)
+            if config is not None:
+                # Device is already detached from persistent domain
+                # and only transient domain needs update
+                self.detach_device(config, persistent=False, live=live)
+                # Raise error since the device still existed on the guest
+                reason = _("Unable to detach from guest transient domain.")
+                raise exception.DeviceDetachFailed(device=device,
+                                                   reason=reason)
+
+        return _do_wait_and_retry_detach
+
     def detach_device(self, conf, persistent=False, live=False):
         """Detaches device to the guest.
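
Usage sketch (illustrative only, not part of the patch): how a caller is
expected to consume the new Guest.detach_device_with_retry() API, mirroring
the driver.py hunk above. The names guest, disk_dev and live stand in for the
values computed in LibvirtDriver.detach_volume(); error handling is omitted.

    # Send the initial detach request; raises DeviceNotFound if the device
    # is not present in the guest domain, otherwise returns a callable for
    # the wait/retry phase instead of blocking here.
    wait_for_detach = guest.detach_device_with_retry(guest.get_disk,
                                                     disk_dev,
                                                     persistent=True,
                                                     live=live)

    # Work that must happen between the detach request and its confirmation
    # (e.g. detaching the volume encryptor) can run here.

    # Re-check the guest XML and re-send the detach on the transient domain,
    # sleeping an increasing amount between attempts (driven by
    # oslo_service.loopingcall.RetryDecorator); raises DeviceDetachFailed
    # once max_retry_count is exceeded.
    wait_for_detach()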