From 30317e6b3f5ee5e7fd1e8c564a1d6f3e3a2fb91b Mon Sep 17 00:00:00 2001 From: Balazs Gibizer Date: Tue, 12 Jan 2021 08:23:17 +0100 Subject: [PATCH] Replace blind retry with libvirt event waiting in detach Nova so far applied a retry loop that tried to periodically detach the device from libvirt while the device was visible in the domain xml. This could lead to an issue where an already progressing detach on the libvirt side is interrupted by nova re-sending the detach request for the same device. See bug #1882521 for more information. Also, if there was both a persistent and a live domain, nova tried the detach from both in the same call. This led to confusion about the result when such a call failed. Did the detach fail partially? We can do better, at least for the live detach case. According to the libvirt developers, detaching from the persistent domain always succeeds and is a synchronous process. Detaching from the live domain can be either synchronous or asynchronous depending on the guest OS and the load on the hypervisor. But for a live detach libvirt always sends an event [1] nova can wait for. So this patch does two things. 1) Separates the detach from the persistent domain from the detach from the live domain to make the error cases clearer. 2) Changes the retry mechanism. Detaching from the persistent domain is not retried. If libvirt reports the device not found, while both persistent and live detach are needed, the error is ignored, and the process continues with the live detach. In any other case the error is considered fatal. Detaching from the live domain is changed to always wait for the libvirt event. In case of timeout, the live detach is retried. But a failure event from libvirt is considered fatal, based on the information from the libvirt developers, so in this case the detach is not retried. 
Related-Bug: #1882521 [1]https://libvirt.org/html/libvirt-libvirt-domain.html#virConnectDomainEventDeviceRemovedCallback Change-Id: I7f2b6330decb92e2838aa7cee47fb228f00f47da (cherry picked from commit e56cc4f439846558fc13298c2360d7cdd473cc89) --- nova/conf/libvirt.py | 24 + nova/tests/unit/virt/libvirt/fakelibvirt.py | 12 +- nova/tests/unit/virt/libvirt/test_driver.py | 805 ++++++++++++++++-- nova/tests/unit/virt/libvirt/test_guest.py | 207 ----- nova/tests/unit/virt/test_virt_drivers.py | 2 + nova/virt/libvirt/driver.py | 364 +++++++- nova/virt/libvirt/guest.py | 104 --- ...-based-device-detach-23ac037004d753b1.yaml | 11 + 8 files changed, 1144 insertions(+), 385 deletions(-) create mode 100644 releasenotes/notes/libvirt-event-based-device-detach-23ac037004d753b1.yaml diff --git a/nova/conf/libvirt.py b/nova/conf/libvirt.py index 80984c5a970b..5d89dbe7396f 100644 --- a/nova/conf/libvirt.py +++ b/nova/conf/libvirt.py @@ -875,6 +875,30 @@ Related options: * It's recommended to consider including ``x86_64=q35`` in :oslo.config:option:`libvirt.hw_machine_type`; see :ref:`deploying-sev-capable-infrastructure` for more on this. +"""), + cfg.IntOpt('device_detach_attempts', + default=8, + min=1, + help=""" +Maximum number of attempts the driver tries to detach a device in libvirt. + +Related options: + +* :oslo.config:option:`libvirt.device_detach_timeout` + +"""), + cfg.IntOpt('device_detach_timeout', + default=20, + min=1, + help=""" +Maximum number of seconds the driver waits for the success or the failure +event from libvirt for a given device detach attempt before it re-trigger the +detach. 
+ +Related options: + +* :oslo.config:option:`libvirt.device_detach_attempts` + """), ] diff --git a/nova/tests/unit/virt/libvirt/fakelibvirt.py b/nova/tests/unit/virt/libvirt/fakelibvirt.py index 115b490984d1..5164192f3db1 100644 --- a/nova/tests/unit/virt/libvirt/fakelibvirt.py +++ b/nova/tests/unit/virt/libvirt/fakelibvirt.py @@ -1278,9 +1278,17 @@ class Domain(object): self.attachDevice(xml) def detachDevice(self, xml): + # TODO(gibi): this should handle nics similarly to attachDevice() disk_info = _parse_disk_info(etree.fromstring(xml)) - disk_info['_attached'] = True - return disk_info in self._def['devices']['disks'] + attached_disk_info = None + for attached_disk in self._def['devices']['disks']: + if attached_disk['target_dev'] == disk_info.get('target_dev'): + attached_disk_info = attached_disk + break + + if attached_disk_info: + self._def['devices']['disks'].remove(attached_disk_info) + return attached_disk_info is not None def detachDeviceFlags(self, xml, flags): self.detachDevice(xml) diff --git a/nova/tests/unit/virt/libvirt/test_driver.py b/nova/tests/unit/virt/libvirt/test_driver.py index faa166f903e4..90eb7305b1c8 100644 --- a/nova/tests/unit/virt/libvirt/test_driver.py +++ b/nova/tests/unit/virt/libvirt/test_driver.py @@ -9299,6 +9299,7 @@ class LibvirtConnTestCase(test.NoDBTestCase, mock_build_metadata.assert_called_with(self.context, instance) mock_save.assert_called_with() + @mock.patch('threading.Event', new=mock.Mock()) @mock.patch('nova.virt.libvirt.host.Host._get_domain') def test_detach_volume_with_vir_domain_affect_live_flag(self, mock_get_domain): @@ -9319,8 +9320,12 @@ class LibvirtConnTestCase(test.NoDBTestCase, mock_dom = mock.MagicMock() # Second time don't return anything about disk vdc so it looks removed - return_list = [mock_xml_with_disk, mock_xml_without_disk, - mock_xml_without_disk] + return_list = [ + mock_xml_with_disk, # presistent domain + mock_xml_with_disk, # live domain + mock_xml_without_disk, # persistent gone + 
mock_xml_without_disk # live gone + ] # Doubling the size of return list because we test with two guest power # states mock_dom.XMLDesc.side_effect = return_list + return_list @@ -9328,25 +9333,36 @@ class LibvirtConnTestCase(test.NoDBTestCase, connection_info = {"driver_volume_type": "fake", "data": {"device_path": "/fake", "access_mode": "rw"}} - flags = (fakelibvirt.VIR_DOMAIN_AFFECT_CONFIG | - fakelibvirt.VIR_DOMAIN_AFFECT_LIVE) with mock.patch.object(drvr, '_disconnect_volume') as \ mock_disconnect_volume: for state in (power_state.RUNNING, power_state.PAUSED): + mock_dom.reset_mock() mock_dom.info.return_value = [state, 512, 512, 2, 1234, 5678] mock_get_domain.return_value = mock_dom drvr.detach_volume( self.context, connection_info, instance, '/dev/vdc') mock_get_domain.assert_called_with(instance) - call = mock_dom.detachDeviceFlags.mock_calls[0] xml = """ - """ + """ + # we expect two separate detach calls + self.assertEqual(2, mock_dom.detachDeviceFlags.call_count) + # one for the persistent domain + call = mock_dom.detachDeviceFlags.mock_calls[0] self.assertXmlEqual(xml, call.args[0]) - self.assertEqual({"flags": flags}, call.kwargs) + self.assertEqual( + {"flags": fakelibvirt.VIR_DOMAIN_AFFECT_CONFIG}, + call.kwargs) + # and one for the live domain + call = mock_dom.detachDeviceFlags.mock_calls[1] + self.assertXmlEqual(xml, call.args[0]) + self.assertEqual( + {"flags": fakelibvirt.VIR_DOMAIN_AFFECT_LIVE}, + call.kwargs) + mock_disconnect_volume.assert_called_with( self.context, connection_info, instance, encryption=None) @@ -9406,11 +9422,14 @@ class LibvirtConnTestCase(test.NoDBTestCase, mock_disconnect_volume.assert_called_once_with( self.context, connection_info, instance, encryption=encryption) + @mock.patch('nova.virt.libvirt.driver.LibvirtDriver.' 
+ '_detach_with_retry') @mock.patch('nova.virt.libvirt.driver.LibvirtDriver._get_volume_driver') @mock.patch('nova.virt.libvirt.driver.LibvirtDriver._get_volume_encryptor') @mock.patch('nova.virt.libvirt.host.Host.get_guest') def test_detach_volume_order_with_encryptors(self, mock_get_guest, - mock_get_encryptor, mock_get_volume_driver): + mock_get_encryptor, mock_get_volume_driver, mock_detach_with_retry + ): mock_volume_driver = mock.MagicMock( spec=volume_drivers.LibvirtBaseVolumeDriver) @@ -9425,7 +9444,7 @@ class LibvirtConnTestCase(test.NoDBTestCase, mock_order = mock.Mock() mock_order.attach_mock(mock_volume_driver.disconnect_volume, 'disconnect_volume') - mock_order.attach_mock(mock_guest.detach_device_with_retry(), + mock_order.attach_mock(mock_detach_with_retry, 'detach_volume') mock_order.attach_mock(mock_encryptor.detach_volume, 'detach_encryptor') @@ -9441,9 +9460,20 @@ class LibvirtConnTestCase(test.NoDBTestCase, encryption=encryption) mock_order.assert_has_calls([ - mock.call.detach_volume(), + mock.call.detach_volume( + mock_guest, + instance.uuid, + # it is functools.partial(mock_guest.get_disk, 'vdc') + # see assert below + mock.ANY, + device_name='vdc', + live=True + ), mock.call.detach_encryptor(**encryption), mock.call.disconnect_volume(connection_info, instance)]) + get_device_conf_func = mock_detach_with_retry.mock_calls[0][1][2] + self.assertEqual(mock_guest.get_disk, get_device_conf_func.func) + self.assertEqual(('vdc',), get_device_conf_func.args) def test_extend_volume(self): drvr = libvirt_driver.LibvirtDriver(fake.FakeVirtAPI(), False) @@ -15589,12 +15619,16 @@ class LibvirtConnTestCase(test.NoDBTestCase, self.assertFalse(service_mock.disabled) self.assertIsNone(service_mock.disabled_reason) + @mock.patch.object(libvirt_driver.AsyncDeviceEventsHandler, + 'cleanup_waiters') @mock.patch.object(libvirt_driver.LibvirtDriver, 'delete_instance_files') @mock.patch.object(host.Host, '_get_domain', side_effect=exception.InstanceNotFound( 
instance_id=uuids.instance)) @mock.patch.object(objects.Instance, 'save') - def test_immediate_delete(self, mock_save, mock_get, mock_delete): + def test_immediate_delete(self, mock_save, mock_get, mock_delete, + mock_cleanup_async_detach + ): drvr = libvirt_driver.LibvirtDriver(fake.FakeVirtAPI(), False) instance = objects.Instance(self.context, **self.test_instance) @@ -15603,6 +15637,7 @@ class LibvirtConnTestCase(test.NoDBTestCase, mock_get.assert_has_calls([mock.call(instance)] * 3) self.assertEqual(3, mock_get.call_count) mock_delete.assert_called_once_with(instance) + mock_cleanup_async_detach.assert_called_once_with(instance.uuid) @mock.patch.object(objects.Instance, 'get_by_uuid') @mock.patch.object(objects.Instance, 'obj_load_attr', autospec=True) @@ -22944,8 +22979,8 @@ class LibvirtDriverTestCase(test.NoDBTestCase, TraitsComparisonMixin): self._test_attach_interface( power_state.SHUTDOWN, fakelibvirt.VIR_DOMAIN_AFFECT_CONFIG) - def _test_detach_interface(self, power_state, expected_flags, - device_not_found=False): + @mock.patch('threading.Event.wait', new=mock.Mock) + def _test_detach_interface(self, state, device_not_found=False): # setup some mocks instance = self._create_instance() network_info = _fake_network_info(self) @@ -22962,7 +22997,7 @@ class LibvirtDriverTestCase(test.NoDBTestCase, TraitsComparisonMixin): """, - info=[power_state, 1, 2, 3, 4]) + info=[state, 1, 2, 3, 4]) guest = libvirt_guest.Guest(domain) expected_cfg = vconfig.LibvirtConfigGuestInterface() @@ -22975,11 +23010,28 @@ class LibvirtDriverTestCase(test.NoDBTestCase, TraitsComparisonMixin): """) if device_not_found: - # This will trigger detach_device_with_retry to raise + # This will trigger _detach_with_retry to raise # DeviceNotFound - get_interface_calls = [expected_cfg, None] + get_interface_calls = [ + expected_cfg, # detach_interface() itself gets the config + expected_cfg, # _detach_with_retry: persistent config + None, # _detach_with_retry: no device in live config + ] 
else: - get_interface_calls = [expected_cfg, expected_cfg, None, None] + if state in (power_state.RUNNING, power_state.PAUSED): + get_interface_calls = [ + expected_cfg, # detach_interface() itself gets the config + expected_cfg, # _detach_with_retry: persistent config + expected_cfg, # _detach_with_retry: live config + None, # _detach_with_retry: persistent config gone + None # _detach_with_retry: live config gone + ] + else: + get_interface_calls = [ + expected_cfg, # detach_interface() itself gets the config + expected_cfg, # _detach_with_retry: persistent config + None, # _detach_with_retry: persistent config gone + ] with test.nested( mock.patch.object(host.Host, 'get_guest', return_value=guest), @@ -23006,45 +23058,60 @@ class LibvirtDriverTestCase(test.NoDBTestCase, TraitsComparisonMixin): mock_get_config.assert_called_once_with( instance, network_info[0], test.MatchType(objects.ImageMeta), test.MatchType(objects.Flavor), CONF.libvirt.virt_type) - mock_get_interface.assert_has_calls( - [mock.call(expected_cfg) for x in range(len(get_interface_calls))]) - if device_not_found: mock_detach_device_flags.assert_not_called() self.assertTrue(mock_warning.called) + mock_get_interface.assert_has_calls( + [ + mock.call(expected_cfg), + mock.call(expected_cfg, from_persistent_config=True), + mock.call(expected_cfg), + ]) else: - mock_detach_device_flags.assert_called_once_with( - expected_cfg.to_xml(), flags=expected_flags) - mock_warning.assert_not_called() mock_get_network_info.assert_called_once_with() + if state in (power_state.RUNNING, power_state.PAUSED): + mock_detach_device_flags.assert_has_calls([ + mock.call( + expected_cfg.to_xml(), + flags=fakelibvirt.VIR_DOMAIN_AFFECT_CONFIG), + mock.call( + expected_cfg.to_xml(), + flags=fakelibvirt.VIR_DOMAIN_AFFECT_LIVE), + ]) + mock_get_interface.assert_has_calls( + [ + mock.call(expected_cfg), + mock.call(expected_cfg, from_persistent_config=True), + mock.call(expected_cfg), + mock.call(expected_cfg, 
from_persistent_config=True), + mock.call(expected_cfg), + ]) + else: + mock_detach_device_flags.assert_called_once_with( + expected_cfg.to_xml(), + flags=fakelibvirt.VIR_DOMAIN_AFFECT_CONFIG) + mock_get_interface.assert_has_calls( + [ + mock.call(expected_cfg), + mock.call(expected_cfg, from_persistent_config=True), + mock.call(expected_cfg, from_persistent_config=True), + ]) mock_unplug.assert_called_once_with(instance, network_info[0]) def test_detach_interface_with_running_instance(self): - self._test_detach_interface( - power_state.RUNNING, - expected_flags=(fakelibvirt.VIR_DOMAIN_AFFECT_CONFIG | - fakelibvirt.VIR_DOMAIN_AFFECT_LIVE)) + self._test_detach_interface(power_state.RUNNING) def test_detach_interface_with_running_instance_device_not_found(self): """Tests that the interface is detached before we try to detach it. """ - self._test_detach_interface( - power_state.RUNNING, - expected_flags=(fakelibvirt.VIR_DOMAIN_AFFECT_CONFIG | - fakelibvirt.VIR_DOMAIN_AFFECT_LIVE), - device_not_found=True) + self._test_detach_interface(power_state.RUNNING, device_not_found=True) def test_detach_interface_with_pause_instance(self): - self._test_detach_interface( - power_state.PAUSED, - expected_flags=(fakelibvirt.VIR_DOMAIN_AFFECT_CONFIG | - fakelibvirt.VIR_DOMAIN_AFFECT_LIVE)) + self._test_detach_interface(power_state.PAUSED) def test_detach_interface_with_shutdown_instance(self): - self._test_detach_interface( - power_state.SHUTDOWN, - expected_flags=(fakelibvirt.VIR_DOMAIN_AFFECT_CONFIG)) + self._test_detach_interface(power_state.SHUTDOWN) @mock.patch('nova.virt.libvirt.driver.LOG') def test_detach_interface_device_not_found(self, mock_log): @@ -23068,8 +23135,12 @@ class LibvirtDriverTestCase(test.NoDBTestCase, TraitsComparisonMixin): self.assertIn('the device is no longer found on the guest', str(mock_log.warning.call_args[0])) + @mock.patch('nova.virt.libvirt.driver.LibvirtDriver.' 
+ '_detach_with_retry') @mock.patch('nova.virt.libvirt.driver.LOG') - def test_detach_interface_guest_not_found_after_detach(self, mock_log): + def test_detach_interface_guest_not_found_after_detach( + self, mock_log, mock_detach_with_retry + ): # Asserts that we don't raise an exception when the guest is gone # after a libvirt error during detach. instance = self._create_instance() @@ -23087,13 +23158,14 @@ class LibvirtDriverTestCase(test.NoDBTestCase, TraitsComparisonMixin): error = fakelibvirt.libvirtError( 'internal error: End of file from qemu monitor') error.err = (fakelibvirt.VIR_ERR_OPERATION_FAILED,) - guest.detach_device_with_retry.side_effect = error + mock_detach_with_retry.side_effect = error self.drvr.detach_interface(self.context, instance, vif) self.assertEqual(1, mock_log.info.call_count) self.assertIn('Instance disappeared while detaching interface', mock_log.info.call_args[0][0]) get_guest_mock.assert_has_calls([mock.call(instance)] * 2) + @mock.patch('threading.Event.wait', new=mock.Mock()) @mock.patch.object(FakeVirtDomain, 'info') @mock.patch.object(FakeVirtDomain, 'detachDeviceFlags') @mock.patch.object(host.Host, '_get_domain') @@ -23132,26 +23204,46 @@ class LibvirtDriverTestCase(test.NoDBTestCase, TraitsComparisonMixin): """) - expected_flags = (fakelibvirt.VIR_DOMAIN_AFFECT_CONFIG | - fakelibvirt.VIR_DOMAIN_AFFECT_LIVE) with test.nested( - mock.patch.object(libvirt_guest.Guest, 'get_interface_by_cfg', - side_effect=[expected, expected, None, None]), + mock.patch.object( + libvirt_guest.Guest, 'get_interface_by_cfg', + side_effect=[ + expected, # detach_interface gets the config + expected, # _detach_with_retry: persistent config + expected, # _detach_with_retry: live config + None, # _detach_with_retry: persistent gone + None # _detach_with_retry: live gone + ]), mock.patch.object(self.drvr.vif_driver, 'get_config', return_value=expected), mock.patch.object(instance, 'get_network_info') ) as (mock_get_interface, mock_get_config, 
mock_get_network_info): self.drvr.detach_interface(self.context, instance, network_info[0]) - mock_get_interface.assert_has_calls([mock.call(expected)] * 3) - self.assertEqual(4, mock_get_interface.call_count) + mock_get_interface.assert_has_calls( + [ + mock.call(expected, ), + mock.call(expected, from_persistent_config=True), + mock.call(expected), + mock.call(expected, from_persistent_config=True), + mock.call(expected), + ] + ) + self.assertEqual(5, mock_get_interface.call_count) mock_get_config.assert_called_once_with( instance, network_info[0], test.MatchType(objects.ImageMeta), test.MatchType(objects.Flavor), CONF.libvirt.virt_type) mock_get_domain.assert_called_once_with(instance) mock_info.assert_called_once_with() - mock_detach.assert_called_once_with(expected.to_xml(), - flags=expected_flags) + mock_detach.assert_has_calls( + [ + mock.call( + expected.to_xml(), + flags=fakelibvirt.VIR_DOMAIN_AFFECT_CONFIG), + mock.call( + expected.to_xml(), + flags=fakelibvirt.VIR_DOMAIN_AFFECT_LIVE), + ]) mock_get_network_info.assert_called_once_with() def test_detach_interface_guest_set_metadata(self): @@ -23165,7 +23257,6 @@ class LibvirtDriverTestCase(test.NoDBTestCase, TraitsComparisonMixin): CONF.libvirt.virt_type, instance, image_meta) cfg = self.drvr._get_guest_config( instance, network_info, image_meta, disk_info) - mock_wait_for_detach = mock.Mock() config_meta = vconfig.LibvirtConfigGuestMetaNovaInstance() with test.nested( @@ -23178,15 +23269,15 @@ class LibvirtDriverTestCase(test.NoDBTestCase, TraitsComparisonMixin): mock.patch.object(guest, 'get_power_state'), mock.patch.object( instance, 'get_network_info', return_value=network_info), - mock.patch.object(guest, - 'detach_device_with_retry', return_value=mock_wait_for_detach), + mock.patch.object( + self.drvr, '_detach_with_retry'), mock.patch.object( self.drvr, '_get_guest_config_meta', return_value=config_meta), mock.patch.object(guest, 'set_metadata') ) as ( mock_get_guest, mock_get_config, 
mock_get_interface_by_cfg, mock_get_power_state, mock_get_network_info, - mock_detach_device_with_retry, mock_get_guest_config_meta, + mock_detach_with_retry, mock_get_guest_config_meta, mock_set_metadata ): self.drvr.detach_interface(self.context, instance, vif) @@ -23196,15 +23287,606 @@ class LibvirtDriverTestCase(test.NoDBTestCase, TraitsComparisonMixin): test.MatchType(objects.Flavor), CONF.libvirt.virt_type) mock_get_interface_by_cfg.assert_called_once_with(cfg) mock_get_power_state.assert_called_once_with(self.drvr._host) - mock_detach_device_with_retry.assert_called_once_with( - guest.get_interface_by_cfg, cfg, live=False, - alternative_device_name=None) - mock_wait_for_detach.assert_called_once_with() + mock_detach_with_retry.assert_called_once_with( + guest, instance.uuid, mock.ANY, live=False, + device_name=None) mock_get_network_info.assert_called_once_with() mock_get_guest_config_meta.assert_called_once_with( instance, network_info[1:]) mock_set_metadata.assert_called_once_with(config_meta) + def test__detach_with_retry_persistent_success(self): + """Test detach only from the persistent domain""" + drvr = libvirt_driver.LibvirtDriver(fake.FakeVirtAPI(), False) + mock_guest = mock.Mock(spec=libvirt_guest.Guest) + mock_guest.has_persistent_configuration.return_value = True + + mock_dev = mock.Mock(spec=vconfig.LibvirtConfigGuestDisk) + mock_dev.alias = 'virtio-disk1' + + mock_get_device_conf_func = mock.Mock( + # the first call is to get the device from the persistent domain + # the second call is to double check that the dev is gone from the + # persistent domain + side_effect=[ + mock_dev, + None, + ] + ) + + drvr._detach_with_retry( + mock_guest, + uuids.instance_uuid, + mock_get_device_conf_func, + device_name='vdb', + live=False, + ) + + mock_guest.has_persistent_configuration.assert_called_once_with() + mock_get_device_conf_func.assert_has_calls( + [ + mock.call(from_persistent_config=True), + mock.call(from_persistent_config=True), + ]) + 
mock_guest.detach_device.assert_called_once_with( + mock_dev, + persistent=True, + live=False) + # check that the internal event handling is cleaned up + self.assertEqual(set(), drvr._device_event_handler._waiters) + + def test__detach_with_retry_live_success(self): + """Test detach only from the live domain""" + drvr = libvirt_driver.LibvirtDriver(fake.FakeVirtAPI(), False) + mock_guest = mock.Mock(spec=libvirt_guest.Guest) + mock_guest.has_persistent_configuration.return_value = False + + mock_dev = mock.Mock(spec=vconfig.LibvirtConfigGuestDisk) + mock_dev.alias = 'virtio-disk1' + + mock_get_device_conf_func = mock.Mock( + # the first call is to get the device from the live domain + # the second call is to double check that the dev is gone from the + # live domain + side_effect=[ + mock_dev, + None, + ] + ) + # simulate that libvirt sends an event + mock_guest.detach_device.side_effect = ( + lambda dev, persistent, live: drvr.emit_event( + libvirtevent.DeviceRemovedEvent(uuids.instance_uuid, dev.alias) + )) + + drvr._detach_with_retry( + mock_guest, + uuids.instance_uuid, + mock_get_device_conf_func, + device_name='vdb', + live=True, + ) + + mock_guest.has_persistent_configuration.assert_called_once_with() + mock_get_device_conf_func.assert_has_calls( + [ + mock.call(), + mock.call(), + ]) + mock_guest.detach_device.assert_called_once_with( + mock_dev, + persistent=False, + live=True) + # check that the internal event handling is cleaned up + self.assertEqual(set(), drvr._device_event_handler._waiters) + + def test__detach_with_retry_persistent_and_live_success(self): + """Test detach both from the persistent and from the live domains""" + drvr = libvirt_driver.LibvirtDriver(fake.FakeVirtAPI(), False) + mock_guest = mock.Mock(spec=libvirt_guest.Guest) + mock_guest.has_persistent_configuration.return_value = True + + mock_dev = mock.Mock(spec=vconfig.LibvirtConfigGuestDisk) + mock_dev.alias = 'virtio-disk1' + + mock_get_device_conf_func = mock.Mock( + # the first 
call is to get the device from the persistent domain + # the second call is to get the device from the live domain + # the third call is to double check that the dev is gone from the + # persistent domain + # the fourth call is to double check that the dev is gone from the + # live domain + side_effect=[ + mock_dev, + mock_dev, + None, + None, + ] + ) + + # simulate that libvirt sends an event for the live detach call + # for the persistent detach call no side effect needs to be simulated + def fake_detach(dev, persistent, live): + if live: + drvr.emit_event(libvirtevent.DeviceRemovedEvent( + uuids.instance_uuid, dev.alias)) + + mock_guest.detach_device.side_effect = fake_detach + + drvr._detach_with_retry( + mock_guest, + uuids.instance_uuid, + mock_get_device_conf_func, + device_name='vdb', + live=True, + ) + + mock_guest.has_persistent_configuration.assert_called_once_with() + mock_get_device_conf_func.assert_has_calls( + [ + mock.call(from_persistent_config=True), + mock.call(), + mock.call(from_persistent_config=True), + mock.call(), + ]) + # we detach from the persistent config first and then from the live + mock_guest.detach_device.assert_has_calls( + [ + mock.call(mock_dev, persistent=True, live=False), + mock.call(mock_dev, persistent=False, live=True), + ]) + # check that the internal event handling is cleaned up + self.assertEqual(set(), drvr._device_event_handler._waiters) + + def test__detach_with_retry_persistent_but_guest_is_not(self): + """Test that a pesistent only detach is requested but the guest is + not persistent. + """ + drvr = libvirt_driver.LibvirtDriver(fake.FakeVirtAPI(), False) + mock_guest = mock.Mock(spec=libvirt_guest.Guest) + mock_guest.has_persistent_configuration.return_value = False + + # we expect that no device query or actual detach happens. 
+ mock_get_device_conf_func = mock.NonCallableMock() + mock_guest.detach_device = mock.NonCallableMock() + + drvr._detach_with_retry( + mock_guest, + uuids.instance_uuid, + mock_get_device_conf_func, + device_name='vdb', + live=False, + ) + + mock_guest.has_persistent_configuration.assert_called_once_with() + + def test__detach_with_retry_persistent_dev_not_found(self): + """Tests that expection is raised if a persistent detach is requested, + the domain is persistent but the device is not found. + """ + drvr = libvirt_driver.LibvirtDriver(fake.FakeVirtAPI(), False) + mock_guest = mock.Mock(spec=libvirt_guest.Guest) + mock_guest.has_persistent_configuration.return_value = True + + # we simulate that the device does not exists in the domain even + # before we try to detach + mock_get_device_conf_func = mock.Mock(return_value=None) + + self.assertRaises( + exception.DeviceNotFound, + drvr._detach_with_retry, + mock_guest, + uuids.instance_uuid, + mock_get_device_conf_func, + device_name='vdb', + live=False, + ) + + mock_get_device_conf_func.assert_called_once_with( + from_persistent_config=True) + mock_guest.detach_device.assert_not_called() + + def test__detach_with_retry_live_dev_not_found(self): + """Tests that expection is raised if a live detach is requested, + but the device is not found in the live domain. 
+ """ + drvr = libvirt_driver.LibvirtDriver(fake.FakeVirtAPI(), False) + mock_guest = mock.Mock(spec=libvirt_guest.Guest) + mock_guest.has_persistent_configuration.return_value = False + + # we simulate that the device does not exists in the domain even + # before we try to detach + mock_get_device_conf_func = mock.Mock(return_value=None) + + self.assertRaises( + exception.DeviceNotFound, + drvr._detach_with_retry, + mock_guest, + uuids.instance_uuid, + mock_get_device_conf_func, + device_name='vdb', + live=True, + ) + + mock_get_device_conf_func.assert_called_once_with() + mock_guest.detach_device.assert_not_called() + + def test__detach_with_retry_async_fail(self): + """Test that libvirt sends error event during detach""" + drvr = libvirt_driver.LibvirtDriver(fake.FakeVirtAPI(), False) + mock_guest = mock.Mock(spec=libvirt_guest.Guest) + # for simplycity do a live only detach + mock_guest.has_persistent_configuration.return_value = False + + mock_dev = mock.Mock(spec=vconfig.LibvirtConfigGuestDisk) + mock_dev.alias = 'virtio-disk1' + + mock_get_device_conf_func = mock.Mock(return_value=mock_dev) + + # simulate that libvirt sends an failed event + mock_guest.detach_device.side_effect = ( + lambda dev, persistent, live: drvr.emit_event( + libvirtevent.DeviceRemovalFailedEvent( + uuids.instance_uuid, dev.alias))) + + self.assertRaises( + exception.DeviceDetachFailed, + drvr._detach_with_retry, + mock_guest, + uuids.instance_uuid, + mock_get_device_conf_func, + device_name='vdb', + live=True, + ) + + mock_guest.has_persistent_configuration.assert_called_once_with() + self.assertEqual(1, mock_get_device_conf_func.call_count) + mock_guest.detach_device.assert_called_once_with( + mock_dev, persistent=False, live=True) + # check that the internal event handling is cleaned up + self.assertEqual(set(), drvr._device_event_handler._waiters) + + @mock.patch('threading.Event.wait') + def test__detach_with_retry_timeout_retry_succeeds(self, mock_event_wait): + """Test that that 
a live detach times out while waiting for the libvirt + event but then the retry succeeds. + """ + drvr = libvirt_driver.LibvirtDriver(fake.FakeVirtAPI(), False) + mock_guest = mock.Mock(spec=libvirt_guest.Guest) + # for simplycity do a live only detach + mock_guest.has_persistent_configuration.return_value = False + + mock_dev = mock.Mock(spec=vconfig.LibvirtConfigGuestDisk) + mock_dev.alias = 'virtio-disk1' + + mock_get_device_conf_func = mock.Mock( + # the first call is to get the device from the live domain + # the second call is to double check that the dev is gone from the + # live domain after the first detach attempt + # the third call is to double check that the dev is gone from the + # live domain after the second detach attempt + side_effect=[ + mock_dev, + mock_dev, + None, + ] + ) + # By mocking threading.Event.wait we prevent the test to wait until the + # timeout happens, and by returning False first we simulate to the + # caller that the wait returned not becasuse the event is set but + # because timeout happend. Then during the retry we return True + # signalling that the event is set, i.e. 
the libvirt event the caller + # is waiting for has been received + mock_event_wait.side_effect = [False, True] + + drvr._detach_with_retry( + mock_guest, + uuids.instance_uuid, + mock_get_device_conf_func, + device_name='vdb', + live=True, + ) + + mock_guest.has_persistent_configuration.assert_called_once_with() + # the 3 calls has explained above where the return value is defined + self.assertEqual(3, mock_get_device_conf_func.call_count) + mock_guest.detach_device.assert_has_calls( + [ + mock.call(mock_dev, persistent=False, live=True), + mock.call(mock_dev, persistent=False, live=True), + ] + ) + # check that the internal event handling is cleaned up + self.assertEqual(set(), drvr._device_event_handler._waiters) + + @mock.patch('threading.Event.wait') + def test__detach_with_retry_timeout_retry_unplug_in_progress( + self, mock_event_wait + ): + """Test that that a live detach times out while waiting for the libvirt + event but then the retry gets a unplug already in progress error from + libvirt, which it ignores, then the detach finishes and the event is + received. 
+ """ + drvr = libvirt_driver.LibvirtDriver(fake.FakeVirtAPI(), False) + mock_guest = mock.Mock(spec=libvirt_guest.Guest) + # for simplicity do a live only detach + mock_guest.has_persistent_configuration.return_value = False + + mock_dev = mock.Mock(spec=vconfig.LibvirtConfigGuestDisk) + mock_dev.alias = 'virtio-disk1' + + mock_get_device_conf_func = mock.Mock( + # the first call is to get the device from the live domain + # the second call is to double check that the dev is gone from the + # live domain after the first detach attempt + # the third call is to double check that the dev is gone from the + # live domain after the second detach attempt + side_effect=[ + mock_dev, + mock_dev, + None, + ] + ) + # By mocking threading.Event.wait we prevent the test to wait until the + # timeout happens, and by returning False first we simulate to the + # caller that the wait returned not because the event is set but + # because timeout happened. Then during the retry we return True + # signalling that the event is set, i.e. the libvirt event the caller + # is waiting for has been received + mock_event_wait.side_effect = [False, True] + + # there will be two detach attempts + # 1) simulate that synchronous parts of the detach succeeds + # 2) return an error as the previous detach attempts is still ongoing + def fake_detach_device(dev, persistent, live): + if mock_guest.detach_device.call_count == 0: + return + else: + # In reality these events are in reverse order. However + # simulating that would require parallelism in the test. From + # the perspective of the code under test the call order does + # not matter as the call to detach_device is synchronous. 
+ drvr.emit_event(
+ libvirtevent.DeviceRemovedEvent(
+ uuids.instance_uuid, dev.alias))
+ raise fakelibvirt.make_libvirtError(
+ fakelibvirt.libvirtError,
+ msg='error',
+ error_message='already in the process of unplug',
+ error_code=fakelibvirt.VIR_ERR_INTERNAL_ERROR)
+
+ mock_guest.detach_device.side_effect = fake_detach_device
+
+ drvr._detach_with_retry(
+ mock_guest,
+ uuids.instance_uuid,
+ mock_get_device_conf_func,
+ device_name='vdb',
+ live=True,
+ )
+
+ mock_guest.has_persistent_configuration.assert_called_once_with()
+ # the 3 calls are explained above where the return value is defined
+ self.assertEqual(3, mock_get_device_conf_func.call_count)
+ mock_guest.detach_device.assert_has_calls(
+ [
+ mock.call(mock_dev, persistent=False, live=True),
+ mock.call(mock_dev, persistent=False, live=True),
+ ]
+ )
+ # check that the internal event handling is cleaned up
+ self.assertEqual(set(), drvr._device_event_handler._waiters)
+
+ @mock.patch('threading.Event.wait')
+ def test__detach_with_retry_timeout_run_out_of_retries(
+ self, mock_event_wait
+ ):
+ """Test that a live detach times out while waiting for the libvirt
+ event at every attempt so the driver runs out of retry attempts.
+ """
+ # decreased the number to simplify the test
+ self.flags(group='libvirt', device_detach_attempts=2)
+
+ drvr = libvirt_driver.LibvirtDriver(fake.FakeVirtAPI(), False)
+ mock_guest = mock.Mock(spec=libvirt_guest.Guest)
+ # for simplicity do a live only detach
+ mock_guest.has_persistent_configuration.return_value = False
+
+ mock_dev = mock.Mock(spec=vconfig.LibvirtConfigGuestDisk)
+ mock_dev.alias = 'virtio-disk1'
+
+ mock_get_device_conf_func = mock.Mock(return_value=mock_dev)
+
+ # By mocking threading.Event.wait we prevent the test from waiting
+ # until the timeout happens, and by returning False we simulate to the
+ # caller that the wait returned not because the event is set but
+ # because timeout happened.
+ mock_event_wait.return_value = False + + self.assertRaises( + exception.DeviceDetachFailed, + drvr._detach_with_retry, + mock_guest, + uuids.instance_uuid, + mock_get_device_conf_func, + device_name='vdb', + live=True, + ) + + mock_guest.has_persistent_configuration.assert_called_once_with() + # the first call is to get the dev we detach, then there is one call + # after each retry attempt to check if the device is gone + self.assertEqual(3, mock_get_device_conf_func.call_count) + mock_guest.detach_device.assert_has_calls( + [ + mock.call(mock_dev, persistent=False, live=True), + mock.call(mock_dev, persistent=False, live=True), + ] + ) + # check that the internal event handling is cleaned up + self.assertEqual(set(), drvr._device_event_handler._waiters) + + def test__detach_with_retry_libvirt_reports_not_found_live_detach_done( + self + ): + """Test that libvirt reports that the device is not found in the + persistent domain but as live detach is also requested the original + error is just logged and the live detach is attempted. + """ + drvr = libvirt_driver.LibvirtDriver(fake.FakeVirtAPI(), False) + mock_guest = mock.Mock(spec=libvirt_guest.Guest) + mock_guest.has_persistent_configuration.return_value = True + + mock_dev = mock.Mock(spec=vconfig.LibvirtConfigGuestDisk) + mock_dev.alias = 'virtio-disk1' + + mock_get_device_conf_func = mock.Mock( + # the first call is to get the device from the persistent domain + # the second call is to get the device from the live domain + # the third call is to double check that the dev is gone from the + # live domain + # NOTE(gibi): as the persistent detach fails with exception the + # driver does not double check that the device is removed but + # simply move to the live detach + side_effect=[ + mock_dev, + mock_dev, + None, + ] + ) + + # there will be two detach attempts: + # 1) with persistent=True: simulate that libvirt fails synchronosuly + # signalling that the device is not found. 
+ # 2) with persistent=False: simulate that detach succeeds by emitting
+ # the libvirt event
+ def fake_detach_device(dev, persistent, live):
+ if persistent:
+ raise fakelibvirt.make_libvirtError(
+ fakelibvirt.libvirtError,
+ msg='error',
+ error_code=fakelibvirt.VIR_ERR_DEVICE_MISSING)
+ else:
+ drvr.emit_event(
+ libvirtevent.DeviceRemovedEvent(
+ uuids.instance_uuid, dev.alias))
+
+ mock_guest.detach_device.side_effect = fake_detach_device
+
+ drvr._detach_with_retry(
+ mock_guest,
+ uuids.instance_uuid,
+ mock_get_device_conf_func,
+ device_name='vdb',
+ live=True,
+ )
+
+ mock_guest.has_persistent_configuration.assert_called_once_with()
+ mock_get_device_conf_func.assert_has_calls(
+ [
+ mock.call(from_persistent_config=True),
+ mock.call(),
+ mock.call(),
+ ])
+ mock_guest.detach_device.assert_has_calls(
+ [
+ mock.call(mock_dev, persistent=True, live=False),
+ mock.call(mock_dev, persistent=False, live=True),
+ ]
+ )
+ # check that the internal event handling is cleaned up
+ self.assertEqual(set(), drvr._device_event_handler._waiters)
+
+ def test__detach_with_retry_libvirt_reports_not_found_give_up(self):
+ """Test that libvirt reports that the device is not found in the
+ persistent domain and as live detach is not requested the driver simply
+ bubbles up the exception.
+ """ + drvr = libvirt_driver.LibvirtDriver(fake.FakeVirtAPI(), False) + mock_guest = mock.Mock(spec=libvirt_guest.Guest) + mock_guest.has_persistent_configuration.return_value = True + + mock_dev = mock.Mock(spec=vconfig.LibvirtConfigGuestDisk) + mock_dev.alias = 'virtio-disk1' + + mock_get_device_conf_func = mock.Mock( + # the first call is to get the device from the persistent domain + # NOTE(gibi): as the persistent detach fails with exception the + # driver does not double check that the device is removed but + # simply gives up + side_effect=[ + mock_dev, + ] + ) + + mock_guest.detach_device.side_effect = fakelibvirt.make_libvirtError( + fakelibvirt.libvirtError, + msg='error', + error_code=fakelibvirt.VIR_ERR_DEVICE_MISSING) + + self.assertRaises( + exception.DeviceNotFound, + drvr._detach_with_retry, + mock_guest, + uuids.instance_uuid, + mock_get_device_conf_func, + device_name='vdb', + live=False, + ) + + mock_guest.has_persistent_configuration.assert_called_once_with() + mock_get_device_conf_func.assert_called_once_with( + from_persistent_config=True) + mock_guest.detach_device.assert_called_once_with( + mock_dev, persistent=True, live=False) + + def test__detach_with_retry_other_sync_libvirt_error(self): + """Test that libvirt reports non device related error during detach + from persistent config which causes that the driver gives up + regardless of the fact that live detach also requested. 
+ """ + drvr = libvirt_driver.LibvirtDriver(fake.FakeVirtAPI(), False) + mock_guest = mock.Mock(spec=libvirt_guest.Guest) + mock_guest.has_persistent_configuration.return_value = True + + mock_dev = mock.Mock(spec=vconfig.LibvirtConfigGuestDisk) + mock_dev.alias = 'virtio-disk1' + + mock_get_device_conf_func = mock.Mock( + # the first call is to get the device from the persistent domain + # the second all is to get the device from the live domain + side_effect=[ + mock_dev, + mock_dev, + ] + ) + mock_guest.detach_device.side_effect = fakelibvirt.make_libvirtError( + fakelibvirt.libvirtError, + msg='error', + error_code=fakelibvirt.VIR_ERR_NO_DOMAIN) + + self.assertRaises( + fakelibvirt.libvirtError, + drvr._detach_with_retry, + mock_guest, + uuids.instance_uuid, + mock_get_device_conf_func, + device_name='vdb', + live=True, + ) + + mock_guest.has_persistent_configuration.assert_called_once_with() + mock_get_device_conf_func.assert_has_calls( + [ + mock.call(from_persistent_config=True), + mock.call(), + ]) + # NOTE(gibi): the drive does not attempt a live detach even though + # it was requested + mock_guest.detach_device.assert_called_once_with( + mock_dev, persistent=True, live=False) + @mock.patch('nova.objects.block_device.BlockDeviceMapping.save', new=mock.Mock()) @mock.patch('nova.objects.image_meta.ImageMeta.from_image_ref') @@ -27326,19 +28008,22 @@ class LibvirtDeviceRemoveEventTestCase(test.NoDBTestCase): super().setUp() self.useFixture(fakelibvirt.FakeLibvirtFixture()) - @mock.patch.object(libvirt_driver.LOG, 'debug') + @mock.patch.object(libvirt_driver.LOG, 'warning') @mock.patch('nova.virt.driver.ComputeDriver.emit_event') @ddt.data( libvirtevent.DeviceRemovedEvent, libvirtevent.DeviceRemovalFailedEvent) def test_libvirt_device_removal_events( - self, event_type, mock_base_handles, mock_debug + self, event_type, mock_base_handles, mock_warning ): drvr = libvirt_driver.LibvirtDriver(fake.FakeVirtAPI(), True) event = event_type(uuid=uuids.event, 
dev=mock.sentinel.dev_alias) drvr.emit_event(event) mock_base_handles.assert_not_called() - mock_debug.assert_not_called() + self.assertEqual(1, len(mock_warning.mock_calls)) + self.assertIn( + "the driver is not waiting for it; ignored.", + mock_warning.mock_calls[0][1][0]) class AsyncDeviceEventsHandlerTestCase(test.NoDBTestCase): diff --git a/nova/tests/unit/virt/libvirt/test_guest.py b/nova/tests/unit/virt/libvirt/test_guest.py index f09c8ae71db1..87b189882581 100644 --- a/nova/tests/unit/virt/libvirt/test_guest.py +++ b/nova/tests/unit/virt/libvirt/test_guest.py @@ -21,7 +21,6 @@ from oslo_service import fixture as service_fixture from oslo_utils import encodeutils from nova import context -from nova import exception from nova import test from nova.tests.unit.virt.libvirt import fakelibvirt from nova.virt.libvirt import config as vconfig @@ -213,212 +212,6 @@ class GuestTestCase(test.NoDBTestCase): "", flags=(fakelibvirt.VIR_DOMAIN_AFFECT_CONFIG | fakelibvirt.VIR_DOMAIN_AFFECT_LIVE)) - def test_detach_device_with_retry_from_transient_domain(self): - conf = mock.Mock(spec=vconfig.LibvirtConfigGuestDevice) - conf.to_xml.return_value = "" - get_config = mock.Mock() - get_config.side_effect = [conf, conf, conf, None, None] - dev_path = "/dev/vdb" - self.domain.isPersistent.return_value = False - retry_detach = self.guest.detach_device_with_retry( - get_config, dev_path, live=True, inc_sleep_time=.01) - self.domain.detachDeviceFlags.assert_called_once_with( - "", flags=fakelibvirt.VIR_DOMAIN_AFFECT_LIVE) - self.domain.detachDeviceFlags.reset_mock() - retry_detach() - self.assertEqual(1, self.domain.detachDeviceFlags.call_count) - - def test_detach_device_with_retry_detach_success(self): - conf = mock.Mock(spec=vconfig.LibvirtConfigGuestDevice) - conf.to_xml.return_value = "" - get_config = mock.Mock() - # Force multiple retries of detach - get_config.side_effect = [conf, conf, conf, conf, conf, None, None] - dev_path = "/dev/vdb" - 
self.domain.isPersistent.return_value = True - - retry_detach = self.guest.detach_device_with_retry( - get_config, dev_path, live=True, inc_sleep_time=.01) - # Ensure we've only done the initial detach call - self.domain.detachDeviceFlags.assert_called_once_with( - "", flags=(fakelibvirt.VIR_DOMAIN_AFFECT_CONFIG | - fakelibvirt.VIR_DOMAIN_AFFECT_LIVE)) - - get_config.assert_called_with(dev_path) - - # Some time later, we can do the wait/retry to ensure detach succeeds - self.domain.detachDeviceFlags.reset_mock() - retry_detach() - # Should have two retries before we pretend device is detached - self.assertEqual(2, self.domain.detachDeviceFlags.call_count) - - def test_detach_device_with_retry_detach_failure(self): - conf = mock.Mock(spec=vconfig.LibvirtConfigGuestDevice) - conf.to_xml.return_value = "" - # Continue to return some value for the disk config - get_config = mock.Mock(return_value=conf) - self.domain.isPersistent.return_value = True - - retry_detach = self.guest.detach_device_with_retry( - get_config, "/dev/vdb", live=True, inc_sleep_time=.01, - max_retry_count=3) - # Ensure we've only done the initial detach call - self.domain.detachDeviceFlags.assert_called_once_with( - "", flags=(fakelibvirt.VIR_DOMAIN_AFFECT_CONFIG | - fakelibvirt.VIR_DOMAIN_AFFECT_LIVE)) - - # Some time later, we can do the wait/retry to ensure detach - self.domain.detachDeviceFlags.reset_mock() - # Should hit max # of retries - self.assertRaises(exception.DeviceDetachFailed, retry_detach) - self.assertEqual(4, self.domain.detachDeviceFlags.call_count) - - def test_detach_device_with_retry_device_not_found(self): - get_config = mock.Mock(return_value=None) - self.domain.isPersistent.return_value = True - ex = self.assertRaises( - exception.DeviceNotFound, self.guest.detach_device_with_retry, - get_config, "/dev/vdb", live=True) - self.assertIn("/dev/vdb", str(ex)) - - def test_detach_device_with_retry_device_not_found_alt_name(self): - """Tests to make sure we use the alternative 
name in errors.""" - get_config = mock.Mock(return_value=None) - self.domain.isPersistent.return_value = True - ex = self.assertRaises( - exception.DeviceNotFound, self.guest.detach_device_with_retry, - get_config, mock.sentinel.device, live=True, - alternative_device_name='foo') - self.assertIn('foo', str(ex)) - - @mock.patch.object(libvirt_guest.Guest, "detach_device") - def _test_detach_device_with_retry_second_detach_failure( - self, mock_detach, error_code=None, error_message=None, - supports_device_missing=False): - # This simulates a retry of the transient/live domain detach - # failing because the device is not found - conf = mock.Mock(spec=vconfig.LibvirtConfigGuestDevice) - conf.to_xml.return_value = "" - self.domain.isPersistent.return_value = True - - get_config = mock.Mock(return_value=conf) - fake_device = "vdb" - fake_exc = fakelibvirt.make_libvirtError( - fakelibvirt.libvirtError, "", - error_message=error_message, - error_code=error_code, - error_domain=fakelibvirt.VIR_FROM_DOMAIN) - mock_detach.side_effect = [None, fake_exc] - retry_detach = self.guest.detach_device_with_retry( - get_config, fake_device, live=True, - inc_sleep_time=.01, max_retry_count=3) - # Some time later, we can do the wait/retry to ensure detach - self.assertRaises(exception.DeviceNotFound, retry_detach) - # Check that the save_and_reraise_exception context manager didn't log - # a traceback when the libvirtError was caught and DeviceNotFound was - # raised. 
- self.assertNotIn('Original exception being dropped', - self.stdlog.logger.output) - - def test_detach_device_with_retry_second_detach_device_missing(self): - self._test_detach_device_with_retry_second_detach_failure( - error_code=fakelibvirt.VIR_ERR_DEVICE_MISSING, - error_message="device not found: disk vdb not found", - supports_device_missing=True) - - def _test_detach_device_with_retry_first_detach_failure( - self, error_code=None, error_message=None, - supports_device_missing=False): - # This simulates a persistent or live domain detach failing because the - # device is not found during the first attempt to detach the device. - # We should still attempt to detach the device from the live config if - # the detach from persistent failed OR we should retry the detach from - # the live config if the first detach from live config failed. - # Note that the side effects in this test case [fake_exc, None] could - # not happen in real life if the first detach failed because the detach - # from live raised not found. In real life, the second attempt to - # detach from live would raise not found again because the device is - # not present. The purpose of this test is to verify that we try to - # detach a second time if the first detach fails, so we are OK with the - # unrealistic side effects for detach from live failing the first time. 
- conf = mock.Mock(spec=vconfig.LibvirtConfigGuestDevice) - conf.to_xml.return_value = "" - self.domain.isPersistent.return_value = True - - get_config = mock.Mock() - # Simulate an inactive or live detach attempt which fails (not found) - # followed by a live config detach attempt that is successful - get_config.side_effect = [conf, conf, conf, None, None] - fake_device = "vdb" - fake_exc = fakelibvirt.make_libvirtError( - fakelibvirt.libvirtError, "", - error_message=error_message, - error_code=error_code, - error_domain=fakelibvirt.VIR_FROM_DOMAIN) - # Detach from persistent or live raises not found, detach from live - # succeeds afterward - self.domain.detachDeviceFlags.side_effect = [fake_exc, None] - retry_detach = self.guest.detach_device_with_retry(get_config, - fake_device, live=True, inc_sleep_time=.01, max_retry_count=3) - # We should have tried to detach from the persistent domain - self.domain.detachDeviceFlags.assert_called_once_with( - "", flags=(fakelibvirt.VIR_DOMAIN_AFFECT_CONFIG | - fakelibvirt.VIR_DOMAIN_AFFECT_LIVE)) - # During the retry detach, should detach from the live domain - self.domain.detachDeviceFlags.reset_mock() - retry_detach() - # We should have tried to detach from the live domain - self.domain.detachDeviceFlags.assert_called_once_with( - "", flags=fakelibvirt.VIR_DOMAIN_AFFECT_LIVE) - - def test_detach_device_with_retry_first_detach_device_missing(self): - self._test_detach_device_with_retry_first_detach_failure( - error_code=fakelibvirt.VIR_ERR_DEVICE_MISSING, - error_message="device not found: disk vdb not found", - supports_device_missing=True) - - def test_detach_device_with_already_in_process_of_unplug_error(self): - # Assert that DeviceNotFound is raised when encountering - # https://bugzilla.redhat.com/show_bug.cgi?id=1878659 - # This is raised as QEMU returns a VIR_ERR_INTERNAL_ERROR when - # a request to device_del is made while another is about to complete. 
- - self.domain.isPersistent.return_value = True - conf = mock.Mock(spec=vconfig.LibvirtConfigGuestDevice) - conf.to_xml.return_value = "" - - existing_unplug_exc = fakelibvirt.make_libvirtError( - fakelibvirt.libvirtError, "", - error_message='device vdb is already in the process of unplug', - error_code=fakelibvirt.VIR_ERR_INTERNAL_ERROR, - error_domain=fakelibvirt.VIR_FROM_DOMAIN - ) - device_missing_exc = fakelibvirt.make_libvirtError( - fakelibvirt.libvirtError, "", - error_message='device not found: disk vdb not found', - error_code=fakelibvirt.VIR_ERR_DEVICE_MISSING, - error_domain=fakelibvirt.VIR_FROM_DOMAIN - ) - - # Raise VIR_ERR_INTERNAL_ERROR on the second call before raising - # VIR_ERR_DEVICE_MISSING to mock the first call successfully detaching - # the device asynchronously. - self.domain.detachDeviceFlags.side_effect = [ - None, - existing_unplug_exc, - device_missing_exc - ] - - retry_detach = self.guest.detach_device_with_retry( - mock.Mock(return_value=conf), - 'vdb', - live=True, - inc_sleep_time=.01 - ) - - # Assert that we raise exception.DeviceNotFound - self.assertRaises(exception.DeviceNotFound, retry_detach) - def test_get_xml_desc(self): self.guest.get_xml_desc() self.domain.XMLDesc.assert_called_once_with(flags=0) diff --git a/nova/tests/unit/virt/test_virt_drivers.py b/nova/tests/unit/virt/test_virt_drivers.py index 54c587bfdf70..9d3ef54e8975 100644 --- a/nova/tests/unit/virt/test_virt_drivers.py +++ b/nova/tests/unit/virt/test_virt_drivers.py @@ -416,6 +416,7 @@ class _VirtDriverTestCase(_FakeDriverBackendTestCase): self.assertEqual(storage_ip, result['ip']) @catch_notimplementederror + @mock.patch('threading.Event.wait', new=mock.Mock()) @mock.patch.object(libvirt.driver.LibvirtDriver, '_build_device_metadata', return_value=objects.InstanceDeviceMetadata()) def test_attach_detach_volume(self, _): @@ -452,6 +453,7 @@ class _VirtDriverTestCase(_FakeDriverBackendTestCase): '/dev/sda', 2)) @catch_notimplementederror + 
@mock.patch('threading.Event.wait', new=mock.Mock())
 @mock.patch.object(libvirt.driver.LibvirtDriver, '_build_device_metadata',
 return_value=objects.InstanceDeviceMetadata())
 def test_attach_detach_different_power_states(self, _):
diff --git a/nova/virt/libvirt/driver.py b/nova/virt/libvirt/driver.py
index 8ee8757e40cc..747489aeeb62 100644
--- a/nova/virt/libvirt/driver.py
+++ b/nova/virt/libvirt/driver.py
@@ -504,6 +504,10 @@ class LibvirtDriver(driver.ComputeDriver):
 self.pgpu_type_mapping = collections.defaultdict(str)
 self.supported_vgpu_types = self._get_supported_vgpu_types()
+ # Handles ongoing device manipulation in libvirt where we wait for the
+ # events about success or failure.
+ self._device_event_handler = AsyncDeviceEventsHandler()
+
 def _discover_vpmems(self, vpmem_conf=None):
 """Discover vpmems on host and configuration.
@@ -1400,6 +1404,9 @@ class LibvirtDriver(driver.ComputeDriver):
 def destroy(self, context, instance, network_info, block_device_info=None,
 destroy_disks=True):
 self._destroy(instance)
+ # NOTE(gibi): if there was a device detach in progress then we need to
+ # unblock the waiting threads and clean up.
+ self._device_event_handler.cleanup_waiters(instance.uuid) self.cleanup(context, instance, network_info, block_device_info, destroy_disks) @@ -2189,8 +2196,20 @@ class LibvirtDriver(driver.ComputeDriver): # These are libvirt specific events handled here on the driver # level instead of propagating them to the compute manager level if isinstance(event, libvirtevent.DeviceEvent): - # TODO(gibi): handle it - pass + had_clients = self._device_event_handler.notify_waiters(event) + + if had_clients: + LOG.debug( + "Received event %s from libvirt while the driver is " + "waiting for it; dispatched.", + event, + ) + else: + LOG.warning( + "Received event %s from libvirt but the driver is not " + "waiting for it; ignored.", + event, + ) else: LOG.debug( "Received event %s from libvirt but no handler is " @@ -2201,6 +2220,321 @@ class LibvirtDriver(driver.ComputeDriver): # manager super().emit_event(event) + def _detach_with_retry( + self, + guest: libvirt_guest.Guest, + instance_uuid: str, + # to properly typehint this param we would need typing.Protocol but + # that is only available since python 3.8 + get_device_conf_func: ty.Callable, + device_name: str, + live: bool, + ) -> None: + """Detaches a device from the guest + + If live detach is requested then this call will wait for the libvirt + event signalling the end of the detach process. + + If the live detach times out then it will retry the detach. 
Detach from + the persistent config is not retried as it is: + + * synchronous and no event is sent from libvirt + * it is always expected to succeed if the device is in the domain + config + + :param guest: the guest we are detach the device from + :param instance_uuid: the UUID of the instance we are detaching the + device from + :param get_device_conf_func: function which returns the configuration + for device from the domain, having one optional boolean parameter + `from_persistent_config` to select which domain config to query + :param device_name: This is the name of the device used solely for + error messages. Note that it is not the same as the device alias + used by libvirt to identify the device. + :param live: bool to indicate whether it affects the guest in running + state. If live is True then the device is detached from both the + persistent and the live config. If live is False the device only + detached from the persistent config. + :raises exception.DeviceNotFound: if the device does not exist in the + domain even before we try to detach or if libvirt reported that the + device is missing from the domain synchronously. + :raises exception.DeviceDetachFailed: if libvirt reported error during + detaching from the live domain or we timed out waiting for libvirt + events and run out of retries + :raises libvirt.libvirtError: for any other errors reported by libvirt + synchronously. 
+ """ + persistent = guest.has_persistent_configuration() + + if not persistent and not live: + # nothing to do + return + + persistent_dev = None + if persistent: + persistent_dev = get_device_conf_func(from_persistent_config=True) + + live_dev = None + if live: + live_dev = get_device_conf_func() + + if live and live_dev is None: + # caller requested a live detach but device is not present + raise exception.DeviceNotFound(device=device_name) + + if not live and persistent_dev is None: + # caller requested detach from the persistent domain but device is + # not present + raise exception.DeviceNotFound(device=device_name) + + if persistent_dev: + try: + self._detach_from_persistent( + guest, instance_uuid, persistent_dev, get_device_conf_func, + device_name) + except exception.DeviceNotFound: + if live: + # ignore the error so that we can do the live detach + LOG.warning( + 'Libvirt reported sync error while detaching ' + 'device %s from instance %s from the persistent ' + 'domain config. 
Ignoring the error to proceed with ' + 'live detach as that was also requested.', + device_name, instance_uuid) + else: + # if only persistent detach was requested then give up + raise + + if live and live_dev: + self._detach_from_live_with_retry( + guest, instance_uuid, live_dev, get_device_conf_func, + device_name) + + def _detach_from_persistent( + self, + guest: libvirt_guest.Guest, + instance_uuid: str, + persistent_dev: ty.Union[ + vconfig.LibvirtConfigGuestDisk, + vconfig.LibvirtConfigGuestInterface], + get_device_conf_func, + device_name: str, + ): + LOG.debug( + 'Attempting to detach device %s from instance %s from ' + 'the persistent domain config.', device_name, instance_uuid) + + self._detach_sync( + persistent_dev, guest, instance_uuid, device_name, + persistent=True, live=False) + + # make sure the dev is really gone + persistent_dev = get_device_conf_func( + from_persistent_config=True) + if not persistent_dev: + LOG.info( + 'Successfully detached device %s from instance %s ' + 'from the persistent domain config.', + device_name, instance_uuid) + else: + # Based on the libvirt devs this should never happen + LOG.warning( + 'Failed to detach device %s from instance %s ' + 'from the persistent domain config. 
Libvirt did not ' + 'report any error but the device is still in the ' + 'config.', device_name, instance_uuid) + + def _detach_from_live_with_retry( + self, + guest: libvirt_guest.Guest, + instance_uuid: str, + live_dev: ty.Union[ + vconfig.LibvirtConfigGuestDisk, + vconfig.LibvirtConfigGuestInterface], + get_device_conf_func, + device_name: str, + ): + max_attempts = CONF.libvirt.device_detach_attempts + for attempt in range(max_attempts): + LOG.debug( + '(%s/%s): Attempting to detach device %s with device ' + 'alias %s from instance %s from the live domain config.', + attempt + 1, max_attempts, device_name, live_dev.alias, + instance_uuid) + + self._detach_from_live_and_wait_for_event( + live_dev, guest, instance_uuid, device_name) + + # make sure the dev is really gone + live_dev = get_device_conf_func() + if not live_dev: + LOG.info( + 'Successfully detached device %s from instance %s ' + 'from the live domain config.', device_name, instance_uuid) + # we are done + return + + LOG.debug( + 'Failed to detach device %s with device alias %s from ' + 'instance %s from the live domain config. Libvirt did not ' + 'report any error but the device is still in the config.', + device_name, live_dev.alias, instance_uuid) + + msg = ( + 'Run out of retry while detaching device %s with device ' + 'alias %s from instance %s from the live domain config. ' + 'Device is still attached to the guest.') + LOG.error(msg, device_name, live_dev.alias, instance_uuid) + raise exception.DeviceDetachFailed( + device=device_name, + reason=msg % (device_name, live_dev.alias, instance_uuid)) + + def _detach_from_live_and_wait_for_event( + self, + dev: ty.Union[ + vconfig.LibvirtConfigGuestDisk, + vconfig.LibvirtConfigGuestInterface], + guest: libvirt_guest.Guest, + instance_uuid: str, + device_name: str, + ) -> None: + """Detaches a device from the live config of the guest and waits for + the libvirt event singling the finish of the detach. 
+ + :param dev: the device configuration to be detached + :param guest: the guest we are detach the device from + :param instance_uuid: the UUID of the instance we are detaching the + device from + :param device_name: This is the name of the device used solely for + error messages. + :raises exception.DeviceNotFound: if libvirt reported that the device + is missing from the domain synchronously. + :raises libvirt.libvirtError: for any other errors reported by libvirt + synchronously. + :raises DeviceDetachFailed: if libvirt sent DeviceRemovalFailedEvent + """ + # So we will issue an detach to libvirt and we will wait for an + # event from libvirt about the result. We need to set up the event + # handling before the detach to avoid missing the event if libvirt + # is really fast + # NOTE(gibi): we need to use the alias name of the device as that + # is what libvirt will send back to us in the event + waiter = self._device_event_handler.create_waiter( + instance_uuid, dev.alias, + {libvirtevent.DeviceRemovedEvent, + libvirtevent.DeviceRemovalFailedEvent}) + try: + self._detach_sync( + dev, guest, instance_uuid, device_name, persistent=False, + live=True) + except Exception: + # clean up the libvirt event handler as we failed synchronously + self._device_event_handler.delete_waiter(waiter) + raise + + LOG.debug( + 'Start waiting for the detach event from libvirt for ' + 'device %s with device alias %s for instance %s', + device_name, dev.alias, instance_uuid) + # We issued the detach without any exception so we can wait for + # a libvirt event to arrive to notify us about the result + # NOTE(gibi): we expect that this call will be unblocked by an + # incoming libvirt DeviceRemovedEvent or DeviceRemovalFailedEvent + event = self._device_event_handler.wait( + waiter, timeout=CONF.libvirt.device_detach_timeout) + + if not event: + # This should not happen based on information from the libvirt + # developers. 
But it does at least during the cleanup of the + # tempest test case + # ServerRescueNegativeTestJSON.test_rescued_vm_detach_volume + # Log a warning and let the upper layer detect that the device is + # still attached and retry + LOG.error( + 'Waiting for libvirt event about the detach of ' + 'device %s with device alias %s from instance %s is timed ' + 'out.', device_name, dev.alias, instance_uuid) + + if isinstance(event, libvirtevent.DeviceRemovalFailedEvent): + # Based on the libvirt developers this signals a permanent failure + LOG.error( + 'Received DeviceRemovalFailedEvent from libvirt for the ' + 'detach of device %s with device alias %s from instance %s ', + device_name, dev.alias, instance_uuid) + raise exception.DeviceDetachFailed( + device=device_name, + reason="DeviceRemovalFailedEvent received from libvirt") + + @staticmethod + def _detach_sync( + dev: ty.Union[ + vconfig.LibvirtConfigGuestDisk, + vconfig.LibvirtConfigGuestInterface], + guest: libvirt_guest.Guest, + instance_uuid: str, + device_name: str, + persistent: bool, + live: bool, + ): + """Detaches a device from the guest without waiting for libvirt events + + It only handles synchronous errors (i.e. exceptions) but does not wait + for any event from libvirt. + + :param dev: the device configuration to be detached + :param guest: the guest we are detach the device from + :param instance_uuid: the UUID of the instance we are detaching the + device from + :param device_name: This is the name of the device used solely for + error messages. + :param live: detach the device from the live domain config only + :param persistent: detach the device from the persistent domain config + only + :raises exception.DeviceNotFound: if libvirt reported that the device + is missing from the domain synchronously. + :raises libvirt.libvirtError: for any other errors reported by libvirt + synchronously. 
+ """ + try: + guest.detach_device(dev, persistent=persistent, live=live) + except libvirt.libvirtError as ex: + code = ex.get_error_code() + msg = ex.get_error_message() + if code == libvirt.VIR_ERR_DEVICE_MISSING: + LOG.debug( + 'Libvirt failed to detach device %s from instance %s ' + 'synchronously (persistent=%s, live=%s) with error: %s.', + device_name, instance_uuid, persistent, live, str(ex)) + raise exception.DeviceNotFound(device=device_name) from ex + + # NOTE(lyarwood): https://bugzilla.redhat.com/1878659 + # Ignore this known QEMU bug for the time being allowing + # our retry logic to handle it. + # NOTE(gibi): This can only happen in case of detaching from the + # live domain as we never retry a detach from the persistent + # domain so we cannot hit an already running detach there. + # In case of detaching from the live domain this error can happen + # if the caller timed out during the first detach attempt then saw + # that the device is still attached and therefore looped over and + # and retried the detach. In this case the previous attempt stopped + # waiting for the libvirt event. Also libvirt reports that there is + # a detach ongoing, so the current attempt expects that a + # libvirt event will be still emitted. Therefore we simply return + # from here. Then the caller will wait for such event. 
+ if (code == libvirt.VIR_ERR_INTERNAL_ERROR and msg and + 'already in the process of unplug' in msg + ): + LOG.debug( + 'Ignoring QEMU rejecting our request to detach device %s ' + 'from instance %s as it is caused by a previous request ' + 'still being in progress.', device_name, instance_uuid) + return + + LOG.warning( + 'Unexpected libvirt error while detaching device %s from ' + 'instance %s: %s', device_name, instance_uuid, str(ex)) + raise + def detach_volume(self, context, connection_info, instance, mountpoint, encryption=None): disk_dev = mountpoint.rpartition("/")[2] @@ -2213,10 +2547,14 @@ class LibvirtDriver(driver.ComputeDriver): # detaching any attached encryptors or disconnecting the underlying # volume in _disconnect_volume. Otherwise, the encryptor or volume # driver may report that the volume is still in use. - wait_for_detach = guest.detach_device_with_retry( - guest.get_disk, disk_dev, live=live) - wait_for_detach() - + get_dev = functools.partial(guest.get_disk, disk_dev) + self._detach_with_retry( + guest, + instance.uuid, + get_dev, + device_name=disk_dev, + live=live + ) except exception.InstanceNotFound: # NOTE(zhaoqin): If the instance does not exist, _lookup_by_name() # will throw InstanceNotFound exception. Need to @@ -2423,12 +2761,14 @@ class LibvirtDriver(driver.ComputeDriver): state = guest.get_power_state(self._host) live = state in (power_state.RUNNING, power_state.PAUSED) - # Now we are going to loop until the interface is detached or we - # timeout. 
- wait_for_detach = guest.detach_device_with_retry( - guest.get_interface_by_cfg, cfg, live=live, - alternative_device_name=self.vif_driver.get_vif_devname(vif)) - wait_for_detach() + get_dev = functools.partial(guest.get_interface_by_cfg, cfg) + self._detach_with_retry( + guest, + instance.uuid, + get_dev, + device_name=self.vif_driver.get_vif_devname(vif), + live=live, + ) except exception.DeviceDetachFailed: # We failed to detach the device even with the retry loop, so let's # dump some debug information to the logs before raising back up. diff --git a/nova/virt/libvirt/guest.py b/nova/virt/libvirt/guest.py index 66e06896c69e..0831010117e4 100644 --- a/nova/virt/libvirt/guest.py +++ b/nova/virt/libvirt/guest.py @@ -31,7 +31,6 @@ import time from lxml import etree from oslo_log import log as logging -from oslo_service import loopingcall from oslo_utils import encodeutils from oslo_utils import excutils from oslo_utils import importutils @@ -399,109 +398,6 @@ class Guest(object): devs.append(dev) return devs - def detach_device_with_retry(self, get_device_conf_func, device, live, - max_retry_count=7, inc_sleep_time=10, - max_sleep_time=60, - alternative_device_name=None): - """Detaches a device from the guest. After the initial detach request, - a function is returned which can be used to ensure the device is - successfully removed from the guest domain (retrying the removal as - necessary). - - :param get_device_conf_func: function which takes device as a parameter - and returns the configuration for device - :param device: device to detach - :param live: bool to indicate whether it affects the guest in running - state - :param max_retry_count: number of times the returned function will - retry a detach before failing - :param inc_sleep_time: incremental time to sleep in seconds between - detach retries - :param max_sleep_time: max sleep time in seconds beyond which the sleep - time will not be incremented using param - inc_sleep_time. 
On reaching this threshold, - max_sleep_time will be used as the sleep time. - :param alternative_device_name: This is an alternative identifier for - the device if device is not an ID, used solely for error messages. - """ - alternative_device_name = alternative_device_name or device - - def _try_detach_device(conf, persistent=False, live=False): - # Raise DeviceNotFound if the device isn't found during detach - try: - self.detach_device(conf, persistent=persistent, live=live) - if get_device_conf_func(device) is None: - LOG.debug('Successfully detached device %s from guest. ' - 'Persistent? %s. Live? %s', - device, persistent, live) - except libvirt.libvirtError as ex: - with excutils.save_and_reraise_exception(reraise=False) as ctx: - code = ex.get_error_code() - msg = ex.get_error_message() - if code == libvirt.VIR_ERR_DEVICE_MISSING: - raise exception.DeviceNotFound( - device=alternative_device_name) - # NOTE(lyarwood): https://bugzilla.redhat.com/1878659 - # Ignore this known QEMU bug for the time being allowing - # our retry logic to fire again and hopefully see that - # the device has been removed asynchronously by QEMU - # in the meantime when the next call to detach raises - # VIR_ERR_DEVICE_MISSING. - if (code == libvirt.VIR_ERR_INTERNAL_ERROR and - msg and 'already in the process of unplug' in msg - ): - LOG.debug('Ignoring QEMU rejecting our request to ' - 'detach as it is caused by a previous ' - 'request still being in progress.') - return - - # Re-raise the original exception if we're not raising - # DeviceNotFound instead. This will avoid logging of a - # "Original exception being dropped" traceback. 
- ctx.reraise = True - - conf = get_device_conf_func(device) - if conf is None: - raise exception.DeviceNotFound(device=alternative_device_name) - - persistent = self.has_persistent_configuration() - - LOG.debug('Attempting initial detach for device %s', - alternative_device_name) - try: - _try_detach_device(conf, persistent, live) - except exception.DeviceNotFound: - # NOTE(melwitt): There are effectively two configs for an instance. - # The persistent config (affects instance upon next boot) and the - # live config (affects running instance). When we detach a device, - # we need to detach it from both configs if the instance has a - # persistent config and a live config. If we tried to detach the - # device with persistent=True and live=True and it was not found, - # we should still try to detach from the live config, so continue. - if persistent and live: - pass - else: - raise - LOG.debug('Start retrying detach until device %s is gone.', - alternative_device_name) - - @loopingcall.RetryDecorator(max_retry_count=max_retry_count, - inc_sleep_time=inc_sleep_time, - max_sleep_time=max_sleep_time, - exceptions=exception.DeviceDetachFailed) - def _do_wait_and_retry_detach(): - config = get_device_conf_func(device) - if config is not None: - # Device is already detached from persistent config - # and only the live config needs to be updated. - _try_detach_device(config, persistent=False, live=live) - - reason = _("Unable to detach the device from the live config.") - raise exception.DeviceDetachFailed( - device=alternative_device_name, reason=reason) - - return _do_wait_and_retry_detach - def detach_device(self, conf, persistent=False, live=False): """Detaches device to the guest. 
diff --git a/releasenotes/notes/libvirt-event-based-device-detach-23ac037004d753b1.yaml b/releasenotes/notes/libvirt-event-based-device-detach-23ac037004d753b1.yaml new file mode 100644 index 000000000000..12c55fc315d4 --- /dev/null +++ b/releasenotes/notes/libvirt-event-based-device-detach-23ac037004d753b1.yaml @@ -0,0 +1,11 @@ +--- +fixes: + - | + To fix `device detach issues`__ in the libvirt driver the detach logic has + been changed from a sleep based retry loop to waiting for libvirt domain + events. During this change we also introduced two new config options to + allow fine tuning the retry logic. For details see the description of the + new ``[libvirt]device_detach_attempts`` and + ``[libvirt]device_detach_timeout`` config options. + + .. __: https://bugs.launchpad.net/nova/+bug/1882521