diff --git a/nova/conf/libvirt.py b/nova/conf/libvirt.py index 80984c5a970b..5d89dbe7396f 100644 --- a/nova/conf/libvirt.py +++ b/nova/conf/libvirt.py @@ -875,6 +875,30 @@ Related options: * It's recommended to consider including ``x86_64=q35`` in :oslo.config:option:`libvirt.hw_machine_type`; see :ref:`deploying-sev-capable-infrastructure` for more on this. +"""), + cfg.IntOpt('device_detach_attempts', + default=8, + min=1, + help=""" +Maximum number of attempts the driver tries to detach a device in libvirt. + +Related options: + +* :oslo.config:option:`libvirt.device_detach_timeout` + +"""), + cfg.IntOpt('device_detach_timeout', + default=20, + min=1, + help=""" +Maximum number of seconds the driver waits for the success or the failure +event from libvirt for a given device detach attempt before it re-trigger the +detach. + +Related options: + +* :oslo.config:option:`libvirt.device_detach_attempts` + """), ] diff --git a/nova/tests/unit/virt/libvirt/fakelibvirt.py b/nova/tests/unit/virt/libvirt/fakelibvirt.py index 115b490984d1..5164192f3db1 100644 --- a/nova/tests/unit/virt/libvirt/fakelibvirt.py +++ b/nova/tests/unit/virt/libvirt/fakelibvirt.py @@ -1278,9 +1278,17 @@ class Domain(object): self.attachDevice(xml) def detachDevice(self, xml): + # TODO(gibi): this should handle nics similarly to attachDevice() disk_info = _parse_disk_info(etree.fromstring(xml)) - disk_info['_attached'] = True - return disk_info in self._def['devices']['disks'] + attached_disk_info = None + for attached_disk in self._def['devices']['disks']: + if attached_disk['target_dev'] == disk_info.get('target_dev'): + attached_disk_info = attached_disk + break + + if attached_disk_info: + self._def['devices']['disks'].remove(attached_disk_info) + return attached_disk_info is not None def detachDeviceFlags(self, xml, flags): self.detachDevice(xml) diff --git a/nova/tests/unit/virt/libvirt/test_driver.py b/nova/tests/unit/virt/libvirt/test_driver.py index faa166f903e4..90eb7305b1c8 100644 --- 
a/nova/tests/unit/virt/libvirt/test_driver.py +++ b/nova/tests/unit/virt/libvirt/test_driver.py @@ -9299,6 +9299,7 @@ class LibvirtConnTestCase(test.NoDBTestCase, mock_build_metadata.assert_called_with(self.context, instance) mock_save.assert_called_with() + @mock.patch('threading.Event', new=mock.Mock()) @mock.patch('nova.virt.libvirt.host.Host._get_domain') def test_detach_volume_with_vir_domain_affect_live_flag(self, mock_get_domain): @@ -9319,8 +9320,12 @@ class LibvirtConnTestCase(test.NoDBTestCase, mock_dom = mock.MagicMock() # Second time don't return anything about disk vdc so it looks removed - return_list = [mock_xml_with_disk, mock_xml_without_disk, - mock_xml_without_disk] + return_list = [ + mock_xml_with_disk, # presistent domain + mock_xml_with_disk, # live domain + mock_xml_without_disk, # persistent gone + mock_xml_without_disk # live gone + ] # Doubling the size of return list because we test with two guest power # states mock_dom.XMLDesc.side_effect = return_list + return_list @@ -9328,25 +9333,36 @@ class LibvirtConnTestCase(test.NoDBTestCase, connection_info = {"driver_volume_type": "fake", "data": {"device_path": "/fake", "access_mode": "rw"}} - flags = (fakelibvirt.VIR_DOMAIN_AFFECT_CONFIG | - fakelibvirt.VIR_DOMAIN_AFFECT_LIVE) with mock.patch.object(drvr, '_disconnect_volume') as \ mock_disconnect_volume: for state in (power_state.RUNNING, power_state.PAUSED): + mock_dom.reset_mock() mock_dom.info.return_value = [state, 512, 512, 2, 1234, 5678] mock_get_domain.return_value = mock_dom drvr.detach_volume( self.context, connection_info, instance, '/dev/vdc') mock_get_domain.assert_called_with(instance) - call = mock_dom.detachDeviceFlags.mock_calls[0] xml = """ - """ + """ + # we expect two separate detach calls + self.assertEqual(2, mock_dom.detachDeviceFlags.call_count) + # one for the persistent domain + call = mock_dom.detachDeviceFlags.mock_calls[0] self.assertXmlEqual(xml, call.args[0]) - self.assertEqual({"flags": flags}, call.kwargs) + 
self.assertEqual( + {"flags": fakelibvirt.VIR_DOMAIN_AFFECT_CONFIG}, + call.kwargs) + # and one for the live domain + call = mock_dom.detachDeviceFlags.mock_calls[1] + self.assertXmlEqual(xml, call.args[0]) + self.assertEqual( + {"flags": fakelibvirt.VIR_DOMAIN_AFFECT_LIVE}, + call.kwargs) + mock_disconnect_volume.assert_called_with( self.context, connection_info, instance, encryption=None) @@ -9406,11 +9422,14 @@ class LibvirtConnTestCase(test.NoDBTestCase, mock_disconnect_volume.assert_called_once_with( self.context, connection_info, instance, encryption=encryption) + @mock.patch('nova.virt.libvirt.driver.LibvirtDriver.' + '_detach_with_retry') @mock.patch('nova.virt.libvirt.driver.LibvirtDriver._get_volume_driver') @mock.patch('nova.virt.libvirt.driver.LibvirtDriver._get_volume_encryptor') @mock.patch('nova.virt.libvirt.host.Host.get_guest') def test_detach_volume_order_with_encryptors(self, mock_get_guest, - mock_get_encryptor, mock_get_volume_driver): + mock_get_encryptor, mock_get_volume_driver, mock_detach_with_retry + ): mock_volume_driver = mock.MagicMock( spec=volume_drivers.LibvirtBaseVolumeDriver) @@ -9425,7 +9444,7 @@ class LibvirtConnTestCase(test.NoDBTestCase, mock_order = mock.Mock() mock_order.attach_mock(mock_volume_driver.disconnect_volume, 'disconnect_volume') - mock_order.attach_mock(mock_guest.detach_device_with_retry(), + mock_order.attach_mock(mock_detach_with_retry, 'detach_volume') mock_order.attach_mock(mock_encryptor.detach_volume, 'detach_encryptor') @@ -9441,9 +9460,20 @@ class LibvirtConnTestCase(test.NoDBTestCase, encryption=encryption) mock_order.assert_has_calls([ - mock.call.detach_volume(), + mock.call.detach_volume( + mock_guest, + instance.uuid, + # it is functools.partial(mock_guest.get_disk, 'vdc') + # see assert below + mock.ANY, + device_name='vdc', + live=True + ), mock.call.detach_encryptor(**encryption), mock.call.disconnect_volume(connection_info, instance)]) + get_device_conf_func = 
mock_detach_with_retry.mock_calls[0][1][2] + self.assertEqual(mock_guest.get_disk, get_device_conf_func.func) + self.assertEqual(('vdc',), get_device_conf_func.args) def test_extend_volume(self): drvr = libvirt_driver.LibvirtDriver(fake.FakeVirtAPI(), False) @@ -15589,12 +15619,16 @@ class LibvirtConnTestCase(test.NoDBTestCase, self.assertFalse(service_mock.disabled) self.assertIsNone(service_mock.disabled_reason) + @mock.patch.object(libvirt_driver.AsyncDeviceEventsHandler, + 'cleanup_waiters') @mock.patch.object(libvirt_driver.LibvirtDriver, 'delete_instance_files') @mock.patch.object(host.Host, '_get_domain', side_effect=exception.InstanceNotFound( instance_id=uuids.instance)) @mock.patch.object(objects.Instance, 'save') - def test_immediate_delete(self, mock_save, mock_get, mock_delete): + def test_immediate_delete(self, mock_save, mock_get, mock_delete, + mock_cleanup_async_detach + ): drvr = libvirt_driver.LibvirtDriver(fake.FakeVirtAPI(), False) instance = objects.Instance(self.context, **self.test_instance) @@ -15603,6 +15637,7 @@ class LibvirtConnTestCase(test.NoDBTestCase, mock_get.assert_has_calls([mock.call(instance)] * 3) self.assertEqual(3, mock_get.call_count) mock_delete.assert_called_once_with(instance) + mock_cleanup_async_detach.assert_called_once_with(instance.uuid) @mock.patch.object(objects.Instance, 'get_by_uuid') @mock.patch.object(objects.Instance, 'obj_load_attr', autospec=True) @@ -22944,8 +22979,8 @@ class LibvirtDriverTestCase(test.NoDBTestCase, TraitsComparisonMixin): self._test_attach_interface( power_state.SHUTDOWN, fakelibvirt.VIR_DOMAIN_AFFECT_CONFIG) - def _test_detach_interface(self, power_state, expected_flags, - device_not_found=False): + @mock.patch('threading.Event.wait', new=mock.Mock) + def _test_detach_interface(self, state, device_not_found=False): # setup some mocks instance = self._create_instance() network_info = _fake_network_info(self) @@ -22962,7 +22997,7 @@ class LibvirtDriverTestCase(test.NoDBTestCase, 
TraitsComparisonMixin): """, - info=[power_state, 1, 2, 3, 4]) + info=[state, 1, 2, 3, 4]) guest = libvirt_guest.Guest(domain) expected_cfg = vconfig.LibvirtConfigGuestInterface() @@ -22975,11 +23010,28 @@ class LibvirtDriverTestCase(test.NoDBTestCase, TraitsComparisonMixin): """) if device_not_found: - # This will trigger detach_device_with_retry to raise + # This will trigger _detach_with_retry to raise # DeviceNotFound - get_interface_calls = [expected_cfg, None] + get_interface_calls = [ + expected_cfg, # detach_interface() itself gets the config + expected_cfg, # _detach_with_retry: persistent config + None, # _detach_with_retry: no device in live config + ] else: - get_interface_calls = [expected_cfg, expected_cfg, None, None] + if state in (power_state.RUNNING, power_state.PAUSED): + get_interface_calls = [ + expected_cfg, # detach_interface() itself gets the config + expected_cfg, # _detach_with_retry: persistent config + expected_cfg, # _detach_with_retry: live config + None, # _detach_with_retry: persistent config gone + None # _detach_with_retry: live config gone + ] + else: + get_interface_calls = [ + expected_cfg, # detach_interface() itself gets the config + expected_cfg, # _detach_with_retry: persistent config + None, # _detach_with_retry: persistent config gone + ] with test.nested( mock.patch.object(host.Host, 'get_guest', return_value=guest), @@ -23006,45 +23058,60 @@ class LibvirtDriverTestCase(test.NoDBTestCase, TraitsComparisonMixin): mock_get_config.assert_called_once_with( instance, network_info[0], test.MatchType(objects.ImageMeta), test.MatchType(objects.Flavor), CONF.libvirt.virt_type) - mock_get_interface.assert_has_calls( - [mock.call(expected_cfg) for x in range(len(get_interface_calls))]) - if device_not_found: mock_detach_device_flags.assert_not_called() self.assertTrue(mock_warning.called) + mock_get_interface.assert_has_calls( + [ + mock.call(expected_cfg), + mock.call(expected_cfg, from_persistent_config=True), + 
mock.call(expected_cfg), + ]) else: - mock_detach_device_flags.assert_called_once_with( - expected_cfg.to_xml(), flags=expected_flags) - mock_warning.assert_not_called() mock_get_network_info.assert_called_once_with() + if state in (power_state.RUNNING, power_state.PAUSED): + mock_detach_device_flags.assert_has_calls([ + mock.call( + expected_cfg.to_xml(), + flags=fakelibvirt.VIR_DOMAIN_AFFECT_CONFIG), + mock.call( + expected_cfg.to_xml(), + flags=fakelibvirt.VIR_DOMAIN_AFFECT_LIVE), + ]) + mock_get_interface.assert_has_calls( + [ + mock.call(expected_cfg), + mock.call(expected_cfg, from_persistent_config=True), + mock.call(expected_cfg), + mock.call(expected_cfg, from_persistent_config=True), + mock.call(expected_cfg), + ]) + else: + mock_detach_device_flags.assert_called_once_with( + expected_cfg.to_xml(), + flags=fakelibvirt.VIR_DOMAIN_AFFECT_CONFIG) + mock_get_interface.assert_has_calls( + [ + mock.call(expected_cfg), + mock.call(expected_cfg, from_persistent_config=True), + mock.call(expected_cfg, from_persistent_config=True), + ]) mock_unplug.assert_called_once_with(instance, network_info[0]) def test_detach_interface_with_running_instance(self): - self._test_detach_interface( - power_state.RUNNING, - expected_flags=(fakelibvirt.VIR_DOMAIN_AFFECT_CONFIG | - fakelibvirt.VIR_DOMAIN_AFFECT_LIVE)) + self._test_detach_interface(power_state.RUNNING) def test_detach_interface_with_running_instance_device_not_found(self): """Tests that the interface is detached before we try to detach it. 
""" - self._test_detach_interface( - power_state.RUNNING, - expected_flags=(fakelibvirt.VIR_DOMAIN_AFFECT_CONFIG | - fakelibvirt.VIR_DOMAIN_AFFECT_LIVE), - device_not_found=True) + self._test_detach_interface(power_state.RUNNING, device_not_found=True) def test_detach_interface_with_pause_instance(self): - self._test_detach_interface( - power_state.PAUSED, - expected_flags=(fakelibvirt.VIR_DOMAIN_AFFECT_CONFIG | - fakelibvirt.VIR_DOMAIN_AFFECT_LIVE)) + self._test_detach_interface(power_state.PAUSED) def test_detach_interface_with_shutdown_instance(self): - self._test_detach_interface( - power_state.SHUTDOWN, - expected_flags=(fakelibvirt.VIR_DOMAIN_AFFECT_CONFIG)) + self._test_detach_interface(power_state.SHUTDOWN) @mock.patch('nova.virt.libvirt.driver.LOG') def test_detach_interface_device_not_found(self, mock_log): @@ -23068,8 +23135,12 @@ class LibvirtDriverTestCase(test.NoDBTestCase, TraitsComparisonMixin): self.assertIn('the device is no longer found on the guest', str(mock_log.warning.call_args[0])) + @mock.patch('nova.virt.libvirt.driver.LibvirtDriver.' + '_detach_with_retry') @mock.patch('nova.virt.libvirt.driver.LOG') - def test_detach_interface_guest_not_found_after_detach(self, mock_log): + def test_detach_interface_guest_not_found_after_detach( + self, mock_log, mock_detach_with_retry + ): # Asserts that we don't raise an exception when the guest is gone # after a libvirt error during detach. 
instance = self._create_instance() @@ -23087,13 +23158,14 @@ class LibvirtDriverTestCase(test.NoDBTestCase, TraitsComparisonMixin): error = fakelibvirt.libvirtError( 'internal error: End of file from qemu monitor') error.err = (fakelibvirt.VIR_ERR_OPERATION_FAILED,) - guest.detach_device_with_retry.side_effect = error + mock_detach_with_retry.side_effect = error self.drvr.detach_interface(self.context, instance, vif) self.assertEqual(1, mock_log.info.call_count) self.assertIn('Instance disappeared while detaching interface', mock_log.info.call_args[0][0]) get_guest_mock.assert_has_calls([mock.call(instance)] * 2) + @mock.patch('threading.Event.wait', new=mock.Mock()) @mock.patch.object(FakeVirtDomain, 'info') @mock.patch.object(FakeVirtDomain, 'detachDeviceFlags') @mock.patch.object(host.Host, '_get_domain') @@ -23132,26 +23204,46 @@ class LibvirtDriverTestCase(test.NoDBTestCase, TraitsComparisonMixin): """) - expected_flags = (fakelibvirt.VIR_DOMAIN_AFFECT_CONFIG | - fakelibvirt.VIR_DOMAIN_AFFECT_LIVE) with test.nested( - mock.patch.object(libvirt_guest.Guest, 'get_interface_by_cfg', - side_effect=[expected, expected, None, None]), + mock.patch.object( + libvirt_guest.Guest, 'get_interface_by_cfg', + side_effect=[ + expected, # detach_interface gets the config + expected, # _detach_with_retry: persistent config + expected, # _detach_with_retry: live config + None, # _detach_with_retry: persistent gone + None # _detach_with_retry: live gone + ]), mock.patch.object(self.drvr.vif_driver, 'get_config', return_value=expected), mock.patch.object(instance, 'get_network_info') ) as (mock_get_interface, mock_get_config, mock_get_network_info): self.drvr.detach_interface(self.context, instance, network_info[0]) - mock_get_interface.assert_has_calls([mock.call(expected)] * 3) - self.assertEqual(4, mock_get_interface.call_count) + mock_get_interface.assert_has_calls( + [ + mock.call(expected, ), + mock.call(expected, from_persistent_config=True), + mock.call(expected), + 
mock.call(expected, from_persistent_config=True), + mock.call(expected), + ] + ) + self.assertEqual(5, mock_get_interface.call_count) mock_get_config.assert_called_once_with( instance, network_info[0], test.MatchType(objects.ImageMeta), test.MatchType(objects.Flavor), CONF.libvirt.virt_type) mock_get_domain.assert_called_once_with(instance) mock_info.assert_called_once_with() - mock_detach.assert_called_once_with(expected.to_xml(), - flags=expected_flags) + mock_detach.assert_has_calls( + [ + mock.call( + expected.to_xml(), + flags=fakelibvirt.VIR_DOMAIN_AFFECT_CONFIG), + mock.call( + expected.to_xml(), + flags=fakelibvirt.VIR_DOMAIN_AFFECT_LIVE), + ]) mock_get_network_info.assert_called_once_with() def test_detach_interface_guest_set_metadata(self): @@ -23165,7 +23257,6 @@ class LibvirtDriverTestCase(test.NoDBTestCase, TraitsComparisonMixin): CONF.libvirt.virt_type, instance, image_meta) cfg = self.drvr._get_guest_config( instance, network_info, image_meta, disk_info) - mock_wait_for_detach = mock.Mock() config_meta = vconfig.LibvirtConfigGuestMetaNovaInstance() with test.nested( @@ -23178,15 +23269,15 @@ class LibvirtDriverTestCase(test.NoDBTestCase, TraitsComparisonMixin): mock.patch.object(guest, 'get_power_state'), mock.patch.object( instance, 'get_network_info', return_value=network_info), - mock.patch.object(guest, - 'detach_device_with_retry', return_value=mock_wait_for_detach), + mock.patch.object( + self.drvr, '_detach_with_retry'), mock.patch.object( self.drvr, '_get_guest_config_meta', return_value=config_meta), mock.patch.object(guest, 'set_metadata') ) as ( mock_get_guest, mock_get_config, mock_get_interface_by_cfg, mock_get_power_state, mock_get_network_info, - mock_detach_device_with_retry, mock_get_guest_config_meta, + mock_detach_with_retry, mock_get_guest_config_meta, mock_set_metadata ): self.drvr.detach_interface(self.context, instance, vif) @@ -23196,15 +23287,606 @@ class LibvirtDriverTestCase(test.NoDBTestCase, TraitsComparisonMixin): 
test.MatchType(objects.Flavor), CONF.libvirt.virt_type) mock_get_interface_by_cfg.assert_called_once_with(cfg) mock_get_power_state.assert_called_once_with(self.drvr._host) - mock_detach_device_with_retry.assert_called_once_with( - guest.get_interface_by_cfg, cfg, live=False, - alternative_device_name=None) - mock_wait_for_detach.assert_called_once_with() + mock_detach_with_retry.assert_called_once_with( + guest, instance.uuid, mock.ANY, live=False, + device_name=None) mock_get_network_info.assert_called_once_with() mock_get_guest_config_meta.assert_called_once_with( instance, network_info[1:]) mock_set_metadata.assert_called_once_with(config_meta) + def test__detach_with_retry_persistent_success(self): + """Test detach only from the persistent domain""" + drvr = libvirt_driver.LibvirtDriver(fake.FakeVirtAPI(), False) + mock_guest = mock.Mock(spec=libvirt_guest.Guest) + mock_guest.has_persistent_configuration.return_value = True + + mock_dev = mock.Mock(spec=vconfig.LibvirtConfigGuestDisk) + mock_dev.alias = 'virtio-disk1' + + mock_get_device_conf_func = mock.Mock( + # the first call is to get the device from the persistent domain + # the second call is to double check that the dev is gone from the + # persistent domain + side_effect=[ + mock_dev, + None, + ] + ) + + drvr._detach_with_retry( + mock_guest, + uuids.instance_uuid, + mock_get_device_conf_func, + device_name='vdb', + live=False, + ) + + mock_guest.has_persistent_configuration.assert_called_once_with() + mock_get_device_conf_func.assert_has_calls( + [ + mock.call(from_persistent_config=True), + mock.call(from_persistent_config=True), + ]) + mock_guest.detach_device.assert_called_once_with( + mock_dev, + persistent=True, + live=False) + # check that the internal event handling is cleaned up + self.assertEqual(set(), drvr._device_event_handler._waiters) + + def test__detach_with_retry_live_success(self): + """Test detach only from the live domain""" + drvr = libvirt_driver.LibvirtDriver(fake.FakeVirtAPI(), 
False) + mock_guest = mock.Mock(spec=libvirt_guest.Guest) + mock_guest.has_persistent_configuration.return_value = False + + mock_dev = mock.Mock(spec=vconfig.LibvirtConfigGuestDisk) + mock_dev.alias = 'virtio-disk1' + + mock_get_device_conf_func = mock.Mock( + # the first call is to get the device from the live domain + # the second call is to double check that the dev is gone from the + # live domain + side_effect=[ + mock_dev, + None, + ] + ) + # simulate that libvirt sends an event + mock_guest.detach_device.side_effect = ( + lambda dev, persistent, live: drvr.emit_event( + libvirtevent.DeviceRemovedEvent(uuids.instance_uuid, dev.alias) + )) + + drvr._detach_with_retry( + mock_guest, + uuids.instance_uuid, + mock_get_device_conf_func, + device_name='vdb', + live=True, + ) + + mock_guest.has_persistent_configuration.assert_called_once_with() + mock_get_device_conf_func.assert_has_calls( + [ + mock.call(), + mock.call(), + ]) + mock_guest.detach_device.assert_called_once_with( + mock_dev, + persistent=False, + live=True) + # check that the internal event handling is cleaned up + self.assertEqual(set(), drvr._device_event_handler._waiters) + + def test__detach_with_retry_persistent_and_live_success(self): + """Test detach both from the persistent and from the live domains""" + drvr = libvirt_driver.LibvirtDriver(fake.FakeVirtAPI(), False) + mock_guest = mock.Mock(spec=libvirt_guest.Guest) + mock_guest.has_persistent_configuration.return_value = True + + mock_dev = mock.Mock(spec=vconfig.LibvirtConfigGuestDisk) + mock_dev.alias = 'virtio-disk1' + + mock_get_device_conf_func = mock.Mock( + # the first call is to get the device from the persistent domain + # the second call is to get the device from the live domain + # the third call is to double check that the dev is gone from the + # persistent domain + # the fourth call is to double check that the dev is gone from the + # live domain + side_effect=[ + mock_dev, + mock_dev, + None, + None, + ] + ) + + # simulate 
that libvirt sends an event for the live detach call + # for the persistent detach call no side effect needs to be simulated + def fake_detach(dev, persistent, live): + if live: + drvr.emit_event(libvirtevent.DeviceRemovedEvent( + uuids.instance_uuid, dev.alias)) + + mock_guest.detach_device.side_effect = fake_detach + + drvr._detach_with_retry( + mock_guest, + uuids.instance_uuid, + mock_get_device_conf_func, + device_name='vdb', + live=True, + ) + + mock_guest.has_persistent_configuration.assert_called_once_with() + mock_get_device_conf_func.assert_has_calls( + [ + mock.call(from_persistent_config=True), + mock.call(), + mock.call(from_persistent_config=True), + mock.call(), + ]) + # we detach from the persistent config first and then from the live + mock_guest.detach_device.assert_has_calls( + [ + mock.call(mock_dev, persistent=True, live=False), + mock.call(mock_dev, persistent=False, live=True), + ]) + # check that the internal event handling is cleaned up + self.assertEqual(set(), drvr._device_event_handler._waiters) + + def test__detach_with_retry_persistent_but_guest_is_not(self): + """Test that a pesistent only detach is requested but the guest is + not persistent. + """ + drvr = libvirt_driver.LibvirtDriver(fake.FakeVirtAPI(), False) + mock_guest = mock.Mock(spec=libvirt_guest.Guest) + mock_guest.has_persistent_configuration.return_value = False + + # we expect that no device query or actual detach happens. + mock_get_device_conf_func = mock.NonCallableMock() + mock_guest.detach_device = mock.NonCallableMock() + + drvr._detach_with_retry( + mock_guest, + uuids.instance_uuid, + mock_get_device_conf_func, + device_name='vdb', + live=False, + ) + + mock_guest.has_persistent_configuration.assert_called_once_with() + + def test__detach_with_retry_persistent_dev_not_found(self): + """Tests that expection is raised if a persistent detach is requested, + the domain is persistent but the device is not found. 
+ """ + drvr = libvirt_driver.LibvirtDriver(fake.FakeVirtAPI(), False) + mock_guest = mock.Mock(spec=libvirt_guest.Guest) + mock_guest.has_persistent_configuration.return_value = True + + # we simulate that the device does not exists in the domain even + # before we try to detach + mock_get_device_conf_func = mock.Mock(return_value=None) + + self.assertRaises( + exception.DeviceNotFound, + drvr._detach_with_retry, + mock_guest, + uuids.instance_uuid, + mock_get_device_conf_func, + device_name='vdb', + live=False, + ) + + mock_get_device_conf_func.assert_called_once_with( + from_persistent_config=True) + mock_guest.detach_device.assert_not_called() + + def test__detach_with_retry_live_dev_not_found(self): + """Tests that expection is raised if a live detach is requested, + but the device is not found in the live domain. + """ + drvr = libvirt_driver.LibvirtDriver(fake.FakeVirtAPI(), False) + mock_guest = mock.Mock(spec=libvirt_guest.Guest) + mock_guest.has_persistent_configuration.return_value = False + + # we simulate that the device does not exists in the domain even + # before we try to detach + mock_get_device_conf_func = mock.Mock(return_value=None) + + self.assertRaises( + exception.DeviceNotFound, + drvr._detach_with_retry, + mock_guest, + uuids.instance_uuid, + mock_get_device_conf_func, + device_name='vdb', + live=True, + ) + + mock_get_device_conf_func.assert_called_once_with() + mock_guest.detach_device.assert_not_called() + + def test__detach_with_retry_async_fail(self): + """Test that libvirt sends error event during detach""" + drvr = libvirt_driver.LibvirtDriver(fake.FakeVirtAPI(), False) + mock_guest = mock.Mock(spec=libvirt_guest.Guest) + # for simplycity do a live only detach + mock_guest.has_persistent_configuration.return_value = False + + mock_dev = mock.Mock(spec=vconfig.LibvirtConfigGuestDisk) + mock_dev.alias = 'virtio-disk1' + + mock_get_device_conf_func = mock.Mock(return_value=mock_dev) + + # simulate that libvirt sends an failed event + 
mock_guest.detach_device.side_effect = ( + lambda dev, persistent, live: drvr.emit_event( + libvirtevent.DeviceRemovalFailedEvent( + uuids.instance_uuid, dev.alias))) + + self.assertRaises( + exception.DeviceDetachFailed, + drvr._detach_with_retry, + mock_guest, + uuids.instance_uuid, + mock_get_device_conf_func, + device_name='vdb', + live=True, + ) + + mock_guest.has_persistent_configuration.assert_called_once_with() + self.assertEqual(1, mock_get_device_conf_func.call_count) + mock_guest.detach_device.assert_called_once_with( + mock_dev, persistent=False, live=True) + # check that the internal event handling is cleaned up + self.assertEqual(set(), drvr._device_event_handler._waiters) + + @mock.patch('threading.Event.wait') + def test__detach_with_retry_timeout_retry_succeeds(self, mock_event_wait): + """Test that that a live detach times out while waiting for the libvirt + event but then the retry succeeds. + """ + drvr = libvirt_driver.LibvirtDriver(fake.FakeVirtAPI(), False) + mock_guest = mock.Mock(spec=libvirt_guest.Guest) + # for simplycity do a live only detach + mock_guest.has_persistent_configuration.return_value = False + + mock_dev = mock.Mock(spec=vconfig.LibvirtConfigGuestDisk) + mock_dev.alias = 'virtio-disk1' + + mock_get_device_conf_func = mock.Mock( + # the first call is to get the device from the live domain + # the second call is to double check that the dev is gone from the + # live domain after the first detach attempt + # the third call is to double check that the dev is gone from the + # live domain after the second detach attempt + side_effect=[ + mock_dev, + mock_dev, + None, + ] + ) + # By mocking threading.Event.wait we prevent the test to wait until the + # timeout happens, and by returning False first we simulate to the + # caller that the wait returned not becasuse the event is set but + # because timeout happend. Then during the retry we return True + # signalling that the event is set, i.e. 
the libvirt event the caller + # is waiting for has been received + mock_event_wait.side_effect = [False, True] + + drvr._detach_with_retry( + mock_guest, + uuids.instance_uuid, + mock_get_device_conf_func, + device_name='vdb', + live=True, + ) + + mock_guest.has_persistent_configuration.assert_called_once_with() + # the 3 calls has explained above where the return value is defined + self.assertEqual(3, mock_get_device_conf_func.call_count) + mock_guest.detach_device.assert_has_calls( + [ + mock.call(mock_dev, persistent=False, live=True), + mock.call(mock_dev, persistent=False, live=True), + ] + ) + # check that the internal event handling is cleaned up + self.assertEqual(set(), drvr._device_event_handler._waiters) + + @mock.patch('threading.Event.wait') + def test__detach_with_retry_timeout_retry_unplug_in_progress( + self, mock_event_wait + ): + """Test that that a live detach times out while waiting for the libvirt + event but then the retry gets a unplug already in progress error from + libvirt, which it ignores, then the detach finishes and the event is + received. 
+ """ + drvr = libvirt_driver.LibvirtDriver(fake.FakeVirtAPI(), False) + mock_guest = mock.Mock(spec=libvirt_guest.Guest) + # for simplicity do a live only detach + mock_guest.has_persistent_configuration.return_value = False + + mock_dev = mock.Mock(spec=vconfig.LibvirtConfigGuestDisk) + mock_dev.alias = 'virtio-disk1' + + mock_get_device_conf_func = mock.Mock( + # the first call is to get the device from the live domain + # the second call is to double check that the dev is gone from the + # live domain after the first detach attempt + # the third call is to double check that the dev is gone from the + # live domain after the second detach attempt + side_effect=[ + mock_dev, + mock_dev, + None, + ] + ) + # By mocking threading.Event.wait we prevent the test to wait until the + # timeout happens, and by returning False first we simulate to the + # caller that the wait returned not because the event is set but + # because timeout happened. Then during the retry we return True + # signalling that the event is set, i.e. the libvirt event the caller + # is waiting for has been received + mock_event_wait.side_effect = [False, True] + + # there will be two detach attempts + # 1) simulate that synchronous parts of the detach succeeds + # 2) return an error as the previous detach attempts is still ongoing + def fake_detach_device(dev, persistent, live): + if mock_guest.detach_device.call_count == 0: + return + else: + # In reality these events are in reverse order. However + # simulating that would require parallelism in the test. From + # the perspective of the code under test the call order does + # not matter as the call to detach_device is synchronous. 
+ drvr.emit_event( + libvirtevent.DeviceRemovedEvent( + uuids.instance_uuid, dev.alias)) + raise fakelibvirt.make_libvirtError( + fakelibvirt.libvirtError, + msg='error', + error_message='already in the process of unplug', + error_code=fakelibvirt.VIR_ERR_INTERNAL_ERROR) + + mock_guest.detach_device.side_effect = fake_detach_device + + drvr._detach_with_retry( + mock_guest, + uuids.instance_uuid, + mock_get_device_conf_func, + device_name='vdb', + live=True, + ) + + mock_guest.has_persistent_configuration.assert_called_once_with() + # the 3 calls has explained above where the return value is defined + self.assertEqual(3, mock_get_device_conf_func.call_count) + mock_guest.detach_device.assert_has_calls( + [ + mock.call(mock_dev, persistent=False, live=True), + mock.call(mock_dev, persistent=False, live=True), + ] + ) + # check that the internal event handling is cleaned up + self.assertEqual(set(), drvr._device_event_handler._waiters) + + @mock.patch('threading.Event.wait') + def test__detach_with_retry_timeout_run_out_of_retries( + self, mock_event_wait + ): + """Test that that a live detach times out while waiting for the libvirt + event at every attempt so the driver runs out of retry attempts. + """ + # decreased the number to simplyfy the test + self.flags(group='libvirt', device_detach_attempts=2) + + drvr = libvirt_driver.LibvirtDriver(fake.FakeVirtAPI(), False) + mock_guest = mock.Mock(spec=libvirt_guest.Guest) + # for simplycity do a live only detach + mock_guest.has_persistent_configuration.return_value = False + + mock_dev = mock.Mock(spec=vconfig.LibvirtConfigGuestDisk) + mock_dev.alias = 'virtio-disk1' + + mock_get_device_conf_func = mock.Mock(return_value=mock_dev) + + # By mocking threading.Event.wait we prevent the test to wait until the + # timeout happens, and by returning False we simulate to the + # caller that the wait returned not becasuse the event is set but + # because timeout happend. 
+ mock_event_wait.return_value = False + + self.assertRaises( + exception.DeviceDetachFailed, + drvr._detach_with_retry, + mock_guest, + uuids.instance_uuid, + mock_get_device_conf_func, + device_name='vdb', + live=True, + ) + + mock_guest.has_persistent_configuration.assert_called_once_with() + # the first call is to get the dev we detach, then there is one call + # after each retry attempt to check if the device is gone + self.assertEqual(3, mock_get_device_conf_func.call_count) + mock_guest.detach_device.assert_has_calls( + [ + mock.call(mock_dev, persistent=False, live=True), + mock.call(mock_dev, persistent=False, live=True), + ] + ) + # check that the internal event handling is cleaned up + self.assertEqual(set(), drvr._device_event_handler._waiters) + + def test__detach_with_retry_libvirt_reports_not_found_live_detach_done( + self + ): + """Test that libvirt reports that the device is not found in the + persistent domain but as live detach is also requested the original + error is just logged and the live detach is attempted. + """ + drvr = libvirt_driver.LibvirtDriver(fake.FakeVirtAPI(), False) + mock_guest = mock.Mock(spec=libvirt_guest.Guest) + mock_guest.has_persistent_configuration.return_value = True + + mock_dev = mock.Mock(spec=vconfig.LibvirtConfigGuestDisk) + mock_dev.alias = 'virtio-disk1' + + mock_get_device_conf_func = mock.Mock( + # the first call is to get the device from the persistent domain + # the second call is to get the device from the live domain + # the third call is to double check that the dev is gone from the + # live domain + # NOTE(gibi): as the persistent detach fails with exception the + # driver does not double check that the device is removed but + # simply move to the live detach + side_effect=[ + mock_dev, + mock_dev, + None, + ] + ) + + # there will be two detach attempts: + # 1) with persistent=True: simulate that libvirt fails synchronosuly + # signalling that the device is not found. 
+ # 2) with persistent=False: simulate that detach succeeds by emitting + # the libvirt event + def fake_detach_device(dev, persistent, live): + if persistent: + raise fakelibvirt.make_libvirtError( + fakelibvirt.libvirtError, + msg='error', + error_code=fakelibvirt.VIR_ERR_DEVICE_MISSING) + else: + drvr.emit_event( + libvirtevent.DeviceRemovedEvent( + uuids.instance_uuid, dev.alias)) + + mock_guest.detach_device.side_effect = fake_detach_device + + drvr._detach_with_retry( + mock_guest, + uuids.instance_uuid, + mock_get_device_conf_func, + device_name='vdb', + live=True, + ) + + mock_guest.has_persistent_configuration.assert_called_once_with() + mock_get_device_conf_func.assert_has_calls( + [ + mock.call(from_persistent_config=True), + mock.call(), + mock.call(), + ]) + mock_guest.detach_device.assert_has_calls( + [ + mock.call(mock_dev, persistent=True, live=False), + mock.call(mock_dev, persistent=False, live=True), + ] + ) + # check that the internal event handling is cleaned up + self.assertEqual(set(), drvr._device_event_handler._waiters) + + def test__detach_with_retry_libvirt_reports_not_found_give_up(self): + """Test that libvirt reports that the device is not found in the + persistent domain and as live detach is not requested the driver simply + bubbles up the exception. 
+ """ + drvr = libvirt_driver.LibvirtDriver(fake.FakeVirtAPI(), False) + mock_guest = mock.Mock(spec=libvirt_guest.Guest) + mock_guest.has_persistent_configuration.return_value = True + + mock_dev = mock.Mock(spec=vconfig.LibvirtConfigGuestDisk) + mock_dev.alias = 'virtio-disk1' + + mock_get_device_conf_func = mock.Mock( + # the first call is to get the device from the persistent domain + # NOTE(gibi): as the persistent detach fails with exception the + # driver does not double check that the device is removed but + # simply gives up + side_effect=[ + mock_dev, + ] + ) + + mock_guest.detach_device.side_effect = fakelibvirt.make_libvirtError( + fakelibvirt.libvirtError, + msg='error', + error_code=fakelibvirt.VIR_ERR_DEVICE_MISSING) + + self.assertRaises( + exception.DeviceNotFound, + drvr._detach_with_retry, + mock_guest, + uuids.instance_uuid, + mock_get_device_conf_func, + device_name='vdb', + live=False, + ) + + mock_guest.has_persistent_configuration.assert_called_once_with() + mock_get_device_conf_func.assert_called_once_with( + from_persistent_config=True) + mock_guest.detach_device.assert_called_once_with( + mock_dev, persistent=True, live=False) + + def test__detach_with_retry_other_sync_libvirt_error(self): + """Test that libvirt reports non device related error during detach + from persistent config which causes that the driver gives up + regardless of the fact that live detach also requested. 
+ """ + drvr = libvirt_driver.LibvirtDriver(fake.FakeVirtAPI(), False) + mock_guest = mock.Mock(spec=libvirt_guest.Guest) + mock_guest.has_persistent_configuration.return_value = True + + mock_dev = mock.Mock(spec=vconfig.LibvirtConfigGuestDisk) + mock_dev.alias = 'virtio-disk1' + + mock_get_device_conf_func = mock.Mock( + # the first call is to get the device from the persistent domain + # the second all is to get the device from the live domain + side_effect=[ + mock_dev, + mock_dev, + ] + ) + mock_guest.detach_device.side_effect = fakelibvirt.make_libvirtError( + fakelibvirt.libvirtError, + msg='error', + error_code=fakelibvirt.VIR_ERR_NO_DOMAIN) + + self.assertRaises( + fakelibvirt.libvirtError, + drvr._detach_with_retry, + mock_guest, + uuids.instance_uuid, + mock_get_device_conf_func, + device_name='vdb', + live=True, + ) + + mock_guest.has_persistent_configuration.assert_called_once_with() + mock_get_device_conf_func.assert_has_calls( + [ + mock.call(from_persistent_config=True), + mock.call(), + ]) + # NOTE(gibi): the drive does not attempt a live detach even though + # it was requested + mock_guest.detach_device.assert_called_once_with( + mock_dev, persistent=True, live=False) + @mock.patch('nova.objects.block_device.BlockDeviceMapping.save', new=mock.Mock()) @mock.patch('nova.objects.image_meta.ImageMeta.from_image_ref') @@ -27326,19 +28008,22 @@ class LibvirtDeviceRemoveEventTestCase(test.NoDBTestCase): super().setUp() self.useFixture(fakelibvirt.FakeLibvirtFixture()) - @mock.patch.object(libvirt_driver.LOG, 'debug') + @mock.patch.object(libvirt_driver.LOG, 'warning') @mock.patch('nova.virt.driver.ComputeDriver.emit_event') @ddt.data( libvirtevent.DeviceRemovedEvent, libvirtevent.DeviceRemovalFailedEvent) def test_libvirt_device_removal_events( - self, event_type, mock_base_handles, mock_debug + self, event_type, mock_base_handles, mock_warning ): drvr = libvirt_driver.LibvirtDriver(fake.FakeVirtAPI(), True) event = event_type(uuid=uuids.event, 
dev=mock.sentinel.dev_alias) drvr.emit_event(event) mock_base_handles.assert_not_called() - mock_debug.assert_not_called() + self.assertEqual(1, len(mock_warning.mock_calls)) + self.assertIn( + "the driver is not waiting for it; ignored.", + mock_warning.mock_calls[0][1][0]) class AsyncDeviceEventsHandlerTestCase(test.NoDBTestCase): diff --git a/nova/tests/unit/virt/libvirt/test_guest.py b/nova/tests/unit/virt/libvirt/test_guest.py index f09c8ae71db1..87b189882581 100644 --- a/nova/tests/unit/virt/libvirt/test_guest.py +++ b/nova/tests/unit/virt/libvirt/test_guest.py @@ -21,7 +21,6 @@ from oslo_service import fixture as service_fixture from oslo_utils import encodeutils from nova import context -from nova import exception from nova import test from nova.tests.unit.virt.libvirt import fakelibvirt from nova.virt.libvirt import config as vconfig @@ -213,212 +212,6 @@ class GuestTestCase(test.NoDBTestCase): "", flags=(fakelibvirt.VIR_DOMAIN_AFFECT_CONFIG | fakelibvirt.VIR_DOMAIN_AFFECT_LIVE)) - def test_detach_device_with_retry_from_transient_domain(self): - conf = mock.Mock(spec=vconfig.LibvirtConfigGuestDevice) - conf.to_xml.return_value = "" - get_config = mock.Mock() - get_config.side_effect = [conf, conf, conf, None, None] - dev_path = "/dev/vdb" - self.domain.isPersistent.return_value = False - retry_detach = self.guest.detach_device_with_retry( - get_config, dev_path, live=True, inc_sleep_time=.01) - self.domain.detachDeviceFlags.assert_called_once_with( - "", flags=fakelibvirt.VIR_DOMAIN_AFFECT_LIVE) - self.domain.detachDeviceFlags.reset_mock() - retry_detach() - self.assertEqual(1, self.domain.detachDeviceFlags.call_count) - - def test_detach_device_with_retry_detach_success(self): - conf = mock.Mock(spec=vconfig.LibvirtConfigGuestDevice) - conf.to_xml.return_value = "" - get_config = mock.Mock() - # Force multiple retries of detach - get_config.side_effect = [conf, conf, conf, conf, conf, None, None] - dev_path = "/dev/vdb" - 
self.domain.isPersistent.return_value = True - - retry_detach = self.guest.detach_device_with_retry( - get_config, dev_path, live=True, inc_sleep_time=.01) - # Ensure we've only done the initial detach call - self.domain.detachDeviceFlags.assert_called_once_with( - "", flags=(fakelibvirt.VIR_DOMAIN_AFFECT_CONFIG | - fakelibvirt.VIR_DOMAIN_AFFECT_LIVE)) - - get_config.assert_called_with(dev_path) - - # Some time later, we can do the wait/retry to ensure detach succeeds - self.domain.detachDeviceFlags.reset_mock() - retry_detach() - # Should have two retries before we pretend device is detached - self.assertEqual(2, self.domain.detachDeviceFlags.call_count) - - def test_detach_device_with_retry_detach_failure(self): - conf = mock.Mock(spec=vconfig.LibvirtConfigGuestDevice) - conf.to_xml.return_value = "" - # Continue to return some value for the disk config - get_config = mock.Mock(return_value=conf) - self.domain.isPersistent.return_value = True - - retry_detach = self.guest.detach_device_with_retry( - get_config, "/dev/vdb", live=True, inc_sleep_time=.01, - max_retry_count=3) - # Ensure we've only done the initial detach call - self.domain.detachDeviceFlags.assert_called_once_with( - "", flags=(fakelibvirt.VIR_DOMAIN_AFFECT_CONFIG | - fakelibvirt.VIR_DOMAIN_AFFECT_LIVE)) - - # Some time later, we can do the wait/retry to ensure detach - self.domain.detachDeviceFlags.reset_mock() - # Should hit max # of retries - self.assertRaises(exception.DeviceDetachFailed, retry_detach) - self.assertEqual(4, self.domain.detachDeviceFlags.call_count) - - def test_detach_device_with_retry_device_not_found(self): - get_config = mock.Mock(return_value=None) - self.domain.isPersistent.return_value = True - ex = self.assertRaises( - exception.DeviceNotFound, self.guest.detach_device_with_retry, - get_config, "/dev/vdb", live=True) - self.assertIn("/dev/vdb", str(ex)) - - def test_detach_device_with_retry_device_not_found_alt_name(self): - """Tests to make sure we use the alternative 
name in errors.""" - get_config = mock.Mock(return_value=None) - self.domain.isPersistent.return_value = True - ex = self.assertRaises( - exception.DeviceNotFound, self.guest.detach_device_with_retry, - get_config, mock.sentinel.device, live=True, - alternative_device_name='foo') - self.assertIn('foo', str(ex)) - - @mock.patch.object(libvirt_guest.Guest, "detach_device") - def _test_detach_device_with_retry_second_detach_failure( - self, mock_detach, error_code=None, error_message=None, - supports_device_missing=False): - # This simulates a retry of the transient/live domain detach - # failing because the device is not found - conf = mock.Mock(spec=vconfig.LibvirtConfigGuestDevice) - conf.to_xml.return_value = "" - self.domain.isPersistent.return_value = True - - get_config = mock.Mock(return_value=conf) - fake_device = "vdb" - fake_exc = fakelibvirt.make_libvirtError( - fakelibvirt.libvirtError, "", - error_message=error_message, - error_code=error_code, - error_domain=fakelibvirt.VIR_FROM_DOMAIN) - mock_detach.side_effect = [None, fake_exc] - retry_detach = self.guest.detach_device_with_retry( - get_config, fake_device, live=True, - inc_sleep_time=.01, max_retry_count=3) - # Some time later, we can do the wait/retry to ensure detach - self.assertRaises(exception.DeviceNotFound, retry_detach) - # Check that the save_and_reraise_exception context manager didn't log - # a traceback when the libvirtError was caught and DeviceNotFound was - # raised. 
- self.assertNotIn('Original exception being dropped', - self.stdlog.logger.output) - - def test_detach_device_with_retry_second_detach_device_missing(self): - self._test_detach_device_with_retry_second_detach_failure( - error_code=fakelibvirt.VIR_ERR_DEVICE_MISSING, - error_message="device not found: disk vdb not found", - supports_device_missing=True) - - def _test_detach_device_with_retry_first_detach_failure( - self, error_code=None, error_message=None, - supports_device_missing=False): - # This simulates a persistent or live domain detach failing because the - # device is not found during the first attempt to detach the device. - # We should still attempt to detach the device from the live config if - # the detach from persistent failed OR we should retry the detach from - # the live config if the first detach from live config failed. - # Note that the side effects in this test case [fake_exc, None] could - # not happen in real life if the first detach failed because the detach - # from live raised not found. In real life, the second attempt to - # detach from live would raise not found again because the device is - # not present. The purpose of this test is to verify that we try to - # detach a second time if the first detach fails, so we are OK with the - # unrealistic side effects for detach from live failing the first time. 
- conf = mock.Mock(spec=vconfig.LibvirtConfigGuestDevice) - conf.to_xml.return_value = "" - self.domain.isPersistent.return_value = True - - get_config = mock.Mock() - # Simulate an inactive or live detach attempt which fails (not found) - # followed by a live config detach attempt that is successful - get_config.side_effect = [conf, conf, conf, None, None] - fake_device = "vdb" - fake_exc = fakelibvirt.make_libvirtError( - fakelibvirt.libvirtError, "", - error_message=error_message, - error_code=error_code, - error_domain=fakelibvirt.VIR_FROM_DOMAIN) - # Detach from persistent or live raises not found, detach from live - # succeeds afterward - self.domain.detachDeviceFlags.side_effect = [fake_exc, None] - retry_detach = self.guest.detach_device_with_retry(get_config, - fake_device, live=True, inc_sleep_time=.01, max_retry_count=3) - # We should have tried to detach from the persistent domain - self.domain.detachDeviceFlags.assert_called_once_with( - "", flags=(fakelibvirt.VIR_DOMAIN_AFFECT_CONFIG | - fakelibvirt.VIR_DOMAIN_AFFECT_LIVE)) - # During the retry detach, should detach from the live domain - self.domain.detachDeviceFlags.reset_mock() - retry_detach() - # We should have tried to detach from the live domain - self.domain.detachDeviceFlags.assert_called_once_with( - "", flags=fakelibvirt.VIR_DOMAIN_AFFECT_LIVE) - - def test_detach_device_with_retry_first_detach_device_missing(self): - self._test_detach_device_with_retry_first_detach_failure( - error_code=fakelibvirt.VIR_ERR_DEVICE_MISSING, - error_message="device not found: disk vdb not found", - supports_device_missing=True) - - def test_detach_device_with_already_in_process_of_unplug_error(self): - # Assert that DeviceNotFound is raised when encountering - # https://bugzilla.redhat.com/show_bug.cgi?id=1878659 - # This is raised as QEMU returns a VIR_ERR_INTERNAL_ERROR when - # a request to device_del is made while another is about to complete. 
- - self.domain.isPersistent.return_value = True - conf = mock.Mock(spec=vconfig.LibvirtConfigGuestDevice) - conf.to_xml.return_value = "" - - existing_unplug_exc = fakelibvirt.make_libvirtError( - fakelibvirt.libvirtError, "", - error_message='device vdb is already in the process of unplug', - error_code=fakelibvirt.VIR_ERR_INTERNAL_ERROR, - error_domain=fakelibvirt.VIR_FROM_DOMAIN - ) - device_missing_exc = fakelibvirt.make_libvirtError( - fakelibvirt.libvirtError, "", - error_message='device not found: disk vdb not found', - error_code=fakelibvirt.VIR_ERR_DEVICE_MISSING, - error_domain=fakelibvirt.VIR_FROM_DOMAIN - ) - - # Raise VIR_ERR_INTERNAL_ERROR on the second call before raising - # VIR_ERR_DEVICE_MISSING to mock the first call successfully detaching - # the device asynchronously. - self.domain.detachDeviceFlags.side_effect = [ - None, - existing_unplug_exc, - device_missing_exc - ] - - retry_detach = self.guest.detach_device_with_retry( - mock.Mock(return_value=conf), - 'vdb', - live=True, - inc_sleep_time=.01 - ) - - # Assert that we raise exception.DeviceNotFound - self.assertRaises(exception.DeviceNotFound, retry_detach) - def test_get_xml_desc(self): self.guest.get_xml_desc() self.domain.XMLDesc.assert_called_once_with(flags=0) diff --git a/nova/tests/unit/virt/test_virt_drivers.py b/nova/tests/unit/virt/test_virt_drivers.py index 54c587bfdf70..9d3ef54e8975 100644 --- a/nova/tests/unit/virt/test_virt_drivers.py +++ b/nova/tests/unit/virt/test_virt_drivers.py @@ -416,6 +416,7 @@ class _VirtDriverTestCase(_FakeDriverBackendTestCase): self.assertEqual(storage_ip, result['ip']) @catch_notimplementederror + @mock.patch('threading.Event.wait', new=mock.Mock()) @mock.patch.object(libvirt.driver.LibvirtDriver, '_build_device_metadata', return_value=objects.InstanceDeviceMetadata()) def test_attach_detach_volume(self, _): @@ -452,6 +453,7 @@ class _VirtDriverTestCase(_FakeDriverBackendTestCase): '/dev/sda', 2)) @catch_notimplementederror + 
@mock.patch('threading.Event.wait', new=mock.Mock()) @mock.patch.object(libvirt.driver.LibvirtDriver, '_build_device_metadata', return_value=objects.InstanceDeviceMetadata()) def test_attach_detach_different_power_states(self, _): diff --git a/nova/virt/libvirt/driver.py b/nova/virt/libvirt/driver.py index 8ee8757e40cc..747489aeeb62 100644 --- a/nova/virt/libvirt/driver.py +++ b/nova/virt/libvirt/driver.py @@ -504,6 +504,10 @@ class LibvirtDriver(driver.ComputeDriver): self.pgpu_type_mapping = collections.defaultdict(str) self.supported_vgpu_types = self._get_supported_vgpu_types() + # Handles ongoing device manipulation in libvirt where we wait for the + # events about success or failure. + self._device_event_handler = AsyncDeviceEventsHandler() + def _discover_vpmems(self, vpmem_conf=None): """Discover vpmems on host and configuration. @@ -1400,6 +1404,9 @@ class LibvirtDriver(driver.ComputeDriver): def destroy(self, context, instance, network_info, block_device_info=None, destroy_disks=True): self._destroy(instance) + # NOTE(gibi): if there was a device detach in progress then we need to + # unblock the waiting threads and clean up. 
+ self._device_event_handler.cleanup_waiters(instance.uuid) self.cleanup(context, instance, network_info, block_device_info, destroy_disks) @@ -2189,8 +2196,20 @@ class LibvirtDriver(driver.ComputeDriver): # These are libvirt specific events handled here on the driver # level instead of propagating them to the compute manager level if isinstance(event, libvirtevent.DeviceEvent): - # TODO(gibi): handle it - pass + had_clients = self._device_event_handler.notify_waiters(event) + + if had_clients: + LOG.debug( + "Received event %s from libvirt while the driver is " + "waiting for it; dispatched.", + event, + ) + else: + LOG.warning( + "Received event %s from libvirt but the driver is not " + "waiting for it; ignored.", + event, + ) else: LOG.debug( "Received event %s from libvirt but no handler is " @@ -2201,6 +2220,321 @@ class LibvirtDriver(driver.ComputeDriver): # manager super().emit_event(event) + def _detach_with_retry( + self, + guest: libvirt_guest.Guest, + instance_uuid: str, + # to properly typehint this param we would need typing.Protocol but + # that is only available since python 3.8 + get_device_conf_func: ty.Callable, + device_name: str, + live: bool, + ) -> None: + """Detaches a device from the guest + + If live detach is requested then this call will wait for the libvirt + event signalling the end of the detach process. + + If the live detach times out then it will retry the detach. 
Detach from + the persistent config is not retried as it is: + + * synchronous and no event is sent from libvirt + * it is always expected to succeed if the device is in the domain + config + + :param guest: the guest we are detach the device from + :param instance_uuid: the UUID of the instance we are detaching the + device from + :param get_device_conf_func: function which returns the configuration + for device from the domain, having one optional boolean parameter + `from_persistent_config` to select which domain config to query + :param device_name: This is the name of the device used solely for + error messages. Note that it is not the same as the device alias + used by libvirt to identify the device. + :param live: bool to indicate whether it affects the guest in running + state. If live is True then the device is detached from both the + persistent and the live config. If live is False the device only + detached from the persistent config. + :raises exception.DeviceNotFound: if the device does not exist in the + domain even before we try to detach or if libvirt reported that the + device is missing from the domain synchronously. + :raises exception.DeviceDetachFailed: if libvirt reported error during + detaching from the live domain or we timed out waiting for libvirt + events and run out of retries + :raises libvirt.libvirtError: for any other errors reported by libvirt + synchronously. 
+ """ + persistent = guest.has_persistent_configuration() + + if not persistent and not live: + # nothing to do + return + + persistent_dev = None + if persistent: + persistent_dev = get_device_conf_func(from_persistent_config=True) + + live_dev = None + if live: + live_dev = get_device_conf_func() + + if live and live_dev is None: + # caller requested a live detach but device is not present + raise exception.DeviceNotFound(device=device_name) + + if not live and persistent_dev is None: + # caller requested detach from the persistent domain but device is + # not present + raise exception.DeviceNotFound(device=device_name) + + if persistent_dev: + try: + self._detach_from_persistent( + guest, instance_uuid, persistent_dev, get_device_conf_func, + device_name) + except exception.DeviceNotFound: + if live: + # ignore the error so that we can do the live detach + LOG.warning( + 'Libvirt reported sync error while detaching ' + 'device %s from instance %s from the persistent ' + 'domain config. 
Ignoring the error to proceed with ' + 'live detach as that was also requested.', + device_name, instance_uuid) + else: + # if only persistent detach was requested then give up + raise + + if live and live_dev: + self._detach_from_live_with_retry( + guest, instance_uuid, live_dev, get_device_conf_func, + device_name) + + def _detach_from_persistent( + self, + guest: libvirt_guest.Guest, + instance_uuid: str, + persistent_dev: ty.Union[ + vconfig.LibvirtConfigGuestDisk, + vconfig.LibvirtConfigGuestInterface], + get_device_conf_func, + device_name: str, + ): + LOG.debug( + 'Attempting to detach device %s from instance %s from ' + 'the persistent domain config.', device_name, instance_uuid) + + self._detach_sync( + persistent_dev, guest, instance_uuid, device_name, + persistent=True, live=False) + + # make sure the dev is really gone + persistent_dev = get_device_conf_func( + from_persistent_config=True) + if not persistent_dev: + LOG.info( + 'Successfully detached device %s from instance %s ' + 'from the persistent domain config.', + device_name, instance_uuid) + else: + # Based on the libvirt devs this should never happen + LOG.warning( + 'Failed to detach device %s from instance %s ' + 'from the persistent domain config. 
Libvirt did not ' + 'report any error but the device is still in the ' + 'config.', device_name, instance_uuid) + + def _detach_from_live_with_retry( + self, + guest: libvirt_guest.Guest, + instance_uuid: str, + live_dev: ty.Union[ + vconfig.LibvirtConfigGuestDisk, + vconfig.LibvirtConfigGuestInterface], + get_device_conf_func, + device_name: str, + ): + max_attempts = CONF.libvirt.device_detach_attempts + for attempt in range(max_attempts): + LOG.debug( + '(%s/%s): Attempting to detach device %s with device ' + 'alias %s from instance %s from the live domain config.', + attempt + 1, max_attempts, device_name, live_dev.alias, + instance_uuid) + + self._detach_from_live_and_wait_for_event( + live_dev, guest, instance_uuid, device_name) + + # make sure the dev is really gone + live_dev = get_device_conf_func() + if not live_dev: + LOG.info( + 'Successfully detached device %s from instance %s ' + 'from the live domain config.', device_name, instance_uuid) + # we are done + return + + LOG.debug( + 'Failed to detach device %s with device alias %s from ' + 'instance %s from the live domain config. Libvirt did not ' + 'report any error but the device is still in the config.', + device_name, live_dev.alias, instance_uuid) + + msg = ( + 'Run out of retry while detaching device %s with device ' + 'alias %s from instance %s from the live domain config. ' + 'Device is still attached to the guest.') + LOG.error(msg, device_name, live_dev.alias, instance_uuid) + raise exception.DeviceDetachFailed( + device=device_name, + reason=msg % (device_name, live_dev.alias, instance_uuid)) + + def _detach_from_live_and_wait_for_event( + self, + dev: ty.Union[ + vconfig.LibvirtConfigGuestDisk, + vconfig.LibvirtConfigGuestInterface], + guest: libvirt_guest.Guest, + instance_uuid: str, + device_name: str, + ) -> None: + """Detaches a device from the live config of the guest and waits for + the libvirt event singling the finish of the detach. 
+ + :param dev: the device configuration to be detached + :param guest: the guest we are detaching the device from + :param instance_uuid: the UUID of the instance we are detaching the + device from + :param device_name: This is the name of the device used solely for + error messages. + :raises exception.DeviceNotFound: if libvirt reported that the device + is missing from the domain synchronously. + :raises libvirt.libvirtError: for any other errors reported by libvirt + synchronously. + :raises DeviceDetachFailed: if libvirt sent DeviceRemovalFailedEvent + """ + # So we will issue a detach to libvirt and we will wait for an + # event from libvirt about the result. We need to set up the event + # handling before the detach to avoid missing the event if libvirt + # is really fast + # NOTE(gibi): we need to use the alias name of the device as that + # is what libvirt will send back to us in the event + waiter = self._device_event_handler.create_waiter( + instance_uuid, dev.alias, + {libvirtevent.DeviceRemovedEvent, + libvirtevent.DeviceRemovalFailedEvent}) + try: + self._detach_sync( + dev, guest, instance_uuid, device_name, persistent=False, + live=True) + except Exception: + # clean up the libvirt event handler as we failed synchronously + self._device_event_handler.delete_waiter(waiter) + raise + + LOG.debug( + 'Start waiting for the detach event from libvirt for ' + 'device %s with device alias %s for instance %s', + device_name, dev.alias, instance_uuid) + # We issued the detach without any exception so we can wait for + # a libvirt event to arrive to notify us about the result + # NOTE(gibi): we expect that this call will be unblocked by an + # incoming libvirt DeviceRemovedEvent or DeviceRemovalFailedEvent + event = self._device_event_handler.wait( + waiter, timeout=CONF.libvirt.device_detach_timeout) + + if not event: + # This should not happen based on information from the libvirt + # developers. 
But it does at least during the cleanup of the + # tempest test case + # ServerRescueNegativeTestJSON.test_rescued_vm_detach_volume + # Log a warning and let the upper layer detect that the device is + # still attached and retry + LOG.error( + 'Waiting for libvirt event about the detach of ' + 'device %s with device alias %s from instance %s is timed ' + 'out.', device_name, dev.alias, instance_uuid) + + if isinstance(event, libvirtevent.DeviceRemovalFailedEvent): + # Based on the libvirt developers this signals a permanent failure + LOG.error( + 'Received DeviceRemovalFailedEvent from libvirt for the ' + 'detach of device %s with device alias %s from instance %s ', + device_name, dev.alias, instance_uuid) + raise exception.DeviceDetachFailed( + device=device_name, + reason="DeviceRemovalFailedEvent received from libvirt") + + @staticmethod + def _detach_sync( + dev: ty.Union[ + vconfig.LibvirtConfigGuestDisk, + vconfig.LibvirtConfigGuestInterface], + guest: libvirt_guest.Guest, + instance_uuid: str, + device_name: str, + persistent: bool, + live: bool, + ): + """Detaches a device from the guest without waiting for libvirt events + + It only handles synchronous errors (i.e. exceptions) but does not wait + for any event from libvirt. + + :param dev: the device configuration to be detached + :param guest: the guest we are detach the device from + :param instance_uuid: the UUID of the instance we are detaching the + device from + :param device_name: This is the name of the device used solely for + error messages. + :param live: detach the device from the live domain config only + :param persistent: detach the device from the persistent domain config + only + :raises exception.DeviceNotFound: if libvirt reported that the device + is missing from the domain synchronously. + :raises libvirt.libvirtError: for any other errors reported by libvirt + synchronously. 
+ """ + try: + guest.detach_device(dev, persistent=persistent, live=live) + except libvirt.libvirtError as ex: + code = ex.get_error_code() + msg = ex.get_error_message() + if code == libvirt.VIR_ERR_DEVICE_MISSING: + LOG.debug( + 'Libvirt failed to detach device %s from instance %s ' + 'synchronously (persistent=%s, live=%s) with error: %s.', + device_name, instance_uuid, persistent, live, str(ex)) + raise exception.DeviceNotFound(device=device_name) from ex + + # NOTE(lyarwood): https://bugzilla.redhat.com/1878659 + # Ignore this known QEMU bug for the time being allowing + # our retry logic to handle it. + # NOTE(gibi): This can only happen in case of detaching from the + # live domain as we never retry a detach from the persistent + # domain so we cannot hit an already running detach there. + # In case of detaching from the live domain this error can happen + # if the caller timed out during the first detach attempt then saw + # that the device is still attached and therefore looped over and + # and retried the detach. In this case the previous attempt stopped + # waiting for the libvirt event. Also libvirt reports that there is + # a detach ongoing, so the current attempt expects that a + # libvirt event will be still emitted. Therefore we simply return + # from here. Then the caller will wait for such event. 
+ if (code == libvirt.VIR_ERR_INTERNAL_ERROR and msg and + 'already in the process of unplug' in msg + ): + LOG.debug( + 'Ignoring QEMU rejecting our request to detach device %s ' + 'from instance %s as it is caused by a previous request ' + 'still being in progress.', device_name, instance_uuid) + return + + LOG.warning( + 'Unexpected libvirt error while detaching device %s from ' + 'instance %s: %s', device_name, instance_uuid, str(ex)) + raise + def detach_volume(self, context, connection_info, instance, mountpoint, encryption=None): disk_dev = mountpoint.rpartition("/")[2] @@ -2213,10 +2547,14 @@ class LibvirtDriver(driver.ComputeDriver): # detaching any attached encryptors or disconnecting the underlying # volume in _disconnect_volume. Otherwise, the encryptor or volume # driver may report that the volume is still in use. - wait_for_detach = guest.detach_device_with_retry( - guest.get_disk, disk_dev, live=live) - wait_for_detach() - + get_dev = functools.partial(guest.get_disk, disk_dev) + self._detach_with_retry( + guest, + instance.uuid, + get_dev, + device_name=disk_dev, + live=live + ) except exception.InstanceNotFound: # NOTE(zhaoqin): If the instance does not exist, _lookup_by_name() # will throw InstanceNotFound exception. Need to @@ -2423,12 +2761,14 @@ class LibvirtDriver(driver.ComputeDriver): state = guest.get_power_state(self._host) live = state in (power_state.RUNNING, power_state.PAUSED) - # Now we are going to loop until the interface is detached or we - # timeout. 
- wait_for_detach = guest.detach_device_with_retry( - guest.get_interface_by_cfg, cfg, live=live, - alternative_device_name=self.vif_driver.get_vif_devname(vif)) - wait_for_detach() + get_dev = functools.partial(guest.get_interface_by_cfg, cfg) + self._detach_with_retry( + guest, + instance.uuid, + get_dev, + device_name=self.vif_driver.get_vif_devname(vif), + live=live, + ) except exception.DeviceDetachFailed: # We failed to detach the device even with the retry loop, so let's # dump some debug information to the logs before raising back up. diff --git a/nova/virt/libvirt/guest.py b/nova/virt/libvirt/guest.py index 66e06896c69e..0831010117e4 100644 --- a/nova/virt/libvirt/guest.py +++ b/nova/virt/libvirt/guest.py @@ -31,7 +31,6 @@ import time from lxml import etree from oslo_log import log as logging -from oslo_service import loopingcall from oslo_utils import encodeutils from oslo_utils import excutils from oslo_utils import importutils @@ -399,109 +398,6 @@ class Guest(object): devs.append(dev) return devs - def detach_device_with_retry(self, get_device_conf_func, device, live, - max_retry_count=7, inc_sleep_time=10, - max_sleep_time=60, - alternative_device_name=None): - """Detaches a device from the guest. After the initial detach request, - a function is returned which can be used to ensure the device is - successfully removed from the guest domain (retrying the removal as - necessary). - - :param get_device_conf_func: function which takes device as a parameter - and returns the configuration for device - :param device: device to detach - :param live: bool to indicate whether it affects the guest in running - state - :param max_retry_count: number of times the returned function will - retry a detach before failing - :param inc_sleep_time: incremental time to sleep in seconds between - detach retries - :param max_sleep_time: max sleep time in seconds beyond which the sleep - time will not be incremented using param - inc_sleep_time. 
On reaching this threshold, - max_sleep_time will be used as the sleep time. - :param alternative_device_name: This is an alternative identifier for - the device if device is not an ID, used solely for error messages. - """ - alternative_device_name = alternative_device_name or device - - def _try_detach_device(conf, persistent=False, live=False): - # Raise DeviceNotFound if the device isn't found during detach - try: - self.detach_device(conf, persistent=persistent, live=live) - if get_device_conf_func(device) is None: - LOG.debug('Successfully detached device %s from guest. ' - 'Persistent? %s. Live? %s', - device, persistent, live) - except libvirt.libvirtError as ex: - with excutils.save_and_reraise_exception(reraise=False) as ctx: - code = ex.get_error_code() - msg = ex.get_error_message() - if code == libvirt.VIR_ERR_DEVICE_MISSING: - raise exception.DeviceNotFound( - device=alternative_device_name) - # NOTE(lyarwood): https://bugzilla.redhat.com/1878659 - # Ignore this known QEMU bug for the time being allowing - # our retry logic to fire again and hopefully see that - # the device has been removed asynchronously by QEMU - # in the meantime when the next call to detach raises - # VIR_ERR_DEVICE_MISSING. - if (code == libvirt.VIR_ERR_INTERNAL_ERROR and - msg and 'already in the process of unplug' in msg - ): - LOG.debug('Ignoring QEMU rejecting our request to ' - 'detach as it is caused by a previous ' - 'request still being in progress.') - return - - # Re-raise the original exception if we're not raising - # DeviceNotFound instead. This will avoid logging of a - # "Original exception being dropped" traceback. 
- ctx.reraise = True - - conf = get_device_conf_func(device) - if conf is None: - raise exception.DeviceNotFound(device=alternative_device_name) - - persistent = self.has_persistent_configuration() - - LOG.debug('Attempting initial detach for device %s', - alternative_device_name) - try: - _try_detach_device(conf, persistent, live) - except exception.DeviceNotFound: - # NOTE(melwitt): There are effectively two configs for an instance. - # The persistent config (affects instance upon next boot) and the - # live config (affects running instance). When we detach a device, - # we need to detach it from both configs if the instance has a - # persistent config and a live config. If we tried to detach the - # device with persistent=True and live=True and it was not found, - # we should still try to detach from the live config, so continue. - if persistent and live: - pass - else: - raise - LOG.debug('Start retrying detach until device %s is gone.', - alternative_device_name) - - @loopingcall.RetryDecorator(max_retry_count=max_retry_count, - inc_sleep_time=inc_sleep_time, - max_sleep_time=max_sleep_time, - exceptions=exception.DeviceDetachFailed) - def _do_wait_and_retry_detach(): - config = get_device_conf_func(device) - if config is not None: - # Device is already detached from persistent config - # and only the live config needs to be updated. - _try_detach_device(config, persistent=False, live=live) - - reason = _("Unable to detach the device from the live config.") - raise exception.DeviceDetachFailed( - device=alternative_device_name, reason=reason) - - return _do_wait_and_retry_detach - def detach_device(self, conf, persistent=False, live=False): """Detaches device to the guest. 
diff --git a/releasenotes/notes/libvirt-event-based-device-detach-23ac037004d753b1.yaml b/releasenotes/notes/libvirt-event-based-device-detach-23ac037004d753b1.yaml new file mode 100644 index 000000000000..12c55fc315d4 --- /dev/null +++ b/releasenotes/notes/libvirt-event-based-device-detach-23ac037004d753b1.yaml @@ -0,0 +1,11 @@ +--- +fixes: + - | + To fix `device detach issues`__ in the libvirt driver, the detach logic has + been changed from a sleep-based retry loop to waiting for libvirt domain + events. During this change we also introduced two new config options to + allow fine-tuning the retry logic. For details, see the description of the + new ``[libvirt]device_detach_attempts`` and + ``[libvirt]device_detach_timeout`` config options. + + .. __: https://bugs.launchpad.net/nova/+bug/1882521