From 39831c55999abe97e3bd26a1c0db2e4ceb467041 Mon Sep 17 00:00:00 2001 From: Alexandre Arents Date: Mon, 24 Aug 2020 14:05:27 +0000 Subject: [PATCH] Add a lock to prevent race during detach/attach of interface Only allow one detach/attach at a time with the same pattern instance-port_id in order to avoid race condition when multiple detach/attach are run concurrently. When multiple detach run concurrently on a specific instance-port_id, manager consider many of them as valid because info_cache still contains the port and info_cache is refreshed only once the first request complete. So during this gap of time, while the first request accomplishes the task, all subsequent requests are destined to fail and log a warning [1] in different location of code, depending on the outcome of the first request. The Issue is that all those caught requests finally run a deallocate_port_for_instance which will unbind the port. This may cause a race condition, because a successful attach can pass between those unbind, and be silently unbound, resulting in an infrastructure/DB inconsistency. [1] 'Detaching interface %(mac)s failed because the device is no longer found on the guest.' Closes-Bug: #1892870 Change-Id: Iea5969d0bd16dc9a6f1ba950224b0115e466ce66 --- nova/compute/manager.py | 25 +++++++++++++++++++ nova/tests/unit/compute/test_compute.py | 20 ++++++++++++--- nova/tests/unit/compute/test_compute_mgr.py | 10 +++++--- .../notes/bug-1892870-eb894956bf04713d.yaml | 8 ++++++ 4 files changed, 56 insertions(+), 7 deletions(-) create mode 100644 releasenotes/notes/bug-1892870-eb894956bf04713d.yaml diff --git a/nova/compute/manager.py b/nova/compute/manager.py index a1bcc9c144fa..0d0b78402ddd 100644 --- a/nova/compute/manager.py +++ b/nova/compute/manager.py @@ -7481,6 +7481,19 @@ class ComputeManager(manager.Manager): def attach_interface(self, context, instance, network_id, port_id, requested_ip, tag): """Use hotplug to add an network adapter to an instance.""" + lockname = 'interface-%s-%s' % (instance.uuid, port_id) + + @utils.synchronized(lockname) + def do_attach_interface(context, instance, network_id, port_id, + requested_ip, tag): + return self._attach_interface(context, instance, network_id, + port_id, requested_ip, tag) + + return do_attach_interface(context, instance, network_id, port_id, + requested_ip, tag) + + def _attach_interface(self, context, instance, network_id, port_id, + requested_ip, tag): if not self.driver.capabilities.get('supports_attach_interface', False): raise exception.AttachInterfaceNotSupported( @@ -7540,6 +7553,18 @@ class ComputeManager(manager.Manager): @wrap_instance_fault def detach_interface(self, context, instance, port_id): """Detach a network adapter from an instance.""" + lockname = 'interface-%s-%s' % (instance.uuid, port_id) + + @utils.synchronized(lockname) + def do_detach_interface(context, instance, port_id): + self._detach_interface(context, instance, port_id) + + do_detach_interface(context, instance, port_id) + + def _detach_interface(self, context, instance, port_id): + # NOTE(aarents): we need to refresh info cache from DB here, + # as previous detach/attach lock holder just updated it. + compute_utils.refresh_info_cache_for_instance(context, instance) network_info = instance.info_cache.network_info condemned = None for vif in network_info: diff --git a/nova/tests/unit/compute/test_compute.py b/nova/tests/unit/compute/test_compute.py index 9633be3ebb36..2c0bcf88526a 100644 --- a/nova/tests/unit/compute/test_compute.py +++ b/nova/tests/unit/compute/test_compute.py @@ -10285,11 +10285,15 @@ class ComputeAPITestCase(BaseTestCase): network_id = nwinfo[0]['network']['id'] port_id = nwinfo[0]['id'] req_ip = '1.2.3.4' + lock_name = 'interface-%s-%s' % (instance.uuid, port_id) mock_allocate = mock.Mock(return_value=nwinfo) self.compute.network_api.allocate_port_for_instance = mock_allocate - with mock.patch.dict(self.compute.driver.capabilities, - supports_attach_interface=True): + with test.nested( + mock.patch.dict(self.compute.driver.capabilities, + supports_attach_interface=True), + mock.patch('oslo_concurrency.lockutils.lock') + ) as (cap, mock_lock): vif = self.compute.attach_interface(self.context, instance, network_id, @@ -10304,6 +10308,9 @@ class ComputeAPITestCase(BaseTestCase): action='interface_attach', phase='start'), mock.call(self.context, instance, self.compute.host, action='interface_attach', phase='end')]) + mock_lock.assert_called_once_with(lock_name, mock.ANY, mock.ANY, + mock.ANY, delay=mock.ANY, do_log=mock.ANY, fair=mock.ANY, + semaphores=mock.ANY) return nwinfo, port_id @mock.patch.object(compute_utils, 'notify_about_instance_action') @@ -10401,6 +10408,7 @@ class ComputeAPITestCase(BaseTestCase): self.context, uuids.info_cache_instance) instance.info_cache.network_info = network_model.NetworkInfo.hydrate( nwinfo) + lock_name = 'interface-%s-%s' % (instance.uuid, port_id) port_allocation = {uuids.rp1: {'NET_BW_EGR_KILOBIT_PER_SEC': 10000}} with test.nested( @@ -10409,8 +10417,9 @@ class ComputeAPITestCase(BaseTestCase): 'remove_resources_from_instance_allocation'), mock.patch.object(self.compute.network_api, 'deallocate_port_for_instance', - return_value=([], port_allocation))) as ( - mock_remove_alloc, mock_deallocate): + return_value=([], port_allocation)), + mock.patch('oslo_concurrency.lockutils.lock')) as ( + mock_remove_alloc, mock_deallocate, mock_lock): self.compute.detach_interface(self.context, instance, port_id) mock_deallocate.assert_called_once_with( @@ -10424,6 +10433,9 @@ class ComputeAPITestCase(BaseTestCase): action='interface_detach', phase='start'), mock.call(self.context, instance, self.compute.host, action='interface_detach', phase='end')]) + mock_lock.assert_called_once_with(lock_name, mock.ANY, mock.ANY, + mock.ANY, delay=mock.ANY, do_log=mock.ANY, fair=mock.ANY, + semaphores=mock.ANY) @mock.patch('nova.compute.manager.LOG.log') def test_detach_interface_failed(self, mock_log): diff --git a/nova/tests/unit/compute/test_compute_mgr.py b/nova/tests/unit/compute/test_compute_mgr.py index c8a483516f1d..e15736019847 100644 --- a/nova/tests/unit/compute/test_compute_mgr.py +++ b/nova/tests/unit/compute/test_compute_mgr.py @@ -2586,8 +2586,9 @@ class ComputeManagerUnitTestCase(test.NoDBTestCase, @mock.patch.object(compute_utils, 'EventReporter') @mock.patch.object(compute_utils, 'add_instance_fault_from_exc') + @mock.patch.object(compute_utils, 'refresh_info_cache_for_instance') @mock.patch.object(self.compute, '_set_instance_obj_error_state') - def do_test(meth, add_fault, event): + def do_test(meth, refresh, add_fault, event): self.assertRaises(exception.PortNotFound, self.compute.detach_interface, self.context, f_instance, 'port_id') @@ -2596,6 +2597,7 @@ class ComputeManagerUnitTestCase(test.NoDBTestCase, event.assert_called_once_with( self.context, 'compute_detach_interface', CONF.host, f_instance.uuid, graceful_exit=False) + refresh.assert_called_once_with(self.context, f_instance) do_test() @@ -2603,8 +2605,9 @@ class ComputeManagerUnitTestCase(test.NoDBTestCase, @mock.patch.object(compute_utils, 'EventReporter') @mock.patch.object(compute_utils, 'add_instance_fault_from_exc') @mock.patch.object(compute_utils, 'notify_about_instance_action') - def test_detach_interface_instance_not_found(self, mock_notify, mock_fault, - mock_event, mock_log): + @mock.patch.object(compute_utils, 'refresh_info_cache_for_instance') + def test_detach_interface_instance_not_found(self, mock_refresh, + mock_notify, mock_fault, mock_event, mock_log): nw_info = network_model.NetworkInfo([ network_model.VIF(uuids.port_id)]) info_cache = objects.InstanceInfoCache(network_info=nw_info, @@ -2617,6 +2620,7 @@ class ComputeManagerUnitTestCase(test.NoDBTestCase, self.assertRaises(exception.InterfaceDetachFailed, self.compute.detach_interface, self.context, instance, uuids.port_id) + mock_refresh.assert_called_once_with(self.context, instance) self.assertEqual(1, mock_log.call_count) self.assertEqual(logging.DEBUG, mock_log.call_args[0][0]) diff --git a/releasenotes/notes/bug-1892870-eb894956bf04713d.yaml b/releasenotes/notes/bug-1892870-eb894956bf04713d.yaml new file mode 100644 index 000000000000..3a4fe968f30a --- /dev/null +++ b/releasenotes/notes/bug-1892870-eb894956bf04713d.yaml @@ -0,0 +1,8 @@ +--- +fixes: + - | + Resolve a race condition that may occur during concurrent + ``interface detach/attach``, resulting in an interface accidentally unbind + after attached. See `bug 1892870`_ for more details. + + .. _bug 1892870: https://bugs.launchpad.net/nova/+bug/1892870