From abb31b69a701148cb01f680083de8c29e89147f6 Mon Sep 17 00:00:00 2001 From: Felix Huettner Date: Fri, 22 Sep 2023 16:25:10 +0200 Subject: [PATCH] fix netns deletion of broken namespaces normal network namespaces are bind-mounted to files under /var/run/netns. If a process deleting a network namespace gets killed during that operation there is the chance that the bind mount to the netns has been removed, but the file under /var/run/netns still exists. When the neutron-ovn-metadata-agent tries to clean up such network namespaces it first tires to validate that the network namespace is empty. For the cases described above this fails, as this network namespace no longer really exists, but is just a stray file laying around. To fix this we treat network namespaces where we get an `OSError` with errno 22 (Invalid Argument) as empty. The calls to pyroute2 to delete the namespace will then clean up the file. Additionally we add a guard to teardown_datapath to continue even if this fails. failing to remove a datapath is not critical and leaves in the worst case a process and a network namespace running, however previously it would have also prevented the creation of new datapaths which is critical for VM startup. Closes-Bug: #2037102 Change-Id: I7c43812fed5903f98a2e491076c24a8d926a59b4 (cherry picked from commit 566fea3fed837b0130023303c770aade391d3d61) --- neutron/agent/linux/ip_lib.py | 17 ++++++++++++- neutron/agent/ovn/metadata/agent.py | 5 +++- neutron/tests/unit/agent/linux/test_ip_lib.py | 15 +++++++++++ .../unit/agent/ovn/metadata/test_agent.py | 25 +++++++++++++++++++ 4 files changed, 60 insertions(+), 2 deletions(-) diff --git a/neutron/agent/linux/ip_lib.py b/neutron/agent/linux/ip_lib.py index 5a5ec63fa08..6143cb07184 100644 --- a/neutron/agent/linux/ip_lib.py +++ b/neutron/agent/linux/ip_lib.py @@ -260,7 +260,22 @@ class IPWrapper(SubProcessBase): return ip def namespace_is_empty(self): - return not self.get_devices() + try: + return not self.get_devices() + except OSError as e: + # This can happen if we previously got terminated in the middle of + # removing this namespace. In this case the bind mount of the + # namespace under /var/run/netns will be removed, but the namespace + # file is still there. As the bind mount is gone we can no longer + # access the namespace to validate that it is empty. But since it + # should have already been removed we are sure that the check has + # passed the last time and since the namespace is unuseable that + # can not have changed. + # Future calls to pyroute2 to remove that namespace will clean up + # the leftover file. + if e.errno == errno.EINVAL: + return True + raise e def garbage_collect_namespace(self): """Conditionally destroy the namespace if it is empty.""" diff --git a/neutron/agent/ovn/metadata/agent.py b/neutron/agent/ovn/metadata/agent.py index f0be39d235c..54a8e54056e 100644 --- a/neutron/agent/ovn/metadata/agent.py +++ b/neutron/agent/ovn/metadata/agent.py @@ -371,7 +371,10 @@ class MetadataAgent(object): ns.startswith(NS_PREFIX) and ns not in metadata_namespaces] for ns in unused_namespaces: - self.teardown_datapath(self._get_datapath_name(ns)) + try: + self.teardown_datapath(self._get_datapath_name(ns)) + except Exception: + LOG.exception('Error unable to destroy namespace: %s', ns) # resync all network namespaces based on the associated datapaths, # even those that are already running. This is to make sure diff --git a/neutron/tests/unit/agent/linux/test_ip_lib.py b/neutron/tests/unit/agent/linux/test_ip_lib.py index c5ccf56acf2..22ff6aaf8a1 100644 --- a/neutron/tests/unit/agent/linux/test_ip_lib.py +++ b/neutron/tests/unit/agent/linux/test_ip_lib.py @@ -356,6 +356,21 @@ class TestIpWrapper(base.BaseTestCase): self.assertNotIn(mock.call().delete('ns'), ip_ns_cmd_cls.mock_calls) + def test_garbage_collect_namespace_existing_broken(self): + with mock.patch.object(ip_lib, 'IpNetnsCommand') as ip_ns_cmd_cls: + ip_ns_cmd_cls.return_value.exists.return_value = True + + ip = ip_lib.IPWrapper(namespace='ns') + + with mock.patch.object(ip, 'get_devices', + side_effect=OSError(errno.EINVAL, None) + ) as mock_get_devices: + self.assertTrue(ip.garbage_collect_namespace()) + + mock_get_devices.assert_called_once_with() + expected = [mock.call().delete('ns')] + ip_ns_cmd_cls.assert_has_calls(expected) + @mock.patch.object(priv_lib, 'create_interface') def test_add_vlan(self, create): retval = ip_lib.IPWrapper().add_vlan('eth0.1', 'eth0', '1') diff --git a/neutron/tests/unit/agent/ovn/metadata/test_agent.py b/neutron/tests/unit/agent/ovn/metadata/test_agent.py index 8bdfd1c6208..ab114065fb5 100644 --- a/neutron/tests/unit/agent/ovn/metadata/test_agent.py +++ b/neutron/tests/unit/agent/ovn/metadata/test_agent.py @@ -134,6 +134,31 @@ class TestMetadataAgent(base.BaseTestCase): lnn.assert_called_once_with() tdp.assert_called_once_with('3') + def test_sync_teardown_namespace_does_not_crash_on_error(self): + """Test that sync tears down unneeded metadata namespaces. + Even if that fails it continues to provision other datapaths + """ + with mock.patch.object( + self.agent, 'provision_datapath') as pdp,\ + mock.patch.object( + ip_lib, 'list_network_namespaces', + return_value=['ovnmeta-1', 'ovnmeta-2', 'ovnmeta-3', + 'ns1', 'ns2']) as lnn,\ + mock.patch.object( + self.agent, 'teardown_datapath', + side_effect=Exception()) as tdp: + self.agent.sync() + + pdp.assert_has_calls( + [ + mock.call(p.datapath) + for p in self.ports + ], + any_order=True + ) + lnn.assert_called_once_with() + tdp.assert_called_once_with('3') + def test_get_networks_datapaths(self): """Test get_networks_datapaths returns only datapath objects for the networks containing vif ports of type ''(blank) and 'external'.