From 4cb90623193bd6826e279129e993e0ceaf4a1816 Mon Sep 17 00:00:00 2001 From: Andreas Scheuring Date: Wed, 11 Nov 2015 14:03:08 +0100 Subject: [PATCH] lb: avoid doing nova VIF work plumbing tap to qbr neutron should rely on nova doing the job instead of trying to 'fix' it. 'Fixing' it introduces race conditions between lb agent and nova VIF driver. Particularly, lb agent can scan for new tap devices in the middle of nova plumbing qbr-tap setup, and attempt to do it on its own. So if agent is more lucky to plug the tap device into the bridge, nova may fail to do the same, getting the following error: libvirtError: Unable to add bridge brqxxx-xx port tapxxx-xx: Device or resource busy This also requires a change in how the port admin_state_up is implemented by setting the tap device's link state instead of moving it in or out of the bridge. Conflicts: neutron/common/constants.py neutron/plugins/ml2/drivers/linuxbridge/agent/linuxbridge_neutron_agent.py neutron/tests/unit/plugins/ml2/drivers/linuxbridge/agent/test_linuxbridge_neutron_agent.py Co-Authored-By: Sean M. Collins Co-Authored-By: Darragh O'Reilly Co-Authored-By: Andreas Scheuring Closes-Bug: #1312016 (cherry picked from commit f42ea67995537c7fe3e36447489872b0dcb82dd9) === Also squashed in the following follow up fix: lb: Correct String formatting to get rid of logged ValueError The following error is caused by a missing String formatting in the linuxbridge agent: "ValueError: unsupported format character 'a' (0x61) at index 90 Logged from file linuxbridge_neutron_agent.py, line 447" In addition a duplicated word in the log text has been fixed. Change-Id: I587f1165fc7084dc9c4806149b65652f6e27b14e (cherry picked from commit 1f86d8687b2781f0c287ee656f3cbc65aaa4b5e4) === Also squashed in: Only ensure admin state on ports that exist The linux bridge agent was calling ensure_port_admin state unconditionally on ports in treat_devices_added_or_updated. This would cause it to throw an error on interfaces that didn't exist so it would restart the entire processing loop. If another port was being updated in the same loop before this one, that port would experience a port status life-cycle of DOWN->BUILD->ACTIVE->BUILD->ACTIVE ^ <--- Exception in unrelated port causes cycle to start over again. This causes the bug below because the first active transition will cause Nova to boot the VM. At this point tempest tests expect the ports that belong to the VM to be in the ACTIVE state so it filters Neutron port list calls with "status=ACTIVE". Therefore tempest would not get any ports back and assume there was some kind of error with the port and bail. This patch just makes sure the admin state call is skipped if the port doesn't exist and it includes a basic unit test to prevent a regression. Conflicts: neutron/plugins/ml2/drivers/linuxbridge/agent/linuxbridge_neutron_agent.py Closes-Bug: #1523638 Change-Id: I5330c6111cbb20bf45aec9ade7e30d34e8dd16ca (cherry picked from commit 96c67e22f9cba2ea0e7fb3ba2a63e4905e48c1a4) === Change-Id: I02971103407b4ec11a65218e9ef7e2708915d938 --- neutron/common/constants.py | 12 +- .../agent/linuxbridge_neutron_agent.py | 117 ++++++++++++------ .../agent/test_linuxbridge_neutron_agent.py | 115 +++++++++++------ 3 files changed, 165 insertions(+), 79 deletions(-) diff --git a/neutron/common/constants.py b/neutron/common/constants.py index c1c79302a31..059ae363d55 100644 --- a/neutron/common/constants.py +++ b/neutron/common/constants.py @@ -30,6 +30,10 @@ FLOATINGIP_STATUS_ACTIVE = 'ACTIVE' FLOATINGIP_STATUS_DOWN = 'DOWN' FLOATINGIP_STATUS_ERROR = 'ERROR' +DEVICE_OWNER_COMPUTE_PREFIX = "compute:" +DEVICE_OWNER_NETWORK_PREFIX = "network:" +DEVICE_OWNER_NEUTRON_PREFIX = "neutron:" + DEVICE_OWNER_ROUTER_HA_INTF = "network:router_ha_interface" DEVICE_OWNER_ROUTER_INTF = "network:router_interface" DEVICE_OWNER_ROUTER_GW = "network:router_gateway" @@ -38,11 +42,11 @@ DEVICE_OWNER_DHCP = "network:dhcp" DEVICE_OWNER_DVR_INTERFACE = "network:router_interface_distributed" DEVICE_OWNER_AGENT_GW = "network:floatingip_agent_gateway" DEVICE_OWNER_ROUTER_SNAT = "network:router_centralized_snat" -DEVICE_OWNER_LOADBALANCER = "neutron:LOADBALANCER" -DEVICE_OWNER_LOADBALANCERV2 = "neutron:LOADBALANCERV2" +DEVICE_OWNER_LOADBALANCER = DEVICE_OWNER_NEUTRON_PREFIX + "LOADBALANCER" +DEVICE_OWNER_LOADBALANCERV2 = DEVICE_OWNER_NEUTRON_PREFIX + "LOADBALANCERV2" -DEVICE_OWNER_COMPUTE_PREFIX = "compute:" -DEVICE_OWNER_PREFIXES = ["network:", "neutron:"] +DEVICE_OWNER_PREFIXES = (DEVICE_OWNER_NETWORK_PREFIX, + DEVICE_OWNER_NEUTRON_PREFIX) # Collection used to identify devices owned by router interfaces. # DEVICE_OWNER_ROUTER_HA_INTF is a special case and so is not included. diff --git a/neutron/plugins/ml2/drivers/linuxbridge/agent/linuxbridge_neutron_agent.py b/neutron/plugins/ml2/drivers/linuxbridge/agent/linuxbridge_neutron_agent.py index e008c66e345..0a8d76a3336 100644 --- a/neutron/plugins/ml2/drivers/linuxbridge/agent/linuxbridge_neutron_agent.py +++ b/neutron/plugins/ml2/drivers/linuxbridge/agent/linuxbridge_neutron_agent.py @@ -465,12 +465,12 @@ class LinuxBridgeManager(object): network_id: network_id}) def add_tap_interface(self, network_id, network_type, physical_network, - segmentation_id, tap_device_name): + segmentation_id, tap_device_name, device_owner): """Add tap interface and handle interface missing exeptions.""" try: return self._add_tap_interface(network_id, network_type, physical_network, segmentation_id, - tap_device_name) + tap_device_name, device_owner) except Exception: with excutils.save_and_reraise_exception() as ctx: if not ip_lib.device_exists(tap_device_name): @@ -481,7 +481,7 @@ class LinuxBridgeManager(object): return False def _add_tap_interface(self, network_id, network_type, physical_network, - segmentation_id, tap_device_name): + segmentation_id, tap_device_name, device_owner): """Add tap interface. If a VIF has been plugged into a network, this function will @@ -507,20 +507,24 @@ class LinuxBridgeManager(object): return False self.ensure_tap_mtu(tap_device_name, phy_dev_name) - # Check if device needs to be added to bridge - tap_device_in_bridge = self.get_bridge_for_tap_device(tap_device_name) - if not tap_device_in_bridge: - data = {'tap_device_name': tap_device_name, - 'bridge_name': bridge_name} - LOG.debug("Adding device %(tap_device_name)s to bridge " - "%(bridge_name)s", data) - if bridge_lib.BridgeDevice(bridge_name).addif(tap_device_name): - return False + # Avoid messing with plugging devices into a bridge that the agent + # does not own + if device_owner.startswith(constants.DEVICE_OWNER_PREFIXES): + # Check if device needs to be added to bridge + if not self.get_bridge_for_tap_device(tap_device_name): + data = {'tap_device_name': tap_device_name, + 'bridge_name': bridge_name} + LOG.debug("Adding device %(tap_device_name)s to bridge " + "%(bridge_name)s", data) + if bridge_lib.BridgeDevice(bridge_name).addif(tap_device_name): + return False else: data = {'tap_device_name': tap_device_name, + 'device_owner': device_owner, 'bridge_name': bridge_name} - LOG.debug("%(tap_device_name)s already exists on bridge " - "%(bridge_name)s", data) + LOG.debug("Skip adding device %(tap_device_name)s to " + "%(bridge_name)s. It is owned by %(device_owner)s and " + "thus added elsewhere.", data) return True def ensure_tap_mtu(self, tap_dev_name, phy_dev_name): @@ -529,14 +533,14 @@ class LinuxBridgeManager(object): ip_lib.IPDevice(tap_dev_name).link.set_mtu(phy_dev_mtu) def add_interface(self, network_id, network_type, physical_network, - segmentation_id, port_id): + segmentation_id, port_id, device_owner): self.network_map[network_id] = NetworkSegment(network_type, physical_network, segmentation_id) tap_device_name = self.get_tap_device_name(port_id) return self.add_tap_interface(network_id, network_type, physical_network, segmentation_id, - tap_device_name) + tap_device_name, device_owner) def delete_bridge(self, bridge_name): if ip_lib.device_exists(bridge_name): @@ -982,12 +986,14 @@ class LinuxBridgeNeutronAgentRPC(service.Service): def setup_linux_bridge(self, bridge_mappings, interface_mappings): self.br_mgr = LinuxBridgeManager(bridge_mappings, interface_mappings) - def remove_port_binding(self, network_id, physical_network, interface_id): - bridge_name = self.br_mgr.get_existing_bridge_name(physical_network) - if not bridge_name: - bridge_name = self.br_mgr.get_bridge_name(network_id) - tap_device_name = self.br_mgr.get_tap_device_name(interface_id) - return self.br_mgr.remove_interface(bridge_name, tap_device_name) + def _ensure_port_admin_state(self, port_id, admin_state_up): + LOG.debug("Setting admin_state_up to %s for port %s", + admin_state_up, port_id) + tap_name = self.br_mgr.get_tap_device_name(port_id) + if admin_state_up: + ip_lib.IPDevice(tap_name).link.set_up() + else: + ip_lib.IPDevice(tap_name).link.set_down() def process_network_devices(self, device_info): resync_a = False @@ -1031,18 +1037,58 @@ class LinuxBridgeNeutronAgentRPC(service.Service): device_details['port_id']) arp_protect.setup_arp_spoofing_protection(port, device_details) + # create the networking for the port + network_type = device_details.get('network_type') + segmentation_id = device_details.get('segmentation_id') + tap_in_bridge = self.br_mgr.add_interface( + device_details['network_id'], network_type, + device_details['physical_network'], segmentation_id, + device_details['port_id'], device_details['device_owner']) + # REVISIT(scheuran): Changed the way how ports admin_state_up + # is implemented. + # + # Old lb implementation: + # - admin_state_up: ensure that tap is plugged into bridge + # - admin_state_down: remove tap from bridge + # New lb implementation: + # - admin_state_up: set tap device state to up + # - admin_state_down: set tap device stae to down + # + # However both approaches could result in races with + # nova/libvirt and therefore to an invalid system state in the + # scenario, where an instance is booted with a port configured + # with admin_state_up = False: + # + # Libvirt does the following actions in exactly + # this order (see libvirt virnetdevtap.c) + # 1) Create the tap device, set its MAC and MTU + # 2) Plug the tap into the bridge + # 3) Set the tap online + # + # Old lb implementation: + # A race could occur, if the lb agent removes the tap device + # right after step 1). Then libvirt will add it to the bridge + # again in step 2). + # New lb implementation: + # The race could occur if the lb-agent sets the taps device + # state to down right after step 2). In step 3) libvirt + # might set it to up again. + # + # This is not an issue if an instance is booted with a port + # configured with admin_state_up = True. Libvirt would just + # set the tap device up again. + # + # This refactoring is recommended for the following reasons: + # 1) An existing race with libvirt caused by the behavior of + # the old implementation. See Bug #1312016 + # 2) The new code is much more readable + if tap_in_bridge: + self._ensure_port_admin_state( + device_details['port_id'], + device_details['admin_state_up']) + # update plugin about port status if admin_state is up if device_details['admin_state_up']: - # create the networking for the port - network_type = device_details.get('network_type') - segmentation_id = device_details.get('segmentation_id') - if self.br_mgr.add_interface( - device_details['network_id'], - network_type, - device_details['physical_network'], - segmentation_id, - device_details['port_id']): - - # update plugin about port status + if tap_in_bridge: self.plugin_rpc.update_device_up(self.context, device, self.agent_id, @@ -1052,11 +1098,6 @@ class LinuxBridgeNeutronAgentRPC(service.Service): device, self.agent_id, cfg.CONF.host) - else: - physical_network = device_details['physical_network'] - self.remove_port_binding(device_details['network_id'], - physical_network, - device_details['port_id']) else: LOG.info(_LI("Device %s not defined on plugin"), device) return False diff --git a/neutron/tests/unit/plugins/ml2/drivers/linuxbridge/agent/test_linuxbridge_neutron_agent.py b/neutron/tests/unit/plugins/ml2/drivers/linuxbridge/agent/test_linuxbridge_neutron_agent.py index b16fb4cd96e..7a9957e8a6d 100644 --- a/neutron/tests/unit/plugins/ml2/drivers/linuxbridge/agent/test_linuxbridge_neutron_agent.py +++ b/neutron/tests/unit/plugins/ml2/drivers/linuxbridge/agent/test_linuxbridge_neutron_agent.py @@ -147,6 +147,7 @@ class TestLinuxBridgeAgent(base.BaseTestCase): def test_treat_devices_removed_with_existed_device(self): agent = self.agent + agent._ensure_port_admin_state = mock.Mock() devices = [DEVICE_1] with mock.patch.object(agent.plugin_rpc, "update_device_down") as fn_udd,\ @@ -405,6 +406,25 @@ class TestLinuxBridgeAgent(base.BaseTestCase): 'tap4'])) agent.treat_devices_removed.assert_called_with(set(['tap1'])) + def test_treat_devices_added_updated_no_local_interface(self): + agent = self.agent + mock_details = {'device': 'dev123', + 'port_id': 'port123', + 'network_id': 'net123', + 'admin_state_up': True, + 'network_type': 'vlan', + 'segmentation_id': 100, + 'physical_network': 'physnet1', + 'device_owner': constants.DEVICE_OWNER_NETWORK_PREFIX} + agent.ext_manager = mock.Mock() + agent.plugin_rpc = mock.Mock() + agent.plugin_rpc.get_devices_details_list.return_value = [mock_details] + agent.mgr = mock.Mock() + agent.mgr.plug_interface.return_value = False + agent.mgr.ensure_port_admin_state = mock.Mock() + agent.treat_devices_added_updated(set(['tap1'])) + self.assertFalse(agent.mgr.ensure_port_admin_state.called) + def test_treat_devices_added_updated_admin_state_up_true(self): agent = self.agent mock_details = {'device': 'dev123', @@ -413,39 +433,22 @@ class TestLinuxBridgeAgent(base.BaseTestCase): 'admin_state_up': True, 'network_type': 'vlan', 'segmentation_id': 100, - 'physical_network': 'physnet1'} + 'physical_network': 'physnet1', + 'device_owner': constants.DEVICE_OWNER_NETWORK_PREFIX} agent.plugin_rpc = mock.Mock() agent.plugin_rpc.get_devices_details_list.return_value = [mock_details] agent.br_mgr = mock.Mock() agent.br_mgr.add_interface.return_value = True + agent._ensure_port_admin_state = mock.Mock() resync_needed = agent.treat_devices_added_updated(set(['tap1'])) self.assertFalse(resync_needed) - agent.br_mgr.add_interface.assert_called_with('net123', 'vlan', - 'physnet1', 100, - 'port123') + agent.br_mgr.add_interface.assert_called_with( + 'net123', 'vlan', 'physnet1', + 100, 'port123', + constants.DEVICE_OWNER_NETWORK_PREFIX) self.assertTrue(agent.plugin_rpc.update_device_up.called) - def test_treat_devices_added_updated_admin_state_up_false(self): - agent = self.agent - mock_details = {'device': 'dev123', - 'port_id': 'port123', - 'network_id': 'net123', - 'admin_state_up': False, - 'network_type': 'vlan', - 'segmentation_id': 100, - 'physical_network': 'physnet1'} - agent.plugin_rpc = mock.Mock() - agent.plugin_rpc.get_devices_details_list.return_value = [mock_details] - agent.remove_port_binding = mock.Mock() - resync_needed = agent.treat_devices_added_updated(set(['tap1'])) - - self.assertFalse(resync_needed) - agent.remove_port_binding.assert_called_with('net123', - 'physnet1', - 'port123') - self.assertFalse(agent.plugin_rpc.update_device_up.called) - def test_treat_devices_added_updated_prevent_arp_spoofing_true(self): agent = self.agent agent.prevent_arp_spoofing = True @@ -490,6 +493,23 @@ class TestLinuxBridgeAgent(base.BaseTestCase): self.agent._report_state() self.assertTrue(self.agent.fullsync) + def _test_ensure_port_admin_state(self, admin_state): + port_id = 'fake_id' + with mock.patch.object(ip_lib, 'IPDevice') as dev_mock: + self.agent._ensure_port_admin_state(port_id, admin_state) + + tap_name = self.agent.br_mgr.get_tap_device_name(port_id) + self.assertEqual(admin_state, + dev_mock(tap_name).link.set_up.called) + self.assertNotEqual(admin_state, + dev_mock(tap_name).link.set_down.called) + + def test_ensure_port_admin_state_up(self): + self._test_ensure_port_admin_state(True) + + def test_ensure_port_admin_state_down(self): + self._test_ensure_port_admin_state(False) + class TestLinuxBridgeManager(base.BaseTestCase): def setUp(self): @@ -925,22 +945,31 @@ class TestLinuxBridgeManager(base.BaseTestCase): self.assertFalse(self.lbm.add_tap_interface("123", p_const.TYPE_VLAN, "physnet1", None, - "tap1")) + "tap1", "foo")) @mock.patch.object(ip_lib, "device_exists", return_value=True) def test_add_tap_interface_with_other_error(self, exists): with mock.patch.object(self.lbm, "_add_tap_interface", side_effect=RuntimeError("No more fuel")): self.assertRaises(RuntimeError, self.lbm.add_tap_interface, "123", - p_const.TYPE_VLAN, "physnet1", None, "tap1") + p_const.TYPE_VLAN, "physnet1", None, "tap1", + "foo") - def test_add_tap_interface(self): + def test_add_tap_interface_owner_other(self): + with mock.patch.object(ip_lib, "device_exists"): + with mock.patch.object(self.lbm, "ensure_local_bridge"): + self.assertTrue(self.lbm.add_tap_interface("123", + p_const.TYPE_LOCAL, + "physnet1", None, + "tap1", "foo")) + + def _test_add_tap_interface(self, dev_owner_prefix): with mock.patch.object(ip_lib, "device_exists") as de_fn: de_fn.return_value = False self.assertFalse( self.lbm.add_tap_interface("123", p_const.TYPE_VLAN, - "physnet1", "1", "tap1") - ) + "physnet1", "1", "tap1", + dev_owner_prefix)) de_fn.return_value = True bridge_device = mock.Mock() @@ -954,14 +983,16 @@ class TestLinuxBridgeManager(base.BaseTestCase): self.assertTrue(self.lbm.add_tap_interface("123", p_const.TYPE_LOCAL, "physnet1", None, - "tap1")) + "tap1", + dev_owner_prefix)) en_fn.assert_called_with("123", "brq123") self.lbm.bridge_mappings = {"physnet1": "brq999"} self.assertTrue(self.lbm.add_tap_interface("123", p_const.TYPE_LOCAL, "physnet1", None, - "tap1")) + "tap1", + dev_owner_prefix)) en_fn.assert_called_with("123", "brq999") get_br.return_value = False @@ -969,8 +1000,8 @@ class TestLinuxBridgeManager(base.BaseTestCase): self.assertFalse(self.lbm.add_tap_interface("123", p_const.TYPE_LOCAL, "physnet1", None, - "tap1")) - + "tap1", + dev_owner_prefix)) with mock.patch.object(self.lbm, "ensure_physical_in_bridge") as ens_fn,\ mock.patch.object(self.lbm, @@ -981,21 +1012,31 @@ class TestLinuxBridgeManager(base.BaseTestCase): self.assertFalse(self.lbm.add_tap_interface("123", p_const.TYPE_VLAN, "physnet1", "1", - "tap1")) + "tap1", + dev_owner_prefix)) ens_fn.return_value = "eth0.1" get_br.return_value = "brq123" self.lbm.add_tap_interface("123", p_const.TYPE_VLAN, - "physnet1", "1", "tap1") + "physnet1", "1", "tap1", + dev_owner_prefix) en_mtu_fn.assert_called_once_with("tap1", "eth0.1") bridge_device.addif.assert_called_once_with("tap1") + def test_add_tap_interface_owner_network(self): + self._test_add_tap_interface(constants.DEVICE_OWNER_NETWORK_PREFIX) + + def test_add_tap_interface_owner_neutron(self): + self._test_add_tap_interface(constants.DEVICE_OWNER_NEUTRON_PREFIX) + def test_add_interface(self): with mock.patch.object(self.lbm, "add_tap_interface") as add_tap: self.lbm.add_interface("123", p_const.TYPE_VLAN, "physnet-1", - "1", "234") + "1", "234", + constants.DEVICE_OWNER_NETWORK_PREFIX) add_tap.assert_called_with("123", p_const.TYPE_VLAN, "physnet-1", - "1", "tap234") + "1", "tap234", + constants.DEVICE_OWNER_NETWORK_PREFIX) def test_delete_bridge(self): bridge_device = mock.Mock()