From f42ea67995537c7fe3e36447489872b0dcb82dd9 Mon Sep 17 00:00:00 2001 From: Andreas Scheuring Date: Wed, 11 Nov 2015 14:03:08 +0100 Subject: [PATCH] lb: avoid doing nova VIF work plumbing tap to qbr neutron should rely on nova doing the job instead of trying to 'fix' it. 'Fixing' it introduces race conditions between lb agent and nova VIF driver. Particularly, lb agent can scan for new tap devices in the middle of nova plumbing qbr-tap setup, and attempt to do it on its own. So if agent is more lucky to plug the tap device into the bridge, nova may fail to do the same, getting the following error: libvirtError: Unable to add bridge brqxxx-xx port tapxxx-xx: Device or resource busy This also requires a change in how the port admin_state_up is implemented by setting the tap device's link state instead of moving it in or out of the bridge. Co-Authored-By: Sean M. Collins Co-Authored-By: Darragh O'Reilly Co-Authored-By: Andreas Scheuring Change-Id: I02971103407b4ec11a65218e9ef7e2708915d938 Closes-Bug: #1312016 --- neutron/common/constants.py | 8 +- .../agent/linuxbridge_neutron_agent.py | 115 ++++++++++++------ .../agent/test_linuxbridge_neutron_agent.py | 91 ++++++++------ 3 files changed, 138 insertions(+), 76 deletions(-) diff --git a/neutron/common/constants.py b/neutron/common/constants.py index ebbb01499b4..9fdd8dee299 100644 --- a/neutron/common/constants.py +++ b/neutron/common/constants.py @@ -32,6 +32,7 @@ FLOATINGIP_STATUS_ERROR = 'ERROR' DEVICE_OWNER_COMPUTE_PREFIX = "compute:" DEVICE_OWNER_NETWORK_PREFIX = "network:" +DEVICE_OWNER_NEUTRON_PREFIX = "neutron:" DEVICE_OWNER_ROUTER_HA_INTF = (DEVICE_OWNER_NETWORK_PREFIX + "router_ha_interface") @@ -45,10 +46,11 @@ DEVICE_OWNER_AGENT_GW = (DEVICE_OWNER_NETWORK_PREFIX + "floatingip_agent_gateway") DEVICE_OWNER_ROUTER_SNAT = (DEVICE_OWNER_NETWORK_PREFIX + "router_centralized_snat") -DEVICE_OWNER_LOADBALANCER = "neutron:LOADBALANCER" -DEVICE_OWNER_LOADBALANCERV2 = "neutron:LOADBALANCERV2" +DEVICE_OWNER_LOADBALANCER = DEVICE_OWNER_NEUTRON_PREFIX + "LOADBALANCER" +DEVICE_OWNER_LOADBALANCERV2 = DEVICE_OWNER_NEUTRON_PREFIX + "LOADBALANCERV2" -DEVICE_OWNER_PREFIXES = [DEVICE_OWNER_NETWORK_PREFIX, "neutron:"] +DEVICE_OWNER_PREFIXES = (DEVICE_OWNER_NETWORK_PREFIX, + DEVICE_OWNER_NEUTRON_PREFIX) # Collection used to identify devices owned by router interfaces. # DEVICE_OWNER_ROUTER_HA_INTF is a special case and so is not included. diff --git a/neutron/plugins/ml2/drivers/linuxbridge/agent/linuxbridge_neutron_agent.py b/neutron/plugins/ml2/drivers/linuxbridge/agent/linuxbridge_neutron_agent.py index 9036e27fb8e..1cb876d59ab 100644 --- a/neutron/plugins/ml2/drivers/linuxbridge/agent/linuxbridge_neutron_agent.py +++ b/neutron/plugins/ml2/drivers/linuxbridge/agent/linuxbridge_neutron_agent.py @@ -402,7 +402,7 @@ class LinuxBridgeManager(object): network_id: network_id}) def add_tap_interface(self, network_id, network_type, physical_network, - segmentation_id, tap_device_name): + segmentation_id, tap_device_name, device_owner): """Add tap interface. If a VIF has been plugged into a network, this function will @@ -427,22 +427,26 @@ class LinuxBridgeManager(object): if not phy_dev_name: return False self.ensure_tap_mtu(tap_device_name, phy_dev_name) - - # Check if device needs to be added to bridge - tap_device_in_bridge = bridge_lib.BridgeDevice.get_interface_bridge( - tap_device_name) - if not tap_device_in_bridge: - data = {'tap_device_name': tap_device_name, - 'bridge_name': bridge_name} - LOG.debug("Adding device %(tap_device_name)s to bridge " - "%(bridge_name)s", data) - if bridge_lib.BridgeDevice(bridge_name).addif(tap_device_name): - return False + # Avoid messing with plugging devices into a bridge that the agent + # does not own + if device_owner.startswith(constants.DEVICE_OWNER_PREFIXES): + # Check if device needs to be added to bridge + if not bridge_lib.BridgeDevice.get_interface_bridge( + tap_device_name): + data = {'tap_device_name': tap_device_name, + 'bridge_name': bridge_name} + LOG.debug("Adding device %(tap_device_name)s to bridge " + "%(bridge_name)s", data) + if bridge_lib.BridgeDevice(bridge_name).addif(tap_device_name): + return False else: data = {'tap_device_name': tap_device_name, + 'device_owner': device_owner, 'bridge_name': bridge_name} - LOG.debug("%(tap_device_name)s already exists on bridge " - "%(bridge_name)s", data) + LOG.debug("Skip adding device %(tap_device_name)s to " + "%(bridge_name)s. It is owned by %(device_owner) and " + "thus added elsewhere." + "elsewhere", data) return True def ensure_tap_mtu(self, tap_dev_name, phy_dev_name): @@ -451,14 +455,14 @@ class LinuxBridgeManager(object): ip_lib.IPDevice(tap_dev_name).link.set_mtu(phy_dev_mtu) def add_interface(self, network_id, network_type, physical_network, - segmentation_id, port_id): + segmentation_id, port_id, device_owner): self.network_map[network_id] = NetworkSegment(network_type, physical_network, segmentation_id) tap_device_name = self.get_tap_device_name(port_id) return self.add_tap_interface(network_id, network_type, physical_network, segmentation_id, - tap_device_name) + tap_device_name, device_owner) def delete_bridge(self, bridge_name): bridge_device = bridge_lib.BridgeDevice(bridge_name) @@ -896,12 +900,14 @@ class LinuxBridgeNeutronAgentRPC(service.Service): def setup_linux_bridge(self, bridge_mappings, interface_mappings): self.br_mgr = LinuxBridgeManager(bridge_mappings, interface_mappings) - def remove_port_binding(self, network_id, physical_network, interface_id): - bridge_name = self.br_mgr.get_existing_bridge_name(physical_network) - if not bridge_name: - bridge_name = self.br_mgr.get_bridge_name(network_id) - tap_device_name = self.br_mgr.get_tap_device_name(interface_id) - return self.br_mgr.remove_interface(bridge_name, tap_device_name) + def _ensure_port_admin_state(self, port_id, admin_state_up): + LOG.debug("Setting admin_state_up to %s for port %s", + admin_state_up, port_id) + tap_name = self.br_mgr.get_tap_device_name(port_id) + if admin_state_up: + ip_lib.IPDevice(tap_name).link.set_up() + else: + ip_lib.IPDevice(tap_name).link.set_down() def process_network_devices(self, device_info): resync_a = False @@ -943,18 +949,56 @@ class LinuxBridgeNeutronAgentRPC(service.Service): device_details['port_id']) arp_protect.setup_arp_spoofing_protection(port, device_details) + # create the networking for the port + network_type = device_details.get('network_type') + segmentation_id = device_details.get('segmentation_id') + tap_in_bridge = self.br_mgr.add_interface( + device_details['network_id'], network_type, + device_details['physical_network'], segmentation_id, + device_details['port_id'], device_details['device_owner']) + # REVISIT(scheuran): Changed the way how ports admin_state_up + # is implemented. + # + # Old lb implementation: + # - admin_state_up: ensure that tap is plugged into bridge + # - admin_state_down: remove tap from bridge + # New lb implementation: + # - admin_state_up: set tap device state to up + # - admin_state_down: set tap device stae to down + # + # However both approaches could result in races with + # nova/libvirt and therefore to an invalid system state in the + # scenario, where an instance is booted with a port configured + # with admin_state_up = False: + # + # Libvirt does the following actions in exactly + # this order (see libvirt virnetdevtap.c) + # 1) Create the tap device, set its MAC and MTU + # 2) Plug the tap into the bridge + # 3) Set the tap online + # + # Old lb implementation: + # A race could occur, if the lb agent removes the tap device + # right after step 1). Then libvirt will add it to the bridge + # again in step 2). + # New lb implementation: + # The race could occur if the lb-agent sets the taps device + # state to down right after step 2). In step 3) libvirt + # might set it to up again. + # + # This is not an issue if an instance is booted with a port + # configured with admin_state_up = True. Libvirt would just + # set the tap device up again. + # + # This refactoring is recommended for the following reasons: + # 1) An existing race with libvirt caused by the behavior of + # the old implementation. See Bug #1312016 + # 2) The new code is much more readable + self._ensure_port_admin_state(device_details['port_id'], + device_details['admin_state_up']) + # update plugin about port status if admin_state is up if device_details['admin_state_up']: - # create the networking for the port - network_type = device_details.get('network_type') - segmentation_id = device_details.get('segmentation_id') - if self.br_mgr.add_interface( - device_details['network_id'], - network_type, - device_details['physical_network'], - segmentation_id, - device_details['port_id']): - - # update plugin about port status + if tap_in_bridge: self.plugin_rpc.update_device_up(self.context, device, self.agent_id, @@ -964,11 +1008,6 @@ class LinuxBridgeNeutronAgentRPC(service.Service): device, self.agent_id, cfg.CONF.host) - else: - physical_network = device_details['physical_network'] - self.remove_port_binding(device_details['network_id'], - physical_network, - device_details['port_id']) else: LOG.info(_LI("Device %s not defined on plugin"), device) return False diff --git a/neutron/tests/unit/plugins/ml2/drivers/linuxbridge/agent/test_linuxbridge_neutron_agent.py b/neutron/tests/unit/plugins/ml2/drivers/linuxbridge/agent/test_linuxbridge_neutron_agent.py index 0a84f1b2314..db7e36267a7 100644 --- a/neutron/tests/unit/plugins/ml2/drivers/linuxbridge/agent/test_linuxbridge_neutron_agent.py +++ b/neutron/tests/unit/plugins/ml2/drivers/linuxbridge/agent/test_linuxbridge_neutron_agent.py @@ -126,6 +126,7 @@ class TestLinuxBridgeAgent(base.BaseTestCase): def test_treat_devices_removed_with_existed_device(self): agent = self.agent + agent._ensure_port_admin_state = mock.Mock() devices = [DEVICE_1] with mock.patch.object(agent.plugin_rpc, "update_device_down") as fn_udd,\ @@ -315,39 +316,22 @@ class TestLinuxBridgeAgent(base.BaseTestCase): 'admin_state_up': True, 'network_type': 'vlan', 'segmentation_id': 100, - 'physical_network': 'physnet1'} + 'physical_network': 'physnet1', + 'device_owner': constants.DEVICE_OWNER_NETWORK_PREFIX} agent.plugin_rpc = mock.Mock() agent.plugin_rpc.get_devices_details_list.return_value = [mock_details] agent.br_mgr = mock.Mock() agent.br_mgr.add_interface.return_value = True + agent._ensure_port_admin_state = mock.Mock() resync_needed = agent.treat_devices_added_updated(set(['tap1'])) self.assertFalse(resync_needed) - agent.br_mgr.add_interface.assert_called_with('net123', 'vlan', - 'physnet1', 100, - 'port123') + agent.br_mgr.add_interface.assert_called_with( + 'net123', 'vlan', 'physnet1', + 100, 'port123', + constants.DEVICE_OWNER_NETWORK_PREFIX) self.assertTrue(agent.plugin_rpc.update_device_up.called) - def test_treat_devices_added_updated_admin_state_up_false(self): - agent = self.agent - mock_details = {'device': 'dev123', - 'port_id': 'port123', - 'network_id': 'net123', - 'admin_state_up': False, - 'network_type': 'vlan', - 'segmentation_id': 100, - 'physical_network': 'physnet1'} - agent.plugin_rpc = mock.Mock() - agent.plugin_rpc.get_devices_details_list.return_value = [mock_details] - agent.remove_port_binding = mock.Mock() - resync_needed = agent.treat_devices_added_updated(set(['tap1'])) - - self.assertFalse(resync_needed) - agent.remove_port_binding.assert_called_with('net123', - 'physnet1', - 'port123') - self.assertFalse(agent.plugin_rpc.update_device_up.called) - def test_set_rpc_timeout(self): self.agent.stop() for rpc_client in (self.agent.plugin_rpc.client, @@ -369,6 +353,23 @@ class TestLinuxBridgeAgent(base.BaseTestCase): self.agent._report_state() self.assertTrue(self.agent.fullsync) + def _test_ensure_port_admin_state(self, admin_state): + port_id = 'fake_id' + with mock.patch.object(ip_lib, 'IPDevice') as dev_mock: + self.agent._ensure_port_admin_state(port_id, admin_state) + + tap_name = self.agent.br_mgr.get_tap_device_name(port_id) + self.assertEqual(admin_state, + dev_mock(tap_name).link.set_up.called) + self.assertNotEqual(admin_state, + dev_mock(tap_name).link.set_down.called) + + def test_ensure_port_admin_state_up(self): + self._test_ensure_port_admin_state(True) + + def test_ensure_port_admin_state_down(self): + self._test_ensure_port_admin_state(False) + class TestLinuxBridgeManager(base.BaseTestCase): def setUp(self): @@ -698,13 +699,21 @@ class TestLinuxBridgeManager(base.BaseTestCase): "physnet9", "1") self.assertEqual(1, log.call_count) - def test_add_tap_interface(self): + def test_add_tap_interface_owner_other(self): + with mock.patch.object(ip_lib, "device_exists"): + with mock.patch.object(self.lbm, "ensure_local_bridge"): + self.assertTrue(self.lbm.add_tap_interface("123", + p_const.TYPE_LOCAL, + "physnet1", None, + "tap1", "foo")) + + def _test_add_tap_interface(self, dev_owner_prefix): with mock.patch.object(ip_lib, "device_exists") as de_fn: de_fn.return_value = False self.assertFalse( self.lbm.add_tap_interface("123", p_const.TYPE_VLAN, - "physnet1", "1", "tap1") - ) + "physnet1", "1", "tap1", + dev_owner_prefix)) de_fn.return_value = True bridge_device = mock.Mock() @@ -718,14 +727,16 @@ class TestLinuxBridgeManager(base.BaseTestCase): self.assertTrue(self.lbm.add_tap_interface("123", p_const.TYPE_LOCAL, "physnet1", None, - "tap1")) + "tap1", + dev_owner_prefix)) en_fn.assert_called_with("123", "brq123") self.lbm.bridge_mappings = {"physnet1": "brq999"} self.assertTrue(self.lbm.add_tap_interface("123", p_const.TYPE_LOCAL, "physnet1", None, - "tap1")) + "tap1", + dev_owner_prefix)) en_fn.assert_called_with("123", "brq999") get_br.return_value = False @@ -733,8 +744,8 @@ class TestLinuxBridgeManager(base.BaseTestCase): self.assertFalse(self.lbm.add_tap_interface("123", p_const.TYPE_LOCAL, "physnet1", None, - "tap1")) - + "tap1", + dev_owner_prefix)) with mock.patch.object(self.lbm, "ensure_physical_in_bridge") as ens_fn,\ mock.patch.object(self.lbm, @@ -745,21 +756,31 @@ class TestLinuxBridgeManager(base.BaseTestCase): self.assertFalse(self.lbm.add_tap_interface("123", p_const.TYPE_VLAN, "physnet1", "1", - "tap1")) + "tap1", + dev_owner_prefix)) ens_fn.return_value = "eth0.1" get_br.return_value = "brq123" self.lbm.add_tap_interface("123", p_const.TYPE_VLAN, - "physnet1", "1", "tap1") + "physnet1", "1", "tap1", + dev_owner_prefix) en_mtu_fn.assert_called_once_with("tap1", "eth0.1") bridge_device.addif.assert_called_once_with("tap1") + def test_add_tap_interface_owner_network(self): + self._test_add_tap_interface(constants.DEVICE_OWNER_NETWORK_PREFIX) + + def test_add_tap_interface_owner_neutron(self): + self._test_add_tap_interface(constants.DEVICE_OWNER_NEUTRON_PREFIX) + def test_add_interface(self): with mock.patch.object(self.lbm, "add_tap_interface") as add_tap: self.lbm.add_interface("123", p_const.TYPE_VLAN, "physnet-1", - "1", "234") + "1", "234", + constants.DEVICE_OWNER_NETWORK_PREFIX) add_tap.assert_called_with("123", p_const.TYPE_VLAN, "physnet-1", - "1", "tap234") + "1", "tap234", + constants.DEVICE_OWNER_NETWORK_PREFIX) def test_delete_bridge(self): with mock.patch.object(ip_lib.IPDevice, "exists") as de_fn,\