lb: avoid doing nova VIF work plumbing tap to qbr

neutron should rely on nova doing the job instead of trying to 'fix' it.
'Fixing' it introduces race conditions between lb agent and nova VIF
driver. Particularly, lb agent can scan for new tap devices in the
middle of nova plumbing qbr-tap setup, and attempt to do it on its own.
So if agent is more lucky to plug the tap device into the bridge, nova
may fail to do the same, getting the following error:

libvirtError: Unable to add bridge brqxxx-xx port tapxxx-xx: Device or
resource busy

This also requires a change in how the port admin_state_up is implemented
by setting the tap device's link state instead of moving it in or out
of the bridge.

Co-Authored-By: Sean M. Collins <sean@coreitpro.com>
Co-Authored-By: Darragh O'Reilly <darragh.oreilly@hp.com>
Co-Authored-By: Andreas Scheuring <andreas.scheuring@de.ibm.com>
Change-Id: I02971103407b4ec11a65218e9ef7e2708915d938
Closes-Bug: #1312016
This commit is contained in:
Andreas Scheuring 2015-11-11 14:03:08 +01:00
parent c8a7d9bfdb
commit f42ea67995
3 changed files with 138 additions and 76 deletions

View File

@ -32,6 +32,7 @@ FLOATINGIP_STATUS_ERROR = 'ERROR'
DEVICE_OWNER_COMPUTE_PREFIX = "compute:" DEVICE_OWNER_COMPUTE_PREFIX = "compute:"
DEVICE_OWNER_NETWORK_PREFIX = "network:" DEVICE_OWNER_NETWORK_PREFIX = "network:"
DEVICE_OWNER_NEUTRON_PREFIX = "neutron:"
DEVICE_OWNER_ROUTER_HA_INTF = (DEVICE_OWNER_NETWORK_PREFIX + DEVICE_OWNER_ROUTER_HA_INTF = (DEVICE_OWNER_NETWORK_PREFIX +
"router_ha_interface") "router_ha_interface")
@ -45,10 +46,11 @@ DEVICE_OWNER_AGENT_GW = (DEVICE_OWNER_NETWORK_PREFIX +
"floatingip_agent_gateway") "floatingip_agent_gateway")
DEVICE_OWNER_ROUTER_SNAT = (DEVICE_OWNER_NETWORK_PREFIX + DEVICE_OWNER_ROUTER_SNAT = (DEVICE_OWNER_NETWORK_PREFIX +
"router_centralized_snat") "router_centralized_snat")
DEVICE_OWNER_LOADBALANCER = "neutron:LOADBALANCER" DEVICE_OWNER_LOADBALANCER = DEVICE_OWNER_NEUTRON_PREFIX + "LOADBALANCER"
DEVICE_OWNER_LOADBALANCERV2 = "neutron:LOADBALANCERV2" DEVICE_OWNER_LOADBALANCERV2 = DEVICE_OWNER_NEUTRON_PREFIX + "LOADBALANCERV2"
DEVICE_OWNER_PREFIXES = [DEVICE_OWNER_NETWORK_PREFIX, "neutron:"] DEVICE_OWNER_PREFIXES = (DEVICE_OWNER_NETWORK_PREFIX,
DEVICE_OWNER_NEUTRON_PREFIX)
# Collection used to identify devices owned by router interfaces. # Collection used to identify devices owned by router interfaces.
# DEVICE_OWNER_ROUTER_HA_INTF is a special case and so is not included. # DEVICE_OWNER_ROUTER_HA_INTF is a special case and so is not included.

View File

@ -402,7 +402,7 @@ class LinuxBridgeManager(object):
network_id: network_id}) network_id: network_id})
def add_tap_interface(self, network_id, network_type, physical_network, def add_tap_interface(self, network_id, network_type, physical_network,
segmentation_id, tap_device_name): segmentation_id, tap_device_name, device_owner):
"""Add tap interface. """Add tap interface.
If a VIF has been plugged into a network, this function will If a VIF has been plugged into a network, this function will
@ -427,22 +427,26 @@ class LinuxBridgeManager(object):
if not phy_dev_name: if not phy_dev_name:
return False return False
self.ensure_tap_mtu(tap_device_name, phy_dev_name) self.ensure_tap_mtu(tap_device_name, phy_dev_name)
# Avoid messing with plugging devices into a bridge that the agent
# Check if device needs to be added to bridge # does not own
tap_device_in_bridge = bridge_lib.BridgeDevice.get_interface_bridge( if device_owner.startswith(constants.DEVICE_OWNER_PREFIXES):
tap_device_name) # Check if device needs to be added to bridge
if not tap_device_in_bridge: if not bridge_lib.BridgeDevice.get_interface_bridge(
data = {'tap_device_name': tap_device_name, tap_device_name):
'bridge_name': bridge_name} data = {'tap_device_name': tap_device_name,
LOG.debug("Adding device %(tap_device_name)s to bridge " 'bridge_name': bridge_name}
"%(bridge_name)s", data) LOG.debug("Adding device %(tap_device_name)s to bridge "
if bridge_lib.BridgeDevice(bridge_name).addif(tap_device_name): "%(bridge_name)s", data)
return False if bridge_lib.BridgeDevice(bridge_name).addif(tap_device_name):
return False
else: else:
data = {'tap_device_name': tap_device_name, data = {'tap_device_name': tap_device_name,
'device_owner': device_owner,
'bridge_name': bridge_name} 'bridge_name': bridge_name}
LOG.debug("%(tap_device_name)s already exists on bridge " LOG.debug("Skip adding device %(tap_device_name)s to "
"%(bridge_name)s", data) "%(bridge_name)s. It is owned by %(device_owner) and "
"thus added elsewhere."
"elsewhere", data)
return True return True
def ensure_tap_mtu(self, tap_dev_name, phy_dev_name): def ensure_tap_mtu(self, tap_dev_name, phy_dev_name):
@ -451,14 +455,14 @@ class LinuxBridgeManager(object):
ip_lib.IPDevice(tap_dev_name).link.set_mtu(phy_dev_mtu) ip_lib.IPDevice(tap_dev_name).link.set_mtu(phy_dev_mtu)
def add_interface(self, network_id, network_type, physical_network, def add_interface(self, network_id, network_type, physical_network,
segmentation_id, port_id): segmentation_id, port_id, device_owner):
self.network_map[network_id] = NetworkSegment(network_type, self.network_map[network_id] = NetworkSegment(network_type,
physical_network, physical_network,
segmentation_id) segmentation_id)
tap_device_name = self.get_tap_device_name(port_id) tap_device_name = self.get_tap_device_name(port_id)
return self.add_tap_interface(network_id, network_type, return self.add_tap_interface(network_id, network_type,
physical_network, segmentation_id, physical_network, segmentation_id,
tap_device_name) tap_device_name, device_owner)
def delete_bridge(self, bridge_name): def delete_bridge(self, bridge_name):
bridge_device = bridge_lib.BridgeDevice(bridge_name) bridge_device = bridge_lib.BridgeDevice(bridge_name)
@ -896,12 +900,14 @@ class LinuxBridgeNeutronAgentRPC(service.Service):
def setup_linux_bridge(self, bridge_mappings, interface_mappings): def setup_linux_bridge(self, bridge_mappings, interface_mappings):
self.br_mgr = LinuxBridgeManager(bridge_mappings, interface_mappings) self.br_mgr = LinuxBridgeManager(bridge_mappings, interface_mappings)
def remove_port_binding(self, network_id, physical_network, interface_id): def _ensure_port_admin_state(self, port_id, admin_state_up):
bridge_name = self.br_mgr.get_existing_bridge_name(physical_network) LOG.debug("Setting admin_state_up to %s for port %s",
if not bridge_name: admin_state_up, port_id)
bridge_name = self.br_mgr.get_bridge_name(network_id) tap_name = self.br_mgr.get_tap_device_name(port_id)
tap_device_name = self.br_mgr.get_tap_device_name(interface_id) if admin_state_up:
return self.br_mgr.remove_interface(bridge_name, tap_device_name) ip_lib.IPDevice(tap_name).link.set_up()
else:
ip_lib.IPDevice(tap_name).link.set_down()
def process_network_devices(self, device_info): def process_network_devices(self, device_info):
resync_a = False resync_a = False
@ -943,18 +949,56 @@ class LinuxBridgeNeutronAgentRPC(service.Service):
device_details['port_id']) device_details['port_id'])
arp_protect.setup_arp_spoofing_protection(port, arp_protect.setup_arp_spoofing_protection(port,
device_details) device_details)
# create the networking for the port
network_type = device_details.get('network_type')
segmentation_id = device_details.get('segmentation_id')
tap_in_bridge = self.br_mgr.add_interface(
device_details['network_id'], network_type,
device_details['physical_network'], segmentation_id,
device_details['port_id'], device_details['device_owner'])
# REVISIT(scheuran): Changed the way how ports admin_state_up
# is implemented.
#
# Old lb implementation:
# - admin_state_up: ensure that tap is plugged into bridge
# - admin_state_down: remove tap from bridge
# New lb implementation:
# - admin_state_up: set tap device state to up
# - admin_state_down: set tap device stae to down
#
# However both approaches could result in races with
# nova/libvirt and therefore to an invalid system state in the
# scenario, where an instance is booted with a port configured
# with admin_state_up = False:
#
# Libvirt does the following actions in exactly
# this order (see libvirt virnetdevtap.c)
# 1) Create the tap device, set its MAC and MTU
# 2) Plug the tap into the bridge
# 3) Set the tap online
#
# Old lb implementation:
# A race could occur, if the lb agent removes the tap device
# right after step 1). Then libvirt will add it to the bridge
# again in step 2).
# New lb implementation:
# The race could occur if the lb-agent sets the taps device
# state to down right after step 2). In step 3) libvirt
# might set it to up again.
#
# This is not an issue if an instance is booted with a port
# configured with admin_state_up = True. Libvirt would just
# set the tap device up again.
#
# This refactoring is recommended for the following reasons:
# 1) An existing race with libvirt caused by the behavior of
# the old implementation. See Bug #1312016
# 2) The new code is much more readable
self._ensure_port_admin_state(device_details['port_id'],
device_details['admin_state_up'])
# update plugin about port status if admin_state is up
if device_details['admin_state_up']: if device_details['admin_state_up']:
# create the networking for the port if tap_in_bridge:
network_type = device_details.get('network_type')
segmentation_id = device_details.get('segmentation_id')
if self.br_mgr.add_interface(
device_details['network_id'],
network_type,
device_details['physical_network'],
segmentation_id,
device_details['port_id']):
# update plugin about port status
self.plugin_rpc.update_device_up(self.context, self.plugin_rpc.update_device_up(self.context,
device, device,
self.agent_id, self.agent_id,
@ -964,11 +1008,6 @@ class LinuxBridgeNeutronAgentRPC(service.Service):
device, device,
self.agent_id, self.agent_id,
cfg.CONF.host) cfg.CONF.host)
else:
physical_network = device_details['physical_network']
self.remove_port_binding(device_details['network_id'],
physical_network,
device_details['port_id'])
else: else:
LOG.info(_LI("Device %s not defined on plugin"), device) LOG.info(_LI("Device %s not defined on plugin"), device)
return False return False

View File

@ -126,6 +126,7 @@ class TestLinuxBridgeAgent(base.BaseTestCase):
def test_treat_devices_removed_with_existed_device(self): def test_treat_devices_removed_with_existed_device(self):
agent = self.agent agent = self.agent
agent._ensure_port_admin_state = mock.Mock()
devices = [DEVICE_1] devices = [DEVICE_1]
with mock.patch.object(agent.plugin_rpc, with mock.patch.object(agent.plugin_rpc,
"update_device_down") as fn_udd,\ "update_device_down") as fn_udd,\
@ -315,39 +316,22 @@ class TestLinuxBridgeAgent(base.BaseTestCase):
'admin_state_up': True, 'admin_state_up': True,
'network_type': 'vlan', 'network_type': 'vlan',
'segmentation_id': 100, 'segmentation_id': 100,
'physical_network': 'physnet1'} 'physical_network': 'physnet1',
'device_owner': constants.DEVICE_OWNER_NETWORK_PREFIX}
agent.plugin_rpc = mock.Mock() agent.plugin_rpc = mock.Mock()
agent.plugin_rpc.get_devices_details_list.return_value = [mock_details] agent.plugin_rpc.get_devices_details_list.return_value = [mock_details]
agent.br_mgr = mock.Mock() agent.br_mgr = mock.Mock()
agent.br_mgr.add_interface.return_value = True agent.br_mgr.add_interface.return_value = True
agent._ensure_port_admin_state = mock.Mock()
resync_needed = agent.treat_devices_added_updated(set(['tap1'])) resync_needed = agent.treat_devices_added_updated(set(['tap1']))
self.assertFalse(resync_needed) self.assertFalse(resync_needed)
agent.br_mgr.add_interface.assert_called_with('net123', 'vlan', agent.br_mgr.add_interface.assert_called_with(
'physnet1', 100, 'net123', 'vlan', 'physnet1',
'port123') 100, 'port123',
constants.DEVICE_OWNER_NETWORK_PREFIX)
self.assertTrue(agent.plugin_rpc.update_device_up.called) self.assertTrue(agent.plugin_rpc.update_device_up.called)
def test_treat_devices_added_updated_admin_state_up_false(self):
agent = self.agent
mock_details = {'device': 'dev123',
'port_id': 'port123',
'network_id': 'net123',
'admin_state_up': False,
'network_type': 'vlan',
'segmentation_id': 100,
'physical_network': 'physnet1'}
agent.plugin_rpc = mock.Mock()
agent.plugin_rpc.get_devices_details_list.return_value = [mock_details]
agent.remove_port_binding = mock.Mock()
resync_needed = agent.treat_devices_added_updated(set(['tap1']))
self.assertFalse(resync_needed)
agent.remove_port_binding.assert_called_with('net123',
'physnet1',
'port123')
self.assertFalse(agent.plugin_rpc.update_device_up.called)
def test_set_rpc_timeout(self): def test_set_rpc_timeout(self):
self.agent.stop() self.agent.stop()
for rpc_client in (self.agent.plugin_rpc.client, for rpc_client in (self.agent.plugin_rpc.client,
@ -369,6 +353,23 @@ class TestLinuxBridgeAgent(base.BaseTestCase):
self.agent._report_state() self.agent._report_state()
self.assertTrue(self.agent.fullsync) self.assertTrue(self.agent.fullsync)
def _test_ensure_port_admin_state(self, admin_state):
port_id = 'fake_id'
with mock.patch.object(ip_lib, 'IPDevice') as dev_mock:
self.agent._ensure_port_admin_state(port_id, admin_state)
tap_name = self.agent.br_mgr.get_tap_device_name(port_id)
self.assertEqual(admin_state,
dev_mock(tap_name).link.set_up.called)
self.assertNotEqual(admin_state,
dev_mock(tap_name).link.set_down.called)
def test_ensure_port_admin_state_up(self):
self._test_ensure_port_admin_state(True)
def test_ensure_port_admin_state_down(self):
self._test_ensure_port_admin_state(False)
class TestLinuxBridgeManager(base.BaseTestCase): class TestLinuxBridgeManager(base.BaseTestCase):
def setUp(self): def setUp(self):
@ -698,13 +699,21 @@ class TestLinuxBridgeManager(base.BaseTestCase):
"physnet9", "1") "physnet9", "1")
self.assertEqual(1, log.call_count) self.assertEqual(1, log.call_count)
def test_add_tap_interface(self): def test_add_tap_interface_owner_other(self):
with mock.patch.object(ip_lib, "device_exists"):
with mock.patch.object(self.lbm, "ensure_local_bridge"):
self.assertTrue(self.lbm.add_tap_interface("123",
p_const.TYPE_LOCAL,
"physnet1", None,
"tap1", "foo"))
def _test_add_tap_interface(self, dev_owner_prefix):
with mock.patch.object(ip_lib, "device_exists") as de_fn: with mock.patch.object(ip_lib, "device_exists") as de_fn:
de_fn.return_value = False de_fn.return_value = False
self.assertFalse( self.assertFalse(
self.lbm.add_tap_interface("123", p_const.TYPE_VLAN, self.lbm.add_tap_interface("123", p_const.TYPE_VLAN,
"physnet1", "1", "tap1") "physnet1", "1", "tap1",
) dev_owner_prefix))
de_fn.return_value = True de_fn.return_value = True
bridge_device = mock.Mock() bridge_device = mock.Mock()
@ -718,14 +727,16 @@ class TestLinuxBridgeManager(base.BaseTestCase):
self.assertTrue(self.lbm.add_tap_interface("123", self.assertTrue(self.lbm.add_tap_interface("123",
p_const.TYPE_LOCAL, p_const.TYPE_LOCAL,
"physnet1", None, "physnet1", None,
"tap1")) "tap1",
dev_owner_prefix))
en_fn.assert_called_with("123", "brq123") en_fn.assert_called_with("123", "brq123")
self.lbm.bridge_mappings = {"physnet1": "brq999"} self.lbm.bridge_mappings = {"physnet1": "brq999"}
self.assertTrue(self.lbm.add_tap_interface("123", self.assertTrue(self.lbm.add_tap_interface("123",
p_const.TYPE_LOCAL, p_const.TYPE_LOCAL,
"physnet1", None, "physnet1", None,
"tap1")) "tap1",
dev_owner_prefix))
en_fn.assert_called_with("123", "brq999") en_fn.assert_called_with("123", "brq999")
get_br.return_value = False get_br.return_value = False
@ -733,8 +744,8 @@ class TestLinuxBridgeManager(base.BaseTestCase):
self.assertFalse(self.lbm.add_tap_interface("123", self.assertFalse(self.lbm.add_tap_interface("123",
p_const.TYPE_LOCAL, p_const.TYPE_LOCAL,
"physnet1", None, "physnet1", None,
"tap1")) "tap1",
dev_owner_prefix))
with mock.patch.object(self.lbm, with mock.patch.object(self.lbm,
"ensure_physical_in_bridge") as ens_fn,\ "ensure_physical_in_bridge") as ens_fn,\
mock.patch.object(self.lbm, mock.patch.object(self.lbm,
@ -745,21 +756,31 @@ class TestLinuxBridgeManager(base.BaseTestCase):
self.assertFalse(self.lbm.add_tap_interface("123", self.assertFalse(self.lbm.add_tap_interface("123",
p_const.TYPE_VLAN, p_const.TYPE_VLAN,
"physnet1", "1", "physnet1", "1",
"tap1")) "tap1",
dev_owner_prefix))
ens_fn.return_value = "eth0.1" ens_fn.return_value = "eth0.1"
get_br.return_value = "brq123" get_br.return_value = "brq123"
self.lbm.add_tap_interface("123", p_const.TYPE_VLAN, self.lbm.add_tap_interface("123", p_const.TYPE_VLAN,
"physnet1", "1", "tap1") "physnet1", "1", "tap1",
dev_owner_prefix)
en_mtu_fn.assert_called_once_with("tap1", "eth0.1") en_mtu_fn.assert_called_once_with("tap1", "eth0.1")
bridge_device.addif.assert_called_once_with("tap1") bridge_device.addif.assert_called_once_with("tap1")
def test_add_tap_interface_owner_network(self):
self._test_add_tap_interface(constants.DEVICE_OWNER_NETWORK_PREFIX)
def test_add_tap_interface_owner_neutron(self):
self._test_add_tap_interface(constants.DEVICE_OWNER_NEUTRON_PREFIX)
def test_add_interface(self): def test_add_interface(self):
with mock.patch.object(self.lbm, "add_tap_interface") as add_tap: with mock.patch.object(self.lbm, "add_tap_interface") as add_tap:
self.lbm.add_interface("123", p_const.TYPE_VLAN, "physnet-1", self.lbm.add_interface("123", p_const.TYPE_VLAN, "physnet-1",
"1", "234") "1", "234",
constants.DEVICE_OWNER_NETWORK_PREFIX)
add_tap.assert_called_with("123", p_const.TYPE_VLAN, "physnet-1", add_tap.assert_called_with("123", p_const.TYPE_VLAN, "physnet-1",
"1", "tap234") "1", "tap234",
constants.DEVICE_OWNER_NETWORK_PREFIX)
def test_delete_bridge(self): def test_delete_bridge(self):
with mock.patch.object(ip_lib.IPDevice, "exists") as de_fn,\ with mock.patch.object(ip_lib.IPDevice, "exists") as de_fn,\