Browse Source

lb: avoid doing nova VIF work plumbing tap to qbr

neutron should rely on nova doing the job instead of trying to 'fix' it.
'Fixing' it introduces race conditions between lb agent and nova VIF
driver. Particularly, lb agent can scan for new tap devices in the
middle of nova plumbing qbr-tap setup, and attempt to do it on its own.
So if agent is more lucky to plug the tap device into the bridge, nova
may fail to do the same, getting the following error:

libvirtError: Unable to add bridge brqxxx-xx port tapxxx-xx: Device or
resource busy

This also requires a change in how the port admin_state_up is implemented
by setting the tap device's link state instead of moving it in or out
of the bridge.

Conflicts:
	neutron/common/constants.py
	neutron/plugins/ml2/drivers/linuxbridge/agent/linuxbridge_neutron_agent.py
	neutron/tests/unit/plugins/ml2/drivers/linuxbridge/agent/test_linuxbridge_neutron_agent.py

Co-Authored-By: Sean M. Collins <sean@coreitpro.com>
Co-Authored-By: Darragh O'Reilly <darragh.oreilly@hp.com>
Co-Authored-By: Andreas Scheuring <andreas.scheuring@de.ibm.com>
Closes-Bug: #1312016
(cherry picked from commit f42ea67995)
(cherry picked from commit eb61b837f7)

===

Also squashed the following follow up fix:

lb: Correct String formatting to get rid of logged ValueError

The following error is caused by a missing String formatting in the
linuxbridge agent:
"ValueError: unsupported format character 'a' (0x61) at index 90
Logged from file linuxbridge_neutron_agent.py, line 447"

In addition a duplicated word in the log text has been fixed.

Change-Id: I587f1165fc7084dc9c4806149b65652f6e27b14e
(cherry picked from commit 1f86d8687b)

===

Also squashed in:

Only ensure admin state on ports that exist

The linux bridge agent was calling ensure_port_admin state
unconditionally on ports in treat_devices_added_or_updated.
This would cause it to throw an error on interfaces that
didn't exist so it would restart the entire processing loop.

If another port was being updated in the same loop before this
one, that port would experience a port status life-cycle of
DOWN->BUILD->ACTIVE->BUILD->ACTIVE
                   ^ <--- Exception in unrelated port causes cycle
                          to start over again.

This causes the bug below because the first active transition will
cause Nova to boot the VM. At this point tempest tests expect the
ports that belong to the VM to be in the ACTIVE state so it filters
Neutron port list calls with "status=ACTIVE". Therefore tempest would
not get any ports back and assume there was some kind of error with
the port and bail.

This patch just makes sure the admin state call is skipped if the port
doesn't exist and it includes a basic unit test to prevent a regression.

Conflicts:
	neutron/plugins/ml2/drivers/linuxbridge/agent/linuxbridge_neutron_agent.py

Closes-Bug: #1523638
Change-Id: I5330c6111cbb20bf45aec9ade7e30d34e8dd16ca
(cherry picked from commit 96c67e22f9)

Change-Id: I02971103407b4ec11a65218e9ef7e2708915d938
changes/03/296803/4
Andreas Scheuring 6 years ago
committed by Ihar Hrachyshka
parent
commit
a8b300ac6d
3 changed files with 162 additions and 67 deletions
  1. +9
    -2
      neutron/common/constants.py
  2. +85
    -39
      neutron/plugins/linuxbridge/agent/linuxbridge_neutron_agent.py
  3. +68
    -26
      neutron/tests/unit/plugins/linuxbridge/agent/test_linuxbridge_neutron_agent.py

+ 9
- 2
neutron/common/constants.py View File

@ -30,6 +30,10 @@ FLOATINGIP_STATUS_ACTIVE = 'ACTIVE'
FLOATINGIP_STATUS_DOWN = 'DOWN'
FLOATINGIP_STATUS_ERROR = 'ERROR'
DEVICE_OWNER_COMPUTE_PREFIX = "compute:"
DEVICE_OWNER_NETWORK_PREFIX = "network:"
DEVICE_OWNER_NEUTRON_PREFIX = "neutron:"
DEVICE_OWNER_ROUTER_HA_INTF = "network:router_ha_interface"
DEVICE_OWNER_ROUTER_INTF = "network:router_interface"
DEVICE_OWNER_ROUTER_GW = "network:router_gateway"
@ -38,8 +42,11 @@ DEVICE_OWNER_DHCP = "network:dhcp"
DEVICE_OWNER_DVR_INTERFACE = "network:router_interface_distributed"
DEVICE_OWNER_AGENT_GW = "network:floatingip_agent_gateway"
DEVICE_OWNER_ROUTER_SNAT = "network:router_centralized_snat"
DEVICE_OWNER_LOADBALANCER = "neutron:LOADBALANCER"
DEVICE_OWNER_LOADBALANCERV2 = "neutron:LOADBALANCERV2"
DEVICE_OWNER_LOADBALANCER = DEVICE_OWNER_NEUTRON_PREFIX + "LOADBALANCER"
DEVICE_OWNER_LOADBALANCERV2 = DEVICE_OWNER_NEUTRON_PREFIX + "LOADBALANCERV2"
DEVICE_OWNER_PREFIXES = (DEVICE_OWNER_NETWORK_PREFIX,
DEVICE_OWNER_NEUTRON_PREFIX)
# Collection used to identify devices owned by router interfaces.
# DEVICE_OWNER_ROUTER_HA_INTF is a special case and so is not included.


+ 85
- 39
neutron/plugins/linuxbridge/agent/linuxbridge_neutron_agent.py View File

@ -372,7 +372,7 @@ class LinuxBridgeManager(object):
network_id: network_id})
def add_tap_interface(self, network_id, network_type, physical_network,
segmentation_id, tap_device_name):
segmentation_id, tap_device_name, device_owner):
"""Add tap interface.
If a VIF has been plugged into a network, this function will
@ -395,21 +395,26 @@ class LinuxBridgeManager(object):
return False
self.ensure_tap_mtu(tap_device_name, phy_dev_name)
# Check if device needs to be added to bridge
tap_device_in_bridge = self.get_bridge_for_tap_device(tap_device_name)
if not tap_device_in_bridge:
data = {'tap_device_name': tap_device_name,
'bridge_name': bridge_name}
LOG.debug("Adding device %(tap_device_name)s to bridge "
"%(bridge_name)s", data)
if utils.execute(['brctl', 'addif', bridge_name, tap_device_name],
run_as_root=True):
return False
# Avoid messing with plugging devices into a bridge that the agent
# does not own
if device_owner.startswith(constants.DEVICE_OWNER_PREFIXES):
# Check if device needs to be added to bridge
if not self.get_bridge_for_tap_device(tap_device_name):
data = {'tap_device_name': tap_device_name,
'bridge_name': bridge_name}
LOG.debug("Adding device %(tap_device_name)s to bridge "
"%(bridge_name)s", data)
if utils.execute(['brctl', 'addif',
bridge_name, tap_device_name],
run_as_root=True):
return False
else:
data = {'tap_device_name': tap_device_name,
'device_owner': device_owner,
'bridge_name': bridge_name}
LOG.debug("%(tap_device_name)s already exists on bridge "
"%(bridge_name)s", data)
LOG.debug("Skip adding device %(tap_device_name)s to "
"%(bridge_name)s. It is owned by %(device_owner)s and "
"thus added elsewhere.", data)
return True
def ensure_tap_mtu(self, tap_dev_name, phy_dev_name):
@ -418,14 +423,14 @@ class LinuxBridgeManager(object):
ip_lib.IPDevice(tap_dev_name).link.set_mtu(phy_dev_mtu)
def add_interface(self, network_id, network_type, physical_network,
segmentation_id, port_id):
segmentation_id, port_id, device_owner):
self.network_map[network_id] = NetworkSegment(network_type,
physical_network,
segmentation_id)
tap_device_name = self.get_tap_device_name(port_id)
return self.add_tap_interface(network_id, network_type,
physical_network, segmentation_id,
tap_device_name)
tap_device_name, device_owner)
def delete_vlan_bridge(self, bridge_name):
if ip_lib.device_exists(bridge_name):
@ -831,10 +836,14 @@ class LinuxBridgeNeutronAgentRPC(object):
def setup_linux_bridge(self, interface_mappings):
self.br_mgr = LinuxBridgeManager(interface_mappings)
def remove_port_binding(self, network_id, interface_id):
bridge_name = self.br_mgr.get_bridge_name(network_id)
tap_device_name = self.br_mgr.get_tap_device_name(interface_id)
return self.br_mgr.remove_interface(bridge_name, tap_device_name)
def _ensure_port_admin_state(self, port_id, admin_state_up):
LOG.debug("Setting admin_state_up to %s for port %s",
admin_state_up, port_id)
tap_name = self.br_mgr.get_tap_device_name(port_id)
if admin_state_up:
ip_lib.IPDevice(tap_name).link.set_up()
else:
ip_lib.IPDevice(tap_name).link.set_down()
def process_network_devices(self, device_info):
resync_a = False
@ -881,24 +890,64 @@ class LinuxBridgeNeutronAgentRPC(object):
device_details['port_id'])
arp_protect.setup_arp_spoofing_protection(port,
device_details)
# create the networking for the port
network_type = device_details.get('network_type')
if network_type:
segmentation_id = device_details.get('segmentation_id')
else:
# compatibility with pre-Havana RPC vlan_id encoding
vlan_id = device_details.get('vlan_id')
(network_type,
segmentation_id) = lconst.interpret_vlan_id(vlan_id)
tap_in_bridge = self.br_mgr.add_interface(
device_details['network_id'], network_type,
device_details['physical_network'], segmentation_id,
device_details['port_id'], device_details['device_owner'])
# REVISIT(scheuran): Changed the way how ports admin_state_up
# is implemented.
#
# Old lb implementation:
# - admin_state_up: ensure that tap is plugged into bridge
# - admin_state_down: remove tap from bridge
# New lb implementation:
# - admin_state_up: set tap device state to up
# - admin_state_down: set tap device stae to down
#
# However both approaches could result in races with
# nova/libvirt and therefore to an invalid system state in the
# scenario, where an instance is booted with a port configured
# with admin_state_up = False:
#
# Libvirt does the following actions in exactly
# this order (see libvirt virnetdevtap.c)
# 1) Create the tap device, set its MAC and MTU
# 2) Plug the tap into the bridge
# 3) Set the tap online
#
# Old lb implementation:
# A race could occur, if the lb agent removes the tap device
# right after step 1). Then libvirt will add it to the bridge
# again in step 2).
# New lb implementation:
# The race could occur if the lb-agent sets the taps device
# state to down right after step 2). In step 3) libvirt
# might set it to up again.
#
# This is not an issue if an instance is booted with a port
# configured with admin_state_up = True. Libvirt would just
# set the tap device up again.
#
# This refactoring is recommended for the following reasons:
# 1) An existing race with libvirt caused by the behavior of
# the old implementation. See Bug #1312016
# 2) The new code is much more readable
if tap_in_bridge:
self._ensure_port_admin_state(
device_details['port_id'],
device_details['admin_state_up'])
# update plugin about port status if admin_state is up
if device_details['admin_state_up']:
# create the networking for the port
network_type = device_details.get('network_type')
if network_type:
segmentation_id = device_details.get('segmentation_id')
else:
# compatibility with pre-Havana RPC vlan_id encoding
vlan_id = device_details.get('vlan_id')
(network_type,
segmentation_id) = lconst.interpret_vlan_id(vlan_id)
if self.br_mgr.add_interface(
device_details['network_id'],
network_type,
device_details['physical_network'],
segmentation_id,
device_details['port_id']):
# update plugin about port status
if tap_in_bridge:
self.plugin_rpc.update_device_up(self.context,
device,
self.agent_id,
@ -908,9 +957,6 @@ class LinuxBridgeNeutronAgentRPC(object):
device,
self.agent_id,
cfg.CONF.host)
else:
self.remove_port_binding(device_details['network_id'],
device_details['port_id'])
else:
LOG.info(_LI("Device %s not defined on plugin"), device)
return False


+ 68
- 26
neutron/tests/unit/plugins/linuxbridge/agent/test_linuxbridge_neutron_agent.py View File

@ -105,6 +105,7 @@ class TestLinuxBridgeAgent(base.BaseTestCase):
def test_treat_devices_removed_with_existed_device(self):
agent = self.agent
agent._ensure_port_admin_state = mock.Mock()
devices = [DEVICE_1]
with contextlib.nested(
mock.patch.object(agent.plugin_rpc, "update_device_down"),
@ -332,7 +333,7 @@ class TestLinuxBridgeAgent(base.BaseTestCase):
'tap4']))
agent.treat_devices_removed.assert_called_with(set(['tap1']))
def test_treat_devices_added_updated_admin_state_up_true(self):
def test_treat_devices_added_updated_no_local_interface(self):
agent = self.agent
mock_details = {'device': 'dev123',
'port_id': 'port123',
@ -340,36 +341,57 @@ class TestLinuxBridgeAgent(base.BaseTestCase):
'admin_state_up': True,
'network_type': 'vlan',
'segmentation_id': 100,
'physical_network': 'physnet1'}
'physical_network': 'physnet1',
'device_owner': constants.DEVICE_OWNER_NETWORK_PREFIX}
agent.ext_manager = mock.Mock()
agent.plugin_rpc = mock.Mock()
agent.plugin_rpc.get_devices_details_list.return_value = [mock_details]
agent.br_mgr = mock.Mock()
agent.br_mgr.add_interface.return_value = True
resync_needed = agent.treat_devices_added_updated(set(['tap1']))
self.assertFalse(resync_needed)
agent.br_mgr.add_interface.assert_called_with('net123', 'vlan',
'physnet1', 100,
'port123')
self.assertTrue(agent.plugin_rpc.update_device_up.called)
agent.mgr = mock.Mock()
agent.mgr.plug_interface.return_value = False
agent.mgr.ensure_port_admin_state = mock.Mock()
agent.treat_devices_added_updated(set(['tap1']))
self.assertFalse(agent.mgr.ensure_port_admin_state.called)
def test_treat_devices_added_updated_admin_state_up_false(self):
def test_treat_devices_added_updated_admin_state_up_true(self):
agent = self.agent
mock_details = {'device': 'dev123',
'port_id': 'port123',
'network_id': 'net123',
'admin_state_up': False,
'admin_state_up': True,
'network_type': 'vlan',
'segmentation_id': 100,
'physical_network': 'physnet1'}
'physical_network': 'physnet1',
'device_owner': constants.DEVICE_OWNER_NETWORK_PREFIX}
agent.plugin_rpc = mock.Mock()
agent.plugin_rpc.get_devices_details_list.return_value = [mock_details]
agent.remove_port_binding = mock.Mock()
agent.br_mgr = mock.Mock()
agent.br_mgr.add_interface.return_value = True
agent._ensure_port_admin_state = mock.Mock()
resync_needed = agent.treat_devices_added_updated(set(['tap1']))
self.assertFalse(resync_needed)
agent.remove_port_binding.assert_called_with('net123', 'port123')
self.assertFalse(agent.plugin_rpc.update_device_up.called)
agent.br_mgr.add_interface.assert_called_with(
'net123', 'vlan', 'physnet1',
100, 'port123',
constants.DEVICE_OWNER_NETWORK_PREFIX)
self.assertTrue(agent.plugin_rpc.update_device_up.called)
def _test_ensure_port_admin_state(self, admin_state):
port_id = 'fake_id'
with mock.patch.object(ip_lib, 'IPDevice') as dev_mock:
self.agent._ensure_port_admin_state(port_id, admin_state)
tap_name = self.agent.br_mgr.get_tap_device_name(port_id)
self.assertEqual(admin_state,
dev_mock(tap_name).link.set_up.called)
self.assertNotEqual(admin_state,
dev_mock(tap_name).link.set_down.called)
def test_ensure_port_admin_state_up(self):
self._test_ensure_port_admin_state(True)
def test_ensure_port_admin_state_down(self):
self._test_ensure_port_admin_state(False)
class TestLinuxBridgeManager(base.BaseTestCase):
@ -685,13 +707,21 @@ class TestLinuxBridgeManager(base.BaseTestCase):
)
self.assertTrue(vlbr_fn.called)
def test_add_tap_interface(self):
def test_add_tap_interface_owner_other(self):
with mock.patch.object(ip_lib, "device_exists"):
with mock.patch.object(self.lbm, "ensure_local_bridge"):
self.assertTrue(self.lbm.add_tap_interface("123",
p_const.TYPE_LOCAL,
"physnet1", None,
"tap1", "foo"))
def _test_add_tap_interface(self, dev_owner_prefix):
with mock.patch.object(ip_lib, "device_exists") as de_fn:
de_fn.return_value = False
self.assertFalse(
self.lbm.add_tap_interface("123", p_const.TYPE_VLAN,
"physnet1", "1", "tap1")
)
"physnet1", "1", "tap1",
dev_owner_prefix))
de_fn.return_value = True
with contextlib.nested(
@ -704,7 +734,8 @@ class TestLinuxBridgeManager(base.BaseTestCase):
self.assertTrue(self.lbm.add_tap_interface("123",
p_const.TYPE_LOCAL,
"physnet1", None,
"tap1"))
"tap1",
dev_owner_prefix))
en_fn.assert_called_with("123")
get_br.return_value = False
@ -712,7 +743,8 @@ class TestLinuxBridgeManager(base.BaseTestCase):
self.assertFalse(self.lbm.add_tap_interface("123",
p_const.TYPE_LOCAL,
"physnet1", None,
"tap1"))
"tap1",
dev_owner_prefix))
with contextlib.nested(
mock.patch.object(self.lbm, "ensure_physical_in_bridge"),
@ -723,20 +755,30 @@ class TestLinuxBridgeManager(base.BaseTestCase):
self.assertFalse(self.lbm.add_tap_interface("123",
p_const.TYPE_VLAN,
"physnet1", "1",
"tap1"))
"tap1",
dev_owner_prefix))
ens_fn.return_value = "eth0.1"
get_br.return_value = "brq123"
self.lbm.add_tap_interface("123", p_const.TYPE_VLAN,
"physnet1", "1", "tap1")
"physnet1", "1", "tap1",
dev_owner_prefix)
en_mtu_fn.assert_called_once_with("tap1", "eth0.1")
def test_add_tap_interface_owner_network(self):
self._test_add_tap_interface(constants.DEVICE_OWNER_NETWORK_PREFIX)
def test_add_tap_interface_owner_neutron(self):
self._test_add_tap_interface(constants.DEVICE_OWNER_NEUTRON_PREFIX)
def test_add_interface(self):
with mock.patch.object(self.lbm, "add_tap_interface") as add_tap:
self.lbm.add_interface("123", p_const.TYPE_VLAN, "physnet-1",
"1", "234")
"1", "234",
constants.DEVICE_OWNER_NETWORK_PREFIX)
add_tap.assert_called_with("123", p_const.TYPE_VLAN, "physnet-1",
"1", "tap234")
"1", "tap234",
constants.DEVICE_OWNER_NETWORK_PREFIX)
def test_delete_vlan_bridge(self):
with contextlib.nested(


Loading…
Cancel
Save