From 93e9dc5426764b791ac69e62c6d60be7591c16ab Mon Sep 17 00:00:00 2001
From: Darragh O'Reilly
Date: Fri, 6 Dec 2019 10:06:21 +0000
Subject: [PATCH] ovs agent: signal to plugin if tunnel refresh needed

Currently the ovs agent calls update_device_list with the
agent_restarted flag set only on the first loop iteration. Then the
server knows to send the l2pop flooding entries for the network to the
agent. But when a compute node with many instances on many networks
reboots, it takes time to re-add all the active devices and some may be
re-added after the first loop iteration. Then the server can fail to
send the flooding entries, which means there will be no flood_to_tuns
flow and broadcasts like DHCP will fail.

This patch fixes that by renaming the agent_restarted flag to
refresh_tunnels and setting it if the agent has not received the
flooding entries for the network.

Change-Id: I607aa8fa399e72b037fd068ad4f02b6210e57e91
Closes-Bug: #1853613
---
 neutron/agent/rpc.py                          |  8 +++--
 .../plugins/ml2/drivers/l2pop/mech_driver.py  |  7 ++--
 .../openvswitch/agent/ovs_neutron_agent.py    | 12 +++++--
 neutron/plugins/ml2/rpc.py                    | 18 ++++++----
 neutron/tests/functional/agent/l2/base.py     |  6 ++--
 .../ml2/drivers/l2pop/test_mech_driver.py     |  2 +-
 .../agent/test_ovs_neutron_agent.py           | 36 ++++++++++++++++++-
 neutron/tests/unit/plugins/ml2/test_rpc.py    |  4 +--
 8 files changed, 71 insertions(+), 22 deletions(-)

diff --git a/neutron/agent/rpc.py b/neutron/agent/rpc.py
index 466ef426cd0..247a0b26d9d 100644
--- a/neutron/agent/rpc.py
+++ b/neutron/agent/rpc.py
@@ -114,6 +114,8 @@ class PluginApi(object):
               get_devices_details_list_and_failed_devices
         1.6 - Support get_network_details
         1.7 - Support get_ports_by_vnic_type_and_host
+        1.8 - Rename agent_restarted to refresh_tunnels in
+              update_device_list to reflect its expanded purpose
     '''
 
     def __init__(self, topic):
@@ -161,8 +163,8 @@ class PluginApi(object):
                           agent_id=agent_id, host=host)
 
     def update_device_list(self, context, devices_up, devices_down,
-                           agent_id, host, agent_restarted=False):
-        cctxt = self.client.prepare(version='1.5')
+                           agent_id, host, refresh_tunnels=False):
+        cctxt = self.client.prepare(version='1.8')
 
         ret_devices_up = []
         failed_devices_up = []
@@ -178,7 +180,7 @@ class PluginApi(object):
                              devices_up=devices_up[i:i + step],
                              devices_down=devices_down[i:i + step],
                              agent_id=agent_id, host=host,
-                             agent_restarted=agent_restarted)
+                             refresh_tunnels=refresh_tunnels)
             ret_devices_up.extend(ret.get("devices_up", []))
             failed_devices_up.extend(ret.get("failed_devices_up", []))
             ret_devices_down.extend(ret.get("devices_down", []))
diff --git a/neutron/plugins/ml2/drivers/l2pop/mech_driver.py b/neutron/plugins/ml2/drivers/l2pop/mech_driver.py
index c835bf8c369..79f8f356f23 100644
--- a/neutron/plugins/ml2/drivers/l2pop/mech_driver.py
+++ b/neutron/plugins/ml2/drivers/l2pop/mech_driver.py
@@ -271,7 +271,7 @@ class L2populationMechanismDriver(api.MechanismDriver):
         self.L2populationAgentNotify.remove_fdb_entries(
             self.rpc_ctx, fdb_entries)
 
-    def update_port_up(self, context, agent_restarted=False):
+    def update_port_up(self, context, refresh_tunnels=False):
         port = context.current
         agent_host = context.host
         port_context = context._plugin_context
@@ -285,7 +285,8 @@ class L2populationMechanismDriver(api.MechanismDriver):
 
         agent_active_ports = l2pop_db.get_agent_network_active_port_count(
             port_context, agent_host, network_id)
-
+        LOG.debug("host: %s, agent_active_ports: %s, refresh_tunnels: %s",
+                  agent_host, agent_active_ports, refresh_tunnels)
         agent_ip = l2pop_db.get_agent_ip(agent)
         segment = context.bottom_bound_segment
         if not self._validate_segment(segment, port['id'], agent):
@@ -297,7 +298,7 @@ class L2populationMechanismDriver(api.MechanismDriver):
         # with high concurrency more than 1 port may be activated on an agent
         # at the same time (like VM port + a DVR port) so checking for 1 or 2
         is_first_port = agent_active_ports in (1, 2)
-        if is_first_port or agent_restarted:
+        if is_first_port or refresh_tunnels:
             # First port(s) activated on current agent in this network,
             # we have to provide it with the whole list of fdb entries
             agent_fdb_entries = self._create_agent_fdb(port_context,
diff --git a/neutron/plugins/ml2/drivers/openvswitch/agent/ovs_neutron_agent.py b/neutron/plugins/ml2/drivers/openvswitch/agent/ovs_neutron_agent.py
index c5881aadc1d..52d6e81894c 100644
--- a/neutron/plugins/ml2/drivers/openvswitch/agent/ovs_neutron_agent.py
+++ b/neutron/plugins/ml2/drivers/openvswitch/agent/ovs_neutron_agent.py
@@ -1096,6 +1096,7 @@ class OVSNeutronAgent(l2population_rpc.L2populationRpcCallBackTunnelMixin,
         devices_up = []
         devices_down = []
         failed_devices = []
+        tunnels_missing = False
         port_names = [p['vif_port'].port_name for p in need_binding_ports]
         port_info = self.int_br.get_ports_attributes(
             "Port", columns=["name", "tag"], ports=port_names, if_exists=True)
@@ -1130,6 +1131,10 @@ class OVSNeutronAgent(l2population_rpc.L2populationRpcCallBackTunnelMixin,
             if port_detail.get('admin_state_up'):
                 LOG.debug("Setting status for %s to UP", device)
                 devices_up.append(device)
+                if (not tunnels_missing and
+                        lvm.network_type in constants.TUNNEL_NETWORK_TYPES and
+                        len(lvm.tun_ofports) == 0):
+                    tunnels_missing = True
             else:
                 LOG.debug("Setting status for %s to DOWN", device)
                 devices_down.append(device)
@@ -1138,11 +1143,12 @@ class OVSNeutronAgent(l2population_rpc.L2populationRpcCallBackTunnelMixin,
         # the initialization work. L2 pop needs this precise knowledge
         # to notify the agent to refresh the tunnel related flows.
         # Otherwise, these flows will be cleaned as stale due to the
-        # different cookie id.
-        agent_restarted = self.iter_num == 0
+        # different cookie id. We also set refresh_tunnels if the agent
+        # has not received a notification and is missing tunnels.
+        refresh_tunnels = (self.iter_num == 0) or tunnels_missing
         devices_set = self.plugin_rpc.update_device_list(
             self.context, devices_up, devices_down, self.agent_id,
-            self.conf.host, agent_restarted=agent_restarted)
+            self.conf.host, refresh_tunnels=refresh_tunnels)
         failed_devices = (devices_set.get('failed_devices_up') +
                           devices_set.get('failed_devices_down'))
         if failed_devices:
diff --git a/neutron/plugins/ml2/rpc.py b/neutron/plugins/ml2/rpc.py
index b020183c5e4..c0356366f8f 100644
--- a/neutron/plugins/ml2/rpc.py
+++ b/neutron/plugins/ml2/rpc.py
@@ -55,7 +55,10 @@ class RpcCallbacks(type_tunnel.TunnelRpcCallbackMixin):
     #       get_devices_details_list_and_failed_devices
     #   1.6 Support get_network_details
     #   1.7 Support get_ports_by_vnic_type_and_host
-    target = oslo_messaging.Target(version='1.7')
+    #   1.8 Rename agent_restarted to refresh_tunnels in
+    #       update_device_list to reflect its expanded purpose
+
+    target = oslo_messaging.Target(version='1.8')
 
     def __init__(self, notifier, type_manager):
         self.setup_tunnel_callback_mixin(notifier, type_manager)
@@ -267,7 +270,10 @@ class RpcCallbacks(type_tunnel.TunnelRpcCallbackMixin):
     @profiler.trace("rpc")
     def update_device_up(self, rpc_context, **kwargs):
         """Device is up on agent."""
-        agent_restarted = kwargs.pop('agent_restarted', False)
+        refresh_tunnels = kwargs.pop('refresh_tunnels', False)
+        if not refresh_tunnels:
+            # For backward compatibility with older agents
+            refresh_tunnels = kwargs.pop('agent_restarted', False)
         agent_id, host, device = self._get_request_details(kwargs)
         LOG.debug("Device %(device)s up at agent %(agent_id)s",
                   {'device': device, 'agent_id': agent_id})
@@ -301,7 +307,7 @@ class RpcCallbacks(type_tunnel.TunnelRpcCallbackMixin):
         self.update_port_status_to_active(port, rpc_context, port_id, host)
         self.notify_l2pop_port_wiring(port_id, rpc_context,
                                       n_const.PORT_STATUS_ACTIVE, host,
-                                      agent_restarted)
+                                      refresh_tunnels)
 
     def update_port_status_to_active(self, port, rpc_context, port_id, host):
         plugin = directory.get_plugin()
@@ -325,7 +331,7 @@ class RpcCallbacks(type_tunnel.TunnelRpcCallbackMixin):
                 provisioning_blocks.L2_AGENT_ENTITY)
 
     def notify_l2pop_port_wiring(self, port_id, rpc_context,
-                                 status, host, agent_restarted=False):
+                                 status, host, refresh_tunnels=False):
         """Notify the L2pop driver that a port has been wired/unwired.
 
         The L2pop driver uses this notification to broadcast forwarding
@@ -349,7 +355,7 @@ class RpcCallbacks(type_tunnel.TunnelRpcCallbackMixin):
         # handle DVR ports while restart neutron-*-agent, we need to handle
         # it here.
         if (port['device_owner'] == n_const.DEVICE_OWNER_DVR_INTERFACE and
-                not agent_restarted):
+                not refresh_tunnels):
             return
         port = port_context.current
         if (port['device_owner'] != n_const.DEVICE_OWNER_DVR_INTERFACE and
@@ -365,7 +371,7 @@ class RpcCallbacks(type_tunnel.TunnelRpcCallbackMixin):
         port_context.current['status'] = status
         port_context.current[portbindings.HOST_ID] = host
         if status == n_const.PORT_STATUS_ACTIVE:
-            l2pop_driver.obj.update_port_up(port_context, agent_restarted)
+            l2pop_driver.obj.update_port_up(port_context, refresh_tunnels)
         else:
             l2pop_driver.obj.update_port_down(port_context)
 
diff --git a/neutron/tests/functional/agent/l2/base.py b/neutron/tests/functional/agent/l2/base.py
index b7d68614e65..e153d143937 100644
--- a/neutron/tests/functional/agent/l2/base.py
+++ b/neutron/tests/functional/agent/l2/base.py
@@ -336,7 +336,7 @@ class OVSAgentTestFramework(base.BaseOVSLinuxTestCase, OVSOFControllerHelper):
         return ports
 
     def _mock_update_device(self, context, devices_up, devices_down, agent_id,
-                            host=None, agent_restarted=False):
+                            host=None, refresh_tunnels=False):
         dev_up = []
         dev_down = []
         for port in self.ports:
@@ -382,7 +382,7 @@ class OVSAgentTestFramework(base.BaseOVSLinuxTestCase, OVSOFControllerHelper):
 
         def mock_failed_devices_up(context, devices_up, devices_down,
                                    agent_id, host=None,
-                                   agent_restarted=False):
+                                   refresh_tunnels=False):
             failed_devices = []
             devices = list(devices_up)
             # first port fails
@@ -404,7 +404,7 @@ class OVSAgentTestFramework(base.BaseOVSLinuxTestCase, OVSOFControllerHelper):
 
         def mock_failed_devices_down(context, devices_up, devices_down,
                                      agent_id, host=None,
-                                     agent_restarted=False):
+                                     refresh_tunnels=False):
             # first port fails
             failed_port_id = self.ports[0]['id']
             failed_devices_down = []
diff --git a/neutron/tests/unit/plugins/ml2/drivers/l2pop/test_mech_driver.py b/neutron/tests/unit/plugins/ml2/drivers/l2pop/test_mech_driver.py
index 05fec1f69a6..290f69f7873 100644
--- a/neutron/tests/unit/plugins/ml2/drivers/l2pop/test_mech_driver.py
+++ b/neutron/tests/unit/plugins/ml2/drivers/l2pop/test_mech_driver.py
@@ -373,7 +373,7 @@ class TestL2PopulationRpcTestCase(test_plugin.Ml2PluginV2TestCase):
             agent_id=HOST_4,
             device=port_id,
             host=HOST_4,
-            agent_restarted=True)
+            refresh_tunnels=True)
         fanout_expected = {port['port']['network_id']: {
             'network_type': u'vxlan',
             'ports': {u'20.0.0.4': [('00:00:00:00:00:00', '0.0.0.0')]},
diff --git a/neutron/tests/unit/plugins/ml2/drivers/openvswitch/agent/test_ovs_neutron_agent.py b/neutron/tests/unit/plugins/ml2/drivers/openvswitch/agent/test_ovs_neutron_agent.py
index 514bb310a40..41dd3b01db7 100644
--- a/neutron/tests/unit/plugins/ml2/drivers/openvswitch/agent/test_ovs_neutron_agent.py
+++ b/neutron/tests/unit/plugins/ml2/drivers/openvswitch/agent/test_ovs_neutron_agent.py
@@ -774,7 +774,41 @@ class TestOvsNeutronAgent(object):
             update_devices.assert_called_once_with(mock.ANY, devices_up,
                                                    devices_down, mock.ANY,
                                                    mock.ANY,
-                                                   agent_restarted=True)
+                                                   refresh_tunnels=True)
+
+    def _test_bind_devices_sets_refresh_tunnels(self, tun_ofports, expected):
+        self.agent.iter_num = 3
+        self.agent.prevent_arp_spoofing = False
+        self.agent.vlan_manager.add('fake_network', 1,
+                                    n_const.TYPE_VXLAN, None, 1)
+        ovs_db_list = [{'name': 'fake_device', 'tag': []}]
+        self.agent.vlan_manager.get('fake_network').tun_ofports = tun_ofports
+        vif_port = mock.Mock()
+        vif_port.port_name = 'fake_device'
+        vif_port.ofport = 1
+        need_binding_ports = [{'network_id': 'fake_network',
+                               'vif_port': vif_port,
+                               'device': 'fake_device',
+                               'admin_state_up': True}]
+        with mock.patch.object(
+                self.agent.plugin_rpc, 'update_device_list',
+                return_value={'devices_up': [],
+                              'devices_down': [],
+                              'failed_devices_up': [],
+                              'failed_devices_down': []}) as update_devices, \
+                mock.patch.object(self.agent,
+                                  'int_br') as int_br:
+            int_br.get_ports_attributes.return_value = ovs_db_list
+            self.agent._bind_devices(need_binding_ports)
+            update_devices.assert_called_once_with(mock.ANY, ['fake_device'],
+                                                   [], mock.ANY, mock.ANY,
+                                                   refresh_tunnels=expected)
+
+    def test_bind_devices_sets_refresh_tunnels_if_tunnels_missing(self):
+        self._test_bind_devices_sets_refresh_tunnels([], True)
+
+    def test_bind_devices_does_not_set_refresh_tunnels_if_tunnels_exist(self):
+        self._test_bind_devices_sets_refresh_tunnels([1, 2, 3], False)
 
     def _test_arp_spoofing(self, enable_prevent_arp_spoofing):
         self.agent.prevent_arp_spoofing = enable_prevent_arp_spoofing
diff --git a/neutron/tests/unit/plugins/ml2/test_rpc.py b/neutron/tests/unit/plugins/ml2/test_rpc.py
index a719bc408a9..775b1fe5253 100644
--- a/neutron/tests/unit/plugins/ml2/test_rpc.py
+++ b/neutron/tests/unit/plugins/ml2/test_rpc.py
@@ -479,8 +479,8 @@ class RpcApiTestCase(base.BaseTestCase):
                 devices_down=['fake_device3', 'fake_device4'],
                 agent_id='fake_agent_id',
                 host='fake_host',
-                agent_restarted=False,
-                version='1.5')
+                refresh_tunnels=False,
+                version='1.8')
 
     def test_get_devices_details_list_and_failed_devices(self):
         rpcapi = agent_rpc.PluginApi(topics.PLUGIN)
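
Illustrative note (not part of the patch itself): a minimal sketch of the decision the agent now makes in _bind_devices when reporting devices up. The standalone helper and its arguments below are hypothetical; only the attribute names (iter_num, network_type, tun_ofports) mirror the agent code changed above.

    # Hypothetical helper, for illustration only; not code from this change.
    def needs_tunnel_refresh(iter_num, local_vlan_maps,
                             tunnel_types=('gre', 'vxlan', 'geneve')):
        """Return True if the server should resend the l2pop flooding entries."""
        if iter_num == 0:
            # First rpc_loop iteration after an agent (re)start: always refresh.
            return True
        # A tunnelled network whose local VLAN mapping still has no tunnel
        # ofports has not yet received its flooding entries, so ask again.
        return any(lvm.network_type in tunnel_types and not lvm.tun_ofports
                   for lvm in local_vlan_maps)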