From 62fe7852bbd70a24174853997096c52ee015e269 Mon Sep 17 00:00:00 2001 From: LIU Yulong Date: Mon, 4 Mar 2019 21:17:20 +0800 Subject: [PATCH] More accurate agent restart state transfer Ovs-agent can be very time-consuming in handling a large number of ports. At this point, the ovs-agent status report may have exceeded the set timeout value. Some flow updating operations will not be triggered. This results in flow loss during agent restart, especially for hosts to hosts of vxlan tunnel flow. This fix will let the ovs-agent explicitly, in the first rpc loop, indicate that the status is restarted. Then l2pop will be required to update fdb entries. Conflicts: neutron/plugins/ml2/rpc.py Conflicts: neutron/plugins/ml2/drivers/l2pop/mech_driver.py Closes-Bug: #1813703 Closes-Bug: #1813714 Closes-Bug: #1813715 Closes-Bug: #1794991 Closes-Bug: #1799178 Change-Id: I8edc2deb509216add1fb21e1893f1c17dda80961 (cherry picked from commit a5244d6d44d2b66de27dc77efa7830fa657260be) (cherry picked from commit cc49ab550179bdc76d79f48be67886681cb43d4e) (cherry picked from commit 5ffca4966877454c605442e9e429aa83ea7d7348) --- neutron/agent/rpc.py | 5 ++-- .../plugins/ml2/drivers/l2pop/mech_driver.py | 9 ++++--- .../openvswitch/agent/ovs_neutron_agent.py | 8 +++++- neutron/plugins/ml2/rpc.py | 12 ++++++--- neutron/tests/functional/agent/l2/base.py | 8 +++--- .../ml2/drivers/l2pop/test_mech_driver.py | 23 +++++++++++----- .../agent/test_ovs_neutron_agent.py | 3 ++- neutron/tests/unit/plugins/ml2/test_rpc.py | 1 + ...agent-state-transfer-67c771cb1ee04dd0.yaml | 27 +++++++++++++++++++ 9 files changed, 76 insertions(+), 20 deletions(-) create mode 100644 releasenotes/notes/precise-agent-state-transfer-67c771cb1ee04dd0.yaml diff --git a/neutron/agent/rpc.py b/neutron/agent/rpc.py index 144094217d7..58be7daa1c7 100644 --- a/neutron/agent/rpc.py +++ b/neutron/agent/rpc.py @@ -144,11 +144,12 @@ class PluginApi(object): agent_id=agent_id, host=host) def update_device_list(self, context, 
devices_up, devices_down, - agent_id, host): + agent_id, host, agent_restarted=False): cctxt = self.client.prepare(version='1.5') return cctxt.call(context, 'update_device_list', devices_up=devices_up, devices_down=devices_down, - agent_id=agent_id, host=host) + agent_id=agent_id, host=host, + agent_restarted=agent_restarted) def tunnel_sync(self, context, tunnel_ip, tunnel_type=None, host=None): cctxt = self.client.prepare(version='1.4') diff --git a/neutron/plugins/ml2/drivers/l2pop/mech_driver.py b/neutron/plugins/ml2/drivers/l2pop/mech_driver.py index f2c20fa1054..f2757bec3bc 100644 --- a/neutron/plugins/ml2/drivers/l2pop/mech_driver.py +++ b/neutron/plugins/ml2/drivers/l2pop/mech_driver.py @@ -277,7 +277,7 @@ class L2populationMechanismDriver(api.MechanismDriver): self.L2populationAgentNotify.remove_fdb_entries( self.rpc_ctx, fdb_entries) - def update_port_up(self, context): + def update_port_up(self, context, agent_restarted=None): port = context.current agent_host = context.host session = db_api.get_reader_session() @@ -304,8 +304,11 @@ class L2populationMechanismDriver(api.MechanismDriver): # with high concurrency more than 1 port may be activated on an agent # at the same time (like VM port + a DVR port) so checking for 1 or 2 is_first_port = agent_active_ports in (1, 2) - if is_first_port or self.agent_restarted(context): - # First port activated on current agent in this network, + if agent_restarted is None: + # Only for backport compatibility, will be removed. 
+ agent_restarted = self.agent_restarted(context) + if is_first_port or agent_restarted: + # First port(s) activated on current agent in this network, # we have to provide it with the whole list of fdb entries agent_fdb_entries = self._create_agent_fdb(session, agent, diff --git a/neutron/plugins/ml2/drivers/openvswitch/agent/ovs_neutron_agent.py b/neutron/plugins/ml2/drivers/openvswitch/agent/ovs_neutron_agent.py index 812d696b0c4..5871d1b7e6c 100644 --- a/neutron/plugins/ml2/drivers/openvswitch/agent/ovs_neutron_agent.py +++ b/neutron/plugins/ml2/drivers/openvswitch/agent/ovs_neutron_agent.py @@ -892,9 +892,15 @@ class OVSNeutronAgent(l2population_rpc.L2populationRpcCallBackTunnelMixin, LOG.debug("Setting status for %s to DOWN", device) devices_down.append(device) if devices_up or devices_down: + # When the iter_num == 0, that indicate the ovs-agent is doing + # the initialization work. L2 pop needs this precise knowledge + # to notify the agent to refresh the tunnel related flows. + # Otherwise, these flows will be cleaned as stale due to the + # different cookie id. 
+ agent_restarted = self.iter_num == 0 devices_set = self.plugin_rpc.update_device_list( self.context, devices_up, devices_down, self.agent_id, - self.conf.host) + self.conf.host, agent_restarted=agent_restarted) failed_devices = (devices_set.get('failed_devices_up') + devices_set.get('failed_devices_down')) if failed_devices: diff --git a/neutron/plugins/ml2/rpc.py b/neutron/plugins/ml2/rpc.py index 441b1856bf4..69d7d7dd494 100644 --- a/neutron/plugins/ml2/rpc.py +++ b/neutron/plugins/ml2/rpc.py @@ -240,6 +240,7 @@ class RpcCallbacks(type_tunnel.TunnelRpcCallbackMixin): agent_id = kwargs.get('agent_id') device = kwargs.get('device') host = kwargs.get('host') + agent_restarted = kwargs.pop('agent_restarted', None) LOG.debug("Device %(device)s up at agent %(agent_id)s", {'device': device, 'agent_id': agent_id}) plugin = directory.get_plugin() @@ -266,7 +267,8 @@ class RpcCallbacks(type_tunnel.TunnelRpcCallbackMixin): else: self.update_port_status_to_active(port, rpc_context, port_id, host) self.notify_l2pop_port_wiring(port_id, rpc_context, - n_const.PORT_STATUS_ACTIVE, host) + n_const.PORT_STATUS_ACTIVE, host, + agent_restarted) def update_port_status_to_active(self, port, rpc_context, port_id, host): plugin = directory.get_plugin() @@ -290,7 +292,7 @@ class RpcCallbacks(type_tunnel.TunnelRpcCallbackMixin): provisioning_blocks.L2_AGENT_ENTITY) def notify_l2pop_port_wiring(self, port_id, rpc_context, - status, host): + status, host, agent_restarted=None): """Notify the L2pop driver that a port has been wired/unwired. The L2pop driver uses this notification to broadcast forwarding @@ -313,8 +315,10 @@ class RpcCallbacks(type_tunnel.TunnelRpcCallbackMixin): # and so we don't need to update it again here. But, l2pop did not # handle DVR ports while restart neutron-*-agent, we need to handle # it here. 
+ if agent_restarted is None: + agent_restarted = l2pop_driver.obj.agent_restarted(port_context) if (port['device_owner'] == n_const.DEVICE_OWNER_DVR_INTERFACE and - not l2pop_driver.obj.agent_restarted(port_context)): + not agent_restarted): return port = port_context.current if (port['device_owner'] != n_const.DEVICE_OWNER_DVR_INTERFACE and @@ -330,7 +334,7 @@ class RpcCallbacks(type_tunnel.TunnelRpcCallbackMixin): port_context.current['status'] = status port_context.current[portbindings.HOST_ID] = host if status == n_const.PORT_STATUS_ACTIVE: - l2pop_driver.obj.update_port_up(port_context) + l2pop_driver.obj.update_port_up(port_context, agent_restarted) else: l2pop_driver.obj.update_port_down(port_context) diff --git a/neutron/tests/functional/agent/l2/base.py b/neutron/tests/functional/agent/l2/base.py index aeb7e659caf..34818182a8c 100644 --- a/neutron/tests/functional/agent/l2/base.py +++ b/neutron/tests/functional/agent/l2/base.py @@ -268,7 +268,7 @@ class OVSAgentTestFramework(base.BaseOVSLinuxTestCase): return ports def _mock_update_device(self, context, devices_up, devices_down, agent_id, - host=None): + host=None, agent_restarted=False): dev_up = [] dev_down = [] for port in self.ports: @@ -312,7 +312,8 @@ class OVSAgentTestFramework(base.BaseOVSLinuxTestCase): def _prepare_failed_dev_up_trigger(self, agent): def mock_failed_devices_up(context, devices_up, devices_down, - agent_id, host=None): + agent_id, host=None, + agent_restarted=False): failed_devices = [] devices = list(devices_up) # first port fails @@ -333,7 +334,8 @@ class OVSAgentTestFramework(base.BaseOVSLinuxTestCase): def _prepare_failed_dev_down_trigger(self, agent): def mock_failed_devices_down(context, devices_up, devices_down, - agent_id, host=None): + agent_id, host=None, + agent_restarted=False): # first port fails failed_port_id = self.ports[0]['id'] failed_devices_down = [] diff --git a/neutron/tests/unit/plugins/ml2/drivers/l2pop/test_mech_driver.py 
b/neutron/tests/unit/plugins/ml2/drivers/l2pop/test_mech_driver.py index d69dd0b95e9..a8b71b8feef 100644 --- a/neutron/tests/unit/plugins/ml2/drivers/l2pop/test_mech_driver.py +++ b/neutron/tests/unit/plugins/ml2/drivers/l2pop/test_mech_driver.py @@ -355,11 +355,13 @@ class TestL2PopulationRpcTestCase(test_plugin.Ml2PluginV2TestCase): self.mock_fanout.assert_called_with( mock.ANY, 'remove_fdb_entries', expected) - def test_ovs_agent_restarted_with_dvr_port(self): + def _test_ovs_agent_restarted_with_dvr_port( + self, agent_boot_timeout=True, agent_restarted=False): plugin = directory.get_plugin() router = self._create_dvr_router() with mock.patch.object(l2pop_mech_driver.L2populationMechanismDriver, - 'agent_restarted', return_value=True): + 'agent_restarted', + return_value=agent_boot_timeout): with self.subnet(network=self._network, enable_dhcp=False) as snet: with self.port( @@ -373,10 +375,12 @@ class TestL2PopulationRpcTestCase(test_plugin.Ml2PluginV2TestCase): port = self._show('ports', port_id) self.assertEqual(portbindings.VIF_TYPE_DISTRIBUTED, port['port'][portbindings.VIF_TYPE]) - self.callbacks.update_device_up(self.adminContext, - agent_id=HOST_4, - device=port_id, - host=HOST_4) + self.callbacks.update_device_up( + self.adminContext, + agent_id=HOST_4, + device=port_id, + host=HOST_4, + agent_restarted=agent_restarted) fanout_expected = {port['port']['network_id']: { 'network_type': u'vxlan', 'ports': { @@ -386,6 +390,13 @@ class TestL2PopulationRpcTestCase(test_plugin.Ml2PluginV2TestCase): 'add_fdb_entries', fanout_expected) + def test_ovs_agent_restarted_with_dvr_port_boot_config_timeout(self): + self._test_ovs_agent_restarted_with_dvr_port() + + def test_ovs_agent_restarted_with_dvr_port_rpc_send_timeout(self): + self._test_ovs_agent_restarted_with_dvr_port( + agent_boot_timeout=False, agent_restarted=True) + def test_ha_agents_with_dvr_rtr_does_not_get_other_fdb(self): router = self._create_dvr_router() directory.add_plugin(plugin_constants.L3, 
self.plugin) diff --git a/neutron/tests/unit/plugins/ml2/drivers/openvswitch/agent/test_ovs_neutron_agent.py b/neutron/tests/unit/plugins/ml2/drivers/openvswitch/agent/test_ovs_neutron_agent.py index 54ca04e35e1..d3adb28ea3e 100644 --- a/neutron/tests/unit/plugins/ml2/drivers/openvswitch/agent/test_ovs_neutron_agent.py +++ b/neutron/tests/unit/plugins/ml2/drivers/openvswitch/agent/test_ovs_neutron_agent.py @@ -741,7 +741,8 @@ class TestOvsNeutronAgent(object): self.agent._bind_devices(port_details) update_devices.assert_called_once_with(mock.ANY, devices_up, devices_down, - mock.ANY, mock.ANY) + mock.ANY, mock.ANY, + agent_restarted=True) def _test_arp_spoofing(self, enable_prevent_arp_spoofing): self.agent.prevent_arp_spoofing = enable_prevent_arp_spoofing diff --git a/neutron/tests/unit/plugins/ml2/test_rpc.py b/neutron/tests/unit/plugins/ml2/test_rpc.py index 4c9ca8cd902..c762bcd3c74 100644 --- a/neutron/tests/unit/plugins/ml2/test_rpc.py +++ b/neutron/tests/unit/plugins/ml2/test_rpc.py @@ -445,6 +445,7 @@ class RpcApiTestCase(base.BaseTestCase): devices_down=['fake_device3', 'fake_device4'], agent_id='fake_agent_id', host='fake_host', + agent_restarted=False, version='1.5') def test_get_devices_details_list_and_failed_devices(self): diff --git a/releasenotes/notes/precise-agent-state-transfer-67c771cb1ee04dd0.yaml b/releasenotes/notes/precise-agent-state-transfer-67c771cb1ee04dd0.yaml new file mode 100644 index 00000000000..043aab2f2fe --- /dev/null +++ b/releasenotes/notes/precise-agent-state-transfer-67c771cb1ee04dd0.yaml @@ -0,0 +1,27 @@ +--- +critical: + - | + The neutron-openvswitch-agent can sometimes spend too much time handling + a large number of ports, exceeding its timeout value, ``agent_boot_time``, + for L2 population. 
Because of this, some flow update operations will not + be triggered, resulting in lost flows during agent restart, especially + for host-to-host vxlan tunnel flows, causing the original tunnel flows to + be treated as stale due to the different cookie IDs. The agent's first + RPC loop will also do a stale flow clean-up procedure and delete them, + leading to a loss of connectivity. + Please ensure that all neutron-server and neutron-openvswitch-agent + binaries are upgraded for the changes to take effect, after which + the L2 population ``agent_boot_time`` config option will no longer + be used. +fixes: + - | + The neutron-openvswitch-agent was changed to notify the neutron-server + in its first RPC loop that it has restarted. This signals neutron-server + to provide updated L2 population information to correctly program FDB + entries, ensuring connectivity to instances is not interrupted. + This fixes the following bugs: + `1794991 <https://bugs.launchpad.net/neutron/+bug/1794991>`_, + `1799178 <https://bugs.launchpad.net/neutron/+bug/1799178>`_, + `1813703 <https://bugs.launchpad.net/neutron/+bug/1813703>`_, + `1813714 <https://bugs.launchpad.net/neutron/+bug/1813714>`_, + `1813715 <https://bugs.launchpad.net/neutron/+bug/1813715>`_.