diff --git a/neutron/agent/rpc.py b/neutron/agent/rpc.py
index 47fc4a6f9be..bf76aca0114 100644
--- a/neutron/agent/rpc.py
+++ b/neutron/agent/rpc.py
@@ -149,11 +149,12 @@ class PluginApi(object):
                           agent_id=agent_id, host=host)
 
     def update_device_list(self, context, devices_up, devices_down,
-                           agent_id, host):
+                           agent_id, host, agent_restarted=False):
         cctxt = self.client.prepare(version='1.5')
         return cctxt.call(context, 'update_device_list',
                           devices_up=devices_up, devices_down=devices_down,
-                          agent_id=agent_id, host=host)
+                          agent_id=agent_id, host=host,
+                          agent_restarted=agent_restarted)
 
     def tunnel_sync(self, context, tunnel_ip, tunnel_type=None, host=None):
         cctxt = self.client.prepare(version='1.4')
diff --git a/neutron/plugins/ml2/drivers/l2pop/mech_driver.py b/neutron/plugins/ml2/drivers/l2pop/mech_driver.py
index 6ed258b77de..eccd6f4148d 100644
--- a/neutron/plugins/ml2/drivers/l2pop/mech_driver.py
+++ b/neutron/plugins/ml2/drivers/l2pop/mech_driver.py
@@ -275,7 +275,7 @@ class L2populationMechanismDriver(api.MechanismDriver):
         self.L2populationAgentNotify.remove_fdb_entries(
             self.rpc_ctx, fdb_entries)
 
-    def update_port_up(self, context):
+    def update_port_up(self, context, agent_restarted=None):
         port = context.current
         agent_host = context.host
         port_context = context._plugin_context
@@ -301,7 +301,10 @@ class L2populationMechanismDriver(api.MechanismDriver):
         # with high concurrency more than 1 port may be activated on an agent
         # at the same time (like VM port + a DVR port) so checking for 1 or 2
         is_first_port = agent_active_ports in (1, 2)
-        if is_first_port or self.agent_restarted(context):
+        if agent_restarted is None:
+            # Kept only for backport compatibility; will be removed.
+            agent_restarted = self.agent_restarted(context)
+        if is_first_port or agent_restarted:
             # First port(s) activated on current agent in this network,
             # we have to provide it with the whole list of fdb entries
             agent_fdb_entries = self._create_agent_fdb(port_context,
diff --git a/neutron/plugins/ml2/drivers/openvswitch/agent/ovs_neutron_agent.py b/neutron/plugins/ml2/drivers/openvswitch/agent/ovs_neutron_agent.py
index 14806ac545f..14bd461ad8a 100644
--- a/neutron/plugins/ml2/drivers/openvswitch/agent/ovs_neutron_agent.py
+++ b/neutron/plugins/ml2/drivers/openvswitch/agent/ovs_neutron_agent.py
@@ -908,9 +908,15 @@ class OVSNeutronAgent(l2population_rpc.L2populationRpcCallBackTunnelMixin,
                 LOG.debug("Setting status for %s to DOWN", device)
                 devices_down.append(device)
         if devices_up or devices_down:
+            # iter_num == 0 means the ovs-agent is doing its
+            # initialization work. L2 pop needs this precise knowledge
+            # to tell the agent to refresh the tunnel-related flows.
+            # Otherwise, those flows would be cleaned up as stale
+            # because of their different cookie ids.
+            agent_restarted = self.iter_num == 0
             devices_set = self.plugin_rpc.update_device_list(
                 self.context, devices_up, devices_down, self.agent_id,
-                self.conf.host)
+                self.conf.host, agent_restarted=agent_restarted)
             failed_devices = (devices_set.get('failed_devices_up') +
                               devices_set.get('failed_devices_down'))
             if failed_devices:
diff --git a/neutron/plugins/ml2/rpc.py b/neutron/plugins/ml2/rpc.py
index 8254366cc48..9c39e49b9aa 100644
--- a/neutron/plugins/ml2/rpc.py
+++ b/neutron/plugins/ml2/rpc.py
@@ -251,6 +251,7 @@ class RpcCallbacks(type_tunnel.TunnelRpcCallbackMixin):
         agent_id = kwargs.get('agent_id')
         device = kwargs.get('device')
         host = kwargs.get('host')
+        agent_restarted = kwargs.pop('agent_restarted', None)
         LOG.debug("Device %(device)s up at agent %(agent_id)s",
                   {'device': device, 'agent_id': agent_id})
         plugin = directory.get_plugin()
@@ -277,7 +278,8 @@ class RpcCallbacks(type_tunnel.TunnelRpcCallbackMixin):
         else:
             self.update_port_status_to_active(port, rpc_context, port_id, host)
         self.notify_l2pop_port_wiring(port_id, rpc_context,
-                                      n_const.PORT_STATUS_ACTIVE, host)
+                                      n_const.PORT_STATUS_ACTIVE, host,
+                                      agent_restarted)
 
     def update_port_status_to_active(self, port, rpc_context, port_id, host):
         plugin = directory.get_plugin()
@@ -301,7 +303,7 @@ class RpcCallbacks(type_tunnel.TunnelRpcCallbackMixin):
                 provisioning_blocks.L2_AGENT_ENTITY)
 
     def notify_l2pop_port_wiring(self, port_id, rpc_context,
-                                 status, host):
+                                 status, host, agent_restarted=None):
         """Notify the L2pop driver that a port has been wired/unwired.
 
         The L2pop driver uses this notification to broadcast forwarding
@@ -324,8 +326,10 @@ class RpcCallbacks(type_tunnel.TunnelRpcCallbackMixin):
         # and so we don't need to update it again here. But, l2pop did not
         # handle DVR ports while restart neutron-*-agent, we need to handle
         # it here.
+        if agent_restarted is None:
+            agent_restarted = l2pop_driver.obj.agent_restarted(port_context)
         if (port['device_owner'] == n_const.DEVICE_OWNER_DVR_INTERFACE and
-                not l2pop_driver.obj.agent_restarted(port_context)):
+                not agent_restarted):
             return
         port = port_context.current
         if (port['device_owner'] != n_const.DEVICE_OWNER_DVR_INTERFACE and
@@ -341,7 +345,7 @@ class RpcCallbacks(type_tunnel.TunnelRpcCallbackMixin):
         port_context.current['status'] = status
         port_context.current[portbindings.HOST_ID] = host
         if status == n_const.PORT_STATUS_ACTIVE:
-            l2pop_driver.obj.update_port_up(port_context)
+            l2pop_driver.obj.update_port_up(port_context, agent_restarted)
         else:
             l2pop_driver.obj.update_port_down(port_context)
diff --git a/neutron/tests/functional/agent/l2/base.py b/neutron/tests/functional/agent/l2/base.py
index 5686a5c5a3d..8102d79330b 100644
--- a/neutron/tests/functional/agent/l2/base.py
+++ b/neutron/tests/functional/agent/l2/base.py
@@ -269,7 +269,7 @@ class OVSAgentTestFramework(base.BaseOVSLinuxTestCase):
         return ports
 
     def _mock_update_device(self, context, devices_up, devices_down, agent_id,
-                            host=None):
+                            host=None, agent_restarted=False):
         dev_up = []
         dev_down = []
         for port in self.ports:
@@ -313,7 +313,8 @@ class OVSAgentTestFramework(base.BaseOVSLinuxTestCase):
 
     def _prepare_failed_dev_up_trigger(self, agent):
         def mock_failed_devices_up(context, devices_up, devices_down,
-                                   agent_id, host=None):
+                                   agent_id, host=None,
+                                   agent_restarted=False):
             failed_devices = []
             devices = list(devices_up)
             # first port fails
@@ -334,7 +335,8 @@ class OVSAgentTestFramework(base.BaseOVSLinuxTestCase):
 
     def _prepare_failed_dev_down_trigger(self, agent):
         def mock_failed_devices_down(context, devices_up, devices_down,
-                                     agent_id, host=None):
+                                     agent_id, host=None,
+                                     agent_restarted=False):
             # first port fails
             failed_port_id = self.ports[0]['id']
             failed_devices_down = []
diff --git a/neutron/tests/unit/plugins/ml2/drivers/l2pop/test_mech_driver.py b/neutron/tests/unit/plugins/ml2/drivers/l2pop/test_mech_driver.py
index 4db12517f9c..1e655f4e535 100644
--- a/neutron/tests/unit/plugins/ml2/drivers/l2pop/test_mech_driver.py
+++ b/neutron/tests/unit/plugins/ml2/drivers/l2pop/test_mech_driver.py
@@ -355,11 +355,13 @@ class TestL2PopulationRpcTestCase(test_plugin.Ml2PluginV2TestCase):
         self.mock_fanout.assert_called_with(
             mock.ANY, 'remove_fdb_entries', expected)
 
-    def test_ovs_agent_restarted_with_dvr_port(self):
+    def _test_ovs_agent_restarted_with_dvr_port(
+            self, agent_boot_timeout=True, agent_restarted=False):
         plugin = directory.get_plugin()
         router = self._create_dvr_router()
         with mock.patch.object(l2pop_mech_driver.L2populationMechanismDriver,
-                               'agent_restarted', return_value=True):
+                               'agent_restarted',
+                               return_value=agent_boot_timeout):
             with self.subnet(network=self._network,
                              enable_dhcp=False) as snet:
                 with self.port(
@@ -373,10 +375,12 @@ class TestL2PopulationRpcTestCase(test_plugin.Ml2PluginV2TestCase):
                     port = self._show('ports', port_id)
                     self.assertEqual(portbindings.VIF_TYPE_DISTRIBUTED,
                                      port['port'][portbindings.VIF_TYPE])
-                    self.callbacks.update_device_up(self.adminContext,
-                                                    agent_id=HOST_4,
-                                                    device=port_id,
-                                                    host=HOST_4)
+                    self.callbacks.update_device_up(
+                        self.adminContext,
+                        agent_id=HOST_4,
+                        device=port_id,
+                        host=HOST_4,
+                        agent_restarted=agent_restarted)
                     fanout_expected = {port['port']['network_id']: {
                         'network_type': u'vxlan',
                         'ports': {
@@ -386,6 +390,13 @@ class TestL2PopulationRpcTestCase(test_plugin.Ml2PluginV2TestCase):
                         'add_fdb_entries',
                         fanout_expected)
 
+    def test_ovs_agent_restarted_with_dvr_port_boot_config_timeout(self):
+        self._test_ovs_agent_restarted_with_dvr_port()
+
+    def test_ovs_agent_restarted_with_dvr_port_rpc_send_timeout(self):
+        self._test_ovs_agent_restarted_with_dvr_port(
+            agent_boot_timeout=False, agent_restarted=True)
+
     def test_ha_agents_with_dvr_rtr_does_not_get_other_fdb(self):
         router = self._create_dvr_router()
         directory.add_plugin(plugin_constants.L3, self.plugin)
diff --git a/neutron/tests/unit/plugins/ml2/drivers/openvswitch/agent/test_ovs_neutron_agent.py b/neutron/tests/unit/plugins/ml2/drivers/openvswitch/agent/test_ovs_neutron_agent.py
index 62f567538ff..9406710d292 100644
--- a/neutron/tests/unit/plugins/ml2/drivers/openvswitch/agent/test_ovs_neutron_agent.py
+++ b/neutron/tests/unit/plugins/ml2/drivers/openvswitch/agent/test_ovs_neutron_agent.py
@@ -742,7 +742,8 @@ class TestOvsNeutronAgent(object):
             self.agent._bind_devices(port_details)
             update_devices.assert_called_once_with(mock.ANY, devices_up,
                                                    devices_down,
-                                                   mock.ANY, mock.ANY)
+                                                   mock.ANY, mock.ANY,
+                                                   agent_restarted=True)
 
     def _test_arp_spoofing(self, enable_prevent_arp_spoofing):
         self.agent.prevent_arp_spoofing = enable_prevent_arp_spoofing
diff --git a/neutron/tests/unit/plugins/ml2/test_rpc.py b/neutron/tests/unit/plugins/ml2/test_rpc.py
index e4e6ccda18d..850b29db305 100644
--- a/neutron/tests/unit/plugins/ml2/test_rpc.py
+++ b/neutron/tests/unit/plugins/ml2/test_rpc.py
@@ -455,6 +455,7 @@ class RpcApiTestCase(base.BaseTestCase):
                 devices_down=['fake_device3', 'fake_device4'],
                 agent_id='fake_agent_id',
                 host='fake_host',
+                agent_restarted=False,
                 version='1.5')
 
     def test_get_devices_details_list_and_failed_devices(self):
diff --git a/releasenotes/notes/precise-agent-state-transfer-67c771cb1ee04dd0.yaml b/releasenotes/notes/precise-agent-state-transfer-67c771cb1ee04dd0.yaml
new file mode 100644
index 00000000000..043aab2f2fe
--- /dev/null
+++ b/releasenotes/notes/precise-agent-state-transfer-67c771cb1ee04dd0.yaml
@@ -0,0 +1,27 @@
+---
+critical:
+  - |
+    The neutron-openvswitch-agent can sometimes spend too much time handling
+    a large number of ports, exceeding its timeout value, ``agent_boot_time``,
+    for L2 population. Because of this, some flow update operations will not
+    be triggered, resulting in lost flows during agent restart, especially
+    for host-to-host vxlan tunnel flows, causing the original tunnel flows to
+    be treated as stale due to the different cookie IDs. The agent's first
+    RPC loop will also do a stale flow clean-up procedure and delete them,
+    leading to a loss of connectivity.
+    Please ensure that all neutron-server and neutron-openvswitch-agent
+    binaries are upgraded for the changes to take effect, after which
+    the L2 population ``agent_boot_time`` config option will no longer
+    be used.
+fixes:
+  - |
+    The neutron-openvswitch-agent was changed to notify the neutron-server
+    in its first RPC loop that it has restarted. This signals neutron-server
+    to provide updated L2 population information to correctly program FDB
+    entries, ensuring connectivity to instances is not interrupted.
+    This fixes the following bugs:
+    `1794991 <https://bugs.launchpad.net/neutron/+bug/1794991>`_,
+    `1799178 <https://bugs.launchpad.net/neutron/+bug/1799178>`_,
+    `1813703 <https://bugs.launchpad.net/neutron/+bug/1813703>`_,
+    `1813714 <https://bugs.launchpad.net/neutron/+bug/1813714>`_,
+    `1813715 <https://bugs.launchpad.net/neutron/+bug/1813715>`_.
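
Reviewer note, not part of the patch: below is a minimal, runnable sketch of how the new agent_restarted flag travels end to end, from the agent's first RPC loop to the l2pop driver. The Fake* classes, the heuristic stub, and the print call are illustrative stand-ins for PluginApi (agent side), RpcCallbacks plus L2populationMechanismDriver (server side), and OVSNeutronAgent._bind_devices(); only the flag-threading logic mirrors the diff above.

# Illustrative sketch only -- NOT neutron code.

class FakePluginApi(object):
    """Stand-in for neutron.agent.rpc.PluginApi."""

    def __init__(self, server):
        self.server = server

    def update_device_list(self, context, devices_up, devices_down,
                           agent_id, host, agent_restarted=False):
        # The new kwarg simply rides along with the existing RPC payload.
        return self.server.update_device_list(
            context, devices_up=devices_up, devices_down=devices_down,
            agent_id=agent_id, host=host, agent_restarted=agent_restarted)


class FakeServer(object):
    """Stand-in for ml2 RpcCallbacks + the l2pop mech driver."""

    def update_device_list(self, context, **kwargs):
        # Pop the flag so it defaults to None when an old agent that
        # does not send it talks to an upgraded server.
        agent_restarted = kwargs.pop('agent_restarted', None)
        if agent_restarted is None:
            # Backward-compatible path: the old agent_boot_time heuristic.
            agent_restarted = self.agent_restarted_heuristic()
        if agent_restarted:
            print("pushing the full FDB to the restarted agent")
        return {'failed_devices_up': [], 'failed_devices_down': []}

    def agent_restarted_heuristic(self):
        return False


class FakeAgent(object):
    """Stand-in for OVSNeutronAgent._bind_devices()."""

    def __init__(self, plugin_rpc):
        self.plugin_rpc = plugin_rpc
        self.iter_num = 0  # first RPC loop after a (re)start

    def bind_devices(self, devices_up):
        # iter_num == 0 is the precise "I just restarted" signal.
        agent_restarted = self.iter_num == 0
        return self.plugin_rpc.update_device_list(
            None, devices_up, [], 'fake-agent-id', 'fake-host',
            agent_restarted=agent_restarted)


agent = FakeAgent(FakePluginApi(FakeServer()))
agent.bind_devices(['port-1'])  # first loop -> full FDB push
agent.iter_num = 5
agent.bind_devices(['port-2'])  # later loops -> normal incremental path

The key design point the sketch tries to show: the agent reports a fact it knows precisely (iter_num == 0) instead of the server guessing restarts from the agent_boot_time clock, which is the heuristic that broke down when handling large numbers of ports.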