diff --git a/neutron/agent/rpc.py b/neutron/agent/rpc.py
index f084053765c..f4f30ecc4c1 100644
--- a/neutron/agent/rpc.py
+++ b/neutron/agent/rpc.py
@@ -139,11 +139,12 @@ class PluginApi(object):
                           agent_id=agent_id, host=host)
 
     def update_device_list(self, context, devices_up, devices_down,
-                           agent_id, host):
+                           agent_id, host, agent_restarted=False):
         cctxt = self.client.prepare(version='1.5')
         return cctxt.call(context, 'update_device_list',
                           devices_up=devices_up, devices_down=devices_down,
-                          agent_id=agent_id, host=host)
+                          agent_id=agent_id, host=host,
+                          agent_restarted=agent_restarted)
 
     def tunnel_sync(self, context, tunnel_ip, tunnel_type=None, host=None):
         cctxt = self.client.prepare(version='1.4')
diff --git a/neutron/plugins/ml2/drivers/l2pop/mech_driver.py b/neutron/plugins/ml2/drivers/l2pop/mech_driver.py
index b60b78495f2..ab4393af2d3 100644
--- a/neutron/plugins/ml2/drivers/l2pop/mech_driver.py
+++ b/neutron/plugins/ml2/drivers/l2pop/mech_driver.py
@@ -245,6 +245,19 @@ class L2populationMechanismDriver(api.MechanismDriver):
 
         return agents
 
+    def agent_restarted(self, context):
+        agent_host = context.host
+        session = db_api.get_reader_session()
+        agent = l2pop_db.get_agent_by_host(session, agent_host)
+        if l2pop_db.get_agent_uptime(agent) < cfg.CONF.l2pop.agent_boot_time:
+            LOG.warning(_LW("Agent on host '%s' did not supply "
+                            "'agent_restarted' information in RPC message, "
+                            "determined it restarted based on deprecated "
+                            "'agent_boot_time' config option."),
+                        agent_host)
+            return True
+        return False
+
     def update_port_down(self, context):
         port = context.current
         agent_host = context.host
@@ -265,7 +278,7 @@ class L2populationMechanismDriver(api.MechanismDriver):
             self.L2populationAgentNotify.remove_fdb_entries(
                 self.rpc_ctx, fdb_entries)
 
-    def update_port_up(self, context):
+    def update_port_up(self, context, agent_restarted=None):
         port = context.current
         agent_host = context.host
         session = db_api.get_reader_session()
@@ -291,8 +304,10 @@ class L2populationMechanismDriver(api.MechanismDriver):
         # with high concurrency more than 1 port may be activated on an agent
         # at the same time (like VM port + a DVR port) so checking for 1 or 2
         is_first_port = agent_active_ports in (1, 2)
-        if is_first_port or (l2pop_db.get_agent_uptime(agent) <
-                             cfg.CONF.l2pop.agent_boot_time):
+        if agent_restarted is None:
+            # Only for backport compatibility, will be removed.
+            agent_restarted = self.agent_restarted(context)
+        if is_first_port or agent_restarted:
             # First port(s) activated on current agent in this network,
             # we have to provide it with the whole list of fdb entries
             agent_fdb_entries = self._create_agent_fdb(session,
diff --git a/neutron/plugins/ml2/drivers/openvswitch/agent/ovs_neutron_agent.py b/neutron/plugins/ml2/drivers/openvswitch/agent/ovs_neutron_agent.py
index 3fe449e15e5..705d7e1da67 100644
--- a/neutron/plugins/ml2/drivers/openvswitch/agent/ovs_neutron_agent.py
+++ b/neutron/plugins/ml2/drivers/openvswitch/agent/ovs_neutron_agent.py
@@ -879,9 +879,15 @@ class OVSNeutronAgent(sg_rpc.SecurityGroupAgentRpcCallbackMixin,
                 LOG.debug("Setting status for %s to DOWN", device)
                 devices_down.append(device)
         if devices_up or devices_down:
+            # When iter_num == 0, the ovs-agent is doing its
+            # initialization work. L2 pop needs this precise knowledge
+            # to notify the agent to refresh the tunnel related flows.
+            # Otherwise, these flows will be cleaned up as stale due to
+            # the different cookie id.
+            agent_restarted = self.iter_num == 0
             devices_set = self.plugin_rpc.update_device_list(
                 self.context, devices_up, devices_down, self.agent_id,
-                self.conf.host)
+                self.conf.host, agent_restarted=agent_restarted)
             failed_devices = (devices_set.get('failed_devices_up') +
                               devices_set.get('failed_devices_down'))
             if failed_devices:
diff --git a/neutron/plugins/ml2/rpc.py b/neutron/plugins/ml2/rpc.py
index a15c7426158..981e2dbd5d5 100644
--- a/neutron/plugins/ml2/rpc.py
+++ b/neutron/plugins/ml2/rpc.py
@@ -241,6 +241,7 @@ class RpcCallbacks(type_tunnel.TunnelRpcCallbackMixin):
         agent_id = kwargs.get('agent_id')
         device = kwargs.get('device')
         host = kwargs.get('host')
+        agent_restarted = kwargs.pop('agent_restarted', None)
         LOG.debug("Device %(device)s up at agent %(agent_id)s",
                   {'device': device, 'agent_id': agent_id})
         plugin = directory.get_plugin()
@@ -267,7 +268,8 @@ class RpcCallbacks(type_tunnel.TunnelRpcCallbackMixin):
         else:
             self.update_port_status_to_active(port, rpc_context, port_id, host)
         self.notify_l2pop_port_wiring(port_id, rpc_context,
-                                      n_const.PORT_STATUS_ACTIVE, host)
+                                      n_const.PORT_STATUS_ACTIVE, host,
+                                      agent_restarted)
 
     def update_port_status_to_active(self, port, rpc_context, port_id, host):
         plugin = directory.get_plugin()
@@ -291,7 +293,7 @@ class RpcCallbacks(type_tunnel.TunnelRpcCallbackMixin):
                                             provisioning_blocks.L2_AGENT_ENTITY)
 
     def notify_l2pop_port_wiring(self, port_id, rpc_context,
-                                 status, host):
+                                 status, host, agent_restarted=None):
        """Notify the L2pop driver that a port has been wired/unwired.
 
        The L2pop driver uses this notification to broadcast forwarding
@@ -302,25 +304,38 @@ class RpcCallbacks(type_tunnel.TunnelRpcCallbackMixin):
                 'l2population')
         if not l2pop_driver:
             return
+        port = ml2_db.get_port(rpc_context, port_id)
+        if not port:
+            return
         port_context = plugin.get_bound_port_context(
-            rpc_context, port_id)
+            rpc_context, port_id, host)
         if not port_context:
             # port deleted
             return
+        # NOTE: DVR ports are already handled and updated through l2pop,
+        # so we don't need to update them again here. But l2pop did not
+        # handle DVR ports while the neutron-*-agent restarts, so we need
+        # to handle that case here.
+        if agent_restarted is None:
+            agent_restarted = l2pop_driver.obj.agent_restarted(port_context)
+        if (port['device_owner'] == n_const.DEVICE_OWNER_DVR_INTERFACE and
+                not agent_restarted):
+            return
         port = port_context.current
-        if (status == n_const.PORT_STATUS_ACTIVE and
+        if (port['device_owner'] != n_const.DEVICE_OWNER_DVR_INTERFACE and
+                status == n_const.PORT_STATUS_ACTIVE and
                 port[portbindings.HOST_ID] != host and
                 not l3_hamode_db.is_ha_router_port(rpc_context,
                                                    port['device_owner'],
                                                    port['device_id'])):
             # don't setup ACTIVE forwarding entries unless bound to this
-            # host or if it's an HA port (which is special-cased in the
-            # mech driver)
+            # host or if it's an HA or DVR port (which is special-cased in
+            # the mech driver)
             return
         port_context.current['status'] = status
         port_context.current[portbindings.HOST_ID] = host
         if status == n_const.PORT_STATUS_ACTIVE:
-            l2pop_driver.obj.update_port_up(port_context)
+            l2pop_driver.obj.update_port_up(port_context, agent_restarted)
         else:
             l2pop_driver.obj.update_port_down(port_context)
diff --git a/neutron/tests/functional/agent/l2/base.py b/neutron/tests/functional/agent/l2/base.py
index 0980d7301cf..76aac41da3f 100644
--- a/neutron/tests/functional/agent/l2/base.py
+++ b/neutron/tests/functional/agent/l2/base.py
@@ -267,7 +267,7 @@ class OVSAgentTestFramework(base.BaseOVSLinuxTestCase):
         return ports
 
     def _mock_update_device(self, context, devices_up, devices_down, agent_id,
-                            host=None):
+                            host=None, agent_restarted=False):
         dev_up = []
         dev_down = []
         for port in self.ports:
@@ -311,7 +311,8 @@ class OVSAgentTestFramework(base.BaseOVSLinuxTestCase):
     def _prepare_failed_dev_up_trigger(self, agent):
 
         def mock_failed_devices_up(context, devices_up, devices_down,
-                                   agent_id, host=None):
+                                   agent_id, host=None,
+                                   agent_restarted=False):
             failed_devices = []
             devices = list(devices_up)
             # first port fails
@@ -332,7 +333,8 @@ class OVSAgentTestFramework(base.BaseOVSLinuxTestCase):
     def _prepare_failed_dev_down_trigger(self, agent):
 
         def mock_failed_devices_down(context, devices_up, devices_down,
-                                     agent_id, host=None):
+                                     agent_id, host=None,
+                                     agent_restarted=False):
             # first port fails
             failed_port_id = self.ports[0]['id']
             failed_devices_down = []
diff --git a/neutron/tests/unit/plugins/ml2/drivers/l2pop/test_mech_driver.py b/neutron/tests/unit/plugins/ml2/drivers/l2pop/test_mech_driver.py
index 4dd72ed8b16..e6186e03013 100644
--- a/neutron/tests/unit/plugins/ml2/drivers/l2pop/test_mech_driver.py
+++ b/neutron/tests/unit/plugins/ml2/drivers/l2pop/test_mech_driver.py
@@ -338,6 +338,49 @@ class TestL2PopulationRpcTestCase(test_plugin.Ml2PluginV2TestCase):
         self.mock_fanout.assert_called_with(
             mock.ANY, 'remove_fdb_entries', expected)
 
+    def _test_ovs_agent_restarted_with_dvr_port(
+            self, agent_boot_timeout=True, agent_restarted=False):
+        plugin = directory.get_plugin()
+        self._setup_l3()
+        router = self._create_router(distributed=True)
+        with mock.patch.object(l2pop_mech_driver.L2populationMechanismDriver,
+                               'agent_restarted',
+                               return_value=agent_boot_timeout):
+            with self.subnet(network=self._network,
+                             enable_dhcp=False) as snet:
+                with self.port(
+                        subnet=snet,
+                        device_owner=constants.DEVICE_OWNER_DVR_INTERFACE)\
+                        as port:
+                    port_id = port['port']['id']
+                    plugin.update_distributed_port_binding(self.adminContext,
+                        port_id, {'port': {portbindings.HOST_ID: HOST_4,
+                                           'device_id': router['id']}})
+                    port = self._show('ports', port_id)
+                    self.assertEqual(portbindings.VIF_TYPE_DISTRIBUTED,
+                                     port['port'][portbindings.VIF_TYPE])
+                    self.callbacks.update_device_up(
+                        self.adminContext,
+                        agent_id=HOST_4,
+                        device=port_id,
+                        host=HOST_4,
+                        agent_restarted=agent_restarted)
+                    fanout_expected = {port['port']['network_id']: {
+                        'network_type': u'vxlan',
+                        'ports': {
+                            u'20.0.0.4': [('00:00:00:00:00:00', '0.0.0.0')]},
+                        'segment_id': 1}}
+                    self.mock_fanout.assert_called_with(mock.ANY,
+                                                        'add_fdb_entries',
+                                                        fanout_expected)
+
+    def test_ovs_agent_restarted_with_dvr_port_boot_config_timeout(self):
+        self._test_ovs_agent_restarted_with_dvr_port()
+
+    def test_ovs_agent_restarted_with_dvr_port_rpc_send_timeout(self):
+        self._test_ovs_agent_restarted_with_dvr_port(
+            agent_boot_timeout=False, agent_restarted=True)
+
     def test_ha_agents_get_other_fdb(self):
         # First network port is added on HOST4, then HA router port is
         # added on HOST and HOST2.
diff --git a/neutron/tests/unit/plugins/ml2/drivers/openvswitch/agent/test_ovs_neutron_agent.py b/neutron/tests/unit/plugins/ml2/drivers/openvswitch/agent/test_ovs_neutron_agent.py
index 210571ffb59..4514614e17a 100644
--- a/neutron/tests/unit/plugins/ml2/drivers/openvswitch/agent/test_ovs_neutron_agent.py
+++ b/neutron/tests/unit/plugins/ml2/drivers/openvswitch/agent/test_ovs_neutron_agent.py
@@ -720,7 +720,8 @@ class TestOvsNeutronAgent(object):
             self.agent._bind_devices(port_details)
             update_devices.assert_called_once_with(mock.ANY, devices_up,
                                                    devices_down,
-                                                   mock.ANY, mock.ANY)
+                                                   mock.ANY, mock.ANY,
+                                                   agent_restarted=True)
 
     def _test_arp_spoofing(self, enable_prevent_arp_spoofing):
         self.agent.prevent_arp_spoofing = enable_prevent_arp_spoofing
diff --git a/neutron/tests/unit/plugins/ml2/test_rpc.py b/neutron/tests/unit/plugins/ml2/test_rpc.py
index 19fe9cb530a..c471fd3358b 100644
--- a/neutron/tests/unit/plugins/ml2/test_rpc.py
+++ b/neutron/tests/unit/plugins/ml2/test_rpc.py
@@ -445,6 +445,7 @@ class RpcApiTestCase(base.BaseTestCase):
                 devices_down=['fake_device3', 'fake_device4'],
                 agent_id='fake_agent_id',
                 host='fake_host',
+                agent_restarted=False,
                 version='1.5')
 
     def test_get_devices_details_list_and_failed_devices(self):
diff --git a/releasenotes/notes/precise-agent-state-transfer-67c771cb1ee04dd0.yaml b/releasenotes/notes/precise-agent-state-transfer-67c771cb1ee04dd0.yaml
new file mode 100644
index 00000000000..043aab2f2fe
--- /dev/null
+++ b/releasenotes/notes/precise-agent-state-transfer-67c771cb1ee04dd0.yaml
@@ -0,0 +1,27 @@
+---
+critical:
+  - |
+    The neutron-openvswitch-agent can sometimes spend too much time handling
+    a large number of ports, exceeding its timeout value, ``agent_boot_time``,
+    for L2 population. Because of this, some flow update operations will not
+    be triggered, resulting in lost flows during agent restart, especially
+    for host-to-host vxlan tunnel flows, causing the original tunnel flows to
+    be treated as stale due to the different cookie IDs. The agent's first
+    RPC loop will also do a stale flow clean-up procedure and delete them,
+    leading to a loss of connectivity.
+    Please ensure that all neutron-server and neutron-openvswitch-agent
+    binaries are upgraded for the changes to take effect, after which
+    the L2 population ``agent_boot_time`` config option will no longer
+    be used.
+fixes:
+  - |
+    The neutron-openvswitch-agent was changed to notify the neutron-server
+    in its first RPC loop that it has restarted. This signals neutron-server
+    to provide updated L2 population information to correctly program FDB
+    entries, ensuring connectivity to instances is not interrupted.
+    This fixes the following bugs:
+    `1794991 <https://bugs.launchpad.net/neutron/+bug/1794991>`_,
+    `1799178 <https://bugs.launchpad.net/neutron/+bug/1799178>`_,
+    `1813703 <https://bugs.launchpad.net/neutron/+bug/1813703>`_,
+    `1813714 <https://bugs.launchpad.net/neutron/+bug/1813714>`_,
+    `1813715 <https://bugs.launchpad.net/neutron/+bug/1813715>`_.
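
As a rough illustration of the behaviour this patch introduces, the standalone sketch below shows the server-side decision that the new agent_restarted() fallback encodes: trust the flag when a new agent sends it over RPC, otherwise fall back to the deprecated uptime heuristic. This is a simplified sketch, not the real Neutron code; all names, the 180-second default, and the helper function are stand-ins for the driver, DB helpers, and the l2pop.agent_boot_time option.

# backcompat_sketch.py - illustrative only, not part of the patch
AGENT_BOOT_TIME = 180  # stand-in for the deprecated l2pop.agent_boot_time value


def resolve_agent_restarted(agent_restarted, agent_uptime):
    """Return the effective 'agent restarted' state used for l2pop resync.

    agent_restarted: True/False when a new agent sends the flag over RPC
        (it sends True only on its first RPC loop, iter_num == 0), or None
        when an old agent predating this patch omitted the flag.
    agent_uptime: seconds since the agent's reported start time.
    """
    if agent_restarted is not None:
        # New agents state this explicitly, so the server trusts it directly.
        return agent_restarted
    # Backward-compatible fallback: guess from uptime, as the old
    # agent_boot_time heuristic did.
    return agent_uptime < AGENT_BOOT_TIME


if __name__ == '__main__':
    # New agent, first RPC loop: the flag wins regardless of uptime.
    print(resolve_agent_restarted(True, agent_uptime=600))   # True
    # New agent, later loops: no full FDB resync is forced.
    print(resolve_agent_restarted(False, agent_uptime=10))   # False
    # Old agent without the flag: fall back to the uptime heuristic.
    print(resolve_agent_restarted(None, agent_uptime=30))    # True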