DVR-HA: Unbinding a HA router from agent does not clear HA interface

Removing an active or a standby HA router from an agent that has a
valid DVR serviceable port (such as DHCP), does not remove the
HA interface associated with the Router in the SNAT namespace.

When we try to add the HA router back to the agent, then it
adds more than one HA interface to the SNAT Namespace causing
more problems and we sometimes also see multiple active routers.

This bug might have been introduced by this patch [1].

Fix the problem by just adding the router namespaces without HA
interfaces when there is no HA and re-insert the HA interfaces
when HA router is bound to the agent into the namespace.

[1] https://review.openstack.org/#/c/522362/
Closes-Bug: #1816698

Change-Id: Ie625abcb73f8185bb2bee06dcd26a01d8af0b0d1
This commit is contained in:
Swaminathan Vasudevan 2019-02-21 17:14:03 -08:00
parent c3d34901dd
commit d9e0bab6ac
3 changed files with 137 additions and 16 deletions

View File

@ -397,11 +397,15 @@ class L3NATAgent(ha.AgentMixin,
kwargs['host'] = self.host
if router.get('distributed') and router.get('ha'):
# if the router does not contain information about the HA interface
# this means that this DVR+HA router needs to host only the edge
# side of it, typically because it's landing on a node that needs
# to provision a router namespace because of a DVR service port
# (e.g. DHCP).
# Case 1: If the router contains information about the HA interface
# and if the requesting agent is a DVR_SNAT agent then go ahead
# and create a HA router.
# Case 2: If the router does not contain information about the HA
# interface this means that this DVR+HA router needs to host only
# the edge side of it, typically because it's landing on a node
# that needs to provision a router namespace because of a DVR
# service port (e.g. DHCP). So go ahead and create a regular DVR
# edge router.
if (self.conf.agent_mode == lib_const.L3_AGENT_MODE_DVR_SNAT and
router.get(lib_const.HA_INTERFACE_KEY) is not None):
kwargs['state_change_callback'] = self.enqueue_state_change
@ -577,20 +581,39 @@ class L3NATAgent(ha.AgentMixin,
def _process_updated_router(self, router):
ri = self.router_info[router['id']]
is_dvr_snat_agent = (self.conf.agent_mode ==
lib_const.L3_AGENT_MODE_DVR_SNAT)
is_dvr_only_agent = (self.conf.agent_mode in
[lib_const.L3_AGENT_MODE_DVR,
lib_const.L3_AGENT_MODE_DVR_NO_EXTERNAL])
is_ha_router = getattr(ri, 'ha_state', False)
# For HA routers check that DB state matches actual state
if router.get('ha') and not is_dvr_only_agent and is_ha_router:
self.check_ha_state_for_router(
router['id'], router.get(l3_constants.HA_ROUTER_STATE_KEY))
ri.router = router
registry.notify(resources.ROUTER, events.BEFORE_UPDATE,
self, router=ri)
ri.process()
registry.notify(resources.ROUTER, events.AFTER_UPDATE, self, router=ri)
self.l3_ext_manager.update_router(self.context, router)
old_router_ha_interface = ri.router.get(lib_const.HA_INTERFACE_KEY)
current_router_ha_interface = router.get(lib_const.HA_INTERFACE_KEY)
ha_interface_change = ((old_router_ha_interface is None and
current_router_ha_interface is not None) or
(old_router_ha_interface is not None and
current_router_ha_interface is None))
is_dvr_ha_router = router.get('distributed') and router.get('ha')
if is_dvr_snat_agent and is_dvr_ha_router and ha_interface_change:
LOG.debug("Removing HA router %s, since it is not bound to "
"the current agent, and recreating regular DVR router "
"based on service port requirements.",
router['id'])
if self._safe_router_removed(router['id']):
self._process_added_router(router)
else:
is_ha_router = getattr(ri, 'ha_state', False)
# For HA routers check that DB state matches actual state
if router.get('ha') and not is_dvr_only_agent and is_ha_router:
self.check_ha_state_for_router(
router['id'], router.get(l3_constants.HA_ROUTER_STATE_KEY))
ri.router = router
registry.notify(resources.ROUTER, events.BEFORE_UPDATE,
self, router=ri)
ri.process()
registry.notify(
resources.ROUTER, events.AFTER_UPDATE, self, router=ri)
self.l3_ext_manager.update_router(self.context, router)
def _resync_router(self, router_update,
priority=PRIORITY_SYNC_ROUTERS_TASK):

View File

@ -1469,6 +1469,43 @@ class TestDvrRouter(framework.L3AgentTestFramework):
self.manage_router(restarted_agent, router1.router)
self.assertTrue(fip_cidr_centralized_mock.called)
def test_dvr_ha_router_unbound_from_agents(self):
self._setup_dvr_ha_agents()
self._setup_dvr_ha_bridges()
router1 = self._create_dvr_ha_router(
self.agent, enable_gw=True)
router2 = self._create_dvr_ha_router(
self.failover_agent, enable_gw=True)
utils.wait_until_true(lambda: router1.ha_state == 'master')
utils.wait_until_true(lambda: router2.ha_state == 'backup')
self._assert_ip_addresses_in_dvr_ha_snat_namespace(router1)
self._assert_no_ip_addresses_in_dvr_ha_snat_namespace(router2)
router1_ha_device = router1.get_ha_device_name()
router2_ha_device = router2.get_ha_device_name()
self.assertTrue(
ip_lib.device_exists(router1_ha_device, router1.ha_namespace))
self.assertTrue(
ip_lib.device_exists(router2_ha_device, router2.ha_namespace))
router1.router['_ha_interface'] = None
self.agent._process_updated_router(router1.router)
router_updated = self.agent.router_info[router1.router_id]
self.assertTrue(self._namespace_exists(router_updated.ns_name))
self._assert_snat_namespace_exists(router_updated)
snat_namespace_name = dvr_snat_ns.SnatNamespace.get_snat_ns_name(
router_updated.router_id)
self.assertFalse(
ip_lib.device_exists(router1_ha_device, snat_namespace_name))
utils.wait_until_true(lambda: router2.ha_state == 'master')
self._assert_ip_addresses_in_dvr_ha_snat_namespace(router2)
self.assertTrue(
ip_lib.device_exists(router2_ha_device, router2.ha_namespace))
def _test_dvr_ha_router_failover_with_gw_and_fip(self, enable_gw,
enable_centralized_fip,
snat_bound_fip):

View File

@ -2653,6 +2653,67 @@ class TestBasicRouterOperations(BasicRouterOperationsFramework):
router)
safe_router_removed.assert_called_once_with(router['id'])
def test_process_dvr_routers_ha_on_update_when_router_unbound(self):
agent = l3_agent.L3NATAgent(HOSTNAME, self.conf)
agent.conf.agent_mode = 'dvr_snat'
router = mock.Mock()
router.id = '1234'
router.distributed = True
router.ha = True
router_info = mock.MagicMock()
agent.router_info[router.id] = router_info
updated_router = {'id': '1234',
'distributed': True,
'ha': True,
'external_gateway_info': {},
'routes': [],
'admin_state_up': True}
self.plugin_api.get_routers.return_value = [updated_router]
update = resource_processing_queue.ResourceUpdate(
updated_router['id'], l3_agent.PRIORITY_RPC,
resource=updated_router)
with mock.patch.object(agent,
"_safe_router_removed"
) as router_remove,\
mock.patch.object(agent,
"_process_added_router"
) as add_router:
agent._process_routers_if_compatible([updated_router], update)
router_remove.assert_called_once_with(updated_router['id'])
add_router.assert_called_once_with(updated_router)
def test_process_dvr_routers_ha_on_update_without_ha_interface(self):
agent = l3_agent.L3NATAgent(HOSTNAME, self.conf)
agent.conf.agent_mode = 'dvr_snat'
router = mock.Mock()
router.id = '1234'
router.distributed = True
router._ha_interface = True
router.ha = True
router_info = mock.MagicMock()
agent.router_info[router.id] = router_info
updated_router = {'id': '1234',
'distributed': True, 'ha': True,
'external_gateway_info': {}, 'routes': [],
'admin_state_up': True}
self.plugin_api.get_routers.return_value = [updated_router]
update = resource_processing_queue.ResourceUpdate(
updated_router['id'], l3_agent.PRIORITY_RPC,
resource=updated_router)
with mock.patch.object(agent,
"_safe_router_removed"
) as router_remove,\
mock.patch.object(agent,
"_process_added_router"
) as add_router:
agent._process_routers_if_compatible([updated_router], update)
router_remove.assert_called_once_with(updated_router['id'])
add_router.assert_called_once_with(updated_router)
def test_process_routers_if_compatible_error(self):
agent = l3_agent.L3NATAgent(HOSTNAME, self.conf)
router = {'id': _uuid()}