Delay HA router transition from "backup" to "master"

As described in the bug, when a HA router transitions from "master" to
"backup", the "keepalived" processes will set the virtual IP in all other
HA routers. Each HA router will then advertise it, and "keepalived" will
decide, according to a trivial algorithm (higher interface IP), which
one should be "master". At this point, the "keepalived" processes
running in the other servers will remove the HA router virtual IP
assigned an instant before.

To avoid transitioning some routers from "backup" to "master" and then
back to "backup" in a very short period, this patch delays the "backup"
to "master" transition, waiting for a possible new "backup" state. If,
during the waiting period before setting the HA state to "master" (set
to the HA VRRP advert time, 2 seconds by default), the L3 agent receives
a new "backup" HA state, the L3 agent does nothing.

Conflicts:
    neutron/agent/l3/agent.py

Closes-Bug: #1837635

Change-Id: I70037da9cdd0f8448e0af8dd96b4e3f5de5728ad
(cherry picked from commit 3f022a193f)
(cherry picked from commit adac5d9b7a)
changes/68/719968/1
Rodolfo Alonso Hernandez 3 years ago committed by Slawek Kaplonski
parent 9ab3d21789
commit 7682d2fa77
  1. 15
      neutron/agent/l3/agent.py
  2. 41
      neutron/agent/l3/ha.py
  3. 25
      neutron/agent/l3/ha_router.py
  4. 9
      neutron/tests/functional/agent/l3/framework.py
  5. 3
      neutron/tests/functional/agent/l3/test_ha_router.py
  6. 36
      neutron/tests/unit/agent/l3/test_agent.py

@ -407,6 +407,15 @@ class L3NATAgent(ha.AgentMixin,
return True
def _router_removed(self, router_id):
"""Delete the router and stop the auxiliary processes
This stops the auxiliary processes (keepalived, keepvalived-state-
change, radvd, etc) and deletes the router ports and the namespace.
The "router_info" cache is updated too at the beginning of the process,
to avoid any other concurrent process to handle the router being
deleted. If an exception is raised, the "router_info" cache is
restored.
"""
ri = self.router_info.get(router_id)
if ri is None:
LOG.warning("Info for router %s was not found. "
@ -419,8 +428,12 @@ class L3NATAgent(ha.AgentMixin,
self.context, states=(ri,),
resource_id=router_id))
ri.delete()
del self.router_info[router_id]
try:
ri.delete()
except Exception:
with excutils.save_and_reraise_exception():
self.router_info[router_id] = ri
registry.notify(resources.ROUTER, events.AFTER_DELETE, self, router=ri)

@ -14,6 +14,7 @@
# under the License.
import os
import threading
import eventlet
from oslo_log import log as logging
@ -83,6 +84,8 @@ class AgentMixin(object):
self.state_change_notifier = batch_notifier.BatchNotifier(
self._calculate_batch_duration(), self.notify_server)
eventlet.spawn(self._start_keepalived_notifications_server)
self._transition_states = {}
self._transition_state_mutex = threading.Lock()
def _get_router_info(self, router_id):
try:
@ -112,7 +115,44 @@ class AgentMixin(object):
# default 2 seconds.
return self.conf.ha_vrrp_advert_int
def _update_transition_state(self, router_id, new_state=None):
with self._transition_state_mutex:
transition_state = self._transition_states.get(router_id)
if new_state:
self._transition_states[router_id] = new_state
else:
self._transition_states.pop(router_id, None)
return transition_state
def enqueue_state_change(self, router_id, state):
    """Register a router HA state change and schedule its processing.

    Processing a state change updates the metadata proxy, the radvd
    daemon, the prefix delegation and the L3 extensions, and reports the
    state to the Neutron server. A change to "master" is handled with a
    delay of at least "ha_vrrp_advert_int" seconds: when a "master"
    router falls back to "backup", "keepalived" temporarily promotes the
    remaining HA routers to "master" before electing a single one, and a
    short-lived backup-->master-->backup bounce must not be reported to
    the server.

    :param router_id: router ID
    :param state: ['master', 'backup']
    """
    previous = self._update_transition_state(router_id, state)
    if previous:
        # A handler greenthread for this router is already pending; it
        # will pick up the freshly recorded state on its own.
        return
    eventlet.spawn_n(self._enqueue_state_change, router_id, state)
    eventlet.sleep(0)  # yield so the spawned greenthread can start
def _enqueue_state_change(self, router_id, state):
# NOTE(ralonsoh): move 'master' and 'backup' constants to n-lib
if state == 'master':
eventlet.sleep(self.conf.ha_vrrp_advert_int)
if self._update_transition_state(router_id) != state:
# If the current "transition state" is not the initial "state" sent
# to update the router, that means the actual router state is the
# same as the "transition state" (e.g.: backup-->master-->backup).
return
state_change_data = {"router_id": router_id, "state": state}
LOG.info('Router %(router_id)s transitioned to %(state)s',
state_change_data)
@ -125,6 +165,7 @@ class AgentMixin(object):
# configuration to keepalived-state-change in order to remove the
# dependency that currently exists on l3-agent running for the IPv6
# failover.
ri.ha_state = state
self._configure_ipv6_params(ri, state)
if self.conf.enable_metadata_proxy:
self._update_metadata_proxy(ri, router_id, state)

@ -65,12 +65,21 @@ class HaRouter(router.RouterInfo):
self.ha_port = None
self.keepalived_manager = None
self.state_change_callback = state_change_callback
self._ha_state = None
self._ha_state_path = None
def create_router_namespace_object(
        self, router_id, agent_conf, iface_driver, use_ipv6):
    """Build the HA-specific namespace wrapper for this router."""
    namespace = HaRouterNamespace(
        router_id, agent_conf, iface_driver, use_ipv6)
    return namespace
@property
def ha_state_path(self):
    """Path of the file that persists this router's HA state.

    Resolved lazily from the keepalived manager and cached afterwards;
    returns None while no keepalived manager is configured yet.
    """
    if not self._ha_state_path and self.keepalived_manager:
        manager = self.keepalived_manager
        self._ha_state_path = manager.get_full_config_file_path('state')
    return self._ha_state_path
@property
def ha_priority(self):
    # VRRP priority for this router; falls back to
    # keepalived.HA_DEFAULT_PRIORITY when the server did not set one.
    return self.router.get('priority', keepalived.HA_DEFAULT_PRIORITY)
@ -81,22 +90,20 @@ class HaRouter(router.RouterInfo):
@property
def ha_state(self):
    """Return the router HA state, reading it from disk on first access.

    The state written by the keepalived-state-change monitor is read
    from ``ha_state_path`` once and cached in ``self._ha_state``;
    'unknown' is returned while no state has been recorded or the state
    file cannot be read.

    NOTE(review): this span interleaved pre- and post-patch diff lines
    without +/- markers and was not valid Python as written; this is the
    reconstructed post-patch version (cached state takes precedence,
    file read populates the cache).
    """
    if self._ha_state:
        return self._ha_state
    try:
        with open(self.ha_state_path, 'r') as f:
            self._ha_state = f.read()
    except (OSError, IOError):
        LOG.debug('Error while reading HA state for %s', self.router_id)
    return self._ha_state or 'unknown'
@ha_state.setter
def ha_state(self, new_state):
ha_state_path = self.keepalived_manager.get_full_config_file_path(
'state')
self._ha_state = new_state
try:
with open(ha_state_path, 'w') as f:
with open(self.ha_state_path, 'w') as f:
f.write(new_state)
except (OSError, IOError):
LOG.error('Error while writing HA state for %s',

@ -128,6 +128,12 @@ class L3AgentTestFramework(base.BaseSudoTestCase):
v6_ext_gw_with_sub),
qos_policy_id=qos_policy_id)
def change_router_state(self, router_id, state):
    """Set the HA state of a router tracked by the L3 agent under test.

    Fails the test if the router is not present in the agent's
    router_info cache.
    """
    router = self.agent.router_info.get(router_id)
    if not router:
        self.fail('Router %s is not present in the L3 agent' % router_id)
    router.ha_state = state
def _test_conntrack_disassociate_fip(self, ha):
'''Test that conntrack immediately drops stateful connection
that uses floating IP once it's disassociated.
@ -483,7 +489,8 @@ class L3AgentTestFramework(base.BaseSudoTestCase):
# so there's no need to check that explicitly.
self.assertFalse(self._namespace_exists(router.ns_name))
common_utils.wait_until_true(
lambda: not self._metadata_proxy_exists(self.agent.conf, router))
lambda: not self._metadata_proxy_exists(self.agent.conf, router),
timeout=10)
def _assert_snat_chains(self, router):
self.assertFalse(router.iptables_manager.is_chain_empty(

@ -36,7 +36,8 @@ class L3HATestCase(framework.L3AgentTestFramework):
def test_keepalived_state_change_notification(self):
enqueue_mock = mock.patch.object(
self.agent, 'enqueue_state_change').start()
self.agent, 'enqueue_state_change',
side_effect=self.change_router_state).start()
router_info = self.generate_router_info(enable_ha=True)
router = self.manage_router(self.agent, router_info)
common_utils.wait_until_true(lambda: router.ha_state == 'master')

@ -230,23 +230,59 @@ class TestBasicRouterOperations(BasicRouterOperationsFramework):
# Make sure the exceptional code path has coverage
agent.enqueue_state_change(non_existent_router, 'master')
def _enqueue_state_change_transitions(self, transitions, num_called):
    """Drive enqueue_state_change through a sequence of HA transitions.

    Feeds each state in ``transitions`` to the agent, waits long enough
    for any delayed "master" transition to fire, and checks that the
    state-change handler ran ``num_called`` times (or never, when the
    transitions cancelled each other out).
    """
    self.conf.set_override('ha_vrrp_advert_int', 1)
    agent = l3_agent.L3NATAgent(HOSTNAME, self.conf)
    agent._update_transition_state('router_id')
    with mock.patch.object(agent, '_get_router_info',
                           return_value=None) as mock_get_router_info:
        for state in transitions:
            agent.enqueue_state_change('router_id', state)
            eventlet.sleep(0.2)
        # NOTE(ralonsoh): the wait process should be done inside the mock
        # context, to allow the spawned thread to call the mocked function
        # before the context ends.
        eventlet.sleep(self.conf.ha_vrrp_advert_int + 2)
    expected_calls = [mock.call('router_id')] * num_called
    if expected_calls:
        mock_get_router_info.assert_has_calls(expected_calls)
    else:
        mock_get_router_info.assert_not_called()
def test_enqueue_state_change_from_none_to_master(self):
    # A lone "master" event is delayed but eventually processed once.
    self._enqueue_state_change_transitions(['master'], 1)
def test_enqueue_state_change_from_none_to_backup(self):
    # A "backup" event is processed without delay, exactly once.
    self._enqueue_state_change_transitions(['backup'], 1)
def test_enqueue_state_change_from_none_to_master_to_backup(self):
    # A "backup" arriving during the delayed "master" transition cancels
    # it, so the state-change handler is never invoked.
    self._enqueue_state_change_transitions(['master', 'backup'], 0)
def test_enqueue_state_change_from_none_to_backup_to_master(self):
    # "backup" is handled immediately and "master" after the VRRP advert
    # delay: two handler invocations in total.
    self._enqueue_state_change_transitions(['backup', 'master'], 2)
def test_enqueue_state_change_metadata_disable(self):
    """The metadata proxy must not be touched when it is disabled."""
    self.conf.set_override('enable_metadata_proxy', False)
    self.conf.set_override('ha_vrrp_advert_int', 1)
    agent = l3_agent.L3NATAgent(HOSTNAME, self.conf)
    router = mock.Mock()
    router_info = mock.MagicMock()
    agent.router_info[router.id] = router_info
    agent._update_metadata_proxy = mock.Mock()
    agent.enqueue_state_change(router.id, 'master')
    # Wait for the delayed "master" transition to be processed.
    eventlet.sleep(self.conf.ha_vrrp_advert_int + 2)
    # assert_not_called() gives a clearer failure message than checking
    # the truthiness of the raw call_count.
    agent._update_metadata_proxy.assert_not_called()
def test_enqueue_state_change_l3_extension(self):
    """The L3 extension manager is notified once of the new HA state."""
    self.conf.set_override('ha_vrrp_advert_int', 1)
    agent = l3_agent.L3NATAgent(HOSTNAME, self.conf)
    router = mock.Mock()
    agent.router_info[router.id] = mock.MagicMock()
    agent.l3_ext_manager.ha_state_change = mock.Mock()
    agent.enqueue_state_change(router.id, 'master')
    # Wait for the delayed "master" transition to be processed.
    eventlet.sleep(self.conf.ha_vrrp_advert_int + 2)
    expected_data = {'router_id': router.id, 'state': 'master'}
    agent.l3_ext_manager.ha_state_change.assert_called_once_with(
        agent.context, expected_data)

Loading…
Cancel
Save