From adac5d9b7a72b4edeba5357c6a47e7e528fcf775 Mon Sep 17 00:00:00 2001 From: Rodolfo Alonso Hernandez Date: Wed, 24 Jul 2019 16:32:02 +0000 Subject: [PATCH] Delay HA router transition from "backup" to "master" As described in the bug, when an HA router transitions from "master" to "backup", "keepalived" processes will set the virtual IP in all other HA routers. Each HA router will then advert it and "keepalived" will decide, according to a trivial algorithm (higher interface IP), which one should be "master". At this point, the other "keepalived" processes running in the other servers will remove the HA router virtual IP assigned an instant before. To avoid transitioning some routers from "backup" to "master" and then back to "backup" in a very short period, this patch delays the "backup" to "master" transition, waiting for a possible new "backup" state. If, during the waiting period to set the HA state to "master" (set to the HA VRRP advert time, 2 seconds by default), the L3 agent receives a new "backup" HA state, the L3 agent does nothing. 
Closes-Bug: #1837635 Change-Id: I70037da9cdd0f8448e0af8dd96b4e3f5de5728ad (cherry picked from commit 3f022a193f66fde3bfd945af1119a60dfe91cb91) --- neutron/agent/l3/agent.py | 15 ++++++- neutron/agent/l3/ha.py | 41 +++++++++++++++++++ neutron/agent/l3/ha_router.py | 25 +++++++---- .../tests/functional/agent/l3/framework.py | 9 +++- .../functional/agent/l3/test_ha_router.py | 3 +- neutron/tests/unit/agent/l3/test_agent.py | 36 ++++++++++++++++ 6 files changed, 117 insertions(+), 12 deletions(-) diff --git a/neutron/agent/l3/agent.py b/neutron/agent/l3/agent.py index ba057a0adea..3c127cf5f1c 100644 --- a/neutron/agent/l3/agent.py +++ b/neutron/agent/l3/agent.py @@ -466,6 +466,15 @@ class L3NATAgent(ha.AgentMixin, return True def _router_removed(self, ri, router_id): + """Delete the router and stop the auxiliary processes + + This stops the auxiliary processes (keepalived, keepvalived-state- + change, radvd, etc) and deletes the router ports and the namespace. + The "router_info" cache is updated too at the beginning of the process, + to avoid any other concurrent process to handle the router being + deleted. If an exception is raised, the "router_info" cache is + restored. + """ if ri is None: LOG.warning("Info for router %s was not found. " "Performing router cleanup", router_id) @@ -477,8 +486,12 @@ class L3NATAgent(ha.AgentMixin, self.context, states=(ri,), resource_id=router_id)) - ri.delete() del self.router_info[router_id] + try: + ri.delete() + except Exception: + with excutils.save_and_reraise_exception(): + self.router_info[router_id] = ri registry.notify(resources.ROUTER, events.AFTER_DELETE, self, router=ri) diff --git a/neutron/agent/l3/ha.py b/neutron/agent/l3/ha.py index 899be9b7f00..0efe666aef2 100644 --- a/neutron/agent/l3/ha.py +++ b/neutron/agent/l3/ha.py @@ -14,6 +14,7 @@ # under the License. 
import os +import threading import eventlet from oslo_log import log as logging @@ -83,6 +84,8 @@ class AgentMixin(object): self.state_change_notifier = batch_notifier.BatchNotifier( self._calculate_batch_duration(), self.notify_server) eventlet.spawn(self._start_keepalived_notifications_server) + self._transition_states = {} + self._transition_state_mutex = threading.Lock() def _get_router_info(self, router_id): try: @@ -112,7 +115,44 @@ class AgentMixin(object): # default 2 seconds. return self.conf.ha_vrrp_advert_int + def _update_transition_state(self, router_id, new_state=None): + with self._transition_state_mutex: + transition_state = self._transition_states.get(router_id) + if new_state: + self._transition_states[router_id] = new_state + else: + self._transition_states.pop(router_id, None) + return transition_state + def enqueue_state_change(self, router_id, state): + """Inform the server about the new router state + + This function will also update the metadata proxy, the radvd daemon, + process the prefix delegation and inform to the L3 extensions. If the + HA router changes to "master", this transition will be delayed for at + least "ha_vrrp_advert_int" seconds. When the "master" router + transitions to "backup", "keepalived" will set the rest of HA routers + to "master" until it decides which one should be the only "master". + The transition from "backup" to "master" and then to "backup" again, + should not be registered in the Neutron server. 
+ + :param router_id: router ID + :param state: ['master', 'backup'] + """ + if not self._update_transition_state(router_id, state): + eventlet.spawn_n(self._enqueue_state_change, router_id, state) + eventlet.sleep(0) + + def _enqueue_state_change(self, router_id, state): + # NOTE(ralonsoh): move 'master' and 'backup' constants to n-lib + if state == 'master': + eventlet.sleep(self.conf.ha_vrrp_advert_int) + if self._update_transition_state(router_id) != state: + # If the current "transition state" is not the initial "state" sent + # to update the router, that means the actual router state is the + # same as the "transition state" (e.g.: backup-->master-->backup). + return + state_change_data = {"router_id": router_id, "state": state} LOG.info('Router %(router_id)s transitioned to %(state)s', state_change_data) @@ -125,6 +165,7 @@ class AgentMixin(object): # configuration to keepalived-state-change in order to remove the # dependency that currently exists on l3-agent running for the IPv6 # failover. + ri.ha_state = state self._configure_ipv6_params(ri, state) if self.conf.enable_metadata_proxy: self._update_metadata_proxy(ri, router_id, state) diff --git a/neutron/agent/l3/ha_router.py b/neutron/agent/l3/ha_router.py index 876cd9343d0..47b0849fe27 100644 --- a/neutron/agent/l3/ha_router.py +++ b/neutron/agent/l3/ha_router.py @@ -69,12 +69,21 @@ class HaRouter(router.RouterInfo): self.ha_port = None self.keepalived_manager = None self.state_change_callback = state_change_callback + self._ha_state = None + self._ha_state_path = None def create_router_namespace_object( self, router_id, agent_conf, iface_driver, use_ipv6): return HaRouterNamespace( router_id, agent_conf, iface_driver, use_ipv6) + @property + def ha_state_path(self): + if not self._ha_state_path and self.keepalived_manager: + self._ha_state_path = (self.keepalived_manager. 
+ get_full_config_file_path('state')) + return self._ha_state_path + @property def ha_priority(self): return self.router.get('priority', keepalived.HA_DEFAULT_PRIORITY) @@ -85,22 +94,20 @@ class HaRouter(router.RouterInfo): @property def ha_state(self): - state = None - ha_state_path = self.keepalived_manager.get_full_config_file_path( - 'state') + if self._ha_state: + return self._ha_state try: - with open(ha_state_path, 'r') as f: - state = f.read() + with open(self.ha_state_path, 'r') as f: + self._ha_state = f.read() except (OSError, IOError): LOG.debug('Error while reading HA state for %s', self.router_id) - return state or 'unknown' + return self._ha_state or 'unknown' @ha_state.setter def ha_state(self, new_state): - ha_state_path = self.keepalived_manager.get_full_config_file_path( - 'state') + self._ha_state = new_state try: - with open(ha_state_path, 'w') as f: + with open(self.ha_state_path, 'w') as f: f.write(new_state) except (OSError, IOError): LOG.error('Error while writing HA state for %s', diff --git a/neutron/tests/functional/agent/l3/framework.py b/neutron/tests/functional/agent/l3/framework.py index 30a5c5ca4d1..fba68421d8c 100644 --- a/neutron/tests/functional/agent/l3/framework.py +++ b/neutron/tests/functional/agent/l3/framework.py @@ -133,6 +133,12 @@ class L3AgentTestFramework(base.BaseSudoTestCase): enable_pf_floating_ip), qos_policy_id=qos_policy_id) + def change_router_state(self, router_id, state): + ri = self.agent.router_info.get(router_id) + if not ri: + self.fail('Router %s is not present in the L3 agent' % router_id) + ri.ha_state = state + def _test_conntrack_disassociate_fip(self, ha): '''Test that conntrack immediately drops stateful connection that uses floating IP once it's disassociated. @@ -494,7 +500,8 @@ class L3AgentTestFramework(base.BaseSudoTestCase): # so there's no need to check that explicitly. 
self.assertFalse(self._namespace_exists(router.ns_name)) common_utils.wait_until_true( - lambda: not self._metadata_proxy_exists(self.agent.conf, router)) + lambda: not self._metadata_proxy_exists(self.agent.conf, router), + timeout=10) def _assert_snat_chains(self, router): self.assertFalse(router.iptables_manager.is_chain_empty( diff --git a/neutron/tests/functional/agent/l3/test_ha_router.py b/neutron/tests/functional/agent/l3/test_ha_router.py index 22d8713ddda..2b08361aa4e 100644 --- a/neutron/tests/functional/agent/l3/test_ha_router.py +++ b/neutron/tests/functional/agent/l3/test_ha_router.py @@ -37,7 +37,8 @@ class L3HATestCase(framework.L3AgentTestFramework): def test_keepalived_state_change_notification(self): enqueue_mock = mock.patch.object( - self.agent, 'enqueue_state_change').start() + self.agent, 'enqueue_state_change', + side_effect=self.change_router_state).start() router_info = self.generate_router_info(enable_ha=True) router = self.manage_router(self.agent, router_info) common_utils.wait_until_true(lambda: router.ha_state == 'master') diff --git a/neutron/tests/unit/agent/l3/test_agent.py b/neutron/tests/unit/agent/l3/test_agent.py index e458c161507..bd26d7b49cb 100644 --- a/neutron/tests/unit/agent/l3/test_agent.py +++ b/neutron/tests/unit/agent/l3/test_agent.py @@ -228,23 +228,59 @@ class TestBasicRouterOperations(BasicRouterOperationsFramework): # Make sure the exceptional code path has coverage agent.enqueue_state_change(non_existent_router, 'master') + def _enqueue_state_change_transitions(self, transitions, num_called): + self.conf.set_override('ha_vrrp_advert_int', 1) + agent = l3_agent.L3NATAgent(HOSTNAME, self.conf) + agent._update_transition_state('router_id') + with mock.patch.object(agent, '_get_router_info', return_value=None) \ + as mock_get_router_info: + for state in transitions: + agent.enqueue_state_change('router_id', state) + eventlet.sleep(0.2) + # NOTE(ralonsoh): the wait process should be done inside the mock + # context, 
to allow the spawned thread to call the mocked function + # before the context ends. + eventlet.sleep(self.conf.ha_vrrp_advert_int + 2) + + if num_called: + mock_get_router_info.assert_has_calls( + [mock.call('router_id') for _ in range(num_called)]) + else: + mock_get_router_info.assert_not_called() + + def test_enqueue_state_change_from_none_to_master(self): + self._enqueue_state_change_transitions(['master'], 1) + + def test_enqueue_state_change_from_none_to_backup(self): + self._enqueue_state_change_transitions(['backup'], 1) + + def test_enqueue_state_change_from_none_to_master_to_backup(self): + self._enqueue_state_change_transitions(['master', 'backup'], 0) + + def test_enqueue_state_change_from_none_to_backup_to_master(self): + self._enqueue_state_change_transitions(['backup', 'master'], 2) + def test_enqueue_state_change_metadata_disable(self): self.conf.set_override('enable_metadata_proxy', False) + self.conf.set_override('ha_vrrp_advert_int', 1) agent = l3_agent.L3NATAgent(HOSTNAME, self.conf) router = mock.Mock() router_info = mock.MagicMock() agent.router_info[router.id] = router_info agent._update_metadata_proxy = mock.Mock() agent.enqueue_state_change(router.id, 'master') + eventlet.sleep(self.conf.ha_vrrp_advert_int + 2) self.assertFalse(agent._update_metadata_proxy.call_count) def test_enqueue_state_change_l3_extension(self): + self.conf.set_override('ha_vrrp_advert_int', 1) agent = l3_agent.L3NATAgent(HOSTNAME, self.conf) router = mock.Mock() router_info = mock.MagicMock() agent.router_info[router.id] = router_info agent.l3_ext_manager.ha_state_change = mock.Mock() agent.enqueue_state_change(router.id, 'master') + eventlet.sleep(self.conf.ha_vrrp_advert_int + 2) agent.l3_ext_manager.ha_state_change.assert_called_once_with( agent.context, {'router_id': router.id, 'state': 'master'})