From 23b57a6dae715d71b02f0f6bf528294e7bdadeea Mon Sep 17 00:00:00 2001 From: yatinkarel <ykarel@redhat.com> Date: Mon, 16 Dec 2024 09:59:52 +0530 Subject: [PATCH] Revert "[HA] Do not add initial state change delay in HA router" This reverts commit c20f2e5136fd241f4be5c37403ab1ed54cdaefb5. The fix of bug #1945512 reintroduced bug #1837635 as after the initial backup state ha router can transition to 'primary' state on multiple hosts and due to this delay multiple routers get into 'active' ha_state even if one of the host quickly transition to backup after the primary state. The issue got visible since ha router fullstack tests are added as part of [1]. [1] https://review.opendev.org/c/openstack/neutron/+/917429 Related-Bug: #1837635 Related-Bug: #1945512 Related-Bug: #2083609 Change-Id: I83b53a07362861da98b8361dafd95e94e5048322 (cherry picked from commit a3689956dde80b9639a3e805257ac02e1044a4c2) Conflicts: neutron/agent/l3/ha.py neutron/tests/unit/agent/l3/test_agent.py --- neutron/agent/l3/ha.py | 20 +----------------- neutron/tests/unit/agent/l3/test_agent.py | 25 +---------------------- 2 files changed, 2 insertions(+), 43 deletions(-) diff --git a/neutron/agent/l3/ha.py b/neutron/agent/l3/ha.py index f0369193818..c6cf90cfb2e 100644 --- a/neutron/agent/l3/ha.py +++ b/neutron/agent/l3/ha.py @@ -17,9 +17,6 @@ import os import threading import eventlet -from neutron_lib.callbacks import events -from neutron_lib.callbacks import registry -from neutron_lib.callbacks import resources from neutron_lib import constants from oslo_log import log as logging from oslo_utils import fileutils @@ -79,7 +76,6 @@ class L3AgentKeepalivedStateChangeServer(object): server.wait() -@registry.has_registry_receivers class AgentMixin(object): def __init__(self, host): self._init_ha_conf_path() @@ -91,13 +87,6 @@ class AgentMixin(object): eventlet.spawn(self._start_keepalived_notifications_server) self._transition_states = {} self._transition_state_mutex = threading.Lock() - self._initial_state_change_per_router = set() - - def initial_state_change(self, router_id): - initial_state = router_id not in self._initial_state_change_per_router - if initial_state: - self._initial_state_change_per_router.add(router_id) - return initial_state def _get_router_info(self, router_id): try: @@ -106,13 +95,6 @@ class AgentMixin(object): LOG.info('Router %s is not managed by this agent. It was ' 'possibly deleted concurrently.', router_id) - @registry.receives(resources.ROUTER, [events.AFTER_DELETE]) - def _delete_router(self, resource, event, trigger, payload): - try: - self._initial_state_change_per_router.remove(payload.resource_id) - except KeyError: - pass - def check_ha_state_for_router(self, router_id, current_state): ri = self._get_router_info(router_id) if not ri: @@ -166,7 +148,7 @@ class AgentMixin(object): def _enqueue_state_change(self, router_id, state): # NOTE(ralonsoh): move 'primary' and 'backup' constants to n-lib - if state == 'primary' and not self.initial_state_change(router_id): + if state == 'primary': eventlet.sleep(self.conf.ha_vrrp_advert_int) transition_state = self._update_transition_state(router_id) if transition_state != state: diff --git a/neutron/tests/unit/agent/l3/test_agent.py b/neutron/tests/unit/agent/l3/test_agent.py index b53014a4093..d0b05851c20 100644 --- a/neutron/tests/unit/agent/l3/test_agent.py +++ b/neutron/tests/unit/agent/l3/test_agent.py @@ -274,15 +274,7 @@ class TestBasicRouterOperations(BasicRouterOperationsFramework): self._enqueue_state_change_transitions(['backup'], 1) def test_enqueue_state_change_from_none_to_primary_to_backup(self): - # First transition (to primary), won't have a delay. - self._enqueue_state_change_transitions(['primary', 'backup'], 2) - - def test_enqueue_state_change_from_none_to_primary_to_backup_twice(self): - # Second transition to primary will have a delay and will be overridden - # by the second transition to backup; that means the transition from - # backup (second state) to primary (third state) is dismissed. - self._enqueue_state_change_transitions( - ['primary', 'backup', 'primary', 'backup'], 2) + self._enqueue_state_change_transitions(['primary', 'backup'], 0) def test_enqueue_state_change_from_none_to_backup_to_primary(self): self._enqueue_state_change_transitions(['backup', 'primary'], 2) @@ -2630,21 +2622,6 @@ class TestBasicRouterOperations(BasicRouterOperationsFramework): agent.router_removed_from_agent(None, {'router_id': FAKE_ID}) self.assertEqual(1, agent._queue.add.call_count) - @mock.patch.object(metadata_driver.MetadataDriver, - 'destroy_monitored_metadata_proxy') - def test__router_removed(self, *args): - agent = l3_agent.L3NATAgent(HOSTNAME, self.conf) - ri = mock.Mock(router={'id': 'router_id'}) - agent._initial_state_change_per_router.add('router_id') - self.assertEqual({'router_id'}, agent._initial_state_change_per_router) - for _ in range(2): - # The second call will trigger a KeyError exception in - # AgentMixin._delete_router that should be dismissed. - agent.router_info['router_id'] = mock.ANY - agent.pd = mock.Mock(routers={'router_id': {'subnets': []}}) - agent._router_removed(ri, 'router_id') - self.assertEqual(set([]), agent._initial_state_change_per_router) - def test_added_to_agent(self): agent = l3_agent.L3NATAgent(HOSTNAME, self.conf) agent._queue = mock.Mock()