Revert "[HA] Do not add initial state change delay in HA router"

This reverts commit c20f2e5136.

The fix of bug #1945512 reintroduced bug #1837635 as after
the initial backup state ha router can transition to
'primary' state on multiple hosts and due to this
delay multiple routers get into 'active' ha_state even
if one of the host quickly transition to backup after
the primary state.
The issue got visible since ha router fullstack tests
are added as part of [1].

[1] https://review.opendev.org/c/openstack/neutron/+/917429

Related-Bug: #1837635
Related-Bug: #1945512
Related-Bug: #2083609
Change-Id: I83b53a07362861da98b8361dafd95e94e5048322
(cherry picked from commit a3689956dd)
Conflicts:
	neutron/agent/l3/ha.py
	neutron/tests/unit/agent/l3/test_agent.py
This commit is contained in:
yatinkarel 2024-12-16 09:59:52 +05:30 committed by yatin
parent b962109dff
commit 549af9e07c
2 changed files with 2 additions and 43 deletions

View File

@ -17,9 +17,6 @@ import os
import threading import threading
import eventlet import eventlet
from neutron_lib.callbacks import events
from neutron_lib.callbacks import registry
from neutron_lib.callbacks import resources
from neutron_lib import constants from neutron_lib import constants
from oslo_log import log as logging from oslo_log import log as logging
from oslo_utils import fileutils from oslo_utils import fileutils
@ -79,7 +76,6 @@ class L3AgentKeepalivedStateChangeServer(object):
server.wait() server.wait()
@registry.has_registry_receivers
class AgentMixin(object): class AgentMixin(object):
def __init__(self, host): def __init__(self, host):
self._init_ha_conf_path() self._init_ha_conf_path()
@ -91,13 +87,6 @@ class AgentMixin(object):
eventlet.spawn(self._start_keepalived_notifications_server) eventlet.spawn(self._start_keepalived_notifications_server)
self._transition_states = {} self._transition_states = {}
self._transition_state_mutex = threading.Lock() self._transition_state_mutex = threading.Lock()
self._initial_state_change_per_router = set()
def initial_state_change(self, router_id):
initial_state = router_id not in self._initial_state_change_per_router
if initial_state:
self._initial_state_change_per_router.add(router_id)
return initial_state
def _get_router_info(self, router_id): def _get_router_info(self, router_id):
try: try:
@ -106,13 +95,6 @@ class AgentMixin(object):
LOG.info('Router %s is not managed by this agent. It was ' LOG.info('Router %s is not managed by this agent. It was '
'possibly deleted concurrently.', router_id) 'possibly deleted concurrently.', router_id)
@registry.receives(resources.ROUTER, [events.AFTER_DELETE])
def _delete_router(self, resource, event, trigger, payload):
try:
self._initial_state_change_per_router.remove(payload.resource_id)
except KeyError:
pass
def check_ha_state_for_router(self, router_id, current_state): def check_ha_state_for_router(self, router_id, current_state):
ri = self._get_router_info(router_id) ri = self._get_router_info(router_id)
if not ri: if not ri:
@ -166,7 +148,7 @@ class AgentMixin(object):
def _enqueue_state_change(self, router_id, state): def _enqueue_state_change(self, router_id, state):
# NOTE(ralonsoh): move 'primary' and 'backup' constants to n-lib # NOTE(ralonsoh): move 'primary' and 'backup' constants to n-lib
if state == 'primary' and not self.initial_state_change(router_id): if state == 'primary':
eventlet.sleep(self.conf.ha_vrrp_advert_int) eventlet.sleep(self.conf.ha_vrrp_advert_int)
transition_state = self._update_transition_state(router_id) transition_state = self._update_transition_state(router_id)
if transition_state != state: if transition_state != state:

View File

@ -274,15 +274,7 @@ class TestBasicRouterOperations(BasicRouterOperationsFramework):
self._enqueue_state_change_transitions(['backup'], 1) self._enqueue_state_change_transitions(['backup'], 1)
def test_enqueue_state_change_from_none_to_primary_to_backup(self): def test_enqueue_state_change_from_none_to_primary_to_backup(self):
# First transition (to primary), won't have a delay. self._enqueue_state_change_transitions(['primary', 'backup'], 0)
self._enqueue_state_change_transitions(['primary', 'backup'], 2)
def test_enqueue_state_change_from_none_to_primary_to_backup_twice(self):
# Second transition to primary will have a delay and will be overridden
# by the second transition to backup; that means the transition from
# backup (second state) to primary (third state) is dismissed.
self._enqueue_state_change_transitions(
['primary', 'backup', 'primary', 'backup'], 2)
def test_enqueue_state_change_from_none_to_backup_to_primary(self): def test_enqueue_state_change_from_none_to_backup_to_primary(self):
self._enqueue_state_change_transitions(['backup', 'primary'], 2) self._enqueue_state_change_transitions(['backup', 'primary'], 2)
@ -2630,21 +2622,6 @@ class TestBasicRouterOperations(BasicRouterOperationsFramework):
agent.router_removed_from_agent(None, {'router_id': FAKE_ID}) agent.router_removed_from_agent(None, {'router_id': FAKE_ID})
self.assertEqual(1, agent._queue.add.call_count) self.assertEqual(1, agent._queue.add.call_count)
@mock.patch.object(metadata_driver.MetadataDriver,
'destroy_monitored_metadata_proxy')
def test__router_removed(self, *args):
agent = l3_agent.L3NATAgent(HOSTNAME, self.conf)
ri = mock.Mock(router={'id': 'router_id'})
agent._initial_state_change_per_router.add('router_id')
self.assertEqual({'router_id'}, agent._initial_state_change_per_router)
for _ in range(2):
# The second call will trigger a KeyError exception in
# AgentMixin._delete_router that should be dismissed.
agent.router_info['router_id'] = mock.ANY
agent.pd = mock.Mock(routers={'router_id': {'subnets': []}})
agent._router_removed(ri, 'router_id')
self.assertEqual(set([]), agent._initial_state_change_per_router)
def test_added_to_agent(self): def test_added_to_agent(self):
agent = l3_agent.L3NATAgent(HOSTNAME, self.conf) agent = l3_agent.L3NATAgent(HOSTNAME, self.conf)
agent._queue = mock.Mock() agent._queue = mock.Mock()