
Add check for HA state

If all agents are shown as standby, it is possible that a state-change
notification was lost due to problems with RabbitMQ. This change adds a
check of the HA state in fetch_and_sync_all_routers: if the state
reported by the server differs from the local keepalived state, the
server is notified so that the state can be corrected.

Also change _get_bindings_and_update_router_state_for_dead_agents to
set standby for a dead agent only when more than one binding is active.

Change-Id: If5596eb24041ea9fae1d5d2563dcaf655c5face7
Closes-Bug: #1648242
Tag: 10.0.0.0b3
Author: AKamyshnikova
Parent commit: 1927da1bc7

neutron/agent/l3/agent.py (+4, -0)

@@ -575,6 +575,10 @@ class L3NATAgent(ha.AgentMixin,
                             ns_manager.keep_ext_net(ext_net_id)
                         elif is_snat_agent:
                             ns_manager.ensure_snat_cleanup(r['id'])
+                    # For HA routers check that DB state matches actual state
+                    if r.get('ha'):
+                        self.check_ha_state_for_router(
+                            r['id'], r.get(l3_constants.HA_ROUTER_STATE_KEY))
                     update = queue.RouterUpdate(
                         r['id'],
                         queue.PRIORITY_SYNC_ROUTERS_TASK,
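Pulled out of the hunk above into a toy, self-contained form, this is how the periodic sync drives the new check. FakeAgent, routers_from_server, and the HA_ROUTER_STATE_KEY placeholder are illustrative stand-ins only; the real loop lives in L3NATAgent.fetch_and_sync_all_routers and the real key comes from neutron.common.constants.

# Toy sketch of the agent-side hook; names are stand-ins, not patch code.
HA_ROUTER_STATE_KEY = 'ha_router_state'   # placeholder for l3_constants.HA_ROUTER_STATE_KEY


class FakeAgent(object):
    def check_ha_state_for_router(self, router_id, server_state):
        # In the real agent this compares server_state with the local
        # keepalived state and re-notifies the server on mismatch.
        print('router %s: server reports %r' % (router_id, server_state))


agent = FakeAgent()
routers_from_server = [
    {'id': 'router-1', 'ha': True, HA_ROUTER_STATE_KEY: 'standby'},
    {'id': 'router-2'},   # non-HA router: no check
]

for r in routers_from_server:
    # The four added lines: only HA routers get the consistency check.
    if r.get('ha'):
        agent.check_ha_state_for_router(r['id'], r.get(HA_ROUTER_STATE_KEY))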

neutron/agent/l3/ha.py (+23, -9)

@@ -22,12 +22,17 @@ import webob
 
 from neutron._i18n import _LI
 from neutron.agent.linux import utils as agent_utils
+from neutron.common import constants
 from neutron.notifiers import batch_notifier
 
 LOG = logging.getLogger(__name__)
 
 KEEPALIVED_STATE_CHANGE_SERVER_BACKLOG = 4096
 
+TRANSLATION_MAP = {'master': constants.HA_ROUTER_STATE_ACTIVE,
+                   'backup': constants.HA_ROUTER_STATE_STANDBY,
+                   'fault': constants.HA_ROUTER_STATE_STANDBY}
+
 
 class KeepalivedStateChangeHandler(object):
     def __init__(self, agent):
@@ -77,6 +82,21 @@ class AgentMixin(object):
             self._calculate_batch_duration(), self.notify_server)
         eventlet.spawn(self._start_keepalived_notifications_server)
 
+    def _get_router_info(self, router_id):
+        try:
+            return self.router_info[router_id]
+        except KeyError:
+            LOG.info(_LI('Router %s is not managed by this agent. It was '
+                         'possibly deleted concurrently.'), router_id)
+
+    def check_ha_state_for_router(self, router_id, current_state):
+        ri = self._get_router_info(router_id)
+        if ri and current_state != TRANSLATION_MAP[ri.ha_state]:
+            LOG.debug("Updating server with state %(state)s for router "
+                      "%(router_id)s", {'router_id': router_id,
+                                        'state': ri.ha_state})
+            self.state_change_notifier.queue_event((router_id, ri.ha_state))
+
     def _start_keepalived_notifications_server(self):
         state_change_server = (
             L3AgentKeepalivedStateChangeServer(self, self.conf))
@@ -97,11 +117,8 @@ class AgentMixin(object):
                  {'router_id': router_id,
                   'state': state})
 
-        try:
-            ri = self.router_info[router_id]
-        except KeyError:
-            LOG.info(_LI('Router %s is not managed by this agent. It was '
-                         'possibly deleted concurrently.'), router_id)
+        ri = self._get_router_info(router_id)
+        if ri is None:
             return
 
         self._configure_ipv6_ra_on_ext_gw_port_if_necessary(ri, state)
@@ -144,10 +161,7 @@ class AgentMixin(object):
             ri.disable_radvd()
 
     def notify_server(self, batched_events):
-        translation_map = {'master': 'active',
-                           'backup': 'standby',
-                           'fault': 'standby'}
-        translated_states = dict((router_id, translation_map[state]) for
+        translated_states = dict((router_id, TRANSLATION_MAP[state]) for
                                  router_id, state in batched_events)
         LOG.debug('Updating server with HA routers states %s',
                   translated_states)
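The agent-side recovery logic from the hunks above, reduced to a self-contained sketch. needs_renotify is an illustrative helper, not part of the patch; in the patch the comparison happens inside AgentMixin.check_ha_state_for_router and a mismatch is pushed onto the existing state_change_notifier batch queue, which notify_server then translates for the server.

# Minimal, self-contained sketch of the state-mismatch check
# (illustrative helper, not the actual neutron objects).

# keepalived state on the agent -> state stored on the server
TRANSLATION_MAP = {'master': 'active',
                   'backup': 'standby',
                   'fault': 'standby'}


def needs_renotify(server_state, local_keepalived_state):
    """Return True if the server's view differs from the local state.

    server_state is what the server reported ('active'/'standby');
    local_keepalived_state is 'master'/'backup'/'fault'.
    """
    return server_state != TRANSLATION_MAP[local_keepalived_state]


# A notification was lost: the server still thinks the router is standby
# although keepalived promoted it to master, so the agent re-notifies.
assert needs_renotify('standby', 'master') is True
# States agree: nothing to re-send.
assert needs_renotify('standby', 'backup') is False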

neutron/db/l3_hamode_db.py (+13, -10)

@@ -620,16 +620,19 @@ class L3_HA_NAT_db_mixin(l3_dvr_db.L3_NAT_with_dvr_db_mixin,
         """
         with context.session.begin(subtransactions=True):
             bindings = self.get_ha_router_port_bindings(context, [router_id])
-            dead_agents = [
-                binding.agent for binding in bindings
-                if binding.state == n_const.HA_ROUTER_STATE_ACTIVE and
-                not (binding.agent.is_active and binding.agent.admin_state_up)]
-
-            for dead_agent in dead_agents:
-                self.update_routers_states(
-                    context, {router_id: n_const.HA_ROUTER_STATE_STANDBY},
-                    dead_agent.host)
-
+            dead_agents = []
+            active = [binding for binding in bindings
+                      if binding.state == n_const.HA_ROUTER_STATE_ACTIVE]
+            # Check dead agents only if we have more than one active agent
+            if len(active) > 1:
+                dead_agents = [binding.agent for binding in active
+                               if not (binding.agent.is_active and
+                                       binding.agent.admin_state_up)]
+                for dead_agent in dead_agents:
+                    self.update_routers_states(
+                        context,
+                        {router_id: n_const.HA_ROUTER_STATE_STANDBY},
+                        dead_agent.host)
         if dead_agents:
            return self.get_ha_router_port_bindings(context, [router_id])
         return bindings
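Read in isolation, the server-side guard amounts to the sketch below: a dead agent's active binding is demoted to standby only when more than one binding reports active. Binding and hosts_to_demote are illustrative stand-ins for the HA port binding objects and the logic inside _get_bindings_and_update_router_state_for_dead_agents.

# Self-contained sketch of the "demote dead agents only if more than one
# binding is active" guard (illustrative data structures).
from collections import namedtuple

Binding = namedtuple('Binding', ['host', 'state', 'agent_alive', 'admin_up'])


def hosts_to_demote(bindings):
    """Return the hosts whose active binding should be set to standby."""
    active = [b for b in bindings if b.state == 'active']
    # With a single active binding, keep it active even if the agent looks
    # dead; flipping it would leave the router with no active instance when
    # RabbitMQ merely lost the notification.
    if len(active) <= 1:
        return []
    return [b.host for b in active if not (b.agent_alive and b.admin_up)]


# Two bindings report active and one agent is down: demote the dead one.
print(hosts_to_demote([Binding('node-1', 'active', False, True),
                       Binding('node-2', 'active', True, True)]))   # ['node-1']
# Only one active binding: nothing is demoted, even if its agent is down.
print(hosts_to_demote([Binding('node-1', 'active', False, True),
                       Binding('node-2', 'standby', True, True)]))  # []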

neutron/tests/unit/agent/l3/test_agent.py (+42, -0)

@@ -211,6 +211,48 @@ class TestBasicRouterOperations(BasicRouterOperationsFramework):
         agent.enqueue_state_change(router.id, 'master')
         self.assertFalse(agent._update_metadata_proxy.call_count)
 
+    def test_check_ha_state_for_router_master_standby(self):
+        agent = l3_agent.L3NATAgent(HOSTNAME, self.conf)
+        router = mock.Mock()
+        router.id = '1234'
+        router_info = mock.MagicMock()
+        agent.router_info[router.id] = router_info
+        router_info.ha_state = 'master'
+        with mock.patch.object(agent.state_change_notifier,
+                               'queue_event') as queue_event:
+            agent.check_ha_state_for_router(router.id,
+                                            n_const.HA_ROUTER_STATE_STANDBY)
+            queue_event.assert_called_once_with((router.id, 'master'))
+
+    def test_check_ha_state_for_router_standby_standby(self):
+        agent = l3_agent.L3NATAgent(HOSTNAME, self.conf)
+        router = mock.Mock()
+        router.id = '1234'
+        router_info = mock.MagicMock()
+        agent.router_info[router.id] = router_info
+        router_info.ha_state = 'backup'
+        with mock.patch.object(agent.state_change_notifier,
+                               'queue_event') as queue_event:
+            agent.check_ha_state_for_router(router.id,
+                                            n_const.HA_ROUTER_STATE_STANDBY)
+            queue_event.assert_not_called()
+
+    def test_periodic_sync_routers_task_call_check_ha_state_for_router(self):
+        agent = l3_agent.L3NATAgentWithStateReport(HOSTNAME, self.conf)
+        ha_id = _uuid()
+        active_routers = [
+            {'id': ha_id,
+             n_const.HA_ROUTER_STATE_KEY: n_const.HA_ROUTER_STATE_STANDBY,
+             'ha': True},
+            {'id': _uuid()}]
+        self.plugin_api.get_router_ids.return_value = [r['id'] for r
+                                                       in active_routers]
+        self.plugin_api.get_routers.return_value = active_routers
+        with mock.patch.object(agent, 'check_ha_state_for_router') as check:
+            agent.periodic_sync_routers_task(agent.context)
+            check.assert_called_once_with(ha_id,
+                                          n_const.HA_ROUTER_STATE_STANDBY)
+
     def test_periodic_sync_routers_task_raise_exception(self):
         agent = l3_agent.L3NATAgent(HOSTNAME, self.conf)
         self.plugin_api.get_router_ids.return_value = ['fake_id']

neutron/tests/unit/db/test_l3_hamode_db.py (+28, -6)

@@ -190,29 +190,51 @@ class L3HATestCase(L3HATestFramework):
             self.admin_ctx, router['id'])
         self.assertEqual([], bindings)
 
-    def _assert_ha_state_for_agent_is_standby(self, router, agent):
+    def _assert_ha_state_for_agent(self, router, agent,
+                                   state=n_const.HA_ROUTER_STATE_STANDBY):
         bindings = (
             self.plugin.get_l3_bindings_hosting_router_with_ha_states(
                 self.admin_ctx, router['id']))
         agent_ids = [(a[0]['id'], a[1]) for a in bindings]
-        self.assertIn((agent['id'], 'standby'), agent_ids)
+        self.assertIn((agent['id'], state), agent_ids)
 
     def test_get_l3_bindings_hosting_router_with_ha_states_active_and_dead(
             self):
         router = self._create_router()
         self.plugin.update_routers_states(
-            self.admin_ctx, {router['id']: 'active'}, self.agent1['host'])
+            self.admin_ctx, {router['id']: n_const.HA_ROUTER_STATE_ACTIVE},
+            self.agent1['host'])
+        self.plugin.update_routers_states(
+            self.admin_ctx, {router['id']: n_const.HA_ROUTER_STATE_ACTIVE},
+            self.agent2['host'])
         with mock.patch.object(agent_utils, 'is_agent_down',
                                return_value=True):
-            self._assert_ha_state_for_agent_is_standby(router, self.agent1)
+            self._assert_ha_state_for_agent(router, self.agent1)
 
     def test_get_l3_bindings_hosting_router_agents_admin_state_up_is_false(
             self):
         router = self._create_router()
         self.plugin.update_routers_states(
-            self.admin_ctx, {router['id']: 'active'}, self.agent1['host'])
+            self.admin_ctx, {router['id']: n_const.HA_ROUTER_STATE_ACTIVE},
+            self.agent1['host'])
+        self.plugin.update_routers_states(
+            self.admin_ctx, {router['id']: n_const.HA_ROUTER_STATE_ACTIVE},
+            self.agent2['host'])
         helpers.set_agent_admin_state(self.agent1['id'])
-        self._assert_ha_state_for_agent_is_standby(router, self.agent1)
+        self._assert_ha_state_for_agent(router, self.agent1)
+
+    def test_get_l3_bindings_hosting_router_with_ha_states_one_dead(self):
+        router = self._create_router()
+        self.plugin.update_routers_states(
+            self.admin_ctx, {router['id']: n_const.HA_ROUTER_STATE_ACTIVE},
+            self.agent1['host'])
+        self.plugin.update_routers_states(
+            self.admin_ctx, {router['id']: n_const.HA_ROUTER_STATE_STANDBY},
+            self.agent2['host'])
+        with mock.patch.object(agent_utils, 'is_agent_down',
+                               return_value=True):
+            self._assert_ha_state_for_agent(
+                router, self.agent1, state=n_const.HA_ROUTER_STATE_ACTIVE)
 
     def test_router_created_in_active_state(self):
         router = self._create_router()
