Browse Source

Add check for ha state

If all agents are shown as standby, it is possible that state change
notifications were lost due to problems with RabbitMQ. This change adds
a check for the HA state in fetch_and_sync_all_routers. If the state
differs, notify the server that the state should be changed.

Also change _get_bindings_and_update_router_state_for_dead_agents
to set standby for a dead agent only when more than one agent is
active.

(cherry picked from commit 1927da1bc7)
Change-Id: If5596eb24041ea9fae1d5d2563dcaf655c5face7
Closes-bug:#1648242
tags/9.2.0
AKamyshnikova 2 years ago
parent
commit
ff7c5c25d3

+ 4
- 0
neutron/agent/l3/agent.py View File

@@ -580,6 +580,10 @@ class L3NATAgent(ha.AgentMixin,
580 580
                             ns_manager.keep_ext_net(ext_net_id)
581 581
                         elif is_snat_agent:
582 582
                             ns_manager.ensure_snat_cleanup(r['id'])
583
+                    # For HA routers check that DB state matches actual state
584
+                    if r.get('ha'):
585
+                        self.check_ha_state_for_router(
586
+                            r['id'], r.get(l3_constants.HA_ROUTER_STATE_KEY))
583 587
                     update = queue.RouterUpdate(
584 588
                         r['id'],
585 589
                         queue.PRIORITY_SYNC_ROUTERS_TASK,

+ 23
- 9
neutron/agent/l3/ha.py View File

@@ -23,6 +23,7 @@ import webob
23 23
 from neutron._i18n import _, _LI
24 24
 from neutron.agent.linux import keepalived
25 25
 from neutron.agent.linux import utils as agent_utils
26
+from neutron.common import constants
26 27
 from neutron.common import utils as common_utils
27 28
 from neutron.notifiers import batch_notifier
28 29
 
@@ -54,6 +55,10 @@ OPTS = [
54 55
                       'on the agent node.')),
55 56
 ]
56 57
 
58
+TRANSLATION_MAP = {'master': constants.HA_ROUTER_STATE_ACTIVE,
59
+                   'backup': constants.HA_ROUTER_STATE_STANDBY,
60
+                   'fault': constants.HA_ROUTER_STATE_STANDBY}
61
+
57 62
 
58 63
 class KeepalivedStateChangeHandler(object):
59 64
     def __init__(self, agent):
@@ -103,6 +108,21 @@ class AgentMixin(object):
103 108
             self._calculate_batch_duration(), self.notify_server)
104 109
         eventlet.spawn(self._start_keepalived_notifications_server)
105 110
 
111
+    def _get_router_info(self, router_id):
112
+        try:
113
+            return self.router_info[router_id]
114
+        except KeyError:
115
+            LOG.info(_LI('Router %s is not managed by this agent. It was '
116
+                         'possibly deleted concurrently.'), router_id)
117
+
118
+    def check_ha_state_for_router(self, router_id, current_state):
119
+        ri = self._get_router_info(router_id)
120
+        if ri and current_state != TRANSLATION_MAP[ri.ha_state]:
121
+            LOG.debug("Updating server with state %(state)s for router "
122
+                      "%(router_id)s", {'router_id': router_id,
123
+                                        'state': ri.ha_state})
124
+            self.state_change_notifier.queue_event((router_id, ri.ha_state))
125
+
106 126
     def _start_keepalived_notifications_server(self):
107 127
         state_change_server = (
108 128
             L3AgentKeepalivedStateChangeServer(self, self.conf))
@@ -123,11 +143,8 @@ class AgentMixin(object):
123 143
                  {'router_id': router_id,
124 144
                   'state': state})
125 145
 
126
-        try:
127
-            ri = self.router_info[router_id]
128
-        except KeyError:
129
-            LOG.info(_LI('Router %s is not managed by this agent. It was '
130
-                         'possibly deleted concurrently.'), router_id)
146
+        ri = self._get_router_info(router_id)
147
+        if ri is None:
131 148
             return
132 149
 
133 150
         self._configure_ipv6_ra_on_ext_gw_port_if_necessary(ri, state)
@@ -170,10 +187,7 @@ class AgentMixin(object):
170 187
             ri.disable_radvd()
171 188
 
172 189
     def notify_server(self, batched_events):
173
-        translation_map = {'master': 'active',
174
-                           'backup': 'standby',
175
-                           'fault': 'standby'}
176
-        translated_states = dict((router_id, translation_map[state]) for
190
+        translated_states = dict((router_id, TRANSLATION_MAP[state]) for
177 191
                                  router_id, state in batched_events)
178 192
         LOG.debug('Updating server with HA routers states %s',
179 193
                   translated_states)

+ 13
- 10
neutron/db/l3_hamode_db.py View File

@@ -674,16 +674,19 @@ class L3_HA_NAT_db_mixin(l3_dvr_db.L3_NAT_with_dvr_db_mixin,
674 674
         """
675 675
         with context.session.begin(subtransactions=True):
676 676
             bindings = self.get_ha_router_port_bindings(context, [router_id])
677
-            dead_agents = [
678
-                binding.agent for binding in bindings
679
-                if binding.state == n_const.HA_ROUTER_STATE_ACTIVE and
680
-                not (binding.agent.is_active and binding.agent.admin_state_up)]
681
-
682
-            for dead_agent in dead_agents:
683
-                self.update_routers_states(
684
-                    context, {router_id: n_const.HA_ROUTER_STATE_STANDBY},
685
-                    dead_agent.host)
686
-
677
+            dead_agents = []
678
+            active = [binding for binding in bindings
679
+                      if binding.state == n_const.HA_ROUTER_STATE_ACTIVE]
680
+            # Check dead agents only if we have more than one active agent
681
+            if len(active) > 1:
682
+                dead_agents = [binding.agent for binding in active
683
+                               if not (binding.agent.is_active and
684
+                                       binding.agent.admin_state_up)]
685
+                for dead_agent in dead_agents:
686
+                    self.update_routers_states(
687
+                        context,
688
+                        {router_id: n_const.HA_ROUTER_STATE_STANDBY},
689
+                        dead_agent.host)
687 690
         if dead_agents:
688 691
             return self.get_ha_router_port_bindings(context, [router_id])
689 692
         return bindings

+ 42
- 0
neutron/tests/unit/agent/l3/test_agent.py View File

@@ -210,6 +210,48 @@ class TestBasicRouterOperations(BasicRouterOperationsFramework):
210 210
         agent.enqueue_state_change(router.id, 'master')
211 211
         self.assertFalse(agent._update_metadata_proxy.call_count)
212 212
 
213
+    def test_check_ha_state_for_router_master_standby(self):
214
+        agent = l3_agent.L3NATAgent(HOSTNAME, self.conf)
215
+        router = mock.Mock()
216
+        router.id = '1234'
217
+        router_info = mock.MagicMock()
218
+        agent.router_info[router.id] = router_info
219
+        router_info.ha_state = 'master'
220
+        with mock.patch.object(agent.state_change_notifier,
221
+                               'queue_event') as queue_event:
222
+            agent.check_ha_state_for_router(router.id,
223
+                                            n_const.HA_ROUTER_STATE_STANDBY)
224
+            queue_event.assert_called_once_with((router.id, 'master'))
225
+
226
+    def test_check_ha_state_for_router_standby_standby(self):
227
+        agent = l3_agent.L3NATAgent(HOSTNAME, self.conf)
228
+        router = mock.Mock()
229
+        router.id = '1234'
230
+        router_info = mock.MagicMock()
231
+        agent.router_info[router.id] = router_info
232
+        router_info.ha_state = 'backup'
233
+        with mock.patch.object(agent.state_change_notifier,
234
+                               'queue_event') as queue_event:
235
+            agent.check_ha_state_for_router(router.id,
236
+                                            n_const.HA_ROUTER_STATE_STANDBY)
237
+            queue_event.assert_not_called()
238
+
239
+    def test_periodic_sync_routers_task_call_check_ha_state_for_router(self):
240
+        agent = l3_agent.L3NATAgentWithStateReport(HOSTNAME, self.conf)
241
+        ha_id = _uuid()
242
+        active_routers = [
243
+            {'id': ha_id,
244
+             n_const.HA_ROUTER_STATE_KEY: n_const.HA_ROUTER_STATE_STANDBY,
245
+             'ha': True},
246
+            {'id': _uuid()}]
247
+        self.plugin_api.get_router_ids.return_value = [r['id'] for r
248
+                                                       in active_routers]
249
+        self.plugin_api.get_routers.return_value = active_routers
250
+        with mock.patch.object(agent, 'check_ha_state_for_router') as check:
251
+            agent.periodic_sync_routers_task(agent.context)
252
+            check.assert_called_once_with(ha_id,
253
+                                          n_const.HA_ROUTER_STATE_STANDBY)
254
+
213 255
     def test_periodic_sync_routers_task_raise_exception(self):
214 256
         agent = l3_agent.L3NATAgent(HOSTNAME, self.conf)
215 257
         self.plugin_api.get_router_ids.return_value = ['fake_id']

+ 28
- 6
neutron/tests/unit/db/test_l3_hamode_db.py View File

@@ -187,29 +187,51 @@ class L3HATestCase(L3HATestFramework):
187 187
             self.admin_ctx, router['id'])
188 188
         self.assertEqual([], bindings)
189 189
 
190
-    def _assert_ha_state_for_agent_is_standby(self, router, agent):
190
+    def _assert_ha_state_for_agent(self, router, agent,
191
+                                   state=n_const.HA_ROUTER_STATE_STANDBY):
191 192
         bindings = (
192 193
             self.plugin.get_l3_bindings_hosting_router_with_ha_states(
193 194
                 self.admin_ctx, router['id']))
194 195
         agent_ids = [(a[0]['id'], a[1]) for a in bindings]
195
-        self.assertIn((agent['id'], 'standby'), agent_ids)
196
+        self.assertIn((agent['id'], state), agent_ids)
196 197
 
197 198
     def test_get_l3_bindings_hosting_router_with_ha_states_active_and_dead(
198 199
             self):
199 200
         router = self._create_router()
200 201
         self.plugin.update_routers_states(
201
-            self.admin_ctx, {router['id']: 'active'}, self.agent1['host'])
202
+            self.admin_ctx, {router['id']: n_const.HA_ROUTER_STATE_ACTIVE},
203
+            self.agent1['host'])
204
+        self.plugin.update_routers_states(
205
+            self.admin_ctx, {router['id']: n_const.HA_ROUTER_STATE_ACTIVE},
206
+            self.agent2['host'])
202 207
         with mock.patch.object(agents_db.AgentDbMixin, 'is_agent_down',
203 208
                                return_value=True):
204
-            self._assert_ha_state_for_agent_is_standby(router, self.agent1)
209
+            self._assert_ha_state_for_agent(router, self.agent1)
205 210
 
206 211
     def test_get_l3_bindings_hosting_router_agents_admin_state_up_is_false(
207 212
             self):
208 213
         router = self._create_router()
209 214
         self.plugin.update_routers_states(
210
-            self.admin_ctx, {router['id']: 'active'}, self.agent1['host'])
215
+            self.admin_ctx, {router['id']: n_const.HA_ROUTER_STATE_ACTIVE},
216
+            self.agent1['host'])
217
+        self.plugin.update_routers_states(
218
+            self.admin_ctx, {router['id']: n_const.HA_ROUTER_STATE_ACTIVE},
219
+            self.agent2['host'])
211 220
         helpers.set_agent_admin_state(self.agent1['id'])
212
-        self._assert_ha_state_for_agent_is_standby(router, self.agent1)
221
+        self._assert_ha_state_for_agent(router, self.agent1)
222
+
223
+    def test_get_l3_bindings_hosting_router_with_ha_states_one_dead(self):
224
+        router = self._create_router()
225
+        self.plugin.update_routers_states(
226
+            self.admin_ctx, {router['id']: n_const.HA_ROUTER_STATE_ACTIVE},
227
+            self.agent1['host'])
228
+        self.plugin.update_routers_states(
229
+            self.admin_ctx, {router['id']: n_const.HA_ROUTER_STATE_STANDBY},
230
+            self.agent2['host'])
231
+        with mock.patch.object(agents_db.AgentDbMixin, 'is_agent_down',
232
+                               return_value=True):
233
+            self._assert_ha_state_for_agent(
234
+                router, self.agent1, state=n_const.HA_ROUTER_STATE_ACTIVE)
213 235
 
214 236
     def test_router_created_in_active_state(self):
215 237
         router = self._create_router()

Loading…
Cancel
Save