Browse Source

Add check for ha state

If all agents are shown as standby, it is possible that state changes
were lost due to problems with RabbitMQ. This change adds a check
for the HA state in fetch_and_sync_all_routers. If the state differs,
the agent notifies the server that the state should be changed.

Also change _get_bindings_and_update_router_state_for_dead_agents
to set standby for a dead agent only when more than one binding is
active.

(cherry picked from commit 1927da1bc7)
Change-Id: If5596eb24041ea9fae1d5d2563dcaf655c5face7
Closes-bug:#1648242
tags/mitaka-eol
AKamyshnikova 2 years ago
parent
commit
c8a4fa4694

+ 4
- 0
neutron/agent/l3/agent.py View File

@@ -573,6 +573,10 @@ class L3NATAgent(firewall_l3_agent.FWaaSL3AgentRpcCallback,
573 573
                             ns_manager.keep_ext_net(ext_net_id)
574 574
                         elif is_snat_agent:
575 575
                             ns_manager.ensure_snat_cleanup(r['id'])
576
+                    # For HA routers check that DB state matches actual state
577
+                    if r.get('ha'):
578
+                        self.check_ha_state_for_router(
579
+                            r['id'], r.get(l3_constants.HA_ROUTER_STATE_KEY))
576 580
                     update = queue.RouterUpdate(
577 581
                         r['id'],
578 582
                         queue.PRIORITY_SYNC_ROUTERS_TASK,

+ 23
- 9
neutron/agent/l3/ha.py View File

@@ -23,6 +23,7 @@ import webob
23 23
 from neutron._i18n import _, _LI
24 24
 from neutron.agent.linux import keepalived
25 25
 from neutron.agent.linux import utils as agent_utils
26
+from neutron.common import constants
26 27
 from neutron.common import utils as common_utils
27 28
 from neutron.notifiers import batch_notifier
28 29
 
@@ -54,6 +55,10 @@ OPTS = [
54 55
                       'on the agent node.')),
55 56
 ]
56 57
 
58
+TRANSLATION_MAP = {'master': constants.HA_ROUTER_STATE_ACTIVE,
59
+                   'backup': constants.HA_ROUTER_STATE_STANDBY,
60
+                   'fault': constants.HA_ROUTER_STATE_STANDBY}
61
+
57 62
 
58 63
 class KeepalivedStateChangeHandler(object):
59 64
     def __init__(self, agent):
@@ -103,6 +108,21 @@ class AgentMixin(object):
103 108
             self._calculate_batch_duration(), self.notify_server)
104 109
         eventlet.spawn(self._start_keepalived_notifications_server)
105 110
 
111
+    def _get_router_info(self, router_id):
112
+        try:
113
+            return self.router_info[router_id]
114
+        except KeyError:
115
+            LOG.info(_LI('Router %s is not managed by this agent. It was '
116
+                         'possibly deleted concurrently.'), router_id)
117
+
118
+    def check_ha_state_for_router(self, router_id, current_state):
119
+        ri = self._get_router_info(router_id)
120
+        if ri and current_state != TRANSLATION_MAP[ri.ha_state]:
121
+            LOG.debug("Updating server with state %(state)s for router "
122
+                      "%(router_id)s", {'router_id': router_id,
123
+                                        'state': ri.ha_state})
124
+            self.state_change_notifier.queue_event((router_id, ri.ha_state))
125
+
106 126
     def _start_keepalived_notifications_server(self):
107 127
         state_change_server = (
108 128
             L3AgentKeepalivedStateChangeServer(self, self.conf))
@@ -123,11 +143,8 @@ class AgentMixin(object):
123 143
                  {'router_id': router_id,
124 144
                   'state': state})
125 145
 
126
-        try:
127
-            ri = self.router_info[router_id]
128
-        except KeyError:
129
-            LOG.info(_LI('Router %s is not managed by this agent. It was '
130
-                         'possibly deleted concurrently.'), router_id)
146
+        ri = self._get_router_info(router_id)
147
+        if ri is None:
131 148
             return
132 149
 
133 150
         self._configure_ipv6_ra_on_ext_gw_port_if_necessary(ri, state)
@@ -172,10 +189,7 @@ class AgentMixin(object):
172 189
             ri.disable_radvd()
173 190
 
174 191
     def notify_server(self, batched_events):
175
-        translation_map = {'master': 'active',
176
-                           'backup': 'standby',
177
-                           'fault': 'standby'}
178
-        translated_states = dict((router_id, translation_map[state]) for
192
+        translated_states = dict((router_id, TRANSLATION_MAP[state]) for
179 193
                                  router_id, state in batched_events)
180 194
         LOG.debug('Updating server with HA routers states %s',
181 195
                   translated_states)

+ 13
- 9
neutron/db/l3_hamode_db.py View File

@@ -647,15 +647,19 @@ class L3_HA_NAT_db_mixin(l3_dvr_db.L3_NAT_with_dvr_db_mixin,
647 647
         """
648 648
         with context.session.begin(subtransactions=True):
649 649
             bindings = self.get_ha_router_port_bindings(context, [router_id])
650
-            dead_agents = [
651
-                binding.agent for binding in bindings
652
-                if binding.state == constants.HA_ROUTER_STATE_ACTIVE and
653
-                not binding.agent.is_active]
654
-            for dead_agent in dead_agents:
655
-                self.update_routers_states(
656
-                    context, {router_id: constants.HA_ROUTER_STATE_STANDBY},
657
-                    dead_agent.host)
658
-
650
+            dead_agents = []
651
+            active = [binding for binding in bindings
652
+                      if binding.state == constants.HA_ROUTER_STATE_ACTIVE]
653
+            # Check dead agents only if we have more than one active agent
654
+            if len(active) > 1:
655
+                dead_agents = [binding.agent for binding in active
656
+                               if not (binding.agent.is_active and
657
+                                       binding.agent.admin_state_up)]
658
+                for dead_agent in dead_agents:
659
+                    self.update_routers_states(
660
+                        context,
661
+                        {router_id: constants.HA_ROUTER_STATE_STANDBY},
662
+                        dead_agent.host)
659 663
         if dead_agents:
660 664
             return self.get_ha_router_port_bindings(context, [router_id])
661 665
         return bindings

+ 43
- 0
neutron/tests/unit/agent/l3/test_agent.py View File

@@ -204,6 +204,49 @@ class TestBasicRouterOperations(BasicRouterOperationsFramework):
204 204
         agent.enqueue_state_change(router.id, 'master')
205 205
         self.assertFalse(agent._update_metadata_proxy.call_count)
206 206
 
207
+    def test_check_ha_state_for_router_master_standby(self):
208
+        agent = l3_agent.L3NATAgent(HOSTNAME, self.conf)
209
+        router = mock.Mock()
210
+        router.id = '1234'
211
+        router_info = mock.MagicMock()
212
+        agent.router_info[router.id] = router_info
213
+        router_info.ha_state = 'master'
214
+        with mock.patch.object(agent.state_change_notifier,
215
+                               'queue_event') as queue_event:
216
+            agent.check_ha_state_for_router(
217
+                router.id, l3_constants.HA_ROUTER_STATE_STANDBY)
218
+            queue_event.assert_called_once_with((router.id, 'master'))
219
+
220
+    def test_check_ha_state_for_router_standby_standby(self):
221
+        agent = l3_agent.L3NATAgent(HOSTNAME, self.conf)
222
+        router = mock.Mock()
223
+        router.id = '1234'
224
+        router_info = mock.MagicMock()
225
+        agent.router_info[router.id] = router_info
226
+        router_info.ha_state = 'backup'
227
+        with mock.patch.object(agent.state_change_notifier,
228
+                               'queue_event') as queue_event:
229
+            agent.check_ha_state_for_router(
230
+                router.id, l3_constants.HA_ROUTER_STATE_STANDBY)
231
+            queue_event.assert_not_called()
232
+
233
+    def test_periodic_sync_routers_task_call_check_ha_state_for_router(self):
234
+        agent = l3_agent.L3NATAgentWithStateReport(HOSTNAME, self.conf)
235
+        ha_id = _uuid()
236
+        active_routers = [
237
+            {'id': ha_id,
238
+             l3_constants.HA_ROUTER_STATE_KEY:
239
+                 l3_constants.HA_ROUTER_STATE_STANDBY,
240
+             'ha': True},
241
+            {'id': _uuid()}]
242
+        self.plugin_api.get_router_ids.return_value = [r['id'] for r
243
+                                                       in active_routers]
244
+        self.plugin_api.get_routers.return_value = active_routers
245
+        with mock.patch.object(agent, 'check_ha_state_for_router') as check:
246
+            agent.periodic_sync_routers_task(agent.context)
247
+            check.assert_called_once_with(ha_id,
248
+                                          l3_constants.HA_ROUTER_STATE_STANDBY)
249
+
207 250
     def test_periodic_sync_routers_task_raise_exception(self):
208 251
         agent = l3_agent.L3NATAgent(HOSTNAME, self.conf)
209 252
         self.plugin_api.get_router_ids.return_value = ['fake_id']

+ 42
- 10
neutron/tests/unit/db/test_l3_hamode_db.py View File

@@ -187,19 +187,51 @@ class L3HATestCase(L3HATestFramework):
187 187
             self.admin_ctx, router['id'])
188 188
         self.assertEqual([], bindings)
189 189
 
190
+    def _assert_ha_state_for_agent(self, router, agent,
191
+                                   state=constants.HA_ROUTER_STATE_STANDBY):
192
+        bindings = (
193
+            self.plugin.get_l3_bindings_hosting_router_with_ha_states(
194
+                self.admin_ctx, router['id']))
195
+        agent_ids = [(a[0]['id'], a[1]) for a in bindings]
196
+        self.assertIn((agent['id'], state), agent_ids)
197
+
190 198
     def test_get_l3_bindings_hosting_router_with_ha_states_active_and_dead(
191 199
             self):
192 200
         router = self._create_router()
193
-        with mock.patch.object(agents_db.Agent, 'is_active',
194
-                               new_callable=mock.PropertyMock,
195
-                               return_value=False):
196
-            self.plugin.update_routers_states(
197
-                self.admin_ctx, {router['id']: 'active'}, self.agent1['host'])
198
-            bindings = (
199
-                self.plugin.get_l3_bindings_hosting_router_with_ha_states(
200
-                    self.admin_ctx, router['id']))
201
-            agent_ids = [(agent[0]['id'], agent[1]) for agent in bindings]
202
-            self.assertIn((self.agent1['id'], 'standby'), agent_ids)
201
+        self.plugin.update_routers_states(
202
+            self.admin_ctx, {router['id']: constants.HA_ROUTER_STATE_ACTIVE},
203
+            self.agent1['host'])
204
+        self.plugin.update_routers_states(
205
+            self.admin_ctx, {router['id']: constants.HA_ROUTER_STATE_ACTIVE},
206
+            self.agent2['host'])
207
+        with mock.patch.object(agents_db.AgentDbMixin, 'is_agent_down',
208
+                               return_value=True):
209
+            self._assert_ha_state_for_agent(router, self.agent1)
210
+
211
+    def test_get_l3_bindings_hosting_router_agents_admin_state_up_is_false(
212
+            self):
213
+        router = self._create_router()
214
+        self.plugin.update_routers_states(
215
+            self.admin_ctx, {router['id']: constants.HA_ROUTER_STATE_ACTIVE},
216
+            self.agent1['host'])
217
+        self.plugin.update_routers_states(
218
+            self.admin_ctx, {router['id']: constants.HA_ROUTER_STATE_ACTIVE},
219
+            self.agent2['host'])
220
+        helpers.set_agent_admin_state(self.agent1['id'])
221
+        self._assert_ha_state_for_agent(router, self.agent1)
222
+
223
+    def test_get_l3_bindings_hosting_router_with_ha_states_one_dead(self):
224
+        router = self._create_router()
225
+        self.plugin.update_routers_states(
226
+            self.admin_ctx, {router['id']: constants.HA_ROUTER_STATE_ACTIVE},
227
+            self.agent1['host'])
228
+        self.plugin.update_routers_states(
229
+            self.admin_ctx, {router['id']: constants.HA_ROUTER_STATE_STANDBY},
230
+            self.agent2['host'])
231
+        with mock.patch.object(agents_db.AgentDbMixin, 'is_agent_down',
232
+                               return_value=True):
233
+            self._assert_ha_state_for_agent(
234
+                router, self.agent1, state=constants.HA_ROUTER_STATE_ACTIVE)
203 235
 
204 236
     def test_router_created_in_active_state(self):
205 237
         router = self._create_router()

Loading…
Cancel
Save