Browse Source

Merge "Delay HA router transition from "backup" to "master"" into stable/stein

changes/62/681662/1
Zuul 1 week ago
parent
commit
d63adefdf9

+ 14
- 1
neutron/agent/l3/agent.py View File

@@ -466,6 +466,15 @@ class L3NATAgent(ha.AgentMixin,
466 466
         return True
467 467
 
468 468
     def _router_removed(self, ri, router_id):
469
+        """Delete the router and stop the auxiliary processes
470
+
471
+        This stops the auxiliary processes (keepalived, keepalived-state-
472
+        change, radvd, etc) and deletes the router ports and the namespace.
473
+        The "router_info" cache is updated too at the beginning of the process,
474
+        to prevent any other concurrent process from handling the router being
475
+        deleted. If an exception is raised, the "router_info" cache is
476
+        restored.
477
+        """
469 478
         if ri is None:
470 479
             LOG.warning("Info for router %s was not found. "
471 480
                         "Performing router cleanup", router_id)
@@ -477,8 +486,12 @@ class L3NATAgent(ha.AgentMixin,
477 486
                              self.context, states=(ri,),
478 487
                              resource_id=router_id))
479 488
 
480
-        ri.delete()
481 489
         del self.router_info[router_id]
490
+        try:
491
+            ri.delete()
492
+        except Exception:
493
+            with excutils.save_and_reraise_exception():
494
+                self.router_info[router_id] = ri
482 495
 
483 496
         registry.notify(resources.ROUTER, events.AFTER_DELETE, self, router=ri)
484 497
 

+ 41
- 0
neutron/agent/l3/ha.py View File

@@ -14,6 +14,7 @@
14 14
 #    under the License.
15 15
 
16 16
 import os
17
+import threading
17 18
 
18 19
 import eventlet
19 20
 from oslo_log import log as logging
@@ -83,6 +84,8 @@ class AgentMixin(object):
83 84
         self.state_change_notifier = batch_notifier.BatchNotifier(
84 85
             self._calculate_batch_duration(), self.notify_server)
85 86
         eventlet.spawn(self._start_keepalived_notifications_server)
87
+        self._transition_states = {}
88
+        self._transition_state_mutex = threading.Lock()
86 89
 
87 90
     def _get_router_info(self, router_id):
88 91
         try:
@@ -112,7 +115,44 @@ class AgentMixin(object):
112 115
         # default 2 seconds.
113 116
         return self.conf.ha_vrrp_advert_int
114 117
 
118
+    def _update_transition_state(self, router_id, new_state=None):
119
+        with self._transition_state_mutex:
120
+            transition_state = self._transition_states.get(router_id)
121
+            if new_state:
122
+                self._transition_states[router_id] = new_state
123
+            else:
124
+                self._transition_states.pop(router_id, None)
125
+        return transition_state
126
+
115 127
     def enqueue_state_change(self, router_id, state):
128
+        """Inform the server about the new router state
129
+
130
+        This function will also update the metadata proxy, the radvd daemon,
131
+        process the prefix delegation and inform the L3 extensions. If the
132
+        HA router changes to "master", this transition will be delayed for at
133
+        least "ha_vrrp_advert_int" seconds. When the "master" router
134
+        transitions to "backup", "keepalived" will set the rest of HA routers
135
+        to "master" until it decides which one should be the only "master".
136
+        The transition from "backup" to "master" and then to "backup" again,
137
+        should not be registered in the Neutron server.
138
+
139
+        :param router_id: router ID
140
+        :param state: ['master', 'backup']
141
+        """
142
+        if not self._update_transition_state(router_id, state):
143
+            eventlet.spawn_n(self._enqueue_state_change, router_id, state)
144
+            eventlet.sleep(0)
145
+
146
+    def _enqueue_state_change(self, router_id, state):
147
+        # NOTE(ralonsoh): move 'master' and 'backup' constants to n-lib
148
+        if state == 'master':
149
+            eventlet.sleep(self.conf.ha_vrrp_advert_int)
150
+        if self._update_transition_state(router_id) != state:
151
+            # If the current "transition state" is not the initial "state" sent
152
+            # to update the router, that means the actual router state is the
153
+            # same as the "transition state" (e.g.: backup-->master-->backup).
154
+            return
155
+
116 156
         state_change_data = {"router_id": router_id, "state": state}
117 157
         LOG.info('Router %(router_id)s transitioned to %(state)s',
118 158
                  state_change_data)
@@ -125,6 +165,7 @@ class AgentMixin(object):
125 165
         # configuration to keepalived-state-change in order to remove the
126 166
         # dependency that currently exists on l3-agent running for the IPv6
127 167
         # failover.
168
+        ri.ha_state = state
128 169
         self._configure_ipv6_params(ri, state)
129 170
         if self.conf.enable_metadata_proxy:
130 171
             self._update_metadata_proxy(ri, router_id, state)

+ 16
- 9
neutron/agent/l3/ha_router.py View File

@@ -69,12 +69,21 @@ class HaRouter(router.RouterInfo):
69 69
         self.ha_port = None
70 70
         self.keepalived_manager = None
71 71
         self.state_change_callback = state_change_callback
72
+        self._ha_state = None
73
+        self._ha_state_path = None
72 74
 
73 75
     def create_router_namespace_object(
74 76
             self, router_id, agent_conf, iface_driver, use_ipv6):
75 77
         return HaRouterNamespace(
76 78
             router_id, agent_conf, iface_driver, use_ipv6)
77 79
 
80
+    @property
81
+    def ha_state_path(self):
82
+        if not self._ha_state_path and self.keepalived_manager:
83
+            self._ha_state_path = (self.keepalived_manager.
84
+                                   get_full_config_file_path('state'))
85
+        return self._ha_state_path
86
+
78 87
     @property
79 88
     def ha_priority(self):
80 89
         return self.router.get('priority', keepalived.HA_DEFAULT_PRIORITY)
@@ -85,22 +94,20 @@ class HaRouter(router.RouterInfo):
85 94
 
86 95
     @property
87 96
     def ha_state(self):
88
-        state = None
89
-        ha_state_path = self.keepalived_manager.get_full_config_file_path(
90
-            'state')
97
+        if self._ha_state:
98
+            return self._ha_state
91 99
         try:
92
-            with open(ha_state_path, 'r') as f:
93
-                state = f.read()
100
+            with open(self.ha_state_path, 'r') as f:
101
+                self._ha_state = f.read()
94 102
         except (OSError, IOError):
95 103
             LOG.debug('Error while reading HA state for %s', self.router_id)
96
-        return state or 'unknown'
104
+        return self._ha_state or 'unknown'
97 105
 
98 106
     @ha_state.setter
99 107
     def ha_state(self, new_state):
100
-        ha_state_path = self.keepalived_manager.get_full_config_file_path(
101
-            'state')
108
+        self._ha_state = new_state
102 109
         try:
103
-            with open(ha_state_path, 'w') as f:
110
+            with open(self.ha_state_path, 'w') as f:
104 111
                 f.write(new_state)
105 112
         except (OSError, IOError):
106 113
             LOG.error('Error while writing HA state for %s',

+ 8
- 1
neutron/tests/functional/agent/l3/framework.py View File

@@ -133,6 +133,12 @@ class L3AgentTestFramework(base.BaseSudoTestCase):
133 133
                                                       enable_pf_floating_ip),
134 134
                                                   qos_policy_id=qos_policy_id)
135 135
 
136
+    def change_router_state(self, router_id, state):
137
+        ri = self.agent.router_info.get(router_id)
138
+        if not ri:
139
+            self.fail('Router %s is not present in the L3 agent' % router_id)
140
+        ri.ha_state = state
141
+
136 142
     def _test_conntrack_disassociate_fip(self, ha):
137 143
         '''Test that conntrack immediately drops stateful connection
138 144
            that uses floating IP once it's disassociated.
@@ -494,7 +500,8 @@ class L3AgentTestFramework(base.BaseSudoTestCase):
494 500
         # so there's no need to check that explicitly.
495 501
         self.assertFalse(self._namespace_exists(router.ns_name))
496 502
         common_utils.wait_until_true(
497
-            lambda: not self._metadata_proxy_exists(self.agent.conf, router))
503
+            lambda: not self._metadata_proxy_exists(self.agent.conf, router),
504
+            timeout=10)
498 505
 
499 506
     def _assert_snat_chains(self, router):
500 507
         self.assertFalse(router.iptables_manager.is_chain_empty(

+ 2
- 1
neutron/tests/functional/agent/l3/test_ha_router.py View File

@@ -37,7 +37,8 @@ class L3HATestCase(framework.L3AgentTestFramework):
37 37
 
38 38
     def test_keepalived_state_change_notification(self):
39 39
         enqueue_mock = mock.patch.object(
40
-            self.agent, 'enqueue_state_change').start()
40
+            self.agent, 'enqueue_state_change',
41
+            side_effect=self.change_router_state).start()
41 42
         router_info = self.generate_router_info(enable_ha=True)
42 43
         router = self.manage_router(self.agent, router_info)
43 44
         common_utils.wait_until_true(lambda: router.ha_state == 'master')

+ 36
- 0
neutron/tests/unit/agent/l3/test_agent.py View File

@@ -228,23 +228,59 @@ class TestBasicRouterOperations(BasicRouterOperationsFramework):
228 228
         # Make sure the exceptional code path has coverage
229 229
         agent.enqueue_state_change(non_existent_router, 'master')
230 230
 
231
+    def _enqueue_state_change_transitions(self, transitions, num_called):
232
+        self.conf.set_override('ha_vrrp_advert_int', 1)
233
+        agent = l3_agent.L3NATAgent(HOSTNAME, self.conf)
234
+        agent._update_transition_state('router_id')
235
+        with mock.patch.object(agent, '_get_router_info', return_value=None) \
236
+                as mock_get_router_info:
237
+            for state in transitions:
238
+                agent.enqueue_state_change('router_id', state)
239
+                eventlet.sleep(0.2)
240
+            # NOTE(ralonsoh): the wait process should be done inside the mock
241
+            # context, to allow the spawned thread to call the mocked function
242
+            # before the context ends.
243
+            eventlet.sleep(self.conf.ha_vrrp_advert_int + 2)
244
+
245
+        if num_called:
246
+            mock_get_router_info.assert_has_calls(
247
+                [mock.call('router_id') for _ in range(num_called)])
248
+        else:
249
+            mock_get_router_info.assert_not_called()
250
+
251
+    def test_enqueue_state_change_from_none_to_master(self):
252
+        self._enqueue_state_change_transitions(['master'], 1)
253
+
254
+    def test_enqueue_state_change_from_none_to_backup(self):
255
+        self._enqueue_state_change_transitions(['backup'], 1)
256
+
257
+    def test_enqueue_state_change_from_none_to_master_to_backup(self):
258
+        self._enqueue_state_change_transitions(['master', 'backup'], 0)
259
+
260
+    def test_enqueue_state_change_from_none_to_backup_to_master(self):
261
+        self._enqueue_state_change_transitions(['backup', 'master'], 2)
262
+
231 263
     def test_enqueue_state_change_metadata_disable(self):
232 264
         self.conf.set_override('enable_metadata_proxy', False)
265
+        self.conf.set_override('ha_vrrp_advert_int', 1)
233 266
         agent = l3_agent.L3NATAgent(HOSTNAME, self.conf)
234 267
         router = mock.Mock()
235 268
         router_info = mock.MagicMock()
236 269
         agent.router_info[router.id] = router_info
237 270
         agent._update_metadata_proxy = mock.Mock()
238 271
         agent.enqueue_state_change(router.id, 'master')
272
+        eventlet.sleep(self.conf.ha_vrrp_advert_int + 2)
239 273
         self.assertFalse(agent._update_metadata_proxy.call_count)
240 274
 
241 275
     def test_enqueue_state_change_l3_extension(self):
276
+        self.conf.set_override('ha_vrrp_advert_int', 1)
242 277
         agent = l3_agent.L3NATAgent(HOSTNAME, self.conf)
243 278
         router = mock.Mock()
244 279
         router_info = mock.MagicMock()
245 280
         agent.router_info[router.id] = router_info
246 281
         agent.l3_ext_manager.ha_state_change = mock.Mock()
247 282
         agent.enqueue_state_change(router.id, 'master')
283
+        eventlet.sleep(self.conf.ha_vrrp_advert_int + 2)
248 284
         agent.l3_ext_manager.ha_state_change.assert_called_once_with(
249 285
             agent.context,
250 286
             {'router_id': router.id, 'state': 'master'})

Loading…
Cancel
Save