Browse Source

Fix netns_cleanup interrupted on rwd I/O

Functional tests for netns_cleanup have been failing a few times
in the gate lately. After thorough tests we've seen that the issue was
related to using rootwrap-daemon inside a wait_until_true loop. When
timeout fired while utils.execute() was reading from rootwrap-daemon,
it got interrupted and the output of the last command was not read.
Therefore, next calls to utils.execute() would read the output of
their previous command rather than their own, leading to unexpected
results.

This fix will poll existing processes in the namespace without making
use of the wait_until_true loop. Instead, it will check elapsed time
and raise the exception if timeout is exceeded.

Also, i'm removing debug traces introduced in
327f7fc4d5 which helped finding the root
cause of this bug.

Change-Id: Ie233261e4be36eecaf6ec6d0532f0f5e2e996cd2
Closes-Bug: #1654287
tags/10.0.0.0b3
Daniel Alvarez 2 years ago
parent
commit
3f9f740d81

+ 2
- 6
neutron/agent/linux/ip_lib.py View File

@@ -131,14 +131,10 @@ class IPWrapper(SubProcessBase):
131 131
                 cmd = ['ip', 'netns', 'exec', self.namespace,
132 132
                        'find', SYS_NET_PATH, '-maxdepth', '1',
133 133
                        '-type', 'l', '-printf', '%f ']
134
-                output_str = utils.execute(
134
+                output = utils.execute(
135 135
                     cmd,
136 136
                     run_as_root=True,
137
-                    log_fail_as_error=self.log_fail_as_error)
138
-                # NOTE(dalvarez): Logging the output of this call due to
139
-                # bug1654287.
140
-                LOG.debug('get_devices(): %s', output_str)
141
-                output = output_str.split()
137
+                    log_fail_as_error=self.log_fail_as_error).split()
142 138
             except RuntimeError:
143 139
                 # We could be racing with a cron job deleting namespaces.
144 140
                 # Just return a empty list if the namespace is deleted.

+ 13
- 9
neutron/cmd/netns_cleanup.py View File

@@ -35,7 +35,6 @@ from neutron.agent.linux import interface
35 35
 from neutron.agent.linux import ip_lib
36 36
 from neutron.agent.linux import utils
37 37
 from neutron.common import config
38
-from neutron.common import utils as common_utils
39 38
 from neutron.conf.agent import cmd
40 39
 from neutron.conf.agent import dhcp as dhcp_config
41 40
 
@@ -162,14 +161,19 @@ def wait_until_no_listen_pids_namespace(namespace, timeout=SIGTERM_WAITTIME):
162 161
     If after timeout seconds, there are remaining processes in the namespace,
163 162
     then a PidsInNamespaceException will be thrown.
164 163
     """
165
-    # Would be better to handle an eventlet.timeout.Timeout exception
166
-    # but currently there's a problem importing eventlet since it's
167
-    # doing a local import from cmd/eventlet which doesn't have a
168
-    # timeout module
169
-    common_utils.wait_until_true(
170
-        lambda: not find_listen_pids_namespace(namespace),
171
-        timeout=SIGTERM_WAITTIME,
172
-        exception=PidsInNamespaceException)
164
+    # NOTE(dalvarez): This function can block forever if
165
+    # find_listen_pids_in_namespace never returns which is really unlikely. We
166
+    # can't use wait_until_true because we might get interrupted by eventlet
167
+    # Timeout during our I/O with rootwrap daemon and that will lead to errors
168
+    # in subsequent calls to utils.execute grabbing always the output of the
169
+    # previous command
170
+    start = end = time.time()
171
+    while end - start < timeout:
172
+        if not find_listen_pids_namespace(namespace):
173
+            return
174
+        time.sleep(1)
175
+        end = time.time()
176
+    raise PidsInNamespaceException
173 177
 
174 178
 
175 179
 def _kill_listen_processes(namespace, force=False):

+ 9
- 12
neutron/tests/unit/cmd/test_netns_cleanup.py View File

@@ -274,11 +274,9 @@ class TestNetnsCleanup(base.BaseTestCase):
274 274
     def test_kill_listen_processes(self):
275 275
         with mock.patch.object(util, '_kill_listen_processes',
276 276
                                return_value=1) as mock_kill_listen:
277
-            with mock.patch('neutron.common.utils.wait_until_true')\
278
-                    as wait_until_true_mock:
279
-                wait_until_true_mock.side_effect = [
280
-                    util.PidsInNamespaceException,
281
-                    None]
277
+            with mock.patch.object(util, 'wait_until_no_listen_pids_namespace',
278
+                                   side_effect=[util.PidsInNamespaceException,
279
+                                                None]):
282 280
                 namespace = mock.ANY
283 281
                 util.kill_listen_processes(namespace)
284 282
                 mock_kill_listen.assert_has_calls(
@@ -288,10 +286,8 @@ class TestNetnsCleanup(base.BaseTestCase):
288 286
     def test_kill_listen_processes_still_procs(self):
289 287
         with mock.patch.object(util, '_kill_listen_processes',
290 288
                                return_value=1):
291
-            with mock.patch('neutron.common.utils.wait_until_true')\
292
-                    as wait_until_true_mock:
293
-                wait_until_true_mock.side_effect = (
294
-                    util.PidsInNamespaceException)
289
+            with mock.patch.object(util, 'wait_until_no_listen_pids_namespace',
290
+                            side_effect=util.PidsInNamespaceException):
295 291
                 namespace = mock.ANY
296 292
                 with testtools.ExpectedException(
297 293
                         util.PidsInNamespaceException):
@@ -300,13 +296,14 @@ class TestNetnsCleanup(base.BaseTestCase):
300 296
     def test_kill_listen_processes_no_procs(self):
301 297
         with mock.patch.object(util, '_kill_listen_processes',
302 298
                                return_value=0) as mock_kill_listen:
303
-            with mock.patch('neutron.common.utils.wait_until_true')\
304
-                    as wait_until_true_mock:
299
+            with mock.patch.object(util,
300
+                                   'wait_until_no_listen_pids_namespace')\
301
+                    as wait_until_mock:
305 302
                 namespace = mock.ANY
306 303
                 util.kill_listen_processes(namespace)
307 304
                 mock_kill_listen.assert_called_once_with(namespace,
308 305
                                                          force=False)
309
-                self.assertFalse(wait_until_true_mock.called)
306
+                self.assertFalse(wait_until_mock.called)
310 307
 
311 308
     def _test_destroy_namespace_helper(self, force, num_devices):
312 309
         ns = 'qrouter-6e322ac7-ab50-4f53-9cdc-d1d3c1164b6d'

Loading…
Cancel
Save