Browse Source

Check if container is running before doing an exec

container_running is a new method which returns True if a
container is detected as running or False if not running.
There is a retry mechanism: if "podman ps" is used, "--sync" is added to
the command so we synchronize the state of the OCI runtime. Before doing a
"podman ps", we first try to check if the service is running in systemd.
There is a very short sleep between the retries to give a chance to
podman to find the container if it takes a bit of time to start or seen
as started.

It will be used by the builder when a container is configured to
run "podman exec"; we'll first verify that the container exists, otherwise
we return an error and stop the deployment.

This patch is mainly a workaround against a race condition where in
heavy-loaded environments, an exec can be run too early in a step where
the container is still starting.

It also consolidates the discover_container_name method in order to have
a better chance of actually getting a name.

Co-Authored-By: Cédric Jeanneret <cjeanner@redhat.com>
Closes-Bug: #1839559
Change-Id: If4d8c268218bf83abed877a699fc583fb55ccbed
(cherry picked from commit 983ab98f61)
tags/4.5.1
Emilien Macchi 4 weeks ago
parent
commit
17a7432947
4 changed files with 98 additions and 5 deletions
  1. 16
    0
      paunch/builder/base.py
  2. 71
    5
      paunch/runner.py
  3. 7
    0
      paunch/tests/test_utils_systemctl.py
  4. 4
    0
      paunch/utils/systemctl.py

+ 16
- 0
paunch/builder/base.py View File

@@ -109,6 +109,22 @@ class BaseBuilder(object):
109 109
                                                              container,
110 110
                                                              container_name)
111 111
             elif action == 'exec':
112
+                # for exec, the first argument is the fixed named container
113
+                # used when running the command into the running container.
114
+                command = self.command_argument(cconfig.get('command'))
115
+                if command:
116
+                    c_name = self.runner.discover_container_name(
117
+                        command[0], self.config_id)
118
+                else:
119
+                    c_name = self.runner.discover_container_name(
120
+                        container, self.config_id)
121
+                # Before running the exec, we want to make sure the container
122
+                # is running.
123
+                # https://bugs.launchpad.net/bugs/1839559
124
+                if not self.runner.container_running(c_name):
125
+                    msg = ('Failing to apply action exec for '
126
+                           'container: %s' % container)
127
+                    raise RuntimeError(msg)
112 128
                 cmd = [self.runner.cont_cmd, 'exec']
113 129
                 validations_passed = self.cont_exec_args(cmd, container)
114 130
 

+ 71
- 5
paunch/runner.py View File

@@ -17,9 +17,11 @@ import json
17 17
 import random
18 18
 import string
19 19
 import subprocess
20
+import time
20 21
 
21 22
 from paunch.builder import podman
22 23
 from paunch.utils import common
24
+from paunch.utils import systemctl
23 25
 from paunch.utils import systemd
24 26
 
25 27
 
@@ -145,11 +147,29 @@ class BaseRunner(object):
145 147
             '{{.Names}}'
146 148
         ]
147 149
         (cmd_stdout, cmd_stderr, returncode) = self.execute(cmd, self.log)
148
-        if returncode != 0:
149
-            return container
150
-        names = cmd_stdout.split()
151
-        if names:
152
-            return names[0]
150
+        if returncode == 0:
151
+            names = cmd_stdout.split()
152
+            if names:
153
+                return names[0]
154
+        self.log.warning('Did not find container with "%s" - retrying without '
155
+                         'config_id' % cmd)
156
+
157
+        cmd = [
158
+            self.cont_cmd,
159
+            'ps',
160
+            '-a',
161
+            '--filter',
162
+            'label=container_name=%s' % container,
163
+            '--format',
164
+            '{{.Names}}'
165
+        ]
166
+        (cmd_stdout, cmd_stderr, returncode) = self.execute(cmd, self.log)
167
+        if returncode == 0:
168
+            names = cmd_stdout.split()
169
+            if names:
170
+                return names[0]
171
+
172
+        self.log.warning('Did not find container with "%s"' % cmd)
153 173
         return container
154 174
 
155 175
     def delete_missing_configs(self, config_ids):
@@ -287,6 +307,11 @@ class DockerRunner(BaseRunner):
287 307
                          "by %s" % self.cont_cmd)
288 308
         return True
289 309
 
310
+    def container_running(self, container):
311
+        self.log.warning("container_running isn't supported "
312
+                         "by %s" % self.cont_cmd)
313
+        return True
314
+
290 315
 
291 316
 class PodmanRunner(BaseRunner):
292 317
 
@@ -361,3 +386,44 @@ class PodmanRunner(BaseRunner):
361 386
         cmd = ['podman', 'container', 'exists', name]
362 387
         (_, _, returncode) = self.execute(cmd, self.log, quiet)
363 388
         return returncode == 0
389
+
390
+    def container_running(self, container):
391
+        service_name = 'tripleo_' + container + '.service'
392
+        try:
393
+            systemctl.is_active(service_name)
394
+            self.log.debug('Unit %s is running' % service_name)
395
+            return True
396
+        except systemctl.SystemctlException:
397
+            chk_cmd = [
398
+                self.cont_cmd,
399
+                'ps',
400
+                '--filter',
401
+                'label=container_name=%s' % container,
402
+                '--format',
403
+                '{{.Names}}'
404
+            ]
405
+            cmd_stdout = ''
406
+            returncode = -1
407
+            count = 1
408
+            while (not cmd_stdout or returncode != 0) and count <= 5:
409
+                self.log.warning('Attempt %i to check if %s is '
410
+                                 'running' % (count, container))
411
+                # at the first retry, we will force a sync with the OCI runtime
412
+                if self.cont_cmd == 'podman' and count == 2:
413
+                    chk_cmd.append('--sync')
414
+                (cmd_stdout, cmd_stderr, returncode) = self.execute(chk_cmd,
415
+                                                                    self.log)
416
+
417
+                if returncode != 0:
418
+                    self.log.warning('Attempt %i Error when running '
419
+                                     '%s:' % (count, chk_cmd))
420
+                    self.log.warning(cmd_stderr)
421
+                else:
422
+                    if not cmd_stdout:
423
+                        self.log.warning('Attempt %i Container %s '
424
+                                         'is not running' % (count, container))
425
+
426
+                count += 1
427
+                time.sleep(0.2)
428
+            # return True if ps ran successfuly and returned a container name.
429
+            return (cmd_stdout and returncode == 0)

+ 7
- 0
paunch/tests/test_utils_systemctl.py View File

@@ -41,6 +41,13 @@ class TestUtilsSystemctl(base.TestCase):
41 41
             mock.call(['systemctl', 'daemon-reload']),
42 42
         ])
43 43
 
44
+    @mock.patch('subprocess.check_call', autospec=True)
45
+    def test_is_active(self, mock_subprocess_check_call):
46
+        systemctl.is_active('foo')
47
+        mock_subprocess_check_call.assert_has_calls([
48
+            mock.call(['systemctl', 'is-active', '-q', 'foo']),
49
+        ])
50
+
44 51
     @mock.patch('subprocess.check_call', autospec=True)
45 52
     def test_enable(self, mock_subprocess_check_call):
46 53
         test = 'test'

+ 4
- 0
paunch/utils/systemctl.py View File

@@ -50,6 +50,10 @@ def reset_failed(service, log=None):
50 50
     systemctl(['reset-failed', service], log)
51 51
 
52 52
 
53
def is_active(service, log=None):
    """Return silently when *service* is active, raise otherwise.

    Runs "systemctl is-active -q <service>"; the quiet flag suppresses
    output, and an inactive unit makes the command exit non-zero, which
    systemctl() is expected to surface as SystemctlException (callers
    such as PodmanRunner.container_running catch it).

    :param service: Name of the systemd unit to check.
    :param log: Optional logger forwarded to systemctl().
    """
    cmd = ['is-active', '-q', service]
    systemctl(cmd, log)
55
+
56
+
53 57
 # NOTE(bogdando): this implements a crash-loop with reset-failed
54 58
 # counters approach that provides an efficient feature parity to the
55 59
 # classic rate limiting, shall we want to implement that for the

Loading…
Cancel
Save