Check if container is running before doing an exec

container_running is a new method which returns True if a container is
detected as running, and False otherwise.
Before running "podman ps", we first check whether the service is reported
as running by systemd. If that check fails, we fall back to "podman ps" with
a retry mechanism; on retry, "--sync" is added to the command so that the
state of the OCI runtime is synchronized. A very short sleep between retries
gives podman a chance to find the container if it takes a bit of time to
start or to be seen as started.
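
As an illustration only (not the code added by this patch; the helper name
container_running_sketch and its defaults are made up), the check order
described above boils down to something like:

    import subprocess
    import time

    def container_running_sketch(container, retries=5, delay=0.2):
        # Illustrative sketch; the real implementation is
        # PodmanRunner.container_running in the runner diff below.
        unit = 'tripleo_%s.service' % container
        if subprocess.call(['systemctl', 'is-active', '-q', unit]) == 0:
            return True
        cmd = ['podman', 'ps',
               '--filter', 'label=container_name=%s' % container,
               '--format', '{{.Names}}']
        for attempt in range(1, retries + 1):
            if attempt == 2:
                # On the first retry, force podman to sync with the OCI runtime.
                cmd.append('--sync')
            result = subprocess.run(cmd, capture_output=True, text=True)
            if result.returncode == 0 and result.stdout.strip():
                return True
            time.sleep(delay)
        return False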

It will be used by the builder when a container is configured to run
"podman exec": we first verify that the container exists, otherwise we
return an error and stop the deployment.
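
A condensed, hypothetical sketch of that builder-side guard (the exact change
is in the first hunk below; the helper name exec_guard is made up):

    def exec_guard(runner, container, config_id):
        # Hypothetical helper mirroring the guard added to BaseBuilder below.
        c_name = runner.discover_container_name(container, config_id)
        if not runner.container_running(c_name):
            raise RuntimeError('Failing to apply action exec for '
                               'container: %s' % container)
        # Only now is it safe to build the exec command.
        return [runner.cont_cmd, 'exec', c_name]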

This patch is mainly a workaround for a race condition where, in heavily
loaded environments, an exec can be run too early, at a step where the
container is still starting.

It also consolidates the discover_container_name method in order to have a
better chance of actually getting a name.
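
In practice the consolidated lookup amounts to two successive "podman ps"
queries, roughly as sketched below (illustrative only, assuming podman as the
container CLI; the helper name discover_name_sketch is made up and the exact
code is in the runner diff):

    import subprocess

    def discover_name_sketch(container, config_id):
        # Try the precise match first (container_name + config_id labels),
        # then fall back to container_name alone, and finally return the
        # configured name unchanged if nothing matched.
        queries = [
            ['podman', 'ps', '-a',
             '--filter', 'label=container_name=%s' % container,
             '--filter', 'label=config_id=%s' % config_id,
             '--format', '{{.Names}}'],
            ['podman', 'ps', '-a',
             '--filter', 'label=container_name=%s' % container,
             '--format', '{{.Names}}'],
        ]
        for cmd in queries:
            result = subprocess.run(cmd, capture_output=True, text=True)
            names = result.stdout.split()
            if result.returncode == 0 and names:
                return names[0]
        return container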

Co-Authored-By: Cédric Jeanneret <cjeanner@redhat.com>
Closes-Bug: #1839559
Change-Id: If4d8c268218bf83abed877a699fc583fb55ccbed
(cherry picked from commit 983ab98f61)
Emilien Macchi 2019-08-21 22:06:34 -04:00
parent 0be2e7a9b2
commit 17a7432947
4 changed files with 98 additions and 5 deletions


@@ -109,6 +109,22 @@ class BaseBuilder(object):
                         container,
                         container_name)
                 elif action == 'exec':
+                    # for exec, the first argument is the fixed named container
+                    # used when running the command into the running container.
+                    command = self.command_argument(cconfig.get('command'))
+                    if command:
+                        c_name = self.runner.discover_container_name(
+                            command[0], self.config_id)
+                    else:
+                        c_name = self.runner.discover_container_name(
+                            container, self.config_id)
+                    # Before running the exec, we want to make sure the container
+                    # is running.
+                    # https://bugs.launchpad.net/bugs/1839559
+                    if not self.runner.container_running(c_name):
+                        msg = ('Failing to apply action exec for '
+                               'container: %s' % container)
+                        raise RuntimeError(msg)
                     cmd = [self.runner.cont_cmd, 'exec']
                     validations_passed = self.cont_exec_args(cmd, container)


@@ -17,9 +17,11 @@ import json
 import random
 import string
 import subprocess
+import time
 
 from paunch.builder import podman
 from paunch.utils import common
+from paunch.utils import systemctl
 from paunch.utils import systemd
@@ -145,11 +147,29 @@ class BaseRunner(object):
             '{{.Names}}'
         ]
         (cmd_stdout, cmd_stderr, returncode) = self.execute(cmd, self.log)
-        if returncode != 0:
-            return container
-        names = cmd_stdout.split()
-        if names:
-            return names[0]
+        if returncode == 0:
+            names = cmd_stdout.split()
+            if names:
+                return names[0]
+        self.log.warning('Did not find container with "%s" - retrying without '
+                         'config_id' % cmd)
+
+        cmd = [
+            self.cont_cmd,
+            'ps',
+            '-a',
+            '--filter',
+            'label=container_name=%s' % container,
+            '--format',
+            '{{.Names}}'
+        ]
+        (cmd_stdout, cmd_stderr, returncode) = self.execute(cmd, self.log)
+        if returncode == 0:
+            names = cmd_stdout.split()
+            if names:
+                return names[0]
+        self.log.warning('Did not find container with "%s"' % cmd)
         return container
 
     def delete_missing_configs(self, config_ids):
@@ -287,6 +307,11 @@ class DockerRunner(BaseRunner):
                          "by %s" % self.cont_cmd)
         return True
 
+    def container_running(self, container):
+        self.log.warning("container_running isn't supported "
+                         "by %s" % self.cont_cmd)
+        return True
+
 
 class PodmanRunner(BaseRunner):
@@ -361,3 +386,44 @@ class PodmanRunner(BaseRunner):
         cmd = ['podman', 'container', 'exists', name]
         (_, _, returncode) = self.execute(cmd, self.log, quiet)
         return returncode == 0
+
+    def container_running(self, container):
+        service_name = 'tripleo_' + container + '.service'
+        try:
+            systemctl.is_active(service_name)
+            self.log.debug('Unit %s is running' % service_name)
+            return True
+        except systemctl.SystemctlException:
+            chk_cmd = [
+                self.cont_cmd,
+                'ps',
+                '--filter',
+                'label=container_name=%s' % container,
+                '--format',
+                '{{.Names}}'
+            ]
+            cmd_stdout = ''
+            returncode = -1
+            count = 1
+            while (not cmd_stdout or returncode != 0) and count <= 5:
+                self.log.warning('Attempt %i to check if %s is '
+                                 'running' % (count, container))
+                # at the first retry, we will force a sync with the OCI runtime
+                if self.cont_cmd == 'podman' and count == 2:
+                    chk_cmd.append('--sync')
+                (cmd_stdout, cmd_stderr, returncode) = self.execute(chk_cmd,
+                                                                    self.log)
+                if returncode != 0:
+                    self.log.warning('Attempt %i Error when running '
+                                     '%s:' % (count, chk_cmd))
+                    self.log.warning(cmd_stderr)
+                else:
+                    if not cmd_stdout:
+                        self.log.warning('Attempt %i Container %s '
+                                         'is not running' % (count, container))
+                count += 1
+                time.sleep(0.2)
+            # return True if ps ran successfully and returned a container name.
+            return (cmd_stdout and returncode == 0)


@@ -41,6 +41,13 @@ class TestUtilsSystemctl(base.TestCase):
             mock.call(['systemctl', 'daemon-reload']),
         ])
 
+    @mock.patch('subprocess.check_call', autospec=True)
+    def test_is_active(self, mock_subprocess_check_call):
+        systemctl.is_active('foo')
+        mock_subprocess_check_call.assert_has_calls([
+            mock.call(['systemctl', 'is-active', '-q', 'foo']),
+        ])
+
     @mock.patch('subprocess.check_call', autospec=True)
     def test_enable(self, mock_subprocess_check_call):
         test = 'test'


@@ -50,6 +50,10 @@ def reset_failed(service, log=None):
     systemctl(['reset-failed', service], log)
 
 
+def is_active(service, log=None):
+    systemctl(['is-active', '-q', service], log)
+
+
 # NOTE(bogdando): this implements a crash-loop with reset-failed
 # counters approach that provides an efficient feature parity to the
 # classic rate limiting, shall we want to implement that for the