Check if container is running before doing an exec
container_running is a new method which returns True if a
container is detected as running, or False if it is not.
Before doing a "podman ps", we first check whether the service is running
in systemd. If it is not, "podman ps" is run with a retry mechanism which,
on the first retry, adds "--sync" to the command so that we synchronize the
state with the OCI runtime.
There is a very short sleep between the retries to give podman a chance
to find the container if it takes a bit of time to start or to be seen
as started.
It will be used by the builder when a container is configured to
run "podman exec"; we'll first verify that the container exists, and
otherwise return an error and stop the deployment.
This patch is mainly a workaround for a race condition where, in
heavily loaded environments, an exec can be run too early, at a step where
the container is still starting.
It also consolidates the discover_container_name method in order to have
a better chance of actually getting a name.
Co-Authored-By: Cédric Jeanneret <cjeanner@redhat.com>
Closes-Bug: #1839559
Change-Id: If4d8c268218bf83abed877a699fc583fb55ccbed
(cherry picked from commit 983ab98f61
)
This commit is contained in:
parent
0be2e7a9b2
commit
17a7432947
|
@ -109,6 +109,22 @@ class BaseBuilder(object):
|
|||
container,
|
||||
container_name)
|
||||
elif action == 'exec':
|
||||
# for exec, the first argument is the fixed named container
|
||||
# used when running the command into the running container.
|
||||
command = self.command_argument(cconfig.get('command'))
|
||||
if command:
|
||||
c_name = self.runner.discover_container_name(
|
||||
command[0], self.config_id)
|
||||
else:
|
||||
c_name = self.runner.discover_container_name(
|
||||
container, self.config_id)
|
||||
# Before running the exec, we want to make sure the container
|
||||
# is running.
|
||||
# https://bugs.launchpad.net/bugs/1839559
|
||||
if not self.runner.container_running(c_name):
|
||||
msg = ('Failing to apply action exec for '
|
||||
'container: %s' % container)
|
||||
raise RuntimeError(msg)
|
||||
cmd = [self.runner.cont_cmd, 'exec']
|
||||
validations_passed = self.cont_exec_args(cmd, container)
|
||||
|
||||
|
|
|
@ -17,9 +17,11 @@ import json
|
|||
import random
|
||||
import string
|
||||
import subprocess
|
||||
import time
|
||||
|
||||
from paunch.builder import podman
|
||||
from paunch.utils import common
|
||||
from paunch.utils import systemctl
|
||||
from paunch.utils import systemd
|
||||
|
||||
|
||||
|
@ -145,11 +147,29 @@ class BaseRunner(object):
|
|||
'{{.Names}}'
|
||||
]
|
||||
(cmd_stdout, cmd_stderr, returncode) = self.execute(cmd, self.log)
|
||||
if returncode != 0:
|
||||
return container
|
||||
names = cmd_stdout.split()
|
||||
if names:
|
||||
return names[0]
|
||||
if returncode == 0:
|
||||
names = cmd_stdout.split()
|
||||
if names:
|
||||
return names[0]
|
||||
self.log.warning('Did not find container with "%s" - retrying without '
|
||||
'config_id' % cmd)
|
||||
|
||||
cmd = [
|
||||
self.cont_cmd,
|
||||
'ps',
|
||||
'-a',
|
||||
'--filter',
|
||||
'label=container_name=%s' % container,
|
||||
'--format',
|
||||
'{{.Names}}'
|
||||
]
|
||||
(cmd_stdout, cmd_stderr, returncode) = self.execute(cmd, self.log)
|
||||
if returncode == 0:
|
||||
names = cmd_stdout.split()
|
||||
if names:
|
||||
return names[0]
|
||||
|
||||
self.log.warning('Did not find container with "%s"' % cmd)
|
||||
return container
|
||||
|
||||
def delete_missing_configs(self, config_ids):
|
||||
|
@ -287,6 +307,11 @@ class DockerRunner(BaseRunner):
|
|||
"by %s" % self.cont_cmd)
|
||||
return True
|
||||
|
||||
def container_running(self, container):
    """Report the container as running without actually checking.

    This runner has no real implementation of the check: it only logs a
    warning naming the container command in use and always answers True.

    :param container: name of the container (unused here)
    :returns: always True
    """
    unsupported_msg = "container_running isn't supported by %s" % self.cont_cmd
    self.log.warning(unsupported_msg)
    return True
|
||||
|
||||
|
||||
class PodmanRunner(BaseRunner):
|
||||
|
||||
|
@ -361,3 +386,44 @@ class PodmanRunner(BaseRunner):
|
|||
cmd = ['podman', 'container', 'exists', name]
|
||||
(_, _, returncode) = self.execute(cmd, self.log, quiet)
|
||||
return returncode == 0
|
||||
|
||||
def container_running(self, container):
    """Return True if ``container`` is detected as running, else False.

    The systemd unit ``tripleo_<container>.service`` is checked first.
    If ``systemctl.is_active`` raises ``SystemctlException``, the method
    falls back to ``<cont_cmd> ps`` filtered on the ``container_name``
    label, trying up to five times with a short pause between attempts.

    :param container: container name, as used in the systemd unit name
        and in the ``container_name`` label
    :returns: True if the unit is active or ``ps`` succeeded and printed
        at least one name; False otherwise (always a real bool)
    """
    service_name = 'tripleo_' + container + '.service'
    try:
        systemctl.is_active(service_name)
    except systemctl.SystemctlException:
        # Unit not reported active: fall back to asking the runtime.
        pass
    else:
        self.log.debug('Unit %s is running' % service_name)
        return True

    chk_cmd = [
        self.cont_cmd,
        'ps',
        '--filter',
        'label=container_name=%s' % container,
        '--format',
        '{{.Names}}'
    ]
    max_attempts = 5
    for count in range(1, max_attempts + 1):
        self.log.warning('Attempt %i to check if %s is '
                         'running' % (count, container))
        # at the first retry, we will force a sync with the OCI runtime;
        # the flag stays in the command for all later attempts.
        if self.cont_cmd == 'podman' and count == 2:
            chk_cmd.append('--sync')
        (cmd_stdout, cmd_stderr, returncode) = self.execute(chk_cmd,
                                                            self.log)

        if returncode != 0:
            self.log.warning('Attempt %i Error when running '
                             '%s:' % (count, chk_cmd))
            self.log.warning(cmd_stderr)
        elif cmd_stdout:
            # ps ran successfully and returned a container name.
            return True
        else:
            self.log.warning('Attempt %i Container %s '
                             'is not running' % (count, container))

        # Only pause when another attempt follows; sleeping after the
        # last attempt would just delay the answer.
        if count < max_attempts:
            time.sleep(0.2)
    return False
|
||||
|
|
|
@ -41,6 +41,13 @@ class TestUtilsSystemctl(base.TestCase):
|
|||
mock.call(['systemctl', 'daemon-reload']),
|
||||
])
|
||||
|
||||
@mock.patch('subprocess.check_call', autospec=True)
def test_is_active(self, mock_subprocess_check_call):
    """is_active('foo') must invoke ``systemctl is-active -q foo``."""
    expected_calls = [
        mock.call(['systemctl', 'is-active', '-q', 'foo']),
    ]
    systemctl.is_active('foo')
    mock_subprocess_check_call.assert_has_calls(expected_calls)
|
||||
|
||||
@mock.patch('subprocess.check_call', autospec=True)
|
||||
def test_enable(self, mock_subprocess_check_call):
|
||||
test = 'test'
|
||||
|
|
|
@ -50,6 +50,10 @@ def reset_failed(service, log=None):
|
|||
systemctl(['reset-failed', service], log)
|
||||
|
||||
|
||||
def is_active(service, log=None):
    """Run ``systemctl is-active -q <service>`` through systemctl().

    :param service: name of the systemd unit to query
    :param log: optional logger, passed through to systemctl()

    NOTE(review): nothing is returned; callers presumably rely on
    systemctl() raising when the unit is not active -- confirm against
    the systemctl() helper.
    """
    is_active_args = ['is-active', '-q', service]
    systemctl(is_active_args, log)
|
||||
|
||||
|
||||
# NOTE(bogdando): this implements a crash-loop with reset-failed
|
||||
# counters approach that provides an efficient feature parity to the
|
||||
# classic rate limiting, shall we want to implement that for the
|
||||
|
|
Loading…
Reference in New Issue