List process ids in bwrap namespace

If the kernel kills a process due to an out of memory error, it
can be difficult to track the process back to the build that triggered
it.  The kernel error just gives us a PID, but we don't know any of
the Ansible process ids.  Further, since they are in bwrap, Ansible
only knows its namespaced pid rather than the host pid, so we can't
simply output it in one of our callback plugins.

To aid in debugging, output all of the process ids within a namespace
right at the start of an ansible-playbook execution.  At this time,
it is certain that the Ansible process will have started, and it is
very likely that it is still running.  That should provide a way
to map from an OOM message back to an Ansible process id.

(Note that Ansible forks and this is unlikely to catch any forked
processes, so we will only see the main Ansible process id.  Typically
this is what the kernel should elect to kill, but if it does not,
we may need a further change to repeat this process each time Ansible
forks.  Since that is more costly, let's see if we can avoid it.)

Change-Id: I9f262c3a3c5410427b0fb301cb4f1697b033ba2f
This commit is contained in:
James E. Blair 2023-06-27 17:41:31 -07:00
parent c4345214ac
commit 1c92165ab7
4 changed files with 56 additions and 0 deletions

View File

@ -69,6 +69,29 @@ class BubblewrapExecutionContext(BaseExecutionContext):
self.mounts_map = {'ro': ro_paths, 'rw': rw_paths}
self.secrets = secrets
def getNamespacePids(self, proc):
    """Return the namespace below proc and the host-side pids inside it.

    Runs ``ps -axo pidns,pid,ppid`` and returns the namespace of the
    first child of ``proc`` that appears in the output, together with
    every pid that ``ps`` lists in that namespace.

    :arg Popen proc: The Popen object whose children are inspected;
        only its ``pid`` attribute is read.

    :returns: A tuple of the namespace id and the list of process ids
        in that namespace, or ``(None, [])`` if no child of ``proc``
        was found in the ps output.
    """
    pid_to_child_list = {}
    ns_to_pid_list = {}
    pid_to_ns_map = {}
    # Use Popen as a context manager so that the stdout pipe is closed
    # and ps is waited on (reaped) once we have read its output.
    with subprocess.Popen(
            ['ps', '-axo', 'pidns,pid,ppid'],
            stdout=subprocess.PIPE, stderr=subprocess.DEVNULL) as ps:
        for line in ps.stdout:
            try:
                (pidns, pid, ppid) = map(int, line.rstrip().split())
            except ValueError:
                # Skip the header and any row that is not exactly
                # three integers.
                continue
            pid_to_child_list.setdefault(ppid, []).append(pid)
            ns_to_pid_list.setdefault(pidns, []).append(pid)
            pid_to_ns_map[pid] = pidns
    # proc may have no children in the ps output; default to an empty
    # list so that we return the "not found" result instead of raising
    # TypeError while iterating over None.
    for child in pid_to_child_list.get(proc.pid, []):
        ns = pid_to_ns_map.get(child)
        if ns is not None:
            return ns, ns_to_pid_list.get(ns, [])
    return None, []
def startPipeWriter(self, pipe, data):
# In case we have a large amount of data to write through a
# pipe, spawn a thread to handle the writes.

View File

@ -27,6 +27,9 @@ class NullExecutionContext(BaseExecutionContext):
def getPopen(self, **kwargs):
    # The keyword arguments are accepted but not used here; the psutil
    # Popen class itself is handed back to the caller.
    popen_class = psutil.Popen
    return popen_class
def getNamespacePids(self, proc):
    # This context never reports a namespace: always answer with the
    # "not found" pair (no namespace id, empty pid list).
    not_found = (None, [])
    return not_found
class NullwrapDriver(Driver, WrapperInterface):
name = 'nullwrap'

View File

@ -38,3 +38,18 @@ class BaseExecutionContext(object, metaclass=abc.ABCMeta):
:rtype: Callable
"""
pass
@abc.abstractmethod
def getNamespacePids(self, proc):
    """Find the first child namespace of a process and list its pids.

    The process ids returned are the host-side ids, not the ids as
    seen from inside the namespace.

    :arg Popen proc: The Popen object that is the parent of the
        namespace.

    :returns: A two-element tuple: the namespace id, followed by all
        of the process ids in that namespace.  When no namespace is
        found, the namespace id is None.
    :rtype: tuple(int, list[int])
    """

View File

@ -2808,7 +2808,22 @@ class AnsibleJob(object):
# Use manual idx instead of enumerate so that RESULT lines
# don't count towards BUFFER_LINES_FOR_SYNTAX
idx = 0
first = True
for line in iter(self.proc.stdout.readline, b''):
if first:
# When we receive our first log line, bwrap should
# have started Ansible and it should still be
# running. This is our best opportunity to list
# the process ids in the namespace.
try:
ns, pids = context.getNamespacePids(self.proc)
if ns is not None:
self.log.debug("Process ids in namespace %s: %s",
ns, pids)
except Exception:
self.log.exception("Unable to list namespace pids")
first = False
if line.startswith(b'RESULT'):
# TODO(mordred) Process result commands if sent
continue