List process ids in bwrap namespace

If the kernel kills a process due to an out of memory error, it can be difficult to track the process back to the build that triggered it. The kernel error just gives us a PID, but we don't know any of the Ansible process ids. Further, since they are in bwrap, Ansible only knows its namespaced pid rather than the host pid, so we can't simply output it in one of our callback plugins. To aid in debugging, output all of the process ids within a namespace right at the start of an ansible-playbook execution. At this time, it is certain that the Ansible process will have started, and it is very likely that it is still running. That should provide a way to map from an OOM message back to an Ansible process id. (Note that Ansible forks and this is unlikely to catch any forked processes, so we will only see the main Ansible process id. Typically this is what the kernel should elect to kill, but if it does not, we may need a further change to repeat this process each time Ansible forks. Since that is more costly, let's see if we can avoid it.) Change-Id: I9f262c3a3c5410427b0fb301cb4f1697b033ba2f
2023-06-27 17:41:31 -07:00
parent c4345214ac
commit 1c92165ab7
4 changed files with 56 additions and 0 deletions
--- a/zuul/driver/bubblewrap/init.py
+++ b/zuul/driver/bubblewrap/init.py
@@ -69,6 +69,29 @@ class BubblewrapExecutionContext(BaseExecutionContext):
        self.mounts_map = {'ro': ro_paths, 'rw': rw_paths}
        self.secrets = secrets

+    def getNamespacePids(self, proc):
+        # Given a Popen object (proc), return the namespace and a list
+        # of host-side process ids in proc's namespace.
+        #
+        # The namespace is taken from the first child of proc that ps
+        # reports; the list holds every pid ps reports in that same
+        # namespace.  Returns (None, []) if proc has no such child.
+        pid_to_child_list = {}
+        ns_to_pid_list = {}
+        pid_to_ns_map = {}
+        # Use Popen as a context manager so that the stdout pipe is
+        # closed and the ps process is waited on, even if parsing
+        # raises an unexpected exception.
+        with subprocess.Popen(
+                ['ps', '-axo', 'pidns,pid,ppid'],
+                stdout=subprocess.PIPE, stderr=subprocess.DEVNULL) as ps:
+            for line in ps.stdout:
+                try:
+                    (pidns, pid, ppid) = map(int, line.rstrip().split())
+                except ValueError:
+                    # Skip any line that is not exactly three
+                    # integers (e.g. the column header).
+                    continue
+                pid_to_child_list.setdefault(ppid, []).append(pid)
+                ns_to_pid_list.setdefault(pidns, []).append(pid)
+                pid_to_ns_map[pid] = pidns
+        # proc may have no children in the ps output; default to an
+        # empty list so we fall through to the "not found" result
+        # instead of raising TypeError by iterating over None.
+        for child in pid_to_child_list.get(proc.pid, []):
+            ns = pid_to_ns_map.get(child)
+            if ns is not None:
+                return ns, ns_to_pid_list.get(ns, [])
+        return None, []
+
    def startPipeWriter(self, pipe, data):
        # In case we have a large amount of data to write through a
        # pipe, spawn a thread to handle the writes.
--- a/zuul/driver/nullwrap/init.py
+++ b/zuul/driver/nullwrap/init.py
@@ -27,6 +27,9 @@ class NullExecutionContext(BaseExecutionContext):
    def getPopen(self, **kwargs):
        return psutil.Popen

+    def getNamespacePids(self, proc):
+        # This context never reports a namespace: the result is always
+        # "no namespace id" together with an empty list of pids,
+        # regardless of proc.
+        return (None, [])
+

 class NullwrapDriver(Driver, WrapperInterface):
    name = 'nullwrap'
--- a/zuul/execution_context/init.py
+++ b/zuul/execution_context/init.py
@@ -38,3 +38,18 @@ class BaseExecutionContext(object, metaclass=abc.ABCMeta):
        :rtype: Callable
        """
        pass
+
+    @abc.abstractmethod
+    def getNamespacePids(self, proc):
+        """Given a Popen object, return the namespace and a list of host-side
+        process ids in proc's first child namespace.
+
+        :arg Popen proc: The Popen object that is the parent of the namespace.
+
+        :returns: A tuple of the namespace id and all of the process
+            ids in the namespace.  If no namespace was found, the
+            namespace id is None and the list of process ids is empty.
+        :rtype: tuple(int or None, list[int])
+
+        """
+        pass
--- a/zuul/executor/server.py
+++ b/zuul/executor/server.py
@@ -2808,7 +2808,22 @@ class AnsibleJob(object):
            # Use manual idx instead of enumerate so that RESULT lines
            # don't count towards BUFFER_LINES_FOR_SYNTAX
            idx = 0
+            first = True
            for line in iter(self.proc.stdout.readline, b''):
+                if first:
+                    # When we receive our first log line, bwrap should
+                    # have started Ansible and it should still be
+                    # running.  This is our best opportunity to list
+                    # the process ids in the namespace.
+                    try:
+                        ns, pids = context.getNamespacePids(self.proc)
+                        if ns is not None:
+                            self.log.debug("Process ids in namespace %s: %s",
+                                           ns, pids)
+                    except Exception:
+                        self.log.exception("Unable to list namespace pids")
+                    first = False
+
                if line.startswith(b'RESULT'):
                    # TODO(mordred) Process result commands if sent
                    continue