
Start ssh persistent connections outside bwrap

So that we can reuse ssh persistent connections across playbook
invocations, start them with our initial ansible ping, before any
playbook invocations and outside of bubblewrap.  This allows the
ssh control processes to keep running in the background, rather
than being terminated as they would be if run under bubblewrap.
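As a minimal sketch of the idea (function and driver names here are
illustrative, not zuul's actual API): the setup run selects a no-op
"nullwrap" driver instead of the bubblewrap wrapper, so the ssh
ControlMaster processes it forks are not confined to, and killed with,
a bwrap namespace:

```python
# Hypothetical sketch: normal playbooks run through the bubblewrap
# wrapper; the initial setup/ping run passes wrapped=False so its
# background ssh processes outlive any single playbook invocation.
def pick_wrapper(wrapped, execution_wrapper, nullwrap):
    # bubblewrap would reap background ssh ControlMaster processes
    # when its namespace exits; the null wrapper execs directly.
    return execution_wrapper if wrapped else nullwrap
```

In the diff below, runAnsible() gains a wrapped=True parameter and the
setup run is the one call site that passes wrapped=False.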

Explicitly set the control path directory (where ssh will store the
persistent sockets) so we can ensure it is mounted correctly by
bubblewrap for later playbook invocations.
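A sketch of that configuration (paths and layout assumed for
illustration): the control path directory lives under the job's
.ansible cache root, which is already bind-mounted read-write into
bubblewrap, and is written into the generated ansible.cfg:

```python
import os
import tempfile

# Hypothetical jobdir layout: .ansible/cp holds the ControlMaster
# sockets, so mounting .ansible read-write is enough for later
# wrapped playbook runs to find the persistent connections.
root = tempfile.mkdtemp()
control_path = os.path.join(root, '.ansible', 'cp')
os.makedirs(control_path)

# Point Ansible at the directory and keep connections open for 60s.
with open(os.path.join(root, 'ansible.cfg'), 'w') as f:
    f.write('[ssh_connection]\n')
    f.write('control_path_dir = %s\n' % control_path)
    f.write('ssh_args = -o ControlMaster=auto -o ControlPersist=60s\n')
```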

The timeout remains 60s, so if a connection is not used in that
timeframe, it will be closed and will need to be re-opened.  In that
case, the process holding the persistent connection will be inside
of bubblewrap, and so it will no longer persist to other playbooks,
but should still work.  In other words, we will fall back on the
current behavior.

The 60s timeout will mean that Linux will hold a reference to the
unlinked jobdir for 60s after the completion of the job.  That
shouldn't cause any problems.

When hosts are added to the inventory by a job, they will not benefit
from this -- their connections will be dropped at the end of the
playbook.

We would like to be able to explicitly stop the persistent connections
upon completion of the job for faster and more reliable cleanup.  To
do that, we would either need the control path that Ansible generates
(so that we could execute the "ssh -O stop" command directly), or we
would need to ask Ansible to do it for us.  The path generation is not
straightforward and the implementation may change, so it is more
reliable to ask Ansible to stop the connections.  There is a facility
for this in the meta module; however, it has a fatal bug in Ansible
2.3.  The bug is corrected in 2.4, so once we use that version, we can
explicitly stop connections, and at that point it would be reasonable
to extend the persist timeout, since we will no longer rely solely on
it to keep processes from piling up on the executor.
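The cleanup described above amounts to an ad-hoc invocation of the
meta module.  A sketch of the command it would build (mirroring
runAnsibleCleanup in the diff; the helper name here is illustrative):

```python
# Sketch of the cleanup command: ask Ansible 2.4+ to close its own
# persistent connections via the meta module, rather than trying to
# reproduce the generated control path for a direct "ssh -O stop".
def cleanup_cmd(verbose=False):
    v = '-vvv' if verbose else '-v'
    return ['ansible', '*', v, '-m', 'meta', '-a', 'reset_connection']
```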

Change-Id: I0e3607a195bf20e0df8cc2be0e2f96334f74088a
changes/58/512158/3
James E. Blair, 5 years ago
parent commit a86aaf1158
2 changed files:
  tests/base.py (4 lines)
  zuul/executor/server.py (42 lines)

tests/base.py

@@ -1392,12 +1392,12 @@ class RecordingAnsibleJob(zuul.executor.server.AnsibleJob):
             self.recordResult(result)
         return result

-    def runAnsible(self, cmd, timeout, playbook):
+    def runAnsible(self, cmd, timeout, playbook, wrapped=True):
         build = self.executor_server.job_builds[self.job.unique]
         if self.executor_server._run_ansible:
             result = super(RecordingAnsibleJob, self).runAnsible(
-                cmd, timeout, playbook)
+                cmd, timeout, playbook, wrapped)
         else:
             if playbook.path:
                 result = build.run()
zuul/executor/server.py

@@ -286,6 +286,7 @@ class JobDir(object):
     # inventory.yaml
     # .ansible (mounted in bwrap read-write)
     #   fact-cache/localhost
+    #   cp
     # playbook_0 (mounted in bwrap for each playbook read-only)
     #   secrets.yaml
     #   project -> ../trusted/project_0/...
@@ -326,6 +327,8 @@ class JobDir(object):
         self.ansible_cache_root = os.path.join(self.root, '.ansible')
         self.fact_cache = os.path.join(self.ansible_cache_root, 'fact-cache')
         os.makedirs(self.fact_cache)
+        self.control_path = os.path.join(self.ansible_cache_root, 'cp')
+        os.makedirs(self.control_path)
         localhost_facts = os.path.join(self.fact_cache, 'localhost')
         # NOTE(pabelanger): We do not want to leak zuul-executor facts to other
         # playbooks now that smart fact gathering is enabled by default. We
@@ -701,6 +704,11 @@ class AnsibleJob(object):
         self.job.sendWorkStatus(0, 100)

         result = self.runPlaybooks(args)

+        # Stop the persistent SSH connections.
+        setup_status, setup_code = self.runAnsibleCleanup(
+            self.jobdir.setup_playbook)
+
         if self.aborted_reason == self.RESULT_DISK_FULL:
             result = 'DISK_FULL'
         data = self.getResultData()
@@ -1198,6 +1206,7 @@ class AnsibleJob(object):
             # command which expects interactive input on a tty (such
             # as sudo) it does not hang.
             config.write('pipelining = True\n')
+            config.write('control_path_dir = %s\n' % self.jobdir.control_path)
             ssh_args = "-o ControlMaster=auto -o ControlPersist=60s " \
                 "-o UserKnownHostsFile=%s" % self.jobdir.known_hosts
             config.write('ssh_args = %s\n' % ssh_args)
@@ -1219,7 +1228,7 @@ class AnsibleJob(object):
         except Exception:
             self.log.exception("Exception while killing ansible process:")

-    def runAnsible(self, cmd, timeout, playbook):
+    def runAnsible(self, cmd, timeout, playbook, wrapped=True):
         config_file = playbook.ansible_config
         env_copy = os.environ.copy()
         env_copy.update(self.ssh_agent.env)
@@ -1260,8 +1269,12 @@ class AnsibleJob(object):
         if playbook.secrets_content:
             secrets[playbook.secrets] = playbook.secrets_content

-        context = self.executor_server.execution_wrapper.getExecutionContext(
-            ro_paths, rw_paths, secrets)
+        if wrapped:
+            wrapper = self.executor_server.execution_wrapper
+        else:
+            wrapper = self.executor_server.connections.drivers['nullwrap']
+
+        context = wrapper.getExecutionContext(ro_paths, rw_paths, secrets)

         popen = context.getPopen(
             work_dir=self.jobdir.work_root,
@@ -1376,7 +1389,28 @@ class AnsibleJob(object):
                '-a', 'gather_subset=!all']

         result, code = self.runAnsible(
-            cmd=cmd, timeout=60, playbook=playbook)
+            cmd=cmd, timeout=60, playbook=playbook,
+            wrapped=False)
         self.log.debug("Ansible complete, result %s code %s" % (
             self.RESULT_MAP[result], code))
         return result, code

+    def runAnsibleCleanup(self, playbook):
+        # TODO(jeblair): This requires a bugfix in Ansible 2.4
+        # Once this is used, increase the controlpersist timeout.
+        return (self.RESULT_NORMAL, 0)
+
+        if self.executor_server.verbose:
+            verbose = '-vvv'
+        else:
+            verbose = '-v'
+
+        cmd = ['ansible', '*', verbose, '-m', 'meta',
+               '-a', 'reset_connection']
+
+        result, code = self.runAnsible(
+            cmd=cmd, timeout=60, playbook=playbook,
+            wrapped=False)
+        self.log.debug("Ansible complete, result %s code %s" % (
+            self.RESULT_MAP[result], code))
+        return result, code
