Start ssh persistent connections outside bwrap

So that we can use ssh persistent connections across playbook
invocations, start them with our initial ansible ping process, which
runs before any playbooks and outside of bubblewrap.  This allows the
ssh control processes to keep running in the background; had they been
started under bubblewrap, they would be terminated when it exits.
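
As a reminder of the underlying mechanism, the persistence comes from
OpenSSH connection multiplexing.  The following is an illustrative
sketch only (not Zuul code); the host name and control directory are
placeholders:

    # Illustrative sketch of ssh connection multiplexing; the host and
    # control directory below are placeholders.
    import os
    import subprocess

    control_dir = '/tmp/example-cp'
    os.makedirs(control_dir, exist_ok=True)

    ssh_opts = ['-o', 'ControlMaster=auto',
                '-o', 'ControlPersist=60s',
                '-o', 'ControlPath=%s/%%C' % control_dir]

    # The first invocation forks a background control-master process
    # which outlives the invoking process (for up to ControlPersist
    # seconds after its last use).
    subprocess.check_call(['ssh'] + ssh_opts + ['example-host', 'true'])

    # A later invocation from a different process (e.g. a playbook run
    # inside bubblewrap) reuses the existing master, provided the same
    # control path is visible at the same location.
    subprocess.check_call(['ssh'] + ssh_opts + ['example-host', 'true'])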

Explicitly set the control path directory (where ssh will store the
persistent sockets) so we can ensure it is mounted correctly by
bubblewrap for later playbook invocations.
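
For reference, a rough sketch of what the generated configuration
amounts to is below.  This is illustrative only; the paths are
placeholders for the per-job paths generated in the jobdir:

    # Illustrative sketch of the relevant ansible.cfg contents; the
    # paths are placeholders for per-job paths in the jobdir.
    control_path = '/path/to/jobdir/.ansible/cp'          # placeholder
    known_hosts = '/path/to/jobdir/ansible/known_hosts'   # placeholder

    ssh_args = ('-o ControlMaster=auto -o ControlPersist=60s '
                '-o UserKnownHostsFile=%s' % known_hosts)

    with open('ansible.cfg', 'w') as config:
        config.write('[ssh_connection]\n')
        config.write('control_path_dir = %s\n' % control_path)
        config.write('ssh_args = %s\n' % ssh_args)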

The timeout remains 60s, so if a connection is not used in that
timeframe, it will be closed and will need to be re-opened.  In that
case, the process holding the re-opened persistent connection will be
inside of bubblewrap, so it will no longer persist across to other
playbooks, but it will still work within that playbook.  In other
words, we fall back on the current behavior.

The 60s timeout will mean that Linux will hold a reference to the
unlinked jobdir for 60s after the completion of the job.  That
shouldn't cause any problems.

When hosts are added to the inventory by a job, they will not benefit
from this -- their connections will be dropped at the end of the
playbook.

We would like to be able to explicitly stop the persistent connections
upon completion of the job for faster and more reliable cleanup.  To
do that, we would either need the control path that Ansible generates
(in order to run the "ssh -O stop" command directly ourselves), or we
need to ask Ansible to do it for us.  The path generation is not
straightforward and its implementation may change, so it is more
reliable to ask Ansible to stop the connections.  The meta module
provides a facility for this (reset_connection); however, it has a
fatal bug in Ansible 2.3.  The bug is corrected in 2.4, so once we
move to that, we can explicitly stop connections, and at that point it
would be reasonable to extend the persist timeout, since we will no
longer be relying solely on it to keep processes from piling up on the
executor.
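
For illustration, stopping the masters by hand would look roughly like
the sketch below; it is hypothetical and exactly what we want to
avoid, since it depends on the socket names Ansible generates under
the control path directory:

    # Hypothetical sketch only: stopping control masters directly with
    # "ssh -O stop" requires knowing the socket files Ansible created.
    import glob
    import os
    import subprocess

    def stop_control_masters(control_path_dir):
        for sock in glob.glob(os.path.join(control_path_dir, '*')):
            # The hostname argument is syntactically required, but the
            # ControlPath determines which master receives the request.
            subprocess.call(['ssh', '-o', 'ControlPath=%s' % sock,
                             '-O', 'stop', 'placeholder-host'])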

Change-Id: I0e3607a195bf20e0df8cc2be0e2f96334f74088a
James E. Blair  2017-10-15 20:59:50 -07:00
commit a86aaf1158 (parent 0999879bf1)
2 changed files with 40 additions and 6 deletions


@@ -1392,12 +1392,12 @@ class RecordingAnsibleJob(zuul.executor.server.AnsibleJob):
         self.recordResult(result)
         return result
 
-    def runAnsible(self, cmd, timeout, playbook):
+    def runAnsible(self, cmd, timeout, playbook, wrapped=True):
         build = self.executor_server.job_builds[self.job.unique]
         if self.executor_server._run_ansible:
             result = super(RecordingAnsibleJob, self).runAnsible(
-                cmd, timeout, playbook)
+                cmd, timeout, playbook, wrapped)
         else:
             if playbook.path:
                 result = build.run()


@@ -286,6 +286,7 @@ class JobDir(object):
     #   inventory.yaml
     #   .ansible (mounted in bwrap read-write)
     #     fact-cache/localhost
+    #     cp
     #   playbook_0 (mounted in bwrap for each playbook read-only)
     #     secrets.yaml
     #     project -> ../trusted/project_0/...
@@ -326,6 +327,8 @@ class JobDir(object):
         self.ansible_cache_root = os.path.join(self.root, '.ansible')
         self.fact_cache = os.path.join(self.ansible_cache_root, 'fact-cache')
         os.makedirs(self.fact_cache)
+        self.control_path = os.path.join(self.ansible_cache_root, 'cp')
+        os.makedirs(self.control_path)
         localhost_facts = os.path.join(self.fact_cache, 'localhost')
         # NOTE(pabelanger): We do not want to leak zuul-executor facts to other
         # playbooks now that smart fact gathering is enabled by default. We
@@ -701,6 +704,11 @@ class AnsibleJob(object):
             self.job.sendWorkStatus(0, 100)
 
             result = self.runPlaybooks(args)
 
+            # Stop the persistent SSH connections.
+            setup_status, setup_code = self.runAnsibleCleanup(
+                self.jobdir.setup_playbook)
+
             if self.aborted_reason == self.RESULT_DISK_FULL:
                 result = 'DISK_FULL'
             data = self.getResultData()
@@ -1198,6 +1206,7 @@ class AnsibleJob(object):
             # command which expects interactive input on a tty (such
             # as sudo) it does not hang.
             config.write('pipelining = True\n')
+            config.write('control_path_dir = %s\n' % self.jobdir.control_path)
             ssh_args = "-o ControlMaster=auto -o ControlPersist=60s " \
                 "-o UserKnownHostsFile=%s" % self.jobdir.known_hosts
             config.write('ssh_args = %s\n' % ssh_args)
@@ -1219,7 +1228,7 @@ class AnsibleJob(object):
         except Exception:
             self.log.exception("Exception while killing ansible process:")
 
-    def runAnsible(self, cmd, timeout, playbook):
+    def runAnsible(self, cmd, timeout, playbook, wrapped=True):
         config_file = playbook.ansible_config
         env_copy = os.environ.copy()
         env_copy.update(self.ssh_agent.env)
@@ -1260,8 +1269,12 @@ class AnsibleJob(object):
         if playbook.secrets_content:
             secrets[playbook.secrets] = playbook.secrets_content
 
-        context = self.executor_server.execution_wrapper.getExecutionContext(
-            ro_paths, rw_paths, secrets)
+        if wrapped:
+            wrapper = self.executor_server.execution_wrapper
+        else:
+            wrapper = self.executor_server.connections.drivers['nullwrap']
+
+        context = wrapper.getExecutionContext(ro_paths, rw_paths, secrets)
 
         popen = context.getPopen(
             work_dir=self.jobdir.work_root,
@@ -1376,7 +1389,28 @@ class AnsibleJob(object):
                '-a', 'gather_subset=!all']
 
         result, code = self.runAnsible(
-            cmd=cmd, timeout=60, playbook=playbook)
+            cmd=cmd, timeout=60, playbook=playbook,
+            wrapped=False)
+        self.log.debug("Ansible complete, result %s code %s" % (
+            self.RESULT_MAP[result], code))
+        return result, code
+
+    def runAnsibleCleanup(self, playbook):
+        # TODO(jeblair): This requires a bugfix in Ansible 2.4
+        # Once this is used, increase the controlpersist timeout.
+        return (self.RESULT_NORMAL, 0)
+
+        if self.executor_server.verbose:
+            verbose = '-vvv'
+        else:
+            verbose = '-v'
+
+        cmd = ['ansible', '*', verbose, '-m', 'meta',
+               '-a', 'reset_connection']
+
+        result, code = self.runAnsible(
+            cmd=cmd, timeout=60, playbook=playbook,
+            wrapped=False)
         self.log.debug("Ansible complete, result %s code %s" % (
             self.RESULT_MAP[result], code))
         return result, code