Merge "executor: run cleanup playbook on stop"

This commit is contained in:
Zuul 2019-07-03 14:51:53 +00:00 committed by Gerrit Code Review
commit b300d6520d
12 changed files with 229 additions and 18 deletions

View File

@ -1834,6 +1834,8 @@ class FakeBuild(object):
class RecordingAnsibleJob(zuul.executor.server.AnsibleJob):
result = None
def doMergeChanges(self, merger, items, repo_state):
# Get a merger in order to update the repos involved in this job.
commit = super(RecordingAnsibleJob, self).doMergeChanges(
@ -1843,8 +1845,12 @@ class RecordingAnsibleJob(zuul.executor.server.AnsibleJob):
return commit
def recordResult(self, result):
build = self.executor_server.job_builds[self.job.unique]
self.executor_server.lock.acquire()
build = self.executor_server.job_builds.get(self.job.unique)
if not build:
self.executor_server.lock.release()
# Already recorded
return
self.executor_server.build_history.append(
BuildHistory(name=build.name, result=result, changes=build.changes,
node=build.node, uuid=build.unique,
@ -1861,12 +1867,19 @@ class RecordingAnsibleJob(zuul.executor.server.AnsibleJob):
build = self.executor_server.job_builds[self.job.unique]
build.jobdir = self.jobdir
result = super(RecordingAnsibleJob, self).runPlaybooks(args)
self.recordResult(result)
return result
self.result = super(RecordingAnsibleJob, self).runPlaybooks(args)
if self.result is None:
# Record result now because cleanup won't be performed
self.recordResult(None)
return self.result
def runCleanupPlaybooks(self):
    """Run the real cleanup phase, then record the stored build result.

    Nothing is recorded here when ``self.result`` is None.
    """
    super(RecordingAnsibleJob, self).runCleanupPlaybooks()
    outcome = self.result
    if outcome is None:
        return
    self.recordResult(outcome)
def runAnsible(self, cmd, timeout, playbook, ansible_version,
wrapped=True):
wrapped=True, cleanup=False):
build = self.executor_server.job_builds[self.job.unique]
if self.executor_server._run_ansible:
@ -1876,7 +1889,7 @@ class RecordingAnsibleJob(zuul.executor.server.AnsibleJob):
build.run()
result = super(RecordingAnsibleJob, self).runAnsible(
cmd, timeout, playbook, ansible_version, wrapped)
cmd, timeout, playbook, ansible_version, wrapped, cleanup)
else:
if playbook.path:
result = build.run()

View File

@ -0,0 +1,5 @@
# Cleanup playbook: touches "<test_root>/<build>.cleanup.flag" on all hosts,
# leaving a marker file that shows the cleanup phase ran.
- hosts: all
tasks:
- file:
path: "{{zuul._test.test_root}}/{{zuul.build}}.cleanup.flag"
state: touch

View File

@ -0,0 +1,3 @@
# Playbook whose only task is an unconditional `fail`, so the run always
# fails.
- hosts: all
tasks:
- fail: msg="This is a failure"

View File

@ -0,0 +1,13 @@
# Playbook that touches a ".post_start.flag" marker, blocks until the file
# named by the `waitpath` variable exists, then touches ".post_end.flag".
# The two markers show whether the playbook started and whether it finished.
- hosts: all
tasks:
- debug: var=waitpath
- file:
path: "{{zuul._test.test_root}}/{{zuul.build}}.post_start.flag"
state: touch
# Do not finish until test creates the flag file
- wait_for:
state: present
path: "{{waitpath}}"
- file:
path: "{{zuul._test.test_root}}/{{zuul.build}}.post_end.flag"
state: touch

View File

@ -0,0 +1,2 @@
# Playbook with an empty task list: targets all hosts and does nothing.
- hosts: all
tasks: []

View File

@ -0,0 +1,2 @@
# Playbook with an empty task list: targets all hosts and does nothing.
- hosts: all
tasks: []

View File

@ -0,0 +1,31 @@
# "check" pipeline: triggered by Gerrit patchset-created events; reports
# Verified 1 on success and Verified -1 on failure.
- pipeline:
name: check
manager: independent
post-review: true
trigger:
gerrit:
- event: patchset-created
success:
gerrit:
Verified: 1
failure:
gerrit:
Verified: -1
# Root job: it has no parent.
- job:
name: base
parent: null
# python27: declares pre-run, run, post-run and cleanup-run playbooks.
# `waitpath` is handed to the playbooks as a job variable.
- job:
name: python27
pre-run: playbooks/pre.yaml
post-run: playbooks/post.yaml
cleanup-run: playbooks/cleanup.yaml
vars:
waitpath: '{{zuul._test.test_root}}/{{zuul.build}}/test_wait'
run: playbooks/python27.yaml
# python27-failure: runs playbooks/failure.yaml and declares the same
# cleanup-run playbook as python27.
- job:
name: python27-failure
cleanup-run: playbooks/cleanup.yaml
run: playbooks/failure.yaml

View File

@ -0,0 +1,5 @@
# Attach the python27 job to org/project in the check pipeline.
- project:
name: org/project
check:
jobs:
- python27

View File

@ -0,0 +1 @@
test

View File

@ -0,0 +1,8 @@
# Tenant "tenant-one": from the gerrit source, common-config is loaded as a
# config project and org/project as an untrusted project.
- tenant:
name: tenant-one
source:
gerrit:
config-projects:
- common-config
untrusted-projects:
- org/project

View File

@ -2891,6 +2891,93 @@ class TestPostPlaybooks(AnsibleZuulTestCase):
self.assertFalse(os.path.exists(post_end))
class TestCleanupPlaybooks(AnsibleZuulTestCase):
    """Check that the cleanup-run playbook runs however the build ends.

    Playbooks leave marker files named "<build uuid><suffix>" under
    ``self.test_root``; the tests look for those markers.
    """

    tenant_config_file = 'config/cleanup-playbook/main.yaml'

    def _flagPath(self, build, suffix):
        """Return the marker file path for *build* with the given suffix."""
        return os.path.join(self.test_root, build.uuid + suffix)

    def _waitForBuildStart(self):
        """Poll (60 second budget) until self.builds is non-empty."""
        for _ in iterate_timeout(60, 'job started'):
            if len(self.builds):
                break

    def _waitForPostStart(self, build):
        """Poll (60 second budget) until the post start marker exists.

        Returns the path of that marker file.
        """
        post_start = self._flagPath(build, '.post_start.flag')
        for _ in iterate_timeout(60, 'job post running'):
            if os.path.exists(post_start):
                break
        return post_start

    def test_cleanup_playbook_success(self):
        # Test that the cleanup run is performed after a successful build
        self.executor_server.verbose = True

        A = self.fake_gerrit.addFakeChange('org/project', 'master', 'A')
        self.fake_gerrit.addEvent(A.getPatchsetCreatedEvent(1))
        self._waitForBuildStart()
        build = self.builds[0]

        self._waitForPostStart(build)
        # Create the file at "<test_root>/<build uuid>/test_wait" so the
        # playbook waiting on it can carry on
        with open(os.path.join(self.test_root, build.uuid, 'test_wait'),
                  "w") as of:
            of.write("continue")
        self.waitUntilSettled()

        build = self.getJobFromHistory('python27')
        self.assertEqual('SUCCESS', build.result)
        cleanup_flag = self._flagPath(build, '.cleanup.flag')
        self.assertTrue(os.path.exists(cleanup_flag))

    def test_cleanup_playbook_failure(self):
        # Test that the cleanup run is performed when the job fails
        self.executor_server.verbose = True

        # python27-failure is attached through an in-repo .zuul.yaml
        in_repo_conf = textwrap.dedent(
            """
            - project:
                check:
                  jobs:
                    - python27-failure
            """)
        A = self.fake_gerrit.addFakeChange('org/project', 'master', 'A',
                                           files={'.zuul.yaml': in_repo_conf})
        self.fake_gerrit.addEvent(A.getPatchsetCreatedEvent(1))
        self._waitForBuildStart()

        self.waitUntilSettled()

        build = self.getJobFromHistory('python27-failure')
        self.assertEqual('FAILURE', build.result)
        cleanup_flag = self._flagPath(build, '.cleanup.flag')
        self.assertTrue(os.path.exists(cleanup_flag))

    def test_cleanup_playbook_abort(self):
        # Test that when we abort a job the cleanup run is performed
        self.executor_server.verbose = True

        A = self.fake_gerrit.addFakeChange('org/project', 'master', 'A')
        self.fake_gerrit.addEvent(A.getPatchsetCreatedEvent(1))
        self._waitForBuildStart()
        build = self.builds[0]

        post_start = self._waitForPostStart(build)
        # The post playbook has started, abort the job
        self.fake_gerrit.addEvent(A.getChangeAbandonedEvent())

        self.waitUntilSettled()

        build = self.getJobFromHistory('python27')
        self.assertEqual('ABORTED', build.result)

        post_end = self._flagPath(build, '.post_end.flag')
        cleanup_flag = self._flagPath(build, '.cleanup.flag')
        self.assertTrue(os.path.exists(cleanup_flag))
        self.assertTrue(os.path.exists(post_start))
        # The end marker must be missing: the playbook was stopped part-way
        self.assertFalse(os.path.exists(post_end))
class TestBrokenTrustedConfig(ZuulTestCase):
# Test we can deal with a broken config only with trusted projects. This
# is different than TestBrokenConfig, as it does not have a missing

View File

@ -717,6 +717,9 @@ class AnsibleJob(object):
self.paused = False
self.aborted = False
self.aborted_reason = None
self.cleaned = False
self.cleanup_lock = threading.Lock()
self.cleanup_started = False
self._resume_event = threading.Event()
self.thread = None
self.project_info = {}
@ -997,6 +1000,10 @@ class AnsibleJob(object):
result = self.runPlaybooks(args)
if result is not None:
# Only run cleanup when playbooks ran (i.e. result is not None)
self.runCleanupPlaybooks()
# Stop the persistent SSH connections.
setup_status, setup_code = self.runAnsibleCleanup(
self.jobdir.setup_playbook)
@ -1195,6 +1202,8 @@ class AnsibleJob(object):
key, (time.monotonic() - self.time_starting_build) * 1000)
self.started = True
# Record ansible version being used for the cleanup phase
self.ansible_version = ansible_version
time_started = time.time()
# timeout value is "total" job timeout which accounts for
# pre-run and run playbooks. post-run is different because
@ -1288,6 +1297,29 @@ class AnsibleJob(object):
return result
def runCleanupPlaybooks(self):
    """Run the job's cleanup playbooks, at most once.

    Returns immediately when the job directory lists no cleanup
    playbooks.  Otherwise the work is serialized on ``cleanup_lock``:
    the first caller writes a banner to the job output file, sets
    ``cleanup_started``, runs every cleanup playbook in order and then
    sets ``cleaned``; a caller that finds ``cleaned`` already set does
    nothing.
    """
    if not self.jobdir.cleanup_playbooks:
        return

    # TODO: make this configurable
    cleanup_timeout = 300

    with self.cleanup_lock:
        if self.cleaned:
            # The cleanup phase may have already run when multiple
            # aborts were issued
            return

        # Write the banner only once we know the cleanup will really
        # run, so a repeated or concurrent call cannot add a second
        # banner to the job output.
        with open(self.jobdir.job_output_file, 'a') as job_output:
            job_output.write("{now} | Running Ansible cleanup...\n".format(
                now=datetime.datetime.now()
            ))

        self.cleanup_started = True
        for index, playbook in enumerate(self.jobdir.cleanup_playbooks):
            self.runAnsiblePlaybook(
                playbook, cleanup_timeout, self.ansible_version,
                phase='cleanup', index=index)
        self.cleaned = True
def _logFinalPlaybookError(self):
# Failures in the final post playbook can include failures
# uploading logs, which makes diagnosing issues difficult.
@ -1875,19 +1907,24 @@ class AnsibleJob(object):
def abortRunningProc(self):
with self.proc_lock:
if not self.proc:
if self.proc and not self.cleanup_started:
self.log.debug("Abort: sending kill signal to job "
"process group")
try:
pgid = os.getpgid(self.proc.pid)
os.killpg(pgid, signal.SIGKILL)
except Exception:
self.log.exception(
"Exception while killing ansible process:")
elif self.proc and self.cleanup_started:
self.log.debug("Abort: cleanup is in progress")
else:
self.log.debug("Abort: no process is running")
return
self.log.debug("Abort: sending kill signal to job "
"process group")
try:
pgid = os.getpgid(self.proc.pid)
os.killpg(pgid, signal.SIGKILL)
except Exception:
self.log.exception("Exception while killing ansible process:")
if self.started and not self.cleaned:
self.runCleanupPlaybooks()
def runAnsible(self, cmd, timeout, playbook, ansible_version,
wrapped=True):
wrapped=True, cleanup=False):
config_file = playbook.ansible_config
env_copy = os.environ.copy()
env_copy.update(self.ssh_agent.env)
@ -1956,7 +1993,7 @@ class AnsibleJob(object):
env_copy['HOME'] = self.jobdir.work_root
with self.proc_lock:
if self.aborted:
if self.aborted and not cleanup:
return (self.RESULT_ABORTED, None)
self.log.debug("Ansible command: ANSIBLE_CONFIG=%s ZUUL_JOBDIR=%s "
"ZUUL_JOB_LOG_CONFIG=%s PYTHONPATH=%s TMP=%s %s",
@ -2101,6 +2138,9 @@ class AnsibleJob(object):
now=datetime.datetime.now(),
line=line.decode('utf-8').rstrip()))
if self.aborted:
return (self.RESULT_ABORTED, None)
return (self.RESULT_NORMAL, ret)
def runAnsibleSetup(self, playbook, ansible_version):
@ -2220,7 +2260,8 @@ class AnsibleJob(object):
self.emitPlaybookBanner(playbook, 'START', phase)
result, code = self.runAnsible(cmd, timeout, playbook, ansible_version)
result, code = self.runAnsible(cmd, timeout, playbook, ansible_version,
cleanup=phase == 'cleanup')
self.log.debug("Ansible complete, result %s code %s" % (
self.RESULT_MAP[result], code))
if self.executor_server.statsd: